treat 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +4 -4
- data/TODO +21 -54
- data/lib/economist/half_cocked_basel.txt +16 -0
- data/lib/economist/hose_and_dry.doc +0 -0
- data/lib/economist/hungarys_troubles.abw +70 -0
- data/lib/economist/republican_nomination.pdf +0 -0
- data/lib/economist/saving_the_euro.odt +0 -0
- data/lib/economist/to_infinity_and_beyond.txt +15 -0
- data/lib/economist/zero_sum.html +91 -0
- data/lib/treat.rb +58 -72
- data/lib/treat/buildable.rb +59 -15
- data/lib/treat/categories.rb +26 -14
- data/lib/treat/category.rb +2 -2
- data/lib/treat/delegatable.rb +65 -48
- data/lib/treat/doable.rb +44 -0
- data/lib/treat/entities.rb +34 -14
- data/lib/treat/entities/collection.rb +2 -0
- data/lib/treat/entities/document.rb +3 -2
- data/lib/treat/entities/entity.rb +105 -90
- data/lib/treat/entities/phrases.rb +17 -0
- data/lib/treat/entities/tokens.rb +28 -13
- data/lib/treat/entities/zones.rb +20 -0
- data/lib/treat/extractors.rb +49 -11
- data/lib/treat/extractors/coreferences/stanford.rb +68 -0
- data/lib/treat/extractors/date/chronic.rb +32 -0
- data/lib/treat/extractors/date/ruby.rb +25 -0
- data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
- data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
- data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
- data/lib/treat/extractors/language/what_language.rb +49 -0
- data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
- data/lib/treat/extractors/roles/naive.rb +73 -0
- data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
- data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
- data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
- data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
- data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
- data/lib/treat/extractors/time/nickel.rb +30 -12
- data/lib/treat/extractors/topic_words/lda.rb +9 -9
- data/lib/treat/extractors/topics/reuters.rb +14 -15
- data/lib/treat/extractors/topics/reuters/region.xml +1 -0
- data/lib/treat/features.rb +7 -0
- data/lib/treat/formatters/readers/abw.rb +6 -1
- data/lib/treat/formatters/readers/autoselect.rb +5 -6
- data/lib/treat/formatters/readers/doc.rb +3 -1
- data/lib/treat/formatters/readers/html.rb +1 -1
- data/lib/treat/formatters/readers/image.rb +43 -0
- data/lib/treat/formatters/readers/odt.rb +1 -2
- data/lib/treat/formatters/readers/pdf.rb +9 -1
- data/lib/treat/formatters/readers/xml.rb +40 -0
- data/lib/treat/formatters/serializers/xml.rb +50 -14
- data/lib/treat/formatters/serializers/yaml.rb +7 -2
- data/lib/treat/formatters/unserializers/xml.rb +33 -7
- data/lib/treat/formatters/visualizers/dot.rb +90 -20
- data/lib/treat/formatters/visualizers/short_value.rb +2 -2
- data/lib/treat/formatters/visualizers/standoff.rb +2 -2
- data/lib/treat/formatters/visualizers/tree.rb +1 -1
- data/lib/treat/formatters/visualizers/txt.rb +13 -4
- data/lib/treat/group.rb +16 -10
- data/lib/treat/helpers/linguistics_loader.rb +18 -0
- data/lib/treat/inflectors.rb +10 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
- data/lib/treat/inflectors/declensions/english.rb +319 -0
- data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
- data/lib/treat/install.rb +59 -0
- data/lib/treat/kernel.rb +18 -8
- data/lib/treat/languages.rb +18 -11
- data/lib/treat/languages/arabic.rb +4 -2
- data/lib/treat/languages/chinese.rb +6 -2
- data/lib/treat/languages/dutch.rb +16 -0
- data/lib/treat/languages/english.rb +47 -19
- data/lib/treat/languages/french.rb +8 -5
- data/lib/treat/languages/german.rb +9 -6
- data/lib/treat/languages/greek.rb +16 -0
- data/lib/treat/languages/italian.rb +6 -3
- data/lib/treat/languages/polish.rb +16 -0
- data/lib/treat/languages/portuguese.rb +16 -0
- data/lib/treat/languages/russian.rb +16 -0
- data/lib/treat/languages/spanish.rb +16 -0
- data/lib/treat/languages/swedish.rb +16 -0
- data/lib/treat/languages/tags.rb +377 -0
- data/lib/treat/lexicalizers.rb +34 -23
- data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
- data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
- data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
- data/lib/treat/lexicalizers/tag/brill.rb +35 -40
- data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
- data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
- data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
- data/lib/treat/processors.rb +8 -8
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +114 -99
- data/lib/treat/processors/parsers/stanford.rb +109 -41
- data/lib/treat/processors/segmenters/punkt.rb +17 -18
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
- data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
- data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
- data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
- data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
- data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
- data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
- data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
- data/lib/treat/processors/segmenters/stanford.rb +38 -37
- data/lib/treat/processors/segmenters/tactful.rb +5 -4
- data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
- data/lib/treat/processors/tokenizers/perl.rb +2 -2
- data/lib/treat/processors/tokenizers/punkt.rb +6 -2
- data/lib/treat/processors/tokenizers/stanford.rb +25 -24
- data/lib/treat/processors/tokenizers/tactful.rb +1 -2
- data/lib/treat/proxies.rb +2 -35
- data/lib/treat/registrable.rb +17 -22
- data/lib/treat/sugar.rb +11 -11
- data/lib/treat/tree.rb +27 -17
- data/lib/treat/viewable.rb +29 -0
- data/lib/treat/visitable.rb +1 -1
- data/test/tc_entity.rb +56 -49
- data/test/tc_extractors.rb +41 -18
- data/test/tc_formatters.rb +7 -8
- data/test/tc_inflectors.rb +19 -24
- data/test/tc_lexicalizers.rb +12 -19
- data/test/tc_processors.rb +26 -12
- data/test/tc_resources.rb +2 -7
- data/test/tc_treat.rb +20 -22
- data/test/tc_tree.rb +4 -4
- data/test/tests.rb +3 -5
- data/test/texts.rb +13 -14
- data/tmp/INFO +1 -0
- metadata +78 -158
- data/bin/INFO +0 -1
- data/examples/benchmark.rb +0 -81
- data/examples/keywords.rb +0 -148
- data/lib/treat/detectors.rb +0 -31
- data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
- data/lib/treat/detectors/format/file.rb +0 -36
- data/lib/treat/detectors/language/what_language.rb +0 -29
- data/lib/treat/entities/constituents.rb +0 -15
- data/lib/treat/entities/sentence.rb +0 -8
- data/lib/treat/extractors/named_entity/abner.rb +0 -20
- data/lib/treat/extractors/named_entity/stanford.rb +0 -174
- data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
- data/lib/treat/extractors/time/chronic.rb +0 -20
- data/lib/treat/extractors/time/native.rb +0 -18
- data/lib/treat/formatters/readers/gocr.rb +0 -26
- data/lib/treat/formatters/readers/ocropus.rb +0 -31
- data/lib/treat/formatters/visualizers/html.rb +0 -13
- data/lib/treat/formatters/visualizers/inspect.rb +0 -20
- data/lib/treat/inflectors/declensions/en.rb +0 -18
- data/lib/treat/languages/categories.rb +0 -5
- data/lib/treat/languages/english/categories.rb +0 -23
- data/lib/treat/languages/english/tags.rb +0 -352
- data/lib/treat/languages/xinhua.rb +0 -12
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
- data/lib/treat/string.rb +0 -5
- data/test/tc_detectors.rb +0 -26
@@ -6,6 +6,7 @@ module Treat
|
|
6
6
|
# containing the texts of the collection.
|
7
7
|
def initialize(folder = nil, id = nil)
|
8
8
|
super('', id)
|
9
|
+
@type = :collection
|
9
10
|
if folder
|
10
11
|
set :folder, folder
|
11
12
|
Dir.glob("#{folder}/*").each do |f|
|
@@ -14,6 +15,7 @@ module Treat
|
|
14
15
|
end
|
15
16
|
end
|
16
17
|
end
|
18
|
+
def type; :collection; end
|
17
19
|
end
|
18
20
|
end
|
19
21
|
end
|
@@ -4,27 +4,27 @@ require 'treat/delegatable'
|
|
4
4
|
require 'treat/visitable'
|
5
5
|
require 'treat/registrable'
|
6
6
|
require 'treat/buildable'
|
7
|
+
require 'treat/doable'
|
8
|
+
require 'treat/viewable'
|
9
|
+
require 'treat/features'
|
7
10
|
|
8
11
|
module Treat
|
9
12
|
module Entities
|
10
13
|
class Entity < Tree::Node
|
14
|
+
# A Symbol representing the lowercase version of the class name.
|
15
|
+
attr_accessor :type
|
11
16
|
# Implements support for #register
|
12
17
|
include Registrable
|
13
18
|
# Implement support for #accept.
|
14
19
|
include Visitable
|
15
|
-
# Implement support for #self.
|
20
|
+
# Implement support for #self.add_workers
|
16
21
|
extend Delegatable
|
17
22
|
# Implement support for #self.from_*
|
18
23
|
extend Buildable
|
19
|
-
#
|
20
|
-
|
21
|
-
#
|
22
|
-
|
23
|
-
# method #read must be called on the document
|
24
|
-
# object to load it in.
|
25
|
-
def self.build(file_or_value = '', id = nil)
|
26
|
-
from_anything(file_or_value, id)
|
27
|
-
end
|
24
|
+
# Implement support for #do.
|
25
|
+
include Doable
|
26
|
+
# Implement support for to_s, inspect, etc.
|
27
|
+
include Viewable
|
28
28
|
# Initialize the entity with its value and
|
29
29
|
# (optionally) a unique identifier. By default,
|
30
30
|
# the object_id will be used as id. Also initialize
|
@@ -32,12 +32,10 @@ module Treat
|
|
32
32
|
def initialize(value = '', id = nil)
|
33
33
|
id ||= object_id
|
34
34
|
super(value, id)
|
35
|
+
@type = :entity
|
35
36
|
end
|
36
|
-
# Return a lowercase identifier representing the
|
37
|
-
# type of entity (e.g. :word, :token, etc.)
|
38
|
-
def type; :"#{cl(self.class).downcase}"; end
|
39
37
|
# Catch missing methods to support method-like
|
40
|
-
# access to features (e.g. entity.
|
38
|
+
# access to features (e.g. entity.category instead of
|
41
39
|
# entity.features[:cat]) and to support magic
|
42
40
|
# methods (see #parse_magic_method). If the
|
43
41
|
# feature does not exist
|
@@ -49,8 +47,8 @@ module Treat
|
|
49
47
|
begin
|
50
48
|
super(sym, *args, &block)
|
51
49
|
rescue NoMethodError
|
52
|
-
|
53
|
-
if Categories.
|
50
|
+
return false if sym.to_s[-1] == '?'
|
51
|
+
if Categories.lookup(sym)
|
54
52
|
msg = "Method #{sym} cannot be called on a #{type}."
|
55
53
|
else
|
56
54
|
msg = "Method #{sym} does not exist."
|
@@ -87,77 +85,81 @@ module Treat
|
|
87
85
|
# - nouns_with_*(value)
|
88
86
|
# - noun_with_*(value)
|
89
87
|
#
|
90
|
-
|
91
|
-
# (instead of method chaining) is intentional
|
92
|
-
# and aims to reduce the number of method
|
93
|
-
# dispatches done by Ruby to improve performance.
|
94
|
-
def parse_magic_method(sym, *args, &block)
|
88
|
+
def parse_magic_method(sym, *args)
|
95
89
|
@@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
|
96
|
-
@@cats_regexp ||= "(#{Treat::Languages::
|
90
|
+
@@cats_regexp ||= "(#{Treat::Languages::WordCategories.join('|')})"
|
97
91
|
method = sym.to_s =~ /entities/ ?
|
98
|
-
sym.to_s.gsub('entities', 'entitys'):
|
92
|
+
sym.to_s.gsub('entities', 'entitys') :
|
99
93
|
method = sym.to_s
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
ancestor_with_types(:"#{$1}")
|
104
|
-
end
|
105
|
-
ancestor_with_types(:"#{$1}")
|
106
|
-
elsif method =~ /^each_#{@@entities_regexp}$/
|
107
|
-
each_entity(:"#{$1}") { |entity| yield entity }
|
108
|
-
elsif method =~ /^#{@@entities_regexp}s$/
|
109
|
-
each_entity(:"#{$1}") { |e| a << e }
|
94
|
+
if method =~ /^#{@@entities_regexp}s$/
|
95
|
+
a = []
|
96
|
+
each_entity($1.intern) { |e| a << e }
|
110
97
|
a
|
111
98
|
elsif method =~ /^#{@@entities_regexp}$/
|
112
|
-
|
99
|
+
a = []
|
100
|
+
each_entity($1.intern) { |e| a << e }
|
113
101
|
first_but_warn(a, $1)
|
102
|
+
elsif method =~ /^parent_#{@@entities_regexp}$/
|
103
|
+
ancestor_with_types($1.intern)
|
104
|
+
elsif method =~ /^each_#{@@entities_regexp}$/
|
105
|
+
each_entity($1.intern) { |e| yield e }
|
114
106
|
elsif method =~ /^#{@@entities_regexp}_count$/
|
115
107
|
i = 0
|
116
|
-
each_entity(
|
108
|
+
each_entity($1.intern) { |e| i += 1 }
|
117
109
|
i
|
118
|
-
elsif method =~ /^#{@@entities_regexp}s_with_([a-z]
|
119
|
-
|
120
|
-
|
121
|
-
e.
|
110
|
+
elsif method =~ /^#{@@entities_regexp}s_with_([a-z]+)$/
|
111
|
+
a = []
|
112
|
+
each_entity($1.intern) do |e|
|
113
|
+
a << e if e.has?($2.intern) &&
|
114
|
+
e.send($2.intern) == args[0]
|
122
115
|
end
|
123
116
|
a
|
124
117
|
elsif method =~ /^#{@@entities_regexp}s_with_([a-z]*)$/
|
125
|
-
|
126
|
-
|
127
|
-
e.
|
118
|
+
a = []
|
119
|
+
each_entity($1.intern) do |e|
|
120
|
+
a << e if e.has?($2.intern) &&
|
121
|
+
e.send($2.intern) == args[0]
|
128
122
|
end
|
129
123
|
first_but_warn(a, $1)
|
130
124
|
elsif method =~ /^each_with_([a-z]*)$/
|
131
125
|
each_entity do |e|
|
132
|
-
yield e if e.has?(
|
133
|
-
e.send(
|
126
|
+
yield e if e.has?($1.intern) &&
|
127
|
+
e.send($1.intern) == args[0]
|
134
128
|
end
|
135
129
|
elsif method =~ /^each_#{@@cats_regexp}$/
|
136
|
-
each_entity(:word) { |e| yield e if e.
|
130
|
+
each_entity(:word) { |e| yield e if e.category == $1.intern }
|
137
131
|
elsif method =~ /^#{@@cats_regexp}s$/
|
138
|
-
|
132
|
+
a = []
|
133
|
+
each_entity(:word) { |e| a << e if e.category == $1.intern }
|
139
134
|
a
|
140
135
|
elsif method =~ /^#{@@cats_regexp}$/
|
141
|
-
|
136
|
+
a = []
|
137
|
+
each_entity(:word) { |e| a << e if e.category == $1.intern }
|
142
138
|
first_but_warn(a, $1)
|
143
139
|
elsif method =~ /^#{@@cats_regexp}_count$/
|
144
140
|
i = 0
|
145
|
-
each_entity(:word) { |e| i += 1 if e.
|
141
|
+
each_entity(:word) { |e| i += 1 if e.category == $1.intern }
|
146
142
|
i
|
147
143
|
elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
|
144
|
+
a = []
|
148
145
|
each_entity(:word) do |e|
|
149
|
-
a << e if e.
|
150
|
-
e.has?(
|
146
|
+
a << e if e.category == $1.intern &&
|
147
|
+
e.has?($2.intern) && e.send($2.intern) == args[0]
|
151
148
|
end
|
152
149
|
a
|
153
150
|
elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
|
151
|
+
a = []
|
154
152
|
each_entity(:word) do |e|
|
155
|
-
a << e if e.
|
156
|
-
e.has?(
|
153
|
+
a << e if e.category== $1.intern &&
|
154
|
+
e.has?($2.intern) && e.send($2.intern) == args[0]
|
157
155
|
end
|
158
156
|
first_but_warn(a, $1)
|
157
|
+
elsif method =~ /^is_#{@@entities_regexp}\?$/
|
158
|
+
type.to_s == $1
|
159
|
+
elsif method =~ /^is_#{@@cats_regexp}\?$/
|
160
|
+
category.to_s == $1
|
159
161
|
else
|
160
|
-
:no_magic
|
162
|
+
return :no_magic
|
161
163
|
end
|
162
164
|
end
|
163
165
|
# Add an entity to the current entity.
|
@@ -168,9 +170,9 @@ module Treat
|
|
168
170
|
def <<(entities, clear_parent = true)
|
169
171
|
entities = [entities] unless entities.is_a? Array
|
170
172
|
entities.each do |entity|
|
171
|
-
if entity.is_a?(Treat::Entities::Token) ||
|
172
|
-
entity.is_a?(Treat::Entities::
|
173
|
-
|
173
|
+
if entity.is_a?(Treat::Entities::Token) ||
|
174
|
+
entity.is_a?(Treat::Entities::Phrase)
|
175
|
+
register_token(entity) unless entity.value == ''
|
174
176
|
end
|
175
177
|
end
|
176
178
|
super(entities)
|
@@ -182,39 +184,64 @@ module Treat
|
|
182
184
|
# Note that this function is recursive, unlike
|
183
185
|
# #each. It does not yield the top element being
|
184
186
|
# recursed.
|
187
|
+
#
|
188
|
+
# This function NEEDS to be ported to C (see source).
|
185
189
|
def each_entity(*types)
|
186
|
-
|
187
|
-
|
190
|
+
=begin
|
191
|
+
# Replace with:
|
192
|
+
inline do |builder|
|
193
|
+
|
194
|
+
builder.c_raw <<-EOS, :arity => -1
|
195
|
+
VALUE each_entity_c(int argc, VALUE *types, VALUE self)
|
196
|
+
{
|
197
|
+
|
198
|
+
}
|
199
|
+
EOS
|
200
|
+
end
|
201
|
+
=end
|
202
|
+
types = [:entity] if types.size == 0
|
203
|
+
f = false
|
204
|
+
types.each { |t2| f = true if Treat::Entities.match_types[type][t2] }
|
205
|
+
yield self if f
|
206
|
+
unless @children.size == 0
|
188
207
|
@children.each do |child|
|
189
208
|
child.each_entity(*types) { |y| yield y }
|
190
209
|
end
|
191
210
|
end
|
192
211
|
end
|
193
|
-
# Returns the first ancestor of this
|
194
|
-
# entity that has the given type.
|
212
|
+
# Returns the first ancestor of this entity that has the given type.
|
195
213
|
def ancestor_with_types(*types)
|
196
214
|
ancestor = @parent
|
197
|
-
|
198
|
-
|
199
|
-
|
215
|
+
match_types = lambda do |t1, t2s|
|
216
|
+
f = false
|
217
|
+
t2s.each do |t2|
|
218
|
+
if Treat::Entities.match_types[t1][t2]
|
219
|
+
f = true; break
|
220
|
+
end
|
221
|
+
end
|
222
|
+
f
|
223
|
+
end
|
224
|
+
if ancestor
|
225
|
+
while not match_types.call(ancestor.type, types)
|
226
|
+
return nil unless (ancestor && ancestor.has_parent?)
|
227
|
+
ancestor = ancestor.parent
|
228
|
+
end
|
229
|
+
match_types.call(ancestor.type, types) ? ancestor : nil
|
200
230
|
end
|
201
|
-
match_types(ancestor, types) ? ancestor : nil
|
202
231
|
end
|
203
232
|
alias :ancestor_with_type :ancestor_with_types
|
204
|
-
#
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
def frequency_of(word); statistics(:frequency_of, value: word); end
|
217
|
-
private
|
233
|
+
# Returns the (direct) ancestors of this entity that
|
234
|
+
# have the given type.
|
235
|
+
def ancestors_with_types(*types)
|
236
|
+
ancestor = self
|
237
|
+
ancestors = []
|
238
|
+
while (a = ancestor.ancestor_with_types(*types))
|
239
|
+
ancestors << a
|
240
|
+
ancestor = ancestor.parent
|
241
|
+
end
|
242
|
+
ancestors
|
243
|
+
end
|
244
|
+
alias :ancestors_with_type :ancestors_with_types
|
218
245
|
# Return the first element in the array, warning if not
|
219
246
|
# the only one in the array. Used for magic methods: e.g.,
|
220
247
|
# the magic method "word" if called on a sentence
|
@@ -227,18 +254,6 @@ module Treat
|
|
227
254
|
end
|
228
255
|
array[0]
|
229
256
|
end
|
230
|
-
# Cache a list of the type => class relationships.
|
231
|
-
@@type_classes = {}
|
232
|
-
# Returns true if the node is of the same type or
|
233
|
-
# is a subtype of one of the specified entity types,
|
234
|
-
# which are supplied as identifiers rather than classes.
|
235
|
-
def match_types(node, entity_types)
|
236
|
-
entity_types.each do |type|
|
237
|
-
@@type_classes[type] ||= Entities.const_get(cc(type))
|
238
|
-
return true if node.is_a? @@type_classes[type]
|
239
|
-
end
|
240
|
-
false
|
241
|
-
end
|
242
257
|
end
|
243
258
|
end
|
244
259
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Treat
|
2
|
+
module Entities
|
3
|
+
# Represents any syntactic phrase of a sentence.
|
4
|
+
class Phrase < Entity
|
5
|
+
def initialize(value = '', id = nil)
|
6
|
+
super(value, id)
|
7
|
+
@type = :phrase
|
8
|
+
end
|
9
|
+
end
|
10
|
+
class Sentence < Phrase
|
11
|
+
def initialize(value = '', id = nil)
|
12
|
+
super(value, id)
|
13
|
+
@type = :sentence
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -4,25 +4,24 @@ module Treat
|
|
4
4
|
class Token < Entity
|
5
5
|
# All tokens are leafs.
|
6
6
|
def is_leaf?; true; end
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
def tf_idf; statistics(:tf_idf); end
|
7
|
+
def initialize(value = '', id = nil)
|
8
|
+
super(value, id)
|
9
|
+
@type = :token
|
10
|
+
end
|
12
11
|
end
|
13
12
|
# Represents a word.
|
14
13
|
class Word < Token
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
# Convenience function for declensions.
|
20
|
-
def plural(declensor = nil); declensions(declensor, :count => :plural); end
|
21
|
-
# Convenience function for declensions.
|
22
|
-
def singular(declensor = nil); declensions(declensor, :count => :singular); end
|
14
|
+
def initialize(value = '', id = nil)
|
15
|
+
super(value, id)
|
16
|
+
@type = :word
|
17
|
+
end
|
23
18
|
end
|
24
19
|
# Represents a clitic ('s).
|
25
20
|
class Clitic < Token
|
21
|
+
def initialize(value = '', id = nil)
|
22
|
+
super(value, id)
|
23
|
+
@type = :clitic
|
24
|
+
end
|
26
25
|
end
|
27
26
|
# Represents a number.
|
28
27
|
class Number < Token
|
@@ -30,17 +29,33 @@ module Treat
|
|
30
29
|
def to_i; to_s.to_i; end
|
31
30
|
# Convert the number to a float.
|
32
31
|
def to_f; to_s.to_f; end
|
32
|
+
def initialize(value = '', id = nil)
|
33
|
+
super(value, id)
|
34
|
+
@type = :number
|
35
|
+
end
|
33
36
|
end
|
34
37
|
# Represents a punctuation sign.
|
35
38
|
class Punctuation < Token
|
39
|
+
def initialize(value = '', id = nil)
|
40
|
+
super(value, id)
|
41
|
+
@type = :punctuation
|
42
|
+
end
|
36
43
|
end
|
37
44
|
# Represents a character that is neither
|
38
45
|
# alphabetical, numerical or a punctuation
|
39
46
|
# character (e.g. @#$%&*).
|
40
47
|
class Symbol < Token
|
48
|
+
def initialize(value = '', id = nil)
|
49
|
+
super(value, id)
|
50
|
+
@type = :symbol
|
51
|
+
end
|
41
52
|
end
|
42
53
|
# Represents an entity of unknown type.
|
43
54
|
class Unknown < Token
|
55
|
+
def initialize(value = '', id = nil)
|
56
|
+
super(value, id)
|
57
|
+
@type = :unknown
|
58
|
+
end
|
44
59
|
end
|
45
60
|
end
|
46
61
|
end
|
data/lib/treat/entities/zones.rb
CHANGED
@@ -3,19 +3,39 @@ module Treat
|
|
3
3
|
# Represents a zone of text
|
4
4
|
# (Title, Paragraph, List, Quote).
|
5
5
|
class Zone < Entity
|
6
|
+
def initialize(value = '', id = nil)
|
7
|
+
super(value, id)
|
8
|
+
@type = :zone
|
9
|
+
end
|
6
10
|
end
|
7
11
|
# Represents a title, subtitle, logical header.
|
8
12
|
class Title < Zone
|
13
|
+
def initialize(value = '', id = nil)
|
14
|
+
super(value, id)
|
15
|
+
@type = :title
|
16
|
+
end
|
9
17
|
end
|
10
18
|
# Represents a paragraph.
|
11
19
|
class Paragraph < Zone
|
20
|
+
def initialize(value = '', id = nil)
|
21
|
+
super(value, id)
|
22
|
+
@type = :paragraph
|
23
|
+
end
|
12
24
|
end
|
13
25
|
# Represents a list.
|
14
26
|
class List < Zone
|
27
|
+
def initialize(value = '', id = nil)
|
28
|
+
super(value, id)
|
29
|
+
@type = :list
|
30
|
+
end
|
15
31
|
end
|
16
32
|
# Represents a section, usually with a title
|
17
33
|
# and at least one paragraph.
|
18
34
|
class Section < Zone
|
35
|
+
def initialize(value = '', id = nil)
|
36
|
+
super(value, id)
|
37
|
+
@type = :section
|
38
|
+
end
|
19
39
|
end
|
20
40
|
end
|
21
41
|
end
|