treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -0,0 +1,83 @@
|
|
1
|
+
module Treat::Entities::Abilities::Magical
|
2
|
+
|
3
|
+
# Parse "magic methods", which allow the following
|
4
|
+
# syntaxes to be used (where 'word' can be replaced
|
5
|
+
# by any entity type, e.g. token, zone, etc.):
|
6
|
+
#
|
7
|
+
# - each_word : iterate over each entity of type word.
|
8
|
+
# - words: return an array of words in the entity.
|
9
|
+
# - word: return the first word in the entity.
|
10
|
+
# - word_count: return the number of words in the entity.
|
11
|
+
# - words_with_*(value) (where * is an arbitrary feature):
|
12
|
+
# return the words that have the given feature.
|
13
|
+
# - word_with_*(value) : return the first word with
|
14
|
+
# the feature specified by * in value.
|
15
|
+
#
|
16
|
+
# Also provides magical methods for types of words:
|
17
|
+
#
|
18
|
+
# - each_noun:
|
19
|
+
# - nouns:
|
20
|
+
# - noun:
|
21
|
+
# - noun_count:
|
22
|
+
# - nouns_with_*(value)
|
23
|
+
# - noun_with_*(value)
|
24
|
+
#
|
25
|
+
# Resolve a "magic" method name and run the matching entity query.
#
# sym  - the name of the method that was called.
# args - the arguments of that call; only args[0] is read, as the
#        feature value for the *_with_* forms.
#
# The each_* forms yield every matching entity to the caller's block.
#
# Returns the result of the matching query, or :no_magic when the
# name matches none of the supported patterns.
def magic(sym, *args)

  # Alternation strings, memoized in class variables.
  @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
  @@cats_regexp ||= "(#{Treat::Linguistics::WordCategories.join('|')})"

  # gsub returns an unchanged copy when 'entities' does not occur.
  # Presumably 'entitys' lets the plural patterns below match
  # 'entity' + 's' -- confirm against Treat::Entities.list.
  method = sym.to_s.gsub('entities', 'entitys')

  if method =~ /^#{@@entities_regexp}s$/
    entities_with_type($1.intern)
  elsif method =~ /^#{@@entities_regexp}$/
    first_but_warn(entities_with_type($1.intern), $1)
  elsif method =~ /^parent_#{@@entities_regexp}$/
    ancestor_with_type($1.intern)
  elsif method =~ /^each_#{@@entities_regexp}$/
    each_entity($1.intern) { |e| yield e }
  elsif method =~ /^#{@@entities_regexp}_count$/
    entities_with_type($1.intern).size
  elsif method =~ /^#{@@entities_regexp}s_with_([a-z]+)$/
    entities_with_feature($2.intern, args[0], $1.intern)
  elsif method =~ /^#{@@entities_regexp}_with_([a-z]*)$/
    first_but_warn(entities_with_feature(
      $2.intern, args[0], $1.intern), $1)
  elsif method =~ /^each_#{@@entities_regexp}_with_([a-z]*)$/
    entities_with_feature($2.intern, args[0],
      $1.intern).each { |e| yield e }
  elsif method =~ /^each_with_([a-z]*)$/
    # This pattern has a single capture group: the feature name.
    # (Reading $2 here yielded nil and raised on nil.intern.)
    # NOTE(review): :entity is passed as the catch-all type, as the
    # other calls always pass a type -- confirm against the
    # signature of #entities_with_feature.
    entities_with_feature($1.intern,
      args[0], :entity).each { |e| yield e }
  elsif method =~ /^each_#{@@cats_regexp}$/
    entities_with_category($1.intern).each { |e| yield e }
  elsif method =~ /^#{@@cats_regexp}s$/
    entities_with_category($1.intern)
  elsif method =~ /^#{@@cats_regexp}$/
    first_but_warn(entities_with_category($1.intern), $1)
  elsif method =~ /^#{@@cats_regexp}_count$/
    entities_with_category($1.intern).size
  elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
    # NOTE(review): this branch and the next pass a word category
    # where the entity branches pass an entity type -- verify that
    # #entities_with_feature is meant to accept a category here.
    entities_with_feature($2.intern, args[0], $1.intern)
  elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
    first_but_warn(entities_with_feature(
      $2.intern, args[0], $1.intern), $1)
  elsif method =~ /^([a-z]*)_of_first_#{@@entities_regexp}$/
    # Copy the captures before dispatching to another method.
    feature, type = $1, $2
    f = send(:"#{type}s").first
    f ? f.send(feature.intern) : nil
  elsif method =~ /^frequency_in_#{@@entities_regexp}$/
    frequency_in($1.intern)
  # Names listed here in the original without a matching branch:
  # first_word
  # tag_of_first_verb
  # tag_of_title
  else
    return :no_magic
  end
end
|
81
|
+
|
82
|
+
|
83
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# Registers occurrences of textual values inside
|
2
|
+
# all child entities. Useful to calculate frequency.
|
3
|
+
module Treat::Entities::Abilities::Registrable
|
4
|
+
|
5
|
+
# Registers a token in the @registry hash.
|
6
|
+
# Record an entity in this node's @registry, then pass it on to
# the parent node (if any) so every ancestor records it as well.
#
# For tokens and phrases the lowercased string value is counted
# under :value. For every entity the id is flagged under :id, its
# type is counted under :type, and its insertion rank (@count) is
# stored under :position.
def register(entity)

  case entity
  when Treat::Entities::Token, Treat::Entities::Phrase
    key = entity.to_s.downcase
    values = @registry[:value]
    values[key] ||= 0
    values[key] += 1
  end

  @registry[:id][entity.id] = true

  types = @registry[:type]
  types[entity.type] ||= 0
  types[entity.type] += 1

  @registry[:position][entity.id] = @count
  @count += 1

  return unless has_parent?
  @parent.register(entity)

end
|
24
|
+
|
25
|
+
# Backtrack up the tree to find a token registry,
|
26
|
+
# by default the one in the root node of any entity.
|
27
|
+
# Return a registry hash, walking up the tree to find it.
#
# The walk stops at the first node that has no parent or whose
# type equals the requested type; with the default (nil) it
# continues until a node without a parent is reached.
def registry(type = nil)
  return @registry unless has_parent?
  return @registry if type == self.type
  @parent.registry(type)
end
|
35
|
+
|
36
|
+
# Look up an id in this node's own registry. Returns whatever is
# stored under :id for it (register stores true), or nil when the
# id was never registered.
def contains_id?(id)
  known_ids = @registry[:id]
  known_ids[id]
end
|
41
|
+
|
42
|
+
# True if the given value has an entry under :value in this
# node's own registry, false otherwise. The lookup uses val as
# given (register stores lowercased strings).
def contains_value?(val)
  !!@registry[:value][val]
end
|
48
|
+
|
49
|
+
# True if an entity of the given type has been registered in this
# node, either under exactly that type or under a registered type
# that Treat::Entities.match_types maps to it.
def contains_type?(type1)

  seen = @registry[:type]
  return true if seen[type1]

  seen.keys.any? do |type2|
    Treat::Entities.match_types[type1][type2]
  end

end
|
63
|
+
|
64
|
+
# True if at least one of the given types passes #contains_type?,
# false otherwise (including for an empty list).
def contains_types?(types)
  types.any? { |type| contains_type?(type) }
end
|
73
|
+
|
74
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# Gives entities the ability to be converted
|
2
|
+
# to string representations (#to_string, #to_s,
|
3
|
+
# #to_str, #inspect, #print_tree).
|
4
|
+
module Treat::Entities::Abilities::Stringable
|
5
|
+
|
6
|
+
# Return the entity's true string value in
|
7
|
+
# plain text format. Non-terminal entities
|
8
|
+
# will normally have an empty value.
|
9
|
+
def to_string
  # The raw @value, without imploding the subtree.
  @value
end
|
10
|
+
|
11
|
+
# Returns the entity's string value by
|
12
|
+
# imploding the value of all terminal
|
13
|
+
# entities in the subtree of that entity.
|
14
|
+
def to_s
|
15
|
+
@value != '' ? @value : implode.strip
|
16
|
+
end
|
17
|
+
|
18
|
+
# #to_str is the same as #to_s.
|
19
|
+
alias :to_str :to_s
|
20
|
+
|
21
|
+
# Return a shortened value of the entity's
|
22
|
+
# string value using [...], with a cutoff
|
23
|
+
# number of words or length.
|
24
|
+
# Return a shortened form of #to_s for display.
#
# max_length - strings shorter than this are returned unchanged.
#
# Longer strings are reduced to their first three and last two
# whitespace-separated words, joined by ' [...] '. A string with
# five words or fewer has nothing to elide and is returned
# unchanged: slicing it used to repeat words (2 to 4 words) or
# raise on nil (fewer than 2 words).
def short_value(max_length = 30)
  s = to_s
  return s if s.length < max_length

  words = s.split(' ')
  return s if words.size <= 5

  words.first(3).join(' ') + ' [...] ' +
    words.last(2).join(' ')
end
|
34
|
+
|
35
|
+
# Return an informative string representation
|
36
|
+
# of the entity.
|
37
|
+
def inspect
|
38
|
+
s = "#{cl(self.class)} (#{@id.to_s})"
|
39
|
+
if caller_method(2) == :inspect
|
40
|
+
@id.to_s
|
41
|
+
else
|
42
|
+
dependencies = []
|
43
|
+
@dependencies.each do |dependency|
|
44
|
+
dependencies <<
|
45
|
+
"#{dependency.target}#{dependency.type}"
|
46
|
+
end
|
47
|
+
s += " --- #{short_value.inspect}" +
|
48
|
+
" --- #{@features.inspect} " +
|
49
|
+
" --- #{dependencies.inspect} "
|
50
|
+
end
|
51
|
+
s
|
52
|
+
end
|
53
|
+
|
54
|
+
# Print out an ASCII representation of the tree.
|
55
|
+
def print_tree
  # Writes the :tree visualization to standard output.
  puts(visualize(:tree))
end
|
56
|
+
|
57
|
+
# Helper method to implode the string value of the subtree.
|
58
|
+
# Build the string value of the subtree under this entity.
#
# A childless entity returns a copy of its own @value. Otherwise
# each child contributes, in order:
# - "\n\n" before it, if it is a Section;
# - its #to_s plus a space, if it is a Token or has a non-empty
#   value (trailing whitespace is stripped first for Punctuation
#   and Clitic); otherwise its own #implode;
# - "\n\n" after it, if it is a Title or a Paragraph.
def implode

  return @value.dup unless has_children?

  text = ''

  each do |node|

    text += "\n\n" if node.is_a?(Treat::Entities::Section)

    if !node.is_a?(Treat::Entities::Token) && node.value == ''
      text += node.implode
    else
      case node
      when Treat::Entities::Punctuation, Treat::Entities::Clitic
        text.strip!
      end
      text += node.to_s + ' '
    end

    case node
    when Treat::Entities::Title, Treat::Entities::Paragraph
      text += "\n\n"
    end

  end

  text

end
|
90
|
+
|
91
|
+
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# Declares the concrete entity classes, all under Treat::Entities.
module Treat::Entities

  # Require the generic entity class.
  require 'treat/entities/entity'

  # Represents a collection of texts.
  class Collection < Entity

    # Initialize the collection with a folder
    # containing the texts of the collection.
    #
    # folder - path of the folder, or nil (the default).
    # id     - optional identifier, passed on to Entity.
    #
    # :folder is always set. :index is set to
    # "<folder>/.index" only when a folder was given
    # and that path is an existing directory.
    def initialize(folder = nil, id = nil)
      super('', id)
      set :folder, folder
      # Without a folder there is no index path to
      # look for (nil + '/.index' used to raise).
      unless folder.nil?
        i = folder + '/.index'
        set :index, i if FileTest.directory?(i)
      end
    end

    # Works like the default <<, but if the
    # file being added is a collection or a
    # document, then copy that collection or
    # document into this collection's folder.
    #
    # entities - one entity or an Array of entities.
    # copy     - when false, #copy_into is skipped.
    #
    # NOTE(review): the value returned by #copy_into
    # is not used (the original assigned it to the
    # block-local variable only), so the entities
    # passed in are the ones that get added. Confirm
    # whether the copies were meant to be added.
    def <<(entities, copy = true)
      unless entities.is_a? Array
        entities = [entities]
      end
      entities.each do |entity|
        if [:document, :collection].
          include?(entity.type) && copy
          entity.copy_into(self)
        end
      end
      super(entities)
    end

  end

  # Represents a document.
  class Document < Entity

    # Initialize the document; the given file
    # (nil by default) is stored as :file.
    def initialize(file = nil, id = nil)
      super('', id)
      set :file, file
    end

  end

  # Represents a section, usually with a title
  # and at least one paragraph.
  class Section < Entity; end

  # Represents a zone of text
  # (Title, Paragraph, List, Quote).
  class Zone < Entity; end

  # Represents a title, subtitle, logical header.
  class Title < Zone; end

  # Represents a paragraph.
  class Paragraph < Zone; end

  # Represents a list.
  class List < Zone; end

  # Represents a group of words.
  class Phrase < Entity; end

  # Represents a group of words with a sentence ender.
  class Sentence < Phrase; end

  # Represents a terminal element in the text structure.
  class Token < Entity
  end

  # Represents a word.
  class Word < Token
  end

  # Represents a clitic ('s).
  class Clitic < Token; end

  # Represents a number.
  class Number < Token
    # Numeric conversions of the string value.
    def to_i; to_s.to_i; end
    def to_f; to_s.to_f; end
  end

  # Represents a punctuation sign.
  class Punctuation < Token; end

  # Represents a character that is neither
  # alphabetical, numerical or a punctuation
  # character (e.g. @#$%&*).
  class Symbol < Token; end

  # Represents a url.
  class Url < Token; end

  # Represents a valid RFC822 address.
  class Email < Token; end

  # Represents an entity of unknown type.
  # NOTE(review): unlike every other class here this
  # one does not inherit from Entity -- confirm that
  # this is intended.
  class Unknown; end

end
|
@@ -1,258 +1,135 @@
|
|
1
|
-
|
2
|
-
require 'treat/feature'
|
3
|
-
require 'treat/delegatable'
|
4
|
-
require 'treat/visitable'
|
5
|
-
require 'treat/registrable'
|
6
|
-
require 'treat/buildable'
|
7
|
-
require 'treat/doable'
|
8
|
-
require 'treat/viewable'
|
9
|
-
require 'treat/features'
|
1
|
+
module Treat::Entities
|
10
2
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
# - nouns:
|
84
|
-
# - noun:
|
85
|
-
# - noun_count:
|
86
|
-
# - nouns_with_*(value)
|
87
|
-
# - noun_with_*(value)
|
88
|
-
#
|
89
|
-
def parse_magic_method(sym, *args)
|
90
|
-
@@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
|
91
|
-
@@cats_regexp ||= "(#{Treat::Languages::WordCategories.join('|')})"
|
92
|
-
method = sym.to_s =~ /entities/ ?
|
93
|
-
sym.to_s.gsub('entities', 'entitys') :
|
94
|
-
method = sym.to_s
|
95
|
-
if method =~ /^#{@@entities_regexp}s$/
|
96
|
-
a = []
|
97
|
-
each_entity($1.intern) { |e| a << e }
|
98
|
-
a
|
99
|
-
elsif method =~ /^#{@@entities_regexp}$/
|
100
|
-
a = []
|
101
|
-
each_entity($1.intern) { |e| a << e }
|
102
|
-
first_but_warn(a, $1)
|
103
|
-
elsif method =~ /^parent_#{@@entities_regexp}$/
|
104
|
-
ancestor_with_types($1.intern)
|
105
|
-
elsif method =~ /^each_#{@@entities_regexp}$/
|
106
|
-
each_entity($1.intern) { |e| yield e }
|
107
|
-
elsif method =~ /^#{@@entities_regexp}_count$/
|
108
|
-
i = 0
|
109
|
-
each_entity($1.intern) { |e| i += 1 }
|
110
|
-
i
|
111
|
-
elsif method =~ /^#{@@entities_regexp}s_with_([a-z]+)$/
|
112
|
-
a = []
|
113
|
-
each_entity($1.intern) do |e|
|
114
|
-
a << e if e.has?($2.intern) &&
|
115
|
-
e.send($2.intern) == args[0]
|
116
|
-
end
|
117
|
-
a
|
118
|
-
elsif method =~ /^#{@@entities_regexp}_with_([a-z]*)$/
|
119
|
-
a = []
|
120
|
-
each_entity($1.intern) do |e|
|
121
|
-
a << e if e.has?($2.intern) &&
|
122
|
-
e.send($2.intern) == args[0]
|
123
|
-
end
|
124
|
-
first_but_warn(a, $1)
|
125
|
-
elsif method =~ /^each_with_([a-z]*)$/
|
126
|
-
each_entity do |e|
|
127
|
-
yield e if e.has?($1.intern) &&
|
128
|
-
e.send($1.intern) == args[0]
|
129
|
-
end
|
130
|
-
elsif method =~ /^each_#{@@cats_regexp}$/
|
131
|
-
each_entity(:word) { |e| yield e if e.category == $1.intern }
|
132
|
-
elsif method =~ /^#{@@cats_regexp}s$/
|
133
|
-
a = []
|
134
|
-
each_entity(:word) { |e| a << e if e.category == $1.intern }
|
135
|
-
a
|
136
|
-
elsif method =~ /^#{@@cats_regexp}$/
|
137
|
-
a = []
|
138
|
-
each_entity(:word) { |e| a << e if e.category == $1.intern }
|
139
|
-
first_but_warn(a, $1)
|
140
|
-
elsif method =~ /^#{@@cats_regexp}_count$/
|
141
|
-
i = 0
|
142
|
-
each_entity(:word) { |e| i += 1 if e.category == $1.intern }
|
143
|
-
i
|
144
|
-
elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
|
145
|
-
a = []
|
146
|
-
each_entity(:word) do |e|
|
147
|
-
a << e if e.category == $1.intern &&
|
148
|
-
e.has?($2.intern) && e.send($2.intern) == args[0]
|
149
|
-
end
|
150
|
-
a
|
151
|
-
elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
|
152
|
-
a = []
|
153
|
-
each_entity(:word) do |e|
|
154
|
-
a << e if e.category== $1.intern &&
|
155
|
-
e.has?($2.intern) && e.send($2.intern) == args[0]
|
156
|
-
end
|
157
|
-
first_but_warn(a, $1)
|
158
|
-
elsif method =~ /^is_#{@@entities_regexp}\?$/
|
159
|
-
type.to_s == $1
|
160
|
-
elsif method =~ /^is_#{@@cats_regexp}\?$/
|
161
|
-
category.to_s == $1
|
162
|
-
else
|
163
|
-
return :no_magic
|
164
|
-
end
|
3
|
+
# Require base class for Entity.
|
4
|
+
require 'treat/tree'
|
5
|
+
|
6
|
+
class Entity < Treat::Tree::Node
|
7
|
+
|
8
|
+
# A Symbol representing the lowercase
|
9
|
+
# version of the class name.
|
10
|
+
attr_accessor :type
|
11
|
+
|
12
|
+
# Require abilities.
|
13
|
+
require 'treat/entities/abilities'
|
14
|
+
|
15
|
+
# Implements support for #register,
|
16
|
+
# #registry, and #contains_* methods.
|
17
|
+
include Abilities::Registrable
|
18
|
+
|
19
|
+
# Implement support for #self.add_workers
|
20
|
+
extend Abilities::Delegatable
|
21
|
+
|
22
|
+
# Implement support for #self.print_debug and
|
23
|
+
# #self.invalid_call_msg
|
24
|
+
extend Abilities::Debuggable
|
25
|
+
|
26
|
+
# Implement support for #self.build
|
27
|
+
# and #self.from_*
|
28
|
+
extend Abilities::Buildable
|
29
|
+
|
30
|
+
# Implement support for #do.
|
31
|
+
include Abilities::Doable
|
32
|
+
|
33
|
+
# Implement support for #frequency,
|
34
|
+
# #frequency_in_parent and #position_in_parent.
|
35
|
+
include Abilities::Countable
|
36
|
+
|
37
|
+
# Implement support for #magic.
|
38
|
+
include Abilities::Magical
|
39
|
+
|
40
|
+
# Implement support for #to_s, #inspect, etc.
|
41
|
+
include Abilities::Stringable
|
42
|
+
|
43
|
+
# Implement support for #check_has
|
44
|
+
# and #check_hasnt_children?
|
45
|
+
include Abilities::Checkable
|
46
|
+
|
47
|
+
# Implement support for #each_entity, as well as
|
48
|
+
# #entities_with_type, #ancestors_with_type,
|
49
|
+
# #entities_with_feature, #entities_with_category.
|
50
|
+
include Abilities::Iterable
|
51
|
+
|
52
|
+
# Implement support for #export to export
|
53
|
+
# a line of a data set based on a classification.
|
54
|
+
include Abilities::Exportable
|
55
|
+
|
56
|
+
# Implement support for #copy_into.
|
57
|
+
include Abilities::Copyable
|
58
|
+
|
59
|
+
# Initialize the entity with its value and
|
60
|
+
# (optionally) a unique identifier. By default,
|
61
|
+
# the object_id will be used as id.
|
62
|
+
def initialize(value = '', id = nil)
|
63
|
+
id ||= object_id
|
64
|
+
super(value, id)
|
65
|
+
@type = :entity if self == Entity
|
66
|
+
@type ||= ucc(cl(self.class)).intern
|
67
|
+
unless is_a?(Treat::Entities::Token)
|
68
|
+
@count = 0
|
69
|
+
@registry = {
|
70
|
+
:id => {},
|
71
|
+
:value => {},
|
72
|
+
:type => {},
|
73
|
+
:position => {}
|
74
|
+
}
|
165
75
|
end
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
end
|
178
|
-
end
|
179
|
-
super(entities)
|
180
|
-
@parent.value = '' if has_parent?
|
181
|
-
entities[0]
|
76
|
+
end
|
77
|
+
|
78
|
+
|
79
|
+
# Add an entity to the current entity.
|
80
|
+
# Registers the entity in the root node
|
81
|
+
# token registry if the entity is a leaf.
|
82
|
+
#
|
83
|
+
# @see Treat::Entities::Abilities::Registrable
|
84
|
+
def <<(entities, clear_parent = true)
|
85
|
+
unless entities.is_a? Array
|
86
|
+
entities = [entities]
|
182
87
|
end
|
183
|
-
|
184
|
-
|
185
|
-
# Note that this function is recursive, unlike
|
186
|
-
# #each. It does not yield the top element being
|
187
|
-
# recursed.
|
188
|
-
#
|
189
|
-
# This function NEEDS to be ported to C (see source).
|
190
|
-
def each_entity(*types)
|
191
|
-
types = [:entity] if types.size == 0
|
192
|
-
f = false
|
193
|
-
types.each { |t2| f = true if Treat::Entities.match_types[t2][type] }
|
194
|
-
yield self if f
|
195
|
-
unless @children.size == 0
|
196
|
-
@children.each do |child|
|
197
|
-
child.each_entity(*types) { |y| yield y }
|
198
|
-
end
|
199
|
-
end
|
88
|
+
entities.each do |entity|
|
89
|
+
register(entity)
|
200
90
|
end
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
# builder.c_raw <<-EOS, :arity => -1
|
91
|
+
super(entities)
|
92
|
+
@parent.value = '' if has_parent?
|
93
|
+
entities[0]
|
94
|
+
end
|
206
95
|
|
207
96
|
|
97
|
+
# Catch missing methods to support method-like
|
98
|
+
# access to features (e.g. entity.category
|
99
|
+
# instead of entity.features[:category]) and to
|
100
|
+
# support magic methods (see #magic).
|
101
|
+
#
|
102
|
+
# If the feature or magic method does not exist,
|
103
|
+
# or can't be parsed, raises an exception.
|
104
|
+
#
|
105
|
+
# Also catches the "empty" method call (e.g.
|
106
|
+
# Word('hello') or Word 'hello') as syntactic
|
107
|
+
# sugar for the #self.build method.
|
108
|
+
def method_missing(sym, *args, &block)
|
109
|
+
return self.build(*args) if sym == nil
|
208
110
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
if ancestor
|
224
|
-
while not match_types.call(ancestor.type, types)
|
225
|
-
return nil unless (ancestor && ancestor.has_parent?)
|
226
|
-
ancestor = ancestor.parent
|
111
|
+
if !@features.has_key?(sym)
|
112
|
+
r = magic(sym, *args, &block)
|
113
|
+
return r unless r == :no_magic
|
114
|
+
begin
|
115
|
+
super(sym, *args, &block)
|
116
|
+
rescue NoMethodError
|
117
|
+
raise Treat::Exception,
|
118
|
+
if Treat::Categories.lookup(sym)
|
119
|
+
msg = "Method #{sym} cannot " +
|
120
|
+
"be called on a #{type}."
|
121
|
+
else
|
122
|
+
msg = "Method #{sym} does not exist."
|
123
|
+
msg += did_you_mean?(
|
124
|
+
Treat::Categories.methods, sym)
|
227
125
|
end
|
228
|
-
match_types.call(ancestor.type, types) ? ancestor : nil
|
229
|
-
end
|
230
|
-
end
|
231
|
-
alias :ancestor_with_type :ancestor_with_types
|
232
|
-
# Returns the (direct) ancestors of this entity that
|
233
|
-
# have the given type.
|
234
|
-
def ancestors_with_types(*types)
|
235
|
-
ancestor = self
|
236
|
-
ancestors = []
|
237
|
-
while (a = ancestor.ancestor_with_types(*types))
|
238
|
-
ancestors << a
|
239
|
-
ancestor = ancestor.parent
|
240
126
|
end
|
241
|
-
|
242
|
-
|
243
|
-
alias :ancestors_with_type :ancestors_with_types
|
244
|
-
# Return the first element in the array, warning if not
|
245
|
-
# the only one in the array. Used for magic methods: e.g.,
|
246
|
-
# the magic method "word" if called on a sentence
|
247
|
-
# with many words, Treat will return the first word
|
248
|
-
# but warn the user.
|
249
|
-
def first_but_warn(array, type)
|
250
|
-
if array.size > 1
|
251
|
-
warn "Warning: requested one #{type}, but" +
|
252
|
-
" there are many #{type}s in the given entity."
|
253
|
-
end
|
254
|
-
array[0]
|
127
|
+
else
|
128
|
+
@features[sym]
|
255
129
|
end
|
130
|
+
|
256
131
|
end
|
132
|
+
|
257
133
|
end
|
134
|
+
|
258
135
|
end
|