treat 1.2.0 → 2.0.0rc1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +2 -2
- data/README.md +12 -21
- data/lib/treat/autoload.rb +44 -0
- data/lib/treat/config/config.rb +38 -0
- data/lib/treat/config/configurable.rb +51 -0
- data/lib/treat/config/data/config.rb +50 -0
- data/lib/treat/config/data/core.rb +52 -0
- data/lib/treat/config/data/databases.rb +10 -0
- data/lib/treat/config/data/entities.rb +15 -0
- data/lib/treat/config/data/languages/agnostic.rb +31 -0
- data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
- data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
- data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
- data/lib/treat/config/data/languages/english.rb +95 -0
- data/lib/treat/config/data/languages/french.rb +148 -0
- data/lib/treat/config/data/languages/german.rb +135 -0
- data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
- data/lib/treat/config/data/languages/italian.rb +162 -0
- data/lib/treat/config/data/languages/polish.rb +11 -0
- data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
- data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
- data/lib/treat/config/data/languages/spanish.rb +291 -0
- data/lib/treat/config/data/languages/swedish.rb +289 -0
- data/lib/treat/config/data/libraries.rb +12 -0
- data/lib/treat/config/data/linguistics.rb +44 -0
- data/lib/treat/config/data/tags.rb +328 -0
- data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
- data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
- data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
- data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
- data/lib/treat/config/importable.rb +31 -0
- data/lib/treat/config/paths.rb +23 -0
- data/lib/treat/config/tags.rb +37 -0
- data/lib/treat/core/dsl.rb +55 -0
- data/lib/treat/{installer.rb → core/installer.rb} +10 -12
- data/lib/treat/core/server.rb +40 -0
- data/lib/treat/entities/entities.rb +101 -0
- data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
- data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
- data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
- data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
- data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
- data/lib/treat/entities/entity/debuggable.rb +86 -0
- data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
- data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
- data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
- data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
- data/lib/treat/entities/entity/registrable.rb +36 -0
- data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
- data/lib/treat/entities/entity.rb +86 -77
- data/lib/treat/exception.rb +3 -0
- data/lib/treat/helpers/hash.rb +29 -0
- data/lib/treat/helpers/help.rb +35 -0
- data/lib/treat/helpers/object.rb +55 -0
- data/lib/treat/helpers/string.rb +124 -0
- data/lib/treat/{core → learning}/data_set.rb +11 -11
- data/lib/treat/{core → learning}/export.rb +3 -3
- data/lib/treat/{core → learning}/problem.rb +26 -16
- data/lib/treat/{core → learning}/question.rb +5 -9
- data/lib/treat/loaders/linguistics.rb +8 -9
- data/lib/treat/loaders/stanford.rb +5 -11
- data/lib/treat/modules.rb +33 -0
- data/lib/treat/proxies/array.rb +27 -0
- data/lib/treat/proxies/language.rb +47 -0
- data/lib/treat/proxies/number.rb +18 -0
- data/lib/treat/proxies/proxy.rb +25 -0
- data/lib/treat/proxies/string.rb +18 -0
- data/lib/treat/version.rb +10 -1
- data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
- data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
- data/lib/treat/workers/extractors/language/what_language.rb +8 -6
- data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
- data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
- data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
- data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
- data/lib/treat/workers/extractors/time/chronic.rb +2 -4
- data/lib/treat/workers/extractors/time/nickel.rb +19 -20
- data/lib/treat/workers/extractors/time/ruby.rb +2 -1
- data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
- data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
- data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
- data/lib/treat/workers/formatters/readers/image.rb +19 -9
- data/lib/treat/workers/formatters/readers/odt.rb +2 -1
- data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
- data/lib/treat/workers/formatters/readers/xml.rb +0 -1
- data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
- data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
- data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
- data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
- data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
- data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
- data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
- data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
- data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
- data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
- data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
- data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
- data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
- data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
- data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
- data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
- data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
- data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
- data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
- data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
- data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
- data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
- data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
- data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
- data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
- data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
- data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
- data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
- data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
- data/lib/treat/workers/processors/chunkers/html.rb +1 -6
- data/lib/treat/workers/processors/parsers/enju.rb +2 -4
- data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
- data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
- data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
- data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
- data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
- data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
- data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
- data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
- data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
- data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
- data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
- data/lib/treat/workers/workers.rb +6 -0
- data/lib/treat.rb +18 -32
- data/models/MANIFEST +1 -0
- data/spec/core/data_set.rb +174 -0
- data/spec/core/export.rb +52 -0
- data/spec/core/problem.rb +144 -0
- data/spec/core/question.rb +52 -0
- data/spec/{collection.rb → entities/collection.rb} +20 -35
- data/spec/{document.rb → entities/document.rb} +3 -54
- data/spec/{entity.rb → entities/entity.rb} +10 -9
- data/spec/entities/phrase.rb +33 -0
- data/spec/{token.rb → entities/token.rb} +0 -57
- data/spec/entities/word.rb +3 -0
- data/spec/{zone.rb → entities/zone.rb} +0 -26
- data/spec/helper.rb +116 -32
- data/spec/sandbox.rb +258 -25
- data/spec/treat.rb +26 -34
- data/spec/workers/agnostic.rb +137 -0
- data/spec/workers/english.rb +194 -0
- data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
- data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
- data/spec/workers/examples/english/phrase.xml +5 -0
- data/spec/workers/examples/english/test.txt +1 -0
- data/spec/workers/language.rb +280 -0
- data/spec/workers.rb +28 -0
- metadata +122 -105
- data/lib/treat/config/core/acronyms.rb +0 -5
- data/lib/treat/config/core/encodings.rb +0 -8
- data/lib/treat/config/core/entities.rb +0 -2
- data/lib/treat/config/core/language.rb +0 -3
- data/lib/treat/config/core/paths.rb +0 -8
- data/lib/treat/config/core/syntax.rb +0 -1
- data/lib/treat/config/core/verbosity.rb +0 -1
- data/lib/treat/config/databases/default.rb +0 -1
- data/lib/treat/config/databases/mongo.rb +0 -1
- data/lib/treat/config/languages/agnostic.rb +0 -34
- data/lib/treat/config/languages/english.rb +0 -60
- data/lib/treat/config/languages/french.rb +0 -18
- data/lib/treat/config/languages/german.rb +0 -18
- data/lib/treat/config/languages/italian.rb +0 -12
- data/lib/treat/config/languages/polish.rb +0 -12
- data/lib/treat/config/languages/spanish.rb +0 -12
- data/lib/treat/config/languages/swedish.rb +0 -12
- data/lib/treat/config/libraries/punkt.rb +0 -1
- data/lib/treat/config/libraries/reuters.rb +0 -1
- data/lib/treat/config/libraries/stanford.rb +0 -1
- data/lib/treat/config/linguistics/categories.rb +0 -4
- data/lib/treat/config/linguistics/punctuation.rb +0 -33
- data/lib/treat/config/tags/aligned.rb +0 -221
- data/lib/treat/config/tags/enju.rb +0 -71
- data/lib/treat/config/tags/paris7.rb +0 -17
- data/lib/treat/config/tags/ptb.rb +0 -15
- data/lib/treat/config/workers/list.rb +0 -1
- data/lib/treat/config.rb +0 -135
- data/lib/treat/core.rb +0 -5
- data/lib/treat/entities/abilities/copyable.rb +0 -47
- data/lib/treat/entities/abilities/debuggable.rb +0 -83
- data/lib/treat/entities/abilities/registrable.rb +0 -46
- data/lib/treat/entities/collection.rb +0 -40
- data/lib/treat/entities/document.rb +0 -10
- data/lib/treat/entities/group.rb +0 -18
- data/lib/treat/entities/section.rb +0 -13
- data/lib/treat/entities/token.rb +0 -47
- data/lib/treat/entities/zone.rb +0 -12
- data/lib/treat/entities.rb +0 -6
- data/lib/treat/helpers/didyoumean.rb +0 -57
- data/lib/treat/helpers/escaping.rb +0 -15
- data/lib/treat/helpers/formatting.rb +0 -41
- data/lib/treat/helpers/objtohash.rb +0 -8
- data/lib/treat/helpers/platform.rb +0 -15
- data/lib/treat/helpers/reflection.rb +0 -17
- data/lib/treat/helpers/temporary.rb +0 -27
- data/lib/treat/helpers/verbosity.rb +0 -19
- data/lib/treat/helpers.rb +0 -5
- data/lib/treat/loaders.rb +0 -10
- data/lib/treat/proxies.rb +0 -106
- data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
- data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
- data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
- data/spec/core.rb +0 -441
- data/spec/phrase.rb +0 -112
- data/spec/word.rb +0 -111
@@ -1,4 +1,4 @@
|
|
1
|
-
module Treat::Entities::
|
1
|
+
module Treat::Entities::Entity::Iterable
|
2
2
|
|
3
3
|
# Yields each entity of any of the supplied
|
4
4
|
# types in the children tree of this Entity.
|
@@ -6,12 +6,12 @@ module Treat::Entities::Abilities::Iterable
|
|
6
6
|
# #each. It does not yield the top element being
|
7
7
|
# recursed.
|
8
8
|
#
|
9
|
-
# This function NEEDS to be ported to C.
|
9
|
+
# This function NEEDS to be ported to C. #FIXME
|
10
10
|
def each_entity(*types)
|
11
11
|
types = [:entity] if types.size == 0
|
12
12
|
f = false
|
13
13
|
types.each do |t2|
|
14
|
-
if is_a?(Treat::Entities.const_get(cc
|
14
|
+
if is_a?(Treat::Entities.const_get(t2.cc))
|
15
15
|
f = true; break
|
16
16
|
end
|
17
17
|
end
|
@@ -57,7 +57,7 @@ module Treat::Entities::Abilities::Iterable
|
|
57
57
|
def ancestor_with_type(type)
|
58
58
|
return unless has_parent?
|
59
59
|
ancestor = @parent
|
60
|
-
type_klass = Treat::Entities.const_get(cc
|
60
|
+
type_klass = Treat::Entities.const_get(type.cc)
|
61
61
|
while not ancestor.is_a?(type_klass)
|
62
62
|
return nil unless (ancestor && ancestor.has_parent?)
|
63
63
|
ancestor = ancestor.parent
|
@@ -105,18 +105,6 @@ module Treat::Entities::Abilities::Iterable
|
|
105
105
|
end
|
106
106
|
i
|
107
107
|
end
|
108
|
-
|
109
|
-
# Return the first element in the array, warning if not
|
110
|
-
# the only one in the array. Used for magic methods: e.g.,
|
111
|
-
# the magic method "word" if called on a sentence with many
|
112
|
-
# words, Treat will return the first word, but warn the user.
|
113
|
-
def first_but_warn(array, type)
|
114
|
-
if array.size > 1
|
115
|
-
warn "Warning: requested one #{type}, but" +
|
116
|
-
" there are many #{type}s in this entity."
|
117
|
-
end
|
118
|
-
array[0]
|
119
|
-
end
|
120
108
|
|
121
109
|
|
122
110
|
end
|
@@ -1,27 +1,20 @@
|
|
1
|
-
module Treat::Entities::
|
1
|
+
module Treat::Entities::Entity::Magical
|
2
2
|
|
3
3
|
# Parse "magic methods", which allow the following
|
4
4
|
# syntaxes to be used (where 'word' can be replaced
|
5
5
|
# by any entity type, e.g. token, zone, etc.):
|
6
6
|
#
|
7
|
-
# - each_word : iterate over each
|
8
|
-
# - words: return an array of words
|
7
|
+
# - each_word : iterate over each child of type word.
|
8
|
+
# - words: return an array of children words.
|
9
9
|
# - word: return the first word in the entity.
|
10
10
|
# - word_count: return the number of words in the entity.
|
11
|
-
# - words_with_*(value) (where
|
12
|
-
# return the words that have the given feature.
|
13
|
-
# - word_with_*(value) : return the first word with
|
14
|
-
# the feature specified by * in value.
|
15
|
-
#
|
16
|
-
# Also provides magical methods for types of words:
|
17
|
-
#
|
18
|
-
# - each_noun:
|
19
|
-
# - nouns:
|
20
|
-
# - noun:
|
21
|
-
# - noun_count:
|
22
|
-
# - nouns_with_*(value)
|
23
|
-
# - noun_with_*(value)
|
11
|
+
# - words_with_*(value) (where * is an arbitrary feature):
|
12
|
+
# return the words that have the given feature set to value.
|
24
13
|
#
|
14
|
+
# Also provides magical methods for types of words (each_noun,
|
15
|
+
# nouns, noun_count, nouns_with_*(value) noun_with_*(value), etc.)
|
16
|
+
# For this to be used, the words in the text must have been
|
17
|
+
# tokenized and categorized in the first place.
|
25
18
|
def magic(sym, *args)
|
26
19
|
|
27
20
|
# Cache this for performance.
|
@@ -80,9 +73,21 @@ module Treat::Entities::Abilities::Magical
|
|
80
73
|
elsif method =~ /^frequency_in_#{@@entities_regexp}$/
|
81
74
|
frequency_in($1.intern)
|
82
75
|
else
|
83
|
-
return :no_magic
|
76
|
+
return :no_magic # :-(
|
84
77
|
end
|
78
|
+
|
85
79
|
end
|
86
80
|
|
81
|
+
# Hands back the leading element of +array+. Magic methods
# such as "word" use this when a single entity is requested:
# if the array holds more than one element, a warning naming
# +type+ is emitted before the first element is returned.
def first_but_warn(array, type)
  several = array.size > 1
  if several
    notice = "Warning: requested one #{type}, but"
    notice += " there are many #{type}s in this entity."
    warn notice
  end
  array[0]
end
|
87
92
|
|
88
93
|
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Registers the entities occurring in the subtree of
|
2
|
+
# a node as children are added. Also registers text
|
3
|
+
# occurrences for word groups and tokens (n grams).
|
4
|
+
module Treat::Entities::Entity::Registrable
|
5
|
+
|
6
|
+
# Records +entity+ in this node's registry, creating the
# registry on first use. The registry tracks, per node:
#   :id       - ids of registered entities (id => true)
#   :value    - occurrence counts of the lowercased string
#               value (tokens and groups only)
#   :type     - occurrence counts per entity type
#   :position - registration order of each entity id
# The registration is then repeated on the parent node, if any.
def register(entity)
  if !@registry
    @count = 0
    @registry = {id: {}, value: {}, position:{}, type: {}}
  end

  countable = entity.is_a?(Treat::Entities::Token) ||
              entity.is_a?(Treat::Entities::Group)
  if countable
    text = entity.to_s.downcase
    values = @registry[:value]
    values[text] = (values[text] || 0) + 1
  end

  @registry[:id][entity.id] = true
  kinds = @registry[:type]
  kinds[entity.type] = (kinds[entity.type] || 0) + 1
  @registry[:position][entity.id] = @count
  @count += 1

  @parent.register(entity) if has_parent?
end
|
28
|
+
|
29
|
+
# Walks up the tree to find a registry. While this node has a
# parent and its own type differs from +type+, the lookup is
# delegated to the parent; otherwise this node's registry is
# returned. With the default +type+ of nil this normally ends
# at the root node.
def registry(type = nil)
  climb = has_parent? && type != self.type
  if climb
    @parent.registry(type)
  else
    @registry
  end
end
|
35
|
+
|
36
|
+
end
|
@@ -1,18 +1,22 @@
|
|
1
1
|
# Gives entities the ability to be converted
|
2
2
|
# to string representations (#to_string, #to_s,
|
3
3
|
# #to_str, #inspect, #print_tree).
|
4
|
-
module Treat::Entities::
|
5
|
-
|
6
|
-
#
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
4
|
+
module Treat::Entities::Entity::Stringable
|
5
|
+
|
6
|
+
# Returns a copy of the value stored on this entity itself
# (the @value instance variable), without looking at children.
def to_string
  @value.dup
end
|
8
|
+
|
9
|
+
# Returns an array holding the string value of each
# direct child, obtained by calling #to_s on it.
def to_a
  @children.map(&:to_s)
end
|
12
|
+
|
13
|
+
alias :to_ary :to_a
|
14
|
+
|
11
15
|
# Returns the entity's string value by
|
12
16
|
# imploding the value of all terminal
|
13
17
|
# entities in the subtree of that entity.
|
14
18
|
def to_s
|
15
|
-
|
19
|
+
has_children? ? implode.strip : @value.dup
|
16
20
|
end
|
17
21
|
|
18
22
|
# #to_str is the same as #to_s.
|
@@ -24,12 +28,10 @@ module Treat::Entities::Abilities::Stringable
|
|
24
28
|
def short_value(max_length = 30)
|
25
29
|
s = to_s
|
26
30
|
words = s.split(' ')
|
27
|
-
if s.length < max_length
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
words[-2..-1].join(' ')
|
32
|
-
end
|
31
|
+
return s if (s.length < max_length) ||
|
32
|
+
!(words[0..2] && words[-2..-1])
|
33
|
+
words[0..2].join(' ') + ' [...] ' +
|
34
|
+
words[-2..-1].join(' ')
|
33
35
|
end
|
34
36
|
|
35
37
|
# Print out an ASCII representation of the tree.
|
@@ -38,7 +40,8 @@ module Treat::Entities::Abilities::Stringable
|
|
38
40
|
# Return an informative string representation
|
39
41
|
# of the entity.
|
40
42
|
def inspect
|
41
|
-
|
43
|
+
name = self.class.mn
|
44
|
+
s = "#{name} (#{@id.to_s})"
|
42
45
|
if caller_method(2) == :inspect
|
43
46
|
@id.to_s
|
44
47
|
else
|
@@ -1,94 +1,106 @@
|
|
1
1
|
module Treat::Entities
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
# Require abilities.
|
6
|
-
p = Treat.paths.lib +
|
7
|
-
'treat/entities/abilities/*.rb'
|
8
|
-
Dir.glob(p).each { |f| require f }
|
9
|
-
|
3
|
+
# Basic tree structure.
|
10
4
|
require 'birch'
|
11
|
-
|
5
|
+
|
6
|
+
# The Entity class extends a basic tree structure
|
7
|
+
# (written in C for optimal speed) and represents
|
8
|
+
# any form of textual entity in a processing task
|
9
|
+
# (this could be a collection of documents, a
|
10
|
+
# single document, a single paragraph, etc.)
|
11
|
+
#
|
12
|
+
# Classes that extend Entity provide the concrete
|
13
|
+
# behavior corresponding to the relevant entity type.
|
14
|
+
# See entities.rb for a full list and description of
|
15
|
+
# the different entity types in the document model.
|
12
16
|
class Entity < ::Birch::Tree
|
13
17
|
|
14
|
-
# A
|
15
|
-
# version of the class name.
|
18
|
+
# A symbol representing the lowercase
|
19
|
+
# version of the class name. This is
|
20
|
+
# the only attribute that the Entity
|
21
|
+
# class adds to the Birch::Tree class.
|
16
22
|
attr_accessor :type
|
23
|
+
|
24
|
+
# Require all the mixin files in /entity.
|
25
|
+
path = File.expand_path(__FILE__)
|
26
|
+
patt = File.dirname(path) + '/entity/*.rb'
|
27
|
+
Dir.glob(patt).each { |f| require f }
|
28
|
+
|
29
|
+
# Implements support for #register, #registry.
|
30
|
+
include Registrable
|
17
31
|
|
18
|
-
#
|
19
|
-
|
20
|
-
include Abilities::Registrable
|
32
|
+
# Implement support for #self.call_worker, etc.
|
33
|
+
extend Delegatable
|
21
34
|
|
22
|
-
# Implement support for #self.
|
23
|
-
extend
|
35
|
+
# Implement support for #self.print_debug, etc.
|
36
|
+
extend Debuggable
|
24
37
|
|
25
|
-
# Implement support for #self.
|
26
|
-
|
27
|
-
extend Abilities::Debuggable
|
38
|
+
# Implement support for #self.build and #self.from_*
|
39
|
+
extend Buildable
|
28
40
|
|
29
|
-
# Implement support for #
|
30
|
-
|
31
|
-
extend Abilities::Buildable
|
41
|
+
# Implement support for #apply (previously #do).
|
42
|
+
include Applicable
|
32
43
|
|
33
|
-
# Implement support for #
|
34
|
-
|
44
|
+
# Implement support for #frequency, #frequency_in,
|
45
|
+
# #frequency_of, #position, #position_from_end, etc.
|
46
|
+
include Countable
|
35
47
|
|
36
|
-
# Implement support for #
|
37
|
-
|
38
|
-
include Abilities::Countable
|
39
|
-
|
40
|
-
# Implement support for #magic.
|
41
|
-
include Abilities::Magical
|
48
|
+
# Implement support for over 100 #magic methods!
|
49
|
+
include Magical
|
42
50
|
|
43
51
|
# Implement support for #to_s, #inspect, etc.
|
44
|
-
include
|
52
|
+
include Stringable
|
45
53
|
|
46
|
-
# Implement support for #check_has
|
47
|
-
|
48
|
-
include Abilities::Checkable
|
54
|
+
# Implement support for #check_has and others.
|
55
|
+
include Checkable
|
49
56
|
|
50
57
|
# Implement support for #each_entity, as well as
|
51
58
|
# #entities_with_type, #ancestors_with_type,
|
52
|
-
# #entities_with_feature, #entities_with_category.
|
53
|
-
include
|
54
|
-
|
55
|
-
# Implement support for #export to export
|
56
|
-
# a line of a data set based on a classification.
|
57
|
-
include Abilities::Exportable
|
59
|
+
# #entities_with_feature, #entities_with_category, etc.
|
60
|
+
include Iterable
|
58
61
|
|
59
|
-
# Implement support for #
|
60
|
-
|
62
|
+
# Implement support for #export, allowing to export
|
63
|
+
# a data set row from the receiving entity.
|
64
|
+
include Exportable
|
61
65
|
|
62
66
|
# Implement support for #self.compare_with
|
63
|
-
extend
|
67
|
+
extend Comparable
|
64
68
|
|
65
69
|
# Initialize the entity with its value and
|
66
70
|
# (optionally) a unique identifier. By default,
|
67
71
|
# the object_id will be used as id.
|
68
72
|
def initialize(value = '', id = nil)
|
69
|
-
id ||= object_id
|
70
|
-
super(value, id)
|
73
|
+
id ||= object_id; super(value, id)
|
71
74
|
@type = :entity if self == Entity
|
72
|
-
@type ||=
|
75
|
+
@type ||= self.class.mn.ucc.intern
|
73
76
|
end
|
74
77
|
|
75
78
|
# Add an entity to the current entity.
|
76
79
|
# Registers the entity in the root node
|
77
80
|
# token registry if the entity is a leaf.
|
78
|
-
#
|
79
|
-
#
|
81
|
+
# Unsets the parent node's value; in order
|
82
|
+
# to keep the tree clean, only the leaf
|
83
|
+
# values are stored.
|
84
|
+
#
|
85
|
+
# Takes in a single entity or an array of
|
86
|
+
# entities. Returns the first child supplied.
|
87
|
+
# If a string or a number is supplied, it is first converted to an entity.
|
80
88
|
def <<(entities, clear_parent = true)
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
entities.
|
85
|
-
|
86
|
-
|
89
|
+
entities = (entities.is_a?(::String) ||
|
90
|
+
entities.is_a?(::Numeric)) ?
|
91
|
+
entities.to_entity : entities
|
92
|
+
entities = entities.is_a?(::Array) ?
|
93
|
+
entities : [entities]
|
94
|
+
# Register each entity in this node.
|
95
|
+
entities.each { |e| register(e) }
|
96
|
+
# Pass to the <<() method in Birch.
|
87
97
|
super(entities)
|
98
|
+
# Unset the parent value if necessary.
|
88
99
|
@parent.value = '' if has_parent?
|
89
|
-
|
100
|
+
# Return the first child.
|
101
|
+
return entities[0]
|
90
102
|
end
|
91
|
-
|
103
|
+
|
92
104
|
# Catch missing methods to support method-like
|
93
105
|
# access to features (e.g. entity.category
|
94
106
|
# instead of entity.features[:category]) and to
|
@@ -102,29 +114,26 @@ module Treat::Entities
|
|
102
114
|
# sugar for the #self.build method.
|
103
115
|
def method_missing(sym, *args, &block)
|
104
116
|
return self.build(*args) if sym == nil
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
super(sym, *args, &block)
|
111
|
-
rescue NoMethodError
|
112
|
-
raise Treat::Exception,
|
113
|
-
if Treat::Workers.lookup(sym)
|
114
|
-
msg = "Method #{sym} cannot " +
|
115
|
-
"be called on a #{type}."
|
116
|
-
else
|
117
|
-
msg = "Method #{sym} does not exist."
|
118
|
-
msg += did_you_mean?(
|
119
|
-
Treat::Workers.methods, sym)
|
120
|
-
end
|
121
|
-
end
|
122
|
-
else
|
123
|
-
@features[sym]
|
124
|
-
end
|
125
|
-
|
117
|
+
return @features[sym] if @features.has_key?(sym)
|
118
|
+
result = magic(sym, *args, &block)
|
119
|
+
return result unless result == :no_magic
|
120
|
+
begin; super(sym, *args, &block)
|
121
|
+
rescue NoMethodError; invalid_call(sym); end
|
126
122
|
end
|
127
|
-
|
123
|
+
|
124
|
+
# Always raises a Treat::Exception for the method +sym+.
# When Treat::Workers.lookup knows +sym+, the message says the
# method cannot be called on this entity's type; otherwise it
# says the method is not defined and appends whatever
# Treat::Helpers::Help.did_you_mean? suggests.
def invalid_call(sym)
  if Treat::Workers.lookup(sym)
    msg = "Method #{sym} can't be called on a #{type}."
  else
    msg = "Method #{sym} is not defined by Treat."
    msg += Treat::Helpers::Help.did_you_mean?(
      Treat::Workers.methods, sym)
  end
  raise Treat::Exception, msg
end
|
136
|
+
|
128
137
|
end
|
129
138
|
|
130
139
|
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Helper methods to manipulate hashes.
|
2
|
+
class Treat::Helpers::Hash
|
3
|
+
|
4
|
+
# Mixin that converts a hash into nested structs whose
# members are the hash keys.
module ToStruct
  # Converts the receiver to a Struct instance, recursing
  # into values that are themselves hashes. If any key is
  # not a Symbol, the receiver is returned unchanged.
  def to_struct
    return self unless keys.all? { |key| key.is_a?(Symbol) }
    members = values.map do |value|
      value.is_a?(Hash) ? value.to_struct : value
    end
    Struct.new(*keys).new(*members)
  end
end
|
23
|
+
|
24
|
+
# Include the mixins on the core Hash class.
|
25
|
+
Hash.class_eval do
|
26
|
+
include Treat::Helpers::Hash::ToStruct
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Helper methods to detect misspellings
|
2
|
+
# and suggest alternatives to the user.
|
3
|
+
class Treat::Helpers::Help
|
4
|
+
|
5
|
+
# Search +list+ for entries that are exactly one edit
# (insertion, deletion or substitution of one character)
# away from +name+. Both the entries and +name+ are compared
# by their #to_s form, so symbols and strings can be mixed.
#
# Returns a string starting with " Perhaps you meant" that
# names the candidates, or an empty string when there are
# none, so the result can always be appended to a message.
def self.did_you_mean?(list, name)
  name = name.to_s
  sugg = list.map { |e| e.to_s }.uniq.select do |element|
    # A length difference above one can never be a
    # one-edit typo, so skip the distance computation.
    (element.length - name.length).abs <= 1 &&
      levenshtein(element, name) == 1
  end
  return '' if sugg.empty?
  return " Perhaps you meant '#{sugg[0]}' ?" if sugg.size == 1
  sugg_quote = sugg[0..-2].map { |x| "'#{x}'" }
  " Perhaps you meant #{sugg_quote.join(', ')}," +
    " or '#{sugg[-1]}' ?"
end

# Edit (Levenshtein) distance between two strings, computed
# row by row so only two rows of the table are kept.
def self.levenshtein(first, second)
  return second.length if first.empty?
  return first.length if second.empty?
  previous = (0..second.length).to_a
  first.each_char.with_index(1) do |a, i|
    current = [i]
    second.each_char.with_index(1) do |b, j|
      cost = a == b ? 0 : 1
      current << [previous[j] + 1,       # deletion
                  current[j - 1] + 1,    # insertion
                  previous[j - 1] + cost # substitution
                 ].min
    end
    previous = current
  end
  previous.last
end
|
34
|
+
|
35
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Methods related to object reflection.
|
2
|
+
class Treat::Helpers::Object
|
3
|
+
|
4
|
+
# Allow introspection onto what method called
# another one at runtime (useful for debugging).
module CallerMethod
  # Pattern to match method from trace. Accepts both the
  # "in `name'" form and the "in 'Owner#name'" form that
  # newer Ruby versions print in backtraces.
  CMPattern = /^(.+?):(\d+)(?::in [`'](.*)')?/

  # Return, as a symbol, the name of the method found +n+
  # frames up the call stack, counting from the caller of
  # this method (n = 1). Block prefixes ("block in",
  # "block (2 levels) in") and any owner prefix
  # ("Owner#", "Owner.") are removed.
  #
  # Returns nil when the stack is not that deep or the
  # frame carries no method label.
  def caller_method(n = 3)
    at = (caller(n) || []).first
    match = at && CMPattern.match(at)
    label = match && match[3]
    return nil unless label
    label = label.sub(/\Ablock (?:\(\d+ levels\) )?in /, '')
    name = label.split(/[#.]/).last
    name && name.intern
  end
end
|
18
|
+
|
19
|
+
# Gives a class/module access to the last segment of
# its name (i.e. the part after the last "::").
module ModuleName
  # Returns the text after the final "::" of #to_s.
  def module_name
    segments = to_s.split('::')
    segments.last
  end
  alias_method :mn, :module_name
end
|
25
|
+
|
26
|
+
# Helpers to temporarily quiet Ruby warnings and stdout.
module Verbosity
  # Runs a block of code without warnings by setting
  # $VERBOSE to nil for its duration. Returns the block's
  # value. The previous $VERBOSE level is restored even
  # when the block raises.
  def silence_warnings(&block)
    warn_level = $VERBOSE
    $VERBOSE = nil
    block.call
  ensure
    $VERBOSE = warn_level
  end

  # Runs a block of code while redirecting stdout to +log+.
  # Redirection only happens when Treat.core.verbosity.silence
  # is set; otherwise the block simply runs. Returns the
  # block's value in both cases.
  def silence_stdout(log = File::NULL)
    return yield unless Treat.core.verbosity.silence
    saved = $stdout.dup
    begin
      # Reopening copies the descriptor, so the log handle
      # can be closed again right away.
      File.open(log, 'w') { |sink| $stdout.reopen(sink) }
      yield
    ensure
      # Point the original stream back at its destination
      # (rather than swapping $stdout for the duplicate) so
      # that STDOUT itself is restored, then drop the dup.
      $stdout.flush
      $stdout.reopen(saved)
      saved.close
    end
  end
end
|
43
|
+
|
44
|
+
# Allow getting the caller method in any context.
|
45
|
+
Object.class_eval do
|
46
|
+
include Treat::Helpers::Object::CallerMethod
|
47
|
+
include Treat::Helpers::Object::Verbosity
|
48
|
+
end
|
49
|
+
|
50
|
+
# Allow getting the last name of any module/class.
|
51
|
+
Module.class_eval do
|
52
|
+
include Treat::Helpers::Object::ModuleName
|
53
|
+
end
|
54
|
+
|
55
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# Helper methods for camel casing and
|
2
|
+
# escaping standard strings and symbols.
|
3
|
+
class Treat::Helpers::String
|
4
|
+
|
5
|
+
# Utility to escape the period inside floating point
# numbers (e.g. "3.14"), so that chunkers, segmenters and
# tokenizers do not mistake it for a sentence terminator.
module Escapable

  # Escape char to use.
  EscapeChar = '^^^'
  # Regex for escape.
  Regex = /([0-9]+)\.([0-9]+)/

  # Replaces, in place, the period of every match of Regex
  # with EscapeChar. Follows String#gsub!: returns nil
  # when nothing was replaced.
  def escape_floats!
    to_s.gsub!(Regex) { |float| float.sub('.', EscapeChar) }
  end

end
|
23
|
+
|
24
|
+
# Counterpart to Treat::Helpers::String::Escapable;
# unescapes floats, restoring the original text.
module Unescapable

  # Escaped for regex.
  EscapedEscapeChar = '\^\^\^'
  # Regex for unescape.
  Regex = /([0-9]+)#{EscapedEscapeChar}([0-9]+)/

  # Puts the period back, in place, between digit runs
  # that are separated by the escape sequence. Follows
  # String#gsub!: returns nil when nothing was replaced.
  def unescape_floats!
    to_s.gsub!(Regex, '\1.\2')
  end

end
|
39
|
+
|
40
|
+
# Transform an un_camel_cased string into a CamelCased
# string. Works on anything that responds to #to_s.
module CamelCaseable

  # A cache to optimize camel casing.
  @@cc_cache = {}

  # Regex for camel casing.
  Regex = /^[a-z]|_[a-z]/

  # Convert un_camel_case to CamelCase. Entries listed in
  # Treat.core.acronyms are upcased as a whole instead.
  # Results are memoized per input string; the returned
  # string is the cached object itself.
  def camel_case
    source = to_s
    known = @@cc_cache[source]
    return known if known
    @@cc_cache[source] =
      if Treat.core.acronyms.include?(source)
        source.upcase
      else
        source.gsub(Regex) { |piece| piece.upcase }.gsub('_', '')
      end
  end

  alias_method :cc, :camel_case

end
|
69
|
+
|
70
|
+
# Counterpart of Treat::Helpers::String::CamelCaseable;
# transforms a CamelCase string to its un_camel_case
# corresponding form.
module UnCamelCaseable

  # A cache to optimize un camel casing.
  @@ucc_cache = {}

  # Convert CamelCase to un_camel_case. Inputs whose
  # downcased form is listed in Treat.core.acronyms are
  # simply downcased. Results are memoized per input
  # string; the returned string is the cached object.
  def un_camel_case
    o_phrase, phrase = to_s, to_s.dup
    cached = @@ucc_cache[o_phrase]
    return cached if cached
    if Treat.core.acronyms.include?(phrase.downcase)
      phrase = phrase.downcase
    else
      phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
      # Drop the underscore produced by a leading capital.
      # No early return here: the result must still reach
      # the cache write below.
      phrase = phrase[1..-1] if phrase[0] == '_'
    end
    @@ucc_cache[o_phrase] = phrase
  end

  alias :ucc :un_camel_case

end
|
101
|
+
|
102
|
+
# Determines whether a name looks like an "-able"
# mixin, i.e. whether its #to_s form ends in "able".
module IsMixin
  def is_mixin?
    to_s.end_with?('able')
  end
end
|
107
|
+
|
108
|
+
# Graft the helpers onto the string module.
|
109
|
+
String.class_eval do
|
110
|
+
include Treat::Helpers::String::CamelCaseable
|
111
|
+
include Treat::Helpers::String::UnCamelCaseable
|
112
|
+
include Treat::Helpers::String::Escapable
|
113
|
+
include Treat::Helpers::String::Unescapable
|
114
|
+
include Treat::Helpers::String::IsMixin
|
115
|
+
end
|
116
|
+
|
117
|
+
# Graft camel casing onto symbols.
|
118
|
+
Symbol.class_eval do
|
119
|
+
include Treat::Helpers::String::CamelCaseable
|
120
|
+
include Treat::Helpers::String::UnCamelCaseable
|
121
|
+
end
|
122
|
+
|
123
|
+
|
124
|
+
end
|