treat 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
@@ -0,0 +1,19 @@
|
|
1
|
+
module Treat
|
2
|
+
module Detectors
|
3
|
+
module Language
|
4
|
+
class LanguageDetector
  # Shared "predetection" step used by the concrete language detectors.
  #
  # Returns Treat.default_language when detection is switched off
  # (Treat.detect_language == false). Otherwise, when +entity+ ranks
  # below Treat.language_detection_level and has a parent, returns the
  # language of its nearest ancestor of that level. Returns nil in every
  # other case, which tells subclasses to run their own detection.
  #
  # +options+ is accepted for interface compatibility and is not read here.
  def self.language(entity, options = {})
    # Only an explicit +false+ disables detection; nil leaves it enabled.
    return Treat.default_language if Treat.detect_language == false

    dlvl = Treat.language_detection_level
    return nil unless Entities.rank(entity.type) < Entities.rank(dlvl) &&
                      entity.has_parent?

    # Having a parent does not guarantee an ancestor of the detection
    # level (e.g. a word inside a free-standing sentence). Return nil in
    # that case instead of calling #language on nil.
    ancestor = entity.ancestor_with_type(dlvl)
    ancestor.language if ancestor
  end
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Treat
|
2
|
+
module Detectors
|
3
|
+
module Language
|
4
|
+
# Require the 'whatlanguage' gem.
|
5
|
+
silently { require 'whatlanguage' }
|
6
|
+
# Adaptor for the 'whatlanguage' gem, which
|
7
|
+
# performs probabilistic language detection.
|
8
|
+
class WhatLanguage < LanguageDetector
  # Keep only one instance of the gem class; it is created on first use.
  @@wl = nil

  # Detect the language of +entity+ using the 'whatlanguage' gem.
  #
  # A truthy result from LanguageDetector.language is returned as-is.
  # Otherwise the entity's text is handed to the gem, every key of the
  # gem's result is mapped through Treat::Resources::Languages.find, and
  # the best entry of the resulting Treat::Feature is returned.
  # NOTE(review): the original note says the result is an identifier for
  # the ISO-639-2 code; that depends on Languages.find, not visible here.
  def self.language(entity, options = {})
    inherited = super(entity, options)
    return inherited if inherited

    @@wl ||= ::WhatLanguage.new(:all)
    scores = {}
    @@wl.process_text(entity.to_s).each do |code, score|
      scores[Treat::Resources::Languages.find(code)] = score
    end
    Treat::Feature.new(scores).best
  end
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Treat
|
2
|
+
# Abstract and concrete structures extending the
|
3
|
+
# Tree::Node class to represent textual entities:
|
4
|
+
#
|
5
|
+
# - Collection
|
6
|
+
# - Document
|
7
|
+
# - Text
|
8
|
+
# - Zone (a Section, Title, Paragraph, or List)
|
9
|
+
# - Sentence
|
10
|
+
# - Constituent (a Phrase or Clause)
|
11
|
+
# - Token (a Word, Number, Punctuation, or Symbol).
|
12
|
+
module Entities
|
13
|
+
# Require Entity first, since the other classes
|
14
|
+
# extend this class.
|
15
|
+
require 'treat/entities/entity'
|
16
|
+
require 'treat/entities/collection'
|
17
|
+
require 'treat/entities/document'
|
18
|
+
require 'treat/entities/text'
|
19
|
+
require 'treat/entities/zones'
|
20
|
+
require 'treat/entities/sentence'
|
21
|
+
require 'treat/entities/constituents'
|
22
|
+
require 'treat/entities/tokens'
|
23
|
+
# Make the constants buildable.
|
24
|
+
# For every constant defined in this module so far, add a module-level
# method of the same name that forwards (value, id) to that constant's
# .build method.
constants.each do |entity_name|
  define_singleton_method(entity_name) { |value = '', id = nil| const_get(entity_name).build(value, id) }
end
|
29
|
+
# Provide a list of defined entity types,
|
30
|
+
# as non-camel case identifiers.
|
31
|
+
@@list = []
|
32
|
+
def self.list
  # The list is computed on the first call (or while still empty) and
  # reused afterwards: one symbol per module constant, converted by ucc.
  if @@list.empty?
    constants.each { |name| @@list.push(ucc(name).to_s.to_sym) }
  end
  @@list
end
|
39
|
+
# Return the 'z-order' for hierarchical
|
40
|
+
# comparison of entity types.
|
41
|
+
def self.rank(type)
  # Resolve the type identifier to its class, then report the level of
  # the first base class it equals or descends from. Returns nil when
  # the class belongs to none of the seven families (e.g. Entity itself).
  klass = Entities.const_get(cc(type))
  [
    [Collection, 6], [Document, 5], [Text, 4], [Zone, 3],
    [Sentence, 2], [Constituent, 1], [Token, 0]
  ].each do |base, level|
    return level if klass <= base
  end
  nil
end
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Treat
|
2
|
+
module Entities
|
3
|
+
# Represents a collection of texts.
|
4
|
+
class Collection < Entity
  # Create a collection with an empty value and the given +id+.
  # When +folder+ is given it is recorded with `set :folder`, and every
  # non-directory entry matched by "<folder>/*" is wrapped in a Document
  # and appended to the collection. Sub-directories are skipped, not
  # descended into.
  def initialize(folder = nil, id = nil)
    super('', id)
    return unless folder

    set :folder, folder
    Dir.glob("#{folder}/*").each do |path|
      self << Document.new(path) unless FileTest.directory?(path)
    end
  end
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Treat
|
2
|
+
module Entities
|
3
|
+
# Represents any syntactic constituent
|
4
|
+
# of a sentence.
|
5
|
+
class Constituent < Entity; end
|
7
|
+
# Represents a phrase inside a sentence
|
8
|
+
# or by itself.
|
9
|
+
class Phrase < Constituent; end
|
11
|
+
# Represents a clause inside a sentence.
|
12
|
+
class Clause < Constituent; end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,242 @@
|
|
1
|
+
require 'treat/tree'
|
2
|
+
require 'treat/feature'
|
3
|
+
require 'treat/delegatable'
|
4
|
+
require 'treat/visitable'
|
5
|
+
require 'treat/registrable'
|
6
|
+
require 'treat/buildable'
|
7
|
+
|
8
|
+
module Treat
|
9
|
+
module Entities
|
10
|
+
class Entity < Tree::Node
|
11
|
+
# Implements support for #register
|
12
|
+
include Registrable
|
13
|
+
# Implement support for #accept.
|
14
|
+
include Visitable
|
15
|
+
# Implement support for #self.add_delegators
|
16
|
+
extend Delegatable
|
17
|
+
# Implement support for #self.from_*
|
18
|
+
extend Buildable
|
19
|
+
# Build an entity from a file name or a value.
|
20
|
+
# Both arguments are forwarded unchanged to
|
21
|
+
# from_anything, which is presumably supplied
|
22
|
+
# by the Buildable module extended above
|
23
|
+
# (confirm against treat/buildable.rb). +id+ is
|
24
|
+
# an optional identifier and defaults to nil.
|
25
|
+
def self.build(file_or_value = '', id = nil); from_anything(file_or_value, id); end
|
28
|
+
# Initialize the entity with its value and
|
29
|
+
# (optionally) a unique identifier. By default,
|
30
|
+
# the object_id will be used as id. Also initialize
|
31
|
+
# the token registry in the root node.
|
32
|
+
def initialize(value = '', id = nil)
  # Without an explicit id, the object's own object_id is used.
  super(value, id || object_id)
end
|
36
|
+
# Return a lowercase identifier representing the
|
37
|
+
# type of entity (e.g. :word, :token, etc.)
|
38
|
+
def type
  # cl(...) yields the class label; it is lower-cased and symbolised.
  cl(self.class).downcase.to_s.to_sym
end
|
39
|
+
# Catch missing methods to support method-like
|
40
|
+
# access to features (e.g. entity.cat instead of
|
41
|
+
# entity.features[:cat]) and to support magic
|
42
|
+
# methods (see #parse_magic_method). If the
|
43
|
+
# feature does not exist and no magic method matches, a Treat::Exception is raised.
|
44
|
+
def method_missing(sym, *args, &block)
  return self.build(*args) if sym == nil

  # A truthy feature stored under this name takes precedence.
  feature = @features[sym]
  return feature if feature

  # Next try the magic-method syntaxes; any result other than
  # :no_magic is the answer.
  magic = parse_magic_method(sym, *args, &block)
  return magic unless magic == :no_magic

  begin
    super(sym, *args, &block)
  rescue NoMethodError
    # Replace the generic failure with a Treat::Exception whose message
    # says whether the method exists for other entity types or not at all.
    msg =
      if Categories.have_method?(sym)
        "Method #{sym} cannot be called on a #{type}."
      else
        "Method #{sym} does not exist." + did_you_mean?(Category.methods, sym)
      end
    raise Treat::Exception, msg
  end
end
|
68
|
+
# Parse "magic methods", which allow the following
|
69
|
+
# syntaxes to be used (where 'word' can be replaced
|
70
|
+
# by any entity type, e.g. token, zone, etc.):
|
71
|
+
#
|
72
|
+
# - each_word : iterate over each entity of type word.
|
73
|
+
# - words: return an array of words in the entity.
|
74
|
+
# - word: return the first word in the entity.
|
75
|
+
# - word_count: return the number of words in the entity.
|
76
|
+
# - words_with_*(value) (where * is an arbitrary feature):
|
77
|
+
# return the words that have the given feature.
|
78
|
+
# - word_with_*(value) : return the first word with
|
79
|
+
# the feature specified by * in value.
|
80
|
+
#
|
81
|
+
# Also provides magical methods for types of words:
|
82
|
+
#
|
83
|
+
# - each_noun:
|
84
|
+
# - nouns:
|
85
|
+
# - noun:
|
86
|
+
# - noun_count:
|
87
|
+
# - nouns_with_*(value)
|
88
|
+
# - noun_with_*(value)
|
89
|
+
#
|
90
|
+
# Note that repetition of code in this method
|
91
|
+
# (instead of method chaining) is intentional
|
92
|
+
# and aims to reduce the number of method
|
93
|
+
# dispatches done by Ruby to improve performance.
|
94
|
+
# Resolve a "magic" method name against the known entity types and word
# categories. Returns the computed result, or :no_magic when +sym+
# matches none of the supported syntaxes.
#
# Supported forms (<type> is an entity type, <cat> a word category):
#   parent_<type>                      nearest ancestor of that type
#   each_<type> / each_<cat>           yield each match to the block
#   <type>s / <cat>s                   array of matches
#   <type> / <cat>                     first match (warns if several)
#   <type>_count / <cat>_count         number of matches
#   <type>s_with_<f>(v) / <cat>s_with_<f>(v)   matches whose feature f == v
#   <type>_with_<f>(v) / <cat>_with_<f>(v)     first such match
#   each_with_<f>(v)                   yield every entity whose f == v
#
# Captures are copied into locals as soon as a pattern matches, so the
# blocks below (and the method defined for parent_<type>) never depend
# on the state of $~ at the time they run.
def parse_magic_method(sym, *args, &block)
  @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
  @@cats_regexp ||= "(#{Treat::Resources::Categories::List.join('|')})"
  ents = @@entities_regexp
  cats = @@cats_regexp
  # "entities" is the one irregular plural; rewrite it to "entitys" so
  # the regular "<type>s" patterns below apply.
  method = sym.to_s.gsub('entities', 'entitys')
  found = []

  if (m = /\Aparent_#{ents}\z/.match(method))
    kind = m[1].to_sym
    # Define a real method so later calls bypass method_missing.
    self.class.send(:define_method, "parent_#{m[1]}") do
      ancestor_with_types(kind)
    end
    ancestor_with_types(kind)
  elsif (m = /\Aeach_#{ents}\z/.match(method))
    each_entity(m[1].to_sym) { |entity| yield entity }
  elsif (m = /\A#{ents}s\z/.match(method))
    each_entity(m[1].to_sym) { |e| found << e }
    found
  elsif (m = /\A#{ents}\z/.match(method))
    each_entity(m[1].to_sym) { |e| found << e }
    first_but_warn(found, m[1])
  elsif (m = /\A#{ents}_count\z/.match(method))
    count = 0
    each_entity(m[1].to_sym) { count += 1 }
    count
  elsif (m = /\A#{ents}s_with_([a-z]*)\z/.match(method))
    feature = m[2].to_sym
    each_entity(m[1].to_sym) do |e|
      found << e if e.has?(feature) && e.send(feature) == args[0]
    end
    found
  elsif (m = /\A#{ents}_with_([a-z]*)\z/.match(method))
    # Singular form. The original repeated the plural pattern here,
    # which made this branch unreachable.
    feature = m[2].to_sym
    each_entity(m[1].to_sym) do |e|
      found << e if e.has?(feature) && e.send(feature) == args[0]
    end
    first_but_warn(found, m[1])
  elsif (m = /\Aeach_with_([a-z]*)\z/.match(method))
    # The pattern has a single group, so the feature name is m[1]. The
    # traversal is asked for :entity explicitly so that every entity is
    # visited.
    feature = m[1].to_sym
    each_entity(:entity) do |e|
      yield e if e.has?(feature) && e.send(feature) == args[0]
    end
  elsif (m = /\Aeach_#{cats}\z/.match(method))
    cat = m[1].to_sym
    each_entity(:word) { |e| yield e if e.cat == cat }
  elsif (m = /\A#{cats}s\z/.match(method))
    cat = m[1].to_sym
    each_entity(:word) { |e| found << e if e.cat == cat }
    found
  elsif (m = /\A#{cats}\z/.match(method))
    cat = m[1].to_sym
    each_entity(:word) { |e| found << e if e.cat == cat }
    first_but_warn(found, m[1])
  elsif (m = /\A#{cats}_count\z/.match(method))
    cat = m[1].to_sym
    count = 0
    each_entity(:word) { |e| count += 1 if e.cat == cat }
    count
  elsif (m = /\A#{cats}s_with_([a-z]*)\z/.match(method))
    cat = m[1].to_sym
    feature = m[2].to_sym
    each_entity(:word) do |e|
      found << e if e.cat == cat &&
                    e.has?(feature) && e.send(feature) == args[0]
    end
    found
  elsif (m = /\A#{cats}_with_([a-z]*)\z/.match(method))
    cat = m[1].to_sym
    feature = m[2].to_sym
    each_entity(:word) do |e|
      found << e if e.cat == cat &&
                    e.has?(feature) && e.send(feature) == args[0]
    end
    first_but_warn(found, m[1])
  else
    :no_magic
  end
end
|
163
|
+
# Add an entity to the current entity.
|
164
|
+
# Registers the entity in the root node
|
165
|
+
# token registry if the entity is a leaf.
|
166
|
+
#
|
167
|
+
# @see Treat::Registrable
|
168
|
+
# Append one entity or an array of entities to this entity.
#
# Each appended entity that reports is_leaf? is first passed to
# register_token. The entities are then handed to the superclass
# implementation as an array. When +clear_parent+ is true (the default)
# and this entity has a parent, the parent's value is reset to ''.
# The parameter was previously accepted but ignored; passing false now
# leaves the parent's value untouched.
#
# Returns the first appended entity (nil for an empty array).
def <<(entities, clear_parent = true)
  entities = [entities] unless entities.is_a? Array
  entities.each do |entity|
    register_token(entity) if entity.is_leaf?
  end
  super(entities)
  @parent.value = '' if clear_parent && has_parent?
  entities[0]
end
|
177
|
+
# Yields each entity of any of the supplied
|
178
|
+
# types in the children tree of this Entity.
|
179
|
+
# Note that this function is recursive, unlike
|
180
|
+
# #each. The entity itself is yielded first when it
|
181
|
+
# matches one of the given types.
|
182
|
+
# Depth-first traversal: yields this entity if it matches one of
# +types+, then recurses into each child with the same types.
#
# +types+ are entity type identifiers (e.g. :word, :sentence). When
# none are given the traversal defaults to :entity so that every entity
# is yielded; previously an empty list matched nothing, which made a
# bare `each_entity { ... }` a silent no-op.
def each_entity(*types)
  types = [:entity] if types.empty?
  yield self if match_types(self, types)
  if has_children?
    @children.each do |child|
      child.each_entity(*types) { |descendant| yield descendant }
    end
  end
end
|
190
|
+
# Returns the first ancestor of this
|
191
|
+
# entity that has the given type.
|
192
|
+
# Walk up from this entity's parent and return the first ancestor that
# matches one of +types+ (entity type identifiers), or nil when there is
# none. An entity without a parent now yields nil; before, the walk
# started by calling has_parent? on nil and raised NoMethodError.
def ancestor_with_types(*types)
  ancestor = @parent
  while ancestor
    return ancestor if match_types(ancestor, types)
    # Stop at the root: only step upwards while a parent is reported.
    ancestor = ancestor.has_parent? ? ancestor.parent : nil
  end
  nil
end
|
200
|
+
alias :ancestor_with_type :ancestor_with_types
|
201
|
+
# Return the entity's string value in plain text format.
|
202
|
+
def to_string
  @value
end
|
203
|
+
# Return the result of visualize(:txt) (not an alias for #to_string).
|
204
|
+
def to_s
  visualize(:txt)
end
|
205
|
+
alias :to_str :to_s
|
206
|
+
# Return an informative string representation of the entity.
|
207
|
+
def inspect
  visualize(:inspect)
end
|
208
|
+
# Print out an ASCII representation of the tree.
|
209
|
+
def print_tree
  # Writes the :tree visualization to standard output.
  puts(visualize(:tree))
end
|
210
|
+
# Return a shortened value of the entity's string value using [...].
|
211
|
+
def short_value(ml = 6)
  # +ml+ is forwarded to the :short_value visualizer as :max_length.
  visualize(:short_value, :max_length => ml)
end
|
212
|
+
# Convenience functions. Convenience decorators.
|
213
|
+
def frequency_of(word)
  # Shorthand for the :frequency_of statistic with +word+ as its :value.
  statistics(:frequency_of, value: word)
end
|
214
|
+
|
215
|
+
private
|
216
|
+
# Return the first element in the array, warning if not
|
217
|
+
# the only one in the array. Used for magic methods: e.g.,
|
218
|
+
# the magic method "word" if called on a sentence
|
219
|
+
# with many words, Treat will return the first word
|
220
|
+
# but warn the user.
|
221
|
+
def first_but_warn(array, type)
  # More than one candidate: emit a warning, then still return the first.
  warn "Warning: requested one #{type}, but there are many #{type}s in the given entity." if array.size > 1
  array[0]
end
|
228
|
+
# Cache a list of the type => class relationships.
|
229
|
+
@@type_classes = {}
|
230
|
+
# Returns true if the node is of the same type or
|
231
|
+
# is a subtype of one of the specified entity types,
|
232
|
+
# which are supplied as identifiers rather than classes.
|
233
|
+
def match_types(node, entity_types)
  # Each identifier is resolved to its class once (via cc and
  # Entities.const_get) and remembered in @@type_classes. The search
  # stops at the first type the node is an instance of.
  entity_types.any? do |type|
    node.is_a?(@@type_classes[type] ||= Entities.const_get(cc(type)))
  end
end
|
240
|
+
end
|
241
|
+
end
|
242
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Treat
|
2
|
+
module Entities
|
3
|
+
# Represents a terminal element in the text structure.
|
4
|
+
class Token < Entity
  # All tokens are leaves.
  def is_leaf?
    true
  end

  # Compute the :frequency statistic and store it under :frequency
  # through #set; the value returned is whatever #set returns.
  def frequency
    self.set(:frequency, statistics(:frequency))
  end
end
|
9
|
+
# Represents a word.
|
10
|
+
class Word < Token
  # Conjugate with :mode => :infinitive, using +conjugator+ if given.
  def infinitive(conjugator = nil)
    conjugate(conjugator, :mode => :infinitive)
  end

  # Conjugate with :tense => :present and :mode => :participle.
  def present_participle(conjugator = nil)
    conjugate(conjugator, :tense => :present, :mode => :participle)
  end

  # Decline with :count => :plural, using +declensor+ if given.
  def plural(declensor = nil)
    declense(declensor, :count => :plural)
  end

  # Decline with :count => :singular, using +declensor+ if given.
  def singular(declensor = nil)
    declense(declensor, :count => :singular)
  end
end
|
16
|
+
class Clitic < Token; end
|
18
|
+
# Represents a number.
|
19
|
+
class Number < Token
  # Integer reading of the token's text (String#to_i on #to_s).
  def to_i
    to_s.to_i
  end

  # Float reading of the token's text (String#to_f on #to_s).
  def to_f
    to_s.to_f
  end
end
|
25
|
+
# Represents a punctuation sign.
|
26
|
+
class Punctuation < Token; end
|
28
|
+
# Represents a character that is neither
|
29
|
+
# alphabetical, numerical or a punctuation
|
30
|
+
# character (e.g. @#$%&*).
|
31
|
+
# Inside this namespace a bare `Symbol` refers to this class; use
# ::Symbol to reach Ruby's core Symbol class.
class Symbol < Token; end
|
33
|
+
# Represents an entity of unknown type.
|
34
|
+
class Unknown < Token; end
|
36
|
+
end
|
37
|
+
end
|