treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -1,77 +0,0 @@
|
|
1
|
-
module Treat
  module Processors
    module Tokenizers
      # A native rule-based tokenizer based on the one
      # developed by Robert MacIntyre in 1995 for the Penn
      # Treebank project. This tokenizer follows the
      # conventions used by the Penn Treebank.
      #
      # Original script:
      # http://www.cis.upenn.edu/~treebank/tokenizer.sed
      #
      # Copyright (c) 2004 UTIYAMA Masao <mutiyama@nict.go.jp>
      # All rights reserved. This program is free software;
      # you can redistribute it and/or modify it under the
      # same terms as Ruby itself.
      class Macintyre
        # Tokenize the entity using a native rule-based algorithm.
        #
        # The entity's string value (entity.to_s) is passed through
        # ::split and every resulting chunk is appended to the entity
        # as a Treat::Entities::Token built with Token.from_string.
        #
        # Raises Treat::Exception if the entity already has children.
        # The options hash is accepted but not read by this method.
        def self.tokenize(entity, options = {})
          if entity.has_children?
            raise Treat::Exception,
            'Cannot tokenize a Phrase that already has children.'
          end
          chunks = split(entity.to_s)
          chunks.each do |chunk|
            # Any chunk containing a [[:space:]] character is dropped.
            # NOTE(review): ::split only breaks on \s, so this presumably
            # guards against other whitespace characters -- confirm.
            next if chunk =~ /([[:space:]]+)/
            entity << Treat::Entities::Token.from_string(chunk)
          end
        end
        # Helper method to split the string into tokens.
        #
        # Applies the substitution rules below strictly in order (later
        # rules rely on the spacing introduced by earlier ones) and
        # returns an Array of token strings. The argument is coerced
        # with #to_s, so nil or an empty string yields an empty Array
        # instead of raising a TypeError. The argument is never mutated.
        def self.split(string)
          # Interpolation builds a fresh, padded buffer; every rule
          # below mutates only this copy.
          s = " #{string} "
          s.gsub!(/\s+/," ")
          # Normalize `` and '' next to whitespace into a plain ".
          s.gsub!(/(\s+)''/,'\1"')
          s.gsub!(/(\s+)``/,'\1"')
          s.gsub!(/''(\s+)/,'"\1')
          s.gsub!(/``(\s+)/,'"\1')
          # Detach leading quote characters from the text that follows.
          s.gsub!(/ (['`]+)([^0-9].+) /,' \1 \2 ')
          # A " after a space or opening bracket becomes ``.
          s.gsub!(/([ (\[{<])"/,'\1 `` ')
          # Ellipses and most punctuation become separate tokens.
          s.gsub!(/\.\.\./,' ... ')
          s.gsub!(/[,;:@\#$%&]/,' \& ')
          # Only a period at the very end of the string (optionally
          # followed by closing brackets/quotes) is split off.
          s.gsub!(/([^.])([.])([\])}>"']*)[ ]*$/,'\1 \2\3 ')
          s.gsub!(/[?!]/,' \& ')
          s.gsub!(/[\]\[(){}<>]/,' \& ')
          s.gsub!(/--/,' -- ')
          # Pad both ends so the space-anchored rules below can match
          # at the string boundaries.
          s.sub!(/$/,' ')
          s.sub!(/^/,' ')
          # Every remaining " becomes ''.
          s.gsub!(/"/,' \'\' ')
          # Possessives and contractions.
          s.gsub!(/([^'])' /,'\1 \' ')
          s.gsub!(/'([sSmMdD]) /,' \'\1 ')
          s.gsub!(/'ll /,' \'ll ')
          s.gsub!(/'re /,' \'re ')
          s.gsub!(/'ve /,' \'ve ')
          s.gsub!(/n't /,' n\'t ')
          s.gsub!(/'LL /,' \'LL ')
          s.gsub!(/'RE /,' \'RE ')
          s.gsub!(/'VE /,' \'VE ')
          s.gsub!(/N'T /,' N\'T ')
          # Fused words that are split into two tokens.
          s.gsub!(/ ([Cc])annot /,' \1an not ')
          s.gsub!(/ ([Dd])'ye /,' \1\' ye ')
          s.gsub!(/ ([Gg])imme /,' \1im me ')
          s.gsub!(/ ([Gg])onna /,' \1on na ')
          s.gsub!(/ ([Gg])otta /,' \1ot ta ')
          s.gsub!(/ ([Ll])emme /,' \1em me ')
          s.gsub!(/ ([Mm])ore'n /,' \1ore \'n ')
          s.gsub!(/ '([Tt])is /,' \'\1 is ')
          s.gsub!(/ '([Tt])was /,' \'\1 was ')
          s.gsub!(/ ([Ww])anna /,' \1an na ')
          # Re-join digit groups that the comma rule split apart
          # ("1 , 000" -> "1,000"); sub! returns nil once nothing is
          # left to join, which ends the loop.
          while s.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4'); end
          s.gsub!(/\//, ' / ')
          s.gsub!(/\s+/,' ')
          s.strip!
          s.split(/\s+/)
        end
      end
    end
  end
end
|
@@ -1,30 +0,0 @@
|
|
1
|
-
module Treat
  module Processors
    module Tokenizers
      # Adapter around the 'tokenizer' gem, a rule-based
      # tokenizer for English, German or French text.
      class Multilingual
        # Tokenizer instances, cached per language code so each
        # language's tokenizer is built only once.
        @@tokenizers = {}
        # Load the 'tokenizer' gem with warnings silenced.
        silence_warnings { require 'tokenizer' }
        # Tokenize the entity's string value and append one
        # Treat::Entities::Token per token to the entity.
        #
        # Options:
        # :language => (Symbol) Force a language for the tokenizer;
        #              when absent, entity.language is used.
        def self.tokenize(entity, options = {})
          requested = options[:language] || entity.language
          code = Treat::Languages.code(requested, 1)
          @@tokenizers[code] ||= ::Tokenizer::Tokenizer.new(code)
          @@tokenizers[code].tokenize(entity.to_s).each do |piece|
            # Tokens containing any whitespace character are skipped.
            next if piece =~ /([[:space:]]+)/
            entity << Treat::Entities::Token.from_string(piece)
          end
        end
      end
    end
  end
end
|
data/lib/treat/registrable.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
module Treat
  # Mixin that maintains a per-node token registry: a hash with an
  # :id table (token id => token) and a :value table (downcased
  # string value => array of tokens). The including object must
  # provide #has_parent?, #is_root?, #type and an @parent variable.
  module Registrable
    # Record the token in this node's registry, then pass it on to
    # the parent (if any) so that every ancestor registers it too.
    def register_token(token)
      own = (@token_registry ||= {:value => {}, :id => {}})
      own[:id][token.id] = token
      key = token.to_s.downcase
      (own[:value][key] ||= []) << token
      @parent.register_token(token) if has_parent?
    end
    # Look up a token registry.
    #
    # With no argument, the lookup walks up to the root node and
    # returns the root's registry. With a type, it returns the
    # registry of the nearest node (starting with this one) whose
    # #type equals it, or the topmost node's registry when no node
    # on the way up matches. Registries are created on demand.
    def token_registry(type = nil)
      # `type` is the argument here; `self.type` is the node's own type.
      mine = (type == nil && is_root?) || type == self.type
      return @parent.token_registry(type) if !mine && has_parent?
      @token_registry ||= {:value => {}, :id => {}}
    end
  end
end
|
data/lib/treat/sugar.rb
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
module Treat
  # This module provides syntactic sugar in the following manner:
  # for every entity type listed by Treat::Entities.list (except
  # Symbol), a method named after the entity class is defined on
  # Object. For example, Word('fox') then calls
  # Treat::Entities::Word.build('fox', nil).
  module Sugar
    # Syntactic sugar is disabled by default.
    @@sweetened = false
    # Installs syntactic sugar. Does nothing if already installed.
    def sweeten!
      return if @@sweetened
      @@sweetened = true
      each_entity_class do |type, klass|
        # Never define a builder named Symbol on Object.
        next if type == :Symbol
        Object.class_eval do
          define_method(type) do |value = '', id = nil|
            klass.build(value, id)
          end
        end
      end
    end

    # Uninstalls syntactic sugar. Does nothing if not installed.
    def unsweeten!
      return unless @@sweetened
      @@sweetened = false
      each_entity_class do |type, klass|
        next if type == :Symbol
        Object.class_eval do
          remove_method(type)
        end
      end
    end

    # Boolean - whether syntactic sugar is
    # enabled or not.
    def sweetened?; @@sweetened; end
    private
    # Helper method, yields each entity type (as a camel-cased
    # Symbol) together with the matching class under Treat::Entities.
    def each_entity_class
      Treat::Entities.list.each do |entity_type|
        type = cc(entity_type).intern
        # The constant is looked up in Treat::Entities only (inherit
        # = false). The previous code passed the not-yet-assigned
        # local `klass` (nil) as the inherit flag, which had the same
        # effect by accident.
        klass = Treat::Entities.const_get(type, false)
        yield type, klass
      end
    end
  end
end
|
data/lib/treat/viewable.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
module Treat
  # String representations for entities. The including object must
  # provide #visualize, #cl and #caller_method, and carry @value,
  # @id, @features and @dependencies.
  module Viewable
    # Return the entity's raw @value.
    def to_string; @value; end
    # Return the entity rendered by the :txt visualizer.
    def to_s; visualize(:txt); end
    alias :to_str :to_s
    # Return the entity rendered by the :short_value visualizer,
    # passing ml as its :max_length option.
    def short_value(ml = 6); visualize(:short_value, :max_length => ml); end
    # Return an informative string representation of the entity:
    # "<class> (<id>) | <short value> | <features> | { <dependencies> }".
    #
    # When caller_method(2) reports :inspect, only the
    # "<class> (<id>)" prefix is returned. (The original code also
    # evaluated a bare `@id.to_s` in that branch and discarded the
    # result; that dead expression has been removed.)
    def inspect
      summary = "#{cl(self.class)} (#{@id.to_s})"
      # Keep this call directly inside #inspect: caller_method is
      # given a fixed depth, so moving it into a block or helper
      # could change what it reports.
      return summary if caller_method(2) == :inspect
      links = @dependencies.map do |dependency|
        "#{dependency.target}#{dependency.type}"
      end
      summary + " | #{short_value.inspect}" +
        " | #{@features.inspect}" +
        " | { #{links.join(', ')} }"
    end
    # Print out the entity rendered by the :tree visualizer.
    def print_tree; puts visualize(:tree); end
  end
end
|
data/lib/treat/visitable.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
module Treat
  # Makes a tree node visitable through #accept.
  module Visitable
    # Let the visitor class klass (a member of group) visit this
    # node by calling klass.method(node, options).
    #
    # - Raises Treat::Exception when the group does not target
    #   this node's class.
    # - For a :transformer group, a node with children forwards the
    #   visit to each child whose class the group targets (children
    #   sharing this node's id are skipped); a node without children
    #   is visited directly. Either way the node itself is returned.
    # - For any other group type, the node is visited directly and
    #   the visitor's return value is passed back.
    def accept(group, klass, method, options)
      unless group.has_target?(self.class)
        raise Treat::Exception,
        "This type of visitor cannot visit a #{self.class}."
      end
      unless group.type == :transformer
        return klass.send(method, self, options)
      end
      if has_children?
        @children.each do |child|
          next unless group.has_target?(child.class) && child.id != id
          child.accept(group, klass, method, options)
        end
      else
        klass.send(method, self, options)
      end
      self
    end
  end
end
|
data/test/profile.rb
DELETED
data/test/tc_entity.rb
DELETED
@@ -1,117 +0,0 @@
|
|
1
|
-
module Treat
  module Tests
    # Exercises the generic entity behaviour (string views, token
    # registry, delegation, magic methods, features) on a small
    # hand-built tree for "The lazy fox is running.".
    class TestEntity < Test::Unit::TestCase
      # Build the fixture tree:
      #   section > sentence > [NP, VP, '.']
      #   NP > [The, ADJP > [lazy], fox]
      #   VP > [is, running]
      def setup
        @section = Treat::Entities::Section.new
        @sentence = Treat::Entities::Sentence.new
        @noun_cons = tagged_phrase('NP')
        @verb_cons = tagged_phrase('VP')
        @adj_cons = tagged_phrase('ADJP')
        @det = penn_word('The', :determiner, 'DT')
        @adj = penn_word('lazy', :adjective, 'JJ')
        @noun = penn_word('fox', :noun, 'NN')
        @aux = penn_word('is', :verb, 'VBZ')
        @verb = penn_word('running', :verb, 'VBG')
        @dot = Treat::Entities::Punctuation.new('.')
        @section << @sentence << [@noun_cons, @verb_cons, @dot]
        @noun_cons << [@det, @adj_cons, @noun]
        @adj_cons << @adj
        @verb_cons << [@aux, @verb]
      end

      # The string views of a tokenized sentence must not raise.
      def test_viewable
        sentence = 'Happiness is not an ideal of reason, but of imagination.'.tokenize
        assert_nothing_raised do
          # Plain string value of the sentence.
          sentence.to_s
          # Debug description of the sentence.
          sentence.inspect
          # Shortened version of the sentence with [...]
          sentence.short_value
        end
      end

      # Tokens are reachable through the registry by id and by value,
      # and a leaf sees the same registry as the section.
      def test_registrable
        registry = @section.token_registry
        assert_equal(registry, @verb.token_registry)
        assert_equal(@noun, @section.token_registry[:id][@noun.id])
        assert_equal([@noun], @section.token_registry[:value][@noun.value])
      end

      # An unknown worker raises; a known task does not.
      def test_delegatable_visitable
        assert_raise(Treat::Exception) { @section.encoding(:nonexistent) }
        assert_nothing_raised { @section.language }
      end

      def test_type
        assert_equal(:section, @section.type)
      end

      # None of the printers may raise on the fixture tree.
      def test_printers
        assert_nothing_raised do
          @section.to_s
          @section.to_string
          @section.short_value
          @section.inspect
        end
      end

      # Dynamically resolved predicates, finders, counters and iterators.
      def test_magic_methods

        assert_equal(true, @sentence.is_sentence?)
        assert_equal(true, @noun.is_noun?)

        assert_equal(@sentence, @section.sentence)
        assert_equal([@sentence], @section.sentences)
        assert_equal(1, @section.sentence_count)

        assert_equal([@det], @section.words_with_value('The'))
        assert_equal([@verb], @section.words_with_tag('VBG'))

        assert_equal(@noun, @section.noun)
        assert_equal([@aux, @verb], @section.verbs)
        assert_equal(6, @section.token_count)

        @section.each_sentence { |found| assert_equal(@sentence, found) }
        @section.each_noun { |found| assert_equal(@noun, found) }
        @section.each_with_value('The') { |found| assert_equal(@det, found) }

        assert_equal(@sentence, @noun.parent_sentence)
      end

      # A feature that was set is readable as a method; an unknown
      # one raises.
      def test_features
        @verb.set :test, :test
        assert_equal(:test, @verb.test)
        assert_raise(Treat::Exception) { @verb.nonexistent }
      end

      private

      # Build an empty phrase carrying the given tag.
      def tagged_phrase(tag)
        phrase = Treat::Entities::Phrase.new
        phrase.set :tag, tag
        phrase
      end

      # Build a word with its category and tag, using the :penn tag set.
      def penn_word(value, category, tag)
        word = Treat::Entities::Word.new(value)
        word.set :category, category
        word.set :tag, tag
        word.set :tag_set, :penn
        word
      end
    end
  end
end
|
data/test/tc_extractors.rb
DELETED
@@ -1,73 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module Treat
  module Tests
    # Smoke tests for the extractor workers, run against the shared
    # English fixtures under Treat::Tests::English.
    class TestExtractors < Test::Unit::TestCase

      def setup
        @time = Treat::Tests::English::Time
        @date = Treat::Tests::English::Date
        @doc = Treat::Tests::English::LongDoc
        @word = Treat::Tests::English::Word
        @col = Treat::Tests::English::Collection
      end

      def test_time
        assert_nothing_raised { @time.time(:nickel) }
      end

      def test_date
        assert_equal 2011, @date.date(:chronic).year
        assert_equal 2011, @date.date(:ruby).year
      end

      def test_topic_words
        assert_nothing_raised { @col.topic_words(:lda) }
      end

      def test_named_entity
        assert_nothing_raised { @doc.named_entity(:stanford) }
      end

      def test_keywords
        assert_nothing_raised do
          topics = @col.topic_words(:lda)
          # NOTE(review): :topics_frequency is the worker name used by
          # the original test -- confirm that such a worker exists.
          @doc.keywords(:topics_frequency, :topic_words => topics)
        end
      end

      def test_topics
        assert_nothing_raised { @doc.topics(:reuters) }
      end

      def test_statistics
        @doc.chunk.segment(:tactful).tokenize
        assert_equal 1, @word.frequency_in(:document)
        assert_nothing_raised { @word.tf_idf ; puts @word.tf_idf }
        # assert_nothing_raised { @doc.statistics(:position_in) }
        # assert_nothing_raised { @doc.statistics(:transition_matrix) }
        # assert_nothing_raised { @doc.statistics(:transition_probability) }
      end

      # Without detection the document reports the default language;
      # with detection enabled each sample is identified from its text.
      def test_language
        assert_equal Treat.default_language, @doc.language
        Treat.detect_language = true
        assert_equal :eng, @doc.language

        a = 'I want to know God\'s thoughts; the rest are details. - Albert Einstein'
        b = 'El mundo de hoy no tiene sentido, así que ¿por qué debería pintar cuadros que lo tuvieran? - Pablo Picasso'
        c = 'Un bon Allemand ne peut souffrir les Français, mais il boit volontiers les vins de France. - Goethe'
        d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen. - Friedrich Nietzsche'

        assert_equal :eng, a.language
        assert_equal :spa, b.language
        assert_equal :fre, c.language
        assert_equal :ger, d.language
      ensure
        # Reset defaults even when an assertion above fails, so the
        # global setting cannot leak into other tests.
        Treat.detect_language = false
      end

    end
  end
end
|