treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
data/spec/document.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
require_relative '../lib/treat'
|
2
|
+
|
3
|
+
describe Treat::Entities::Document do
|
4
|
+
|
5
|
+
describe "Extractable" do

  describe "#topics" do

    # This example is declared without a block so that the runner reports
    # it as pending; with an empty block it was counted as a pass even
    # though nothing was being checked. The draft of the intended body is
    # kept here for whoever implements it:
    #
    #   doc = Treat::Entities::Document.new(
    #     Treat.spec + 'samples/mathematicians/archimedes.abw').read(:abw)
    #   doc.do(:chunk, :segment, :tokenize)
    #   puts doc.topics.inspect
    it "returns a list of general topics the document belongs to"

  end

end
|
19
|
+
|
20
|
+
describe "Buildable" do

  describe "#build" do

    # Document.build is exercised with three kinds of input:
    # a local file path, a URL, and a string that is neither.

    context "when supplied with a readable file name" do
      it "opens the file and reads its content into a document" do
        path = Treat.spec + 'samples/mathematicians/leibniz.txt'
        document = Treat::Entities::Document.build(path)
        document.should be_an_instance_of(Treat::Entities::Document)
        document.to_s.index('Gottfried Leibniz').should_not be_nil
      end
    end

    context "when supplied with a url" do
      # NOTE(review): this example needs network access to the URL below.
      it "downloads the file the URL points to and opens a document with the contents of the file" do
        address = 'http://www.rubyinside.com/nethttp-cheat-sheet-2940.html'
        document = Treat::Entities::Document.build(address)
        document.should be_an_instance_of(Treat::Entities::Document)
        document.to_s.index('Rubyist').should_not be_nil
      end
    end

    context "when called with anything else than a readable file name or url" do

      it "raises an exception" do
        lambda { Treat::Entities::Document.build('nonexistent') }.should raise_error
      end

    end

  end

end
|
58
|
+
|
59
|
+
describe "Processable" do

  describe "#chunk" do

    context "when called on an HTML document" do

      it "splits the HTML document into sections, " +
      "titles, paragraphs and lists" do
        # The fixture is read inside the example so that a missing or
        # unreadable sample fails this example only, instead of raising
        # while the spec file is still being loaded.
        doc = Treat::Entities::Document.new(
          Treat.spec + 'samples/mathematicians/euler.html').read(:html)
        doc.chunk
        doc.title_count.should eql 1
        doc.title.to_s.should eql "Leonhard Euler (1707-1783)"
        doc.paragraph_count.should eql 5
      end

    end

    context "when called on a text document" do

      it "splits the document into titles and paragraphs" do
        # Same reasoning as above: build the fixture per example.
        doc = Treat::Entities::Document.new(Treat.spec +
          'samples/mathematicians/leibniz.txt').read(:txt)
        doc.chunk
        doc.title_count.should eql 1
        doc.title.to_s.should eql "Gottfried Leibniz (1646-1716)"
        doc.paragraph_count.should eql 6
      end

    end

  end

end
|
92
|
+
|
93
|
+
end
|
data/spec/entity.rb
ADDED
@@ -0,0 +1,408 @@
|
|
1
|
+
require_relative '../lib/treat'
|
2
|
+
|
3
|
+
describe Treat::Entities::Entity do
|
4
|
+
|
5
|
+
before do

  # Hand-built entity tree shared by the examples below. The leaf values
  # are "The", "lazy", "fox", "is", "running" and ".".

  # Builds a phrase carrying the given tag.
  tagged_phrase = lambda do |tag|
    phrase = Treat::Entities::Phrase.new
    phrase.set :tag, tag
    phrase
  end

  # Builds a word with the given value, category and tag.
  tagged_word = lambda do |value, category, tag|
    word = Treat::Entities::Word.new(value)
    word.set :category, category
    word.set :tag, tag
    word
  end

  @paragraph = Treat::Entities::Paragraph.new
  @sentence = Treat::Entities::Sentence.new

  @noun_phrase = tagged_phrase.call('NP')
  @verb_phrase = tagged_phrase.call('VP')
  @adj_phrase = tagged_phrase.call('ADJP')

  @det = tagged_word.call('The', :determiner, 'DT')
  @adj = tagged_word.call('lazy', :adjective, 'JJ')
  @noun = tagged_word.call('fox', :noun, 'NN')
  @aux = tagged_word.call('is', :verb, 'VBZ')
  @verb = tagged_word.call('running', :verb, 'VBG')

  @dot = Treat::Entities::Punctuation.new('.')
  @dot.set :tag, '.'

  # Assemble the tree; the chained << is kept as one expression on purpose.
  @paragraph << @sentence << [@noun_phrase, @verb_phrase, @dot]
  @noun_phrase << [@det, @adj_phrase, @noun]
  @adj_phrase << @adj
  @verb_phrase << [@aux, @verb]

end
|
38
|
+
|
39
|
+
|
40
|
+
describe "Checkable" do

  describe "#check_has(feature, do_it = true) " do

    # Declared without a block so the runner reports it as pending rather
    # than counting an empty body as a pass. Original author's note and
    # disabled assertion:
    #
    #   NOT PASSING! Dependence on caller method.
    #
    #   lambda { '$'.to_entity.check_has(:tag, false) }.should raise_error Treat::Exception
    it "checks if an entity has the feature; if not, " +
    "calls the default worker to get the feature if do_it " +
    "is set to true; if the entity doesn't have the feature " +
    " and do_it is set to false, it raises an exception."

  end

end
|
58
|
+
|
59
|
+
describe "Countable" do

  describe "#position" do

    it "returns the position of the entity in its parent, starting at 1" do
      @noun_phrase.position.should eql 1
      @det.position.should eql 1
    end

  end

  # The #frequency and #frequency_in examples are disabled. They are kept
  # as line comments (not =begin/=end, which must start at column 0).
  #
  # describe "#frequency" do
  #
  #   it "returns the frequency of the entity's value in the root" do
  #     @det.frequency.should eql 1
  #   end
  #
  # end
  #
  #
  # describe "#frequency_in(parent_type = nil)" do
  #
  #   it "returns the position of the entity's value "+
  #   "in the supplied parent type, or root if nil" do
  #     @noun_phrase.frequency_in(:sentence).should eql 1
  #
  #   end
  #
  # end

end
|
94
|
+
|
95
|
+
describe "Delegatable" do

  describe "#self.call_worker" do

    it "finds the worker class to " +
    "perform a task and delegates the task to it " do

      result = Treat::Entities::Entity.call_worker(
        '$'.to_entity, :tag, :lingua,
        Treat::Lexicalizers::Taggers, {})

      # `should` and its matcher must be on the same logical line. When
      # `.should` ended the line, Ruby treated `eql ...` as a separate
      # statement and nothing was asserted.
      # NOTE(review): now that the matcher is applied, this compares the
      # tag of the '$' entity with the tag of @sentence. Confirm that is
      # the intended expectation.
      result.should eql @sentence.tag(:lingua)

    end

  end

end
|
112
|
+
|
113
|
+
describe "Exportable" do

  context "when supplied with a classification to export" do
    it "returns a data set with the exported features" do
      # Built inside the example so that a failure constructing the
      # classification is reported against this example rather than
      # raised while the spec file is being loaded.
      classification = Treat::Classification.new(:word, :tag, :is_keyword?)
      ds = @sentence.export(classification)
      ds.classification.should eql classification
      ds.labels.should eql [:tag]
      ds.ids.should eql @sentence.words.map { |w| w.id }
      # One [tag, is_keyword?] row per word of the fixture sentence.
      ds.items.should eql [
        ["DT", false], ["JJ", false],
        ["NN", false], ["VBZ", false],
        ["VBG", false]
      ]
    end
  end

end
|
131
|
+
|
132
|
+
describe "Iterable" do

  describe "#each { |child| ... }" do
    it "yields each direct child of a node" do
      a = []
      @sentence.each do |child|
        a << child
      end
      a.should eql [@noun_phrase, @verb_phrase, @dot]
    end
  end

  describe "#each_entity(*entity_types) { |entity| ... }" do

    context "when called with no arguments" do
      it "recursively yields each element in " +
      "the tree, including itself, top-down " +
      "first then left to right" do

        a = []
        @sentence.each_entity do |e|
          a << e
        end

        a.should eql [@sentence, @noun_phrase, @det,
          @adj_phrase, @adj, @noun,
          @verb_phrase, @aux, @verb, @dot]

      end
    end

    context "when called with one or more entity " +
    "types supplied as lowercase symbols" do
      # Description fixed: "matches on of" -> "matches one of".
      it "recursively yields all elements with the given type(s), "+
      "including the receiver if it matches one of the types" do
        a = []
        @sentence.each_entity(:phrase, :punctuation) do |e|
          a << e
        end
        a.should eql [@sentence, @noun_phrase,
          @adj_phrase, @verb_phrase, @dot]
      end
    end

  end
end
|
178
|
+
|
179
|
+
describe "Magical" do

  # Examples for the dynamically named accessors listed in each
  # description (singular, plural, each_, _count, _with_, parent_,
  # frequency_in_).

  describe "#<entity or word type> - e.g. " +
  "#title, #paragraph, etc. and #adjective, #noun, etc." do

    # Description fixed: the missing "if" is restored.
    it "return the first entity with the corresponding " +
    "type inside another entity, but raises an exception " +
    "if the type occurs more than once in the entity" do
      @paragraph.sentence.should eql @sentence
    end

  end


  describe "#<entity or word type>s - e.g. " +
  "#sections, #words, etc. and #nouns, #adverbs, etc." do

    it "return an array of the entities with the " +
    "corresponding type in the subtree of an entity" do
      @paragraph.phrases.should eql [@sentence,
        @noun_phrase, @adj_phrase, @verb_phrase]
    end

  end

  describe "#each_<entity type> - e.g. " +
  "#each_sentence, #each_word, etc." do

    it "yields each of the entities with the " +
    "corresponding type in the subtree of an entity" do
      a = []

      @paragraph.each_phrase { |p| a << p }
      a.should eql [@sentence, @noun_phrase,
        @adj_phrase, @verb_phrase]

    end

  end

  describe "#<entity or word type>_count - e.g. " +
  "#sentence_count, #paragraph_count, etc. and " +
  "#noun_count, #verb_count, etc." do

    it "return the number of entities with the " +
    "corresponding type inside another entity" do
      @paragraph.sentence_count.should eql 1
      @paragraph.phrase_count.should eql 4
    end

  end

  describe "#<entity or word type>_with_<feature>(value) - " +
  "e.g. #word_with_id(x) or #adverb_with_value('seemingly')" do

    # Description fixed: "raise" + "a warning" used to join as "raisea".
    it "return the entity with the corresponding type " +
    "that have [feature] set to the supplied value; raise " +
    "a warning if there are many entities of that type" do
      @paragraph.word_with_value('The').should eql @det
      @paragraph.token_with_tag('.').should eql @dot
      @sentence.phrase_with_tag('NP').should eql @noun_phrase
    end

  end

  describe "#<entity or word type>s_with_<feature>(value) - " +
  "e.g. #phrases_with_tag('NP'), #nouns_with_value('foo')" do

    it "return an array of the entities with the " +
    "corresponding type that have [feature] set to "+
    "the supplied value" do
      @paragraph.words_with_value('The').should eql [@det]
      @paragraph.tokens_with_tag('.').should eql [@dot]
      @sentence.phrases_with_tag('NP').should eql [@noun_phrase]
    end

  end

  describe "#parent_<entity type> - e.g. " +
  "#parent_document, #parent_collection, etc." do

    it "return the first ancestor of the entity " +
    "that has the supplied type, or nil if none" do
      @sentence.parent_paragraph.should eql @paragraph
      @adj.parent_sentence.should eql @sentence
    end

  end

  describe "#frequency_in_<entity type> - e.g. " +
  "#frequency_in_collection, #frequency_in_document, etc." do

    it "return the frequency of this entity's value " +
    "in the parent entity with the corresponding type" do
      @adj.frequency_in_sentence.should eql 1
    end

  end

end
|
279
|
+
|
280
|
+
describe "Stringable" do

  # In the three examples below that previously split `should` from its
  # matcher across two lines, the matcher is now on the same line. Ruby
  # ended the statement at `.should`, so the matcher on the next line
  # was built and discarded and nothing was asserted.
  # NOTE(review): these expectations are now actually evaluated; confirm
  # the expected strings against the real output.

  describe "#to_string" do
    it "returns the true text value of the entity " +
    "or an empty string if it has none" do
      @paragraph.to_string.should eql ''
      @noun.to_string.should eql 'fox'
    end
  end

  describe "#to_s" do
    it "returns the string value of the " +
    "entity or its full subtree" do
      @paragraph.to_s.should eql 'The lazy fox is running.'
      @noun.to_s.should eql 'fox'
    end
  end

  describe "#inspect" do
    it "returns an informative string " +
    "concerning the entity" do
      @paragraph.inspect.should be_an_instance_of String
    end
  end

  describe "#short_value" do
    it "returns a shortened version of the " +
    "entity's string value" do
      @paragraph.short_value.should eql 'The lazy fox is running.'
    end
  end

end
|
316
|
+
|
317
|
+
describe "Formatters" do

  describe "#serialize" do

    before :all do
      @serializers = [:xml, :yaml] # Treat::Languages::All::Serializers
      @txt = "The story of the fox. The quick brown fox jumped over the lazy dog."
    end

    context "when called with a file to save to" do

      it "serializes a document to the supplied format" do

        # Round trip: serialize a processed paragraph to a file, rebuild
        # a document from that file, and compare the two.
        @serializers.each do |ser|
          f = Treat.spec + 'test.' + ser.to_s
          begin
            s = Treat::Entities::Paragraph.new(@txt)
            s.do(:segment, :tokenize)
            s.serialize(ser, :file => f)
            d = Treat::Entities::Document.build(f)
            d.to_s.should eql @txt
            d.size.should eql s.size
            d.token_count.should eql s.token_count
            d.tokens[0].id.should eql s.tokens[0].id
          ensure
            # Always remove the temporary file, even when an expectation
            # above fails; the existence check keeps a missing file from
            # masking the original error.
            File.delete(f) if File.exist?(f)
          end
        end

      end

    end

  end

  describe "#unserialize" do

  end

end
|
354
|
+
|
355
|
+
describe "Extractors" do

  describe "#language" do
    context "when language detection is disabled " +
    "(Treat.detect_language is set to false)" do
      it "returns the default language (Treat.default_language)" do
        Treat.detect_language = false
        Treat.default_language = :test
        begin
          s = 'Les grands hommes ne sont pas toujours grands, dit un jour Napoleon.'
          s.language.should eql :test
        ensure
          # Restore the global default even if the expectation fails, so
          # the :test language does not leak into later examples.
          Treat.default_language = :eng
        end
      end
    end

    context "when language detection is enabled " +
    "(Treat.detect_language is set to true)" do

      it "guesses the language of the entity" do

        Treat.detect_language = true
        begin
          a = 'I want to know God\'s thoughts; the rest are details. - Albert Einstein'
          b = 'El mundo de hoy no tiene sentido, asi que por que deberia pintar cuadros que lo tuvieran? - Pablo Picasso'
          c = 'Un bon Allemand ne peut souffrir les Francais, mais il boit volontiers les vins de France. - Goethe'
          d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen. - Friedrich Nietzsche'
          a.language.should eql :eng
          b.language.should eql :spa
          c.language.should eql :fre
          d.language.should eql :ger
        ensure
          # Reset default, also when one of the expectations fails.
          Treat.detect_language = false
        end

      end

    end

  end

end
|
393
|
+
|
394
|
+
end
|
395
|
+
|
396
|
+
|
397
|
+
=begin
|
398
|
+
|
399
|
+
|
400
|
+
def test_visualizers
|
401
|
+
assert_nothing_raised { @doc.visualize(:tree) }
|
402
|
+
# assert_nothing_raised { @doc.visualize(:html) }
|
403
|
+
assert_nothing_raised { @doc.visualize(:dot) }
|
404
|
+
assert_nothing_raised { @doc.visualize(:short_value) }
|
405
|
+
assert_nothing_raised { @sentence.visualize(:standoff) }
|
406
|
+
end
|
407
|
+
|
408
|
+
=end
|