treat 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +4 -4
- data/TODO +21 -54
- data/lib/economist/half_cocked_basel.txt +16 -0
- data/lib/economist/hose_and_dry.doc +0 -0
- data/lib/economist/hungarys_troubles.abw +70 -0
- data/lib/economist/republican_nomination.pdf +0 -0
- data/lib/economist/saving_the_euro.odt +0 -0
- data/lib/economist/to_infinity_and_beyond.txt +15 -0
- data/lib/economist/zero_sum.html +91 -0
- data/lib/treat.rb +58 -72
- data/lib/treat/buildable.rb +59 -15
- data/lib/treat/categories.rb +26 -14
- data/lib/treat/category.rb +2 -2
- data/lib/treat/delegatable.rb +65 -48
- data/lib/treat/doable.rb +44 -0
- data/lib/treat/entities.rb +34 -14
- data/lib/treat/entities/collection.rb +2 -0
- data/lib/treat/entities/document.rb +3 -2
- data/lib/treat/entities/entity.rb +105 -90
- data/lib/treat/entities/phrases.rb +17 -0
- data/lib/treat/entities/tokens.rb +28 -13
- data/lib/treat/entities/zones.rb +20 -0
- data/lib/treat/extractors.rb +49 -11
- data/lib/treat/extractors/coreferences/stanford.rb +68 -0
- data/lib/treat/extractors/date/chronic.rb +32 -0
- data/lib/treat/extractors/date/ruby.rb +25 -0
- data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
- data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
- data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
- data/lib/treat/extractors/language/what_language.rb +49 -0
- data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
- data/lib/treat/extractors/roles/naive.rb +73 -0
- data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
- data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
- data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
- data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
- data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
- data/lib/treat/extractors/time/nickel.rb +30 -12
- data/lib/treat/extractors/topic_words/lda.rb +9 -9
- data/lib/treat/extractors/topics/reuters.rb +14 -15
- data/lib/treat/extractors/topics/reuters/region.xml +1 -0
- data/lib/treat/features.rb +7 -0
- data/lib/treat/formatters/readers/abw.rb +6 -1
- data/lib/treat/formatters/readers/autoselect.rb +5 -6
- data/lib/treat/formatters/readers/doc.rb +3 -1
- data/lib/treat/formatters/readers/html.rb +1 -1
- data/lib/treat/formatters/readers/image.rb +43 -0
- data/lib/treat/formatters/readers/odt.rb +1 -2
- data/lib/treat/formatters/readers/pdf.rb +9 -1
- data/lib/treat/formatters/readers/xml.rb +40 -0
- data/lib/treat/formatters/serializers/xml.rb +50 -14
- data/lib/treat/formatters/serializers/yaml.rb +7 -2
- data/lib/treat/formatters/unserializers/xml.rb +33 -7
- data/lib/treat/formatters/visualizers/dot.rb +90 -20
- data/lib/treat/formatters/visualizers/short_value.rb +2 -2
- data/lib/treat/formatters/visualizers/standoff.rb +2 -2
- data/lib/treat/formatters/visualizers/tree.rb +1 -1
- data/lib/treat/formatters/visualizers/txt.rb +13 -4
- data/lib/treat/group.rb +16 -10
- data/lib/treat/helpers/linguistics_loader.rb +18 -0
- data/lib/treat/inflectors.rb +10 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
- data/lib/treat/inflectors/declensions/english.rb +319 -0
- data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
- data/lib/treat/install.rb +59 -0
- data/lib/treat/kernel.rb +18 -8
- data/lib/treat/languages.rb +18 -11
- data/lib/treat/languages/arabic.rb +4 -2
- data/lib/treat/languages/chinese.rb +6 -2
- data/lib/treat/languages/dutch.rb +16 -0
- data/lib/treat/languages/english.rb +47 -19
- data/lib/treat/languages/french.rb +8 -5
- data/lib/treat/languages/german.rb +9 -6
- data/lib/treat/languages/greek.rb +16 -0
- data/lib/treat/languages/italian.rb +6 -3
- data/lib/treat/languages/polish.rb +16 -0
- data/lib/treat/languages/portuguese.rb +16 -0
- data/lib/treat/languages/russian.rb +16 -0
- data/lib/treat/languages/spanish.rb +16 -0
- data/lib/treat/languages/swedish.rb +16 -0
- data/lib/treat/languages/tags.rb +377 -0
- data/lib/treat/lexicalizers.rb +34 -23
- data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
- data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
- data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
- data/lib/treat/lexicalizers/tag/brill.rb +35 -40
- data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
- data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
- data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
- data/lib/treat/processors.rb +8 -8
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +114 -99
- data/lib/treat/processors/parsers/stanford.rb +109 -41
- data/lib/treat/processors/segmenters/punkt.rb +17 -18
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
- data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
- data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
- data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
- data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
- data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
- data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
- data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
- data/lib/treat/processors/segmenters/stanford.rb +38 -37
- data/lib/treat/processors/segmenters/tactful.rb +5 -4
- data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
- data/lib/treat/processors/tokenizers/perl.rb +2 -2
- data/lib/treat/processors/tokenizers/punkt.rb +6 -2
- data/lib/treat/processors/tokenizers/stanford.rb +25 -24
- data/lib/treat/processors/tokenizers/tactful.rb +1 -2
- data/lib/treat/proxies.rb +2 -35
- data/lib/treat/registrable.rb +17 -22
- data/lib/treat/sugar.rb +11 -11
- data/lib/treat/tree.rb +27 -17
- data/lib/treat/viewable.rb +29 -0
- data/lib/treat/visitable.rb +1 -1
- data/test/tc_entity.rb +56 -49
- data/test/tc_extractors.rb +41 -18
- data/test/tc_formatters.rb +7 -8
- data/test/tc_inflectors.rb +19 -24
- data/test/tc_lexicalizers.rb +12 -19
- data/test/tc_processors.rb +26 -12
- data/test/tc_resources.rb +2 -7
- data/test/tc_treat.rb +20 -22
- data/test/tc_tree.rb +4 -4
- data/test/tests.rb +3 -5
- data/test/texts.rb +13 -14
- data/tmp/INFO +1 -0
- metadata +78 -158
- data/bin/INFO +0 -1
- data/examples/benchmark.rb +0 -81
- data/examples/keywords.rb +0 -148
- data/lib/treat/detectors.rb +0 -31
- data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
- data/lib/treat/detectors/format/file.rb +0 -36
- data/lib/treat/detectors/language/what_language.rb +0 -29
- data/lib/treat/entities/constituents.rb +0 -15
- data/lib/treat/entities/sentence.rb +0 -8
- data/lib/treat/extractors/named_entity/abner.rb +0 -20
- data/lib/treat/extractors/named_entity/stanford.rb +0 -174
- data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
- data/lib/treat/extractors/time/chronic.rb +0 -20
- data/lib/treat/extractors/time/native.rb +0 -18
- data/lib/treat/formatters/readers/gocr.rb +0 -26
- data/lib/treat/formatters/readers/ocropus.rb +0 -31
- data/lib/treat/formatters/visualizers/html.rb +0 -13
- data/lib/treat/formatters/visualizers/inspect.rb +0 -20
- data/lib/treat/inflectors/declensions/en.rb +0 -18
- data/lib/treat/languages/categories.rb +0 -5
- data/lib/treat/languages/english/categories.rb +0 -23
- data/lib/treat/languages/english/tags.rb +0 -352
- data/lib/treat/languages/xinhua.rb +0 -12
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
- data/lib/treat/string.rb +0 -5
- data/test/tc_detectors.rb +0 -26
@@ -0,0 +1,29 @@
|
|
1
|
+
module Treat
  # Mixin providing plain-text and debugging views of an entity.
  #
  # The methods below use names this module does not define itself:
  # #visualize, #cl and #caller_method, plus the @id, @value, @features
  # and @dependencies instance variables. They are expected to be supplied
  # by the including class or elsewhere in the library.
  module Viewable
    # Return the raw string value stored in @value.
    def to_string
      @value
    end

    # Return the entity rendered by the :txt visualizer. This is not an
    # alias for #to_string, which returns @value untouched.
    def to_s
      visualize(:txt)
    end
    # Expose the same rendering under the name #to_str.
    alias :to_str :to_s

    # Return the entity rendered by the :short_value visualizer.
    # ml is forwarded as that visualizer's :max_length option (default 6).
    def short_value(ml = 6)
      visualize(:short_value, :max_length => ml)
    end

    # Return an informative string representation of the entity.
    #
    # The full form is:
    #   "<class> (<id>) | <short value> | <features> | { <dependencies> }"
    # where each dependency is rendered as its target followed by its type.
    # When caller_method(2) reports :inspect, only "<class> (<id>)" is
    # returned.
    # NOTE(review): presumably this keeps nested inspections short - confirm.
    def inspect
      header = "#{cl(self.class)} (#{@id.to_s})"
      # Keep this call directly in the method body (not inside a block or a
      # helper): the depth argument presumably counts frames from here -
      # TODO confirm against the definition of caller_method.
      return header if caller_method(2) == :inspect

      dependencies = @dependencies.map do |dependency|
        "#{dependency.target}#{dependency.type}"
      end
      "#{header} | #{short_value.inspect} | #{@features.inspect}" \
        " | { #{dependencies.join(', ')} }"
    end

    # Print the output of the :tree visualizer to standard output.
    def print_tree
      puts visualize(:tree)
    end
  end
end
|
data/lib/treat/visitable.rb
CHANGED
data/test/tc_entity.rb
CHANGED
@@ -2,17 +2,14 @@ module Treat
|
|
2
2
|
module Tests
|
3
3
|
class TestEntity < Test::Unit::TestCase
|
4
4
|
def setup
|
5
|
-
@
|
6
|
-
|
5
|
+
@section = Treat::Entities::Section.new
|
7
6
|
@sentence = Treat::Entities::Sentence.new
|
8
|
-
|
9
|
-
@
|
10
|
-
@
|
11
|
-
@
|
12
|
-
@
|
13
|
-
@
|
14
|
-
@adj_phrase.set :tag, 'ADJP'
|
15
|
-
|
7
|
+
@noun_cons = Treat::Entities::Phrase.new
|
8
|
+
@noun_cons.set :tag, 'NP'
|
9
|
+
@verb_cons = Treat::Entities::Phrase.new
|
10
|
+
@verb_cons.set :tag, 'VP'
|
11
|
+
@adj_cons = Treat::Entities::Phrase.new
|
12
|
+
@adj_cons.set :tag, 'ADJP'
|
16
13
|
@det = Treat::Entities::Word.new('The')
|
17
14
|
@det.set :category, :determiner
|
18
15
|
@det.set :tag, 'DT'
|
@@ -34,77 +31,87 @@ module Treat
|
|
34
31
|
@verb.set :tag, 'VBG'
|
35
32
|
@verb.set :tag_set, :penn
|
36
33
|
@dot = Treat::Entities::Punctuation.new('.')
|
37
|
-
|
38
|
-
@
|
39
|
-
@
|
40
|
-
@
|
41
|
-
@verb_phrase << [@aux, @verb]
|
34
|
+
@section << @sentence << [@noun_cons, @verb_cons, @dot]
|
35
|
+
@noun_cons << [@det, @adj_cons, @noun]
|
36
|
+
@adj_cons << @adj
|
37
|
+
@verb_cons << [@aux, @verb]
|
42
38
|
end
|
43
39
|
|
44
|
-
def
|
45
|
-
|
40
|
+
def test_viewable
|
41
|
+
s = 'Happiness is not an ideal of reason, but of imagination.'.tokenize
|
42
|
+
assert_nothing_raised do
|
43
|
+
# Return the string value of the sentence.
|
44
|
+
s.to_s
|
45
|
+
# Return a debug description of the sentence.
|
46
|
+
s.inspect
|
47
|
+
# Return a shortened version of the Sentence with [...]
|
48
|
+
s.short_value
|
49
|
+
end
|
46
50
|
end
|
47
|
-
|
51
|
+
|
48
52
|
def test_registrable
|
49
|
-
assert_equal @
|
50
|
-
assert_equal @noun, @
|
51
|
-
assert_equal [@noun], @
|
53
|
+
assert_equal @section.token_registry, @verb.token_registry
|
54
|
+
assert_equal @noun, @section.token_registry[:id][@noun.id]
|
55
|
+
assert_equal [@noun], @section.token_registry[:value][@noun.value]
|
52
56
|
end
|
53
57
|
|
54
|
-
|
55
58
|
def test_delegatable_visitable
|
56
|
-
assert_raise(Treat::Exception) do
|
57
|
-
@
|
59
|
+
assert_raise(Treat::Exception) do
|
60
|
+
@section.encoding(:nonexistent)
|
58
61
|
end
|
59
62
|
assert_nothing_raised do
|
60
|
-
@
|
63
|
+
@section.language
|
61
64
|
end
|
62
65
|
end
|
63
|
-
|
66
|
+
|
64
67
|
def test_type
|
65
|
-
assert_equal :section, @
|
68
|
+
assert_equal :section, @section.type
|
66
69
|
end
|
67
|
-
|
70
|
+
|
68
71
|
def test_printers
|
69
72
|
assert_nothing_raised do
|
70
|
-
@
|
71
|
-
@
|
72
|
-
@
|
73
|
-
@
|
73
|
+
@section.to_s
|
74
|
+
@section.to_string
|
75
|
+
@section.short_value
|
76
|
+
@section.inspect
|
74
77
|
end
|
75
78
|
end
|
76
79
|
|
77
80
|
def test_magic_methods
|
78
|
-
|
79
|
-
assert_equal
|
80
|
-
assert_equal
|
81
|
-
|
82
|
-
assert_equal
|
83
|
-
assert_equal [@
|
84
|
-
|
85
|
-
|
86
|
-
assert_equal [@
|
87
|
-
assert_equal
|
88
|
-
|
89
|
-
@
|
81
|
+
|
82
|
+
assert_equal true, @sentence.is_sentence?
|
83
|
+
assert_equal true, @noun.is_noun?
|
84
|
+
|
85
|
+
assert_equal @sentence, @section.sentence
|
86
|
+
assert_equal [@sentence], @section.sentences
|
87
|
+
assert_equal 1, @section.sentence_count
|
88
|
+
|
89
|
+
assert_equal [@det], @section.words_with_value('The')
|
90
|
+
assert_equal [@verb], @section.words_with_tag('VBG')
|
91
|
+
|
92
|
+
assert_equal @noun, @section.noun
|
93
|
+
assert_equal [@aux, @verb], @section.verbs
|
94
|
+
assert_equal 6, @section.token_count
|
95
|
+
|
96
|
+
@section.each_sentence do |s|
|
90
97
|
assert_equal @sentence, s
|
91
98
|
end
|
92
|
-
@
|
99
|
+
@section.each_noun do |n|
|
93
100
|
assert_equal @noun, n
|
94
101
|
end
|
95
|
-
@
|
102
|
+
@section.each_with_value('The') do |x|
|
96
103
|
assert_equal @det, x
|
97
104
|
end
|
98
|
-
|
105
|
+
|
99
106
|
assert_equal @sentence, @noun.parent_sentence
|
100
107
|
end
|
101
108
|
|
102
109
|
def test_features
|
103
110
|
@verb.set :test, :test
|
104
111
|
assert_equal :test, @verb.test
|
105
|
-
assert_raise(Treat::Exception) { @verb.nonexistent }
|
112
|
+
assert_raise(Treat::Exception) { @verb.nonexistent }
|
106
113
|
end
|
107
|
-
|
114
|
+
|
108
115
|
end
|
109
116
|
end
|
110
117
|
end
|
data/test/tc_extractors.rb
CHANGED
@@ -1,34 +1,38 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
module Treat
|
2
3
|
module Tests
|
3
4
|
class TestExtractors < Test::Unit::TestCase
|
4
5
|
|
5
6
|
def setup
|
6
|
-
@time = Treat::Tests::
|
7
|
-
@date = Treat::Tests::
|
8
|
-
@doc = Treat::Tests::
|
9
|
-
@word = Treat::Tests::
|
7
|
+
@time = Treat::Tests::English::Time
|
8
|
+
@date = Treat::Tests::English::Date
|
9
|
+
@doc = Treat::Tests::English::LongDoc
|
10
|
+
@word = Treat::Tests::English::Word
|
11
|
+
@col = Treat::Tests::English::Collection
|
10
12
|
end
|
11
13
|
|
12
14
|
def test_time
|
13
|
-
assert_nothing_raised { @
|
14
|
-
|
15
|
-
|
15
|
+
assert_nothing_raised { @time.time(:nickel) }
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_date
|
19
|
+
assert_equal 2011, @date.date(:chronic).year
|
20
|
+
assert_equal 2011, @date.date(:ruby).year
|
16
21
|
end
|
17
22
|
|
18
23
|
def test_topic_words
|
19
|
-
assert_nothing_raised { @
|
24
|
+
assert_nothing_raised { @col.topic_words(:lda) }
|
20
25
|
end
|
21
|
-
|
22
|
-
|
26
|
+
|
23
27
|
def test_named_entity
|
24
|
-
|
25
|
-
|
28
|
+
p = 'Angela Merkel and Nicolas Sarkozy were the first ones to board the p'
|
29
|
+
assert_nothing_raised { @doc.named_entity(:stanford) }
|
26
30
|
end
|
27
31
|
|
28
32
|
def test_keywords
|
29
|
-
assert_nothing_raised do
|
30
|
-
topics = @
|
31
|
-
@doc.keywords(:topics_frequency, topic_words
|
33
|
+
assert_nothing_raised do
|
34
|
+
topics = @col.topic_words(:lda)
|
35
|
+
@doc.keywords(:topics_frequency, :topic_words => topics)
|
32
36
|
end
|
33
37
|
end
|
34
38
|
|
@@ -38,13 +42,32 @@ module Treat
|
|
38
42
|
|
39
43
|
def test_statistics
|
40
44
|
@doc.chunk.segment(:tactful).tokenize
|
41
|
-
|
42
|
-
assert_nothing_raised { @
|
43
|
-
assert_nothing_raised { @word.statistics(:frequency_in) }
|
45
|
+
assert_equal 1, @word.frequency_in(:document)
|
46
|
+
assert_nothing_raised { @word.tf_idf ; puts @word.tf_idf }
|
44
47
|
# assert_nothing_raised { @doc.statistics(:position_in) }
|
45
48
|
# assert_nothing_raised { @doc.statistics(:transition_matrix) }
|
46
49
|
# assert_nothing_raised { @doc.statistics(:transition_probability) }
|
47
50
|
end
|
51
|
+
|
52
|
+
def test_language
|
53
|
+
assert_equal Treat.default_language, @doc.language
|
54
|
+
Treat.detect_language = true
|
55
|
+
assert_equal :eng, @doc.language
|
56
|
+
|
57
|
+
a = 'I want to know God\'s thoughts; the rest are details. - Albert Einstein'
|
58
|
+
b = 'El mundo de hoy no tiene sentido, así que ¿por qué debería pintar cuadros que lo tuvieran? - Pablo Picasso'
|
59
|
+
c = 'Un bon Allemand ne peut souffrir les Français, mais il boit volontiers les vins de France. - Goethe'
|
60
|
+
d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen. - Friedrich Nietzsche'
|
61
|
+
|
62
|
+
assert_equal :eng, a.language
|
63
|
+
assert_equal :spa, b.language
|
64
|
+
assert_equal :fre, c.language
|
65
|
+
assert_equal :ger, d.language
|
66
|
+
|
67
|
+
# Reset defaults
|
68
|
+
Treat.detect_language = false
|
69
|
+
end
|
70
|
+
|
48
71
|
end
|
49
72
|
end
|
50
73
|
end
|
data/test/tc_formatters.rb
CHANGED
@@ -3,25 +3,25 @@ module Treat
|
|
3
3
|
class TestFormatters < Test::Unit::TestCase
|
4
4
|
|
5
5
|
def setup
|
6
|
-
@doc = Treat::Tests::
|
7
|
-
@
|
8
|
-
@sentence = Treat::Tests::EnglishSentence
|
6
|
+
@doc = Treat::Tests::English::ShortDoc
|
7
|
+
@sentence = Treat::Tests::English::Sentence
|
9
8
|
end
|
10
9
|
|
11
10
|
def test_readers
|
12
|
-
#
|
11
|
+
# This is done by loading a collection with all types of texts.
|
13
12
|
end
|
14
13
|
|
15
|
-
|
16
14
|
def test_serializers_and_unserializers
|
15
|
+
# Test roundtrip Ruby -> YAML -> Ruby -> YAML
|
17
16
|
create_temp_file('yml') do |tmp|
|
18
|
-
@doc.serialize(:yaml
|
17
|
+
@doc.serialize(:yaml, :file => tmp)
|
19
18
|
doc = Treat::Entities::Document(tmp)
|
20
19
|
assert_equal File.read(tmp).length,
|
21
20
|
doc.serialize(:yaml).length
|
22
21
|
end
|
22
|
+
# Test roundtrip Ruby -> XML -> Ruby -> XML.
|
23
23
|
create_temp_file('xml') do |tmp|
|
24
|
-
@doc.serialize(:xml
|
24
|
+
@doc.serialize(:xml, :file => tmp)
|
25
25
|
doc = Treat::Entities::Document(tmp)
|
26
26
|
assert_equal File.read(tmp).length,
|
27
27
|
doc.serialize(:xml).length
|
@@ -32,7 +32,6 @@ module Treat
|
|
32
32
|
assert_nothing_raised { @doc.visualize(:tree) }
|
33
33
|
# assert_nothing_raised { @doc.visualize(:html) }
|
34
34
|
assert_nothing_raised { @doc.visualize(:dot) }
|
35
|
-
assert_nothing_raised { @doc.visualize(:inspect) }
|
36
35
|
assert_nothing_raised { @doc.visualize(:short_value) }
|
37
36
|
assert_nothing_raised { @sentence.visualize(:standoff) }
|
38
37
|
end
|
data/test/tc_inflectors.rb
CHANGED
@@ -2,38 +2,33 @@ module Treat
|
|
2
2
|
module Tests
|
3
3
|
class TestInflectors < Test::Unit::TestCase
|
4
4
|
|
5
|
-
def setup
|
6
|
-
@word = Treat::Tests::EnglishWord
|
7
|
-
@number = Treat::Tests::Number
|
8
|
-
@verb = Treat::Tests::EnglishVerb
|
9
|
-
@noun = Treat::Tests::EnglishNoun
|
10
|
-
end
|
11
|
-
|
12
5
|
def test_lemmatizers
|
13
6
|
# Not implemented yet.
|
14
7
|
end
|
15
8
|
|
16
9
|
def test_stemmers
|
17
|
-
assert_equal 'run',
|
18
|
-
assert_equal 'run',
|
19
|
-
assert_equal 'run',
|
10
|
+
assert_equal 'run', 'running'.stem(:porter)
|
11
|
+
assert_equal 'run', 'running'.stem(:porter_c)
|
12
|
+
assert_equal 'run', 'running'.stem(:uea)
|
20
13
|
end
|
21
|
-
end
|
22
14
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
15
|
+
def test_conjugators
|
16
|
+
assert_equal 'run', 'running'.infinitive
|
17
|
+
assert_equal 'running', 'run'.present_participle
|
18
|
+
assert_equal 'run', 'runs'.plural_verb
|
19
|
+
end
|
28
20
|
|
29
|
-
|
30
|
-
|
31
|
-
|
21
|
+
def test_declensors
|
22
|
+
assert_equal 'inflections', 'inflection'.plural(:linguistics)
|
23
|
+
assert_equal 'inflections', 'inflection'.plural(:english)
|
24
|
+
assert_equal 'inflection', 'inflections'.singular(:english)
|
25
|
+
end
|
32
26
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
27
|
+
def test_ordinal_and_cardinal_words
|
28
|
+
assert_equal 'twenty', 20.cardinal_words
|
29
|
+
assert_equal 'twentieth', 20.ordinal_words
|
30
|
+
end
|
37
31
|
|
32
|
+
end
|
38
33
|
end
|
39
|
-
end
|
34
|
+
end
|
data/test/tc_lexicalizers.rb
CHANGED
@@ -2,35 +2,28 @@ module Treat
|
|
2
2
|
module Tests
|
3
3
|
class TestLexicalizers < Test::Unit::TestCase
|
4
4
|
|
5
|
-
def setup
|
6
|
-
@word = Treat::Tests::EnglishWord
|
7
|
-
@sentence = Treat::Tests::EnglishSentence.parse
|
8
|
-
end
|
9
|
-
|
10
5
|
def test_category
|
11
|
-
assert_equal :verb,
|
6
|
+
assert_equal :verb, 'visualize'.category(:from_tag, :tagger => :stanford)
|
7
|
+
assert_equal :noun, 'inflection'.category(:from_tag, :tagger => :brill)
|
8
|
+
assert_equal :adjective, 'sweet'.category(:from_tag, :tagger => :lingua)
|
12
9
|
end
|
13
10
|
|
14
11
|
def test_synsets
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
assert_nothing_raised { @word.hyponyms(:wordnet) }
|
20
|
-
assert_nothing_raised { @word.hypernyms(:wordnet) }
|
12
|
+
assert_equal 'mature', 'ripe'.synonyms(:wordnet)[0]
|
13
|
+
# assert_equal 'green', ' ripe'.antonyms(:wordnet)[0]
|
14
|
+
assert_equal 'beverage', 'coffee'.hypernyms(:wordnet)[0]
|
15
|
+
assert_equal 'gravy', 'juice'.hyponyms(:wordnet)[0]
|
21
16
|
end
|
22
17
|
|
23
18
|
def test_linkages
|
24
|
-
|
25
|
-
|
26
|
-
assert_nothing_raised { @sentence.linkages(:naive, :linkage => :object) }
|
27
|
-
assert_nothing_raised { @sentence.linkages(:naive, :linkage => :patient) }
|
19
|
+
sentence = 'Good is bad, but bad is not good'
|
20
|
+
# assert_equal sentence.parse(:enju).linkages
|
28
21
|
end
|
29
22
|
|
30
23
|
def test_taggers
|
31
|
-
|
32
|
-
|
33
|
-
|
24
|
+
assert_equal 'VBG', 'running'.tag(:stanford)
|
25
|
+
assert_equal 'VBG', 'running'.tag(:brill)
|
26
|
+
assert_equal 'VBG', 'running'.tag(:lingua)
|
34
27
|
end
|
35
28
|
|
36
29
|
end
|
data/test/tc_processors.rb
CHANGED
@@ -3,31 +3,45 @@ module Treat
|
|
3
3
|
class TestProcessors < Test::Unit::TestCase
|
4
4
|
|
5
5
|
def setup
|
6
|
-
@doc = Treat::Tests::
|
6
|
+
@doc = Treat::Tests::English::ShortDoc
|
7
7
|
end
|
8
8
|
|
9
9
|
def test_tokenizers
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
10
|
+
words = ['A', 'sentence', 'to', 'tokenize']
|
11
|
+
tokenize_map = lambda do |worker, o={}|
|
12
|
+
'A sentence to tokenize'.
|
13
|
+
tokenize(worker, o).words.map { |w| w.value }
|
14
|
+
end
|
15
|
+
assert_equal words, tokenize_map.call(:macintyre)
|
16
|
+
assert_equal words, tokenize_map.call(:multilingual)
|
17
|
+
assert_equal words, tokenize_map.call(:perl)
|
18
|
+
assert_equal words, tokenize_map.call(:punkt)
|
19
|
+
assert_equal words, tokenize_map.call(:stanford, :silence => true)
|
20
|
+
assert_equal words, tokenize_map.call(:tactful)
|
16
21
|
end
|
17
22
|
|
18
23
|
def test_segmenters
|
19
|
-
|
20
|
-
|
21
|
-
|
24
|
+
sentences = ['This is sentence 1.', 'This is sentence 2.']
|
25
|
+
segment_map = lambda do |worker,o={}|
|
26
|
+
'This is sentence 1. This is sentence 2.'.
|
27
|
+
segment(worker, o).sentences.map { |s| s.value }
|
28
|
+
end
|
29
|
+
assert_equal sentences, segment_map.call(:punkt)
|
30
|
+
assert_equal sentences, segment_map.call(:stanford, :silence => true)
|
31
|
+
assert_equal sentences, segment_map.call(:tactful)
|
22
32
|
end
|
23
33
|
|
24
34
|
def test_chunkers
|
25
|
-
|
35
|
+
title = 'This is a title!'
|
36
|
+
paragraph = 'This is sentence 1. This is a potential sentence inside a pargraph describing the wonders of the world.'
|
37
|
+
s = "This is a title!\nThis is sentence 1. This is a potential sentence inside a pargraph describing the wonders of the world.".chunk
|
38
|
+
assert_equal title, s.title.value
|
39
|
+
assert_equal paragraph, s.paragraph.value
|
26
40
|
end
|
27
41
|
|
28
42
|
def test_parsers
|
29
43
|
assert_nothing_raised { @doc.segment.parse(:enju) }
|
30
|
-
assert_nothing_raised { @doc.segment.parse(:stanford) }
|
44
|
+
assert_nothing_raised { @doc.segment.parse(:stanford, :silence => true) }
|
31
45
|
end
|
32
46
|
|
33
47
|
end
|