treat 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
@@ -0,0 +1,127 @@
|
|
1
|
+
module Treat
  # Provides utility functions used across the library.
  module Utilities
    # Require file utilities.
    require 'fileutils'
    # Require temporary file support (used by create_temp_file).
    require 'tempfile'

    # Returns the platform we are running on, i.e. the second
    # dash-separated component of RUBY_PLATFORM (nil when
    # RUBY_PLATFORM contains no dash).
    def self.platform
      RUBY_PLATFORM.split("-")[1]
    end

    # Runs a block of code silently, i.e. without
    # expressing warnings even in verbose mode, and
    # returns the value of the block.
    #
    # The previous warning level is restored in an ensure
    # clause so that an exception raised by the block does
    # not leave warnings disabled for the rest of the program.
    #
    # NOTE(review): the original comment carried a garbled
    # rename reminder ("Rename to silence_streamsings");
    # presumably it meant silence_warnings - confirm.
    def self.silently(&block)
      warn_level = $VERBOSE
      $VERBOSE = nil
      block.call
    ensure
      $VERBOSE = warn_level
    end

    # Placeholder: currently does not silence anything. The
    # streams argument is accepted but ignored and the block
    # is simply executed.
    def self.silence_streams(*streams)
      yield
    end

    # Create a temporary file in Treat.tmp with the given
    # extension, optionally write value into it, and pass
    # its path to the block. The file is closed and deleted
    # after execution of the block (even if the block
    # raises). Returns the value of the block.
    def self.create_temp_file(ext, value = nil, &block)
      tmp = Tempfile.new(['', ".#{ext}"], Treat.tmp)
      begin
        if value
          tmp.puts(value)
          # Flush so that code opening the path inside the
          # block sees the content rather than an empty file.
          tmp.flush
        end
        block.call(tmp.path)
      ensure
        tmp.close!
      end
    end

    # A list of acronyms used in class names within
    # the program. These do not CamelCase; they
    # CAMELCASE.
    @@acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF', 'GOCR', 'Treat'].join('|')
    @@cc_cache = {}

    # Convert un_camel_case to CamelCase. Results are
    # memoized per argument in @@cc_cache.
    #
    # NOTE(review): in the acronym pattern the trailing
    # [^a-z]+ binds only to the last alternative ('treat'),
    # because alternation has the lowest precedence. The
    # other acronyms are upcased wherever they occur. This
    # is kept as is; confirm whether it is intended.
    def self.camel_case(o_phrase)
      cached = @@cc_cache[o_phrase]
      return cached if cached
      phrase = o_phrase.to_s.dup
      phrase.gsub!(/#{@@acronyms.downcase}[^a-z]+/) { |a| a.upcase }
      phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
      phrase.gsub!('_', '')
      @@cc_cache[o_phrase] = phrase
      phrase
    end

    @@ucc_cache = {}

    # Convert CamelCase to un_camel_case. Results are
    # memoized per argument in @@ucc_cache.
    def self.un_camel_case(o_phrase)
      cached = @@ucc_cache[o_phrase]
      return cached if cached
      phrase = o_phrase.to_s.dup
      # Collapse acronyms to a single capital (XML -> Xml) so
      # that they yield one underscore-separated word below.
      phrase.gsub!(/#{@@acronyms}/) { |a| a.downcase.capitalize }
      phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
      phrase = phrase[1..-1] if phrase[0] == '_'
      @@ucc_cache[o_phrase] = phrase
      phrase
    end

    # Return the Levenshtein distance between two strings,
    # taking into account the costs of insertion, deletion,
    # and substitution. Returns nil if either argument is
    # nil; both arguments are converted with to_s, so
    # symbols are accepted as well.
    #
    # Adapted from:
    # http://ruby-snippets.heroku.com/string/levenshtein-distance
    def self.levenshtein(first, other, ins=1, del=1, sub=1)
      return nil if first.nil? || other.nil?
      a = first.to_s
      b = other.to_s
      # Only the previous row of the distance matrix is needed.
      prev = (0..a.length).map { |j| j * ins }
      (1..b.length).each do |i|
        row = [i * del]
        (1..a.length).each do |j|
          # Compare the j-th character of the first string with
          # the i-th character of the other one.
          cost = a[j - 1] == b[i - 1] ? 0 : sub
          row[j] = [
            prev[j - 1] + cost,
            row[j - 1] + ins,
            prev[j] + del
          ].min
        end
        prev = row
      end
      prev[a.length]
    end

    # Search the list to see if there are words
    # similar to name (at a Levenshtein distance of
    # exactly 1). If yes, return a string saying
    # " Perhaps you meant ... ?"; otherwise return
    # an empty string. Nil elements are skipped.
    def self.did_you_mean?(list, name)
      sugg = list.select do |element|
        distance = levenshtein(element, name)
        distance && distance > 0 && distance < 2
      end
      return '' if sugg.empty?
      return " Perhaps you meant '#{sugg[0]}' ?" if sugg.size == 1
      sugg_quote = sugg[0..-2].map { |x| "'#{x}'" }
      " Perhaps you meant #{sugg_quote.join(', ')}," +
      " or '#{sugg[-1]}' ?"
    end

    # Return, as a symbol, the label of the stack frame found
    # n levels above this method (as reported by Kernel#caller).
    # Returns :"" when there is no such frame or when the
    # frame carries no method label.
    def self.caller_method(n = 3)
      at = Array(caller(n)).first.to_s
      # Accept both "in `name'" and "in 'name'" label styles.
      match = /^(.+?):(\d+)(?::in [`'](.*)')?/.match(at)
      :"#{match && match[3]}"
    end
  end
end
|
110
|
+
|
111
|
+
# Make undefining constants publicly
# available on any class.
Object.module_eval do
  # Removes the constant named +const+ from Object (the
  # top-level namespace), regardless of the class the
  # method is called on, and returns the removed value.
  # Unlike the previous version, it no longer prints the
  # constant name to standard output.
  def self.const_unset(const)
    Object.instance_eval { remove_const(const) }
  end
end
|
116
|
+
|
117
|
+
# Make the most common utility functions available in the global scope.
# Every helper below forwards its arguments (and block, if any) to the
# Treat::Utilities method named in its body.

# Global shortcut for Treat::Utilities.create_temp_file.
def create_temp_file(ext, value = nil, &block)
  Treat::Utilities.create_temp_file(ext, value, &block)
end

# Global shortcut for Treat::Utilities.silence_streams.
def silence_streams(*streams, &block)
  Treat::Utilities.silence_streams(*streams, &block)
end

# Global shortcut for Treat::Utilities.silently.
def silently(&block)
  Treat::Utilities.silently(&block)
end

# Global shortcut for Treat::Utilities.camel_case.
def cc(w)
  Treat::Utilities.camel_case(w)
end

# Global shortcut for Treat::Utilities.un_camel_case.
def ucc(w)
  Treat::Utilities.un_camel_case(w)
end

# Returns the last '::'-separated component of n.to_s,
# e.g. the bare class name of a namespaced class.
def cl(n)
  n.to_s.split('::').last
end

# Global shortcut for Treat::Utilities.did_you_mean?.
def did_you_mean?(l, e)
  Treat::Utilities.did_you_mean?(l, e)
end

# Global shortcut for Treat::Utilities.caller_method.
def caller_method(n = 3)
  Treat::Utilities.caller_method(n)
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Treat
  # Make a tree visitable by implementing the method #accept.
  module Visitable
    # Accept a visitor implemented by klass, which is
    # found in the supplied group, and call method on it.
    #
    # - Raises if the group does not target this entity's class.
    # - For groups whose type is not :transformer, returns the
    #   result of klass.send(method, self, options).
    # - For :transformer groups, an entity with children passes
    #   the visit on to each child whose class the group targets;
    #   an entity without children is visited directly. Either
    #   way the entity itself is returned.
    def accept(group, klass, method, options)
      unless group.has_target?(self.class)
        raise "This type of visitor cannot visit a #{self.class}."
      end

      unless group.type == :transformer
        return klass.send(method, self, options)
      end

      if has_children?
        @children.each do |entity|
          next unless group.has_target?(entity.class)
          entity.accept(group, klass, method, options)
        end
      else
        klass.send(method, self, options)
      end
      self
    end
  end
end
|
data/test/profile.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
module Treat
  module Tests
    # Exercises the format, encoding and language detectors
    # on the shared long English document fixture.
    class TestDetectors < Test::Unit::TestCase

      def setup
        @doc = Treat::Tests::EnglishLongDoc
      end

      def test_format_detectors
        assert_equal :txt, @doc.format
      end

      def test_encoding_detectors
        assert_equal :utf_8, @doc.encoding(:native)
        assert_equal :utf_8, @doc.encoding(:r_chardet19)
      end

      def test_language_detectors
        assert_equal Treat.default_language, @doc.language
        begin
          Treat.detect_language = true
          assert_equal :eng, @doc.language
        ensure
          # Always switch detection back off, even when the
          # assertion fails, so that the global setting does
          # not leak into the other test cases.
          Treat.detect_language = false
        end
      end
    end

  end
end
|
data/test/tc_entity.rb
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
module Treat
  module Tests
    # Tests the generic entity behaviour on a small hand-built
    # tree for the sentence "The lazy fox is running."
    class TestEntity < Test::Unit::TestCase
      # Builds the tree: text > sentence > [NP, VP, '.'],
      # NP > [The, ADJP > lazy, fox], VP > [is, running].
      def setup
        @text = Treat::Entities::Text.new
        @sentence = Treat::Entities::Sentence.new

        @noun_phrase = tagged_phrase('NP')
        @verb_phrase = tagged_phrase('VP')
        @adj_phrase = tagged_phrase('ADJP')

        @det = annotated_word('The', :determiner, 'DT')
        @adj = annotated_word('lazy', :adjective, 'JJ')
        @noun = annotated_word('fox', :noun, 'NN')
        @aux = annotated_word('is', :verb, 'VBZ')
        @verb = annotated_word('running', :verb, 'VBG')
        @dot = Treat::Entities::Punctuation.new('.')

        @text << @sentence << [@noun_phrase, @verb_phrase, @dot]
        @noun_phrase << [@det, @adj_phrase, @noun]
        @adj_phrase << @adj
        @verb_phrase << [@aux, @verb]
      end

      # Not written yet; intentionally empty.
      def test_respond_to_missing
      end

      def test_registrable
        assert_equal @text.token_registry, @verb.token_registry
        assert_equal @noun, @text.token_registry[:id][@noun.id]
        assert_equal [@noun], @text.token_registry[:value][@noun.value]
      end

      def test_delegatable_visitable
        assert_raise(Treat::Exception) { @text.encoding(:nonexistent) }
        assert_nothing_raised { @text.format }
      end

      def test_type
        assert_equal :text, @text.type
      end

      def test_printers
        assert_nothing_raised do
          @text.to_s
          @text.to_string
          @text.short_value
          @text.inspect
        end
      end

      def test_magic_methods
        assert_equal @sentence, @text.sentence
        assert_equal [@sentence], @text.sentences
        assert_equal 1, @text.sentence_count

        assert_equal [@det], @text.words_with_value('The')
        assert_equal [@verb], @text.words_with_tag('VBG')

        assert_equal @noun, @text.noun
        assert_equal [@aux, @verb], @text.verbs
        assert_equal 6, @text.token_count

        @text.each_sentence { |s| assert_equal @sentence, s }
        @text.each_noun { |n| assert_equal @noun, n }
        @text.each_with_value('The') { |x| assert_equal @det, x }

        assert_equal @sentence, @noun.parent_sentence
      end

      def test_features
        @verb.set :test, :test
        assert_equal :test, @verb.test
        assert_raise(Treat::Exception) { @verb.nonexistent }
      end

      private

      # Creates an empty phrase and sets its :tag feature.
      def tagged_phrase(tag)
        phrase = Treat::Entities::Phrase.new
        phrase.set :tag, tag
        phrase
      end

      # Creates a word with the given value and sets its
      # :cat and :tag features, in that order.
      def annotated_word(value, cat, tag)
        word = Treat::Entities::Word.new(value)
        word.set :cat, cat
        word.set :tag, tag
        word
      end

    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module Treat
  module Tests
    # Smoke tests for the extractors: most cases only check
    # that running an extractor on a fixture does not raise.
    class TestExtractors < Test::Unit::TestCase

      def setup
        @time = Treat::Tests::EnglishTime
        @date = Treat::Tests::EnglishDate
        @doc = Treat::Tests::EnglishLongDoc
        @word = Treat::Tests::EnglishWord
      end

      def test_time
        [:chronic, :native, :nickel].each do |extractor|
          assert_nothing_raised { @date.time(extractor) }
        end
      end

      def test_topic_words
        assert_nothing_raised { @doc.topic_words(:lda) }
      end

      # Both named entity extractors are currently disabled.
      def test_named_entity
        # assert_nothing_raised { @doc.named_entity(:stanford) }
        # assert_nothing_raised { @doc.named_entity(:abner) }
      end

      def test_key_sentences
        topics = @doc.topic_words(:lda)
        assert_nothing_raised { @doc.key_sentences(:topics_frequency, topics) }
      end

      def test_topics
        assert_nothing_raised { @doc.topics(:reuters) }
      end

      def test_statistics
        # The document is chunked, segmented and tokenized
        # before the statistics extractors are run.
        @doc.chunk.segment(:tactful).tokenize

        assert_nothing_raised { @doc.statistics(:frequency_of, value: 'the') }
        assert_nothing_raised { @word.statistics(:frequency) }
        # assert_nothing_raised { @doc.statistics(:position_in) }
        # assert_nothing_raised { @doc.statistics(:transition_matrix) }
        # assert_nothing_raised { @doc.statistics(:transition_probability) }
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Treat
  module Tests
    # Tests for the formatters: serializers/unserializers,
    # visualizers and cleaners.
    class TestFormatters < Test::Unit::TestCase

      def setup
        @doc = Treat::Tests::EnglishShortDoc
        @html_doc = Treat::Tests::EnglishHtmlDoc
        @sentence = Treat::Tests::EnglishSentence
      end

      def test_readers
        # How should we test this?
      end

      # Round trip: serialize the document to a temporary file,
      # load that file back as a document, serialize it again
      # and compare the length of both serializations.
      def test_serializers_and_unserializers
        { 'yml' => :yaml, 'xml' => :xml }.each do |extension, serializer|
          create_temp_file(extension) do |tmp|
            @doc.serialize(serializer).save(tmp)
            doc = Treat::Entities::Document(tmp)
            assert_equal File.read(tmp).length,
                         doc.serialize(serializer).length
          end
        end
      end

      def test_visualizers
        assert_nothing_raised { @doc.visualize(:tree) }
        # assert_nothing_raised { @doc.visualize(:html) }
        [:dot, :inspect, :short_value].each do |visualizer|
          assert_nothing_raised { @doc.visualize(visualizer) }
        end
        assert_nothing_raised { @sentence.visualize(:standoff) }
      end

      def test_cleaners
        assert_nothing_raised { @html_doc.clean(:html) }
      end

    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Treat
  module Tests
    # Tests for the inflectors: stemmers, conjugators,
    # declensors and ordinal/cardinal word generators.
    #
    # A stray `end` after test_stemmers used to close the
    # class early, which left the conjugator, declensor and
    # ordinal/cardinal tests defined on the Tests module,
    # where the test runner never picked them up. They are
    # now inside the test case again.
    class TestInflectors < Test::Unit::TestCase

      def setup
        @word = Treat::Tests::EnglishWord
        @number = Treat::Tests::Number
        @verb = Treat::Tests::EnglishVerb
        @noun = Treat::Tests::EnglishNoun
      end

      def test_lemmatizers
        # Not implemented yet.
      end

      def test_stemmers
        assert_equal 'run', @word.stem(:porter)
        assert_equal 'run', @word.stem(:porter_c)
        assert_equal 'run', @word.stem(:uea)
      end

      def test_conjugators
        assert_equal 'running', @verb.present_participle
        assert_equal 'run', @verb.infinitive
        assert_equal 'run', @verb.plural
      end

      def test_declensors
        assert_equal 'geese', @noun.plural
      end

      def test_ordinal_and_cardinal_words
        assert_equal 'twenty', @number.cardinal_words
        assert_equal 'twentieth', @number.ordinal_words
      end

    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Treat
  module Tests
    # Tests for the lexicalizers: category, synsets,
    # linkages and taggers.
    class TestLexicalizers < Test::Unit::TestCase

      def setup
        @word = Treat::Tests::EnglishWord
        @sentence = Treat::Tests::EnglishSentence.parse
      end

      def test_category
        assert_equal :verb, @word.category(:from_tag)
      end

      def test_synsets
        # assert_nothing_raised { @word.synsets(:rita_wn) }
        assert_nothing_raised { @word.synsets(:wordnet) }
        assert_nothing_raised { @word.synonyms(:wordnet) }
        assert_nothing_raised { @word.antonyms(:wordnet) }
        assert_nothing_raised { @word.hyponyms(:wordnet) }
        assert_nothing_raised { @word.hypernyms(:wordnet) }
      end

      def test_linkages
        [:main_verb, :subject, :object, :patient].each do |linkage|
          assert_nothing_raised { @sentence.linkages(:naive, :linkage => linkage) }
        end
      end

      def test_taggers
        [:brill, :lingua, :stanford].each do |tagger|
          assert_nothing_raised { @word.tag(tagger) }
        end
      end

    end

  end
end
|