treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
data/lib/treat/kernel.rb
CHANGED
@@ -2,20 +2,22 @@
|
|
2
2
|
# easy access to utility functions used across
|
3
3
|
# the library.
|
4
4
|
module Kernel
|
5
|
+
|
6
|
+
# Require file utilities for creating and
|
7
|
+
# deleting temporary files.
|
5
8
|
require 'fileutils'
|
6
|
-
|
9
|
+
|
7
10
|
# A list of acronyms used in class names within
|
8
11
|
# the program. These do not CamelCase; they
|
9
12
|
# CAMELCase.
|
10
|
-
Acronyms = [
|
13
|
+
Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo]
|
14
|
+
|
11
15
|
# A cache to optimize camel casing.
|
12
16
|
@@cc_cache = {}
|
17
|
+
|
13
18
|
# A cache to optimize un camel casing.
|
14
19
|
@@ucc_cache = {}
|
15
|
-
|
16
|
-
def platform
|
17
|
-
RUBY_PLATFORM.split("-")[1]
|
18
|
-
end
|
20
|
+
|
19
21
|
# Runs a block of code without warnings.
|
20
22
|
def silence_warnings(&block)
|
21
23
|
warn_level = $VERBOSE
|
@@ -24,62 +26,85 @@ module Kernel
|
|
24
26
|
$VERBOSE = warn_level
|
25
27
|
result
|
26
28
|
end
|
29
|
+
|
27
30
|
# Runs a block of code while blocking stdout.
|
28
|
-
def silence_stdout(log =
|
31
|
+
def silence_stdout(log = NULL_DEVICE)
|
32
|
+
unless Treat.silence
|
33
|
+
yield; return
|
34
|
+
end
|
29
35
|
old = $stdout.dup
|
30
36
|
$stdout.reopen(File.new(log, 'w'))
|
31
37
|
yield
|
32
38
|
$stdout = old
|
33
39
|
end
|
40
|
+
|
34
41
|
# Create a temporary file which is deleted
|
35
42
|
# after execution of the block.
|
36
43
|
def create_temp_file(ext, value = nil, &block)
|
37
|
-
fname =
|
38
|
-
|
39
|
-
|
44
|
+
fname = Treat.tmp +
|
45
|
+
"#{Random.rand(10000000).to_s}.#{ext}"
|
46
|
+
File.open(fname, 'w') do |f|
|
47
|
+
f.write(value) if value
|
40
48
|
block.call(f.path)
|
41
49
|
end
|
42
50
|
ensure
|
43
51
|
File.delete(fname)
|
44
52
|
end
|
45
|
-
|
53
|
+
|
54
|
+
# Create a temporary directory, which is
|
55
|
+
# deleted after execution of the block.
|
46
56
|
def create_temp_dir(&block)
|
47
|
-
dname = "#{Treat.lib}/../tmp
|
57
|
+
dname = "#{Treat.lib}/../tmp/"+
|
58
|
+
"#{Random.rand(10000000).to_s}"
|
48
59
|
Dir.mkdir(dname)
|
49
60
|
block.call(dname)
|
50
61
|
ensure
|
51
62
|
FileUtils.rm_rf(dname)
|
52
63
|
end
|
64
|
+
|
53
65
|
# Convert un_camel_case to CamelCase.
|
54
66
|
def camel_case(o_phrase)
|
55
67
|
phrase = o_phrase.to_s.dup
|
56
68
|
return @@cc_cache[o_phrase] if @@cc_cache[o_phrase]
|
57
|
-
|
58
|
-
|
59
|
-
|
69
|
+
|
70
|
+
if Acronyms.include?(phrase)
|
71
|
+
phrase = phrase.upcase
|
72
|
+
else
|
73
|
+
phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
|
74
|
+
phrase.gsub!('_', '')
|
75
|
+
end
|
60
76
|
@@cc_cache[o_phrase] = phrase
|
61
|
-
phrase
|
62
77
|
end
|
78
|
+
|
63
79
|
alias :cc :camel_case
|
80
|
+
|
64
81
|
# Convert CamelCase to un_camel_case.
|
65
82
|
def un_camel_case(o_phrase)
|
66
83
|
phrase = o_phrase.to_s.dup
|
67
84
|
return @@ucc_cache[o_phrase] if @@ucc_cache[o_phrase]
|
68
|
-
|
69
|
-
|
70
|
-
|
85
|
+
if Acronyms.include?(phrase.downcase)
|
86
|
+
phrase = phrase.downcase
|
87
|
+
else
|
88
|
+
phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
|
89
|
+
phrase = phrase[1..-1] if phrase[0] == '_'
|
90
|
+
end
|
71
91
|
@@ucc_cache[o_phrase] = phrase
|
72
|
-
phrase
|
73
92
|
end
|
93
|
+
|
74
94
|
alias :ucc :un_camel_case
|
95
|
+
|
75
96
|
# Retrieve the Class from a Module::Class.
|
76
97
|
def class_name(n); n.to_s.split('::')[-1]; end
|
98
|
+
|
77
99
|
alias :cl :class_name
|
100
|
+
|
78
101
|
# Search the list to see if there are words similar to #name
|
79
|
-
# in the #list If yes, return a string saying "Did you mean
|
102
|
+
# in the #list If yes, return a string saying "Did you mean
|
80
103
|
# ... ?" with the names.
|
81
104
|
def did_you_mean?(list, name)
|
82
|
-
|
105
|
+
return '' # Fix
|
106
|
+
list = list.map { |e| e.to_s }
|
107
|
+
name = name.to_s
|
83
108
|
sugg = []
|
84
109
|
list.each do |element|
|
85
110
|
l = levenshtein(element,name)
|
@@ -91,22 +116,38 @@ module Kernel
|
|
91
116
|
if sugg.size == 1
|
92
117
|
msg += " Perhaps you meant '#{sugg[0]}' ?"
|
93
118
|
else
|
94
|
-
sugg_quote = sugg[0..-2].map
|
95
|
-
|
119
|
+
sugg_quote = sugg[0..-2].map do
|
120
|
+
|x| '\'' + x + '\''
|
121
|
+
end
|
122
|
+
msg += " Perhaps you meant " +
|
123
|
+
"#{sugg_quote.join(', ')}," +
|
96
124
|
" or '#{sugg[-1]}' ?"
|
97
125
|
end
|
98
126
|
end
|
99
127
|
msg
|
100
128
|
end
|
129
|
+
|
101
130
|
alias :dym? :did_you_mean?
|
131
|
+
|
102
132
|
# Return the name of the method that called the method
|
103
133
|
# that calls this method.
|
104
134
|
def caller_method(n = 3)
|
105
135
|
at = caller(n).first
|
106
136
|
/^(.+?):(\d+)(?::in `(.*)')?/ =~ at
|
107
|
-
Regexp.last_match[3].intern
|
137
|
+
Regexp.last_match[3].gsub('block in ', '').intern
|
108
138
|
end
|
139
|
+
|
109
140
|
alias :cm :caller_method
|
141
|
+
|
142
|
+
# Detect the platform we're running on.
|
143
|
+
def detect_platform
|
144
|
+
p = RUBY_PLATFORM.downcase
|
145
|
+
return :mac if p.include?("darwin")
|
146
|
+
return :windows if p.include?("mswin")
|
147
|
+
return :linux if p.include?("linux")
|
148
|
+
return :unknown
|
149
|
+
end
|
150
|
+
|
110
151
|
# Return the Levenshtein distance between two strings,
|
111
152
|
# taking into account the costs of insertion, deletion,
|
112
153
|
# and substitution. Stolen from:
|
@@ -116,14 +157,16 @@ module Kernel
|
|
116
157
|
return nil if first.nil? || other.nil?
|
117
158
|
dm = []
|
118
159
|
dm[0] = (0..first.length).collect { |i| i * ins}
|
119
|
-
fill = [0] * (first.length - 1)
|
160
|
+
fill = [0] * (first.length - 1).abs
|
120
161
|
for i in 1..other.length
|
121
162
|
dm[i] = [i * del, fill.flatten]
|
122
163
|
end
|
123
164
|
for i in 1..other.length
|
124
165
|
for j in 1..first.length
|
125
166
|
dm[i][j] = [
|
126
|
-
dm[i-1][j-1] +
|
167
|
+
dm[i-1][j-1] +
|
168
|
+
(first[i-1] ==
|
169
|
+
other[i-1] ? 0 : sub),
|
127
170
|
dm[i][j-1] + ins,
|
128
171
|
dm[i-1][j] + del
|
129
172
|
].min
|
@@ -131,4 +174,39 @@ module Kernel
|
|
131
174
|
end
|
132
175
|
dm[other.length][first.length]
|
133
176
|
end
|
177
|
+
|
178
|
+
if detect_platform == :windows
|
179
|
+
NULL_DEVICE = 'NUL'
|
180
|
+
else
|
181
|
+
NULL_DEVICE = '/dev/null'
|
182
|
+
end
|
183
|
+
|
184
|
+
def debug(msg)
|
185
|
+
puts msg if Treat.debug
|
186
|
+
end
|
187
|
+
|
188
|
+
def prompt(msg, valid_answers)
|
189
|
+
|
190
|
+
msg = msg
|
191
|
+
n = msg.include?("\n") ? ":\n" : ''
|
192
|
+
q = msg.include?("\n") ? '' : '?'
|
193
|
+
|
194
|
+
s = "\nPlease enter one of #{valid_answers.join(', ')}: "
|
195
|
+
puts "Do you want to #{n}#{msg}#{q} \n#{s}"
|
196
|
+
|
197
|
+
begin
|
198
|
+
answer = STDIN.gets.strip
|
199
|
+
unless valid_answers.include?(answer)
|
200
|
+
puts "Invalid input."
|
201
|
+
puts s
|
202
|
+
raise Treat::InvalidInputException
|
203
|
+
end
|
204
|
+
puts
|
205
|
+
answer
|
206
|
+
rescue Treat::InvalidInputException
|
207
|
+
retry
|
208
|
+
end
|
209
|
+
|
210
|
+
end
|
211
|
+
|
134
212
|
end
|
data/lib/treat/languages.rb
CHANGED
@@ -1,98 +1,132 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
raise "Must provide a non-nil language identifier to describe." if lang.nil?
|
15
|
-
lang = code(lang).to_s
|
16
|
-
if [:en, :eng, :english, :anglais].include?(desc_lang)
|
17
|
-
l = @@english_full.key(lang)
|
18
|
-
elsif [:fr, :fra, :french, :french].include?(desc_lang)
|
19
|
-
l = @@french_full.key(lang)
|
20
|
-
else
|
21
|
-
raise Treat::Exception,
|
22
|
-
"Unknown language to describe: #{desc_lang}."
|
23
|
-
end
|
24
|
-
not_found(lang) if l.nil?
|
25
|
-
l.intern
|
1
|
+
# This module provides linguistic resources
|
2
|
+
# for the Treat library, including information
|
3
|
+
# about language codes, the functions available
|
4
|
+
# for each language, and the different tags used
|
5
|
+
# to markup that language.
|
6
|
+
module Treat::Languages
|
7
|
+
|
8
|
+
def self.const_missing(const)
|
9
|
+
lang = const.to_s.downcase
|
10
|
+
f = File.join(File.dirname(__FILE__), "languages", lang)
|
11
|
+
unless File.readable?(f + '.rb')
|
12
|
+
raise Treat::Exception,
|
13
|
+
"Language #{lang} is not supported."
|
26
14
|
end
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
15
|
+
require f
|
16
|
+
const_get(const)
|
17
|
+
end
|
18
|
+
|
19
|
+
# Yield a lowercase symbol for each
|
20
|
+
# defined language.
|
21
|
+
def self.each
|
22
|
+
constants.each do |constant|
|
23
|
+
yield constant.to_s.downcase.intern
|
36
24
|
end
|
37
|
-
|
38
|
-
|
39
|
-
|
25
|
+
end
|
26
|
+
|
27
|
+
# Identifier constants for language codes.
|
28
|
+
ISO639_1 = 1
|
29
|
+
ISO639_2 = 2
|
30
|
+
|
31
|
+
# Describe a language code (ISO-639-1 or ISO-639-2)
|
32
|
+
# or its full text description in full French or English.
|
33
|
+
def self.describe(lang, desc_lang = :en)
|
34
|
+
raise "Must provide a non-nil language "+
|
35
|
+
"identifier to describe." if lang.nil?
|
36
|
+
lang = code(lang).to_s
|
37
|
+
if [:en, :eng, :english, :anglais].
|
38
|
+
include?(desc_lang)
|
39
|
+
l = @@english_full.key(lang)
|
40
|
+
elsif [:fr, :fra, :french, :french].
|
41
|
+
include?(desc_lang)
|
42
|
+
l = @@french_full.key(lang)
|
43
|
+
else
|
44
|
+
raise Treat::Exception,
|
45
|
+
"Unknown language to describe: #{desc_lang}."
|
40
46
|
end
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
47
|
+
not_found(lang) if l.nil?
|
48
|
+
l.intern
|
49
|
+
end
|
50
|
+
|
51
|
+
# Raise an error message when a language code
|
52
|
+
# or description is not found and suggest
|
53
|
+
# possible misspellings.
|
54
|
+
def self.not_found(lang)
|
55
|
+
msg = "Language '#{lang}' does not exist."
|
56
|
+
all = @@iso639_2.keys + @@iso639_1.keys +
|
57
|
+
@@english_full.keys + @@french_full.keys
|
58
|
+
msg += did_you_mean?(all, lang)
|
59
|
+
raise Treat::Exception, msg
|
60
|
+
end
|
61
|
+
|
62
|
+
# Return the class representing a language.
|
63
|
+
def self.get(lang)
|
64
|
+
lang = Treat::Languages.describe(lang).to_s
|
65
|
+
begin
|
66
|
+
const_get(lang.capitalize)
|
67
|
+
rescue
|
68
|
+
not_found(lang)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
# Find a language by ISO-639-1 or ISO-639-2 code
|
73
|
+
# or full name (in English or French) and return
|
74
|
+
# the ISO-639-1 or ISO-639-2 language code as a
|
75
|
+
# lowercase identifier.
|
76
|
+
def self.code(lang, rc = ISO639_2)
|
77
|
+
raise "Must provide a non-nil language "+
|
78
|
+
"identifier to describe." if lang.nil?
|
79
|
+
get_languages
|
80
|
+
lang = lang.to_s.downcase
|
81
|
+
if @@iso639_1.has_key?(lang)
|
82
|
+
return lang.intern if rc == ISO639_2
|
83
|
+
return @@iso639_1[lang].intern if rc == ISO639_1
|
84
|
+
elsif @@iso639_2.has_key?(lang)
|
85
|
+
return lang.intern if rc == ISO639_2
|
86
|
+
return @@iso639_2[lang].intern if rc == ISO639_1
|
87
|
+
elsif @@english_full.has_key?(lang)
|
88
|
+
return @@english_full[lang].intern if rc == ISO639_2
|
89
|
+
return @@iso639_2[@@english_full[lang]].intern if rc == ISO639_1
|
90
|
+
elsif @@french_full.has_key?(lang)
|
91
|
+
return @@french_full[lang].intern if rc == ISO639_2
|
92
|
+
return @@iso639_2[@@french_full[lang]].intern if rc == ISO639_1
|
93
|
+
else
|
94
|
+
not_found(lang)
|
64
95
|
end
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
96
|
+
|
97
|
+
end
|
98
|
+
|
99
|
+
# Whether the language list has been loaded or not.
|
100
|
+
@@loaded = false
|
101
|
+
|
102
|
+
# Get the languages from the dictionary.
|
103
|
+
def self.get_languages
|
104
|
+
return if @@loaded
|
105
|
+
@@iso639_1 = {}; @@iso639_2 = {};
|
106
|
+
@@english_full = {}; @@french_full = {}
|
107
|
+
languages = IO.readlines(File.join(
|
108
|
+
File.dirname(__FILE__), "languages", "list.txt"))
|
109
|
+
languages.each do |language|
|
110
|
+
iso639_2, iso639_1, english_desc, french_desc =
|
111
|
+
language.split(',')
|
112
|
+
@@iso639_1[iso639_1] = iso639_2
|
113
|
+
@@iso639_2[iso639_2] = iso639_1
|
114
|
+
unless english_desc.nil?
|
115
|
+
english_desc.strip.downcase.split('|').each do |l|
|
116
|
+
@@english_full[l.downcase.strip] = iso639_2
|
81
117
|
end
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
118
|
+
end
|
119
|
+
unless french_desc.nil?
|
120
|
+
french_desc.strip.downcase.split('|').each do |l|
|
121
|
+
@@french_full[l.downcase.strip] = iso639_2
|
86
122
|
end
|
87
123
|
end
|
88
|
-
@@loaded = true
|
89
124
|
end
|
90
|
-
|
91
|
-
WordCategories = [
|
92
|
-
:adjective, :adverb, :noun, :verb, :interjection,
|
93
|
-
:clitic, :coverb, :conjunction, :determiner, :particle,
|
94
|
-
:preposition, :pronoun, :number, :symbol, :punctuation,
|
95
|
-
:complementizer
|
96
|
-
]
|
125
|
+
@@loaded = true
|
97
126
|
end
|
127
|
+
|
128
|
+
# Get the language list.
|
129
|
+
get_languages
|
130
|
+
|
131
|
+
|
98
132
|
end
|
@@ -1,16 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
1
|
+
class Treat::Languages::Arabic
|
2
|
+
|
3
|
+
RequiredDependencies = []
|
4
|
+
OptionalDependencies = []
|
5
|
+
|
6
|
+
Extractors = {}
|
7
|
+
Inflectors = {}
|
8
|
+
Lexicalizers = {
|
9
|
+
:tag => [:stanford]
|
10
|
+
}
|
11
|
+
Processors = {
|
12
|
+
:parsers => [:stanford]
|
13
|
+
}
|
14
|
+
Retrievers = {}
|
15
|
+
|
16
16
|
end
|