treat 0.2.5 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
data/lib/treat/kernel.rb CHANGED
@@ -2,20 +2,22 @@
2
2
  # easy access to utility functions used across
3
3
  # the library.
4
4
  module Kernel
5
+
6
+ # Require file utilities for creating and
7
+ # deleting temporary files.
5
8
  require 'fileutils'
6
- require 'tempfile'
9
+
7
10
  # A list of acronyms used in class names within
8
11
  # the program. These do not CamelCase; they
9
12
  # CAMELCase.
10
- Acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF'].join('|')
13
+ Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo]
14
+
11
15
  # A cache to optimize camel casing.
12
16
  @@cc_cache = {}
17
+
13
18
  # A cache to optimize un camel casing.
14
19
  @@ucc_cache = {}
15
- # Returns the platform we are running on.
16
- def platform
17
- RUBY_PLATFORM.split("-")[1]
18
- end
20
+
19
21
  # Runs a block of code without warnings.
20
22
  def silence_warnings(&block)
21
23
  warn_level = $VERBOSE
@@ -24,62 +26,85 @@ module Kernel
24
26
  $VERBOSE = warn_level
25
27
  result
26
28
  end
29
+
27
30
  # Runs a block of code while blocking stdout.
28
- def silence_stdout(log = '/dev/null')
31
+ def silence_stdout(log = NULL_DEVICE)
32
+ unless Treat.silence
33
+ yield; return
34
+ end
29
35
  old = $stdout.dup
30
36
  $stdout.reopen(File.new(log, 'w'))
31
37
  yield
32
38
  $stdout = old
33
39
  end
40
+
34
41
  # Create a temporary file which is deleted
35
42
  # after execution of the block.
36
43
  def create_temp_file(ext, value = nil, &block)
37
- fname = "#{Treat.lib}/../tmp/#{Random.rand(10000000).to_s}.#{ext}"
38
- File.open(fname, 'w') do |f|
39
- f.write(value) if value
44
+ fname = Treat.tmp +
45
+ "#{Random.rand(10000000).to_s}.#{ext}"
46
+ File.open(fname, 'w') do |f|
47
+ f.write(value) if value
40
48
  block.call(f.path)
41
49
  end
42
50
  ensure
43
51
  File.delete(fname)
44
52
  end
45
- # Create a temporary directory.
53
+
54
+ # Create a temporary directory, which is
55
+ # deleted after execution of the block.
46
56
  def create_temp_dir(&block)
47
- dname = "#{Treat.lib}/../tmp/#{Random.rand(10000000).to_s}"
57
+ dname = "#{Treat.lib}/../tmp/"+
58
+ "#{Random.rand(10000000).to_s}"
48
59
  Dir.mkdir(dname)
49
60
  block.call(dname)
50
61
  ensure
51
62
  FileUtils.rm_rf(dname)
52
63
  end
64
+
53
65
  # Convert un_camel_case to CamelCase.
54
66
  def camel_case(o_phrase)
55
67
  phrase = o_phrase.to_s.dup
56
68
  return @@cc_cache[o_phrase] if @@cc_cache[o_phrase]
57
- phrase.gsub!(/#{Acronyms.downcase}[^a-z]*/) { |a| a.upcase }
58
- phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
59
- phrase.gsub!('_', '')
69
+
70
+ if Acronyms.include?(phrase)
71
+ phrase = phrase.upcase
72
+ else
73
+ phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
74
+ phrase.gsub!('_', '')
75
+ end
60
76
  @@cc_cache[o_phrase] = phrase
61
- phrase
62
77
  end
78
+
63
79
  alias :cc :camel_case
80
+
64
81
  # Convert CamelCase to un_camel_case.
65
82
  def un_camel_case(o_phrase)
66
83
  phrase = o_phrase.to_s.dup
67
84
  return @@ucc_cache[o_phrase] if @@ucc_cache[o_phrase]
68
- phrase.gsub!(/#{Acronyms}/) { |a| a.downcase.capitalize }
69
- phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
70
- phrase = phrase[1..-1] if phrase[0] == '_'
85
+ if Acronyms.include?(phrase.downcase)
86
+ phrase = phrase.downcase
87
+ else
88
+ phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
89
+ phrase = phrase[1..-1] if phrase[0] == '_'
90
+ end
71
91
  @@ucc_cache[o_phrase] = phrase
72
- phrase
73
92
  end
93
+
74
94
  alias :ucc :un_camel_case
95
+
75
96
  # Retrieve the Class from a Module::Class.
76
97
  def class_name(n); n.to_s.split('::')[-1]; end
98
+
77
99
  alias :cl :class_name
100
+
78
101
  # Search the list to see if there are words similar to #name
79
- # in the #list. If yes, return a string saying "Did you mean
102
+ # in the #list. If yes, return a string saying "Did you mean
80
103
  # ... ?" with the names.
81
104
  def did_you_mean?(list, name)
82
- msg = ''
105
+ return '' # Fix
106
+ list = list.map { |e| e.to_s }
107
+ name = name.to_s
83
108
  sugg = []
84
109
  list.each do |element|
85
110
  l = levenshtein(element,name)
@@ -91,22 +116,38 @@ module Kernel
91
116
  if sugg.size == 1
92
117
  msg += " Perhaps you meant '#{sugg[0]}' ?"
93
118
  else
94
- sugg_quote = sugg[0..-2].map {|x| '\'' + x + '\''}
95
- msg += " Perhaps you meant #{sugg_quote.join(', ')}," +
119
+ sugg_quote = sugg[0..-2].map do
120
+ |x| '\'' + x + '\''
121
+ end
122
+ msg += " Perhaps you meant " +
123
+ "#{sugg_quote.join(', ')}," +
96
124
  " or '#{sugg[-1]}' ?"
97
125
  end
98
126
  end
99
127
  msg
100
128
  end
129
+
101
130
  alias :dym? :did_you_mean?
131
+
102
132
  # Return the name of the method that called the method
103
133
  # that calls this method.
104
134
  def caller_method(n = 3)
105
135
  at = caller(n).first
106
136
  /^(.+?):(\d+)(?::in `(.*)')?/ =~ at
107
- Regexp.last_match[3].intern
137
+ Regexp.last_match[3].gsub('block in ', '').intern
108
138
  end
139
+
109
140
  alias :cm :caller_method
141
+
142
+ # Detect the platform we're running on.
143
+ def detect_platform
144
+ p = RUBY_PLATFORM.downcase
145
+ return :mac if p.include?("darwin")
146
+ return :windows if p.include?("mswin")
147
+ return :linux if p.include?("linux")
148
+ return :unknown
149
+ end
150
+
110
151
  # Return the Levenshtein distance between two strings,
111
152
  # taking into account the costs of insertion, deletion,
112
153
  # and substitution. Stolen from:
@@ -116,14 +157,16 @@ module Kernel
116
157
  return nil if first.nil? || other.nil?
117
158
  dm = []
118
159
  dm[0] = (0..first.length).collect { |i| i * ins}
119
- fill = [0] * (first.length - 1)
160
+ fill = [0] * (first.length - 1).abs
120
161
  for i in 1..other.length
121
162
  dm[i] = [i * del, fill.flatten]
122
163
  end
123
164
  for i in 1..other.length
124
165
  for j in 1..first.length
125
166
  dm[i][j] = [
126
- dm[i-1][j-1] + (first[i-1] == other[i-1] ? 0 : sub),
167
+ dm[i-1][j-1] +
168
+ (first[i-1] ==
169
+ other[i-1] ? 0 : sub),
127
170
  dm[i][j-1] + ins,
128
171
  dm[i-1][j] + del
129
172
  ].min
@@ -131,4 +174,39 @@ module Kernel
131
174
  end
132
175
  dm[other.length][first.length]
133
176
  end
177
+
178
+ if detect_platform == :windows
179
+ NULL_DEVICE = 'NUL'
180
+ else
181
+ NULL_DEVICE = '/dev/null'
182
+ end
183
+
184
+ def debug(msg)
185
+ puts msg if Treat.debug
186
+ end
187
+
188
+ def prompt(msg, valid_answers)
189
+
190
+ msg = msg
191
+ n = msg.include?("\n") ? ":\n" : ''
192
+ q = msg.include?("\n") ? '' : '?'
193
+
194
+ s = "\nPlease enter one of #{valid_answers.join(', ')}: "
195
+ puts "Do you want to #{n}#{msg}#{q} \n#{s}"
196
+
197
+ begin
198
+ answer = STDIN.gets.strip
199
+ unless valid_answers.include?(answer)
200
+ puts "Invalid input."
201
+ puts s
202
+ raise Treat::InvalidInputException
203
+ end
204
+ puts
205
+ answer
206
+ rescue Treat::InvalidInputException
207
+ retry
208
+ end
209
+
210
+ end
211
+
134
212
  end
@@ -1,98 +1,132 @@
1
- module Treat
2
- # This module provides linguistic resources
3
- # for the Treat library, including information
4
- # about language codes, the functions available
5
- # for each language, and the different tags used
6
- # to markup that language.
7
- module Languages
8
- Dir["#{Treat.lib}/treat/languages/*.rb"].each { |file| require file }
9
- ISO639_1 = 1
10
- ISO639_2 = 2
11
- # Describe a language code (ISO-639-1 or ISO-639-2)
12
- # or its full text description in full French or English.
13
- def self.describe(lang, desc_lang = :en)
14
- raise "Must provide a non-nil language identifier to describe." if lang.nil?
15
- lang = code(lang).to_s
16
- if [:en, :eng, :english, :anglais].include?(desc_lang)
17
- l = @@english_full.key(lang)
18
- elsif [:fr, :fra, :french, :french].include?(desc_lang)
19
- l = @@french_full.key(lang)
20
- else
21
- raise Treat::Exception,
22
- "Unknown language to describe: #{desc_lang}."
23
- end
24
- not_found(lang) if l.nil?
25
- l.intern
1
+ # This module provides linguistic resources
2
+ # for the Treat library, including information
3
+ # about language codes, the functions available
4
+ # for each language, and the different tags used
5
+ # to markup that language.
6
+ module Treat::Languages
7
+
8
+ def self.const_missing(const)
9
+ lang = const.to_s.downcase
10
+ f = File.join(File.dirname(__FILE__), "languages", lang)
11
+ unless File.readable?(f + '.rb')
12
+ raise Treat::Exception,
13
+ "Language #{lang} is not supported."
26
14
  end
27
- # Raise an error message when a language code
28
- # or description is not found and suggest
29
- # possible misspellings.
30
- def self.not_found(lang)
31
- msg = "Language '#{lang}' does not exist."
32
- all = @@iso639_2.keys + @@iso639_1.keys +
33
- @@english_full.keys + @@french_full.keys
34
- msg += did_you_mean?(all, lang)
35
- raise Treat::Exception, msg
15
+ require f
16
+ const_get(const)
17
+ end
18
+
19
+ # Yield a lowercase symbol for each
20
+ # defined language.
21
+ def self.each
22
+ constants.each do |constant|
23
+ yield constant.to_s.downcase.intern
36
24
  end
37
- # Return the class representing a language.
38
- def self.get(lang)
39
- const_get(Treat::Languages.describe(lang).to_s.capitalize)
25
+ end
26
+
27
+ # Identifier constants for language codes.
28
+ ISO639_1 = 1
29
+ ISO639_2 = 2
30
+
31
+ # Describe a language code (ISO-639-1 or ISO-639-2)
32
+ # or its full text description in full French or English.
33
+ def self.describe(lang, desc_lang = :en)
34
+ raise "Must provide a non-nil language "+
35
+ "identifier to describe." if lang.nil?
36
+ lang = code(lang).to_s
37
+ if [:en, :eng, :english, :anglais].
38
+ include?(desc_lang)
39
+ l = @@english_full.key(lang)
40
+ elsif [:fr, :fra, :french, :french].
41
+ include?(desc_lang)
42
+ l = @@french_full.key(lang)
43
+ else
44
+ raise Treat::Exception,
45
+ "Unknown language to describe: #{desc_lang}."
40
46
  end
41
- # Find a language by ISO-639-1 or ISO-639-2 code
42
- # or full name (in English or French) and return
43
- # the ISO-639-1 or ISO-639-2 language code as a
44
- # lowercase identifier.
45
- def self.code(lang, rc = ISO639_2)
46
- raise "Must provide a non-nil language identifier to describe." if lang.nil?
47
- get_languages
48
- lang = lang.to_s.downcase
49
- if @@iso639_1.has_key?(lang)
50
- return lang.intern if rc == ISO639_1
51
- return @@iso639_1[lang].intern if rc == ISO639_2
52
- elsif @@iso639_2.has_key?(lang)
53
- return lang.intern if rc == ISO639_2
54
- return @@iso639_2[lang].intern if rc == ISO639_1
55
- elsif @@english_full.has_key?(lang)
56
- return @@english_full[lang].intern if rc == ISO639_2
57
- return @@iso639_2[@@english_full[lang]].intern if rc == ISO639_1
58
- elsif @@french_full.has_key?(lang)
59
- return @@french_full[lang].intern if rc == ISO639_2
60
- return @@iso639_1[@@french_full[lang]].intern if rc == ISO639_2
61
- else
62
- not_found(lang)
63
- end
47
+ not_found(lang) if l.nil?
48
+ l.intern
49
+ end
50
+
51
+ # Raise an error message when a language code
52
+ # or description is not found and suggest
53
+ # possible misspellings.
54
+ def self.not_found(lang)
55
+ msg = "Language '#{lang}' does not exist."
56
+ all = @@iso639_2.keys + @@iso639_1.keys +
57
+ @@english_full.keys + @@french_full.keys
58
+ msg += did_you_mean?(all, lang)
59
+ raise Treat::Exception, msg
60
+ end
61
+
62
+ # Return the class representing a language.
63
+ def self.get(lang)
64
+ lang = Treat::Languages.describe(lang).to_s
65
+ begin
66
+ const_get(lang.capitalize)
67
+ rescue
68
+ not_found(lang)
69
+ end
70
+ end
71
+
72
+ # Find a language by ISO-639-1 or ISO-639-2 code
73
+ # or full name (in English or French) and return
74
+ # the ISO-639-1 or ISO-639-2 language code as a
75
+ # lowercase identifier.
76
+ def self.code(lang, rc = ISO639_2)
77
+ raise "Must provide a non-nil language "+
78
+ "identifier to describe." if lang.nil?
79
+ get_languages
80
+ lang = lang.to_s.downcase
81
+ if @@iso639_1.has_key?(lang)
82
+ return lang.intern if rc == ISO639_2
83
+ return @@iso639_1[lang].intern if rc == ISO639_1
84
+ elsif @@iso639_2.has_key?(lang)
85
+ return lang.intern if rc == ISO639_2
86
+ return @@iso639_2[lang].intern if rc == ISO639_1
87
+ elsif @@english_full.has_key?(lang)
88
+ return @@english_full[lang].intern if rc == ISO639_2
89
+ return @@iso639_2[@@english_full[lang]].intern if rc == ISO639_1
90
+ elsif @@french_full.has_key?(lang)
91
+ return @@french_full[lang].intern if rc == ISO639_2
92
+ return @@iso639_2[@@french_full[lang]].intern if rc == ISO639_1
93
+ else
94
+ not_found(lang)
64
95
  end
65
- @@loaded = false
66
- # Get the languages from the dictionary.
67
- def self.get_languages
68
- return if @@loaded
69
- @@iso639_1 = {}; @@iso639_2 = {};
70
- @@english_full = {}; @@french_full = {}
71
- languages = IO.readlines(Treat.lib + '/treat/languages/list.txt')
72
- languages.each do |language|
73
- iso639_2, iso639_1, english_desc, french_desc =
74
- language.split(',')
75
- @@iso639_1[iso639_1] = iso639_2
76
- @@iso639_2[iso639_2] = iso639_1
77
- unless english_desc.nil?
78
- english_desc.strip.downcase.split('|').each do |l|
79
- @@english_full[l.downcase.strip] = iso639_2
80
- end
96
+
97
+ end
98
+
99
+ # Whether the language list has been loaded or not.
100
+ @@loaded = false
101
+
102
+ # Get the languages from the dictionary.
103
+ def self.get_languages
104
+ return if @@loaded
105
+ @@iso639_1 = {}; @@iso639_2 = {};
106
+ @@english_full = {}; @@french_full = {}
107
+ languages = IO.readlines(File.join(
108
+ File.dirname(__FILE__), "languages", "list.txt"))
109
+ languages.each do |language|
110
+ iso639_2, iso639_1, english_desc, french_desc =
111
+ language.split(',')
112
+ @@iso639_1[iso639_1] = iso639_2
113
+ @@iso639_2[iso639_2] = iso639_1
114
+ unless english_desc.nil?
115
+ english_desc.strip.downcase.split('|').each do |l|
116
+ @@english_full[l.downcase.strip] = iso639_2
81
117
  end
82
- unless french_desc.nil?
83
- french_desc.strip.downcase.split('|').each do |l|
84
- @@french_full[l.downcase.strip] = iso639_2
85
- end
118
+ end
119
+ unless french_desc.nil?
120
+ french_desc.strip.downcase.split('|').each do |l|
121
+ @@french_full[l.downcase.strip] = iso639_2
86
122
  end
87
123
  end
88
- @@loaded = true
89
124
  end
90
- # A list of all possible word categories.
91
- WordCategories = [
92
- :adjective, :adverb, :noun, :verb, :interjection,
93
- :clitic, :coverb, :conjunction, :determiner, :particle,
94
- :preposition, :pronoun, :number, :symbol, :punctuation,
95
- :complementizer
96
- ]
125
+ @@loaded = true
97
126
  end
127
+
128
+ # Get the language list.
129
+ get_languages
130
+
131
+
98
132
  end
@@ -1,16 +1,16 @@
1
- module Treat
2
- module Languages
3
- class Arabic
4
- RequiredDependencies = []
5
- OptionalDependencies = []
6
- Extractors = {}
7
- Inflectors = {}
8
- Lexicalizers = {
9
- :tag => [:stanford]
10
- }
11
- Processors = {
12
- :parsers => [:stanford]
13
- }
14
- end
15
- end
1
+ class Treat::Languages::Arabic
2
+
3
+ RequiredDependencies = []
4
+ OptionalDependencies = []
5
+
6
+ Extractors = {}
7
+ Inflectors = {}
8
+ Lexicalizers = {
9
+ :tag => [:stanford]
10
+ }
11
+ Processors = {
12
+ :parsers => [:stanford]
13
+ }
14
+ Retrievers = {}
15
+
16
16
  end