treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
data/lib/treat/kernel.rb CHANGED
@@ -2,20 +2,22 @@
2
2
  # easy access to utility functions used across
3
3
  # the library.
4
4
  module Kernel
5
+
6
+ # Require file utilities for creating and
7
+ # deleting temporary files.
5
8
  require 'fileutils'
6
- require 'tempfile'
9
+
7
10
  # A list of acronyms used in class names within
8
11
  # the program. These do not CamelCase; they
9
12
  # CAMELCase.
10
- Acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF'].join('|')
13
+ Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo]
14
+
11
15
  # A cache to optimize camel casing.
12
16
  @@cc_cache = {}
17
+
13
18
  # A cache to optimize un camel casing.
14
19
  @@ucc_cache = {}
15
- # Returns the platform we are running on.
16
- def platform
17
- RUBY_PLATFORM.split("-")[1]
18
- end
20
+
19
21
  # Runs a block of code without warnings.
20
22
  def silence_warnings(&block)
21
23
  warn_level = $VERBOSE
@@ -24,62 +26,85 @@ module Kernel
24
26
  $VERBOSE = warn_level
25
27
  result
26
28
  end
29
+
27
30
  # Runs a block of code while blocking stdout.
28
- def silence_stdout(log = '/dev/null')
31
+ def silence_stdout(log = NULL_DEVICE)
32
+ unless Treat.silence
33
+ yield; return
34
+ end
29
35
  old = $stdout.dup
30
36
  $stdout.reopen(File.new(log, 'w'))
31
37
  yield
32
38
  $stdout = old
33
39
  end
40
+
34
41
  # Create a temporary file which is deleted
35
42
  # after execution of the block.
36
43
  def create_temp_file(ext, value = nil, &block)
37
- fname = "#{Treat.lib}/../tmp/#{Random.rand(10000000).to_s}.#{ext}"
38
- File.open(fname, 'w') do |f|
39
- f.write(value) if value
44
+ fname = Treat.tmp +
45
+ "#{Random.rand(10000000).to_s}.#{ext}"
46
+ File.open(fname, 'w') do |f|
47
+ f.write(value) if value
40
48
  block.call(f.path)
41
49
  end
42
50
  ensure
43
51
  File.delete(fname)
44
52
  end
45
- # Create a temporary directory.
53
+
54
+ # Create a temporary directory, which is
55
+ # deleted after execution of the block.
46
56
  def create_temp_dir(&block)
47
- dname = "#{Treat.lib}/../tmp/#{Random.rand(10000000).to_s}"
57
+ dname = "#{Treat.lib}/../tmp/"+
58
+ "#{Random.rand(10000000).to_s}"
48
59
  Dir.mkdir(dname)
49
60
  block.call(dname)
50
61
  ensure
51
62
  FileUtils.rm_rf(dname)
52
63
  end
64
+
53
65
  # Convert un_camel_case to CamelCase.
54
66
  def camel_case(o_phrase)
55
67
  phrase = o_phrase.to_s.dup
56
68
  return @@cc_cache[o_phrase] if @@cc_cache[o_phrase]
57
- phrase.gsub!(/#{Acronyms.downcase}[^a-z]*/) { |a| a.upcase }
58
- phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
59
- phrase.gsub!('_', '')
69
+
70
+ if Acronyms.include?(phrase)
71
+ phrase = phrase.upcase
72
+ else
73
+ phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
74
+ phrase.gsub!('_', '')
75
+ end
60
76
  @@cc_cache[o_phrase] = phrase
61
- phrase
62
77
  end
78
+
63
79
  alias :cc :camel_case
80
+
64
81
  # Convert CamelCase to un_camel_case.
65
82
  def un_camel_case(o_phrase)
66
83
  phrase = o_phrase.to_s.dup
67
84
  return @@ucc_cache[o_phrase] if @@ucc_cache[o_phrase]
68
- phrase.gsub!(/#{Acronyms}/) { |a| a.downcase.capitalize }
69
- phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
70
- phrase = phrase[1..-1] if phrase[0] == '_'
85
+ if Acronyms.include?(phrase.downcase)
86
+ phrase = phrase.downcase
87
+ else
88
+ phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
89
+ phrase = phrase[1..-1] if phrase[0] == '_'
90
+ end
71
91
  @@ucc_cache[o_phrase] = phrase
72
- phrase
73
92
  end
93
+
74
94
  alias :ucc :un_camel_case
95
+
75
96
  # Retrieve the Class from a Module::Class.
76
97
  def class_name(n); n.to_s.split('::')[-1]; end
98
+
77
99
  alias :cl :class_name
100
+
78
101
  # Search the list to see if there are words similar to #name
79
- # in the #list If yes, return a string saying "Did you mean
102
+ # in the #list. If yes, return a string saying "Did you mean
80
103
  # ... ?" with the names.
81
104
  def did_you_mean?(list, name)
82
- msg = ''
105
+ return '' # Fix
106
+ list = list.map { |e| e.to_s }
107
+ name = name.to_s
83
108
  sugg = []
84
109
  list.each do |element|
85
110
  l = levenshtein(element,name)
@@ -91,22 +116,38 @@ module Kernel
91
116
  if sugg.size == 1
92
117
  msg += " Perhaps you meant '#{sugg[0]}' ?"
93
118
  else
94
- sugg_quote = sugg[0..-2].map {|x| '\'' + x + '\''}
95
- msg += " Perhaps you meant #{sugg_quote.join(', ')}," +
119
+ sugg_quote = sugg[0..-2].map do
120
+ |x| '\'' + x + '\''
121
+ end
122
+ msg += " Perhaps you meant " +
123
+ "#{sugg_quote.join(', ')}," +
96
124
  " or '#{sugg[-1]}' ?"
97
125
  end
98
126
  end
99
127
  msg
100
128
  end
129
+
101
130
  alias :dym? :did_you_mean?
131
+
102
132
  # Return the name of the method that called the method
103
133
  # that calls this method.
104
134
  def caller_method(n = 3)
105
135
  at = caller(n).first
106
136
  /^(.+?):(\d+)(?::in `(.*)')?/ =~ at
107
- Regexp.last_match[3].intern
137
+ Regexp.last_match[3].gsub('block in ', '').intern
108
138
  end
139
+
109
140
  alias :cm :caller_method
141
+
142
+ # Detect the platform we're running on.
143
+ def detect_platform
144
+ p = RUBY_PLATFORM.downcase
145
+ return :mac if p.include?("darwin")
146
+ return :windows if p.include?("mswin")
147
+ return :linux if p.include?("linux")
148
+ return :unknown
149
+ end
150
+
110
151
  # Return the Levenshtein distance between two strings,
111
152
  # taking into account the costs of insertion, deletion,
112
153
  # and substitution. Stolen from:
@@ -116,14 +157,16 @@ module Kernel
116
157
  return nil if first.nil? || other.nil?
117
158
  dm = []
118
159
  dm[0] = (0..first.length).collect { |i| i * ins}
119
- fill = [0] * (first.length - 1)
160
+ fill = [0] * (first.length - 1).abs
120
161
  for i in 1..other.length
121
162
  dm[i] = [i * del, fill.flatten]
122
163
  end
123
164
  for i in 1..other.length
124
165
  for j in 1..first.length
125
166
  dm[i][j] = [
126
- dm[i-1][j-1] + (first[i-1] == other[i-1] ? 0 : sub),
167
+ dm[i-1][j-1] +
168
+ (first[i-1] ==
169
+ other[i-1] ? 0 : sub),
127
170
  dm[i][j-1] + ins,
128
171
  dm[i-1][j] + del
129
172
  ].min
@@ -131,4 +174,39 @@ module Kernel
131
174
  end
132
175
  dm[other.length][first.length]
133
176
  end
177
+
178
+ if detect_platform == :windows
179
+ NULL_DEVICE = 'NUL'
180
+ else
181
+ NULL_DEVICE = '/dev/null'
182
+ end
183
+
184
+ def debug(msg)
185
+ puts msg if Treat.debug
186
+ end
187
+
188
+ def prompt(msg, valid_answers)
189
+
190
+ msg = msg
191
+ n = msg.include?("\n") ? ":\n" : ''
192
+ q = msg.include?("\n") ? '' : '?'
193
+
194
+ s = "\nPlease enter one of #{valid_answers.join(', ')}: "
195
+ puts "Do you want to #{n}#{msg}#{q} \n#{s}"
196
+
197
+ begin
198
+ answer = STDIN.gets.strip
199
+ unless valid_answers.include?(answer)
200
+ puts "Invalid input."
201
+ puts s
202
+ raise Treat::InvalidInputException
203
+ end
204
+ puts
205
+ answer
206
+ rescue Treat::InvalidInputException
207
+ retry
208
+ end
209
+
210
+ end
211
+
134
212
  end
@@ -1,98 +1,132 @@
1
- module Treat
2
- # This module provides linguistic resources
3
- # for the Treat library, including information
4
- # about language codes, the functions available
5
- # for each language, and the different tags used
6
- # to markup that language.
7
- module Languages
8
- Dir["#{Treat.lib}/treat/languages/*.rb"].each { |file| require file }
9
- ISO639_1 = 1
10
- ISO639_2 = 2
11
- # Describe a language code (ISO-639-1 or ISO-639-2)
12
- # or its full text description in full French or English.
13
- def self.describe(lang, desc_lang = :en)
14
- raise "Must provide a non-nil language identifier to describe." if lang.nil?
15
- lang = code(lang).to_s
16
- if [:en, :eng, :english, :anglais].include?(desc_lang)
17
- l = @@english_full.key(lang)
18
- elsif [:fr, :fra, :french, :french].include?(desc_lang)
19
- l = @@french_full.key(lang)
20
- else
21
- raise Treat::Exception,
22
- "Unknown language to describe: #{desc_lang}."
23
- end
24
- not_found(lang) if l.nil?
25
- l.intern
1
+ # This module provides linguistic resources
2
+ # for the Treat library, including information
3
+ # about language codes, the functions available
4
+ # for each language, and the different tags used
5
+ # to markup that language.
6
+ module Treat::Languages
7
+
8
+ def self.const_missing(const)
9
+ lang = const.to_s.downcase
10
+ f = File.join(File.dirname(__FILE__), "languages", lang)
11
+ unless File.readable?(f + '.rb')
12
+ raise Treat::Exception,
13
+ "Language #{lang} is not supported."
26
14
  end
27
- # Raise an error message when a language code
28
- # or description is not found and suggest
29
- # possible misspellings.
30
- def self.not_found(lang)
31
- msg = "Language '#{lang}' does not exist."
32
- all = @@iso639_2.keys + @@iso639_1.keys +
33
- @@english_full.keys + @@french_full.keys
34
- msg += did_you_mean?(all, lang)
35
- raise Treat::Exception, msg
15
+ require f
16
+ const_get(const)
17
+ end
18
+
19
+ # Yield a lowercase symbol for each
20
+ # defined language.
21
+ def self.each
22
+ constants.each do |constant|
23
+ yield constant.to_s.downcase.intern
36
24
  end
37
- # Return the class representing a language.
38
- def self.get(lang)
39
- const_get(Treat::Languages.describe(lang).to_s.capitalize)
25
+ end
26
+
27
+ # Identifier constants for language codes.
28
+ ISO639_1 = 1
29
+ ISO639_2 = 2
30
+
31
+ # Describe a language code (ISO-639-1 or ISO-639-2)
32
+ # or its full text description in full French or English.
33
+ def self.describe(lang, desc_lang = :en)
34
+ raise "Must provide a non-nil language "+
35
+ "identifier to describe." if lang.nil?
36
+ lang = code(lang).to_s
37
+ if [:en, :eng, :english, :anglais].
38
+ include?(desc_lang)
39
+ l = @@english_full.key(lang)
40
+ elsif [:fr, :fra, :french, :french].
41
+ include?(desc_lang)
42
+ l = @@french_full.key(lang)
43
+ else
44
+ raise Treat::Exception,
45
+ "Unknown language to describe: #{desc_lang}."
40
46
  end
41
- # Find a language by ISO-639-1 or ISO-639-2 code
42
- # or full name (in English or French) and return
43
- # the ISO-639-1 or ISO-639-2 language code as a
44
- # lowercase identifier.
45
- def self.code(lang, rc = ISO639_2)
46
- raise "Must provide a non-nil language identifier to describe." if lang.nil?
47
- get_languages
48
- lang = lang.to_s.downcase
49
- if @@iso639_1.has_key?(lang)
50
- return lang.intern if rc == ISO639_1
51
- return @@iso639_1[lang].intern if rc == ISO639_2
52
- elsif @@iso639_2.has_key?(lang)
53
- return lang.intern if rc == ISO639_2
54
- return @@iso639_2[lang].intern if rc == ISO639_1
55
- elsif @@english_full.has_key?(lang)
56
- return @@english_full[lang].intern if rc == ISO639_2
57
- return @@iso639_2[@@english_full[lang]].intern if rc == ISO639_1
58
- elsif @@french_full.has_key?(lang)
59
- return @@french_full[lang].intern if rc == ISO639_2
60
- return @@iso639_1[@@french_full[lang]].intern if rc == ISO639_2
61
- else
62
- not_found(lang)
63
- end
47
+ not_found(lang) if l.nil?
48
+ l.intern
49
+ end
50
+
51
+ # Raise an error message when a language code
52
+ # or description is not found and suggest
53
+ # possible misspellings.
54
+ def self.not_found(lang)
55
+ msg = "Language '#{lang}' does not exist."
56
+ all = @@iso639_2.keys + @@iso639_1.keys +
57
+ @@english_full.keys + @@french_full.keys
58
+ msg += did_you_mean?(all, lang)
59
+ raise Treat::Exception, msg
60
+ end
61
+
62
+ # Return the class representing a language.
63
+ def self.get(lang)
64
+ lang = Treat::Languages.describe(lang).to_s
65
+ begin
66
+ const_get(lang.capitalize)
67
+ rescue
68
+ not_found(lang)
69
+ end
70
+ end
71
+
72
+ # Find a language by ISO-639-1 or ISO-639-2 code
73
+ # or full name (in English or French) and return
74
+ # the ISO-639-1 or ISO-639-2 language code as a
75
+ # lowercase identifier.
76
+ def self.code(lang, rc = ISO639_2)
77
+ raise "Must provide a non-nil language "+
78
+ "identifier to describe." if lang.nil?
79
+ get_languages
80
+ lang = lang.to_s.downcase
81
+ if @@iso639_1.has_key?(lang)
82
+ return lang.intern if rc == ISO639_2
83
+ return @@iso639_1[lang].intern if rc == ISO639_1
84
+ elsif @@iso639_2.has_key?(lang)
85
+ return lang.intern if rc == ISO639_2
86
+ return @@iso639_2[lang].intern if rc == ISO639_1
87
+ elsif @@english_full.has_key?(lang)
88
+ return @@english_full[lang].intern if rc == ISO639_2
89
+ return @@iso639_2[@@english_full[lang]].intern if rc == ISO639_1
90
+ elsif @@french_full.has_key?(lang)
91
+ return @@french_full[lang].intern if rc == ISO639_2
92
+ return @@iso639_2[@@french_full[lang]].intern if rc == ISO639_1
93
+ else
94
+ not_found(lang)
64
95
  end
65
- @@loaded = false
66
- # Get the languages from the dictionary.
67
- def self.get_languages
68
- return if @@loaded
69
- @@iso639_1 = {}; @@iso639_2 = {};
70
- @@english_full = {}; @@french_full = {}
71
- languages = IO.readlines(Treat.lib + '/treat/languages/list.txt')
72
- languages.each do |language|
73
- iso639_2, iso639_1, english_desc, french_desc =
74
- language.split(',')
75
- @@iso639_1[iso639_1] = iso639_2
76
- @@iso639_2[iso639_2] = iso639_1
77
- unless english_desc.nil?
78
- english_desc.strip.downcase.split('|').each do |l|
79
- @@english_full[l.downcase.strip] = iso639_2
80
- end
96
+
97
+ end
98
+
99
+ # Whether the language list has been loaded or not.
100
+ @@loaded = false
101
+
102
+ # Get the languages from the dictionary.
103
+ def self.get_languages
104
+ return if @@loaded
105
+ @@iso639_1 = {}; @@iso639_2 = {};
106
+ @@english_full = {}; @@french_full = {}
107
+ languages = IO.readlines(File.join(
108
+ File.dirname(__FILE__), "languages", "list.txt"))
109
+ languages.each do |language|
110
+ iso639_2, iso639_1, english_desc, french_desc =
111
+ language.split(',')
112
+ @@iso639_1[iso639_1] = iso639_2
113
+ @@iso639_2[iso639_2] = iso639_1
114
+ unless english_desc.nil?
115
+ english_desc.strip.downcase.split('|').each do |l|
116
+ @@english_full[l.downcase.strip] = iso639_2
81
117
  end
82
- unless french_desc.nil?
83
- french_desc.strip.downcase.split('|').each do |l|
84
- @@french_full[l.downcase.strip] = iso639_2
85
- end
118
+ end
119
+ unless french_desc.nil?
120
+ french_desc.strip.downcase.split('|').each do |l|
121
+ @@french_full[l.downcase.strip] = iso639_2
86
122
  end
87
123
  end
88
- @@loaded = true
89
124
  end
90
- # A list of all possible word categories.
91
- WordCategories = [
92
- :adjective, :adverb, :noun, :verb, :interjection,
93
- :clitic, :coverb, :conjunction, :determiner, :particle,
94
- :preposition, :pronoun, :number, :symbol, :punctuation,
95
- :complementizer
96
- ]
125
+ @@loaded = true
97
126
  end
127
+
128
+ # Get the language list.
129
+ get_languages
130
+
131
+
98
132
  end
@@ -1,16 +1,16 @@
1
- module Treat
2
- module Languages
3
- class Arabic
4
- RequiredDependencies = []
5
- OptionalDependencies = []
6
- Extractors = {}
7
- Inflectors = {}
8
- Lexicalizers = {
9
- :tag => [:stanford]
10
- }
11
- Processors = {
12
- :parsers => [:stanford]
13
- }
14
- end
15
- end
1
+ class Treat::Languages::Arabic
2
+
3
+ RequiredDependencies = []
4
+ OptionalDependencies = []
5
+
6
+ Extractors = {}
7
+ Inflectors = {}
8
+ Lexicalizers = {
9
+ :tag => [:stanford]
10
+ }
11
+ Processors = {
12
+ :parsers => [:stanford]
13
+ }
14
+ Retrievers = {}
15
+
16
16
  end