treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -1,16 +1,16 @@
1
- module Treat
2
- module Languages
3
- class Swedish
4
- RequiredDependencies = []
5
- OptionalDependencies = []
6
- Processors = {
7
- :chunkers => [:txt],
8
- :segmenters => [:punkt],
9
- :tokenizers => [:tactful]
10
- }
11
- Extractors = {}
12
- Inflectors = {}
13
- Lexicalizers = {}
14
- end
15
- end
16
- end
1
+ class Treat::Languages::Swedish
2
+
3
+ RequiredDependencies = []
4
+ OptionalDependencies = []
5
+
6
+ Extractors = {}
7
+ Inflectors = {}
8
+ Lexicalizers = {}
9
+ Processors = {
10
+ :chunkers => [:txt],
11
+ :segmenters => [:punkt],
12
+ :tokenizers => [:perl, :tactful]
13
+ }
14
+ Retrievers = {}
15
+
16
+ end
@@ -1,57 +1,36 @@
1
- module Treat
2
- # Lexicalizers allow the retrieval of lexical information
3
- # (part of speech tag, synsets, hypersets, hyposets, etc.)
4
- # of an entity.
5
- module Lexicalizers
6
- # Taggers return the part of speech tag of a word.
7
- module Tag
8
- extend Group
9
- self.type = :annotator
10
- self.targets = [:sentence, :phrase, :token]
11
- end
12
-
13
- # Return the general category of a word.
14
- module Category
15
- extend Group
16
- self.type = :annotator
17
- self.targets = [:word]
18
- self.default = :from_tag
19
- end
20
-
21
- # Lexicons are dictionnaries of semantically linked
22
- # word forms.
23
- module Synsets
24
- extend Group
25
- self.type = :annotator
26
- self.targets = [:word]
27
- self.postprocessors = {
28
- :synonyms => lambda do |entity, synsets|
29
- synsets.collect { |ss| ss.synonyms }.flatten -
30
- [entity.value]
31
- end,
32
- :antonyms => lambda do |entity, synsets|
33
- synsets.collect { |ss| ss.antonyms }.flatten
34
- end,
35
- :hyponyms => lambda do |entity, synsets|
36
- synsets.collect { |ss| ss.hyponyms }.flatten
37
- end,
38
- :hypernyms => lambda do |entity, synsets|
39
- synsets.collect { |ss| ss.hypernyms }.flatten
40
- end
41
- }
42
- end
43
-
44
- module Linkages
45
- extend Group
46
- self.type = :annotator
47
- self.targets = [:zone]
48
- self.presets = {
49
- :is_a => {:linkage => :is_a},
50
- :synonym_of => {:linkage => :synonym_of},
51
- :antonym_of => {:linkage => :antonym_of}
52
- }
53
- end
54
-
55
- extend Treat::Category
1
+ # Lexicalizers allow the retrieval of lexical information
2
+ # (part of speech tag, general word category, synsets,
3
+ # synonyms, antonyms, hyponyms, hypernyms, lexical
4
+ # relations, grammatical links)
5
+ # of an entity.
6
+ module Treat::Lexicalizers
7
+
8
+ # Taggers return the part of speech tag of a word.
9
+ module Taggers
10
+ extend Treat::Groupable
11
+ self.type = :annotator
12
+ self.targets = [:sentence, :phrase, :token]
56
13
  end
14
+
15
+ # Return the general category of a word.
16
+ module Categorizers
17
+ extend Treat::Groupable
18
+ self.type = :annotator
19
+ self.targets = [:token]
20
+ self.default = :from_tag
21
+ end
22
+
23
+ # Find the synsets of a word in a lexicon.
24
+ module Sensers
25
+ extend Treat::Groupable
26
+ self.type = :annotator
27
+ self.targets = [:word]
28
+ self.preset_option = :nym
29
+ self.presets = [:synonyms, :antonyms,
30
+ :hyponyms, :hypernyms]
31
+ end
32
+
33
+ # Make Lexicalizers categorizable.
34
+ extend Treat::Categorizable
35
+
57
36
  end
@@ -0,0 +1,54 @@
1
+ # Finds the general part of speech of an entity
2
+ # (:sentence, :noun_phrase, :verb, :adverb, etc.)
3
+ # from its tag (e.g. 'S', 'NP', 'VBZ', 'ADV', etc.).
4
+ class Treat::Lexicalizers::Categorizers::FromTag
5
+
6
+ Pttc = Treat::Linguistics::Tags::PhraseTagToCategory
7
+ Wttc = Treat::Linguistics::Tags::WordTagToCategory
8
+ Ptc = Treat::Linguistics::Tags::PunctuationToCategory
9
+
10
+ # Find the category of the entity from its tag.
11
+ def self.category(entity, options = {})
12
+
13
+ tag = entity.check_has(:tag)
14
+ return :unknown if tag.nil? || tag == '' || entity.type == :symbol
15
+ return :sentence if tag == 'S' || entity.type == :sentence
16
+ return :number if entity.type == :number
17
+ return Ptc[entity.to_s] if entity.type == :punctuation
18
+
19
+ if entity.is_a?(Treat::Entities::Phrase)
20
+ cat = Pttc[tag]
21
+ cat = Wttc[tag] unless cat
22
+ else
23
+ cat = Wttc[tag]
24
+ end
25
+
26
+ return :unknown if cat == nil
27
+
28
+ ts = nil
29
+
30
+ if entity.has?(:tag_set)
31
+ ts = entity.get(:tag_set)
32
+ elsif entity.parent_phrase &&
33
+ entity.parent_phrase.has?(:tag_set)
34
+ ts = entity.parent_phrase.get(:tag_set)
35
+ else
36
+ raise Treat::Exception,
37
+ "No information can be found regarding "+
38
+ "which tag set to use."
39
+ end
40
+
41
+ if cat[ts]
42
+ return cat[ts]
43
+ else
44
+ raise Treat::Exception,
45
+ "The specified tag set (#{ts})" +
46
+ " does not contain the tag #{tag} " +
47
+ "for token #{entity.to_s}."
48
+ end
49
+
50
+ :unknown
51
+
52
+ end
53
+
54
+ end
@@ -0,0 +1,57 @@
1
+ # Obtain lexical information about a word using the
2
+ # ruby 'wordnet' gem.
3
+ class Treat::Lexicalizers::Sensers::Wordnet
4
+
5
+ # Require the 'wordnet' gem.
6
+ require 'wordnet'
7
+
8
+ # Patch for bug.
9
+ ::WordNet.module_eval do
10
+ remove_const(:SynsetType)
11
+ const_set(:SynsetType,
12
+ {"n" => "noun", "v" => "verb", "a" => "adj"})
13
+ end
14
+
15
+ # Require an adaptor for Wordnet synsets.
16
+ require 'treat/lexicalizers/sensers/wordnet/synset'
17
+
18
+ # Noun, adjective and verb indexes.
19
+ @@indexes = {}
20
+
21
+ # Obtain lexical information about a word using the
22
+ # ruby 'wordnet' gem.
23
+ def self.sense(word, options = nil)
24
+
25
+ category = word.check_has(:category)
26
+
27
+ unless options[:nym]
28
+ raise Treat::Exception, "You must supply " +
29
+ "the :nym option (:synonym, :hypernym, etc.)"
30
+ end
31
+
32
+ unless [:noun, :adjective, :verb].
33
+ include?(word.category)
34
+ return []
35
+ end
36
+
37
+ cat = category.to_s.capitalize
38
+
39
+ @@indexes[cat] ||=
40
+ ::WordNet.const_get(cat + 'Index').instance
41
+ lemma = @@indexes[cat].find(word.value.downcase)
42
+
43
+ return [] if lemma.nil?
44
+ synsets = []
45
+
46
+ lemma.synsets.each do |synset|
47
+ synsets <<
48
+ Treat::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
49
+ end
50
+
51
+ ((synsets.collect do |ss|
52
+ ss.send(options[:nym])
53
+ end - [word.value]).flatten).uniq
54
+
55
+ end
56
+
57
+ end
@@ -0,0 +1,71 @@
1
+ # An adaptor for synsets used by the Wordnet gem.
2
+ class Treat::Lexicalizers::Sensers::Wordnet::Synset
3
+
4
+ # The POS tag of the word.
5
+ attr_accessor :pos
6
+ # The definition of the synset.
7
+ attr_accessor :definition
8
+ # The examples in the synset.
9
+ attr_accessor :examples
10
+
11
+ def initialize(synset)
12
+ @original_synset = synset
13
+ @pos, @definition, @examples =
14
+ parse_synset(synset.to_s.split(')'))
15
+ end
16
+
17
+ def parse_synset(res)
18
+ pos = res[0][1..-1].strip
19
+ res2 = res[1].split('(')
20
+ res3 = res2[1].split(';')
21
+ 1.upto(res3.size-1) do |i|
22
+ res3[i] = res3[i].strip[1..-2]
23
+ end
24
+ definition = res3[0]
25
+ examples = res3[1..-1]
26
+ return pos, definition, examples
27
+ end
28
+
29
+ # The words in the synset.
30
+ def words
31
+ @original_synset.words
32
+ end
33
+
34
+ def synonyms
35
+ @original_synset.words
36
+ end
37
+
38
+ # A gloss (short definition with examples)
39
+ # for the synset.
40
+ def gloss
41
+ @original_synset.gloss
42
+ end
43
+
44
+ # The antonym sets of the synset.
45
+ def antonyms
46
+ antonym.collect { |a| a.words }
47
+ end
48
+
49
+ # The hypernym sets of the synset.
50
+ def hypernyms
51
+ h = hypernym
52
+ return [] unless h
53
+ h.words
54
+ end
55
+
56
+ # The hyponym sets of the synset.
57
+ def hyponyms
58
+ hyponym.collect { |h| h.words }
59
+ end
60
+
61
+ # Respond to the missing method event.
62
+ def method_missing(sym, *args, &block)
63
+ ret = @original_synset.send(sym)
64
+ if ret.is_a?(Treat::Lexicalizers::Sensers::Wordnet::Synset)
65
+ self.new(ret)
66
+ else
67
+ ret
68
+ end
69
+ end
70
+
71
+ end
@@ -0,0 +1,70 @@
1
+ # Adapter class for the 'rbtagger' gem, a port
2
+ # of the Perl Lingua::BrillTagger class, based
3
+ # on the rule-based tagger developed by Eric Brill.
4
+ #
5
+ # Original paper:
6
+ #
7
+ # Eric Brill. 1992. A simple rule-based part of speech tagger.
8
+ # In Proceedings of the third conference on Applied natural
9
+ # language processing (ANLC '92). Association for Computational
10
+ # Linguistics, Stroudsburg, PA, USA, 152-155.
11
+ # DOI=10.3115/974499.974526 http://dx.doi.org/10.3115/974499.974526
12
+ #
13
+ # Project website:
14
+ #
15
+ # http://rbtagger.rubyforge.org/
16
+ module Treat::Lexicalizers::Taggers::Brill
17
+
18
+ require 'rbtagger'
19
+
20
+ require 'treat/lexicalizers/taggers/brill/patch'
21
+
22
+ # Hold one instance of the tagger.
23
+ @@tagger = nil
24
+
25
+ # Tag words using a native Brill tagger.
26
+ # Performs own tokenization.
27
+ #
28
+ # Options (see the rbtagger gem for more info):
29
+ #
30
+ # :lexicon => String (Lexicon file to use)
31
+ # :lexical_rules => String (Lexical rule file to use)
32
+ # :contextual_rules => String (Contextual rules file to use)
33
+ def self.tag(entity, options = {})
34
+
35
+ # Tokenize the sentence/phrase.
36
+ if !entity.has_children? &&
37
+ !entity.is_a?(Treat::Entities::Token)
38
+ entity.tokenize(:perl, options)
39
+ end
40
+
41
+ # Create the tagger if necessary
42
+ @@tagger ||= ::Brill::Tagger.new(options[:lexicon],
43
+ options[:lexical_rules], options[:contextual_rules])
44
+
45
+ isolated_token = entity.is_a?(Treat::Entities::Token)
46
+ tokens = isolated_token ? [entity] : entity.tokens
47
+ tokens_s = tokens.map { |t| t.value }
48
+
49
+ tags = @@tagger.tag_tokens( tokens_s )
50
+
51
+ pairs = tokens.zip(tags)
52
+
53
+ pairs.each do |pair|
54
+ pair[0].set :tag, pair[1]
55
+ pair[0].set :tag_set, :penn if isolated_token
56
+ return pair[1] if isolated_token
57
+ end
58
+
59
+ if entity.is_a?(Treat::Entities::Sentence) ||
60
+ (entity.is_a?(Treat::Entities::Phrase) &&
61
+ !entity.parent_sentence)
62
+ entity.set :tag_set, :penn
63
+ end
64
+
65
+ return 'S' if entity.is_a?(Treat::Entities::Sentence)
66
+ return 'P' if entity.is_a?(Treat::Entities::Phrase)
67
+
68
+ end
69
+
70
+ end
@@ -0,0 +1,61 @@
1
+ patch = false
2
+
3
+ begin
4
+ # This whole mess is required to deal with
5
+ # the fact that the 'rbtagger' gem defines
6
+ # a top-level module called 'Word', which
7
+ # will clash with the top-level class 'Word'
8
+ # we define when syntactic sugar is enabled.
9
+ rescue TypeError
10
+ if Treat.sweetened?
11
+ patch = true
12
+ # Unset the class Word for the duration
13
+ # of loading the tagger.
14
+ Object.const_unset(:Word); retry
15
+ else
16
+ raise Treat::Exception,
17
+ 'Something went wrong due to a name clash with the "rbtagger" gem.' +
18
+ 'Turn off syntactic sugar to resolve this problem.'
19
+ end
20
+ ensure
21
+ # Reset the class Word if using syntactic sugar.
22
+ if Treat.sweetened? && patch
23
+ Object.const_set(:Word, Treat::Entities::Word)
24
+ end
25
+ end
26
+
27
+ Brill::Tagger.class_eval do
28
+
29
+ def tag_tokens(tokens)
30
+
31
+ tags = Brill::Tagger.tag_start( tokens )
32
+
33
+ @tagger.apply_lexical_rules( tokens, tags, [], 0 )
34
+ @tagger.default_tag_finish( tokens, tags )
35
+
36
+ # Brill uses these fake "STAART" tags to delimit the start & end of sentence.
37
+ tokens << "STAART"
38
+ tokens << "STAART"
39
+ tokens.unshift "STAART"
40
+ tokens.unshift "STAART"
41
+ tags << "STAART"
42
+ tags << "STAART"
43
+ tags.unshift "STAART"
44
+ tags.unshift "STAART"
45
+
46
+ @tagger.apply_contextual_rules( tokens, tags, 1 )
47
+
48
+ tags.shift
49
+ tags.shift
50
+ tokens.shift
51
+ tokens.shift
52
+ tags.pop
53
+ tags.pop
54
+ tokens.pop
55
+ tokens.pop
56
+
57
+ tags
58
+
59
+ end
60
+
61
+ end
@@ -0,0 +1,90 @@
1
+ # An adapter for the 'engtagger' gem, which
2
+ # is a port of the Perl Lingua::EN::Tagger module.
3
+ #
4
+ # "This module uses part-of-speech statistics from
5
+ # the Penn Treebank to assign POS tags to English text.
6
+ # The tagger applies a bigram (two-word) Hidden Markov
7
+ # Model to guess the appropriate POS tag for a word.
8
+ # That means that the tagger will try to assign a POS
9
+ # tag based on the known POS tags for a given word and
10
+ # the POS tag assigned to its predecessor."
11
+ #
12
+ # Project website: http://engtagger.rubyforge.org/
13
+ # Original Perl module site:
14
+ # http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
15
+ class Treat::Lexicalizers::Taggers::Lingua
16
+
17
+ # Require the 'engtagger' gem.
18
+ silence_warnings { require 'engtagger' }
19
+
20
+ # Undefine the porter stemming business.
21
+ String.class_eval { undef :stem }
22
+
23
+ # Hold one instance of the tagger.
24
+ @@tagger = nil
25
+
26
+ # Hold the default options.
27
+ DefaultOptions = { :relax => false }
28
+
29
+ # Replace punctuation tags used by this gem
30
+ # with the standard PTB tags.
31
+ Punctuation = {
32
+ 'pp' => '.',
33
+ 'pps' => ';',
34
+ 'ppc' => ',',
35
+ 'ppd' => '$',
36
+ 'ppl' => 'lrb',
37
+ 'ppr' => 'rrb'
38
+ }
39
+
40
+ # Tag the word using a probabilistic model taking
41
+ # into account known words found in a lexicon and
42
+ # the tag of the previous word.
43
+ #
44
+ # Options:
45
+ #
46
+ # - (Boolean) :relax => Relax the HMM model -
47
+ # this may improve accuracy for uncommon words,
48
+ # particularly words used polysemously.
49
+ def self.tag(entity, options = {})
50
+
51
+ if !entity.has_children? &&
52
+ !entity.is_a?(Treat::Entities::Token)
53
+ entity.tokenize
54
+ end
55
+
56
+ options = DefaultOptions.merge(options)
57
+
58
+ @@tagger ||= ::EngTagger.new(options)
59
+ left_tag = @@tagger.conf[:current_tag] = 'pp'
60
+ isolated_token = entity.is_a?(Treat::Entities::Token)
61
+ tokens = isolated_token ? [entity] : entity.tokens
62
+
63
+ tokens.each do |token|
64
+ next if token.to_s == ''
65
+ w = @@tagger.clean_word(token.to_s)
66
+ t = @@tagger.assign_tag(left_tag, w)
67
+ t = 'fw' if t.nil? || t == ''
68
+ @@tagger.conf[:current_tag] = left_tag = t
69
+ t = 'prp$' if t == 'prps'
70
+ t = 'dt' if t == 'det'
71
+ t = Punctuation[t] if Punctuation[t]
72
+ token.set :tag, t.upcase
73
+ token.set :tag_set, :penn if isolated_token
74
+ return t.upcase if isolated_token
75
+
76
+ end
77
+
78
+
79
+ if entity.is_a?(Treat::Entities::Sentence) ||
80
+ (entity.is_a?(Treat::Entities::Phrase) &&
81
+ !entity.parent_sentence)
82
+ entity.set :tag_set, :penn
83
+ end
84
+
85
+ return 'S' if entity.is_a?(Treat::Entities::Sentence)
86
+ return 'P' if entity.is_a?(Treat::Entities::Phrase)
87
+
88
+ end
89
+
90
+ end