treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -1,49 +0,0 @@
1
- module Treat
2
- module Lexicalizers
3
- module Category
4
- # A class that detects the category of a word from its tag,
5
- # using the default tagger for the language of the entity.
6
- class FromTag
7
- # Find the category of the current entity.
8
- #
9
- # Options:
10
- #
11
- # - (Symbol) :tagger => force the use of a tagger.
12
- def self.category(entity, options = {})
13
- tag = entity.tag(options[:tagger])
14
- return :unknown if tag.nil? || tag == ''
15
- return :sentence if tag == 'S'
16
- if entity.is_a?(Treat::Entities::Phrase)
17
- cat = Treat::Languages::Tags::PhraseTagToCategory[tag]
18
- unless cat
19
- cat = Treat::Languages::Tags::WordTagToCategory[tag]
20
- end
21
- elsif entity.is_a?(Treat::Entities::Word)
22
- cat = Treat::Languages::Tags::WordTagToCategory[tag]
23
- end
24
- if cat == nil
25
- warn "Category not found for tag '#{tag}'."
26
- return :unknown
27
- else
28
- if cat.size == 1
29
- return cat[entity.tag_set]
30
- else
31
- if entity.has?(:tag_set)
32
- if cat[entity.tag_set]
33
- return cat[entity.tag_set]
34
- else
35
- raise Treat::Exception,
36
- "The specified tag set (#{entity.tag_set})" +
37
- " does not contain the tag #{tag}."
38
- end
39
- else
40
- raise Treat::Exception,
41
- "No information can be found regarding which tag set to use."
42
- end
43
- end
44
- end
45
- end
46
- end
47
- end
48
- end
49
- end
@@ -1,63 +0,0 @@
1
- module Treat
2
- module Lexicalizers
3
- module Linkages
4
- class Naive
5
- # Fix - add options for sentences.
6
- def self.linkages(entity, options = {})
7
- if options[:linkage] == :is_a ||
8
- options[:linkage] == :hypernym_of
9
-
10
- entity.each_word do |w1|
11
- hypernyms = []
12
- entity.each_word do |w2|
13
- next if w1 == w2
14
- if w2.hypernyms.include?(w1.value) ||
15
- w1.hyponyms.include?(w2.value)
16
- hypernyms << w1
17
- w2.link(w1, :is_a)
18
- w1.link(w2, :hypernym_of)
19
- end
20
- end
21
- w1.set :hypernyms, hypernyms
22
- end
23
-
24
- elsif options[:linkage] == :synonym_of
25
-
26
- entity.each_word do |w1|
27
- synonyms = []
28
- entity.each_word do |w2|
29
- next if w1 == w2
30
- if w2.synonyms.include?(w1.value)
31
- synonyms << w1
32
- w2.link(w1, :synonym_of)
33
- w1.link(w2, :synonym_of)
34
- end
35
- end
36
- w1.set :synonyms, synonyms
37
- end
38
-
39
- elsif options[:linkage] == :antonym_of
40
-
41
- entity.each_word do |w1|
42
- antonyms = []
43
- entity.each_word do |w2|
44
- next if w1 == w2
45
- if w2.antonyms.include?(w1.value)
46
- antonyms << w1
47
- w2.link(w1, :antonym_of)
48
- w1.link(w2, :antonym_of)
49
- end
50
- end
51
- w1.set :antonyms, antonyms
52
- end
53
-
54
- else
55
- raise Treat::Exception,
56
- "Invalid linkage option '#{options[:linkage]}'."
57
- end
58
-
59
- end
60
- end
61
- end
62
- end
63
- end
@@ -1,76 +0,0 @@
1
- module Treat
2
- module Lexicalizers
3
- module Synsets
4
- # Obtain lexical information about a word using the
5
- # ruby 'wordnet' gem.
6
- class Wordnet
7
- # Require the 'wordnet' gem.
8
- require 'wordnet'
9
- # Obtain lexical information about a word using the
10
- # ruby 'wordnet' gem.
11
- def self.synsets(word, options = nil)
12
- unless [:noun, :adjective, :verb].include?(word.category)
13
- return []
14
- end
15
- cat = word.category.to_s.capitalize
16
- index = ::WordNet.const_get(cat + 'Index').instance
17
- lemma = index.find(word.value.downcase)
18
- return [] if lemma.nil?
19
- synsets = []
20
- lemma.synsets.each { |synset| synsets << Synset.new(synset) }
21
- synsets
22
- end
23
- end
24
- end
25
- # An adaptor for synsets used by the Wordnet gem.
26
- class Synset
27
- # The POS tag of the word.
28
- attr_accessor :pos
29
- # The definition of the synset.
30
- attr_accessor :definition
31
- # The examples in the synset.
32
- attr_accessor :examples
33
- def initialize(synset)
34
- @original_synset = synset
35
- @pos, @definition, @examples =
36
- parse_synset(synset.to_s.split(')'))
37
- end
38
- def parse_synset(res)
39
- pos = res[0][1..-1].strip
40
- res2 = res[1].split('(')
41
- res3 = res2[1].split(';')
42
- 1.upto(res3.size-1) do |i|
43
- res3[i] = res3[i].strip[1..-2]
44
- end
45
- definition = res3[0]
46
- examples = res3[1..-1]
47
- return pos, definition, examples
48
- end
49
- # The words in the synset.
50
- def words; @original_synset.words; end
51
- def synonyms; @original_synset.words; end
52
- # A gloss (short definition with examples)
53
- # for the synset.
54
- def gloss; @original_synset.gloss; end
55
- # The antonym sets of the synset.
56
- def antonyms; antonym.collect { |a| a.words }; end
57
- # The hypernym sets of the synset.
58
- def hypernyms;
59
- h = hypernym
60
- return [] unless h
61
- h.words
62
- end
63
- # The hyponym sets of the synset.
64
- def hyponyms; hyponym.collect { |h| h.words }; end
65
- # Respond to the missing method event.
66
- def method_missing(sym, *args, &block)
67
- ret = @original_synset.send(sym)
68
- if ret.is_a?(::WordNet::Synset)
69
- Synset.new(ret)
70
- else
71
- ret
72
- end
73
- end
74
- end
75
- end
76
- end
@@ -1,91 +0,0 @@
1
- module Treat
2
- module Lexicalizers
3
- module Tag
4
- # Adapter class for the 'rbtagger' gem, a port
5
- # of the Perl Lingua::BrillTagger class, based
6
- # on the rule-based tagger developped by Eric Brill.
7
- #
8
- # The Brill tagger is a simple rule-based part of
9
- # speech tagger. The main advantages over stochastic
10
- # taggers is a vast reduction in information required
11
- # and better portability from one tag set, corpus genre
12
- # or language to another.
13
- #
14
- # Original paper:
15
- # Eric Brill. 1992. A simple rule-based part of speech tagger.
16
- # In Proceedings of the third conference on Applied natural
17
- # language processing (ANLC '92). Association for Computational
18
- # Linguistics, Stroudsburg, PA, USA, 152-155.
19
- # DOI=10.3115/974499.974526 http://dx.doi.org/10.3115/974499.974526
20
- # Project website:
21
- # http://rbtagger.rubyforge.org/
22
- # Original Perl module site:
23
- # http://search.cpan.org/~kwilliams/Lingua-BrillTagger-0.02/lib/Lingua/BrillTagger.pm
24
- class Brill
25
- patch = false
26
- # Require the 'rbtagger' gem.
27
- require 'rbtagger'
28
- begin
29
- # This whole mess is required to deal with
30
- # the fact that the 'rbtagger' gem defines
31
- # a top-level module called 'Word', which
32
- # will clash with the top-level class 'Word'
33
- # we define when syntactic sugar is enabled.
34
- rescue TypeError
35
- if Treat.sweetened?
36
- patch = true
37
- # Unset the class Word for the duration
38
- # of loading the tagger.
39
- Object.const_unset(:Word); retry
40
- else
41
- raise Treat::Exception,
42
- 'Something went wrong due to a name clash with the "rbtagger" gem.' +
43
- 'Turn off syntactic sugar to resolve this problem.'
44
- end
45
- ensure
46
- # Reset the class Word if using syntactic sugar.
47
- if Treat.sweetened? && patch
48
- Object.const_set(:Word, Treat::Entities::Word)
49
- end
50
- end
51
- # Hold the tagger.
52
- @@tagger = nil
53
- # Tag words using a native Brill tagger.
54
- # Performs own tokenization.
55
- #
56
- # Options:
57
- #
58
- # :lexicon => String (Lexicon file to use)
59
- # :lexical_rules => String (Lexical rule file to use)
60
- # :contextual_rules => String (Contextual rules file to use)
61
- def self.tag(entity, options = {})
62
- if entity.has_children?
63
- warn "The Brill tagger performs its own tokenization. " +
64
- "Removing all children of #{entity.type} with value #{entity.short_value}."
65
- entity.remove_all!
66
- end
67
- # Create the tagger if necessary
68
- @@tagger ||= ::Brill::Tagger.new(options[:lexicon],
69
- options[:lexical_rules], options[:contextual_rules])
70
- res = @@tagger.tag(entity.to_s)
71
- res ||= []
72
- isolated_word = entity.is_a?(Treat::Entities::Token)
73
- res.each do |info|
74
- next if info[1] == ')'
75
- token = Treat::Entities::Token.from_string(info[0])
76
- token.set :tag_set, :penn
77
- token.set :tag, info[1]
78
- if isolated_word
79
- entity.set :tag_set, :penn
80
- return info[1]
81
- end
82
- entity << token
83
- end
84
- entity.set :tag_set, :penn
85
- return 'P' if entity.is_a?(Treat::Entities::Phrase)
86
- return 'S' if entity.is_a?(Treat::Entities::Sentence)
87
- end
88
- end
89
- end
90
- end
91
- end
@@ -1,123 +0,0 @@
1
- module Treat
2
- module Lexicalizers
3
- module Tag
4
- # An adapter for the 'engtagger' gem, which
5
- # is a port of the Perl Lingua::EN::Tagger module.
6
- #
7
- # "This module uses part-of-speech statistics from
8
- # the Penn Treebank to assign POS tags to English text.
9
- # The tagger applies a bigram (two-word) Hidden Markov
10
- # Model to guess the appropriate POS tag for a word.
11
- # That means that the tagger will try to assign a POS
12
- # tag based on the known POS tags for a given word and
13
- # the POS tag assigned to its predecessor.
14
- #
15
- # Project website: http://engtagger.rubyforge.org/
16
- # Original Perl module site:
17
- # http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
18
- class Lingua
19
- # Require the 'engtagger' gem.
20
- silence_warnings { require 'engtagger' }
21
- # Hold the tagger.
22
- @@tagger = nil
23
- # Hold the user-set options
24
- @@options = {}
25
- # Hold the default options.
26
- DefaultOptions = {
27
- :unknown_word_tag => 'pp', # Fix unknown word tag
28
- :relax => false
29
- }
30
- # Tag the word using a probabilistic model taking
31
- # into account known words found in a lexicon and
32
- # the tag of the previous word.
33
- #
34
- # Options:
35
- #
36
- # - (Boolean) :relax => Relax the Hidden Markov Model:
37
- # this may improve accuracy for uncommon words,
38
- # particularly words used polysemously.
39
- # - (String) :unknown_word_tag => Tag for unknown words.
40
- def self.tag(entity, options = {})
41
- if !entity.has_children?
42
- warn "The Lingua tagger requires prior tokenization."
43
- warn "Tokenizing the entity #{entity.short_value}."
44
- end
45
- options = DefaultOptions.merge(options)
46
- # Reinitialize the tagger if the options have changed.
47
- if options != @@options
48
- @@options = DefaultOptions.merge(options)
49
- @@tagger = nil # Reset the tagger
50
- end
51
- @@tagger ||= ::EngTagger.new(@@options)
52
- left_tag = @@tagger.conf[:current_tag] = 'pp'
53
- isolated_word = entity.is_a?(Treat::Entities::Token)
54
- entity.tokens.each do |token|
55
- w = @@tagger.clean_word(token.to_s)
56
- t = @@tagger.assign_tag(left_tag, w)
57
- t = options[:unknown_word_tag] if t.nil? || t == ''
58
- @@tagger.conf[:current_tag] = left_tag = t
59
- token.set :tag, t.upcase
60
- token.set :tag_set, :penn
61
- if isolated_word
62
- entity.set :tag_set, :penn
63
- return t.upcase
64
- end
65
- end
66
- entity.set :tag_set, :penn
67
- return 'P' if entity.is_a?(Treat::Entities::Phrase)
68
- return 'S' if entity.is_a?(Treat::Entities::Sentence)
69
- end
70
- end
71
- end
72
- end
73
- end
74
-
75
- =begin
76
-
77
- CC Conjunction, coordinating and, or
78
- CD Adjective, cardinal number 3, fifteen
79
- DET Determiner this, each, some
80
- EX Pronoun, existential there there
81
- FW Foreign words
82
- IN Preposition / Conjunction for, of, although, that
83
- JJ Adjective happy, bad
84
- JJR Adjective, comparative happier, worse
85
- JJS Adjective, superlative happiest, worst
86
- LS Symbol, list item A, A.
87
- MD Verb, modal can, could, 'll
88
- NN Noun aircraft, data
89
- NNP Noun, proper London, Michael
90
- NNPS Noun, proper, plural Australians, Methodists
91
- NNS Noun, plural women, books
92
- PDT Determiner, prequalifier quite, all, half
93
- POS Possessive 's, '
94
- PRP Determiner, possessive second mine, yours
95
- PRPS Determiner, possessive their, your
96
- RB Adverb often, not, very, here
97
- RBR Adverb, comparative faster
98
- RBS Adverb, superlative fastest
99
- RP Adverb, particle up, off, out
100
- SYM Symbol *
101
- TO Preposition to
102
- UH Interjection oh, yes, mmm
103
- VB Verb, infinitive take, live
104
- VBD Verb, past tense took, lived
105
- VBG Verb, gerund taking, living
106
- VBN Verb, past/passive participle taken, lived
107
- VBP Verb, base present form take, live
108
- VBZ Verb, present 3SG -s form takes, lives
109
- WDT Determiner, question which, whatever
110
- WP Pronoun, question who, whoever
111
- WPS Determiner, possessive & question whose
112
- WRB Adverb, question when, how, however
113
-
114
- PP Punctuation, sentence ender ., !, ?
115
- PPC Punctuation, comma ,
116
- PPD Punctuation, dollar sign $
117
- PPL Punctuation, quotation mark left ``
118
- PPR Punctuation, quotation mark right ''
119
- PPS Punctuation, colon, semicolon, elipsis :, ..., -
120
- LRB Punctuation, left bracket (, {, [
121
- RRB Punctuation, right bracket ), }, ]
122
-
123
- =end
@@ -1,70 +0,0 @@
1
- module Treat
2
- module Lexicalizers
3
- module Tag
4
- class Stanford
5
- require 'stanford-core-nlp'
6
- # Hold one tagger per language.
7
- @@taggers = {}
8
- # Hold the default options.
9
- DefaultOptions = {
10
- :tagger_model => nil,
11
- :silence => false,
12
- :log_to_file => nil
13
- }
14
- LanguageToTagSet = {
15
- :eng => :penn,
16
- :ger => :negra,
17
- :chi => :penn_chinese,
18
- :fre => :simple
19
- }
20
- # Tag the word using one of the Stanford taggers.
21
- def self.tag(entity, options = {})
22
- # Handle options and set models.
23
- options = DefaultOptions.merge(options)
24
- if entity.has_children?
25
- warn "The Stanford tagger performs its own tokenization." +
26
- "Removing all children of #{entity.type} with value #{entity.short_value}."
27
- entity.remove_all!
28
- end
29
- # Arrange options.
30
- lang = entity.language
31
- tag_set = LanguageToTagSet[lang]
32
- warn "The tag set for the Stanford tagger you are requiring is not supported." unless tag_set
33
- ::StanfordCoreNLP.set_model('pos.model', options[:tagger_model]) if options[:tagger_model]
34
- options[:log_to_file] = '/dev/null' if options[:silence]
35
- ::StanfordCoreNLP.log_file = options[:log_to_file] if options[:log_to_file]
36
-
37
- # Load the tagger.
38
- StanfordCoreNLP.use(lang)
39
- @@taggers[lang] ||= ::StanfordCoreNLP.load(:tokenize, :ssplit, :pos)
40
-
41
- # Tag the text.
42
- text = ::StanfordCoreNLP::Text.new(entity.to_s)
43
- isolated_word = entity.is_a?(Treat::Entities::Token)
44
- @@taggers[lang].annotate(text)
45
-
46
- text.get(:tokens).each do |token|
47
- val = token.get(:value).to_s
48
- tok = Treat::Entities::Token.from_string(val)
49
- tag = token.get(:part_of_speech).to_s
50
- tag_s, tag_opt = *tag.split('-')
51
- tag_s ||= ''
52
- tok.set :tag, tag_s
53
- tok.set :tag_opt, tag_opt
54
- tok.set :tag_set, tag_set if tag_set
55
- if isolated_word
56
- entity.set :tag_set, :penn
57
- return tag_s
58
- end
59
- entity << tok
60
- end
61
-
62
- # Handle tags for sentences and phrases.
63
- entity.set :tag_set, tag_set if tag_set
64
- return 'P' if entity.is_a?(Treat::Entities::Phrase)
65
- return 'S' if entity.is_a?(Treat::Entities::Sentence)
66
- end
67
- end
68
- end
69
- end
70
- end