treat 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. data/LICENSE +4 -4
  2. data/TODO +21 -54
  3. data/lib/economist/half_cocked_basel.txt +16 -0
  4. data/lib/economist/hose_and_dry.doc +0 -0
  5. data/lib/economist/hungarys_troubles.abw +70 -0
  6. data/lib/economist/republican_nomination.pdf +0 -0
  7. data/lib/economist/saving_the_euro.odt +0 -0
  8. data/lib/economist/to_infinity_and_beyond.txt +15 -0
  9. data/lib/economist/zero_sum.html +91 -0
  10. data/lib/treat.rb +58 -72
  11. data/lib/treat/buildable.rb +59 -15
  12. data/lib/treat/categories.rb +26 -14
  13. data/lib/treat/category.rb +2 -2
  14. data/lib/treat/delegatable.rb +65 -48
  15. data/lib/treat/doable.rb +44 -0
  16. data/lib/treat/entities.rb +34 -14
  17. data/lib/treat/entities/collection.rb +2 -0
  18. data/lib/treat/entities/document.rb +3 -2
  19. data/lib/treat/entities/entity.rb +105 -90
  20. data/lib/treat/entities/phrases.rb +17 -0
  21. data/lib/treat/entities/tokens.rb +28 -13
  22. data/lib/treat/entities/zones.rb +20 -0
  23. data/lib/treat/extractors.rb +49 -11
  24. data/lib/treat/extractors/coreferences/stanford.rb +68 -0
  25. data/lib/treat/extractors/date/chronic.rb +32 -0
  26. data/lib/treat/extractors/date/ruby.rb +25 -0
  27. data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
  28. data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
  29. data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
  30. data/lib/treat/extractors/language/what_language.rb +49 -0
  31. data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
  32. data/lib/treat/extractors/roles/naive.rb +73 -0
  33. data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
  34. data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
  35. data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
  36. data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
  37. data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
  38. data/lib/treat/extractors/time/nickel.rb +30 -12
  39. data/lib/treat/extractors/topic_words/lda.rb +9 -9
  40. data/lib/treat/extractors/topics/reuters.rb +14 -15
  41. data/lib/treat/extractors/topics/reuters/region.xml +1 -0
  42. data/lib/treat/features.rb +7 -0
  43. data/lib/treat/formatters/readers/abw.rb +6 -1
  44. data/lib/treat/formatters/readers/autoselect.rb +5 -6
  45. data/lib/treat/formatters/readers/doc.rb +3 -1
  46. data/lib/treat/formatters/readers/html.rb +1 -1
  47. data/lib/treat/formatters/readers/image.rb +43 -0
  48. data/lib/treat/formatters/readers/odt.rb +1 -2
  49. data/lib/treat/formatters/readers/pdf.rb +9 -1
  50. data/lib/treat/formatters/readers/xml.rb +40 -0
  51. data/lib/treat/formatters/serializers/xml.rb +50 -14
  52. data/lib/treat/formatters/serializers/yaml.rb +7 -2
  53. data/lib/treat/formatters/unserializers/xml.rb +33 -7
  54. data/lib/treat/formatters/visualizers/dot.rb +90 -20
  55. data/lib/treat/formatters/visualizers/short_value.rb +2 -2
  56. data/lib/treat/formatters/visualizers/standoff.rb +2 -2
  57. data/lib/treat/formatters/visualizers/tree.rb +1 -1
  58. data/lib/treat/formatters/visualizers/txt.rb +13 -4
  59. data/lib/treat/group.rb +16 -10
  60. data/lib/treat/helpers/linguistics_loader.rb +18 -0
  61. data/lib/treat/inflectors.rb +10 -0
  62. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  63. data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
  64. data/lib/treat/inflectors/declensions/english.rb +319 -0
  65. data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
  66. data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
  67. data/lib/treat/install.rb +59 -0
  68. data/lib/treat/kernel.rb +18 -8
  69. data/lib/treat/languages.rb +18 -11
  70. data/lib/treat/languages/arabic.rb +4 -2
  71. data/lib/treat/languages/chinese.rb +6 -2
  72. data/lib/treat/languages/dutch.rb +16 -0
  73. data/lib/treat/languages/english.rb +47 -19
  74. data/lib/treat/languages/french.rb +8 -5
  75. data/lib/treat/languages/german.rb +9 -6
  76. data/lib/treat/languages/greek.rb +16 -0
  77. data/lib/treat/languages/italian.rb +6 -3
  78. data/lib/treat/languages/polish.rb +16 -0
  79. data/lib/treat/languages/portuguese.rb +16 -0
  80. data/lib/treat/languages/russian.rb +16 -0
  81. data/lib/treat/languages/spanish.rb +16 -0
  82. data/lib/treat/languages/swedish.rb +16 -0
  83. data/lib/treat/languages/tags.rb +377 -0
  84. data/lib/treat/lexicalizers.rb +34 -23
  85. data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
  86. data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
  87. data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
  88. data/lib/treat/lexicalizers/tag/brill.rb +35 -40
  89. data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
  90. data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
  91. data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
  92. data/lib/treat/processors.rb +8 -8
  93. data/lib/treat/processors/chunkers/txt.rb +4 -4
  94. data/lib/treat/processors/parsers/enju.rb +114 -99
  95. data/lib/treat/processors/parsers/stanford.rb +109 -41
  96. data/lib/treat/processors/segmenters/punkt.rb +17 -18
  97. data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
  98. data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
  99. data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
  100. data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
  101. data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
  102. data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
  103. data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
  104. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
  105. data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
  106. data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
  107. data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
  108. data/lib/treat/processors/segmenters/stanford.rb +38 -37
  109. data/lib/treat/processors/segmenters/tactful.rb +5 -4
  110. data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
  111. data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
  112. data/lib/treat/processors/tokenizers/perl.rb +2 -2
  113. data/lib/treat/processors/tokenizers/punkt.rb +6 -2
  114. data/lib/treat/processors/tokenizers/stanford.rb +25 -24
  115. data/lib/treat/processors/tokenizers/tactful.rb +1 -2
  116. data/lib/treat/proxies.rb +2 -35
  117. data/lib/treat/registrable.rb +17 -22
  118. data/lib/treat/sugar.rb +11 -11
  119. data/lib/treat/tree.rb +27 -17
  120. data/lib/treat/viewable.rb +29 -0
  121. data/lib/treat/visitable.rb +1 -1
  122. data/test/tc_entity.rb +56 -49
  123. data/test/tc_extractors.rb +41 -18
  124. data/test/tc_formatters.rb +7 -8
  125. data/test/tc_inflectors.rb +19 -24
  126. data/test/tc_lexicalizers.rb +12 -19
  127. data/test/tc_processors.rb +26 -12
  128. data/test/tc_resources.rb +2 -7
  129. data/test/tc_treat.rb +20 -22
  130. data/test/tc_tree.rb +4 -4
  131. data/test/tests.rb +3 -5
  132. data/test/texts.rb +13 -14
  133. data/tmp/INFO +1 -0
  134. metadata +78 -158
  135. data/bin/INFO +0 -1
  136. data/examples/benchmark.rb +0 -81
  137. data/examples/keywords.rb +0 -148
  138. data/lib/treat/detectors.rb +0 -31
  139. data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
  140. data/lib/treat/detectors/format/file.rb +0 -36
  141. data/lib/treat/detectors/language/what_language.rb +0 -29
  142. data/lib/treat/entities/constituents.rb +0 -15
  143. data/lib/treat/entities/sentence.rb +0 -8
  144. data/lib/treat/extractors/named_entity/abner.rb +0 -20
  145. data/lib/treat/extractors/named_entity/stanford.rb +0 -174
  146. data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
  147. data/lib/treat/extractors/time/chronic.rb +0 -20
  148. data/lib/treat/extractors/time/native.rb +0 -18
  149. data/lib/treat/formatters/readers/gocr.rb +0 -26
  150. data/lib/treat/formatters/readers/ocropus.rb +0 -31
  151. data/lib/treat/formatters/visualizers/html.rb +0 -13
  152. data/lib/treat/formatters/visualizers/inspect.rb +0 -20
  153. data/lib/treat/inflectors/declensions/en.rb +0 -18
  154. data/lib/treat/languages/categories.rb +0 -5
  155. data/lib/treat/languages/english/categories.rb +0 -23
  156. data/lib/treat/languages/english/tags.rb +0 -352
  157. data/lib/treat/languages/xinhua.rb +0 -12
  158. data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
  159. data/lib/treat/string.rb +0 -5
  160. data/test/tc_detectors.rb +0 -26
@@ -6,42 +6,53 @@ module Treat
6
6
  # Taggers return the part of speech tag of a word.
7
7
  module Tag
8
8
  extend Group
9
+ require 'treat/lexicalizers/tag/tagger'
9
10
  self.type = :annotator
10
11
  self.targets = [:word]
11
12
  end
13
+
14
+ # Return the general category of a word.
12
15
  module Category
13
16
  extend Group
14
17
  self.type = :annotator
15
- self.targets = [:phrase, :word]
16
-
17
- def self.cat(entity, category); category; end # Remove
18
- end
19
- # Linkers allow to retrieve grammatical links
20
- # between words.
21
- module Linkages
22
- extend Group
23
- self.type = :annotator
24
- self.targets = [:sentence, :word]
18
+ self.targets = [:word]
19
+ self.default = :from_tag
25
20
  end
21
+
26
22
  # Lexicons are dictionnaries of semantically linked
27
23
  # word forms.
28
24
  module Synsets
29
25
  extend Group
30
26
  self.type = :annotator
31
- self.targets = [:word, :number]
32
- def self.synonyms(entity, synsets)
33
- synsets.collect { |ss| ss.synonyms }.flatten - [entity.value]
34
- end
35
- def self.antonyms(entity, synsets)
36
- synsets.collect { |ss| ss.antonyms }.flatten
37
- end
38
- def self.hyponyms(entity, synsets)
39
- synsets.collect { |ss| ss.hyponyms }.flatten
40
- end
41
- def self.hypernyms(entity, synsets)
42
- synsets.collect { |ss| ss.hypernyms }.flatten
43
- end
27
+ self.targets = [:word]
28
+ self.postprocessors = {
29
+ :synonyms => lambda do |entity, synsets|
30
+ synsets.collect { |ss| ss.synonyms }.flatten -
31
+ [entity.value]
32
+ end,
33
+ :antonyms => lambda do |entity, synsets|
34
+ synsets.collect { |ss| ss.antonyms }.flatten
35
+ end,
36
+ :hyponyms => lambda do |entity, synsets|
37
+ synsets.collect { |ss| ss.hyponyms }.flatten
38
+ end,
39
+ :hypernyms => lambda do |entity, synsets|
40
+ synsets.collect { |ss| ss.hypernyms }.flatten
41
+ end
42
+ }
43
+ end
44
+
45
+ module Linkages
46
+ extend Group
47
+ self.type = :annotator
48
+ self.targets = [:zone]
49
+ self.presets = {
50
+ :is_a => {:linkage => :is_a},
51
+ :synonym_of => {:linkage => :synonym_of},
52
+ :antonym_of => {:linkage => :antonym_of}
53
+ }
44
54
  end
55
+
45
56
  extend Treat::Category
46
57
  end
47
58
  end
@@ -5,22 +5,29 @@ module Treat
5
5
  # using the default tagger for the language of the entity.
6
6
  class FromTag
7
7
  # Find the category of the current entity.
8
- #
8
+ #
9
9
  # Options:
10
- #
10
+ #
11
11
  # - (Symbol) :tagger => force the use of a tagger.
12
12
  def self.category(entity, options = {})
13
- tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
14
- lang = Treat::Languages.get(entity.language)
15
- cat = lang::WordTagToCategory[tag]
16
- if cat.nil?
17
- warn "Category not found for tag #{tag}."
18
- :unknown
13
+ tag = entity.tag(options[:tagger])
14
+ return :unknown if tag.nil? || tag == ''
15
+ return :sentence if tag == 'S'
16
+ if entity.is_a?(Treat::Entities::Phrase)
17
+ cat = Treat::Languages::Tags::PhraseTagToCategory[tag]
18
+ unless cat
19
+ cat = Treat::Languages::Tags::WordTagToCategory[tag]
20
+ end
21
+ elsif entity.is_a?(Treat::Entities::Word)
22
+ cat = Treat::Languages::Tags::WordTagToCategory[tag]
23
+ end
24
+ if cat == nil
25
+ warn "Category not found for tag '#{tag}'."
26
+ return :unknown
19
27
  else
20
28
  if cat.size == 1
21
- return cat[0]
29
+ return cat[entity.tag_set]
22
30
  else
23
- entity.set :tag_set, :penn
24
31
  if entity.has?(:tag_set)
25
32
  if cat[entity.tag_set]
26
33
  return cat[entity.tag_set]
@@ -2,60 +2,60 @@ module Treat
2
2
  module Lexicalizers
3
3
  module Linkages
4
4
  class Naive
5
+ # Fix - add options for sentences.
5
6
  def self.linkages(entity, options = {})
6
- linkage = options.delete(:linkage)
7
- if linkage.nil?
7
+ if options[:linkage] == :is_a ||
8
+ options[:linkage] == :hypernym_of
9
+
10
+ entity.each_word do |w1|
11
+ hypernyms = []
12
+ entity.each_word do |w2|
13
+ next if w1 == w2
14
+ if w2.hypernyms.include?(w1.value) ||
15
+ w1.hyponyms.include?(w2.value)
16
+ hypernyms << w1
17
+ w2.link(w1, :is_a)
18
+ w1.link(w2, :hypernym_of)
19
+ end
20
+ end
21
+ w1.set :hypernyms, hypernyms
22
+ end
23
+
24
+ elsif options[:linkage] == :synonym_of
25
+
26
+ entity.each_word do |w1|
27
+ synonyms = []
28
+ entity.each_word do |w2|
29
+ next if w1 == w2
30
+ if w2.synonyms.include?(w1.value)
31
+ synonyms << w1
32
+ w2.link(w1, :synonym_of)
33
+ w1.link(w2, :synonym_of)
34
+ end
35
+ end
36
+ w1.set :synonyms, synonyms
37
+ end
38
+
39
+ elsif options[:linkage] == :antonym_of
40
+
41
+ entity.each_word do |w1|
42
+ antonyms = []
43
+ entity.each_word do |w2|
44
+ next if w1 == w2
45
+ if w2.antonyms.include?(w1.value)
46
+ antonyms << w1
47
+ w2.link(w1, :antonym_of)
48
+ w1.link(w2, :antonym_of)
49
+ end
50
+ end
51
+ w1.set :antonyms, antonyms
52
+ end
53
+
54
+ else
8
55
  raise Treat::Exception,
9
- "You must supply the :linkage option."
56
+ "Invalid linkage option '#{options[:linkage]}'."
10
57
  end
11
- if !respond_to?(linkage)
12
- raise Treat::Exception,
13
- "No handler to resolve linkage #{linkage}."
14
- end
15
- self.send(linkage, entity, options)
16
- end
17
- # %%%
18
- def self.patient(entity, options)
19
- # Not so simple here... Fix
20
- if main_verb.has_feature?(:aux)
21
- subject
22
- elsif main_verb.voice == 'passive'
23
- subject
24
- elsif main_verb.voice == 'active'
25
- # Each prepos.
26
- end
27
- end
28
- # Return the subject of the sentence|verb.
29
- def self.subject(entity, options)
30
- verb = (entity.has?(:category) && entity.category == :verb) ?
31
- main_verb(entity) : entity.main_verb
32
- args = []
33
- main_verb.edges.each_pair do |id,edge|
34
- args << find(id)
35
- end
36
- args[0]
37
- end
38
- # Return the object of the sentence|verb.
39
- def self.object(entity, options)
40
- verb = (entity.has?(:category) && entity.category == :verb) ?
41
- main_verb(entity) : entity.main_verb
42
- if verb.voice == 'passive'
43
- return
44
- end
45
- args = []
46
- verb.edges.each_pair do |id,edge|
47
- args << find(id)
48
- end
49
- args[1]
50
- end
51
- # Find the main verb (shallowest verb in the tree).
52
- def self.main_verb(entity, options)
53
- verbs = entity.verbs
54
- if verbs.empty?
55
- return
56
- end
57
- verbs.sort! { |a,b| a.depth <=> b.depth }
58
- verbs[0]
58
+
59
59
  end
60
60
  end
61
61
  end
@@ -55,7 +55,11 @@ module Treat
55
55
  # The antonym sets of the synset.
56
56
  def antonyms; antonym.collect { |a| a.words }; end
57
57
  # The hypernym sets of the synset.
58
- def hypernyms; hypernym.words; end
58
+ def hypernyms;
59
+ h = hypernym
60
+ return [] unless h
61
+ h.words
62
+ end
59
63
  # The hyponym sets of the synset.
60
64
  def hyponyms; hyponym.collect { |h| h.words }; end
61
65
  # Respond to the missing method event.
@@ -4,47 +4,47 @@ module Treat
4
4
  # Adapter class for the 'rbtagger' gem, a port
5
5
  # of the Perl Lingua::BrillTagger class, based
6
6
  # on the rule-based tagger developped by Eric Brill.
7
- #
7
+ #
8
8
  # The Brill tagger is a simple rule-based part of
9
9
  # speech tagger. The main advantages over stochastic
10
10
  # taggers is a vast reduction in information required
11
11
  # and better portability from one tag set, corpus genre
12
12
  # or language to another.
13
- #
14
- # Original paper:
15
- # Eric Brill. 1992. A simple rule-based part of speech tagger.
16
- # In Proceedings of the third conference on Applied natural
17
- # language processing (ANLC '92). Association for Computational
18
- # Linguistics, Stroudsburg, PA, USA, 152-155.
13
+ #
14
+ # Original paper:
15
+ # Eric Brill. 1992. A simple rule-based part of speech tagger.
16
+ # In Proceedings of the third conference on Applied natural
17
+ # language processing (ANLC '92). Association for Computational
18
+ # Linguistics, Stroudsburg, PA, USA, 152-155.
19
19
  # DOI=10.3115/974499.974526 http://dx.doi.org/10.3115/974499.974526
20
- # Project website:
20
+ # Project website:
21
21
  # http://rbtagger.rubyforge.org/
22
- # Original Perl module site:
22
+ # Original Perl module site:
23
23
  # http://search.cpan.org/~kwilliams/Lingua-BrillTagger-0.02/lib/Lingua/BrillTagger.pm
24
- class Brill
24
+ class Brill < Tagger
25
25
  patch = false
26
26
  # Require the 'rbtagger' gem.
27
+ require 'rbtagger'
27
28
  begin
28
- silence_warnings { require 'rbtagger' }
29
- # This whole mess is required to deal with
30
- # the fact that the 'rbtagger' gem defines
31
- # a top-level module called 'Word', which
32
- # will clash with the top-level class 'Word'
33
- # we define when syntactic sugar is enabled.
29
+ # This whole mess is required to deal with
30
+ # the fact that the 'rbtagger' gem defines
31
+ # a top-level module called 'Word', which
32
+ # will clash with the top-level class 'Word'
33
+ # we define when syntactic sugar is enabled.
34
34
  rescue TypeError
35
- if Treat.edulcorated?
35
+ if Treat.sweetened?
36
36
  patch = true
37
37
  # Unset the class Word for the duration
38
38
  # of loading the tagger.
39
39
  Object.const_unset(:Word); retry
40
40
  else
41
41
  raise Treat::Exception,
42
- 'Something went wrong due to a name clash with the "rbtagger" gem.' +
42
+ 'Something went wrong due to a name clash with the "rbtagger" gem.' +
43
43
  'Turn off syntactic sugar to resolve this problem.'
44
44
  end
45
45
  ensure
46
46
  # Reset the class Word if using syntactic sugar.
47
- if Treat.edulcorated? && patch
47
+ if Treat.sweetened? && patch
48
48
  Object.const_set(:Word, Treat::Entities::Word)
49
49
  end
50
50
  end
@@ -55,38 +55,33 @@ module Treat
55
55
  # Tag words using a native Brill tagger.
56
56
  #
57
57
  # Options:
58
- #
58
+ #
59
59
  # :lexicon => String (Lexicon file to use)
60
60
  # :lexical_rules => String (Lexical rule file to use)
61
61
  # :contextual_rules => String (Contextual rules file to use)
62
62
  def self.tag(entity, options = {})
63
+ r = super(entity, options)
64
+ return r if r && r != :isolated_word
63
65
  # Reinitialize the tagger if the options have changed.
64
66
  @@tagger = nil if options != @@options
65
67
  # Create the tagger if necessary
66
68
  @@tagger ||= ::Brill::Tagger.new(options[:lexicon],
67
69
  options[:lexical_rules], options[:contextual_rules])
68
- entity.set :tag_set, :penn
69
- # Perform tagging.
70
- if entity.type == :word
71
- # Setup the context of the word
72
- l = entity.left
73
- r = entity.right
74
- l = l.nil? ? '' : l.to_s
75
- r = r.nil? ? '' : r.to_s
76
- c = "#{l} #{entity.value} #{r}"
77
- end
78
- res = @@tagger.tag(c)
79
- if l == ''
80
- unless r == ''
81
- entity.next_sibling.set(:tag, res[3][1])
70
+ words = (r == :isolated_word) ? [entity] : entity.tokens
71
+ res = @@tagger.tag(words.join(' '))[1..-1]
72
+ res ||= []
73
+ res.each do |info|
74
+ words.each do |word|
75
+ if word.value == info[0]
76
+ word.set :tag_set, :penn
77
+ word.set :tag, info[1]
78
+ return info[1] if r == :isolated_word
79
+ end
82
80
  end
83
- return res[2][1]
84
- else
85
- unless r == ''
86
- entity.next_sibling.set(:tag, res[2][1])
87
- end
88
- return res[1][1]
89
81
  end
82
+ entity.set :tag_set, :penn
83
+ return 'P' if entity.is_a?(Treat::Entities::Phrase)
84
+ return 'S' if entity.is_a?(Treat::Entities::Sentence)
90
85
  end
91
86
  end
92
87
  end
@@ -15,7 +15,7 @@ module Treat
15
15
  # Project website: http://engtagger.rubyforge.org/
16
16
  # Original Perl module site:
17
17
  # http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
18
- class Lingua
18
+ class Lingua < Tagger
19
19
  # Require the 'engtagger' gem.
20
20
  silence_warnings { require 'engtagger' }
21
21
  # Hold the tagger.
@@ -24,8 +24,8 @@ module Treat
24
24
  @@options = {}
25
25
  # Hold the default options.
26
26
  DefaultOptions = {
27
- unknown_word_tag: 'FW',
28
- relax: false
27
+ :unknown_word_tag => 'pp', # Fix unknown word tag
28
+ :relax => false
29
29
  }
30
30
  # Tag the word using a probabilistic model taking
31
31
  # into account known words found in a lexicon and
@@ -38,24 +38,29 @@ module Treat
38
38
  # particularly words used polysemously.
39
39
  # - (String) :unknown_word_tag => Tag for unknown words.
40
40
  def self.tag(entity, options = {})
41
+ options = DefaultOptions.merge(options)
42
+ r = super(entity, options)
43
+ return r if r && r != :isolated_word
41
44
  # Reinitialize the tagger if the options have changed.
42
45
  if options != @@options
43
46
  @@options = DefaultOptions.merge(options)
44
47
  @@tagger = nil # Reset the tagger
45
48
  end
46
49
  @@tagger ||= ::EngTagger.new(@@options)
47
- entity.set :tag_set, :penn
48
- left = entity.left
49
- if left.nil? || left.type != :word
50
- left_tag = 'pp'
51
- else
52
- left_tag = left.tag.downcase
53
- left_tag = 'pp' if left_tag == ''
50
+ left_tag = @@tagger.conf[:current_tag] = 'pp'
51
+ tokens = (r == :isolated_word) ? [entity] : entity.tokens
52
+ tokens.each do |token|
53
+ w = @@tagger.clean_word(token.to_s)
54
+ t = @@tagger.assign_tag(left_tag, w)
55
+ t = options[:unknown_word_tag] if t.nil? || t == ''
56
+ @@tagger.conf[:current_tag] = left_tag = t
57
+ token.set :tag, t.upcase
58
+ token.set :tag_set, :penn
59
+ return t.upcase if r == :isolated_word
54
60
  end
55
- w = @@tagger.clean_word(entity.to_s)
56
- t = @@tagger.conf[:current_tag] =
57
- @@tagger.assign_tag(left_tag, w)
58
- t.upcase
61
+ entity.set :tag_set, :penn
62
+ return 'P' if entity.is_a?(Treat::Entities::Phrase)
63
+ return 'S' if entity.is_a?(Treat::Entities::Sentence)
59
64
  end
60
65
  end
61
66
  end
@@ -1,85 +1,76 @@
1
1
  module Treat
2
2
  module Lexicalizers
3
3
  module Tag
4
- class Stanford
5
- # Require the Ruby-Java bridge.
6
- silence_warnings do
7
- require 'rjb'
8
- jar = "#{Treat.bin}/stanford-tagger*/stanford-postagger*.jar"
9
- jars = Dir.glob(jar)
10
- if jars.empty? || !File.readable?(jars[0])
11
- raise "Could not find stanford tagger JAR file (looking in #{jar})."+
12
- " You may need to manually download the JAR files and/or set Treat.bin."
13
- end
14
- Rjb::load(jars[0], ['-Xms256M', '-Xmx512M'])
15
- MaxentTagger = ::Rjb::import('edu.stanford.nlp.tagger.maxent.MaxentTagger')
16
- Word = ::Rjb::import('edu.stanford.nlp.ling.Word')
17
- List = ::Rjb::import('java.util.ArrayList')
18
- end
19
- # A list of models to use by language.
20
- # Other models are available; see the models/ folder
21
- # in the Stanford Tagger distribution files.
22
- LanguageToModel = {
23
- eng: 'english-left3words-distsim.tagger',
24
- ger: 'german-fast.tagger',
25
- fra: 'french.tagger',
26
- ara: 'arabic-fast.tagger',
27
- chi: 'chinese.tagger'
28
- }
4
+ class Stanford < Tagger
5
+ require 'stanford-core-nlp'
29
6
  # Hold one tagger per language.
30
7
  @@taggers = {}
31
- # Hold the user-set options for each language.
32
- @@options = {}
33
8
  # Hold the default options.
34
- DefaultOptions = {}
9
+ DefaultOptions = {
10
+ :tagger_model => nil,
11
+ :silence => false,
12
+ :log_to_file => nil
13
+ }
14
+ LanguageToTagSet = {
15
+ :eng => :penn,
16
+ :ger => :negra,
17
+ :chi => :penn_chinese,
18
+ :fre => :simple
19
+ }
35
20
  # Tag the word using one of the Stanford taggers.
36
21
  def self.tag(entity, options = {})
22
+ # Handle options and set models.
23
+ options = DefaultOptions.merge(options)
24
+ r = super(entity, options)
25
+ return r if r && r != :isolated_word
26
+ # Arrange options.
37
27
  lang = entity.language
38
- # Find the model.
39
- if options[:model]
40
- model = options[:model]
41
- else
42
- model = LanguageToModel[lang]
43
- if model.nil?
44
- raise Treat::Exception, "There exists no Stanford tagger model for " +
45
- "the #{Treat::Languages.describe(lang)} language ."
46
- end
28
+ @@tag_set = LanguageToTagSet[lang]
29
+ unless @@tag_set
30
+ warn "The tag set for the tagger you are requiring is not supported."
47
31
  end
48
- # Reinitialize the tagger if the options have changed.
49
- if options != @@options[lang]
50
- @@options[lang] = DefaultOptions.merge(options)
51
- @@taggers[lang] = nil # Reset the tagger
32
+
33
+ if options[:tagger_model]
34
+ ::StanfordCoreNLP.set_model(
35
+ 'pos.model', options[:tagger_model]
36
+ )
52
37
  end
53
- if @@taggers[lang].nil?
54
- model = "#{Treat.bin}/stanford-tagger*/models/#{model}"
55
- models = Dir.glob(model)
56
- if models.empty? || !File.readable?(models[0])
57
- raise "Could not find a tagger model for the " +
58
- "#{Treat::Languages.describe(lang)}: looking in #{model}."
59
- end
60
- silence_streams(STDOUT, STDERR) do
61
- @@taggers[lang] =
62
- MaxentTagger.new(models[0])
63
- end
38
+ if options[:silence]
39
+ options[:log_to_file] = '/dev/null'
64
40
  end
65
- entity.set :tag_set, :penn
66
- list = List.new
67
- id_list = {}
68
- i = 0
69
- [entity].each do |word| # Fix...
70
- list.add(Word.new(word.to_s))
71
- id_list[i] = word
72
- i += 1
41
+ if options[:log_to_file]
42
+ ::StanfordCoreNLP.log_file =
43
+ options[:log_to_file]
73
44
  end
74
- it = nil
75
- it = @@taggers[lang].apply(list).iterator
76
- i = 0
77
- while it.has_next
78
- w = it.next
79
- id_list[i].set :tag, w.tag
80
- i += 1
45
+
46
+ # Load the tagger.
47
+ StanfordCoreNLP.use(lang)
48
+ @@taggers[lang] ||= ::StanfordCoreNLP.load(:tokenize, :ssplit, :pos)
49
+ # Tag the text.
50
+ text = ::StanfordCoreNLP::Text.new(entity.to_s)
51
+ @@taggers[lang].annotate(text)
52
+ # Realign the tags.
53
+ entity.each_token do |t1|
54
+ text.get(:sentences).each do |sentence|
55
+ sentence.get(:tokens).each do |t2|
56
+ if t2.value == t1.value
57
+ tag = t2.get(:part_of_speech).to_s
58
+ tag_s, tag_opt = *tag.split('-')
59
+ tag_s ||= ''
60
+ t1.set :tag, tag_s
61
+ t1.set :tag_opt, tag_opt
62
+ t1.set :tag_set, @@tag_set if @@tag_set
63
+ return tag_s if r == :isolated_word
64
+ break
65
+ end
66
+ end
67
+ end
81
68
  end
82
- w.tag
69
+
70
+ # Handle tags for sentences and phrases.
71
+ entity.set :tag_set, @@tag_set if @@tag_set
72
+ return 'P' if entity.is_a?(Treat::Entities::Phrase)
73
+ return 'S' if entity.is_a?(Treat::Entities::Sentence)
83
74
  end
84
75
  end
85
76
  end