treat 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
@@ -0,0 +1,23 @@
1
+ module Treat
2
+ module Inflectors
3
+ module Stemmers
4
# Stemmer backed by the 'ruby-stemmer' gem, which wraps
# a C version of the Porter stemming algorithm.
#
# Project website: https://github.com/aurelian/ruby-stemmer
# Original paper: Porter, 1980. An algorithm for suffix stripping,
# Program, Vol. 14, no. 3, pp 130-137.
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
class PorterC
  # Load the gem through the `silently` helper (defined elsewhere
  # in the project; presumably it mutes warnings - TODO confirm).
  silently { require 'lingua/stemmer' }
  # Alias the gem's top-level module as ::LinguaStemmer, then
  # remove the top-level ::Lingua constant itself. The order of
  # these two statements matters: the alias must exist first.
  ::LinguaStemmer = ::Lingua
  Object.send(:remove_const, :Lingua)

  # Stem a word with the Porter C algorithm.
  #
  # word    - anything responding to #to_s.
  # options - accepted for interface compatibility; none are used.
  #
  # Returns whatever ::LinguaStemmer.stemmer returns for the string.
  def self.stem(word, options = {})
    text = word.to_s
    silently { ::LinguaStemmer.stemmer(text) }
  end
end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,30 @@
1
+ module Treat
2
+ module Inflectors
3
+ module Stemmers
4
# Stemmer backed by the 'uea-stemmer' gem, an implementation
# of the UEA algorithm.
#
# "Similar to other stemmers, UEA-Lite operates on a
# set of rules which are used as steps. There are two
# groups of rules: the first to clean the tokens, and
# the second to alter suffixes."
#
# Project website: https://github.com/ealdent/uea-stemmer
# Original paper: Jenkins, Marie-Claire, Smith, Dan,
# Conservative stemming for search and indexing, 2005.
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
class UEA
  # Load the 'uea-stemmer' gem.
  silently { require 'uea-stemmer' }

  # Single shared stemmer instance, created on first use.
  @@stemmer = nil

  # Stem an entity with the UEA algorithm.
  #
  # entity  - anything responding to #to_s.
  # options - accepted for interface compatibility; none are used.
  #
  # Returns the stem with surrounding whitespace stripped.
  def self.stem(entity, options = {})
    unless @@stemmer
      @@stemmer = silently { ::UEAStemmer.new }
    end
    text = entity.to_s
    @@stemmer.stem(text).strip
  end
end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,49 @@
1
+ module Treat
2
# Lexicalizers retrieve lexical information about an entity
# (part of speech tag, synsets, hypersets, hyposets, etc.).
module Lexicalizers
  # Taggers return the part of speech tag of a word.
  module Tag
    extend Group
    self.type = :annotator
    self.targets = [:phrase, :word]
  end

  # Group for category annotators (targets phrases and words).
  module Category
    extend Group
    self.type = :annotator
    self.targets = [:phrase, :word]

    # Returns the category it is given, unchanged.
    # Marked for removal in the original source.
    def self.cat(entity, category)
      category
    end
  end

  # Linkers retrieve grammatical links between words.
  module Linkages
    extend Group
    self.type = :annotator
    self.targets = [:sentence, :word]
  end

  # Lexicons are dictionaries of semantically linked
  # word forms.
  module Synsets
    extend Group
    self.type = :annotator
    self.targets = [:word, :number]

    # All synonyms found in the synsets, flattened, minus the
    # entity's own value.
    def self.synonyms(entity, synsets)
      found = synsets.map { |synset| synset.synonyms }.flatten
      found - [entity.value]
    end

    # All antonyms found in the synsets, flattened.
    def self.antonyms(entity, synsets)
      synsets.map { |synset| synset.antonyms }.flatten
    end

    # All hyponyms found in the synsets, flattened.
    def self.hyponyms(entity, synsets)
      synsets.map { |synset| synset.hyponyms }.flatten
    end

    # All hypernyms found in the synsets, flattened.
    def self.hypernyms(entity, synsets)
      synsets.map { |synset| synset.hypernyms }.flatten
    end
  end

  extend Treat::Category
end
49
+ end
@@ -0,0 +1,30 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Category
4
# Detects the category of a word from its part of speech tag.
# Unless a tagger is forced through the options, the tag is
# obtained by calling entity.tag without arguments.
class FromTag
  # Find the category of the given entity.
  #
  # entity  - object responding to #tag (with an optional tagger argument).
  # options - Hash of options:
  #   :tagger     => (Symbol) force the use of a tagger.
  #   :tag_to_cat => (Hash) a category for each possible tag; defaults to
  #                  Treat::Resources::Tags::PTBWordTagToCategory.
  #
  # Each option falls back to its default independently, so supplying
  # only :tagger no longer leaves the tag table unset.
  #
  # Returns the category for the entity's tag, or :unknown (after
  # emitting a warning) when the table has no entry for that tag.
  def self.category(entity, options = {})
    tagger = options[:tagger]
    # Resolve the default lazily so the resource constant is only
    # needed when the caller did not provide a table.
    tag_to_cat = options[:tag_to_cat] ||
      Treat::Resources::Tags::PTBWordTagToCategory
    tag = tagger.nil? ? entity.tag : entity.tag(tagger)
    cat = tag_to_cat[tag]
    if cat.nil?
      warn "Category not found for tag #{tag}."
      return :unknown
    end
    cat
  end
end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,63 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Linkages
4
# Naive resolution of grammatical linkages (subject, object,
# patient, main verb). Work in progress in the original source.
class Naive
  # Dispatch to the handler named by the :linkage option.
  #
  # entity  - the entity to resolve the linkage for.
  # options - Hash; must contain :linkage (Symbol naming one of the
  #           public class methods below). The remaining options are
  #           passed on to the handler. The caller's hash is not modified.
  #
  # Raises Treat::Exception when :linkage is missing or when this
  # class does not respond to it.
  def self.linkages(entity, options = {})
    # Work on a copy so that the caller's hash keeps its :linkage key.
    options = options.dup
    linkage = options.delete(:linkage)
    if linkage.nil?
      raise Treat::Exception,
        "You must supply the :linkage option."
    end
    unless respond_to?(linkage)
      raise Treat::Exception,
        "No handler to resolve linkage #{linkage}."
    end
    send(linkage, entity, options)
  end

  # Return the patient of the sentence. Incomplete: the original
  # source marks this as "not so simple" and leaves the active
  # voice case unhandled (returns nil).
  def self.patient(entity, options = {})
    verb = main_verb(entity, options)
    return if verb.nil?
    if verb.has_feature?(:aux) || verb.voice == 'passive'
      subject(entity, options)
    elsif verb.voice == 'active'
      # Not implemented yet (the original notes "Each prepos.").
      nil
    end
  end

  # Return the subject of the sentence|verb: the first argument
  # attached to the verb's edges, or nil when there is no verb.
  def self.subject(entity, options = {})
    verb = verb_for(entity, options)
    return if verb.nil?
    arguments_of(verb)[0]
  end

  # Return the object of the sentence|verb: the second argument
  # attached to the verb's edges. Returns nil when there is no
  # verb or when the verb is in the passive voice.
  def self.object(entity, options = {})
    verb = verb_for(entity, options)
    return if verb.nil? || verb.voice == 'passive'
    arguments_of(verb)[1]
  end

  # Find the main verb (shallowest verb in the tree).
  # Returns nil when the entity contains no verb.
  def self.main_verb(entity, options = {})
    verbs = entity.words_with_cat(:verb)
    # min_by leaves the entity's word list untouched and
    # returns nil for an empty list.
    verbs.min_by { |verb| verb.depth }
  end

  # Pick the verb to work on, exactly as the original subject/object
  # code did: for a verb entity use main_verb on it, otherwise ask
  # the entity for its own main verb.
  def self.verb_for(entity, options)
    entity.category == :verb ?
      main_verb(entity, options) : entity.main_verb
  end

  # Resolve every edge of the verb into its target, in edge order.
  # NOTE(review): `find` is not defined in the visible source of this
  # class; it is kept as written and will raise NoMethodError unless
  # it is provided elsewhere - confirm the intended receiver.
  def self.arguments_of(verb)
    args = []
    verb.edges.each_pair do |id, _edge|
      args << find(id)
    end
    args
  end

  # Helpers must not be reachable through the :linkage dispatch.
  private_class_method :verb_for, :arguments_of
end
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,23 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Synsets
4
# Currently not implemented: the class only loads the Java
# libraries; synsets returns nil.
class RitaWn
  # Require the Ruby-Java bridge. The `silently` wrapper around
  # this loading code is commented out in the original source.
  require 'rjb'
  # Load the JWNL jar and import its exception class.
  Rjb.load("#{Treat.bin}/jwnl/jwnl.jar", [])
  JWNLException = Rjb.import('net.didion.jwnl.JWNLException')
  # Load the RitaWN jars.
  # NOTE(review): this is a second Rjb.load call; verify that Rjb
  # honours a load issued after the first one.
  Rjb.load("#{Treat.bin}/ritaWN/library/ritaWN.jar", [])
  [
    "#{Treat.bin}/ritaWN/library/supportWN.jar",
    "#{Treat.bin}/ritaWNcore1.0.jar"
  ].each { |jar| Rjb.add_jar(jar) }
  RiWordnet = ::Rjb.import('rita.wordnet.RiWordnet')

  # Placeholder; always returns nil.
  def self.synsets(word, options = nil)
    nil
  end
end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,72 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Synsets
4
# Looks up lexical information about a word through the
# ruby 'wordnet' gem.
class Wordnet
  # Require the 'wordnet' gem.
  require 'wordnet'

  # Retrieve the synsets of a word.
  #
  # word    - object responding to #category and #value.
  # options - unused.
  #
  # Returns an Array of Synset adaptors; empty when the word's
  # category is not noun, adjective or verb, or when the index
  # has no entry for the downcased value.
  def self.synsets(word, options = nil)
    supported = [:noun, :adjective, :verb]
    return [] unless supported.include?(word.category)
    # The index class is named after the category, e.g. NounIndex.
    index_name = word.category.to_s.capitalize + 'Index'
    index = ::WordNet.const_get(index_name).instance
    lemma = index.find(word.value.downcase)
    if lemma.nil?
      []
    else
      lemma.synsets.map { |raw| Synset.new(raw) }
    end
  end
end
24
+ end
25
+ # An adaptor for synsets used by the Wordnet gem.
26
+ class Synset
27
+ # The POS tag of the word.
28
+ attr_accessor :pos
29
+ # The definition of the synset.
30
+ attr_accessor :definition
31
+ # The examples in the synset.
32
+ attr_accessor :examples
33
+ def initialize(synset)
34
+ @original_synset = synset
35
+ @pos, @definition, @examples =
36
+ parse_synset(synset.to_s.split(')'))
37
+ end
38
+ def parse_synset(res)
39
+ pos = res[0][1..-1].strip
40
+ res2 = res[1].split('(')
41
+ res3 = res2[1].split(';')
42
+ 1.upto(res3.size-1) do |i|
43
+ res3[i] = res3[i].strip[1..-2]
44
+ end
45
+ definition = res3[0]
46
+ examples = res3[1..-1]
47
+ return pos, definition, examples
48
+ end
49
+ # The words in the synset.
50
+ def words; @original_synset.words; end
51
+ def synonyms; @original_synset.words; end
52
+ # A gloss (short definition with examples)
53
+ # for the synset.
54
+ def gloss; @original_synset.gloss; end
55
+ # The antonym sets of the synset.
56
+ def antonyms; antonym.collect { |a| a.words }; end
57
+ # The hypernym sets of the synset.
58
+ def hypernyms; hypernym.words; end
59
+ # The hyponym sets of the synset.
60
+ def hyponyms; hyponym.collect { |h| h.words }; end
61
+ # Respond to the missing method event.
62
+ def method_missing(sym, *args, &block)
63
+ ret = @original_synset.send(sym)
64
+ if ret.is_a?(::WordNet::Synset)
65
+ Synset.new(ret)
66
+ else
67
+ ret
68
+ end
69
+ end
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,101 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Tag
4
# Adapter class for the 'rbtagger' gem, a port
# of the Perl Lingua::BrillTagger class, based
# on the rule-based tagger developed by Eric Brill.
#
# The Brill tagger is a simple rule-based part of
# speech tagger. The main advantages over stochastic
# taggers are a vast reduction in information required
# and better portability from one tag set, corpus genre
# or language to another.
#
# Original paper:
# Eric Brill. 1992. A simple rule-based part of speech tagger.
# In Proceedings of the third conference on Applied natural
# language processing (ANLC '92). Association for Computational
# Linguistics, Stroudsburg, PA, USA, 152-155.
# DOI=10.3115/974499.974526 http://dx.doi.org/10.3115/974499.974526
# Project website:
# http://rbtagger.rubyforge.org/
# Original Perl module site:
# http://search.cpan.org/~kwilliams/Lingua-BrillTagger-0.02/lib/Lingua/BrillTagger.pm
class Brill
  # Set to true once the top-level Word constant has been
  # removed to let the gem load.
  patch = false
  # Require the 'rbtagger' gem.
  begin
    silently { require 'rbtagger' }
  # This whole mess is required to deal with
  # the fact that the 'rbtagger' gem defines
  # a top-level module called 'Word', which
  # will clash with the top-level class 'Word'
  # we define when syntactic sugar is enabled.
  rescue TypeError
    if Treat.edulcorated? && !patch
      patch = true
      # Unset the class Word for the duration of loading the
      # tagger. remove_const is private, hence the send. Only
      # one retry is attempted; a second TypeError is reported.
      Object.send(:remove_const, :Word)
      retry
    else
      raise Treat::Exception,
        'Something went wrong due to a name clash with the "rbtagger" gem. ' +
        'Turn off syntactic sugar to resolve this problem.'
    end
  ensure
    # Reset the class Word if using syntactic sugar.
    if Treat.edulcorated? && patch
      Object.const_set(:Word, Treat::Entities::Word)
    end
  end

  # Hold the tagger.
  @@tagger = nil
  # Hold the options (merged with the defaults) that the
  # current tagger was built with.
  @@options = {}
  # Hold the default options.
  DefaultOptions = {
    lexicon: nil,
    lexical_rules: nil,
    contextual_rules: nil
  }

  # Tag words using a native Brill tagger.
  #
  # Available options:
  # :lexicon => String (Lexicon file to use)
  # :lexical_rules => String (Lexical rule file to use)
  # :contextual_rules => String (Contextual rules file to use)
  #
  # Returns the tag found for the entity. As a side effect, when
  # the entity has a right neighbour, that neighbour's :tag is set
  # on entity.next_sibling.
  def self.tag(entity, options = {})
    # Reinitialize the tagger only if the effective options have
    # changed. The comparison is made on the merged hash, so
    # repeated calls with the same options reuse the tagger.
    merged = DefaultOptions.merge(options)
    if merged != @@options
      @@options = merged
      @@tagger = nil # Reset the tagger
    end
    # Create the tagger if necessary
    @@tagger ||= ::Brill::Tagger.new(
      @@options[:lexicon],
      @@options[:lexical_rules],
      @@options[:contextual_rules])
    # Perform tagging.
    # NOTE(review): for entities that are not words, no context is
    # built and nil is handed to the tagger, as in the original
    # code - confirm how phrases are meant to be handled.
    left = right = context = nil
    if entity.type == :word
      # Setup the context of the word
      left = entity.left
      right = entity.right
      left = left.nil? ? '' : left.to_s
      right = right.nil? ? '' : right.to_s
      context = "#{left} #{entity.value} #{right}"
    end
    res = @@tagger.tag(context)
    # The result indices below are kept exactly as in the original;
    # they depend on how the gem tokenizes the context string.
    if left == ''
      unless right == ''
        entity.next_sibling.set(:tag, res[3][1])
      end
      return res[2][1]
    else
      unless right == ''
        entity.next_sibling.set(:tag, res[2][1])
      end
      return res[1][1]
    end
  end
end
99
+ end
100
+ end
101
+ end
@@ -0,0 +1,114 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Tag
4
# An adapter for the 'engtagger' gem, which
# is a port of the Perl Lingua::EN::Tagger module.
#
# "This module uses part-of-speech statistics from
# the Penn Treebank to assign POS tags to English text.
# The tagger applies a bigram (two-word) Hidden Markov
# Model to guess the appropriate POS tag for a word.
# That means that the tagger will try to assign a POS
# tag based on the known POS tags for a given word and
# the POS tag assigned to its predecessor."
#
# Project website: http://engtagger.rubyforge.org/
# Original Perl module site:
# http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
class Lingua
  # Require the 'engtagger' gem.
  silently { require 'engtagger' }
  # Hold the tagger.
  @@tagger = nil
  # Hold the options (merged with the defaults) that the
  # current tagger was built with.
  @@options = {}
  # Hold the default options.
  DefaultOptions = {
    unknown_word_tag: '?',
    relax: false,
    debug: false
  }

  # Tag the word using a probabilistic model taking
  # into account known words found in a lexicon and
  # the tag of the previous word.
  #
  # Options:
  #
  # :relax => (Boolean) Relax the Hidden Markov Model:
  # this may improve accuracy for uncommon words,
  # particularly words used polysemously.
  # :debug => (Boolean) Print debug messages.
  # :unknown_word_tag => (String) Tag for unknown words.
  #
  # Returns the upcased tag assigned to the entity.
  def self.tag(entity, options = {})
    # Reinitialize the tagger only if the effective options have
    # changed. The comparison is made on the merged hash, so
    # repeated calls with the same options reuse the tagger.
    merged = DefaultOptions.merge(options)
    if merged != @@options
      @@options = merged
      @@tagger = nil # Reset the tagger
    end
    @@tagger ||= ::EngTagger.new(@@options)
    # The tag of the word to the left is the context; 'pp' is
    # used when there is no left word or when its tag is empty.
    left = entity.left
    if left.nil? || left.type != :word
      left_tag = 'pp'
    else
      left_tag = left.tag.downcase
      left_tag = 'pp' if left_tag == ''
    end
    w = @@tagger.clean_word(entity.to_s)
    # Record the assigned tag in the tagger's configuration
    # as its current tag, as the original code did.
    t = @@tagger.conf[:current_tag] =
      @@tagger.assign_tag(left_tag, w)
    t.upcase
  end
end
62
+ end
63
+ end
64
+ end
65
+
66
+ =begin
67
+
68
+ CC Conjunction, coordinating and, or
69
+ CD Adjective, cardinal number 3, fifteen
70
+ DET Determiner this, each, some
71
+ EX Pronoun, existential there there
72
+ FW Foreign words
73
+ IN Preposition / Conjunction for, of, although, that
74
+ JJ Adjective happy, bad
75
+ JJR Adjective, comparative happier, worse
76
+ JJS Adjective, superlative happiest, worst
77
+ LS Symbol, list item A, A.
78
+ MD Verb, modal can, could, 'll
79
+ NN Noun aircraft, data
80
+ NNP Noun, proper London, Michael
81
+ NNPS Noun, proper, plural Australians, Methodists
82
+ NNS Noun, plural women, books
83
+ PDT Determiner, prequalifier quite, all, half
84
+ POS Possessive 's, '
85
+ PRP Determiner, possessive second mine, yours
86
+ PRPS Determiner, possessive their, your
87
+ RB Adverb often, not, very, here
88
+ RBR Adverb, comparative faster
89
+ RBS Adverb, superlative fastest
90
+ RP Adverb, particle up, off, out
91
+ SYM Symbol *
92
+ TO Preposition to
93
+ UH Interjection oh, yes, mmm
94
+ VB Verb, infinitive take, live
95
+ VBD Verb, past tense took, lived
96
+ VBG Verb, gerund taking, living
97
+ VBN Verb, past/passive participle taken, lived
98
+ VBP Verb, base present form take, live
99
+ VBZ Verb, present 3SG -s form takes, lives
100
+ WDT Determiner, question which, whatever
101
+ WP Pronoun, question who, whoever
102
+ WPS Determiner, possessive & question whose
103
+ WRB Adverb, question when, how, however
104
+
105
+ PP Punctuation, sentence ender ., !, ?
106
+ PPC Punctuation, comma ,
107
+ PPD Punctuation, dollar sign $
108
+ PPL Punctuation, quotation mark left ``
109
+ PPR Punctuation, quotation mark right ''
110
+ PPS Punctuation, colon, semicolon, ellipsis :, ..., -
111
+ LRB Punctuation, left bracket (, {, [
112
+ RRB Punctuation, right bracket ), }, ]
113
+
114
+ =end