treat 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
@@ -0,0 +1,23 @@
1
+ module Treat
2
+ module Inflectors
3
+ module Stemmers
4
+ # Stems words using the 'ruby-stemmer' gem, which
5
+ # wraps a C version of the Porter stemming algorithm.
6
+ #
7
+ # Project website: https://github.com/aurelian/ruby-stemmer
8
+ # Original paper: Porter, 1980. An algorithm for suffix stripping,
9
+ # Program, Vol. 14, no. 3, pp 130-137,
10
+ # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
11
+ class PorterC
12
+ silently { require 'lingua/stemmer' }
13
+ ::LinguaStemmer = ::Lingua
14
+ Object.instance_eval { remove_const :Lingua }
15
+ # Stem the word using the Porter C algorithm.
16
+ # Options: none.
17
+ def self.stem(word, options = {})
18
+ silently { ::LinguaStemmer.stemmer(word.to_s) }
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,30 @@
1
+ module Treat
2
+ module Inflectors
3
+ module Stemmers
4
+ # Stems a word using the UEA algorithm, implemented
5
+ # by the 'uea-stemmer' gem.
6
+ #
7
+ # "Similar to other stemmers, UEA-Lite operates on a
8
+ # set of rules which are used as steps. There are two
9
+ # groups of rules: the first to clean the tokens, and
10
+ # the second to alter suffixes."
11
+ #
12
+ # Project website: https://github.com/ealdent/uea-stemmer
13
+ # Original paper: Jenkins, Marie-Claire, Smith, Dan,
14
+ # Conservative stemming for search and indexing, 2005.
15
+ # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
16
+ class UEA
17
+ # Require the 'uea-stemmer' gem.
18
+ silently { require 'uea-stemmer' }
19
+ # Keep only one copy of the stemmer.
20
+ @@stemmer = nil
21
+ # Stems a word using the UEA algorithm, implemented
22
+ # by the 'uea-stemmer' gem.
23
+ def self.stem(entity, options = {})
24
+ @@stemmer ||= silently { ::UEAStemmer.new }
25
+ @@stemmer.stem(entity.to_s).strip
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,49 @@
1
+ module Treat
2
+ # Lexicalizers allow the retrieval of lexical information
3
+ # (part of speech tag, synsets, hypersets, hyposets, etc.)
4
+ # of an entity.
5
+ module Lexicalizers
6
+ # Taggers return the part of speech tag of a word.
7
+ module Tag
8
+ extend Group
9
+ self.type = :annotator
10
+ self.targets = [:phrase, :word]
11
+ end
12
+ module Category
13
+ extend Group
14
+ self.type = :annotator
15
+ self.targets = [:phrase, :word]
16
+
17
+ def self.cat(entity, category); category; end # Remove
18
+ end
19
+ # Linkers allow to retrieve grammatical links
20
+ # between words.
21
+ module Linkages
22
+ extend Group
23
+ self.type = :annotator
24
+ self.targets = [:sentence, :word]
25
+ end
26
+ # Lexicons are dictionnaries of semantically linked
27
+ # word forms.
28
+ module Synsets
29
+ extend Group
30
+ self.type = :annotator
31
+ self.targets = [:word, :number]
32
+
33
+ def self.synonyms(entity, synsets)
34
+ synsets.collect { |ss| ss.synonyms }.flatten - [entity.value]
35
+ end
36
+ def self.antonyms(entity, synsets)
37
+ synsets.collect { |ss| ss.antonyms }.flatten
38
+ end
39
+ def self.hyponyms(entity, synsets)
40
+ synsets.collect { |ss| ss.hyponyms }.flatten
41
+ end
42
+ def self.hypernyms(entity, synsets)
43
+ synsets.collect { |ss| ss.hypernyms }.flatten
44
+ end
45
+
46
+ end
47
+ extend Treat::Category
48
+ end
49
+ end
@@ -0,0 +1,30 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Category
4
+ # A class that detects the category of a word from its tag,
5
+ # using the default tagger for the language of the entity.
6
+ class FromTag
7
+ # Find the category of the current entity.
8
+ # Options:
9
+ # :tagger => (Symbol) force the use of a tagger.
10
+ # :tag_to_cat => (Hash) a list of categories for each possible tag.
11
+ def self.category(entity, options = {})
12
+ if options.empty?
13
+ options = {
14
+ tagger: nil,
15
+ tag_to_cat: Treat::Resources::Tags::PTBWordTagToCategory
16
+ }
17
+ end
18
+ tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
19
+ cat = options[:tag_to_cat][tag]
20
+ if cat.nil?
21
+ warn "Category not found for tag #{tag}."
22
+ :unknown
23
+ else
24
+ cat
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,63 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Linkages
4
+ class Naive
5
+ def self.linkages(entity, options = {})
6
+ linkage = options.delete(:linkage)
7
+ if linkage.nil?
8
+ raise Treat::Exception,
9
+ "You must supply the :linkage option."
10
+ end
11
+ if !respond_to?(linkage)
12
+ raise Treat::Exception,
13
+ "No handler to resolve linkage #{linkage}."
14
+ end
15
+ self.send(linkage, entity, options)
16
+ end
17
+ # %%%
18
+ def self.patient(entity, options)
19
+ # Not so simple here... Fix
20
+ if main_verb.has_feature?(:aux)
21
+ subject
22
+ elsif main_verb.voice == 'passive'
23
+ subject
24
+ elsif main_verb.voice == 'active'
25
+ # Each prepos.
26
+ end
27
+ end
28
+ # Return the subject of the sentence|verb.
29
+ def self.subject(entity, options)
30
+ verb = entity.category == :verb ?
31
+ main_verb(entity) : entity.main_verb
32
+ args = []
33
+ main_verb.edges.each_pair do |id,edge|
34
+ args << find(id)
35
+ end
36
+ args[0]
37
+ end
38
+ # Return the object of the sentence|verb.
39
+ def self.object(entity, options)
40
+ verb = entity.category == :verb ?
41
+ main_verb(entity) : entity.main_verb
42
+ if verb.voice == 'passive'
43
+ return
44
+ end
45
+ args = []
46
+ verb.edges.each_pair do |id,edge|
47
+ args << find(id)
48
+ end
49
+ args[1]
50
+ end
51
+ # Find the main verb (shallowest verb in the tree).
52
+ def self.main_verb(entity, options)
53
+ verbs = entity.words_with_cat(:verb)
54
+ if verbs.empty?
55
+ return
56
+ end
57
+ verbs.sort! { |a,b| a.depth <=> b.depth }
58
+ verbs[0]
59
+ end
60
+ end
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,23 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Synsets
4
+ # Currently not implemented.
5
+ class RitaWn
6
+ # Require the Ruby-Java bridge (the 'rjb' gem).
7
+ #silently do
8
+ require 'rjb'
9
+ # Load the JWNL and RitaWN jars from under Treat.bin and import JWNLException and RiWordnet as constants.
10
+ Rjb::load("#{Treat.bin}/jwnl/jwnl.jar", [])
11
+ JWNLException = Rjb::import('net.didion.jwnl.JWNLException')
12
+ Rjb::load("#{Treat.bin}/ritaWN/library/ritaWN.jar", [])
13
+ Rjb::add_jar("#{Treat.bin}/ritaWN/library/supportWN.jar")
14
+ Rjb::add_jar("#{Treat.bin}/ritaWNcore1.0.jar")
15
+ RiWordnet = ::Rjb::import('rita.wordnet.RiWordnet')
16
+ #end
17
+ def self.synsets(word, options = nil)
18
+ # Not implemented yet: the body is empty, so the method returns nil.
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,72 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Synsets
4
+ # Obtain lexical information about a word using the
5
+ # ruby 'wordnet' gem.
6
+ class Wordnet
7
+ # Require the 'wordnet' gem.
8
+ require 'wordnet'
9
+ # Obtain lexical information about a word using the
10
+ # ruby 'wordnet' gem.
11
+ def self.synsets(word, options = nil)
12
+ unless [:noun, :adjective, :verb].include?(word.category)
13
+ return []
14
+ end
15
+ cat = word.category.to_s.capitalize
16
+ index = ::WordNet.const_get(cat + 'Index').instance
17
+ lemma = index.find(word.value.downcase)
18
+ return [] if lemma.nil?
19
+ synsets = []
20
+ lemma.synsets.each { |synset| synsets << Synset.new(synset) }
21
+ synsets
22
+ end
23
+ end
24
+ end
25
+ # An adaptor for synsets used by the Wordnet gem.
26
+ class Synset
27
+ # The POS tag of the word.
28
+ attr_accessor :pos
29
+ # The definition of the synset.
30
+ attr_accessor :definition
31
+ # The examples in the synset.
32
+ attr_accessor :examples
33
+ def initialize(synset)
34
+ @original_synset = synset
35
+ @pos, @definition, @examples =
36
+ parse_synset(synset.to_s.split(')'))
37
+ end
38
+ def parse_synset(res)
39
+ pos = res[0][1..-1].strip
40
+ res2 = res[1].split('(')
41
+ res3 = res2[1].split(';')
42
+ 1.upto(res3.size-1) do |i|
43
+ res3[i] = res3[i].strip[1..-2]
44
+ end
45
+ definition = res3[0]
46
+ examples = res3[1..-1]
47
+ return pos, definition, examples
48
+ end
49
+ # The words in the synset.
50
+ def words; @original_synset.words; end
51
+ def synonyms; @original_synset.words; end
52
+ # A gloss (short definition with examples)
53
+ # for the synset.
54
+ def gloss; @original_synset.gloss; end
55
+ # The antonym sets of the synset.
56
+ def antonyms; antonym.collect { |a| a.words }; end
57
+ # The hypernym sets of the synset.
58
+ def hypernyms; hypernym.words; end
59
+ # The hyponym sets of the synset.
60
+ def hyponyms; hyponym.collect { |h| h.words }; end
61
+ # Respond to the missing method event.
62
+ def method_missing(sym, *args, &block)
63
+ ret = @original_synset.send(sym)
64
+ if ret.is_a?(::WordNet::Synset)
65
+ Synset.new(ret)
66
+ else
67
+ ret
68
+ end
69
+ end
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,101 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Tag
4
+ # Adapter class for the 'rbtagger' gem, a port
5
+ # of the Perl Lingua::BrillTagger class, based
6
+ # on the rule-based tagger developed by Eric Brill.
7
+ #
8
+ # The Brill tagger is a simple rule-based part of
9
+ # speech tagger. The main advantages over stochastic
10
+ # taggers are a vast reduction in information required
11
+ # and better portability from one tag set, corpus genre
12
+ # or language to another.
13
+ #
14
+ # Original paper:
15
+ # Eric Brill. 1992. A simple rule-based part of speech tagger.
16
+ # In Proceedings of the third conference on Applied natural
17
+ # language processing (ANLC '92). Association for Computational
18
+ # Linguistics, Stroudsburg, PA, USA, 152-155.
19
+ # DOI=10.3115/974499.974526 http://dx.doi.org/10.3115/974499.974526
20
+ # Project website:
21
+ # http://rbtagger.rubyforge.org/
22
+ # Original Perl module site:
23
+ # http://search.cpan.org/~kwilliams/Lingua-BrillTagger-0.02/lib/Lingua/BrillTagger.pm
24
+ class Brill
25
+ patch = false
26
+ # Require the 'rbtagger' gem.
27
+ begin
28
+ silently { require 'rbtagger' }
29
+ # This whole mess is required to deal with
30
+ # the fact that the 'rbtagger' gem defines
31
+ # a top-level module called 'Word', which
32
+ # will clash with the top-level class 'Word'
33
+ # we define when syntactic sugar is enabled.
34
+ rescue TypeError
35
+ if Treat.edulcorated?
36
+ patch = true
37
+ # Unset the class Word for the duration
38
+ # of loading the tagger.
39
+ Object.const_unset(:Word); retry
40
+ else
41
+ raise Treat::Exception,
42
+ 'Something went wrong due to a name clash with the "rbtagger" gem.' +
43
+ 'Turn off syntactic sugar to resolve this problem.'
44
+ end
45
+ ensure
46
+ # Reset the class Word if using syntactic sugar.
47
+ if Treat.edulcorated? && patch
48
+ Object.const_set(:Word, Treat::Entities::Word)
49
+ end
50
+ end
51
+ # Hold the tagger.
52
+ @@tagger = nil
53
+ # Hold the user-set options
54
+ @@options = {}
55
+ # Hold the default options.
56
+ DefaultOptions = {
57
+ lexicon: nil,
58
+ lexical_rules: nil,
59
+ contextual_rules: nil
60
+ }
61
+ # Tag words using a native Brill tagger.
62
+ #
63
+ # Available options:
64
+ # :lexicon => String (Lexicon file to use)
65
+ # :lexical_rules => String (Lexical rule file to use)
66
+ # :contextual_rules => String (Contextual rules file to use)
67
+ def self.tag(entity, options = {})
68
+ # Reinitialize the tagger if the options have changed.
69
+ if options != @@options
70
+ @@options = DefaultOptions.merge(options)
71
+ @@tagger = nil # Reset the tagger
72
+ end
73
+ # Create the tagger if necessary
74
+ @@tagger ||= ::Brill::Tagger.new(options[:lexicon],
75
+ options[:lexical_rules], options[:contextual_rules])
76
+ # Perform tagging.
77
+ if entity.type == :word
78
+ # Setup the context of the word
79
+ l = entity.left
80
+ r = entity.right
81
+ l = l.nil? ? '' : l.to_s
82
+ r = r.nil? ? '' : r.to_s
83
+ c = "#{l} #{entity.value} #{r}"
84
+ end
85
+ res = @@tagger.tag(c)
86
+ if l == ''
87
+ unless r == ''
88
+ entity.next_sibling.set(:tag, res[3][1])
89
+ end
90
+ return res[2][1]
91
+ else
92
+ unless r == ''
93
+ entity.next_sibling.set(:tag, res[2][1])
94
+ end
95
+ return res[1][1]
96
+ end
97
+ end
98
+ end
99
+ end
100
+ end
101
+ end
@@ -0,0 +1,114 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Tag
4
+ # An adapter for the 'engtagger' gem, which
5
+ # is a port of the Perl Lingua::EN::Tagger module.
6
+ #
7
+ # "This module uses part-of-speech statistics from
8
+ # the Penn Treebank to assign POS tags to English text.
9
+ # The tagger applies a bigram (two-word) Hidden Markov
10
+ # Model to guess the appropriate POS tag for a word.
11
+ # That means that the tagger will try to assign a POS
12
+ # tag based on the known POS tags for a given word and
13
+ # the POS tag assigned to its predecessor."
14
+ #
15
+ # Project website: http://engtagger.rubyforge.org/
16
+ # Original Perl module site:
17
+ # http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
18
+ class Lingua
19
+ # Require the 'engtagger' gem.
20
+ silently { require 'engtagger' }
21
+ # Hold the tagger.
22
+ @@tagger = nil
23
+ # Hold the user-set options
24
+ @@options = {}
25
+ # Hold the default options.
26
+ DefaultOptions = {
27
+ unknown_word_tag: '?',
28
+ relax: false,
29
+ debug: false
30
+ }
31
+ # Tag the word using a probabilistic model taking
32
+ # into account known words found in a lexicon and
33
+ # the tag of the previous word.
34
+ #
35
+ # Options:
36
+ #
37
+ # :relax => (Boolean) Relax the Hidden Markov Model:
38
+ # this may improve accuracy for uncommon words,
39
+ # particularly words used polysemously.
40
+ # :debug => (Boolean) Print debug messages.
41
+ # :unknown_word_tag => (String) Tag for unknown words.
42
+ def self.tag(entity, options = {})
43
+ # Reinitialize the tagger if the options have changed.
44
+ if options != @@options
45
+ @@options = DefaultOptions.merge(options)
46
+ @@tagger = nil # Reset the tagger
47
+ end
48
+ @@tagger ||= ::EngTagger.new(@@options)
49
+ left = entity.left
50
+ if left.nil? || left.type != :word
51
+ left_tag = 'pp'
52
+ else
53
+ left_tag = left.tag.downcase
54
+ left_tag = 'pp' if left_tag == ''
55
+ end
56
+ w = @@tagger.clean_word(entity.to_s)
57
+ t = @@tagger.conf[:current_tag] =
58
+ @@tagger.assign_tag(left_tag, w)
59
+ t.upcase
60
+ end
61
+ end
62
+ end
63
+ end
64
+ end
65
+
66
+ =begin
67
+
68
+ CC Conjunction, coordinating and, or
69
+ CD Adjective, cardinal number 3, fifteen
70
+ DET Determiner this, each, some
71
+ EX Pronoun, existential there there
72
+ FW Foreign words
73
+ IN Preposition / Conjunction for, of, although, that
74
+ JJ Adjective happy, bad
75
+ JJR Adjective, comparative happier, worse
76
+ JJS Adjective, superlative happiest, worst
77
+ LS Symbol, list item A, A.
78
+ MD Verb, modal can, could, 'll
79
+ NN Noun aircraft, data
80
+ NNP Noun, proper London, Michael
81
+ NNPS Noun, proper, plural Australians, Methodists
82
+ NNS Noun, plural women, books
83
+ PDT Determiner, prequalifier quite, all, half
84
+ POS Possessive 's, '
85
+ PRP Determiner, possessive second mine, yours
86
+ PRPS Determiner, possessive their, your
87
+ RB Adverb often, not, very, here
88
+ RBR Adverb, comparative faster
89
+ RBS Adverb, superlative fastest
90
+ RP Adverb, particle up, off, out
91
+ SYM Symbol *
92
+ TO Preposition to
93
+ UH Interjection oh, yes, mmm
94
+ VB Verb, infinitive take, live
95
+ VBD Verb, past tense took, lived
96
+ VBG Verb, gerund taking, living
97
+ VBN Verb, past/passive participle taken, lived
98
+ VBP Verb, base present form take, live
99
+ VBZ Verb, present 3SG -s form takes, lives
100
+ WDT Determiner, question which, whatever
101
+ WP Pronoun, question who, whoever
102
+ WPS Determiner, possessive & question whose
103
+ WRB Adverb, question when, how, however
104
+
105
+ PP Punctuation, sentence ender ., !, ?
106
+ PPC Punctuation, comma ,
107
+ PPD Punctuation, dollar sign $
108
+ PPL Punctuation, quotation mark left ``
109
+ PPR Punctuation, quotation mark right ''
110
+ PPS Punctuation, colon, semicolon, ellipsis :, ..., -
111
+ LRB Punctuation, left bracket (, {, [
112
+ RRB Punctuation, right bracket ), }, ]
113
+
114
+ =end