treat 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
module Treat
|
2
|
+
module Inflectors
|
3
|
+
module Stemmers
|
4
|
+
# Stems words using the 'ruby-stemmer' gem, which
|
5
|
+
# wraps a C version of the Porter stemming algorithm.
|
6
|
+
#
|
7
|
+
# Project website: https://github.com/aurelian/ruby-stemmer
|
8
|
+
# Original paper: Porter, 1980. An algorithm for suffix stripping,
|
9
|
+
# Program, Vol. 14, no. 3, pp 130-137,
|
10
|
+
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
11
|
+
class PorterC
  # Load the 'ruby-stemmer' gem (required as 'lingua/stemmer').
  silently do
    require 'lingua/stemmer'
  end
  # Keep a handle on the gem's top-level ::Lingua module under the
  # name ::LinguaStemmer, then drop the ::Lingua constant itself.
  ::LinguaStemmer = ::Lingua
  Object.send(:remove_const, :Lingua)

  # Stem the word using the Porter C algorithm.
  #
  # word    - anything responding to #to_s; its string form is stemmed.
  # options - accepted but not read by this method.
  #
  # Returns whatever ::LinguaStemmer.stemmer returns for the string.
  def self.stem(word, options = {})
    silently do
      ::LinguaStemmer.stemmer(word.to_s)
    end
  end
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Treat
|
2
|
+
module Inflectors
|
3
|
+
module Stemmers
|
4
|
+
# Stems a word using the UEA algorithm, implemented
|
5
|
+
# by the 'uea-stemmer' gem.
|
6
|
+
#
|
7
|
+
# "Similar to other stemmers, UEA-Lite operates on a
|
8
|
+
# set of rules which are used as steps. There are two
|
9
|
+
# groups of rules: the first to clean the tokens, and
|
10
|
+
# the second to alter suffixes."
|
11
|
+
#
|
12
|
+
# Project website: https://github.com/ealdent/uea-stemmer
|
13
|
+
# Original paper: Jenkins, Marie-Claire, Smith, Dan,
|
14
|
+
# Conservative stemming for search and indexing, 2005.
|
15
|
+
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
|
16
|
+
class UEA
  # Load the 'uea-stemmer' gem.
  silently do
    require 'uea-stemmer'
  end

  # Single shared stemmer instance, created on first use.
  @@stemmer = nil

  # Stem the string form of an entity with the UEA algorithm.
  #
  # entity  - anything responding to #to_s.
  # options - accepted but not read by this method.
  #
  # Returns the stem with surrounding whitespace stripped.
  def self.stem(entity, options = {})
    unless @@stemmer
      @@stemmer = silently { ::UEAStemmer.new }
    end
    stemmed = @@stemmer.stem(entity.to_s)
    stemmed.strip
  end
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Treat
|
2
|
+
# Lexicalizers allow the retrieval of lexical information
|
3
|
+
# (part of speech tag, synsets, hypersets, hyposets, etc.)
|
4
|
+
# of an entity.
|
5
|
+
module Lexicalizers
  # Taggers return the part-of-speech tag of a word.
  module Tag
    extend Group
    self.type = :annotator
    self.targets = [:phrase, :word]
  end

  # Category annotators; declared for phrases and words.
  module Category
    extend Group
    self.type = :annotator
    self.targets = [:phrase, :word]

    # Pass-through: returns the category it is given.
    # (Marked "Remove" by the original author.)
    def self.cat(entity, category)
      category
    end
  end

  # Linkers retrieve grammatical links between words.
  module Linkages
    extend Group
    self.type = :annotator
    self.targets = [:sentence, :word]
  end

  # Lexicons are dictionaries of semantically linked word forms.
  module Synsets
    extend Group
    self.type = :annotator
    self.targets = [:word, :number]

    # All synonyms found in the given synsets, flattened into one
    # list, with the entity's own value removed.
    def self.synonyms(entity, synsets)
      collected = synsets.map { |synset| synset.synonyms }
      collected.flatten - [entity.value]
    end

    # All antonyms found in the given synsets, flattened.
    def self.antonyms(entity, synsets)
      collected = synsets.map { |synset| synset.antonyms }
      collected.flatten
    end

    # All hyponyms found in the given synsets, flattened.
    def self.hyponyms(entity, synsets)
      collected = synsets.map { |synset| synset.hyponyms }
      collected.flatten
    end

    # All hypernyms found in the given synsets, flattened.
    def self.hypernyms(entity, synsets)
      collected = synsets.map { |synset| synset.hypernyms }
      collected.flatten
    end
  end

  extend Treat::Category
end
|
49
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Treat
|
2
|
+
module Lexicalizers
|
3
|
+
module Category
|
4
|
+
# A class that detects the category of a word from its tag,
|
5
|
+
# using the default tagger for the language of the entity.
|
6
|
+
class FromTag
  # Find the category of the current entity from its tag.
  #
  # Options:
  #   :tagger     => (Symbol) force the use of a tagger. When nil or
  #                  absent, entity.tag is called without arguments.
  #   :tag_to_cat => (Hash) a category for each possible tag. When
  #                  nil or absent, falls back to
  #                  Treat::Resources::Tags::PTBWordTagToCategory.
  #
  # Each option is defaulted on its own, so passing only :tagger
  # still gets the default tag-to-category map (previously this
  # raised NoMethodError on nil).
  #
  # Returns the category mapped to the tag, or :unknown (after
  # emitting a warning) when the map has no entry for the tag.
  def self.category(entity, options = {})
    tagger = options[:tagger]
    # Resolved lazily so the default map is only touched when needed.
    tag_to_cat = options[:tag_to_cat] ||
      Treat::Resources::Tags::PTBWordTagToCategory

    tag = tagger.nil? ? entity.tag : entity.tag(tagger)
    cat = tag_to_cat[tag]

    if cat.nil?
      warn "Category not found for tag #{tag}."
      return :unknown
    end
    cat
  end
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Treat
|
2
|
+
module Lexicalizers
|
3
|
+
module Linkages
|
4
|
+
# Naive resolution of grammatical linkages (subject, object, ...)
# based on the main verb of an entity and that verb's edges.
class Naive
  # Dispatch to the handler named by the :linkage option.
  #
  # Options:
  #   :linkage => (Symbol) name of a class method of Naive to call.
  #               It is removed from the options hash before the
  #               handler is invoked.
  #
  # Raises Treat::Exception if :linkage is missing or if no method
  # with that name is available on this class.
  def self.linkages(entity, options = {})
    linkage = options.delete(:linkage)
    if linkage.nil?
      raise Treat::Exception,
      "You must supply the :linkage option."
    end
    unless respond_to?(linkage)
      raise Treat::Exception,
      "No handler to resolve linkage #{linkage}."
    end
    send(linkage, entity, options)
  end

  # Return the patient of the main verb.
  #
  # Incomplete by the original author's own note: only the
  # auxiliary and passive-voice cases are handled (both yield the
  # subject); the active-voice case returns nil.
  def self.patient(entity, options = {})
    verb = main_verb(entity, options)
    return nil if verb.nil?
    if verb.has_feature?(:aux) || verb.voice == 'passive'
      subject(entity, options)
    end
  end

  # Return the subject of the sentence|verb: the first argument
  # collected from the verb's edges, or nil if no verb is found.
  def self.subject(entity, options = {})
    verb = entity.category == :verb ?
      main_verb(entity, options) : entity.main_verb
    return nil if verb.nil?
    args = []
    # NOTE(review): `find` is not defined anywhere in this class;
    # it is kept as written because the intended receiver cannot be
    # determined from this file. Confirm and qualify the call.
    verb.edges.each_pair do |id, _edge|
      args << find(id)
    end
    args[0]
  end

  # Return the object of the sentence|verb: the second argument
  # collected from the verb's edges. Returns nil when no verb is
  # found or when the verb is in the passive voice.
  def self.object(entity, options = {})
    verb = entity.category == :verb ?
      main_verb(entity, options) : entity.main_verb
    return nil if verb.nil?
    return nil if verb.voice == 'passive'
    args = []
    # NOTE(review): see the remark about `find` in .subject.
    verb.edges.each_pair do |id, _edge|
      args << find(id)
    end
    args[1]
  end

  # Find the main verb (shallowest verb in the tree), or nil when
  # the entity has no verbs.
  #
  # `options` now defaults to {} because the sibling handlers call
  # this with the entity only. The list returned by
  # entity.words_with_cat is no longer sorted in place.
  def self.main_verb(entity, options = {})
    entity.words_with_cat(:verb).min_by { |verb| verb.depth }
  end
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Treat
|
2
|
+
module Lexicalizers
|
3
|
+
module Synsets
|
4
|
+
# Currently not implemented.
|
5
|
+
class RitaWn
  # Require the Ruby-Java bridge. (The original author left the
  # surrounding `silently do ... end` wrapper commented out.)
  require 'rjb'

  # Build the path of a file located under Treat.bin.
  jar_path = lambda { |relative| "#{Treat.bin}/#{relative}" }

  # Load the JWNL and RitaWN jars, in this order, and import the
  # Java classes used by the adapter.
  Rjb.load(jar_path.call('jwnl/jwnl.jar'), [])
  JWNLException = Rjb.import('net.didion.jwnl.JWNLException')
  Rjb.load(jar_path.call('ritaWN/library/ritaWN.jar'), [])
  Rjb.add_jar(jar_path.call('ritaWN/library/supportWN.jar'))
  Rjb.add_jar(jar_path.call('ritaWNcore1.0.jar'))
  RiWordnet = ::Rjb.import('rita.wordnet.RiWordnet')

  class << self
    # Not implemented: the body is empty, so this returns nil.
    def synsets(word, options = nil)
    end
  end
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module Treat
|
2
|
+
module Lexicalizers
|
3
|
+
module Synsets
|
4
|
+
# Obtain lexical information about a word using the
|
5
|
+
# ruby 'wordnet' gem.
|
6
|
+
class Wordnet
  # Require the 'wordnet' gem.
  require 'wordnet'

  # Look a word up in the WordNet index matching its category.
  #
  # word    - must respond to #category and #value.
  # options - accepted but not read by this method.
  #
  # Returns an Array of Synset adaptors; empty when the word is not
  # a noun, adjective or verb, or when the index has no entry for
  # the downcased word value.
  def self.synsets(word, options = nil)
    return [] unless [:noun, :adjective, :verb].include?(word.category)

    # e.g. :noun -> "NounIndex"
    index_name = word.category.to_s.capitalize + 'Index'
    entry = ::WordNet.const_get(index_name).instance.find(word.value.downcase)
    return [] if entry.nil?

    wrapped = []
    entry.synsets.each do |raw_synset|
      wrapped << Synset.new(raw_synset)
    end
    wrapped
  end
end
|
24
|
+
end
|
25
|
+
# An adaptor for synsets used by the Wordnet gem.
|
26
|
+
class Synset
|
27
|
+
# The POS tag of the word.
|
28
|
+
attr_accessor :pos
|
29
|
+
# The definition of the synset.
|
30
|
+
attr_accessor :definition
|
31
|
+
# The examples in the synset.
|
32
|
+
attr_accessor :examples
|
33
|
+
def initialize(synset)
|
34
|
+
@original_synset = synset
|
35
|
+
@pos, @definition, @examples =
|
36
|
+
parse_synset(synset.to_s.split(')'))
|
37
|
+
end
|
38
|
+
def parse_synset(res)
|
39
|
+
pos = res[0][1..-1].strip
|
40
|
+
res2 = res[1].split('(')
|
41
|
+
res3 = res2[1].split(';')
|
42
|
+
1.upto(res3.size-1) do |i|
|
43
|
+
res3[i] = res3[i].strip[1..-2]
|
44
|
+
end
|
45
|
+
definition = res3[0]
|
46
|
+
examples = res3[1..-1]
|
47
|
+
return pos, definition, examples
|
48
|
+
end
|
49
|
+
# The words in the synset.
|
50
|
+
def words; @original_synset.words; end
|
51
|
+
def synonyms; @original_synset.words; end
|
52
|
+
# A gloss (short definition with examples)
|
53
|
+
# for the synset.
|
54
|
+
def gloss; @original_synset.gloss; end
|
55
|
+
# The antonym sets of the synset.
|
56
|
+
def antonyms; antonym.collect { |a| a.words }; end
|
57
|
+
# The hypernym sets of the synset.
|
58
|
+
def hypernyms; hypernym.words; end
|
59
|
+
# The hyponym sets of the synset.
|
60
|
+
def hyponyms; hyponym.collect { |h| h.words }; end
|
61
|
+
# Respond to the missing method event.
|
62
|
+
def method_missing(sym, *args, &block)
|
63
|
+
ret = @original_synset.send(sym)
|
64
|
+
if ret.is_a?(::WordNet::Synset)
|
65
|
+
Synset.new(ret)
|
66
|
+
else
|
67
|
+
ret
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
module Treat
|
2
|
+
module Lexicalizers
|
3
|
+
module Tag
|
4
|
+
# Adapter class for the 'rbtagger' gem, a port
|
5
|
+
# of the Perl Lingua::BrillTagger class, based
|
6
|
+
# on the rule-based tagger developed by Eric Brill.
|
7
|
+
#
|
8
|
+
# The Brill tagger is a simple rule-based part of
|
9
|
+
# speech tagger. The main advantages over stochastic
|
10
|
+
# taggers are a vast reduction in the information required
|
11
|
+
# and better portability from one tag set, corpus genre
|
12
|
+
# or language to another.
|
13
|
+
#
|
14
|
+
# Original paper:
|
15
|
+
# Eric Brill. 1992. A simple rule-based part of speech tagger.
|
16
|
+
# In Proceedings of the third conference on Applied natural
|
17
|
+
# language processing (ANLC '92). Association for Computational
|
18
|
+
# Linguistics, Stroudsburg, PA, USA, 152-155.
|
19
|
+
# DOI=10.3115/974499.974526 http://dx.doi.org/10.3115/974499.974526
|
20
|
+
# Project website:
|
21
|
+
# http://rbtagger.rubyforge.org/
|
22
|
+
# Original Perl module site:
|
23
|
+
# http://search.cpan.org/~kwilliams/Lingua-BrillTagger-0.02/lib/Lingua/BrillTagger.pm
|
24
|
+
class Brill
  # Require the 'rbtagger' gem.
  #
  # The 'rbtagger' gem defines a top-level module called 'Word',
  # which clashes (TypeError) with the top-level class 'Word' that
  # is defined when syntactic sugar is enabled. In that case the
  # top-level Word constant is removed for the duration of loading
  # the gem and set back to Treat::Entities::Word afterwards.
  begin
    silently { require 'rbtagger' }
  rescue TypeError
    unless Treat.edulcorated?
      raise Treat::Exception,
      'Something went wrong due to a name clash with the "rbtagger" gem. ' +
      'Turn off syntactic sugar to resolve this problem.'
    end
    # remove_const is private on Module, hence the send.
    # (Module has no `const_unset` method.)
    Object.send(:remove_const, :Word) if Object.const_defined?(:Word)
    begin
      silently { require 'rbtagger' }
    ensure
      # Reset the class Word used by the syntactic sugar.
      Object.const_set(:Word, Treat::Entities::Word)
    end
  end

  # Hold the tagger.
  @@tagger = nil
  # Hold the effective (defaults + user-set) options of the
  # current tagger.
  @@options = {}
  # Hold the default options.
  DefaultOptions = {
    lexicon: nil,
    lexical_rules: nil,
    contextual_rules: nil
  }

  # Tag words using a native Brill tagger.
  #
  # Available options:
  #   :lexicon => String (Lexicon file to use)
  #   :lexical_rules => String (Lexical rule file to use)
  #   :contextual_rules => String (Contextual rules file to use)
  #
  # For a word entity, the word is tagged together with its left
  # and right neighbours as context; when a right neighbour exists
  # its tag is stored on entity.next_sibling as a side effect.
  #
  # Returns the tag read from the tagger's result for the entity.
  def self.tag(entity, options = {})
    # Reinitialize the tagger only if the effective options have
    # changed. Comparing the merged hash (rather than the raw
    # options) keeps the tagger cached across identical calls.
    merged = DefaultOptions.merge(options)
    if merged != @@options
      @@options = merged
      @@tagger = nil # Reset the tagger
    end
    # Create the tagger if necessary
    @@tagger ||= ::Brill::Tagger.new(@@options[:lexicon],
    @@options[:lexical_rules], @@options[:contextual_rules])

    # Perform tagging.
    # NOTE(review): for entities that are not words, l, r and c
    # stay nil and the tagger is called with nil, exactly as
    # before; the intended handling of such entities is not
    # visible in this file.
    l = r = c = nil
    if entity.type == :word
      # Setup the context of the word
      l = entity.left
      r = entity.right
      l = l.nil? ? '' : l.to_s
      r = r.nil? ? '' : r.to_s
      c = "#{l} #{entity.value} #{r}"
    end
    res = @@tagger.tag(c)

    # NOTE(review): the result indices below are kept as written;
    # they depend on how the gem tokenizes the context string,
    # which cannot be confirmed from this file.
    if l == ''
      unless r == ''
        entity.next_sibling.set(:tag, res[3][1])
      end
      return res[2][1]
    else
      unless r == ''
        entity.next_sibling.set(:tag, res[2][1])
      end
      return res[1][1]
    end
  end
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
module Treat
|
2
|
+
module Lexicalizers
|
3
|
+
module Tag
|
4
|
+
# An adapter for the 'engtagger' gem, which
|
5
|
+
# is a port of the Perl Lingua::EN::Tagger module.
|
6
|
+
#
|
7
|
+
# "This module uses part-of-speech statistics from
|
8
|
+
# the Penn Treebank to assign POS tags to English text.
|
9
|
+
# The tagger applies a bigram (two-word) Hidden Markov
|
10
|
+
# Model to guess the appropriate POS tag for a word.
|
11
|
+
# That means that the tagger will try to assign a POS
|
12
|
+
# tag based on the known POS tags for a given word and
|
13
|
+
# the POS tag assigned to its predecessor."
|
14
|
+
#
|
15
|
+
# Project website: http://engtagger.rubyforge.org/
|
16
|
+
# Original Perl module site:
|
17
|
+
# http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
|
18
|
+
class Lingua
  # Require the 'engtagger' gem.
  silently { require 'engtagger' }

  # Hold the tagger.
  @@tagger = nil
  # Hold the options the current tagger is built from.
  @@options = {}
  # Hold the raw options of the call that last (re)configured the
  # tagger, so identical calls can reuse the cached tagger.
  @@requested = {}
  # Hold the default options.
  DefaultOptions = {
    unknown_word_tag: '?',
    relax: false,
    debug: false
  }

  # Tag the word using a probabilistic model taking
  # into account known words found in a lexicon and
  # the tag of the previous word.
  #
  # Options:
  #
  #   :relax => (Boolean) Relax the Hidden Markov Model:
  #   this may improve accuracy for uncommon words,
  #   particularly words used polysemously.
  #   :debug => (Boolean) Print debug messages.
  #   :unknown_word_tag => (String) Tag for unknown words.
  #
  # Returns the upcased tag assigned by the tagger.
  def self.tag(entity, options = {})
    # Reinitialize the tagger only if the options have changed
    # since the last call. Previously the raw options were compared
    # with the merged hash, which never matched for partial option
    # sets, so the tagger was rebuilt on every such call. The
    # options passed to EngTagger.new are unchanged.
    if options != @@requested
      @@requested = options.dup
      @@options = DefaultOptions.merge(options)
      @@tagger = nil # Reset the tagger
    end
    @@tagger ||= ::EngTagger.new(@@options)

    # Tag of the word on the left; 'pp' is used when there is no
    # left word or when its tag is empty.
    left = entity.left
    if left.nil? || left.type != :word
      left_tag = 'pp'
    else
      left_tag = left.tag.downcase
      left_tag = 'pp' if left_tag == ''
    end

    w = @@tagger.clean_word(entity.to_s)
    # The assigned tag is also stored in the tagger's configuration
    # under :current_tag.
    t = @@tagger.conf[:current_tag] =
      @@tagger.assign_tag(left_tag, w)
    t.upcase
  end
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
=begin
|
67
|
+
|
68
|
+
CC Conjunction, coordinating and, or
|
69
|
+
CD Adjective, cardinal number 3, fifteen
|
70
|
+
DET Determiner this, each, some
|
71
|
+
EX Pronoun, existential there there
|
72
|
+
FW Foreign words
|
73
|
+
IN Preposition / Conjunction for, of, although, that
|
74
|
+
JJ Adjective happy, bad
|
75
|
+
JJR Adjective, comparative happier, worse
|
76
|
+
JJS Adjective, superlative happiest, worst
|
77
|
+
LS Symbol, list item A, A.
|
78
|
+
MD Verb, modal can, could, 'll
|
79
|
+
NN Noun aircraft, data
|
80
|
+
NNP Noun, proper London, Michael
|
81
|
+
NNPS Noun, proper, plural Australians, Methodists
|
82
|
+
NNS Noun, plural women, books
|
83
|
+
PDT Determiner, prequalifier quite, all, half
|
84
|
+
POS Possessive 's, '
|
85
|
+
PRP Determiner, possessive second mine, yours
|
86
|
+
PRPS Determiner, possessive their, your
|
87
|
+
RB Adverb often, not, very, here
|
88
|
+
RBR Adverb, comparative faster
|
89
|
+
RBS Adverb, superlative fastest
|
90
|
+
RP Adverb, particle up, off, out
|
91
|
+
SYM Symbol *
|
92
|
+
TO Preposition to
|
93
|
+
UH Interjection oh, yes, mmm
|
94
|
+
VB Verb, infinitive take, live
|
95
|
+
VBD Verb, past tense took, lived
|
96
|
+
VBG Verb, gerund taking, living
|
97
|
+
VBN Verb, past/passive participle taken, lived
|
98
|
+
VBP Verb, base present form take, live
|
99
|
+
VBZ Verb, present 3SG -s form takes, lives
|
100
|
+
WDT Determiner, question which, whatever
|
101
|
+
WP Pronoun, question who, whoever
|
102
|
+
WPS Determiner, possessive & question whose
|
103
|
+
WRB Adverb, question when, how, however
|
104
|
+
|
105
|
+
PP Punctuation, sentence ender ., !, ?
|
106
|
+
PPC Punctuation, comma ,
|
107
|
+
PPD Punctuation, dollar sign $
|
108
|
+
PPL Punctuation, quotation mark left ``
|
109
|
+
PPR Punctuation, quotation mark right ''
|
110
|
+
PPS Punctuation, colon, semicolon, ellipsis :, ..., -
|
111
|
+
LRB Punctuation, left bracket (, {, [
|
112
|
+
RRB Punctuation, right bracket ), }, ]
|
113
|
+
|
114
|
+
=end
|