treat 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
module Treat
  module Inflectors
    module Stemmers
      # Stemmer backed by the 'ruby-stemmer' gem, which
      # wraps a C version of the Porter stemming algorithm.
      #
      # Project website: https://github.com/aurelian/ruby-stemmer
      # Original paper: Porter, 1980. An algorithm for suffix stripping,
      # Program, Vol. 14, no. 3, pp 130-137.
      # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
      class PorterC
        # Load the gem inside the project's `silently` helper
        # (defined elsewhere; presumably it mutes warnings).
        silently do
          require 'lingua/stemmer'
        end
        # Keep the gem's top-level module reachable under the name
        # LinguaStemmer, then remove the top-level Lingua constant.
        ::LinguaStemmer = ::Lingua
        Object.send(:remove_const, :Lingua)
        # Stem the word using the Porter C algorithm.
        # The word is converted to a string before stemming.
        # Options: none.
        def self.stem(word, options = {})
          text = word.to_s
          silently { ::LinguaStemmer.stemmer(text) }
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Treat
  module Inflectors
    module Stemmers
      # Stemmer based on the UEA algorithm, as implemented
      # by the 'uea-stemmer' gem.
      #
      # "Similar to other stemmers, UEA-Lite operates on a
      # set of rules which are used as steps. There are two
      # groups of rules: the first to clean the tokens, and
      # the second to alter suffixes."
      #
      # Project website: https://github.com/ealdent/uea-stemmer
      # Original paper: Jenkins, Marie-Claire, Smith, Dan,
      # Conservative stemming for search and indexing, 2005.
      # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
      class UEA
        # Require the 'uea-stemmer' gem.
        silently do
          require 'uea-stemmer'
        end
        # Single shared stemmer, created on first use.
        @@stemmer = nil
        # Stem the string form of the entity with the UEA
        # algorithm and return the result stripped of
        # surrounding whitespace.
        def self.stem(entity, options = {})
          @@stemmer = silently { ::UEAStemmer.new } unless @@stemmer
          stemmed = @@stemmer.stem(entity.to_s)
          stemmed.strip
        end
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Treat
  # Lexicalizers allow the retrieval of lexical information
  # (part of speech tag, synsets, hypersets, hyposets, etc.)
  # about an entity.
  module Lexicalizers
    # Taggers return the part of speech tag of a word.
    module Tag
      extend Group
      self.type = :annotator
      self.targets = [:phrase, :word]
    end
    # Group of annotators declared for phrases and words.
    module Category
      extend Group
      self.type = :annotator
      self.targets = [:phrase, :word]

      # Return the supplied category unchanged.
      def self.cat(entity, category) # Remove
        category
      end
    end
    # Linkers allow the retrieval of grammatical links
    # between words.
    module Linkages
      extend Group
      self.type = :annotator
      self.targets = [:sentence, :word]
    end
    # Lexicons are dictionaries of semantically linked
    # word forms.
    module Synsets
      extend Group
      self.type = :annotator
      self.targets = [:word, :number]

      # Flattened synonyms of every synset, minus the
      # entity's own value.
      def self.synonyms(entity, synsets)
        gathered = synsets.map { |set| set.synonyms }
        gathered.flatten - [entity.value]
      end
      # Flattened antonyms of every synset.
      def self.antonyms(entity, synsets)
        gathered = synsets.map { |set| set.antonyms }
        gathered.flatten
      end
      # Flattened hyponyms of every synset.
      def self.hyponyms(entity, synsets)
        gathered = synsets.map { |set| set.hyponyms }
        gathered.flatten
      end
      # Flattened hypernyms of every synset.
      def self.hypernyms(entity, synsets)
        gathered = synsets.map { |set| set.hypernyms }
        gathered.flatten
      end

    end
    extend Treat::Category
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Treat
  module Lexicalizers
    module Category
      # A class that detects the category of a word from its tag,
      # using the default tagger for the language of the entity
      # unless a tagger is forced through the options.
      class FromTag
        # Find the category of the given entity.
        #
        # Options:
        #   :tagger => (Symbol) force the use of a tagger.
        #   :tag_to_cat => (Hash) a list of categories for each possible tag.
        #
        # Each option falls back to its default independently: no
        # forced tagger, and the PTBWordTagToCategory table. Supplying
        # only :tagger therefore no longer leaves the lookup table nil.
        #
        # Returns the category found for the entity's tag, or :unknown
        # (after emitting a warning) when the table has no entry.
        def self.category(entity, options = {})
          options ||= {}
          tagger = options[:tagger]
          # The default table is only resolved when none is supplied.
          tag_to_cat = options[:tag_to_cat] ||
            Treat::Resources::Tags::PTBWordTagToCategory
          tag = tagger.nil? ? entity.tag : entity.tag(tagger)
          cat = tag_to_cat[tag]
          if cat.nil?
            warn "Category not found for tag #{tag}."
            :unknown
          else
            cat
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Treat
  module Lexicalizers
    module Linkages
      # Resolves grammatical linkages (subject, object, patient,
      # main verb) by dispatching on the :linkage option.
      class Naive
        # Resolve the linkage named by options[:linkage].
        #
        # Options:
        #   :linkage => (Symbol) name of the handler to call
        #   (e.g. :subject, :object, :patient, :main_verb).
        #
        # The remaining options are passed on to the handler.
        # Raises Treat::Exception when :linkage is missing or
        # when this class has no public handler of that name.
        def self.linkages(entity, options = {})
          # Work on a copy so the caller's hash is not modified.
          options = options.dup
          linkage = options.delete(:linkage)
          if linkage.nil?
            raise Treat::Exception,
            "You must supply the :linkage option."
          end
          unless respond_to?(linkage)
            raise Treat::Exception,
            "No handler to resolve linkage #{linkage}."
          end
          send(linkage, entity, options)
        end
        # Return the patient of the sentence|verb: the subject
        # when the verb has the :aux feature or is in the passive
        # voice. The active voice case is not implemented yet
        # and yields nil.
        def self.patient(entity, options)
          verb = verb_for(entity, options)
          return if verb.nil?
          # Not so simple here... Fix
          if verb.has_feature?(:aux) || verb.voice == 'passive'
            subject(entity, options)
          elsif verb.voice == 'active'
            # Each prepos.
            nil
          end
        end
        # Return the subject of the sentence|verb, taken as the
        # first argument linked to the verb (nil without a verb).
        def self.subject(entity, options)
          verb = verb_for(entity, options)
          return if verb.nil?
          arguments_of(verb)[0]
        end
        # Return the object of the sentence|verb, taken as the
        # second argument linked to the verb. Returns nil when
        # there is no verb or when the verb is passive.
        def self.object(entity, options)
          verb = verb_for(entity, options)
          return if verb.nil? || verb.voice == 'passive'
          arguments_of(verb)[1]
        end
        # Find the main verb (shallowest verb in the tree).
        # Returns nil when the entity has no verb. The options
        # are accepted for handler compatibility and are unused.
        def self.main_verb(entity, options = {})
          entity.words_with_cat(:verb).min_by { |verb| verb.depth }
        end
        # Pick the verb to work with, keeping the original rule:
        # search the entity's own verbs when its category is :verb,
        # otherwise ask the entity for its main verb.
        # NOTE(review): entity.main_verb is defined outside this
        # file - confirm it exists and what it returns.
        def self.verb_for(entity, options)
          if entity.category == :verb
            main_verb(entity, options)
          else
            entity.main_verb
          end
        end
        # Collect the nodes referenced by the verb's edges, in
        # iteration order.
        # NOTE(review): `find` is not defined in this class; it is
        # presumably meant to look a node up by id in the tree -
        # confirm the intended receiver.
        def self.arguments_of(verb)
          args = []
          verb.edges.each_pair do |id, _edge|
            args << find(id)
          end
          args
        end
        # Keep the helpers out of reach of the :linkage dispatch.
        private_class_method :verb_for, :arguments_of
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Treat
  module Lexicalizers
    module Synsets
      # Placeholder for a RitaWN-backed synset lookup.
      # Currently not implemented.
      class RitaWn
        # Require the Ruby-Java bridge.
        #silently do
        require 'rjb'
        # Load the JWNL jar and import its exception class.
        Rjb.load("#{Treat.bin}/jwnl/jwnl.jar", [])
        JWNLException = Rjb.import('net.didion.jwnl.JWNLException')
        # Load the RitaWN jars and import the RiWordnet class.
        Rjb.load("#{Treat.bin}/ritaWN/library/ritaWN.jar", [])
        Rjb.add_jar("#{Treat.bin}/ritaWN/library/supportWN.jar")
        Rjb.add_jar("#{Treat.bin}/ritaWNcore1.0.jar")
        RiWordnet = ::Rjb.import('rita.wordnet.RiWordnet')
        #end
        # Not implemented: always returns nil.
        def self.synsets(word, options = nil)
          nil
        end
      end
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module Treat
  module Lexicalizers
    module Synsets
      # Obtain lexical information about a word using the
      # ruby 'wordnet' gem.
      class Wordnet
        # Require the 'wordnet' gem.
        require 'wordnet'
        # Return the synsets of a word as an array of Synset
        # adaptors. The array is empty when the word's category
        # is not :noun, :adjective or :verb, or when the index
        # for that category has no entry for the downcased value.
        def self.synsets(word, options = nil)
          category = word.category
          unless [:noun, :adjective, :verb].include?(category)
            return []
          end
          # E.g. :noun selects ::WordNet::NounIndex.
          index_name = category.to_s.capitalize + 'Index'
          index = ::WordNet.const_get(index_name).instance
          lemma = index.find(word.value.downcase)
          return [] if lemma.nil?
          lemma.synsets.map { |synset| Synset.new(synset) }
        end
      end
    end
    # An adaptor for synsets used by the Wordnet gem.
    class Synset
      # The POS tag of the word.
      attr_accessor :pos
      # The definition of the synset.
      attr_accessor :definition
      # The examples in the synset.
      attr_accessor :examples
      # Wrap a synset and extract pos, definition and examples
      # from its string representation.
      def initialize(synset)
        @original_synset = synset
        @pos, @definition, @examples =
          parse_synset(synset.to_s.split(')'))
      end
      # Extract [pos, definition, examples] from the string form
      # of a synset, previously split on ')'.
      # NOTE(review): relies on the exact layout of the gem's
      # string output - confirm against the gem version in use.
      def parse_synset(res)
        pos = res[0][1..-1].strip
        res2 = res[1].split('(')
        res3 = res2[1].split(';')
        definition = res3[0]
        examples = res3[1..-1]
        # Drop the first and last character of each stripped example.
        examples = examples.map { |ex| ex.strip[1..-2] } if examples
        return pos, definition, examples
      end
      # The words in the synset.
      def words; @original_synset.words; end
      # The synonyms are the words of the synset.
      def synonyms; @original_synset.words; end
      # A gloss (short definition with examples)
      # for the synset.
      def gloss; @original_synset.gloss; end
      # The antonym sets of the synset.
      def antonyms; antonym.collect { |a| a.words }; end
      # The hypernym sets of the synset.
      def hypernyms; hypernym.words; end
      # The hyponym sets of the synset.
      def hyponyms; hyponym.collect { |h| h.words }; end
      # Forward unknown messages, with their arguments and block,
      # to the wrapped synset. A result that is itself a
      # ::WordNet::Synset is wrapped in a new adaptor.
      def method_missing(sym, *args, &block)
        ret = @original_synset.send(sym, *args, &block)
        if ret.is_a?(::WordNet::Synset)
          Synset.new(ret)
        else
          ret
        end
      end
      # Keep respond_to? consistent with method_missing.
      def respond_to_missing?(sym, include_private = false)
        @original_synset.respond_to?(sym, include_private) || super
      end
    end
  end
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
module Treat
  module Lexicalizers
    module Tag
      # Adapter class for the 'rbtagger' gem, a port
      # of the Perl Lingua::BrillTagger class, based
      # on the rule-based tagger developed by Eric Brill.
      #
      # The Brill tagger is a simple rule-based part of
      # speech tagger. The main advantages over stochastic
      # taggers is a vast reduction in information required
      # and better portability from one tag set, corpus genre
      # or language to another.
      #
      # Original paper:
      # Eric Brill. 1992. A simple rule-based part of speech tagger.
      # In Proceedings of the third conference on Applied natural
      # language processing (ANLC '92). Association for Computational
      # Linguistics, Stroudsburg, PA, USA, 152-155.
      # DOI=10.3115/974499.974526 http://dx.doi.org/10.3115/974499.974526
      # Project website:
      # http://rbtagger.rubyforge.org/
      # Original Perl module site:
      # http://search.cpan.org/~kwilliams/Lingua-BrillTagger-0.02/lib/Lingua/BrillTagger.pm
      class Brill
        patch = false
        # Require the 'rbtagger' gem.
        begin
          silently { require 'rbtagger' }
          # This whole mess is required to deal with
          # the fact that the 'rbtagger' gem defines
          # a top-level module called 'Word', which
          # will clash with the top-level class 'Word'
          # we define when syntactic sugar is enabled.
        rescue TypeError
          # Only attempt the workaround once, so that a TypeError
          # with another cause cannot loop forever.
          if Treat.edulcorated? && !patch
            patch = true
            # Unset the class Word for the duration
            # of loading the tagger. remove_const is
            # private, hence the call through send.
            Object.send(:remove_const, :Word); retry
          else
            raise Treat::Exception,
            'Something went wrong due to a name clash with the "rbtagger" gem. ' +
            'Turn off syntactic sugar to resolve this problem.'
          end
        ensure
          # Reset the class Word if using syntactic sugar.
          if Treat.edulcorated? && patch
            Object.const_set(:Word, Treat::Entities::Word)
          end
        end
        # Hold the tagger.
        @@tagger = nil
        # Hold the options the current tagger was built with.
        @@options = {}
        # Hold the default options.
        DefaultOptions = {
          lexicon: nil,
          lexical_rules: nil,
          contextual_rules: nil
        }
        # Tag words using a native Brill tagger.
        #
        # Available options:
        #   :lexicon => String (Lexicon file to use)
        #   :lexical_rules => String (Lexical rule file to use)
        #   :contextual_rules => String (Contextual rules file to use)
        #
        # Returns the tag of the entity. When the entity has a
        # right neighbour, the tag found for that neighbour is
        # also set on entity.next_sibling.
        def self.tag(entity, options = {})
          # Compare like with like: the stored options are the
          # merged ones, so merge before checking for a change.
          merged = DefaultOptions.merge(options)
          if merged != @@options
            @@options = merged
            @@tagger = nil # Reset the tagger
          end
          # Create the tagger if necessary
          @@tagger ||= ::Brill::Tagger.new(@@options[:lexicon],
            @@options[:lexical_rules], @@options[:contextual_rules])
          # NOTE(review): only :word entities get a context; for
          # any other type the three locals below stay nil, as in
          # the original code - confirm how phrases should be handled.
          left = right = context = nil
          if entity.type == :word
            # Setup the context of the word (nil.to_s is '').
            left = entity.left.to_s
            right = entity.right.to_s
            context = "#{left} #{entity.value} #{right}"
          end
          # Perform tagging.
          res = @@tagger.tag(context)
          # The position of the word in the result depends on
          # whether a left neighbour was part of the context.
          own, neighbour = (left == '') ? [2, 3] : [1, 2]
          unless right == ''
            entity.next_sibling.set(:tag, res[neighbour][1])
          end
          res[own][1]
        end
      end
    end
  end
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
module Treat
  module Lexicalizers
    module Tag
      # An adapter for the 'engtagger' gem, which
      # is a port of the Perl Lingua::EN::Tagger module.
      #
      # "This module uses part-of-speech statistics from
      # the Penn Treebank to assign POS tags to English text.
      # The tagger applies a bigram (two-word) Hidden Markov
      # Model to guess the appropriate POS tag for a word.
      # That means that the tagger will try to assign a POS
      # tag based on the known POS tags for a given word and
      # the POS tag assigned to its predecessor."
      #
      # Project website: http://engtagger.rubyforge.org/
      # Original Perl module site:
      # http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
      class Lingua
        # Require the 'engtagger' gem.
        silently { require 'engtagger' }
        # Hold the tagger.
        @@tagger = nil
        # Hold the options the current tagger was built with.
        @@options = {}
        # Hold the default options.
        DefaultOptions = {
          unknown_word_tag: '?',
          relax: false,
          debug: false
        }
        # Tag the word using a probabilistic model taking
        # into account known words found in a lexicon and
        # the tag of the previous word.
        #
        # Options:
        #
        #   :relax => (Boolean) Relax the Hidden Markov Model:
        #   this may improve accuracy for uncommon words,
        #   particularly words used polysemously.
        #   :debug => (Boolean) Print debug messages.
        #   :unknown_word_tag => (String) Tag for unknown words.
        #
        # Returns the assigned tag in upper case.
        def self.tag(entity, options = {})
          # Compare like with like: the stored options are the
          # merged ones, so merge before checking for a change.
          # Otherwise the tagger would be rebuilt on every call.
          merged = DefaultOptions.merge(options)
          if merged != @@options
            @@options = merged
            @@tagger = nil # Reset the tagger
          end
          @@tagger ||= ::EngTagger.new(@@options)
          # Use the tag of the word on the left as context; fall
          # back to 'pp' when there is no left word or its tag
          # is empty.
          left = entity.left
          if left.nil? || left.type != :word
            left_tag = 'pp'
          else
            left_tag = left.tag.downcase
            left_tag = 'pp' if left_tag == ''
          end
          w = @@tagger.clean_word(entity.to_s)
          # Record the assigned tag as the tagger's current tag.
          t = @@tagger.conf[:current_tag] =
            @@tagger.assign_tag(left_tag, w)
          t.upcase
        end
      end
    end
  end
end
|
65
|
+
|
66
|
+
=begin
|
67
|
+
|
68
|
+
CC Conjunction, coordinating and, or
|
69
|
+
CD Adjective, cardinal number 3, fifteen
|
70
|
+
DET Determiner this, each, some
|
71
|
+
EX Pronoun, existential there there
|
72
|
+
FW Foreign words
|
73
|
+
IN Preposition / Conjunction for, of, although, that
|
74
|
+
JJ Adjective happy, bad
|
75
|
+
JJR Adjective, comparative happier, worse
|
76
|
+
JJS Adjective, superlative happiest, worst
|
77
|
+
LS Symbol, list item A, A.
|
78
|
+
MD Verb, modal can, could, 'll
|
79
|
+
NN Noun aircraft, data
|
80
|
+
NNP Noun, proper London, Michael
|
81
|
+
NNPS Noun, proper, plural Australians, Methodists
|
82
|
+
NNS Noun, plural women, books
|
83
|
+
PDT Determiner, prequalifier quite, all, half
|
84
|
+
POS Possessive 's, '
|
85
|
+
PRP Determiner, possessive second mine, yours
|
86
|
+
PRPS Determiner, possessive their, your
|
87
|
+
RB Adverb often, not, very, here
|
88
|
+
RBR Adverb, comparative faster
|
89
|
+
RBS Adverb, superlative fastest
|
90
|
+
RP Adverb, particle up, off, out
|
91
|
+
SYM Symbol *
|
92
|
+
TO Preposition to
|
93
|
+
UH Interjection oh, yes, mmm
|
94
|
+
VB Verb, infinitive take, live
|
95
|
+
VBD Verb, past tense took, lived
|
96
|
+
VBG Verb, gerund taking, living
|
97
|
+
VBN Verb, past/passive participle taken, lived
|
98
|
+
VBP Verb, base present form take, live
|
99
|
+
VBZ Verb, present 3SG -s form takes, lives
|
100
|
+
WDT Determiner, question which, whatever
|
101
|
+
WP Pronoun, question who, whoever
|
102
|
+
WPS Determiner, possessive & question whose
|
103
|
+
WRB Adverb, question when, how, however
|
104
|
+
|
105
|
+
PP Punctuation, sentence ender ., !, ?
|
106
|
+
PPC Punctuation, comma ,
|
107
|
+
PPD Punctuation, dollar sign $
|
108
|
+
PPL Punctuation, quotation mark left ``
|
109
|
+
PPR Punctuation, quotation mark right ''
|
110
|
+
PPS Punctuation, colon, semicolon, ellipsis :, ..., -
|
111
|
+
LRB Punctuation, left bracket (, {, [
|
112
|
+
RRB Punctuation, right bracket ), }, ]
|
113
|
+
|
114
|
+
=end
|