treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -1,49 +0,0 @@
|
|
1
|
-
module Treat
  module Lexicalizers
    module Category
      # Detects the category of an entity from its tag.
      #
      # The tag is requested from the entity itself (entity.tag) and is
      # then looked up in the Treat::Languages::Tags tables, which map a
      # tag to a hash of { tag set => category }.
      class FromTag
        # Find the category of the given entity.
        #
        # Options:
        #
        # - (Symbol) :tagger => force the use of a tagger.
        #
        # Returns :unknown when the entity has no tag or when the tag is
        # not found in the lookup tables (a warning is printed in the
        # latter case), :sentence for the tag 'S', and otherwise the
        # category registered for the tag.
        #
        # Raises Treat::Exception when several tag sets define the tag
        # and the entity either has no :tag_set feature or has a tag set
        # that does not define the tag.
        def self.category(entity, options = {})
          tag = entity.tag(options[:tagger])
          return :unknown if tag.nil? || tag == ''
          return :sentence if tag == 'S'

          cat = nil
          if entity.is_a?(Treat::Entities::Phrase)
            # A phrase tag that is missing from the phrase table is
            # looked up in the word table instead.
            cat = Treat::Languages::Tags::PhraseTagToCategory[tag] ||
                  Treat::Languages::Tags::WordTagToCategory[tag]
          elsif entity.is_a?(Treat::Entities::Word)
            cat = Treat::Languages::Tags::WordTagToCategory[tag]
          end

          if cat.nil?
            warn "Category not found for tag '#{tag}'."
            return :unknown
          end

          # Only one tag set defines this tag, so the category is
          # unambiguous. Indexing by the entity's tag set here used to
          # yield nil whenever that tag set was not the single key.
          return cat.values.first if cat.size == 1

          unless entity.has?(:tag_set)
            raise Treat::Exception,
                  "No information can be found regarding which tag set to use."
          end

          category = cat[entity.tag_set]
          return category if category

          raise Treat::Exception,
                "The specified tag set (#{entity.tag_set})" +
                " does not contain the tag #{tag}."
        end
      end
    end
  end
end
|
@@ -1,63 +0,0 @@
|
|
1
|
-
module Treat
  module Lexicalizers
    module Linkages
      # Links the words of an entity to one another by comparing every
      # ordered pair of distinct words against the lexical lists
      # (hypernyms, hyponyms, synonyms, antonyms) each word reports.
      class Naive
        # Fix - add options for sentences.
        #
        # Create links between the words of +entity+.
        #
        # Options:
        #
        # - (Symbol) :linkage => one of :is_a, :hypernym_of,
        #   :synonym_of or :antonym_of (:is_a and :hypernym_of are
        #   handled identically).
        #
        # For every matching pair the two words are linked to each
        # other, and each word receives a feature (:hypernyms, :synonyms
        # or :antonyms) holding the words it was matched with.
        #
        # Raises Treat::Exception for any other :linkage value.
        def self.linkages(entity, options = {})
          case options[:linkage]
          when :is_a, :hypernym_of
            # NOTE(review): the list stored under :hypernyms on a word
            # holds the words it was linked to with :hypernym_of, which
            # reads more like its hyponyms - confirm the intended
            # direction with the callers of this feature.
            link_pairs(entity, :hypernyms, :is_a, :hypernym_of) do |w1, w2|
              w2.hypernyms.include?(w1.value) ||
                w1.hyponyms.include?(w2.value)
            end
          when :synonym_of
            link_pairs(entity, :synonyms, :synonym_of, :synonym_of) do |w1, w2|
              w2.synonyms.include?(w1.value)
            end
          when :antonym_of
            link_pairs(entity, :antonyms, :antonym_of, :antonym_of) do |w1, w2|
              w2.antonyms.include?(w1.value)
            end
          else
            raise Treat::Exception,
                  "Invalid linkage option '#{options[:linkage]}'."
          end
        end

        # Visit every ordered pair (w1, w2) of distinct words and yield
        # it to the block. When the block returns true, w2 is linked to
        # w1 with +forward+, w1 is linked to w2 with +backward+, and w2
        # is recorded in the list stored on w1 under +feature+.
        #
        # The partner word (w2) is recorded; the previous code recorded
        # w1 itself, so a word's list only ever contained copies of
        # that same word.
        def self.link_pairs(entity, feature, forward, backward)
          entity.each_word do |w1|
            related = []
            entity.each_word do |w2|
              next if w1 == w2
              next unless yield(w1, w2)
              related << w2
              w2.link(w1, forward)
              w1.link(w2, backward)
            end
            w1.set feature, related
          end
        end
        private_class_method :link_pairs
      end
    end
  end
end
|
@@ -1,76 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Lexicalizers
|
3
|
-
module Synsets
|
4
|
-
# Looks up the synsets of a word through the ruby 'wordnet' gem.
class Wordnet
  # Require the 'wordnet' gem.
  require 'wordnet'

  # Return the synsets found for +word+, each wrapped in a Synset
  # adaptor.
  #
  # An empty array is returned when the category of the word is not
  # :noun, :adjective or :verb, or when the index has no entry for the
  # downcased value of the word. The +options+ argument is not used.
  def self.synsets(word, options = nil)
    return [] unless [:noun, :adjective, :verb].include?(word.category)

    # The index class is resolved by name, e.g. :noun => NounIndex.
    index_name = word.category.to_s.capitalize + 'Index'
    entry = ::WordNet.const_get(index_name).instance.find(word.value.downcase)
    return [] if entry.nil?

    entry.synsets.map { |raw| Synset.new(raw) }
  end
end
|
24
|
-
end
|
25
|
-
# An adaptor for synsets used by the Wordnet gem.
#
# Wraps a synset object, exposes the part of speech, definition and
# examples parsed from its string form, and forwards every other call
# to the wrapped object.
class Synset
  # The POS tag of the word.
  attr_accessor :pos
  # The definition of the synset.
  attr_accessor :definition
  # The examples in the synset.
  attr_accessor :examples

  # Wrap +synset+ and parse its string representation.
  def initialize(synset)
    @original_synset = synset
    @pos, @definition, @examples = parse_synset(synset.to_s)
  end

  # Parse the string form of a synset into [pos, definition, examples].
  #
  # The text is assumed to look like
  #   (pos) word, word (definition; "example"; "example")
  #
  # +res+ is either that string or, for backward compatibility, the
  # array obtained by splitting it on ')'.
  #
  # Parentheses inside the gloss are preserved, and only the segments
  # from the first quoted one onwards are treated as examples, so a
  # definition that itself contains ';' is kept whole. When the text
  # has no parenthesised gloss the definition is nil and the examples
  # are empty.
  def parse_synset(res)
    text = res.is_a?(Array) ? res.join(')') : res.to_s

    # "(pos" is everything before the first ')'.
    head, rest = text.split(')', 2)
    pos = head.to_s[1..-1].to_s.strip

    # The gloss is everything after the first '(' that follows.
    gloss = rest.to_s.split('(', 2)[1]
    return [pos, nil, []] if gloss.nil?

    # Drop the parenthesis that closes the gloss.
    gloss = gloss.sub(/\)\s*\z/, '')

    segments = gloss.split(';')
    first_example =
      segments.index { |s| s.strip.start_with?('"') } || segments.size
    definition = segments[0...first_example].join(';')
    examples = segments[first_example..-1].map do |s|
      s.strip.sub(/\A"/, '').sub(/"\z/, '')
    end

    [pos, definition, examples]
  end

  # The words in the synset.
  def words
    @original_synset.words
  end

  # The synonyms are the words of the synset itself.
  def synonyms
    @original_synset.words
  end

  # A gloss (short definition with examples)
  # for the synset.
  def gloss
    @original_synset.gloss
  end

  # The antonym sets of the synset.
  def antonyms
    antonym.collect { |a| a.words }
  end

  # The hypernym sets of the synset ([] when there is no hypernym).
  def hypernyms
    h = hypernym
    return [] unless h
    h.words
  end

  # The hyponym sets of the synset.
  def hyponyms
    hyponym.collect { |h| h.words }
  end

  # Forward unknown calls, with their arguments and block, to the
  # wrapped synset. A returned ::WordNet::Synset is wrapped in a new
  # adaptor; anything else is returned as is.
  def method_missing(sym, *args, &block)
    ret = @original_synset.send(sym, *args, &block)
    ret.is_a?(::WordNet::Synset) ? Synset.new(ret) : ret
  end

  # Report the methods of the wrapped synset as supported, in line
  # with method_missing.
  def respond_to_missing?(sym, include_private = false)
    @original_synset.respond_to?(sym, include_private) || super
  end
end
|
75
|
-
end
|
76
|
-
end
|
@@ -1,91 +0,0 @@
|
|
1
|
-
module Treat
  module Lexicalizers
    module Tag
      # Adapter class for the 'rbtagger' gem, a port
      # of the Perl Lingua::BrillTagger class, based
      # on the rule-based tagger developed by Eric Brill.
      #
      # The Brill tagger is a simple rule-based part of
      # speech tagger. The main advantages over stochastic
      # taggers is a vast reduction in information required
      # and better portability from one tag set, corpus genre
      # or language to another.
      #
      # Original paper:
      # Eric Brill. 1992. A simple rule-based part of speech tagger.
      # In Proceedings of the third conference on Applied natural
      # language processing (ANLC '92). Association for Computational
      # Linguistics, Stroudsburg, PA, USA, 152-155.
      # DOI=10.3115/974499.974526 http://dx.doi.org/10.3115/974499.974526
      # Project website:
      # http://rbtagger.rubyforge.org/
      # Original Perl module site:
      # http://search.cpan.org/~kwilliams/Lingua-BrillTagger-0.02/lib/Lingua/BrillTagger.pm
      class Brill
        # Set to true once the top-level Word constant has been
        # removed and therefore has to be restored afterwards.
        patch = false
        begin
          # Require the 'rbtagger' gem.
          #
          # This whole mess is required to deal with
          # the fact that the 'rbtagger' gem defines
          # a top-level module called 'Word', which
          # will clash with the top-level class 'Word'
          # we define when syntactic sugar is enabled.
          #
          # The require has to sit inside this begin block,
          # otherwise the rescue below can never be reached.
          require 'rbtagger'
        rescue TypeError
          # Retry once only: if the load still fails after Word has
          # been removed, the clash is not what caused the failure.
          if !patch && Treat.sweetened?
            patch = true
            # Unset the class Word for the duration
            # of loading the tagger. remove_const is
            # private, hence the send.
            Object.send(:remove_const, :Word)
            retry
          else
            raise Treat::Exception,
                  'Something went wrong due to a name clash with the "rbtagger" gem. ' +
                  'Turn off syntactic sugar to resolve this problem.'
          end
        ensure
          # Reset the class Word if using syntactic sugar.
          if patch && Treat.sweetened?
            Object.const_set(:Word, Treat::Entities::Word)
          end
        end

        # Hold the tagger.
        @@tagger = nil

        # Tag words using a native Brill tagger.
        # Performs own tokenization.
        #
        # Options:
        #
        # :lexicon => String (Lexicon file to use)
        # :lexical_rules => String (Lexical rule file to use)
        # :contextual_rules => String (Contextual rules file to use)
        #
        # The tagger is created on the first call and reused
        # afterwards, so the options of later calls are ignored.
        #
        # Any children of the entity are removed (with a warning)
        # and replaced by one token per tagged word. For a Token,
        # the tag of the first tagged word is returned and no
        # children are added; otherwise 'P' is returned for a
        # Phrase, 'S' for a Sentence and nil for anything else.
        def self.tag(entity, options = {})
          if entity.has_children?
            warn "The Brill tagger performs its own tokenization. " +
                 "Removing all children of #{entity.type} with value #{entity.short_value}."
            entity.remove_all!
          end

          # Create the tagger if necessary
          @@tagger ||= ::Brill::Tagger.new(
            options[:lexicon],
            options[:lexical_rules],
            options[:contextual_rules]
          )

          tagged = @@tagger.tag(entity.to_s) || []
          isolated_word = entity.is_a?(Treat::Entities::Token)

          # Each result is indexed as [word, tag].
          tagged.each do |word, tag|
            # Entries tagged ')' are skipped.
            next if tag == ')'
            token = Treat::Entities::Token.from_string(word)
            token.set :tag_set, :penn
            token.set :tag, tag
            if isolated_word
              entity.set :tag_set, :penn
              return tag
            end
            entity << token
          end

          entity.set :tag_set, :penn
          return 'P' if entity.is_a?(Treat::Entities::Phrase)
          return 'S' if entity.is_a?(Treat::Entities::Sentence)
        end
      end
    end
  end
end
|
@@ -1,123 +0,0 @@
|
|
1
|
-
module Treat
  module Lexicalizers
    module Tag
      # An adapter for the 'engtagger' gem, which
      # is a port of the Perl Lingua::EN::Tagger module.
      #
      # "This module uses part-of-speech statistics from
      # the Penn Treebank to assign POS tags to English text.
      # The tagger applies a bigram (two-word) Hidden Markov
      # Model to guess the appropriate POS tag for a word.
      # That means that the tagger will try to assign a POS
      # tag based on the known POS tags for a given word and
      # the POS tag assigned to its predecessor."
      #
      # Project website: http://engtagger.rubyforge.org/
      # Original Perl module site:
      # http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
      class Lingua
        # Require the 'engtagger' gem.
        silence_warnings { require 'engtagger' }

        # Hold the tagger.
        @@tagger = nil
        # Hold the options the current tagger was built with.
        @@options = {}
        # Hold the default options.
        DefaultOptions = {
          :unknown_word_tag => 'pp', # Fix unknown word tag
          :relax => false
        }

        # Tag the tokens of the entity, passing each token and
        # the tag given to the previous token to the tagger.
        #
        # Options:
        #
        # - (Boolean) :relax => Relax the Hidden Markov Model:
        # this may improve accuracy for uncommon words,
        # particularly words used polysemously.
        # - (String) :unknown_word_tag => Tag for unknown words.
        #
        # The tagger is rebuilt whenever the merged options differ
        # from those of the previous call. For a Token the upcased
        # tag of the first token is returned; otherwise 'P' is
        # returned for a Phrase, 'S' for a Sentence and nil for
        # anything else.
        def self.tag(entity, options = {})
          # NOTE(review): the second warning announces tokenization,
          # but nothing in this method tokenizes the entity - confirm
          # whether entity.tokens takes care of it.
          unless entity.has_children?
            warn "The Lingua tagger requires prior tokenization."
            warn "Tokenizing the entity #{entity.short_value}."
          end

          options = DefaultOptions.merge(options)

          # Reinitialize the tagger if the options have changed.
          unless options == @@options
            @@options = options.dup
            @@tagger = nil
          end
          @@tagger ||= ::EngTagger.new(@@options)

          # Every run starts from the 'pp' tag.
          previous = 'pp'
          @@tagger.conf[:current_tag] = previous
          single = entity.is_a?(Treat::Entities::Token)

          entity.tokens.each do |token|
            cleaned = @@tagger.clean_word(token.to_s)
            guess = @@tagger.assign_tag(previous, cleaned)
            guess = options[:unknown_word_tag] if guess.nil? || guess == ''
            previous = guess
            @@tagger.conf[:current_tag] = guess

            token.set :tag, guess.upcase
            token.set :tag_set, :penn
            next unless single

            entity.set :tag_set, :penn
            return guess.upcase
          end

          entity.set :tag_set, :penn
          if entity.is_a?(Treat::Entities::Phrase)
            'P'
          elsif entity.is_a?(Treat::Entities::Sentence)
            'S'
          end
        end
      end
    end
  end
end
|
74
|
-
|
75
|
-
=begin
|
76
|
-
|
77
|
-
CC Conjunction, coordinating and, or
|
78
|
-
CD Adjective, cardinal number 3, fifteen
|
79
|
-
DET Determiner this, each, some
|
80
|
-
EX Pronoun, existential there there
|
81
|
-
FW Foreign words
|
82
|
-
IN Preposition / Conjunction for, of, although, that
|
83
|
-
JJ Adjective happy, bad
|
84
|
-
JJR Adjective, comparative happier, worse
|
85
|
-
JJS Adjective, superlative happiest, worst
|
86
|
-
LS Symbol, list item A, A.
|
87
|
-
MD Verb, modal can, could, 'll
|
88
|
-
NN Noun aircraft, data
|
89
|
-
NNP Noun, proper London, Michael
|
90
|
-
NNPS Noun, proper, plural Australians, Methodists
|
91
|
-
NNS Noun, plural women, books
|
92
|
-
PDT Determiner, prequalifier quite, all, half
|
93
|
-
POS Possessive 's, '
|
94
|
-
PRP Determiner, possessive second mine, yours
|
95
|
-
PRPS Determiner, possessive their, your
|
96
|
-
RB Adverb often, not, very, here
|
97
|
-
RBR Adverb, comparative faster
|
98
|
-
RBS Adverb, superlative fastest
|
99
|
-
RP Adverb, particle up, off, out
|
100
|
-
SYM Symbol *
|
101
|
-
TO Preposition to
|
102
|
-
UH Interjection oh, yes, mmm
|
103
|
-
VB Verb, infinitive take, live
|
104
|
-
VBD Verb, past tense took, lived
|
105
|
-
VBG Verb, gerund taking, living
|
106
|
-
VBN Verb, past/passive participle taken, lived
|
107
|
-
VBP Verb, base present form take, live
|
108
|
-
VBZ Verb, present 3SG -s form takes, lives
|
109
|
-
WDT Determiner, question which, whatever
|
110
|
-
WP Pronoun, question who, whoever
|
111
|
-
WPS Determiner, possessive & question whose
|
112
|
-
WRB Adverb, question when, how, however
|
113
|
-
|
114
|
-
PP Punctuation, sentence ender ., !, ?
|
115
|
-
PPC Punctuation, comma ,
|
116
|
-
PPD Punctuation, dollar sign $
|
117
|
-
PPL Punctuation, quotation mark left ``
|
118
|
-
PPR Punctuation, quotation mark right ''
|
119
|
-
PPS Punctuation, colon, semicolon, ellipsis :, ..., -
|
120
|
-
LRB Punctuation, left bracket (, {, [
|
121
|
-
RRB Punctuation, right bracket ), }, ]
|
122
|
-
|
123
|
-
=end
|
@@ -1,70 +0,0 @@
|
|
1
|
-
module Treat
  module Lexicalizers
    module Tag
      # An adapter for the part of speech tagger exposed by the
      # 'stanford-core-nlp' gem.
      class Stanford
        require 'stanford-core-nlp'

        # Hold one tagger per language.
        @@taggers = {}

        # Hold the default options.
        DefaultOptions = {
          :tagger_model => nil,
          :silence => false,
          :log_to_file => nil
        }

        # The tag set used for each supported language.
        LanguageToTagSet = {
          :eng => :penn,
          :ger => :negra,
          :chi => :penn_chinese,
          :fre => :simple
        }

        # Tag the word using one of the Stanford taggers.
        #
        # Options:
        #
        # - (String) :tagger_model => model to set as 'pos.model'.
        # - (Boolean) :silence => send the log to '/dev/null'.
        # - (String) :log_to_file => file to send the log to.
        #
        # Any children of the entity are removed (with a warning)
        # and replaced by one token per tagged word. For a Token,
        # the tag of the first tagged word is returned and no
        # children are added; otherwise 'P' is returned for a
        # Phrase, 'S' for a Sentence and nil for anything else.
        #
        # The :tag_set feature is only set when the language of
        # the entity is listed in LanguageToTagSet.
        def self.tag(entity, options = {})
          # Handle options and set models.
          options = DefaultOptions.merge(options)
          if entity.has_children?
            warn "The Stanford tagger performs its own tokenization. " +
                 "Removing all children of #{entity.type} with value #{entity.short_value}."
            entity.remove_all!
          end

          # Arrange options.
          lang = entity.language
          tag_set = LanguageToTagSet[lang]
          unless tag_set
            warn "The tag set for the Stanford tagger you are requiring is not supported."
          end
          if options[:tagger_model]
            ::StanfordCoreNLP.set_model('pos.model', options[:tagger_model])
          end
          options[:log_to_file] = '/dev/null' if options[:silence]
          if options[:log_to_file]
            ::StanfordCoreNLP.log_file = options[:log_to_file]
          end

          # Load the tagger (one pipeline is kept per language).
          ::StanfordCoreNLP.use(lang)
          @@taggers[lang] ||= ::StanfordCoreNLP.load(:tokenize, :ssplit, :pos)

          # Tag the text.
          text = ::StanfordCoreNLP::Text.new(entity.to_s)
          isolated_word = entity.is_a?(Treat::Entities::Token)
          @@taggers[lang].annotate(text)

          text.get(:tokens).each do |token|
            val = token.get(:value).to_s
            tok = Treat::Entities::Token.from_string(val)
            tag = token.get(:part_of_speech).to_s
            # The part before the first '-' is the tag proper, the
            # part after it is kept as :tag_opt.
            # NOTE(review): a tag that starts with '-' (Penn bracket
            # tags are commonly written '-LRB-' / '-RRB-') yields an
            # empty main tag here - confirm that this is intended.
            tag_s, tag_opt = *tag.split('-')
            tag_s ||= ''
            tok.set :tag, tag_s
            tok.set :tag_opt, tag_opt
            tok.set :tag_set, tag_set if tag_set
            if isolated_word
              # Use the tag set of the entity's language, as is done
              # for phrases and sentences below, instead of always
              # assuming :penn.
              entity.set :tag_set, tag_set if tag_set
              return tag_s
            end
            entity << tok
          end

          # Handle tags for sentences and phrases.
          entity.set :tag_set, tag_set if tag_set
          return 'P' if entity.is_a?(Treat::Entities::Phrase)
          return 'S' if entity.is_a?(Treat::Entities::Sentence)
        end
      end
    end
  end
end
|