treat 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +4 -4
- data/TODO +21 -54
- data/lib/economist/half_cocked_basel.txt +16 -0
- data/lib/economist/hose_and_dry.doc +0 -0
- data/lib/economist/hungarys_troubles.abw +70 -0
- data/lib/economist/republican_nomination.pdf +0 -0
- data/lib/economist/saving_the_euro.odt +0 -0
- data/lib/economist/to_infinity_and_beyond.txt +15 -0
- data/lib/economist/zero_sum.html +91 -0
- data/lib/treat.rb +58 -72
- data/lib/treat/buildable.rb +59 -15
- data/lib/treat/categories.rb +26 -14
- data/lib/treat/category.rb +2 -2
- data/lib/treat/delegatable.rb +65 -48
- data/lib/treat/doable.rb +44 -0
- data/lib/treat/entities.rb +34 -14
- data/lib/treat/entities/collection.rb +2 -0
- data/lib/treat/entities/document.rb +3 -2
- data/lib/treat/entities/entity.rb +105 -90
- data/lib/treat/entities/phrases.rb +17 -0
- data/lib/treat/entities/tokens.rb +28 -13
- data/lib/treat/entities/zones.rb +20 -0
- data/lib/treat/extractors.rb +49 -11
- data/lib/treat/extractors/coreferences/stanford.rb +68 -0
- data/lib/treat/extractors/date/chronic.rb +32 -0
- data/lib/treat/extractors/date/ruby.rb +25 -0
- data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
- data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
- data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
- data/lib/treat/extractors/language/what_language.rb +49 -0
- data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
- data/lib/treat/extractors/roles/naive.rb +73 -0
- data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
- data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
- data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
- data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
- data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
- data/lib/treat/extractors/time/nickel.rb +30 -12
- data/lib/treat/extractors/topic_words/lda.rb +9 -9
- data/lib/treat/extractors/topics/reuters.rb +14 -15
- data/lib/treat/extractors/topics/reuters/region.xml +1 -0
- data/lib/treat/features.rb +7 -0
- data/lib/treat/formatters/readers/abw.rb +6 -1
- data/lib/treat/formatters/readers/autoselect.rb +5 -6
- data/lib/treat/formatters/readers/doc.rb +3 -1
- data/lib/treat/formatters/readers/html.rb +1 -1
- data/lib/treat/formatters/readers/image.rb +43 -0
- data/lib/treat/formatters/readers/odt.rb +1 -2
- data/lib/treat/formatters/readers/pdf.rb +9 -1
- data/lib/treat/formatters/readers/xml.rb +40 -0
- data/lib/treat/formatters/serializers/xml.rb +50 -14
- data/lib/treat/formatters/serializers/yaml.rb +7 -2
- data/lib/treat/formatters/unserializers/xml.rb +33 -7
- data/lib/treat/formatters/visualizers/dot.rb +90 -20
- data/lib/treat/formatters/visualizers/short_value.rb +2 -2
- data/lib/treat/formatters/visualizers/standoff.rb +2 -2
- data/lib/treat/formatters/visualizers/tree.rb +1 -1
- data/lib/treat/formatters/visualizers/txt.rb +13 -4
- data/lib/treat/group.rb +16 -10
- data/lib/treat/helpers/linguistics_loader.rb +18 -0
- data/lib/treat/inflectors.rb +10 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
- data/lib/treat/inflectors/declensions/english.rb +319 -0
- data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
- data/lib/treat/install.rb +59 -0
- data/lib/treat/kernel.rb +18 -8
- data/lib/treat/languages.rb +18 -11
- data/lib/treat/languages/arabic.rb +4 -2
- data/lib/treat/languages/chinese.rb +6 -2
- data/lib/treat/languages/dutch.rb +16 -0
- data/lib/treat/languages/english.rb +47 -19
- data/lib/treat/languages/french.rb +8 -5
- data/lib/treat/languages/german.rb +9 -6
- data/lib/treat/languages/greek.rb +16 -0
- data/lib/treat/languages/italian.rb +6 -3
- data/lib/treat/languages/polish.rb +16 -0
- data/lib/treat/languages/portuguese.rb +16 -0
- data/lib/treat/languages/russian.rb +16 -0
- data/lib/treat/languages/spanish.rb +16 -0
- data/lib/treat/languages/swedish.rb +16 -0
- data/lib/treat/languages/tags.rb +377 -0
- data/lib/treat/lexicalizers.rb +34 -23
- data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
- data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
- data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
- data/lib/treat/lexicalizers/tag/brill.rb +35 -40
- data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
- data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
- data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
- data/lib/treat/processors.rb +8 -8
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +114 -99
- data/lib/treat/processors/parsers/stanford.rb +109 -41
- data/lib/treat/processors/segmenters/punkt.rb +17 -18
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
- data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
- data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
- data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
- data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
- data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
- data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
- data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
- data/lib/treat/processors/segmenters/stanford.rb +38 -37
- data/lib/treat/processors/segmenters/tactful.rb +5 -4
- data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
- data/lib/treat/processors/tokenizers/perl.rb +2 -2
- data/lib/treat/processors/tokenizers/punkt.rb +6 -2
- data/lib/treat/processors/tokenizers/stanford.rb +25 -24
- data/lib/treat/processors/tokenizers/tactful.rb +1 -2
- data/lib/treat/proxies.rb +2 -35
- data/lib/treat/registrable.rb +17 -22
- data/lib/treat/sugar.rb +11 -11
- data/lib/treat/tree.rb +27 -17
- data/lib/treat/viewable.rb +29 -0
- data/lib/treat/visitable.rb +1 -1
- data/test/tc_entity.rb +56 -49
- data/test/tc_extractors.rb +41 -18
- data/test/tc_formatters.rb +7 -8
- data/test/tc_inflectors.rb +19 -24
- data/test/tc_lexicalizers.rb +12 -19
- data/test/tc_processors.rb +26 -12
- data/test/tc_resources.rb +2 -7
- data/test/tc_treat.rb +20 -22
- data/test/tc_tree.rb +4 -4
- data/test/tests.rb +3 -5
- data/test/texts.rb +13 -14
- data/tmp/INFO +1 -0
- metadata +78 -158
- data/bin/INFO +0 -1
- data/examples/benchmark.rb +0 -81
- data/examples/keywords.rb +0 -148
- data/lib/treat/detectors.rb +0 -31
- data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
- data/lib/treat/detectors/format/file.rb +0 -36
- data/lib/treat/detectors/language/what_language.rb +0 -29
- data/lib/treat/entities/constituents.rb +0 -15
- data/lib/treat/entities/sentence.rb +0 -8
- data/lib/treat/extractors/named_entity/abner.rb +0 -20
- data/lib/treat/extractors/named_entity/stanford.rb +0 -174
- data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
- data/lib/treat/extractors/time/chronic.rb +0 -20
- data/lib/treat/extractors/time/native.rb +0 -18
- data/lib/treat/formatters/readers/gocr.rb +0 -26
- data/lib/treat/formatters/readers/ocropus.rb +0 -31
- data/lib/treat/formatters/visualizers/html.rb +0 -13
- data/lib/treat/formatters/visualizers/inspect.rb +0 -20
- data/lib/treat/inflectors/declensions/en.rb +0 -18
- data/lib/treat/languages/categories.rb +0 -5
- data/lib/treat/languages/english/categories.rb +0 -23
- data/lib/treat/languages/english/tags.rb +0 -352
- data/lib/treat/languages/xinhua.rb +0 -12
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
- data/lib/treat/string.rb +0 -5
- data/test/tc_detectors.rb +0 -26
data/lib/treat/lexicalizers.rb
CHANGED
@@ -6,42 +6,53 @@ module Treat
|
|
6
6
|
# Taggers return the part of speech tag of a word.
|
7
7
|
module Tag
|
8
8
|
extend Group
|
9
|
+
require 'treat/lexicalizers/tag/tagger'
|
9
10
|
self.type = :annotator
|
10
11
|
self.targets = [:word]
|
11
12
|
end
|
13
|
+
|
14
|
+
# Return the general category of a word.
|
12
15
|
module Category
|
13
16
|
extend Group
|
14
17
|
self.type = :annotator
|
15
|
-
self.targets = [:
|
16
|
-
|
17
|
-
def self.cat(entity, category); category; end # Remove
|
18
|
-
end
|
19
|
-
# Linkers allow to retrieve grammatical links
|
20
|
-
# between words.
|
21
|
-
module Linkages
|
22
|
-
extend Group
|
23
|
-
self.type = :annotator
|
24
|
-
self.targets = [:sentence, :word]
|
18
|
+
self.targets = [:word]
|
19
|
+
self.default = :from_tag
|
25
20
|
end
|
21
|
+
|
26
22
|
# Lexicons are dictionaries of semantically linked
|
27
23
|
# word forms.
|
28
24
|
module Synsets
|
29
25
|
extend Group
|
30
26
|
self.type = :annotator
|
31
|
-
self.targets = [:word
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
27
|
+
self.targets = [:word]
|
28
|
+
self.postprocessors = {
|
29
|
+
:synonyms => lambda do |entity, synsets|
|
30
|
+
synsets.collect { |ss| ss.synonyms }.flatten -
|
31
|
+
[entity.value]
|
32
|
+
end,
|
33
|
+
:antonyms => lambda do |entity, synsets|
|
34
|
+
synsets.collect { |ss| ss.antonyms }.flatten
|
35
|
+
end,
|
36
|
+
:hyponyms => lambda do |entity, synsets|
|
37
|
+
synsets.collect { |ss| ss.hyponyms }.flatten
|
38
|
+
end,
|
39
|
+
:hypernyms => lambda do |entity, synsets|
|
40
|
+
synsets.collect { |ss| ss.hypernyms }.flatten
|
41
|
+
end
|
42
|
+
}
|
43
|
+
end
|
44
|
+
|
45
|
+
module Linkages
|
46
|
+
extend Group
|
47
|
+
self.type = :annotator
|
48
|
+
self.targets = [:zone]
|
49
|
+
self.presets = {
|
50
|
+
:is_a => {:linkage => :is_a},
|
51
|
+
:synonym_of => {:linkage => :synonym_of},
|
52
|
+
:antonym_of => {:linkage => :antonym_of}
|
53
|
+
}
|
44
54
|
end
|
55
|
+
|
45
56
|
extend Treat::Category
|
46
57
|
end
|
47
58
|
end
|
@@ -5,22 +5,29 @@ module Treat
|
|
5
5
|
# using the default tagger for the language of the entity.
|
6
6
|
class FromTag
|
7
7
|
# Find the category of the current entity.
|
8
|
-
#
|
8
|
+
#
|
9
9
|
# Options:
|
10
|
-
#
|
10
|
+
#
|
11
11
|
# - (Symbol) :tagger => force the use of a tagger.
|
12
12
|
def self.category(entity, options = {})
|
13
|
-
tag =
|
14
|
-
|
15
|
-
|
16
|
-
if
|
17
|
-
|
18
|
-
|
13
|
+
tag = entity.tag(options[:tagger])
|
14
|
+
return :unknown if tag.nil? || tag == ''
|
15
|
+
return :sentence if tag == 'S'
|
16
|
+
if entity.is_a?(Treat::Entities::Phrase)
|
17
|
+
cat = Treat::Languages::Tags::PhraseTagToCategory[tag]
|
18
|
+
unless cat
|
19
|
+
cat = Treat::Languages::Tags::WordTagToCategory[tag]
|
20
|
+
end
|
21
|
+
elsif entity.is_a?(Treat::Entities::Word)
|
22
|
+
cat = Treat::Languages::Tags::WordTagToCategory[tag]
|
23
|
+
end
|
24
|
+
if cat == nil
|
25
|
+
warn "Category not found for tag '#{tag}'."
|
26
|
+
return :unknown
|
19
27
|
else
|
20
28
|
if cat.size == 1
|
21
|
-
return cat[
|
29
|
+
return cat[entity.tag_set]
|
22
30
|
else
|
23
|
-
entity.set :tag_set, :penn
|
24
31
|
if entity.has?(:tag_set)
|
25
32
|
if cat[entity.tag_set]
|
26
33
|
return cat[entity.tag_set]
|
@@ -2,60 +2,60 @@ module Treat
|
|
2
2
|
module Lexicalizers
|
3
3
|
module Linkages
|
4
4
|
class Naive
|
5
|
+
# Fix - add options for sentences.
|
5
6
|
def self.linkages(entity, options = {})
|
6
|
-
linkage
|
7
|
-
|
7
|
+
if options[:linkage] == :is_a ||
|
8
|
+
options[:linkage] == :hypernym_of
|
9
|
+
|
10
|
+
entity.each_word do |w1|
|
11
|
+
hypernyms = []
|
12
|
+
entity.each_word do |w2|
|
13
|
+
next if w1 == w2
|
14
|
+
if w2.hypernyms.include?(w1.value) ||
|
15
|
+
w1.hyponyms.include?(w2.value)
|
16
|
+
hypernyms << w1
|
17
|
+
w2.link(w1, :is_a)
|
18
|
+
w1.link(w2, :hypernym_of)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
w1.set :hypernyms, hypernyms
|
22
|
+
end
|
23
|
+
|
24
|
+
elsif options[:linkage] == :synonym_of
|
25
|
+
|
26
|
+
entity.each_word do |w1|
|
27
|
+
synonyms = []
|
28
|
+
entity.each_word do |w2|
|
29
|
+
next if w1 == w2
|
30
|
+
if w2.synonyms.include?(w1.value)
|
31
|
+
synonyms << w1
|
32
|
+
w2.link(w1, :synonym_of)
|
33
|
+
w1.link(w2, :synonym_of)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
w1.set :synonyms, synonyms
|
37
|
+
end
|
38
|
+
|
39
|
+
elsif options[:linkage] == :antonym_of
|
40
|
+
|
41
|
+
entity.each_word do |w1|
|
42
|
+
antonyms = []
|
43
|
+
entity.each_word do |w2|
|
44
|
+
next if w1 == w2
|
45
|
+
if w2.antonyms.include?(w1.value)
|
46
|
+
antonyms << w1
|
47
|
+
w2.link(w1, :antonym_of)
|
48
|
+
w1.link(w2, :antonym_of)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
w1.set :antonyms, antonyms
|
52
|
+
end
|
53
|
+
|
54
|
+
else
|
8
55
|
raise Treat::Exception,
|
9
|
-
"
|
56
|
+
"Invalid linkage option '#{options[:linkage]}'."
|
10
57
|
end
|
11
|
-
|
12
|
-
raise Treat::Exception,
|
13
|
-
"No handler to resolve linkage #{linkage}."
|
14
|
-
end
|
15
|
-
self.send(linkage, entity, options)
|
16
|
-
end
|
17
|
-
# %%%
|
18
|
-
def self.patient(entity, options)
|
19
|
-
# Not so simple here... Fix
|
20
|
-
if main_verb.has_feature?(:aux)
|
21
|
-
subject
|
22
|
-
elsif main_verb.voice == 'passive'
|
23
|
-
subject
|
24
|
-
elsif main_verb.voice == 'active'
|
25
|
-
# Each prepos.
|
26
|
-
end
|
27
|
-
end
|
28
|
-
# Return the subject of the sentence|verb.
|
29
|
-
def self.subject(entity, options)
|
30
|
-
verb = (entity.has?(:category) && entity.category == :verb) ?
|
31
|
-
main_verb(entity) : entity.main_verb
|
32
|
-
args = []
|
33
|
-
main_verb.edges.each_pair do |id,edge|
|
34
|
-
args << find(id)
|
35
|
-
end
|
36
|
-
args[0]
|
37
|
-
end
|
38
|
-
# Return the object of the sentence|verb.
|
39
|
-
def self.object(entity, options)
|
40
|
-
verb = (entity.has?(:category) && entity.category == :verb) ?
|
41
|
-
main_verb(entity) : entity.main_verb
|
42
|
-
if verb.voice == 'passive'
|
43
|
-
return
|
44
|
-
end
|
45
|
-
args = []
|
46
|
-
verb.edges.each_pair do |id,edge|
|
47
|
-
args << find(id)
|
48
|
-
end
|
49
|
-
args[1]
|
50
|
-
end
|
51
|
-
# Find the main verb (shallowest verb in the tree).
|
52
|
-
def self.main_verb(entity, options)
|
53
|
-
verbs = entity.verbs
|
54
|
-
if verbs.empty?
|
55
|
-
return
|
56
|
-
end
|
57
|
-
verbs.sort! { |a,b| a.depth <=> b.depth }
|
58
|
-
verbs[0]
|
58
|
+
|
59
59
|
end
|
60
60
|
end
|
61
61
|
end
|
@@ -55,7 +55,11 @@ module Treat
|
|
55
55
|
# The antonym sets of the synset.
|
56
56
|
def antonyms; antonym.collect { |a| a.words }; end
|
57
57
|
# The hypernym sets of the synset.
|
58
|
-
def hypernyms;
|
58
|
+
def hypernyms;
|
59
|
+
h = hypernym
|
60
|
+
return [] unless h
|
61
|
+
h.words
|
62
|
+
end
|
59
63
|
# The hyponym sets of the synset.
|
60
64
|
def hyponyms; hyponym.collect { |h| h.words }; end
|
61
65
|
# Respond to the missing method event.
|
@@ -4,47 +4,47 @@ module Treat
|
|
4
4
|
# Adapter class for the 'rbtagger' gem, a port
|
5
5
|
# of the Perl Lingua::BrillTagger class, based
|
6
6
|
# on the rule-based tagger developed by Eric Brill.
|
7
|
-
#
|
7
|
+
#
|
8
8
|
# The Brill tagger is a simple rule-based part of
|
9
9
|
# speech tagger. The main advantages over stochastic
|
10
10
|
# taggers are a vast reduction in information required
|
11
11
|
# and better portability from one tag set, corpus genre
|
12
12
|
# or language to another.
|
13
|
-
#
|
14
|
-
# Original paper:
|
15
|
-
# Eric Brill. 1992. A simple rule-based part of speech tagger.
|
16
|
-
# In Proceedings of the third conference on Applied natural
|
17
|
-
# language processing (ANLC '92). Association for Computational
|
18
|
-
# Linguistics, Stroudsburg, PA, USA, 152-155.
|
13
|
+
#
|
14
|
+
# Original paper:
|
15
|
+
# Eric Brill. 1992. A simple rule-based part of speech tagger.
|
16
|
+
# In Proceedings of the third conference on Applied natural
|
17
|
+
# language processing (ANLC '92). Association for Computational
|
18
|
+
# Linguistics, Stroudsburg, PA, USA, 152-155.
|
19
19
|
# DOI=10.3115/974499.974526 http://dx.doi.org/10.3115/974499.974526
|
20
|
-
# Project website:
|
20
|
+
# Project website:
|
21
21
|
# http://rbtagger.rubyforge.org/
|
22
|
-
# Original Perl module site:
|
22
|
+
# Original Perl module site:
|
23
23
|
# http://search.cpan.org/~kwilliams/Lingua-BrillTagger-0.02/lib/Lingua/BrillTagger.pm
|
24
|
-
class Brill
|
24
|
+
class Brill < Tagger
|
25
25
|
patch = false
|
26
26
|
# Require the 'rbtagger' gem.
|
27
|
+
require 'rbtagger'
|
27
28
|
begin
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
# we define when syntactic sugar is enabled.
|
29
|
+
# This whole mess is required to deal with
|
30
|
+
# the fact that the 'rbtagger' gem defines
|
31
|
+
# a top-level module called 'Word', which
|
32
|
+
# will clash with the top-level class 'Word'
|
33
|
+
# we define when syntactic sugar is enabled.
|
34
34
|
rescue TypeError
|
35
|
-
if Treat.
|
35
|
+
if Treat.sweetened?
|
36
36
|
patch = true
|
37
37
|
# Unset the class Word for the duration
|
38
38
|
# of loading the tagger.
|
39
39
|
Object.const_unset(:Word); retry
|
40
40
|
else
|
41
41
|
raise Treat::Exception,
|
42
|
-
'Something went wrong due to a name clash with the "rbtagger" gem.' +
|
42
|
+
'Something went wrong due to a name clash with the "rbtagger" gem.' +
|
43
43
|
'Turn off syntactic sugar to resolve this problem.'
|
44
44
|
end
|
45
45
|
ensure
|
46
46
|
# Reset the class Word if using syntactic sugar.
|
47
|
-
if Treat.
|
47
|
+
if Treat.sweetened? && patch
|
48
48
|
Object.const_set(:Word, Treat::Entities::Word)
|
49
49
|
end
|
50
50
|
end
|
@@ -55,38 +55,33 @@ module Treat
|
|
55
55
|
# Tag words using a native Brill tagger.
|
56
56
|
#
|
57
57
|
# Options:
|
58
|
-
#
|
58
|
+
#
|
59
59
|
# :lexicon => String (Lexicon file to use)
|
60
60
|
# :lexical_rules => String (Lexical rule file to use)
|
61
61
|
# :contextual_rules => String (Contextual rules file to use)
|
62
62
|
def self.tag(entity, options = {})
|
63
|
+
r = super(entity, options)
|
64
|
+
return r if r && r != :isolated_word
|
63
65
|
# Reinitialize the tagger if the options have changed.
|
64
66
|
@@tagger = nil if options != @@options
|
65
67
|
# Create the tagger if necessary
|
66
68
|
@@tagger ||= ::Brill::Tagger.new(options[:lexicon],
|
67
69
|
options[:lexical_rules], options[:contextual_rules])
|
68
|
-
entity
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
res = @@tagger.tag(c)
|
79
|
-
if l == ''
|
80
|
-
unless r == ''
|
81
|
-
entity.next_sibling.set(:tag, res[3][1])
|
70
|
+
words = (r == :isolated_word) ? [entity] : entity.tokens
|
71
|
+
res = @@tagger.tag(words.join(' '))[1..-1]
|
72
|
+
res ||= []
|
73
|
+
res.each do |info|
|
74
|
+
words.each do |word|
|
75
|
+
if word.value == info[0]
|
76
|
+
word.set :tag_set, :penn
|
77
|
+
word.set :tag, info[1]
|
78
|
+
return info[1] if r == :isolated_word
|
79
|
+
end
|
82
80
|
end
|
83
|
-
return res[2][1]
|
84
|
-
else
|
85
|
-
unless r == ''
|
86
|
-
entity.next_sibling.set(:tag, res[2][1])
|
87
|
-
end
|
88
|
-
return res[1][1]
|
89
81
|
end
|
82
|
+
entity.set :tag_set, :penn
|
83
|
+
return 'P' if entity.is_a?(Treat::Entities::Phrase)
|
84
|
+
return 'S' if entity.is_a?(Treat::Entities::Sentence)
|
90
85
|
end
|
91
86
|
end
|
92
87
|
end
|
@@ -15,7 +15,7 @@ module Treat
|
|
15
15
|
# Project website: http://engtagger.rubyforge.org/
|
16
16
|
# Original Perl module site:
|
17
17
|
# http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
|
18
|
-
class Lingua
|
18
|
+
class Lingua < Tagger
|
19
19
|
# Require the 'engtagger' gem.
|
20
20
|
silence_warnings { require 'engtagger' }
|
21
21
|
# Hold the tagger.
|
@@ -24,8 +24,8 @@ module Treat
|
|
24
24
|
@@options = {}
|
25
25
|
# Hold the default options.
|
26
26
|
DefaultOptions = {
|
27
|
-
unknown_word_tag
|
28
|
-
relax
|
27
|
+
:unknown_word_tag => 'pp', # Fix unknown word tag
|
28
|
+
:relax => false
|
29
29
|
}
|
30
30
|
# Tag the word using a probabilistic model taking
|
31
31
|
# into account known words found in a lexicon and
|
@@ -38,24 +38,29 @@ module Treat
|
|
38
38
|
# particularly words used polysemously.
|
39
39
|
# - (String) :unknown_word_tag => Tag for unknown words.
|
40
40
|
def self.tag(entity, options = {})
|
41
|
+
options = DefaultOptions.merge(options)
|
42
|
+
r = super(entity, options)
|
43
|
+
return r if r && r != :isolated_word
|
41
44
|
# Reinitialize the tagger if the options have changed.
|
42
45
|
if options != @@options
|
43
46
|
@@options = DefaultOptions.merge(options)
|
44
47
|
@@tagger = nil # Reset the tagger
|
45
48
|
end
|
46
49
|
@@tagger ||= ::EngTagger.new(@@options)
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
50
|
+
left_tag = @@tagger.conf[:current_tag] = 'pp'
|
51
|
+
tokens = (r == :isolated_word) ? [entity] : entity.tokens
|
52
|
+
tokens.each do |token|
|
53
|
+
w = @@tagger.clean_word(token.to_s)
|
54
|
+
t = @@tagger.assign_tag(left_tag, w)
|
55
|
+
t = options[:unknown_word_tag] if t.nil? || t == ''
|
56
|
+
@@tagger.conf[:current_tag] = left_tag = t
|
57
|
+
token.set :tag, t.upcase
|
58
|
+
token.set :tag_set, :penn
|
59
|
+
return t.upcase if r == :isolated_word
|
54
60
|
end
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
t.upcase
|
61
|
+
entity.set :tag_set, :penn
|
62
|
+
return 'P' if entity.is_a?(Treat::Entities::Phrase)
|
63
|
+
return 'S' if entity.is_a?(Treat::Entities::Sentence)
|
59
64
|
end
|
60
65
|
end
|
61
66
|
end
|
@@ -1,85 +1,76 @@
|
|
1
1
|
module Treat
|
2
2
|
module Lexicalizers
|
3
3
|
module Tag
|
4
|
-
class Stanford
|
5
|
-
|
6
|
-
silence_warnings do
|
7
|
-
require 'rjb'
|
8
|
-
jar = "#{Treat.bin}/stanford-tagger*/stanford-postagger*.jar"
|
9
|
-
jars = Dir.glob(jar)
|
10
|
-
if jars.empty? || !File.readable?(jars[0])
|
11
|
-
raise "Could not find stanford tagger JAR file (looking in #{jar})."+
|
12
|
-
" You may need to manually download the JAR files and/or set Treat.bin."
|
13
|
-
end
|
14
|
-
Rjb::load(jars[0], ['-Xms256M', '-Xmx512M'])
|
15
|
-
MaxentTagger = ::Rjb::import('edu.stanford.nlp.tagger.maxent.MaxentTagger')
|
16
|
-
Word = ::Rjb::import('edu.stanford.nlp.ling.Word')
|
17
|
-
List = ::Rjb::import('java.util.ArrayList')
|
18
|
-
end
|
19
|
-
# A list of models to use by language.
|
20
|
-
# Other models are available; see the models/ folder
|
21
|
-
# in the Stanford Tagger distribution files.
|
22
|
-
LanguageToModel = {
|
23
|
-
eng: 'english-left3words-distsim.tagger',
|
24
|
-
ger: 'german-fast.tagger',
|
25
|
-
fra: 'french.tagger',
|
26
|
-
ara: 'arabic-fast.tagger',
|
27
|
-
chi: 'chinese.tagger'
|
28
|
-
}
|
4
|
+
class Stanford < Tagger
|
5
|
+
require 'stanford-core-nlp'
|
29
6
|
# Hold one tagger per language.
|
30
7
|
@@taggers = {}
|
31
|
-
# Hold the user-set options for each language.
|
32
|
-
@@options = {}
|
33
8
|
# Hold the default options.
|
34
|
-
DefaultOptions = {
|
9
|
+
DefaultOptions = {
|
10
|
+
:tagger_model => nil,
|
11
|
+
:silence => false,
|
12
|
+
:log_to_file => nil
|
13
|
+
}
|
14
|
+
LanguageToTagSet = {
|
15
|
+
:eng => :penn,
|
16
|
+
:ger => :negra,
|
17
|
+
:chi => :penn_chinese,
|
18
|
+
:fre => :simple
|
19
|
+
}
|
35
20
|
# Tag the word using one of the Stanford taggers.
|
36
21
|
def self.tag(entity, options = {})
|
22
|
+
# Handle options and set models.
|
23
|
+
options = DefaultOptions.merge(options)
|
24
|
+
r = super(entity, options)
|
25
|
+
return r if r && r != :isolated_word
|
26
|
+
# Arrange options.
|
37
27
|
lang = entity.language
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
else
|
42
|
-
model = LanguageToModel[lang]
|
43
|
-
if model.nil?
|
44
|
-
raise Treat::Exception, "There exists no Stanford tagger model for " +
|
45
|
-
"the #{Treat::Languages.describe(lang)} language ."
|
46
|
-
end
|
28
|
+
@@tag_set = LanguageToTagSet[lang]
|
29
|
+
unless @@tag_set
|
30
|
+
warn "The tag set for the tagger you are requiring is not supported."
|
47
31
|
end
|
48
|
-
|
49
|
-
if options
|
50
|
-
|
51
|
-
|
32
|
+
|
33
|
+
if options[:tagger_model]
|
34
|
+
::StanfordCoreNLP.set_model(
|
35
|
+
'pos.model', options[:tagger_model]
|
36
|
+
)
|
52
37
|
end
|
53
|
-
if
|
54
|
-
|
55
|
-
models = Dir.glob(model)
|
56
|
-
if models.empty? || !File.readable?(models[0])
|
57
|
-
raise "Could not find a tagger model for the " +
|
58
|
-
"#{Treat::Languages.describe(lang)}: looking in #{model}."
|
59
|
-
end
|
60
|
-
silence_streams(STDOUT, STDERR) do
|
61
|
-
@@taggers[lang] =
|
62
|
-
MaxentTagger.new(models[0])
|
63
|
-
end
|
38
|
+
if options[:silence]
|
39
|
+
options[:log_to_file] = '/dev/null'
|
64
40
|
end
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
i = 0
|
69
|
-
[entity].each do |word| # Fix...
|
70
|
-
list.add(Word.new(word.to_s))
|
71
|
-
id_list[i] = word
|
72
|
-
i += 1
|
41
|
+
if options[:log_to_file]
|
42
|
+
::StanfordCoreNLP.log_file =
|
43
|
+
options[:log_to_file]
|
73
44
|
end
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
45
|
+
|
46
|
+
# Load the tagger.
|
47
|
+
StanfordCoreNLP.use(lang)
|
48
|
+
@@taggers[lang] ||= ::StanfordCoreNLP.load(:tokenize, :ssplit, :pos)
|
49
|
+
# Tag the text.
|
50
|
+
text = ::StanfordCoreNLP::Text.new(entity.to_s)
|
51
|
+
@@taggers[lang].annotate(text)
|
52
|
+
# Realign the tags.
|
53
|
+
entity.each_token do |t1|
|
54
|
+
text.get(:sentences).each do |sentence|
|
55
|
+
sentence.get(:tokens).each do |t2|
|
56
|
+
if t2.value == t1.value
|
57
|
+
tag = t2.get(:part_of_speech).to_s
|
58
|
+
tag_s, tag_opt = *tag.split('-')
|
59
|
+
tag_s ||= ''
|
60
|
+
t1.set :tag, tag_s
|
61
|
+
t1.set :tag_opt, tag_opt
|
62
|
+
t1.set :tag_set, @@tag_set if @@tag_set
|
63
|
+
return tag_s if r == :isolated_word
|
64
|
+
break
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
81
68
|
end
|
82
|
-
|
69
|
+
|
70
|
+
# Handle tags for sentences and phrases.
|
71
|
+
entity.set :tag_set, @@tag_set if @@tag_set
|
72
|
+
return 'P' if entity.is_a?(Treat::Entities::Phrase)
|
73
|
+
return 'S' if entity.is_a?(Treat::Entities::Sentence)
|
83
74
|
end
|
84
75
|
end
|
85
76
|
end
|