treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -1,16 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
end
|
1
|
+
# Language configuration for Swedish: declares which gem
# dependencies and which workers are listed for this language.
class Treat::Languages::Swedish

  # No gem dependencies are declared for Swedish.
  RequiredDependencies = []
  OptionalDependencies = []

  # No Swedish workers are declared for these categories.
  Extractors = {}
  Inflectors = {}
  Lexicalizers = {}

  # Processor workers declared for Swedish, keyed by worker group.
  Processors = {
    :chunkers   => [:txt],
    :segmenters => [:punkt],
    :tokenizers => [:perl, :tactful]
  }

  # No Swedish retrievers are declared.
  Retrievers = {}

end
|
data/lib/treat/lexicalizers.rb
CHANGED
@@ -1,57 +1,36 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
# Return the general category of a word.
|
14
|
-
module Category
|
15
|
-
extend Group
|
16
|
-
self.type = :annotator
|
17
|
-
self.targets = [:word]
|
18
|
-
self.default = :from_tag
|
19
|
-
end
|
20
|
-
|
21
|
-
# Lexicons are dictionaries of semantically linked
|
22
|
-
# word forms.
|
23
|
-
module Synsets
|
24
|
-
extend Group
|
25
|
-
self.type = :annotator
|
26
|
-
self.targets = [:word]
|
27
|
-
self.postprocessors = {
|
28
|
-
:synonyms => lambda do |entity, synsets|
|
29
|
-
synsets.collect { |ss| ss.synonyms }.flatten -
|
30
|
-
[entity.value]
|
31
|
-
end,
|
32
|
-
:antonyms => lambda do |entity, synsets|
|
33
|
-
synsets.collect { |ss| ss.antonyms }.flatten
|
34
|
-
end,
|
35
|
-
:hyponyms => lambda do |entity, synsets|
|
36
|
-
synsets.collect { |ss| ss.hyponyms }.flatten
|
37
|
-
end,
|
38
|
-
:hypernyms => lambda do |entity, synsets|
|
39
|
-
synsets.collect { |ss| ss.hypernyms }.flatten
|
40
|
-
end
|
41
|
-
}
|
42
|
-
end
|
43
|
-
|
44
|
-
module Linkages
|
45
|
-
extend Group
|
46
|
-
self.type = :annotator
|
47
|
-
self.targets = [:zone]
|
48
|
-
self.presets = {
|
49
|
-
:is_a => {:linkage => :is_a},
|
50
|
-
:synonym_of => {:linkage => :synonym_of},
|
51
|
-
:antonym_of => {:linkage => :antonym_of}
|
52
|
-
}
|
53
|
-
end
|
54
|
-
|
55
|
-
extend Treat::Category
|
1
|
+
# Lexicalizers retrieve lexical information about an
# entity: part of speech tag, general word category,
# synsets, synonyms, antonyms, hyponyms, hypernyms,
# lexical relations and grammatical links.
module Treat::Lexicalizers

  # Taggers return the part of speech tag of a word.
  # They can be called on sentences, phrases and tokens.
  module Taggers
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:sentence, :phrase, :token]
  end

  # Categorizers return the general category of a word.
  # The :from_tag worker is used by default.
  module Categorizers
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:token]
    self.default = :from_tag
  end

  # Sensers find the synsets of a word in a lexicon.
  # NOTE(review): presumably each preset name is handed to the
  # worker as the :nym option -- confirm in Treat::Groupable.
  module Sensers
    extend Treat::Groupable
    self.type          = :annotator
    self.targets       = [:word]
    self.preset_option = :nym
    self.presets       = [:synonyms, :antonyms, :hyponyms, :hypernyms]
  end

  # Make Lexicalizers categorizable. Kept last so that the
  # groups above are already defined when the module is extended.
  extend Treat::Categorizable

end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# Finds the general part of speech of an entity
|
2
|
+
# (:sentence, :noun_phrase, :verb, :adverb, etc.)
|
3
|
+
# from its tag (e.g. 'S', 'NP', 'VBZ', 'ADV', etc.).
|
4
|
+
class Treat::Lexicalizers::Categorizers::FromTag

  # Short aliases for the tag-to-category lookup tables.
  Pttc = Treat::Linguistics::Tags::PhraseTagToCategory
  Wttc = Treat::Linguistics::Tags::WordTagToCategory
  Ptc = Treat::Linguistics::Tags::PunctuationToCategory

  # Find the category of the entity from its tag.
  #
  # Returns:
  # - :unknown if the tag is nil or empty, if the entity is
  #   a symbol, or if no lookup table has an entry for the tag;
  # - :sentence if the tag is 'S' or the entity is a sentence;
  # - :number if the entity is a number;
  # - the Ptc entry for the entity's string value if the
  #   entity is a punctuation;
  # - otherwise, the category registered for the tag under the
  #   tag set found on the entity or on its parent phrase.
  #
  # Raises Treat::Exception if no tag set can be found, or if
  # the tag has no category under the tag set that was found.
  #
  # The options argument is accepted but not used.
  def self.category(entity, options = {})

    tag = entity.check_has(:tag)
    type = entity.type

    return :unknown if tag.nil? || tag == '' || type == :symbol
    return :sentence if tag == 'S' || type == :sentence
    return :number if type == :number
    return Ptc[entity.to_s] if type == :punctuation

    # Phrases look up the phrase table first, then fall
    # back on the word table; anything else only uses the
    # word table.
    cat =
      if entity.is_a?(Treat::Entities::Phrase)
        Pttc[tag] || Wttc[tag]
      else
        Wttc[tag]
      end

    return :unknown if cat.nil?

    # The tag set is read from the entity itself, or failing
    # that, from its parent phrase.
    if entity.has?(:tag_set)
      ts = entity.get(:tag_set)
    elsif (parent = entity.parent_phrase) && parent.has?(:tag_set)
      ts = parent.get(:tag_set)
    else
      raise Treat::Exception,
        "No information can be found regarding " +
        "which tag set to use."
    end

    category = cat[ts]
    return category if category

    raise Treat::Exception,
      "The specified tag set (#{ts})" +
      " does not contain the tag #{tag} " +
      "for token #{entity.to_s}."

  end

end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# Obtain lexical information about a word using the
|
2
|
+
# ruby 'wordnet' gem.
|
3
|
+
class Treat::Lexicalizers::Sensers::Wordnet

  # Require the 'wordnet' gem.
  require 'wordnet'

  # Patch for bug: replace the gem's SynsetType constant
  # with a table covering nouns, verbs and adjectives.
  ::WordNet.module_eval do
    remove_const(:SynsetType)
    const_set(:SynsetType,
      {"n" => "noun", "v" => "verb", "a" => "adj"})
  end

  # Require an adaptor for Wordnet synsets.
  require 'treat/lexicalizers/sensers/wordnet/synset'

  # Noun, adjective and verb indexes, keyed by the
  # capitalized category name and created on first use.
  @@indexes = {}

  # Obtain lexical information about a word using the
  # ruby 'wordnet' gem.
  #
  # Options:
  #
  # - :nym => name of the method to call on each synset
  #   adaptor (required).
  #
  # Returns a flat array of unique values from which the
  # word's own value has been removed, or an empty array
  # if the word is not a noun, adjective or verb, or is
  # not found in the index.
  #
  # Raises Treat::Exception if the :nym option is missing.
  def self.sense(word, options = nil)

    # The default is nil; without this guard, indexing it
    # raised NoMethodError instead of the exception below.
    options ||= {}

    category = word.check_has(:category)

    unless options[:nym]
      raise Treat::Exception, "You must supply " +
        "the :nym option (:synonym, :hypernym, etc.)"
    end

    return [] unless [:noun, :adjective, :verb].include?(word.category)

    cat = category.to_s.capitalize

    @@indexes[cat] ||=
      ::WordNet.const_get(cat + 'Index').instance
    lemma = @@indexes[cat].find(word.value.downcase)

    return [] if lemma.nil?

    synsets = lemma.synsets.map do |synset|
      Treat::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
    end

    # Each synset answers with an array, so the results must
    # be flattened before the word itself can be subtracted;
    # subtracting from the array of arrays removed nothing.
    nyms = synsets.map { |ss| ss.send(options[:nym]) }.flatten

    (nyms - [word.value]).uniq

  end

end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# An adaptor for synsets used by the Wordnet gem.
|
2
|
+
class Treat::Lexicalizers::Sensers::Wordnet::Synset

  # The POS tag of the word.
  attr_accessor :pos
  # The definition of the synset.
  attr_accessor :definition
  # The examples in the synset.
  attr_accessor :examples

  # Wrap a synset object and extract the POS tag, the
  # definition and the examples from its string form.
  def initialize(synset)
    @original_synset = synset
    @pos, @definition, @examples =
      parse_synset(synset.to_s.split(')'))
  end

  # Parse the string form of a synset, already split
  # on ')'. The first piece, minus its first character,
  # is the POS tag. The text after the first '(' in the
  # second piece is split on ';': the first part is the
  # definition, and each following part is an example
  # with its first and last characters removed.
  # NOTE(review): assumes a "(pos) words (definition;
  # "example")" layout -- confirm against the gem.
  #
  # Returns the POS tag, the definition and the examples.
  def parse_synset(res)
    pos = res[0][1..-1].strip
    parts = res[1].split('(')[1].split(';')
    definition = parts[0]
    rest = parts[1..-1]
    # The slice is nil when there are no parts at all.
    examples = rest && rest.map { |ex| ex.strip[1..-2] }
    return pos, definition, examples
  end

  # The words in the synset.
  def words
    @original_synset.words
  end

  # The synonyms are the words in the synset.
  def synonyms
    @original_synset.words
  end

  # A gloss (short definition with examples)
  # for the synset.
  def gloss
    @original_synset.gloss
  end

  # The antonym sets of the synset.
  def antonyms
    antonym.collect { |a| a.words }
  end

  # The hypernym sets of the synset; an empty
  # array when the synset has no hypernym.
  def hypernyms
    h = hypernym
    return [] unless h
    h.words
  end

  # The hyponym sets of the synset.
  def hyponyms
    hyponym.collect { |h| h.words }
  end

  # Forward unknown methods to the wrapped synset, along
  # with their arguments and block (these used to be
  # dropped). A result that is itself an adaptor is
  # wrapped in a new adaptor.
  # NOTE(review): the wrapped gem presumably returns its
  # own synset class rather than this adaptor, so the
  # wrapping branch may never run -- confirm the intent.
  def method_missing(sym, *args, &block)
    ret = @original_synset.send(sym, *args, &block)
    if ret.is_a?(Treat::Lexicalizers::Sensers::Wordnet::Synset)
      # 'new' lives on the class, not on the instance.
      self.class.new(ret)
    else
      ret
    end
  end

  # Keep respond_to? consistent with method_missing.
  def respond_to_missing?(sym, include_private = false)
    @original_synset.respond_to?(sym, include_private) || super
  end

end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# Adapter class for the 'rbtagger' gem, a port
|
2
|
+
# of the Perl Lingua::BrillTagger class, based
|
3
|
+
# on the rule-based tagger developed by Eric Brill.
|
4
|
+
#
|
5
|
+
# Original paper:
|
6
|
+
#
|
7
|
+
# Eric Brill. 1992. A simple rule-based part of speech tagger.
|
8
|
+
# In Proceedings of the third conference on Applied natural
|
9
|
+
# language processing (ANLC '92). Association for Computational
|
10
|
+
# Linguistics, Stroudsburg, PA, USA, 152-155.
|
11
|
+
# DOI=10.3115/974499.974526 http://dx.doi.org/10.3115/974499.974526
|
12
|
+
#
|
13
|
+
# Project website:
|
14
|
+
#
|
15
|
+
# http://rbtagger.rubyforge.org/
|
16
|
+
module Treat::Lexicalizers::Taggers::Brill

  require 'rbtagger'

  require 'treat/lexicalizers/taggers/brill/patch'

  # Hold one instance of the tagger. It is built on the
  # first call to tag and reused afterwards, so the file
  # options of later calls are not taken into account.
  @@tagger = nil

  # Tag words using a native Brill tagger.
  # Performs own tokenization.
  #
  # Options (see the rbtagger gem for more info):
  #
  # :lexicon => String (Lexicon file to use)
  # :lexical_rules => String (Lexical rule file to use)
  # :contextual_rules => String (Contextual rules file to use)
  #
  # Returns the tag for an isolated token, 'S' for a
  # sentence, 'P' for a phrase and nil otherwise.
  def self.tag(entity, options = {})

    isolated_token = entity.is_a?(Treat::Entities::Token)

    # Tokenize the sentence/phrase if that wasn't done yet.
    unless isolated_token || entity.has_children?
      entity.tokenize(:perl, options)
    end

    # Create the tagger if necessary.
    @@tagger ||= ::Brill::Tagger.new(
      options[:lexicon],
      options[:lexical_rules],
      options[:contextual_rules])

    tokens = isolated_token ? [entity] : entity.tokens
    tags = @@tagger.tag_tokens(tokens.map { |t| t.value })

    tokens.zip(tags).each do |token, pos_tag|
      token.set :tag, pos_tag
      next unless isolated_token
      # An isolated token carries its own tag set
      # and its tag is the result of the call.
      token.set :tag_set, :penn
      return pos_tag
    end

    is_sentence = entity.is_a?(Treat::Entities::Sentence)
    is_phrase = entity.is_a?(Treat::Entities::Phrase)

    # The tag set is recorded on sentences and on
    # phrases that have no parent sentence.
    if is_sentence || (is_phrase && !entity.parent_sentence)
      entity.set :tag_set, :penn
    end

    if is_sentence
      'S'
    elsif is_phrase
      'P'
    end

  end

end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Set to true once the top-level constant Word has been
# removed, so that it is removed at most once and put
# back when loading is over.
patch = false

begin
  # This whole mess is required to deal with
  # the fact that the 'rbtagger' gem defines
  # a top-level module called 'Word', which
  # will clash with the top-level class 'Word'
  # we define when syntactic sugar is enabled.
  #
  # The gem is loaded here so that the rescue clause
  # below can actually be reached; this is a no-op
  # if the gem has already been loaded.
  # NOTE(review): brill.rb requires 'rbtagger' before
  # requiring this file, so a clash raised there never
  # gets here -- confirm the intended load order.
  require 'rbtagger'
rescue TypeError
  if Treat.sweetened? && !patch
    patch = true
    # Unset the class Word for the duration
    # of loading the tagger. remove_const is a
    # private method of Module, hence the send.
    Object.send(:remove_const, :Word)
    retry
  else
    raise Treat::Exception,
      'Something went wrong due to a name clash with the "rbtagger" gem. ' +
      'Turn off syntactic sugar to resolve this problem.'
  end
ensure
  # Reset the class Word if using syntactic sugar.
  if Treat.sweetened? && patch
    # Drop whatever the gem defined under that name
    # first, to avoid a constant redefinition warning.
    Object.send(:remove_const, :Word) if Object.const_defined?(:Word)
    Object.const_set(:Word, Treat::Entities::Word)
  end
end
|
26
|
+
|
27
|
+
Brill::Tagger.class_eval do

  # Tag an array of token strings and return the array
  # of tags. The tokens array is padded in place while
  # the contextual rules run, then restored.
  def tag_tokens(tokens)

    tags = Brill::Tagger.tag_start(tokens)

    @tagger.apply_lexical_rules(tokens, tags, [], 0)
    @tagger.default_tag_finish(tokens, tags)

    # Brill uses these fake "STAART" tags to delimit the start & end of sentence.
    [tokens, tags].each do |list|
      2.times { list << "STAART" }
      2.times { list.unshift "STAART" }
    end

    @tagger.apply_contextual_rules(tokens, tags, 1)

    # Take the two delimiters off each end again.
    [tags, tokens].each do |list|
      2.times { list.shift }
      2.times { list.pop }
    end

    tags

  end

end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# An adapter for the 'engtagger' gem, which
|
2
|
+
# is a port of the Perl Lingua::EN::Tagger module.
|
3
|
+
#
|
4
|
+
# "This module uses part-of-speech statistics from
|
5
|
+
# the Penn Treebank to assign POS tags to English text.
|
6
|
+
# The tagger applies a bigram (two-word) Hidden Markov
|
7
|
+
# Model to guess the appropriate POS tag for a word.
|
8
|
+
# That means that the tagger will try to assign a POS
|
9
|
+
# tag based on the known POS tags for a given word and
|
10
|
+
# the POS tag assigned to its predecessor."
|
11
|
+
#
|
12
|
+
# Project website: http://engtagger.rubyforge.org/
|
13
|
+
# Original Perl module site:
|
14
|
+
# http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
|
15
|
+
class Treat::Lexicalizers::Taggers::Lingua

  # Require the 'engtagger' gem.
  silence_warnings { require 'engtagger' }

  # Undefine the porter stemming business, if there is any:
  # an unconditional undef raises NameError when String
  # has no stem method.
  # NOTE(review): String#stem is presumably added by the
  # 'engtagger' gem -- confirm.
  String.class_eval do
    if method_defined?(:stem) || private_method_defined?(:stem)
      undef_method(:stem)
    end
  end

  # Hold one instance of the tagger. It is built on the
  # first call to tag and reused afterwards.
  @@tagger = nil

  # Hold the default options.
  DefaultOptions = { :relax => false }

  # Replace punctuation tags used by this gem
  # to the standard PTB tags.
  Punctuation = {
    'pp' => '.',
    'pps' => ';',
    'ppc' => ',',
    'ppd' => '$',
    'ppl' => 'lrb',
    'ppr' => 'rrb'
  }

  # Tag the word using a probabilistic model taking
  # into account known words found in a lexicon and
  # the tag of the previous word.
  #
  # Options:
  #
  # - (Boolean) :relax => Relax the HMM model -
  # this may improve accuracy for uncommon words,
  # particularly words used polysemously.
  #
  # Returns the upcased tag for an isolated token, 'S'
  # for a sentence, 'P' for a phrase and nil otherwise.
  def self.tag(entity, options = {})

    isolated_token = entity.is_a?(Treat::Entities::Token)

    # Tokenize the entity if that wasn't done yet.
    entity.tokenize unless isolated_token || entity.has_children?

    options = DefaultOptions.merge(options)

    @@tagger ||= ::EngTagger.new(options)
    # Start from the 'pp' tag, which the table
    # above maps to the sentence-final period.
    left_tag = @@tagger.conf[:current_tag] = 'pp'
    tokens = isolated_token ? [entity] : entity.tokens

    tokens.each do |token|
      next if token.to_s == ''
      w = @@tagger.clean_word(token.to_s)
      t = @@tagger.assign_tag(left_tag, w)
      t = 'fw' if t.nil? || t == ''
      # The tagger is fed its own tag for the previous
      # word, before the conversions made below.
      @@tagger.conf[:current_tag] = left_tag = t
      t = 'prp$' if t == 'prps'
      t = 'dt' if t == 'det'
      t = Punctuation.fetch(t, t)
      token.set :tag, t.upcase
      next unless isolated_token
      token.set :tag_set, :penn
      return t.upcase
    end

    is_sentence = entity.is_a?(Treat::Entities::Sentence)
    is_phrase = entity.is_a?(Treat::Entities::Phrase)

    # The tag set is recorded on sentences and on
    # phrases that have no parent sentence.
    if is_sentence || (is_phrase && !entity.parent_sentence)
      entity.set :tag_set, :penn
    end

    return 'S' if is_sentence
    return 'P' if is_phrase

  end

end
|