treat 0.2.5 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -1,42 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Inflectors
|
3
|
-
module Declensions
|
4
|
-
# This class is a wrapper for the functions included
|
5
|
-
# in the 'linguistics' gem that allow to obtain the
|
6
|
-
# declensions of a word.
|
7
|
-
#
|
8
|
-
# Project website: http://deveiate.org/projects/Linguistics/
|
9
|
-
class Linguistics
|
10
|
-
require 'treat/helpers/linguistics_loader'
|
11
|
-
# Retrieve a declension of a word using the 'linguistics' gem.
|
12
|
-
#
|
13
|
-
# Options:
|
14
|
-
#
|
15
|
-
# - (Identifier) :count => :singular, :plural
|
16
|
-
def self.declensions(entity, options = {})
|
17
|
-
unless options[:count]
|
18
|
-
raise Treat::Exception,
|
19
|
-
"Must supply option count (:singular or :plural)."
|
20
|
-
end
|
21
|
-
klass = Treat::Helpers::LinguisticsLoader.load(entity.language)
|
22
|
-
string = entity.to_s
|
23
|
-
if entity.category == :verb
|
24
|
-
raise Treat::Exception,
|
25
|
-
"Cannot retrieve the declensions of a verb. " +
|
26
|
-
"Use #singular_verb and #plural_verb instead."
|
27
|
-
end
|
28
|
-
if options[:count] == :plural
|
29
|
-
if entity.has?(:category) &&
|
30
|
-
[:noun, :adjective, :verb].include?(entity.category)
|
31
|
-
silence_warnings do
|
32
|
-
klass.send(:"plural_#{entity.category}", string)
|
33
|
-
end
|
34
|
-
else
|
35
|
-
silence_warnings { klass.plural(string) }
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|
@@ -1,20 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Inflectors
|
3
|
-
module OrdinalWords
|
4
|
-
# This class is a wrapper for the functions included
|
5
|
-
# in the 'linguistics' gem that allow to describe a
|
6
|
-
# number in words in ordinal form.
|
7
|
-
#
|
8
|
-
# Project website: http://deveiate.org/projects/Linguistics/
|
9
|
-
class Linguistics
|
10
|
-
require 'treat/helpers/linguistics_loader'
|
11
|
-
# Desribe a number in words in ordinal form, using the
|
12
|
-
# 'linguistics' gem.
|
13
|
-
def self.ordinal_words(number, options = {})
|
14
|
-
klass = Treat::Helpers::LinguisticsLoader.load(number.language)
|
15
|
-
klass.ordinate(number.to_s)
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
@@ -1,162 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Inflectors
|
3
|
-
module Stem
|
4
|
-
# Stem a word using a native Ruby implementation of the
|
5
|
-
# Porter stemming algorithm, ported to Ruby from a
|
6
|
-
# version coded up in Perl. This is a simplified
|
7
|
-
# implementation; for a true and fast Porter stemmer,
|
8
|
-
# see Treat::Inflectors::Stem::PorterC.
|
9
|
-
#
|
10
|
-
# Authored by Ray Pereda (raypereda@hotmail.com).
|
11
|
-
# Unknown license.
|
12
|
-
#
|
13
|
-
# Original paper: Porter, 1980. An algorithm for suffix stripping,
|
14
|
-
# Program, Vol. 14, no. 3, pp 130-137,
|
15
|
-
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
16
|
-
class Porter
|
17
|
-
# Returns the stem of a word using a native Porter stemmer.
|
18
|
-
#
|
19
|
-
# Options: none.
|
20
|
-
def self.stem(word, options = {})
|
21
|
-
# Copy the word and convert it to a string.
|
22
|
-
w = word.to_s
|
23
|
-
return w if w.length < 3
|
24
|
-
# Map initial y to Y so that the patterns
|
25
|
-
# never treat it as vowel.
|
26
|
-
w[0] = 'Y' if w[0] == ?y
|
27
|
-
# Step 1a
|
28
|
-
if w =~ /(ss|i)es$/
|
29
|
-
w = $` + $1
|
30
|
-
elsif w =~ /([^s])s$/
|
31
|
-
w = $` + $1
|
32
|
-
end
|
33
|
-
# Step 1b
|
34
|
-
if w =~ /eed$/
|
35
|
-
w.chop! if $` =~ MGR0
|
36
|
-
elsif w =~ /(ed|ing)$/
|
37
|
-
stem = $`
|
38
|
-
if stem =~ VOWEL_IN_STEM
|
39
|
-
w = stem
|
40
|
-
case w
|
41
|
-
when /(at|bl|iz)$/ then w << "e"
|
42
|
-
when /([^aeiouylsz])\1$/ then w.chop!
|
43
|
-
when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
if w =~ /y$/
|
48
|
-
stem = $`
|
49
|
-
w = stem + "i" if stem =~ VOWEL_IN_STEM
|
50
|
-
end
|
51
|
-
# Step 2
|
52
|
-
if w =~ SUFFIX_1_REGEXP
|
53
|
-
stem = $`
|
54
|
-
suffix = $1
|
55
|
-
if stem =~ MGR0
|
56
|
-
w = stem + STEP_2_LIST[suffix]
|
57
|
-
end
|
58
|
-
end
|
59
|
-
# Step 3
|
60
|
-
if w =~
|
61
|
-
/(icate|ative|alize|iciti|ical|ful|ness)$/
|
62
|
-
stem = $`
|
63
|
-
suffix = $1
|
64
|
-
if stem =~ MGR0
|
65
|
-
w = stem + STEP_3_LIST[suffix]
|
66
|
-
end
|
67
|
-
end
|
68
|
-
# Step 4
|
69
|
-
if w =~ SUFFIX_2_REGEXP
|
70
|
-
stem = $`
|
71
|
-
if stem =~ MGR1
|
72
|
-
w = stem
|
73
|
-
end
|
74
|
-
elsif w =~ /(s|t)(ion)$/
|
75
|
-
stem = $` + $1
|
76
|
-
if stem =~ MGR1
|
77
|
-
w = stem
|
78
|
-
end
|
79
|
-
end
|
80
|
-
# Step 5
|
81
|
-
if w =~ /e$/
|
82
|
-
stem = $`
|
83
|
-
if (stem =~ MGR1) ||
|
84
|
-
(stem =~ MEQ1 && stem !~
|
85
|
-
/^#{CC}#{V}[^aeiouwxy]$/o)
|
86
|
-
w = stem
|
87
|
-
end
|
88
|
-
end
|
89
|
-
if w =~ /ll$/ && w =~ MGR1
|
90
|
-
w.chop!
|
91
|
-
end
|
92
|
-
# and turn initial Y back to y
|
93
|
-
w[0] = 'y' if w[0] == ?Y
|
94
|
-
w
|
95
|
-
end
|
96
|
-
|
97
|
-
STEP_2_LIST = {
|
98
|
-
'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
|
99
|
-
'izer'=>'ize', 'bli'=>'ble',
|
100
|
-
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
|
101
|
-
'ization'=>'ize', 'ation'=>'ate',
|
102
|
-
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
|
103
|
-
'ousness'=>'ous', 'anati'=>'al',
|
104
|
-
'iviti'=>'ive', 'binati'=>'ble', 'logi'=>'log'
|
105
|
-
}
|
106
|
-
STEP_3_LIST = {
|
107
|
-
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
|
108
|
-
'ical'=>'ic', 'ful'=>'', 'ness'=>''
|
109
|
-
}
|
110
|
-
SUFFIX_1_REGEXP = /(
|
111
|
-
ational |
|
112
|
-
tional |
|
113
|
-
enci |
|
114
|
-
anci |
|
115
|
-
izer |
|
116
|
-
bli |
|
117
|
-
alli |
|
118
|
-
entli |
|
119
|
-
eli |
|
120
|
-
ousli |
|
121
|
-
ization |
|
122
|
-
ation |
|
123
|
-
ator |
|
124
|
-
alism |
|
125
|
-
iveness |
|
126
|
-
fulness |
|
127
|
-
ousness |
|
128
|
-
anati |
|
129
|
-
iviti |
|
130
|
-
binati |
|
131
|
-
logi)$/x
|
132
|
-
SUFFIX_2_REGEXP = /(
|
133
|
-
al |
|
134
|
-
ance |
|
135
|
-
ence |
|
136
|
-
er |
|
137
|
-
ic |
|
138
|
-
able |
|
139
|
-
ible |
|
140
|
-
ant |
|
141
|
-
ement |
|
142
|
-
ment |
|
143
|
-
ent |
|
144
|
-
ou |
|
145
|
-
ism |
|
146
|
-
ate |
|
147
|
-
iti |
|
148
|
-
ous |
|
149
|
-
ive |
|
150
|
-
ize)$/x
|
151
|
-
C = "[^aeiou]" # consonant
|
152
|
-
V = "[aeiouy]" # vowel
|
153
|
-
CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
|
154
|
-
VV = "#{V}(?>[aeiou]*)" # vowel sequence
|
155
|
-
MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
|
156
|
-
MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
|
157
|
-
MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
|
158
|
-
VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
|
159
|
-
end
|
160
|
-
end
|
161
|
-
end
|
162
|
-
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Inflectors
|
3
|
-
module Stem
|
4
|
-
# Stems words using the 'ruby-stemmer' gem, which
|
5
|
-
# wraps a C version of the Porter stemming algorithm.
|
6
|
-
#
|
7
|
-
# Project website: https://github.com/aurelian/ruby-stemmer
|
8
|
-
# Original paper: Porter, 1980. An algorithm for suffix stripping,
|
9
|
-
# Program, Vol. 14, no. 3, pp 130-137,
|
10
|
-
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
11
|
-
class PorterC
|
12
|
-
# Require the 'ruby-stemmer' gem.
|
13
|
-
silence_warnings { require 'lingua/stemmer' }
|
14
|
-
# Remove a conflict between this gem and the 'engtagger' gem.
|
15
|
-
::LinguaStemmer = ::Lingua
|
16
|
-
Object.instance_eval { remove_const :Lingua }
|
17
|
-
# Stem the word using a full-blown Porter stemmer in C.
|
18
|
-
#
|
19
|
-
# Options: none.
|
20
|
-
def self.stem(word, options = {})
|
21
|
-
silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
@@ -1,30 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Inflectors
|
3
|
-
module Stem
|
4
|
-
# Stems a word using the UEA algorithm, implemented
|
5
|
-
# by the 'uea-stemmer' gem.
|
6
|
-
#
|
7
|
-
# "Similar to other stemmers, UEA-Lite operates on a
|
8
|
-
# set of rules which are used as steps. There are two
|
9
|
-
# groups of rules: the first to clean the tokens, and
|
10
|
-
# the second to alter suffixes."
|
11
|
-
#
|
12
|
-
# Project website: https://github.com/ealdent/uea-stemmer
|
13
|
-
# Original paper: Jenkins, Marie-Claire, Smith, Dan,
|
14
|
-
# Conservative stemming for search and indexing, 2005.
|
15
|
-
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
|
16
|
-
class UEA
|
17
|
-
# Require the 'uea-stemmer' gem.
|
18
|
-
silence_warnings { require 'uea-stemmer' }
|
19
|
-
# Keep only one copy of the stemmer.
|
20
|
-
@@stemmer = nil
|
21
|
-
# Stems a word using the UEA algorithm, implemented
|
22
|
-
# by the 'uea-stemmer' gem.
|
23
|
-
def self.stem(entity, options = {})
|
24
|
-
@@stemmer ||= silence_warnings { ::UEAStemmer.new }
|
25
|
-
@@stemmer.stem(entity.to_s).strip
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
data/lib/treat/install.rb
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
class Installer
|
3
|
-
require 'rubygems/dependency_installer'
|
4
|
-
# Install required dependencies and optional dependencies
|
5
|
-
# for a specific language.
|
6
|
-
def self.install(language = :english)
|
7
|
-
|
8
|
-
lang = Treat::Languages.get(language)
|
9
|
-
required = lang::RequiredDependencies
|
10
|
-
optional = lang::OptionalDependencies
|
11
|
-
|
12
|
-
puts "Treat Installer\n\n"
|
13
|
-
puts "Installing dependencies for the #{language.to_s.capitalize} language.\n\n"
|
14
|
-
|
15
|
-
flag = false
|
16
|
-
|
17
|
-
inst = Gem::DependencyInstaller.new
|
18
|
-
|
19
|
-
required.each do |dependency|
|
20
|
-
puts "Installing required dependency '#{dependency}'..."
|
21
|
-
begin
|
22
|
-
silence_warnings { inst.install(dependency) }
|
23
|
-
rescue
|
24
|
-
flag = true
|
25
|
-
puts "Couldn't install '#{dependency}'. " +
|
26
|
-
"You need install this dependency manually by running: " +
|
27
|
-
"'gem install #{dependency}' or use 'sudo' to run this script."
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
optional.each do |dependency|
|
32
|
-
begin
|
33
|
-
puts "Install optional dependency '#{dependency}' (yes/no, <enter> = skip) ?"
|
34
|
-
answer = gets.strip
|
35
|
-
raise Treat::Exception unless ['yes', 'no', ''].include?(answer)
|
36
|
-
if answer == 'yes'
|
37
|
-
silence_warnings { inst.install(dependency) }
|
38
|
-
else
|
39
|
-
puts "Skipped installing '#{dependency}'."
|
40
|
-
next
|
41
|
-
end
|
42
|
-
rescue Treat::Exception
|
43
|
-
puts "Invalid input - valid options are 'yes' or 'no'."
|
44
|
-
retry
|
45
|
-
rescue
|
46
|
-
flag = true
|
47
|
-
puts "Couldn't install '#{dependency}'. " +
|
48
|
-
"You can install this dependency manually by running: " +
|
49
|
-
"'gem install #{dependency}' or use 'sudo' to run this script."
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
w = flag ? 'incompletely' : 'normally'
|
54
|
-
puts "\nInstall proceeded #{w}."
|
55
|
-
puts
|
56
|
-
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
data/lib/treat/languages/tags.rb
DELETED
@@ -1,377 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Languages
|
3
|
-
|
4
|
-
module Tags
|
5
|
-
ClawsC5 = 0
|
6
|
-
Brown = 1
|
7
|
-
Penn = 2
|
8
|
-
Negra = 3
|
9
|
-
PennChinese = 4
|
10
|
-
Simple = 5
|
11
|
-
|
12
|
-
PTBClauseTagDescription = [
|
13
|
-
['S', 'Simple declarative clause'],
|
14
|
-
['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
|
15
|
-
['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
|
16
|
-
['SINV', 'Inverted declarative sentence'],
|
17
|
-
['SQ', 'Inverted yes/no question']
|
18
|
-
]
|
19
|
-
|
20
|
-
AlignedPhraseTags =
|
21
|
-
[
|
22
|
-
'Adjective phrase', ['', '', 'ADJP'],
|
23
|
-
'Adverb phrase', ['', '', 'ADVP'],
|
24
|
-
'Conjunction phrase', ['', '', 'CONJP'],
|
25
|
-
'Fragment', ['', '', 'FRAG'],
|
26
|
-
'Interjection', ['', '', 'INTJ'],
|
27
|
-
'List marker', ['', '', 'LST'],
|
28
|
-
'Not a phrase', ['', '', 'NAC'],
|
29
|
-
'Noun phrase', ['', '', 'NP'],
|
30
|
-
'Head of NP', ['', '', 'NX'],
|
31
|
-
'Prepositional phrase', ['', '', 'PP'],
|
32
|
-
'Parenthetical', ['', '', 'PRN'],
|
33
|
-
'Particle', ['', '', 'PRT'],
|
34
|
-
'Quantifier phrase', ['', '', 'QP'],
|
35
|
-
'Reduced relative clause', ['', '', 'RRC'],
|
36
|
-
'Unlike coordinated phrase', ['', '', 'UCP'],
|
37
|
-
'Verb phrase', ['', '', 'VP'],
|
38
|
-
'Wh adjective phrase', ['', '', 'WHADJP'],
|
39
|
-
'Wh adverb phrase', ['', '', 'WHAVP'],
|
40
|
-
'Wh noun phrase', ['', '', 'WHNP'],
|
41
|
-
'Wh prepositional phrase', ['', '', 'WHPP'],
|
42
|
-
'Unknown', ['', '', 'X'],
|
43
|
-
'Phrase', ['', '', 'P'],
|
44
|
-
'Sentence', ['', '', 'S'],
|
45
|
-
'Phrase', ['', '', 'SBAR'] # Fix
|
46
|
-
]
|
47
|
-
|
48
|
-
# A description of Enju categories.
|
49
|
-
EnjuCatDescription = [
|
50
|
-
['ADJ', 'Adjective'],
|
51
|
-
['ADV', 'Adverb'],
|
52
|
-
['CONJ', 'Coordination conjunction'],
|
53
|
-
['C', 'Complementizer'],
|
54
|
-
['D', 'Determiner'],
|
55
|
-
['N', 'Noun'],
|
56
|
-
['P', 'Preposition'],
|
57
|
-
['SC', 'Subordination conjunction'],
|
58
|
-
['V', 'Verb'],
|
59
|
-
['COOD', 'Part of coordination'],
|
60
|
-
['PN', 'Punctuation'],
|
61
|
-
['PRT', 'Particle'],
|
62
|
-
['S', 'Sentence']
|
63
|
-
]
|
64
|
-
|
65
|
-
# Maps Enju categories to Treat categories.
|
66
|
-
EnjuCatToCategory = {
|
67
|
-
'ADJ' => :adjective,
|
68
|
-
'ADV' => :adverb,
|
69
|
-
'CONJ' => :conjunction,
|
70
|
-
'COOD' => :conjunction,
|
71
|
-
'C' => :complementizer,
|
72
|
-
'D' => :determiner,
|
73
|
-
'N' => :noun,
|
74
|
-
'P' => :preposition,
|
75
|
-
'PN' => :punctuation,
|
76
|
-
'SC' => :conjunction,
|
77
|
-
'V' => :verb,
|
78
|
-
'PRT' => :particle
|
79
|
-
}
|
80
|
-
|
81
|
-
# Description of the xcat in the Enju output specification.
|
82
|
-
EnjuXCatDescription = [
|
83
|
-
['COOD', 'Coordinated phrase/clause'],
|
84
|
-
['IMP', 'Imperative sentence'],
|
85
|
-
['INV', 'Subject-verb inversion'],
|
86
|
-
['Q', 'Interrogative sentence with subject-verb inversion'],
|
87
|
-
['REL', 'A relativizer included'],
|
88
|
-
['FREL', 'A free relative included'],
|
89
|
-
['TRACE', 'A trace included'],
|
90
|
-
['WH', 'A wh-question word included']
|
91
|
-
]
|
92
|
-
|
93
|
-
EnjuCatXcatToPTB = [
|
94
|
-
['ADJP', '', 'ADJP'],
|
95
|
-
['ADJP', 'REL', 'WHADJP'],
|
96
|
-
['ADJP', 'FREL', 'WHADJP'],
|
97
|
-
['ADJP', 'WH', 'WHADJP'],
|
98
|
-
['ADVP', '', 'ADVP'],
|
99
|
-
['ADVP', 'REL', 'WHADVP'],
|
100
|
-
['ADVP', 'FREL', 'WHADVP'],
|
101
|
-
['ADVP', 'WH', 'WHADVP'],
|
102
|
-
['CONJP', '', 'CONJP'],
|
103
|
-
['CP', '', 'SBAR'],
|
104
|
-
['DP', '', 'NP'],
|
105
|
-
['NP', '', 'NP'],
|
106
|
-
['NX', 'NX', 'NAC'],
|
107
|
-
['NP' 'REL' 'WHNP'],
|
108
|
-
['NP' 'FREL' 'WHNP'],
|
109
|
-
['NP' 'WH' 'WHNP'],
|
110
|
-
['PP', '', 'PP'],
|
111
|
-
['PP', 'REL', 'WHPP'],
|
112
|
-
['PP', 'WH', 'WHPP'],
|
113
|
-
['PRT', '', 'PRT'],
|
114
|
-
['S', '', 'S'],
|
115
|
-
['S', 'INV', 'SINV'],
|
116
|
-
['S', 'Q', 'SQ'],
|
117
|
-
['S', 'REL', 'SBAR'],
|
118
|
-
['S', 'FREL', 'SBAR'],
|
119
|
-
['S', 'WH', 'SBARQ'],
|
120
|
-
['SCP', '', 'SBAR'],
|
121
|
-
['VP', '', 'VP'],
|
122
|
-
['VP', '', 'VP'],
|
123
|
-
['', '', 'UK']
|
124
|
-
]
|
125
|
-
|
126
|
-
# Aligned tags for the Claws C5, Brown and Penn tag sets.
|
127
|
-
# Adapted from Manning, Christopher and Schütze, Hinrich,
|
128
|
-
# 1999. Foundations of Statistical Natural Language
|
129
|
-
# Processing. MIT Press, p. 141-142;
|
130
|
-
# http://www.isocat.org/rest/dcs/376;
|
131
|
-
#
|
132
|
-
# JRS?
|
133
|
-
|
134
|
-
|
135
|
-
SimpleWordTagToCategory = {
|
136
|
-
'C' => :complementizer,
|
137
|
-
'PN' => :punctuation,
|
138
|
-
'SC' => :conjunction
|
139
|
-
}
|
140
|
-
|
141
|
-
AlignedWordTags = [
|
142
|
-
|
143
|
-
'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
|
144
|
-
'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
|
145
|
-
'Ajective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
|
146
|
-
'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
|
147
|
-
'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
|
148
|
-
'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
|
149
|
-
'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
|
150
|
-
'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
|
151
|
-
'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
|
152
|
-
'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
|
153
|
-
|
154
|
-
'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
|
155
|
-
'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
|
156
|
-
'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
|
157
|
-
'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
|
158
|
-
'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
|
159
|
-
'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
|
160
|
-
'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
|
161
|
-
'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
|
162
|
-
'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
|
163
|
-
'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
|
164
|
-
'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],
|
165
|
-
|
166
|
-
'Clitic', ['', '', 'POS', '', '', ''],
|
167
|
-
|
168
|
-
'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
|
169
|
-
'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
|
170
|
-
'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
|
171
|
-
'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
|
172
|
-
'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
|
173
|
-
|
174
|
-
'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
|
175
|
-
'Determiner', ['DT0', 'DT', 'DET', '', 'DT', 'D'],
|
176
|
-
'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
|
177
|
-
'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
|
178
|
-
'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
|
179
|
-
'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
|
180
|
-
'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
|
181
|
-
'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
|
182
|
-
'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
|
183
|
-
'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
|
184
|
-
'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
|
185
|
-
'Determiner, possessive, second', ['DPS', 'PPSS', 'PRPS', '', '', 'D'],
|
186
|
-
'Determiner, possessive, second', ['DPS', 'PP$$', 'PRP', '', '', 'D'],
|
187
|
-
'Determiner, possessive, second', ['DPS', 'PPSS', 'PRP', '', '', 'D'],
|
188
|
-
'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
|
189
|
-
'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
|
190
|
-
'Determiner, possessive & question', ['DTQ', 'WPS', 'WPS', '', '', 'D'],
|
191
|
-
|
192
|
-
'Localizer', ['', '', '', '', 'LC'],
|
193
|
-
|
194
|
-
'Measure word', ['', '', '', '', 'M'],
|
195
|
-
|
196
|
-
'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
|
197
|
-
'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
|
198
|
-
'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
|
199
|
-
'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
|
200
|
-
'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
|
201
|
-
'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
|
202
|
-
'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
|
203
|
-
'Noun, temporal', ['', '', '', '', 'NT', 'N'],
|
204
|
-
'Noun, verbal', ['', '', '', '', 'NN', 'N'],
|
205
|
-
|
206
|
-
'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
|
207
|
-
'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
|
208
|
-
'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
|
209
|
-
'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
|
210
|
-
'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
|
211
|
-
'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
|
212
|
-
'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
|
213
|
-
'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
|
214
|
-
'Pronoun, existential there', ['EX0', 'EX', 'EX'],
|
215
|
-
'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
|
216
|
-
'Prounoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
|
217
|
-
'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
|
218
|
-
'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
|
219
|
-
'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
|
220
|
-
'Prounoun, substituting indefinite', ['', '', '', 'PIS'],
|
221
|
-
'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
|
222
|
-
'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
|
223
|
-
'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
|
224
|
-
'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
|
225
|
-
|
226
|
-
'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
|
227
|
-
'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
|
228
|
-
'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
|
229
|
-
'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
|
230
|
-
'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
|
231
|
-
'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
|
232
|
-
'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
|
233
|
-
'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
|
234
|
-
'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
|
235
|
-
'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
|
236
|
-
'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
|
237
|
-
'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
|
238
|
-
'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
|
239
|
-
'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
|
240
|
-
'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
|
241
|
-
'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
|
242
|
-
'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
|
243
|
-
'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
|
244
|
-
'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
|
245
|
-
'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
|
246
|
-
'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
|
247
|
-
'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
|
248
|
-
'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
|
249
|
-
'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
|
250
|
-
'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
|
251
|
-
'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
|
252
|
-
'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
|
253
|
-
'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
|
254
|
-
'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
|
255
|
-
'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
|
256
|
-
'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
|
257
|
-
'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
|
258
|
-
'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
|
259
|
-
'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
|
260
|
-
'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
|
261
|
-
'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
|
262
|
-
'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
|
263
|
-
'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
|
264
|
-
|
265
|
-
'Particle', ['', '', '', '', '', 'PRT'],
|
266
|
-
'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
|
267
|
-
'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
|
268
|
-
'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
|
269
|
-
'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
|
270
|
-
'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
|
271
|
-
|
272
|
-
'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
|
273
|
-
'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
|
274
|
-
'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
|
275
|
-
'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
|
276
|
-
'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
|
277
|
-
|
278
|
-
'Possessive', ['POS', '$', 'POS'],
|
279
|
-
|
280
|
-
'Postposition', ['', '', '', 'APPO'],
|
281
|
-
|
282
|
-
'Circumposition, right', ['', '', '', 'APZR', ''],
|
283
|
-
|
284
|
-
'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
|
285
|
-
|
286
|
-
'Onomatopoeia', ['', '', '', '', 'ON'],
|
287
|
-
|
288
|
-
'Punctuation', ['', '', '', '', 'PU', 'PN'],
|
289
|
-
'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
|
290
|
-
'Punctuation, sentence ender', ['PUN', '.', 'PP', '$.', '', 'PN'],
|
291
|
-
'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
|
292
|
-
'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
|
293
|
-
'Punctuationm, comma', ['PUN', ',', ',', '$,'],
|
294
|
-
'Punctuation, dash', ['PUN', '-', '-'],
|
295
|
-
'Punctuation, dollar sign', ['PUN', '', '$'],
|
296
|
-
'Punctuation, left bracket', ['PUL', '(', '(', '$('],
|
297
|
-
'Punctuation, right bracket', ['PUR', ')', ')'],
|
298
|
-
'Punctuation, quotation mark, left', ['PUQ', '', '``'],
|
299
|
-
'Punctuation, quotation mark, right', ['PUQ', '', '"'],
|
300
|
-
|
301
|
-
'Word, truncated, left', ['', '', '', 'TRUNC'],
|
302
|
-
|
303
|
-
'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
|
304
|
-
|
305
|
-
'Symbol', ['', '', 'SYM', 'XY'],
|
306
|
-
'Symbol, alphabetical', ['ZZ0', '', ''],
|
307
|
-
'Symbol, list item', ['', '', 'LS'],
|
308
|
-
|
309
|
-
# Not sure about these tags from the Chinese PTB.
|
310
|
-
'Aspect marker', ['', '', '', '', 'AS'], # ?
|
311
|
-
'Ba-construction', ['', '', '', '', 'BA'], # ?
|
312
|
-
'In relative', ['', '', '', '', 'DEC'], # ?
|
313
|
-
'Associative', ['', '', '', '', 'DER'], # ?
|
314
|
-
'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
|
315
|
-
'For words ? ', ['', '', '', '', 'ETC'], # ?
|
316
|
-
'In long bei-construct', ['', '', '', '', 'LB'], # ?
|
317
|
-
'In short bei-construct', ['', '', '', '', 'SB'], # ?
|
318
|
-
'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
|
319
|
-
'Particle, other', ['', '', '', '', 'MSP'], # ?
|
320
|
-
'Before VP', ['', '', '', '', 'DEV'], # ?
|
321
|
-
'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
|
322
|
-
'Verb, ????', ['', '', '', '', 'VC'] # ?
|
323
|
-
]
|
324
|
-
|
325
|
-
# Build the lookup table from word tags to word categories.
#
# AlignedWordTags is consumed as consecutive (description, tags) pairs.
# Each tags array is indexed with the ClawsC5, Brown, Penn, Negra,
# PennChinese and Simple constants (presumably integer positions defined
# earlier in this file -- confirm) to obtain the tag of that tag set.
# The category of a row is the first word of its description (the text
# before the first comma or space), downcased and interned.
#
# A row has no tag for a tag set when the slot is missing (nil) or holds
# the empty-string placeholder. Such slots are skipped: '' is truthy in
# Ruby, so a plain `if tags[...]` guard would still register it, and
# both '' and nil would end up as bogus keys of the table.
word_tag_sets = [
  [ClawsC5,     :claws_5],
  [Brown,       :brown],
  [Penn,        :penn],
  [Negra,       :negra],
  [PennChinese, :penn_chinese],
  [Simple,      :simple]
]

wttc = {}
Treat::Languages::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
  category = desc.gsub(',', ' ,').split(' ')[0].downcase.intern

  word_tag_sets.each do |index, tag_set|
    tag = tags[index]
    next if tag.nil? || tag.empty?
    (wttc[tag] ||= {})[tag_set] = category
  end
end

# A hash converting word tags to word categories
# (tag => { tag set => category }).
WordTagToCategory = wttc
|
349
|
-
|
350
|
-
# A hash converting phrase tags to phrase categories
# (tag => { tag set => category }).
#
# AlignedPhraseTags is consumed as consecutive (description, tags) pairs.
# The category is the whole description, downcased, with a space inserted
# before every comma and every space then replaced by an underscore.
# Only the Penn slot of each row is registered; the other tag sets are
# not handled yet.
PhraseTagToCategory =
  Treat::Languages::Tags::AlignedPhraseTags.
    each_slice(2).each_with_object({}) do |(description, row), table|
      label = description.gsub(',', ' ,').gsub(' ', '_').downcase.intern
      (table[row[Penn]] ||= {})[:penn] = label
    end
|
363
|
-
|
364
|
-
# Check whether +tag+ is a known phrase tag of the tag set +tag_set+.
#
# tag     - the tag to look up; tags are the keys of PhraseTagToCategory.
# tag_set - the tag set identifier used as key of the per-tag hash
#           (e.g. :penn).
#
# Returns true when PhraseTagToCategory has an entry for +tag+ that
# carries a category for +tag_set+, false otherwise.
def self.has_phrase_tag?(tag, tag_set)
  # The tag set must be looked up inside the entry of the tag, not as
  # a top-level key of the table.
  categories = PhraseTagToCategory[tag]
  categories ? categories.key?(tag_set) : false
end
|
368
|
-
|
369
|
-
# Check whether +tag+ is a known word tag of the tag set +tag_set+.
#
# tag     - the tag to look up; tags are the keys of WordTagToCategory.
# tag_set - the tag set identifier used as key of the per-tag hash
#           (e.g. :penn, :brown, :negra).
#
# Returns true when WordTagToCategory has an entry for +tag+ that
# carries a category for +tag_set+, false otherwise.
def self.has_word_tag?(tag, tag_set)
  # The tag set must be looked up inside the entry of the tag, not as
  # a top-level key of the table.
  categories = WordTagToCategory[tag]
  categories ? categories.key?(tag_set) : false
end
|
373
|
-
|
374
|
-
|
375
|
-
end
|
376
|
-
end
|
377
|
-
end
|