treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -1,42 +0,0 @@
|
|
1
|
-
module Treat
  module Inflectors
    module Declensions
      # Wrapper around the 'linguistics' gem that inflects a word
      # for grammatical number.
      #
      # Project website: http://deveiate.org/projects/Linguistics/
      class Linguistics
        require 'treat/helpers/linguistics_loader'

        # Return a declension of the entity using the 'linguistics' gem.
        #
        # Options:
        #
        # - (Identifier) :count => :singular, :plural
        #
        # Only :plural produces a value here; any other count falls
        # through and yields nil. Raises Treat::Exception when :count
        # is missing or when the entity's category is :verb.
        def self.declensions(entity, options = {})
          count = options[:count]
          unless count
            raise Treat::Exception,
                  "Must supply option count (:singular or :plural)."
          end

          klass = Treat::Helpers::LinguisticsLoader.load(entity.language)
          string = entity.to_s

          if entity.category == :verb
            raise Treat::Exception,
                  "Cannot retrieve the declensions of a verb. " +
                  "Use #singular_verb and #plural_verb instead."
          end

          return nil unless count == :plural

          # NOTE(review): :verb is kept in this list exactly as before,
          # although the guard above appears to make it unreachable.
          categorized = entity.has?(:category) &&
            [:noun, :adjective, :verb].include?(entity.category)

          silence_warnings do
            if categorized
              # Dispatches to plural_noun / plural_adjective on the
              # loaded language class.
              klass.send(:"plural_#{entity.category}", string)
            else
              klass.plural(string)
            end
          end
        end
      end
    end
  end
end
|
@@ -1,20 +0,0 @@
|
|
1
|
-
module Treat
  module Inflectors
    module OrdinalWords
      # Wrapper around the 'linguistics' gem for spelling out a
      # number in words, in ordinal form.
      #
      # Project website: http://deveiate.org/projects/Linguistics/
      class Linguistics
        require 'treat/helpers/linguistics_loader'

        # Describe a number in words in ordinal form.
        #
        # The language class is looked up from number.language and
        # its #ordinate method is applied to the number's string
        # form. The options hash is accepted but not read.
        def self.ordinal_words(number, options = {})
          Treat::Helpers::LinguisticsLoader.
            load(number.language).
            ordinate(number.to_s)
        end
      end
    end
  end
end
|
@@ -1,162 +0,0 @@
|
|
1
|
-
module Treat
  module Inflectors
    module Stem
      # Stem a word using a native Ruby implementation of the
      # Porter stemming algorithm, ported to Ruby from a
      # version coded up in Perl. This is a simplified
      # implementation; for a true and fast Porter stemmer,
      # see Treat::Inflectors::Stem::PorterC.
      #
      # Authored by Ray Pereda (raypereda@hotmail.com).
      # Unknown license.
      #
      # Original paper: Porter, 1980. An algorithm for suffix stripping,
      # Program, Vol. 14, no. 3, pp 130-137,
      # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
      class Porter
        # Returns the stem of a word using a native Porter stemmer.
        #
        # word    - anything responding to #to_s.
        # options - accepted for interface compatibility; not read.
        #
        # Returns a new String. The argument is never modified, even
        # when it is itself a String (frozen or not). Words shorter
        # than three characters are returned unchanged.
        def self.stem(word, options = {})
          # String#to_s returns the receiver itself, so take a copy:
          # the steps below edit the buffer in place ([]=, chop!, <<).
          w = word.to_s.dup
          return w if w.length < 3
          # Map initial y to Y so that the patterns
          # never treat it as vowel.
          w[0] = 'Y' if w[0] == ?y
          # Step 1a: plural endings.
          if w =~ /(ss|i)es$/
            w = $` + $1
          elsif w =~ /([^s])s$/
            w = $` + $1
          end
          # Step 1b: -eed, -ed, -ing.
          if w =~ /eed$/
            w.chop! if $` =~ MGR0
          elsif w =~ /(ed|ing)$/
            stem = $`
            if stem =~ VOWEL_IN_STEM
              w = stem
              case w
              when /(at|bl|iz)$/ then w << "e"
              when /([^aeiouylsz])\1$/ then w.chop!
              when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
              end
            end
          end
          # Step 1c: terminal y becomes i when the stem has a vowel.
          if w =~ /y$/
            stem = $`
            w = stem + "i" if stem =~ VOWEL_IN_STEM
          end
          # Step 2: map double suffixes to single ones (m > 0).
          if w =~ SUFFIX_1_REGEXP
            stem = $`
            suffix = $1
            if stem =~ MGR0
              w = stem + STEP_2_LIST[suffix]
            end
          end
          # Step 3: -icate, -ative, -ful, -ness, ... (m > 0).
          if w =~
            /(icate|ative|alize|iciti|ical|ful|ness)$/
            stem = $`
            suffix = $1
            if stem =~ MGR0
              w = stem + STEP_3_LIST[suffix]
            end
          end
          # Step 4: strip residual suffixes (m > 1).
          if w =~ SUFFIX_2_REGEXP
            stem = $`
            if stem =~ MGR1
              w = stem
            end
          elsif w =~ /(s|t)(ion)$/
            stem = $` + $1
            if stem =~ MGR1
              w = stem
            end
          end
          # Step 5: remove a final -e and reduce a final -ll.
          if w =~ /e$/
            stem = $`
            if (stem =~ MGR1) ||
              (stem =~ MEQ1 && stem !~
              /^#{CC}#{V}[^aeiouwxy]$/o)
              w = stem
            end
          end
          if w =~ /ll$/ && w =~ MGR1
            w.chop!
          end
          # and turn initial Y back to y
          w[0] = 'y' if w[0] == ?Y
          w
        end

        # Step 2 suffix => replacement. The keys must stay in sync
        # with the alternatives of SUFFIX_1_REGEXP. 'aliti' and
        # 'biliti' are the forms of -ality / -ibility after step 1c
        # has turned the final y into i.
        STEP_2_LIST = {
          'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
          'izer'=>'ize', 'bli'=>'ble',
          'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
          'ization'=>'ize', 'ation'=>'ate',
          'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
          'ousness'=>'ous', 'aliti'=>'al',
          'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
        }
        # Step 3 suffix => replacement.
        STEP_3_LIST = {
          'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
          'ical'=>'ic', 'ful'=>'', 'ness'=>''
        }
        # Suffixes handled by step 2 (extended syntax: whitespace
        # inside the pattern is ignored).
        SUFFIX_1_REGEXP = /(
        ational |
        tional |
        enci |
        anci |
        izer |
        bli |
        alli |
        entli |
        eli |
        ousli |
        ization |
        ation |
        ator |
        alism |
        iveness |
        fulness |
        ousness |
        aliti |
        iviti |
        biliti |
        logi)$/x
        # Suffixes handled by step 4.
        SUFFIX_2_REGEXP = /(
        al |
        ance |
        ence |
        er |
        ic |
        able |
        ible |
        ant |
        ement |
        ment |
        ent |
        ou |
        ism |
        ate |
        iti |
        ous |
        ive |
        ize)$/x
        C = "[^aeiou]" # consonant
        V = "[aeiouy]" # vowel
        CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
        VV = "#{V}(?>[aeiou]*)" # vowel sequence
        MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
        MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
        MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
        VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
      end
    end
  end
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module Treat
  module Inflectors
    module Stem
      # Stemmer backed by the 'ruby-stemmer' gem, which wraps a
      # C version of the Porter stemming algorithm.
      #
      # Project website: https://github.com/aurelian/ruby-stemmer
      # Original paper: Porter, 1980. An algorithm for suffix stripping,
      # Program, Vol. 14, no. 3, pp 130-137,
      # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
      class PorterC
        # Load the 'ruby-stemmer' gem with warnings silenced.
        silence_warnings do
          require 'lingua/stemmer'
        end

        # The gem and the 'engtagger' gem conflict over the ::Lingua
        # constant: keep the stemmer reachable under another name,
        # then take ::Lingua out of the top-level namespace.
        ::LinguaStemmer = ::Lingua
        Object.send(:remove_const, :Lingua)

        # Stem the word using the full Porter stemmer in C.
        #
        # The word is converted with #to_s first. The options hash
        # is accepted but not read.
        def self.stem(word, options = {})
          silence_warnings do
            ::LinguaStemmer.stemmer(word.to_s)
          end
        end
      end
    end
  end
end
|
@@ -1,30 +0,0 @@
|
|
1
|
-
module Treat
  module Inflectors
    module Stem
      # Stemmer backed by the 'uea-stemmer' gem, which implements
      # the UEA algorithm.
      #
      # "Similar to other stemmers, UEA-Lite operates on a
      # set of rules which are used as steps. There are two
      # groups of rules: the first to clean the tokens, and
      # the second to alter suffixes."
      #
      # Project website: https://github.com/ealdent/uea-stemmer
      # Original paper: Jenkins, Marie-Claire, Smith, Dan,
      # Conservative stemming for search and indexing, 2005.
      # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
      class UEA
        # Load the 'uea-stemmer' gem with warnings silenced.
        silence_warnings do
          require 'uea-stemmer'
        end

        # Single shared stemmer instance, created on first use.
        @@stemmer = nil

        # Stem the entity's string form with the UEA stemmer and
        # strip surrounding whitespace from the result. The options
        # hash is accepted but not read.
        def self.stem(entity, options = {})
          unless @@stemmer
            @@stemmer = silence_warnings { ::UEAStemmer.new }
          end
          stemmed = @@stemmer.stem(entity.to_s)
          stemmed.strip
        end
      end
    end
  end
end
|
data/lib/treat/install.rb
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
module Treat
  # Command-line helper that installs the gems a language needs.
  class Installer
    require 'rubygems/dependency_installer'

    # Install required dependencies and optional dependencies
    # for a specific language.
    #
    # language - identifier passed to Treat::Languages.get; the
    #            returned module must define RequiredDependencies
    #            and OptionalDependencies.
    #
    # Required gems are installed unconditionally. For each optional
    # gem the user is asked on standard input; only 'yes' installs
    # it, while 'no', an empty line or end of input skip it.
    # Progress is reported with puts. Returns nil.
    def self.install(language = :english)
      lang = Treat::Languages.get(language)
      required = lang::RequiredDependencies
      optional = lang::OptionalDependencies

      puts "Treat Installer\n\n"
      puts "Installing dependencies for the #{language.to_s.capitalize} language.\n\n"

      # Becomes true as soon as any gem fails to install.
      flag = false

      inst = Gem::DependencyInstaller.new

      required.each do |dependency|
        puts "Installing required dependency '#{dependency}'..."
        flag = true unless install_gem(inst, dependency, 'need')
      end

      optional.each do |dependency|
        if ask_to_install(dependency) == 'yes'
          flag = true unless install_gem(inst, dependency, 'can')
        else
          puts "Skipped installing '#{dependency}'."
        end
      end

      w = flag ? 'incompletely' : 'normally'
      puts "\nInstall proceeded #{w}."
      puts
    end

    # Ask whether an optional dependency should be installed and
    # return the validated answer: 'yes', 'no' or ''. The question
    # is repeated until the answer is valid.
    def self.ask_to_install(dependency)
      loop do
        puts "Install optional dependency '#{dependency}' (yes/no, <enter> = skip) ?"
        # Read from $stdin explicitly: a bare `gets` reads from the
        # files named in ARGV when the script was given arguments.
        line = $stdin.gets
        # End of input: nobody is left to answer, so skip.
        return '' if line.nil?
        answer = line.strip
        return answer if ['yes', 'no', ''].include?(answer)
        puts "Invalid input - valid options are 'yes' or 'no'."
      end
    end

    # Install one gem, returning true on success. On failure a hint
    # is printed and false is returned; `modal` ('need' or 'can')
    # selects the wording of the hint.
    def self.install_gem(inst, dependency, modal)
      silence_warnings { inst.install(dependency) }
      true
    rescue
      puts "Couldn't install '#{dependency}'. " +
        "You #{modal} install this dependency manually by running: " +
        "'gem install #{dependency}' or use 'sudo' to run this script."
      false
    end

    private_class_method :ask_to_install, :install_gem
  end
end
|
data/lib/treat/languages/tags.rb
DELETED
@@ -1,377 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Languages
|
3
|
-
|
4
|
-
module Tags
|
5
|
-
# Numeric identifiers for the supported tag sets.
# NOTE(review): these look like column positions in the six-element
# tag arrays of AlignedWordTags - confirm against the code that
# reads them.
ClawsC5, Brown, Penn, Negra, PennChinese, Simple = 0, 1, 2, 3, 4, 5
|
11
|
-
|
12
|
-
# Clause-level tags paired with their descriptions, as
# [tag, description] rows.
PTBClauseTagDescription = %w[S SBAR SBARQ SINV SQ].zip([
  'Simple declarative clause',
  'Clause introduced by a (possibly empty) subordinating conjunction',
  'Direct question introduced by a wh-word or a wh-phrase',
  'Inverted declarative sentence',
  'Inverted yes/no question'
])
|
19
|
-
|
20
|
-
# Phrase-level tags as a flat list that alternates a description
# with a three-element tag array; only the third slot is filled
# in here.
AlignedPhraseTags =
[
  'Adjective phrase', ['', '', 'ADJP'],
  'Adverb phrase', ['', '', 'ADVP'],
  'Conjunction phrase', ['', '', 'CONJP'],
  'Fragment', ['', '', 'FRAG'],
  'Interjection', ['', '', 'INTJ'],
  'List marker', ['', '', 'LST'],
  'Not a phrase', ['', '', 'NAC'],
  'Noun phrase', ['', '', 'NP'],
  'Head of NP', ['', '', 'NX'],
  'Prepositional phrase', ['', '', 'PP'],
  'Parenthetical', ['', '', 'PRN'],
  'Particle', ['', '', 'PRT'],
  'Quantifier phrase', ['', '', 'QP'],
  'Reduced relative clause', ['', '', 'RRC'],
  'Unlike coordinated phrase', ['', '', 'UCP'],
  'Verb phrase', ['', '', 'VP'],
  'Wh adjective phrase', ['', '', 'WHADJP'],
  # The Penn Treebank label is WHADVP (it was misspelled 'WHAVP').
  'Wh adverb phrase', ['', '', 'WHADVP'],
  'Wh noun phrase', ['', '', 'WHNP'],
  'Wh prepositional phrase', ['', '', 'WHPP'],
  'Unknown', ['', '', 'X'],
  'Phrase', ['', '', 'P'],
  'Sentence', ['', '', 'S'],
  # FIXME: SBAR is filed under the generic 'Phrase' description.
  'Phrase', ['', '', 'SBAR']
]
|
47
|
-
|
48
|
-
# Enju categories with a human-readable description, as
# [category, description] rows.
EnjuCatDescription = [
  'ADJ', 'Adjective',
  'ADV', 'Adverb',
  'CONJ', 'Coordination conjunction',
  'C', 'Complementizer',
  'D', 'Determiner',
  'N', 'Noun',
  'P', 'Preposition',
  'SC', 'Subordination conjunction',
  'V', 'Verb',
  'COOD', 'Part of coordination',
  'PN', 'Punctuation',
  'PRT', 'Particle',
  'S', 'Sentence'
].each_slice(2).to_a
|
64
|
-
|
65
|
-
# Lookup table from an Enju category string to the Treat
# category symbol.
EnjuCatToCategory = Hash[[
  ['ADJ', :adjective],
  ['ADV', :adverb],
  ['CONJ', :conjunction],
  ['COOD', :conjunction],
  ['C', :complementizer],
  ['D', :determiner],
  ['N', :noun],
  ['P', :preposition],
  ['PN', :punctuation],
  ['SC', :conjunction],
  ['V', :verb],
  ['PRT', :particle]
]]
|
80
|
-
|
81
|
-
# The xcat values of the Enju output specification with their
# descriptions, as [xcat, description] rows.
EnjuXCatDescription = %w[COOD IMP INV Q REL FREL TRACE WH].zip([
  'Coordinated phrase/clause',
  'Imperative sentence',
  'Subject-verb inversion',
  'Interrogative sentence with subject-verb inversion',
  'A relativizer included',
  'A free relative included',
  'A trace included',
  'A wh-question word included'
])
|
92
|
-
|
93
|
-
# Rows of [Enju cat, Enju xcat, Penn Treebank tag]. An empty xcat
# stands for "no xcat"; the final row maps the empty cat to 'UK'.
EnjuCatXcatToPTB = [
  ['ADJP', '', 'ADJP'],
  ['ADJP', 'REL', 'WHADJP'],
  ['ADJP', 'FREL', 'WHADJP'],
  ['ADJP', 'WH', 'WHADJP'],
  ['ADVP', '', 'ADVP'],
  ['ADVP', 'REL', 'WHADVP'],
  ['ADVP', 'FREL', 'WHADVP'],
  ['ADVP', 'WH', 'WHADVP'],
  ['CONJP', '', 'CONJP'],
  ['CP', '', 'SBAR'],
  ['DP', '', 'NP'],
  ['NP', '', 'NP'],
  ['NX', 'NX', 'NAC'],
  # These three rows had no commas between the literals, which made
  # Ruby join each of them into a single string ('NPRELWHNP', ...).
  ['NP', 'REL', 'WHNP'],
  ['NP', 'FREL', 'WHNP'],
  ['NP', 'WH', 'WHNP'],
  ['PP', '', 'PP'],
  ['PP', 'REL', 'WHPP'],
  ['PP', 'WH', 'WHPP'],
  ['PRT', '', 'PRT'],
  ['S', '', 'S'],
  ['S', 'INV', 'SINV'],
  ['S', 'Q', 'SQ'],
  ['S', 'REL', 'SBAR'],
  ['S', 'FREL', 'SBAR'],
  ['S', 'WH', 'SBARQ'],
  ['SCP', '', 'SBAR'],
  ['VP', '', 'VP'],
  # Repeats the previous row; kept so the row count is unchanged.
  ['VP', '', 'VP'],
  ['', '', 'UK']
]
|
125
|
-
|
126
|
-
# Aligned tags for the Claws C5, Brown and Penn tag sets.
|
127
|
-
# Adapted from Manning, Christopher and Schütze, Hinrich,
|
128
|
-
# 1999. Foundations of Statistical Natural Language
|
129
|
-
# Processing. MIT Press, p. 141-142;
|
130
|
-
# http://www.isocat.org/rest/dcs/376;
|
131
|
-
#
|
132
|
-
# JRS?
|
133
|
-
|
134
|
-
|
135
|
-
# Lookup table from a simple word tag to the Treat category
# symbol; it holds three tags only.
SimpleWordTagToCategory = Hash[[
  ['C', :complementizer],
  ['PN', :punctuation],
  ['SC', :conjunction]
]]
|
140
|
-
|
141
|
-
AlignedWordTags = [
|
142
|
-
|
143
|
-
'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
|
144
|
-
'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
|
145
|
-
'Ajective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
|
146
|
-
'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
|
147
|
-
'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
|
148
|
-
'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
|
149
|
-
'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
|
150
|
-
'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
|
151
|
-
'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
|
152
|
-
'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
|
153
|
-
|
154
|
-
'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
|
155
|
-
'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
|
156
|
-
'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
|
157
|
-
'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
|
158
|
-
'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
|
159
|
-
'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
|
160
|
-
'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
|
161
|
-
'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
|
162
|
-
'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
|
163
|
-
'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
|
164
|
-
'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],
|
165
|
-
|
166
|
-
'Clitic', ['', '', 'POS', '', '', ''],
|
167
|
-
|
168
|
-
'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
|
169
|
-
'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
|
170
|
-
'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
|
171
|
-
'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
|
172
|
-
'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
|
173
|
-
|
174
|
-
'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
|
175
|
-
'Determiner', ['DT0', 'DT', 'DET', '', 'DT', 'D'],
|
176
|
-
'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
|
177
|
-
'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
|
178
|
-
'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
|
179
|
-
'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
|
180
|
-
'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
|
181
|
-
'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
|
182
|
-
'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
|
183
|
-
'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
|
184
|
-
'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
|
185
|
-
'Determiner, possessive, second', ['DPS', 'PPSS', 'PRPS', '', '', 'D'],
|
186
|
-
'Determiner, possessive, second', ['DPS', 'PP$$', 'PRP', '', '', 'D'],
|
187
|
-
'Determiner, possessive, second', ['DPS', 'PPSS', 'PRP', '', '', 'D'],
|
188
|
-
'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
|
189
|
-
'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
|
190
|
-
'Determiner, possessive & question', ['DTQ', 'WPS', 'WPS', '', '', 'D'],
|
191
|
-
|
192
|
-
'Localizer', ['', '', '', '', 'LC'],
|
193
|
-
|
194
|
-
'Measure word', ['', '', '', '', 'M'],
|
195
|
-
|
196
|
-
'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
|
197
|
-
'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
|
198
|
-
'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
|
199
|
-
'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
|
200
|
-
'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
|
201
|
-
'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
|
202
|
-
'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
|
203
|
-
'Noun, temporal', ['', '', '', '', 'NT', 'N'],
|
204
|
-
'Noun, verbal', ['', '', '', '', 'NN', 'N'],
|
205
|
-
|
206
|
-
'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
|
207
|
-
'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
|
208
|
-
'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
|
209
|
-
'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
|
210
|
-
'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
|
211
|
-
'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
|
212
|
-
'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
|
213
|
-
'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
|
214
|
-
'Pronoun, existential there', ['EX0', 'EX', 'EX'],
|
215
|
-
'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
|
216
|
-
'Prounoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
|
217
|
-
'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
|
218
|
-
'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
|
219
|
-
'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
|
220
|
-
'Prounoun, substituting indefinite', ['', '', '', 'PIS'],
|
221
|
-
'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
|
222
|
-
'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
|
223
|
-
'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
|
224
|
-
'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
|
225
|
-
|
226
|
-
'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
|
227
|
-
'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
|
228
|
-
'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
|
229
|
-
'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
|
230
|
-
'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
|
231
|
-
'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
|
232
|
-
'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
|
233
|
-
'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
|
234
|
-
'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
|
235
|
-
'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
|
236
|
-
'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
|
237
|
-
'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
|
238
|
-
'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
|
239
|
-
'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
|
240
|
-
'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
|
241
|
-
'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
|
242
|
-
'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
|
243
|
-
'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
|
244
|
-
'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
|
245
|
-
'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
|
246
|
-
'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
|
247
|
-
'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
|
248
|
-
'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
|
249
|
-
'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
|
250
|
-
'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
|
251
|
-
'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
|
252
|
-
'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
|
253
|
-
'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
|
254
|
-
'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
|
255
|
-
'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
|
256
|
-
'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
|
257
|
-
'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
|
258
|
-
'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
|
259
|
-
'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
|
260
|
-
'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
|
261
|
-
'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
|
262
|
-
'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
|
263
|
-
'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
|
264
|
-
|
265
|
-
'Particle', ['', '', '', '', '', 'PRT'],
|
266
|
-
'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
|
267
|
-
'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
|
268
|
-
'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
|
269
|
-
'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
|
270
|
-
'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
|
271
|
-
|
272
|
-
'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
|
273
|
-
'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
|
274
|
-
'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
|
275
|
-
'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
|
276
|
-
'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
|
277
|
-
|
278
|
-
'Possessive', ['POS', '$', 'POS'],
|
279
|
-
|
280
|
-
'Postposition', ['', '', '', 'APPO'],
|
281
|
-
|
282
|
-
'Circumposition, right', ['', '', '', 'APZR', ''],
|
283
|
-
|
284
|
-
'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
|
285
|
-
|
286
|
-
'Onomatopoeia', ['', '', '', '', 'ON'],
|
287
|
-
|
288
|
-
'Punctuation', ['', '', '', '', 'PU', 'PN'],
|
289
|
-
'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
|
290
|
-
'Punctuation, sentence ender', ['PUN', '.', 'PP', '$.', '', 'PN'],
|
291
|
-
'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
|
292
|
-
'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
|
293
|
-
'Punctuationm, comma', ['PUN', ',', ',', '$,'],
|
294
|
-
'Punctuation, dash', ['PUN', '-', '-'],
|
295
|
-
'Punctuation, dollar sign', ['PUN', '', '$'],
|
296
|
-
'Punctuation, left bracket', ['PUL', '(', '(', '$('],
|
297
|
-
'Punctuation, right bracket', ['PUR', ')', ')'],
|
298
|
-
'Punctuation, quotation mark, left', ['PUQ', '', '``'],
|
299
|
-
'Punctuation, quotation mark, right', ['PUQ', '', '"'],
|
300
|
-
|
301
|
-
'Word, truncated, left', ['', '', '', 'TRUNC'],
|
302
|
-
|
303
|
-
'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
|
304
|
-
|
305
|
-
'Symbol', ['', '', 'SYM', 'XY'],
|
306
|
-
'Symbol, alphabetical', ['ZZ0', '', ''],
|
307
|
-
'Symbol, list item', ['', '', 'LS'],
|
308
|
-
|
309
|
-
# Not sure about these tags from the Chinese PTB.
|
310
|
-
'Aspect marker', ['', '', '', '', 'AS'], # ?
|
311
|
-
'Ba-construction', ['', '', '', '', 'BA'], # ?
|
312
|
-
'In relative', ['', '', '', '', 'DEC'], # ?
|
313
|
-
'Associative', ['', '', '', '', 'DER'], # ?
|
314
|
-
'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
|
315
|
-
'For words ? ', ['', '', '', '', 'ETC'], # ?
|
316
|
-
'In long bei-construct', ['', '', '', '', 'LB'], # ?
|
317
|
-
'In short bei-construct', ['', '', '', '', 'SB'], # ?
|
318
|
-
'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
|
319
|
-
'Particle, other', ['', '', '', '', 'MSP'], # ?
|
320
|
-
'Before VP', ['', '', '', '', 'DEV'], # ?
|
321
|
-
'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
|
322
|
-
'Verb, ????', ['', '', '', '', 'VC'] # ?
|
323
|
-
]
|
324
|
-
|
325
|
-
# Build the word tag => category lookup table.
#
# AlignedWordTags is a flat list of alternating entries: a textual
# description followed by the list of equivalent tags, one per tag set.
# ClawsC5, Brown, Penn, Negra, PennChinese and Simple are used as the
# column indices into each tag list.
#
# The resulting hash maps each tag string to a hash of
# { tag_set_symbol => category }, where the category is the first word
# of the description, downcased and interned (e.g. 'Noun, plural'
# yields :noun).
wttc = {}

# Column index paired with the symbol naming that tag set.
word_tag_sets = [
  [ClawsC5,     :claws_5],
  [Brown,       :brown],
  [Penn,        :penn],
  [Negra,       :negra],
  [PennChinese, :penn_chinese],
  [Simple,      :simple]
]

Treat::Languages::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
  category = desc.gsub(',', ' ,').split(' ')[0].downcase.intern

  word_tag_sets.each do |column, tag_set|
    tag = tags[column]
    # Some rows list fewer than six tags (nil here), and '' appears to be
    # the placeholder for "no tag in this tag set". Neither is a real tag,
    # so neither may become a key of the table.
    next if tag.nil? || tag.empty?
    (wttc[tag] ||= {})[tag_set] = category
  end
end

# A hash converting word tags to word categories.
WordTagToCategory = wttc
|
349
|
-
|
350
|
-
# Build the phrase tag => category lookup table.
#
# AlignedPhraseTags is walked two entries at a time: a description
# followed by its list of tags. The category is the whole description
# with a space inserted before each comma, every space replaced by an
# underscore, then downcased and interned.
#
# Only the Penn column is registered; the other tag sets are not
# mapped for phrases yet.
pttc = {}
Treat::Languages::Tags::AlignedPhraseTags.each_slice(2) do |description, tag_list|
  label = description.gsub(',', ' ,').gsub(' ', '_').downcase.intern
  penn_tag = tag_list[Penn]
  pttc[penn_tag] = {} unless pttc[penn_tag]
  pttc[penn_tag][:penn] = label
end

# A hash converting phrase tags to phrase categories.
PhraseTagToCategory = pttc
|
363
|
-
|
364
|
-
# Tell whether +tag+ is a known phrase tag within the given tag set.
#
# tag     - the phrase tag to look up (a key of PhraseTagToCategory).
# tag_set - the symbol naming the tag set (e.g. :penn).
#
# Returns true when PhraseTagToCategory has an entry for +tag+ that
# contains +tag_set+, false otherwise.
def self.has_phrase_tag?(tag, tag_set)
  tag_sets = PhraseTagToCategory[tag]
  return false unless tag_sets
  # The table is keyed by tag first and tag set second, so the tag set
  # must be looked up inside the tag's own entry, not at the top level.
  tag_sets.key?(tag_set)
end
|
368
|
-
|
369
|
-
# Tell whether +tag+ is a known word tag within the given tag set.
#
# tag     - the word tag to look up (a key of WordTagToCategory).
# tag_set - the symbol naming the tag set (e.g. :claws_5, :brown,
#           :penn, :negra, :penn_chinese or :simple).
#
# Returns true when WordTagToCategory has an entry for +tag+ that
# contains +tag_set+, false otherwise.
def self.has_word_tag?(tag, tag_set)
  tag_sets = WordTagToCategory[tag]
  return false unless tag_sets
  # The table is keyed by tag first and tag set second, so the tag set
  # must be looked up inside the tag's own entry, not at the top level.
  tag_sets.key?(tag_set)
end
|
373
|
-
|
374
|
-
|
375
|
-
end
|
376
|
-
end
|
377
|
-
end
|