treat 1.0.6 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +2 -4
- data/README.md +13 -12
- data/bin/MANIFEST +1 -0
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
- data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
- data/files/{INFO → MANIFEST} +0 -0
- data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
- data/files/weather-central-canada-heat-wave.html +1370 -0
- data/lib/treat/config/core/acronyms.rb +4 -0
- data/lib/treat/config/core/encodings.rb +8 -0
- data/lib/treat/config/core/entities.rb +2 -0
- data/lib/treat/config/core/language.rb +3 -0
- data/lib/treat/config/core/paths.rb +8 -0
- data/lib/treat/config/core/syntax.rb +1 -0
- data/lib/treat/config/core/verbosity.rb +1 -0
- data/lib/treat/config/databases/mongo.rb +3 -0
- data/lib/treat/config/languages/agnostic.rb +34 -0
- data/lib/treat/config/languages/arabic.rb +13 -0
- data/lib/treat/config/languages/chinese.rb +13 -0
- data/lib/treat/config/languages/dutch.rb +12 -0
- data/lib/treat/config/languages/english.rb +60 -0
- data/lib/treat/config/languages/french.rb +18 -0
- data/lib/treat/config/languages/german.rb +18 -0
- data/lib/treat/config/languages/greek.rb +12 -0
- data/lib/treat/config/languages/italian.rb +12 -0
- data/lib/treat/config/languages/polish.rb +12 -0
- data/lib/treat/config/languages/portuguese.rb +12 -0
- data/lib/treat/config/languages/russian.rb +12 -0
- data/lib/treat/config/languages/spanish.rb +12 -0
- data/lib/treat/config/languages/swedish.rb +12 -0
- data/lib/treat/config/libraries/stanford.rb +1 -0
- data/lib/treat/config/linguistics/categories.rb +4 -0
- data/lib/treat/config/linguistics/punctuation.rb +33 -0
- data/lib/treat/config/tags/aligned.rb +221 -0
- data/lib/treat/config/tags/enju.rb +71 -0
- data/lib/treat/config/tags/paris7.rb +17 -0
- data/lib/treat/config/tags/ptb.rb +15 -0
- data/lib/treat/config/workers/extractors.rb +39 -0
- data/lib/treat/config/workers/formatters.rb +20 -0
- data/lib/treat/config/workers/inflectors.rb +27 -0
- data/lib/treat/config/workers/learners.rb +6 -0
- data/lib/treat/config/workers/lexicalizers.rb +18 -0
- data/lib/treat/config/workers/list.rb +1 -0
- data/lib/treat/config/workers/processors.rb +19 -0
- data/lib/treat/config/workers/retrievers.rb +12 -0
- data/lib/treat/config.rb +125 -0
- data/lib/treat/{classification.rb → core/classification.rb} +1 -1
- data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
- data/lib/treat/{tree.rb → core/node.rb} +5 -5
- data/lib/treat/core/server.rb +3 -0
- data/lib/treat/core.rb +5 -0
- data/lib/treat/entities/abilities/buildable.rb +61 -56
- data/lib/treat/entities/abilities/checkable.rb +2 -2
- data/lib/treat/entities/abilities/comparable.rb +21 -0
- data/lib/treat/entities/abilities/copyable.rb +2 -0
- data/lib/treat/entities/abilities/countable.rb +1 -1
- data/lib/treat/entities/abilities/debuggable.rb +1 -1
- data/lib/treat/entities/abilities/delegatable.rb +42 -36
- data/lib/treat/entities/abilities/doable.rb +2 -2
- data/lib/treat/entities/abilities/exportable.rb +1 -1
- data/lib/treat/entities/abilities/iterable.rb +21 -33
- data/lib/treat/entities/abilities/magical.rb +8 -8
- data/lib/treat/entities/abilities/registrable.rb +0 -38
- data/lib/treat/entities/abilities/stringable.rb +19 -19
- data/lib/treat/entities/collection.rb +31 -0
- data/lib/treat/entities/document.rb +10 -0
- data/lib/treat/entities/entity.rb +18 -13
- data/lib/treat/entities/group.rb +15 -0
- data/lib/treat/entities/section.rb +13 -0
- data/lib/treat/entities/token.rb +35 -0
- data/lib/treat/entities/zone.rb +11 -0
- data/lib/treat/entities.rb +5 -75
- data/lib/treat/helpers/didyoumean.rb +57 -0
- data/lib/treat/helpers/escaping.rb +15 -0
- data/lib/treat/helpers/formatting.rb +41 -0
- data/lib/treat/helpers/platform.rb +15 -0
- data/lib/treat/helpers/reflection.rb +17 -0
- data/lib/treat/helpers/temporary.rb +27 -0
- data/lib/treat/helpers/verbosity.rb +19 -0
- data/lib/treat/helpers.rb +5 -0
- data/lib/treat/installer.rb +46 -165
- data/lib/treat/loaders/linguistics.rb +22 -27
- data/lib/treat/loaders/stanford.rb +23 -41
- data/lib/treat/loaders.rb +10 -0
- data/lib/treat/proxies.rb +73 -24
- data/lib/treat/version.rb +3 -0
- data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
- data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
- data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
- data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
- data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
- data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
- data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
- data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
- data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
- data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
- data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
- data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
- data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
- data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
- data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
- data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
- data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
- data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
- data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
- data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
- data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
- data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
- data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
- data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
- data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
- data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
- data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
- data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
- data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
- data/lib/treat/workers.rb +96 -0
- data/lib/treat.rb +23 -49
- data/spec/collection.rb +4 -4
- data/spec/document.rb +5 -5
- data/spec/entity.rb +33 -32
- data/spec/{tree.rb → node.rb} +5 -5
- data/spec/phrase.rb +5 -39
- data/spec/sandbox.rb +212 -6
- data/spec/token.rb +12 -9
- data/spec/treat.rb +12 -9
- data/spec/word.rb +10 -9
- data/spec/zone.rb +6 -2
- data/tmp/{INFO → MANIFEST} +0 -0
- data/tmp/english.yaml +10340 -0
- metadata +149 -139
- data/lib/treat/ai.rb +0 -12
- data/lib/treat/categories.rb +0 -90
- data/lib/treat/categorizable.rb +0 -44
- data/lib/treat/configurable.rb +0 -115
- data/lib/treat/dependencies.rb +0 -25
- data/lib/treat/downloader.rb +0 -87
- data/lib/treat/entities/abilities.rb +0 -10
- data/lib/treat/entities/entities.rb +0 -102
- data/lib/treat/exception.rb +0 -7
- data/lib/treat/extractors.rb +0 -79
- data/lib/treat/formatters/serializers/mongo.rb +0 -64
- data/lib/treat/formatters.rb +0 -41
- data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
- data/lib/treat/inflectors.rb +0 -52
- data/lib/treat/kernel.rb +0 -208
- data/lib/treat/languages/arabic.rb +0 -16
- data/lib/treat/languages/chinese.rb +0 -16
- data/lib/treat/languages/dutch.rb +0 -16
- data/lib/treat/languages/english.rb +0 -63
- data/lib/treat/languages/french.rb +0 -20
- data/lib/treat/languages/german.rb +0 -20
- data/lib/treat/languages/greek.rb +0 -16
- data/lib/treat/languages/italian.rb +0 -17
- data/lib/treat/languages/language.rb +0 -10
- data/lib/treat/languages/list.txt +0 -504
- data/lib/treat/languages/polish.rb +0 -16
- data/lib/treat/languages/portuguese.rb +0 -16
- data/lib/treat/languages/russian.rb +0 -16
- data/lib/treat/languages/spanish.rb +0 -16
- data/lib/treat/languages/swedish.rb +0 -16
- data/lib/treat/languages.rb +0 -132
- data/lib/treat/lexicalizers.rb +0 -37
- data/lib/treat/object.rb +0 -7
- data/lib/treat/processors/chunkers/autoselect.rb +0 -16
- data/lib/treat/processors/chunkers/txt.rb +0 -21
- data/lib/treat/processors.rb +0 -38
- data/lib/treat/retrievers.rb +0 -27
- data/lib/treat/server.rb +0 -26
- data/lib/treat/universalisation/encodings.rb +0 -12
- data/lib/treat/universalisation/tags.rb +0 -453
- data/lib/treat/universalisation.rb +0 -9
- data/spec/languages.rb +0 -25
@@ -0,0 +1,2 @@
|
|
1
|
+
{list: [:entity, :unknown, :email, :url, :symbol, :sentence, :punctuation, :number, :enclitic, :word, :token, :fragment, :phrase, :paragraph, :title, :zone, :list, :block, :page, :section, :collection, :document],
|
2
|
+
order: [:token, :fragment, :phrase, :sentence, :zone, :section, :document, :collection]}
|
@@ -0,0 +1 @@
|
|
1
|
+
{sweetened: false}
|
@@ -0,0 +1 @@
|
|
1
|
+
{debug: false, silence: true}
|
@@ -0,0 +1,34 @@
|
|
1
|
+
{
|
2
|
+
dependencies: [
|
3
|
+
'psych',
|
4
|
+
'nokogiri',
|
5
|
+
'ferret',
|
6
|
+
'bson_ext',
|
7
|
+
'mongo',
|
8
|
+
'lda-ruby',
|
9
|
+
'stanford-core-nlp',
|
10
|
+
'linguistics',
|
11
|
+
'ruby-readability',
|
12
|
+
'whatlanguage',
|
13
|
+
'chronic',
|
14
|
+
'nickel',
|
15
|
+
'decisiontree',
|
16
|
+
'ai4r'
|
17
|
+
],
|
18
|
+
workers: {
|
19
|
+
extractors: {
|
20
|
+
keywords: [:tf_idf],
|
21
|
+
language: [:what_language]
|
22
|
+
},
|
23
|
+
formatters: {
|
24
|
+
serializers: [:xml, :yaml, :mongo]
|
25
|
+
},
|
26
|
+
lexicalizers: {
|
27
|
+
categorizers: [:from_tag]
|
28
|
+
},
|
29
|
+
inflectors: {
|
30
|
+
ordinalizers: [:linguistics],
|
31
|
+
cardinalizers: [:linguistics]
|
32
|
+
}
|
33
|
+
}
|
34
|
+
}
|
@@ -0,0 +1,60 @@
|
|
1
|
+
{
|
2
|
+
dependencies: [
|
3
|
+
'rbtagger',
|
4
|
+
'ruby-stemmer',
|
5
|
+
'punkt-segmenter',
|
6
|
+
'tactful_tokenizer',
|
7
|
+
'nickel',
|
8
|
+
'rwordnet',
|
9
|
+
'uea-stemmer',
|
10
|
+
'engtagger',
|
11
|
+
'activesupport',
|
12
|
+
'english'
|
13
|
+
],
|
14
|
+
workers: {
|
15
|
+
extractors: {
|
16
|
+
time: [:chronic, :ruby, :nickel],
|
17
|
+
topics: [:reuters],
|
18
|
+
keywords: [:tf_idf],
|
19
|
+
name_tag: [:stanford],
|
20
|
+
coreferences: [:stanford]
|
21
|
+
},
|
22
|
+
inflectors: {
|
23
|
+
conjugators: [:linguistics],
|
24
|
+
declensors: [:english, :linguistics, :active_support],
|
25
|
+
stemmers: [:porter, :porter_c, :uea],
|
26
|
+
ordinalizers: [:linguistics],
|
27
|
+
cardinalizers: [:linguistics]
|
28
|
+
},
|
29
|
+
lexicalizers: {
|
30
|
+
taggers: [:lingua, :brill, :stanford],
|
31
|
+
sensers: [:wordnet]
|
32
|
+
},
|
33
|
+
processors: {
|
34
|
+
parsers: [:stanford, :enju],
|
35
|
+
segmenters: [:tactful, :punkt, :stanford],
|
36
|
+
tokenizers: [:ptb, :stanford, :tactful, :punkt]
|
37
|
+
}
|
38
|
+
},
|
39
|
+
info: {
|
40
|
+
stopwords:
|
41
|
+
['the', 'of', 'and', 'a', 'to', 'in', 'is',
|
42
|
+
'you', 'that', 'it', 'he', 'was', 'for', 'on',
|
43
|
+
'are', 'as', 'with', 'his', 'they', 'I', 'at',
|
44
|
+
'be', 'this', 'have', 'from', 'or', 'one', 'had',
|
45
|
+
'by', 'word', 'but', 'not', 'what', 'all', 'were',
|
46
|
+
'we', 'when', 'your', 'can', 'said', 'there', 'use',
|
47
|
+
'an', 'each', 'which', 'she', 'do', 'how', 'their',
|
48
|
+
'if', 'will', 'up', 'other', 'about', 'out', 'many',
|
49
|
+
'then', 'them', 'these', 'so', 'some', 'her', 'would',
|
50
|
+
'make', 'like', 'him', 'into', 'time', 'has', 'look',
|
51
|
+
'two', 'more', 'write', 'go', 'see', 'number', 'no',
|
52
|
+
'way', 'could', 'people', 'my', 'than', 'first', 'been',
|
53
|
+
'call', 'who', 'its', 'now', 'find', 'long', 'down',
|
54
|
+
'day', 'did', 'get', 'come', 'made', 'may', 'part',
|
55
|
+
'say', 'also', 'new', 'much', 'should', 'still',
|
56
|
+
'such', 'before', 'after', 'other', 'then', 'over',
|
57
|
+
'under', 'therefore', 'nonetheless', 'thereafter',
|
58
|
+
'afterwards', 'here', 'huh', 'hah', "n't", "'t", 'here']
|
59
|
+
}
|
60
|
+
}
|
@@ -0,0 +1,18 @@
|
|
1
|
+
{
|
2
|
+
dependencies: [
|
3
|
+
'punkt-segmenter',
|
4
|
+
'tactful_tokenizer',
|
5
|
+
'stanford-core-nlp'
|
6
|
+
],
|
7
|
+
workers: {
|
8
|
+
processors: {
|
9
|
+
segmenters: [:punkt],
|
10
|
+
tokenizers: [:tactful],
|
11
|
+
parsers: [:stanford]
|
12
|
+
},
|
13
|
+
lexicalizers: {
|
14
|
+
taggers: [:stanford],
|
15
|
+
categorizers: [:from_tag]
|
16
|
+
}
|
17
|
+
}
|
18
|
+
}
|
@@ -0,0 +1,18 @@
|
|
1
|
+
{
|
2
|
+
dependencies: [
|
3
|
+
'punkt-segmenter',
|
4
|
+
'tactful_tokenizer',
|
5
|
+
'stanford'
|
6
|
+
],
|
7
|
+
workers: {
|
8
|
+
processors: {
|
9
|
+
segmenters: [:punkt],
|
10
|
+
tokenizers: [:tactful],
|
11
|
+
parsers: [:stanford]
|
12
|
+
},
|
13
|
+
lexicalizers: {
|
14
|
+
taggers: [:stanford],
|
15
|
+
categorizers: [:from_tag]
|
16
|
+
}
|
17
|
+
}
|
18
|
+
}
|
@@ -0,0 +1 @@
|
|
1
|
+
{jar_path: nil, model_path: nil}
|
@@ -0,0 +1,33 @@
|
|
1
|
+
{punct_to_category: {
|
2
|
+
'.' => 'period',
|
3
|
+
',' => 'comma',
|
4
|
+
';' => 'semicolon',
|
5
|
+
':' => 'colon',
|
6
|
+
'?' => 'interrogation',
|
7
|
+
'!' => 'exclamation',
|
8
|
+
'"' => 'double_quote',
|
9
|
+
"'" => 'single_quote',
|
10
|
+
'$' => 'dollar',
|
11
|
+
'%' => 'percent',
|
12
|
+
'#' => 'hash',
|
13
|
+
'*' => 'asterisk',
|
14
|
+
'&' => 'ampersand',
|
15
|
+
'+' => 'plus',
|
16
|
+
'-' => 'dash',
|
17
|
+
'/' => 'slash',
|
18
|
+
'\\' => 'backslash',
|
19
|
+
'^' => 'caret',
|
20
|
+
'_' => 'underscore',
|
21
|
+
'`' => 'tick',
|
22
|
+
'|' => 'pipe',
|
23
|
+
'~' => 'tilde',
|
24
|
+
'@' => 'at',
|
25
|
+
'[' => 'bracket',
|
26
|
+
']' => 'bracket',
|
27
|
+
'{' => 'brace',
|
28
|
+
'}' => 'brace',
|
29
|
+
'(' => 'parenthesis',
|
30
|
+
')' => 'parenthesis',
|
31
|
+
'<' => 'tag',
|
32
|
+
'>' => 'tag'
|
33
|
+
}}
|
@@ -0,0 +1,221 @@
|
|
1
|
+
{tag_sets: [
|
2
|
+
:claws_c5, :brown, :penn, :stutgart, :chinese, :paris7
|
3
|
+
],
|
4
|
+
phrase_tags: [
|
5
|
+
'Adjectival phrase', ['', '', 'ADJP', '', '', 'AP'],
|
6
|
+
'Adverbial phrase', ['', '', 'ADVP', '', '', 'AdP'],
|
7
|
+
'Conjunction phrase', ['', '', 'CONJP', '', '', 'Ssub'],
|
8
|
+
'Fragment', ['', '', 'FRAG', '', '', ''],
|
9
|
+
'Interjectional phrase', ['', '', 'INTJ', '', '', ''],
|
10
|
+
'List marker', ['', '', 'LST', '', '', ''],
|
11
|
+
'Not a phrase', ['', '', 'NAC', '', '', ''],
|
12
|
+
'Noun phrase', ['', '', 'NP', '', '', 'NP'],
|
13
|
+
'Verbal nucleus', ['', '', '', '', '', 'VN'],
|
14
|
+
'Head of noun phrase', ['', '', 'NX', '', '', ''],
|
15
|
+
'Prepositional phrase', ['', '', 'PP', '', '', 'PP'],
|
16
|
+
'Parenthetical', ['', '', 'PRN', '', '', ''],
|
17
|
+
'Particle', ['', '', 'PRT', '', '', ''],
|
18
|
+
'Participial phrase', ['', '', '', '', '', 'VPart'],
|
19
|
+
'Quantifier phrase', ['', '', 'QP', '', '', ''],
|
20
|
+
'Relative clause', ['', '', 'RRC', '', '', 'Srel'],
|
21
|
+
'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
|
22
|
+
'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
|
23
|
+
'Verb phrase', ['', '', 'VP', '', '', ''],
|
24
|
+
'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
|
25
|
+
'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
|
26
|
+
'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
|
27
|
+
'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
|
28
|
+
'Unknown', ['', '', 'X', '', '', ''],
|
29
|
+
'Phrase', ['', '', 'P', '', '', 'Sint'],
|
30
|
+
'Sentence', ['', '', 'S', '', '', 'SENT'],
|
31
|
+
'Phrase', ['', '', 'SBAR', '', '', ''] # Fix
|
32
|
+
],
|
33
|
+
word_tags: [
|
34
|
+
|
35
|
+
# Aligned tags for the Claws C5, Brown and Penn tag sets.
|
36
|
+
# Adapted from Manning, Christopher and Schütze, Hinrich,
|
37
|
+
# 1999. Foundations of Statistical Natural Language
|
38
|
+
# Processing. MIT Press, p. 141-142;
|
39
|
+
# http://www.isocat.org/rest/dcs/376;
|
40
|
+
|
41
|
+
'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
|
42
|
+
'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
|
43
|
+
'Adjective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
|
44
|
+
'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
|
45
|
+
'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
|
46
|
+
'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
|
47
|
+
'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
|
48
|
+
'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
|
49
|
+
'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
|
50
|
+
'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
|
51
|
+
|
52
|
+
'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
|
53
|
+
'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
|
54
|
+
'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
|
55
|
+
'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
|
56
|
+
'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
|
57
|
+
'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
|
58
|
+
'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
|
59
|
+
'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
|
60
|
+
'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
|
61
|
+
'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
|
62
|
+
'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],
|
63
|
+
|
64
|
+
'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
|
65
|
+
'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
|
66
|
+
'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
|
67
|
+
'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
|
68
|
+
'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
|
69
|
+
|
70
|
+
'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
|
71
|
+
'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
|
72
|
+
'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
|
73
|
+
'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
|
74
|
+
'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
|
75
|
+
'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
|
76
|
+
'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
|
77
|
+
'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
|
78
|
+
'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
|
79
|
+
'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
|
80
|
+
'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
|
81
|
+
'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
|
82
|
+
'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
|
83
|
+
'Interjection', ['', '', '', '', '', 'I'],
|
84
|
+
'Localizer', ['', '', '', '', 'LC'],
|
85
|
+
|
86
|
+
'Measure word', ['', '', '', '', 'M'],
|
87
|
+
|
88
|
+
'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
|
89
|
+
'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
|
90
|
+
'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
|
91
|
+
'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
|
92
|
+
'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
|
93
|
+
'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
|
94
|
+
'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
|
95
|
+
'Noun, temporal', ['', '', '', '', 'NT', 'N'],
|
96
|
+
'Noun, verbal', ['', '', '', '', 'NN', 'N'],
|
97
|
+
|
98
|
+
'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
|
99
|
+
'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
|
100
|
+
'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
|
101
|
+
'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
|
102
|
+
'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
|
103
|
+
'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
|
104
|
+
'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
|
105
|
+
'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # Hack
|
106
|
+
'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
|
107
|
+
'Pronoun, existential there', ['EX0', 'EX', 'EX'],
|
108
|
+
'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
|
109
|
+
'Pronoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
|
110
|
+
'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
|
111
|
+
'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
|
112
|
+
'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
|
113
|
+
'Pronoun, substituting indefinite', ['', '', '', 'PIS'],
|
114
|
+
'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
|
115
|
+
'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
|
116
|
+
'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
|
117
|
+
'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
|
118
|
+
|
119
|
+
'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
|
120
|
+
'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
|
121
|
+
'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
|
122
|
+
'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
|
123
|
+
'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
|
124
|
+
'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
|
125
|
+
'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
|
126
|
+
'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
|
127
|
+
'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
|
128
|
+
'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
|
129
|
+
'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
|
130
|
+
'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
|
131
|
+
'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
|
132
|
+
'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
|
133
|
+
'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
|
134
|
+
'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
|
135
|
+
'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
|
136
|
+
'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
|
137
|
+
'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
|
138
|
+
'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
|
139
|
+
'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
|
140
|
+
'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
|
141
|
+
'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
|
142
|
+
'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
|
143
|
+
'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
|
144
|
+
'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
|
145
|
+
'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
|
146
|
+
'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
|
147
|
+
'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
|
148
|
+
'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
|
149
|
+
'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
|
150
|
+
'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
|
151
|
+
'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
|
152
|
+
'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
|
153
|
+
'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
|
154
|
+
'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
|
155
|
+
'Verb, modal, infinitive', ['', '', '', '', 'VMINF', 'V'],
|
156
|
+
'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
|
157
|
+
|
158
|
+
'Particle', ['', '', '', '', '', 'PRT'],
|
159
|
+
'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
|
160
|
+
'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
|
161
|
+
'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
|
162
|
+
'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
|
163
|
+
'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
|
164
|
+
|
165
|
+
'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
|
166
|
+
'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
|
167
|
+
'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
|
168
|
+
'Preposition, with article', ['', '', '', 'APPART', '', 'P'],
|
169
|
+
'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
|
170
|
+
|
171
|
+
'Possessive', ['POS', '$', 'POS'],
|
172
|
+
|
173
|
+
'Postposition', ['', '', '', 'APPO'],
|
174
|
+
|
175
|
+
'Circumposition, right', ['', '', '', 'APZR', ''],
|
176
|
+
|
177
|
+
'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
|
178
|
+
|
179
|
+
'Onomatopoeia', ['', '', '', '', 'ON'],
|
180
|
+
|
181
|
+
'Punctuation', ['', '', '', '', 'PU', 'PN'],
|
182
|
+
'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
|
183
|
+
|
184
|
+
'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
|
185
|
+
'Punctuation, colon or ellipsis', ['PUN', ':', ':'],
|
186
|
+
'Punctuation, comma', ['PUN', ',', ',', '$,'],
|
187
|
+
'Punctuation, dash', ['PUN', '-', '-'],
|
188
|
+
'Punctuation, dollar sign', ['PUN', '', '$'],
|
189
|
+
'Punctuation, left bracket', ['PUL', '(', '(', '$('],
|
190
|
+
'Punctuation, right bracket', ['PUR', ')', ')'],
|
191
|
+
'Punctuation, quotation mark, left', ['PUQ', '', '``'],
|
192
|
+
'Punctuation, quotation mark, right', ['PUQ', '', '"'],
|
193
|
+
|
194
|
+
'Punctuation, left bracket', ['PUL', '(', 'PPL'],
|
195
|
+
'Punctuation, right bracket', ['PUR', ')', 'PPR'],
|
196
|
+
'Punctuation, left square bracket', ['PUL', '(', 'LSB'],
|
197
|
+
'Punctuation, right square bracket', ['PUR', ')', 'RSB'],
|
198
|
+
'Punctuation, left curly bracket', ['PUL', '(', 'LCB'],
|
199
|
+
'Punctuation, right curly bracket', ['PUR', ')', 'RCB'],
|
200
|
+
|
201
|
+
'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
|
202
|
+
|
203
|
+
'Symbol', ['', '', 'SYM', 'XY'],
|
204
|
+
'Symbol, alphabetical', ['ZZ0', '', ''],
|
205
|
+
'Symbol, list item', ['', '', 'LS'],
|
206
|
+
|
207
|
+
# Not sure about these tags from the Chinese PTB.
|
208
|
+
'Aspect marker', ['', '', '', '', 'AS'], # ?
|
209
|
+
'Ba-construction', ['', '', '', '', 'BA'], # ?
|
210
|
+
'In relative', ['', '', '', '', 'DEC'], # ?
|
211
|
+
'Associative', ['', '', '', '', 'DER'], # ?
|
212
|
+
'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
|
213
|
+
'For words ? ', ['', '', '', '', 'ETC'], # ?
|
214
|
+
'In long bei-construct', ['', '', '', '', 'LB'], # ?
|
215
|
+
'In short bei-construct', ['', '', '', '', 'SB'], # ?
|
216
|
+
'Sentence-final particle', ['', '', '', '', 'SB'], # ?
|
217
|
+
'Particle, other', ['', '', '', '', 'MSP'], # ?
|
218
|
+
'Before VP', ['', '', '', '', 'DEV'], # ?
|
219
|
+
'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
|
220
|
+
'Verb, ????', ['', '', '', '', 'VC'] # ?
|
221
|
+
]}
|