treat 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +4 -4
- data/TODO +21 -54
- data/lib/economist/half_cocked_basel.txt +16 -0
- data/lib/economist/hose_and_dry.doc +0 -0
- data/lib/economist/hungarys_troubles.abw +70 -0
- data/lib/economist/republican_nomination.pdf +0 -0
- data/lib/economist/saving_the_euro.odt +0 -0
- data/lib/economist/to_infinity_and_beyond.txt +15 -0
- data/lib/economist/zero_sum.html +91 -0
- data/lib/treat.rb +58 -72
- data/lib/treat/buildable.rb +59 -15
- data/lib/treat/categories.rb +26 -14
- data/lib/treat/category.rb +2 -2
- data/lib/treat/delegatable.rb +65 -48
- data/lib/treat/doable.rb +44 -0
- data/lib/treat/entities.rb +34 -14
- data/lib/treat/entities/collection.rb +2 -0
- data/lib/treat/entities/document.rb +3 -2
- data/lib/treat/entities/entity.rb +105 -90
- data/lib/treat/entities/phrases.rb +17 -0
- data/lib/treat/entities/tokens.rb +28 -13
- data/lib/treat/entities/zones.rb +20 -0
- data/lib/treat/extractors.rb +49 -11
- data/lib/treat/extractors/coreferences/stanford.rb +68 -0
- data/lib/treat/extractors/date/chronic.rb +32 -0
- data/lib/treat/extractors/date/ruby.rb +25 -0
- data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
- data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
- data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
- data/lib/treat/extractors/language/what_language.rb +49 -0
- data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
- data/lib/treat/extractors/roles/naive.rb +73 -0
- data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
- data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
- data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
- data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
- data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
- data/lib/treat/extractors/time/nickel.rb +30 -12
- data/lib/treat/extractors/topic_words/lda.rb +9 -9
- data/lib/treat/extractors/topics/reuters.rb +14 -15
- data/lib/treat/extractors/topics/reuters/region.xml +1 -0
- data/lib/treat/features.rb +7 -0
- data/lib/treat/formatters/readers/abw.rb +6 -1
- data/lib/treat/formatters/readers/autoselect.rb +5 -6
- data/lib/treat/formatters/readers/doc.rb +3 -1
- data/lib/treat/formatters/readers/html.rb +1 -1
- data/lib/treat/formatters/readers/image.rb +43 -0
- data/lib/treat/formatters/readers/odt.rb +1 -2
- data/lib/treat/formatters/readers/pdf.rb +9 -1
- data/lib/treat/formatters/readers/xml.rb +40 -0
- data/lib/treat/formatters/serializers/xml.rb +50 -14
- data/lib/treat/formatters/serializers/yaml.rb +7 -2
- data/lib/treat/formatters/unserializers/xml.rb +33 -7
- data/lib/treat/formatters/visualizers/dot.rb +90 -20
- data/lib/treat/formatters/visualizers/short_value.rb +2 -2
- data/lib/treat/formatters/visualizers/standoff.rb +2 -2
- data/lib/treat/formatters/visualizers/tree.rb +1 -1
- data/lib/treat/formatters/visualizers/txt.rb +13 -4
- data/lib/treat/group.rb +16 -10
- data/lib/treat/helpers/linguistics_loader.rb +18 -0
- data/lib/treat/inflectors.rb +10 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
- data/lib/treat/inflectors/declensions/english.rb +319 -0
- data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
- data/lib/treat/install.rb +59 -0
- data/lib/treat/kernel.rb +18 -8
- data/lib/treat/languages.rb +18 -11
- data/lib/treat/languages/arabic.rb +4 -2
- data/lib/treat/languages/chinese.rb +6 -2
- data/lib/treat/languages/dutch.rb +16 -0
- data/lib/treat/languages/english.rb +47 -19
- data/lib/treat/languages/french.rb +8 -5
- data/lib/treat/languages/german.rb +9 -6
- data/lib/treat/languages/greek.rb +16 -0
- data/lib/treat/languages/italian.rb +6 -3
- data/lib/treat/languages/polish.rb +16 -0
- data/lib/treat/languages/portuguese.rb +16 -0
- data/lib/treat/languages/russian.rb +16 -0
- data/lib/treat/languages/spanish.rb +16 -0
- data/lib/treat/languages/swedish.rb +16 -0
- data/lib/treat/languages/tags.rb +377 -0
- data/lib/treat/lexicalizers.rb +34 -23
- data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
- data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
- data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
- data/lib/treat/lexicalizers/tag/brill.rb +35 -40
- data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
- data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
- data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
- data/lib/treat/processors.rb +8 -8
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +114 -99
- data/lib/treat/processors/parsers/stanford.rb +109 -41
- data/lib/treat/processors/segmenters/punkt.rb +17 -18
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
- data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
- data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
- data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
- data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
- data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
- data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
- data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
- data/lib/treat/processors/segmenters/stanford.rb +38 -37
- data/lib/treat/processors/segmenters/tactful.rb +5 -4
- data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
- data/lib/treat/processors/tokenizers/perl.rb +2 -2
- data/lib/treat/processors/tokenizers/punkt.rb +6 -2
- data/lib/treat/processors/tokenizers/stanford.rb +25 -24
- data/lib/treat/processors/tokenizers/tactful.rb +1 -2
- data/lib/treat/proxies.rb +2 -35
- data/lib/treat/registrable.rb +17 -22
- data/lib/treat/sugar.rb +11 -11
- data/lib/treat/tree.rb +27 -17
- data/lib/treat/viewable.rb +29 -0
- data/lib/treat/visitable.rb +1 -1
- data/test/tc_entity.rb +56 -49
- data/test/tc_extractors.rb +41 -18
- data/test/tc_formatters.rb +7 -8
- data/test/tc_inflectors.rb +19 -24
- data/test/tc_lexicalizers.rb +12 -19
- data/test/tc_processors.rb +26 -12
- data/test/tc_resources.rb +2 -7
- data/test/tc_treat.rb +20 -22
- data/test/tc_tree.rb +4 -4
- data/test/tests.rb +3 -5
- data/test/texts.rb +13 -14
- data/tmp/INFO +1 -0
- metadata +78 -158
- data/bin/INFO +0 -1
- data/examples/benchmark.rb +0 -81
- data/examples/keywords.rb +0 -148
- data/lib/treat/detectors.rb +0 -31
- data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
- data/lib/treat/detectors/format/file.rb +0 -36
- data/lib/treat/detectors/language/what_language.rb +0 -29
- data/lib/treat/entities/constituents.rb +0 -15
- data/lib/treat/entities/sentence.rb +0 -8
- data/lib/treat/extractors/named_entity/abner.rb +0 -20
- data/lib/treat/extractors/named_entity/stanford.rb +0 -174
- data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
- data/lib/treat/extractors/time/chronic.rb +0 -20
- data/lib/treat/extractors/time/native.rb +0 -18
- data/lib/treat/formatters/readers/gocr.rb +0 -26
- data/lib/treat/formatters/readers/ocropus.rb +0 -31
- data/lib/treat/formatters/visualizers/html.rb +0 -13
- data/lib/treat/formatters/visualizers/inspect.rb +0 -20
- data/lib/treat/inflectors/declensions/en.rb +0 -18
- data/lib/treat/languages/categories.rb +0 -5
- data/lib/treat/languages/english/categories.rb +0 -23
- data/lib/treat/languages/english/tags.rb +0 -352
- data/lib/treat/languages/xinhua.rb +0 -12
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
- data/lib/treat/string.rb +0 -5
- data/test/tc_detectors.rb +0 -26
@@ -1,15 +0,0 @@
|
|
1
|
-
module Treat
  module Extractors
    module Statistics
      # Counts the occurrences of a given string value, using the
      # entity's token registry.
      class FrequencyOf
        class << self
          # Return how many entries the entity's token registry holds
          # under options[:value].
          #
          # Options:
          # - :value => the string value to look up (required).
          #
          # Returns 0 when the registry has no entry for the value.
          # Raises Treat::Exception when no value is supplied.
          def statistics(entity, options = {})
            value = options[:value]
            if !value
              raise Treat::Exception, "Must supply a non-nil value."
            end
            matches = entity.token_registry[:value][value]
            return 0 if matches.nil?
            matches.size
          end
        end
      end
    end
  end
end
|
@@ -1,20 +0,0 @@
|
|
1
|
-
module Treat
  module Extractors
    module Time
      # Time and date extraction delegated to the 'chronic' gem.
      #
      # Project website: http://chronic.rubyforge.org/
      class Chronic
        # The gem is loaded inside silence_warnings (a helper defined
        # elsewhere; presumably it mutes Ruby warnings - confirm).
        silence_warnings { require 'chronic' }

        class << self
          # Parse the entity's string form with ::Chronic.parse, asking
          # for a single guessed time (:guess => true), and return
          # whatever the gem returns.
          #
          # Options: none.
          def time(entity, options = {})
            parse_options = {:guess => true}
            silence_warnings do
              ::Chronic.parse(entity.to_s, parse_options)
            end
          end
        end
      end
    end
  end
end
|
@@ -1,18 +0,0 @@
|
|
1
|
-
module Treat
  module Extractors
    module Time
      # Date/time extraction that relies only on Ruby's standard
      # 'date' library.
      module Native
        require 'date'

        class << self
          # Convert the entity to a string and hand it to
          # ::DateTime.parse, returning the resulting DateTime.
          #
          # Options: none.
          def time(entity, options = {})
            text = entity.to_s
            ::DateTime.parse(text)
          end
        end
      end
    end
  end
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module Treat
  module Formatters
    module Readers
      # A wrapper class for the GOCR engine.
      #
      # "GOCR is an OCR (Optical Character Recognition)
      # program, developed under the GNU Public License.
      # It converts scanned images of text back to text files."
      #
      # Project site: http://jocr.sourceforge.net
      class GOCR
        # Needed to quote file paths before they reach the shell.
        require 'shellwords'

        # Read a file using the GOCR reader.
        #
        # The document's file is converted to a temporary PGM image
        # with the `convert` command, `gocr` is run on that image, and
        # the stripped output is appended to the document as an entity
        # built by Treat::Entities::Entity.from_string.
        #
        # Both paths are shell-escaped so that file names containing
        # spaces or shell metacharacters are passed through verbatim
        # instead of being split or interpreted by the shell.
        #
        # Options: none.
        #
        # Returns the document.
        def self.read(document, options = {})
          create_temp_file(:pgm) do |tmp|
            source = ::Shellwords.escape(document.file.to_s)
            target = ::Shellwords.escape(tmp.to_s)
            `convert #{source} #{target}`
            f = `gocr #{target}`.strip
            document << Treat::Entities::Entity.from_string(f)
          end
          document
        end
      end
    end
  end
end
|
@@ -1,31 +0,0 @@
|
|
1
|
-
module Treat
  module Formatters
    module Readers
      # This class is a wrapper for the Google Ocropus
      # optical character recognition (OCR) engine.
      #
      # "OCRopus(tm) is a state-of-the-art document
      # analysis and OCR system, featuring pluggable
      # layout analysis, pluggable character recognition,
      # statistical natural language modeling, and multi-
      # lingual capabilities."
      #
      # Original paper:
      # Breuel, Thomas M. The Ocropus Open Source OCR System.
      # DFKI and U. Kaiserslautern, Germany.
      class Ocropus
        # Needed to quote file paths before they reach the shell.
        require 'shellwords'

        # Read a file using the Google Ocropus reader.
        #
        # Runs `ocropus page` on the document's file with standard
        # output redirected to a temporary text file, reads that file
        # back, and appends its contents to the document as an entity
        # built by Treat::Entities::Entity.from_string.
        #
        # Both paths are shell-escaped so that file names containing
        # spaces or shell metacharacters are passed through verbatim
        # instead of being split or interpreted by the shell.
        #
        # Options: none.
        #
        # Returns the document.
        def self.read(document, options = {})
          create_temp_file(:txt) do |tmp|
            source = ::Shellwords.escape(document.file.to_s)
            target = ::Shellwords.escape(tmp.to_s)
            `ocropus page #{source} > #{target} -STDIO 2>/dev/null`
            f = File.read(tmp)
            document << Treat::Entities::Entity.from_string(f)
          end
          document
        end
      end
    end
  end
end
|
@@ -1,20 +0,0 @@
|
|
1
|
-
module Treat
  module Formatters
    module Visualizers
      # Builds the string used when an entity is inspected.
      class Inspect
        # Return a terminal-friendly, one-line description of an entity.
        #
        # The line always starts with the entity's unqualified class
        # name followed by its id in parentheses. Unless caller_method
        # (a helper defined elsewhere) reports :inspect, the inspected
        # short value, features and edges are appended, each separated
        # by " | ".
        #
        # Options: none.
        def self.visualize(entity, options = {})
          class_name = entity.class.to_s.split('::').last
          summary = "#{class_name} (#{entity.id.to_s})"
          return summary if caller_method == :inspect
          details = [entity.short_value, entity.features, entity.edges]
          summary + " | " + details.map { |part| part.inspect }.join(" | ")
        end
      end
    end
  end
end
|
@@ -1,18 +0,0 @@
|
|
1
|
-
silence_warnings { require 'english' }
|
2
|
-
|
3
|
-
module Treat
  module Inflectors
    module Declensions
      # Pluralizes or singularizes an entity's string form through the
      # ::English module.
      module En
        # Return the declension of the entity's string value.
        #
        # Options:
        # - :count => :plural   -> result of ::English.plural
        # - :count => :singular -> result of ::English.singular
        #
        # Any other :count value yields nil.
        def self.declense(entity, options)
          word = entity.to_s
          case options[:count]
          when :plural then ::English.plural(word)
          when :singular then ::English.singular(word)
          end
        end
      end
    end
  end
end
|
@@ -1,23 +0,0 @@
|
|
1
|
-
module Treat
  module Languages
    class English
      # Every word category known for English.
      Categories = [
        :adjective,
        :adverb,
        :noun,
        :verb,
        :interjection,
        :clitic,
        :coverb,
        :conjunction,
        :determiner,
        :particle,
        :preposition,
        :pronoun,
        :number,
        :symbol,
        :punctuation,
        :complementizer
      ]

      # AlignedWordTags alternates a description with a
      # [claws, brown, penn] tag triple. The category is the first
      # word of the description (anything from the first comma on is
      # dropped), lower-cased and interned.
      table = {}
      Treat::Languages::English::AlignedWordTags.each_slice(2) do |description, (claws_tag, brown_tag, penn_tag)|
        category = description.gsub(',', ' ,').split(' ').first.downcase.intern
        # Make sure all three tags have an entry before filling any of
        # them in, so the key order of the table is claws, brown, penn.
        [claws_tag, brown_tag, penn_tag].each { |tag| table[tag] ||= {} }
        table[claws_tag][:claws_5] = category
        table[brown_tag][:brown] = category
        table[penn_tag][:penn] = category
      end

      # A hash converting word tags to word categories, keyed first
      # by tag string and then by tag set (:claws_5, :brown, :penn).
      WordTagToCategory = table
    end
  end
end
|
@@ -1,352 +0,0 @@
|
|
1
|
-
module Treat
  module Languages
    # Tag-set tables for English: tag descriptions for the Penn
    # Treebank, Brown and Enju tag sets, plus mappings between them.
    class English

      # Positions of each tag set inside the tag triples stored in
      # AlignedWordTags ([claws, brown, penn]).
      ClawsC5 = 0
      Brown = 1
      Penn = 2

      # [tag, description] pairs for Penn Treebank clause-level tags.
      PTBClauseTagDescription = [
        ['S', 'Simple declarative clause'],
        ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
        ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
        ['SINV', 'Inverted declarative sentence'],
        ['SQ', 'Inverted yes/no question']
      ]

      # [tag, description] pairs for Penn Treebank phrase-level tags.
      PTBPhraseTagDescription = [
        ['ADJP', 'Adjective phrase'],
        ['ADVP', 'Adverb phrase'],
        ['CONJP', 'Conjunction phrase'],
        ['FRAG', 'Fragment'],
        ['INTJ', 'Interjection'],
        ['LST', 'List marker'],
        ['NAC', 'Not a constituent'],
        ['NP', 'Noun phrase'],
        ['NX', 'Head of an NP'],
        ['PP', 'Prepositional phrase'],
        ['PRN', 'Parenthetical'],
        ['PRT', 'Particle'],
        ['QP', 'Quantifier phrase'],
        ['RRC', 'Reduced relative clause'],
        ['UCP', 'Unlike coordinated phrase'],
        ['VP', 'Verb phrase'],
        ['WHADJP', 'Wh-adjective phrase'],
        # Spelled 'WHADVP' to agree with the tag emitted by
        # EnjuCatXcatToPTB below (was misspelled 'WHAVP').
        ['WHADVP', 'Wh-adverb phrase'],
        ['WHNP', 'Wh-noun phrase'],
        ['WHPP', 'Wh-prepositional phrase'],
        ['X', 'Unknown, uncertain, or unbracketable']
      ]

      # [tag, description] pairs for Penn Treebank word-level tags.
      PTBWordTagDescription = [
        ['CC', 'Coordinating conjunction'],
        ['CD', 'Cardinal number'],
        ['DT', 'Determiner'],
        ['EX', 'Existential there'],
        ['FW', 'Foreign word'],
        ['IN', 'Preposition or subordinating conjunction'],
        ['JJ', 'Adjective'],
        ['JJR', 'Adjective, comparative'],
        ['JJS', 'Adjective, superlative'],
        ['LS', 'List item marker'],
        ['MD', 'Modal'],
        ['NN', 'Noun, singular or mass'],
        ['NNS', 'Noun, plural'],
        ['NNP', 'Proper noun, singular'],
        ['NNPS', 'Proper noun, plural'],
        ['PDT', 'Predeterminer'],
        ['POS', 'Possessive ending'],
        ['PRP', 'Personal pronoun'],
        ['PRP$', 'Possessive pronoun (prolog version PRP-S)'],
        ['RB', 'Adverb'],
        ['RBR', 'Adverb, comparative'],
        ['RBS', 'Adverb, superlative'],
        ['RP', 'Particle'],
        ['SYM', 'Symbol'],
        ['TO', 'to'],
        ['UH', 'Interjection'],
        ['VB', 'Verb, base form'],
        ['VBD', 'Verb, past tense'],
        ['VBG', 'Verb, gerund or present participle'],
        ['VBN', 'Verb, past participle'],
        ['VBP', 'Verb, non 3rd person singular present'],
        ['VBZ', 'Verb, 3rd person singular present'],
        ['WDT', 'Wh-determiner'],
        ['WP', 'Wh-pronoun'],
        ['WP$', 'Possessive wh-pronoun (prolog version WP-S)'],
        ['WRB', 'Wh-adverb']
      ]

      # [tag, description] pairs for Brown corpus word-level tags.
      BrownWordTagDescription = [

        ['.', 'sentence closer . ; ? !'],
        ['(', 'left parent'] ,
        [')', 'right parent'],
        ['*', 'not'],
        ['--', 'dash'],
        [',', 'comma'],
        [':', 'colon'],
        ['ABL', 'pre-qualifier quite, rather'],
        ['ABN', 'pre-quantifier half, all'],
        ['ABX', 'pre-quantifier both'],
        ['AP', 'post-determiner many, several, next'],
        ['AT', 'article a, the, no'],
        ['BE', 'be '],
        ['BED', 'were '],
        ['BEDZ', 'was '],
        ['BEG', 'being '],
        ['BEM', 'am '],
        ['BEN', 'been '],
        ['BER', 'are, art '],
        ['BEZ', 'is '],
        ['CC', 'coordinating conjunction and, or'],
        ['CD', 'cardinal numeral one, two, 2, etc.'],
        ['CS', 'subordinating conjunction if, although'],
        ['DO', 'do '],
        ['DOD', 'did '],
        ['DOZ', 'does '],
        ['DT', 'singular determiner this, that'],
        ['DTI', 'singular or plural determiner/quantifier some, any'],
        ['DTS', 'plural determiner these, those'],
        ['DTX', 'determiner/double conjunction either'],
        ['EX', 'existentil there '],
        ['FW', 'foreign word (hyphenated before regular tag) '],
        ['HL', 'word occurring in headline (hyphenated after regular tag) '],
        ['HV', 'have '],
        ['HVD', 'had (past tense) '],
        ['HVG', 'having '],
        ['HVN', 'had (past participle) '],
        ['HVZ', 'has '],
        ['IN', 'preposition '],
        ['JJ', 'adjective '],
        ['JJR', 'comparative adjective '],
        ['JJS', 'semantically superlative adjective chief, top'],
        ['JJT', 'morphologically superlative adjective biggest'],
        ['MD', 'modal auxiliary can, should, will'],
        ['NC', 'cited word (hyphenated after regular tag) '],
        ['NN', 'singular or mass noun '],
        ['NN$', 'possessive singular noun '],
        ['NNS', 'plural noun '],
        ['NNS$', 'possessive plural noun '],
        ['NP', 'proper noun or part of name phrase '],
        ['NP$', 'possessive proper noun '],
        ['NPS', 'plural proper noun '],
        ['NPS$', 'possessive plural proper noun '],
        ['NR', 'adverbial noun home, today, west'],
        ['NRS', 'plural adverbial noun'],
        ['OD', 'ordinal numeral first, 2nd'],
        ['PN', 'nominal pronoun everybody, nothing'],
        ['PN$', 'possessive nominal pronoun '],
        ['PP$', 'possessive personal pronoun my, our'],
        ['PP$$', 'second (nominal) possessive pronoun mine, ours'],
        ['PPL', 'singular reflexive/intensive personal pronoun myself'],
        ['PPLS', 'plural reflexive/intensive personal pronoun ourselves'],
        ['PPO', 'objective personal pronoun me, him, it, them'],
        ['PPS', '3rd. singular nominative pronoun he, she, it, one'],
        ['PPSS', 'other nominative personal pronoun I, we, they, you'],
        ['QL', 'qualifier very, fairly'],
        ['QLP', 'post-qualifier enough, indeed'],
        ['RB', 'adverb '],
        ['RBR', 'comparative adverb '],
        ['RBT', 'superlative adverb '],
        ['RN', 'nominal adverb here then, indoors '],
        ['RP', 'adverb/particle about, off, up'],
        ['TL', 'word occurring in title (hyphenated after regular tag)'],
        ['TO', 'infinitive marker to '],
        ['UH', 'interjection, exclamation '],
        ['VB', 'verb, base form '],
        ['VBD', 'verb, past tense '],
        ['VBG', 'verb, present participle/gerund '],
        ['VBN', 'verb, past participle '],
        ['VBZ', 'verb, 3rd. singular present '],
        ['WDT', 'wh- determiner what, which'],
        ['WP$', 'possessive wh- pronoun whose'],
        ['WPO', 'objective wh- pronoun whom, which, that'],
        ['WPS', 'nominative wh- pronoun who, which, that'],
        ['WQL', 'wh- qualifier how'],
        ['WRB', 'wh- adverb how, where, when']

      ]

      # A description of Enju categories.
      EnjuCatDescription = [
        ['ADJ', 'Adjective'],
        ['ADV', 'Adverb'],
        ['CONJ', 'Coordination conjunction'],
        ['C', 'Complementizer'],
        ['D', 'Determiner'],
        ['N', 'Noun'],
        ['P', 'Preposition'],
        ['SC', 'Subordination conjunction'],
        ['V', 'Verb'],
        ['COOD', 'Part of coordination'],
        ['PN', 'Punctuation'],
        ['PRT', 'Particle'],
        ['S', 'Sentence']
      ]

      # Maps Enju categories to Treat categories.
      EnjuCatToCategory = {
        'ADJ' => :adjective,
        'ADV' => :adverb,
        'CONJ' => :conjunction,
        'COOD' => :conjunction,
        'C' => :complementizer,
        'D' => :determiner,
        'N' => :noun,
        'P' => :preposition,
        'PN' => :punctuation,
        'SC' => :conjunction,
        'V' => :verb,
        'PRT' => :particle
      }

      # Description of the xcat in the Enju output specification.
      EnjuXCatDescription = [
        ['COOD', 'Coordinated phrase/clause'],
        ['IMP', 'Imperative sentence'],
        ['INV', 'Subject-verb inversion'],
        ['Q', 'Interrogative sentence with subject-verb inversion'],
        ['REL', 'A relativizer included'],
        ['FREL', 'A free relative included'],
        ['TRACE', 'A trace included'],
        ['WH', 'A wh-question word included']
      ]

      # [enju cat, enju xcat, penn treebank tag] triples.
      #
      # Every row must have exactly three comma-separated elements:
      # adjacent string literals without commas are concatenated by
      # Ruby into a single string, which previously collapsed the
      # three NP/WHNP rows into one-element arrays.
      EnjuCatXcatToPTB = [
        ['ADJP', '', 'ADJP'],
        ['ADJP', 'REL', 'WHADJP'],
        ['ADJP', 'FREL', 'WHADJP'],
        ['ADJP', 'WH', 'WHADJP'],
        ['ADVP', '', 'ADVP'],
        ['ADVP', 'REL', 'WHADVP'],
        ['ADVP', 'FREL', 'WHADVP'],
        ['ADVP', 'WH', 'WHADVP'],
        ['CONJP', '', 'CONJP'],
        ['CP', '', 'SBAR'],
        ['DP', '', 'NP'],
        ['NP', '', 'NP'],
        ['NX', 'NX', 'NAC'],
        ['NP', 'REL', 'WHNP'],
        ['NP', 'FREL', 'WHNP'],
        ['NP', 'WH', 'WHNP'],
        ['PP', '', 'PP'],
        ['PP', 'REL', 'WHPP'],
        ['PP', 'WH', 'WHPP'],
        ['PRT', '', 'PRT'],
        ['S', '', 'S'],
        ['S', 'INV', 'SINV'],
        ['S', 'Q', 'SQ'],
        ['S', 'REL', 'SBAR'],
        ['S', 'FREL', 'SBAR'],
        ['S', 'WH', 'SBARQ'],
        ['SCP', '', 'SBAR'],
        ['VP', '', 'VP'],
        ['VP', '', 'VP'],
        ['', '', 'UK']
      ]

      # Aligned tags for the Claws C5, Brown and Penn tag sets.
      # Adapted from Manning, Christopher and Schütze, Hinrich,
      # 1999. Foundations of Statistical Natural Language
      # Processing. MIT Press, p. 141-142.
      #
      # The list alternates a description with a [claws, brown, penn]
      # triple. The first word of each description is used elsewhere
      # as the word category, so it must be spelled consistently
      # (the colon and comma rows were previously misspelled
      # 'Puncutation' and 'Punctuationm').
      AlignedWordTags = [
        'Adjective', ['AJ0', 'JJ', 'JJ'],
        'Adjective, ordinal number', ['ORD', 'OD', 'JJ'],
        'Adjective, comparative', ['AJC', 'JJR', 'JJR'],
        'Adjective, superlative', ['AJS', 'JJT', 'JJS'],
        'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ'],
        'Adjective, cardinal number', ['CRD', 'CD', 'CD'],
        'Adjective, cardinal number, one', ['PNI', 'CD', 'CD'],
        'Adverb', ['AV0', 'RB', 'RB'],
        'Adverb, negative', ['XX0', '*', 'RB'],
        'Adverb, comparative', ['AV0', 'RBR', 'RBR'],
        'Adverb, superlative', ['AV0', 'RBT', 'RBS'],
        'Adverb, particle', ['AVP', 'RP', 'RP'],
        'Adverb, question', ['AVQ', 'WRB', 'WRB'],
        'Adverb, degree & question', ['AVQ', 'WQL', 'WRB'],
        'Adverb, degree', ['AV0', 'QL', 'RB'],
        'Adverb, degree, postposed', ['AV0', 'QLP', 'RB'],
        'Adverb, nominal', ['AV0', 'RN', 'RB'],
        'Conjunction, coordination', ['CJC', 'CC', 'CC'],
        'Conjunction, subordination', ['CJS', 'CS', 'IN'],
        'Conjunction, complementizer, that', ['CJT', 'CS', 'IN'],
        'Determiner', ['DT0', 'DT', 'DT'],
        'Determiner, pronoun', ['DT0', 'DTI', 'DT'],
        'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT'],
        'Determiner, prequalifier', ['DT0', 'ABL', 'DT'],
        'Determiner, prequantifier', ['DT0', 'ABN', 'PDT'],
        'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT'],
        'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT'],
        'Determiner, article', ['AT0', 'AT', 'DT'],
        'Determiner, postdeterminer', ['DT0', 'AP', 'JJ'],
        'Determiner, possessive', ['DPS', 'PP$', 'PRP$'],
        'Determiner, possessive, second', ['DPS', 'PP$$', 'PRP'],
        'Determiner, question', ['DTQ', 'WDT', 'WDT'],
        'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$'],
        'Noun', ['NN0', 'NN', 'NN'],
        'Noun, singular', ['NN1', 'NN', 'NN'],
        'Noun, plural', ['NN2', 'NNS', 'NNS'],
        'Noun, proper, singular', ['NP0', 'NP', 'NNP'],
        'Noun, proper, plural', ['NP0', 'NPS', 'NNPS'],
        'Noun, adverbial', ['NN0', 'NR', 'NN'],
        'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS'],
        'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP'],
        'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP'],
        'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP'],
        'Pronoun, personal, object', ['PNP', 'PPO', 'PRP'],
        'Pronoun, reflexive', ['PNX', 'PPL', 'PRP'],
        'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP'],
        'Pronoun, question, subject', ['PNQ', 'WPS', 'WP'],
        'Pronoun, question, object', ['PNQ', 'WPO', 'WP'],
        'Pronoun, existential there', ['EX0', 'EX', 'EX'],
        'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP'],
        'Verb, infinitive', ['VVI', 'VB', 'VB'],
        'Verb, past tense', ['VVD', 'VBD', 'VBD'],
        'Verb, present participle', ['VVG', 'VBG', 'VBG'],
        'Verb, past/passive participle', ['VVN', 'VBN', 'VBN'],
        'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ'],
        'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP'],
        'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB'],
        'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD'],
        'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG'],
        'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN'],
        'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ'],
        'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP'],
        'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB'],
        'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD'],
        'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG'],
        'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN'],
        'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ'],
        'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB'],
        'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD'],
        'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD'],
        'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG'],
        'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN'],
        'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ'],
        'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP'],
        'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP'],
        'Verb, modal', ['VM0', 'MD', 'MD'],
        'Preposition, to as infinitive marker', ['TO0', 'TO', 'TO'],
        'Preposition, to', ['PRP', 'IN', 'TO'],
        'Preposition', ['PRP', 'IN', 'IN'],
        'Preposition, of', ['PRF', 'IN', 'IN'],
        'Possessive', ['POS', '$', 'POS'],
        'Interjection (or other isolate)', ['ITJ', 'UH', 'UH'],
        'Punctuation, sentence ender', ['PUN', '.', '.'],
        'Punctuation, semicolon', ['PUN', '.', '.'],
        'Punctuation, colon or ellipsis', ['PUN', ':', ':'],
        'Punctuation, comma', ['PUN', ',', ','],
        'Punctuation, dash', ['PUN', '-', '-'],
        'Punctuation, dollar sign', ['PUN', '', '$'],
        'Punctuation, left bracket', ['PUL', '(', '('],
        'Punctuation, right bracket', ['PUR', ')', ')'],
        'Punctuation, quotation mark, left', ['PUQ', '', '``'],
        'Punctuation, quotation mark, right', ['PUQ', '', '"'],
        'Unknown, foreign words (not in English lexicon)', ['UNZ', '(FW-)', 'FW'],
        'Symbol', ['', '', 'SYM'],
        'Symbol, alphabetical', ['ZZ0', '', ''],
        'Symbol, list item', ['', '', 'LS']
      ]
    end
  end
end
|