treat 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +4 -4
- data/TODO +21 -54
- data/lib/economist/half_cocked_basel.txt +16 -0
- data/lib/economist/hose_and_dry.doc +0 -0
- data/lib/economist/hungarys_troubles.abw +70 -0
- data/lib/economist/republican_nomination.pdf +0 -0
- data/lib/economist/saving_the_euro.odt +0 -0
- data/lib/economist/to_infinity_and_beyond.txt +15 -0
- data/lib/economist/zero_sum.html +91 -0
- data/lib/treat.rb +58 -72
- data/lib/treat/buildable.rb +59 -15
- data/lib/treat/categories.rb +26 -14
- data/lib/treat/category.rb +2 -2
- data/lib/treat/delegatable.rb +65 -48
- data/lib/treat/doable.rb +44 -0
- data/lib/treat/entities.rb +34 -14
- data/lib/treat/entities/collection.rb +2 -0
- data/lib/treat/entities/document.rb +3 -2
- data/lib/treat/entities/entity.rb +105 -90
- data/lib/treat/entities/phrases.rb +17 -0
- data/lib/treat/entities/tokens.rb +28 -13
- data/lib/treat/entities/zones.rb +20 -0
- data/lib/treat/extractors.rb +49 -11
- data/lib/treat/extractors/coreferences/stanford.rb +68 -0
- data/lib/treat/extractors/date/chronic.rb +32 -0
- data/lib/treat/extractors/date/ruby.rb +25 -0
- data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
- data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
- data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
- data/lib/treat/extractors/language/what_language.rb +49 -0
- data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
- data/lib/treat/extractors/roles/naive.rb +73 -0
- data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
- data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
- data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
- data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
- data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
- data/lib/treat/extractors/time/nickel.rb +30 -12
- data/lib/treat/extractors/topic_words/lda.rb +9 -9
- data/lib/treat/extractors/topics/reuters.rb +14 -15
- data/lib/treat/extractors/topics/reuters/region.xml +1 -0
- data/lib/treat/features.rb +7 -0
- data/lib/treat/formatters/readers/abw.rb +6 -1
- data/lib/treat/formatters/readers/autoselect.rb +5 -6
- data/lib/treat/formatters/readers/doc.rb +3 -1
- data/lib/treat/formatters/readers/html.rb +1 -1
- data/lib/treat/formatters/readers/image.rb +43 -0
- data/lib/treat/formatters/readers/odt.rb +1 -2
- data/lib/treat/formatters/readers/pdf.rb +9 -1
- data/lib/treat/formatters/readers/xml.rb +40 -0
- data/lib/treat/formatters/serializers/xml.rb +50 -14
- data/lib/treat/formatters/serializers/yaml.rb +7 -2
- data/lib/treat/formatters/unserializers/xml.rb +33 -7
- data/lib/treat/formatters/visualizers/dot.rb +90 -20
- data/lib/treat/formatters/visualizers/short_value.rb +2 -2
- data/lib/treat/formatters/visualizers/standoff.rb +2 -2
- data/lib/treat/formatters/visualizers/tree.rb +1 -1
- data/lib/treat/formatters/visualizers/txt.rb +13 -4
- data/lib/treat/group.rb +16 -10
- data/lib/treat/helpers/linguistics_loader.rb +18 -0
- data/lib/treat/inflectors.rb +10 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
- data/lib/treat/inflectors/declensions/english.rb +319 -0
- data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
- data/lib/treat/install.rb +59 -0
- data/lib/treat/kernel.rb +18 -8
- data/lib/treat/languages.rb +18 -11
- data/lib/treat/languages/arabic.rb +4 -2
- data/lib/treat/languages/chinese.rb +6 -2
- data/lib/treat/languages/dutch.rb +16 -0
- data/lib/treat/languages/english.rb +47 -19
- data/lib/treat/languages/french.rb +8 -5
- data/lib/treat/languages/german.rb +9 -6
- data/lib/treat/languages/greek.rb +16 -0
- data/lib/treat/languages/italian.rb +6 -3
- data/lib/treat/languages/polish.rb +16 -0
- data/lib/treat/languages/portuguese.rb +16 -0
- data/lib/treat/languages/russian.rb +16 -0
- data/lib/treat/languages/spanish.rb +16 -0
- data/lib/treat/languages/swedish.rb +16 -0
- data/lib/treat/languages/tags.rb +377 -0
- data/lib/treat/lexicalizers.rb +34 -23
- data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
- data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
- data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
- data/lib/treat/lexicalizers/tag/brill.rb +35 -40
- data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
- data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
- data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
- data/lib/treat/processors.rb +8 -8
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +114 -99
- data/lib/treat/processors/parsers/stanford.rb +109 -41
- data/lib/treat/processors/segmenters/punkt.rb +17 -18
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
- data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
- data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
- data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
- data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
- data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
- data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
- data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
- data/lib/treat/processors/segmenters/stanford.rb +38 -37
- data/lib/treat/processors/segmenters/tactful.rb +5 -4
- data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
- data/lib/treat/processors/tokenizers/perl.rb +2 -2
- data/lib/treat/processors/tokenizers/punkt.rb +6 -2
- data/lib/treat/processors/tokenizers/stanford.rb +25 -24
- data/lib/treat/processors/tokenizers/tactful.rb +1 -2
- data/lib/treat/proxies.rb +2 -35
- data/lib/treat/registrable.rb +17 -22
- data/lib/treat/sugar.rb +11 -11
- data/lib/treat/tree.rb +27 -17
- data/lib/treat/viewable.rb +29 -0
- data/lib/treat/visitable.rb +1 -1
- data/test/tc_entity.rb +56 -49
- data/test/tc_extractors.rb +41 -18
- data/test/tc_formatters.rb +7 -8
- data/test/tc_inflectors.rb +19 -24
- data/test/tc_lexicalizers.rb +12 -19
- data/test/tc_processors.rb +26 -12
- data/test/tc_resources.rb +2 -7
- data/test/tc_treat.rb +20 -22
- data/test/tc_tree.rb +4 -4
- data/test/tests.rb +3 -5
- data/test/texts.rb +13 -14
- data/tmp/INFO +1 -0
- metadata +78 -158
- data/bin/INFO +0 -1
- data/examples/benchmark.rb +0 -81
- data/examples/keywords.rb +0 -148
- data/lib/treat/detectors.rb +0 -31
- data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
- data/lib/treat/detectors/format/file.rb +0 -36
- data/lib/treat/detectors/language/what_language.rb +0 -29
- data/lib/treat/entities/constituents.rb +0 -15
- data/lib/treat/entities/sentence.rb +0 -8
- data/lib/treat/extractors/named_entity/abner.rb +0 -20
- data/lib/treat/extractors/named_entity/stanford.rb +0 -174
- data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
- data/lib/treat/extractors/time/chronic.rb +0 -20
- data/lib/treat/extractors/time/native.rb +0 -18
- data/lib/treat/formatters/readers/gocr.rb +0 -26
- data/lib/treat/formatters/readers/ocropus.rb +0 -31
- data/lib/treat/formatters/visualizers/html.rb +0 -13
- data/lib/treat/formatters/visualizers/inspect.rb +0 -20
- data/lib/treat/inflectors/declensions/en.rb +0 -18
- data/lib/treat/languages/categories.rb +0 -5
- data/lib/treat/languages/english/categories.rb +0 -23
- data/lib/treat/languages/english/tags.rb +0 -352
- data/lib/treat/languages/xinhua.rb +0 -12
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
- data/lib/treat/string.rb +0 -5
- data/test/tc_detectors.rb +0 -26
@@ -0,0 +1,29 @@
|
|
1
|
+
module Treat
|
2
|
+
module Lexicalizers
|
3
|
+
module Tag
|
4
|
+
class Tagger
|
5
|
+
def self.tag(entity, options = {})
|
6
|
+
if (entity.is_a?(Treat::Entities::Sentence) ||
|
7
|
+
entity.is_a?(Treat::Entities::Phrase)) &&
|
8
|
+
!entity.has_children?
|
9
|
+
raise Treat::Exception,
|
10
|
+
"Annotator 'tag' requires processor 'tokenize'."
|
11
|
+
elsif entity.is_a?(Treat::Entities::Word)
|
12
|
+
if entity.has_parent?
|
13
|
+
ps = entity.parent_sentence
|
14
|
+
pp = entity.parent_phrase
|
15
|
+
if ps
|
16
|
+
self.tag(ps, options)
|
17
|
+
elsif pp
|
18
|
+
self.tag(pp, options)
|
19
|
+
end
|
20
|
+
return entity.features[:tag]
|
21
|
+
else
|
22
|
+
return :isolated_word
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/lib/treat/processors.rb
CHANGED
@@ -9,15 +9,15 @@ module Treat
|
|
9
9
|
# - Chunkers : split a text into zone objects.
|
10
10
|
# - Segmenters : split a text or zone into sentence objects.
|
11
11
|
# - Tokenizers : split a sentence into Token objects.
|
12
|
-
# - Parsers: split a sentence into a tree of
|
13
|
-
# containing other
|
12
|
+
# - Parsers: split a sentence into a tree of phrases
|
13
|
+
# containing other phrases and Token objects, representing
|
14
14
|
# the syntactic structure.
|
15
15
|
module Processors
|
16
16
|
# Chunkers split a text into zones.
|
17
17
|
module Chunkers
|
18
18
|
extend Group
|
19
19
|
self.type = :transformer
|
20
|
-
self.targets = [:document, :
|
20
|
+
self.targets = [:document, :section]
|
21
21
|
end
|
22
22
|
# Segmenters split a text or zone into sentences.
|
23
23
|
module Segmenters
|
@@ -29,17 +29,17 @@ module Treat
|
|
29
29
|
module Tokenizers
|
30
30
|
extend Group
|
31
31
|
self.type = :transformer
|
32
|
-
self.targets = [:document, :zone, :
|
32
|
+
self.targets = [:document, :zone, :phrase]
|
33
33
|
end
|
34
|
-
# Parsers split a sentence into
|
34
|
+
# Parsers split a sentence into phrase objects
|
35
35
|
# representing its syntactic structure, with the
|
36
|
-
# Token objects as children of the
|
36
|
+
# Token objects as children of the phrases.
|
37
37
|
module Parsers
|
38
38
|
extend Group
|
39
39
|
self.type = :transformer
|
40
|
-
self.targets = [:document, :zone, :
|
40
|
+
self.targets = [:document, :zone, :phrase]
|
41
41
|
end
|
42
|
-
# Makes all the groups autoloadable and creates the
|
42
|
+
# Makes all the groups autoloadable and creates the workers.
|
43
43
|
extend Treat::Category
|
44
44
|
end
|
45
45
|
end
|
@@ -2,14 +2,15 @@ module Treat
|
|
2
2
|
module Processors
|
3
3
|
module Chunkers
|
4
4
|
# This class separates a plain text file into
|
5
|
-
# zones based on
|
6
|
-
# file.
|
5
|
+
# zones based on an extremely naive analysis of the
|
6
|
+
# file. Surprisingly, this works pretty well.
|
7
7
|
class Txt
|
8
8
|
# Split a document into Zone objects.
|
9
9
|
def self.chunk(text, options = {})
|
10
10
|
zones = text.to_s.split("\n")
|
11
11
|
zones.each do |zone|
|
12
|
-
|
12
|
+
zone.strip!
|
13
|
+
next if zone == ''
|
13
14
|
if false # fix
|
14
15
|
text << Treat::Entities::List.new(zone)
|
15
16
|
end
|
@@ -19,7 +20,6 @@ module Treat
|
|
19
20
|
text << Treat::Entities::Paragraph.new(zone)
|
20
21
|
end
|
21
22
|
end
|
22
|
-
text
|
23
23
|
end
|
24
24
|
end
|
25
25
|
end
|
@@ -6,7 +6,7 @@ module Treat
|
|
6
6
|
# the parser formats it runs it through Enju, and
|
7
7
|
# parses the XML output by Enju using the Nokogiri
|
8
8
|
# XML reader. It creates wrappers for the sentences,
|
9
|
-
# syntactical
|
9
|
+
# syntactical phrases and tokens that Enju identified.
|
10
10
|
#
|
11
11
|
# Original paper:
|
12
12
|
# Takuya Matsuzaki, Yusuke Miyao, and Jun'ichi Tsujii.
|
@@ -29,20 +29,15 @@ module Treat
|
|
29
29
|
@@i = 0 if @@i == @@parsers.size
|
30
30
|
@@parsers[@@i-1]
|
31
31
|
end
|
32
|
-
# Parse the entity into its syntactical
|
33
|
-
#
|
32
|
+
# Parse the entity into its syntactical phrases using Enju.
|
33
|
+
# Calls #build to initiate XML parsing.
|
34
34
|
def self.parse(entity, options = {})
|
35
35
|
options[:processes] ||= 1
|
36
36
|
@@options = options
|
37
|
+
@@id_table = {}
|
38
|
+
@@dependencies_table = {}
|
37
39
|
stdin, stdout = proc
|
38
|
-
|
39
|
-
remove_last = true
|
40
|
-
text = entity.to_s + '.'
|
41
|
-
else
|
42
|
-
remove_last = false
|
43
|
-
text = entity.to_s.gsub('.', '')
|
44
|
-
text += '.' unless ['!', '?'].include?(text[-1])
|
45
|
-
end
|
40
|
+
text, remove_last = valid_text(entity)
|
46
41
|
stdin.puts(text + "\n")
|
47
42
|
parsed = build(stdout.gets, remove_last)
|
48
43
|
if not parsed.nil?
|
@@ -50,10 +45,16 @@ module Treat
|
|
50
45
|
parsed.children.each do |child|
|
51
46
|
entity << child
|
52
47
|
end
|
48
|
+
# Remove the period we added at the end.
|
49
|
+
if remove_last
|
50
|
+
last = entity.punctuations[-1]
|
51
|
+
entity.remove!(last)
|
52
|
+
end
|
53
53
|
else
|
54
54
|
warn "Couldn't parse the text '#{entity.to_s}'."
|
55
55
|
end
|
56
|
-
entity
|
56
|
+
link_heads(entity)
|
57
|
+
add_dependencies(entity)
|
57
58
|
end
|
58
59
|
# Parses an Enju XML output file using the Nokogiri
|
59
60
|
# XML reader and converts that structure into a tree
|
@@ -63,8 +64,6 @@ module Treat
|
|
63
64
|
xml_reader = Nokogiri::XML::Reader.from_memory(xml)
|
64
65
|
current_element = nil
|
65
66
|
previous_depth = 0
|
66
|
-
id_table = {}
|
67
|
-
edges_table = {}
|
68
67
|
# Read the XML file entity by entity.
|
69
68
|
while xml_reader.read
|
70
69
|
# The depth in the XML tree.
|
@@ -81,119 +80,135 @@ module Treat
|
|
81
80
|
previous_depth = current_depth
|
82
81
|
next
|
83
82
|
end
|
83
|
+
# Get and format attributes and dependencies.
|
84
84
|
attributes = xml_reader.attributes
|
85
|
-
|
86
|
-
|
87
|
-
unless attributes.
|
88
|
-
new_attributes =
|
89
|
-
|
90
|
-
id = attributes.delete('id')
|
91
|
-
pred = attributes.delete('pred')
|
92
|
-
attributes.each_pair do |attribute, value|
|
93
|
-
if ['arg1', 'arg2'].include?(attribute)
|
94
|
-
edges[value] = pred
|
95
|
-
else
|
96
|
-
if attribute == 'cat'
|
97
|
-
if xml_reader.name == 'tok'
|
98
|
-
if value.length > 1 && ['P', 'X'].include?(value[-1]) &&
|
99
|
-
value != 'PN'
|
100
|
-
new_attributes[:saturated] = (value[-1] == 'P')
|
101
|
-
value = value[0..-2]
|
102
|
-
end
|
103
|
-
cat = Treat::Languages::English::EnjuCatToCategory[value]
|
104
|
-
new_attributes[:cat] = cat
|
105
|
-
else
|
106
|
-
new_attributes[:enju_cat] = value
|
107
|
-
xcat = attributes['xcat'].split(' ')[0]
|
108
|
-
xcat ||= ''
|
109
|
-
tags = Treat::Languages::English::EnjuCatXcatToPTB.select do |m|
|
110
|
-
m[0] == value && m[1] == xcat
|
111
|
-
end
|
112
|
-
if tags.empty?
|
113
|
-
tag = 'UK'
|
114
|
-
else
|
115
|
-
tag = tags[0][2]
|
116
|
-
end
|
117
|
-
new_attributes[:enju_xcat] = xcat
|
118
|
-
attributes.delete('xcat')
|
119
|
-
new_attributes[:tag] = tag
|
120
|
-
end
|
121
|
-
else
|
122
|
-
pre = prefix.include?(attribute) ? 'enju_' : ''
|
123
|
-
new_attributes[:"#{pre+attribute}"] = value
|
124
|
-
end
|
125
|
-
end
|
126
|
-
end
|
127
|
-
attributes.delete('arg1')
|
128
|
-
attributes.delete('arg2')
|
129
|
-
end
|
130
|
-
# Handle naming conventions.
|
131
|
-
if attributes.has_key?('pos')
|
132
|
-
new_attributes[:tag] = new_attributes[:pos]
|
133
|
-
new_attributes[:tag_set] = :penn
|
134
|
-
new_attributes.delete :pos
|
85
|
+
id = attributes.delete('id')
|
86
|
+
new_attributes = {}; dependencies = {}
|
87
|
+
unless attributes.size == 0
|
88
|
+
new_attributes, dependencies =
|
89
|
+
cleanup_attributes(xml_reader.name, attributes)
|
135
90
|
end
|
136
91
|
# Create the appropriate entity for the
|
137
92
|
# element.
|
138
93
|
current_value = ''
|
139
|
-
attributes = new_attributes
|
140
94
|
case xml_reader.name
|
141
95
|
when 'sentence'
|
142
96
|
current_element = Treat::Entities::Sentence.new('')
|
143
|
-
id_table[id] = current_element.id
|
144
|
-
|
145
|
-
current_element.features =
|
97
|
+
@@id_table[id] = current_element.id
|
98
|
+
@@dependencies_table[current_element.id] = dependencies
|
99
|
+
current_element.features = new_attributes
|
146
100
|
when 'cons'
|
147
101
|
current_element = current_element <<
|
148
102
|
Treat::Entities::Phrase.new('')
|
149
|
-
id_table[id] = current_element.id
|
150
|
-
|
151
|
-
current_element.features =
|
103
|
+
@@id_table[id] = current_element.id
|
104
|
+
@@dependencies_table[current_element.id] = dependencies
|
105
|
+
current_element.features = new_attributes
|
152
106
|
when 'tok'
|
153
|
-
tmp_attributes =
|
154
|
-
|
107
|
+
tmp_attributes = new_attributes
|
108
|
+
tmp_dependencies = dependencies
|
155
109
|
else
|
156
110
|
current_value = xml_reader.value.gsub(/\s+/, "")
|
157
|
-
|
111
|
+
unless current_value.size == 0
|
158
112
|
current_element = current_element <<
|
159
|
-
Treat::Entities::
|
113
|
+
Treat::Entities::Token.from_string(current_value)
|
160
114
|
if current_element.is_a?(Treat::Entities::Word)
|
161
115
|
current_element.features = tmp_attributes
|
162
|
-
id_table[id] = current_element.id
|
163
|
-
|
116
|
+
@@id_table[id] = current_element.id
|
117
|
+
@@dependencies_table[current_element.id] = tmp_dependencies
|
164
118
|
end
|
165
119
|
end
|
166
120
|
end
|
167
121
|
previous_depth = current_depth
|
168
122
|
end
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
123
|
+
current_element
|
124
|
+
end
|
125
|
+
# Validate a text - Enju wants a period to parse a sentence.
|
126
|
+
def self.valid_text(entity)
|
127
|
+
if entity.to_s.count('.') == 0
|
128
|
+
remove_last = true
|
129
|
+
text = entity.to_s + '.'
|
130
|
+
else
|
131
|
+
remove_last = false
|
132
|
+
text = entity.to_s.gsub('.', '')
|
133
|
+
text += '.' unless ['!', '?'].include?(text[-1])
|
134
|
+
end
|
135
|
+
return text, remove_last
|
136
|
+
end
|
137
|
+
# Link the head and sem_head to their entities.
|
138
|
+
def self.link_heads(entity)
|
139
|
+
entity.each_phrase do |phrase|
|
140
|
+
if phrase.has?(:head)
|
141
|
+
phrase.link(@@id_table[phrase.head], 'head', true, -1)
|
142
|
+
phrase.unset(:head)
|
143
|
+
end
|
144
|
+
if phrase.has?(:sem_head)
|
145
|
+
phrase.link(@@id_table[phrase.sem_head], 'sem_head', true, -1)
|
146
|
+
phrase.unset(:sem_head)
|
147
|
+
end
|
148
|
+
end
|
149
|
+
end
|
150
|
+
# Add dependencies a posteriori to a parsed entity.
|
151
|
+
def self.add_dependencies(entity2)
|
152
|
+
entity2.each_entity(:word, :phrase) do |entity|
|
153
|
+
@@dependencies_table.each_pair do |id2, dependencies2|
|
154
|
+
# Next if there are no dependencies.
|
155
|
+
next if dependencies2.nil?
|
156
|
+
entity = entity2.root.find(id2)
|
157
|
+
next if entity.nil?
|
158
|
+
dependencies2.each_pair do |argument, type|
|
159
|
+
# Skip this argument if we don't know the target node.
|
179
160
|
next if argument == 'unk'
|
180
|
-
entity.
|
161
|
+
entity.link(@@id_table[argument], type.intern)
|
181
162
|
end
|
182
163
|
end
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
164
|
+
end
|
165
|
+
end
|
166
|
+
# Helper function to convert Enju attributes to Treat attributes.
|
167
|
+
def self.cleanup_attributes(name, attributes)
|
168
|
+
new_attributes = {}
|
169
|
+
dependencies = {}
|
170
|
+
pred = attributes.delete('pred')
|
171
|
+
attributes.each_pair do |attribute2, value|
|
172
|
+
attribute = attribute2.strip
|
173
|
+
if attribute == 'arg1' || attribute == 'arg2'
|
174
|
+
dependencies[value] = pred
|
175
|
+
next
|
176
|
+
end
|
177
|
+
if attribute == 'cat'
|
178
|
+
new_attributes[:cat] = value
|
179
|
+
if name == 'tok'
|
180
|
+
if value.length > 1 && ['P', 'X'].include?(value[-1]) &&
|
181
|
+
value != 'PN'
|
182
|
+
new_attributes[:saturated] = (value[-1] == 'P')
|
183
|
+
value = value[0..-2]
|
184
|
+
end
|
185
|
+
new_attributes[:category] =
|
186
|
+
Treat::Languages::Tags::EnjuCatToCategory[value]
|
187
|
+
else
|
188
|
+
tags = Treat::Languages::Tags::EnjuCatXcatToPTB.select do |m|
|
189
|
+
m[0] == value && m[1] == attributes['xcat']
|
190
|
+
end
|
191
|
+
tag = (tags.size == 0) ? 'FW' : tags[0][2]
|
192
|
+
new_attributes[:tag] = tag
|
193
|
+
end
|
194
|
+
else
|
195
|
+
new_attributes[:"#{attribute}"] = value
|
189
196
|
end
|
190
197
|
end
|
191
|
-
#
|
192
|
-
|
193
|
-
|
194
|
-
|
198
|
+
# Delete after iteration.
|
199
|
+
attributes.delete('arg1')
|
200
|
+
attributes.delete('arg2')
|
201
|
+
# Handle naming conventions.
|
202
|
+
if attributes.has_key?('pos')
|
203
|
+
new_attributes[:tag] = new_attributes[:pos]
|
204
|
+
new_attributes[:tag_set] = :penn
|
205
|
+
new_attributes.delete :pos
|
195
206
|
end
|
196
|
-
|
207
|
+
if attributes.has_key?('base')
|
208
|
+
new_attributes[:lemma] = new_attributes[:base]
|
209
|
+
new_attributes.delete :base
|
210
|
+
end
|
211
|
+
return new_attributes, dependencies
|
197
212
|
end
|
198
213
|
end
|
199
214
|
end
|
@@ -3,60 +3,128 @@ module Treat
|
|
3
3
|
module Parsers
|
4
4
|
# A wrapper class for the Stanford parser.
|
5
5
|
class Stanford
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
Rjb::load(jars[0], ['-Xms256M', '-Xmx512M'])
|
15
|
-
LexicalizedParser = ::Rjb::import('edu.stanford.nlp.parser.lexparser.LexicalizedParser')
|
16
|
-
@@parsers = {}
|
6
|
+
require 'stanford-core-nlp'
|
7
|
+
@@parser = {}
|
8
|
+
DefaultOptions = {
|
9
|
+
:silence => false,
|
10
|
+
:log_to_file => nil,
|
11
|
+
:parser_model => nil,
|
12
|
+
:tagger_model => nil
|
13
|
+
}
|
17
14
|
# Parse the entity using the Stanford parser.
|
15
|
+
#
|
16
|
+
# Options:
|
17
|
+
# - (String) :log_to_file => a filename to log output to
|
18
|
+
# instead of displaying it.
|
18
19
|
def self.parse(entity, options = {})
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
if
|
23
|
-
|
20
|
+
options = DefaultOptions.merge(options)
|
21
|
+
lang = entity.language
|
22
|
+
StanfordCoreNLP.use(lang)
|
23
|
+
if options[:tagger_model]
|
24
|
+
::StanfordCoreNLP.set_model(
|
25
|
+
'pos.model', options[:tagger_model]
|
26
|
+
)
|
27
|
+
end
|
28
|
+
if options[:parser_model]
|
29
|
+
::StanfordCoreNLP.set_model(
|
30
|
+
'parser.model', options[:parser_model]
|
31
|
+
)
|
32
|
+
end
|
33
|
+
if options[:silence]
|
34
|
+
options[:log_to_file] = '/dev/null'
|
35
|
+
end
|
36
|
+
if options[:log_to_file]
|
37
|
+
::StanfordCoreNLP.log_file =
|
38
|
+
options[:log_to_file]
|
39
|
+
end
|
40
|
+
@@parser[lang] ||=
|
41
|
+
::StanfordCoreNLP.load(
|
42
|
+
:tokenize, :ssplit, :pos, :lemma, :parse
|
43
|
+
)
|
44
|
+
text = ::StanfordCoreNLP::Text.new(entity.to_s)
|
45
|
+
@@parser[lang].annotate(text)
|
46
|
+
|
47
|
+
text.get(:sentences).each do |s|
|
48
|
+
if entity.is_a?(Treat::Entities::Sentence) ||
|
49
|
+
entity.is_a?(Treat::Entities::Phrase)
|
50
|
+
tag = s.get(:category).to_s
|
51
|
+
tag_s, tag_opt = *tag.split('-')
|
52
|
+
tag_s ||= 'S'
|
53
|
+
entity.set :tag_set, :penn
|
54
|
+
entity.set :tag, tag_s
|
55
|
+
entity.set :tag_opt, tag_opt if tag_opt
|
56
|
+
recurse(s.get(:tree), entity)
|
57
|
+
break
|
58
|
+
else
|
59
|
+
recurse(s.get(:tree), entity)
|
60
|
+
end
|
24
61
|
end
|
25
|
-
@@parsers[lang] ||= LexicalizedParser.new(pcfgs[0])
|
26
|
-
parse = @@parsers[lang].apply(entity.to_s)
|
27
|
-
entity.remove_all!
|
28
|
-
recurse(parse, entity)
|
29
|
-
entity
|
30
62
|
end
|
31
|
-
|
63
|
+
|
64
|
+
# Helper method which recurses the tree supplied by
|
32
65
|
# the Stanford parser.
|
33
|
-
def self.recurse(java_node, ruby_node)
|
66
|
+
def self.recurse(java_node, ruby_node, additional_tags = [])
|
34
67
|
# Leaf
|
35
68
|
if java_node.num_children == 0
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
69
|
+
label = java_node.label
|
70
|
+
tag = label.get(:part_of_speech).to_s
|
71
|
+
tag_s, tag_opt = *tag.split('-')
|
72
|
+
tag_s ||= ''
|
73
|
+
ruby_node.value = java_node.value.to_s.strip
|
74
|
+
ruby_node.set :tag_set, :penn
|
75
|
+
ruby_node.set :tag, tag_s
|
76
|
+
ruby_node.set :tag_opt, tag_opt if tag_opt
|
77
|
+
ruby_node.set :tag_set, :penn
|
78
|
+
ruby_node.set :lemma, label.get(:lemma).to_s
|
79
|
+
|
80
|
+
ruby_node.set :character_offset_begin,
|
81
|
+
label.get(:character_offset_begin).to_s
|
82
|
+
|
83
|
+
ruby_node.set :character_offset_end,
|
84
|
+
label.get(:character_offset_end).to_s
|
85
|
+
|
86
|
+
ruby_node.set :begin_index,
|
87
|
+
label.get(:begin_index).to_s
|
88
|
+
|
89
|
+
ruby_node.set :end_index,
|
90
|
+
label.get(:end_index).to_s
|
91
|
+
|
92
|
+
additional_tags.each do |t|
|
93
|
+
lt = label.get(t)
|
94
|
+
ruby_node.set t, lt.to_s if lt
|
43
95
|
end
|
44
|
-
ruby_node
|
96
|
+
return ruby_node
|
45
97
|
else
|
46
|
-
|
47
|
-
|
98
|
+
|
99
|
+
if java_node.num_children == 1 &&
|
100
|
+
java_node.children[0].num_children == 0
|
101
|
+
recurse(java_node.children[0], ruby_node, additional_tags)
|
102
|
+
return
|
48
103
|
end
|
104
|
+
|
49
105
|
java_node.children.each do |java_child|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
106
|
+
label = java_child.label
|
107
|
+
tag = label.get(:category).to_s
|
108
|
+
tag_s, tag_opt = *tag.split('-')
|
109
|
+
tag_s ||= ''
|
110
|
+
|
111
|
+
if Treat::Languages::Tags::PhraseTagToCategory[tag_s]
|
112
|
+
ruby_child = Treat::Entities::Phrase.new
|
113
|
+
else
|
114
|
+
l = java_child.children[0].to_s
|
115
|
+
v = java_child.children[0].value.to_s.strip
|
116
|
+
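# Use the leaf's value if it equals the leaf's string form; otherwise take the last space-separated piece of the string form, minus any ')'.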
|
117
|
+
val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
|
118
|
+
ruby_child = Treat::Entities::Token.from_string(val)
|
119
|
+
end
|
120
|
+
|
56
121
|
ruby_child.set :tag_set, :penn
|
122
|
+
ruby_child.set :tag, tag_s
|
123
|
+
ruby_child.set :tag_opt, tag_opt if tag_opt
|
57
124
|
ruby_node << ruby_child
|
125
|
+
|
58
126
|
unless java_child.children.empty?
|
59
|
-
recurse(java_child, ruby_child)
|
127
|
+
recurse(java_child, ruby_child, additional_tags)
|
60
128
|
end
|
61
129
|
end
|
62
130
|
end
|
@@ -64,4 +132,4 @@ module Treat
|
|
64
132
|
end
|
65
133
|
end
|
66
134
|
end
|
67
|
-
end
|
135
|
+
end
|