enju_accessor 0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/enju_parse_text +9 -0
- data/bin/enju_tag_text +9 -0
- data/lib/enju_accessor/enju_accessor.rb +184 -0
- data/lib/enju_accessor.rb +1 -0
- metadata +51 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4d0b1471223e18688be92aa3b34ed6fb36d7522a
|
4
|
+
data.tar.gz: be48f7ad2f104c76c82ff130241a21d53dd24a53
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d273b60be7d369ecd5fa41cea9f24f1c622c5907c3fba338f110d95166cd65f0002c0b1e94fab65bc02e4650c4d383e4f211b076194ae3a1004a152a8f143f2a
|
7
|
+
data.tar.gz: ae09b24bfe748f1bd46a55759d6f736ce50666fc2dd6d83c917877bc66266cfcdd414c589db9f9b1eaeabbfb4dcea370e2bcebdd5d511d751d822525de100a4f
|
data/bin/enju_parse_text
ADDED
data/bin/enju_tag_text
ADDED
@@ -0,0 +1,184 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rest-client'
|
3
|
+
require 'text_sentencer'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
# An instance of this class holds the parsing result of a natural language query as analyzed by Enju.
|
7
|
+
class EnjuAccessor
  # Wraps an Enju CGI endpoint and converts its stand-off ("so") output into
  # hashes of denotations and relations (the gem describes the target as the
  # PubAnnotation JSON format).

  # POS tags for which the token's base form is used as the tag instead.
  # The last entry is the two-apostrophe closing-quote tag; it must be written
  # as "''" because the literal '''' parses as two adjacent empty strings.
  PUNCTUATION_POS = [',', '.', ':', '(', ')', '``', "''"].freeze

  # @param enju_cgi_url [String] URL of the Enju CGI service
  def initialize(enju_cgi_url)
    @enju_cgi = RestClient::Resource.new(enju_cgi_url)
    @sentencer = TextSentencer.new
    # Running id offsets used by parse_sentence ('T'/'R' ids) ...
    @tid_base, @rid_base = 0, 0
    # ... and by tag_sentence ('P'/'B' ids).
    @id_base = 0
  end

  # Sends one sentence to the Enju CGI service and reads the response.
  #
  # @param sentence [String] the sentence to parse
  # @return [Array(Hash, Hash)] the [toks, cons] pair produced by read_parse
  # @raise [IOError] if the request fails or the response code is not 200
  # @raise [RuntimeError] if the server answers with an "Empty line" message
  def get_parse(sentence)
    begin
      response = @enju_cgi.get :params => {:sentence => sentence, :format => 'so'}
    rescue => e
      raise IOError, "Abnormal behavior of the Enju CGI server: #{e.message}."
    end

    # 200 means success; anything else is reported as an abnormal response.
    raise IOError, "Abnormal response from the Enju CGI server." unless response.code == 200
    raise "Empty input." if response =~ /^Empty line/

    r = response.encode("ASCII-8BIT").force_encoding("UTF-8").to_s
    read_parse(sentence, r)
  end

  # Reads a parsing result in SO format: one node per line, written as
  # "begin<TAB>end<TAB>attributes".
  #
  # @param sentence [String] the sentence the offsets refer to
  # @param r [String] the parsing result in SO format
  # @return [Array(Hash, Hash)] toks (token id => token hash with :beg, :end,
  #   :word, :idx, :base, :pos, :cat, :args) and cons (constituent id => hash
  #   with :head, :sem_head, :cat)
  def read_parse(sentence, r)
    toks = {}
    cons = {}

    adjustment = 0
    idx = 0

    r.split(/\r?\n/).each do |line| # for each line of analysis
      b, e, attr_str = line.split(/\t/)
      # A line without the attribute column cannot describe a node; skip it
      # instead of failing on the string concatenation below.
      next if attr_str.nil?

      # The attribute column is parsed by wrapping it into a dummy element.
      node = Nokogiri::HTML.parse('<node ' + attr_str + '>')
      attrs = node.css('node').first.to_h

      if attrs['tok'] == ""
        base = attrs['base']

        # Each non-ASCII character of the base form lowers the adjustment by
        # its extra bytes (bytesize - 1). The begin offset gets the adjustment
        # accumulated before this token, the end offset the one including it.
        # NOTE(review): presumably Enju reports byte offsets - confirm.
        b = b.to_i + adjustment
        base.each_char { |c| adjustment += (1 - c.bytesize) if c !~ /\p{ASCII}/ }
        e = e.to_i + adjustment

        id = attrs['id']
        pos = attrs['pos']
        pos = attrs['base'] if PUNCTUATION_POS.include?(pos)
        # Non-destructive sub: pos may be the same object as base.
        pos = pos.sub('$', '-DOLLAR-')
        pos = '-COLON-' if pos == 'HYPH'

        tok = {beg: b, end: e, word: sentence[b...e], idx: idx, base: base, pos: pos, cat: attrs['cat'], args: {}}
        %w[arg1 arg2 arg3 mod].each do |name|
          tok[:args][name.to_sym] = attrs[name] if attrs[name]
        end
        toks[id] = tok
        idx += 1
      end

      if attrs['cons'] == ""
        cons[attrs['id']] = {head: attrs['head'], sem_head: attrs['sem_head'], cat: attrs['cat']}
      end
    end

    [toks, cons]
  end

  # Parses one sentence into token denotations and argument relations.
  #
  # @param sentence [String] the sentence to parse
  # @param offset_base [Integer] value added to every span offset
  # @param mode [String, nil] 'continue' keeps numbering ids after the
  #   previous call; any other value restarts both counters at 0
  # @return [Hash] {:denotations => [...], :relations => [...]}
  def parse_sentence(sentence, offset_base = 0, mode = '')
    @tid_base, @rid_base = 0, 0 unless mode == 'continue'

    toks, cons = get_parse(sentence)

    denotations = []
    tid_mapping = {}
    idx_last = 0
    toks.each do |id, tok|
      tid = tid_mapping[id] = 'T' + (tok[:idx] + @tid_base).to_s
      denotations << {id: tid, span: {begin: tok[:beg] + offset_base, end: tok[:end] + offset_base}, obj: tok[:pos]}
      idx_last = tok[:idx]
    end

    # Follow the sem_head links of each constituent until a token id
    # (an id starting with 't') is reached, and remember it as :thead.
    cons.each do |_id, con|
      thead = con[:sem_head]
      thead = cons[thead][:sem_head] until thead.start_with?('t')
      con[:thead] = thead
    end

    relations = []
    rid_num = @rid_base
    toks.each do |id, tok|
      tok[:args].each do |type, arg|
        # An argument pointing at a constituent is replaced by its head token.
        arg = cons[arg][:thead] if arg.start_with?('c')
        next if tid_mapping[arg].nil?
        relations << {id: 'R' + rid_num.to_s, subj: tid_mapping[arg], obj: tid_mapping[id], pred: type.to_s.downcase + 'Of'}
        rid_num += 1
      end
    end

    @tid_base = @tid_base + idx_last + 1
    @rid_base = rid_num

    {:denotations => denotations, :relations => relations}
  end

  # Tags one sentence: every token yields a 'P' denotation carrying its POS
  # tag and a 'B' denotation carrying its base form, both over the same span.
  #
  # @param sentence [String] the sentence to tag
  # @param offset_base [Integer] value added to every span offset
  # @param mode [String, nil] 'continue' keeps numbering ids after the
  #   previous call; any other value restarts the counter at 0
  # @return [Hash] {:denotations => [...]}
  def tag_sentence(sentence, offset_base = 0, mode = '')
    @id_base = 0 if @id_base.nil? || mode != 'continue'

    # get_parse returns the tokens rather than storing them in an instance
    # variable, so they are taken from its return value.
    toks, _cons = get_parse(sentence)

    denotations = []
    idx_last = 0
    toks.each_value do |token|
      num = (token[:idx] + @id_base).to_s
      denotations << {id: 'P' + num, span: {begin: token[:beg] + offset_base, end: token[:end] + offset_base}, obj: token[:pos]}
      denotations << {id: 'B' + num, span: {begin: token[:beg] + offset_base, end: token[:end] + offset_base}, obj: token[:base]}
      idx_last = token[:idx]
    end

    @id_base = @id_base + idx_last + 1

    {:denotations => denotations}
  end

  # Segments a text with the sentencer and parses each segment, numbering
  # ids continuously across segments.
  #
  # @param text [String] the text to parse
  # @return [Hash] {:text => text, :denotations => [...], :relations => [...]}
  def parse_text(text)
    segments = @sentencer.segment(text)

    denotations, relations = [], []
    segments.each_with_index do |s, i|
      # Only the first segment restarts the id counters.
      mode = i.zero? ? nil : 'continue'
      annotation = parse_sentence(text[s[0]...s[1]], s[0], mode)
      denotations += annotation[:denotations]
      relations += annotation[:relations]
    end

    {:text => text, :denotations => denotations, :relations => relations}
  end

  # Segments a text with the sentencer and tags each segment, numbering ids
  # continuously across segments.
  #
  # @param text [String] the text to tag
  # @return [Hash] {:text => text, :denotations => [...]}
  def tag_text(text)
    segments = @sentencer.segment(text)

    denotations = []
    segments.each_with_index do |s, i|
      # Only the first segment restarts the id counter.
      mode = i.zero? ? nil : 'continue'
      annotation = tag_sentence(text[s[0]...s[1]], s[0], mode)
      denotations += annotation[:denotations]
    end

    {:text => text, :denotations => denotations}
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'enju_accessor/enju_accessor'
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: enju_accessor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.9'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jin-Dong Kim
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-10-13 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A wrapper for Enju CGI service to convert the output to the PubAnnotation
|
14
|
+
JSON format.
|
15
|
+
email: jindong.kim@gmail.com
|
16
|
+
executables:
|
17
|
+
- enju_parse_text
|
18
|
+
- enju_tag_text
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- bin/enju_parse_text
|
23
|
+
- bin/enju_tag_text
|
24
|
+
- lib/enju_accessor.rb
|
25
|
+
- lib/enju_accessor/enju_accessor.rb
|
26
|
+
homepage: https://github.com/jdkim/enju_accessor
|
27
|
+
licenses:
|
28
|
+
- MIT
|
29
|
+
metadata: {}
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 2.6.11
|
47
|
+
signing_key:
|
48
|
+
specification_version: 4
|
49
|
+
summary: A wrapper for Enju CGI service to convert the output to the PubAnnotation
|
50
|
+
JSON format.
|
51
|
+
test_files: []
|