enju_accessor 0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/enju_parse_text +9 -0
- data/bin/enju_tag_text +9 -0
- data/lib/enju_accessor/enju_accessor.rb +184 -0
- data/lib/enju_accessor.rb +1 -0
- metadata +51 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4d0b1471223e18688be92aa3b34ed6fb36d7522a
|
4
|
+
data.tar.gz: be48f7ad2f104c76c82ff130241a21d53dd24a53
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d273b60be7d369ecd5fa41cea9f24f1c622c5907c3fba338f110d95166cd65f0002c0b1e94fab65bc02e4650c4d383e4f211b076194ae3a1004a152a8f143f2a
|
7
|
+
data.tar.gz: ae09b24bfe748f1bd46a55759d6f736ce50666fc2dd6d83c917877bc66266cfcdd414c589db9f9b1eaeabbfb4dcea370e2bcebdd5d511d751d822525de100a4f
|
data/bin/enju_parse_text
ADDED
data/bin/enju_tag_text
ADDED
@@ -0,0 +1,184 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rest-client'
|
3
|
+
require 'text_sentencer'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
# An instance of this class holds the parsing result of a natural language query as analyzed by Enju.
|
7
|
+
class EnjuAccessor
  # POS tags for which the token's base form is reported instead of the tag
  # itself. The last entry is the closing-quote tag (two apostrophes); it is
  # written with double quotes because '''' would be parsed by Ruby as two
  # adjacent empty string literals, i.e. "".
  PUNCTUATION_POS = [',', '.', ':', '(', ')', '``', "''"].freeze

  # Attribute names of a token node that are copied into the token's :args.
  ARGUMENT_KEYS = %w[arg1 arg2 arg3 mod].freeze

  # @param enju_cgi_url [String] URL of the Enju CGI service.
  def initialize(enju_cgi_url)
    @enju_cgi = RestClient::Resource.new(enju_cgi_url)
    @sentencer = TextSentencer.new
    @tid_base = 0
    @rid_base = 0
    # Initialised here so that tag_sentence can be called in 'continue' mode
    # without a preceding non-continue call.
    @id_base = 0
  end

  # Sends one sentence to the Enju CGI service and reads the response.
  #
  # @param sentence [String] the sentence to parse.
  # @return [Array(Hash, Hash)] the pair [toks, cons] built by #read_parse.
  # @raise [IOError] if the request fails or the response code is not 200.
  # @raise [RuntimeError] if the service reports an empty line.
  def get_parse(sentence)
    begin
      response = @enju_cgi.get params: { sentence: sentence, format: 'so' }
    rescue => e
      raise IOError, "Abnormal behavior of the Enju CGI server: #{e.message}."
    end

    raise IOError, "Abnormal response from the Enju CGI server." unless response.code == 200
    raise "Empty input." if response =~ /^Empty line/

    read_parse(sentence, response.encode("ASCII-8BIT").force_encoding("UTF-8").to_s)
  end

  # Converts a parsing result in SO format into token and constituent tables.
  #
  # Each line is expected to hold three tab-separated fields: begin offset,
  # end offset and an attribute string. Lines without an attribute string
  # (e.g. blank lines) are skipped.
  #
  # @param sentence [String] the sentence that was parsed.
  # @param r [String] the parsing result in SO format.
  # @return [Array(Hash, Hash)] [toks, cons], both keyed by the node id.
  #   A token is {beg:, end:, word:, idx:, base:, pos:, cat:, args:};
  #   a constituent is {head:, sem_head:, cat:}.
  def read_parse(sentence, r)
    toks = {}
    cons = {}
    adjustment = 0
    idx = 0

    r.split(/\r?\n/).each do |line| # for each line of analysis
      b, e, attr_str = line.split(/\t/)
      next if attr_str.nil?

      attrs = parse_node_attributes(attr_str)

      if attrs['tok'] == ""
        base = attrs['base']

        # Shift the reported offsets by the accumulated difference between
        # character count and byte count. ASCII characters contribute zero,
        # so only multi-byte characters of the base form change the shift.
        # NOTE(review): presumably the service reports byte offsets - confirm.
        beg_pos = b.to_i + adjustment
        adjustment += base.to_s.length - base.to_s.bytesize
        end_pos = e.to_i + adjustment

        id = attrs['id']
        toks[id] = {
          beg: beg_pos, end: end_pos, word: sentence[beg_pos...end_pos], idx: idx,
          base: base, pos: normalize_pos(attrs['pos'], base), cat: attrs['cat'], args: {}
        }
        ARGUMENT_KEYS.each do |key|
          toks[id][:args][key.to_sym] = attrs[key] if attrs[key]
        end
        idx += 1
      end

      if attrs['cons'] == ""
        cons[attrs['id']] = { head: attrs['head'], sem_head: attrs['sem_head'], cat: attrs['cat'] }
      end
    end

    [toks, cons]
  end

  # Parses a sentence and converts the result to denotations and relations.
  #
  # @param sentence [String] the sentence to parse.
  # @param offset_base [Integer] value added to every span offset.
  # @param mode [String, nil] 'continue' keeps numbering T/R ids from the
  #   previous call; any other value restarts both counters at 0.
  # @return [Hash] {denotations: [...], relations: [...]}.
  def parse_sentence(sentence, offset_base = 0, mode = '')
    @tid_base, @rid_base = 0, 0 unless mode == 'continue'

    toks, cons = get_parse(sentence)

    denotations = []
    tid_mapping = {}
    idx_last = 0
    toks.each do |id, tok|
      tid = tid_mapping[id] = "T#{tok[:idx] + @tid_base}"
      denotations << { id: tid, span: { begin: tok[:beg] + offset_base, end: tok[:end] + offset_base }, obj: tok[:pos] }
      idx_last = tok[:idx]
    end

    relations = []
    rid_num = @rid_base
    toks.each do |id, tok|
      tok[:args].each do |type, arg|
        # An argument pointing at a constituent is replaced by the token
        # reached through the constituent's chain of semantic heads.
        arg = semantic_head_token(cons, arg) if arg.start_with?('c')
        subj = tid_mapping[arg]
        next if subj.nil? # argument does not resolve to a known token

        relations << { id: "R#{rid_num}", subj: subj, obj: tid_mapping[id], pred: "#{type.to_s.downcase}Of" }
        rid_num += 1
      end
    end

    @tid_base = @tid_base + idx_last + 1
    @rid_base = rid_num

    { denotations: denotations, relations: relations }
  end

  # Parses a sentence and emits, for each token, one denotation carrying its
  # POS tag (id prefix 'P') and one carrying its base form (id prefix 'B').
  #
  # @param sentence [String] the sentence to tag.
  # @param offset_base [Integer] value added to every span offset.
  # @param mode [String, nil] 'continue' keeps numbering ids from the
  #   previous call; any other value restarts the counter at 0.
  # @return [Hash] {denotations: [...]}.
  def tag_sentence(sentence, offset_base = 0, mode = '')
    @id_base = 0 unless mode == 'continue'

    # Use the tokens returned by get_parse; no instance variable holds them.
    toks, _cons = get_parse(sentence)

    denotations = []
    idx_last = 0
    toks.each_value do |token|
      num = (token[:idx] + @id_base).to_s
      span = { begin: token[:beg] + offset_base, end: token[:end] + offset_base }
      denotations << { id: 'P' + num, span: span, obj: token[:pos] }
      denotations << { id: 'B' + num, span: span.dup, obj: token[:base] }
      idx_last = token[:idx]
    end

    @id_base = @id_base + idx_last + 1

    { denotations: denotations }
  end

  # Splits a text into segments with the sentencer and parses each one,
  # numbering ids continuously across segments.
  #
  # @param text [String] the text to parse.
  # @return [Hash] {text:, denotations:, relations:}.
  def parse_text(text)
    denotations = []
    relations = []

    @sentencer.segment(text).each_with_index do |s, i|
      mode = i.zero? ? nil : 'continue'
      annotation = parse_sentence(text[s[0]...s[1]], s[0], mode)
      denotations += annotation[:denotations]
      relations += annotation[:relations]
    end

    { text: text, denotations: denotations, relations: relations }
  end

  # Splits a text into segments with the sentencer and tags each one,
  # numbering ids continuously across segments.
  #
  # @param text [String] the text to tag.
  # @return [Hash] {text:, denotations:}.
  def tag_text(text)
    denotations = []

    @sentencer.segment(text).each_with_index do |s, i|
      mode = i.zero? ? nil : 'continue'
      denotations += tag_sentence(text[s[0]...s[1]], s[0], mode)[:denotations]
    end

    { text: text, denotations: denotations }
  end

  private

  # Turns the attribute string of one SO line into a Hash by parsing it as
  # the attributes of a synthetic <node> element.
  def parse_node_attributes(attr_str)
    Nokogiri::HTML.parse('<node ' + attr_str + '>').css('node').first.to_h
  end

  # Maps a raw POS tag to the label stored on the token, without mutating
  # either argument (pos and base may be the same String object).
  def normalize_pos(pos, base)
    pos = base if PUNCTUATION_POS.include?(pos)
    pos = pos.sub('$', '-DOLLAR-') if pos
    pos == 'HYPH' ? '-COLON-' : pos
  end

  # Follows :sem_head links from node_id until an id starting with 't' is
  # reached. Returns nil when the chain is broken or does not terminate
  # within cons.size hops (which can only happen on a cycle).
  def semantic_head_token(cons, node_id)
    head = node_id
    (cons.size + 1).times do
      return head if head.nil? || head.start_with?('t')

      con = cons[head]
      head = con && con[:sem_head]
    end
    nil
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'enju_accessor/enju_accessor'
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: enju_accessor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.9'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jin-Dong Kim
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-10-13 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A wrapper for Enju CGI service to convert the output to the PubAnnotation
|
14
|
+
JSON format.
|
15
|
+
email: jindong.kim@gmail.com
|
16
|
+
executables:
|
17
|
+
- enju_parse_text
|
18
|
+
- enju_tag_text
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- bin/enju_parse_text
|
23
|
+
- bin/enju_tag_text
|
24
|
+
- lib/enju_accessor.rb
|
25
|
+
- lib/enju_accessor/enju_accessor.rb
|
26
|
+
homepage: https://github.com/jdkim/enju_accessor
|
27
|
+
licenses:
|
28
|
+
- MIT
|
29
|
+
metadata: {}
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 2.6.11
|
47
|
+
signing_key:
|
48
|
+
specification_version: 4
|
49
|
+
summary: A wrapper for Enju CGI service to convert the output to the PubAnnotation
|
50
|
+
JSON format.
|
51
|
+
test_files: []
|