opener-chained-daemon 3.1.5 → 3.3.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/opener/chained_daemon.rb +3 -3
- data/lib/opener/chained_daemon/chained_daemon.rb +12 -11
- data/lib/opener/chained_daemon/cli.rb +10 -7
- data/lib/opener/chained_daemon/languages_cache.rb +3 -3
- data/lib/opener/chained_daemon/version.rb +1 -1
- data/lib/opener/kaf/document.rb +15 -13
- data/lib/opener/kaf/text.rb +2 -0
- data/lib/opener/stanza/{tokenizer_pos.rb → processor.rb} +29 -28
- data/lib/opener/sym_mash.rb +14 -0
- data/opener-chained-daemon.gemspec +2 -2
- metadata +9 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5bdd841299458126e66df2dff1857ea3c2d2386efab9f092def36357935e2f9d
|
4
|
+
data.tar.gz: 140ff88d7ddd90a90bc21106bc9a0d3370802969bca20820ee04b45398e975c3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7026baaf0c0541725cb3459575bec129354bada59be1aa30174425051d26ff973f4a0359985cb58b1ad70035ba6c6c9db4409b29631cf9b1d8fbe16152a59268
|
7
|
+
data.tar.gz: d947f1bc75e048b56d18b5406ed455a365c406fc1b01bf6e4b6ee0ae4584f2513e45589f93d5e1486c47fe34e1305d4a0105612ec7b17dc5d75e67c0658e0d0a
|
@@ -8,14 +8,14 @@ require 'rexml/formatters/pretty'
|
|
8
8
|
|
9
9
|
require 'opener/daemons'
|
10
10
|
|
11
|
+
require_relative 'sym_mash'
|
12
|
+
|
11
13
|
require_relative 'chained_daemon/languages_cache'
|
12
14
|
require 'opener/language_identifier'
|
13
|
-
require 'opener/tokenizer'
|
14
|
-
require 'opener/pos_tagger' if RUBY_ENGINE == 'jruby'
|
15
15
|
require 'opener/polarity_tagger'
|
16
16
|
require 'opener/property_tagger'
|
17
17
|
require 'opener/opinion_detector_basic'
|
18
|
-
require 'opener/stanza/tokenizer_pos'
|
18
|
+
require 'opener/stanza/processor'
|
19
19
|
|
20
20
|
require_relative 'chained_daemon/chained_daemon'
|
21
21
|
require_relative 'chained_daemon/cli'
|
@@ -8,20 +8,21 @@ module Opener
|
|
8
8
|
@options = DEFAULT_OPTIONS.merge options
|
9
9
|
@queue_map = {
|
10
10
|
'opener-language-identifier': Opener::LanguageIdentifier.new,
|
11
|
-
'stanza-
|
12
|
-
#'opener-tokenizer': Opener::Tokenizer.new,
|
13
|
-
#'opener-pos-tagger': Opener::POSTagger.new,
|
11
|
+
'stanza-processor': Stanza::Processor.new,
|
14
12
|
'opener-property-tagger': Opener::PropertyTagger.new,
|
15
13
|
'opener-polarity-tagger': Opener::PolarityTagger.new,
|
16
14
|
'opener-opinion-detector-basic': Opener::OpinionDetectorBasic.new,
|
17
15
|
}
|
18
16
|
end
|
19
17
|
|
20
|
-
def run input,
|
21
|
-
params
|
22
|
-
params.
|
23
|
-
params
|
24
|
-
params
|
18
|
+
def run input, _params = {}
|
19
|
+
params = SymMash.new _params
|
20
|
+
params.translate_languages ||= []
|
21
|
+
params.cache_keys = SymMash.new params.cache_keys&.to_h&.sort&.to_h || {}
|
22
|
+
if params.filter_vertical and params.property_type.present?
|
23
|
+
params.cache_keys.property_type = params.property_type
|
24
|
+
end
|
25
|
+
params.cache_keys.environment ||= 'production'
|
25
26
|
|
26
27
|
lang = nil
|
27
28
|
output = nil
|
@@ -34,7 +35,7 @@ module Opener
|
|
34
35
|
rescue Core::UnsupportedLanguageError
|
35
36
|
xml = Nokogiri.parse input
|
36
37
|
lang = xml.root.attr('xml:lang')
|
37
|
-
raise unless lang.in? params
|
38
|
+
raise unless lang.in? params.translate_languages
|
38
39
|
|
39
40
|
input = translate xml, params
|
40
41
|
retry
|
@@ -47,7 +48,7 @@ module Opener
|
|
47
48
|
output = xml.to_s
|
48
49
|
end
|
49
50
|
|
50
|
-
output = pretty_print output if params
|
51
|
+
output = pretty_print output if params.cache_keys.environment == 'staging'
|
51
52
|
output
|
52
53
|
|
53
54
|
rescue Core::UnsupportedLanguageError
|
@@ -84,7 +85,7 @@ module Opener
|
|
84
85
|
protected
|
85
86
|
|
86
87
|
def translate_service params
|
87
|
-
params
|
88
|
+
params.translate_service&.to_sym || :google
|
88
89
|
end
|
89
90
|
|
90
91
|
def google_translator
|
@@ -47,16 +47,19 @@ Example:
|
|
47
47
|
run do |opts, args|
|
48
48
|
daemon = ChainedDaemon.new args: args
|
49
49
|
input = STDIN.tty? ? nil : STDIN.read
|
50
|
+
params = if ENV['PARAMS'] then JSON.parse ENV['PARAMS'] else {} end
|
50
51
|
|
51
|
-
params = if ENV['PARAMS']
|
52
|
-
JSON.parse ENV['PARAMS']
|
53
|
-
else
|
54
|
-
{}
|
55
|
-
end
|
56
52
|
# Set environment as staging from console for testing purposes
|
57
|
-
|
53
|
+
env = ENV['LEXICONS_ENV'] || 'staging'
|
54
|
+
pt = ENV['LEXICONS_PROPERTY_TYPE']
|
55
|
+
params[:cache_keys] = {
|
56
|
+
environment: env,
|
57
|
+
property_type: pt,
|
58
|
+
merged: (true if env == 'staging'),
|
59
|
+
}
|
58
60
|
|
59
|
-
|
61
|
+
output = daemon.run input, params
|
62
|
+
puts output
|
60
63
|
end
|
61
64
|
end
|
62
65
|
end
|
@@ -19,15 +19,15 @@ module Opener
|
|
19
19
|
break @cache if @last_updated and @last_updated > UPDATE_INTERVAL.ago
|
20
20
|
cache_update
|
21
21
|
end
|
22
|
-
@cache
|
23
22
|
end
|
24
23
|
|
25
24
|
def cache_update
|
26
25
|
puts "loading supported languages from url #{@url}" if ENV['DEBUG']
|
27
26
|
|
28
|
-
languages = JSON.parse http.get(@url).body
|
29
|
-
@cache = languages['data'].map { |l| l['code'] }
|
27
|
+
languages = SymMash.new JSON.parse http.get(@url).body
|
30
28
|
@last_updated = Time.now
|
29
|
+
@cache = languages.data.each.with_object({}){ |l,h| h[l.code] = l }
|
30
|
+
@cache
|
31
31
|
end
|
32
32
|
|
33
33
|
def http
|
data/lib/opener/kaf/document.rb
CHANGED
@@ -48,13 +48,14 @@ module Opener
|
|
48
48
|
|
49
49
|
def add_word_form params
|
50
50
|
text = @document.at('text') || @document.root.add_child('<text/>').first
|
51
|
-
wf = text.add_child("<wf>#{params
|
51
|
+
wf = text.add_child("<wf>#{params.text}</wf>")
|
52
52
|
attrs = {
|
53
|
-
wid: "w#{params
|
54
|
-
sent: params
|
55
|
-
para: params
|
56
|
-
offset: params
|
57
|
-
length: params
|
53
|
+
wid: "w#{params.wid}",
|
54
|
+
sent: params.sid,
|
55
|
+
para: params.para,
|
56
|
+
offset: params.offset,
|
57
|
+
length: params.length,
|
58
|
+
head: params.head,
|
58
59
|
}
|
59
60
|
wf.attr attrs
|
60
61
|
end
|
@@ -63,15 +64,16 @@ module Opener
|
|
63
64
|
text = @document.at('terms') || @document.root.add_child('<terms/>').first
|
64
65
|
term = text.add_child("<term/>")
|
65
66
|
attrs = {
|
66
|
-
tid: "t#{params
|
67
|
-
type: params
|
68
|
-
lemma: params
|
69
|
-
text: params
|
70
|
-
pos: params
|
71
|
-
morphofeat: params
|
67
|
+
tid: "t#{params.tid}",
|
68
|
+
type: params.type,
|
69
|
+
lemma: params.lemma,
|
70
|
+
text: params.text,
|
71
|
+
pos: params.pos,
|
72
|
+
morphofeat: params.morphofeat,
|
73
|
+
head: params.head,
|
72
74
|
}
|
73
75
|
term.attr attrs
|
74
|
-
term.first.add_child("<span><target id='w#{params
|
76
|
+
term.first.add_child("<span><target id='w#{params.wid}'/></span>")
|
75
77
|
end
|
76
78
|
|
77
79
|
def to_xml
|
data/lib/opener/kaf/text.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module Opener
|
2
2
|
module KAF
|
3
3
|
class WordForm
|
4
|
+
|
4
5
|
def initialize(document, xml_node)
|
5
6
|
@document = document
|
6
7
|
@xml_node = xml_node
|
@@ -25,6 +26,7 @@ module Opener
|
|
25
26
|
def paragraph
|
26
27
|
return @paragraph ||= @xml_node.attr('para').to_i
|
27
28
|
end
|
29
|
+
|
28
30
|
end
|
29
31
|
end
|
30
32
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Opener
|
2
2
|
module Stanza
|
3
|
-
class
|
3
|
+
class Processor
|
4
4
|
|
5
5
|
DESC = 'Tokenizer / POS by Stanza'
|
6
6
|
VERSION = '1.0'
|
@@ -8,8 +8,10 @@ module Opener
|
|
8
8
|
BASE_URL = ENV['STANZA_SERVER']
|
9
9
|
LANGUAGES_CACHE = Opener::ChainedDaemon::LanguagesCache.new
|
10
10
|
|
11
|
-
RTL_LANGUAGES = [
|
12
|
-
|
11
|
+
RTL_LANGUAGES = %w[
|
12
|
+
ar ara arc ae ave egy he heb nqo pal phn sam
|
13
|
+
syc syr fa per fas ku kur ur urd
|
14
|
+
]
|
13
15
|
|
14
16
|
POS = {
|
15
17
|
'DET' => 'D',
|
@@ -37,66 +39,65 @@ module Opener
|
|
37
39
|
raise 'missing Stanza server' if ENV['STANZA_SERVER'].blank?
|
38
40
|
|
39
41
|
kaf = KAF::Document.from_xml input
|
40
|
-
|
41
|
-
|
42
|
-
|
42
|
+
lang = LANGUAGES_CACHE.get[kaf.language]
|
43
|
+
env = params.cache_keys.environment
|
44
|
+
unless lang&.environments&.include? env or (env == 'staging' and lang&.environments&.include? 'production')
|
43
45
|
raise Core::UnsupportedLanguageError.new kaf.language
|
44
46
|
end
|
45
47
|
|
46
|
-
input
|
47
|
-
input
|
48
|
-
response
|
48
|
+
input = kaf.raw
|
49
|
+
input = input.gsub(/\,[^\ ]/, ', ')
|
50
|
+
response = Faraday.post BASE_URL, {lang: kaf.language, input: input}.to_query
|
49
51
|
raise Core::UnsupportedLanguageError, kaf.language if response.status == 406
|
50
52
|
raise response.body if response.status >= 400
|
51
|
-
|
53
|
+
sentences = JSON.parse response.body
|
54
|
+
sentences.each{ |s| s.map!{ |t| Hashie::Mash.new t } }
|
52
55
|
|
53
56
|
w_index = 0
|
54
57
|
|
55
58
|
miscs = {}
|
56
|
-
|
57
|
-
miscs[i]
|
58
|
-
|
59
|
-
word
|
60
|
-
puts id
|
61
|
-
miscs[i][id] = word['misc']
|
62
|
-
end
|
59
|
+
sentences.each.with_index do |s, i|
|
60
|
+
miscs[i] = {}
|
61
|
+
s.each do |word|
|
62
|
+
word.id.is_a?(Array) && word.id.each{ |id| miscs[i][id] = word.misc }
|
63
63
|
end
|
64
64
|
end
|
65
65
|
|
66
|
-
|
67
|
-
|
68
|
-
|
66
|
+
sentences.map{ |s| s.reverse! } if RTL_LANGUAGES.include? kaf.language
|
67
|
+
sentences.each.with_index do |s, s_index|
|
68
|
+
s.each do |word|
|
69
69
|
w_index += 1
|
70
70
|
# save misc for later usage in a MWT case
|
71
|
-
next if word
|
71
|
+
next if word.id.is_a? Array
|
72
72
|
|
73
|
-
misc = word
|
73
|
+
misc = word.misc || miscs[s_index][word.id]
|
74
74
|
|
75
|
-
Rollbar.scoped({ input: input, params: params,
|
75
|
+
Rollbar.scoped({ input: input, params: params, sentences: sentences, word: word }) do
|
76
76
|
raise 'Missing misc'
|
77
77
|
end if misc.nil?
|
78
78
|
|
79
79
|
offset = misc.match(/start_char=(\d+)|/)[1].to_i
|
80
80
|
length = misc.match(/end_char=(\d+)/)[1].to_i - offset
|
81
81
|
|
82
|
-
u_pos = word
|
82
|
+
u_pos = word.upos
|
83
83
|
pos = POS[u_pos]
|
84
84
|
raise "Didn't find a map for #{u_pos}" if pos.nil?
|
85
85
|
type = if POS_OPEN.include? pos then 'open' else 'close' end
|
86
86
|
|
87
|
-
params =
|
87
|
+
params = Hashie::Mash.new(
|
88
88
|
wid: w_index,
|
89
89
|
sid: s_index + 1,
|
90
90
|
tid: w_index,
|
91
91
|
para: 1,
|
92
92
|
offset: offset,
|
93
93
|
length: length,
|
94
|
-
text: word
|
95
|
-
lemma: word
|
94
|
+
text: word.text,
|
95
|
+
lemma: word.lemma,
|
96
96
|
morphofeat: u_pos,
|
97
97
|
pos: pos,
|
98
98
|
type: type,
|
99
|
-
|
99
|
+
head: word.head,
|
100
|
+
)
|
100
101
|
|
101
102
|
kaf.add_word_form params
|
102
103
|
kaf.add_term params
|
@@ -35,8 +35,8 @@ Gem::Specification.new do |spec|
|
|
35
35
|
spec.add_dependency 'opener-language-identifier', '>= 4.4.0'
|
36
36
|
spec.add_dependency 'opener-tokenizer', '>= 2.2.0'
|
37
37
|
spec.add_dependency 'opener-pos-tagger', '>= 3.2.0'
|
38
|
-
spec.add_dependency 'opener-property-tagger', '>= 3.
|
39
|
-
spec.add_dependency 'opener-polarity-tagger', '>= 3.
|
38
|
+
spec.add_dependency 'opener-property-tagger', '>= 3.4.0'
|
39
|
+
spec.add_dependency 'opener-polarity-tagger', '>= 3.4.0'
|
40
40
|
spec.add_dependency 'opener-opinion-detector-basic', '>= 3.2.3'
|
41
41
|
|
42
42
|
spec.add_development_dependency 'bundler', '~> 1.3'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-chained-daemon
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.5
|
4
|
+
version: 3.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -184,28 +184,28 @@ dependencies:
|
|
184
184
|
requirements:
|
185
185
|
- - ">="
|
186
186
|
- !ruby/object:Gem::Version
|
187
|
-
version: 3.
|
187
|
+
version: 3.4.0
|
188
188
|
type: :runtime
|
189
189
|
prerelease: false
|
190
190
|
version_requirements: !ruby/object:Gem::Requirement
|
191
191
|
requirements:
|
192
192
|
- - ">="
|
193
193
|
- !ruby/object:Gem::Version
|
194
|
-
version: 3.
|
194
|
+
version: 3.4.0
|
195
195
|
- !ruby/object:Gem::Dependency
|
196
196
|
name: opener-polarity-tagger
|
197
197
|
requirement: !ruby/object:Gem::Requirement
|
198
198
|
requirements:
|
199
199
|
- - ">="
|
200
200
|
- !ruby/object:Gem::Version
|
201
|
-
version: 3.
|
201
|
+
version: 3.4.0
|
202
202
|
type: :runtime
|
203
203
|
prerelease: false
|
204
204
|
version_requirements: !ruby/object:Gem::Requirement
|
205
205
|
requirements:
|
206
206
|
- - ">="
|
207
207
|
- !ruby/object:Gem::Version
|
208
|
-
version: 3.
|
208
|
+
version: 3.4.0
|
209
209
|
- !ruby/object:Gem::Dependency
|
210
210
|
name: opener-opinion-detector-basic
|
211
211
|
requirement: !ruby/object:Gem::Requirement
|
@@ -307,10 +307,10 @@ dependencies:
|
|
307
307
|
description: OpeNER daemon for processing multiple queues at once
|
308
308
|
email:
|
309
309
|
executables:
|
310
|
-
- chained-daemon
|
311
310
|
- chained-daemon-daemon
|
312
311
|
- chained-daemon-csv
|
313
312
|
- console
|
313
|
+
- chained-daemon
|
314
314
|
extensions: []
|
315
315
|
extra_rdoc_files: []
|
316
316
|
files:
|
@@ -330,7 +330,8 @@ files:
|
|
330
330
|
- lib/opener/kaf/document.rb
|
331
331
|
- lib/opener/kaf/term.rb
|
332
332
|
- lib/opener/kaf/text.rb
|
333
|
-
- lib/opener/stanza/tokenizer_pos.rb
|
333
|
+
- lib/opener/stanza/processor.rb
|
334
|
+
- lib/opener/sym_mash.rb
|
334
335
|
- opener-chained-daemon.gemspec
|
335
336
|
homepage:
|
336
337
|
licenses:
|