opener-chained-daemon 3.3.1 → 3.3.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/opener/chained_daemon.rb +1 -3
- data/lib/opener/chained_daemon/chained_daemon.rb +9 -4
- data/lib/opener/chained_daemon/cli.rb +10 -2
- data/lib/opener/chained_daemon/languages_cache.rb +3 -13
- data/lib/opener/chained_daemon/version.rb +1 -1
- data/lib/opener/kaf/document.rb +15 -13
- data/lib/opener/kaf/text.rb +2 -0
- data/lib/opener/stanza/{tokenizer_pos.rb → processor.rb} +31 -24
- data/opener-chained-daemon.gemspec +0 -1
- metadata +3 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: adaeee08b8374c1047c08bed61dc5159ff18178d6f12ccac8715aea05a27019b
|
4
|
+
data.tar.gz: 78e762ec93e9aa6fd19a617690daa19676753d0249727e56b88f299ca60ae2e9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fc2f6223c49f7b4aa78bb4dc1a82f569443c579e3888339067b9faef568c45cfe43854829c0e6256b091fa4b2c6ca2509f8679659f09e66bb5f2614cd521bb91
|
7
|
+
data.tar.gz: cc60b5c3d37a621ebd41608751c2f569aa26af3e36fc1f5a5134168b66bcaf7efb6c0868110e5d69b1a9ae8a50a4b53fb08a3015806fcbf4452eefb19f1f68bb
|
@@ -12,12 +12,10 @@ require_relative 'sym_mash'
|
|
12
12
|
|
13
13
|
require_relative 'chained_daemon/languages_cache'
|
14
14
|
require 'opener/language_identifier'
|
15
|
-
require 'opener/tokenizer'
|
16
|
-
require 'opener/pos_tagger' if RUBY_ENGINE == 'jruby'
|
17
15
|
require 'opener/polarity_tagger'
|
18
16
|
require 'opener/property_tagger'
|
19
17
|
require 'opener/opinion_detector_basic'
|
20
|
-
require 'opener/stanza/
|
18
|
+
require 'opener/stanza/processor'
|
21
19
|
|
22
20
|
require_relative 'chained_daemon/chained_daemon'
|
23
21
|
require_relative 'chained_daemon/cli'
|
@@ -1,6 +1,12 @@
|
|
1
1
|
module Opener
|
2
2
|
class ChainedDaemon
|
3
3
|
|
4
|
+
class_attribute :http
|
5
|
+
self.http = HTTPClient.new
|
6
|
+
self.http.send_timeout = 600
|
7
|
+
self.http.receive_timeout = 600
|
8
|
+
self.http.connect_timeout = 600
|
9
|
+
|
4
10
|
DEFAULT_OPTIONS = {
|
5
11
|
}
|
6
12
|
|
@@ -8,9 +14,7 @@ module Opener
|
|
8
14
|
@options = DEFAULT_OPTIONS.merge options
|
9
15
|
@queue_map = {
|
10
16
|
'opener-language-identifier': Opener::LanguageIdentifier.new,
|
11
|
-
'stanza-
|
12
|
-
#'opener-tokenizer': Opener::Tokenizer.new,
|
13
|
-
#'opener-pos-tagger': Opener::POSTagger.new,
|
17
|
+
'stanza-processor': Stanza::Processor.new,
|
14
18
|
'opener-property-tagger': Opener::PropertyTagger.new,
|
15
19
|
'opener-polarity-tagger': Opener::PolarityTagger.new,
|
16
20
|
'opener-opinion-detector-basic': Opener::OpinionDetectorBasic.new,
|
@@ -24,6 +28,7 @@ module Opener
|
|
24
28
|
if params.filter_vertical and params.property_type.present?
|
25
29
|
params.cache_keys.property_type = params.property_type
|
26
30
|
end
|
31
|
+
params.cache_keys.environment ||= 'production'
|
27
32
|
|
28
33
|
lang = nil
|
29
34
|
output = nil
|
@@ -49,7 +54,7 @@ module Opener
|
|
49
54
|
output = xml.to_s
|
50
55
|
end
|
51
56
|
|
52
|
-
output = pretty_print output if params.cache_keys
|
57
|
+
output = pretty_print output if params.cache_keys.environment == 'staging'
|
53
58
|
output
|
54
59
|
|
55
60
|
rescue Core::UnsupportedLanguageError
|
@@ -48,10 +48,18 @@ Example:
|
|
48
48
|
daemon = ChainedDaemon.new args: args
|
49
49
|
input = STDIN.tty? ? nil : STDIN.read
|
50
50
|
params = if ENV['PARAMS'] then JSON.parse ENV['PARAMS'] else {} end
|
51
|
+
|
51
52
|
# Set environment as staging from console for testing purposes
|
52
|
-
|
53
|
+
env = ENV['LEXICONS_ENV'] || 'staging'
|
54
|
+
pt = ENV['LEXICONS_PROPERTY_TYPE']
|
55
|
+
params[:cache_keys] = {
|
56
|
+
environment: env,
|
57
|
+
property_type: pt,
|
58
|
+
merged: (true if env == 'staging'),
|
59
|
+
}
|
53
60
|
|
54
|
-
|
61
|
+
output = daemon.run input, params
|
62
|
+
puts output
|
55
63
|
end
|
56
64
|
end
|
57
65
|
end
|
@@ -19,25 +19,15 @@ module Opener
|
|
19
19
|
break @cache if @last_updated and @last_updated > UPDATE_INTERVAL.ago
|
20
20
|
cache_update
|
21
21
|
end
|
22
|
-
@cache
|
23
22
|
end
|
24
23
|
|
25
24
|
def cache_update
|
26
25
|
puts "loading supported languages from url #{@url}" if ENV['DEBUG']
|
27
26
|
|
28
|
-
languages = JSON.parse http.get(@url).body
|
29
|
-
@cache = languages['data'].map { |l| l['code'] }
|
27
|
+
languages = SymMash.new JSON.parse ChainedDaemon.http.get(@url).body
|
30
28
|
@last_updated = Time.now
|
31
|
-
|
32
|
-
|
33
|
-
def http
|
34
|
-
return @http if @http
|
35
|
-
|
36
|
-
@http = HTTPClient.new
|
37
|
-
@http.send_timeout = 120
|
38
|
-
@http.receive_timeout = 120
|
39
|
-
@http.connect_timeout = 120
|
40
|
-
@http
|
29
|
+
@cache = languages.data.each.with_object({}){ |l,h| h[l.code] = l }
|
30
|
+
@cache
|
41
31
|
end
|
42
32
|
|
43
33
|
end
|
data/lib/opener/kaf/document.rb
CHANGED
@@ -48,13 +48,14 @@ module Opener
|
|
48
48
|
|
49
49
|
def add_word_form params
|
50
50
|
text = @document.at('text') || @document.root.add_child('<text/>').first
|
51
|
-
wf = text.add_child("<wf>#{params
|
51
|
+
wf = text.add_child("<wf>#{params.text}</wf>")
|
52
52
|
attrs = {
|
53
|
-
wid: "w#{params
|
54
|
-
sent: params
|
55
|
-
para: params
|
56
|
-
offset: params
|
57
|
-
length: params
|
53
|
+
wid: "w#{params.wid}",
|
54
|
+
sent: params.sid,
|
55
|
+
para: params.para,
|
56
|
+
offset: params.offset,
|
57
|
+
length: params.length,
|
58
|
+
head: params.head,
|
58
59
|
}
|
59
60
|
wf.attr attrs
|
60
61
|
end
|
@@ -63,15 +64,16 @@ module Opener
|
|
63
64
|
text = @document.at('terms') || @document.root.add_child('<terms/>').first
|
64
65
|
term = text.add_child("<term/>")
|
65
66
|
attrs = {
|
66
|
-
tid: "t#{params
|
67
|
-
type: params
|
68
|
-
lemma: params
|
69
|
-
text: params
|
70
|
-
pos: params
|
71
|
-
morphofeat: params
|
67
|
+
tid: "t#{params.tid}",
|
68
|
+
type: params.type,
|
69
|
+
lemma: params.lemma,
|
70
|
+
text: params.text,
|
71
|
+
pos: params.pos,
|
72
|
+
morphofeat: params.morphofeat,
|
73
|
+
head: params.head,
|
72
74
|
}
|
73
75
|
term.attr attrs
|
74
|
-
term.first.add_child("<span><target id='w#{params
|
76
|
+
term.first.add_child("<span><target id='w#{params.wid}'/></span>")
|
75
77
|
end
|
76
78
|
|
77
79
|
def to_xml
|
data/lib/opener/kaf/text.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module Opener
|
2
2
|
module KAF
|
3
3
|
class WordForm
|
4
|
+
|
4
5
|
def initialize(document, xml_node)
|
5
6
|
@document = document
|
6
7
|
@xml_node = xml_node
|
@@ -25,6 +26,7 @@ module Opener
|
|
25
26
|
def paragraph
|
26
27
|
return @paragraph ||= @xml_node.attr('para').to_i
|
27
28
|
end
|
29
|
+
|
28
30
|
end
|
29
31
|
end
|
30
32
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Opener
|
2
2
|
module Stanza
|
3
|
-
class
|
3
|
+
class Processor
|
4
4
|
|
5
5
|
DESC = 'Tokenizer / POS by Stanza'
|
6
6
|
VERSION = '1.0'
|
@@ -8,8 +8,10 @@ module Opener
|
|
8
8
|
BASE_URL = ENV['STANZA_SERVER']
|
9
9
|
LANGUAGES_CACHE = Opener::ChainedDaemon::LanguagesCache.new
|
10
10
|
|
11
|
-
RTL_LANGUAGES = [
|
12
|
-
|
11
|
+
RTL_LANGUAGES = %w[
|
12
|
+
ar ara arc ae ave egy he heb nqo pal phn sam
|
13
|
+
syc syr fa per fas ku kur ur urd
|
14
|
+
]
|
13
15
|
|
14
16
|
POS = {
|
15
17
|
'DET' => 'D',
|
@@ -37,63 +39,68 @@ module Opener
|
|
37
39
|
raise 'missing Stanza server' if ENV['STANZA_SERVER'].blank?
|
38
40
|
|
39
41
|
kaf = KAF::Document.from_xml input
|
40
|
-
|
41
|
-
|
42
|
-
|
42
|
+
lang = LANGUAGES_CACHE.get[kaf.language]
|
43
|
+
env = params.cache_keys.environment
|
44
|
+
unless lang&.environments&.include? env or (params.cache_keys.merged and lang&.environments&.include? 'production')
|
45
|
+
raise Core::UnsupportedLanguageError.new kaf.language
|
46
|
+
end
|
47
|
+
if env == 'production' and !lang.supported_by_opener
|
43
48
|
raise Core::UnsupportedLanguageError.new kaf.language
|
44
49
|
end
|
45
50
|
|
46
|
-
input
|
47
|
-
input
|
48
|
-
response
|
51
|
+
input = kaf.raw
|
52
|
+
input = input.gsub(/\,[^\ ]/, ', ')
|
53
|
+
response = ChainedDaemon.http.post BASE_URL, {lang: kaf.language, input: input}.to_query
|
49
54
|
raise Core::UnsupportedLanguageError, kaf.language if response.status == 406
|
50
55
|
raise response.body if response.status >= 400
|
51
|
-
|
56
|
+
sentences = JSON.parse response.body
|
57
|
+
sentences.each{ |s| s.map!{ |t| Hashie::Mash.new t } }
|
52
58
|
|
53
59
|
w_index = 0
|
54
60
|
|
55
61
|
miscs = {}
|
56
|
-
|
62
|
+
sentences.each.with_index do |s, i|
|
57
63
|
miscs[i] = {}
|
58
|
-
|
59
|
-
word
|
64
|
+
s.each do |word|
|
65
|
+
word.id.is_a?(Array) && word.id.each{ |id| miscs[i][id] = word.misc }
|
60
66
|
end
|
61
67
|
end
|
62
68
|
|
63
|
-
|
64
|
-
|
65
|
-
|
69
|
+
sentences.map{ |s| s.reverse! } if RTL_LANGUAGES.include? kaf.language
|
70
|
+
sentences.each.with_index do |s, s_index|
|
71
|
+
s.each do |word|
|
66
72
|
w_index += 1
|
67
73
|
# save misc for later usage in an MWT case
|
68
|
-
next if word
|
74
|
+
next if word.id.is_a? Array
|
69
75
|
|
70
|
-
misc = word
|
76
|
+
misc = word.misc || miscs[s_index][word.id]
|
71
77
|
|
72
|
-
Rollbar.scoped({ input: input, params: params,
|
78
|
+
Rollbar.scoped({ input: input, params: params, sentences: sentences, word: word }) do
|
73
79
|
raise 'Missing misc'
|
74
80
|
end if misc.nil?
|
75
81
|
|
76
82
|
offset = misc.match(/start_char=(\d+)|/)[1].to_i
|
77
83
|
length = misc.match(/end_char=(\d+)/)[1].to_i - offset
|
78
84
|
|
79
|
-
u_pos = word
|
85
|
+
u_pos = word.upos
|
80
86
|
pos = POS[u_pos]
|
81
87
|
raise "Didn't find a map for #{u_pos}" if pos.nil?
|
82
88
|
type = if POS_OPEN.include? pos then 'open' else 'close' end
|
83
89
|
|
84
|
-
params =
|
90
|
+
params = Hashie::Mash.new(
|
85
91
|
wid: w_index,
|
86
92
|
sid: s_index + 1,
|
87
93
|
tid: w_index,
|
88
94
|
para: 1,
|
89
95
|
offset: offset,
|
90
96
|
length: length,
|
91
|
-
text: word
|
92
|
-
lemma: word
|
97
|
+
text: word.text,
|
98
|
+
lemma: word.lemma,
|
93
99
|
morphofeat: u_pos,
|
94
100
|
pos: pos,
|
95
101
|
type: type,
|
96
|
-
|
102
|
+
head: word.head,
|
103
|
+
)
|
97
104
|
|
98
105
|
kaf.add_word_form params
|
99
106
|
kaf.add_term params
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-chained-daemon
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.3.
|
4
|
+
version: 3.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-02-
|
11
|
+
date: 2021-02-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -94,20 +94,6 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: faraday
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
type: :runtime
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - ">="
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
111
97
|
- !ruby/object:Gem::Dependency
|
112
98
|
name: opener-daemons
|
113
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -330,7 +316,7 @@ files:
|
|
330
316
|
- lib/opener/kaf/document.rb
|
331
317
|
- lib/opener/kaf/term.rb
|
332
318
|
- lib/opener/kaf/text.rb
|
333
|
-
- lib/opener/stanza/
|
319
|
+
- lib/opener/stanza/processor.rb
|
334
320
|
- lib/opener/sym_mash.rb
|
335
321
|
- opener-chained-daemon.gemspec
|
336
322
|
homepage:
|