opener-chained-daemon 3.3.2 → 3.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/opener/chained_daemon.rb +1 -3
- data/lib/opener/chained_daemon/chained_daemon.rb +10 -4
- data/lib/opener/chained_daemon/cli.rb +10 -2
- data/lib/opener/chained_daemon/languages_cache.rb +3 -13
- data/lib/opener/chained_daemon/version.rb +1 -1
- data/lib/opener/kaf/document.rb +15 -13
- data/lib/opener/kaf/text.rb +2 -0
- data/lib/opener/stanza/{tokenizer_pos.rb → processor.rb} +31 -24
- data/opener-chained-daemon.gemspec +0 -1
- metadata +3 -17
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0a513f031e01768f7a9e5231949462a0780a03a7b088ac601aac7f4d90b07db9
|
|
4
|
+
data.tar.gz: efbe7d2ef0812a4ac25389ee0a988e89083b02bd9364f355adfc32ecab65dc67
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7791901ba5ebb2f14e14a2cfc07759b50965731d26a2e12b81ba0cf6531f7dcce6647be81265a0bc40588c56b2c704c8d52e02940da8840e1a621ca8abcbff21
|
|
7
|
+
data.tar.gz: b8b650cec738105a9706bd5499828e853e05f652bb6dc125259eb759afd3ef743c0c9a1b78e83d73ee288750dd48ab76d3f61c33d29044b2f1242c746ff0532e
|
|
@@ -12,12 +12,10 @@ require_relative 'sym_mash'
|
|
|
12
12
|
|
|
13
13
|
require_relative 'chained_daemon/languages_cache'
|
|
14
14
|
require 'opener/language_identifier'
|
|
15
|
-
require 'opener/tokenizer'
|
|
16
|
-
require 'opener/pos_tagger' if RUBY_ENGINE == 'jruby'
|
|
17
15
|
require 'opener/polarity_tagger'
|
|
18
16
|
require 'opener/property_tagger'
|
|
19
17
|
require 'opener/opinion_detector_basic'
|
|
20
|
-
require 'opener/stanza/tokenizer_pos'
|
|
18
|
+
require 'opener/stanza/processor'
|
|
21
19
|
|
|
22
20
|
require_relative 'chained_daemon/chained_daemon'
|
|
23
21
|
require_relative 'chained_daemon/cli'
|
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
module Opener
|
|
2
2
|
class ChainedDaemon
|
|
3
3
|
|
|
4
|
+
def self.http
|
|
5
|
+
http = HTTPClient.new
|
|
6
|
+
http.send_timeout = 600
|
|
7
|
+
http.receive_timeout = 600
|
|
8
|
+
http.connect_timeout = 600
|
|
9
|
+
http
|
|
10
|
+
end
|
|
11
|
+
|
|
4
12
|
DEFAULT_OPTIONS = {
|
|
5
13
|
}
|
|
6
14
|
|
|
@@ -8,9 +16,7 @@ module Opener
|
|
|
8
16
|
@options = DEFAULT_OPTIONS.merge options
|
|
9
17
|
@queue_map = {
|
|
10
18
|
'opener-language-identifier': Opener::LanguageIdentifier.new,
|
|
11
|
-
'stanza-
|
|
12
|
-
#'opener-tokenizer': Opener::Tokenizer.new,
|
|
13
|
-
#'opener-pos-tagger': Opener::POSTagger.new,
|
|
19
|
+
'stanza-processor': Stanza::Processor.new,
|
|
14
20
|
'opener-property-tagger': Opener::PropertyTagger.new,
|
|
15
21
|
'opener-polarity-tagger': Opener::PolarityTagger.new,
|
|
16
22
|
'opener-opinion-detector-basic': Opener::OpinionDetectorBasic.new,
|
|
@@ -50,7 +56,7 @@ module Opener
|
|
|
50
56
|
output = xml.to_s
|
|
51
57
|
end
|
|
52
58
|
|
|
53
|
-
output = pretty_print output if params.cache_keys
|
|
59
|
+
output = pretty_print output if params.cache_keys.environment == 'staging'
|
|
54
60
|
output
|
|
55
61
|
|
|
56
62
|
rescue Core::UnsupportedLanguageError
|
|
@@ -48,10 +48,18 @@ Example:
|
|
|
48
48
|
daemon = ChainedDaemon.new args: args
|
|
49
49
|
input = STDIN.tty? ? nil : STDIN.read
|
|
50
50
|
params = if ENV['PARAMS'] then JSON.parse ENV['PARAMS'] else {} end
|
|
51
|
+
|
|
51
52
|
# Set environment as staging from console for testing purposes
|
|
52
|
-
|
|
53
|
+
env = ENV['LEXICONS_ENV'] || 'staging'
|
|
54
|
+
pt = ENV['LEXICONS_PROPERTY_TYPE']
|
|
55
|
+
params[:cache_keys] = {
|
|
56
|
+
environment: env,
|
|
57
|
+
property_type: pt,
|
|
58
|
+
merged: (true if env == 'staging'),
|
|
59
|
+
}
|
|
53
60
|
|
|
54
|
-
|
|
61
|
+
output = daemon.run input, params
|
|
62
|
+
puts output
|
|
55
63
|
end
|
|
56
64
|
end
|
|
57
65
|
end
|
|
@@ -19,25 +19,15 @@ module Opener
|
|
|
19
19
|
break @cache if @last_updated and @last_updated > UPDATE_INTERVAL.ago
|
|
20
20
|
cache_update
|
|
21
21
|
end
|
|
22
|
-
@cache
|
|
23
22
|
end
|
|
24
23
|
|
|
25
24
|
def cache_update
|
|
26
25
|
puts "loading supported languages from url #{@url}" if ENV['DEBUG']
|
|
27
26
|
|
|
28
|
-
languages = JSON.parse http.get(@url).body
|
|
29
|
-
@cache = languages['data'].map { |l| l['code'] }
|
|
27
|
+
languages = SymMash.new JSON.parse ChainedDaemon.http.get(@url).body
|
|
30
28
|
@last_updated = Time.now
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def http
|
|
34
|
-
return @http if @http
|
|
35
|
-
|
|
36
|
-
@http = HTTPClient.new
|
|
37
|
-
@http.send_timeout = 120
|
|
38
|
-
@http.receive_timeout = 120
|
|
39
|
-
@http.connect_timeout = 120
|
|
40
|
-
@http
|
|
29
|
+
@cache = languages.data.each.with_object({}){ |l,h| h[l.code] = l }
|
|
30
|
+
@cache
|
|
41
31
|
end
|
|
42
32
|
|
|
43
33
|
end
|
data/lib/opener/kaf/document.rb
CHANGED
|
@@ -48,13 +48,14 @@ module Opener
|
|
|
48
48
|
|
|
49
49
|
def add_word_form params
|
|
50
50
|
text = @document.at('text') || @document.root.add_child('<text/>').first
|
|
51
|
-
wf = text.add_child("<wf>#{params
|
|
51
|
+
wf = text.add_child("<wf>#{params.text}</wf>")
|
|
52
52
|
attrs = {
|
|
53
|
-
wid: "w#{params
|
|
54
|
-
sent: params
|
|
55
|
-
para: params
|
|
56
|
-
offset: params
|
|
57
|
-
length: params
|
|
53
|
+
wid: "w#{params.wid}",
|
|
54
|
+
sent: params.sid,
|
|
55
|
+
para: params.para,
|
|
56
|
+
offset: params.offset,
|
|
57
|
+
length: params.length,
|
|
58
|
+
head: params.head,
|
|
58
59
|
}
|
|
59
60
|
wf.attr attrs
|
|
60
61
|
end
|
|
@@ -63,15 +64,16 @@ module Opener
|
|
|
63
64
|
text = @document.at('terms') || @document.root.add_child('<terms/>').first
|
|
64
65
|
term = text.add_child("<term/>")
|
|
65
66
|
attrs = {
|
|
66
|
-
tid: "t#{params
|
|
67
|
-
type: params
|
|
68
|
-
lemma: params
|
|
69
|
-
text: params
|
|
70
|
-
pos: params
|
|
71
|
-
morphofeat: params
|
|
67
|
+
tid: "t#{params.tid}",
|
|
68
|
+
type: params.type,
|
|
69
|
+
lemma: params.lemma,
|
|
70
|
+
text: params.text,
|
|
71
|
+
pos: params.pos,
|
|
72
|
+
morphofeat: params.morphofeat,
|
|
73
|
+
head: params.head,
|
|
72
74
|
}
|
|
73
75
|
term.attr attrs
|
|
74
|
-
term.first.add_child("<span><target id='w#{params
|
|
76
|
+
term.first.add_child("<span><target id='w#{params.wid}'/></span>")
|
|
75
77
|
end
|
|
76
78
|
|
|
77
79
|
def to_xml
|
data/lib/opener/kaf/text.rb
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
module Opener
|
|
2
2
|
module KAF
|
|
3
3
|
class WordForm
|
|
4
|
+
|
|
4
5
|
def initialize(document, xml_node)
|
|
5
6
|
@document = document
|
|
6
7
|
@xml_node = xml_node
|
|
@@ -25,6 +26,7 @@ module Opener
|
|
|
25
26
|
def paragraph
|
|
26
27
|
return @paragraph ||= @xml_node.attr('para').to_i
|
|
27
28
|
end
|
|
29
|
+
|
|
28
30
|
end
|
|
29
31
|
end
|
|
30
32
|
end
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
module Opener
|
|
2
2
|
module Stanza
|
|
3
|
-
class
|
|
3
|
+
class Processor
|
|
4
4
|
|
|
5
5
|
DESC = 'Tokenizer / POS by Stanza'
|
|
6
6
|
VERSION = '1.0'
|
|
@@ -8,8 +8,10 @@ module Opener
|
|
|
8
8
|
BASE_URL = ENV['STANZA_SERVER']
|
|
9
9
|
LANGUAGES_CACHE = Opener::ChainedDaemon::LanguagesCache.new
|
|
10
10
|
|
|
11
|
-
RTL_LANGUAGES = [
|
|
12
|
-
|
|
11
|
+
RTL_LANGUAGES = %w[
|
|
12
|
+
ar ara arc ae ave egy he heb nqo pal phn sam
|
|
13
|
+
syc syr fa per fas ku kur ur urd
|
|
14
|
+
]
|
|
13
15
|
|
|
14
16
|
POS = {
|
|
15
17
|
'DET' => 'D',
|
|
@@ -37,63 +39,68 @@ module Opener
|
|
|
37
39
|
raise 'missing Stanza server' if ENV['STANZA_SERVER'].blank?
|
|
38
40
|
|
|
39
41
|
kaf = KAF::Document.from_xml input
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
42
|
+
lang = LANGUAGES_CACHE.get[kaf.language]
|
|
43
|
+
env = params.cache_keys.environment
|
|
44
|
+
unless lang&.environments&.include? env or (params.cache_keys.merged and lang&.environments&.include? 'production')
|
|
45
|
+
raise Core::UnsupportedLanguageError.new kaf.language
|
|
46
|
+
end
|
|
47
|
+
if env == 'production' and !lang.supported_by_opener
|
|
43
48
|
raise Core::UnsupportedLanguageError.new kaf.language
|
|
44
49
|
end
|
|
45
50
|
|
|
46
|
-
input
|
|
47
|
-
input
|
|
48
|
-
response
|
|
51
|
+
input = kaf.raw
|
|
52
|
+
input = input.gsub(/\,[^\ ]/, ', ')
|
|
53
|
+
response = ChainedDaemon.http.post BASE_URL, {lang: kaf.language, input: input}.to_query
|
|
49
54
|
raise Core::UnsupportedLanguageError, kaf.language if response.status == 406
|
|
50
55
|
raise response.body if response.status >= 400
|
|
51
|
-
|
|
56
|
+
sentences = JSON.parse response.body
|
|
57
|
+
sentences.each{ |s| s.map!{ |t| Hashie::Mash.new t } }
|
|
52
58
|
|
|
53
59
|
w_index = 0
|
|
54
60
|
|
|
55
61
|
miscs = {}
|
|
56
|
-
|
|
62
|
+
sentences.each.with_index do |s, i|
|
|
57
63
|
miscs[i] = {}
|
|
58
|
-
|
|
59
|
-
word
|
|
64
|
+
s.each do |word|
|
|
65
|
+
word.id.is_a?(Array) && word.id.each{ |id| miscs[i][id] = word.misc }
|
|
60
66
|
end
|
|
61
67
|
end
|
|
62
68
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
69
|
+
sentences.map{ |s| s.reverse! } if RTL_LANGUAGES.include? kaf.language
|
|
70
|
+
sentences.each.with_index do |s, s_index|
|
|
71
|
+
s.each do |word|
|
|
66
72
|
w_index += 1
|
|
67
73
|
# save misc for later usase in a MWT case
|
|
68
|
-
next if word
|
|
74
|
+
next if word.id.is_a? Array
|
|
69
75
|
|
|
70
|
-
misc = word
|
|
76
|
+
misc = word.misc || miscs[s_index][word.id]
|
|
71
77
|
|
|
72
|
-
Rollbar.scoped({ input: input, params: params,
|
|
78
|
+
Rollbar.scoped({ input: input, params: params, sentences: sentences, word: word }) do
|
|
73
79
|
raise 'Missing misc'
|
|
74
80
|
end if misc.nil?
|
|
75
81
|
|
|
76
82
|
offset = misc.match(/start_char=(\d+)|/)[1].to_i
|
|
77
83
|
length = misc.match(/end_char=(\d+)/)[1].to_i - offset
|
|
78
84
|
|
|
79
|
-
u_pos = word
|
|
85
|
+
u_pos = word.upos
|
|
80
86
|
pos = POS[u_pos]
|
|
81
87
|
raise "Didn't find a map for #{u_pos}" if pos.nil?
|
|
82
88
|
type = if POS_OPEN.include? pos then 'open' else 'close' end
|
|
83
89
|
|
|
84
|
-
params =
|
|
90
|
+
params = Hashie::Mash.new(
|
|
85
91
|
wid: w_index,
|
|
86
92
|
sid: s_index + 1,
|
|
87
93
|
tid: w_index,
|
|
88
94
|
para: 1,
|
|
89
95
|
offset: offset,
|
|
90
96
|
length: length,
|
|
91
|
-
text: word
|
|
92
|
-
lemma: word
|
|
97
|
+
text: word.text,
|
|
98
|
+
lemma: word.lemma,
|
|
93
99
|
morphofeat: u_pos,
|
|
94
100
|
pos: pos,
|
|
95
101
|
type: type,
|
|
96
|
-
|
|
102
|
+
head: word.head,
|
|
103
|
+
)
|
|
97
104
|
|
|
98
105
|
kaf.add_word_form params
|
|
99
106
|
kaf.add_term params
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: opener-chained-daemon
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.3.2
|
|
4
|
+
version: 3.3.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- development@olery.com
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2021-02-
|
|
11
|
+
date: 2021-02-26 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: activesupport
|
|
@@ -94,20 +94,6 @@ dependencies:
|
|
|
94
94
|
- - ">="
|
|
95
95
|
- !ruby/object:Gem::Version
|
|
96
96
|
version: '0'
|
|
97
|
-
- !ruby/object:Gem::Dependency
|
|
98
|
-
name: faraday
|
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
|
100
|
-
requirements:
|
|
101
|
-
- - ">="
|
|
102
|
-
- !ruby/object:Gem::Version
|
|
103
|
-
version: '0'
|
|
104
|
-
type: :runtime
|
|
105
|
-
prerelease: false
|
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
107
|
-
requirements:
|
|
108
|
-
- - ">="
|
|
109
|
-
- !ruby/object:Gem::Version
|
|
110
|
-
version: '0'
|
|
111
97
|
- !ruby/object:Gem::Dependency
|
|
112
98
|
name: opener-daemons
|
|
113
99
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -330,7 +316,7 @@ files:
|
|
|
330
316
|
- lib/opener/kaf/document.rb
|
|
331
317
|
- lib/opener/kaf/term.rb
|
|
332
318
|
- lib/opener/kaf/text.rb
|
|
333
|
-
- lib/opener/stanza/tokenizer_pos.rb
|
|
319
|
+
- lib/opener/stanza/processor.rb
|
|
334
320
|
- lib/opener/sym_mash.rb
|
|
335
321
|
- opener-chained-daemon.gemspec
|
|
336
322
|
homepage:
|