opener-chained-daemon 3.3.2 → 3.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/opener/chained_daemon.rb +1 -3
- data/lib/opener/chained_daemon/chained_daemon.rb +2 -4
- data/lib/opener/chained_daemon/cli.rb +10 -2
- data/lib/opener/chained_daemon/languages_cache.rb +3 -3
- data/lib/opener/chained_daemon/version.rb +1 -1
- data/lib/opener/kaf/document.rb +15 -13
- data/lib/opener/kaf/text.rb +2 -0
- data/lib/opener/stanza/{tokenizer_pos.rb → processor.rb} +28 -24
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5bdd841299458126e66df2dff1857ea3c2d2386efab9f092def36357935e2f9d
|
4
|
+
data.tar.gz: 140ff88d7ddd90a90bc21106bc9a0d3370802969bca20820ee04b45398e975c3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7026baaf0c0541725cb3459575bec129354bada59be1aa30174425051d26ff973f4a0359985cb58b1ad70035ba6c6c9db4409b29631cf9b1d8fbe16152a59268
|
7
|
+
data.tar.gz: d947f1bc75e048b56d18b5406ed455a365c406fc1b01bf6e4b6ee0ae4584f2513e45589f93d5e1486c47fe34e1305d4a0105612ec7b17dc5d75e67c0658e0d0a
|
@@ -12,12 +12,10 @@ require_relative 'sym_mash'
|
|
12
12
|
|
13
13
|
require_relative 'chained_daemon/languages_cache'
|
14
14
|
require 'opener/language_identifier'
|
15
|
-
require 'opener/tokenizer'
|
16
|
-
require 'opener/pos_tagger' if RUBY_ENGINE == 'jruby'
|
17
15
|
require 'opener/polarity_tagger'
|
18
16
|
require 'opener/property_tagger'
|
19
17
|
require 'opener/opinion_detector_basic'
|
20
|
-
require 'opener/stanza/
|
18
|
+
require 'opener/stanza/processor'
|
21
19
|
|
22
20
|
require_relative 'chained_daemon/chained_daemon'
|
23
21
|
require_relative 'chained_daemon/cli'
|
@@ -8,9 +8,7 @@ module Opener
|
|
8
8
|
@options = DEFAULT_OPTIONS.merge options
|
9
9
|
@queue_map = {
|
10
10
|
'opener-language-identifier': Opener::LanguageIdentifier.new,
|
11
|
-
'stanza-
|
12
|
-
#'opener-tokenizer': Opener::Tokenizer.new,
|
13
|
-
#'opener-pos-tagger': Opener::POSTagger.new,
|
11
|
+
'stanza-processor': Stanza::Processor.new,
|
14
12
|
'opener-property-tagger': Opener::PropertyTagger.new,
|
15
13
|
'opener-polarity-tagger': Opener::PolarityTagger.new,
|
16
14
|
'opener-opinion-detector-basic': Opener::OpinionDetectorBasic.new,
|
@@ -50,7 +48,7 @@ module Opener
|
|
50
48
|
output = xml.to_s
|
51
49
|
end
|
52
50
|
|
53
|
-
output = pretty_print output if params.cache_keys
|
51
|
+
output = pretty_print output if params.cache_keys.environment == 'staging'
|
54
52
|
output
|
55
53
|
|
56
54
|
rescue Core::UnsupportedLanguageError
|
@@ -48,10 +48,18 @@ Example:
|
|
48
48
|
daemon = ChainedDaemon.new args: args
|
49
49
|
input = STDIN.tty? ? nil : STDIN.read
|
50
50
|
params = if ENV['PARAMS'] then JSON.parse ENV['PARAMS'] else {} end
|
51
|
+
|
51
52
|
# Set environment as staging from console for testing purposes
|
52
|
-
|
53
|
+
env = ENV['LEXICONS_ENV'] || 'staging'
|
54
|
+
pt = ENV['LEXICONS_PROPERTY_TYPE']
|
55
|
+
params[:cache_keys] = {
|
56
|
+
environment: env,
|
57
|
+
property_type: pt,
|
58
|
+
merged: (true if env == 'staging'),
|
59
|
+
}
|
53
60
|
|
54
|
-
|
61
|
+
output = daemon.run input, params
|
62
|
+
puts output
|
55
63
|
end
|
56
64
|
end
|
57
65
|
end
|
@@ -19,15 +19,15 @@ module Opener
|
|
19
19
|
break @cache if @last_updated and @last_updated > UPDATE_INTERVAL.ago
|
20
20
|
cache_update
|
21
21
|
end
|
22
|
-
@cache
|
23
22
|
end
|
24
23
|
|
25
24
|
def cache_update
|
26
25
|
puts "loading supported languages from url #{@url}" if ENV['DEBUG']
|
27
26
|
|
28
|
-
languages = JSON.parse http.get(@url).body
|
29
|
-
@cache = languages['data'].map { |l| l['code'] }
|
27
|
+
languages = SymMash.new JSON.parse http.get(@url).body
|
30
28
|
@last_updated = Time.now
|
29
|
+
@cache = languages.data.each.with_object({}){ |l,h| h[l.code] = l }
|
30
|
+
@cache
|
31
31
|
end
|
32
32
|
|
33
33
|
def http
|
data/lib/opener/kaf/document.rb
CHANGED
@@ -48,13 +48,14 @@ module Opener
|
|
48
48
|
|
49
49
|
def add_word_form params
|
50
50
|
text = @document.at('text') || @document.root.add_child('<text/>').first
|
51
|
-
wf = text.add_child("<wf>#{params
|
51
|
+
wf = text.add_child("<wf>#{params.text}</wf>")
|
52
52
|
attrs = {
|
53
|
-
wid: "w#{params
|
54
|
-
sent: params
|
55
|
-
para: params
|
56
|
-
offset: params
|
57
|
-
length: params
|
53
|
+
wid: "w#{params.wid}",
|
54
|
+
sent: params.sid,
|
55
|
+
para: params.para,
|
56
|
+
offset: params.offset,
|
57
|
+
length: params.length,
|
58
|
+
head: params.head,
|
58
59
|
}
|
59
60
|
wf.attr attrs
|
60
61
|
end
|
@@ -63,15 +64,16 @@ module Opener
|
|
63
64
|
text = @document.at('terms') || @document.root.add_child('<terms/>').first
|
64
65
|
term = text.add_child("<term/>")
|
65
66
|
attrs = {
|
66
|
-
tid: "t#{params
|
67
|
-
type: params
|
68
|
-
lemma: params
|
69
|
-
text: params
|
70
|
-
pos: params
|
71
|
-
morphofeat: params
|
67
|
+
tid: "t#{params.tid}",
|
68
|
+
type: params.type,
|
69
|
+
lemma: params.lemma,
|
70
|
+
text: params.text,
|
71
|
+
pos: params.pos,
|
72
|
+
morphofeat: params.morphofeat,
|
73
|
+
head: params.head,
|
72
74
|
}
|
73
75
|
term.attr attrs
|
74
|
-
term.first.add_child("<span><target id='w#{params
|
76
|
+
term.first.add_child("<span><target id='w#{params.wid}'/></span>")
|
75
77
|
end
|
76
78
|
|
77
79
|
def to_xml
|
data/lib/opener/kaf/text.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module Opener
|
2
2
|
module KAF
|
3
3
|
class WordForm
|
4
|
+
|
4
5
|
def initialize(document, xml_node)
|
5
6
|
@document = document
|
6
7
|
@xml_node = xml_node
|
@@ -25,6 +26,7 @@ module Opener
|
|
25
26
|
def paragraph
|
26
27
|
return @paragraph ||= @xml_node.attr('para').to_i
|
27
28
|
end
|
29
|
+
|
28
30
|
end
|
29
31
|
end
|
30
32
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Opener
|
2
2
|
module Stanza
|
3
|
-
class
|
3
|
+
class Processor
|
4
4
|
|
5
5
|
DESC = 'Tokenizer / POS by Stanza'
|
6
6
|
VERSION = '1.0'
|
@@ -8,8 +8,10 @@ module Opener
|
|
8
8
|
BASE_URL = ENV['STANZA_SERVER']
|
9
9
|
LANGUAGES_CACHE = Opener::ChainedDaemon::LanguagesCache.new
|
10
10
|
|
11
|
-
RTL_LANGUAGES = [
|
12
|
-
|
11
|
+
RTL_LANGUAGES = %w[
|
12
|
+
ar ara arc ae ave egy he heb nqo pal phn sam
|
13
|
+
syc syr fa per fas ku kur ur urd
|
14
|
+
]
|
13
15
|
|
14
16
|
POS = {
|
15
17
|
'DET' => 'D',
|
@@ -37,63 +39,65 @@ module Opener
|
|
37
39
|
raise 'missing Stanza server' if ENV['STANZA_SERVER'].blank?
|
38
40
|
|
39
41
|
kaf = KAF::Document.from_xml input
|
40
|
-
|
41
|
-
|
42
|
-
|
42
|
+
lang = LANGUAGES_CACHE.get[kaf.language]
|
43
|
+
env = params.cache_keys.environment
|
44
|
+
unless lang&.environments&.include? env or (env == 'staging' and lang&.environments&.include? 'production')
|
43
45
|
raise Core::UnsupportedLanguageError.new kaf.language
|
44
46
|
end
|
45
47
|
|
46
|
-
input
|
47
|
-
input
|
48
|
-
response
|
48
|
+
input = kaf.raw
|
49
|
+
input = input.gsub(/\,[^\ ]/, ', ')
|
50
|
+
response = Faraday.post BASE_URL, {lang: kaf.language, input: input}.to_query
|
49
51
|
raise Core::UnsupportedLanguageError, kaf.language if response.status == 406
|
50
52
|
raise response.body if response.status >= 400
|
51
|
-
|
53
|
+
sentences = JSON.parse response.body
|
54
|
+
sentences.each{ |s| s.map!{ |t| Hashie::Mash.new t } }
|
52
55
|
|
53
56
|
w_index = 0
|
54
57
|
|
55
58
|
miscs = {}
|
56
|
-
|
59
|
+
sentences.each.with_index do |s, i|
|
57
60
|
miscs[i] = {}
|
58
|
-
|
59
|
-
word
|
61
|
+
s.each do |word|
|
62
|
+
word.id.is_a?(Array) && word.id.each{ |id| miscs[i][id] = word.misc }
|
60
63
|
end
|
61
64
|
end
|
62
65
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
+
sentences.map{ |s| s.reverse! } if RTL_LANGUAGES.include? kaf.language
|
67
|
+
sentences.each.with_index do |s, s_index|
|
68
|
+
s.each do |word|
|
66
69
|
w_index += 1
|
67
70
|
# save misc for later usage in an MWT case
|
68
|
-
next if word
|
71
|
+
next if word.id.is_a? Array
|
69
72
|
|
70
|
-
misc = word
|
73
|
+
misc = word.misc || miscs[s_index][word.id]
|
71
74
|
|
72
|
-
Rollbar.scoped({ input: input, params: params,
|
75
|
+
Rollbar.scoped({ input: input, params: params, sentences: sentences, word: word }) do
|
73
76
|
raise 'Missing misc'
|
74
77
|
end if misc.nil?
|
75
78
|
|
76
79
|
offset = misc.match(/start_char=(\d+)|/)[1].to_i
|
77
80
|
length = misc.match(/end_char=(\d+)/)[1].to_i - offset
|
78
81
|
|
79
|
-
u_pos = word
|
82
|
+
u_pos = word.upos
|
80
83
|
pos = POS[u_pos]
|
81
84
|
raise "Didn't find a map for #{u_pos}" if pos.nil?
|
82
85
|
type = if POS_OPEN.include? pos then 'open' else 'close' end
|
83
86
|
|
84
|
-
params =
|
87
|
+
params = Hashie::Mash.new(
|
85
88
|
wid: w_index,
|
86
89
|
sid: s_index + 1,
|
87
90
|
tid: w_index,
|
88
91
|
para: 1,
|
89
92
|
offset: offset,
|
90
93
|
length: length,
|
91
|
-
text: word
|
92
|
-
lemma: word
|
94
|
+
text: word.text,
|
95
|
+
lemma: word.lemma,
|
93
96
|
morphofeat: u_pos,
|
94
97
|
pos: pos,
|
95
98
|
type: type,
|
96
|
-
|
99
|
+
head: word.head,
|
100
|
+
)
|
97
101
|
|
98
102
|
kaf.add_word_form params
|
99
103
|
kaf.add_term params
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-chained-daemon
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.3.
|
4
|
+
version: 3.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-02-
|
11
|
+
date: 2021-02-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -307,10 +307,10 @@ dependencies:
|
|
307
307
|
description: OpeNER daemon for processing multiple queues at once
|
308
308
|
email:
|
309
309
|
executables:
|
310
|
-
- chained-daemon
|
310
|
+
- chained-daemon-daemon
|
311
311
|
- chained-daemon-csv
|
312
312
|
- console
|
313
|
-
- chained-daemon
|
313
|
+
- chained-daemon
|
314
314
|
extensions: []
|
315
315
|
extra_rdoc_files: []
|
316
316
|
files:
|
@@ -330,7 +330,7 @@ files:
|
|
330
330
|
- lib/opener/kaf/document.rb
|
331
331
|
- lib/opener/kaf/term.rb
|
332
332
|
- lib/opener/kaf/text.rb
|
333
|
-
- lib/opener/stanza/
|
333
|
+
- lib/opener/stanza/processor.rb
|
334
334
|
- lib/opener/sym_mash.rb
|
335
335
|
- opener-chained-daemon.gemspec
|
336
336
|
homepage:
|
@@ -353,7 +353,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
353
353
|
version: '0'
|
354
354
|
requirements: []
|
355
355
|
rubyforge_project:
|
356
|
-
rubygems_version: 2.7.
|
356
|
+
rubygems_version: 2.7.6.2
|
357
357
|
signing_key:
|
358
358
|
specification_version: 4
|
359
359
|
summary: OpeNER daemon for processing multiple queues at once
|