opener-chained-daemon 3.3.2 → 3.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cc653c1ad199004e447345039efddd2254448322024b4e73b1ef5b1fed16cb2a
4
- data.tar.gz: c4bf149413782c68ec589cf2dc3aa46f102fa7d1da9e86705214117b1ab99197
3
+ metadata.gz: 5bdd841299458126e66df2dff1857ea3c2d2386efab9f092def36357935e2f9d
4
+ data.tar.gz: 140ff88d7ddd90a90bc21106bc9a0d3370802969bca20820ee04b45398e975c3
5
5
  SHA512:
6
- metadata.gz: 36baa4df69fe723ae8901a85343e33e80a21a3f81f0ac31405ccd45ac49000c4433b80bdbe3fa95b5539dfc05335c53b784cb312bd4a787b1e74a72448db10b4
7
- data.tar.gz: 3f8307d1f57cc6da874f2f6a51d52cf72dbe18316f4228406dd13c8aaf7a643722408b1418b5e1b7af9e98687d210558ba177e1d6de2a672d68189957cec21fa
6
+ metadata.gz: 7026baaf0c0541725cb3459575bec129354bada59be1aa30174425051d26ff973f4a0359985cb58b1ad70035ba6c6c9db4409b29631cf9b1d8fbe16152a59268
7
+ data.tar.gz: d947f1bc75e048b56d18b5406ed455a365c406fc1b01bf6e4b6ee0ae4584f2513e45589f93d5e1486c47fe34e1305d4a0105612ec7b17dc5d75e67c0658e0d0a
@@ -12,12 +12,10 @@ require_relative 'sym_mash'
12
12
 
13
13
  require_relative 'chained_daemon/languages_cache'
14
14
  require 'opener/language_identifier'
15
- require 'opener/tokenizer'
16
- require 'opener/pos_tagger' if RUBY_ENGINE == 'jruby'
17
15
  require 'opener/polarity_tagger'
18
16
  require 'opener/property_tagger'
19
17
  require 'opener/opinion_detector_basic'
20
- require 'opener/stanza/tokenizer_pos'
18
+ require 'opener/stanza/processor'
21
19
 
22
20
  require_relative 'chained_daemon/chained_daemon'
23
21
  require_relative 'chained_daemon/cli'
@@ -8,9 +8,7 @@ module Opener
8
8
  @options = DEFAULT_OPTIONS.merge options
9
9
  @queue_map = {
10
10
  'opener-language-identifier': Opener::LanguageIdentifier.new,
11
- 'stanza-tokenizer-pos': Stanza::TokenizerPos.new, # replace this tokenizer-pos with both below with you dont have a stanza server
12
- #'opener-tokenizer': Opener::Tokenizer.new,
13
- #'opener-pos-tagger': Opener::POSTagger.new,
11
+ 'stanza-processor': Stanza::Processor.new,
14
12
  'opener-property-tagger': Opener::PropertyTagger.new,
15
13
  'opener-polarity-tagger': Opener::PolarityTagger.new,
16
14
  'opener-opinion-detector-basic': Opener::OpinionDetectorBasic.new,
@@ -50,7 +48,7 @@ module Opener
50
48
  output = xml.to_s
51
49
  end
52
50
 
53
- output = pretty_print output if params.cache_keys&.environment == 'staging'
51
+ output = pretty_print output if params.cache_keys.environment == 'staging'
54
52
  output
55
53
 
56
54
  rescue Core::UnsupportedLanguageError
@@ -48,10 +48,18 @@ Example:
48
48
  daemon = ChainedDaemon.new args: args
49
49
  input = STDIN.tty? ? nil : STDIN.read
50
50
  params = if ENV['PARAMS'] then JSON.parse ENV['PARAMS'] else {} end
51
+
51
52
  # Set environment as staging from console for testing purposes
52
- params[:cache_keys] = { environment: 'staging', merged: true }
53
+ env = ENV['LEXICONS_ENV'] || 'staging'
54
+ pt = ENV['LEXICONS_PROPERTY_TYPE']
55
+ params[:cache_keys] = {
56
+ environment: env,
57
+ property_type: pt,
58
+ merged: (true if env == 'staging'),
59
+ }
53
60
 
54
- puts daemon.run input, params || {}
61
+ output = daemon.run input, params
62
+ puts output
55
63
  end
56
64
  end
57
65
  end
@@ -19,15 +19,15 @@ module Opener
19
19
  break @cache if @last_updated and @last_updated > UPDATE_INTERVAL.ago
20
20
  cache_update
21
21
  end
22
- @cache
23
22
  end
24
23
 
25
24
  def cache_update
26
25
  puts "loading supported languages from url #{@url}" if ENV['DEBUG']
27
26
 
28
- languages = JSON.parse http.get(@url).body
29
- @cache = languages['data'].map { |l| l['code'] }
27
+ languages = SymMash.new JSON.parse http.get(@url).body
30
28
  @last_updated = Time.now
29
+ @cache = languages.data.each.with_object({}){ |l,h| h[l.code] = l }
30
+ @cache
31
31
  end
32
32
 
33
33
  def http
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class ChainedDaemon
3
3
 
4
- VERSION = '3.3.2'
4
+ VERSION = '3.3.3'
5
5
 
6
6
  end
7
7
  end
@@ -48,13 +48,14 @@ module Opener
48
48
 
49
49
  def add_word_form params
50
50
  text = @document.at('text') || @document.root.add_child('<text/>').first
51
- wf = text.add_child("<wf>#{params[:text]}</wf>")
51
+ wf = text.add_child("<wf>#{params.text}</wf>")
52
52
  attrs = {
53
- wid: "w#{params[:wid]}",
54
- sent: params[:sid],
55
- para: params[:para],
56
- offset: params[:offset],
57
- length: params[:length],
53
+ wid: "w#{params.wid}",
54
+ sent: params.sid,
55
+ para: params.para,
56
+ offset: params.offset,
57
+ length: params.length,
58
+ head: params.head,
58
59
  }
59
60
  wf.attr attrs
60
61
  end
@@ -63,15 +64,16 @@ module Opener
63
64
  text = @document.at('terms') || @document.root.add_child('<terms/>').first
64
65
  term = text.add_child("<term/>")
65
66
  attrs = {
66
- tid: "t#{params[:tid]}",
67
- type: params[:type],
68
- lemma: params[:lemma],
69
- text: params[:text],
70
- pos: params[:pos],
71
- morphofeat: params[:morphofeat],
67
+ tid: "t#{params.tid}",
68
+ type: params.type,
69
+ lemma: params.lemma,
70
+ text: params.text,
71
+ pos: params.pos,
72
+ morphofeat: params.morphofeat,
73
+ head: params.head,
72
74
  }
73
75
  term.attr attrs
74
- term.first.add_child("<span><target id='w#{params[:wid]}' /></span>")
76
+ term.first.add_child("<span><target id='w#{params.wid}'/></span>")
75
77
  end
76
78
 
77
79
  def to_xml
@@ -1,6 +1,7 @@
1
1
  module Opener
2
2
  module KAF
3
3
  class WordForm
4
+
4
5
  def initialize(document, xml_node)
5
6
  @document = document
6
7
  @xml_node = xml_node
@@ -25,6 +26,7 @@ module Opener
25
26
  def paragraph
26
27
  return @paragraph ||= @xml_node.attr('para').to_i
27
28
  end
29
+
28
30
  end
29
31
  end
30
32
  end
@@ -1,6 +1,6 @@
1
1
  module Opener
2
2
  module Stanza
3
- class TokenizerPos
3
+ class Processor
4
4
 
5
5
  DESC = 'Tokenizer / POS by Stanza'
6
6
  VERSION = '1.0'
@@ -8,8 +8,10 @@ module Opener
8
8
  BASE_URL = ENV['STANZA_SERVER']
9
9
  LANGUAGES_CACHE = Opener::ChainedDaemon::LanguagesCache.new
10
10
 
11
- RTL_LANGUAGES = [ "ar", "ara", "arc", "ae", "ave", "egy", "he", "heb", "nqo", "pal", "phn", "sam",
12
- "syc", "syr", "fa", "per", "fas", "ku", "kur", "ur", "urd" ]
11
+ RTL_LANGUAGES = %w[
12
+ ar ara arc ae ave egy he heb nqo pal phn sam
13
+ syc syr fa per fas ku kur ur urd
14
+ ]
13
15
 
14
16
  POS = {
15
17
  'DET' => 'D',
@@ -37,63 +39,65 @@ module Opener
37
39
  raise 'missing Stanza server' if ENV['STANZA_SERVER'].blank?
38
40
 
39
41
  kaf = KAF::Document.from_xml input
40
-
41
- prod = params.cache_keys.environment == 'production'
42
- if prod and !LANGUAGES_CACHE.get.include?(kaf.language)
42
+ lang = LANGUAGES_CACHE.get[kaf.language]
43
+ env = params.cache_keys.environment
44
+ unless lang&.environments&.include? env or (env == 'staging' and lang&.environments&.include? 'production')
43
45
  raise Core::UnsupportedLanguageError.new kaf.language
44
46
  end
45
47
 
46
- input = kaf.raw
47
- input = input.gsub(/\,[^\ ]/, ', ')
48
- response = Faraday.post BASE_URL, {lang: kaf.language, input: input}.to_query
48
+ input = kaf.raw
49
+ input = input.gsub(/\,[^\ ]/, ', ')
50
+ response = Faraday.post BASE_URL, {lang: kaf.language, input: input}.to_query
49
51
  raise Core::UnsupportedLanguageError, kaf.language if response.status == 406
50
52
  raise response.body if response.status >= 400
51
- tokens = JSON.parse response.body
53
+ sentences = JSON.parse response.body
54
+ sentences.each{ |s| s.map!{ |t| Hashie::Mash.new t } }
52
55
 
53
56
  w_index = 0
54
57
 
55
58
  miscs = {}
56
- tokens.each_with_index do |t, i|
59
+ sentences.each.with_index do |s, i|
57
60
  miscs[i] = {}
58
- t.each do |word|
59
- word['id'].is_a?(Array) && word['id'].each { |id| miscs[i][id] = word['misc'] }
61
+ s.each do |word|
62
+ word.id.is_a?(Array) && word.id.each{ |id| miscs[i][id] = word.misc }
60
63
  end
61
64
  end
62
65
 
63
- tokens.map{ |t| t.reverse! } if RTL_LANGUAGES.include? kaf.language
64
- tokens.each_with_index do |sentence, s_index|
65
- sentence.each_with_index do |word|
66
+ sentences.map{ |s| s.reverse! } if RTL_LANGUAGES.include? kaf.language
67
+ sentences.each.with_index do |s, s_index|
68
+ s.each do |word|
66
69
  w_index += 1
67
70
  # save misc for later usase in a MWT case
68
- next if word['id'].is_a? Array
71
+ next if word.id.is_a? Array
69
72
 
70
- misc = word['misc'] || miscs[s_index][word['id']]
73
+ misc = word.misc || miscs[s_index][word.id]
71
74
 
72
- Rollbar.scoped({ input: input, params: params, tokens: tokens, word: word }) do
75
+ Rollbar.scoped({ input: input, params: params, sentences: sentences, word: word }) do
73
76
  raise 'Missing misc'
74
77
  end if misc.nil?
75
78
 
76
79
  offset = misc.match(/start_char=(\d+)|/)[1].to_i
77
80
  length = misc.match(/end_char=(\d+)/)[1].to_i - offset
78
81
 
79
- u_pos = word['upos']
82
+ u_pos = word.upos
80
83
  pos = POS[u_pos]
81
84
  raise "Didn't find a map for #{u_pos}" if pos.nil?
82
85
  type = if POS_OPEN.include? pos then 'open' else 'close' end
83
86
 
84
- params = {
87
+ params = Hashie::Mash.new(
85
88
  wid: w_index,
86
89
  sid: s_index + 1,
87
90
  tid: w_index,
88
91
  para: 1,
89
92
  offset: offset,
90
93
  length: length,
91
- text: word['text'],
92
- lemma: word['lemma'],
94
+ text: word.text,
95
+ lemma: word.lemma,
93
96
  morphofeat: u_pos,
94
97
  pos: pos,
95
98
  type: type,
96
- }
99
+ head: word.head,
100
+ )
97
101
 
98
102
  kaf.add_word_form params
99
103
  kaf.add_term params
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-chained-daemon
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.2
4
+ version: 3.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-02-04 00:00:00.000000000 Z
11
+ date: 2021-02-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -307,10 +307,10 @@ dependencies:
307
307
  description: OpeNER daemon for processing multiple queues at once
308
308
  email:
309
309
  executables:
310
- - chained-daemon
310
+ - chained-daemon-daemon
311
311
  - chained-daemon-csv
312
312
  - console
313
- - chained-daemon-daemon
313
+ - chained-daemon
314
314
  extensions: []
315
315
  extra_rdoc_files: []
316
316
  files:
@@ -330,7 +330,7 @@ files:
330
330
  - lib/opener/kaf/document.rb
331
331
  - lib/opener/kaf/term.rb
332
332
  - lib/opener/kaf/text.rb
333
- - lib/opener/stanza/tokenizer_pos.rb
333
+ - lib/opener/stanza/processor.rb
334
334
  - lib/opener/sym_mash.rb
335
335
  - opener-chained-daemon.gemspec
336
336
  homepage:
@@ -353,7 +353,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
353
353
  version: '0'
354
354
  requirements: []
355
355
  rubyforge_project:
356
- rubygems_version: 2.7.8
356
+ rubygems_version: 2.7.6.2
357
357
  signing_key:
358
358
  specification_version: 4
359
359
  summary: OpeNER daemon for processing multiple queues at once