opener-chained-daemon 3.3.2 → 3.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cc653c1ad199004e447345039efddd2254448322024b4e73b1ef5b1fed16cb2a
4
- data.tar.gz: c4bf149413782c68ec589cf2dc3aa46f102fa7d1da9e86705214117b1ab99197
3
+ metadata.gz: 5bdd841299458126e66df2dff1857ea3c2d2386efab9f092def36357935e2f9d
4
+ data.tar.gz: 140ff88d7ddd90a90bc21106bc9a0d3370802969bca20820ee04b45398e975c3
5
5
  SHA512:
6
- metadata.gz: 36baa4df69fe723ae8901a85343e33e80a21a3f81f0ac31405ccd45ac49000c4433b80bdbe3fa95b5539dfc05335c53b784cb312bd4a787b1e74a72448db10b4
7
- data.tar.gz: 3f8307d1f57cc6da874f2f6a51d52cf72dbe18316f4228406dd13c8aaf7a643722408b1418b5e1b7af9e98687d210558ba177e1d6de2a672d68189957cec21fa
6
+ metadata.gz: 7026baaf0c0541725cb3459575bec129354bada59be1aa30174425051d26ff973f4a0359985cb58b1ad70035ba6c6c9db4409b29631cf9b1d8fbe16152a59268
7
+ data.tar.gz: d947f1bc75e048b56d18b5406ed455a365c406fc1b01bf6e4b6ee0ae4584f2513e45589f93d5e1486c47fe34e1305d4a0105612ec7b17dc5d75e67c0658e0d0a
@@ -12,12 +12,10 @@ require_relative 'sym_mash'
12
12
 
13
13
  require_relative 'chained_daemon/languages_cache'
14
14
  require 'opener/language_identifier'
15
- require 'opener/tokenizer'
16
- require 'opener/pos_tagger' if RUBY_ENGINE == 'jruby'
17
15
  require 'opener/polarity_tagger'
18
16
  require 'opener/property_tagger'
19
17
  require 'opener/opinion_detector_basic'
20
- require 'opener/stanza/tokenizer_pos'
18
+ require 'opener/stanza/processor'
21
19
 
22
20
  require_relative 'chained_daemon/chained_daemon'
23
21
  require_relative 'chained_daemon/cli'
@@ -8,9 +8,7 @@ module Opener
8
8
  @options = DEFAULT_OPTIONS.merge options
9
9
  @queue_map = {
10
10
  'opener-language-identifier': Opener::LanguageIdentifier.new,
11
- 'stanza-tokenizer-pos': Stanza::TokenizerPos.new, # replace this tokenizer-pos with both below with you dont have a stanza server
12
- #'opener-tokenizer': Opener::Tokenizer.new,
13
- #'opener-pos-tagger': Opener::POSTagger.new,
11
+ 'stanza-processor': Stanza::Processor.new,
14
12
  'opener-property-tagger': Opener::PropertyTagger.new,
15
13
  'opener-polarity-tagger': Opener::PolarityTagger.new,
16
14
  'opener-opinion-detector-basic': Opener::OpinionDetectorBasic.new,
@@ -50,7 +48,7 @@ module Opener
50
48
  output = xml.to_s
51
49
  end
52
50
 
53
- output = pretty_print output if params.cache_keys&.environment == 'staging'
51
+ output = pretty_print output if params.cache_keys.environment == 'staging'
54
52
  output
55
53
 
56
54
  rescue Core::UnsupportedLanguageError
@@ -48,10 +48,18 @@ Example:
48
48
  daemon = ChainedDaemon.new args: args
49
49
  input = STDIN.tty? ? nil : STDIN.read
50
50
  params = if ENV['PARAMS'] then JSON.parse ENV['PARAMS'] else {} end
51
+
51
52
  # Set environment as staging from console for testing purposes
52
- params[:cache_keys] = { environment: 'staging', merged: true }
53
+ env = ENV['LEXICONS_ENV'] || 'staging'
54
+ pt = ENV['LEXICONS_PROPERTY_TYPE']
55
+ params[:cache_keys] = {
56
+ environment: env,
57
+ property_type: pt,
58
+ merged: (true if env == 'staging'),
59
+ }
53
60
 
54
- puts daemon.run input, params || {}
61
+ output = daemon.run input, params
62
+ puts output
55
63
  end
56
64
  end
57
65
  end
@@ -19,15 +19,15 @@ module Opener
19
19
  break @cache if @last_updated and @last_updated > UPDATE_INTERVAL.ago
20
20
  cache_update
21
21
  end
22
- @cache
23
22
  end
24
23
 
25
24
  def cache_update
26
25
  puts "loading supported languages from url #{@url}" if ENV['DEBUG']
27
26
 
28
- languages = JSON.parse http.get(@url).body
29
- @cache = languages['data'].map { |l| l['code'] }
27
+ languages = SymMash.new JSON.parse http.get(@url).body
30
28
  @last_updated = Time.now
29
+ @cache = languages.data.each.with_object({}){ |l,h| h[l.code] = l }
30
+ @cache
31
31
  end
32
32
 
33
33
  def http
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class ChainedDaemon
3
3
 
4
- VERSION = '3.3.2'
4
+ VERSION = '3.3.3'
5
5
 
6
6
  end
7
7
  end
@@ -48,13 +48,14 @@ module Opener
48
48
 
49
49
  def add_word_form params
50
50
  text = @document.at('text') || @document.root.add_child('<text/>').first
51
- wf = text.add_child("<wf>#{params[:text]}</wf>")
51
+ wf = text.add_child("<wf>#{params.text}</wf>")
52
52
  attrs = {
53
- wid: "w#{params[:wid]}",
54
- sent: params[:sid],
55
- para: params[:para],
56
- offset: params[:offset],
57
- length: params[:length],
53
+ wid: "w#{params.wid}",
54
+ sent: params.sid,
55
+ para: params.para,
56
+ offset: params.offset,
57
+ length: params.length,
58
+ head: params.head,
58
59
  }
59
60
  wf.attr attrs
60
61
  end
@@ -63,15 +64,16 @@ module Opener
63
64
  text = @document.at('terms') || @document.root.add_child('<terms/>').first
64
65
  term = text.add_child("<term/>")
65
66
  attrs = {
66
- tid: "t#{params[:tid]}",
67
- type: params[:type],
68
- lemma: params[:lemma],
69
- text: params[:text],
70
- pos: params[:pos],
71
- morphofeat: params[:morphofeat],
67
+ tid: "t#{params.tid}",
68
+ type: params.type,
69
+ lemma: params.lemma,
70
+ text: params.text,
71
+ pos: params.pos,
72
+ morphofeat: params.morphofeat,
73
+ head: params.head,
72
74
  }
73
75
  term.attr attrs
74
- term.first.add_child("<span><target id='w#{params[:wid]}' /></span>")
76
+ term.first.add_child("<span><target id='w#{params.wid}'/></span>")
75
77
  end
76
78
 
77
79
  def to_xml
@@ -1,6 +1,7 @@
1
1
  module Opener
2
2
  module KAF
3
3
  class WordForm
4
+
4
5
  def initialize(document, xml_node)
5
6
  @document = document
6
7
  @xml_node = xml_node
@@ -25,6 +26,7 @@ module Opener
25
26
  def paragraph
26
27
  return @paragraph ||= @xml_node.attr('para').to_i
27
28
  end
29
+
28
30
  end
29
31
  end
30
32
  end
@@ -1,6 +1,6 @@
1
1
  module Opener
2
2
  module Stanza
3
- class TokenizerPos
3
+ class Processor
4
4
 
5
5
  DESC = 'Tokenizer / POS by Stanza'
6
6
  VERSION = '1.0'
@@ -8,8 +8,10 @@ module Opener
8
8
  BASE_URL = ENV['STANZA_SERVER']
9
9
  LANGUAGES_CACHE = Opener::ChainedDaemon::LanguagesCache.new
10
10
 
11
- RTL_LANGUAGES = [ "ar", "ara", "arc", "ae", "ave", "egy", "he", "heb", "nqo", "pal", "phn", "sam",
12
- "syc", "syr", "fa", "per", "fas", "ku", "kur", "ur", "urd" ]
11
+ RTL_LANGUAGES = %w[
12
+ ar ara arc ae ave egy he heb nqo pal phn sam
13
+ syc syr fa per fas ku kur ur urd
14
+ ]
13
15
 
14
16
  POS = {
15
17
  'DET' => 'D',
@@ -37,63 +39,65 @@ module Opener
37
39
  raise 'missing Stanza server' if ENV['STANZA_SERVER'].blank?
38
40
 
39
41
  kaf = KAF::Document.from_xml input
40
-
41
- prod = params.cache_keys.environment == 'production'
42
- if prod and !LANGUAGES_CACHE.get.include?(kaf.language)
42
+ lang = LANGUAGES_CACHE.get[kaf.language]
43
+ env = params.cache_keys.environment
44
+ unless lang&.environments&.include? env or (env == 'staging' and lang&.environments&.include? 'production')
43
45
  raise Core::UnsupportedLanguageError.new kaf.language
44
46
  end
45
47
 
46
- input = kaf.raw
47
- input = input.gsub(/\,[^\ ]/, ', ')
48
- response = Faraday.post BASE_URL, {lang: kaf.language, input: input}.to_query
48
+ input = kaf.raw
49
+ input = input.gsub(/\,[^\ ]/, ', ')
50
+ response = Faraday.post BASE_URL, {lang: kaf.language, input: input}.to_query
49
51
  raise Core::UnsupportedLanguageError, kaf.language if response.status == 406
50
52
  raise response.body if response.status >= 400
51
- tokens = JSON.parse response.body
53
+ sentences = JSON.parse response.body
54
+ sentences.each{ |s| s.map!{ |t| Hashie::Mash.new t } }
52
55
 
53
56
  w_index = 0
54
57
 
55
58
  miscs = {}
56
- tokens.each_with_index do |t, i|
59
+ sentences.each.with_index do |s, i|
57
60
  miscs[i] = {}
58
- t.each do |word|
59
- word['id'].is_a?(Array) && word['id'].each { |id| miscs[i][id] = word['misc'] }
61
+ s.each do |word|
62
+ word.id.is_a?(Array) && word.id.each{ |id| miscs[i][id] = word.misc }
60
63
  end
61
64
  end
62
65
 
63
- tokens.map{ |t| t.reverse! } if RTL_LANGUAGES.include? kaf.language
64
- tokens.each_with_index do |sentence, s_index|
65
- sentence.each_with_index do |word|
66
+ sentences.map{ |s| s.reverse! } if RTL_LANGUAGES.include? kaf.language
67
+ sentences.each.with_index do |s, s_index|
68
+ s.each do |word|
66
69
  w_index += 1
67
70
  # save misc for later usage in an MWT case
68
- next if word['id'].is_a? Array
71
+ next if word.id.is_a? Array
69
72
 
70
- misc = word['misc'] || miscs[s_index][word['id']]
73
+ misc = word.misc || miscs[s_index][word.id]
71
74
 
72
- Rollbar.scoped({ input: input, params: params, tokens: tokens, word: word }) do
75
+ Rollbar.scoped({ input: input, params: params, sentences: sentences, word: word }) do
73
76
  raise 'Missing misc'
74
77
  end if misc.nil?
75
78
 
76
79
  offset = misc.match(/start_char=(\d+)|/)[1].to_i
77
80
  length = misc.match(/end_char=(\d+)/)[1].to_i - offset
78
81
 
79
- u_pos = word['upos']
82
+ u_pos = word.upos
80
83
  pos = POS[u_pos]
81
84
  raise "Didn't find a map for #{u_pos}" if pos.nil?
82
85
  type = if POS_OPEN.include? pos then 'open' else 'close' end
83
86
 
84
- params = {
87
+ params = Hashie::Mash.new(
85
88
  wid: w_index,
86
89
  sid: s_index + 1,
87
90
  tid: w_index,
88
91
  para: 1,
89
92
  offset: offset,
90
93
  length: length,
91
- text: word['text'],
92
- lemma: word['lemma'],
94
+ text: word.text,
95
+ lemma: word.lemma,
93
96
  morphofeat: u_pos,
94
97
  pos: pos,
95
98
  type: type,
96
- }
99
+ head: word.head,
100
+ )
97
101
 
98
102
  kaf.add_word_form params
99
103
  kaf.add_term params
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-chained-daemon
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.2
4
+ version: 3.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-02-04 00:00:00.000000000 Z
11
+ date: 2021-02-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -307,10 +307,10 @@ dependencies:
307
307
  description: OpeNER daemon for processing multiple queues at once
308
308
  email:
309
309
  executables:
310
- - chained-daemon
310
+ - chained-daemon-daemon
311
311
  - chained-daemon-csv
312
312
  - console
313
- - chained-daemon-daemon
313
+ - chained-daemon
314
314
  extensions: []
315
315
  extra_rdoc_files: []
316
316
  files:
@@ -330,7 +330,7 @@ files:
330
330
  - lib/opener/kaf/document.rb
331
331
  - lib/opener/kaf/term.rb
332
332
  - lib/opener/kaf/text.rb
333
- - lib/opener/stanza/tokenizer_pos.rb
333
+ - lib/opener/stanza/processor.rb
334
334
  - lib/opener/sym_mash.rb
335
335
  - opener-chained-daemon.gemspec
336
336
  homepage:
@@ -353,7 +353,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
353
353
  version: '0'
354
354
  requirements: []
355
355
  rubyforge_project:
356
- rubygems_version: 2.7.8
356
+ rubygems_version: 2.7.6.2
357
357
  signing_key:
358
358
  specification_version: 4
359
359
  summary: OpeNER daemon for processing multiple queues at once