gn_list_resolver 3.2.0.4 → 3.3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f990bf592ce702c16e20f45e3c881e7f4f4df01c
4
- data.tar.gz: e8b55f1f5e6b27e4b8eddb8e21c65ce900ede729
3
+ metadata.gz: cdcc523cf5153459744453bc7ac1a6ccd0dcc362
4
+ data.tar.gz: a5886cca5aef50a2d17a851152927f1653d850c8
5
5
  SHA512:
6
- metadata.gz: c5586d503371d99d8e26cf26057e95805a5fc22ca6f2073eb4baabe31891a68ccd20b64958f1d8e6947a2ea6469c5e03981e3b514cc1696323809c03892c1f27
7
- data.tar.gz: 44a2e55c363de3858449f0ea954e21ea4cdaf27d30c32a4d12c7d3064c9c69cb98b65fffcdab1261a86d25a01c02cffbc970a11e1909be20120c33749da66470
6
+ metadata.gz: c8f44334200d52e5d407793206a409727ad61c1ae045c2510ef4d8ca6cfd39163dda3ad38017041da5f4e7a9ef4ebd0eb1bb65d8ce8c3eb4c618e98e20037321
7
+ data.tar.gz: 62034abff0ab1a6a85499586ab259cf15138d49e0c0cb578a0e500e3354c38ef09253e579763698f9fa8ab849c09e11fe25411301d7aee36d0a1e551c615f714
@@ -17,6 +17,16 @@
17
17
  @dimus, @alexander-myltsev - sync with `gn_crossmap`'s 3.2.0, current name is
18
18
  still a bit broken
19
19
 
20
+ ## 3.3.1
21
+
22
+ * @dimus - Better error message in logger
23
+
24
+ ## 3.3.0
25
+
26
+ * @dimus - Add option `opts.threads` for threads number for concurrent jobs.
27
+ Resolution now happens concurrently if the number of jobs is more
28
+ than 1. Max amount of jobs is 10.
29
+
20
30
  ## 3.2.0
21
31
 
22
32
  * @dimus - Add a column `matchSize` with number of matches for each name
@@ -8,6 +8,7 @@ puts "This program requires Ruby >= v. 2.4.1" if RUBY_VERSION < "2.4.1"
8
8
 
9
9
  CATALOGUE_OF_LIFE = 1
10
10
  OUTPUT = "output.csv"
11
+ THREADS = 1
11
12
  opts = Trollop.options do
12
13
  banner "Compares a list of scientific names to scientific names from a " \
13
14
  "data source from Global Names Resolver\n\n " \
@@ -16,6 +17,7 @@ opts = Trollop.options do
16
17
 
17
18
  opt(:input, "Path to input file", type: :string)
18
19
  opt(:output, "Path to output file", default: OUTPUT)
20
+ opt(:threads, "Threads number for resolution", default: THREADS)
19
21
  opt(:data_source_id, "Data source id from GN Resolver",
20
22
  default: CATALOGUE_OF_LIFE)
21
23
  opt(:skip_original, "If given, only 'taxonID' is shown " \
@@ -32,6 +32,7 @@ Gem::Specification.new do |gem|
32
32
  gem.require_paths = ["lib"]
33
33
 
34
34
  gem.add_dependency "biodiversity", "~> 3.1"
35
+ gem.add_dependency "concurrent-ruby", "~> 1.0"
35
36
  gem.add_dependency "gn_uuid", "~> 0.5"
36
37
  gem.add_dependency "graphql-client", "~> 0.11.3"
37
38
  gem.add_dependency "logger-colors", "~> 1.0"
@@ -39,7 +40,7 @@ Gem::Specification.new do |gem|
39
40
  gem.add_dependency "trollop", "~> 2.1"
40
41
 
41
42
  gem.add_development_dependency "bundler", "~> 1.7"
42
- gem.add_development_dependency "byebug", "~> 9.0"
43
+ gem.add_development_dependency "byebug", "~> 9.1"
43
44
  gem.add_development_dependency "coveralls", "~> 0.8"
44
45
  gem.add_development_dependency "rake", "~> 12.0"
45
46
  gem.add_development_dependency "rspec", "~> 3.2"
@@ -8,6 +8,7 @@ require "logger"
8
8
  require "logger/colors"
9
9
  require "pp"
10
10
  require "biodiversity"
11
+ require "concurrent"
11
12
  require "gn_uuid"
12
13
  require "graphql/client"
13
14
  require "graphql/client/http"
@@ -20,6 +21,7 @@ require "gn_list_resolver/writer"
20
21
  require "gn_list_resolver/collector"
21
22
  require "gn_list_resolver/column_collector"
22
23
  require "gn_list_resolver/sci_name_collector"
24
+ require "gn_list_resolver/resolver_job"
23
25
  require "gn_list_resolver/resolver"
24
26
  require "gn_list_resolver/result_processor"
25
27
  require "gn_list_resolver/stats"
@@ -37,7 +39,7 @@ module GnListResolver
37
39
  reader = create_reader(input_io, opts)
38
40
  data = block_given? ? reader.read(&Proc.new) : reader.read
39
41
  writer = create_writer(reader, output_io, opts)
40
- resolver = create_resolver(writer, opts)
42
+ resolver = Resolver.new(writer, opts)
41
43
  block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
42
44
  logger.warn(resolver.stats.stats.pretty_inspect) if opts[:debug]
43
45
  resolver.stats
@@ -61,13 +63,18 @@ module GnListResolver
61
63
  end
62
64
  end
63
65
 
64
- private
65
-
66
- def create_resolver(writer, opts)
67
- Resolver.new(writer, opts.data_source_id,
68
- opts.stats, opts.with_classification)
66
+ def opts_struct(opts)
67
+ threads = opts[:threads].to_i
68
+ opts[:threads] = threads.between?(1, 10) ? threads : 2
69
+ with_classification = opts[:with_classification] ? true : false
70
+ opts[:with_classification] = with_classification
71
+ data_source_id = opts[:data_source_id].to_i
72
+ opts[:data_source_id] = data_source_id.zero? ? 1 : data_source_id
73
+ OpenStruct.new({ stats: Stats.new, alt_headers: [] }.merge(opts))
69
74
  end
70
75
 
76
+ private
77
+
71
78
  def create_writer(reader, output_io, opts)
72
79
  Writer.new(output_io, reader.original_fields,
73
80
  output_name(opts.output), opts.with_classification)
@@ -78,10 +85,6 @@ module GnListResolver
78
85
  opts.skip_original, opts.alt_headers, opts.stats)
79
86
  end
80
87
 
81
- def opts_struct(opts)
82
- OpenStruct.new({ stats: Stats.new, alt_headers: [] }.merge(opts))
83
- end
84
-
85
88
  def io(input, output)
86
89
  io_in = iogen(input, INPUT_MODE)
87
90
  io_out = iogen(output, OUTPUT_MODE)
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # rubocop:disable Metrics/ClassLength
4
+
3
5
  module GnListResolver
4
6
  # Sends data to GN Resolver and collects results
5
7
  class Resolver
@@ -7,111 +9,122 @@ module GnListResolver
7
9
  QUERY = GRAPHQL.client.parse(GRAPHQL.query)
8
10
  attr_reader :stats
9
11
 
10
- def initialize(writer, data_source_id, stats, with_classification = false)
11
- @stats = stats
12
- @processor = GnListResolver::ResultProcessor.new(writer, @stats,
13
- with_classification)
14
- @ds_id = data_source_id
12
+ def initialize(writer, opts)
13
+ instance_vars_from_opts(opts)
14
+ @processor = GnListResolver::ResultProcessor.
15
+ new(writer, @stats, @with_classification)
15
16
  @count = 0
16
- @current_data = {}
17
+ @jobs = []
17
18
  @batch = 1000
18
19
  end
19
20
 
20
21
  def resolve(data)
21
- update_stats(data.size)
22
- block_given? ? process(data, &Proc.new) : process(data)
22
+ resolution_stats(data.size)
23
+ @threads.times do
24
+ batch = data.shift(@batch)
25
+ add_job(batch)
26
+ end
27
+ block_given? ? traverse_jobs(data, &Proc.new) : traverse_jobs(data)
23
28
  wrap_up
24
29
  block_given? ? yield(@stats.stats) : @stats.stats
25
30
  end
26
31
 
27
32
  private
28
33
 
29
- def process(data)
30
- cmd = nil
31
- data.each_slice(@batch) do |slice|
32
- with_log do
33
- collect_names(slice)
34
- remote_resolve(slice)
35
- cmd = yield(@stats.stats) if block_given?
36
- end
37
- break if cmd == "STOP"
38
- end
39
- end
40
-
41
34
  def wrap_up
42
35
  @stats.stats[:resolution_stop] = Time.now
43
36
  @stats.stats[:status] = :finish
44
37
  @processor.writer.close
45
38
  end
46
39
 
47
- def update_stats(records_num)
48
- @stats.stats[:total_records] = records_num
49
- @stats.stats[:resolution_start] = Time.now
50
- @stats.stats[:status] = :resolution
40
+ def add_job(batch)
41
+ job = batch.empty? ? nil : create_job(batch)
42
+ @jobs << job
51
43
  end
52
44
 
53
- def with_log
54
- s = @count + 1
55
- @count += @batch
56
- e = [@count, @stats.stats[:total_records]].min
57
- GnListResolver.log("Resolve #{s}-#{e} out of " \
58
- "#{@stats.stats[:total_records]} records at " \
59
- "#{RESOLVER_URL}")
60
- yield
45
+ def traverse_jobs(data)
46
+ until data.empty? && @jobs.compact.empty?
47
+ process_results(data)
48
+ cmd = yield(@stats.stats) if block_given?
49
+ break if cmd == "STOP"
50
+ sleep(0.5)
51
+ end
61
52
  end
62
53
 
63
- def collect_names(slice)
64
- @current_data = {}
65
- slice.each_with_object([]) do |row, str|
66
- id = row[:id].strip
67
- @current_data[id] = row[:original]
68
- @processor.input[id] = { rank: row[:rank] }
69
- str << "#{id}|#{row[:name]}"
70
- end.join("\n")
54
+ def resolution_stats(records_num)
55
+ @stats.stats[:total_records] = records_num
56
+ @stats.stats[:resolution_start] = Time.now
57
+ @stats.stats[:status] = :resolution
71
58
  end
72
59
 
73
- def variables(names)
74
- { dataSourceIds: [@ds_id],
75
- names: names.
76
- map { |name| { value: name[:name], suppliedId: name[:id] } } }
60
+ def process_results(data)
61
+ indices = []
62
+ @jobs.each_with_index do |job, i|
63
+ next if job.nil? || !job.complete?
64
+ with_log do
65
+ process_job(job)
66
+ indices << i
67
+ end
68
+ end
69
+ add_jobs(indices, data) unless indices.empty?
77
70
  end
78
71
 
79
- def remote_resolve(names)
80
- batch_start = Time.now
81
-
82
- res = GRAPHQL.client.query(QUERY, variables: variables(names))
83
- if res.data
84
- @processor.process(res.data.name_resolver.responses, @current_data)
85
- else
86
- single_remote_resolve(names)
72
+ def add_jobs(indices, data)
73
+ indices.each do |i|
74
+ batch = data.shift(@batch)
75
+ @jobs[i] = batch.empty? ? nil : create_job(batch)
87
76
  end
88
- update_batch_times(batch_start)
89
77
  end
90
78
 
91
- def single_remote_resolve(names)
92
- names.each do |name|
93
- res = GRAPHQL.client.query(QUERY, variables: variables([name]))
94
- if res.data
95
- @processor.process(res.data.name_resolver, @current_data)
96
- else
97
- process_resolver_error(res, name)
98
- end
79
+ def process_job(job)
80
+ if job.fulfilled?
81
+ results, current_data, stats = job.value
82
+ update_stats(stats)
83
+ @processor.process(results, current_data)
84
+ else
85
+ GnResolver.logger.error(job.reason.message)
99
86
  end
100
87
  end
101
88
 
102
- def update_batch_times(batch_start)
89
+ def update_stats(job_stats)
103
90
  s = @stats.stats
104
91
  s[:last_batches_time].shift if s[:last_batches_time].size > 2
105
- s[:last_batches_time] << Time.now - batch_start
92
+ s[:last_batches_time] << job_stats.stats[:last_batches_time][0]
106
93
  s[:resolution_span] = Time.now - s[:resolution_start]
107
94
  end
108
95
 
109
- def process_resolver_error(res, name)
110
- @stats.stats[:matches][:ErrorInMatch] += 1
111
- @stats.stats[:resolved_records] += 1
112
- error =
113
- "Resolver broke on '#{name}': #{res.errors.messages['data'].first}"
114
- GnListResolver.logger.error(error)
96
+ def create_job(batch)
97
+ batch_data = collect_names(batch)
98
+ rb = ResolverJob.new(batch, batch_data, @ds_id)
99
+ Concurrent::Future.execute { rb.run }
100
+ end
101
+
102
+ def instance_vars_from_opts(opts)
103
+ @stats = opts.stats
104
+ @with_classification = opts.with_classification.freeze
105
+ @ds_id = opts.data_source_id.freeze
106
+ @threads = opts.threads
107
+ end
108
+
109
+ def collect_names(batch)
110
+ batch_data = {}
111
+ batch.each do |row|
112
+ id = row[:id].strip
113
+ batch_data[id] = row[:original]
114
+ @processor.input[id] = { rank: row[:rank] }
115
+ end
116
+ batch_data
117
+ end
118
+
119
+ def with_log
120
+ s = @count + 1
121
+ @count += @batch
122
+ e = [@count, @stats.stats[:total_records]].min
123
+ GnListResolver.log("Resolve #{s}-#{e} out of " \
124
+ "#{@stats.stats[:total_records]} records at " \
125
+ "#{RESOLVER_URL}")
126
+ yield
115
127
  end
116
128
  end
117
129
  end
130
+ # rubocop:enable all
@@ -0,0 +1,134 @@
1
+ # frozen_string_literal: true
2
+
3
+ # rubocop:disable Metrics/ClassLength
4
+
5
+ module GnListResolver
6
+ # Sends data to GN Resolver and collects results
7
+ class Resolver
8
+ GRAPHQL = GnGraphQL.new
9
+ QUERY = GRAPHQL.client.parse(GRAPHQL.query)
10
+ attr_reader :stats
11
+
12
+ def initialize(writer, opts)
13
+ instance_vars_from_opts(opts)
14
+ @processor = GnListResolver::ResultProcessor.
15
+ new(writer, @stats, @with_classification)
16
+ @count = 0
17
+ @jobs = []
18
+ @batch = 1000
19
+ end
20
+
21
+ def resolve(data)
22
+ resolution_stats(data.size)
23
+ @threads.times do
24
+ batch = data.shift(@batch)
25
+ add_job(batch)
26
+ end
27
+ block_given? ? traverse_jobs(data, &Proc.new) : traverse_jobs(data)
28
+ wrap_up
29
+ block_given? ? yield(@stats.stats) : @stats.stats
30
+ end
31
+
32
+ private
33
+
34
+ def wrap_up
35
+ @stats.stats[:resolution_stop] = Time.now
36
+ @stats.stats[:status] = :finish
37
+ @processor.writer.close
38
+ end
39
+
40
+ def add_job(batch)
41
+ job = batch.empty? ? nil : create_job(batch)
42
+ @jobs << job
43
+ end
44
+
45
+ def traverse_jobs(data)
46
+ until data.empty? && @jobs.compact.empty?
47
+ process_results(data)
48
+ cmd = yield(@stats.stats) if block_given?
49
+ break if cmd == "STOP"
50
+ sleep(0.5)
51
+ end
52
+ end
53
+
54
+ def resolution_stats(records_num)
55
+ @stats.stats[:total_records] = records_num
56
+ @stats.stats[:resolution_start] = Time.now
57
+ @stats.stats[:status] = :resolution
58
+ end
59
+
60
+ def process_results(data)
61
+ indices = []
62
+ @jobs.each_with_index do |job, i|
63
+ next if job.nil? || !job.complete?
64
+ with_log do
65
+ process_job(job)
66
+ indices << i
67
+ end
68
+ end
69
+ add_jobs(indices, data) unless indices.empty?
70
+ end
71
+
72
+ def add_jobs(indices, data)
73
+ indices.each do |i|
74
+ batch = data.shift(@batch)
75
+ @jobs[i] = batch.empty? ? nil : create_job(batch)
76
+ end
77
+ end
78
+
79
+ def process_job(job)
80
+ if job.fulfilled?
81
+ results, current_data, stats = job.value
82
+ update_stats(stats)
83
+ @processor.process(results, current_data)
84
+ else
85
+ <<<<<<< HEAD:lib/gn_list_resolver/resolver.rb
86
+ GnListResolver.logger.error("Remote resolver server failed")
87
+ =======
88
+ GnCrossmap.logger.error(job.reason.message)
89
+ >>>>>>> 36115cc... better error log:lib/gn_crossmap/resolver.rb
90
+ end
91
+ end
92
+
93
+ def update_stats(job_stats)
94
+ s = @stats.stats
95
+ s[:last_batches_time].shift if s[:last_batches_time].size > 2
96
+ s[:last_batches_time] << job_stats.stats[:last_batches_time][0]
97
+ s[:resolution_span] = Time.now - s[:resolution_start]
98
+ end
99
+
100
+ def create_job(batch)
101
+ batch_data = collect_names(batch)
102
+ rb = ResolverJob.new(batch, batch_data, @ds_id)
103
+ Concurrent::Future.execute { rb.run }
104
+ end
105
+
106
+ def instance_vars_from_opts(opts)
107
+ @stats = opts.stats
108
+ @with_classification = opts.with_classification.freeze
109
+ @ds_id = opts.data_source_id.freeze
110
+ @threads = opts.threads
111
+ end
112
+
113
+ def collect_names(batch)
114
+ batch_data = {}
115
+ batch.each do |row|
116
+ id = row[:id].strip
117
+ batch_data[id] = row[:original]
118
+ @processor.input[id] = { rank: row[:rank] }
119
+ end
120
+ batch_data
121
+ end
122
+
123
+ def with_log
124
+ s = @count + 1
125
+ @count += @batch
126
+ e = [@count, @stats.stats[:total_records]].min
127
+ GnListResolver.log("Resolve #{s}-#{e} out of " \
128
+ "#{@stats.stats[:total_records]} records at " \
129
+ "#{RESOLVER_URL}")
130
+ yield
131
+ end
132
+ end
133
+ end
134
+ # rubocop:enable all
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module GnListResolver
4
+ # Remote resolution for parallel jobs
5
+ class ResolverJob
6
+ GRAPHQL = GnGraphQL.new
7
+ QUERY = GRAPHQL.client.parse(GRAPHQL.query)
8
+ def initialize(names, batch_data, data_source_id)
9
+ @names = names
10
+ @batch_data = batch_data
11
+ @data_source_id = data_source_id
12
+ @stats = Stats.new
13
+ end
14
+
15
+ def run
16
+ res = remote_resolve(@names)
17
+ [res, @batch_data, @stats]
18
+ end
19
+
20
+ private
21
+
22
+ def variables(names)
23
+ { dataSourceIds: [@data_source_id],
24
+ names: names.
25
+ map { |name| { value: name[:name], suppliedId: name[:id] } } }
26
+ end
27
+
28
+ def remote_resolve(names)
29
+ batch_start = Time.now
30
+ res = GRAPHQL.client.query(QUERY, variables: variables(names))
31
+ stats_add_batch_time(batch_start)
32
+ res.data.name_resolver.responses
33
+ end
34
+
35
+ def stats_add_batch_time(batch_start)
36
+ @stats.stats[:last_batches_time] << Time.now - batch_start
37
+ end
38
+ end
39
+ end
@@ -2,7 +2,7 @@
2
2
 
3
3
  # Namespace module for crossmapping checklists to GN sources
4
4
  module GnListResolver
5
- VERSION = "3.2.0.4"
5
+ VERSION = "3.3.1.0"
6
6
 
7
7
  def self.version
8
8
  VERSION
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Namespace module for crossmapping checklists to GN sources
4
+ <<<<<<< HEAD:lib/gn_list_resolver/version.rb
5
+ module GnListResolver
6
+ VERSION = "3.3.1.0"
7
+ =======
8
+ module GnCrossmap
9
+ VERSION = "3.3.1"
10
+ >>>>>>> 36115cc... better error log:lib/gn_crossmap/version.rb
11
+
12
+ def self.version
13
+ VERSION
14
+ end
15
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gn_list_resolver
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.0.4
4
+ version: 3.3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2017-09-07 00:00:00.000000000 Z
12
+ date: 2017-09-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: biodiversity
@@ -25,6 +25,20 @@ dependencies:
25
25
  - - "~>"
26
26
  - !ruby/object:Gem::Version
27
27
  version: '3.1'
28
+ - !ruby/object:Gem::Dependency
29
+ name: concurrent-ruby
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '1.0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '1.0'
28
42
  - !ruby/object:Gem::Dependency
29
43
  name: gn_uuid
30
44
  requirement: !ruby/object:Gem::Requirement
@@ -115,14 +129,14 @@ dependencies:
115
129
  requirements:
116
130
  - - "~>"
117
131
  - !ruby/object:Gem::Version
118
- version: '9.0'
132
+ version: '9.1'
119
133
  type: :development
120
134
  prerelease: false
121
135
  version_requirements: !ruby/object:Gem::Requirement
122
136
  requirements:
123
137
  - - "~>"
124
138
  - !ruby/object:Gem::Version
125
- version: '9.0'
139
+ version: '9.1'
126
140
  - !ruby/object:Gem::Dependency
127
141
  name: coveralls
128
142
  requirement: !ruby/object:Gem::Requirement
@@ -212,10 +226,13 @@ files:
212
226
  - lib/gn_list_resolver/graphql.rb
213
227
  - lib/gn_list_resolver/reader.rb
214
228
  - lib/gn_list_resolver/resolver.rb
229
+ - lib/gn_list_resolver/resolver.rb.orig
230
+ - lib/gn_list_resolver/resolver_job.rb
215
231
  - lib/gn_list_resolver/result_processor.rb
216
232
  - lib/gn_list_resolver/sci_name_collector.rb
217
233
  - lib/gn_list_resolver/stats.rb
218
234
  - lib/gn_list_resolver/version.rb
235
+ - lib/gn_list_resolver/version.rb.orig
219
236
  - lib/gn_list_resolver/writer.rb
220
237
  homepage: https://github.com/GlobalNamesArchitecture/gn_list_resolver
221
238
  licenses: