gn_list_resolver 3.2.0.4 → 3.3.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f990bf592ce702c16e20f45e3c881e7f4f4df01c
4
- data.tar.gz: e8b55f1f5e6b27e4b8eddb8e21c65ce900ede729
3
+ metadata.gz: cdcc523cf5153459744453bc7ac1a6ccd0dcc362
4
+ data.tar.gz: a5886cca5aef50a2d17a851152927f1653d850c8
5
5
  SHA512:
6
- metadata.gz: c5586d503371d99d8e26cf26057e95805a5fc22ca6f2073eb4baabe31891a68ccd20b64958f1d8e6947a2ea6469c5e03981e3b514cc1696323809c03892c1f27
7
- data.tar.gz: 44a2e55c363de3858449f0ea954e21ea4cdaf27d30c32a4d12c7d3064c9c69cb98b65fffcdab1261a86d25a01c02cffbc970a11e1909be20120c33749da66470
6
+ metadata.gz: c8f44334200d52e5d407793206a409727ad61c1ae045c2510ef4d8ca6cfd39163dda3ad38017041da5f4e7a9ef4ebd0eb1bb65d8ce8c3eb4c618e98e20037321
7
+ data.tar.gz: 62034abff0ab1a6a85499586ab259cf15138d49e0c0cb578a0e500e3354c38ef09253e579763698f9fa8ab849c09e11fe25411301d7aee36d0a1e551c615f714
@@ -17,6 +17,16 @@
17
17
  @dimus, @alexander-myltsev - sync with `gn_crossmap`'s 3.2.0, current name is
18
18
  still a bit broken
19
19
 
20
+ ## 3.3.1
21
+
22
+ * @dimus - Better error message in logger
23
+
24
+ ## 3.3.0
25
+
26
+ * @dimus - Add option `opts.threads` to set the number of threads used for concurrent jobs.
27
+ Resolution now happens concurrently if the number of jobs is more
28
+ than 1. The maximum number of concurrent jobs is 10.
29
+
20
30
  ## 3.2.0
21
31
 
22
32
  * @dimus - Add a column `matchSize` with number of matches for each name
@@ -8,6 +8,7 @@ puts "This program requires Ruby >= v. 2.4.1" if RUBY_VERSION < "2.4.1"
8
8
 
9
9
  CATALOGUE_OF_LIFE = 1
10
10
  OUTPUT = "output.csv"
11
+ THREADS = 1
11
12
  opts = Trollop.options do
12
13
  banner "Compares a list of scientific names to scientific names from a " \
13
14
  "data source from Global Names Resolver\n\n " \
@@ -16,6 +17,7 @@ opts = Trollop.options do
16
17
 
17
18
  opt(:input, "Path to input file", type: :string)
18
19
  opt(:output, "Path to output file", default: OUTPUT)
20
+ opt(:threads, "Threads number for resolution", default: THREADS)
19
21
  opt(:data_source_id, "Data source id from GN Resolver",
20
22
  default: CATALOGUE_OF_LIFE)
21
23
  opt(:skip_original, "If given, only 'taxonID' is shown " \
@@ -32,6 +32,7 @@ Gem::Specification.new do |gem|
32
32
  gem.require_paths = ["lib"]
33
33
 
34
34
  gem.add_dependency "biodiversity", "~> 3.1"
35
+ gem.add_dependency "concurrent-ruby", "~> 1.0"
35
36
  gem.add_dependency "gn_uuid", "~> 0.5"
36
37
  gem.add_dependency "graphql-client", "~> 0.11.3"
37
38
  gem.add_dependency "logger-colors", "~> 1.0"
@@ -39,7 +40,7 @@ Gem::Specification.new do |gem|
39
40
  gem.add_dependency "trollop", "~> 2.1"
40
41
 
41
42
  gem.add_development_dependency "bundler", "~> 1.7"
42
- gem.add_development_dependency "byebug", "~> 9.0"
43
+ gem.add_development_dependency "byebug", "~> 9.1"
43
44
  gem.add_development_dependency "coveralls", "~> 0.8"
44
45
  gem.add_development_dependency "rake", "~> 12.0"
45
46
  gem.add_development_dependency "rspec", "~> 3.2"
@@ -8,6 +8,7 @@ require "logger"
8
8
  require "logger/colors"
9
9
  require "pp"
10
10
  require "biodiversity"
11
+ require "concurrent"
11
12
  require "gn_uuid"
12
13
  require "graphql/client"
13
14
  require "graphql/client/http"
@@ -20,6 +21,7 @@ require "gn_list_resolver/writer"
20
21
  require "gn_list_resolver/collector"
21
22
  require "gn_list_resolver/column_collector"
22
23
  require "gn_list_resolver/sci_name_collector"
24
+ require "gn_list_resolver/resolver_job"
23
25
  require "gn_list_resolver/resolver"
24
26
  require "gn_list_resolver/result_processor"
25
27
  require "gn_list_resolver/stats"
@@ -37,7 +39,7 @@ module GnListResolver
37
39
  reader = create_reader(input_io, opts)
38
40
  data = block_given? ? reader.read(&Proc.new) : reader.read
39
41
  writer = create_writer(reader, output_io, opts)
40
- resolver = create_resolver(writer, opts)
42
+ resolver = Resolver.new(writer, opts)
41
43
  block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
42
44
  logger.warn(resolver.stats.stats.pretty_inspect) if opts[:debug]
43
45
  resolver.stats
@@ -61,13 +63,18 @@ module GnListResolver
61
63
  end
62
64
  end
63
65
 
64
- private
65
-
66
- def create_resolver(writer, opts)
67
- Resolver.new(writer, opts.data_source_id,
68
- opts.stats, opts.with_classification)
66
+ def opts_struct(opts)
67
+ threads = opts[:threads].to_i
68
+ opts[:threads] = threads.between?(1, 10) ? threads : 2
69
+ with_classification = opts[:with_classification] ? true : false
70
+ opts[:with_classification] = with_classification
71
+ data_source_id = opts[:data_source_id].to_i
72
+ opts[:data_source_id] = data_source_id.zero? ? 1 : data_source_id
73
+ OpenStruct.new({ stats: Stats.new, alt_headers: [] }.merge(opts))
69
74
  end
70
75
 
76
+ private
77
+
71
78
  def create_writer(reader, output_io, opts)
72
79
  Writer.new(output_io, reader.original_fields,
73
80
  output_name(opts.output), opts.with_classification)
@@ -78,10 +85,6 @@ module GnListResolver
78
85
  opts.skip_original, opts.alt_headers, opts.stats)
79
86
  end
80
87
 
81
- def opts_struct(opts)
82
- OpenStruct.new({ stats: Stats.new, alt_headers: [] }.merge(opts))
83
- end
84
-
85
88
  def io(input, output)
86
89
  io_in = iogen(input, INPUT_MODE)
87
90
  io_out = iogen(output, OUTPUT_MODE)
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # rubocop:disable Metrics/ClassLength
4
+
3
5
  module GnListResolver
4
6
  # Sends data to GN Resolver and collects results
5
7
  class Resolver
@@ -7,111 +9,122 @@ module GnListResolver
7
9
  QUERY = GRAPHQL.client.parse(GRAPHQL.query)
8
10
  attr_reader :stats
9
11
 
10
- def initialize(writer, data_source_id, stats, with_classification = false)
11
- @stats = stats
12
- @processor = GnListResolver::ResultProcessor.new(writer, @stats,
13
- with_classification)
14
- @ds_id = data_source_id
12
+ def initialize(writer, opts)
13
+ instance_vars_from_opts(opts)
14
+ @processor = GnListResolver::ResultProcessor.
15
+ new(writer, @stats, @with_classification)
15
16
  @count = 0
16
- @current_data = {}
17
+ @jobs = []
17
18
  @batch = 1000
18
19
  end
19
20
 
20
21
  def resolve(data)
21
- update_stats(data.size)
22
- block_given? ? process(data, &Proc.new) : process(data)
22
+ resolution_stats(data.size)
23
+ @threads.times do
24
+ batch = data.shift(@batch)
25
+ add_job(batch)
26
+ end
27
+ block_given? ? traverse_jobs(data, &Proc.new) : traverse_jobs(data)
23
28
  wrap_up
24
29
  block_given? ? yield(@stats.stats) : @stats.stats
25
30
  end
26
31
 
27
32
  private
28
33
 
29
- def process(data)
30
- cmd = nil
31
- data.each_slice(@batch) do |slice|
32
- with_log do
33
- collect_names(slice)
34
- remote_resolve(slice)
35
- cmd = yield(@stats.stats) if block_given?
36
- end
37
- break if cmd == "STOP"
38
- end
39
- end
40
-
41
34
  def wrap_up
42
35
  @stats.stats[:resolution_stop] = Time.now
43
36
  @stats.stats[:status] = :finish
44
37
  @processor.writer.close
45
38
  end
46
39
 
47
- def update_stats(records_num)
48
- @stats.stats[:total_records] = records_num
49
- @stats.stats[:resolution_start] = Time.now
50
- @stats.stats[:status] = :resolution
40
+ def add_job(batch)
41
+ job = batch.empty? ? nil : create_job(batch)
42
+ @jobs << job
51
43
  end
52
44
 
53
- def with_log
54
- s = @count + 1
55
- @count += @batch
56
- e = [@count, @stats.stats[:total_records]].min
57
- GnListResolver.log("Resolve #{s}-#{e} out of " \
58
- "#{@stats.stats[:total_records]} records at " \
59
- "#{RESOLVER_URL}")
60
- yield
45
+ def traverse_jobs(data)
46
+ until data.empty? && @jobs.compact.empty?
47
+ process_results(data)
48
+ cmd = yield(@stats.stats) if block_given?
49
+ break if cmd == "STOP"
50
+ sleep(0.5)
51
+ end
61
52
  end
62
53
 
63
- def collect_names(slice)
64
- @current_data = {}
65
- slice.each_with_object([]) do |row, str|
66
- id = row[:id].strip
67
- @current_data[id] = row[:original]
68
- @processor.input[id] = { rank: row[:rank] }
69
- str << "#{id}|#{row[:name]}"
70
- end.join("\n")
54
+ def resolution_stats(records_num)
55
+ @stats.stats[:total_records] = records_num
56
+ @stats.stats[:resolution_start] = Time.now
57
+ @stats.stats[:status] = :resolution
71
58
  end
72
59
 
73
- def variables(names)
74
- { dataSourceIds: [@ds_id],
75
- names: names.
76
- map { |name| { value: name[:name], suppliedId: name[:id] } } }
60
+ def process_results(data)
61
+ indices = []
62
+ @jobs.each_with_index do |job, i|
63
+ next if job.nil? || !job.complete?
64
+ with_log do
65
+ process_job(job)
66
+ indices << i
67
+ end
68
+ end
69
+ add_jobs(indices, data) unless indices.empty?
77
70
  end
78
71
 
79
- def remote_resolve(names)
80
- batch_start = Time.now
81
-
82
- res = GRAPHQL.client.query(QUERY, variables: variables(names))
83
- if res.data
84
- @processor.process(res.data.name_resolver.responses, @current_data)
85
- else
86
- single_remote_resolve(names)
72
+ def add_jobs(indices, data)
73
+ indices.each do |i|
74
+ batch = data.shift(@batch)
75
+ @jobs[i] = batch.empty? ? nil : create_job(batch)
87
76
  end
88
- update_batch_times(batch_start)
89
77
  end
90
78
 
91
- def single_remote_resolve(names)
92
- names.each do |name|
93
- res = GRAPHQL.client.query(QUERY, variables: variables([name]))
94
- if res.data
95
- @processor.process(res.data.name_resolver, @current_data)
96
- else
97
- process_resolver_error(res, name)
98
- end
79
+ def process_job(job)
80
+ if job.fulfilled?
81
+ results, current_data, stats = job.value
82
+ update_stats(stats)
83
+ @processor.process(results, current_data)
84
+ else
85
+ GnResolver.logger.error(job.reason.message)
99
86
  end
100
87
  end
101
88
 
102
- def update_batch_times(batch_start)
89
+ def update_stats(job_stats)
103
90
  s = @stats.stats
104
91
  s[:last_batches_time].shift if s[:last_batches_time].size > 2
105
- s[:last_batches_time] << Time.now - batch_start
92
+ s[:last_batches_time] << job_stats.stats[:last_batches_time][0]
106
93
  s[:resolution_span] = Time.now - s[:resolution_start]
107
94
  end
108
95
 
109
- def process_resolver_error(res, name)
110
- @stats.stats[:matches][:ErrorInMatch] += 1
111
- @stats.stats[:resolved_records] += 1
112
- error =
113
- "Resolver broke on '#{name}': #{res.errors.messages['data'].first}"
114
- GnListResolver.logger.error(error)
96
+ def create_job(batch)
97
+ batch_data = collect_names(batch)
98
+ rb = ResolverJob.new(batch, batch_data, @ds_id)
99
+ Concurrent::Future.execute { rb.run }
100
+ end
101
+
102
+ def instance_vars_from_opts(opts)
103
+ @stats = opts.stats
104
+ @with_classification = opts.with_classification.freeze
105
+ @ds_id = opts.data_source_id.freeze
106
+ @threads = opts.threads
107
+ end
108
+
109
+ def collect_names(batch)
110
+ batch_data = {}
111
+ batch.each do |row|
112
+ id = row[:id].strip
113
+ batch_data[id] = row[:original]
114
+ @processor.input[id] = { rank: row[:rank] }
115
+ end
116
+ batch_data
117
+ end
118
+
119
+ def with_log
120
+ s = @count + 1
121
+ @count += @batch
122
+ e = [@count, @stats.stats[:total_records]].min
123
+ GnListResolver.log("Resolve #{s}-#{e} out of " \
124
+ "#{@stats.stats[:total_records]} records at " \
125
+ "#{RESOLVER_URL}")
126
+ yield
115
127
  end
116
128
  end
117
129
  end
130
+ # rubocop:enable all
@@ -0,0 +1,134 @@
1
+ # frozen_string_literal: true
2
+
3
+ # rubocop:disable Metrics/ClassLength
4
+
5
+ module GnListResolver
6
+ # Sends data to GN Resolver and collects results
7
+ class Resolver
8
+ GRAPHQL = GnGraphQL.new
9
+ QUERY = GRAPHQL.client.parse(GRAPHQL.query)
10
+ attr_reader :stats
11
+
12
+ def initialize(writer, opts)
13
+ instance_vars_from_opts(opts)
14
+ @processor = GnListResolver::ResultProcessor.
15
+ new(writer, @stats, @with_classification)
16
+ @count = 0
17
+ @jobs = []
18
+ @batch = 1000
19
+ end
20
+
21
+ def resolve(data)
22
+ resolution_stats(data.size)
23
+ @threads.times do
24
+ batch = data.shift(@batch)
25
+ add_job(batch)
26
+ end
27
+ block_given? ? traverse_jobs(data, &Proc.new) : traverse_jobs(data)
28
+ wrap_up
29
+ block_given? ? yield(@stats.stats) : @stats.stats
30
+ end
31
+
32
+ private
33
+
34
+ def wrap_up
35
+ @stats.stats[:resolution_stop] = Time.now
36
+ @stats.stats[:status] = :finish
37
+ @processor.writer.close
38
+ end
39
+
40
+ def add_job(batch)
41
+ job = batch.empty? ? nil : create_job(batch)
42
+ @jobs << job
43
+ end
44
+
45
+ def traverse_jobs(data)
46
+ until data.empty? && @jobs.compact.empty?
47
+ process_results(data)
48
+ cmd = yield(@stats.stats) if block_given?
49
+ break if cmd == "STOP"
50
+ sleep(0.5)
51
+ end
52
+ end
53
+
54
+ def resolution_stats(records_num)
55
+ @stats.stats[:total_records] = records_num
56
+ @stats.stats[:resolution_start] = Time.now
57
+ @stats.stats[:status] = :resolution
58
+ end
59
+
60
+ def process_results(data)
61
+ indices = []
62
+ @jobs.each_with_index do |job, i|
63
+ next if job.nil? || !job.complete?
64
+ with_log do
65
+ process_job(job)
66
+ indices << i
67
+ end
68
+ end
69
+ add_jobs(indices, data) unless indices.empty?
70
+ end
71
+
72
+ def add_jobs(indices, data)
73
+ indices.each do |i|
74
+ batch = data.shift(@batch)
75
+ @jobs[i] = batch.empty? ? nil : create_job(batch)
76
+ end
77
+ end
78
+
79
+ def process_job(job)
80
+ if job.fulfilled?
81
+ results, current_data, stats = job.value
82
+ update_stats(stats)
83
+ @processor.process(results, current_data)
84
+ else
85
+ <<<<<<< HEAD:lib/gn_list_resolver/resolver.rb
86
+ GnListResolver.logger.error("Remote resolver server failed")
87
+ =======
88
+ GnCrossmap.logger.error(job.reason.message)
89
+ >>>>>>> 36115cc... better error log:lib/gn_crossmap/resolver.rb
90
+ end
91
+ end
92
+
93
+ def update_stats(job_stats)
94
+ s = @stats.stats
95
+ s[:last_batches_time].shift if s[:last_batches_time].size > 2
96
+ s[:last_batches_time] << job_stats.stats[:last_batches_time][0]
97
+ s[:resolution_span] = Time.now - s[:resolution_start]
98
+ end
99
+
100
+ def create_job(batch)
101
+ batch_data = collect_names(batch)
102
+ rb = ResolverJob.new(batch, batch_data, @ds_id)
103
+ Concurrent::Future.execute { rb.run }
104
+ end
105
+
106
+ def instance_vars_from_opts(opts)
107
+ @stats = opts.stats
108
+ @with_classification = opts.with_classification.freeze
109
+ @ds_id = opts.data_source_id.freeze
110
+ @threads = opts.threads
111
+ end
112
+
113
+ def collect_names(batch)
114
+ batch_data = {}
115
+ batch.each do |row|
116
+ id = row[:id].strip
117
+ batch_data[id] = row[:original]
118
+ @processor.input[id] = { rank: row[:rank] }
119
+ end
120
+ batch_data
121
+ end
122
+
123
+ def with_log
124
+ s = @count + 1
125
+ @count += @batch
126
+ e = [@count, @stats.stats[:total_records]].min
127
+ GnListResolver.log("Resolve #{s}-#{e} out of " \
128
+ "#{@stats.stats[:total_records]} records at " \
129
+ "#{RESOLVER_URL}")
130
+ yield
131
+ end
132
+ end
133
+ end
134
+ # rubocop:enable all
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module GnListResolver
4
+ # Remote resolution for parallel jobs
5
+ class ResolverJob
6
+ GRAPHQL = GnGraphQL.new
7
+ QUERY = GRAPHQL.client.parse(GRAPHQL.query)
8
+ def initialize(names, batch_data, data_source_id)
9
+ @names = names
10
+ @batch_data = batch_data
11
+ @data_source_id = data_source_id
12
+ @stats = Stats.new
13
+ end
14
+
15
+ def run
16
+ res = remote_resolve(@names)
17
+ [res, @batch_data, @stats]
18
+ end
19
+
20
+ private
21
+
22
+ def variables(names)
23
+ { dataSourceIds: [@data_source_id],
24
+ names: names.
25
+ map { |name| { value: name[:name], suppliedId: name[:id] } } }
26
+ end
27
+
28
+ def remote_resolve(names)
29
+ batch_start = Time.now
30
+ res = GRAPHQL.client.query(QUERY, variables: variables(names))
31
+ stats_add_batch_time(batch_start)
32
+ res.data.name_resolver.responses
33
+ end
34
+
35
+ def stats_add_batch_time(batch_start)
36
+ @stats.stats[:last_batches_time] << Time.now - batch_start
37
+ end
38
+ end
39
+ end
@@ -2,7 +2,7 @@
2
2
 
3
3
  # Namespace module for crossmapping checklists to GN sources
4
4
  module GnListResolver
5
- VERSION = "3.2.0.4"
5
+ VERSION = "3.3.1.0"
6
6
 
7
7
  def self.version
8
8
  VERSION
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Namespace module for crossmapping checklists to GN sources
4
+ <<<<<<< HEAD:lib/gn_list_resolver/version.rb
5
+ module GnListResolver
6
+ VERSION = "3.3.1.0"
7
+ =======
8
+ module GnCrossmap
9
+ VERSION = "3.3.1"
10
+ >>>>>>> 36115cc... better error log:lib/gn_crossmap/version.rb
11
+
12
+ def self.version
13
+ VERSION
14
+ end
15
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gn_list_resolver
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.0.4
4
+ version: 3.3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2017-09-07 00:00:00.000000000 Z
12
+ date: 2017-09-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: biodiversity
@@ -25,6 +25,20 @@ dependencies:
25
25
  - - "~>"
26
26
  - !ruby/object:Gem::Version
27
27
  version: '3.1'
28
+ - !ruby/object:Gem::Dependency
29
+ name: concurrent-ruby
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '1.0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '1.0'
28
42
  - !ruby/object:Gem::Dependency
29
43
  name: gn_uuid
30
44
  requirement: !ruby/object:Gem::Requirement
@@ -115,14 +129,14 @@ dependencies:
115
129
  requirements:
116
130
  - - "~>"
117
131
  - !ruby/object:Gem::Version
118
- version: '9.0'
132
+ version: '9.1'
119
133
  type: :development
120
134
  prerelease: false
121
135
  version_requirements: !ruby/object:Gem::Requirement
122
136
  requirements:
123
137
  - - "~>"
124
138
  - !ruby/object:Gem::Version
125
- version: '9.0'
139
+ version: '9.1'
126
140
  - !ruby/object:Gem::Dependency
127
141
  name: coveralls
128
142
  requirement: !ruby/object:Gem::Requirement
@@ -212,10 +226,13 @@ files:
212
226
  - lib/gn_list_resolver/graphql.rb
213
227
  - lib/gn_list_resolver/reader.rb
214
228
  - lib/gn_list_resolver/resolver.rb
229
+ - lib/gn_list_resolver/resolver.rb.orig
230
+ - lib/gn_list_resolver/resolver_job.rb
215
231
  - lib/gn_list_resolver/result_processor.rb
216
232
  - lib/gn_list_resolver/sci_name_collector.rb
217
233
  - lib/gn_list_resolver/stats.rb
218
234
  - lib/gn_list_resolver/version.rb
235
+ - lib/gn_list_resolver/version.rb.orig
219
236
  - lib/gn_list_resolver/writer.rb
220
237
  homepage: https://github.com/GlobalNamesArchitecture/gn_list_resolver
221
238
  licenses: