gn_crossmap 3.2.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bfd0f87bbaf73d66c33e130c124e087104994879
4
- data.tar.gz: 11020365c22465b4d804faaf2cf13c6ee168b051
3
+ metadata.gz: 24d63c7b0f6ce958c567008c74eb31108753e745
4
+ data.tar.gz: 988291015dc7712cbbf01e67b1da1272fff11db4
5
5
  SHA512:
6
- metadata.gz: 5ca351156f9007cc6aaa879b3d86852b2a1aeb2584182351ccd3d228997cbccb47604bf97654bd583e58bc1a7280d0f0acb9f6903095db783e4bf0e614cebddf
7
- data.tar.gz: 5580be6d8d50f4385e05b798b37a987338e54270d132560df1465f7d2a8cd101b22394a867481577861f03fea28efdd412542092f7f8b5928bf9d8654c434321
6
+ metadata.gz: 801270ddb8e7ed24a77958bb9bba8db9c8f1f355165fac2ecc119e90494d6cb2bf909e1b24762ad296586657c58c1edc12f86b25a828178b30f273666aa801bf
7
+ data.tar.gz: 43de612ec0c72ddf9c949c209495fb73d0a346632fc5a841185723858048bebcefcbeaefbbc3cb39fe31b291a4da49e1b4908a9ed709d0a6952e70feeeab097b
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # ``gn_crossmap`` CHANGELOG
2
2
 
3
+ ## 3.3.0
4
+
5
+ * @dimus - Add option `opts.threads` to set the number of threads for concurrent jobs.
6
+ Resolution now happens concurrently if the number of jobs is more
7
+ than 1. The maximum number of jobs is 10.
8
+
3
9
  ## 3.2.0
4
10
 
5
11
  * @dimus - Add a column `matchSize` with number of matches for each name
data/exe/crossmap CHANGED
@@ -8,6 +8,7 @@ puts "This program requires Ruby >= v. 2.4.1" if RUBY_VERSION < "2.4.1"
8
8
 
9
9
  CATALOGUE_OF_LIFE = 1
10
10
  OUTPUT = "output.csv"
11
+ THREADS = 1
11
12
  opts = Trollop.options do
12
13
  banner "Compares a list of scientific names to scientific names from a " \
13
14
  "data source from Global Names Resolver\n\n " \
@@ -16,6 +17,7 @@ opts = Trollop.options do
16
17
 
17
18
  opt(:input, "Path to input file", type: :string)
18
19
  opt(:output, "Path to output file", default: OUTPUT)
20
+ opt(:threads, "Threads number for resolution", default: THREADS)
19
21
  opt(:data_source_id, "Data source id from GN Resolver",
20
22
  default: CATALOGUE_OF_LIFE)
21
23
  opt(:skip_original, "If given, only 'taxonID' is shown " \
data/gn_crossmap.gemspec CHANGED
@@ -31,13 +31,14 @@ Gem::Specification.new do |gem|
31
31
  gem.require_paths = ["lib"]
32
32
 
33
33
  gem.add_dependency "biodiversity", "~> 3.1"
34
+ gem.add_dependency "concurrent-ruby", "~> 1.0"
34
35
  gem.add_dependency "gn_uuid", "~> 0.5"
35
36
  gem.add_dependency "logger-colors", "~> 1.0"
36
37
  gem.add_dependency "rest-client", "~> 2.0"
37
38
  gem.add_dependency "trollop", "~> 2.1"
38
39
 
39
40
  gem.add_development_dependency "bundler", "~> 1.7"
40
- gem.add_development_dependency "byebug", "~> 9.0"
41
+ gem.add_development_dependency "byebug", "~> 9.1"
41
42
  gem.add_development_dependency "coveralls", "~> 0.8"
42
43
  gem.add_development_dependency "rake", "~> 12.0"
43
44
  gem.add_development_dependency "rspec", "~> 3.2"
data/lib/gn_crossmap.rb CHANGED
@@ -7,9 +7,11 @@ require "tempfile"
7
7
  require "logger"
8
8
  require "logger/colors"
9
9
  require "biodiversity"
10
+ require "concurrent"
10
11
  require "gn_uuid"
11
12
  require "gn_crossmap/errors"
12
13
  require "gn_crossmap/version"
14
+ require "gn_crossmap/resolver_job"
13
15
  require "gn_crossmap/reader"
14
16
  require "gn_crossmap/writer"
15
17
  require "gn_crossmap/collector"
@@ -45,7 +47,7 @@ module GnCrossmap
45
47
  reader = create_reader(input_io, opts)
46
48
  data = block_given? ? reader.read(&Proc.new) : reader.read
47
49
  writer = create_writer(reader, output_io, opts)
48
- resolver = create_resolver(writer, opts)
50
+ resolver = Resolver.new(writer, opts)
49
51
  block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
50
52
  resolver.stats
51
53
  end
@@ -68,13 +70,20 @@ module GnCrossmap
68
70
  end
69
71
  end
70
72
 
71
- private
72
-
73
- def create_resolver(writer, opts)
74
- Resolver.new(writer, opts.data_source_id, opts.resolver_url,
75
- opts.stats, opts.with_classification)
73
+ def opts_struct(opts)
74
+ resolver_url = "http://resolver.globalnames.org/name_resolvers.json"
75
+ threads = opts[:threads].to_i
76
+ opts[:threads] = threads.between?(1, 10) ? threads : 2
77
+ with_classification = opts[:with_classification] ? true : false
78
+ opts[:with_classification] = with_classification
79
+ data_source_id = opts[:data_source_id].to_i
80
+ opts[:data_source_id] = data_source_id.zero? ? 1 : data_source_id
81
+ OpenStruct.new({ stats: Stats.new, alt_headers: [],
82
+ resolver_url: resolver_url }.merge(opts))
76
83
  end
77
84
 
85
+ private
86
+
78
87
  def create_writer(reader, output_io, opts)
79
88
  Writer.new(output_io, reader.original_fields,
80
89
  output_name(opts.output), opts.with_classification)
@@ -85,12 +94,6 @@ module GnCrossmap
85
94
  opts.skip_original, opts.alt_headers, opts.stats)
86
95
  end
87
96
 
88
- def opts_struct(opts)
89
- resolver_url = "http://resolver.globalnames.org/name_resolvers.json"
90
- OpenStruct.new({ stats: Stats.new, alt_headers: [],
91
- resolver_url: resolver_url }.merge(opts))
92
- end
93
-
94
97
  def io(input, output)
95
98
  io_in = iogen(input, INPUT_MODE)
96
99
  io_out = iogen(output, OUTPUT_MODE)
@@ -1,108 +1,133 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # rubocop:disable Metrics/ClassLength
4
+
3
5
  module GnCrossmap
4
6
  # Sends data to GN Resolver and collects results
5
7
  class Resolver
6
8
  attr_reader :stats
7
9
 
8
- def initialize(writer, data_source_id,
9
- resolver_url, stats, with_classification = false)
10
- @stats = stats
11
- @resolver_url = resolver_url
10
+ def initialize(writer, opts)
11
+ instance_vars_from_opts(opts)
12
12
  @processor = GnCrossmap::ResultProcessor.
13
- new(writer, @stats, with_classification)
14
- @ds_id = data_source_id
13
+ new(writer, @stats, @with_classification)
15
14
  @count = 0
16
- @current_data = {}
15
+ @jobs = []
17
16
  @batch = 200
18
17
  end
19
18
 
20
19
  def resolve(data)
21
- update_stats(data.size)
22
- block_given? ? process(data, &Proc.new) : process(data)
20
+ resolution_stats(data.size)
21
+ @threads.times do
22
+ batch = data.shift(@batch)
23
+ add_job(batch)
24
+ end
25
+ block_given? ? traverse_jobs(data, &Proc.new) : traverse_jobs(data)
23
26
  wrap_up
24
27
  yield(@stats.stats) if block_given?
25
28
  end
26
29
 
27
30
  private
28
31
 
29
- def process(data)
30
- cmd = nil
31
- data.each_slice(@batch) do |slice|
32
- with_log do
33
- remote_resolve(collect_names(slice))
34
- cmd = yield(@stats.stats) if block_given?
35
- end
36
- break if cmd == "STOP"
37
- end
38
- end
39
-
40
32
  def wrap_up
41
33
  @stats.stats[:resolution_stop] = Time.now
42
34
  @stats.stats[:status] = :finish
43
35
  @processor.writer.close
44
36
  end
45
37
 
46
- def update_stats(records_num)
38
+ def add_job(batch)
39
+ job = batch.empty? ? nil : create_job(batch)
40
+ @jobs << job
41
+ end
42
+
43
+ def traverse_jobs(data)
44
+ until data.empty? && @jobs.compact.empty?
45
+ process_results(data)
46
+ cmd = yield(@stats.stats) if block_given?
47
+ break if cmd == "STOP"
48
+ sleep(0.5)
49
+ end
50
+ end
51
+
52
+ def resolution_stats(records_num)
47
53
  @stats.stats[:total_records] = records_num
48
54
  @stats.stats[:resolution_start] = Time.now
49
55
  @stats.stats[:status] = :resolution
50
56
  end
51
57
 
52
- def with_log
53
- s = @count + 1
54
- @count += @batch
55
- e = [@count, @stats.stats[:total_records]].min
56
- GnCrossmap.log("Resolve #{s}-#{e} out of " \
57
- "#{@stats.stats[:total_records]} records at " \
58
- "#{@resolver_url}")
59
- yield
58
+ def process_results(data)
59
+ indices = []
60
+ @jobs.each_with_index do |job, i|
61
+ next if job.nil? || !job.complete?
62
+ with_log do
63
+ process_job(job)
64
+ indices << i
65
+ end
66
+ end
67
+ add_jobs(indices, data) unless indices.empty?
60
68
  end
61
69
 
62
- def collect_names(slice)
63
- @current_data = {}
64
- slice.each_with_object([]) do |row, str|
65
- id = row[:id].strip
66
- @current_data[id] = row[:original]
67
- @processor.input[id] = { rank: row[:rank] }
68
- str << "#{id}|#{row[:name]}"
69
- end.join("\n")
70
+ def add_jobs(indices, data)
71
+ indices.each do |i|
72
+ batch = data.shift(@batch)
73
+ @jobs[i] = batch.empty? ? nil : create_job(batch)
74
+ end
70
75
  end
71
76
 
72
- def remote_resolve(names)
73
- batch_start = Time.now
74
- res = RestClient.post(@resolver_url, data: names, data_source_ids: @ds_id)
75
- @processor.process(res, @current_data)
76
- rescue RestClient::Exception
77
- single_remote_resolve(names)
78
- ensure
79
- update_batch_times(batch_start)
77
+ def process_job(job)
78
+ if job.fulfilled?
79
+ results, current_data, stats = job.value
80
+ update_stats(stats)
81
+ @processor.process(results, current_data)
82
+ else
83
+ GnCrossmap.logger.error("Remote resolver server failed")
84
+ end
80
85
  end
81
86
 
82
- def update_batch_times(batch_start)
87
+ # rubocop:disable Metrics/AbcSize
88
+ def update_stats(job_stats)
83
89
  s = @stats.stats
84
90
  s[:last_batches_time].shift if s[:last_batches_time].size > 2
85
- s[:last_batches_time] << Time.now - batch_start
91
+ s[:last_batches_time] << job_stats.stats[:last_batches_time][0]
86
92
  s[:resolution_span] = Time.now - s[:resolution_start]
93
+ s[:resolved_records] += job_stats.stats[:resolved_records]
94
+ s[:matches][7] += job_stats.stats[:matches][7]
87
95
  end
96
+ # rubocop:enable all
88
97
 
89
- def single_remote_resolve(names)
90
- names.split("\n").each do |name|
91
- begin
92
- res = RestClient.post(@resolver_url, data: name,
93
- data_source_ids: @ds_id)
94
- @processor.process(res, @current_data)
95
- rescue RestClient::Exception => e
96
- process_resolver_error(e, name)
97
- next
98
- end
99
- end
98
+ def create_job(batch)
99
+ names, batch_data = collect_names(batch)
100
+ rb = ResolverJob.new(names, batch_data, @resolver_url, @ds_id)
101
+ Concurrent::Future.execute { rb.run }
102
+ end
103
+
104
+ def instance_vars_from_opts(opts)
105
+ @stats = opts.stats
106
+ @with_classification = opts.with_classification.freeze
107
+ @ds_id = opts.data_source_id.freeze
108
+ @resolver_url = opts.resolver_url.freeze
109
+ @threads = opts.threads
110
+ end
111
+
112
+ def collect_names(batch)
113
+ batch_data = {}
114
+ names = batch.each_with_object([]) do |row, str|
115
+ id = row[:id].strip
116
+ batch_data[id] = row[:original]
117
+ @processor.input[id] = { rank: row[:rank] }
118
+ str << "#{id}|#{row[:name]}"
119
+ end.join("\n")
120
+ [names, batch_data]
100
121
  end
101
122
 
102
- def process_resolver_error(err, name)
103
- @stats.stats[:matches][7] += 1
104
- @stats.stats[:resolved_records] += 1
105
- GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
123
+ def with_log
124
+ s = @count + 1
125
+ @count += @batch
126
+ e = [@count, @stats.stats[:total_records]].min
127
+ GnCrossmap.log("Resolve #{s}-#{e} out of " \
128
+ "#{@stats.stats[:total_records]} records at " \
129
+ "#{@resolver_url}")
130
+ yield
106
131
  end
107
132
  end
108
133
  end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module GnCrossmap
4
+ # Remote resolution for parallel jobs
5
+ class ResolverJob
6
+ def initialize(names, batch_data, resolver_url, ds_id)
7
+ @names = names
8
+ @batch_data = batch_data
9
+ @resolver_url = resolver_url
10
+ @ds_id = ds_id
11
+ @stats = Stats.new
12
+ end
13
+
14
+ def run
15
+ res = remote_resolve(@names)
16
+ [res, @batch_data, @stats]
17
+ end
18
+
19
+ private
20
+
21
+ def remote_resolve(names)
22
+ batch_start = Time.now
23
+ res = RestClient.post(@resolver_url, data: names,
24
+ data_source_ids: @ds_id)
25
+ [res.body]
26
+ rescue RestClient::Exception
27
+ single_remote_resolve(names)
28
+ ensure
29
+ stats_add_batch_time(batch_start)
30
+ end
31
+
32
+ def single_remote_resolve(names)
33
+ all_res = []
34
+ names.split("\n").each do |name|
35
+ res = single_post(name)
36
+ next unless res
37
+ all_res << res.body
38
+ end
39
+ all_res
40
+ end
41
+
42
+ def single_post
43
+ RestClient.post(@resolver_url, data: name,
44
+ data_source_ids: @ds_id)
45
+ rescue RestClient::Exception => e
46
+ process_resolver_error(e, name)
47
+ nil
48
+ end
49
+
50
+ def process_resolver_error(err, name)
51
+ @stats.stats[:matches][7] += 1
52
+ @stats.stats[:resolved_records] += 1
53
+ GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
54
+ end
55
+
56
+ def stats_add_batch_time(batch_start)
57
+ @stats.stats[:last_batches_time] << Time.now - batch_start
58
+ end
59
+ end
60
+ end
@@ -13,11 +13,13 @@ module GnCrossmap
13
13
  @input = {}
14
14
  end
15
15
 
16
- def process(result, original_data)
16
+ def process(results, original_data)
17
17
  @original_data = original_data
18
- res = rubyfy(result)
19
- res[:data].each do |d|
20
- d[:results].nil? ? write_empty_result(d) : write_result(d)
18
+ results.each do |result|
19
+ res = rubyfy(result)
20
+ res[:data].each do |d|
21
+ d[:results].nil? ? write_empty_result(d) : write_result(d)
22
+ end
21
23
  end
22
24
  end
23
25
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  # Namespace module for crossmapping checklists to GN sources
4
4
  module GnCrossmap
5
- VERSION = "3.2.0"
5
+ VERSION = "3.3.0"
6
6
 
7
7
  def self.version
8
8
  VERSION
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gn_crossmap
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.0
4
+ version: 3.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-08-23 00:00:00.000000000 Z
11
+ date: 2017-09-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: biodiversity
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '3.1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: concurrent-ruby
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: gn_uuid
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -100,14 +114,14 @@ dependencies:
100
114
  requirements:
101
115
  - - "~>"
102
116
  - !ruby/object:Gem::Version
103
- version: '9.0'
117
+ version: '9.1'
104
118
  type: :development
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
107
121
  requirements:
108
122
  - - "~>"
109
123
  - !ruby/object:Gem::Version
110
- version: '9.0'
124
+ version: '9.1'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: coveralls
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -194,6 +208,7 @@ files:
194
208
  - lib/gn_crossmap/errors.rb
195
209
  - lib/gn_crossmap/reader.rb
196
210
  - lib/gn_crossmap/resolver.rb
211
+ - lib/gn_crossmap/resolver_job.rb
197
212
  - lib/gn_crossmap/result_processor.rb
198
213
  - lib/gn_crossmap/sci_name_collector.rb
199
214
  - lib/gn_crossmap/stats.rb