gn_crossmap 3.2.0 → 3.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bfd0f87bbaf73d66c33e130c124e087104994879
4
- data.tar.gz: 11020365c22465b4d804faaf2cf13c6ee168b051
3
+ metadata.gz: 24d63c7b0f6ce958c567008c74eb31108753e745
4
+ data.tar.gz: 988291015dc7712cbbf01e67b1da1272fff11db4
5
5
  SHA512:
6
- metadata.gz: 5ca351156f9007cc6aaa879b3d86852b2a1aeb2584182351ccd3d228997cbccb47604bf97654bd583e58bc1a7280d0f0acb9f6903095db783e4bf0e614cebddf
7
- data.tar.gz: 5580be6d8d50f4385e05b798b37a987338e54270d132560df1465f7d2a8cd101b22394a867481577861f03fea28efdd412542092f7f8b5928bf9d8654c434321
6
+ metadata.gz: 801270ddb8e7ed24a77958bb9bba8db9c8f1f355165fac2ecc119e90494d6cb2bf909e1b24762ad296586657c58c1edc12f86b25a828178b30f273666aa801bf
7
+ data.tar.gz: 43de612ec0c72ddf9c949c209495fb73d0a346632fc5a841185723858048bebcefcbeaefbbc3cb39fe31b291a4da49e1b4908a9ed709d0a6952e70feeeab097b
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # ``gn_crossmap`` CHANGELOG
2
2
 
3
+ ## 3.3.0
4
+
5
+ * @dimus - Add option `opts.threads` to set the number of threads for concurrent jobs.
6
+ Resolution now happens concurrently if the number of jobs is more
7
+ than 1. The maximum number of jobs is 10.
8
+
3
9
  ## 3.2.0
4
10
 
5
11
  * @dimus - Add a column `matchSize` with number of matches for each name
data/exe/crossmap CHANGED
@@ -8,6 +8,7 @@ puts "This program requires Ruby >= v. 2.4.1" if RUBY_VERSION < "2.4.1"
8
8
 
9
9
  CATALOGUE_OF_LIFE = 1
10
10
  OUTPUT = "output.csv"
11
+ THREADS = 1
11
12
  opts = Trollop.options do
12
13
  banner "Compares a list of scientific names to scientific names from a " \
13
14
  "data source from Global Names Resolver\n\n " \
@@ -16,6 +17,7 @@ opts = Trollop.options do
16
17
 
17
18
  opt(:input, "Path to input file", type: :string)
18
19
  opt(:output, "Path to output file", default: OUTPUT)
20
+ opt(:threads, "Threads number for resolution", default: THREADS)
19
21
  opt(:data_source_id, "Data source id from GN Resolver",
20
22
  default: CATALOGUE_OF_LIFE)
21
23
  opt(:skip_original, "If given, only 'taxonID' is shown " \
data/gn_crossmap.gemspec CHANGED
@@ -31,13 +31,14 @@ Gem::Specification.new do |gem|
31
31
  gem.require_paths = ["lib"]
32
32
 
33
33
  gem.add_dependency "biodiversity", "~> 3.1"
34
+ gem.add_dependency "concurrent-ruby", "~> 1.0"
34
35
  gem.add_dependency "gn_uuid", "~> 0.5"
35
36
  gem.add_dependency "logger-colors", "~> 1.0"
36
37
  gem.add_dependency "rest-client", "~> 2.0"
37
38
  gem.add_dependency "trollop", "~> 2.1"
38
39
 
39
40
  gem.add_development_dependency "bundler", "~> 1.7"
40
- gem.add_development_dependency "byebug", "~> 9.0"
41
+ gem.add_development_dependency "byebug", "~> 9.1"
41
42
  gem.add_development_dependency "coveralls", "~> 0.8"
42
43
  gem.add_development_dependency "rake", "~> 12.0"
43
44
  gem.add_development_dependency "rspec", "~> 3.2"
data/lib/gn_crossmap.rb CHANGED
@@ -7,9 +7,11 @@ require "tempfile"
7
7
  require "logger"
8
8
  require "logger/colors"
9
9
  require "biodiversity"
10
+ require "concurrent"
10
11
  require "gn_uuid"
11
12
  require "gn_crossmap/errors"
12
13
  require "gn_crossmap/version"
14
+ require "gn_crossmap/resolver_job"
13
15
  require "gn_crossmap/reader"
14
16
  require "gn_crossmap/writer"
15
17
  require "gn_crossmap/collector"
@@ -45,7 +47,7 @@ module GnCrossmap
45
47
  reader = create_reader(input_io, opts)
46
48
  data = block_given? ? reader.read(&Proc.new) : reader.read
47
49
  writer = create_writer(reader, output_io, opts)
48
- resolver = create_resolver(writer, opts)
50
+ resolver = Resolver.new(writer, opts)
49
51
  block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
50
52
  resolver.stats
51
53
  end
@@ -68,13 +70,20 @@ module GnCrossmap
68
70
  end
69
71
  end
70
72
 
71
- private
72
-
73
- def create_resolver(writer, opts)
74
- Resolver.new(writer, opts.data_source_id, opts.resolver_url,
75
- opts.stats, opts.with_classification)
73
+ def opts_struct(opts)
74
+ resolver_url = "http://resolver.globalnames.org/name_resolvers.json"
75
+ threads = opts[:threads].to_i
76
+ opts[:threads] = threads.between?(1, 10) ? threads : 2
77
+ with_classification = opts[:with_classification] ? true : false
78
+ opts[:with_classification] = with_classification
79
+ data_source_id = opts[:data_source_id].to_i
80
+ opts[:data_source_id] = data_source_id.zero? ? 1 : data_source_id
81
+ OpenStruct.new({ stats: Stats.new, alt_headers: [],
82
+ resolver_url: resolver_url }.merge(opts))
76
83
  end
77
84
 
85
+ private
86
+
78
87
  def create_writer(reader, output_io, opts)
79
88
  Writer.new(output_io, reader.original_fields,
80
89
  output_name(opts.output), opts.with_classification)
@@ -85,12 +94,6 @@ module GnCrossmap
85
94
  opts.skip_original, opts.alt_headers, opts.stats)
86
95
  end
87
96
 
88
- def opts_struct(opts)
89
- resolver_url = "http://resolver.globalnames.org/name_resolvers.json"
90
- OpenStruct.new({ stats: Stats.new, alt_headers: [],
91
- resolver_url: resolver_url }.merge(opts))
92
- end
93
-
94
97
  def io(input, output)
95
98
  io_in = iogen(input, INPUT_MODE)
96
99
  io_out = iogen(output, OUTPUT_MODE)
@@ -1,108 +1,133 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # rubocop:disable Metrics/ClassLength
4
+
3
5
  module GnCrossmap
4
6
  # Sends data to GN Resolver and collects results
5
7
  class Resolver
6
8
  attr_reader :stats
7
9
 
8
- def initialize(writer, data_source_id,
9
- resolver_url, stats, with_classification = false)
10
- @stats = stats
11
- @resolver_url = resolver_url
10
+ def initialize(writer, opts)
11
+ instance_vars_from_opts(opts)
12
12
  @processor = GnCrossmap::ResultProcessor.
13
- new(writer, @stats, with_classification)
14
- @ds_id = data_source_id
13
+ new(writer, @stats, @with_classification)
15
14
  @count = 0
16
- @current_data = {}
15
+ @jobs = []
17
16
  @batch = 200
18
17
  end
19
18
 
20
19
  def resolve(data)
21
- update_stats(data.size)
22
- block_given? ? process(data, &Proc.new) : process(data)
20
+ resolution_stats(data.size)
21
+ @threads.times do
22
+ batch = data.shift(@batch)
23
+ add_job(batch)
24
+ end
25
+ block_given? ? traverse_jobs(data, &Proc.new) : traverse_jobs(data)
23
26
  wrap_up
24
27
  yield(@stats.stats) if block_given?
25
28
  end
26
29
 
27
30
  private
28
31
 
29
- def process(data)
30
- cmd = nil
31
- data.each_slice(@batch) do |slice|
32
- with_log do
33
- remote_resolve(collect_names(slice))
34
- cmd = yield(@stats.stats) if block_given?
35
- end
36
- break if cmd == "STOP"
37
- end
38
- end
39
-
40
32
  def wrap_up
41
33
  @stats.stats[:resolution_stop] = Time.now
42
34
  @stats.stats[:status] = :finish
43
35
  @processor.writer.close
44
36
  end
45
37
 
46
- def update_stats(records_num)
38
+ def add_job(batch)
39
+ job = batch.empty? ? nil : create_job(batch)
40
+ @jobs << job
41
+ end
42
+
43
+ def traverse_jobs(data)
44
+ until data.empty? && @jobs.compact.empty?
45
+ process_results(data)
46
+ cmd = yield(@stats.stats) if block_given?
47
+ break if cmd == "STOP"
48
+ sleep(0.5)
49
+ end
50
+ end
51
+
52
+ def resolution_stats(records_num)
47
53
  @stats.stats[:total_records] = records_num
48
54
  @stats.stats[:resolution_start] = Time.now
49
55
  @stats.stats[:status] = :resolution
50
56
  end
51
57
 
52
- def with_log
53
- s = @count + 1
54
- @count += @batch
55
- e = [@count, @stats.stats[:total_records]].min
56
- GnCrossmap.log("Resolve #{s}-#{e} out of " \
57
- "#{@stats.stats[:total_records]} records at " \
58
- "#{@resolver_url}")
59
- yield
58
+ def process_results(data)
59
+ indices = []
60
+ @jobs.each_with_index do |job, i|
61
+ next if job.nil? || !job.complete?
62
+ with_log do
63
+ process_job(job)
64
+ indices << i
65
+ end
66
+ end
67
+ add_jobs(indices, data) unless indices.empty?
60
68
  end
61
69
 
62
- def collect_names(slice)
63
- @current_data = {}
64
- slice.each_with_object([]) do |row, str|
65
- id = row[:id].strip
66
- @current_data[id] = row[:original]
67
- @processor.input[id] = { rank: row[:rank] }
68
- str << "#{id}|#{row[:name]}"
69
- end.join("\n")
70
+ def add_jobs(indices, data)
71
+ indices.each do |i|
72
+ batch = data.shift(@batch)
73
+ @jobs[i] = batch.empty? ? nil : create_job(batch)
74
+ end
70
75
  end
71
76
 
72
- def remote_resolve(names)
73
- batch_start = Time.now
74
- res = RestClient.post(@resolver_url, data: names, data_source_ids: @ds_id)
75
- @processor.process(res, @current_data)
76
- rescue RestClient::Exception
77
- single_remote_resolve(names)
78
- ensure
79
- update_batch_times(batch_start)
77
+ def process_job(job)
78
+ if job.fulfilled?
79
+ results, current_data, stats = job.value
80
+ update_stats(stats)
81
+ @processor.process(results, current_data)
82
+ else
83
+ GnCrossmap.logger.error("Remote resolver server failed")
84
+ end
80
85
  end
81
86
 
82
- def update_batch_times(batch_start)
87
+ # rubocop:disable Metrics/AbcSize
88
+ def update_stats(job_stats)
83
89
  s = @stats.stats
84
90
  s[:last_batches_time].shift if s[:last_batches_time].size > 2
85
- s[:last_batches_time] << Time.now - batch_start
91
+ s[:last_batches_time] << job_stats.stats[:last_batches_time][0]
86
92
  s[:resolution_span] = Time.now - s[:resolution_start]
93
+ s[:resolved_records] += job_stats.stats[:resolved_records]
94
+ s[:matches][7] += job_stats.stats[:matches][7]
87
95
  end
96
+ # rubocop:enable all
88
97
 
89
- def single_remote_resolve(names)
90
- names.split("\n").each do |name|
91
- begin
92
- res = RestClient.post(@resolver_url, data: name,
93
- data_source_ids: @ds_id)
94
- @processor.process(res, @current_data)
95
- rescue RestClient::Exception => e
96
- process_resolver_error(e, name)
97
- next
98
- end
99
- end
98
+ def create_job(batch)
99
+ names, batch_data = collect_names(batch)
100
+ rb = ResolverJob.new(names, batch_data, @resolver_url, @ds_id)
101
+ Concurrent::Future.execute { rb.run }
102
+ end
103
+
104
+ def instance_vars_from_opts(opts)
105
+ @stats = opts.stats
106
+ @with_classification = opts.with_classification.freeze
107
+ @ds_id = opts.data_source_id.freeze
108
+ @resolver_url = opts.resolver_url.freeze
109
+ @threads = opts.threads
110
+ end
111
+
112
+ def collect_names(batch)
113
+ batch_data = {}
114
+ names = batch.each_with_object([]) do |row, str|
115
+ id = row[:id].strip
116
+ batch_data[id] = row[:original]
117
+ @processor.input[id] = { rank: row[:rank] }
118
+ str << "#{id}|#{row[:name]}"
119
+ end.join("\n")
120
+ [names, batch_data]
100
121
  end
101
122
 
102
- def process_resolver_error(err, name)
103
- @stats.stats[:matches][7] += 1
104
- @stats.stats[:resolved_records] += 1
105
- GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
123
+ def with_log
124
+ s = @count + 1
125
+ @count += @batch
126
+ e = [@count, @stats.stats[:total_records]].min
127
+ GnCrossmap.log("Resolve #{s}-#{e} out of " \
128
+ "#{@stats.stats[:total_records]} records at " \
129
+ "#{@resolver_url}")
130
+ yield
106
131
  end
107
132
  end
108
133
  end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module GnCrossmap
4
+ # Remote resolution for parallel jobs
5
+ class ResolverJob
6
+ def initialize(names, batch_data, resolver_url, ds_id)
7
+ @names = names
8
+ @batch_data = batch_data
9
+ @resolver_url = resolver_url
10
+ @ds_id = ds_id
11
+ @stats = Stats.new
12
+ end
13
+
14
+ def run
15
+ res = remote_resolve(@names)
16
+ [res, @batch_data, @stats]
17
+ end
18
+
19
+ private
20
+
21
+ def remote_resolve(names)
22
+ batch_start = Time.now
23
+ res = RestClient.post(@resolver_url, data: names,
24
+ data_source_ids: @ds_id)
25
+ [res.body]
26
+ rescue RestClient::Exception
27
+ single_remote_resolve(names)
28
+ ensure
29
+ stats_add_batch_time(batch_start)
30
+ end
31
+
32
+ def single_remote_resolve(names)
33
+ all_res = []
34
+ names.split("\n").each do |name|
35
+ res = single_post(name)
36
+ next unless res
37
+ all_res << res.body
38
+ end
39
+ all_res
40
+ end
41
+
42
+ def single_post
43
+ RestClient.post(@resolver_url, data: name,
44
+ data_source_ids: @ds_id)
45
+ rescue RestClient::Exception => e
46
+ process_resolver_error(e, name)
47
+ nil
48
+ end
49
+
50
+ def process_resolver_error(err, name)
51
+ @stats.stats[:matches][7] += 1
52
+ @stats.stats[:resolved_records] += 1
53
+ GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
54
+ end
55
+
56
+ def stats_add_batch_time(batch_start)
57
+ @stats.stats[:last_batches_time] << Time.now - batch_start
58
+ end
59
+ end
60
+ end
@@ -13,11 +13,13 @@ module GnCrossmap
13
13
  @input = {}
14
14
  end
15
15
 
16
- def process(result, original_data)
16
+ def process(results, original_data)
17
17
  @original_data = original_data
18
- res = rubyfy(result)
19
- res[:data].each do |d|
20
- d[:results].nil? ? write_empty_result(d) : write_result(d)
18
+ results.each do |result|
19
+ res = rubyfy(result)
20
+ res[:data].each do |d|
21
+ d[:results].nil? ? write_empty_result(d) : write_result(d)
22
+ end
21
23
  end
22
24
  end
23
25
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  # Namespace module for crossmapping checklists to GN sources
4
4
  module GnCrossmap
5
- VERSION = "3.2.0"
5
+ VERSION = "3.3.0"
6
6
 
7
7
  def self.version
8
8
  VERSION
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gn_crossmap
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.0
4
+ version: 3.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-08-23 00:00:00.000000000 Z
11
+ date: 2017-09-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: biodiversity
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '3.1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: concurrent-ruby
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: gn_uuid
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -100,14 +114,14 @@ dependencies:
100
114
  requirements:
101
115
  - - "~>"
102
116
  - !ruby/object:Gem::Version
103
- version: '9.0'
117
+ version: '9.1'
104
118
  type: :development
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
107
121
  requirements:
108
122
  - - "~>"
109
123
  - !ruby/object:Gem::Version
110
- version: '9.0'
124
+ version: '9.1'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: coveralls
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -194,6 +208,7 @@ files:
194
208
  - lib/gn_crossmap/errors.rb
195
209
  - lib/gn_crossmap/reader.rb
196
210
  - lib/gn_crossmap/resolver.rb
211
+ - lib/gn_crossmap/resolver_job.rb
197
212
  - lib/gn_crossmap/result_processor.rb
198
213
  - lib/gn_crossmap/sci_name_collector.rb
199
214
  - lib/gn_crossmap/stats.rb