gn_crossmap 3.2.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/exe/crossmap +2 -0
- data/gn_crossmap.gemspec +2 -1
- data/lib/gn_crossmap.rb +15 -12
- data/lib/gn_crossmap/resolver.rb +87 -62
- data/lib/gn_crossmap/resolver_job.rb +60 -0
- data/lib/gn_crossmap/result_processor.rb +6 -4
- data/lib/gn_crossmap/version.rb +1 -1
- metadata +19 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 24d63c7b0f6ce958c567008c74eb31108753e745
|
4
|
+
data.tar.gz: 988291015dc7712cbbf01e67b1da1272fff11db4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 801270ddb8e7ed24a77958bb9bba8db9c8f1f355165fac2ecc119e90494d6cb2bf909e1b24762ad296586657c58c1edc12f86b25a828178b30f273666aa801bf
|
7
|
+
data.tar.gz: 43de612ec0c72ddf9c949c209495fb73d0a346632fc5a841185723858048bebcefcbeaefbbc3cb39fe31b291a4da49e1b4908a9ed709d0a6952e70feeeab097b
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
# ``gn_crossmap`` CHANGELOG
|
2
2
|
|
3
|
+
## 3.3.0
|
4
|
+
|
5
|
+
* @dimus - Add option `opts.threads` to set the number of threads used for concurrent jobs.
|
6
|
+
Resolution now happens concurrently if the number of jobs is more
|
7
|
+
than 1. The maximum number of jobs is 10.
|
8
|
+
|
3
9
|
## 3.2.0
|
4
10
|
|
5
11
|
* @dimus - Add a column `matchSize` with number of matches for each name
|
data/exe/crossmap
CHANGED
@@ -8,6 +8,7 @@ puts "This program requires Ruby >= v. 2.4.1" if RUBY_VERSION < "2.4.1"
|
|
8
8
|
|
9
9
|
CATALOGUE_OF_LIFE = 1
|
10
10
|
OUTPUT = "output.csv"
|
11
|
+
THREADS = 1
|
11
12
|
opts = Trollop.options do
|
12
13
|
banner "Compares a list of scientific names to scientific names from a " \
|
13
14
|
"data source from Global Names Resolver\n\n " \
|
@@ -16,6 +17,7 @@ opts = Trollop.options do
|
|
16
17
|
|
17
18
|
opt(:input, "Path to input file", type: :string)
|
18
19
|
opt(:output, "Path to output file", default: OUTPUT)
|
20
|
+
opt(:threads, "Threads number for resolution", default: THREADS)
|
19
21
|
opt(:data_source_id, "Data source id from GN Resolver",
|
20
22
|
default: CATALOGUE_OF_LIFE)
|
21
23
|
opt(:skip_original, "If given, only 'taxonID' is shown " \
|
data/gn_crossmap.gemspec
CHANGED
@@ -31,13 +31,14 @@ Gem::Specification.new do |gem|
|
|
31
31
|
gem.require_paths = ["lib"]
|
32
32
|
|
33
33
|
gem.add_dependency "biodiversity", "~> 3.1"
|
34
|
+
gem.add_dependency "concurrent-ruby", "~> 1.0"
|
34
35
|
gem.add_dependency "gn_uuid", "~> 0.5"
|
35
36
|
gem.add_dependency "logger-colors", "~> 1.0"
|
36
37
|
gem.add_dependency "rest-client", "~> 2.0"
|
37
38
|
gem.add_dependency "trollop", "~> 2.1"
|
38
39
|
|
39
40
|
gem.add_development_dependency "bundler", "~> 1.7"
|
40
|
-
gem.add_development_dependency "byebug", "~> 9.
|
41
|
+
gem.add_development_dependency "byebug", "~> 9.1"
|
41
42
|
gem.add_development_dependency "coveralls", "~> 0.8"
|
42
43
|
gem.add_development_dependency "rake", "~> 12.0"
|
43
44
|
gem.add_development_dependency "rspec", "~> 3.2"
|
data/lib/gn_crossmap.rb
CHANGED
@@ -7,9 +7,11 @@ require "tempfile"
|
|
7
7
|
require "logger"
|
8
8
|
require "logger/colors"
|
9
9
|
require "biodiversity"
|
10
|
+
require "concurrent"
|
10
11
|
require "gn_uuid"
|
11
12
|
require "gn_crossmap/errors"
|
12
13
|
require "gn_crossmap/version"
|
14
|
+
require "gn_crossmap/resolver_job"
|
13
15
|
require "gn_crossmap/reader"
|
14
16
|
require "gn_crossmap/writer"
|
15
17
|
require "gn_crossmap/collector"
|
@@ -45,7 +47,7 @@ module GnCrossmap
|
|
45
47
|
reader = create_reader(input_io, opts)
|
46
48
|
data = block_given? ? reader.read(&Proc.new) : reader.read
|
47
49
|
writer = create_writer(reader, output_io, opts)
|
48
|
-
resolver =
|
50
|
+
resolver = Resolver.new(writer, opts)
|
49
51
|
block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
|
50
52
|
resolver.stats
|
51
53
|
end
|
@@ -68,13 +70,20 @@ module GnCrossmap
|
|
68
70
|
end
|
69
71
|
end
|
70
72
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
73
|
+
def opts_struct(opts)
|
74
|
+
resolver_url = "http://resolver.globalnames.org/name_resolvers.json"
|
75
|
+
threads = opts[:threads].to_i
|
76
|
+
opts[:threads] = threads.between?(1, 10) ? threads : 2
|
77
|
+
with_classification = opts[:with_classification] ? true : false
|
78
|
+
opts[:with_classification] = with_classification
|
79
|
+
data_source_id = opts[:data_source_id].to_i
|
80
|
+
opts[:data_source_id] = data_source_id.zero? ? 1 : data_source_id
|
81
|
+
OpenStruct.new({ stats: Stats.new, alt_headers: [],
|
82
|
+
resolver_url: resolver_url }.merge(opts))
|
76
83
|
end
|
77
84
|
|
85
|
+
private
|
86
|
+
|
78
87
|
def create_writer(reader, output_io, opts)
|
79
88
|
Writer.new(output_io, reader.original_fields,
|
80
89
|
output_name(opts.output), opts.with_classification)
|
@@ -85,12 +94,6 @@ module GnCrossmap
|
|
85
94
|
opts.skip_original, opts.alt_headers, opts.stats)
|
86
95
|
end
|
87
96
|
|
88
|
-
def opts_struct(opts)
|
89
|
-
resolver_url = "http://resolver.globalnames.org/name_resolvers.json"
|
90
|
-
OpenStruct.new({ stats: Stats.new, alt_headers: [],
|
91
|
-
resolver_url: resolver_url }.merge(opts))
|
92
|
-
end
|
93
|
-
|
94
97
|
def io(input, output)
|
95
98
|
io_in = iogen(input, INPUT_MODE)
|
96
99
|
io_out = iogen(output, OUTPUT_MODE)
|
data/lib/gn_crossmap/resolver.rb
CHANGED
@@ -1,108 +1,133 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
# rubocop:disable Metrics/ClassLength
|
4
|
+
|
3
5
|
module GnCrossmap
|
4
6
|
# Sends data to GN Resolver and collects results
|
5
7
|
class Resolver
|
6
8
|
attr_reader :stats
|
7
9
|
|
8
|
-
def initialize(writer,
|
9
|
-
|
10
|
-
@stats = stats
|
11
|
-
@resolver_url = resolver_url
|
10
|
+
def initialize(writer, opts)
|
11
|
+
instance_vars_from_opts(opts)
|
12
12
|
@processor = GnCrossmap::ResultProcessor.
|
13
|
-
new(writer, @stats, with_classification)
|
14
|
-
@ds_id = data_source_id
|
13
|
+
new(writer, @stats, @with_classification)
|
15
14
|
@count = 0
|
16
|
-
@
|
15
|
+
@jobs = []
|
17
16
|
@batch = 200
|
18
17
|
end
|
19
18
|
|
20
19
|
def resolve(data)
|
21
|
-
|
22
|
-
|
20
|
+
resolution_stats(data.size)
|
21
|
+
@threads.times do
|
22
|
+
batch = data.shift(@batch)
|
23
|
+
add_job(batch)
|
24
|
+
end
|
25
|
+
block_given? ? traverse_jobs(data, &Proc.new) : traverse_jobs(data)
|
23
26
|
wrap_up
|
24
27
|
yield(@stats.stats) if block_given?
|
25
28
|
end
|
26
29
|
|
27
30
|
private
|
28
31
|
|
29
|
-
def process(data)
|
30
|
-
cmd = nil
|
31
|
-
data.each_slice(@batch) do |slice|
|
32
|
-
with_log do
|
33
|
-
remote_resolve(collect_names(slice))
|
34
|
-
cmd = yield(@stats.stats) if block_given?
|
35
|
-
end
|
36
|
-
break if cmd == "STOP"
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
32
|
def wrap_up
|
41
33
|
@stats.stats[:resolution_stop] = Time.now
|
42
34
|
@stats.stats[:status] = :finish
|
43
35
|
@processor.writer.close
|
44
36
|
end
|
45
37
|
|
46
|
-
def
|
38
|
+
def add_job(batch)
|
39
|
+
job = batch.empty? ? nil : create_job(batch)
|
40
|
+
@jobs << job
|
41
|
+
end
|
42
|
+
|
43
|
+
def traverse_jobs(data)
|
44
|
+
until data.empty? && @jobs.compact.empty?
|
45
|
+
process_results(data)
|
46
|
+
cmd = yield(@stats.stats) if block_given?
|
47
|
+
break if cmd == "STOP"
|
48
|
+
sleep(0.5)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def resolution_stats(records_num)
|
47
53
|
@stats.stats[:total_records] = records_num
|
48
54
|
@stats.stats[:resolution_start] = Time.now
|
49
55
|
@stats.stats[:status] = :resolution
|
50
56
|
end
|
51
57
|
|
52
|
-
def
|
53
|
-
|
54
|
-
@
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
58
|
+
def process_results(data)
|
59
|
+
indices = []
|
60
|
+
@jobs.each_with_index do |job, i|
|
61
|
+
next if job.nil? || !job.complete?
|
62
|
+
with_log do
|
63
|
+
process_job(job)
|
64
|
+
indices << i
|
65
|
+
end
|
66
|
+
end
|
67
|
+
add_jobs(indices, data) unless indices.empty?
|
60
68
|
end
|
61
69
|
|
62
|
-
def
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
@processor.input[id] = { rank: row[:rank] }
|
68
|
-
str << "#{id}|#{row[:name]}"
|
69
|
-
end.join("\n")
|
70
|
+
def add_jobs(indices, data)
|
71
|
+
indices.each do |i|
|
72
|
+
batch = data.shift(@batch)
|
73
|
+
@jobs[i] = batch.empty? ? nil : create_job(batch)
|
74
|
+
end
|
70
75
|
end
|
71
76
|
|
72
|
-
def
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
77
|
+
def process_job(job)
|
78
|
+
if job.fulfilled?
|
79
|
+
results, current_data, stats = job.value
|
80
|
+
update_stats(stats)
|
81
|
+
@processor.process(results, current_data)
|
82
|
+
else
|
83
|
+
GnCrossmap.logger.error("Remote resolver server failed")
|
84
|
+
end
|
80
85
|
end
|
81
86
|
|
82
|
-
|
87
|
+
# rubocop:disable Metrics/AbcSize
|
88
|
+
def update_stats(job_stats)
|
83
89
|
s = @stats.stats
|
84
90
|
s[:last_batches_time].shift if s[:last_batches_time].size > 2
|
85
|
-
s[:last_batches_time] <<
|
91
|
+
s[:last_batches_time] << job_stats.stats[:last_batches_time][0]
|
86
92
|
s[:resolution_span] = Time.now - s[:resolution_start]
|
93
|
+
s[:resolved_records] += job_stats.stats[:resolved_records]
|
94
|
+
s[:matches][7] += job_stats.stats[:matches][7]
|
87
95
|
end
|
96
|
+
# rubocop:enable all
|
88
97
|
|
89
|
-
def
|
90
|
-
names
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
98
|
+
def create_job(batch)
|
99
|
+
names, batch_data = collect_names(batch)
|
100
|
+
rb = ResolverJob.new(names, batch_data, @resolver_url, @ds_id)
|
101
|
+
Concurrent::Future.execute { rb.run }
|
102
|
+
end
|
103
|
+
|
104
|
+
def instance_vars_from_opts(opts)
|
105
|
+
@stats = opts.stats
|
106
|
+
@with_classification = opts.with_classification.freeze
|
107
|
+
@ds_id = opts.data_source_id.freeze
|
108
|
+
@resolver_url = opts.resolver_url.freeze
|
109
|
+
@threads = opts.threads
|
110
|
+
end
|
111
|
+
|
112
|
+
def collect_names(batch)
|
113
|
+
batch_data = {}
|
114
|
+
names = batch.each_with_object([]) do |row, str|
|
115
|
+
id = row[:id].strip
|
116
|
+
batch_data[id] = row[:original]
|
117
|
+
@processor.input[id] = { rank: row[:rank] }
|
118
|
+
str << "#{id}|#{row[:name]}"
|
119
|
+
end.join("\n")
|
120
|
+
[names, batch_data]
|
100
121
|
end
|
101
122
|
|
102
|
-
def
|
103
|
-
@
|
104
|
-
@
|
105
|
-
|
123
|
+
def with_log
|
124
|
+
s = @count + 1
|
125
|
+
@count += @batch
|
126
|
+
e = [@count, @stats.stats[:total_records]].min
|
127
|
+
GnCrossmap.log("Resolve #{s}-#{e} out of " \
|
128
|
+
"#{@stats.stats[:total_records]} records at " \
|
129
|
+
"#{@resolver_url}")
|
130
|
+
yield
|
106
131
|
end
|
107
132
|
end
|
108
133
|
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module GnCrossmap
|
4
|
+
# Remote resolution for parallel jobs
|
5
|
+
class ResolverJob
|
6
|
+
def initialize(names, batch_data, resolver_url, ds_id)
|
7
|
+
@names = names
|
8
|
+
@batch_data = batch_data
|
9
|
+
@resolver_url = resolver_url
|
10
|
+
@ds_id = ds_id
|
11
|
+
@stats = Stats.new
|
12
|
+
end
|
13
|
+
|
14
|
+
def run
|
15
|
+
res = remote_resolve(@names)
|
16
|
+
[res, @batch_data, @stats]
|
17
|
+
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def remote_resolve(names)
|
22
|
+
batch_start = Time.now
|
23
|
+
res = RestClient.post(@resolver_url, data: names,
|
24
|
+
data_source_ids: @ds_id)
|
25
|
+
[res.body]
|
26
|
+
rescue RestClient::Exception
|
27
|
+
single_remote_resolve(names)
|
28
|
+
ensure
|
29
|
+
stats_add_batch_time(batch_start)
|
30
|
+
end
|
31
|
+
|
32
|
+
def single_remote_resolve(names)
|
33
|
+
all_res = []
|
34
|
+
names.split("\n").each do |name|
|
35
|
+
res = single_post(name)
|
36
|
+
next unless res
|
37
|
+
all_res << res.body
|
38
|
+
end
|
39
|
+
all_res
|
40
|
+
end
|
41
|
+
|
42
|
+
def single_post
|
43
|
+
RestClient.post(@resolver_url, data: name,
|
44
|
+
data_source_ids: @ds_id)
|
45
|
+
rescue RestClient::Exception => e
|
46
|
+
process_resolver_error(e, name)
|
47
|
+
nil
|
48
|
+
end
|
49
|
+
|
50
|
+
def process_resolver_error(err, name)
|
51
|
+
@stats.stats[:matches][7] += 1
|
52
|
+
@stats.stats[:resolved_records] += 1
|
53
|
+
GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
|
54
|
+
end
|
55
|
+
|
56
|
+
def stats_add_batch_time(batch_start)
|
57
|
+
@stats.stats[:last_batches_time] << Time.now - batch_start
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
@@ -13,11 +13,13 @@ module GnCrossmap
|
|
13
13
|
@input = {}
|
14
14
|
end
|
15
15
|
|
16
|
-
def process(
|
16
|
+
def process(results, original_data)
|
17
17
|
@original_data = original_data
|
18
|
-
|
19
|
-
|
20
|
-
|
18
|
+
results.each do |result|
|
19
|
+
res = rubyfy(result)
|
20
|
+
res[:data].each do |d|
|
21
|
+
d[:results].nil? ? write_empty_result(d) : write_result(d)
|
22
|
+
end
|
21
23
|
end
|
22
24
|
end
|
23
25
|
|
data/lib/gn_crossmap/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gn_crossmap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-09-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: biodiversity
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '3.1'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: concurrent-ruby
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: gn_uuid
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,14 +114,14 @@ dependencies:
|
|
100
114
|
requirements:
|
101
115
|
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '9.
|
117
|
+
version: '9.1'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '9.
|
124
|
+
version: '9.1'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: coveralls
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -194,6 +208,7 @@ files:
|
|
194
208
|
- lib/gn_crossmap/errors.rb
|
195
209
|
- lib/gn_crossmap/reader.rb
|
196
210
|
- lib/gn_crossmap/resolver.rb
|
211
|
+
- lib/gn_crossmap/resolver_job.rb
|
197
212
|
- lib/gn_crossmap/result_processor.rb
|
198
213
|
- lib/gn_crossmap/sci_name_collector.rb
|
199
214
|
- lib/gn_crossmap/stats.rb
|