gn_crossmap 3.2.0 → 3.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/exe/crossmap +2 -0
- data/gn_crossmap.gemspec +2 -1
- data/lib/gn_crossmap.rb +15 -12
- data/lib/gn_crossmap/resolver.rb +87 -62
- data/lib/gn_crossmap/resolver_job.rb +60 -0
- data/lib/gn_crossmap/result_processor.rb +6 -4
- data/lib/gn_crossmap/version.rb +1 -1
- metadata +19 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 24d63c7b0f6ce958c567008c74eb31108753e745
|
4
|
+
data.tar.gz: 988291015dc7712cbbf01e67b1da1272fff11db4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 801270ddb8e7ed24a77958bb9bba8db9c8f1f355165fac2ecc119e90494d6cb2bf909e1b24762ad296586657c58c1edc12f86b25a828178b30f273666aa801bf
|
7
|
+
data.tar.gz: 43de612ec0c72ddf9c949c209495fb73d0a346632fc5a841185723858048bebcefcbeaefbbc3cb39fe31b291a4da49e1b4908a9ed709d0a6952e70feeeab097b
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
# ``gn_crossmap`` CHANGELOG
|
2
2
|
|
3
|
+
## 3.3.0
|
4
|
+
|
5
|
+
* @dimus - Add option `opts.threads` to set the number of threads for concurrent jobs.
|
6
|
+
Resolution now happens concurrently if the number of jobs is more
|
7
|
+
than 1. The maximum number of jobs is 10.
|
8
|
+
|
3
9
|
## 3.2.0
|
4
10
|
|
5
11
|
* @dimus - Add a column `matchSize` with number of matches for each name
|
data/exe/crossmap
CHANGED
@@ -8,6 +8,7 @@ puts "This program requires Ruby >= v. 2.4.1" if RUBY_VERSION < "2.4.1"
|
|
8
8
|
|
9
9
|
CATALOGUE_OF_LIFE = 1
|
10
10
|
OUTPUT = "output.csv"
|
11
|
+
THREADS = 1
|
11
12
|
opts = Trollop.options do
|
12
13
|
banner "Compares a list of scientific names to scientific names from a " \
|
13
14
|
"data source from Global Names Resolver\n\n " \
|
@@ -16,6 +17,7 @@ opts = Trollop.options do
|
|
16
17
|
|
17
18
|
opt(:input, "Path to input file", type: :string)
|
18
19
|
opt(:output, "Path to output file", default: OUTPUT)
|
20
|
+
opt(:threads, "Threads number for resolution", default: THREADS)
|
19
21
|
opt(:data_source_id, "Data source id from GN Resolver",
|
20
22
|
default: CATALOGUE_OF_LIFE)
|
21
23
|
opt(:skip_original, "If given, only 'taxonID' is shown " \
|
data/gn_crossmap.gemspec
CHANGED
@@ -31,13 +31,14 @@ Gem::Specification.new do |gem|
|
|
31
31
|
gem.require_paths = ["lib"]
|
32
32
|
|
33
33
|
gem.add_dependency "biodiversity", "~> 3.1"
|
34
|
+
gem.add_dependency "concurrent-ruby", "~> 1.0"
|
34
35
|
gem.add_dependency "gn_uuid", "~> 0.5"
|
35
36
|
gem.add_dependency "logger-colors", "~> 1.0"
|
36
37
|
gem.add_dependency "rest-client", "~> 2.0"
|
37
38
|
gem.add_dependency "trollop", "~> 2.1"
|
38
39
|
|
39
40
|
gem.add_development_dependency "bundler", "~> 1.7"
|
40
|
-
gem.add_development_dependency "byebug", "~> 9.
|
41
|
+
gem.add_development_dependency "byebug", "~> 9.1"
|
41
42
|
gem.add_development_dependency "coveralls", "~> 0.8"
|
42
43
|
gem.add_development_dependency "rake", "~> 12.0"
|
43
44
|
gem.add_development_dependency "rspec", "~> 3.2"
|
data/lib/gn_crossmap.rb
CHANGED
@@ -7,9 +7,11 @@ require "tempfile"
|
|
7
7
|
require "logger"
|
8
8
|
require "logger/colors"
|
9
9
|
require "biodiversity"
|
10
|
+
require "concurrent"
|
10
11
|
require "gn_uuid"
|
11
12
|
require "gn_crossmap/errors"
|
12
13
|
require "gn_crossmap/version"
|
14
|
+
require "gn_crossmap/resolver_job"
|
13
15
|
require "gn_crossmap/reader"
|
14
16
|
require "gn_crossmap/writer"
|
15
17
|
require "gn_crossmap/collector"
|
@@ -45,7 +47,7 @@ module GnCrossmap
|
|
45
47
|
reader = create_reader(input_io, opts)
|
46
48
|
data = block_given? ? reader.read(&Proc.new) : reader.read
|
47
49
|
writer = create_writer(reader, output_io, opts)
|
48
|
-
resolver =
|
50
|
+
resolver = Resolver.new(writer, opts)
|
49
51
|
block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
|
50
52
|
resolver.stats
|
51
53
|
end
|
@@ -68,13 +70,20 @@ module GnCrossmap
|
|
68
70
|
end
|
69
71
|
end
|
70
72
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
73
|
+
# Builds the options object used by the rest of the library.
#
# Normalises three user-supplied settings and merges them over the defaults
# (`stats`, `alt_headers`, `resolver_url`):
#
# * `:threads` is coerced with `to_i`; anything outside 1..10 falls back to 2.
# * `:with_classification` is coerced to a strict `true`/`false`.
# * `:data_source_id` is coerced with `to_i`; zero (including nil or
#   non-numeric input) falls back to 1.
#
# The caller's hash is NOT modified: normalised values are written into a
# merged copy, so frozen or shared option hashes are safe to pass in.
#
# @param opts [Hash] user options keyed by symbols
# @return [OpenStruct] defaults merged with the normalised options
def opts_struct(opts)
  resolver_url = "http://resolver.globalnames.org/name_resolvers.json"
  threads = opts[:threads].to_i
  data_source_id = opts[:data_source_id].to_i
  # Hash#merge returns a new hash, leaving the argument untouched.
  normalized = opts.merge(
    threads: threads.between?(1, 10) ? threads : 2,
    with_classification: opts[:with_classification] ? true : false,
    data_source_id: data_source_id.zero? ? 1 : data_source_id
  )
  OpenStruct.new({ stats: Stats.new, alt_headers: [],
                   resolver_url: resolver_url }.merge(normalized))
end
|
77
84
|
|
85
|
+
private
|
86
|
+
|
78
87
|
def create_writer(reader, output_io, opts)
|
79
88
|
Writer.new(output_io, reader.original_fields,
|
80
89
|
output_name(opts.output), opts.with_classification)
|
@@ -85,12 +94,6 @@ module GnCrossmap
|
|
85
94
|
opts.skip_original, opts.alt_headers, opts.stats)
|
86
95
|
end
|
87
96
|
|
88
|
-
def opts_struct(opts)
|
89
|
-
resolver_url = "http://resolver.globalnames.org/name_resolvers.json"
|
90
|
-
OpenStruct.new({ stats: Stats.new, alt_headers: [],
|
91
|
-
resolver_url: resolver_url }.merge(opts))
|
92
|
-
end
|
93
|
-
|
94
97
|
def io(input, output)
|
95
98
|
io_in = iogen(input, INPUT_MODE)
|
96
99
|
io_out = iogen(output, OUTPUT_MODE)
|
data/lib/gn_crossmap/resolver.rb
CHANGED
@@ -1,108 +1,133 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
# rubocop:disable Metrics/ClassLength
|
4
|
+
|
3
5
|
module GnCrossmap
|
4
6
|
# Sends data to GN Resolver and collects results
|
5
7
|
class Resolver
|
6
8
|
attr_reader :stats
|
7
9
|
|
8
|
-
def initialize(writer,
|
9
|
-
|
10
|
-
@stats = stats
|
11
|
-
@resolver_url = resolver_url
|
10
|
+
def initialize(writer, opts)
|
11
|
+
instance_vars_from_opts(opts)
|
12
12
|
@processor = GnCrossmap::ResultProcessor.
|
13
|
-
new(writer, @stats, with_classification)
|
14
|
-
@ds_id = data_source_id
|
13
|
+
new(writer, @stats, @with_classification)
|
15
14
|
@count = 0
|
16
|
-
@
|
15
|
+
@jobs = []
|
17
16
|
@batch = 200
|
18
17
|
end
|
19
18
|
|
20
19
|
# Resolves all rows in +data+ using up to @threads concurrent jobs.
#
# Seeds one job per thread, each taking up to @batch rows off the front of
# +data+ (the array is consumed with `shift`), then hands the remaining
# rows to `traverse_jobs`, and finally closes out the run via `wrap_up`.
#
# The optional block is captured explicitly with `&block` and forwarded.
# A block-less `Proc.new` is deprecated in Ruby 2.7 and raises
# ArgumentError in Ruby 3.0+.
#
# @param data [Array] rows to resolve; emptied as batches are taken
# @yield [Hash] the current statistics hash (`@stats.stats`)
# @return the value of the final yield, or nil when no block is given
def resolve(data, &block)
  resolution_stats(data.size)
  @threads.times do
    batch = data.shift(@batch)
    add_job(batch)
  end
  traverse_jobs(data, &block)
  wrap_up
  yield(@stats.stats) if block_given?
end
|
26
29
|
|
27
30
|
private
|
28
31
|
|
29
|
-
def process(data)
|
30
|
-
cmd = nil
|
31
|
-
data.each_slice(@batch) do |slice|
|
32
|
-
with_log do
|
33
|
-
remote_resolve(collect_names(slice))
|
34
|
-
cmd = yield(@stats.stats) if block_given?
|
35
|
-
end
|
36
|
-
break if cmd == "STOP"
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
32
|
def wrap_up
|
41
33
|
@stats.stats[:resolution_stop] = Time.now
|
42
34
|
@stats.stats[:status] = :finish
|
43
35
|
@processor.writer.close
|
44
36
|
end
|
45
37
|
|
46
|
-
def
|
38
|
+
def add_job(batch)
|
39
|
+
job = batch.empty? ? nil : create_job(batch)
|
40
|
+
@jobs << job
|
41
|
+
end
|
42
|
+
|
43
|
+
def traverse_jobs(data)
|
44
|
+
until data.empty? && @jobs.compact.empty?
|
45
|
+
process_results(data)
|
46
|
+
cmd = yield(@stats.stats) if block_given?
|
47
|
+
break if cmd == "STOP"
|
48
|
+
sleep(0.5)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def resolution_stats(records_num)
|
47
53
|
@stats.stats[:total_records] = records_num
|
48
54
|
@stats.stats[:resolution_start] = Time.now
|
49
55
|
@stats.stats[:status] = :resolution
|
50
56
|
end
|
51
57
|
|
52
|
-
def
|
53
|
-
|
54
|
-
@
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
58
|
+
def process_results(data)
|
59
|
+
indices = []
|
60
|
+
@jobs.each_with_index do |job, i|
|
61
|
+
next if job.nil? || !job.complete?
|
62
|
+
with_log do
|
63
|
+
process_job(job)
|
64
|
+
indices << i
|
65
|
+
end
|
66
|
+
end
|
67
|
+
add_jobs(indices, data) unless indices.empty?
|
60
68
|
end
|
61
69
|
|
62
|
-
def
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
@processor.input[id] = { rank: row[:rank] }
|
68
|
-
str << "#{id}|#{row[:name]}"
|
69
|
-
end.join("\n")
|
70
|
+
def add_jobs(indices, data)
|
71
|
+
indices.each do |i|
|
72
|
+
batch = data.shift(@batch)
|
73
|
+
@jobs[i] = batch.empty? ? nil : create_job(batch)
|
74
|
+
end
|
70
75
|
end
|
71
76
|
|
72
|
-
def
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
77
|
+
# Consumes one completed job.
#
# A fulfilled job's value is unpacked as [results, batch data, stats]; the
# stats are folded into the running totals and the results are passed to
# the result processor. A job that did not fulfil is logged as an error
# together with its failure reason, so the underlying exception is not
# lost.
#
# @param job [#fulfilled?, #value, #reason] a completed job
def process_job(job)
  if job.fulfilled?
    results, current_data, stats = job.value
    update_stats(stats)
    @processor.process(results, current_data)
  else
    reason = job.reason
    # Keep the original message as the prefix; append the cause when known.
    detail = reason ? ": #{reason.class}: #{reason.message}" : ""
    GnCrossmap.logger.error("Remote resolver server failed#{detail}")
  end
end
|
81
86
|
|
82
|
-
|
87
|
+
# rubocop:disable Metrics/AbcSize
|
88
|
+
def update_stats(job_stats)
|
83
89
|
s = @stats.stats
|
84
90
|
s[:last_batches_time].shift if s[:last_batches_time].size > 2
|
85
|
-
s[:last_batches_time] <<
|
91
|
+
s[:last_batches_time] << job_stats.stats[:last_batches_time][0]
|
86
92
|
s[:resolution_span] = Time.now - s[:resolution_start]
|
93
|
+
s[:resolved_records] += job_stats.stats[:resolved_records]
|
94
|
+
s[:matches][7] += job_stats.stats[:matches][7]
|
87
95
|
end
|
96
|
+
# rubocop:enable all
|
88
97
|
|
89
|
-
def
|
90
|
-
names
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
98
|
+
def create_job(batch)
|
99
|
+
names, batch_data = collect_names(batch)
|
100
|
+
rb = ResolverJob.new(names, batch_data, @resolver_url, @ds_id)
|
101
|
+
Concurrent::Future.execute { rb.run }
|
102
|
+
end
|
103
|
+
|
104
|
+
def instance_vars_from_opts(opts)
|
105
|
+
@stats = opts.stats
|
106
|
+
@with_classification = opts.with_classification.freeze
|
107
|
+
@ds_id = opts.data_source_id.freeze
|
108
|
+
@resolver_url = opts.resolver_url.freeze
|
109
|
+
@threads = opts.threads
|
110
|
+
end
|
111
|
+
|
112
|
+
def collect_names(batch)
|
113
|
+
batch_data = {}
|
114
|
+
names = batch.each_with_object([]) do |row, str|
|
115
|
+
id = row[:id].strip
|
116
|
+
batch_data[id] = row[:original]
|
117
|
+
@processor.input[id] = { rank: row[:rank] }
|
118
|
+
str << "#{id}|#{row[:name]}"
|
119
|
+
end.join("\n")
|
120
|
+
[names, batch_data]
|
100
121
|
end
|
101
122
|
|
102
|
-
def
|
103
|
-
@
|
104
|
-
@
|
105
|
-
|
123
|
+
def with_log
|
124
|
+
s = @count + 1
|
125
|
+
@count += @batch
|
126
|
+
e = [@count, @stats.stats[:total_records]].min
|
127
|
+
GnCrossmap.log("Resolve #{s}-#{e} out of " \
|
128
|
+
"#{@stats.stats[:total_records]} records at " \
|
129
|
+
"#{@resolver_url}")
|
130
|
+
yield
|
106
131
|
end
|
107
132
|
end
|
108
133
|
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module GnCrossmap
  # Remote resolution for parallel jobs.
  #
  # One instance handles one batch: it posts the batch to the resolver and,
  # if that request fails, retries the names one by one. Each instance keeps
  # its own Stats object, returned from #run alongside the results.
  class ResolverJob
    # @param names [String] batch of name entries separated by "\n"
    # @param batch_data [Object] data for this batch, returned untouched
    #   from #run
    # @param resolver_url [String] URL the names are posted to
    # @param ds_id [Object] value sent as `data_source_ids`
    def initialize(names, batch_data, resolver_url, ds_id)
      @names = names
      @batch_data = batch_data
      @resolver_url = resolver_url
      @ds_id = ds_id
      @stats = Stats.new
    end

    # Resolves the batch.
    #
    # @return [Array] `[response_bodies, batch_data, stats]`
    def run
      res = remote_resolve(@names)
      [res, @batch_data, @stats]
    end

    private

    # Posts the whole batch in one request. On a RestClient failure it falls
    # back to resolving the names one at a time. The elapsed time is recorded
    # in either case.
    #
    # @return [Array<String>] response bodies
    def remote_resolve(names)
      batch_start = Time.now
      res = RestClient.post(@resolver_url, data: names,
                                           data_source_ids: @ds_id)
      [res.body]
    rescue RestClient::Exception
      single_remote_resolve(names)
    ensure
      stats_add_batch_time(batch_start)
    end

    # Resolves each "\n"-separated entry with its own request and collects
    # the bodies of the requests that succeeded.
    def single_remote_resolve(names)
      all_res = []
      names.split("\n").each do |name|
        res = single_post(name)
        next unless res
        all_res << res.body
      end
      all_res
    end

    # Posts a single entry. Takes +name+ as a parameter: the caller passes
    # it and the body uses it, so a zero-argument definition would raise
    # ArgumentError on every fallback.
    #
    # @return the response, or nil when the request failed
    def single_post(name)
      RestClient.post(@resolver_url, data: name,
                                     data_source_ids: @ds_id)
    rescue RestClient::Exception => e
      process_resolver_error(e, name)
      nil
    end

    # Counts the failed name in this job's stats and logs the error.
    # NOTE(review): bucket 7 of :matches presumably denotes resolver
    # errors; confirm against Stats.
    def process_resolver_error(err, name)
      @stats.stats[:matches][7] += 1
      @stats.stats[:resolved_records] += 1
      GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
    end

    # Appends the seconds elapsed since +batch_start+ to the job stats.
    def stats_add_batch_time(batch_start)
      @stats.stats[:last_batches_time] << Time.now - batch_start
    end
  end
end
|
@@ -13,11 +13,13 @@ module GnCrossmap
|
|
13
13
|
@input = {}
|
14
14
|
end
|
15
15
|
|
16
|
-
def process(
|
16
|
+
def process(results, original_data)
|
17
17
|
@original_data = original_data
|
18
|
-
|
19
|
-
|
20
|
-
|
18
|
+
results.each do |result|
|
19
|
+
res = rubyfy(result)
|
20
|
+
res[:data].each do |d|
|
21
|
+
d[:results].nil? ? write_empty_result(d) : write_result(d)
|
22
|
+
end
|
21
23
|
end
|
22
24
|
end
|
23
25
|
|
data/lib/gn_crossmap/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gn_crossmap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-09-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: biodiversity
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '3.1'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: concurrent-ruby
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: gn_uuid
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,14 +114,14 @@ dependencies:
|
|
100
114
|
requirements:
|
101
115
|
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '9.
|
117
|
+
version: '9.1'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '9.
|
124
|
+
version: '9.1'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: coveralls
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -194,6 +208,7 @@ files:
|
|
194
208
|
- lib/gn_crossmap/errors.rb
|
195
209
|
- lib/gn_crossmap/reader.rb
|
196
210
|
- lib/gn_crossmap/resolver.rb
|
211
|
+
- lib/gn_crossmap/resolver_job.rb
|
197
212
|
- lib/gn_crossmap/result_processor.rb
|
198
213
|
- lib/gn_crossmap/sci_name_collector.rb
|
199
214
|
- lib/gn_crossmap/stats.rb
|