gn_crossmap 1.1.0 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +13 -8
- data/lib/gn_crossmap.rb +9 -3
- data/lib/gn_crossmap/reader.rb +15 -4
- data/lib/gn_crossmap/resolver.rb +21 -16
- data/lib/gn_crossmap/result_processor.rb +4 -4
- data/lib/gn_crossmap/stats.rb +25 -0
- data/lib/gn_crossmap/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f5a97cd143dd1e3bd3940b6457dd4b1f824236c7
|
4
|
+
data.tar.gz: 12a4ed179bdadaccacf34fe130300e77147d4c35
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5cbcc463342999b5f9c9e7dcc85f031783f3e4d9e31c8a8a4152bd64dddae086f5f82e1bbcef44f14fdd8fd3721c819d46ae892c7b02cdc256c3728b8359c1f8
|
7
|
+
data.tar.gz: 4d5df19d77f565ab6ad268f9106e93a8aafe6d2de0b6baff7906d78bc469a50a583c29423a611b406612a6355f1aefe106ec16ad0e7653e31f0d5c9fe6ef6ed5
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -104,7 +104,7 @@ block:
|
|
104
104
|
```ruby
|
105
105
|
GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true) do |stats|
|
106
106
|
puts stats
|
107
|
-
|
107
|
+
puts "Matches:"
|
108
108
|
stats[:matches].each do |key, value|
|
109
109
|
puts "#{GnCrossmap::MATCH_TYPES[key]}: #{value}"
|
110
110
|
end
|
@@ -113,13 +113,18 @@ end
|
|
113
113
|
|
114
114
|
#### Intermediate stats format
|
115
115
|
|
116
|
-
|Field
|
117
|
-
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
116
|
+
|Field |Description |
|
117
|
+
|------------------|---------------------------------------------------------|
|
118
|
+
|status |current phase: (init, ingestion, resolution) |
|
119
|
+
|total_records |total number of names in original list |
|
120
|
+
|ingestion_start |time when the reading from csv started |
|
121
|
+
|ingestion_span |time elapsed since ingestion_start at the latest csv reading checkpoint |
|
122
|
+
|ingested_records |number of ingested records at an intermediate checkpoint |
|
123
|
+
|resolution_start |time when resolution of names started |
|
124
|
+
|resolution_span |time elapsed since resolution_start at the latest resolved batch |
|
125
|
+
|resolved_records |number of names already processed |
|
126
|
+
|last_batches_time |times required to process the most recent batches of names (up to 3) |
|
127
|
+
|matches |Distribution of processed data by match type (see below) |
|
123
128
|
|
124
129
|
#### Match types
|
125
130
|
|
data/lib/gn_crossmap.rb
CHANGED
@@ -13,6 +13,7 @@ require "gn_crossmap/column_collector"
|
|
13
13
|
require "gn_crossmap/sci_name_collector"
|
14
14
|
require "gn_crossmap/resolver"
|
15
15
|
require "gn_crossmap/result_processor"
|
16
|
+
require "gn_crossmap/stats"
|
16
17
|
|
17
18
|
# Namespace module for crossmapping checklists with GN sources
|
18
19
|
module GnCrossmap
|
@@ -32,17 +33,22 @@ module GnCrossmap
|
|
32
33
|
class << self
|
33
34
|
attr_writer :logger
|
34
35
|
|
36
|
+
# rubocop:disable Metrics/AbcSize
|
37
|
+
|
35
38
|
def run(input, output, data_source_id, skip_original)
|
39
|
+
stats = Stats.new
|
36
40
|
input_io, output_io = io(input, output)
|
37
|
-
reader = Reader.new(input_io, input_name(input), skip_original)
|
38
|
-
data = reader.read
|
41
|
+
reader = Reader.new(input_io, input_name(input), skip_original, stats)
|
42
|
+
data = block_given? ? reader.read(&Proc.new) : reader.read
|
39
43
|
writer = Writer.new(output_io, reader.original_fields,
|
40
44
|
output_name(output))
|
41
|
-
resolver = Resolver.new(writer, data_source_id)
|
45
|
+
resolver = Resolver.new(writer, data_source_id, stats)
|
42
46
|
block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
|
43
47
|
output
|
44
48
|
end
|
45
49
|
|
50
|
+
# rubocop:enable all
|
51
|
+
|
46
52
|
def logger
|
47
53
|
@logger ||= Logger.new(STDERR)
|
48
54
|
end
|
data/lib/gn_crossmap/reader.rb
CHANGED
@@ -4,7 +4,8 @@ module GnCrossmap
|
|
4
4
|
class Reader
|
5
5
|
attr_reader :original_fields
|
6
6
|
|
7
|
-
def initialize(csv_io, input_name, skip_original)
|
7
|
+
def initialize(csv_io, input_name, skip_original, stats)
|
8
|
+
@stats = stats
|
8
9
|
@csv_io = csv_io
|
9
10
|
@col_sep = col_sep
|
10
11
|
@original_fields = nil
|
@@ -13,14 +14,17 @@ module GnCrossmap
|
|
13
14
|
end
|
14
15
|
|
15
16
|
def read
|
17
|
+
@stats.stats[:ingestion_start] = Time.now
|
18
|
+
@stats.stats[:status] = :ingestion
|
16
19
|
GnCrossmap.log("Read input from #{@input_name}")
|
17
|
-
parse_input
|
20
|
+
block_given? ? parse_input(&Proc.new) : parse_input
|
18
21
|
end
|
19
22
|
|
20
23
|
private
|
21
24
|
|
22
25
|
def col_sep
|
23
26
|
line = @csv_io.first
|
27
|
+
@stats.stats[:total_records] = @csv_io.readlines.size
|
24
28
|
@csv_io.rewind
|
25
29
|
[";", ",", "\t"].map { |s| [line.count(s), s] }.sort.last.last
|
26
30
|
end
|
@@ -30,13 +34,20 @@ module GnCrossmap
|
|
30
34
|
csv = CSV.new(@csv_io, col_sep: col_sep)
|
31
35
|
csv.each_with_index do |row, i|
|
32
36
|
@original_fields = headers(row) if @original_fields.nil?
|
33
|
-
i
|
34
|
-
GnCrossmap.log("Ingesting #{i}th csv row") if (i % 10_000).zero?
|
37
|
+
yield @stats.stats if log_progress(i) && block_given?
|
35
38
|
dc.process_row(row)
|
36
39
|
end && @csv_io.close
|
37
40
|
dc.data
|
38
41
|
end
|
39
42
|
|
43
|
+
def log_progress(count)
|
44
|
+
return false unless (count % 10_000).zero?
|
45
|
+
GnCrossmap.log("Ingesting csv row #{count + 1}")
|
46
|
+
@stats.stats[:ingested_records] = count + 1
|
47
|
+
@stats.stats[:ingestion_span] = Time.now - @stats.stats[:ingestion_start]
|
48
|
+
true
|
49
|
+
end
|
50
|
+
|
40
51
|
def headers(row)
|
41
52
|
hdrs = row.dup
|
42
53
|
@skip_original ? taxon_id_header(hdrs) : hdrs
|
data/lib/gn_crossmap/resolver.rb
CHANGED
@@ -3,9 +3,8 @@ module GnCrossmap
|
|
3
3
|
class Resolver
|
4
4
|
URL = "http://resolver.globalnames.org/name_resolvers.json".freeze
|
5
5
|
|
6
|
-
def initialize(writer, data_source_id)
|
7
|
-
@stats =
|
8
|
-
matches: match_types }
|
6
|
+
def initialize(writer, data_source_id, stats)
|
7
|
+
@stats = stats
|
9
8
|
@processor = GnCrossmap::ResultProcessor.new(writer, @stats)
|
10
9
|
@ds_id = data_source_id
|
11
10
|
@count = 0
|
@@ -14,13 +13,12 @@ module GnCrossmap
|
|
14
13
|
end
|
15
14
|
|
16
15
|
def resolve(data)
|
17
|
-
|
18
|
-
@stats[:start_time] = Time.now
|
16
|
+
update_stats(data.size)
|
19
17
|
data.each_slice(@batch) do |slice|
|
20
18
|
with_log do
|
21
19
|
names = collect_names(slice)
|
22
20
|
remote_resolve(names)
|
23
|
-
yield(@stats) if block_given?
|
21
|
+
yield(@stats.stats) if block_given?
|
24
22
|
end
|
25
23
|
end
|
26
24
|
@processor.writer.close
|
@@ -28,18 +26,18 @@ module GnCrossmap
|
|
28
26
|
|
29
27
|
private
|
30
28
|
|
31
|
-
def
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
29
|
+
def update_stats(records_num)
|
30
|
+
@stats.stats[:total_records] = records_num
|
31
|
+
@stats.stats[:resolution_start] = Time.now
|
32
|
+
@stats.stats[:status] = :resolution
|
36
33
|
end
|
37
34
|
|
38
35
|
def with_log
|
39
36
|
s = @count + 1
|
40
37
|
@count += @batch
|
41
|
-
e = [@count, @stats[:
|
42
|
-
GnCrossmap.log("Resolve #{s}-#{e} out of
|
38
|
+
e = [@count, @stats.stats[:total_records]].min
|
39
|
+
GnCrossmap.log("Resolve #{s}-#{e} out of " \
|
40
|
+
"#{@stats.stats[:total_records]} records")
|
43
41
|
yield
|
44
42
|
end
|
45
43
|
|
@@ -59,7 +57,14 @@ module GnCrossmap
|
|
59
57
|
rescue RestClient::Exception
|
60
58
|
single_remote_resolve(names)
|
61
59
|
ensure
|
62
|
-
|
60
|
+
update_batch_times(batch_start)
|
61
|
+
end
|
62
|
+
|
63
|
+
def update_batch_times(batch_start)
|
64
|
+
s = @stats.stats
|
65
|
+
s[:last_batches_time].shift if s[:last_batches_time].size > 2
|
66
|
+
s[:last_batches_time] << Time.now - batch_start
|
67
|
+
s[:resolution_span] = Time.now - s[:resolution_start]
|
63
68
|
end
|
64
69
|
|
65
70
|
def single_remote_resolve(names)
|
@@ -75,8 +80,8 @@ module GnCrossmap
|
|
75
80
|
end
|
76
81
|
|
77
82
|
def process_resolver_error(err, name)
|
78
|
-
@stats[:matches][7] += 1
|
79
|
-
@stats[:
|
83
|
+
@stats.stats[:matches][7] += 1
|
84
|
+
@stats.stats[:resolved_records] += 1
|
80
85
|
GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
|
81
86
|
end
|
82
87
|
end
|
@@ -24,8 +24,8 @@ module GnCrossmap
|
|
24
24
|
end
|
25
25
|
|
26
26
|
def write_empty_result(datum)
|
27
|
-
@stats[:matches][0] += 1
|
28
|
-
@stats[:
|
27
|
+
@stats.stats[:matches][0] += 1
|
28
|
+
@stats.stats[:resolved_records] += 1
|
29
29
|
res = @original_data[datum[:supplied_id]]
|
30
30
|
res += [GnCrossmap::MATCH_TYPES[0], datum[:supplied_name_string], nil,
|
31
31
|
nil, @input[datum[:supplied_id]][:rank], nil,
|
@@ -42,8 +42,8 @@ module GnCrossmap
|
|
42
42
|
|
43
43
|
def collect_stats(datum)
|
44
44
|
match_num = datum[:results].map { |d| d[:match_type] }.min
|
45
|
-
@stats[:matches][match_num] += 1
|
46
|
-
@stats[:
|
45
|
+
@stats.stats[:matches][match_num] += 1
|
46
|
+
@stats.stats[:resolved_records] += 1
|
47
47
|
end
|
48
48
|
|
49
49
|
def compile_result(datum, result)
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module GnCrossmap
|
4
|
+
# Collects statistics about crossmapping process
|
5
|
+
class Stats
|
6
|
+
attr_accessor :stats
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@stats = { status: :init, total_records: 0, ingested_records: 0,
|
10
|
+
resolved_records: 0, ingestion_span: nil,
|
11
|
+
resolution_span: nil, ingestion_start: nil,
|
12
|
+
resolution_start: nil, last_batches_time: [],
|
13
|
+
matches: match_types }
|
14
|
+
end
|
15
|
+
|
16
|
+
private
|
17
|
+
|
18
|
+
def match_types
|
19
|
+
matches = GnCrossmap::MATCH_TYPES.keys
|
20
|
+
matches.each_with_object({}) do |key, obj|
|
21
|
+
obj[key] = 0
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/lib/gn_crossmap/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gn_crossmap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-11-
|
11
|
+
date: 2016-11-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: trollop
|
@@ -181,6 +181,7 @@ files:
|
|
181
181
|
- lib/gn_crossmap/resolver.rb
|
182
182
|
- lib/gn_crossmap/result_processor.rb
|
183
183
|
- lib/gn_crossmap/sci_name_collector.rb
|
184
|
+
- lib/gn_crossmap/stats.rb
|
184
185
|
- lib/gn_crossmap/version.rb
|
185
186
|
- lib/gn_crossmap/writer.rb
|
186
187
|
homepage: https://github.com/GlobalNamesArchitecture/gn_crossmap
|