gn_crossmap 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +13 -8
- data/lib/gn_crossmap.rb +9 -3
- data/lib/gn_crossmap/reader.rb +15 -4
- data/lib/gn_crossmap/resolver.rb +21 -16
- data/lib/gn_crossmap/result_processor.rb +4 -4
- data/lib/gn_crossmap/stats.rb +25 -0
- data/lib/gn_crossmap/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f5a97cd143dd1e3bd3940b6457dd4b1f824236c7
|
4
|
+
data.tar.gz: 12a4ed179bdadaccacf34fe130300e77147d4c35
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5cbcc463342999b5f9c9e7dcc85f031783f3e4d9e31c8a8a4152bd64dddae086f5f82e1bbcef44f14fdd8fd3721c819d46ae892c7b02cdc256c3728b8359c1f8
|
7
|
+
data.tar.gz: 4d5df19d77f565ab6ad268f9106e93a8aafe6d2de0b6baff7906d78bc469a50a583c29423a611b406612a6355f1aefe106ec16ad0e7653e31f0d5c9fe6ef6ed5
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -104,7 +104,7 @@ block:
|
|
104
104
|
```ruby
|
105
105
|
GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true) do |stats|
|
106
106
|
puts stats
|
107
|
-
|
107
|
+
puts "Matches:"
|
108
108
|
stats[:matches].each do |key, value|
|
109
109
|
puts "#{GnCrossmap::MATCH_TYPES[key]}: #{value}"
|
110
110
|
end
|
@@ -113,13 +113,18 @@ end
|
|
113
113
|
|
114
114
|
#### Intermediate stats format
|
115
115
|
|
116
|
-
|Field
|
117
|
-
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
116
|
+
|Field |Description |
|
117
|
+
|------------------|---------------------------------------------------------|
|
118
|
+
|status |current phase: (init, ingestion, resolution) |
|
119
|
+
|total_records |total number of names in original list |
|
120
|
+
|ingestion_start |time when the reading from csv started |
|
121
|
+
|ingestion_span |time of intermediate checkpoint of reading csv |
|
122
|
+
|ingested_records |number of ingested records at an intermediate checkpoint |
|
123
|
+
|resolution_start |time when resolution of names started |
|
124
|
+
|resolution_span |time of intermediate checkpoint of resolving names |
|
125
|
+
|resolved_records |number of names already processed |
|
126
|
+
|last_batches_time |time required to process the last batch of names |
|
127
|
+
|matches |Distribution of processed data by match type (see below) |
|
123
128
|
|
124
129
|
#### Match types
|
125
130
|
|
data/lib/gn_crossmap.rb
CHANGED
@@ -13,6 +13,7 @@ require "gn_crossmap/column_collector"
|
|
13
13
|
require "gn_crossmap/sci_name_collector"
|
14
14
|
require "gn_crossmap/resolver"
|
15
15
|
require "gn_crossmap/result_processor"
|
16
|
+
require "gn_crossmap/stats"
|
16
17
|
|
17
18
|
# Namespace module for crossmapping checklists with GN sources
|
18
19
|
module GnCrossmap
|
@@ -32,17 +33,22 @@ module GnCrossmap
|
|
32
33
|
class << self
|
33
34
|
attr_writer :logger
|
34
35
|
|
36
|
+
# rubocop:disable Metrics/AbcSize
|
37
|
+
|
35
38
|
def run(input, output, data_source_id, skip_original)
|
39
|
+
stats = Stats.new
|
36
40
|
input_io, output_io = io(input, output)
|
37
|
-
reader = Reader.new(input_io, input_name(input), skip_original)
|
38
|
-
data = reader.read
|
41
|
+
reader = Reader.new(input_io, input_name(input), skip_original, stats)
|
42
|
+
data = block_given? ? reader.read(&Proc.new) : reader.read
|
39
43
|
writer = Writer.new(output_io, reader.original_fields,
|
40
44
|
output_name(output))
|
41
|
-
resolver = Resolver.new(writer, data_source_id)
|
45
|
+
resolver = Resolver.new(writer, data_source_id, stats)
|
42
46
|
block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
|
43
47
|
output
|
44
48
|
end
|
45
49
|
|
50
|
+
# rubocop:enable all
|
51
|
+
|
46
52
|
def logger
|
47
53
|
@logger ||= Logger.new(STDERR)
|
48
54
|
end
|
data/lib/gn_crossmap/reader.rb
CHANGED
@@ -4,7 +4,8 @@ module GnCrossmap
|
|
4
4
|
class Reader
|
5
5
|
attr_reader :original_fields
|
6
6
|
|
7
|
-
def initialize(csv_io, input_name, skip_original)
|
7
|
+
def initialize(csv_io, input_name, skip_original, stats)
|
8
|
+
@stats = stats
|
8
9
|
@csv_io = csv_io
|
9
10
|
@col_sep = col_sep
|
10
11
|
@original_fields = nil
|
@@ -13,14 +14,17 @@ module GnCrossmap
|
|
13
14
|
end
|
14
15
|
|
15
16
|
def read
|
17
|
+
@stats.stats[:ingestion_start] = Time.now
|
18
|
+
@stats.stats[:status] = :ingestion
|
16
19
|
GnCrossmap.log("Read input from #{@input_name}")
|
17
|
-
parse_input
|
20
|
+
block_given? ? parse_input(&Proc.new) : parse_input
|
18
21
|
end
|
19
22
|
|
20
23
|
private
|
21
24
|
|
22
25
|
def col_sep
|
23
26
|
line = @csv_io.first
|
27
|
+
@stats.stats[:total_records] = @csv_io.readlines.size
|
24
28
|
@csv_io.rewind
|
25
29
|
[";", ",", "\t"].map { |s| [line.count(s), s] }.sort.last.last
|
26
30
|
end
|
@@ -30,13 +34,20 @@ module GnCrossmap
|
|
30
34
|
csv = CSV.new(@csv_io, col_sep: col_sep)
|
31
35
|
csv.each_with_index do |row, i|
|
32
36
|
@original_fields = headers(row) if @original_fields.nil?
|
33
|
-
i
|
34
|
-
GnCrossmap.log("Ingesting #{i}th csv row") if (i % 10_000).zero?
|
37
|
+
yield @stats.stats if log_progress(i) && block_given?
|
35
38
|
dc.process_row(row)
|
36
39
|
end && @csv_io.close
|
37
40
|
dc.data
|
38
41
|
end
|
39
42
|
|
43
|
+
def log_progress(count)
|
44
|
+
return false unless (count % 10_000).zero?
|
45
|
+
GnCrossmap.log("Ingesting csv row #{count + 1}")
|
46
|
+
@stats.stats[:ingested_records] = count + 1
|
47
|
+
@stats.stats[:ingestion_span] = Time.now - @stats.stats[:ingestion_start]
|
48
|
+
true
|
49
|
+
end
|
50
|
+
|
40
51
|
def headers(row)
|
41
52
|
hdrs = row.dup
|
42
53
|
@skip_original ? taxon_id_header(hdrs) : hdrs
|
data/lib/gn_crossmap/resolver.rb
CHANGED
@@ -3,9 +3,8 @@ module GnCrossmap
|
|
3
3
|
class Resolver
|
4
4
|
URL = "http://resolver.globalnames.org/name_resolvers.json".freeze
|
5
5
|
|
6
|
-
def initialize(writer, data_source_id)
|
7
|
-
@stats =
|
8
|
-
matches: match_types }
|
6
|
+
def initialize(writer, data_source_id, stats)
|
7
|
+
@stats = stats
|
9
8
|
@processor = GnCrossmap::ResultProcessor.new(writer, @stats)
|
10
9
|
@ds_id = data_source_id
|
11
10
|
@count = 0
|
@@ -14,13 +13,12 @@ module GnCrossmap
|
|
14
13
|
end
|
15
14
|
|
16
15
|
def resolve(data)
|
17
|
-
|
18
|
-
@stats[:start_time] = Time.now
|
16
|
+
update_stats(data.size)
|
19
17
|
data.each_slice(@batch) do |slice|
|
20
18
|
with_log do
|
21
19
|
names = collect_names(slice)
|
22
20
|
remote_resolve(names)
|
23
|
-
yield(@stats) if block_given?
|
21
|
+
yield(@stats.stats) if block_given?
|
24
22
|
end
|
25
23
|
end
|
26
24
|
@processor.writer.close
|
@@ -28,18 +26,18 @@ module GnCrossmap
|
|
28
26
|
|
29
27
|
private
|
30
28
|
|
31
|
-
def
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
29
|
+
def update_stats(records_num)
|
30
|
+
@stats.stats[:total_records] = records_num
|
31
|
+
@stats.stats[:resolution_start] = Time.now
|
32
|
+
@stats.stats[:status] = :resolution
|
36
33
|
end
|
37
34
|
|
38
35
|
def with_log
|
39
36
|
s = @count + 1
|
40
37
|
@count += @batch
|
41
|
-
e = [@count, @stats[:
|
42
|
-
GnCrossmap.log("Resolve #{s}-#{e} out of
|
38
|
+
e = [@count, @stats.stats[:total_records]].min
|
39
|
+
GnCrossmap.log("Resolve #{s}-#{e} out of " \
|
40
|
+
"#{@stats.stats[:total_records]} records")
|
43
41
|
yield
|
44
42
|
end
|
45
43
|
|
@@ -59,7 +57,14 @@ module GnCrossmap
|
|
59
57
|
rescue RestClient::Exception
|
60
58
|
single_remote_resolve(names)
|
61
59
|
ensure
|
62
|
-
|
60
|
+
update_batch_times(batch_start)
|
61
|
+
end
|
62
|
+
|
63
|
+
def update_batch_times(batch_start)
|
64
|
+
s = @stats.stats
|
65
|
+
s[:last_batches_time].shift if s[:last_batches_time].size > 2
|
66
|
+
s[:last_batches_time] << Time.now - batch_start
|
67
|
+
s[:resolution_span] = Time.now - s[:resolution_start]
|
63
68
|
end
|
64
69
|
|
65
70
|
def single_remote_resolve(names)
|
@@ -75,8 +80,8 @@ module GnCrossmap
|
|
75
80
|
end
|
76
81
|
|
77
82
|
def process_resolver_error(err, name)
|
78
|
-
@stats[:matches][7] += 1
|
79
|
-
@stats[:
|
83
|
+
@stats.stats[:matches][7] += 1
|
84
|
+
@stats.stats[:resolved_records] += 1
|
80
85
|
GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
|
81
86
|
end
|
82
87
|
end
|
@@ -24,8 +24,8 @@ module GnCrossmap
|
|
24
24
|
end
|
25
25
|
|
26
26
|
def write_empty_result(datum)
|
27
|
-
@stats[:matches][0] += 1
|
28
|
-
@stats[:
|
27
|
+
@stats.stats[:matches][0] += 1
|
28
|
+
@stats.stats[:resolved_records] += 1
|
29
29
|
res = @original_data[datum[:supplied_id]]
|
30
30
|
res += [GnCrossmap::MATCH_TYPES[0], datum[:supplied_name_string], nil,
|
31
31
|
nil, @input[datum[:supplied_id]][:rank], nil,
|
@@ -42,8 +42,8 @@ module GnCrossmap
|
|
42
42
|
|
43
43
|
def collect_stats(datum)
|
44
44
|
match_num = datum[:results].map { |d| d[:match_type] }.min
|
45
|
-
@stats[:matches][match_num] += 1
|
46
|
-
@stats[:
|
45
|
+
@stats.stats[:matches][match_num] += 1
|
46
|
+
@stats.stats[:resolved_records] += 1
|
47
47
|
end
|
48
48
|
|
49
49
|
def compile_result(datum, result)
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module GnCrossmap
|
4
|
+
# Collects statistics about the crossmapping process
|
5
|
+
class Stats
|
6
|
+
attr_accessor :stats
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@stats = { status: :init, total_records: 0, ingested_records: 0,
|
10
|
+
resolved_records: 0, ingestion_span: nil,
|
11
|
+
resolution_span: nil, ingestion_start: nil,
|
12
|
+
resolution_start: nil, last_batches_time: [],
|
13
|
+
matches: match_types }
|
14
|
+
end
|
15
|
+
|
16
|
+
private
|
17
|
+
|
18
|
+
def match_types
|
19
|
+
matches = GnCrossmap::MATCH_TYPES.keys
|
20
|
+
matches.each_with_object({}) do |key, obj|
|
21
|
+
obj[key] = 0
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/lib/gn_crossmap/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gn_crossmap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-11-
|
11
|
+
date: 2016-11-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: trollop
|
@@ -181,6 +181,7 @@ files:
|
|
181
181
|
- lib/gn_crossmap/resolver.rb
|
182
182
|
- lib/gn_crossmap/result_processor.rb
|
183
183
|
- lib/gn_crossmap/sci_name_collector.rb
|
184
|
+
- lib/gn_crossmap/stats.rb
|
184
185
|
- lib/gn_crossmap/version.rb
|
185
186
|
- lib/gn_crossmap/writer.rb
|
186
187
|
homepage: https://github.com/GlobalNamesArchitecture/gn_crossmap
|