gn_crossmap 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +38 -0
- data/lib/gn_crossmap.rb +12 -1
- data/lib/gn_crossmap/resolver.rb +27 -7
- data/lib/gn_crossmap/result_processor.rb +13 -13
- data/lib/gn_crossmap/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 215ff795d273c98f6c25eab1e93ac32d21b3e1be
|
4
|
+
data.tar.gz: 5fd8159546705e355fe662ddb6f0b136edc36ce4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9e467719f0be4a7c7a2293e9ec78dbdbf54b8dfc8198a4e1f12417a239cddf982da495434006c3b40d5386b58896b18bbedff68d358fced0b4f1d06d038ed9c4
|
7
|
+
data.tar.gz: c41b99ddafe9479773198e900e2dcd07f83d19ebe9929a3b9b228b0d10abad475b85429301f108fdc94ddb6529b91ebfc6921ac42215dbacfb03aacdbeb822fa
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -98,6 +98,44 @@ GnCrossmap.logger = MyCustomLogger.new
|
|
98
98
|
GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true)
|
99
99
|
```
|
100
100
|
|
101
|
+
If you want to get intermediate statistics for each resolution cycle, use a
|
102
|
+
block:
|
103
|
+
|
104
|
+
```ruby
|
105
|
+
GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true) do |stats|
|
106
|
+
puts stats
|
107
|
+
puts "Matches:"
|
108
|
+
stats[:matches].each do |key, value|
|
109
|
+
puts "#{GnCrossmap::MATCH_TYPES[key]}: #{value}"
|
110
|
+
end
|
111
|
+
end
|
112
|
+
```
|
113
|
+
|
114
|
+
#### Intermediate stats format
|
115
|
+
|
116
|
+
|Field | Description |
|
117
|
+
|----------------|---------------------------------------------------------|
|
118
|
+
|total | total number of names in original list |
|
119
|
+
|current | number of names already processed |
|
120
|
+
|start_time | Start of resolution |
|
121
|
+
|last_batch_time | time span of the last batch processing |
|
122
|
+
|matches | Distribution of processed data by match type (see below)|
|
123
|
+
|
124
|
+
#### Match types
|
125
|
+
|
126
|
+
The match types dictionary can be accessed via the `GnCrossmap::MATCH_TYPES` constant.
|
127
|
+
|
128
|
+
| Match code | Match type |
|
129
|
+
|------------|----------------------------------|
|
130
|
+
|0 |No match |
|
131
|
+
|1 |Exact string match |
|
132
|
+
|2 |Canonical form exact match |
|
133
|
+
|3 |Canonical form fuzzy match |
|
134
|
+
|4 |Partial canonical form match |
|
135
|
+
|5 |Partial canonical form fuzzy match|
|
136
|
+
|6 |Genus part match |
|
137
|
+
|7 |Error in matching |
|
138
|
+
|
101
139
|
### Input file format
|
102
140
|
|
103
141
|
- Comma Separated File with names of fields in first row.
|
data/lib/gn_crossmap.rb
CHANGED
@@ -18,6 +18,16 @@ require "gn_crossmap/result_processor"
|
|
18
18
|
module GnCrossmap
|
19
19
|
INPUT_MODE = "r:utf-8".freeze
|
20
20
|
OUTPUT_MODE = "w:utf-8".freeze
|
21
|
+
MATCH_TYPES = {
|
22
|
+
0 => "No match",
|
23
|
+
1 => "Exact string match",
|
24
|
+
2 => "Canonical form exact match",
|
25
|
+
3 => "Canonical form fuzzy match",
|
26
|
+
4 => "Partial canonical form match",
|
27
|
+
5 => "Partial canonical form fuzzy match",
|
28
|
+
6 => "Genus part match",
|
29
|
+
7 => "Error in matching"
|
30
|
+
}.freeze
|
21
31
|
|
22
32
|
class << self
|
23
33
|
attr_writer :logger
|
@@ -28,7 +38,8 @@ module GnCrossmap
|
|
28
38
|
data = reader.read
|
29
39
|
writer = Writer.new(output_io, reader.original_fields,
|
30
40
|
output_name(output))
|
31
|
-
Resolver.new(writer, data_source_id)
|
41
|
+
resolver = Resolver.new(writer, data_source_id)
|
42
|
+
block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
|
32
43
|
output
|
33
44
|
end
|
34
45
|
|
data/lib/gn_crossmap/resolver.rb
CHANGED
@@ -4,7 +4,9 @@ module GnCrossmap
|
|
4
4
|
URL = "http://resolver.globalnames.org/name_resolvers.json".freeze
|
5
5
|
|
6
6
|
def initialize(writer, data_source_id)
|
7
|
-
@
|
7
|
+
@stats = { total: 0, current: 0, start_time: nil, last_batch_time: nil,
|
8
|
+
matches: match_types }
|
9
|
+
@processor = GnCrossmap::ResultProcessor.new(writer, @stats)
|
8
10
|
@ds_id = data_source_id
|
9
11
|
@count = 0
|
10
12
|
@current_data = {}
|
@@ -12,11 +14,13 @@ module GnCrossmap
|
|
12
14
|
end
|
13
15
|
|
14
16
|
def resolve(data)
|
15
|
-
|
17
|
+
@stats[:total] = data.size
|
18
|
+
@stats[:start_time] = Time.now
|
16
19
|
data.each_slice(@batch) do |slice|
|
17
|
-
with_log
|
20
|
+
with_log do
|
18
21
|
names = collect_names(slice)
|
19
22
|
remote_resolve(names)
|
23
|
+
yield(@stats) if block_given?
|
20
24
|
end
|
21
25
|
end
|
22
26
|
@processor.writer.close
|
@@ -24,11 +28,18 @@ module GnCrossmap
|
|
24
28
|
|
25
29
|
private
|
26
30
|
|
27
|
-
def
|
31
|
+
def match_types
|
32
|
+
matches = GnCrossmap::MATCH_TYPES.keys
|
33
|
+
matches.each_with_object({}) do |key, obj|
|
34
|
+
obj[key] = 0
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def with_log
|
28
39
|
s = @count + 1
|
29
40
|
@count += @batch
|
30
|
-
e = [@count,
|
31
|
-
GnCrossmap.log("Resolve #{s}-#{e} out of #{
|
41
|
+
e = [@count, @stats[:total]].min
|
42
|
+
GnCrossmap.log("Resolve #{s}-#{e} out of #{@stats[:total]} records")
|
32
43
|
yield
|
33
44
|
end
|
34
45
|
|
@@ -42,10 +53,13 @@ module GnCrossmap
|
|
42
53
|
end
|
43
54
|
|
44
55
|
def remote_resolve(names)
|
56
|
+
batch_start = Time.now
|
45
57
|
res = RestClient.post(URL, data: names, data_source_ids: @ds_id)
|
46
58
|
@processor.process(res, @current_data)
|
47
59
|
rescue RestClient::Exception
|
48
60
|
single_remote_resolve(names)
|
61
|
+
ensure
|
62
|
+
@stats[:last_batch_time] = Time.now - batch_start
|
49
63
|
end
|
50
64
|
|
51
65
|
def single_remote_resolve(names)
|
@@ -54,10 +68,16 @@ module GnCrossmap
|
|
54
68
|
res = RestClient.post(URL, data: name, data_source_ids: @ds_id)
|
55
69
|
@processor.process(res, @current_data)
|
56
70
|
rescue RestClient::Exception => e
|
57
|
-
|
71
|
+
process_resolver_error(e, name)
|
58
72
|
next
|
59
73
|
end
|
60
74
|
end
|
61
75
|
end
|
76
|
+
|
77
|
+
def process_resolver_error(err, name)
|
78
|
+
@stats[:matches][7] += 1
|
79
|
+
@stats[:current] += 1
|
80
|
+
GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
|
81
|
+
end
|
62
82
|
end
|
63
83
|
end
|
@@ -1,19 +1,10 @@
|
|
1
1
|
module GnCrossmap
|
2
2
|
# Processes data received from the GN Resolver
|
3
3
|
class ResultProcessor
|
4
|
-
MATCH_TYPES = {
|
5
|
-
0 => "No match",
|
6
|
-
1 => "Exact string match",
|
7
|
-
2 => "Canonical form exact match",
|
8
|
-
3 => "Canonical form fuzzy match",
|
9
|
-
4 => "Partial canonical form match",
|
10
|
-
5 => "Partial canonical form fuzzy match",
|
11
|
-
6 => "Genus part match"
|
12
|
-
}.freeze
|
13
|
-
|
14
4
|
attr_reader :input, :writer
|
15
5
|
|
16
|
-
def initialize(writer)
|
6
|
+
def initialize(writer, stats)
|
7
|
+
@stats = stats
|
17
8
|
@writer = writer
|
18
9
|
@input = {}
|
19
10
|
end
|
@@ -33,19 +24,28 @@ module GnCrossmap
|
|
33
24
|
end
|
34
25
|
|
35
26
|
def write_empty_result(datum)
|
27
|
+
@stats[:matches][0] += 1
|
28
|
+
@stats[:current] += 1
|
36
29
|
res = @original_data[datum[:supplied_id]]
|
37
|
-
res += [MATCH_TYPES[0], datum[:supplied_name_string], nil,
|
30
|
+
res += [GnCrossmap::MATCH_TYPES[0], datum[:supplied_name_string], nil,
|
38
31
|
nil, @input[datum[:supplied_id]][:rank], nil,
|
39
32
|
nil, nil, nil]
|
40
33
|
@writer.write(res)
|
41
34
|
end
|
42
35
|
|
43
36
|
def write_result(datum)
|
37
|
+
collect_stats(datum)
|
44
38
|
datum[:results].each do |result|
|
45
39
|
@writer.write(compile_result(datum, result))
|
46
40
|
end
|
47
41
|
end
|
48
42
|
|
43
|
+
def collect_stats(datum)
|
44
|
+
match_num = datum[:results].map { |d| d[:match_type] }.min
|
45
|
+
@stats[:matches][match_num] += 1
|
46
|
+
@stats[:current] += 1
|
47
|
+
end
|
48
|
+
|
49
49
|
def compile_result(datum, result)
|
50
50
|
@original_data[datum[:supplied_id]] + new_data(datum, result)
|
51
51
|
end
|
@@ -64,7 +64,7 @@ module GnCrossmap
|
|
64
64
|
end
|
65
65
|
|
66
66
|
def matched_type(record)
|
67
|
-
MATCH_TYPES[record[:match_type]]
|
67
|
+
GnCrossmap::MATCH_TYPES[record[:match_type]]
|
68
68
|
end
|
69
69
|
end
|
70
70
|
end
|
data/lib/gn_crossmap/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gn_crossmap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-11-
|
11
|
+
date: 2016-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: trollop
|