gn_crossmap 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +38 -0
- data/lib/gn_crossmap.rb +12 -1
- data/lib/gn_crossmap/resolver.rb +27 -7
- data/lib/gn_crossmap/result_processor.rb +13 -13
- data/lib/gn_crossmap/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 215ff795d273c98f6c25eab1e93ac32d21b3e1be
|
4
|
+
data.tar.gz: 5fd8159546705e355fe662ddb6f0b136edc36ce4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9e467719f0be4a7c7a2293e9ec78dbdbf54b8dfc8198a4e1f12417a239cddf982da495434006c3b40d5386b58896b18bbedff68d358fced0b4f1d06d038ed9c4
|
7
|
+
data.tar.gz: c41b99ddafe9479773198e900e2dcd07f83d19ebe9929a3b9b228b0d10abad475b85429301f108fdc94ddb6529b91ebfc6921ac42215dbacfb03aacdbeb822fa
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -98,6 +98,44 @@ GnCrossmap.logger = MyCustomLogger.new
|
|
98
98
|
GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true)
|
99
99
|
```
|
100
100
|
|
101
|
+
If you want to get intermediate statistics for each resolution cycle, use a
|
102
|
+
block:
|
103
|
+
|
104
|
+
```ruby
|
105
|
+
GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true) do |stats|
|
106
|
+
puts stats
|
107
|
+
puts "Matches:"
|
108
|
+
stats[:matches].each do |key, value|
|
109
|
+
puts "#{GnCrossmap::MATCH_TYPES[key]}: #{value}"
|
110
|
+
end
|
111
|
+
end
|
112
|
+
```
|
113
|
+
|
114
|
+
#### Intermediate stats format
|
115
|
+
|
116
|
+
|Field | Description |
|
117
|
+
|----------------|---------------------------------------------------------|
|
118
|
+
|total | total number of names in original list |
|
119
|
+
|current | number of names already processed |
|
120
|
+
|start_time | Start of resolution |
|
121
|
+
|last_batch_time | time span of the last batch processing |
|
122
|
+
|matches | Distribution of processed data by match type (see below)|
|
123
|
+
|
124
|
+
#### Match types
|
125
|
+
|
126
|
+
The match types dictionary can be accessed via the `GnCrossmap::MATCH_TYPES` constant.
|
127
|
+
|
128
|
+
| Match code | Match type |
|
129
|
+
|------------|----------------------------------|
|
130
|
+
|0 |No match |
|
131
|
+
|1 |Exact string match |
|
132
|
+
|2 |Canonical form exact match |
|
133
|
+
|3 |Canonical form fuzzy match |
|
134
|
+
|4 |Partial canonical form match |
|
135
|
+
|5 |Partial canonical form fuzzy match|
|
136
|
+
|6 |Genus part match |
|
137
|
+
|7 |Error in matching |
|
138
|
+
|
101
139
|
### Input file format
|
102
140
|
|
103
141
|
- Comma Separated File with names of fields in first row.
|
data/lib/gn_crossmap.rb
CHANGED
@@ -18,6 +18,16 @@ require "gn_crossmap/result_processor"
|
|
18
18
|
module GnCrossmap
|
19
19
|
INPUT_MODE = "r:utf-8".freeze
|
20
20
|
OUTPUT_MODE = "w:utf-8".freeze
|
21
|
+
MATCH_TYPES = {
|
22
|
+
0 => "No match",
|
23
|
+
1 => "Exact string match",
|
24
|
+
2 => "Canonical form exact match",
|
25
|
+
3 => "Canonical form fuzzy match",
|
26
|
+
4 => "Partial canonical form match",
|
27
|
+
5 => "Partial canonical form fuzzy match",
|
28
|
+
6 => "Genus part match",
|
29
|
+
7 => "Error in matching"
|
30
|
+
}.freeze
|
21
31
|
|
22
32
|
class << self
|
23
33
|
attr_writer :logger
|
@@ -28,7 +38,8 @@ module GnCrossmap
|
|
28
38
|
data = reader.read
|
29
39
|
writer = Writer.new(output_io, reader.original_fields,
|
30
40
|
output_name(output))
|
31
|
-
Resolver.new(writer, data_source_id)
|
41
|
+
resolver = Resolver.new(writer, data_source_id)
|
42
|
+
block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
|
32
43
|
output
|
33
44
|
end
|
34
45
|
|
data/lib/gn_crossmap/resolver.rb
CHANGED
@@ -4,7 +4,9 @@ module GnCrossmap
|
|
4
4
|
URL = "http://resolver.globalnames.org/name_resolvers.json".freeze
|
5
5
|
|
6
6
|
def initialize(writer, data_source_id)
|
7
|
-
@
|
7
|
+
@stats = { total: 0, current: 0, start_time: nil, last_batch_time: nil,
|
8
|
+
matches: match_types }
|
9
|
+
@processor = GnCrossmap::ResultProcessor.new(writer, @stats)
|
8
10
|
@ds_id = data_source_id
|
9
11
|
@count = 0
|
10
12
|
@current_data = {}
|
@@ -12,11 +14,13 @@ module GnCrossmap
|
|
12
14
|
end
|
13
15
|
|
14
16
|
def resolve(data)
|
15
|
-
|
17
|
+
@stats[:total] = data.size
|
18
|
+
@stats[:start_time] = Time.now
|
16
19
|
data.each_slice(@batch) do |slice|
|
17
|
-
with_log
|
20
|
+
with_log do
|
18
21
|
names = collect_names(slice)
|
19
22
|
remote_resolve(names)
|
23
|
+
yield(@stats) if block_given?
|
20
24
|
end
|
21
25
|
end
|
22
26
|
@processor.writer.close
|
@@ -24,11 +28,18 @@ module GnCrossmap
|
|
24
28
|
|
25
29
|
private
|
26
30
|
|
27
|
-
def
|
31
|
+
def match_types
|
32
|
+
matches = GnCrossmap::MATCH_TYPES.keys
|
33
|
+
matches.each_with_object({}) do |key, obj|
|
34
|
+
obj[key] = 0
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def with_log
|
28
39
|
s = @count + 1
|
29
40
|
@count += @batch
|
30
|
-
e = [@count,
|
31
|
-
GnCrossmap.log("Resolve #{s}-#{e} out of #{
|
41
|
+
e = [@count, @stats[:total]].min
|
42
|
+
GnCrossmap.log("Resolve #{s}-#{e} out of #{@stats[:total]} records")
|
32
43
|
yield
|
33
44
|
end
|
34
45
|
|
@@ -42,10 +53,13 @@ module GnCrossmap
|
|
42
53
|
end
|
43
54
|
|
44
55
|
def remote_resolve(names)
|
56
|
+
batch_start = Time.now
|
45
57
|
res = RestClient.post(URL, data: names, data_source_ids: @ds_id)
|
46
58
|
@processor.process(res, @current_data)
|
47
59
|
rescue RestClient::Exception
|
48
60
|
single_remote_resolve(names)
|
61
|
+
ensure
|
62
|
+
@stats[:last_batch_time] = Time.now - batch_start
|
49
63
|
end
|
50
64
|
|
51
65
|
def single_remote_resolve(names)
|
@@ -54,10 +68,16 @@ module GnCrossmap
|
|
54
68
|
res = RestClient.post(URL, data: name, data_source_ids: @ds_id)
|
55
69
|
@processor.process(res, @current_data)
|
56
70
|
rescue RestClient::Exception => e
|
57
|
-
|
71
|
+
process_resolver_error(e, name)
|
58
72
|
next
|
59
73
|
end
|
60
74
|
end
|
61
75
|
end
|
76
|
+
|
77
|
+
def process_resolver_error(err, name)
|
78
|
+
@stats[:matches][7] += 1
|
79
|
+
@stats[:current] += 1
|
80
|
+
GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
|
81
|
+
end
|
62
82
|
end
|
63
83
|
end
|
@@ -1,19 +1,10 @@
|
|
1
1
|
module GnCrossmap
|
2
2
|
# Processes data received from the GN Resolver
|
3
3
|
class ResultProcessor
|
4
|
-
MATCH_TYPES = {
|
5
|
-
0 => "No match",
|
6
|
-
1 => "Exact string match",
|
7
|
-
2 => "Canonical form exact match",
|
8
|
-
3 => "Canonical form fuzzy match",
|
9
|
-
4 => "Partial canonical form match",
|
10
|
-
5 => "Partial canonical form fuzzy match",
|
11
|
-
6 => "Genus part match"
|
12
|
-
}.freeze
|
13
|
-
|
14
4
|
attr_reader :input, :writer
|
15
5
|
|
16
|
-
def initialize(writer)
|
6
|
+
def initialize(writer, stats)
|
7
|
+
@stats = stats
|
17
8
|
@writer = writer
|
18
9
|
@input = {}
|
19
10
|
end
|
@@ -33,19 +24,28 @@ module GnCrossmap
|
|
33
24
|
end
|
34
25
|
|
35
26
|
def write_empty_result(datum)
|
27
|
+
@stats[:matches][0] += 1
|
28
|
+
@stats[:current] += 1
|
36
29
|
res = @original_data[datum[:supplied_id]]
|
37
|
-
res += [MATCH_TYPES[0], datum[:supplied_name_string], nil,
|
30
|
+
res += [GnCrossmap::MATCH_TYPES[0], datum[:supplied_name_string], nil,
|
38
31
|
nil, @input[datum[:supplied_id]][:rank], nil,
|
39
32
|
nil, nil, nil]
|
40
33
|
@writer.write(res)
|
41
34
|
end
|
42
35
|
|
43
36
|
def write_result(datum)
|
37
|
+
collect_stats(datum)
|
44
38
|
datum[:results].each do |result|
|
45
39
|
@writer.write(compile_result(datum, result))
|
46
40
|
end
|
47
41
|
end
|
48
42
|
|
43
|
+
def collect_stats(datum)
|
44
|
+
match_num = datum[:results].map { |d| d[:match_type] }.min
|
45
|
+
@stats[:matches][match_num] += 1
|
46
|
+
@stats[:current] += 1
|
47
|
+
end
|
48
|
+
|
49
49
|
def compile_result(datum, result)
|
50
50
|
@original_data[datum[:supplied_id]] + new_data(datum, result)
|
51
51
|
end
|
@@ -64,7 +64,7 @@ module GnCrossmap
|
|
64
64
|
end
|
65
65
|
|
66
66
|
def matched_type(record)
|
67
|
-
MATCH_TYPES[record[:match_type]]
|
67
|
+
GnCrossmap::MATCH_TYPES[record[:match_type]]
|
68
68
|
end
|
69
69
|
end
|
70
70
|
end
|
data/lib/gn_crossmap/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gn_crossmap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-11-
|
11
|
+
date: 2016-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: trollop
|