gn_crossmap 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 779a3a9193e896242a7e8717fd98bc779b417937
4
- data.tar.gz: d8e9c4c7d72447a62d35f57fe26e33e9b5a69a29
3
+ metadata.gz: 215ff795d273c98f6c25eab1e93ac32d21b3e1be
4
+ data.tar.gz: 5fd8159546705e355fe662ddb6f0b136edc36ce4
5
5
  SHA512:
6
- metadata.gz: 83f6f2f6be28c5891d93e5f6e5e09ac54481f4334640ef3b500822301bb5d1ea2cf0d5ca94f4ba0a2eb7b7e7afcd67fed8b37ad76c61ac184c2d9932d080ccf3
7
- data.tar.gz: ae609d842d36b96de15ffe3013fe49e762da963191609486757d7ba9bbc5e85cdf674a19e24916fdcac7214d37adb1a9c089c95cd89ba9c4c7360256858cca11
6
+ metadata.gz: 9e467719f0be4a7c7a2293e9ec78dbdbf54b8dfc8198a4e1f12417a239cddf982da495434006c3b40d5386b58896b18bbedff68d358fced0b4f1d06d038ed9c4
7
+ data.tar.gz: c41b99ddafe9479773198e900e2dcd07f83d19ebe9929a3b9b228b0d10abad475b85429301f108fdc94ddb6529b91ebfc6921ac42215dbacfb03aacdbeb822fa
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # ``gn_crossmap`` CHANGELOG
2
2
 
3
+ ## 1.1.0
4
+
5
+ * @dimus - #23 optional block for GnCrossmap.run gives access to intermediate
6
+ results
7
+
3
8
  ## 1.0.0
4
9
 
5
10
  * @dimus - #18 output file optionally removes original fields except `taxonID`
data/README.md CHANGED
@@ -98,6 +98,44 @@ GnCrossmap.logger = MyCustomLogger.new
98
98
  GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true)
99
99
  ```
100
100
 
101
+ If you want to get intermediate statistics for each resolution cycle, use a
102
+ block:
103
+
104
+ ```ruby
105
+ GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true) do |stats|
106
+ puts stats
107
+ puts "Matches:"
108
+ stats[:matches].each do |key, value|
109
+ puts "#{GnCrossmap::MATCH_TYPES[key]}: #{value}"
110
+ end
111
+ end
112
+ ```
113
+
114
+ #### Intermediate stats format
115
+
116
+ |Field | Description |
117
+ |----------------|---------------------------------------------------------|
118
+ |total | total number of names in original list |
119
+ |current | number of names already processed |
120
+ |start_time | Start of resolution |
121
+ |last_batch_time | time span of the last batch processing |
122
+ |matches | Distribution of processed data by match type (see below)|
123
+
124
+ #### Match types
125
+
126
+ The match types dictionary can be accessed through the `GnCrossmap::MATCH_TYPES` constant.
127
+
128
+ | Match code | Match type |
129
+ |------------|----------------------------------|
130
+ |0 |No match |
131
+ |1 |Exact string match |
132
+ |2 |Canonical form exact match |
133
+ |3 |Canonical form fuzzy match |
134
+ |4 |Partial canonical form match |
135
+ |5 |Partial canonical form fuzzy match|
136
+ |6 |Genus part match |
137
+ |7 |Error in matching |
138
+
101
139
  ### Input file format
102
140
 
103
141
  - Comma Separated File with names of fields in first row.
data/lib/gn_crossmap.rb CHANGED
@@ -18,6 +18,16 @@ require "gn_crossmap/result_processor"
18
18
  module GnCrossmap
19
19
  INPUT_MODE = "r:utf-8".freeze
20
20
  OUTPUT_MODE = "w:utf-8".freeze
21
+ MATCH_TYPES = {
22
+ 0 => "No match",
23
+ 1 => "Exact string match",
24
+ 2 => "Canonical form exact match",
25
+ 3 => "Canonical form fuzzy match",
26
+ 4 => "Partial canonical form match",
27
+ 5 => "Partial canonical form fuzzy match",
28
+ 6 => "Genus part match",
29
+ 7 => "Error in matching"
30
+ }.freeze
21
31
 
22
32
  class << self
23
33
  attr_writer :logger
@@ -28,7 +38,8 @@ module GnCrossmap
28
38
  data = reader.read
29
39
  writer = Writer.new(output_io, reader.original_fields,
30
40
  output_name(output))
31
- Resolver.new(writer, data_source_id).resolve(data)
41
+ resolver = Resolver.new(writer, data_source_id)
42
+ block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
32
43
  output
33
44
  end
34
45
 
@@ -4,7 +4,9 @@ module GnCrossmap
4
4
  URL = "http://resolver.globalnames.org/name_resolvers.json".freeze
5
5
 
6
6
  def initialize(writer, data_source_id)
7
- @processor = GnCrossmap::ResultProcessor.new(writer)
7
+ @stats = { total: 0, current: 0, start_time: nil, last_batch_time: nil,
8
+ matches: match_types }
9
+ @processor = GnCrossmap::ResultProcessor.new(writer, @stats)
8
10
  @ds_id = data_source_id
9
11
  @count = 0
10
12
  @current_data = {}
@@ -12,11 +14,13 @@ module GnCrossmap
12
14
  end
13
15
 
14
16
  def resolve(data)
15
- data_size = data.size
17
+ @stats[:total] = data.size
18
+ @stats[:start_time] = Time.now
16
19
  data.each_slice(@batch) do |slice|
17
- with_log(data_size) do
20
+ with_log do
18
21
  names = collect_names(slice)
19
22
  remote_resolve(names)
23
+ yield(@stats) if block_given?
20
24
  end
21
25
  end
22
26
  @processor.writer.close
@@ -24,11 +28,18 @@ module GnCrossmap
24
28
 
25
29
  private
26
30
 
27
- def with_log(size)
31
+ def match_types
32
+ matches = GnCrossmap::MATCH_TYPES.keys
33
+ matches.each_with_object({}) do |key, obj|
34
+ obj[key] = 0
35
+ end
36
+ end
37
+
38
+ def with_log
28
39
  s = @count + 1
29
40
  @count += @batch
30
- e = [@count, size].min
31
- GnCrossmap.log("Resolve #{s}-#{e} out of #{size} records")
41
+ e = [@count, @stats[:total]].min
42
+ GnCrossmap.log("Resolve #{s}-#{e} out of #{@stats[:total]} records")
32
43
  yield
33
44
  end
34
45
 
@@ -42,10 +53,13 @@ module GnCrossmap
42
53
  end
43
54
 
44
55
  def remote_resolve(names)
56
+ batch_start = Time.now
45
57
  res = RestClient.post(URL, data: names, data_source_ids: @ds_id)
46
58
  @processor.process(res, @current_data)
47
59
  rescue RestClient::Exception
48
60
  single_remote_resolve(names)
61
+ ensure
62
+ @stats[:last_batch_time] = Time.now - batch_start
49
63
  end
50
64
 
51
65
  def single_remote_resolve(names)
@@ -54,10 +68,16 @@ module GnCrossmap
54
68
  res = RestClient.post(URL, data: name, data_source_ids: @ds_id)
55
69
  @processor.process(res, @current_data)
56
70
  rescue RestClient::Exception => e
57
- GnCrossmap.logger.error("Resolver broke on '#{name}': #{e.message}")
71
+ process_resolver_error(e, name)
58
72
  next
59
73
  end
60
74
  end
61
75
  end
76
+
77
+ def process_resolver_error(err, name)
78
+ @stats[:matches][7] += 1
79
+ @stats[:current] += 1
80
+ GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
81
+ end
62
82
  end
63
83
  end
@@ -1,19 +1,10 @@
1
1
  module GnCrossmap
2
2
  # Processes data received from the GN Resolver
3
3
  class ResultProcessor
4
- MATCH_TYPES = {
5
- 0 => "No match",
6
- 1 => "Exact string match",
7
- 2 => "Canonical form exact match",
8
- 3 => "Canonical form fuzzy match",
9
- 4 => "Partial canonical form match",
10
- 5 => "Partial canonical form fuzzy match",
11
- 6 => "Genus part match"
12
- }.freeze
13
-
14
4
  attr_reader :input, :writer
15
5
 
16
- def initialize(writer)
6
+ def initialize(writer, stats)
7
+ @stats = stats
17
8
  @writer = writer
18
9
  @input = {}
19
10
  end
@@ -33,19 +24,28 @@ module GnCrossmap
33
24
  end
34
25
 
35
26
  def write_empty_result(datum)
27
+ @stats[:matches][0] += 1
28
+ @stats[:current] += 1
36
29
  res = @original_data[datum[:supplied_id]]
37
- res += [MATCH_TYPES[0], datum[:supplied_name_string], nil,
30
+ res += [GnCrossmap::MATCH_TYPES[0], datum[:supplied_name_string], nil,
38
31
  nil, @input[datum[:supplied_id]][:rank], nil,
39
32
  nil, nil, nil]
40
33
  @writer.write(res)
41
34
  end
42
35
 
43
36
  def write_result(datum)
37
+ collect_stats(datum)
44
38
  datum[:results].each do |result|
45
39
  @writer.write(compile_result(datum, result))
46
40
  end
47
41
  end
48
42
 
43
+ def collect_stats(datum)
44
+ match_num = datum[:results].map { |d| d[:match_type] }.min
45
+ @stats[:matches][match_num] += 1
46
+ @stats[:current] += 1
47
+ end
48
+
49
49
  def compile_result(datum, result)
50
50
  @original_data[datum[:supplied_id]] + new_data(datum, result)
51
51
  end
@@ -64,7 +64,7 @@ module GnCrossmap
64
64
  end
65
65
 
66
66
  def matched_type(record)
67
- MATCH_TYPES[record[:match_type]]
67
+ GnCrossmap::MATCH_TYPES[record[:match_type]]
68
68
  end
69
69
  end
70
70
  end
@@ -1,6 +1,6 @@
1
1
  # Namespace module for crossmapping checklists to GN sources
2
2
  module GnCrossmap
3
- VERSION = "1.0.0".freeze
3
+ VERSION = "1.1.0".freeze
4
4
 
5
5
  def self.version
6
6
  VERSION
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gn_crossmap
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-11-22 00:00:00.000000000 Z
11
+ date: 2016-11-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: trollop