gn_crossmap 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 779a3a9193e896242a7e8717fd98bc779b417937
4
- data.tar.gz: d8e9c4c7d72447a62d35f57fe26e33e9b5a69a29
3
+ metadata.gz: 215ff795d273c98f6c25eab1e93ac32d21b3e1be
4
+ data.tar.gz: 5fd8159546705e355fe662ddb6f0b136edc36ce4
5
5
  SHA512:
6
- metadata.gz: 83f6f2f6be28c5891d93e5f6e5e09ac54481f4334640ef3b500822301bb5d1ea2cf0d5ca94f4ba0a2eb7b7e7afcd67fed8b37ad76c61ac184c2d9932d080ccf3
7
- data.tar.gz: ae609d842d36b96de15ffe3013fe49e762da963191609486757d7ba9bbc5e85cdf674a19e24916fdcac7214d37adb1a9c089c95cd89ba9c4c7360256858cca11
6
+ metadata.gz: 9e467719f0be4a7c7a2293e9ec78dbdbf54b8dfc8198a4e1f12417a239cddf982da495434006c3b40d5386b58896b18bbedff68d358fced0b4f1d06d038ed9c4
7
+ data.tar.gz: c41b99ddafe9479773198e900e2dcd07f83d19ebe9929a3b9b228b0d10abad475b85429301f108fdc94ddb6529b91ebfc6921ac42215dbacfb03aacdbeb822fa
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # ``gn_crossmap`` CHANGELOG
2
2
 
3
+ ## 1.1.0
4
+
5
+ * @dimus - #23 optional block for GnCrossmap.run gives access to intermediate
6
+ results
7
+
3
8
  ## 1.0.0
4
9
 
5
10
  * @dimus - #18 output file optionally removes original fields except `taxonID`
data/README.md CHANGED
@@ -98,6 +98,44 @@ GnCrossmap.logger = MyCustomLogger.new
98
98
  GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true)
99
99
  ```
100
100
 
101
+ If you want to get intermediate statistics for each resolution cycle, use a
102
+ block:
103
+
104
+ ```ruby
105
+ GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true) do |stats|
106
+ puts stats
107
+ puts "Matches:"
108
+ stats[:matches].each do |key, value|
109
+ puts "#{GnCrossmap::MATCH_TYPES[key]}: #{value}"
110
+ end
111
+ end
112
+ ```
113
+
114
+ #### Intermediate stats format
115
+
116
+ |Field | Description |
117
+ |----------------|---------------------------------------------------------|
118
+ |total | total number of names in original list |
119
+ |current | number of names already processed |
120
+ |start_time | Start of resolution |
121
+ |last_batch_time | time in seconds spent processing the last batch |
122
+ |matches | Distribution of processed data by match type (see below)|
123
+
124
+ #### Match types
125
+
126
+ The match types dictionary can be accessed through the `GnCrossmap::MATCH_TYPES` constant.
127
+
128
+ | Match code | Match type |
129
+ |------------|----------------------------------|
130
+ |0 |No match |
131
+ |1 |Exact string match |
132
+ |2 |Canonical form exact match |
133
+ |3 |Canonical form fuzzy match |
134
+ |4 |Partial canonical form match |
135
+ |5 |Partial canonical form fuzzy match|
136
+ |6 |Genus part match |
137
+ |7 |Error in matching |
138
+
101
139
  ### Input file format
102
140
 
103
141
  - Comma Separated File with names of fields in first row.
data/lib/gn_crossmap.rb CHANGED
@@ -18,6 +18,16 @@ require "gn_crossmap/result_processor"
18
18
  module GnCrossmap
19
19
  INPUT_MODE = "r:utf-8".freeze
20
20
  OUTPUT_MODE = "w:utf-8".freeze
21
+ MATCH_TYPES = {
22
+ 0 => "No match",
23
+ 1 => "Exact string match",
24
+ 2 => "Canonical form exact match",
25
+ 3 => "Canonical form fuzzy match",
26
+ 4 => "Partial canonical form match",
27
+ 5 => "Partial canonical form fuzzy match",
28
+ 6 => "Genus part match",
29
+ 7 => "Error in matching"
30
+ }.freeze
21
31
 
22
32
  class << self
23
33
  attr_writer :logger
@@ -28,7 +38,8 @@ module GnCrossmap
28
38
  data = reader.read
29
39
  writer = Writer.new(output_io, reader.original_fields,
30
40
  output_name(output))
31
- Resolver.new(writer, data_source_id).resolve(data)
41
+ resolver = Resolver.new(writer, data_source_id)
42
+ block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
32
43
  output
33
44
  end
34
45
 
@@ -4,7 +4,9 @@ module GnCrossmap
4
4
  URL = "http://resolver.globalnames.org/name_resolvers.json".freeze
5
5
 
6
6
  def initialize(writer, data_source_id)
7
- @processor = GnCrossmap::ResultProcessor.new(writer)
7
+ @stats = { total: 0, current: 0, start_time: nil, last_batch_time: nil,
8
+ matches: match_types }
9
+ @processor = GnCrossmap::ResultProcessor.new(writer, @stats)
8
10
  @ds_id = data_source_id
9
11
  @count = 0
10
12
  @current_data = {}
@@ -12,11 +14,13 @@ module GnCrossmap
12
14
  end
13
15
 
14
16
  def resolve(data)
15
- data_size = data.size
17
+ @stats[:total] = data.size
18
+ @stats[:start_time] = Time.now
16
19
  data.each_slice(@batch) do |slice|
17
- with_log(data_size) do
20
+ with_log do
18
21
  names = collect_names(slice)
19
22
  remote_resolve(names)
23
+ yield(@stats) if block_given?
20
24
  end
21
25
  end
22
26
  @processor.writer.close
@@ -24,11 +28,18 @@ module GnCrossmap
24
28
 
25
29
  private
26
30
 
27
- def with_log(size)
31
+ def match_types
32
+ matches = GnCrossmap::MATCH_TYPES.keys
33
+ matches.each_with_object({}) do |key, obj|
34
+ obj[key] = 0
35
+ end
36
+ end
37
+
38
+ def with_log
28
39
  s = @count + 1
29
40
  @count += @batch
30
- e = [@count, size].min
31
- GnCrossmap.log("Resolve #{s}-#{e} out of #{size} records")
41
+ e = [@count, @stats[:total]].min
42
+ GnCrossmap.log("Resolve #{s}-#{e} out of #{@stats[:total]} records")
32
43
  yield
33
44
  end
34
45
 
@@ -42,10 +53,13 @@ module GnCrossmap
42
53
  end
43
54
 
44
55
  def remote_resolve(names)
56
+ batch_start = Time.now
45
57
  res = RestClient.post(URL, data: names, data_source_ids: @ds_id)
46
58
  @processor.process(res, @current_data)
47
59
  rescue RestClient::Exception
48
60
  single_remote_resolve(names)
61
+ ensure
62
+ @stats[:last_batch_time] = Time.now - batch_start
49
63
  end
50
64
 
51
65
  def single_remote_resolve(names)
@@ -54,10 +68,16 @@ module GnCrossmap
54
68
  res = RestClient.post(URL, data: name, data_source_ids: @ds_id)
55
69
  @processor.process(res, @current_data)
56
70
  rescue RestClient::Exception => e
57
- GnCrossmap.logger.error("Resolver broke on '#{name}': #{e.message}")
71
+ process_resolver_error(e, name)
58
72
  next
59
73
  end
60
74
  end
61
75
  end
76
+
77
+ def process_resolver_error(err, name)
78
+ @stats[:matches][7] += 1
79
+ @stats[:current] += 1
80
+ GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
81
+ end
62
82
  end
63
83
  end
@@ -1,19 +1,10 @@
1
1
  module GnCrossmap
2
2
  # Processes data received from the GN Resolver
3
3
  class ResultProcessor
4
- MATCH_TYPES = {
5
- 0 => "No match",
6
- 1 => "Exact string match",
7
- 2 => "Canonical form exact match",
8
- 3 => "Canonical form fuzzy match",
9
- 4 => "Partial canonical form match",
10
- 5 => "Partial canonical form fuzzy match",
11
- 6 => "Genus part match"
12
- }.freeze
13
-
14
4
  attr_reader :input, :writer
15
5
 
16
- def initialize(writer)
6
+ def initialize(writer, stats)
7
+ @stats = stats
17
8
  @writer = writer
18
9
  @input = {}
19
10
  end
@@ -33,19 +24,28 @@ module GnCrossmap
33
24
  end
34
25
 
35
26
  def write_empty_result(datum)
27
+ @stats[:matches][0] += 1
28
+ @stats[:current] += 1
36
29
  res = @original_data[datum[:supplied_id]]
37
- res += [MATCH_TYPES[0], datum[:supplied_name_string], nil,
30
+ res += [GnCrossmap::MATCH_TYPES[0], datum[:supplied_name_string], nil,
38
31
  nil, @input[datum[:supplied_id]][:rank], nil,
39
32
  nil, nil, nil]
40
33
  @writer.write(res)
41
34
  end
42
35
 
43
36
  def write_result(datum)
37
+ collect_stats(datum)
44
38
  datum[:results].each do |result|
45
39
  @writer.write(compile_result(datum, result))
46
40
  end
47
41
  end
48
42
 
43
+ def collect_stats(datum)
44
+ match_num = datum[:results].map { |d| d[:match_type] }.min
45
+ @stats[:matches][match_num] += 1
46
+ @stats[:current] += 1
47
+ end
48
+
49
49
  def compile_result(datum, result)
50
50
  @original_data[datum[:supplied_id]] + new_data(datum, result)
51
51
  end
@@ -64,7 +64,7 @@ module GnCrossmap
64
64
  end
65
65
 
66
66
  def matched_type(record)
67
- MATCH_TYPES[record[:match_type]]
67
+ GnCrossmap::MATCH_TYPES[record[:match_type]]
68
68
  end
69
69
  end
70
70
  end
@@ -1,6 +1,6 @@
1
1
  # Namespace module for crossmapping checklists to GN sources
2
2
  module GnCrossmap
3
- VERSION = "1.0.0".freeze
3
+ VERSION = "1.1.0".freeze
4
4
 
5
5
  def self.version
6
6
  VERSION
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gn_crossmap
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-11-22 00:00:00.000000000 Z
11
+ date: 2016-11-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: trollop