gn_crossmap 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 215ff795d273c98f6c25eab1e93ac32d21b3e1be
4
- data.tar.gz: 5fd8159546705e355fe662ddb6f0b136edc36ce4
3
+ metadata.gz: f5a97cd143dd1e3bd3940b6457dd4b1f824236c7
4
+ data.tar.gz: 12a4ed179bdadaccacf34fe130300e77147d4c35
5
5
  SHA512:
6
- metadata.gz: 9e467719f0be4a7c7a2293e9ec78dbdbf54b8dfc8198a4e1f12417a239cddf982da495434006c3b40d5386b58896b18bbedff68d358fced0b4f1d06d038ed9c4
7
- data.tar.gz: c41b99ddafe9479773198e900e2dcd07f83d19ebe9929a3b9b228b0d10abad475b85429301f108fdc94ddb6529b91ebfc6921ac42215dbacfb03aacdbeb822fa
6
+ metadata.gz: 5cbcc463342999b5f9c9e7dcc85f031783f3e4d9e31c8a8a4152bd64dddae086f5f82e1bbcef44f14fdd8fd3721c819d46ae892c7b02cdc256c3728b8359c1f8
7
+ data.tar.gz: 4d5df19d77f565ab6ad268f9106e93a8aafe6d2de0b6baff7906d78bc469a50a583c29423a611b406612a6355f1aefe106ec16ad0e7653e31f0d5c9fe6ef6ed5
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # ``gn_crossmap`` CHANGELOG
2
2
 
3
+ ## 1.2.0
4
+
5
+ * @dimus - #24 optional block for GnCrossmap.run now also gives access to
6
+ intermediate results of CSV file reading
7
+
3
8
  ## 1.1.0
4
9
 
5
10
  * @dimus - #23 optional block for GnCrossmap.run gives access to intermediate
data/README.md CHANGED
@@ -104,7 +104,7 @@ block:
104
104
  ```ruby
105
105
  GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true) do |stats|
106
106
  puts stats
107
- put "Matches:"
107
+ puts "Matches:"
108
108
  stats[:matches].each do |key, value|
109
109
  puts "#{GnCrossmap::MATCH_TYPES[key]}: #{value}"
110
110
  end
@@ -113,13 +113,18 @@ end
113
113
 
114
114
  #### Intermediate stats format
115
115
 
116
- |Field | Description |
117
- |----------------|---------------------------------------------------------|
118
- |total | total number of names in original list |
119
- |current | number of names already processed |
120
- |start_time | Start of resolution |
121
- |last_batch_time | time span of the last batch processing |
122
- |matches | Distribution of processed data by match type (see below)|
116
+ |Field |Description |
117
+ |------------------|---------------------------------------------------------|
118
+ |status |current phase: (init, ingestion, resolution) |
119
+ |total_records |total number of names in original list |
120
+ |ingestion_start |time when the reading from csv started |
121
+ |ingestion_span |time elapsed from ingestion_start to the latest intermediate checkpoint of reading csv |
122
+ |ingested_records |number of ingested records at an intermediate checkpoint |
123
+ |resolution_start |time when resolution of names started |
124
+ |resolution_span |time elapsed from resolution_start to the latest intermediate checkpoint of resolving names |
125
+ |resolved_records |number of names already processed |
126
+ |last_batches_time |list of times required to process the most recent batches of names (up to 3) |
127
+ |matches |Distribution of processed data by match type (see below) |
123
128
 
124
129
  #### Match types
125
130
 
data/lib/gn_crossmap.rb CHANGED
@@ -13,6 +13,7 @@ require "gn_crossmap/column_collector"
13
13
  require "gn_crossmap/sci_name_collector"
14
14
  require "gn_crossmap/resolver"
15
15
  require "gn_crossmap/result_processor"
16
+ require "gn_crossmap/stats"
16
17
 
17
18
  # Namespace module for crossmapping checklists with GN sources
18
19
  module GnCrossmap
@@ -32,17 +33,22 @@ module GnCrossmap
32
33
  class << self
33
34
  attr_writer :logger
34
35
 
36
+ # rubocop:disable Metrics/AbcSize
37
+
35
38
  def run(input, output, data_source_id, skip_original)
39
+ stats = Stats.new
36
40
  input_io, output_io = io(input, output)
37
- reader = Reader.new(input_io, input_name(input), skip_original)
38
- data = reader.read
41
+ reader = Reader.new(input_io, input_name(input), skip_original, stats)
42
+ data = block_given? ? reader.read(&Proc.new) : reader.read
39
43
  writer = Writer.new(output_io, reader.original_fields,
40
44
  output_name(output))
41
- resolver = Resolver.new(writer, data_source_id)
45
+ resolver = Resolver.new(writer, data_source_id, stats)
42
46
  block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
43
47
  output
44
48
  end
45
49
 
50
+ # rubocop:enable all
51
+
46
52
  def logger
47
53
  @logger ||= Logger.new(STDERR)
48
54
  end
@@ -4,7 +4,8 @@ module GnCrossmap
4
4
  class Reader
5
5
  attr_reader :original_fields
6
6
 
7
- def initialize(csv_io, input_name, skip_original)
7
+ def initialize(csv_io, input_name, skip_original, stats)
8
+ @stats = stats
8
9
  @csv_io = csv_io
9
10
  @col_sep = col_sep
10
11
  @original_fields = nil
@@ -13,14 +14,17 @@ module GnCrossmap
13
14
  end
14
15
 
15
16
  def read
17
+ @stats.stats[:ingestion_start] = Time.now
18
+ @stats.stats[:status] = :ingestion
16
19
  GnCrossmap.log("Read input from #{@input_name}")
17
- parse_input
20
+ block_given? ? parse_input(&Proc.new) : parse_input
18
21
  end
19
22
 
20
23
  private
21
24
 
22
25
  def col_sep
23
26
  line = @csv_io.first
27
+ @stats.stats[:total_records] = @csv_io.readlines.size
24
28
  @csv_io.rewind
25
29
  [";", ",", "\t"].map { |s| [line.count(s), s] }.sort.last.last
26
30
  end
@@ -30,13 +34,20 @@ module GnCrossmap
30
34
  csv = CSV.new(@csv_io, col_sep: col_sep)
31
35
  csv.each_with_index do |row, i|
32
36
  @original_fields = headers(row) if @original_fields.nil?
33
- i += 1
34
- GnCrossmap.log("Ingesting #{i}th csv row") if (i % 10_000).zero?
37
+ yield @stats.stats if log_progress(i) && block_given?
35
38
  dc.process_row(row)
36
39
  end && @csv_io.close
37
40
  dc.data
38
41
  end
39
42
 
43
+ def log_progress(count)
44
+ return false unless (count % 10_000).zero?
45
+ GnCrossmap.log("Ingesting csv row #{count + 1}")
46
+ @stats.stats[:ingested_records] = count + 1
47
+ @stats.stats[:ingestion_span] = Time.now - @stats.stats[:ingestion_start]
48
+ true
49
+ end
50
+
40
51
  def headers(row)
41
52
  hdrs = row.dup
42
53
  @skip_original ? taxon_id_header(hdrs) : hdrs
@@ -3,9 +3,8 @@ module GnCrossmap
3
3
  class Resolver
4
4
  URL = "http://resolver.globalnames.org/name_resolvers.json".freeze
5
5
 
6
- def initialize(writer, data_source_id)
7
- @stats = { total: 0, current: 0, start_time: nil, last_batch_time: nil,
8
- matches: match_types }
6
+ def initialize(writer, data_source_id, stats)
7
+ @stats = stats
9
8
  @processor = GnCrossmap::ResultProcessor.new(writer, @stats)
10
9
  @ds_id = data_source_id
11
10
  @count = 0
@@ -14,13 +13,12 @@ module GnCrossmap
14
13
  end
15
14
 
16
15
  def resolve(data)
17
- @stats[:total] = data.size
18
- @stats[:start_time] = Time.now
16
+ update_stats(data.size)
19
17
  data.each_slice(@batch) do |slice|
20
18
  with_log do
21
19
  names = collect_names(slice)
22
20
  remote_resolve(names)
23
- yield(@stats) if block_given?
21
+ yield(@stats.stats) if block_given?
24
22
  end
25
23
  end
26
24
  @processor.writer.close
@@ -28,18 +26,18 @@ module GnCrossmap
28
26
 
29
27
  private
30
28
 
31
- def match_types
32
- matches = GnCrossmap::MATCH_TYPES.keys
33
- matches.each_with_object({}) do |key, obj|
34
- obj[key] = 0
35
- end
29
+ def update_stats(records_num)
30
+ @stats.stats[:total_records] = records_num
31
+ @stats.stats[:resolution_start] = Time.now
32
+ @stats.stats[:status] = :resolution
36
33
  end
37
34
 
38
35
  def with_log
39
36
  s = @count + 1
40
37
  @count += @batch
41
- e = [@count, @stats[:total]].min
42
- GnCrossmap.log("Resolve #{s}-#{e} out of #{@stats[:total]} records")
38
+ e = [@count, @stats.stats[:total_records]].min
39
+ GnCrossmap.log("Resolve #{s}-#{e} out of " \
40
+ "#{@stats.stats[:total_records]} records")
43
41
  yield
44
42
  end
45
43
 
@@ -59,7 +57,14 @@ module GnCrossmap
59
57
  rescue RestClient::Exception
60
58
  single_remote_resolve(names)
61
59
  ensure
62
- @stats[:last_batch_time] = Time.now - batch_start
60
+ update_batch_times(batch_start)
61
+ end
62
+
63
+ def update_batch_times(batch_start)
64
+ s = @stats.stats
65
+ s[:last_batches_time].shift if s[:last_batches_time].size > 2
66
+ s[:last_batches_time] << Time.now - batch_start
67
+ s[:resolution_span] = Time.now - s[:resolution_start]
63
68
  end
64
69
 
65
70
  def single_remote_resolve(names)
@@ -75,8 +80,8 @@ module GnCrossmap
75
80
  end
76
81
 
77
82
  def process_resolver_error(err, name)
78
- @stats[:matches][7] += 1
79
- @stats[:current] += 1
83
+ @stats.stats[:matches][7] += 1
84
+ @stats.stats[:resolved_records] += 1
80
85
  GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
81
86
  end
82
87
  end
@@ -24,8 +24,8 @@ module GnCrossmap
24
24
  end
25
25
 
26
26
  def write_empty_result(datum)
27
- @stats[:matches][0] += 1
28
- @stats[:current] += 1
27
+ @stats.stats[:matches][0] += 1
28
+ @stats.stats[:resolved_records] += 1
29
29
  res = @original_data[datum[:supplied_id]]
30
30
  res += [GnCrossmap::MATCH_TYPES[0], datum[:supplied_name_string], nil,
31
31
  nil, @input[datum[:supplied_id]][:rank], nil,
@@ -42,8 +42,8 @@ module GnCrossmap
42
42
 
43
43
  def collect_stats(datum)
44
44
  match_num = datum[:results].map { |d| d[:match_type] }.min
45
- @stats[:matches][match_num] += 1
46
- @stats[:current] += 1
45
+ @stats.stats[:matches][match_num] += 1
46
+ @stats.stats[:resolved_records] += 1
47
47
  end
48
48
 
49
49
  def compile_result(datum, result)
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module GnCrossmap
4
+ # Collects statistics about crossmapping process
5
+ class Stats
6
+ attr_accessor :stats
7
+
8
+ def initialize
9
+ @stats = { status: :init, total_records: 0, ingested_records: 0,
10
+ resolved_records: 0, ingestion_span: nil,
11
+ resolution_span: nil, ingestion_start: nil,
12
+ resolution_start: nil, last_batches_time: [],
13
+ matches: match_types }
14
+ end
15
+
16
+ private
17
+
18
+ def match_types
19
+ matches = GnCrossmap::MATCH_TYPES.keys
20
+ matches.each_with_object({}) do |key, obj|
21
+ obj[key] = 0
22
+ end
23
+ end
24
+ end
25
+ end
@@ -1,6 +1,6 @@
1
1
  # Namespace module for crossmapping checklists to GN sources
2
2
  module GnCrossmap
3
- VERSION = "1.1.0".freeze
3
+ VERSION = "1.2.0".freeze
4
4
 
5
5
  def self.version
6
6
  VERSION
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gn_crossmap
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-11-28 00:00:00.000000000 Z
11
+ date: 2016-11-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: trollop
@@ -181,6 +181,7 @@ files:
181
181
  - lib/gn_crossmap/resolver.rb
182
182
  - lib/gn_crossmap/result_processor.rb
183
183
  - lib/gn_crossmap/sci_name_collector.rb
184
+ - lib/gn_crossmap/stats.rb
184
185
  - lib/gn_crossmap/version.rb
185
186
  - lib/gn_crossmap/writer.rb
186
187
  homepage: https://github.com/GlobalNamesArchitecture/gn_crossmap