gn_crossmap 1.1.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 215ff795d273c98f6c25eab1e93ac32d21b3e1be
4
- data.tar.gz: 5fd8159546705e355fe662ddb6f0b136edc36ce4
3
+ metadata.gz: f5a97cd143dd1e3bd3940b6457dd4b1f824236c7
4
+ data.tar.gz: 12a4ed179bdadaccacf34fe130300e77147d4c35
5
5
  SHA512:
6
- metadata.gz: 9e467719f0be4a7c7a2293e9ec78dbdbf54b8dfc8198a4e1f12417a239cddf982da495434006c3b40d5386b58896b18bbedff68d358fced0b4f1d06d038ed9c4
7
- data.tar.gz: c41b99ddafe9479773198e900e2dcd07f83d19ebe9929a3b9b228b0d10abad475b85429301f108fdc94ddb6529b91ebfc6921ac42215dbacfb03aacdbeb822fa
6
+ metadata.gz: 5cbcc463342999b5f9c9e7dcc85f031783f3e4d9e31c8a8a4152bd64dddae086f5f82e1bbcef44f14fdd8fd3721c819d46ae892c7b02cdc256c3728b8359c1f8
7
+ data.tar.gz: 4d5df19d77f565ab6ad268f9106e93a8aafe6d2de0b6baff7906d78bc469a50a583c29423a611b406612a6355f1aefe106ec16ad0e7653e31f0d5c9fe6ef6ed5
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # ``gn_crossmap`` CHANGELOG
2
2
 
3
+ ## 1.2.0
4
+
5
+ * @dimus - #24 optional block for GnCrossmap.run now also gives access to
6
+ intermediate results of CSV file reading
7
+
3
8
  ## 1.1.0
4
9
 
5
10
  * @dimus - #23 optional block for GnCrossmap.run gives access to intermediate
data/README.md CHANGED
@@ -104,7 +104,7 @@ block:
104
104
  ```ruby
105
105
  GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true) do |stats|
106
106
  puts stats
107
- put "Matches:"
107
+ puts "Matches:"
108
108
  stats[:matches].each do |key, value|
109
109
  puts "#{GnCrossmap::MATCH_TYPES[key]}: #{value}"
110
110
  end
@@ -113,13 +113,18 @@ end
113
113
 
114
114
  #### Intermediate stats format
115
115
 
116
- |Field | Description |
117
- |----------------|---------------------------------------------------------|
118
- |total | total number of names in original list |
119
- |current | number of names already processed |
120
- |start_time | Start of resolution |
121
- |last_batch_time | time span of the last batch processing |
122
- |matches | Distribution of processed data by match type (see below)|
116
+ |Field |Description |
117
+ |------------------|---------------------------------------------------------|
118
+ |status |current phase (init, ingestion, resolution) |
119
+ |total_records |total number of names in original list |
120
+ |ingestion_start |time when the reading from csv started |
121
+ |ingestion_span |time elapsed from ingestion start to the latest checkpoint of reading csv |
122
+ |ingested_records |number of ingested records at an intermediate checkpoint |
123
+ |resolution_start |time when resolution of names started |
124
+ |resolution_span |time elapsed from resolution start to the latest checkpoint of resolving names |
125
+ |resolved_records |number of names already processed |
126
+ |last_batches_time |times required to process the most recent batches of names (up to 3) |
127
+ |matches |Distribution of processed data by match type (see below) |
123
128
 
124
129
  #### Match types
125
130
 
data/lib/gn_crossmap.rb CHANGED
@@ -13,6 +13,7 @@ require "gn_crossmap/column_collector"
13
13
  require "gn_crossmap/sci_name_collector"
14
14
  require "gn_crossmap/resolver"
15
15
  require "gn_crossmap/result_processor"
16
+ require "gn_crossmap/stats"
16
17
 
17
18
  # Namespace module for crossmapping checklists with GN sources
18
19
  module GnCrossmap
@@ -32,17 +33,22 @@ module GnCrossmap
32
33
  class << self
33
34
  attr_writer :logger
34
35
 
36
+ # rubocop:disable Metrics/AbcSize
37
+
35
38
  def run(input, output, data_source_id, skip_original)
39
+ stats = Stats.new
36
40
  input_io, output_io = io(input, output)
37
- reader = Reader.new(input_io, input_name(input), skip_original)
38
- data = reader.read
41
+ reader = Reader.new(input_io, input_name(input), skip_original, stats)
42
+ data = block_given? ? reader.read(&Proc.new) : reader.read
39
43
  writer = Writer.new(output_io, reader.original_fields,
40
44
  output_name(output))
41
- resolver = Resolver.new(writer, data_source_id)
45
+ resolver = Resolver.new(writer, data_source_id, stats)
42
46
  block_given? ? resolver.resolve(data, &Proc.new) : resolver.resolve(data)
43
47
  output
44
48
  end
45
49
 
50
+ # rubocop:enable all
51
+
46
52
  def logger
47
53
  @logger ||= Logger.new(STDERR)
48
54
  end
@@ -4,7 +4,8 @@ module GnCrossmap
4
4
  class Reader
5
5
  attr_reader :original_fields
6
6
 
7
- def initialize(csv_io, input_name, skip_original)
7
+ def initialize(csv_io, input_name, skip_original, stats)
8
+ @stats = stats
8
9
  @csv_io = csv_io
9
10
  @col_sep = col_sep
10
11
  @original_fields = nil
@@ -13,14 +14,17 @@ module GnCrossmap
13
14
  end
14
15
 
15
16
  def read
17
+ @stats.stats[:ingestion_start] = Time.now
18
+ @stats.stats[:status] = :ingestion
16
19
  GnCrossmap.log("Read input from #{@input_name}")
17
- parse_input
20
+ block_given? ? parse_input(&Proc.new) : parse_input
18
21
  end
19
22
 
20
23
  private
21
24
 
22
25
  def col_sep
23
26
  line = @csv_io.first
27
+ @stats.stats[:total_records] = @csv_io.readlines.size
24
28
  @csv_io.rewind
25
29
  [";", ",", "\t"].map { |s| [line.count(s), s] }.sort.last.last
26
30
  end
@@ -30,13 +34,20 @@ module GnCrossmap
30
34
  csv = CSV.new(@csv_io, col_sep: col_sep)
31
35
  csv.each_with_index do |row, i|
32
36
  @original_fields = headers(row) if @original_fields.nil?
33
- i += 1
34
- GnCrossmap.log("Ingesting #{i}th csv row") if (i % 10_000).zero?
37
+ yield @stats.stats if log_progress(i) && block_given?
35
38
  dc.process_row(row)
36
39
  end && @csv_io.close
37
40
  dc.data
38
41
  end
39
42
 
43
+ def log_progress(count)
44
+ return false unless (count % 10_000).zero?
45
+ GnCrossmap.log("Ingesting csv row #{count + 1}")
46
+ @stats.stats[:ingested_records] = count + 1
47
+ @stats.stats[:ingestion_span] = Time.now - @stats.stats[:ingestion_start]
48
+ true
49
+ end
50
+
40
51
  def headers(row)
41
52
  hdrs = row.dup
42
53
  @skip_original ? taxon_id_header(hdrs) : hdrs
@@ -3,9 +3,8 @@ module GnCrossmap
3
3
  class Resolver
4
4
  URL = "http://resolver.globalnames.org/name_resolvers.json".freeze
5
5
 
6
- def initialize(writer, data_source_id)
7
- @stats = { total: 0, current: 0, start_time: nil, last_batch_time: nil,
8
- matches: match_types }
6
+ def initialize(writer, data_source_id, stats)
7
+ @stats = stats
9
8
  @processor = GnCrossmap::ResultProcessor.new(writer, @stats)
10
9
  @ds_id = data_source_id
11
10
  @count = 0
@@ -14,13 +13,12 @@ module GnCrossmap
14
13
  end
15
14
 
16
15
  def resolve(data)
17
- @stats[:total] = data.size
18
- @stats[:start_time] = Time.now
16
+ update_stats(data.size)
19
17
  data.each_slice(@batch) do |slice|
20
18
  with_log do
21
19
  names = collect_names(slice)
22
20
  remote_resolve(names)
23
- yield(@stats) if block_given?
21
+ yield(@stats.stats) if block_given?
24
22
  end
25
23
  end
26
24
  @processor.writer.close
@@ -28,18 +26,18 @@ module GnCrossmap
28
26
 
29
27
  private
30
28
 
31
- def match_types
32
- matches = GnCrossmap::MATCH_TYPES.keys
33
- matches.each_with_object({}) do |key, obj|
34
- obj[key] = 0
35
- end
29
+ def update_stats(records_num)
30
+ @stats.stats[:total_records] = records_num
31
+ @stats.stats[:resolution_start] = Time.now
32
+ @stats.stats[:status] = :resolution
36
33
  end
37
34
 
38
35
  def with_log
39
36
  s = @count + 1
40
37
  @count += @batch
41
- e = [@count, @stats[:total]].min
42
- GnCrossmap.log("Resolve #{s}-#{e} out of #{@stats[:total]} records")
38
+ e = [@count, @stats.stats[:total_records]].min
39
+ GnCrossmap.log("Resolve #{s}-#{e} out of " \
40
+ "#{@stats.stats[:total_records]} records")
43
41
  yield
44
42
  end
45
43
 
@@ -59,7 +57,14 @@ module GnCrossmap
59
57
  rescue RestClient::Exception
60
58
  single_remote_resolve(names)
61
59
  ensure
62
- @stats[:last_batch_time] = Time.now - batch_start
60
+ update_batch_times(batch_start)
61
+ end
62
+
63
+ def update_batch_times(batch_start)
64
+ s = @stats.stats
65
+ s[:last_batches_time].shift if s[:last_batches_time].size > 2
66
+ s[:last_batches_time] << Time.now - batch_start
67
+ s[:resolution_span] = Time.now - s[:resolution_start]
63
68
  end
64
69
 
65
70
  def single_remote_resolve(names)
@@ -75,8 +80,8 @@ module GnCrossmap
75
80
  end
76
81
 
77
82
  def process_resolver_error(err, name)
78
- @stats[:matches][7] += 1
79
- @stats[:current] += 1
83
+ @stats.stats[:matches][7] += 1
84
+ @stats.stats[:resolved_records] += 1
80
85
  GnCrossmap.logger.error("Resolver broke on '#{name}': #{err.message}")
81
86
  end
82
87
  end
@@ -24,8 +24,8 @@ module GnCrossmap
24
24
  end
25
25
 
26
26
  def write_empty_result(datum)
27
- @stats[:matches][0] += 1
28
- @stats[:current] += 1
27
+ @stats.stats[:matches][0] += 1
28
+ @stats.stats[:resolved_records] += 1
29
29
  res = @original_data[datum[:supplied_id]]
30
30
  res += [GnCrossmap::MATCH_TYPES[0], datum[:supplied_name_string], nil,
31
31
  nil, @input[datum[:supplied_id]][:rank], nil,
@@ -42,8 +42,8 @@ module GnCrossmap
42
42
 
43
43
  def collect_stats(datum)
44
44
  match_num = datum[:results].map { |d| d[:match_type] }.min
45
- @stats[:matches][match_num] += 1
46
- @stats[:current] += 1
45
+ @stats.stats[:matches][match_num] += 1
46
+ @stats.stats[:resolved_records] += 1
47
47
  end
48
48
 
49
49
  def compile_result(datum, result)
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module GnCrossmap
4
+ # Collects statistics about crossmapping process
5
+ class Stats
6
+ attr_accessor :stats
7
+
8
+ def initialize
9
+ @stats = { status: :init, total_records: 0, ingested_records: 0,
10
+ resolved_records: 0, ingestion_span: nil,
11
+ resolution_span: nil, ingestion_start: nil,
12
+ resolution_start: nil, last_batches_time: [],
13
+ matches: match_types }
14
+ end
15
+
16
+ private
17
+
18
+ def match_types
19
+ matches = GnCrossmap::MATCH_TYPES.keys
20
+ matches.each_with_object({}) do |key, obj|
21
+ obj[key] = 0
22
+ end
23
+ end
24
+ end
25
+ end
@@ -1,6 +1,6 @@
1
1
  # Namespace module for crossmapping checklists to GN sources
2
2
  module GnCrossmap
3
- VERSION = "1.1.0".freeze
3
+ VERSION = "1.2.0".freeze
4
4
 
5
5
  def self.version
6
6
  VERSION
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gn_crossmap
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-11-28 00:00:00.000000000 Z
11
+ date: 2016-11-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: trollop
@@ -181,6 +181,7 @@ files:
181
181
  - lib/gn_crossmap/resolver.rb
182
182
  - lib/gn_crossmap/result_processor.rb
183
183
  - lib/gn_crossmap/sci_name_collector.rb
184
+ - lib/gn_crossmap/stats.rb
184
185
  - lib/gn_crossmap/version.rb
185
186
  - lib/gn_crossmap/writer.rb
186
187
  homepage: https://github.com/GlobalNamesArchitecture/gn_crossmap