gn_list_resolver 3.3.1.1 → 4.0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bca6cb993b435d76acf6a14bb0c4031892a6a949
4
- data.tar.gz: d7c2c9e5c5132cd93b4dc03857427a0e6e5ed8e2
3
+ metadata.gz: 47abe84b5c8bd829c01d9c50978fe71b3f14aef6
4
+ data.tar.gz: 7852f24f3dc3ba40e3175c92275bcde26eda0463
5
5
  SHA512:
6
- metadata.gz: 64966a526648716221e6024c753fb9cb28b3af1450d928c1e9bc5f70cb614587ac7481674d93dbac1368a0ca90e621930d170ed5db5bf31d94a6478944919a58
7
- data.tar.gz: aabced616396747ce23661c945b29b11b7aebc297ee57618ed5e2a659d581abf446f1af42aafd1032492e54b20b8cc49ccd550650a73222f15d6c4bb69e1d0b3
6
+ metadata.gz: e97f1949f12a97c814adb4e4a4b88223ac00bc5bc6095a553ed22461304f4986a2d32cc7071f8871b83a7957ba76c1fe102e6fd776cfea52a33278e9e1909584
7
+ data.tar.gz: f458552148b3981f6f9c5017ee1b8c9b90bed16e332ef06f5340f88815050905e51277e472be23089be94032c84032f770e3e338dcf7cf00b7dccb64e181e109
@@ -25,6 +25,14 @@
25
25
  @dimus, @alexander-myltsev - sync with `gn_crossmap`'s 3.2.0, current name is
26
26
  still a bit broken
27
27
 
28
+ ## 4.0.1
29
+
30
+ * @dimus - Clean up resolution stats
31
+
32
+ ## 4.0.0
33
+
34
+ * @dimus - Better estimation of resolution speed and of the time left
35
+
28
36
  ## 3.3.1
29
37
 
30
38
  * @dimus - Better error message in logger
data/README.md CHANGED
@@ -166,7 +166,8 @@ end
166
166
  |resolution_stop |time when resolution of names stopped |
167
167
  |resolution_span |time of intermediate checkpoint of resolving names |
168
168
  |resolved_records |number of names already processed |
169
- |last_batches_time |time required to process the last batch of names |
169
+ |speed |exponentially smoothed average speed of resolution (records per second) |
170
+ |current_speed |speed of resolution for the last batch |
170
171
  |matches |Distribution of processed data by match type (see below) |
171
172
  |errors |First 0-10 errors found during the csv file processing |
172
173
 
@@ -16,6 +16,7 @@ module GnListResolver
16
16
  @count = 0
17
17
  @jobs = []
18
18
  @batch = 1000
19
+ @smoothing = 0.05
19
20
  end
20
21
 
21
22
  def resolve(data)
@@ -32,7 +33,7 @@ module GnListResolver
32
33
  private
33
34
 
34
35
  def wrap_up
35
- @stats.stats[:resolution_stop] = Time.now
36
+ @stats.stats[:resolution][:stop_time] = Time.now
36
37
  @stats.stats[:status] = :finish
37
38
  @processor.writer.close
38
39
  end
@@ -53,7 +54,7 @@ module GnListResolver
53
54
 
54
55
  def resolution_stats(records_num)
55
56
  @stats.stats[:total_records] = records_num
56
- @stats.stats[:resolution_start] = Time.now
57
+ @stats.stats[:resolution][:start_time] = Time.now
57
58
  @stats.stats[:status] = :resolution
58
59
  end
59
60
 
@@ -82,17 +83,10 @@ module GnListResolver
82
83
  update_stats(stats)
83
84
  @processor.process(results, current_data)
84
85
  else
85
- GnResolver.logger.error(job.reason.message)
86
+ GnListResolver.logger.error(job.reason.message)
86
87
  end
87
88
  end
88
89
 
89
- def update_stats(job_stats)
90
- s = @stats.stats
91
- s[:last_batches_time].shift if s[:last_batches_time].size > 2
92
- s[:last_batches_time] << job_stats.stats[:last_batches_time][0]
93
- s[:resolution_span] = Time.now - s[:resolution_start]
94
- end
95
-
96
90
  def create_job(batch)
97
91
  batch_data = collect_names(batch)
98
92
  rb = ResolverJob.new(batch, batch_data, @ds_id)
@@ -116,14 +110,28 @@ module GnListResolver
116
110
  batch_data
117
111
  end
118
112
 
113
+ # rubocop:disable Metrics/AbcSize
114
+ def update_stats(job_stats)
115
+ s = @stats.stats
116
+ current_speed = job_stats.stats[:current_speed] *
117
+ @stats.penalty(@threads)
118
+
119
+ s[:resolution][:completed_records] +=
120
+ job_stats.stats[:resolution][:completed_records]
121
+ @stats.update_eta(current_speed)
122
+ end
123
+
119
124
  def with_log
125
+ yield
120
126
  s = @count + 1
121
127
  @count += @batch
122
128
  e = [@count, @stats.stats[:total_records]].min
123
- GnListResolver.log("Resolve #{s}-#{e} out of " \
124
- "#{@stats.stats[:total_records]} records at " \
125
- "#{RESOLVER_URL}")
126
- yield
129
+ eta = @stats.stats[:resolution][:eta].to_i + Time.now.to_i
130
+ msg = format("Resolve %s-%s/%s records %d rec/s; eta: %s", s, e,
131
+ @stats.stats[:total_records],
132
+ @stats.stats[:resolution][:speed].to_i,
133
+ Time.at(eta))
134
+ GnListResolver.log(msg)
127
135
  end
128
136
  end
129
137
  end
@@ -15,7 +15,12 @@ module GnListResolver
15
15
  new(writer, @stats, @with_classification)
16
16
  @count = 0
17
17
  @jobs = []
18
+ <<<<<<< HEAD:lib/gn_list_resolver/resolver.rb
18
19
  @batch = 1000
20
+ =======
21
+ @batch = 200
22
+ @smoothing = 0.05
23
+ >>>>>>> 2a4afb8... Fix #42 - better speed estimation:lib/gn_crossmap/resolver.rb
19
24
  end
20
25
 
21
26
  def resolve(data)
@@ -32,7 +37,7 @@ module GnListResolver
32
37
  private
33
38
 
34
39
  def wrap_up
35
- @stats.stats[:resolution_stop] = Time.now
40
+ @stats.stats[:resolution][:stop_time] = Time.now
36
41
  @stats.stats[:status] = :finish
37
42
  @processor.writer.close
38
43
  end
@@ -53,7 +58,7 @@ module GnListResolver
53
58
 
54
59
  def resolution_stats(records_num)
55
60
  @stats.stats[:total_records] = records_num
56
- @stats.stats[:resolution_start] = Time.now
61
+ @stats.stats[:resolution][:start_time] = Time.now
57
62
  @stats.stats[:status] = :resolution
58
63
  end
59
64
 
@@ -82,14 +87,11 @@ module GnListResolver
82
87
  update_stats(stats)
83
88
  @processor.process(results, current_data)
84
89
  else
85
- <<<<<<< HEAD:lib/gn_list_resolver/resolver.rb
86
- GnListResolver.logger.error("Remote resolver server failed")
87
- =======
88
- GnCrossmap.logger.error(job.reason.message)
89
- >>>>>>> 36115cc... better error log:lib/gn_crossmap/resolver.rb
90
+ GnResolver.logger.error(job.reason.message)
90
91
  end
91
92
  end
92
93
 
94
+ <<<<<<< HEAD:lib/gn_list_resolver/resolver.rb
93
95
  def update_stats(job_stats)
94
96
  s = @stats.stats
95
97
  s[:last_batches_time].shift if s[:last_batches_time].size > 2
@@ -97,6 +99,8 @@ module GnListResolver
97
99
  s[:resolution_span] = Time.now - s[:resolution_start]
98
100
  end
99
101
 
102
+ =======
103
+ >>>>>>> 2a4afb8... Fix #42 - better speed estimation:lib/gn_crossmap/resolver.rb
100
104
  def create_job(batch)
101
105
  batch_data = collect_names(batch)
102
106
  rb = ResolverJob.new(batch, batch_data, @ds_id)
@@ -116,18 +120,46 @@ module GnListResolver
116
120
  id = row[:id].strip
117
121
  batch_data[id] = row[:original]
118
122
  @processor.input[id] = { rank: row[:rank] }
123
+ <<<<<<< HEAD:lib/gn_list_resolver/resolver.rb
119
124
  end
120
125
  batch_data
126
+ =======
127
+ str << "#{id}|#{row[:name]}"
128
+ end
129
+ [names, batch_data]
130
+ >>>>>>> 2a4afb8... Fix #42 - better speed estimation:lib/gn_crossmap/resolver.rb
131
+ end
132
+
133
+ # rubocop:disable Metrics/AbcSize
134
+ def update_stats(job_stats)
135
+ s = @stats.stats
136
+ current_speed = job_stats.stats[:current_speed] *
137
+ @stats.penalty(@threads)
138
+
139
+ s[:resolution][:completed_records] +=
140
+ job_stats.stats[:resolution][:completed_records]
141
+ @stats.update_eta(current_speed)
142
+ s[:matches][7] += job_stats.stats[:matches][7]
121
143
  end
122
144
 
123
145
  def with_log
146
+ yield
124
147
  s = @count + 1
125
148
  @count += @batch
126
149
  e = [@count, @stats.stats[:total_records]].min
150
+ <<<<<<< HEAD:lib/gn_list_resolver/resolver.rb
127
151
  GnListResolver.log("Resolve #{s}-#{e} out of " \
128
152
  "#{@stats.stats[:total_records]} records at " \
129
153
  "#{RESOLVER_URL}")
130
154
  yield
155
+ =======
156
+ eta = @stats.stats[:resolution][:eta].to_i + Time.now.to_i
157
+ msg = format("Resolve %s-%s/%s records %d rec/s; eta: %s", s, e,
158
+ @stats.stats[:total_records],
159
+ @stats.stats[:resolution][:speed].to_i,
160
+ Time.at(eta))
161
+ GnCrossmap.log(msg)
162
+ >>>>>>> 2a4afb8... Fix #42 - better speed estimation:lib/gn_crossmap/resolver.rb
131
163
  end
132
164
  end
133
165
  end
@@ -33,7 +33,8 @@ module GnListResolver
33
33
  end
34
34
 
35
35
  def stats_add_batch_time(batch_start)
36
- @stats.stats[:last_batches_time] << Time.now - batch_start
36
+ @stats.stats[:current_speed] = @names.size / (Time.now - batch_start)
37
+ @stats.stats[:resolution][:completed_records] = @names.size
37
38
  end
38
39
  end
39
40
  end
@@ -24,7 +24,6 @@ module GnListResolver
24
24
 
25
25
  def write_empty_result(datum)
26
26
  @stats.stats[:matches][MATCH_TYPE_EMPTY] += 1
27
- @stats.stats[:resolved_records] += 1
28
27
  res = compile_empty_result(datum)
29
28
  @writer.write(res)
30
29
  end
@@ -54,9 +53,7 @@ module GnListResolver
54
53
  else
55
54
  match_type_min.match_type.kind.to_sym
56
55
  end
57
- require "byebug"; byebug if @stats.stats[:matches][match_type_value].nil?
58
56
  @stats.stats[:matches][match_type_value] += 1
59
- @stats.stats[:resolved_records] += 1
60
57
  end
61
58
 
62
59
  def compile_result(datum, result, match_size)
@@ -7,11 +7,24 @@ module GnListResolver
7
7
 
8
8
  def initialize
9
9
  @stats = { status: :init, total_records: 0, ingested_records: 0,
10
- resolved_records: 0, ingestion_span: nil,
11
- resolution_span: nil, ingestion_start: nil,
12
- resolution_start: nil, resolution_stop: nil,
13
- last_batches_time: [], matches: init_matches,
14
- errors: [] }
10
+ ingestion_span: nil, ingestion_start: nil,
11
+ resolution: eta_struct,
12
+ matches: init_matches, errors: [] }
13
+ @smooth = 0.05
14
+ end
15
+
16
+ def penalty(threads)
17
+ pnlt = 0.7
18
+ penalty_adj(threads.to_i, 1, pnlt)
19
+ end
20
+
21
+ def update_eta(current_speed)
22
+ eta = @stats[:resolution]
23
+ eta[:speed] = current_speed if eta[:speed].nil?
24
+ eta[:speed] = eta[:speed] * (1 - @smooth) + current_speed * @smooth
25
+ eta[:eta] = (@stats[:total_records] -
26
+ @stats[:resolution][:completed_records]) /
27
+ eta[:speed]
15
28
  end
16
29
 
17
30
  private
@@ -19,5 +32,15 @@ module GnListResolver
19
32
  def init_matches
20
33
  MATCH_TYPES.keys.each_with_object({}) { |t, h| h[t] = 0 }
21
34
  end
35
+
36
+ def eta_struct
37
+ { start_time: nil, completed_records: 0,
38
+ speed: nil, eta: nil, stop_time: nil }
39
+ end
40
+
41
+ def penalty_adj(threads, val, pnlt)
42
+ return val if threads < 2
43
+ val + penalty_adj(threads - 1, (val * pnlt), pnlt)
44
+ end
22
45
  end
23
46
  end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ module GnListResolver
4
+ # Collects statistics about list resolving process
5
+ class Stats
6
+ attr_accessor :stats
7
+
8
+ def initialize
9
+ @stats = { status: :init, total_records: 0, ingested_records: 0,
10
+ <<<<<<< HEAD:lib/gn_list_resolver/stats.rb
11
+ resolved_records: 0, ingestion_span: nil,
12
+ resolution_span: nil, ingestion_start: nil,
13
+ resolution_start: nil, resolution_stop: nil,
14
+ last_batches_time: [], matches: init_matches,
15
+ errors: [] }
16
+ =======
17
+ ingestion_span: nil, ingestion_start: nil,
18
+ resolution: eta_struct,
19
+ matches: match_types, errors: [] }
20
+ @smooth = 0.05
21
+ end
22
+
23
+ def penalty(threads)
24
+ pnlt = 0.7
25
+ penalty_adj(threads.to_i, 1, pnlt)
26
+ end
27
+
28
+ def update_eta(current_speed)
29
+ eta = @stats[:resolution]
30
+ eta[:speed] = current_speed if eta[:speed].nil?
31
+ eta[:speed] = eta[:speed] * (1 - @smooth) + current_speed * @smooth
32
+ eta[:eta] = (@stats[:total_records] -
33
+ @stats[:resolution][:completed_records]) /
34
+ eta[:speed]
35
+ >>>>>>> 2a4afb8... Fix #42 - better speed estimation:lib/gn_crossmap/stats.rb
36
+ end
37
+
38
+ private
39
+
40
+ <<<<<<< HEAD:lib/gn_list_resolver/stats.rb
41
+ def init_matches
42
+ MATCH_TYPES.keys.each_with_object({}) { |t, h| h[t] = 0 }
43
+ =======
44
+ def eta_struct
45
+ { start_time: nil, completed_records: 0,
46
+ speed: nil, eta: nil, stop_time: nil }
47
+ end
48
+
49
+ def match_types
50
+ matches = GnCrossmap::MATCH_TYPES.keys
51
+ matches.each_with_object({}) do |key, obj|
52
+ obj[key] = 0
53
+ end
54
+ >>>>>>> 2a4afb8... Fix #42 - better speed estimation:lib/gn_crossmap/stats.rb
55
+ end
56
+
57
+ def penalty_adj(threads, val, pnlt)
58
+ return val if threads < 2
59
+ val + penalty_adj(threads - 1, (val * pnlt), pnlt)
60
+ end
61
+ end
62
+ end
@@ -2,7 +2,7 @@
2
2
 
3
3
  # Namespace module for crossmapping checklists to GN sources
4
4
  module GnListResolver
5
- VERSION = "3.3.1.1"
5
+ VERSION = "4.0.1.0"
6
6
 
7
7
  def self.version
8
8
  VERSION
@@ -3,11 +3,11 @@
3
3
  # Namespace module for crossmapping checklists to GN sources
4
4
  <<<<<<< HEAD:lib/gn_list_resolver/version.rb
5
5
  module GnListResolver
6
- VERSION = "3.3.1.0"
6
+ VERSION = "3.3.1.1"
7
7
  =======
8
8
  module GnCrossmap
9
- VERSION = "3.3.1"
10
- >>>>>>> 36115cc... better error log:lib/gn_crossmap/version.rb
9
+ VERSION = "4.0.1"
10
+ >>>>>>> 2a4afb8... Fix #42 - better speed estimation:lib/gn_crossmap/version.rb
11
11
 
12
12
  def self.version
13
13
  VERSION
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gn_list_resolver
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.1.1
4
+ version: 4.0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2017-09-15 00:00:00.000000000 Z
12
+ date: 2017-09-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: biodiversity
@@ -231,6 +231,7 @@ files:
231
231
  - lib/gn_list_resolver/result_processor.rb
232
232
  - lib/gn_list_resolver/sci_name_collector.rb
233
233
  - lib/gn_list_resolver/stats.rb
234
+ - lib/gn_list_resolver/stats.rb.orig
234
235
  - lib/gn_list_resolver/version.rb
235
236
  - lib/gn_list_resolver/version.rb.orig
236
237
  - lib/gn_list_resolver/writer.rb