ncs_mdes_warehouse 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +2 -0
- data/CHANGELOG.md +20 -0
- data/Rakefile +2 -3
- data/generated_models/ncs_navigator/warehouse/models/two_point_zero/spec_blood.rb +3 -3
- data/lib/ncs_navigator/warehouse.rb +1 -0
- data/lib/ncs_navigator/warehouse/cli.rb +46 -1
- data/lib/ncs_navigator/warehouse/comparator.rb +257 -0
- data/lib/ncs_navigator/warehouse/data_mapper.rb +20 -0
- data/lib/ncs_navigator/warehouse/database_initializer.rb +3 -4
- data/lib/ncs_navigator/warehouse/transform_status.rb +5 -1
- data/lib/ncs_navigator/warehouse/transformers/enum_transformer.rb +18 -3
- data/lib/ncs_navigator/warehouse/updating_shell.rb +1 -0
- data/lib/ncs_navigator/warehouse/version.rb +1 -1
- data/lib/ncs_navigator/warehouse/xml_emitter.rb +95 -13
- data/ncs_mdes_warehouse.gemspec +1 -1
- data/spec/ncs_navigator/warehouse/data_mapper_spec.rb +39 -0
- data/spec/ncs_navigator/warehouse/transform_load_spec.rb +3 -1
- data/spec/ncs_navigator/warehouse/transformers/database_spec.rb +3 -3
- data/spec/ncs_navigator/warehouse/transformers/enum_transformer_spec.rb +36 -5
- data/spec/ncs_navigator/warehouse/xml_emitter_spec.rb +176 -30
- data/spec/spec_helper.rb +54 -17
- metadata +39 -38
data/.rspec
ADDED
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,26 @@
|
|
1
1
|
NCS Navigator MDES Warehouse History
|
2
2
|
====================================
|
3
3
|
|
4
|
+
0.2.0
|
5
|
+
-----
|
6
|
+
|
7
|
+
- Automatically set PSU ID and recruitment type in
|
8
|
+
`Transformers::EnumTransformer` if they are not already set. (#1630,
|
9
|
+
#1648)
|
10
|
+
|
11
|
+
- ETL: store invalid or failing record IDs as a separate column
|
12
|
+
instead of as part of TransformError#message. (#1636)
|
13
|
+
|
14
|
+
- When an MDES model string variable is set from a BigDecimal, coerce
|
15
|
+
it to a floating point string. The ruby default is scientific
|
16
|
+
notation, but that's not permitted in the MDES.
|
17
|
+
|
18
|
+
- New options for `emit-xml`: `--tables`, `--zip`,
|
19
|
+
`--include-pii`. (#1612, #1657, #1658)
|
20
|
+
|
21
|
+
- Add `compare` subcommand in `mdes-wh` for comparing the contents of
|
22
|
+
two warehouses. (#1667)
|
23
|
+
|
4
24
|
0.1.1
|
5
25
|
-----
|
6
26
|
|
data/Rakefile
CHANGED
@@ -15,12 +15,11 @@ task :spec => 'spec:all'
|
|
15
15
|
namespace :spec do
|
16
16
|
RSpec::Core::RakeTask.new(:fast) do |t|
|
17
17
|
t.pattern = "spec/**/*_spec.rb"
|
18
|
-
t.rspec_opts = %q(--
|
18
|
+
t.rspec_opts = %q(--tag ~slow)
|
19
19
|
end
|
20
20
|
|
21
21
|
RSpec::Core::RakeTask.new(:all) do |t|
|
22
22
|
t.pattern = "spec/**/*_spec.rb"
|
23
|
-
t.rspec_opts = %q(--format nested)
|
24
23
|
end
|
25
24
|
end
|
26
25
|
|
@@ -38,7 +37,7 @@ namespace :ci do
|
|
38
37
|
|
39
38
|
task :spec_setup do
|
40
39
|
ENV['CI_REPORTS'] = 'reports/spec-xml'
|
41
|
-
ENV['SPEC_OPTS'] = "#{ENV['SPEC_OPTS']} --format nested"
|
40
|
+
ENV['SPEC_OPTS'] = "#{ENV['SPEC_OPTS']} --format nested --no-color"
|
42
41
|
end
|
43
42
|
|
44
43
|
task :spec => [:spec_setup, 'ci:setup:rspecbase', 'rake:spec']
|
@@ -124,9 +124,9 @@ module NcsNavigator::Warehouse::Models::TwoPointZero
|
|
124
124
|
property :centrifuge_location,
|
125
125
|
NcsNavigator::Warehouse::DataMapper::NcsString,
|
126
126
|
{ :required => true, :length => 1..2, :set => ["1", "2", "-3", "-4"] }
|
127
|
-
|
128
|
-
|
129
|
-
:
|
127
|
+
property :equip_id,
|
128
|
+
NcsNavigator::Warehouse::DataMapper::NcsString,
|
129
|
+
{ :length => 0..36 }
|
130
130
|
property :centrifuge_time,
|
131
131
|
NcsNavigator::Warehouse::DataMapper::NcsString,
|
132
132
|
{ :length => 0..5, :format => /^([0-9][0-9]:[0-9][0-9])?$/ }
|
@@ -7,6 +7,7 @@ module NcsNavigator
|
|
7
7
|
DEFAULT_MDES_VERSION = '2.0'
|
8
8
|
|
9
9
|
autoload :CLI, 'ncs_navigator/warehouse/cli'
|
10
|
+
autoload :Comparator, 'ncs_navigator/warehouse/comparator'
|
10
11
|
autoload :Configuration, 'ncs_navigator/warehouse/configuration'
|
11
12
|
autoload :DataMapper, 'ncs_navigator/warehouse/data_mapper'
|
12
13
|
autoload :DatabaseInitializer, 'ncs_navigator/warehouse/database_initializer'
|
@@ -54,10 +54,20 @@ the contents of the current reporting database. The default name for
|
|
54
54
|
the XML file is the county name for the PSU plus the date; e.g.,
|
55
55
|
cook-20110728.xml.
|
56
56
|
DESC
|
57
|
+
method_option 'block-size', :type => :numeric, :aliases => %w(-b),
|
58
|
+
:desc => 'The maximum number of records to have in memory at once.',
|
59
|
+
:default => 5000
|
60
|
+
method_option 'include-pii', :type => :boolean, :default => false,
|
61
|
+
:desc => 'Include PII values in the emitted XML.'
|
62
|
+
method_option 'zip', :type => :boolean, :default => true,
|
63
|
+
:desc => 'Create a zip file alongside the XML. (Use --no-zip to disable.)'
|
64
|
+
method_option 'tables', :type => :string,
|
65
|
+
:desc => 'Emit XML for a subset of tables.', :banner => 'TABLE,TABLE,TABLE'
|
57
66
|
def emit_xml(filename=nil)
|
58
67
|
use_database
|
59
68
|
|
60
|
-
XmlEmitter.new(configuration, filename
|
69
|
+
XmlEmitter.new(configuration, filename,
|
70
|
+
options.merge(:tables => options[:tables].try(:split, /\s*,\s*/))).emit_xml
|
61
71
|
end
|
62
72
|
|
63
73
|
desc 'etl', 'Performs the full extract-transform-load process for this configuration'
|
@@ -83,6 +93,41 @@ DESC
|
|
83
93
|
exit 1
|
84
94
|
end
|
85
95
|
end
|
96
|
+
|
97
|
+
desc 'compare', 'Compares the contents of two warehouses, A & B.'
|
98
|
+
long_desc <<-DESC
|
99
|
+
Compares the contents of the MDES tables in two warehouses.
|
100
|
+
|
101
|
+
The comparison can be done at three levels:
|
102
|
+
|
103
|
+
1. Record counts.
|
104
|
+
|
105
|
+
2. IDs. (Which record IDs appear in one warehouse and not the other?)
|
106
|
+
|
107
|
+
3. Full contents. (Matching records based on ID, what variables have different values?)
|
108
|
+
|
109
|
+
Each level includes the one before it (i.e., doing a full content
|
110
|
+
comparison also does ID and count comparisons). Higher levels skip
|
111
|
+
tables which would add heat without light; i.e., level 2 only compares
|
112
|
+
IDs for a table when there are at least some records in each warehouse
|
113
|
+
and level 3 only compares content for tables where there are some
|
114
|
+
overlapping IDs.
|
115
|
+
DESC
|
116
|
+
method_option 'warehouse-a', :type => :string, :aliases => %w(-a),
|
117
|
+
:desc => 'The configuration file for warehouse A. The environment default will be used if not specified.'
|
118
|
+
method_option 'warehouse-b', :type => :string, :required => true, :aliases => %w(-b),
|
119
|
+
:desc => 'The configuration file for warehouse B.'
|
120
|
+
method_option 'level', :type => :numeric, :default => 1,
|
121
|
+
:desc => 'The level of detail for the comparison.'
|
122
|
+
# Command body for `compare`: obtains a configuration for warehouse A and one
# for warehouse B, then runs a Comparator over the pair with the parsed
# command-line options.
#
# If the `warehouse-a` option is present its value is copied into the
# 'config' option before `configuration` is read; otherwise `configuration`
# is used untouched. Warehouse B is always loaded from the `warehouse-b`
# option via Configuration.from_file.
#
# NOTE(review): this assigns into `options`; confirm the options hash is
# mutable in this context (option hashes are sometimes frozen).
def compare
  warehouse_a = options['warehouse-a']
  options['config'] = warehouse_a if warehouse_a

  config_a = configuration
  config_b = Configuration.from_file(options['warehouse-b'])
  Comparator.new(config_a, config_b, options).compare
end
|
86
131
|
end
|
87
132
|
end
|
88
133
|
|
@@ -0,0 +1,257 @@
|
|
1
|
+
require 'ncs_navigator/warehouse'
|
2
|
+
|
3
|
+
require 'forwardable'
|
4
|
+
|
5
|
+
module NcsNavigator::Warehouse
|
6
|
+
class Comparator
|
7
|
+
extend Forwardable
|
8
|
+
|
9
|
+
attr_reader :level
|
10
|
+
attr_reader :a, :b
|
11
|
+
|
12
|
+
def_delegators :a, :shell
|
13
|
+
|
14
|
+
# Stores the two warehouse configurations and the comparison level, then
# connects a reporting repository for each warehouse, using the repository
# name prefixes 'a' and 'b'.
#
# @param config_a the configuration for warehouse A; also supplies `shell`
# @param config_b the configuration for warehouse B
# @param options [Hash] recognizes :level (defaults to 1 when absent/nil)
#
# The options hash is only read: the level default is applied locally so a
# frozen or shared hash passed by the caller is never written to.
def initialize(config_a, config_b, options={})
  @a = config_a
  @b = config_b
  @level = options[:level] || 1

  shell.clear_line_and_say "Connecting to warehouse A..."
  DatabaseInitializer.new(@a).set_up_repository(:reporting, 'a')
  shell.clear_line_and_say "Connecting to warehouse B..."
  DatabaseInitializer.new(@b).set_up_repository(:reporting, 'b')
  shell.clear_line_and_say ''
end
|
24
|
+
|
25
|
+
# Runs every comparison stage up to and including the configured `level`:
# counts from level 1, IDs from level 2, full contents from level 3.
# Announces the level on the shell first.
def compare
  shell.say_line "Running level #{level} compare..."
  count_diff unless level < 1
  id_diff unless level < 2
  content_diff unless level < 3
end
|
31
|
+
|
32
|
+
# Compares record counts for every model, remembers the resulting diffs in
# @count_diffs, and prints a summary of them.
def count_diff
  arg_sets = models.map { |model| [model] }
  @count_diffs = collect_differences(CountDiff, arg_sets)
  shell_summarize_differences(CountDiff, @count_diffs)
end
|
36
|
+
|
37
|
+
# Compares record IDs for each model that passes `no_zero_counts?`,
# remembers the resulting diffs in @id_diffs, and prints a summary of them.
def id_diff
  populated = models.reject { |model| !no_zero_counts?(model) }
  arg_sets = populated.map { |model| [model] }
  @id_diffs = collect_differences(IdDiff, arg_sets)
  shell_summarize_differences(IdDiff, @id_diffs)
end
|
42
|
+
|
43
|
+
# Compares full record contents for each model for which `list_shared_ids`
# returns a value, remembers the resulting diffs in @content_diffs, and
# prints a summary of them.
def content_diff
  arg_sets = []
  models.each do |model|
    shared = list_shared_ids(model)
    arg_sets << [model, shared] if shared
  end
  @content_diffs = collect_differences(ContentDiff, arg_sets)
  shell_summarize_differences(ContentDiff, @content_diffs)
end
|
48
|
+
|
49
|
+
private
|
50
|
+
|
51
|
+
# The MDES models to compare, taken from warehouse A's models module in
# `mdes_order`. Loaded (with a shell progress message) on first use and
# cached in @models afterwards.
def models
  return @models if @models

  shell.clear_line_and_say "Loading MDES models..."
  @models = a.models_module.mdes_order
end
|
57
|
+
|
58
|
+
# Runs one `differ` per argument set and gathers the non-nil results.
#
# @param differ a diff class responding to `thing`, `thing_plural` and
#   `new(*args)`; its instances respond to `compare`
# @param arg_sets [Array<Array>] constructor arguments for each differ; the
#   first element of each set is the model (used for the progress message)
# @return [Array] every non-nil value returned by `compare`, in input order
#
# Progress and the total elapsed time are reported on the shell.
def collect_differences(differ, arg_sets)
  started_at = Time.now
  found = []
  arg_sets.each do |args|
    table = args.first.mdes_table_name
    shell.clear_line_and_say "comparing #{differ.thing_plural} for #{table}..."
    outcome = differ.new(*args).compare
    found << outcome unless outcome.nil?
  end
  shell.clear_line_and_say "#{differ.thing} comparisons complete in %.1fs.\n" % (Time.now - started_at)
  found
end
|
68
|
+
|
69
|
+
# Prints a summary of one comparison stage to standard output: either a
# "no differences" line, or a header with the number of differing tables
# followed by each diff's `shell_summary`. A blank line always follows.
#
# @param differ a diff class responding to `thing` and `thing_plural`
# @param diffs [Array] diff objects responding to `shell_summary`
def shell_summarize_differences(differ, diffs)
  if diffs.empty?
    puts "There are no #{differ.thing} differences."
    puts
    return
  end

  plural = diffs.size == 1 ? '' : 's'
  puts "#{differ.thing_plural} differ in #{diffs.size} table#{plural}:"
  diffs.each { |diff| puts diff.shell_summary }
  puts
end
|
80
|
+
|
81
|
+
# Whether the given model's table is known to have records in both
# warehouses.
#
# Looks for a recorded count difference for the model's table. If one
# exists, the answer is true only when neither count is zero. If none
# exists (the counts matched, or no count comparison has been run yet, so
# @count_diffs is still nil) the table is assumed to be worth examining and
# true is returned.
#
# @param model a model responding to `mdes_table_name`
# @return [Boolean]
def no_zero_counts?(model)
  recorded = (@count_diffs || []).find { |d| d.table_name == model.mdes_table_name }
  return true unless recorded

  recorded.a_count != 0 && recorded.b_count != 0
end
|
88
|
+
|
89
|
+
# The record IDs present in both warehouses for the given model, or nil when
# there is nothing to content-compare.
#
# @id_diffs only holds tables whose ID sets differ (an IdDiff whose compare
# finds no A-only or B-only IDs is not kept), so a missing entry does not
# mean "no shared IDs". For such tables the IDs are fetched here with a
# fresh IdDiff so that tables with identical ID sets are still
# content-compared. Tables that `no_zero_counts?` rejects are skipped
# without querying.
#
# @param model the model whose shared IDs are wanted
# @return [Array, nil] the shared IDs, or nil if there are none
def list_shared_ids(model)
  diff = (@id_diffs || []).find { |d| d.model == model }
  if diff.nil? && no_zero_counts?(model)
    diff = IdDiff.new(model)
    # compare loads the ID lists; its return value (nil when the sets match)
    # is irrelevant here.
    diff.compare
  end

  shared = diff && diff.shared
  shared unless shared.nil? || shared.empty?
end
|
96
|
+
|
97
|
+
# @private
|
98
|
+
# Compares the number of rows in one MDES table between the repositories
# named 'a_reporting' and 'b_reporting'.
class CountDiff
  attr_reader :table_name, :a_count, :b_count

  # Singular label used in progress and summary messages.
  def self.thing; 'count'; end

  # Plural label used in progress and summary messages.
  def self.thing_plural; 'counts'; end

  # @param model a model responding to `mdes_table_name`
  def initialize(model)
    @table_name = model.mdes_table_name
    @q = "SELECT COUNT(*) FROM #{table_name}"
  end

  # Fetches both counts.
  #
  # @return [CountDiff, nil] self when the counts differ, nil otherwise
  def compare
    @a_count = count('a_reporting')
    @b_count = count('b_reporting')
    self unless difference == 0
  end

  # @return the count in A minus the count in B (only valid after `compare`)
  def difference
    a_count - b_count
  end

  # Runs the count query against the named repository and returns the first
  # value of the result.
  def count(repo)
    ::DataMapper.repository(repo).adapter.select(@q).first
  end

  # One formatted line showing both counts, how they relate, and the
  # absolute difference. Equal counts are shown with '=' rather than being
  # reported as "less than".
  def shell_summary
    relation =
      case a_count <=> b_count
      when 1 then '>'
      when -1 then '<'
      else '='
      end
    " %30s: A = %6d %s %-6d = B | %6d" % [
      table_name, a_count, relation, b_count, difference.abs
    ]
  end
end
|
133
|
+
|
134
|
+
# @private
|
135
|
+
# Compares the set of key values found in one MDES table between the
# repositories named 'a_reporting' and 'b_reporting'. Only the first key
# property of the model is selected.
class IdDiff
  attr_reader :model, :a_ids, :b_ids

  # Singular label used in progress and summary messages.
  def self.thing
    'ID'
  end

  # Plural label used in progress and summary messages.
  def self.thing_plural
    'IDs'
  end

  # @param model a model responding to `key` and `mdes_table_name`
  def initialize(model)
    @model = model
    key_column = model.key.first.name
    @q = "SELECT #{key_column} FROM #{model.mdes_table_name}"
  end

  # Loads the ID lists from both repositories.
  #
  # @return [IdDiff, nil] self when either side has IDs the other lacks,
  #   nil when neither does
  def compare
    @a_ids = ids('a_reporting')
    @b_ids = ids('b_reporting')
    return nil if a_only.empty? && b_only.empty?

    self
  end

  # IDs found in A but not in B (computed once, then cached).
  def a_only
    @a_only ||= a_ids - b_ids
  end

  # IDs found in B but not in A (computed once, then cached).
  def b_only
    @b_only ||= b_ids - a_ids
  end

  # IDs found in both A and B (computed once, then cached).
  def shared
    @shared ||= a_ids & b_ids
  end

  # Runs the ID query against the named repository.
  def ids(repo)
    ::DataMapper.repository(repo).adapter.select(@q)
  end

  # Multi-line summary: the table name, the IDs unique to each side (lines
  # omitted for a side with none), and the number of shared IDs.
  def shell_summary
    lines = [" #{model.mdes_table_name}"]
    lines << " #{a_only.size} in A only: #{a_only.join(', ')}" unless a_only.empty?
    lines << " #{b_only.size} in B only: #{b_only.join(', ')}" unless b_only.empty?
    lines << " #{shared.size} the same in both A and B."
    lines.join("\n")
  end
end
|
179
|
+
|
180
|
+
# @private
|
181
|
+
# Compares, column by column, the records that share a key value in one MDES
# table between the repositories named 'a_reporting' and 'b_reporting'.
# Records are fetched in blocks of at most BLOCK_SIZE IDs.
class ContentDiff
  BLOCK_SIZE = 5000

  attr_reader :model, :shared_ids

  # Singular label used in progress and summary messages.
  def self.thing; 'content'; end

  # Plural label used in progress and summary messages.
  def self.thing_plural; 'contents'; end

  # @param model a model responding to `key` and `mdes_table_name`
  # @param shared_ids [Array] key values expected to exist in both A and B
  def initialize(model, shared_ids)
    @model = model
    @shared_ids = shared_ids
  end

  # Fetches the shared records from both repositories and records every
  # column whose value differs.
  #
  # Records are paired by key value rather than by result position, so a
  # different sort order between the two databases, or a row missing from
  # one side, cannot cause unrelated records to be compared. A record in A
  # with no counterpart in B's result is skipped.
  #
  # @return [ContentDiff, nil] self when any difference was found, else nil
  def compare
    key = model.key.first.name
    id_blocks.each do |ids|
      a_records = fetch_records('a_reporting', key, ids)
      b_records = fetch_records('b_reporting', key, ids)

      b_by_id = {}
      b_records.each { |rec| b_by_id[rec[key]] = rec }

      a_records.each do |a_rec|
        id = a_rec[key]
        b_rec = b_by_id[id]
        compare_records(id, a_rec, b_rec) if b_rec
      end
    end

    self unless differences.empty?
  end

  # @return [Hash] record ID => { column => [value in A, value in B] }
  def differences
    @differences ||= {}
  end

  # The shared IDs split into consecutive groups of at most BLOCK_SIZE.
  def id_blocks
    @id_blocks ||= shared_ids.each_slice(BLOCK_SIZE).to_a
  end

  # Compares every member of `a_record` with the same member of `b_record`
  # and stores the differing ones under `id` in `differences`.
  def compare_records(id, a_record, b_record)
    rec_diff = a_record.members.inject({}) { |h, prop|
      a_value = a_record.send(prop)
      b_value = b_record.send(prop)
      h[prop] = [a_value, b_value] if a_value != b_value
      h
    }
    differences[id] = rec_diff unless rec_diff.empty?
  end

  # Multi-line summary: the table name, then for each differing record its
  # ID followed by one line per differing column.
  def shell_summary
    [
      " #{model.mdes_table_name}:",
      differences.each_pair.collect do |id, rec_diff|
        [" #{id}:"] +
          rec_diff.each_pair.collect { |var, vals|
            " #{var}: #{vals.collect(&:inspect).join(' != ')}"
          }
      end
    ].flatten.join("\n")
  end

  private

  # Selects all columns of the rows whose key is in `ids` from the named
  # repository, ordered by key so the report order is stable.
  def fetch_records(repo_name, key, ids)
    q = "SELECT * FROM #{model.mdes_table_name} WHERE #{key} IN (#{quoted_id_list(ids)}) ORDER BY #{key}"
    ::DataMapper.repository(repo_name).adapter.select(q)
  end

  # Renders the IDs as a comma-separated list of SQL string literals,
  # doubling embedded single quotes so an ID containing a quote cannot break
  # out of its literal.
  def quoted_id_list(ids)
    ids.collect { |id| "'#{id.to_s.gsub("'", "''")}'" }.join(', ')
  end
end
|
256
|
+
end
|
257
|
+
end
|
@@ -23,16 +23,36 @@ module NcsNavigator::Warehouse
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
+
# Mixin for string-like property classes. When included, it wraps the host
# class's `typecast_to_primitive` so that a BigDecimal value is rendered
# with `to_s('F')` (plain decimal notation) instead of being passed to the
# original implementation; every other value is delegated unchanged.
#
# The host's previous implementation stays reachable as
# `original_typecast_to_primitive`.
#
# NOTE(review): the bare `protected` below does not appear to affect either
# the singleton `included` hook or the method defined inside `class_eval`;
# confirm what visibility was intended.
module NcsStringType
  protected

  def self.included(into)
    into.class_eval do
      alias_method :original_typecast_to_primitive, :typecast_to_primitive

      def typecast_to_primitive(value)
        return value.to_s('F') if value.is_a?(BigDecimal)

        original_typecast_to_primitive(value)
      end
    end
  end
end
|
43
|
+
|
26
44
|
##
|
27
45
|
# DataMapper `:string` that's an {NcsType}.
|
28
46
|
class NcsString < ::DataMapper::Property::String
|
29
47
|
include NcsType
|
48
|
+
include NcsStringType
|
30
49
|
end
|
31
50
|
|
32
51
|
##
|
33
52
|
# DataMapper `:text` that's an {NcsType}.
|
34
53
|
class NcsText < ::DataMapper::Property::Text
|
35
54
|
include NcsType
|
55
|
+
include NcsStringType
|
36
56
|
end
|
37
57
|
|
38
58
|
##
|
@@ -44,18 +44,17 @@ module NcsNavigator::Warehouse
|
|
44
44
|
# be the reporting repo unless only the working repo is
|
45
45
|
# configured in.
|
46
46
|
# @return [void]
|
47
|
-
def set_up_repository(mode=:reporting)
|
47
|
+
def set_up_repository(mode=:reporting, prefix="mdes_warehouse")
|
48
48
|
fail "Invalid mode #{mode.inspect}" unless [:reporting, :working, :both].include?(mode)
|
49
49
|
modes = case mode
|
50
50
|
when :both then [:reporting, :working]
|
51
51
|
else [mode]
|
52
52
|
end
|
53
53
|
connect_one(modes.first, :default)
|
54
|
-
modes.each { |m| connect_one(m) }
|
54
|
+
modes.each { |m| connect_one(m, [prefix, m].join('_').to_sym) }
|
55
55
|
end
|
56
56
|
|
57
|
-
def connect_one(which_one, dm_name
|
58
|
-
dm_name ||= :"mdes_warehouse_#{which_one}"
|
57
|
+
def connect_one(which_one, dm_name)
|
59
58
|
log.info "Connecting DataMapper repository #{dm_name.inspect}"
|
60
59
|
p = params(which_one)
|
61
60
|
log.debug " using #{p.merge('password' => 'SUPPRESSED').inspect}"
|