ncs_mdes_warehouse 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format=nested
data/CHANGELOG.md CHANGED
@@ -1,6 +1,26 @@
1
1
  NCS Navigator MDES Warehouse History
2
2
  ====================================
3
3
 
4
+ 0.2.0
5
+ -----
6
+
7
+ - Automatically set PSU ID and recruitment type in
8
+ `Transformers::EnumTransformer` if they are not already set. (#1630,
9
+ #1648)
10
+
11
+ - ETL: store invalid or failing record IDs as a separate column
12
+ instead of as part of TransformError#message. (#1636)
13
+
14
+ - When an MDES model string variable is set from a BigDecimal, coerce
15
+ it to a floating point string. The Ruby default is scientific
16
+ notation, but that's not permitted in the MDES.
17
+
18
+ - New options for `emit-xml`: `--tables`, `--zip`,
19
+ `--include-pii`. (#1612, #1657, #1658)
20
+
21
+ - Add `compare` subcommand in `mdes-wh` for comparing the contents of
22
+ two warehouses. (#1667)
23
+
4
24
  0.1.1
5
25
  -----
6
26
 
data/Rakefile CHANGED
@@ -15,12 +15,11 @@ task :spec => 'spec:all'
15
15
  namespace :spec do
16
16
  RSpec::Core::RakeTask.new(:fast) do |t|
17
17
  t.pattern = "spec/**/*_spec.rb"
18
- t.rspec_opts = %q(--format nested --tag ~slow)
18
+ t.rspec_opts = %q(--tag ~slow)
19
19
  end
20
20
 
21
21
  RSpec::Core::RakeTask.new(:all) do |t|
22
22
  t.pattern = "spec/**/*_spec.rb"
23
- t.rspec_opts = %q(--format nested)
24
23
  end
25
24
  end
26
25
 
@@ -38,7 +37,7 @@ namespace :ci do
38
37
 
39
38
  task :spec_setup do
40
39
  ENV['CI_REPORTS'] = 'reports/spec-xml'
41
- ENV['SPEC_OPTS'] = "#{ENV['SPEC_OPTS']} --format nested"
40
+ ENV['SPEC_OPTS'] = "#{ENV['SPEC_OPTS']} --format nested --no-color"
42
41
  end
43
42
 
44
43
  task :spec => [:spec_setup, 'ci:setup:rspecbase', 'rake:spec']
@@ -124,9 +124,9 @@ module NcsNavigator::Warehouse::Models::TwoPointZero
124
124
  property :centrifuge_location,
125
125
  NcsNavigator::Warehouse::DataMapper::NcsString,
126
126
  { :required => true, :length => 1..2, :set => ["1", "2", "-3", "-4"] }
127
- belongs_to :equip,
128
- 'NcsNavigator::Warehouse::Models::TwoPointZero::SpecEquipment',
129
- :child_key => [ :equip_id ], :required => false
127
+ property :equip_id,
128
+ NcsNavigator::Warehouse::DataMapper::NcsString,
129
+ { :length => 0..36 }
130
130
  property :centrifuge_time,
131
131
  NcsNavigator::Warehouse::DataMapper::NcsString,
132
132
  { :length => 0..5, :format => /^([0-9][0-9]:[0-9][0-9])?$/ }
@@ -7,6 +7,7 @@ module NcsNavigator
7
7
  DEFAULT_MDES_VERSION = '2.0'
8
8
 
9
9
  autoload :CLI, 'ncs_navigator/warehouse/cli'
10
+ autoload :Comparator, 'ncs_navigator/warehouse/comparator'
10
11
  autoload :Configuration, 'ncs_navigator/warehouse/configuration'
11
12
  autoload :DataMapper, 'ncs_navigator/warehouse/data_mapper'
12
13
  autoload :DatabaseInitializer, 'ncs_navigator/warehouse/database_initializer'
@@ -54,10 +54,20 @@ the contents of the current reporting database. The default name for
54
54
  the XML file is the county name for the PSU plus the date; e.g.,
55
55
  cook-20110728.xml.
56
56
  DESC
57
+ method_option 'block-size', :type => :numeric, :aliases => %w(-b),
58
+ :desc => 'The maximum number of records to have in memory at once.',
59
+ :default => 5000
60
+ method_option 'include-pii', :type => :boolean, :default => false,
61
+ :desc => 'Include PII values in the emitted XML.'
62
+ method_option 'zip', :type => :boolean, :default => true,
63
+ :desc => 'Create a zip file alongside the XML. (Use --no-zip to disable.)'
64
+ method_option 'tables', :type => :string,
65
+ :desc => 'Emit XML for a subset of tables.', :banner => 'TABLE,TABLE,TABLE'
57
66
  def emit_xml(filename=nil)
58
67
  use_database
59
68
 
60
- XmlEmitter.new(configuration, filename).emit_xml
69
+ XmlEmitter.new(configuration, filename,
70
+ options.merge(:tables => options[:tables].try(:split, /\s*,\s*/))).emit_xml
61
71
  end
62
72
 
63
73
  desc 'etl', 'Performs the full extract-transform-load process for this configuration'
@@ -83,6 +93,41 @@ DESC
83
93
  exit 1
84
94
  end
85
95
  end
96
+
97
+ desc 'compare', 'Compares the contents of two warehouses, A & B.'
98
+ long_desc <<-DESC
99
+ Compares the contents of the MDES tables in two warehouses.
100
+
101
+ The comparison can be done at three levels:
102
+
103
+ 1. Record counts.
104
+
105
+ 2. IDs. (Which record IDs appear in one warehouse and not the other?)
106
+
107
+ 3. Full contents. (Matching records based on ID, what variables have different values?)
108
+
109
+ Each level includes the one before it (i.e., doing a full content
110
+ comparison also does ID and count comparisons). Higher levels skip
111
+ tables which would add heat without light; i.e., level 2 only compares
112
+ IDs for a table when there are at least some records in each warehouse
113
+ and level 3 only compares content for tables where there are some
114
+ overlapping IDs.
115
+ DESC
116
+ method_option 'warehouse-a', :type => :string, :aliases => %w(-a),
117
+ :desc => 'The configuration file for warehouse A. The environment default will be used if not specified.'
118
+ method_option 'warehouse-b', :type => :string, :required => true, :aliases => %w(-b),
119
+ :desc => 'The configuration file for warehouse B.'
120
+ method_option 'level', :type => :numeric, :default => 1,
121
+ :desc => 'The level of detail for the comparison.'
122
# CLI entry point for the `compare` subcommand: builds one configuration
# per warehouse and hands both to a Comparator.
#
# NOTE(review): this writes options['config'] before calling
# `configuration`; presumably `configuration` reads that key to decide
# which file to load -- confirm against its definition (not visible here).
def compare
  a_path = options['warehouse-a']
  options['config'] = a_path if a_path

  # Warehouse A must be resolved before warehouse B is loaded.
  config_a = configuration
  config_b = Configuration.from_file(options['warehouse-b'])

  comparator = Comparator.new(config_a, config_b, options)
  comparator.compare
end
86
131
  end
87
132
  end
88
133
 
@@ -0,0 +1,257 @@
1
+ require 'ncs_navigator/warehouse'
2
+
3
+ require 'forwardable'
4
+
5
+ module NcsNavigator::Warehouse
6
+ class Comparator
7
+ extend Forwardable
8
+
9
+ attr_reader :level
10
+ attr_reader :a, :b
11
+
12
+ def_delegators :a, :shell
13
+
14
# Records the two warehouse configurations and connects a reporting
# repository for each (registered with the prefixes 'a' and 'b').
#
# @param config_a configuration for warehouse A; progress output is
#   sent to its shell.
# @param config_b configuration for warehouse B.
# @param options [Hash] `:level` selects the comparison depth and
#   defaults to 1. The hash is only read, so the caller's hash is left
#   untouched and a frozen hash is acceptable.
def initialize(config_a, config_b, options={})
  @a = config_a
  @b = config_b
  # `||` rather than `||=`: assigning the default back into the caller's
  # hash is a side effect and raises if the hash is frozen.
  @level = options[:level] || 1

  shell.clear_line_and_say "Connecting to warehouse A..."
  DatabaseInitializer.new(@a).set_up_repository(:reporting, 'a')
  shell.clear_line_and_say "Connecting to warehouse B..."
  DatabaseInitializer.new(@b).set_up_repository(:reporting, 'b')
  shell.clear_line_and_say ''
end
24
+
25
# Runs each comparison whose threshold is at or below the configured
# level: counts at 1, IDs at 2, full contents at 3. Each step relies on
# the results recorded by the one before it, so the order is fixed.
def compare
  shell.say_line "Running level #{level} compare..."

  return if level < 1
  count_diff

  return if level < 2
  id_diff

  return if level < 3
  content_diff
end
31
+
32
# Compares record counts for every MDES model, remembers the tables
# whose counts differ in @count_diffs, and prints a summary.
def count_diff
  arg_sets = models.collect { |model| [model] }
  @count_diffs = collect_differences(CountDiff, arg_sets)
  shell_summarize_differences(CountDiff, @count_diffs)
end
36
+
37
# Compares record IDs for every model that passes no_zero_counts?,
# remembers the tables whose ID sets differ in @id_diffs, and prints a
# summary.
def id_diff
  candidates = models.select { |model| no_zero_counts?(model) }
  arg_sets = candidates.collect { |model| [model] }
  @id_diffs = collect_differences(IdDiff, arg_sets)
  shell_summarize_differences(IdDiff, @id_diffs)
end
42
+
43
# Compares record contents for every model for which list_shared_ids
# yields a list of IDs, remembers the tables with differing records in
# @content_diffs, and prints a summary.
def content_diff
  arg_sets = models.inject([]) do |sets, model|
    shared = list_shared_ids(model)
    sets << [model, shared] if shared
    sets
  end
  @content_diffs = collect_differences(ContentDiff, arg_sets)
  shell_summarize_differences(ContentDiff, @content_diffs)
end
48
+
49
+ private
50
+
51
# The MDES models in the order given by warehouse A's models module.
# Loaded on first use (with a progress message) and cached afterwards.
def models
  return @models if @models

  shell.clear_line_and_say "Loading MDES models..."
  @models = a.models_module.mdes_order
end
57
+
58
# Instantiates `differ` once per argument set, runs its #compare, and
# gathers the non-nil results. Progress and total elapsed time are
# reported through the shell.
#
# @param differ a diff class responding to .thing, .thing_plural, .new
# @param arg_sets [Array<Array>] constructor arguments; the first
#   element of each set is the model being compared
# @return [Array] the diff objects that reported a difference
def collect_differences(differ, arg_sets)
  started_at = Time.now
  found = []

  arg_sets.each do |args|
    shell.clear_line_and_say "comparing #{differ.thing_plural} for #{args.first.mdes_table_name}..."
    outcome = differ.new(*args).compare
    found << outcome unless outcome.nil?
  end

  elapsed = Time.now - started_at
  shell.clear_line_and_say "#{differ.thing} comparisons complete in %.1fs.\n" % elapsed
  found
end
68
+
69
# Prints a summary of one comparison level to standard output: either a
# "no differences" line or a heading followed by each diff's own
# summary. A blank line is always printed last.
#
# @param differ a diff class responding to .thing and .thing_plural
# @param diffs [Array] diff objects responding to #shell_summary
def shell_summarize_differences(differ, diffs)
  if diffs.empty?
    puts "There are no #{differ.thing} differences."
    puts
    return
  end

  plural_suffix = diffs.size == 1 ? '' : 's'
  puts "#{differ.thing_plural} differ in #{diffs.size} table#{plural_suffix}:"
  diffs.each { |entry| puts entry.shell_summary }
  puts
end
80
+
81
# Whether the model's table is worth an ID comparison. If a count
# difference was recorded for the table, both counts must be non-zero;
# a table with no recorded count difference always qualifies.
def no_zero_counts?(model)
  recorded = @count_diffs.find { |entry| entry.table_name == model.mdes_table_name }
  return true unless recorded

  !(recorded.a_count == 0 || recorded.b_count == 0)
end
88
+
89
# The record IDs present in both warehouses for the model's table, or
# nil when there is nothing to content-compare.
#
# @id_diffs only retains tables whose ID sets differ, so a table whose
# IDs match exactly has no entry there. Such tables are the most
# useful ones to content-compare, so their shared IDs are re-derived
# here instead of being skipped.
#
# @return [Array, nil] shared IDs, or nil if the table was excluded
#   from the ID comparison or has no overlapping IDs
def list_shared_ids(model)
  recorded = @id_diffs.find { |d| d.model == model }
  return recorded.shared if recorded

  # Tables filtered out of the ID pass (one side empty) stay excluded.
  return nil unless no_zero_counts?(model)

  # No recorded difference: the ID sets matched, so query them again
  # to learn what they are.
  fresh = IdDiff.new(model)
  fresh.compare
  shared = fresh.shared
  shared.empty? ? nil : shared
end
96
+
97
# @private
# Compares the number of rows in one MDES table between the
# 'a_reporting' and 'b_reporting' DataMapper repositories.
class CountDiff
  attr_reader :table_name, :a_count, :b_count

  class << self
    # Label used in progress and summary messages.
    def thing
      'count'
    end

    def thing_plural
      'counts'
    end
  end

  # @param model an MDES model; only its mdes_table_name is used
  def initialize(model)
    @table_name = model.mdes_table_name
    @q = "SELECT COUNT(*) FROM #{table_name}"
  end

  # Fetches both counts.
  # @return [CountDiff, nil] self when the counts differ, otherwise nil
  def compare
    @a_count = count('a_reporting')
    @b_count = count('b_reporting')
    difference == 0 ? nil : self
  end

  # Signed difference: positive when A has more rows than B.
  def difference
    a_count - b_count
  end

  # First value returned by the COUNT query in the named repository.
  def count(repo)
    adapter = ::DataMapper.repository(repo).adapter
    adapter.select(@q).first
  end

  # One formatted line: table name, both counts with a comparison sign
  # between them, and the absolute difference.
  def shell_summary
    sign = a_count > b_count ? '>' : '<'
    fields = [table_name, a_count, sign, b_count, difference.abs]
    " %30s: A = %6d %s %-6d = B | %6d" % fields
  end
end
133
+
134
# @private
# Compares the key values of one MDES table between the 'a_reporting'
# and 'b_reporting' DataMapper repositories.
class IdDiff
  attr_reader :model, :a_ids, :b_ids

  class << self
    # Label used in progress and summary messages.
    def thing
      'ID'
    end

    def thing_plural
      'IDs'
    end
  end

  # @param model an MDES model; the name of its first key property and
  #   its mdes_table_name are used to build the query
  def initialize(model)
    @model = model
    key_name = model.key.first.name
    @q = "SELECT #{key_name} FROM #{model.mdes_table_name}"
  end

  # Loads the IDs from both repositories.
  # @return [IdDiff, nil] self when either side has an ID the other
  #   lacks, otherwise nil
  def compare
    @a_ids = ids('a_reporting')
    @b_ids = ids('b_reporting')
    return nil if a_only.empty? && b_only.empty?

    self
  end

  # IDs found only in A (memoized).
  def a_only
    @a_only ||= a_ids - b_ids
  end

  # IDs found only in B (memoized).
  def b_only
    @b_only ||= b_ids - a_ids
  end

  # IDs found in both A and B (memoized).
  def shared
    @shared ||= a_ids & b_ids
  end

  # Result of the ID query in the named repository.
  def ids(repo)
    adapter = ::DataMapper.repository(repo).adapter
    adapter.select(@q)
  end

  # Multi-line summary: the table name, the IDs unique to each side
  # (a line is omitted when that side has none), and the shared count.
  def shell_summary
    lines = [" #{model.mdes_table_name}"]
    lines << " #{a_only.size} in A only: #{a_only.join(', ')}" unless a_only.empty?
    lines << " #{b_only.size} in B only: #{b_only.join(', ')}" unless b_only.empty?
    lines << " #{shared.size} the same in both A and B."
    lines.join("\n")
  end
end
179
+
180
# @private
# Compares, column by column, the records of one MDES table whose IDs
# appear in both the 'a_reporting' and 'b_reporting' DataMapper
# repositories. Records are fetched in blocks of BLOCK_SIZE IDs.
class ContentDiff
  BLOCK_SIZE = 5000

  attr_reader :model, :shared_ids

  # Label used in progress and summary messages.
  def self.thing; 'content'; end
  def self.thing_plural; 'contents'; end

  # @param model an MDES model; the name of its first key property and
  #   its mdes_table_name are used to build queries
  # @param shared_ids [Array] IDs expected to exist in both warehouses
  def initialize(model, shared_ids)
    @model = model
    @shared_ids = shared_ids
  end

  # Fetches each block of shared IDs from both repositories and records
  # every record whose values differ.
  #
  # Records are matched on their key value, not on their position in
  # the result set: the two databases need not sort identically, and a
  # positional pairing would compare unrelated rows (or hit nil) if
  # either side returned fewer rows than expected.
  #
  # @return [ContentDiff, nil] self when any record differs, else nil
  def compare
    key = model.key.first.name

    id_blocks.each do |ids|
      a_records = find_all('a_reporting', key, ids)
      b_by_id = find_all('b_reporting', key, ids).inject({}) { |index, rec|
        index[rec[key]] = rec
        index
      }

      a_records.each do |a_rec|
        id = a_rec[key]
        b_rec = b_by_id[id]
        # A row missing from B here is an ID-level difference, not a
        # content difference, so it is skipped.
        compare_records(id, a_rec, b_rec) if b_rec
      end
    end

    self unless differences.empty?
  end

  # Differences found so far, keyed by record ID. Each value maps a
  # member name to its [a_value, b_value] pair.
  def differences
    @differences ||= {}
  end

  # shared_ids split into consecutive blocks of at most BLOCK_SIZE.
  def id_blocks
    @id_blocks ||= shared_ids.each_slice(BLOCK_SIZE).to_a
  end

  # Compares every member of a_record against the same member of
  # b_record and stores any mismatches under `id` in #differences.
  def compare_records(id, a_record, b_record)
    rec_diff = a_record.members.inject({}) { |h, prop|
      a_value = a_record.send(prop)
      b_value = b_record.send(prop)
      h[prop] = [a_value, b_value] if a_value != b_value
      h
    }
    unless rec_diff.empty?
      differences[id] = rec_diff
    end
  end

  # Multi-line summary: the table name, then each differing record ID
  # followed by one line per differing member.
  def shell_summary
    [
      " #{model.mdes_table_name}:",
      differences.each_pair.collect do |id, rec_diff|
        [" #{id}:"] +
          rec_diff.each_pair.collect { |var, vals|
            " #{var}: #{vals.collect(&:inspect).join(' != ')}"
          }
      end
    ].flatten.join("\n")
  end

  private

  # Selects the rows of the model's table whose key is in `ids` from
  # the named repository. Embedded single quotes are doubled so an ID
  # containing one cannot break out of its SQL string literal.
  def find_all(repo_name, key, ids)
    ids_param = ids.collect { |id| "'#{id.to_s.gsub("'", "''")}'" }.join(', ')
    q = "SELECT * FROM #{model.mdes_table_name} WHERE #{key} IN (#{ids_param}) ORDER BY #{key}"
    ::DataMapper.repository(repo_name).adapter.select(q)
  end
end
256
+ end
257
+ end
@@ -23,16 +23,36 @@ module NcsNavigator::Warehouse
23
23
  end
24
24
  end
25
25
 
26
# Mixin that wraps the including class's typecast_to_primitive so that
# BigDecimal values are rendered with BigDecimal#to_s('F') (plain
# decimal notation) instead of going through the original typecast.
# Every other value is passed to the original implementation, which
# stays reachable as original_typecast_to_primitive.
module NcsStringType
  # Hook run on `include`; the including class must already respond to
  # typecast_to_primitive, since it is aliased here.
  def self.included(into)
    into.class_eval do
      alias_method :original_typecast_to_primitive, :typecast_to_primitive

      def typecast_to_primitive(value)
        return original_typecast_to_primitive(value) unless value.is_a?(BigDecimal)

        value.to_s('F')
      end
    end
  end
end
43
+
26
44
  ##
27
45
  # DataMapper `:string` that's an {NcsType}.
28
46
  class NcsString < ::DataMapper::Property::String
29
47
  include NcsType
48
+ include NcsStringType
30
49
  end
31
50
 
32
51
  ##
33
52
  # DataMapper `:text` that's an {NcsType}.
34
53
  class NcsText < ::DataMapper::Property::Text
35
54
  include NcsType
55
+ include NcsStringType
36
56
  end
37
57
 
38
58
  ##
@@ -44,18 +44,17 @@ module NcsNavigator::Warehouse
44
44
  # be the reporting repo unless only the working repo is
45
45
  # configured in.
46
46
  # @return [void]
47
- def set_up_repository(mode=:reporting)
47
+ def set_up_repository(mode=:reporting, prefix="mdes_warehouse")
48
48
  fail "Invalid mode #{mode.inspect}" unless [:reporting, :working, :both].include?(mode)
49
49
  modes = case mode
50
50
  when :both then [:reporting, :working]
51
51
  else [mode]
52
52
  end
53
53
  connect_one(modes.first, :default)
54
- modes.each { |m| connect_one(m) }
54
+ modes.each { |m| connect_one(m, [prefix, m].join('_').to_sym) }
55
55
  end
56
56
 
57
- def connect_one(which_one, dm_name=nil)
58
- dm_name ||= :"mdes_warehouse_#{which_one}"
57
+ def connect_one(which_one, dm_name)
59
58
  log.info "Connecting DataMapper repository #{dm_name.inspect}"
60
59
  p = params(which_one)
61
60
  log.debug " using #{p.merge('password' => 'SUPPRESSED').inspect}"