ncs_mdes_warehouse 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format=nested
data/CHANGELOG.md CHANGED
@@ -1,6 +1,26 @@
1
1
  NCS Navigator MDES Warehouse History
2
2
  ====================================
3
3
 
4
+ 0.2.0
5
+ -----
6
+
7
+ - Automatically set PSU ID and recruitment type in
8
+ `Transformers::EnumTransformer` if they are not already set. (#1630,
9
+ #1648)
10
+
11
+ - ETL: store invalid or failing record IDs as a separate column
12
+ instead of as part of TransformError#message. (#1636)
13
+
14
+ - When an MDES model string variable is set from a BigDecimal, coerce
15
+ it to a floating point string. The Ruby default is scientific
16
+ notation, but that's not permitted in the MDES.
17
+
18
+ - New options for `emit-xml`: `--tables`, `--zip`,
19
+ `--include-pii`. (#1612, #1657, #1658)
20
+
21
+ - Add `compare` subcommand in `mdes-wh` for comparing the contents of
22
+ two warehouses. (#1667)
23
+
4
24
  0.1.1
5
25
  -----
6
26
 
data/Rakefile CHANGED
@@ -15,12 +15,11 @@ task :spec => 'spec:all'
15
15
  namespace :spec do
16
16
  RSpec::Core::RakeTask.new(:fast) do |t|
17
17
  t.pattern = "spec/**/*_spec.rb"
18
- t.rspec_opts = %q(--format nested --tag ~slow)
18
+ t.rspec_opts = %q(--tag ~slow)
19
19
  end
20
20
 
21
21
  RSpec::Core::RakeTask.new(:all) do |t|
22
22
  t.pattern = "spec/**/*_spec.rb"
23
- t.rspec_opts = %q(--format nested)
24
23
  end
25
24
  end
26
25
 
@@ -38,7 +37,7 @@ namespace :ci do
38
37
 
39
38
  task :spec_setup do
40
39
  ENV['CI_REPORTS'] = 'reports/spec-xml'
41
- ENV['SPEC_OPTS'] = "#{ENV['SPEC_OPTS']} --format nested"
40
+ ENV['SPEC_OPTS'] = "#{ENV['SPEC_OPTS']} --format nested --no-color"
42
41
  end
43
42
 
44
43
  task :spec => [:spec_setup, 'ci:setup:rspecbase', 'rake:spec']
@@ -124,9 +124,9 @@ module NcsNavigator::Warehouse::Models::TwoPointZero
124
124
  property :centrifuge_location,
125
125
  NcsNavigator::Warehouse::DataMapper::NcsString,
126
126
  { :required => true, :length => 1..2, :set => ["1", "2", "-3", "-4"] }
127
- belongs_to :equip,
128
- 'NcsNavigator::Warehouse::Models::TwoPointZero::SpecEquipment',
129
- :child_key => [ :equip_id ], :required => false
127
+ property :equip_id,
128
+ NcsNavigator::Warehouse::DataMapper::NcsString,
129
+ { :length => 0..36 }
130
130
  property :centrifuge_time,
131
131
  NcsNavigator::Warehouse::DataMapper::NcsString,
132
132
  { :length => 0..5, :format => /^([0-9][0-9]:[0-9][0-9])?$/ }
@@ -7,6 +7,7 @@ module NcsNavigator
7
7
  DEFAULT_MDES_VERSION = '2.0'
8
8
 
9
9
  autoload :CLI, 'ncs_navigator/warehouse/cli'
10
+ autoload :Comparator, 'ncs_navigator/warehouse/comparator'
10
11
  autoload :Configuration, 'ncs_navigator/warehouse/configuration'
11
12
  autoload :DataMapper, 'ncs_navigator/warehouse/data_mapper'
12
13
  autoload :DatabaseInitializer, 'ncs_navigator/warehouse/database_initializer'
@@ -54,10 +54,20 @@ the contents of the current reporting database. The default name for
54
54
  the XML file is the county name for the PSU plus the date; e.g.,
55
55
  cook-20110728.xml.
56
56
  DESC
57
+ method_option 'block-size', :type => :numeric, :aliases => %w(-b),
58
+ :desc => 'The maximum number of records to have in memory at once.',
59
+ :default => 5000
60
+ method_option 'include-pii', :type => :boolean, :default => false,
61
+ :desc => 'Include PII values in the emitted XML.'
62
+ method_option 'zip', :type => :boolean, :default => true,
63
+ :desc => 'Create a zip file alongside the XML. (Use --no-zip to disable.)'
64
+ method_option 'tables', :type => :string,
65
+ :desc => 'Emit XML for a subset of tables.', :banner => 'TABLE,TABLE,TABLE'
57
66
  def emit_xml(filename=nil)
58
67
  use_database
59
68
 
60
- XmlEmitter.new(configuration, filename).emit_xml
69
+ XmlEmitter.new(configuration, filename,
70
+ options.merge(:tables => options[:tables].try(:split, /\s*,\s*/))).emit_xml
61
71
  end
62
72
 
63
73
  desc 'etl', 'Performs the full extract-transform-load process for this configuration'
@@ -83,6 +93,41 @@ DESC
83
93
  exit 1
84
94
  end
85
95
  end
96
+
97
+ desc 'compare', 'Compares the contents of two warehouses, A & B.'
98
+ long_desc <<-DESC
99
+ Compares the contents of the MDES tables in two warehouses.
100
+
101
+ The comparison can be done at three levels:
102
+
103
+ 1. Record counts.
104
+
105
+ 2. IDs. (Which record IDs appear in one warehouse and not the other?)
106
+
107
+ 3. Full contents. (Matching records based on ID, what variables have different values?)
108
+
109
+ Each level includes the one before it (i.e., doing a full content
110
+ comparison also does ID and count comparisons). Higher levels skip
111
+ tables which would add heat without light; i.e., level 2 only compares
112
+ IDs for a table when there are at least some records in each warehouse
113
+ and level 3 only compares content for tables where there are some
114
+ overlapping IDs.
115
+ DESC
116
+ method_option 'warehouse-a', :type => :string, :aliases => %w(-a),
117
+ :desc => 'The configuration file for warehouse A. The environment default will be used if not specified.'
118
+ method_option 'warehouse-b', :type => :string, :required => true, :aliases => %w(-b),
119
+ :desc => 'The configuration file for warehouse B.'
120
+ method_option 'level', :type => :numeric, :default => 1,
121
+ :desc => 'The level of detail for the comparison.'
122
# Entry point for the `compare` subcommand: builds a configuration for each
# of the two warehouses and hands both to a Comparator.
#
# Warehouse A comes from the `configuration` helper. When `--warehouse-a` is
# given, its value is first copied into `options['config']`.
# NOTE(review): presumably `configuration` reads `options['config']` and
# otherwise falls back to the environment default -- confirm against its
# definition, which is not visible here.
#
# Warehouse B is always loaded from the file named by `--warehouse-b`.
def compare
  warehouse_a_file = options['warehouse-a']
  options['config'] = warehouse_a_file if warehouse_a_file

  # A is resolved before B, as in the original statement order.
  Comparator.new(
    configuration,
    Configuration.from_file(options['warehouse-b']),
    options
  ).compare
end
86
131
  end
87
132
  end
88
133
 
@@ -0,0 +1,257 @@
1
+ require 'ncs_navigator/warehouse'
2
+
3
+ require 'forwardable'
4
+
5
+ module NcsNavigator::Warehouse
6
##
# Compares the contents of the MDES tables in two warehouses, "A" and "B".
#
# The comparison has three cumulative levels:
#
# 1. record counts per table,
# 2. record IDs per table (skipped for tables known to be empty on one side),
# 3. full record contents for the IDs that appear in both warehouses.
#
# Progress messages go to the shell of warehouse A's configuration; the
# summaries are written with `puts`.
class Comparator
  extend Forwardable

  attr_reader :level
  attr_reader :a, :b

  def_delegators :a, :shell

  ##
  # Connects DataMapper repositories for both warehouses using the
  # repository name prefixes 'a' and 'b' (the diff classes below query
  # the 'a_reporting' and 'b_reporting' repositories).
  #
  # @param config_a configuration for warehouse A. Also supplies the shell
  #   and the MDES models module used by this comparator.
  # @param config_b configuration for warehouse B.
  # @param [Hash] options
  # @option options [Integer] :level (1) how detailed a comparison
  #   {#compare} should run.
  def initialize(config_a, config_b, options={})
    @a = config_a
    @b = config_b
    # Read the default without writing it back: the caller's hash is left
    # untouched, so a frozen options hash is acceptable.
    @level = options[:level] || 1

    shell.clear_line_and_say "Connecting to warehouse A..."
    DatabaseInitializer.new(@a).set_up_repository(:reporting, 'a')
    shell.clear_line_and_say "Connecting to warehouse B..."
    DatabaseInitializer.new(@b).set_up_repository(:reporting, 'b')
    shell.clear_line_and_say ''
  end

  ##
  # Runs every comparison up to and including the configured level.
  def compare
    shell.say_line "Running level #{level} compare..."
    count_diff if level >= 1
    id_diff if level >= 2
    content_diff if level >= 3
  end

  ##
  # Level 1: compares the record count of every MDES table and prints a
  # summary of the tables whose counts differ.
  def count_diff
    @count_diffs = collect_differences(CountDiff, models.collect { |m| [m] })
    shell_summarize_differences(CountDiff, @count_diffs)
  end

  ##
  # Level 2: compares the record IDs of every table that is not known to
  # be empty in one of the warehouses and prints a summary.
  #
  # The shared IDs of *every* compared table are remembered, including
  # tables whose ID sets are identical, so that {#content_diff} can use
  # them.
  def id_diff
    @shared_ids = {}
    candidates = models.select { |m| no_zero_counts?(m) }.collect { |m| [m] }
    @id_diffs = collect_differences(IdDiff, candidates) do |comparison|
      # Only the shared list is retained; the full per-warehouse ID lists
      # of tables without differences can be garbage collected.
      @shared_ids[comparison.model] = comparison.shared
    end
    shell_summarize_differences(IdDiff, @id_diffs)
  end

  ##
  # Level 3: compares the full contents of the records whose IDs were
  # found in both warehouses by {#id_diff} and prints a summary. Tables
  # that were not ID-compared are skipped.
  def content_diff
    @content_diffs = collect_differences(
      ContentDiff, models.collect { |m| [m, list_shared_ids(m)] }.select { |m, shared| shared })
    shell_summarize_differences(ContentDiff, @content_diffs)
  end

  private

  # The MDES models, in the order given by warehouse A's models module.
  def models
    @models ||= begin
      shell.clear_line_and_say "Loading MDES models..."
      a.models_module.mdes_order
    end
  end

  # Builds one `differ` per argument set, runs it, and returns only the
  # instances that reported a difference (`compare` returns nil otherwise).
  #
  # If a block is given, it is called with every differ instance after its
  # comparison has run, whether or not it found a difference.
  def collect_differences(differ, arg_sets)
    start = Time.now
    arg_sets.collect do |arg_set|
      model = arg_set.first
      shell.clear_line_and_say "comparing #{differ.thing_plural} for #{model.mdes_table_name}..."
      comparison = differ.new(*arg_set)
      result = comparison.compare
      yield comparison if block_given?
      result
    end.compact.tap {
      shell.clear_line_and_say "#{differ.thing} comparisons complete in %.1fs.\n" % (Time.now - start)
    }
  end

  # Prints either a "no differences" line or one summary per differing table.
  def shell_summarize_differences(differ, diffs)
    if diffs.empty?
      puts "There are no #{differ.thing} differences."
    else
      puts "#{differ.thing_plural} differ in #{diffs.size} table#{'s' if diffs.size != 1}:"
      diffs.each do |diff|
        puts diff.shell_summary
      end
    end
    puts
  end

  # False only when a recorded count difference shows that the table is
  # empty in one warehouse. Tables without a recorded count difference
  # (equal counts, or counts never compared) are treated as comparable.
  def no_zero_counts?(model)
    diff = (@count_diffs || []).find { |d| d.table_name == model.mdes_table_name }
    if diff
      diff.a_count != 0 && diff.b_count != 0
    else
      true
    end
  end

  # The IDs found in both warehouses for `model`, or nil when the table
  # has not been ID-compared.
  def list_shared_ids(model)
    (@shared_ids || {})[model]
  end

  # @private
  class CountDiff
    attr_reader :table_name, :a_count, :b_count

    def self.thing; 'count'; end
    def self.thing_plural; 'counts'; end

    def initialize(model)
      @table_name = model.mdes_table_name
      @q = "SELECT COUNT(*) FROM #{table_name}"
    end

    # Counts the table in both repositories.
    # @return [CountDiff, nil] self if the counts differ, nil otherwise.
    def compare
      @a_count = count('a_reporting')
      @b_count = count('b_reporting')
      self if difference != 0
    end

    def difference
      a_count - b_count
    end

    def count(repo)
      ::DataMapper.repository(repo).adapter.select(@q).first
    end

    def shell_summary
      " %30s: A = %6d %s %-6d = B | %6d" % [
        table_name,
        a_count, a_count > b_count ? '>' : '<', b_count,
        difference.abs
      ]
    end
  end

  # @private
  class IdDiff
    attr_reader :model, :a_ids, :b_ids

    def self.thing; 'ID'; end
    def self.thing_plural; 'IDs'; end

    def initialize(model)
      @model = model
      @q = "SELECT #{model.key.first.name} FROM #{model.mdes_table_name}"
    end

    # Loads the IDs from both repositories.
    # @return [IdDiff, nil] self if either side has IDs the other lacks,
    #   nil otherwise.
    def compare
      @a_ids = ids('a_reporting')
      @b_ids = ids('b_reporting')
      self unless a_only.empty? && b_only.empty?
    end

    def a_only
      @a_only ||= a_ids - b_ids
    end

    def b_only
      @b_only ||= b_ids - a_ids
    end

    def shared
      @shared ||= a_ids & b_ids
    end

    def ids(repo)
      ::DataMapper.repository(repo).adapter.select(@q)
    end

    def shell_summary
      [
        " #{model.mdes_table_name}",
        (" #{a_only.size} in A only: #{a_only.join(', ')}" unless a_only.empty?),
        (" #{b_only.size} in B only: #{b_only.join(', ')}" unless b_only.empty?),
        " #{shared.size} the same in both A and B."
      ].compact.join("\n")
    end
  end

  # @private
  class ContentDiff
    BLOCK_SIZE = 5000

    attr_reader :model, :shared_ids

    def self.thing; 'content'; end
    def self.thing_plural; 'contents'; end

    # @param model the MDES model whose table is compared.
    # @param [Array] shared_ids the IDs to compare.
    # @param [Integer] block_size the maximum number of IDs per query.
    def initialize(model, shared_ids, block_size=BLOCK_SIZE)
      @model = model
      @shared_ids = shared_ids
      @block_size = block_size
    end

    # Fetches the shared records from both repositories one block of IDs
    # at a time and records every field whose values differ.
    # @return [ContentDiff, nil] self if any record differs, nil otherwise.
    def compare
      key = model.key.first.name
      id_blocks.each do |ids|
        ids_param = ids.collect { |id| quote(id) }.join(', ')
        q = "SELECT * FROM #{model.mdes_table_name} WHERE #{key} IN (#{ids_param}) ORDER BY #{key}"
        a_records = ::DataMapper.repository('a_reporting').adapter.select(q)
        b_records = ::DataMapper.repository('b_reporting').adapter.select(q)

        # Pair rows by ID rather than by position so that a differing sort
        # order or a row missing from one result cannot mis-pair records.
        b_by_id = b_records.inject({}) { |h, rec| h[rec[key]] = rec; h }
        a_records.each do |a_rec|
          id = a_rec[key]
          b_rec = b_by_id[id]
          compare_records(id, a_rec, b_rec) if b_rec
        end
      end

      self unless differences.empty?
    end

    # @return [Hash] record ID => { field => [A value, B value] } for every
    #   differing field found so far.
    def differences
      @differences ||= {}
    end

    # The shared IDs split into arrays of at most the block size.
    def id_blocks
      @id_blocks ||= shared_ids.each_slice(@block_size).to_a
    end

    def compare_records(id, a_record, b_record)
      rec_diff = a_record.members.inject({}) { |h, prop|
        a_value = a_record.send(prop)
        b_value = b_record.send(prop)
        h[prop] = [a_value, b_value] if a_value != b_value
        h
      }
      differences[id] = rec_diff unless rec_diff.empty?
    end

    def shell_summary
      [
        " #{model.mdes_table_name}:",
        differences.each_pair.collect do |id, rec_diff|
          [" #{id}:"] +
            rec_diff.each_pair.collect { |var, vals|
              " #{var}: #{vals.collect(&:inspect).join(' != ')}"
            }
        end
      ].flatten.join("\n")
    end

    private

    # Renders an ID as a SQL string literal, doubling embedded apostrophes.
    def quote(id)
      "'#{id.to_s.gsub("'", "''")}'"
    end
  end
end
257
+ end
@@ -23,16 +23,36 @@ module NcsNavigator::Warehouse
23
23
  end
24
24
  end
25
25
 
26
##
# Mixin for string-valued property types: when a value being typecast is a
# BigDecimal it is rendered with `to_s('F')` (plain decimal digits) instead
# of BigDecimal's default scientific notation. Every other value is passed
# on to the including class's inherited `typecast_to_primitive`.
#
# Implemented as an ordinary module method that calls `super`, so including
# the module more than once in a class hierarchy is harmless. (An
# alias-based override installed from an `included` hook would alias the
# already-overridden method on the second inclusion and recurse forever.)
#
# NOTE(review): the method is public here, matching the visibility a `def`
# inside a `class_eval` block produces; confirm whether the library's own
# `typecast_to_primitive` is meant to be protected.
module NcsStringType
  # @param value the value to convert to the property's primitive type.
  # @return [String] the 'F'-formatted string for a BigDecimal; otherwise
  #   whatever the inherited implementation returns.
  def typecast_to_primitive(value)
    if value.is_a?(BigDecimal)
      value.to_s('F')
    else
      super
    end
  end
end
43
+
26
44
  ##
27
45
  # DataMapper `:string` that's an {NcsType}.
28
46
  class NcsString < ::DataMapper::Property::String
29
47
  include NcsType
48
+ include NcsStringType
30
49
  end
31
50
 
32
51
  ##
33
52
  # DataMapper `:text` that's an {NcsType}.
34
53
  class NcsText < ::DataMapper::Property::Text
35
54
  include NcsType
55
+ include NcsStringType
36
56
  end
37
57
 
38
58
  ##
@@ -44,18 +44,17 @@ module NcsNavigator::Warehouse
44
44
  # be the reporting repo unless only the working repo is
45
45
  # configured in.
46
46
  # @return [void]
47
- def set_up_repository(mode=:reporting)
47
+ def set_up_repository(mode=:reporting, prefix="mdes_warehouse")
48
48
  fail "Invalid mode #{mode.inspect}" unless [:reporting, :working, :both].include?(mode)
49
49
  modes = case mode
50
50
  when :both then [:reporting, :working]
51
51
  else [mode]
52
52
  end
53
53
  connect_one(modes.first, :default)
54
- modes.each { |m| connect_one(m) }
54
+ modes.each { |m| connect_one(m, [prefix, m].join('_').to_sym) }
55
55
  end
56
56
 
57
- def connect_one(which_one, dm_name=nil)
58
- dm_name ||= :"mdes_warehouse_#{which_one}"
57
+ def connect_one(which_one, dm_name)
59
58
  log.info "Connecting DataMapper repository #{dm_name.inspect}"
60
59
  p = params(which_one)
61
60
  log.debug " using #{p.merge('password' => 'SUPPRESSED').inspect}"