real_data_tests 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,304 @@
1
+ require 'csv'
2
+ require 'tmpdir'
3
+ require 'fileutils'
4
+ require 'json'
5
+ require 'set'
6
+
7
module RealDataTests
  # Renders a collection of ActiveRecord-style records as PostgreSQL INSERT
  # statements, ordered so that models referenced through belongs_to
  # associations are written before the models that reference them.
  class PgDumpGenerator
    # @param records [Enumerable<Object>] model instances to dump
    def initialize(records)
      @records = records
    end

    # Builds the SQL dump.
    #
    # @return [String] one INSERT statement per record, joined by newlines
    # @raise [RuntimeError] when two or more models depend on each other
    def generate
      collect_inserts(sort_by_dependencies(@records)).join("\n")
    end

    private

    # Orders records so that every model comes after the models it depends on.
    def sort_by_dependencies(records)
      records_by_model = records.group_by(&:class)
      dependencies = build_dependency_graph(records_by_model.keys)

      topological_sort(dependencies).flat_map { |model| records_by_model[model] || [] }
    end

    # Maps each model to the other models (among +models+) it depends on.
    #
    # Only belongs_to associations are considered as foreign-key dependencies.
    # Polymorphic associations are skipped because they have no single target
    # class. A model never depends on itself: a self-referential belongs_to
    # (for example a parent row in the same table) does not influence the order
    # in which *tables* are written and used to be reported as a bogus cycle.
    def build_dependency_graph(models)
      known_models = models.to_set

      models.each_with_object({}) do |model, deps|
        direct_dependencies = model.reflect_on_all_associations(:belongs_to)
                                   .reject(&:polymorphic?)
                                   .map(&:klass)

        # NOTE(review): join_table_model is not a reflection method this file
        # defines; it is only called when the reflection responds to it, so a
        # model with HABTM associations no longer aborts the whole dump.
        habtm_dependencies = model.reflect_on_all_associations(:has_and_belongs_to_many)
                                  .map { |assoc| assoc.join_table_model if assoc.respond_to?(:join_table_model) }
                                  .compact

        deps[model] = (direct_dependencies + habtm_dependencies)
                      .select { |candidate| known_models.include?(candidate) }
                      .uniq - [model]
      end
    end

    # Depth-first topological sort of the dependency graph.
    #
    # @return [Array] models with dependencies listed before their dependents
    def topological_sort(dependencies)
      sorted = []
      visited = Set.new
      temporary = Set.new

      dependencies.each_key do |model|
        visit_model(model, dependencies, sorted, visited, temporary)
      end

      sorted
    end

    # Visits one node; +temporary+ holds the models on the current DFS path.
    def visit_model(model, dependencies, sorted, visited, temporary)
      return if visited.include?(model)

      if temporary.include?(model)
        cycle = detect_cycle(model, dependencies, temporary)
        raise "Circular dependency detected: #{cycle.map(&:name).join(' -> ')}"
      end

      temporary.add(model)
      (dependencies[model] || []).each do |dependency|
        visit_model(dependency, dependencies, sorted, visited, temporary)
      end
      temporary.delete(model)

      visited.add(model)
      sorted << model
    end

    # Reconstructs the cycle through +start_model+ for the error message by
    # following, from each model, its first dependency that is on the DFS path.
    def detect_cycle(start_model, dependencies, temporary)
      cycle = [start_model]
      current = dependencies[start_model]&.find { |dep| temporary.include?(dep) }

      while current && current != start_model
        cycle << current
        current = dependencies[current]&.find { |dep| temporary.include?(dep) }
      end

      cycle << start_model if current == start_model
      cycle
    end

    # Builds one INSERT statement per record.
    #
    # NOTE(review): table and column names are interpolated unquoted, exactly
    # as before; reserved-word identifiers would need quoting.
    def collect_inserts(records)
      records.map do |record|
        model = record.class
        columns = model.column_names
        values = columns.map { |column| column_value(record, column) }

        <<~SQL.strip
          INSERT INTO #{model.table_name}
          (#{columns.join(', ')})
          VALUES (#{values.join(', ')})
          #{conflict_clause(model)}
        SQL
      end
    end

    # ON CONFLICT clause targeting the model's primary key column(s). Falls
    # back to a target-less clause when the model reports no primary key.
    def conflict_clause(model)
      key_columns = Array(model.primary_key)
      return 'ON CONFLICT DO NOTHING;' if key_columns.empty?

      "ON CONFLICT (#{key_columns.join(', ')}) DO NOTHING;"
    end

    # SQL literal for one column of one record. Enum columns use the value
    # before type cast, but are quoted according to the column type so that
    # string-backed enums produce a quoted literal.
    def column_value(record, column)
      model = record.class
      value = if enum_column?(model, column)
                record.read_attribute_before_type_cast(column)
              else
                record[column]
              end

      quote_value(value, get_column_info(model, column))
    end

    def enum_column?(model, column)
      model.respond_to?(:defined_enums) && model.defined_enums.key?(column)
    end

    # Extracts the column metadata that quote_value needs.
    def get_column_info(model, column_name)
      column = model.columns_hash[column_name]
      {
        type: column.type,
        sql_type: column.sql_type,
        array: column.array
      }
    end

    # Converts a Ruby value into a SQL literal for the described column.
    #
    # The array flag is checked first: an array column reports its element
    # type as the column type, so checking the type first rendered e.g.
    # integer[] values with Array#to_s.
    def quote_value(value, column_info)
      return 'NULL' if value.nil?

      if column_info[:array] || column_info[:type] == :array
        return parse_and_format_array(value, column_info[:sql_type])
      end

      case column_info[:type]
      when :integer, :decimal, :float, :boolean
        value.to_s
      when :json, :jsonb
        parse_and_format_special_type(value, column_info)
      else
        sanitize_string(value.to_s)
      end
    end

    # Formats array and JSON/JSONB values.
    def parse_and_format_special_type(value, column_info)
      if column_info[:array] || column_info[:type] == :array
        parse_and_format_array(value, column_info[:sql_type])
      else
        # Strings are treated as already-serialized JSON.
        json_value = value.is_a?(String) ? value : value.to_json
        sanitize_string(json_value)
      end
    end

    # Renders +value+ as a typed PostgreSQL array expression.
    #
    # @param value [Array, String, nil, Object] array, JSON text, PG array
    #   literal text, or a single element
    # @param sql_type [String] SQL type of the column
    # @return [String] "ARRAY[...]::type[]", or "'{}'::type[]" when empty
    def parse_and_format_array(value, sql_type)
      base_type = extract_base_type(sql_type)
      elements = array_elements(value)
      return "'{}'::#{base_type}[]" if elements.empty?

      "ARRAY[#{elements.map { |element| quote_array_element(element) }.join(',')}]::#{base_type}[]"
    end

    # Normalizes the accepted array representations to a Ruby Array.
    def array_elements(value)
      case value
      when nil then []
      when Array then value
      when String then parse_array_string(value)
      else [value]
      end
    end

    # Parses JSON text, falling back to a naive split of a PG array literal.
    def parse_array_string(text)
      return [] if text == '{}' # PG literal for an empty array, not a JSON object

      parsed = JSON.parse(text)
      # JSON text may hold a scalar or an object; treat it as one element.
      parsed.is_a?(Array) ? parsed : [parsed]
    rescue JSON::ParserError
      text.gsub(/[{}"]/, '').split(',')
    end

    # SQL literal for a single array element.
    def quote_array_element(element)
      case element
      when nil then 'NULL'
      when Numeric then element.to_s
      when String then sanitize_string(element)
      when Hash, Array then sanitize_string(element.to_json) # JSON, not Ruby inspect output
      else sanitize_string(element.to_s)
      end
    end

    # Backward-compatible variant of parse_and_format_array that takes the
    # column info hash and always casts :string columns to character varying[].
    def format_array(value, column_info)
      sql_type = column_info[:type] == :string ? 'character varying' : column_info[:sql_type]
      parse_and_format_array(value, sql_type)
    end

    # Element type name for an array SQL type. This file used to define the
    # method twice; the later definition (returning 'character varying' for
    # varchar arrays) is the one Ruby kept, so that behavior is preserved.
    def extract_base_type(sql_type)
      case sql_type
      when /character varying\[\]/i, /varchar\[\]/i
        'character varying'
      when /text\[\]/i
        'text'
      when /integer\[\]/i
        'integer'
      when /bigint\[\]/i
        'bigint'
      when /jsonb\[\]/i
        'jsonb'
      when /json\[\]/i
        'json'
      else
        sql_type.sub(/\[\]$/, '')
      end
    end

    # Wraps +str+ in single quotes, doubling embedded single quotes.
    def sanitize_string(str)
      "'#{str.gsub("'", "''")}'"
    end

    # Command-line connection flags built from the ActiveRecord database
    # configuration. Nothing in this class calls it.
    def connection_options
      config = if ActiveRecord::Base.respond_to?(:connection_db_config)
                 ActiveRecord::Base.connection_db_config.configuration_hash
               else
                 ActiveRecord::Base.connection_config
               end

      options = []
      options << "-h #{config[:host]}" if config[:host]
      options << "-p #{config[:port]}" if config[:port]
      options << "-U #{config[:username]}" if config[:username]
      options << "-d #{config[:database]}"
      options << "-q" # Run quietly
      options.join(" ")
    end
  end
end
@@ -0,0 +1,117 @@
1
module RealDataTests
  # Walks the association graph starting from one record and gathers every
  # record it reaches exactly once, printing progress and per-model statistics
  # to stdout along the way.
  class RecordCollector
    # @param record [Object] the record collection starts from
    def initialize(record)
      @record = record
      @collected_records = Set.new
      @collection_stats = Hash.new { |h, k| h[k] = { count: 0, associations: Hash.new(0) } }
      @processed_associations = Set.new
      @association_path = []
    end

    # Collects the starting record and everything reachable from it.
    #
    # @return [Array<Object>] unique records in the order they were discovered
    def collect
      puts "\nStarting record collection from: #{@record.class.name}##{@record.id}"
      filter_mode = RealDataTests.configuration.association_filter_mode
      filter_list = RealDataTests.configuration.association_filter_list
      puts "Using #{filter_mode || 'no'} filter with #{filter_list.any? ? filter_list.join(', ') : 'no associations'}"
      collect_record(@record)
      print_collection_stats
      @collected_records.to_a
    end

    private

    # Decides whether an association of +record+ should be followed, printing
    # the reason whenever a configured rule excludes it.
    #
    # This method used to be defined but never called, so the reciprocal
    # prevention rule had no effect; collect_associations now goes through it.
    #
    # @return [Boolean]
    def should_process_association?(record, association)
      association_key = "#{record.class.name}##{record.id}:#{association.name}"
      # Set#add? returns nil when the key is already present.
      return false unless @processed_associations.add?(association_key)

      config = RealDataTests.configuration

      unless config.should_process_association?(record, association.name)
        puts "  Skipping #{config.association_filter_mode == :whitelist ? 'non-whitelisted' : 'blacklisted'} association: #{association.name} for #{record.class.name}"
        return false
      end

      # NOTE(review): presumably prevent_reciprocal? is false when nothing is
      # configured for this model/association — confirm against Configuration.
      if config.prevent_reciprocal?(record.class, association.name)
        puts "  Skipping prevented reciprocal association: #{association.name} on #{record.class.name}"
        return false
      end

      true
    end

    # Adds +record+ (if new) and recurses into its associations.
    def collect_record(record)
      return unless record # Guard against nil records
      return if @collected_records.include?(record)

      @collected_records.add(record)
      @collection_stats[record.class.name][:count] += 1
      collect_associations(record)
    end

    # Follows every association of +record+ that passes the configured rules.
    def collect_associations(record)
      return unless record.class.respond_to?(:reflect_on_all_associations)

      associations = record.class.reflect_on_all_associations
      puts "\nProcessing associations for: #{record.class.name}##{record.id}"
      puts "Found #{associations.length} associations"

      associations.each do |association|
        next unless should_process_association?(record, association)

        puts "  Processing #{association.macro} association: #{association.name}"
        process_association(record, association)
      end
    end

    # Loads the related records of one association and collects each of them.
    # Errors are reported on stdout and do not abort the collection.
    def process_association(record, association)
      related_records = fetch_related_records(record, association)
      count = related_records.length
      puts "    Found #{count} related #{association.name} records"
      @collection_stats[record.class.name][:associations][association.name] += count

      related_records.each { |related_record| collect_record(related_record) }
    rescue => e
      puts "    Error processing association #{association.name}: #{e.message}"
    end

    # Reads the association and returns its records as an Array.
    #
    # @return [Array<Object>] empty for unsupported association macros
    def fetch_related_records(record, association)
      case association.macro
      when :belongs_to, :has_one
        Array(record.public_send(association.name)).compact
      when :has_many, :has_and_belongs_to_many
        relation = record.public_send(association.name)

        # Apply configured limit if it exists
        limit = RealDataTests.configuration.get_association_limit(record.class, association.name)
        if limit
          puts "    Applying configured limit of #{limit} records for #{record.class.name}.#{association.name}"
          relation = relation.limit(limit)
        end

        relation.to_a
      else
        []
      end
    end

    # Prints per-model record and association counts.
    def print_collection_stats
      puts "\n=== Collection Statistics ==="
      @collection_stats.each do |model, stats|
        puts "\n#{model}:"
        puts "  Total records: #{stats[:count]}"
        next unless stats[:associations].any?

        puts "  Associations:"
        stats[:associations].each do |assoc_name, count|
          puts "    #{assoc_name}: #{count} records"
        end
      end
      puts "\nTotal unique records collected: #{@collected_records.size}"
      puts "==============================\n"
    end
  end
end
@@ -0,0 +1,41 @@
1
module RealDataTests
  # Spec helper that loads a previously generated SQL dump into the database
  # described by the ActiveRecord configuration.
  module RSpecHelper
    # Loads "<dump_path>/<name>.sql" with the psql command-line client.
    #
    # The dump is executed by a separate psql process, i.e. on its own database
    # session. Foreign-key triggers are therefore disabled for *that* session
    # (through PGOPTIONS); the previous `SET session_replication_role` on the
    # ActiveRecord connection never reached the session doing the inserts.
    # psql runs with ON_ERROR_STOP so that a failing statement yields a
    # non-zero exit status instead of being silently ignored.
    #
    # @param name [String, Symbol] dump file name without the .sql extension
    # @raise [RealDataTests::Error] if the file is missing or psql fails
    # @return [void]
    def load_real_test_data(name)
      dump_path = File.join(RealDataTests.configuration.dump_path, "#{name}.sql")
      raise Error, "Test data file not found: #{dump_path}" unless File.exist?(dump_path)

      # Argument-list form of Kernel#system: no shell is involved, so paths and
      # configuration values need no escaping.
      loaded = system(psql_environment, 'psql', *psql_arguments, '-v', 'ON_ERROR_STOP=1', '-f', dump_path)
      raise Error, "Failed to load test data: #{dump_path}" unless loaded
    end

    private

    # Connection flags as a single space-separated string.
    def connection_options
      psql_arguments.join(' ')
    end

    # Connection flags as an argument list.
    def psql_arguments
      config = database_config

      arguments = []
      arguments.push('-h', config[:host].to_s) if config[:host]
      arguments.push('-p', config[:port].to_s) if config[:port]
      arguments.push('-U', config[:username].to_s) if config[:username]
      arguments.push('-d', config[:database].to_s)
      arguments << '-q' # Run quietly
    end

    # Environment for the psql process: disables foreign-key triggers for its
    # session and passes the configured password, if any.
    def psql_environment
      config = database_config
      pg_options = [ENV['PGOPTIONS'], '-c session_replication_role=replica'].compact.join(' ')

      environment = { 'PGOPTIONS' => pg_options }
      environment['PGPASSWORD'] = config[:password].to_s if config[:password]
      environment
    end

    # Database configuration hash, read through whichever of the two
    # ActiveRecord::Base accessors is available.
    def database_config
      if ActiveRecord::Base.respond_to?(:connection_db_config)
        ActiveRecord::Base.connection_db_config.configuration_hash
      else
        ActiveRecord::Base.connection_config
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
module RealDataTests
  # Collects the records reachable from one record, optionally anonymizes
  # them, and writes the generated SQL to a dump file.
  class TestDataBuilder
    # @param record [Object] record the collection starts from
    # @param name [String, nil] dump file name without extension; when omitted
    #   it is derived from the record's class name and id
    def initialize(record, name: nil)
      @record = record
      @name = name || "#{record.class.name.underscore}_#{record.id}"
    end

    # Writes the dump file, creating its directory when necessary.
    #
    # @return [String] path of the written file
    def create_dump_file
      collected = RealDataTests::RecordCollector.new(@record).collect
      anonymize(collected)
      sql = RealDataTests::PgDumpGenerator.new(collected).generate

      dump_file_path.tap do |target|
        FileUtils.mkdir_p(File.dirname(target))
        File.write(target, sql)
        puts "\nDump file created at: #{target}"
      end
    end

    private

    # Hands the records to the anonymizer, but only when rules are configured.
    def anonymize(records)
      settings = RealDataTests.configuration
      return unless settings.anonymization_rules.any?

      puts "\nAnonymizing records..."
      RealDataTests::DataAnonymizer.new(RealDataTests.configuration).anonymize_records(records)
    end

    # Location of the dump file inside the configured dump directory.
    def dump_file_path
      File.join(RealDataTests.configuration.dump_path, "#{@name}.sql")
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module RealDataTests
  # Version string of the gem.
  VERSION = '0.1.0'
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'real_data_tests/configuration'
4
+ require 'real_data_tests/data_anonymizer'
5
+ require 'real_data_tests/engine' if defined?(Rails)
6
+ require 'real_data_tests/pg_dump_generator'
7
+ require 'real_data_tests/record_collector'
8
+ require 'real_data_tests/rspec_helper'
9
+ require 'real_data_tests/test_data_builder'
10
+ require 'real_data_tests/version'
11
+
12
module RealDataTests
  # Base class for every error raised by this library.
  class Error < StandardError
  end

  # Raised when a dump is requested before a configuration exists.
  class ConfigurationError < Error
  end

  # Raised when building a dump file fails; the message carries the cause's.
  class DumpFileError < Error
  end

  # Current configuration, created on first access.
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Yields the configuration to the optional block.
  #
  # @return [Configuration] the current configuration
  def self.configure
    configuration.tap { |settings| yield(settings) if block_given? }
  end

  # Replaces the configuration with a fresh instance.
  def self.reset_configuration!
    @configuration = Configuration.new
  end

  # Builds a dump file for +record+ through TestDataBuilder.
  #
  # @param record [Object] record the collection starts from
  # @param name [String, nil] dump file name without extension
  # @return [Object] whatever TestDataBuilder#create_dump_file returns
  # @raise [ConfigurationError] when no configuration has been created yet
  # @raise [DumpFileError] when TestDataBuilder raises a StandardError
  def self.create_dump_file(record, name: nil)
    # Checked outside the rescue below so it is not re-wrapped as DumpFileError.
    @configuration || raise(ConfigurationError, "Configuration not initialized")

    builder = TestDataBuilder.new(record, name: name)
    builder.create_dump_file
  rescue ConfigurationError
    raise
  rescue => e
    raise DumpFileError, "Failed to create dump file: #{e.message}"
  end

  # Directory two levels above this file.
  def self.root
    File.dirname(File.dirname(File.expand_path(__FILE__)))
  end

  # Environment name from RAILS_ENV, then RACK_ENV, defaulting to
  # 'development'; memoized after the first call.
  def self.env
    @env ||= ENV.fetch('RAILS_ENV') { ENV.fetch('RACK_ENV', 'development') }
  end
end
@@ -0,0 +1,4 @@
1
# Type signatures for the RealDataTests namespace.
module RealDataTests
  # Version of the gem.
  VERSION: String

  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
end