real_data_tests 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,304 @@
1
+ require 'csv'
2
+ require 'tmpdir'
3
+ require 'fileutils'
4
+ require 'json'
5
+ require 'set'
6
+
7
module RealDataTests
  # Renders a set of ActiveRecord-style records as a PostgreSQL script made of
  # one INSERT statement per record. Records are ordered so that rows reached
  # through non-polymorphic belongs_to associations come before the rows that
  # reference them.
  class PgDumpGenerator
    # @param records [Array] model instances to serialize; each must expose
    #   `class.table_name`, `class.column_names`, `class.columns_hash`,
    #   `class.reflect_on_all_associations` and attribute access via `[]`
    def initialize(records)
      @records = records
    end

    # @return [String] the INSERT statements, joined with newlines
    # @raise [RuntimeError] when two or more models depend on each other
    def generate
      collect_inserts(sort_by_dependencies(@records)).join("\n")
    end

    private

    # Orders records so that every model appears after the models it depends on.
    def sort_by_dependencies(records)
      tables_with_records = records.group_by(&:class)
      dependencies = build_dependency_graph(tables_with_records.keys)
      topological_sort(dependencies).flat_map { |model| tables_with_records[model] || [] }
    end

    # Maps each model to the models (among +models+) that must be inserted first.
    def build_dependency_graph(models)
      models.each_with_object({}) do |model, deps|
        # Only belongs_to associations carry the foreign key on this model's
        # table; polymorphic ones have no single target class and are skipped.
        direct_dependencies = model.reflect_on_all_associations(:belongs_to)
                                   .reject(&:polymorphic?)
                                   .map(&:klass)

        # Not every reflection object exposes a join model, so ask first
        # instead of raising NoMethodError on models that declare HABTM.
        habtm_dependencies = model.reflect_on_all_associations(:has_and_belongs_to_many)
                                  .select { |assoc| assoc.respond_to?(:join_table_model) }
                                  .map(&:join_table_model)
                                  .compact

        # A model referencing itself (e.g. a parent association) imposes no
        # ordering between tables and must not be reported as a cycle.
        deps[model] = (direct_dependencies + habtm_dependencies)
                      .select { |dependency| dependency != model && models.include?(dependency) }
                      .uniq
      end
    end

    # Depth-first topological sort; dependencies come before dependents.
    def topological_sort(dependencies)
      sorted = []
      visited = Set.new
      temporary = Set.new

      dependencies.each_key do |model|
        visit_model(model, dependencies, sorted, visited, temporary) unless visited.include?(model)
      end

      sorted
    end

    # Visits +model+ and, recursively, everything it depends on.
    # +temporary+ holds the models on the current DFS path; meeting one of
    # them again means the graph contains a cycle.
    def visit_model(model, dependencies, sorted, visited, temporary)
      return if visited.include?(model)

      if temporary.include?(model)
        cycle = detect_cycle(model, dependencies, temporary)
        raise "Circular dependency detected: #{cycle.map(&:name).join(' -> ')}"
      end

      temporary.add(model)

      (dependencies[model] || []).each do |dependency|
        visit_model(dependency, dependencies, sorted, visited, temporary) unless visited.include?(dependency)
      end

      temporary.delete(model)
      visited.add(model)
      sorted << model
    end

    # Reconstructs a readable path for the cycle that starts at +start_model+
    # by following dependencies that are still on the current DFS path.
    def detect_cycle(start_model, dependencies, temporary)
      cycle = [start_model]
      current = dependencies[start_model]&.find { |dep| temporary.include?(dep) }

      while current && current != start_model
        cycle << current
        current = dependencies[current]&.find { |dep| temporary.include?(dep) }
      end

      cycle << start_model if current == start_model
      cycle
    end

    # Builds one INSERT statement per record.
    def collect_inserts(records)
      records.map do |record|
        model = record.class
        columns = model.column_names
        enums = model.respond_to?(:defined_enums) ? model.defined_enums : {}

        values = columns.map do |column|
          if enums.key?(column)
            # Enum attributes are written with their stored (pre-cast) value.
            format_enum_value(record.read_attribute_before_type_cast(column))
          else
            quote_value(record[column], get_column_info(model, column))
          end
        end

        <<~SQL.strip
          INSERT INTO #{model.table_name}
          (#{columns.join(', ')})
          VALUES (#{values.join(', ')})
          #{conflict_clause(model)};
        SQL
      end
    end

    # ON CONFLICT clause targeting the model's primary key. Falls back to a
    # target-less clause when the model reports no primary key, and to `id`
    # when the model cannot report one at all.
    def conflict_clause(model)
      key_columns = model.respond_to?(:primary_key) ? Array(model.primary_key) : ['id']
      return 'ON CONFLICT DO NOTHING' if key_columns.empty?

      "ON CONFLICT (#{key_columns.join(', ')}) DO NOTHING"
    end

    # Numeric enum values are emitted bare; anything else (string-backed
    # enums) is quoted so the statement stays valid SQL.
    def format_enum_value(raw_value)
      return 'NULL' if raw_value.nil?

      text = raw_value.to_s
      return text if raw_value.is_a?(Numeric) || text.match?(/\A-?\d+\z/)

      sanitize_string(text)
    end

    # Extracts the column metadata that drives value formatting.
    def get_column_info(model, column_name)
      column = model.columns_hash[column_name]
      {
        type: column.type,
        sql_type: column.sql_type,
        array: column.respond_to?(:array) && column.array
      }
    end

    # Converts an attribute value into its SQL literal.
    def quote_value(value, column_info)
      return 'NULL' if value.nil?

      # Array columns are checked first: their element type (:integer,
      # :boolean, ...) would otherwise match a scalar branch below and the
      # value would be emitted through Array#to_s.
      if column_info[:array] || column_info[:type] == :array
        return parse_and_format_array(value, column_info[:sql_type])
      end

      case column_info[:type]
      when :integer, :decimal, :float
        # NaN / Infinity are only valid as quoted literals.
        finite_number?(value) ? value.to_s : sanitize_string(value.to_s)
      when :boolean
        value.to_s
      when :json, :jsonb
        parse_and_format_special_type(value, column_info)
      else
        sanitize_string(value.to_s)
      end
    end

    # True unless the value reports itself as non-finite.
    def finite_number?(value)
      !value.respond_to?(:finite?) || value.finite?
    end

    # Formats array and JSON/JSONB values.
    def parse_and_format_special_type(value, column_info)
      if column_info[:array] || column_info[:type] == :array
        parse_and_format_array(value, column_info[:sql_type])
      else
        json_value = value.is_a?(String) ? value : value.to_json
        sanitize_string(json_value)
      end
    end

    # Formats an array value as `ARRAY[...]::type[]` (or `'{}'::type[]` when
    # empty), deriving the element type from +sql_type+.
    def parse_and_format_array(value, sql_type)
      array_type = "#{extract_base_type(sql_type)}[]"
      return "'{}'::#{array_type}" if empty_array_value?(value)

      "ARRAY[#{format_array_elements(parse_array_value(value)).join(',')}]::#{array_type}"
    end

    # Same as #parse_and_format_array, except that columns whose type is
    # :string are always cast to `character varying[]`.
    def format_array(value, column_info)
      array_type = if column_info[:type] == :string
                     'character varying[]'
                   else
                     "#{extract_base_type(column_info[:sql_type])}[]"
                   end
      return "'{}'::#{array_type}" if empty_array_value?(value)

      "ARRAY[#{format_array_elements(parse_array_value(value)).join(',')}]::#{array_type}"
    end

    # True for nil, empty arrays and the textual forms of an empty array.
    def empty_array_value?(value)
      value.nil? || value == '[]' || value == '{}' || (value.is_a?(Array) && value.empty?)
    end

    # Normalizes +value+ to a Ruby Array. Strings are parsed as JSON first and,
    # failing that, as a brace-delimited, comma-separated list.
    def parse_array_value(value)
      case value
      when Array
        value
      when String
        begin
          parsed = JSON.parse(value)
          # A JSON scalar is treated as a one-element array.
          parsed.is_a?(Array) ? parsed : [parsed]
        rescue JSON::ParserError
          value.gsub(/[{}"]/, '').split(',')
        end
      else
        [value]
      end
    end

    # Converts each array element into its SQL literal.
    def format_array_elements(array_value)
      array_value.map do |element|
        case element
        when Numeric then element.to_s
        when nil then 'NULL'
        else sanitize_string(element.to_s)
        end
      end
    end

    # Maps a column's SQL type to the element type used in array casts.
    # Unknown types are returned with any trailing `[]` removed.
    def extract_base_type(sql_type)
      case sql_type
      when /character varying\[\]/i, /varchar\[\]/i
        'character varying'
      when /text\[\]/i
        'text'
      when /integer\[\]/i
        'integer'
      when /bigint\[\]/i
        'bigint'
      when /jsonb\[\]/i
        'jsonb'
      when /json\[\]/i
        'json'
      else
        sql_type.sub(/\[\]$/, '')
      end
    end

    # Wraps +str+ in single quotes, doubling embedded single quotes.
    def sanitize_string(str)
      "'#{str.gsub("'", "''")}'"
    end

    # Command-line connection flags derived from the ActiveRecord database
    # configuration (host, port, user, database) plus the quiet flag.
    def connection_options
      config = if ActiveRecord::Base.respond_to?(:connection_db_config)
                 ActiveRecord::Base.connection_db_config.configuration_hash
               else
                 ActiveRecord::Base.connection_config
               end

      options = []
      options << "-h #{config[:host]}" if config[:host]
      options << "-p #{config[:port]}" if config[:port]
      options << "-U #{config[:username]}" if config[:username]
      options << "-d #{config[:database]}"
      options << "-q" # Run quietly
      options.join(" ")
    end
  end
end
@@ -0,0 +1,117 @@
1
module RealDataTests
  # Walks the association graph starting from a single record and gathers
  # every reachable record, honouring the association filters, reciprocal
  # prevention and per-association limits exposed by
  # RealDataTests.configuration. Progress is reported on standard output.
  class RecordCollector
    # @param record [Object] the record the traversal starts from
    def initialize(record)
      @record = record
      @collected_records = Set.new
      @collection_stats = Hash.new { |h, k| h[k] = { count: 0, associations: Hash.new(0) } }
      @processed_associations = Set.new
      @association_path = []
    end

    # Runs the traversal.
    #
    # @return [Array] the unique records collected, in discovery order
    def collect
      puts "\nStarting record collection from: #{@record.class.name}##{@record.id}"
      filter_mode = RealDataTests.configuration.association_filter_mode
      # Array() keeps the summary line working when no list is configured.
      filter_list = Array(RealDataTests.configuration.association_filter_list)
      puts "Using #{filter_mode || 'no'} filter with #{filter_list.any? ? filter_list.join(', ') : 'no associations'}"
      collect_record(@record)
      print_collection_stats
      @collected_records.to_a
    end

    private

    # Decides whether +association+ should be followed from +record+.
    # An association is followed at most once per record, must pass the
    # configured filter, and must not be a prevented reciprocal association.
    def should_process_association?(record, association)
      config = RealDataTests.configuration
      association_key = "#{record.class.name}##{record.id}:#{association.name}"
      # Set#add? returns nil when the key was already present.
      return false unless @processed_associations.add?(association_key)

      unless config.should_process_association?(record, association.name)
        skipped_as = config.association_filter_mode == :whitelist ? 'non-whitelisted' : 'blacklisted'
        puts "  Skipping #{skipped_as} association: #{association.name} for #{record.class.name}"
        return false
      end

      # Guarded so configurations that do not implement reciprocal
      # prevention keep working.
      if config.respond_to?(:prevent_reciprocal?) && config.prevent_reciprocal?(record.class, association.name)
        puts "  Skipping prevented reciprocal association: #{association.name} on #{record.class.name}"
        return false
      end

      true
    end

    # Adds +record+ to the result (once) and recurses into its associations.
    def collect_record(record)
      return unless record # Guard against nil records
      return if @collected_records.include?(record)

      @collected_records.add(record)
      @collection_stats[record.class.name][:count] += 1
      collect_associations(record)
    end

    # Iterates over every association declared on the record's class.
    def collect_associations(record)
      return unless record.class.respond_to?(:reflect_on_all_associations)

      associations = record.class.reflect_on_all_associations
      puts "\nProcessing associations for: #{record.class.name}##{record.id}"
      puts "Found #{associations.length} associations"

      associations.each do |association|
        # Route through the private predicate so dedupe and reciprocal
        # prevention are applied, not only the configured filter.
        next unless should_process_association?(record, association)

        puts "  Processing #{association.macro} association: #{association.name}"
        process_association(record, association)
      end
    end

    # Loads the records behind +association+ and collects each of them.
    # Failures are reported and skipped so one broken association does not
    # abort the whole collection.
    def process_association(record, association)
      related_records = fetch_related_records(record, association)
      count = related_records.length
      puts "    Found #{count} related #{association.name} records"
      @collection_stats[record.class.name][:associations][association.name] += count

      related_records.each { |related_record| collect_record(related_record) }
    rescue => e
      puts "    Error processing association #{association.name}: #{e.message}"
    end

    # Returns the related records as an Array; unknown macros yield [].
    def fetch_related_records(record, association)
      case association.macro
      when :belongs_to, :has_one
        Array(record.public_send(association.name)).compact
      when :has_many, :has_and_belongs_to_many
        relation = record.public_send(association.name)

        # Apply configured limit if it exists
        limit = RealDataTests.configuration.get_association_limit(record.class, association.name)
        if limit
          puts "    Applying configured limit of #{limit} records for #{record.class.name}.#{association.name}"
          relation = relation.limit(limit)
        end

        relation.loaded? ? relation.to_a : relation.load.to_a
      else
        []
      end
    end

    # Prints per-model record and association counts.
    def print_collection_stats
      puts "\n=== Collection Statistics ==="
      @collection_stats.each do |model, stats|
        puts "\n#{model}:"
        puts "  Total records: #{stats[:count]}"
        next unless stats[:associations].any?

        puts "  Associations:"
        stats[:associations].each do |assoc_name, count|
          puts "    #{assoc_name}: #{count} records"
        end
      end
      puts "\nTotal unique records collected: #{@collected_records.size}"
      puts "==============================\n"
    end
  end
end
@@ -0,0 +1,41 @@
1
module RealDataTests
  # Spec helper that loads a previously generated SQL dump into the database
  # described by the ActiveRecord connection configuration.
  module RSpecHelper
    # Loads `<dump_path>/<name>.sql` with the `psql` command-line client.
    #
    # Foreign key enforcement is relaxed with `session_replication_role`
    # inside the psql session itself: psql opens its own connection, so
    # issuing the SET on the ActiveRecord connection would not affect the
    # load. ON_ERROR_STOP makes psql exit non-zero on the first SQL error so
    # a broken dump is reported instead of being partially applied silently.
    # The command is passed to `system` as an argument list, so paths and
    # configuration values are never interpreted by a shell.
    #
    # @param name [String] dump file name without the `.sql` extension
    # @raise [RealDataTests::Error] when the file is missing or psql fails
    # @return [nil]
    def load_real_test_data(name)
      dump_path = File.join(RealDataTests.configuration.dump_path, "#{name}.sql")
      raise Error, "Test data file not found: #{dump_path}" unless File.exist?(dump_path)

      loaded = system(
        'psql', *connection_arguments,
        '-v', 'ON_ERROR_STOP=1',
        '-c', 'SET session_replication_role = replica;',
        '-f', dump_path
      )
      raise Error, "Failed to load test data: #{dump_path}" unless loaded

      nil
    end

    private

    # Connection flags as a single space-separated string.
    def connection_options
      connection_arguments.join(" ")
    end

    # Connection flags as an argument list: host, port and user when
    # configured, then the database and the quiet flag.
    def connection_arguments
      config = if ActiveRecord::Base.respond_to?(:connection_db_config)
                 ActiveRecord::Base.connection_db_config.configuration_hash
               else
                 ActiveRecord::Base.connection_config
               end

      arguments = []
      arguments.push('-h', config[:host].to_s) if config[:host]
      arguments.push('-p', config[:port].to_s) if config[:port]
      arguments.push('-U', config[:username].to_s) if config[:username]
      arguments.push('-d', config[:database].to_s)
      arguments << '-q'
      arguments
    end
  end
end
@@ -0,0 +1,34 @@
1
module RealDataTests
  # Produces a SQL dump file for a record and everything reachable from it:
  # collects the records, anonymizes them when rules are configured, renders
  # the SQL and writes it below the configured dump path.
  class TestDataBuilder
    # @param record [Object] the record the dump is built around
    # @param name [String, nil] dump file base name; when omitted it is
    #   derived from the record's underscored class name and its id
    def initialize(record, name: nil)
      @record = record
      @name = name || "#{record.class.name.underscore}_#{record.id}"
    end

    # Builds the dump and writes it to disk, creating directories as needed.
    #
    # @return [String] path of the written dump file
    def create_dump_file
      collected = RealDataTests::RecordCollector.new(@record).collect
      anonymize(collected)
      sql = RealDataTests::PgDumpGenerator.new(collected).generate

      dump_file_path.tap do |target|
        FileUtils.mkdir_p(File.dirname(target))
        File.write(target, sql)
        puts "\nDump file created at: #{target}"
      end
    end

    private

    # Runs the anonymizer over +records+, but only when rules are configured.
    def anonymize(records)
      return unless RealDataTests.configuration.anonymization_rules.any?

      puts "\nAnonymizing records..."
      RealDataTests::DataAnonymizer.new(RealDataTests.configuration).anonymize_records(records)
    end

    # Location of the dump file inside the configured dump directory.
    def dump_file_path
      File.join(RealDataTests.configuration.dump_path, "#{@name}.sql")
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RealDataTests
# Version string of the real_data_tests gem.
4
+ VERSION = "0.1.0"
5
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'real_data_tests/configuration'
4
+ require 'real_data_tests/data_anonymizer'
5
+ require 'real_data_tests/engine' if defined?(Rails)
6
+ require 'real_data_tests/pg_dump_generator'
7
+ require 'real_data_tests/record_collector'
8
+ require 'real_data_tests/rspec_helper'
9
+ require 'real_data_tests/test_data_builder'
10
+ require 'real_data_tests/version'
11
+
12
module RealDataTests
  # Base class for every error raised by this library.
  class Error < StandardError; end
  # Raised when a dump is requested before any configuration exists.
  class ConfigurationError < Error; end
  # Raised when building a dump file fails for any reason.
  class DumpFileError < Error; end

  # @return [Configuration] the shared configuration, created on first use
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Yields the shared configuration to the block, if one is given.
  #
  # @return [Configuration] the shared configuration
  def self.configure
    yield configuration if block_given?
    configuration
  end

  # Replaces the shared configuration with a fresh instance.
  #
  # @return [Configuration] the new configuration
  def self.reset_configuration!
    @configuration = Configuration.new
  end

  # Builds a dump file for +record+ through TestDataBuilder.
  #
  # @param record [Object] record the dump is built around
  # @param name [String, nil] optional dump file base name
  # @return [Object] whatever TestDataBuilder#create_dump_file returns
  # @raise [ConfigurationError] when no configuration has been created yet
  # @raise [DumpFileError] when the builder raises a StandardError
  def self.create_dump_file(record, name: nil)
    raise ConfigurationError, "Configuration not initialized" unless @configuration

    begin
      builder = TestDataBuilder.new(record, name: name)
      builder.create_dump_file
    rescue => error
      raise DumpFileError, "Failed to create dump file: #{error.message}"
    end
  end

  # @return [String] the directory two levels above this file
  def self.root
    File.expand_path('../..', __FILE__)
  end

  # @return [String] RAILS_ENV, else RACK_ENV, else "development"; memoized
  def self.env
    @env ||= ENV.fetch('RAILS_ENV') { ENV.fetch('RACK_ENV', 'development') }
  end
end
@@ -0,0 +1,4 @@
1
+ module RealDataTests
# Type signature: VERSION is declared as a String constant.
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end