activerecord-graph-extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,421 @@
+ # frozen_string_literal: true
+
+ module ActiveRecordGraphExtractor
+   class DryRunAnalyzer
+     attr_reader :config, :relationship_analyzer
+
+     def initialize(config = ActiveRecordGraphExtractor.configuration)
+       @config = config
+       @relationship_analyzer = RelationshipAnalyzer.new(config)
+     end
+
+     def analyze(root_objects, options = {})
+       raise ExtractionError, "Root object cannot be nil" if root_objects.nil?
+
+       root_objects = Array(root_objects)
+
+       # Validate that all objects are ActiveRecord instances
+       root_objects.each do |obj|
+         unless obj.is_a?(ActiveRecord::Base)
+           raise ExtractionError, "Object must be an ActiveRecord object, got #{obj.class}"
+         end
+       end
+
+       # Extract options
+       max_depth = options[:max_depth] || config.max_depth
+
+       start_time = Time.now
+
+       begin
+         # Analyze the object graph without loading data
+         analysis_result = analyze_object_graph(root_objects, max_depth)
+
+         analysis_time = Time.now - start_time
+
+         # Build comprehensive analysis report
+         build_analysis_report(analysis_result, analysis_time, root_objects, max_depth)
+       rescue StandardError => e
+         raise ExtractionError, "Failed to analyze object graph: #{e.message}"
+       end
+     end
+
+     private
+
+     def analyze_object_graph(root_objects, max_depth)
+       visited_models = Set.new
+       model_counts = Hash.new(0)
+       relationship_map = {}
+       circular_references = []
+       depth_analysis = {}
+
+       root_objects.each do |root_object|
+         analyze_relationships_recursively(
+           root_object.class,
+           visited_models,
+           model_counts,
+           relationship_map,
+           circular_references,
+           depth_analysis,
+           1,
+           max_depth,
+           [root_object.class.name]
+         )
+
+         # Count the root object itself
+         model_counts[root_object.class.name] += 1
+       end
+
+       {
+         visited_models: visited_models,
+         model_counts: model_counts,
+         relationship_map: relationship_map,
+         circular_references: circular_references,
+         depth_analysis: depth_analysis
+       }
+     end
+
+     def analyze_relationships_recursively(model_class, visited_models, model_counts, relationship_map, circular_references, depth_analysis, current_depth, max_depth, path)
+       return if current_depth > max_depth
+
+       model_name = model_class.name
+       visited_models << model_name
+
+       # Track depth analysis
+       depth_analysis[current_depth] ||= Set.new
+       depth_analysis[current_depth] << model_name
+
+       # Get relationships for this model
+       relationships = relationship_analyzer.analyze_model(model_class)
+       relationship_map[model_name] = relationships
+
+       relationships.each do |relationship_name, relationship_info|
+         next unless config.relationship_included?(relationship_name)
+         next unless config.model_included?(relationship_info['model_name'])
+
+         related_model_name = relationship_info['model_name']
+
+         # Check for circular references
+         if path.include?(related_model_name)
+           circular_references << {
+             path: path + [related_model_name],
+             relationship: relationship_name,
+             depth: current_depth
+           }
+           next if config.handle_circular_references
+         end
+
+         begin
+           related_model_class = related_model_name.constantize
+
+           # Estimate record count for this relationship
+           estimated_count = estimate_relationship_count(model_class, relationship_name, relationship_info)
+           model_counts[related_model_name] += estimated_count
+
+           # Recursively analyze deeper relationships
+           analyze_relationships_recursively(
+             related_model_class,
+             visited_models,
+             model_counts,
+             relationship_map,
+             circular_references,
+             depth_analysis,
+             current_depth + 1,
+             max_depth,
+             path + [related_model_name]
+           )
+         rescue NameError
+           # Skip models that don't exist
+           next
+         rescue StandardError
+           # Skip relationships that cannot be analyzed and continue
+           next
+         end
+       end
+     end
+
+     def estimate_relationship_count(model_class, relationship_name, relationship_info)
+       # Try to get a sample record to estimate relationship sizes
+       sample_record = model_class.first
+       return 0 unless sample_record
+
+       begin
+         case relationship_info['type']
+         when 'has_many', 'has_and_belongs_to_many'
+           # For has_many relationships, estimate based on sample
+           related_records = sample_record.public_send(relationship_name)
+           if related_records.respond_to?(:count)
+             sample_count = related_records.limit(100).count
+             # Estimate total based on sample (with some reasonable assumptions)
+             total_records = model_class.count
+             return 0 if total_records == 0
+
+             # Use sample count as average, but cap at reasonable limits
+             average_per_record = [sample_count, 50].min # Cap at 50 per record for estimation
+             return (total_records * average_per_record * 0.8).to_i # 80% factor for estimation
+           end
+         when 'has_one', 'belongs_to'
+           # For singular relationships, estimate 1 per parent record
+           total_records = model_class.count
+           return (total_records * 0.9).to_i # 90% factor assuming some records might not have the relationship
+         end
+       rescue StandardError
+         # If we can't estimate, return a conservative estimate
+         return model_class.count > 0 ? [model_class.count / 10, 1].max : 0
+       end
+
+       0
+     end
+
+     def estimate_file_size(model_counts, relationship_map)
+       total_size = 0
+
+       model_counts.each do |model_name, count|
+         next if count == 0
+
+         begin
+           model_class = model_name.constantize
+
+           # Estimate size per record based on column types and relationships
+           size_per_record = estimate_record_size(model_class, relationship_map[model_name] || {})
+           total_size += count * size_per_record
+         rescue NameError
+           # Use default size if model doesn't exist
+           total_size += count * 500 # 500 bytes default
+         end
+       end
+
+       # Add JSON structure overhead (metadata, formatting, etc.)
+       metadata_overhead = 2048 # 2KB for metadata
+       json_formatting_overhead = total_size * 0.1 # 10% for JSON formatting
+
+       (total_size + metadata_overhead + json_formatting_overhead).to_i
+     end
+
+     def estimate_record_size(model_class, relationships)
+       base_size = 0
+
+       # Estimate size based on column types
+       model_class.columns.each do |column|
+         base_size += case column.type
+                      when :string, :text
+                        column.limit || 255
+                      when :integer, :bigint
+                        8
+                      when :decimal, :float
+                        16
+                      when :datetime, :timestamp
+                        25
+                      when :date
+                        12
+                      when :boolean
+                        5
+                      when :json, :jsonb
+                        500 # Estimate for JSON fields
+                      else
+                        50 # Default for unknown types
+                      end
+       end
+
+       # Add overhead for JSON structure and field names
+       field_name_overhead = model_class.columns.size * 20 # Average field name length
+       json_structure_overhead = 50 # Brackets, commas, etc.
+
+       base_size + field_name_overhead + json_structure_overhead
+     end
+
+     def build_analysis_report(analysis_result, analysis_time, root_objects, max_depth)
+       model_counts = analysis_result[:model_counts]
+       total_records = model_counts.values.sum
+       estimated_file_size = estimate_file_size(model_counts, analysis_result[:relationship_map])
+
+       {
+         'dry_run' => true,
+         'analysis_time' => analysis_time.round(3),
+         'root_objects' => {
+           'models' => root_objects.map(&:class).map(&:name).uniq,
+           'ids' => root_objects.map(&:id),
+           'count' => root_objects.size
+         },
+         'extraction_scope' => {
+           'max_depth' => max_depth,
+           'total_models' => analysis_result[:visited_models].size,
+           'total_estimated_records' => total_records,
+           'models_involved' => analysis_result[:visited_models].to_a.sort
+         },
+         'estimated_counts_by_model' => model_counts.sort_by { |_, count| -count }.to_h,
+         'estimated_file_size' => {
+           'bytes' => estimated_file_size,
+           'human_readable' => format_file_size(estimated_file_size)
+         },
+         'depth_analysis' => format_depth_analysis(analysis_result[:depth_analysis]),
+         'relationship_analysis' => {
+           'total_relationships' => analysis_result[:relationship_map].values.map(&:size).sum,
+           'circular_references' => analysis_result[:circular_references].map do |ref|
+             {
+               'path' => ref[:path].join(' -> '),
+               'relationship' => ref[:relationship],
+               'depth' => ref[:depth]
+             }
+           end,
+           'circular_references_count' => analysis_result[:circular_references].size
+         },
+         'performance_estimates' => estimate_performance(total_records, estimated_file_size),
+         'warnings' => generate_warnings(analysis_result, total_records, estimated_file_size),
+         'recommendations' => generate_recommendations(analysis_result, total_records, estimated_file_size, max_depth)
+       }
+     end
+
+     def format_depth_analysis(depth_analysis)
+       depth_analysis.transform_values(&:to_a).transform_values(&:sort)
+     end
+
+     def format_file_size(size_bytes)
+       if size_bytes < 1024
+         "#{size_bytes} B"
+       elsif size_bytes < 1024 * 1024
+         "#{(size_bytes / 1024.0).round(1)} KB"
+       elsif size_bytes < 1024 * 1024 * 1024
+         "#{(size_bytes / (1024.0 * 1024)).round(1)} MB"
+       else
+         "#{(size_bytes / (1024.0 * 1024 * 1024)).round(1)} GB"
+       end
+     end
+
+     def estimate_performance(total_records, file_size_bytes)
+       # Rough performance estimates based on typical hardware
+       records_per_second = 1000 # Conservative estimate
+       estimated_extraction_time = (total_records / records_per_second.to_f).round(1)
+
+       # Memory usage estimate (records in memory + overhead)
+       estimated_memory_mb = ((total_records * 1024) / (1024.0 * 1024)).round(1)
+
+       {
+         'estimated_extraction_time_seconds' => estimated_extraction_time,
+         'estimated_extraction_time_human' => format_duration(estimated_extraction_time),
+         'estimated_memory_usage_mb' => estimated_memory_mb,
+         'estimated_memory_usage_human' => "#{estimated_memory_mb} MB"
+       }
+     end
+
+     def format_duration(seconds)
+       if seconds < 60
+         "#{seconds.round(1)} seconds"
+       elsif seconds < 3600
+         minutes = (seconds / 60).round(1)
+         "#{minutes} minutes"
+       else
+         hours = (seconds / 3600).round(1)
+         "#{hours} hours"
+       end
+     end
+
+     def generate_warnings(analysis_result, total_records, file_size_bytes)
+       warnings = []
+
+       # Large dataset warnings
+       if total_records > 100_000
+         warnings << {
+           'type' => 'large_dataset',
+           'message' => "Large dataset detected (#{total_records.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse} records). Consider using filters or reducing max_depth.",
+           'severity' => 'high'
+         }
+       elsif total_records > 10_000
+         warnings << {
+           'type' => 'medium_dataset',
+           'message' => "Medium dataset detected (#{total_records.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse} records). Monitor memory usage during extraction.",
+           'severity' => 'medium'
+         }
+       end
+
+       # Large file size warnings
+       if file_size_bytes > 1024 * 1024 * 1024 # 1GB
+         warnings << {
+           'type' => 'large_file',
+           'message' => "Estimated file size is very large (#{format_file_size(file_size_bytes)}). Consider splitting the extraction.",
+           'severity' => 'high'
+         }
+       elsif file_size_bytes > 100 * 1024 * 1024 # 100MB
+         warnings << {
+           'type' => 'medium_file',
+           'message' => "Estimated file size is large (#{format_file_size(file_size_bytes)}). Ensure adequate disk space.",
+           'severity' => 'medium'
+         }
+       end
+
+       # Circular reference warnings
+       if analysis_result[:circular_references].any?
+         warnings << {
+           'type' => 'circular_references',
+           'message' => "#{analysis_result[:circular_references].size} circular reference(s) detected. Enable handle_circular_references if needed.",
+           'severity' => 'medium'
+         }
+       end
+
+       # Deep nesting warnings
+       max_actual_depth = analysis_result[:depth_analysis].keys.max || 0
+       if max_actual_depth > 5
+         warnings << {
+           'type' => 'deep_nesting',
+           'message' => "Deep relationship nesting detected (#{max_actual_depth} levels). This may impact performance.",
+           'severity' => 'medium'
+         }
+       end
+
+       warnings
+     end
+
+     def generate_recommendations(analysis_result, total_records, file_size_bytes, max_depth)
+       recommendations = []
+
+       # Performance recommendations
+       if total_records > 50_000
+         recommendations << {
+           'type' => 'performance',
+           'message' => 'Consider using batch processing or streaming for large datasets',
+           'action' => 'Use extract_in_batches or enable streaming mode'
+         }
+       end
+
+       # Depth recommendations
+       if max_depth > 3 && analysis_result[:depth_analysis].keys.max.to_i > 3
+         recommendations << {
+           'type' => 'depth',
+           'message' => 'Consider reducing max_depth to improve performance',
+           'action' => "Try max_depth: #{[max_depth - 1, 2].max}"
+         }
+       end
+
+       # Model filtering recommendations
+       large_models = analysis_result[:model_counts].select { |_, count| count > total_records * 0.3 }
+       if large_models.any?
+         model_names = large_models.keys.join(', ')
+         recommendations << {
+           'type' => 'filtering',
+           'message' => "Large model(s) detected: #{model_names}",
+           'action' => 'Consider excluding these models or using custom filters'
+         }
+       end
+
+       # S3 recommendations
+       if file_size_bytes > 50 * 1024 * 1024 # 50MB
+         recommendations << {
+           'type' => 's3',
+           'message' => 'Large file detected - consider uploading directly to S3',
+           'action' => 'Use extract_to_s3 or extract_and_upload_to_s3 methods'
+         }
+       end
+
+       # Memory recommendations
+       estimated_memory_mb = ((total_records * 1024) / (1024.0 * 1024)).round(1)
+       if estimated_memory_mb > 1000 # 1GB
+         recommendations << {
+           'type' => 'memory',
+           'message' => 'High memory usage expected',
+           'action' => 'Ensure adequate RAM or use streaming extraction'
+         }
+       end
+
+       recommendations
+     end
+   end
+ end
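
The dry-run report above is plain data, so it can be inspected before committing to a full extraction. A minimal usage sketch, assuming a hypothetical Order model (the dry_run entry point is defined on Extractor in the last hunk below):

    # Sketch only: Order is a hypothetical application model;
    # any ActiveRecord::Base instance works as a root object.
    order  = Order.find(123)
    report = ActiveRecordGraphExtractor::Extractor.new.dry_run(order, max_depth: 3)

    report['extraction_scope']['total_estimated_records']  # estimated record count
    report['estimated_file_size']['human_readable']        # e.g. "2.3 MB"
    report['warnings'].map { |w| w['message'] }             # large-dataset warnings, etc.

Note that the counts are heuristics: estimate_relationship_count samples a single record per model and extrapolates, so skewed data distributions can over- or under-estimate.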
@@ -0,0 +1,33 @@
+ # frozen_string_literal: true
+
+ module ActiveRecordGraphExtractor
+   # Base error class for all ActiveRecordGraphExtractor errors
+   class Error < StandardError; end
+
+   # Raised when extraction fails
+   class ExtractionError < Error; end
+
+   # Raised when import fails
+   class ImportError < Error; end
+
+   # Raised when serialization/deserialization fails
+   class SerializationError < Error; end
+
+   # Raised when circular dependencies are detected
+   class CircularDependencyError < Error; end
+
+   # Raised when record data is invalid
+   class InvalidRecordError < Error; end
+
+   class ConfigurationError < Error; end
+
+   class ValidationError < ImportError; end
+
+   class DependencyError < ImportError; end
+
+   class FileError < Error; end
+
+   class JSONError < Error; end
+
+   class S3Error < Error; end
+ end
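
Since every class above ultimately inherits from ActiveRecordGraphExtractor::Error (with ValidationError and DependencyError nested under ImportError), callers can rescue at whatever granularity fits. A short sketch, where extractor and order stand in for an Extractor instance and a root record:

    begin
      extractor.extract_to_file(order, 'export.json')
    rescue ActiveRecordGraphExtractor::FileError => e
      warn "Could not write export: #{e.message}"
    rescue ActiveRecordGraphExtractor::Error => e
      warn "Export failed: #{e.message}"  # any other library error
    end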
@@ -0,0 +1,182 @@
+ # frozen_string_literal: true
+
+ require 'json'
+ require 'tempfile'
+
+ module ActiveRecordGraphExtractor
+   class Extractor
+     attr_reader :config, :relationship_analyzer
+
+     def initialize(config = ActiveRecordGraphExtractor.configuration)
+       @config = config
+       @relationship_analyzer = RelationshipAnalyzer.new(config)
+     end
+
+     def extract(root_objects, options = {})
+       raise ExtractionError, "Root object cannot be nil" if root_objects.nil?
+
+       root_objects = Array(root_objects)
+
+       # Validate that all objects are ActiveRecord instances
+       root_objects.each do |obj|
+         unless obj.is_a?(ActiveRecord::Base)
+           raise ExtractionError, "Object must be an ActiveRecord object, got #{obj.class}"
+         end
+       end
+
+       # Extract options
+       max_depth = options[:max_depth] || config.max_depth
+       custom_serializers = options[:custom_serializers] || {}
+
+       start_time = Time.now
+       records = []
+       visited = Set.new
+       circular_references = 0
+
+       begin
+         root_objects.each do |root_object|
+           # Add the root object itself
+           record_key = "#{root_object.class.name}_#{root_object.id}"
+           unless visited.include?(record_key)
+             records << serialize_record(root_object, custom_serializers)
+             visited << record_key
+           end
+
+           # Extract related objects
+           circular_references += extract_relationships(root_object, records, visited, 1, max_depth, custom_serializers)
+         end
+
+         extraction_time = Time.now - start_time
+         metadata = build_metadata(start_time, records, circular_references, max_depth, root_objects)
+
+         {
+           'records' => records,
+           'metadata' => metadata
+         }
+       rescue StandardError => e
+         raise ExtractionError, "Failed to extract relationships: #{e.message}"
+       end
+     end
+
+     def extract_to_file(root_objects, file_path, options = {})
+       begin
+         result = extract(root_objects, options)
+         File.write(file_path, JSON.pretty_generate(result))
+         result
+       rescue Errno::ENOENT, Errno::EACCES => e
+         raise FileError, "Cannot write to file #{file_path}: #{e.message}"
+       rescue JSON::GeneratorError => e
+         raise JSONError, "Failed to generate JSON: #{e.message}"
+       end
+     end
+
+     def extract_to_s3(root_objects, s3_client, s3_key = nil, options = {})
+       # Create a temporary file for the extraction
+       temp_file = Tempfile.new(['extraction', '.json'])
+
+       begin
+         # Extract to temporary file
+         result = extract_to_file(root_objects, temp_file.path, options)
+
+         # Upload to S3
+         upload_result = s3_client.upload_file(temp_file.path, s3_key)
+
+         # Return combined result
+         result.merge({
+           's3_upload' => upload_result
+         })
+       ensure
+         temp_file.close
+         temp_file.unlink
+       end
+     end
+
+     def extract_and_upload_to_s3(root_objects, bucket_name:, s3_key: nil, region: 'us-east-1', options: {}, **s3_options)
+       s3_client = S3Client.new(bucket_name: bucket_name, region: region, **s3_options)
+       extract_to_s3(root_objects, s3_client, s3_key, options)
+     end
+
+     def dry_run(root_objects, options = {})
+       analyzer = DryRunAnalyzer.new(config)
+       analyzer.analyze(root_objects, options)
+     end
+
+     private
+
+     def extract_relationships(record, records, visited, current_depth, max_depth, custom_serializers)
+       return 0 if current_depth > max_depth
+
+       circular_refs = 0
+       relationships = relationship_analyzer.analyze_model(record.class)
+
+       relationships.each do |relationship_name, relationship_info|
+         next unless config.relationship_included?(relationship_name)
+         next unless config.model_included?(relationship_info['model_name'])
+
+         begin
+           related_objects = record.public_send(relationship_name)
+           related_objects = Array(related_objects).compact
+
+           related_objects.each do |related_object|
+             record_key = "#{related_object.class.name}_#{related_object.id}"
+
+             if visited.include?(record_key)
+               circular_refs += 1 if config.handle_circular_references
+               next
+             end
+
+             visited << record_key
+             records << serialize_record(related_object, custom_serializers)
+
+             # Recursively extract relationships
+             circular_refs += extract_relationships(related_object, records, visited, current_depth + 1, max_depth, custom_serializers)
+           end
+         rescue ActiveRecord::StatementInvalid => e
+           # Re-raise database errors
+           raise e
+         rescue StandardError
+           # Skip this relationship on non-database errors and continue with the rest
+           next
+         end
+       end
+
+       circular_refs
+     end
+
+     def serialize_record(record, custom_serializers)
+       model_name = record.class.name
+
+       # Use custom serializer if available (check both parameter and config)
+       serializer = custom_serializers[model_name] || config.custom_serializers[model_name]
+       if serializer
+         serialized = serializer.call(record)
+         # Ensure all keys are strings for consistency
+         string_serialized = serialized.transform_keys(&:to_s)
+         return string_serialized.merge('_model' => model_name)
+       end
+
+       # Default serialization
+       attributes = record.attributes.except('updated_at', 'created_at')
+       attributes['_model'] = model_name
+       attributes
+     end
+
+     def build_metadata(start_time, records, circular_references, max_depth, root_objects)
+       end_time = Time.now
+       model_names = records.map { |r| r['_model'] }.uniq
+       root_model = root_objects.first.class.name
+       root_ids = root_objects.map(&:id)
+
+       {
+         'extraction_time' => start_time.iso8601,
+         'total_records' => records.size,
+         'models_extracted' => model_names,
+         'circular_references_detected' => circular_references > 0,
+         'max_depth_used' => max_depth,
+         'duration_seconds' => (end_time - start_time).round(3),
+         'root_model' => root_model,
+         'root_ids' => root_ids
+       }
+     end
+   end
+ end
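
Taken together, the public surface of Extractor is extract, extract_to_file, extract_to_s3, extract_and_upload_to_s3, and dry_run. A hedged end-to-end sketch (Order and the bucket name are illustrative, not part of the gem):

    extractor = ActiveRecordGraphExtractor::Extractor.new
    order = Order.find(123)  # hypothetical root record

    # Local export: returns { 'records' => [...], 'metadata' => { ... } }
    result = extractor.extract_to_file(order, 'order_123.json', max_depth: 2)
    result['metadata']['total_records']

    # Direct-to-S3 export, staged through a Tempfile as shown above
    extractor.extract_and_upload_to_s3(
      order,
      bucket_name: 'example-exports',          # illustrative bucket
      s3_key: 'extractions/order_123.json',
      options: { max_depth: 2 }
    )

Because extract walks relationships eagerly and keeps every serialized record in memory, the dry_run report's memory estimate is worth checking before exporting large graphs.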