activerecord-graph-extractor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/CHANGELOG.md +36 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +201 -0
- data/LICENSE +21 -0
- data/README.md +532 -0
- data/Rakefile +36 -0
- data/activerecord-graph-extractor.gemspec +64 -0
- data/docs/dry_run.md +410 -0
- data/docs/examples.md +239 -0
- data/docs/s3_integration.md +381 -0
- data/docs/usage.md +363 -0
- data/examples/dry_run_example.rb +227 -0
- data/examples/s3_example.rb +247 -0
- data/exe/arge +7 -0
- data/lib/activerecord_graph_extractor/cli.rb +627 -0
- data/lib/activerecord_graph_extractor/configuration.rb +98 -0
- data/lib/activerecord_graph_extractor/dependency_resolver.rb +406 -0
- data/lib/activerecord_graph_extractor/dry_run_analyzer.rb +421 -0
- data/lib/activerecord_graph_extractor/errors.rb +33 -0
- data/lib/activerecord_graph_extractor/extractor.rb +182 -0
- data/lib/activerecord_graph_extractor/importer.rb +260 -0
- data/lib/activerecord_graph_extractor/json_serializer.rb +176 -0
- data/lib/activerecord_graph_extractor/primary_key_mapper.rb +57 -0
- data/lib/activerecord_graph_extractor/progress_tracker.rb +202 -0
- data/lib/activerecord_graph_extractor/relationship_analyzer.rb +212 -0
- data/lib/activerecord_graph_extractor/s3_client.rb +170 -0
- data/lib/activerecord_graph_extractor/version.rb +5 -0
- data/lib/activerecord_graph_extractor.rb +34 -0
- data/scripts/verify_installation.rb +192 -0
- metadata +388 -0
@@ -0,0 +1,98 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ActiveRecordGraphExtractor
|
4
|
+
# Settings object for the gem, plus a lazily created class-level shared instance.
#
# Every option receives its default in #reset!. The writers for +max_depth+,
# +batch_size+ and +primary_key_strategy+ validate their argument and raise
# ArgumentError on bad input; all other options are plain accessors.
class Configuration
  # Values accepted by #primary_key_strategy=.
  PRIMARY_KEY_STRATEGIES = %i[preserve_original generate_new].freeze

  attr_accessor :progress_enabled, :stream_json, :validate_records, :use_transactions,
                :handle_circular_references, :skip_missing_models, :included_models,
                :excluded_models, :included_relationships, :excluded_relationships,
                :custom_serializers

  # Readers only: the matching writers are hand-written below because they
  # validate. Declaring them via attr_accessor as well would only produce
  # "method redefined" warnings.
  attr_reader :max_depth, :batch_size, :primary_key_strategy

  def initialize
    reset!
  end

  # Restores every option on this instance to its default value.
  # Instance variables are assigned directly, so the validating writers are
  # not involved.
  def reset!
    @max_depth = 5
    @batch_size = 1000
    @progress_enabled = true
    @stream_json = false
    @validate_records = true
    @use_transactions = true
    @handle_circular_references = true
    @skip_missing_models = true
    @included_models = []
    @excluded_models = []
    @included_relationships = []
    @excluded_relationships = []
    @custom_serializers = {}
    @primary_key_strategy = :generate_new
  end

  # @param value [Numeric] must be greater than zero
  # @raise [ArgumentError] when +value+ is not a positive number (nil included)
  def max_depth=(value)
    ensure_positive!('max_depth', value)
    @max_depth = value
  end

  # @param value [Numeric] must be greater than zero
  # @raise [ArgumentError] when +value+ is not a positive number (nil included)
  def batch_size=(value)
    ensure_positive!('batch_size', value)
    @batch_size = value
  end

  # @param strategy [Symbol] :preserve_original or :generate_new
  # @raise [ArgumentError] for any other value
  def primary_key_strategy=(strategy)
    unless PRIMARY_KEY_STRATEGIES.include?(strategy)
      raise ArgumentError, 'primary_key_strategy must be :preserve_original or :generate_new'
    end

    @primary_key_strategy = strategy
  end

  # Adds a model to the include list (stored by name, without duplicates).
  # @param model [Class, #to_s] a class (its +name+ is used) or a model name
  def include_model(model)
    model_name = name_for(model)
    @included_models << model_name unless @included_models.include?(model_name)
  end

  # Adds a model to the exclude list (stored by name, without duplicates).
  # @param model [Class, #to_s] a class (its +name+ is used) or a model name
  def exclude_model(model)
    model_name = name_for(model)
    @excluded_models << model_name unless @excluded_models.include?(model_name)
  end

  # Adds a relationship name to the include list (stored as a String, without duplicates).
  def include_relationship(relationship)
    relationship_name = relationship.to_s
    @included_relationships << relationship_name unless @included_relationships.include?(relationship_name)
  end

  # Adds a relationship name to the exclude list (stored as a String, without duplicates).
  def exclude_relationship(relationship)
    relationship_name = relationship.to_s
    @excluded_relationships << relationship_name unless @excluded_relationships.include?(relationship_name)
  end

  # Registers a serializer for a model, keyed by model name.
  # @param model [Class, #to_s] a class (its +name+ is used) or a model name
  # @param serializer [Object, nil] takes precedence over the block when given
  # @yield used as the serializer when +serializer+ is nil
  def add_custom_serializer(model, serializer = nil, &block)
    @custom_serializers[name_for(model)] = serializer || block
  end

  # Exclusions always win; an empty include list means "everything is included".
  # @return [Boolean]
  def model_included?(model_name)
    wanted = model_name.to_s
    return false if @excluded_models.include?(wanted)

    @included_models.empty? || @included_models.include?(wanted)
  end

  # Exclusions always win; an empty include list means "everything is included".
  # @return [Boolean]
  def relationship_included?(relationship_name)
    wanted = relationship_name.to_s
    return false if @excluded_relationships.include?(wanted)

    @included_relationships.empty? || @included_relationships.include?(wanted)
  end

  class << self
    # Yields the shared configuration instance to the caller's block.
    def configure
      yield(configuration)
    end

    # @return [Configuration] the shared instance, created on first access
    def configuration
      @configuration ||= new
    end

    # Replaces the shared instance with a fresh one holding the defaults.
    def reset!
      @configuration = new
    end
  end

  private

  # Raises unless +value+ is a real number greater than zero. The type check
  # comes first so that nil or a String yields the documented ArgumentError
  # rather than a NoMethodError / comparison failure.
  def ensure_positive!(option, value)
    return if value.is_a?(Numeric) && value.real? && value.positive?

    raise ArgumentError, "#{option} must be positive"
  end

  # Classes are stored under their +name+; anything else under its +to_s+.
  def name_for(model)
    model.is_a?(Class) ? model.name : model.to_s
  end
end
|
98
|
+
end
|
@@ -0,0 +1,406 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ActiveRecordGraphExtractor
|
4
|
+
# Orders models so that the things a model depends on come before it.
#
# Two graph shapes are handled:
# * simple:  { model => [dependency, ...] }
# * complex: { 'Model' => { 'relationship' => { 'model_class' => 'Other' } } }
#
# The instance-level API (#resolve_creation_order, #resolve_deletion_order,
# #validate_dependencies, #group_by_dependency_level) works on the simple graph
# handed to the constructor. #resolve, #detect_circular_dependencies and
# #build_creation_order take the graph as an argument.
class DependencyResolver
  attr_reader :dependency_graph, :resolved_order

  # @param dependency_graph [Hash] simple graph, model => array of its dependencies
  def initialize(dependency_graph)
    @dependency_graph = dependency_graph
    @resolved_order = []
  end

  # Depth-first ordering of every model reachable from the graph's keys.
  #
  # @return [Array] models with dependencies before the models that need them
  # @raise [DependencyError] when the graph contains a cycle
  def resolve_creation_order
    # Start from scratch on every call; otherwise a second call (for example
    # through #resolve_deletion_order) would append to the previous result and
    # return duplicated entries.
    @resolved_order = []
    visited = Set.new
    in_progress = Set.new

    dependency_graph.each_key do |model|
      visit_model(model, dependency_graph, visited, in_progress)
    end

    # #visit_model records a model only after all of its dependencies, so the
    # list is already dependencies-first and must not be reversed.
    @resolved_order.dup
  end

  # @return [Array] the creation order reversed (dependents before dependencies)
  # @raise [DependencyError] when the graph contains a cycle
  def resolve_deletion_order
    resolve_creation_order.reverse
  end

  # Checks +records_data+ against the dependency graph.
  #
  # @param records_data [Hash] model name => array of record hashes
  # @return [Hash] when some dependency has no entry in +records_data+:
  #   model name => array of the missing dependencies. Otherwise the result of
  #   the foreign-key check: model name => array of
  #   { record_id:, field:, references: } hashes (empty when all is well).
  def validate_dependencies(records_data)
    missing_dependencies = {}

    records_data.each_key do |model_name|
      absent = dependency_graph[model_name].to_a.reject { |dependency| records_data.key?(dependency) }
      missing_dependencies[model_name] = absent unless absent.empty?
    end

    return missing_dependencies if missing_dependencies.any?

    validate_foreign_key_references(records_data)
  end

  # Buckets models by how deep their dependency chain is.
  #
  # @return [Hash{Integer => Array}] level => models; level 0 holds models
  #   without dependencies, every other model sits one level above its deepest
  #   dependency
  # @raise [DependencyError] when the graph contains a cycle
  def group_by_dependency_level
    levels = {}

    resolve_creation_order.each do |model_name|
      dependencies = dependency_graph[model_name] || []
      deepest = dependencies.map { |dep| find_model_level(dep, levels) }.max
      # No dependencies at all (deepest is nil) means level 0.
      level = deepest ? deepest + 1 : 0
      (levels[level] ||= []) << model_name
    end

    levels
  end

  # Resolves a graph passed as an argument.
  #
  # @param dependency_graph [Hash] simple or complex graph (see class comment)
  # @return [Array] for a simple graph: its keys, dependencies first
  # @return [Hash] for a complex graph: 'creation_order' (array of levels),
  #   'circular_dependencies' and 'missing_models'
  # @raise [CircularDependencyError] when a simple graph contains a cycle
  def resolve(dependency_graph)
    return resolve_complex_graph(dependency_graph) if dependency_graph.values.first.is_a?(Hash)
    return [] if dependency_graph.empty?

    if detect_circular_dependencies(dependency_graph)
      raise CircularDependencyError, "Circular dependency detected in model relationships"
    end

    topological_sort(dependency_graph)
  end

  # @param dependency_graph [Hash] simple or complex graph (see class comment)
  # @return [Boolean] for a simple graph: whether any cycle exists
  # @return [Array<Array>] for a complex graph: the cycles found
  def detect_circular_dependencies(dependency_graph)
    return detect_complex_circular_dependencies(dependency_graph) if dependency_graph.values.first.is_a?(Hash)

    visited = Set.new
    rec_stack = Set.new

    dependency_graph.each_key.any? do |node|
      !visited.include?(node) && has_cycle?(node, dependency_graph, visited, rec_stack)
    end
  end

  # Pairs each model name with its records, in creation order.
  #
  # @param records_by_model [Array<Hash>, Hash] flat records carrying a
  #   '_model' key, or a hash already grouped by model name
  # @param dependency_graph [Hash] simple or complex graph (see class comment)
  # @return [Array<Array>] [model_name, records] pairs; models that the
  #   resolution does not mention are appended at the end
  # @raise [CircularDependencyError] when a simple graph contains a cycle
  # @raise [InvalidRecordError] when a flat record has no '_model' key
  def build_creation_order(records_by_model, dependency_graph)
    grouped_records = group_records_by_dependencies(records_by_model)

    resolution = resolve(dependency_graph)
    # A complex graph resolves to a Hash whose 'creation_order' is a list of
    # levels; flattening it yields a single dependencies-first list.
    ordered_models = resolution.is_a?(Hash) ? resolution['creation_order'].flatten : resolution

    placed = Set.new
    ordered_records = []

    ordered_models.each do |model|
      # Simple graphs are keyed by objects responding to #name (e.g. classes),
      # complex graphs by plain names.
      model_name = model.respond_to?(:name) ? model.name : model.to_s
      next unless grouped_records.key?(model_name)
      next unless placed.add?(model_name)

      ordered_records << [model_name, grouped_records[model_name]]
    end

    grouped_records.each do |model_name, records|
      ordered_records << [model_name, records] if placed.add?(model_name)
    end

    ordered_records
  end

  private

  # Depth-first visit: appends +model+ to @resolved_order once every one of
  # its dependencies has been appended. +temp_visited+ holds the models on the
  # current recursion path; meeting one of them again means a cycle.
  def visit_model(model, graph, visited, temp_visited)
    return if visited.include?(model)

    if temp_visited.include?(model)
      raise DependencyError.new(
        "Circular dependency detected involving #{model}",
        model: model
      )
    end

    temp_visited << model
    (graph[model] || []).each do |dependency|
      visit_model(dependency, graph, visited, temp_visited)
    end
    temp_visited.delete(model)

    visited << model
    @resolved_order << model
  end

  # Collects every relationship whose target table is absent from
  # +records_data+, or whose target table has no record with a matching
  # :original_id.
  def validate_foreign_key_references(records_data)
    missing_references = {}

    records_data.each do |model_name, records|
      records.each do |record|
        (record[:relationships] || {}).each do |field, reference|
          candidates = records_data[reference[:table]]
          next if candidates&.any? { |other| other[:original_id] == reference[:original_id] }

          (missing_references[model_name] ||= []) << {
            record_id: record[:original_id],
            field: field,
            references: reference
          }
        end
      end
    end

    missing_references
  end

  # @return [Integer] the level that lists +model_name+, or -1 when no level
  #   does (the caller adds 1, which places dependents of unknown models at 0)
  def find_model_level(model_name, levels)
    match = levels.find { |_level, models| models.include?(model_name) }
    match ? match.first : -1
  end

  # Kahn-style topological sort over the graph's keys. Dependencies that are
  # not keys themselves are ignored. Ties are broken by #name so the result is
  # deterministic; nodes must therefore respond to #name.
  def topological_sort(dependency_graph)
    pending = {}
    dependents = Hash.new { |hash, key| hash[key] = [] }

    dependency_graph.each do |node, dependencies|
      # Count each dependency once: a model may reference the same model
      # through several relationships, and counting it per occurrence would
      # leave the counter above zero forever and drop the model from the result.
      known = (dependencies || []).uniq.select { |dep| dependency_graph.key?(dep) }
      pending[node] = known.size
      known.each { |dep| dependents[dep] << node }
    end

    ready = pending.select { |_node, count| count.zero? }.keys
    result = []

    until ready.empty?
      current = ready.min_by(&:name)
      ready.delete(current)
      result << current

      dependents[current].each do |node|
        pending[node] -= 1
        ready << node if pending[node].zero?
      end
    end

    result
  end

  # Depth-first cycle check. +rec_stack+ holds the nodes on the current path;
  # reaching one of them again means a cycle.
  def has_cycle?(node, graph, visited, rec_stack)
    visited.add(node)
    rec_stack.add(node)

    cyclic = (graph[node] || []).any? do |neighbor|
      if visited.include?(neighbor)
        rec_stack.include?(neighbor)
      else
        has_cycle?(neighbor, graph, visited, rec_stack)
      end
    end
    return true if cyclic

    rec_stack.delete(node)
    false
  end

  # Groups a flat array of records by their '_model' key; anything that is not
  # an Array is assumed to be grouped already and is returned untouched.
  def group_records_by_dependencies(records)
    return records unless records.is_a?(Array)

    records.each_with_object({}) do |record, grouped|
      raise InvalidRecordError, "Record missing _model key: #{record.inspect}" unless record.key?('_model')

      (grouped[record['_model']] ||= []) << record
    end
  end

  # Flattens a complex graph to model => [dependency names] and reports the
  # creation levels, the cycles and the models that are referenced but have no
  # entry of their own.
  def resolve_complex_graph(dependency_graph)
    simple_graph = {}
    dependency_graph.each do |model_name, relationships|
      simple_graph[model_name] = relationships.map { |_relationship_name, info| info['model_class'] }
    end

    missing_models = simple_graph.values.flatten(1).uniq.reject { |name| dependency_graph.key?(name) }
    # Referenced-only models take part in the levels as dependency-free nodes.
    missing_models.each { |name| simple_graph[name] = [] }

    {
      'creation_order' => group_models_by_dependency_level(simple_graph),
      'circular_dependencies' => detect_complex_circular_dependencies(dependency_graph),
      'missing_models' => missing_models
    }
  end

  # @return [Array<Array>] the distinct cycles found; two cycles over the same
  #   set of models count as one
  def detect_complex_circular_dependencies(dependency_graph)
    circular_deps = []
    visited = Set.new

    dependency_graph.each_key do |model_name|
      next if visited.include?(model_name)

      circular_path = find_circular_path(model_name, dependency_graph, visited, Set.new, [])
      next unless circular_path

      # The path repeats its first node at the end; drop that closing element.
      clean_cycle = circular_path[0..-2]
      circular_deps << clean_cycle unless circular_deps.any? { |cycle| cycle.sort == clean_cycle.sort }
    end

    circular_deps
  end

  # Depth-first search for a cycle reachable from +model_name+.
  #
  # +global_visited+ holds models already proven cycle-free, +local_visited+
  # and +path+ describe the current recursion path.
  #
  # @return [Array, nil] the cycle, closed by repeating its first model, or nil
  def find_circular_path(model_name, dependency_graph, global_visited, local_visited, path)
    return nil if global_visited.include?(model_name)

    if local_visited.include?(model_name)
      cycle_start = path.index(model_name)
      return cycle_start && (path[cycle_start..-1] + [model_name])
    end

    local_visited.add(model_name)
    path << model_name

    found = nil
    (dependency_graph[model_name] || {}).each_value do |rel_info|
      dep_model = rel_info['model_class']
      next unless dependency_graph.key?(dep_model)

      found = find_circular_path(dep_model, dependency_graph, global_visited, local_visited, path)
      break if found
    end

    local_visited.delete(model_name)
    path.pop
    # Only models from which no cycle was found are marked as settled.
    global_visited.add(model_name) unless found
    found
  end

  # Repeatedly peels off the models whose dependencies are all placed already.
  # Models that can never be placed (they sit on a cycle) are left out.
  #
  # @return [Array<Array>] levels, dependency-free models first
  def group_models_by_dependency_level(simple_graph)
    levels = []
    processed = Set.new

    while processed.size < simple_graph.size
      current_level = simple_graph.select do |model_name, dependencies|
        !processed.include?(model_name) && dependencies.all? { |dep| processed.include?(dep) }
      end.keys

      # Nothing placeable is left: the remaining models depend on each other.
      break if current_level.empty?

      levels << current_level
      processed.merge(current_level)
    end

    levels
  end
end
|
406
|
+
end
|