activerecord-graph-extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,260 @@
+ # frozen_string_literal: true
+
+ require 'json'
+
+ module ActiveRecordGraphExtractor
+   class Importer
+     attr_reader :config
+
+     def initialize(config = ActiveRecordGraphExtractor.configuration)
+       @config = config
+     end
+
+     def import(data, options = {})
+       validate_data_structure!(data)
+
+       records = data['records']
+       raise ImportError, "No records found in data" if records.empty?
+
+       start_time = Time.now
+       pk_mapper = PrimaryKeyMapper.new(config.primary_key_strategy)
+
+       begin
+         imported_count = 0
+         skipped_count = 0
+         errors = []
+
+         use_transaction = options[:transaction] || config.use_transactions
+         batch_size = options[:batch_size] || 1000
+         skip_existing = options[:skip_existing] || false
+         custom_finders = options[:custom_finders] || {}
+
+         if use_transaction
+           ActiveRecord::Base.transaction do
+             imported_count, skipped_count, errors = import_records_in_order(
+               records, pk_mapper, skip_existing, custom_finders, batch_size
+             )
+           end
+         else
+           imported_count, skipped_count, errors = import_records_in_order(
+             records, pk_mapper, skip_existing, custom_finders, batch_size
+           )
+         end
+
+         import_duration = Time.now - start_time
+
+         {
+           'metadata' => build_import_metadata(start_time, imported_count, skipped_count, errors, import_duration, data['records'].size),
+           'imported_records' => imported_count,
+           'skipped_records' => skipped_count,
+           'errors' => errors,
+           'primary_key_mappings' => pk_mapper.get_all_mappings
+         }
+       rescue StandardError => e
+         raise ImportError, "Failed to import records: #{e.message}"
+       end
+     end
+
+     def import_from_file(file_path, options = {})
+       unless File.exist?(file_path)
+         raise FileError, "File not found: #{file_path}"
+       end
+
+       begin
+         file_content = File.read(file_path)
+         data = JSON.parse(file_content)
+         import(data, options)
+       rescue JSON::ParserError => e
+         raise JSONError, "Invalid JSON in file #{file_path}: #{e.message}"
+       rescue => e
+         raise FileError, "Error reading file #{file_path}: #{e.message}"
+       end
+     end
+
+     private
+
+     def validate_data_structure!(data)
+       unless data.is_a?(Hash) && data.key?('records')
+         raise ImportError, "Invalid data structure: expected Hash with 'records' key"
+       end
+     end
+
+     def import_records_in_order(records, pk_mapper, skip_existing, custom_finders, batch_size)
+       # Group records by model and resolve dependencies
+       resolver = DependencyResolver.new({})
+       analyzer = RelationshipAnalyzer.new(config)
+
+       records_by_model = group_records_by_model(records)
+       models = records_by_model.keys.map { |name| name.constantize rescue nil }.compact
+       dependency_graph = analyzer.build_dependency_graph(models)
+
+       ordered_records = resolver.build_creation_order(records_by_model, dependency_graph)
+
+       import_records(ordered_records, pk_mapper, skip_existing, custom_finders, batch_size)
+     end
+
+     def group_records_by_model(records)
+       grouped = {}
+
+       records.each do |record|
+         unless record.key?('_model')
+           raise ImportError, "Record missing _model key: #{record.inspect}"
+         end
+
+         model_name = record['_model']
+         grouped[model_name] ||= []
+         grouped[model_name] << record
+       end
+
+       grouped
+     end
+
+     def import_records(ordered_records, pk_mapper, skip_existing, custom_finders, batch_size)
+       total_imported = 0
+       total_skipped = 0
+       errors = []
+
+       # First pass: validate all records and check for existing records
+       records_to_import = []
+
+       ordered_records.each do |model_name, model_records|
+         model_records.each do |record_data|
+           begin
+             # Check for existing record if skip_existing is true
+             if skip_existing || custom_finders[model_name]
+               existing_record = find_existing_record(model_name, record_data, custom_finders)
+               if existing_record
+                 total_skipped += 1
+                 next
+               end
+             end
+
+             # Validate the record without saving
+             if validate_record(model_name, record_data, pk_mapper)
+               records_to_import << [model_name, record_data]
+             end
+           rescue ImportError, ActiveRecord::RecordInvalid => e
+             errors << {
+               model: model_name,
+               record: record_data,
+               error: e.message
+             }
+           rescue => e
+             errors << {
+               model: model_name,
+               record: record_data,
+               error: e.message
+             }
+           end
+         end
+       end
+
+       # If there are any validation errors, don't import anything
+       return [0, total_skipped, errors] if errors.any?
+
+       # Second pass: actually import the records
+       records_to_import.each_slice(batch_size) do |batch|
+         batch.each do |model_name, record_data|
+           begin
+             created_record = create_record(model_name, record_data, pk_mapper)
+
+             if created_record&.persisted?
+               original_id = record_data['id']
+               pk_mapper.add_mapping(model_name, original_id, created_record.id) if original_id
+               total_imported += 1
+             end
+           rescue => e
+             errors << {
+               model: model_name,
+               record: record_data,
+               error: e.message
+             }
+           end
+         end
+       end
+
+       [total_imported, total_skipped, errors]
+     end
+
+     def validate_record(model_name, record_data, pk_mapper)
+       return true unless config.validate_records
+
+       model_class = model_name.constantize
+       attributes = prepare_attributes(model_name, record_data, pk_mapper)
+       record = model_class.new(attributes)
+
+       unless record.valid?
+         raise ImportError, "Validation failed for #{model_name}: #{record.errors.full_messages.join(', ')}"
+       end
+
+       true
+     rescue NameError
+       raise ImportError, "Model class #{model_name} not found"
+     end
+
+     def find_existing_record(model_name, record_data, custom_finders)
+       if custom_finders[model_name]
+         custom_finders[model_name].call(record_data)
+       elsif record_data['id']
+         model_class = model_name.constantize
+         model_class.find_by(id: record_data['id'])
+       end
+     rescue NameError
+       nil
+     end
+
+     def create_record(model_name, record_data, pk_mapper)
+       model_class = model_name.constantize
+
+       attributes = prepare_attributes(model_name, record_data, pk_mapper)
+
+       record = model_class.new(attributes)
+
+       if config.validate_records
+         unless record.valid?
+           raise ImportError, "Validation failed for #{model_name}: #{record.errors.full_messages.join(', ')}"
+         end
+       end
+
+       record.save!
+       record
+     rescue NameError
+       raise ImportError, "Model class #{model_name} not found"
+     rescue ActiveRecord::RecordInvalid => e
+       raise ImportError, "Failed to create #{model_name}: #{e.message}"
+     end
+
+     def prepare_attributes(model_name, record_data, pk_mapper)
+       attributes = record_data.except('_model')
+
+       # Handle primary key based on strategy
+       unless pk_mapper.should_preserve_primary_key?
+         attributes.delete('id')
+       end
+
+       # Map foreign keys to new primary keys
+       attributes.each do |key, value|
+         if key.end_with?('_id') && value
+           mapped_value = pk_mapper.get_mapping(key.sub('_id', '').classify, value)
+           attributes[key] = mapped_value if mapped_value
+         end
+       end
+
+       attributes
+     end
+
+     def build_import_metadata(start_time, imported_count, skipped_count, errors, duration, total_records)
+       metadata = {
+         'import_time' => start_time.iso8601,
+         'total_records' => total_records,
+         'imported_records' => imported_count,
+         'skipped_records' => skipped_count,
+         'duration_seconds' => duration.round(3),
+         'primary_key_strategy' => config.primary_key_strategy.to_s
+       }
+
+       metadata['errors'] = errors if errors.any?
+       metadata
+     end
+   end
+ end
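
For orientation, a minimal usage sketch of the Importer above. This is a sketch only: the dump path, the Order/User model names, and the email-based finder are hypothetical, while the option keys mirror the ones read inside import.

# Hypothetical usage of ActiveRecordGraphExtractor::Importer (sketch only).
importer = ActiveRecordGraphExtractor::Importer.new

result = importer.import_from_file(
  'tmp/order_graph.json',      # hypothetical dump produced by the extractor
  transaction: true,           # wrap the whole import in one transaction
  batch_size: 500,             # import validated records in slices of 500
  skip_existing: true,         # skip records already present by id
  custom_finders: {
    # hypothetical finder: match existing users by email instead of id
    'User' => ->(record) { User.find_by(email: record['email']) }
  }
)

puts "imported: #{result['imported_records']}, skipped: #{result['skipped_records']}"
result['errors'].each { |err| warn "#{err[:model]}: #{err[:error]}" }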
@@ -0,0 +1,176 @@
+ # frozen_string_literal: true
+
+ require 'json'
+ require 'fileutils'
+ require 'oj'
+
+ module ActiveRecordGraphExtractor
+   class JSONSerializer
+     attr_reader :config
+
+     def initialize(config = Configuration.new)
+       @config = config
+     end
+
+     def serialize_to_file(data, file_path)
+       if config.stream_json
+         stream_serialize_to_file(data, file_path)
+       else
+         File.write(file_path, serialize_to_string(data))
+       end
+     end
+
+     def serialize_to_string(data)
+       Oj.dump(data, mode: :compat, indent: 2)
+     end
+
+     def deserialize_from_file(file_path)
+       raise Errno::ENOENT, "No such file or directory @ rb_sysopen - #{file_path}" unless File.exist?(file_path)
+
+       if config.stream_json
+         stream_deserialize_from_file(file_path)
+       else
+         Oj.load_file(file_path, mode: :compat)
+       end
+     end
+
+     def deserialize_from_string(json_string)
+       Oj.load(json_string, mode: :compat)
+     end
+
+     def validate_json_structure(data)
+       errors = []
+
+       # Check required metadata
+       unless data.is_a?(Hash)
+         errors << "Root data must be a hash"
+         return errors
+       end
+
+       metadata = data['metadata']
+       unless metadata.is_a?(Hash)
+         errors << "Missing or invalid metadata section"
+         return errors
+       end
+
+       required_metadata = %w[root_model root_id extracted_at schema_version]
+       required_metadata.each do |field|
+         unless metadata.key?(field)
+           errors << "Missing required metadata field: #{field}"
+         end
+       end
+
+       # Check records structure
+       records = data['records']
+       unless records.is_a?(Hash)
+         errors << "Missing or invalid records section"
+         return errors
+       end
+
+       records.each do |model_name, model_records|
+         unless model_records.is_a?(Array)
+           errors << "Records for #{model_name} must be an array"
+           next
+         end
+
+         model_records.each_with_index do |record, index|
+           record_errors = validate_record_structure(record, model_name, index)
+           errors.concat(record_errors)
+         end
+       end
+
+       errors
+     end
+
+     def estimate_file_size(data)
+       # Rough estimation based on JSON serialization
+       sample_size = [data.dig('records')&.values&.first&.size || 0, 100].min
+
+       if sample_size > 0
+         sample_data = data.dup
+         sample_data['records'] = data['records'].transform_values do |records|
+           records.first(sample_size)
+         end
+
+         sample_json = serialize_to_string(sample_data)
+         total_records = data['records'].values.sum(&:size)
+
+         (sample_json.bytesize.to_f / sample_size * total_records).round
+       else
+         serialize_to_string(data).bytesize
+       end
+     end
+
+     private
+
+     def stream_serialize_to_file(data, file_path)
+       File.open(file_path, 'w') do |file|
+         file.write('{"metadata":')
+         file.write(Oj.dump(data['metadata'], mode: :compat))
+         file.write(',"records":{')
+
+         model_names = data['records'].keys
+         model_names.each_with_index do |model_name, model_index|
+           file.write('"')
+           file.write(model_name)
+           file.write('":[')
+
+           records = data['records'][model_name]
+           records.each_with_index do |record, record_index|
+             file.write(Oj.dump(record, mode: :compat))
+             file.write(',') unless record_index == records.size - 1
+           end
+
+           file.write(']')
+           file.write(',') unless model_index == model_names.size - 1
+         end
+
+         file.write('}}')
+       end
+     end
+
+     def stream_deserialize_from_file(file_path)
+       # For streaming deserialization, we need to parse the JSON incrementally
+       # This is a simplified implementation - for production, consider using a proper streaming JSON parser
+       content = File.read(file_path)
+       Oj.load(content, mode: :compat)
+     end
+
+     def validate_record_structure(record, model_name, index)
+       errors = []
+
+       unless record.is_a?(Hash)
+         errors << "Record #{index} in #{model_name} must be a hash"
+         return errors
+       end
+
+       unless record.key?('original_id')
+         errors << "Record #{index} in #{model_name} missing original_id"
+       end
+
+       unless record.key?('attributes')
+         errors << "Record #{index} in #{model_name} missing attributes"
+       end
+
+       attributes = record['attributes']
+       unless attributes.is_a?(Hash)
+         errors << "Record #{index} in #{model_name} attributes must be a hash"
+       end
+
+       if record.key?('relationships')
+         relationships = record['relationships']
+         unless relationships.is_a?(Hash)
+           errors << "Record #{index} in #{model_name} relationships must be a hash"
+         else
+           relationships.each do |field, reference|
+             unless reference.is_a?(Hash) && reference.key?('table') && reference.key?('original_id')
+               errors << "Record #{index} in #{model_name} has invalid relationship #{field}"
+             end
+           end
+         end
+       end
+
+       errors
+     end
+   end
+ end
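
A minimal round-trip sketch for the JSONSerializer above. The payload shape follows what validate_json_structure checks for (a metadata section plus records keyed by model name); the concrete field values and the output path are hypothetical.

serializer = ActiveRecordGraphExtractor::JSONSerializer.new

# Hypothetical payload in the shape validate_json_structure expects.
data = {
  'metadata' => {
    'root_model' => 'Order',
    'root_id' => 123,
    'extracted_at' => '2024-01-01T00:00:00Z',
    'schema_version' => '1.0'
  },
  'records' => {
    'Order' => [
      { 'original_id' => 123, 'attributes' => { 'state' => 'shipped' } }
    ]
  }
}

errors = serializer.validate_json_structure(data)
raise "invalid payload: #{errors.join('; ')}" unless errors.empty?

puts "~#{serializer.estimate_file_size(data)} bytes on disk"
serializer.serialize_to_file(data, 'tmp/order_graph.json')   # streams when config.stream_json is set
reloaded = serializer.deserialize_from_file('tmp/order_graph.json')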
@@ -0,0 +1,57 @@
+ # frozen_string_literal: true
+
+ module ActiveRecordGraphExtractor
+   class PrimaryKeyMapper
+     attr_reader :strategy
+
+     def initialize(strategy = :generate_new)
+       unless [:preserve_original, :generate_new].include?(strategy)
+         raise ArgumentError, "Invalid strategy: #{strategy}. Must be :preserve_original or :generate_new"
+       end
+       @strategy = strategy
+       @mappings = {}
+     end
+
+     def add_mapping(model_name, original_id, new_id)
+       model_key = model_name.to_s
+       @mappings[model_key] ||= {}
+       @mappings[model_key][original_id] = new_id
+     end
+
+     def get_mapping(model_name, original_id)
+       model_key = model_name.to_s
+       @mappings.dig(model_key, original_id)
+     end
+
+     def map_foreign_key(column_name, original_value)
+       return original_value if original_value.nil?
+
+       # Try to infer the model name from the foreign key column
+       model_name = infer_model_name(column_name)
+       return original_value unless model_name
+
+       # Look up the mapping
+       get_mapping(model_name, original_value) || original_value
+     end
+
+     def get_all_mappings
+       @mappings.dup
+     end
+
+     def should_preserve_primary_key?
+       @strategy == :preserve_original
+     end
+
+     private
+
+     def infer_model_name(column_name)
+       return nil unless column_name.to_s.end_with?('_id')
+
+       # Remove _id suffix and convert to model name
+       base_name = column_name.to_s.sub(/_id$/, '')
+
+       # Convert snake_case to CamelCase
+       base_name.split('_').map(&:capitalize).join
+     end
+   end
+ end
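
A short sketch of how the mapper above behaves with the default :generate_new strategy; the model name and ids are hypothetical.

mapper = ActiveRecordGraphExtractor::PrimaryKeyMapper.new(:generate_new)

mapper.add_mapping('User', 10, 42)       # original id 10 was re-created as id 42
mapper.get_mapping('User', 10)           # => 42
mapper.map_foreign_key('user_id', 10)    # infers the User model from the column => 42
mapper.map_foreign_key('user_id', 99)    # unknown id passes through unchanged => 99
mapper.should_preserve_primary_key?      # => false (ids are regenerated on import)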
@@ -0,0 +1,202 @@
+ # frozen_string_literal: true
+
+ require 'json'
+
+ module ActiveRecordGraphExtractor
+   class ProgressTracker
+     attr_reader :enabled, :total_records, :processed_records, :model_progress, :start_time
+
+     def initialize(enabled: true, output: $stdout, total_records: 0)
+       @enabled = enabled
+       @output = output
+       @start_time = nil
+       @total_records = total_records
+       @processed_records = 0
+       @model_progress = {}
+     end
+
+     def start
+       @start_time = Time.now
+     end
+
+     def start_extraction(total_count)
+       return unless @enabled
+
+       @total_records = total_count
+       @start_time = Time.now
+       log_info("🚀 Starting extraction of #{format_number(total_count)} records...")
+     end
+
+     def update_progress(current_count, message = nil)
+       return unless @enabled
+
+       percentage = @total_records > 0 ? (current_count * 100.0 / @total_records).round(1) : 0
+
+       status = "📊 Progress: #{format_number(current_count)}/#{format_number(@total_records)} (#{percentage}%)"
+       status += " - #{message}" if message
+
+       log_info(status)
+     end
+
+     def complete_extraction(final_count, duration)
+       return unless @enabled
+
+       rate = duration > 0 ? (final_count / duration).round(1) : 0
+       log_info("✅ Extraction completed! #{format_number(final_count)} records in #{format_duration(duration)} (#{rate} records/sec)")
+     end
+
+     def start_import(total_count)
+       return unless @enabled
+
+       @total_records = total_count
+       @start_time = Time.now
+       log_info("🚀 Starting import of #{format_number(total_count)} records...")
+     end
+
+     def complete_import(final_count, duration)
+       return unless @enabled
+
+       rate = duration > 0 ? (final_count / duration).round(1) : 0
+       log_info("✅ Import completed! #{format_number(final_count)} records in #{format_duration(duration)} (#{rate} records/sec)")
+     end
+
+     def log_model_progress(model_name, current, total = nil)
+       if total.nil?
+         # If only current is provided, assume it's a simple increment
+         @model_progress[model_name] ||= { current: 0, total: 1, percentage: 0 }
+         @model_progress[model_name][:current] = current
+         @model_progress[model_name][:percentage] = 100
+       else
+         percentage = total > 0 ? (current * 100.0 / total).round(1) : 0
+         @model_progress[model_name] = {
+           current: current,
+           total: total,
+           percentage: percentage
+         }
+
+         if @enabled
+           log_info("📝 #{model_name}: #{format_number(current)}/#{format_number(total)} (#{percentage}%)")
+         end
+       end
+     end
+
+     def increment
+       @processed_records += 1
+     end
+
+     def progress_percentage
+       return 0 if @total_records == 0
+       (@processed_records * 100.0 / @total_records).round(1)
+     end
+
+     def elapsed_time
+       return 0 unless @start_time
+       Time.now - @start_time
+     end
+
+     def estimated_time_remaining
+       return 0 if @processed_records == 0 || @total_records == 0 || @processed_records >= @total_records
+
+       elapsed = elapsed_time
+       rate = @processed_records / elapsed
+       remaining_records = @total_records - @processed_records
+       remaining_records / rate
+     end
+
+     def records_per_second
+       return 0 if @processed_records == 0 || elapsed_time == 0
+       @processed_records / elapsed_time
+     end
+
+     def complete?
+       @total_records > 0 && @processed_records >= @total_records
+     end
+
+     def reset
+       @processed_records = 0
+       @model_progress = {}
+       @start_time = nil
+     end
+
+     def to_s
+       "Progress: #{@processed_records}/#{@total_records} (#{progress_percentage}%)"
+     end
+
+     def to_json(*args)
+       {
+         total_records: @total_records,
+         processed_records: @processed_records,
+         progress_percentage: progress_percentage,
+         elapsed_time: elapsed_time,
+         estimated_time_remaining: estimated_time_remaining,
+         records_per_second: records_per_second,
+         model_progress: @model_progress,
+         complete: complete?
+       }.to_json(*args)
+     end
+
+     def log_progress_to_io(io)
+       io.puts(to_s)
+       io.puts("Elapsed: #{format_duration(elapsed_time)}")
+       io.puts("Remaining: #{format_duration(estimated_time_remaining)}")
+
+       @model_progress.each do |model, progress|
+         io.puts("#{model}: #{progress[:current]}/#{progress[:total]} (#{progress[:percentage]}%)")
+       end
+     end
+
+     def log_error(message)
+       # Always show errors, even if progress is disabled
+       @output.puts("❌ ERROR: #{message}")
+     rescue StandardError
+       # Silently ignore output errors
+     end
+
+     def log_warning(message)
+       return unless @enabled
+
+       @output.puts("⚠️ WARNING: #{message}")
+     rescue StandardError
+       # Silently ignore output errors
+     end
+
+     def log_info(message)
+       return unless @enabled
+
+       @output.puts(message)
+     rescue StandardError
+       # Silently ignore output errors
+     end
+
+     def log_memory_usage
+       return unless @enabled
+
+       memory_mb = current_memory_usage
+       log_info("💾 Memory usage: #{memory_mb} MB")
+     end
+
+     def current_memory_usage
+       if defined?(GC.stat)
+         (GC.stat[:heap_allocated_pages] * 4096 / 1024.0 / 1024.0).round(1)
+       else
+         0.0
+       end
+     end
+
+     private
+
+     def format_number(number)
+       number.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
+     end
+
+     def format_duration(seconds)
+       if seconds < 60
+         "#{seconds.round(2)}s"
+       else
+         minutes = (seconds / 60).floor
+         remaining_seconds = (seconds % 60).round
+         "#{minutes}m #{remaining_seconds}s"
+       end
+     end
+   end
+ end
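
Finally, a sketch of driving the ProgressTracker above directly; the record count and the loop standing in for real extraction work are hypothetical.

tracker = ActiveRecordGraphExtractor::ProgressTracker.new(total_records: 2_500)

tracker.start_extraction(2_500)
2_500.times do |i|
  tracker.increment                                             # count one processed record
  tracker.update_progress(i + 1, 'Order graph') if ((i + 1) % 500).zero?
end
tracker.complete_extraction(tracker.processed_records, tracker.elapsed_time)

tracker.log_memory_usage                                        # rough estimate derived from GC.stat
puts tracker.to_s                                               # "Progress: 2500/2500 (100.0%)"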