activerecord-graph-extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,260 @@
+ # frozen_string_literal: true
+
+ require 'json'
+
+ module ActiveRecordGraphExtractor
+   class Importer
+     attr_reader :config
+
+     def initialize(config = ActiveRecordGraphExtractor.configuration)
+       @config = config
+     end
+
+     def import(data, options = {})
+       validate_data_structure!(data)
+
+       records = data['records']
+       raise ImportError, "No records found in data" if records.empty?
+
+       start_time = Time.now
+       pk_mapper = PrimaryKeyMapper.new(config.primary_key_strategy)
+
+       begin
+         imported_count = 0
+         skipped_count = 0
+         errors = []
+
+         use_transaction = options[:transaction] || config.use_transactions
+         batch_size = options[:batch_size] || 1000
+         skip_existing = options[:skip_existing] || false
+         custom_finders = options[:custom_finders] || {}
+
+         if use_transaction
+           ActiveRecord::Base.transaction do
+             imported_count, skipped_count, errors = import_records_in_order(
+               records, pk_mapper, skip_existing, custom_finders, batch_size
+             )
+           end
+         else
+           imported_count, skipped_count, errors = import_records_in_order(
+             records, pk_mapper, skip_existing, custom_finders, batch_size
+           )
+         end
+
+         import_duration = Time.now - start_time
+
+         {
+           'metadata' => build_import_metadata(start_time, imported_count, skipped_count, errors, import_duration, data['records'].size),
+           'imported_records' => imported_count,
+           'skipped_records' => skipped_count,
+           'errors' => errors,
+           'primary_key_mappings' => pk_mapper.get_all_mappings
+         }
+       rescue StandardError => e
+         raise ImportError, "Failed to import records: #{e.message}"
+       end
+     end
+
+     def import_from_file(file_path, options = {})
+       unless File.exist?(file_path)
+         raise FileError, "File not found: #{file_path}"
+       end
+
+       begin
+         file_content = File.read(file_path)
+         data = JSON.parse(file_content)
+         import(data, options)
+       rescue JSON::ParserError => e
+         raise JSONError, "Invalid JSON in file #{file_path}: #{e.message}"
+       rescue => e
+         raise FileError, "Error reading file #{file_path}: #{e.message}"
+       end
+     end
+
+     private
+
+     def validate_data_structure!(data)
+       unless data.is_a?(Hash) && data.key?('records')
+         raise ImportError, "Invalid data structure: expected Hash with 'records' key"
+       end
+     end
+
+     def import_records_in_order(records, pk_mapper, skip_existing, custom_finders, batch_size)
+       # Group records by model and resolve dependencies
+       resolver = DependencyResolver.new({})
+       analyzer = RelationshipAnalyzer.new(config)
+
+       records_by_model = group_records_by_model(records)
+       models = records_by_model.keys.map { |name| name.constantize rescue nil }.compact
+       dependency_graph = analyzer.build_dependency_graph(models)
+
+       ordered_records = resolver.build_creation_order(records_by_model, dependency_graph)
+
+       import_records(ordered_records, pk_mapper, skip_existing, custom_finders, batch_size)
+     end
+
+     def group_records_by_model(records)
+       grouped = {}
+
+       records.each do |record|
+         unless record.key?('_model')
+           raise ImportError, "Record missing _model key: #{record.inspect}"
+         end
+
+         model_name = record['_model']
+         grouped[model_name] ||= []
+         grouped[model_name] << record
+       end
+
+       grouped
+     end
+
+     def import_records(ordered_records, pk_mapper, skip_existing, custom_finders, batch_size)
+       total_imported = 0
+       total_skipped = 0
+       errors = []
+
+       # First pass: validate all records and check for existing records
+       records_to_import = []
+
+       ordered_records.each do |model_name, model_records|
+         model_records.each do |record_data|
+           begin
+             # Check for existing record if skip_existing is true
+             if skip_existing || custom_finders[model_name]
+               existing_record = find_existing_record(model_name, record_data, custom_finders)
+               if existing_record
+                 total_skipped += 1
+                 next
+               end
+             end
+
+             # Validate the record without saving
+             if validate_record(model_name, record_data, pk_mapper)
+               records_to_import << [model_name, record_data]
+             end
+           rescue ImportError, ActiveRecord::RecordInvalid => e
+             errors << {
+               model: model_name,
+               record: record_data,
+               error: e.message
+             }
+           rescue => e
+             errors << {
+               model: model_name,
+               record: record_data,
+               error: e.message
+             }
+           end
+         end
+       end
+
+       # If there are any validation errors, don't import anything
+       return [0, total_skipped, errors] if errors.any?
+
+       # Second pass: actually import the records
+       records_to_import.each_slice(batch_size) do |batch|
+         batch.each do |model_name, record_data|
+           begin
+             created_record = create_record(model_name, record_data, pk_mapper)
+
+             if created_record&.persisted?
+               original_id = record_data['id']
+               pk_mapper.add_mapping(model_name, original_id, created_record.id) if original_id
+               total_imported += 1
+             end
+           rescue => e
+             errors << {
+               model: model_name,
+               record: record_data,
+               error: e.message
+             }
+           end
+         end
+       end
+
+       [total_imported, total_skipped, errors]
+     end
+
+     def validate_record(model_name, record_data, pk_mapper)
+       return true unless config.validate_records
+
+       model_class = model_name.constantize
+       attributes = prepare_attributes(model_name, record_data, pk_mapper)
+       record = model_class.new(attributes)
+
+       unless record.valid?
+         raise ImportError, "Validation failed for #{model_name}: #{record.errors.full_messages.join(', ')}"
+       end
+
+       true
+     rescue NameError
+       raise ImportError, "Model class #{model_name} not found"
+     end
+
+     def find_existing_record(model_name, record_data, custom_finders)
+       if custom_finders[model_name]
+         custom_finders[model_name].call(record_data)
+       elsif record_data['id']
+         model_class = model_name.constantize
+         model_class.find_by(id: record_data['id'])
+       end
+     rescue NameError
+       nil
+     end
+
+     def create_record(model_name, record_data, pk_mapper)
+       model_class = model_name.constantize
+
+       attributes = prepare_attributes(model_name, record_data, pk_mapper)
+
+       record = model_class.new(attributes)
+
+       if config.validate_records
+         unless record.valid?
+           raise ImportError, "Validation failed for #{model_name}: #{record.errors.full_messages.join(', ')}"
+         end
+       end
+
+       record.save!
+       record
+     rescue NameError
+       raise ImportError, "Model class #{model_name} not found"
+     rescue ActiveRecord::RecordInvalid => e
+       raise ImportError, "Failed to create #{model_name}: #{e.message}"
+     end
+
+     def prepare_attributes(model_name, record_data, pk_mapper)
+       attributes = record_data.except('_model')
+
+       # Handle primary key based on strategy
+       unless pk_mapper.should_preserve_primary_key?
+         attributes.delete('id')
+       end
+
+       # Map foreign keys to new primary keys
+       attributes.each do |key, value|
+         if key.end_with?('_id') && value
+           mapped_value = pk_mapper.get_mapping(key.sub('_id', '').classify, value)
+           attributes[key] = mapped_value if mapped_value
+         end
+       end
+
+       attributes
+     end
+
+     def build_import_metadata(start_time, imported_count, skipped_count, errors, duration, total_records)
+       metadata = {
+         'import_time' => start_time.iso8601,
+         'total_records' => total_records,
+         'imported_records' => imported_count,
+         'skipped_records' => skipped_count,
+         'duration_seconds' => duration.round(3),
+         'primary_key_strategy' => config.primary_key_strategy.to_s
+       }
+
+       metadata['errors'] = errors if errors.any?
+       metadata
+     end
+   end
+ end
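
For orientation, a minimal usage sketch of the Importer above. This is a sketch only: the dump path, the Order/User model names, and the email-based finder are hypothetical, while the option keys mirror the ones read inside import.

# Hypothetical usage of ActiveRecordGraphExtractor::Importer (sketch only).
importer = ActiveRecordGraphExtractor::Importer.new

result = importer.import_from_file(
  'tmp/order_graph.json',      # hypothetical dump produced by the extractor
  transaction: true,           # wrap the whole import in one transaction
  batch_size: 500,             # import validated records in slices of 500
  skip_existing: true,         # skip records already present by id
  custom_finders: {
    # hypothetical finder: match existing users by email instead of id
    'User' => ->(record) { User.find_by(email: record['email']) }
  }
)

puts "imported: #{result['imported_records']}, skipped: #{result['skipped_records']}"
result['errors'].each { |err| warn "#{err[:model]}: #{err[:error]}" }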
@@ -0,0 +1,176 @@
+ # frozen_string_literal: true
+
+ require 'json'
+ require 'fileutils'
+ require 'oj'
+
+ module ActiveRecordGraphExtractor
+   class JSONSerializer
+     attr_reader :config
+
+     def initialize(config = Configuration.new)
+       @config = config
+     end
+
+     def serialize_to_file(data, file_path)
+       if config.stream_json
+         stream_serialize_to_file(data, file_path)
+       else
+         File.write(file_path, serialize_to_string(data))
+       end
+     end
+
+     def serialize_to_string(data)
+       Oj.dump(data, mode: :compat, indent: 2)
+     end
+
+     def deserialize_from_file(file_path)
+       raise Errno::ENOENT, "No such file or directory @ rb_sysopen - #{file_path}" unless File.exist?(file_path)
+
+       if config.stream_json
+         stream_deserialize_from_file(file_path)
+       else
+         Oj.load_file(file_path, mode: :compat)
+       end
+     end
+
+     def deserialize_from_string(json_string)
+       Oj.load(json_string, mode: :compat)
+     end
+
+     def validate_json_structure(data)
+       errors = []
+
+       # Check required metadata
+       unless data.is_a?(Hash)
+         errors << "Root data must be a hash"
+         return errors
+       end
+
+       metadata = data['metadata']
+       unless metadata.is_a?(Hash)
+         errors << "Missing or invalid metadata section"
+         return errors
+       end
+
+       required_metadata = %w[root_model root_id extracted_at schema_version]
+       required_metadata.each do |field|
+         unless metadata.key?(field)
+           errors << "Missing required metadata field: #{field}"
+         end
+       end
+
+       # Check records structure
+       records = data['records']
+       unless records.is_a?(Hash)
+         errors << "Missing or invalid records section"
+         return errors
+       end
+
+       records.each do |model_name, model_records|
+         unless model_records.is_a?(Array)
+           errors << "Records for #{model_name} must be an array"
+           next
+         end
+
+         model_records.each_with_index do |record, index|
+           record_errors = validate_record_structure(record, model_name, index)
+           errors.concat(record_errors)
+         end
+       end
+
+       errors
+     end
+
+     def estimate_file_size(data)
+       # Rough estimation based on JSON serialization
+       sample_size = [data.dig('records')&.values&.first&.size || 0, 100].min
+
+       if sample_size > 0
+         sample_data = data.dup
+         sample_data['records'] = data['records'].transform_values do |records|
+           records.first(sample_size)
+         end
+
+         sample_json = serialize_to_string(sample_data)
+         total_records = data['records'].values.sum(&:size)
+
+         (sample_json.bytesize.to_f / sample_size * total_records).round
+       else
+         serialize_to_string(data).bytesize
+       end
+     end
+
+     private
+
+     def stream_serialize_to_file(data, file_path)
+       File.open(file_path, 'w') do |file|
+         file.write('{"metadata":')
+         file.write(Oj.dump(data['metadata'], mode: :compat))
+         file.write(',"records":{')
+
+         model_names = data['records'].keys
+         model_names.each_with_index do |model_name, model_index|
+           file.write('"')
+           file.write(model_name)
+           file.write('":[')
+
+           records = data['records'][model_name]
+           records.each_with_index do |record, record_index|
+             file.write(Oj.dump(record, mode: :compat))
+             file.write(',') unless record_index == records.size - 1
+           end
+
+           file.write(']')
+           file.write(',') unless model_index == model_names.size - 1
+         end
+
+         file.write('}}')
+       end
+     end
+
+     def stream_deserialize_from_file(file_path)
+       # For streaming deserialization, we need to parse the JSON incrementally
+       # This is a simplified implementation - for production, consider using a proper streaming JSON parser
+       content = File.read(file_path)
+       Oj.load(content, mode: :compat)
+     end
+
+     def validate_record_structure(record, model_name, index)
+       errors = []
+
+       unless record.is_a?(Hash)
+         errors << "Record #{index} in #{model_name} must be a hash"
+         return errors
+       end
+
+       unless record.key?('original_id')
+         errors << "Record #{index} in #{model_name} missing original_id"
+       end
+
+       unless record.key?('attributes')
+         errors << "Record #{index} in #{model_name} missing attributes"
+       end
+
+       attributes = record['attributes']
+       unless attributes.is_a?(Hash)
+         errors << "Record #{index} in #{model_name} attributes must be a hash"
+       end
+
+       if record.key?('relationships')
+         relationships = record['relationships']
+         unless relationships.is_a?(Hash)
+           errors << "Record #{index} in #{model_name} relationships must be a hash"
+         else
+           relationships.each do |field, reference|
+             unless reference.is_a?(Hash) && reference.key?('table') && reference.key?('original_id')
+               errors << "Record #{index} in #{model_name} has invalid relationship #{field}"
+             end
+           end
+         end
+       end
+
+       errors
+     end
+   end
+ end
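
A minimal round-trip sketch for the JSONSerializer above. The payload shape follows what validate_json_structure checks for (a metadata section plus records keyed by model name); the concrete field values and the output path are hypothetical.

serializer = ActiveRecordGraphExtractor::JSONSerializer.new

# Hypothetical payload in the shape validate_json_structure expects.
data = {
  'metadata' => {
    'root_model' => 'Order',
    'root_id' => 123,
    'extracted_at' => '2024-01-01T00:00:00Z',
    'schema_version' => '1.0'
  },
  'records' => {
    'Order' => [
      { 'original_id' => 123, 'attributes' => { 'state' => 'shipped' } }
    ]
  }
}

errors = serializer.validate_json_structure(data)
raise "invalid payload: #{errors.join('; ')}" unless errors.empty?

puts "~#{serializer.estimate_file_size(data)} bytes on disk"
serializer.serialize_to_file(data, 'tmp/order_graph.json')   # streams when config.stream_json is set
reloaded = serializer.deserialize_from_file('tmp/order_graph.json')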
@@ -0,0 +1,57 @@
+ # frozen_string_literal: true
+
+ module ActiveRecordGraphExtractor
+   class PrimaryKeyMapper
+     attr_reader :strategy
+
+     def initialize(strategy = :generate_new)
+       unless [:preserve_original, :generate_new].include?(strategy)
+         raise ArgumentError, "Invalid strategy: #{strategy}. Must be :preserve_original or :generate_new"
+       end
+       @strategy = strategy
+       @mappings = {}
+     end
+
+     def add_mapping(model_name, original_id, new_id)
+       model_key = model_name.to_s
+       @mappings[model_key] ||= {}
+       @mappings[model_key][original_id] = new_id
+     end
+
+     def get_mapping(model_name, original_id)
+       model_key = model_name.to_s
+       @mappings.dig(model_key, original_id)
+     end
+
+     def map_foreign_key(column_name, original_value)
+       return original_value if original_value.nil?
+
+       # Try to infer the model name from the foreign key column
+       model_name = infer_model_name(column_name)
+       return original_value unless model_name
+
+       # Look up the mapping
+       get_mapping(model_name, original_value) || original_value
+     end
+
+     def get_all_mappings
+       @mappings.dup
+     end
+
+     def should_preserve_primary_key?
+       @strategy == :preserve_original
+     end
+
+     private
+
+     def infer_model_name(column_name)
+       return nil unless column_name.to_s.end_with?('_id')
+
+       # Remove _id suffix and convert to model name
+       base_name = column_name.to_s.sub(/_id$/, '')
+
+       # Convert snake_case to CamelCase
+       base_name.split('_').map(&:capitalize).join
+     end
+   end
+ end
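
A short sketch of how the mapper above behaves with the default :generate_new strategy; the model name and ids are hypothetical.

mapper = ActiveRecordGraphExtractor::PrimaryKeyMapper.new(:generate_new)

mapper.add_mapping('User', 10, 42)       # original id 10 was re-created as id 42
mapper.get_mapping('User', 10)           # => 42
mapper.map_foreign_key('user_id', 10)    # infers the User model from the column => 42
mapper.map_foreign_key('user_id', 99)    # unknown id passes through unchanged => 99
mapper.should_preserve_primary_key?      # => false (ids are regenerated on import)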
@@ -0,0 +1,202 @@
+ # frozen_string_literal: true
+
+ require 'json'
+
+ module ActiveRecordGraphExtractor
+   class ProgressTracker
+     attr_reader :enabled, :total_records, :processed_records, :model_progress, :start_time
+
+     def initialize(enabled: true, output: $stdout, total_records: 0)
+       @enabled = enabled
+       @output = output
+       @start_time = nil
+       @total_records = total_records
+       @processed_records = 0
+       @model_progress = {}
+     end
+
+     def start
+       @start_time = Time.now
+     end
+
+     def start_extraction(total_count)
+       return unless @enabled
+
+       @total_records = total_count
+       @start_time = Time.now
+       log_info("🚀 Starting extraction of #{format_number(total_count)} records...")
+     end
+
+     def update_progress(current_count, message = nil)
+       return unless @enabled
+
+       percentage = @total_records > 0 ? (current_count * 100.0 / @total_records).round(1) : 0
+
+       status = "📊 Progress: #{format_number(current_count)}/#{format_number(@total_records)} (#{percentage}%)"
+       status += " - #{message}" if message
+
+       log_info(status)
+     end
+
+     def complete_extraction(final_count, duration)
+       return unless @enabled
+
+       rate = duration > 0 ? (final_count / duration).round(1) : 0
+       log_info("✅ Extraction completed! #{format_number(final_count)} records in #{format_duration(duration)} (#{rate} records/sec)")
+     end
+
+     def start_import(total_count)
+       return unless @enabled
+
+       @total_records = total_count
+       @start_time = Time.now
+       log_info("🚀 Starting import of #{format_number(total_count)} records...")
+     end
+
+     def complete_import(final_count, duration)
+       return unless @enabled
+
+       rate = duration > 0 ? (final_count / duration).round(1) : 0
+       log_info("✅ Import completed! #{format_number(final_count)} records in #{format_duration(duration)} (#{rate} records/sec)")
+     end
+
+     def log_model_progress(model_name, current, total = nil)
+       if total.nil?
+         # If only current is provided, assume it's a simple increment
+         @model_progress[model_name] ||= { current: 0, total: 1, percentage: 0 }
+         @model_progress[model_name][:current] = current
+         @model_progress[model_name][:percentage] = 100
+       else
+         percentage = total > 0 ? (current * 100.0 / total).round(1) : 0
+         @model_progress[model_name] = {
+           current: current,
+           total: total,
+           percentage: percentage
+         }
+
+         if @enabled
+           log_info("📝 #{model_name}: #{format_number(current)}/#{format_number(total)} (#{percentage}%)")
+         end
+       end
+     end
+
+     def increment
+       @processed_records += 1
+     end
+
+     def progress_percentage
+       return 0 if @total_records == 0
+       (@processed_records * 100.0 / @total_records).round(1)
+     end
+
+     def elapsed_time
+       return 0 unless @start_time
+       Time.now - @start_time
+     end
+
+     def estimated_time_remaining
+       return 0 if @processed_records == 0 || @total_records == 0 || @processed_records >= @total_records
+
+       elapsed = elapsed_time
+       rate = @processed_records / elapsed
+       remaining_records = @total_records - @processed_records
+       remaining_records / rate
+     end
+
+     def records_per_second
+       return 0 if @processed_records == 0 || elapsed_time == 0
+       @processed_records / elapsed_time
+     end
+
+     def complete?
+       @total_records > 0 && @processed_records >= @total_records
+     end
+
+     def reset
+       @processed_records = 0
+       @model_progress = {}
+       @start_time = nil
+     end
+
+     def to_s
+       "Progress: #{@processed_records}/#{@total_records} (#{progress_percentage}%)"
+     end
+
+     def to_json(*args)
+       {
+         total_records: @total_records,
+         processed_records: @processed_records,
+         progress_percentage: progress_percentage,
+         elapsed_time: elapsed_time,
+         estimated_time_remaining: estimated_time_remaining,
+         records_per_second: records_per_second,
+         model_progress: @model_progress,
+         complete: complete?
+       }.to_json(*args)
+     end
+
+     def log_progress_to_io(io)
+       io.puts(to_s)
+       io.puts("Elapsed: #{format_duration(elapsed_time)}")
+       io.puts("Remaining: #{format_duration(estimated_time_remaining)}")
+
+       @model_progress.each do |model, progress|
+         io.puts("#{model}: #{progress[:current]}/#{progress[:total]} (#{progress[:percentage]}%)")
+       end
+     end
+
+     def log_error(message)
+       # Always show errors, even if progress is disabled
+       @output.puts("❌ ERROR: #{message}")
+     rescue StandardError
+       # Silently ignore output errors
+     end
+
+     def log_warning(message)
+       return unless @enabled
+
+       @output.puts("⚠️ WARNING: #{message}")
+     rescue StandardError
+       # Silently ignore output errors
+     end
+
+     def log_info(message)
+       return unless @enabled
+
+       @output.puts(message)
+     rescue StandardError
+       # Silently ignore output errors
+     end
+
+     def log_memory_usage
+       return unless @enabled
+
+       memory_mb = current_memory_usage
+       log_info("💾 Memory usage: #{memory_mb} MB")
+     end
+
+     def current_memory_usage
+       if defined?(GC.stat)
+         (GC.stat[:heap_allocated_pages] * 4096 / 1024.0 / 1024.0).round(1)
+       else
+         0.0
+       end
+     end
+
+     private
+
+     def format_number(number)
+       number.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
+     end
+
+     def format_duration(seconds)
+       if seconds < 60
+         "#{seconds.round(2)}s"
+       else
+         minutes = (seconds / 60).floor
+         remaining_seconds = (seconds % 60).round
+         "#{minutes}m #{remaining_seconds}s"
+       end
+     end
+   end
+ end
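
Finally, a sketch of driving the ProgressTracker above directly; the record count and the loop standing in for real extraction work are hypothetical.

tracker = ActiveRecordGraphExtractor::ProgressTracker.new(total_records: 2_500)

tracker.start_extraction(2_500)
2_500.times do |i|
  tracker.increment                                             # count one processed record
  tracker.update_progress(i + 1, 'Order graph') if ((i + 1) % 500).zero?
end
tracker.complete_extraction(tracker.processed_records, tracker.elapsed_time)

tracker.log_memory_usage                                        # rough estimate derived from GC.stat
puts tracker.to_s                                               # "Progress: 2500/2500 (100.0%)"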