legal_summariser 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +92 -29
- data/lib/legal_summariser/model_trainer.rb +707 -0
- data/lib/legal_summariser/multilingual_processor.rb +683 -0
- data/lib/legal_summariser/pdf_annotator.rb +601 -0
- data/lib/legal_summariser/plain_language_generator.rb +463 -0
- data/lib/legal_summariser/version.rb +1 -1
- metadata +19 -11
 
data/lib/legal_summariser/model_trainer.rb
@@ -0,0 +1,707 @@
require 'json'
require 'fileutils'
require 'digest'

module LegalSummariser
  # Advanced model training and fine-tuning capabilities for legal text processing
  class ModelTrainer
    class TrainingError < StandardError; end
    class ValidationError < StandardError; end
    class ModelNotFoundError < StandardError; end

    attr_reader :config, :logger, :training_data_dir, :models_dir

    def initialize(config = nil)
      @config = config || LegalSummariser.configuration
      @logger = @config.logger
      @training_data_dir = File.join(@config.cache_dir, 'training_data')
      @models_dir = File.join(@config.cache_dir, 'models')

      setup_directories
    end

    # Train a new model with provided training data
    def train_model(training_data, model_name, options = {})
      validate_training_data(training_data)

      @logger&.info("Starting model training for '#{model_name}' with #{training_data.length} examples")

      start_time = Time.now
      model_id = generate_model_id(model_name)

      begin
        # Prepare training data
        prepared_data = prepare_training_data(training_data, options)

        # Train different model types
        model_results = {}

        if options[:train_pattern_model] != false
          model_results[:pattern_model] = train_pattern_model(prepared_data, model_id)
        end

        if options[:train_statistical_model] != false
          model_results[:statistical_model] = train_statistical_model(prepared_data, model_id)
        end

        if options[:train_neural_model] && options[:neural_config]
          model_results[:neural_model] = train_neural_model(prepared_data, model_id, options[:neural_config])
        end

        # Save model metadata
        model_metadata = {
          model_id: model_id,
          model_name: model_name,
          created_at: Time.now.iso8601,
          training_examples: training_data.length,
          model_types: model_results.keys,
          performance_metrics: calculate_training_metrics(prepared_data, model_results),
          options: options,
          version: '0.3.0'
        }

        save_model_metadata(model_id, model_metadata)

        duration = Time.now - start_time
        @logger&.info("Model training completed in #{duration.round(2)}s")

        {
          model_id: model_id,
          model_name: model_name,
          training_time: duration,
          models: model_results,
          metadata: model_metadata
        }

      rescue => e
        @logger&.error("Model training failed: #{e.message}")
        raise TrainingError, "Failed to train model '#{model_name}': #{e.message}"
      end
    end
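For orientation only (this sketch is not part of the diff): train_model takes an array of hashes keyed by 'legal' and 'plain' plus a model name, and returns a hash containing the generated model_id. A minimal usage sketch, assuming the gem's usual top-level require and a default configuration that supplies cache_dir and logger:

  require 'legal_summariser'

  trainer  = LegalSummariser::ModelTrainer.new
  examples = [
    { 'legal' => 'The lessee shall remit payment within thirty (30) days.',
      'plain' => 'The tenant must pay within 30 days.' }
  ]

  result = trainer.train_model(examples, 'tenancy-simplifier')  # hypothetical model name
  puts result[:model_id]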

    # Fine-tune an existing model with additional data
    def fine_tune_model(model_id, additional_data, options = {})
      model_metadata = load_model_metadata(model_id)
      raise ModelNotFoundError, "Model '#{model_id}' not found" unless model_metadata

      @logger&.info("Fine-tuning model '#{model_id}' with #{additional_data.length} additional examples")

      # Load existing training data
      existing_data = load_training_data(model_id)
      combined_data = existing_data + additional_data

      # Retrain with combined data
      train_model(combined_data, model_metadata['model_name'], options.merge(model_id: model_id))
    end

    # Evaluate model performance
    def evaluate_model(model_id, test_data)
      model_metadata = load_model_metadata(model_id)
      raise ModelNotFoundError, "Model '#{model_id}' not found" unless model_metadata

      @logger&.info("Evaluating model '#{model_id}' with #{test_data.length} test examples")

      results = {
        model_id: model_id,
        test_examples: test_data.length,
        accuracy_scores: {},
        performance_metrics: {}
      }

      # Evaluate each model type
      model_metadata['model_types'].each do |model_type|
        model_path = File.join(@models_dir, model_id, "#{model_type}.json")
        next unless File.exist?(model_path)

        model_data = JSON.parse(File.read(model_path))
        accuracy = evaluate_model_type(model_data, test_data, model_type)

        results[:accuracy_scores][model_type] = accuracy
        results[:performance_metrics][model_type] = calculate_detailed_metrics(model_data, test_data, model_type)
      end

      results
    end

    # List all trained models
    def list_models
      return [] unless Dir.exist?(@models_dir)

      models = []
      Dir.glob(File.join(@models_dir, '*')).each do |model_dir|
        next unless File.directory?(model_dir)

        model_id = File.basename(model_dir)
        metadata_file = File.join(model_dir, 'metadata.json')

        if File.exist?(metadata_file)
          metadata = JSON.parse(File.read(metadata_file))
          models << {
            model_id: model_id,
            model_name: metadata['model_name'],
            created_at: metadata['created_at'],
            training_examples: metadata['training_examples'],
            model_types: metadata['model_types']
          }
        end
      end

      models.sort_by { |m| m[:created_at] }.reverse
    end

    # Delete a trained model
    def delete_model(model_id)
      model_dir = File.join(@models_dir, model_id)

      if Dir.exist?(model_dir)
        FileUtils.rm_rf(model_dir)
        @logger&.info("Deleted model '#{model_id}'")
        true
      else
        false
      end
    end

    # Export model for deployment
    def export_model(model_id, export_path)
      model_dir = File.join(@models_dir, model_id)
      raise ModelNotFoundError, "Model '#{model_id}' not found" unless Dir.exist?(model_dir)

      # Create export package
      export_data = {
        model_id: model_id,
        exported_at: Time.now.iso8601,
        metadata: load_model_metadata(model_id),
        models: {}
      }

      # Include all model files
      Dir.glob(File.join(model_dir, '*.json')).each do |model_file|
        model_type = File.basename(model_file, '.json')
        next if model_type == 'metadata'

        export_data[:models][model_type] = JSON.parse(File.read(model_file))
      end

      File.write(export_path, JSON.pretty_generate(export_data))
      @logger&.info("Model '#{model_id}' exported to '#{export_path}'")

      export_data
    end

    # Import a previously exported model
    def import_model(import_path)
      raise ValidationError, "Import file not found: #{import_path}" unless File.exist?(import_path)

      import_data = JSON.parse(File.read(import_path))
      model_id = import_data['model_id']

      # Create model directory
      model_dir = File.join(@models_dir, model_id)
      FileUtils.mkdir_p(model_dir)

      # Save metadata
      save_model_metadata(model_id, import_data['metadata'])

      # Save model files
      import_data['models'].each do |model_type, model_data|
        model_file = File.join(model_dir, "#{model_type}.json")
        File.write(model_file, JSON.pretty_generate(model_data))
      end

      @logger&.info("Model '#{model_id}' imported successfully")

      {
        model_id: model_id,
        model_name: import_data['metadata']['model_name'],
        imported_at: Time.now.iso8601
      }
    end
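Again as an illustration rather than part of the diff, the management methods above round-trip a trained model through a single JSON file. A sketch, assuming at least one model has already been trained and that test examples use the same 'legal'/'plain' shape as training data:

  trainer  = LegalSummariser::ModelTrainer.new
  model_id = trainer.list_models.first[:model_id]

  trainer.export_model(model_id, '/tmp/model_export.json')  # hypothetical path
  trainer.delete_model(model_id)
  trainer.import_model('/tmp/model_export.json')

  report = trainer.evaluate_model(model_id, [
    { 'legal' => 'The aforementioned party shall indemnify the lessor.',
      'plain' => "You must cover the landlord's losses." }
  ])
  puts report[:accuracy_scores]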

    private

    def setup_directories
      [@training_data_dir, @models_dir].each do |dir|
        FileUtils.mkdir_p(dir) unless Dir.exist?(dir)
      end
    end

    def validate_training_data(training_data)
      raise ValidationError, "Training data must be an array" unless training_data.is_a?(Array)
      raise ValidationError, "Training data cannot be empty" if training_data.empty?

      training_data.each_with_index do |example, index|
        unless example.is_a?(Hash) && example['legal'] && example['plain']
          raise ValidationError, "Invalid training example at index #{index}: must have 'legal' and 'plain' keys"
        end
      end
    end

    def generate_model_id(model_name)
      timestamp = Time.now.strftime('%Y%m%d_%H%M%S')
      hash = Digest::MD5.hexdigest("#{model_name}_#{timestamp}")[0..7]
      "#{model_name.downcase.gsub(/[^a-z0-9]/, '_')}_#{timestamp}_#{hash}"
    end

    def prepare_training_data(training_data, options = {})
      prepared = training_data.map do |example|
        {
          legal: example['legal'].strip,
          plain: example['plain'].strip,
          legal_tokens: tokenize_text(example['legal']),
          plain_tokens: tokenize_text(example['plain']),
          legal_length: example['legal'].split.length,
          plain_length: example['plain'].split.length,
          complexity_score: calculate_complexity_score(example['legal'])
        }
      end

      # Add data augmentation if requested
      if options[:augment_data]
        prepared += generate_augmented_examples(prepared, options[:augmentation_factor] || 0.2)
      end

      prepared
    end

    def train_pattern_model(training_data, model_id)
      @logger&.info("Training pattern-based model")

      patterns = {
        word_mappings: {},
        phrase_mappings: {},
        sentence_patterns: [],
        complexity_rules: []
      }

      training_data.each do |example|
        # Extract word-level mappings
        legal_words = example[:legal_tokens]
        plain_words = example[:plain_tokens]

        # Simple alignment heuristic
        word_mappings = align_words(legal_words, plain_words)
        word_mappings.each do |legal_word, plain_word|
          patterns[:word_mappings][legal_word] ||= Hash.new(0)
          patterns[:word_mappings][legal_word][plain_word] += 1
        end

        # Extract phrase patterns
        phrase_mappings = extract_phrase_patterns(example[:legal], example[:plain])
        phrase_mappings.each do |legal_phrase, plain_phrase|
          patterns[:phrase_mappings][legal_phrase] ||= Hash.new(0)
          patterns[:phrase_mappings][legal_phrase][plain_phrase] += 1
        end
      end

      # Convert counts to probabilities
      patterns[:word_mappings].each do |legal_word, plain_words|
        total = plain_words.values.sum
        plain_words.each { |word, count| plain_words[word] = count.to_f / total }
      end

      patterns[:phrase_mappings].each do |legal_phrase, plain_phrases|
        total = plain_phrases.values.sum
        plain_phrases.each { |phrase, count| plain_phrases[phrase] = count.to_f / total }
      end

      # Save pattern model
      model_file = File.join(@models_dir, model_id, 'pattern_model.json')
      FileUtils.mkdir_p(File.dirname(model_file))
      File.write(model_file, JSON.pretty_generate(patterns))

      patterns
    end

    def train_statistical_model(training_data, model_id)
      @logger&.info("Training statistical model")

      # Build n-gram models for both legal and plain text
      legal_ngrams = build_ngram_model(training_data.map { |ex| ex[:legal_tokens] })
      plain_ngrams = build_ngram_model(training_data.map { |ex| ex[:plain_tokens] })

      # Build translation probabilities
      translation_probs = calculate_translation_probabilities(training_data)

      statistical_model = {
        legal_ngrams: legal_ngrams,
        plain_ngrams: plain_ngrams,
        translation_probabilities: translation_probs,
        vocabulary: {
          legal: extract_vocabulary(training_data.map { |ex| ex[:legal_tokens] }),
          plain: extract_vocabulary(training_data.map { |ex| ex[:plain_tokens] })
        }
      }

      # Save statistical model
      model_file = File.join(@models_dir, model_id, 'statistical_model.json')
      FileUtils.mkdir_p(File.dirname(model_file))
      File.write(model_file, JSON.pretty_generate(statistical_model))

      statistical_model
    end

    def train_neural_model(training_data, model_id, neural_config)
      @logger&.info("Training neural model (placeholder implementation)")

      # This is a placeholder for neural model training
      # In a real implementation, you would use frameworks like TensorFlow or PyTorch
      neural_model = {
        model_type: 'transformer',
        architecture: neural_config[:architecture] || 'encoder_decoder',
        vocab_size: neural_config[:vocab_size] || 10000,
        embedding_dim: neural_config[:embedding_dim] || 256,
        hidden_dim: neural_config[:hidden_dim] || 512,
        num_layers: neural_config[:num_layers] || 6,
        training_epochs: neural_config[:epochs] || 10,
        learning_rate: neural_config[:learning_rate] || 0.001,
        trained_on: training_data.length,
        placeholder: true # Indicates this is a placeholder implementation
      }

      # Save neural model placeholder
      model_file = File.join(@models_dir, model_id, 'neural_model.json')
      FileUtils.mkdir_p(File.dirname(model_file))
      File.write(model_file, JSON.pretty_generate(neural_model))

      neural_model
    end

    def tokenize_text(text)
      # Simple tokenization - in practice, use more sophisticated tokenizers
      text.downcase.gsub(/[^\w\s]/, ' ').split
    end

    def calculate_complexity_score(text)
      words = text.split
      avg_word_length = words.map(&:length).sum.to_f / words.length
      sentence_count = text.split(/[.!?]+/).length
      avg_sentence_length = words.length.to_f / sentence_count

      (avg_word_length * 2) + (avg_sentence_length * 0.5)
    end
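A worked example of the readability heuristic above (hypothetical input, not from the package): the score is twice the average word length plus half the average sentence length, so a short four-word, one-sentence string scores 10.5.

  text  = 'The party shall pay.'                         # hypothetical input
  words = text.split                                     # ["The", "party", "shall", "pay."]
  avg_word_length     = 17.0 / 4                         # => 4.25 characters per word
  avg_sentence_length = 4.0 / 1                          # => 4.0 words per sentence
  (avg_word_length * 2) + (avg_sentence_length * 0.5)    # => 10.5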

    def align_words(legal_words, plain_words)
      # Simple word alignment using edit distance
      alignments = {}

      legal_words.each do |legal_word|
        best_match = nil
        best_score = Float::INFINITY

        plain_words.each do |plain_word|
          score = levenshtein_distance(legal_word, plain_word)
          if score < best_score && score < [legal_word.length, plain_word.length].max * 0.6
            best_score = score
            best_match = plain_word
          end
        end

        alignments[legal_word] = best_match if best_match
      end

      alignments
    end

    def levenshtein_distance(str1, str2)
      matrix = Array.new(str1.length + 1) { Array.new(str2.length + 1) }

      (0..str1.length).each { |i| matrix[i][0] = i }
      (0..str2.length).each { |j| matrix[0][j] = j }

      (1..str1.length).each do |i|
        (1..str2.length).each do |j|
          cost = str1[i-1] == str2[j-1] ? 0 : 1
          matrix[i][j] = [
            matrix[i-1][j] + 1,     # deletion
            matrix[i][j-1] + 1,     # insertion
            matrix[i-1][j-1] + cost # substitution
          ].min
        end
      end

      matrix[str1.length][str2.length]
    end
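To make the threshold above concrete (hypothetical word pairs, not from the diff): a candidate pair is kept only when its edit distance is below 60% of the longer word's length, so this pass links orthographic variants rather than unrelated synonyms; the phrase patterns and co-occurrence statistics cover the latter.

  levenshtein_distance('payment', 'pay')     # => 4; 4 < [7, 3].max * 0.6 (4.2) -> aligned
  levenshtein_distance('commence', 'start')  # => 8; 8 >= [8, 5].max * 0.6 (4.8) -> skipped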

    def extract_phrase_patterns(legal_text, plain_text)
      # Extract common phrase patterns
      patterns = {}

      # Simple phrase extraction using sliding windows
      [2, 3, 4].each do |window_size|
        legal_phrases = extract_phrases(legal_text, window_size)
        plain_phrases = extract_phrases(plain_text, window_size)

        # Find potential mappings
        legal_phrases.each do |legal_phrase|
          plain_phrases.each do |plain_phrase|
            if phrases_similar?(legal_phrase, plain_phrase)
              patterns[legal_phrase] = plain_phrase
            end
          end
        end
      end

      patterns
    end

    def extract_phrases(text, window_size)
      words = text.downcase.split
      phrases = []

      (0..words.length - window_size).each do |i|
        phrase = words[i, window_size].join(' ')
        phrases << phrase
      end

      phrases
    end

    def phrases_similar?(phrase1, phrase2)
      # Simple similarity check
      words1 = phrase1.split
      words2 = phrase2.split

      return false if (words1.length - words2.length).abs > 1

      common_words = words1 & words2
      common_words.length.to_f / [words1.length, words2.length].max > 0.3
    end

    def build_ngram_model(token_sequences, n = 3)
      ngrams = Hash.new(0)

      token_sequences.each do |tokens|
        (0..tokens.length - n).each do |i|
          ngram = tokens[i, n].join(' ')
          ngrams[ngram] += 1
        end
      end

      # Convert to probabilities
      total = ngrams.values.sum
      ngrams.each { |ngram, count| ngrams[ngram] = count.to_f / total }

      ngrams
    end
| 
      
 487 
     | 
    
         
            +
             
     | 
| 
      
 488 
     | 
    
         
            +
                def calculate_translation_probabilities(training_data)
         
     | 
| 
      
 489 
     | 
    
         
            +
                  word_pairs = Hash.new(0)
         
     | 
| 
      
 490 
     | 
    
         
            +
                  
         
     | 
| 
      
 491 
     | 
    
         
            +
                  training_data.each do |example|
         
     | 
| 
      
 492 
     | 
    
         
            +
                    legal_words = example[:legal_tokens]
         
     | 
| 
      
 493 
     | 
    
         
            +
                    plain_words = example[:plain_tokens]
         
     | 
| 
      
 494 
     | 
    
         
            +
                    
         
     | 
| 
      
 495 
     | 
    
         
            +
                    # Simple co-occurrence counting
         
     | 
| 
      
 496 
     | 
    
         
            +
                    legal_words.each do |legal_word|
         
     | 
| 
      
 497 
     | 
    
         
            +
                      plain_words.each do |plain_word|
         
     | 
| 
      
 498 
     | 
    
         
            +
                        word_pairs["#{legal_word}|#{plain_word}"] += 1
         
     | 
| 
      
 499 
     | 
    
         
            +
                      end
         
     | 
| 
      
 500 
     | 
    
         
            +
                    end
         
     | 
| 
      
 501 
     | 
    
         
            +
                  end
         
     | 
| 
      
 502 
     | 
    
         
            +
                  
         
     | 
| 
      
 503 
     | 
    
         
            +
                  # Normalize to probabilities
         
     | 
| 
      
 504 
     | 
    
         
            +
                  legal_word_counts = Hash.new(0)
         
     | 
| 
      
 505 
     | 
    
         
            +
                  word_pairs.each do |pair, count|
         
     | 
| 
      
 506 
     | 
    
         
            +
                    legal_word = pair.split('|').first
         
     | 
| 
      
 507 
     | 
    
         
            +
                    legal_word_counts[legal_word] += count
         
     | 
| 
      
 508 
     | 
    
         
            +
                  end
         
     | 
| 
      
 509 
     | 
    
         
            +
                  
         
     | 
| 
      
 510 
     | 
    
         
            +
                  translation_probs = {}
         
     | 
| 
      
 511 
     | 
    
         
            +
                  word_pairs.each do |pair, count|
         
     | 
| 
      
 512 
     | 
    
         
            +
                    legal_word, plain_word = pair.split('|')
         
     | 
| 
      
 513 
     | 
    
         
            +
                    translation_probs[pair] = count.to_f / legal_word_counts[legal_word]
         
     | 
| 
      
 514 
     | 
    
         
            +
                  end
         
     | 
| 
      
 515 
     | 
    
         
            +
                  
         
     | 
| 
      
 516 
     | 
    
         
            +
                  translation_probs
         
     | 
| 
      
 517 
     | 
    
         
            +
                end
         
     | 
| 
      
 518 
     | 
    
         
            +
             
     | 
| 
      
 519 
     | 
    
         
            +
                def extract_vocabulary(token_sequences)
         
     | 
| 
      
 520 
     | 
    
         
            +
                  vocab = Hash.new(0)
         
     | 
| 
      
 521 
     | 
    
         
            +
                  
         
     | 
| 
      
 522 
     | 
    
         
            +
                  token_sequences.each do |tokens|
         
     | 
| 
      
 523 
     | 
    
         
            +
                    tokens.each { |token| vocab[token] += 1 }
         
     | 
| 
      
 524 
     | 
    
         
            +
                  end
         
     | 
| 
      
 525 
     | 
    
         
            +
                  
         
     | 
| 
      
 526 
     | 
    
         
            +
                  vocab.sort_by { |_, count| -count }.to_h
         
     | 
| 
      
 527 
     | 
    
         
            +
                end
         
     | 
| 
      
 528 
     | 
    
         
            +
             
     | 
| 
      
 529 
     | 
    
         
            +
                def calculate_training_metrics(training_data, model_results)
         
     | 
| 
      
 530 
     | 
    
         
            +
                  {
         
     | 
| 
      
 531 
     | 
    
         
            +
                    total_examples: training_data.length,
         
     | 
| 
      
 532 
     | 
    
         
            +
                    avg_legal_length: training_data.map { |ex| ex[:legal_length] }.sum.to_f / training_data.length,
         
     | 
| 
      
 533 
     | 
    
         
            +
                    avg_plain_length: training_data.map { |ex| ex[:plain_length] }.sum.to_f / training_data.length,
         
     | 
| 
      
 534 
     | 
    
         
            +
                    avg_complexity_score: training_data.map { |ex| ex[:complexity_score] }.sum.to_f / training_data.length,
         
     | 
| 
      
 535 
     | 
    
         
            +
                    model_types_trained: model_results.keys.length,
         
     | 
| 
      
 536 
     | 
    
         
            +
                    training_completed_at: Time.now.iso8601
         
     | 
| 
      
 537 
     | 
    
         
            +
                  }
         
     | 
| 
      
 538 
     | 
    
         
            +
                end
         
     | 
| 
      
 539 
     | 
    
         
            +
             
     | 
| 
      
 540 
     | 
    
         
            +
                def save_model_metadata(model_id, metadata)
         
     | 
| 
      
 541 
     | 
    
         
            +
                  model_dir = File.join(@models_dir, model_id)
         
     | 
| 
      
 542 
     | 
    
         
            +
                  FileUtils.mkdir_p(model_dir)
         
     | 
| 
      
 543 
     | 
    
         
            +
                  
         
     | 
| 
      
 544 
     | 
    
         
            +
                  metadata_file = File.join(model_dir, 'metadata.json')
         
     | 
| 
      
 545 
     | 
    
         
            +
                  File.write(metadata_file, JSON.pretty_generate(metadata))
         
     | 
| 
      
 546 
     | 
    
         
            +
                end
         
     | 
| 
      
 547 
     | 
    
         
            +
             
     | 
| 
      
 548 
     | 
    
         
            +
                def load_model_metadata(model_id)
         
     | 
| 
      
 549 
     | 
    
         
            +
                  metadata_file = File.join(@models_dir, model_id, 'metadata.json')
         
     | 
| 
      
 550 
     | 
    
         
            +
                  return nil unless File.exist?(metadata_file)
         
     | 
| 
      
 551 
     | 
    
         
            +
                  
         
     | 
| 
      
 552 
     | 
    
         
            +
                  JSON.parse(File.read(metadata_file))
         
     | 
| 
      
 553 
     | 
    
         
            +
                end
         
     | 
| 
      
 554 
     | 
    
         
            +
             
     | 
| 
      
 555 
     | 
    
         
            +
                def load_training_data(model_id)
         
     | 
| 
      
 556 
     | 
    
         
            +
                  training_file = File.join(@training_data_dir, "#{model_id}.json")
         
     | 
| 
      
 557 
     | 
    
         
            +
                  return [] unless File.exist?(training_file)
         
     | 
| 
      
 558 
     | 
    
         
            +
                  
         
     | 
| 
      
 559 
     | 
    
         
            +
                  JSON.parse(File.read(training_file))
         
     | 
| 
      
 560 
     | 
    
         
            +
                end
         
     | 
| 
      
 561 
     | 
    
         
            +
             
     | 
| 
      
 562 
     | 
    
         
            +
                def evaluate_model_type(model_data, test_data, model_type)
         
     | 
| 
      
 563 
     | 
    
         
            +
                  correct_predictions = 0
         
     | 
| 
      
 564 
     | 
    
         
            +
                  
         
     | 
| 
      
 565 
     | 
    
         
            +
                  test_data.each do |example|
         
     | 
| 
      
 566 
     | 
    
         
            +
                    predicted = predict_with_model(model_data, example['legal'], model_type)
         
     | 
| 
      
 567 
     | 
    
         
            +
                    if similarity_score(predicted, example['plain']) > 0.7
         
     | 
| 
      
 568 
     | 
    
         
            +
                      correct_predictions += 1
         
     | 
| 
      
 569 
     | 
    
         
            +
                    end
         
     | 
| 
      
 570 
     | 
    
         
            +
                  end
         
     | 
| 
      
 571 
     | 
    
         
            +
                  
         
     | 
| 
      
 572 
     | 
    
         
            +
                  (correct_predictions.to_f / test_data.length * 100).round(2)
         
     | 
| 
      
 573 
     | 
    
         
            +
                end
         
     | 
| 
      
 574 
     | 
    
         
            +
             
     | 
| 
      
 575 
     | 
    
         
            +
                def predict_with_model(model_data, legal_text, model_type)
         
     | 
| 
      
 576 
     | 
    
         
            +
                  case model_type
         
     | 
| 
      
 577 
     | 
    
         
            +
                  when 'pattern_model'
         
     | 
| 
      
 578 
     | 
    
         
            +
                    predict_with_pattern_model(model_data, legal_text)
         
     | 
| 
      
 579 
     | 
    
         
            +
                  when 'statistical_model'
         
     | 
| 
      
 580 
     | 
    
         
            +
                    predict_with_statistical_model(model_data, legal_text)
         
     | 
| 
      
 581 
     | 
    
         
            +
                  when 'neural_model'
         
     | 
| 
      
 582 
     | 
    
         
            +
                    predict_with_neural_model(model_data, legal_text)
         
     | 
| 
      
 583 
     | 
    
         
            +
                  else
         
     | 
| 
      
 584 
     | 
    
         
            +
                    legal_text # Fallback
         
     | 
| 
      
 585 
     | 
    
         
            +
                  end
         
     | 
| 
      
 586 
     | 
    
         
            +
                end
         
     | 
| 
      
 587 
     | 
    
         
            +
             
     | 
| 
      
 588 
     | 
    
         
            +
                def predict_with_pattern_model(model_data, legal_text)
         
     | 
| 
      
 589 
     | 
    
         
            +
                  result = legal_text.dup
         
     | 
| 
      
 590 
     | 
    
         
            +
                  
         
     | 
| 
      
 591 
     | 
    
         
            +
                  # Apply word mappings
         
     | 
| 
      
 592 
     | 
    
         
            +
                  model_data['word_mappings'].each do |legal_word, plain_words|
         
     | 
| 
      
 593 
     | 
    
         
            +
                    best_plain_word = plain_words.max_by { |_, prob| prob }.first
         
     | 
| 
      
 594 
     | 
    
         
            +
                    result.gsub!(/\b#{Regexp.escape(legal_word)}\b/i, best_plain_word)
         
     | 
| 
      
 595 
     | 
    
         
            +
                  end
         
     | 
| 
      
 596 
     | 
    
         
            +
                  
         
     | 
| 
      
 597 
     | 
    
         
            +
                  # Apply phrase mappings
         
     | 
| 
      
 598 
     | 
    
         
            +
                  model_data['phrase_mappings'].each do |legal_phrase, plain_phrases|
         
     | 
| 
      
 599 
     | 
    
         
            +
                    best_plain_phrase = plain_phrases.max_by { |_, prob| prob }.first
         
     | 
| 
      
 600 
     | 
    
         
            +
                    result.gsub!(legal_phrase, best_plain_phrase)
         
     | 
| 
      
 601 
     | 
    
         
            +
                  end
         
     | 
| 
      
 602 
     | 
    
         
            +
                  
         
     | 
| 
      
 603 
     | 
    
         
            +
                  result
         
     | 
| 
      
 604 
     | 
    
         
            +
                end
         
     | 
| 
      
 605 
     | 
    
         
            +
             
     | 
| 
      
 606 
     | 
    
         
            +
                def predict_with_statistical_model(model_data, legal_text)
         
     | 
| 
      
 607 
     | 
    
         
            +
                  # Simplified statistical prediction
         
     | 
| 
      
 608 
     | 
    
         
            +
                  tokens = tokenize_text(legal_text)
         
     | 
| 
      
 609 
     | 
    
         
            +
                  
         
     | 
| 
      
 610 
     | 
    
         
            +
                  predicted_tokens = tokens.map do |token|
         
     | 
| 
      
 611 
     | 
    
         
            +
                    # Find best translation based on translation probabilities
         
     | 
| 
      
 612 
     | 
    
         
            +
                    best_translation = token
         
     | 
| 
      
 613 
     | 
    
         
            +
                    best_prob = 0
         
     | 
| 
      
 614 
     | 
    
         
            +
                    
         
     | 
| 
      
 615 
     | 
    
         
            +
                    model_data['translation_probabilities'].each do |pair, prob|
         
     | 
| 
      
 616 
     | 
    
         
            +
                      legal_word, plain_word = pair.split('|')
         
     | 
| 
      
 617 
     | 
    
         
            +
                      if legal_word == token && prob > best_prob
         
     | 
| 
      
 618 
     | 
    
         
            +
                        best_prob = prob
         
     | 
| 
      
 619 
     | 
    
         
            +
                        best_translation = plain_word
         
     | 
| 
      
 620 
     | 
    
         
            +
                      end
         
     | 
| 
      
 621 
     | 
    
         
            +
                    end
         
     | 
| 
      
 622 
     | 
    
         
            +
                    
         
     | 
| 
      
 623 
     | 
    
         
            +
                    best_translation
         
     | 
| 
      
 624 
     | 
    
         
            +
                  end
         
     | 
| 
      
 625 
     | 
    
         
            +
                  
         
     | 
| 
      
 626 
     | 
    
         
            +
                  predicted_tokens.join(' ')
         
     | 
| 
      
 627 
     | 
    
         
            +
                end
         
     | 
| 
      
 628 
     | 
    
         
            +
             
     | 
| 
      
 629 
     | 
    
         
            +
                def predict_with_neural_model(model_data, legal_text)
         
     | 
| 
      
 630 
     | 
    
         
            +
                  # Placeholder for neural model prediction
         
     | 
| 
      
 631 
     | 
    
         
            +
                  # In practice, this would use the trained neural network
         
     | 
| 
      
 632 
     | 
    
         
            +
                  legal_text # Return original text as placeholder
         
     | 
| 
      
 633 
     | 
    
         
            +
                end
         
     | 
| 
      
 634 
     | 
    
         
            +
             
     | 
| 
      
 635 
     | 
    
         
            +
                def similarity_score(text1, text2)
         
     | 
| 
      
 636 
     | 
    
         
            +
                  words1 = text1.downcase.split
         
     | 
| 
      
 637 
     | 
    
         
            +
                  words2 = text2.downcase.split
         
     | 
| 
      
 638 
     | 
    
         
            +
                  
         
     | 
| 
      
 639 
     | 
    
         
            +
                  return 0 if words1.empty? && words2.empty?
         
     | 
| 
      
 640 
     | 
    
         
            +
                  return 0 if words1.empty? || words2.empty?
         
     | 
| 
      
 641 
     | 
    
         
            +
                  
         
     | 
| 
      
 642 
     | 
    
         
            +
                  common_words = words1 & words2
         
     | 
| 
      
 643 
     | 
    
         
            +
                  common_words.length.to_f / [words1.length, words2.length].max
         
     | 
| 
      
 644 
     | 
    
         
            +
                end
         
     | 
| 
      
 645 
     | 
    
         
            +
             
     | 
| 
      
 646 
     | 
    
         
            +
                def calculate_detailed_metrics(model_data, test_data, model_type)
         
     | 
| 
      
 647 
     | 
    
         
            +
                  predictions = test_data.map do |example|
         
     | 
| 
      
 648 
     | 
    
         
            +
                    predicted = predict_with_model(model_data, example['legal'], model_type)
         
     | 
| 
      
 649 
     | 
    
         
            +
                    {
         
     | 
| 
      
 650 
     | 
    
         
            +
                      legal: example['legal'],
         
     | 
| 
      
 651 
     | 
    
         
            +
                      expected: example['plain'],
         
     | 
| 
      
 652 
     | 
    
         
            +
                      predicted: predicted,
         
     | 
| 
      
 653 
     | 
    
         
            +
                      similarity: similarity_score(predicted, example['plain'])
         
     | 
| 
      
 654 
     | 
    
         
            +
                    }
         
     | 
| 
      
 655 
     | 
    
         
            +
                  end
         
     | 
| 
      
 656 
     | 
    
         
            +
                  
         
     | 
| 
      
 657 
     | 
    
         
            +
                  similarities = predictions.map { |p| p[:similarity] }
         
     | 
| 
      
 658 
     | 
    
         
            +
                  
         
     | 
| 
      
 659 
     | 
    
         
            +
                  {
         
     | 
| 
      
 660 
     | 
    
         
            +
                    accuracy: (similarities.count { |s| s > 0.7 }.to_f / similarities.length * 100).round(2),
         
     | 
| 
      
 661 
     | 
    
         
            +
                    avg_similarity: (similarities.sum / similarities.length).round(3),
         
     | 
| 
      
 662 
     | 
    
         
            +
                    min_similarity: similarities.min.round(3),
         
     | 
| 
      
 663 
     | 
    
         
            +
                    max_similarity: similarities.max.round(3),
         
     | 
| 
      
 664 
     | 
    
         
            +
                    predictions_count: predictions.length
         
     | 
| 
      
 665 
     | 
    
         
            +
                  }
         
     | 
| 
      
 666 
     | 
    
         
            +
                end
         
     | 
| 
      
 667 
     | 
    
         
            +
             
     | 
| 
      
 668 
     | 
    
         
            +
                def generate_augmented_examples(training_data, factor)
         
     | 
| 
      
 669 
     | 
    
         
            +
                  augmented = []
         
     | 
| 
      
 670 
     | 
    
         
            +
                  num_to_generate = (training_data.length * factor).to_i
         
     | 
| 
      
 671 
     | 
    
         
            +
                  
         
     | 
| 
      
 672 
     | 
    
         
            +
                  num_to_generate.times do
         
     | 
| 
      
 673 
     | 
    
         
            +
                    original = training_data.sample
         
     | 
| 
      
 674 
     | 
    
         
            +
                    
         
     | 
| 
      
 675 
     | 
    
         
            +
                    # Simple augmentation: synonym replacement, word order changes
         
     | 
| 
      
 676 
     | 
    
         
            +
                    augmented_legal = augment_text(original[:legal])
         
     | 
| 
      
 677 
     | 
    
         
            +
                    augmented_plain = augment_text(original[:plain])
         
     | 
| 
      
 678 
     | 
    
         
            +
                    
         
     | 
| 
      
 679 
     | 
    
         
            +
                    augmented << {
         
     | 
| 
      
 680 
     | 
    
         
            +
                      legal: augmented_legal,
         
     | 
| 
      
 681 
     | 
    
         
            +
                      plain: augmented_plain,
         
     | 
| 
      
 682 
     | 
    
         
            +
                      legal_tokens: tokenize_text(augmented_legal),
         
     | 
| 
      
 683 
     | 
    
         
            +
                      plain_tokens: tokenize_text(augmented_plain),
         
     | 
| 
      
 684 
     | 
    
         
            +
                      legal_length: augmented_legal.split.length,
         
     | 
| 
      
 685 
     | 
    
         
            +
                      plain_length: augmented_plain.split.length,
         
     | 
| 
      
 686 
     | 
    
         
            +
                      complexity_score: calculate_complexity_score(augmented_legal),
         
     | 
| 
      
 687 
     | 
    
         
            +
                      augmented: true
         
     | 
| 
      
 688 
     | 
    
         
            +
                    }
         
     | 
| 
      
 689 
     | 
    
         
            +
                  end
         
     | 
| 
      
 690 
     | 
    
         
            +
                  
         
     | 
| 
      
 691 
     | 
    
         
            +
                  augmented
         
     | 
| 
      
 692 
     | 
    
         
            +
                end
         
     | 
| 
      
 693 
     | 
    
         
            +
             
     | 
| 
      
 694 
     | 
    
         
            +
                def augment_text(text)
         
     | 
| 
      
 695 
     | 
    
         
            +
                  # Simple text augmentation
         
     | 
| 
      
 696 
     | 
    
         
            +
                  words = text.split
         
     | 
| 
      
 697 
     | 
    
         
            +
                  
         
     | 
| 
      
 698 
     | 
    
         
            +
                  # Randomly shuffle some adjacent words (simple augmentation)
         
     | 
| 
      
 699 
     | 
    
         
            +
                  if words.length > 3 && rand < 0.3
         
     | 
| 
      
 700 
     | 
    
         
            +
                    i = rand(words.length - 1)
         
     | 
| 
      
 701 
     | 
    
         
            +
                    words[i], words[i + 1] = words[i + 1], words[i]
         
     | 
| 
      
 702 
     | 
    
         
            +
                  end
         
     | 
| 
      
 703 
     | 
    
         
            +
                  
         
     | 
| 
      
 704 
     | 
    
         
            +
                  words.join(' ')
         
     | 
| 
      
 705 
     | 
    
         
            +
                end
         
     | 
| 
      
 706 
     | 
    
         
            +
              end
         
     | 
| 
      
 707 
     | 
    
         
            +
            end
         
     |
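
For readers skimming the diff, the pattern path (lines 588-604 above) amounts to picking the highest-probability replacement from `word_mappings` / `phrase_mappings` and applying it with `gsub!`. The sketch below re-creates that behaviour outside the gem so the expected data shape is easy to see; it is illustrative only and is not code shipped in legal_summariser — `pattern_rewrite`, `model_data`, and the probability values are made up for the example.

```ruby
# Minimal sketch (assumed data, not the gem's trained model): mappings whose
# values are { replacement => probability } hashes, as predict_with_pattern_model expects.
model_data = {
  'word_mappings' => {
    'indemnify' => { 'compensate' => 0.9, 'pay back' => 0.1 },
    'lessee'    => { 'tenant' => 0.95, 'renter' => 0.05 }
  },
  'phrase_mappings' => {
    'in the event that' => { 'if' => 1.0 }
  }
}

def pattern_rewrite(model_data, legal_text)
  result = legal_text.dup

  # Pick the highest-probability replacement for each mapped word (case-insensitive,
  # whole-word match, mirroring the gsub! call in the diff).
  model_data['word_mappings'].each do |legal_word, plain_words|
    best = plain_words.max_by { |_, prob| prob }.first
    result.gsub!(/\b#{Regexp.escape(legal_word)}\b/i, best)
  end

  # Then replace whole phrases verbatim (case-sensitive, as in the diff).
  model_data['phrase_mappings'].each do |legal_phrase, plain_phrases|
    best = plain_phrases.max_by { |_, prob| prob }.first
    result.gsub!(legal_phrase, best)
  end

  result
end

puts pattern_rewrite(model_data, 'In the event that the lessee fails to indemnify the owner, this clause applies.')
# => "In the event that the tenant fails to compensate the owner, this clause applies."
```

Note that the capitalised "In the event that" survives: phrase substitution in the diff is a plain, case-sensitive `gsub!`, unlike the case-insensitive word regex, and the sketch keeps that behaviour rather than papering over it.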