legal_summariser 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,707 @@
+ require 'json'
+ require 'fileutils'
+ require 'digest'
+ require 'time' # Time#iso8601 is provided by the stdlib 'time' extension
+
+ module LegalSummariser
+   # Advanced model training and fine-tuning capabilities for legal text processing
+   class ModelTrainer
+     class TrainingError < StandardError; end
+     class ValidationError < StandardError; end
+     class ModelNotFoundError < StandardError; end
+
+     attr_reader :config, :logger, :training_data_dir, :models_dir
+
+     def initialize(config = nil)
+       @config = config || LegalSummariser.configuration
+       @logger = @config.logger
+       @training_data_dir = File.join(@config.cache_dir, 'training_data')
+       @models_dir = File.join(@config.cache_dir, 'models')
+
+       setup_directories
+     end
+
+     # Train a new model with provided training data
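+     #
+     # Each example is a Hash with the string keys 'legal' and 'plain'.
+     # Illustrative call (the data here is hypothetical):
+     #
+     #   trainer = LegalSummariser::ModelTrainer.new
+     #   trainer.train_model(
+     #     [{ 'legal' => 'The lessee shall remit rent monthly.',
+     #        'plain' => 'The renter must pay rent every month.' }],
+     #     'lease_model'
+     #   )
+     #
+     # Returns a Hash with :model_id, :model_name, :training_time, :models
+     # and :metadata.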
+     def train_model(training_data, model_name, options = {})
+       validate_training_data(training_data)
+
+       @logger&.info("Starting model training for '#{model_name}' with #{training_data.length} examples")
+
+       start_time = Time.now
+       # Reuse an existing model id when fine-tuning; otherwise generate a fresh one
+       model_id = options[:model_id] || generate_model_id(model_name)
+
+       begin
+         # Prepare training data
+         prepared_data = prepare_training_data(training_data, options)
+
+         # Persist the raw examples so fine-tuning can reload them later
+         save_training_data(model_id, training_data)
+
+         # Train different model types
+         model_results = {}
+
+         if options[:train_pattern_model] != false
+           model_results[:pattern_model] = train_pattern_model(prepared_data, model_id)
+         end
+
+         if options[:train_statistical_model] != false
+           model_results[:statistical_model] = train_statistical_model(prepared_data, model_id)
+         end
+
+         if options[:train_neural_model] && options[:neural_config]
+           model_results[:neural_model] = train_neural_model(prepared_data, model_id, options[:neural_config])
+         end
+
+         # Save model metadata
+         model_metadata = {
+           model_id: model_id,
+           model_name: model_name,
+           created_at: Time.now.iso8601,
+           training_examples: training_data.length,
+           model_types: model_results.keys,
+           performance_metrics: calculate_training_metrics(prepared_data, model_results),
+           options: options,
+           version: '0.3.0'
+         }
+
+         save_model_metadata(model_id, model_metadata)
+
+         duration = Time.now - start_time
+         @logger&.info("Model training completed in #{duration.round(2)}s")
+
+         {
+           model_id: model_id,
+           model_name: model_name,
+           training_time: duration,
+           models: model_results,
+           metadata: model_metadata
+         }
+
+       rescue => e
+         @logger&.error("Model training failed: #{e.message}")
+         raise TrainingError, "Failed to train model '#{model_name}': #{e.message}"
+       end
+     end
+
+     # Fine-tune an existing model with additional data
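+     # Reloads the previously saved examples, appends the new ones, and
+     # retrains under the same model id. Illustrative call (hypothetical data):
+     #
+     #   trainer.fine_tune_model(result[:model_id], more_examples)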
+     def fine_tune_model(model_id, additional_data, options = {})
+       model_metadata = load_model_metadata(model_id)
+       raise ModelNotFoundError, "Model '#{model_id}' not found" unless model_metadata
+
+       @logger&.info("Fine-tuning model '#{model_id}' with #{additional_data.length} additional examples")
+
+       # Load existing training data
+       existing_data = load_training_data(model_id)
+       combined_data = existing_data + additional_data
+
+       # Retrain with combined data
+       train_model(combined_data, model_metadata['model_name'], options.merge(model_id: model_id))
+     end
+
+     # Evaluate model performance
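+     # Test examples use the same 'legal'/'plain' Hash shape as training data.
+     # For each trained model type, accuracy is the percentage of predictions
+     # whose word overlap with the expected plain text exceeds 0.7.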
+     def evaluate_model(model_id, test_data)
+       model_metadata = load_model_metadata(model_id)
+       raise ModelNotFoundError, "Model '#{model_id}' not found" unless model_metadata
+       raise ValidationError, "Test data cannot be empty" if test_data.empty?
+
+       @logger&.info("Evaluating model '#{model_id}' with #{test_data.length} test examples")
+
+       results = {
+         model_id: model_id,
+         test_examples: test_data.length,
+         accuracy_scores: {},
+         performance_metrics: {}
+       }
+
+       # Evaluate each model type
+       model_metadata['model_types'].each do |model_type|
+         model_path = File.join(@models_dir, model_id, "#{model_type}.json")
+         next unless File.exist?(model_path)
+
+         model_data = JSON.parse(File.read(model_path))
+         accuracy = evaluate_model_type(model_data, test_data, model_type)
+
+         results[:accuracy_scores][model_type] = accuracy
+         results[:performance_metrics][model_type] = calculate_detailed_metrics(model_data, test_data, model_type)
+       end
+
+       results
+     end
+
+     # List all trained models
+     def list_models
+       return [] unless Dir.exist?(@models_dir)
+
+       models = []
+       Dir.glob(File.join(@models_dir, '*')).each do |model_dir|
+         next unless File.directory?(model_dir)
+
+         model_id = File.basename(model_dir)
+         metadata_file = File.join(model_dir, 'metadata.json')
+
+         if File.exist?(metadata_file)
+           metadata = JSON.parse(File.read(metadata_file))
+           models << {
+             model_id: model_id,
+             model_name: metadata['model_name'],
+             created_at: metadata['created_at'],
+             training_examples: metadata['training_examples'],
+             model_types: metadata['model_types']
+           }
+         end
+       end
+
+       models.sort_by { |m| m[:created_at] }.reverse
+     end
+
+     # Delete a trained model
+     def delete_model(model_id)
+       model_dir = File.join(@models_dir, model_id)
+
+       if Dir.exist?(model_dir)
+         FileUtils.rm_rf(model_dir)
+         @logger&.info("Deleted model '#{model_id}'")
+         true
+       else
+         false
+       end
+     end
+
+     # Export model for deployment
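+     # Writes a single JSON package containing the metadata and every model
+     # file. Illustrative round trip (the path is hypothetical):
+     #
+     #   trainer.export_model(result[:model_id], '/tmp/lease_model.json')
+     #   trainer.import_model('/tmp/lease_model.json')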
+     def export_model(model_id, export_path)
+       model_dir = File.join(@models_dir, model_id)
+       raise ModelNotFoundError, "Model '#{model_id}' not found" unless Dir.exist?(model_dir)
+
+       # Create export package
+       export_data = {
+         model_id: model_id,
+         exported_at: Time.now.iso8601,
+         metadata: load_model_metadata(model_id),
+         models: {}
+       }
+
+       # Include all model files
+       Dir.glob(File.join(model_dir, '*.json')).each do |model_file|
+         model_type = File.basename(model_file, '.json')
+         next if model_type == 'metadata'
+
+         export_data[:models][model_type] = JSON.parse(File.read(model_file))
+       end
+
+       File.write(export_path, JSON.pretty_generate(export_data))
+       @logger&.info("Model '#{model_id}' exported to '#{export_path}'")
+
+       export_data
+     end
+
+     # Import a previously exported model
+     def import_model(import_path)
+       raise ValidationError, "Import file not found: #{import_path}" unless File.exist?(import_path)
+
+       import_data = JSON.parse(File.read(import_path))
+       model_id = import_data['model_id']
+
+       # Create model directory
+       model_dir = File.join(@models_dir, model_id)
+       FileUtils.mkdir_p(model_dir)
+
+       # Save metadata
+       save_model_metadata(model_id, import_data['metadata'])
+
+       # Save model files
+       import_data['models'].each do |model_type, model_data|
+         model_file = File.join(model_dir, "#{model_type}.json")
+         File.write(model_file, JSON.pretty_generate(model_data))
+       end
+
+       @logger&.info("Model '#{model_id}' imported successfully")
+
+       {
+         model_id: model_id,
+         model_name: import_data['metadata']['model_name'],
+         imported_at: Time.now.iso8601
+       }
+     end
+
+     private
+
+     def setup_directories
+       [@training_data_dir, @models_dir].each do |dir|
+         FileUtils.mkdir_p(dir) # mkdir_p is a no-op for existing directories
+       end
+     end
+
+     def validate_training_data(training_data)
+       raise ValidationError, "Training data must be an array" unless training_data.is_a?(Array)
+       raise ValidationError, "Training data cannot be empty" if training_data.empty?
+
+       training_data.each_with_index do |example, index|
+         unless example.is_a?(Hash) && example['legal'] && example['plain']
+           raise ValidationError, "Invalid training example at index #{index}: must have 'legal' and 'plain' keys"
+         end
+       end
+     end
+
+     def generate_model_id(model_name)
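+       # Produces ids like "lease_model_20240101_120000_ab12cd34" (example
+       # only): sanitised name, timestamp, first 8 hex chars of an MD5 digest.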
+       timestamp = Time.now.strftime('%Y%m%d_%H%M%S')
+       hash = Digest::MD5.hexdigest("#{model_name}_#{timestamp}")[0..7]
+       "#{model_name.downcase.gsub(/[^a-z0-9]/, '_')}_#{timestamp}_#{hash}"
+     end
+
+     def prepare_training_data(training_data, options = {})
+       prepared = training_data.map do |example|
+         {
+           legal: example['legal'].strip,
+           plain: example['plain'].strip,
+           legal_tokens: tokenize_text(example['legal']),
+           plain_tokens: tokenize_text(example['plain']),
+           legal_length: example['legal'].split.length,
+           plain_length: example['plain'].split.length,
+           complexity_score: calculate_complexity_score(example['legal'])
+         }
+       end
+
+       # Add data augmentation if requested
+       if options[:augment_data]
+         prepared += generate_augmented_examples(prepared, options[:augmentation_factor] || 0.2)
+       end
+
+       prepared
+     end
+
+     def train_pattern_model(training_data, model_id)
+       @logger&.info("Training pattern-based model")
+
+       patterns = {
+         word_mappings: {},
+         phrase_mappings: {},
+         sentence_patterns: [],
+         complexity_rules: []
+       }
+
+       training_data.each do |example|
+         # Extract word-level mappings
+         legal_words = example[:legal_tokens]
+         plain_words = example[:plain_tokens]
+
+         # Simple alignment heuristic
+         word_mappings = align_words(legal_words, plain_words)
+         word_mappings.each do |legal_word, plain_word|
+           patterns[:word_mappings][legal_word] ||= Hash.new(0)
+           patterns[:word_mappings][legal_word][plain_word] += 1
+         end
+
+         # Extract phrase patterns
+         phrase_mappings = extract_phrase_patterns(example[:legal], example[:plain])
+         phrase_mappings.each do |legal_phrase, plain_phrase|
+           patterns[:phrase_mappings][legal_phrase] ||= Hash.new(0)
+           patterns[:phrase_mappings][legal_phrase][plain_phrase] += 1
+         end
+       end
+
+       # Convert counts to probabilities
+       patterns[:word_mappings].each do |legal_word, plain_words|
+         total = plain_words.values.sum
+         plain_words.each { |word, count| plain_words[word] = count.to_f / total }
+       end
+
+       patterns[:phrase_mappings].each do |legal_phrase, plain_phrases|
+         total = plain_phrases.values.sum
+         plain_phrases.each { |phrase, count| plain_phrases[phrase] = count.to_f / total }
+       end
+
+       # Save pattern model
+       model_file = File.join(@models_dir, model_id, 'pattern_model.json')
+       FileUtils.mkdir_p(File.dirname(model_file))
+       File.write(model_file, JSON.pretty_generate(patterns))
+
+       patterns
+     end
+
+     def train_statistical_model(training_data, model_id)
+       @logger&.info("Training statistical model")
+
+       # Build n-gram models for both legal and plain text
+       legal_ngrams = build_ngram_model(training_data.map { |ex| ex[:legal_tokens] })
+       plain_ngrams = build_ngram_model(training_data.map { |ex| ex[:plain_tokens] })
+
+       # Build translation probabilities
+       translation_probs = calculate_translation_probabilities(training_data)
+
+       statistical_model = {
+         legal_ngrams: legal_ngrams,
+         plain_ngrams: plain_ngrams,
+         translation_probabilities: translation_probs,
+         vocabulary: {
+           legal: extract_vocabulary(training_data.map { |ex| ex[:legal_tokens] }),
+           plain: extract_vocabulary(training_data.map { |ex| ex[:plain_tokens] })
+         }
+       }
+
+       # Save statistical model
+       model_file = File.join(@models_dir, model_id, 'statistical_model.json')
+       FileUtils.mkdir_p(File.dirname(model_file))
+       File.write(model_file, JSON.pretty_generate(statistical_model))
+
+       statistical_model
+     end
+
+     def train_neural_model(training_data, model_id, neural_config)
+       @logger&.info("Training neural model (placeholder implementation)")
+
+       # This is a placeholder for neural model training
+       # In a real implementation, you would use frameworks like TensorFlow or PyTorch
+       neural_model = {
+         model_type: 'transformer',
+         architecture: neural_config[:architecture] || 'encoder_decoder',
+         vocab_size: neural_config[:vocab_size] || 10000,
+         embedding_dim: neural_config[:embedding_dim] || 256,
+         hidden_dim: neural_config[:hidden_dim] || 512,
+         num_layers: neural_config[:num_layers] || 6,
+         training_epochs: neural_config[:epochs] || 10,
+         learning_rate: neural_config[:learning_rate] || 0.001,
+         trained_on: training_data.length,
+         placeholder: true # Indicates this is a placeholder implementation
+       }
+
+       # Save neural model placeholder
+       model_file = File.join(@models_dir, model_id, 'neural_model.json')
+       FileUtils.mkdir_p(File.dirname(model_file))
+       File.write(model_file, JSON.pretty_generate(neural_model))
+
+       neural_model
+     end
+
+     def tokenize_text(text)
+       # Simple tokenization - in practice, use more sophisticated tokenizers
+       text.downcase.gsub(/[^\w\s]/, ' ').split
+     end
+
+     def calculate_complexity_score(text)
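+       # Heuristic: 2 x average word length + 0.5 x average sentence length.
+       # E.g. 20 words averaging 6 characters across 2 sentences scores
+       # (6 * 2) + (10 * 0.5) = 17.0.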
+       words = text.split
+       return 0.0 if words.empty? # guard against empty or whitespace-only input
+
+       avg_word_length = words.map(&:length).sum.to_f / words.length
+       sentence_count = [text.split(/[.!?]+/).length, 1].max
+       avg_sentence_length = words.length.to_f / sentence_count
+
+       (avg_word_length * 2) + (avg_sentence_length * 0.5)
+     end
+
+     def align_words(legal_words, plain_words)
+       # Simple word alignment using edit distance
+       alignments = {}
+
+       legal_words.each do |legal_word|
+         best_match = nil
+         best_score = Float::INFINITY
+
+         plain_words.each do |plain_word|
+           score = levenshtein_distance(legal_word, plain_word)
+           if score < best_score && score < [legal_word.length, plain_word.length].max * 0.6
+             best_score = score
+             best_match = plain_word
+           end
+         end
+
+         alignments[legal_word] = best_match if best_match
+       end
+
+       alignments
+     end
+
+     def levenshtein_distance(str1, str2)
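+       # Classic dynamic-programming edit distance, e.g.
+       # levenshtein_distance("kitten", "sitting") #=> 3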
+       matrix = Array.new(str1.length + 1) { Array.new(str2.length + 1) }
+
+       (0..str1.length).each { |i| matrix[i][0] = i }
+       (0..str2.length).each { |j| matrix[0][j] = j }
+
+       (1..str1.length).each do |i|
+         (1..str2.length).each do |j|
+           cost = str1[i - 1] == str2[j - 1] ? 0 : 1
+           matrix[i][j] = [
+             matrix[i - 1][j] + 1,        # deletion
+             matrix[i][j - 1] + 1,        # insertion
+             matrix[i - 1][j - 1] + cost  # substitution
+           ].min
+         end
+       end
+
+       matrix[str1.length][str2.length]
+     end
+
+     def extract_phrase_patterns(legal_text, plain_text)
+       # Extract common phrase patterns
+       patterns = {}
+
+       # Simple phrase extraction using sliding windows
+       [2, 3, 4].each do |window_size|
+         legal_phrases = extract_phrases(legal_text, window_size)
+         plain_phrases = extract_phrases(plain_text, window_size)
+
+         # Find potential mappings
+         legal_phrases.each do |legal_phrase|
+           plain_phrases.each do |plain_phrase|
+             if phrases_similar?(legal_phrase, plain_phrase)
+               patterns[legal_phrase] = plain_phrase
+             end
+           end
+         end
+       end
+
+       patterns
+     end
+
+     def extract_phrases(text, window_size)
+       words = text.downcase.split
+       phrases = []
+
+       (0..words.length - window_size).each do |i|
+         phrase = words[i, window_size].join(' ')
+         phrases << phrase
+       end
+
+       phrases
+     end
+
+     def phrases_similar?(phrase1, phrase2)
+       # Simple similarity check
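+       # E.g. "governed by the laws" vs "covered by the laws" share 3 of 4
+       # words: 3.0 / 4 = 0.75 > 0.3, so the phrases count as similar.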
+       words1 = phrase1.split
+       words2 = phrase2.split
+
+       return false if (words1.length - words2.length).abs > 1
+
+       common_words = words1 & words2
+       common_words.length.to_f / [words1.length, words2.length].max > 0.3
+     end
+
+     def build_ngram_model(token_sequences, n = 3)
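+       # Counts n-grams across all sequences, then normalises the counts to
+       # probabilities. E.g. tokens ["the", "party", "shall", "pay"] yield
+       # the trigrams "the party shall" and "party shall pay".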
+       ngrams = Hash.new(0)
+
+       token_sequences.each do |tokens|
+         (0..tokens.length - n).each do |i|
+           ngram = tokens[i, n].join(' ')
+           ngrams[ngram] += 1
+         end
+       end
+
+       # Convert to probabilities
+       total = ngrams.values.sum
+       ngrams.each { |ngram, count| ngrams[ngram] = count.to_f / total }
+
+       ngrams
+     end
+
+     def calculate_translation_probabilities(training_data)
+       word_pairs = Hash.new(0)
+
+       training_data.each do |example|
+         legal_words = example[:legal_tokens]
+         plain_words = example[:plain_tokens]
+
+         # Simple co-occurrence counting
+         legal_words.each do |legal_word|
+           plain_words.each do |plain_word|
+             word_pairs["#{legal_word}|#{plain_word}"] += 1
+           end
+         end
+       end
+
+       # Normalize to probabilities
+       legal_word_counts = Hash.new(0)
+       word_pairs.each do |pair, count|
+         legal_word = pair.split('|').first
+         legal_word_counts[legal_word] += count
+       end
+
+       translation_probs = {}
+       word_pairs.each do |pair, count|
+         legal_word, plain_word = pair.split('|')
+         translation_probs[pair] = count.to_f / legal_word_counts[legal_word]
+       end
+
+       translation_probs
+     end
+
+     def extract_vocabulary(token_sequences)
+       vocab = Hash.new(0)
+
+       token_sequences.each do |tokens|
+         tokens.each { |token| vocab[token] += 1 }
+       end
+
+       vocab.sort_by { |_, count| -count }.to_h
+     end
+
+     def calculate_training_metrics(training_data, model_results)
+       {
+         total_examples: training_data.length,
+         avg_legal_length: training_data.map { |ex| ex[:legal_length] }.sum.to_f / training_data.length,
+         avg_plain_length: training_data.map { |ex| ex[:plain_length] }.sum.to_f / training_data.length,
+         avg_complexity_score: training_data.map { |ex| ex[:complexity_score] }.sum.to_f / training_data.length,
+         model_types_trained: model_results.keys.length,
+         training_completed_at: Time.now.iso8601
+       }
+     end
+
+     def save_model_metadata(model_id, metadata)
+       model_dir = File.join(@models_dir, model_id)
+       FileUtils.mkdir_p(model_dir)
+
+       metadata_file = File.join(model_dir, 'metadata.json')
+       File.write(metadata_file, JSON.pretty_generate(metadata))
+     end
+
+     def load_model_metadata(model_id)
+       metadata_file = File.join(@models_dir, model_id, 'metadata.json')
+       return nil unless File.exist?(metadata_file)
+
+       JSON.parse(File.read(metadata_file))
+     end
+
+     # Persist raw training examples so fine-tuning can reload them later
+     def save_training_data(model_id, training_data)
+       training_file = File.join(@training_data_dir, "#{model_id}.json")
+       File.write(training_file, JSON.pretty_generate(training_data))
+     end
+
+     def load_training_data(model_id)
+       training_file = File.join(@training_data_dir, "#{model_id}.json")
+       return [] unless File.exist?(training_file)
+
+       JSON.parse(File.read(training_file))
+     end
+
+     def evaluate_model_type(model_data, test_data, model_type)
+       correct_predictions = 0
+
+       test_data.each do |example|
+         predicted = predict_with_model(model_data, example['legal'], model_type)
+         if similarity_score(predicted, example['plain']) > 0.7
+           correct_predictions += 1
+         end
+       end
+
+       (correct_predictions.to_f / test_data.length * 100).round(2)
+     end
+
+     def predict_with_model(model_data, legal_text, model_type)
+       case model_type
+       when 'pattern_model'
+         predict_with_pattern_model(model_data, legal_text)
+       when 'statistical_model'
+         predict_with_statistical_model(model_data, legal_text)
+       when 'neural_model'
+         predict_with_neural_model(model_data, legal_text)
+       else
+         legal_text # Fallback
+       end
+     end
+
+     def predict_with_pattern_model(model_data, legal_text)
+       result = legal_text.dup
+
+       # Apply word mappings
+       model_data['word_mappings'].each do |legal_word, plain_words|
+         best_plain_word = plain_words.max_by { |_, prob| prob }.first
+         result.gsub!(/\b#{Regexp.escape(legal_word)}\b/i, best_plain_word)
+       end
+
+       # Apply phrase mappings
+       model_data['phrase_mappings'].each do |legal_phrase, plain_phrases|
+         best_plain_phrase = plain_phrases.max_by { |_, prob| prob }.first
+         result.gsub!(legal_phrase, best_plain_phrase)
+       end
+
+       result
+     end
+
+     def predict_with_statistical_model(model_data, legal_text)
+       # Simplified statistical prediction
+       tokens = tokenize_text(legal_text)
+
+       predicted_tokens = tokens.map do |token|
+         # Find best translation based on translation probabilities
+         best_translation = token
+         best_prob = 0
+
+         model_data['translation_probabilities'].each do |pair, prob|
+           legal_word, plain_word = pair.split('|')
+           if legal_word == token && prob > best_prob
+             best_prob = prob
+             best_translation = plain_word
+           end
+         end
+
+         best_translation
+       end
+
+       predicted_tokens.join(' ')
+     end
+
+     def predict_with_neural_model(model_data, legal_text)
+       # Placeholder for neural model prediction
+       # In practice, this would use the trained neural network
+       legal_text # Return original text as placeholder
+     end
+
+     def similarity_score(text1, text2)
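+       # Word-overlap ratio in [0, 1]. E.g. "must pay rent" vs
+       # "must pay the rent" share 3 words: 3.0 / 4 = 0.75.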
+       words1 = text1.downcase.split
+       words2 = text2.downcase.split
+
+       return 0 if words1.empty? || words2.empty?
+
+       common_words = words1 & words2
+       common_words.length.to_f / [words1.length, words2.length].max
+     end
+
+     def calculate_detailed_metrics(model_data, test_data, model_type)
+       predictions = test_data.map do |example|
+         predicted = predict_with_model(model_data, example['legal'], model_type)
+         {
+           legal: example['legal'],
+           expected: example['plain'],
+           predicted: predicted,
+           similarity: similarity_score(predicted, example['plain'])
+         }
+       end
+
+       similarities = predictions.map { |p| p[:similarity] }
+
+       {
+         accuracy: (similarities.count { |s| s > 0.7 }.to_f / similarities.length * 100).round(2),
+         avg_similarity: (similarities.sum / similarities.length).round(3),
+         min_similarity: similarities.min.round(3),
+         max_similarity: similarities.max.round(3),
+         predictions_count: predictions.length
+       }
+     end
+
+     def generate_augmented_examples(training_data, factor)
+       augmented = []
+       num_to_generate = (training_data.length * factor).to_i
+
+       num_to_generate.times do
+         original = training_data.sample
+
+         # Simple augmentation: occasionally swap a pair of adjacent words
+         augmented_legal = augment_text(original[:legal])
+         augmented_plain = augment_text(original[:plain])
+
+         augmented << {
+           legal: augmented_legal,
+           plain: augmented_plain,
+           legal_tokens: tokenize_text(augmented_legal),
+           plain_tokens: tokenize_text(augmented_plain),
+           legal_length: augmented_legal.split.length,
+           plain_length: augmented_plain.split.length,
+           complexity_score: calculate_complexity_score(augmented_legal),
+           augmented: true
+         }
+       end
+
+       augmented
+     end
+
+     def augment_text(text)
+       # Simple text augmentation
+       words = text.split
+
+       # Randomly swap one pair of adjacent words (simple augmentation)
+       if words.length > 3 && rand < 0.3
+         i = rand(words.length - 1)
+         words[i], words[i + 1] = words[i + 1], words[i]
+       end
+
+       words.join(' ')
+     end
+   end
+ end