legal_summariser 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,683 @@
1
require 'digest'
require 'fileutils'
require 'json'
require 'net/http'
require 'set'
require 'uri'
4
+
5
+ module LegalSummariser
6
+ # Advanced multilingual processing for legal documents across different languages
7
+ class MultilingualProcessor
8
# General language-related failure. NOTE(review): not raised anywhere in the
# visible part of this class — confirm whether callers elsewhere use it.
class LanguageError < StandardError
end

# Raised when translating text between two languages fails.
class TranslationError < StandardError
end

# Raised when a language code has no entry in SUPPORTED_LANGUAGES.
class UnsupportedLanguageError < StandardError
end
11
+
12
+ # Supported languages with their configurations
13
# Per-language settings, keyed by two-letter language code. Each entry holds:
#   :name           - display name of the language
#   :legal_systems  - legal system identifiers associated with the language
#   :date_formats   - accepted date patterns, in listed order
#   :currency       - currency code
#   :legal_terms_db - file name of the language's legal-terms JSON file
# Only the outer hash is frozen; the nested hashes and arrays are not.
SUPPORTED_LANGUAGES = {
  'en' => {
    name: 'English',
    legal_systems: %w[common_law statutory],
    date_formats: %w[MM/dd/yyyy dd/MM/yyyy],
    currency: 'USD',
    legal_terms_db: 'en_legal_terms.json'
  },
  'tr' => {
    name: 'Turkish',
    legal_systems: %w[civil_law],
    date_formats: %w[dd.MM.yyyy dd/MM/yyyy],
    currency: 'TRY',
    legal_terms_db: 'tr_legal_terms.json'
  },
  'de' => {
    name: 'German',
    legal_systems: %w[civil_law],
    date_formats: %w[dd.MM.yyyy dd/MM/yyyy],
    currency: 'EUR',
    legal_terms_db: 'de_legal_terms.json'
  },
  'fr' => {
    name: 'French',
    legal_systems: %w[civil_law],
    date_formats: %w[dd/MM/yyyy dd.MM.yyyy],
    currency: 'EUR',
    legal_terms_db: 'fr_legal_terms.json'
  },
  'es' => {
    name: 'Spanish',
    legal_systems: %w[civil_law],
    date_formats: %w[dd/MM/yyyy dd.MM.yyyy],
    currency: 'EUR',
    legal_terms_db: 'es_legal_terms.json'
  },
  'it' => {
    name: 'Italian',
    legal_systems: %w[civil_law],
    date_formats: %w[dd/MM/yyyy dd.MM.yyyy],
    currency: 'EUR',
    legal_terms_db: 'it_legal_terms.json'
  },
  'pt' => {
    name: 'Portuguese',
    legal_systems: %w[civil_law],
    date_formats: %w[dd/MM/yyyy],
    currency: 'EUR',
    legal_terms_db: 'pt_legal_terms.json'
  },
  'nl' => {
    name: 'Dutch',
    legal_systems: %w[civil_law],
    date_formats: %w[dd-MM-yyyy dd/MM/yyyy],
    currency: 'EUR',
    legal_terms_db: 'nl_legal_terms.json'
  }
}.freeze
71
+
72
+ # Legal term translations for different languages
73
# Translation table for core legal vocabulary. The outer keys are the English
# terms; each value maps a language code to that term in the given language.
# There is no 'en' entry inside the inner hashes — English is the outer key.
LEGAL_TERM_TRANSLATIONS = {
  'contract' => {
    'tr' => 'sözleşme', 'de' => 'Vertrag', 'fr' => 'contrat', 'es' => 'contrato',
    'it' => 'contratto', 'pt' => 'contrato', 'nl' => 'contract'
  },
  'agreement' => {
    'tr' => 'anlaşma', 'de' => 'Vereinbarung', 'fr' => 'accord', 'es' => 'acuerdo',
    'it' => 'accordo', 'pt' => 'acordo', 'nl' => 'overeenkomst'
  },
  'liability' => {
    'tr' => 'sorumluluk', 'de' => 'Haftung', 'fr' => 'responsabilité', 'es' => 'responsabilidad',
    'it' => 'responsabilità', 'pt' => 'responsabilidade', 'nl' => 'aansprakelijkheid'
  },
  'confidentiality' => {
    'tr' => 'gizlilik', 'de' => 'Vertraulichkeit', 'fr' => 'confidentialité', 'es' => 'confidencialidad',
    'it' => 'riservatezza', 'pt' => 'confidencialidade', 'nl' => 'vertrouwelijkheid'
  },
  'termination' => {
    'tr' => 'fesih', 'de' => 'Kündigung', 'fr' => 'résiliation', 'es' => 'terminación',
    'it' => 'risoluzione', 'pt' => 'rescisão', 'nl' => 'beëindiging'
  },
  'jurisdiction' => {
    'tr' => 'yargı yetkisi', 'de' => 'Gerichtsbarkeit', 'fr' => 'juridiction', 'es' => 'jurisdicción',
    'it' => 'giurisdizione', 'pt' => 'jurisdição', 'nl' => 'jurisdictie'
  }
}.freeze
129
+
130
attr_reader :config, :logger, :current_language, :translation_cache

# Builds a processor around a configuration object.
#
# @param config [Object, nil] configuration to use; when nil/false,
#   LegalSummariser.configuration is used instead. The object is asked for
#   #logger and #language here.
# @raise [UnsupportedLanguageError] when the configured language is not supported
def initialize(config = nil)
  @translation_cache = {}
  @config = config || LegalSummariser.configuration
  @logger = @config.logger
  # Fall back to English when the configuration names no language.
  @current_language = @config.language || 'en'

  validate_language(@current_language)
end
140
+
141
+ # Detect the language of a legal document
142
# Scores the text against every supported language and reports the best one.
#
# NOTE: the return type depends on the input. Blank or nil text yields the
# bare String 'en'; any other text yields a Hash.
#
# @param text [String, nil] document text
# @return [String, Hash] 'en' for blank input, otherwise a hash with
#   :language, :confidence, :language_name and :all_scores
def detect_language(text)
  return 'en' if text.nil? || text.strip.empty?

  @logger&.info("Detecting language for text of length: #{text.length}")

  # One score per supported language code, in SUPPORTED_LANGUAGES order.
  language_scores = SUPPORTED_LANGUAGES.keys.each_with_object({}) do |code, scores|
    scores[code] = calculate_language_score(text, code)
  end

  best_code, best_score = language_scores.max_by { |_, value| value }

  @logger&.info("Detected language: #{best_code} (confidence: #{best_score.round(2)})")

  {
    language: best_code,
    confidence: best_score,
    language_name: SUPPORTED_LANGUAGES[best_code][:name],
    all_scores: language_scores
  }
end
168
+
169
+ # Process legal document in multiple languages
170
# Detects the source language of +text+ and produces one processed result per
# requested target language, translating first when the target differs from
# the detected source.
#
# A failure for one target language does not abort the others: the error
# message is stored under that language instead.
#
# @param text [String, nil] document text; nil is treated as an empty string
# @param target_languages [String, Array<String>, nil] language code(s); defaults to ['en']
# @param options [Hash] forwarded to translate_text and process_in_language
# @return [Hash] :source_language, :detection_confidence, :processed_languages
#   (code => result hash, or { error:, fallback_used: true } on failure) and
#   :metadata (:original_length, :processing_time in seconds, :translations_used)
def process_multilingual(text, target_languages = nil, options = {})
  target_languages ||= ['en']
  target_languages = [target_languages] unless target_languages.is_a?(Array)
  text = text.to_s

  @logger&.info("Processing text for languages: #{target_languages.join(', ')}")

  # detect_language returns a bare language code (String) for blank input and
  # a Hash otherwise; indexing the String with a Symbol would raise TypeError.
  detection = detect_language(text)
  if detection.is_a?(Hash)
    source_language = detection[:language]
    detection_confidence = detection[:confidence]
  else
    source_language = detection
    detection_confidence = 0.0
  end

  results = {
    source_language: source_language,
    detection_confidence: detection_confidence,
    processed_languages: {},
    metadata: {
      original_length: text.length,
      processing_time: 0,
      translations_used: []
    }
  }

  # Monotonic clock: unaffected by wall-clock adjustments while we measure.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  target_languages.each do |target_lang|
    begin
      if target_lang == source_language
        processed = process_in_language(text, target_lang, options)
      else
        translated = translate_text(text, source_language, target_lang, options)
        processed = process_in_language(translated, target_lang, options)
        results[:metadata][:translations_used] << "#{source_language} -> #{target_lang}"
      end

      results[:processed_languages][target_lang] = processed
    rescue => e
      @logger&.error("Failed to process in language #{target_lang}: #{e.message}")
      results[:processed_languages][target_lang] = {
        error: e.message,
        fallback_used: true
      }
    end
  end

  results[:metadata][:processing_time] =
    Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  results
end
219
+
220
+ # Translate legal text between languages
221
# Translates +text+ from one language to another, caching results in
# @translation_cache.
#
# The AI API is tried only when options[:use_ai_translation] is set and the
# API is configured. Rule-based translation is used when the API is not
# tried or returns nil. An exception from the API is not retried with the
# rules; it is re-raised as TranslationError.
#
# @param text [String] text to translate
# @param source_lang [String] source language code
# @param target_lang [String] target language code
# @param options [Hash] :force_retranslate bypasses the cache;
#   :use_ai_translation enables the API path
# @return [String] the translated text (+text+ itself when both codes match)
# @raise [TranslationError] when any translation step raises
def translate_text(text, source_lang, target_lang, options = {})
  return text if source_lang == target_lang

  cache_key = generate_translation_cache_key(text, source_lang, target_lang)
  cached = @translation_cache[cache_key]

  if cached && !options[:force_retranslate]
    @logger&.info("Using cached translation for #{source_lang} -> #{target_lang}")
    return cached
  end

  @logger&.info("Translating text from #{source_lang} to #{target_lang}")

  begin
    draft = nil
    if options[:use_ai_translation] && translation_api_available?
      draft = translate_with_ai_api(text, source_lang, target_lang, options)
    end
    draft ||= translate_with_rules(text, source_lang, target_lang)

    # Store the post-processed text; the assignment's value is the return value.
    @translation_cache[cache_key] = post_process_translation(draft, source_lang, target_lang)
  rescue => e
    @logger&.error("Translation failed: #{e.message}")
    raise TranslationError, "Failed to translate from #{source_lang} to #{target_lang}: #{e.message}"
  end
end
258
+
259
+ # Process text in a specific language context
260
# Builds the language-specific view of +text+: legal terms with
# cross-language references, cultural notes, locale formatting and,
# optionally, a summary and a risk analysis.
#
# @current_language is switched to +language+ for the duration of the call
# and restored afterwards, even when processing raises.
#
# @param text [String] text already in +language+
# @param language [String] language code
# @param options [Hash] :summarize adds :summary; :analyze_risks adds :risks
# @return [Hash] :language, :language_name, :legal_system, :processed_text,
#   :legal_terms, :cultural_adaptations, :formatting, :metadata
# @raise [UnsupportedLanguageError] when +language+ is not supported
def process_in_language(text, language, options = {})
  validate_language(language)

  language_config = SUPPORTED_LANGUAGES[language]
  @logger&.info("Processing text in #{language} (#{language_config[:name]})")

  previous_language = @current_language
  @current_language = language

  begin
    # Extract the terms once. The previous version ran a plain extraction,
    # then discarded it in favour of this enriched one.
    legal_terms = extract_and_process_legal_terms(text, language)

    processed = {
      language: language,
      language_name: language_config[:name],
      legal_system: language_config[:legal_systems],
      processed_text: text,
      legal_terms: legal_terms,
      cultural_adaptations: apply_cultural_adaptations(text, language),
      formatting: apply_language_formatting(text, language),
      metadata: {
        word_count: text.split.length,
        character_count: text.length,
        legal_term_count: legal_terms.length
      }
    }

    processed[:summary] = summarize_in_language(text, language, options) if options[:summarize]
    processed[:risks] = analyze_risks_in_language(text, language, options) if options[:analyze_risks]

    processed
  ensure
    @current_language = previous_language
  end
end
306
+
307
+ # Get supported languages information
308
# Lists every supported language with its public settings. The
# :legal_terms_db file name is not included in the result.
#
# @return [Array<Hash>] one hash per language with :code, :name,
#   :legal_systems, :date_formats and :currency
def supported_languages
  exposed_fields = %i[name legal_systems date_formats currency]

  SUPPORTED_LANGUAGES.map do |code, settings|
    exposed_fields.each_with_object({ code: code }) do |field, row|
      row[field] = settings[field]
    end
  end
end
319
+
320
+ # Validate if a language is supported
321
# @param language_code [String] language code to look up
# @return [Boolean] true when SUPPORTED_LANGUAGES has an entry for the code
def language_supported?(language_code)
  SUPPORTED_LANGUAGES.include?(language_code)
end
324
+
325
+ # Get language-specific legal term database
326
# Loads the legal-terms database for a language from
# <cache_dir>/legal_terms/<legal_terms_db>. When the file does not exist the
# defaults are generated via generate_default_legal_terms.
#
# Any StandardError raised while loading or generating is logged and turned
# into an empty hash.
#
# @param language [String] language code
# @return [Object] the parsed JSON content (term/definition pairs are
#   expected by the callers in this class), the generated defaults, or {}
#   for an unsupported language or on error
def get_legal_terms_for_language(language)
  return {} unless language_supported?(language)

  db_name = SUPPORTED_LANGUAGES[language][:legal_terms_db]
  terms_file = File.join(@config.cache_dir, 'legal_terms', db_name)
  return generate_default_legal_terms(language) unless File.exist?(terms_file)

  JSON.parse(File.read(terms_file))
rescue => e
  @logger&.error("Failed to load legal terms for #{language}: #{e.message}")
  {}
end
340
+
341
+ # Cross-language legal term mapping
342
# Maps each term to its counterpart in +target_lang+.
#
# A term is resolved through LEGAL_TERM_TRANSLATIONS when it matches
# (case-insensitively) either an English key or that entry's +source_lang+
# translation. Previously only English keys were recognised and
# +source_lang+ was ignored, so non-English terms never had a direct match;
# a +target_lang+ of 'en' could not be resolved either because English terms
# are the table keys.
#
# Terms without a direct translation fall back to find_similar_term and
# finally to the term itself.
#
# @param terms [Array<String>] terms to map
# @param source_lang [String] language code the terms are written in
# @param target_lang [String] language code to map into
# @return [Hash{String => String}] original term => mapped term
def map_legal_terms_across_languages(terms, source_lang, target_lang)
  terms.each_with_object({}) do |term, mapped_terms|
    needle = term.downcase

    english_term, translations = LEGAL_TERM_TRANSLATIONS.find do |english, by_language|
      english == needle || by_language[source_lang]&.downcase == needle
    end

    direct = nil
    if english_term
      # English has no entry in the inner hash; it is the table key.
      direct = target_lang == 'en' ? english_term : translations[target_lang]
    end

    mapped_terms[term] = direct || find_similar_term(term, target_lang) || term
  end
end
357
+
358
+ private
359
+
360
# Ensures a language code is supported.
#
# @param language_code [String] language code to check
# @return [nil] when the language is supported
# @raise [UnsupportedLanguageError] otherwise, listing the supported codes
def validate_language(language_code)
  return if language_supported?(language_code)

  known_codes = SUPPORTED_LANGUAGES.keys.join(', ')
  raise UnsupportedLanguageError, "Language '#{language_code}' is not supported. Supported languages: #{known_codes}"
end
365
+
366
# Heuristic score of how strongly +text+ looks like +language_code+.
#
# Each legal term of the language found in the text adds 1.0, and each
# occurrence of a language-specific marker word adds 0.5. The total is
# divided by the number of whitespace-separated words. Portuguese and Dutch
# have no marker words here.
#
# @param text [String] document text
# @param language_code [String] language code to score
# @return [Float] normalised score; 0.0 when the text has no words
def calculate_language_score(text, language_code)
  haystack = text.downcase

  term_hits = get_legal_terms_for_language(language_code).count do |term, _|
    haystack.include?(term.downcase)
  end

  marker_pattern = {
    'en' => /\b(shall|hereby|whereas|therefore)\b/,
    'tr' => /\b(madde|fıkra|sözleşme|taraf)\b/,
    'de' => /\b(artikel|absatz|vertrag|partei)\b/,
    'fr' => /\b(article|alinéa|contrat|partie)\b/,
    'es' => /\b(artículo|párrafo|contrato|parte)\b/,
    'it' => /\b(articolo|comma|contratto|parte)\b/
  }[language_code]

  score = term_hits * 1.0
  score += haystack.scan(marker_pattern).length * 0.5 if marker_pattern

  word_count = text.split.length
  return 0.0 if word_count.zero?

  score / word_count
end
400
+
401
# Builds the cache key for a translation request.
#
# @param text [String] text being translated
# @param source_lang [String] source language code
# @param target_lang [String] target language code
# @return [String] "<source>_<target>_<first 16 hex chars of the text's MD5>"
def generate_translation_cache_key(text, source_lang, target_lang)
  digest_prefix = Digest::MD5.hexdigest(text)[0, 16]
  [source_lang, target_lang, digest_prefix].join('_')
end
405
+
406
# Tells whether the translation API is configured through the environment.
#
# Both TRANSLATION_API_KEY and TRANSLATION_API_ENDPOINT must be set to a
# non-blank value. An empty value used to count as "available" and led to a
# failing request later on.
#
# @return [Boolean]
def translation_api_available?
  %w[TRANSLATION_API_KEY TRANSLATION_API_ENDPOINT].all? do |name|
    !ENV[name].to_s.strip.empty?
  end
end
409
+
410
# Sends the text to the HTTP translation service named by the
# TRANSLATION_API_ENDPOINT environment variable, authenticating with
# TRANSLATION_API_KEY as a bearer token.
#
# @param text [String] text to translate
# @param source_lang [String] source language code
# @param target_lang [String] target language code
# @param options [Hash] accepted for signature compatibility; not read here
# @return [Object] the 'translated_text' field of the JSON response
#   (nil when the field is absent)
# @raise [TranslationError] when the response status is not 200
def translate_with_ai_api(text, source_lang, target_lang, options = {})
  endpoint = URI(ENV['TRANSLATION_API_ENDPOINT'])

  client = Net::HTTP.new(endpoint.host, endpoint.port)
  client.use_ssl = true if endpoint.scheme == 'https'

  payload = {
    text: text,
    source_language: source_lang,
    target_language: target_lang,
    domain: 'legal',
    preserve_formatting: true
  }

  post = Net::HTTP::Post.new(endpoint)
  post['Authorization'] = "Bearer #{ENV['TRANSLATION_API_KEY']}"
  post['Content-Type'] = 'application/json'
  post.body = JSON.generate(payload)

  response = client.request(post)
  if response.code != '200'
    raise TranslationError, "Translation API failed with code #{response.code}"
  end

  JSON.parse(response.body)['translated_text']
end
436
+
437
# Rule-based translation: replaces every known legal term of the source
# language with its counterpart in the target language. All other text is
# left untouched.
#
# English terms are the keys of LEGAL_TERM_TRANSLATIONS rather than entries
# of the inner hashes, so 'en' is resolved from the key. Before this, any
# translation to or from English silently replaced nothing.
#
# @param text [String] text to translate
# @param source_lang [String] source language code
# @param target_lang [String] target language code
# @return [String] a new string with the known terms replaced
def translate_with_rules(text, source_lang, target_lang)
  translated = text.dup

  # Looks up the term for +lang+ within one translation entry.
  term_in = lambda do |english_term, translations, lang|
    lang == 'en' ? english_term : translations[lang]
  end

  LEGAL_TERM_TRANSLATIONS.each do |english_term, translations|
    source_term = term_in.call(english_term, translations, source_lang)
    target_term = term_in.call(english_term, translations, target_lang)
    next unless source_term && target_term

    # Case-insensitive whole-word replacement that mirrors the casing of
    # the matched word (ALL CAPS, Capitalised, or as stored in the table).
    translated.gsub!(/\b#{Regexp.escape(source_term)}\b/i) do |match|
      if match == match.upcase
        target_term.upcase
      elsif match == match.capitalize
        target_term.capitalize
      else
        target_term
      end
    end
  end

  translated
end
461
+
462
# Applies target-language clean-up to translated text:
#   'tr' - removes whitespace before , . ; : ! ?
#   'de' - runs capitalize_german_nouns
#   'fr' - runs fix_french_spacing
# Any other target language gets an unchanged copy.
#
# @param text [String] translated text
# @param source_lang [String] source language code (not used)
# @param target_lang [String] target language code
# @return [String] a new string; +text+ itself is not modified
def post_process_translation(text, source_lang, target_lang)
  draft = text.dup

  case target_lang
  when 'tr' then draft.gsub(/\s+([,.;:!?])/, '\1')
  when 'de' then capitalize_german_nouns(draft)
  when 'fr' then fix_french_spacing(draft)
  else draft
  end
end
481
+
482
# Capitalises words that look like German nouns, using a deliberately
# simple heuristic: the word is longer than four characters, does not
# already start with an ASCII capital, and german_noun_indicators accepts it.
#
# Whitespace is preserved exactly. The earlier split/join implementation
# collapsed every run of whitespace, including newlines, into one space.
#
# Note that String#capitalize also lower-cases the rest of the word.
#
# @param text [String] German text
# @return [String] a new string with matching words capitalised
def capitalize_german_nouns(text)
  text.gsub(/\S+/) do |word|
    if word.length > 4 && !word.match?(/\A[A-Z]/) && german_noun_indicators(word)
      word.capitalize
    else
      word
    end
  end
end
494
+
495
# Heuristic noun check: true when the word ends in one of the suffixes
# 'ung', 'heit', 'keit', 'schaft' or 'tum' (case-sensitive).
#
# @param word [String] a single word
# @return [Boolean]
def german_noun_indicators(word)
  word.match?(/(?:ung|heit|keit|schaft|tum)\z/)
end
499
+
500
# Normalises spacing around French punctuation: ; : ! ? and the guillemets
# « » each end up surrounded by exactly one space on both sides.
#
# @param text [String] French text
# @return [String] a new string with the spacing rewritten
def fix_french_spacing(text)
  spaced_punctuation = text.gsub(/\s*([;:!?])\s*/, ' \1 ')
  spaced_opening = spaced_punctuation.gsub(/\s*«\s*/, ' « ')
  spaced_opening.gsub(/\s*»\s*/, ' » ')
end
506
+
507
# Finds which terms of the language's legal-terms database occur in the text
# (case-insensitive substring match).
#
# @param text [String] document text
# @param language [String] language code
# @return [Array<Hash>] one hash per match with :term, :definition and :language
def extract_legal_terms_for_language(text, language)
  haystack = text.downcase

  matches = get_legal_terms_for_language(language).select do |term, _definition|
    haystack.include?(term.downcase)
  end

  matches.map do |term, definition|
    { term: term, definition: definition, language: language }
  end
end
524
+
525
# Produces advisory notes about legal concepts in the text that do not map
# directly onto the target language's legal system. Only Turkish, German and
# French have a rule, each triggered by one case-sensitive phrase.
#
# @param text [String] document text
# @param language [String] language code
# @return [Array<String>] zero or one note
def apply_cultural_adaptations(text, language)
  trigger, note = {
    'tr' => ['common law', "Note: 'Common law' concept adapted for Turkish civil law system"],
    'de' => ['jury', "Note: 'Jury' system adapted for German legal context"],
    'fr' => ['discovery', "Note: 'Discovery' process adapted for French legal procedures"]
  }[language]

  return [] unless trigger && text.include?(trigger)

  [note]
end
548
+
549
# Applies locale-specific date and currency formatting for Turkish, German
# and French. Every other language gets an unchanged copy of the text.
#
# @param text [String] document text
# @param language [String] language code
# @return [String] formatted text
def apply_language_formatting(text, language)
  working_copy = text.dup

  case language
  when 'tr' then format_turkish_dates_and_currency(working_copy)
  when 'de' then format_german_dates_and_currency(working_copy)
  when 'fr' then format_french_dates_and_currency(working_copy)
  else working_copy
  end
end
566
+
567
# Rewrites slash-separated dates (n/n/yyyy) with dots and relabels dollar
# amounts as "<amount> TL".
#
# The amount pattern accepts thousands separators and decimals
# ("$1,500.50"); the earlier digits-only pattern turned that into
# "1 TL,500.50".
#
# NOTE(review): only the symbol is swapped — no currency conversion happens,
# and day/month order in dates is left as found. Confirm this is intended.
#
# @param text [String] document text
# @return [String] a new, reformatted string
def format_turkish_dates_and_currency(text)
  text.gsub(%r{(\d{1,2})/(\d{1,2})/(\d{4})}, '\1.\2.\3')
      .gsub(/\$(\d+(?:,\d{3})*(?:\.\d+)?)/, '\1 TL')
end
572
+
573
# Rewrites slash-separated dates (n/n/yyyy) with dots and relabels dollar
# amounts as "<amount> €".
#
# The amount pattern accepts thousands separators and decimals
# ("$1,500.50"); the earlier digits-only pattern turned that into
# "1 €,500.50".
#
# NOTE(review): only the symbol is swapped — no currency conversion happens,
# and day/month order in dates is left as found. Confirm this is intended.
#
# @param text [String] document text
# @return [String] a new, reformatted string
def format_german_dates_and_currency(text)
  text.gsub(%r{(\d{1,2})/(\d{1,2})/(\d{4})}, '\1.\2.\3')
      .gsub(/\$(\d+(?:,\d{3})*(?:\.\d+)?)/, '\1 €')
end
578
+
579
# Relabels dollar amounts as "<amount> €". Slash-separated dates are already
# in the form this method targets, so they are left untouched (the previous
# date substitution replaced each date with itself).
#
# The amount pattern accepts thousands separators and decimals
# ("$1,500.50"); the earlier digits-only pattern turned that into
# "1 €,500.50".
#
# NOTE(review): only the symbol is swapped — no currency conversion happens.
# Confirm this is intended.
#
# @param text [String] document text
# @return [String] a new, reformatted string
def format_french_dates_and_currency(text)
  text.gsub(/\$(\d+(?:,\d{3})*(?:\.\d+)?)/, '\1 €')
end
584
+
585
# Extracts the legal terms found in the text and attaches, under
# :translations, the equivalent term in every other supported language.
#
# A term is matched (case-insensitively) against LEGAL_TERM_TRANSLATIONS
# either as an English key or as that entry's translation for +language+.
# Previously the term was only looked up as an English key, so terms from
# non-English documents never received cross-references. English is not
# stored in the inner hashes; its value is the table key.
#
# @param text [String] document text
# @param language [String] language code of the text
# @return [Array<Hash>] the term hashes from extract_legal_terms_for_language,
#   each extended with :translations (language code => term; possibly empty)
def extract_and_process_legal_terms(text, language)
  other_languages = SUPPORTED_LANGUAGES.keys - [language]

  extract_legal_terms_for_language(text, language).each do |term_info|
    needle = term_info[:term].downcase

    english_term, translations = LEGAL_TERM_TRANSLATIONS.find do |english, by_language|
      english == needle || by_language[language]&.downcase == needle
    end

    term_info[:translations] = {}
    next unless english_term

    other_languages.each do |lang_code|
      translation = lang_code == 'en' ? english_term : translations[lang_code]
      term_info[:translations][lang_code] = translation if translation
    end
  end
end
604
+
605
# Summarises the text with LegalSummariser::Summariser, passing the language
# and the first legal system listed for it along with the caller's options.
#
# @param text [String] document text
# @param language [String] supported language code
# @param options [Hash] extra options; :language and :legal_system are overwritten
# @return [Object] whatever Summariser#summarise returns
def summarize_in_language(text, language, options = {})
  engine = LegalSummariser::Summariser.new(@config)

  primary_legal_system = SUPPORTED_LANGUAGES[language][:legal_systems].first
  engine.summarise(text, options.merge(language: language, legal_system: primary_legal_system))
end
617
+
618
# Runs LegalSummariser::RiskAnalyzer on the text, passing the language and
# the first legal system listed for it along with the caller's options.
#
# @param text [String] document text
# @param language [String] supported language code
# @param options [Hash] extra options; :language and :legal_system are overwritten
# @return [Object] whatever RiskAnalyzer#analyze returns
def analyze_risks_in_language(text, language, options = {})
  engine = LegalSummariser::RiskAnalyzer.new(@config)

  primary_legal_system = SUPPORTED_LANGUAGES[language][:legal_systems].first
  engine.analyze(text, options.merge(language: language, legal_system: primary_legal_system))
end
630
+
631
# Builds the default legal-terms database for a language from
# LEGAL_TERM_TRANSLATIONS and stores it as JSON under
# <cache_dir>/legal_terms/<legal_terms_db>.
#
# For 'en' the table keys themselves are the terms. The table has no 'en'
# entries, so the English defaults used to be empty.
#
# Writing the cache file is best effort: a file-system error is logged and
# the computed terms are still returned instead of being lost.
#
# @param language [String] supported language code
# @return [Hash{String => String}] term => "Legal term: <term>"
def generate_default_legal_terms(language)
  default_terms = LEGAL_TERM_TRANSLATIONS.each_with_object({}) do |(english_term, translations), terms|
    local_term = language == 'en' ? english_term : translations[language]
    terms[local_term] = "Legal term: #{local_term}" if local_term
  end

  terms_file_name = SUPPORTED_LANGUAGES[language][:legal_terms_db]

  begin
    terms_dir = File.join(@config.cache_dir, 'legal_terms')
    FileUtils.mkdir_p(terms_dir)
    File.write(File.join(terms_dir, terms_file_name), JSON.pretty_generate(default_terms))
  rescue SystemCallError, IOError => e
    @logger&.error("Failed to cache default legal terms for #{language}: #{e.message}")
  end

  default_terms
end
651
+
652
# Fuzzy lookup: returns the term of the target language's database that is
# most similar to +term+, provided its similarity_score is above 0.6. On
# equal scores the earlier database entry wins.
#
# @param term [String] term to match
# @param target_language [String] language code whose database is searched
# @return [String, nil] best matching term, or nil when nothing scores above 0.6
def find_similar_term(term, target_language)
  needle = term.downcase
  winner = nil
  # Doubles as the minimum acceptable score and the best score so far.
  score_to_beat = 0.6

  get_legal_terms_for_language(target_language).keys.each do |candidate|
    candidate_score = similarity_score(needle, candidate.downcase)
    next unless candidate_score > score_to_beat

    score_to_beat = candidate_score
    winner = candidate
  end

  winner
end
669
+
670
# Jaccard similarity of the two strings' character sets: shared distinct
# characters divided by all distinct characters.
#
# Implemented with Array set operations, so it no longer depends on Set
# being loaded, and always returns a Float.
#
# @param str1 [String]
# @param str2 [String]
# @return [Float] value in 0.0..1.0; 0.0 when both strings are empty
def similarity_score(str1, str2)
  chars1 = str1.chars.uniq
  chars2 = str2.chars.uniq

  union_size = (chars1 | chars2).size
  return 0.0 if union_size.zero?

  (chars1 & chars2).size.to_f / union_size
end
682
+ end
683
+ end