supervertaler 1.9.153__py3-none-any.whl → 1.9.189__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of supervertaler might be problematic. Click here for more details.

@@ -17,12 +17,38 @@ import sqlite3
17
17
  import os
18
18
  import json
19
19
  import hashlib
20
+ import unicodedata
21
+ import re
20
22
  from datetime import datetime
21
23
  from typing import List, Dict, Optional, Tuple
22
24
  from pathlib import Path
23
25
  from difflib import SequenceMatcher
24
26
 
25
27
 
28
+ def _normalize_for_matching(text: str) -> str:
29
+ """Normalize text for exact matching.
30
+
31
+ Handles invisible differences that would cause exact match to fail:
32
+ - Unicode normalization (NFC)
33
+ - Multiple whitespace -> single space
34
+ - Leading/trailing whitespace
35
+ - Non-breaking spaces -> regular spaces
36
+ """
37
+ if not text:
38
+ return ""
39
+ # Unicode normalize (NFC form)
40
+ text = unicodedata.normalize('NFC', text)
41
+ # Convert non-breaking spaces and other whitespace to regular space
42
+ text = text.replace('\u00a0', ' ') # NBSP
43
+ text = text.replace('\u2007', ' ') # Figure space
44
+ text = text.replace('\u202f', ' ') # Narrow NBSP
45
+ # Collapse multiple whitespace to single space
46
+ text = re.sub(r'\s+', ' ', text)
47
+ # Strip leading/trailing whitespace
48
+ text = text.strip()
49
+ return text
50
+
51
+
26
52
  class DatabaseManager:
27
53
  """Manages SQLite database for translation resources"""
28
54
 
@@ -655,22 +681,46 @@ class DatabaseManager:
655
681
  # TRANSLATION MEMORY METHODS
656
682
  # ============================================
657
683
 
658
- def add_translation_unit(self, source: str, target: str, source_lang: str,
684
+ def add_translation_unit(self, source: str, target: str, source_lang: str,
659
685
  target_lang: str, tm_id: str = 'project',
660
686
  project_id: str = None, context_before: str = None,
661
- context_after: str = None, notes: str = None) -> int:
687
+ context_after: str = None, notes: str = None,
688
+ overwrite: bool = False) -> int:
662
689
  """
663
690
  Add translation unit to database
664
-
691
+
692
+ Args:
693
+ source: Source text
694
+ target: Target text
695
+ source_lang: Source language code
696
+ target_lang: Target language code
697
+ tm_id: TM identifier
698
+ project_id: Optional project ID
699
+ context_before: Optional context before
700
+ context_after: Optional context after
701
+ notes: Optional notes
702
+ overwrite: If True, delete existing entries with same source before inserting
703
+ (implements "Save only latest translation" mode)
704
+
665
705
  Returns: ID of inserted/updated entry
666
706
  """
667
- # Generate hash for fast exact matching
668
- source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()
669
-
707
+ # Generate hash from NORMALIZED source for consistent exact matching
708
+ # This handles invisible differences like Unicode normalization, whitespace variations
709
+ normalized_source = _normalize_for_matching(source)
710
+ source_hash = hashlib.md5(normalized_source.encode('utf-8')).hexdigest()
711
+
670
712
  try:
713
+ # If overwrite mode, delete ALL existing entries with same source_hash and tm_id
714
+ # This ensures only the latest translation is kept
715
+ if overwrite:
716
+ self.cursor.execute("""
717
+ DELETE FROM translation_units
718
+ WHERE source_hash = ? AND tm_id = ?
719
+ """, (source_hash, tm_id))
720
+
671
721
  self.cursor.execute("""
672
- INSERT INTO translation_units
673
- (source_text, target_text, source_lang, target_lang, tm_id,
722
+ INSERT INTO translation_units
723
+ (source_text, target_text, source_lang, target_lang, tm_id,
674
724
  project_id, context_before, context_after, source_hash, notes)
675
725
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
676
726
  ON CONFLICT(source_hash, target_text, tm_id) DO UPDATE SET
@@ -678,42 +728,47 @@ class DatabaseManager:
678
728
  modified_date = CURRENT_TIMESTAMP
679
729
  """, (source, target, source_lang, target_lang, tm_id,
680
730
  project_id, context_before, context_after, source_hash, notes))
681
-
731
+
682
732
  self.connection.commit()
683
733
  return self.cursor.lastrowid
684
-
734
+
685
735
  except Exception as e:
686
736
  self.log(f"Error adding translation unit: {e}")
687
737
  return None
688
738
 
689
739
  def get_exact_match(self, source: str, tm_ids: List[str] = None,
690
- source_lang: str = None, target_lang: str = None,
740
+ source_lang: str = None, target_lang: str = None,
691
741
  bidirectional: bool = True) -> Optional[Dict]:
692
742
  """
693
743
  Get exact match from TM
694
-
744
+
695
745
  Args:
696
746
  source: Source text to match
697
747
  tm_ids: List of TM IDs to search (None = all)
698
748
  source_lang: Filter by source language (base code matching: 'en' matches 'en-US', 'en-GB', etc.)
699
749
  target_lang: Filter by target language (base code matching)
700
750
  bidirectional: If True, search both directions (nl→en AND en→nl)
701
-
751
+
702
752
  Returns: Dictionary with match data or None
703
753
  """
704
754
  from modules.tmx_generator import get_base_lang_code
705
-
755
+
756
+ # Try both normalized and non-normalized hashes for backward compatibility
757
+ # This handles invisible differences like Unicode normalization, whitespace variations
706
758
  source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()
707
-
759
+ normalized_source = _normalize_for_matching(source)
760
+ normalized_hash = hashlib.md5(normalized_source.encode('utf-8')).hexdigest()
761
+
708
762
  # Get base language codes for comparison
709
763
  src_base = get_base_lang_code(source_lang) if source_lang else None
710
764
  tgt_base = get_base_lang_code(target_lang) if target_lang else None
711
-
765
+
766
+ # Search using both original hash and normalized hash
712
767
  query = """
713
- SELECT * FROM translation_units
714
- WHERE source_hash = ? AND source_text = ?
768
+ SELECT * FROM translation_units
769
+ WHERE (source_hash = ? OR source_hash = ?)
715
770
  """
716
- params = [source_hash, source]
771
+ params = [source_hash, normalized_hash]
717
772
 
718
773
  if tm_ids:
719
774
  placeholders = ','.join('?' * len(tm_ids))
@@ -840,11 +895,15 @@ class DatabaseManager:
840
895
  bidirectional: If True, search both directions (nl→en AND en→nl)
841
896
 
842
897
  Returns: List of matches with similarity scores
898
+
899
+ Note: When multiple TMs are provided, searches each TM separately to ensure
900
+ good matches from smaller TMs aren't pushed out by BM25 keyword ranking
901
+ from larger TMs. Results are merged and sorted by actual similarity.
843
902
  """
844
903
  # For better FTS5 matching, tokenize the query and escape special chars
845
904
  # FTS5 special characters: " ( ) - : , . ! ?
846
905
  import re
847
- from modules.tmx_generator import get_base_lang_code
906
+ from modules.tmx_generator import get_base_lang_code, get_lang_match_variants
848
907
 
849
908
  # Strip HTML/XML tags from source for clean text search
850
909
  text_without_tags = re.sub(r'<[^>]+>', '', source)
@@ -868,22 +927,57 @@ class DatabaseManager:
868
927
  # This helps find similar long segments more reliably
869
928
  search_terms_for_query = all_search_terms[:20]
870
929
 
871
- print(f"[DEBUG] search_fuzzy_matches: source='{source[:50]}...', {len(all_search_terms)} terms")
872
-
873
930
  if not search_terms_for_query:
874
931
  # If no valid terms, return empty results
875
- print(f"[DEBUG] search_fuzzy_matches: No valid search terms, returning empty")
876
932
  return []
877
933
 
878
934
  # Quote each term to prevent FTS5 syntax errors
879
935
  fts_query = ' OR '.join(f'"{term}"' for term in search_terms_for_query)
880
- print(f"[DEBUG] search_fuzzy_matches: FTS query terms = {search_terms_for_query[:10]}...")
881
936
 
882
937
  # Get base language codes for comparison
883
938
  src_base = get_base_lang_code(source_lang) if source_lang else None
884
939
  tgt_base = get_base_lang_code(target_lang) if target_lang else None
885
940
 
886
- # Use FTS5 for initial candidate retrieval (fast)
941
+ # MULTI-TM FIX: Search each TM separately to avoid BM25 ranking issues
942
+ # When a large TM is combined with a small TM, the large TM's many keyword matches
943
+ # push down genuinely similar sentences from the small TM
944
+ tms_to_search = tm_ids if tm_ids else [None] # None means search all TMs together
945
+
946
+ all_results = []
947
+
948
+ for tm_id in tms_to_search:
949
+ # Search this specific TM (or all if tm_id is None)
950
+ tm_results = self._search_single_tm_fuzzy(
951
+ source, fts_query, [tm_id] if tm_id else None,
952
+ threshold, max_results, src_base, tgt_base,
953
+ source_lang, target_lang, bidirectional
954
+ )
955
+ all_results.extend(tm_results)
956
+
957
+ # Deduplicate by source_text (keep highest similarity for each unique source)
958
+ seen = {}
959
+ for result in all_results:
960
+ key = result['source_text']
961
+ if key not in seen or result['similarity'] > seen[key]['similarity']:
962
+ seen[key] = result
963
+
964
+ deduped_results = list(seen.values())
965
+
966
+ # Sort ALL results by similarity (highest first) - this ensures the 76% match
967
+ # appears before 40% matches regardless of which TM they came from
968
+ deduped_results.sort(key=lambda x: x['similarity'], reverse=True)
969
+
970
+ return deduped_results[:max_results]
971
+
972
+ def _search_single_tm_fuzzy(self, source: str, fts_query: str, tm_ids: List[str],
973
+ threshold: float, max_results: int,
974
+ src_base: str, tgt_base: str,
975
+ source_lang: str, target_lang: str,
976
+ bidirectional: bool) -> List[Dict]:
977
+ """Search a single TM (or all TMs if tm_ids is None) for fuzzy matches"""
978
+ from modules.tmx_generator import get_lang_match_variants
979
+
980
+ # Build query for this TM
887
981
  query = """
888
982
  SELECT tu.*,
889
983
  bm25(translation_units_fts) as relevance
@@ -893,13 +987,12 @@ class DatabaseManager:
893
987
  """
894
988
  params = [fts_query]
895
989
 
896
- if tm_ids:
990
+ if tm_ids and tm_ids[0] is not None:
897
991
  placeholders = ','.join('?' * len(tm_ids))
898
992
  query += f" AND tu.tm_id IN ({placeholders})"
899
993
  params.extend(tm_ids)
900
994
 
901
995
  # Use flexible language matching (matches 'nl', 'nl-NL', 'Dutch', etc.)
902
- from modules.tmx_generator import get_lang_match_variants
903
996
  if src_base:
904
997
  src_variants = get_lang_match_variants(source_lang)
905
998
  src_conditions = []
@@ -920,19 +1013,16 @@ class DatabaseManager:
920
1013
  params.append(f"{variant}-%")
921
1014
  query += f" AND ({' OR '.join(tgt_conditions)})"
922
1015
 
923
- # Get more candidates than needed for proper scoring (increase limit for long segments)
924
- # Long segments need MANY more candidates because BM25 ranking may push down
925
- # the truly similar entries in favor of entries matching more search terms
1016
+ # Per-TM candidate limit - INCREASED to catch more potential fuzzy matches
1017
+ # When multiple TMs are searched, BM25 ranking can push genuinely similar
1018
+ # entries far down the list due to common word matches in other entries
926
1019
  candidate_limit = max(500, max_results * 50)
927
1020
  query += f" ORDER BY relevance DESC LIMIT {candidate_limit}"
928
1021
 
929
- print(f"[DEBUG] search_fuzzy_matches: Executing query (limit={candidate_limit})...")
930
-
931
1022
  try:
932
1023
  self.cursor.execute(query, params)
933
1024
  all_rows = self.cursor.fetchall()
934
1025
  except Exception as e:
935
- print(f"[DEBUG] search_fuzzy_matches: SQL ERROR: {e}")
936
1026
  return []
937
1027
 
938
1028
  results = []
@@ -948,8 +1038,6 @@ class DatabaseManager:
948
1038
  match_dict['match_pct'] = int(similarity * 100)
949
1039
  results.append(match_dict)
950
1040
 
951
- print(f"[DEBUG] search_fuzzy_matches: After threshold filter ({threshold}): {len(results)} matches")
952
-
953
1041
  # If bidirectional, also search reverse direction
954
1042
  if bidirectional and src_base and tgt_base:
955
1043
  query = """
@@ -961,13 +1049,12 @@ class DatabaseManager:
961
1049
  """
962
1050
  params = [fts_query]
963
1051
 
964
- if tm_ids:
1052
+ if tm_ids and tm_ids[0] is not None:
965
1053
  placeholders = ','.join('?' * len(tm_ids))
966
1054
  query += f" AND tu.tm_id IN ({placeholders})"
967
1055
  params.extend(tm_ids)
968
1056
 
969
1057
  # Reversed language filters with flexible matching
970
- # For reverse: TM target_lang should match our source_lang, TM source_lang should match our target_lang
971
1058
  src_variants = get_lang_match_variants(source_lang)
972
1059
  tgt_variants = get_lang_match_variants(target_lang)
973
1060
 
@@ -991,26 +1078,27 @@ class DatabaseManager:
991
1078
 
992
1079
  query += f" ORDER BY relevance DESC LIMIT {max_results * 5}"
993
1080
 
994
- self.cursor.execute(query, params)
995
-
996
- for row in self.cursor.fetchall():
997
- match_dict = dict(row)
998
- # Calculate similarity against target_text (since we're reversing)
999
- similarity = self.calculate_similarity(source, match_dict['target_text'])
1081
+ try:
1082
+ self.cursor.execute(query, params)
1000
1083
 
1001
- # Only include matches above threshold
1002
- if similarity >= threshold:
1003
- # Swap source/target for reverse match
1004
- match_dict['source_text'], match_dict['target_text'] = match_dict['target_text'], match_dict['source_text']
1005
- match_dict['source_lang'], match_dict['target_lang'] = match_dict['target_lang'], match_dict['source_lang']
1006
- match_dict['similarity'] = similarity
1007
- match_dict['match_pct'] = int(similarity * 100)
1008
- match_dict['reverse_match'] = True
1009
- results.append(match_dict)
1010
-
1011
- # Sort by similarity (highest first) and limit results
1012
- results.sort(key=lambda x: x['similarity'], reverse=True)
1013
- return results[:max_results]
1084
+ for row in self.cursor.fetchall():
1085
+ match_dict = dict(row)
1086
+ # Calculate similarity against target_text (since we're reversing)
1087
+ similarity = self.calculate_similarity(source, match_dict['target_text'])
1088
+
1089
+ # Only include matches above threshold
1090
+ if similarity >= threshold:
1091
+ # Swap source/target for reverse match
1092
+ match_dict['source_text'], match_dict['target_text'] = match_dict['target_text'], match_dict['source_text']
1093
+ match_dict['source_lang'], match_dict['target_lang'] = match_dict['target_lang'], match_dict['source_lang']
1094
+ match_dict['similarity'] = similarity
1095
+ match_dict['match_pct'] = int(similarity * 100)
1096
+ match_dict['reverse_match'] = True
1097
+ results.append(match_dict)
1098
+ except Exception as e:
1099
+ print(f"[DEBUG] _search_single_tm_fuzzy (reverse): SQL ERROR: {e}")
1100
+
1101
+ return results
1014
1102
 
1015
1103
  def search_all(self, source: str, tm_ids: List[str] = None, enabled_only: bool = True,
1016
1104
  threshold: float = 0.75, max_results: int = 10) -> List[Dict]:
@@ -1389,120 +1477,225 @@ class DatabaseManager:
1389
1477
  # TODO: Implement in Phase 3
1390
1478
  pass
1391
1479
 
1392
- def search_termbases(self, search_term: str, source_lang: str = None,
1480
+ def search_termbases(self, search_term: str, source_lang: str = None,
1393
1481
  target_lang: str = None, project_id: str = None,
1394
- min_length: int = 0) -> List[Dict]:
1482
+ min_length: int = 0, bidirectional: bool = True) -> List[Dict]:
1395
1483
  """
1396
- Search termbases for matching source terms
1397
-
1484
+ Search termbases for matching terms (bidirectional by default)
1485
+
1398
1486
  Args:
1399
- search_term: Source term to search for
1487
+ search_term: Term to search for
1400
1488
  source_lang: Filter by source language (optional)
1401
1489
  target_lang: Filter by target language (optional)
1402
1490
  project_id: Filter by project (optional)
1403
1491
  min_length: Minimum term length to return
1404
-
1492
+ bidirectional: If True, also search target_term and swap results (default True)
1493
+
1405
1494
  Returns:
1406
1495
  List of termbase hits, sorted by priority (lower = higher priority)
1496
+ Each result includes 'match_direction' ('source' or 'target') indicating
1497
+ which column matched. For 'target' matches, source_term and target_term
1498
+ are swapped so results are always oriented correctly for the current project.
1407
1499
  """
1408
1500
  # Build query with filters - include termbase name and ranking via JOIN
1409
1501
  # Note: termbase_id is stored as TEXT in termbase_terms but INTEGER in termbases
1410
1502
  # Use CAST to ensure proper comparison
1411
1503
  # IMPORTANT: Join with termbase_activation to get the ACTUAL priority for this project
1412
1504
  # CRITICAL FIX: Also match when search_term starts with the glossary term
1413
- # This handles cases like searching for "ca." when glossary has "ca."
1505
+ # This handles cases like searching for "ca." when glossary has "ca."
1414
1506
  # AND searching for "ca" when glossary has "ca."
1415
1507
  # We also strip trailing punctuation from glossary terms for comparison
1416
- query = """
1417
- SELECT
1418
- t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
1508
+
1509
+ # Build matching conditions for a given column
1510
+ def build_match_conditions(column: str) -> str:
1511
+ return f"""(
1512
+ LOWER(t.{column}) = LOWER(?) OR
1513
+ LOWER(t.{column}) LIKE LOWER(?) OR
1514
+ LOWER(t.{column}) LIKE LOWER(?) OR
1515
+ LOWER(t.{column}) LIKE LOWER(?) OR
1516
+ LOWER(RTRIM(t.{column}, '.!?,;:')) = LOWER(?) OR
1517
+ LOWER(?) LIKE LOWER(t.{column}) || '%' OR
1518
+ LOWER(?) = LOWER(RTRIM(t.{column}, '.!?,;:'))
1519
+ )"""
1520
+
1521
+ # Build match params for one direction
1522
+ def build_match_params() -> list:
1523
+ return [
1524
+ search_term,
1525
+ f"{search_term} %",
1526
+ f"% {search_term}",
1527
+ f"% {search_term} %",
1528
+ search_term, # For RTRIM comparison
1529
+ search_term, # For reverse LIKE
1530
+ search_term # For reverse RTRIM comparison
1531
+ ]
1532
+
1533
+ # Matching patterns:
1534
+ # 1. Exact match: column = search_term
1535
+ # 2. Glossary term starts with search: column LIKE "search_term %"
1536
+ # 3. Glossary term ends with search: column LIKE "% search_term"
1537
+ # 4. Glossary term contains search: column LIKE "% search_term %"
1538
+ # 5. Glossary term (stripped) = search_term: RTRIM(column) = search_term (handles "ca." = "ca")
1539
+ # 6. Search starts with glossary term: search_term LIKE column || '%'
1540
+ # 7. Search = glossary term stripped: search_term = RTRIM(column)
1541
+
1542
+ # Base SELECT for forward matches (source_term matches)
1543
+ base_select_forward = """
1544
+ SELECT
1545
+ t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
1419
1546
  t.forbidden, t.source_lang, t.target_lang, t.definition, t.domain,
1420
1547
  t.notes, t.project, t.client,
1421
1548
  tb.name as termbase_name,
1422
1549
  tb.source_lang as termbase_source_lang,
1423
1550
  tb.target_lang as termbase_target_lang,
1424
1551
  tb.is_project_termbase,
1425
- COALESCE(ta.priority, tb.ranking) as ranking
1552
+ COALESCE(ta.priority, tb.ranking) as ranking,
1553
+ 'source' as match_direction
1426
1554
  FROM termbase_terms t
1427
1555
  LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
1428
1556
  LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
1429
- WHERE (
1430
- LOWER(t.source_term) = LOWER(?) OR
1431
- LOWER(t.source_term) LIKE LOWER(?) OR
1432
- LOWER(t.source_term) LIKE LOWER(?) OR
1433
- LOWER(t.source_term) LIKE LOWER(?) OR
1434
- LOWER(RTRIM(t.source_term, '.!?,;:')) = LOWER(?) OR
1435
- LOWER(?) LIKE LOWER(t.source_term) || '%' OR
1436
- LOWER(?) = LOWER(RTRIM(t.source_term, '.!?,;:'))
1437
- )
1557
+ WHERE {match_conditions}
1438
1558
  AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
1439
- """
1440
- # Matching patterns:
1441
- # 1. Exact match: source_term = search_term
1442
- # 2. Glossary term starts with search: source_term LIKE "search_term %"
1443
- # 3. Glossary term ends with search: source_term LIKE "% search_term"
1444
- # 4. Glossary term contains search: source_term LIKE "% search_term %"
1445
- # 5. Glossary term (stripped) = search_term: RTRIM(source_term) = search_term (handles "ca." = "ca")
1446
- # 6. Search starts with glossary term: search_term LIKE source_term || '%'
1447
- # 7. Search = glossary term stripped: search_term = RTRIM(source_term)
1448
- params = [
1449
- project_id if project_id else 0, # Use 0 if no project (won't match any activation records)
1450
- search_term,
1451
- f"{search_term} %",
1452
- f"% {search_term}",
1453
- f"% {search_term} %",
1454
- search_term, # For RTRIM comparison
1455
- search_term, # For reverse LIKE
1456
- search_term # For reverse RTRIM comparison
1457
- ]
1458
-
1459
- # Language filters - if term has no language, use termbase language for filtering
1559
+ """.format(match_conditions=build_match_conditions('source_term'))
1560
+
1561
+ # Base SELECT for reverse matches (target_term matches) - swap source/target in output
1562
+ base_select_reverse = """
1563
+ SELECT
1564
+ t.id, t.target_term as source_term, t.source_term as target_term,
1565
+ t.termbase_id, t.priority,
1566
+ t.forbidden, t.target_lang as source_lang, t.source_lang as target_lang,
1567
+ t.definition, t.domain,
1568
+ t.notes, t.project, t.client,
1569
+ tb.name as termbase_name,
1570
+ tb.target_lang as termbase_source_lang,
1571
+ tb.source_lang as termbase_target_lang,
1572
+ tb.is_project_termbase,
1573
+ COALESCE(ta.priority, tb.ranking) as ranking,
1574
+ 'target' as match_direction
1575
+ FROM termbase_terms t
1576
+ LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
1577
+ LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
1578
+ WHERE {match_conditions}
1579
+ AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
1580
+ """.format(match_conditions=build_match_conditions('target_term'))
1581
+
1582
+ # Build params
1583
+ project_param = project_id if project_id else 0
1584
+ forward_params = [project_param] + build_match_params()
1585
+ reverse_params = [project_param] + build_match_params()
1586
+
1587
+ # Build language filter conditions
1588
+ lang_conditions_forward = ""
1589
+ lang_conditions_reverse = ""
1590
+ lang_params_forward = []
1591
+ lang_params_reverse = []
1592
+
1460
1593
  if source_lang:
1461
- query += """ AND (
1462
- t.source_lang = ? OR
1594
+ # For forward: filter on source_lang
1595
+ lang_conditions_forward += """ AND (
1596
+ t.source_lang = ? OR
1463
1597
  (t.source_lang IS NULL AND tb.source_lang = ?) OR
1464
1598
  (t.source_lang IS NULL AND tb.source_lang IS NULL)
1465
1599
  )"""
1466
- params.extend([source_lang, source_lang])
1467
-
1600
+ lang_params_forward.extend([source_lang, source_lang])
1601
+ # For reverse: source_lang becomes target_lang (swapped)
1602
+ lang_conditions_reverse += """ AND (
1603
+ t.target_lang = ? OR
1604
+ (t.target_lang IS NULL AND tb.target_lang = ?) OR
1605
+ (t.target_lang IS NULL AND tb.target_lang IS NULL)
1606
+ )"""
1607
+ lang_params_reverse.extend([source_lang, source_lang])
1608
+
1468
1609
  if target_lang:
1469
- query += """ AND (
1470
- t.target_lang = ? OR
1610
+ # For forward: filter on target_lang
1611
+ lang_conditions_forward += """ AND (
1612
+ t.target_lang = ? OR
1471
1613
  (t.target_lang IS NULL AND tb.target_lang = ?) OR
1472
1614
  (t.target_lang IS NULL AND tb.target_lang IS NULL)
1473
1615
  )"""
1474
- params.extend([target_lang, target_lang])
1475
-
1476
- # Project filter: match project-specific terms OR global terms (project_id IS NULL)
1616
+ lang_params_forward.extend([target_lang, target_lang])
1617
+ # For reverse: target_lang becomes source_lang (swapped)
1618
+ lang_conditions_reverse += """ AND (
1619
+ t.source_lang = ? OR
1620
+ (t.source_lang IS NULL AND tb.source_lang = ?) OR
1621
+ (t.source_lang IS NULL AND tb.source_lang IS NULL)
1622
+ )"""
1623
+ lang_params_reverse.extend([target_lang, target_lang])
1624
+
1625
+ # Project filter conditions
1626
+ project_conditions = ""
1627
+ project_params = []
1477
1628
  if project_id:
1478
- query += " AND (t.project_id = ? OR t.project_id IS NULL)"
1479
- params.append(project_id)
1480
-
1629
+ project_conditions = " AND (t.project_id = ? OR t.project_id IS NULL)"
1630
+ project_params = [project_id]
1631
+
1632
+ # Min length conditions
1633
+ min_len_forward = ""
1634
+ min_len_reverse = ""
1481
1635
  if min_length > 0:
1482
- query += f" AND LENGTH(t.source_term) >= {min_length}"
1483
-
1484
- # Sort by ranking (lower number = higher priority)
1485
- # Project termbases (ranking IS NULL) appear first, then by ranking, then alphabetically
1486
- # Use COALESCE to treat NULL as -1 (highest priority)
1487
- query += " ORDER BY COALESCE(tb.ranking, -1) ASC, t.source_term ASC"
1488
-
1636
+ min_len_forward = f" AND LENGTH(t.source_term) >= {min_length}"
1637
+ min_len_reverse = f" AND LENGTH(t.target_term) >= {min_length}"
1638
+
1639
+ # Build forward query
1640
+ forward_query = base_select_forward + lang_conditions_forward + project_conditions + min_len_forward
1641
+ forward_params.extend(lang_params_forward)
1642
+ forward_params.extend(project_params)
1643
+
1644
+ if bidirectional:
1645
+ # Build reverse query
1646
+ reverse_query = base_select_reverse + lang_conditions_reverse + project_conditions + min_len_reverse
1647
+ reverse_params.extend(lang_params_reverse)
1648
+ reverse_params.extend(project_params)
1649
+
1650
+ # Combine with UNION and sort
1651
+ query = f"""
1652
+ SELECT * FROM (
1653
+ {forward_query}
1654
+ UNION ALL
1655
+ {reverse_query}
1656
+ ) combined
1657
+ ORDER BY COALESCE(ranking, -1) ASC, source_term ASC
1658
+ """
1659
+ params = forward_params + reverse_params
1660
+ else:
1661
+ # Original forward-only behavior
1662
+ query = forward_query + " ORDER BY COALESCE(ranking, -1) ASC, source_term ASC"
1663
+ params = forward_params
1664
+
1489
1665
  self.cursor.execute(query, params)
1490
1666
  results = []
1667
+ seen_combinations = set() # Track (source_term, target_term, termbase_id) to avoid duplicates
1668
+
1491
1669
  for row in self.cursor.fetchall():
1492
1670
  result_dict = dict(row)
1671
+
1672
+ # Deduplicate: same term pair from same termbase should only appear once
1673
+ # Prefer 'source' match over 'target' match
1674
+ combo_key = (
1675
+ result_dict.get('source_term', '').lower(),
1676
+ result_dict.get('target_term', '').lower(),
1677
+ result_dict.get('termbase_id')
1678
+ )
1679
+ if combo_key in seen_combinations:
1680
+ continue
1681
+ seen_combinations.add(combo_key)
1682
+
1493
1683
  # SQLite stores booleans as 0/1, explicitly convert to Python bool
1494
1684
  if 'is_project_termbase' in result_dict:
1495
1685
  result_dict['is_project_termbase'] = bool(result_dict['is_project_termbase'])
1496
-
1686
+
1497
1687
  # Fetch target synonyms for this term and include them in the result
1498
1688
  term_id = result_dict.get('id')
1689
+ match_direction = result_dict.get('match_direction', 'source')
1499
1690
  if term_id:
1500
1691
  try:
1692
+ # For reverse matches, fetch 'source' synonyms since they become targets
1693
+ synonym_lang = 'source' if match_direction == 'target' else 'target'
1501
1694
  self.cursor.execute("""
1502
1695
  SELECT synonym_text, forbidden FROM termbase_synonyms
1503
- WHERE term_id = ? AND language = 'target'
1696
+ WHERE term_id = ? AND language = ?
1504
1697
  ORDER BY display_order ASC
1505
- """, (term_id,))
1698
+ """, (term_id, synonym_lang))
1506
1699
  synonyms = []
1507
1700
  for syn_row in self.cursor.fetchall():
1508
1701
  syn_text = syn_row[0]
@@ -1512,7 +1705,7 @@ class DatabaseManager:
1512
1705
  result_dict['target_synonyms'] = synonyms
1513
1706
  except Exception:
1514
1707
  result_dict['target_synonyms'] = []
1515
-
1708
+
1516
1709
  results.append(result_dict)
1517
1710
  return results
1518
1711