supervertaler-1.9.153-py3-none-any.whl → supervertaler-1.9.189-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic. Click here for more details.
- Supervertaler.py +3729 -1195
- modules/database_manager.py +313 -120
- modules/database_migrations.py +54 -7
- modules/extract_tm.py +518 -0
- modules/keyboard_shortcuts_widget.py +7 -0
- modules/mqxliff_handler.py +71 -2
- modules/project_tm.py +320 -0
- modules/superbrowser.py +22 -0
- modules/superlookup.py +12 -8
- modules/tag_manager.py +20 -2
- modules/termbase_manager.py +105 -2
- modules/termview_widget.py +82 -42
- modules/theme_manager.py +41 -4
- modules/tm_metadata_manager.py +59 -13
- modules/translation_memory.py +4 -13
- modules/translation_results_panel.py +0 -7
- modules/unified_prompt_library.py +2 -2
- modules/unified_prompt_manager_qt.py +47 -18
- supervertaler-1.9.189.dist-info/METADATA +151 -0
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.189.dist-info}/RECORD +24 -22
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.189.dist-info}/WHEEL +1 -1
- supervertaler-1.9.153.dist-info/METADATA +0 -896
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.189.dist-info}/entry_points.txt +0 -0
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.189.dist-info}/licenses/LICENSE +0 -0
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.189.dist-info}/top_level.txt +0 -0
modules/database_manager.py
CHANGED
|
@@ -17,12 +17,38 @@ import sqlite3
|
|
|
17
17
|
import os
|
|
18
18
|
import json
|
|
19
19
|
import hashlib
|
|
20
|
+
import unicodedata
|
|
21
|
+
import re
|
|
20
22
|
from datetime import datetime
|
|
21
23
|
from typing import List, Dict, Optional, Tuple
|
|
22
24
|
from pathlib import Path
|
|
23
25
|
from difflib import SequenceMatcher
|
|
24
26
|
|
|
25
27
|
|
|
28
|
+
def _normalize_for_matching(text: str) -> str:
|
|
29
|
+
"""Normalize text for exact matching.
|
|
30
|
+
|
|
31
|
+
Handles invisible differences that would cause exact match to fail:
|
|
32
|
+
- Unicode normalization (NFC)
|
|
33
|
+
- Multiple whitespace -> single space
|
|
34
|
+
- Leading/trailing whitespace
|
|
35
|
+
- Non-breaking spaces -> regular spaces
|
|
36
|
+
"""
|
|
37
|
+
if not text:
|
|
38
|
+
return ""
|
|
39
|
+
# Unicode normalize (NFC form)
|
|
40
|
+
text = unicodedata.normalize('NFC', text)
|
|
41
|
+
# Convert non-breaking spaces and other whitespace to regular space
|
|
42
|
+
text = text.replace('\u00a0', ' ') # NBSP
|
|
43
|
+
text = text.replace('\u2007', ' ') # Figure space
|
|
44
|
+
text = text.replace('\u202f', ' ') # Narrow NBSP
|
|
45
|
+
# Collapse multiple whitespace to single space
|
|
46
|
+
text = re.sub(r'\s+', ' ', text)
|
|
47
|
+
# Strip leading/trailing whitespace
|
|
48
|
+
text = text.strip()
|
|
49
|
+
return text
|
|
50
|
+
|
|
51
|
+
|
|
26
52
|
class DatabaseManager:
|
|
27
53
|
"""Manages SQLite database for translation resources"""
|
|
28
54
|
|
|
@@ -655,22 +681,46 @@ class DatabaseManager:
|
|
|
655
681
|
# TRANSLATION MEMORY METHODS
|
|
656
682
|
# ============================================
|
|
657
683
|
|
|
658
|
-
def add_translation_unit(self, source: str, target: str, source_lang: str,
|
|
684
|
+
def add_translation_unit(self, source: str, target: str, source_lang: str,
|
|
659
685
|
target_lang: str, tm_id: str = 'project',
|
|
660
686
|
project_id: str = None, context_before: str = None,
|
|
661
|
-
context_after: str = None, notes: str = None
|
|
687
|
+
context_after: str = None, notes: str = None,
|
|
688
|
+
overwrite: bool = False) -> int:
|
|
662
689
|
"""
|
|
663
690
|
Add translation unit to database
|
|
664
|
-
|
|
691
|
+
|
|
692
|
+
Args:
|
|
693
|
+
source: Source text
|
|
694
|
+
target: Target text
|
|
695
|
+
source_lang: Source language code
|
|
696
|
+
target_lang: Target language code
|
|
697
|
+
tm_id: TM identifier
|
|
698
|
+
project_id: Optional project ID
|
|
699
|
+
context_before: Optional context before
|
|
700
|
+
context_after: Optional context after
|
|
701
|
+
notes: Optional notes
|
|
702
|
+
overwrite: If True, delete existing entries with same source before inserting
|
|
703
|
+
(implements "Save only latest translation" mode)
|
|
704
|
+
|
|
665
705
|
Returns: ID of inserted/updated entry
|
|
666
706
|
"""
|
|
667
|
-
# Generate hash for
|
|
668
|
-
|
|
669
|
-
|
|
707
|
+
# Generate hash from NORMALIZED source for consistent exact matching
|
|
708
|
+
# This handles invisible differences like Unicode normalization, whitespace variations
|
|
709
|
+
normalized_source = _normalize_for_matching(source)
|
|
710
|
+
source_hash = hashlib.md5(normalized_source.encode('utf-8')).hexdigest()
|
|
711
|
+
|
|
670
712
|
try:
|
|
713
|
+
# If overwrite mode, delete ALL existing entries with same source_hash and tm_id
|
|
714
|
+
# This ensures only the latest translation is kept
|
|
715
|
+
if overwrite:
|
|
716
|
+
self.cursor.execute("""
|
|
717
|
+
DELETE FROM translation_units
|
|
718
|
+
WHERE source_hash = ? AND tm_id = ?
|
|
719
|
+
""", (source_hash, tm_id))
|
|
720
|
+
|
|
671
721
|
self.cursor.execute("""
|
|
672
|
-
INSERT INTO translation_units
|
|
673
|
-
(source_text, target_text, source_lang, target_lang, tm_id,
|
|
722
|
+
INSERT INTO translation_units
|
|
723
|
+
(source_text, target_text, source_lang, target_lang, tm_id,
|
|
674
724
|
project_id, context_before, context_after, source_hash, notes)
|
|
675
725
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
676
726
|
ON CONFLICT(source_hash, target_text, tm_id) DO UPDATE SET
|
|
@@ -678,42 +728,47 @@ class DatabaseManager:
|
|
|
678
728
|
modified_date = CURRENT_TIMESTAMP
|
|
679
729
|
""", (source, target, source_lang, target_lang, tm_id,
|
|
680
730
|
project_id, context_before, context_after, source_hash, notes))
|
|
681
|
-
|
|
731
|
+
|
|
682
732
|
self.connection.commit()
|
|
683
733
|
return self.cursor.lastrowid
|
|
684
|
-
|
|
734
|
+
|
|
685
735
|
except Exception as e:
|
|
686
736
|
self.log(f"Error adding translation unit: {e}")
|
|
687
737
|
return None
|
|
688
738
|
|
|
689
739
|
def get_exact_match(self, source: str, tm_ids: List[str] = None,
|
|
690
|
-
source_lang: str = None, target_lang: str = None,
|
|
740
|
+
source_lang: str = None, target_lang: str = None,
|
|
691
741
|
bidirectional: bool = True) -> Optional[Dict]:
|
|
692
742
|
"""
|
|
693
743
|
Get exact match from TM
|
|
694
|
-
|
|
744
|
+
|
|
695
745
|
Args:
|
|
696
746
|
source: Source text to match
|
|
697
747
|
tm_ids: List of TM IDs to search (None = all)
|
|
698
748
|
source_lang: Filter by source language (base code matching: 'en' matches 'en-US', 'en-GB', etc.)
|
|
699
749
|
target_lang: Filter by target language (base code matching)
|
|
700
750
|
bidirectional: If True, search both directions (nl→en AND en→nl)
|
|
701
|
-
|
|
751
|
+
|
|
702
752
|
Returns: Dictionary with match data or None
|
|
703
753
|
"""
|
|
704
754
|
from modules.tmx_generator import get_base_lang_code
|
|
705
|
-
|
|
755
|
+
|
|
756
|
+
# Try both normalized and non-normalized hashes for backward compatibility
|
|
757
|
+
# This handles invisible differences like Unicode normalization, whitespace variations
|
|
706
758
|
source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()
|
|
707
|
-
|
|
759
|
+
normalized_source = _normalize_for_matching(source)
|
|
760
|
+
normalized_hash = hashlib.md5(normalized_source.encode('utf-8')).hexdigest()
|
|
761
|
+
|
|
708
762
|
# Get base language codes for comparison
|
|
709
763
|
src_base = get_base_lang_code(source_lang) if source_lang else None
|
|
710
764
|
tgt_base = get_base_lang_code(target_lang) if target_lang else None
|
|
711
|
-
|
|
765
|
+
|
|
766
|
+
# Search using both original hash and normalized hash
|
|
712
767
|
query = """
|
|
713
|
-
SELECT * FROM translation_units
|
|
714
|
-
WHERE source_hash = ?
|
|
768
|
+
SELECT * FROM translation_units
|
|
769
|
+
WHERE (source_hash = ? OR source_hash = ?)
|
|
715
770
|
"""
|
|
716
|
-
params = [source_hash,
|
|
771
|
+
params = [source_hash, normalized_hash]
|
|
717
772
|
|
|
718
773
|
if tm_ids:
|
|
719
774
|
placeholders = ','.join('?' * len(tm_ids))
|
|
@@ -840,11 +895,15 @@ class DatabaseManager:
|
|
|
840
895
|
bidirectional: If True, search both directions (nl→en AND en→nl)
|
|
841
896
|
|
|
842
897
|
Returns: List of matches with similarity scores
|
|
898
|
+
|
|
899
|
+
Note: When multiple TMs are provided, searches each TM separately to ensure
|
|
900
|
+
good matches from smaller TMs aren't pushed out by BM25 keyword ranking
|
|
901
|
+
from larger TMs. Results are merged and sorted by actual similarity.
|
|
843
902
|
"""
|
|
844
903
|
# For better FTS5 matching, tokenize the query and escape special chars
|
|
845
904
|
# FTS5 special characters: " ( ) - : , . ! ?
|
|
846
905
|
import re
|
|
847
|
-
from modules.tmx_generator import get_base_lang_code
|
|
906
|
+
from modules.tmx_generator import get_base_lang_code, get_lang_match_variants
|
|
848
907
|
|
|
849
908
|
# Strip HTML/XML tags from source for clean text search
|
|
850
909
|
text_without_tags = re.sub(r'<[^>]+>', '', source)
|
|
@@ -868,22 +927,57 @@ class DatabaseManager:
|
|
|
868
927
|
# This helps find similar long segments more reliably
|
|
869
928
|
search_terms_for_query = all_search_terms[:20]
|
|
870
929
|
|
|
871
|
-
print(f"[DEBUG] search_fuzzy_matches: source='{source[:50]}...', {len(all_search_terms)} terms")
|
|
872
|
-
|
|
873
930
|
if not search_terms_for_query:
|
|
874
931
|
# If no valid terms, return empty results
|
|
875
|
-
print(f"[DEBUG] search_fuzzy_matches: No valid search terms, returning empty")
|
|
876
932
|
return []
|
|
877
933
|
|
|
878
934
|
# Quote each term to prevent FTS5 syntax errors
|
|
879
935
|
fts_query = ' OR '.join(f'"{term}"' for term in search_terms_for_query)
|
|
880
|
-
print(f"[DEBUG] search_fuzzy_matches: FTS query terms = {search_terms_for_query[:10]}...")
|
|
881
936
|
|
|
882
937
|
# Get base language codes for comparison
|
|
883
938
|
src_base = get_base_lang_code(source_lang) if source_lang else None
|
|
884
939
|
tgt_base = get_base_lang_code(target_lang) if target_lang else None
|
|
885
940
|
|
|
886
|
-
#
|
|
941
|
+
# MULTI-TM FIX: Search each TM separately to avoid BM25 ranking issues
|
|
942
|
+
# When a large TM is combined with a small TM, the large TM's many keyword matches
|
|
943
|
+
# push down genuinely similar sentences from the small TM
|
|
944
|
+
tms_to_search = tm_ids if tm_ids else [None] # None means search all TMs together
|
|
945
|
+
|
|
946
|
+
all_results = []
|
|
947
|
+
|
|
948
|
+
for tm_id in tms_to_search:
|
|
949
|
+
# Search this specific TM (or all if tm_id is None)
|
|
950
|
+
tm_results = self._search_single_tm_fuzzy(
|
|
951
|
+
source, fts_query, [tm_id] if tm_id else None,
|
|
952
|
+
threshold, max_results, src_base, tgt_base,
|
|
953
|
+
source_lang, target_lang, bidirectional
|
|
954
|
+
)
|
|
955
|
+
all_results.extend(tm_results)
|
|
956
|
+
|
|
957
|
+
# Deduplicate by source_text (keep highest similarity for each unique source)
|
|
958
|
+
seen = {}
|
|
959
|
+
for result in all_results:
|
|
960
|
+
key = result['source_text']
|
|
961
|
+
if key not in seen or result['similarity'] > seen[key]['similarity']:
|
|
962
|
+
seen[key] = result
|
|
963
|
+
|
|
964
|
+
deduped_results = list(seen.values())
|
|
965
|
+
|
|
966
|
+
# Sort ALL results by similarity (highest first) - this ensures the 76% match
|
|
967
|
+
# appears before 40% matches regardless of which TM they came from
|
|
968
|
+
deduped_results.sort(key=lambda x: x['similarity'], reverse=True)
|
|
969
|
+
|
|
970
|
+
return deduped_results[:max_results]
|
|
971
|
+
|
|
972
|
+
def _search_single_tm_fuzzy(self, source: str, fts_query: str, tm_ids: List[str],
|
|
973
|
+
threshold: float, max_results: int,
|
|
974
|
+
src_base: str, tgt_base: str,
|
|
975
|
+
source_lang: str, target_lang: str,
|
|
976
|
+
bidirectional: bool) -> List[Dict]:
|
|
977
|
+
"""Search a single TM (or all TMs if tm_ids is None) for fuzzy matches"""
|
|
978
|
+
from modules.tmx_generator import get_lang_match_variants
|
|
979
|
+
|
|
980
|
+
# Build query for this TM
|
|
887
981
|
query = """
|
|
888
982
|
SELECT tu.*,
|
|
889
983
|
bm25(translation_units_fts) as relevance
|
|
@@ -893,13 +987,12 @@ class DatabaseManager:
|
|
|
893
987
|
"""
|
|
894
988
|
params = [fts_query]
|
|
895
989
|
|
|
896
|
-
if tm_ids:
|
|
990
|
+
if tm_ids and tm_ids[0] is not None:
|
|
897
991
|
placeholders = ','.join('?' * len(tm_ids))
|
|
898
992
|
query += f" AND tu.tm_id IN ({placeholders})"
|
|
899
993
|
params.extend(tm_ids)
|
|
900
994
|
|
|
901
995
|
# Use flexible language matching (matches 'nl', 'nl-NL', 'Dutch', etc.)
|
|
902
|
-
from modules.tmx_generator import get_lang_match_variants
|
|
903
996
|
if src_base:
|
|
904
997
|
src_variants = get_lang_match_variants(source_lang)
|
|
905
998
|
src_conditions = []
|
|
@@ -920,19 +1013,16 @@ class DatabaseManager:
|
|
|
920
1013
|
params.append(f"{variant}-%")
|
|
921
1014
|
query += f" AND ({' OR '.join(tgt_conditions)})"
|
|
922
1015
|
|
|
923
|
-
#
|
|
924
|
-
#
|
|
925
|
-
#
|
|
1016
|
+
# Per-TM candidate limit - INCREASED to catch more potential fuzzy matches
|
|
1017
|
+
# When multiple TMs are searched, BM25 ranking can push genuinely similar
|
|
1018
|
+
# entries far down the list due to common word matches in other entries
|
|
926
1019
|
candidate_limit = max(500, max_results * 50)
|
|
927
1020
|
query += f" ORDER BY relevance DESC LIMIT {candidate_limit}"
|
|
928
1021
|
|
|
929
|
-
print(f"[DEBUG] search_fuzzy_matches: Executing query (limit={candidate_limit})...")
|
|
930
|
-
|
|
931
1022
|
try:
|
|
932
1023
|
self.cursor.execute(query, params)
|
|
933
1024
|
all_rows = self.cursor.fetchall()
|
|
934
1025
|
except Exception as e:
|
|
935
|
-
print(f"[DEBUG] search_fuzzy_matches: SQL ERROR: {e}")
|
|
936
1026
|
return []
|
|
937
1027
|
|
|
938
1028
|
results = []
|
|
@@ -948,8 +1038,6 @@ class DatabaseManager:
|
|
|
948
1038
|
match_dict['match_pct'] = int(similarity * 100)
|
|
949
1039
|
results.append(match_dict)
|
|
950
1040
|
|
|
951
|
-
print(f"[DEBUG] search_fuzzy_matches: After threshold filter ({threshold}): {len(results)} matches")
|
|
952
|
-
|
|
953
1041
|
# If bidirectional, also search reverse direction
|
|
954
1042
|
if bidirectional and src_base and tgt_base:
|
|
955
1043
|
query = """
|
|
@@ -961,13 +1049,12 @@ class DatabaseManager:
|
|
|
961
1049
|
"""
|
|
962
1050
|
params = [fts_query]
|
|
963
1051
|
|
|
964
|
-
if tm_ids:
|
|
1052
|
+
if tm_ids and tm_ids[0] is not None:
|
|
965
1053
|
placeholders = ','.join('?' * len(tm_ids))
|
|
966
1054
|
query += f" AND tu.tm_id IN ({placeholders})"
|
|
967
1055
|
params.extend(tm_ids)
|
|
968
1056
|
|
|
969
1057
|
# Reversed language filters with flexible matching
|
|
970
|
-
# For reverse: TM target_lang should match our source_lang, TM source_lang should match our target_lang
|
|
971
1058
|
src_variants = get_lang_match_variants(source_lang)
|
|
972
1059
|
tgt_variants = get_lang_match_variants(target_lang)
|
|
973
1060
|
|
|
@@ -991,26 +1078,27 @@ class DatabaseManager:
|
|
|
991
1078
|
|
|
992
1079
|
query += f" ORDER BY relevance DESC LIMIT {max_results * 5}"
|
|
993
1080
|
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
for row in self.cursor.fetchall():
|
|
997
|
-
match_dict = dict(row)
|
|
998
|
-
# Calculate similarity against target_text (since we're reversing)
|
|
999
|
-
similarity = self.calculate_similarity(source, match_dict['target_text'])
|
|
1081
|
+
try:
|
|
1082
|
+
self.cursor.execute(query, params)
|
|
1000
1083
|
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
#
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1084
|
+
for row in self.cursor.fetchall():
|
|
1085
|
+
match_dict = dict(row)
|
|
1086
|
+
# Calculate similarity against target_text (since we're reversing)
|
|
1087
|
+
similarity = self.calculate_similarity(source, match_dict['target_text'])
|
|
1088
|
+
|
|
1089
|
+
# Only include matches above threshold
|
|
1090
|
+
if similarity >= threshold:
|
|
1091
|
+
# Swap source/target for reverse match
|
|
1092
|
+
match_dict['source_text'], match_dict['target_text'] = match_dict['target_text'], match_dict['source_text']
|
|
1093
|
+
match_dict['source_lang'], match_dict['target_lang'] = match_dict['target_lang'], match_dict['source_lang']
|
|
1094
|
+
match_dict['similarity'] = similarity
|
|
1095
|
+
match_dict['match_pct'] = int(similarity * 100)
|
|
1096
|
+
match_dict['reverse_match'] = True
|
|
1097
|
+
results.append(match_dict)
|
|
1098
|
+
except Exception as e:
|
|
1099
|
+
print(f"[DEBUG] _search_single_tm_fuzzy (reverse): SQL ERROR: {e}")
|
|
1100
|
+
|
|
1101
|
+
return results
|
|
1014
1102
|
|
|
1015
1103
|
def search_all(self, source: str, tm_ids: List[str] = None, enabled_only: bool = True,
|
|
1016
1104
|
threshold: float = 0.75, max_results: int = 10) -> List[Dict]:
|
|
@@ -1389,120 +1477,225 @@ class DatabaseManager:
|
|
|
1389
1477
|
# TODO: Implement in Phase 3
|
|
1390
1478
|
pass
|
|
1391
1479
|
|
|
1392
|
-
def search_termbases(self, search_term: str, source_lang: str = None,
|
|
1480
|
+
def search_termbases(self, search_term: str, source_lang: str = None,
|
|
1393
1481
|
target_lang: str = None, project_id: str = None,
|
|
1394
|
-
min_length: int = 0) -> List[Dict]:
|
|
1482
|
+
min_length: int = 0, bidirectional: bool = True) -> List[Dict]:
|
|
1395
1483
|
"""
|
|
1396
|
-
Search termbases for matching
|
|
1397
|
-
|
|
1484
|
+
Search termbases for matching terms (bidirectional by default)
|
|
1485
|
+
|
|
1398
1486
|
Args:
|
|
1399
|
-
search_term:
|
|
1487
|
+
search_term: Term to search for
|
|
1400
1488
|
source_lang: Filter by source language (optional)
|
|
1401
1489
|
target_lang: Filter by target language (optional)
|
|
1402
1490
|
project_id: Filter by project (optional)
|
|
1403
1491
|
min_length: Minimum term length to return
|
|
1404
|
-
|
|
1492
|
+
bidirectional: If True, also search target_term and swap results (default True)
|
|
1493
|
+
|
|
1405
1494
|
Returns:
|
|
1406
1495
|
List of termbase hits, sorted by priority (lower = higher priority)
|
|
1496
|
+
Each result includes 'match_direction' ('source' or 'target') indicating
|
|
1497
|
+
which column matched. For 'target' matches, source_term and target_term
|
|
1498
|
+
are swapped so results are always oriented correctly for the current project.
|
|
1407
1499
|
"""
|
|
1408
1500
|
# Build query with filters - include termbase name and ranking via JOIN
|
|
1409
1501
|
# Note: termbase_id is stored as TEXT in termbase_terms but INTEGER in termbases
|
|
1410
1502
|
# Use CAST to ensure proper comparison
|
|
1411
1503
|
# IMPORTANT: Join with termbase_activation to get the ACTUAL priority for this project
|
|
1412
1504
|
# CRITICAL FIX: Also match when search_term starts with the glossary term
|
|
1413
|
-
# This handles cases like searching for "ca." when glossary has "ca."
|
|
1505
|
+
# This handles cases like searching for "ca." when glossary has "ca."
|
|
1414
1506
|
# AND searching for "ca" when glossary has "ca."
|
|
1415
1507
|
# We also strip trailing punctuation from glossary terms for comparison
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1508
|
+
|
|
1509
|
+
# Build matching conditions for a given column
|
|
1510
|
+
def build_match_conditions(column: str) -> str:
|
|
1511
|
+
return f"""(
|
|
1512
|
+
LOWER(t.{column}) = LOWER(?) OR
|
|
1513
|
+
LOWER(t.{column}) LIKE LOWER(?) OR
|
|
1514
|
+
LOWER(t.{column}) LIKE LOWER(?) OR
|
|
1515
|
+
LOWER(t.{column}) LIKE LOWER(?) OR
|
|
1516
|
+
LOWER(RTRIM(t.{column}, '.!?,;:')) = LOWER(?) OR
|
|
1517
|
+
LOWER(?) LIKE LOWER(t.{column}) || '%' OR
|
|
1518
|
+
LOWER(?) = LOWER(RTRIM(t.{column}, '.!?,;:'))
|
|
1519
|
+
)"""
|
|
1520
|
+
|
|
1521
|
+
# Build match params for one direction
|
|
1522
|
+
def build_match_params() -> list:
|
|
1523
|
+
return [
|
|
1524
|
+
search_term,
|
|
1525
|
+
f"{search_term} %",
|
|
1526
|
+
f"% {search_term}",
|
|
1527
|
+
f"% {search_term} %",
|
|
1528
|
+
search_term, # For RTRIM comparison
|
|
1529
|
+
search_term, # For reverse LIKE
|
|
1530
|
+
search_term # For reverse RTRIM comparison
|
|
1531
|
+
]
|
|
1532
|
+
|
|
1533
|
+
# Matching patterns:
|
|
1534
|
+
# 1. Exact match: column = search_term
|
|
1535
|
+
# 2. Glossary term starts with search: column LIKE "search_term %"
|
|
1536
|
+
# 3. Glossary term ends with search: column LIKE "% search_term"
|
|
1537
|
+
# 4. Glossary term contains search: column LIKE "% search_term %"
|
|
1538
|
+
# 5. Glossary term (stripped) = search_term: RTRIM(column) = search_term (handles "ca." = "ca")
|
|
1539
|
+
# 6. Search starts with glossary term: search_term LIKE column || '%'
|
|
1540
|
+
# 7. Search = glossary term stripped: search_term = RTRIM(column)
|
|
1541
|
+
|
|
1542
|
+
# Base SELECT for forward matches (source_term matches)
|
|
1543
|
+
base_select_forward = """
|
|
1544
|
+
SELECT
|
|
1545
|
+
t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
|
|
1419
1546
|
t.forbidden, t.source_lang, t.target_lang, t.definition, t.domain,
|
|
1420
1547
|
t.notes, t.project, t.client,
|
|
1421
1548
|
tb.name as termbase_name,
|
|
1422
1549
|
tb.source_lang as termbase_source_lang,
|
|
1423
1550
|
tb.target_lang as termbase_target_lang,
|
|
1424
1551
|
tb.is_project_termbase,
|
|
1425
|
-
COALESCE(ta.priority, tb.ranking) as ranking
|
|
1552
|
+
COALESCE(ta.priority, tb.ranking) as ranking,
|
|
1553
|
+
'source' as match_direction
|
|
1426
1554
|
FROM termbase_terms t
|
|
1427
1555
|
LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
|
|
1428
1556
|
LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
|
|
1429
|
-
WHERE
|
|
1430
|
-
LOWER(t.source_term) = LOWER(?) OR
|
|
1431
|
-
LOWER(t.source_term) LIKE LOWER(?) OR
|
|
1432
|
-
LOWER(t.source_term) LIKE LOWER(?) OR
|
|
1433
|
-
LOWER(t.source_term) LIKE LOWER(?) OR
|
|
1434
|
-
LOWER(RTRIM(t.source_term, '.!?,;:')) = LOWER(?) OR
|
|
1435
|
-
LOWER(?) LIKE LOWER(t.source_term) || '%' OR
|
|
1436
|
-
LOWER(?) = LOWER(RTRIM(t.source_term, '.!?,;:'))
|
|
1437
|
-
)
|
|
1557
|
+
WHERE {match_conditions}
|
|
1438
1558
|
AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
|
|
1439
|
-
"""
|
|
1440
|
-
|
|
1441
|
-
#
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1559
|
+
""".format(match_conditions=build_match_conditions('source_term'))
|
|
1560
|
+
|
|
1561
|
+
# Base SELECT for reverse matches (target_term matches) - swap source/target in output
|
|
1562
|
+
base_select_reverse = """
|
|
1563
|
+
SELECT
|
|
1564
|
+
t.id, t.target_term as source_term, t.source_term as target_term,
|
|
1565
|
+
t.termbase_id, t.priority,
|
|
1566
|
+
t.forbidden, t.target_lang as source_lang, t.source_lang as target_lang,
|
|
1567
|
+
t.definition, t.domain,
|
|
1568
|
+
t.notes, t.project, t.client,
|
|
1569
|
+
tb.name as termbase_name,
|
|
1570
|
+
tb.target_lang as termbase_source_lang,
|
|
1571
|
+
tb.source_lang as termbase_target_lang,
|
|
1572
|
+
tb.is_project_termbase,
|
|
1573
|
+
COALESCE(ta.priority, tb.ranking) as ranking,
|
|
1574
|
+
'target' as match_direction
|
|
1575
|
+
FROM termbase_terms t
|
|
1576
|
+
LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
|
|
1577
|
+
LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
|
|
1578
|
+
WHERE {match_conditions}
|
|
1579
|
+
AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
|
|
1580
|
+
""".format(match_conditions=build_match_conditions('target_term'))
|
|
1581
|
+
|
|
1582
|
+
# Build params
|
|
1583
|
+
project_param = project_id if project_id else 0
|
|
1584
|
+
forward_params = [project_param] + build_match_params()
|
|
1585
|
+
reverse_params = [project_param] + build_match_params()
|
|
1586
|
+
|
|
1587
|
+
# Build language filter conditions
|
|
1588
|
+
lang_conditions_forward = ""
|
|
1589
|
+
lang_conditions_reverse = ""
|
|
1590
|
+
lang_params_forward = []
|
|
1591
|
+
lang_params_reverse = []
|
|
1592
|
+
|
|
1460
1593
|
if source_lang:
|
|
1461
|
-
|
|
1462
|
-
|
|
1594
|
+
# For forward: filter on source_lang
|
|
1595
|
+
lang_conditions_forward += """ AND (
|
|
1596
|
+
t.source_lang = ? OR
|
|
1463
1597
|
(t.source_lang IS NULL AND tb.source_lang = ?) OR
|
|
1464
1598
|
(t.source_lang IS NULL AND tb.source_lang IS NULL)
|
|
1465
1599
|
)"""
|
|
1466
|
-
|
|
1467
|
-
|
|
1600
|
+
lang_params_forward.extend([source_lang, source_lang])
|
|
1601
|
+
# For reverse: source_lang becomes target_lang (swapped)
|
|
1602
|
+
lang_conditions_reverse += """ AND (
|
|
1603
|
+
t.target_lang = ? OR
|
|
1604
|
+
(t.target_lang IS NULL AND tb.target_lang = ?) OR
|
|
1605
|
+
(t.target_lang IS NULL AND tb.target_lang IS NULL)
|
|
1606
|
+
)"""
|
|
1607
|
+
lang_params_reverse.extend([source_lang, source_lang])
|
|
1608
|
+
|
|
1468
1609
|
if target_lang:
|
|
1469
|
-
|
|
1470
|
-
|
|
1610
|
+
# For forward: filter on target_lang
|
|
1611
|
+
lang_conditions_forward += """ AND (
|
|
1612
|
+
t.target_lang = ? OR
|
|
1471
1613
|
(t.target_lang IS NULL AND tb.target_lang = ?) OR
|
|
1472
1614
|
(t.target_lang IS NULL AND tb.target_lang IS NULL)
|
|
1473
1615
|
)"""
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1616
|
+
lang_params_forward.extend([target_lang, target_lang])
|
|
1617
|
+
# For reverse: target_lang becomes source_lang (swapped)
|
|
1618
|
+
lang_conditions_reverse += """ AND (
|
|
1619
|
+
t.source_lang = ? OR
|
|
1620
|
+
(t.source_lang IS NULL AND tb.source_lang = ?) OR
|
|
1621
|
+
(t.source_lang IS NULL AND tb.source_lang IS NULL)
|
|
1622
|
+
)"""
|
|
1623
|
+
lang_params_reverse.extend([target_lang, target_lang])
|
|
1624
|
+
|
|
1625
|
+
# Project filter conditions
|
|
1626
|
+
project_conditions = ""
|
|
1627
|
+
project_params = []
|
|
1477
1628
|
if project_id:
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1629
|
+
project_conditions = " AND (t.project_id = ? OR t.project_id IS NULL)"
|
|
1630
|
+
project_params = [project_id]
|
|
1631
|
+
|
|
1632
|
+
# Min length conditions
|
|
1633
|
+
min_len_forward = ""
|
|
1634
|
+
min_len_reverse = ""
|
|
1481
1635
|
if min_length > 0:
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
#
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1636
|
+
min_len_forward = f" AND LENGTH(t.source_term) >= {min_length}"
|
|
1637
|
+
min_len_reverse = f" AND LENGTH(t.target_term) >= {min_length}"
|
|
1638
|
+
|
|
1639
|
+
# Build forward query
|
|
1640
|
+
forward_query = base_select_forward + lang_conditions_forward + project_conditions + min_len_forward
|
|
1641
|
+
forward_params.extend(lang_params_forward)
|
|
1642
|
+
forward_params.extend(project_params)
|
|
1643
|
+
|
|
1644
|
+
if bidirectional:
|
|
1645
|
+
# Build reverse query
|
|
1646
|
+
reverse_query = base_select_reverse + lang_conditions_reverse + project_conditions + min_len_reverse
|
|
1647
|
+
reverse_params.extend(lang_params_reverse)
|
|
1648
|
+
reverse_params.extend(project_params)
|
|
1649
|
+
|
|
1650
|
+
# Combine with UNION and sort
|
|
1651
|
+
query = f"""
|
|
1652
|
+
SELECT * FROM (
|
|
1653
|
+
{forward_query}
|
|
1654
|
+
UNION ALL
|
|
1655
|
+
{reverse_query}
|
|
1656
|
+
) combined
|
|
1657
|
+
ORDER BY COALESCE(ranking, -1) ASC, source_term ASC
|
|
1658
|
+
"""
|
|
1659
|
+
params = forward_params + reverse_params
|
|
1660
|
+
else:
|
|
1661
|
+
# Original forward-only behavior
|
|
1662
|
+
query = forward_query + " ORDER BY COALESCE(ranking, -1) ASC, source_term ASC"
|
|
1663
|
+
params = forward_params
|
|
1664
|
+
|
|
1489
1665
|
self.cursor.execute(query, params)
|
|
1490
1666
|
results = []
|
|
1667
|
+
seen_combinations = set() # Track (source_term, target_term, termbase_id) to avoid duplicates
|
|
1668
|
+
|
|
1491
1669
|
for row in self.cursor.fetchall():
|
|
1492
1670
|
result_dict = dict(row)
|
|
1671
|
+
|
|
1672
|
+
# Deduplicate: same term pair from same termbase should only appear once
|
|
1673
|
+
# Prefer 'source' match over 'target' match
|
|
1674
|
+
combo_key = (
|
|
1675
|
+
result_dict.get('source_term', '').lower(),
|
|
1676
|
+
result_dict.get('target_term', '').lower(),
|
|
1677
|
+
result_dict.get('termbase_id')
|
|
1678
|
+
)
|
|
1679
|
+
if combo_key in seen_combinations:
|
|
1680
|
+
continue
|
|
1681
|
+
seen_combinations.add(combo_key)
|
|
1682
|
+
|
|
1493
1683
|
# SQLite stores booleans as 0/1, explicitly convert to Python bool
|
|
1494
1684
|
if 'is_project_termbase' in result_dict:
|
|
1495
1685
|
result_dict['is_project_termbase'] = bool(result_dict['is_project_termbase'])
|
|
1496
|
-
|
|
1686
|
+
|
|
1497
1687
|
# Fetch target synonyms for this term and include them in the result
|
|
1498
1688
|
term_id = result_dict.get('id')
|
|
1689
|
+
match_direction = result_dict.get('match_direction', 'source')
|
|
1499
1690
|
if term_id:
|
|
1500
1691
|
try:
|
|
1692
|
+
# For reverse matches, fetch 'source' synonyms since they become targets
|
|
1693
|
+
synonym_lang = 'source' if match_direction == 'target' else 'target'
|
|
1501
1694
|
self.cursor.execute("""
|
|
1502
1695
|
SELECT synonym_text, forbidden FROM termbase_synonyms
|
|
1503
|
-
WHERE term_id = ? AND language =
|
|
1696
|
+
WHERE term_id = ? AND language = ?
|
|
1504
1697
|
ORDER BY display_order ASC
|
|
1505
|
-
""", (term_id,))
|
|
1698
|
+
""", (term_id, synonym_lang))
|
|
1506
1699
|
synonyms = []
|
|
1507
1700
|
for syn_row in self.cursor.fetchall():
|
|
1508
1701
|
syn_text = syn_row[0]
|
|
@@ -1512,7 +1705,7 @@ class DatabaseManager:
|
|
|
1512
1705
|
result_dict['target_synonyms'] = synonyms
|
|
1513
1706
|
except Exception:
|
|
1514
1707
|
result_dict['target_synonyms'] = []
|
|
1515
|
-
|
|
1708
|
+
|
|
1516
1709
|
results.append(result_dict)
|
|
1517
1710
|
return results
|
|
1518
1711
|
|