supervertaler 1.9.172__py3-none-any.whl → 1.9.180__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- Supervertaler.py +1133 -310
- modules/database_manager.py +243 -83
- modules/database_migrations.py +54 -7
- modules/mqxliff_handler.py +71 -2
- modules/termbase_manager.py +105 -1
- modules/theme_manager.py +41 -4
- modules/tm_metadata_manager.py +23 -18
- modules/translation_memory.py +2 -2
- modules/unified_prompt_library.py +2 -2
- modules/unified_prompt_manager_qt.py +35 -18
- supervertaler-1.9.180.dist-info/METADATA +151 -0
- {supervertaler-1.9.172.dist-info → supervertaler-1.9.180.dist-info}/RECORD +16 -16
- supervertaler-1.9.172.dist-info/METADATA +0 -930
- {supervertaler-1.9.172.dist-info → supervertaler-1.9.180.dist-info}/WHEEL +0 -0
- {supervertaler-1.9.172.dist-info → supervertaler-1.9.180.dist-info}/entry_points.txt +0 -0
- {supervertaler-1.9.172.dist-info → supervertaler-1.9.180.dist-info}/licenses/LICENSE +0 -0
- {supervertaler-1.9.172.dist-info → supervertaler-1.9.180.dist-info}/top_level.txt +0 -0
modules/database_manager.py
CHANGED
|
@@ -17,12 +17,38 @@ import sqlite3
|
|
|
17
17
|
import os
|
|
18
18
|
import json
|
|
19
19
|
import hashlib
|
|
20
|
+
import unicodedata
|
|
21
|
+
import re
|
|
20
22
|
from datetime import datetime
|
|
21
23
|
from typing import List, Dict, Optional, Tuple
|
|
22
24
|
from pathlib import Path
|
|
23
25
|
from difflib import SequenceMatcher
|
|
24
26
|
|
|
25
27
|
|
|
28
|
+
def _normalize_for_matching(text: str) -> str:
|
|
29
|
+
"""Normalize text for exact matching.
|
|
30
|
+
|
|
31
|
+
Handles invisible differences that would cause exact match to fail:
|
|
32
|
+
- Unicode normalization (NFC)
|
|
33
|
+
- Multiple whitespace -> single space
|
|
34
|
+
- Leading/trailing whitespace
|
|
35
|
+
- Non-breaking spaces -> regular spaces
|
|
36
|
+
"""
|
|
37
|
+
if not text:
|
|
38
|
+
return ""
|
|
39
|
+
# Unicode normalize (NFC form)
|
|
40
|
+
text = unicodedata.normalize('NFC', text)
|
|
41
|
+
# Convert non-breaking spaces and other whitespace to regular space
|
|
42
|
+
text = text.replace('\u00a0', ' ') # NBSP
|
|
43
|
+
text = text.replace('\u2007', ' ') # Figure space
|
|
44
|
+
text = text.replace('\u202f', ' ') # Narrow NBSP
|
|
45
|
+
# Collapse multiple whitespace to single space
|
|
46
|
+
text = re.sub(r'\s+', ' ', text)
|
|
47
|
+
# Strip leading/trailing whitespace
|
|
48
|
+
text = text.strip()
|
|
49
|
+
return text
|
|
50
|
+
|
|
51
|
+
|
|
26
52
|
class DatabaseManager:
|
|
27
53
|
"""Manages SQLite database for translation resources"""
|
|
28
54
|
|
|
@@ -655,22 +681,46 @@ class DatabaseManager:
|
|
|
655
681
|
# TRANSLATION MEMORY METHODS
|
|
656
682
|
# ============================================
|
|
657
683
|
|
|
658
|
-
def add_translation_unit(self, source: str, target: str, source_lang: str,
|
|
684
|
+
def add_translation_unit(self, source: str, target: str, source_lang: str,
|
|
659
685
|
target_lang: str, tm_id: str = 'project',
|
|
660
686
|
project_id: str = None, context_before: str = None,
|
|
661
|
-
context_after: str = None, notes: str = None
|
|
687
|
+
context_after: str = None, notes: str = None,
|
|
688
|
+
overwrite: bool = False) -> int:
|
|
662
689
|
"""
|
|
663
690
|
Add translation unit to database
|
|
664
|
-
|
|
691
|
+
|
|
692
|
+
Args:
|
|
693
|
+
source: Source text
|
|
694
|
+
target: Target text
|
|
695
|
+
source_lang: Source language code
|
|
696
|
+
target_lang: Target language code
|
|
697
|
+
tm_id: TM identifier
|
|
698
|
+
project_id: Optional project ID
|
|
699
|
+
context_before: Optional context before
|
|
700
|
+
context_after: Optional context after
|
|
701
|
+
notes: Optional notes
|
|
702
|
+
overwrite: If True, delete existing entries with same source before inserting
|
|
703
|
+
(implements "Save only latest translation" mode)
|
|
704
|
+
|
|
665
705
|
Returns: ID of inserted/updated entry
|
|
666
706
|
"""
|
|
667
|
-
# Generate hash for
|
|
668
|
-
|
|
669
|
-
|
|
707
|
+
# Generate hash from NORMALIZED source for consistent exact matching
|
|
708
|
+
# This handles invisible differences like Unicode normalization, whitespace variations
|
|
709
|
+
normalized_source = _normalize_for_matching(source)
|
|
710
|
+
source_hash = hashlib.md5(normalized_source.encode('utf-8')).hexdigest()
|
|
711
|
+
|
|
670
712
|
try:
|
|
713
|
+
# If overwrite mode, delete ALL existing entries with same source_hash and tm_id
|
|
714
|
+
# This ensures only the latest translation is kept
|
|
715
|
+
if overwrite:
|
|
716
|
+
self.cursor.execute("""
|
|
717
|
+
DELETE FROM translation_units
|
|
718
|
+
WHERE source_hash = ? AND tm_id = ?
|
|
719
|
+
""", (source_hash, tm_id))
|
|
720
|
+
|
|
671
721
|
self.cursor.execute("""
|
|
672
|
-
INSERT INTO translation_units
|
|
673
|
-
(source_text, target_text, source_lang, target_lang, tm_id,
|
|
722
|
+
INSERT INTO translation_units
|
|
723
|
+
(source_text, target_text, source_lang, target_lang, tm_id,
|
|
674
724
|
project_id, context_before, context_after, source_hash, notes)
|
|
675
725
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
676
726
|
ON CONFLICT(source_hash, target_text, tm_id) DO UPDATE SET
|
|
@@ -678,42 +728,47 @@ class DatabaseManager:
|
|
|
678
728
|
modified_date = CURRENT_TIMESTAMP
|
|
679
729
|
""", (source, target, source_lang, target_lang, tm_id,
|
|
680
730
|
project_id, context_before, context_after, source_hash, notes))
|
|
681
|
-
|
|
731
|
+
|
|
682
732
|
self.connection.commit()
|
|
683
733
|
return self.cursor.lastrowid
|
|
684
|
-
|
|
734
|
+
|
|
685
735
|
except Exception as e:
|
|
686
736
|
self.log(f"Error adding translation unit: {e}")
|
|
687
737
|
return None
|
|
688
738
|
|
|
689
739
|
def get_exact_match(self, source: str, tm_ids: List[str] = None,
|
|
690
|
-
source_lang: str = None, target_lang: str = None,
|
|
740
|
+
source_lang: str = None, target_lang: str = None,
|
|
691
741
|
bidirectional: bool = True) -> Optional[Dict]:
|
|
692
742
|
"""
|
|
693
743
|
Get exact match from TM
|
|
694
|
-
|
|
744
|
+
|
|
695
745
|
Args:
|
|
696
746
|
source: Source text to match
|
|
697
747
|
tm_ids: List of TM IDs to search (None = all)
|
|
698
748
|
source_lang: Filter by source language (base code matching: 'en' matches 'en-US', 'en-GB', etc.)
|
|
699
749
|
target_lang: Filter by target language (base code matching)
|
|
700
750
|
bidirectional: If True, search both directions (nl→en AND en→nl)
|
|
701
|
-
|
|
751
|
+
|
|
702
752
|
Returns: Dictionary with match data or None
|
|
703
753
|
"""
|
|
704
754
|
from modules.tmx_generator import get_base_lang_code
|
|
705
|
-
|
|
755
|
+
|
|
756
|
+
# Try both normalized and non-normalized hashes for backward compatibility
|
|
757
|
+
# This handles invisible differences like Unicode normalization, whitespace variations
|
|
706
758
|
source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()
|
|
707
|
-
|
|
759
|
+
normalized_source = _normalize_for_matching(source)
|
|
760
|
+
normalized_hash = hashlib.md5(normalized_source.encode('utf-8')).hexdigest()
|
|
761
|
+
|
|
708
762
|
# Get base language codes for comparison
|
|
709
763
|
src_base = get_base_lang_code(source_lang) if source_lang else None
|
|
710
764
|
tgt_base = get_base_lang_code(target_lang) if target_lang else None
|
|
711
|
-
|
|
765
|
+
|
|
766
|
+
# Search using both original hash and normalized hash
|
|
712
767
|
query = """
|
|
713
|
-
SELECT * FROM translation_units
|
|
714
|
-
WHERE source_hash = ?
|
|
768
|
+
SELECT * FROM translation_units
|
|
769
|
+
WHERE (source_hash = ? OR source_hash = ?)
|
|
715
770
|
"""
|
|
716
|
-
params = [source_hash,
|
|
771
|
+
params = [source_hash, normalized_hash]
|
|
717
772
|
|
|
718
773
|
if tm_ids:
|
|
719
774
|
placeholders = ','.join('?' * len(tm_ids))
|
|
@@ -1422,120 +1477,225 @@ class DatabaseManager:
|
|
|
1422
1477
|
# TODO: Implement in Phase 3
|
|
1423
1478
|
pass
|
|
1424
1479
|
|
|
1425
|
-
def search_termbases(self, search_term: str, source_lang: str = None,
|
|
1480
|
+
def search_termbases(self, search_term: str, source_lang: str = None,
|
|
1426
1481
|
target_lang: str = None, project_id: str = None,
|
|
1427
|
-
min_length: int = 0) -> List[Dict]:
|
|
1482
|
+
min_length: int = 0, bidirectional: bool = True) -> List[Dict]:
|
|
1428
1483
|
"""
|
|
1429
|
-
Search termbases for matching
|
|
1430
|
-
|
|
1484
|
+
Search termbases for matching terms (bidirectional by default)
|
|
1485
|
+
|
|
1431
1486
|
Args:
|
|
1432
|
-
search_term:
|
|
1487
|
+
search_term: Term to search for
|
|
1433
1488
|
source_lang: Filter by source language (optional)
|
|
1434
1489
|
target_lang: Filter by target language (optional)
|
|
1435
1490
|
project_id: Filter by project (optional)
|
|
1436
1491
|
min_length: Minimum term length to return
|
|
1437
|
-
|
|
1492
|
+
bidirectional: If True, also search target_term and swap results (default True)
|
|
1493
|
+
|
|
1438
1494
|
Returns:
|
|
1439
1495
|
List of termbase hits, sorted by priority (lower = higher priority)
|
|
1496
|
+
Each result includes 'match_direction' ('source' or 'target') indicating
|
|
1497
|
+
which column matched. For 'target' matches, source_term and target_term
|
|
1498
|
+
are swapped so results are always oriented correctly for the current project.
|
|
1440
1499
|
"""
|
|
1441
1500
|
# Build query with filters - include termbase name and ranking via JOIN
|
|
1442
1501
|
# Note: termbase_id is stored as TEXT in termbase_terms but INTEGER in termbases
|
|
1443
1502
|
# Use CAST to ensure proper comparison
|
|
1444
1503
|
# IMPORTANT: Join with termbase_activation to get the ACTUAL priority for this project
|
|
1445
1504
|
# CRITICAL FIX: Also match when search_term starts with the glossary term
|
|
1446
|
-
# This handles cases like searching for "ca." when glossary has "ca."
|
|
1505
|
+
# This handles cases like searching for "ca." when glossary has "ca."
|
|
1447
1506
|
# AND searching for "ca" when glossary has "ca."
|
|
1448
1507
|
# We also strip trailing punctuation from glossary terms for comparison
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1508
|
+
|
|
1509
|
+
# Build matching conditions for a given column
|
|
1510
|
+
def build_match_conditions(column: str) -> str:
|
|
1511
|
+
return f"""(
|
|
1512
|
+
LOWER(t.{column}) = LOWER(?) OR
|
|
1513
|
+
LOWER(t.{column}) LIKE LOWER(?) OR
|
|
1514
|
+
LOWER(t.{column}) LIKE LOWER(?) OR
|
|
1515
|
+
LOWER(t.{column}) LIKE LOWER(?) OR
|
|
1516
|
+
LOWER(RTRIM(t.{column}, '.!?,;:')) = LOWER(?) OR
|
|
1517
|
+
LOWER(?) LIKE LOWER(t.{column}) || '%' OR
|
|
1518
|
+
LOWER(?) = LOWER(RTRIM(t.{column}, '.!?,;:'))
|
|
1519
|
+
)"""
|
|
1520
|
+
|
|
1521
|
+
# Build match params for one direction
|
|
1522
|
+
def build_match_params() -> list:
|
|
1523
|
+
return [
|
|
1524
|
+
search_term,
|
|
1525
|
+
f"{search_term} %",
|
|
1526
|
+
f"% {search_term}",
|
|
1527
|
+
f"% {search_term} %",
|
|
1528
|
+
search_term, # For RTRIM comparison
|
|
1529
|
+
search_term, # For reverse LIKE
|
|
1530
|
+
search_term # For reverse RTRIM comparison
|
|
1531
|
+
]
|
|
1532
|
+
|
|
1533
|
+
# Matching patterns:
|
|
1534
|
+
# 1. Exact match: column = search_term
|
|
1535
|
+
# 2. Glossary term starts with search: column LIKE "search_term %"
|
|
1536
|
+
# 3. Glossary term ends with search: column LIKE "% search_term"
|
|
1537
|
+
# 4. Glossary term contains search: column LIKE "% search_term %"
|
|
1538
|
+
# 5. Glossary term (stripped) = search_term: RTRIM(column) = search_term (handles "ca." = "ca")
|
|
1539
|
+
# 6. Search starts with glossary term: search_term LIKE column || '%'
|
|
1540
|
+
# 7. Search = glossary term stripped: search_term = RTRIM(column)
|
|
1541
|
+
|
|
1542
|
+
# Base SELECT for forward matches (source_term matches)
|
|
1543
|
+
base_select_forward = """
|
|
1544
|
+
SELECT
|
|
1545
|
+
t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
|
|
1452
1546
|
t.forbidden, t.source_lang, t.target_lang, t.definition, t.domain,
|
|
1453
1547
|
t.notes, t.project, t.client,
|
|
1454
1548
|
tb.name as termbase_name,
|
|
1455
1549
|
tb.source_lang as termbase_source_lang,
|
|
1456
1550
|
tb.target_lang as termbase_target_lang,
|
|
1457
1551
|
tb.is_project_termbase,
|
|
1458
|
-
COALESCE(ta.priority, tb.ranking) as ranking
|
|
1552
|
+
COALESCE(ta.priority, tb.ranking) as ranking,
|
|
1553
|
+
'source' as match_direction
|
|
1459
1554
|
FROM termbase_terms t
|
|
1460
1555
|
LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
|
|
1461
1556
|
LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
|
|
1462
|
-
WHERE
|
|
1463
|
-
LOWER(t.source_term) = LOWER(?) OR
|
|
1464
|
-
LOWER(t.source_term) LIKE LOWER(?) OR
|
|
1465
|
-
LOWER(t.source_term) LIKE LOWER(?) OR
|
|
1466
|
-
LOWER(t.source_term) LIKE LOWER(?) OR
|
|
1467
|
-
LOWER(RTRIM(t.source_term, '.!?,;:')) = LOWER(?) OR
|
|
1468
|
-
LOWER(?) LIKE LOWER(t.source_term) || '%' OR
|
|
1469
|
-
LOWER(?) = LOWER(RTRIM(t.source_term, '.!?,;:'))
|
|
1470
|
-
)
|
|
1557
|
+
WHERE {match_conditions}
|
|
1471
1558
|
AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
|
|
1472
|
-
"""
|
|
1473
|
-
|
|
1474
|
-
#
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1559
|
+
""".format(match_conditions=build_match_conditions('source_term'))
|
|
1560
|
+
|
|
1561
|
+
# Base SELECT for reverse matches (target_term matches) - swap source/target in output
|
|
1562
|
+
base_select_reverse = """
|
|
1563
|
+
SELECT
|
|
1564
|
+
t.id, t.target_term as source_term, t.source_term as target_term,
|
|
1565
|
+
t.termbase_id, t.priority,
|
|
1566
|
+
t.forbidden, t.target_lang as source_lang, t.source_lang as target_lang,
|
|
1567
|
+
t.definition, t.domain,
|
|
1568
|
+
t.notes, t.project, t.client,
|
|
1569
|
+
tb.name as termbase_name,
|
|
1570
|
+
tb.target_lang as termbase_source_lang,
|
|
1571
|
+
tb.source_lang as termbase_target_lang,
|
|
1572
|
+
tb.is_project_termbase,
|
|
1573
|
+
COALESCE(ta.priority, tb.ranking) as ranking,
|
|
1574
|
+
'target' as match_direction
|
|
1575
|
+
FROM termbase_terms t
|
|
1576
|
+
LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
|
|
1577
|
+
LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
|
|
1578
|
+
WHERE {match_conditions}
|
|
1579
|
+
AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
|
|
1580
|
+
""".format(match_conditions=build_match_conditions('target_term'))
|
|
1581
|
+
|
|
1582
|
+
# Build params
|
|
1583
|
+
project_param = project_id if project_id else 0
|
|
1584
|
+
forward_params = [project_param] + build_match_params()
|
|
1585
|
+
reverse_params = [project_param] + build_match_params()
|
|
1586
|
+
|
|
1587
|
+
# Build language filter conditions
|
|
1588
|
+
lang_conditions_forward = ""
|
|
1589
|
+
lang_conditions_reverse = ""
|
|
1590
|
+
lang_params_forward = []
|
|
1591
|
+
lang_params_reverse = []
|
|
1592
|
+
|
|
1493
1593
|
if source_lang:
|
|
1494
|
-
|
|
1495
|
-
|
|
1594
|
+
# For forward: filter on source_lang
|
|
1595
|
+
lang_conditions_forward += """ AND (
|
|
1596
|
+
t.source_lang = ? OR
|
|
1496
1597
|
(t.source_lang IS NULL AND tb.source_lang = ?) OR
|
|
1497
1598
|
(t.source_lang IS NULL AND tb.source_lang IS NULL)
|
|
1498
1599
|
)"""
|
|
1499
|
-
|
|
1500
|
-
|
|
1600
|
+
lang_params_forward.extend([source_lang, source_lang])
|
|
1601
|
+
# For reverse: source_lang becomes target_lang (swapped)
|
|
1602
|
+
lang_conditions_reverse += """ AND (
|
|
1603
|
+
t.target_lang = ? OR
|
|
1604
|
+
(t.target_lang IS NULL AND tb.target_lang = ?) OR
|
|
1605
|
+
(t.target_lang IS NULL AND tb.target_lang IS NULL)
|
|
1606
|
+
)"""
|
|
1607
|
+
lang_params_reverse.extend([source_lang, source_lang])
|
|
1608
|
+
|
|
1501
1609
|
if target_lang:
|
|
1502
|
-
|
|
1503
|
-
|
|
1610
|
+
# For forward: filter on target_lang
|
|
1611
|
+
lang_conditions_forward += """ AND (
|
|
1612
|
+
t.target_lang = ? OR
|
|
1504
1613
|
(t.target_lang IS NULL AND tb.target_lang = ?) OR
|
|
1505
1614
|
(t.target_lang IS NULL AND tb.target_lang IS NULL)
|
|
1506
1615
|
)"""
|
|
1507
|
-
|
|
1508
|
-
|
|
1509
|
-
|
|
1616
|
+
lang_params_forward.extend([target_lang, target_lang])
|
|
1617
|
+
# For reverse: target_lang becomes source_lang (swapped)
|
|
1618
|
+
lang_conditions_reverse += """ AND (
|
|
1619
|
+
t.source_lang = ? OR
|
|
1620
|
+
(t.source_lang IS NULL AND tb.source_lang = ?) OR
|
|
1621
|
+
(t.source_lang IS NULL AND tb.source_lang IS NULL)
|
|
1622
|
+
)"""
|
|
1623
|
+
lang_params_reverse.extend([target_lang, target_lang])
|
|
1624
|
+
|
|
1625
|
+
# Project filter conditions
|
|
1626
|
+
project_conditions = ""
|
|
1627
|
+
project_params = []
|
|
1510
1628
|
if project_id:
|
|
1511
|
-
|
|
1512
|
-
|
|
1513
|
-
|
|
1629
|
+
project_conditions = " AND (t.project_id = ? OR t.project_id IS NULL)"
|
|
1630
|
+
project_params = [project_id]
|
|
1631
|
+
|
|
1632
|
+
# Min length conditions
|
|
1633
|
+
min_len_forward = ""
|
|
1634
|
+
min_len_reverse = ""
|
|
1514
1635
|
if min_length > 0:
|
|
1515
|
-
|
|
1516
|
-
|
|
1517
|
-
|
|
1518
|
-
#
|
|
1519
|
-
|
|
1520
|
-
|
|
1521
|
-
|
|
1636
|
+
min_len_forward = f" AND LENGTH(t.source_term) >= {min_length}"
|
|
1637
|
+
min_len_reverse = f" AND LENGTH(t.target_term) >= {min_length}"
|
|
1638
|
+
|
|
1639
|
+
# Build forward query
|
|
1640
|
+
forward_query = base_select_forward + lang_conditions_forward + project_conditions + min_len_forward
|
|
1641
|
+
forward_params.extend(lang_params_forward)
|
|
1642
|
+
forward_params.extend(project_params)
|
|
1643
|
+
|
|
1644
|
+
if bidirectional:
|
|
1645
|
+
# Build reverse query
|
|
1646
|
+
reverse_query = base_select_reverse + lang_conditions_reverse + project_conditions + min_len_reverse
|
|
1647
|
+
reverse_params.extend(lang_params_reverse)
|
|
1648
|
+
reverse_params.extend(project_params)
|
|
1649
|
+
|
|
1650
|
+
# Combine with UNION and sort
|
|
1651
|
+
query = f"""
|
|
1652
|
+
SELECT * FROM (
|
|
1653
|
+
{forward_query}
|
|
1654
|
+
UNION ALL
|
|
1655
|
+
{reverse_query}
|
|
1656
|
+
) combined
|
|
1657
|
+
ORDER BY COALESCE(ranking, -1) ASC, source_term ASC
|
|
1658
|
+
"""
|
|
1659
|
+
params = forward_params + reverse_params
|
|
1660
|
+
else:
|
|
1661
|
+
# Original forward-only behavior
|
|
1662
|
+
query = forward_query + " ORDER BY COALESCE(ranking, -1) ASC, source_term ASC"
|
|
1663
|
+
params = forward_params
|
|
1664
|
+
|
|
1522
1665
|
self.cursor.execute(query, params)
|
|
1523
1666
|
results = []
|
|
1667
|
+
seen_combinations = set() # Track (source_term, target_term, termbase_id) to avoid duplicates
|
|
1668
|
+
|
|
1524
1669
|
for row in self.cursor.fetchall():
|
|
1525
1670
|
result_dict = dict(row)
|
|
1671
|
+
|
|
1672
|
+
# Deduplicate: same term pair from same termbase should only appear once
|
|
1673
|
+
# Prefer 'source' match over 'target' match
|
|
1674
|
+
combo_key = (
|
|
1675
|
+
result_dict.get('source_term', '').lower(),
|
|
1676
|
+
result_dict.get('target_term', '').lower(),
|
|
1677
|
+
result_dict.get('termbase_id')
|
|
1678
|
+
)
|
|
1679
|
+
if combo_key in seen_combinations:
|
|
1680
|
+
continue
|
|
1681
|
+
seen_combinations.add(combo_key)
|
|
1682
|
+
|
|
1526
1683
|
# SQLite stores booleans as 0/1, explicitly convert to Python bool
|
|
1527
1684
|
if 'is_project_termbase' in result_dict:
|
|
1528
1685
|
result_dict['is_project_termbase'] = bool(result_dict['is_project_termbase'])
|
|
1529
|
-
|
|
1686
|
+
|
|
1530
1687
|
# Fetch target synonyms for this term and include them in the result
|
|
1531
1688
|
term_id = result_dict.get('id')
|
|
1689
|
+
match_direction = result_dict.get('match_direction', 'source')
|
|
1532
1690
|
if term_id:
|
|
1533
1691
|
try:
|
|
1692
|
+
# For reverse matches, fetch 'source' synonyms since they become targets
|
|
1693
|
+
synonym_lang = 'source' if match_direction == 'target' else 'target'
|
|
1534
1694
|
self.cursor.execute("""
|
|
1535
1695
|
SELECT synonym_text, forbidden FROM termbase_synonyms
|
|
1536
|
-
WHERE term_id = ? AND language =
|
|
1696
|
+
WHERE term_id = ? AND language = ?
|
|
1537
1697
|
ORDER BY display_order ASC
|
|
1538
|
-
""", (term_id,))
|
|
1698
|
+
""", (term_id, synonym_lang))
|
|
1539
1699
|
synonyms = []
|
|
1540
1700
|
for syn_row in self.cursor.fetchall():
|
|
1541
1701
|
syn_text = syn_row[0]
|
|
@@ -1545,7 +1705,7 @@ class DatabaseManager:
|
|
|
1545
1705
|
result_dict['target_synonyms'] = synonyms
|
|
1546
1706
|
except Exception:
|
|
1547
1707
|
result_dict['target_synonyms'] = []
|
|
1548
|
-
|
|
1708
|
+
|
|
1549
1709
|
results.append(result_dict)
|
|
1550
1710
|
return results
|
|
1551
1711
|
|
modules/database_migrations.py
CHANGED
|
@@ -186,9 +186,13 @@ def run_all_migrations(db_manager) -> bool:
|
|
|
186
186
|
# Migration 3: Add display_order and forbidden fields to synonyms
|
|
187
187
|
if not migrate_synonym_fields(db_manager):
|
|
188
188
|
success = False
|
|
189
|
-
|
|
189
|
+
|
|
190
|
+
# Migration 4: Add ai_inject field to termbases
|
|
191
|
+
if not migrate_termbase_ai_inject(db_manager):
|
|
192
|
+
success = False
|
|
193
|
+
|
|
190
194
|
print("="*60)
|
|
191
|
-
|
|
195
|
+
|
|
192
196
|
return success
|
|
193
197
|
|
|
194
198
|
|
|
@@ -221,18 +225,26 @@ def check_and_migrate(db_manager) -> bool:
|
|
|
221
225
|
|
|
222
226
|
# Check if synonyms table exists
|
|
223
227
|
cursor.execute("""
|
|
224
|
-
SELECT name FROM sqlite_master
|
|
228
|
+
SELECT name FROM sqlite_master
|
|
225
229
|
WHERE type='table' AND name='termbase_synonyms'
|
|
226
230
|
""")
|
|
227
231
|
needs_synonyms_table = cursor.fetchone() is None
|
|
228
|
-
|
|
232
|
+
|
|
233
|
+
# Check if termbases table has ai_inject column
|
|
234
|
+
cursor.execute("PRAGMA table_info(termbases)")
|
|
235
|
+
termbase_columns = {row[1] for row in cursor.fetchall()}
|
|
236
|
+
needs_ai_inject = 'ai_inject' not in termbase_columns
|
|
237
|
+
|
|
229
238
|
if needs_migration:
|
|
230
239
|
print(f"⚠️ Migration needed - missing columns: {', '.join([c for c in ['project', 'client', 'term_uuid', 'note'] if c not in columns])}")
|
|
231
|
-
|
|
240
|
+
|
|
232
241
|
if needs_synonyms_table:
|
|
233
242
|
print("⚠️ Migration needed - termbase_synonyms table missing")
|
|
234
|
-
|
|
235
|
-
if
|
|
243
|
+
|
|
244
|
+
if needs_ai_inject:
|
|
245
|
+
print("⚠️ Migration needed - termbases.ai_inject column missing")
|
|
246
|
+
|
|
247
|
+
if needs_migration or needs_synonyms_table or needs_ai_inject:
|
|
236
248
|
success = run_all_migrations(db_manager)
|
|
237
249
|
if success:
|
|
238
250
|
# Generate UUIDs for terms that don't have them
|
|
@@ -316,6 +328,41 @@ def migrate_synonym_fields(db_manager) -> bool:
|
|
|
316
328
|
return False
|
|
317
329
|
|
|
318
330
|
|
|
331
|
+
def migrate_termbase_ai_inject(db_manager) -> bool:
|
|
332
|
+
"""
|
|
333
|
+
Add ai_inject column to termbases table.
|
|
334
|
+
When enabled, the termbase's terms will be injected into LLM translation prompts.
|
|
335
|
+
|
|
336
|
+
Args:
|
|
337
|
+
db_manager: DatabaseManager instance
|
|
338
|
+
|
|
339
|
+
Returns:
|
|
340
|
+
True if migration successful
|
|
341
|
+
"""
|
|
342
|
+
try:
|
|
343
|
+
cursor = db_manager.cursor
|
|
344
|
+
|
|
345
|
+
# Check which columns exist
|
|
346
|
+
cursor.execute("PRAGMA table_info(termbases)")
|
|
347
|
+
columns = {row[1] for row in cursor.fetchall()}
|
|
348
|
+
|
|
349
|
+
if 'ai_inject' not in columns:
|
|
350
|
+
print("📊 Adding 'ai_inject' column to termbases...")
|
|
351
|
+
cursor.execute("ALTER TABLE termbases ADD COLUMN ai_inject BOOLEAN DEFAULT 0")
|
|
352
|
+
db_manager.connection.commit()
|
|
353
|
+
print(" ✓ Column 'ai_inject' added successfully")
|
|
354
|
+
else:
|
|
355
|
+
print("✅ termbases.ai_inject column already exists")
|
|
356
|
+
|
|
357
|
+
return True
|
|
358
|
+
|
|
359
|
+
except Exception as e:
|
|
360
|
+
print(f"❌ ai_inject migration failed: {e}")
|
|
361
|
+
import traceback
|
|
362
|
+
traceback.print_exc()
|
|
363
|
+
return False
|
|
364
|
+
|
|
365
|
+
|
|
319
366
|
def generate_missing_uuids(db_manager) -> bool:
|
|
320
367
|
"""
|
|
321
368
|
Generate UUIDs for any termbase terms that don't have them.
|
modules/mqxliff_handler.py
CHANGED
|
@@ -159,9 +159,78 @@ class MQXLIFFHandler:
|
|
|
159
159
|
|
|
160
160
|
segment = FormattedSegment(trans_unit_id, plain_text, formatted_xml)
|
|
161
161
|
segments.append(segment)
|
|
162
|
-
|
|
162
|
+
|
|
163
163
|
return segments
|
|
164
|
-
|
|
164
|
+
|
|
165
|
+
def extract_bilingual_segments(self) -> List[Dict]:
|
|
166
|
+
"""
|
|
167
|
+
Extract all source AND target segments from the MQXLIFF file.
|
|
168
|
+
Used for importing pretranslated mqxliff files.
|
|
169
|
+
|
|
170
|
+
Returns:
|
|
171
|
+
List of dicts with 'id', 'source', 'target', 'status' keys
|
|
172
|
+
"""
|
|
173
|
+
segments = []
|
|
174
|
+
|
|
175
|
+
if self.body_element is None:
|
|
176
|
+
return segments
|
|
177
|
+
|
|
178
|
+
# Find all trans-unit elements (with or without namespace)
|
|
179
|
+
trans_units = self.body_element.findall('.//xliff:trans-unit', self.NAMESPACES)
|
|
180
|
+
if not trans_units:
|
|
181
|
+
trans_units = self.body_element.findall('.//trans-unit')
|
|
182
|
+
|
|
183
|
+
for trans_unit in trans_units:
|
|
184
|
+
trans_unit_id = trans_unit.get('id', 'unknown')
|
|
185
|
+
|
|
186
|
+
# Skip auxiliary segments (like hyperlink URLs with mq:nosplitjoin="true")
|
|
187
|
+
nosplitjoin = trans_unit.get('{MQXliff}nosplitjoin', 'false')
|
|
188
|
+
if nosplitjoin == 'true':
|
|
189
|
+
continue
|
|
190
|
+
|
|
191
|
+
# Find source element
|
|
192
|
+
source_elem = trans_unit.find('xliff:source', self.NAMESPACES)
|
|
193
|
+
if source_elem is None:
|
|
194
|
+
source_elem = trans_unit.find('source')
|
|
195
|
+
|
|
196
|
+
# Find target element
|
|
197
|
+
target_elem = trans_unit.find('xliff:target', self.NAMESPACES)
|
|
198
|
+
if target_elem is None:
|
|
199
|
+
target_elem = trans_unit.find('target')
|
|
200
|
+
|
|
201
|
+
source_text = ""
|
|
202
|
+
target_text = ""
|
|
203
|
+
|
|
204
|
+
if source_elem is not None:
|
|
205
|
+
source_text = self._extract_plain_text(source_elem)
|
|
206
|
+
|
|
207
|
+
if target_elem is not None:
|
|
208
|
+
target_text = self._extract_plain_text(target_elem)
|
|
209
|
+
|
|
210
|
+
# Get memoQ status if available
|
|
211
|
+
mq_status = trans_unit.get('{MQXliff}status', '')
|
|
212
|
+
|
|
213
|
+
# Map memoQ status to internal status
|
|
214
|
+
# memoQ statuses: "NotStarted", "Editing", "Confirmed", "Reviewed", "Rejected", etc.
|
|
215
|
+
status = 'not_started'
|
|
216
|
+
if mq_status in ['Confirmed', 'ProofRead', 'Reviewed']:
|
|
217
|
+
status = 'confirmed'
|
|
218
|
+
elif mq_status == 'Editing':
|
|
219
|
+
status = 'translated'
|
|
220
|
+
elif target_text.strip():
|
|
221
|
+
# Has target but unknown status - mark as pre-translated
|
|
222
|
+
status = 'pre_translated'
|
|
223
|
+
|
|
224
|
+
segments.append({
|
|
225
|
+
'id': trans_unit_id,
|
|
226
|
+
'source': source_text,
|
|
227
|
+
'target': target_text,
|
|
228
|
+
'status': status,
|
|
229
|
+
'mq_status': mq_status
|
|
230
|
+
})
|
|
231
|
+
|
|
232
|
+
return segments
|
|
233
|
+
|
|
165
234
|
def _extract_plain_text(self, element: ET.Element) -> str:
|
|
166
235
|
"""
|
|
167
236
|
Recursively extract plain text from an XML element, stripping all tags.
|