supervertaler 1.9.172__py3-none-any.whl → 1.9.180__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,12 +17,38 @@ import sqlite3
17
17
  import os
18
18
  import json
19
19
  import hashlib
20
+ import unicodedata
21
+ import re
20
22
  from datetime import datetime
21
23
  from typing import List, Dict, Optional, Tuple
22
24
  from pathlib import Path
23
25
  from difflib import SequenceMatcher
24
26
 
25
27
 
28
+ def _normalize_for_matching(text: str) -> str:
29
+ """Normalize text for exact matching.
30
+
31
+ Handles invisible differences that would cause exact match to fail:
32
+ - Unicode normalization (NFC)
33
+ - Multiple whitespace -> single space
34
+ - Leading/trailing whitespace
35
+ - Non-breaking spaces -> regular spaces
36
+ """
37
+ if not text:
38
+ return ""
39
+ # Unicode normalize (NFC form)
40
+ text = unicodedata.normalize('NFC', text)
41
+ # Convert non-breaking spaces and other whitespace to regular space
42
+ text = text.replace('\u00a0', ' ') # NBSP
43
+ text = text.replace('\u2007', ' ') # Figure space
44
+ text = text.replace('\u202f', ' ') # Narrow NBSP
45
+ # Collapse multiple whitespace to single space
46
+ text = re.sub(r'\s+', ' ', text)
47
+ # Strip leading/trailing whitespace
48
+ text = text.strip()
49
+ return text
50
+
51
+
26
52
  class DatabaseManager:
27
53
  """Manages SQLite database for translation resources"""
28
54
 
@@ -655,22 +681,46 @@ class DatabaseManager:
655
681
  # TRANSLATION MEMORY METHODS
656
682
  # ============================================
657
683
 
658
- def add_translation_unit(self, source: str, target: str, source_lang: str,
684
+ def add_translation_unit(self, source: str, target: str, source_lang: str,
659
685
  target_lang: str, tm_id: str = 'project',
660
686
  project_id: str = None, context_before: str = None,
661
- context_after: str = None, notes: str = None) -> int:
687
+ context_after: str = None, notes: str = None,
688
+ overwrite: bool = False) -> int:
662
689
  """
663
690
  Add translation unit to database
664
-
691
+
692
+ Args:
693
+ source: Source text
694
+ target: Target text
695
+ source_lang: Source language code
696
+ target_lang: Target language code
697
+ tm_id: TM identifier
698
+ project_id: Optional project ID
699
+ context_before: Optional context before
700
+ context_after: Optional context after
701
+ notes: Optional notes
702
+ overwrite: If True, delete existing entries with same source before inserting
703
+ (implements "Save only latest translation" mode)
704
+
665
705
  Returns: ID of inserted/updated entry
666
706
  """
667
- # Generate hash for fast exact matching
668
- source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()
669
-
707
+ # Generate hash from NORMALIZED source for consistent exact matching
708
+ # This handles invisible differences like Unicode normalization, whitespace variations
709
+ normalized_source = _normalize_for_matching(source)
710
+ source_hash = hashlib.md5(normalized_source.encode('utf-8')).hexdigest()
711
+
670
712
  try:
713
+ # If overwrite mode, delete ALL existing entries with same source_hash and tm_id
714
+ # This ensures only the latest translation is kept
715
+ if overwrite:
716
+ self.cursor.execute("""
717
+ DELETE FROM translation_units
718
+ WHERE source_hash = ? AND tm_id = ?
719
+ """, (source_hash, tm_id))
720
+
671
721
  self.cursor.execute("""
672
- INSERT INTO translation_units
673
- (source_text, target_text, source_lang, target_lang, tm_id,
722
+ INSERT INTO translation_units
723
+ (source_text, target_text, source_lang, target_lang, tm_id,
674
724
  project_id, context_before, context_after, source_hash, notes)
675
725
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
676
726
  ON CONFLICT(source_hash, target_text, tm_id) DO UPDATE SET
@@ -678,42 +728,47 @@ class DatabaseManager:
678
728
  modified_date = CURRENT_TIMESTAMP
679
729
  """, (source, target, source_lang, target_lang, tm_id,
680
730
  project_id, context_before, context_after, source_hash, notes))
681
-
731
+
682
732
  self.connection.commit()
683
733
  return self.cursor.lastrowid
684
-
734
+
685
735
  except Exception as e:
686
736
  self.log(f"Error adding translation unit: {e}")
687
737
  return None
688
738
 
689
739
  def get_exact_match(self, source: str, tm_ids: List[str] = None,
690
- source_lang: str = None, target_lang: str = None,
740
+ source_lang: str = None, target_lang: str = None,
691
741
  bidirectional: bool = True) -> Optional[Dict]:
692
742
  """
693
743
  Get exact match from TM
694
-
744
+
695
745
  Args:
696
746
  source: Source text to match
697
747
  tm_ids: List of TM IDs to search (None = all)
698
748
  source_lang: Filter by source language (base code matching: 'en' matches 'en-US', 'en-GB', etc.)
699
749
  target_lang: Filter by target language (base code matching)
700
750
  bidirectional: If True, search both directions (nl→en AND en→nl)
701
-
751
+
702
752
  Returns: Dictionary with match data or None
703
753
  """
704
754
  from modules.tmx_generator import get_base_lang_code
705
-
755
+
756
+ # Try both normalized and non-normalized hashes for backward compatibility
757
+ # This handles invisible differences like Unicode normalization, whitespace variations
706
758
  source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()
707
-
759
+ normalized_source = _normalize_for_matching(source)
760
+ normalized_hash = hashlib.md5(normalized_source.encode('utf-8')).hexdigest()
761
+
708
762
  # Get base language codes for comparison
709
763
  src_base = get_base_lang_code(source_lang) if source_lang else None
710
764
  tgt_base = get_base_lang_code(target_lang) if target_lang else None
711
-
765
+
766
+ # Search using both original hash and normalized hash
712
767
  query = """
713
- SELECT * FROM translation_units
714
- WHERE source_hash = ? AND source_text = ?
768
+ SELECT * FROM translation_units
769
+ WHERE (source_hash = ? OR source_hash = ?)
715
770
  """
716
- params = [source_hash, source]
771
+ params = [source_hash, normalized_hash]
717
772
 
718
773
  if tm_ids:
719
774
  placeholders = ','.join('?' * len(tm_ids))
@@ -1422,120 +1477,225 @@ class DatabaseManager:
1422
1477
  # TODO: Implement in Phase 3
1423
1478
  pass
1424
1479
 
1425
- def search_termbases(self, search_term: str, source_lang: str = None,
1480
+ def search_termbases(self, search_term: str, source_lang: str = None,
1426
1481
  target_lang: str = None, project_id: str = None,
1427
- min_length: int = 0) -> List[Dict]:
1482
+ min_length: int = 0, bidirectional: bool = True) -> List[Dict]:
1428
1483
  """
1429
- Search termbases for matching source terms
1430
-
1484
+ Search termbases for matching terms (bidirectional by default)
1485
+
1431
1486
  Args:
1432
- search_term: Source term to search for
1487
+ search_term: Term to search for
1433
1488
  source_lang: Filter by source language (optional)
1434
1489
  target_lang: Filter by target language (optional)
1435
1490
  project_id: Filter by project (optional)
1436
1491
  min_length: Minimum term length to return
1437
-
1492
+ bidirectional: If True, also search target_term and swap results (default True)
1493
+
1438
1494
  Returns:
1439
1495
  List of termbase hits, sorted by priority (lower = higher priority)
1496
+ Each result includes 'match_direction' ('source' or 'target') indicating
1497
+ which column matched. For 'target' matches, source_term and target_term
1498
+ are swapped so results are always oriented correctly for the current project.
1440
1499
  """
1441
1500
  # Build query with filters - include termbase name and ranking via JOIN
1442
1501
  # Note: termbase_id is stored as TEXT in termbase_terms but INTEGER in termbases
1443
1502
  # Use CAST to ensure proper comparison
1444
1503
  # IMPORTANT: Join with termbase_activation to get the ACTUAL priority for this project
1445
1504
  # CRITICAL FIX: Also match when search_term starts with the glossary term
1446
- # This handles cases like searching for "ca." when glossary has "ca."
1505
+ # This handles cases like searching for "ca." when glossary has "ca."
1447
1506
  # AND searching for "ca" when glossary has "ca."
1448
1507
  # We also strip trailing punctuation from glossary terms for comparison
1449
- query = """
1450
- SELECT
1451
- t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
1508
+
1509
+ # Build matching conditions for a given column
1510
+ def build_match_conditions(column: str) -> str:
1511
+ return f"""(
1512
+ LOWER(t.{column}) = LOWER(?) OR
1513
+ LOWER(t.{column}) LIKE LOWER(?) OR
1514
+ LOWER(t.{column}) LIKE LOWER(?) OR
1515
+ LOWER(t.{column}) LIKE LOWER(?) OR
1516
+ LOWER(RTRIM(t.{column}, '.!?,;:')) = LOWER(?) OR
1517
+ LOWER(?) LIKE LOWER(t.{column}) || '%' OR
1518
+ LOWER(?) = LOWER(RTRIM(t.{column}, '.!?,;:'))
1519
+ )"""
1520
+
1521
+ # Build match params for one direction
1522
+ def build_match_params() -> list:
1523
+ return [
1524
+ search_term,
1525
+ f"{search_term} %",
1526
+ f"% {search_term}",
1527
+ f"% {search_term} %",
1528
+ search_term, # For RTRIM comparison
1529
+ search_term, # For reverse LIKE
1530
+ search_term # For reverse RTRIM comparison
1531
+ ]
1532
+
1533
+ # Matching patterns:
1534
+ # 1. Exact match: column = search_term
1535
+ # 2. Glossary term starts with search: column LIKE "search_term %"
1536
+ # 3. Glossary term ends with search: column LIKE "% search_term"
1537
+ # 4. Glossary term contains search: column LIKE "% search_term %"
1538
+ # 5. Glossary term (stripped) = search_term: RTRIM(column) = search_term (handles "ca." = "ca")
1539
+ # 6. Search starts with glossary term: search_term LIKE column || '%'
1540
+ # 7. Search = glossary term stripped: search_term = RTRIM(column)
1541
+
1542
+ # Base SELECT for forward matches (source_term matches)
1543
+ base_select_forward = """
1544
+ SELECT
1545
+ t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
1452
1546
  t.forbidden, t.source_lang, t.target_lang, t.definition, t.domain,
1453
1547
  t.notes, t.project, t.client,
1454
1548
  tb.name as termbase_name,
1455
1549
  tb.source_lang as termbase_source_lang,
1456
1550
  tb.target_lang as termbase_target_lang,
1457
1551
  tb.is_project_termbase,
1458
- COALESCE(ta.priority, tb.ranking) as ranking
1552
+ COALESCE(ta.priority, tb.ranking) as ranking,
1553
+ 'source' as match_direction
1459
1554
  FROM termbase_terms t
1460
1555
  LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
1461
1556
  LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
1462
- WHERE (
1463
- LOWER(t.source_term) = LOWER(?) OR
1464
- LOWER(t.source_term) LIKE LOWER(?) OR
1465
- LOWER(t.source_term) LIKE LOWER(?) OR
1466
- LOWER(t.source_term) LIKE LOWER(?) OR
1467
- LOWER(RTRIM(t.source_term, '.!?,;:')) = LOWER(?) OR
1468
- LOWER(?) LIKE LOWER(t.source_term) || '%' OR
1469
- LOWER(?) = LOWER(RTRIM(t.source_term, '.!?,;:'))
1470
- )
1557
+ WHERE {match_conditions}
1471
1558
  AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
1472
- """
1473
- # Matching patterns:
1474
- # 1. Exact match: source_term = search_term
1475
- # 2. Glossary term starts with search: source_term LIKE "search_term %"
1476
- # 3. Glossary term ends with search: source_term LIKE "% search_term"
1477
- # 4. Glossary term contains search: source_term LIKE "% search_term %"
1478
- # 5. Glossary term (stripped) = search_term: RTRIM(source_term) = search_term (handles "ca." = "ca")
1479
- # 6. Search starts with glossary term: search_term LIKE source_term || '%'
1480
- # 7. Search = glossary term stripped: search_term = RTRIM(source_term)
1481
- params = [
1482
- project_id if project_id else 0, # Use 0 if no project (won't match any activation records)
1483
- search_term,
1484
- f"{search_term} %",
1485
- f"% {search_term}",
1486
- f"% {search_term} %",
1487
- search_term, # For RTRIM comparison
1488
- search_term, # For reverse LIKE
1489
- search_term # For reverse RTRIM comparison
1490
- ]
1491
-
1492
- # Language filters - if term has no language, use termbase language for filtering
1559
+ """.format(match_conditions=build_match_conditions('source_term'))
1560
+
1561
+ # Base SELECT for reverse matches (target_term matches) - swap source/target in output
1562
+ base_select_reverse = """
1563
+ SELECT
1564
+ t.id, t.target_term as source_term, t.source_term as target_term,
1565
+ t.termbase_id, t.priority,
1566
+ t.forbidden, t.target_lang as source_lang, t.source_lang as target_lang,
1567
+ t.definition, t.domain,
1568
+ t.notes, t.project, t.client,
1569
+ tb.name as termbase_name,
1570
+ tb.target_lang as termbase_source_lang,
1571
+ tb.source_lang as termbase_target_lang,
1572
+ tb.is_project_termbase,
1573
+ COALESCE(ta.priority, tb.ranking) as ranking,
1574
+ 'target' as match_direction
1575
+ FROM termbase_terms t
1576
+ LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
1577
+ LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
1578
+ WHERE {match_conditions}
1579
+ AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
1580
+ """.format(match_conditions=build_match_conditions('target_term'))
1581
+
1582
+ # Build params
1583
+ project_param = project_id if project_id else 0
1584
+ forward_params = [project_param] + build_match_params()
1585
+ reverse_params = [project_param] + build_match_params()
1586
+
1587
+ # Build language filter conditions
1588
+ lang_conditions_forward = ""
1589
+ lang_conditions_reverse = ""
1590
+ lang_params_forward = []
1591
+ lang_params_reverse = []
1592
+
1493
1593
  if source_lang:
1494
- query += """ AND (
1495
- t.source_lang = ? OR
1594
+ # For forward: filter on source_lang
1595
+ lang_conditions_forward += """ AND (
1596
+ t.source_lang = ? OR
1496
1597
  (t.source_lang IS NULL AND tb.source_lang = ?) OR
1497
1598
  (t.source_lang IS NULL AND tb.source_lang IS NULL)
1498
1599
  )"""
1499
- params.extend([source_lang, source_lang])
1500
-
1600
+ lang_params_forward.extend([source_lang, source_lang])
1601
+ # For reverse: source_lang becomes target_lang (swapped)
1602
+ lang_conditions_reverse += """ AND (
1603
+ t.target_lang = ? OR
1604
+ (t.target_lang IS NULL AND tb.target_lang = ?) OR
1605
+ (t.target_lang IS NULL AND tb.target_lang IS NULL)
1606
+ )"""
1607
+ lang_params_reverse.extend([source_lang, source_lang])
1608
+
1501
1609
  if target_lang:
1502
- query += """ AND (
1503
- t.target_lang = ? OR
1610
+ # For forward: filter on target_lang
1611
+ lang_conditions_forward += """ AND (
1612
+ t.target_lang = ? OR
1504
1613
  (t.target_lang IS NULL AND tb.target_lang = ?) OR
1505
1614
  (t.target_lang IS NULL AND tb.target_lang IS NULL)
1506
1615
  )"""
1507
- params.extend([target_lang, target_lang])
1508
-
1509
- # Project filter: match project-specific terms OR global terms (project_id IS NULL)
1616
+ lang_params_forward.extend([target_lang, target_lang])
1617
+ # For reverse: target_lang becomes source_lang (swapped)
1618
+ lang_conditions_reverse += """ AND (
1619
+ t.source_lang = ? OR
1620
+ (t.source_lang IS NULL AND tb.source_lang = ?) OR
1621
+ (t.source_lang IS NULL AND tb.source_lang IS NULL)
1622
+ )"""
1623
+ lang_params_reverse.extend([target_lang, target_lang])
1624
+
1625
+ # Project filter conditions
1626
+ project_conditions = ""
1627
+ project_params = []
1510
1628
  if project_id:
1511
- query += " AND (t.project_id = ? OR t.project_id IS NULL)"
1512
- params.append(project_id)
1513
-
1629
+ project_conditions = " AND (t.project_id = ? OR t.project_id IS NULL)"
1630
+ project_params = [project_id]
1631
+
1632
+ # Min length conditions
1633
+ min_len_forward = ""
1634
+ min_len_reverse = ""
1514
1635
  if min_length > 0:
1515
- query += f" AND LENGTH(t.source_term) >= {min_length}"
1516
-
1517
- # Sort by ranking (lower number = higher priority)
1518
- # Project termbases (ranking IS NULL) appear first, then by ranking, then alphabetically
1519
- # Use COALESCE to treat NULL as -1 (highest priority)
1520
- query += " ORDER BY COALESCE(tb.ranking, -1) ASC, t.source_term ASC"
1521
-
1636
+ min_len_forward = f" AND LENGTH(t.source_term) >= {min_length}"
1637
+ min_len_reverse = f" AND LENGTH(t.target_term) >= {min_length}"
1638
+
1639
+ # Build forward query
1640
+ forward_query = base_select_forward + lang_conditions_forward + project_conditions + min_len_forward
1641
+ forward_params.extend(lang_params_forward)
1642
+ forward_params.extend(project_params)
1643
+
1644
+ if bidirectional:
1645
+ # Build reverse query
1646
+ reverse_query = base_select_reverse + lang_conditions_reverse + project_conditions + min_len_reverse
1647
+ reverse_params.extend(lang_params_reverse)
1648
+ reverse_params.extend(project_params)
1649
+
1650
+ # Combine with UNION and sort
1651
+ query = f"""
1652
+ SELECT * FROM (
1653
+ {forward_query}
1654
+ UNION ALL
1655
+ {reverse_query}
1656
+ ) combined
1657
+ ORDER BY COALESCE(ranking, -1) ASC, source_term ASC
1658
+ """
1659
+ params = forward_params + reverse_params
1660
+ else:
1661
+ # Original forward-only behavior
1662
+ query = forward_query + " ORDER BY COALESCE(ranking, -1) ASC, source_term ASC"
1663
+ params = forward_params
1664
+
1522
1665
  self.cursor.execute(query, params)
1523
1666
  results = []
1667
+ seen_combinations = set() # Track (source_term, target_term, termbase_id) to avoid duplicates
1668
+
1524
1669
  for row in self.cursor.fetchall():
1525
1670
  result_dict = dict(row)
1671
+
1672
+ # Deduplicate: same term pair from same termbase should only appear once
1673
+ # Prefer 'source' match over 'target' match
1674
+ combo_key = (
1675
+ result_dict.get('source_term', '').lower(),
1676
+ result_dict.get('target_term', '').lower(),
1677
+ result_dict.get('termbase_id')
1678
+ )
1679
+ if combo_key in seen_combinations:
1680
+ continue
1681
+ seen_combinations.add(combo_key)
1682
+
1526
1683
  # SQLite stores booleans as 0/1, explicitly convert to Python bool
1527
1684
  if 'is_project_termbase' in result_dict:
1528
1685
  result_dict['is_project_termbase'] = bool(result_dict['is_project_termbase'])
1529
-
1686
+
1530
1687
  # Fetch target synonyms for this term and include them in the result
1531
1688
  term_id = result_dict.get('id')
1689
+ match_direction = result_dict.get('match_direction', 'source')
1532
1690
  if term_id:
1533
1691
  try:
1692
+ # For reverse matches, fetch 'source' synonyms since they become targets
1693
+ synonym_lang = 'source' if match_direction == 'target' else 'target'
1534
1694
  self.cursor.execute("""
1535
1695
  SELECT synonym_text, forbidden FROM termbase_synonyms
1536
- WHERE term_id = ? AND language = 'target'
1696
+ WHERE term_id = ? AND language = ?
1537
1697
  ORDER BY display_order ASC
1538
- """, (term_id,))
1698
+ """, (term_id, synonym_lang))
1539
1699
  synonyms = []
1540
1700
  for syn_row in self.cursor.fetchall():
1541
1701
  syn_text = syn_row[0]
@@ -1545,7 +1705,7 @@ class DatabaseManager:
1545
1705
  result_dict['target_synonyms'] = synonyms
1546
1706
  except Exception:
1547
1707
  result_dict['target_synonyms'] = []
1548
-
1708
+
1549
1709
  results.append(result_dict)
1550
1710
  return results
1551
1711
 
@@ -186,9 +186,13 @@ def run_all_migrations(db_manager) -> bool:
186
186
  # Migration 3: Add display_order and forbidden fields to synonyms
187
187
  if not migrate_synonym_fields(db_manager):
188
188
  success = False
189
-
189
+
190
+ # Migration 4: Add ai_inject field to termbases
191
+ if not migrate_termbase_ai_inject(db_manager):
192
+ success = False
193
+
190
194
  print("="*60)
191
-
195
+
192
196
  return success
193
197
 
194
198
 
@@ -221,18 +225,26 @@ def check_and_migrate(db_manager) -> bool:
221
225
 
222
226
  # Check if synonyms table exists
223
227
  cursor.execute("""
224
- SELECT name FROM sqlite_master
228
+ SELECT name FROM sqlite_master
225
229
  WHERE type='table' AND name='termbase_synonyms'
226
230
  """)
227
231
  needs_synonyms_table = cursor.fetchone() is None
228
-
232
+
233
+ # Check if termbases table has ai_inject column
234
+ cursor.execute("PRAGMA table_info(termbases)")
235
+ termbase_columns = {row[1] for row in cursor.fetchall()}
236
+ needs_ai_inject = 'ai_inject' not in termbase_columns
237
+
229
238
  if needs_migration:
230
239
  print(f"⚠️ Migration needed - missing columns: {', '.join([c for c in ['project', 'client', 'term_uuid', 'note'] if c not in columns])}")
231
-
240
+
232
241
  if needs_synonyms_table:
233
242
  print("⚠️ Migration needed - termbase_synonyms table missing")
234
-
235
- if needs_migration or needs_synonyms_table:
243
+
244
+ if needs_ai_inject:
245
+ print("⚠️ Migration needed - termbases.ai_inject column missing")
246
+
247
+ if needs_migration or needs_synonyms_table or needs_ai_inject:
236
248
  success = run_all_migrations(db_manager)
237
249
  if success:
238
250
  # Generate UUIDs for terms that don't have them
@@ -316,6 +328,41 @@ def migrate_synonym_fields(db_manager) -> bool:
316
328
  return False
317
329
 
318
330
 
331
def migrate_termbase_ai_inject(db_manager) -> bool:
    """
    Add ai_inject column to termbases table.
    When enabled, the termbase's terms will be injected into LLM translation prompts.

    Args:
        db_manager: DatabaseManager instance

    Returns:
        True if migration successful
    """
    try:
        cursor = db_manager.cursor

        # Inspect the current schema to keep the migration idempotent.
        cursor.execute("PRAGMA table_info(termbases)")
        existing_columns = {column_row[1] for column_row in cursor.fetchall()}

        if 'ai_inject' in existing_columns:
            print("✅ termbases.ai_inject column already exists")
            return True

        print("📊 Adding 'ai_inject' column to termbases...")
        cursor.execute("ALTER TABLE termbases ADD COLUMN ai_inject BOOLEAN DEFAULT 0")
        db_manager.connection.commit()
        print(" ✓ Column 'ai_inject' added successfully")
        return True

    except Exception as e:
        print(f"❌ ai_inject migration failed: {e}")
        import traceback
        traceback.print_exc()
        return False
364
+
365
+
319
366
  def generate_missing_uuids(db_manager) -> bool:
320
367
  """
321
368
  Generate UUIDs for any termbase terms that don't have them.
@@ -159,9 +159,78 @@ class MQXLIFFHandler:
159
159
 
160
160
  segment = FormattedSegment(trans_unit_id, plain_text, formatted_xml)
161
161
  segments.append(segment)
162
-
162
+
163
163
  return segments
164
-
164
+
165
+ def extract_bilingual_segments(self) -> List[Dict]:
166
+ """
167
+ Extract all source AND target segments from the MQXLIFF file.
168
+ Used for importing pretranslated mqxliff files.
169
+
170
+ Returns:
171
+ List of dicts with 'id', 'source', 'target', 'status' keys
172
+ """
173
+ segments = []
174
+
175
+ if self.body_element is None:
176
+ return segments
177
+
178
+ # Find all trans-unit elements (with or without namespace)
179
+ trans_units = self.body_element.findall('.//xliff:trans-unit', self.NAMESPACES)
180
+ if not trans_units:
181
+ trans_units = self.body_element.findall('.//trans-unit')
182
+
183
+ for trans_unit in trans_units:
184
+ trans_unit_id = trans_unit.get('id', 'unknown')
185
+
186
+ # Skip auxiliary segments (like hyperlink URLs with mq:nosplitjoin="true")
187
+ nosplitjoin = trans_unit.get('{MQXliff}nosplitjoin', 'false')
188
+ if nosplitjoin == 'true':
189
+ continue
190
+
191
+ # Find source element
192
+ source_elem = trans_unit.find('xliff:source', self.NAMESPACES)
193
+ if source_elem is None:
194
+ source_elem = trans_unit.find('source')
195
+
196
+ # Find target element
197
+ target_elem = trans_unit.find('xliff:target', self.NAMESPACES)
198
+ if target_elem is None:
199
+ target_elem = trans_unit.find('target')
200
+
201
+ source_text = ""
202
+ target_text = ""
203
+
204
+ if source_elem is not None:
205
+ source_text = self._extract_plain_text(source_elem)
206
+
207
+ if target_elem is not None:
208
+ target_text = self._extract_plain_text(target_elem)
209
+
210
+ # Get memoQ status if available
211
+ mq_status = trans_unit.get('{MQXliff}status', '')
212
+
213
+ # Map memoQ status to internal status
214
+ # memoQ statuses: "NotStarted", "Editing", "Confirmed", "Reviewed", "Rejected", etc.
215
+ status = 'not_started'
216
+ if mq_status in ['Confirmed', 'ProofRead', 'Reviewed']:
217
+ status = 'confirmed'
218
+ elif mq_status == 'Editing':
219
+ status = 'translated'
220
+ elif target_text.strip():
221
+ # Has target but unknown status - mark as pre-translated
222
+ status = 'pre_translated'
223
+
224
+ segments.append({
225
+ 'id': trans_unit_id,
226
+ 'source': source_text,
227
+ 'target': target_text,
228
+ 'status': status,
229
+ 'mq_status': mq_status
230
+ })
231
+
232
+ return segments
233
+
165
234
  def _extract_plain_text(self, element: ET.Element) -> str:
166
235
  """
167
236
  Recursively extract plain text from an XML element, stripping all tags.