supervertaler-1.9.173-py3-none-any.whl → supervertaler-1.9.190-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic; more details are linked from the original diff page.

@@ -684,21 +684,43 @@ class DatabaseManager:
684
684
  def add_translation_unit(self, source: str, target: str, source_lang: str,
685
685
  target_lang: str, tm_id: str = 'project',
686
686
  project_id: str = None, context_before: str = None,
687
- context_after: str = None, notes: str = None) -> int:
687
+ context_after: str = None, notes: str = None,
688
+ overwrite: bool = False) -> int:
688
689
  """
689
690
  Add translation unit to database
690
691
 
692
+ Args:
693
+ source: Source text
694
+ target: Target text
695
+ source_lang: Source language code
696
+ target_lang: Target language code
697
+ tm_id: TM identifier
698
+ project_id: Optional project ID
699
+ context_before: Optional context before
700
+ context_after: Optional context after
701
+ notes: Optional notes
702
+ overwrite: If True, delete existing entries with same source before inserting
703
+ (implements "Save only latest translation" mode)
704
+
691
705
  Returns: ID of inserted/updated entry
692
706
  """
693
707
  # Generate hash from NORMALIZED source for consistent exact matching
694
708
  # This handles invisible differences like Unicode normalization, whitespace variations
695
709
  normalized_source = _normalize_for_matching(source)
696
710
  source_hash = hashlib.md5(normalized_source.encode('utf-8')).hexdigest()
697
-
711
+
698
712
  try:
713
+ # If overwrite mode, delete ALL existing entries with same source_hash and tm_id
714
+ # This ensures only the latest translation is kept
715
+ if overwrite:
716
+ self.cursor.execute("""
717
+ DELETE FROM translation_units
718
+ WHERE source_hash = ? AND tm_id = ?
719
+ """, (source_hash, tm_id))
720
+
699
721
  self.cursor.execute("""
700
- INSERT INTO translation_units
701
- (source_text, target_text, source_lang, target_lang, tm_id,
722
+ INSERT INTO translation_units
723
+ (source_text, target_text, source_lang, target_lang, tm_id,
702
724
  project_id, context_before, context_after, source_hash, notes)
703
725
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
704
726
  ON CONFLICT(source_hash, target_text, tm_id) DO UPDATE SET
@@ -706,10 +728,10 @@ class DatabaseManager:
706
728
  modified_date = CURRENT_TIMESTAMP
707
729
  """, (source, target, source_lang, target_lang, tm_id,
708
730
  project_id, context_before, context_after, source_hash, notes))
709
-
731
+
710
732
  self.connection.commit()
711
733
  return self.cursor.lastrowid
712
-
734
+
713
735
  except Exception as e:
714
736
  self.log(f"Error adding translation unit: {e}")
715
737
  return None
@@ -1455,120 +1477,225 @@ class DatabaseManager:
1455
1477
  # TODO: Implement in Phase 3
1456
1478
  pass
1457
1479
 
1458
- def search_termbases(self, search_term: str, source_lang: str = None,
1480
+ def search_termbases(self, search_term: str, source_lang: str = None,
1459
1481
  target_lang: str = None, project_id: str = None,
1460
- min_length: int = 0) -> List[Dict]:
1482
+ min_length: int = 0, bidirectional: bool = True) -> List[Dict]:
1461
1483
  """
1462
- Search termbases for matching source terms
1463
-
1484
+ Search termbases for matching terms (bidirectional by default)
1485
+
1464
1486
  Args:
1465
- search_term: Source term to search for
1487
+ search_term: Term to search for
1466
1488
  source_lang: Filter by source language (optional)
1467
1489
  target_lang: Filter by target language (optional)
1468
1490
  project_id: Filter by project (optional)
1469
1491
  min_length: Minimum term length to return
1470
-
1492
+ bidirectional: If True, also search target_term and swap results (default True)
1493
+
1471
1494
  Returns:
1472
1495
  List of termbase hits, sorted by priority (lower = higher priority)
1496
+ Each result includes 'match_direction' ('source' or 'target') indicating
1497
+ which column matched. For 'target' matches, source_term and target_term
1498
+ are swapped so results are always oriented correctly for the current project.
1473
1499
  """
1474
1500
  # Build query with filters - include termbase name and ranking via JOIN
1475
1501
  # Note: termbase_id is stored as TEXT in termbase_terms but INTEGER in termbases
1476
1502
  # Use CAST to ensure proper comparison
1477
1503
  # IMPORTANT: Join with termbase_activation to get the ACTUAL priority for this project
1478
1504
  # CRITICAL FIX: Also match when search_term starts with the glossary term
1479
- # This handles cases like searching for "ca." when glossary has "ca."
1505
+ # This handles cases like searching for "ca." when glossary has "ca."
1480
1506
  # AND searching for "ca" when glossary has "ca."
1481
1507
  # We also strip trailing punctuation from glossary terms for comparison
1482
- query = """
1483
- SELECT
1484
- t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
1508
+
1509
+ # Build matching conditions for a given column
1510
+ def build_match_conditions(column: str) -> str:
1511
+ return f"""(
1512
+ LOWER(t.{column}) = LOWER(?) OR
1513
+ LOWER(t.{column}) LIKE LOWER(?) OR
1514
+ LOWER(t.{column}) LIKE LOWER(?) OR
1515
+ LOWER(t.{column}) LIKE LOWER(?) OR
1516
+ LOWER(RTRIM(t.{column}, '.!?,;:')) = LOWER(?) OR
1517
+ LOWER(?) LIKE LOWER(t.{column}) || '%' OR
1518
+ LOWER(?) = LOWER(RTRIM(t.{column}, '.!?,;:'))
1519
+ )"""
1520
+
1521
+ # Build match params for one direction
1522
+ def build_match_params() -> list:
1523
+ return [
1524
+ search_term,
1525
+ f"{search_term} %",
1526
+ f"% {search_term}",
1527
+ f"% {search_term} %",
1528
+ search_term, # For RTRIM comparison
1529
+ search_term, # For reverse LIKE
1530
+ search_term # For reverse RTRIM comparison
1531
+ ]
1532
+
1533
+ # Matching patterns:
1534
+ # 1. Exact match: column = search_term
1535
+ # 2. Glossary term starts with search: column LIKE "search_term %"
1536
+ # 3. Glossary term ends with search: column LIKE "% search_term"
1537
+ # 4. Glossary term contains search: column LIKE "% search_term %"
1538
+ # 5. Glossary term (stripped) = search_term: RTRIM(column) = search_term (handles "ca." = "ca")
1539
+ # 6. Search starts with glossary term: search_term LIKE column || '%'
1540
+ # 7. Search = glossary term stripped: search_term = RTRIM(column)
1541
+
1542
+ # Base SELECT for forward matches (source_term matches)
1543
+ base_select_forward = """
1544
+ SELECT
1545
+ t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
1485
1546
  t.forbidden, t.source_lang, t.target_lang, t.definition, t.domain,
1486
1547
  t.notes, t.project, t.client,
1487
1548
  tb.name as termbase_name,
1488
1549
  tb.source_lang as termbase_source_lang,
1489
1550
  tb.target_lang as termbase_target_lang,
1490
1551
  tb.is_project_termbase,
1491
- COALESCE(ta.priority, tb.ranking) as ranking
1552
+ COALESCE(ta.priority, tb.ranking) as ranking,
1553
+ 'source' as match_direction
1492
1554
  FROM termbase_terms t
1493
1555
  LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
1494
1556
  LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
1495
- WHERE (
1496
- LOWER(t.source_term) = LOWER(?) OR
1497
- LOWER(t.source_term) LIKE LOWER(?) OR
1498
- LOWER(t.source_term) LIKE LOWER(?) OR
1499
- LOWER(t.source_term) LIKE LOWER(?) OR
1500
- LOWER(RTRIM(t.source_term, '.!?,;:')) = LOWER(?) OR
1501
- LOWER(?) LIKE LOWER(t.source_term) || '%' OR
1502
- LOWER(?) = LOWER(RTRIM(t.source_term, '.!?,;:'))
1503
- )
1557
+ WHERE {match_conditions}
1504
1558
  AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
1505
- """
1506
- # Matching patterns:
1507
- # 1. Exact match: source_term = search_term
1508
- # 2. Glossary term starts with search: source_term LIKE "search_term %"
1509
- # 3. Glossary term ends with search: source_term LIKE "% search_term"
1510
- # 4. Glossary term contains search: source_term LIKE "% search_term %"
1511
- # 5. Glossary term (stripped) = search_term: RTRIM(source_term) = search_term (handles "ca." = "ca")
1512
- # 6. Search starts with glossary term: search_term LIKE source_term || '%'
1513
- # 7. Search = glossary term stripped: search_term = RTRIM(source_term)
1514
- params = [
1515
- project_id if project_id else 0, # Use 0 if no project (won't match any activation records)
1516
- search_term,
1517
- f"{search_term} %",
1518
- f"% {search_term}",
1519
- f"% {search_term} %",
1520
- search_term, # For RTRIM comparison
1521
- search_term, # For reverse LIKE
1522
- search_term # For reverse RTRIM comparison
1523
- ]
1524
-
1525
- # Language filters - if term has no language, use termbase language for filtering
1559
+ """.format(match_conditions=build_match_conditions('source_term'))
1560
+
1561
+ # Base SELECT for reverse matches (target_term matches) - swap source/target in output
1562
+ base_select_reverse = """
1563
+ SELECT
1564
+ t.id, t.target_term as source_term, t.source_term as target_term,
1565
+ t.termbase_id, t.priority,
1566
+ t.forbidden, t.target_lang as source_lang, t.source_lang as target_lang,
1567
+ t.definition, t.domain,
1568
+ t.notes, t.project, t.client,
1569
+ tb.name as termbase_name,
1570
+ tb.target_lang as termbase_source_lang,
1571
+ tb.source_lang as termbase_target_lang,
1572
+ tb.is_project_termbase,
1573
+ COALESCE(ta.priority, tb.ranking) as ranking,
1574
+ 'target' as match_direction
1575
+ FROM termbase_terms t
1576
+ LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
1577
+ LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
1578
+ WHERE {match_conditions}
1579
+ AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
1580
+ """.format(match_conditions=build_match_conditions('target_term'))
1581
+
1582
+ # Build params
1583
+ project_param = project_id if project_id else 0
1584
+ forward_params = [project_param] + build_match_params()
1585
+ reverse_params = [project_param] + build_match_params()
1586
+
1587
+ # Build language filter conditions
1588
+ lang_conditions_forward = ""
1589
+ lang_conditions_reverse = ""
1590
+ lang_params_forward = []
1591
+ lang_params_reverse = []
1592
+
1526
1593
  if source_lang:
1527
- query += """ AND (
1528
- t.source_lang = ? OR
1594
+ # For forward: filter on source_lang
1595
+ lang_conditions_forward += """ AND (
1596
+ t.source_lang = ? OR
1529
1597
  (t.source_lang IS NULL AND tb.source_lang = ?) OR
1530
1598
  (t.source_lang IS NULL AND tb.source_lang IS NULL)
1531
1599
  )"""
1532
- params.extend([source_lang, source_lang])
1533
-
1600
+ lang_params_forward.extend([source_lang, source_lang])
1601
+ # For reverse: source_lang becomes target_lang (swapped)
1602
+ lang_conditions_reverse += """ AND (
1603
+ t.target_lang = ? OR
1604
+ (t.target_lang IS NULL AND tb.target_lang = ?) OR
1605
+ (t.target_lang IS NULL AND tb.target_lang IS NULL)
1606
+ )"""
1607
+ lang_params_reverse.extend([source_lang, source_lang])
1608
+
1534
1609
  if target_lang:
1535
- query += """ AND (
1536
- t.target_lang = ? OR
1610
+ # For forward: filter on target_lang
1611
+ lang_conditions_forward += """ AND (
1612
+ t.target_lang = ? OR
1537
1613
  (t.target_lang IS NULL AND tb.target_lang = ?) OR
1538
1614
  (t.target_lang IS NULL AND tb.target_lang IS NULL)
1539
1615
  )"""
1540
- params.extend([target_lang, target_lang])
1541
-
1542
- # Project filter: match project-specific terms OR global terms (project_id IS NULL)
1616
+ lang_params_forward.extend([target_lang, target_lang])
1617
+ # For reverse: target_lang becomes source_lang (swapped)
1618
+ lang_conditions_reverse += """ AND (
1619
+ t.source_lang = ? OR
1620
+ (t.source_lang IS NULL AND tb.source_lang = ?) OR
1621
+ (t.source_lang IS NULL AND tb.source_lang IS NULL)
1622
+ )"""
1623
+ lang_params_reverse.extend([target_lang, target_lang])
1624
+
1625
+ # Project filter conditions
1626
+ project_conditions = ""
1627
+ project_params = []
1543
1628
  if project_id:
1544
- query += " AND (t.project_id = ? OR t.project_id IS NULL)"
1545
- params.append(project_id)
1546
-
1629
+ project_conditions = " AND (t.project_id = ? OR t.project_id IS NULL)"
1630
+ project_params = [project_id]
1631
+
1632
+ # Min length conditions
1633
+ min_len_forward = ""
1634
+ min_len_reverse = ""
1547
1635
  if min_length > 0:
1548
- query += f" AND LENGTH(t.source_term) >= {min_length}"
1549
-
1550
- # Sort by ranking (lower number = higher priority)
1551
- # Project termbases (ranking IS NULL) appear first, then by ranking, then alphabetically
1552
- # Use COALESCE to treat NULL as -1 (highest priority)
1553
- query += " ORDER BY COALESCE(tb.ranking, -1) ASC, t.source_term ASC"
1554
-
1636
+ min_len_forward = f" AND LENGTH(t.source_term) >= {min_length}"
1637
+ min_len_reverse = f" AND LENGTH(t.target_term) >= {min_length}"
1638
+
1639
+ # Build forward query
1640
+ forward_query = base_select_forward + lang_conditions_forward + project_conditions + min_len_forward
1641
+ forward_params.extend(lang_params_forward)
1642
+ forward_params.extend(project_params)
1643
+
1644
+ if bidirectional:
1645
+ # Build reverse query
1646
+ reverse_query = base_select_reverse + lang_conditions_reverse + project_conditions + min_len_reverse
1647
+ reverse_params.extend(lang_params_reverse)
1648
+ reverse_params.extend(project_params)
1649
+
1650
+ # Combine with UNION and sort
1651
+ query = f"""
1652
+ SELECT * FROM (
1653
+ {forward_query}
1654
+ UNION ALL
1655
+ {reverse_query}
1656
+ ) combined
1657
+ ORDER BY COALESCE(ranking, -1) ASC, source_term ASC
1658
+ """
1659
+ params = forward_params + reverse_params
1660
+ else:
1661
+ # Original forward-only behavior
1662
+ query = forward_query + " ORDER BY COALESCE(ranking, -1) ASC, source_term ASC"
1663
+ params = forward_params
1664
+
1555
1665
  self.cursor.execute(query, params)
1556
1666
  results = []
1667
+ seen_combinations = set() # Track (source_term, target_term, termbase_id) to avoid duplicates
1668
+
1557
1669
  for row in self.cursor.fetchall():
1558
1670
  result_dict = dict(row)
1671
+
1672
+ # Deduplicate: same term pair from same termbase should only appear once
1673
+ # Prefer 'source' match over 'target' match
1674
+ combo_key = (
1675
+ result_dict.get('source_term', '').lower(),
1676
+ result_dict.get('target_term', '').lower(),
1677
+ result_dict.get('termbase_id')
1678
+ )
1679
+ if combo_key in seen_combinations:
1680
+ continue
1681
+ seen_combinations.add(combo_key)
1682
+
1559
1683
  # SQLite stores booleans as 0/1, explicitly convert to Python bool
1560
1684
  if 'is_project_termbase' in result_dict:
1561
1685
  result_dict['is_project_termbase'] = bool(result_dict['is_project_termbase'])
1562
-
1686
+
1563
1687
  # Fetch target synonyms for this term and include them in the result
1564
1688
  term_id = result_dict.get('id')
1689
+ match_direction = result_dict.get('match_direction', 'source')
1565
1690
  if term_id:
1566
1691
  try:
1692
+ # For reverse matches, fetch 'source' synonyms since they become targets
1693
+ synonym_lang = 'source' if match_direction == 'target' else 'target'
1567
1694
  self.cursor.execute("""
1568
1695
  SELECT synonym_text, forbidden FROM termbase_synonyms
1569
- WHERE term_id = ? AND language = 'target'
1696
+ WHERE term_id = ? AND language = ?
1570
1697
  ORDER BY display_order ASC
1571
- """, (term_id,))
1698
+ """, (term_id, synonym_lang))
1572
1699
  synonyms = []
1573
1700
  for syn_row in self.cursor.fetchall():
1574
1701
  syn_text = syn_row[0]
@@ -1578,7 +1705,7 @@ class DatabaseManager:
1578
1705
  result_dict['target_synonyms'] = synonyms
1579
1706
  except Exception:
1580
1707
  result_dict['target_synonyms'] = []
1581
-
1708
+
1582
1709
  results.append(result_dict)
1583
1710
  return results
1584
1711
 
@@ -186,9 +186,13 @@ def run_all_migrations(db_manager) -> bool:
186
186
  # Migration 3: Add display_order and forbidden fields to synonyms
187
187
  if not migrate_synonym_fields(db_manager):
188
188
  success = False
189
-
189
+
190
+ # Migration 4: Add ai_inject field to termbases
191
+ if not migrate_termbase_ai_inject(db_manager):
192
+ success = False
193
+
190
194
  print("="*60)
191
-
195
+
192
196
  return success
193
197
 
194
198
 
@@ -221,18 +225,26 @@ def check_and_migrate(db_manager) -> bool:
221
225
 
222
226
  # Check if synonyms table exists
223
227
  cursor.execute("""
224
- SELECT name FROM sqlite_master
228
+ SELECT name FROM sqlite_master
225
229
  WHERE type='table' AND name='termbase_synonyms'
226
230
  """)
227
231
  needs_synonyms_table = cursor.fetchone() is None
228
-
232
+
233
+ # Check if termbases table has ai_inject column
234
+ cursor.execute("PRAGMA table_info(termbases)")
235
+ termbase_columns = {row[1] for row in cursor.fetchall()}
236
+ needs_ai_inject = 'ai_inject' not in termbase_columns
237
+
229
238
  if needs_migration:
230
239
  print(f"⚠️ Migration needed - missing columns: {', '.join([c for c in ['project', 'client', 'term_uuid', 'note'] if c not in columns])}")
231
-
240
+
232
241
  if needs_synonyms_table:
233
242
  print("⚠️ Migration needed - termbase_synonyms table missing")
234
-
235
- if needs_migration or needs_synonyms_table:
243
+
244
+ if needs_ai_inject:
245
+ print("⚠️ Migration needed - termbases.ai_inject column missing")
246
+
247
+ if needs_migration or needs_synonyms_table or needs_ai_inject:
236
248
  success = run_all_migrations(db_manager)
237
249
  if success:
238
250
  # Generate UUIDs for terms that don't have them
@@ -316,6 +328,41 @@ def migrate_synonym_fields(db_manager) -> bool:
316
328
  return False
317
329
 
318
330
 
331
+ def migrate_termbase_ai_inject(db_manager) -> bool:
332
+ """
333
+ Add ai_inject column to termbases table.
334
+ When enabled, the termbase's terms will be injected into LLM translation prompts.
335
+
336
+ Args:
337
+ db_manager: DatabaseManager instance
338
+
339
+ Returns:
340
+ True if migration successful
341
+ """
342
+ try:
343
+ cursor = db_manager.cursor
344
+
345
+ # Check which columns exist
346
+ cursor.execute("PRAGMA table_info(termbases)")
347
+ columns = {row[1] for row in cursor.fetchall()}
348
+
349
+ if 'ai_inject' not in columns:
350
+ print("📊 Adding 'ai_inject' column to termbases...")
351
+ cursor.execute("ALTER TABLE termbases ADD COLUMN ai_inject BOOLEAN DEFAULT 0")
352
+ db_manager.connection.commit()
353
+ print(" ✓ Column 'ai_inject' added successfully")
354
+ else:
355
+ print("✅ termbases.ai_inject column already exists")
356
+
357
+ return True
358
+
359
+ except Exception as e:
360
+ print(f"❌ ai_inject migration failed: {e}")
361
+ import traceback
362
+ traceback.print_exc()
363
+ return False
364
+
365
+
319
366
  def generate_missing_uuids(db_manager) -> bool:
320
367
  """
321
368
  Generate UUIDs for any termbase terms that don't have them.