supervertaler 1.9.153__py3-none-any.whl → 1.9.189__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of supervertaler might be problematic. Click here for more details.

@@ -17,12 +17,38 @@ import sqlite3
17
17
  import os
18
18
  import json
19
19
  import hashlib
20
+ import unicodedata
21
+ import re
20
22
  from datetime import datetime
21
23
  from typing import List, Dict, Optional, Tuple
22
24
  from pathlib import Path
23
25
  from difflib import SequenceMatcher
24
26
 
25
27
 
28
+ def _normalize_for_matching(text: str) -> str:
29
+ """Normalize text for exact matching.
30
+
31
+ Handles invisible differences that would cause exact match to fail:
32
+ - Unicode normalization (NFC)
33
+ - Multiple whitespace -> single space
34
+ - Leading/trailing whitespace
35
+ - Non-breaking spaces -> regular spaces
36
+ """
37
+ if not text:
38
+ return ""
39
+ # Unicode normalize (NFC form)
40
+ text = unicodedata.normalize('NFC', text)
41
+ # Convert non-breaking spaces and other whitespace to regular space
42
+ text = text.replace('\u00a0', ' ') # NBSP
43
+ text = text.replace('\u2007', ' ') # Figure space
44
+ text = text.replace('\u202f', ' ') # Narrow NBSP
45
+ # Collapse multiple whitespace to single space
46
+ text = re.sub(r'\s+', ' ', text)
47
+ # Strip leading/trailing whitespace
48
+ text = text.strip()
49
+ return text
50
+
51
+
26
52
  class DatabaseManager:
27
53
  """Manages SQLite database for translation resources"""
28
54
 
@@ -655,22 +681,46 @@ class DatabaseManager:
655
681
  # TRANSLATION MEMORY METHODS
656
682
  # ============================================
657
683
 
658
- def add_translation_unit(self, source: str, target: str, source_lang: str,
684
+ def add_translation_unit(self, source: str, target: str, source_lang: str,
659
685
  target_lang: str, tm_id: str = 'project',
660
686
  project_id: str = None, context_before: str = None,
661
- context_after: str = None, notes: str = None) -> int:
687
+ context_after: str = None, notes: str = None,
688
+ overwrite: bool = False) -> int:
662
689
  """
663
690
  Add translation unit to database
664
-
691
+
692
+ Args:
693
+ source: Source text
694
+ target: Target text
695
+ source_lang: Source language code
696
+ target_lang: Target language code
697
+ tm_id: TM identifier
698
+ project_id: Optional project ID
699
+ context_before: Optional context before
700
+ context_after: Optional context after
701
+ notes: Optional notes
702
+ overwrite: If True, delete existing entries with same source before inserting
703
+ (implements "Save only latest translation" mode)
704
+
665
705
  Returns: ID of inserted/updated entry
666
706
  """
667
- # Generate hash for fast exact matching
668
- source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()
669
-
707
+ # Generate hash from NORMALIZED source for consistent exact matching
708
+ # This handles invisible differences like Unicode normalization, whitespace variations
709
+ normalized_source = _normalize_for_matching(source)
710
+ source_hash = hashlib.md5(normalized_source.encode('utf-8')).hexdigest()
711
+
670
712
  try:
713
+ # If overwrite mode, delete ALL existing entries with same source_hash and tm_id
714
+ # This ensures only the latest translation is kept
715
+ if overwrite:
716
+ self.cursor.execute("""
717
+ DELETE FROM translation_units
718
+ WHERE source_hash = ? AND tm_id = ?
719
+ """, (source_hash, tm_id))
720
+
671
721
  self.cursor.execute("""
672
- INSERT INTO translation_units
673
- (source_text, target_text, source_lang, target_lang, tm_id,
722
+ INSERT INTO translation_units
723
+ (source_text, target_text, source_lang, target_lang, tm_id,
674
724
  project_id, context_before, context_after, source_hash, notes)
675
725
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
676
726
  ON CONFLICT(source_hash, target_text, tm_id) DO UPDATE SET
@@ -678,42 +728,47 @@ class DatabaseManager:
678
728
  modified_date = CURRENT_TIMESTAMP
679
729
  """, (source, target, source_lang, target_lang, tm_id,
680
730
  project_id, context_before, context_after, source_hash, notes))
681
-
731
+
682
732
  self.connection.commit()
683
733
  return self.cursor.lastrowid
684
-
734
+
685
735
  except Exception as e:
686
736
  self.log(f"Error adding translation unit: {e}")
687
737
  return None
688
738
 
689
739
  def get_exact_match(self, source: str, tm_ids: List[str] = None,
690
- source_lang: str = None, target_lang: str = None,
740
+ source_lang: str = None, target_lang: str = None,
691
741
  bidirectional: bool = True) -> Optional[Dict]:
692
742
  """
693
743
  Get exact match from TM
694
-
744
+
695
745
  Args:
696
746
  source: Source text to match
697
747
  tm_ids: List of TM IDs to search (None = all)
698
748
  source_lang: Filter by source language (base code matching: 'en' matches 'en-US', 'en-GB', etc.)
699
749
  target_lang: Filter by target language (base code matching)
700
750
  bidirectional: If True, search both directions (nl→en AND en→nl)
701
-
751
+
702
752
  Returns: Dictionary with match data or None
703
753
  """
704
754
  from modules.tmx_generator import get_base_lang_code
705
-
755
+
756
+ # Try both normalized and non-normalized hashes for backward compatibility
757
+ # This handles invisible differences like Unicode normalization, whitespace variations
706
758
  source_hash = hashlib.md5(source.encode('utf-8')).hexdigest()
707
-
759
+ normalized_source = _normalize_for_matching(source)
760
+ normalized_hash = hashlib.md5(normalized_source.encode('utf-8')).hexdigest()
761
+
708
762
  # Get base language codes for comparison
709
763
  src_base = get_base_lang_code(source_lang) if source_lang else None
710
764
  tgt_base = get_base_lang_code(target_lang) if target_lang else None
711
-
765
+
766
+ # Search using both original hash and normalized hash
712
767
  query = """
713
- SELECT * FROM translation_units
714
- WHERE source_hash = ? AND source_text = ?
768
+ SELECT * FROM translation_units
769
+ WHERE (source_hash = ? OR source_hash = ?)
715
770
  """
716
- params = [source_hash, source]
771
+ params = [source_hash, normalized_hash]
717
772
 
718
773
  if tm_ids:
719
774
  placeholders = ','.join('?' * len(tm_ids))
@@ -840,11 +895,15 @@ class DatabaseManager:
840
895
  bidirectional: If True, search both directions (nl→en AND en→nl)
841
896
 
842
897
  Returns: List of matches with similarity scores
898
+
899
+ Note: When multiple TMs are provided, searches each TM separately to ensure
900
+ good matches from smaller TMs aren't pushed out by BM25 keyword ranking
901
+ from larger TMs. Results are merged and sorted by actual similarity.
843
902
  """
844
903
  # For better FTS5 matching, tokenize the query and escape special chars
845
904
  # FTS5 special characters: " ( ) - : , . ! ?
846
905
  import re
847
- from modules.tmx_generator import get_base_lang_code
906
+ from modules.tmx_generator import get_base_lang_code, get_lang_match_variants
848
907
 
849
908
  # Strip HTML/XML tags from source for clean text search
850
909
  text_without_tags = re.sub(r'<[^>]+>', '', source)
@@ -868,22 +927,57 @@ class DatabaseManager:
868
927
  # This helps find similar long segments more reliably
869
928
  search_terms_for_query = all_search_terms[:20]
870
929
 
871
- print(f"[DEBUG] search_fuzzy_matches: source='{source[:50]}...', {len(all_search_terms)} terms")
872
-
873
930
  if not search_terms_for_query:
874
931
  # If no valid terms, return empty results
875
- print(f"[DEBUG] search_fuzzy_matches: No valid search terms, returning empty")
876
932
  return []
877
933
 
878
934
  # Quote each term to prevent FTS5 syntax errors
879
935
  fts_query = ' OR '.join(f'"{term}"' for term in search_terms_for_query)
880
- print(f"[DEBUG] search_fuzzy_matches: FTS query terms = {search_terms_for_query[:10]}...")
881
936
 
882
937
  # Get base language codes for comparison
883
938
  src_base = get_base_lang_code(source_lang) if source_lang else None
884
939
  tgt_base = get_base_lang_code(target_lang) if target_lang else None
885
940
 
886
- # Use FTS5 for initial candidate retrieval (fast)
941
+ # MULTI-TM FIX: Search each TM separately to avoid BM25 ranking issues
942
+ # When a large TM is combined with a small TM, the large TM's many keyword matches
943
+ # push down genuinely similar sentences from the small TM
944
+ tms_to_search = tm_ids if tm_ids else [None] # None means search all TMs together
945
+
946
+ all_results = []
947
+
948
+ for tm_id in tms_to_search:
949
+ # Search this specific TM (or all if tm_id is None)
950
+ tm_results = self._search_single_tm_fuzzy(
951
+ source, fts_query, [tm_id] if tm_id else None,
952
+ threshold, max_results, src_base, tgt_base,
953
+ source_lang, target_lang, bidirectional
954
+ )
955
+ all_results.extend(tm_results)
956
+
957
+ # Deduplicate by source_text (keep highest similarity for each unique source)
958
+ seen = {}
959
+ for result in all_results:
960
+ key = result['source_text']
961
+ if key not in seen or result['similarity'] > seen[key]['similarity']:
962
+ seen[key] = result
963
+
964
+ deduped_results = list(seen.values())
965
+
966
+ # Sort ALL results by similarity (highest first) - this ensures the 76% match
967
+ # appears before 40% matches regardless of which TM they came from
968
+ deduped_results.sort(key=lambda x: x['similarity'], reverse=True)
969
+
970
+ return deduped_results[:max_results]
971
+
972
+ def _search_single_tm_fuzzy(self, source: str, fts_query: str, tm_ids: List[str],
973
+ threshold: float, max_results: int,
974
+ src_base: str, tgt_base: str,
975
+ source_lang: str, target_lang: str,
976
+ bidirectional: bool) -> List[Dict]:
977
+ """Search a single TM (or all TMs if tm_ids is None) for fuzzy matches"""
978
+ from modules.tmx_generator import get_lang_match_variants
979
+
980
+ # Build query for this TM
887
981
  query = """
888
982
  SELECT tu.*,
889
983
  bm25(translation_units_fts) as relevance
@@ -893,13 +987,12 @@ class DatabaseManager:
893
987
  """
894
988
  params = [fts_query]
895
989
 
896
- if tm_ids:
990
+ if tm_ids and tm_ids[0] is not None:
897
991
  placeholders = ','.join('?' * len(tm_ids))
898
992
  query += f" AND tu.tm_id IN ({placeholders})"
899
993
  params.extend(tm_ids)
900
994
 
901
995
  # Use flexible language matching (matches 'nl', 'nl-NL', 'Dutch', etc.)
902
- from modules.tmx_generator import get_lang_match_variants
903
996
  if src_base:
904
997
  src_variants = get_lang_match_variants(source_lang)
905
998
  src_conditions = []
@@ -920,19 +1013,16 @@ class DatabaseManager:
920
1013
  params.append(f"{variant}-%")
921
1014
  query += f" AND ({' OR '.join(tgt_conditions)})"
922
1015
 
923
- # Get more candidates than needed for proper scoring (increase limit for long segments)
924
- # Long segments need MANY more candidates because BM25 ranking may push down
925
- # the truly similar entries in favor of entries matching more search terms
1016
+ # Per-TM candidate limit - INCREASED to catch more potential fuzzy matches
1017
+ # When multiple TMs are searched, BM25 ranking can push genuinely similar
1018
+ # entries far down the list due to common word matches in other entries
926
1019
  candidate_limit = max(500, max_results * 50)
927
1020
  query += f" ORDER BY relevance DESC LIMIT {candidate_limit}"
928
1021
 
929
- print(f"[DEBUG] search_fuzzy_matches: Executing query (limit={candidate_limit})...")
930
-
931
1022
  try:
932
1023
  self.cursor.execute(query, params)
933
1024
  all_rows = self.cursor.fetchall()
934
1025
  except Exception as e:
935
- print(f"[DEBUG] search_fuzzy_matches: SQL ERROR: {e}")
936
1026
  return []
937
1027
 
938
1028
  results = []
@@ -948,8 +1038,6 @@ class DatabaseManager:
948
1038
  match_dict['match_pct'] = int(similarity * 100)
949
1039
  results.append(match_dict)
950
1040
 
951
- print(f"[DEBUG] search_fuzzy_matches: After threshold filter ({threshold}): {len(results)} matches")
952
-
953
1041
  # If bidirectional, also search reverse direction
954
1042
  if bidirectional and src_base and tgt_base:
955
1043
  query = """
@@ -961,13 +1049,12 @@ class DatabaseManager:
961
1049
  """
962
1050
  params = [fts_query]
963
1051
 
964
- if tm_ids:
1052
+ if tm_ids and tm_ids[0] is not None:
965
1053
  placeholders = ','.join('?' * len(tm_ids))
966
1054
  query += f" AND tu.tm_id IN ({placeholders})"
967
1055
  params.extend(tm_ids)
968
1056
 
969
1057
  # Reversed language filters with flexible matching
970
- # For reverse: TM target_lang should match our source_lang, TM source_lang should match our target_lang
971
1058
  src_variants = get_lang_match_variants(source_lang)
972
1059
  tgt_variants = get_lang_match_variants(target_lang)
973
1060
 
@@ -991,26 +1078,27 @@ class DatabaseManager:
991
1078
 
992
1079
  query += f" ORDER BY relevance DESC LIMIT {max_results * 5}"
993
1080
 
994
- self.cursor.execute(query, params)
995
-
996
- for row in self.cursor.fetchall():
997
- match_dict = dict(row)
998
- # Calculate similarity against target_text (since we're reversing)
999
- similarity = self.calculate_similarity(source, match_dict['target_text'])
1081
+ try:
1082
+ self.cursor.execute(query, params)
1000
1083
 
1001
- # Only include matches above threshold
1002
- if similarity >= threshold:
1003
- # Swap source/target for reverse match
1004
- match_dict['source_text'], match_dict['target_text'] = match_dict['target_text'], match_dict['source_text']
1005
- match_dict['source_lang'], match_dict['target_lang'] = match_dict['target_lang'], match_dict['source_lang']
1006
- match_dict['similarity'] = similarity
1007
- match_dict['match_pct'] = int(similarity * 100)
1008
- match_dict['reverse_match'] = True
1009
- results.append(match_dict)
1010
-
1011
- # Sort by similarity (highest first) and limit results
1012
- results.sort(key=lambda x: x['similarity'], reverse=True)
1013
- return results[:max_results]
1084
+ for row in self.cursor.fetchall():
1085
+ match_dict = dict(row)
1086
+ # Calculate similarity against target_text (since we're reversing)
1087
+ similarity = self.calculate_similarity(source, match_dict['target_text'])
1088
+
1089
+ # Only include matches above threshold
1090
+ if similarity >= threshold:
1091
+ # Swap source/target for reverse match
1092
+ match_dict['source_text'], match_dict['target_text'] = match_dict['target_text'], match_dict['source_text']
1093
+ match_dict['source_lang'], match_dict['target_lang'] = match_dict['target_lang'], match_dict['source_lang']
1094
+ match_dict['similarity'] = similarity
1095
+ match_dict['match_pct'] = int(similarity * 100)
1096
+ match_dict['reverse_match'] = True
1097
+ results.append(match_dict)
1098
+ except Exception as e:
1099
+ print(f"[DEBUG] _search_single_tm_fuzzy (reverse): SQL ERROR: {e}")
1100
+
1101
+ return results
1014
1102
 
1015
1103
  def search_all(self, source: str, tm_ids: List[str] = None, enabled_only: bool = True,
1016
1104
  threshold: float = 0.75, max_results: int = 10) -> List[Dict]:
@@ -1389,120 +1477,225 @@ class DatabaseManager:
1389
1477
  # TODO: Implement in Phase 3
1390
1478
  pass
1391
1479
 
1392
- def search_termbases(self, search_term: str, source_lang: str = None,
1480
+ def search_termbases(self, search_term: str, source_lang: str = None,
1393
1481
  target_lang: str = None, project_id: str = None,
1394
- min_length: int = 0) -> List[Dict]:
1482
+ min_length: int = 0, bidirectional: bool = True) -> List[Dict]:
1395
1483
  """
1396
- Search termbases for matching source terms
1397
-
1484
+ Search termbases for matching terms (bidirectional by default)
1485
+
1398
1486
  Args:
1399
- search_term: Source term to search for
1487
+ search_term: Term to search for
1400
1488
  source_lang: Filter by source language (optional)
1401
1489
  target_lang: Filter by target language (optional)
1402
1490
  project_id: Filter by project (optional)
1403
1491
  min_length: Minimum term length to return
1404
-
1492
+ bidirectional: If True, also search target_term and swap results (default True)
1493
+
1405
1494
  Returns:
1406
1495
  List of termbase hits, sorted by priority (lower = higher priority)
1496
+ Each result includes 'match_direction' ('source' or 'target') indicating
1497
+ which column matched. For 'target' matches, source_term and target_term
1498
+ are swapped so results are always oriented correctly for the current project.
1407
1499
  """
1408
1500
  # Build query with filters - include termbase name and ranking via JOIN
1409
1501
  # Note: termbase_id is stored as TEXT in termbase_terms but INTEGER in termbases
1410
1502
  # Use CAST to ensure proper comparison
1411
1503
  # IMPORTANT: Join with termbase_activation to get the ACTUAL priority for this project
1412
1504
  # CRITICAL FIX: Also match when search_term starts with the glossary term
1413
- # This handles cases like searching for "ca." when glossary has "ca."
1505
+ # This handles cases like searching for "ca." when glossary has "ca."
1414
1506
  # AND searching for "ca" when glossary has "ca."
1415
1507
  # We also strip trailing punctuation from glossary terms for comparison
1416
- query = """
1417
- SELECT
1418
- t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
1508
+
1509
+ # Build matching conditions for a given column
1510
+ def build_match_conditions(column: str) -> str:
1511
+ return f"""(
1512
+ LOWER(t.{column}) = LOWER(?) OR
1513
+ LOWER(t.{column}) LIKE LOWER(?) OR
1514
+ LOWER(t.{column}) LIKE LOWER(?) OR
1515
+ LOWER(t.{column}) LIKE LOWER(?) OR
1516
+ LOWER(RTRIM(t.{column}, '.!?,;:')) = LOWER(?) OR
1517
+ LOWER(?) LIKE LOWER(t.{column}) || '%' OR
1518
+ LOWER(?) = LOWER(RTRIM(t.{column}, '.!?,;:'))
1519
+ )"""
1520
+
1521
+ # Build match params for one direction
1522
+ def build_match_params() -> list:
1523
+ return [
1524
+ search_term,
1525
+ f"{search_term} %",
1526
+ f"% {search_term}",
1527
+ f"% {search_term} %",
1528
+ search_term, # For RTRIM comparison
1529
+ search_term, # For reverse LIKE
1530
+ search_term # For reverse RTRIM comparison
1531
+ ]
1532
+
1533
+ # Matching patterns:
1534
+ # 1. Exact match: column = search_term
1535
+ # 2. Glossary term starts with search: column LIKE "search_term %"
1536
+ # 3. Glossary term ends with search: column LIKE "% search_term"
1537
+ # 4. Glossary term contains search: column LIKE "% search_term %"
1538
+ # 5. Glossary term (stripped) = search_term: RTRIM(column) = search_term (handles "ca." = "ca")
1539
+ # 6. Search starts with glossary term: search_term LIKE column || '%'
1540
+ # 7. Search = glossary term stripped: search_term = RTRIM(column)
1541
+
1542
+ # Base SELECT for forward matches (source_term matches)
1543
+ base_select_forward = """
1544
+ SELECT
1545
+ t.id, t.source_term, t.target_term, t.termbase_id, t.priority,
1419
1546
  t.forbidden, t.source_lang, t.target_lang, t.definition, t.domain,
1420
1547
  t.notes, t.project, t.client,
1421
1548
  tb.name as termbase_name,
1422
1549
  tb.source_lang as termbase_source_lang,
1423
1550
  tb.target_lang as termbase_target_lang,
1424
1551
  tb.is_project_termbase,
1425
- COALESCE(ta.priority, tb.ranking) as ranking
1552
+ COALESCE(ta.priority, tb.ranking) as ranking,
1553
+ 'source' as match_direction
1426
1554
  FROM termbase_terms t
1427
1555
  LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
1428
1556
  LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
1429
- WHERE (
1430
- LOWER(t.source_term) = LOWER(?) OR
1431
- LOWER(t.source_term) LIKE LOWER(?) OR
1432
- LOWER(t.source_term) LIKE LOWER(?) OR
1433
- LOWER(t.source_term) LIKE LOWER(?) OR
1434
- LOWER(RTRIM(t.source_term, '.!?,;:')) = LOWER(?) OR
1435
- LOWER(?) LIKE LOWER(t.source_term) || '%' OR
1436
- LOWER(?) = LOWER(RTRIM(t.source_term, '.!?,;:'))
1437
- )
1557
+ WHERE {match_conditions}
1438
1558
  AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
1439
- """
1440
- # Matching patterns:
1441
- # 1. Exact match: source_term = search_term
1442
- # 2. Glossary term starts with search: source_term LIKE "search_term %"
1443
- # 3. Glossary term ends with search: source_term LIKE "% search_term"
1444
- # 4. Glossary term contains search: source_term LIKE "% search_term %"
1445
- # 5. Glossary term (stripped) = search_term: RTRIM(source_term) = search_term (handles "ca." = "ca")
1446
- # 6. Search starts with glossary term: search_term LIKE source_term || '%'
1447
- # 7. Search = glossary term stripped: search_term = RTRIM(source_term)
1448
- params = [
1449
- project_id if project_id else 0, # Use 0 if no project (won't match any activation records)
1450
- search_term,
1451
- f"{search_term} %",
1452
- f"% {search_term}",
1453
- f"% {search_term} %",
1454
- search_term, # For RTRIM comparison
1455
- search_term, # For reverse LIKE
1456
- search_term # For reverse RTRIM comparison
1457
- ]
1458
-
1459
- # Language filters - if term has no language, use termbase language for filtering
1559
+ """.format(match_conditions=build_match_conditions('source_term'))
1560
+
1561
+ # Base SELECT for reverse matches (target_term matches) - swap source/target in output
1562
+ base_select_reverse = """
1563
+ SELECT
1564
+ t.id, t.target_term as source_term, t.source_term as target_term,
1565
+ t.termbase_id, t.priority,
1566
+ t.forbidden, t.target_lang as source_lang, t.source_lang as target_lang,
1567
+ t.definition, t.domain,
1568
+ t.notes, t.project, t.client,
1569
+ tb.name as termbase_name,
1570
+ tb.target_lang as termbase_source_lang,
1571
+ tb.source_lang as termbase_target_lang,
1572
+ tb.is_project_termbase,
1573
+ COALESCE(ta.priority, tb.ranking) as ranking,
1574
+ 'target' as match_direction
1575
+ FROM termbase_terms t
1576
+ LEFT JOIN termbases tb ON CAST(t.termbase_id AS INTEGER) = tb.id
1577
+ LEFT JOIN termbase_activation ta ON ta.termbase_id = tb.id AND ta.project_id = ? AND ta.is_active = 1
1578
+ WHERE {match_conditions}
1579
+ AND (ta.is_active = 1 OR tb.is_project_termbase = 1)
1580
+ """.format(match_conditions=build_match_conditions('target_term'))
1581
+
1582
+ # Build params
1583
+ project_param = project_id if project_id else 0
1584
+ forward_params = [project_param] + build_match_params()
1585
+ reverse_params = [project_param] + build_match_params()
1586
+
1587
+ # Build language filter conditions
1588
+ lang_conditions_forward = ""
1589
+ lang_conditions_reverse = ""
1590
+ lang_params_forward = []
1591
+ lang_params_reverse = []
1592
+
1460
1593
  if source_lang:
1461
- query += """ AND (
1462
- t.source_lang = ? OR
1594
+ # For forward: filter on source_lang
1595
+ lang_conditions_forward += """ AND (
1596
+ t.source_lang = ? OR
1463
1597
  (t.source_lang IS NULL AND tb.source_lang = ?) OR
1464
1598
  (t.source_lang IS NULL AND tb.source_lang IS NULL)
1465
1599
  )"""
1466
- params.extend([source_lang, source_lang])
1467
-
1600
+ lang_params_forward.extend([source_lang, source_lang])
1601
+ # For reverse: source_lang becomes target_lang (swapped)
1602
+ lang_conditions_reverse += """ AND (
1603
+ t.target_lang = ? OR
1604
+ (t.target_lang IS NULL AND tb.target_lang = ?) OR
1605
+ (t.target_lang IS NULL AND tb.target_lang IS NULL)
1606
+ )"""
1607
+ lang_params_reverse.extend([source_lang, source_lang])
1608
+
1468
1609
  if target_lang:
1469
- query += """ AND (
1470
- t.target_lang = ? OR
1610
+ # For forward: filter on target_lang
1611
+ lang_conditions_forward += """ AND (
1612
+ t.target_lang = ? OR
1471
1613
  (t.target_lang IS NULL AND tb.target_lang = ?) OR
1472
1614
  (t.target_lang IS NULL AND tb.target_lang IS NULL)
1473
1615
  )"""
1474
- params.extend([target_lang, target_lang])
1475
-
1476
- # Project filter: match project-specific terms OR global terms (project_id IS NULL)
1616
+ lang_params_forward.extend([target_lang, target_lang])
1617
+ # For reverse: target_lang becomes source_lang (swapped)
1618
+ lang_conditions_reverse += """ AND (
1619
+ t.source_lang = ? OR
1620
+ (t.source_lang IS NULL AND tb.source_lang = ?) OR
1621
+ (t.source_lang IS NULL AND tb.source_lang IS NULL)
1622
+ )"""
1623
+ lang_params_reverse.extend([target_lang, target_lang])
1624
+
1625
+ # Project filter conditions
1626
+ project_conditions = ""
1627
+ project_params = []
1477
1628
  if project_id:
1478
- query += " AND (t.project_id = ? OR t.project_id IS NULL)"
1479
- params.append(project_id)
1480
-
1629
+ project_conditions = " AND (t.project_id = ? OR t.project_id IS NULL)"
1630
+ project_params = [project_id]
1631
+
1632
+ # Min length conditions
1633
+ min_len_forward = ""
1634
+ min_len_reverse = ""
1481
1635
  if min_length > 0:
1482
- query += f" AND LENGTH(t.source_term) >= {min_length}"
1483
-
1484
- # Sort by ranking (lower number = higher priority)
1485
- # Project termbases (ranking IS NULL) appear first, then by ranking, then alphabetically
1486
- # Use COALESCE to treat NULL as -1 (highest priority)
1487
- query += " ORDER BY COALESCE(tb.ranking, -1) ASC, t.source_term ASC"
1488
-
1636
+ min_len_forward = f" AND LENGTH(t.source_term) >= {min_length}"
1637
+ min_len_reverse = f" AND LENGTH(t.target_term) >= {min_length}"
1638
+
1639
+ # Build forward query
1640
+ forward_query = base_select_forward + lang_conditions_forward + project_conditions + min_len_forward
1641
+ forward_params.extend(lang_params_forward)
1642
+ forward_params.extend(project_params)
1643
+
1644
+ if bidirectional:
1645
+ # Build reverse query
1646
+ reverse_query = base_select_reverse + lang_conditions_reverse + project_conditions + min_len_reverse
1647
+ reverse_params.extend(lang_params_reverse)
1648
+ reverse_params.extend(project_params)
1649
+
1650
+ # Combine with UNION and sort
1651
+ query = f"""
1652
+ SELECT * FROM (
1653
+ {forward_query}
1654
+ UNION ALL
1655
+ {reverse_query}
1656
+ ) combined
1657
+ ORDER BY COALESCE(ranking, -1) ASC, source_term ASC
1658
+ """
1659
+ params = forward_params + reverse_params
1660
+ else:
1661
+ # Original forward-only behavior
1662
+ query = forward_query + " ORDER BY COALESCE(ranking, -1) ASC, source_term ASC"
1663
+ params = forward_params
1664
+
1489
1665
  self.cursor.execute(query, params)
1490
1666
  results = []
1667
+ seen_combinations = set() # Track (source_term, target_term, termbase_id) to avoid duplicates
1668
+
1491
1669
  for row in self.cursor.fetchall():
1492
1670
  result_dict = dict(row)
1671
+
1672
+ # Deduplicate: same term pair from same termbase should only appear once
1673
+ # Prefer 'source' match over 'target' match
1674
+ combo_key = (
1675
+ result_dict.get('source_term', '').lower(),
1676
+ result_dict.get('target_term', '').lower(),
1677
+ result_dict.get('termbase_id')
1678
+ )
1679
+ if combo_key in seen_combinations:
1680
+ continue
1681
+ seen_combinations.add(combo_key)
1682
+
1493
1683
  # SQLite stores booleans as 0/1, explicitly convert to Python bool
1494
1684
  if 'is_project_termbase' in result_dict:
1495
1685
  result_dict['is_project_termbase'] = bool(result_dict['is_project_termbase'])
1496
-
1686
+
1497
1687
  # Fetch target synonyms for this term and include them in the result
1498
1688
  term_id = result_dict.get('id')
1689
+ match_direction = result_dict.get('match_direction', 'source')
1499
1690
  if term_id:
1500
1691
  try:
1692
+ # For reverse matches, fetch 'source' synonyms since they become targets
1693
+ synonym_lang = 'source' if match_direction == 'target' else 'target'
1501
1694
  self.cursor.execute("""
1502
1695
  SELECT synonym_text, forbidden FROM termbase_synonyms
1503
- WHERE term_id = ? AND language = 'target'
1696
+ WHERE term_id = ? AND language = ?
1504
1697
  ORDER BY display_order ASC
1505
- """, (term_id,))
1698
+ """, (term_id, synonym_lang))
1506
1699
  synonyms = []
1507
1700
  for syn_row in self.cursor.fetchall():
1508
1701
  syn_text = syn_row[0]
@@ -1512,7 +1705,7 @@ class DatabaseManager:
1512
1705
  result_dict['target_synonyms'] = synonyms
1513
1706
  except Exception:
1514
1707
  result_dict['target_synonyms'] = []
1515
-
1708
+
1516
1709
  results.append(result_dict)
1517
1710
  return results
1518
1711