supervertaler 1.9.153__py3-none-any.whl → 1.9.185__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic. Click here for more details.
- Supervertaler.py +3450 -1135
- modules/database_manager.py +313 -120
- modules/database_migrations.py +54 -7
- modules/extract_tm.py +518 -0
- modules/keyboard_shortcuts_widget.py +7 -0
- modules/mqxliff_handler.py +71 -2
- modules/project_tm.py +320 -0
- modules/superlookup.py +12 -8
- modules/tag_manager.py +20 -2
- modules/termbase_manager.py +105 -2
- modules/termview_widget.py +82 -42
- modules/theme_manager.py +41 -4
- modules/tm_metadata_manager.py +59 -13
- modules/translation_memory.py +4 -13
- modules/translation_results_panel.py +0 -7
- modules/unified_prompt_library.py +2 -2
- modules/unified_prompt_manager_qt.py +47 -18
- supervertaler-1.9.185.dist-info/METADATA +151 -0
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.185.dist-info}/RECORD +23 -21
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.185.dist-info}/WHEEL +1 -1
- supervertaler-1.9.153.dist-info/METADATA +0 -896
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.185.dist-info}/entry_points.txt +0 -0
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.185.dist-info}/licenses/LICENSE +0 -0
- {supervertaler-1.9.153.dist-info → supervertaler-1.9.185.dist-info}/top_level.txt +0 -0
modules/mqxliff_handler.py
CHANGED
|
@@ -159,9 +159,78 @@ class MQXLIFFHandler:
|
|
|
159
159
|
|
|
160
160
|
segment = FormattedSegment(trans_unit_id, plain_text, formatted_xml)
|
|
161
161
|
segments.append(segment)
|
|
162
|
-
|
|
162
|
+
|
|
163
163
|
return segments
|
|
164
|
-
|
|
164
|
+
|
|
165
|
+
def extract_bilingual_segments(self) -> List[Dict]:
    """
    Extract all source AND target segments from the MQXLIFF file.
    Used for importing pretranslated mqxliff files.

    Returns:
        List of dicts with 'id', 'source', 'target', 'status' keys
        (plus 'mq_status' carrying the raw memoQ status string)
    """
    results: List[Dict] = []

    if self.body_element is None:
        return results

    # trans-unit elements may or may not carry the XLIFF namespace
    units = self.body_element.findall('.//xliff:trans-unit', self.NAMESPACES)
    if not units:
        units = self.body_element.findall('.//trans-unit')

    for unit in units:
        unit_id = unit.get('id', 'unknown')

        # Auxiliary segments (like hyperlink URLs) carry mq:nosplitjoin="true"
        if unit.get('{MQXliff}nosplitjoin', 'false') == 'true':
            continue

        # Locate source/target, with and without namespace prefix
        src_elem = unit.find('xliff:source', self.NAMESPACES)
        if src_elem is None:
            src_elem = unit.find('source')

        tgt_elem = unit.find('xliff:target', self.NAMESPACES)
        if tgt_elem is None:
            tgt_elem = unit.find('target')

        src_text = self._extract_plain_text(src_elem) if src_elem is not None else ""
        tgt_text = self._extract_plain_text(tgt_elem) if tgt_elem is not None else ""

        # Raw memoQ status, e.g. "NotStarted", "Editing", "Confirmed", ...
        mq_status = unit.get('{MQXliff}status', '')

        # Map memoQ status onto the internal status vocabulary
        if mq_status in ('Confirmed', 'ProofRead', 'Reviewed'):
            status = 'confirmed'
        elif mq_status == 'Editing':
            status = 'translated'
        elif tgt_text.strip():
            # Target present but status unknown: treat as pre-translated
            status = 'pre_translated'
        else:
            status = 'not_started'

        results.append({
            'id': unit_id,
            'source': src_text,
            'target': tgt_text,
            'status': status,
            'mq_status': mq_status,
        })

    return results
|
|
233
|
+
|
|
165
234
|
def _extract_plain_text(self, element: ET.Element) -> str:
|
|
166
235
|
"""
|
|
167
236
|
Recursively extract plain text from an XML element, stripping all tags.
|
modules/project_tm.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ProjectTM - In-memory TM for instant grid lookups (Total Recall architecture)
|
|
3
|
+
|
|
4
|
+
This module implements a lightweight in-memory Translation Memory that extracts
|
|
5
|
+
relevant segments from the full TM database on project load. This makes grid
|
|
6
|
+
navigation instant while keeping the full TM for concordance searches.
|
|
7
|
+
|
|
8
|
+
Inspired by CafeTran's "Total Recall" feature.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import sqlite3
|
|
12
|
+
import threading
|
|
13
|
+
from difflib import SequenceMatcher
|
|
14
|
+
from typing import Dict, List, Optional, Callable
|
|
15
|
+
import re
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ProjectTM:
    """
    Lightweight in-memory TM extracted from the main TM database.

    On project load, extracts segments that are relevant to the current project
    (fuzzy matches above threshold) into an in-memory SQLite database for
    instant lookups during grid navigation.

    Usage:
        project_tm = ProjectTM()
        project_tm.extract_from_database(
            db_manager,
            project_segments,
            tm_ids=['tm1', 'tm2'],
            threshold=0.75,
            progress_callback=lambda cur, total: print(f"{cur}/{total}")
        )

        # Fast lookup during grid navigation
        matches = project_tm.search("source text to translate")
    """

    def __init__(self):
        """Initialize the in-memory SQLite database for ProjectTM."""
        # check_same_thread=False: the TM may be built on a worker thread and
        # queried from the UI thread; self.lock serializes all access.
        self.conn = sqlite3.connect(":memory:", check_same_thread=False)
        self.conn.row_factory = sqlite3.Row
        self.lock = threading.Lock()
        self.is_built = False      # True once extract_from_database() has run
        self.segment_count = 0     # number of TM rows currently stored

        # Create the schema
        self._create_schema()

    def _create_schema(self):
        """Create the in-memory database schema (segments table + FTS5 index)."""
        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS segments (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    source_text TEXT NOT NULL,
                    target_text TEXT NOT NULL,
                    source_lower TEXT NOT NULL,
                    tm_id TEXT,
                    tm_name TEXT,
                    similarity REAL,
                    original_id INTEGER
                )
            """)
            # Index for fast exact match lookups on the normalized source
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_source_lower ON segments(source_lower)")
            # External-content FTS5 table for fuzzy candidate retrieval
            cursor.execute("""
                CREATE VIRTUAL TABLE IF NOT EXISTS segments_fts USING fts5(
                    source_text,
                    content=segments,
                    content_rowid=id
                )
            """)
            self.conn.commit()

    def clear(self):
        """Clear all segments from the ProjectTM and reset build state."""
        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute("DELETE FROM segments")
            # segments_fts is an external-content FTS5 table: clear its index
            # with the documented 'delete-all' command rather than a plain
            # DELETE (which is not valid DML for external-content tables).
            try:
                cursor.execute("INSERT INTO segments_fts(segments_fts) VALUES('delete-all')")
            except sqlite3.Error:
                pass  # the index is rebuilt on the next extraction anyway
            self.conn.commit()
            self.is_built = False
            self.segment_count = 0

    def extract_from_database(
        self,
        db_manager,
        project_segments: List,
        tm_ids: Optional[List[str]] = None,
        source_lang: Optional[str] = None,
        target_lang: Optional[str] = None,
        threshold: float = 0.75,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        log_callback: Optional[Callable[[str], None]] = None
    ) -> int:
        """
        Extract relevant segments from the main TM database into ProjectTM.

        For each unique source text in the project, searches the TM for fuzzy
        matches above the threshold and stores them in memory.

        Args:
            db_manager: The main database manager with TM data
            project_segments: List of project segments to find matches for
            tm_ids: List of TM IDs to search (None = all active TMs)
            source_lang: Source language filter
            target_lang: Target language filter
            threshold: Minimum similarity threshold (0.0-1.0)
            progress_callback: Optional callback(current, total) for progress
            log_callback: Optional callback(message) for logging

        Returns:
            Number of TM segments extracted
        """
        def log(msg):
            # Route through the caller's logger when provided
            if log_callback:
                log_callback(msg)
            else:
                print(msg)

        self.clear()

        if not project_segments or not db_manager:
            log(f"[ProjectTM] Early exit: segments={bool(project_segments)}, db={bool(db_manager)}")
            return 0

        # Get unique source texts from project, deduplicated case-insensitively
        unique_sources = {}
        for seg in project_segments:
            # Try both 'source' and 'source_text' attributes (different
            # segment types use different names)
            source = getattr(seg, 'source', None) or getattr(seg, 'source_text', None)
            if source and source.strip():
                key = source.strip().lower()
                if key not in unique_sources:
                    unique_sources[key] = source.strip()

        total = len(unique_sources)
        log(f"[ProjectTM] Found {total} unique source texts from {len(project_segments)} segments")
        if total == 0:
            return 0

        extracted_count = 0
        seen_sources = set()  # Deduplicate TM entries across searches

        log(f"[ProjectTM] Searching TMs: {tm_ids}, threshold={threshold}, langs={source_lang}->{target_lang}")

        # Hold the lock for the whole write phase so concurrent search()/clear()
        # calls never interleave with the inserts on this shared connection.
        with self.lock:
            cursor = self.conn.cursor()

            for i, (key, source_text) in enumerate(unique_sources.items()):
                if progress_callback and i % 10 == 0:
                    progress_callback(i, total)

                try:
                    # Search main TM database for fuzzy matches
                    matches = db_manager.search_fuzzy_matches(
                        source_text,
                        tm_ids=tm_ids,
                        threshold=threshold,
                        max_results=10,  # Keep top 10 matches per source
                        source_lang=source_lang,
                        target_lang=target_lang,
                        bidirectional=True
                    )

                    # Debug: log first search
                    if i == 0:
                        log(f"[ProjectTM] First search '{source_text[:50]}...' returned {len(matches)} matches")

                    for match in matches:
                        match_source = match.get('source_text', '')
                        match_target = match.get('target_text', '')

                        if not match_source or not match_target:
                            continue

                        # Deduplicate by source text
                        source_key = match_source.strip().lower()
                        if source_key in seen_sources:
                            continue
                        seen_sources.add(source_key)

                        # Insert into ProjectTM
                        cursor.execute("""
                            INSERT INTO segments (source_text, target_text, source_lower,
                                                  tm_id, tm_name, similarity, original_id)
                            VALUES (?, ?, ?, ?, ?, ?, ?)
                        """, (
                            match_source,
                            match_target,
                            source_key,
                            match.get('tm_id'),
                            match.get('tm_name', 'Unknown TM'),
                            match.get('similarity', 0),
                            match.get('id')
                        ))
                        extracted_count += 1

                except Exception as e:
                    # Continue - don't fail extraction for one bad segment,
                    # but surface the error (was silently swallowed before).
                    log(f"[ProjectTM] Error extracting matches for '{source_text[:50]}': {e}")

            # Commit all inserts
            self.conn.commit()

            # Rebuild the FTS5 index from the external content table
            try:
                cursor.execute("INSERT INTO segments_fts(segments_fts) VALUES('rebuild')")
                self.conn.commit()
            except sqlite3.Error:
                pass  # FTS rebuild may fail if no data, that's OK

        if progress_callback:
            progress_callback(total, total)

        self.is_built = True
        self.segment_count = extracted_count

        return extracted_count

    def search(self, source_text: str, max_results: int = 5) -> List[Dict]:
        """
        Search ProjectTM for matches (instant lookup).

        First checks for an exact (case-insensitive) match, then falls back to
        an FTS5 candidate search re-ranked by SequenceMatcher similarity.

        Args:
            source_text: Source text to search for
            max_results: Maximum number of results to return

        Returns:
            List of match dictionaries with source_text, target_text,
            similarity, match_pct, tm_id, tm_name and id keys.
        """
        if not self.is_built or not source_text:
            return []

        source_lower = source_text.strip().lower()
        results: List[Dict] = []

        with self.lock:
            cursor = self.conn.cursor()

            # 1. Check for exact match first (indexed lookup, fastest path)
            cursor.execute("""
                SELECT * FROM segments WHERE source_lower = ? LIMIT 1
            """, (source_lower,))
            exact = cursor.fetchone()

            if exact:
                results.append({
                    'source_text': exact['source_text'],
                    'target_text': exact['target_text'],
                    'tm_id': exact['tm_id'],
                    'tm_name': exact['tm_name'],
                    'similarity': 1.0,  # Exact match
                    'match_pct': 100,
                    'id': exact['original_id']
                })
                return results  # Exact match - no need to search further

            # 2. FTS5 fuzzy search
            try:
                # Tokenize the query; strip characters FTS5 treats as syntax
                clean_text = re.sub(r'[^\w\s]', ' ', source_text)
                search_terms = [t for t in clean_text.split() if len(t) > 2]

                if search_terms:
                    fts_query = ' OR '.join(f'"{term}"' for term in search_terms[:10])

                    # Over-fetch candidates for re-ranking below
                    cursor.execute("""
                        SELECT s.*, bm25(segments_fts) as rank
                        FROM segments s
                        JOIN segments_fts ON s.id = segments_fts.rowid
                        WHERE segments_fts MATCH ?
                        ORDER BY rank
                        LIMIT ?
                    """, (fts_query, max_results * 3))

                    candidates = cursor.fetchall()

                    # Re-rank by actual similarity
                    for row in candidates:
                        similarity = self._calculate_similarity(source_text, row['source_text'])
                        if similarity >= 0.5:  # Lower threshold: content was pre-filtered at extraction
                            results.append({
                                'source_text': row['source_text'],
                                'target_text': row['target_text'],
                                'tm_id': row['tm_id'],
                                'tm_name': row['tm_name'],
                                'similarity': similarity,
                                'match_pct': int(similarity * 100),
                                'id': row['original_id']
                            })

                    # Sort by similarity and limit
                    results.sort(key=lambda x: x['similarity'], reverse=True)
                    results = results[:max_results]

            except Exception:
                pass  # FTS search may fail on odd queries; return what we have

        return results

    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """Calculate case-insensitive similarity ratio between two texts."""
        # Strip HTML/XML tags so markup does not skew the comparison
        clean1 = re.sub(r'<[^>]+>', '', text1).lower()
        clean2 = re.sub(r'<[^>]+>', '', text2).lower()
        return SequenceMatcher(None, clean1, clean2).ratio()

    def get_stats(self) -> Dict:
        """Get statistics about the ProjectTM."""
        return {
            'is_built': self.is_built,
            'segment_count': self.segment_count
        }
|
modules/superlookup.py
CHANGED
|
@@ -88,14 +88,18 @@ class SuperlookupEngine:
|
|
|
88
88
|
Captured text or None if failed
|
|
89
89
|
"""
|
|
90
90
|
try:
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
91
|
+
# keyboard module is Windows-only
|
|
92
|
+
try:
|
|
93
|
+
import keyboard
|
|
94
|
+
# Wait for hotkey to release before sending Ctrl+C
|
|
95
|
+
time.sleep(0.2)
|
|
96
|
+
# Use keyboard library to send Ctrl+C
|
|
97
|
+
keyboard.press_and_release('ctrl+c')
|
|
98
|
+
time.sleep(0.2)
|
|
99
|
+
except ImportError:
|
|
100
|
+
# On non-Windows, just try to get clipboard content directly
|
|
101
|
+
# (user needs to have copied text manually)
|
|
102
|
+
pass
|
|
99
103
|
|
|
100
104
|
# Get clipboard
|
|
101
105
|
text = pyperclip.paste()
|
modules/tag_manager.py
CHANGED
|
@@ -77,15 +77,33 @@ class TagManager:
|
|
|
77
77
|
runs = []
|
|
78
78
|
current_pos = 0
|
|
79
79
|
|
|
80
|
+
# Check if paragraph style has bold/italic formatting
|
|
81
|
+
# This handles cases like "Subtitle" or "Title" styles that are bold
|
|
82
|
+
style_bold = False
|
|
83
|
+
style_italic = False
|
|
84
|
+
try:
|
|
85
|
+
if paragraph.style and paragraph.style.font:
|
|
86
|
+
if paragraph.style.font.bold:
|
|
87
|
+
style_bold = True
|
|
88
|
+
if paragraph.style.font.italic:
|
|
89
|
+
style_italic = True
|
|
90
|
+
except Exception:
|
|
91
|
+
pass # If we can't read style, just use run-level formatting
|
|
92
|
+
|
|
80
93
|
for run in paragraph.runs:
|
|
81
94
|
text = run.text
|
|
82
95
|
if not text:
|
|
83
96
|
continue
|
|
84
97
|
|
|
98
|
+
# Combine run-level formatting with style-level formatting
|
|
99
|
+
# run.bold can be True, False, or None (None means inherit from style)
|
|
100
|
+
is_bold = run.bold if run.bold is not None else style_bold
|
|
101
|
+
is_italic = run.italic if run.italic is not None else style_italic
|
|
102
|
+
|
|
85
103
|
run_info = FormattingRun(
|
|
86
104
|
text=text,
|
|
87
|
-
bold=
|
|
88
|
-
italic=
|
|
105
|
+
bold=is_bold or False,
|
|
106
|
+
italic=is_italic or False,
|
|
89
107
|
underline=run.underline or False,
|
|
90
108
|
subscript=run.font.subscript or False if run.font else False,
|
|
91
109
|
superscript=run.font.superscript or False if run.font else False,
|
modules/termbase_manager.py
CHANGED
|
@@ -409,7 +409,111 @@ class TermbaseManager:
|
|
|
409
409
|
except Exception as e:
|
|
410
410
|
self.log(f"✗ Error setting termbase read_only: {e}")
|
|
411
411
|
return False
|
|
412
|
-
|
|
412
|
+
|
|
413
|
+
def get_termbase_ai_inject(self, termbase_id: int) -> bool:
    """Get whether termbase terms should be injected into LLM prompts"""
    try:
        cur = self.db_manager.cursor
        cur.execute("SELECT ai_inject FROM termbases WHERE id = ?", (termbase_id,))
        row = cur.fetchone()
        # Missing row or NULL/0 flag both mean "disabled"
        return bool(row and row[0])
    except Exception as e:
        self.log(f"✗ Error getting termbase ai_inject: {e}")
        return False
|
|
423
|
+
|
|
424
|
+
def set_termbase_ai_inject(self, termbase_id: int, ai_inject: bool) -> bool:
    """Set whether termbase terms should be injected into LLM prompts"""
    try:
        flag = 1 if ai_inject else 0
        self.db_manager.cursor.execute("""
            UPDATE termbases SET ai_inject = ? WHERE id = ?
        """, (flag, termbase_id))
        self.db_manager.connection.commit()
        status = "enabled" if ai_inject else "disabled"
        self.log(f"✓ AI injection {status} for termbase {termbase_id}")
        return True
    except Exception as e:
        self.log(f"✗ Error setting termbase ai_inject: {e}")
        return False
|
|
438
|
+
|
|
439
|
+
def get_ai_inject_termbases(self, project_id: Optional[int] = None) -> List[Dict]:
    """
    Get all termbases with ai_inject enabled that are active for the given project.

    Args:
        project_id: Project ID (0 or None for global)

    Returns:
        List of termbase dictionaries with all terms
    """
    try:
        cur = self.db_manager.cursor
        # Falsy project_id (None or 0) maps to the global scope
        cur.execute("""
            SELECT t.id, t.name, t.source_lang, t.target_lang
            FROM termbases t
            LEFT JOIN termbase_activation ta ON t.id = ta.termbase_id AND ta.project_id = ?
            WHERE t.ai_inject = 1
            AND (ta.is_active = 1 OR (t.is_global = 1 AND ta.is_active IS NULL))
            ORDER BY ta.priority ASC, t.name ASC
        """, (project_id if project_id else 0,))

        return [
            {'id': r[0], 'name': r[1], 'source_lang': r[2], 'target_lang': r[3]}
            for r in cur.fetchall()
        ]
    except Exception as e:
        self.log(f"✗ Error getting AI inject termbases: {e}")
        return []
|
|
474
|
+
|
|
475
|
+
def get_ai_inject_terms(self, project_id: Optional[int] = None) -> List[Dict]:
    """
    Get all terms from AI-inject-enabled termbases for the given project.

    Args:
        project_id: Project ID (0 or None for global)

    Returns:
        List of term dictionaries with source_term, target_term, forbidden, termbase_name
    """
    try:
        # Resolve which termbases are flagged for AI injection first
        sources = self.get_ai_inject_termbases(project_id)
        if not sources:
            return []

        cur = self.db_manager.cursor
        terms: List[Dict] = []

        for tb in sources:
            cur.execute("""
                SELECT source_term, target_term, forbidden, priority
                FROM termbase_terms
                WHERE termbase_id = ?
                ORDER BY priority ASC, source_term ASC
            """, (tb['id'],))

            for src, tgt, forbidden, prio in cur.fetchall():
                terms.append({
                    'source_term': src,
                    'target_term': tgt,
                    'forbidden': bool(forbidden),
                    'priority': prio or 99,  # missing/NULL priority sorts last
                    'termbase_name': tb['name'],
                })

        suffix = 'y' if len(sources) == 1 else 'ies'
        self.log(f"📚 Retrieved {len(terms)} terms from {len(sources)} AI-inject glossar{suffix}")
        return terms
    except Exception as e:
        self.log(f"✗ Error getting AI inject terms: {e}")
        return []
|
|
516
|
+
|
|
413
517
|
def set_termbase_priority(self, termbase_id: int, project_id: int, priority: int) -> bool:
|
|
414
518
|
"""
|
|
415
519
|
Set manual priority for a termbase in a specific project.
|
|
@@ -505,7 +609,6 @@ class TermbaseManager:
|
|
|
505
609
|
""", (project_id,))
|
|
506
610
|
|
|
507
611
|
active_ids = [row[0] for row in cursor.fetchall()]
|
|
508
|
-
self.log(f"📋 Found {len(active_ids)} active termbases for project {project_id}: {active_ids}")
|
|
509
612
|
return active_ids
|
|
510
613
|
except Exception as e:
|
|
511
614
|
self.log(f"✗ Error getting active termbase IDs: {e}")
|