PyPI - subtitlekit - Versions diffs - 0.1.0__py3-none-any.whl - Mend

subtitlekit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

subtitlekit/__init__.py +27 -0
subtitlekit/cli/__init__.py +3 -0
subtitlekit/cli/main.py +154 -0
subtitlekit/core/__init__.py +13 -0
subtitlekit/core/cleaner.py +169 -0
subtitlekit/core/encoding.py +100 -0
subtitlekit/core/preprocessor.py +372 -0
subtitlekit/tools/__init__.py +12 -0
subtitlekit/tools/corrections.py +302 -0
subtitlekit/tools/enhanced_matcher.py +543 -0
subtitlekit/tools/matcher.py +352 -0
subtitlekit/tools/overlaps.py +366 -0
subtitlekit/ui/__init__.py +11 -0
subtitlekit/ui/colab.py +240 -0
subtitlekit/ui/desktop.py +397 -0
subtitlekit/updater.py +60 -0
subtitlekit-0.1.0.dist-info/METADATA +206 -0
subtitlekit-0.1.0.dist-info/RECORD +22 -0
subtitlekit-0.1.0.dist-info/WHEEL +5 -0
subtitlekit-0.1.0.dist-info/entry_points.txt +2 -0
subtitlekit-0.1.0.dist-info/licenses/LICENSE +21 -0
subtitlekit-0.1.0.dist-info/top_level.txt +1 -0

subtitlekit/__init__.py ADDED Viewed

@@ -0,0 +1,27 @@
+"""
+SubtitleKit - Subtitle Processing Toolkit
+A comprehensive library for subtitle processing, synchronization, and correction.
+"""
+__version__ = "0.1.0"
+# Re-export the high-level entry points so callers can import them straight
+# from the top-level ``subtitlekit`` package.
+from .tools import merge_subtitles, fix_overlaps, apply_corrections
+from .core import (
+    detect_file_encoding,
+    read_srt_with_fallback,
+    preprocess_srt_file,
+    clean_subtitle_file,
+)
+# Public API: exactly the names re-exported above.
+__all__ = [
+    # Main functions
+    'merge_subtitles',
+    'fix_overlaps',
+    'apply_corrections',
+    # Utilities
+    'detect_file_encoding',
+    'read_srt_with_fallback',
+    'preprocess_srt_file',
+    'clean_subtitle_file',
+]

subtitlekit/cli/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""
+SubtitleKit CLI - Command line interface
+"""

subtitlekit/cli/main.py ADDED Viewed

@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+SubtitleKit CLI - Unified command-line interface
+Usage:
+    subtitlekit merge --original FILE --helper FILE [--helper FILE ...] --output FILE
+    subtitlekit overlaps --input FILE --reference FILE --output FILE [--window N]
+    subtitlekit corrections --input FILE --corrections FILE --output FILE
+"""
+import argparse
+import sys
+from pathlib import Path
+def cmd_merge(args):
+    """Merge subtitle files"""
+    from subtitlekit.tools.matcher import process_subtitles
+    from subtitlekit.core.cleaner import clean_subtitle_file
+    import json
+    import os
+    print(f"Processing subtitles...")
+    print(f"  Original: {args.original}")
+    for i, helper in enumerate(args.helper, 1):
+        print(f"  Helper {i}: {helper}")
+    print(f"  Output: {args.output}")
+    if args.skip_sync:
+        print("  Skipping synchronization")
+    # Clean subtitle formatting
+    print("  Cleaning subtitle formatting...")
+    cleaned_original = clean_subtitle_file(args.original)
+    try:
+        # Process with cleaned file
+        results = process_subtitles(
+            cleaned_original,
+            args.helper,
+            skip_sync=args.skip_sync
+        )
+    finally:
+        # Clean up temporary file
+        if os.path.exists(cleaned_original):
+            os.unlink(cleaned_original)
+    # Write output
+    with open(args.output, 'w', encoding='utf-8') as f:
+        json.dump(results, f, ensure_ascii=False, indent=2)
+    print(f"\n✅ Success! Processed {len(results)} subtitle entries.")
+    print(f"Output written to: {args.output}")
+def cmd_overlaps(args):
+    """Fix timing overlaps"""
+    from subtitlekit.tools.overlaps import fix_problematic_timings
+    print(f"Fixing overlaps and timing issues...")
+    print(f"  Input: {args.input}")
+    print(f"  Reference: {args.reference}")
+    print(f"  Output: {args.output}")
+    print(f"  Window: {args.window}")
+    fix_problematic_timings(
+        args.input,
+        args.reference,
+        args.output,
+        window=args.window,
+        preprocess=args.preprocess
+    )
+    print(f"\n✅ Done! Fixed file saved to: {args.output}")
+def cmd_corrections(args):
+    """Apply corrections from JSON"""
+    from subtitlekit.tools.corrections import apply_corrections_from_file
+    print(f"Applying corrections...")
+    print(f"  Input: {args.input}")
+    print(f"  Corrections: {args.corrections}")
+    print(f"  Output: {args.output}")
+    stats = apply_corrections_from_file(
+        args.input,
+        args.corrections,
+        args.output,
+        verbose=not args.quiet
+    )
+    if args.quiet:
+        print(f"✅ Applied {stats['applied']}/{stats['total']} corrections")
+def main():
+    """Main CLI entry point"""
+    parser = argparse.ArgumentParser(
+        prog='subtitlekit',
+        description='Subtitle processing toolkit: merge, sync, fix, and correct subtitles'
+    )
+    subparsers = parser.add_subparsers(dest='command', help='Commands')
+    # Merge command
+    merge_parser = subparsers.add_parser('merge', help='Merge and synchronize subtitle files')
+    merge_parser.add_argument('--original', required=True, help='Original subtitle file (to translate)')
+    merge_parser.add_argument('--helper', action='append', required=True,
+                             help='Helper subtitle file (can be used multiple times)')
+    merge_parser.add_argument('--output', required=True, help='Output JSON file')
+    merge_parser.add_argument('--skip-sync', action='store_true',
+                             help='Skip ffsubsync synchronization')
+    merge_parser.set_defaults(func=cmd_merge)
+    # Overlaps command
+    overlaps_parser = subparsers.add_parser('overlaps', help='Fix timing overlaps and issues')
+    overlaps_parser.add_argument('--input', required=True, help='Input subtitle file')
+    overlaps_parser.add_argument('--reference', required=True, help='Reference subtitle file')
+    overlaps_parser.add_argument('--output', required=True, help='Output subtitle file')
+    overlaps_parser.add_argument('--window', type=int, default=5,
+                                help='Context window for matching (default: 5)')
+    overlaps_parser.add_argument('--preprocess', action='store_true',
+                                help='Preprocess input file first')
+    overlaps_parser.set_defaults(func=cmd_overlaps)
+    # Corrections command
+    corrections_parser = subparsers.add_parser('corrections', help='Apply corrections from JSON')
+    corrections_parser.add_argument('--input', required=True, help='Input subtitle file')
+    corrections_parser.add_argument('--corrections', required=True, help='Corrections JSON file')
+    corrections_parser.add_argument('--output', required=True, help='Output subtitle file')
+    corrections_parser.add_argument('--quiet', '-q', action='store_true',
+                                   help='Quiet mode (minimal output)')
+    corrections_parser.set_defaults(func=cmd_corrections)
+    # Parse and execute
+    args = parser.parse_args()
+    if not args.command:
+        parser.print_help()
+        return 1
+    try:
+        args.func(args)
+        return 0
+    except Exception as e:
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+        if '--verbose' in sys.argv:
+            raise
+        return 1
+if __name__ == '__main__':
+    sys.exit(main())

subtitlekit/core/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""
+SubtitleKit Core - Encoding, preprocessing, and cleaning utilities
+"""
+# Star imports pull in each submodule's public names; ``__all__`` below
+# narrows what ``from subtitlekit.core import *`` re-exports.
+from .encoding import *
+from .preprocessor import *
+from .cleaner import *
+__all__ = [
+    'detect_file_encoding',
+    'read_srt_with_fallback',
+    'preprocess_srt_file',
+    'clean_subtitle_file',
+]

subtitlekit/core/cleaner.py ADDED Viewed

@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""
+Subtitle formatting cleanup utilities.
+This module provides functions to clean up common subtitle formatting quirks:
+- Remove hyphens at line breaks (when not dialogue markers)
+- Merge duplicate italic tags into a single tag
+"""
+import re
+import pysrt
+from pathlib import Path
+import tempfile
+from subtitlekit.core.encoding import read_srt_with_fallback
+from io import StringIO
+def is_dialogue_subtitle(text_lines):
+    """
+    Check if a subtitle contains dialogue (multiple lines starting with '-').
+    Args:
+        text_lines: List of text lines in the subtitle
+    Returns:
+        True if this appears to be a dialogue subtitle (2+ lines start with '-')
+    """
+    dash_count = sum(1 for line in text_lines if line.strip().startswith('-'))
+    return dash_count >= 2
+def clean_hyphen_line_breaks(text_lines):
+    """
+    Remove hyphens at line breaks when they represent continuation, not dialogue.
+    Args:
+        text_lines: List of text lines
+    Returns:
+        Cleaned list of text lines
+    """
+    if not text_lines or len(text_lines) < 2:
+        return text_lines
+    # Don't touch dialogue subtitles
+    if is_dialogue_subtitle(text_lines):
+        return text_lines
+    cleaned_lines = []
+    i = 0
+    while i < len(text_lines):
+        current_line = text_lines[i]
+        # Check if current line ends with '-' and next line starts with '-'
+        if (i < len(text_lines) - 1 and
+            current_line.rstrip().endswith('-') and
+            text_lines[i + 1].lstrip().startswith('-')):
+            # Remove trailing '-' from current line and leading '-' from next line
+            cleaned_current = current_line.rstrip()[:-1].rstrip()
+            cleaned_next = text_lines[i + 1].lstrip()[1:].lstrip()
+            # Merge them
+            merged_line = cleaned_current + cleaned_next
+            cleaned_lines.append(merged_line)
+            i += 2  # Skip next line since we merged it
+        else:
+            cleaned_lines.append(current_line)
+            i += 1
+    return cleaned_lines
+def merge_duplicate_italic_tags(text):
+    """
+    Merge duplicate italic tags when each line has its own tag.
+    Example:
+        '<i>Line 1</i>\n<i>Line 2</i>' -> '<i>Line 1\nLine 2</i>'
+    Args:
+        text: Subtitle text with potential duplicate italic tags
+    Returns:
+        Text with merged italic tags
+    """
+    # Split by newlines to check each line
+    lines = text.split('\n')
+    # Check if ALL non-empty lines have their own <i>...</i> tags
+    italic_pattern = re.compile(r'^<i>(.+?)</i>$')
+    all_italic = True
+    for line in lines:
+        line = line.strip()
+        if line and not italic_pattern.match(line):
+            all_italic = False
+            break
+    # If all lines have individual italic tags, merge them
+    if all_italic and len(lines) > 1:
+        # Extract content from each line
+        contents = []
+        for line in lines:
+            match = italic_pattern.match(line.strip())
+            if match:
+                contents.append(match.group(1))
+        if contents:
+            # Return single italic tag wrapping all content
+            return '<i>' + '\n'.join(contents) + '</i>'
+    return text
+def clean_subtitle_file(input_path):
+    """
+    Clean a subtitle file and return path to cleaned temporary file.
+    Args:
+        input_path: Path to input SRT file
+    Returns:
+        Path to cleaned temporary SRT file
+    """
+    # Load subtitles with encoding detection
+    content = read_srt_with_fallback(input_path)
+    subs = pysrt.SubRipFile.from_string(content)
+    # Process each subtitle
+    for sub in subs:
+        # Get text lines
+        text_lines = sub.text.split('\n')
+        # Clean hyphen line breaks
+        text_lines = clean_hyphen_line_breaks(text_lines)
+        # Rejoin lines
+        cleaned_text = '\n'.join(text_lines)
+        # Merge duplicate italic tags
+        cleaned_text = merge_duplicate_italic_tags(cleaned_text)
+        # Update subtitle text
+        sub.text = cleaned_text
+    # Save to temporary file
+    temp_file = tempfile.NamedTemporaryFile(
+        mode='w',
+        suffix='.srt',
+        delete=False,
+        encoding='utf-8'
+    )
+    temp_path = temp_file.name
+    temp_file.close()
+    # Write cleaned subtitles
+    subs.save(temp_path, encoding='utf-8')
+    return temp_path
+if __name__ == '__main__':
+    # Test with command line argument
+    import sys
+    if len(sys.argv) > 1:
+        input_file = sys.argv[1]
+        output = clean_subtitle_file(input_file)
+        print(f"Cleaned file saved to: {output}")

subtitlekit/core/encoding.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""
+Utilities for robust encoding detection and handling.
+"""
+import chardet
+from typing import Tuple
+def detect_file_encoding(file_path: str) -> Tuple[str, float]:
+    """
+    Detect the encoding of a file using chardet.
+    Args:
+        file_path: Path to file
+    Returns:
+        Tuple of (encoding, confidence)
+    """
+    with open(file_path, 'rb') as f:
+        raw_data = f.read()
+    result = chardet.detect(raw_data)
+    encoding = result['encoding']
+    confidence = result['confidence']
+    return encoding, confidence
+def read_srt_with_fallback(file_path: str) -> str:
+    """
+    Read SRT file with automatic encoding detection and fallback.
+    Tries in order:
+    1. UTF-8 with BOM (utf-8-sig)
+    2. UTF-8 without BOM
+    3. Auto-detected encoding
+    4. Latin-1 (fallback - never fails)
+    Args:
+        file_path: Path to SRT file
+    Returns:
+        File content as string
+    """
+    encodings_to_try = [
+        'utf-8-sig',  # UTF-8 with BOM
+        'utf-8',      # UTF-8 without BOM
+    ]
+    # Add auto-detected encoding
+    try:
+        detected_enc, confidence = detect_file_encoding(file_path)
+        if detected_enc and confidence > 0.7:
+            # Only use if confidence is high
+            if detected_enc.lower() not in [e.lower() for e in encodings_to_try]:
+                encodings_to_try.insert(2, detected_enc)
+    except Exception:
+        pass  # If detection fails, continue with defaults
+    # Add common fallbacks
+    encodings_to_try.extend([
+        'windows-1252',  # Common Windows encoding
+        'iso-8859-1',    # Latin-1
+        'cp1253',        # Greek Windows
+        'latin-1'        # Always succeeds
+    ])
+    last_error = None
+    for encoding in encodings_to_try:
+        try:
+            with open(file_path, 'r', encoding=encoding) as f:
+                content = f.read()
+            return content
+        except (UnicodeDecodeError, LookupError) as e:
+            last_error = e
+            continue
+    # This should never happen since latin-1 always succeeds
+    raise last_error or Exception(f"Failed to read {file_path}")
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) < 2:
+        print("Usage: python encoding_utils.py <file.srt>")
+        sys.exit(1)
+    file_path = sys.argv[1]
+    # Detect encoding
+    encoding, confidence = detect_file_encoding(file_path)
+    print(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")
+    # Read with fallback
+    try:
+        content = read_srt_with_fallback(file_path)
+        print(f"Successfully read {len(content)} characters")
+    except Exception as e:
+        print(f"Failed to read: {e}")