subtitlekit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ """
2
+ SubtitleKit - Subtitle Processing Toolkit
3
+
4
+ A comprehensive library for subtitle processing, synchronization, and correction.
5
+ """
6
+
7
+ __version__ = "0.1.0"
8
+
9
+ from .tools import merge_subtitles, fix_overlaps, apply_corrections
10
+ from .core import (
11
+ detect_file_encoding,
12
+ read_srt_with_fallback,
13
+ preprocess_srt_file,
14
+ clean_subtitle_file,
15
+ )
16
+
17
+ __all__ = [
18
+ # Main functions
19
+ 'merge_subtitles',
20
+ 'fix_overlaps',
21
+ 'apply_corrections',
22
+ # Utilities
23
+ 'detect_file_encoding',
24
+ 'read_srt_with_fallback',
25
+ 'preprocess_srt_file',
26
+ 'clean_subtitle_file',
27
+ ]
@@ -0,0 +1,3 @@
1
+ """
2
+ SubtitleKit CLI - Command line interface
3
+ """
@@ -0,0 +1,154 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ SubtitleKit CLI - Unified command-line interface
4
+
5
+ Usage:
6
+ subtitlekit merge --original FILE --helper FILE [--helper FILE ...] --output FILE
7
+ subtitlekit overlaps --input FILE --reference FILE --output FILE [--window N]
8
+ subtitlekit corrections --input FILE --corrections FILE --output FILE
9
+ """
10
+
11
+ import argparse
12
+ import sys
13
+ from pathlib import Path
14
+
15
+
16
def cmd_merge(args):
    """Run the ``merge`` sub-command.

    Cleans the original subtitle file into a temporary copy, feeds that copy
    and the helper files to ``process_subtitles``, and writes the returned
    entries to ``args.output`` as UTF-8 JSON.

    Args:
        args: Parsed CLI namespace; reads ``original``, ``helper`` (a list of
            paths), ``output`` and ``skip_sync``.
    """
    from subtitlekit.tools.matcher import process_subtitles
    from subtitlekit.core.cleaner import clean_subtitle_file
    import json
    import os

    # Echo the job description before doing any work.
    print("Processing subtitles...")
    print(f" Original: {args.original}")
    for position, helper_path in enumerate(args.helper, 1):
        print(f" Helper {position}: {helper_path}")
    print(f" Output: {args.output}")

    if args.skip_sync:
        print(" Skipping synchronization")

    print(" Cleaning subtitle formatting...")
    temp_original = clean_subtitle_file(args.original)

    try:
        entries = process_subtitles(
            temp_original,
            args.helper,
            skip_sync=args.skip_sync,
        )
    finally:
        # The cleaned copy is a throwaway file; remove it whether or not
        # processing succeeded.
        if os.path.exists(temp_original):
            os.unlink(temp_original)

    with open(args.output, 'w', encoding='utf-8') as out_file:
        json.dump(entries, out_file, ensure_ascii=False, indent=2)

    print(f"\n✅ Success! Processed {len(entries)} subtitle entries.")
    print(f"Output written to: {args.output}")
54
+
55
+
56
def cmd_overlaps(args):
    """Run the ``overlaps`` sub-command.

    Prints the chosen settings, then delegates to ``fix_problematic_timings``.

    Args:
        args: Parsed CLI namespace; reads ``input``, ``reference``,
            ``output``, ``window`` and ``preprocess``.
    """
    from subtitlekit.tools.overlaps import fix_problematic_timings

    print("Fixing overlaps and timing issues...")
    settings = (
        ("Input", args.input),
        ("Reference", args.reference),
        ("Output", args.output),
        ("Window", args.window),
    )
    for label, value in settings:
        print(f" {label}: {value}")

    fix_problematic_timings(
        args.input,
        args.reference,
        args.output,
        window=args.window,
        preprocess=args.preprocess,
    )

    print(f"\n✅ Done! Fixed file saved to: {args.output}")
75
+
76
+
77
def cmd_corrections(args):
    """Run the ``corrections`` sub-command.

    Prints the chosen settings and delegates to
    ``apply_corrections_from_file``. In quiet mode the helper is called with
    ``verbose=False`` and a one-line summary is printed here instead.

    Args:
        args: Parsed CLI namespace; reads ``input``, ``corrections``,
            ``output`` and ``quiet``.
    """
    from subtitlekit.tools.corrections import apply_corrections_from_file

    print("Applying corrections...")
    for label, value in (
        ("Input", args.input),
        ("Corrections", args.corrections),
        ("Output", args.output),
    ):
        print(f" {label}: {value}")

    stats = apply_corrections_from_file(
        args.input,
        args.corrections,
        args.output,
        verbose=not args.quiet,
    )

    if not args.quiet:
        return
    print(f"✅ Applied {stats['applied']}/{stats['total']} corrections")
95
+
96
+
97
def main(argv=None):
    """Parse command-line arguments and dispatch to the selected sub-command.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse falls back to ``sys.argv[1:]``.

    Returns:
        Exit status: 0 when the command completed, 1 when no command was
        given or the command raised an exception.

    Raises:
        SystemExit: Raised by argparse for ``-h`` or invalid arguments.
        Exception: Whatever the command raised, re-raised only when
            ``--verbose`` was passed so the full traceback is visible.
    """
    # ``--verbose`` is declared once and attached to the top-level parser and
    # to every sub-command, so it is accepted on either side of the command
    # name. Without this declaration argparse rejects the flag outright and
    # the re-raise branch below can never run. ``SUPPRESS`` means the
    # attribute only exists when the flag was given, so a sub-parser cannot
    # overwrite a value set before the command name.
    common = argparse.ArgumentParser(add_help=False)
    common.add_argument('--verbose', action='store_true',
                        default=argparse.SUPPRESS,
                        help='Re-raise errors to show the full traceback')

    parser = argparse.ArgumentParser(
        prog='subtitlekit',
        description='Subtitle processing toolkit: merge, sync, fix, and correct subtitles',
        parents=[common],
    )

    subparsers = parser.add_subparsers(dest='command', help='Commands')

    # Merge command
    merge_parser = subparsers.add_parser('merge', parents=[common],
                                         help='Merge and synchronize subtitle files')
    merge_parser.add_argument('--original', required=True, help='Original subtitle file (to translate)')
    merge_parser.add_argument('--helper', action='append', required=True,
                              help='Helper subtitle file (can be used multiple times)')
    merge_parser.add_argument('--output', required=True, help='Output JSON file')
    merge_parser.add_argument('--skip-sync', action='store_true',
                              help='Skip ffsubsync synchronization')
    merge_parser.set_defaults(func=cmd_merge)

    # Overlaps command
    overlaps_parser = subparsers.add_parser('overlaps', parents=[common],
                                            help='Fix timing overlaps and issues')
    overlaps_parser.add_argument('--input', required=True, help='Input subtitle file')
    overlaps_parser.add_argument('--reference', required=True, help='Reference subtitle file')
    overlaps_parser.add_argument('--output', required=True, help='Output subtitle file')
    overlaps_parser.add_argument('--window', type=int, default=5,
                                 help='Context window for matching (default: 5)')
    overlaps_parser.add_argument('--preprocess', action='store_true',
                                 help='Preprocess input file first')
    overlaps_parser.set_defaults(func=cmd_overlaps)

    # Corrections command
    corrections_parser = subparsers.add_parser('corrections', parents=[common],
                                               help='Apply corrections from JSON')
    corrections_parser.add_argument('--input', required=True, help='Input subtitle file')
    corrections_parser.add_argument('--corrections', required=True, help='Corrections JSON file')
    corrections_parser.add_argument('--output', required=True, help='Output subtitle file')
    corrections_parser.add_argument('--quiet', '-q', action='store_true',
                                    help='Quiet mode (minimal output)')
    corrections_parser.set_defaults(func=cmd_corrections)

    # Parse and execute
    args = parser.parse_args(argv)

    # Sub-commands are optional at the argparse level; show help instead of
    # failing on a missing ``func`` attribute.
    if not args.command:
        parser.print_help()
        return 1

    try:
        args.func(args)
        return 0
    except Exception as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        # Read the parsed flag rather than scanning sys.argv, so this also
        # works when ``argv`` was supplied explicitly.
        if getattr(args, 'verbose', False):
            raise
        return 1


if __name__ == '__main__':
    sys.exit(main())
@@ -0,0 +1,13 @@
1
+ """
2
+ SubtitleKit Core - Encoding, preprocessing, and cleanup utilities
3
+ """
4
+ from .encoding import *
5
+ from .preprocessor import *
6
+ from .cleaner import *
7
+
8
+ __all__ = [
9
+ 'detect_file_encoding',
10
+ 'read_srt_with_fallback',
11
+ 'preprocess_srt_file',
12
+ 'clean_subtitle_file',
13
+ ]
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Subtitle formatting cleanup utilities.
4
+
5
+ This module provides functions to clean up common subtitle formatting quirks:
6
+ - Remove hyphens at line breaks (when not dialogue markers)
7
+ - Merge duplicate italic tags into a single tag
8
+ """
9
+
10
+ import re
11
+ import pysrt
12
+ from pathlib import Path
13
+ import tempfile
14
+ from subtitlekit.core.encoding import read_srt_with_fallback
15
+ from io import StringIO
16
+
17
+
18
def is_dialogue_subtitle(text_lines):
    """
    Tell whether a subtitle looks like a two-speaker dialogue.

    Args:
        text_lines: List of text lines in the subtitle

    Returns:
        True when at least two lines, ignoring surrounding whitespace,
        begin with '-'; False otherwise
    """
    dashed_lines = [entry for entry in text_lines if entry.strip().startswith('-')]
    return len(dashed_lines) >= 2
30
+
31
+
32
def clean_hyphen_line_breaks(text_lines):
    """
    Join lines that were split with a continuation hyphen.

    When a line ends with '-' and the following line starts with '-', both
    hyphens (and the whitespace around them) are dropped and the two lines
    are concatenated into one. Dialogue subtitles and inputs with fewer
    than two lines are returned untouched.

    Args:
        text_lines: List of text lines

    Returns:
        The input list itself when nothing applies, otherwise a new list
        with the continuation pairs merged
    """
    if not text_lines or len(text_lines) < 2:
        return text_lines

    # Leading dashes in dialogue mark speakers, not continuations.
    if is_dialogue_subtitle(text_lines):
        return text_lines

    result = []
    consumed_next = False
    last_index = len(text_lines) - 1

    for index, line in enumerate(text_lines):
        if consumed_next:
            # This line was already folded into the previous one.
            consumed_next = False
            continue

        head = line.rstrip()
        is_split = (
            index < last_index
            and head.endswith('-')
            and text_lines[index + 1].lstrip().startswith('-')
        )
        if not is_split:
            result.append(line)
            continue

        tail = text_lines[index + 1].lstrip()
        # Drop the trailing '-' of the first half and the leading '-' of the
        # second half, then glue them together without a separator.
        result.append(head[:-1].rstrip() + tail[1:].lstrip())
        consumed_next = True

    return result
73
+
74
+
75
def merge_duplicate_italic_tags(text):
    """
    Collapse per-line italic tags into one tag around the whole subtitle.

    Example:
        '<i>Line 1</i>\\n<i>Line 2</i>' -> '<i>Line 1\\nLine 2</i>'

    The merge only happens for multi-line text in which every non-empty
    line (after stripping whitespace) is fully wrapped in '<i>...</i>'.
    Empty lines are dropped from the merged result.

    Args:
        text: Subtitle text with potential duplicate italic tags

    Returns:
        Text with merged italic tags, or the original text unchanged
    """
    wrapped_line = re.compile(r'^<i>(.+?)</i>$')
    pieces = [piece.strip() for piece in text.split('\n')]

    # A single line has nothing to merge.
    if len(pieces) <= 1:
        return text

    inner_parts = []
    for piece in pieces:
        if not piece:
            continue
        found = wrapped_line.match(piece)
        if found is None:
            # One line without its own tag pair disqualifies the subtitle.
            return text
        inner_parts.append(found.group(1))

    if not inner_parts:
        return text

    return '<i>' + '\n'.join(inner_parts) + '</i>'
114
+
115
+
116
def clean_subtitle_file(input_path):
    """
    Write a cleaned copy of an SRT file to a temporary file.

    Each subtitle has its continuation hyphens merged and its per-line
    italic tags collapsed. The temporary file is created with
    ``delete=False``, so it stays on disk until the caller removes it.

    Args:
        input_path: Path to input SRT file

    Returns:
        Path to cleaned temporary SRT file
    """
    # read_srt_with_fallback handles the encoding guesswork.
    subs = pysrt.SubRipFile.from_string(read_srt_with_fallback(input_path))

    for sub in subs:
        merged_lines = clean_hyphen_line_breaks(sub.text.split('\n'))
        sub.text = merge_duplicate_italic_tags('\n'.join(merged_lines))

    # Reserve a unique .srt path; the handle is closed right away because
    # pysrt writes to the path itself.
    with tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.srt',
        delete=False,
        encoding='utf-8',
    ) as placeholder:
        temp_path = placeholder.name

    subs.save(temp_path, encoding='utf-8')

    return temp_path
161
+
162
+
163
if __name__ == '__main__':
    # Manual check: clean the SRT file named by the first command-line
    # argument and report where the cleaned copy was written.
    import sys

    if sys.argv[1:]:
        input_file = sys.argv[1]
        output = clean_subtitle_file(input_file)
        print(f"Cleaned file saved to: {output}")
@@ -0,0 +1,100 @@
1
+ """
2
+ Utilities for robust encoding detection and handling.
3
+ """
4
+
5
+ import chardet
6
+ from typing import Tuple
7
+
8
+
9
def detect_file_encoding(file_path: str) -> Tuple[str, float]:
    """
    Guess a file's text encoding with chardet.

    The whole file is read as bytes and handed to ``chardet.detect``.

    Args:
        file_path: Path to file

    Returns:
        Tuple of (encoding, confidence) taken from chardet's result.
        NOTE(review): read_srt_with_fallback guards against a falsy
        encoding, so the first element is presumably None when chardet
        cannot decide - confirm against the chardet documentation.
    """
    with open(file_path, 'rb') as handle:
        payload = handle.read()

    guess = chardet.detect(payload)
    return guess['encoding'], guess['confidence']
27
+
28
+
29
def read_srt_with_fallback(file_path: str) -> str:
    """
    Read an SRT file as text, trying several encodings until one decodes.

    Encodings are tried in this order:
        1. UTF-8 with BOM (utf-8-sig)
        2. UTF-8 without BOM
        3. The auto-detected encoding, only when detection confidence is
           above 0.7 and the name is not already in the list
        4. windows-1252, iso-8859-1, cp1253, latin-1

    NOTE(review): 'iso-8859-1' is an alias of 'latin-1', which decodes any
    byte sequence, so the two entries after it look unreachable - confirm
    whether cp1253 was meant to be tried earlier.

    Args:
        file_path: Path to SRT file

    Returns:
        File content as string

    Raises:
        OSError: Errors from opening the file are not caught here.
        Exception: The last decode error, if every candidate failed.
    """
    candidates = ['utf-8-sig', 'utf-8']

    # Detection is best-effort: any failure just means no extra candidate.
    try:
        guessed_name, score = detect_file_encoding(file_path)
        if guessed_name and score > 0.7:
            already_listed = {name.lower() for name in candidates}
            if guessed_name.lower() not in already_listed:
                candidates.append(guessed_name)
    except Exception:
        pass

    candidates += [
        'windows-1252',  # Common Windows encoding
        'iso-8859-1',    # Latin-1
        'cp1253',        # Greek Windows
        'latin-1',       # Always succeeds
    ]

    failure = None
    for name in candidates:
        try:
            with open(file_path, 'r', encoding=name) as handle:
                return handle.read()
        except (UnicodeDecodeError, LookupError) as exc:
            # Wrong guess or unknown codec name: remember it and move on.
            failure = exc

    # Not expected in practice, since the Latin-1 candidates accept any bytes.
    raise failure or Exception(f"Failed to read {file_path}")
80
+
81
+
82
if __name__ == "__main__":
    # Manual diagnostic: print the detected encoding of one file, then check
    # that the fallback reader can decode it.
    import sys

    if len(sys.argv) < 2:
        # The package imports this module as subtitlekit.core.encoding, so
        # the usage line names that module rather than a stale file name.
        print("Usage: python -m subtitlekit.core.encoding <file.srt>")
        sys.exit(1)

    file_path = sys.argv[1]

    # Detect encoding
    encoding, confidence = detect_file_encoding(file_path)
    print(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")

    # Read with fallback
    try:
        content = read_srt_with_fallback(file_path)
    except Exception as e:
        # Report on stderr and exit non-zero so a failed read is visible to
        # the calling shell.
        print(f"Failed to read: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"Successfully read {len(content)} characters")