opentrans 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opentrans/__init__.py +6 -0
- opentrans/cache_manager.py +68 -0
- opentrans/hasher.py +27 -0
- opentrans/main.py +98 -0
- opentrans/settings.py +64 -0
- opentrans/translator.py +191 -0
- opentrans-0.1.0.dist-info/METADATA +182 -0
- opentrans-0.1.0.dist-info/RECORD +11 -0
- opentrans-0.1.0.dist-info/WHEEL +4 -0
- opentrans-0.1.0.dist-info/entry_points.txt +2 -0
- opentrans-0.1.0.dist-info/licenses/LICENSE.md +21 -0
opentrans/cache_manager.py
ADDED
@@ -0,0 +1,68 @@
import json
import os
from pathlib import Path
from .hasher import get_file_hash
from typing import Union
from .settings import settings

class CacheManager:
    """
    Tracks file content hashes so unchanged files can be skipped on later runs.
    """
    def __init__(self, cache_file: Path):
        self.cache_file = cache_file
        self.cache = self._load_cache()

    def __del__(self):
        self.save()

    def _load_cache(self) -> dict:
        """
        Load the cache from disk, creating an empty cache file if needed.

        :return: file_path and hash in dict format from the cache file
        :rtype: dict
        """
        if self.cache_file is None:
            return {}

        if not self.cache_file.exists():
            self.cache_file.parent.mkdir(parents=True, exist_ok=True)
            with open(self.cache_file, 'w') as f:
                json.dump({}, f, indent=2)

        try:
            with open(self.cache_file, "r") as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError):
            return {}
        return {}

    def file_changed(self, file_path: Union[Path, str]) -> bool:
        """
        Check whether the file's hash has changed since it was last cached.

        Args:
            file_path (Path): file to check

        Returns:
            bool: True if changed else False
        """
        return self.cache.get(str(file_path)) != get_file_hash(file_path, settings.hash_algo)

    def __contains__(self, file_path: Union[Path, str]):
        return str(file_path) in self.cache

    def update(self, file_path: Path):
        """
        Update the file's hash in the cache, adding an entry if one does not exist.

        Args:
            file_path (Path): File to update
        """
        self.cache[str(file_path)] = get_file_hash(file_path, settings.hash_algo)

    def save(self):
        with open(self.cache_file, "w") as f:
            json.dump(self.cache, f, indent=2)
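
As an illustration of how the class above fits together, here is a minimal usage sketch. It assumes the package is installed as `opentrans` and that the paths shown (a cache file under `./translated/` and a source file `./docs/intro.md`) exist; both paths are hypothetical.

```python
from pathlib import Path
from opentrans.cache_manager import CacheManager

# Hypothetical paths for illustration only.
cache = CacheManager(Path("./translated/.chinese_cache.json"))
source = Path("./docs/intro.md")

if cache.file_changed(source):   # hash differs from the cached value, or the file is new
    print(f"{source} needs re-translation")
    cache.update(source)         # record the file's current content hash
cache.save()                     # persist the cache explicitly (also attempted in __del__)
```
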
opentrans/hasher.py
ADDED
@@ -0,0 +1,27 @@
import hashlib
from pathlib import Path
from typing import Union


def get_file_hash(file_path: Union[str, Path], algorithm="sha256") -> str:
    """
    Generate a hash (default sha256) of a file's content.

    Args:
        file_path (str | Path): File to hash.

    Returns:
        str: hash of the file
    """
    try:
        with open(file_path, "rb") as f:
            if hasattr(hashlib, "file_digest"):
                digest = hashlib.file_digest(f, algorithm)
            else:
                digest = hashlib.new(algorithm)
                for chunk in iter(lambda: f.read(8192), b""):
                    digest.update(chunk)

            return digest.hexdigest()
    except FileNotFoundError:
        raise FileNotFoundError(f"Error: File not found at {file_path}")
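
A quick usage sketch of the helper above; the file name is hypothetical, and `"sha1"` matches the default `hash_algo` in `settings.py`.

```python
from opentrans.hasher import get_file_hash

# Hypothetical file name for illustration only.
print(get_file_hash("README.md"))          # sha256 hex digest (default)
print(get_file_hash("README.md", "sha1"))  # the algorithm the cache uses by default
```
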
opentrans/main.py
ADDED
@@ -0,0 +1,98 @@
import click
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

from .settings import settings
from .translator import Translator


@click.command()
@click.argument(
    'input_dir',
    type=click.Path(exists=True, path_type=Path)
)
@click.argument(
    'output_dir',
    type=click.Path(path_type=Path)
)
@click.argument(
    'config',
    type=click.Path(path_type=Path)
)
@click.option('--model', '-m', default='translategemma:4b', help='Ollama model name.')
@click.option('--lang', '-l', default='Chinese', help='Target translation language.')
@click.option(
    '--input_dir', '-i',
    type=click.Path(exists=True, path_type=Path),
    help='Directory containing the documents to translate.'
)
@click.option(
    '--output_dir', '-o',
    type=click.Path(path_type=Path),
    help='Directory where translated files will be saved.'
)
@click.option(
    '--config', '-c',
    type=click.Path(path_type=Path),
    help='Custom path to a config.yaml settings file.'
)
def main(input_dir, output_dir, config, model, lang):
    """
    OpenTrans: Batch translate documentation using local Ollama LLMs.

    A specialized tool for translating Markdown and LaTeX files while preserving
    input directory structures and protecting technical syntax (code, math, and links).

    Key Features:
    - Local-First: Powered by Ollama for private, cost-free translation.
    - Syntax Shield: Automatically protects backticks, LaTeX, and URLs from translation.
    - Parallel Processing: High-speed batch handling for large documentation sets.
    - Structure Preservation: Mirror your source directory exactly in the output.
    """
    if config:
        settings.model_config['yaml_file'] = config

    if input_dir:
        settings.input_dir = input_dir.resolve()

    if output_dir:
        settings.output_dir = output_dir.resolve()

    if model:
        settings.ollama_model = model

    if lang:
        settings.target_lang = lang

    if not settings.input_dir.exists():
        click.echo(
            f"Error: Input directory {settings.input_dir} does not exist.", err=True)
        return

    all_files = [f for f in settings.input_dir.rglob('*') if f.is_file()]

    click.echo(f"Target Language: {settings.target_lang}")
    click.echo(f"Input: {settings.input_dir}")
    click.echo(f"Output: {settings.output_dir}")
    click.echo(f"Using Model: {settings.ollama_model}")

    translator = Translator(output_dir / settings.cache_filename)
    with ThreadPoolExecutor(max_workers=settings.max_parallel_files) as executor:
        list(tqdm(
            executor.map(
                lambda f: translator.process_file(
                    f, settings.input_dir, settings.output_dir),
                all_files
            ),
            total=len(all_files),
            desc="Translating Files",
            unit="file"
        ))

    click.echo("\nComplete.")


if __name__ == '__main__':
    main()
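
For reference, a rough programmatic equivalent of what `main()` does, without the Click CLI, option handling, thread pool, or progress bar; the directory paths are hypothetical.

```python
from pathlib import Path
from opentrans.settings import settings
from opentrans.translator import Translator

# Hypothetical directories for illustration only.
input_dir = Path("./docs").resolve()
output_dir = Path("./translated_docs").resolve()

translator = Translator(output_dir / settings.cache_filename)
for file in (f for f in input_dir.rglob("*") if f.is_file()):
    translator.process_file(file, input_dir, output_dir)
```
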
opentrans/settings.py
ADDED
@@ -0,0 +1,64 @@
import os
from pathlib import Path
from typing import Set, Tuple, Type

from pydantic import computed_field, field_validator, Field
from pydantic_settings import (
    BaseSettings,
    PydanticBaseSettingsSource,
    SettingsConfigDict,
    YamlConfigSettingsSource,
)

class Settings(BaseSettings):
    target_lang: str = "Chinese"
    ollama_model: str = "translategemma:4b"
    temperature: float = Field(default=1.0, ge=0.0, le=2.0)
    max_parallel_files: int = Field(default=2, gt=0)

    input_dir: Path = Path("./docs").resolve()
    output_dir: Path = Path("./translated").resolve()
    hash_algo: str = "sha1"
    translate_file_types: Set[str] = {".md", ".mdx", ".txt"}

    model_config = SettingsConfigDict(
        yaml_file="config.yaml",
        extra="ignore",
    )

    @computed_field
    @property
    def cache_filename(self) -> str:
        return f".{self.target_lang.lower()}_cache.json"

    @property
    def cache_path(self) -> Path:
        return self.output_dir / self.cache_filename

    @field_validator("input_dir")
    @classmethod
    def check_input_directory(cls, v: Path) -> Path:
        if not v.exists():
            print(f"Warning: Input directory {v} does not exist.")
        return v

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: Type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> Tuple[PydanticBaseSettingsSource, ...]:
        """Defines the priority: 1. Init, 2. Env Vars, 3. YAML File, 4. Defaults"""
        return (
            init_settings,
            env_settings,
            YamlConfigSettingsSource(settings_cls),
        )

try:
    settings = Settings()
except Exception as e:
    print(f"Configuration Error: {e}")
    raise
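
A small sketch of how the source priority defined in `settings_customise_sources` plays out in practice; the values shown are hypothetical.

```python
from opentrans.settings import Settings

# 1. Init arguments win over everything else.
s = Settings(target_lang="French")
print(s.target_lang)  # -> French, even if config.yaml or the environment says otherwise

# 2. Environment variables (matched by field name, case-insensitively) beat config.yaml,
#    e.g. TARGET_LANG=Japanese.
# 3. config.yaml (the yaml_file named in model_config) fills in the rest.
# 4. Field defaults apply when nothing else sets a value.
```
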
opentrans/translator.py
ADDED
@@ -0,0 +1,191 @@
import sys
from tqdm import tqdm
import ollama
from pathlib import Path
from shutil import copy2
import threading
from .settings import settings
from .cache_manager import CacheManager
import logging
import re
logger = logging.getLogger(__name__)


class Translator:
    def __init__(self, cache_path: Path):
        self.cache = CacheManager(cache_path)
        self.lock = threading.Lock()

    def protect_syntax(self, text: str, protect_callback: callable):
        """
        Scans text for Markdown/Docusaurus syntax and replaces it with placeholders.

        This function identifies technical and structural syntax, such as code blocks,
        LaTeX math, Docusaurus admonitions, and JSX components, and applies a
        protection callback to prevent them from being modified during translation.
        It ensures that technical syntax remains valid while allowing the
        surrounding prose to be localized.

        Args:
            text (str): The raw Markdown or MDX content to be processed.
            protect_callback (callable): A function or method that takes the
                matched string and returns a placeholder string (e.g., `[PROTECTED_0]`).

        Returns:
            str: The processed text with all identified technical syntax replaced
                by placeholders.
        """

        # TODO: Replace this with tree-sitter or similar to redact code blocks depending on the file extension
        # List of patterns to protect (Order matters! Larger blocks first)
        patterns = [
            r'(?s:^---[\s\S]+?---)',  # Frontmatter
            r'(?s:```[\s\S]*?```)',  # Code Blocks
            r'(?s:<CodeBlock[^>]*>[\s\S]*?</CodeBlock>)',  # Docusaurus CodeBlock
            r'(?m:^:::[a-zA-Z]+(?:\[.*?\])?\s*$)',  # Admonition Start
            r'(?m:^:::\s*$)',  # Admonition End
            r'(?s:<[a-zA-Z][\w:.-]*(?:\s+[a-zA-Z0-9:-]+=(?:"[^"]*"|\'[^\']*\'|\{[^}]*\}|[^ >]+))*\s*/?>)',  # Opening Tags (All HTML/JSX)
            r'(?m:</[a-zA-Z][\w:.-]*\s*>)',  # Closing Tags (All HTML/JSX)
            r'(?s:\$\$[\s\S]*?\$\$)',  # Display Math
            r'(?<!\$)\$[^\$\n]+\$(?!\$)',  # Inline Math
            r'`[^`\n]+`',  # Inline Code
            r'(?<=\]\()([^\)\s]+)(?=\))',  # Link URLs
            r'(?m:^import\s+.*?;$)'  # MDX Imports
        ]
        combined_pattern = '|'.join(f'(?:{p})' for p in patterns)
        return re.sub(combined_pattern, lambda m: protect_callback(m.group(0)), text)

    def translate_text(self, text: str, target_lang: str = settings.target_lang, model: str = settings.ollama_model) -> str:
        """
        Sends text to the Ollama API for translation using a specified model,
        protecting technical syntax with temporary placeholders.

        Args:
            text (str): The raw content of the file to be translated.
            target_lang (str): The language to translate the text into (e.g., 'Chinese').
            model (str): The name of the Ollama model to use for translation.

        Returns:
            str: The translated text content with original technical syntax preserved.
        """
        placeholders = {}

        def protect(text_content):
            placeholder = f"[[DOC_REF_{len(placeholders)}]]"
            placeholders[placeholder] = text_content  # No need for .group(0)
            return placeholder

        protected_text = self.protect_syntax(text, protect)

        system_prompt = (
            f"You are an expert technical translator. Translate the document into {target_lang}.\n"
            "CRITICAL: Do not modify or translate tokens like [[DOC_REF_N]].\n"
            "Preserve all structural symbols. Output ONLY the translated text."
        )

        user_prompt = f"TEXT TO TRANSLATE:\n\n{protected_text}"

        try:
            ollama.chat(model)  # probe the model; a missing model raises ResponseError with status 404
        except ollama.ResponseError as e:
            if e.status_code == 404:
                self.pull_model_with_progress(model)

        messages = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': user_prompt}
        ]

        try:
            response = ollama.chat(
                model=model,
                messages=messages,
                options={"temperature": settings.temperature}
            )
            translated = response['message']['content'].strip()

            for placeholder, original_value in placeholders.items():
                translated = translated.replace(placeholder, original_value)

            return translated

        except Exception as e:
            logger.error("Translation Failed.")
            logger.info(e)
            return text

    def process_file(self, file_path: Path, input_root: Path, output_root: Path):
        """
        Handles the logic for a single file: translates it if it matches allowed
        extensions, otherwise copies it directly to the output directory.

        Args:
            file_path (Path): The path to the source file.
            input_root (Path): The root directory of the source files for relative path calculation.
            output_root (Path): The root directory where translated/copied files will be saved.
        """
        new_file_path = output_root / file_path.relative_to(input_root)

        if not self.cache.file_changed(file_path):
            return

        new_file_path.parent.mkdir(parents=True, exist_ok=True)

        if file_path.suffix in settings.translate_file_types:
            content = file_path.read_text(encoding='utf-8')
            translated = self.translate_text(
                content, settings.target_lang, settings.ollama_model)
            new_file_path.write_text(translated, encoding='utf-8')
        else:
            copy2(file_path, new_file_path)

        self.cache.update(file_path)

    def pull_model_with_progress(self, model_name: str):
        """
        Downloads a model from the Ollama registry while displaying a real-time
        progress bar using tqdm for each layer.

        Args:
            model_name (str): The name of the model to pull (e.g., 'translategemma:4b').

        Raises:
            SystemExit: Exits the program with code 1 if the download fails.
        """
        print(f"Attempting to pull model: {model_name}")
        logger.info(f"Attempting to pull model: {model_name}")
        current_digest = ''
        progress_bars = {}

        try:
            for progress in ollama.pull(model_name, stream=True):
                digest = progress.get('digest', '')

                if digest != current_digest and current_digest in progress_bars:
                    progress_bars[current_digest].close()

                if not digest:
                    print(progress.get('status'))
                    continue

                if digest not in progress_bars and (total := progress.get('total')):
                    progress_bars[digest] = tqdm(
                        total=total,
                        desc=f'Downloading {digest[7:19]}',
                        unit='B',
                        unit_scale=True
                    )

                if completed := progress.get('completed'):
                    progress_bars[digest].update(
                        completed - progress_bars[digest].n)

                current_digest = digest

            for bar in progress_bars.values():
                bar.close()
            print(f"Model {model_name} pull complete.")

        except Exception as e:
            print(f"An error occurred while pulling model: {e}")
            sys.exit(1)
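
To make the placeholder mechanism above concrete, here is a sketch of the protect/restore round-trip that `translate_text` performs, shown without calling Ollama; the sample text and temporary cache path are hypothetical.

```python
import tempfile
from pathlib import Path
from opentrans.translator import Translator

# A throwaway cache file so CacheManager has somewhere to write.
translator = Translator(Path(tempfile.mkdtemp()) / "cache.json")

placeholders = {}

def protect(matched: str) -> str:
    # Same idea as the nested protect() inside translate_text.
    token = f"[[DOC_REF_{len(placeholders)}]]"
    placeholders[token] = matched
    return token

text = "Run `pip install opentrans` and see [docs](https://example.com)."
shielded = translator.protect_syntax(text, protect)
print(shielded)  # e.g. "Run [[DOC_REF_0]] and see [docs]([[DOC_REF_1]])."

# After the LLM returns, translate_text swaps the originals back in:
for token, original in placeholders.items():
    shielded = shielded.replace(token, original)
assert shielded == text
```
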
opentrans-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,182 @@
Metadata-Version: 2.4
Name: opentrans
Version: 0.1.0
Summary: A directory-preserving AI translator using Ollama.
Project-URL: Homepage, https://github.com/RainStorm108/opentrans
Project-URL: Documentation, https://rainstorm108.github.io/opentrans/
Project-URL: Issues, https://github.com/RainStorm108/opentrans/issues
Author: Hydrangea
License: MIT
License-File: LICENSE.md
Keywords: ai,automation,llm,ollama,translation
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Topic :: Text Processing :: Linguistic
Requires-Python: >=3.9
Requires-Dist: click
Requires-Dist: ollama>=0.1.0
Requires-Dist: pydantic
Requires-Dist: pydantic-settings
Requires-Dist: pyyaml
Requires-Dist: tqdm>=4.66.0
Description-Content-Type: text/markdown

# OpenTrans

**OpenTrans** is a privacy-first, local-LLM batch translation utility using Ollama. It is designed to mirror a source directory into a target language while preserving the exact file hierarchy.

## Key Features

* **Local-First (Ollama):** Private, cost-free translation using models like Gemma, Llama 3, or DeepSeek.
* **Syntax Shielding:** Automatically protects code blocks (```), inline code (`), LaTeX math ($), and Markdown links from being corrupted by the LLM.
* **Directory Mirroring:** Recursively replicates your source folder structure in the output destination.
* **Parallel Processing:** Uses `ThreadPoolExecutor` for high-speed batch handling of large file sets.
* **Smart Caching:** Uses hashing to track file changes. Only files that have been modified since the last run are sent to the LLM, saving significant time and compute resources.
* **Resilient Model Handling:** Automatically pulls required models from Ollama with a real-time progress bar if they are missing.

## Quick Usage

Provide the input directory and the target output directory:

```bash
opentrans ./docs ./translated_docs --config ./config.yaml
```

**Example Output:**

```text
Target Language: Chinese
Input: /home/user/Projects/OpenTrans/docs
Output: /home/user/Projects/OpenTrans/translated_docs
Using Model: translategemma:4b

Translating Files: 100%|████████████████████████████████| 12/12 [00:45<00:00, 3.7s/file]

Complete.
```
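
The file passed via `--config` is plain YAML. A sketch of a `config.yaml` is shown below; the keys mirror the fields defined in `opentrans/settings.py`, every key is optional, and the values here are illustrative.

```yaml
target_lang: Chinese
ollama_model: translategemma:4b
temperature: 1.0
max_parallel_files: 2
input_dir: ./docs
output_dir: ./translated_docs
hash_algo: sha1
translate_file_types: [".md", ".mdx", ".txt"]
```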

## Running Examples

```bash
# Translate a Docusaurus docs tree
opentrans ./Example/Docusaurus/docs ./Example/Docusaurus/i18n/zh-hans/docusaurus-plugin-content-blog/current ./config.yaml
```

## Installation

### Setup

OpenTrans requires Ollama to be installed and running on your local machine.

1. Install Ollama: follow the instructions at [Ollama](https://ollama.com/download)

2. Install [uv](https://docs.astral.sh/uv/getting-started/installation/)

```shell
curl -LsSf https://astral.sh/uv/install.sh | sh
```

3. Install OpenTrans

```bash
git clone https://github.com/rainstorm108/OpenTrans.git
cd OpenTrans
uv pip install .   # or install it as a uv tool: uv tool install .
```

4. Run

```bash
opentrans ./docs ./translated_docs --config ./config.yaml
```

### For Developers

1. Environment Setup

```bash
uv sync
source .venv/bin/activate
hatch shell
uv pip install -e .
```

2. Running Tests

```bash
hatch test
```

## Folder Structure

```text
OpenTrans/
├── src/
│   └── opentrans/
│       ├── __init__.py
│       ├── main.py             # CLI interface using Click
│       ├── settings.py         # Translation settings
│       ├── translator.py       # Translation logic
│       ├── cache_manager.py    # Translation caching via content hashing to skip unchanged files
│       └── hasher.py           # File hashing for the cache
├── tests/
│   ├── test_cache_manager.py
│   ├── test_hasher.py
│   └── test_translator.py
├── config.yaml                 # Global settings (model, language, etc.)
├── pyproject.toml
└── README.md
```

## Workflow

```mermaid
graph TD
    %% Setup Phase
    Start((Start)) --> CheckRunning{Ollama Running?}
    CheckRunning -- No --> Err[Error: Start Ollama]
    CheckRunning -- Yes --> CheckModel{Model Exists?}

    CheckModel -- No --> Pull[Download Model with Progress Bar]
    Pull --> Init
    CheckModel -- Yes --> Init[Load Paths & Config]

    %% The Loop
    Init --> NextFile{Next File?}
    NextFile -- No --> Exit([Exit])
    NextFile -- Yes --> IsTranslatable{File Type Supported?}

    %% Branching Logic
    IsTranslatable -- No --> Mirror[Copy Original File]
    IsTranslatable -- Yes --> Shield[Apply Syntax Placeholders]

    Shield --> Trans[Translate via Ollama API]
    Trans --> Restore[Restore Protected Code/Math]
    Restore --> Save[Create Dir & Save File]
    Save --> NextFile
```

## Todo

* [x] Click-based CLI interface
* [x] Placeholder-based syntax protection (Code/LaTeX)
* [x] Multi-threaded parallel processing
* [x] Implement hash caching to skip unchanged files
* [ ] Finish the Docusaurus translation script
* [ ] Use Tree-sitter instead of regex to protect code blocks before translation
* [ ] Add support for more file types
* [ ] ...

## License

This project is licensed under the MIT License.
opentrans-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
opentrans/__init__.py,sha256=yxYZKd_7mI-ZBUbBxEDxHC2BlENTzhPafm6xX6luUAg,126
opentrans/cache_manager.py,sha256=_1ENDkZe6HJWDVw0A29l7gPIYf3KKrsALm-Ikd9xnHc,1661
opentrans/hasher.py,sha256=-XXNYI6Gq2w9vlsHq7jJCdaJgJ9rIchszg6urwA1Fao,711
opentrans/main.py,sha256=-NYjsG2FubPpCOz0vrID3ypyoeCLfMP23-eMlegk_1U,2912
opentrans/settings.py,sha256=GNSPG0YUNHJ5R2RqLzV_ayxNQGZSjMYszvh6yqQRfO0,1891
opentrans/translator.py,sha256=w2GGR_FBNhRub06vP9_IShqBrAZTB-Ej1wOCbicPouI,7883
opentrans-0.1.0.dist-info/METADATA,sha256=H__fK87tJQdtGGKSXPTYvJxm-y7bI9PbfVnKpns00zA,5461
opentrans-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
opentrans-0.1.0.dist-info/entry_points.txt,sha256=u_CT_SOehQGyiqNMN2AiFDGMLzZ4g-o_ayIH6WWJcgA,50
opentrans-0.1.0.dist-info/licenses/LICENSE.md,sha256=Gmw7jhwvehQCdjZXF9KkTTKdCnlci3uEUvGsVh_fl4Q,1069
opentrans-0.1.0.dist-info/RECORD,,
opentrans-0.1.0.dist-info/licenses/LICENSE.md
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 rainstorm108

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.