smart-translate 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smart_translate-0.1.0/PKG-INFO +10 -0
- smart_translate-0.1.0/README.md +0 -0
- smart_translate-0.1.0/pyproject.toml +25 -0
- smart_translate-0.1.0/setup.cfg +4 -0
- smart_translate-0.1.0/smart_translate/__init__.py +3 -0
- smart_translate-0.1.0/smart_translate/config.py +14 -0
- smart_translate-0.1.0/smart_translate/protector.py +14 -0
- smart_translate-0.1.0/smart_translate/rules/custom_rules.json +4 -0
- smart_translate-0.1.0/smart_translate/rules.py +12 -0
- smart_translate-0.1.0/smart_translate/rules_loader.py +10 -0
- smart_translate-0.1.0/smart_translate/translator.py +47 -0
- smart_translate-0.1.0/smart_translate.egg-info/PKG-INFO +10 -0
- smart_translate-0.1.0/smart_translate.egg-info/SOURCES.txt +14 -0
- smart_translate-0.1.0/smart_translate.egg-info/dependency_links.txt +1 -0
- smart_translate-0.1.0/smart_translate.egg-info/requires.txt +2 -0
- smart_translate-0.1.0/smart_translate.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: smart-translate
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Rule-based and selective smart translation library
|
|
5
|
+
Author-email: Onkar Nanavare <onkarnanavare007@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: transformers
|
|
10
|
+
Requires-Dist: torch
|
|
File without changes
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "smart-translate"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Rule-based and selective smart translation library"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Onkar Nanavare", email = "onkarnanavare007@gmail.com" }
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
dependencies = [
|
|
17
|
+
"transformers",
|
|
18
|
+
"torch"
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[tool.setuptools]
|
|
22
|
+
packages = ["smart_translate", "smart_translate.rules"]
|
|
23
|
+
|
|
24
|
+
[tool.setuptools.package-data]
|
|
25
|
+
"smart_translate.rules" = ["*.json"]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# # smart_translate/config.py
|
|
2
|
+
|
|
3
|
+
# # Default model used for translation
|
|
4
|
+
# MODEL_NAME = "facebook/nllb-200-distilled-600M"
|
|
5
|
+
|
|
6
|
+
# # Optional: default languages (used only if API doesn't send them)
|
|
7
|
+
# DEFAULT_SRC_LANG = "tha_Thai"
|
|
8
|
+
# DEFAULT_TGT_LANG = "eng_Latn"
|
|
9
|
+
|
|
10
|
+
# # Max token length for generation
|
|
11
|
+
# MAX_LENGTH = 256
|
|
12
|
+
|
|
13
|
+
# Model identifier passed to `from_pretrained` when the translator loads its
# tokenizer and model.
MODEL_NAME = "facebook/nllb-200-distilled-600M"
|
|
14
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
def protect_words(text, keep_words):
    """Replace whole-word occurrences of each keep-word with a placeholder.

    Args:
        text: The input string to scan.
        keep_words: Iterable of literal strings that must be shielded.
            Empty entries are ignored.

    Returns:
        A ``(protected_text, mapping)`` tuple, where ``mapping`` maps each
        placeholder (``__KEEP_<index>__``) back to the original word. The
        index is the word's position in ``keep_words``.
    """
    mapping = {}
    for i, word in enumerate(keep_words):
        # An empty word would compile to a pattern that matches at every
        # word boundary and scatter placeholders through the text.
        if not word:
            continue
        placeholder = f"__KEEP_{i}__"
        mapping[placeholder] = word
        # Lookarounds instead of ``\b``: ``\b`` needs a word character on one
        # side, so it never matches after a word such as "C++" that ends in
        # punctuation. "Not adjacent to a word character" works for both.
        pattern = rf"(?<!\w){re.escape(word)}(?!\w)"
        text = re.sub(pattern, placeholder, text)
    return text, mapping
|
|
10
|
+
|
|
11
|
+
def restore_words(text, mapping):
    """Substitute every placeholder in ``text`` with its original word.

    Args:
        text: String that may contain placeholder tokens.
        mapping: Dict of ``placeholder -> original word``, as returned by
            ``protect_words``. Empty keys are ignored.

    Returns:
        ``text`` with each placeholder replaced by its mapped word.
    """
    placeholders = [key for key in mapping if key]
    if not placeholders:
        return text
    # One pass over the text: a restored word that happens to look like
    # another placeholder is not substituted a second time, which chained
    # ``str.replace`` calls would do. Longest keys first so a key that is a
    # prefix of another cannot shadow it.
    placeholders.sort(key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in placeholders))
    return pattern.sub(lambda match: mapping[match.group(0)], text)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from importlib import resources
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
def load_custom_rules():
    """Load the custom replacement rules bundled with the package.

    Returns:
        The parsed JSON content of ``rules/custom_rules.json``.

    Raises:
        FileNotFoundError: If the data file cannot be located.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Anchor on the top-level package and walk down to the data file.
    # NOTE(review): "smart_translate.rules" is also the dotted name of the
    # sibling ``rules.py`` module, so using it as the resource anchor does not
    # reliably point at the ``rules/`` data directory -- confirm the layout.
    parts = ("rules", "custom_rules.json")

    if hasattr(resources, "files"):
        resource = resources.files("smart_translate")
        for part in parts:
            resource = resource.joinpath(part)
        # Explicit encoding: JSON is UTF-8, while the platform default
        # encoding may not be.
        with resource.open("r", encoding="utf-8") as f:
            return json.load(f)

    # ``importlib.resources.files`` is unavailable on Python 3.8, which the
    # package metadata still declares as supported.
    import pkgutil

    raw = pkgutil.get_data("smart_translate", "/".join(parts))
    if raw is None:
        raise FileNotFoundError("smart_translate/rules/custom_rules.json")
    return json.loads(raw.decode("utf-8"))
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def apply_custom_rules(text, rules=None):
    """Apply literal find-and-replace rules to ``text``.

    Args:
        text: The string to post-process.
        rules: Mapping of ``source substring -> replacement``. When omitted,
            the bundled rules are loaded via ``load_custom_rules()``.

    Returns:
        ``text`` after every rule has been applied in the mapping's
        iteration order.
    """
    # The default is resolved at call time: the previous default referred to
    # a module-level name that is not defined, which made the module fail at
    # import.
    if rules is None:
        rules = load_custom_rules()
    for src, tgt in rules.items():
        # ``str.replace("", tgt)`` would insert ``tgt`` between every
        # character, so an empty source key is skipped.
        if not src:
            continue
        text = text.replace(src, tgt)
    return text
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
|
3
|
+
from .protector import protect_words, restore_words
|
|
4
|
+
from .rules import apply_custom_rules
|
|
5
|
+
from .config import MODEL_NAME
|
|
6
|
+
|
|
7
|
+
class SmartTranslator:
    """Translate text with a seq2seq model while shielding selected words.

    The tokenizer and model named by ``MODEL_NAME`` are loaded once in the
    constructor and reused for every ``translate`` call.
    """

    def __init__(self, src_lang, tgt_lang):
        """Load the tokenizer and model and move the model to the device.

        Args:
            src_lang: Source language code assigned to the tokenizer's
                ``src_lang`` (presumably an NLLB code such as ``tha_Thai``
                -- confirm against the model card).
            tgt_lang: Target language token whose id is forced as the first
                generated token (presumably e.g. ``eng_Latn``).
        """
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Use the GPU when torch reports one, otherwise the CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print("Using device:", self.device)

        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

        self.model.to(self.device)
        self.model.eval()

    def translate(self, text, keep_words=None, max_length=256):
        """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

        Args:
            text: The string to translate. Empty or whitespace-only input is
                returned unchanged without invoking the model.
            keep_words: Optional iterable of words that are replaced with
                placeholders before translation and restored afterwards.
            max_length: ``max_length`` forwarded to ``model.generate``.

        Returns:
            The decoded translation with protected words restored and the
            custom replacement rules applied.

        Raises:
            ValueError: If ``tgt_lang`` is not a token the tokenizer knows.
        """
        # Nothing to translate: skip the model call for blank input.
        if not text or not text.strip():
            return text

        keep_words = keep_words or []
        protected_text, mapping = protect_words(text, keep_words)

        # Resolve the target language first so a bad code fails before any
        # tokenization or generation work is done.
        tgt_lang_id = self.tokenizer.convert_tokens_to_ids(self.tgt_lang)
        unk_id = getattr(self.tokenizer, "unk_token_id", None)
        if tgt_lang_id is None or tgt_lang_id == unk_id:
            raise ValueError(f"Unknown target language code: {self.tgt_lang!r}")

        self.tokenizer.src_lang = self.src_lang
        inputs = self.tokenizer(protected_text, return_tensors="pt")

        # Input tensors must live on the same device as the model.
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        outputs = self.model.generate(
            **inputs,
            forced_bos_token_id=tgt_lang_id,
            max_length=max_length,
        )

        translated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated = restore_words(translated, mapping)
        translated = apply_custom_rules(translated)

        return translated
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: smart-translate
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Rule-based and selective smart translation library
|
|
5
|
+
Author-email: Onkar Nanavare <onkarnanavare007@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: transformers
|
|
10
|
+
Requires-Dist: torch
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
smart_translate/__init__.py
|
|
4
|
+
smart_translate/config.py
|
|
5
|
+
smart_translate/protector.py
|
|
6
|
+
smart_translate/rules.py
|
|
7
|
+
smart_translate/rules_loader.py
|
|
8
|
+
smart_translate/translator.py
|
|
9
|
+
smart_translate.egg-info/PKG-INFO
|
|
10
|
+
smart_translate.egg-info/SOURCES.txt
|
|
11
|
+
smart_translate.egg-info/dependency_links.txt
|
|
12
|
+
smart_translate.egg-info/requires.txt
|
|
13
|
+
smart_translate.egg-info/top_level.txt
|
|
14
|
+
smart_translate/rules/custom_rules.json
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
smart_translate
|