smart-translate 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.1
2
+ Name: smart-translate
3
+ Version: 0.1.0
4
+ Summary: Rule-based and selective smart translation library
5
+ Author-email: Onkar Nanavare <onkarnanavare007@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: transformers
10
+ Requires-Dist: torch
File without changes
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "smart-translate"
7
+ version = "0.1.0"
8
+ description = "Rule-based and selective smart translation library"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Onkar Nanavare", email = "onkarnanavare007@gmail.com" }
14
+ ]
15
+
16
+ dependencies = [
17
+ "transformers",
18
+ "torch"
19
+ ]
20
+
21
+ [tool.setuptools]
22
+ packages = ["smart_translate", "smart_translate.rules"]
23
+
24
+ [tool.setuptools.package-data]
25
+ "smart_translate.rules" = ["*.json"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
# Package entry point: expose the translator class at top level so users
# can write `from smart_translate import SmartTranslator`.
from .translator import SmartTranslator

__all__ = ["SmartTranslator"]
@@ -0,0 +1,14 @@
1
# smart_translate/config.py
#
# Central configuration for the library. (A previous draft kept defaults
# for source/target languages and max generation length here as dead,
# commented-out code; it duplicated the live constant below and has been
# removed.)

# Hugging Face model id used for translation (NLLB-200, distilled, 600M).
MODEL_NAME = "facebook/nllb-200-distilled-600M"
@@ -0,0 +1,14 @@
1
+ import re
2
+
3
def protect_words(text, keep_words):
    """Shield each word in *keep_words* behind a unique placeholder.

    Whole-word occurrences (``\\b``-bounded) of every kept word are
    replaced by ``__KEEP_<i>__`` tokens. Returns the rewritten text and
    a placeholder -> original-word mapping for ``restore_words``.
    """
    mapping = {}
    for index, kept in enumerate(keep_words):
        token = f"__KEEP_{index}__"
        mapping[token] = kept
        pattern = re.compile(rf"\b{re.escape(kept)}\b")
        text = pattern.sub(token, text)
    return text, mapping
10
+
11
def restore_words(text, mapping):
    """Swap every placeholder produced by ``protect_words`` back to its word."""
    restored = text
    for token, original in mapping.items():
        restored = restored.replace(token, original)
    return restored
@@ -0,0 +1,4 @@
1
+ {
2
+ "मेरा": "माझा",
3
+ "Anger": "onkar"
4
+ }
@@ -0,0 +1,12 @@
1
+ from importlib import resources
2
+ import json
3
+
4
def load_custom_rules():
    """Read the packaged custom replacement rules (JSON) into a dict."""
    rules_resource = resources.files("smart_translate.rules").joinpath("custom_rules.json")
    with rules_resource.open("r") as handle:
        return json.load(handle)
7
+
8
+
9
def apply_custom_rules(text, rules=None):
    """Apply plain source -> target string replacements to *text*.

    Args:
        text: Text to post-process (typically model output).
        rules: Mapping of substrings to replacements. When ``None``, the
            packaged ``custom_rules.json`` is loaded lazily.

    Returns:
        The text with every rule applied, in dict iteration order.
    """
    # Bug fix: the default was the undefined name CUSTOM_REPLACEMENTS,
    # which raised NameError the moment this module was imported
    # (defaults are evaluated at function-definition time). Use a None
    # sentinel and load the packaged rules only when actually needed.
    if rules is None:
        rules = load_custom_rules()
    for source, target in rules.items():
        text = text.replace(source, target)
    return text
@@ -0,0 +1,10 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
# Bug fix: the packaged data file is named "custom_rules.json" (see the
# package-data glob and smart_translate/rules.py); the old path
# "custom.rules.json" never exists, so load_rules() always fell through
# to the empty dict.
RULES_FILE = Path(__file__).parent / "rules" / "custom_rules.json"

def load_rules():
    """Return the custom rule mapping, or an empty dict if the file is missing."""
    if RULES_FILE.exists():
        with open(RULES_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    return {}
@@ -0,0 +1,47 @@
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
+ from .protector import protect_words, restore_words
4
+ from .rules import apply_custom_rules
5
+ from .config import MODEL_NAME
6
+
7
class SmartTranslator:
    """Selective translator built on the NLLB seq2seq model from config.

    Words listed in ``keep_words`` are shielded from translation with
    placeholders, restored after decoding, and finally the packaged
    custom replacement rules are applied.
    """

    def __init__(self, src_lang, tgt_lang):
        # src_lang / tgt_lang are expected to be NLLB language tokens,
        # e.g. "tha_Thai" / "eng_Latn" -- not validated here.
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Detect device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print("Using device:", self.device)

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

        # Move model to the device
        self.model.to(self.device)
        self.model.eval()

    def translate(self, text, keep_words=None):
        """Translate *text*, leaving every word in *keep_words* untouched.

        Returns the decoded translation after placeholder restoration
        and custom-rule substitution.
        """
        keep_words = keep_words or []

        # Shield keep-words behind __KEEP_n__ placeholders so the model
        # does not translate them.
        protected_text, mapping = protect_words(text, keep_words)

        # Setting src_lang on the tokenizer mutates shared state; this
        # instance is not safe for concurrent translate() calls.
        self.tokenizer.src_lang = self.src_lang
        inputs = self.tokenizer(protected_text, return_tensors="pt")

        # Move input tensors to same device as model
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # NOTE(review): convert_tokens_to_ids returns the unk id when
        # tgt_lang is not a known language token -- confirm codes upstream.
        tgt_lang_id = self.tokenizer.convert_tokens_to_ids(self.tgt_lang)

        # Force the decoder to start with the target-language token;
        # generation is capped at 256 tokens.
        outputs = self.model.generate(
            **inputs,
            forced_bos_token_id=tgt_lang_id,
            max_length=256
        )

        translated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated = restore_words(translated, mapping)
        # Uses apply_custom_rules' default rule set, loaded from the
        # packaged custom_rules.json.
        translated = apply_custom_rules(translated)

        return translated
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.1
2
+ Name: smart-translate
3
+ Version: 0.1.0
4
+ Summary: Rule-based and selective smart translation library
5
+ Author-email: Onkar Nanavare <onkarnanavare007@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: transformers
10
+ Requires-Dist: torch
@@ -0,0 +1,14 @@
1
+ README.md
2
+ pyproject.toml
3
+ smart_translate/__init__.py
4
+ smart_translate/config.py
5
+ smart_translate/protector.py
6
+ smart_translate/rules.py
7
+ smart_translate/rules_loader.py
8
+ smart_translate/translator.py
9
+ smart_translate.egg-info/PKG-INFO
10
+ smart_translate.egg-info/SOURCES.txt
11
+ smart_translate.egg-info/dependency_links.txt
12
+ smart_translate.egg-info/requires.txt
13
+ smart_translate.egg-info/top_level.txt
14
+ smart_translate/rules/custom_rules.json
@@ -0,0 +1,2 @@
1
+ transformers
2
+ torch
@@ -0,0 +1 @@
1
+ smart_translate