argos-translate-files-main 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- argos_translate_files_main-1.4.1.dist-info/METADATA +65 -0
- argos_translate_files_main-1.4.1.dist-info/RECORD +23 -0
- argos_translate_files_main-1.4.1.dist-info/WHEEL +5 -0
- argos_translate_files_main-1.4.1.dist-info/licenses/LICENSE +661 -0
- argos_translate_files_main-1.4.1.dist-info/top_level.txt +2 -0
- argostranslatefiles/__init__.py +1 -0
- argostranslatefiles/abstract_file.py +26 -0
- argostranslatefiles/argostranslatefiles.py +59 -0
- argostranslatefiles/formats/__init__.py +1 -0
- argostranslatefiles/formats/abstract_xml.py +39 -0
- argostranslatefiles/formats/epub.py +86 -0
- argostranslatefiles/formats/html.py +43 -0
- argostranslatefiles/formats/opendocument/__init__.py +0 -0
- argostranslatefiles/formats/opendocument/odp.py +5 -0
- argostranslatefiles/formats/opendocument/odt.py +52 -0
- argostranslatefiles/formats/openxml/__init__.py +1 -0
- argostranslatefiles/formats/openxml/docx.py +58 -0
- argostranslatefiles/formats/openxml/pptx.py +53 -0
- argostranslatefiles/formats/pdf.py +288 -0
- argostranslatefiles/formats/srt.py +28 -0
- argostranslatefiles/formats/txt.py +24 -0
- tests/__init__.py +0 -0
- tests/test_init.py +6 -0
@@ -0,0 +1,288 @@
|
|
1
|
+
import pymupdf as fitz
|
2
|
+
from typing import List
|
3
|
+
from argostranslate.translate import ITranslation
|
4
|
+
from argostranslatefiles.abstract_file import AbstractFile
|
5
|
+
|
6
|
+
|
7
|
+
class Pdf(AbstractFile):
    """PDF file handler: delegates translation to ``PdfTranslator``."""

    supported_file_extensions = ['.pdf']

    def translate(self, underlying_translation: ITranslation, file_path: str) -> str:
        """Translate the PDF at ``file_path`` and return the output path.

        The output path comes from ``get_output_path``, which is inherited
        from ``AbstractFile`` (defined outside this module).
        """
        outfile_path = self.get_output_path(underlying_translation, file_path)

        translator = PdfTranslator(
            pdf_path=file_path,
            output_path=outfile_path,
            underlying_translation=underlying_translation
        )
        translator.translate_pdf()

        return outfile_path

    def get_texts(self, file_path: str):
        """Return at most the first 4096 characters of text in the PDF.

        Pages are read in order and joined with single spaces; reading stops
        as soon as 4096 characters have been collected.
        """
        doc = fitz.open(file_path)
        try:
            texts = []
            count = 0
            for page_num in range(doc.page_count):
                text = doc.load_page(page_num).get_text().strip()
                if not text:
                    continue
                count += len(text)
                texts.append(text)
                if count >= 4096:
                    break
        finally:
            # Release the file handle even when a page fails to load/parse.
            doc.close()

        return " ".join(texts)[:4096]
|
40
|
+
|
41
|
+
|
42
|
+
# Roughly based on https://github.com/CBIhalsen/PolyglotPDF/blob/main/main.py
|
43
|
+
# which is GPLv3
|
44
|
+
class PdfTranslator:
    """Translate the text spans of a PDF and write the result to a new file.

    The source document is opened on construction. ``translate_pdf`` runs the
    whole pipeline: extract spans, translate them, cover the original text,
    draw the translations and save to ``output_path``.

    ``pages_data`` holds one list per page; each entry is a mutable record
    ``[text, bbox, translation, angle, color, text_indent, is_bold,
    font_size, link_info]``.
    """

    def __init__(self, pdf_path: str, output_path: str, underlying_translation: ITranslation):
        self.pdf_path = pdf_path
        self.output_path = output_path
        self.underlying_translation = underlying_translation
        self.doc = fitz.open(pdf_path)
        self.pages_data = []

    def translate_pdf(self):
        """Run extraction, translation, rendering and saving in order.

        The source document is closed when the pipeline finishes, whether it
        succeeded or raised.
        """
        try:
            self._extract_text_from_pages()
            self._translate_pages_data()
            self._apply_translations_to_pdf()
            self._save_translated_pdf()
        finally:
            self._close_source_document()

    def _close_source_document(self):
        """Close ``self.doc`` unless it has already been closed."""
        if not self.doc.is_closed:
            self.doc.close()

    def _decimal_to_hex_color(self, decimal_color):
        """Convert an integer color to a ``#rrggbb`` string (zero-padded)."""
        return f'#{decimal_color:06x}'

    def _is_math(self, text, page_num, font_info):
        """Placeholder: always reports that ``text`` is not a math formula.

        Kept as a hook so formula detection can be added later without
        touching the extraction loop.
        """
        return False

    def _is_non_text(self, text):
        """Placeholder: always reports that ``text`` is translatable text."""
        return False

    def _extract_text_from_pages(self):
        """Collect span records for every page of the source document.

        Kept separate from ``_extract_text_with_pymupdf`` so another
        extraction backend (e.g. OCR) could be dispatched from here.
        """
        for page_num in range(self.doc.page_count):
            self._extract_text_with_pymupdf(page_num)

    def _extract_text_with_pymupdf(self, page_num: int):
        """Append one record per non-empty text span of page ``page_num``."""
        # Make sure pages_data has a slot for this page index.
        while len(self.pages_data) <= page_num:
            self.pages_data.append([])

        page = self.doc.load_page(page_num)

        # Remember each link's rectangle so spans can be matched to links.
        link_map = {}
        for link in page.get_links():
            link_map[fitz.Rect(link["from"])] = {
                "uri": link.get("uri", ""),
                "page": link.get("page", -1),
                "to": link.get("to", None),
                "kind": link.get("kind", 0)
            }

        for block in page.get_text("dict")["blocks"]:
            if "lines" not in block:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span.get("text", "").strip()
                    if not text:
                        continue
                    if self._is_math(text, page_num, None) or self._is_non_text(text):
                        continue

                    bbox = span.get("bbox", (0, 0, 0, 0))
                    font_size = span.get("size", 12)
                    font_flags = span.get("flags", 0)
                    color = span.get("color", 0)
                    # Bit 4 of the span flags is treated as the bold marker.
                    is_bold = bool(font_flags & 2**4)

                    # The first link whose rectangle intersects the span wins.
                    span_rect = fitz.Rect(bbox)
                    link_info = None
                    for link_rect, link_data in link_map.items():
                        if span_rect.intersects(link_rect):
                            link_info = link_data
                            break

                    self.pages_data[page_num].append([
                        text,
                        tuple(bbox),
                        None,  # Translation placeholder
                        0,  # Angle (rotation)
                        self._decimal_to_hex_color(color),
                        0,  # Text indent
                        is_bold,
                        font_size,
                        link_info  # Link information
                    ])

    def _translate_pages_data(self):
        """Fill the translation slot of every record.

        A span whose translation raises keeps its original text; spans that
        were translated successfully are left untouched by that failure.
        """
        for page_blocks in self.pages_data:
            for block in page_blocks:
                try:
                    block[2] = self.underlying_translation.translate(block[0])
                except Exception:
                    # Best effort: fall back to the source text for this span only.
                    block[2] = block[0]

    def _compute_target_coords(self, coords, original_text, translated_text):
        """Return the ``(x0, y0, x1, y1)`` box the translation is drawn into.

        The box is widened by 1-5% depending on how much longer the
        translation is, trimmed vertically by up to 3 units per side, and
        forced to a minimum height of 10 around the original vertical center.
        """
        x0, y0, x1, y1 = coords
        width = x1 - x0
        height = y1 - y0

        # Expansion factor based on text length ratio, clamped to [1.01, 1.05].
        len_ratio = min(1.05, max(1.01, len(translated_text) / max(1, len(original_text))))
        x1 = x1 + (len_ratio - 1) * width

        # Reduce vertical coverage to be more precise.
        vertical_margin = min(height * 0.1, 3)
        y0 = y0 + vertical_margin
        y1 = y1 - vertical_margin

        # Ensure minimum height.
        if y1 - y0 < 10:
            y_center = (coords[1] + coords[3]) / 2
            y0 = y_center - 5
            y1 = y_center + 5

        return (x0, y0, x1, y1)

    def _apply_translations_to_pdf(self):
        """Cover the original spans and draw the translated text on each page."""
        white = (1, 1, 1)

        for page_index, blocks in enumerate(self.pages_data):
            if not blocks:
                continue

            page = self.doc.load_page(page_index)

            normal_blocks = []
            bold_blocks = []
            pending_redactions = []

            for block in blocks:
                translated_text = block[2] if block[2] is not None else block[0]
                enlarged_coords = self._compute_target_coords(block[1], block[0], translated_text)
                rect = fitz.Rect(*enlarged_coords)

                # Mark the original text for removal; paint over it if the
                # redaction annotation cannot be created.
                try:
                    page.add_redact_annot(rect)
                except Exception:
                    page.draw_rect(rect, color=white, fill=white)
                else:
                    pending_redactions.append(rect)

                is_bold = len(block) > 6 and block[6]
                if is_bold:
                    bold_blocks.append((block, enlarged_coords))
                else:
                    normal_blocks.append((block, enlarged_coords))

            # Apply all redactions of the page in one pass instead of once
            # per span; images are left untouched.
            try:
                page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)
            except Exception:
                for rect in pending_redactions:
                    page.draw_rect(rect, color=white, fill=white)

            self._insert_styled_text_blocks(page, normal_blocks, is_bold=False)
            self._insert_styled_text_blocks(page, bold_blocks, is_bold=True)

    def _render_block_html(self, text, color, font_weight, font_size, text_indent, link_info):
        """Build the HTML fragment for one span, escaping text and link URI."""
        # Function-scope import keeps this block self-contained.
        from html import escape

        content = escape(text, quote=False)

        if link_info:
            if link_info.get("uri"):
                href = escape(link_info["uri"], quote=True)
                content = f'<a href="{href}" style="color: {color}; text-decoration: underline;">{content}</a>'
            elif link_info.get("page", -1) >= 0:
                page_num = link_info["page"]
                content = f'<a href="#page{page_num}" style="color: {color}; text-decoration: underline;">{content}</a>'

        return f'<div style="font-size: {font_size}px; color: {color}; font-weight: {font_weight}; text-indent: {text_indent}pt; line-height: 1.2; word-wrap: break-word;">{content}</div>'

    def _insert_styled_text_blocks(self, page, blocks: List, is_bold: bool):
        """Draw each ``(record, coords)`` pair of ``blocks`` onto ``page``.

        Text is rendered as an HTML box; if that fails the plain translated
        text is inserted instead. Link annotations are re-created afterwards.
        """
        if not blocks:
            return

        font_weight = "bold" if is_bold else "normal"

        for block, enlarged_coords in blocks:
            translated_text = block[2] if block[2] is not None else block[0]
            angle = block[3] if len(block) > 3 else 0
            color = block[4] if len(block) > 4 else '#000000'
            text_indent = block[5] if len(block) > 5 else 0
            font_size = block[7] if len(block) > 7 else 12
            link_info = block[8] if len(block) > 8 else None

            rect = fitz.Rect(*enlarged_coords)

            css = f"""
            * {{
                color: {color};
                font-weight: {font_weight};
                font-size: {font_size}px;
                text-indent: {text_indent}pt;
                line-height: 1.2;
                word-wrap: break-word;
                overflow-wrap: break-word;
                width: 100%;
                box-sizing: border-box;
                margin: 0;
                padding: 0;
            }}
            a {{
                text-decoration: underline;
            }}
            """

            html_content = self._render_block_html(
                translated_text, color, font_weight, font_size, text_indent, link_info
            )

            try:
                page.insert_htmlbox(rect, html_content, css=css, rotate=angle)
            except Exception:
                # Fallback uses the plain text, never the HTML markup.
                page.insert_text(rect.tl, translated_text, fontsize=font_size)

            if link_info:
                self._add_link_annotation(page, rect, link_info)

    def _add_link_annotation(self, page, rect, link_info):
        """Re-create a URI or go-to-page link over ``rect`` (best effort)."""
        if link_info.get("uri"):
            link_dict = {
                "kind": fitz.LINK_URI,
                "from": rect,
                "uri": link_info["uri"],
            }
        elif link_info.get("page", -1) >= 0:
            link_dict = {
                "kind": fitz.LINK_GOTO,
                "from": rect,
                "page": link_info["page"],
            }
            if link_info.get("to"):
                link_dict["to"] = link_info["to"]
        else:
            # Only URI and page targets were captured during extraction;
            # other link kinds cannot be rebuilt from link_info.
            return

        try:
            page.insert_link(link_dict)
        except Exception:
            # A link that cannot be restored must not abort the translation.
            pass

    def _save_translated_pdf(self):
        """Copy the modified pages into a fresh document and save it.

        Both the new document and the source document are closed afterwards.
        """
        new_doc = fitz.open()
        try:
            new_doc.insert_pdf(self.doc)
            new_doc.save(self.output_path, garbage=4, deflate=True)
        finally:
            new_doc.close()
            self._close_source_document()
|
@@ -0,0 +1,28 @@
|
|
1
|
+
import pysrt
|
2
|
+
import textwrap
|
3
|
+
from argostranslate.translate import ITranslation
|
4
|
+
from argostranslatefiles.abstract_file import AbstractFile
|
5
|
+
|
6
|
+
|
7
|
+
class Srt(AbstractFile):
    """SubRip (.srt) subtitle handler built on ``pysrt``."""

    supported_file_extensions = ['.srt']

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Translate every subtitle entry and save the result as UTF-8.

        Line breaks inside an entry are flattened to spaces before
        translation, and the translated text is re-wrapped at 40 columns.
        Returns the path of the written file.
        """
        destination = self.get_output_path(underlying_translation, file_path)

        subtitles = pysrt.open(file_path)

        for entry in subtitles:
            single_line = entry.text
            # Order matters: collapse CRLF first so it yields one space.
            for line_break in ('\r\n', '\n', '\r'):
                single_line = single_line.replace(line_break, ' ')
            entry.text = textwrap.fill(
                underlying_translation.translate(single_line),
                width=40,
            )

        subtitles.save(destination, encoding='utf-8')

        return destination

    def get_texts(self, file_path: str):
        """Return at most the first 4096 characters of the subtitle text."""
        entries = pysrt.open(file_path)
        combined = "\n".join(entry.text for entry in entries)
        return combined[:4096]
|
@@ -0,0 +1,24 @@
|
|
1
|
+
from argostranslate.translate import ITranslation
|
2
|
+
|
3
|
+
from argostranslatefiles.abstract_file import AbstractFile
|
4
|
+
|
5
|
+
|
6
|
+
class Txt(AbstractFile):
    """Plain-text (.txt) file handler."""

    supported_file_extensions = ['.txt']

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Translate the whole file in one call and return the output path.

        The output file is opened only after translation has succeeded, so a
        failed translation does not leave an empty file behind.
        """
        outfile_path = self.get_output_path(underlying_translation, file_path)

        # NOTE(review): both files use the platform default encoding, as in
        # the original code; confirm whether UTF-8 should be forced.
        with open(file_path, "r") as infile:
            source_text = infile.read()

        translated_text = underlying_translation.translate(source_text)

        with open(outfile_path, "w") as outfile:
            outfile.write(translated_text)

        return outfile_path

    def get_texts(self, file_path: str):
        """Return at most the first 4096 characters of the file."""
        with open(file_path, "r") as infile:
            return infile.read(4096)
|
tests/__init__.py
ADDED
File without changes
|