argos-translate-files-main 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,288 @@
1
+ import pymupdf as fitz
2
+ from typing import List
3
+ from argostranslate.translate import ITranslation
4
+ from argostranslatefiles.abstract_file import AbstractFile
5
+
6
+
7
class Pdf(AbstractFile):
    """File handler that translates PDF documents via ``PdfTranslator``."""

    supported_file_extensions = ['.pdf']

    def translate(self, underlying_translation: ITranslation, file_path: str) -> str:
        """Translate the PDF at *file_path* and return the output file path.

        The output path comes from ``self.get_output_path`` (not defined in
        this class; presumably inherited from ``AbstractFile``).
        """
        outfile_path = self.get_output_path(underlying_translation, file_path)

        translator = PdfTranslator(
            pdf_path=file_path,
            output_path=outfile_path,
            underlying_translation=underlying_translation,
        )
        translator.translate_pdf()

        return outfile_path

    def get_texts(self, file_path: str):
        """Return up to 4096 characters of text sampled from the first pages.

        Pages are read in order and reading stops as soon as 4096 characters
        of non-empty page text have been collected. Page texts are joined
        with a single space and the result is truncated to 4096 characters.
        """
        doc = fitz.open(file_path)
        try:
            texts = []
            count = 0
            for page_num in range(doc.page_count):
                text = doc.load_page(page_num).get_text().strip()
                if not text:
                    continue
                count += len(text)
                texts.append(text)
                if count >= 4096:
                    break
        finally:
            # Close the document even when a page fails to load or extract.
            doc.close()

        return " ".join(texts)[:4096]
40
+
41
+
42
# Roughly based on https://github.com/CBIhalsen/PolyglotPDF/blob/main/main.py
# which is GPLv3
class PdfTranslator:
    """Translate the text spans of a PDF and write the result to a new file.

    The work is done in four steps (see ``translate_pdf``): extract the text
    spans of every page, translate them, blank out the original spans and
    re-insert the translated text, then save the document.

    Every entry of ``pages_data[page_index]`` is a list laid out as::

        [text, bbox, translation, angle, color, text_indent,
         is_bold, font_size, link_info]
    """

    def __init__(self, pdf_path: str, output_path: str, underlying_translation: "ITranslation"):
        """Open *pdf_path* and remember where the translated copy must go."""
        self.pdf_path = pdf_path
        self.output_path = output_path
        self.underlying_translation = underlying_translation
        self.doc = fitz.open(pdf_path)
        self.pages_data = []

    def translate_pdf(self):
        """Run the full extract / translate / apply / save pipeline."""
        try:
            self._extract_text_from_pages()
            self._translate_pages_data()
            self._apply_translations_to_pdf()
            self._save_translated_pdf()
        finally:
            # _save_translated_pdf closes the document on the normal path;
            # this guard only releases it when an earlier step failed.
            if not self.doc.is_closed:
                self.doc.close()

    def _decimal_to_hex_color(self, decimal_color):
        """Convert an integer color such as 255 to a CSS string ('#0000ff')."""
        return f'#{decimal_color:06x}'

    def _is_math(self, text, page_num, font_info):
        """Placeholder that always returns False.

        Upstream PolyglotPDF has no implementation for this check yet; the
        hook is kept so a later upstream implementation can be dropped in.
        The same applies to ``_is_non_text``.
        """
        return False

    def _is_non_text(self, text):
        """Placeholder that always returns False (see ``_is_math``)."""
        return False

    def _extract_text_from_pages(self):
        """Collect the text spans of every page into ``self.pages_data``."""
        # Kept separate from _extract_text_with_pymupdf so that an OCR-based
        # extractor could be plugged in here later.
        for page_num in range(self.doc.page_count):
            self._extract_text_with_pymupdf(page_num)

    def _extract_text_with_pymupdf(self, page_num: int):
        """Append one entry per non-empty text span of page *page_num*."""
        # Make sure pages_data has a slot for this page index.
        while len(self.pages_data) <= page_num:
            self.pages_data.append([])

        page = self.doc.load_page(page_num)

        # Map each link rectangle to the link attributes needed to rebuild it.
        link_map = {}
        for link in page.get_links():
            link_map[fitz.Rect(link["from"])] = {
                "uri": link.get("uri", ""),
                "page": link.get("page", -1),
                "to": link.get("to", None),
                "kind": link.get("kind", 0),
            }

        for block in page.get_text("dict")["blocks"]:
            if "lines" not in block:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span.get("text", "").strip()
                    if not text:
                        continue
                    if self._is_math(text, page_num, None) or self._is_non_text(text):
                        continue

                    bbox = span.get("bbox", (0, 0, 0, 0))
                    font_size = span.get("size", 12)
                    font_flags = span.get("flags", 0)
                    color = span.get("color", 0)
                    # Bit 4 (value 16) of the span flags is treated as "bold".
                    is_bold = bool(font_flags & 2**4)

                    # The first link rectangle touching the span wins.
                    span_rect = fitz.Rect(bbox)
                    link_info = None
                    for link_rect, link_data in link_map.items():
                        if span_rect.intersects(link_rect):
                            link_info = link_data
                            break

                    self.pages_data[page_num].append([
                        text,
                        tuple(bbox),
                        None,  # Translation placeholder
                        0,  # Angle (rotation)
                        self._decimal_to_hex_color(color),
                        0,  # Text indent
                        is_bold,
                        font_size,
                        link_info,  # Link information
                    ])

    def _translate_pages_data(self):
        """Fill the translation slot (index 2) of every extracted span.

        A span whose translation raises keeps its original text; spans that
        translated successfully are not affected by a later failure.
        """
        import logging

        logger = logging.getLogger(__name__)
        for page_blocks in self.pages_data:
            for block in page_blocks:
                original_text = block[0]
                try:
                    block[2] = self.underlying_translation.translate(original_text)
                except Exception:
                    # Best effort: fall back to the source text for this span only.
                    logger.warning(
                        "Translation failed for a PDF span; keeping original text",
                        exc_info=True,
                    )
                    block[2] = original_text

    def _compute_cover_coords(self, block):
        """Return the (x0, y0, x1, y1) area to blank out and refill for *block*."""
        original_text = block[0]
        translated_text = block[2] if block[2] is not None else original_text
        x0, y0, x1, y1 = block[1]

        # Widen the box by 1%..5% depending on how much longer the translation is.
        len_ratio = min(1.05, max(1.01, len(translated_text) / max(1, len(original_text))))
        x1 += (len_ratio - 1) * (x1 - x0)

        # Shrink vertically (at most 3 units per side) to avoid touching neighbours.
        vertical_margin = min((y1 - y0) * 0.1, 3)
        top = y0 + vertical_margin
        bottom = y1 - vertical_margin

        # Enforce a minimum height of 10, centred on the original span.
        if bottom - top < 10:
            y_center = (y0 + y1) / 2
            top = y_center - 5
            bottom = y_center + 5

        return (x0, top, x1, bottom)

    def _apply_translations_to_pdf(self):
        """Blank out the original spans and insert the translated text."""
        for page_index, blocks in enumerate(self.pages_data):
            if not blocks:
                continue

            page = self.doc.load_page(page_index)

            normal_blocks = []
            bold_blocks = []
            redact_rects = []
            paint_rects = []  # rectangles that need the white-box fallback

            for block in blocks:
                enlarged_coords = self._compute_cover_coords(block)
                rect = fitz.Rect(*enlarged_coords)

                try:
                    page.add_redact_annot(rect)
                except Exception:
                    paint_rects.append(rect)
                else:
                    redact_rects.append(rect)

                is_bold = len(block) > 6 and block[6]
                if is_bold:
                    bold_blocks.append((block, enlarged_coords))
                else:
                    normal_blocks.append((block, enlarged_coords))

            # Apply all redactions of the page in one pass instead of once per
            # span; each call rewrites the page contents.
            if redact_rects:
                try:
                    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)
                except Exception:
                    paint_rects.extend(redact_rects)

            # Fallback: cover the original text with a white rectangle.
            for rect in paint_rects:
                page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1))

            self._insert_styled_text_blocks(page, normal_blocks, is_bold=False)
            self._insert_styled_text_blocks(page, bold_blocks, is_bold=True)

    def _insert_styled_text_blocks(self, page, blocks: List, is_bold: bool):
        """Insert translated text for *blocks* (``(block, coords)`` pairs) on *page*."""
        import html

        if not blocks:
            return

        font_weight = "bold" if is_bold else "normal"

        for block, enlarged_coords in blocks:
            plain_text = block[2] if block[2] is not None else block[0]
            angle = block[3] if len(block) > 3 else 0
            color = block[4] if len(block) > 4 else '#000000'
            text_indent = block[5] if len(block) > 5 else 0
            font_size = block[7] if len(block) > 7 else 12
            link_info = block[8] if len(block) > 8 else None

            rect = fitz.Rect(*enlarged_coords)

            # Escape the text so characters such as '<' or '&' are rendered
            # literally instead of being parsed as HTML markup.
            markup = html.escape(plain_text)

            if link_info:
                if link_info.get("uri"):
                    href = html.escape(link_info["uri"], quote=True)
                    markup = f'<a href="{href}" style="color: {color}; text-decoration: underline;">{markup}</a>'
                elif link_info.get("page", -1) >= 0:
                    page_num = link_info["page"]
                    markup = f'<a href="#page{page_num}" style="color: {color}; text-decoration: underline;">{markup}</a>'

            css = f"""
            * {{
                color: {color};
                font-weight: {font_weight};
                font-size: {font_size}px;
                text-indent: {text_indent}pt;
                line-height: 1.2;
                word-wrap: break-word;
                overflow-wrap: break-word;
                width: 100%;
                box-sizing: border-box;
                margin: 0;
                padding: 0;
            }}
            a {{
                text-decoration: underline;
            }}
            """

            html_content = f'<div style="font-size: {font_size}px; color: {color}; font-weight: {font_weight}; text-indent: {text_indent}pt; line-height: 1.2; word-wrap: break-word;">{markup}</div>'

            try:
                page.insert_htmlbox(rect, html_content, css=css, rotate=angle)
            except Exception:
                # Fallback: insert the plain (un-marked-up) translation.
                page.insert_text(rect.tl, plain_text, fontsize=font_size)

            if link_info:
                self._add_link_annotation(page, rect, link_info)

    def _add_link_annotation(self, page, rect, link_info):
        """Re-create the link described by *link_info* over *rect* (best effort)."""
        import logging

        link_dict = {
            "kind": link_info.get("kind", fitz.LINK_URI),
            "from": rect,
        }

        if link_info.get("uri"):
            link_dict["uri"] = link_info["uri"]
            link_dict["kind"] = fitz.LINK_URI
        elif link_info.get("page", -1) >= 0:
            link_dict["page"] = link_info["page"]
            link_dict["kind"] = fitz.LINK_GOTO
            if link_info.get("to"):
                link_dict["to"] = link_info["to"]

        try:
            page.insert_link(link_dict)
        except Exception:
            # A missing link must not abort the translation, but should be visible.
            logging.getLogger(__name__).warning(
                "Could not re-create a link annotation", exc_info=True
            )

    def _save_translated_pdf(self):
        """Copy the modified document into a fresh PDF, save it, and close both."""
        new_doc = fitz.open()
        try:
            new_doc.insert_pdf(self.doc)
            new_doc.save(self.output_path, garbage=4, deflate=True)
        finally:
            new_doc.close()
            self.doc.close()
@@ -0,0 +1,28 @@
1
+ import pysrt
2
+ import textwrap
3
+ from argostranslate.translate import ITranslation
4
+ from argostranslatefiles.abstract_file import AbstractFile
5
+
6
+
7
class Srt(AbstractFile):
    """File handler for SubRip (.srt) subtitle files."""

    supported_file_extensions = ['.srt']

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Translate every subtitle of *file_path* and return the output path.

        Line breaks inside a subtitle are replaced by spaces before
        translation; the translated text is re-wrapped at 40 columns and the
        result is saved as UTF-8.
        """
        destination = self.get_output_path(underlying_translation, file_path)

        subtitles = pysrt.open(file_path)
        for entry in subtitles:
            flattened = entry.text
            # Order matters: collapse '\r\n' before the single-character forms.
            for line_break in ('\r\n', '\n', '\r'):
                flattened = flattened.replace(line_break, ' ')
            entry.text = textwrap.fill(
                underlying_translation.translate(flattened),
                width=40,
            )

        subtitles.save(destination, encoding='utf-8')
        return destination

    def get_texts(self, file_path: str):
        """Return the first 4096 characters of all subtitle texts joined by newlines."""
        joined = "\n".join(entry.text for entry in pysrt.open(file_path))
        return joined[:4096]
@@ -0,0 +1,24 @@
1
+ from argostranslate.translate import ITranslation
2
+
3
+ from argostranslatefiles.abstract_file import AbstractFile
4
+
5
+
6
class Txt(AbstractFile):
    """File handler for plain-text (.txt) files."""

    supported_file_extensions = ['.txt']

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Translate the whole content of *file_path* and return the output path.

        The source is read completely and translated before the output file
        is opened, so a failing translation does not leave an empty output
        file behind.
        """
        outfile_path = self.get_output_path(underlying_translation, file_path)

        with open(file_path, "r") as infile:
            source_text = infile.read()

        translated_text = underlying_translation.translate(source_text)

        with open(outfile_path, "w") as outfile:
            outfile.write(translated_text)

        return outfile_path

    def get_texts(self, file_path: str):
        """Return up to the first 4096 characters of *file_path*."""
        with open(file_path, "r") as infile:
            return infile.read(4096)
tests/__init__.py ADDED
File without changes
tests/test_init.py ADDED
@@ -0,0 +1,6 @@
1
+ import argostranslatefiles
2
+
3
+
4
def test_init():
    """Check that the package reports at least one supported format."""
    supported_formats = argostranslatefiles.get_supported_formats()
    assert len(supported_formats) >= 1