argos-translate-files-main 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,65 @@
1
+ Metadata-Version: 2.4
2
+ Name: argos-translate-files-main
3
+ Version: 1.4.0
4
+ Summary: Translate files with Argos Translate
5
+ Home-page: https://github.com/LibreTranslate/argos-translate-files
6
+ Author: S. Thuret
7
+ Author-email: contact@sebastien-thuret.fr
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: beautifulsoup4>=4.9.3
11
+ Requires-Dist: lxml>=4.9.2
12
+ Requires-Dist: argostranslate>=1.5.1
13
+ Requires-Dist: translatehtml>=1.5.1
14
+ Requires-Dist: pysrt>=1.1.2
15
+ Requires-Dist: PyMuPDF>=1.24.11
16
+ Dynamic: author
17
+ Dynamic: author-email
18
+ Dynamic: description
19
+ Dynamic: description-content-type
20
+ Dynamic: home-page
21
+ Dynamic: license-file
22
+ Dynamic: requires-dist
23
+ Dynamic: summary
24
+
25
+ # Argos Translate Files
26
+
27
+ Translate files using [Argos Translate](https://github.com/argosopentech/argos-translate).
28
+
29
+ ## Supported file formats
30
+
31
+ .txt, .odt, .odp, .docx, .pptx, .epub, .html, .srt, .pdf
32
+
33
+ ## Install
34
+
35
+ ```
36
+ pip install argos-translate-files
37
+ ```
38
+
39
+
40
+ ## Example
41
+
42
+ ```python
43
+ import os.path
44
+
45
+ import argostranslate.package, argostranslate.translate
46
+
47
+
48
+ import argostranslatefiles
49
+ from argostranslatefiles import argostranslatefiles
50
+
51
+ from_code = "fr"
52
+ to_code = "en"
53
+
54
+ installed_languages = argostranslate.translate.get_installed_languages()
55
+ from_lang = list(filter(
56
+ lambda x: x.code == from_code,
57
+ installed_languages))[0]
58
+ to_lang = list(filter(
59
+ lambda x: x.code == to_code,
60
+ installed_languages))[0]
61
+ underlying_translation = from_lang.get_translation(to_lang)
62
+
63
+ argostranslatefiles.translate_file(underlying_translation, os.path.abspath('path/to/file.txt'))
64
+
65
+ ```
@@ -0,0 +1,23 @@
1
+ argos_translate_files_main-1.4.0.dist-info/licenses/LICENSE,sha256=fF1KSXtjRXSX1_BlLo-wY7dRtPnzo1rFiwEKrQaY1-s,1072
2
+ argostranslatefiles/__init__.py,sha256=T0ahsHLbHg2WfvkEQemFCuWSAGC_EwdiOF9eUKgFNKA,54
3
+ argostranslatefiles/abstract_file.py,sha256=nk7Bz0IOwct4vZnSLJU9JLTvmvIKJq15NXmkv8QaOB4,839
4
+ argostranslatefiles/argostranslatefiles.py,sha256=eA17P4Mfuln0hHUbxVkYxMzA79hy7uhs9s9vN_ej6BI,1563
5
+ argostranslatefiles/formats/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
6
+ argostranslatefiles/formats/abstract_xml.py,sha256=CsFEzzaYNacHNYD86UZxbNsQTpVaFlaG0YrKVjTRvgw,1340
7
+ argostranslatefiles/formats/epub.py,sha256=aKSCs8B7Nr9SkH0D5PKGn2B2haoqMnoOv7XncTLYHn8,3226
8
+ argostranslatefiles/formats/html.py,sha256=cj55uvhZfqXFvRTB_DyovTaHWzRs12RnYxE_euXZEHQ,1159
9
+ argostranslatefiles/formats/pdf.py,sha256=g9fRs89EKdod1NAeAolfYDdG9A_fJTcJqG16CH7E4SE,11053
10
+ argostranslatefiles/formats/srt.py,sha256=IjurpRfZvL0mK-ZkJjaYmvuYXnknFVmV3x4-XEqPPJc,720
11
+ argostranslatefiles/formats/txt.py,sha256=zRAZGxkzt5JWoHy-ZqZUlu9NvbND9aK2qivo3MiP2W0,698
12
+ argostranslatefiles/formats/opendocument/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ argostranslatefiles/formats/opendocument/odp.py,sha256=EmkbAxWmBMxGajrp7Zq0QItnrGQlynlKeolNP7kedHU,120
14
+ argostranslatefiles/formats/opendocument/odt.py,sha256=qklQHc7Vy00KL5B-0kce1rhoNYiqeyDVB7l6thp2V_o,1690
15
+ argostranslatefiles/formats/openxml/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
16
+ argostranslatefiles/formats/openxml/docx.py,sha256=xqDWTL4QZVqy4dBOzgY42zc9ej8MUjj4Z3UNKFDsMxY,1704
17
+ argostranslatefiles/formats/openxml/pptx.py,sha256=l3neBU_wWoCEYfm6gph0EaW0PsptiL27WfdFCatDDtw,1756
18
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ tests/test_init.py,sha256=VmOIM9Zw5AAHZAXC3b97STO2MF5VUST-sZQMuP9QPYE,164
20
+ argos_translate_files_main-1.4.0.dist-info/METADATA,sha256=_b4V8c1sHgxo9jAYRBoTnr9ULg1yn8aQ55hvB42g3wQ,1546
21
+ argos_translate_files_main-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
22
+ argos_translate_files_main-1.4.0.dist-info/top_level.txt,sha256=H2-Nav_Fg6dGaIChlLbDv4Bf8mZ-pLh3iSlKs7MDatQ,26
23
+ argos_translate_files_main-1.4.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Argos Open Tech
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ argostranslatefiles
2
+ tests
@@ -0,0 +1 @@
1
+ from argostranslatefiles.argostranslatefiles import *
@@ -0,0 +1,26 @@
1
+ import abc
2
+ import os.path
3
+
4
+ from argostranslate.translate import ITranslation
5
+
6
+
7
class AbstractFile():
    """Base class shared by every translatable file format.

    Subclasses list the extensions they handle in
    ``supported_file_extensions`` and implement ``translate`` and
    ``get_texts``.
    """

    # Extensions (leading dot included) handled by the format.
    supported_file_extensions = []

    def support(self, file_path: str):
        """Tell whether this format handles ``file_path``.

        The comparison ignores case, so ``report.TXT`` is matched by a
        format declaring ``.txt``.

        Args:
            file_path (str): file path

        Returns:
            bool: True when the extension of ``file_path`` is supported
        """
        file_ext = os.path.splitext(file_path)[1].lower()

        return any(file_ext == ext.lower() for ext in self.supported_file_extensions)

    def get_output_path(self, underlying_translation: ITranslation, file_path: str):
        """Build the path of the translated file.

        The target language code is appended to the file name, before the
        extension: ``dir/name.ext`` becomes ``dir/name_<code>.ext``.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the file to translate

        Returns:
            str: path of the translated file, in the same directory as ``file_path``
        """
        dir_path, base_name = os.path.split(file_path)
        file_name, file_ext = os.path.splitext(base_name)
        to_code = underlying_translation.to_lang.code

        # os.path.join keeps a bare file name relative; concatenating with "/"
        # would have pointed it at the filesystem root.
        return os.path.join(dir_path, file_name + '_' + to_code + file_ext)

    @abc.abstractmethod
    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Translate ``file_path``; implemented by each format."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_texts(self, file_path: str):
        """Return the text content of ``file_path``; implemented by each format."""
        raise NotImplementedError
@@ -0,0 +1,59 @@
1
+ from argostranslate.translate import ITranslation
2
+
3
+ from argostranslatefiles.formats.html import Html
4
+ from argostranslatefiles.formats.opendocument.odp import Odp
5
+ from argostranslatefiles.formats.opendocument.odt import Odt
6
+ from argostranslatefiles.formats.openxml.docx import Docx
7
+ from argostranslatefiles.formats.openxml.pptx import Pptx
8
+ from argostranslatefiles.formats.txt import Txt
9
+ from argostranslatefiles.formats.epub import Epub
10
+ from argostranslatefiles.formats.srt import Srt
11
+ from argostranslatefiles.formats.pdf import Pdf
12
+
13
def get_supported_formats():
    """Create one handler instance per supported file format.

    Returns:
        list: freshly built handlers, in the order they are tried
    """
    format_classes = (Txt, Odt, Odp, Docx, Pptx, Epub, Html, Srt, Pdf)
    return [format_class() for format_class in format_classes]
25
+
26
+
27
def translate_file(underlying_translation: ITranslation, file_path: str):
    """Translate a file with the first format handler that supports it.

    Args:
        underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
        file_path (str): file path

    Returns:
        Whatever the handler's ``translate`` returns (the translated file
        path), or False when no handler supports the file.
    """
    handler = next(
        (candidate for candidate in get_supported_formats() if candidate.support(file_path)),
        None,
    )
    if handler is None:
        return False

    return handler.translate(underlying_translation, file_path)
43
+
44
+
45
def get_texts(file_path: str):
    """Read the text content of a file with the first handler that supports it.

    Args:
        file_path (str): file path

    Returns:
        Whatever the handler's ``get_texts`` returns (the file contents),
        or False when no handler supports the file.
    """
    handler = next(
        (candidate for candidate in get_supported_formats() if candidate.support(file_path)),
        None,
    )
    if handler is None:
        return False

    return handler.get_texts(file_path)
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,39 @@
1
+ import argostranslate
2
+ import bs4
3
+ from argostranslate.tags import Tag, ITag
4
+
5
+ from argostranslatefiles.abstract_file import AbstractFile
6
+
7
+
8
class AbstractXml(AbstractFile):
    """Base class for XML based formats.

    Converts between BeautifulSoup nodes and the Argos Translate tag tree
    so the text can be translated while the markup is kept.
    """

    def is_translatable(self, soup):
        """Return True when the node contains any text."""
        return soup.text != ""

    def itag_of_soup(self, soup):
        """Returns an argostranslate.tags.ITag tree from a BeautifulSoup object.

        Args:
            soup (bs4.element.NavigableString or bs4.element.Tag): Beautiful Soup object

        Returns:
            argostranslate.tags.ITag: Argos Translate ITag tree (a plain str for text nodes)
        """
        if isinstance(soup, bs4.element.NavigableString):
            return str(soup)

        translatable = self.is_translatable(soup)
        to_return = Tag([self.itag_of_soup(content) for content in soup.contents], translatable)
        # Remember the source node so soup_of_itag can put the translated children back into it.
        to_return.soup = soup
        return to_return

    def soup_of_itag(self, itag: ITag):
        """Returns a BeautifulSoup object from an Argos Translate ITag.

        The node stored on the tag by ``itag_of_soup`` is reused and its
        contents are replaced by the converted children.

        Args:
            itag (argostranslate.tags.ITag): ITag object to convert to Soup

        Returns:
            bs4.element.NavigableString or bs4.element.Tag: BeautifulSoup object
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison.
        if isinstance(itag, str):
            return bs4.element.NavigableString(itag)
        soup = itag.soup
        soup.contents = [self.soup_of_itag(child) for child in itag.children]
        return soup
@@ -0,0 +1,86 @@
1
+ import re
2
+ import zipfile
3
+
4
+ import translatehtml
5
+ from argostranslate.tags import translate_tags
6
+ from argostranslate.translate import ITranslation
7
+ from bs4 import BeautifulSoup
8
+
9
+ from argostranslatefiles.formats.abstract_xml import AbstractXml
10
+
11
+
12
class Epub(AbstractXml):
    """EPUB handler: translates the package/navigation XML and the (X)HTML pages."""

    supported_file_extensions = ['.epub']

    # Archive members parsed as XML and translated through the tag tree.
    _TRANSLATABLE_XML_FILENAMES = ("OPS/content.opf", "OPS/toc.ncx", "OEBPS/content.opf", "OEBPS/toc.ncx")
    # Prolog removed before the HTML translation and put back afterwards.
    _HTML_HEAD = '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'
    _HTML_SUFFIXES = ('.html', '.xhtml')

    def is_translatable(self, soup):
        """Return True when the node contains any text."""
        return soup.text != ""

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Write a translated copy of the EPUB next to the original.

        Members that are neither translatable XML nor (X)HTML are copied
        unchanged.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the EPUB to translate

        Returns:
            str: path of the translated EPUB
        """
        outzip_path = self.get_output_path(underlying_translation, file_path)
        head = self._HTML_HEAD

        # Context managers close both archives even when translation fails.
        with zipfile.ZipFile(file_path, "r") as inzip, zipfile.ZipFile(outzip_path, "w") as outzip:
            for inzipinfo in inzip.infolist():
                with inzip.open(inzipinfo) as infile:
                    data = infile.read()

                if inzipinfo.filename in self._TRANSLATABLE_XML_FILENAMES:
                    soup = BeautifulSoup(data, 'xml')

                    itag = self.itag_of_soup(soup)
                    translated_tag = translate_tags(underlying_translation, itag)
                    translated_soup = self.soup_of_itag(translated_tag)

                    outzip.writestr(inzipinfo.filename, str(translated_soup))
                elif inzipinfo.filename.endswith(self._HTML_SUFFIXES):
                    content = str(data, 'utf-8')
                    head_present = content.startswith(head)

                    if head_present:
                        content = content[len(head):]

                    translated = str(translatehtml.translate_html(underlying_translation, content))

                    if head_present:
                        translated = head + translated

                    outzip.writestr(inzipinfo.filename, translated)
                else:
                    outzip.writestr(inzipinfo.filename, data)

        return outzip_path

    def get_texts(self, file_path: str):
        """Return up to 4096 characters of content read from the EPUB.

        Args:
            file_path (str): path of the EPUB

        Returns:
            str: at most 4096 characters gathered from the archive members
        """
        head = self._HTML_HEAD
        texts = ""

        with zipfile.ZipFile(file_path, "r") as inzip:
            for inzipinfo in inzip.infolist():
                if len(texts) > 4096:
                    break
                with inzip.open(inzipinfo) as infile:
                    data = infile.read()

                if inzipinfo.filename in self._TRANSLATABLE_XML_FILENAMES:
                    soup = BeautifulSoup(data, 'xml')

                    texts += self.itag_of_soup(soup).text()
                elif inzipinfo.filename.endswith(self._HTML_SUFFIXES):
                    content = str(data, 'utf-8')

                    if content.startswith(head):
                        content = content[len(head):]

                    texts += content
                else:
                    try:
                        texts += data.decode()
                    except UnicodeDecodeError:
                        # Binary member (image, font, ...): it carries no text,
                        # so skip it instead of aborting the whole extraction.
                        continue

        return texts[:4096]
@@ -0,0 +1,43 @@
1
+ import translatehtml
2
+ from argostranslate.translate import ITranslation
3
+
4
+ from argostranslatefiles.abstract_file import AbstractFile
5
+ from bs4 import BeautifulSoup
6
+
7
+
8
class Html(AbstractFile):
    """HTML handler built on translatehtml."""

    supported_file_extensions = ['.html']

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Write a translated copy of the HTML file next to the original.

        A leading ``<!DOCTYPE html>`` is removed before translation and put
        back in front of the translated markup.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the HTML file to translate

        Returns:
            str: path of the translated file
        """
        outfile_path = self.get_output_path(underlying_translation, file_path)

        with open(file_path, "r") as infile:
            content = infile.read()

        head = '<!DOCTYPE html>'
        head_present = content.startswith(head)

        if head_present:
            content = content[len(head):]

        translated = str(translatehtml.translate_html(underlying_translation, content))

        if head_present:
            translated = head + translated

        # The output is opened only once the translation succeeded, so a
        # failure does not leave an empty file behind.
        with open(outfile_path, "w") as outfile:
            outfile.write(translated)

        return outfile_path

    def get_texts(self, file_path: str):
        """Return up to 4096 characters of text extracted from the HTML file.

        Args:
            file_path (str): path of the HTML file

        Returns:
            str: at most 4096 characters of text
        """
        with open(file_path, "r") as infile:
            content = infile.read()

        soup = BeautifulSoup(content, "html.parser")
        return translatehtml.itag_of_soup(soup).text()[0:4096]
File without changes
@@ -0,0 +1,5 @@
1
+ from argostranslatefiles.formats.opendocument.odt import Odt
2
+
3
+
4
class Odp(Odt):
    """OpenDocument presentation handler; reuses the Odt implementation unchanged."""

    supported_file_extensions = ['.odp']
@@ -0,0 +1,52 @@
1
+ import zipfile
2
+
3
+ from argostranslate.tags import translate_tags
4
+ from argostranslate.translate import ITranslation
5
+ from bs4 import BeautifulSoup
6
+
7
+ from argostranslatefiles.formats.abstract_xml import AbstractXml
8
+
9
+
10
class Odt(AbstractXml):
    """OpenDocument text handler: translates ``content.xml`` inside the archive."""

    supported_file_extensions = ['.odt']

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Write a translated copy of the document next to the original.

        Only ``content.xml`` is translated; every other archive member is
        copied unchanged.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the document to translate

        Returns:
            str: path of the translated document
        """
        outzip_path = self.get_output_path(underlying_translation, file_path)

        # Context managers close both archives even when translation fails.
        with zipfile.ZipFile(file_path, "r") as inzip, zipfile.ZipFile(outzip_path, "w") as outzip:
            for inzipinfo in inzip.infolist():
                with inzip.open(inzipinfo) as infile:
                    data = infile.read()

                if inzipinfo.filename == "content.xml":
                    soup = BeautifulSoup(data, 'xml')

                    itag = self.itag_of_soup(soup)
                    translated_tag = translate_tags(underlying_translation, itag)
                    translated_soup = self.soup_of_itag(translated_tag)

                    outzip.writestr(inzipinfo.filename, str(translated_soup))
                else:
                    outzip.writestr(inzipinfo.filename, data)

        return outzip_path

    def get_texts(self, file_path: str):
        """Return up to 4096 characters of text read from ``content.xml``.

        Args:
            file_path (str): path of the document

        Returns:
            str: at most 4096 characters of text (empty when the archive has no ``content.xml``)
        """
        texts = ""

        with zipfile.ZipFile(file_path, "r") as inzip:
            for inzipinfo in inzip.infolist():
                if len(texts) > 4096:
                    break
                # Only the content member is needed; the others are not opened.
                if inzipinfo.filename != "content.xml":
                    continue
                with inzip.open(inzipinfo) as infile:
                    soup = BeautifulSoup(infile.read(), 'xml')
                texts += self.itag_of_soup(soup).text()

        return texts[:4096]
@@ -0,0 +1,52 @@
1
+ import zipfile
2
+
3
+ from argostranslate.tags import translate_tags
4
+ from argostranslate.translate import ITranslation
5
+ from bs4 import BeautifulSoup
6
+
7
+ from argostranslatefiles.formats.abstract_xml import AbstractXml
8
+
9
+
10
class Docx(AbstractXml):
    """Word (.docx) handler: translates ``word/document.xml`` inside the archive."""

    supported_file_extensions = ['.docx']

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Write a translated copy of the document next to the original.

        Only ``word/document.xml`` is translated; every other archive member
        is copied unchanged.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the document to translate

        Returns:
            str: path of the translated document
        """
        outzip_path = self.get_output_path(underlying_translation, file_path)

        # Context managers close both archives even when translation fails.
        with zipfile.ZipFile(file_path, "r") as inzip, zipfile.ZipFile(outzip_path, "w") as outzip:
            for inzipinfo in inzip.infolist():
                with inzip.open(inzipinfo) as infile:
                    data = infile.read()

                if inzipinfo.filename == "word/document.xml":
                    soup = BeautifulSoup(data, 'xml')

                    itag = self.itag_of_soup(soup)
                    translated_tag = translate_tags(underlying_translation, itag)
                    translated_soup = self.soup_of_itag(translated_tag)

                    outzip.writestr(inzipinfo.filename, str(translated_soup))
                else:
                    outzip.writestr(inzipinfo.filename, data)

        return outzip_path

    def get_texts(self, file_path: str):
        """Return up to 4096 characters of text read from ``word/document.xml``.

        Args:
            file_path (str): path of the document

        Returns:
            str: at most 4096 characters of text (empty when the member is missing)
        """
        texts = ""

        with zipfile.ZipFile(file_path, "r") as inzip:
            for inzipinfo in inzip.infolist():
                if len(texts) > 4096:
                    break
                # Only the main document member is needed; the others are not opened.
                if inzipinfo.filename != "word/document.xml":
                    continue
                with inzip.open(inzipinfo) as infile:
                    soup = BeautifulSoup(infile.read(), 'xml')
                texts += self.itag_of_soup(soup).text()

        return texts[:4096]
@@ -0,0 +1,53 @@
1
+ import re
2
+ import zipfile
3
+
4
+ from argostranslate.tags import translate_tags
5
+ from argostranslate.translate import ITranslation
6
+ from bs4 import BeautifulSoup
7
+
8
+ from argostranslatefiles.formats.abstract_xml import AbstractXml
9
+
10
+
11
class Pptx(AbstractXml):
    """PowerPoint (.pptx) handler: translates the slide XML inside the archive."""

    supported_file_extensions = ['.pptx']

    # Archive members holding slide content; compiled once instead of per member.
    _SLIDE_PATTERN = re.compile(r"ppt\/slides\/slide[0-9]*\.xml")

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Write a translated copy of the presentation next to the original.

        Only the slide members are translated; every other archive member is
        copied unchanged.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the presentation to translate

        Returns:
            str: path of the translated presentation
        """
        outzip_path = self.get_output_path(underlying_translation, file_path)

        # Context managers close both archives even when translation fails.
        with zipfile.ZipFile(file_path, "r") as inzip, zipfile.ZipFile(outzip_path, "w") as outzip:
            for inzipinfo in inzip.infolist():
                with inzip.open(inzipinfo) as infile:
                    data = infile.read()

                if self._SLIDE_PATTERN.match(inzipinfo.filename):
                    soup = BeautifulSoup(data, 'xml')

                    itag = self.itag_of_soup(soup)
                    translated_tag = translate_tags(underlying_translation, itag)
                    translated_soup = self.soup_of_itag(translated_tag)

                    outzip.writestr(inzipinfo.filename, str(translated_soup))
                else:
                    outzip.writestr(inzipinfo.filename, data)

        return outzip_path

    def get_texts(self, file_path: str):
        """Return up to 4096 characters of text read from the slides.

        Args:
            file_path (str): path of the presentation

        Returns:
            str: at most 4096 characters of text (empty when there is no slide)
        """
        texts = ""

        with zipfile.ZipFile(file_path, "r") as inzip:
            for inzipinfo in inzip.infolist():
                if len(texts) > 4096:
                    break
                # Only slide members are needed; the others are not opened.
                if not self._SLIDE_PATTERN.match(inzipinfo.filename):
                    continue
                with inzip.open(inzipinfo) as infile:
                    soup = BeautifulSoup(infile.read(), 'xml')
                texts += self.itag_of_soup(soup).text()

        return texts[:4096]
@@ -0,0 +1,288 @@
1
+ import pymupdf as fitz
2
+ from typing import List
3
+ from argostranslate.translate import ITranslation
4
+ from argostranslatefiles.abstract_file import AbstractFile
5
+
6
+
7
class Pdf(AbstractFile):
    """PDF handler; the translation itself is delegated to PdfTranslator."""

    supported_file_extensions = ['.pdf']

    def translate(self, underlying_translation: ITranslation, file_path: str) -> str:
        """Write a translated copy of the PDF next to the original.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the PDF to translate

        Returns:
            str: path of the translated PDF
        """
        outfile_path = self.get_output_path(underlying_translation, file_path)

        translator = PdfTranslator(
            pdf_path=file_path,
            output_path=outfile_path,
            underlying_translation=underlying_translation
        )
        translator.translate_pdf()

        return outfile_path

    def get_texts(self, file_path: str):
        """Return up to 4096 characters of page text.

        Pages are read in order until at least 4096 characters have been
        collected; the non-empty page texts are joined with a space.

        Args:
            file_path (str): path of the PDF

        Returns:
            str: at most 4096 characters of text
        """
        doc = fitz.open(file_path)

        texts = []
        count = 0
        try:
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                text = page.get_text().strip()
                if text:
                    count += len(text)
                    texts.append(text)
                    if count >= 4096:
                        break
        finally:
            # Release the document even when a page cannot be read.
            doc.close()

        return " ".join(texts)[:4096]
40
+
41
+
42
+ # Roughly based on https://github.com/CBIhalsen/PolyglotPDF/blob/main/main.py
43
+ # which is GPLv3
44
class PdfTranslator:
    """Translate the text spans of a PDF and save the result as a new file.

    The work is done in four steps (see ``translate_pdf``): extract the
    spans of every page, translate them, replace the original text on the
    pages, save the document.

    Each extracted span is stored in ``pages_data[page_index]`` as a list:

        [0] original text
        [1] bounding box ``(x0, y0, x1, y1)``
        [2] translated text (None until translated)
        [3] rotation angle
        [4] colour as ``'#rrggbb'``
        [5] text indent
        [6] bold flag
        [7] font size
        [8] link information dict, or None
    """

    def __init__(self, pdf_path: str, output_path: str, underlying_translation: ITranslation):
        """Open ``pdf_path``; the translated copy will be written to ``output_path``."""
        self.pdf_path = pdf_path
        self.output_path = output_path
        self.underlying_translation = underlying_translation
        self.doc = fitz.open(pdf_path)
        self.pages_data = []

    def translate_pdf(self):
        """Run the whole pipeline: extract, translate, rewrite the pages, save."""
        self._extract_text_from_pages()
        self._translate_pages_data()
        self._apply_translations_to_pdf()
        self._save_translated_pdf()

    def _decimal_to_hex_color(self, decimal_color):
        """Format an integer colour as ``'#rrggbb'`` (zero padded to six hex digits)."""
        return f'#{decimal_color:06x}'

    def _is_math(self, text, page_num, font_info):
        """Placeholder: no span is treated as a formula yet, so this always returns False."""
        return False

    def _is_non_text(self, text):
        """Placeholder: no span is treated as non-text yet, so this always returns False."""
        return False

    def _extract_text_from_pages(self):
        """Collect the spans of every page into ``pages_data``."""
        # Kept separate from _extract_text_with_pymupdf so that another
        # extraction method (e.g. OCR) could be plugged in here later.
        for page_num in range(self.doc.page_count):
            self._extract_text_with_pymupdf(page_num)

    def _extract_text_with_pymupdf(self, page_num: int):
        """Append one entry per non-empty text span of page ``page_num`` to ``pages_data``."""
        while len(self.pages_data) <= page_num:
            self.pages_data.append([])

        page = self.doc.load_page(page_num)

        # Map each link rectangle of the page to the data needed to recreate the link.
        link_map = {}
        for link in page.get_links():
            rect = fitz.Rect(link["from"])
            link_map[rect] = {
                "uri": link.get("uri", ""),
                "page": link.get("page", -1),
                "to": link.get("to", None),
                "kind": link.get("kind", 0)
            }

        blocks = page.get_text("dict")["blocks"]

        for block in blocks:
            if "lines" not in block:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span.get("text", "").strip()
                    if not text or self._is_math(text, page_num, None) or self._is_non_text(text):
                        continue

                    bbox = span.get("bbox", (0, 0, 0, 0))
                    font_size = span.get("size", 12)
                    font_flags = span.get("flags", 0)
                    color = span.get("color", 0)
                    # Bit 4 of the span flags is used as the bold marker.
                    is_bold = bool(font_flags & 2**4)
                    span_rect = fitz.Rect(bbox)

                    # The first link rectangle intersecting the span is attached to it.
                    link_info = None
                    for link_rect, link_data in link_map.items():
                        if span_rect.intersects(link_rect):
                            link_info = link_data
                            break

                    self.pages_data[page_num].append([
                        text,
                        tuple(bbox),
                        None,  # Translation placeholder
                        0,  # Angle (rotation)
                        self._decimal_to_hex_color(color),
                        0,  # Text indent
                        is_bold,
                        font_size,
                        link_info  # Link information
                    ])

    def _translate_pages_data(self):
        """Fill the translation slot of every extracted span.

        A span whose translation raises keeps its original text; the spans
        already translated are not discarded.
        """
        for page_blocks in self.pages_data:
            for block in page_blocks:
                try:
                    block[2] = self.underlying_translation.translate(block[0])
                except Exception:
                    # Best effort: fall back to the original text for this span only.
                    block[2] = block[0]

    def _apply_translations_to_pdf(self):
        """Remove the original text of every page and insert the translated text."""
        for page_index, blocks in enumerate(self.pages_data):
            if not blocks:
                continue

            page = self.doc.load_page(page_index)

            normal_blocks = []
            bold_blocks = []

            for block in blocks:
                coords = block[1]
                translated_text = block[2] if block[2] is not None else block[0]

                # Calculate expansion factor based on text length ratio,
                # clamped to the range 1.01 .. 1.05.
                len_ratio = min(1.05, max(1.01, len(translated_text) / max(1, len(block[0]))))

                x0, y0, x1, y1 = coords
                width = x1 - x0
                height = y1 - y0

                # Expand horizontally to accommodate longer text
                x1 = x1 + (len_ratio - 1) * width

                # Reduce vertical coverage to be more precise
                vertical_margin = min(height * 0.1, 3)
                y0 = y0 + vertical_margin
                y1 = y1 - vertical_margin

                # Ensure minimum height
                if y1 - y0 < 10:
                    y_center = (coords[1] + coords[3]) / 2
                    y0 = y_center - 5
                    y1 = y_center + 5

                enlarged_coords = (x0, y0, x1, y1)
                rect = fitz.Rect(*enlarged_coords)

                # Remove the original text; if redaction fails, cover it with a white rectangle.
                try:
                    page.add_redact_annot(rect)
                    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)
                except Exception:
                    page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1))

                is_bold = len(block) > 6 and block[6]
                if is_bold:
                    bold_blocks.append((block, enlarged_coords))
                else:
                    normal_blocks.append((block, enlarged_coords))

            self._insert_styled_text_blocks(page, normal_blocks, is_bold=False)
            self._insert_styled_text_blocks(page, bold_blocks, is_bold=True)

    def _insert_styled_text_blocks(self, page, blocks: List, is_bold: bool):
        """Insert the translated text of ``blocks`` on ``page``.

        Args:
            page: page receiving the text
            blocks (List): ``(block, enlarged_coords)`` pairs built by ``_apply_translations_to_pdf``
            is_bold (bool): whether the text is rendered bold
        """
        # Function-scope import: the module does not import html at file level.
        import html

        if not blocks:
            return

        font_weight = "bold" if is_bold else "normal"

        for block, enlarged_coords in blocks:
            plain_text = block[2] if block[2] is not None else block[0]
            angle = block[3] if len(block) > 3 else 0
            color = block[4] if len(block) > 4 else '#000000'
            text_indent = block[5] if len(block) > 5 else 0
            font_size = block[7] if len(block) > 7 else 12
            link_info = block[8] if len(block) > 8 else None

            rect = fitz.Rect(*enlarged_coords)

            # Escape the text so that '<', '>' and '&' are rendered literally
            # instead of being parsed as markup.
            markup = html.escape(plain_text, quote=False)

            if link_info:
                if link_info.get("uri"):
                    href = html.escape(link_info["uri"], quote=True)
                    markup = f'<a href="{href}" style="color: {color}; text-decoration: underline;">{markup}</a>'
                elif link_info.get("page", -1) >= 0:
                    page_num = link_info["page"]
                    markup = f'<a href="#page{page_num}" style="color: {color}; text-decoration: underline;">{markup}</a>'

            css = f"""
            * {{
                color: {color};
                font-weight: {font_weight};
                font-size: {font_size}px;
                text-indent: {text_indent}pt;
                line-height: 1.2;
                word-wrap: break-word;
                overflow-wrap: break-word;
                width: 100%;
                box-sizing: border-box;
                margin: 0;
                padding: 0;
            }}
            a {{
                text-decoration: underline;
            }}
            """

            html_content = f'<div style="font-size: {font_size}px; color: {color}; font-weight: {font_weight}; text-indent: {text_indent}pt; line-height: 1.2; word-wrap: break-word;">{markup}</div>'

            try:
                page.insert_htmlbox(rect, html_content, css=css, rotate=angle)
            except Exception:
                # Fallback: plain text insertion, without any HTML markup.
                page.insert_text(rect.tl, plain_text, fontsize=font_size)

            if link_info:
                self._add_link_annotation(page, rect, link_info)

    def _add_link_annotation(self, page, rect, link_info):
        """Recreate the link described by ``link_info`` over ``rect`` (best effort)."""
        try:
            link_dict = {
                "kind": link_info.get("kind", 1),  # 1 = URI link, 2 = GoTo link
                "from": rect
            }

            if link_info.get("uri"):
                link_dict["uri"] = link_info["uri"]
                link_dict["kind"] = 1  # URI link
            elif link_info.get("page", -1) >= 0:
                link_dict["page"] = link_info["page"]
                link_dict["kind"] = 2
                if link_info.get("to"):
                    link_dict["to"] = link_info["to"]

            page.insert_link(link_dict)
        except Exception:
            # Deliberately ignored: a link that cannot be recreated must not
            # abort the translation of the document.
            pass

    def _save_translated_pdf(self):
        """Copy the modified document into a new PDF, save it to ``output_path`` and close both."""
        try:
            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(self.doc)
                new_doc.save(self.output_path, garbage=4, deflate=True)
            finally:
                new_doc.close()
        finally:
            # The source document is released even when saving fails.
            self.doc.close()
@@ -0,0 +1,25 @@
1
+ import pysrt
2
+ from argostranslate.translate import ITranslation
3
+ from argostranslatefiles.abstract_file import AbstractFile
4
+
5
+
6
class Srt(AbstractFile):
    """SubRip (.srt) subtitle handler built on pysrt."""

    supported_file_extensions = ['.srt']

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Translate every subtitle and save the result as UTF-8.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the subtitle file to translate

        Returns:
            str: path of the translated subtitle file
        """
        destination = self.get_output_path(underlying_translation, file_path)
        subtitles = pysrt.open(file_path)

        for subtitle in subtitles:
            subtitle.text = underlying_translation.translate(subtitle.text)

        subtitles.save(destination, encoding='utf-8')
        return destination

    def get_texts(self, file_path: str):
        """Return up to 4096 characters of subtitle text, one subtitle per line.

        Args:
            file_path (str): path of the subtitle file

        Returns:
            str: at most 4096 characters of text
        """
        subtitle_texts = [subtitle.text for subtitle in pysrt.open(file_path)]
        joined = "\n".join(subtitle_texts)
        return joined[:4096]
@@ -0,0 +1,24 @@
1
+ from argostranslate.translate import ITranslation
2
+
3
+ from argostranslatefiles.abstract_file import AbstractFile
4
+
5
+
6
class Txt(AbstractFile):
    """Plain text handler."""

    supported_file_extensions = ['.txt']

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Write a translated copy of the text file next to the original.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the text file to translate

        Returns:
            str: path of the translated file
        """
        outfile_path = self.get_output_path(underlying_translation, file_path)

        with open(file_path, "r") as infile:
            translated_text = underlying_translation.translate(infile.read())

        # The output is opened only once the translation succeeded, so a
        # failure does not leave an empty file behind.
        with open(outfile_path, "w") as outfile:
            outfile.write(translated_text)

        return outfile_path

    def get_texts(self, file_path: str):
        """Return up to the first 4096 characters of the file.

        Args:
            file_path (str): path of the text file

        Returns:
            str: at most 4096 characters of text
        """
        with open(file_path, "r") as infile:
            return infile.read(4096)
tests/__init__.py ADDED
File without changes
tests/test_init.py ADDED
@@ -0,0 +1,6 @@
1
+ import argostranslatefiles
2
+
3
+
4
def test_init():
    """Check that the package exposes at least one supported format handler."""
    supported_formats = argostranslatefiles.get_supported_formats()
    assert len(supported_formats) >= 1