PyPI - argos-translate-files-main - Versions diffs - 1.4.0__py3-none-any.whl - Mend

argos-translate-files-main 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

argos_translate_files_main-1.4.0.dist-info/METADATA +65 -0
argos_translate_files_main-1.4.0.dist-info/RECORD +23 -0
argos_translate_files_main-1.4.0.dist-info/WHEEL +5 -0
argos_translate_files_main-1.4.0.dist-info/licenses/LICENSE +21 -0
argos_translate_files_main-1.4.0.dist-info/top_level.txt +2 -0
argostranslatefiles/__init__.py +1 -0
argostranslatefiles/abstract_file.py +26 -0
argostranslatefiles/argostranslatefiles.py +59 -0
argostranslatefiles/formats/__init__.py +1 -0
argostranslatefiles/formats/abstract_xml.py +39 -0
argostranslatefiles/formats/epub.py +86 -0
argostranslatefiles/formats/html.py +43 -0
argostranslatefiles/formats/opendocument/__init__.py +0 -0
argostranslatefiles/formats/opendocument/odp.py +5 -0
argostranslatefiles/formats/opendocument/odt.py +52 -0
argostranslatefiles/formats/openxml/__init__.py +1 -0
argostranslatefiles/formats/openxml/docx.py +52 -0
argostranslatefiles/formats/openxml/pptx.py +53 -0
argostranslatefiles/formats/pdf.py +288 -0
argostranslatefiles/formats/srt.py +25 -0
argostranslatefiles/formats/txt.py +24 -0
tests/__init__.py +0 -0
tests/test_init.py +6 -0

argos_translate_files_main-1.4.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,65 @@
+Metadata-Version: 2.4
+Name: argos-translate-files-main
+Version: 1.4.0
+Summary: Translate files with Argos Translate
+Home-page: https://github.com/LibreTranslate/argos-translate-files
+Author: S. Thuret
+Author-email: contact@sebastien-thuret.fr
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: beautifulsoup4>=4.9.3
+Requires-Dist: lxml>=4.9.2
+Requires-Dist: argostranslate>=1.5.1
+Requires-Dist: translatehtml>=1.5.1
+Requires-Dist: pysrt>=1.1.2
+Requires-Dist: PyMuPDF>=1.24.11
+Dynamic: author
+Dynamic: author-email
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: summary
+# Argos Translate Files
+Translate files using [Argos Translate](https://github.com/argosopentech/argos-translate).
+## Supported file formats
+.txt, .odt, .odp, .docx, .pptx, .epub, .html, .srt, .pdf
+## Install
+```
+pip install argos-translate-files
+```
+## Example
+```python
+import os.path
+import argostranslate.package, argostranslate.translate
+import argostranslatefiles
+from argostranslatefiles import argostranslatefiles
+from_code = "fr"
+to_code = "en"
+installed_languages = argostranslate.translate.get_installed_languages()
+from_lang = list(filter(
+    lambda x: x.code == from_code,
+    installed_languages))[0]
+to_lang = list(filter(
+    lambda x: x.code == to_code,
+    installed_languages))[0]
+underlying_translation = from_lang.get_translation(to_lang)
+argostranslatefiles.translate_file(underlying_translation, os.path.abspath('path/to/file.txt'))
+```

argos_translate_files_main-1.4.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,23 @@
+argos_translate_files_main-1.4.0.dist-info/licenses/LICENSE,sha256=fF1KSXtjRXSX1_BlLo-wY7dRtPnzo1rFiwEKrQaY1-s,1072
+argostranslatefiles/__init__.py,sha256=T0ahsHLbHg2WfvkEQemFCuWSAGC_EwdiOF9eUKgFNKA,54
+argostranslatefiles/abstract_file.py,sha256=nk7Bz0IOwct4vZnSLJU9JLTvmvIKJq15NXmkv8QaOB4,839
+argostranslatefiles/argostranslatefiles.py,sha256=eA17P4Mfuln0hHUbxVkYxMzA79hy7uhs9s9vN_ej6BI,1563
+argostranslatefiles/formats/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+argostranslatefiles/formats/abstract_xml.py,sha256=CsFEzzaYNacHNYD86UZxbNsQTpVaFlaG0YrKVjTRvgw,1340
+argostranslatefiles/formats/epub.py,sha256=aKSCs8B7Nr9SkH0D5PKGn2B2haoqMnoOv7XncTLYHn8,3226
+argostranslatefiles/formats/html.py,sha256=cj55uvhZfqXFvRTB_DyovTaHWzRs12RnYxE_euXZEHQ,1159
+argostranslatefiles/formats/pdf.py,sha256=g9fRs89EKdod1NAeAolfYDdG9A_fJTcJqG16CH7E4SE,11053
+argostranslatefiles/formats/srt.py,sha256=IjurpRfZvL0mK-ZkJjaYmvuYXnknFVmV3x4-XEqPPJc,720
+argostranslatefiles/formats/txt.py,sha256=zRAZGxkzt5JWoHy-ZqZUlu9NvbND9aK2qivo3MiP2W0,698
+argostranslatefiles/formats/opendocument/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+argostranslatefiles/formats/opendocument/odp.py,sha256=EmkbAxWmBMxGajrp7Zq0QItnrGQlynlKeolNP7kedHU,120
+argostranslatefiles/formats/opendocument/odt.py,sha256=qklQHc7Vy00KL5B-0kce1rhoNYiqeyDVB7l6thp2V_o,1690
+argostranslatefiles/formats/openxml/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+argostranslatefiles/formats/openxml/docx.py,sha256=xqDWTL4QZVqy4dBOzgY42zc9ej8MUjj4Z3UNKFDsMxY,1704
+argostranslatefiles/formats/openxml/pptx.py,sha256=l3neBU_wWoCEYfm6gph0EaW0PsptiL27WfdFCatDDtw,1756
+tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/test_init.py,sha256=VmOIM9Zw5AAHZAXC3b97STO2MF5VUST-sZQMuP9QPYE,164
+argos_translate_files_main-1.4.0.dist-info/METADATA,sha256=_b4V8c1sHgxo9jAYRBoTnr9ULg1yn8aQ55hvB42g3wQ,1546
+argos_translate_files_main-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+argos_translate_files_main-1.4.0.dist-info/top_level.txt,sha256=H2-Nav_Fg6dGaIChlLbDv4Bf8mZ-pLh3iSlKs7MDatQ,26
+argos_translate_files_main-1.4.0.dist-info/RECORD,,

argos_translate_files_main-1.4.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (80.9.0)
+Root-Is-Purelib: true
+Tag: py3-none-any

argos_translate_files_main-1.4.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2021 Argos Open Tech
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

argos_translate_files_main-1.4.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ argostranslatefiles
2	+ tests

argostranslatefiles/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from argostranslatefiles.argostranslatefiles import *

argostranslatefiles/abstract_file.py ADDED Viewed

@@ -0,0 +1,26 @@
+import abc
+import os.path
+from argostranslate.translate import ITranslation
+class AbstractFile():
+    """Base class of every translatable file format.
+    Subclasses list the extensions they handle in ``supported_file_extensions``
+    and implement ``translate`` and ``get_texts``.
+    """
+    # Extensions handled by the format, each with its leading dot (e.g. '.txt').
+    supported_file_extensions = []
+    def support(self, file_path: str):
+        """Tell whether this format handles ``file_path``.
+        Args:
+            file_path (str): path whose extension is checked
+        Returns:
+            bool: True when the extension, compared as written or lowercased,
+            is listed in ``supported_file_extensions``
+        """
+        file_ext = os.path.splitext(file_path)[1]
+        extensions = self.supported_file_extensions
+        # The lowercase check lets 'REPORT.TXT' match '.txt'; the exact check
+        # is kept first so subclasses declaring mixed-case extensions still work.
+        return file_ext in extensions or file_ext.lower() in extensions
+    def get_output_path(self, underlying_translation: ITranslation, file_path: str):
+        """Build the path of the translated file, next to the source file.
+        Args:
+            underlying_translation (argostranslate.translate.ITranslation): translation whose target language code is appended to the file name
+            file_path (str): path of the source file
+        Returns:
+            str: ``<dir>/<name>_<to_code><ext>``
+        """
+        dir_path = os.path.dirname(file_path)
+        file_name, file_ext = os.path.splitext(os.path.basename(file_path))
+        to_code = underlying_translation.to_lang.code
+        # os.path.join keeps a bare file name relative; concatenating "/" by
+        # hand turned 'file.txt' into '/file_en.txt' in the filesystem root.
+        return os.path.join(dir_path, file_name + '_' + to_code + file_ext)
+    @abc.abstractmethod
+    def translate(self, underlying_translation: ITranslation, file_path: str):
+        """Translate ``file_path``; must be implemented by each format."""
+        raise NotImplementedError
+    @abc.abstractmethod
+    def get_texts(self, file_path: str):
+        """Extract text from ``file_path``; must be implemented by each format."""
+        raise NotImplementedError

argostranslatefiles/argostranslatefiles.py ADDED Viewed

@@ -0,0 +1,59 @@
+from argostranslate.translate import ITranslation
+from argostranslatefiles.formats.html import Html
+from argostranslatefiles.formats.opendocument.odp import Odp
+from argostranslatefiles.formats.opendocument.odt import Odt
+from argostranslatefiles.formats.openxml.docx import Docx
+from argostranslatefiles.formats.openxml.pptx import Pptx
+from argostranslatefiles.formats.txt import Txt
+from argostranslatefiles.formats.epub import Epub
+from argostranslatefiles.formats.srt import Srt
+from argostranslatefiles.formats.pdf import Pdf
+def get_supported_formats():
+    """Return a new handler instance for every supported file format.
+    Returns:
+        list: handlers in the order Txt, Odt, Odp, Docx, Pptx, Epub, Html, Srt, Pdf
+    """
+    format_classes = (Txt, Odt, Odp, Docx, Pptx, Epub, Html, Srt, Pdf)
+    return [format_class() for format_class in format_classes]
+def translate_file(underlying_translation: ITranslation, file_path: str):
+    """Translate a file with the first format handler that accepts it.
+    Args:
+        underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
+        file_path (str): file path
+    Returns:
+        Whatever the selected handler's ``translate`` returns, or False when
+        no handler supports the file.
+    """
+    handler = next(
+        (candidate for candidate in get_supported_formats() if candidate.support(file_path)),
+        None,
+    )
+    if handler is None:
+        return False
+    return handler.translate(underlying_translation, file_path)
+def get_texts(file_path: str):
+    """Extract text from a file with the first format handler that accepts it.
+    Args:
+        file_path (str): file path
+    Returns:
+        Whatever the selected handler's ``get_texts`` returns, or False when
+        no handler supports the file.
+    """
+    handler = next(
+        (candidate for candidate in get_supported_formats() if candidate.support(file_path)),
+        None,
+    )
+    if handler is None:
+        return False
+    return handler.get_texts(file_path)

argostranslatefiles/formats/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+

argostranslatefiles/formats/abstract_xml.py ADDED Viewed

@@ -0,0 +1,39 @@
+import argostranslate
+import bs4
+from argostranslate.tags import Tag, ITag
+from argostranslatefiles.abstract_file import AbstractFile
+class AbstractXml(AbstractFile):
+    """Shared BeautifulSoup <-> Argos Translate tag-tree conversion for XML formats."""
+    def is_translatable(self, soup):
+        """Return True when the node contains any text."""
+        return soup.text != ""
+    def itag_of_soup(self, soup):
+        """Returns an argostranslate.tags.ITag tree from a BeautifulSoup object.
+        Args:
+            soup (bs4.element.NavigableString or bs4.element.Tag): Beautiful Soup object
+        Returns:
+            str for a NavigableString, otherwise an argostranslate.tags.Tag
+            whose ``soup`` attribute references the originating node
+        """
+        if isinstance(soup, bs4.element.NavigableString):
+            return str(soup)
+        translatable = self.is_translatable(soup)
+        to_return = Tag([self.itag_of_soup(content) for content in soup.contents], translatable)
+        # Remember the source node so soup_of_itag can rebuild the document in place.
+        to_return.soup = soup
+        return to_return
+    def soup_of_itag(self, itag: ITag):
+        """Returns a BeautifulSoup object from an Argos Translate ITag.
+        Args:
+            itag (argostranslate.tags.ITag): ITag object to convert to Soup
+        Returns:
+            bs4.element.NavigableString for a string, otherwise the node stored
+            on ``itag.soup`` with its contents replaced by the converted children
+        """
+        # isinstance (rather than an exact type comparison) also accepts str
+        # subclasses, which would otherwise fail on the ``.soup`` lookup below.
+        if isinstance(itag, str):
+            return bs4.element.NavigableString(itag)
+        soup = itag.soup
+        soup.contents = [self.soup_of_itag(child) for child in itag.children]
+        return soup

argostranslatefiles/formats/epub.py ADDED Viewed

@@ -0,0 +1,86 @@
+import re
+import zipfile
+import translatehtml
+from argostranslate.tags import translate_tags
+from argostranslate.translate import ITranslation
+from bs4 import BeautifulSoup
+from argostranslatefiles.formats.abstract_xml import AbstractXml
+class Epub(AbstractXml):
+    """Translate EPUB archives: package/navigation XML and (X)HTML content documents."""
+    supported_file_extensions = ['.epub']
+    # Archive members translated as XML tag trees.
+    translatable_xml_filenames = ["OPS/content.opf", "OPS/toc.ncx", "OEBPS/content.opf", "OEBPS/toc.ncx"]
+    # Prolog removed before handing HTML to translatehtml and restored afterwards.
+    html_head = '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'
+    def is_translatable(self, soup):
+        """Return True when the node contains any text."""
+        return soup.text != ""
+    def translate(self, underlying_translation: ITranslation, file_path: str):
+        """Write a translated copy of the EPUB and return its path.
+        Args:
+            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
+            file_path (str): path of the source .epub
+        Returns:
+            str: path of the translated archive (see ``get_output_path``)
+        """
+        outzip_path = self.get_output_path(underlying_translation, file_path)
+        # Context managers close both archives even when translation raises.
+        with zipfile.ZipFile(file_path, "r") as inzip, zipfile.ZipFile(outzip_path, "w") as outzip:
+            for inzipinfo in inzip.infolist():
+                data = inzip.read(inzipinfo)
+                if inzipinfo.filename in self.translatable_xml_filenames:
+                    soup = BeautifulSoup(data, 'xml')
+                    itag = self.itag_of_soup(soup)
+                    translated_tag = translate_tags(underlying_translation, itag)
+                    translated_soup = self.soup_of_itag(translated_tag)
+                    outzip.writestr(inzipinfo.filename, str(translated_soup))
+                elif inzipinfo.filename.endswith(('.html', '.xhtml')):
+                    content = str(data, 'utf-8')
+                    head_present = content.startswith(self.html_head)
+                    if head_present:
+                        content = content[len(self.html_head):]
+                    translated = str(translatehtml.translate_html(underlying_translation, content))
+                    if head_present:
+                        translated = self.html_head + translated
+                    outzip.writestr(inzipinfo.filename, translated)
+                else:
+                    # Every other member is copied unchanged.
+                    outzip.writestr(inzipinfo.filename, data)
+        return outzip_path
+    def get_texts(self, file_path: str):
+        """Return up to 4096 characters of text gathered from the archive.
+        Args:
+            file_path (str): path of the .epub
+        Returns:
+            str: at most 4096 characters
+        """
+        texts = ""
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+                data = inzip.read(inzipinfo)
+                if inzipinfo.filename in self.translatable_xml_filenames:
+                    soup = BeautifulSoup(data, 'xml')
+                    texts += self.itag_of_soup(soup).text()
+                elif inzipinfo.filename.endswith(('.html', '.xhtml')):
+                    content = str(data, 'utf-8')
+                    if content.startswith(self.html_head):
+                        content = content[len(self.html_head):]
+                    texts += content
+                else:
+                    try:
+                        texts += data.decode()
+                    except UnicodeDecodeError:
+                        # Binary members (images, fonts, ...) cannot be decoded;
+                        # skip them instead of aborting the whole extraction.
+                        continue
+        return texts[:4096]

argostranslatefiles/formats/html.py ADDED Viewed

@@ -0,0 +1,43 @@
+import translatehtml
+from argostranslate.translate import ITranslation
+from argostranslatefiles.abstract_file import AbstractFile
+from bs4 import BeautifulSoup
+class Html(AbstractFile):
+    """Translate standalone HTML files."""
+    supported_file_extensions = ['.html']
+    def translate(self, underlying_translation: ITranslation, file_path: str):
+        """Write a translated copy of the HTML file and return its path.
+        Args:
+            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
+            file_path (str): path of the source .html
+        Returns:
+            str: path of the translated file (see ``get_output_path``)
+        """
+        outfile_path = self.get_output_path(underlying_translation, file_path)
+        # ``with`` closes the handle even if reading fails.
+        with open(file_path, "r") as infile:
+            content = infile.read()
+        # The doctype is removed before translation and put back afterwards.
+        head = '<!DOCTYPE html>'
+        head_present = content.startswith(head)
+        if head_present:
+            content = content[len(head):]
+        translated = str(translatehtml.translate_html(underlying_translation, content))
+        if head_present:
+            translated = str(head) + translated
+        # Opened only once the translation exists, so a failure leaves no empty output file.
+        with open(outfile_path, "w") as outfile:
+            outfile.write(translated)
+        return outfile_path
+    def get_texts(self, file_path: str):
+        """Return up to 4096 characters of the document's text.
+        Args:
+            file_path (str): path of the .html
+        Returns:
+            str: at most 4096 characters
+        """
+        with open(file_path, "r") as infile:
+            content = infile.read()
+        soup = BeautifulSoup(content, "html.parser")
+        return translatehtml.itag_of_soup(soup).text()[0:4096]

argostranslatefiles/formats/opendocument/__init__.py ADDED Viewed

File without changes

argostranslatefiles/formats/opendocument/odp.py ADDED Viewed

@@ -0,0 +1,5 @@
+from argostranslatefiles.formats.opendocument.odt import Odt
+class Odp(Odt):
+    """OpenDocument presentations; all behaviour is inherited from ``Odt``."""
+    supported_file_extensions = [".odp"]

argostranslatefiles/formats/opendocument/odt.py ADDED Viewed

@@ -0,0 +1,52 @@
+import zipfile
+from argostranslate.tags import translate_tags
+from argostranslate.translate import ITranslation
+from bs4 import BeautifulSoup
+from argostranslatefiles.formats.abstract_xml import AbstractXml
+class Odt(AbstractXml):
+    """Translate OpenDocument text files by rewriting ``content.xml`` inside the archive."""
+    supported_file_extensions = ['.odt']
+    def translate(self, underlying_translation: ITranslation, file_path: str):
+        """Write a translated copy of the document and return its path.
+        Args:
+            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
+            file_path (str): path of the source document
+        Returns:
+            str: path of the translated archive (see ``get_output_path``)
+        """
+        outzip_path = self.get_output_path(underlying_translation, file_path)
+        # Context managers close both archives even when translation raises.
+        with zipfile.ZipFile(file_path, "r") as inzip, zipfile.ZipFile(outzip_path, "w") as outzip:
+            for inzipinfo in inzip.infolist():
+                data = inzip.read(inzipinfo)
+                if inzipinfo.filename == "content.xml":
+                    soup = BeautifulSoup(data, 'xml')
+                    itag = self.itag_of_soup(soup)
+                    translated_tag = translate_tags(underlying_translation, itag)
+                    translated_soup = self.soup_of_itag(translated_tag)
+                    outzip.writestr(inzipinfo.filename, str(translated_soup))
+                else:
+                    # Every other member is copied unchanged.
+                    outzip.writestr(inzipinfo.filename, data)
+        return outzip_path
+    def get_texts(self, file_path: str):
+        """Return up to 4096 characters of text from ``content.xml``.
+        Args:
+            file_path (str): path of the document
+        Returns:
+            str: at most 4096 characters
+        """
+        texts = ""
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+                # Only the content document is opened; other members are never read.
+                if inzipinfo.filename != "content.xml":
+                    continue
+                soup = BeautifulSoup(inzip.read(inzipinfo), 'xml')
+                texts += self.itag_of_soup(soup).text()
+        return texts[:4096]

argostranslatefiles/formats/openxml/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+

argostranslatefiles/formats/openxml/docx.py ADDED Viewed

@@ -0,0 +1,52 @@
+import zipfile
+from argostranslate.tags import translate_tags
+from argostranslate.translate import ITranslation
+from bs4 import BeautifulSoup
+from argostranslatefiles.formats.abstract_xml import AbstractXml
+class Docx(AbstractXml):
+    """Translate Word documents by rewriting ``word/document.xml`` inside the archive."""
+    supported_file_extensions = ['.docx']
+    def translate(self, underlying_translation: ITranslation, file_path: str):
+        """Write a translated copy of the document and return its path.
+        Args:
+            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
+            file_path (str): path of the source .docx
+        Returns:
+            str: path of the translated archive (see ``get_output_path``)
+        """
+        outzip_path = self.get_output_path(underlying_translation, file_path)
+        # Context managers close both archives even when translation raises.
+        with zipfile.ZipFile(file_path, "r") as inzip, zipfile.ZipFile(outzip_path, "w") as outzip:
+            for inzipinfo in inzip.infolist():
+                data = inzip.read(inzipinfo)
+                if inzipinfo.filename == "word/document.xml":
+                    soup = BeautifulSoup(data, 'xml')
+                    itag = self.itag_of_soup(soup)
+                    translated_tag = translate_tags(underlying_translation, itag)
+                    translated_soup = self.soup_of_itag(translated_tag)
+                    outzip.writestr(inzipinfo.filename, str(translated_soup))
+                else:
+                    # Every other member is copied unchanged.
+                    outzip.writestr(inzipinfo.filename, data)
+        return outzip_path
+    def get_texts(self, file_path: str):
+        """Return up to 4096 characters of text from ``word/document.xml``.
+        Args:
+            file_path (str): path of the .docx
+        Returns:
+            str: at most 4096 characters
+        """
+        texts = ""
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+                # Only the main document part is opened; other members are never read.
+                if inzipinfo.filename != "word/document.xml":
+                    continue
+                soup = BeautifulSoup(inzip.read(inzipinfo), 'xml')
+                texts += self.itag_of_soup(soup).text()
+        return texts[:4096]

argostranslatefiles/formats/openxml/pptx.py ADDED Viewed

@@ -0,0 +1,53 @@
+import re
+import zipfile
+from argostranslate.tags import translate_tags
+from argostranslate.translate import ITranslation
+from bs4 import BeautifulSoup
+from argostranslatefiles.formats.abstract_xml import AbstractXml
+class Pptx(AbstractXml):
+    """Translate PowerPoint files by rewriting the slide XML parts inside the archive."""
+    supported_file_extensions = ['.pptx']
+    # Compiled once; ``match`` anchors at the start of the member name only,
+    # exactly like the ``re.match`` call it replaces.
+    slide_pattern = re.compile(r"ppt\/slides\/slide[0-9]*\.xml")
+    def translate(self, underlying_translation: ITranslation, file_path: str):
+        """Write a translated copy of the presentation and return its path.
+        Args:
+            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
+            file_path (str): path of the source .pptx
+        Returns:
+            str: path of the translated archive (see ``get_output_path``)
+        """
+        outzip_path = self.get_output_path(underlying_translation, file_path)
+        # Context managers close both archives even when translation raises.
+        with zipfile.ZipFile(file_path, "r") as inzip, zipfile.ZipFile(outzip_path, "w") as outzip:
+            for inzipinfo in inzip.infolist():
+                data = inzip.read(inzipinfo)
+                if self.slide_pattern.match(inzipinfo.filename):
+                    soup = BeautifulSoup(data, 'xml')
+                    itag = self.itag_of_soup(soup)
+                    translated_tag = translate_tags(underlying_translation, itag)
+                    translated_soup = self.soup_of_itag(translated_tag)
+                    outzip.writestr(inzipinfo.filename, str(translated_soup))
+                else:
+                    # Every other member is copied unchanged.
+                    outzip.writestr(inzipinfo.filename, data)
+        return outzip_path
+    def get_texts(self, file_path: str):
+        """Return up to 4096 characters of text gathered from the slides.
+        Args:
+            file_path (str): path of the .pptx
+        Returns:
+            str: at most 4096 characters
+        """
+        texts = ""
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+                # Only slide parts are opened; other members are never read.
+                if not self.slide_pattern.match(inzipinfo.filename):
+                    continue
+                soup = BeautifulSoup(inzip.read(inzipinfo), 'xml')
+                texts += self.itag_of_soup(soup).text()
+        return texts[:4096]

argostranslatefiles/formats/pdf.py ADDED Viewed

@@ -0,0 +1,288 @@
+import pymupdf as fitz
+from typing import List
+from argostranslate.translate import ITranslation
+from argostranslatefiles.abstract_file import AbstractFile
+class Pdf(AbstractFile):
+    """Translate PDF files; the page rewriting is delegated to ``PdfTranslator``."""
+    supported_file_extensions = ['.pdf']
+    def translate(self, underlying_translation: ITranslation, file_path: str) -> str:
+        """Write a translated copy of the PDF and return its path.
+        Args:
+            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
+            file_path (str): path of the source .pdf
+        Returns:
+            str: path of the translated file (see ``get_output_path``)
+        """
+        outfile_path = self.get_output_path(underlying_translation, file_path)
+        translator = PdfTranslator(
+            pdf_path=file_path,
+            output_path=outfile_path,
+            underlying_translation=underlying_translation
+        )
+        translator.translate_pdf()
+        return outfile_path
+    def get_texts(self, file_path: str):
+        """Return up to 4096 characters of page text, pages joined by a space.
+        Args:
+            file_path (str): path of the .pdf
+        Returns:
+            str: at most 4096 characters
+        """
+        doc = fitz.open(file_path)
+        texts = []
+        count = 0
+        try:
+            for page_num in range(doc.page_count):
+                page = doc.load_page(page_num)
+                text = page.get_text().strip()
+                if text:
+                    count += len(text)
+                    texts.append(text)
+                    # Stop loading pages once enough characters were collected.
+                    if count >= 4096:
+                        break
+        finally:
+            # Release the document even when a page cannot be read.
+            doc.close()
+        return " ".join(texts)[:4096]
+# Roughly based on https://github.com/CBIhalsen/PolyglotPDF/blob/main/main.py
+# which is GPLv3
+class PdfTranslator:
+    """Extract the text spans of a PDF, translate them and write them back in place.
+    Each extracted span is stored in ``pages_data[page]`` as a list:
+    [text, bbox, translation, angle, hex colour, text indent, is_bold, font size, link info].
+    """
+    def __init__(self, pdf_path: str, output_path: str, underlying_translation: ITranslation):
+        """Open ``pdf_path`` and remember where the translated copy is written.
+        Args:
+            pdf_path (str): source PDF
+            output_path (str): destination of the translated PDF
+            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
+        """
+        self.pdf_path = pdf_path
+        self.output_path = output_path
+        self.underlying_translation = underlying_translation
+        self.doc = fitz.open(pdf_path)
+        self.pages_data = []
+    def translate_pdf(self):
+        """Run the pipeline: extract, translate, redraw, save."""
+        self._extract_text_from_pages()
+        self._translate_pages_data()
+        self._apply_translations_to_pdf()
+        self._save_translated_pdf()
+    def _decimal_to_hex_color(self, decimal_color):
+        """Format an integer colour as a '#' followed by at least six hex digits."""
+        if decimal_color == 0:
+            return '#000000'
+        hex_color = hex(decimal_color)[2:]
+        hex_color = hex_color.zfill(6)
+        return f'#{hex_color}'
+    def _is_math(self, text, page_num, font_info):
+        # Placeholder: always False. Kept (like _is_non_text) so a real
+        # detector can be dropped in later without touching the callers.
+        return False
+    def _is_non_text(self, text):
+        # Placeholder: always False (see _is_math).
+        return False
+    def _extract_text_from_pages(self):
+        """Fill ``pages_data`` for every page of the document."""
+        # Kept separate from _extract_text_with_pymupdf so that another
+        # extraction backend (e.g. OCR) could be selected here later.
+        page_count = self.doc.page_count
+        for page_num in range(page_count):
+            self._extract_text_with_pymupdf(page_num)
+    def _extract_text_with_pymupdf(self, page_num: int):
+        """Append one entry to ``pages_data[page_num]`` per non-empty text span of the page."""
+        # Make sure a list exists for this page index.
+        while len(self.pages_data) <= page_num:
+            self.pages_data.append([])
+        page = self.doc.load_page(page_num)
+        links = page.get_links()
+        # Link rectangle -> link description, used to re-attach links to spans.
+        link_map = {}
+        for link in links:
+            rect = fitz.Rect(link["from"])
+            link_map[rect] = {
+                "uri": link.get("uri", ""),
+                "page": link.get("page", -1),
+                "to": link.get("to", None),
+                "kind": link.get("kind", 0)
+            }
+        blocks = page.get_text("dict")["blocks"]
+        for block in blocks:
+            if "lines" in block:
+                for line in block["lines"]:
+                    for span in line["spans"]:
+                        text = span.get("text", "").strip()
+                        if text and not self._is_math(text, page_num, None) and not self._is_non_text(text):
+                            bbox = span.get("bbox", (0, 0, 0, 0))
+                            font_size = span.get("size", 12)
+                            font_flags = span.get("flags", 0)
+                            color = span.get("color", 0)
+                            is_bold = bool(font_flags & 2**4)
+                            span_rect = fitz.Rect(bbox)
+                            # The first link rectangle intersecting the span wins.
+                            link_info = None
+                            for link_rect, link_data in link_map.items():
+                                if span_rect.intersects(link_rect):
+                                    link_info = link_data
+                                    break
+                            self.pages_data[page_num].append([
+                                text,
+                                tuple(bbox),
+                                None,  # Translation placeholder
+                                0,     # Angle (rotation)
+                                self._decimal_to_hex_color(color),
+                                0,     # Text indent
+                                is_bold,
+                                font_size,
+                                link_info  # Link information
+                            ])
+    def _translate_pages_data(self):
+        """Store the translation of every extracted span in slot 2 of its entry."""
+        for page_blocks in self.pages_data:
+            for block in page_blocks:
+                original_text = block[0]
+                try:
+                    block[2] = self.underlying_translation.translate(original_text)
+                except Exception:
+                    # Best effort: the errors a translation backend may raise are
+                    # not known here, so only the failing span falls back to its
+                    # original text; spans already translated are kept.
+                    block[2] = original_text
+    def _apply_translations_to_pdf(self):
+        """Blank out each original span and draw its translation in the same area."""
+        for page_index, blocks in enumerate(self.pages_data):
+            if not blocks:
+                continue
+            page = self.doc.load_page(page_index)
+            normal_blocks = []
+            bold_blocks = []
+            for block in blocks:
+                coords = block[1]
+                translated_text = block[2] if block[2] is not None else block[0]
+                # Calculate expansion factor based on text length ratio (clamped to 1.01..1.05)
+                len_ratio = min(1.05, max(1.01, len(translated_text) / max(1, len(block[0]))))
+                x0, y0, x1, y1 = coords
+                width = x1 - x0
+                height = y1 - y0
+                # Expand horizontally to accommodate longer text
+                h_expand = (len_ratio - 1) * width
+                x1 = x1 + h_expand
+                # Reduce vertical coverage to be more precise
+                vertical_margin = min(height * 0.1, 3)
+                y0 = y0 + vertical_margin
+                y1 = y1 - vertical_margin
+                # Ensure minimum height
+                if y1 - y0 < 10:
+                    y_center = (coords[1] + coords[3]) / 2
+                    y0 = y_center - 5
+                    y1 = y_center + 5
+                enlarged_coords = (x0, y0, x1, y1)
+                rect = fitz.Rect(*enlarged_coords)
+                # Remove the original text; fall back to painting a white rectangle over it
+                try:
+                    page.add_redact_annot(rect)
+                    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)
+                except Exception:
+                    page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1))
+                is_bold = len(block) > 6 and block[6]
+                if is_bold:
+                    bold_blocks.append((block, enlarged_coords))
+                else:
+                    normal_blocks.append((block, enlarged_coords))
+            self._insert_styled_text_blocks(page, normal_blocks, is_bold=False)
+            self._insert_styled_text_blocks(page, bold_blocks, is_bold=True)
+    def _insert_styled_text_blocks(self, page, blocks: List, is_bold: bool):
+        """Draw the translated text of ``blocks`` on ``page`` as styled HTML boxes.
+        Args:
+            page: page to draw on
+            blocks (List): ``(entry, rect coordinates)`` pairs built by ``_apply_translations_to_pdf``
+            is_bold (bool): whether the whole batch is rendered bold
+        """
+        # Local import: only this method needs it, to escape text placed in markup.
+        import html
+        if not blocks:
+            return
+        font_weight = "bold" if is_bold else "normal"
+        for block_data in blocks:
+            block, enlarged_coords = block_data
+            plain_text = block[2] if block[2] is not None else block[0]
+            angle = block[3] if len(block) > 3 else 0
+            color = block[4] if len(block) > 4 else '#000000'
+            text_indent = block[5] if len(block) > 5 else 0
+            font_size = block[7] if len(block) > 7 else 12
+            link_info = block[8] if len(block) > 8 else None
+            rect = fitz.Rect(*enlarged_coords)
+            # Escape the text so characters such as "<" or "&" are rendered
+            # literally instead of being parsed as markup.
+            translated_text = html.escape(plain_text, quote=False)
+            if link_info:
+                if link_info.get("uri"):
+                    # Quotes are escaped too because the URI sits inside an attribute.
+                    href = html.escape(link_info["uri"], quote=True)
+                    translated_text = f'<a href="{href}" style="color: {color}; text-decoration: underline;">{translated_text}</a>'
+                elif link_info.get("page", -1) >= 0:
+                    page_num = link_info["page"]
+                    translated_text = f'<a href="#page{page_num}" style="color: {color}; text-decoration: underline;">{translated_text}</a>'
+            css = f"""
+            * {{
+                color: {color};
+                font-weight: {font_weight};
+                font-size: {font_size}px;
+                text-indent: {text_indent}pt;
+                line-height: 1.2;
+                word-wrap: break-word;
+                overflow-wrap: break-word;
+                width: 100%;
+                box-sizing: border-box;
+                margin: 0;
+                padding: 0;
+            }}
+            a {{
+                text-decoration: underline;
+            }}
+            """
+            html_content = f'<div style="font-size: {font_size}px; color: {color}; font-weight: {font_weight}; text-indent: {text_indent}pt; line-height: 1.2; word-wrap: break-word;">{translated_text}</div>'
+            try:
+                page.insert_htmlbox(rect, html_content, css=css, rotate=angle)
+                if link_info:
+                    self._add_link_annotation(page, rect, link_info)
+            except Exception:
+                # Plain-text fallback: use the unescaped text without the <a>
+                # wrapper, which would otherwise be printed literally.
+                page.insert_text(rect.tl, plain_text, fontsize=font_size)
+                if link_info:
+                    self._add_link_annotation(page, rect, link_info)
+    def _add_link_annotation(self, page, rect, link_info):
+        """Re-create a link over ``rect``; failures are ignored on purpose (best effort)."""
+        try:
+            link_dict = {
+                "kind": link_info.get("kind", 1),  # 1 = URI link, 2 = GoTo link
+                "from": rect
+            }
+            if link_info.get("uri"):
+                link_dict["uri"] = link_info["uri"]
+                link_dict["kind"] = 1  # URI link
+            elif link_info.get("page", -1) >= 0:
+                link_dict["page"] = link_info["page"]
+                link_dict["kind"] = 2
+                if link_info.get("to"):
+                    link_dict["to"] = link_info["to"]
+            page.insert_link(link_dict)
+        except Exception:
+            # A link that cannot be restored must not abort the translation.
+            pass
+    def _save_translated_pdf(self):
+        """Copy the modified pages into a new document, save it and close both documents."""
+        new_doc = fitz.open()
+        try:
+            new_doc.insert_pdf(self.doc)
+            new_doc.save(self.output_path, garbage=4, deflate=True)
+        finally:
+            # Close both documents even when copying or saving fails.
+            new_doc.close()
+            self.doc.close()

argostranslatefiles/formats/srt.py ADDED Viewed

@@ -0,0 +1,25 @@
+import pysrt
+from argostranslate.translate import ITranslation
+from argostranslatefiles.abstract_file import AbstractFile
+class Srt(AbstractFile):
+    """Translate SubRip subtitle files one subtitle at a time."""
+    supported_file_extensions = ['.srt']
+    def translate(self, underlying_translation: ITranslation, file_path: str):
+        """Translate the text of every subtitle and save the result as UTF-8.
+        Returns:
+            str: path of the translated file (see ``get_output_path``)
+        """
+        target_path = self.get_output_path(underlying_translation, file_path)
+        subtitles = pysrt.open(file_path)
+        for cue in subtitles:
+            translated = underlying_translation.translate(cue.text)
+            cue.text = translated
+        subtitles.save(target_path, encoding='utf-8')
+        return target_path
+    def get_texts(self, file_path: str):
+        """Return up to 4096 characters of subtitle text, one subtitle per line."""
+        cue_texts = [cue.text for cue in pysrt.open(file_path)]
+        joined = "\n".join(cue_texts)
+        return joined[:4096]

argostranslatefiles/formats/txt.py ADDED Viewed

@@ -0,0 +1,24 @@
+from argostranslate.translate import ITranslation
+from argostranslatefiles.abstract_file import AbstractFile
+class Txt(AbstractFile):
+    """Translate plain-text files."""
+    supported_file_extensions = ['.txt']
+    def translate(self, underlying_translation: ITranslation, file_path: str):
+        """Write a translated copy of the text file and return its path.
+        Args:
+            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
+            file_path (str): path of the source .txt
+        Returns:
+            str: path of the translated file (see ``get_output_path``)
+        """
+        outfile_path = self.get_output_path(underlying_translation, file_path)
+        # ``with`` closes the handle even if reading fails.
+        with open(file_path, "r") as infile:
+            source_text = infile.read()
+        translated_text = underlying_translation.translate(source_text)
+        # Opened only once the translation exists, so a failure leaves no empty output file.
+        with open(outfile_path, "w") as outfile:
+            outfile.write(translated_text)
+        return outfile_path
+    def get_texts(self, file_path: str):
+        """Return the first 4096 characters of the file."""
+        with open(file_path, "r") as infile:
+            return infile.read(4096)

tests/__init__.py ADDED Viewed

File without changes

tests/test_init.py ADDED Viewed

@@ -0,0 +1,6 @@
+import argostranslatefiles
+def test_init():
+    """Check that the package exposes at least one file format handler."""
+    supported = argostranslatefiles.get_supported_formats()
+    assert len(supported) >= 1