argos-translate-files-main 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- argos_translate_files_main-1.4.1.dist-info/METADATA +65 -0
- argos_translate_files_main-1.4.1.dist-info/RECORD +23 -0
- argos_translate_files_main-1.4.1.dist-info/WHEEL +5 -0
- argos_translate_files_main-1.4.1.dist-info/licenses/LICENSE +661 -0
- argos_translate_files_main-1.4.1.dist-info/top_level.txt +2 -0
- argostranslatefiles/__init__.py +1 -0
- argostranslatefiles/abstract_file.py +26 -0
- argostranslatefiles/argostranslatefiles.py +59 -0
- argostranslatefiles/formats/__init__.py +1 -0
- argostranslatefiles/formats/abstract_xml.py +39 -0
- argostranslatefiles/formats/epub.py +86 -0
- argostranslatefiles/formats/html.py +43 -0
- argostranslatefiles/formats/opendocument/__init__.py +0 -0
- argostranslatefiles/formats/opendocument/odp.py +5 -0
- argostranslatefiles/formats/opendocument/odt.py +52 -0
- argostranslatefiles/formats/openxml/__init__.py +1 -0
- argostranslatefiles/formats/openxml/docx.py +58 -0
- argostranslatefiles/formats/openxml/pptx.py +53 -0
- argostranslatefiles/formats/pdf.py +288 -0
- argostranslatefiles/formats/srt.py +28 -0
- argostranslatefiles/formats/txt.py +24 -0
- tests/__init__.py +0 -0
- tests/test_init.py +6 -0
@@ -0,0 +1,26 @@
|
|
1
|
+
import abc
|
2
|
+
import os.path
|
3
|
+
|
4
|
+
from argostranslate.translate import ITranslation
|
5
|
+
|
6
|
+
|
7
|
+
class AbstractFile():
    """Base class for the file formats that can be translated.

    Subclasses list the extensions they handle in
    ``supported_file_extensions`` and implement ``translate`` and
    ``get_texts``.

    The class does not use ``abc.ABCMeta``, so the ``abc.abstractmethod``
    markers below are not enforced at instantiation time; calling a method
    that was not overridden raises ``NotImplementedError`` instead.
    """

    # File extensions, leading dot included, that the format handles.
    supported_file_extensions = []

    def support(self, file_path: str):
        """Tell whether the extension of ``file_path`` is handled.

        Args:
            file_path (str): file path

        Returns:
            bool: True when the extension (as split off by
                ``os.path.splitext``) is listed in
                ``supported_file_extensions``. The match is exact and
                therefore case-sensitive.
        """
        file_ext = os.path.splitext(file_path)[1]

        return file_ext in self.supported_file_extensions

    def get_output_path(self, underlying_translation: ITranslation, file_path: str):
        """Build the path of the translated copy of ``file_path``.

        The target language code is inserted between the file name and its
        extension, e.g. ``dir/book.txt`` becomes ``dir/book_fr.txt``.

        Args:
            underlying_translation (argostranslate.translate.ITranslation):
                translation whose ``to_lang.code`` is used as suffix
            file_path (str): path of the file to translate

        Returns:
            str: path of the translated file, in the same directory as
                ``file_path``
        """
        dir_path, base_name = os.path.split(file_path)
        file_name, file_ext = os.path.splitext(base_name)
        to_code = underlying_translation.to_lang.code

        # os.path.join leaves a bare file name relative; gluing the parts with
        # "/" turned "book.txt" into "/book_fr.txt" (filesystem root).
        return os.path.join(dir_path, file_name + '_' + to_code + file_ext)

    @abc.abstractmethod
    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Translate ``file_path``; must be provided by subclasses."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_texts(self, file_path: str):
        """Extract text from ``file_path``; must be provided by subclasses."""
        raise NotImplementedError
|
@@ -0,0 +1,59 @@
|
|
1
|
+
from argostranslate.translate import ITranslation
|
2
|
+
|
3
|
+
from argostranslatefiles.formats.html import Html
|
4
|
+
from argostranslatefiles.formats.opendocument.odp import Odp
|
5
|
+
from argostranslatefiles.formats.opendocument.odt import Odt
|
6
|
+
from argostranslatefiles.formats.openxml.docx import Docx
|
7
|
+
from argostranslatefiles.formats.openxml.pptx import Pptx
|
8
|
+
from argostranslatefiles.formats.txt import Txt
|
9
|
+
from argostranslatefiles.formats.epub import Epub
|
10
|
+
from argostranslatefiles.formats.srt import Srt
|
11
|
+
from argostranslatefiles.formats.pdf import Pdf
|
12
|
+
|
13
|
+
def get_supported_formats():
    """Build one handler instance per supported file format.

    Returns:
        list: newly created format handlers, in the order in which they are
            tried when looking for a handler for a file
    """
    format_classes = (Txt, Odt, Odp, Docx, Pptx, Epub, Html, Srt, Pdf)

    return [format_class() for format_class in format_classes]
|
25
|
+
|
26
|
+
|
27
|
+
def translate_file(underlying_translation: ITranslation, file_path: str):
    """Translate a file with the first format handler that supports it.

    Args:
        underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
        file_path (str): file path

    Returns:
        Whatever the handler's ``translate`` returns, or ``False`` when no
        supported format accepts ``file_path``.
    """
    handler = next(
        (candidate for candidate in get_supported_formats()
         if candidate.support(file_path)),
        None,
    )

    if handler is None:
        return False

    return handler.translate(underlying_translation, file_path)
|
43
|
+
|
44
|
+
|
45
|
+
def get_texts(file_path: str):
    """Get the file contents using the first format handler that supports it.

    Args:
        file_path (str): file path

    Returns:
        Whatever the handler's ``get_texts`` returns, or ``False`` when no
        supported format accepts ``file_path``.
    """
    handler = next(
        (candidate for candidate in get_supported_formats()
         if candidate.support(file_path)),
        None,
    )

    if handler is None:
        return False

    return handler.get_texts(file_path)
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
import argostranslate
|
2
|
+
import bs4
|
3
|
+
from argostranslate.tags import Tag, ITag
|
4
|
+
|
5
|
+
from argostranslatefiles.abstract_file import AbstractFile
|
6
|
+
|
7
|
+
|
8
|
+
class AbstractXml(AbstractFile):
    """Helpers converting between BeautifulSoup trees and Argos Translate tags."""

    def is_translatable(self, soup):
        """Tell whether a soup node should be translated.

        Args:
            soup (bs4.element.Tag): Beautiful Soup node

        Returns:
            bool: True when the node's ``text`` is not empty
        """
        return soup.text != ""

    def itag_of_soup(self, soup):
        """Returns an argostranslate.tags.ITag tree from a BeautifulSoup object.

        Args:
            soup (bs4.element.Navigablestring or bs4.element.Tag): Beautiful Soup object

        Returns:
            argostranslate.tags.ITag: Argos Translate ITag tree; a plain
                ``str`` when ``soup`` is a NavigableString
        """
        if isinstance(soup, bs4.element.NavigableString):
            return str(soup)

        translatable = self.is_translatable(soup)
        children = [self.itag_of_soup(content) for content in soup.contents]
        to_return = Tag(children, translatable)
        # Keep a handle on the originating node so soup_of_itag can reuse it.
        to_return.soup = soup
        return to_return

    def soup_of_itag(self, itag: ITag):
        """Returns a BeautifulSoup object from an Argos Translate ITag.

        Args:
            itag (argostranslate.tags.ITag): ITag object to convert to Soup

        Returns:
            bs4.elements.BeautifulSoup: BeautifulSoup object; the node
                attached by ``itag_of_soup`` with its contents replaced
        """
        # isinstance (rather than an exact type comparison) also accepts str
        # subclasses, which have no ``soup`` attribute to fall back on.
        if isinstance(itag, str):
            return bs4.element.NavigableString(itag)

        soup = itag.soup
        soup.contents = [self.soup_of_itag(child) for child in itag.children]
        return soup
|
@@ -0,0 +1,86 @@
|
|
1
|
+
import re
|
2
|
+
import zipfile
|
3
|
+
|
4
|
+
import translatehtml
|
5
|
+
from argostranslate.tags import translate_tags
|
6
|
+
from argostranslate.translate import ITranslation
|
7
|
+
from bs4 import BeautifulSoup
|
8
|
+
|
9
|
+
from argostranslatefiles.formats.abstract_xml import AbstractXml
|
10
|
+
|
11
|
+
|
12
|
+
class Epub(AbstractXml):
    """Handler for ``.epub`` archives.

    Members named in ``_TRANSLATABLE_XML_FILENAMES`` are parsed as XML and
    translated tag by tag, ``.html``/``.xhtml`` members are translated with
    ``translatehtml``, and every other member is copied unchanged.
    """

    supported_file_extensions = ['.epub']

    # Archive members that are parsed as XML and translated.
    _TRANSLATABLE_XML_FILENAMES = (
        "OPS/content.opf", "OPS/toc.ncx", "OEBPS/content.opf", "OEBPS/toc.ncx",
    )
    # Suffixes of the archive members handled as HTML.
    _HTML_SUFFIXES = ('.html', '.xhtml')
    # Leading declaration that is set aside before an HTML member is processed.
    _HEAD = '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'
    # get_texts returns at most this many characters.
    _MAX_TEXT_LENGTH = 4096

    def is_translatable(self, soup):
        """Return True when the node's ``text`` is not empty."""
        return soup.text != ""

    def _strip_head(self, content):
        """Split the leading ``_HEAD`` declaration off ``content``.

        Returns:
            tuple: ``(head_present, remaining_content)``
        """
        if content.startswith(self._HEAD):
            return True, content[len(self._HEAD):]
        return False, content

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Write a translated copy of the EPUB next to the original.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the EPUB to translate

        Returns:
            str: path of the translated EPUB
        """
        outzip_path = self.get_output_path(underlying_translation, file_path)

        # Context managers close both archives even when a member fails.
        with zipfile.ZipFile(file_path, "r") as inzip, \
                zipfile.ZipFile(outzip_path, "w") as outzip:
            for inzipinfo in inzip.infolist():
                member_name = inzipinfo.filename
                with inzip.open(inzipinfo) as infile:
                    if member_name in self._TRANSLATABLE_XML_FILENAMES:
                        soup = BeautifulSoup(infile.read(), 'xml')

                        itag = self.itag_of_soup(soup)
                        translated_tag = translate_tags(underlying_translation, itag)
                        translated_soup = self.soup_of_itag(translated_tag)

                        outzip.writestr(member_name, str(translated_soup))
                    elif member_name.endswith(self._HTML_SUFFIXES):
                        head_present, content = self._strip_head(
                            str(infile.read(), 'utf-8'))

                        translated = str(translatehtml.translate_html(
                            underlying_translation, content))

                        # Put the declaration back in front of the result.
                        if head_present:
                            translated = self._HEAD + translated

                        outzip.writestr(member_name, translated)
                    else:
                        outzip.writestr(member_name, infile.read())

        return outzip_path

    def get_texts(self, file_path: str):
        """Collect up to 4096 characters of text from the EPUB.

        Args:
            file_path (str): path of the EPUB

        Returns:
            str: concatenated member contents, truncated to 4096 characters
        """
        texts = ""

        with zipfile.ZipFile(file_path, "r") as inzip:
            for inzipinfo in inzip.infolist():
                # Stop reading members once enough text has been gathered.
                if len(texts) > self._MAX_TEXT_LENGTH:
                    break

                member_name = inzipinfo.filename
                with inzip.open(inzipinfo) as infile:
                    if member_name in self._TRANSLATABLE_XML_FILENAMES:
                        soup = BeautifulSoup(infile.read(), 'xml')

                        texts += self.itag_of_soup(soup).text()
                    elif member_name.endswith(self._HTML_SUFFIXES):
                        _, content = self._strip_head(str(infile.read(), 'utf-8'))

                        texts += content
                    else:
                        try:
                            texts += infile.read().decode()
                        except UnicodeDecodeError:
                            # Members that are not valid UTF-8 (binary data)
                            # carry no text; skip them instead of aborting.
                            continue

        return texts[:self._MAX_TEXT_LENGTH]
|
@@ -0,0 +1,43 @@
|
|
1
|
+
import translatehtml
|
2
|
+
from argostranslate.translate import ITranslation
|
3
|
+
|
4
|
+
from argostranslatefiles.abstract_file import AbstractFile
|
5
|
+
from bs4 import BeautifulSoup
|
6
|
+
|
7
|
+
|
8
|
+
class Html(AbstractFile):
    """Handler for standalone ``.html`` files."""

    supported_file_extensions = ['.html']

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Write a translated copy of the HTML file next to the original.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the HTML file to translate

        Returns:
            str: path of the translated file
        """
        outfile_path = self.get_output_path(underlying_translation, file_path)

        with open(file_path, "r") as infile:
            content = infile.read()

        # The doctype is set aside before translation and restored afterwards.
        head = '<!DOCTYPE html>'
        head_present = content.startswith(head)

        if head_present:
            content = content[len(head):]

        translated = str(translatehtml.translate_html(underlying_translation, content))

        if head_present:
            translated = head + translated

        # The output is opened only once the translation exists, so a failed
        # translation does not leave an empty file behind.
        with open(outfile_path, "w") as outfile:
            outfile.write(translated)

        return outfile_path

    def get_texts(self, file_path: str):
        """Return up to 4096 characters of text extracted from the HTML file.

        Args:
            file_path (str): path of the HTML file

        Returns:
            str: text of the parsed document, truncated to 4096 characters
        """
        # The context manager closes the file, which was previously left open.
        with open(file_path, "r") as infile:
            content = infile.read()

        soup = BeautifulSoup(content, "html.parser")
        return translatehtml.itag_of_soup(soup).text()[0:4096]
|
File without changes
|
@@ -0,0 +1,52 @@
|
|
1
|
+
import zipfile
|
2
|
+
|
3
|
+
from argostranslate.tags import translate_tags
|
4
|
+
from argostranslate.translate import ITranslation
|
5
|
+
from bs4 import BeautifulSoup
|
6
|
+
|
7
|
+
from argostranslatefiles.formats.abstract_xml import AbstractXml
|
8
|
+
|
9
|
+
|
10
|
+
class Odt(AbstractXml):
    """Handler for ``.odt`` archives; only ``content.xml`` is translated."""

    supported_file_extensions = ['.odt']

    # Archive member that holds the text to translate.
    _CONTENT_FILENAME = "content.xml"
    # get_texts returns at most this many characters.
    _MAX_TEXT_LENGTH = 4096

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Write a translated copy of the document next to the original.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the document to translate

        Returns:
            str: path of the translated document
        """
        outzip_path = self.get_output_path(underlying_translation, file_path)

        # Context managers close both archives even when a member fails.
        with zipfile.ZipFile(file_path, "r") as inzip, \
                zipfile.ZipFile(outzip_path, "w") as outzip:
            for inzipinfo in inzip.infolist():
                with inzip.open(inzipinfo) as infile:
                    if inzipinfo.filename == self._CONTENT_FILENAME:
                        soup = BeautifulSoup(infile.read(), 'xml')

                        itag = self.itag_of_soup(soup)
                        translated_tag = translate_tags(underlying_translation, itag)
                        translated_soup = self.soup_of_itag(translated_tag)

                        outzip.writestr(inzipinfo.filename, str(translated_soup))
                    else:
                        # Every other member is copied unchanged.
                        outzip.writestr(inzipinfo.filename, infile.read())

        return outzip_path

    def get_texts(self, file_path: str):
        """Collect up to 4096 characters of text from the document.

        Args:
            file_path (str): path of the document

        Returns:
            str: text of ``content.xml``, truncated to 4096 characters
        """
        texts = ""

        with zipfile.ZipFile(file_path, "r") as inzip:
            for inzipinfo in inzip.infolist():
                # Stop reading members once enough text has been gathered.
                if len(texts) > self._MAX_TEXT_LENGTH:
                    break
                with inzip.open(inzipinfo) as infile:
                    if inzipinfo.filename == self._CONTENT_FILENAME:
                        soup = BeautifulSoup(infile.read(), 'xml')
                        texts += self.itag_of_soup(soup).text()

        return texts[:self._MAX_TEXT_LENGTH]
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1,58 @@
|
|
1
|
+
import zipfile
|
2
|
+
|
3
|
+
from argostranslate.tags import translate_tags
|
4
|
+
from argostranslate.translate import ITranslation
|
5
|
+
from bs4 import BeautifulSoup
|
6
|
+
|
7
|
+
from argostranslatefiles.formats.abstract_xml import AbstractXml
|
8
|
+
|
9
|
+
|
10
|
+
class Docx(AbstractXml):
    """Handler for ``.docx`` archives.

    ``word/document.xml`` and the members whose names start with
    ``word/header`` or ``word/footer`` are translated; every other member is
    copied unchanged.
    """

    supported_file_extensions = ['.docx']

    # get_texts returns at most this many characters.
    _MAX_TEXT_LENGTH = 4096

    @staticmethod
    def _is_translatable_part(filename):
        """Tell whether the archive member ``filename`` holds text to translate."""
        return (filename == "word/document.xml" or
                filename.startswith(("word/header", "word/footer")))

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Write a translated copy of the document next to the original.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the document to translate

        Returns:
            str: path of the translated document
        """
        outzip_path = self.get_output_path(underlying_translation, file_path)

        # Context managers close both archives even when a member fails.
        with zipfile.ZipFile(file_path, "r") as inzip, \
                zipfile.ZipFile(outzip_path, "w") as outzip:
            for inzipinfo in inzip.infolist():
                with inzip.open(inzipinfo) as infile:
                    if self._is_translatable_part(inzipinfo.filename):
                        soup = BeautifulSoup(infile.read(), 'xml')

                        itag = self.itag_of_soup(soup)
                        translated_tag = translate_tags(underlying_translation, itag)
                        translated_soup = self.soup_of_itag(translated_tag)

                        outzip.writestr(inzipinfo.filename, str(translated_soup))
                    else:
                        outzip.writestr(inzipinfo.filename, infile.read())

        return outzip_path

    def get_texts(self, file_path: str):
        """Collect up to 4096 characters of text from the document.

        Args:
            file_path (str): path of the document

        Returns:
            str: text of the translatable members, truncated to 4096
                characters
        """
        texts = ""

        with zipfile.ZipFile(file_path, "r") as inzip:
            for inzipinfo in inzip.infolist():
                # Stop reading members once enough text has been gathered.
                if len(texts) > self._MAX_TEXT_LENGTH:
                    break
                with inzip.open(inzipinfo) as infile:
                    if self._is_translatable_part(inzipinfo.filename):
                        soup = BeautifulSoup(infile.read(), 'xml')
                        texts += self.itag_of_soup(soup).text()

        return texts[:self._MAX_TEXT_LENGTH]
|
@@ -0,0 +1,53 @@
|
|
1
|
+
import re
|
2
|
+
import zipfile
|
3
|
+
|
4
|
+
from argostranslate.tags import translate_tags
|
5
|
+
from argostranslate.translate import ITranslation
|
6
|
+
from bs4 import BeautifulSoup
|
7
|
+
|
8
|
+
from argostranslatefiles.formats.abstract_xml import AbstractXml
|
9
|
+
|
10
|
+
|
11
|
+
class Pptx(AbstractXml):
    """Handler for ``.pptx`` archives.

    Members whose names match ``_SLIDE_PATTERN`` are translated; every other
    member is copied unchanged.
    """

    supported_file_extensions = ['.pptx']

    # Compiled once; matched against the start of each archive member name.
    _SLIDE_PATTERN = re.compile(r"ppt\/slides\/slide[0-9]*\.xml")
    # get_texts returns at most this many characters.
    _MAX_TEXT_LENGTH = 4096

    def translate(self, underlying_translation: ITranslation, file_path: str):
        """Write a translated copy of the presentation next to the original.

        Args:
            underlying_translation (argostranslate.translate.ITranslation): Argos Translate Translation
            file_path (str): path of the presentation to translate

        Returns:
            str: path of the translated presentation
        """
        outzip_path = self.get_output_path(underlying_translation, file_path)

        # Context managers close both archives even when a member fails.
        with zipfile.ZipFile(file_path, "r") as inzip, \
                zipfile.ZipFile(outzip_path, "w") as outzip:
            for inzipinfo in inzip.infolist():
                with inzip.open(inzipinfo) as infile:
                    if self._SLIDE_PATTERN.match(inzipinfo.filename):
                        soup = BeautifulSoup(infile.read(), 'xml')

                        itag = self.itag_of_soup(soup)
                        translated_tag = translate_tags(underlying_translation, itag)
                        translated_soup = self.soup_of_itag(translated_tag)

                        outzip.writestr(inzipinfo.filename, str(translated_soup))
                    else:
                        outzip.writestr(inzipinfo.filename, infile.read())

        return outzip_path

    def get_texts(self, file_path: str):
        """Collect up to 4096 characters of text from the presentation.

        Args:
            file_path (str): path of the presentation

        Returns:
            str: text of the slide members, truncated to 4096 characters
        """
        texts = ""

        with zipfile.ZipFile(file_path, "r") as inzip:
            for inzipinfo in inzip.infolist():
                # Stop reading members once enough text has been gathered.
                if len(texts) > self._MAX_TEXT_LENGTH:
                    break
                with inzip.open(inzipinfo) as infile:
                    if self._SLIDE_PATTERN.match(inzipinfo.filename):
                        soup = BeautifulSoup(infile.read(), 'xml')
                        texts += self.itag_of_soup(soup).text()

        return texts[:self._MAX_TEXT_LENGTH]
|