polytext 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {polytext-0.1.0 → polytext-0.1.2}/PKG-INFO +1 -1
- {polytext-0.1.0 → polytext-0.1.2}/polytext/__init__.py +5 -2
- polytext-0.1.2/polytext/generator/__init__.py +4 -0
- polytext-0.1.2/polytext/generator/pdf.py +231 -0
- {polytext-0.1.0 → polytext-0.1.2}/polytext/loader/text.py +5 -5
- {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/PKG-INFO +1 -1
- {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/SOURCES.txt +3 -1
- {polytext-0.1.0 → polytext-0.1.2}/setup.py +1 -1
- {polytext-0.1.0 → polytext-0.1.2}/tests/test_extract_text_from_file.py +2 -7
- polytext-0.1.2/tests/test_get_customized_pdf_from_markdown.py +43 -0
- {polytext-0.1.0 → polytext-0.1.2}/tests/test_get_document_text.py +2 -1
- polytext-0.1.0/polytext/output_manager.py +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/LICENSE +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/README.md +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/polytext/converter/__init__.py +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/polytext/converter/pdf.py +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/polytext/exceptions/__init__.py +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/polytext/exceptions/base.py +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/polytext/loader/__init__.py +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/dependency_links.txt +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/not-zip-safe +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/requires.txt +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/top_level.txt +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/pyproject.toml +0 -0
- {polytext-0.1.0 → polytext-0.1.2}/setup.cfg +0 -0
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
from .converter.pdf import convert_to_pdf, DocumentConverter
|
|
3
3
|
from .loader.text import get_document_text, extract_text_from_file, TextLoader
|
|
4
4
|
from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
|
|
5
|
+
from .generator.pdf import get_customized_pdf_from_markdown, PDFGenerator
|
|
5
6
|
|
|
6
7
|
__all__ = [
|
|
7
8
|
'convert_to_pdf',
|
|
@@ -11,5 +12,7 @@ __all__ = [
|
|
|
11
12
|
'TextLoader',
|
|
12
13
|
'EmptyDocument',
|
|
13
14
|
'ExceededMaxPages',
|
|
14
|
-
'ConversionError'
|
|
15
|
-
|
|
15
|
+
'ConversionError',
|
|
16
|
+
'get_customized_pdf_from_markdown',
|
|
17
|
+
'PDFGenerator'
|
|
18
|
+
]
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
# pdf.py
|
|
2
|
+
import os
|
|
3
|
+
import logging
|
|
4
|
+
import markdown
|
|
5
|
+
from weasyprint import HTML, CSS
|
|
6
|
+
from io import BytesIO
|
|
7
|
+
from weasyprint.text.fonts import FontConfiguration
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_customized_pdf_from_markdown(input_markdown, output_file=None, use_custom_css=True):
    """
    Render Markdown to a PDF using a PDFGenerator with default styling.

    This is a module-level shortcut: it builds a ``PDFGenerator`` with its
    default constructor arguments and delegates to the method of the same name.

    Args:
        input_markdown: The Markdown content to convert.
        output_file: Optional; when provided, the PDF is also written to this path.
        use_custom_css (bool, optional): Whether to apply the generator's custom
            stylesheet. Defaults to True.

    Returns:
        A byte string containing the generated PDF.
    """
    return PDFGenerator().get_customized_pdf_from_markdown(
        input_markdown,
        output_file,
        use_custom_css,
    )
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class PDFGenerator:
    """
    A class to generate PDFs from Markdown content with custom CSS styling.

    The styling options passed to the constructor are stored on the instance
    and interpolated into a stylesheet by ``generate_custom_css``.
    """

    # CSS ``format()`` hint for each supported font file extension; anything
    # else falls back to 'truetype', which was the previous hard-coded value.
    _FONT_FORMATS = {
        ".ttf": "truetype",
        ".otf": "opentype",
        ".woff": "woff",
        ".woff2": "woff2",
    }

    def __init__(self, font_family="Georgia, serif", title_color="#1a5276", body_color="white", text_color="#333",
                 h2_color="#d35400", h3_color="#2e86c1", blockquote_border="#3498db", table_header_bg="#2e86c1",
                 page_margin="0.8in", image_max_width="80%", add_page_numbers=True, font_path=None):
        """
        Initialize the PDFGenerator with custom styling options.

        Args:
            font_family: Font family for the document.
            title_color: Color for the title (H1).
            body_color: Background color for the body.
            text_color: Text color.
            h2_color: Color for H2 headers (also used for their bottom border).
            h3_color: Color for H3 headers.
            blockquote_border: Border color for blockquotes.
            table_header_bg: Background color for table headers.
            page_margin: Margin for the page.
            image_max_width: Maximum width for images.
            add_page_numbers: Whether to add "page/pages" numbering at the bottom center.
            font_path: Path to a custom font file.
        """
        self.font_family = font_family
        self.title_color = title_color
        self.body_color = body_color
        self.text_color = text_color
        self.h2_color = h2_color
        self.h3_color = h3_color
        self.blockquote_border = blockquote_border
        self.table_header_bg = table_header_bg
        self.page_margin = page_margin
        self.image_max_width = image_max_width
        self.add_page_numbers = add_page_numbers
        self.font_path = font_path

    def _build_font_face_css(self):
        """Return the @font-face rule for ``font_path``, or "" when no usable font is configured."""
        from pathlib import Path  # local import: only needed when a custom font is configured

        if not self.font_path:
            return ""
        if not os.path.exists(self.font_path):
            # Previously this case was skipped without any trace in the logs.
            logger.warning("Custom font not found, using the font family as-is: %s", self.font_path)
            return ""

        logger.info(f"Using custom font: {self.font_path}")
        try:
            primary_family = self.font_family.split(",")[0].strip()
            # as_uri() needs an absolute path and percent-encodes special
            # characters, so relative paths and paths with spaces produce a
            # valid file:// URL instead of a broken one.
            font_uri = Path(os.path.abspath(self.font_path)).as_uri()
        except (AttributeError, ValueError) as e:
            logger.error("Error loading font: %s", e)
            return ""

        extension = os.path.splitext(self.font_path)[1].lower()
        font_format = self._FONT_FORMATS.get(extension, "truetype")
        font_face_css = f"""
        @font-face {{
            font-family: {primary_family};
            src: url('{font_uri}') format('{font_format}');
            font-weight: normal;
            font-style: normal;
        }}
        """
        logger.info("Font-face CSS created")
        return font_face_css

    def _build_page_css(self):
        """Return the @page rule; size and margin always apply, numbering only when enabled."""
        margin_box_css = """
            @bottom-center {
                content: counter(page) "/" counter(pages);
                font-size: 12px;
                color: #555;
            }
        """ if self.add_page_numbers else ""

        # The page size and margin used to live inside the page-number branch,
        # so ``page_margin`` was ignored whenever numbering was disabled.
        return f"""
        @page {{
            size: A4;
            margin: {self.page_margin};
            {margin_box_css}
        }}
        """

    def generate_custom_css(self):
        """
        Generate custom CSS based on the provided styling options.

        The stylesheet contains, in order: the @page rule, an optional
        @font-face rule for the custom font, and rules for the universal
        selector, body, h1-h3, p, blockquote, table, th/td, img and footer.

        Returns:
            A string containing the custom CSS.
        """
        page_css = self._build_page_css()
        font_face_css = self._build_font_face_css()

        css_template = f"""
        {page_css}

        {font_face_css} /* Include font-face only if custom font is provided */

        * {{
            font-family: {self.font_family} !important; /* Force the font family on all elements */
        }}

        body {{
            font-family: {self.font_family};
            color: {self.text_color};
            background-color: {self.body_color};
            text-align: justify;
            line-height: 1.6;
        }}

        h1 {{
            color: {self.title_color};
            font-size: 28px;
            text-align: center;
            text-transform: uppercase;
            margin-bottom: 20px;
        }}

        h2 {{
            color: {self.h2_color};
            font-size: 22px;
            text-transform: uppercase;
            margin-top: 30px;
            border-bottom: 2px solid {self.h2_color};
            padding-bottom: 5px;
        }}

        h3 {{
            color: {self.h3_color};
            font-size: 18px;
            margin-top: 20px;
        }}

        p {{
            font-size: 14px;
            margin: 10px 0;
        }}

        blockquote {{
            border-left: 4px solid {self.blockquote_border};
            padding-left: 10px;
            font-style: italic;
            color: #555;
            margin: 15px 0;
        }}

        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
        }}

        th, td {{
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }}

        th {{
            background-color: {self.table_header_bg};
            color: white;
        }}

        img {{
            display: block;
            margin: 20px auto;
            max-width: {self.image_max_width};
            height: auto;
            border: 2px solid #ddd;
            padding: 5px;
        }}

        footer {{
            font-size: 12px;
            text-align: center;
            margin-top: 40px;
            color: #777;
        }}
        """
        return css_template

    def get_customized_pdf_from_markdown(self, input_markdown, output_file=None, use_custom_css=True):
        """
        Convert Markdown content to a PDF with custom styling.

        The Markdown is converted to HTML with the 'extra', 'codehilite' and
        'toc' extensions, then rendered to PDF in memory.

        Args:
            input_markdown: The Markdown content to convert.
            output_file: Optional; if provided, the PDF will be saved to this file.
            use_custom_css (bool, optional): Whether to use custom CSS for styling. Defaults to True.

        Returns:
            A byte string containing the generated PDF.

        Raises:
            Exception: If an error occurs during PDF generation. The error is
                logged and then re-raised unchanged.
        """
        try:
            html_content = markdown.markdown(input_markdown, extensions=['extra', 'codehilite', 'toc'])

            # Generate PDF from HTML with Custom Styles
            html = HTML(string=html_content)
            pdf_buffer = BytesIO()

            if use_custom_css:
                # The same FontConfiguration must be given to both the
                # stylesheet and the renderer for @font-face rules to apply.
                font_config = FontConfiguration()
                css = CSS(string=self.generate_custom_css(), font_config=font_config)
                html.write_pdf(pdf_buffer, stylesheets=[css], font_config=font_config)
            else:
                html.write_pdf(pdf_buffer)

            pdf_value = pdf_buffer.getvalue()

            if output_file:
                with open(output_file, 'wb') as f:
                    f.write(pdf_value)
                logger.info(f"PDF saved to {output_file}")

            return pdf_value
        except Exception as e:
            # Log at this boundary, then let the caller decide how to handle it.
            logger.error("Error generating PDF: %s", e)
            raise
|
|
@@ -25,7 +25,7 @@ def get_document_text(doc_data, page_range=None):
|
|
|
25
25
|
|
|
26
26
|
Args:
|
|
27
27
|
doc_data (dict): Dictionary containing 'file_path' and optional 'bucket'
|
|
28
|
-
page_range (tuple, optional): Tuple of (start_page, end_page)
|
|
28
|
+
page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed. Note: When converting from .odt or .rtf files, the page range selection might not exactly match the original document's page numbers due to formatting differences during PDF conversion and variations in how LibreOffice renders these formats.
|
|
29
29
|
|
|
30
30
|
Returns:
|
|
31
31
|
str: Extracted text from the document
|
|
@@ -44,7 +44,7 @@ def extract_text_from_file(file_path, page_range=None, backend='auto'):
|
|
|
44
44
|
|
|
45
45
|
Args:
|
|
46
46
|
file_path (str): Path to the local file
|
|
47
|
-
page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed
|
|
47
|
+
page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed. Note: When converting from .odt or .rtf files, the page range selection might not exactly match the original document's page numbers due to formatting differences during PDF conversion and variations in how LibreOffice renders these formats.
|
|
48
48
|
backend (str, optional): Text extraction backend ('auto', 'pymupdf', or 'pypdf')
|
|
49
49
|
|
|
50
50
|
Returns:
|
|
@@ -178,7 +178,7 @@ class TextLoader:
|
|
|
178
178
|
|
|
179
179
|
Args:
|
|
180
180
|
doc_data (dict): Dictionary containing 'file_path' and optional 'bucket'
|
|
181
|
-
page_range (tuple, optional): Tuple of (start_page, end_page)
|
|
181
|
+
page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed. Note: When converting from .odt or .rtf files, the page range selection might not exactly match the original document's page numbers due to formatting differences during PDF conversion and variations in how LibreOffice renders these formats.
|
|
182
182
|
|
|
183
183
|
Returns:
|
|
184
184
|
str: Extracted text from the document
|
|
@@ -280,7 +280,7 @@ class TextLoader:
|
|
|
280
280
|
Args:
|
|
281
281
|
bucket (str): S3 bucket name
|
|
282
282
|
file_path (str): Path to file in S3
|
|
283
|
-
page_range (tuple, optional): Tuple of (start_page, end_page)
|
|
283
|
+
page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed. Note: When converting from .odt or .rtf files, the page range selection might not exactly match the original document's page numbers due to formatting differences during PDF conversion and variations in how LibreOffice renders these formats.
|
|
284
284
|
|
|
285
285
|
Returns:
|
|
286
286
|
str: Extracted text from the document
|
|
@@ -380,7 +380,7 @@ class TextLoader:
|
|
|
380
380
|
|
|
381
381
|
Args:
|
|
382
382
|
file_path (str): Path to the local file
|
|
383
|
-
page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed
|
|
383
|
+
page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed. Note: When converting from .odt or .rtf files, the page range selection might not exactly match the original document's page numbers due to formatting differences during PDF conversion and variations in how LibreOffice renders these formats.
|
|
384
384
|
backend (str, optional): Text extraction backend ('auto', 'pymupdf', or 'pypdf')
|
|
385
385
|
|
|
386
386
|
Returns:
|
|
@@ -3,7 +3,6 @@ README.md
|
|
|
3
3
|
pyproject.toml
|
|
4
4
|
setup.py
|
|
5
5
|
polytext/__init__.py
|
|
6
|
-
polytext/output_manager.py
|
|
7
6
|
polytext.egg-info/PKG-INFO
|
|
8
7
|
polytext.egg-info/SOURCES.txt
|
|
9
8
|
polytext.egg-info/dependency_links.txt
|
|
@@ -14,7 +13,10 @@ polytext/converter/__init__.py
|
|
|
14
13
|
polytext/converter/pdf.py
|
|
15
14
|
polytext/exceptions/__init__.py
|
|
16
15
|
polytext/exceptions/base.py
|
|
16
|
+
polytext/generator/__init__.py
|
|
17
|
+
polytext/generator/pdf.py
|
|
17
18
|
polytext/loader/__init__.py
|
|
18
19
|
polytext/loader/text.py
|
|
19
20
|
tests/test_extract_text_from_file.py
|
|
21
|
+
tests/test_get_customized_pdf_from_markdown.py
|
|
20
22
|
tests/test_get_document_text.py
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
+
import os
|
|
1
2
|
import sys
|
|
2
|
-
import boto3
|
|
3
3
|
import logging
|
|
4
4
|
|
|
5
|
-
sys.path.
|
|
5
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
|
6
6
|
|
|
7
7
|
from dotenv import load_dotenv
|
|
8
8
|
load_dotenv(".env")
|
|
@@ -18,11 +18,6 @@ def main():
|
|
|
18
18
|
# Initialize TextLoader
|
|
19
19
|
text_loader = TextLoader()
|
|
20
20
|
|
|
21
|
-
# Define document data
|
|
22
|
-
doc_data = {
|
|
23
|
-
"file_path": "xxx",
|
|
24
|
-
}
|
|
25
|
-
|
|
26
21
|
# Optional: specify page range (start_page, end_page) - pages are 1-indexed
|
|
27
22
|
page_range = (1, 2) # Extract text from pages 1 to 2
|
|
28
23
|
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
|
6
|
+
|
|
7
|
+
from dotenv import load_dotenv
|
|
8
|
+
load_dotenv(".env")
|
|
9
|
+
|
|
10
|
+
from polytext.generator.pdf import PDFGenerator
|
|
11
|
+
|
|
12
|
+
# Set up logging
|
|
13
|
+
logging.basicConfig(level=logging.INFO,
|
|
14
|
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def main():
    """
    Manually exercise PDFGenerator by rendering a sample Markdown document.

    Builds a generator with a Helvetica font family, converts a lorem-ipsum
    Markdown string and writes the result to ``test_custom_pdf.pdf``. Errors
    are logged rather than propagated.
    """
    # Initialize PDFGenerator
    generator = PDFGenerator(font_family="'Helvetica', sans-serif")

    # Define Markdown content
    markdown_text = """# LOREM IPSUM
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt."""

    try:
        # Call get_customized_pdf_from_markdown method; the returned bytes are
        # not needed here because the PDF is written to output_file.
        generator.get_customized_pdf_from_markdown(
            input_markdown=markdown_text,
            output_file="test_custom_pdf.pdf"
        )

        print("Successfully generated custom pdf from markdown")

    except Exception as e:
        logging.error(f"Error generating PDF: {e}")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
if __name__ == "__main__":
|
|
43
|
+
main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|