polytext 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {polytext-0.1.0 → polytext-0.1.2}/PKG-INFO +1 -1
  2. {polytext-0.1.0 → polytext-0.1.2}/polytext/__init__.py +5 -2
  3. polytext-0.1.2/polytext/generator/__init__.py +4 -0
  4. polytext-0.1.2/polytext/generator/pdf.py +231 -0
  5. {polytext-0.1.0 → polytext-0.1.2}/polytext/loader/text.py +5 -5
  6. {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/PKG-INFO +1 -1
  7. {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/SOURCES.txt +3 -1
  8. {polytext-0.1.0 → polytext-0.1.2}/setup.py +1 -1
  9. {polytext-0.1.0 → polytext-0.1.2}/tests/test_extract_text_from_file.py +2 -7
  10. polytext-0.1.2/tests/test_get_customized_pdf_from_markdown.py +43 -0
  11. {polytext-0.1.0 → polytext-0.1.2}/tests/test_get_document_text.py +2 -1
  12. polytext-0.1.0/polytext/output_manager.py +0 -0
  13. {polytext-0.1.0 → polytext-0.1.2}/LICENSE +0 -0
  14. {polytext-0.1.0 → polytext-0.1.2}/README.md +0 -0
  15. {polytext-0.1.0 → polytext-0.1.2}/polytext/converter/__init__.py +0 -0
  16. {polytext-0.1.0 → polytext-0.1.2}/polytext/converter/pdf.py +0 -0
  17. {polytext-0.1.0 → polytext-0.1.2}/polytext/exceptions/__init__.py +0 -0
  18. {polytext-0.1.0 → polytext-0.1.2}/polytext/exceptions/base.py +0 -0
  19. {polytext-0.1.0 → polytext-0.1.2}/polytext/loader/__init__.py +0 -0
  20. {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/dependency_links.txt +0 -0
  21. {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/not-zip-safe +0 -0
  22. {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/requires.txt +0 -0
  23. {polytext-0.1.0 → polytext-0.1.2}/polytext.egg-info/top_level.txt +0 -0
  24. {polytext-0.1.0 → polytext-0.1.2}/pyproject.toml +0 -0
  25. {polytext-0.1.0 → polytext-0.1.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: polytext
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -2,6 +2,7 @@
2
2
  from .converter.pdf import convert_to_pdf, DocumentConverter
3
3
  from .loader.text import get_document_text, extract_text_from_file, TextLoader
4
4
  from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
5
+ from .generator.pdf import get_customized_pdf_from_markdown, PDFGenerator
5
6
 
6
7
  __all__ = [
7
8
  'convert_to_pdf',
@@ -11,5 +12,7 @@ __all__ = [
11
12
  'TextLoader',
12
13
  'EmptyDocument',
13
14
  'ExceededMaxPages',
14
- 'ConversionError'
15
- ]
15
+ 'ConversionError',
16
+ 'get_customized_pdf_from_markdown',
17
+ 'PDFGenerator'
18
+ ]
@@ -0,0 +1,4 @@
1
+ # polytext/generator/__init__.py
2
+ from .pdf import get_customized_pdf_from_markdown, PDFGenerator
3
+
4
+ __all__ = ['get_customized_pdf_from_markdown', 'PDFGenerator']
@@ -0,0 +1,231 @@
1
+ # pdf.py
2
+ import os
3
+ import logging
4
+ import markdown
5
+ from weasyprint import HTML, CSS
6
+ from io import BytesIO
7
+ from weasyprint.text.fonts import FontConfiguration
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def get_customized_pdf_from_markdown(input_markdown, output_file=None, use_custom_css=True):
    """
    Render Markdown to a PDF using a ``PDFGenerator`` built with its default styling.

    Args:
        input_markdown: Markdown source text to render.
        output_file: Optional; when given, the PDF bytes are also written to this path.
        use_custom_css (bool, optional): Whether the generator's stylesheet is applied. Defaults to True.

    Returns:
        A byte string containing the generated PDF.
    """
    default_generator = PDFGenerator()
    pdf_bytes = default_generator.get_customized_pdf_from_markdown(
        input_markdown,
        output_file,
        use_custom_css,
    )
    return pdf_bytes
26
+
27
+
28
class PDFGenerator:
    """
    A class to generate PDFs from Markdown content with custom CSS styling.

    The styling options passed to ``__init__`` are stored on the instance and
    interpolated into a stylesheet by ``generate_custom_css``.
    """

    def __init__(self, font_family="Georgia, serif", title_color="#1a5276", body_color="white", text_color="#333",
                 h2_color="#d35400", h3_color="#2e86c1", blockquote_border="#3498db", table_header_bg="#2e86c1",
                 page_margin="0.8in", image_max_width="80%", add_page_numbers=True, font_path=None):
        """
        Initialize the PDFGenerator with custom styling options.

        Args:
            font_family: Font family for the document.
            title_color: Color for the title.
            body_color: Background color for the body.
            text_color: Text color.
            h2_color: Color for H2 headers.
            h3_color: Color for H3 headers.
            blockquote_border: Border color for blockquotes.
            table_header_bg: Background color for table headers.
            page_margin: Margin for the page.
            image_max_width: Maximum width for images.
            add_page_numbers: Whether to add page numbers.
            font_path: Path to a custom font file.
        """
        self.font_family = font_family
        self.title_color = title_color
        self.body_color = body_color
        self.text_color = text_color
        self.h2_color = h2_color
        self.h3_color = h3_color
        self.blockquote_border = blockquote_border
        self.table_header_bg = table_header_bg
        self.page_margin = page_margin
        self.image_max_width = image_max_width
        self.add_page_numbers = add_page_numbers
        self.font_path = font_path

    def _font_face_css(self):
        """Return an ``@font-face`` rule for ``font_path``, or "" when no usable font file is configured."""
        from pathlib import Path

        if not self.font_path:
            return ""
        if not os.path.exists(self.font_path):
            # Previously a missing file was ignored without any trace in the logs.
            logger.warning("Custom font not found, ignoring font_path: %s", self.font_path)
            return ""

        logger.info("Using custom font: %s", self.font_path)
        try:
            # as_uri() needs an absolute path and percent-encodes spaces/quotes,
            # so relative and Windows paths yield a valid file:// URL.
            font_url = Path(self.font_path).resolve().as_uri()
            primary_family = self.font_family.split(",")[0].strip()
        except Exception as e:
            # Best effort: a broken font setting must not prevent PDF generation.
            logger.error("Error loading font: %s", e)
            return ""

        logger.info("Font-face CSS created")
        return f"""
        @font-face {{
            font-family: {primary_family};
            src: url('{font_url}') format('truetype');
            font-weight: normal;
            font-style: normal;
        }}
        """

    def _page_css(self):
        """Return the ``@page`` rule: size and margin always, page counter only if enabled."""
        page_counter_css = ""
        if self.add_page_numbers:
            page_counter_css = """
            @bottom-center {
                content: counter(page) "/" counter(pages);
                font-size: 12px;
                color: #555;
            }"""

        # The size/margin declarations are independent of page numbering, so
        # page_margin is honoured even when add_page_numbers is False.
        return f"""
        @page {{
            size: A4;
            margin: {self.page_margin};
            {page_counter_css}
        }}
        """

    def generate_custom_css(self):
        """
        Generate custom CSS based on the provided styling options.

        Returns:
            A string containing the custom CSS.
        """
        page_css = self._page_css()
        font_face_css = self._font_face_css()

        css_template = f"""
        {page_css}

        {font_face_css} /* Include font-face only if custom font is provided */

        * {{
            font-family: {self.font_family} !important; /* Force the font family on all elements */
        }}

        body {{
            font-family: {self.font_family};
            color: {self.text_color};
            background-color: {self.body_color};
            text-align: justify;
            line-height: 1.6;
        }}

        h1 {{
            color: {self.title_color};
            font-size: 28px;
            text-align: center;
            text-transform: uppercase;
            margin-bottom: 20px;
        }}

        h2 {{
            color: {self.h2_color};
            font-size: 22px;
            text-transform: uppercase;
            margin-top: 30px;
            border-bottom: 2px solid {self.h2_color};
            padding-bottom: 5px;
        }}

        h3 {{
            color: {self.h3_color};
            font-size: 18px;
            margin-top: 20px;
        }}

        p {{
            font-size: 14px;
            margin: 10px 0;
        }}

        blockquote {{
            border-left: 4px solid {self.blockquote_border};
            padding-left: 10px;
            font-style: italic;
            color: #555;
            margin: 15px 0;
        }}

        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
        }}

        th, td {{
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }}

        th {{
            background-color: {self.table_header_bg};
            color: white;
        }}

        img {{
            display: block;
            margin: 20px auto;
            max-width: {self.image_max_width};
            height: auto;
            border: 2px solid #ddd;
            padding: 5px;
        }}

        footer {{
            font-size: 12px;
            text-align: center;
            margin-top: 40px;
            color: #777;
        }}
        """
        return css_template

    def get_customized_pdf_from_markdown(self, input_markdown, output_file=None, use_custom_css=True):
        """
        Convert Markdown content to a PDF with custom styling.

        Args:
            input_markdown: The Markdown content to convert.
            output_file: Optional; if provided, the PDF will be saved to this file.
            use_custom_css (bool, optional): Whether to use custom CSS for styling. Defaults to True.

        Returns:
            A byte string containing the generated PDF.

        Raises:
            Exception: If an error occurs during PDF generation; the error is
                logged and then re-raised unchanged.
        """
        try:
            html_content = markdown.markdown(input_markdown, extensions=['extra', 'codehilite', 'toc'])
            html = HTML(string=html_content)

            # Generate PDF from HTML, with or without the custom stylesheet
            pdf_buffer = BytesIO()
            if use_custom_css:
                # The same FontConfiguration must be shared by the stylesheet
                # and write_pdf so @font-face rules take effect.
                font_config = FontConfiguration()
                css = CSS(string=self.generate_custom_css(), font_config=font_config)
                html.write_pdf(pdf_buffer, stylesheets=[css], font_config=font_config)
            else:
                html.write_pdf(pdf_buffer)

            pdf_value = pdf_buffer.getvalue()

            if output_file:
                with open(output_file, 'wb') as f:
                    f.write(pdf_value)
                logger.info("PDF saved to %s", output_file)

            return pdf_value
        except Exception as e:
            logger.error("Error generating PDF: %s", e)
            raise
@@ -25,7 +25,7 @@ def get_document_text(doc_data, page_range=None):
25
25
 
26
26
  Args:
27
27
  doc_data (dict): Dictionary containing 'file_path' and optional 'bucket'
28
- page_range (tuple, optional): Tuple of (start_page, end_page) for partial extraction
28
+ page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed. Note: When converting from .odt or .rtf files, the page range selection might not exactly match the original document's page numbers due to formatting differences during PDF conversion and variations in how LibreOffice renders these formats.
29
29
 
30
30
  Returns:
31
31
  str: Extracted text from the document
@@ -44,7 +44,7 @@ def extract_text_from_file(file_path, page_range=None, backend='auto'):
44
44
 
45
45
  Args:
46
46
  file_path (str): Path to the local file
47
- page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed
47
+ page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed. Note: When converting from .odt or .rtf files, the page range selection might not exactly match the original document's page numbers due to formatting differences during PDF conversion and variations in how LibreOffice renders these formats.
48
48
  backend (str, optional): Text extraction backend ('auto', 'pymupdf', or 'pypdf')
49
49
 
50
50
  Returns:
@@ -178,7 +178,7 @@ class TextLoader:
178
178
 
179
179
  Args:
180
180
  doc_data (dict): Dictionary containing 'file_path' and optional 'bucket'
181
- page_range (tuple, optional): Tuple of (start_page, end_page) for partial extraction
181
+ page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed. Note: When converting from .odt or .rtf files, the page range selection might not exactly match the original document's page numbers due to formatting differences during PDF conversion and variations in how LibreOffice renders these formats.
182
182
 
183
183
  Returns:
184
184
  str: Extracted text from the document
@@ -280,7 +280,7 @@ class TextLoader:
280
280
  Args:
281
281
  bucket (str): S3 bucket name
282
282
  file_path (str): Path to file in S3
283
- page_range (tuple, optional): Tuple of (start_page, end_page) for partial extraction
283
+ page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed. Note: When converting from .odt or .rtf files, the page range selection might not exactly match the original document's page numbers due to formatting differences during PDF conversion and variations in how LibreOffice renders these formats.
284
284
 
285
285
  Returns:
286
286
  str: Extracted text from the document
@@ -380,7 +380,7 @@ class TextLoader:
380
380
 
381
381
  Args:
382
382
  file_path (str): Path to the local file
383
- page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed
383
+ page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed. Note: When converting from .odt or .rtf files, the page range selection might not exactly match the original document's page numbers due to formatting differences during PDF conversion and variations in how LibreOffice renders these formats.
384
384
  backend (str, optional): Text extraction backend ('auto', 'pymupdf', or 'pypdf')
385
385
 
386
386
  Returns:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: polytext
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -3,7 +3,6 @@ README.md
3
3
  pyproject.toml
4
4
  setup.py
5
5
  polytext/__init__.py
6
- polytext/output_manager.py
7
6
  polytext.egg-info/PKG-INFO
8
7
  polytext.egg-info/SOURCES.txt
9
8
  polytext.egg-info/dependency_links.txt
@@ -14,7 +13,10 @@ polytext/converter/__init__.py
14
13
  polytext/converter/pdf.py
15
14
  polytext/exceptions/__init__.py
16
15
  polytext/exceptions/base.py
16
+ polytext/generator/__init__.py
17
+ polytext/generator/pdf.py
17
18
  polytext/loader/__init__.py
18
19
  polytext/loader/text.py
19
20
  tests/test_extract_text_from_file.py
21
+ tests/test_get_customized_pdf_from_markdown.py
20
22
  tests/test_get_document_text.py
@@ -50,7 +50,7 @@ def get_requirements(*requirements_file):
50
50
 
51
51
 
52
52
  setup(name='polytext',
53
- version='0.1.0',
53
+ version='0.1.2',
54
54
  url='https://github.com/docsity/polytext',
55
55
  # download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
56
56
  license='MIT',
@@ -1,8 +1,8 @@
1
+ import os
1
2
  import sys
2
- import boto3
3
3
  import logging
4
4
 
5
- sys.path.append('/Users/marcodelgiudice/Projects/polytext')
5
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
6
6
 
7
7
  from dotenv import load_dotenv
8
8
  load_dotenv(".env")
@@ -18,11 +18,6 @@ def main():
18
18
  # Initialize TextLoader
19
19
  text_loader = TextLoader()
20
20
 
21
- # Define document data
22
- doc_data = {
23
- "file_path": "xxx",
24
- }
25
-
26
21
  # Optional: specify page range (start_page, end_page) - pages are 1-indexed
27
22
  page_range = (1, 2) # Extract text from pages 1 to 10
28
23
 
@@ -0,0 +1,43 @@
1
+ import os
2
+ import sys
3
+ import logging
4
+
5
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
6
+
7
+ from dotenv import load_dotenv
8
+ load_dotenv(".env")
9
+
10
+ from polytext.generator.pdf import PDFGenerator
11
+
12
+ # Set up logging
13
+ logging.basicConfig(level=logging.INFO,
14
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
15
+
16
+
17
def main():
    """
    Render a sample Markdown document to ``test_custom_pdf.pdf``.

    Builds a ``PDFGenerator`` with a Helvetica font family and converts a
    lorem-ipsum document. Failures are logged with their traceback and
    re-raised so the script does not exit successfully after an error.
    """
    # Initialize PDFGenerator
    generator = PDFGenerator(font_family="'Helvetica', sans-serif")

    # Define Markdown content
    markdown_text = """# LOREM IPSUM
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus ac nunc ultricies tincidunt."""

    try:
        # The PDF is written to output_file; the returned bytes are not needed here.
        generator.get_customized_pdf_from_markdown(
            input_markdown=markdown_text,
            output_file="test_custom_pdf.pdf"
        )
    except Exception as e:
        # logging.exception keeps the traceback that logging.error dropped.
        logging.exception(f"Error generating PDF: {e}")
        raise

    print("Successfully generated custom pdf from markdown")
42
+ if __name__ == "__main__":
43
+ main()
@@ -1,8 +1,9 @@
1
+ import os
1
2
  import sys
2
3
  import boto3
3
4
  import logging
4
5
 
5
- sys.path.append('/Users/marcodelgiudice/Projects/polytext')
6
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
6
7
 
7
8
  from dotenv import load_dotenv
8
9
  load_dotenv(".env")
File without changes
File without changes
File without changes
File without changes
File without changes