polytext 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
polytext-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Docsity
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,93 @@
1
+ Metadata-Version: 2.2
2
+ Name: polytext
3
+ Version: 0.1.0
4
+ Summary: Python utilities to simplify document files management
5
+ Home-page: https://github.com/docsity/polytext
6
+ Author: Matteo Senardi
7
+ Author-email: matteo.s@docsity.com
8
+ License: MIT
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3.6
12
+ Classifier: Programming Language :: Python :: 3.7
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: ~=3.6
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: pypdf==5.3.0
25
+ Requires-Dist: PyMuPDF>=1.25.3
26
+ Requires-Dist: pycryptodome==3.21.0
27
+ Requires-Dist: weasyprint==64.1
28
+ Requires-Dist: markdown==3.7
29
+ Requires-Dist: python-docx==1.1.2
30
+ Dynamic: author
31
+ Dynamic: author-email
32
+ Dynamic: classifier
33
+ Dynamic: description
34
+ Dynamic: description-content-type
35
+ Dynamic: home-page
36
+ Dynamic: license
37
+ Dynamic: requires-dist
38
+ Dynamic: requires-python
39
+ Dynamic: summary
40
+
41
+ # polytext
42
+
43
+ # Doc Utils
44
+
45
+ A Python package for document conversion and text extraction.
46
+
47
+ ## Features
48
+
49
+ - Convert various document formats (DOCX, ODT, PPT, etc.) to PDF
50
+ - Extract text from PDF documents
51
+ - Support for both local files and S3 storage
52
+ - Multiple PDF parsing backends (PyPDF, PyMuPDF)
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ # Basic installation
58
+ pip install polytext
59
+ ```
60
+
61
+ ## Requirements
62
+
63
+ - Python 3.6 or higher
64
+ - LibreOffice (for PDF conversion)
65
+
66
+ ## Usage
67
+
68
+ Converting Documents to PDF
69
+
70
+ ```python
71
+ from polytext import convert_to_pdf, ConversionError
72
+
73
+ try:
74
+ # Convert a document to PDF
75
+ pdf_path = convert_to_pdf('input.docx', 'input.docx', 'output.pdf')
76
+ print(f"PDF saved to: {pdf_path}")
77
+ except ConversionError as e:
78
+ print(f"Conversion failed: {e}")
79
+ ```
80
+
81
+ Text Extraction
82
+
83
+ ```python
84
+ from polytext import extract_text_from_file
85
+
86
+ # Extract text from any supported file
87
+ text = extract_text_from_file('document.docx')
88
+ print(f"Extracted text: {text}")
89
+ ```
90
+
91
+ ## License
92
+
93
+ MIT License
@@ -0,0 +1,53 @@
1
+ # polytext
2
+
3
+ # Doc Utils
4
+
5
+ A Python package for document conversion and text extraction.
6
+
7
+ ## Features
8
+
9
+ - Convert various document formats (DOCX, ODT, PPT, etc.) to PDF
10
+ - Extract text from PDF documents
11
+ - Support for both local files and S3 storage
12
+ - Multiple PDF parsing backends (PyPDF, PyMuPDF)
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ # Basic installation
18
+ pip install polytext
19
+ ```
20
+
21
+ ## Requirements
22
+
23
+ - Python 3.6 or higher
24
+ - LibreOffice (for PDF conversion)
25
+
26
+ ## Usage
27
+
28
+ Converting Documents to PDF
29
+
30
+ ```python
31
+ from polytext import convert_to_pdf, ConversionError
32
+
33
+ try:
34
+ # Convert a document to PDF
35
+ pdf_path = convert_to_pdf('input.docx', 'input.docx', 'output.pdf')
36
+ print(f"PDF saved to: {pdf_path}")
37
+ except ConversionError as e:
38
+ print(f"Conversion failed: {e}")
39
+ ```
40
+
41
+ Text Extraction
42
+
43
+ ```python
44
+ from polytext import extract_text_from_file
45
+
46
+ # Extract text from any supported file
47
+ text = extract_text_from_file('document.docx')
48
+ print(f"Extracted text: {text}")
49
+ ```
50
+
51
+ ## License
52
+
53
+ MIT License
@@ -0,0 +1,15 @@
1
+ # polytext/__init__.py
2
+ from .converter.pdf import convert_to_pdf, DocumentConverter
3
+ from .loader.text import get_document_text, extract_text_from_file, TextLoader
4
+ from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
5
+
6
+ __all__ = [
7
+ 'convert_to_pdf',
8
+ 'DocumentConverter',
9
+ 'get_document_text',
10
+ 'extract_text_from_file',
11
+ 'TextLoader',
12
+ 'EmptyDocument',
13
+ 'ExceededMaxPages',
14
+ 'ConversionError'
15
+ ]
@@ -0,0 +1,4 @@
1
+ # polytext/converter/__init__.py
2
+ from .pdf import convert_to_pdf, DocumentConverter
3
+
4
+ __all__ = ['convert_to_pdf', 'DocumentConverter']
@@ -0,0 +1,256 @@
1
+ # converter/pdf.py
2
+ import os
3
+ import subprocess
4
+ import logging
5
+ from ..exceptions.base import ConversionError
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def convert_to_pdf(input_file, original_file, output_file=None):
    """
    Convert a document to PDF with a throwaway DocumentConverter.

    Module-level shortcut for callers that do not need to keep a converter
    instance around; all work is delegated to
    DocumentConverter.convert_to_pdf.

    Args:
        input_file (str): Path of the document that is read and converted
        original_file (str): Path (or name) whose extension decides how the
            input is treated
        output_file (str, optional): Destination path of the resulting PDF

    Returns:
        str: Path to the generated PDF file

    Raises:
        FileNotFoundError: If the input file doesn't exist
        ConversionError: If the conversion process fails
    """
    return DocumentConverter().convert_to_pdf(
        input_file,
        original_file,
        output_file,
    )
28
+
29
+
30
class DocumentConverter:
    """
    A class for converting various document formats to PDF using LibreOffice.

    The converter supports common document formats like TXT, DOC(X), ODT, PPT(X),
    and XLS(X). It requires the ``libreoffice`` executable to be reachable
    through the system PATH.

    Attributes:
        supported_extensions (list): Lower-case file extensions (with leading
            dot) that are expected to convert; other extensions only trigger
            a warning, they are not rejected.
    """

    def __init__(self):
        """Initialize the DocumentConverter."""
        self.supported_extensions = [
            '.txt', '.docx', '.doc', '.odt',
            '.ppt', '.pptx', '.xlsx', '.xls', '.ods'
        ]

    @staticmethod
    def check_libreoffice_installed():
        """
        Check if LibreOffice is installed and accessible in the system PATH.

        Runs ``libreoffice --version`` and discards its output.

        Returns:
            bool: True if the command could be started and exited with
                status 0, False otherwise.
        """
        try:
            result = subprocess.run(
                ['libreoffice', '--version'],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False
            )
        except (subprocess.SubprocessError, OSError):
            # OSError covers a missing binary (FileNotFoundError) as well as
            # one that exists but cannot be executed (PermissionError).
            return False
        # A binary that starts but exits non-zero is not usable either.
        return result.returncode == 0

    def convert_to_pdf(self, input_file, original_file, output_file=None):
        """
        Convert a document to PDF format using LibreOffice.

        This method uses LibreOffice in headless mode to convert documents. If the
        extension of ``original_file`` is ``.pdf``, the input is treated as
        already being a PDF and is copied to the output location instead.

        Args:
            input_file (str): Path to the input document file to be converted
            original_file (str): Path to the original file for extension checking
            output_file (str, optional): Path where the output PDF should be saved.
                If not provided, will use input_file name with .pdf extension

        Returns:
            str: Path to the generated PDF file (``output_file`` as given, or
                the derived default)

        Raises:
            FileNotFoundError: If the input file doesn't exist
            ConversionError: If LibreOffice is not installed, exits with an
                error, or does not produce the expected PDF file
        """
        import shutil  # only needed by this method

        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file '{input_file}' does not exist.")

        # The extension is taken from original_file, not input_file, so the
        # input may live under a different (e.g. temporary) name.
        _, ext = os.path.splitext(original_file)
        ext_lower = ext.lower()
        logger.info(f"Detected extension '{ext}' for original file '{original_file}'")
        if ext_lower != '.pdf' and ext_lower not in self.supported_extensions:
            logger.warning(f"File extension '{ext}' may not be supported.")

        # Set default output file name if not provided
        if output_file is None:
            output_file = os.path.splitext(input_file)[0] + '.pdf'

        output_dir = os.path.dirname(os.path.abspath(output_file))
        os.makedirs(output_dir, exist_ok=True)

        # If the file is already a PDF, just copy it
        if ext_lower == '.pdf':
            if os.path.exists(output_file) and os.path.samefile(input_file, output_file):
                # Happens when output_file defaults to the input path itself;
                # shutil.copy2 would raise SameFileError here.
                logger.info(f"File is already a PDF at '{output_file}'. Nothing to copy.")
            else:
                shutil.copy2(input_file, output_file)
                logger.info(f"File is already a PDF. Copied to '{output_file}'")
            return output_file

        # Check if LibreOffice is installed
        if not self.check_libreoffice_installed():
            raise ConversionError(
                "LibreOffice is not installed or not found in PATH. "
                "Please install LibreOffice to convert documents to PDF."
            )

        # Build the LibreOffice command
        command = [
            'libreoffice',
            '--headless',
            '--nologo',
            '--nofirststartwizard',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            input_file
        ]

        try:
            # Suppress Java runtime warnings by redirecting stderr
            subprocess.check_call(command, stderr=subprocess.DEVNULL)
        except subprocess.CalledProcessError as e:
            error_msg = f"Error during conversion: {e}"
            logger.error(error_msg)
            raise ConversionError(error_msg, e) from e

        # The result is expected in output_dir under the input file's base
        # name with a .pdf extension.
        converted_file = os.path.join(
            output_dir,
            os.path.splitext(os.path.basename(input_file))[0] + '.pdf'
        )
        if not os.path.exists(converted_file):
            # A zero exit status does not guarantee that a file was written.
            error_msg = (
                f"Error during conversion: expected output file "
                f"'{converted_file}' was not created."
            )
            logger.error(error_msg)
            raise ConversionError(error_msg)

        # Compare absolute paths so a relative output_file that already points
        # at the converted file is not renamed onto itself. os.replace also
        # overwrites an existing destination on every platform.
        if os.path.abspath(output_file) != converted_file:
            os.replace(converted_file, output_file)

        logger.info(f"Conversion successful: '{output_file}'")
        return output_file
147
+
148
+
149
+ # Alternative method with direct page_range management
150
+
151
+ # def convert_to_pdf(self, input_file, output_file=None, page_range=None):
152
+ # """
153
+ # Converts a document to PDF format using LibreOffice.
154
+ # """
155
+ # if not os.path.exists(input_file):
156
+ # raise FileNotFoundError(f"Input file '{input_file}' does not exist.")
157
+ #
158
+ # # Check file extension
159
+ # _, ext = os.path.splitext(input_file)
160
+ # logger.info(os.path.splitext(input_file))
161
+ #
162
+ # # Set default output file name if not provided
163
+ # if output_file is None:
164
+ # output_file = os.path.splitext(input_file)[0] + '.pdf'
165
+ #
166
+ # output_dir = os.path.dirname(os.path.abspath(output_file))
167
+ # if not os.path.exists(output_dir):
168
+ # os.makedirs(output_dir)
169
+ #
170
+ # # If the file is already a PDF, just copy it
171
+ # if ext.lower() == '.pdf':
172
+ # import shutil
173
+ # shutil.copy2(input_file, output_file)
174
+ # logger.info(f"File is already a PDF. Copied to '{output_file}'")
175
+ # return output_file
176
+ #
177
+ # # Check if LibreOffice is installed
178
+ # if not self.check_libreoffice_installed():
179
+ # raise ConversionError(
180
+ # "LibreOffice is not installed or not found in PATH. "
181
+ # "Please install LibreOffice to convert documents to PDF."
182
+ # )
183
+ #
184
+ # # Record existing PDFs in the output directory
185
+ # import glob
186
+ # existing_pdfs = set(glob.glob(os.path.join(output_dir, "*.pdf")))
187
+ #
188
+ # # Build the LibreOffice command
189
+ # convert_filter = 'pdf'
190
+ # if page_range:
191
+ # convert_filter = f'pdf:writer_pdf_Export:{{"PageRange":{{"type":"string","value":"{page_range}"}}}}'
192
+ #
193
+ # command = [
194
+ # 'libreoffice',
195
+ # '--headless',
196
+ # '--nologo',
197
+ # '--nofirststartwizard',
198
+ # '--convert-to', convert_filter,
199
+ # '--outdir', output_dir,
200
+ # input_file
201
+ # ]
202
+ #
203
+ # try:
204
+ # # Run the command and capture output
205
+ # result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
206
+ # text=True, check=False)
207
+ #
208
+ # # Check if command was successful
209
+ # if result.returncode != 0:
210
+ # error_msg = f"Error during conversion: {result.stderr}"
211
+ # logger.error(error_msg)
212
+ # raise ConversionError(error_msg)
213
+ #
214
+ # # Log the output to help debugging
215
+ # logger.info(f"LibreOffice conversion output: {result.stdout}")
216
+ #
217
+ # # Find newly created PDF file
218
+ # current_pdfs = set(glob.glob(os.path.join(output_dir, "*.pdf")))
219
+ # new_pdfs = current_pdfs - existing_pdfs
220
+ #
221
+ # if not new_pdfs:
222
+ # # Try looking in /private path as well (for macOS)
223
+ # if output_dir.startswith('/var/'):
224
+ # private_dir = '/private' + output_dir
225
+ # private_pdfs = set(glob.glob(os.path.join(private_dir, "*.pdf")))
226
+ # new_pdfs = private_pdfs - existing_pdfs
227
+ #
228
+ # if not new_pdfs:
229
+ # # Last resort: find the most recently created PDF
230
+ # all_pdfs = glob.glob(os.path.join(output_dir, "*.pdf"))
231
+ # if not all_pdfs and output_dir.startswith('/var/'):
232
+ # private_dir = '/private' + output_dir
233
+ # all_pdfs = glob.glob(os.path.join(private_dir, "*.pdf"))
234
+ #
235
+ # if all_pdfs:
236
+ # converted_file = max(all_pdfs, key=os.path.getmtime)
237
+ # logger.info(f"Found most recent PDF: {converted_file}")
238
+ # else:
239
+ # raise ConversionError(f"No PDF files found in output directory after conversion.")
240
+ # else:
241
+ # converted_file = list(new_pdfs)[0]
242
+ # logger.info(f"Found newly created PDF: {converted_file}")
243
+ #
244
+ # # Move to desired output location if needed
245
+ # if converted_file != output_file:
246
+ # import shutil
247
+ # shutil.copy2(converted_file, output_file)
248
+ # os.remove(converted_file) # Clean up the original
249
+ # logger.info(f"Moved PDF to final location: {output_file}")
250
+ #
251
+ # return output_file
252
+ #
253
+ # except Exception as e:
254
+ # error_msg = f"Error during PDF conversion: {str(e)}"
255
+ # logger.error(error_msg)
256
+ # raise ConversionError(error_msg)
@@ -0,0 +1,4 @@
1
+ # polytext/exceptions/__init__.py
2
+ from .base import EmptyDocument, ExceededMaxPages, ConversionError
3
+
4
+ __all__ = ['EmptyDocument', 'ExceededMaxPages', 'ConversionError']
@@ -0,0 +1,52 @@
1
+ # exceptions/base.py
2
class ConversionError(Exception):
    """
    Signals that turning a document into a PDF did not succeed.

    Carries a human-readable description and, optionally, the lower-level
    exception that triggered the failure.

    Attributes:
        message (str): Description of the conversion failure; also used as
            the exception's string value
        original_exception: The underlying exception, or None when there is none
    """

    def __init__(self, message, original_exception=None):
        # Keep the details as attributes, then let Exception record the message.
        self.original_exception = original_exception
        self.message = message
        super().__init__(message)
18
+
19
+
20
class EmptyDocument(Exception):
    """
    Signals that a document yielded no usable text.

    Intended for cases where extraction returns nothing, or where the
    extracted text is rejected by quality checks (for example too few
    characters or excessive repeated content).

    Attributes:
        message (str): Explanation of why the document counts as empty; also
            used as the exception's string value
        code (int): Optional error code categorizing the kind of emptiness
            (default: None)
    """

    def __init__(self, message, code=None):
        # Store the extra fields first, then hand the message to Exception.
        self.code = code
        self.message = message
        super().__init__(message)
36
+
37
+
38
class ExceededMaxPages(Exception):
    """
    Signals that a requested page range does not fit the document.

    Intended for requests that reach past the document's real page count or
    that specify an invalid page range.

    Attributes:
        message (str): Explanation of the page range problem; also used as
            the exception's string value
        code (int): Optional error code for tracking purposes (default: None)
    """

    def __init__(self, message, code=None):
        # Store the extra fields first, then hand the message to Exception.
        self.code = code
        self.message = message
        super().__init__(message)
@@ -0,0 +1,4 @@
1
+ # polytext/loader/__init__.py
2
+ from .text import get_document_text, extract_text_from_file, TextLoader
3
+
4
+ __all__ = ['get_document_text', 'extract_text_from_file', 'TextLoader']