polytext 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
polytext/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ # polytext/__init__.py
2
+ from .converter.pdf import convert_to_pdf, DocumentConverter
3
+ from .loader.text import get_document_text, extract_text_from_file, TextLoader
4
+ from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
5
+
6
+ __all__ = [
7
+ 'convert_to_pdf',
8
+ 'DocumentConverter',
9
+ 'get_document_text',
10
+ 'extract_text_from_file',
11
+ 'TextLoader',
12
+ 'EmptyDocument',
13
+ 'ExceededMaxPages',
14
+ 'ConversionError'
15
+ ]
@@ -0,0 +1,4 @@
1
+ # polytext/converter/__init__.py
2
+ from .pdf import convert_to_pdf, DocumentConverter
3
+
4
+ __all__ = ['convert_to_pdf', 'DocumentConverter']
@@ -0,0 +1,256 @@
1
+ # converter/pdf.py
2
+ import os
3
+ import subprocess
4
+ import logging
5
+ from ..exceptions.base import ConversionError
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def convert_to_pdf(input_file, original_file, output_file=None):
    """
    Convert a document to PDF with a throwaway DocumentConverter.

    This is a module-level shortcut: it builds a fresh DocumentConverter and
    forwards all three arguments to its ``convert_to_pdf`` method unchanged.

    Args:
        input_file (str): Path to the document that should be converted
        original_file (str): Path (or name) whose extension is used for the format check
        output_file (str, optional): Destination path for the PDF

    Returns:
        str: Path to the generated PDF file

    Raises:
        FileNotFoundError: If the input file doesn't exist
        ConversionError: If the conversion process fails
    """
    return DocumentConverter().convert_to_pdf(input_file, original_file, output_file)
28
+
29
+
30
class DocumentConverter:
    """
    A class for converting various document formats to PDF using LibreOffice.

    The converter recognises common document formats like TXT, DOC(X), ODT,
    PPT(X), XLS(X) and ODS. It shells out to the ``libreoffice`` executable,
    which therefore has to be installed and reachable through PATH.

    Attributes:
        supported_extensions (list): Lower-case file extensions (with leading
            dot) that are converted without a warning being logged
    """

    def __init__(self):
        """Initialize the DocumentConverter."""
        self.supported_extensions = [
            '.txt', '.docx', '.doc', '.odt',
            '.ppt', '.pptx', '.xlsx', '.xls', '.ods',
        ]

    @staticmethod
    def check_libreoffice_installed():
        """
        Check if LibreOffice can be launched from the system PATH.

        The exit status of ``libreoffice --version`` is deliberately ignored
        (``check=False``): only the ability to start the process matters.

        Returns:
            bool: True if the executable could be started, False otherwise.
        """
        try:
            subprocess.run(
                ['libreoffice', '--version'],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
        except (subprocess.SubprocessError, OSError):
            # OSError covers FileNotFoundError (not on PATH) as well as
            # PermissionError (found but not executable).
            return False
        return True

    def convert_to_pdf(self, input_file, original_file, output_file=None):
        """
        Convert a document to PDF format using LibreOffice.

        LibreOffice is run in headless mode. If the extension of
        ``original_file`` is ``.pdf`` the input is simply copied to the output
        location and LibreOffice is not involved.

        Args:
            input_file (str): Path to the input document file to be converted
            original_file (str): Path to the original file for extension checking
            output_file (str, optional): Path where the output PDF should be saved.
                If not provided, will use input_file name with .pdf extension

        Returns:
            str: Path to the generated PDF file

        Raises:
            FileNotFoundError: If the input file doesn't exist
            ConversionError: If LibreOffice is not installed, exits with an
                error, or finishes without producing the expected PDF
        """
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file '{input_file}' does not exist.")

        # The extension check is based on the original name because
        # input_file may be a temporary file without any extension.
        _, ext = os.path.splitext(original_file)
        normalized_ext = ext.lower()
        logger.info(os.path.splitext(original_file))
        if normalized_ext != '.pdf' and normalized_ext not in self.supported_extensions:
            logger.warning(f"File extension '{ext}' may not be supported.")

        # Set default output file name if not provided
        if output_file is None:
            output_file = os.path.splitext(input_file)[0] + '.pdf'

        output_path = os.path.abspath(output_file)
        output_dir = os.path.dirname(output_path)
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)

        # If the file is already a PDF, just copy it
        if normalized_ext == '.pdf':
            import shutil
            try:
                shutil.copy2(input_file, output_file)
            except shutil.SameFileError:
                # Input and output are the same file (e.g. default output
                # name for a .pdf input): nothing to copy.
                pass
            logger.info(f"File is already a PDF. Copied to '{output_file}'")
            return output_file

        # Check if LibreOffice is installed
        if not self.check_libreoffice_installed():
            raise ConversionError(
                "LibreOffice is not installed or not found in PATH. "
                "Please install LibreOffice to convert documents to PDF."
            )

        # Build the LibreOffice command (argument list, no shell involved)
        command = [
            'libreoffice',
            '--headless',
            '--nologo',
            '--nofirststartwizard',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            input_file,
        ]

        try:
            # Suppress Java runtime warnings by redirecting stderr
            subprocess.check_call(command, stderr=subprocess.DEVNULL)
        except subprocess.CalledProcessError as e:
            error_msg = f"Error during conversion: {e}"
            logger.error(error_msg)
            raise ConversionError(error_msg, e) from e

        # LibreOffice names its output after the input file's base name; make
        # sure that file really exists before handing a path back to the caller.
        converted_file = os.path.join(
            output_dir,
            os.path.splitext(os.path.basename(input_file))[0] + '.pdf'
        )
        if not os.path.exists(converted_file):
            error_msg = f"LibreOffice finished without producing '{converted_file}'"
            logger.error(error_msg)
            raise ConversionError(error_msg)

        if converted_file != output_path:
            # os.replace overwrites an existing destination on every platform;
            # os.rename fails on Windows when the target (for instance a file
            # pre-created with mkstemp) already exists.
            os.replace(converted_file, output_file)

        logger.info(f"Conversion successful: '{output_file}'")
        return output_file
147
+
148
+
149
+ # Alternative method with direct page_range management
150
+
151
+ # def convert_to_pdf(self, input_file, output_file=None, page_range=None):
152
+ # """
153
+ # Converts a document to PDF format using LibreOffice.
154
+ # """
155
+ # if not os.path.exists(input_file):
156
+ # raise FileNotFoundError(f"Input file '{input_file}' does not exist.")
157
+ #
158
+ # # Check file extension
159
+ # _, ext = os.path.splitext(input_file)
160
+ # logger.info(os.path.splitext(input_file))
161
+ #
162
+ # # Set default output file name if not provided
163
+ # if output_file is None:
164
+ # output_file = os.path.splitext(input_file)[0] + '.pdf'
165
+ #
166
+ # output_dir = os.path.dirname(os.path.abspath(output_file))
167
+ # if not os.path.exists(output_dir):
168
+ # os.makedirs(output_dir)
169
+ #
170
+ # # If the file is already a PDF, just copy it
171
+ # if ext.lower() == '.pdf':
172
+ # import shutil
173
+ # shutil.copy2(input_file, output_file)
174
+ # logger.info(f"File is already a PDF. Copied to '{output_file}'")
175
+ # return output_file
176
+ #
177
+ # # Check if LibreOffice is installed
178
+ # if not self.check_libreoffice_installed():
179
+ # raise ConversionError(
180
+ # "LibreOffice is not installed or not found in PATH. "
181
+ # "Please install LibreOffice to convert documents to PDF."
182
+ # )
183
+ #
184
+ # # Record existing PDFs in the output directory
185
+ # import glob
186
+ # existing_pdfs = set(glob.glob(os.path.join(output_dir, "*.pdf")))
187
+ #
188
+ # # Build the LibreOffice command
189
+ # convert_filter = 'pdf'
190
+ # if page_range:
191
+ # convert_filter = f'pdf:writer_pdf_Export:{{"PageRange":{{"type":"string","value":"{page_range}"}}}}'
192
+ #
193
+ # command = [
194
+ # 'libreoffice',
195
+ # '--headless',
196
+ # '--nologo',
197
+ # '--nofirststartwizard',
198
+ # '--convert-to', convert_filter,
199
+ # '--outdir', output_dir,
200
+ # input_file
201
+ # ]
202
+ #
203
+ # try:
204
+ # # Run the command and capture output
205
+ # result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
206
+ # text=True, check=False)
207
+ #
208
+ # # Check if command was successful
209
+ # if result.returncode != 0:
210
+ # error_msg = f"Error during conversion: {result.stderr}"
211
+ # logger.error(error_msg)
212
+ # raise ConversionError(error_msg)
213
+ #
214
+ # # Log the output to help debugging
215
+ # logger.info(f"LibreOffice conversion output: {result.stdout}")
216
+ #
217
+ # # Find newly created PDF file
218
+ # current_pdfs = set(glob.glob(os.path.join(output_dir, "*.pdf")))
219
+ # new_pdfs = current_pdfs - existing_pdfs
220
+ #
221
+ # if not new_pdfs:
222
+ # # Try looking in /private path as well (for macOS)
223
+ # if output_dir.startswith('/var/'):
224
+ # private_dir = '/private' + output_dir
225
+ # private_pdfs = set(glob.glob(os.path.join(private_dir, "*.pdf")))
226
+ # new_pdfs = private_pdfs - existing_pdfs
227
+ #
228
+ # if not new_pdfs:
229
+ # # Last resort: find the most recently created PDF
230
+ # all_pdfs = glob.glob(os.path.join(output_dir, "*.pdf"))
231
+ # if not all_pdfs and output_dir.startswith('/var/'):
232
+ # private_dir = '/private' + output_dir
233
+ # all_pdfs = glob.glob(os.path.join(private_dir, "*.pdf"))
234
+ #
235
+ # if all_pdfs:
236
+ # converted_file = max(all_pdfs, key=os.path.getmtime)
237
+ # logger.info(f"Found most recent PDF: {converted_file}")
238
+ # else:
239
+ # raise ConversionError(f"No PDF files found in output directory after conversion.")
240
+ # else:
241
+ # converted_file = list(new_pdfs)[0]
242
+ # logger.info(f"Found newly created PDF: {converted_file}")
243
+ #
244
+ # # Move to desired output location if needed
245
+ # if converted_file != output_file:
246
+ # import shutil
247
+ # shutil.copy2(converted_file, output_file)
248
+ # os.remove(converted_file) # Clean up the original
249
+ # logger.info(f"Moved PDF to final location: {output_file}")
250
+ #
251
+ # return output_file
252
+ #
253
+ # except Exception as e:
254
+ # error_msg = f"Error during PDF conversion: {str(e)}"
255
+ # logger.error(error_msg)
256
+ # raise ConversionError(error_msg)
@@ -0,0 +1,4 @@
1
+ # polytext/exceptions/__init__.py
2
+ from .base import EmptyDocument, ExceededMaxPages, ConversionError
3
+
4
+ __all__ = ['EmptyDocument', 'ExceededMaxPages', 'ConversionError']
@@ -0,0 +1,52 @@
1
+ # exceptions/base.py
2
class ConversionError(Exception):
    """
    Signal that converting a document to PDF did not succeed.

    Raised by the converter when LibreOffice is unavailable or when the
    conversion command fails.

    Attributes:
        message (str): Human-readable description of the failure
        original_exception: Lower-level exception that triggered this error,
            or None when there is none
    """

    def __init__(self, message, original_exception=None):
        # Keep the message both in Exception.args and as a named attribute.
        Exception.__init__(self, message)
        self.original_exception = original_exception
        self.message = message
18
+
19
+
20
class EmptyDocument(Exception):
    """
    Signal that no usable text could be obtained from a document.

    Raised when extraction returns nothing, or when the extracted text is
    rejected by a quality check (too short, too many repeated rows).

    Attributes:
        message (str): Reason the document is treated as empty
        code (int): Optional numeric code attached by the raiser (default: None)
    """

    def __init__(self, message, code=None):
        # Keep the message both in Exception.args and as a named attribute.
        Exception.__init__(self, message)
        self.code = code
        self.message = message
36
+
37
+
38
class ExceededMaxPages(Exception):
    """
    Signal that a requested page range does not fit the document.

    Raised when pages beyond the document's page count are requested or
    when the supplied page range is otherwise invalid.

    Attributes:
        message (str): Description of the page range problem
        code (int): Optional numeric code attached by the raiser (default: None)
    """

    def __init__(self, message, code=None):
        # Keep the message both in Exception.args and as a named attribute.
        Exception.__init__(self, message)
        self.code = code
        self.message = message
@@ -0,0 +1,4 @@
1
+ # polytext/loader/__init__.py
2
+ from .text import get_document_text, extract_text_from_file, TextLoader
3
+
4
+ __all__ = ['get_document_text', 'extract_text_from_file', 'TextLoader']
@@ -0,0 +1,606 @@
1
+ # loader/text.py
2
+ # Standard library imports
3
+ import os
4
+ import re
5
+ import tempfile
6
+ import logging
7
+ from collections import Counter
8
+
9
+ # Third-party imports
10
+ from pypdf import PdfReader
11
+ import fitz # PyMuPDF
12
+ from botocore.exceptions import ClientError
13
+
14
+ # Local imports
15
+ from ..converter.pdf import convert_to_pdf
16
+ from ..exceptions.base import EmptyDocument, ExceededMaxPages
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ # Standalone functions that wrap TextLoader methods
22
def get_document_text(doc_data, page_range=None):
    """
    Extract a document's text through a default-constructed TextLoader.

    Module-level shortcut: a TextLoader is created without arguments (so no
    S3 client and no default bucket are configured on it) and the call is
    forwarded to its ``get_document_text`` method.

    Args:
        doc_data (dict): Dictionary containing 'file_path' and optional 'bucket'
        page_range (tuple, optional): Tuple of (start_page, end_page) for partial extraction

    Returns:
        str: Extracted text from the document

    Raises:
        EmptyDocument: If extracted text is empty or fails quality checks
        ExceededMaxPages: If requested page range is invalid
    """
    return TextLoader().get_document_text(doc_data, page_range)
39
+
40
+
41
def extract_text_from_file(file_path, page_range=None, backend='auto'):
    """
    Extract text from a local file through a default-constructed TextLoader.

    Module-level shortcut that forwards its arguments unchanged to
    ``TextLoader.extract_text_from_file``.

    Args:
        file_path (str): Path to the local file
        page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed
        backend (str, optional): Text extraction backend ('auto', 'pymupdf', or 'pypdf')

    Returns:
        str: Extracted text from the document

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If invalid backend is specified
        EmptyDocument: If no text could be extracted
    """
    return TextLoader().extract_text_from_file(file_path, page_range, backend)
60
+
61
+
62
class TextLoader:
    """
    Loads and extracts text from documents with support for S3 storage.

    This class handles document downloading from S3, conversion to PDF, and text extraction
    using different backends (PyMuPDF and PyPDF). It supports various document formats
    through conversion to PDF using LibreOffice.

    Attributes:
        s3_client: Boto3 S3 client for AWS operations (None unless supplied)
        document_aws_bucket (str): Default S3 bucket name for document storage
    """

    # Run of replacement-style characters; when it shows up in PyMuPDF output
    # the extraction is treated as garbled and PyPDF is used instead.
    _GARBLED_MARKER = "������������������������������������������"
    # Number of trailing pages whose text is kept for the repeated-rows check.
    _TAIL_PAGES = 10

    def __init__(self, s3_client=None, document_aws_bucket=None):
        """
        Initialize TextLoader with optional S3 configuration.

        Args:
            s3_client: Boto3 S3 client instance for AWS operations (optional)
            document_aws_bucket (str): Default S3 bucket name for document storage (optional)
        """
        self.s3_client = s3_client
        self.document_aws_bucket = document_aws_bucket

    # S3-related methods

    def download_file_from_s3(self, bucket, file_path, temp_file_path):
        """
        Download a file from S3 to a local temporary path.

        Tries the key as given, then the same key with the case of its
        extension flipped (``.pdf`` <-> ``.PDF``). If both downloads fail the
        work is handed to ``convert_doc_to_pdf``.

        Args:
            bucket (str): S3 bucket name
            file_path (str): Path to file in S3 bucket
            temp_file_path (str): Local path to save the downloaded file

        Returns:
            str: Path to the downloaded file (may be converted to PDF)

        Raises:
            FileNotFoundError: If the conversion fallback finds no matching object
            ConversionError: If the conversion fallback fails
        """
        try:
            self.s3_client.download_file(Bucket=bucket, Key=file_path, Filename=temp_file_path)
            logger.info(f'Downloaded {file_path} to {temp_file_path}')
            return temp_file_path
        except ClientError as e:
            logger.info(e)

        # Flip only the extension's case; a plain str.replace would also
        # rewrite ".pdf" occurring elsewhere in the key.
        root, ext = os.path.splitext(file_path)
        alternate_key = root + (ext.lower() if ext.isupper() else ext.upper())
        if alternate_key != file_path:
            try:
                self.s3_client.download_file(Bucket=bucket,
                                             Key=alternate_key,
                                             Filename=temp_file_path)
                return temp_file_path
            except Exception as e:
                # Deliberately broad: any failure here leads to the
                # conversion fallback below.
                logger.info(e)

        return self.convert_doc_to_pdf(bucket=bucket,
                                       file_prefix=file_path,
                                       input_file=temp_file_path)

    def convert_doc_to_pdf(self, bucket, file_prefix, input_file):
        """
        Convert a document from S3 to PDF format.

        Downloads the first object whose key starts with ``file_prefix`` into
        ``input_file``, converts it to a new temporary PDF and removes
        ``input_file`` afterwards.

        Args:
            bucket (str): S3 bucket name
            file_prefix (str): Prefix to match files in S3
            input_file (str): Local path to save the downloaded file

        Returns:
            str: Path to the converted PDF file

        Raises:
            FileNotFoundError: If no matching file is found in S3
            ConversionError: If PDF conversion fails
        """
        logger.info(f"bucket: {bucket}")
        logger.info(f"file_prefix: {file_prefix}")
        logger.info(f"input_file: {input_file}")

        # List objects in S3 bucket
        response = self.s3_client.list_objects_v2(Bucket=bucket, Prefix=file_prefix)
        contents = response.get('Contents')
        if not contents:
            raise FileNotFoundError("No matching file found in S3 bucket.")

        # Get the first matching object and download it
        matching_file = contents[0]['Key']
        self.s3_client.download_file(
            Bucket=bucket,
            Key=matching_file,
            Filename=input_file
        )

        # The output file is created only once there is something to
        # convert, so a failed lookup/download cannot leave it behind.
        fd, output_file = tempfile.mkstemp(suffix=".pdf")
        os.close(fd)  # Close file descriptor explicitly
        try:
            logger.info("Using LibreOffice")
            # NOTE(review): the extension check uses file_prefix, not the key
            # that was actually downloaded - confirm this is intended.
            convert_to_pdf(input_file=input_file, output_file=output_file, original_file=file_prefix)
        except BaseException:
            self._remove_quietly(output_file)
            raise
        logger.info("Document converted to pdf")
        os.remove(input_file)
        return output_file

    # PDF text extraction methods

    def get_document_text(self, doc_data, page_range=None):
        """
        Extract text from a document using PyMuPDF as primary backend.

        Downloads the document from S3, converts to PDF if necessary, and
        extracts text with quality checks and early termination conditions.
        If the result contains the garbled-text marker the extraction is
        repeated with PyPDF.

        Args:
            doc_data (dict): Dictionary containing 'file_path' and optional 'bucket'
            page_range (tuple, optional): Tuple of (start_page, end_page) for partial extraction

        Returns:
            str: Extracted text from the document

        Raises:
            EmptyDocument: If extracted text is empty or fails quality checks
            ExceededMaxPages: If requested page range is invalid
        """
        logger.debug("Using PyMuPDF")
        file_path = doc_data["file_path"]
        bucket = doc_data.get("bucket") or self.document_aws_bucket

        temp_file_path, pdf_document = self._open_remote_pdf(bucket, file_path, fitz.open)
        try:
            total_pages = pdf_document.page_count
            logger.info(f"Total pages: {total_pages}")

            # Validate and adjust page range
            start_page, end_page = self.validate_page_range(page_range, total_pages)
            raw_pages = (
                pdf_document.load_page(page_number).get_text("text", flags=16)
                for page_number in range(start_page, end_page)
            )
            text = self._collect_text(raw_pages, start_page, total_pages)
        finally:
            # Runs on success and on every raised exception, so neither the
            # document handle nor the temporary file is leaked.
            pdf_document.close()
            self._remove_quietly(temp_file_path)

        if len(text) == 0:
            message = "No text detected"
            logger.info(message)
            raise EmptyDocument(message=message, code=998)
        if self._GARBLED_MARKER in text:
            logger.info("Using pypdf being strange PDF")
            return self.get_document_text_pypdf(bucket=bucket, file_path=file_path, page_range=page_range)
        if len(text) < 800:
            message = "Document text with less than 800 characters"
            raise EmptyDocument(message=message, code=998)

        return text

    def get_document_text_pypdf(self, bucket, file_path, page_range=None):
        """
        Extract text from a document using PyPDF as fallback backend.

        Similar to get_document_text but uses PyPDF for extraction. Useful when
        PyMuPDF fails to extract text properly.

        Args:
            bucket (str): S3 bucket name
            file_path (str): Path to file in S3
            page_range (tuple, optional): Tuple of (start_page, end_page) for partial extraction

        Returns:
            str: Extracted text from the document

        Raises:
            EmptyDocument: If extracted text is empty or fails quality checks
            ExceededMaxPages: If requested page range is invalid
        """
        logger.info("Using PyPDF")

        temp_file_path, (pdf_reader, handle) = self._open_remote_pdf(
            bucket, file_path, self._open_with_pypdf
        )
        try:
            total_pages = len(pdf_reader.pages)

            # Validate and adjust page range
            start_page, end_page = self.validate_page_range(page_range, total_pages)
            raw_pages = (
                pdf_reader.pages[page_number].extract_text()
                for page_number in range(start_page, end_page)
            )
            text = self._collect_text(raw_pages, start_page, total_pages)
        finally:
            handle.close()
            self._remove_quietly(temp_file_path)

        if len(text) == 0:
            message = "No text detected"
            logger.info(message)
            raise EmptyDocument(message=message, code=998)

        return text

    def extract_text_from_file(self, file_path, page_range=None, backend='auto'):
        """
        Extract text from a local file using specified backend.

        Non-PDF files are first converted to a temporary PDF. With the
        'pymupdf' backend (also chosen for 'auto'), PyPDF is used as a
        fallback when PyMuPDF fails, returns no text, or returns garbled text.

        Args:
            file_path (str): Path to the local file
            page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed
            backend (str, optional): Text extraction backend ('auto', 'pymupdf', or 'pypdf')

        Returns:
            str: Extracted text from the document

        Raises:
            FileNotFoundError: If input file doesn't exist
            ValueError: If invalid backend is specified
            EmptyDocument: If no text could be extracted
            ExceededMaxPages: If requested page range is invalid
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File '{file_path}' does not exist.")

        # Validate backend option
        valid_backends = ['auto', 'pymupdf', 'pypdf']
        if backend not in valid_backends:
            raise ValueError(f"Invalid backend '{backend}'. Must be one of {valid_backends}")

        # Determine backend to use if 'auto'
        if backend == 'auto':
            backend = 'pymupdf'  # Default to PyMuPDF for better extraction quality

        # Check if file needs conversion to PDF
        file_ext = os.path.splitext(file_path)[1].lower()
        temp_pdf_path = None

        try:
            if file_ext != '.pdf':
                logger.info(f"Converting {file_ext} file to PDF...")
                fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf")
                os.close(fd)  # Close the file descriptor

                # Convert to PDF using the converter
                pdf_path = convert_to_pdf(input_file=file_path, original_file=file_path, output_file=temp_pdf_path)
            else:
                pdf_path = file_path

            if backend == 'pymupdf':
                text = self._extract_local_pymupdf(pdf_path, page_range)
                if text is not None:
                    return text

            return self._extract_local_pypdf(pdf_path, page_range)

        finally:
            # Clean up temporary file
            if temp_pdf_path is not None:
                self._remove_quietly(temp_pdf_path)

    # Private helpers

    def _open_remote_pdf(self, bucket, file_path, opener):
        """
        Fetch ``file_path`` from S3 as a local PDF and open it with ``opener``.

        Non-PDF keys are converted straight away. PDF keys are downloaded and
        opened; if opening fails the conversion path is tried once.

        Returns:
            tuple: (local_path, whatever ``opener(local_path)`` returned).
            The caller owns both and must release them.
        """
        fd, local_path = tempfile.mkstemp()
        os.close(fd)  # mkstemp returns an open descriptor; only the path is needed
        try:
            if os.path.splitext(file_path)[1].lower() != ".pdf":
                logger.info("Converting file to PDF")
                local_path = self.convert_doc_to_pdf(bucket=bucket, file_prefix=file_path, input_file=local_path)
                logger.debug(f"temp_file_path post conversion to pdf: {local_path}")
                return local_path, opener(local_path)

            local_path = self.download_file_from_s3(bucket, file_path, local_path)
            logger.debug(f"temp_file_path: {local_path}")
            try:
                opened = opener(local_path)
                logger.info(f"Successfully opened file with temp_file_path: {local_path}")
            except Exception:
                logger.info("Converting file to PDF")
                local_path = self.convert_doc_to_pdf(bucket=bucket, file_prefix=file_path, input_file=local_path)
                logger.debug(f"temp_file_path post conversion to pdf: {local_path}")
                opened = opener(local_path)
            return local_path, opened
        except BaseException:
            # Nothing is handed back to the caller, so clean up here.
            self._remove_quietly(local_path)
            raise

    @staticmethod
    def _open_with_pypdf(path):
        """Open ``path`` with PyPDF; return (reader, file handle to close later)."""
        handle = open(path, "rb")
        try:
            return PdfReader(handle), handle
        except BaseException:
            handle.close()
            raise

    def _collect_text(self, raw_pages, start_page, total_pages):
        """Clean and concatenate page texts, applying the early-termination checks per page."""
        text = ""
        last_pages_text = ""
        tail_start = total_pages - self._TAIL_PAGES
        for page_number, raw_text in enumerate(raw_pages, start=start_page):
            page_text = self.clean_text(raw_text) or ""
            text += page_text
            if page_number >= tail_start:
                last_pages_text += page_text
            self._check_early_termination(text, last_pages_text, page_number, total_pages)
        return text

    def _check_early_termination(self, text, last_pages_text, page_number, total_pages):
        """Raise EmptyDocument when the text gathered so far fails a quality heuristic."""
        message = None
        if page_number == 10 and len(text) == 0:
            message = "First 10 pages of the document are empty"
        elif page_number == 20 and len(text) < 800:
            message = "First 20 pages of the document have less than 800 chars"
        elif total_pages >= 500:
            # The repeated-rows checks only apply to very long documents.
            if page_number == 10 and self.has_repeated_rows(text=text, threshold=100):
                message = "First 10 pages of the document have 100 repeated rows"
            elif (page_number == total_pages - 1
                  and self.has_repeated_rows(text=last_pages_text, threshold=100)):
                message = "Last 10 pages of the document have 100 repeated rows"

        if message is not None:
            logger.info(message)
            raise EmptyDocument(message=message, code=998)

    def _extract_local_pymupdf(self, pdf_path, page_range):
        """Return text extracted with PyMuPDF, or None when PyPDF should be tried instead."""
        logger.debug("Using PyMuPDF for text extraction")
        try:
            pdf_document = fitz.open(pdf_path)
            try:
                # Validate and adjust page range
                start_page, end_page = self.validate_page_range(page_range, pdf_document.page_count)
                pieces = [
                    self.clean_text(pdf_document.load_page(page_number).get_text("text", flags=16)) or ""
                    for page_number in range(start_page, end_page)
                ]
            finally:
                pdf_document.close()
        except ExceededMaxPages:
            # A bad page range is a caller error, not a backend failure.
            raise
        except Exception as e:
            logger.warning(f"PyMuPDF extraction failed: {str(e)}. Trying PyPDF.")
            return None

        text = "".join(pieces)
        if self._GARBLED_MARKER in text:
            logger.warning("PyMuPDF extracted unusual characters. Switching to PyPDF.")
            return None
        if len(text.strip()) == 0:
            logger.warning("PyMuPDF extracted no text. Switching to PyPDF.")
            return None
        return text

    def _extract_local_pypdf(self, pdf_path, page_range):
        """Return text extracted with PyPDF; raise EmptyDocument when there is none."""
        logger.debug("Using PyPDF for text extraction")
        with open(pdf_path, "rb") as file:
            pdf_reader = PdfReader(file)

            # Validate and adjust page range
            start_page, end_page = self.validate_page_range(page_range, len(pdf_reader.pages))
            text = "".join(
                self.clean_text(pdf_reader.pages[page_number].extract_text()) or ""
                for page_number in range(start_page, end_page)
            )

        if not text.strip():
            message = "No text detected in the document"
            logger.info(message)
            raise EmptyDocument(message=message, code=998)

        return text

    @staticmethod
    def _remove_quietly(path):
        """Delete ``path``; a missing file is fine, other OS errors are only logged."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
        except OSError as e:
            logger.warning(f"Failed to remove temporary file {path}: {str(e)}")

    # Helper methods

    @staticmethod
    def validate_page_range(page_range, total_pages):
        """
        Validate and normalize the page range for text extraction.

        Converts the 1-indexed, inclusive (start, end) pair into the
        0-indexed, end-exclusive pair expected by ``range``. A falsy
        ``page_range`` selects the whole document.

        Args:
            page_range (tuple): Tuple of (start_page, end_page) in 1-indexed format
            total_pages (int): Total number of pages in the document

        Returns:
            tuple: Normalized (start_page, end_page) in 0-indexed format

        Raises:
            ExceededMaxPages: If the range lies outside the document or its
                start page comes after its end page
        """
        if not page_range:
            return 0, total_pages

        first_page, last_page = page_range[0], page_range[1]
        logger.info(f"Using page range: {first_page} - {last_page}")
        if first_page < 1 or last_page > total_pages:
            raise ExceededMaxPages(
                message=f"Requested page range {page_range} exceeds document length ({total_pages})",
                code=998
            )
        if first_page > last_page:
            raise ExceededMaxPages(
                message=f"Requested page range {page_range} is invalid: start page is after end page",
                code=998
            )
        return first_page - 1, last_page

    @staticmethod
    def clean_text(text):
        """
        Clean and normalize extracted text.

        Performs standard text cleaning operations:
        - Replaces double quotes with single quotes
        - Collapses blank-line runs into a single newline
        - Removes the '<|endoftext|>' special token

        Falsy input (empty string, None) is returned unchanged.

        Args:
            text (str): Raw text to clean

        Returns:
            str: Cleaned and normalized text
        """
        if text:
            text = text.replace('"', "'")
            text = re.sub(r"\n\s*\n", "\n", text)
            text = text.replace('<|endoftext|>', '')
        return text

    @staticmethod
    def has_repeated_rows(text, threshold=100):
        """
        Check if text contains rows repeated above threshold.

        Used to detect potential extraction issues or repetitive content.
        Whitespace-only rows are ignored.

        Args:
            text (str): Text to analyze
            threshold (int): Minimum number of repetitions to trigger detection

        Returns:
            bool: True if any row occurs at least ``threshold`` times
        """
        row_counts = Counter(row for row in text.split("\n") if row.strip() != "")
        return any(count >= threshold for count in row_counts.values())

    @staticmethod
    def has_low_text_quality(text, chars_threshold=2000):
        """
        Check if extracted text has low quality.

        Looks at the first ``chars_threshold`` characters and measures the
        share of alphanumeric characters among them.

        Args:
            text (str): Text to analyze
            chars_threshold (int): Number of characters to sample

        Returns:
            bool: True if the sample is empty or at most 30% of it is alphanumeric
        """
        sample_text = text[:chars_threshold]
        if not sample_text:
            return True

        valid_chars = sum(c.isalnum() for c in sample_text)
        return valid_chars / len(sample_text) <= 0.3
605
+
606
+
File without changes
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Docsity
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,93 @@
1
+ Metadata-Version: 2.2
2
+ Name: polytext
3
+ Version: 0.1.0
4
+ Summary: Python utilities to simplify document files management
5
+ Home-page: https://github.com/docsity/polytext
6
+ Author: Matteo Senardi
7
+ Author-email: matteo.s@docsity.com
8
+ License: MIT
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3.6
12
+ Classifier: Programming Language :: Python :: 3.7
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: ~=3.6
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: pypdf==5.3.0
25
+ Requires-Dist: PyMuPDF>=1.25.3
26
+ Requires-Dist: pycryptodome==3.21.0
27
+ Requires-Dist: weasyprint==64.1
28
+ Requires-Dist: markdown==3.7
29
+ Requires-Dist: python-docx==1.1.2
30
+ Dynamic: author
31
+ Dynamic: author-email
32
+ Dynamic: classifier
33
+ Dynamic: description
34
+ Dynamic: description-content-type
35
+ Dynamic: home-page
36
+ Dynamic: license
37
+ Dynamic: requires-dist
38
+ Dynamic: requires-python
39
+ Dynamic: summary
40
+
41
+ # polytext
42
+
43
+ # Doc Utils
44
+
45
+ A Python package for document conversion and text extraction.
46
+
47
+ ## Features
48
+
49
+ - Convert various document formats (DOCX, ODT, PPT, etc.) to PDF
50
+ - Extract text from PDF documents
51
+ - Support for both local files and S3 storage
52
+ - Multiple PDF parsing backends (PyPDF, PyMuPDF)
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ # Basic installation
58
+ pip install polytext
59
+ ```
60
+
61
+ ## Requirements
62
+
63
+ - Python 3.6 or higher
64
+ - LibreOffice (for PDF conversion)
65
+
66
+ ## Usage
67
+
68
+ Converting Documents to PDF
69
+
70
+ ```python
71
+ from polytext import convert_to_pdf, ConversionError
72
+
73
+ try:
74
+ # Convert a document to PDF
75
+ pdf_path = convert_to_pdf('input.docx', 'input.docx', output_file='output.pdf')
76
+ print(f"PDF saved to: {pdf_path}")
77
+ except ConversionError as e:
78
+ print(f"Conversion failed: {e}")
79
+ ```
80
+
81
+ Text Extraction
82
+
83
+ ```python
84
+ from polytext import extract_text_from_file
85
+
86
+ # Extract text from any supported file
87
+ text = extract_text_from_file('document.docx')
88
+ print(f"Extracted text: {text}")
89
+ ```
90
+
91
+ ## License
92
+
93
+ MIT License
@@ -0,0 +1,13 @@
1
+ polytext/__init__.py,sha256=a92gWlgbNkMoOLUxmR_-mo-SyAlM9JF0MCPmV7yJrBY,442
2
+ polytext/output_manager.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ polytext/converter/__init__.py,sha256=vtqVbUqoXDhNfhJj4bJM-bkwac0P8hQ0Efkek6oLrro,134
4
+ polytext/converter/pdf.py,sha256=8DX0EiWNWKz_Uwpvf5xMtB6w-JlPgxEcKvHjriD7vxk,9894
5
+ polytext/exceptions/__init__.py,sha256=A7sckwMe832uv7DacjMvCWScT8NKqKrMtHfM8KpKvls,168
6
+ polytext/exceptions/base.py,sha256=AYq-w9aP3ll_3-bePZi5mJNc_1ZL_23ZNACOLBuBiPQ,1819
7
+ polytext/loader/__init__.py,sha256=WMvYGEviHk1e_dpIw8bwF6KsHHQdNj5W1dcqKRmvA-I,174
8
+ polytext/loader/text.py,sha256=Yb6vGUFYk6V9BDqxlk7lXrd9dZF6eSxK9gVqDPdtSMY,23709
9
+ polytext-0.1.0.dist-info/LICENSE,sha256=n-jK4xSUrmuCR9C0EsG05KUc2vi1N6UE-k2L24pKWS4,1064
10
+ polytext-0.1.0.dist-info/METADATA,sha256=2wDn_U6GSbMOlddZ7xO2oBSual5AptmDtUFQKzzDq_4,2351
11
+ polytext-0.1.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
12
+ polytext-0.1.0.dist-info/top_level.txt,sha256=dHg5ZsItizIPYaIUCzrzjEBdyx2WBf3TGPLa_iTwWg4,9
13
+ polytext-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.8.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ polytext