polytext 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polytext/__init__.py +15 -0
- polytext/converter/__init__.py +4 -0
- polytext/converter/pdf.py +256 -0
- polytext/exceptions/__init__.py +4 -0
- polytext/exceptions/base.py +52 -0
- polytext/loader/__init__.py +4 -0
- polytext/loader/text.py +606 -0
- polytext/output_manager.py +0 -0
- polytext-0.1.0.dist-info/LICENSE +21 -0
- polytext-0.1.0.dist-info/METADATA +93 -0
- polytext-0.1.0.dist-info/RECORD +13 -0
- polytext-0.1.0.dist-info/WHEEL +5 -0
- polytext-0.1.0.dist-info/top_level.txt +1 -0
polytext/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# polytext/__init__.py
|
|
2
|
+
from .converter.pdf import convert_to_pdf, DocumentConverter
|
|
3
|
+
from .loader.text import get_document_text, extract_text_from_file, TextLoader
|
|
4
|
+
from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
'convert_to_pdf',
|
|
8
|
+
'DocumentConverter',
|
|
9
|
+
'get_document_text',
|
|
10
|
+
'extract_text_from_file',
|
|
11
|
+
'TextLoader',
|
|
12
|
+
'EmptyDocument',
|
|
13
|
+
'ExceededMaxPages',
|
|
14
|
+
'ConversionError'
|
|
15
|
+
]
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
# converter/pdf.py
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
import logging
|
|
5
|
+
from ..exceptions.base import ConversionError
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def convert_to_pdf(input_file, original_file, output_file=None):
    """
    Convert a document to PDF through a freshly created DocumentConverter.

    Module-level shortcut: builds a new ``DocumentConverter`` and forwards the
    three arguments, unchanged, to its ``convert_to_pdf`` method.

    Args:
        input_file (str): Path of the document that should be converted.
        original_file (str): Path or name whose extension is inspected for
            the format check.
        output_file (str, optional): Destination path of the PDF; passed on
            as-is, ``None`` included.

    Returns:
        str: The path returned by ``DocumentConverter.convert_to_pdf``.

    Raises:
        FileNotFoundError: If the input file doesn't exist.
        ConversionError: If the conversion process fails.
    """
    return DocumentConverter().convert_to_pdf(input_file, original_file, output_file)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DocumentConverter:
    """
    A class for converting various document formats to PDF using LibreOffice.

    The converter supports common document formats like TXT, DOC(X), ODT, PPT(X),
    and XLS(X). It requires LibreOffice to be installed on the system.

    Attributes:
        supported_extensions (list): Lower-case file extensions (leading dot
            included) that are accepted without a warning.
    """

    def __init__(self):
        """Initialize the DocumentConverter with its list of known extensions."""
        self.supported_extensions = [
            '.txt', '.docx', '.doc', '.odt',
            '.ppt', '.pptx', '.xlsx', '.xls', '.ods',
        ]

    @staticmethod
    def check_libreoffice_installed():
        """
        Check whether the ``libreoffice`` executable can be launched.

        Runs ``libreoffice --version``. Only the ability to start the process
        is tested; its exit status is not inspected.

        Returns:
            bool: True if the process could be started, False otherwise.
        """
        try:
            subprocess.run(
                ['libreoffice', '--version'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
        except (subprocess.SubprocessError, OSError):
            # OSError covers FileNotFoundError (not on PATH) as well as other
            # launch failures such as PermissionError.
            return False
        return True

    def convert_to_pdf(self, input_file, original_file, output_file=None):
        """
        Convert a document to PDF format using LibreOffice.

        LibreOffice is run in headless mode with ``--outdir`` set to the
        directory of ``output_file``; the file it produces is then moved to
        ``output_file`` if the names differ. When the extension of
        ``original_file`` is ``.pdf`` no conversion happens: the input is
        copied to the output location (or left alone when both paths refer to
        the same file).

        Args:
            input_file (str): Path to the input document file to be converted.
            original_file (str): Path whose extension decides how the input is
                treated; ``input_file`` itself may be a temporary file without
                a meaningful suffix.
            output_file (str, optional): Path where the output PDF should be
                saved. If not provided, ``input_file`` with its extension
                replaced by ``.pdf`` is used.

        Returns:
            str: Path to the generated PDF file.

        Raises:
            FileNotFoundError: If the input file doesn't exist.
            ConversionError: If LibreOffice is not installed, exits with an
                error, or exits cleanly without producing the expected file.
        """
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file '{input_file}' does not exist.")

        # Check file extension
        _, ext = os.path.splitext(original_file)
        ext_lower = ext.lower()
        logger.info(os.path.splitext(original_file))
        if ext_lower not in self.supported_extensions and ext_lower != '.pdf':
            logger.warning(f"File extension '{ext}' may not be supported.")

        # Set default output file name if not provided
        if output_file is None:
            output_file = os.path.splitext(input_file)[0] + '.pdf'

        output_dir = os.path.dirname(os.path.abspath(output_file))
        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(output_dir, exist_ok=True)

        # If the file is already a PDF, just copy it
        if ext_lower == '.pdf':
            import shutil
            try:
                shutil.copy2(input_file, output_file)
            except shutil.SameFileError:
                # Source and destination are the same file (e.g. the default
                # output name of an input that already ends in .pdf): there is
                # nothing to copy.
                pass
            else:
                logger.info(f"File is already a PDF. Copied to '{output_file}'")
            return output_file

        # Check if LibreOffice is installed
        if not self.check_libreoffice_installed():
            raise ConversionError(
                "LibreOffice is not installed or not found in PATH. "
                "Please install LibreOffice to convert documents to PDF."
            )

        # Build the LibreOffice command
        command = [
            'libreoffice',
            '--headless',
            '--nologo',
            '--nofirststartwizard',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            input_file,
        ]

        try:
            # Suppress Java runtime warnings by redirecting stderr
            subprocess.check_call(command, stderr=subprocess.DEVNULL)
        except subprocess.CalledProcessError as e:
            error_msg = f"Error during conversion: {e}"
            logger.error(error_msg)
            raise ConversionError(error_msg, e) from e

        # LibreOffice names its output after the input file's base name.
        converted_file = os.path.join(
            output_dir,
            os.path.splitext(os.path.basename(input_file))[0] + '.pdf'
        )
        if not os.path.exists(converted_file):
            # A zero exit status without an output file would otherwise show
            # up as an unrelated FileNotFoundError from the move below.
            error_msg = f"Error during conversion: '{converted_file}' was not produced"
            logger.error(error_msg)
            raise ConversionError(error_msg)

        if os.path.abspath(converted_file) != os.path.abspath(output_file):
            # os.replace overwrites an existing destination on every platform;
            # callers commonly pass a pre-created temporary file here.
            os.replace(converted_file, output_file)

        logger.info(f"Conversion successful: '{output_file}'")
        return output_file
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# Alternative method with direct page_range management
|
|
150
|
+
|
|
151
|
+
# def convert_to_pdf(self, input_file, output_file=None, page_range=None):
|
|
152
|
+
# """
|
|
153
|
+
# Converts a document to PDF format using LibreOffice.
|
|
154
|
+
# """
|
|
155
|
+
# if not os.path.exists(input_file):
|
|
156
|
+
# raise FileNotFoundError(f"Input file '{input_file}' does not exist.")
|
|
157
|
+
#
|
|
158
|
+
# # Check file extension
|
|
159
|
+
# _, ext = os.path.splitext(input_file)
|
|
160
|
+
# logger.info(os.path.splitext(input_file))
|
|
161
|
+
#
|
|
162
|
+
# # Set default output file name if not provided
|
|
163
|
+
# if output_file is None:
|
|
164
|
+
# output_file = os.path.splitext(input_file)[0] + '.pdf'
|
|
165
|
+
#
|
|
166
|
+
# output_dir = os.path.dirname(os.path.abspath(output_file))
|
|
167
|
+
# if not os.path.exists(output_dir):
|
|
168
|
+
# os.makedirs(output_dir)
|
|
169
|
+
#
|
|
170
|
+
# # If the file is already a PDF, just copy it
|
|
171
|
+
# if ext.lower() == '.pdf':
|
|
172
|
+
# import shutil
|
|
173
|
+
# shutil.copy2(input_file, output_file)
|
|
174
|
+
# logger.info(f"File is already a PDF. Copied to '{output_file}'")
|
|
175
|
+
# return output_file
|
|
176
|
+
#
|
|
177
|
+
# # Check if LibreOffice is installed
|
|
178
|
+
# if not self.check_libreoffice_installed():
|
|
179
|
+
# raise ConversionError(
|
|
180
|
+
# "LibreOffice is not installed or not found in PATH. "
|
|
181
|
+
# "Please install LibreOffice to convert documents to PDF."
|
|
182
|
+
# )
|
|
183
|
+
#
|
|
184
|
+
# # Record existing PDFs in the output directory
|
|
185
|
+
# import glob
|
|
186
|
+
# existing_pdfs = set(glob.glob(os.path.join(output_dir, "*.pdf")))
|
|
187
|
+
#
|
|
188
|
+
# # Build the LibreOffice command
|
|
189
|
+
# convert_filter = 'pdf'
|
|
190
|
+
# if page_range:
|
|
191
|
+
# convert_filter = f'pdf:writer_pdf_Export:{{"PageRange":{{"type":"string","value":"{page_range}"}}}}'
|
|
192
|
+
#
|
|
193
|
+
# command = [
|
|
194
|
+
# 'libreoffice',
|
|
195
|
+
# '--headless',
|
|
196
|
+
# '--nologo',
|
|
197
|
+
# '--nofirststartwizard',
|
|
198
|
+
# '--convert-to', convert_filter,
|
|
199
|
+
# '--outdir', output_dir,
|
|
200
|
+
# input_file
|
|
201
|
+
# ]
|
|
202
|
+
#
|
|
203
|
+
# try:
|
|
204
|
+
# # Run the command and capture output
|
|
205
|
+
# result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
|
|
206
|
+
# text=True, check=False)
|
|
207
|
+
#
|
|
208
|
+
# # Check if command was successful
|
|
209
|
+
# if result.returncode != 0:
|
|
210
|
+
# error_msg = f"Error during conversion: {result.stderr}"
|
|
211
|
+
# logger.error(error_msg)
|
|
212
|
+
# raise ConversionError(error_msg)
|
|
213
|
+
#
|
|
214
|
+
# # Log the output to help debugging
|
|
215
|
+
# logger.info(f"LibreOffice conversion output: {result.stdout}")
|
|
216
|
+
#
|
|
217
|
+
# # Find newly created PDF file
|
|
218
|
+
# current_pdfs = set(glob.glob(os.path.join(output_dir, "*.pdf")))
|
|
219
|
+
# new_pdfs = current_pdfs - existing_pdfs
|
|
220
|
+
#
|
|
221
|
+
# if not new_pdfs:
|
|
222
|
+
# # Try looking in /private path as well (for macOS)
|
|
223
|
+
# if output_dir.startswith('/var/'):
|
|
224
|
+
# private_dir = '/private' + output_dir
|
|
225
|
+
# private_pdfs = set(glob.glob(os.path.join(private_dir, "*.pdf")))
|
|
226
|
+
# new_pdfs = private_pdfs - existing_pdfs
|
|
227
|
+
#
|
|
228
|
+
# if not new_pdfs:
|
|
229
|
+
# # Last resort: find the most recently created PDF
|
|
230
|
+
# all_pdfs = glob.glob(os.path.join(output_dir, "*.pdf"))
|
|
231
|
+
# if not all_pdfs and output_dir.startswith('/var/'):
|
|
232
|
+
# private_dir = '/private' + output_dir
|
|
233
|
+
# all_pdfs = glob.glob(os.path.join(private_dir, "*.pdf"))
|
|
234
|
+
#
|
|
235
|
+
# if all_pdfs:
|
|
236
|
+
# converted_file = max(all_pdfs, key=os.path.getmtime)
|
|
237
|
+
# logger.info(f"Found most recent PDF: {converted_file}")
|
|
238
|
+
# else:
|
|
239
|
+
# raise ConversionError(f"No PDF files found in output directory after conversion.")
|
|
240
|
+
# else:
|
|
241
|
+
# converted_file = list(new_pdfs)[0]
|
|
242
|
+
# logger.info(f"Found newly created PDF: {converted_file}")
|
|
243
|
+
#
|
|
244
|
+
# # Move to desired output location if needed
|
|
245
|
+
# if converted_file != output_file:
|
|
246
|
+
# import shutil
|
|
247
|
+
# shutil.copy2(converted_file, output_file)
|
|
248
|
+
# os.remove(converted_file) # Clean up the original
|
|
249
|
+
# logger.info(f"Moved PDF to final location: {output_file}")
|
|
250
|
+
#
|
|
251
|
+
# return output_file
|
|
252
|
+
#
|
|
253
|
+
# except Exception as e:
|
|
254
|
+
# error_msg = f"Error during PDF conversion: {str(e)}"
|
|
255
|
+
# logger.error(error_msg)
|
|
256
|
+
# raise ConversionError(error_msg)
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# exceptions/base.py
|
|
2
|
+
class ConversionError(Exception):
    """
    Signal that turning a document into a PDF did not succeed.

    Used by the converter when LibreOffice is unavailable or its conversion
    run ends with an error.

    Attributes:
        message (str): Human-readable description of the failure; also the
            only value handed to ``Exception``.
        original_exception: The lower-level exception that triggered this
            error, or ``None`` when there is none.
    """

    def __init__(self, message, original_exception=None):
        # Keep the underlying cause and the text available as plain attributes.
        self.original_exception = original_exception
        self.message = message
        super().__init__(message)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EmptyDocument(Exception):
    """
    Signal that no usable text could be obtained from a document.

    Raised when extraction produces nothing at all, or when the extracted
    text is rejected by a quality check (too few characters, too many
    repeated rows).

    Attributes:
        message (str): Explanation of why the document counts as empty; also
            the only value handed to ``Exception``.
        code (int): Optional numeric code classifying the failure
            (default: None).
    """

    def __init__(self, message, code=None):
        # Expose both values as attributes for callers that inspect them.
        self.code = code
        self.message = message
        super().__init__(message)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ExceededMaxPages(Exception):
    """
    Signal that a requested page range does not fit the document.

    Intended for requests that reach past the document's real page count or
    that specify an otherwise invalid range.

    Attributes:
        message (str): Explanation of the page-range problem; also the only
            value handed to ``Exception``.
        code (int): Optional numeric code for tracking (default: None).
    """

    def __init__(self, message, code=None):
        # Expose both values as attributes for callers that inspect them.
        self.code = code
        self.message = message
        super().__init__(message)
|
polytext/loader/text.py
ADDED
|
@@ -0,0 +1,606 @@
|
|
|
1
|
+
# loader/text.py
|
|
2
|
+
# Standard library imports
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import tempfile
|
|
6
|
+
import logging
|
|
7
|
+
from collections import Counter
|
|
8
|
+
|
|
9
|
+
# Third-party imports
|
|
10
|
+
from pypdf import PdfReader
|
|
11
|
+
import fitz # PyMuPDF
|
|
12
|
+
from botocore.exceptions import ClientError
|
|
13
|
+
|
|
14
|
+
# Local imports
|
|
15
|
+
from ..converter.pdf import convert_to_pdf
|
|
16
|
+
from ..exceptions.base import EmptyDocument, ExceededMaxPages
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Standalone functions that wrap TextLoader methods
|
|
22
|
+
def get_document_text(doc_data, page_range=None, s3_client=None, document_aws_bucket=None):
    """
    Convenience function to extract text from an S3-hosted document using PyMuPDF.

    Builds a ``TextLoader`` and delegates to ``TextLoader.get_document_text``.
    That method always fetches the document through the loader's S3 client, so
    a client must be supplied via ``s3_client`` for the call to succeed; the
    two trailing parameters default to ``None`` only to keep the historical
    two-argument signature valid.

    Args:
        doc_data (dict): Dictionary containing 'file_path' and optional 'bucket'
        page_range (tuple, optional): Tuple of (start_page, end_page) for partial extraction
        s3_client (optional): S3 client object handed to ``TextLoader``; it is
            used for ``download_file`` and ``list_objects_v2`` calls.
        document_aws_bucket (str, optional): Bucket used when ``doc_data`` has
            no truthy 'bucket' entry.

    Returns:
        str: Extracted text from the document

    Raises:
        EmptyDocument: If extracted text is empty or fails quality checks
        ExceededMaxPages: Presumably raised by the loader's page-range
            validation when the requested range is invalid (the validation
            code is not part of this function).
    """
    loader = TextLoader(s3_client=s3_client, document_aws_bucket=document_aws_bucket)
    return loader.get_document_text(doc_data, page_range)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def extract_text_from_file(file_path, page_range=None, backend='auto'):
    """
    Extract text from a local file through a freshly created TextLoader.

    Module-level shortcut: builds a ``TextLoader`` without S3 configuration
    and forwards the three arguments, unchanged, to its
    ``extract_text_from_file`` method.

    Args:
        file_path (str): Path to the local file.
        page_range (tuple, optional): Tuple of (start_page, end_page);
            forwarded as-is.
        backend (str, optional): Text extraction backend
            ('auto', 'pymupdf', or 'pypdf').

    Returns:
        str: The text returned by ``TextLoader.extract_text_from_file``.

    Raises:
        FileNotFoundError: If the input file doesn't exist.
        ValueError: If an invalid backend is specified.
    """
    return TextLoader().extract_text_from_file(file_path, page_range, backend)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class TextLoader:
|
|
63
|
+
"""
|
|
64
|
+
Loads and extracts text from documents with support for S3 storage.
|
|
65
|
+
|
|
66
|
+
This class handles document downloading from S3, conversion to PDF, and text extraction
|
|
67
|
+
using different backends (PyMuPDF and PyPDF). It supports various document formats
|
|
68
|
+
through conversion to PDF using LibreOffice.
|
|
69
|
+
|
|
70
|
+
Attributes:
|
|
71
|
+
converter (DocumentConverter): Instance for converting documents to PDF
|
|
72
|
+
s3_client: Boto3 S3 client for AWS operations
|
|
73
|
+
document_aws_bucket (str): Default S3 bucket name for document storage
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
def __init__(self, s3_client=None, document_aws_bucket=None):
|
|
77
|
+
"""
|
|
78
|
+
Initialize TextLoader with optional S3 configuration.
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
s3_client: Boto3 S3 client instance for AWS operations (optional)
|
|
82
|
+
document_aws_bucket (str): Default S3 bucket name for document storage (optional)
|
|
83
|
+
"""
|
|
84
|
+
self.s3_client = s3_client
|
|
85
|
+
self.document_aws_bucket = document_aws_bucket
|
|
86
|
+
|
|
87
|
+
# S3-related methods
|
|
88
|
+
|
|
89
|
+
def download_file_from_s3(self, bucket, file_path, temp_file_path):
    """
    Download a file from S3 to a local temporary path.

    Tries the key exactly as given. If that raises ``ClientError``, the
    download is retried with a trailing ``.pdf`` extension upper-cased to
    ``.PDF`` (only the extension is changed, never other parts of the key).
    If the retry fails as well, the method falls back to
    ``convert_doc_to_pdf``, which looks the file up by prefix and converts it.

    Args:
        bucket (str): S3 bucket name
        file_path (str): Key of the file in the S3 bucket
        temp_file_path (str): Local path to save the downloaded file

    Returns:
        str: Path to the local file. This is ``temp_file_path`` unless the
        conversion fallback ran, in which case it is the path returned by
        ``convert_doc_to_pdf``.

    Raises:
        Exception: Anything other than ``ClientError`` raised by the first
            download attempt, and anything raised by the conversion fallback,
            propagates to the caller.
    """
    try:
        self.s3_client.download_file(Bucket=bucket, Key=file_path, Filename=temp_file_path)
        logger.info(f'Downloaded {file_path} to {temp_file_path}')
    except ClientError as e:
        logger.info(e)
        # Swap only a trailing ".pdf" for ".PDF"; a plain str.replace would
        # also rewrite ".pdf" occurring in directory names of the key.
        stem, ext = os.path.splitext(file_path)
        retry_key = stem + ".PDF" if ext == ".pdf" else file_path
        try:
            self.s3_client.download_file(Bucket=bucket,
                                         Key=retry_key,
                                         Filename=temp_file_path)
        except Exception as retry_error:
            # Deliberately broad: any failure of the retry leads to the
            # prefix-lookup + conversion fallback. Log it instead of
            # discarding it silently.
            logger.info(retry_error)
            temp_file_path = self.convert_doc_to_pdf(bucket=bucket,
                                                     file_prefix=file_path,
                                                     input_file=temp_file_path)
    return temp_file_path
|
|
122
|
+
|
|
123
|
+
def convert_doc_to_pdf(self, bucket, file_prefix, input_file):
    """
    Convert a document from S3 to PDF format.

    Lists the bucket with ``file_prefix`` as prefix, downloads the first
    listed object to ``input_file`` and converts it with ``convert_to_pdf``
    into a newly created temporary ``.pdf`` file. On success the downloaded
    ``input_file`` is deleted. If the download or the conversion fails, the
    temporary ``.pdf`` placeholder is removed again before the error
    propagates.

    Args:
        bucket (str): S3 bucket name
        file_prefix (str): Prefix to match files in S3; its extension is also
            what ``convert_to_pdf`` uses for the format check
        input_file (str): Local path to save the downloaded file

    Returns:
        str: Path to the converted PDF file

    Raises:
        FileNotFoundError: If no matching file is found in S3
        ConversionError: If PDF conversion fails
    """
    logger.info(f"bucket: {bucket}")
    logger.info(f"file_prefix: {file_prefix}")
    logger.info(f"input_file: {input_file}")

    # List objects in S3 bucket
    response = self.s3_client.list_objects_v2(Bucket=bucket, Prefix=file_prefix)

    if 'Contents' not in response or not response['Contents']:
        raise FileNotFoundError("No matching file found in S3 bucket.")

    # Get the first matching object
    matching_file = response['Contents'][0]['Key']

    # Create the output placeholder only once there is something to convert,
    # so a failed lookup leaves no stray temporary file behind.
    fd, output_file = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)  # Close file descriptor explicitly

    try:
        # Download the file
        self.s3_client.download_file(
            Bucket=bucket,
            Key=matching_file,
            Filename=input_file
        )
        logger.info("Using LibreOffice")
        convert_to_pdf(input_file=input_file, output_file=output_file, original_file=file_prefix)
        logger.info("Document converted to pdf")
    except Exception:
        # Nobody else knows about the placeholder; remove it before re-raising.
        if os.path.exists(output_file):
            os.remove(output_file)
        raise

    os.remove(input_file)
    return output_file
|
|
169
|
+
|
|
170
|
+
# PDF text extraction methods
|
|
171
|
+
|
|
172
|
+
def get_document_text(self, doc_data, page_range=None):
    """
    Extract text from an S3-hosted document using PyMuPDF as primary backend.

    Non-PDF keys are fetched and converted via ``convert_doc_to_pdf``; PDF
    keys are downloaded with ``download_file_from_s3`` and converted only if
    PyMuPDF cannot open the download. Text is collected page by page, with
    several checks that abort early on documents that look empty or
    repetitive. The PyMuPDF document is closed and the temporary file removed
    on every exit path, including errors.

    Args:
        doc_data (dict): Dictionary containing 'file_path' and optional 'bucket'
        page_range (tuple, optional): Tuple of (start_page, end_page) for
            partial extraction; interpreted by ``validate_page_range``

    Returns:
        str: Extracted text from the document. If the text contains a run of
        replacement characters, the result of ``get_document_text_pypdf`` is
        returned instead.

    Raises:
        EmptyDocument: If extracted text is empty or fails quality checks
        ExceededMaxPages: Presumably raised by ``validate_page_range`` for an
            invalid range (that method is not visible here — confirm).
    """
    logger.debug("Using PyMuPDF")
    file_path = doc_data["file_path"]

    fd, temp_file_path = tempfile.mkstemp()
    # mkstemp returns an open descriptor; only the path is used below, so
    # close it right away instead of leaking it.
    os.close(fd)

    # Fall back to the loader-wide bucket when the request names none.
    bucket = doc_data.get("bucket") or self.document_aws_bucket

    pdf_document = None
    text = ""
    last_pages_text = ""
    last_page_index_to_start = 10

    try:
        if os.path.splitext(file_path)[1].lower() != ".pdf":
            logger.info("Converting file to PDF")
            temp_file_path = self.convert_doc_to_pdf(bucket=bucket, file_prefix=file_path, input_file=temp_file_path)
            pdf_document = fitz.open(temp_file_path)
        else:
            temp_file_path = self.download_file_from_s3(bucket, file_path, temp_file_path)
            try:
                pdf_document = fitz.open(temp_file_path)
                logger.info(f"Successfully opened file with temp_file_path: {temp_file_path}")
            except Exception:
                # The download could not be opened as a PDF; fetch it again
                # through the conversion path.
                logger.info("Converting file to PDF")
                temp_file_path = self.convert_doc_to_pdf(bucket=bucket, file_prefix=file_path, input_file=temp_file_path)
                pdf_document = fitz.open(temp_file_path)

        total_pages = pdf_document.page_count
        logger.info(f"Total pages: {total_pages}")

        # Validate and adjust page range
        start_page, end_page = self.validate_page_range(page_range, total_pages)

        for page_number in range(start_page, end_page):
            page = pdf_document.load_page(page_number)
            page_text = page.get_text("text", flags=16)
            page_text = self.clean_text(page_text)
            text += page_text
            if page_number >= (total_pages - last_page_index_to_start):
                last_pages_text += page_text

            # Early termination checks
            # NOTE(review): these fire at zero-based page index 10 / 20, i.e.
            # after the 11th / 21st page, and never for shorter documents or
            # ranges that skip those indexes — confirm this is intended.
            if len(text) == 0 and page_number == 10:
                message = "First 10 pages of the document are empty"
                logger.info(message)
                raise EmptyDocument(message=message, code=998)

            if len(text) < 800 and page_number == 20:
                message = "First 20 pages of the document have less than 800 chars"
                logger.info(message)
                raise EmptyDocument(message=message, code=998)

            if (total_pages >= 500 and
                    page_number == 10 and
                    self.has_repeated_rows(text=text, threshold=100)):
                message = "First 10 pages of the document have 100 repeated rows"
                logger.info(message)
                raise EmptyDocument(message=message, code=998)

            if (total_pages >= 500 and
                    (page_number == total_pages - 1) and
                    self.has_repeated_rows(text=last_pages_text, threshold=100)):
                message = "Last 10 pages of the document have 100 repeated rows"
                logger.info(message)
                raise EmptyDocument(message=message, code=998)
    finally:
        # Runs on success and on every error above, so neither the open
        # document nor the temporary file outlives this call.
        if pdf_document is not None:
            pdf_document.close()
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    if len(text) == 0:
        message = "No text detected"
        logger.info(message)
        raise EmptyDocument(message=message, code=998)
    if "������������������������������������������" in text:
        # A long run of replacement characters means PyMuPDF could not decode
        # the text; retry the whole extraction with pypdf.
        logger.info("Using pypdf being strange PDF")
        return self.get_document_text_pypdf(bucket=bucket, file_path=file_path, page_range=page_range)
    if len(text) < 800:
        message = "Document text with less than 800 characters"
        raise EmptyDocument(message=message, code=998)

    return text
|
|
272
|
+
|
|
273
|
+
def get_document_text_pypdf(self, bucket, file_path, page_range=None):
    """
    Extract text from an S3-hosted document using PyPDF as fallback backend.

    Mirrors ``get_document_text`` but reads pages with ``PdfReader``. Non-PDF
    keys are fetched and converted via ``convert_doc_to_pdf``; PDF keys are
    downloaded with ``download_file_from_s3`` and converted only if the
    download cannot be read. The file handle is closed and the temporary file
    removed on every exit path, including errors.

    Args:
        bucket (str): S3 bucket name
        file_path (str): Key of the file in S3
        page_range (tuple, optional): Tuple of (start_page, end_page) for
            partial extraction; interpreted by ``validate_page_range``

    Returns:
        str: Extracted text from the document

    Raises:
        EmptyDocument: If extracted text is empty or fails quality checks
        ExceededMaxPages: Presumably raised by ``validate_page_range`` for an
            invalid range (that method is not visible here — confirm).
    """
    logger.info("Using PyPDF")

    fd, temp_file_path = tempfile.mkstemp()
    # mkstemp returns an open descriptor; only the path is used below, so
    # close it right away instead of leaking it.
    os.close(fd)

    pdf_handle = None
    text = ""
    last_pages_text = ""
    last_page_index_to_start = 10

    try:
        if os.path.splitext(file_path)[1].lower() != ".pdf":
            logger.info("Converting file to PDF")
            temp_file_path = self.convert_doc_to_pdf(bucket=bucket, file_prefix=file_path, input_file=temp_file_path)
            logger.debug(f"temp_file_path post conversion to pdf: {temp_file_path}")
            pdf_handle = open(temp_file_path, "rb")
            pdf_reader = PdfReader(pdf_handle)
        else:
            temp_file_path = self.download_file_from_s3(bucket, file_path, temp_file_path)
            logger.debug(f"temp_file_path: {temp_file_path}")
            try:
                pdf_handle = open(temp_file_path, "rb")
                pdf_reader = PdfReader(pdf_handle)
                logger.info(f"Successfully opened file with temp_file_path: {temp_file_path}")
            except Exception:
                # Release the handle on the unreadable download before the
                # conversion path overwrites and deletes that file.
                if pdf_handle is not None:
                    pdf_handle.close()
                    pdf_handle = None
                logger.info("Converting file to PDF")
                temp_file_path = self.convert_doc_to_pdf(bucket=bucket, file_prefix=file_path,
                                                         input_file=temp_file_path)
                logger.debug(f"temp_file_path post conversion to pdf: {temp_file_path}")
                pdf_handle = open(temp_file_path, "rb")
                pdf_reader = PdfReader(pdf_handle)

        total_pages = len(pdf_reader.pages)

        # Validate and adjust page range
        start_page, end_page = self.validate_page_range(page_range, total_pages)

        for page_number in range(start_page, end_page):
            page = pdf_reader.pages[page_number]
            page_text = page.extract_text()
            page_text = self.clean_text(page_text)
            text += page_text

            # The loop index is the page's position in the reader, so it is
            # used directly instead of the page object's page_number.
            if page_number >= (total_pages - last_page_index_to_start):
                last_pages_text += page_text

            # Early termination checks
            # NOTE(review): these fire at zero-based page index 10 / 20, i.e.
            # after the 11th / 21st page, and never for shorter documents or
            # ranges that skip those indexes — confirm this is intended.
            if len(text) == 0 and page_number == 10:
                message = "First 10 pages of the document are empty"
                logger.info(message)
                raise EmptyDocument(message=message, code=998)
            if len(text) < 800 and page_number == 20:
                message = "First 20 pages of the document have less than 800 chars"
                logger.info(message)
                raise EmptyDocument(message=message, code=998)
            if (
                total_pages >= 500
                and page_number == 10
                and self.has_repeated_rows(text=text, threshold=100)
            ):
                message = "First 10 pages of the document have 100 repeated rows"
                logger.info(message)
                raise EmptyDocument(message=message, code=998)
            if (
                total_pages >= 500
                and (page_number == total_pages - 1)
                and self.has_repeated_rows(text=last_pages_text, threshold=100)
            ):
                message = "Last 10 pages of the document have 100 repeated rows"
                logger.info(message)
                raise EmptyDocument(message=message, code=998)
    finally:
        # Close before removing: an open handle blocks deletion on some
        # platforms, and both must be released on error paths as well.
        if pdf_handle is not None:
            pdf_handle.close()
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    if len(text) == 0:
        message = "No text detected"
        logger.info(message)
        raise EmptyDocument(message=message, code=998)

    return text
|
|
373
|
+
|
|
374
|
+
def extract_text_from_file(self, file_path, page_range=None, backend='auto'):
    """
    Extract text from a local file using the specified backend.

    Non-PDF inputs are first converted to a temporary PDF via
    ``convert_to_pdf``; the temporary file is removed before returning.
    When PyMuPDF is selected (explicitly or through 'auto') and it raises,
    returns no text, or returns the garbled-character marker, extraction
    is retried with PyPDF.

    Args:
        file_path (str): Path to the local file
        page_range (tuple, optional): Tuple of (start_page, end_page), 1-indexed
        backend (str, optional): Text extraction backend ('auto', 'pymupdf', or 'pypdf')

    Returns:
        str: Extracted text from the document

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If invalid backend is specified
        ExceededMaxPages: If page_range is rejected by validate_page_range
        EmptyDocument: If no text could be extracted
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File '{file_path}' does not exist.")

    # Validate backend option
    valid_backends = ['auto', 'pymupdf', 'pypdf']
    if backend not in valid_backends:
        raise ValueError(f"Invalid backend '{backend}'. Must be one of {valid_backends}")

    # 'auto' defaults to PyMuPDF for better extraction quality
    use_pymupdf = backend in ('auto', 'pymupdf')

    # NOTE(review): literal reproduced exactly as found in the source; it looks
    # like a run of replacement characters used to detect broken PyMuPDF
    # output - confirm against the upstream file.
    garbled_marker = "������������������������������������������"

    file_ext = os.path.splitext(file_path)[1].lower()
    temp_pdf_path = None

    try:
        if file_ext != '.pdf':
            logger.info("Converting %s file to PDF...", file_ext)
            fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf")
            os.close(fd)  # Only the path is needed; the converter writes the file
            pdf_path = convert_to_pdf(
                input_file=file_path,
                original_file=file_path,
                output_file=temp_pdf_path,
            )
        else:
            pdf_path = file_path

        if use_pymupdf:
            logger.debug("Using PyMuPDF for text extraction")
            try:
                pdf_document = fitz.open(pdf_path)
                try:
                    start_page, end_page = self.validate_page_range(
                        page_range, pdf_document.page_count
                    )
                    # flags=16 is kept from the original call ("cleaner text extraction")
                    text = "".join(
                        self.clean_text(
                            pdf_document.load_page(page_number).get_text("text", flags=16)
                        ) or ""
                        for page_number in range(start_page, end_page)
                    )
                finally:
                    pdf_document.close()
            except ExceededMaxPages:
                # A bad page range is a caller error, not a backend failure:
                # retrying with PyPDF would only raise the same error again.
                raise
            except Exception as e:
                logger.warning("PyMuPDF extraction failed: %s. Trying PyPDF.", e)
            else:
                if garbled_marker in text:
                    logger.warning("PyMuPDF extracted unusual characters. Switching to PyPDF.")
                elif not text.strip():
                    logger.warning("PyMuPDF extracted no text. Switching to PyPDF.")
                else:
                    # Text was successfully extracted
                    return text

        # Either PyPDF was requested or PyMuPDF did not produce usable text
        logger.debug("Using PyPDF for text extraction")
        with open(pdf_path, "rb") as file:
            pdf_reader = PdfReader(file)
            start_page, end_page = self.validate_page_range(
                page_range, len(pdf_reader.pages)
            )
            text = "".join(
                self.clean_text(pdf_reader.pages[page_number].extract_text()) or ""
                for page_number in range(start_page, end_page)
            )

        if not text.strip():
            message = "No text detected in the document"
            logger.info(message)
            raise EmptyDocument(message=message, code=998)

        return text

    finally:
        # Clean up the temporary PDF; failure to delete is logged, not raised
        if temp_pdf_path and os.path.exists(temp_pdf_path):
            try:
                os.remove(temp_pdf_path)
            except Exception as e:
                logger.warning("Failed to remove temporary file %s: %s", temp_pdf_path, e)
|
|
491
|
+
|
|
492
|
+
# Helper methods
|
|
493
|
+
|
|
494
|
+
@staticmethod
|
|
495
|
+
def validate_page_range(page_range, total_pages):
    """
    Validate and normalize the page range for text extraction.

    Converts a 1-indexed, inclusive (start_page, end_page) pair into the
    0-indexed (start, stop) pair used with ``range(start, stop)``. When no
    page range is given, the whole document is selected.

    Args:
        page_range (tuple): Tuple of (start_page, end_page) in 1-indexed format,
            or a falsy value to select every page
        total_pages (int): Total number of pages in the document

    Returns:
        tuple: Normalized (start_page, end_page) in 0-indexed format, with
            end_page exclusive

    Raises:
        ExceededMaxPages: If the range starts before page 1, ends after the
            last page, or its start page is after its end page
    """
    if not page_range:
        return 0, total_pages

    first_page, last_page = page_range[0], page_range[1]
    logger.info(f"Using page range: {first_page} - {last_page}")

    if last_page > total_pages or first_page < 1:
        raise ExceededMaxPages(
            message=f"Requested page range {page_range} exceeds document length ({total_pages})",
            code=998
        )

    # An inverted range would otherwise yield an empty loop and surface later
    # as a misleading "no text" error.
    if first_page > last_page:
        raise ExceededMaxPages(
            message=f"Requested page range {page_range} is invalid: start page is after end page",
            code=998
        )

    # Bounds were checked above, so no clamping is needed here.
    return first_page - 1, last_page
|
|
526
|
+
|
|
527
|
+
@staticmethod
|
|
528
|
+
def clean_text(text):
    """
    Clean and normalize extracted text.

    Performs standard text cleaning operations:
    - Replaces double quotes with single quotes
    - Collapses blank (or whitespace-only) lines into a single newline
    - Removes the '<|endoftext|>' special token

    Args:
        text (str): Raw text to clean; None or an empty string is accepted

    Returns:
        str: Cleaned and normalized text; an empty string when there is
            nothing to clean, so callers can always concatenate the result
    """
    if not text:
        # Always hand back a string: callers append the result to a running
        # string, which would fail on None.
        return ""

    text = text.replace('"', "'")
    text = re.sub(r"\n\s*\n", "\n", text)
    return text.replace('<|endoftext|>', '')
|
|
548
|
+
|
|
549
|
+
@staticmethod
|
|
550
|
+
def has_repeated_rows(text, threshold=100):
    """
    Report whether any single line of the text occurs at least threshold times.

    Lines are obtained by splitting on newlines; blank and whitespace-only
    lines are ignored. Lines are compared exactly as they appear.

    Args:
        text (str): Text to analyze
        threshold (int): Minimum number of repetitions to trigger detection

    Returns:
        bool: True if some line is repeated at least threshold times
    """
    # Tally every non-blank line in one pass
    occurrences = Counter(
        line for line in text.split("\n") if line.strip() != ""
    )
    return any(times >= threshold for times in occurrences.values())
|
|
575
|
+
|
|
576
|
+
@staticmethod
|
|
577
|
+
def has_low_text_quality(text, chars_threshold=2000, valid_ratio_threshold=0.3):
    """
    Check if extracted text has low quality.

    Looks at the first chars_threshold characters of the text and computes
    the fraction that are alphanumeric. Whitespace and punctuation count as
    non-alphanumeric.

    Args:
        text (str): Text to analyze
        chars_threshold (int): Number of leading characters to sample
        valid_ratio_threshold (float): Text is low quality when the fraction
            of alphanumeric characters in the sample is at or below this
            value (default 0.3)

    Returns:
        bool: True if the sample is empty or its alphanumeric fraction is at
            or below valid_ratio_threshold
    """
    sample_text = text[:chars_threshold]

    # Nothing to sample: treat as low quality (also avoids dividing by zero)
    if not sample_text:
        return True

    valid_chars = sum(c.isalnum() for c in sample_text)
    return valid_chars / len(sample_text) <= valid_ratio_threshold
|
|
605
|
+
|
|
606
|
+
|
|
File without changes
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Docsity
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: polytext
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python utilities to simplify document files management
|
|
5
|
+
Home-page: https://github.com/docsity/polytext
|
|
6
|
+
Author: Matteo Senardi
|
|
7
|
+
Author-email: matteo.s@docsity.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: ~=3.6
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: pypdf==5.3.0
|
|
25
|
+
Requires-Dist: PyMuPDF>=1.25.3
|
|
26
|
+
Requires-Dist: pycryptodome==3.21.0
|
|
27
|
+
Requires-Dist: weasyprint==64.1
|
|
28
|
+
Requires-Dist: markdown==3.7
|
|
29
|
+
Requires-Dist: python-docx==1.1.2
|
|
30
|
+
Dynamic: author
|
|
31
|
+
Dynamic: author-email
|
|
32
|
+
Dynamic: classifier
|
|
33
|
+
Dynamic: description
|
|
34
|
+
Dynamic: description-content-type
|
|
35
|
+
Dynamic: home-page
|
|
36
|
+
Dynamic: license
|
|
37
|
+
Dynamic: requires-dist
|
|
38
|
+
Dynamic: requires-python
|
|
39
|
+
Dynamic: summary
|
|
40
|
+
|
|
41
|
+
# polytext
|
|
42
|
+
|
|
43
|
+
# Doc Utils
|
|
44
|
+
|
|
45
|
+
A Python package for document conversion and text extraction.
|
|
46
|
+
|
|
47
|
+
## Features
|
|
48
|
+
|
|
49
|
+
- Convert various document formats (DOCX, ODT, PPT, etc.) to PDF
|
|
50
|
+
- Extract text from PDF documents
|
|
51
|
+
- Support for both local files and S3 storage
|
|
52
|
+
- Multiple PDF parsing backends (PyPDF, PyMuPDF)
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# Basic installation
|
|
58
|
+
pip install polytext
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Requirements
|
|
62
|
+
|
|
63
|
+
- Python 3.6 or higher
|
|
64
|
+
- LibreOffice (for PDF conversion)
|
|
65
|
+
|
|
66
|
+
## Usage
|
|
67
|
+
|
|
68
|
+
Converting Documents to PDF
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from polytext import convert_to_pdf, ConversionError
|
|
72
|
+
|
|
73
|
+
try:
|
|
74
|
+
# Convert a document to PDF
|
|
75
|
+
pdf_path = convert_to_pdf('input.docx', 'input.docx', output_file='output.pdf')
|
|
76
|
+
print(f"PDF saved to: {pdf_path}")
|
|
77
|
+
except ConversionError as e:
|
|
78
|
+
print(f"Conversion failed: {e}")
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Text Extraction
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from polytext import extract_text_from_file
|
|
85
|
+
|
|
86
|
+
# Extract text from any supported file
|
|
87
|
+
text = extract_text_from_file('document.docx')
|
|
88
|
+
print(f"Extracted text: {text}")
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## License
|
|
92
|
+
|
|
93
|
+
MIT License
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
polytext/__init__.py,sha256=a92gWlgbNkMoOLUxmR_-mo-SyAlM9JF0MCPmV7yJrBY,442
|
|
2
|
+
polytext/output_manager.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
polytext/converter/__init__.py,sha256=vtqVbUqoXDhNfhJj4bJM-bkwac0P8hQ0Efkek6oLrro,134
|
|
4
|
+
polytext/converter/pdf.py,sha256=8DX0EiWNWKz_Uwpvf5xMtB6w-JlPgxEcKvHjriD7vxk,9894
|
|
5
|
+
polytext/exceptions/__init__.py,sha256=A7sckwMe832uv7DacjMvCWScT8NKqKrMtHfM8KpKvls,168
|
|
6
|
+
polytext/exceptions/base.py,sha256=AYq-w9aP3ll_3-bePZi5mJNc_1ZL_23ZNACOLBuBiPQ,1819
|
|
7
|
+
polytext/loader/__init__.py,sha256=WMvYGEviHk1e_dpIw8bwF6KsHHQdNj5W1dcqKRmvA-I,174
|
|
8
|
+
polytext/loader/text.py,sha256=Yb6vGUFYk6V9BDqxlk7lXrd9dZF6eSxK9gVqDPdtSMY,23709
|
|
9
|
+
polytext-0.1.0.dist-info/LICENSE,sha256=n-jK4xSUrmuCR9C0EsG05KUc2vi1N6UE-k2L24pKWS4,1064
|
|
10
|
+
polytext-0.1.0.dist-info/METADATA,sha256=2wDn_U6GSbMOlddZ7xO2oBSual5AptmDtUFQKzzDq_4,2351
|
|
11
|
+
polytext-0.1.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
|
12
|
+
polytext-0.1.0.dist-info/top_level.txt,sha256=dHg5ZsItizIPYaIUCzrzjEBdyx2WBf3TGPLa_iTwWg4,9
|
|
13
|
+
polytext-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
polytext
|