PyPI - polytext - Versions diffs - 0.1.0__tar.gz - Mend

polytext 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

polytext-0.1.0/LICENSE +21 -0
polytext-0.1.0/PKG-INFO +93 -0
polytext-0.1.0/README.md +53 -0
polytext-0.1.0/polytext/__init__.py +15 -0
polytext-0.1.0/polytext/converter/__init__.py +4 -0
polytext-0.1.0/polytext/converter/pdf.py +256 -0
polytext-0.1.0/polytext/exceptions/__init__.py +4 -0
polytext-0.1.0/polytext/exceptions/base.py +52 -0
polytext-0.1.0/polytext/loader/__init__.py +4 -0
polytext-0.1.0/polytext/loader/text.py +606 -0
polytext-0.1.0/polytext/output_manager.py +0 -0
polytext-0.1.0/polytext.egg-info/PKG-INFO +93 -0
polytext-0.1.0/polytext.egg-info/SOURCES.txt +20 -0
polytext-0.1.0/polytext.egg-info/dependency_links.txt +1 -0
polytext-0.1.0/polytext.egg-info/not-zip-safe +1 -0
polytext-0.1.0/polytext.egg-info/requires.txt +6 -0
polytext-0.1.0/polytext.egg-info/top_level.txt +1 -0
polytext-0.1.0/pyproject.toml +15 -0
polytext-0.1.0/setup.cfg +4 -0
polytext-0.1.0/setup.py +79 -0
polytext-0.1.0/tests/test_extract_text_from_file.py +49 -0
polytext-0.1.0/tests/test_get_document_text.py +56 -0

polytext-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Docsity
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

polytext-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,93 @@
+Metadata-Version: 2.2
+Name: polytext
+Version: 0.1.0
+Summary: Python utilities to simplify document files management
+Home-page: https://github.com/docsity/polytext
+Author: Matteo Senardi
+Author-email: matteo.s@docsity.com
+License: MIT
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: ~=3.6
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pypdf==5.3.0
+Requires-Dist: PyMuPDF>=1.25.3
+Requires-Dist: pycryptodome==3.21.0
+Requires-Dist: weasyprint==64.1
+Requires-Dist: markdown==3.7
+Requires-Dist: python-docx==1.1.2
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+# polytext
+# Doc Utils
+A Python package for document conversion and text extraction.
+## Features
+- Convert various document formats (DOCX, ODT, PPT, etc.) to PDF
+- Extract text from PDF documents
+- Support for both local files and S3 storage
+- Multiple PDF parsing backends (PyPDF, PyMuPDF)
+## Installation
+```bash
+# Basic installation
+pip install polytext
+```
+## Requirements
+- Python 3.6 or higher
+- LibreOffice (for PDF conversion)
+## Usage
+Converting Documents to PDF
+```python
+from polytext import convert_to_pdf, ConversionError
+try:
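+    # Convert a document to PDF; the second argument is the original file,
+    # whose extension decides how the input is handled
+    pdf_path = convert_to_pdf('input.docx', 'input.docx', 'output.pdf')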
+    print(f"PDF saved to: {pdf_path}")
+except ConversionError as e:
+    print(f"Conversion failed: {e}")
+```
+Text Extraction
+```python
+from polytext import extract_text_from_file
+# Extract text from any supported file
+text = extract_text_from_file('document.docx')
+print(f"Extracted text: {text}")
+```
+## License
+MIT License

polytext-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,53 @@
+# polytext
+# Doc Utils
+A Python package for document conversion and text extraction.
+## Features
+- Convert various document formats (DOCX, ODT, PPT, etc.) to PDF
+- Extract text from PDF documents
+- Support for both local files and S3 storage
+- Multiple PDF parsing backends (PyPDF, PyMuPDF)
+## Installation
+```bash
+# Basic installation
+pip install polytext
+```
+## Requirements
+- Python 3.6 or higher
+- LibreOffice (for PDF conversion)
+## Usage
+Converting Documents to PDF
+```python
+from polytext import convert_to_pdf, ConversionError
+try:
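+    # Convert a document to PDF; the second argument is the original file,
+    # whose extension decides how the input is handled
+    pdf_path = convert_to_pdf('input.docx', 'input.docx', 'output.pdf')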
+    print(f"PDF saved to: {pdf_path}")
+except ConversionError as e:
+    print(f"Conversion failed: {e}")
+```
+Text Extraction
+```python
+from polytext import extract_text_from_file
+# Extract text from any supported file
+text = extract_text_from_file('document.docx')
+print(f"Extracted text: {text}")
+```
+## License
+MIT License

polytext-0.1.0/polytext/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+# polytext/__init__.py
+from .converter.pdf import convert_to_pdf, DocumentConverter
+from .loader.text import get_document_text, extract_text_from_file, TextLoader
+from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
+__all__ = [
+    'convert_to_pdf',
+    'DocumentConverter',
+    'get_document_text',
+    'extract_text_from_file',
+    'TextLoader',
+    'EmptyDocument',
+    'ExceededMaxPages',
+    'ConversionError'
+]

polytext-0.1.0/polytext/converter/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+# polytext/converter/__init__.py
+from .pdf import convert_to_pdf, DocumentConverter
+__all__ = ['convert_to_pdf', 'DocumentConverter']

polytext-0.1.0/polytext/converter/pdf.py ADDED Viewed

@@ -0,0 +1,256 @@
+# converter/pdf.py
+import os
+import subprocess
+import logging
+from ..exceptions.base import ConversionError
+logger = logging.getLogger(__name__)
+def convert_to_pdf(input_file, original_file, output_file=None):
+    """
+    Convenience function to convert a document to PDF format using LibreOffice.
+    Args:
+        input_file (str): Path to the input document file to be converted
+        original_file (str): Path to the original file for extension checking
+        output_file (str, optional): Path where the output PDF should be saved
+    Returns:
+        str: Path to the generated PDF file
+    Raises:
+        FileNotFoundError: If the input file doesn't exist
+        ConversionError: If the conversion process fails
+    """
+    converter = DocumentConverter()
+    return converter.convert_to_pdf(input_file, original_file, output_file)
+class DocumentConverter:
+    """
+    A class for converting various document formats to PDF using LibreOffice.
+    The converter supports common document formats like TXT, DOC(X), ODT, PPT(X),
+    and XLS(X). It requires LibreOffice to be installed on the system.
+    Attributes:
+        supported_extensions (list): List of supported file extensions
+    """
+    def __init__(self):
+        """Initialize the DocumentConverter."""
+        self.supported_extensions = [
+            '.txt', '.docx', '.doc', '.odt',
+            '.ppt', '.pptx', '.xlsx', '.xls', '.ods'
+        ]
+    @staticmethod
+    def check_libreoffice_installed():
+        """
+        Check if LibreOffice is installed and accessible in the system PATH.
+        Returns:
+            bool: True if LibreOffice is installed and available, False otherwise.
+        """
+        try:
+            subprocess.run(
+                ['libreoffice', '--version'],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                check=False
+            )
+            return True
+        except (subprocess.SubprocessError, FileNotFoundError):
+            return False
+    def convert_to_pdf(self, input_file, original_file, output_file=None):
+        """
+        Convert a document to PDF format using LibreOffice.
+        This method uses LibreOffice in headless mode to convert documents. If the input
+        file is already a PDF, it will be copied to the output location.
+        Args:
+            input_file (str): Path to the input document file to be converted
+            original_file (str): Path to the original file for extension checking
+            output_file (str, optional): Path where the output PDF should be saved.
+                If not provided, will use input_file name with .pdf extension
+        Returns:
+            str: Path to the generated PDF file
+        Raises:
+            FileNotFoundError: If the input file doesn't exist
+            ConversionError: If the conversion process fails or LibreOffice is not installed
+        """
+        if not os.path.exists(input_file):
+            raise FileNotFoundError(f"Input file '{input_file}' does not exist.")
+        # Check file extension
+        _, ext = os.path.splitext(original_file)
+        logger.info(os.path.splitext(original_file))
+        if ext.lower() not in self.supported_extensions and ext.lower() != '.pdf':
+            logger.warning(f"File extension '{ext}' may not be supported.")
+        # Set default output file name if not provided
+        if output_file is None:
+            output_file = os.path.splitext(input_file)[0] + '.pdf'
+        output_dir = os.path.dirname(os.path.abspath(output_file))
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+        # If the file is already a PDF, just copy it
+        if ext.lower() == '.pdf':
+            import shutil
+            shutil.copy2(input_file, output_file)
+            logger.info(f"File is already a PDF. Copied to '{output_file}'")
+            return output_file
+        # Check if LibreOffice is installed
+        if not self.check_libreoffice_installed():
+            raise ConversionError(
+                "LibreOffice is not installed or not found in PATH. "
+                "Please install LibreOffice to convert documents to PDF."
+            )
+        # Build the LibreOffice command
+        command = [
+            'libreoffice',
+            '--headless',
+            '--nologo',
+            '--nofirststartwizard',
+            '--convert-to', 'pdf',
+            '--outdir', output_dir,
+            input_file
+        ]
+        try:
+            # Suppress Java runtime warnings by redirecting stderr
+            subprocess.check_call(command, stderr=subprocess.DEVNULL)
+            logger.info(f"Conversion successful: '{output_file}'")
+        except subprocess.CalledProcessError as e:
+            error_msg = f"Error during conversion: {e}"
+            logger.error(error_msg)
+            raise ConversionError(error_msg, e)
+        # After conversion, ensure the output file is correctly named
+        converted_file = os.path.join(
+            output_dir,
+            os.path.splitext(os.path.basename(input_file))[0] + '.pdf'
+        )
+        if converted_file != output_file:
+            os.rename(converted_file, output_file)
+        return output_file
+# Alternative method with direct page_range management
+    # def convert_to_pdf(self, input_file, output_file=None, page_range=None):
+    #     """
+    #     Converts a document to PDF format using LibreOffice.
+    #     """
+    #     if not os.path.exists(input_file):
+    #         raise FileNotFoundError(f"Input file '{input_file}' does not exist.")
+    #
+    #     # Check file extension
+    #     _, ext = os.path.splitext(input_file)
+    #     logger.info(os.path.splitext(input_file))
+    #
+    #     # Set default output file name if not provided
+    #     if output_file is None:
+    #         output_file = os.path.splitext(input_file)[0] + '.pdf'
+    #
+    #     output_dir = os.path.dirname(os.path.abspath(output_file))
+    #     if not os.path.exists(output_dir):
+    #         os.makedirs(output_dir)
+    #
+    #     # If the file is already a PDF, just copy it
+    #     if ext.lower() == '.pdf':
+    #         import shutil
+    #         shutil.copy2(input_file, output_file)
+    #         logger.info(f"File is already a PDF. Copied to '{output_file}'")
+    #         return output_file
+    #
+    #     # Check if LibreOffice is installed
+    #     if not self.check_libreoffice_installed():
+    #         raise ConversionError(
+    #             "LibreOffice is not installed or not found in PATH. "
+    #             "Please install LibreOffice to convert documents to PDF."
+    #         )
+    #
+    #     # Record existing PDFs in the output directory
+    #     import glob
+    #     existing_pdfs = set(glob.glob(os.path.join(output_dir, "*.pdf")))
+    #
+    #     # Build the LibreOffice command
+    #     convert_filter = 'pdf'
+    #     if page_range:
+    #         convert_filter = f'pdf:writer_pdf_Export:{{"PageRange":{{"type":"string","value":"{page_range}"}}}}'
+    #
+    #     command = [
+    #         'libreoffice',
+    #         '--headless',
+    #         '--nologo',
+    #         '--nofirststartwizard',
+    #         '--convert-to', convert_filter,
+    #         '--outdir', output_dir,
+    #         input_file
+    #     ]
+    #
+    #     try:
+    #         # Run the command and capture output
+    #         result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+    #                                 text=True, check=False)
+    #
+    #         # Check if command was successful
+    #         if result.returncode != 0:
+    #             error_msg = f"Error during conversion: {result.stderr}"
+    #             logger.error(error_msg)
+    #             raise ConversionError(error_msg)
+    #
+    #         # Log the output to help debugging
+    #         logger.info(f"LibreOffice conversion output: {result.stdout}")
+    #
+    #         # Find newly created PDF file
+    #         current_pdfs = set(glob.glob(os.path.join(output_dir, "*.pdf")))
+    #         new_pdfs = current_pdfs - existing_pdfs
+    #
+    #         if not new_pdfs:
+    #             # Try looking in /private path as well (for macOS)
+    #             if output_dir.startswith('/var/'):
+    #                 private_dir = '/private' + output_dir
+    #                 private_pdfs = set(glob.glob(os.path.join(private_dir, "*.pdf")))
+    #                 new_pdfs = private_pdfs - existing_pdfs
+    #
+    #         if not new_pdfs:
+    #             # Last resort: find the most recently created PDF
+    #             all_pdfs = glob.glob(os.path.join(output_dir, "*.pdf"))
+    #             if not all_pdfs and output_dir.startswith('/var/'):
+    #                 private_dir = '/private' + output_dir
+    #                 all_pdfs = glob.glob(os.path.join(private_dir, "*.pdf"))
+    #
+    #             if all_pdfs:
+    #                 converted_file = max(all_pdfs, key=os.path.getmtime)
+    #                 logger.info(f"Found most recent PDF: {converted_file}")
+    #             else:
+    #                 raise ConversionError(f"No PDF files found in output directory after conversion.")
+    #         else:
+    #             converted_file = list(new_pdfs)[0]
+    #             logger.info(f"Found newly created PDF: {converted_file}")
+    #
+    #         # Move to desired output location if needed
+    #         if converted_file != output_file:
+    #             import shutil
+    #             shutil.copy2(converted_file, output_file)
+    #             os.remove(converted_file)  # Clean up the original
+    #             logger.info(f"Moved PDF to final location: {output_file}")
+    #
+    #         return output_file
+    #
+    #     except Exception as e:
+    #         error_msg = f"Error during PDF conversion: {str(e)}"
+    #         logger.error(error_msg)
+    #         raise ConversionError(error_msg)

polytext-0.1.0/polytext/exceptions/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+# polytext/exceptions/__init__.py
+from .base import EmptyDocument, ExceededMaxPages, ConversionError
+__all__ = ['EmptyDocument', 'ExceededMaxPages', 'ConversionError']

polytext-0.1.0/polytext/exceptions/base.py ADDED Viewed

@@ -0,0 +1,52 @@
+# exceptions/base.py
+class ConversionError(Exception):
+    """
+    Exception raised when document conversion to PDF fails.
+    This exception is typically raised when LibreOffice fails to convert a document
+    or when the conversion process encounters system-level issues.
+    Attributes:
+        message (str): Detailed error message describing the conversion failure
+        original_exception: The underlying exception that caused the conversion failure
+    """
+    def __init__(self, message, original_exception=None):
+        super().__init__(message)
+        self.message = message
+        self.original_exception = original_exception
+class EmptyDocument(Exception):
+    """
+    Exception raised when a document contains no extractable text.
+    This exception is raised when text extraction yields empty results or
+    when the extracted text fails quality checks (e.g., too few characters,
+    excessive repeated content).
+    Attributes:
+        message (str): Description of why the document is considered empty
+        code (int): Error code for categorizing the type of emptiness (default: None)
+    """
+    def __init__(self, message, code=None):
+        super().__init__(message)
+        self.message = message
+        self.code = code
+class ExceededMaxPages(Exception):
+    """
+    Exception raised when requested page range exceeds document length.
+    This exception occurs when attempting to extract text from pages beyond
+    the document's actual page count or when invalid page ranges are specified.
+    Attributes:
+        message (str): Description of the page range error
+        code (int): Error code for tracking purposes (default: None)
+    """
+    def __init__(self, message, code=None):
+        super().__init__(message)
+        self.message = message
+        self.code = code

polytext-0.1.0/polytext/loader/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+# polytext/loader/__init__.py
+from .text import get_document_text, extract_text_from_file, TextLoader
+__all__ = ['get_document_text', 'extract_text_from_file', 'TextLoader']