PyPI - gllm-docproc-binary - Versions diffs - 0.7.26__cp311-cp311-macosx_13_0_arm64.whl - Mend

gllm-docproc-binary 0.7.26__cp311-cp311-macosx_13_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gllm-docproc-binary might be problematic. Click here for more details.

Files changed (168) hide show

gllm_docproc/__init__.pyi +0 -0
gllm_docproc/chunker/__init__.pyi +3 -0
gllm_docproc/chunker/base_chunker.pyi +28 -0
gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
gllm_docproc/chunker/table/__init__.pyi +3 -0
gllm_docproc/chunker/table/table_chunker.pyi +45 -0
gllm_docproc/converter/__init__.pyi +3 -0
gllm_docproc/converter/base_converter.pyi +15 -0
gllm_docproc/data_generator/__init__.pyi +5 -0
gllm_docproc/data_generator/base_data_generator.pyi +18 -0
gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
gllm_docproc/downloader/__init__.pyi +5 -0
gllm_docproc/downloader/base_downloader.pyi +19 -0
gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
gllm_docproc/downloader/html/__init__.pyi +7 -0
gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
gllm_docproc/downloader/html/html_downloader.pyi +114 -0
gllm_docproc/downloader/html/playwright_downloader.pyi +60 -0
gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
gllm_docproc/dpo_router/__init__.pyi +5 -0
gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
gllm_docproc/dpo_router/loader_router.pyi +52 -0
gllm_docproc/dpo_router/parser_router.pyi +42 -0
gllm_docproc/housekeeping/__init__.pyi +3 -0
gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
gllm_docproc/indexer/__init__.pyi +3 -0
gllm_docproc/indexer/base_indexer.pyi +30 -0
gllm_docproc/indexer/graph/__init__.pyi +4 -0
gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +79 -0
gllm_docproc/indexer/vector/__init__.pyi +3 -0
gllm_docproc/indexer/vector/vector_db_indexer.pyi +53 -0
gllm_docproc/loader/__init__.pyi +4 -0
gllm_docproc/loader/audio/__init__.pyi +3 -0
gllm_docproc/loader/audio/audio_loader.pyi +45 -0
gllm_docproc/loader/base_loader.pyi +30 -0
gllm_docproc/loader/csv/__init__.pyi +3 -0
gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
gllm_docproc/loader/docx/__init__.pyi +5 -0
gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
gllm_docproc/loader/exception/__init__.pyi +4 -0
gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
gllm_docproc/loader/html/__init__.pyi +5 -0
gllm_docproc/loader/html/exception/__init__.pyi +3 -0
gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
gllm_docproc/loader/html/flat/__init__.pyi +3 -0
gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +66 -0
gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
gllm_docproc/loader/html/flat/html_flat_merger.pyi +23 -0
gllm_docproc/loader/html/html_base_loader.pyi +25 -0
gllm_docproc/loader/html/nested/__init__.pyi +3 -0
gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
gllm_docproc/loader/html/utils/__init__.pyi +0 -0
gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
gllm_docproc/loader/html/utils/html_utils.pyi +59 -0
gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
gllm_docproc/loader/image/__init__.pyi +3 -0
gllm_docproc/loader/image/image_loader.pyi +54 -0
gllm_docproc/loader/json/__init__.pyi +3 -0
gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
gllm_docproc/loader/loader_utils.pyi +43 -0
gllm_docproc/loader/pdf/__init__.pyi +14 -0
gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
gllm_docproc/loader/pipeline_loader.pyi +48 -0
gllm_docproc/loader/pptx/__init__.pyi +3 -0
gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
gllm_docproc/loader/txt/__init__.pyi +3 -0
gllm_docproc/loader/txt/txt_loader.pyi +55 -0
gllm_docproc/loader/video/__init__.pyi +3 -0
gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
gllm_docproc/loader/xlsx/__init__.pyi +3 -0
gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
gllm_docproc/model/__init__.pyi +7 -0
gllm_docproc/model/element.pyi +38 -0
gllm_docproc/model/element_metadata.pyi +35 -0
gllm_docproc/model/loader_type.pyi +20 -0
gllm_docproc/model/media.pyi +51 -0
gllm_docproc/model/parser_type.pyi +19 -0
gllm_docproc/parser/__init__.pyi +4 -0
gllm_docproc/parser/base_parser.pyi +28 -0
gllm_docproc/parser/document/__init__.pyi +7 -0
gllm_docproc/parser/document/docx_parser.pyi +27 -0
gllm_docproc/parser/document/pdf_parser.pyi +35 -0
gllm_docproc/parser/document/pptx_parser.pyi +34 -0
gllm_docproc/parser/document/txt_parser.pyi +22 -0
gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
gllm_docproc/parser/html/__init__.pyi +4 -0
gllm_docproc/parser/html/flat/__init__.pyi +0 -0
gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
gllm_docproc/parser/html/nested/__init__.pyi +0 -0
gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
gllm_docproc/parser/image/__init__.pyi +4 -0
gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
gllm_docproc/parser/pipeline_parser.pyi +33 -0
gllm_docproc/parser/table/__init__.pyi +3 -0
gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
gllm_docproc/request_handler/__init__.pyi +3 -0
gllm_docproc/request_handler/base_request_handler.pyi +16 -0
gllm_docproc/response_handler/__init__.pyi +3 -0
gllm_docproc/response_handler/base_response_handler.pyi +38 -0
gllm_docproc/utils/__init__.pyi +3 -0
gllm_docproc/utils/async_utils.pyi +22 -0
gllm_docproc/utils/file_utils.pyi +76 -0
gllm_docproc/utils/html_constants.pyi +122 -0
gllm_docproc/validator/__init__.pyi +6 -0
gllm_docproc/validator/base_validator.pyi +34 -0
gllm_docproc/validator/character_count_validator.pyi +26 -0
gllm_docproc/validator/file_size_validator.pyi +20 -0
gllm_docproc/validator/model/__init__.pyi +4 -0
gllm_docproc/validator/model/validator_input.pyi +50 -0
gllm_docproc/validator/model/validator_result.pyi +19 -0
gllm_docproc/validator/page_count_validator.pyi +23 -0
gllm_docproc/validator/pipeline_validator.pyi +40 -0
gllm_docproc.build/.gitignore +1 -0
gllm_docproc.cpython-311-darwin.so +0 -0
gllm_docproc.pyi +222 -0
gllm_docproc_binary-0.7.26.dist-info/METADATA +216 -0
gllm_docproc_binary-0.7.26.dist-info/RECORD +168 -0
gllm_docproc_binary-0.7.26.dist-info/WHEEL +5 -0
gllm_docproc_binary-0.7.26.dist-info/top_level.txt +1 -0

gllm_docproc/model/element.pyi ADDED Viewed

@@ -0,0 +1,38 @@
+from _typeshed import Incomplete
+from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+from pydantic import BaseModel
+from typing import Any
+PAGE: str
+HEADER: str
+TITLE: str
+HEADING: Incomplete
+MAX_HEADING_LEVEL: int
+PARAGRAPH: str
+FOOTER: str
+FOOTNOTE: str
+TABLE: str
+IMAGE: str
+AUDIO: str
+VIDEO: str
+UNCATEGORIZED_TEXT: str
+class Element(BaseModel):
+    """An Element model.
+    This class serves as the Element model for storing element text, structure, and metadata.
+    Attributes:
+        text (str): The element text.
+        structure (str): The element structure.
+        metadata (ElementMetadata): The element metadata.
+    """
+    text: str
+    structure: str
+    metadata: ElementMetadata
+    @staticmethod
+    def to_list_dict(elements: list['Element']) -> list[dict[str, Any]]:
+        """Convert a list of Element objects to a list of dictionaries."""
+    @staticmethod
+    def from_list_dict(elements: list[dict[str, Any]]) -> list['Element']:
+        """Convert a list of dictionaries to a list of Element objects."""

gllm_docproc/model/element_metadata.pyi ADDED Viewed

@@ -0,0 +1,35 @@
+from pydantic import BaseModel
+PDF: str
+DOCX: str
+XLSX: str
+PPTX: str
+CSV: str
+TXT: str
+HTML: str
+AUDIO: str
+IMAGE: str
+VIDEO: str
+class ElementMetadata(BaseModel):
+    """Element metadata model.
+    This class serves as the Element metadata model for storing element metadata.
+    Mandatory Attributes:
+        source (str): The source of the element.
+        source_type (str): The source type of the element.
+        loaded_datetime (str): The datetime when the element is loaded, stored as a string.
+    """
+    source: str
+    source_type: str
+    loaded_datetime: str
+    class Config:
+        """Pydantic model configuration.
+        This class defines the Pydantic model configuration for the ElementMetadata model.
+        Attributes:
+            extra (str): Allow extra fields.
+        """
+        extra: str

gllm_docproc/model/loader_type.pyi ADDED Viewed

@@ -0,0 +1,20 @@
+from enum import StrEnum
+class LoaderType(StrEnum):
+    """Loader Type Enum.
+    This enum defines the different loader types.
+    """
+    AUDIO_LOADER: str
+    CSV_LOADER: str
+    DOCX_LOADER: str
+    HTML_LOADER: str
+    IMAGE_LOADER: str
+    JSON_ELEMENTS_LOADER: str
+    PDF_LOADER: str
+    PPTX_LOADER: str
+    TXT_LOADER: str
+    VIDEO_LOADER: str
+    XLSX_LOADER: str
+    UNCATEGORIZED: str
+    KEY: str

gllm_docproc/model/media.pyi ADDED Viewed

@@ -0,0 +1,51 @@
+from enum import StrEnum
+from pydantic import BaseModel, computed_field
+class MediaType(StrEnum):
+    """Defines valid media types."""
+    IMAGE: str
+    AUDIO: str
+    VIDEO: str
+    YOUTUBE: str
+class MediaSourceType(StrEnum):
+    """Defines valid media source types."""
+    BASE64: str
+    URL: str
+class Media(BaseModel):
+    """Media model which contains media information.
+    This class serves as the base model for storing media information in element metadata.
+    An element with media (image, audio, video, youtube) will have a `media` metadata entry holding a list of dicts.
+    Each dict follows the Media model schema.
+    Attributes:
+        media_id (str): Unique identifier for the media, automatically generated from media_type and media_content.
+        media_type (MediaType): Type of media (image, audio, video, youtube).
+        media_content (str): Base64 encoded string or URL pointing to the media content.
+        media_content_type (MediaSourceType): Type of content source (base64 or url).
+    """
+    media_type: MediaType
+    media_content: str
+    media_content_type: MediaSourceType
+    @computed_field
+    @property
+    def media_id(self) -> str:
+        """Generate a standardized media ID.
+        This property generates a standardized media ID in the format:
+        {media_type}_{sha256_from_media_content_16_digit}
+        Returns:
+            str: The generated media ID.
+        """
+    class Config:
+        """Pydantic model configuration.
+        This class defines the Pydantic model configuration for the Media model.
+        Attributes:
+            extra (str): Allow extra fields.
+        """
+        extra: str

gllm_docproc/model/parser_type.pyi ADDED Viewed

@@ -0,0 +1,19 @@
+from enum import StrEnum
+class ParserType(StrEnum):
+    """Parser Type Enum.
+    This enum defines the different parser types.
+    """
+    AUDIO_PARSER: str
+    CSV_PARSER: str
+    DOCX_PARSER: str
+    HTML_PARSER: str
+    IMAGE_PARSER: str
+    PDF_PARSER: str
+    PPTX_PARSER: str
+    TXT_PARSER: str
+    VIDEO_PARSER: str
+    XLSX_PARSER: str
+    UNCATEGORIZED: str
+    KEY: str

gllm_docproc/parser/__init__.pyi ADDED Viewed

@@ -0,0 +1,4 @@
+from .base_parser import BaseParser as BaseParser
+from .pipeline_parser import PipelineParser as PipelineParser
+__all__ = ['BaseParser', 'PipelineParser']

gllm_docproc/parser/base_parser.pyi ADDED Viewed

@@ -0,0 +1,28 @@
+from abc import ABC, abstractmethod
+from typing import Any
+class BaseParser(ABC):
+    """Base class for document parsers.
+    This class serves as the base for document parsers, which define the structure of every
+    piece of content in a document.
+    Methods:
+        parse(loaded_elements, **kwargs): Abstract method to parse a document.
+    """
+    @abstractmethod
+    def parse(self, loaded_elements: Any, **kwargs: Any) -> Any:
+        """Parse loaded elements to get element structure.
+        This method is abstract and must be implemented in subclasses.
+        It defines the process of parsing a document using loaded elements.
+        Args:
+            loaded_elements (Any): The loaded elements from the loader, ideally formatted as List[Dict].
+            **kwargs (Any): Additional keyword arguments for customization.
+        Returns:
+            Any: The parsed document, ideally formatted as List[Dict]. Each dictionary within
+                the list is recommended to follow the structure of the 'Element' model,
+                to ensure consistency and ease of use across the Document Processing Orchestrator.
+        """

gllm_docproc/parser/document/__init__.pyi ADDED Viewed

@@ -0,0 +1,7 @@
+from .docx_parser import DOCXParser as DOCXParser
+from .pdf_parser import PDFParser as PDFParser
+from .pptx_parser import PPTXParser as PPTXParser
+from .txt_parser import TXTParser as TXTParser
+from .xlsx_parser import XLSXParser as XLSXParser
+__all__ = ['DOCXParser', 'PDFParser', 'PPTXParser', 'TXTParser', 'XLSXParser']

gllm_docproc/parser/document/docx_parser.pyi ADDED Viewed

@@ -0,0 +1,27 @@
+from gllm_docproc.model.element import Element as Element, FOOTER as FOOTER, HEADER as HEADER, HEADING as HEADING, MAX_HEADING_LEVEL as MAX_HEADING_LEVEL, PARAGRAPH as PARAGRAPH, TITLE as TITLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+from typing import Any
+class DOCXParser(BaseParser):
+    """A DOCX parser for parsing DOCX document text structure.
+    This class serves as the DOCX parser for parsing DOCX document text structure.
+    It defines the structure for parsing DOCX document text structure from a given loaded_elements.
+    Methods:
+        parse(loaded_elements, **kwargs): Parse the document from the loaded elements.
+    """
+    def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+        """Parse the document from the loaded elements.
+        This method defines the process of defining text structure from loaded_elements (DOCX Loader output)
+        by their style_name. An element with a customized style_name is categorized as a paragraph
+        (for example, 'Heading', 'Heading Body', and 'Title 1' are categorized as paragraph).
+        Args:
+            loaded_elements (list[dict[str, Any]]): A list of loaded elements containing text content and metadata.
+            **kwargs (Any): Additional keyword arguments for parsing the document.
+        Returns:
+            list[dict[str, Any]]: A list of parsed elements containing text content and metadata.
+        """

gllm_docproc/parser/document/pdf_parser.pyi ADDED Viewed

@@ -0,0 +1,35 @@
+from gllm_docproc.model.element import Element as Element, FOOTER as FOOTER, FOOTNOTE as FOOTNOTE, HEADER as HEADER, HEADING as HEADING, PARAGRAPH as PARAGRAPH, TITLE as TITLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+from typing import Any
+HEADER_THRESHOLD_POSITION: int
+FOOTER_THRESHOLD_POSITION: int
+FOOTNOTE_POSITION_RATIO: float
+class PDFParser(BaseParser):
+    """A class to parse PDF documents.
+    This class serves as a PDF parser for parsing or defining the structure of text within PDF documents
+    based on the text metadata (font size, font family, coordinates, etc.).
+    Methods:
+        parse: Parse the loaded elements.
+    """
+    def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+        """Parse the loaded elements.
+        This method defines the process of defining text structure of the loaded elements based on metadata
+        for PDF loaded elements.
+        Args:
+            loaded_elements (list[dict[str, Any]]): A list of dictionaries containing loaded element
+                content and metadata.
+            **kwargs (Any): Additional keyword arguments.
+        Kwargs:
+            header_footer_tolerance (int, optional): An integer value indicating the tolerance for header and footer.
+                Defaults to 0.
+        Returns:
+            list[dict[str, Any]]: A list of dictionaries containing parsed element content and metadata.
+        """

gllm_docproc/parser/document/pptx_parser.pyi ADDED Viewed

@@ -0,0 +1,34 @@
+from _typeshed import Incomplete
+from gllm_docproc.model.element import Element as Element, FOOTER as FOOTER, PARAGRAPH as PARAGRAPH, TITLE as TITLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+from typing import Any
+class PPTXParser(BaseParser):
+    """A PPTX parser for parsing PPTX document shape structure.
+    This class serves as the PPTX parser for parsing PPTX document shape structure.
+    It defines the structure for parsing PPTX document shape structure from a given loaded_elements.
+    Methods:
+        parse(loaded_elements, **kwargs): Parse the document from the loaded elements.
+    """
+    logger: Incomplete
+    def __init__(self) -> None:
+        """Initialize the PPTXParser class."""
+    def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+        """Parse the document from the loaded elements.
+        This method defines the process of defining shape structure from loaded_elements (PPTX Loader output)
+        by their placeholder types. A shape with a customized placeholder type is categorized as a paragraph
+        (for example, 'BITMAP', 'MIXED', and 'OBJECT' are categorized as paragraph).
+        Args:
+            loaded_elements (list[dict[str, Any]]): A list of loaded elements containing shape content and metadata.
+            **kwargs (Any): Additional keyword arguments for parsing the document.
+        Returns:
+            list[dict[str, Any]]: A list of parsed elements containing shape content and metadata.
+        """

gllm_docproc/parser/document/txt_parser.pyi ADDED Viewed

@@ -0,0 +1,22 @@
+from gllm_docproc.model.element import Element as Element, PARAGRAPH as PARAGRAPH, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+from typing import Any
+class TXTParser(BaseParser):
+    """TXT parser for parsing text files.
+    Methods:
+        parse: Parse a list of elements from a text file.
+    """
+    def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+        """Parse a list of elements from a text file.
+        All elements with structure UNCATEGORIZED_TEXT will be converted to PARAGRAPH.
+        Args:
+            loaded_elements (list[dict[str, Any]]): The list of elements that have already been loaded.
+            **kwargs: Additional keyword arguments.
+        Returns:
+            list[dict[str, Any]]: A list of elements.
+        """

gllm_docproc/parser/document/xlsx_parser.pyi ADDED Viewed

@@ -0,0 +1,26 @@
+from gllm_docproc.model.element import Element as Element
+from gllm_docproc.parser import BaseParser as BaseParser
+from typing import Any
+DEFAULT_SHEET_NAME_PATTERN: str
+class XLSXParser(BaseParser):
+    """An XLSX parser for parsing XLSX document text structure.
+    This class serves as the XLSX parser for parsing XLSX document text structure.
+    It defines the structure for parsing XLSX document text structure from a given loaded_elements.
+    Methods:
+        parse(loaded_elements, **kwargs): Parse the document from the loaded elements.
+    """
+    def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+        """Parse loaded elements by assigning a structure to each element.
+        Args:
+            loaded_elements (list[dict[str, Any]]): A list of dictionaries representing loaded elements.
+            **kwargs (Any): Additional arguments for parsing the document.
+        Returns:
+            list[dict[str, Any]]: A list of dictionaries containing parsed elements with assigned structures.
+        """

gllm_docproc/parser/html/__init__.pyi ADDED Viewed

@@ -0,0 +1,4 @@
+from .flat.html_flat_parser import HTMLFlatParser as HTMLFlatParser
+from .nested.html_nested_parser import HTMLNestedParser as HTMLNestedParser
+__all__ = ['HTMLFlatParser', 'HTMLNestedParser']

gllm_docproc/parser/html/flat/__init__.pyi ADDED Viewed

File without changes

gllm_docproc/parser/html/flat/html_flat_parser.pyi ADDED Viewed

@@ -0,0 +1,27 @@
+from gllm_docproc.model.element import PARAGRAPH as PARAGRAPH
+from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+from gllm_docproc.utils.html_constants import HTMLTags as HTMLTags, ItemDataKeys as ItemDataKeys, Structure as Structure
+from typing import Any
+class HTMLFlatParser(BaseParser):
+    """This class extends the BaseParser and is specifically designed for parsing elements loaded from web content.
+    It assigns a structure to each loaded element based on the HTML tags present in its metadata.
+    Attributes:
+        None
+    Methods:
+        parse(loaded_elements: list[dict], **kwargs: dict[str, Any]) -> list[dict]:
+            Parses the loaded_elements and assigns a structure to each element based on its HTML tags.
+    """
+    def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: dict[str, Any]) -> list[dict[str, Any]]:
+        """Parses the loaded_elements and assigns a structure to each element based on its HTML tags.
+        Args:
+            loaded_elements (list[dict]): The elements loaded from web content to be parsed.
+            **kwargs (dict[str, Any]): Additional keyword arguments.
+        Returns:
+            list[dict]: The parsed elements with assigned structures.
+        """

gllm_docproc/parser/html/nested/__init__.pyi ADDED Viewed

File without changes

gllm_docproc/parser/html/nested/html_json_processor.pyi ADDED Viewed

@@ -0,0 +1,158 @@
+from _typeshed import Incomplete
+from gllm_docproc.model.element import PARAGRAPH as PARAGRAPH, TABLE as TABLE, TITLE as TITLE
+from gllm_docproc.parser.html.nested.nested_element import NestedElement as NestedElement
+from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, ErrorMessage as ErrorMessage, FORMATTING_TAGS as FORMATTING_TAGS, HTMLTags as HTMLTags, ItemDataKeys as ItemDataKeys, MetaDataKeys as MetaDataKeys, SPACING as SPACING, Structure as Structure, TableConstants as TableConstants
+class HTMLJsonProcessor:
+    """Processor for processing items scraped by the spider.
+    This pipeline processes the raw data scraped by the spider, formats it, and stores it in a JSON format.
+    It also handles errors during the processing and logging of the data.
+    Attributes:
+        logger: An instance of a logger, used for logging runtime information.
+        element_id: A counter for the elements processed by the pipeline.
+        processor_result: A dictionary that holds the processed data.
+    """
+    logger: Incomplete
+    element_id: int
+    processor_result: Incomplete
+    def __init__(self) -> None:
+        """Initialize the HTMLJsonProcessor."""
+    def process_item(self, item: list[dict]):
+        """Processes each item passed by the spider.
+        The method formats the raw data and stores it in the processor_result dictionary.
+        Args:
+            item (list): The raw data scraped by the spider.
+        Returns:
+            list: The processed item.
+        """
+    def add_title_element(self, item) -> None:
+        """Adds the title element to the processor_result dictionary.
+        Args:
+            item (dict): The raw data scraped by the spider.
+        """
+    def extract_data(self, current: dict, data: NestedElement):
+        """Extracts data from the raw data.
+        This method traverses the raw data and extracts the necessary information.
+        Args:
+            current (dict): The current node in the raw data.
+            data (NestedElement): The dictionary where the extracted data is stored.
+        """
+    def handle_table_data(self, current, data: NestedElement):
+        """Handles table content.
+        Args:
+            current (dict): The current node in the raw data. It should contain the table content and metadata.
+            data (NestedElement): The dictionary where the extracted data is stored.
+        """
+    def handle_media_data(self, current, data: NestedElement):
+        """Handles media content.
+        Args:
+            current (dict): The current node in the raw data.
+            data (NestedElement): The dictionary where the extracted data is stored.
+        """
+    def handle_string_content(self, current, data: NestedElement):
+        """Handles string content.
+        Args:
+            current (dict): The current node in the raw data.
+            data (NestedElement): The dictionary where the extracted data is stored.
+        """
+    def handle_other_cases(self, current, data: NestedElement):
+        """Handles other cases.
+        Args:
+            current (dict): The current node in the raw data.
+            data (NestedElement): The dictionary where the extracted data is stored.
+        """
+    def handle_current_tag(self, current, data: NestedElement) -> tuple[NestedElement, dict]:
+        """Handles the current tag. This method checks the current tag and updates the data accordingly.
+        Args:
+            current (dict): The current node in the raw data.
+            data (NestedElement): The dictionary where the extracted data is stored.
+        Returns:
+            NestedElement: The updated NestedElement object.
+            dict: A dictionary containing additional arguments.
+        """
+    def handle_content(self, current, data: NestedElement, args: dict):
+        """Handles content. This method iterates over the content and extracts the necessary information.
+        Args:
+            current (dict): The current node in the raw data.
+            data (NestedElement): The dictionary where the extracted data is stored.
+            args (dict): The dictionary containing the arguments for the method.
+        """
+    def add_result(self, data: NestedElement):
+        """Adds the processed data to the processor_result dictionary.
+        Args:
+            data (NestedElement): The processed data.
+        """
+    def add_link(self, data: NestedElement) -> NestedElement:
+        """Adds a link to the processed data content.
+        Args:
+            data (NestedElement): The processed data.
+        Returns:
+            NestedElement: The processed data.
+        """
+    def add_index(self, data: NestedElement) -> NestedElement:
+        """Adds an index to the processed data content.
+        Args:
+            data (NestedElement): The processed data.
+        Returns:
+            NestedElement: The processed data.
+        """
+    def handle_media(self, current, data: NestedElement) -> NestedElement:
+        """Handles media content.
+        Args:
+            current (dict): The current node in the raw data.
+            data (NestedElement): The dictionary where the extracted data is stored.
+        Returns:
+            NestedElement: The processed data.
+        """
+    def handle_table(self, current, data: NestedElement) -> list:
+        """Handle Table.
+        This method processes table content by iterating over its metadata, handling each row based on its type,
+        and appending the result to the table data.
+        Args:
+            current (dict): The current node in the raw data. It should contain the table content and metadata.
+            data (NestedElement): The dictionary where the extracted data is stored. This method adds a 'structure' key with the
+                         value 'table', and appends the extracted table data to this dictionary.
+        Returns:
+            list: A list of dictionaries containing the extracted table data.
+        """
+    def print_row(self, row, col_size=None):
+        """Formats a table row.
+        Args:
+            row (list): The row to be formatted.
+            col_size (list | None, optional): The size of the columns. Defaults to None.
+        Returns:
+            str: The formatted row.
+        """
+    def print_table_separator(self, row):
+        """Formats a table separator.
+        Returns:
+            str: The formatted table separator.
+        """

gllm_docproc/parser/html/nested/html_nested_parser.pyi ADDED Viewed

@@ -0,0 +1,24 @@
+from gllm_docproc.model.element import Element as Element
+from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+from gllm_docproc.parser.html.nested.html_json_processor import HTMLJsonProcessor as HTMLJsonProcessor
+from typing import Any
+class HTMLNestedParser(BaseParser):
+    """A parser class for processing JSON elements into parsed elements.
+    This class inherits from the BaseParser class and implements the parse method
+    to convert loaded HTML elements into a processed JSON format.
+    Attributes:
+        None
+    """
+    def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: dict[str, Any]) -> list[dict[str, Any]]:
+        """Processes loaded HTML elements into a JSON format.
+        Args:
+            loaded_elements (list[dict[str, Any]]): The loaded HTML elements to be processed.
+            **kwargs (dict[str, Any]): Additional keyword arguments.
+        Returns:
+            list[dict[str, Any]]: The processed JSON representation of the HTML elements.
+        """

gllm_docproc/parser/html/nested/nested_element.pyi ADDED Viewed

@@ -0,0 +1,31 @@
+from gllm_docproc.model.element import Element as Element
+class NestedElement(Element):
+    """A specialized class extending Element to represent nested elements.
+    This class includes additional functionality specific to nested elements, such as generating
+    a unique element_id and providing methods to convert the instance to a dictionary or Element.
+    Attributes:
+        element_id (int): A unique identifier for the nested element.
+    Methods:
+        to_dict(): Convert the NestedElement instance to a dictionary.
+        to_element(): Convert the NestedElement instance to an Element.
+    """
+    element_id: int
+    def to_dict(self):
+        """Convert the NestedElement instance to a dictionary.
+        Returns:
+            dict: A dictionary representation of the NestedElement instance.
+        """
+    def to_element(self) -> Element:
+        """Convert the NestedElement instance to an Element.
+        This method creates an Element instance from the current NestedElement. It deep copies the metadata,
+        assigns the element_id, and constructs an Element with the associated text, metadata, and structure.
+        Returns:
+            Element: The Element instance created from the NestedElement.
+        """

gllm_docproc/parser/image/__init__.pyi ADDED Viewed

@@ -0,0 +1,4 @@
+from gllm_docproc.parser.image.image_mime_normalization_parser import ImageMIMENormalizationParser as ImageMIMENormalizationParser
+from gllm_docproc.parser.image.image_plain_small_filter_parser import ImagePlainSmallFilterParser as ImagePlainSmallFilterParser
+__all__ = ['ImageMIMENormalizationParser', 'ImagePlainSmallFilterParser']

gllm_docproc/parser/image/image_mime_normalization_parser.pyi ADDED Viewed

@@ -0,0 +1,43 @@
+from _typeshed import Incomplete
+from gllm_docproc.model.element import Element as Element
+from gllm_docproc.model.media import MediaSourceType as MediaSourceType, MediaType as MediaType
+from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+from typing import Any
+SUPPORTED_TARGET_MIME_TYPES: Incomplete
+class ImageMIMENormalizationParser(BaseParser):
+    """Parser for normalizing unsupported image MIME types.
+    This parser identifies images with unsupported MIME types and converts them to the target MIME type.
+    """
+    target_mime_type: Incomplete
+    target_format: Incomplete
+    supported_mime_types: Incomplete
+    logger: Incomplete
+    def __init__(self, target_mime_type: str = 'image/png', supported_mime_types: set[str] | None = None) -> None:
+        '''Initialize the image MIME normalization parser.
+        Args:
+            target_mime_type (str, optional): The target MIME type to convert images to.
+                Must be one of the supported target MIME types. Defaults to "image/png".
+            supported_mime_types (set[str] | None, optional): Set of MIME types that don\'t need normalization.
+                If None, only the target format is considered supported. Defaults to None.
+        Raises:
+            ValueError: If target_mime_type is not in SUPPORTED_TARGET_MIME_TYPES.
+        '''
+    def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+        """Normalize images with unsupported MIME types to the target MIME type.
+        This method converts the base64 image data in element.media to the target MIME type.
+        If conversion fails, the original base64 image data is kept and processing continues.
+        Args:
+            loaded_elements (list[dict[str, Any]]): A list of elements in list-of-dict format, where each dict
+                mirrors the Element model structure.
+            **kwargs (Any): Additional keyword arguments.
+        Returns:
+            list[dict[str, Any]]: Elements with normalized images.
+        """