gllm-docproc-binary 0.1.8 (cp312-cp312-manylinux_2_31_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of gllm-docproc-binary might be problematic.
- gllm_docproc/__init__.pyi +0 -0
- gllm_docproc/chunker/__init__.pyi +3 -0
- gllm_docproc/chunker/base_chunker.pyi +29 -0
- gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
- gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
- gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
- gllm_docproc/chunker/table/__init__.pyi +3 -0
- gllm_docproc/chunker/table/table_chunker.pyi +45 -0
- gllm_docproc/converter/__init__.pyi +3 -0
- gllm_docproc/converter/base_converter.pyi +16 -0
- gllm_docproc/data_generator/__init__.pyi +3 -0
- gllm_docproc/data_generator/base_data_generator.pyi +19 -0
- gllm_docproc/downloader/__init__.pyi +3 -0
- gllm_docproc/downloader/base_downloader.pyi +16 -0
- gllm_docproc/downloader/html/__init__.pyi +4 -0
- gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
- gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
- gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
- gllm_docproc/downloader/html/html_downloader.pyi +91 -0
- gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +29 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +50 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
- gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +45 -0
- gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
- gllm_docproc/downloader/html/utils/web_utils.pyi +21 -0
- gllm_docproc/dpo_router/__init__.pyi +3 -0
- gllm_docproc/dpo_router/base_dpo_router.pyi +17 -0
- gllm_docproc/housekeeping/__init__.pyi +3 -0
- gllm_docproc/housekeeping/base_housekeeping.pyi +15 -0
- gllm_docproc/indexer/__init__.pyi +3 -0
- gllm_docproc/indexer/base_indexer.pyi +31 -0
- gllm_docproc/indexer/graph/__init__.pyi +3 -0
- gllm_docproc/indexer/knowledge_graph/__init__.pyi +4 -0
- gllm_docproc/loader/__init__.pyi +4 -0
- gllm_docproc/loader/audio/__init__.pyi +3 -0
- gllm_docproc/loader/base_loader.pyi +31 -0
- gllm_docproc/loader/docx/__init__.pyi +5 -0
- gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
- gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
- gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
- gllm_docproc/loader/exception/__init__.pyi +3 -0
- gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
- gllm_docproc/loader/html/__init__.pyi +5 -0
- gllm_docproc/loader/html/exception/__init__.pyi +3 -0
- gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
- gllm_docproc/loader/html/flat/__init__.pyi +3 -0
- gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +52 -0
- gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
- gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
- gllm_docproc/loader/html/html_base_loader.pyi +25 -0
- gllm_docproc/loader/html/nested/__init__.pyi +3 -0
- gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
- gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
- gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
- gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
- gllm_docproc/loader/html/utils/__init__.pyi +0 -0
- gllm_docproc/loader/html/utils/flat_table_utils.pyi +29 -0
- gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
- gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
- gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
- gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
- gllm_docproc/loader/json/__init__.pyi +3 -0
- gllm_docproc/loader/json/json_elements_loader.pyi +33 -0
- gllm_docproc/loader/loader_utils.pyi +42 -0
- gllm_docproc/loader/pdf/__init__.pyi +13 -0
- gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +40 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +45 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +50 -0
- gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
- gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
- gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pymupdf_loader.pyi +43 -0
- gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +44 -0
- gllm_docproc/loader/pdf/pymupdf_utils.pyi +34 -0
- gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
- gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
- gllm_docproc/loader/pipeline_loader.pyi +48 -0
- gllm_docproc/loader/txt/__init__.pyi +3 -0
- gllm_docproc/loader/txt/txt_loader.pyi +26 -0
- gllm_docproc/loader/xlsx/__init__.pyi +3 -0
- gllm_docproc/loader/xlsx/openpyxl_loader.pyi +37 -0
- gllm_docproc/model/__init__.pyi +4 -0
- gllm_docproc/model/element.pyi +37 -0
- gllm_docproc/model/element_metadata.pyi +35 -0
- gllm_docproc/parser/__init__.pyi +4 -0
- gllm_docproc/parser/base_parser.pyi +29 -0
- gllm_docproc/parser/document/__init__.pyi +6 -0
- gllm_docproc/parser/document/docx_parser.pyi +27 -0
- gllm_docproc/parser/document/pdf_parser.pyi +35 -0
- gllm_docproc/parser/document/txt_parser.pyi +22 -0
- gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
- gllm_docproc/parser/html/__init__.pyi +4 -0
- gllm_docproc/parser/html/flat/__init__.pyi +0 -0
- gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
- gllm_docproc/parser/html/nested/__init__.pyi +0 -0
- gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
- gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
- gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
- gllm_docproc/parser/pipeline_parser.pyi +33 -0
- gllm_docproc/parser/table/__init__.pyi +3 -0
- gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
- gllm_docproc/request_handler/__init__.pyi +3 -0
- gllm_docproc/request_handler/base_request_handler.pyi +17 -0
- gllm_docproc/response_handler/__init__.pyi +3 -0
- gllm_docproc/response_handler/base_response_handler.pyi +39 -0
- gllm_docproc/utils/__init__.pyi +0 -0
- gllm_docproc/utils/file_utils.pyi +76 -0
- gllm_docproc/utils/html_constants.pyi +121 -0
- gllm_docproc.build/.gitignore +1 -0
- gllm_docproc.cpython-312-x86_64-linux-gnu.so +0 -0
- gllm_docproc.pyi +149 -0
- gllm_docproc_binary-0.1.8.dist-info/METADATA +110 -0
- gllm_docproc_binary-0.1.8.dist-info/RECORD +123 -0
- gllm_docproc_binary-0.1.8.dist-info/WHEEL +4 -0
gllm_docproc/__init__.pyi: File without changes
gllm_docproc/chunker/base_chunker.pyi
@@ -0,0 +1,29 @@
+import abc
+from abc import ABC, abstractmethod
+from typing import Any
+
+class BaseChunker(ABC, metaclass=abc.ABCMeta):
+    """An abstract base class for chunkers.
+
+    This class segments or chunks elements based on contextual information.
+    Subclasses are expected to implement the 'chunk' method to handle chunking elements.
+
+    Methods:
+        chunk(elements, **kwargs): Abstract method to chunk a document.
+    """
+    @abstractmethod
+    def chunk(self, elements: Any, **kwargs: Any) -> Any:
+        """Chunk a document.
+
+        This method is abstract and must be implemented in subclasses.
+        It defines the process of chunking information from elements.
+
+        Args:
+            elements (Any): The information to be chunked, ideally formatted as List[Dict].
+            **kwargs (Any): Additional keyword arguments for customization.
+
+        Returns:
+            Any: The chunked information, ideally formatted as List[Dict]. Each dictionary within
+                the list is recommended to follow the structure of the 'Element' model,
+                to ensure consistency and ease of use across the Document Processing Orchestrator.
+        """
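For reference, a minimal sketch of a custom chunker built on this interface (not part of the package; it assumes Element-like dictionaries with a 'text' key):

from typing import Any

from gllm_docproc.chunker.base_chunker import BaseChunker


class ParagraphChunker(BaseChunker):
    """Hypothetical chunker that splits each element's text on blank lines."""

    def chunk(self, elements: Any, **kwargs: Any) -> Any:
        chunks = []
        for element in elements:
            # Assumes each element is a dict with a 'text' field (Element-like).
            for part in element.get("text", "").split("\n\n"):
                if part.strip():
                    chunks.append({**element, "text": part.strip()})
        return chunks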
gllm_docproc/chunker/structured_element/chunk_enricher.pyi
@@ -0,0 +1,43 @@
+from gllm_docproc.model.element import Element as Element
+from gllm_docproc.model.element_metadata import AUDIO as AUDIO, PDF as PDF
+
+def enrich_chunk(chunk: Element, elements: list[Element]) -> Element:
+    """Enrich the chunk with information from the original elements.
+
+    This is the default enrichment function for the structured element chunker.
+    The function enriches the chunk with information from the original elements.
+    Based on the source type, the information that we want to keep differs.
+
+    Args:
+        chunk (Element): The chunk to be enriched.
+        elements (list[Element]): The original elements that form the chunk.
+
+    Returns:
+        Element: The enriched chunk.
+    """
+def enrich_pdf_chunk(chunk: Element, elements: list[Element]) -> Element:
+    """The default function for enriching a PDF chunk.
+
+    The function enriches the PDF chunk with the coordinates and page_number information
+    of the original elements.
+
+    Args:
+        chunk (Element): The PDF chunk to be enriched.
+        elements (list[Element]): The original elements that form the chunk.
+
+    Returns:
+        Element: The enriched PDF chunk.
+    """
+def enrich_audio_chunk(chunk: Element, elements: list[Element]) -> Element:
+    """The default function for enriching an audio chunk.
+
+    The function enriches the audio chunk by replacing double newlines with a single newline.
+    Then, it adds the start_time, end_time, and lang_id information of the original elements.
+
+    Args:
+        chunk (Element): The audio chunk to be enriched.
+        elements (list[Element]): The original elements that form the chunk.
+
+    Returns:
+        Element: The enriched audio chunk.
+    """
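A custom enrichment function follows the same (chunk, elements) -> chunk shape and can be supplied to the structured element chunker through its enrich_chunk keyword argument. A minimal sketch, using plain dicts and a hypothetical metadata field:

def enrich_with_page_range(chunk: dict, elements: list[dict]) -> dict:
    """Hypothetical enricher: record the page span covered by the source elements."""
    pages = [e.get("metadata", {}).get("page_number") for e in elements]
    pages = [p for p in pages if p is not None]
    if pages:
        chunk.setdefault("metadata", {})["page_range"] = (min(pages), max(pages))
    return chunk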
gllm_docproc/chunker/structured_element/structured_element_chunker.pyi
@@ -0,0 +1,80 @@
+from _typeshed import Incomplete
+from gllm_docproc.chunker.base_chunker import BaseChunker as BaseChunker
+from gllm_docproc.chunker.structured_element.chunk_enricher import enrich_chunk as enrich_chunk
+from gllm_docproc.chunker.table import TableChunker as TableChunker
+from gllm_docproc.model.element import AUDIO as AUDIO, Element as Element, FOOTER as FOOTER, FOOTNOTE as FOOTNOTE, HEADER as HEADER, HEADING as HEADING, IMAGE as IMAGE, TABLE as TABLE, TITLE as TITLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT, VIDEO as VIDEO
+from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from typing import Any
+
+NON_TEXT_STRUCTURE: Incomplete
+
+def default_text_splitter() -> RecursiveCharacterTextSplitter:
+    '''Define the default text splitter for structured text chunking.
+
+    This function defines the default text splitter for structured text chunking.
+    The text splitter is defined with the following separators:
+
+    1. "\\n#" : Split by Title or Heading
+    2. "\\n\\n" : Split between Paragraph Elements
+    3. "\\n" : Split between Title/Heading and Paragraph Elements
+    4. ". " | "! " | "? " : Split by Sentence
+    5. ", " : Split by Word
+    6. " " : Split by Word
+    7. "" : Split by Character
+
+    Returns:
+        RecursiveCharacterTextSplitter: A RecursiveCharacterTextSplitter object for structured text chunking.
+    '''
+
+class StructuredElementChunker(BaseChunker):
+    """A class for structured text chunker.
+
+    This class defines the structure for chunking structured text into smaller chunks. It implements
+    the 'chunk' method to handle structured text chunking.
+
+    Methods:
+        chunk(elements, **kwargs): Chunk the structured text into smaller chunks.
+    """
+    default_text_splitter: Incomplete
+    default_table_chunker: Incomplete
+    text_splitter: Incomplete
+    table_chunker: Incomplete
+    is_parent_structure_info_included: Incomplete
+    def __init__(self, text_splitter: RecursiveCharacterTextSplitter = ..., table_chunker: BaseChunker = ..., is_parent_structure_info_included: bool = True) -> None:
+        """Initialize the structured text chunker.
+
+        Args:
+            text_splitter (RecursiveCharacterTextSplitter): A RecursiveCharacterTextSplitter object
+                for structured text chunking.
+            table_chunker (BaseChunker): A BaseChunker object for table chunking.
+            is_parent_structure_info_included (bool): A boolean value to include parent structure
+                information in the chunk.
+        """
+    def chunk(self, elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+        '''Chunk the structured text into smaller chunks.
+
+        This method defines the process of chunking structured text into smaller chunks. It uses the
+        RecursiveCharacterTextSplitter to split the text into chunks based on the defined separators.
+
+        The method will split the text recursively based on the defined separators, or by default:
+        1. "\\n#" : Split by Title or Heading
+        2. "\\n\\n" : Split between Paragraph Elements
+        3. "\\n" : Split between Title/Heading and Paragraph Elements
+        4. ". " | "! " | "? " : Split by Sentence
+        5. ", " : Split by Word
+        6. " " : Split by Word
+        7. "" : Split by Character
+
+        Kwargs:
+            excluded_structures (list[str]): A list of structures to be excluded from the chunking process.
+            enrich_chunk (Callable[[Element, list[Element]], Element]): A function to enrich the chunked element.
+            file_id (str | None): The file id of the chunked elements. Defaults to None.
+
+        Args:
+            elements (list[dict[str, any]]): A list of dictionaries containing text and structure.
+            **kwargs (Any): Additional keyword arguments for the chunker.
+
+        Returns:
+            list[dict[str, Any]]: A list of dictionaries containing chunked text and metadata.
+        '''
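A usage sketch based only on the signatures above; the element dictionaries and their field names are illustrative, as real inputs would come from a loader/parser and follow the Element model:

from langchain_text_splitters import RecursiveCharacterTextSplitter

from gllm_docproc.chunker.structured_element.structured_element_chunker import StructuredElementChunker

# Smaller chunks than the package default would produce (assumed parameters).
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
chunker = StructuredElementChunker(text_splitter=splitter)

elements = [
    {"text": "# Introduction", "structure": "heading"},                       # illustrative fields
    {"text": "First paragraph of the document...", "structure": "paragraph"},
]
chunks = chunker.chunk(elements, excluded_structures=["footer"])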
gllm_docproc/chunker/table/table_chunker.pyi
@@ -0,0 +1,45 @@
+from _typeshed import Incomplete
+from gllm_docproc.chunker.base_chunker import BaseChunker as BaseChunker
+from gllm_docproc.model.element import Element as Element, TABLE as TABLE
+from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+from typing import Any
+
+MARKDOWN: str
+CSV: str
+HTML: str
+
+class TableChunker(BaseChunker):
+    """Table Chunker class.
+
+    This class is used to chunk a table element into smaller chunks. It implements the 'chunk' method
+    to handle chunking the table element based on the chunk size and overlap. The table is converted
+    into the expected format (markdown, csv, or html).
+
+    Methods:
+        chunk(elements, **kwargs): Chunk a table element into smaller chunks.
+    """
+    chunk_size: Incomplete
+    chunk_overlap: Incomplete
+    table_format: Incomplete
+    table_splitter: Incomplete
+    def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 0, table_format: str = ...) -> None:
+        """Initializes the TableChunker class.
+
+        Args:
+            chunk_size (int): The size of each chunk.
+            chunk_overlap (int): The overlap between each chunk.
+            table_format (str): The format of the table (markdown, csv, or html).
+        """
+    def chunk(self, elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+        """Chunk a table element into smaller chunks.
+
+        This method chunks a table element into smaller chunks based on the chunk size and overlap.
+        It converts the table into the expected format (markdown, csv, or html) and then chunks the table.
+
+        Args:
+            elements (list[dict[str, Any]]): The table element to be chunked.
+            **kwargs (Any): Additional keyword arguments for customization.
+
+        Returns:
+            list[dict[str, Any]]: The list of smaller chunks.
+        """
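A usage sketch; the table element shown is illustrative, and only MARKDOWN, CSV, and HTML are known from the stub to be module-level string constants:

from gllm_docproc.chunker.table.table_chunker import MARKDOWN, TableChunker

chunker = TableChunker(chunk_size=1000, chunk_overlap=100, table_format=MARKDOWN)

# Illustrative table element; real elements come from a loader and follow the Element model.
table_element = {"text": "| col_a | col_b |\n|---|---|\n| 1 | 2 |", "structure": "table"}
chunks = chunker.chunk([table_element])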
gllm_docproc/converter/base_converter.pyi
@@ -0,0 +1,16 @@
+import abc
+from abc import ABC, abstractmethod
+
+class BaseConverter(ABC, metaclass=abc.ABCMeta):
+    """Base class for document converter."""
+    @abstractmethod
+    def convert(self, path_input: str, path_output: str) -> None:
+        """Converts a document.
+
+        Args:
+            path_input (str): The path of the document to be converted.
+            path_output (str): The path of the converted document.
+
+        Returns:
+            None
+        """
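A minimal sketch of a converter built on this interface (hypothetical; it merely copies the input file, standing in for a real format conversion):

import shutil

from gllm_docproc.converter.base_converter import BaseConverter


class CopyConverter(BaseConverter):
    """Hypothetical converter that 'converts' by copying the input file as-is."""

    def convert(self, path_input: str, path_output: str) -> None:
        shutil.copyfile(path_input, path_output)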
gllm_docproc/data_generator/base_data_generator.pyi
@@ -0,0 +1,19 @@
+import abc
+from abc import ABC, abstractmethod
+from typing import Any
+
+class BaseDataGenerator(ABC, metaclass=abc.ABCMeta):
+    """Base class for data generator."""
+    @abstractmethod
+    def generate(self, elements: Any, **kwargs: Any) -> Any:
+        """Generates data for a list of chunks.
+
+        Args:
+            elements (Any): The elements to be used for generating data/metadata, ideally formatted as List[Dict].
+            **kwargs (Any): Additional keyword arguments for customization.
+
+        Returns:
+            Any: The generated data, ideally formatted as List[Dict]. Each dictionary within
+                the list is recommended to follow the structure of the 'Element' model,
+                to ensure consistency and ease of use across the Document Processing Orchestrator.
+        """
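A sketch of a generator against this interface; the metadata field it adds is hypothetical:

from typing import Any

from gllm_docproc.data_generator.base_data_generator import BaseDataGenerator


class WordCountGenerator(BaseDataGenerator):
    """Hypothetical generator that annotates each element with a word count."""

    def generate(self, elements: Any, **kwargs: Any) -> Any:
        for element in elements:
            # Assumes Element-like dicts with 'text' and 'metadata' fields.
            element.setdefault("metadata", {})["word_count"] = len(element.get("text", "").split())
        return elements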
gllm_docproc/downloader/base_downloader.pyi
@@ -0,0 +1,16 @@
+import abc
+from abc import ABC, abstractmethod
+
+class BaseDownloader(ABC, metaclass=abc.ABCMeta):
+    """Base class for document downloader."""
+    @abstractmethod
+    def download(self, source: str, output: str) -> None:
+        """Downloads a document.
+
+        Args:
+            source (str): The source of the document (could be JSON-formatted or a URL).
+            output (str): The output where we put the downloaded content (usually a folder path).
+
+        Returns:
+            None
+        """
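A sketch of a downloader built on this interface (hypothetical; it fetches a single URL with the standard library and writes it under the output folder):

import os
import urllib.request

from gllm_docproc.downloader.base_downloader import BaseDownloader


class SimpleHTTPDownloader(BaseDownloader):
    """Hypothetical downloader that saves the body of a URL into the output folder."""

    def download(self, source: str, output: str) -> None:
        os.makedirs(output, exist_ok=True)
        with urllib.request.urlopen(source) as response:
            body = response.read()
        with open(os.path.join(output, "downloaded.html"), "wb") as f:
            f.write(body)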
gllm_docproc/downloader/html/exception/__init__.pyi
@@ -0,0 +1,4 @@
+from .item_scrape_failed_exception import ItemScrapeFailedException as ItemScrapeFailedException
+from .zyte_api_key_not_provided_exception import ZyteApiKeyNotProvidedException as ZyteApiKeyNotProvidedException
+
+__all__ = ['ItemScrapeFailedException', 'ZyteApiKeyNotProvidedException']
gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi
@@ -0,0 +1,16 @@
+from _typeshed import Incomplete
+
+class ItemScrapeFailedException(Exception):
+    """Exception raised when an item fails to be scraped.
+
+    Attributes:
+        message (str): Optional. The error message indicating the reason for the item scrape failure.
+    """
+    message: Incomplete
+    def __init__(self, message: str = 'Item failed to be scraped.') -> None:
+        '''Initialize the ItemScrapeFailedException.
+
+        Args:
+            message (str): Optional. The error message indicating the reason for the item scrape failure.
+                Defaults to "Item failed to be scraped."
+        '''
gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi
@@ -0,0 +1,15 @@
+from _typeshed import Incomplete
+
+class ZyteApiKeyNotProvidedException(Exception):
+    """Custom exception raised when the Zyte API key is not provided.
+
+    Attributes:
+        message (str): Optional. The error message associated with the exception.
+    """
+    message: Incomplete
+    def __init__(self, message: str = 'Zyte API Key not provided.') -> None:
+        '''Initialize the ZyteApiKeyNotProvidedException.
+
+        Args:
+            message (str, optional): The error message associated with the exception. Defaults to "Zyte API Key not provided."
+        '''
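A brief sketch of how this exception would typically be raised and handled by calling code; the guard function and environment variable name are hypothetical:

import os

from gllm_docproc.downloader.html.exception import ZyteApiKeyNotProvidedException


def require_zyte_key() -> str:
    """Hypothetical guard: fail fast when the key is missing."""
    api_key = os.environ.get("ZYTE_API_KEY")
    if not api_key:
        raise ZyteApiKeyNotProvidedException()
    return api_key


try:
    require_zyte_key()
except ZyteApiKeyNotProvidedException as exc:
    print(f"Cannot use the Zyte-backed spider: {exc}")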
gllm_docproc/downloader/html/html_downloader.pyi
@@ -0,0 +1,91 @@
+from _typeshed import Incomplete
+from gllm_docproc.downloader.base_downloader import BaseDownloader as BaseDownloader
+from gllm_docproc.downloader.html.exception import ItemScrapeFailedException as ItemScrapeFailedException
+from gllm_docproc.downloader.html.scraper.scraper.spiders import CrawlBaseSpider as CrawlBaseSpider, CrawlSitemapLinkSpider as CrawlSitemapLinkSpider, CrawlSitemapSpider as CrawlSitemapSpider
+from gllm_docproc.downloader.html.scraper.web_scraper_executor import WebScraperExecutor as WebScraperExecutor
+from gllm_docproc.downloader.html.utils import clean_url as clean_url, is_valid_url as is_valid_url
+from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, HTML as HTML
+from gllm_docproc.utils.file_utils import create_full_path as create_full_path, save_file as save_file, save_to_json as save_to_json
+from scrapy import Spider as Spider
+from typing import Any
+
+class HTMLDownloader(BaseDownloader):
+    """A downloader class for downloading web content.
+
+    This class inherits from the BaseDownloader class and provides methods to download web content.
+
+    Args:
+        **kwargs (Any): Additional keyword arguments.
+    """
+    URL_INDEX: int
+    CONTENT_INDEX: int
+    kwargs: Incomplete
+    def __init__(self, **kwargs: Any) -> None:
+        """Initializes the HTMLDownloader class.
+
+        Args:
+            **kwargs (Any): Additional keyword arguments.
+        """
+    def download(self, source: str, output: str) -> None:
+        """Downloads web content.
+
+        Args:
+            source (str): The URL of the web content to download.
+            output (str): The output where we put the downloaded content (usually a folder path).
+
+        Returns:
+            None
+        """
+    def download_from_multiple_urls(self, urls: list[str], output: str = '.', **kwargs: Any) -> None:
+        """Downloads web content from multiple URLs.
+
+        Args:
+            urls (list[str]): The URLs to download.
+            output (str): The output where we put the downloaded content (usually a folder path).
+            **kwargs (Any): Additional keyword arguments.
+
+        Returns:
+            None
+        """
+    def download_crawl(self, urls: list[str] | str, output: str = '.', spider_type: type[Spider] | None = None) -> None:
+        """Downloads web content from the provided URLs.
+
+        This method uses a web scraper to crawl the provided URLs and saves the downloaded content to a file.
+
+        Args:
+            urls (list[str] | str): The URLs to crawl. Can be a single URL (str) or a list of URLs (list[str]).
+            output (str): The output where we put the downloaded content (usually a folder path).
+            spider_type (type[Spider] | None): The type of spider to use for downloading.
+                Defaults to None, which will use CrawlBaseSpider.
+
+        Returns:
+            None
+        """
+    def download_sitemap(self, urls: list[str] | str, output: str = '.', spider_type: type[Spider] | None = None) -> None:
+        """Downloads web content from the sitemap of the provided URLs.
+
+        This method uses a web scraper to scrape the sitemap of each URL and saves the downloaded content to a file.
+
+        Args:
+            urls (list[str] | str): The URLs to scrape. Can be a single URL (str) or a list of URLs (list[str]).
+            output (str): The output where we put the downloaded content (usually a folder path).
+            spider_type (type[Spider] | None): The type of spider to use for downloading.
+                Defaults to None, which will use CrawlSitemapSpider.
+
+        Returns:
+            None
+        """
+    def download_sitemap_links(self, urls: list[str] | str, output: str = '.', spider_type: type[Spider] | None = None) -> None:
+        """Retrieves all links from the sitemap of the provided URLs.
+
+        This method uses a web scraper to scrape the sitemap of each URL and returns a list of all found links.
+
+        Args:
+            urls (list[str] | str): The URLs to scrape. Can be a single URL (str) or a list of URLs (list[str]).
+            output (str): The output where we put the downloaded content (usually a folder path).
+            spider_type (type[Spider] | None): The type of spider to use for downloading.
+                Defaults to None, which will use CrawlSitemapLinkSpider.
+
+        Returns:
+            None
+        """
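A usage sketch based on the signatures above; the URLs and output paths are illustrative:

from gllm_docproc.downloader.html.html_downloader import HTMLDownloader
from gllm_docproc.downloader.html.scraper.scraper.spiders import CrawlSitemapSpider

downloader = HTMLDownloader()

# Scrape a handful of pages directly.
downloader.download_from_multiple_urls(["https://example.com/docs"], output="./downloads")

# Crawl every page discovered through the site's sitemap.
downloader.download_sitemap("https://example.com", output="./downloads", spider_type=CrawlSitemapSpider)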
gllm_docproc/downloader/html/scraper/__init__.pyi: File without changes
gllm_docproc/downloader/html/scraper/scraper/__init__.pyi: File without changes
gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi
@@ -0,0 +1,9 @@
+from .crawl_pdf_spider import CrawlPDFSpider as CrawlPDFSpider
+from .crawl_sitemap_link_spider import CrawlSitemapLinkSpider as CrawlSitemapLinkSpider
+from .crawl_sitemap_spider import CrawlSitemapSpider as CrawlSitemapSpider
+from .crawl_spider import CrawlBaseSpider as CrawlBaseSpider
+from .playwright_scrape_spider import PlaywrightScrapeSpider as PlaywrightScrapeSpider
+from .scrape_spider import ScrapeSpider as ScrapeSpider
+from .zyte_scrape_spider import ZyteScrapeSpider as ZyteScrapeSpider
+
+__all__ = ['ScrapeSpider', 'PlaywrightScrapeSpider', 'ZyteScrapeSpider', 'CrawlBaseSpider', 'CrawlSitemapSpider', 'CrawlSitemapLinkSpider', 'CrawlPDFSpider']
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi
@@ -0,0 +1,27 @@
+from .crawl_spider import CrawlBaseSpider as CrawlBaseSpider
+from _typeshed import Incomplete
+from collections.abc import Generator
+from gllm_docproc.downloader.html.utils import clean_url as clean_url
+from scrapy.http import Request as Request, Response as Response
+
+class CrawlPDFSpider(CrawlBaseSpider):
+    """Scrapy CrawlSpider to crawl websites and save responses as PDFs using Playwright.
+
+    Attributes:
+        name (str): The name of the spider - 'crawl_pdf_spider'.
+        allowed_domains (list): The allowed domains for spider to crawl.
+        start_urls (list): The starting URLs for the spider to initiate crawling.
+        custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+        rules (tuple): The rules to be followed during the crawling process.
+    """
+    name: str
+    def add_playwright(self, request: Request, _: Response):
+        """Adds playwright meta information to the request."""
+    def start_requests(self) -> Generator[Incomplete]:
+        """Start Request.
+
+        Initiates requests for the specified start URLs using Scrapy requests with additional
+        meta information for playwright usage.
+        """
+    async def parse_web(self, response: Response):
+        """Parses the response obtained from the website, distinguishing between HTML content and other file types."""
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi
@@ -0,0 +1,29 @@
+from .crawl_sitemap_spider import CrawlSitemapSpider as CrawlSitemapSpider
+from scrapy.crawler import Crawler as Crawler
+from scrapy.http import Response as Response
+from typing import Any
+
+class CrawlSitemapLinkSpider(CrawlSitemapSpider):
+    """A Scrapy spider designed to scrape links from the sitemaps.
+
+    This spider uses the CrawlSitemapSpider base class to follow the sitemap links provided in the
+    robots.txt file of the website. It parses each page and extracts the URLs of the pages. If
+
+    Attributes:
+        name (str): The name of the spider - 'crawl_sitemap_link_spider'.
+        custom_settings (dict): Custom settings for the spider, including the log level and log file.
+    """
+    name: str
+    custom_settings: dict[str, Any]
+    @classmethod
+    def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+        """Creates a new instance of the spider.
+
+        Args:
+            crawler (scrapy.crawler.Crawler): The Scrapy crawler object.
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+
+        Returns:
+            CrawlSitemapLinkSpider: A new instance of the spider.
+        """
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi
@@ -0,0 +1,61 @@
+from _typeshed import Incomplete
+from scrapy.crawler import Crawler as Crawler
+from scrapy.http import Response as Response
+from scrapy.spiders import SitemapSpider
+from typing import Any
+
+class CrawlSitemapSpider(SitemapSpider):
+    """A Scrapy spider designed to scrape content from the sitemaps.
+
+    This spider uses the SitemapSpider base class to follow the sitemap links provided in the
+    robots.txt file of the website. It parses each page and extracts the URLs of the pages. If
+    an error occurs during parsing, it logs the error.
+
+    Attributes:
+        name (str): The name of the spider - 'crawl_sitemap_spider'.
+        sitemap_urls (list): The URLs of the sitemaps to start crawling from.
+        allowed_domains (list): The domains that this spider is allowed to crawl.
+        custom_settings (dict): Custom settings for the spider, including the log level and log file.
+    """
+    name: str
+    custom_settings: dict[str, Any]
+    @classmethod
+    def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+        """Creates a new CrawlSitemapSpider instance and sets the custom settings.
+
+        Args:
+            crawler (scrapy.crawler.Crawler): The crawler object.
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+
+        Returns:
+            CrawlSitemapSpider: The CrawlSitemapSpider instance.
+        """
+    sitemap_urls: Incomplete
+    allowed_domains: Incomplete
+    callback: Incomplete
+    removed_components: Incomplete
+    is_follow_page: Incomplete
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Initializes the CrawlSitemapSpider instance.
+
+        The method initializes the CrawlSitemapSpider instance and sets the sitemap_urls, allowed_domains,
+        and sitemap_from_robots attributes based on the provided arguments.
+
+        Args:
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+        """
+    def parse(self, response: Response):
+        """Parse the response.
+
+        This method parses the response obtained from the website, extracts the URLs of the pages,
+        and follows the links to the next pages.
+
+        This method attempts to yield a dictionary containing the URL of the response. If an error occurs,
+        it yields the URL and an error message.
+        It also extracts the URLs of the next pages from the response and follows them.
+
+        Args:
+            response (scrapy.http.Response): The response object to parse.
+        """
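A sketch of driving one of these spiders directly with Scrapy's CrawlerProcess. The keyword arguments passed to crawl() are assumptions based on the attributes listed in the stub (the exact kwargs accepted by __init__ are not documented here); the package's own HTMLDownloader/WebScraperExecutor is the intended entry point.

from scrapy.crawler import CrawlerProcess

from gllm_docproc.downloader.html.scraper.scraper.spiders import CrawlSitemapSpider

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
# 'sitemap_urls' and 'allowed_domains' mirror the attributes declared in the stub.
process.crawl(CrawlSitemapSpider, sitemap_urls=["https://example.com/sitemap.xml"], allowed_domains=["example.com"])
process.start()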
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi
@@ -0,0 +1,50 @@
+from _typeshed import Incomplete
+from collections.abc import Generator
+from scrapy.crawler import Crawler as Crawler
+from scrapy.http import Request as Request, Response as Response
+from scrapy.spiders import CrawlSpider
+from typing import Any
+
+class CrawlBaseSpider(CrawlSpider):
+    """A Scrapy CrawlSpider designed to crawl and extract content from website.
+
+    Attributes:
+        name (str): The name of the spider - 'crawl_spider'.
+        allowed_domains (list): The allowed domains for spider to crawl.
+        start_urls (list): The starting URLs for the spider to initiate crawling.
+        custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+        rules (tuple): The rules to be followed during the crawling process.
+    """
+    name: str
+    rules: Incomplete
+    custom_settings: dict[str, Any]
+    def add_playwright(self, request: Request, _: Response):
+        """Adds playwright meta information to the request."""
+    def start_requests(self) -> Generator[Incomplete]:
+        """Start Request.
+
+        Initiates requests for the specified start URLs using Scrapy requests with additional
+        meta information for playwright usage.
+        """
+    start_urls: Incomplete
+    allowed_domains: Incomplete
+    callback: Incomplete
+    removed_components: Incomplete
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Initialize the CrawlBaseSpider."""
+    async def parse_web(self, response: Response):
+        """Parses the response obtained from the website, distinguishing between HTML content and other file types."""
+    @classmethod
+    def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+        """Creates a new instance of the CrawlBaseSpider with custom settings.
+
+        Args:
+            crawler (Crawler): The crawler object.
+            *args (Any): The arguments to be passed to the spider.
+            **kwargs (Any): The keyword arguments to be passed to the spider.
+
+        Returns:
+            CrawlBaseSpider: The CrawlBaseSpider instance.
+        """
+    async def errback(self, failure: Any):
+        """Handles errors encountered during the crawling process and closes playwright pages."""
gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi
@@ -0,0 +1,22 @@
+from .scrape_spider import ScrapeSpider as ScrapeSpider
+from _typeshed import Incomplete
+from collections.abc import Generator
+from typing import Any
+
+class PlaywrightScrapeSpider(ScrapeSpider):
+    """A Scrapy spider designed to scrape content from website using playwright to render Javascript loaded page.
+
+    Attributes:
+        name (str): The name of the spider - 'playwright_scrape_spider'.
+        allowed_domains (list): The allowed domains for spider to crawl.
+        start_urls (list): The starting URLs for the spider to initiate crawling
+        custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+    """
+    name: str
+    custom_settings: dict[str, Any]
+    def start_requests(self) -> Generator[Incomplete]:
+        """Start Request.
+
+        Initiates requests for the specified start URLs using Scrapy requests with additional
+        meta information for playwright usage.
+        """