gllm-docproc-binary 0.1.8__cp311-cp311-macosx_14_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gllm-docproc-binary might be problematic.

Files changed (123)
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +29 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +16 -0
  11. gllm_docproc/data_generator/__init__.pyi +3 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +19 -0
  13. gllm_docproc/downloader/__init__.pyi +3 -0
  14. gllm_docproc/downloader/base_downloader.pyi +16 -0
  15. gllm_docproc/downloader/html/__init__.pyi +4 -0
  16. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  17. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  18. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  19. gllm_docproc/downloader/html/html_downloader.pyi +91 -0
  20. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  21. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  22. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  23. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  24. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +29 -0
  25. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  26. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +50 -0
  27. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  28. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  29. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  30. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +45 -0
  31. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  32. gllm_docproc/downloader/html/utils/web_utils.pyi +21 -0
  33. gllm_docproc/dpo_router/__init__.pyi +3 -0
  34. gllm_docproc/dpo_router/base_dpo_router.pyi +17 -0
  35. gllm_docproc/housekeeping/__init__.pyi +3 -0
  36. gllm_docproc/housekeeping/base_housekeeping.pyi +15 -0
  37. gllm_docproc/indexer/__init__.pyi +3 -0
  38. gllm_docproc/indexer/base_indexer.pyi +31 -0
  39. gllm_docproc/indexer/graph/__init__.pyi +3 -0
  40. gllm_docproc/indexer/knowledge_graph/__init__.pyi +4 -0
  41. gllm_docproc/loader/__init__.pyi +4 -0
  42. gllm_docproc/loader/audio/__init__.pyi +3 -0
  43. gllm_docproc/loader/base_loader.pyi +31 -0
  44. gllm_docproc/loader/docx/__init__.pyi +5 -0
  45. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  46. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  47. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  48. gllm_docproc/loader/exception/__init__.pyi +3 -0
  49. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  50. gllm_docproc/loader/html/__init__.pyi +5 -0
  51. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  52. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  53. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  54. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +52 -0
  55. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  56. gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
  57. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  58. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  59. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  60. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  61. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  62. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  63. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  64. gllm_docproc/loader/html/utils/flat_table_utils.pyi +29 -0
  65. gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
  66. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  67. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  68. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  69. gllm_docproc/loader/json/__init__.pyi +3 -0
  70. gllm_docproc/loader/json/json_elements_loader.pyi +33 -0
  71. gllm_docproc/loader/loader_utils.pyi +42 -0
  72. gllm_docproc/loader/pdf/__init__.pyi +13 -0
  73. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +40 -0
  74. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +45 -0
  75. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +50 -0
  76. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  77. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  78. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  79. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  80. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +38 -0
  81. gllm_docproc/loader/pdf/pymupdf_loader.pyi +43 -0
  82. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +44 -0
  83. gllm_docproc/loader/pdf/pymupdf_utils.pyi +34 -0
  84. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  85. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  86. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  87. gllm_docproc/loader/txt/__init__.pyi +3 -0
  88. gllm_docproc/loader/txt/txt_loader.pyi +26 -0
  89. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  90. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +37 -0
  91. gllm_docproc/model/__init__.pyi +4 -0
  92. gllm_docproc/model/element.pyi +37 -0
  93. gllm_docproc/model/element_metadata.pyi +35 -0
  94. gllm_docproc/parser/__init__.pyi +4 -0
  95. gllm_docproc/parser/base_parser.pyi +29 -0
  96. gllm_docproc/parser/document/__init__.pyi +6 -0
  97. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  98. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  99. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  100. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  101. gllm_docproc/parser/html/__init__.pyi +4 -0
  102. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  103. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  104. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  105. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  106. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  107. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  108. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  109. gllm_docproc/parser/table/__init__.pyi +3 -0
  110. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  111. gllm_docproc/request_handler/__init__.pyi +3 -0
  112. gllm_docproc/request_handler/base_request_handler.pyi +17 -0
  113. gllm_docproc/response_handler/__init__.pyi +3 -0
  114. gllm_docproc/response_handler/base_response_handler.pyi +39 -0
  115. gllm_docproc/utils/__init__.pyi +0 -0
  116. gllm_docproc/utils/file_utils.pyi +76 -0
  117. gllm_docproc/utils/html_constants.pyi +121 -0
  118. gllm_docproc.build/.gitignore +1 -0
  119. gllm_docproc.cpython-311-darwin.so +0 -0
  120. gllm_docproc.pyi +149 -0
  121. gllm_docproc_binary-0.1.8.dist-info/METADATA +110 -0
  122. gllm_docproc_binary-0.1.8.dist-info/RECORD +123 -0
  123. gllm_docproc_binary-0.1.8.dist-info/WHEEL +4 -0
File without changes
@@ -0,0 +1,3 @@
+ from .base_chunker import BaseChunker as BaseChunker
+
+ __all__ = ['BaseChunker']
@@ -0,0 +1,29 @@
+ import abc
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseChunker(ABC, metaclass=abc.ABCMeta):
+ """An abstract base class for chunkers.
+
+ This class segments or chunks elements based on contextual information.
+ Subclasses are expected to implement the 'chunk' method to handle chunking elements.
+
+ Methods:
+ chunk(elements, **kwargs): Abstract method to chunk a document.
+ """
+ @abstractmethod
+ def chunk(self, elements: Any, **kwargs: Any) -> Any:
+ """Chunk a document.
+
+ This method is abstract and must be implemented in subclasses.
+ It defines the process of chunking information from elements.
+
+ Args:
+ elements (Any): The information to be chunked, ideally formatted as List[Dict].
+ **kwargs (Any): Additional keyword arguments for customization.
+
+ Returns:
+ Any: The chunked information, ideally formatted as List[Dict]. Each dictionary within
+ the list is recommended to follow the structure of the 'Element' model,
+ to ensure consistency and ease of use across the Document Processing Orchestrator.
+ """
@@ -0,0 +1,3 @@
+ from .structured_element_chunker import StructuredElementChunker as StructuredElementChunker
+
+ __all__ = ['StructuredElementChunker']
@@ -0,0 +1,43 @@
+ from gllm_docproc.model.element import Element as Element
+ from gllm_docproc.model.element_metadata import AUDIO as AUDIO, PDF as PDF
+
+ def enrich_chunk(chunk: Element, elements: list[Element]) -> Element:
+ """Enrich the chunk with information from the original elements.
+
+ This is the default enrichment function for the structured element chunker.
+ The function enriches the chunk with information from the original elements.
+ The information to keep differs depending on the source type.
+
+ Args:
+ chunk (Element): The chunk to be enriched.
+ elements (list[Element]): The original elements that form the chunk.
+
+ Returns:
+ Element: The enriched chunk.
+ """
+ def enrich_pdf_chunk(chunk: Element, elements: list[Element]) -> Element:
+ """The default function for enriching the PDF chunk.
+
+ The function enriches the PDF chunk with the coordinates and page_number information
+ of the original elements.
+
+ Args:
+ chunk (Element): The PDF chunk to be enriched.
+ elements (list[Element]): The original elements that form the chunk.
+
+ Returns:
+ Element: The enriched PDF chunk.
+ """
+ def enrich_audio_chunk(chunk: Element, elements: list[Element]) -> Element:
+ """The default function for enriching the audio chunk.
+
+ The function enriches the audio chunk by replacing double newlines with a single newline.
+ Then, it adds the start_time, end_time, and lang_id information of the original elements.
+
+ Args:
+ chunk (Element): The audio chunk to be enriched.
+ elements (list[Element]): The original elements that form the chunk.
+
+ Returns:
+ Element: The enriched audio chunk.
+ """
@@ -0,0 +1,80 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.chunker.base_chunker import BaseChunker as BaseChunker
+ from gllm_docproc.chunker.structured_element.chunk_enricher import enrich_chunk as enrich_chunk
+ from gllm_docproc.chunker.table import TableChunker as TableChunker
+ from gllm_docproc.model.element import AUDIO as AUDIO, Element as Element, FOOTER as FOOTER, FOOTNOTE as FOOTNOTE, HEADER as HEADER, HEADING as HEADING, IMAGE as IMAGE, TABLE as TABLE, TITLE as TITLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT, VIDEO as VIDEO
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from typing import Any
+
+ NON_TEXT_STRUCTURE: Incomplete
+
+ def default_text_splitter() -> RecursiveCharacterTextSplitter:
+ '''Define the default text splitter for structured text chunking.
+
+ This function defines the default text splitter for structured text chunking.
+ The text splitter is defined with the following separators:
+
+ 1. "\\n#" : Split by Title or Heading
+ 2. "\\n\\n" : Split between Paragraph Elements
+ 3. "\\n" : Split between Title/Heading and Paragraph Elements
+ 4. ". " | "! " | "? " : Split by Sentence
+ 5. ", " : Split by Word
+ 6. " " : Split by Word
+ 7. "" : Split by Character
+
+ Returns:
+ RecursiveCharacterTextSplitter: A RecursiveCharacterTextSplitter object for structured text chunking.
+ '''
+
+ class StructuredElementChunker(BaseChunker):
+ """A class for chunking structured text.
+
+ This class defines the structure for chunking structured text into smaller chunks. It implements
+ the 'chunk' method to handle structured text chunking.
+
+ Methods:
+ chunk(elements, **kwargs): Chunk the structured text into smaller chunks.
+ """
+ default_text_splitter: Incomplete
+ default_table_chunker: Incomplete
+ text_splitter: Incomplete
+ table_chunker: Incomplete
+ is_parent_structure_info_included: Incomplete
+ def __init__(self, text_splitter: RecursiveCharacterTextSplitter = ..., table_chunker: BaseChunker = ..., is_parent_structure_info_included: bool = True) -> None:
+ """Initialize the structured text chunker.
+
+ Args:
+ text_splitter (RecursiveCharacterTextSplitter): A RecursiveCharacterTextSplitter object
+ for structured text chunking.
+ table_chunker (BaseChunker): A BaseChunker object for table chunking.
+ is_parent_structure_info_included (bool): Whether to include parent structure
+ information in the chunk.
+ """
+ def chunk(self, elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+ '''Chunk the structured text into smaller chunks.
+
+ This method defines the process of chunking structured text into smaller chunks. It uses the
+ RecursiveCharacterTextSplitter to split the text into chunks based on the defined separators.
+
+ The method will split the text recursively based on the defined separators, or by default:
+ 1. "\\n#" : Split by Title or Heading
+ 2. "\\n\\n" : Split between Paragraph Elements
+ 3. "\\n" : Split between Title/Heading and Paragraph Elements
+ 4. ". " | "! " | "? " : Split by Sentence
+ 5. ", " : Split by Word
+ 6. " " : Split by Word
+ 7. "" : Split by Character
+
+ Kwargs:
+ excluded_structures (list[str]): A list of structures to be excluded from the chunking process.
+ enrich_chunk (Callable[[Element, list[Element]], Element]): A function to enrich the chunked element.
+ file_id (str | None): The file id of the chunked elements. Defaults to None.
+
+ Args:
+ elements (list[dict[str, Any]]): A list of dictionaries containing text and structure.
+ **kwargs (Any): Additional keyword arguments for the chunker.
+
+ Returns:
+ list[dict[str, Any]]: A list of dictionaries containing chunked text and metadata.
+ '''
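
A usage sketch based on the stub above. The element dictionaries follow the "text and structure" shape the docstring describes, but the exact keys are an assumption; the structure constants come from gllm_docproc.model.element as re-exported here and are assumed to be plain strings.

from gllm_docproc.chunker.structured_element import StructuredElementChunker
from gllm_docproc.model.element import FOOTER, TITLE, UNCATEGORIZED_TEXT

chunker = StructuredElementChunker()  # default text splitter and table chunker

elements = [
    {"text": "Quarterly Report", "structure": TITLE},                          # assumed element shape
    {"text": "Revenue grew modestly this quarter.", "structure": UNCATEGORIZED_TEXT},
]

chunks = chunker.chunk(elements, excluded_structures=[FOOTER], file_id="doc-001")
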
@@ -0,0 +1,3 @@
+ from .table_chunker import CSV as CSV, HTML as HTML, MARKDOWN as MARKDOWN, TableChunker as TableChunker
+
+ __all__ = ['CSV', 'HTML', 'MARKDOWN', 'TableChunker']
@@ -0,0 +1,45 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.chunker.base_chunker import BaseChunker as BaseChunker
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+ from typing import Any
+
+ MARKDOWN: str
+ CSV: str
+ HTML: str
+
+ class TableChunker(BaseChunker):
+ """Table Chunker class.
+
+ This class is used to chunk a table element into smaller chunks. It implements the 'chunk' method
+ to handle chunking the table element based on the chunk size and overlap. The table is converted
+ into the expected format (markdown, csv, or html).
+
+ Methods:
+ chunk(elements, **kwargs): Chunk a table element into smaller chunks.
+ """
+ chunk_size: Incomplete
+ chunk_overlap: Incomplete
+ table_format: Incomplete
+ table_splitter: Incomplete
+ def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 0, table_format: str = ...) -> None:
+ """Initializes the TableChunker class.
+
+ Args:
+ chunk_size (int): The size of each chunk.
+ chunk_overlap (int): The overlap between each chunk.
+ table_format (str): The format of the table (markdown, csv, or html).
+ """
+ def chunk(self, elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+ """Chunk a table element into smaller chunks.
+
+ This method chunks a table element into smaller chunks based on the chunk size and overlap.
+ It converts the table into the expected format (markdown, csv, or html) and then chunks the table.
+
+ Args:
+ elements (list[dict[str, Any]]): The table element to be chunked.
+ **kwargs (Any): Additional keyword arguments for customization.
+
+ Returns:
+ list[dict[str, Any]]: The list of smaller chunks.
+ """
@@ -0,0 +1,3 @@
+ from .base_converter import BaseConverter as BaseConverter
+
+ __all__ = ['BaseConverter']
@@ -0,0 +1,16 @@
+ import abc
+ from abc import ABC, abstractmethod
+
+ class BaseConverter(ABC, metaclass=abc.ABCMeta):
+ """Base class for document converters."""
+ @abstractmethod
+ def convert(self, path_input: str, path_output: str) -> None:
+ """Converts a document.
+
+ Args:
+ path_input (str): The path of the document to be converted.
+ path_output (str): The path of the converted document.
+
+ Returns:
+ None
+ """
@@ -0,0 +1,3 @@
+ from gllm_docproc.data_generator.base_data_generator import BaseDataGenerator as BaseDataGenerator
+
+ __all__ = ['BaseDataGenerator']
@@ -0,0 +1,19 @@
+ import abc
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseDataGenerator(ABC, metaclass=abc.ABCMeta):
+ """Base class for data generators."""
+ @abstractmethod
+ def generate(self, elements: Any, **kwargs: Any) -> Any:
+ """Generates data for a list of chunks.
+
+ Args:
+ elements (Any): The elements to be used for generating data / metadata, ideally formatted as List[Dict].
+ **kwargs (Any): Additional keyword arguments for customization.
+
+ Returns:
+ Any: The generated data, ideally formatted as List[Dict]. Each dictionary within
+ the list is recommended to follow the structure of the 'Element' model,
+ to ensure consistency and ease of use across the Document Processing Orchestrator.
+ """
@@ -0,0 +1,3 @@
+ from .base_downloader import BaseDownloader as BaseDownloader
+
+ __all__ = ['BaseDownloader']
@@ -0,0 +1,16 @@
+ import abc
+ from abc import ABC, abstractmethod
+
+ class BaseDownloader(ABC, metaclass=abc.ABCMeta):
+ """Base class for document downloaders."""
+ @abstractmethod
+ def download(self, source: str, output: str) -> None:
+ """Downloads a document.
+
+ Args:
+ source (str): The source of the document (could be JSON-formatted or a URL).
+ output (str): The output where we put the downloaded content (usually a folder path).
+
+ Returns:
+ None
+ """
@@ -0,0 +1,4 @@
+ from . import utils as utils
+ from .html_downloader import HTMLDownloader as HTMLDownloader
+
+ __all__ = ['HTMLDownloader', 'utils']
@@ -0,0 +1,4 @@
+ from .item_scrape_failed_exception import ItemScrapeFailedException as ItemScrapeFailedException
+ from .zyte_api_key_not_provided_exception import ZyteApiKeyNotProvidedException as ZyteApiKeyNotProvidedException
+
+ __all__ = ['ItemScrapeFailedException', 'ZyteApiKeyNotProvidedException']
@@ -0,0 +1,16 @@
+ from _typeshed import Incomplete
+
+ class ItemScrapeFailedException(Exception):
+ """Exception raised when an item fails to be scraped.
+
+ Attributes:
+ message (str): Optional. The error message indicating the reason for the item scrape failure.
+ """
+ message: Incomplete
+ def __init__(self, message: str = 'Item failed to be scraped.') -> None:
+ '''Initialize the ItemScrapeFailedException.
+
+ Args:
+ message (str): Optional. The error message indicating the reason for the item scrape failure.
+ Defaults to "Item failed to be scraped."
+ '''
@@ -0,0 +1,15 @@
+ from _typeshed import Incomplete
+
+ class ZyteApiKeyNotProvidedException(Exception):
+ """Custom exception raised when the Zyte API key is not provided.
+
+ Attributes:
+ message (str): Optional. The error message associated with the exception.
+ """
+ message: Incomplete
+ def __init__(self, message: str = 'Zyte API Key not provided.') -> None:
+ '''Initialize the ZyteApiKeyNotProvidedException.
+
+ Args:
+ message (str, optional): The error message associated with the exception. Defaults to "Zyte API Key not provided."
+ '''
@@ -0,0 +1,91 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.downloader.base_downloader import BaseDownloader as BaseDownloader
+ from gllm_docproc.downloader.html.exception import ItemScrapeFailedException as ItemScrapeFailedException
+ from gllm_docproc.downloader.html.scraper.scraper.spiders import CrawlBaseSpider as CrawlBaseSpider, CrawlSitemapLinkSpider as CrawlSitemapLinkSpider, CrawlSitemapSpider as CrawlSitemapSpider
+ from gllm_docproc.downloader.html.scraper.web_scraper_executor import WebScraperExecutor as WebScraperExecutor
+ from gllm_docproc.downloader.html.utils import clean_url as clean_url, is_valid_url as is_valid_url
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, HTML as HTML
+ from gllm_docproc.utils.file_utils import create_full_path as create_full_path, save_file as save_file, save_to_json as save_to_json
+ from scrapy import Spider as Spider
+ from typing import Any
+
+ class HTMLDownloader(BaseDownloader):
+ """A downloader class for downloading web content.
+
+ This class inherits from the BaseDownloader class and provides methods to download web content.
+
+ Args:
+ **kwargs (Any): Additional keyword arguments.
+ """
+ URL_INDEX: int
+ CONTENT_INDEX: int
+ kwargs: Incomplete
+ def __init__(self, **kwargs: Any) -> None:
+ """Initializes the HTMLDownloader class.
+
+ Args:
+ **kwargs (Any): Additional keyword arguments.
+ """
+ def download(self, source: str, output: str) -> None:
+ """Downloads web content.
+
+ Args:
+ source (str): The URL of the web content to download.
+ output (str): The output where we put the downloaded content (usually a folder path).
+
+ Returns:
+ None
+ """
+ def download_from_multiple_urls(self, urls: list[str], output: str = '.', **kwargs: Any) -> None:
+ """Downloads web content from multiple URLs.
+
+ Args:
+ urls (list[str]): The URLs to download.
+ output (str): The output where we put the downloaded content (usually a folder path).
+ **kwargs (Any): Additional keyword arguments.
+
+ Returns:
+ None
+ """
+ def download_crawl(self, urls: list[str] | str, output: str = '.', spider_type: type[Spider] | None = None) -> None:
+ """Downloads web content from the provided URLs.
+
+ This method uses a web scraper to crawl the provided URLs and saves the downloaded content to a file.
+
+ Args:
+ urls (list[str] | str): The URLs to crawl. Can be a single URL (str) or a list of URLs (list[str]).
+ output (str): The output where we put the downloaded content (usually a folder path).
+ spider_type (type[Spider] | None): The type of spider to use for downloading.
+ Defaults to None, which will use CrawlBaseSpider.
+
+ Returns:
+ None
+ """
+ def download_sitemap(self, urls: list[str] | str, output: str = '.', spider_type: type[Spider] | None = None) -> None:
+ """Downloads web content from the sitemap of the provided URLs.
+
+ This method uses a web scraper to scrape the sitemap of each URL and saves the downloaded content to a file.
+
+ Args:
+ urls (list[str] | str): The URLs to scrape. Can be a single URL (str) or a list of URLs (list[str]).
+ output (str): The output where we put the downloaded content (usually a folder path).
+ spider_type (type[Spider] | None): The type of spider to use for downloading.
+ Defaults to None, which will use CrawlSitemapSpider.
+
+ Returns:
+ None
+ """
+ def download_sitemap_links(self, urls: list[str] | str, output: str = '.', spider_type: type[Spider] | None = None) -> None:
+ """Retrieves all links from the sitemap of the provided URLs.
+
+ This method uses a web scraper to scrape the sitemap of each URL and returns a list of all found links.
+
+ Args:
+ urls (list[str] | str): The URLs to scrape. Can be a single URL (str) or a list of URLs (list[str]).
+ output (str): The output where we put the downloaded content (usually a folder path).
+ spider_type (type[Spider] | None): The type of spider to use for downloading.
+ Defaults to None, which will use CrawlSitemapLinkSpider.
+
+ Returns:
+ None
+ """
File without changes
@@ -0,0 +1,9 @@
+ from .crawl_pdf_spider import CrawlPDFSpider as CrawlPDFSpider
+ from .crawl_sitemap_link_spider import CrawlSitemapLinkSpider as CrawlSitemapLinkSpider
+ from .crawl_sitemap_spider import CrawlSitemapSpider as CrawlSitemapSpider
+ from .crawl_spider import CrawlBaseSpider as CrawlBaseSpider
+ from .playwright_scrape_spider import PlaywrightScrapeSpider as PlaywrightScrapeSpider
+ from .scrape_spider import ScrapeSpider as ScrapeSpider
+ from .zyte_scrape_spider import ZyteScrapeSpider as ZyteScrapeSpider
+
+ __all__ = ['ScrapeSpider', 'PlaywrightScrapeSpider', 'ZyteScrapeSpider', 'CrawlBaseSpider', 'CrawlSitemapSpider', 'CrawlSitemapLinkSpider', 'CrawlPDFSpider']
@@ -0,0 +1,27 @@
+ from .crawl_spider import CrawlBaseSpider as CrawlBaseSpider
+ from _typeshed import Incomplete
+ from collections.abc import Generator
+ from gllm_docproc.downloader.html.utils import clean_url as clean_url
+ from scrapy.http import Request as Request, Response as Response
+
+ class CrawlPDFSpider(CrawlBaseSpider):
+ """Scrapy CrawlSpider to crawl websites and save responses as PDFs using Playwright.
+
+ Attributes:
+ name (str): The name of the spider - 'crawl_pdf_spider'.
+ allowed_domains (list): The allowed domains for the spider to crawl.
+ start_urls (list): The starting URLs for the spider to initiate crawling.
+ custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+ rules (tuple): The rules to be followed during the crawling process.
+ """
+ name: str
+ def add_playwright(self, request: Request, _: Response):
+ """Adds playwright meta information to the request."""
+ def start_requests(self) -> Generator[Incomplete]:
+ """Start Request.
+
+ Initiates requests for the specified start URLs using Scrapy requests with additional
+ meta information for playwright usage.
+ """
+ async def parse_web(self, response: Response):
+ """Parses the response obtained from the website, distinguishing between HTML content and other file types."""
@@ -0,0 +1,29 @@
+ from .crawl_sitemap_spider import CrawlSitemapSpider as CrawlSitemapSpider
+ from scrapy.crawler import Crawler as Crawler
+ from scrapy.http import Response as Response
+ from typing import Any
+
+ class CrawlSitemapLinkSpider(CrawlSitemapSpider):
+ """A Scrapy spider designed to scrape links from the sitemaps.
+
+ This spider uses the CrawlSitemapSpider base class to follow the sitemap links provided in the
+ robots.txt file of the website. It parses each page and extracts the URLs of the pages. If an
+ error occurs during parsing, it logs the error.
+
+ Attributes:
+ name (str): The name of the spider - 'crawl_sitemap_link_spider'.
+ custom_settings (dict): Custom settings for the spider, including the log level and log file.
+ """
+ name: str
+ custom_settings: dict[str, Any]
+ @classmethod
+ def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+ """Creates a new instance of the spider.
+
+ Args:
+ crawler (scrapy.crawler.Crawler): The Scrapy crawler object.
+ *args: Variable length argument list.
+ **kwargs: Arbitrary keyword arguments.
+
+ Returns:
+ CrawlSitemapLinkSpider: A new instance of the spider.
+ """
@@ -0,0 +1,61 @@
+ from _typeshed import Incomplete
+ from scrapy.crawler import Crawler as Crawler
+ from scrapy.http import Response as Response
+ from scrapy.spiders import SitemapSpider
+ from typing import Any
+
+ class CrawlSitemapSpider(SitemapSpider):
+ """A Scrapy spider designed to scrape content from the sitemaps.
+
+ This spider uses the SitemapSpider base class to follow the sitemap links provided in the
+ robots.txt file of the website. It parses each page and extracts the URLs of the pages. If
+ an error occurs during parsing, it logs the error.
+
+ Attributes:
+ name (str): The name of the spider - 'crawl_sitemap_spider'.
+ sitemap_urls (list): The URLs of the sitemaps to start crawling from.
+ allowed_domains (list): The domains that this spider is allowed to crawl.
+ custom_settings (dict): Custom settings for the spider, including the log level and log file.
+ """
+ name: str
+ custom_settings: dict[str, Any]
+ @classmethod
+ def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+ """Creates a new CrawlSitemapSpider instance and sets the custom settings.
+
+ Args:
+ crawler (scrapy.crawler.Crawler): The crawler object.
+ *args: Variable length argument list.
+ **kwargs: Arbitrary keyword arguments.
+
+ Returns:
+ CrawlSitemapSpider: The CrawlSitemapSpider instance.
+ """
+ sitemap_urls: Incomplete
+ allowed_domains: Incomplete
+ callback: Incomplete
+ removed_components: Incomplete
+ is_follow_page: Incomplete
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
+ """Initializes the CrawlSitemapSpider instance.
+
+ The method initializes the CrawlSitemapSpider instance and sets the sitemap_urls, allowed_domains,
+ and sitemap_from_robots attributes based on the provided arguments.
+
+ Args:
+ *args: Variable length argument list.
+ **kwargs: Arbitrary keyword arguments.
+ """
+ def parse(self, response: Response):
+ """Parse the response.
+
+ This method parses the response obtained from the website, extracts the URLs of the pages,
+ and follows the links to the next pages.
+
+ This method attempts to yield a dictionary containing the URL of the response. If an error occurs,
+ it yields the URL and an error message.
+ It also extracts the URLs of the next pages from the response and follows them.
+
+ Args:
+ response (scrapy.http.Response): The response object to parse.
+ """
@@ -0,0 +1,50 @@
+ from _typeshed import Incomplete
+ from collections.abc import Generator
+ from scrapy.crawler import Crawler as Crawler
+ from scrapy.http import Request as Request, Response as Response
+ from scrapy.spiders import CrawlSpider
+ from typing import Any
+
+ class CrawlBaseSpider(CrawlSpider):
+ """A Scrapy CrawlSpider designed to crawl and extract content from websites.
+
+ Attributes:
+ name (str): The name of the spider - 'crawl_spider'.
+ allowed_domains (list): The allowed domains for the spider to crawl.
+ start_urls (list): The starting URLs for the spider to initiate crawling.
+ custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+ rules (tuple): The rules to be followed during the crawling process.
+ """
+ name: str
+ rules: Incomplete
+ custom_settings: dict[str, Any]
+ def add_playwright(self, request: Request, _: Response):
+ """Adds playwright meta information to the request."""
+ def start_requests(self) -> Generator[Incomplete]:
+ """Start Request.
+
+ Initiates requests for the specified start URLs using Scrapy requests with additional
+ meta information for playwright usage.
+ """
+ start_urls: Incomplete
+ allowed_domains: Incomplete
+ callback: Incomplete
+ removed_components: Incomplete
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
+ """Initialize the CrawlBaseSpider."""
+ async def parse_web(self, response: Response):
+ """Parses the response obtained from the website, distinguishing between HTML content and other file types."""
+ @classmethod
+ def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+ """Creates a new instance of the CrawlBaseSpider with custom settings.
+
+ Args:
+ crawler (Crawler): The crawler object.
+ *args (Any): The arguments to be passed to the spider.
+ **kwargs (Any): The keyword arguments to be passed to the spider.
+
+ Returns:
+ CrawlBaseSpider: The CrawlBaseSpider instance.
+ """
+ async def errback(self, failure: Any):
+ """Handles errors encountered during the crawling process and closes playwright pages."""
@@ -0,0 +1,22 @@
+ from .scrape_spider import ScrapeSpider as ScrapeSpider
+ from _typeshed import Incomplete
+ from collections.abc import Generator
+ from typing import Any
+
+ class PlaywrightScrapeSpider(ScrapeSpider):
+ """A Scrapy spider designed to scrape content from websites, using Playwright to render JavaScript-loaded pages.
+
+ Attributes:
+ name (str): The name of the spider - 'playwright_scrape_spider'.
+ allowed_domains (list): The allowed domains for the spider to crawl.
+ start_urls (list): The starting URLs for the spider to initiate crawling.
+ custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+ """
+ name: str
+ custom_settings: dict[str, Any]
+ def start_requests(self) -> Generator[Incomplete]:
+ """Start Request.
+
+ Initiates requests for the specified start URLs using Scrapy requests with additional
+ meta information for playwright usage.
+ """