gllm-docproc-binary 0.7.21__cp311-cp311-macosx_13_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +28 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +15 -0
  11. gllm_docproc/data_generator/__init__.pyi +5 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +18 -0
  13. gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
  14. gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
  15. gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
  16. gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
  17. gllm_docproc/downloader/__init__.pyi +5 -0
  18. gllm_docproc/downloader/base_downloader.pyi +19 -0
  19. gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
  20. gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
  21. gllm_docproc/downloader/html/__init__.pyi +6 -0
  22. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  23. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  24. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  25. gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
  26. gllm_docproc/downloader/html/html_downloader.pyi +114 -0
  27. gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
  28. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  29. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  30. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  31. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  32. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
  33. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  34. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
  35. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  36. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  37. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  38. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
  39. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  40. gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
  41. gllm_docproc/dpo_router/__init__.pyi +5 -0
  42. gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
  43. gllm_docproc/dpo_router/loader_router.pyi +52 -0
  44. gllm_docproc/dpo_router/parser_router.pyi +42 -0
  45. gllm_docproc/housekeeping/__init__.pyi +3 -0
  46. gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
  47. gllm_docproc/indexer/__init__.pyi +3 -0
  48. gllm_docproc/indexer/base_indexer.pyi +30 -0
  49. gllm_docproc/indexer/graph/__init__.pyi +4 -0
  50. gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
  51. gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
  52. gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +63 -0
  53. gllm_docproc/loader/__init__.pyi +4 -0
  54. gllm_docproc/loader/audio/__init__.pyi +3 -0
  55. gllm_docproc/loader/audio/audio_loader.pyi +45 -0
  56. gllm_docproc/loader/base_loader.pyi +30 -0
  57. gllm_docproc/loader/csv/__init__.pyi +3 -0
  58. gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
  59. gllm_docproc/loader/docx/__init__.pyi +5 -0
  60. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  61. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  62. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  63. gllm_docproc/loader/exception/__init__.pyi +4 -0
  64. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  65. gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
  66. gllm_docproc/loader/html/__init__.pyi +5 -0
  67. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  68. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  69. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  70. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +65 -0
  71. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  72. gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
  73. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  74. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  75. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  76. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  77. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  78. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  79. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  80. gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
  81. gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
  82. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  83. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  84. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  85. gllm_docproc/loader/image/__init__.pyi +3 -0
  86. gllm_docproc/loader/image/image_loader.pyi +54 -0
  87. gllm_docproc/loader/json/__init__.pyi +3 -0
  88. gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
  89. gllm_docproc/loader/loader_utils.pyi +43 -0
  90. gllm_docproc/loader/pdf/__init__.pyi +14 -0
  91. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
  92. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
  93. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
  94. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  95. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  96. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  97. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  98. gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
  99. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
  100. gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
  101. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
  102. gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
  103. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  104. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  105. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  106. gllm_docproc/loader/pptx/__init__.pyi +3 -0
  107. gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
  108. gllm_docproc/loader/txt/__init__.pyi +3 -0
  109. gllm_docproc/loader/txt/txt_loader.pyi +55 -0
  110. gllm_docproc/loader/video/__init__.pyi +3 -0
  111. gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
  112. gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
  113. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  114. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
  115. gllm_docproc/model/__init__.pyi +7 -0
  116. gllm_docproc/model/element.pyi +38 -0
  117. gllm_docproc/model/element_metadata.pyi +35 -0
  118. gllm_docproc/model/loader_type.pyi +20 -0
  119. gllm_docproc/model/media.pyi +51 -0
  120. gllm_docproc/model/parser_type.pyi +19 -0
  121. gllm_docproc/parser/__init__.pyi +4 -0
  122. gllm_docproc/parser/base_parser.pyi +28 -0
  123. gllm_docproc/parser/document/__init__.pyi +7 -0
  124. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  125. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  126. gllm_docproc/parser/document/pptx_parser.pyi +34 -0
  127. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  128. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  129. gllm_docproc/parser/html/__init__.pyi +4 -0
  130. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  131. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  132. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  133. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  134. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  135. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  136. gllm_docproc/parser/image/__init__.pyi +4 -0
  137. gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
  138. gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
  139. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  140. gllm_docproc/parser/table/__init__.pyi +3 -0
  141. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  142. gllm_docproc/request_handler/__init__.pyi +3 -0
  143. gllm_docproc/request_handler/base_request_handler.pyi +16 -0
  144. gllm_docproc/response_handler/__init__.pyi +3 -0
  145. gllm_docproc/response_handler/base_response_handler.pyi +38 -0
  146. gllm_docproc/utils/__init__.pyi +3 -0
  147. gllm_docproc/utils/async_utils.pyi +22 -0
  148. gllm_docproc/utils/file_utils.pyi +76 -0
  149. gllm_docproc/utils/html_constants.pyi +122 -0
  150. gllm_docproc/validator/__init__.pyi +6 -0
  151. gllm_docproc/validator/base_validator.pyi +34 -0
  152. gllm_docproc/validator/character_count_validator.pyi +26 -0
  153. gllm_docproc/validator/file_size_validator.pyi +20 -0
  154. gllm_docproc/validator/model/__init__.pyi +4 -0
  155. gllm_docproc/validator/model/validator_input.pyi +50 -0
  156. gllm_docproc/validator/model/validator_result.pyi +19 -0
  157. gllm_docproc/validator/page_count_validator.pyi +23 -0
  158. gllm_docproc/validator/pipeline_validator.pyi +40 -0
  159. gllm_docproc.build/.gitignore +1 -0
  160. gllm_docproc.cpython-311-darwin.so +0 -0
  161. gllm_docproc.pyi +213 -0
  162. gllm_docproc_binary-0.7.21.dist-info/METADATA +216 -0
  163. gllm_docproc_binary-0.7.21.dist-info/RECORD +165 -0
  164. gllm_docproc_binary-0.7.21.dist-info/WHEEL +5 -0
  165. gllm_docproc_binary-0.7.21.dist-info/top_level.txt +1 -0
gllm_docproc/downloader/html/html_downloader.pyi
@@ -0,0 +1,114 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.downloader.base_downloader import BaseDownloader as BaseDownloader
+ from gllm_docproc.downloader.html.exception import ItemScrapeFailedException as ItemScrapeFailedException
+ from gllm_docproc.downloader.html.scraper.scraper.spiders import CrawlBaseSpider as CrawlBaseSpider, CrawlSitemapLinkSpider as CrawlSitemapLinkSpider, CrawlSitemapSpider as CrawlSitemapSpider
+ from gllm_docproc.downloader.html.scraper.web_scraper_executor import WebScraperExecutor as WebScraperExecutor
+ from gllm_docproc.downloader.html.utils import generate_filename_from_url as generate_filename_from_url, is_valid_url as is_valid_url
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, HTML as HTML
+ from gllm_docproc.utils.file_utils import create_full_path as create_full_path, save_file as save_file, save_to_json as save_to_json
+ from scrapy import Spider as Spider
+ from typing import Any
+
+ class HTMLDownloader(BaseDownloader):
+     """A downloader class for downloading web content.
+
+     This class inherits from the BaseDownloader class and provides methods to download web content.
+
+     Args:
+         **kwargs (Any): Additional keyword arguments.
+     """
+     URL_INDEX: int
+     CONTENT_INDEX: int
+     kwargs: Incomplete
+     logger: Incomplete
+     def __init__(self, **kwargs: Any) -> None:
+         """Initializes the HTMLDownloader class.
+
+         Args:
+             **kwargs (Any): Additional keyword arguments.
+         """
+     def download(self, source: str, output: str, **kwargs: Any) -> list[str]:
+         """Downloads and saves web content from a single URL.
+
+         This method uses a web scraper to fetch the HTML content from the specified URL
+         and saves it as a JSON file in the given output directory.
+
+         Args:
+             source (str): The URL of the web content to download.
+             output (str): The directory path where the downloaded content (in JSON format) will be saved.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             include_markdown (bool, optional): Whether to include markdown conversion in the output. Defaults to False.
+
+         Returns:
+             list[str]: The list of full file paths of the created JSON files.
+
+         Raises:
+             ItemScrapeFailedException: If the provided source is not a valid URL, or if the web content
+                 cannot be successfully scraped or downloaded from the URL.
+         """
+     def download_from_multiple_urls(self, urls: list[str], output: str = '.', **kwargs: Any) -> list[str]:
+         """Downloads web content from multiple URLs.
+
+         Args:
+             urls (list[str]): The URLs to download.
+             output (str): The output directory where the downloaded content will be saved.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             include_markdown (bool, optional): Whether to include markdown conversion in the output. Defaults to False.
+
+         Returns:
+             list[str]: The list of full file paths of the created JSON files.
+         """
+     def download_crawl(self, urls: list[str] | str, output: str = '.', spider_type: type[Spider] | None = None, **kwargs: Any) -> list[str]:
+         """Downloads web content from the provided URLs.
+
+         This method uses a web scraper to crawl the provided URLs and saves the downloaded content to a file.
+
+         Args:
+             urls (list[str] | str): The URLs to crawl. Can be a single URL (str) or a list of URLs (list[str]).
+             output (str): The output directory where the downloaded content will be saved.
+             spider_type (type[Spider] | None): The type of spider to use for downloading.
+                 Defaults to None, which will use CrawlBaseSpider.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             include_markdown (bool, optional): Whether to include markdown conversion in the output. Defaults to False.
+
+         Returns:
+             list[str]: The list of full file paths of the created JSON files.
+         """
+     def download_sitemap(self, urls: list[str] | str, output: str = '.', spider_type: type[Spider] | None = None, **kwargs: Any) -> list[str]:
+         """Downloads web content from the sitemap of the provided URLs.
+
+         This method uses a web scraper to scrape the sitemap of each URL and saves the downloaded content to a file.
+
+         Args:
+             urls (list[str] | str): The URLs to scrape. Can be a single URL (str) or a list of URLs (list[str]).
+             output (str): The output directory where the downloaded content will be saved.
+             spider_type (type[Spider] | None): The type of spider to use for downloading.
+                 Defaults to None, which will use CrawlSitemapSpider.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             include_markdown (bool, optional): Whether to include markdown conversion in the output. Defaults to False.
+
+         Returns:
+             list[str]: The list of full file paths of the created JSON files.
+         """
+     def download_sitemap_links(self, urls: list[str] | str, output: str = '.', spider_type: type[Spider] | None = None) -> None:
+         """Retrieves all links from the sitemap of the provided URLs.
+
+         This method uses a web scraper to scrape the sitemap of each URL and returns a list of all found links.
+
+         Args:
+             urls (list[str] | str): The URLs to scrape. Can be a single URL (str) or a list of URLs (list[str]).
+             output (str): The output directory where the downloaded content will be saved.
+             spider_type (type[Spider] | None): The type of spider to use for downloading.
+                 Defaults to None, which will use CrawlSitemapLinkSpider.
+
+         Returns:
+             None
+         """
gllm_docproc/downloader/html/requests_downloader.pyi
@@ -0,0 +1,46 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.downloader import BaseDownloader as BaseDownloader
+ from gllm_docproc.downloader.html.utils import generate_filename_from_url as generate_filename_from_url
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, HTML as HTML
+ from gllm_docproc.utils.file_utils import save_to_json as save_to_json
+ from typing import Any
+
+ class RequestsDownloader(BaseDownloader):
+     """A class for downloading HTML content from web pages and returning JSON format.
+
+     This class downloads HTML content from a given URL and returns it in a specific
+     dictionary format containing ElementMetadata and the decoded HTML content.
+     """
+     max_retries: Incomplete
+     timeout: Incomplete
+     proxies: Incomplete
+     session: Incomplete
+     logger: Incomplete
+     def __init__(self, max_retries: int = 3, timeout: float | None = None, proxies: dict[str, str] | None = None) -> None:
+         '''Initialize the RequestsDownloader.
+
+         Args:
+             max_retries (int, optional): The maximum number of retries for failed downloads. Defaults to 3.
+             timeout (float | None, optional): The timeout for the download request in seconds. Defaults to None.
+             proxies (dict[str, str] | None, optional): Dictionary of proxy servers to use. Defaults to None.
+                 Example: {"http": "http://proxy.example.com:8080", "https": "https://proxy.example.com:8080"}
+         '''
+     def download(self, source: str, output: str, **kwargs: Any) -> list[str]:
+         """Download HTML content from the source URL and save it as a JSON file.
+
+         Args:
+             source (str): The URL to download HTML content from.
+             output (str): The output directory where the downloaded JSON file will be saved.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             timeout (float | None, optional): Override the timeout for this specific request.
+             max_retries (int, optional): Override the max_retries for this specific request.
+             proxies (dict[str, str] | None, optional): Override the proxies for this specific request.
+
+         Returns:
+             list[str]: A list containing the file path of the saved JSON file.
+
+         Raises:
+             requests.RequestException: If the download fails after all retries.
+         """
gllm_docproc/downloader/html/scraper/__init__.pyi, gllm_docproc/downloader/html/scraper/scraper/__init__.pyi
File without changes
gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi
@@ -0,0 +1,9 @@
+ from .crawl_pdf_spider import CrawlPDFSpider as CrawlPDFSpider
+ from .crawl_sitemap_link_spider import CrawlSitemapLinkSpider as CrawlSitemapLinkSpider
+ from .crawl_sitemap_spider import CrawlSitemapSpider as CrawlSitemapSpider
+ from .crawl_spider import CrawlBaseSpider as CrawlBaseSpider
+ from .playwright_scrape_spider import PlaywrightScrapeSpider as PlaywrightScrapeSpider
+ from .scrape_spider import ScrapeSpider as ScrapeSpider
+ from .zyte_scrape_spider import ZyteScrapeSpider as ZyteScrapeSpider
+
+ __all__ = ['ScrapeSpider', 'PlaywrightScrapeSpider', 'ZyteScrapeSpider', 'CrawlBaseSpider', 'CrawlSitemapSpider', 'CrawlSitemapLinkSpider', 'CrawlPDFSpider']
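These exported spider classes are what HTMLDownloader accepts through its spider_type parameter. A hedged sketch of selecting one explicitly (URL and output path are placeholders):

from gllm_docproc.downloader.html.html_downloader import HTMLDownloader
from gllm_docproc.downloader.html.scraper.scraper.spiders import CrawlSitemapSpider

downloader = HTMLDownloader()
# Passing spider_type explicitly; per the docstring, None would fall back to CrawlSitemapSpider anyway.
paths = downloader.download_sitemap(
    "https://example.com",
    output="./downloads",
    spider_type=CrawlSitemapSpider,
)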
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi
@@ -0,0 +1,27 @@
+ from .crawl_spider import CrawlBaseSpider as CrawlBaseSpider
+ from _typeshed import Incomplete
+ from collections.abc import Generator
+ from gllm_docproc.downloader.html.utils import clean_url as clean_url
+ from scrapy.http import Request, Response
+
+ class CrawlPDFSpider(CrawlBaseSpider):
+     """Scrapy CrawlSpider to crawl websites and save responses as PDFs using Playwright.
+
+     Attributes:
+         name (str): The name of the spider - 'crawl_pdf_spider'.
+         allowed_domains (list): The allowed domains for the spider to crawl.
+         start_urls (list): The starting URLs for the spider to initiate crawling.
+         custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+         rules (tuple): The rules to be followed during the crawling process.
+     """
+     name: str
+     def add_playwright(self, request: Request, _: Response):
+         """Adds playwright meta information to the request."""
+     async def start(self) -> Generator[Incomplete]:
+         """Start Request.
+
+         Initiates requests for the specified start URLs using Scrapy requests with additional
+         meta information for playwright usage.
+         """
+     async def parse_web(self, response: Response):
+         """Parses the response obtained from the website, distinguishing between HTML content and other file types."""
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi
@@ -0,0 +1,28 @@
+ from .crawl_sitemap_spider import CrawlSitemapSpider as CrawlSitemapSpider
+ from scrapy.crawler import Crawler
+ from typing import Any
+
+ class CrawlSitemapLinkSpider(CrawlSitemapSpider):
+     """A Scrapy spider designed to scrape links from the sitemaps.
+
+     This spider uses the CrawlSitemapSpider base class to follow the sitemap links provided in the
+     robots.txt file of the website. It parses each page and extracts the URLs of the pages. If an error occurs during parsing, it logs the error.
+
+     Attributes:
+         name (str): The name of the spider - 'crawl_sitemap_link_spider'.
+         custom_settings (dict): Custom settings for the spider, including the log level and log file.
+     """
+     name: str
+     custom_settings: dict[str, Any]
+     @classmethod
+     def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+         """Creates a new instance of the spider.
+
+         Args:
+             crawler (scrapy.crawler.Crawler): The Scrapy crawler object.
+             *args: Variable length argument list.
+             **kwargs: Arbitrary keyword arguments.
+
+         Returns:
+             CrawlSitemapLinkSpider: A new instance of the spider.
+         """
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi
@@ -0,0 +1,61 @@
+ from _typeshed import Incomplete
+ from scrapy.crawler import Crawler
+ from scrapy.http import Response
+ from scrapy.spiders import SitemapSpider
+ from typing import Any
+
+ class CrawlSitemapSpider(SitemapSpider):
+     """A Scrapy spider designed to scrape content from the sitemaps.
+
+     This spider uses the SitemapSpider base class to follow the sitemap links provided in the
+     robots.txt file of the website. It parses each page and extracts the URLs of the pages. If
+     an error occurs during parsing, it logs the error.
+
+     Attributes:
+         name (str): The name of the spider - 'crawl_sitemap_spider'.
+         sitemap_urls (list): The URLs of the sitemaps to start crawling from.
+         allowed_domains (list): The domains that this spider is allowed to crawl.
+         custom_settings (dict): Custom settings for the spider, including the log level and log file.
+     """
+     name: str
+     custom_settings: dict[str, Any]
+     sitemap_urls: Incomplete
+     allowed_domains: Incomplete
+     callback: Incomplete
+     removed_components: Incomplete
+     is_follow_page: Incomplete
+     def __init__(self, *args: Any, **kwargs: Any) -> None:
+         """Initializes the CrawlSitemapSpider instance.
+
+         The method initializes the CrawlSitemapSpider instance and sets the sitemap_urls, allowed_domains,
+         and sitemap_from_robots attributes based on the provided arguments.
+
+         Args:
+             *args: Variable length argument list.
+             **kwargs: Arbitrary keyword arguments.
+         """
+     def parse(self, response: Response):
+         """Parse the response.
+
+         This method parses the response obtained from the website, extracts the URLs of the pages,
+         and follows the links to the next pages.
+
+         This method attempts to yield a dictionary containing the URL of the response. If an error occurs,
+         it yields the URL and an error message.
+         It also extracts the URLs of the next pages from the response and follows them.
+
+         Args:
+             response (scrapy.http.Response): The response object to parse.
+         """
+     @classmethod
+     def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any) -> CrawlSitemapSpider:
+         """Creates a new CrawlSitemapSpider instance and sets the custom settings.
+
+         Args:
+             crawler (scrapy.crawler.Crawler): The crawler object.
+             *args: Variable length argument list.
+             **kwargs: Arbitrary keyword arguments.
+
+         Returns:
+             CrawlSitemapSpider: The CrawlSitemapSpider instance.
+         """
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi
@@ -0,0 +1,66 @@
+ from _typeshed import Incomplete
+ from collections.abc import Generator
+ from scrapy.crawler import Crawler
+ from scrapy.http import Request, Response
+ from scrapy.spiders import CrawlSpider
+ from typing import Any
+
+ class CrawlBaseSpider(CrawlSpider):
+     """A Scrapy CrawlSpider designed to crawl and extract content from websites.
+
+     Attributes:
+         name (str): The name of the spider - 'crawl_spider'.
+         allowed_domains (list): The allowed domains for the spider to crawl.
+         start_urls (list): The starting URLs for the spider to initiate crawling.
+         custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+         rules (tuple): The rules to be followed during the crawling process.
+     """
+     name: str
+     rules: Incomplete
+     custom_settings: dict[str, Any]
+     start_urls: Incomplete
+     allowed_domains: Incomplete
+     callback: Incomplete
+     removed_components: Incomplete
+     def __init__(self, *args: Any, **kwargs: Any) -> None:
+         """Initialize the CrawlBaseSpider."""
+     def add_playwright(self, request: Request, _: Response):
+         """Adds playwright meta information to the request."""
+     async def start(self) -> Generator[Incomplete]:
+         """Start Request.
+
+         Initiates requests for the specified start URLs using Scrapy requests with additional
+         meta information for playwright usage.
+         """
+     def parse_web(self, response: Response):
+         """Parses the response obtained from the website, distinguishing between HTML content and other file types.
+
+         Args:
+             response (Response): The response object containing the page content.
+
+         Returns:
+             None
+         """
+     def follow_selected_urls(self, response: Response):
+         """Follows selected URLs from the response.
+
+         Args:
+             response (Response): The response object containing the page content.
+
+         Returns:
+             None
+         """
+     @classmethod
+     def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+         """Creates a new instance of the CrawlBaseSpider with custom settings.
+
+         Args:
+             crawler (Crawler): The crawler object.
+             *args (Any): The arguments to be passed to the spider.
+             **kwargs (Any): The keyword arguments to be passed to the spider.
+
+         Returns:
+             CrawlBaseSpider: The CrawlBaseSpider instance.
+         """
+     async def errback(self, failure: Any):
+         """Handles errors encountered during the crawling process and closes playwright pages."""
gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi
@@ -0,0 +1,22 @@
+ from .scrape_spider import ScrapeSpider as ScrapeSpider
+ from _typeshed import Incomplete
+ from collections.abc import Generator
+ from typing import Any
+
+ class PlaywrightScrapeSpider(ScrapeSpider):
+     """A Scrapy spider designed to scrape content from a website, using Playwright to render JavaScript-loaded pages.
+
+     Attributes:
+         name (str): The name of the spider - 'playwright_scrape_spider'.
+         allowed_domains (list): The allowed domains for the spider to crawl.
+         start_urls (list): The starting URLs for the spider to initiate crawling.
+         custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+     """
+     name: str
+     custom_settings: dict[str, Any]
+     async def start(self) -> Generator[Incomplete]:
+         """Start Request.
+
+         Initiates requests for the specified start URLs using Scrapy requests with additional
+         meta information for playwright usage.
+         """
gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi
@@ -0,0 +1,57 @@
+ import scrapy
+ from _typeshed import Incomplete
+ from scrapy.crawler import Crawler
+ from scrapy.http import Response
+ from typing import Any
+
+ class ScrapeSpider(scrapy.Spider):
+     """A Scrapy spider designed to scrape content from a website.
+
+     Attributes:
+         name (str): The name of the spider - 'scrape_spider'.
+         start_urls (list): The list of URLs to start the spider.
+         allowed_domains (list): The list of allowed domains for crawling.
+         extracted_html (str): The HTML content extracted during crawling.
+         custom_settings (dict): Custom settings for the spider, including the log level and log file.
+     """
+     name: str
+     custom_settings: dict[str, Any]
+     start_urls: Incomplete
+     allowed_domains: Incomplete
+     callback: Incomplete
+     removed_components: Incomplete
+     def __init__(self, *args: Any, **kwargs: Any) -> None:
+         """Initializes the ScrapeSpider.
+
+         Args:
+             *args: Variable length argument list.
+             **kwargs: Arbitrary keyword arguments.
+         """
+     def parse(self, response: Response, **kwargs: Any) -> None:
+         """Parses the response obtained from the website, distinguishing between HTML content and other file types.
+
+         Args:
+             response (scrapy.http.Response): The response obtained from the website.
+             **kwargs (dict[str, Any]): Additional keyword arguments.
+         """
+     def get_content_type(self, response: Response) -> str:
+         """Gets the content type from the response headers.
+
+         Args:
+             response (scrapy.http.Response): The response object.
+
+         Returns:
+             str: The content type.
+         """
+     @classmethod
+     def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+         """Creates a new ScrapeSpider instance from the crawler.
+
+         Args:
+             crawler (scrapy.crawler.Crawler): The crawler object.
+             *args: Variable length argument list.
+             **kwargs: Arbitrary keyword arguments.
+
+         Returns:
+             ScrapeSpider: The ScrapeSpider instance.
+         """
gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi
@@ -0,0 +1,51 @@
+ from .scrape_spider import ScrapeSpider as ScrapeSpider
+ from _typeshed import Incomplete
+ from collections.abc import Generator
+ from scrapy.crawler import Crawler
+ from scrapy.http import Response
+ from typing import Any
+
+ class ZyteScrapeSpider(ScrapeSpider):
+     """A Scrapy spider designed to scrape content from a website using the Zyte API.
+
+     This spider is specifically tailored for scraping content from a website using the Zyte API to handle blocking
+     and render JavaScript-loaded pages.
+
+     Attributes:
+         name (str): The name of the spider - 'zyte_scrape_spider'.
+         allowed_domains (list): The list of allowed domains for crawling.
+         start_urls (list): The starting URLs for the spider to initiate crawling.
+         custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+     """
+     name: str
+     custom_settings: dict[str, Any]
+     @classmethod
+     def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+         """Create an instance of the spider from a Scrapy crawler.
+
+         This method is a class method that is called by Scrapy to create an instance of the spider
+         based on the provided Scrapy crawler and additional arguments.
+
+         Args:
+             crawler: The Scrapy crawler object.
+             *args: Variable length argument list.
+             **kwargs: Variable length keyword argument list.
+
+         Returns:
+             ZyteScrapeSpider: An instance of the spider.
+         """
+     async def start(self) -> Generator[Incomplete]:
+         """Start Request.
+
+         Initiates requests for the specified start URLs using Scrapy requests with additional
+         meta information for zyte usage.
+
+         This method iterates over the start_urls list and creates a Scrapy Request for each URL.
+         The Request includes meta information to enable the browserHtml feature of the Zyte Automatic Extraction API.
+         """
+     def parse(self, response: Response) -> None:
+         """Parses the HTML response obtained from the website.
+
+         Args:
+             response (scrapy.http.Response): The response object to parse.
+         """
gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi
@@ -0,0 +1,43 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.downloader.html.exception import ZyteApiKeyNotProvidedException as ZyteApiKeyNotProvidedException
+ from gllm_docproc.downloader.html.scraper.scraper.spiders import ScrapeSpider as ScrapeSpider, ZyteScrapeSpider as ZyteScrapeSpider
+ from scrapy import Spider as Spider
+ from typing import Any
+
+ class WebScraperExecutor:
+     '''A utility class for initiating and running web scraping processes using Scrapy spiders.
+
+     This class supports multiple spider types such as PlaywrightScrapeSpider, ZyteScrapeSpider, CrawlBaseSpider,
+     and CrawlSitemapSpider. It utilizes multiprocessing to run the scraping process concurrently.
+
+     Methods:
+         __init__: Initializes the WebScraperExecutor instance.
+         get_url_content_pairs: Initiates the Scrapy spider and starts the scraping process using multiprocessing.
+         get_spider_class: Gets the appropriate Scrapy spider class based on the provided spider type.
+         _crawler_results: Appends the provided item to the list of items.
+         _create_crawl_process: Creates and runs a Scrapy crawl process for a specific spider.
+         _is_connected_to_internet: Checks if the system is connected to the internet.
+
+     Raises:
+         ZyteApiKeyNotProvidedException: If the spider is "zyte" but the Zyte API key is not provided.
+     '''
+     results: Incomplete
+     items: dict[str, bytes | Exception]
+     kwargs: Incomplete
+     spider: Incomplete
+     def __init__(self, urls: list[str] | str, **kwargs: Any) -> None:
+         """Initializes the WebScraperExecutor instance.
+
+         Args:
+             urls (list[str] | str): The URLs to be scraped.
+             **kwargs (Any): Additional keyword arguments.
+         """
+     def get_url_content_pairs(self) -> list[tuple[str, bytes | Exception]]:
+         '''Initiates the Scrapy spider and starts the scraping process using multiprocessing.
+
+         Returns:
+             list[tuple[str, bytes | Exception]]: A list of (URL, HTML content) pairs from the scrape.
+
+         Raises:
+             ZyteApiKeyNotProvidedException: If the spider is "zyte" but the Zyte API key is not provided.
+         '''
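A minimal sketch of driving WebScraperExecutor directly, based only on the constructor and get_url_content_pairs signatures above; any spider-selection kwargs are not documented in this stub and are therefore omitted as assumptions.

from gllm_docproc.downloader.html.scraper.web_scraper_executor import WebScraperExecutor

executor = WebScraperExecutor(["https://example.com"])  # placeholder URL
for url, content in executor.get_url_content_pairs():
    if isinstance(content, Exception):
        print(f"failed to scrape {url}: {content}")
    else:
        print(f"scraped {url}: {len(content)} bytes of HTML")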
gllm_docproc/downloader/html/utils/__init__.pyi
@@ -0,0 +1,3 @@
+ from .web_utils import clean_url as clean_url, generate_filename_from_url as generate_filename_from_url, is_valid_url as is_valid_url
+
+ __all__ = ['is_valid_url', 'clean_url', 'generate_filename_from_url']
gllm_docproc/downloader/html/utils/web_utils.pyi
@@ -0,0 +1,39 @@
+ def is_valid_url(url: str) -> bool:
+     """Checks if the provided URL is valid.
+
+     Args:
+         url (str): The URL to be validated.
+
+     Returns:
+         bool: True if the URL is valid; False otherwise.
+     """
+ def clean_url(url: str) -> str:
+     '''Return a deterministic filename stem by sanitizing only disallowed characters.
+
+     Only these characters are replaced with underscores: < > : " / \\ | ? *
+
+     Example:
+         https://www.bca.co.id/promo-bca -> https_www.bca.co.id_promo-bca
+
+     Args:
+         url (str): The URL to clean.
+
+     Returns:
+         str: A sanitized filename stem with only the listed characters replaced.
+     '''
+ def generate_filename_from_url(url: str, max_filename_len: int = 128) -> str:
+     """Generate a sanitized, unique, and length-safe filename stem from a URL.
+
+     The returned value is a filename stem (no extension). It is composed of a sanitized
+     version of the URL plus a uniqueness suffix consisting of a short random token.
+     The function also trims the base so that the final filename stem will not exceed
+     the specified maximum length.
+
+     Args:
+         url (str): The URL to derive the filename from.
+         max_filename_len (int, optional): Maximum total filename length for the stem.
+             Defaults to 128.
+
+     Returns:
+         str: A safe filename stem (no extension) that is unique and within length constraints.
+     """
gllm_docproc/dpo_router/__init__.pyi
@@ -0,0 +1,5 @@
+ from .base_dpo_router import BaseDPORouter as BaseDPORouter
+ from .loader_router import LoaderRouter as LoaderRouter
+ from .parser_router import ParserRouter as ParserRouter
+
+ __all__ = ['BaseDPORouter', 'LoaderRouter', 'ParserRouter']
gllm_docproc/dpo_router/base_dpo_router.pyi
@@ -0,0 +1,16 @@
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseDPORouter(ABC):
+     """Base class for routing in document processing."""
+     @abstractmethod
+     def route(self, *args: Any, **kwargs: Any) -> Any:
+         """Routes the input into different processing pipelines based on certain criteria.
+
+         Args:
+             *args (Any): Variable length argument list for routing parameters.
+             **kwargs (Any): Arbitrary keyword arguments for additional routing configuration.
+
+         Returns:
+             Any: The result of the routing process.
+         """
gllm_docproc/dpo_router/loader_router.pyi
@@ -0,0 +1,52 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.dpo_router.base_dpo_router import BaseDPORouter as BaseDPORouter
+ from gllm_docproc.loader.csv.pandas_loader import CSV_VARIANTS as CSV_VARIANTS
+ from gllm_docproc.loader.image import ImageLoader as ImageLoader
+ from gllm_docproc.loader.json.json_elements_loader import JSON as JSON
+ from gllm_docproc.loader.txt import TXTLoader as TXTLoader
+ from gllm_docproc.loader.video.video_loader_utils import is_supported_video_file as is_supported_video_file
+ from gllm_docproc.model import Element as Element, LoaderType as LoaderType
+ from gllm_docproc.model.element_metadata import DOCX as DOCX, HTML as HTML, PDF as PDF, PPTX as PPTX, XLSX as XLSX
+ from typing import Any
+
+ class LoaderRouter(BaseDPORouter):
+     """Loader Router class.
+
+     This router determines the appropriate loader type based on the input source.
+     Returns a dict with the loader type information.
+     """
+     logger: Incomplete
+     txt_loader: Incomplete
+     image_loader: Incomplete
+     def __init__(self) -> None:
+         """Initialize the LoaderRouter.
+
+         This method initializes the LoaderRouter.
+         """
+     def route(self, source: str, *args: Any, **kwargs: Any) -> dict[str, Any]:
+         """Route the input source to the appropriate loader type.
+
+         This method determines the appropriate loader type based on the input source.
+         It checks if the source is a file or a YouTube URL.
+         1. If it is a file, it checks the file extension or content to determine the loader type.
+         2. If it is a YouTube URL, it returns the audio loader type.
+         3. If it is not a file or a YouTube URL, it returns the uncategorized loader type.
+
+         Args:
+             source (str): The input source, either a file path or a YouTube URL.
+             *args (Any): Additional arguments.
+             **kwargs (Any): Additional keyword arguments.
+
+         Returns:
+             dict[str, Any]: A dictionary containing the loader type information.
+                 Example: {LoaderType.KEY: LoaderType.PDF_LOADER}
+         """
+     def is_html_from_json(self, source: str) -> bool:
+         """Check if the source file contains valid HTML metadata.
+
+         Args:
+             source (str): The file path to check.
+
+         Returns:
+             bool: True if the file is a valid HTML metadata file, False otherwise.
+         """