data-prep-connector 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. data_prep_connector-0.2.2/Makefile +51 -0
  2. data_prep_connector-0.2.2/PKG-INFO +54 -0
  3. data_prep_connector-0.2.2/README.md +30 -0
  4. data_prep_connector-0.2.2/doc/overview.md +61 -0
  5. data_prep_connector-0.2.2/pyproject.toml +61 -0
  6. data_prep_connector-0.2.2/setup.cfg +4 -0
  7. data_prep_connector-0.2.2/src/data_prep_connector.egg-info/PKG-INFO +54 -0
  8. data_prep_connector-0.2.2/src/data_prep_connector.egg-info/SOURCES.txt +26 -0
  9. data_prep_connector-0.2.2/src/data_prep_connector.egg-info/dependency_links.txt +1 -0
  10. data_prep_connector-0.2.2/src/data_prep_connector.egg-info/requires.txt +15 -0
  11. data_prep_connector-0.2.2/src/data_prep_connector.egg-info/top_level.txt +1 -0
  12. data_prep_connector-0.2.2/src/dpk_connector/__init__.py +13 -0
  13. data_prep_connector-0.2.2/src/dpk_connector/core/__init__.py +11 -0
  14. data_prep_connector-0.2.2/src/dpk_connector/core/crawler.py +222 -0
  15. data_prep_connector-0.2.2/src/dpk_connector/core/item.py +21 -0
  16. data_prep_connector-0.2.2/src/dpk_connector/core/logging.py +22 -0
  17. data_prep_connector-0.2.2/src/dpk_connector/core/middlewares.py +263 -0
  18. data_prep_connector-0.2.2/src/dpk_connector/core/pipelines.py +29 -0
  19. data_prep_connector-0.2.2/src/dpk_connector/core/settings.py +70 -0
  20. data_prep_connector-0.2.2/src/dpk_connector/core/spiders/__init__.py +11 -0
  21. data_prep_connector-0.2.2/src/dpk_connector/core/spiders/sitemap.py +342 -0
  22. data_prep_connector-0.2.2/src/dpk_connector/core/utils.py +97 -0
  23. data_prep_connector-0.2.2/test/dpk_connector/core/__init__.py +0 -0
  24. data_prep_connector-0.2.2/test/dpk_connector/core/test_crawler.py +39 -0
  25. data_prep_connector-0.2.2/test/dpk_connector/core/test_middlewares.py +71 -0
  26. data_prep_connector-0.2.2/test/dpk_connector/core/test_sitemap_spider/index.html +39 -0
  27. data_prep_connector-0.2.2/test/dpk_connector/core/test_sitemap_spider.py +119 -0
  28. data_prep_connector-0.2.2/test/dpk_connector/core/test_utils.py +178 -0
@@ -0,0 +1,51 @@
1
# Use make help, to see the available rules
REPOROOT=..
include $(REPOROOT)/.make.defaults

# Version written into pyproject.toml by the set-versions rule below.
DPK_CONNECTOR_VERSION=0.2.2

clean::
	@# Help: Clean up the distribution build and the venv
	rm -rf dist venv
	rm -rf src/*egg-info

.check-env::
	@echo "Checks passed"

setup::

# Pushes DPK_CONNECTOR_VERSION to the shared update-toml rule as TOML_VERSION.
set-versions: .check-env
	$(MAKE) TOML_VERSION=$(DPK_CONNECTOR_VERSION) .defaults.update-toml

build:: build-dist

#build:: update-toml .defaults.build-dist
build-dist :: .defaults.build-dist

publish:: publish-dist

publish-dist :: .check-env .defaults.publish-dist

# The install steps are chained with && so that a failing step aborts the
# recipe instead of being masked by the exit status of a later command.
# `.` is used instead of `source` so the recipe does not depend on SHELL being bash.
venv:: pyproject.toml
	@# Help: Create the virtual environment using pyproject.toml
	rm -rf dist venv
	rm -rf src/*egg-info
	rm -f makeenv
	$(PYTHON) -m venv venv
	. venv/bin/activate && \
	pip install --upgrade pip && \
	pip install -e . && \
	pip install pytest pytest-mock pytest-datadir pytest-cov moto==5.0.5 markupsafe==2.0.1

image::
	@# Help: Placeholder does nothing for now.
	@echo "Image building for ray is in the works (comming soon)."

# Here we run each test directory of tests and each ray launched test separately, because
# it seems when running multiple ray launch tests in a single pytest run there is some sort of ray.init() duplication.
# pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
# TODO: the following fails. Why? source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) .
.PHONY: test
test:: venv
	@# Help: Use the already-built virtual environment to run pytest on the test directory.
	. venv/bin/activate && $(PYTEST)
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.1
2
+ Name: data_prep_connector
3
+ Version: 0.2.2
4
+ Summary: Scalable and Compliant Web Crawler
5
+ Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
+ License: Apache-2.0
7
+ Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: scrapy>=2.11.2
11
+ Requires-Dist: pydantic>=2.8.1
12
+ Requires-Dist: tldextract>=5.1.2
13
+ Provides-Extra: dev
14
+ Requires-Dist: twine; extra == "dev"
15
+ Requires-Dist: pytest>=7.3.2; extra == "dev"
16
+ Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
17
+ Requires-Dist: pytest-env>=1.0.0; extra == "dev"
18
+ Requires-Dist: pre-commit>=3.3.2; extra == "dev"
19
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
20
+ Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
21
+ Requires-Dist: pytest-datadir>=1.5.0; extra == "dev"
22
+ Requires-Dist: moto==5.0.5; extra == "dev"
23
+ Requires-Dist: markupsafe==2.0.1; extra == "dev"
24
+
25
+ # DPK Connector
26
+
27
+ DPK Connector is a scalable and compliant web crawler developed for data acquisition towards LLM development. It is built on [Scrapy](https://scrapy.org/).
28
+ For more details read [the documentation](doc/overview.md).
29
+
30
+ ## Virtual Environment
31
+
32
+ The project uses `pyproject.toml` and a Makefile for operations.
33
+ To do development you should establish the virtual environment
34
+ ```shell
35
+ make venv
36
+ ```
37
+ and then either activate
38
+ ```shell
39
+ source venv/bin/activate
40
+ ```
41
+ or set up your IDE to use the venv directory when developing in this project
42
+
43
+ ## Library Artifact Build and Publish
44
+
45
+ To test, build and publish the library
46
+ ```shell
47
+ make test build publish
48
+ ```
49
+
50
+ To bump the version number, edit the Makefile to change `DPK_CONNECTOR_VERSION` and rerun the above. This will require committing both the `Makefile` and the automatically updated `pyproject.toml` file.
51
+
52
+ ## How to use
53
+
54
+ See [the overview](doc/overview.md).
@@ -0,0 +1,30 @@
1
+ # DPK Connector
2
+
3
+ DPK Connector is a scalable and compliant web crawler developed for data acquisition towards LLM development. It is built on [Scrapy](https://scrapy.org/).
4
+ For more details read [the documentation](doc/overview.md).
5
+
6
+ ## Virtual Environment
7
+
8
+ The project uses `pyproject.toml` and a Makefile for operations.
9
+ To do development you should establish the virtual environment
10
+ ```shell
11
+ make venv
12
+ ```
13
+ and then either activate
14
+ ```shell
15
+ source venv/bin/activate
16
+ ```
17
+ or set up your IDE to use the venv directory when developing in this project
18
+
19
+ ## Library Artifact Build and Publish
20
+
21
+ To test, build and publish the library
22
+ ```shell
23
+ make test build publish
24
+ ```
25
+
26
+ To bump the version number, edit the Makefile to change `DPK_CONNECTOR_VERSION` and rerun the above. This will require committing both the `Makefile` and the automatically updated `pyproject.toml` file.
27
+
28
+ ## How to use
29
+
30
+ See [the overview](doc/overview.md).
@@ -0,0 +1,61 @@
1
+ # DPK Connector Overview
2
+
3
+ The Data Prep Kit Connector (DPK Connector) is a Python library for scalable and compliant web crawling.
4
+
5
+ Features:
6
+ - Robots.txt compliant: The Connector follows allow/disallow lists and some extended directives such as `Crawl-delay` in robots.txt of websites.
7
+ - Sitemap support: The Connector automatically parses sitemap urls from input and tries to find them from robots.txt.
8
+ - User agent and headers customization: You can use your own user agent string and request headers.
9
+ - Domain and path focus: You can limit domains and paths accessed by the library.
10
+ - Mime type filters: You can restrict mime types which can be downloaded.
11
+ - Parallel processing: Requests to websites are processed in parallel.
12
+
13
+ ## How to install
14
+
15
+ ### From PyPI
16
+
17
+ ```sh
18
+ pip install data-prep-connector
19
+ ```
20
+
21
+ ### From Github
22
+
23
+ ```sh
24
+ pip install git+https://github.com/IBM/data-prep-kit.git@dev#subdirectory=data-connector-lib
25
+ ```
26
+
27
+ ## Example usage
28
+
29
+ ```python
30
+ from dpk_connector import crawl, shutdown
31
+
32
+
33
+ def main():
34
+ """
35
+ An example of running a crawl.
36
+ """
37
+
38
+ def on_downloaded(url: str, body: bytes, headers: dict) -> None:
39
+ """
40
+ Callback function called when a page has been downloaded.
41
+ You have access to the request URL, response body and headers.
42
+ """
43
+ print(f"url: {url}, headers: {headers}, body: {body[:64]}")
44
+
45
+ user_agent = "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0"
46
+
47
+ # Start crawling
48
+ crawl(
49
+ ["https://crawler-test.com/"],
50
+ on_downloaded,
51
+ user_agent=user_agent,
52
+ depth_limit=0,
53
+ ) # blocking call
54
+
55
+ # Shutdown all crawls
56
+ shutdown()
57
+
58
+
59
+ if __name__ == "__main__":
60
+ main()
61
+ ```
@@ -0,0 +1,61 @@
1
[project]
name = "data_prep_connector"
version = "0.2.2"
requires-python = ">=3.10"
keywords = [
    "data",
    "data acquisition",
    "crawler",
    "web crawler",
    "llm",
    "generative",
    "ai",
    "fine-tuning",
    "llmapps",
]
description = "Scalable and Compliant Web Crawler"
license = { text = "Apache-2.0" }
readme = { file = "README.md", content-type = "text/markdown" }
authors = [{ name = "Hiroya Matsubara", email = "hmtbr@jp.ibm.com" }]
dependencies = [
    "scrapy>=2.11.2",
    "pydantic>=2.8.1",
    "tldextract>=5.1.2",
]

# Must be the `urls` sub-table of `project`; a top-level `project_urls` table
# is not part of the project metadata and never reaches the built package.
[project.urls]
Repository = "https://github.com/IBM/data-prep-kit"
Issues = "https://github.com/IBM/data-prep-kit/issues"
Documentation = "https://ibm.github.io/data-prep-kit/"

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[project.optional-dependencies]
dev = [
    "twine",
    "pytest>=7.3.2",
    "pytest-dotenv>=0.5.2",
    "pytest-env>=1.0.0",
    "pre-commit>=3.3.2",
    "pytest-cov>=4.1.0",
    "pytest-mock>=3.10.0",
    "pytest-datadir>=1.5.0",
    "moto==5.0.5",
    "markupsafe==2.0.1",
]

# setuptools reads its pyproject.toml configuration from `tool.setuptools`;
# `[options]` / `[options.packages.find]` are setup.cfg section names.
# Packages live under src/ (src layout), so discovery starts there.
[tool.setuptools.packages.find]
where = ["src"]

[tool.pytest.ini_options]
# Currently we use low coverage since we have to run tests separately (see makefile)
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
markers = ["unit: unit tests", "integration: integration tests"]

[tool.coverage.run]
include = ["src/*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.1
2
+ Name: data_prep_connector
3
+ Version: 0.2.2
4
+ Summary: Scalable and Compliant Web Crawler
5
+ Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
+ License: Apache-2.0
7
+ Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: scrapy>=2.11.2
11
+ Requires-Dist: pydantic>=2.8.1
12
+ Requires-Dist: tldextract>=5.1.2
13
+ Provides-Extra: dev
14
+ Requires-Dist: twine; extra == "dev"
15
+ Requires-Dist: pytest>=7.3.2; extra == "dev"
16
+ Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
17
+ Requires-Dist: pytest-env>=1.0.0; extra == "dev"
18
+ Requires-Dist: pre-commit>=3.3.2; extra == "dev"
19
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
20
+ Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
21
+ Requires-Dist: pytest-datadir>=1.5.0; extra == "dev"
22
+ Requires-Dist: moto==5.0.5; extra == "dev"
23
+ Requires-Dist: markupsafe==2.0.1; extra == "dev"
24
+
25
+ # DPK Connector
26
+
27
+ DPK Connector is a scalable and compliant web crawler developed for data acquisition towards LLM development. It is built on [Scrapy](https://scrapy.org/).
28
+ For more details read [the documentation](doc/overview.md).
29
+
30
+ ## Virtual Environment
31
+
32
+ The project uses `pyproject.toml` and a Makefile for operations.
33
+ To do development you should establish the virtual environment
34
+ ```shell
35
+ make venv
36
+ ```
37
+ and then either activate
38
+ ```shell
39
+ source venv/bin/activate
40
+ ```
41
+ or set up your IDE to use the venv directory when developing in this project
42
+
43
+ ## Library Artifact Build and Publish
44
+
45
+ To test, build and publish the library
46
+ ```shell
47
+ make test build publish
48
+ ```
49
+
50
+ To bump the version number, edit the Makefile to change `DPK_CONNECTOR_VERSION` and rerun the above. This will require committing both the `Makefile` and the automatically updated `pyproject.toml` file.
51
+
52
+ ## How to use
53
+
54
+ See [the overview](doc/overview.md).
@@ -0,0 +1,26 @@
1
+ Makefile
2
+ README.md
3
+ pyproject.toml
4
+ doc/overview.md
5
+ src/data_prep_connector.egg-info/PKG-INFO
6
+ src/data_prep_connector.egg-info/SOURCES.txt
7
+ src/data_prep_connector.egg-info/dependency_links.txt
8
+ src/data_prep_connector.egg-info/requires.txt
9
+ src/data_prep_connector.egg-info/top_level.txt
10
+ src/dpk_connector/__init__.py
11
+ src/dpk_connector/core/__init__.py
12
+ src/dpk_connector/core/crawler.py
13
+ src/dpk_connector/core/item.py
14
+ src/dpk_connector/core/logging.py
15
+ src/dpk_connector/core/middlewares.py
16
+ src/dpk_connector/core/pipelines.py
17
+ src/dpk_connector/core/settings.py
18
+ src/dpk_connector/core/utils.py
19
+ src/dpk_connector/core/spiders/__init__.py
20
+ src/dpk_connector/core/spiders/sitemap.py
21
+ test/dpk_connector/core/__init__.py
22
+ test/dpk_connector/core/test_crawler.py
23
+ test/dpk_connector/core/test_middlewares.py
24
+ test/dpk_connector/core/test_sitemap_spider.py
25
+ test/dpk_connector/core/test_utils.py
26
+ test/dpk_connector/core/test_sitemap_spider/index.html
@@ -0,0 +1,15 @@
1
+ scrapy>=2.11.2
2
+ pydantic>=2.8.1
3
+ tldextract>=5.1.2
4
+
5
+ [dev]
6
+ twine
7
+ pytest>=7.3.2
8
+ pytest-dotenv>=0.5.2
9
+ pytest-env>=1.0.0
10
+ pre-commit>=3.3.2
11
+ pytest-cov>=4.1.0
12
+ pytest-mock>=3.10.0
13
+ pytest-datadir>=1.5.0
14
+ moto==5.0.5
15
+ markupsafe==2.0.1
@@ -0,0 +1,13 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from dpk_connector.core.crawler import async_crawl, crawl, shutdown # noqa
@@ -0,0 +1,11 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
@@ -0,0 +1,222 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import threading
14
+ from typing import Any, Callable, Collection, Type, cast
15
+
16
+ from scrapy import Spider
17
+ from scrapy.crawler import Crawler, CrawlerRunner
18
+ from scrapy.settings import Settings
19
+ from twisted.internet.defer import Deferred
20
+
21
+ from dpk_connector.core.utils import validate_domain, validate_url
22
+
23
+ _lock = threading.Lock()
24
+ _reactor_initialized = False
25
+ _reactor_started = False
26
+
27
+
28
def _run_reactor():
    """Thread target: run the Twisted reactor without installing signal handlers."""
    # Imported lazily, inside the function, exactly as the other reactor helpers do.
    from twisted.internet import reactor as twisted_reactor

    twisted_reactor.run(installSignalHandlers=False)
32
+
33
+
34
# Single daemon thread that hosts the reactor; it is created here but only
# started on demand by `_start_reactor`.
_reactor_thread: threading.Thread = threading.Thread(daemon=True, target=_run_reactor)
38
+
39
+
40
def _start_reactor():
    """Start the reactor thread on the first call; later calls do nothing."""
    global _reactor_started
    with _lock:
        if _reactor_started:
            return
        _reactor_thread.start()
        _reactor_started = True
46
+
47
+
48
def _stop_reactor():
    """
    Ask the Twisted reactor to stop. Does nothing if the reactor is not running.

    The reactor runs in a separate thread (`_reactor_thread`), while this
    function is normally called from the caller's thread. Twisted's APIs are
    not thread-safe, so the stop request is handed to the reactor thread with
    `callFromThread`, which also wakes the reactor if it is idle.
    """
    from twisted.internet import reactor

    def _stop_in_reactor_thread():
        try:
            reactor.stop()
        except RuntimeError:
            # The reactor was already stopped by the time the call ran.
            pass

    # Only queue the call for a running reactor; a call queued on a reactor
    # that has not started yet would stop it as soon as it starts.
    if reactor.running:
        reactor.callFromThread(_stop_in_reactor_thread)
55
+
56
+
57
class MultiThreadedCrawlerRunner(CrawlerRunner):
    """
    CrawlerRunner whose crawler creation is serialized by the module lock.

    Only the first Crawler built in this process is asked to initialize the
    reactor; every later one is created with that flag turned off.
    """

    def _create_crawler(self, spidercls: str | type[Spider]) -> Crawler:
        global _reactor_initialized
        # A spider given by name is resolved to its class through the loader.
        spider_type = (
            self.spider_loader.load(spidercls)
            if isinstance(spidercls, str)
            else spidercls
        )
        with _lock:
            is_first_crawler = not _reactor_initialized
            new_crawler = Crawler(
                cast(Type[Spider], spider_type), self.settings, is_first_crawler
            )
            _reactor_initialized = True
        return new_crawler
69
+
70
+
71
def async_crawl(
    seed_urls: Collection[str],
    on_downloaded: Callable[[str, bytes, dict[str, str]], None],
    user_agent: str = "",
    headers: dict[str, str] = {},
    allow_domains: Collection[str] = (),
    subdomain_focus: bool = False,
    path_focus: bool = False,
    allow_mime_types: Collection[str] = (
        "application/pdf",
        "text/html",
        "text/markdown",
        "text/plain",
    ),
    disallow_mime_types: Collection[str] = (),
    depth_limit: int = -1,
    download_limit: int = -1,
) -> Deferred[None]:
    # Assisted by WCA@IBM
    # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
    """
    Start a crawl in the background and return immediately.

    Parameters:
        seed_urls (Collection[str]): URLs the crawl starts from. Must be non-empty and every entry must pass `validate_url`.
        on_downloaded (Callable[[str, bytes, dict[str, str]], None]): Callback handed to the spider; called for each downloaded page.
        user_agent (str): User agent string for the crawler. When empty, the USER_AGENT setting is left untouched. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
        headers (dict[str, str]): Additional headers to send with each request. Default is an empty dictionary, in which case DEFAULT_REQUEST_HEADERS is left untouched.
        allow_domains (Collection[str]): Domains to restrict the crawler to; every entry must pass `validate_domain`. Default is the domains of the seed URLs.
        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
        path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
        allow_mime_types (Collection[str]): MIME types to allow during the crawl. Default is "application/pdf", "text/html", "text/markdown", and "text/plain".
        disallow_mime_types (Collection[str]): MIME types to disallow during the crawl. Default is an empty collection.
        depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit. Before being stored as DEPTH_LIMIT, 0 is translated to -1 and -1 to 0; other values are passed through.
        download_limit (int): The maximum number of pages to download. Default is -1, which means no limit (stored as CLOSESPIDER_ITEMCOUNT=0). This is a soft limit, meaning that a crawler may download more pages than the specified limit.

    Returns:
        Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.

    Raises:
        ValueError: If `seed_urls` is empty, a seed URL or allowed domain is invalid, or a limit is below -1.
    """
    # --- argument validation (order matters: the first failing check wins) ---
    if not seed_urls:
        raise ValueError("Empty seed URLs.")
    for seed in seed_urls:
        if not validate_url(seed):
            raise ValueError(f"Seed URL {seed} is not valid.")
    for allowed in allow_domains:
        if not validate_domain(allowed):
            raise ValueError(f"Allow domain {allowed} is not valid.")
    if depth_limit < -1:
        raise ValueError(f"Invalid depth limit {depth_limit}")
    if download_limit < -1:
        raise ValueError(f"Invalid download limit {download_limit}")

    # --- settings: package defaults first, then per-call overrides ---
    crawl_settings = Settings()
    crawl_settings.setmodule("dpk_connector.core.settings", priority="project")

    if user_agent:
        crawl_settings.set("USER_AGENT", user_agent, priority="spider")
    if headers:
        crawl_settings.set("DEFAULT_REQUEST_HEADERS", headers)

    # The public API and the DEPTH_LIMIT setting use swapped meanings for 0 and -1.
    scrapy_depth_limit = {0: -1, -1: 0}.get(depth_limit, depth_limit)
    crawl_settings.set("DEPTH_LIMIT", scrapy_depth_limit, priority="spider")

    item_count_limit = 0 if download_limit == -1 else download_limit
    crawl_settings.set("CLOSESPIDER_ITEMCOUNT", item_count_limit, priority="spider")

    # --- schedule the spider, make sure the reactor thread is running ---
    crawler_runner = MultiThreadedCrawlerRunner(crawl_settings)
    spider_kwargs = dict(
        seed_urls=seed_urls,
        callback=on_downloaded,
        allow_domains=allow_domains,
        subdomain_focus=subdomain_focus,
        path_focus=path_focus,
        allow_mime_types=allow_mime_types,
        disallow_mime_types=disallow_mime_types,
        disable_sitemap_search=True,
    )
    crawler_runner.crawl("dpk-connector-sitemap", **spider_kwargs)
    _start_reactor()
    return crawler_runner.join()
153
+
154
+
155
def crawl(
    seed_urls: Collection[str],
    on_downloaded: Callable[[str, bytes, dict[str, str]], None],
    user_agent: str = "",
    headers: dict[str, str] = {},
    allow_domains: Collection[str] = (),
    subdomain_focus: bool = False,
    path_focus: bool = False,
    allow_mime_types: Collection[str] = (
        "application/pdf",
        "text/html",
        "text/markdown",
        "text/plain",
    ),
    disallow_mime_types: Collection[str] = (),
    depth_limit: int = -1,
    download_limit: int = -1,
) -> None:
    # Assisted by WCA@IBM
    # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
    """
    Do crawl synchronously.

    All arguments are forwarded unchanged to `async_crawl`; the calling thread
    then blocks until the returned Deferred fires, with either a result or a
    failure.

    Parameters:
        seed_urls (Collection[str]): A collection of seed URLs to start the crawl from.
        on_downloaded (Callable[[str, bytes, dict[str, str]], None]): The callback function to be called for each downloaded page.
        user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
        headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
        allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
        path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
        allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
        disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
        depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
        download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.

    Returns:
        None

    Raises:
        ValueError: Propagated from `async_crawl` when an argument is invalid.
    """
    # An Event is used instead of Condition.notify()/wait(): the Deferred can
    # fire before this thread starts waiting, and a notify() with no waiter is
    # lost, which would block this call forever. An Event stays set, so the
    # order of set() and wait() does not matter.
    completed = threading.Event()

    def on_completed(result: Any):
        completed.set()

    d = async_crawl(
        seed_urls,
        on_downloaded,
        user_agent,
        headers,
        allow_domains,
        subdomain_focus,
        path_focus,
        allow_mime_types,
        disallow_mime_types,
        depth_limit,
        download_limit,
    )
    # addBoth: wake up on success and on failure alike.
    d.addBoth(on_completed)
    completed.wait()
216
+
217
+
218
def shutdown():
    """
    Stop every crawl by stopping the shared reactor (delegates to `_stop_reactor`).
    """
    _stop_reactor()
@@ -0,0 +1,21 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from dataclasses import dataclass
14
+
15
+
16
@dataclass()
class ConnectorItem:
    """
    Record of four boolean flags, all False by default.

    NOTE(review): what each flag means is decided by the code that sets and
    reads it, which is not in this module; confirm there.
    """

    dropped: bool = False
    downloaded: bool = False
    system_request: bool = False
    sitemap: bool = False
@@ -0,0 +1,22 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from scrapy.logformatter import LogFormatter as ScrapyLogFormatter
14
+
15
+
16
class QuietLogFormatter(ScrapyLogFormatter):
    """Log formatter that reports scraped items only when LOG_SCRAPED_ITEMS is enabled."""

    def scraped(self, item, response, spider):
        # Returning None tells the caller there is nothing to log for this item.
        if not spider.settings.getbool("LOG_SCRAPED_ITEMS"):
            return None
        return super().scraped(item, response, spider)