data-prep-connector 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_prep_connector-0.2.2/Makefile +51 -0
- data_prep_connector-0.2.2/PKG-INFO +54 -0
- data_prep_connector-0.2.2/README.md +30 -0
- data_prep_connector-0.2.2/doc/overview.md +61 -0
- data_prep_connector-0.2.2/pyproject.toml +61 -0
- data_prep_connector-0.2.2/setup.cfg +4 -0
- data_prep_connector-0.2.2/src/data_prep_connector.egg-info/PKG-INFO +54 -0
- data_prep_connector-0.2.2/src/data_prep_connector.egg-info/SOURCES.txt +26 -0
- data_prep_connector-0.2.2/src/data_prep_connector.egg-info/dependency_links.txt +1 -0
- data_prep_connector-0.2.2/src/data_prep_connector.egg-info/requires.txt +15 -0
- data_prep_connector-0.2.2/src/data_prep_connector.egg-info/top_level.txt +1 -0
- data_prep_connector-0.2.2/src/dpk_connector/__init__.py +13 -0
- data_prep_connector-0.2.2/src/dpk_connector/core/__init__.py +11 -0
- data_prep_connector-0.2.2/src/dpk_connector/core/crawler.py +222 -0
- data_prep_connector-0.2.2/src/dpk_connector/core/item.py +21 -0
- data_prep_connector-0.2.2/src/dpk_connector/core/logging.py +22 -0
- data_prep_connector-0.2.2/src/dpk_connector/core/middlewares.py +263 -0
- data_prep_connector-0.2.2/src/dpk_connector/core/pipelines.py +29 -0
- data_prep_connector-0.2.2/src/dpk_connector/core/settings.py +70 -0
- data_prep_connector-0.2.2/src/dpk_connector/core/spiders/__init__.py +11 -0
- data_prep_connector-0.2.2/src/dpk_connector/core/spiders/sitemap.py +342 -0
- data_prep_connector-0.2.2/src/dpk_connector/core/utils.py +97 -0
- data_prep_connector-0.2.2/test/dpk_connector/core/__init__.py +0 -0
- data_prep_connector-0.2.2/test/dpk_connector/core/test_crawler.py +39 -0
- data_prep_connector-0.2.2/test/dpk_connector/core/test_middlewares.py +71 -0
- data_prep_connector-0.2.2/test/dpk_connector/core/test_sitemap_spider/index.html +39 -0
- data_prep_connector-0.2.2/test/dpk_connector/core/test_sitemap_spider.py +119 -0
- data_prep_connector-0.2.2/test/dpk_connector/core/test_utils.py +178 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Use make help, to see the available rules
|
|
2
|
+
REPOROOT=..
|
|
3
|
+
include $(REPOROOT)/.make.defaults
|
|
4
|
+
|
|
5
|
+
DPK_CONNECTOR_VERSION=0.2.2
|
|
6
|
+
|
|
7
|
+
clean::
|
|
8
|
+
@# Help: Clean up the distribution build and the venv
|
|
9
|
+
rm -rf dist venv
|
|
10
|
+
rm -rf src/*egg-info
|
|
11
|
+
|
|
12
|
+
.check-env::
|
|
13
|
+
@echo "Checks passed"
|
|
14
|
+
|
|
15
|
+
setup::
|
|
16
|
+
|
|
17
|
+
set-versions: .check-env
|
|
18
|
+
$(MAKE) TOML_VERSION=$(DPK_CONNECTOR_VERSION) .defaults.update-toml
|
|
19
|
+
|
|
20
|
+
build:: build-dist
|
|
21
|
+
|
|
22
|
+
#build:: update-toml .defaults.build-dist
|
|
23
|
+
build-dist :: .defaults.build-dist
|
|
24
|
+
|
|
25
|
+
publish:: publish-dist
|
|
26
|
+
|
|
27
|
+
publish-dist :: .check-env .defaults.publish-dist
|
|
28
|
+
|
|
29
|
+
venv:: pyproject.toml
|
|
30
|
+
@# Help: Create the virtual environment using pyproject.toml
|
|
31
|
+
rm -r dist venv || true
|
|
32
|
+
rm -rf src/*egg-info || true
|
|
33
|
+
rm makeenv || true
|
|
34
|
+
$(PYTHON) -m venv venv
|
|
35
|
+
source venv/bin/activate; \
|
|
36
|
+
pip install --upgrade pip; \
|
|
37
|
+
pip install -e .; \
|
|
38
|
+
pip install pytest pytest-mock pytest-datadir pytest-cov moto==5.0.5 markupsafe==2.0.1
|
|
39
|
+
|
|
40
|
+
image::
|
|
41
|
+
@# Help: Placeholder does nothing for now.
|
|
42
|
+
@echo "Image building for ray is in the works (comming soon)."
|
|
43
|
+
|
|
44
|
+
# Here we run each test directory of tests and each ray launched test separately, because
|
|
45
|
+
# it seems when running multiple ray launch tests in a single pytest run there is some sort of ray.init() duplication.
|
|
46
|
+
# pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
|
|
47
|
+
# TODO: the following fails. Why? source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) .
|
|
48
|
+
.PHONY: test
|
|
49
|
+
test:: venv
|
|
50
|
+
@# Help: Use the already-built virtual environment to run pytest on the test directory.
|
|
51
|
+
source venv/bin/activate; $(PYTEST);
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: data_prep_connector
|
|
3
|
+
Version: 0.2.2
|
|
4
|
+
Summary: Scalable and Compliant Web Crawler
|
|
5
|
+
Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: scrapy>=2.11.2
|
|
11
|
+
Requires-Dist: pydantic>=2.8.1
|
|
12
|
+
Requires-Dist: tldextract>=5.1.2
|
|
13
|
+
Provides-Extra: dev
|
|
14
|
+
Requires-Dist: twine; extra == "dev"
|
|
15
|
+
Requires-Dist: pytest>=7.3.2; extra == "dev"
|
|
16
|
+
Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
|
|
17
|
+
Requires-Dist: pytest-env>=1.0.0; extra == "dev"
|
|
18
|
+
Requires-Dist: pre-commit>=3.3.2; extra == "dev"
|
|
19
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
|
|
21
|
+
Requires-Dist: pytest-datadir>=1.5.0; extra == "dev"
|
|
22
|
+
Requires-Dist: moto==5.0.5; extra == "dev"
|
|
23
|
+
Requires-Dist: markupsafe==2.0.1; extra == "dev"
|
|
24
|
+
|
|
25
|
+
# DPK Connector
|
|
26
|
+
|
|
27
|
+
DPK Connector is a scalable and compliant web crawler developed for data acquisition towards LLM development. It is built on [Scrapy](https://scrapy.org/).
|
|
28
|
+
For more details read [the documentation](doc/overview.md).
|
|
29
|
+
|
|
30
|
+
## Virtual Environment
|
|
31
|
+
|
|
32
|
+
The project uses `pyproject.toml` and a Makefile for operations.
|
|
33
|
+
To do development you should establish the virtual environment
|
|
34
|
+
```shell
|
|
35
|
+
make venv
|
|
36
|
+
```
|
|
37
|
+
and then either activate
|
|
38
|
+
```shell
|
|
39
|
+
source venv/bin/activate
|
|
40
|
+
```
|
|
41
|
+
or set up your IDE to use the venv directory when developing in this project
|
|
42
|
+
|
|
43
|
+
## Library Artifact Build and Publish
|
|
44
|
+
|
|
45
|
+
To test, build and publish the library
|
|
46
|
+
```shell
|
|
47
|
+
make test build publish
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
To bump the version number, edit `DPK_CONNECTOR_VERSION` in the Makefile and rerun the above. This will require committing both the `Makefile` and the automatically updated `pyproject.toml` file.
|
|
51
|
+
|
|
52
|
+
## How to use
|
|
53
|
+
|
|
54
|
+
See [the overview](doc/overview.md).
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# DPK Connector
|
|
2
|
+
|
|
3
|
+
DPK Connector is a scalable and compliant web crawler developed for data acquisition towards LLM development. It is built on [Scrapy](https://scrapy.org/).
|
|
4
|
+
For more details read [the documentation](doc/overview.md).
|
|
5
|
+
|
|
6
|
+
## Virtual Environment
|
|
7
|
+
|
|
8
|
+
The project uses `pyproject.toml` and a Makefile for operations.
|
|
9
|
+
To do development you should establish the virtual environment
|
|
10
|
+
```shell
|
|
11
|
+
make venv
|
|
12
|
+
```
|
|
13
|
+
and then either activate
|
|
14
|
+
```shell
|
|
15
|
+
source venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
or set up your IDE to use the venv directory when developing in this project
|
|
18
|
+
|
|
19
|
+
## Library Artifact Build and Publish
|
|
20
|
+
|
|
21
|
+
To test, build and publish the library
|
|
22
|
+
```shell
|
|
23
|
+
make test build publish
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
To bump the version number, edit `DPK_CONNECTOR_VERSION` in the Makefile and rerun the above. This will require committing both the `Makefile` and the automatically updated `pyproject.toml` file.
|
|
27
|
+
|
|
28
|
+
## How to use
|
|
29
|
+
|
|
30
|
+
See [the overview](doc/overview.md).
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# DPK Connector Overview
|
|
2
|
+
|
|
3
|
+
The Data Prep Kit Connector (DPK Connector) is a Python library for scalable and compliant web crawling.
|
|
4
|
+
|
|
5
|
+
Features:
|
|
6
|
+
- Robots.txt compliant: The Connector follows allow/disallow lists and some extended directives such as `Crawl-delay` in robots.txt of websites.
|
|
7
|
+
- Sitemap support: The Connector automatically parses sitemap urls from input and tries to find them from robots.txt.
|
|
8
|
+
- User agent and headers customization: You can use your own user agent string and request headers.
|
|
9
|
+
- Domain and path focus: You can limit domains and paths accessed by the library.
|
|
10
|
+
- Mime type filters: You can restrict mime types which can be downloaded.
|
|
11
|
+
- Parallel processing: Requests to websites are processed in parallel.
|
|
12
|
+
|
|
13
|
+
## How to install
|
|
14
|
+
|
|
15
|
+
### From PyPI
|
|
16
|
+
|
|
17
|
+
```sh
|
|
18
|
+
pip install data-prep-connector
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### From Github
|
|
22
|
+
|
|
23
|
+
```sh
|
|
24
|
+
pip install git+https://github.com/IBM/data-prep-kit.git@dev#subdirectory=data-connector-lib
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Example usage
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from dpk_connector import crawl, shutdown
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def main():
|
|
34
|
+
"""
|
|
35
|
+
An example of running a crawl.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
def on_downloaded(url: str, body: bytes, headers: dict) -> None:
|
|
39
|
+
"""
|
|
40
|
+
Callback function called when a page has been downloaded.
|
|
41
|
+
You have access to the request URL, response body and headers.
|
|
42
|
+
"""
|
|
43
|
+
print(f"url: {url}, headers: {headers}, body: {body[:64]}")
|
|
44
|
+
|
|
45
|
+
user_agent = "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0"
|
|
46
|
+
|
|
47
|
+
# Start crawling
|
|
48
|
+
crawl(
|
|
49
|
+
["https://crawler-test.com/"],
|
|
50
|
+
on_downloaded,
|
|
51
|
+
user_agent=user_agent,
|
|
52
|
+
depth_limit=0,
|
|
53
|
+
) # blocking call
|
|
54
|
+
|
|
55
|
+
# Shutdown all crawls
|
|
56
|
+
shutdown()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
if __name__ == "__main__":
|
|
60
|
+
main()
|
|
61
|
+
```
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "data_prep_connector"
|
|
3
|
+
version = "0.2.2"
|
|
4
|
+
requires-python = ">=3.10"
|
|
5
|
+
keywords = [
|
|
6
|
+
"data",
|
|
7
|
+
"data acquisition",
|
|
8
|
+
"crawler",
|
|
9
|
+
"web crawler",
|
|
10
|
+
"llm",
|
|
11
|
+
"generative",
|
|
12
|
+
"ai",
|
|
13
|
+
"fine-tuning",
|
|
14
|
+
"llmapps",
|
|
15
|
+
]
|
|
16
|
+
description = "Scalable and Compliant Web Crawler"
|
|
17
|
+
license = { text = "Apache-2.0" }
|
|
18
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
19
|
+
authors = [{ name = "Hiroya Matsubara", email = "hmtbr@jp.ibm.com" }]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"scrapy>=2.11.2",
|
|
22
|
+
"pydantic>=2.8.1",
|
|
23
|
+
"tldextract>=5.1.2",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.urls]
|
|
27
|
+
Repository = "https://github.com/IBM/data-prep-kit"
|
|
28
|
+
Issues = "https://github.com/IBM/data-prep-kit/issues"
|
|
29
|
+
Documentation = "https://ibm.github.io/data-prep-kit/"
|
|
30
|
+
|
|
31
|
+
[build-system]
|
|
32
|
+
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
|
|
33
|
+
build-backend = "setuptools.build_meta"
|
|
34
|
+
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
dev = [
|
|
37
|
+
"twine",
|
|
38
|
+
"pytest>=7.3.2",
|
|
39
|
+
"pytest-dotenv>=0.5.2",
|
|
40
|
+
"pytest-env>=1.0.0",
|
|
41
|
+
"pre-commit>=3.3.2",
|
|
42
|
+
"pytest-cov>=4.1.0",
|
|
43
|
+
"pytest-mock>=3.10.0",
|
|
44
|
+
"pytest-datadir>=1.5.0",
|
|
45
|
+
"moto==5.0.5",
|
|
46
|
+
"markupsafe==2.0.1",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[options]
|
|
50
|
+
package_dir = ["src", "test"]
|
|
51
|
+
|
|
52
|
+
[options.packages.find]
|
|
53
|
+
where = ["src/dpk_connector"]
|
|
54
|
+
|
|
55
|
+
[tool.pytest.ini_options]
|
|
56
|
+
# Currently we use low coverage since we have to run tests separately (see makefile)
|
|
57
|
+
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
|
|
58
|
+
markers = ["unit: unit tests", "integration: integration tests"]
|
|
59
|
+
|
|
60
|
+
[tool.coverage.run]
|
|
61
|
+
include = ["src/*"]
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: data_prep_connector
|
|
3
|
+
Version: 0.2.2
|
|
4
|
+
Summary: Scalable and Compliant Web Crawler
|
|
5
|
+
Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: scrapy>=2.11.2
|
|
11
|
+
Requires-Dist: pydantic>=2.8.1
|
|
12
|
+
Requires-Dist: tldextract>=5.1.2
|
|
13
|
+
Provides-Extra: dev
|
|
14
|
+
Requires-Dist: twine; extra == "dev"
|
|
15
|
+
Requires-Dist: pytest>=7.3.2; extra == "dev"
|
|
16
|
+
Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
|
|
17
|
+
Requires-Dist: pytest-env>=1.0.0; extra == "dev"
|
|
18
|
+
Requires-Dist: pre-commit>=3.3.2; extra == "dev"
|
|
19
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
|
|
21
|
+
Requires-Dist: pytest-datadir>=1.5.0; extra == "dev"
|
|
22
|
+
Requires-Dist: moto==5.0.5; extra == "dev"
|
|
23
|
+
Requires-Dist: markupsafe==2.0.1; extra == "dev"
|
|
24
|
+
|
|
25
|
+
# DPK Connector
|
|
26
|
+
|
|
27
|
+
DPK Connector is a scalable and compliant web crawler developed for data acquisition towards LLM development. It is built on [Scrapy](https://scrapy.org/).
|
|
28
|
+
For more details read [the documentation](doc/overview.md).
|
|
29
|
+
|
|
30
|
+
## Virtual Environment
|
|
31
|
+
|
|
32
|
+
The project uses `pyproject.toml` and a Makefile for operations.
|
|
33
|
+
To do development you should establish the virtual environment
|
|
34
|
+
```shell
|
|
35
|
+
make venv
|
|
36
|
+
```
|
|
37
|
+
and then either activate
|
|
38
|
+
```shell
|
|
39
|
+
source venv/bin/activate
|
|
40
|
+
```
|
|
41
|
+
or set up your IDE to use the venv directory when developing in this project
|
|
42
|
+
|
|
43
|
+
## Library Artifact Build and Publish
|
|
44
|
+
|
|
45
|
+
To test, build and publish the library
|
|
46
|
+
```shell
|
|
47
|
+
make test build publish
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
To bump the version number, edit `DPK_CONNECTOR_VERSION` in the Makefile and rerun the above. This will require committing both the `Makefile` and the automatically updated `pyproject.toml` file.
|
|
51
|
+
|
|
52
|
+
## How to use
|
|
53
|
+
|
|
54
|
+
See [the overview](doc/overview.md).
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Makefile
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
doc/overview.md
|
|
5
|
+
src/data_prep_connector.egg-info/PKG-INFO
|
|
6
|
+
src/data_prep_connector.egg-info/SOURCES.txt
|
|
7
|
+
src/data_prep_connector.egg-info/dependency_links.txt
|
|
8
|
+
src/data_prep_connector.egg-info/requires.txt
|
|
9
|
+
src/data_prep_connector.egg-info/top_level.txt
|
|
10
|
+
src/dpk_connector/__init__.py
|
|
11
|
+
src/dpk_connector/core/__init__.py
|
|
12
|
+
src/dpk_connector/core/crawler.py
|
|
13
|
+
src/dpk_connector/core/item.py
|
|
14
|
+
src/dpk_connector/core/logging.py
|
|
15
|
+
src/dpk_connector/core/middlewares.py
|
|
16
|
+
src/dpk_connector/core/pipelines.py
|
|
17
|
+
src/dpk_connector/core/settings.py
|
|
18
|
+
src/dpk_connector/core/utils.py
|
|
19
|
+
src/dpk_connector/core/spiders/__init__.py
|
|
20
|
+
src/dpk_connector/core/spiders/sitemap.py
|
|
21
|
+
test/dpk_connector/core/__init__.py
|
|
22
|
+
test/dpk_connector/core/test_crawler.py
|
|
23
|
+
test/dpk_connector/core/test_middlewares.py
|
|
24
|
+
test/dpk_connector/core/test_sitemap_spider.py
|
|
25
|
+
test/dpk_connector/core/test_utils.py
|
|
26
|
+
test/dpk_connector/core/test_sitemap_spider/index.html
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
scrapy>=2.11.2
|
|
2
|
+
pydantic>=2.8.1
|
|
3
|
+
tldextract>=5.1.2
|
|
4
|
+
|
|
5
|
+
[dev]
|
|
6
|
+
twine
|
|
7
|
+
pytest>=7.3.2
|
|
8
|
+
pytest-dotenv>=0.5.2
|
|
9
|
+
pytest-env>=1.0.0
|
|
10
|
+
pre-commit>=3.3.2
|
|
11
|
+
pytest-cov>=4.1.0
|
|
12
|
+
pytest-mock>=3.10.0
|
|
13
|
+
pytest-datadir>=1.5.0
|
|
14
|
+
moto==5.0.5
|
|
15
|
+
markupsafe==2.0.1
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dpk_connector
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from dpk_connector.core.crawler import async_crawl, crawl, shutdown # noqa
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import threading
|
|
14
|
+
from typing import Any, Callable, Collection, Type, cast
|
|
15
|
+
|
|
16
|
+
from scrapy import Spider
|
|
17
|
+
from scrapy.crawler import Crawler, CrawlerRunner
|
|
18
|
+
from scrapy.settings import Settings
|
|
19
|
+
from twisted.internet.defer import Deferred
|
|
20
|
+
|
|
21
|
+
from dpk_connector.core.utils import validate_domain, validate_url
|
|
22
|
+
|
|
23
|
+
# Serializes access to the two reactor flags below; taken by _start_reactor()
# and by MultiThreadedCrawlerRunner._create_crawler().
_lock = threading.Lock()
# Becomes True after the first Crawler has been constructed.
_reactor_initialized: bool = False
# Becomes True after the background reactor thread has been started.
_reactor_started: bool = False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _run_reactor():
    """
    Run the Twisted reactor; used as the target of the reactor thread.

    The reactor is started with ``installSignalHandlers=False``. This
    function is executed by ``_reactor_thread`` rather than by the main
    thread, and Python only allows signal handlers to be installed from the
    main thread.
    """
    # Imported inside the function instead of at module level.
    # NOTE(review): presumably so that importing this module does not pick a
    # reactor before Scrapy has had the chance to install one - confirm.
    from twisted.internet import reactor as twisted_reactor

    twisted_reactor.run(installSignalHandlers=False)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Single daemon thread that hosts the reactor. It is only created here;
# _start_reactor() is what actually starts it.
_reactor_thread: threading.Thread = threading.Thread(target=_run_reactor, daemon=True)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _start_reactor():
    """
    Start the reactor thread on the first call; do nothing on later calls.

    The check and the update of ``_reactor_started`` happen while holding
    ``_lock`` so that concurrent callers cannot start the thread twice.
    """
    global _reactor_started

    with _lock:
        if _reactor_started:
            return
        _reactor_thread.start()
        _reactor_started = True
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _stop_reactor():
    """
    Ask the Twisted reactor to stop, ignoring a ``RuntimeError`` from it.

    This is best-effort: a ``RuntimeError`` raised by ``reactor.stop()`` is
    swallowed on purpose so that callers can invoke this unconditionally.
    """
    from twisted.internet import reactor as twisted_reactor

    try:
        twisted_reactor.stop()
    except RuntimeError:
        # NOTE(review): presumably this covers stopping a reactor that is not
        # running (Twisted's ReactorNotRunning derives from RuntimeError) -
        # confirm against the Twisted version in use.
        return
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class MultiThreadedCrawlerRunner(CrawlerRunner):
    """
    ``CrawlerRunner`` that creates its crawlers under the module-level lock.

    Only the first crawler created in the process is built with
    ``init_reactor=True``; every later one gets ``init_reactor=False``.
    """

    def _create_crawler(self, spidercls: str | type[Spider]) -> Crawler:
        """
        Build a ``Crawler`` for ``spidercls``.

        Parameters:
            spidercls (str | type[Spider]): A spider class, or a spider name that is resolved through ``self.spider_loader``.

        Returns:
            Crawler: The crawler bound to this runner's settings.
        """
        global _reactor_initialized

        # Spider names are resolved to classes before the lock is taken.
        if isinstance(spidercls, str):
            resolved_cls = self.spider_loader.load(spidercls)
        else:
            resolved_cls = spidercls

        with _lock:
            # True only while no crawler has been created yet.
            is_first_crawler = not _reactor_initialized
            new_crawler = Crawler(
                cast(Type[Spider], resolved_cls),
                self.settings,
                is_first_crawler,
            )
            # Only reached when Crawler() did not raise.
            _reactor_initialized = True
        return new_crawler
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def async_crawl(
    seed_urls: Collection[str],
    on_downloaded: Callable[[str, bytes, dict[str, str]], None],
    user_agent: str = "",
    headers: dict[str, str] = {},
    allow_domains: Collection[str] = (),
    subdomain_focus: bool = False,
    path_focus: bool = False,
    allow_mime_types: Collection[str] = (
        "application/pdf",
        "text/html",
        "text/markdown",
        "text/plain",
    ),
    disallow_mime_types: Collection[str] = (),
    depth_limit: int = -1,
    download_limit: int = -1,
) -> Deferred[None]:
    # Assisted by WCA@IBM
    # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
    """
    Start a crawl and return immediately, without waiting for it to finish.

    The arguments are validated, a Scrapy ``Settings`` object is built from
    ``dpk_connector.core.settings`` plus the per-call overrides, the
    ``dpk-connector-sitemap`` spider is scheduled on a fresh
    ``MultiThreadedCrawlerRunner`` and the shared reactor thread is started
    if it is not running yet.

    Parameters:
        seed_urls (Collection[str]): URLs the crawl starts from. Must not be empty, and each one must pass ``validate_url``.
        on_downloaded (Callable[[str, bytes, dict[str, str]], None]): Callback invoked for each downloaded page; handed to the spider as ``callback``.
        user_agent (str): User agent string for the crawler. When empty, no override is applied and the configured default is used ("Scrapy/VERSION (+https://scrapy.org)").
        headers (dict[str, str]): Extra request headers. When non-empty they are stored as the ``DEFAULT_REQUEST_HEADERS`` setting. Defaults to no extra headers.
        allow_domains (Collection[str]): Domains the crawler is restricted to; each one must pass ``validate_domain``. Defaults to the domains of the seed URLs.
        subdomain_focus (bool): If set, only links under the subdomains of the seed URLs are extracted. Ignored when ``allow_domains`` is given.
        path_focus (bool): If set, only links under the paths of the seed URLs are extracted.
        allow_mime_types (Collection[str]): MIME types that may be downloaded. Defaults to "application/pdf", "text/html", "text/markdown" and "text/plain".
        disallow_mime_types (Collection[str]): MIME types that must not be downloaded. Defaults to none.
        depth_limit (int): Maximum crawl depth. -1 (the default) means no limit and is stored as ``DEPTH_LIMIT=0``; 0 is stored as ``DEPTH_LIMIT=-1``; any other value is stored unchanged.
        download_limit (int): Maximum number of pages to download. -1 (the default) means no limit and is stored as ``CLOSESPIDER_ITEMCOUNT=0``. This is a soft limit: the crawler may download more pages than requested.

    Returns:
        Deferred[None]: The Twisted deferred returned by ``runner.join()``, which can be used to wait for the crawl to finish.

    Raises:
        ValueError: If ``seed_urls`` is empty, a seed URL or allowed domain fails validation, or ``depth_limit`` / ``download_limit`` is smaller than -1.
    """
    # Validation. The order of these checks decides which error a caller sees
    # first, so it must not be rearranged.
    if not seed_urls:
        raise ValueError("Empty seed URLs.")
    for seed in seed_urls:
        if not validate_url(seed):
            raise ValueError(f"Seed URL {seed} is not valid.")
    for allowed in allow_domains:
        if not validate_domain(allowed):
            raise ValueError(f"Allow domain {allowed} is not valid.")
    if depth_limit < -1:
        raise ValueError(f"Invalid depth limit {depth_limit}")
    if download_limit < -1:
        raise ValueError(f"Invalid download limit {download_limit}")

    # Library defaults first, then the per-call overrides.
    crawl_settings = Settings()
    crawl_settings.setmodule("dpk_connector.core.settings", priority="project")

    if user_agent:
        crawl_settings.set("USER_AGENT", user_agent, priority="spider")
    if headers:
        crawl_settings.set("DEFAULT_REQUEST_HEADERS", headers)

    # The public API and the stored setting use swapped sentinels:
    # API 0 is stored as -1, API -1 ("no limit") is stored as 0.
    scrapy_depth_limit = {0: -1, -1: 0}.get(depth_limit, depth_limit)
    crawl_settings.set("DEPTH_LIMIT", scrapy_depth_limit, priority="spider")

    # API -1 ("no limit") is stored as 0.
    scrapy_item_count = 0 if download_limit == -1 else download_limit
    crawl_settings.set("CLOSESPIDER_ITEMCOUNT", scrapy_item_count, priority="spider")

    crawler_runner = MultiThreadedCrawlerRunner(crawl_settings)
    crawler_runner.crawl(
        "dpk-connector-sitemap",
        seed_urls=seed_urls,
        callback=on_downloaded,
        allow_domains=allow_domains,
        subdomain_focus=subdomain_focus,
        path_focus=path_focus,
        allow_mime_types=allow_mime_types,
        disallow_mime_types=disallow_mime_types,
        disable_sitemap_search=True,
    )
    # No-op when the reactor thread is already running.
    _start_reactor()
    return crawler_runner.join()
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def crawl(
    seed_urls: Collection[str],
    on_downloaded: Callable[[str, bytes, dict[str, str]], None],
    user_agent: str = "",
    headers: dict[str, str] = {},
    allow_domains: Collection[str] = (),
    subdomain_focus: bool = False,
    path_focus: bool = False,
    allow_mime_types: Collection[str] = (
        "application/pdf",
        "text/html",
        "text/markdown",
        "text/plain",
    ),
    disallow_mime_types: Collection[str] = (),
    depth_limit: int = -1,
    download_limit: int = -1,
) -> None:
    # Assisted by WCA@IBM
    # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
    """
    Do crawl synchronously.

    Starts the crawl through ``async_crawl`` and blocks the calling thread
    until the returned deferred has fired, whether it succeeded or failed.

    Parameters:
        seed_urls (Collection[str]): A collection of seed URLs to start the crawl from.
        on_downloaded (Callable[[str, bytes, dict[str, str]], None]): The callback function to be called for each downloaded page.
        user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
        headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
        allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
        path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
        allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
        disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
        depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
        download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.

    Returns:
        None

    Raises:
        ValueError: Propagated from ``async_crawl`` when an argument is invalid.
    """
    # An Event is used instead of Condition.notify()/wait(): the flag stays
    # set, so the completion signal is not lost when the callback runs before
    # this thread starts waiting (for example when the deferred has already
    # fired and addBoth() invokes the callback synchronously).
    finished = threading.Event()

    def on_completed(result: Any):
        # Registered with addBoth, so it runs for success and failure alike;
        # the result itself is intentionally ignored.
        finished.set()

    d = async_crawl(
        seed_urls,
        on_downloaded,
        user_agent,
        headers,
        allow_domains,
        subdomain_focus,
        path_focus,
        allow_mime_types,
        disallow_mime_types,
        depth_limit,
        download_limit,
    )
    d.addBoth(on_completed)
    # Returns immediately if the crawl has already completed.
    finished.wait()
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def shutdown():
    """
    Shutdown all crawls.

    Delegates to ``_stop_reactor``, which asks the shared Twisted reactor to
    stop and ignores a ``RuntimeError`` raised while doing so.

    NOTE(review): nothing in this module resets the "reactor started" flag,
    so crawls requested after this call presumably never run - confirm
    before relying on calling ``crawl`` again after ``shutdown``.
    """
    _stop_reactor()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class ConnectorItem:
    """
    Set of boolean flags attached to a crawled item.

    Every flag defaults to ``False``. The field order is part of the
    positional constructor signature and must not be changed.
    """

    # NOTE(review): presumably set when the item was discarded - confirm
    # against the middlewares/pipelines that populate it.
    dropped: bool = False
    # NOTE(review): presumably set when the page body was downloaded - confirm.
    downloaded: bool = False
    # NOTE(review): presumably marks requests issued by the crawler itself
    # (e.g. robots.txt) rather than for page content - confirm.
    system_request: bool = False
    # NOTE(review): presumably marks sitemap responses - confirm.
    sitemap: bool = False
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from scrapy.logformatter import LogFormatter as ScrapyLogFormatter
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class QuietLogFormatter(ScrapyLogFormatter):
    """Log formatter that reports scraped items only when ``LOG_SCRAPED_ITEMS`` is enabled."""

    def scraped(self, item, response, spider):
        """
        Return the log entry for a scraped item, or ``None`` when disabled.

        The ``LOG_SCRAPED_ITEMS`` boolean setting of the spider decides: when
        it is true the stock Scrapy formatter result is returned, otherwise
        ``None`` is returned and the parent formatter is not called at all.
        """
        if not spider.settings.getbool("LOG_SCRAPED_ITEMS"):
            # NOTE(review): Scrapy is expected to skip the log message when a
            # formatter method returns None - confirm for the pinned version.
            return None
        return super().scraped(item, response, spider)
|