data-prep-connector 0.2.2.dev0__tar.gz → 0.2.2.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/PKG-INFO +1 -1
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/doc/overview.md +14 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/pyproject.toml +1 -1
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/data_prep_connector.egg-info/PKG-INFO +1 -1
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/crawler.py +6 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/spiders/sitemap.py +14 -6
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/utils.py +5 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/test_sitemap_spider.py +17 -3
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/test_utils.py +16 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/Makefile +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/README.md +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/setup.cfg +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/data_prep_connector.egg-info/SOURCES.txt +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/data_prep_connector.egg-info/dependency_links.txt +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/data_prep_connector.egg-info/requires.txt +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/data_prep_connector.egg-info/top_level.txt +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/item.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/logging.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/middlewares.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/pipelines.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/settings.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/spiders/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/test_crawler.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/test_middlewares.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/test_sitemap_spider/index.html +0 -0
|
@@ -10,6 +10,20 @@ Features:
|
|
|
10
10
|
- Mime type filters: You can restrict mime types which can be downloaded.
|
|
11
11
|
- Parallel processing: Requests to websites are processed in parallel.
|
|
12
12
|
|
|
13
|
+
## How to install
|
|
14
|
+
|
|
15
|
+
### From PyPI
|
|
16
|
+
|
|
17
|
+
```sh
|
|
18
|
+
pip install data-prep-connector
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### From GitHub
|
|
22
|
+
|
|
23
|
+
```sh
|
|
24
|
+
pip install git+https://github.com/IBM/data-prep-kit.git@dev#subdirectory=data-connector-lib
|
|
25
|
+
```
|
|
26
|
+
|
|
13
27
|
## Example usage
|
|
14
28
|
|
|
15
29
|
```python
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/crawler.py
RENAMED
|
@@ -74,6 +74,7 @@ def async_crawl(
|
|
|
74
74
|
user_agent: str = "",
|
|
75
75
|
headers: dict[str, str] = {},
|
|
76
76
|
allow_domains: Collection[str] = (),
|
|
77
|
+
subdomain_focus: bool = False,
|
|
77
78
|
path_focus: bool = False,
|
|
78
79
|
allow_mime_types: Collection[str] = (
|
|
79
80
|
"application/pdf",
|
|
@@ -96,6 +97,7 @@ def async_crawl(
|
|
|
96
97
|
user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
|
|
97
98
|
headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
|
|
98
99
|
allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
|
|
100
|
+
subdomain_focus (bool): If True, only links under the fully qualified domain names of the input seed URLs (e.g. `blog.example.com`) will be extracted, rather than links under their registered domains. Ignored if `allow_domains` is specified.
|
|
99
101
|
path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
|
|
100
102
|
allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
|
|
101
103
|
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
|
|
@@ -140,6 +142,7 @@ def async_crawl(
|
|
|
140
142
|
seed_urls=seed_urls,
|
|
141
143
|
callback=on_downloaded,
|
|
142
144
|
allow_domains=allow_domains,
|
|
145
|
+
subdomain_focus=subdomain_focus,
|
|
143
146
|
path_focus=path_focus,
|
|
144
147
|
allow_mime_types=allow_mime_types,
|
|
145
148
|
disallow_mime_types=disallow_mime_types,
|
|
@@ -155,6 +158,7 @@ def crawl(
|
|
|
155
158
|
user_agent: str = "",
|
|
156
159
|
headers: dict[str, str] = {},
|
|
157
160
|
allow_domains: Collection[str] = (),
|
|
161
|
+
subdomain_focus: bool = False,
|
|
158
162
|
path_focus: bool = False,
|
|
159
163
|
allow_mime_types: Collection[str] = (
|
|
160
164
|
"application/pdf",
|
|
@@ -177,6 +181,7 @@ def crawl(
|
|
|
177
181
|
user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
|
|
178
182
|
headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
|
|
179
183
|
allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
|
|
184
|
+
subdomain_focus (bool): If True, only links under the fully qualified domain names of the input seed URLs (e.g. `blog.example.com`) will be extracted, rather than links under their registered domains. Ignored if `allow_domains` is specified.
|
|
180
185
|
path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
|
|
181
186
|
allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
|
|
182
187
|
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
|
|
@@ -198,6 +203,7 @@ def crawl(
|
|
|
198
203
|
user_agent,
|
|
199
204
|
headers,
|
|
200
205
|
allow_domains,
|
|
206
|
+
subdomain_focus,
|
|
201
207
|
path_focus,
|
|
202
208
|
allow_mime_types,
|
|
203
209
|
disallow_mime_types,
|
|
@@ -28,6 +28,7 @@ from dpk_connector.core.utils import (
|
|
|
28
28
|
get_content_type,
|
|
29
29
|
get_etld1,
|
|
30
30
|
get_focus_path,
|
|
31
|
+
get_fqdn,
|
|
31
32
|
is_allowed_path,
|
|
32
33
|
urlparse_cached,
|
|
33
34
|
)
|
|
@@ -42,6 +43,7 @@ class BaseSitemapSpider(SitemapSpider):
|
|
|
42
43
|
self,
|
|
43
44
|
seed_urls: Collection[str],
|
|
44
45
|
allow_domains: Collection[str] = (),
|
|
46
|
+
subdomain_focus: bool = False,
|
|
45
47
|
path_focus: bool = False,
|
|
46
48
|
allow_mime_types: Collection[str] = (),
|
|
47
49
|
disallow_mime_types: Collection[str] = (),
|
|
@@ -88,11 +90,15 @@ class BaseSitemapSpider(SitemapSpider):
|
|
|
88
90
|
self.focus_paths.add(path)
|
|
89
91
|
|
|
90
92
|
# Domains and mime types filtering
|
|
91
|
-
|
|
92
|
-
allow_domains
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
93
|
+
if allow_domains:
|
|
94
|
+
self.allowed_domains = set(allow_domains)
|
|
95
|
+
elif subdomain_focus:
|
|
96
|
+
self.allowed_domains = set()
|
|
97
|
+
for url in seed_urls:
|
|
98
|
+
if fqdn := get_fqdn(url):
|
|
99
|
+
self.allowed_domains.add(fqdn)
|
|
100
|
+
else:
|
|
101
|
+
self.allowed_domains = set(get_etld1(url) for url in seed_urls)
|
|
96
102
|
self.allow_mime_types = set(
|
|
97
103
|
[m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
|
|
98
104
|
)
|
|
@@ -155,7 +161,9 @@ class BaseSitemapSpider(SitemapSpider):
|
|
|
155
161
|
)
|
|
156
162
|
|
|
157
163
|
def _parse_sitemap(self, response: Response):
|
|
158
|
-
yield ConnectorItem(
|
|
164
|
+
yield ConnectorItem(
|
|
165
|
+
dropped=False, downloaded=False, system_request=True, sitemap=True
|
|
166
|
+
)
|
|
159
167
|
|
|
160
168
|
seed_url = response.meta["seed_url"]
|
|
161
169
|
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/utils.py
RENAMED
|
@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
|
|
|
57
57
|
return f"{ext.domain}.{ext.suffix}"
|
|
58
58
|
|
|
59
59
|
|
|
60
|
+
def get_fqdn(url: str) -> str:
|
|
61
|
+
ext = tldextract.extract(url)
|
|
62
|
+
return ext.fqdn
|
|
63
|
+
|
|
64
|
+
|
|
60
65
|
def get_focus_path(url: str) -> str | None:
|
|
61
66
|
parts = urlparse_cached(url)
|
|
62
67
|
if len(parts.path.split("/")) > 2:
|
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
|
|
3
3
|
import pytest
|
|
4
|
+
from dpk_connector.core.item import ConnectorItem
|
|
5
|
+
from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
|
|
4
6
|
from scrapy import Request
|
|
5
7
|
from scrapy.crawler import Crawler
|
|
6
8
|
from scrapy.http import HtmlResponse
|
|
7
9
|
|
|
8
|
-
from dpk_connector.core.item import ConnectorItem
|
|
9
|
-
from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
|
|
10
|
-
|
|
11
10
|
|
|
12
11
|
@pytest.fixture
|
|
13
12
|
def crawler() -> Crawler:
|
|
@@ -22,6 +21,21 @@ def crawler() -> Crawler:
|
|
|
22
21
|
return crawler
|
|
23
22
|
|
|
24
23
|
|
|
24
|
+
def test_init_subdomain_focus():
|
|
25
|
+
spider = BaseSitemapSpider(
|
|
26
|
+
seed_urls=(
|
|
27
|
+
"http://blog.example.com/",
|
|
28
|
+
"http://contents.example.com/",
|
|
29
|
+
),
|
|
30
|
+
subdomain_focus=True,
|
|
31
|
+
)
|
|
32
|
+
assert spider.seed_urls == {
|
|
33
|
+
"http://blog.example.com/",
|
|
34
|
+
"http://contents.example.com/",
|
|
35
|
+
}
|
|
36
|
+
assert spider.allowed_domains == {"blog.example.com", "contents.example.com"}
|
|
37
|
+
|
|
38
|
+
|
|
25
39
|
def test_init_path_focus():
|
|
26
40
|
spider = BaseSitemapSpider(
|
|
27
41
|
seed_urls=(
|
|
@@ -7,6 +7,7 @@ from dpk_connector.core.utils import (
|
|
|
7
7
|
get_content_type,
|
|
8
8
|
get_etld1,
|
|
9
9
|
get_focus_path,
|
|
10
|
+
get_fqdn,
|
|
10
11
|
get_header_value,
|
|
11
12
|
get_mime_type,
|
|
12
13
|
is_allowed_path,
|
|
@@ -83,6 +84,21 @@ def test_get_etld1(url: str, expected: str):
|
|
|
83
84
|
assert get_etld1(url) == expected
|
|
84
85
|
|
|
85
86
|
|
|
87
|
+
@pytest.mark.parametrize(
|
|
88
|
+
"url,expected",
|
|
89
|
+
[
|
|
90
|
+
("http://www.example.com", "www.example.com"),
|
|
91
|
+
("https://www.example.co.uk", "www.example.co.uk"),
|
|
92
|
+
("http://www.example.com/path?query=string#fragment", "www.example.com"),
|
|
93
|
+
("http://localhost:8080/", ""),
|
|
94
|
+
("http://www.example.com:8080/", "www.example.com"),
|
|
95
|
+
("http://www.sub.example.com:8080/", "www.sub.example.com"),
|
|
96
|
+
],
|
|
97
|
+
)
|
|
98
|
+
def test_get_fqdn(url: str, expected: str):
|
|
99
|
+
assert get_fqdn(url) == expected
|
|
100
|
+
|
|
101
|
+
|
|
86
102
|
@pytest.mark.parametrize(
|
|
87
103
|
"url,expected",
|
|
88
104
|
[
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/__init__.py
RENAMED
|
File without changes
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/__init__.py
RENAMED
|
File without changes
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/item.py
RENAMED
|
File without changes
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/logging.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/settings.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|