data-prep-connector 0.2.2.dev0__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/Makefile +1 -1
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/PKG-INFO +3 -3
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/doc/overview.md +14 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/pyproject.toml +3 -2
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/PKG-INFO +3 -3
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/crawler.py +90 -2
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/settings.py +0 -11
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/spiders/sitemap.py +14 -6
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/utils.py +5 -0
- data_prep_connector-0.2.3/test/dpk_connector/core/test_crawler.py +69 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_middlewares.py +12 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_sitemap_spider.py +31 -9
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_utils.py +29 -3
- data_prep_connector-0.2.2.dev0/test/dpk_connector/core/test_crawler.py +0 -28
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/README.md +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/setup.cfg +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/SOURCES.txt +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/dependency_links.txt +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/requires.txt +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/top_level.txt +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/item.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/logging.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/middlewares.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/pipelines.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/spiders/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_sitemap_spider/index.html +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: data_prep_connector
|
|
3
|
-
Version: 0.2.2.dev0
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Scalable and Compliant Web Crawler
|
|
5
5
|
Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
|
|
6
6
|
License: Apache-2.0
|
|
7
|
-
Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
|
|
8
|
-
Requires-Python: >=3.10
|
|
7
|
+
Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps,0b74b5a
|
|
8
|
+
Requires-Python: <3.13,>=3.10
|
|
9
9
|
Description-Content-Type: text/markdown
|
|
10
10
|
Requires-Dist: scrapy>=2.11.2
|
|
11
11
|
Requires-Dist: pydantic>=2.8.1
|
|
@@ -10,6 +10,20 @@ Features:
|
|
|
10
10
|
- Mime type filters: You can restrict mime types which can be downloaded.
|
|
11
11
|
- Parallel processing: Requests to websites are processed in parallel.
|
|
12
12
|
|
|
13
|
+
## How to install
|
|
14
|
+
|
|
15
|
+
### From PyPI
|
|
16
|
+
|
|
17
|
+
```sh
|
|
18
|
+
pip install data-prep-connector
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### From Github
|
|
22
|
+
|
|
23
|
+
```sh
|
|
24
|
+
pip install git+https://github.com/IBM/data-prep-kit.git@dev#subdirectory=data-connector-lib
|
|
25
|
+
```
|
|
26
|
+
|
|
13
27
|
## Example usage
|
|
14
28
|
|
|
15
29
|
```python
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data_prep_connector"
|
|
3
|
-
version = "0.2.2.dev0"
|
|
4
|
-
requires-python = ">=3.10"
|
|
3
|
+
version = "0.2.3"
|
|
4
|
+
requires-python = ">=3.10,<3.13"
|
|
5
5
|
keywords = [
|
|
6
6
|
"data",
|
|
7
7
|
"data acquisition",
|
|
@@ -12,6 +12,7 @@ keywords = [
|
|
|
12
12
|
"ai",
|
|
13
13
|
"fine-tuning",
|
|
14
14
|
"llmapps",
|
|
15
|
+
"0b74b5a"
|
|
15
16
|
]
|
|
16
17
|
description = "Scalable and Compliant Web Crawler"
|
|
17
18
|
license = { text = "Apache-2.0" }
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: data_prep_connector
|
|
3
|
-
Version: 0.2.2.dev0
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Scalable and Compliant Web Crawler
|
|
5
5
|
Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
|
|
6
6
|
License: Apache-2.0
|
|
7
|
-
Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
|
|
8
|
-
Requires-Python: >=3.10
|
|
7
|
+
Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps,0b74b5a
|
|
8
|
+
Requires-Python: <3.13,>=3.10
|
|
9
9
|
Description-Content-Type: text/markdown
|
|
10
10
|
Requires-Dist: scrapy>=2.11.2
|
|
11
11
|
Requires-Dist: pydantic>=2.8.1
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/crawler.py
RENAMED
|
@@ -74,6 +74,7 @@ def async_crawl(
|
|
|
74
74
|
user_agent: str = "",
|
|
75
75
|
headers: dict[str, str] = {},
|
|
76
76
|
allow_domains: Collection[str] = (),
|
|
77
|
+
subdomain_focus: bool = False,
|
|
77
78
|
path_focus: bool = False,
|
|
78
79
|
allow_mime_types: Collection[str] = (
|
|
79
80
|
"application/pdf",
|
|
@@ -84,6 +85,15 @@ def async_crawl(
|
|
|
84
85
|
disallow_mime_types: Collection[str] = (),
|
|
85
86
|
depth_limit: int = -1,
|
|
86
87
|
download_limit: int = -1,
|
|
88
|
+
concurrent_requests: int = 16,
|
|
89
|
+
concurrent_requests_per_domain: int = 8,
|
|
90
|
+
download_delay: float = 0,
|
|
91
|
+
randomize_download_delay: bool = True,
|
|
92
|
+
download_timeout: float = 180,
|
|
93
|
+
autothrottle_enabled: bool = True,
|
|
94
|
+
autothrottle_max_delay: float = 60,
|
|
95
|
+
autothrottle_target_concurrency: float = 8,
|
|
96
|
+
robots_max_crawl_delay: float = 60,
|
|
87
97
|
) -> Deferred[None]:
|
|
88
98
|
# Assisted by WCA@IBM
|
|
89
99
|
# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
|
|
@@ -96,17 +106,27 @@ def async_crawl(
|
|
|
96
106
|
user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
|
|
97
107
|
headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
|
|
98
108
|
allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
|
|
109
|
+
subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
|
|
99
110
|
path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
|
|
100
111
|
allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
|
|
101
112
|
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
|
|
102
113
|
depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
|
|
103
114
|
download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
|
|
115
|
+
concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
|
|
116
|
+
concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
|
|
117
|
+
download_delay (float): The delay between consecutive requests. Default is 0.
|
|
118
|
+
randomize_download_delay (bool): If specified, the download delay will be randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
|
|
119
|
+
download_timeout (float): The timeout for each request. Default is 180 seconds.
|
|
120
|
+
autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
|
|
121
|
+
autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
|
|
122
|
+
autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
|
|
123
|
+
robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
|
|
104
124
|
|
|
105
125
|
Returns:
|
|
106
126
|
Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.
|
|
107
127
|
"""
|
|
108
128
|
if not seed_urls:
|
|
109
|
-
raise ValueError(
|
|
129
|
+
raise ValueError("Empty seed URLs.")
|
|
110
130
|
for url in seed_urls:
|
|
111
131
|
if not validate_url(url):
|
|
112
132
|
raise ValueError(f"Seed URL {url} is not valid.")
|
|
@@ -117,6 +137,24 @@ def async_crawl(
|
|
|
117
137
|
raise ValueError(f"Invalid depth limit {depth_limit}")
|
|
118
138
|
if download_limit < -1:
|
|
119
139
|
raise ValueError(f"Invalid download limit {download_limit}")
|
|
140
|
+
if concurrent_requests < 1:
|
|
141
|
+
raise ValueError(f"Invalid concurrent requests {concurrent_requests}")
|
|
142
|
+
if concurrent_requests_per_domain < 1:
|
|
143
|
+
raise ValueError(
|
|
144
|
+
f"Invalid concurrent reuqests per domain {concurrent_requests_per_domain}"
|
|
145
|
+
)
|
|
146
|
+
if download_delay < 0:
|
|
147
|
+
raise ValueError(f"Invalid download delay {download_delay}")
|
|
148
|
+
if download_timeout < 0:
|
|
149
|
+
raise ValueError(f"Invalid donwload timeout {download_timeout}")
|
|
150
|
+
if autothrottle_max_delay < 0:
|
|
151
|
+
raise ValueError(f"Invalid autothrottle max delay {autothrottle_max_delay}")
|
|
152
|
+
if autothrottle_target_concurrency < 1:
|
|
153
|
+
raise ValueError(
|
|
154
|
+
f"Invalid autothrottle target concurrency {autothrottle_target_concurrency}"
|
|
155
|
+
)
|
|
156
|
+
if robots_max_crawl_delay < 0:
|
|
157
|
+
raise ValueError(f"Invalid robots max crawl delay {robots_max_crawl_delay}")
|
|
120
158
|
|
|
121
159
|
settings = Settings()
|
|
122
160
|
settings.setmodule("dpk_connector.core.settings", priority="project")
|
|
@@ -124,7 +162,7 @@ def async_crawl(
|
|
|
124
162
|
if user_agent:
|
|
125
163
|
settings.set("USER_AGENT", user_agent, priority="spider")
|
|
126
164
|
if headers:
|
|
127
|
-
settings.set("DEFAULT_REQUEST_HEADERS", headers)
|
|
165
|
+
settings.set("DEFAULT_REQUEST_HEADERS", headers, priority="spider")
|
|
128
166
|
if depth_limit == 0:
|
|
129
167
|
depth_limit = -1
|
|
130
168
|
elif depth_limit == -1:
|
|
@@ -133,6 +171,25 @@ def async_crawl(
|
|
|
133
171
|
if download_limit == -1:
|
|
134
172
|
download_limit = 0
|
|
135
173
|
settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider")
|
|
174
|
+
settings.set("CONCURRENT_REQUESTS", concurrent_requests, priority="spider")
|
|
175
|
+
settings.set(
|
|
176
|
+
"CONCURRENT_REQUESTS_PER_DOMAIN",
|
|
177
|
+
concurrent_requests_per_domain,
|
|
178
|
+
priority="spider",
|
|
179
|
+
)
|
|
180
|
+
settings.set("DOWNLOAD_DELAY", download_delay, priority="spider")
|
|
181
|
+
settings.set(
|
|
182
|
+
"RANDOMIZE_DOWNLOAD_DELAY", randomize_download_delay, priority="spider"
|
|
183
|
+
)
|
|
184
|
+
settings.set("DOWNLOAD_TIMEOUT", download_timeout, priority="spider")
|
|
185
|
+
settings.set("AUTOTHROTTLE_ENABLED", autothrottle_enabled, priority="spider")
|
|
186
|
+
settings.set("AUTOTHROTTLE_MAX_DELAY", autothrottle_max_delay, priority="spider")
|
|
187
|
+
settings.set(
|
|
188
|
+
"AUTOTHROTTLE_TARGET_CONCURRENCY",
|
|
189
|
+
autothrottle_target_concurrency,
|
|
190
|
+
priority="spider",
|
|
191
|
+
)
|
|
192
|
+
settings.set("ROBOTS_MAX_CRAWL_DELAY", robots_max_crawl_delay, priority="spider")
|
|
136
193
|
|
|
137
194
|
runner = MultiThreadedCrawlerRunner(settings)
|
|
138
195
|
runner.crawl(
|
|
@@ -140,6 +197,7 @@ def async_crawl(
|
|
|
140
197
|
seed_urls=seed_urls,
|
|
141
198
|
callback=on_downloaded,
|
|
142
199
|
allow_domains=allow_domains,
|
|
200
|
+
subdomain_focus=subdomain_focus,
|
|
143
201
|
path_focus=path_focus,
|
|
144
202
|
allow_mime_types=allow_mime_types,
|
|
145
203
|
disallow_mime_types=disallow_mime_types,
|
|
@@ -155,6 +213,7 @@ def crawl(
|
|
|
155
213
|
user_agent: str = "",
|
|
156
214
|
headers: dict[str, str] = {},
|
|
157
215
|
allow_domains: Collection[str] = (),
|
|
216
|
+
subdomain_focus: bool = False,
|
|
158
217
|
path_focus: bool = False,
|
|
159
218
|
allow_mime_types: Collection[str] = (
|
|
160
219
|
"application/pdf",
|
|
@@ -165,6 +224,15 @@ def crawl(
|
|
|
165
224
|
disallow_mime_types: Collection[str] = (),
|
|
166
225
|
depth_limit: int = -1,
|
|
167
226
|
download_limit: int = -1,
|
|
227
|
+
concurrent_requests: int = 16,
|
|
228
|
+
concurrent_requests_per_domain: int = 8,
|
|
229
|
+
download_delay: float = 0,
|
|
230
|
+
randomize_download_delay: bool = True,
|
|
231
|
+
download_timeout: float = 180,
|
|
232
|
+
autothrottle_enabled: bool = True,
|
|
233
|
+
autothrottle_max_delay: float = 60,
|
|
234
|
+
autothrottle_target_concurrency: float = 8,
|
|
235
|
+
robots_max_crawl_delay: float = 60,
|
|
168
236
|
) -> None:
|
|
169
237
|
# Assisted by WCA@IBM
|
|
170
238
|
# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
|
|
@@ -177,11 +245,21 @@ def crawl(
|
|
|
177
245
|
user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
|
|
178
246
|
headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
|
|
179
247
|
allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
|
|
248
|
+
subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
|
|
180
249
|
path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
|
|
181
250
|
allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
|
|
182
251
|
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
|
|
183
252
|
depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
|
|
184
253
|
download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
|
|
254
|
+
concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
|
|
255
|
+
concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
|
|
256
|
+
download_delay (float): The delay between consecutive requests. Default is 0.
|
|
257
|
+
randomize_download_delay (bool): If specified, the download delay will be randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
|
|
258
|
+
download_timeout (float): The timeout for each request. Default is 180 seconds.
|
|
259
|
+
autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
|
|
260
|
+
autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
|
|
261
|
+
autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
|
|
262
|
+
robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
|
|
185
263
|
|
|
186
264
|
Returns:
|
|
187
265
|
None
|
|
@@ -198,11 +276,21 @@ def crawl(
|
|
|
198
276
|
user_agent,
|
|
199
277
|
headers,
|
|
200
278
|
allow_domains,
|
|
279
|
+
subdomain_focus,
|
|
201
280
|
path_focus,
|
|
202
281
|
allow_mime_types,
|
|
203
282
|
disallow_mime_types,
|
|
204
283
|
depth_limit,
|
|
205
284
|
download_limit,
|
|
285
|
+
concurrent_requests,
|
|
286
|
+
concurrent_requests_per_domain,
|
|
287
|
+
download_delay,
|
|
288
|
+
randomize_download_delay,
|
|
289
|
+
download_timeout,
|
|
290
|
+
autothrottle_enabled,
|
|
291
|
+
autothrottle_max_delay,
|
|
292
|
+
autothrottle_target_concurrency,
|
|
293
|
+
robots_max_crawl_delay,
|
|
206
294
|
)
|
|
207
295
|
d.addBoth(on_completed)
|
|
208
296
|
with condition:
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/settings.py
RENAMED
|
@@ -16,21 +16,10 @@ SPIDER_MODULES = ["dpk_connector.core.spiders"]
|
|
|
16
16
|
|
|
17
17
|
# Robots
|
|
18
18
|
ROBOTSTXT_OBEY = True
|
|
19
|
-
ROBOTS_MAX_CRAWL_DELAY = 60
|
|
20
19
|
ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser"
|
|
21
20
|
|
|
22
|
-
# Downloader parameters
|
|
23
|
-
CONCURRENT_REQUESTS = 20
|
|
24
|
-
CONCURRENT_REQUESTS_PER_DOMAIN = 10
|
|
25
|
-
DOWNLOAD_DELAY = 0
|
|
26
|
-
RANDOMIZE_DOWNLOAD_DELAY = True
|
|
27
|
-
DOWNLOAD_TIMEOUT = 180
|
|
28
|
-
|
|
29
21
|
# Autothrottle
|
|
30
|
-
AUTOTHROTTLE_ENABLED = True
|
|
31
22
|
AUTOTHROTTLE_START_DELAY = 0
|
|
32
|
-
AUTOTHROTTLE_MAX_DELAY = 300
|
|
33
|
-
AUTOTHROTTLE_TARGET_CONCURRENCY = 10
|
|
34
23
|
AUTOTHROTTLE_DEBUG = False
|
|
35
24
|
|
|
36
25
|
# Middlewares/pipelines/extensions
|
|
@@ -28,6 +28,7 @@ from dpk_connector.core.utils import (
|
|
|
28
28
|
get_content_type,
|
|
29
29
|
get_etld1,
|
|
30
30
|
get_focus_path,
|
|
31
|
+
get_fqdn,
|
|
31
32
|
is_allowed_path,
|
|
32
33
|
urlparse_cached,
|
|
33
34
|
)
|
|
@@ -42,6 +43,7 @@ class BaseSitemapSpider(SitemapSpider):
|
|
|
42
43
|
self,
|
|
43
44
|
seed_urls: Collection[str],
|
|
44
45
|
allow_domains: Collection[str] = (),
|
|
46
|
+
subdomain_focus: bool = False,
|
|
45
47
|
path_focus: bool = False,
|
|
46
48
|
allow_mime_types: Collection[str] = (),
|
|
47
49
|
disallow_mime_types: Collection[str] = (),
|
|
@@ -88,11 +90,15 @@ class BaseSitemapSpider(SitemapSpider):
|
|
|
88
90
|
self.focus_paths.add(path)
|
|
89
91
|
|
|
90
92
|
# Domains and mime types filtering
|
|
91
|
-
|
|
92
|
-
allow_domains
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
93
|
+
if allow_domains:
|
|
94
|
+
self.allowed_domains = set(allow_domains)
|
|
95
|
+
elif subdomain_focus:
|
|
96
|
+
self.allowed_domains = set()
|
|
97
|
+
for url in seed_urls:
|
|
98
|
+
if fqdn := get_fqdn(url):
|
|
99
|
+
self.allowed_domains.add(fqdn)
|
|
100
|
+
else:
|
|
101
|
+
self.allowed_domains = set(get_etld1(url) for url in seed_urls)
|
|
96
102
|
self.allow_mime_types = set(
|
|
97
103
|
[m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
|
|
98
104
|
)
|
|
@@ -155,7 +161,9 @@ class BaseSitemapSpider(SitemapSpider):
|
|
|
155
161
|
)
|
|
156
162
|
|
|
157
163
|
def _parse_sitemap(self, response: Response):
|
|
158
|
-
yield ConnectorItem(
|
|
164
|
+
yield ConnectorItem(
|
|
165
|
+
dropped=False, downloaded=False, system_request=True, sitemap=True
|
|
166
|
+
)
|
|
159
167
|
|
|
160
168
|
seed_url = response.meta["seed_url"]
|
|
161
169
|
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/utils.py
RENAMED
|
@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
|
|
|
57
57
|
return f"{ext.domain}.{ext.suffix}"
|
|
58
58
|
|
|
59
59
|
|
|
60
|
+
def get_fqdn(url: str) -> str:
|
|
61
|
+
ext = tldextract.extract(url)
|
|
62
|
+
return ext.fqdn
|
|
63
|
+
|
|
64
|
+
|
|
60
65
|
def get_focus_path(url: str) -> str | None:
|
|
61
66
|
parts = urlparse_cached(url)
|
|
62
67
|
if len(parts.path.split("/")) > 2:
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import pytest
|
|
14
|
+
from dpk_connector.core.crawler import crawl
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_invalid_crawler():
|
|
18
|
+
def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
with pytest.raises(ValueError) as e:
|
|
22
|
+
crawl([], on_downloaded)
|
|
23
|
+
assert isinstance(e.value, ValueError) is True
|
|
24
|
+
|
|
25
|
+
with pytest.raises(ValueError) as e:
|
|
26
|
+
crawl(["invalidseedurl"], on_downloaded)
|
|
27
|
+
assert isinstance(e.value, ValueError) is True
|
|
28
|
+
|
|
29
|
+
with pytest.raises(ValueError) as e:
|
|
30
|
+
crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
|
|
31
|
+
assert isinstance(e.value, ValueError) is True
|
|
32
|
+
|
|
33
|
+
with pytest.raises(ValueError) as e:
|
|
34
|
+
crawl(["http://example.com"], on_downloaded, depth_limit=-10)
|
|
35
|
+
assert isinstance(e.value, ValueError) is True
|
|
36
|
+
|
|
37
|
+
with pytest.raises(ValueError) as e:
|
|
38
|
+
crawl(["http://example.com"], on_downloaded, download_limit=-10)
|
|
39
|
+
assert isinstance(e.value, ValueError) is True
|
|
40
|
+
|
|
41
|
+
with pytest.raises(ValueError) as e:
|
|
42
|
+
crawl(["http://example.com"], on_downloaded, concurrent_requests=-10)
|
|
43
|
+
assert isinstance(e.value, ValueError) is True
|
|
44
|
+
|
|
45
|
+
with pytest.raises(ValueError) as e:
|
|
46
|
+
crawl(["http://example.com"], on_downloaded, concurrent_requests_per_domain=-10)
|
|
47
|
+
assert isinstance(e.value, ValueError) is True
|
|
48
|
+
|
|
49
|
+
with pytest.raises(ValueError) as e:
|
|
50
|
+
crawl(["http://example.com"], on_downloaded, download_delay=-0.1)
|
|
51
|
+
assert isinstance(e.value, ValueError) is True
|
|
52
|
+
|
|
53
|
+
with pytest.raises(ValueError) as e:
|
|
54
|
+
crawl(["http://example.com"], on_downloaded, download_timeout=-0.1)
|
|
55
|
+
assert isinstance(e.value, ValueError) is True
|
|
56
|
+
|
|
57
|
+
with pytest.raises(ValueError) as e:
|
|
58
|
+
crawl(["http://example.com"], on_downloaded, autothrottle_max_delay=-0.1)
|
|
59
|
+
assert isinstance(e.value, ValueError) is True
|
|
60
|
+
|
|
61
|
+
with pytest.raises(ValueError) as e:
|
|
62
|
+
crawl(
|
|
63
|
+
["http://example.com"], on_downloaded, autothrottle_target_concurrency=0.5
|
|
64
|
+
)
|
|
65
|
+
assert isinstance(e.value, ValueError) is True
|
|
66
|
+
|
|
67
|
+
with pytest.raises(ValueError) as e:
|
|
68
|
+
crawl(["http://example.com"], on_downloaded, robots_max_crawl_delay=-0.1)
|
|
69
|
+
assert isinstance(e.value, ValueError) is True
|
|
@@ -1,3 +1,15 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
1
13
|
import pytest
|
|
2
14
|
from dpk_connector.core.middlewares import DelayingProtegoRobotParser
|
|
3
15
|
from pytest_mock import MockerFixture
|
|
@@ -1,13 +1,24 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
1
13
|
from pathlib import Path
|
|
2
14
|
|
|
3
15
|
import pytest
|
|
16
|
+
from dpk_connector.core.item import ConnectorItem
|
|
17
|
+
from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
|
|
4
18
|
from scrapy import Request
|
|
5
19
|
from scrapy.crawler import Crawler
|
|
6
20
|
from scrapy.http import HtmlResponse
|
|
7
21
|
|
|
8
|
-
from dpk_connector.core.item import ConnectorItem
|
|
9
|
-
from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
|
|
10
|
-
|
|
11
22
|
|
|
12
23
|
@pytest.fixture
|
|
13
24
|
def crawler() -> Crawler:
|
|
@@ -22,6 +33,21 @@ def crawler() -> Crawler:
|
|
|
22
33
|
return crawler
|
|
23
34
|
|
|
24
35
|
|
|
36
|
+
def test_init_subdomain_focus():
|
|
37
|
+
spider = BaseSitemapSpider(
|
|
38
|
+
seed_urls=(
|
|
39
|
+
"http://blog.example.com/",
|
|
40
|
+
"http://contents.example.com/",
|
|
41
|
+
),
|
|
42
|
+
subdomain_focus=True,
|
|
43
|
+
)
|
|
44
|
+
assert spider.seed_urls == {
|
|
45
|
+
"http://blog.example.com/",
|
|
46
|
+
"http://contents.example.com/",
|
|
47
|
+
}
|
|
48
|
+
assert spider.allowed_domains == {"blog.example.com", "contents.example.com"}
|
|
49
|
+
|
|
50
|
+
|
|
25
51
|
def test_init_path_focus():
|
|
26
52
|
spider = BaseSitemapSpider(
|
|
27
53
|
seed_urls=(
|
|
@@ -59,9 +85,7 @@ def test_parse(datadir: Path, crawler: Crawler):
|
|
|
59
85
|
assert body.decode("utf-8") == response_body
|
|
60
86
|
assert headers == {"Content-Type": "text/html"}
|
|
61
87
|
|
|
62
|
-
spider = ConnectorSitemapSpider.from_crawler(
|
|
63
|
-
crawler, seed_urls=("http://example.com",), callback=callback
|
|
64
|
-
)
|
|
88
|
+
spider = ConnectorSitemapSpider.from_crawler(crawler, seed_urls=("http://example.com",), callback=callback)
|
|
65
89
|
request = Request(
|
|
66
90
|
"http://example.com/index.html",
|
|
67
91
|
meta={
|
|
@@ -79,9 +103,7 @@ def test_parse(datadir: Path, crawler: Crawler):
|
|
|
79
103
|
parsed = spider.parse(response)
|
|
80
104
|
|
|
81
105
|
item = next(parsed)
|
|
82
|
-
assert item == ConnectorItem(
|
|
83
|
-
dropped=False, downloaded=True, system_request=False, sitemap=False
|
|
84
|
-
)
|
|
106
|
+
assert item == ConnectorItem(dropped=False, downloaded=True, system_request=False, sitemap=False)
|
|
85
107
|
|
|
86
108
|
for next_request in parsed:
|
|
87
109
|
assert isinstance(next_request, Request) is True
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_utils.py
RENAMED
|
@@ -1,3 +1,15 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
1
13
|
# Assisted by WCA@IBM
|
|
2
14
|
# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
|
|
3
15
|
|
|
@@ -7,6 +19,7 @@ from dpk_connector.core.utils import (
|
|
|
7
19
|
get_content_type,
|
|
8
20
|
get_etld1,
|
|
9
21
|
get_focus_path,
|
|
22
|
+
get_fqdn,
|
|
10
23
|
get_header_value,
|
|
11
24
|
get_mime_type,
|
|
12
25
|
is_allowed_path,
|
|
@@ -19,9 +32,7 @@ from scrapy.http import Request, Response
|
|
|
19
32
|
|
|
20
33
|
|
|
21
34
|
def test_get_header_value():
|
|
22
|
-
response = Response(
|
|
23
|
-
"http://example.com", headers={"Content-Type": "application/json"}
|
|
24
|
-
)
|
|
35
|
+
response = Response("http://example.com", headers={"Content-Type": "application/json"})
|
|
25
36
|
assert get_header_value(response, "Content-Type") == "application/json"
|
|
26
37
|
|
|
27
38
|
|
|
@@ -83,6 +94,21 @@ def test_get_etld1(url: str, expected: str):
|
|
|
83
94
|
assert get_etld1(url) == expected
|
|
84
95
|
|
|
85
96
|
|
|
97
|
+
@pytest.mark.parametrize(
|
|
98
|
+
"url,expected",
|
|
99
|
+
[
|
|
100
|
+
("http://www.example.com", "www.example.com"),
|
|
101
|
+
("https://www.example.co.uk", "www.example.co.uk"),
|
|
102
|
+
("http://www.example.com/path?query=string#fragment", "www.example.com"),
|
|
103
|
+
("http://localhost:8080/", ""),
|
|
104
|
+
("http://www.example.com:8080/", "www.example.com"),
|
|
105
|
+
("http://www.sub.example.com:8080/", "www.sub.example.com"),
|
|
106
|
+
],
|
|
107
|
+
)
|
|
108
|
+
def test_get_fqdn(url: str, expected: str):
|
|
109
|
+
assert get_fqdn(url) == expected
|
|
110
|
+
|
|
111
|
+
|
|
86
112
|
@pytest.mark.parametrize(
|
|
87
113
|
"url,expected",
|
|
88
114
|
[
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
|
|
3
|
-
from dpk_connector.core.crawler import crawl
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def test_invalid_crawler():
|
|
7
|
-
def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
|
|
8
|
-
pass
|
|
9
|
-
|
|
10
|
-
with pytest.raises(ValueError) as e:
|
|
11
|
-
crawl([], on_downloaded)
|
|
12
|
-
assert isinstance(e.value, ValueError) is True
|
|
13
|
-
|
|
14
|
-
with pytest.raises(ValueError) as e:
|
|
15
|
-
crawl(["invalidseedurl"], on_downloaded)
|
|
16
|
-
assert isinstance(e.value, ValueError) is True
|
|
17
|
-
|
|
18
|
-
with pytest.raises(ValueError) as e:
|
|
19
|
-
crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
|
|
20
|
-
assert isinstance(e.value, ValueError) is True
|
|
21
|
-
|
|
22
|
-
with pytest.raises(ValueError) as e:
|
|
23
|
-
crawl(["http://example.com"], on_downloaded, depth_limit=-10)
|
|
24
|
-
assert isinstance(e.value, ValueError) is True
|
|
25
|
-
|
|
26
|
-
with pytest.raises(ValueError) as e:
|
|
27
|
-
crawl(["http://example.com"], on_downloaded, download_limit=-10)
|
|
28
|
-
assert isinstance(e.value, ValueError) is True
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/logging.py
RENAMED
|
File without changes
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/middlewares.py
RENAMED
|
File without changes
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/pipelines.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|