data-prep-connector 0.2.2.dev1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/Makefile +1 -1
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/PKG-INFO +3 -3
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/pyproject.toml +3 -2
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/PKG-INFO +3 -3
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/crawler.py +84 -2
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/settings.py +0 -11
- data_prep_connector-0.2.3/test/dpk_connector/core/test_crawler.py +69 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_middlewares.py +12 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_sitemap_spider.py +14 -6
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_utils.py +13 -3
- data_prep_connector-0.2.2.dev1/test/dpk_connector/core/test_crawler.py +0 -28
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/README.md +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/doc/overview.md +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/setup.cfg +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/SOURCES.txt +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/dependency_links.txt +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/requires.txt +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/top_level.txt +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/item.py +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/logging.py +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/middlewares.py +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/pipelines.py +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/spiders/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/spiders/sitemap.py +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/utils.py +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/test/dpk_connector/core/__init__.py +0 -0
- {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_sitemap_spider/index.html +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: data_prep_connector
|
|
3
|
-
Version: 0.2.2.dev1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Scalable and Compliant Web Crawler
|
|
5
5
|
Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
|
|
6
6
|
License: Apache-2.0
|
|
7
|
-
Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
|
|
8
|
-
Requires-Python: >=3.10
|
|
7
|
+
Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps,0b74b5a
|
|
8
|
+
Requires-Python: <3.13,>=3.10
|
|
9
9
|
Description-Content-Type: text/markdown
|
|
10
10
|
Requires-Dist: scrapy>=2.11.2
|
|
11
11
|
Requires-Dist: pydantic>=2.8.1
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data_prep_connector"
|
|
3
|
-
version = "0.2.2.dev1"
|
|
4
|
-
requires-python = ">=3.10"
|
|
3
|
+
version = "0.2.3"
|
|
4
|
+
requires-python = ">=3.10,<3.13"
|
|
5
5
|
keywords = [
|
|
6
6
|
"data",
|
|
7
7
|
"data acquisition",
|
|
@@ -12,6 +12,7 @@ keywords = [
|
|
|
12
12
|
"ai",
|
|
13
13
|
"fine-tuning",
|
|
14
14
|
"llmapps",
|
|
15
|
+
"0b74b5a"
|
|
15
16
|
]
|
|
16
17
|
description = "Scalable and Compliant Web Crawler"
|
|
17
18
|
license = { text = "Apache-2.0" }
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: data_prep_connector
|
|
3
|
-
Version: 0.2.2.dev1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Scalable and Compliant Web Crawler
|
|
5
5
|
Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
|
|
6
6
|
License: Apache-2.0
|
|
7
|
-
Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
|
|
8
|
-
Requires-Python: >=3.10
|
|
7
|
+
Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps,0b74b5a
|
|
8
|
+
Requires-Python: <3.13,>=3.10
|
|
9
9
|
Description-Content-Type: text/markdown
|
|
10
10
|
Requires-Dist: scrapy>=2.11.2
|
|
11
11
|
Requires-Dist: pydantic>=2.8.1
|
{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/crawler.py
RENAMED
|
@@ -85,6 +85,15 @@ def async_crawl(
|
|
|
85
85
|
disallow_mime_types: Collection[str] = (),
|
|
86
86
|
depth_limit: int = -1,
|
|
87
87
|
download_limit: int = -1,
|
|
88
|
+
concurrent_requests: int = 16,
|
|
89
|
+
concurrent_requests_per_domain: int = 8,
|
|
90
|
+
download_delay: float = 0,
|
|
91
|
+
randomize_download_delay: bool = True,
|
|
92
|
+
download_timeout: float = 180,
|
|
93
|
+
autothrottle_enabled: bool = True,
|
|
94
|
+
autothrottle_max_delay: float = 60,
|
|
95
|
+
autothrottle_target_concurrency: float = 8,
|
|
96
|
+
robots_max_crawl_delay: float = 60,
|
|
88
97
|
) -> Deferred[None]:
|
|
89
98
|
# Assisted by WCA@IBM
|
|
90
99
|
# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
|
|
@@ -103,12 +112,21 @@ def async_crawl(
|
|
|
103
112
|
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
|
|
104
113
|
depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
|
|
105
114
|
download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
|
|
115
|
+
concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
|
|
116
|
+
concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
|
|
117
|
+
download_delay (float): The delay between consecutive requests. Default is 0.
|
|
118
|
+
randomize_download_delay (bool): If specified, the download delay will be randomized between 0.5 * `download_delay and 1.5 * `download_delay`. Default is True.
|
|
119
|
+
download_timeout (float): The timeout for each request. Default is 180 seconds.
|
|
120
|
+
autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
|
|
121
|
+
autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
|
|
122
|
+
autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
|
|
123
|
+
robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
|
|
106
124
|
|
|
107
125
|
Returns:
|
|
108
126
|
Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.
|
|
109
127
|
"""
|
|
110
128
|
if not seed_urls:
|
|
111
|
-
raise ValueError(
|
|
129
|
+
raise ValueError("Empty seed URLs.")
|
|
112
130
|
for url in seed_urls:
|
|
113
131
|
if not validate_url(url):
|
|
114
132
|
raise ValueError(f"Seed URL {url} is not valid.")
|
|
@@ -119,6 +137,24 @@ def async_crawl(
|
|
|
119
137
|
raise ValueError(f"Invalid depth limit {depth_limit}")
|
|
120
138
|
if download_limit < -1:
|
|
121
139
|
raise ValueError(f"Invalid download limit {download_limit}")
|
|
140
|
+
if concurrent_requests < 1:
|
|
141
|
+
raise ValueError(f"Invalid concurrent requests {concurrent_requests}")
|
|
142
|
+
if concurrent_requests_per_domain < 1:
|
|
143
|
+
raise ValueError(
|
|
144
|
+
f"Invalid concurrent reuqests per domain {concurrent_requests_per_domain}"
|
|
145
|
+
)
|
|
146
|
+
if download_delay < 0:
|
|
147
|
+
raise ValueError(f"Invalid download delay {download_delay}")
|
|
148
|
+
if download_timeout < 0:
|
|
149
|
+
raise ValueError(f"Invalid donwload timeout {download_timeout}")
|
|
150
|
+
if autothrottle_max_delay < 0:
|
|
151
|
+
raise ValueError(f"Invalid autothrottle max delay {autothrottle_max_delay}")
|
|
152
|
+
if autothrottle_target_concurrency < 1:
|
|
153
|
+
raise ValueError(
|
|
154
|
+
f"Invalid autothrottle target concurrency {autothrottle_target_concurrency}"
|
|
155
|
+
)
|
|
156
|
+
if robots_max_crawl_delay < 0:
|
|
157
|
+
raise ValueError(f"Invalid robots max crawl delay {robots_max_crawl_delay}")
|
|
122
158
|
|
|
123
159
|
settings = Settings()
|
|
124
160
|
settings.setmodule("dpk_connector.core.settings", priority="project")
|
|
@@ -126,7 +162,7 @@ def async_crawl(
|
|
|
126
162
|
if user_agent:
|
|
127
163
|
settings.set("USER_AGENT", user_agent, priority="spider")
|
|
128
164
|
if headers:
|
|
129
|
-
settings.set("DEFAULT_REQUEST_HEADERS", headers)
|
|
165
|
+
settings.set("DEFAULT_REQUEST_HEADERS", headers, priority="spider")
|
|
130
166
|
if depth_limit == 0:
|
|
131
167
|
depth_limit = -1
|
|
132
168
|
elif depth_limit == -1:
|
|
@@ -135,6 +171,25 @@ def async_crawl(
|
|
|
135
171
|
if download_limit == -1:
|
|
136
172
|
download_limit = 0
|
|
137
173
|
settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider")
|
|
174
|
+
settings.set("CONCURRENT_REQUESTS", concurrent_requests, priority="spider")
|
|
175
|
+
settings.set(
|
|
176
|
+
"CONCURRENT_REQUESTS_PER_DOMAIN",
|
|
177
|
+
concurrent_requests_per_domain,
|
|
178
|
+
priority="spider",
|
|
179
|
+
)
|
|
180
|
+
settings.set("DOWNLOAD_DELAY", download_delay, priority="spider")
|
|
181
|
+
settings.set(
|
|
182
|
+
"RANDOMIZE_DOWNLOAD_DELAY", randomize_download_delay, priority="spider"
|
|
183
|
+
)
|
|
184
|
+
settings.set("DOWNLOAD_TIMEOUT", download_timeout, priority="spider")
|
|
185
|
+
settings.set("AUTOTHROTTLE_ENABLED", autothrottle_enabled, priority="spider")
|
|
186
|
+
settings.set("AUTOTHROTTLE_MAX_DELAY", autothrottle_max_delay, priority="spider")
|
|
187
|
+
settings.set(
|
|
188
|
+
"AUTOTHROTTLE_TARGET_CONCURRENCY",
|
|
189
|
+
autothrottle_target_concurrency,
|
|
190
|
+
priority="spider",
|
|
191
|
+
)
|
|
192
|
+
settings.set("ROBOTS_MAX_CRAWL_DELAY", robots_max_crawl_delay, priority="spider")
|
|
138
193
|
|
|
139
194
|
runner = MultiThreadedCrawlerRunner(settings)
|
|
140
195
|
runner.crawl(
|
|
@@ -169,6 +224,15 @@ def crawl(
|
|
|
169
224
|
disallow_mime_types: Collection[str] = (),
|
|
170
225
|
depth_limit: int = -1,
|
|
171
226
|
download_limit: int = -1,
|
|
227
|
+
concurrent_requests: int = 16,
|
|
228
|
+
concurrent_requests_per_domain: int = 8,
|
|
229
|
+
download_delay: float = 0,
|
|
230
|
+
randomize_download_delay: bool = True,
|
|
231
|
+
download_timeout: float = 180,
|
|
232
|
+
autothrottle_enabled: bool = True,
|
|
233
|
+
autothrottle_max_delay: float = 60,
|
|
234
|
+
autothrottle_target_concurrency: float = 8,
|
|
235
|
+
robots_max_crawl_delay: float = 60,
|
|
172
236
|
) -> None:
|
|
173
237
|
# Assisted by WCA@IBM
|
|
174
238
|
# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
|
|
@@ -187,6 +251,15 @@ def crawl(
|
|
|
187
251
|
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
|
|
188
252
|
depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
|
|
189
253
|
download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
|
|
254
|
+
concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
|
|
255
|
+
concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
|
|
256
|
+
download_delay (float): The delay between consecutive requests. Default is 0.
|
|
257
|
+
randomize_download_delay (bool): If specified, the download delay will be randomized between 0.5 * `download_delay and 1.5 * `download_delay`. Default is True.
|
|
258
|
+
download_timeout (float): The timeout for each request. Default is 180 seconds.
|
|
259
|
+
autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
|
|
260
|
+
autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
|
|
261
|
+
autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
|
|
262
|
+
robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
|
|
190
263
|
|
|
191
264
|
Returns:
|
|
192
265
|
None
|
|
@@ -209,6 +282,15 @@ def crawl(
|
|
|
209
282
|
disallow_mime_types,
|
|
210
283
|
depth_limit,
|
|
211
284
|
download_limit,
|
|
285
|
+
concurrent_requests,
|
|
286
|
+
concurrent_requests_per_domain,
|
|
287
|
+
download_delay,
|
|
288
|
+
randomize_download_delay,
|
|
289
|
+
download_timeout,
|
|
290
|
+
autothrottle_enabled,
|
|
291
|
+
autothrottle_max_delay,
|
|
292
|
+
autothrottle_target_concurrency,
|
|
293
|
+
robots_max_crawl_delay,
|
|
212
294
|
)
|
|
213
295
|
d.addBoth(on_completed)
|
|
214
296
|
with condition:
|
{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/settings.py
RENAMED
|
@@ -16,21 +16,10 @@ SPIDER_MODULES = ["dpk_connector.core.spiders"]
|
|
|
16
16
|
|
|
17
17
|
# Robots
|
|
18
18
|
ROBOTSTXT_OBEY = True
|
|
19
|
-
ROBOTS_MAX_CRAWL_DELAY = 60
|
|
20
19
|
ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser"
|
|
21
20
|
|
|
22
|
-
# Downloader parameters
|
|
23
|
-
CONCURRENT_REQUESTS = 20
|
|
24
|
-
CONCURRENT_REQUESTS_PER_DOMAIN = 10
|
|
25
|
-
DOWNLOAD_DELAY = 0
|
|
26
|
-
RANDOMIZE_DOWNLOAD_DELAY = True
|
|
27
|
-
DOWNLOAD_TIMEOUT = 180
|
|
28
|
-
|
|
29
21
|
# Autothrottle
|
|
30
|
-
AUTOTHROTTLE_ENABLED = True
|
|
31
22
|
AUTOTHROTTLE_START_DELAY = 0
|
|
32
|
-
AUTOTHROTTLE_MAX_DELAY = 300
|
|
33
|
-
AUTOTHROTTLE_TARGET_CONCURRENCY = 10
|
|
34
23
|
AUTOTHROTTLE_DEBUG = False
|
|
35
24
|
|
|
36
25
|
# Middlewares/pipelines/extensions
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import pytest
|
|
14
|
+
from dpk_connector.core.crawler import crawl
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_invalid_crawler():
|
|
18
|
+
def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
with pytest.raises(ValueError) as e:
|
|
22
|
+
crawl([], on_downloaded)
|
|
23
|
+
assert isinstance(e.value, ValueError) is True
|
|
24
|
+
|
|
25
|
+
with pytest.raises(ValueError) as e:
|
|
26
|
+
crawl(["invalidseedurl"], on_downloaded)
|
|
27
|
+
assert isinstance(e.value, ValueError) is True
|
|
28
|
+
|
|
29
|
+
with pytest.raises(ValueError) as e:
|
|
30
|
+
crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
|
|
31
|
+
assert isinstance(e.value, ValueError) is True
|
|
32
|
+
|
|
33
|
+
with pytest.raises(ValueError) as e:
|
|
34
|
+
crawl(["http://example.com"], on_downloaded, depth_limit=-10)
|
|
35
|
+
assert isinstance(e.value, ValueError) is True
|
|
36
|
+
|
|
37
|
+
with pytest.raises(ValueError) as e:
|
|
38
|
+
crawl(["http://example.com"], on_downloaded, download_limit=-10)
|
|
39
|
+
assert isinstance(e.value, ValueError) is True
|
|
40
|
+
|
|
41
|
+
with pytest.raises(ValueError) as e:
|
|
42
|
+
crawl(["http://example.com"], on_downloaded, concurrent_requests=-10)
|
|
43
|
+
assert isinstance(e.value, ValueError) is True
|
|
44
|
+
|
|
45
|
+
with pytest.raises(ValueError) as e:
|
|
46
|
+
crawl(["http://example.com"], on_downloaded, concurrent_requests_per_domain=-10)
|
|
47
|
+
assert isinstance(e.value, ValueError) is True
|
|
48
|
+
|
|
49
|
+
with pytest.raises(ValueError) as e:
|
|
50
|
+
crawl(["http://example.com"], on_downloaded, download_delay=-0.1)
|
|
51
|
+
assert isinstance(e.value, ValueError) is True
|
|
52
|
+
|
|
53
|
+
with pytest.raises(ValueError) as e:
|
|
54
|
+
crawl(["http://example.com"], on_downloaded, download_timeout=-0.1)
|
|
55
|
+
assert isinstance(e.value, ValueError) is True
|
|
56
|
+
|
|
57
|
+
with pytest.raises(ValueError) as e:
|
|
58
|
+
crawl(["http://example.com"], on_downloaded, autothrottle_max_delay=-0.1)
|
|
59
|
+
assert isinstance(e.value, ValueError) is True
|
|
60
|
+
|
|
61
|
+
with pytest.raises(ValueError) as e:
|
|
62
|
+
crawl(
|
|
63
|
+
["http://example.com"], on_downloaded, autothrottle_target_concurrency=0.5
|
|
64
|
+
)
|
|
65
|
+
assert isinstance(e.value, ValueError) is True
|
|
66
|
+
|
|
67
|
+
with pytest.raises(ValueError) as e:
|
|
68
|
+
crawl(["http://example.com"], on_downloaded, robots_max_crawl_delay=-0.1)
|
|
69
|
+
assert isinstance(e.value, ValueError) is True
|
|
@@ -1,3 +1,15 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
1
13
|
import pytest
|
|
2
14
|
from dpk_connector.core.middlewares import DelayingProtegoRobotParser
|
|
3
15
|
from pytest_mock import MockerFixture
|
|
@@ -1,3 +1,15 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
1
13
|
from pathlib import Path
|
|
2
14
|
|
|
3
15
|
import pytest
|
|
@@ -73,9 +85,7 @@ def test_parse(datadir: Path, crawler: Crawler):
|
|
|
73
85
|
assert body.decode("utf-8") == response_body
|
|
74
86
|
assert headers == {"Content-Type": "text/html"}
|
|
75
87
|
|
|
76
|
-
spider = ConnectorSitemapSpider.from_crawler(
|
|
77
|
-
crawler, seed_urls=("http://example.com",), callback=callback
|
|
78
|
-
)
|
|
88
|
+
spider = ConnectorSitemapSpider.from_crawler(crawler, seed_urls=("http://example.com",), callback=callback)
|
|
79
89
|
request = Request(
|
|
80
90
|
"http://example.com/index.html",
|
|
81
91
|
meta={
|
|
@@ -93,9 +103,7 @@ def test_parse(datadir: Path, crawler: Crawler):
|
|
|
93
103
|
parsed = spider.parse(response)
|
|
94
104
|
|
|
95
105
|
item = next(parsed)
|
|
96
|
-
assert item == ConnectorItem(
|
|
97
|
-
dropped=False, downloaded=True, system_request=False, sitemap=False
|
|
98
|
-
)
|
|
106
|
+
assert item == ConnectorItem(dropped=False, downloaded=True, system_request=False, sitemap=False)
|
|
99
107
|
|
|
100
108
|
for next_request in parsed:
|
|
101
109
|
assert isinstance(next_request, Request) is True
|
{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_utils.py
RENAMED
|
@@ -1,3 +1,15 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
1
13
|
# Assisted by WCA@IBM
|
|
2
14
|
# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
|
|
3
15
|
|
|
@@ -20,9 +32,7 @@ from scrapy.http import Request, Response
|
|
|
20
32
|
|
|
21
33
|
|
|
22
34
|
def test_get_header_value():
|
|
23
|
-
response = Response(
|
|
24
|
-
"http://example.com", headers={"Content-Type": "application/json"}
|
|
25
|
-
)
|
|
35
|
+
response = Response("http://example.com", headers={"Content-Type": "application/json"})
|
|
26
36
|
assert get_header_value(response, "Content-Type") == "application/json"
|
|
27
37
|
|
|
28
38
|
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
|
|
3
|
-
from dpk_connector.core.crawler import crawl
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def test_invalid_crawler():
|
|
7
|
-
def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
|
|
8
|
-
pass
|
|
9
|
-
|
|
10
|
-
with pytest.raises(ValueError) as e:
|
|
11
|
-
crawl([], on_downloaded)
|
|
12
|
-
assert isinstance(e.value, ValueError) is True
|
|
13
|
-
|
|
14
|
-
with pytest.raises(ValueError) as e:
|
|
15
|
-
crawl(["invalidseedurl"], on_downloaded)
|
|
16
|
-
assert isinstance(e.value, ValueError) is True
|
|
17
|
-
|
|
18
|
-
with pytest.raises(ValueError) as e:
|
|
19
|
-
crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
|
|
20
|
-
assert isinstance(e.value, ValueError) is True
|
|
21
|
-
|
|
22
|
-
with pytest.raises(ValueError) as e:
|
|
23
|
-
crawl(["http://example.com"], on_downloaded, depth_limit=-10)
|
|
24
|
-
assert isinstance(e.value, ValueError) is True
|
|
25
|
-
|
|
26
|
-
with pytest.raises(ValueError) as e:
|
|
27
|
-
crawl(["http://example.com"], on_downloaded, download_limit=-10)
|
|
28
|
-
assert isinstance(e.value, ValueError) is True
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/logging.py
RENAMED
|
File without changes
|
{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/middlewares.py
RENAMED
|
File without changes
|
{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/pipelines.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/utils.py
RENAMED
|
File without changes
|
{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/test/dpk_connector/core/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|