data-prep-connector 0.2.2.dev1__tar.gz → 0.2.3.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/Makefile +1 -1
  2. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/PKG-INFO +2 -2
  3. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/pyproject.toml +2 -2
  4. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/data_prep_connector.egg-info/PKG-INFO +2 -2
  5. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/dpk_connector/core/crawler.py +84 -2
  6. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/dpk_connector/core/settings.py +0 -11
  7. data_prep_connector-0.2.3.dev0/test/dpk_connector/core/test_crawler.py +69 -0
  8. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/test/dpk_connector/core/test_middlewares.py +12 -0
  9. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/test/dpk_connector/core/test_sitemap_spider.py +14 -6
  10. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/test/dpk_connector/core/test_utils.py +13 -3
  11. data_prep_connector-0.2.2.dev1/test/dpk_connector/core/test_crawler.py +0 -28
  12. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/README.md +0 -0
  13. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/doc/overview.md +0 -0
  14. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/setup.cfg +0 -0
  15. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/data_prep_connector.egg-info/SOURCES.txt +0 -0
  16. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/data_prep_connector.egg-info/dependency_links.txt +0 -0
  17. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/data_prep_connector.egg-info/requires.txt +0 -0
  18. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/data_prep_connector.egg-info/top_level.txt +0 -0
  19. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/dpk_connector/__init__.py +0 -0
  20. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/dpk_connector/core/__init__.py +0 -0
  21. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/dpk_connector/core/item.py +0 -0
  22. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/dpk_connector/core/logging.py +0 -0
  23. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/dpk_connector/core/middlewares.py +0 -0
  24. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/dpk_connector/core/pipelines.py +0 -0
  25. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/dpk_connector/core/spiders/__init__.py +0 -0
  26. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/dpk_connector/core/spiders/sitemap.py +0 -0
  27. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/src/dpk_connector/core/utils.py +0 -0
  28. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/test/dpk_connector/core/__init__.py +0 -0
  29. {data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3.dev0}/test/dpk_connector/core/test_sitemap_spider/index.html +0 -0
@@ -13,7 +13,7 @@ clean::
13
13
  setup::
14
14
 
15
15
  set-versions: .check-env
16
- $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
16
+ $(MAKE) TOML_VERSION=$(DPK_CONNECTOR_VERSION) .defaults.update-toml
17
17
 
18
18
  build:: build-dist
19
19
 
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_connector
3
- Version: 0.2.2.dev1
3
+ Version: 0.2.3.dev0
4
4
  Summary: Scalable and Compliant Web Crawler
5
5
  Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
6
  License: Apache-2.0
7
7
  Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
8
- Requires-Python: >=3.10
8
+ Requires-Python: <3.13,>=3.10
9
9
  Description-Content-Type: text/markdown
10
10
  Requires-Dist: scrapy>=2.11.2
11
11
  Requires-Dist: pydantic>=2.8.1
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "data_prep_connector"
3
- version = "0.2.2.dev1"
4
- requires-python = ">=3.10"
3
+ version = "0.2.3.dev0"
4
+ requires-python = ">=3.10,<3.13"
5
5
  keywords = [
6
6
  "data",
7
7
  "data acquisition",
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_connector
3
- Version: 0.2.2.dev1
3
+ Version: 0.2.3.dev0
4
4
  Summary: Scalable and Compliant Web Crawler
5
5
  Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
6
  License: Apache-2.0
7
7
  Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
8
- Requires-Python: >=3.10
8
+ Requires-Python: <3.13,>=3.10
9
9
  Description-Content-Type: text/markdown
10
10
  Requires-Dist: scrapy>=2.11.2
11
11
  Requires-Dist: pydantic>=2.8.1
@@ -85,6 +85,15 @@ def async_crawl(
85
85
  disallow_mime_types: Collection[str] = (),
86
86
  depth_limit: int = -1,
87
87
  download_limit: int = -1,
88
+ concurrent_requests: int = 16,
89
+ concurrent_requests_per_domain: int = 8,
90
+ download_delay: float = 0,
91
+ randomize_download_delay: bool = True,
92
+ download_timeout: float = 180,
93
+ autothrottle_enabled: bool = True,
94
+ autothrottle_max_delay: float = 60,
95
+ autothrottle_target_concurrency: float = 8,
96
+ robots_max_crawl_delay: float = 60,
88
97
  ) -> Deferred[None]:
89
98
  # Assisted by WCA@IBM
90
99
  # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -103,12 +112,21 @@ def async_crawl(
103
112
  disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
104
113
  depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
105
114
  download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
115
+ concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
116
+ concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
117
+ download_delay (float): The delay between consecutive requests. Default is 0.
118
+ randomize_download_delay (bool): If True, the download delay will be randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
119
+ download_timeout (float): The timeout for each request. Default is 180 seconds.
120
+ autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
121
+ autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
122
+ autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
123
+ robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
106
124
 
107
125
  Returns:
108
126
  Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.
109
127
  """
110
128
  if not seed_urls:
111
- raise ValueError(f"Empty seed URLs.")
129
+ raise ValueError("Empty seed URLs.")
112
130
  for url in seed_urls:
113
131
  if not validate_url(url):
114
132
  raise ValueError(f"Seed URL {url} is not valid.")
@@ -119,6 +137,24 @@ def async_crawl(
119
137
  raise ValueError(f"Invalid depth limit {depth_limit}")
120
138
  if download_limit < -1:
121
139
  raise ValueError(f"Invalid download limit {download_limit}")
140
+ if concurrent_requests < 1:
141
+ raise ValueError(f"Invalid concurrent requests {concurrent_requests}")
142
+ if concurrent_requests_per_domain < 1:
143
+ raise ValueError(
144
+ f"Invalid concurrent reuqests per domain {concurrent_requests_per_domain}"
145
+ )
146
+ if download_delay < 0:
147
+ raise ValueError(f"Invalid download delay {download_delay}")
148
+ if download_timeout < 0:
149
+ raise ValueError(f"Invalid donwload timeout {download_timeout}")
150
+ if autothrottle_max_delay < 0:
151
+ raise ValueError(f"Invalid autothrottle max delay {autothrottle_max_delay}")
152
+ if autothrottle_target_concurrency < 1:
153
+ raise ValueError(
154
+ f"Invalid autothrottle target concurrency {autothrottle_target_concurrency}"
155
+ )
156
+ if robots_max_crawl_delay < 0:
157
+ raise ValueError(f"Invalid robots max crawl delay {robots_max_crawl_delay}")
122
158
 
123
159
  settings = Settings()
124
160
  settings.setmodule("dpk_connector.core.settings", priority="project")
@@ -126,7 +162,7 @@ def async_crawl(
126
162
  if user_agent:
127
163
  settings.set("USER_AGENT", user_agent, priority="spider")
128
164
  if headers:
129
- settings.set("DEFAULT_REQUEST_HEADERS", headers)
165
+ settings.set("DEFAULT_REQUEST_HEADERS", headers, priority="spider")
130
166
  if depth_limit == 0:
131
167
  depth_limit = -1
132
168
  elif depth_limit == -1:
@@ -135,6 +171,25 @@ def async_crawl(
135
171
  if download_limit == -1:
136
172
  download_limit = 0
137
173
  settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider")
174
+ settings.set("CONCURRENT_REQUESTS", concurrent_requests, priority="spider")
175
+ settings.set(
176
+ "CONCURRENT_REQUESTS_PER_DOMAIN",
177
+ concurrent_requests_per_domain,
178
+ priority="spider",
179
+ )
180
+ settings.set("DOWNLOAD_DELAY", download_delay, priority="spider")
181
+ settings.set(
182
+ "RANDOMIZE_DOWNLOAD_DELAY", randomize_download_delay, priority="spider"
183
+ )
184
+ settings.set("DOWNLOAD_TIMEOUT", download_timeout, priority="spider")
185
+ settings.set("AUTOTHROTTLE_ENABLED", autothrottle_enabled, priority="spider")
186
+ settings.set("AUTOTHROTTLE_MAX_DELAY", autothrottle_max_delay, priority="spider")
187
+ settings.set(
188
+ "AUTOTHROTTLE_TARGET_CONCURRENCY",
189
+ autothrottle_target_concurrency,
190
+ priority="spider",
191
+ )
192
+ settings.set("ROBOTS_MAX_CRAWL_DELAY", robots_max_crawl_delay, priority="spider")
138
193
 
139
194
  runner = MultiThreadedCrawlerRunner(settings)
140
195
  runner.crawl(
@@ -169,6 +224,15 @@ def crawl(
169
224
  disallow_mime_types: Collection[str] = (),
170
225
  depth_limit: int = -1,
171
226
  download_limit: int = -1,
227
+ concurrent_requests: int = 16,
228
+ concurrent_requests_per_domain: int = 8,
229
+ download_delay: float = 0,
230
+ randomize_download_delay: bool = True,
231
+ download_timeout: float = 180,
232
+ autothrottle_enabled: bool = True,
233
+ autothrottle_max_delay: float = 60,
234
+ autothrottle_target_concurrency: float = 8,
235
+ robots_max_crawl_delay: float = 60,
172
236
  ) -> None:
173
237
  # Assisted by WCA@IBM
174
238
  # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -187,6 +251,15 @@ def crawl(
187
251
  disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
188
252
  depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
189
253
  download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
254
+ concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
255
+ concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
256
+ download_delay (float): The delay between consecutive requests. Default is 0.
257
+ randomize_download_delay (bool): If True, the download delay will be randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
258
+ download_timeout (float): The timeout for each request. Default is 180 seconds.
259
+ autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
260
+ autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
261
+ autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
262
+ robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
190
263
 
191
264
  Returns:
192
265
  None
@@ -209,6 +282,15 @@ def crawl(
209
282
  disallow_mime_types,
210
283
  depth_limit,
211
284
  download_limit,
285
+ concurrent_requests,
286
+ concurrent_requests_per_domain,
287
+ download_delay,
288
+ randomize_download_delay,
289
+ download_timeout,
290
+ autothrottle_enabled,
291
+ autothrottle_max_delay,
292
+ autothrottle_target_concurrency,
293
+ robots_max_crawl_delay,
212
294
  )
213
295
  d.addBoth(on_completed)
214
296
  with condition:
@@ -16,21 +16,10 @@ SPIDER_MODULES = ["dpk_connector.core.spiders"]
16
16
 
17
17
  # Robots
18
18
  ROBOTSTXT_OBEY = True
19
- ROBOTS_MAX_CRAWL_DELAY = 60
20
19
  ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser"
21
20
 
22
- # Downloader parameters
23
- CONCURRENT_REQUESTS = 20
24
- CONCURRENT_REQUESTS_PER_DOMAIN = 10
25
- DOWNLOAD_DELAY = 0
26
- RANDOMIZE_DOWNLOAD_DELAY = True
27
- DOWNLOAD_TIMEOUT = 180
28
-
29
21
  # Autothrottle
30
- AUTOTHROTTLE_ENABLED = True
31
22
  AUTOTHROTTLE_START_DELAY = 0
32
- AUTOTHROTTLE_MAX_DELAY = 300
33
- AUTOTHROTTLE_TARGET_CONCURRENCY = 10
34
23
  AUTOTHROTTLE_DEBUG = False
35
24
 
36
25
  # Middlewares/pipelines/extensions
@@ -0,0 +1,69 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import pytest
14
+ from dpk_connector.core.crawler import crawl
15
+
16
+
17
+ def test_invalid_crawler():
18
+ def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
19
+ pass
20
+
21
+ with pytest.raises(ValueError) as e:
22
+ crawl([], on_downloaded)
23
+ assert isinstance(e.value, ValueError) is True
24
+
25
+ with pytest.raises(ValueError) as e:
26
+ crawl(["invalidseedurl"], on_downloaded)
27
+ assert isinstance(e.value, ValueError) is True
28
+
29
+ with pytest.raises(ValueError) as e:
30
+ crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
31
+ assert isinstance(e.value, ValueError) is True
32
+
33
+ with pytest.raises(ValueError) as e:
34
+ crawl(["http://example.com"], on_downloaded, depth_limit=-10)
35
+ assert isinstance(e.value, ValueError) is True
36
+
37
+ with pytest.raises(ValueError) as e:
38
+ crawl(["http://example.com"], on_downloaded, download_limit=-10)
39
+ assert isinstance(e.value, ValueError) is True
40
+
41
+ with pytest.raises(ValueError) as e:
42
+ crawl(["http://example.com"], on_downloaded, concurrent_requests=-10)
43
+ assert isinstance(e.value, ValueError) is True
44
+
45
+ with pytest.raises(ValueError) as e:
46
+ crawl(["http://example.com"], on_downloaded, concurrent_requests_per_domain=-10)
47
+ assert isinstance(e.value, ValueError) is True
48
+
49
+ with pytest.raises(ValueError) as e:
50
+ crawl(["http://example.com"], on_downloaded, download_delay=-0.1)
51
+ assert isinstance(e.value, ValueError) is True
52
+
53
+ with pytest.raises(ValueError) as e:
54
+ crawl(["http://example.com"], on_downloaded, download_timeout=-0.1)
55
+ assert isinstance(e.value, ValueError) is True
56
+
57
+ with pytest.raises(ValueError) as e:
58
+ crawl(["http://example.com"], on_downloaded, autothrottle_max_delay=-0.1)
59
+ assert isinstance(e.value, ValueError) is True
60
+
61
+ with pytest.raises(ValueError) as e:
62
+ crawl(
63
+ ["http://example.com"], on_downloaded, autothrottle_target_concurrency=0.5
64
+ )
65
+ assert isinstance(e.value, ValueError) is True
66
+
67
+ with pytest.raises(ValueError) as e:
68
+ crawl(["http://example.com"], on_downloaded, robots_max_crawl_delay=-0.1)
69
+ assert isinstance(e.value, ValueError) is True
@@ -1,3 +1,15 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
1
13
  import pytest
2
14
  from dpk_connector.core.middlewares import DelayingProtegoRobotParser
3
15
  from pytest_mock import MockerFixture
@@ -1,3 +1,15 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
1
13
  from pathlib import Path
2
14
 
3
15
  import pytest
@@ -73,9 +85,7 @@ def test_parse(datadir: Path, crawler: Crawler):
73
85
  assert body.decode("utf-8") == response_body
74
86
  assert headers == {"Content-Type": "text/html"}
75
87
 
76
- spider = ConnectorSitemapSpider.from_crawler(
77
- crawler, seed_urls=("http://example.com",), callback=callback
78
- )
88
+ spider = ConnectorSitemapSpider.from_crawler(crawler, seed_urls=("http://example.com",), callback=callback)
79
89
  request = Request(
80
90
  "http://example.com/index.html",
81
91
  meta={
@@ -93,9 +103,7 @@ def test_parse(datadir: Path, crawler: Crawler):
93
103
  parsed = spider.parse(response)
94
104
 
95
105
  item = next(parsed)
96
- assert item == ConnectorItem(
97
- dropped=False, downloaded=True, system_request=False, sitemap=False
98
- )
106
+ assert item == ConnectorItem(dropped=False, downloaded=True, system_request=False, sitemap=False)
99
107
 
100
108
  for next_request in parsed:
101
109
  assert isinstance(next_request, Request) is True
@@ -1,3 +1,15 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
1
13
  # Assisted by WCA@IBM
2
14
  # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
3
15
 
@@ -20,9 +32,7 @@ from scrapy.http import Request, Response
20
32
 
21
33
 
22
34
  def test_get_header_value():
23
- response = Response(
24
- "http://example.com", headers={"Content-Type": "application/json"}
25
- )
35
+ response = Response("http://example.com", headers={"Content-Type": "application/json"})
26
36
  assert get_header_value(response, "Content-Type") == "application/json"
27
37
 
28
38
 
@@ -1,28 +0,0 @@
1
- import pytest
2
-
3
- from dpk_connector.core.crawler import crawl
4
-
5
-
6
- def test_invalid_crawler():
7
- def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
8
- pass
9
-
10
- with pytest.raises(ValueError) as e:
11
- crawl([], on_downloaded)
12
- assert isinstance(e.value, ValueError) is True
13
-
14
- with pytest.raises(ValueError) as e:
15
- crawl(["invalidseedurl"], on_downloaded)
16
- assert isinstance(e.value, ValueError) is True
17
-
18
- with pytest.raises(ValueError) as e:
19
- crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
20
- assert isinstance(e.value, ValueError) is True
21
-
22
- with pytest.raises(ValueError) as e:
23
- crawl(["http://example.com"], on_downloaded, depth_limit=-10)
24
- assert isinstance(e.value, ValueError) is True
25
-
26
- with pytest.raises(ValueError) as e:
27
- crawl(["http://example.com"], on_downloaded, download_limit=-10)
28
- assert isinstance(e.value, ValueError) is True