data-prep-connector 0.2.2.dev0__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/Makefile +1 -1
  2. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/PKG-INFO +3 -3
  3. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/doc/overview.md +14 -0
  4. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/pyproject.toml +3 -2
  5. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/PKG-INFO +3 -3
  6. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/crawler.py +90 -2
  7. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/settings.py +0 -11
  8. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/spiders/sitemap.py +14 -6
  9. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/utils.py +5 -0
  10. data_prep_connector-0.2.3/test/dpk_connector/core/test_crawler.py +69 -0
  11. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_middlewares.py +12 -0
  12. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_sitemap_spider.py +31 -9
  13. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_utils.py +29 -3
  14. data_prep_connector-0.2.2.dev0/test/dpk_connector/core/test_crawler.py +0 -28
  15. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/README.md +0 -0
  16. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/setup.cfg +0 -0
  17. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/SOURCES.txt +0 -0
  18. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/dependency_links.txt +0 -0
  19. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/requires.txt +0 -0
  20. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/top_level.txt +0 -0
  21. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/__init__.py +0 -0
  22. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/__init__.py +0 -0
  23. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/item.py +0 -0
  24. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/logging.py +0 -0
  25. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/middlewares.py +0 -0
  26. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/pipelines.py +0 -0
  27. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/spiders/__init__.py +0 -0
  28. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/__init__.py +0 -0
  29. {data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_sitemap_spider/index.html +0 -0
@@ -13,7 +13,7 @@ clean::
13
13
  setup::
14
14
 
15
15
  set-versions: .check-env
16
- $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
16
+ $(MAKE) TOML_VERSION=$(DPK_CONNECTOR_VERSION) .defaults.update-toml
17
17
 
18
18
  build:: build-dist
19
19
 
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_connector
3
- Version: 0.2.2.dev0
3
+ Version: 0.2.3
4
4
  Summary: Scalable and Compliant Web Crawler
5
5
  Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
6
  License: Apache-2.0
7
- Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
8
- Requires-Python: >=3.10
7
+ Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps,0b74b5a
8
+ Requires-Python: <3.13,>=3.10
9
9
  Description-Content-Type: text/markdown
10
10
  Requires-Dist: scrapy>=2.11.2
11
11
  Requires-Dist: pydantic>=2.8.1
@@ -10,6 +10,20 @@ Features:
10
10
  - Mime type filters: You can restrict mime types which can be downloaded.
11
11
  - Parallel processing: Requests to websites are processed in parallel.
12
12
 
13
+ ## How to install
14
+
15
+ ### From PyPI
16
+
17
+ ```sh
18
+ pip install data-prep-connector
19
+ ```
20
+
21
+ ### From GitHub
22
+
23
+ ```sh
24
+ pip install git+https://github.com/IBM/data-prep-kit.git@dev#subdirectory=data-connector-lib
25
+ ```
26
+
13
27
  ## Example usage
14
28
 
15
29
  ```python
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "data_prep_connector"
3
- version = "0.2.2.dev0"
4
- requires-python = ">=3.10"
3
+ version = "0.2.3"
4
+ requires-python = ">=3.10,<3.13"
5
5
  keywords = [
6
6
  "data",
7
7
  "data acquisition",
@@ -12,6 +12,7 @@ keywords = [
12
12
  "ai",
13
13
  "fine-tuning",
14
14
  "llmapps",
15
+ "0b74b5a"
15
16
  ]
16
17
  description = "Scalable and Compliant Web Crawler"
17
18
  license = { text = "Apache-2.0" }
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_connector
3
- Version: 0.2.2.dev0
3
+ Version: 0.2.3
4
4
  Summary: Scalable and Compliant Web Crawler
5
5
  Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
6
  License: Apache-2.0
7
- Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
8
- Requires-Python: >=3.10
7
+ Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps,0b74b5a
8
+ Requires-Python: <3.13,>=3.10
9
9
  Description-Content-Type: text/markdown
10
10
  Requires-Dist: scrapy>=2.11.2
11
11
  Requires-Dist: pydantic>=2.8.1
@@ -74,6 +74,7 @@ def async_crawl(
74
74
  user_agent: str = "",
75
75
  headers: dict[str, str] = {},
76
76
  allow_domains: Collection[str] = (),
77
+ subdomain_focus: bool = False,
77
78
  path_focus: bool = False,
78
79
  allow_mime_types: Collection[str] = (
79
80
  "application/pdf",
@@ -84,6 +85,15 @@ def async_crawl(
84
85
  disallow_mime_types: Collection[str] = (),
85
86
  depth_limit: int = -1,
86
87
  download_limit: int = -1,
88
+ concurrent_requests: int = 16,
89
+ concurrent_requests_per_domain: int = 8,
90
+ download_delay: float = 0,
91
+ randomize_download_delay: bool = True,
92
+ download_timeout: float = 180,
93
+ autothrottle_enabled: bool = True,
94
+ autothrottle_max_delay: float = 60,
95
+ autothrottle_target_concurrency: float = 8,
96
+ robots_max_crawl_delay: float = 60,
87
97
  ) -> Deferred[None]:
88
98
  # Assisted by WCA@IBM
89
99
  # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -96,17 +106,27 @@ def async_crawl(
96
106
  user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
97
107
  headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
98
108
  allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
109
+ subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
99
110
  path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
100
111
  allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
101
112
  disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
102
113
  depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
103
114
  download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
115
+ concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
116
+ concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
117
+ download_delay (float): The delay between consecutive requests. Default is 0.
118
+ randomize_download_delay (bool): If specified, the download delay will be randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
119
+ download_timeout (float): The timeout for each request. Default is 180 seconds.
120
+ autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
121
+ autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
122
+ autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
123
+ robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
104
124
 
105
125
  Returns:
106
126
  Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.
107
127
  """
108
128
  if not seed_urls:
109
- raise ValueError(f"Empty seed URLs.")
129
+ raise ValueError("Empty seed URLs.")
110
130
  for url in seed_urls:
111
131
  if not validate_url(url):
112
132
  raise ValueError(f"Seed URL {url} is not valid.")
@@ -117,6 +137,24 @@ def async_crawl(
117
137
  raise ValueError(f"Invalid depth limit {depth_limit}")
118
138
  if download_limit < -1:
119
139
  raise ValueError(f"Invalid download limit {download_limit}")
140
+ if concurrent_requests < 1:
141
+ raise ValueError(f"Invalid concurrent requests {concurrent_requests}")
142
+ if concurrent_requests_per_domain < 1:
143
+ raise ValueError(
144
+ f"Invalid concurrent reuqests per domain {concurrent_requests_per_domain}"
145
+ )
146
+ if download_delay < 0:
147
+ raise ValueError(f"Invalid download delay {download_delay}")
148
+ if download_timeout < 0:
149
+ raise ValueError(f"Invalid donwload timeout {download_timeout}")
150
+ if autothrottle_max_delay < 0:
151
+ raise ValueError(f"Invalid autothrottle max delay {autothrottle_max_delay}")
152
+ if autothrottle_target_concurrency < 1:
153
+ raise ValueError(
154
+ f"Invalid autothrottle target concurrency {autothrottle_target_concurrency}"
155
+ )
156
+ if robots_max_crawl_delay < 0:
157
+ raise ValueError(f"Invalid robots max crawl delay {robots_max_crawl_delay}")
120
158
 
121
159
  settings = Settings()
122
160
  settings.setmodule("dpk_connector.core.settings", priority="project")
@@ -124,7 +162,7 @@ def async_crawl(
124
162
  if user_agent:
125
163
  settings.set("USER_AGENT", user_agent, priority="spider")
126
164
  if headers:
127
- settings.set("DEFAULT_REQUEST_HEADERS", headers)
165
+ settings.set("DEFAULT_REQUEST_HEADERS", headers, priority="spider")
128
166
  if depth_limit == 0:
129
167
  depth_limit = -1
130
168
  elif depth_limit == -1:
@@ -133,6 +171,25 @@ def async_crawl(
133
171
  if download_limit == -1:
134
172
  download_limit = 0
135
173
  settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider")
174
+ settings.set("CONCURRENT_REQUESTS", concurrent_requests, priority="spider")
175
+ settings.set(
176
+ "CONCURRENT_REQUESTS_PER_DOMAIN",
177
+ concurrent_requests_per_domain,
178
+ priority="spider",
179
+ )
180
+ settings.set("DOWNLOAD_DELAY", download_delay, priority="spider")
181
+ settings.set(
182
+ "RANDOMIZE_DOWNLOAD_DELAY", randomize_download_delay, priority="spider"
183
+ )
184
+ settings.set("DOWNLOAD_TIMEOUT", download_timeout, priority="spider")
185
+ settings.set("AUTOTHROTTLE_ENABLED", autothrottle_enabled, priority="spider")
186
+ settings.set("AUTOTHROTTLE_MAX_DELAY", autothrottle_max_delay, priority="spider")
187
+ settings.set(
188
+ "AUTOTHROTTLE_TARGET_CONCURRENCY",
189
+ autothrottle_target_concurrency,
190
+ priority="spider",
191
+ )
192
+ settings.set("ROBOTS_MAX_CRAWL_DELAY", robots_max_crawl_delay, priority="spider")
136
193
 
137
194
  runner = MultiThreadedCrawlerRunner(settings)
138
195
  runner.crawl(
@@ -140,6 +197,7 @@ def async_crawl(
140
197
  seed_urls=seed_urls,
141
198
  callback=on_downloaded,
142
199
  allow_domains=allow_domains,
200
+ subdomain_focus=subdomain_focus,
143
201
  path_focus=path_focus,
144
202
  allow_mime_types=allow_mime_types,
145
203
  disallow_mime_types=disallow_mime_types,
@@ -155,6 +213,7 @@ def crawl(
155
213
  user_agent: str = "",
156
214
  headers: dict[str, str] = {},
157
215
  allow_domains: Collection[str] = (),
216
+ subdomain_focus: bool = False,
158
217
  path_focus: bool = False,
159
218
  allow_mime_types: Collection[str] = (
160
219
  "application/pdf",
@@ -165,6 +224,15 @@ def crawl(
165
224
  disallow_mime_types: Collection[str] = (),
166
225
  depth_limit: int = -1,
167
226
  download_limit: int = -1,
227
+ concurrent_requests: int = 16,
228
+ concurrent_requests_per_domain: int = 8,
229
+ download_delay: float = 0,
230
+ randomize_download_delay: bool = True,
231
+ download_timeout: float = 180,
232
+ autothrottle_enabled: bool = True,
233
+ autothrottle_max_delay: float = 60,
234
+ autothrottle_target_concurrency: float = 8,
235
+ robots_max_crawl_delay: float = 60,
168
236
  ) -> None:
169
237
  # Assisted by WCA@IBM
170
238
  # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -177,11 +245,21 @@ def crawl(
177
245
  user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
178
246
  headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
179
247
  allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
248
+ subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
180
249
  path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
181
250
  allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
182
251
  disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
183
252
  depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
184
253
  download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
254
+ concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
255
+ concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
256
+ download_delay (float): The delay between consecutive requests. Default is 0.
257
+ randomize_download_delay (bool): If specified, the download delay will be randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
258
+ download_timeout (float): The timeout for each request. Default is 180 seconds.
259
+ autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
260
+ autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
261
+ autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
262
+ robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
185
263
 
186
264
  Returns:
187
265
  None
@@ -198,11 +276,21 @@ def crawl(
198
276
  user_agent,
199
277
  headers,
200
278
  allow_domains,
279
+ subdomain_focus,
201
280
  path_focus,
202
281
  allow_mime_types,
203
282
  disallow_mime_types,
204
283
  depth_limit,
205
284
  download_limit,
285
+ concurrent_requests,
286
+ concurrent_requests_per_domain,
287
+ download_delay,
288
+ randomize_download_delay,
289
+ download_timeout,
290
+ autothrottle_enabled,
291
+ autothrottle_max_delay,
292
+ autothrottle_target_concurrency,
293
+ robots_max_crawl_delay,
206
294
  )
207
295
  d.addBoth(on_completed)
208
296
  with condition:
@@ -16,21 +16,10 @@ SPIDER_MODULES = ["dpk_connector.core.spiders"]
16
16
 
17
17
  # Robots
18
18
  ROBOTSTXT_OBEY = True
19
- ROBOTS_MAX_CRAWL_DELAY = 60
20
19
  ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser"
21
20
 
22
- # Downloader parameters
23
- CONCURRENT_REQUESTS = 20
24
- CONCURRENT_REQUESTS_PER_DOMAIN = 10
25
- DOWNLOAD_DELAY = 0
26
- RANDOMIZE_DOWNLOAD_DELAY = True
27
- DOWNLOAD_TIMEOUT = 180
28
-
29
21
  # Autothrottle
30
- AUTOTHROTTLE_ENABLED = True
31
22
  AUTOTHROTTLE_START_DELAY = 0
32
- AUTOTHROTTLE_MAX_DELAY = 300
33
- AUTOTHROTTLE_TARGET_CONCURRENCY = 10
34
23
  AUTOTHROTTLE_DEBUG = False
35
24
 
36
25
  # Middlewares/pipelines/extensions
@@ -28,6 +28,7 @@ from dpk_connector.core.utils import (
28
28
  get_content_type,
29
29
  get_etld1,
30
30
  get_focus_path,
31
+ get_fqdn,
31
32
  is_allowed_path,
32
33
  urlparse_cached,
33
34
  )
@@ -42,6 +43,7 @@ class BaseSitemapSpider(SitemapSpider):
42
43
  self,
43
44
  seed_urls: Collection[str],
44
45
  allow_domains: Collection[str] = (),
46
+ subdomain_focus: bool = False,
45
47
  path_focus: bool = False,
46
48
  allow_mime_types: Collection[str] = (),
47
49
  disallow_mime_types: Collection[str] = (),
@@ -88,11 +90,15 @@ class BaseSitemapSpider(SitemapSpider):
88
90
  self.focus_paths.add(path)
89
91
 
90
92
  # Domains and mime types filtering
91
- self.allowed_domains = set(
92
- allow_domains
93
- if len(allow_domains) > 0
94
- else [get_etld1(url) for url in seed_urls]
95
- )
93
+ if allow_domains:
94
+ self.allowed_domains = set(allow_domains)
95
+ elif subdomain_focus:
96
+ self.allowed_domains = set()
97
+ for url in seed_urls:
98
+ if fqdn := get_fqdn(url):
99
+ self.allowed_domains.add(fqdn)
100
+ else:
101
+ self.allowed_domains = set(get_etld1(url) for url in seed_urls)
96
102
  self.allow_mime_types = set(
97
103
  [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
98
104
  )
@@ -155,7 +161,9 @@ class BaseSitemapSpider(SitemapSpider):
155
161
  )
156
162
 
157
163
  def _parse_sitemap(self, response: Response):
158
- yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True)
164
+ yield ConnectorItem(
165
+ dropped=False, downloaded=False, system_request=True, sitemap=True
166
+ )
159
167
 
160
168
  seed_url = response.meta["seed_url"]
161
169
 
@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
57
57
  return f"{ext.domain}.{ext.suffix}"
58
58
 
59
59
 
60
+ def get_fqdn(url: str) -> str:
61
+ ext = tldextract.extract(url)
62
+ return ext.fqdn
63
+
64
+
60
65
  def get_focus_path(url: str) -> str | None:
61
66
  parts = urlparse_cached(url)
62
67
  if len(parts.path.split("/")) > 2:
@@ -0,0 +1,69 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import pytest
14
+ from dpk_connector.core.crawler import crawl
15
+
16
+
17
+ def test_invalid_crawler():
18
+ def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
19
+ pass
20
+
21
+ with pytest.raises(ValueError) as e:
22
+ crawl([], on_downloaded)
23
+ assert isinstance(e.value, ValueError) is True
24
+
25
+ with pytest.raises(ValueError) as e:
26
+ crawl(["invalidseedurl"], on_downloaded)
27
+ assert isinstance(e.value, ValueError) is True
28
+
29
+ with pytest.raises(ValueError) as e:
30
+ crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
31
+ assert isinstance(e.value, ValueError) is True
32
+
33
+ with pytest.raises(ValueError) as e:
34
+ crawl(["http://example.com"], on_downloaded, depth_limit=-10)
35
+ assert isinstance(e.value, ValueError) is True
36
+
37
+ with pytest.raises(ValueError) as e:
38
+ crawl(["http://example.com"], on_downloaded, download_limit=-10)
39
+ assert isinstance(e.value, ValueError) is True
40
+
41
+ with pytest.raises(ValueError) as e:
42
+ crawl(["http://example.com"], on_downloaded, concurrent_requests=-10)
43
+ assert isinstance(e.value, ValueError) is True
44
+
45
+ with pytest.raises(ValueError) as e:
46
+ crawl(["http://example.com"], on_downloaded, concurrent_requests_per_domain=-10)
47
+ assert isinstance(e.value, ValueError) is True
48
+
49
+ with pytest.raises(ValueError) as e:
50
+ crawl(["http://example.com"], on_downloaded, download_delay=-0.1)
51
+ assert isinstance(e.value, ValueError) is True
52
+
53
+ with pytest.raises(ValueError) as e:
54
+ crawl(["http://example.com"], on_downloaded, download_timeout=-0.1)
55
+ assert isinstance(e.value, ValueError) is True
56
+
57
+ with pytest.raises(ValueError) as e:
58
+ crawl(["http://example.com"], on_downloaded, autothrottle_max_delay=-0.1)
59
+ assert isinstance(e.value, ValueError) is True
60
+
61
+ with pytest.raises(ValueError) as e:
62
+ crawl(
63
+ ["http://example.com"], on_downloaded, autothrottle_target_concurrency=0.5
64
+ )
65
+ assert isinstance(e.value, ValueError) is True
66
+
67
+ with pytest.raises(ValueError) as e:
68
+ crawl(["http://example.com"], on_downloaded, robots_max_crawl_delay=-0.1)
69
+ assert isinstance(e.value, ValueError) is True
@@ -1,3 +1,15 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
1
13
  import pytest
2
14
  from dpk_connector.core.middlewares import DelayingProtegoRobotParser
3
15
  from pytest_mock import MockerFixture
@@ -1,13 +1,24 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
1
13
  from pathlib import Path
2
14
 
3
15
  import pytest
16
+ from dpk_connector.core.item import ConnectorItem
17
+ from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
4
18
  from scrapy import Request
5
19
  from scrapy.crawler import Crawler
6
20
  from scrapy.http import HtmlResponse
7
21
 
8
- from dpk_connector.core.item import ConnectorItem
9
- from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
10
-
11
22
 
12
23
  @pytest.fixture
13
24
  def crawler() -> Crawler:
@@ -22,6 +33,21 @@ def crawler() -> Crawler:
22
33
  return crawler
23
34
 
24
35
 
36
+ def test_init_subdomain_focus():
37
+ spider = BaseSitemapSpider(
38
+ seed_urls=(
39
+ "http://blog.example.com/",
40
+ "http://contents.example.com/",
41
+ ),
42
+ subdomain_focus=True,
43
+ )
44
+ assert spider.seed_urls == {
45
+ "http://blog.example.com/",
46
+ "http://contents.example.com/",
47
+ }
48
+ assert spider.allowed_domains == {"blog.example.com", "contents.example.com"}
49
+
50
+
25
51
  def test_init_path_focus():
26
52
  spider = BaseSitemapSpider(
27
53
  seed_urls=(
@@ -59,9 +85,7 @@ def test_parse(datadir: Path, crawler: Crawler):
59
85
  assert body.decode("utf-8") == response_body
60
86
  assert headers == {"Content-Type": "text/html"}
61
87
 
62
- spider = ConnectorSitemapSpider.from_crawler(
63
- crawler, seed_urls=("http://example.com",), callback=callback
64
- )
88
+ spider = ConnectorSitemapSpider.from_crawler(crawler, seed_urls=("http://example.com",), callback=callback)
65
89
  request = Request(
66
90
  "http://example.com/index.html",
67
91
  meta={
@@ -79,9 +103,7 @@ def test_parse(datadir: Path, crawler: Crawler):
79
103
  parsed = spider.parse(response)
80
104
 
81
105
  item = next(parsed)
82
- assert item == ConnectorItem(
83
- dropped=False, downloaded=True, system_request=False, sitemap=False
84
- )
106
+ assert item == ConnectorItem(dropped=False, downloaded=True, system_request=False, sitemap=False)
85
107
 
86
108
  for next_request in parsed:
87
109
  assert isinstance(next_request, Request) is True
@@ -1,3 +1,15 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
1
13
  # Assisted by WCA@IBM
2
14
  # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
3
15
 
@@ -7,6 +19,7 @@ from dpk_connector.core.utils import (
7
19
  get_content_type,
8
20
  get_etld1,
9
21
  get_focus_path,
22
+ get_fqdn,
10
23
  get_header_value,
11
24
  get_mime_type,
12
25
  is_allowed_path,
@@ -19,9 +32,7 @@ from scrapy.http import Request, Response
19
32
 
20
33
 
21
34
  def test_get_header_value():
22
- response = Response(
23
- "http://example.com", headers={"Content-Type": "application/json"}
24
- )
35
+ response = Response("http://example.com", headers={"Content-Type": "application/json"})
25
36
  assert get_header_value(response, "Content-Type") == "application/json"
26
37
 
27
38
 
@@ -83,6 +94,21 @@ def test_get_etld1(url: str, expected: str):
83
94
  assert get_etld1(url) == expected
84
95
 
85
96
 
97
+ @pytest.mark.parametrize(
98
+ "url,expected",
99
+ [
100
+ ("http://www.example.com", "www.example.com"),
101
+ ("https://www.example.co.uk", "www.example.co.uk"),
102
+ ("http://www.example.com/path?query=string#fragment", "www.example.com"),
103
+ ("http://localhost:8080/", ""),
104
+ ("http://www.example.com:8080/", "www.example.com"),
105
+ ("http://www.sub.example.com:8080/", "www.sub.example.com"),
106
+ ],
107
+ )
108
+ def test_get_fqdn(url: str, expected: str):
109
+ assert get_fqdn(url) == expected
110
+
111
+
86
112
  @pytest.mark.parametrize(
87
113
  "url,expected",
88
114
  [
@@ -1,28 +0,0 @@
1
- import pytest
2
-
3
- from dpk_connector.core.crawler import crawl
4
-
5
-
6
- def test_invalid_crawler():
7
- def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
8
- pass
9
-
10
- with pytest.raises(ValueError) as e:
11
- crawl([], on_downloaded)
12
- assert isinstance(e.value, ValueError) is True
13
-
14
- with pytest.raises(ValueError) as e:
15
- crawl(["invalidseedurl"], on_downloaded)
16
- assert isinstance(e.value, ValueError) is True
17
-
18
- with pytest.raises(ValueError) as e:
19
- crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
20
- assert isinstance(e.value, ValueError) is True
21
-
22
- with pytest.raises(ValueError) as e:
23
- crawl(["http://example.com"], on_downloaded, depth_limit=-10)
24
- assert isinstance(e.value, ValueError) is True
25
-
26
- with pytest.raises(ValueError) as e:
27
- crawl(["http://example.com"], on_downloaded, download_limit=-10)
28
- assert isinstance(e.value, ValueError) is True