PyPI - data-prep-connector - Versions diffs - 0.2.2.dev0__tar.gz → 0.2.3__tar.gz - Mend

data-prep-connector 0.2.2.dev0 (tar.gz) → 0.2.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/Makefile RENAMED Viewed

@@ -13,7 +13,7 @@ clean::
 setup::
 set-versions: .check-env
-	$(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
+	$(MAKE) TOML_VERSION=$(DPK_CONNECTOR_VERSION) .defaults.update-toml
 build:: build-dist

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.1
 Name: data_prep_connector
-Version: 0.2.2.dev0
+Version: 0.2.3
 Summary: Scalable and Compliant Web Crawler
 Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
 License: Apache-2.0
-Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
-Requires-Python: >=3.10
+Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps,0b74b5a
+Requires-Python: <3.13,>=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: scrapy>=2.11.2
 Requires-Dist: pydantic>=2.8.1

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/doc/overview.md RENAMED Viewed

@@ -10,6 +10,20 @@ Features:
 - Mime type filters: You can restrict mime types which can be downloaded.
 - Parallel processing: Requests to websites are processed in parallel.
+## How to install
+### From PyPI
+```sh
+pip install data-prep-connector
+```
+### From Github
+```sh
+pip install git+https://github.com/IBM/data-prep-kit.git@dev#subdirectory=data-connector-lib
+```
 ## Example usage
 ```python

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [project]
 name = "data_prep_connector"
-version = "0.2.2.dev0"
-requires-python = ">=3.10"
+version = "0.2.3"
+requires-python = ">=3.10,<3.13"
 keywords = [
     "data",
     "data acquisition",
@@ -12,6 +12,7 @@ keywords = [
     "ai",
     "fine-tuning",
     "llmapps",
+    "0b74b5a"
 ]
 description = "Scalable and Compliant Web Crawler"
 license = { text = "Apache-2.0" }

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.1
 Name: data_prep_connector
-Version: 0.2.2.dev0
+Version: 0.2.3
 Summary: Scalable and Compliant Web Crawler
 Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
 License: Apache-2.0
-Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
-Requires-Python: >=3.10
+Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps,0b74b5a
+Requires-Python: <3.13,>=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: scrapy>=2.11.2
 Requires-Dist: pydantic>=2.8.1

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/crawler.py RENAMED Viewed

@@ -74,6 +74,7 @@ def async_crawl(
     user_agent: str = "",
     headers: dict[str, str] = {},
     allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
     path_focus: bool = False,
     allow_mime_types: Collection[str] = (
         "application/pdf",
@@ -84,6 +85,15 @@ def async_crawl(
     disallow_mime_types: Collection[str] = (),
     depth_limit: int = -1,
     download_limit: int = -1,
+    concurrent_requests: int = 16,
+    concurrent_requests_per_domain: int = 8,
+    download_delay: float = 0,
+    randomize_download_delay: bool = True,
+    download_timeout: float = 180,
+    autothrottle_enabled: bool = True,
+    autothrottle_max_delay: float = 60,
+    autothrottle_target_concurrency: float = 8,
+    robots_max_crawl_delay: float = 60,
 ) -> Deferred[None]:
     # Assisted by WCA@IBM
     # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -96,17 +106,27 @@ def async_crawl(
         user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
         headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
         allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
         path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
         allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
         depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
         download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
+        concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
+        concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
+        download_delay (float): The delay between consecutive requests. Default is 0.
+        randomize_download_delay (bool): If specified, the download delay will be randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
+        download_timeout (float): The timeout for each request. Default is 180 seconds.
+        autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
+        autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
+        autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
+        robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
     Returns:
         Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.
     """
     if not seed_urls:
-        raise ValueError(f"Empty seed URLs.")
+        raise ValueError("Empty seed URLs.")
     for url in seed_urls:
         if not validate_url(url):
             raise ValueError(f"Seed URL {url} is not valid.")
@@ -117,6 +137,24 @@ def async_crawl(
         raise ValueError(f"Invalid depth limit {depth_limit}")
     if download_limit < -1:
         raise ValueError(f"Invalid download limit {download_limit}")
+    if concurrent_requests < 1:
+        raise ValueError(f"Invalid concurrent requests {concurrent_requests}")
+    if concurrent_requests_per_domain < 1:
+        raise ValueError(
+            f"Invalid concurrent reuqests per domain {concurrent_requests_per_domain}"
+        )
+    if download_delay < 0:
+        raise ValueError(f"Invalid download delay {download_delay}")
+    if download_timeout < 0:
+        raise ValueError(f"Invalid donwload timeout {download_timeout}")
+    if autothrottle_max_delay < 0:
+        raise ValueError(f"Invalid autothrottle max delay {autothrottle_max_delay}")
+    if autothrottle_target_concurrency < 1:
+        raise ValueError(
+            f"Invalid autothrottle target concurrency {autothrottle_target_concurrency}"
+        )
+    if robots_max_crawl_delay < 0:
+        raise ValueError(f"Invalid robots max crawl delay {robots_max_crawl_delay}")
     settings = Settings()
     settings.setmodule("dpk_connector.core.settings", priority="project")
@@ -124,7 +162,7 @@ def async_crawl(
     if user_agent:
         settings.set("USER_AGENT", user_agent, priority="spider")
     if headers:
-        settings.set("DEFAULT_REQUEST_HEADERS", headers)
+        settings.set("DEFAULT_REQUEST_HEADERS", headers, priority="spider")
     if depth_limit == 0:
         depth_limit = -1
     elif depth_limit == -1:
@@ -133,6 +171,25 @@ def async_crawl(
     if download_limit == -1:
         download_limit = 0
     settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider")
+    settings.set("CONCURRENT_REQUESTS", concurrent_requests, priority="spider")
+    settings.set(
+        "CONCURRENT_REQUESTS_PER_DOMAIN",
+        concurrent_requests_per_domain,
+        priority="spider",
+    )
+    settings.set("DOWNLOAD_DELAY", download_delay, priority="spider")
+    settings.set(
+        "RANDOMIZE_DOWNLOAD_DELAY", randomize_download_delay, priority="spider"
+    )
+    settings.set("DOWNLOAD_TIMEOUT", download_timeout, priority="spider")
+    settings.set("AUTOTHROTTLE_ENABLED", autothrottle_enabled, priority="spider")
+    settings.set("AUTOTHROTTLE_MAX_DELAY", autothrottle_max_delay, priority="spider")
+    settings.set(
+        "AUTOTHROTTLE_TARGET_CONCURRENCY",
+        autothrottle_target_concurrency,
+        priority="spider",
+    )
+    settings.set("ROBOTS_MAX_CRAWL_DELAY", robots_max_crawl_delay, priority="spider")
     runner = MultiThreadedCrawlerRunner(settings)
     runner.crawl(
@@ -140,6 +197,7 @@ def async_crawl(
         seed_urls=seed_urls,
         callback=on_downloaded,
         allow_domains=allow_domains,
+        subdomain_focus=subdomain_focus,
         path_focus=path_focus,
         allow_mime_types=allow_mime_types,
         disallow_mime_types=disallow_mime_types,
@@ -155,6 +213,7 @@ def crawl(
     user_agent: str = "",
     headers: dict[str, str] = {},
     allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
     path_focus: bool = False,
     allow_mime_types: Collection[str] = (
         "application/pdf",
@@ -165,6 +224,15 @@ def crawl(
     disallow_mime_types: Collection[str] = (),
     depth_limit: int = -1,
     download_limit: int = -1,
+    concurrent_requests: int = 16,
+    concurrent_requests_per_domain: int = 8,
+    download_delay: float = 0,
+    randomize_download_delay: bool = True,
+    download_timeout: float = 180,
+    autothrottle_enabled: bool = True,
+    autothrottle_max_delay: float = 60,
+    autothrottle_target_concurrency: float = 8,
+    robots_max_crawl_delay: float = 60,
 ) -> None:
     # Assisted by WCA@IBM
     # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -177,11 +245,21 @@ def crawl(
         user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
         headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
         allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
         path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
         allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
         depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
         download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
+        concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
+        concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
+        download_delay (float): The delay between consecutive requests. Default is 0.
+        randomize_download_delay (bool): If specified, the download delay will be randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
+        download_timeout (float): The timeout for each request. Default is 180 seconds.
+        autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
+        autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
+        autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
+        robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
     Returns:
         None
@@ -198,11 +276,21 @@ def crawl(
         user_agent,
         headers,
         allow_domains,
+        subdomain_focus,
         path_focus,
         allow_mime_types,
         disallow_mime_types,
         depth_limit,
         download_limit,
+        concurrent_requests,
+        concurrent_requests_per_domain,
+        download_delay,
+        randomize_download_delay,
+        download_timeout,
+        autothrottle_enabled,
+        autothrottle_max_delay,
+        autothrottle_target_concurrency,
+        robots_max_crawl_delay,
     )
     d.addBoth(on_completed)
     with condition:

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/settings.py RENAMED Viewed

@@ -16,21 +16,10 @@ SPIDER_MODULES = ["dpk_connector.core.spiders"]
 # Robots
 ROBOTSTXT_OBEY = True
-ROBOTS_MAX_CRAWL_DELAY = 60
 ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser"
-# Downloader parameters
-CONCURRENT_REQUESTS = 20
-CONCURRENT_REQUESTS_PER_DOMAIN = 10
-DOWNLOAD_DELAY = 0
-RANDOMIZE_DOWNLOAD_DELAY = True
-DOWNLOAD_TIMEOUT = 180
 # Autothrottle
-AUTOTHROTTLE_ENABLED = True
 AUTOTHROTTLE_START_DELAY = 0
-AUTOTHROTTLE_MAX_DELAY = 300
-AUTOTHROTTLE_TARGET_CONCURRENCY = 10
 AUTOTHROTTLE_DEBUG = False
 # Middlewares/pipelines/extensions

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/spiders/sitemap.py RENAMED Viewed

@@ -28,6 +28,7 @@ from dpk_connector.core.utils import (
     get_content_type,
     get_etld1,
     get_focus_path,
+    get_fqdn,
     is_allowed_path,
     urlparse_cached,
 )
@@ -42,6 +43,7 @@ class BaseSitemapSpider(SitemapSpider):
         self,
         seed_urls: Collection[str],
         allow_domains: Collection[str] = (),
+        subdomain_focus: bool = False,
         path_focus: bool = False,
         allow_mime_types: Collection[str] = (),
         disallow_mime_types: Collection[str] = (),
@@ -88,11 +90,15 @@ class BaseSitemapSpider(SitemapSpider):
                     self.focus_paths.add(path)
         # Domains and mime types filtering
-        self.allowed_domains = set(
-            allow_domains
-            if len(allow_domains) > 0
-            else [get_etld1(url) for url in seed_urls]
-        )
+        if allow_domains:
+            self.allowed_domains = set(allow_domains)
+        elif subdomain_focus:
+            self.allowed_domains = set()
+            for url in seed_urls:
+                if fqdn := get_fqdn(url):
+                    self.allowed_domains.add(fqdn)
+        else:
+            self.allowed_domains = set(get_etld1(url) for url in seed_urls)
         self.allow_mime_types = set(
             [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
         )
@@ -155,7 +161,9 @@ class BaseSitemapSpider(SitemapSpider):
             )
     def _parse_sitemap(self, response: Response):
-        yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True)
+        yield ConnectorItem(
+            dropped=False, downloaded=False, system_request=True, sitemap=True
+        )
         seed_url = response.meta["seed_url"]

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/src/dpk_connector/core/utils.py RENAMED Viewed

@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
     return f"{ext.domain}.{ext.suffix}"
+def get_fqdn(url: str) -> str:
+    ext = tldextract.extract(url)
+    return ext.fqdn
 def get_focus_path(url: str) -> str | None:
     parts = urlparse_cached(url)
     if len(parts.path.split("/")) > 2:

data_prep_connector-0.2.3/test/dpk_connector/core/test_crawler.py ADDED Viewed

@@ -0,0 +1,69 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+import pytest
+from dpk_connector.core.crawler import crawl
+def test_invalid_crawler():
+    def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
+        pass
+    with pytest.raises(ValueError) as e:
+        crawl([], on_downloaded)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["invalidseedurl"], on_downloaded)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, depth_limit=-10)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, download_limit=-10)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, concurrent_requests=-10)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, concurrent_requests_per_domain=-10)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, download_delay=-0.1)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, download_timeout=-0.1)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, autothrottle_max_delay=-0.1)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(
+            ["http://example.com"], on_downloaded, autothrottle_target_concurrency=0.5
+        )
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, robots_max_crawl_delay=-0.1)
+    assert isinstance(e.value, ValueError) is True

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_middlewares.py RENAMED Viewed

@@ -1,3 +1,15 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
 import pytest
 from dpk_connector.core.middlewares import DelayingProtegoRobotParser
 from pytest_mock import MockerFixture

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_sitemap_spider.py RENAMED Viewed

@@ -1,13 +1,24 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
 from pathlib import Path
 import pytest
+from dpk_connector.core.item import ConnectorItem
+from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
 from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy.http import HtmlResponse
-from dpk_connector.core.item import ConnectorItem
-from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
 @pytest.fixture
 def crawler() -> Crawler:
@@ -22,6 +33,21 @@ def crawler() -> Crawler:
     return crawler
+def test_init_subdomain_focus():
+    spider = BaseSitemapSpider(
+        seed_urls=(
+            "http://blog.example.com/",
+            "http://contents.example.com/",
+        ),
+        subdomain_focus=True,
+    )
+    assert spider.seed_urls == {
+        "http://blog.example.com/",
+        "http://contents.example.com/",
+    }
+    assert spider.allowed_domains == {"blog.example.com", "contents.example.com"}
 def test_init_path_focus():
     spider = BaseSitemapSpider(
         seed_urls=(
@@ -59,9 +85,7 @@ def test_parse(datadir: Path, crawler: Crawler):
         assert body.decode("utf-8") == response_body
         assert headers == {"Content-Type": "text/html"}
-    spider = ConnectorSitemapSpider.from_crawler(
-        crawler, seed_urls=("http://example.com",), callback=callback
-    )
+    spider = ConnectorSitemapSpider.from_crawler(crawler, seed_urls=("http://example.com",), callback=callback)
     request = Request(
         "http://example.com/index.html",
         meta={
@@ -79,9 +103,7 @@ def test_parse(datadir: Path, crawler: Crawler):
     parsed = spider.parse(response)
     item = next(parsed)
-    assert item == ConnectorItem(
-        dropped=False, downloaded=True, system_request=False, sitemap=False
-    )
+    assert item == ConnectorItem(dropped=False, downloaded=True, system_request=False, sitemap=False)
     for next_request in parsed:
         assert isinstance(next_request, Request) is True

{data_prep_connector-0.2.2.dev0 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_utils.py RENAMED Viewed

@@ -1,3 +1,15 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
 # Assisted by WCA@IBM
 # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -7,6 +19,7 @@ from dpk_connector.core.utils import (
     get_content_type,
     get_etld1,
     get_focus_path,
+    get_fqdn,
     get_header_value,
     get_mime_type,
     is_allowed_path,
@@ -19,9 +32,7 @@ from scrapy.http import Request, Response
 def test_get_header_value():
-    response = Response(
-        "http://example.com", headers={"Content-Type": "application/json"}
-    )
+    response = Response("http://example.com", headers={"Content-Type": "application/json"})
     assert get_header_value(response, "Content-Type") == "application/json"
@@ -83,6 +94,21 @@ def test_get_etld1(url: str, expected: str):
     assert get_etld1(url) == expected
+@pytest.mark.parametrize(
+    "url,expected",
+    [
+        ("http://www.example.com", "www.example.com"),
+        ("https://www.example.co.uk", "www.example.co.uk"),
+        ("http://www.example.com/path?query=string#fragment", "www.example.com"),
+        ("http://localhost:8080/", ""),
+        ("http://www.example.com:8080/", "www.example.com"),
+        ("http://www.sub.example.com:8080/", "www.sub.example.com"),
+    ],
+)
+def test_get_fqdn(url: str, expected: str):
+    assert get_fqdn(url) == expected
 @pytest.mark.parametrize(
     "url,expected",
     [

data_prep_connector-0.2.2.dev0/test/dpk_connector/core/test_crawler.py DELETED Viewed

@@ -1,28 +0,0 @@
-import pytest
-from dpk_connector.core.crawler import crawl
-def test_invalid_crawler():
-    def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
-        pass
-    with pytest.raises(ValueError) as e:
-        crawl([], on_downloaded)
-    assert isinstance(e.value, ValueError) is True
-    with pytest.raises(ValueError) as e:
-        crawl(["invalidseedurl"], on_downloaded)
-    assert isinstance(e.value, ValueError) is True
-    with pytest.raises(ValueError) as e:
-        crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
-    assert isinstance(e.value, ValueError) is True
-    with pytest.raises(ValueError) as e:
-        crawl(["http://example.com"], on_downloaded, depth_limit=-10)
-    assert isinstance(e.value, ValueError) is True
-    with pytest.raises(ValueError) as e:
-        crawl(["http://example.com"], on_downloaded, download_limit=-10)
-    assert isinstance(e.value, ValueError) is True