PyPI - data-prep-connector - Versions diffs - 0.2.2.dev1__tar.gz → 0.2.3__tar.gz - Mend

data-prep-connector 0.2.2.dev1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/Makefile RENAMED Viewed

@@ -13,7 +13,7 @@ clean::
 setup::
 set-versions: .check-env
-	$(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
+	$(MAKE) TOML_VERSION=$(DPK_CONNECTOR_VERSION) .defaults.update-toml
 build:: build-dist

{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.1
 Name: data_prep_connector
-Version: 0.2.2.dev1
+Version: 0.2.3
 Summary: Scalable and Compliant Web Crawler
 Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
 License: Apache-2.0
-Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
-Requires-Python: >=3.10
+Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps,0b74b5a
+Requires-Python: <3.13,>=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: scrapy>=2.11.2
 Requires-Dist: pydantic>=2.8.1

{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [project]
 name = "data_prep_connector"
-version = "0.2.2.dev1"
-requires-python = ">=3.10"
+version = "0.2.3"
+requires-python = ">=3.10,<3.13"
 keywords = [
     "data",
     "data acquisition",
@@ -12,6 +12,7 @@ keywords = [
     "ai",
     "fine-tuning",
     "llmapps",
+    "0b74b5a"
 ]
 description = "Scalable and Compliant Web Crawler"
 license = { text = "Apache-2.0" }

{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/data_prep_connector.egg-info/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.1
 Name: data_prep_connector
-Version: 0.2.2.dev1
+Version: 0.2.3
 Summary: Scalable and Compliant Web Crawler
 Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
 License: Apache-2.0
-Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
-Requires-Python: >=3.10
+Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps,0b74b5a
+Requires-Python: <3.13,>=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: scrapy>=2.11.2
 Requires-Dist: pydantic>=2.8.1

{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/crawler.py RENAMED Viewed

@@ -85,6 +85,15 @@ def async_crawl(
     disallow_mime_types: Collection[str] = (),
     depth_limit: int = -1,
     download_limit: int = -1,
+    concurrent_requests: int = 16,
+    concurrent_requests_per_domain: int = 8,
+    download_delay: float = 0,
+    randomize_download_delay: bool = True,
+    download_timeout: float = 180,
+    autothrottle_enabled: bool = True,
+    autothrottle_max_delay: float = 60,
+    autothrottle_target_concurrency: float = 8,
+    robots_max_crawl_delay: float = 60,
 ) -> Deferred[None]:
     # Assisted by WCA@IBM
     # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -103,12 +112,21 @@ def async_crawl(
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
         depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
         download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
+        concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
+        concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
+        download_delay (float): The delay between consecutive requests. Default is 0.
+        randomize_download_delay (bool): If specified, the download delay will be randomized between 0.5 * `download_delay and 1.5 * `download_delay`. Default is True.
+        download_timeout (float): The timeout for each request. Default is 180 seconds.
+        autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
+        autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
+        autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
+        robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
     Returns:
         Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.
     """
     if not seed_urls:
-        raise ValueError(f"Empty seed URLs.")
+        raise ValueError("Empty seed URLs.")
     for url in seed_urls:
         if not validate_url(url):
             raise ValueError(f"Seed URL {url} is not valid.")
@@ -119,6 +137,24 @@ def async_crawl(
         raise ValueError(f"Invalid depth limit {depth_limit}")
     if download_limit < -1:
         raise ValueError(f"Invalid download limit {download_limit}")
+    if concurrent_requests < 1:
+        raise ValueError(f"Invalid concurrent requests {concurrent_requests}")
+    if concurrent_requests_per_domain < 1:
+        raise ValueError(
+            f"Invalid concurrent reuqests per domain {concurrent_requests_per_domain}"
+        )
+    if download_delay < 0:
+        raise ValueError(f"Invalid download delay {download_delay}")
+    if download_timeout < 0:
+        raise ValueError(f"Invalid donwload timeout {download_timeout}")
+    if autothrottle_max_delay < 0:
+        raise ValueError(f"Invalid autothrottle max delay {autothrottle_max_delay}")
+    if autothrottle_target_concurrency < 1:
+        raise ValueError(
+            f"Invalid autothrottle target concurrency {autothrottle_target_concurrency}"
+        )
+    if robots_max_crawl_delay < 0:
+        raise ValueError(f"Invalid robots max crawl delay {robots_max_crawl_delay}")
     settings = Settings()
     settings.setmodule("dpk_connector.core.settings", priority="project")
@@ -126,7 +162,7 @@ def async_crawl(
     if user_agent:
         settings.set("USER_AGENT", user_agent, priority="spider")
     if headers:
-        settings.set("DEFAULT_REQUEST_HEADERS", headers)
+        settings.set("DEFAULT_REQUEST_HEADERS", headers, priority="spider")
     if depth_limit == 0:
         depth_limit = -1
     elif depth_limit == -1:
@@ -135,6 +171,25 @@ def async_crawl(
     if download_limit == -1:
         download_limit = 0
     settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider")
+    settings.set("CONCURRENT_REQUESTS", concurrent_requests, priority="spider")
+    settings.set(
+        "CONCURRENT_REQUESTS_PER_DOMAIN",
+        concurrent_requests_per_domain,
+        priority="spider",
+    )
+    settings.set("DOWNLOAD_DELAY", download_delay, priority="spider")
+    settings.set(
+        "RANDOMIZE_DOWNLOAD_DELAY", randomize_download_delay, priority="spider"
+    )
+    settings.set("DOWNLOAD_TIMEOUT", download_timeout, priority="spider")
+    settings.set("AUTOTHROTTLE_ENABLED", autothrottle_enabled, priority="spider")
+    settings.set("AUTOTHROTTLE_MAX_DELAY", autothrottle_max_delay, priority="spider")
+    settings.set(
+        "AUTOTHROTTLE_TARGET_CONCURRENCY",
+        autothrottle_target_concurrency,
+        priority="spider",
+    )
+    settings.set("ROBOTS_MAX_CRAWL_DELAY", robots_max_crawl_delay, priority="spider")
     runner = MultiThreadedCrawlerRunner(settings)
     runner.crawl(
@@ -169,6 +224,15 @@ def crawl(
     disallow_mime_types: Collection[str] = (),
     depth_limit: int = -1,
     download_limit: int = -1,
+    concurrent_requests: int = 16,
+    concurrent_requests_per_domain: int = 8,
+    download_delay: float = 0,
+    randomize_download_delay: bool = True,
+    download_timeout: float = 180,
+    autothrottle_enabled: bool = True,
+    autothrottle_max_delay: float = 60,
+    autothrottle_target_concurrency: float = 8,
+    robots_max_crawl_delay: float = 60,
 ) -> None:
     # Assisted by WCA@IBM
     # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -187,6 +251,15 @@ def crawl(
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
         depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
         download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
+        concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
+        concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
+        download_delay (float): The delay between consecutive requests. Default is 0.
+        randomize_download_delay (bool): If specified, the download delay will be randomized between 0.5 * `download_delay and 1.5 * `download_delay`. Default is True.
+        download_timeout (float): The timeout for each request. Default is 180 seconds.
+        autothrottle_enabled (bool): If specified, autothrottling will be enabled. Default is True.
+        autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
+        autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
+        robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
     Returns:
         None
@@ -209,6 +282,15 @@ def crawl(
         disallow_mime_types,
         depth_limit,
         download_limit,
+        concurrent_requests,
+        concurrent_requests_per_domain,
+        download_delay,
+        randomize_download_delay,
+        download_timeout,
+        autothrottle_enabled,
+        autothrottle_max_delay,
+        autothrottle_target_concurrency,
+        robots_max_crawl_delay,
     )
     d.addBoth(on_completed)
     with condition:

{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/src/dpk_connector/core/settings.py RENAMED Viewed

@@ -16,21 +16,10 @@ SPIDER_MODULES = ["dpk_connector.core.spiders"]
 # Robots
 ROBOTSTXT_OBEY = True
-ROBOTS_MAX_CRAWL_DELAY = 60
 ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser"
-# Downloader parameters
-CONCURRENT_REQUESTS = 20
-CONCURRENT_REQUESTS_PER_DOMAIN = 10
-DOWNLOAD_DELAY = 0
-RANDOMIZE_DOWNLOAD_DELAY = True
-DOWNLOAD_TIMEOUT = 180
 # Autothrottle
-AUTOTHROTTLE_ENABLED = True
 AUTOTHROTTLE_START_DELAY = 0
-AUTOTHROTTLE_MAX_DELAY = 300
-AUTOTHROTTLE_TARGET_CONCURRENCY = 10
 AUTOTHROTTLE_DEBUG = False
 # Middlewares/pipelines/extensions

data_prep_connector-0.2.3/test/dpk_connector/core/test_crawler.py ADDED Viewed

@@ -0,0 +1,69 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+import pytest
+from dpk_connector.core.crawler import crawl
+def test_invalid_crawler():
+    def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
+        pass
+    with pytest.raises(ValueError) as e:
+        crawl([], on_downloaded)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["invalidseedurl"], on_downloaded)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, depth_limit=-10)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, download_limit=-10)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, concurrent_requests=-10)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, concurrent_requests_per_domain=-10)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, download_delay=-0.1)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, download_timeout=-0.1)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, autothrottle_max_delay=-0.1)
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(
+            ["http://example.com"], on_downloaded, autothrottle_target_concurrency=0.5
+        )
+    assert isinstance(e.value, ValueError) is True
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, robots_max_crawl_delay=-0.1)
+    assert isinstance(e.value, ValueError) is True

{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_middlewares.py RENAMED Viewed

@@ -1,3 +1,15 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
 import pytest
 from dpk_connector.core.middlewares import DelayingProtegoRobotParser
 from pytest_mock import MockerFixture

{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_sitemap_spider.py RENAMED Viewed

@@ -1,3 +1,15 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
 from pathlib import Path
 import pytest
@@ -73,9 +85,7 @@ def test_parse(datadir: Path, crawler: Crawler):
         assert body.decode("utf-8") == response_body
         assert headers == {"Content-Type": "text/html"}
-    spider = ConnectorSitemapSpider.from_crawler(
-        crawler, seed_urls=("http://example.com",), callback=callback
-    )
+    spider = ConnectorSitemapSpider.from_crawler(crawler, seed_urls=("http://example.com",), callback=callback)
     request = Request(
         "http://example.com/index.html",
         meta={
@@ -93,9 +103,7 @@ def test_parse(datadir: Path, crawler: Crawler):
     parsed = spider.parse(response)
     item = next(parsed)
-    assert item == ConnectorItem(
-        dropped=False, downloaded=True, system_request=False, sitemap=False
-    )
+    assert item == ConnectorItem(dropped=False, downloaded=True, system_request=False, sitemap=False)
     for next_request in parsed:
         assert isinstance(next_request, Request) is True

{data_prep_connector-0.2.2.dev1 → data_prep_connector-0.2.3}/test/dpk_connector/core/test_utils.py RENAMED Viewed

@@ -1,3 +1,15 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
 # Assisted by WCA@IBM
 # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -20,9 +32,7 @@ from scrapy.http import Request, Response
 def test_get_header_value():
-    response = Response(
-        "http://example.com", headers={"Content-Type": "application/json"}
-    )
+    response = Response("http://example.com", headers={"Content-Type": "application/json"})
     assert get_header_value(response, "Content-Type") == "application/json"

data_prep_connector-0.2.2.dev1/test/dpk_connector/core/test_crawler.py DELETED Viewed

@@ -1,28 +0,0 @@
-import pytest
-from dpk_connector.core.crawler import crawl
-def test_invalid_crawler():
-    def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
-        pass
-    with pytest.raises(ValueError) as e:
-        crawl([], on_downloaded)
-    assert isinstance(e.value, ValueError) is True
-    with pytest.raises(ValueError) as e:
-        crawl(["invalidseedurl"], on_downloaded)
-    assert isinstance(e.value, ValueError) is True
-    with pytest.raises(ValueError) as e:
-        crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",))
-    assert isinstance(e.value, ValueError) is True
-    with pytest.raises(ValueError) as e:
-        crawl(["http://example.com"], on_downloaded, depth_limit=-10)
-    assert isinstance(e.value, ValueError) is True
-    with pytest.raises(ValueError) as e:
-        crawl(["http://example.com"], on_downloaded, download_limit=-10)
-    assert isinstance(e.value, ValueError) is True