PyPI - data-prep-connector - Versions diffs - 0.2.2__py3-none-any.whl - Mend

data-prep-connector 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

data_prep_connector-0.2.2.dist-info/METADATA +54 -0
data_prep_connector-0.2.2.dist-info/RECORD +15 -0
data_prep_connector-0.2.2.dist-info/WHEEL +5 -0
data_prep_connector-0.2.2.dist-info/top_level.txt +1 -0
dpk_connector/__init__.py +13 -0
dpk_connector/core/__init__.py +11 -0
dpk_connector/core/crawler.py +222 -0
dpk_connector/core/item.py +21 -0
dpk_connector/core/logging.py +22 -0
dpk_connector/core/middlewares.py +263 -0
dpk_connector/core/pipelines.py +29 -0
dpk_connector/core/settings.py +70 -0
dpk_connector/core/spiders/__init__.py +11 -0
dpk_connector/core/spiders/sitemap.py +342 -0
dpk_connector/core/utils.py +97 -0

data_prep_connector-0.2.2.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,54 @@
+Metadata-Version: 2.1
+Name: data_prep_connector
+Version: 0.2.2
+Summary: Scalable and Compliant Web Crawler
+Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
+License: Apache-2.0
+Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: scrapy>=2.11.2
+Requires-Dist: pydantic>=2.8.1
+Requires-Dist: tldextract>=5.1.2
+Provides-Extra: dev
+Requires-Dist: twine; extra == "dev"
+Requires-Dist: pytest>=7.3.2; extra == "dev"
+Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
+Requires-Dist: pytest-env>=1.0.0; extra == "dev"
+Requires-Dist: pre-commit>=3.3.2; extra == "dev"
+Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
+Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
+Requires-Dist: pytest-datadir>=1.5.0; extra == "dev"
+Requires-Dist: moto==5.0.5; extra == "dev"
+Requires-Dist: markupsafe==2.0.1; extra == "dev"
+# DPK Connector
+DPK Connector is a scalable and compliant web crawler developed for data acquisition towards LLM development. It is built on [Scrapy](https://scrapy.org/).
+For more details read [the documentation](doc/overview.md).
+## Virtual Environment
+The project uses `pyproject.toml` and a Makefile for operations.
+To do development you should establish the virtual environment
+```shell
+make venv
+```
+and then either activate
+```shell
+source venv/bin/activate
+```
+or set up your IDE to use the venv directory when developing in this project
+## Library Artifact Build and Publish
+To test, build and publish the library
+```shell
+make test build publish
+```
+To bump the version number, edit the Makefile to change VERSION and rerun the above. This will require committing both the `Makefile` and the automatically updated `pyproject.toml` file.
+## How to use
+See [the overview](doc/overview.md).

data_prep_connector-0.2.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+dpk_connector/__init__.py,sha256=xG6Sve8_Vf1RI0uLDIxEMrFM62TUxeTkuYVPPADqETQ,735
+dpk_connector/core/__init__.py,sha256=WrQMZyFE3Gn6fT7oHmL9zBYpJ9lI9j-PpJBqE_a6Zww,658
+dpk_connector/core/crawler.py,sha256=Wss9DKRQkh0lu2e_Ox6usayp3SdcUWXGGIMqTcLzNFE,8720
+dpk_connector/core/item.py,sha256=MZRTwhJJupkC_oegEfzrb-YsWP0TRv09Y2rwEv71uII,841
+dpk_connector/core/logging.py,sha256=aV1SNJUPgJuoiZ6wwlZcHTHigLB0vRDT2UfM0RWeWW4,981
+dpk_connector/core/middlewares.py,sha256=dB44kOG1wU1yCp7zNxe66DB37rTmYnsQokv99Bng-8k,9942
+dpk_connector/core/pipelines.py,sha256=W3EYF6l8hyV2FccJ2Mj2FL28RUtQoHKqSps-SYV1Lpo,1115
+dpk_connector/core/settings.py,sha256=BhATbs9UEtTMWpUMpZUY66b-brRUmG-d7danm_FYAD8,2275
+dpk_connector/core/utils.py,sha256=O6MI9Gz6TvncTv0isaxIvE29q-CnXLTN3cx4abEG2VE,3034
+dpk_connector/core/spiders/__init__.py,sha256=WrQMZyFE3Gn6fT7oHmL9zBYpJ9lI9j-PpJBqE_a6Zww,658
+dpk_connector/core/spiders/sitemap.py,sha256=SYT89P3V2QpHvE_PuEdBJlabKCswi_0W6A4sOqOnvXc,12600
+data_prep_connector-0.2.2.dist-info/METADATA,sha256=b_NuUk_6AvbbWe8ekPYK4K6KLX2CC6KTZCnpb_ZXYcU,1828
+data_prep_connector-0.2.2.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
+data_prep_connector-0.2.2.dist-info/top_level.txt,sha256=V5veaYVXWTfjj98ntRCsHK7A36nzNprbMwB8PRrtsN4,14
+data_prep_connector-0.2.2.dist-info/RECORD,,

data_prep_connector-0.2.2.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (75.2.0)
+Root-Is-Purelib: true
+Tag: py3-none-any

data_prep_connector-0.2.2.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ dpk_connector

dpk_connector/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+from dpk_connector.core.crawler import async_crawl, crawl, shutdown  # noqa

dpk_connector/core/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################

dpk_connector/core/crawler.py ADDED Viewed

@@ -0,0 +1,222 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+import threading
+from typing import Any, Callable, Collection, Type, cast
+from scrapy import Spider
+from scrapy.crawler import Crawler, CrawlerRunner
+from scrapy.settings import Settings
+from twisted.internet.defer import Deferred
+from dpk_connector.core.utils import validate_domain, validate_url
+# Guards the two module-level reactor flags below; taken in _start_reactor()
+# and in MultiThreadedCrawlerRunner._create_crawler().
+_lock = threading.Lock()
+# Set to True after the first Crawler has been constructed (that one is created with init_reactor=True).
+_reactor_initialized = False
+# Set to True after the reactor thread has been started.
+_reactor_started = False
+def _run_reactor():
+    """Run the Twisted reactor until it is stopped; used as the target of the reactor thread."""
+    from twisted.internet import reactor as twisted_reactor
+    # Signal handlers are not installed by the reactor (installSignalHandlers=False).
+    twisted_reactor.run(installSignalHandlers=False)
+# Daemon thread that hosts the reactor. It is only created here; _start_reactor() starts it.
+_reactor_thread: threading.Thread = threading.Thread(
+    target=_run_reactor,
+    daemon=True,
+)
+def _start_reactor():
+    """Start the reactor thread on the first call; subsequent calls do nothing."""
+    global _reactor_started
+    with _lock:
+        if _reactor_started:
+            return
+        _reactor_thread.start()
+        _reactor_started = True
+def _stop_reactor():
+    """
+    Ask the Twisted reactor to stop, if it is currently running.
+    The reactor runs in `_reactor_thread`, whereas this function is normally invoked from
+    another thread. Twisted APIs are not thread-safe, so the stop request is handed over to
+    the reactor thread with `callFromThread` instead of calling `reactor.stop()` directly.
+    """
+    from twisted.internet import reactor
+    def _stop():
+        try:
+            reactor.stop()
+        except RuntimeError:
+            # The reactor is already stopped or stopping (ReactorNotRunning is a RuntimeError).
+            pass
+    # Do not queue a stop request for a reactor that is not running: the queued call
+    # would otherwise be executed as soon as the reactor is started later.
+    if reactor.running:
+        reactor.callFromThread(_stop)
+class MultiThreadedCrawlerRunner(CrawlerRunner):
+    """CrawlerRunner that passes init_reactor=True only to the first Crawler it creates in this process."""
+    def _create_crawler(self, spidercls: str | type[Spider]) -> Crawler:
+        """
+        Build a Crawler for the given spider class or spider name.
+        A spider name is resolved through the spider loader. The module-level lock keeps the
+        "first crawler" decision and the flag update atomic across threads.
+        """
+        global _reactor_initialized
+        spider_class = (
+            self.spider_loader.load(spidercls)
+            if isinstance(spidercls, str)
+            else spidercls
+        )
+        with _lock:
+            crawler = Crawler(
+                cast(Type[Spider], spider_class),
+                self.settings,
+                not _reactor_initialized,
+            )
+            _reactor_initialized = True
+        return crawler
+def async_crawl(
+    seed_urls: Collection[str],
+    on_downloaded: Callable[[str, bytes, dict[str, str]], None],
+    user_agent: str = "",
+    headers: dict[str, str] | None = None,
+    allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
+    path_focus: bool = False,
+    allow_mime_types: Collection[str] = (
+        "application/pdf",
+        "text/html",
+        "text/markdown",
+        "text/plain",
+    ),
+    disallow_mime_types: Collection[str] = (),
+    depth_limit: int = -1,
+    download_limit: int = -1,
+) -> Deferred[None]:
+    # Assisted by WCA@IBM
+    # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
+    """
+    Do crawl asynchronously.
+    Parameters:
+        seed_urls (Collection[str]): A collection of seed URLs to start the crawl from.
+        on_downloaded (Callable[[str, bytes, dict[str, str]], None]): The callback function to be called for each downloaded page.
+        user_agent (str): The user agent string to use for the crawler. When empty, the USER_AGENT setting is left untouched.
+        headers (dict[str, str] | None): A dictionary of headers stored as the DEFAULT_REQUEST_HEADERS setting. Default is None, which leaves the setting untouched (an empty dictionary behaves the same).
+        allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
+        path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
+        allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
+        disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
+        depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
+        download_limit (int): The maximum number of pages to download. Default is -1, which means no limit (0 is passed to Scrapy unchanged and therefore also disables the limit). This is a soft limit, meaning that a crawler may download more pages than the specified limit.
+    Returns:
+        Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.
+    Raises:
+        ValueError: If `seed_urls` is empty, a seed URL or allowed domain fails validation, or a limit is smaller than -1.
+    """
+    if not seed_urls:
+        raise ValueError("Empty seed URLs.")
+    for url in seed_urls:
+        if not validate_url(url):
+            raise ValueError(f"Seed URL {url} is not valid.")
+    for domain in allow_domains:
+        if not validate_domain(domain):
+            raise ValueError(f"Allow domain {domain} is not valid.")
+    if depth_limit < -1:
+        raise ValueError(f"Invalid depth limit {depth_limit}")
+    if download_limit < -1:
+        raise ValueError(f"Invalid download limit {download_limit}")
+    settings = Settings()
+    settings.setmodule("dpk_connector.core.settings", priority="project")
+    if user_agent:
+        settings.set("USER_AGENT", user_agent, priority="spider")
+    if headers:
+        # Store a copy so that later mutation of the caller's dict cannot affect the crawl.
+        settings.set("DEFAULT_REQUEST_HEADERS", dict(headers))
+    # Translate between this API and Scrapy's DEPTH_LIMIT, where 0 means "no limit":
+    # the API's -1 (no limit) becomes 0, and the API's 0 becomes -1.
+    if depth_limit == 0:
+        depth_limit = -1
+    elif depth_limit == -1:
+        depth_limit = 0
+    settings.set("DEPTH_LIMIT", depth_limit, priority="spider")
+    # CLOSESPIDER_ITEMCOUNT uses 0 to disable the limit.
+    if download_limit == -1:
+        download_limit = 0
+    settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider")
+    runner = MultiThreadedCrawlerRunner(settings)
+    runner.crawl(
+        "dpk-connector-sitemap",
+        seed_urls=seed_urls,
+        callback=on_downloaded,
+        allow_domains=allow_domains,
+        subdomain_focus=subdomain_focus,
+        path_focus=path_focus,
+        allow_mime_types=allow_mime_types,
+        disallow_mime_types=disallow_mime_types,
+        disable_sitemap_search=True,
+    )
+    _start_reactor()
+    return runner.join()
+def crawl(
+    seed_urls: Collection[str],
+    on_downloaded: Callable[[str, bytes, dict[str, str]], None],
+    user_agent: str = "",
+    headers: dict[str, str] | None = None,
+    allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
+    path_focus: bool = False,
+    allow_mime_types: Collection[str] = (
+        "application/pdf",
+        "text/html",
+        "text/markdown",
+        "text/plain",
+    ),
+    disallow_mime_types: Collection[str] = (),
+    depth_limit: int = -1,
+    download_limit: int = -1,
+) -> None:
+    # Assisted by WCA@IBM
+    # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
+    """
+    Do crawl synchronously: start the crawl with `async_crawl` and block until its deferred fires.
+    Parameters:
+        seed_urls (Collection[str]): A collection of seed URLs to start the crawl from.
+        on_downloaded (Callable[[str, bytes, dict[str, str]], None]): The callback function to be called for each downloaded page.
+        user_agent (str): The user agent string to use for the crawler. When empty, the USER_AGENT setting is left untouched.
+        headers (dict[str, str] | None): A dictionary of headers to send with each request. Default is None (an empty dictionary behaves the same).
+        allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
+        path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
+        allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
+        disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
+        depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
+        download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
+    Returns:
+        None
+    Raises:
+        ValueError: Propagated from `async_crawl` when an argument fails validation.
+    """
+    # An Event remembers that it was set, so completion is observed even when the deferred
+    # fires before this thread starts waiting. A Condition.notify() issued before
+    # Condition.wait() is lost and would leave the caller blocked forever.
+    finished = threading.Event()
+    def on_completed(result: Any):
+        # Returning None consumes both success and failure results of the deferred.
+        finished.set()
+    d = async_crawl(
+        seed_urls,
+        on_downloaded,
+        user_agent,
+        headers,
+        allow_domains,
+        subdomain_focus,
+        path_focus,
+        allow_mime_types,
+        disallow_mime_types,
+        depth_limit,
+        download_limit,
+    )
+    d.addBoth(on_completed)
+    finished.wait()
+def shutdown():
+    """
+    Shutdown all crawls.
+    Delegates to `_stop_reactor()`, i.e. stops the Twisted reactor shared by all crawls.
+    """
+    _stop_reactor()

dpk_connector/core/item.py ADDED Viewed

@@ -0,0 +1,21 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+from dataclasses import dataclass
+@dataclass
+class ConnectorItem:
+    """Status flags for one item yielded by the spider; every flag defaults to False."""
+    # NOTE(review): not read by the pipeline or the stats middlewares of this package;
+    # presumably marks a response the spider discarded — confirm in the spider.
+    dropped: bool = False
+    # Items with downloaded=False are dropped by DropPipeline and not counted as downloaded.
+    downloaded: bool = False
+    # Items with system_request=True are dropped by DropPipeline and excluded from the stats.
+    system_request: bool = False
+    # Items with sitemap=True increment the "dpk_connector/downloaded/sitemap" counter.
+    sitemap: bool = False

dpk_connector/core/logging.py ADDED Viewed

@@ -0,0 +1,22 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+from scrapy.logformatter import LogFormatter as ScrapyLogFormatter
+class QuietLogFormatter(ScrapyLogFormatter):
+    """Log formatter that emits the "scraped" message only when LOG_SCRAPED_ITEMS is enabled."""
+    def scraped(self, item, response, spider):
+        """Return the parent's "scraped" log entry, or None when LOG_SCRAPED_ITEMS is false."""
+        if not spider.settings.getbool("LOG_SCRAPED_ITEMS"):
+            return None
+        return super().scraped(item, response, spider)

dpk_connector/core/middlewares.py ADDED Viewed

@@ -0,0 +1,263 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+import logging
+from typing import Any, Generator, Iterable
+from scrapy import Spider, signals
+from scrapy.crawler import Crawler
+from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware
+from scrapy.downloadermiddlewares.stats import DownloaderStats
+from scrapy.exceptions import NotConfigured
+from scrapy.http import Request, Response
+from scrapy.http.request import NO_CALLBACK
+from scrapy.robotstxt import ProtegoRobotParser, RobotParser
+from scrapy.statscollectors import StatsCollector
+from scrapy.utils.httpobj import urlparse_cached
+from scrapy.utils.python import to_unicode
+from twisted.internet.defer import Deferred
+from dpk_connector.core.item import ConnectorItem
+from dpk_connector.core.utils import get_content_type, get_etld1, get_mime_type, get_netloc
+logger = logging.getLogger(__name__)
+class DelayingProtegoRobotParser(ProtegoRobotParser):
+    """
+    Robots.txt parser supporting crawl-delay/request-rate.
+    """
+    def __init__(self, robotstxt_body: str | bytes, spider: Spider):
+        super().__init__(robotstxt_body, spider)
+        # Upper bound for any delay derived from robots.txt; 60 when the setting is absent.
+        self.max_delay = spider.crawler.settings.getfloat("ROBOTS_MAX_CRAWL_DELAY", 60)
+    def delay(self, user_agent: str | bytes) -> float | None:
+        """
+        Return the delay between requests that robots.txt asks of `user_agent`.
+        The result is the larger of the crawl-delay and the interval implied by the
+        request-rate (seconds / requests), capped at `self.max_delay`. Returns None when
+        robots.txt specifies neither directive for the user agent.
+        """
+        user_agent = to_unicode(user_agent)
+        crawl_delay = self.rp.crawl_delay(user_agent)
+        request_rate = self.rp.request_rate(user_agent)
+        if crawl_delay is None and request_rate is None:
+            return None
+        crawl_delay = crawl_delay or 0
+        if not request_rate:
+            rate_delay = 0
+        elif request_rate.requests:
+            rate_delay = request_rate.seconds / request_rate.requests
+        else:
+            # A rate of zero requests per period would raise ZeroDivisionError. Treat it as
+            # the most conservative delay; it is capped at max_delay below.
+            rate_delay = float("inf")
+        return min(max(crawl_delay, rate_delay), self.max_delay)
+class DelayingRobotsTxtMiddleware(RobotsTxtMiddleware):
+    """
+    Downloader middleware to follow crawl-delay/request-rate directives of robots.txt.
+    """
+    def __init__(self, crawler: Crawler, download_timeout: float):
+        super().__init__(crawler)
+        # Timeout applied to the robots.txt requests issued by robot_parser().
+        self.download_timeout = download_timeout
+        # netloc -> delay taken from robots.txt (0.0 when robots.txt asks for none).
+        self._delays: dict[str, float] = {}
+        crawler.signals.connect(
+            self._request_reached_downloader, signal=signals.request_reached_downloader
+        )
+    @classmethod
+    def from_crawler(cls, crawler: Crawler):
+        """Create the middleware, using ROBOTSTXT_DOWNLOAD_TIMEOUT or, when that is unset/zero, DOWNLOAD_TIMEOUT."""
+        download_timeout = crawler.settings.getfloat("ROBOTSTXT_DOWNLOAD_TIMEOUT")
+        if not download_timeout:
+            download_timeout = crawler.settings.getfloat("DOWNLOAD_TIMEOUT")
+        return cls(crawler, download_timeout)
+    def _request_reached_downloader(self, request: Request, spider: Spider) -> None:
+        """Signal handler: raise the download slot's delay to the robots.txt delay recorded for the request's netloc."""
+        key = request.meta.get("download_slot")
+        if slot := self.crawler.engine.downloader.slots.get(key):
+            parts = urlparse_cached(request)
+            domain = parts.netloc
+            if domain in self._delays:
+                delay = self._delays[domain]
+                # Only ever increase the slot delay; a larger delay already set on the slot is kept.
+                if delay and slot.delay < delay:
+                    slot.delay = delay
+                    slot.randomize_delay = False
+    def process_request_2(
+        self, rp: RobotParser, request: Request, spider: Spider
+    ) -> None:
+        """Run the parent check, then record the robots.txt delay for the request's netloc (once per netloc)."""
+        super().process_request_2(rp, request, spider)
+        if isinstance(rp, DelayingProtegoRobotParser):
+            parts = urlparse_cached(request)
+            domain = parts.netloc
+            if domain not in self._delays:
+                # Prefer the dedicated robots.txt user agent, then the request's own
+                # User-Agent header, then the default user agent.
+                user_agent = self._robotstxt_useragent
+                if not user_agent:
+                    user_agent = request.headers.get(
+                        b"User-Agent", self._default_useragent
+                    )
+                # None (no directive) is stored as 0.0 so the lookup is not repeated.
+                delay = rp.delay(user_agent) or 0.0
+                self._delays[domain] = delay
+                if delay:
+                    logger.info(
+                        f"Set download delay to {delay} according to robots.txt. domain: {domain}"
+                    )
+    def robot_parser(self, request: Request, spider: Spider):
+        """
+        Return the robots.txt parser for the request's netloc, or a Deferred that fires with it.
+        The first request for a netloc triggers the robots.txt download; that download is
+        tagged with "system_request" and uses `self.download_timeout`.
+        """
+        url = urlparse_cached(request)
+        netloc = url.netloc
+        if netloc not in self._parsers:
+            # Placeholder Deferred marks the download as in progress.
+            self._parsers[netloc] = Deferred()
+            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
+            robotsreq = Request(
+                robotsurl,
+                priority=self.DOWNLOAD_PRIORITY,
+                meta={
+                    "dont_obey_robotstxt": True,
+                    "system_request": True,
+                    "download_timeout": self.download_timeout,
+                },
+                callback=NO_CALLBACK,
+            )
+            dfd = self.crawler.engine.download(robotsreq)
+            dfd.addCallback(self._parse_robots, netloc, spider)
+            dfd.addErrback(self._logerror, robotsreq, spider)
+            dfd.addErrback(self._robots_error, netloc)
+            self.crawler.stats.inc_value("robotstxt/request_count")
+        if isinstance(self._parsers[netloc], Deferred):
+            # Download still pending: hand out a fresh Deferred chained to the shared one.
+            # cb returns the result so later callbacks on the shared Deferred still receive it.
+            d = Deferred()
+            def cb(result):
+                d.callback(result)
+                return result
+            self._parsers[netloc].addCallback(cb)
+            return d
+        return self._parsers[netloc]
+def _update_request_stats(
+    stats: StatsCollector,
+    request: Request,
+    spider: Spider,
+    prefix: str,
+    skip_domains: bool = False,
+):
+    """
+    Increment the request counters under `prefix`.
+    Counts the request itself and its proxy (the string "None" when request.meta has no
+    "proxy" entry). Unless `skip_domains` is set, also counts the request's domain
+    (from `get_etld1`) and subdomain (from `get_netloc`).
+    """
+    def bump(key: str) -> None:
+        stats.inc_value(key, spider=spider)
+    bump(prefix)
+    bump(f"{prefix}/proxy/{request.meta.get('proxy', 'None')}")
+    if skip_domains:
+        return
+    bump(f"{prefix}/domain/{get_etld1(to_unicode(request.url))}")
+    bump(f"{prefix}/subdomain/{get_netloc(request)}")
+def _update_stats(
+    stats: StatsCollector,
+    request: Request,
+    response: Response,
+    spider: Spider,
+    prefix: str,
+    skip_domains: bool = False,
+):
+    """
+    Increment the request counters plus the response's MIME type and status code counters.
+    A response without a content type is counted under "<prefix>/mime_type/None".
+    """
+    _update_request_stats(stats, request, spider, prefix, skip_domains)
+    content_type = get_content_type(response)
+    mime_label = get_mime_type(content_type) if content_type else "None"
+    stats.inc_value(f"{prefix}/mime_type/{mime_label}", spider=spider)
+    stats.inc_value(f"{prefix}/status_code/{response.status}", spider=spider)
+def _update_sitemap_stats(stats: StatsCollector, spider: Spider, prefix: str):
+    """Increment the "<prefix>/sitemap" counter."""
+    sitemap_key = f"{prefix}/sitemap"
+    stats.inc_value(sitemap_key, spider=spider)
+class ConnectorRequestedStats(DownloaderStats):
+    """
+    Downloader middleware that records "dpk_connector/requested" and
+    "dpk_connector/accessed" stats on top of the parent's downloader stats.
+    """
+    def __init__(self, stats: StatsCollector, skip_domains: bool):
+        super().__init__(stats)
+        self.skip_domains = skip_domains
+    @classmethod
+    def from_crawler(cls, crawler: Crawler):
+        """Build the middleware from settings; raises NotConfigured when DOWNLOADER_STATS is off."""
+        settings = crawler.settings
+        if settings.getbool("DOWNLOADER_STATS"):
+            return cls(crawler.stats, settings.getbool("STATS_SKIP_DOMAINS"))
+        raise NotConfigured
+    def process_request(self, request: Request, spider: Spider):
+        """Count an outgoing request; requests flagged "system_request" are not counted."""
+        super().process_request(request, spider)
+        requested = "dpk_connector/requested"
+        meta = request.meta
+        is_system = meta.get("system_request", False)
+        if not is_system:
+            _update_request_stats(
+                self.stats, request, spider, requested, self.skip_domains
+            )
+        if meta.get("sitemap", False):
+            _update_sitemap_stats(self.stats, spider, requested)
+    def process_response(self, request: Request, response: Response, spider: Spider):
+        """Count a received response and return whatever the parent middleware returns."""
+        outcome = super().process_response(request, response, spider)
+        accessed = "dpk_connector/accessed"
+        meta = request.meta
+        is_system = meta.get("system_request", False)
+        if not is_system:
+            _update_stats(
+                self.stats, request, response, spider, accessed, self.skip_domains
+            )
+        if meta.get("sitemap", False):
+            _update_sitemap_stats(self.stats, spider, accessed)
+        return outcome
+class ConnectorDownloadedStats:
+    """
+    Spider middleware that records "dpk_connector/downloaded" stats for the items a spider yields.
+    """
+    def __init__(self, stats: StatsCollector, skip_domains: bool):
+        self.stats = stats
+        self.skip_domains = skip_domains
+    @classmethod
+    def from_crawler(cls, crawler: Crawler):
+        """Build the middleware from settings; raises NotConfigured when DOWNLOADER_STATS is off."""
+        settings = crawler.settings
+        if settings.getbool("DOWNLOADER_STATS"):
+            return cls(crawler.stats, settings.getbool("STATS_SKIP_DOMAINS"))
+        raise NotConfigured
+    def process_spider_output(
+        self,
+        response: Response,
+        result: Iterable[Request | ConnectorItem],
+        spider: Spider,
+    ) -> Generator[Any, Any, None]:
+        """Pass every spider output through unchanged, updating stats for ConnectorItem entries."""
+        downloaded = "dpk_connector/downloaded"
+        for entry in result:
+            if not isinstance(entry, ConnectorItem):
+                yield entry
+                continue
+            if not entry.system_request and entry.downloaded:
+                _update_stats(
+                    self.stats,
+                    response.request,
+                    response,
+                    spider,
+                    downloaded,
+                    self.skip_domains,
+                )
+            if entry.sitemap:
+                _update_sitemap_stats(self.stats, spider, downloaded)
+            yield entry

dpk_connector/core/pipelines.py ADDED Viewed

@@ -0,0 +1,29 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+from typing import Any
+from scrapy import Spider
+from scrapy.crawler import Crawler
+from scrapy.exceptions import DropItem
+from dpk_connector.core.item import ConnectorItem
+class DropPipeline:
+    """Item pipeline that lets through only items that are downloaded and not system requests."""
+    @classmethod
+    def from_crawler(cls, crawler: Crawler):
+        """Create the pipeline; no settings are read from the crawler."""
+        return cls()
+    def process_item(self, item: ConnectorItem, spider: Spider) -> Any:
+        """Return `item` when it should be kept, otherwise raise DropItem."""
+        keep = (not item.system_request) and item.downloaded
+        if keep:
+            return item
+        raise DropItem

dpk_connector/core/settings.py ADDED Viewed

@@ -0,0 +1,70 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+# Scrapy settings module for the connector; loaded by
+# dpk_connector.core.crawler.async_crawl() at "project" priority.
+BOT_NAME = "dpk-connector"
+SPIDER_MODULES = ["dpk_connector.core.spiders"]
+# Robots
+ROBOTSTXT_OBEY = True
+# Cap applied by DelayingProtegoRobotParser to delays taken from robots.txt.
+ROBOTS_MAX_CRAWL_DELAY = 60
+ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser"
+# Downloader parameters
+CONCURRENT_REQUESTS = 20
+CONCURRENT_REQUESTS_PER_DOMAIN = 10
+DOWNLOAD_DELAY = 0
+RANDOMIZE_DOWNLOAD_DELAY = True
+# Also used for robots.txt downloads when ROBOTSTXT_DOWNLOAD_TIMEOUT is not set.
+DOWNLOAD_TIMEOUT = 180
+# Autothrottle
+AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_START_DELAY = 0
+AUTOTHROTTLE_MAX_DELAY = 300
+AUTOTHROTTLE_TARGET_CONCURRENCY = 10
+AUTOTHROTTLE_DEBUG = False
+# Middlewares/pipelines/extensions
+SPIDER_MIDDLEWARES = {
+    "dpk_connector.core.middlewares.ConnectorDownloadedStats": 10,
+}
+# The built-in robots.txt and downloader-stats middlewares are disabled (None) and
+# replaced by the connector's subclasses.
+DOWNLOADER_MIDDLEWARES = {
+    "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": None,
+    "dpk_connector.core.middlewares.DelayingRobotsTxtMiddleware": 100,
+    "scrapy.downloadermiddlewares.stats.DownloaderStats": None,
+    "dpk_connector.core.middlewares.ConnectorRequestedStats": 850,
+}
+ITEM_PIPELINES = {
+    "dpk_connector.core.pipelines.DropPipeline": 100,
+}
+# Telnet console and memory debugger extensions are disabled.
+EXTENSIONS = {
+    "scrapy.extensions.telnet.TelnetConsole": None,
+    "scrapy.extensions.memdebug.MemoryDebugger": None,
+}
+# Queue
+SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.LifoMemoryQueue"
+# Logging
+LOG_LEVEL = "INFO"
+# Read by QuietLogFormatter: per-item "scraped" log messages are emitted only when True.
+LOG_SCRAPED_ITEMS = False
+LOG_FORMATTER = "dpk_connector.core.logging.QuietLogFormatter"
+# Periodic logging
+PERIODIC_LOG_DELTA = True
+PERIODIC_LOG_STATS = True
+PERIODIC_LOG_TIMING_ENABLED = True
+LOGSTATS_INTERVAL = 300
+# Misc
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

dpk_connector/core/spiders/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################

dpk_connector/core/spiders/sitemap.py ADDED Viewed

@@ -0,0 +1,342 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+import logging
+from typing import Any, Callable, Collection, Generator
+from urllib.parse import ParseResult
+from scrapy import Request
+from scrapy.http import Response
+from scrapy.link import Link
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import SitemapSpider
+from scrapy.spiders.sitemap import iterloc
+from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
+from dpk_connector.core.item import ConnectorItem
+from dpk_connector.core.utils import (
+    get_base_url,
+    get_content_type,
+    get_etld1,
+    get_focus_path,
+    get_fqdn,
+    is_allowed_path,
+    urlparse_cached,
+)
+class BaseSitemapSpider(SitemapSpider):
+    SITEMAP_DOWNLOAD_PRIORITY = 10
+    name = "base-sitemap"
+    def __init__(
+        self,
+        seed_urls: Collection[str],
+        allow_domains: Collection[str] = (),
+        subdomain_focus: bool = False,
+        path_focus: bool = False,
+        allow_mime_types: Collection[str] = (),
+        disallow_mime_types: Collection[str] = (),
+        depth_limit: int = 0,
+        disable_sitemap_search: bool = False,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.depth_limit = depth_limit
+        self.sitemap_search = (not disable_sitemap_search) and depth_limit >= 0
+        # Build sitemap url candidates
+        self.input_seed_urls = seed_urls
+        sitemap_urls = []
+        sitemaps_seen = []
+        for seed_url in seed_urls:
+            parts = urlparse_cached(seed_url)
+            if seed_url.endswith(
+                (
+                    ".xml",
+                    ".xml.gz",
+                    "/robots.txt",
+                    "robots.txt/",
+                    "/sitemap",
+                    "/sitemap/",
+                )
+            ):
+                sitemap_urls.append(seed_url)
+            elif self.sitemap_search:
+                sitemap_urls.extend(self._get_sitemap_urls(parts))
+            sitemaps_seen.append(parts.netloc)
+        self.seed_urls = set(seed_urls) - set(sitemap_urls)
+        self.sitemap_urls = set(sitemap_urls)
+        self.sitemaps_seen = set(sitemaps_seen)
+        # Extract focus paths
+        self.focus_paths: set[str] = set()
+        if path_focus:
+            for seed_url in self.seed_urls:
+                path = get_focus_path(seed_url)
+                if path is not None:
+                    self.focus_paths.add(path)
+        # Domains and mime types filtering
+        if allow_domains:
+            self.allowed_domains = set(allow_domains)
+        elif subdomain_focus:
+            self.allowed_domains = set()
+            for url in seed_urls:
+                if fqdn := get_fqdn(url):
+                    self.allowed_domains.add(fqdn)
+        else:
+            self.allowed_domains = set(get_etld1(url) for url in seed_urls)
+        self.allow_mime_types = set(
+            [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
+        )
+        self.disallow_mime_types = set(
+            [m.lower() for m in disallow_mime_types]
+            if len(disallow_mime_types) > 0
+            else ()
+        )
+        # Link extraction from html
+        self.link_extractor = LinkExtractor(
+            allow_domains=self.allowed_domains,
+            unique=True,
+            deny_extensions=(),
+            tags=("a", "area", "link"),
+        )
+        self.log(
+            f"Seed URLs: {self.seed_urls}, sitemap URLs: {self.sitemap_urls}, allow domains: {self.allowed_domains}, focus paths: {self.focus_paths}, allow mime types: {self.allow_mime_types}, disallow mime types: {self.disallow_mime_types}, depth limit: {self.depth_limit}, sitemap search: {self.sitemap_search}",
+            logging.INFO,
+        )
+    def _get_sitemap_urls(self, parts: ParseResult) -> list[str]:
+        base_url = get_base_url(parts)
+        sitemap_variations = (
+            "robots.txt",
+            "robots.txt/",
+            "sitemap.xml",
+            "sitemap_index.xml",
+            "sitemapindex.xml",
+            "sitemap",
+            "sitemap-index.xml",
+            "sitemap/index.xml",
+            "sitemap/sitemap.xml",
+            "sitemap1.xml",
+        )
+        return [f"{base_url}/{sitemap}" for sitemap in sitemap_variations]
+    def start_requests(self):
+        for url in self.sitemap_urls:
+            yield Request(
+                url,
+                self._parse_sitemap,
+                priority=self.SITEMAP_DOWNLOAD_PRIORITY,
+                meta={
+                    "seed_url": url,
+                    "previous_url": "",
+                    "system_request": True,
+                    "sitemap": True,
+                },
+            )
+        for url in self.seed_urls:
+            yield Request(
+                url,
+                self.parse,
+                meta={
+                    "seed_url": url,
+                    "previous_url": "",
+                },
+            )
+    def _parse_sitemap(self, response: Response):
+        yield ConnectorItem(
+            dropped=False, downloaded=False, system_request=True, sitemap=True
+        )
+        seed_url = response.meta["seed_url"]
+        if response.url.endswith("/robots.txt") or response.url.endswith(
+            "/robots.txt/"
+        ):
+            for url in sitemap_urls_from_robots(response.text, base_url=response.url):
+                yield Request(
+                    url,
+                    callback=self._parse_sitemap,
+                    priority=self.SITEMAP_DOWNLOAD_PRIORITY,
+                    meta={
+                        "seed_url": seed_url,
+                        "previous_url": response.url,
+                        "system_request": True,
+                        "sitemap": True,
+                    },
+                )
+        else:
+            body = self._get_sitemap_body(response)
+            if not body:
+                self.log(
+                    f"Ignoring invalid sitemap: {response}",
+                    logging.WARN,
+                    extra={"spider": self},
+                )
+                return
+            s = Sitemap(body)
+            it = self.sitemap_filter(s)
+            if s.type == "sitemapindex":
+                for loc in iterloc(it, self.sitemap_alternate_links):
+                    if any(
+                        x.search(loc) for x in self._follow
+                    ) and self._is_allowed_path(loc):
+                        yield Request(
+                            loc,
+                            callback=self._parse_sitemap,
+                            priority=self.SITEMAP_DOWNLOAD_PRIORITY,
+                            meta={
+                                "seed_url": seed_url,
+                                "previous_url": response.url,
+                                "system_request": True,
+                                "sitemap": True,
+                            },
+                        )
+            elif s.type == "urlset":
+                for loc in iterloc(it, self.sitemap_alternate_links):
+                    for r, c in self._cbs:
+                        if r.search(loc) and self._is_allowed_path(loc):
+                            yield Request(
+                                loc,
+                                callback=c,
+                                meta={
+                                    "seed_url": seed_url,
+                                    "previous_url": response.url,
+                                },
+                            )
+                            break
+    def _is_allowed_path(self, input: str | Request | Response) -> bool:
+        return is_allowed_path(input, self.focus_paths)
+    def _is_allowed_content_type(self, content_type: str) -> bool:
+        return any([mtype in content_type for mtype in self.allow_mime_types])
+    def _is_disallowed_content_type(self, content_type: str) -> bool:
+        return any([mtype in content_type for mtype in self.disallow_mime_types])
+    def _should_download(self, content_type: str | None) -> bool:
+        if (not self.allow_mime_types) and (not self.disallow_mime_types):
+            return True
+        if not content_type:
+            return False
+        ctype = content_type.lower()
+        if not self.allow_mime_types:
+            return not self._is_disallowed_content_type(ctype)
+        if not self.disallow_mime_types:
+            return self._is_allowed_content_type(ctype)
+        return (
+            not self._is_disallowed_content_type(ctype)
+        ) and self._is_allowed_content_type(ctype)
+    def _explore_sitemap(self, response: Response) -> Generator[Request, Any, None]:
+        depth = response.meta.get("depth", 0)
+        depth_limit = self.depth_limit
+        if (depth_limit == 0 or depth < depth_limit) and self.sitemap_search:
+            parts = urlparse_cached(response)
+            domain = parts.netloc
+            if domain not in self.sitemaps_seen:
+                self.log(
+                    f"New domain {domain} found. Search for sitemap.", logging.INFO
+                )
+                self.sitemaps_seen.add(domain)
+                for sitemap in self._get_sitemap_urls(parts):
+                    yield Request(
+                        sitemap,
+                        callback=self._parse_sitemap,
+                        priority=self.SITEMAP_DOWNLOAD_PRIORITY,
+                        meta={
+                            "seed_url": response.meta["seed_url"],
+                            "previous_url": response.url,
+                            "system_request": True,
+                            "sitemap": True,
+                        },
+                    )
+    def _explore_links(
+        self, response: Response, links: list[Link]
+    ) -> Generator[Request, Any, None]:
+        depth = response.meta.get("depth", 0)
+        depth_limit = self.depth_limit
+        if depth_limit == 0 or depth < depth_limit:
+            for link in links:
+                if self._is_allowed_path(link.url):
+                    yield Request(
+                        link.url,
+                        callback=self.parse,
+                        meta={
+                            "seed_url": response.meta["seed_url"],
+                            "previous_url": response.url,
+                        },
+                    )
+class ConnectorSitemapSpider(BaseSitemapSpider):
+    name = "dpk-connector-sitemap"
+    def __init__(
+        self,
+        callback: Callable[[str, bytes, dict[str, str]], None],
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.callback = callback
+    def parse(
+        self, response: Response, **kwargs: Any
+    ) -> Generator[Request | ConnectorItem, Any, None]:
+        drop = False
+        content_type = get_content_type(response)
+        if not content_type:
+            drop = True
+        is_html = "text/html" in content_type.lower()
+        should_download = self._should_download(content_type)
+        if not (is_html or should_download):
+            drop = True
+        if drop:
+            yield ConnectorItem(
+                dropped=True, downloaded=False, system_request=False, sitemap=False
+            )
+            return
+        # Download contents
+        if should_download:
+            self.callback(
+                str(response.url), response.body, response.headers.to_unicode_dict()
+            )
+            # to count up downloaded pages and collect stats
+            yield ConnectorItem(
+                dropped=False, downloaded=True, system_request=False, sitemap=False
+            )
+        else:
+            yield ConnectorItem(
+                dropped=False, downloaded=False, system_request=False, sitemap=False
+            )
+        # Search for sitemap
+        yield from self._explore_sitemap(response)
+        # Extract links and dispatch them
+        links = self.link_extractor.extract_links(response) if is_html else []
+        yield from self._explore_links(response, links)

dpk_connector/core/utils.py ADDED Viewed

@@ -0,0 +1,97 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+import re
+from urllib.parse import ParseResult, urlparse
+import tldextract
+from scrapy.http import Request, Response
+from scrapy.http.headers import Headers
+from scrapy.utils.httpobj import urlparse_cached as _urlparse_cached
+def _get_header_value(headers: Headers, key: str) -> str | None:
+    value = headers.get(key)
+    return value.decode("utf-8") if value else None
+def get_header_value(response: Response, key: str) -> str | None:
+    """Return header ``key`` of ``response`` as text, or ``None`` if missing or empty."""
+    return _get_header_value(response.headers, key)
+def get_content_type(response: Response) -> str | None:
+    """Return the raw ``Content-Type`` header of ``response``, or ``None`` if absent."""
+    return get_header_value(response, "Content-Type")
+def get_mime_type(content_type: str) -> str:
+    return content_type.split(";")[0].strip()
+def urlparse_cached(input: str | Request | Response) -> ParseResult:
+    return urlparse(input) if isinstance(input, str) else _urlparse_cached(input)
+def get_netloc(input: str | Request | Response) -> str:
+    """Return the ``netloc`` component of a URL string, request, or response."""
+    return urlparse_cached(input).netloc
+def get_base_url(input: str | Request | Response | ParseResult) -> str:
+    if isinstance(input, ParseResult):
+        parts = input
+    else:
+        parts = urlparse_cached(input)
+    return f"{parts.scheme}://{parts.netloc}"
+def get_etld1(url: str) -> str:
+    ext = tldextract.extract(url)
+    return f"{ext.domain}.{ext.suffix}"
+def get_fqdn(url: str) -> str:
+    """Return the ``fqdn`` attribute tldextract reports for ``url``."""
+    ext = tldextract.extract(url)
+    # NOTE(review): presumably an empty string when no domain/suffix is found;
+    # the spider skips falsy results -- confirm against tldextract docs.
+    return ext.fqdn
+def get_focus_path(url: str) -> str | None:
+    parts = urlparse_cached(url)
+    if len(parts.path.split("/")) > 2:
+        return "/".join(parts.path.split("/")[:-1]) + "/"
+    return None
+def _check_path(url_path: str, focus_element: str):
+    if focus_element.startswith("/"):
+        return url_path.startswith(focus_element)
+    else:
+        return focus_element in url_path
+def is_allowed_path(input: str | Request | Response, focus_paths: set[str]) -> bool:
+    if not focus_paths:
+        return True
+    url_path = urlparse_cached(input).path
+    return any(_check_path(url_path.lower(), p.lower()) for p in focus_paths)
+def validate_url(url: str) -> bool:
+    result = urlparse(url)
+    if result.scheme not in ("http", "https"):
+        return False
+    if not result.netloc:
+        return False
+    return True
+def validate_domain(domain: str) -> bool:
+    pattern = r"^([a-zA-Z0-9][a-zA-Z0-9\-]{1,61}[a-zA-Z0-9]\.)+[a-zA-Z]{2,}$"
+    return bool(re.match(pattern, domain))