PyPI - data-prep-connector - Versions diffs - 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl - Mend

data-prep-connector 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

{data_prep_connector-0.2.2.dev0.dist-info → data_prep_connector-0.2.2.dev1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: data_prep_connector
-Version: 0.2.2.dev0
+Version: 0.2.2.dev1
 Summary: Scalable and Compliant Web Crawler
 Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
 License: Apache-2.0

{data_prep_connector-0.2.2.dev0.dist-info → data_prep_connector-0.2.2.dev1.dist-info}/RECORD RENAMED Viewed

@@ -1,15 +1,15 @@
 dpk_connector/__init__.py,sha256=xG6Sve8_Vf1RI0uLDIxEMrFM62TUxeTkuYVPPADqETQ,735
 dpk_connector/core/__init__.py,sha256=WrQMZyFE3Gn6fT7oHmL9zBYpJ9lI9j-PpJBqE_a6Zww,658
-dpk_connector/core/crawler.py,sha256=8c_ZqwZGyHcRmmCp1bLEZdYsz_A0NOKJ9-yFEi_I1wk,8262
+dpk_connector/core/crawler.py,sha256=Wss9DKRQkh0lu2e_Ox6usayp3SdcUWXGGIMqTcLzNFE,8720
 dpk_connector/core/item.py,sha256=MZRTwhJJupkC_oegEfzrb-YsWP0TRv09Y2rwEv71uII,841
 dpk_connector/core/logging.py,sha256=aV1SNJUPgJuoiZ6wwlZcHTHigLB0vRDT2UfM0RWeWW4,981
 dpk_connector/core/middlewares.py,sha256=dB44kOG1wU1yCp7zNxe66DB37rTmYnsQokv99Bng-8k,9942
 dpk_connector/core/pipelines.py,sha256=W3EYF6l8hyV2FccJ2Mj2FL28RUtQoHKqSps-SYV1Lpo,1115
 dpk_connector/core/settings.py,sha256=BhATbs9UEtTMWpUMpZUY66b-brRUmG-d7danm_FYAD8,2275
-dpk_connector/core/utils.py,sha256=keH9FrbBXaIjPg7SWUdMif3tyJMAq4CK2qnWcJoqPNQ,2947
+dpk_connector/core/utils.py,sha256=O6MI9Gz6TvncTv0isaxIvE29q-CnXLTN3cx4abEG2VE,3034
 dpk_connector/core/spiders/__init__.py,sha256=WrQMZyFE3Gn6fT7oHmL9zBYpJ9lI9j-PpJBqE_a6Zww,658
-dpk_connector/core/spiders/sitemap.py,sha256=OpnHIzCQZ08SwUpp2JEOBfq2DPA214am5vnI7LX3JB8,12322
-data_prep_connector-0.2.2.dev0.dist-info/METADATA,sha256=3uii36rBy4Dz7ltRbcqAS-_vjXkvQwIYAXYHjIC1XNI,1833
-data_prep_connector-0.2.2.dev0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-data_prep_connector-0.2.2.dev0.dist-info/top_level.txt,sha256=V5veaYVXWTfjj98ntRCsHK7A36nzNprbMwB8PRrtsN4,14
-data_prep_connector-0.2.2.dev0.dist-info/RECORD,,
+dpk_connector/core/spiders/sitemap.py,sha256=SYT89P3V2QpHvE_PuEdBJlabKCswi_0W6A4sOqOnvXc,12600
+data_prep_connector-0.2.2.dev1.dist-info/METADATA,sha256=ue_NdgtrsFL4lyrdv9pJ01Xh1jhYYgCP9nIUSJCbjT0,1833
+data_prep_connector-0.2.2.dev1.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
+data_prep_connector-0.2.2.dev1.dist-info/top_level.txt,sha256=V5veaYVXWTfjj98ntRCsHK7A36nzNprbMwB8PRrtsN4,14
+data_prep_connector-0.2.2.dev1.dist-info/RECORD,,

{data_prep_connector-0.2.2.dev0.dist-info → data_prep_connector-0.2.2.dev1.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.1.0)
+Generator: setuptools (75.2.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

dpk_connector/core/crawler.py CHANGED Viewed

@@ -74,6 +74,7 @@ def async_crawl(
     user_agent: str = "",
     headers: dict[str, str] = {},
     allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
     path_focus: bool = False,
     allow_mime_types: Collection[str] = (
         "application/pdf",
@@ -96,6 +97,7 @@ def async_crawl(
         user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
         headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
         allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
         path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
         allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -140,6 +142,7 @@ def async_crawl(
         seed_urls=seed_urls,
         callback=on_downloaded,
         allow_domains=allow_domains,
+        subdomain_focus=subdomain_focus,
         path_focus=path_focus,
         allow_mime_types=allow_mime_types,
         disallow_mime_types=disallow_mime_types,
@@ -155,6 +158,7 @@ def crawl(
     user_agent: str = "",
     headers: dict[str, str] = {},
     allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
     path_focus: bool = False,
     allow_mime_types: Collection[str] = (
         "application/pdf",
@@ -177,6 +181,7 @@ def crawl(
         user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
         headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
         allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
         path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
         allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -198,6 +203,7 @@ def crawl(
         user_agent,
         headers,
         allow_domains,
+        subdomain_focus,
         path_focus,
         allow_mime_types,
         disallow_mime_types,

dpk_connector/core/spiders/sitemap.py CHANGED Viewed

@@ -28,6 +28,7 @@ from dpk_connector.core.utils import (
     get_content_type,
     get_etld1,
     get_focus_path,
+    get_fqdn,
     is_allowed_path,
     urlparse_cached,
 )
@@ -42,6 +43,7 @@ class BaseSitemapSpider(SitemapSpider):
         self,
         seed_urls: Collection[str],
         allow_domains: Collection[str] = (),
+        subdomain_focus: bool = False,
         path_focus: bool = False,
         allow_mime_types: Collection[str] = (),
         disallow_mime_types: Collection[str] = (),
@@ -88,11 +90,15 @@ class BaseSitemapSpider(SitemapSpider):
                     self.focus_paths.add(path)
         # Domains and mime types filtering
-        self.allowed_domains = set(
-            allow_domains
-            if len(allow_domains) > 0
-            else [get_etld1(url) for url in seed_urls]
-        )
+        if allow_domains:
+            self.allowed_domains = set(allow_domains)
+        elif subdomain_focus:
+            self.allowed_domains = set()
+            for url in seed_urls:
+                if fqdn := get_fqdn(url):
+                    self.allowed_domains.add(fqdn)
+        else:
+            self.allowed_domains = set(get_etld1(url) for url in seed_urls)
         self.allow_mime_types = set(
             [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
         )
@@ -155,7 +161,9 @@ class BaseSitemapSpider(SitemapSpider):
             )
     def _parse_sitemap(self, response: Response):
-        yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True)
+        yield ConnectorItem(
+            dropped=False, downloaded=False, system_request=True, sitemap=True
+        )
         seed_url = response.meta["seed_url"]

dpk_connector/core/utils.py CHANGED Viewed

@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
     return f"{ext.domain}.{ext.suffix}"
+def get_fqdn(url: str) -> str:
+    ext = tldextract.extract(url)
+    return ext.fqdn
 def get_focus_path(url: str) -> str | None:
     parts = urlparse_cached(url)
     if len(parts.path.split("/")) > 2:

{data_prep_connector-0.2.2.dev0.dist-info → data_prep_connector-0.2.2.dev1.dist-info}/top_level.txt RENAMED Viewed

File without changes

data-prep-connector 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

data-prep-connector 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl