data-prep-connector 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_connector
3
- Version: 0.2.2.dev0
3
+ Version: 0.2.2.dev1
4
4
  Summary: Scalable and Compliant Web Crawler
5
5
  Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
6
  License: Apache-2.0
@@ -1,15 +1,15 @@
1
1
  dpk_connector/__init__.py,sha256=xG6Sve8_Vf1RI0uLDIxEMrFM62TUxeTkuYVPPADqETQ,735
2
2
  dpk_connector/core/__init__.py,sha256=WrQMZyFE3Gn6fT7oHmL9zBYpJ9lI9j-PpJBqE_a6Zww,658
3
- dpk_connector/core/crawler.py,sha256=8c_ZqwZGyHcRmmCp1bLEZdYsz_A0NOKJ9-yFEi_I1wk,8262
3
+ dpk_connector/core/crawler.py,sha256=Wss9DKRQkh0lu2e_Ox6usayp3SdcUWXGGIMqTcLzNFE,8720
4
4
  dpk_connector/core/item.py,sha256=MZRTwhJJupkC_oegEfzrb-YsWP0TRv09Y2rwEv71uII,841
5
5
  dpk_connector/core/logging.py,sha256=aV1SNJUPgJuoiZ6wwlZcHTHigLB0vRDT2UfM0RWeWW4,981
6
6
  dpk_connector/core/middlewares.py,sha256=dB44kOG1wU1yCp7zNxe66DB37rTmYnsQokv99Bng-8k,9942
7
7
  dpk_connector/core/pipelines.py,sha256=W3EYF6l8hyV2FccJ2Mj2FL28RUtQoHKqSps-SYV1Lpo,1115
8
8
  dpk_connector/core/settings.py,sha256=BhATbs9UEtTMWpUMpZUY66b-brRUmG-d7danm_FYAD8,2275
9
- dpk_connector/core/utils.py,sha256=keH9FrbBXaIjPg7SWUdMif3tyJMAq4CK2qnWcJoqPNQ,2947
9
+ dpk_connector/core/utils.py,sha256=O6MI9Gz6TvncTv0isaxIvE29q-CnXLTN3cx4abEG2VE,3034
10
10
  dpk_connector/core/spiders/__init__.py,sha256=WrQMZyFE3Gn6fT7oHmL9zBYpJ9lI9j-PpJBqE_a6Zww,658
11
- dpk_connector/core/spiders/sitemap.py,sha256=OpnHIzCQZ08SwUpp2JEOBfq2DPA214am5vnI7LX3JB8,12322
12
- data_prep_connector-0.2.2.dev0.dist-info/METADATA,sha256=3uii36rBy4Dz7ltRbcqAS-_vjXkvQwIYAXYHjIC1XNI,1833
13
- data_prep_connector-0.2.2.dev0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
14
- data_prep_connector-0.2.2.dev0.dist-info/top_level.txt,sha256=V5veaYVXWTfjj98ntRCsHK7A36nzNprbMwB8PRrtsN4,14
15
- data_prep_connector-0.2.2.dev0.dist-info/RECORD,,
11
+ dpk_connector/core/spiders/sitemap.py,sha256=SYT89P3V2QpHvE_PuEdBJlabKCswi_0W6A4sOqOnvXc,12600
12
+ data_prep_connector-0.2.2.dev1.dist-info/METADATA,sha256=ue_NdgtrsFL4lyrdv9pJ01Xh1jhYYgCP9nIUSJCbjT0,1833
13
+ data_prep_connector-0.2.2.dev1.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
14
+ data_prep_connector-0.2.2.dev1.dist-info/top_level.txt,sha256=V5veaYVXWTfjj98ntRCsHK7A36nzNprbMwB8PRrtsN4,14
15
+ data_prep_connector-0.2.2.dev1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.1.0)
2
+ Generator: setuptools (75.2.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -74,6 +74,7 @@ def async_crawl(
74
74
  user_agent: str = "",
75
75
  headers: dict[str, str] = {},
76
76
  allow_domains: Collection[str] = (),
77
+ subdomain_focus: bool = False,
77
78
  path_focus: bool = False,
78
79
  allow_mime_types: Collection[str] = (
79
80
  "application/pdf",
@@ -96,6 +97,7 @@ def async_crawl(
96
97
  user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
97
98
  headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
98
99
  allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
100
+ subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
99
101
  path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
100
102
  allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
101
103
  disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -140,6 +142,7 @@ def async_crawl(
140
142
  seed_urls=seed_urls,
141
143
  callback=on_downloaded,
142
144
  allow_domains=allow_domains,
145
+ subdomain_focus=subdomain_focus,
143
146
  path_focus=path_focus,
144
147
  allow_mime_types=allow_mime_types,
145
148
  disallow_mime_types=disallow_mime_types,
@@ -155,6 +158,7 @@ def crawl(
155
158
  user_agent: str = "",
156
159
  headers: dict[str, str] = {},
157
160
  allow_domains: Collection[str] = (),
161
+ subdomain_focus: bool = False,
158
162
  path_focus: bool = False,
159
163
  allow_mime_types: Collection[str] = (
160
164
  "application/pdf",
@@ -177,6 +181,7 @@ def crawl(
177
181
  user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
178
182
  headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
179
183
  allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
184
+ subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
180
185
  path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
181
186
  allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
182
187
  disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -198,6 +203,7 @@ def crawl(
198
203
  user_agent,
199
204
  headers,
200
205
  allow_domains,
206
+ subdomain_focus,
201
207
  path_focus,
202
208
  allow_mime_types,
203
209
  disallow_mime_types,
@@ -28,6 +28,7 @@ from dpk_connector.core.utils import (
28
28
  get_content_type,
29
29
  get_etld1,
30
30
  get_focus_path,
31
+ get_fqdn,
31
32
  is_allowed_path,
32
33
  urlparse_cached,
33
34
  )
@@ -42,6 +43,7 @@ class BaseSitemapSpider(SitemapSpider):
42
43
  self,
43
44
  seed_urls: Collection[str],
44
45
  allow_domains: Collection[str] = (),
46
+ subdomain_focus: bool = False,
45
47
  path_focus: bool = False,
46
48
  allow_mime_types: Collection[str] = (),
47
49
  disallow_mime_types: Collection[str] = (),
@@ -88,11 +90,15 @@ class BaseSitemapSpider(SitemapSpider):
88
90
  self.focus_paths.add(path)
89
91
 
90
92
  # Domains and mime types filtering
91
- self.allowed_domains = set(
92
- allow_domains
93
- if len(allow_domains) > 0
94
- else [get_etld1(url) for url in seed_urls]
95
- )
93
+ if allow_domains:
94
+ self.allowed_domains = set(allow_domains)
95
+ elif subdomain_focus:
96
+ self.allowed_domains = set()
97
+ for url in seed_urls:
98
+ if fqdn := get_fqdn(url):
99
+ self.allowed_domains.add(fqdn)
100
+ else:
101
+ self.allowed_domains = set(get_etld1(url) for url in seed_urls)
96
102
  self.allow_mime_types = set(
97
103
  [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
98
104
  )
@@ -155,7 +161,9 @@ class BaseSitemapSpider(SitemapSpider):
155
161
  )
156
162
 
157
163
  def _parse_sitemap(self, response: Response):
158
- yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True)
164
+ yield ConnectorItem(
165
+ dropped=False, downloaded=False, system_request=True, sitemap=True
166
+ )
159
167
 
160
168
  seed_url = response.meta["seed_url"]
161
169
 
@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
57
57
  return f"{ext.domain}.{ext.suffix}"
58
58
 
59
59
 
60
+ def get_fqdn(url: str) -> str:
61
+ ext = tldextract.extract(url)
62
+ return ext.fqdn
63
+
64
+
60
65
  def get_focus_path(url: str) -> str | None:
61
66
  parts = urlparse_cached(url)
62
67
  if len(parts.path.split("/")) > 2: