data-prep-connector 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_connector-0.2.2.dev0.dist-info → data_prep_connector-0.2.2.dev1.dist-info}/METADATA +1 -1
- {data_prep_connector-0.2.2.dev0.dist-info → data_prep_connector-0.2.2.dev1.dist-info}/RECORD +7 -7
- {data_prep_connector-0.2.2.dev0.dist-info → data_prep_connector-0.2.2.dev1.dist-info}/WHEEL +1 -1
- dpk_connector/core/crawler.py +6 -0
- dpk_connector/core/spiders/sitemap.py +14 -6
- dpk_connector/core/utils.py +5 -0
- {data_prep_connector-0.2.2.dev0.dist-info → data_prep_connector-0.2.2.dev1.dist-info}/top_level.txt +0 -0
{data_prep_connector-0.2.2.dev0.dist-info → data_prep_connector-0.2.2.dev1.dist-info}/RECORD
RENAMED
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
dpk_connector/__init__.py,sha256=xG6Sve8_Vf1RI0uLDIxEMrFM62TUxeTkuYVPPADqETQ,735
|
|
2
2
|
dpk_connector/core/__init__.py,sha256=WrQMZyFE3Gn6fT7oHmL9zBYpJ9lI9j-PpJBqE_a6Zww,658
|
|
3
|
-
dpk_connector/core/crawler.py,sha256=
|
|
3
|
+
dpk_connector/core/crawler.py,sha256=Wss9DKRQkh0lu2e_Ox6usayp3SdcUWXGGIMqTcLzNFE,8720
|
|
4
4
|
dpk_connector/core/item.py,sha256=MZRTwhJJupkC_oegEfzrb-YsWP0TRv09Y2rwEv71uII,841
|
|
5
5
|
dpk_connector/core/logging.py,sha256=aV1SNJUPgJuoiZ6wwlZcHTHigLB0vRDT2UfM0RWeWW4,981
|
|
6
6
|
dpk_connector/core/middlewares.py,sha256=dB44kOG1wU1yCp7zNxe66DB37rTmYnsQokv99Bng-8k,9942
|
|
7
7
|
dpk_connector/core/pipelines.py,sha256=W3EYF6l8hyV2FccJ2Mj2FL28RUtQoHKqSps-SYV1Lpo,1115
|
|
8
8
|
dpk_connector/core/settings.py,sha256=BhATbs9UEtTMWpUMpZUY66b-brRUmG-d7danm_FYAD8,2275
|
|
9
|
-
dpk_connector/core/utils.py,sha256=
|
|
9
|
+
dpk_connector/core/utils.py,sha256=O6MI9Gz6TvncTv0isaxIvE29q-CnXLTN3cx4abEG2VE,3034
|
|
10
10
|
dpk_connector/core/spiders/__init__.py,sha256=WrQMZyFE3Gn6fT7oHmL9zBYpJ9lI9j-PpJBqE_a6Zww,658
|
|
11
|
-
dpk_connector/core/spiders/sitemap.py,sha256=
|
|
12
|
-
data_prep_connector-0.2.2.
|
|
13
|
-
data_prep_connector-0.2.2.
|
|
14
|
-
data_prep_connector-0.2.2.
|
|
15
|
-
data_prep_connector-0.2.2.
|
|
11
|
+
dpk_connector/core/spiders/sitemap.py,sha256=SYT89P3V2QpHvE_PuEdBJlabKCswi_0W6A4sOqOnvXc,12600
|
|
12
|
+
data_prep_connector-0.2.2.dev1.dist-info/METADATA,sha256=ue_NdgtrsFL4lyrdv9pJ01Xh1jhYYgCP9nIUSJCbjT0,1833
|
|
13
|
+
data_prep_connector-0.2.2.dev1.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
|
|
14
|
+
data_prep_connector-0.2.2.dev1.dist-info/top_level.txt,sha256=V5veaYVXWTfjj98ntRCsHK7A36nzNprbMwB8PRrtsN4,14
|
|
15
|
+
data_prep_connector-0.2.2.dev1.dist-info/RECORD,,
|
dpk_connector/core/crawler.py
CHANGED
|
@@ -74,6 +74,7 @@ def async_crawl(
|
|
|
74
74
|
user_agent: str = "",
|
|
75
75
|
headers: dict[str, str] = {},
|
|
76
76
|
allow_domains: Collection[str] = (),
|
|
77
|
+
subdomain_focus: bool = False,
|
|
77
78
|
path_focus: bool = False,
|
|
78
79
|
allow_mime_types: Collection[str] = (
|
|
79
80
|
"application/pdf",
|
|
@@ -96,6 +97,7 @@ def async_crawl(
|
|
|
96
97
|
user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
|
|
97
98
|
headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
|
|
98
99
|
allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
|
|
100
|
+
subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
|
|
99
101
|
path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
|
|
100
102
|
allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
|
|
101
103
|
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
|
|
@@ -140,6 +142,7 @@ def async_crawl(
|
|
|
140
142
|
seed_urls=seed_urls,
|
|
141
143
|
callback=on_downloaded,
|
|
142
144
|
allow_domains=allow_domains,
|
|
145
|
+
subdomain_focus=subdomain_focus,
|
|
143
146
|
path_focus=path_focus,
|
|
144
147
|
allow_mime_types=allow_mime_types,
|
|
145
148
|
disallow_mime_types=disallow_mime_types,
|
|
@@ -155,6 +158,7 @@ def crawl(
|
|
|
155
158
|
user_agent: str = "",
|
|
156
159
|
headers: dict[str, str] = {},
|
|
157
160
|
allow_domains: Collection[str] = (),
|
|
161
|
+
subdomain_focus: bool = False,
|
|
158
162
|
path_focus: bool = False,
|
|
159
163
|
allow_mime_types: Collection[str] = (
|
|
160
164
|
"application/pdf",
|
|
@@ -177,6 +181,7 @@ def crawl(
|
|
|
177
181
|
user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
|
|
178
182
|
headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
|
|
179
183
|
allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
|
|
184
|
+
subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
|
|
180
185
|
path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
|
|
181
186
|
allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
|
|
182
187
|
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
|
|
@@ -198,6 +203,7 @@ def crawl(
|
|
|
198
203
|
user_agent,
|
|
199
204
|
headers,
|
|
200
205
|
allow_domains,
|
|
206
|
+
subdomain_focus,
|
|
201
207
|
path_focus,
|
|
202
208
|
allow_mime_types,
|
|
203
209
|
disallow_mime_types,
|
|
@@ -28,6 +28,7 @@ from dpk_connector.core.utils import (
|
|
|
28
28
|
get_content_type,
|
|
29
29
|
get_etld1,
|
|
30
30
|
get_focus_path,
|
|
31
|
+
get_fqdn,
|
|
31
32
|
is_allowed_path,
|
|
32
33
|
urlparse_cached,
|
|
33
34
|
)
|
|
@@ -42,6 +43,7 @@ class BaseSitemapSpider(SitemapSpider):
|
|
|
42
43
|
self,
|
|
43
44
|
seed_urls: Collection[str],
|
|
44
45
|
allow_domains: Collection[str] = (),
|
|
46
|
+
subdomain_focus: bool = False,
|
|
45
47
|
path_focus: bool = False,
|
|
46
48
|
allow_mime_types: Collection[str] = (),
|
|
47
49
|
disallow_mime_types: Collection[str] = (),
|
|
@@ -88,11 +90,15 @@ class BaseSitemapSpider(SitemapSpider):
|
|
|
88
90
|
self.focus_paths.add(path)
|
|
89
91
|
|
|
90
92
|
# Domains and mime types filtering
|
|
91
|
-
|
|
92
|
-
allow_domains
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
93
|
+
if allow_domains:
|
|
94
|
+
self.allowed_domains = set(allow_domains)
|
|
95
|
+
elif subdomain_focus:
|
|
96
|
+
self.allowed_domains = set()
|
|
97
|
+
for url in seed_urls:
|
|
98
|
+
if fqdn := get_fqdn(url):
|
|
99
|
+
self.allowed_domains.add(fqdn)
|
|
100
|
+
else:
|
|
101
|
+
self.allowed_domains = set(get_etld1(url) for url in seed_urls)
|
|
96
102
|
self.allow_mime_types = set(
|
|
97
103
|
[m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
|
|
98
104
|
)
|
|
@@ -155,7 +161,9 @@ class BaseSitemapSpider(SitemapSpider):
|
|
|
155
161
|
)
|
|
156
162
|
|
|
157
163
|
def _parse_sitemap(self, response: Response):
|
|
158
|
-
yield ConnectorItem(
|
|
164
|
+
yield ConnectorItem(
|
|
165
|
+
dropped=False, downloaded=False, system_request=True, sitemap=True
|
|
166
|
+
)
|
|
159
167
|
|
|
160
168
|
seed_url = response.meta["seed_url"]
|
|
161
169
|
|
dpk_connector/core/utils.py
CHANGED
|
@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
|
|
|
57
57
|
return f"{ext.domain}.{ext.suffix}"
|
|
58
58
|
|
|
59
59
|
|
|
60
|
+
def get_fqdn(url: str) -> str:
|
|
61
|
+
ext = tldextract.extract(url)
|
|
62
|
+
return ext.fqdn
|
|
63
|
+
|
|
64
|
+
|
|
60
65
|
def get_focus_path(url: str) -> str | None:
|
|
61
66
|
parts = urlparse_cached(url)
|
|
62
67
|
if len(parts.path.split("/")) > 2:
|
{data_prep_connector-0.2.2.dev0.dist-info → data_prep_connector-0.2.2.dev1.dist-info}/top_level.txt
RENAMED
|
File without changes
|