data-prep-connector 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.1
2
+ Name: data_prep_connector
3
+ Version: 0.2.2
4
+ Summary: Scalable and Compliant Web Crawler
5
+ Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
+ License: Apache-2.0
7
+ Keywords: data,data acquisition,crawler,web crawler,llm,generative,ai,fine-tuning,llmapps
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: scrapy>=2.11.2
11
+ Requires-Dist: pydantic>=2.8.1
12
+ Requires-Dist: tldextract>=5.1.2
13
+ Provides-Extra: dev
14
+ Requires-Dist: twine; extra == "dev"
15
+ Requires-Dist: pytest>=7.3.2; extra == "dev"
16
+ Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
17
+ Requires-Dist: pytest-env>=1.0.0; extra == "dev"
18
+ Requires-Dist: pre-commit>=3.3.2; extra == "dev"
19
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
20
+ Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
21
+ Requires-Dist: pytest-datadir>=1.5.0; extra == "dev"
22
+ Requires-Dist: moto==5.0.5; extra == "dev"
23
+ Requires-Dist: markupsafe==2.0.1; extra == "dev"
24
+
25
+ # DPK Connector
26
+
27
+ DPK Connector is a scalable and compliant web crawler developed for data acquisition towards LLM development. It is built on [Scrapy](https://scrapy.org/).
28
+ For more details read [the documentation](doc/overview.md).
29
+
30
+ ## Virtual Environment
31
+
32
+ The project uses `pyproject.toml` and a Makefile for operations.
33
+ To do development you should establish the virtual environment
34
+ ```shell
35
+ make venv
36
+ ```
37
+ and then either activate
38
+ ```shell
39
+ source venv/bin/activate
40
+ ```
41
+ or set up your IDE to use the venv directory when developing in this project
42
+
43
+ ## Library Artifact Build and Publish
44
+
45
+ To test, build and publish the library
46
+ ```shell
47
+ make test build publish
48
+ ```
49
+
50
+ To bump the version number, edit the Makefile to change VERSION and rerun the above. This will require committing both the `Makefile` and the automatically updated `pyproject.toml` file.
51
+
52
+ ## How to use
53
+
54
+ See [the overview](doc/overview.md).
@@ -0,0 +1,15 @@
1
+ dpk_connector/__init__.py,sha256=xG6Sve8_Vf1RI0uLDIxEMrFM62TUxeTkuYVPPADqETQ,735
2
+ dpk_connector/core/__init__.py,sha256=WrQMZyFE3Gn6fT7oHmL9zBYpJ9lI9j-PpJBqE_a6Zww,658
3
+ dpk_connector/core/crawler.py,sha256=Wss9DKRQkh0lu2e_Ox6usayp3SdcUWXGGIMqTcLzNFE,8720
4
+ dpk_connector/core/item.py,sha256=MZRTwhJJupkC_oegEfzrb-YsWP0TRv09Y2rwEv71uII,841
5
+ dpk_connector/core/logging.py,sha256=aV1SNJUPgJuoiZ6wwlZcHTHigLB0vRDT2UfM0RWeWW4,981
6
+ dpk_connector/core/middlewares.py,sha256=dB44kOG1wU1yCp7zNxe66DB37rTmYnsQokv99Bng-8k,9942
7
+ dpk_connector/core/pipelines.py,sha256=W3EYF6l8hyV2FccJ2Mj2FL28RUtQoHKqSps-SYV1Lpo,1115
8
+ dpk_connector/core/settings.py,sha256=BhATbs9UEtTMWpUMpZUY66b-brRUmG-d7danm_FYAD8,2275
9
+ dpk_connector/core/utils.py,sha256=O6MI9Gz6TvncTv0isaxIvE29q-CnXLTN3cx4abEG2VE,3034
10
+ dpk_connector/core/spiders/__init__.py,sha256=WrQMZyFE3Gn6fT7oHmL9zBYpJ9lI9j-PpJBqE_a6Zww,658
11
+ dpk_connector/core/spiders/sitemap.py,sha256=SYT89P3V2QpHvE_PuEdBJlabKCswi_0W6A4sOqOnvXc,12600
12
+ data_prep_connector-0.2.2.dist-info/METADATA,sha256=b_NuUk_6AvbbWe8ekPYK4K6KLX2CC6KTZCnpb_ZXYcU,1828
13
+ data_prep_connector-0.2.2.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
14
+ data_prep_connector-0.2.2.dist-info/top_level.txt,sha256=V5veaYVXWTfjj98ntRCsHK7A36nzNprbMwB8PRrtsN4,14
15
+ data_prep_connector-0.2.2.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.2.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ dpk_connector
@@ -0,0 +1,13 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from dpk_connector.core.crawler import async_crawl, crawl, shutdown # noqa
@@ -0,0 +1,11 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
@@ -0,0 +1,222 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import threading
14
+ from typing import Any, Callable, Collection, Type, cast
15
+
16
+ from scrapy import Spider
17
+ from scrapy.crawler import Crawler, CrawlerRunner
18
+ from scrapy.settings import Settings
19
+ from twisted.internet.defer import Deferred
20
+
21
+ from dpk_connector.core.utils import validate_domain, validate_url
22
+
23
# Guards the two module-level flags below; both are read and updated while
# holding this lock (see _start_reactor and MultiThreadedCrawlerRunner).
_lock = threading.Lock()
# Set to True after the first Crawler has been created; only that first
# Crawler is constructed with init_reactor=True.
_reactor_initialized = False
# Set to True once the reactor thread has been started.
_reactor_started = False
26
+
27
+
28
def _run_reactor():
    """Thread target that runs the Twisted reactor.

    The reactor is imported inside the function rather than at module level,
    and is run without installing signal handlers.
    """
    from twisted.internet import reactor as twisted_reactor

    twisted_reactor.run(installSignalHandlers=False)
32
+
33
+
34
# Daemon thread that hosts the reactor; created here but only started on
# demand by _start_reactor().
_reactor_thread: threading.Thread = threading.Thread(target=_run_reactor, daemon=True)
38
+
39
+
40
def _start_reactor():
    """Start the reactor thread the first time this is called.

    Later calls are no-ops. The started flag is checked and updated under
    the module lock so that concurrent callers start the thread only once.
    """
    global _reactor_started
    with _lock:
        if _reactor_started:
            return
        _reactor_thread.start()
        _reactor_started = True
46
+
47
+
48
def _stop_reactor():
    """Ask the Twisted reactor to stop; do nothing if it is not running.

    The reactor runs in its own thread (see _run_reactor), while this
    function is normally invoked from a caller's thread. Reactor methods
    other than ``callFromThread`` are not safe to call from foreign threads,
    so the actual ``reactor.stop()`` is scheduled onto the reactor thread
    instead of being invoked directly.
    """
    from twisted.internet import reactor

    def _stop_in_reactor_thread():
        # The reactor may already have been stopped by an earlier request;
        # Twisted signals that with a RuntimeError subclass, which is
        # deliberately ignored (best-effort shutdown).
        try:
            reactor.stop()
        except RuntimeError:
            pass

    # Only schedule the stop when the reactor is actually running. A call
    # queued on a reactor that has not started would otherwise fire as soon
    # as the reactor is started later and stop it immediately.
    if reactor.running:
        reactor.callFromThread(_stop_in_reactor_thread)
55
+
56
+
57
class MultiThreadedCrawlerRunner(CrawlerRunner):
    """CrawlerRunner whose crawlers may be created from several threads.

    Only the first Crawler created through this class is constructed with
    ``init_reactor=True``; every later one gets ``False``. The decision and
    the flag update happen under the module lock.
    """

    def _create_crawler(self, spidercls: str | type[Spider]) -> Crawler:
        """Build a Crawler for *spidercls* (a spider class or its name)."""
        global _reactor_initialized
        # Spider names are resolved to classes through the spider loader.
        spider_class = (
            self.spider_loader.load(spidercls)
            if isinstance(spidercls, str)
            else spidercls
        )
        with _lock:
            is_first_crawler = not _reactor_initialized
            new_crawler = Crawler(
                cast(Type[Spider], spider_class), self.settings, is_first_crawler
            )
            _reactor_initialized = True
        return new_crawler
69
+
70
+
71
def async_crawl(
    seed_urls: Collection[str],
    on_downloaded: Callable[[str, bytes, dict[str, str]], None],
    user_agent: str = "",
    headers: dict[str, str] | None = None,
    allow_domains: Collection[str] = (),
    subdomain_focus: bool = False,
    path_focus: bool = False,
    allow_mime_types: Collection[str] = (
        "application/pdf",
        "text/html",
        "text/markdown",
        "text/plain",
    ),
    disallow_mime_types: Collection[str] = (),
    depth_limit: int = -1,
    download_limit: int = -1,
) -> Deferred[None]:
    # Assisted by WCA@IBM
    # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
    """
    Do crawl asynchronously.

    Parameters:
        seed_urls (Collection[str]): A collection of seed URLs to start the crawl from.
        on_downloaded (Callable[[str, bytes, dict[str, str]], None]): The callback function to be called for each downloaded page.
        user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
        headers (dict[str, str] | None): A dictionary of additional headers to send with each request. Default is None, which (like an empty dictionary) adds no headers.
        allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
        path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
        allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
        disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
        depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
        download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.

    Returns:
        Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.

    Raises:
        ValueError: If `seed_urls` is empty, if a seed URL or allowed domain fails validation, or if `depth_limit` or `download_limit` is less than -1.
    """
    # Validate all inputs before any crawler state is created.
    if not seed_urls:
        raise ValueError("Empty seed URLs.")
    for url in seed_urls:
        if not validate_url(url):
            raise ValueError(f"Seed URL {url} is not valid.")
    for domain in allow_domains:
        if not validate_domain(domain):
            raise ValueError(f"Allow domain {domain} is not valid.")
    if depth_limit < -1:
        raise ValueError(f"Invalid depth limit {depth_limit}")
    if download_limit < -1:
        raise ValueError(f"Invalid download limit {download_limit}")

    settings = Settings()
    settings.setmodule("dpk_connector.core.settings", priority="project")

    if user_agent:
        settings.set("USER_AGENT", user_agent, priority="spider")
    if headers:
        settings.set("DEFAULT_REQUEST_HEADERS", headers)
    # This API uses -1 for "no limit", whereas Scrapy's DEPTH_LIMIT uses 0 for
    # that, so the two sentinel values are swapped before being handed over.
    if depth_limit == 0:
        depth_limit = -1
    elif depth_limit == -1:
        depth_limit = 0
    settings.set("DEPTH_LIMIT", depth_limit, priority="spider")
    # Likewise, Scrapy's CLOSESPIDER_ITEMCOUNT treats 0 as "no limit".
    if download_limit == -1:
        download_limit = 0
    settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider")

    runner = MultiThreadedCrawlerRunner(settings)
    runner.crawl(
        "dpk-connector-sitemap",
        seed_urls=seed_urls,
        callback=on_downloaded,
        allow_domains=allow_domains,
        subdomain_focus=subdomain_focus,
        path_focus=path_focus,
        allow_mime_types=allow_mime_types,
        disallow_mime_types=disallow_mime_types,
        disable_sitemap_search=True,
    )
    # The reactor thread is started lazily, on the first crawl.
    _start_reactor()
    return runner.join()
153
+
154
+
155
def crawl(
    seed_urls: Collection[str],
    on_downloaded: Callable[[str, bytes, dict[str, str]], None],
    user_agent: str = "",
    headers: dict[str, str] | None = None,
    allow_domains: Collection[str] = (),
    subdomain_focus: bool = False,
    path_focus: bool = False,
    allow_mime_types: Collection[str] = (
        "application/pdf",
        "text/html",
        "text/markdown",
        "text/plain",
    ),
    disallow_mime_types: Collection[str] = (),
    depth_limit: int = -1,
    download_limit: int = -1,
) -> None:
    # Assisted by WCA@IBM
    # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
    """
    Do crawl synchronously.

    Starts the crawl through `async_crawl` and blocks the calling thread until
    the returned deferred fires, whether with a result or with a failure.

    Parameters:
        seed_urls (Collection[str]): A collection of seed URLs to start the crawl from.
        on_downloaded (Callable[[str, bytes, dict[str, str]], None]): The callback function to be called for each downloaded page.
        user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
        headers (dict[str, str] | None): A dictionary of additional headers to send with each request. Default is None, which (like an empty dictionary) adds no headers.
        allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
        path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
        allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
        disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
        depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
        download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.

    Returns:
        None
    """
    # An Event remembers that it has been set, so completion is never missed
    # even if the deferred fires before this thread starts waiting. A bare
    # Condition.notify() issued before Condition.wait() would be lost and the
    # caller would block forever.
    finished = threading.Event()

    def on_completed(result: Any):
        finished.set()

    d = async_crawl(
        seed_urls,
        on_downloaded,
        user_agent,
        # Always hand over a dictionary so the callee never sees None.
        headers if headers is not None else {},
        allow_domains,
        subdomain_focus,
        path_focus,
        allow_mime_types,
        disallow_mime_types,
        depth_limit,
        download_limit,
    )
    d.addBoth(on_completed)
    finished.wait()
216
+
217
+
218
def shutdown():
    """
    Shutdown all crawls.

    Delegates to the module's reactor-stopping helper.
    """
    _stop_reactor()
@@ -0,0 +1,21 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from dataclasses import dataclass
14
+
15
+
16
@dataclass
class ConnectorItem:
    """Boolean flags attached to an item produced by the crawler.

    All flags default to False.
    """

    # NOTE(review): not read by any code visible here; presumably marks a
    # response that was filtered out - confirm against the spider.
    dropped: bool = False
    # Items that are not marked as downloaded are dropped by the item pipeline.
    downloaded: bool = False
    # Items marked as system requests are dropped by the item pipeline and
    # excluded from the "downloaded" stats.
    system_request: bool = False
    # Counted separately under the ".../sitemap" stats key.
    sitemap: bool = False
@@ -0,0 +1,22 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from scrapy.logformatter import LogFormatter as ScrapyLogFormatter
14
+
15
+
16
class QuietLogFormatter(ScrapyLogFormatter):
    """Log formatter that reports scraped items only when explicitly enabled."""

    def scraped(self, item, response, spider):
        """Return the parent's log entry if LOG_SCRAPED_ITEMS is on, else None."""
        if not spider.settings.getbool("LOG_SCRAPED_ITEMS"):
            return None
        return super().scraped(item, response, spider)
@@ -0,0 +1,263 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import logging
14
+ from typing import Any, Generator, Iterable
15
+
16
+ from scrapy import Spider, signals
17
+ from scrapy.crawler import Crawler
18
+ from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware
19
+ from scrapy.downloadermiddlewares.stats import DownloaderStats
20
+ from scrapy.exceptions import NotConfigured
21
+ from scrapy.http import Request, Response
22
+ from scrapy.http.request import NO_CALLBACK
23
+ from scrapy.robotstxt import ProtegoRobotParser, RobotParser
24
+ from scrapy.statscollectors import StatsCollector
25
+ from scrapy.utils.httpobj import urlparse_cached
26
+ from scrapy.utils.python import to_unicode
27
+ from twisted.internet.defer import Deferred
28
+
29
+ from dpk_connector.core.item import ConnectorItem
30
+ from dpk_connector.core.utils import get_content_type, get_etld1, get_mime_type, get_netloc
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
class DelayingProtegoRobotParser(ProtegoRobotParser):
    """
    Robots.txt parser supporting crawl-delay/request-rate.
    """

    def __init__(self, robotstxt_body: str | bytes, spider: Spider):
        super().__init__(robotstxt_body, spider)
        # Upper bound applied to any delay derived from robots.txt.
        self.max_delay = spider.crawler.settings.getfloat("ROBOTS_MAX_CRAWL_DELAY", 60)

    def delay(self, user_agent: str | bytes) -> float | None:
        """Return the delay robots.txt asks of *user_agent*, capped at max_delay.

        Returns None when robots.txt declares neither a crawl-delay nor a
        request-rate for the agent. Otherwise the larger of the two derived
        delays is used.
        """
        agent = to_unicode(user_agent)
        declared_delay = self.rp.crawl_delay(agent)
        declared_rate = self.rp.request_rate(agent)
        if declared_delay is None and declared_rate is None:
            return None

        delay_from_directive = declared_delay if declared_delay else 0
        if declared_rate:
            # Time per request implied by the declared request-rate.
            delay_from_rate = declared_rate.seconds / declared_rate.requests
        else:
            delay_from_rate = 0
        return min(max(delay_from_directive, delay_from_rate), self.max_delay)
56
+
57
+
58
class DelayingRobotsTxtMiddleware(RobotsTxtMiddleware):
    """
    Downloader middleware to follow crawl-delay/request-rate directives of robots.txt.
    """

    def __init__(self, crawler: Crawler, download_timeout: float):
        super().__init__(crawler)
        self.download_timeout = download_timeout
        # Delay per netloc as derived from robots.txt; 0.0 means "none declared".
        self._delays: dict[str, float] = {}
        crawler.signals.connect(
            self._request_reached_downloader, signal=signals.request_reached_downloader
        )

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Build the middleware, falling back to DOWNLOAD_TIMEOUT for robots.txt."""
        timeout = crawler.settings.getfloat(
            "ROBOTSTXT_DOWNLOAD_TIMEOUT"
        ) or crawler.settings.getfloat("DOWNLOAD_TIMEOUT")
        return cls(crawler, timeout)

    def _request_reached_downloader(self, request: Request, spider: Spider) -> None:
        """Raise the download slot's delay to the robots.txt delay, if larger."""
        slot_key = request.meta.get("download_slot")
        slot = self.crawler.engine.downloader.slots.get(slot_key)
        if not slot:
            return
        netloc = urlparse_cached(request).netloc
        if netloc not in self._delays:
            return
        robots_delay = self._delays[netloc]
        if robots_delay and slot.delay < robots_delay:
            slot.delay = robots_delay
            slot.randomize_delay = False

    def process_request_2(
        self, rp: RobotParser, request: Request, spider: Spider
    ) -> None:
        """Run the parent check, then record the robots.txt delay once per netloc."""
        super().process_request_2(rp, request, spider)
        if not isinstance(rp, DelayingProtegoRobotParser):
            return
        netloc = urlparse_cached(request).netloc
        if netloc in self._delays:
            return
        # Prefer the dedicated robots.txt user agent; otherwise use the
        # request's own User-Agent header (or the default one).
        agent = self._robotstxt_useragent
        if not agent:
            agent = request.headers.get(b"User-Agent", self._default_useragent)
        robots_delay = rp.delay(agent) or 0.0
        self._delays[netloc] = robots_delay
        if robots_delay:
            logger.info(
                f"Set download delay to {robots_delay} according to robots.txt. domain: {netloc}"
            )

    def robot_parser(self, request: Request, spider: Spider):
        """Return the robots.txt parser for the request's netloc, or a Deferred for it.

        The first request seen for a netloc triggers the robots.txt download;
        that download is flagged as a system request and uses the configured
        robots.txt timeout.
        """
        parsed_url = urlparse_cached(request)
        netloc = parsed_url.netloc

        if netloc not in self._parsers:
            # Placeholder until the robots.txt download has been processed.
            self._parsers[netloc] = Deferred()
            robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
            robots_request = Request(
                robots_url,
                priority=self.DOWNLOAD_PRIORITY,
                meta={
                    "dont_obey_robotstxt": True,
                    "system_request": True,
                    "download_timeout": self.download_timeout,
                },
                callback=NO_CALLBACK,
            )
            download = self.crawler.engine.download(robots_request)
            download.addCallback(self._parse_robots, netloc, spider)
            download.addErrback(self._logerror, robots_request, spider)
            download.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value("robotstxt/request_count")

        # Look the entry up again: it may no longer be the placeholder.
        if not isinstance(self._parsers[netloc], Deferred):
            return self._parsers[netloc]

        waiter = Deferred()

        def forward(result):
            # Hand the result to this caller and keep it flowing down the
            # shared deferred's callback chain.
            waiter.callback(result)
            return result

        self._parsers[netloc].addCallback(forward)
        return waiter
142
+
143
+
144
+ def _update_request_stats(
145
+ stats: StatsCollector,
146
+ request: Request,
147
+ spider: Spider,
148
+ prefix: str,
149
+ skip_domains: bool = False,
150
+ ):
151
+ # request count
152
+ stats.inc_value(prefix, spider=spider)
153
+ # proxy distribution
154
+ proxy = request.meta.get("proxy", "None")
155
+ stats.inc_value(f"{prefix}/proxy/{proxy}", spider=spider)
156
+ if not skip_domains:
157
+ # domain distribution
158
+ domain = get_etld1(to_unicode(request.url))
159
+ stats.inc_value(f"{prefix}/domain/{domain}", spider=spider)
160
+ # subdomain distribution
161
+ sub_domain = get_netloc(request)
162
+ stats.inc_value(f"{prefix}/subdomain/{sub_domain}", spider=spider)
163
+
164
+
165
def _update_stats(
    stats: StatsCollector,
    request: Request,
    response: Response,
    spider: Spider,
    prefix: str,
    skip_domains: bool = False,
):
    """Increment request-level and response-level counters under *prefix*.

    On top of the request counters, counts the response's MIME type (the
    string "None" when the response has no content type) and status code.
    """
    _update_request_stats(stats, request, spider, prefix, skip_domains)
    # mime type distribution
    content_type = get_content_type(response)
    mime_label = get_mime_type(content_type) if content_type else "None"
    stats.inc_value(f"{prefix}/mime_type/{mime_label}", spider=spider)
    # status code distribution
    stats.inc_value(f"{prefix}/status_code/{response.status}", spider=spider)
183
+
184
+
185
+ def _update_sitemap_stats(stats: StatsCollector, spider: Spider, prefix: str):
186
+ # sitemap
187
+ stats.inc_value(f"{prefix}/sitemap", spider=spider)
188
+
189
+
190
class ConnectorRequestedStats(DownloaderStats):
    """
    Downloader middleware to expose additional stats.
    """

    def __init__(self, stats: StatsCollector, skip_domains: bool):
        super().__init__(stats)
        self.skip_domains = skip_domains

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Build the middleware; raise NotConfigured when DOWNLOADER_STATS is off."""
        settings = crawler.settings
        if not settings.getbool("DOWNLOADER_STATS"):
            raise NotConfigured
        return cls(crawler.stats, settings.getbool("STATS_SKIP_DOMAINS"))

    def process_request(self, request: Request, spider: Spider):
        """Count outgoing requests under "dpk_connector/requested"."""
        super().process_request(request, spider)
        prefix = "dpk_connector/requested"
        meta = request.meta
        # System requests are left out of the regular counters.
        if not meta.get("system_request", False):
            _update_request_stats(
                self.stats, request, spider, prefix, self.skip_domains
            )
        if meta.get("sitemap", False):
            _update_sitemap_stats(self.stats, spider, prefix)

    def process_response(self, request: Request, response: Response, spider: Spider):
        """Count received responses under "dpk_connector/accessed"."""
        parent_result = super().process_response(request, response, spider)
        prefix = "dpk_connector/accessed"
        meta = request.meta
        if not meta.get("system_request", False):
            _update_stats(
                self.stats, request, response, spider, prefix, self.skip_domains
            )
        if meta.get("sitemap", False):
            _update_sitemap_stats(self.stats, spider, prefix)
        return parent_result
226
+
227
+
228
class ConnectorDownloadedStats:
    """
    Spider middleware to expose additional stats.
    """

    def __init__(self, stats: StatsCollector, skip_domains: bool):
        self.stats = stats
        self.skip_domains = skip_domains

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Build the middleware; raise NotConfigured when DOWNLOADER_STATS is off."""
        settings = crawler.settings
        if not settings.getbool("DOWNLOADER_STATS"):
            raise NotConfigured
        return cls(crawler.stats, settings.getbool("STATS_SKIP_DOMAINS"))

    def process_spider_output(
        self,
        response: Response,
        result: Iterable[Request | ConnectorItem],
        spider: Spider,
    ) -> Generator[Any, Any, None]:
        """Pass every spider output through, counting downloaded items on the way."""
        prefix = "dpk_connector/downloaded"
        for output in result:
            if isinstance(output, ConnectorItem):
                # Only downloaded items that are not system requests are counted.
                if output.downloaded and not output.system_request:
                    _update_stats(
                        self.stats,
                        response.request,
                        response,
                        spider,
                        prefix,
                        self.skip_domains,
                    )
                if output.sitemap:
                    _update_sitemap_stats(self.stats, spider, prefix)
            yield output
@@ -0,0 +1,29 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from typing import Any
14
+ from scrapy import Spider
15
+ from scrapy.crawler import Crawler
16
+ from scrapy.exceptions import DropItem
17
+
18
+ from dpk_connector.core.item import ConnectorItem
19
+
20
+
21
class DropPipeline:
    """Item pipeline that keeps only downloaded, non-system items."""

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Create the pipeline; the crawler is not used."""
        return cls()

    def process_item(self, item: ConnectorItem, spider: Spider) -> Any:
        """Return *item* unchanged, or raise DropItem if it must be discarded."""
        if not item.system_request and item.downloaded:
            return item
        raise DropItem
@@ -0,0 +1,70 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
# Scrapy settings module for the connector; loaded by the crawl entry points
# at "project" priority.
BOT_NAME = "dpk-connector"

# Package searched for spider classes.
SPIDER_MODULES = ["dpk_connector.core.spiders"]

# Robots
ROBOTSTXT_OBEY = True
# Upper bound applied by DelayingProtegoRobotParser to delays taken from robots.txt.
ROBOTS_MAX_CRAWL_DELAY = 60
ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser"

# Downloader parameters
CONCURRENT_REQUESTS = 20
CONCURRENT_REQUESTS_PER_DOMAIN = 10
DOWNLOAD_DELAY = 0
RANDOMIZE_DOWNLOAD_DELAY = True
# Also used as the robots.txt download timeout when ROBOTSTXT_DOWNLOAD_TIMEOUT is unset.
DOWNLOAD_TIMEOUT = 180

# Autothrottle
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 0
AUTOTHROTTLE_MAX_DELAY = 300
AUTOTHROTTLE_TARGET_CONCURRENCY = 10
AUTOTHROTTLE_DEBUG = False

# Middlewares/pipelines/extensions
# A value of None disables a component; the stock robots.txt and stats
# middlewares are switched off in favour of the connector's subclasses.
SPIDER_MIDDLEWARES = {
    "dpk_connector.core.middlewares.ConnectorDownloadedStats": 10,
}
DOWNLOADER_MIDDLEWARES = {
    "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": None,
    "dpk_connector.core.middlewares.DelayingRobotsTxtMiddleware": 100,
    "scrapy.downloadermiddlewares.stats.DownloaderStats": None,
    "dpk_connector.core.middlewares.ConnectorRequestedStats": 850,
}
ITEM_PIPELINES = {
    "dpk_connector.core.pipelines.DropPipeline": 100,
}
EXTENSIONS = {
    "scrapy.extensions.telnet.TelnetConsole": None,
    "scrapy.extensions.memdebug.MemoryDebugger": None,
}

# Queue
SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.LifoMemoryQueue"

# Logging
LOG_LEVEL = "INFO"
# Read by QuietLogFormatter: per-item "scraped" log entries are emitted only when True.
LOG_SCRAPED_ITEMS = False
LOG_FORMATTER = "dpk_connector.core.logging.QuietLogFormatter"

# Periodic logging
# NOTE(review): these look like options of Scrapy's PeriodicLog extension, which
# is not listed in EXTENSIONS above - confirm they take effect.
PERIODIC_LOG_DELTA = True
PERIODIC_LOG_STATS = True
PERIODIC_LOG_TIMING_ENABLED = True
LOGSTATS_INTERVAL = 300

# Misc
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
@@ -0,0 +1,11 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
@@ -0,0 +1,342 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import logging
14
+ from typing import Any, Callable, Collection, Generator
15
+ from urllib.parse import ParseResult
16
+
17
+ from scrapy import Request
18
+ from scrapy.http import Response
19
+ from scrapy.link import Link
20
+ from scrapy.linkextractors import LinkExtractor
21
+ from scrapy.spiders import SitemapSpider
22
+ from scrapy.spiders.sitemap import iterloc
23
+ from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
24
+
25
+ from dpk_connector.core.item import ConnectorItem
26
+ from dpk_connector.core.utils import (
27
+ get_base_url,
28
+ get_content_type,
29
+ get_etld1,
30
+ get_focus_path,
31
+ get_fqdn,
32
+ is_allowed_path,
33
+ urlparse_cached,
34
+ )
35
+
36
+
37
class BaseSitemapSpider(SitemapSpider):
    """Spider base that crawls from seed URLs and from the sitemaps it discovers.

    Seeds that look like a sitemap or robots.txt location are fetched as
    sitemaps; every other seed is requested as a regular page. Regular pages
    are dispatched to ``self.parse``, which this class does not define.
    """

    # Priority given to every sitemap/robots.txt request built by this class.
    SITEMAP_DOWNLOAD_PRIORITY = 10

    name = "base-sitemap"

    def __init__(
        self,
        seed_urls: Collection[str],
        allow_domains: Collection[str] = (),
        subdomain_focus: bool = False,
        path_focus: bool = False,
        allow_mime_types: Collection[str] = (),
        disallow_mime_types: Collection[str] = (),
        depth_limit: int = 0,
        disable_sitemap_search: bool = False,
        *args,
        **kwargs,
    ):
        """Configure seeds, sitemap candidates and the crawl filters.

        Args:
            seed_urls: URLs the crawl starts from.
            allow_domains: When non-empty, used as the allowed domains as is.
            subdomain_focus: When ``allow_domains`` is empty, restrict the
                allowed domains to the ``get_fqdn`` value of each seed instead
                of its ``get_etld1`` value.
            path_focus: When true, collect the ``get_focus_path`` value of
                every non-sitemap seed into ``focus_paths``.
            allow_mime_types: Content types to accept (stored lower-cased).
            disallow_mime_types: Content types to reject (stored lower-cased).
            depth_limit: ``0`` disables the depth check in the explore
                helpers; a negative value turns sitemap search off.
            disable_sitemap_search: When true, sitemap locations are never
                guessed from a host name.
            *args: Forwarded to the parent spider.
            **kwargs: Forwarded to the parent spider.
        """
        super().__init__(*args, **kwargs)

        self.depth_limit = depth_limit
        self.sitemap_search = (not disable_sitemap_search) and depth_limit >= 0

        # Build sitemap url candidates
        self.input_seed_urls = seed_urls
        sitemap_suffixes = (
            ".xml",
            ".xml.gz",
            "/robots.txt",
            "robots.txt/",
            "/sitemap",
            "/sitemap/",
        )
        sitemap_urls: list[str] = []
        sitemaps_seen: list[str] = []
        for seed_url in seed_urls:
            parts = urlparse_cached(seed_url)
            if seed_url.endswith(sitemap_suffixes):
                # The seed itself points at a sitemap or robots.txt.
                sitemap_urls.append(seed_url)
            elif self.sitemap_search:
                sitemap_urls.extend(self._get_sitemap_urls(parts))
            # NOTE(review): the host of every seed is recorded as seen,
            # whichever branch ran - confirm against the upstream layout.
            sitemaps_seen.append(parts.netloc)
        self.seed_urls = set(seed_urls) - set(sitemap_urls)
        self.sitemap_urls = set(sitemap_urls)
        self.sitemaps_seen = set(sitemaps_seen)

        # Extract focus paths
        self.focus_paths: set[str] = set()
        if path_focus:
            for seed_url in self.seed_urls:
                path = get_focus_path(seed_url)
                if path is not None:
                    self.focus_paths.add(path)

        # Domains and mime types filtering
        if allow_domains:
            self.allowed_domains = set(allow_domains)
        elif subdomain_focus:
            self.allowed_domains = set()
            for url in seed_urls:
                # Seeds for which no FQDN could be extracted are skipped.
                if fqdn := get_fqdn(url):
                    self.allowed_domains.add(fqdn)
        else:
            self.allowed_domains = {get_etld1(url) for url in seed_urls}
        self.allow_mime_types = {m.lower() for m in allow_mime_types}
        self.disallow_mime_types = {m.lower() for m in disallow_mime_types}

        # Link extraction from html
        self.link_extractor = LinkExtractor(
            allow_domains=self.allowed_domains,
            unique=True,
            deny_extensions=(),
            tags=("a", "area", "link"),
        )

        self.log(
            f"Seed URLs: {self.seed_urls}, sitemap URLs: {self.sitemap_urls}, allow domains: {self.allowed_domains}, focus paths: {self.focus_paths}, allow mime types: {self.allow_mime_types}, disallow mime types: {self.disallow_mime_types}, depth limit: {self.depth_limit}, sitemap search: {self.sitemap_search}",
            logging.INFO,
        )

    def _get_sitemap_urls(self, parts: ParseResult) -> list[str]:
        """Return the candidate sitemap/robots.txt URLs for the host in *parts*."""
        base_url = get_base_url(parts)
        sitemap_variations = (
            "robots.txt",
            "robots.txt/",
            "sitemap.xml",
            "sitemap_index.xml",
            "sitemapindex.xml",
            "sitemap",
            "sitemap-index.xml",
            "sitemap/index.xml",
            "sitemap/sitemap.xml",
            "sitemap1.xml",
        )
        return [f"{base_url}/{sitemap}" for sitemap in sitemap_variations]

    def _build_sitemap_request(
        self, url: str, seed_url: str, previous_url: str
    ) -> Request:
        """Build the request used for every sitemap or robots.txt download."""
        return Request(
            url,
            callback=self._parse_sitemap,
            priority=self.SITEMAP_DOWNLOAD_PRIORITY,
            meta={
                "seed_url": seed_url,
                "previous_url": previous_url,
                "system_request": True,
                "sitemap": True,
            },
        )

    def start_requests(self):
        """Yield the sitemap requests first, then one request per page seed."""
        for url in self.sitemap_urls:
            yield self._build_sitemap_request(url, url, "")
        for url in self.seed_urls:
            yield Request(
                url,
                self.parse,
                meta={
                    "seed_url": url,
                    "previous_url": "",
                },
            )

    def _parse_sitemap(self, response: Response):
        """Handle a robots.txt or sitemap response.

        Always yields one ``ConnectorItem`` flagged as a sitemap system
        request, followed by the requests derived from the document.
        """
        yield ConnectorItem(
            dropped=False, downloaded=False, system_request=True, sitemap=True
        )

        seed_url = response.meta["seed_url"]

        if response.url.endswith(("/robots.txt", "/robots.txt/")):
            for url in sitemap_urls_from_robots(response.text, base_url=response.url):
                yield self._build_sitemap_request(url, seed_url, response.url)
            return

        body = self._get_sitemap_body(response)
        if not body:
            self.log(
                f"Ignoring invalid sitemap: {response}",
                logging.WARNING,
                extra={"spider": self},
            )
            return

        s = Sitemap(body)
        it = self.sitemap_filter(s)

        if s.type == "sitemapindex":
            # Nested sitemaps: follow those matching the follow patterns and
            # the focus paths.
            for loc in iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow) and self._is_allowed_path(
                    loc
                ):
                    yield self._build_sitemap_request(loc, seed_url, response.url)
        elif s.type == "urlset":
            for loc in iterloc(it, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc) and self._is_allowed_path(loc):
                        yield Request(
                            loc,
                            callback=c,
                            meta={
                                "seed_url": seed_url,
                                "previous_url": response.url,
                            },
                        )
                        # Only the first matching rule handles a location.
                        break

    def _is_allowed_path(self, input: str | Request | Response) -> bool:
        """Return whether *input* passes the focus-path filter."""
        return is_allowed_path(input, self.focus_paths)

    def _is_allowed_content_type(self, content_type: str) -> bool:
        """Return whether any allowed mime type occurs in *content_type*."""
        return any(mtype in content_type for mtype in self.allow_mime_types)

    def _is_disallowed_content_type(self, content_type: str) -> bool:
        """Return whether any disallowed mime type occurs in *content_type*."""
        return any(mtype in content_type for mtype in self.disallow_mime_types)

    def _should_download(self, content_type: str | None) -> bool:
        """Decide from the mime-type filters whether content is downloaded.

        With no filter configured everything is downloaded. Otherwise a
        missing content type is rejected, a disallowed type is rejected, and
        when an allow list exists the type must also match it.
        """
        if (not self.allow_mime_types) and (not self.disallow_mime_types):
            return True
        if not content_type:
            return False
        ctype = content_type.lower()
        if self._is_disallowed_content_type(ctype):
            return False
        if self.allow_mime_types:
            return self._is_allowed_content_type(ctype)
        return True

    def _explore_sitemap(self, response: Response) -> Generator[Request, Any, None]:
        """Yield sitemap requests for the response's host when it is new."""
        depth = response.meta.get("depth", 0)
        depth_limit = self.depth_limit
        if not ((depth_limit == 0 or depth < depth_limit) and self.sitemap_search):
            return
        parts = urlparse_cached(response)
        domain = parts.netloc
        if domain in self.sitemaps_seen:
            return
        self.log(f"New domain {domain} found. Search for sitemap.", logging.INFO)
        self.sitemaps_seen.add(domain)
        for sitemap in self._get_sitemap_urls(parts):
            yield self._build_sitemap_request(
                sitemap, response.meta["seed_url"], response.url
            )

    def _explore_links(
        self, response: Response, links: list[Link]
    ) -> Generator[Request, Any, None]:
        """Yield page requests for the links that pass the focus-path filter."""
        depth = response.meta.get("depth", 0)
        depth_limit = self.depth_limit
        if depth_limit == 0 or depth < depth_limit:
            for link in links:
                if self._is_allowed_path(link.url):
                    yield Request(
                        link.url,
                        callback=self.parse,
                        meta={
                            "seed_url": response.meta["seed_url"],
                            "previous_url": response.url,
                        },
                    )
291
+
292
+
293
class ConnectorSitemapSpider(BaseSitemapSpider):
    """Spider that hands every downloadable response to a user callback."""

    name = "dpk-connector-sitemap"

    def __init__(
        self,
        callback: Callable[[str, bytes, dict[str, str]], None],
        *args,
        **kwargs,
    ):
        """Store *callback* and forward the remaining arguments to the base.

        Args:
            callback: Called with the response URL, body and headers for
                every response that should be downloaded.
            *args: Forwarded to ``BaseSitemapSpider``.
            **kwargs: Forwarded to ``BaseSitemapSpider``.
        """
        super().__init__(*args, **kwargs)

        self.callback = callback

    def parse(
        self, response: Response, **kwargs: Any
    ) -> Generator[Request | ConnectorItem, Any, None]:
        """Classify a page response, download it if wanted and follow it.

        Yields one ``ConnectorItem`` describing what happened to the
        response, then the sitemap and link requests derived from it.
        """
        dropped_item_fields = dict(
            dropped=True, downloaded=False, system_request=False, sitemap=False
        )

        content_type = get_content_type(response)
        if not content_type:
            # No Content-Type header: drop right away. The header value must
            # not be dereferenced below because it is None here.
            yield ConnectorItem(**dropped_item_fields)
            return

        is_html = "text/html" in content_type.lower()
        should_download = self._should_download(content_type)
        if not (is_html or should_download):
            yield ConnectorItem(**dropped_item_fields)
            return

        # Download contents
        if should_download:
            self.callback(
                str(response.url), response.body, response.headers.to_unicode_dict()
            )
            # to count up downloaded pages and collect stats
            yield ConnectorItem(
                dropped=False, downloaded=True, system_request=False, sitemap=False
            )
        else:
            yield ConnectorItem(
                dropped=False, downloaded=False, system_request=False, sitemap=False
            )

        # Search for sitemap
        yield from self._explore_sitemap(response)

        # Extract links and dispatch them
        links = self.link_extractor.extract_links(response) if is_html else []
        yield from self._explore_links(response, links)
@@ -0,0 +1,97 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import re
14
+ from urllib.parse import ParseResult, urlparse
15
+
16
+ import tldextract
17
+ from scrapy.http import Request, Response
18
+ from scrapy.http.headers import Headers
19
+ from scrapy.utils.httpobj import urlparse_cached as _urlparse_cached
20
+
21
+
22
+ def _get_header_value(headers: Headers, key: str) -> str | None:
23
+ value = headers.get(key)
24
+ return value.decode("utf-8") if value else None
25
+
26
+
27
def get_header_value(response: Response, key: str) -> str | None:
    """Return the value of header *key* on *response*, or ``None`` if unset."""
    response_headers = response.headers
    return _get_header_value(response_headers, key)
29
+
30
+
31
def get_content_type(response: Response) -> str | None:
    """Return the ``Content-Type`` header of *response*, or ``None`` if unset."""
    header_name = "Content-Type"
    return get_header_value(response, header_name)
33
+
34
+
35
def get_mime_type(content_type: str) -> str:
    """Return the part of *content_type* before the first ``;``, stripped."""
    mime_type, _, _ = content_type.partition(";")
    return mime_type.strip()
37
+
38
+
39
def urlparse_cached(input: str | Request | Response) -> ParseResult:
    """Parse a URL string, or delegate to Scrapy's cached parser for objects."""
    if isinstance(input, str):
        return urlparse(input)
    return _urlparse_cached(input)
41
+
42
+
43
def get_netloc(input: str | Request | Response) -> str:
    """Return the network location (``netloc``) of *input*'s URL."""
    parts = urlparse_cached(input)
    return parts.netloc
45
+
46
+
47
def get_base_url(input: str | Request | Response | ParseResult) -> str:
    """Return ``scheme://netloc`` for *input*.

    An already parsed ``ParseResult`` is used as is; anything else is parsed
    first.
    """
    parts = input if isinstance(input, ParseResult) else urlparse_cached(input)
    return f"{parts.scheme}://{parts.netloc}"
53
+
54
+
55
def get_etld1(url: str) -> str:
    """Return the registrable domain (eTLD+1) of *url*.

    Hosts for which no public suffix is extracted (for example ``localhost``
    or an IP address) are returned without a suffix rather than with a
    dangling trailing dot, so the value can still match the real host.
    """
    ext = tldextract.extract(url)
    if not ext.suffix:
        return ext.domain
    return f"{ext.domain}.{ext.suffix}"
58
+
59
+
60
def get_fqdn(url: str) -> str:
    """Return the ``fqdn`` attribute of the tldextract result for *url*."""
    return tldextract.extract(url).fqdn
63
+
64
+
65
def get_focus_path(url: str) -> str | None:
    """Return the directory part of *url*'s path, or ``None`` if it has none.

    For example ``/docs/guide/index.html`` gives ``/docs/guide/`` while
    ``/index.html`` gives ``None``.
    """
    segments = urlparse_cached(url).path.split("/")
    if len(segments) <= 2:
        return None
    # Drop the last segment and keep the trailing slash.
    return "/".join(segments[:-1]) + "/"
70
+
71
+
72
+ def _check_path(url_path: str, focus_element: str):
73
+ if focus_element.startswith("/"):
74
+ return url_path.startswith(focus_element)
75
+ else:
76
+ return focus_element in url_path
77
+
78
+
79
def is_allowed_path(input: str | Request | Response, focus_paths: set[str]) -> bool:
    """Return whether *input*'s URL path matches any of *focus_paths*.

    The comparison is case-insensitive. An empty *focus_paths* allows
    everything.
    """
    if focus_paths:
        lowered_path = urlparse_cached(input).path.lower()
        for focus in focus_paths:
            if _check_path(lowered_path, focus.lower()):
                return True
        return False
    return True
84
+
85
+
86
def validate_url(url: str) -> bool:
    """Return whether *url* is an http(s) URL with a network location.

    Strings that ``urlparse`` rejects outright (for example an unbalanced
    ``[`` in the host) are reported as invalid instead of raising.
    """
    try:
        result = urlparse(url)
    except ValueError:
        return False
    return result.scheme in ("http", "https") and bool(result.netloc)
93
+
94
+
95
def validate_domain(domain: str) -> bool:
    """Return whether *domain* looks like a dotted host name.

    Each label is 1 to 63 characters of letters, digits or hyphens and must
    start and end with a letter or digit; the last label is at least two
    letters. The whole string must match, so trailing whitespace or a
    newline is rejected.
    """
    label = r"[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?"
    pattern = rf"(?:{label}\.)+[a-zA-Z]{{2,}}"
    return re.fullmatch(pattern, domain) is not None