crawlee 1.0.5b2__py3-none-any.whl → 1.0.5b10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_utils/robots.py +17 -5
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- {crawlee-1.0.5b2.dist-info → crawlee-1.0.5b10.dist-info}/METADATA +1 -1
- {crawlee-1.0.5b2.dist-info → crawlee-1.0.5b10.dist-info}/RECORD +10 -10
- {crawlee-1.0.5b2.dist-info → crawlee-1.0.5b10.dist-info}/WHEEL +0 -0
- {crawlee-1.0.5b2.dist-info → crawlee-1.0.5b10.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.5b2.dist-info → crawlee-1.0.5b10.dist-info}/licenses/LICENSE +0 -0
crawlee/_utils/robots.py CHANGED

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from logging import getLogger
 from typing import TYPE_CHECKING
 
 from protego import Protego
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
     from crawlee.proxy_configuration import ProxyInfo
 
 
+logger = getLogger(__name__)
+
+
 class RobotsTxtFile:
     def __init__(
         self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ class RobotsTxtFile:
             http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
-        response = await http_client.send_request(url, proxy_info=proxy_info)
-        body = (
-            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
-        )
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)
+
+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))
+
+        except Exception as e:
+            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
 
-        robots = Protego.parse(body.decode('utf-8'))
+            robots = Protego.parse('User-agent: *\nAllow: /')
 
         return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
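The net effect: transport-level failures while fetching robots.txt (DNS errors, timeouts) no longer propagate; they are logged as a warning and an allow-all policy is substituted. Judging by the new code, 4xx responses already mapped to allow-all, so the try/except extends the same fallback to network errors. A minimal usage sketch, assuming crawlee's `RobotsTxtFile.find` classmethod and `ImpitHttpClient`, neither of which is shown in this diff:

```python
import asyncio

from crawlee._utils.robots import RobotsTxtFile
from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    # With this change, an unreachable host here no longer raises out of the
    # robots.txt lookup; the error is logged and an allow-all policy is used.
    robots = await RobotsTxtFile.find('https://unreachable.invalid/', ImpitHttpClient())

    # The fallback 'User-agent: *\nAllow: /' policy permits every URL.
    print(robots.is_allowed('https://unreachable.invalid/some/page'))  # True


asyncio.run(main())
```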
crawlee/otel/crawler_instrumentor.py CHANGED

@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):
 
         if request_handling_instrumentation:
 
-            async def
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined]  # valid in our context
                     attributes={
@@ -111,8 +111,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
         # Handpicked interesting methods to instrument
         self._instrumented.extend(
             [
-                (_Middleware, 'action',
-                (_Middleware, 'cleanup',
+                (_Middleware, 'action', middleware_wrapper),
+                (_Middleware, 'cleanup', middleware_wrapper),
                 (ContextPipeline, '__call__', context_pipeline_wrapper),
                 (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                 (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
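Both tuples now point at the same named coroutine, and the wrapper signature (`wrapped, instance, args, kwargs`) follows wrapt's convention for function wrappers. A self-contained sketch of that pattern, with a hypothetical `MyMiddleware` standing in for crawlee's internal `_Middleware`:

```python
import asyncio
from typing import Any

import wrapt
from opentelemetry import trace

tracer = trace.get_tracer(__name__)


class MyMiddleware:
    async def action(self) -> str:
        return 'done'


async def middleware_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
    # wrapt hands us the original bound method plus its instance and call args;
    # run it inside a span so the middleware step shows up in the trace.
    with tracer.start_as_current_span(name=f'{type(instance).__name__}.{wrapped.__name__}'):
        return await wrapped(*args, **kwargs)


# Equivalent of applying one (_Middleware, 'action', middleware_wrapper) entry.
wrapt.wrap_function_wrapper(MyMiddleware, 'action', middleware_wrapper)

print(asyncio.run(MyMiddleware().action()))  # 'done', now traced
```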
crawlee/sessions/_session_pool.py CHANGED

@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.
 
-        This is
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.
 
         Args:
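A short usage sketch for the clarified method, assuming the current `crawlee.sessions` API (the `max_usage_count` argument is an illustrative choice, not something this diff shows):

```python
import asyncio

from crawlee.sessions import Session, SessionPool


async def main() -> None:
    async with SessionPool() as pool:
        # The pool creates and rotates sessions on its own; add_session is only
        # for a session prepared outside the pool, e.g. with known-good cookies.
        external = Session(id='warmed-up-session', max_usage_count=10)
        pool.add_session(external)

        session = await pool.get_session()
        print(session.id)


asyncio.run(main())
```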
crawlee/statistics/_error_snapshotter.py CHANGED

@@ -32,7 +32,7 @@ class ErrorSnapshotter:
         """Capture error snapshot and save it to key value store.
 
         It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
         returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
         an exception.
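In other words, per-request store writes are buffered in change records and committed only on handler success, while the snapshotter must persist data precisely when the handler fails. An illustrative sketch of that commit-on-success pattern (toy classes, not crawlee's actual implementation):

```python
from collections.abc import Callable


class ChangeRecords:
    """Toy stand-in for KeyValueStoreChangeRecords: buffers writes."""

    def __init__(self) -> None:
        self.pending: dict[str, bytes] = {}

    def set_value(self, key: str, value: bytes) -> None:
        self.pending[key] = value  # buffered, not yet persisted


def run_handler(handler: Callable[[ChangeRecords], None], store: dict[str, bytes]) -> None:
    records = ChangeRecords()
    try:
        handler(records)
    except Exception:
        # Handler failed: buffered records are discarded, so an error snapshot
        # has to be written straight to the store to survive.
        store['ERROR_SNAPSHOT'] = b'...captured page state...'
        raise
    else:
        store.update(records.pending)  # committed only on success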
crawlee/storage_clients/_sql/_db_models.py CHANGED

@@ -205,9 +205,8 @@ class RequestDb(Base):
             'idx_fetch_available',
             'request_queue_id',
             'is_handled',
-            'time_blocked_until',
             'sequence_number',
-            postgresql_where=text('is_handled
+            postgresql_where=text('is_handled is false'),
         ),
     )
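The change narrows `idx_fetch_available`: the `time_blocked_until` column is dropped and the partial-index predicate is now simply `is_handled is false`, so PostgreSQL only indexes unhandled requests. A standalone sketch of declaring such a partial index in SQLAlchemy, with columns simplified relative to the real model:

```python
from sqlalchemy import Boolean, Index, Integer, String, text
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class RequestDb(Base):
    __tablename__ = 'request'

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    request_queue_id: Mapped[str] = mapped_column(String)
    is_handled: Mapped[bool] = mapped_column(Boolean, default=False)
    sequence_number: Mapped[int] = mapped_column(Integer)

    __table_args__ = (
        Index(
            'idx_fetch_available',
            'request_queue_id',
            'is_handled',
            'sequence_number',
            # PostgreSQL indexes only the rows matching this predicate, which
            # keeps the index small for the "fetch next unhandled" query.
            postgresql_where=text('is_handled is false'),
        ),
    )
```

On non-PostgreSQL backends, SQLAlchemy ignores the `postgresql_`-prefixed dialect argument, so the same model remains portable to e.g. SQLite.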
{crawlee-1.0.5b2.dist-info → crawlee-1.0.5b10.dist-info}/RECORD CHANGED

@@ -32,7 +32,7 @@ crawlee/_utils/raise_if_too_many_kwargs.py,sha256=J2gaUJmsmNwexohuehXw_mdYKv-eWi
 crawlee/_utils/recoverable_state.py,sha256=c1D2ZecxEliGZzhqYz9_oU5CF2Hm0UKvpOHqO6CDJRE,9032
 crawlee/_utils/recurring_task.py,sha256=sQMiURuDXbwwfAcIXK8V4NXncSxIBxsqN1cZWX7DLyg,2128
 crawlee/_utils/requests.py,sha256=yOjai7bHR9_duPJ0ck-L76y9AnKZr49JBfSOQv9kvJc,5048
-crawlee/_utils/robots.py,sha256=
+crawlee/_utils/robots.py,sha256=DBU5ni4Y-p7bIKMbLd_ws8wgHSFc4K8zPVF3JvH_pkw,4661
 crawlee/_utils/sitemap.py,sha256=UI9EJiFiyFvV5_flVUtdsEVz8ZsJeRERPtcx8ZsqjTU,16632
 crawlee/_utils/system.py,sha256=tA8AP__9vsJ9OTLTnAYAKkxc8U5-IEna0N_hqYBybUo,4294
 crawlee/_utils/time.py,sha256=WK17P939r65dLz2rWvL59OEJoxgzdinw-ND9WuG4DuU,2353
@@ -109,7 +109,7 @@ crawlee/http_clients/_curl_impersonate.py,sha256=EBaoJZzBDgsfhLRgZwu-mFzKqFJ8BDF
 crawlee/http_clients/_httpx.py,sha256=pMVyL5P1UU49svqRr7hqcxInIuT4MT3R7UBmkJ07xMc,11780
 crawlee/http_clients/_impit.py,sha256=nGSe5NuZmsJjQzsfUGIZdbZFC48_Gpu8aU3I8kpq6O4,8833
 crawlee/otel/__init__.py,sha256=g5y1tJfpDKfcIPGcKBztMgP6sptum-vJrtemeR8_-co,108
-crawlee/otel/crawler_instrumentor.py,sha256
+crawlee/otel/crawler_instrumentor.py,sha256=yC367A1NnAdhOanvym2zfiu4H4BskUslrib0GcHiVJs,6865
 crawlee/project_template/cookiecutter.json,sha256=dJeYxLx5QEy2DCzXsDpqJQJlIJ3nw42lJrclZFoSZ8w,622
 crawlee/project_template/hooks/post_gen_project.py,sha256=EBNgb_-eodDxaYJljYYznZGASnduxJ54RDO_b4ofm4M,1296
 crawlee/project_template/hooks/pre_gen_project.py,sha256=AqvHJRTOTQzKFX10Zt8uKt8UFHczDJ1ogPPHQVdY2ZU,1175
@@ -142,10 +142,10 @@ crawlee/sessions/__init__.py,sha256=dJdelbL-6MK5sW4SMU4QrjFbb9kRZ9uRnN-VS3R5-8Y,
 crawlee/sessions/_cookies.py,sha256=ihYbmpXfCzClzXDT7M2wefB_3KVzcMUdIzTZo6uGk6Y,9356
 crawlee/sessions/_models.py,sha256=JMRQgDUP30XUdZ32isncHowOsXvK9jC_m9QYegbBI1E,2916
 crawlee/sessions/_session.py,sha256=cMXVf6QjfGJDgdLUB4MhUP-zTm3pEDHRs-W5SBA4JFI,9638
-crawlee/sessions/_session_pool.py,sha256
+crawlee/sessions/_session_pool.py,sha256=_FYTXRANDiREK09DSoHIu-536sSUj9wazbrcl6iKdFM,9631
 crawlee/sessions/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlee/statistics/__init__.py,sha256=lXAsHNkeRZQBffW1B7rERarivXIUJveNlcKTGOXQZY0,154
-crawlee/statistics/_error_snapshotter.py,sha256=
+crawlee/statistics/_error_snapshotter.py,sha256=g-roZgkJ-glyStZL7gXrOhrpdZvZ686W9lR43uZjPao,3279
 crawlee/statistics/_error_tracker.py,sha256=x9Yw1TuyEptjwgPPJ4gIom-0oVjawcNReQDsHH2nZ3w,8553
 crawlee/statistics/_models.py,sha256=SFWYpT3r1c4XugU8nrm0epTpcM5_0fS1mXi9fnbhGJ8,5237
 crawlee/statistics/_statistics.py,sha256=d6z5XxXm-an4M_8TierOPpSB78vxqxwvUFCewIEmiK4,12786
@@ -174,7 +174,7 @@ crawlee/storage_clients/_memory/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 crawlee/storage_clients/_sql/__init__.py,sha256=X_fDMc6jn50gEBZ9QyUw54sjovYfFvE-dgXAdci6Y2M,312
 crawlee/storage_clients/_sql/_client_mixin.py,sha256=U9ThDUuRbT5JDtCFlBurhZIs1Ay5t9fTfPXXI_4dwHY,15988
 crawlee/storage_clients/_sql/_dataset_client.py,sha256=tiJVvOPZgc7cy4kGfWnun-g2TJMHMdaLnoqns5Sl6ek,10203
-crawlee/storage_clients/_sql/_db_models.py,sha256=
+crawlee/storage_clients/_sql/_db_models.py,sha256=KzA-R_L6zv9gqQg7B27mF-fERNJuMUEnewV9iofmTnI,9812
 crawlee/storage_clients/_sql/_key_value_store_client.py,sha256=LnVLWhOjo4LdvtCac4fwuf__DgEQjlqSxz8KkjY3Qx4,11311
 crawlee/storage_clients/_sql/_request_queue_client.py,sha256=OlvAOwEoYY5f4NO7BdhLFRT_i_E3YzJDb_ptKKK2huY,29478
 crawlee/storage_clients/_sql/_storage_client.py,sha256=ITtMpwfotIW4SZjO4rycB5wfMKaqTAJgMvzcUZxckrk,10905
@@ -187,8 +187,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
 crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
 crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
 crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlee-1.0.
-crawlee-1.0.
-crawlee-1.0.
-crawlee-1.0.
-crawlee-1.0.
+crawlee-1.0.5b10.dist-info/METADATA,sha256=_P4cueMHtYUNXkWueR4kntoDN6nQuUQzQ52cEnPt_1o,29315
+crawlee-1.0.5b10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+crawlee-1.0.5b10.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
+crawlee-1.0.5b10.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+crawlee-1.0.5b10.dist-info/RECORD,,
{crawlee-1.0.5b2.dist-info → crawlee-1.0.5b10.dist-info}/WHEEL: file without changes

{crawlee-1.0.5b2.dist-info → crawlee-1.0.5b10.dist-info}/entry_points.txt: file without changes

{crawlee-1.0.5b2.dist-info → crawlee-1.0.5b10.dist-info}/licenses/LICENSE: file without changes