crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/http_clients/_base.py
CHANGED
@@ -104,6 +104,7 @@ class HttpClient(ABC):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         """Perform the crawling for a given request.

@@ -114,6 +115,7 @@ class HttpClient(ABC):
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
             statistics: The statistics object to register status codes.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
@@ -132,6 +134,7 @@ class HttpClient(ABC):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         """Send an HTTP request via the client.

@@ -144,6 +147,7 @@ class HttpClient(ABC):
             payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
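The new `timeout` parameter threads a per-request deadline through both `crawl` and `send_request`. A minimal usage sketch, assuming the bundled `ImpitHttpClient` as the concrete client:

import asyncio
from datetime import timedelta

from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    client = ImpitHttpClient()
    try:
        response = await client.send_request(
            'https://example.com',
            timeout=timedelta(seconds=5),  # per-call cap; None keeps client defaults
        )
        print(response.status_code)
    except asyncio.TimeoutError:
        # Each concrete client maps its library-specific timeout error to
        # asyncio.TimeoutError, so callers handle a single exception type.
        print('request timed out')


asyncio.run(main())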
crawlee/http_clients/_curl_impersonate.py
CHANGED
@@ -1,7 +1,9 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
-from typing import TYPE_CHECKING, Any
+from http.cookiejar import Cookie
+from typing import TYPE_CHECKING, Any, cast

 from curl_cffi import CurlInfo
 from curl_cffi.const import CurlHttpVersion
@@ -10,10 +12,11 @@ from curl_cffi.requests.cookies import Cookies as CurlCookies
 from curl_cffi.requests.cookies import CurlMorsel
 from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
 from curl_cffi.requests.exceptions import RequestException as CurlRequestError
+from curl_cffi.requests.exceptions import Timeout
 from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
 from typing_extensions import override

-from crawlee._types import HttpHeaders, HttpPayload
+from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
 from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
 from crawlee._utils.docs import docs_group
 from crawlee.errors import ProxyError
@@ -22,11 +25,11 @@ from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
     from datetime import timedelta
-    from http.cookiejar import Cookie

     from curl_cffi import Curl
     from curl_cffi.requests import Request as CurlRequest
     from curl_cffi.requests import Response
+    from curl_cffi.requests.session import HttpMethod as CurlHttpMethod

     from crawlee import Request
     from crawlee._types import HttpMethod
@@ -88,15 +91,17 @@ class _CurlImpersonateResponse:
     async def read(self) -> bytes:
         if self._response.astream_task:
             raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
+
         return self._response.content

     async def read_stream(self) -> AsyncGenerator[bytes, None]:
-        if not self._response.astream_task:
-            raise RuntimeError(
-                'Cannot read stream: either already consumed or Response not obtained from `stream` method'
-            )
+        if not self._response.astream_task:
+            raise RuntimeError('Cannot read stream, Response not obtained from `stream` method.')

-
+        if isinstance(self._response.astream_task, asyncio.Future) and self._response.astream_task.done():
+            raise RuntimeError('Cannot read stream, it was already consumed.')
+
+        async for chunk in self._response.aiter_content():
             yield chunk

@@ -147,17 +152,21 @@ class CurlImpersonateHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)

         try:
             response = await client.request(
                 url=request.url,
-                method=request.method,
+                method=self._convert_method(request.method),
                 headers=request.headers,
                 data=request.payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -186,6 +195,7 @@ class CurlImpersonateHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -196,11 +206,14 @@ class CurlImpersonateHttpClient(HttpClient):
         try:
             response = await client.request(
                 url=url,
-                method=method,
+                method=self._convert_method(method),
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -234,13 +247,15 @@ class CurlImpersonateHttpClient(HttpClient):
         try:
             response = await client.request(
                 url=url,
-                method=method,
+                method=self._convert_method(method),
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
                 stream=True,
                 timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -279,6 +294,40 @@ class CurlImpersonateHttpClient(HttpClient):

         return self._client_by_proxy_url[proxy_url]

+    def _convert_method(self, method: HttpMethod) -> CurlHttpMethod:
+        """Convert from Crawlee HTTP method to curl-cffi HTTP method.
+
+        Args:
+            method: Crawlee HTTP method.
+
+        Returns:
+            Corresponding curl-cffi HTTP method.
+
+        Raises:
+            ValueError: If the provided HTTP method is not supported.
+        """
+        method_upper = method.upper()  # curl-cffi requires uppercase methods
+
+        match method_upper:
+            case 'GET':
+                return 'GET'
+            case 'POST':
+                return 'POST'
+            case 'PUT':
+                return 'PUT'
+            case 'DELETE':
+                return 'DELETE'
+            case 'OPTIONS':
+                return 'OPTIONS'
+            case 'HEAD':
+                return 'HEAD'
+            case 'TRACE':
+                return 'TRACE'
+            case 'PATCH':
+                return 'PATCH'
+            case _:
+                raise ValueError(f'HTTP method {method} is not supported in {self.__class__.__name__}.')
+
     @staticmethod
     def _is_proxy_error(error: CurlRequestError) -> bool:
         """Determine whether the given error is related to a proxy issue.
@@ -296,11 +345,16 @@ class CurlImpersonateHttpClient(HttpClient):

     @staticmethod
     def _get_cookies(curl: Curl) -> list[Cookie]:
-        cookies
-
-
+        cookies = list[Cookie]()
+
+        # Implementation of getinfo always returns list[bytes] for CurlInfo.COOKIELIST.
+        cookie_list = cast('list[bytes]', curl.getinfo(CurlInfo.COOKIELIST))
+
+        for curl_cookie in cookie_list:
+            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)
             cookie = curl_morsel.to_cookiejar_cookie()
             cookies.append(cookie)
+
         return cookies

     async def cleanup(self) -> None:
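`_convert_method` exists largely to satisfy curl-cffi's `HttpMethod` literal type: each `case` returns a distinct string literal that a type checker can narrow, while unknown verbs fail loudly. A behaviorally equivalent standalone sketch (the helper and set below are illustrative, not crawlee API):

# Runtime-equivalent normalization; it loses the per-literal narrowing that
# the match statement gives the type checker.
SUPPORTED_METHODS = frozenset({'GET', 'POST', 'PUT', 'DELETE', 'OPTIONS', 'HEAD', 'TRACE', 'PATCH'})


def convert_method(method: str) -> str:
    method_upper = method.upper()  # curl-cffi requires uppercase methods
    if method_upper not in SUPPORTED_METHODS:
        raise ValueError(f'HTTP method {method} is not supported.')
    return method_upper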
crawlee/http_clients/_httpx.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, cast
@@ -146,6 +147,7 @@ class HttpxHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)
         headers = self._combine_headers(request.headers)
@@ -157,10 +159,13 @@ class HttpxHttpClient(HttpClient):
             content=request.payload,
             cookies=session.cookies.jar if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
+            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -185,6 +190,7 @@ class HttpxHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         client = self._get_client(proxy_info.url if proxy_info else None)

@@ -195,10 +201,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
+            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -228,10 +237,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
-            timeout=timeout,
+            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
         )

-        response = await client.send(http_request, stream=True)
+        try:
+            response = await client.send(http_request, stream=True)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc

         try:
             yield _HttpxResponse(response)
@@ -246,7 +258,7 @@ class HttpxHttpClient(HttpClient):
         headers: HttpHeaders | dict[str, str] | None,
         payload: HttpPayload | None,
         session: Session | None = None,
-        timeout: timedelta | None = None,
+        timeout: httpx.Timeout | None = None,
     ) -> httpx.Request:
         """Build an `httpx.Request` using the provided parameters."""
         if isinstance(headers, dict) or headers is None:
@@ -254,15 +266,13 @@ class HttpxHttpClient(HttpClient):

         headers = self._combine_headers(headers)

-        httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None
-
         return client.build_request(
             url=url,
             method=method,
             headers=dict(headers) if headers else None,
             content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-            timeout=httpx_timeout,
+            timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
         )

     def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
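The httpx changes distinguish two timeout shapes: a blanket `httpx.Timeout(total)` for buffered requests and a connect-only timeout for streaming, where the body may legitimately be read slowly. A short sketch of the difference, using plain httpx:

import httpx

# Buffered requests: the same budget applies to each phase
# (connect, read, write, pool).
buffered_timeout = httpx.Timeout(5.0)

# Streaming requests: bound only the connection phase; reading the body
# stays unbounded because the caller may consume the stream at its own pace.
streaming_timeout = httpx.Timeout(None, connect=5.0)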
crawlee/http_clients/_impit.py
CHANGED
@@ -1,11 +1,12 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, TypedDict

 from cachetools import LRUCache
-from impit import AsyncClient, Browser, HTTPError, Response, TransportError
+from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
 from impit import ProxyError as ImpitProxyError
 from typing_extensions import override

@@ -124,6 +125,7 @@ class ImpitHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

@@ -133,7 +135,10 @@ class ImpitHttpClient(HttpClient):
                 method=request.method,
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -156,6 +161,7 @@ class ImpitHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -164,8 +170,14 @@ class ImpitHttpClient(HttpClient):

         try:
             response = await client.request(
-                method=method,
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -188,18 +200,27 @@ class ImpitHttpClient(HttpClient):
     ) -> AsyncGenerator[HttpResponse]:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

-
-
-
-
-
-
-
-
+        try:
+            response = await client.request(
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
+                stream=True,
+            )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc

         try:
             yield _ImpitResponse(response)
         finally:
-
+            # TODO: https://github.com/apify/impit/issues/242
+            # Quickly closing Response while reading the response body causes an error in the Rust generator in `impit`.
+            # With a short sleep and sync closing, the error does not occur.
+            # Replace with `response.aclose` when this is resolved in impit.
+            await asyncio.sleep(0.01)
+            response.close()

     def _get_client(self, proxy_url: str | None, cookie_jar: CookieJar | None) -> AsyncClient:
         """Retrieve or create an HTTP client for the given proxy URL.
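From the caller's side, the streaming path together with its cleanup workaround looks roughly like this; a hedged sketch assuming `stream` is the async context manager shown above:

import asyncio

from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    client = ImpitHttpClient()
    async with client.stream('https://example.com') as response:
        async for chunk in response.read_stream():
            print(len(chunk))
    # On exit, the client sleeps briefly and closes the response synchronously
    # (the impit#242 workaround), so no extra cleanup is needed here.


asyncio.run(main())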
crawlee/otel/crawler_instrumentor.py
CHANGED
@@ -3,9 +3,7 @@ from __future__ import annotations
 import inspect
 from typing import TYPE_CHECKING, Any

-from opentelemetry.instrumentation.instrumentor import (
-    BaseInstrumentor,
-)
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
 from opentelemetry.instrumentation.utils import unwrap
 from opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME
 from opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD
@@ -69,7 +67,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

         if request_handling_instrumentation:

-            async def
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined]  # valid in our context
                     attributes={
@@ -111,8 +109,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
             # Handpicked interesting methods to instrument
             self._instrumented.extend(
                 [
-                    (_Middleware, 'action',
-                    (_Middleware, 'cleanup',
+                    (_Middleware, 'action', middleware_wrapper),
+                    (_Middleware, 'cleanup', middleware_wrapper),
                     (ContextPipeline, '__call__', context_pipeline_wrapper),
                     (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                     (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
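The `(wrapped, instance, args, kwargs)` signature of `middleware_wrapper` follows the wrapt convention that OpenTelemetry instrumentations build on. A minimal illustration (the `Greeter` class is hypothetical):

from wrapt import wrap_function_wrapper


class Greeter:
    def greet(self, name: str) -> str:
        return f'hello {name}'


def wrapper(wrapped, instance, args, kwargs):
    # `wrapped` is the original bound method, `instance` the object it was
    # called on; a tracing wrapper would open a span around this call.
    print(f'calling {wrapped.__name__} on {type(instance).__name__}')
    return wrapped(*args, **kwargs)


wrap_function_wrapper(Greeter, 'greet', wrapper)
print(Greeter().greet('world'))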
crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml
CHANGED
@@ -5,8 +5,8 @@
 # % endif
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
-# % elif cookiecutter.http_client == '
-# % do extras.append('
+# % elif cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
 # % endif

 [project]
crawlee/request_loaders/_request_list.py
CHANGED
@@ -17,7 +17,7 @@ logger = getLogger(__name__)


 class RequestListState(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     next_index: Annotated[int, Field(alias='nextIndex')] = 0
     next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None
@@ -166,7 +166,7 @@ class RequestList(RequestLoader):
             return None

         state = await self._get_state()
-        state.in_progress.add(self._next[0].id)
+        state.in_progress.add(self._next[0].unique_key)
         self._assumed_total_count += 1

         next_request = self._next[0]
@@ -183,7 +183,7 @@ class RequestList(RequestLoader):
     async def mark_request_as_handled(self, request: Request) -> None:
         self._handled_count += 1
         state = await self._get_state()
-        state.in_progress.remove(request.id)
+        state.in_progress.remove(request.unique_key)

     async def _ensure_next_request(self) -> None:
         await self._get_state()
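`validate_by_name` plus `validate_by_alias` (introduced in pydantic 2.11) make the state model accept both the snake_case field names and their persisted camelCase aliases. A self-contained sketch of the behavior:

from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class State(BaseModel):
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    next_index: Annotated[int, Field(alias='nextIndex')] = 0


print(State(nextIndex=1))   # accepted via the alias
print(State(next_index=2))  # accepted via the field name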
crawlee/request_loaders/_request_loader.py
CHANGED
@@ -43,7 +43,11 @@ class RequestLoader(ABC):

     @abstractmethod
     async def fetch_next_request(self) -> Request | None:
-        """Return the next request to be processed, or `
+        """Return the next request to be processed, or `None` if there are no more pending requests.
+
+        The method should return `None` if and only if `is_finished` would return `True`. In other cases, the method
+        should wait until a request appears.
+        """

     @abstractmethod
     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: