aioscrapper 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aioscrapper/__init__.py +1 -2
- aioscrapper/scrapper/base.py +1 -1
- aioscrapper/scrapper/executor.py +14 -16
- aioscrapper/{request_manager.py → scrapper/request_manager.py} +77 -10
- aioscrapper/types/__init__.py +1 -2
- aioscrapper/types/session.py +20 -11
- {aioscrapper-0.1.0.dist-info → aioscrapper-0.1.1.dist-info}/METADATA +2 -1
- aioscrapper-0.1.1.dist-info/RECORD +22 -0
- {aioscrapper-0.1.0.dist-info → aioscrapper-0.1.1.dist-info}/WHEEL +1 -1
- {aioscrapper-0.1.0.dist-info → aioscrapper-0.1.1.dist-info}/licenses/LICENSE +1 -1
- aioscrapper/request_sender.py +0 -52
- aioscrapper-0.1.0.dist-info/RECORD +0 -23
- {aioscrapper-0.1.0.dist-info → aioscrapper-0.1.1.dist-info}/top_level.txt +0 -0
aioscrapper/__init__.py
CHANGED
@@ -4,7 +4,6 @@ __author__ = "darkstussy"
 
 __copyright__ = f"Copyright (c) 2025 {__author__}"
 
-from .request_sender import RequestSender
 from .scrapper import AIOScrapper, BaseScrapper
 
-__all__ = ["AIOScrapper", "BaseScrapper"
+__all__ = ["AIOScrapper", "BaseScrapper"]
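With `RequestSender` dropped from the package root, the public surface in 0.1.1 is just the two names kept in `__all__`:

```python
# Root-level imports available in 0.1.1 per the new __all__
from aioscrapper import AIOScrapper, BaseScrapper
```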
aioscrapper/scrapper/base.py
CHANGED
aioscrapper/scrapper/executor.py
CHANGED
@@ -6,11 +6,10 @@ from typing import Type, Any
 
 from aiojobs import Scheduler
 
+from .request_manager import RequestManager
 from ..config import Config
 from ..helpers import get_func_kwargs
 from ..pipeline import PipelineDispatcher, BasePipeline
-from ..request_manager import RequestManager
-from ..request_sender import RequestSender
 from ..scrapper import BaseScrapper
 from ..session.aiohttp import AiohttpSession
 from ..types import RequestMiddleware, ResponseMiddleware
@@ -39,7 +38,7 @@ class AIOScrapper:
 
         def _exception_handler(_, context: dict[str, Any]):
             if "job" in context:
-                self._logger.error(f'{context[
+                self._logger.error(f'{context["message"]}: {context["exception"]}', extra={"context": context})
             else:
                 self._logger.error("Unhandled error", extra={"context": context})
 
@@ -51,7 +50,6 @@ class AIOScrapper:
         )
 
         self._request_queue = asyncio.PriorityQueue()
-        self._request_sender = RequestSender(self._request_queue)
         self._request_manager = RequestManager(
             logger=self._logger.getChild("request_worker"),
             session=AiohttpSession(
@@ -59,7 +57,6 @@ class AIOScrapper:
                 ssl=self._config.session.request.ssl,
             ),
             schedule_request=self._scheduler.spawn,
-            sender=self._request_sender,
             queue=self._request_queue,
             delay=self._config.session.request.delay,
             shutdown_timeout=self._config.execution.shutdown_timeout,
@@ -69,8 +66,6 @@ class AIOScrapper:
             response_middlewares=self._response_middlewares,
         )
 
-        self._scrapper_kwargs = {"request_sender": self._request_sender, "pipeline": self._pipeline_dispatcher}
-
     def add_pipeline(self, name: str, pipeline: BasePipeline) -> None:
         if name not in self._pipelines:
             self._pipelines[name] = [pipeline]
@@ -86,7 +81,7 @@ class AIOScrapper:
     def add_response_middlewares(self, *middlewares: ResponseMiddleware) -> None:
         self._response_middlewares.extend(middlewares)
 
-    async def __aenter__(self):
+    async def __aenter__(self) -> "AIOScrapper":
         return self
 
     async def __aexit__(
@@ -101,11 +96,12 @@ class AIOScrapper:
         await self._pipeline_dispatcher.initialize()
         self._request_manager.listen_queue()
 
+        scrapper_kwargs = {"request_sender": self._request_manager.sender, "pipeline": self._pipeline_dispatcher}
        for scrapper in self._scrappers:
-            await scrapper.initialize(**get_func_kwargs(scrapper.initialize,
+            await scrapper.initialize(**get_func_kwargs(scrapper.initialize, scrapper_kwargs))
 
         await asyncio.gather(
-            *[scrapper.start(**get_func_kwargs(scrapper.start,
+            *[scrapper.start(**get_func_kwargs(scrapper.start, scrapper_kwargs)) for scrapper in self._scrappers]
         )
 
     async def _shutdown(self) -> bool:
@@ -138,9 +134,11 @@ class AIOScrapper:
         if shutdown:
             await self.shutdown()
 
-
-
-
-
-
-
+        scrapper_kwargs = {"pipeline": self._pipeline_dispatcher}
+        try:
+            for scrapper in self._scrappers:
+                await scrapper.close(**get_func_kwargs(scrapper.close, scrapper_kwargs))
+        finally:
+            await self._scheduler.close()
+            await self._request_manager.close()
+            await self._pipeline_dispatcher.close()
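The executor now builds `scrapper_kwargs` from `RequestManager.sender` at start-up (and a pipeline-only variant at shutdown) and passes each scrapper hook only the keyword arguments it declares, via `get_func_kwargs`. A minimal sketch of a scrapper that consumes those injected arguments, assuming only the hook names visible in this diff (`initialize`, `start`, `close`); how scrappers are registered with `AIOScrapper` and the exact callback signature are not shown here and are assumptions:

```python
# Hypothetical scrapper sketch based on the hooks visible in this diff.
from aioscrapper import BaseScrapper
from aioscrapper.types import RequestSender


class QuotesScrapper(BaseScrapper):
    async def start(self, request_sender: RequestSender) -> None:
        # The executor injects RequestManager.sender here as "request_sender".
        await request_sender(url="https://example.com/quotes", callback=self.parse)

    async def parse(self, response) -> None:  # response type assumed, not part of this diff
        ...

    async def close(self) -> None:
        # Called during shutdown with only the kwargs it declares.
        ...
```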
aioscrapper/request_manager.py → aioscrapper/scrapper/request_manager.py
RENAMED
@@ -1,13 +1,76 @@
 import asyncio
+from dataclasses import dataclass, field
 from logging import Logger
-from typing import Callable, Awaitable, Any
+from typing import Callable, Awaitable, Any
+from typing import Coroutine
 
-from
-from
-from .
-from
-
-
+from ..exceptions import HTTPException, RequestException, ClientException
+from ..helpers import get_cb_kwargs
+from ..session.base import BaseSession
+from ..types import (
+    QueryParams,
+    Cookies,
+    Headers,
+    BasicAuth,
+    Request,
+    RequestParams,
+    RequestMiddleware,
+    ResponseMiddleware,
+    RequestSender,
+)
+
+
+@dataclass(slots=True, order=True)
+class _PRPRequest:
+    priority: int
+    request: Request = field(compare=False)
+    request_params: RequestParams = field(compare=False)
+
+
+_RequestQueue = asyncio.PriorityQueue[_PRPRequest | None]
+
+
+def _get_request_sender(queue: _RequestQueue) -> RequestSender:
+    async def sender(
+        url: str,
+        method: str = "GET",
+        callback: Callable[..., Awaitable] | None = None,
+        cb_kwargs: dict[str, Any] | None = None,
+        errback: Callable[..., Awaitable] | None = None,
+        params: QueryParams | None = None,
+        data: Any = None,
+        json_data: Any = None,
+        cookies: Cookies | None = None,
+        headers: Headers | None = None,
+        proxy: str | None = None,
+        auth: BasicAuth | None = None,
+        timeout: float | None = None,
+        priority: int = 0,
+    ) -> None:
+        await queue.put(
+            _PRPRequest(
+                priority=priority,
+                request=Request(
+                    method=method,
+                    url=url,
+                    params=params,
+                    data=data,
+                    json_data=json_data,
+                    cookies=cookies,
+                    headers=headers,
+                    auth=auth,
+                    proxy=proxy,
+                    timeout=timeout,
+                ),
+                request_params=RequestParams(
+                    callback=callback,
+                    cb_kwargs=cb_kwargs,
+                    errback=errback,
+                ),
+            )
+        )
+
+    return sender
 
 
 class RequestManager:
@@ -16,8 +79,7 @@ class RequestManager:
         logger: Logger,
         session: BaseSession,
         schedule_request: Callable[[Coroutine], Awaitable],
-
-        queue: RequestQueue,
+        queue: _RequestQueue,
         delay: float,
         shutdown_timeout: float,
         srv_kwargs: dict[str, Any],
@@ -31,12 +93,17 @@ class RequestManager:
         self._queue = queue
         self._delay = delay
         self._shutdown_timeout = shutdown_timeout
-        self.
+        self._request_sender = _get_request_sender(queue)
+        self._srv_kwargs = {"send_request": self._request_sender, **srv_kwargs}
         self._request_outer_middlewares = request_outer_middlewares
         self._request_inner_middlewares = request_inner_middlewares
         self._response_middlewares = response_middlewares
         self._task: asyncio.Task | None = None
 
+    @property
+    def sender(self) -> RequestSender:
+        return self._request_sender
+
     async def _send_request(self, request: Request, params: RequestParams) -> None:
         full_url = request.full_url
         self._logger.debug(f"request: {request.method} {full_url}")
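`_PRPRequest` is declared with `order=True` while its `request` and `request_params` fields use `field(compare=False)`, so only `priority` participates in the comparisons performed by `asyncio.PriorityQueue`. A standalone sketch of that ordering behavior, using a stand-in string payload instead of the library's `Request`/`RequestParams` types:

```python
import asyncio
from dataclasses import dataclass, field


@dataclass(slots=True, order=True)
class PrioritizedItem:
    priority: int
    payload: str = field(compare=False)  # excluded from ordering, like request/request_params


async def main() -> None:
    queue: asyncio.PriorityQueue[PrioritizedItem] = asyncio.PriorityQueue()
    await queue.put(PrioritizedItem(priority=5, payload="fetched later"))
    await queue.put(PrioritizedItem(priority=0, payload="fetched first"))
    print((await queue.get()).payload)  # lower priority value comes out first
    print((await queue.get()).payload)


asyncio.run(main())
```

Excluding the payload from comparison also avoids a `TypeError` when two queued items share the same priority, since the non-orderable request objects are never compared.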
aioscrapper/types/__init__.py
CHANGED
aioscrapper/types/session.py
CHANGED
@@ -1,7 +1,6 @@
-import asyncio
 import json
-from dataclasses import
-from typing import Union, Mapping, Any, Callable, Awaitable, TypedDict
+from dataclasses import dataclass
+from typing import Union, Mapping, Any, Callable, Awaitable, TypedDict, Protocol
 from urllib.parse import urlencode
 
 QueryParams = Mapping[str, Union[str, int, float]]
@@ -40,14 +39,24 @@ class RequestParams:
     errback: Callable[..., Awaitable] | None = None
 
 
-
-
-
-
-
-
-
+class RequestSender(Protocol):
+    async def __call__(
+        self,
+        url: str,
+        method: str = "GET",
+        callback: Callable[..., Awaitable] | None = None,
+        cb_kwargs: dict[str, Any] | None = None,
+        errback: Callable[..., Awaitable] | None = None,
+        params: QueryParams | None = None,
+        data: Any = None,
+        json_data: Any = None,
+        cookies: Cookies | None = None,
+        headers: Headers | None = None,
+        proxy: str | None = None,
+        auth: BasicAuth | None = None,
+        timeout: float | None = None,
+        priority: int = 0,
+    ) -> None: ...
 
 
 class Response:
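`RequestSender` is now a `typing.Protocol`, so any async callable with a matching `__call__` signature satisfies it structurally; the closure returned by `_get_request_sender` above is one such implementation. A small, hypothetical sketch of user code annotated against the protocol (the function name and paging logic are illustrative, not part of the package):

```python
from aioscrapper.types import RequestSender


async def crawl_listing(send_request: RequestSender, base_url: str) -> None:
    # Accepts any protocol-compatible callable, e.g. RequestManager.sender.
    for page in range(1, 4):
        await send_request(url=f"{base_url}?page={page}", priority=page)
```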
{aioscrapper-0.1.0.dist-info → aioscrapper-0.1.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aioscrapper
-Version: 0.1.0
+Version: 0.1.1
 Summary: Async framework for building modular and scalable web scrapers.
 Author: darkstussy
 Project-URL: Homepage, https://github.com/darkstussy/aioscrapper
@@ -83,5 +83,6 @@ Copyright (c) 2025 darkstussy
 
 ## Links
 
+- [PyPI](https://pypi.org/project/aioscrapper)
 - [GitHub](https://github.com/darkstussy/aioscrapper)
 - [Issues](https://github.com/darkstussy/aioscrapper/issues)
aioscrapper-0.1.1.dist-info/RECORD
ADDED
@@ -0,0 +1,22 @@
+aioscrapper/__init__.py,sha256=Yl57BbmijQN_UgP5nRUiWYqgU8kXQ9kplzzzyMcsUMY,197
+aioscrapper/config.py,sha256=yO5ipQUHxA_-CiSqJ0u7WioN6lu8VgT1ss5PRvS1foc,844
+aioscrapper/exceptions.py,sha256=Akk3zDTgws9E7J-Sh8bgdlgS8L3auDKuv3_U3aefxMc,765
+aioscrapper/helpers.py,sha256=slq9r5oCHrR7M9hKZFBLFRsWoqJcw_QFptQI1NjIdQw,610
+aioscrapper/pipeline/__init__.py,sha256=hv7Kcssd2BP0LM9fNZtaMs1tmRuAUu4mwAescoeV3Uk,84
+aioscrapper/pipeline/base.py,sha256=Ro7YGUOB-V2NJCtfgwhtQDedY4OYMu-jwEV8iR-L89k,405
+aioscrapper/pipeline/dispatcher.py,sha256=H4cHNxTyHEF4BnEwaW6nwmcRmK839GqbDTzZh1Zftv4,1156
+aioscrapper/scrapper/__init__.py,sha256=UR7bTck-_YVoP2BqYdPldN9PgaCuJf9wvDdQLTVJ578,65
+aioscrapper/scrapper/base.py,sha256=2_WeLMyJICLmIG7N9r6BGmBg0f-wjEQPsVY076WHKOI,241
+aioscrapper/scrapper/executor.py,sha256=Rz2dNmdFOvjUXM6-8GLGNTpZmlEkxD24ZK3qlnioSuQ,5495
+aioscrapper/scrapper/request_manager.py,sha256=xhF_feppHQognTbbHjVUC13V4NwJJt7bCWwcyznFK84,5831
+aioscrapper/session/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+aioscrapper/session/aiohttp.py,sha256=8h4Ca1o2JJ7L24iqEnZ5I8bUudTn2cvTngBhM8eJPD4,1753
+aioscrapper/session/base.py,sha256=Zxw1VHIe_LgveUufJXh_cl0YeHykiutQveMUwZ6VL54,356
+aioscrapper/types/__init__.py,sha256=OK7vKZznJAWXQFBhXbQJha-XqGoRAE3lCaJUF7WXn64,210
+aioscrapper/types/middleware.py,sha256=WtT73QTAlwhdP6UNFgyFHGpFOx1vlxehCAwiO6xjR10,326
+aioscrapper/types/session.py,sha256=WppvDBZ0sBWVddzz7RXLkg8iZCfZipTdHpKuGW-U090,2970
+aioscrapper-0.1.1.dist-info/licenses/LICENSE,sha256=EEeV20hghyroJWe2vcHjJma9PcjSkjD6vIwlUtaAjLE,1067
+aioscrapper-0.1.1.dist-info/METADATA,sha256=Q10bIQpw0JBM8oG3x5mP1pOFtUURYWcMOHvWLs-rVQQ,2350
+aioscrapper-0.1.1.dist-info/WHEEL,sha256=lTU6B6eIfYoiQJTZNc-fyaR6BpL6ehTzU3xGYxn2n8k,91
+aioscrapper-0.1.1.dist-info/top_level.txt,sha256=d7lbzXOwzzk2HLh-A0X7dkqn8q3zGAJcKqx6TkaEEWI,12
+aioscrapper-0.1.1.dist-info/RECORD,,
aioscrapper/request_sender.py
DELETED
@@ -1,52 +0,0 @@
-import asyncio
-from typing import Callable, Awaitable, Any
-
-from .types import QueryParams, Cookies, Headers, BasicAuth, Request, RequestParams, RequestQueue, PRPRequest
-
-
-class RequestSender:
-    def __init__(self, queue: RequestQueue) -> None:
-        self._queue = queue
-
-    async def __call__(
-        self,
-        url: str,
-        method: str = "GET",
-        callback: Callable[..., Awaitable] | None = None,
-        cb_kwargs: dict[str, Any] | None = None,
-        errback: Callable[..., Awaitable] | None = None,
-        params: QueryParams | None = None,
-        data: Any = None,
-        json_data: Any = None,
-        cookies: Cookies | None = None,
-        headers: Headers | None = None,
-        proxy: str | None = None,
-        auth: BasicAuth | None = None,
-        timeout: float | None = None,
-        priority: int = 0,
-        delay: float | None = None,
-    ) -> None:
-        await self._queue.put(
-            PRPRequest(
-                priority=priority,
-                request=Request(
-                    method=method,
-                    url=url,
-                    params=params,
-                    data=data,
-                    json_data=json_data,
-                    cookies=cookies,
-                    headers=headers,
-                    auth=auth,
-                    proxy=proxy,
-                    timeout=timeout,
-                ),
-                request_params=RequestParams(
-                    callback=callback,
-                    cb_kwargs=cb_kwargs,
-                    errback=errback,
-                ),
-            )
-        )
-        if delay:
-            await asyncio.sleep(delay)
aioscrapper-0.1.0.dist-info/RECORD
DELETED
@@ -1,23 +0,0 @@
-aioscrapper/__init__.py,sha256=_01EI59FLQmHspoN9HqJMoJ9OHpEaYGDyFXAKtfnnYY,256
-aioscrapper/config.py,sha256=yO5ipQUHxA_-CiSqJ0u7WioN6lu8VgT1ss5PRvS1foc,844
-aioscrapper/exceptions.py,sha256=Akk3zDTgws9E7J-Sh8bgdlgS8L3auDKuv3_U3aefxMc,765
-aioscrapper/helpers.py,sha256=slq9r5oCHrR7M9hKZFBLFRsWoqJcw_QFptQI1NjIdQw,610
-aioscrapper/request_manager.py,sha256=YLZvuPthhFMnJVQ7pV9-YCsni0Kdu8baP5tmOccEDOM,4037
-aioscrapper/request_sender.py,sha256=_Vx_LJyV_5qb23-C3VdnOUUUcQPW42OJNbtERVu1DIA,1644
-aioscrapper/pipeline/__init__.py,sha256=hv7Kcssd2BP0LM9fNZtaMs1tmRuAUu4mwAescoeV3Uk,84
-aioscrapper/pipeline/base.py,sha256=Ro7YGUOB-V2NJCtfgwhtQDedY4OYMu-jwEV8iR-L89k,405
-aioscrapper/pipeline/dispatcher.py,sha256=H4cHNxTyHEF4BnEwaW6nwmcRmK839GqbDTzZh1Zftv4,1156
-aioscrapper/scrapper/__init__.py,sha256=UR7bTck-_YVoP2BqYdPldN9PgaCuJf9wvDdQLTVJ578,65
-aioscrapper/scrapper/base.py,sha256=_wFrI0UVsTBIAV7EOZCk_QMy2-chPjr1pKzu6w8Huso,224
-aioscrapper/scrapper/executor.py,sha256=TrZBh0JyFeQIJd_O4S86cSZNgywjKxJWdb2QSzZyObU,5475
-aioscrapper/session/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-aioscrapper/session/aiohttp.py,sha256=8h4Ca1o2JJ7L24iqEnZ5I8bUudTn2cvTngBhM8eJPD4,1753
-aioscrapper/session/base.py,sha256=Zxw1VHIe_LgveUufJXh_cl0YeHykiutQveMUwZ6VL54,356
-aioscrapper/types/__init__.py,sha256=SC6De0ThepMK-wPQGFYGq6x8aGlzsWscphFxGQGBWek,225
-aioscrapper/types/middleware.py,sha256=WtT73QTAlwhdP6UNFgyFHGpFOx1vlxehCAwiO6xjR10,326
-aioscrapper/types/session.py,sha256=ffJelDaZmeIoNOk_ivGb_nSC5bBpgKwCyiSsUl4e-B0,2595
-aioscrapper-0.1.0.dist-info/licenses/LICENSE,sha256=LefKIkLsd_UuLWYOatzEjY5yscQS8nZAFi8rzCs54OM,1066
-aioscrapper-0.1.0.dist-info/METADATA,sha256=pbjxh2xCDlGRYDEFXT40JBL6_7Xkw31-BXivX_Fmog0,2303
-aioscrapper-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-aioscrapper-0.1.0.dist-info/top_level.txt,sha256=d7lbzXOwzzk2HLh-A0X7dkqn8q3zGAJcKqx6TkaEEWI,12
-aioscrapper-0.1.0.dist-info/RECORD,,
{aioscrapper-0.1.0.dist-info → aioscrapper-0.1.1.dist-info}/top_level.txt
File without changes