aioscrapper 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aioscrapper/__init__.py +10 -0
- aioscrapper/config.py +36 -0
- aioscrapper/exceptions.py +26 -0
- aioscrapper/helpers.py +22 -0
- aioscrapper/pipeline/__init__.py +2 -0
- aioscrapper/pipeline/base.py +19 -0
- aioscrapper/pipeline/dispatcher.py +36 -0
- aioscrapper/request_manager.py +101 -0
- aioscrapper/request_sender.py +52 -0
- aioscrapper/scrapper/__init__.py +2 -0
- aioscrapper/scrapper/base.py +10 -0
- aioscrapper/scrapper/executor.py +146 -0
- aioscrapper/session/__init__.py +0 -0
- aioscrapper/session/aiohttp.py +46 -0
- aioscrapper/session/base.py +14 -0
- aioscrapper/types/__init__.py +12 -0
- aioscrapper/types/middleware.py +11 -0
- aioscrapper/types/session.py +109 -0
- aioscrapper-0.1.0.dist-info/METADATA +87 -0
- aioscrapper-0.1.0.dist-info/RECORD +23 -0
- aioscrapper-0.1.0.dist-info/WHEEL +5 -0
- aioscrapper-0.1.0.dist-info/licenses/LICENSE +21 -0
- aioscrapper-0.1.0.dist-info/top_level.txt +1 -0
aioscrapper/__init__.py
ADDED
```python
__title__ = "aioscrapper"

__author__ = "darkstussy"

__copyright__ = f"Copyright (c) 2025 {__author__}"

from .request_sender import RequestSender
from .scrapper import AIOScrapper, BaseScrapper

__all__ = ["AIOScrapper", "BaseScrapper", "RequestSender"]
```
aioscrapper/config.py
ADDED
```python
import logging
from dataclasses import dataclass


@dataclass(slots=True, frozen=True)
class RequestConfig:
    timeout: int = 60
    delay: float = 0.0
    ssl: bool = True


@dataclass(slots=True, frozen=True)
class SessionConfig:
    request: RequestConfig = RequestConfig()


@dataclass(slots=True, frozen=True)
class SchedulerConfig:
    concurrent_requests: int = 64
    pending_requests: int = 1
    close_timeout: float | None = 0.1


@dataclass(slots=True, frozen=True)
class ExecutionConfig:
    timeout: float | None = None
    shutdown_timeout: float = 0.1
    shutdown_check_interval: float = 0.1
    log_level: int = logging.ERROR


@dataclass(slots=True, frozen=True)
class Config:
    session: SessionConfig = SessionConfig()
    scheduler: SchedulerConfig = SchedulerConfig()
    execution: ExecutionConfig = ExecutionConfig()
```
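Every layer here is a frozen, slotted dataclass with defaults, so a custom configuration is built by overriding only the fields of interest. A minimal sketch (not part of the package) of composing one:

```python
# Sketch: composing a custom Config by overriding selected fields.
# All class names come from aioscrapper/config.py above.
from aioscrapper.config import Config, RequestConfig, SchedulerConfig, SessionConfig

config = Config(
    session=SessionConfig(request=RequestConfig(timeout=30, delay=0.5, ssl=False)),
    scheduler=SchedulerConfig(concurrent_requests=16),
    # execution keeps its defaults: no overall timeout, ERROR-level timeout logging
)
```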
aioscrapper/exceptions.py
ADDED
```python
class ClientException(Exception):
    pass


class HTTPException(ClientException):
    def __init__(self, status_code: int, message: str | None, url: str, method: str) -> None:
        self.status_code = status_code
        self.message = message
        self.url = url
        self.method = method

    def __str__(self) -> str:
        return f"{self.method} {self.url}: {self.status_code}: {self.message}"


class RequestException(ClientException):
    def __init__(self, src: Exception | str, url: str, method: str) -> None:
        self.src = src
        self.url = url
        self.method = method

    def __str__(self) -> str:
        return f"[{self.src.__class__.__name__}]: {self.method} {self.url}: {self.src}"


class PipelineException(Exception): ...
```
aioscrapper/helpers.py
ADDED
```python
import inspect
from typing import Callable, Any


def get_func_kwargs(func: Callable, kwargs: dict[str, Any]) -> dict[str, Any]:
    return {param: kwargs[param] for param in inspect.signature(func).parameters.keys() if param in kwargs}


def get_cb_kwargs(
    callback: Callable,
    srv_kwargs: dict[str, Any] | None,
    cb_kwargs: dict[str, Any] | None,
) -> dict[str, Any]:
    if cb_kwargs is None and srv_kwargs is None:
        return {}

    if cb_kwargs is None:
        cb_kwargs = {}
    if srv_kwargs is None:
        srv_kwargs = {}

    return get_func_kwargs(callback, cb_kwargs | srv_kwargs)
```
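`get_cb_kwargs` merges service kwargs with per-request callback kwargs, then keeps only the names that actually appear in the callback's signature; this is how callbacks opt in to injected dependencies. A small illustration with a hypothetical callback (not from the package):

```python
from aioscrapper.helpers import get_cb_kwargs


async def parse(response, pipeline):  # declares "pipeline", nothing else
    ...


# "send_request" and "page" are dropped because parse() does not declare them;
# note that srv_kwargs wins on name clashes (cb_kwargs | srv_kwargs).
kwargs = get_cb_kwargs(
    parse,
    srv_kwargs={"pipeline": "dispatcher", "send_request": "sender"},
    cb_kwargs={"page": 1},
)
assert kwargs == {"pipeline": "dispatcher"}
```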
aioscrapper/pipeline/base.py
ADDED
```python
import abc
from typing import TypeVar, Generic, Protocol


class BaseItem(Protocol):
    @property
    def pipeline_name(self) -> str: ...


ItemType = TypeVar("ItemType", bound=BaseItem)


class BasePipeline(abc.ABC, Generic[ItemType]):
    @abc.abstractmethod
    async def put_item(self, item: ItemType) -> None: ...

    async def initialize(self) -> None: ...

    async def close(self) -> None: ...
```
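A concrete pipeline subclasses `BasePipeline` and handles any item exposing a `pipeline_name` property. A minimal sketch, with a hypothetical `Article` item:

```python
from dataclasses import dataclass

from aioscrapper.pipeline import BasePipeline


@dataclass
class Article:
    title: str

    @property
    def pipeline_name(self) -> str:  # satisfies the BaseItem protocol
        return "articles"


class PrintPipeline(BasePipeline[Article]):
    async def put_item(self, item: Article) -> None:
        print(item.title)

    # initialize()/close() are optional hooks, e.g. for opening a DB connection
```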
aioscrapper/pipeline/dispatcher.py
ADDED
```python
from logging import Logger
from typing import Generator

from .base import BasePipeline, BaseItem
from ..exceptions import PipelineException


class PipelineDispatcher:
    def __init__(self, logger: Logger, pipelines: dict[str, list[BasePipeline]]) -> None:
        self._logger = logger
        self._pipelines = pipelines

    async def put_item(self, item: BaseItem) -> BaseItem:
        self._logger.debug(f"pipeline item received: {item}")
        try:
            pipelines = self._pipelines[item.pipeline_name]
        except KeyError:
            raise PipelineException(f"Pipelines for item {item} not found")

        for pipeline in pipelines:
            await pipeline.put_item(item)

        return item

    def _get_pipelines(self) -> Generator[BasePipeline, None, None]:
        for pipelines in self._pipelines.values():
            for pipeline in pipelines:
                yield pipeline

    async def initialize(self) -> None:
        for pipeline in self._get_pipelines():
            await pipeline.initialize()

    async def close(self) -> None:
        for pipeline in self._get_pipelines():
            await pipeline.close()
```
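The dispatcher fans each item out to every pipeline registered under the item's `pipeline_name`. Scrapper callbacks never construct it themselves; the executor injects it as the `pipeline` service kwarg. A sketch, reusing the hypothetical `Article` and `PrintPipeline` from the sketch above:

```python
from aioscrapper import AIOScrapper, BaseScrapper, RequestSender
from aioscrapper.pipeline import PipelineDispatcher
from aioscrapper.types import Response


class ArticleScrapper(BaseScrapper):
    async def start(self, request_sender: RequestSender) -> None:
        await request_sender(url="https://example.com", callback=self.parse)

    # declaring a "pipeline" parameter opts this callback into the dispatcher
    async def parse(self, response: Response, pipeline: PipelineDispatcher) -> None:
        await pipeline.put_item(Article(title="example"))


async def main() -> None:
    async with AIOScrapper(scrappers=[ArticleScrapper()]) as scrapper:
        scrapper.add_pipeline("articles", PrintPipeline())  # routes "articles" items
        await scrapper.start()
```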
aioscrapper/request_manager.py
ADDED
```python
import asyncio
from logging import Logger
from typing import Callable, Awaitable, Any, Coroutine

from .exceptions import HTTPException, RequestException, ClientException
from .helpers import get_cb_kwargs
from .request_sender import RequestSender
from .session.base import BaseSession
from .types import Request, RequestParams, RequestQueue
from .types import RequestMiddleware, ResponseMiddleware


class RequestManager:
    def __init__(
        self,
        logger: Logger,
        session: BaseSession,
        schedule_request: Callable[[Coroutine], Awaitable],
        sender: RequestSender,
        queue: RequestQueue,
        delay: float,
        shutdown_timeout: float,
        srv_kwargs: dict[str, Any],
        request_outer_middlewares: list[RequestMiddleware],
        request_inner_middlewares: list[RequestMiddleware],
        response_middlewares: list[ResponseMiddleware],
    ) -> None:
        self._logger = logger
        self._session = session
        self._schedule_request = schedule_request
        self._queue = queue
        self._delay = delay
        self._shutdown_timeout = shutdown_timeout
        self._srv_kwargs = {"send_request": sender, **srv_kwargs}
        self._request_outer_middlewares = request_outer_middlewares
        self._request_inner_middlewares = request_inner_middlewares
        self._response_middlewares = response_middlewares
        self._task: asyncio.Task | None = None

    async def _send_request(self, request: Request, params: RequestParams) -> None:
        full_url = request.full_url
        self._logger.debug(f"request: {request.method} {full_url}")
        try:
            for inner_middleware in self._request_inner_middlewares:
                await inner_middleware(request, params)

            response = await self._session.make_request(request)
            for response_middleware in self._response_middlewares:
                await response_middleware(params, response)

            if response.status >= 400:
                await self._handle_client_exception(
                    params,
                    client_exc=HTTPException(
                        status_code=response.status,
                        message=response.text(),
                        url=full_url,
                        method=response.method,
                    ),
                )
            elif params.callback is not None:
                await params.callback(
                    response,
                    **get_cb_kwargs(params.callback, srv_kwargs=self._srv_kwargs, cb_kwargs=params.cb_kwargs),
                )
        except Exception as exc:
            await self._handle_client_exception(
                params,
                client_exc=RequestException(src=exc, url=full_url, method=request.method),
            )

    async def _handle_client_exception(self, params: RequestParams, client_exc: ClientException) -> None:
        if params.errback is None:
            raise client_exc

        try:
            await params.errback(
                client_exc,
                **get_cb_kwargs(params.errback, srv_kwargs=self._srv_kwargs, cb_kwargs=params.cb_kwargs),
            )
        except Exception as exc:
            self._logger.exception(exc)

    def listen_queue(self) -> None:
        self._task = asyncio.create_task(self._listen_queue())

    async def _listen_queue(self) -> None:
        while (r := (await self._queue.get())) is not None:
            for outer_middleware in self._request_outer_middlewares:
                await outer_middleware(r.request, r.request_params)

            await self._schedule_request(self._send_request(r.request, r.request_params))
            await asyncio.sleep(self._delay)

    async def shutdown(self, force: bool = False) -> None:
        await self._queue.put(None)
        if self._task is not None:
            await asyncio.wait_for(self._task, timeout=self._shutdown_timeout) if force else await self._task

    async def close(self) -> None:
        await self._session.close()
```
aioscrapper/request_sender.py
ADDED
```python
import asyncio
from typing import Callable, Awaitable, Any

from .types import QueryParams, Cookies, Headers, BasicAuth, Request, RequestParams, RequestQueue, PRPRequest


class RequestSender:
    def __init__(self, queue: RequestQueue) -> None:
        self._queue = queue

    async def __call__(
        self,
        url: str,
        method: str = "GET",
        callback: Callable[..., Awaitable] | None = None,
        cb_kwargs: dict[str, Any] | None = None,
        errback: Callable[..., Awaitable] | None = None,
        params: QueryParams | None = None,
        data: Any = None,
        json_data: Any = None,
        cookies: Cookies | None = None,
        headers: Headers | None = None,
        proxy: str | None = None,
        auth: BasicAuth | None = None,
        timeout: float | None = None,
        priority: int = 0,
        delay: float | None = None,
    ) -> None:
        await self._queue.put(
            PRPRequest(
                priority=priority,
                request=Request(
                    method=method,
                    url=url,
                    params=params,
                    data=data,
                    json_data=json_data,
                    cookies=cookies,
                    headers=headers,
                    auth=auth,
                    proxy=proxy,
                    timeout=timeout,
                ),
                request_params=RequestParams(
                    callback=callback,
                    cb_kwargs=cb_kwargs,
                    errback=errback,
                ),
            )
        )
        if delay:
            await asyncio.sleep(delay)
```
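`RequestSender` is awaited like a function: it wraps the arguments into a `PRPRequest` and puts it on the priority queue. A sketch of a richer call from inside a scrapper, with hypothetical callbacks:

```python
from aioscrapper import BaseScrapper, RequestSender
from aioscrapper.exceptions import ClientException
from aioscrapper.types import Response


class ItemsScrapper(BaseScrapper):
    async def start(self, request_sender: RequestSender) -> None:
        # lower priority values are dequeued first (asyncio.PriorityQueue is a min-heap)
        await request_sender(
            url="https://example.com/items",
            params={"page": 1},
            callback=self.parse,
            cb_kwargs={"page": 1},
            errback=self.on_error,  # called with the exception instead of raising
            priority=-1,
        )

    async def parse(self, response: Response, page: int) -> None:
        print(page, response.status)

    async def on_error(self, exc: ClientException) -> None:
        print(f"request failed: {exc}")
```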
aioscrapper/scrapper/executor.py
ADDED
```python
import asyncio
import time
from logging import Logger, getLogger
from types import TracebackType
from typing import Type, Any

from aiojobs import Scheduler

from ..config import Config
from ..helpers import get_func_kwargs
from ..pipeline import PipelineDispatcher, BasePipeline
from ..request_manager import RequestManager
from ..request_sender import RequestSender
from ..scrapper import BaseScrapper
from ..session.aiohttp import AiohttpSession
from ..types import RequestMiddleware, ResponseMiddleware


class AIOScrapper:
    def __init__(
        self,
        scrappers: list[BaseScrapper],
        config: Config | None = None,
        logger: Logger | None = None,
    ) -> None:
        self._start_time = time.time()
        self._config = config or Config()
        self._logger = logger or getLogger("aioscrapper")

        self._scrappers = scrappers
        self._request_outer_middlewares = []
        self._request_inner_middlewares = []
        self._response_middlewares = []

        self._pipelines: dict[str, list[BasePipeline]] = {}
        self._pipeline_dispatcher = PipelineDispatcher(
            logger=self._logger.getChild("pipeline"), pipelines=self._pipelines
        )

        def _exception_handler(_, context: dict[str, Any]):
            if "job" in context:
                self._logger.error(f'{context['message']}: {context["exception"]}', extra={"context": context})
            else:
                self._logger.error("Unhandled error", extra={"context": context})

        self._scheduler = Scheduler(
            limit=self._config.scheduler.concurrent_requests,
            pending_limit=self._config.scheduler.pending_requests,
            close_timeout=self._config.scheduler.close_timeout,
            exception_handler=_exception_handler,
        )

        self._request_queue = asyncio.PriorityQueue()
        self._request_sender = RequestSender(self._request_queue)
        self._request_manager = RequestManager(
            logger=self._logger.getChild("request_worker"),
            session=AiohttpSession(
                timeout=self._config.session.request.timeout,
                ssl=self._config.session.request.ssl,
            ),
            schedule_request=self._scheduler.spawn,
            sender=self._request_sender,
            queue=self._request_queue,
            delay=self._config.session.request.delay,
            shutdown_timeout=self._config.execution.shutdown_timeout,
            srv_kwargs={"pipeline": self._pipeline_dispatcher},
            request_outer_middlewares=self._request_outer_middlewares,
            request_inner_middlewares=self._request_inner_middlewares,
            response_middlewares=self._response_middlewares,
        )

        self._scrapper_kwargs = {"request_sender": self._request_sender, "pipeline": self._pipeline_dispatcher}

    def add_pipeline(self, name: str, pipeline: BasePipeline) -> None:
        if name not in self._pipelines:
            self._pipelines[name] = [pipeline]
        else:
            self._pipelines[name].append(pipeline)

    def add_outer_request_middlewares(self, *middlewares: RequestMiddleware) -> None:
        self._request_outer_middlewares.extend(middlewares)

    def add_inner_request_middlewares(self, *middlewares: RequestMiddleware) -> None:
        self._request_inner_middlewares.extend(middlewares)

    def add_response_middlewares(self, *middlewares: ResponseMiddleware) -> None:
        self._response_middlewares.extend(middlewares)

    async def __aenter__(self):
        return self

    async def __aexit__(
        self,
        exc_type: Type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        await self.close()

    async def start(self) -> None:
        await self._pipeline_dispatcher.initialize()
        self._request_manager.listen_queue()

        for scrapper in self._scrappers:
            await scrapper.initialize(**get_func_kwargs(scrapper.initialize, self._scrapper_kwargs))

        await asyncio.gather(
            *[scrapper.start(**get_func_kwargs(scrapper.start, self._scrapper_kwargs)) for scrapper in self._scrappers]
        )

    async def _shutdown(self) -> bool:
        status = False
        execution_timeout = (
            max(self._config.execution.timeout - (time.time() - self._start_time), 0.1)
            if self._config.execution.timeout
            else None
        )
        while True:
            if execution_timeout is not None and time.time() - self._start_time > execution_timeout:
                self._logger.log(
                    level=self._config.execution.log_level,
                    msg=f"execution timeout: {self._config.execution.timeout}",
                )
                status = True
                break
            if len(self._scheduler) == 0 and self._request_queue.qsize() == 0:
                break

            await asyncio.sleep(self._config.execution.shutdown_check_interval)

        return status

    async def shutdown(self) -> None:
        force = await self._shutdown()
        await self._request_manager.shutdown(force)

    async def close(self, shutdown: bool = True) -> None:
        if shutdown:
            await self.shutdown()

        for scrapper in self._scrappers:
            await scrapper.close()

        await self._scheduler.close()
        await self._request_manager.close()
        await self._pipeline_dispatcher.close()
```
aioscrapper/session/__init__.py
File without changes
aioscrapper/session/aiohttp.py
ADDED
```python
from aiohttp import ClientSession, ClientTimeout, TCPConnector
from aiohttp.helpers import BasicAuth

from .base import BaseSession
from ..types import Response, Request


class AiohttpSession(BaseSession):
    def __init__(self, timeout: float | None = None, ssl: bool | None = None, **kwargs) -> None:
        super().__init__(timeout, ssl)
        self._session = ClientSession(
            timeout=ClientTimeout(total=timeout),
            connector=TCPConnector(ssl=ssl) if ssl is not None else None,
            **kwargs,
        )

    async def make_request(self, request: Request) -> Response:
        async with self._session.request(
            url=request.url,
            method=request.method,
            params=request.params,
            data=request.data,
            json=request.json_data,
            cookies=request.cookies,
            headers=request.headers,
            proxy=request.proxy,
            auth=(
                BasicAuth(login=request.auth["username"], password=request.auth["password"])
                if request.auth is not None
                else None
            ),
            timeout=ClientTimeout(total=request.timeout) if request.timeout is not None else None,
        ) as response:
            return Response(
                url=request.url,
                method=request.method,
                params=request.params,
                status=response.status,
                headers=dict(response.headers),
                cookies={k: f"{v.key}={v.value}" for k, v in response.cookies.items()},
                content=await response.read(),
                content_type=response.headers.get("Content-Type"),
            )

    async def close(self) -> None:
        await self._session.close()
```
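The session can also be exercised on its own: `make_request` takes the framework's `Request` dataclass and returns its buffered `Response`. A standalone sketch, assuming a reachable URL:

```python
import asyncio

from aioscrapper.session.aiohttp import AiohttpSession
from aioscrapper.types import Request


async def main() -> None:
    # created inside the running loop, as aiohttp's ClientSession expects
    session = AiohttpSession(timeout=30)
    try:
        response = await session.make_request(Request(url="https://example.com", method="GET"))
        print(response.status, response.content_type)
    finally:
        await session.close()


asyncio.run(main())
```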
aioscrapper/session/base.py
ADDED
```python
import abc

from ..types import Request, Response


class BaseSession(abc.ABC):
    def __init__(self, timeout: float | None = None, ssl: bool | None = None) -> None:
        self._timeout = timeout
        self._ssl = ssl

    @abc.abstractmethod
    async def make_request(self, request: Request) -> Response: ...

    async def close(self) -> None: ...
```
aioscrapper/types/middleware.py
ADDED
```python
from typing import Protocol

from .session import Request, RequestParams, Response


class RequestMiddleware(Protocol):
    async def __call__(self, request: Request, params: RequestParams) -> None: ...


class ResponseMiddleware(Protocol):
    async def __call__(self, params: RequestParams, response: Response) -> None: ...
```
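Middlewares are plain awaitable callables matching these protocols: request middlewares may mutate the outgoing `Request` in place (it is a non-frozen dataclass), while response middlewares observe every `Response` before callbacks run. A sketch of one of each, with a hypothetical auth token; registration happens via the `add_*_middlewares` methods on `AIOScrapper` shown in executor.py:

```python
from aioscrapper.types import Request, RequestParams, Response


class AuthHeaderMiddleware:
    """Request middleware: attach a bearer token to every outgoing request."""

    def __init__(self, token: str) -> None:
        self._token = token

    async def __call__(self, request: Request, params: RequestParams) -> None:
        request.headers = {**(request.headers or {}), "Authorization": f"Bearer {self._token}"}


async def log_status(params: RequestParams, response: Response) -> None:
    """Response middleware: print the status of every response."""
    print(response.method, response.url, response.status)


# registration (inside your setup code):
# scrapper.add_inner_request_middlewares(AuthHeaderMiddleware("token"))
# scrapper.add_response_middlewares(log_status)
```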
aioscrapper/types/session.py
ADDED
```python
import asyncio
import json
from dataclasses import field, dataclass
from typing import Union, Mapping, Any, Callable, Awaitable, TypedDict
from urllib.parse import urlencode

QueryParams = Mapping[str, Union[str, int, float]]

Cookies = Mapping[str, str]
Headers = Mapping[str, str]


class BasicAuth(TypedDict):
    username: str
    password: str


@dataclass(slots=True)
class Request:
    url: str
    method: str
    params: QueryParams | None = None
    data: Any = None
    json_data: Any = None
    cookies: Cookies | None = None
    headers: Headers | None = None
    auth: BasicAuth | None = None
    proxy: str | None = None
    timeout: float | None = None

    @property
    def full_url(self) -> str:
        return f"{self.url}{urlencode(self.params or {})}"


@dataclass(slots=True)
class RequestParams:
    callback: Callable[..., Awaitable] | None = None
    cb_kwargs: dict[str, Any] | None = None
    errback: Callable[..., Awaitable] | None = None


@dataclass(slots=True, order=True)
class PRPRequest:
    priority: int
    request: Request = field(compare=False)
    request_params: RequestParams = field(compare=False)


RequestQueue = asyncio.PriorityQueue[PRPRequest | None]


class Response:
    def __init__(
        self,
        url: str,
        method: str,
        params: QueryParams | None,
        status: int,
        headers: Headers,
        cookies: Cookies,
        content: bytes,
        content_type: str | None,
    ) -> None:
        self._url = url
        self._method = method
        self._params = params
        self._status = status
        self._headers = headers
        self._cookies = cookies
        self._content = content
        self._content_type = content_type

    @property
    def url(self) -> str:
        return self._url

    @property
    def method(self) -> str:
        return self._method

    @property
    def params(self) -> QueryParams | None:
        return self._params

    @property
    def status(self) -> int:
        return self._status

    @property
    def headers(self) -> Headers | None:
        return self._headers

    @property
    def cookies(self) -> Cookies | None:
        return self._cookies

    @property
    def content_type(self) -> str | None:
        return self._content_type

    def bytes(self) -> bytes:
        return self._content

    def json(self) -> Any:
        return json.loads(self._content) if self._content is not None else None

    def text(self, encoding: str = "utf-8") -> str | None:
        return self._content.decode(encoding) if self._content is not None else None
```
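`Response` buffers the body as bytes up front, so `json()` and `text()` are synchronous accessors. A short sketch of typical consumption inside a callback, assuming a JSON endpoint:

```python
from aioscrapper.types import Response


async def parse(response: Response) -> None:
    if response.content_type and "application/json" in response.content_type:
        payload = response.json()  # json.loads over the buffered bytes
    else:
        payload = response.text()  # utf-8 decode by default
    print(response.status, payload)
```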
aioscrapper-0.1.0.dist-info/METADATA
ADDED
````
Metadata-Version: 2.4
Name: aioscrapper
Version: 0.1.0
Summary: Async framework for building modular and scalable web scrapers.
Author: darkstussy
Project-URL: Homepage, https://github.com/darkstussy/aioscrapper
Project-URL: Issues, https://github.com/darkstussy/aioscrapper/issues
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.12
Classifier: Framework :: AsyncIO
Classifier: Intended Audience :: Developers
Classifier: Operating System :: OS Independent
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
Requires-Python: >=3.12
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: aiohttp[speedups]~=3.11.16
Requires-Dist: aiojobs~=1.4.0
Dynamic: license-file

# aioscrapper

**Asynchronous framework for building modular and scalable web scrapers.**





## Features

- 🚀 Fully asynchronous architecture powered by `aiohttp` and `aiojobs`
- 🔧 Modular system with middleware support
- 📦 Pipeline data processing
- ⚙️ Flexible configuration
- 🔄 Priority-based request queue management
- 🛡️ Built-in error handling

## Installation

```bash
pip install aioscrapper
```

## Requirements

- Python 3.12 or higher
- aiohttp
- aiojobs

## Quick Start

```python
import asyncio

from aioscrapper import BaseScrapper, AIOScrapper, RequestSender
from aioscrapper.types import Response


class Scrapper(BaseScrapper):
    async def start(self, request_sender: RequestSender) -> None:
        await request_sender(url="https://example.com", callback=self.parse)

    async def parse(self, response: Response) -> None:
        # handle response
        pass


async def main():
    async with AIOScrapper(scrappers=[Scrapper()]) as scrapper:
        await scrapper.start()


if __name__ == "__main__":
    asyncio.run(main())
```

## License

MIT License

Copyright (c) 2025 darkstussy

## Links

- [GitHub](https://github.com/darkstussy/aioscrapper)
- [Issues](https://github.com/darkstussy/aioscrapper/issues)
````
aioscrapper-0.1.0.dist-info/RECORD
ADDED
```
aioscrapper/__init__.py,sha256=_01EI59FLQmHspoN9HqJMoJ9OHpEaYGDyFXAKtfnnYY,256
aioscrapper/config.py,sha256=yO5ipQUHxA_-CiSqJ0u7WioN6lu8VgT1ss5PRvS1foc,844
aioscrapper/exceptions.py,sha256=Akk3zDTgws9E7J-Sh8bgdlgS8L3auDKuv3_U3aefxMc,765
aioscrapper/helpers.py,sha256=slq9r5oCHrR7M9hKZFBLFRsWoqJcw_QFptQI1NjIdQw,610
aioscrapper/request_manager.py,sha256=YLZvuPthhFMnJVQ7pV9-YCsni0Kdu8baP5tmOccEDOM,4037
aioscrapper/request_sender.py,sha256=_Vx_LJyV_5qb23-C3VdnOUUUcQPW42OJNbtERVu1DIA,1644
aioscrapper/pipeline/__init__.py,sha256=hv7Kcssd2BP0LM9fNZtaMs1tmRuAUu4mwAescoeV3Uk,84
aioscrapper/pipeline/base.py,sha256=Ro7YGUOB-V2NJCtfgwhtQDedY4OYMu-jwEV8iR-L89k,405
aioscrapper/pipeline/dispatcher.py,sha256=H4cHNxTyHEF4BnEwaW6nwmcRmK839GqbDTzZh1Zftv4,1156
aioscrapper/scrapper/__init__.py,sha256=UR7bTck-_YVoP2BqYdPldN9PgaCuJf9wvDdQLTVJ578,65
aioscrapper/scrapper/base.py,sha256=_wFrI0UVsTBIAV7EOZCk_QMy2-chPjr1pKzu6w8Huso,224
aioscrapper/scrapper/executor.py,sha256=TrZBh0JyFeQIJd_O4S86cSZNgywjKxJWdb2QSzZyObU,5475
aioscrapper/session/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
aioscrapper/session/aiohttp.py,sha256=8h4Ca1o2JJ7L24iqEnZ5I8bUudTn2cvTngBhM8eJPD4,1753
aioscrapper/session/base.py,sha256=Zxw1VHIe_LgveUufJXh_cl0YeHykiutQveMUwZ6VL54,356
aioscrapper/types/__init__.py,sha256=SC6De0ThepMK-wPQGFYGq6x8aGlzsWscphFxGQGBWek,225
aioscrapper/types/middleware.py,sha256=WtT73QTAlwhdP6UNFgyFHGpFOx1vlxehCAwiO6xjR10,326
aioscrapper/types/session.py,sha256=ffJelDaZmeIoNOk_ivGb_nSC5bBpgKwCyiSsUl4e-B0,2595
aioscrapper-0.1.0.dist-info/licenses/LICENSE,sha256=LefKIkLsd_UuLWYOatzEjY5yscQS8nZAFi8rzCs54OM,1066
aioscrapper-0.1.0.dist-info/METADATA,sha256=pbjxh2xCDlGRYDEFXT40JBL6_7Xkw31-BXivX_Fmog0,2303
aioscrapper-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
aioscrapper-0.1.0.dist-info/top_level.txt,sha256=d7lbzXOwzzk2HLh-A0X7dkqn8q3zGAJcKqx6TkaEEWI,12
aioscrapper-0.1.0.dist-info/RECORD,,
```
aioscrapper-0.1.0.dist-info/licenses/LICENSE
ADDED
```
MIT License

Copyright (c) 2024 Stanislav

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```
aioscrapper-0.1.0.dist-info/top_level.txt
ADDED
```
aioscrapper
```