aioscrapper 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. aioscrapper-0.1.0/LICENSE +21 -0
  2. aioscrapper-0.1.0/PKG-INFO +87 -0
  3. aioscrapper-0.1.0/README.md +66 -0
  4. aioscrapper-0.1.0/pyproject.toml +24 -0
  5. aioscrapper-0.1.0/setup.cfg +4 -0
  6. aioscrapper-0.1.0/src/aioscrapper/__init__.py +10 -0
  7. aioscrapper-0.1.0/src/aioscrapper/config.py +36 -0
  8. aioscrapper-0.1.0/src/aioscrapper/exceptions.py +26 -0
  9. aioscrapper-0.1.0/src/aioscrapper/helpers.py +22 -0
  10. aioscrapper-0.1.0/src/aioscrapper/pipeline/__init__.py +2 -0
  11. aioscrapper-0.1.0/src/aioscrapper/pipeline/base.py +19 -0
  12. aioscrapper-0.1.0/src/aioscrapper/pipeline/dispatcher.py +36 -0
  13. aioscrapper-0.1.0/src/aioscrapper/request_manager.py +101 -0
  14. aioscrapper-0.1.0/src/aioscrapper/request_sender.py +52 -0
  15. aioscrapper-0.1.0/src/aioscrapper/scrapper/__init__.py +2 -0
  16. aioscrapper-0.1.0/src/aioscrapper/scrapper/base.py +10 -0
  17. aioscrapper-0.1.0/src/aioscrapper/scrapper/executor.py +146 -0
  18. aioscrapper-0.1.0/src/aioscrapper/session/__init__.py +0 -0
  19. aioscrapper-0.1.0/src/aioscrapper/session/aiohttp.py +46 -0
  20. aioscrapper-0.1.0/src/aioscrapper/session/base.py +14 -0
  21. aioscrapper-0.1.0/src/aioscrapper/types/__init__.py +12 -0
  22. aioscrapper-0.1.0/src/aioscrapper/types/middleware.py +11 -0
  23. aioscrapper-0.1.0/src/aioscrapper/types/session.py +109 -0
  24. aioscrapper-0.1.0/src/aioscrapper.egg-info/PKG-INFO +87 -0
  25. aioscrapper-0.1.0/src/aioscrapper.egg-info/SOURCES.txt +26 -0
  26. aioscrapper-0.1.0/src/aioscrapper.egg-info/dependency_links.txt +1 -0
  27. aioscrapper-0.1.0/src/aioscrapper.egg-info/requires.txt +2 -0
  28. aioscrapper-0.1.0/src/aioscrapper.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Stanislav
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,87 @@
+ Metadata-Version: 2.4
+ Name: aioscrapper
+ Version: 0.1.0
+ Summary: Async framework for building modular and scalable web scrapers.
+ Author: darkstussy
+ Project-URL: Homepage, https://github.com/darkstussy/aioscrapper
+ Project-URL: Issues, https://github.com/darkstussy/aioscrapper/issues
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Framework :: AsyncIO
+ Classifier: Intended Audience :: Developers
+ Classifier: Operating System :: OS Independent
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+ Requires-Python: >=3.12
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: aiohttp[speedups]~=3.11.16
+ Requires-Dist: aiojobs~=1.4.0
+ Dynamic: license-file
+
+ # aioscrapper
+
+ **Asynchronous framework for building modular and scalable web scrapers.**
+
+ ![Python](https://img.shields.io/badge/python-3.12%2B-blue)
+ ![License](https://img.shields.io/github/license/darkstussy/aioscrapper)
+ ![Version](https://img.shields.io/github/v/tag/darkstussy/aioscrapper?label=version)
+
+ ## Features
+
+ - 🚀 Fully asynchronous architecture powered by `aiohttp` and `aiojobs`
+ - 🔧 Modular system with middleware support
+ - 📦 Pipeline data processing
+ - ⚙️ Flexible configuration
+ - 🔄 Priority-based request queue management
+ - 🛡️ Built-in error handling
+
+ ## Installation
+
+ ```bash
+ pip install aioscrapper
+ ```
+
+ ## Requirements
+
+ - Python 3.12 or higher
+ - aiohttp
+ - aiojobs
+
+ ## Quick Start
+
+ ```python
+ import asyncio
+
+ from aioscrapper import BaseScrapper, AIOScrapper, RequestSender
+ from aioscrapper.types import Response
+
+
+ class Scrapper(BaseScrapper):
+     async def start(self, request_sender: RequestSender) -> None:
+         await request_sender(url="https://example.com", callback=self.parse)
+
+     async def parse(self, response: Response) -> None:
+         # handle response
+         pass
+
+
+ async def main():
+     async with AIOScrapper(scrappers=[Scrapper()]) as scrapper:
+         await scrapper.start()
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
+ ```
+
+ ## License
+
+ MIT License
+
+ Copyright (c) 2025 darkstussy
+
+ ## Links
+
+ - [GitHub](https://github.com/darkstussy/aioscrapper)
+ - [Issues](https://github.com/darkstussy/aioscrapper/issues)
@@ -0,0 +1,66 @@
+ # aioscrapper
+
+ **Asynchronous framework for building modular and scalable web scrapers.**
+
+ ![Python](https://img.shields.io/badge/python-3.12%2B-blue)
+ ![License](https://img.shields.io/github/license/darkstussy/aioscrapper)
+ ![Version](https://img.shields.io/github/v/tag/darkstussy/aioscrapper?label=version)
+
+ ## Features
+
+ - 🚀 Fully asynchronous architecture powered by `aiohttp` and `aiojobs`
+ - 🔧 Modular system with middleware support
+ - 📦 Pipeline data processing
+ - ⚙️ Flexible configuration
+ - 🔄 Priority-based request queue management
+ - 🛡️ Built-in error handling
+
+ ## Installation
+
+ ```bash
+ pip install aioscrapper
+ ```
+
+ ## Requirements
+
+ - Python 3.12 or higher
+ - aiohttp
+ - aiojobs
+
+ ## Quick Start
+
+ ```python
+ import asyncio
+
+ from aioscrapper import BaseScrapper, AIOScrapper, RequestSender
+ from aioscrapper.types import Response
+
+
+ class Scrapper(BaseScrapper):
+     async def start(self, request_sender: RequestSender) -> None:
+         await request_sender(url="https://example.com", callback=self.parse)
+
+     async def parse(self, response: Response) -> None:
+         # handle response
+         pass
+
+
+ async def main():
+     async with AIOScrapper(scrappers=[Scrapper()]) as scrapper:
+         await scrapper.start()
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
+ ```
+
+ ## License
+
+ MIT License
+
+ Copyright (c) 2025 darkstussy
+
+ ## Links
+
+ - [GitHub](https://github.com/darkstussy/aioscrapper)
+ - [Issues](https://github.com/darkstussy/aioscrapper/issues)
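The README's Quick Start covers plain requests only; the middleware and pipeline features listed above are wired onto the `AIOScrapper` instance before `start()`. The following is a minimal illustrative sketch based on the package sources shown below, reusing the `Scrapper` class from the Quick Start; `PrintPipeline` and `log_status` are example names, not part of the package.

```python
from aioscrapper import AIOScrapper
from aioscrapper.pipeline import BasePipeline
from aioscrapper.types import RequestParams, Response


class PrintPipeline(BasePipeline):
    async def put_item(self, item) -> None:
        # Receives every item routed to the "pages" pipeline name.
        print(item)


async def log_status(params: RequestParams, response: Response) -> None:
    # Response middleware: observes each response before its callback runs.
    print(response.method, response.url, response.status)


async def main():
    # Reuses the Scrapper class from the Quick Start above.
    async with AIOScrapper(scrappers=[Scrapper()]) as scrapper:
        scrapper.add_pipeline("pages", PrintPipeline())
        scrapper.add_response_middlewares(log_status)
        await scrapper.start()
```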
@@ -0,0 +1,24 @@
+ [project]
+ name = "aioscrapper"
+ version = "0.1.0"
+ authors = [{ name = "darkstussy" }, ]
+ description = "Async framework for building modular and scalable web scrapers."
+ readme = "README.md"
+ requires-python = ">=3.12"
+ dependencies = [
+     "aiohttp[speedups] ~= 3.11.16",
+     "aiojobs ~= 1.4.0",
+ ]
+ classifiers = [
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.12",
+     "Framework :: AsyncIO",
+     "Intended Audience :: Developers",
+     "Operating System :: OS Independent",
+     "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+     "Topic :: Software Development :: Libraries :: Application Frameworks",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/darkstussy/aioscrapper"
+ Issues = "https://github.com/darkstussy/aioscrapper/issues"
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,10 @@
+ __title__ = "aioscrapper"
+
+ __author__ = "darkstussy"
+
+ __copyright__ = f"Copyright (c) 2025 {__author__}"
+
+ from .request_sender import RequestSender
+ from .scrapper import AIOScrapper, BaseScrapper
+
+ __all__ = ["AIOScrapper", "BaseScrapper", "RequestSender"]
@@ -0,0 +1,36 @@
+ import logging
+ from dataclasses import dataclass
+
+
+ @dataclass(slots=True, frozen=True)
+ class RequestConfig:
+     timeout: int = 60
+     delay: float = 0.0
+     ssl: bool = True
+
+
+ @dataclass(slots=True, frozen=True)
+ class SessionConfig:
+     request: RequestConfig = RequestConfig()
+
+
+ @dataclass(slots=True, frozen=True)
+ class SchedulerConfig:
+     concurrent_requests: int = 64
+     pending_requests: int = 1
+     close_timeout: float | None = 0.1
+
+
+ @dataclass(slots=True, frozen=True)
+ class ExecutionConfig:
+     timeout: float | None = None
+     shutdown_timeout: float = 0.1
+     shutdown_check_interval: float = 0.1
+     log_level: int = logging.ERROR
+
+
+ @dataclass(slots=True, frozen=True)
+ class Config:
+     session: SessionConfig = SessionConfig()
+     scheduler: SchedulerConfig = SchedulerConfig()
+     execution: ExecutionConfig = ExecutionConfig()
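These dataclasses are frozen, so a custom configuration is built by nesting new instances rather than mutating the defaults. A minimal sketch of overriding the request timeout, delay and concurrency (the `AIOScrapper(config=...)` parameter is defined in `scrapper/executor.py` further down):

```python
from aioscrapper import AIOScrapper
from aioscrapper.config import Config, RequestConfig, SchedulerConfig, SessionConfig

# Lower the per-request timeout, add a delay between requests and cap concurrency.
config = Config(
    session=SessionConfig(request=RequestConfig(timeout=30, delay=0.5, ssl=True)),
    scheduler=SchedulerConfig(concurrent_requests=16),
)

scrapper = AIOScrapper(scrappers=[], config=config)  # scrappers omitted for brevity
```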
@@ -0,0 +1,26 @@
+ class ClientException(Exception):
+     pass
+
+
+ class HTTPException(ClientException):
+     def __init__(self, status_code: int, message: str | None, url: str, method: str) -> None:
+         self.status_code = status_code
+         self.message = message
+         self.url = url
+         self.method = method
+
+     def __str__(self) -> str:
+         return f"{self.method} {self.url}: {self.status_code}: {self.message}"
+
+
+ class RequestException(ClientException):
+     def __init__(self, src: Exception | str, url: str, method: str) -> None:
+         self.src = src
+         self.url = url
+         self.method = method
+
+     def __str__(self) -> str:
+         return f"[{self.src.__class__.__name__}]: {self.method} {self.url}: {self.src}"
+
+
+ class PipelineException(Exception): ...
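`HTTPException` is raised for 4xx/5xx responses and `RequestException` wraps transport-level failures (see `request_manager.py` below); both reach a request's `errback` when one is set. A minimal sketch of such an error callback, assuming it is passed via `request_sender(..., errback=on_error)`:

```python
from aioscrapper.exceptions import ClientException, HTTPException, RequestException


async def on_error(exc: ClientException) -> None:
    # Distinguish HTTP status errors from network/transport errors.
    if isinstance(exc, HTTPException):
        print(f"HTTP {exc.status_code} for {exc.method} {exc.url}")
    elif isinstance(exc, RequestException):
        print(f"request failed: {exc}")
```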
@@ -0,0 +1,22 @@
+ import inspect
+ from typing import Callable, Any
+
+
+ def get_func_kwargs(func: Callable, kwargs: dict[str, Any]) -> dict[str, Any]:
+     return {param: kwargs[param] for param in inspect.signature(func).parameters.keys() if param in kwargs}
+
+
+ def get_cb_kwargs(
+     callback: Callable,
+     srv_kwargs: dict[str, Any] | None,
+     cb_kwargs: dict[str, Any] | None,
+ ) -> dict[str, Any]:
+     if cb_kwargs is None and srv_kwargs is None:
+         return {}
+
+     if cb_kwargs is None:
+         cb_kwargs = {}
+     if srv_kwargs is None:
+         srv_kwargs = {}
+
+     return get_func_kwargs(callback, cb_kwargs | srv_kwargs)
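These helpers filter keyword arguments down to the parameters a callback actually declares, which is how service objects like `send_request` and `pipeline` are injected only where requested. A small illustrative sketch:

```python
from aioscrapper.helpers import get_cb_kwargs


async def parse(response, send_request) -> None:  # declares only what it needs
    ...


# Only "send_request" survives the filtering: "pipeline" is dropped because
# parse() does not declare it; cb_kwargs would be merged in the same way.
kwargs = get_cb_kwargs(parse, srv_kwargs={"send_request": object(), "pipeline": object()}, cb_kwargs=None)
assert set(kwargs) == {"send_request"}
```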
@@ -0,0 +1,2 @@
+ from .base import BasePipeline, BaseItem
+ from .dispatcher import PipelineDispatcher
@@ -0,0 +1,19 @@
+ import abc
+ from typing import TypeVar, Generic, Protocol
+
+
+ class BaseItem(Protocol):
+     @property
+     def pipeline_name(self) -> str: ...
+
+
+ ItemType = TypeVar("ItemType", bound=BaseItem)
+
+
+ class BasePipeline(abc.ABC, Generic[ItemType]):
+     @abc.abstractmethod
+     async def put_item(self, item: ItemType) -> None: ...
+
+     async def initialize(self) -> None: ...
+
+     async def close(self) -> None: ...
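Any object exposing a `pipeline_name` property satisfies the `BaseItem` protocol, and a concrete pipeline only has to implement `put_item`. A minimal sketch (the `PageItem` and `PrintPipeline` classes are hypothetical examples, not part of the package):

```python
from dataclasses import dataclass

from aioscrapper.pipeline import BasePipeline


@dataclass
class PageItem:
    url: str
    title: str

    @property
    def pipeline_name(self) -> str:
        # Routes this item to every pipeline registered under "pages".
        return "pages"


class PrintPipeline(BasePipeline[PageItem]):
    async def put_item(self, item: PageItem) -> None:
        print(item.url, item.title)
```

Pipelines are attached with `AIOScrapper.add_pipeline("pages", PrintPipeline())` (see `scrapper/executor.py` below), and callbacks hand items to them through the injected `pipeline` dispatcher.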
@@ -0,0 +1,36 @@
+ from logging import Logger
+ from typing import Generator
+
+ from .base import BasePipeline, BaseItem
+ from ..exceptions import PipelineException
+
+
+ class PipelineDispatcher:
+     def __init__(self, logger: Logger, pipelines: dict[str, list[BasePipeline]]) -> None:
+         self._logger = logger
+         self._pipelines = pipelines
+
+     async def put_item(self, item: BaseItem) -> BaseItem:
+         self._logger.debug(f"pipeline item received: {item}")
+         try:
+             pipelines = self._pipelines[item.pipeline_name]
+         except KeyError:
+             raise PipelineException(f"Pipelines for item {item} not found")
+
+         for pipeline in pipelines:
+             await pipeline.put_item(item)
+
+         return item
+
+     def _get_pipelines(self) -> Generator[BasePipeline, None, None]:
+         for pipelines in self._pipelines.values():
+             for pipeline in pipelines:
+                 yield pipeline
+
+     async def initialize(self) -> None:
+         for pipeline in self._get_pipelines():
+             await pipeline.initialize()
+
+     async def close(self) -> None:
+         for pipeline in self._get_pipelines():
+             await pipeline.close()
@@ -0,0 +1,101 @@
+ import asyncio
+ from logging import Logger
+ from typing import Callable, Awaitable, Any, Coroutine
+
+ from .exceptions import HTTPException, RequestException, ClientException
+ from .helpers import get_cb_kwargs
+ from .request_sender import RequestSender
+ from .session.base import BaseSession
+ from .types import Request, RequestParams, RequestQueue
+ from .types import RequestMiddleware, ResponseMiddleware
+
+
+ class RequestManager:
+     def __init__(
+         self,
+         logger: Logger,
+         session: BaseSession,
+         schedule_request: Callable[[Coroutine], Awaitable],
+         sender: RequestSender,
+         queue: RequestQueue,
+         delay: float,
+         shutdown_timeout: float,
+         srv_kwargs: dict[str, Any],
+         request_outer_middlewares: list[RequestMiddleware],
+         request_inner_middlewares: list[RequestMiddleware],
+         response_middlewares: list[ResponseMiddleware],
+     ) -> None:
+         self._logger = logger
+         self._session = session
+         self._schedule_request = schedule_request
+         self._queue = queue
+         self._delay = delay
+         self._shutdown_timeout = shutdown_timeout
+         self._srv_kwargs = {"send_request": sender, **srv_kwargs}
+         self._request_outer_middlewares = request_outer_middlewares
+         self._request_inner_middlewares = request_inner_middlewares
+         self._response_middlewares = response_middlewares
+         self._task: asyncio.Task | None = None
+
+     async def _send_request(self, request: Request, params: RequestParams) -> None:
+         full_url = request.full_url
+         self._logger.debug(f"request: {request.method} {full_url}")
+         try:
+             for inner_middleware in self._request_inner_middlewares:
+                 await inner_middleware(request, params)
+
+             response = await self._session.make_request(request)
+             for response_middleware in self._response_middlewares:
+                 await response_middleware(params, response)
+
+             if response.status >= 400:
+                 await self._handle_client_exception(
+                     params,
+                     client_exc=HTTPException(
+                         status_code=response.status,
+                         message=response.text(),
+                         url=full_url,
+                         method=response.method,
+                     ),
+                 )
+             elif params.callback is not None:
+                 await params.callback(
+                     response,
+                     **get_cb_kwargs(params.callback, srv_kwargs=self._srv_kwargs, cb_kwargs=params.cb_kwargs),
+                 )
+         except Exception as exc:
+             await self._handle_client_exception(
+                 params,
+                 client_exc=RequestException(src=exc, url=full_url, method=request.method),
+             )
+
+     async def _handle_client_exception(self, params: RequestParams, client_exc: ClientException) -> None:
+         if params.errback is None:
+             raise client_exc
+
+         try:
+             await params.errback(
+                 client_exc,
+                 **get_cb_kwargs(params.errback, srv_kwargs=self._srv_kwargs, cb_kwargs=params.cb_kwargs),
+             )
+         except Exception as exc:
+             self._logger.exception(exc)
+
+     def listen_queue(self) -> None:
+         self._task = asyncio.create_task(self._listen_queue())
+
+     async def _listen_queue(self) -> None:
+         while (r := (await self._queue.get())) is not None:
+             for outer_middleware in self._request_outer_middlewares:
+                 await outer_middleware(r.request, r.request_params)
+
+             await self._schedule_request(self._send_request(r.request, r.request_params))
+             await asyncio.sleep(self._delay)
+
+     async def shutdown(self, force: bool = False) -> None:
+         await self._queue.put(None)
+         if self._task is not None:
+             await asyncio.wait_for(self._task, timeout=self._shutdown_timeout) if force else await self._task
+
+     async def close(self) -> None:
+         await self._session.close()
@@ -0,0 +1,52 @@
+ import asyncio
+ from typing import Callable, Awaitable, Any
+
+ from .types import QueryParams, Cookies, Headers, BasicAuth, Request, RequestParams, RequestQueue, PRPRequest
+
+
+ class RequestSender:
+     def __init__(self, queue: RequestQueue) -> None:
+         self._queue = queue
+
+     async def __call__(
+         self,
+         url: str,
+         method: str = "GET",
+         callback: Callable[..., Awaitable] | None = None,
+         cb_kwargs: dict[str, Any] | None = None,
+         errback: Callable[..., Awaitable] | None = None,
+         params: QueryParams | None = None,
+         data: Any = None,
+         json_data: Any = None,
+         cookies: Cookies | None = None,
+         headers: Headers | None = None,
+         proxy: str | None = None,
+         auth: BasicAuth | None = None,
+         timeout: float | None = None,
+         priority: int = 0,
+         delay: float | None = None,
+     ) -> None:
+         await self._queue.put(
+             PRPRequest(
+                 priority=priority,
+                 request=Request(
+                     method=method,
+                     url=url,
+                     params=params,
+                     data=data,
+                     json_data=json_data,
+                     cookies=cookies,
+                     headers=headers,
+                     auth=auth,
+                     proxy=proxy,
+                     timeout=timeout,
+                 ),
+                 request_params=RequestParams(
+                     callback=callback,
+                     cb_kwargs=cb_kwargs,
+                     errback=errback,
+                 ),
+             )
+         )
+         if delay:
+             await asyncio.sleep(delay)
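Every argument of `RequestSender.__call__` beyond `url` is optional, so callbacks can fan out follow-up requests with their own priority, callback kwargs and error handler. A minimal sketch of a callback using the injected `send_request` service argument (`parse_item` and `on_error` are placeholder names); since requests go through an `asyncio.PriorityQueue`, lower `priority` values are dequeued first:

```python
from aioscrapper import RequestSender
from aioscrapper.types import Response


async def parse(response: Response, send_request: RequestSender) -> None:
    # Schedule a follow-up request ahead of default-priority ones.
    await send_request(
        url="https://example.com/item/1",
        callback=parse_item,
        cb_kwargs={"source": response.url},
        errback=on_error,
        priority=-1,
        headers={"Accept": "application/json"},
    )


async def parse_item(response: Response, source: str) -> None:
    print(source, response.status)


async def on_error(exc: Exception) -> None:
    print("failed:", exc)
```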
@@ -0,0 +1,2 @@
+ from .base import BaseScrapper
+ from .executor import AIOScrapper
@@ -0,0 +1,10 @@
+ import abc
+
+
+ class BaseScrapper(abc.ABC):
+     @abc.abstractmethod
+     async def start(self, *args, **kwargs) -> None: ...
+
+     async def initialize(self, *args, **kwargs) -> None: ...
+
+     async def close(self) -> None: ...
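`start` is the only required hook; `initialize` and `close` are optional lifecycle hooks whose parameters are filled by name, so a scrapper can ask for `request_sender` and/or `pipeline` (see `executor.py` below). A minimal sketch with a hypothetical `NewsScrapper`:

```python
from aioscrapper import BaseScrapper, RequestSender
from aioscrapper.pipeline import PipelineDispatcher
from aioscrapper.types import Response


class NewsScrapper(BaseScrapper):
    async def initialize(self, pipeline: PipelineDispatcher) -> None:
        # Called once before start(); parameters are injected by name.
        self._pipeline = pipeline

    async def start(self, request_sender: RequestSender) -> None:
        await request_sender(url="https://example.com/news", callback=self.parse)

    async def parse(self, response: Response) -> None:
        print(response.status, response.content_type)

    async def close(self) -> None:
        # Called during AIOScrapper.close(), after requests have drained.
        pass
```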
@@ -0,0 +1,146 @@
+ import asyncio
+ import time
+ from logging import Logger, getLogger
+ from types import TracebackType
+ from typing import Type, Any
+
+ from aiojobs import Scheduler
+
+ from ..config import Config
+ from ..helpers import get_func_kwargs
+ from ..pipeline import PipelineDispatcher, BasePipeline
+ from ..request_manager import RequestManager
+ from ..request_sender import RequestSender
+ from ..scrapper import BaseScrapper
+ from ..session.aiohttp import AiohttpSession
+ from ..types import RequestMiddleware, ResponseMiddleware
+
+
+ class AIOScrapper:
+     def __init__(
+         self,
+         scrappers: list[BaseScrapper],
+         config: Config | None = None,
+         logger: Logger | None = None,
+     ) -> None:
+         self._start_time = time.time()
+         self._config = config or Config()
+         self._logger = logger or getLogger("aioscrapper")
+
+         self._scrappers = scrappers
+         self._request_outer_middlewares = []
+         self._request_inner_middlewares = []
+         self._response_middlewares = []
+
+         self._pipelines: dict[str, list[BasePipeline]] = {}
+         self._pipeline_dispatcher = PipelineDispatcher(
+             logger=self._logger.getChild("pipeline"), pipelines=self._pipelines
+         )
+
+         def _exception_handler(_, context: dict[str, Any]):
+             if "job" in context:
+                 self._logger.error(f'{context['message']}: {context["exception"]}', extra={"context": context})
+             else:
+                 self._logger.error("Unhandled error", extra={"context": context})
+
+         self._scheduler = Scheduler(
+             limit=self._config.scheduler.concurrent_requests,
+             pending_limit=self._config.scheduler.pending_requests,
+             close_timeout=self._config.scheduler.close_timeout,
+             exception_handler=_exception_handler,
+         )
+
+         self._request_queue = asyncio.PriorityQueue()
+         self._request_sender = RequestSender(self._request_queue)
+         self._request_manager = RequestManager(
+             logger=self._logger.getChild("request_worker"),
+             session=AiohttpSession(
+                 timeout=self._config.session.request.timeout,
+                 ssl=self._config.session.request.ssl,
+             ),
+             schedule_request=self._scheduler.spawn,
+             sender=self._request_sender,
+             queue=self._request_queue,
+             delay=self._config.session.request.delay,
+             shutdown_timeout=self._config.execution.shutdown_timeout,
+             srv_kwargs={"pipeline": self._pipeline_dispatcher},
+             request_outer_middlewares=self._request_outer_middlewares,
+             request_inner_middlewares=self._request_inner_middlewares,
+             response_middlewares=self._response_middlewares,
+         )
+
+         self._scrapper_kwargs = {"request_sender": self._request_sender, "pipeline": self._pipeline_dispatcher}
+
+     def add_pipeline(self, name: str, pipeline: BasePipeline) -> None:
+         if name not in self._pipelines:
+             self._pipelines[name] = [pipeline]
+         else:
+             self._pipelines[name].append(pipeline)
+
+     def add_outer_request_middlewares(self, *middlewares: RequestMiddleware) -> None:
+         self._request_outer_middlewares.extend(middlewares)
+
+     def add_inner_request_middlewares(self, *middlewares: RequestMiddleware) -> None:
+         self._request_inner_middlewares.extend(middlewares)
+
+     def add_response_middlewares(self, *middlewares: ResponseMiddleware) -> None:
+         self._response_middlewares.extend(middlewares)
+
+     async def __aenter__(self):
+         return self
+
+     async def __aexit__(
+         self,
+         exc_type: Type[BaseException] | None,
+         exc_val: BaseException | None,
+         exc_tb: TracebackType | None,
+     ) -> None:
+         await self.close()
+
+     async def start(self) -> None:
+         await self._pipeline_dispatcher.initialize()
+         self._request_manager.listen_queue()
+
+         for scrapper in self._scrappers:
+             await scrapper.initialize(**get_func_kwargs(scrapper.initialize, self._scrapper_kwargs))
+
+         await asyncio.gather(
+             *[scrapper.start(**get_func_kwargs(scrapper.start, self._scrapper_kwargs)) for scrapper in self._scrappers]
+         )
+
+     async def _shutdown(self) -> bool:
+         status = False
+         execution_timeout = (
+             max(self._config.execution.timeout - (time.time() - self._start_time), 0.1)
+             if self._config.execution.timeout
+             else None
+         )
+         while True:
+             if execution_timeout is not None and time.time() - self._start_time > execution_timeout:
+                 self._logger.log(
+                     level=self._config.execution.log_level,
+                     msg=f"execution timeout: {self._config.execution.timeout}",
+                 )
+                 status = True
+                 break
+             if len(self._scheduler) == 0 and self._request_queue.qsize() == 0:
+                 break
+
+             await asyncio.sleep(self._config.execution.shutdown_check_interval)
+
+         return status
+
+     async def shutdown(self) -> None:
+         force = await self._shutdown()
+         await self._request_manager.shutdown(force)
+
+     async def close(self, shutdown: bool = True) -> None:
+         if shutdown:
+             await self.shutdown()
+
+         for scrapper in self._scrappers:
+             await scrapper.close()
+
+         await self._scheduler.close()
+         await self._request_manager.close()
+         await self._pipeline_dispatcher.close()
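`__aexit__` simply delegates to `close()`, which by default waits for the scheduler and request queue to drain (`shutdown=True`) before closing scrappers, the scheduler, the session and the pipelines. A minimal sketch of the same lifecycle without the context manager:

```python
from aioscrapper import AIOScrapper


async def run(scrappers) -> None:
    scrapper = AIOScrapper(scrappers=scrappers)
    try:
        await scrapper.start()
    finally:
        # Waits for in-flight requests, then tears everything down;
        # pass shutdown=False to skip the drain phase.
        await scrapper.close()
```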
File without changes
@@ -0,0 +1,46 @@
+ from aiohttp import ClientSession, ClientTimeout, TCPConnector
+ from aiohttp.helpers import BasicAuth
+
+ from .base import BaseSession
+ from ..types import Response, Request
+
+
+ class AiohttpSession(BaseSession):
+     def __init__(self, timeout: float | None = None, ssl: bool | None = None, **kwargs) -> None:
+         super().__init__(timeout, ssl)
+         self._session = ClientSession(
+             timeout=ClientTimeout(total=timeout),
+             connector=TCPConnector(ssl=ssl) if ssl is not None else None,
+             **kwargs,
+         )
+
+     async def make_request(self, request: Request) -> Response:
+         async with self._session.request(
+             url=request.url,
+             method=request.method,
+             params=request.params,
+             data=request.data,
+             json=request.json_data,
+             cookies=request.cookies,
+             headers=request.headers,
+             proxy=request.proxy,
+             auth=(
+                 BasicAuth(login=request.auth["username"], password=request.auth["password"])
+                 if request.auth is not None
+                 else None
+             ),
+             timeout=ClientTimeout(total=request.timeout) if request.timeout is not None else None,
+         ) as response:
+             return Response(
+                 url=request.url,
+                 method=request.method,
+                 params=request.params,
+                 status=response.status,
+                 headers=dict(response.headers),
+                 cookies={k: f"{v.key}={v.value}" for k, v in response.cookies.items()},
+                 content=await response.read(),
+                 content_type=response.headers.get("Content-Type"),
+             )
+
+     async def close(self) -> None:
+         await self._session.close()
@@ -0,0 +1,14 @@
+ import abc
+
+ from ..types import Request, Response
+
+
+ class BaseSession(abc.ABC):
+     def __init__(self, timeout: float | None = None, ssl: bool | None = None) -> None:
+         self._timeout = timeout
+         self._ssl = ssl
+
+     @abc.abstractmethod
+     async def make_request(self, request: Request) -> Response: ...
+
+     async def close(self) -> None: ...
@@ -0,0 +1,12 @@
+ from .middleware import RequestMiddleware, ResponseMiddleware
+ from .session import (
+     QueryParams,
+     Cookies,
+     Headers,
+     BasicAuth,
+     Request,
+     RequestParams,
+     PRPRequest,
+     RequestQueue,
+     Response,
+ )
@@ -0,0 +1,11 @@
+ from typing import Protocol
+
+ from .session import Request, RequestParams, Response
+
+
+ class RequestMiddleware(Protocol):
+     async def __call__(self, request: Request, params: RequestParams) -> None: ...
+
+
+ class ResponseMiddleware(Protocol):
+     async def __call__(self, params: RequestParams, response: Response) -> None: ...
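Middlewares are plain async callables matching these protocols: request middlewares can mutate the outgoing `Request` in place, response middlewares observe each `Response` before its callback runs. A minimal request-middleware sketch with a hypothetical name (a response middleware has the mirrored `(params, response)` signature):

```python
from aioscrapper.types import Request, RequestParams


async def add_user_agent(request: Request, params: RequestParams) -> None:
    # Inject a default header unless the request already sets one.
    request.headers = {"User-Agent": "aioscrapper-example", **(request.headers or {})}
```

Outer request middlewares run when a request is dequeued and inner ones right before the session sends it (see `request_manager.py` above); they are registered via `add_outer_request_middlewares`, `add_inner_request_middlewares` and `add_response_middlewares` on `AIOScrapper`.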
@@ -0,0 +1,109 @@
+ import asyncio
+ import json
+ from dataclasses import field, dataclass
+ from typing import Union, Mapping, Any, Callable, Awaitable, TypedDict
+ from urllib.parse import urlencode
+
+ QueryParams = Mapping[str, Union[str, int, float]]
+
+ Cookies = Mapping[str, str]
+ Headers = Mapping[str, str]
+
+
+ class BasicAuth(TypedDict):
+     username: str
+     password: str
+
+
+ @dataclass(slots=True)
+ class Request:
+     url: str
+     method: str
+     params: QueryParams | None = None
+     data: Any = None
+     json_data: Any = None
+     cookies: Cookies | None = None
+     headers: Headers | None = None
+     auth: BasicAuth | None = None
+     proxy: str | None = None
+     timeout: float | None = None
+
+     @property
+     def full_url(self) -> str:
+         return f"{self.url}{urlencode(self.params or {})}"
+
+
+ @dataclass(slots=True)
+ class RequestParams:
+     callback: Callable[..., Awaitable] | None = None
+     cb_kwargs: dict[str, Any] | None = None
+     errback: Callable[..., Awaitable] | None = None
+
+
+ @dataclass(slots=True, order=True)
+ class PRPRequest:
+     priority: int
+     request: Request = field(compare=False)
+     request_params: RequestParams = field(compare=False)
+
+
+ RequestQueue = asyncio.PriorityQueue[PRPRequest | None]
+
+
+ class Response:
+     def __init__(
+         self,
+         url: str,
+         method: str,
+         params: QueryParams | None,
+         status: int,
+         headers: Headers,
+         cookies: Cookies,
+         content: bytes,
+         content_type: str | None,
+     ) -> None:
+         self._url = url
+         self._method = method
+         self._params = params
+         self._status = status
+         self._headers = headers
+         self._cookies = cookies
+         self._content = content
+         self._content_type = content_type
+
+     @property
+     def url(self) -> str:
+         return self._url
+
+     @property
+     def method(self) -> str:
+         return self._method
+
+     @property
+     def params(self) -> QueryParams | None:
+         return self._params
+
+     @property
+     def status(self) -> int:
+         return self._status
+
+     @property
+     def headers(self) -> Headers | None:
+         return self._headers
+
+     @property
+     def cookies(self) -> Cookies | None:
+         return self._cookies
+
+     @property
+     def content_type(self) -> str | None:
+         return self._content_type
+
+     def bytes(self) -> bytes:
+         return self._content
+
+     def json(self) -> Any:
+         return json.loads(self._content) if self._content is not None else None
+
+     def text(self, encoding: str = "utf-8") -> str | None:
+         return self._content.decode(encoding) if self._content is not None else None
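`Response` exposes the raw body plus convenience decoders, so callbacks typically branch on `status` or `content_type` and then call `json()` or `text()`. A minimal sketch:

```python
from aioscrapper.types import Response


async def parse(response: Response) -> None:
    if response.status != 200:
        return
    if response.content_type and "application/json" in response.content_type:
        payload = response.json()   # parsed from the raw bytes
    else:
        payload = response.text()   # decoded as UTF-8 by default
    print(response.url, type(payload))
```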
@@ -0,0 +1,87 @@
+ Metadata-Version: 2.4
+ Name: aioscrapper
+ Version: 0.1.0
+ Summary: Async framework for building modular and scalable web scrapers.
+ Author: darkstussy
+ Project-URL: Homepage, https://github.com/darkstussy/aioscrapper
+ Project-URL: Issues, https://github.com/darkstussy/aioscrapper/issues
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Framework :: AsyncIO
+ Classifier: Intended Audience :: Developers
+ Classifier: Operating System :: OS Independent
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+ Requires-Python: >=3.12
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: aiohttp[speedups]~=3.11.16
+ Requires-Dist: aiojobs~=1.4.0
+ Dynamic: license-file
+
+ # aioscrapper
+
+ **Asynchronous framework for building modular and scalable web scrapers.**
+
+ ![Python](https://img.shields.io/badge/python-3.12%2B-blue)
+ ![License](https://img.shields.io/github/license/darkstussy/aioscrapper)
+ ![Version](https://img.shields.io/github/v/tag/darkstussy/aioscrapper?label=version)
+
+ ## Features
+
+ - 🚀 Fully asynchronous architecture powered by `aiohttp` and `aiojobs`
+ - 🔧 Modular system with middleware support
+ - 📦 Pipeline data processing
+ - ⚙️ Flexible configuration
+ - 🔄 Priority-based request queue management
+ - 🛡️ Built-in error handling
+
+ ## Installation
+
+ ```bash
+ pip install aioscrapper
+ ```
+
+ ## Requirements
+
+ - Python 3.12 or higher
+ - aiohttp
+ - aiojobs
+
+ ## Quick Start
+
+ ```python
+ import asyncio
+
+ from aioscrapper import BaseScrapper, AIOScrapper, RequestSender
+ from aioscrapper.types import Response
+
+
+ class Scrapper(BaseScrapper):
+     async def start(self, request_sender: RequestSender) -> None:
+         await request_sender(url="https://example.com", callback=self.parse)
+
+     async def parse(self, response: Response) -> None:
+         # handle response
+         pass
+
+
+ async def main():
+     async with AIOScrapper(scrappers=[Scrapper()]) as scrapper:
+         await scrapper.start()
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
+ ```
+
+ ## License
+
+ MIT License
+
+ Copyright (c) 2025 darkstussy
+
+ ## Links
+
+ - [GitHub](https://github.com/darkstussy/aioscrapper)
+ - [Issues](https://github.com/darkstussy/aioscrapper/issues)
@@ -0,0 +1,26 @@
+ LICENSE
+ README.md
+ pyproject.toml
+ src/aioscrapper/__init__.py
+ src/aioscrapper/config.py
+ src/aioscrapper/exceptions.py
+ src/aioscrapper/helpers.py
+ src/aioscrapper/request_manager.py
+ src/aioscrapper/request_sender.py
+ src/aioscrapper.egg-info/PKG-INFO
+ src/aioscrapper.egg-info/SOURCES.txt
+ src/aioscrapper.egg-info/dependency_links.txt
+ src/aioscrapper.egg-info/requires.txt
+ src/aioscrapper.egg-info/top_level.txt
+ src/aioscrapper/pipeline/__init__.py
+ src/aioscrapper/pipeline/base.py
+ src/aioscrapper/pipeline/dispatcher.py
+ src/aioscrapper/scrapper/__init__.py
+ src/aioscrapper/scrapper/base.py
+ src/aioscrapper/scrapper/executor.py
+ src/aioscrapper/session/__init__.py
+ src/aioscrapper/session/aiohttp.py
+ src/aioscrapper/session/base.py
+ src/aioscrapper/types/__init__.py
+ src/aioscrapper/types/middleware.py
+ src/aioscrapper/types/session.py
@@ -0,0 +1,2 @@
+ aiohttp[speedups]~=3.11.16
+ aiojobs~=1.4.0
@@ -0,0 +1 @@
+ aioscrapper