crawlo-1.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (65)
  1. crawlo-1.0.0/LICENSE +23 -0
  2. crawlo-1.0.0/MANIFEST.in +11 -0
  3. crawlo-1.0.0/PKG-INFO +36 -0
  4. crawlo-1.0.0/README.md +2 -0
  5. crawlo-1.0.0/crawlo/__init__.py +5 -0
  6. crawlo-1.0.0/crawlo/__version__.py +2 -0
  7. crawlo-1.0.0/crawlo/core/__init__.py +2 -0
  8. crawlo-1.0.0/crawlo/core/engine.py +157 -0
  9. crawlo-1.0.0/crawlo/core/processor.py +40 -0
  10. crawlo-1.0.0/crawlo/core/scheduler.py +35 -0
  11. crawlo-1.0.0/crawlo/crawler.py +107 -0
  12. crawlo-1.0.0/crawlo/downloader/__init__.py +78 -0
  13. crawlo-1.0.0/crawlo/downloader/aiohttp_downloader.py +96 -0
  14. crawlo-1.0.0/crawlo/downloader/httpx_downloader.py +48 -0
  15. crawlo-1.0.0/crawlo/event.py +11 -0
  16. crawlo-1.0.0/crawlo/exceptions.py +64 -0
  17. crawlo-1.0.0/crawlo/extension/__init__.py +31 -0
  18. crawlo-1.0.0/crawlo/extension/log_interval.py +49 -0
  19. crawlo-1.0.0/crawlo/extension/log_stats.py +44 -0
  20. crawlo-1.0.0/crawlo/items/__init__.py +24 -0
  21. crawlo-1.0.0/crawlo/items/items.py +88 -0
  22. crawlo-1.0.0/crawlo/middleware/__init__.py +21 -0
  23. crawlo-1.0.0/crawlo/middleware/default_header.py +32 -0
  24. crawlo-1.0.0/crawlo/middleware/download_delay.py +28 -0
  25. crawlo-1.0.0/crawlo/middleware/middleware_manager.py +140 -0
  26. crawlo-1.0.0/crawlo/middleware/request_ignore.py +30 -0
  27. crawlo-1.0.0/crawlo/middleware/response_code.py +19 -0
  28. crawlo-1.0.0/crawlo/middleware/response_filter.py +26 -0
  29. crawlo-1.0.0/crawlo/middleware/retry.py +84 -0
  30. crawlo-1.0.0/crawlo/network/__init__.py +7 -0
  31. crawlo-1.0.0/crawlo/network/request.py +52 -0
  32. crawlo-1.0.0/crawlo/network/response.py +93 -0
  33. crawlo-1.0.0/crawlo/pipelines/__init__.py +13 -0
  34. crawlo-1.0.0/crawlo/pipelines/console_pipeline.py +20 -0
  35. crawlo-1.0.0/crawlo/pipelines/mongo_pipeline.py +5 -0
  36. crawlo-1.0.0/crawlo/pipelines/mysql_pipeline.py +5 -0
  37. crawlo-1.0.0/crawlo/pipelines/pipeline_manager.py +56 -0
  38. crawlo-1.0.0/crawlo/settings/__init__.py +7 -0
  39. crawlo-1.0.0/crawlo/settings/default_settings.py +39 -0
  40. crawlo-1.0.0/crawlo/settings/setting_manager.py +100 -0
  41. crawlo-1.0.0/crawlo/spider/__init__.py +36 -0
  42. crawlo-1.0.0/crawlo/stats_collector.py +47 -0
  43. crawlo-1.0.0/crawlo/subscriber.py +27 -0
  44. crawlo-1.0.0/crawlo/task_manager.py +27 -0
  45. crawlo-1.0.0/crawlo/templates/item_template.tmpl +22 -0
  46. crawlo-1.0.0/crawlo/templates/project_template/items/__init__.py +0 -0
  47. crawlo-1.0.0/crawlo/templates/project_template/main.py +33 -0
  48. crawlo-1.0.0/crawlo/templates/project_template/setting.py +190 -0
  49. crawlo-1.0.0/crawlo/templates/project_template/spiders/__init__.py +0 -0
  50. crawlo-1.0.0/crawlo/templates/spider_template.tmpl +31 -0
  51. crawlo-1.0.0/crawlo/utils/__init__.py +7 -0
  52. crawlo-1.0.0/crawlo/utils/date_tools.py +20 -0
  53. crawlo-1.0.0/crawlo/utils/func_tools.py +22 -0
  54. crawlo-1.0.0/crawlo/utils/log.py +39 -0
  55. crawlo-1.0.0/crawlo/utils/pqueue.py +16 -0
  56. crawlo-1.0.0/crawlo/utils/project.py +58 -0
  57. crawlo-1.0.0/crawlo/utils/system.py +11 -0
  58. crawlo-1.0.0/crawlo.egg-info/PKG-INFO +36 -0
  59. crawlo-1.0.0/crawlo.egg-info/SOURCES.txt +64 -0
  60. crawlo-1.0.0/crawlo.egg-info/dependency_links.txt +1 -0
  61. crawlo-1.0.0/crawlo.egg-info/entry_points.txt +2 -0
  62. crawlo-1.0.0/crawlo.egg-info/requires.txt +20 -0
  63. crawlo-1.0.0/crawlo.egg-info/top_level.txt +1 -0
  64. crawlo-1.0.0/pyproject.toml +6 -0
  65. crawlo-1.0.0/setup.cfg +52 -0
crawlo-1.0.0/LICENSE ADDED
@@ -0,0 +1,23 @@
+ MIT License
+
+ Modifications:
+
+ Copyright (c) 2020 crawl-coder <2251018029@qq.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
crawlo-1.0.0/MANIFEST.in ADDED
@@ -0,0 +1,11 @@
+ include README.md
+ include LICENSE
+
+ include crawlo/requirements.txt
+ include crawlo/VERSION
+
+ recursive-include crawlo/utils/js *
+ recursive-include crawlo/templates *
+ recursive-include tests *
+
+ global-exclude __pycache__ *.py[cod]
crawlo-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,36 @@
+ Metadata-Version: 2.4
+ Name: crawlo
+ Version: 1.0.0
+ Summary: feapder is a Python crawler framework with async support
+ Home-page: https://github.com/crawl-coder/Crawlo.git
+ Author: crawl-coder
+ Author-email: crawlo@qq.com
+ License: MIT
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: aiohttp>=3.12.6
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: DBUtils>=2.0
+ Requires-Dist: parsel>=1.10.0
+ Requires-Dist: pymysql>=1.1.1
+ Requires-Dist: ujson>=5.10.0
+ Provides-Extra: render
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
+ Requires-Dist: playwright; extra == "render"
+ Requires-Dist: selenium>=3.141.0; extra == "render"
+ Provides-Extra: all
+ Requires-Dist: bitarray>=1.5.3; extra == "all"
+ Requires-Dist: PyExecJS>=1.5.1; extra == "all"
+ Requires-Dist: pymongo>=3.10.1; extra == "all"
+ Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
+ Requires-Dist: playwright; extra == "all"
+ Requires-Dist: selenium>=3.141.0; extra == "all"
+ Dynamic: license-file
+
+ # Crawlo
+ An asynchronous general-purpose crawler framework
crawlo-1.0.0/README.md ADDED
@@ -0,0 +1,2 @@
+ # Crawlo
+ An asynchronous general-purpose crawler framework
crawlo-1.0.0/crawlo/__init__.py ADDED
@@ -0,0 +1,5 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from crawlo.network.request import Request
+ from crawlo.network.response import Response
+ from crawlo.items.items import Item
crawlo-1.0.0/crawlo/__version__.py ADDED
@@ -0,0 +1,2 @@
+
+ __version__ = "1.0.0"
crawlo-1.0.0/crawlo/core/__init__.py ADDED
@@ -0,0 +1,2 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
crawlo-1.0.0/crawlo/core/engine.py ADDED
@@ -0,0 +1,157 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ import asyncio
+ from typing import Optional, Generator, Callable
+ from inspect import iscoroutine, isgenerator, isasyncgen
+
+ from crawlo import Request, Item
+ from crawlo.spider import Spider
+ from crawlo.core.scheduler import Scheduler
+ from crawlo.core.processor import Processor
+ from crawlo.utils.log import get_logger
+ from crawlo.task_manager import TaskManager
+ from crawlo.utils.project import load_class
+ from crawlo.downloader import DownloaderBase
+ from crawlo.utils.func_tools import transform
+ from crawlo.exceptions import OutputError, TransformTypeError
+ from crawlo.event import spider_opened, spider_error
+
+
+ class Engine(object):
+
+     def __init__(self, crawler):
+         self.running = False
+         self.normal = True
+         self.crawler = crawler
+         self.settings = crawler.settings
+         self.spider: Optional[Spider] = None
+         self.downloader: Optional[DownloaderBase] = None
+         self.scheduler: Optional[Scheduler] = None
+         self.processor: Optional[Processor] = None
+         self.start_requests: Optional[Generator] = None
+         self.task_manager: Optional[TaskManager] = TaskManager(self.settings.get_int('CONCURRENCY'))
+
+         self.logger = get_logger(name=self.__class__.__name__)
+
+     def _get_downloader_cls(self):
+         downloader_cls = load_class(self.settings.get('DOWNLOADER'))
+         if not issubclass(downloader_cls, DownloaderBase):
+             raise TypeError(f'Downloader {downloader_cls.__name__} is not subclass of DownloaderBase.')
+         return downloader_cls
+
+     def engine_start(self):
+         self.running = True
+         self.logger.info(
+             f"Crawlo (version {self.settings.get_int('VERSION')}) started. "
+             f"(project name : {self.settings.get('PROJECT_NAME')})"
+         )
+
+     async def start_spider(self, spider):
+         self.spider = spider
+
+         self.scheduler = Scheduler(self.crawler)
+         if hasattr(self.scheduler, 'open'):
+             self.scheduler.open()
+
+         downloader_cls = self._get_downloader_cls()
+         self.downloader = downloader_cls(self.crawler)
+         if hasattr(self.downloader, 'open'):
+             self.downloader.open()
+
+         self.processor = Processor(self.crawler)
+         if hasattr(self.processor, 'open'):
+             self.processor.open()
+
+         self.start_requests = iter(spider.start_requests())
+         await self._open_spider()
+
+     async def crawl(self):
+         """
+         Crawl the spider
+         """
+         while self.running:
+             if request := await self._get_next_request():
+                 await self._crawl(request)
+             try:
+                 start_request = next(self.start_requests)
+             except StopIteration:
+                 self.start_requests = None
+             except Exception as exp:
+                 # exit only when:
+                 # 1. all dispatched requests have finished running
+                 # 2. the scheduler is idle
+                 # 3. the downloader is idle
+                 if not await self._exit():
+                     continue
+                 self.running = False
+                 if self.start_requests is not None:
+                     self.logger.error(f"Error while generating start requests: {str(exp)}")
+             else:
+                 # enqueue the request
+                 await self.enqueue_request(start_request)
+
+         if not self.running:
+             await self.close_spider()
+
+     async def _open_spider(self):
+         asyncio.create_task(self.crawler.subscriber.notify(spider_opened))
+         crawling = asyncio.create_task(self.crawl())
+         await crawling
+
+     async def _crawl(self, request):
+         # TODO: implement concurrency control
+         async def crawl_task():
+             outputs = await self._fetch(request)
+             # TODO: handle the output
+             if outputs:
+                 await self._handle_spider_output(outputs)
+
+         # asyncio.create_task(crawl_task())
+         self.task_manager.create_task(crawl_task())
+
+     async def _fetch(self, request):
+         async def _successful(_response):
+             callback: Callable = request.callback or self.spider.parse
+             if _outputs := callback(_response):
+                 if iscoroutine(_outputs):
+                     await _outputs
+                 else:
+                     return transform(_outputs)
+
+         _response = await self.downloader.fetch(request)
+         if _response is None:
+             return None
+         output = await _successful(_response)
+         return output
+
+     async def enqueue_request(self, start_request):
+         await self._schedule_request(start_request)
+
+     async def _schedule_request(self, request):
+         # TODO: request de-duplication
+         await self.scheduler.enqueue_request(request)
+
+     async def _get_next_request(self):
+         return await self.scheduler.next_request()
+
+     async def _handle_spider_output(self, outputs):
+         async for spider_output in outputs:
+             if isinstance(spider_output, (Request, Item)):
+                 await self.processor.enqueue(spider_output)
+             elif isinstance(spider_output, Exception):
+                 asyncio.create_task(
+                     self.crawler.subscriber.notify(spider_error, spider_output, self.spider)
+                 )
+                 raise spider_output
+             else:
+                 raise OutputError(f'{type(self.spider)} must return `Request` or `Item`.')
+
+     async def _exit(self):
+         if self.scheduler.idle() and self.downloader.idle() and self.task_manager.all_done() and self.processor.idle():
+             return True
+         return False
+
+     async def close_spider(self):
+         await asyncio.gather(*self.task_manager.current_task)
+         await self.downloader.close()
+         if self.normal:
+             await self.crawler.close()
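The crawl loop above only accepts `Request` or `Item` objects from spider callbacks; anything else triggers `OutputError` in `_handle_spider_output`. A minimal spider sketch against that contract (the `Request` constructor is not shown in this diff, so the `url=`/`callback=` keyword arguments are assumptions inferred from how the engine and downloaders read `request.callback` and `request.url`):

from crawlo import Request
from crawlo.spider import Spider


class ExampleSpider(Spider):
    # Hypothetical spider for illustration only; it is not part of the package.

    def start_requests(self):
        # The engine wraps this in iter() and drains it inside crawl().
        yield Request(url="https://example.com", callback=self.parse)

    def parse(self, response):
        # Yield only Request or Item objects; other values make the engine
        # raise OutputError in _handle_spider_output.
        yield Request(url="https://example.com/next")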
crawlo-1.0.0/crawlo/core/processor.py ADDED
@@ -0,0 +1,40 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from asyncio import Queue
+ from typing import Union, Optional
+
+ from crawlo import Request, Item
+ from crawlo.pipelines.pipeline_manager import PipelineManager
+
+
+ class Processor(object):
+
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self.queue: Queue = Queue()
+         self.pipelines: Optional[PipelineManager] = None
+
+     def open(self):
+         self.pipelines = PipelineManager.create_instance(self.crawler)
+
+     async def process(self):
+         while not self.idle():
+             result = await self.queue.get()
+             if isinstance(result, Request):
+                 await self.crawler.engine.enqueue_request(result)
+             else:
+                 assert isinstance(result, Item)
+                 await self._process_item(result)
+
+     async def _process_item(self, item):
+         await self.pipelines.process_item(item=item)
+
+     async def enqueue(self, output: Union[Request, Item]):
+         await self.queue.put(output)
+         await self.process()
+
+     def idle(self) -> bool:
+         return len(self) == 0
+
+     def __len__(self):
+         return self.queue.qsize()
crawlo-1.0.0/crawlo/core/scheduler.py ADDED
@@ -0,0 +1,35 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ import asyncio
+ from typing import Optional
+
+ from crawlo.utils.log import get_logger
+ from crawlo.event import request_scheduled
+ from crawlo.utils.pqueue import SpiderPriorityQueue
+
+
+ class Scheduler:
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self.request_queue: Optional[SpiderPriorityQueue] = None
+
+         self.item_count = 0
+         self.response_count = 0
+         self.logger = get_logger(name=self.__class__.__name__, level=crawler.settings.get('LOG_LEVEL'))
+
+     def open(self):
+         self.request_queue = SpiderPriorityQueue()
+
+     async def next_request(self):
+         request = await self.request_queue.get()
+         return request
+
+     async def enqueue_request(self, request):
+         await self.request_queue.put(request)
+         asyncio.create_task(self.crawler.subscriber.notify(request_scheduled, request, self.crawler.spider))
+
+     def idle(self) -> bool:
+         return len(self) == 0
+
+     def __len__(self):
+         return self.request_queue.qsize()
crawlo-1.0.0/crawlo/crawler.py ADDED
@@ -0,0 +1,107 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ import signal
+ import asyncio
+ from typing import Type, Final, Set, Optional
+
+ from crawlo.spider import Spider
+ from crawlo.core.engine import Engine
+ from crawlo.subscriber import Subscriber
+
+ from crawlo.utils.log import get_logger
+ from crawlo.extension import ExtensionManager
+ from crawlo.exceptions import SpiderTypeError
+ from crawlo.utils.project import merge_settings
+ from crawlo.stats_collector import StatsCollector
+ from crawlo.event import spider_opened, spider_closed
+ from crawlo.settings.setting_manager import SettingManager
+
+ logger = get_logger(__name__)
+
+
+ class Crawler:
+
+     def __init__(self, spider_cls, settings):
+         self.spider_cls = spider_cls
+         self.spider: Optional[Spider] = None
+         self.engine: Optional[Engine] = None
+         self.stats: Optional[StatsCollector] = None
+         self.subscriber: Optional[Subscriber] = None
+         self.extension: Optional[ExtensionManager] = None
+         self.settings: SettingManager = settings.copy()
+
+     async def crawl(self):
+         self.subscriber = self._create_subscriber()
+         self.spider = self._create_spider()
+         self.engine = self._create_engine()
+         self.stats = self._create_stats()
+         self.extension = self._create_extension()
+
+         await self.engine.start_spider(self.spider)
+
+     @staticmethod
+     def _create_subscriber():
+         return Subscriber()
+
+     def _create_spider(self) -> Spider:
+         spider = self.spider_cls.create_instance(self)
+         self._set_spider(spider)
+         return spider
+
+     def _create_engine(self) -> Engine:
+         engine = Engine(self)
+         engine.engine_start()
+         return engine
+
+     def _create_stats(self) -> StatsCollector:
+         stats = StatsCollector(self)
+         return stats
+
+     def _create_extension(self) -> ExtensionManager:
+         extension = ExtensionManager.create_instance(self)
+         return extension
+
+     def _set_spider(self, spider):
+         self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
+         self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
+         merge_settings(spider, self.settings)
+
+     async def close(self, reason='finished') -> None:
+         await asyncio.create_task(self.subscriber.notify(spider_closed))
+         self.stats.close_spider(spider_name=self.spider, reason=reason)
+
+
+ class CrawlerProcess:
+
+     def __init__(self, settings=None):
+         self.crawlers: Final[Set] = set()
+         self._active_spiders: Final[Set] = set()
+         self.settings = settings
+
+         signal.signal(signal.SIGINT, self._shutdown)
+
+     async def crawl(self, spider: Type[Spider]):
+         crawler: Crawler = self._create_crawler(spider)
+         self.crawlers.add(crawler)
+         task = await self._crawl(crawler)
+         self._active_spiders.add(task)
+
+     @staticmethod
+     async def _crawl(crawler):
+         return asyncio.create_task(crawler.crawl())
+
+     async def start(self):
+         await asyncio.gather(*self._active_spiders)
+
+     def _create_crawler(self, spider_cls) -> Crawler:
+         if isinstance(spider_cls, str):
+             raise SpiderTypeError(f"{type(self)}.crawl args: String is not supported.")
+         crawler: Crawler = Crawler(spider_cls, self.settings)
+         return crawler
+
+     def _shutdown(self, _signum, _frame):
+         for crawler in self.crawlers:
+             crawler.engine.running = False
+             crawler.engine.normal = False
+             crawler.stats.close_spider(crawler.spider, 'Ctrl C')
+         logger.warning('Spiders received `Ctrl C` signal, closed.')
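Based on the `CrawlerProcess` API above, a minimal driver sketch (the project template's main.py is listed but not shown here, and the bare `SettingManager()` call is an assumption; `Crawler` only requires that the settings object support `copy()`, `get()` and `get_int()`):

import asyncio

from crawlo.crawler import CrawlerProcess
from crawlo.settings.setting_manager import SettingManager


async def main():
    settings = SettingManager()          # assumed constructor signature; not shown in this diff
    process = CrawlerProcess(settings=settings)
    await process.crawl(ExampleSpider)   # the hypothetical spider sketched after engine.py above
    await process.start()


if __name__ == "__main__":
    asyncio.run(main())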
crawlo-1.0.0/crawlo/downloader/__init__.py ADDED
@@ -0,0 +1,78 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from abc import abstractmethod, ABCMeta
+ from typing_extensions import Self
+ from typing import Final, Set, Optional
+ from contextlib import asynccontextmanager
+
+ from crawlo import Response, Request
+ from crawlo.utils.log import get_logger
+ from crawlo.middleware.middleware_manager import MiddlewareManager
+
+
+ class ActivateRequestManager:
+
+     def __init__(self):
+         self._active: Final[Set] = set()
+
+     def add(self, request):
+         self._active.add(request)
+
+     def remove(self, request):
+         self._active.remove(request)
+
+     @asynccontextmanager
+     async def __call__(self, request):
+         try:
+             yield self.add(request)
+         finally:
+             self.remove(request)
+
+     def __len__(self):
+         return len(self._active)
+
+
+ class DownloaderMeta(ABCMeta):
+     def __subclasscheck__(self, subclass):
+         required_methods = ('fetch', 'download', 'create_instance', 'close')
+         is_subclass = all(
+             hasattr(subclass, method) and callable(getattr(subclass, method, None)) for method in required_methods
+         )
+         return is_subclass
+
+
+ class DownloaderBase(metaclass=DownloaderMeta):
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self._active = ActivateRequestManager()
+         self.middleware: Optional[MiddlewareManager] = None
+         self.logger = get_logger(self.__class__.__name__, crawler.settings.get("LOG_LEVEL"))
+
+     @classmethod
+     def create_instance(cls, *args, **kwargs) -> Self:
+         return cls(*args, **kwargs)
+
+     def open(self) -> None:
+         self.logger.info(
+             f"{self.crawler.spider} <downloader class:{type(self).__name__}>"
+             f"<concurrency:{self.crawler.settings.get_int('CONCURRENCY')}>"
+         )
+         self.middleware = MiddlewareManager.create_instance(self.crawler)
+
+     async def fetch(self, request) -> Optional[Response]:
+         async with self._active(request):
+             response = await self.middleware.download(request)
+             return response
+
+     @abstractmethod
+     async def download(self, request: Request) -> Response:
+         pass
+
+     async def close(self) -> None:
+         pass
+
+     def idle(self) -> bool:
+         return len(self) == 0
+
+     def __len__(self) -> int:
+         return len(self._active)
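`DownloaderBase` leaves only `download()` abstract, and `DownloaderMeta.__subclasscheck__` treats any class exposing `fetch`, `download`, `create_instance` and `close` as a subclass. A toy subclass sketch; the `Response` keyword arguments are copied from the bundled aiohttp/httpx downloaders shown below, not from any documentation:

from typing import Optional

from crawlo import Response
from crawlo.downloader import DownloaderBase


class EchoDownloader(DownloaderBase):
    """Illustrative downloader that returns a canned body instead of doing network I/O."""

    async def download(self, request) -> Optional[Response]:
        return Response(
            url=request.url,
            headers={},
            status_code=200,
            body=b"echo",
            request=request,
        )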
crawlo-1.0.0/crawlo/downloader/aiohttp_downloader.py ADDED
@@ -0,0 +1,96 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from typing import Optional
+ from aiohttp import ClientSession, TCPConnector, BaseConnector, ClientTimeout, ClientResponse, TraceConfig
+
+ from crawlo import Response
+ from crawlo.downloader import DownloaderBase
+
+
+ class AioHttpDownloader(DownloaderBase):
+     def __init__(self, crawler):
+         super().__init__(crawler)
+         self.session: Optional[ClientSession] = None
+         self.connector: Optional[BaseConnector] = None
+         self._verify_ssl: Optional[bool] = None
+         self._timeout: Optional[ClientTimeout] = None
+         self._use_session: Optional[bool] = None
+         self.trace_config: Optional[TraceConfig] = None
+
+         self.request_method = {
+             "get": self._get,
+             "post": self._post
+         }
+
+     def open(self):
+         super().open()
+         self._timeout = ClientTimeout(total=self.crawler.settings.get_int("DOWNLOAD_TIMEOUT"))
+         self._verify_ssl = self.crawler.settings.get_bool("VERIFY_SSL")
+         self._use_session = self.crawler.settings.get_bool("USE_SESSION")
+         self.trace_config = TraceConfig()
+         self.trace_config.on_request_start.append(self.request_start)
+         if self._use_session:
+             self.connector = TCPConnector(verify_ssl=self._verify_ssl)
+             self.session = ClientSession(
+                 connector=self.connector, timeout=self._timeout, trace_configs=[self.trace_config]
+             )
+
+     async def download(self, request) -> Optional[Response]:
+         try:
+             if self._use_session:
+                 response = await self.send_request(self.session, request)
+                 body = await response.content.read()
+             else:
+                 connector = TCPConnector(verify_ssl=self._verify_ssl)
+                 async with ClientSession(
+                     connector=connector, timeout=self._timeout, trace_configs=[self.trace_config]
+                 ) as session:
+                     response = await self.send_request(session, request)
+                     body = await response.content.read()
+         except Exception as exp:
+             self.logger.error(f"Error downloading {request}: {exp}")
+             raise exp
+
+         return self.structure_response(request=request, response=response, body=body)
+
+     @staticmethod
+     def structure_response(request, response, body):
+         return Response(
+             url=response.url,
+             headers=dict(response.headers),
+             status_code=response.status,
+             body=body,
+             request=request
+         )
+
+     async def send_request(self, session, request) -> ClientResponse:
+         return await self.request_method[request.method.lower()](session, request)
+
+     @staticmethod
+     async def _get(session, request) -> ClientResponse:
+         response = await session.get(
+             request.url,
+             headers=request.headers,
+             cookies=request.cookies
+         )
+         return response
+
+     @staticmethod
+     async def _post(session, request) -> ClientResponse:
+         response = await session.post(
+             request.url,
+             data=request.body,
+             headers=request.headers,
+             cookies=request.cookies,
+             proxy=request.proxy,
+         )
+         return response
+
+     async def request_start(self, _session, _trace_config_ctx, params):
+         self.logger.debug(f"Request start: {params.url}, method:{params.method}")
+
+     async def close(self) -> None:
+         if self.connector:
+             await self.connector.close()
+         if self.session:
+             await self.session.close()
crawlo-1.0.0/crawlo/downloader/httpx_downloader.py ADDED
@@ -0,0 +1,48 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from typing import Optional
+ from httpx import AsyncClient, Timeout
+
+ from crawlo import Response
+ from crawlo.downloader import DownloaderBase
+
+
+ class HttpXDownloader(DownloaderBase):
+     def __init__(self, crawler):
+         super().__init__(crawler)
+         self._client: Optional[AsyncClient] = None
+         self._timeout: Optional[Timeout] = None
+
+     def open(self):
+         super().open()
+         timeout = self.crawler.settings.get_int("DOWNLOAD_TIMEOUT")
+         self._timeout = Timeout(timeout=timeout)
+
+     async def download(self, request) -> Optional[Response]:
+         try:
+             proxies = None
+             async with AsyncClient(timeout=self._timeout, proxy=proxies) as client:
+                 self.logger.debug(f"request downloading: {request.url}, method: {request.method}")
+                 response = await client.request(
+                     url=request.url,
+                     method=request.method,
+                     headers=request.headers,
+                     cookies=request.cookies,
+                     data=request.body
+                 )
+                 body = await response.aread()
+         except Exception as exp:
+             self.logger.error(f"Error downloading {request}: {exp}")
+             raise exp
+
+         return self.structure_response(request=request, response=response, body=body)
+
+     @staticmethod
+     def structure_response(request, response, body) -> Response:
+         return Response(
+             url=response.url,
+             headers=dict(response.headers),
+             status_code=response.status_code,
+             body=body,
+             request=request
+         )
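Which of the two downloaders runs is decided by the engine's `_get_downloader_cls()`, which resolves the `DOWNLOADER` setting through `load_class()`. A settings sketch, assuming `load_class()` takes a dotted import path (its implementation in crawlo/utils/project.py is not shown) and using only setting keys that appear in the code above:

# Sketch of a project settings module; the concrete values are illustrative.
DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"
# DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"

CONCURRENCY = 8          # read via settings.get_int("CONCURRENCY") by Engine and DownloaderBase
DOWNLOAD_TIMEOUT = 30    # seconds; both downloaders read it with get_int("DOWNLOAD_TIMEOUT")
VERIFY_SSL = True        # AioHttpDownloader only
USE_SESSION = True       # AioHttpDownloader only: reuse a single ClientSession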
crawlo-1.0.0/crawlo/event.py ADDED
@@ -0,0 +1,11 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+
+ spider_error = "spider_error"
+ spider_opened = "spider_open"
+ spider_closed = "spider_closed"
+ ignore_request = "ignore_request"
+ request_scheduled = "request_scheduled"
+ response_received = "request_received"
+ item_successful = "item_successful"
+ item_discard = "item_discard"
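These event names are plain strings passed to `Subscriber.subscribe()`/`notify()`; crawler.py subscribes `spider.spider_opened` and `spider.spider_closed` the same way. A sketch of hooking an extra listener onto `spider_error`; the `Subscriber` internals are not part of this diff, only the call pattern visible in crawler.py and the engine:

from crawlo.event import spider_error


async def log_spider_error(error, spider):
    # Argument order mirrors the engine's notify(spider_error, exception, spider).
    print(f"{spider} raised {error!r}")


def register(crawler):
    # Hypothetical helper; the subscribe() usage is copied from Crawler._set_spider().
    crawler.subscriber.subscribe(log_spider_error, event=spider_error)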