crawlo-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (59)
  1. crawlo/__init__.py +5 -0
  2. crawlo/__version__.py +2 -0
  3. crawlo/core/__init__.py +2 -0
  4. crawlo/core/engine.py +157 -0
  5. crawlo/core/processor.py +40 -0
  6. crawlo/core/scheduler.py +35 -0
  7. crawlo/crawler.py +107 -0
  8. crawlo/downloader/__init__.py +78 -0
  9. crawlo/downloader/aiohttp_downloader.py +96 -0
  10. crawlo/downloader/httpx_downloader.py +48 -0
  11. crawlo/event.py +11 -0
  12. crawlo/exceptions.py +64 -0
  13. crawlo/extension/__init__.py +31 -0
  14. crawlo/extension/log_interval.py +49 -0
  15. crawlo/extension/log_stats.py +44 -0
  16. crawlo/items/__init__.py +24 -0
  17. crawlo/items/items.py +88 -0
  18. crawlo/middleware/__init__.py +21 -0
  19. crawlo/middleware/default_header.py +32 -0
  20. crawlo/middleware/download_delay.py +28 -0
  21. crawlo/middleware/middleware_manager.py +140 -0
  22. crawlo/middleware/request_ignore.py +30 -0
  23. crawlo/middleware/response_code.py +19 -0
  24. crawlo/middleware/response_filter.py +26 -0
  25. crawlo/middleware/retry.py +84 -0
  26. crawlo/network/__init__.py +7 -0
  27. crawlo/network/request.py +52 -0
  28. crawlo/network/response.py +93 -0
  29. crawlo/pipelines/__init__.py +13 -0
  30. crawlo/pipelines/console_pipeline.py +20 -0
  31. crawlo/pipelines/mongo_pipeline.py +5 -0
  32. crawlo/pipelines/mysql_pipeline.py +5 -0
  33. crawlo/pipelines/pipeline_manager.py +56 -0
  34. crawlo/settings/__init__.py +7 -0
  35. crawlo/settings/default_settings.py +39 -0
  36. crawlo/settings/setting_manager.py +100 -0
  37. crawlo/spider/__init__.py +36 -0
  38. crawlo/stats_collector.py +47 -0
  39. crawlo/subscriber.py +27 -0
  40. crawlo/task_manager.py +27 -0
  41. crawlo/templates/item_template.tmpl +22 -0
  42. crawlo/templates/project_template/items/__init__.py +0 -0
  43. crawlo/templates/project_template/main.py +33 -0
  44. crawlo/templates/project_template/setting.py +190 -0
  45. crawlo/templates/project_template/spiders/__init__.py +0 -0
  46. crawlo/templates/spider_template.tmpl +31 -0
  47. crawlo/utils/__init__.py +7 -0
  48. crawlo/utils/date_tools.py +20 -0
  49. crawlo/utils/func_tools.py +22 -0
  50. crawlo/utils/log.py +39 -0
  51. crawlo/utils/pqueue.py +16 -0
  52. crawlo/utils/project.py +58 -0
  53. crawlo/utils/system.py +11 -0
  54. crawlo-1.0.0.dist-info/METADATA +36 -0
  55. crawlo-1.0.0.dist-info/RECORD +59 -0
  56. crawlo-1.0.0.dist-info/WHEEL +5 -0
  57. crawlo-1.0.0.dist-info/entry_points.txt +2 -0
  58. crawlo-1.0.0.dist-info/licenses/LICENSE +23 -0
  59. crawlo-1.0.0.dist-info/top_level.txt +1 -0
crawlo/__init__.py ADDED
@@ -0,0 +1,5 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from crawlo.network.request import Request
+ from crawlo.network.response import Response
+ from crawlo.items.items import Item
crawlo/__version__.py ADDED
@@ -0,0 +1,2 @@
+
+ __version__ = "1.0.0"
crawlo/core/__init__.py ADDED
@@ -0,0 +1,2 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
crawlo/core/engine.py ADDED
@@ -0,0 +1,157 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ import asyncio
+ from typing import Optional, Generator, Callable
+ from inspect import iscoroutine, isgenerator, isasyncgen
+
+ from crawlo import Request, Item
+ from crawlo.spider import Spider
+ from crawlo.core.scheduler import Scheduler
+ from crawlo.core.processor import Processor
+ from crawlo.utils.log import get_logger
+ from crawlo.task_manager import TaskManager
+ from crawlo.utils.project import load_class
+ from crawlo.downloader import DownloaderBase
+ from crawlo.utils.func_tools import transform
+ from crawlo.exceptions import OutputError, TransformTypeError
+ from crawlo.event import spider_opened, spider_error
+
+
+ class Engine(object):
+
+     def __init__(self, crawler):
+         self.running = False
+         self.normal = True
+         self.crawler = crawler
+         self.settings = crawler.settings
+         self.spider: Optional[Spider] = None
+         self.downloader: Optional[DownloaderBase] = None
+         self.scheduler: Optional[Scheduler] = None
+         self.processor: Optional[Processor] = None
+         self.start_requests: Optional[Generator] = None
+         self.task_manager: Optional[TaskManager] = TaskManager(self.settings.get_int('CONCURRENCY'))
+
+         self.logger = get_logger(name=self.__class__.__name__)
+
+     def _get_downloader_cls(self):
+         downloader_cls = load_class(self.settings.get('DOWNLOADER'))
+         if not issubclass(downloader_cls, DownloaderBase):
+             raise TypeError(f'Downloader {downloader_cls.__name__} is not subclass of DownloaderBase.')
+         return downloader_cls
+
+     def engine_start(self):
+         self.running = True
+         self.logger.info(
+             f"Crawlo (version {self.settings.get_int('VERSION')}) started. "
+             f"(project name : {self.settings.get('PROJECT_NAME')})"
+         )
+
+     async def start_spider(self, spider):
+         self.spider = spider
+
+         self.scheduler = Scheduler(self.crawler)
+         if hasattr(self.scheduler, 'open'):
+             self.scheduler.open()
+
+         downloader_cls = self._get_downloader_cls()
+         self.downloader = downloader_cls(self.crawler)
+         if hasattr(self.downloader, 'open'):
+             self.downloader.open()
+
+         self.processor = Processor(self.crawler)
+         if hasattr(self.processor, 'open'):
+             self.processor.open()
+
+         self.start_requests = iter(spider.start_requests())
+         await self._open_spider()
+
+     async def crawl(self):
+         """
+         Crawl the spider
+         """
+         while self.running:
+             if request := await self._get_next_request():
+                 await self._crawl(request)
+             try:
+                 start_request = next(self.start_requests)
+             except StopIteration:
+                 self.start_requests = None
+             except Exception as exp:
+                 # 1. all dispatched requests have finished
+                 # 2. the scheduler is idle
+                 # 3. the downloader is idle
+                 if not await self._exit():
+                     continue
+                 self.running = False
+                 if self.start_requests is not None:
+                     self.logger.error(f"Error while starting requests: {str(exp)}")
+             else:
+                 # enqueue the request
+                 await self.enqueue_request(start_request)
+
+         if not self.running:
+             await self.close_spider()
+
+     async def _open_spider(self):
+         asyncio.create_task(self.crawler.subscriber.notify(spider_opened))
+         crawling = asyncio.create_task(self.crawl())
+         await crawling
+
+     async def _crawl(self, request):
+         # TODO implement concurrency
+         async def crawl_task():
+             outputs = await self._fetch(request)
+             # TODO handle outputs
+             if outputs:
+                 await self._handle_spider_output(outputs)
+
+         # asyncio.create_task(crawl_task())
+         self.task_manager.create_task(crawl_task())
+
+     async def _fetch(self, request):
+         async def _successful(_response):
+             callback: Callable = request.callback or self.spider.parse
+             if _outputs := callback(_response):
+                 if iscoroutine(_outputs):
+                     await _outputs
+                 else:
+                     return transform(_outputs)
+
+         _response = await self.downloader.fetch(request)
+         if _response is None:
+             return None
+         output = await _successful(_response)
+         return output
+
+     async def enqueue_request(self, start_request):
+         await self._schedule_request(start_request)
+
+     async def _schedule_request(self, request):
+         # TODO request deduplication
+         await self.scheduler.enqueue_request(request)
+
+     async def _get_next_request(self):
+         return await self.scheduler.next_request()
+
+     async def _handle_spider_output(self, outputs):
+         async for spider_output in outputs:
+             if isinstance(spider_output, (Request, Item)):
+                 await self.processor.enqueue(spider_output)
+             elif isinstance(spider_output, Exception):
+                 asyncio.create_task(
+                     self.crawler.subscriber.notify(spider_error, spider_output, self.spider)
+                 )
+                 raise spider_output
+             else:
+                 raise OutputError(f'{type(self.spider)} must return `Request` or `Item`.')
+
+     async def _exit(self):
+         if self.scheduler.idle() and self.downloader.idle() and self.task_manager.all_done() and self.processor.idle():
+             return True
+         return False
+
+     async def close_spider(self):
+         await asyncio.gather(*self.task_manager.current_task)
+         await self.downloader.close()
+         if self.normal:
+             await self.crawler.close()
crawlo/core/processor.py ADDED
@@ -0,0 +1,40 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from asyncio import Queue
+ from typing import Union, Optional
+
+ from crawlo import Request, Item
+ from crawlo.pipelines.pipeline_manager import PipelineManager
+
+
+ class Processor(object):
+
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self.queue: Queue = Queue()
+         self.pipelines: Optional[PipelineManager] = None
+
+     def open(self):
+         self.pipelines = PipelineManager.create_instance(self.crawler)
+
+     async def process(self):
+         while not self.idle():
+             result = await self.queue.get()
+             if isinstance(result, Request):
+                 await self.crawler.engine.enqueue_request(result)
+             else:
+                 assert isinstance(result, Item)
+                 await self._process_item(result)
+
+     async def _process_item(self, item):
+         await self.pipelines.process_item(item=item)
+
+     async def enqueue(self, output: Union[Request, Item]):
+         await self.queue.put(output)
+         await self.process()
+
+     def idle(self) -> bool:
+         return len(self) == 0
+
+     def __len__(self):
+         return self.queue.qsize()
crawlo/core/scheduler.py ADDED
@@ -0,0 +1,35 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ import asyncio
+ from typing import Optional
+
+ from crawlo.utils.log import get_logger
+ from crawlo.event import request_scheduled
+ from crawlo.utils.pqueue import SpiderPriorityQueue
+
+
+ class Scheduler:
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self.request_queue: Optional[SpiderPriorityQueue] = None
+
+         self.item_count = 0
+         self.response_count = 0
+         self.logger = get_logger(name=self.__class__.__name__, level=crawler.settings.get('LOG_LEVEL'))
+
+     def open(self):
+         self.request_queue = SpiderPriorityQueue()
+
+     async def next_request(self):
+         request = await self.request_queue.get()
+         return request
+
+     async def enqueue_request(self, request):
+         await self.request_queue.put(request)
+         asyncio.create_task(self.crawler.subscriber.notify(request_scheduled, request, self.crawler.spider))
+
+     def idle(self) -> bool:
+         return len(self) == 0
+
+     def __len__(self):
+         return self.request_queue.qsize()
crawlo/crawler.py ADDED
@@ -0,0 +1,107 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*
+ import signal
+ import asyncio
+ from typing import Type, Final, Set, Optional
+
+ from crawlo.spider import Spider
+ from crawlo.core.engine import Engine
+ from crawlo.subscriber import Subscriber
+
+ from crawlo.utils.log import get_logger
+ from crawlo.extension import ExtensionManager
+ from crawlo.exceptions import SpiderTypeError
+ from crawlo.utils.project import merge_settings
+ from crawlo.stats_collector import StatsCollector
+ from crawlo.event import spider_opened, spider_closed
+ from crawlo.settings.setting_manager import SettingManager
+
+ logger = get_logger(__name__)
+
+
+ class Crawler:
+
+     def __init__(self, spider_cls, settings):
+         self.spider_cls = spider_cls
+         self.spider: Optional[Spider] = None
+         self.engine: Optional[Engine] = None
+         self.stats: Optional[StatsCollector] = None
+         self.subscriber: Optional[Subscriber] = None
+         self.extension: Optional[ExtensionManager] = None
+         self.settings: SettingManager = settings.copy()
+
+     async def crawl(self):
+         self.subscriber = self._create_subscriber()
+         self.spider = self._create_spider()
+         self.engine = self._create_engine()
+         self.stats = self._create_stats()
+         self.extension = self._create_extension()
+
+         await self.engine.start_spider(self.spider)
+
+     @staticmethod
+     def _create_subscriber():
+         return Subscriber()
+
+     def _create_spider(self) -> Spider:
+         spider = self.spider_cls.create_instance(self)
+         self._set_spider(spider)
+         return spider
+
+     def _create_engine(self) -> Engine:
+         engine = Engine(self)
+         engine.engine_start()
+         return engine
+
+     def _create_stats(self) -> StatsCollector:
+         stats = StatsCollector(self)
+         return stats
+
+     def _create_extension(self) -> ExtensionManager:
+         extension = ExtensionManager.create_instance(self)
+         return extension
+
+     def _set_spider(self, spider):
+         self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
+         self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
+         merge_settings(spider, self.settings)
+
+     async def close(self, reason='finished') -> None:
+         await asyncio.create_task(self.subscriber.notify(spider_closed))
+         self.stats.close_spider(spider_name=self.spider, reason=reason)
+
+
+ class CrawlerProcess:
+
+     def __init__(self, settings=None):
+         self.crawlers: Final[Set] = set()
+         self._active_spiders: Final[Set] = set()
+         self.settings = settings
+
+         signal.signal(signal.SIGINT, self._shutdown)
+
+     async def crawl(self, spider: Type[Spider]):
+         crawler: Crawler = self._create_crawler(spider)
+         self.crawlers.add(crawler)
+         task = await self._crawl(crawler)
+         self._active_spiders.add(task)
+
+     @staticmethod
+     async def _crawl(crawler):
+         return asyncio.create_task(crawler.crawl())
+
+     async def start(self):
+         await asyncio.gather(*self._active_spiders)
+
+     def _create_crawler(self, spider_cls) -> Crawler:
+         if isinstance(spider_cls, str):
+             raise SpiderTypeError(f"{type(self)}.crawl args: String is not supported.")
+         crawler: Crawler = Crawler(spider_cls, self.settings)
+         return crawler
+
+     def _shutdown(self, _signum, _frame):
+         for crawler in self.crawlers:
+             crawler.engine.running = False
+             crawler.engine.normal = False
+             crawler.stats.close_spider(crawler.spider, 'Ctrl C')
+         logger.warning(f'spiders received: `Ctrl C` signal, closed.')
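
crawler.py is the public entry point: CrawlerProcess is constructed with a settings object, crawl() wraps each spider class in a Crawler and schedules Crawler.crawl() as a task, and start() gathers the active spider tasks. Below is a minimal, hypothetical usage sketch based only on the API visible above; the spider module and the SettingManager construction are placeholder assumptions, not part of this diff.

    # Hypothetical usage sketch, not shipped in the package; it only exercises
    # the CrawlerProcess / Crawler API shown in crawler.py above.
    import asyncio

    from crawlo.crawler import CrawlerProcess
    from crawlo.settings.setting_manager import SettingManager  # construction details assumed

    from my_project.spiders.my_spider import MySpider  # placeholder spider class


    async def main():
        settings = SettingManager()        # assumes a no-argument constructor
        process = CrawlerProcess(settings)
        await process.crawl(MySpider)      # builds a Crawler and schedules crawler.crawl()
        await process.start()              # gathers all active spider tasks


    if __name__ == '__main__':
        asyncio.run(main())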
crawlo/downloader/__init__.py ADDED
@@ -0,0 +1,78 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from abc import abstractmethod, ABCMeta
+ from typing_extensions import Self
+ from typing import Final, Set, Optional
+ from contextlib import asynccontextmanager
+
+ from crawlo import Response, Request
+ from crawlo.utils.log import get_logger
+ from crawlo.middleware.middleware_manager import MiddlewareManager
+
+
+ class ActivateRequestManager:
+
+     def __init__(self):
+         self._active: Final[Set] = set()
+
+     def add(self, request):
+         self._active.add(request)
+
+     def remove(self, request):
+         self._active.remove(request)
+
+     @asynccontextmanager
+     async def __call__(self, request):
+         try:
+             yield self.add(request)
+         finally:
+             self.remove(request)
+
+     def __len__(self):
+         return len(self._active)
+
+
+ class DownloaderMeta(ABCMeta):
+     def __subclasscheck__(self, subclass):
+         required_methods = ('fetch', 'download', 'create_instance', 'close')
+         is_subclass = all(
+             hasattr(subclass, method) and callable(getattr(subclass, method, None)) for method in required_methods
+         )
+         return is_subclass
+
+
+ class DownloaderBase(metaclass=DownloaderMeta):
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self._active = ActivateRequestManager()
+         self.middleware: Optional[MiddlewareManager] = None
+         self.logger = get_logger(self.__class__.__name__, crawler.settings.get("LOG_LEVEL"))
+
+     @classmethod
+     def create_instance(cls, *args, **kwargs) -> Self:
+         return cls(*args, **kwargs)
+
+     def open(self) -> None:
+         self.logger.info(
+             f"{self.crawler.spider} <downloader class:{type(self).__name__}>"
+             f"<concurrency:{self.crawler.settings.get_int('CONCURRENCY')}>"
+         )
+         self.middleware = MiddlewareManager.create_instance(self.crawler)
+
+     async def fetch(self, request) -> Optional[Response]:
+         async with self._active(request):
+             response = await self.middleware.download(request)
+             return response
+
+     @abstractmethod
+     async def download(self, request: Request) -> Response:
+         pass
+
+     async def close(self) -> None:
+         pass
+
+     def idle(self) -> bool:
+         return len(self) == 0
+
+     def __len__(self) -> int:
+         return len(self._active)
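
DownloaderMeta makes the subclass check structural: any class providing fetch, download, create_instance and close passes issubclass(..., DownloaderBase), and a concrete downloader only has to implement download(), since fetch(), open() and the idle bookkeeping are inherited. The engine loads whichever class the DOWNLOADER setting points at via load_class(). A hedged sketch of a custom downloader under those assumptions (urllib is used only to keep the example dependency-free; it is not what the package ships):

    # Sketch of a custom downloader against the DownloaderBase contract shown above.
    import asyncio
    import urllib.request

    from crawlo import Response
    from crawlo.downloader import DownloaderBase


    class UrllibDownloader(DownloaderBase):

        async def download(self, request) -> Response:
            # Run the blocking urllib call in a worker thread so the event loop stays free.
            def _blocking_get():
                with urllib.request.urlopen(request.url) as resp:
                    return resp.status, dict(resp.headers), resp.read()

            status, headers, body = await asyncio.to_thread(_blocking_get)
            # Response keyword arguments mirror structure_response() in the shipped downloaders.
            return Response(
                url=request.url,
                headers=headers,
                status_code=status,
                body=body,
                request=request,
            )

To use such a class, the DOWNLOADER setting would be pointed at its dotted path, mirroring how Engine._get_downloader_cls() resolves the default downloader.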
crawlo/downloader/aiohttp_downloader.py ADDED
@@ -0,0 +1,96 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from typing import Optional
+ from aiohttp import ClientSession, TCPConnector, BaseConnector, ClientTimeout, ClientResponse, TraceConfig
+
+ from crawlo import Response
+ from crawlo.downloader import DownloaderBase
+
+
+ class AioHttpDownloader(DownloaderBase):
+     def __init__(self, crawler):
+         super().__init__(crawler)
+         self.session: Optional[ClientSession] = None
+         self.connector: Optional[BaseConnector] = None
+         self._verify_ssl: Optional[bool] = None
+         self._timeout: Optional[ClientTimeout] = None
+         self._use_session: Optional[bool] = None
+         self.trace_config: Optional[TraceConfig] = None
+
+         self.request_method = {
+             "get": self._get,
+             "post": self._post
+         }
+
+     def open(self):
+         super().open()
+         self._timeout = ClientTimeout(total=self.crawler.settings.get_int("DOWNLOAD_TIMEOUT"))
+         self._verify_ssl = self.crawler.settings.get_bool("VERIFY_SSL")
+         self._use_session = self.crawler.settings.get_bool("USE_SESSION")
+         self.trace_config = TraceConfig()
+         self.trace_config.on_request_start.append(self.request_start)
+         if self._use_session:
+             self.connector = TCPConnector(verify_ssl=self._verify_ssl)
+             self.session = ClientSession(
+                 connector=self.connector, timeout=self._timeout, trace_configs=[self.trace_config]
+             )
+
+     async def download(self, request) -> Optional[Response]:
+         try:
+             if self._use_session:
+                 response = await self.send_request(self.session, request)
+                 body = await response.content.read()
+             else:
+                 connector = TCPConnector(verify_ssl=self._verify_ssl)
+                 async with ClientSession(
+                     connector=connector, timeout=self._timeout, trace_configs=[self.trace_config]
+                 ) as session:
+                     response = await self.send_request(session, request)
+                     body = await response.content.read()
+         except Exception as exp:
+             self.logger.error(f"Error downloading {request}: {exp}")
+             raise exp
+
+         return self.structure_response(request=request, response=response, body=body)
+
+     @staticmethod
+     def structure_response(request, response, body):
+         return Response(
+             url=response.url,
+             headers=dict(response.headers),
+             status_code=response.status,
+             body=body,
+             request=request
+         )
+
+     async def send_request(self, session, request) -> ClientResponse:
+         return await self.request_method[request.method.lower()](session, request)
+
+     @staticmethod
+     async def _get(session, request) -> ClientResponse:
+         response = await session.get(
+             request.url,
+             headers=request.headers,
+             cookies=request.cookies
+         )
+         return response
+
+     @staticmethod
+     async def _post(session, request) -> ClientResponse:
+         response = await session.post(
+             request.url,
+             data=request.body,
+             headers=request.headers,
+             cookies=request.cookies,
+             proxy=request.proxy,
+         )
+         return response
+
+     async def request_start(self, _session, _trace_config_ctx, params):
+         self.logger.debug(f"Request start: {params.url}, method:{params.method}")
+
+     async def close(self) -> None:
+         if self.connector:
+             await self.connector.close()
+         if self.session:
+             await self.session.close()
crawlo/downloader/httpx_downloader.py ADDED
@@ -0,0 +1,48 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from typing import Optional
+ from httpx import AsyncClient, Timeout
+
+ from crawlo import Response
+ from crawlo.downloader import DownloaderBase
+
+
+ class HttpXDownloader(DownloaderBase):
+     def __init__(self, crawler):
+         super().__init__(crawler)
+         self._client: Optional[AsyncClient] = None
+         self._timeout: Optional[Timeout] = None
+
+     def open(self):
+         super().open()
+         timeout = self.crawler.settings.get_int("DOWNLOAD_TIMEOUT")
+         self._timeout = Timeout(timeout=timeout)
+
+     async def download(self, request) -> Optional[Response]:
+         try:
+             proxies = None
+             async with AsyncClient(timeout=self._timeout, proxy=proxies) as client:
+                 self.logger.debug(f"request downloading: {request.url},method: {request.method}")
+                 response = await client.request(
+                     url=request.url,
+                     method=request.method,
+                     headers=request.headers,
+                     cookies=request.cookies,
+                     data=request.body
+                 )
+                 body = await response.aread()
+         except Exception as exp:
+             self.logger.error(f"Error downloading {request}: {exp}")
+             raise exp
+
+         return self.structure_response(request=request, response=response, body=body)
+
+     @staticmethod
+     def structure_response(request, response, body) -> Response:
+         return Response(
+             url=response.url,
+             headers=dict(response.headers),
+             status_code=response.status_code,
+             body=body,
+             request=request
+         )
crawlo/event.py ADDED
@@ -0,0 +1,11 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+
+ spider_error = "spider_error"
+ spider_opened = "spider_open"
+ spider_closed = "spider_closed"
+ ignore_request = "ignore_request"
+ request_scheduled = "request_scheduled"
+ response_received = "request_received"
+ item_successful = "item_successful"
+ item_discard = "item_discard"
crawlo/exceptions.py ADDED
@@ -0,0 +1,64 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ class TransformTypeError(TypeError):
+     pass
+
+
+ class OutputError(Exception):
+     pass
+
+
+ class SpiderTypeError(TypeError):
+     pass
+
+
+ class ItemInitError(Exception):
+     pass
+
+
+ class ItemAttributeError(Exception):
+     pass
+
+
+ class DecodeError(Exception):
+     pass
+
+
+ class MiddlewareInitError(Exception):
+     pass
+
+
+ class PipelineInitError(Exception):
+     pass
+
+
+ class InvalidOutputError(Exception):
+     pass
+
+
+ class RequestMethodError(Exception):
+     pass
+
+
+ class IgnoreRequestError(Exception):
+     def __init__(self, msg):
+         self.msg = msg
+         super(IgnoreRequestError, self).__init__(msg)
+
+
+ class ItemDiscard(Exception):
+     def __init__(self, msg):
+         self.msg = msg
+         super(ItemDiscard, self).__init__(msg)
+
+
+ class NotConfiguredError(Exception):
+     pass
+
+
+ class ExtensionInitError(Exception):
+     pass
+
+
+ class ReceiverTypeError(Exception):
+     pass
crawlo/extension/__init__.py ADDED
@@ -0,0 +1,31 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from typing import List
+ from pprint import pformat
+
+ from crawlo.utils.log import get_logger
+ from crawlo.utils.project import load_class
+ from crawlo.exceptions import ExtensionInitError
+
+
+ class ExtensionManager(object):
+
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self.extensions: List = []
+         extensions = self.crawler.settings.get_list('EXTENSIONS')
+         self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+         self._add_extensions(extensions)
+
+     @classmethod
+     def create_instance(cls, *args, **kwargs):
+         return cls(*args, **kwargs)
+
+     def _add_extensions(self, extensions):
+         for extension in extensions:
+             extension_cls = load_class(extension)
+             if not hasattr(extension_cls, 'create_instance'):
+                 raise ExtensionInitError(f"extension init failed, Must have method 'create_instance()")
+             self.extensions.append(extension_cls.create_instance(self.crawler))
+         if extensions:
+             self.logger.info(f"enabled extensions: \n {pformat(extensions)}")
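
_add_extensions() only requires that every class listed in the EXTENSIONS setting expose create_instance(crawler). A hypothetical minimal extension under that contract, reusing the subscriber.subscribe(callback, event=...) call seen in crawler.py; the exact callback signature Subscriber.notify() expects is not shown in this diff and is assumed here to take no arguments:

    # Hypothetical extension sketch, not part of the package; anything listed in
    # the EXTENSIONS setting only needs a create_instance() classmethod.
    from crawlo.event import spider_closed
    from crawlo.utils.log import get_logger


    class CloseLogger:

        def __init__(self, crawler):
            self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
            # subscribe() usage mirrors Crawler._set_spider() in crawler.py
            crawler.subscriber.subscribe(self.spider_closed, event=spider_closed)

        @classmethod
        def create_instance(cls, crawler):
            return cls(crawler)

        async def spider_closed(self):
            # Assumes notify(spider_closed) invokes subscribed coroutines without extra args.
            self.logger.info("spider closed")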