crawlo 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +9 -6
- crawlo/__version__.py +1 -2
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +158 -158
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +57 -59
- crawlo/crawler.py +242 -107
- crawlo/downloader/__init__.py +78 -78
- crawlo/downloader/aiohttp_downloader.py +259 -96
- crawlo/downloader/httpx_downloader.py +187 -48
- crawlo/downloader/playwright_downloader.py +160 -160
- crawlo/event.py +11 -11
- crawlo/exceptions.py +64 -64
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/filters/__init__.py +37 -37
- crawlo/filters/aioredis_filter.py +157 -129
- crawlo/filters/memory_filter.py +202 -203
- crawlo/filters/redis_filter.py +119 -119
- crawlo/items/__init__.py +62 -62
- crawlo/items/items.py +118 -118
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +140 -140
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +90 -89
- crawlo/network/__init__.py +7 -7
- crawlo/network/request.py +205 -155
- crawlo/network/response.py +166 -93
- crawlo/pipelines/__init__.py +13 -13
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_batch_pipline.py +133 -133
- crawlo/pipelines/mysql_pipeline.py +195 -176
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +93 -89
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +36 -36
- crawlo/stats_collector.py +59 -47
- crawlo/subscriber.py +106 -27
- crawlo/task_manager.py +27 -27
- crawlo/templates/item_template.tmpl +21 -21
- crawlo/templates/project_template/main.py +32 -32
- crawlo/templates/project_template/setting.py +189 -189
- crawlo/templates/spider_template.tmpl +30 -30
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/concurrency_manager.py +125 -0
- crawlo/utils/date_tools.py +177 -177
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/log.py +39 -39
- crawlo/utils/pqueue.py +173 -173
- crawlo/utils/project.py +59 -59
- crawlo/utils/request.py +122 -85
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +303 -0
- crawlo/utils/url.py +39 -39
- {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/METADATA +48 -36
- crawlo-1.0.3.dist-info/RECORD +80 -0
- {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/top_level.txt +1 -0
- tests/__init__.py +7 -0
- tests/baidu_spider/__init__.py +7 -0
- tests/baidu_spider/demo.py +94 -0
- tests/baidu_spider/items.py +25 -0
- tests/baidu_spider/middleware.py +49 -0
- tests/baidu_spider/pipeline.py +55 -0
- tests/baidu_spider/request_fingerprints.txt +9 -0
- tests/baidu_spider/run.py +27 -0
- tests/baidu_spider/settings.py +78 -0
- tests/baidu_spider/spiders/__init__.py +7 -0
- tests/baidu_spider/spiders/bai_du.py +61 -0
- tests/baidu_spider/spiders/sina.py +79 -0
- crawlo-1.0.1.dist-info/RECORD +0 -67
- crawlo-1.0.1.dist-info/licenses/LICENSE +0 -23
- {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/WHEEL +0 -0
- {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/entry_points.txt +0 -0
crawlo/middleware/middleware_manager.py
CHANGED
@@ -1,140 +1,140 @@
The file content is the same in both versions:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
from pprint import pformat
from types import MethodType
from asyncio import create_task
from collections import defaultdict
from typing import List, Dict, Callable, Optional

from crawlo import Request, Response
from crawlo.utils.log import get_logger
from crawlo.utils.project import load_class
from crawlo.middleware import BaseMiddleware
from crawlo.utils.project import common_call
from crawlo.event import ignore_request, response_received
from crawlo.exceptions import MiddlewareInitError, InvalidOutputError, RequestMethodError, IgnoreRequestError, \
    NotConfiguredError


class MiddlewareManager:

    def __init__(self, crawler):
        self.crawler = crawler
        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
        self.middlewares: List = []
        self.methods: Dict[str, List[MethodType]] = defaultdict(list)
        middlewares = self.crawler.settings.get_list('MIDDLEWARES')
        self._add_middleware(middlewares)
        self._add_method()

        self.download_method: Callable = crawler.engine.downloader.download
        self._stats = crawler.stats

    async def _process_request(self, request: Request):
        for method in self.methods['process_request']:
            result = await common_call(method, request, self.crawler.spider)
            if result is None:
                continue
            if isinstance(result, (Request, Response)):
                return result
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(result).__name__}"
            )
        return await self.download_method(request)

    async def _process_response(self, request: Request, response: Response):
        for method in reversed(self.methods['process_response']):
            try:
                response = await common_call(method, request, response, self.crawler.spider)
            except IgnoreRequestError as exp:
                create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
                # self.logger.info(f'{request} ignored.')
                # self._stats.inc_value('request_ignore_count')
                # reason = exp.msg
                # if reason:
                #     self._stats.inc_value(f'request_ignore_count/{reason}')
            if isinstance(response, Request):
                return response
            if isinstance(response, Response):
                continue
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return Request or Response, got {type(response).__name__}"
            )
        return response

    async def _process_exception(self, request: Request, exp: Exception):
        for method in self.methods['process_exception']:
            response = await common_call(method, request, exp, self.crawler.spider)
            if response is None:
                continue
            if isinstance(response, (Request, Response)):
                return response
            if response:
                break
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(response).__name__}"
            )
        else:
            raise exp

    async def download(self, request) -> Optional[Response]:
        """ called in the download method. """
        try:
            response = await self._process_request(request)
        except KeyError:
            raise RequestMethodError(f"{request.method.lower()} is not supported")
        except IgnoreRequestError as exp:
            create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
            response = await self._process_exception(request, exp)
        except Exception as exp:
            self._stats.inc_value(f'download_error/{exp.__class__.__name__}')
            response = await self._process_exception(request, exp)
        else:
            create_task(self.crawler.subscriber.notify(response_received, response, self.crawler.spider))
            # self.crawler.stats.inc_value('response_received_count')
        if isinstance(response, Response):
            response = await self._process_response(request, response)
        if isinstance(response, Request):
            await self.crawler.engine.enqueue_request(request)
            return None
        return response

    @classmethod
    def create_instance(cls, *args, **kwargs):
        return cls(*args, **kwargs)

    def _add_middleware(self, middlewares):
        enabled_middlewares = [m for m in middlewares if self._validate_middleware(m)]
        if enabled_middlewares:
            self.logger.info(f'enabled middleware:\n {pformat(enabled_middlewares)}')

    def _validate_middleware(self, middleware):
        middleware_cls = load_class(middleware)
        if not hasattr(middleware_cls, 'create_instance'):
            raise MiddlewareInitError(
                f"Middleware init failed, must inherit from `BaseMiddleware` or have a `create_instance` method"
            )
        try:
            instance = middleware_cls.create_instance(self.crawler)
            self.middlewares.append(instance)
            return True
        except NotConfiguredError:
            return False

    def _add_method(self):
        for middleware in self.middlewares:
            if hasattr(middleware, 'process_request'):
                if self._validate_middleware_method(method_name='process_request', middleware=middleware):
                    self.methods['process_request'].append(middleware.process_request)
            if hasattr(middleware, 'process_response'):
                if self._validate_middleware_method(method_name='process_response', middleware=middleware):
                    self.methods['process_response'].append(middleware.process_response)
            if hasattr(middleware, 'process_exception'):
                if self._validate_middleware_method(method_name='process_exception', middleware=middleware):
                    self.methods['process_exception'].append(middleware.process_exception)

    @staticmethod
    def _validate_middleware_method(method_name, middleware) -> bool:
        method = getattr(type(middleware), method_name)
        base_method = getattr(BaseMiddleware, method_name)
        return False if method == base_method else True
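MiddlewareManager resolves each dotted path listed in the MIDDLEWARES setting with `load_class`, instantiates it via `create_instance(crawler)`, and only registers the `process_request` / `process_response` / `process_exception` hooks that a class actually overrides relative to `BaseMiddleware`. A minimal sketch of a middleware that satisfies this contract is shown below, mirroring the plain-class style of the bundled middlewares; the class name and the `EXAMPLE_ENABLED` setting key are invented for illustration, everything else uses only APIs visible in the code above.

```python
# Hypothetical middleware, shown only to illustrate the contract that
# MiddlewareManager checks for; the class name and EXAMPLE_ENABLED key are invented.
from crawlo.utils.log import get_logger
from crawlo.exceptions import NotConfiguredError


class ExampleLoggingMiddleware:

    def __init__(self, log_level):
        self.logger = get_logger(self.__class__.__name__, log_level)

    @classmethod
    def create_instance(cls, crawler):
        # Raising NotConfiguredError makes _validate_middleware skip this
        # middleware instead of aborting start-up.
        if not crawler.settings.get('EXAMPLE_ENABLED'):
            raise NotConfiguredError
        return cls(log_level=crawler.settings.get('LOG_LEVEL'))

    def process_request(self, request, spider):
        # Returning None hands the request to the next hook and, eventually,
        # to the downloader; a Request or Response would short-circuit the chain.
        self.logger.debug(f'outgoing: {request}')
        return None

    def process_response(self, request, response, spider):
        # Must return a Request (re-schedule) or a Response (pass it on).
        return response
```

A class like this would then be referenced by its dotted path in the MIDDLEWARES setting so that `_add_middleware` can load and validate it.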
crawlo/middleware/request_ignore.py
CHANGED
@@ -1,30 +1,30 @@
The file content is the same in both versions:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
from crawlo.utils.log import get_logger
from crawlo.exceptions import IgnoreRequestError
from crawlo.event import ignore_request


class RequestIgnoreMiddleware(object):

    def __init__(self, stats, log_level):
        self.logger = get_logger(self.__class__.__name__, log_level)
        self.stats = stats

    @classmethod
    def create_instance(cls, crawler):
        o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
        crawler.subscriber.subscribe(o.request_ignore, event=ignore_request)
        return o

    async def request_ignore(self, exc, request, _spider):
        self.logger.info(f'{request} ignored.')
        self.stats.inc_value('request_ignore_count')
        reason = exc.msg
        if reason:
            self.stats.inc_value(f'request_ignore_count/{reason}')

    @staticmethod
    def process_exception(_request, exc, _spider):
        if isinstance(exc, IgnoreRequestError):
            return True
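`RequestIgnoreMiddleware` only reacts: it subscribes to the `ignore_request` event and counts ignored requests, using `exc.msg` as the per-reason stats key. The ignore itself is triggered by raising `IgnoreRequestError` from a hook, as `ResponseFilterMiddleware` below does for unwanted status codes. A hedged sketch of a request-side equivalent follows; the class and the `BLOCKED_HOSTS` setting are invented, and `request.url` is assumed to expose the request URL as a string.

```python
# Hypothetical middleware: raising IgnoreRequestError from process_request is
# caught by MiddlewareManager.download(), which fires the ignore_request event
# that RequestIgnoreMiddleware counts. BLOCKED_HOSTS is an invented setting key.
from crawlo.exceptions import IgnoreRequestError


class BlocklistMiddleware:

    def __init__(self, blocked_hosts):
        self.blocked_hosts = blocked_hosts

    @classmethod
    def create_instance(cls, crawler):
        return cls(blocked_hosts=crawler.settings.get_list('BLOCKED_HOSTS'))

    def process_request(self, request, spider):
        # Assumes request.url is the request URL as a string.
        if any(host in request.url for host in self.blocked_hosts):
            # The message ends up in exc.msg, so the stats key becomes
            # request_ignore_count/blocked host
            raise IgnoreRequestError('blocked host')
        return None
```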
crawlo/middleware/response_code.py
CHANGED
@@ -1,19 +1,19 @@
The file content is the same in both versions:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
from crawlo.utils.log import get_logger


class ResponseCodeMiddleware(object):
    def __init__(self, stats, log_level):
        self.logger = get_logger(self.__class__.__name__, log_level)
        self.stats = stats

    @classmethod
    def create_instance(cls, crawler):
        o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
        return o

    def process_response(self, request, response, spider):
        self.stats.inc_value(f'stats_code/count/{response.status_code}')
        self.logger.debug(f'Got response from <{response.status_code} {response.url}>')
        return response
crawlo/middleware/response_filter.py
CHANGED
@@ -1,26 +1,26 @@
The file content is the same in both versions:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
from crawlo.utils.log import get_logger
from crawlo.exceptions import IgnoreRequestError


class ResponseFilterMiddleware:

    def __init__(self, allowed_codes, log_level):
        self.allowed_codes = allowed_codes
        self.logger = get_logger(self.__class__.__name__, log_level)

    @classmethod
    def create_instance(cls, crawler):
        o = cls(
            allowed_codes=crawler.settings.get_list('ALLOWED_CODES'),
            log_level=crawler.settings.get('LOG_LEVEL')
        )
        return o

    def process_response(self, request, response, spider):
        if 200 <= response.status_code < 300:
            return response
        if response.status_code in self.allowed_codes:
            return response
        raise IgnoreRequestError(f"response status_code/non-200")
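`ResponseFilterMiddleware` passes 2xx responses through, accepts any additional status codes listed in `ALLOWED_CODES`, and raises `IgnoreRequestError` for everything else. A sketch of the corresponding project setting; the key name comes from `create_instance` above, the value is illustrative.

```python
# Illustrative value: with this setting, 2xx plus 301/302/404 responses are
# passed on and every other status code raises IgnoreRequestError.
ALLOWED_CODES = [301, 302, 404]
```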
crawlo/middleware/retry.py
CHANGED
@@ -1,89 +1,90 @@
Lines 1-82 are unchanged; the lines marked - / + at the end differ between versions.

#!/usr/bin/python
# -*- coding:UTF-8 -*-
from typing import List
from anyio import EndOfStream
from httpcore import ReadError
from asyncio.exceptions import TimeoutError
from httpx import RemoteProtocolError, ConnectError, ReadTimeout
from aiohttp.client_exceptions import ClientConnectionError, ClientPayloadError
from aiohttp import ClientConnectorError, ClientTimeout, ClientConnectorSSLError, ClientResponseError

from crawlo.utils.log import get_logger
from crawlo.stats_collector import StatsCollector

_retry_exceptions = [
    EndOfStream,
    ReadError,
    TimeoutError,
    ConnectError,
    ReadTimeout,
    ClientConnectorError,
    ClientResponseError,
    RemoteProtocolError,
    ClientTimeout,
    ClientConnectorSSLError,
    ClientPayloadError,
    ClientConnectionError
]


class RetryMiddleware(object):

    def __init__(
            self,
            *,
            retry_http_codes: List,
            ignore_http_codes: List,
            max_retry_times: int,
            retry_exceptions: List,
            stats: StatsCollector,
            retry_priority: int
    ):
        self.retry_http_codes = retry_http_codes
        self.ignore_http_codes = ignore_http_codes
        self.max_retry_times = max_retry_times
        self.retry_exceptions = tuple(retry_exceptions + _retry_exceptions)
        self.retry_priority = retry_priority
        self.stats = stats
        self.logger = get_logger(self.__class__.__name__)

    @classmethod
    def create_instance(cls, crawler):
        o = cls(
            retry_http_codes=crawler.settings.get_list('RETRY_HTTP_CODES'),
            ignore_http_codes=crawler.settings.get_list('IGNORE_HTTP_CODES'),
            max_retry_times=crawler.settings.get_int('MAX_RETRY_TIMES'),
            retry_exceptions=crawler.settings.get_list('RETRY_EXCEPTIONS'),
            stats=crawler.stats,
            retry_priority=crawler.settings.get_int('RETRY_PRIORITY')
        )
        return o

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status_code in self.ignore_http_codes:
            return response
        if response.status_code in self.retry_http_codes:
            # retry logic
            reason = f"response code {response.status_code}"
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exc, spider):
        if isinstance(exc, self.retry_exceptions) and not request.meta.get('dont_retry', False):
            return self._retry(request=request, reason=type(exc).__name__, spider=spider)

    def _retry(self, request, reason, spider):
        retry_times = request.meta.get('retry_times', 0)
        if retry_times < self.max_retry_times:
            retry_times += 1
            self.logger.info(f"{spider} {request} {reason} retrying {retry_times} time...")
            request.meta['retry_times'] = retry_times
-            request.dont_retry = True
-            request.
-            self.
-
-
-
-
+            # request.dont_retry = True
+            request.meta['dont_retry'] = True
+            request.retry_priority = request.priority + self.retry_priority
+            self.stats.inc_value("retry_count")
+            return request
+        else:
+            self.logger.warning(f"{spider} {request} {reason} retry max {self.max_retry_times} times, give up.")
+            return None
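The 1.0.3 side of this diff completes `_retry`, which was truncated in 1.0.1: a retried request is now marked with `meta['dont_retry']`, gets a `retry_priority` derived from `request.priority` plus the configured offset, increments the `retry_count` stat, and is returned; once `MAX_RETRY_TIMES` is exhausted the request is dropped with a warning. The middleware is driven by the settings read in `create_instance`; a sketch with illustrative values follows (only the key names come from the code above).

```python
# Illustrative values; the key names are those RetryMiddleware reads in
# create_instance(), everything else is an example.
RETRY_HTTP_CODES = [429, 500, 502, 503, 504]
IGNORE_HTTP_CODES = [403, 404]
MAX_RETRY_TIMES = 3
RETRY_EXCEPTIONS = []   # extra exception classes, merged with the built-in list
RETRY_PRIORITY = -1     # offset added to request.priority for the retried request
```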
crawlo/network/__init__.py
CHANGED
@@ -1,7 +1,7 @@
The file content is the same in both versions:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
# @Time : 2025-02-05 14:07
# @Author : oscar
# @Desc : None
"""