crawlo 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (80)
  1. crawlo/__init__.py +9 -6
  2. crawlo/__version__.py +1 -2
  3. crawlo/core/__init__.py +2 -2
  4. crawlo/core/engine.py +158 -158
  5. crawlo/core/processor.py +40 -40
  6. crawlo/core/scheduler.py +57 -59
  7. crawlo/crawler.py +242 -107
  8. crawlo/downloader/__init__.py +78 -78
  9. crawlo/downloader/aiohttp_downloader.py +259 -96
  10. crawlo/downloader/httpx_downloader.py +187 -48
  11. crawlo/downloader/playwright_downloader.py +160 -160
  12. crawlo/event.py +11 -11
  13. crawlo/exceptions.py +64 -64
  14. crawlo/extension/__init__.py +31 -31
  15. crawlo/extension/log_interval.py +49 -49
  16. crawlo/extension/log_stats.py +44 -44
  17. crawlo/filters/__init__.py +37 -37
  18. crawlo/filters/aioredis_filter.py +157 -129
  19. crawlo/filters/memory_filter.py +202 -203
  20. crawlo/filters/redis_filter.py +119 -119
  21. crawlo/items/__init__.py +62 -62
  22. crawlo/items/items.py +118 -118
  23. crawlo/middleware/__init__.py +21 -21
  24. crawlo/middleware/default_header.py +32 -32
  25. crawlo/middleware/download_delay.py +28 -28
  26. crawlo/middleware/middleware_manager.py +140 -140
  27. crawlo/middleware/request_ignore.py +30 -30
  28. crawlo/middleware/response_code.py +18 -18
  29. crawlo/middleware/response_filter.py +26 -26
  30. crawlo/middleware/retry.py +90 -89
  31. crawlo/network/__init__.py +7 -7
  32. crawlo/network/request.py +205 -155
  33. crawlo/network/response.py +166 -93
  34. crawlo/pipelines/__init__.py +13 -13
  35. crawlo/pipelines/console_pipeline.py +39 -39
  36. crawlo/pipelines/mongo_pipeline.py +116 -116
  37. crawlo/pipelines/mysql_batch_pipline.py +133 -133
  38. crawlo/pipelines/mysql_pipeline.py +195 -176
  39. crawlo/pipelines/pipeline_manager.py +56 -56
  40. crawlo/settings/__init__.py +7 -7
  41. crawlo/settings/default_settings.py +93 -89
  42. crawlo/settings/setting_manager.py +99 -99
  43. crawlo/spider/__init__.py +36 -36
  44. crawlo/stats_collector.py +59 -47
  45. crawlo/subscriber.py +106 -27
  46. crawlo/task_manager.py +27 -27
  47. crawlo/templates/item_template.tmpl +21 -21
  48. crawlo/templates/project_template/main.py +32 -32
  49. crawlo/templates/project_template/setting.py +189 -189
  50. crawlo/templates/spider_template.tmpl +30 -30
  51. crawlo/utils/__init__.py +7 -7
  52. crawlo/utils/concurrency_manager.py +125 -0
  53. crawlo/utils/date_tools.py +177 -177
  54. crawlo/utils/func_tools.py +82 -82
  55. crawlo/utils/log.py +39 -39
  56. crawlo/utils/pqueue.py +173 -173
  57. crawlo/utils/project.py +59 -59
  58. crawlo/utils/request.py +122 -85
  59. crawlo/utils/system.py +11 -11
  60. crawlo/utils/tools.py +303 -0
  61. crawlo/utils/url.py +39 -39
  62. {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/METADATA +48 -36
  63. crawlo-1.0.3.dist-info/RECORD +80 -0
  64. {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/top_level.txt +1 -0
  65. tests/__init__.py +7 -0
  66. tests/baidu_spider/__init__.py +7 -0
  67. tests/baidu_spider/demo.py +94 -0
  68. tests/baidu_spider/items.py +25 -0
  69. tests/baidu_spider/middleware.py +49 -0
  70. tests/baidu_spider/pipeline.py +55 -0
  71. tests/baidu_spider/request_fingerprints.txt +9 -0
  72. tests/baidu_spider/run.py +27 -0
  73. tests/baidu_spider/settings.py +78 -0
  74. tests/baidu_spider/spiders/__init__.py +7 -0
  75. tests/baidu_spider/spiders/bai_du.py +61 -0
  76. tests/baidu_spider/spiders/sina.py +79 -0
  77. crawlo-1.0.1.dist-info/RECORD +0 -67
  78. crawlo-1.0.1.dist-info/licenses/LICENSE +0 -23
  79. {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/WHEEL +0 -0
  80. {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/entry_points.txt +0 -0
@@ -1,140 +1,140 @@ (removed and added lines are identical; the file content appears once below)
#!/usr/bin/python
# -*- coding:UTF-8 -*-
from pprint import pformat
from types import MethodType
from asyncio import create_task
from collections import defaultdict
from typing import List, Dict, Callable, Optional

from crawlo import Request, Response
from crawlo.utils.log import get_logger
from crawlo.utils.project import load_class
from crawlo.middleware import BaseMiddleware
from crawlo.utils.project import common_call
from crawlo.event import ignore_request, response_received
from crawlo.exceptions import MiddlewareInitError, InvalidOutputError, RequestMethodError, IgnoreRequestError, \
    NotConfiguredError


class MiddlewareManager:

    def __init__(self, crawler):
        self.crawler = crawler
        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
        self.middlewares: List = []
        self.methods: Dict[str, List[MethodType]] = defaultdict(list)
        middlewares = self.crawler.settings.get_list('MIDDLEWARES')
        self._add_middleware(middlewares)
        self._add_method()

        self.download_method: Callable = crawler.engine.downloader.download
        self._stats = crawler.stats

    async def _process_request(self, request: Request):
        for method in self.methods['process_request']:
            result = await common_call(method, request, self.crawler.spider)
            if result is None:
                continue
            if isinstance(result, (Request, Response)):
                return result
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(result).__name__}"
            )
        return await self.download_method(request)

    async def _process_response(self, request: Request, response: Response):
        for method in reversed(self.methods['process_response']):
            try:
                response = await common_call(method, request, response, self.crawler.spider)
            except IgnoreRequestError as exp:
                create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
                # self.logger.info(f'{request} ignored.')
                # self._stats.inc_value('request_ignore_count')
                # reason = exp.msg
                # if reason:
                #     self._stats.inc_value(f'request_ignore_count/{reason}')
            if isinstance(response, Request):
                return response
            if isinstance(response, Response):
                continue
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return Request or Response, got {type(response).__name__}"
            )
        return response

    async def _process_exception(self, request: Request, exp: Exception):
        for method in self.methods['process_exception']:
            response = await common_call(method, request, exp, self.crawler.spider)
            if response is None:
                continue
            if isinstance(response, (Request, Response)):
                return response
            if response:
                break
            raise InvalidOutputError(
                f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(response).__name__}"
            )
        else:
            raise exp

    async def download(self, request) -> Optional[Response]:
        """ called in the download method. """
        try:
            response = await self._process_request(request)
        except KeyError:
            raise RequestMethodError(f"{request.method.lower()} is not supported")
        except IgnoreRequestError as exp:
            create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
            response = await self._process_exception(request, exp)
        except Exception as exp:
            self._stats.inc_value(f'download_error/{exp.__class__.__name__}')
            response = await self._process_exception(request, exp)
        else:
            create_task(self.crawler.subscriber.notify(response_received, response, self.crawler.spider))
            # self.crawler.stats.inc_value('response_received_count')
        if isinstance(response, Response):
            response = await self._process_response(request, response)
        if isinstance(response, Request):
            await self.crawler.engine.enqueue_request(request)
            return None
        return response

    @classmethod
    def create_instance(cls, *args, **kwargs):
        return cls(*args, **kwargs)

    def _add_middleware(self, middlewares):
        enabled_middlewares = [m for m in middlewares if self._validate_middleware(m)]
        if enabled_middlewares:
            self.logger.info(f'enabled middleware:\n {pformat(enabled_middlewares)}')

    def _validate_middleware(self, middleware):
        middleware_cls = load_class(middleware)
        if not hasattr(middleware_cls, 'create_instance'):
            raise MiddlewareInitError(
                f"Middleware init failed, must inherit from `BaseMiddleware` or have a `create_instance` method"
            )
        try:
            instance = middleware_cls.create_instance(self.crawler)
            self.middlewares.append(instance)
            return True
        except NotConfiguredError:
            return False

    def _add_method(self):
        for middleware in self.middlewares:
            if hasattr(middleware, 'process_request'):
                if self._validate_middleware_method(method_name='process_request', middleware=middleware):
                    self.methods['process_request'].append(middleware.process_request)
            if hasattr(middleware, 'process_response'):
                if self._validate_middleware_method(method_name='process_response', middleware=middleware):
                    self.methods['process_response'].append(middleware.process_response)
            if hasattr(middleware, 'process_exception'):
                if self._validate_middleware_method(method_name='process_exception', middleware=middleware):
                    self.methods['process_exception'].append(middleware.process_exception)

    @staticmethod
    def _validate_middleware_method(method_name, middleware) -> bool:
        method = getattr(type(middleware), method_name)
        base_method = getattr(BaseMiddleware, method_name)
        return False if method == base_method else True
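The manager above loads every dotted path listed in the MIDDLEWARES setting, instantiates it through create_instance(crawler), skips it if construction raises NotConfiguredError, and only registers the process_request / process_response / process_exception hooks that are actually overridden from BaseMiddleware. A minimal sketch of a middleware this manager would accept follows; the class name, the API_TOKEN setting, and the assumption that Request exposes a mutable headers mapping are illustrative, not part of crawlo:

from crawlo.middleware import BaseMiddleware
from crawlo.exceptions import NotConfiguredError


class TokenHeaderMiddleware(BaseMiddleware):  # hypothetical middleware
    def __init__(self, token):
        self.token = token

    @classmethod
    def create_instance(cls, crawler):
        token = crawler.settings.get('API_TOKEN')  # hypothetical setting key
        if not token:
            # NotConfiguredError makes MiddlewareManager._validate_middleware skip this middleware
            raise NotConfiguredError()
        return cls(token)

    def process_request(self, request, spider):
        # assumes request.headers behaves like a dict; returning None keeps the request in the chain
        request.headers['X-Api-Token'] = self.token
        return None

Enabling such a class would amount to adding its dotted path (for example 'myproject.middlewares.TokenHeaderMiddleware') to the MIDDLEWARES list read via settings.get_list('MIDDLEWARES').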
@@ -1,30 +1,30 @@ (removed and added lines are identical; content shown once)
#!/usr/bin/python
# -*- coding:UTF-8 -*-
from crawlo.utils.log import get_logger
from crawlo.exceptions import IgnoreRequestError
from crawlo.event import ignore_request


class RequestIgnoreMiddleware(object):

    def __init__(self, stats, log_level):
        self.logger = get_logger(self.__class__.__name__, log_level)
        self.stats = stats

    @classmethod
    def create_instance(cls, crawler):
        o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
        crawler.subscriber.subscribe(o.request_ignore, event=ignore_request)
        return o

    async def request_ignore(self, exc, request, _spider):
        self.logger.info(f'{request} ignored.')
        self.stats.inc_value('request_ignore_count')
        reason = exc.msg
        if reason:
            self.stats.inc_value(f'request_ignore_count/{reason}')

    @staticmethod
    def process_exception(_request, exc, _spider):
        if isinstance(exc, IgnoreRequestError):
            return True
@@ -1,19 +1,19 @@ (removed and added lines are identical; content shown once)
#!/usr/bin/python
# -*- coding:UTF-8 -*-
from crawlo.utils.log import get_logger


class ResponseCodeMiddleware(object):
    def __init__(self, stats, log_level):
        self.logger = get_logger(self.__class__.__name__, log_level)
        self.stats = stats

    @classmethod
    def create_instance(cls, crawler):
        o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
        return o

    def process_response(self, request, response, spider):
        self.stats.inc_value(f'stats_code/count/{response.status_code}')
        self.logger.debug(f'Got response from <{response.status_code} {response.url}>')
        return response
@@ -1,26 +1,26 @@ (removed and added lines are identical; content shown once)
#!/usr/bin/python
# -*- coding:UTF-8 -*-
from crawlo.utils.log import get_logger
from crawlo.exceptions import IgnoreRequestError


class ResponseFilterMiddleware:

    def __init__(self, allowed_codes, log_level):
        self.allowed_codes = allowed_codes
        self.logger = get_logger(self.__class__.__name__, log_level)

    @classmethod
    def create_instance(cls, crawler):
        o = cls(
            allowed_codes=crawler.settings.get_list('ALLOWED_CODES'),
            log_level=crawler.settings.get('LOG_LEVEL')
        )
        return o

    def process_response(self, request, response, spider):
        if 200 <= response.status_code < 300:
            return response
        if response.status_code in self.allowed_codes:
            return response
        raise IgnoreRequestError(f"response status_code/non-200")
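ResponseFilterMiddleware passes all 2xx responses through and otherwise only accepts status codes listed in the ALLOWED_CODES setting; anything else raises IgnoreRequestError, which the manager turns into an ignore_request event. A settings sketch (the specific codes are illustrative, not crawlo defaults):

ALLOWED_CODES = [301, 302, 404]  # non-2xx responses with these codes are still handed to the spider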
@@ -1,89 +1,90 @@ (every line was rewritten; the only content change is marked with - and + below)
#!/usr/bin/python
# -*- coding:UTF-8 -*-
from typing import List
from anyio import EndOfStream
from httpcore import ReadError
from asyncio.exceptions import TimeoutError
from httpx import RemoteProtocolError, ConnectError, ReadTimeout
from aiohttp.client_exceptions import ClientConnectionError, ClientPayloadError
from aiohttp import ClientConnectorError, ClientTimeout, ClientConnectorSSLError, ClientResponseError

from crawlo.utils.log import get_logger
from crawlo.stats_collector import StatsCollector

_retry_exceptions = [
    EndOfStream,
    ReadError,
    TimeoutError,
    ConnectError,
    ReadTimeout,
    ClientConnectorError,
    ClientResponseError,
    RemoteProtocolError,
    ClientTimeout,
    ClientConnectorSSLError,
    ClientPayloadError,
    ClientConnectionError
]


class RetryMiddleware(object):

    def __init__(
        self,
        *,
        retry_http_codes: List,
        ignore_http_codes: List,
        max_retry_times: int,
        retry_exceptions: List,
        stats: StatsCollector,
        retry_priority: int
    ):
        self.retry_http_codes = retry_http_codes
        self.ignore_http_codes = ignore_http_codes
        self.max_retry_times = max_retry_times
        self.retry_exceptions = tuple(retry_exceptions + _retry_exceptions)
        self.retry_priority = retry_priority
        self.stats = stats
        self.logger = get_logger(self.__class__.__name__)

    @classmethod
    def create_instance(cls, crawler):
        o = cls(
            retry_http_codes=crawler.settings.get_list('RETRY_HTTP_CODES'),
            ignore_http_codes=crawler.settings.get_list('IGNORE_HTTP_CODES'),
            max_retry_times=crawler.settings.get_int('MAX_RETRY_TIMES'),
            retry_exceptions=crawler.settings.get_list('RETRY_EXCEPTIONS'),
            stats=crawler.stats,
            retry_priority=crawler.settings.get_int('RETRY_PRIORITY')
        )
        return o

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status_code in self.ignore_http_codes:
            return response
        if response.status_code in self.retry_http_codes:
            # 重试逻辑 (retry logic)
            reason = f"response code {response.status_code}"
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exc, spider):
        if isinstance(exc, self.retry_exceptions) and not request.meta.get('dont_retry', False):
            return self._retry(request=request, reason=type(exc).__name__, spider=spider)

    def _retry(self, request, reason, spider):
        retry_times = request.meta.get('retry_times', 0)
        if retry_times < self.max_retry_times:
            retry_times += 1
            self.logger.info(f"{spider} {request} {reason} retrying {retry_times} time...")
            request.meta['retry_times'] = retry_times
-           request.dont_retry = True
+           # request.dont_retry = True
+           request.meta['dont_retry'] = True
            request.retry_priority = request.priority + self.retry_priority
            self.stats.inc_value("retry_count")
            return request
        else:
            self.logger.warning(f"{spider} {request} {reason} retry max {self.max_retry_times} times, give up.")
            return None
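The only functional change in this hunk is inside _retry: 1.0.1 set a dont_retry attribute on the request after scheduling a retry, but process_response and process_exception read the flag from request.meta, so it had no effect; 1.0.3 writes the flag into request.meta instead, which also means a request that has already been retried once is not retried again. The same meta key lets a caller opt a single request out of retries entirely; a sketch assuming the Request constructor accepts url and meta keyword arguments (not verified against this version):

from crawlo import Request

# 'dont_retry' in meta short-circuits RetryMiddleware.process_response and process_exception
req = Request(url='https://example.com/health-check', meta={'dont_retry': True})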
@@ -1,7 +1,7 @@ (removed and added lines are identical; content shown once)
#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
# @Time : 2025-02-05 14:07
# @Author : oscar
# @Desc : None
"""