crawlo 1.0.4-py3-none-any.whl → 1.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (112)
  1. crawlo/__init__.py +25 -9
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +41 -0
  4. crawlo/commands/__init__.py +10 -0
  5. crawlo/commands/genspider.py +111 -0
  6. crawlo/commands/run.py +149 -0
  7. crawlo/commands/startproject.py +101 -0
  8. crawlo/core/__init__.py +2 -2
  9. crawlo/core/engine.py +158 -158
  10. crawlo/core/processor.py +40 -40
  11. crawlo/core/scheduler.py +57 -57
  12. crawlo/crawler.py +219 -242
  13. crawlo/downloader/__init__.py +78 -78
  14. crawlo/downloader/aiohttp_downloader.py +200 -259
  15. crawlo/downloader/cffi_downloader.py +277 -0
  16. crawlo/downloader/httpx_downloader.py +246 -187
  17. crawlo/event.py +11 -11
  18. crawlo/exceptions.py +78 -64
  19. crawlo/extension/__init__.py +31 -31
  20. crawlo/extension/log_interval.py +49 -49
  21. crawlo/extension/log_stats.py +44 -44
  22. crawlo/extension/logging_extension.py +35 -0
  23. crawlo/filters/__init__.py +37 -37
  24. crawlo/filters/aioredis_filter.py +150 -150
  25. crawlo/filters/memory_filter.py +202 -202
  26. crawlo/items/__init__.py +22 -62
  27. crawlo/items/base.py +31 -0
  28. crawlo/items/fields.py +54 -0
  29. crawlo/items/items.py +105 -119
  30. crawlo/middleware/__init__.py +21 -21
  31. crawlo/middleware/default_header.py +32 -32
  32. crawlo/middleware/download_delay.py +28 -28
  33. crawlo/middleware/middleware_manager.py +135 -140
  34. crawlo/middleware/proxy.py +246 -0
  35. crawlo/middleware/request_ignore.py +30 -30
  36. crawlo/middleware/response_code.py +18 -18
  37. crawlo/middleware/response_filter.py +26 -26
  38. crawlo/middleware/retry.py +90 -90
  39. crawlo/network/__init__.py +7 -7
  40. crawlo/network/request.py +203 -204
  41. crawlo/network/response.py +166 -166
  42. crawlo/pipelines/__init__.py +13 -13
  43. crawlo/pipelines/console_pipeline.py +39 -39
  44. crawlo/pipelines/mongo_pipeline.py +116 -116
  45. crawlo/pipelines/mysql_batch_pipline.py +273 -134
  46. crawlo/pipelines/mysql_pipeline.py +195 -195
  47. crawlo/pipelines/pipeline_manager.py +56 -56
  48. crawlo/settings/__init__.py +7 -7
  49. crawlo/settings/default_settings.py +169 -94
  50. crawlo/settings/setting_manager.py +99 -99
  51. crawlo/spider/__init__.py +41 -36
  52. crawlo/stats_collector.py +59 -59
  53. crawlo/subscriber.py +106 -106
  54. crawlo/task_manager.py +27 -27
  55. crawlo/templates/crawlo.cfg.tmpl +11 -0
  56. crawlo/templates/project/__init__.py.tmpl +4 -0
  57. crawlo/templates/project/items.py.tmpl +18 -0
  58. crawlo/templates/project/middlewares.py.tmpl +76 -0
  59. crawlo/templates/project/pipelines.py.tmpl +64 -0
  60. crawlo/templates/project/settings.py.tmpl +54 -0
  61. crawlo/templates/project/spiders/__init__.py.tmpl +6 -0
  62. crawlo/templates/spider/spider.py.tmpl +32 -0
  63. crawlo/utils/__init__.py +7 -7
  64. crawlo/utils/concurrency_manager.py +124 -124
  65. crawlo/utils/date_tools.py +233 -177
  66. crawlo/utils/db_helper.py +344 -0
  67. crawlo/utils/func_tools.py +82 -82
  68. crawlo/utils/log.py +129 -39
  69. crawlo/utils/pqueue.py +173 -173
  70. crawlo/utils/project.py +199 -59
  71. crawlo/utils/request.py +267 -122
  72. crawlo/utils/spider_loader.py +63 -0
  73. crawlo/utils/system.py +11 -11
  74. crawlo/utils/tools.py +5 -303
  75. crawlo/utils/url.py +39 -39
  76. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/METADATA +49 -48
  77. crawlo-1.0.6.dist-info/RECORD +94 -0
  78. crawlo-1.0.6.dist-info/entry_points.txt +2 -0
  79. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/top_level.txt +1 -0
  80. examples/gxb/items.py +36 -0
  81. examples/gxb/run.py +16 -0
  82. examples/gxb/settings.py +72 -0
  83. examples/gxb/spider/__init__.py +0 -0
  84. examples/gxb/spider/miit_spider.py +180 -0
  85. examples/gxb/spider/telecom_device.py +129 -0
  86. tests/__init__.py +7 -7
  87. tests/test_proxy_health_check.py +33 -0
  88. tests/test_proxy_middleware_integration.py +137 -0
  89. tests/test_proxy_providers.py +57 -0
  90. tests/test_proxy_stats.py +20 -0
  91. tests/test_proxy_strategies.py +60 -0
  92. crawlo/downloader/playwright_downloader.py +0 -161
  93. crawlo/templates/item_template.tmpl +0 -22
  94. crawlo/templates/project_template/main.py +0 -33
  95. crawlo/templates/project_template/setting.py +0 -190
  96. crawlo/templates/spider_template.tmpl +0 -31
  97. crawlo-1.0.4.dist-info/RECORD +0 -79
  98. crawlo-1.0.4.dist-info/entry_points.txt +0 -2
  99. tests/baidu_spider/__init__.py +0 -7
  100. tests/baidu_spider/demo.py +0 -94
  101. tests/baidu_spider/items.py +0 -25
  102. tests/baidu_spider/middleware.py +0 -49
  103. tests/baidu_spider/pipeline.py +0 -55
  104. tests/baidu_spider/request_fingerprints.txt +0 -9
  105. tests/baidu_spider/run.py +0 -27
  106. tests/baidu_spider/settings.py +0 -80
  107. tests/baidu_spider/spiders/__init__.py +0 -7
  108. tests/baidu_spider/spiders/bai_du.py +0 -61
  109. tests/baidu_spider/spiders/sina.py +0 -79
  110. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/WHEEL +0 -0
  111. {crawlo/templates/project_template/items → examples}/__init__.py +0 -0
  112. {crawlo/templates/project_template/spiders → examples/gxb}/__init__.py +0 -0
crawlo/middleware/middleware_manager.py
@@ -1,140 +1,135 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- from pprint import pformat
- from types import MethodType
- from asyncio import create_task
- from collections import defaultdict
- from typing import List, Dict, Callable, Optional
-
- from crawlo import Request, Response
- from crawlo.utils.log import get_logger
- from crawlo.utils.project import load_class
- from crawlo.middleware import BaseMiddleware
- from crawlo.utils.project import common_call
- from crawlo.event import ignore_request, response_received
- from crawlo.exceptions import MiddlewareInitError, InvalidOutputError, RequestMethodError, IgnoreRequestError, \
-     NotConfiguredError
-
-
- class MiddlewareManager:
-
-     def __init__(self, crawler):
-         self.crawler = crawler
-         self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
-         self.middlewares: List = []
-         self.methods: Dict[str, List[MethodType]] = defaultdict(list)
-         middlewares = self.crawler.settings.get_list('MIDDLEWARES')
-         self._add_middleware(middlewares)
-         self._add_method()
-
-         self.download_method: Callable = crawler.engine.downloader.download
-         self._stats = crawler.stats
-
-     async def _process_request(self, request: Request):
-         for method in self.methods['process_request']:
-             result = await common_call(method, request, self.crawler.spider)
-             if result is None:
-                 continue
-             if isinstance(result, (Request, Response)):
-                 return result
-             raise InvalidOutputError(
-                 f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(result).__name__}"
-             )
-         return await self.download_method(request)
-
-     async def _process_response(self, request: Request, response: Response):
-         for method in reversed(self.methods['process_response']):
-             try:
-                 response = await common_call(method, request, response, self.crawler.spider)
-             except IgnoreRequestError as exp:
-                 create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
-                 # self.logger.info(f'{request} ignored.')
-                 # self._stats.inc_value('request_ignore_count')
-                 # reason = exp.msg
-                 # if reason:
-                 # self._stats.inc_value(f'request_ignore_count/{reason}')
-             if isinstance(response, Request):
-                 return response
-             if isinstance(response, Response):
-                 continue
-             raise InvalidOutputError(
-                 f"{method.__self__.__class__.__name__}. must return Request or Response, got {type(response).__name__}"
-             )
-         return response
-
-     async def _process_exception(self, request: Request, exp: Exception):
-         for method in self.methods['process_exception']:
-             response = await common_call(method, request, exp, self.crawler.spider)
-             if response is None:
-                 continue
-             if isinstance(response, (Request, Response)):
-                 return response
-             if response:
-                 break
-             raise InvalidOutputError(
-                 f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(response).__name__}"
-             )
-         else:
-             raise exp
-
-     async def download(self, request) -> Optional[Response]:
-         """ called in the download method. """
-         try:
-             response = await self._process_request(request)
-         except KeyError:
-             raise RequestMethodError(f"{request.method.lower()} is not supported")
-         except IgnoreRequestError as exp:
-             create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
-             response = await self._process_exception(request, exp)
-         except Exception as exp:
-             self._stats.inc_value(f'download_error/{exp.__class__.__name__}')
-             response = await self._process_exception(request, exp)
-         else:
-             create_task(self.crawler.subscriber.notify(response_received, response, self.crawler.spider))
-             # self.crawler.stats.inc_value('response_received_count')
-         if isinstance(response, Response):
-             response = await self._process_response(request, response)
-         if isinstance(response, Request):
-             await self.crawler.engine.enqueue_request(request)
-             return None
-         return response
-
-     @classmethod
-     def create_instance(cls, *args, **kwargs):
-         return cls(*args, **kwargs)
-
-     def _add_middleware(self, middlewares):
-         enabled_middlewares = [m for m in middlewares if self._validate_middleware(m)]
-         if enabled_middlewares:
-             self.logger.info(f'enabled middleware:\n {pformat(enabled_middlewares)}')
-
-     def _validate_middleware(self, middleware):
-         middleware_cls = load_class(middleware)
-         if not hasattr(middleware_cls, 'create_instance'):
-             raise MiddlewareInitError(
-                 f"Middleware init failed, must inherit from `BaseMiddleware` or have a `create_instance` method"
-             )
-         try:
-             instance = middleware_cls.create_instance(self.crawler)
-             self.middlewares.append(instance)
-             return True
-         except NotConfiguredError:
-             return False
-
-     def _add_method(self):
-         for middleware in self.middlewares:
-             if hasattr(middleware, 'process_request'):
-                 if self._validate_middleware_method(method_name='process_request', middleware=middleware):
-                     self.methods['process_request'].append(middleware.process_request)
-             if hasattr(middleware, 'process_response'):
-                 if self._validate_middleware_method(method_name='process_response', middleware=middleware):
-                     self.methods['process_response'].append(middleware.process_response)
-             if hasattr(middleware, 'process_exception'):
-                 if self._validate_middleware_method(method_name='process_exception', middleware=middleware):
-                     self.methods['process_exception'].append(middleware.process_exception)
-
-     @staticmethod
-     def _validate_middleware_method(method_name, middleware) -> bool:
-         method = getattr(type(middleware), method_name)
-         base_method = getattr(BaseMiddleware, method_name)
-         return False if method == base_method else True
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from pprint import pformat
+ from types import MethodType
+ from asyncio import create_task
+ from collections import defaultdict
+ from typing import List, Dict, Callable, Optional
+
+ from crawlo import Request, Response
+ from crawlo.utils.log import get_logger
+ from crawlo.utils.project import load_class
+ from crawlo.middleware import BaseMiddleware
+ from crawlo.utils.project import common_call
+ from crawlo.event import ignore_request, response_received
+ from crawlo.exceptions import MiddlewareInitError, InvalidOutputError, RequestMethodError, IgnoreRequestError, \
+     NotConfiguredError
+
+
+ class MiddlewareManager:
+
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+         self.middlewares: List = []
+         self.methods: Dict[str, List[MethodType]] = defaultdict(list)
+         middlewares = self.crawler.settings.get_list('MIDDLEWARES')
+         self._add_middleware(middlewares)
+         self._add_method()
+
+         self.download_method: Callable = crawler.engine.downloader.download
+         self._stats = crawler.stats
+
+     async def _process_request(self, request: Request):
+         for method in self.methods['process_request']:
+             result = await common_call(method, request, self.crawler.spider)
+             if result is None:
+                 continue
+             if isinstance(result, (Request, Response)):
+                 return result
+             raise InvalidOutputError(
+                 f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(result).__name__}"
+             )
+         return await self.download_method(request)
+
+     async def _process_response(self, request: Request, response: Response):
+         for method in reversed(self.methods['process_response']):
+             try:
+                 response = await common_call(method, request, response, self.crawler.spider)
+             except IgnoreRequestError as exp:
+                 create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
+             if isinstance(response, Request):
+                 return response
+             if isinstance(response, Response):
+                 continue
+             raise InvalidOutputError(
+                 f"{method.__self__.__class__.__name__}. must return Request or Response, got {type(response).__name__}"
+             )
+         return response
+
+     async def _process_exception(self, request: Request, exp: Exception):
+         for method in self.methods['process_exception']:
+             response = await common_call(method, request, exp, self.crawler.spider)
+             if response is None:
+                 continue
+             if isinstance(response, (Request, Response)):
+                 return response
+             if response:
+                 break
+             raise InvalidOutputError(
+                 f"{method.__self__.__class__.__name__}. must return None or Request or Response, got {type(response).__name__}"
+             )
+         else:
+             raise exp
+
+     async def download(self, request) -> Optional[Response]:
+         """ called in the download method. """
+         try:
+             response = await self._process_request(request)
+         except KeyError:
+             raise RequestMethodError(f"{request.method.lower()} is not supported")
+         except IgnoreRequestError as exp:
+             create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
+             response = await self._process_exception(request, exp)
+         except Exception as exp:
+             self._stats.inc_value(f'download_error/{exp.__class__.__name__}')
+             response = await self._process_exception(request, exp)
+         else:
+             create_task(self.crawler.subscriber.notify(response_received, response, self.crawler.spider))
+             # self.crawler.stats.inc_value('response_received_count')
+         if isinstance(response, Response):
+             response = await self._process_response(request, response)
+         if isinstance(response, Request):
+             await self.crawler.engine.enqueue_request(request)
+             return None
+         return response
+
+     @classmethod
+     def create_instance(cls, *args, **kwargs):
+         return cls(*args, **kwargs)
+
+     def _add_middleware(self, middlewares):
+         enabled_middlewares = [m for m in middlewares if self._validate_middleware(m)]
+         if enabled_middlewares:
+             self.logger.info(f'enabled middleware:\n {pformat(enabled_middlewares)}')
+
+     def _validate_middleware(self, middleware):
+         middleware_cls = load_class(middleware)
+         if not hasattr(middleware_cls, 'create_instance'):
+             raise MiddlewareInitError(
+                 f"Middleware init failed, must inherit from `BaseMiddleware` or have a `create_instance` method"
+             )
+         try:
+             instance = middleware_cls.create_instance(self.crawler)
+             self.middlewares.append(instance)
+             return True
+         except NotConfiguredError:
+             return False
+
+     def _add_method(self):
+         for middleware in self.middlewares:
+             if hasattr(middleware, 'process_request'):
+                 if self._validate_middleware_method(method_name='process_request', middleware=middleware):
+                     self.methods['process_request'].append(middleware.process_request)
+             if hasattr(middleware, 'process_response'):
+                 if self._validate_middleware_method(method_name='process_response', middleware=middleware):
+                     self.methods['process_response'].append(middleware.process_response)
+             if hasattr(middleware, 'process_exception'):
+                 if self._validate_middleware_method(method_name='process_exception', middleware=middleware):
+                     self.methods['process_exception'].append(middleware.process_exception)
+
+     @staticmethod
+     def _validate_middleware_method(method_name, middleware) -> bool:
+         method = getattr(type(middleware), method_name)
+         base_method = getattr(BaseMiddleware, method_name)
+         return False if method == base_method else True
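
Note (not part of the diff): the manager above loads every dotted path in the MIDDLEWARES setting via load_class(), instantiates it with create_instance(crawler), silently drops it when the constructor raises NotConfiguredError, and only registers hook methods that actually override BaseMiddleware. A minimal sketch of a middleware that fits this contract is shown below; the class name, the CUSTOM_API_TOKEN setting, and the request.headers attribute are illustrative assumptions, not crawlo API guarantees.

# Hypothetical middleware the MiddlewareManager above could load.
from crawlo.middleware import BaseMiddleware
from crawlo.exceptions import NotConfiguredError


class CustomHeaderMiddleware(BaseMiddleware):
    def __init__(self, token):
        self.token = token

    @classmethod
    def create_instance(cls, crawler):
        token = crawler.settings.get('CUSTOM_API_TOKEN')  # assumed setting name
        if not token:
            # NotConfiguredError tells the manager to skip this middleware, not fail.
            raise NotConfiguredError('CUSTOM_API_TOKEN is not set')
        return cls(token)

    def process_request(self, request, spider):
        # Returning None lets the remaining middlewares and the downloader run;
        # request.headers is assumed to be a mutable mapping on crawlo's Request.
        request.headers['Authorization'] = f'Bearer {self.token}'
        return None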
crawlo/middleware/proxy.py
@@ -0,0 +1,246 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ import asyncio
+ import socket
+ from typing import Optional, Dict, Any, Callable, Union
+ from urllib.parse import urlparse
+
+ from crawlo import Request, Response
+ from crawlo.exceptions import NotConfiguredError
+ from crawlo.utils.log import get_logger
+
+ try:
+     import httpx
+
+     HTTPX_EXCEPTIONS = (httpx.NetworkError, httpx.TimeoutException, httpx.ReadError, httpx.ConnectError)
+ except ImportError:
+     HTTPX_EXCEPTIONS = ()
+     httpx = None
+
+ try:
+     import aiohttp
+
+     AIOHTTP_EXCEPTIONS = (
+         aiohttp.ClientError, aiohttp.ClientConnectorError, aiohttp.ClientResponseError, aiohttp.ServerTimeoutError,
+         aiohttp.ServerDisconnectedError)
+ except ImportError:
+     AIOHTTP_EXCEPTIONS = ()
+     aiohttp = None
+
+ try:
+     from curl_cffi import requests as cffi_requests
+
+     CURL_CFFI_EXCEPTIONS = (cffi_requests.RequestsError,)
+ except (ImportError, AttributeError):
+     CURL_CFFI_EXCEPTIONS = ()
+     cffi_requests = None
+
+ NETWORK_EXCEPTIONS = (
+     asyncio.TimeoutError,
+     socket.gaierror,
+     ConnectionError,
+     TimeoutError,
+ ) + HTTPX_EXCEPTIONS + AIOHTTP_EXCEPTIONS + CURL_CFFI_EXCEPTIONS
+
+ ProxyExtractor = Callable[[Dict[str, Any]], Union[None, str, Dict[str, str]]]
+
+
+ class ProxyMiddleware:
+     def __init__(self, settings, log_level):
+         self.logger = get_logger(self.__class__.__name__, log_level)
+
+         self._session: Optional[aiohttp.ClientSession] = None
+         self._current_proxy: Optional[Union[str, Dict[str, str]]] = None
+         self._last_fetch_time: float = 0
+
+         self.proxy_extractor = settings.get("PROXY_EXTRACTOR", "proxy")
+         self.refresh_interval = settings.get_float("PROXY_REFRESH_INTERVAL", 60)
+         self.timeout = settings.get_float("PROXY_API_TIMEOUT", 10)
+
+         self.enabled = settings.get_bool("PROXY_ENABLED", True)
+
+         if not self.enabled:
+             self.logger.info("ProxyMiddleware is disabled (PROXY_ENABLED=False)")
+             return
+
+         self.api_url = settings.get("PROXY_API_URL")
+         if not self.api_url:
+             raise NotConfiguredError("PROXY_API_URL is not configured; ProxyMiddleware is disabled")
+
+         self.logger.info(f"Proxy middleware enabled | API: {self.api_url} | refresh interval: {self.refresh_interval}s")
+
+     @classmethod
+     def create_instance(cls, crawler):
+         return cls(settings=crawler.settings, log_level=crawler.settings.get("LOG_LEVEL"))
+
+     def _compile_extractor(self) -> ProxyExtractor:
+         if callable(self.proxy_extractor):
+             return self.proxy_extractor
+
+         if isinstance(self.proxy_extractor, str):
+             keys = self.proxy_extractor.split(".")
+
+             def extract(data: Dict[str, Any]) -> Union[None, str, Dict[str, str]]:
+                 for k in keys:
+                     if isinstance(data, dict):
+                         data = data.get(k)
+                     else:
+                         return None
+                     if data is None:
+                         break
+                 return data
+
+             return extract
+
+         raise ValueError(f"PROXY_EXTRACTOR must be a str or callable, got: {type(self.proxy_extractor)}")
+
+     async def _close_session(self):
+         if self._session:
+             try:
+                 await self._session.close()
+                 self.logger.debug("Closed aiohttp session.")
+             except Exception as e:
+                 self.logger.warning(f"Error while closing aiohttp session: {e}")
+             finally:
+                 self._session = None
+
+     async def _get_session(self) -> aiohttp.ClientSession:
+         if self._session is None or self._session.closed:
+             if self._session and self._session.closed:
+                 self.logger.debug("Existing session is closed, creating a new one...")
+             timeout = aiohttp.ClientTimeout(total=self.timeout)
+             self._session = aiohttp.ClientSession(timeout=timeout)
+             self.logger.debug("Created a new aiohttp session.")
+         return self._session
+
+     async def _fetch_raw_data(self) -> Optional[Dict[str, Any]]:
+         max_retries = 2
+         retry_count = 0
+
+         while retry_count <= max_retries:
+             session = await self._get_session()
+             try:
+                 async with session.get(self.api_url) as resp:
+                     content_type = resp.content_type.lower()
+                     if 'application/json' not in content_type:
+                         self.logger.warning(f"Proxy API returned a non-JSON content type: {content_type} (URL: {self.api_url})")
+                         try:
+                             text = await resp.text()
+                             return {"__raw_text__": text.strip(), "__content_type__": content_type}
+                         except Exception as e:
+                             self.logger.error(f"Failed to read non-JSON response body: {repr(e)}")
+                             return None
+
+                     if resp.status != 200:
+                         try:
+                             error_text = await resp.text()
+                         except:
+                             error_text = "<unable to read response body>"
+                         self.logger.error(f"Unexpected proxy API status code: {resp.status}, body: {error_text}")
+                         if 400 <= resp.status < 500:
+                             return None
+                         return None
+
+                     return await resp.json()
+
+             except NETWORK_EXCEPTIONS as e:
+                 retry_count += 1
+                 self.logger.warning(f"Proxy API request failed (attempt {retry_count}/{max_retries + 1}): {repr(e)}")
+                 if retry_count <= max_retries:
+                     self.logger.info("Closing and rebuilding the session before retrying...")
+                     await self._close_session()
+                 else:
+                     self.logger.error(f"Proxy API request failed, maximum retries reached ({max_retries + 1}): {repr(e)}")
+                     return None
+
+             except aiohttp.ContentTypeError as e:
+                 self.logger.error(f"Proxy API response content-type error: {repr(e)}")
+                 return None
+
+             except Exception as e:
+                 self.logger.critical(f"Unexpected error while requesting the proxy API: {repr(e)}", exc_info=True)
+                 return None
+
+         return None
+
+     async def _extract_proxy(self, data: Dict[str, Any]) -> Optional[Union[str, Dict[str, str]]]:
+         extractor = self._compile_extractor()
+         try:
+             result = extractor(data)
+             if isinstance(result, str) and result.strip():
+                 return result.strip()
+             elif isinstance(result, dict):
+                 cleaned = {k: v.strip() for k, v in result.items() if v and isinstance(v, str)}
+                 return cleaned if cleaned else None
+             return None
+         except Exception as e:
+             self.logger.error(f"Error while running PROXY_EXTRACTOR: {repr(e)}")
+             return None
+
+     async def _get_proxy_from_api(self) -> Optional[Union[str, Dict[str, str]]]:
+         raw_data = await self._fetch_raw_data()
+         if not raw_data:
+             return None
+
+         if "__raw_text__" in raw_data:
+             text = raw_data["__raw_text__"]
+             if text.startswith("http://") or text.startswith("https://"):
+                 return text
+
+         return await self._extract_proxy(raw_data)
+
+     async def _get_cached_proxy(self) -> Optional[str]:
+         if not self.enabled:
+             self.logger.debug("ProxyMiddleware is disabled, skipping proxy fetch.")
+             return None
+
+         now = asyncio.get_event_loop().time()
+         if self._current_proxy and (now - self._last_fetch_time) < self.refresh_interval:
+             pass
+         else:
+             proxy = await self._get_proxy_from_api()
+             if proxy:
+                 self._current_proxy = proxy
+                 self._last_fetch_time = now
+                 self.logger.debug(f"Updated proxy cache: {proxy}")
+             else:
+                 self.logger.warning("Could not fetch a new proxy; requests will go direct.")
+
+         return self._current_proxy
+
+     @staticmethod
+     def _is_https(request: Request) -> bool:
+         return urlparse(request.url).scheme == "https"
+
+     async def process_request(self, request: Request, spider) -> Optional[Request]:
+         if not self.enabled:
+             self.logger.debug(f"ProxyMiddleware is disabled, request will go direct: {request.url}")
+             return None
+
+         if request.proxy:
+             return None
+
+         proxy = await self._get_cached_proxy()
+         if proxy:
+             request.proxy = proxy
+             self.logger.debug(f"Assigned proxy → {proxy} | {request.url}")
+         else:
+             self.logger.warning(f"No proxy obtained, request going direct: {request.url}")
+
+         return None
+
+     def process_response(self, request: Request, response: Response, spider) -> Response:
+         proxy = request.proxy
+         if proxy:
+             status_code = getattr(response, 'status_code', 'N/A')
+             self.logger.debug(f"Proxy succeeded: {proxy} | {request.url} | Status: {status_code}")
+         return response
+
+     def process_exception(self, request: Request, exception: Exception, spider) -> Optional[Request]:
+         proxy = request.proxy
+         if proxy:
+             self.logger.warning(f"Proxy request failed: {proxy} | {request.url} | {repr(exception)}")
+         return None
+
+     async def close(self):
+         await self._close_session()
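
Note (not part of the diff): the new ProxyMiddleware is driven entirely by settings (PROXY_ENABLED, PROXY_API_URL, PROXY_EXTRACTOR, PROXY_REFRESH_INTERVAL, PROXY_API_TIMEOUT). A rough sketch of a project settings module that enables it follows; the API URL, extractor path, and the exact MIDDLEWARES entry are placeholders, not defaults shipped with crawlo.

# settings.py (sketch): enabling the proxy middleware added in 1.0.6.
MIDDLEWARES = [
    'crawlo.middleware.proxy.ProxyMiddleware',  # dotted path assumed from the file/class layout above
]

PROXY_ENABLED = True                                   # set False to bypass the middleware entirely
PROXY_API_URL = 'https://proxy-pool.example.com/get'   # placeholder endpoint returning JSON or a bare proxy URL
PROXY_EXTRACTOR = 'data.proxy'                         # dotted path into the JSON body, or a callable
PROXY_REFRESH_INTERVAL = 60                            # seconds the fetched proxy stays cached
PROXY_API_TIMEOUT = 10                                 # aiohttp timeout for the proxy API call

If PROXY_API_URL is missing, the constructor raises NotConfiguredError, which the MiddlewareManager treats as "skip this middleware" rather than a fatal error.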
crawlo/middleware/request_ignore.py
@@ -1,30 +1,30 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- from crawlo.utils.log import get_logger
- from crawlo.exceptions import IgnoreRequestError
- from crawlo.event import ignore_request
-
-
- class RequestIgnoreMiddleware(object):
-
-     def __init__(self, stats, log_level):
-         self.logger = get_logger(self.__class__.__name__, log_level)
-         self.stats = stats
-
-     @classmethod
-     def create_instance(cls, crawler):
-         o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
-         crawler.subscriber.subscribe(o.request_ignore, event=ignore_request)
-         return o
-
-     async def request_ignore(self, exc, request, _spider):
-         self.logger.info(f'{request} ignored.')
-         self.stats.inc_value('request_ignore_count')
-         reason = exc.msg
-         if reason:
-             self.stats.inc_value(f'request_ignore_count/{reason}')
-
-     @staticmethod
-     def process_exception(_request, exc, _spider):
-         if isinstance(exc, IgnoreRequestError):
-             return True
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from crawlo.utils.log import get_logger
+ from crawlo.exceptions import IgnoreRequestError
+ from crawlo.event import ignore_request
+
+
+ class RequestIgnoreMiddleware(object):
+
+     def __init__(self, stats, log_level):
+         self.logger = get_logger(self.__class__.__name__, log_level)
+         self.stats = stats
+
+     @classmethod
+     def create_instance(cls, crawler):
+         o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
+         crawler.subscriber.subscribe(o.request_ignore, event=ignore_request)
+         return o
+
+     async def request_ignore(self, exc, request, _spider):
+         self.logger.info(f'{request} ignored.')
+         self.stats.inc_value('request_ignore_count')
+         reason = exc.msg
+         if reason:
+             self.stats.inc_value(f'request_ignore_count/{reason}')
+
+     @staticmethod
+     def process_exception(_request, exc, _spider):
+         if isinstance(exc, IgnoreRequestError):
+             return True
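
Note (not part of the diff): RequestIgnoreMiddleware only consumes the ignore_request event; something else must raise IgnoreRequestError for it to have anything to count. A hedged sketch of such a producer is below; the class name, the extension check, and the assumption that IgnoreRequestError takes its reason message as the first argument (ending up in exc.msg) are illustrative.

# Hypothetical middleware that triggers the ignore path handled above.
from crawlo.exceptions import IgnoreRequestError


class SkipMediaMiddleware:
    @classmethod
    def create_instance(cls, crawler):
        return cls()

    def process_request(self, request, spider):
        # Raising IgnoreRequestError makes MiddlewareManager.download() publish the
        # ignore_request event; RequestIgnoreMiddleware then logs the request and
        # increments request_ignore_count (and request_ignore_count/<reason>).
        if request.url.endswith(('.png', '.jpg', '.mp4')):
            raise IgnoreRequestError('media request')  # assumed to populate exc.msg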
crawlo/middleware/response_code.py
@@ -1,19 +1,19 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- from crawlo.utils.log import get_logger
-
-
- class ResponseCodeMiddleware(object):
-     def __init__(self, stats, log_level):
-         self.logger = get_logger(self.__class__.__name__, log_level)
-         self.stats = stats
-
-     @classmethod
-     def create_instance(cls, crawler):
-         o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
-         return o
-
-     def process_response(self, request, response, spider):
-         self.stats.inc_value(f'stats_code/count/{response.status_code}')
-         self.logger.debug(f'Got response from <{response.status_code} {response.url}>')
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from crawlo.utils.log import get_logger
+
+
+ class ResponseCodeMiddleware(object):
+     def __init__(self, stats, log_level):
+         self.logger = get_logger(self.__class__.__name__, log_level)
+         self.stats = stats
+
+     @classmethod
+     def create_instance(cls, crawler):
+         o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
+         return o
+
+     def process_response(self, request, response, spider):
+         self.stats.inc_value(f'stats_code/count/{response.status_code}')
+         self.logger.debug(f'Got response from <{response.status_code} {response.url}>')
          return response