crawlo 1.0.4-py3-none-any.whl → 1.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (95)
  1. crawlo/__init__.py +25 -9
  2. crawlo/__version__.py +1 -1
  3. crawlo/core/__init__.py +2 -2
  4. crawlo/core/engine.py +158 -158
  5. crawlo/core/processor.py +40 -40
  6. crawlo/core/scheduler.py +57 -57
  7. crawlo/crawler.py +424 -242
  8. crawlo/downloader/__init__.py +78 -78
  9. crawlo/downloader/aiohttp_downloader.py +200 -259
  10. crawlo/downloader/cffi_downloader.py +277 -0
  11. crawlo/downloader/httpx_downloader.py +246 -187
  12. crawlo/event.py +11 -11
  13. crawlo/exceptions.py +73 -64
  14. crawlo/extension/__init__.py +31 -31
  15. crawlo/extension/log_interval.py +49 -49
  16. crawlo/extension/log_stats.py +44 -44
  17. crawlo/extension/logging_extension.py +35 -0
  18. crawlo/filters/__init__.py +37 -37
  19. crawlo/filters/aioredis_filter.py +150 -150
  20. crawlo/filters/memory_filter.py +202 -202
  21. crawlo/items/__init__.py +62 -62
  22. crawlo/items/items.py +115 -119
  23. crawlo/middleware/__init__.py +21 -21
  24. crawlo/middleware/default_header.py +32 -32
  25. crawlo/middleware/download_delay.py +28 -28
  26. crawlo/middleware/middleware_manager.py +135 -140
  27. crawlo/middleware/proxy.py +246 -0
  28. crawlo/middleware/request_ignore.py +30 -30
  29. crawlo/middleware/response_code.py +18 -18
  30. crawlo/middleware/response_filter.py +26 -26
  31. crawlo/middleware/retry.py +90 -90
  32. crawlo/network/__init__.py +7 -7
  33. crawlo/network/request.py +203 -204
  34. crawlo/network/response.py +166 -166
  35. crawlo/pipelines/__init__.py +13 -13
  36. crawlo/pipelines/console_pipeline.py +39 -39
  37. crawlo/pipelines/mongo_pipeline.py +116 -116
  38. crawlo/pipelines/mysql_batch_pipline.py +273 -134
  39. crawlo/pipelines/mysql_pipeline.py +195 -195
  40. crawlo/pipelines/pipeline_manager.py +56 -56
  41. crawlo/settings/__init__.py +7 -7
  42. crawlo/settings/default_settings.py +169 -94
  43. crawlo/settings/setting_manager.py +99 -99
  44. crawlo/spider/__init__.py +41 -36
  45. crawlo/stats_collector.py +59 -59
  46. crawlo/subscriber.py +106 -106
  47. crawlo/task_manager.py +27 -27
  48. crawlo/templates/item_template.tmpl +21 -21
  49. crawlo/templates/project_template/main.py +32 -32
  50. crawlo/templates/project_template/setting.py +189 -189
  51. crawlo/templates/spider_template.tmpl +30 -30
  52. crawlo/utils/__init__.py +7 -7
  53. crawlo/utils/concurrency_manager.py +124 -124
  54. crawlo/utils/date_tools.py +233 -177
  55. crawlo/utils/db_helper.py +344 -0
  56. crawlo/utils/func_tools.py +82 -82
  57. crawlo/utils/log.py +129 -39
  58. crawlo/utils/pqueue.py +173 -173
  59. crawlo/utils/project.py +59 -59
  60. crawlo/utils/request.py +267 -122
  61. crawlo/utils/system.py +11 -11
  62. crawlo/utils/tools.py +5 -303
  63. crawlo/utils/url.py +39 -39
  64. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/METADATA +49 -48
  65. crawlo-1.0.5.dist-info/RECORD +84 -0
  66. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/top_level.txt +1 -0
  67. examples/__init__.py +0 -0
  68. examples/gxb/__init__.py +0 -0
  69. examples/gxb/items.py +36 -0
  70. examples/gxb/run.py +15 -0
  71. examples/gxb/settings.py +71 -0
  72. examples/gxb/spider/__init__.py +0 -0
  73. examples/gxb/spider/miit_spider.py +180 -0
  74. examples/gxb/spider/telecom_device_licenses.py +129 -0
  75. tests/__init__.py +7 -7
  76. tests/test_proxy_health_check.py +33 -0
  77. tests/test_proxy_middleware_integration.py +137 -0
  78. tests/test_proxy_providers.py +57 -0
  79. tests/test_proxy_stats.py +20 -0
  80. tests/test_proxy_strategies.py +60 -0
  81. crawlo/downloader/playwright_downloader.py +0 -161
  82. crawlo-1.0.4.dist-info/RECORD +0 -79
  83. tests/baidu_spider/__init__.py +0 -7
  84. tests/baidu_spider/demo.py +0 -94
  85. tests/baidu_spider/items.py +0 -25
  86. tests/baidu_spider/middleware.py +0 -49
  87. tests/baidu_spider/pipeline.py +0 -55
  88. tests/baidu_spider/request_fingerprints.txt +0 -9
  89. tests/baidu_spider/run.py +0 -27
  90. tests/baidu_spider/settings.py +0 -80
  91. tests/baidu_spider/spiders/__init__.py +0 -7
  92. tests/baidu_spider/spiders/bai_du.py +0 -61
  93. tests/baidu_spider/spiders/sina.py +0 -79
  94. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/WHEEL +0 -0
  95. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/entry_points.txt +0 -0
examples/gxb/settings.py ADDED
@@ -0,0 +1,71 @@
+ import platform
+
+ PROXY_ENABLED = False
+
+ # Proxy API address
+ PROXY_API_URL = 'http://123.56.42.142:5000/proxy/getitem/'
+
+ # Extraction method (choose based on the actual response structure)
+ PROXY_EXTRACTOR = "proxy"
+ # or
+ # from utils.proxy_extractors import custom_extractor_proxy
+ # PROXY_EXTRACTOR = custom_extractor_proxy
+
+ # Refresh interval
+ PROXY_REFRESH_INTERVAL = 5
+
+ CONCURRENCY = 3
+
+ # Timeout
+ PROXY_API_TIMEOUT = 10
+
+ if platform.system() == "Windows":
+     MYSQL_HOST = "pc-2ze9oh2diu5e5firh.rwlb.rds.aliyuncs.com"
+ else:
+     MYSQL_HOST = "tianmai-k8s-dmadmin-x.rwlb.rds.aliyuncs.com"
+
+ # Database port
+ MYSQL_PORT = 3306
+ # Database username
+ MYSQL_USER = "data_collection"
+ # Database password
+ MYSQL_PASSWORD = "CRNabzFQ2H"
+ # Database name
+ MYSQL_DB = "cxzx_xm"
+ # Table name
+ MYSQL_TABLE = "telecom_device_licenses_v4"
+
+ MYSQL_BATCH_SIZE = 100
+
+ PIPELINES = [
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # optional: store to MySQL
+ ]
+
+
+ HEADERS = {
+     "Accept": "application/json, text/plain, */*",
+     "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+     "Authorization": "null",
+     "Cache-Control": "no-cache",
+     "Connection": "keep-alive",
+     "Content-Type": "application/json;charset=UTF-8",
+     "Origin": "https://ythzxfw.miit.gov.cn",
+     "Pragma": "no-cache",
+     "Referer": "https://ythzxfw.miit.gov.cn/oldyth/resultQuery",
+     "Sec-Fetch-Dest": "empty",
+     "Sec-Fetch-Mode": "cors",
+     "Sec-Fetch-Site": "same-origin",
+     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
+     "sec-ch-ua": '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
+     "sec-ch-ua-mobile": "?0",
+     "sec-ch-ua-platform": '"Windows"'
+ }
+
+ COOKIES = {
+     "wzws_sessionid": "oGivsIOAMjQwZTozYjM6MzBiMjo3MWMwOjg0NmY6MzQ4OTozNWZjOjEyMTGBOGY2OTQ2gjdjYmMyNQ==",
+     "ariauseGraymode": "false",
+     "Hm_lvt_a73626d298a849004aacc34159f68abd": "1755909741,1756084244,1756256541,1756344453",
+     "Hm_lpvt_a73626d298a849004aacc34159f68abd": "1756344453",
+     "HMACCOUNT": "08DF0D235A291EAA"
+ }
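Note on PROXY_EXTRACTOR above: the example settings show two accepted forms, a key name that is looked up in the proxy API's response ("proxy") or an imported callable. A minimal sketch of the callable form is below; the function name mirrors the commented-out import, but its signature and the assumed response shape are illustrative guesses, not taken from crawlo's documentation.

    # Hypothetical extractor for the callable form of PROXY_EXTRACTOR.
    # Assumption: the proxy API returns JSON like {"proxy": "http://host:port"};
    # adjust the lookup to match what the API actually returns.
    def custom_extractor_proxy(api_response: dict) -> str:
        return api_response.get("proxy", "")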
examples/gxb/spider/__init__.py — file without changes
examples/gxb/spider/miit_spider.py ADDED
@@ -0,0 +1,180 @@
+ import json
+ import logging
+ import re
+
+ from crawlo import Request, Spider
+
+ from examples.gxb.items import RadioApprovalItem, TelecomLicenseItem
+
+ logger = logging.getLogger(__name__)
+
+ # Base configuration
+ BASE_URL = "https://ythzxfw.miit.gov.cn"
+ API_URL = BASE_URL + "/oldyth/user-center/tbAppSearch/selectResult"
+
+ # Task configuration
+ TASKS = {
+     "radio_approval": {
+         "name": "无线电设备型号核准",
+         "category_id": "352",
+         "item_class": RadioApprovalItem,
+         "table": "radio_equipment_approval_new",
+         "field_mapping": {
+             'articleField01': 'approval_number',
+             'articleField02': 'device_name',
+             'articleField03': 'device_model',
+             'articleField04': 'applicant',
+             'articleField05': 'remarks',
+             'articleField06': 'validity_period',
+             'articleField07': 'frequency_tolerance',
+             'articleField08': 'frequency_range',
+             'articleField09': 'transmit_power',
+             'articleField10': 'occupied_bandwidth',
+             'articleField11': 'spurious_emission_limit',
+             'articleField12': 'issue_date',
+             'articleField13': 'approval_code',
+             'articleField14': 'cmiit_id',
+             'articleField15': 'modulation_mode',
+             'articleField16': 'technology_system',
+         }
+     },
+     "telecom_license": {
+         "name": "电信设备进网许可证",
+         "category_id": "144",
+         "item_class": TelecomLicenseItem,
+         "table": "telecom_device_licenses_new",
+         "field_mapping": {
+             'articleField01': 'license_number',
+             'articleField02': 'device_name',
+             'articleField03': 'device_model',
+             'articleField04': 'applicant',
+             'articleField05': 'manufacturer',
+             'articleField06': 'issue_date',
+             'articleField07': 'expiry_date',
+             'articleField08': 'certificate_type',
+             'articleField09': 'remarks',
+             'articleField10': 'certificate_status',
+             'articleField11': 'origin',
+         }
+     }
+ }
+
+ def strip_html(text: str) -> str:
+     """Strip HTML tags."""
+     if not text or not isinstance(text, str):
+         return text
+     return re.sub(r'<[^>]+>', '', text)
+
+ class MiitSpider(Spider):
+     name = "miit_spider"
+     custom_settings = {
+         'DOWNLOAD_DELAY': 0.5,
+         'CONCURRENT_REQUESTS': 5,
+         'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
+         'COOKIES_ENABLED': True,
+         'RETRY_TIMES': 3,
+         'DEFAULT_REQUEST_HEADERS': {
+             "Accept": "application/json, text/plain, */*",
+             "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+             "Authorization": "null",
+             "Cache-Control": "no-cache",
+             "Connection": "keep-alive",
+             "Content-Type": "application/json;charset=UTF-8",
+             "Origin": BASE_URL,
+             "Pragma": "no-cache",
+             "Referer": f"{BASE_URL}/oldyth/resultQuery",
+             "Sec-Fetch-Dest": "empty",
+             "Sec-Fetch-Mode": "cors",
+             "Sec-Fetch-Site": "same-origin",
+             "sec-ch-ua": '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
+             "sec-ch-ua-mobile": "?0",
+             "sec-ch-ua-platform": '"Windows"'
+         },
+         'COOKIES_DEBUG': False,
+         'LOG_LEVEL': 'INFO',
+         'ITEM_PIPELINES': {
+             'kyqb_scrapy.pipelines.DedupAndMySQLPipeline': 300,
+         },
+         'DOWNLOADER_MIDDLEWARES': {
+             'kyqb_scrapy.middlewares.RandomUserAgentMiddleware': 400,
+         }
+     }
+
+     def __init__(self, task='telecom_license', start_page=1, end_page=100, *args, **kwargs):
+         super(MiitSpider, self).__init__(*args, **kwargs)
+
+         if task not in TASKS:
+             raise ValueError(f"不支持的任务: {task}")
+
+         self.task_config = TASKS[task]
+         self.category_id = self.task_config["category_id"]
+         self.item_class = self.task_config["item_class"]
+         self.table_name = self.task_config["table"]
+         self.field_mapping = self.task_config["field_mapping"]
+
+         self.start_page = int(start_page)
+         self.end_page = int(end_page)
+         self.page_size = 5
+
+         # Set the table name in custom_settings dynamically
+         self.custom_settings['MYSQL_TABLE'] = self.table_name
+
+         logger.info(f"🚀 启动任务: {self.task_config['name']},页码 {self.start_page} ~ {self.end_page}")
+
+     def start_requests(self):
+         for page in range(self.start_page, self.end_page + 1):
+             data = {
+                 "categoryId": self.category_id,
+                 "currentPage": page,
+                 "pageSize": self.page_size,
+                 "searchContent": ""
+             }
+             yield Request(
+                 url=API_URL,
+                 method='POST',
+                 body=json.dumps(data, separators=(',', ':')),
+                 headers={'Content-Type': 'application/json;charset=UTF-8'},
+                 callback=self.parse,
+                 meta={'page': page},
+                 dont_filter=True
+             )
+
+     def parse(self, response):
+         page = response.meta['page']
+
+         # Check the response
+         if response.status_code != 200:
+             self.logger.error(f"❌ 第 {page} 页请求失败: HTTP {response.status_code}")
+             return
+
+         try:
+             result = json.loads(response.text)
+         except json.JSONDecodeError:
+             text = response.text
+             if "升级浏览器" in text or "请尝试" in text:
+                 self.logger.error(f"⚠️ 检测到反爬: 请升级浏览器。响应片段: {text[:300]}")
+             else:
+                 self.logger.error(f"JSON解析失败: {text[:300]}")
+             return
+
+         if not result.get("success"):
+             msg = result.get("msg", "未知错误")
+             if "升级浏览器" in msg or "请尝试" in msg:
+                 self.logger.error(f"⚠️ 反爬提示: {msg}")
+             else:
+                 self.logger.error(f"接口失败: {msg}")
+             return
+
+         raw_records = result["params"]["tbAppArticle"]["list"]
+         self.logger.info(f"✅ 第 {page} 页获取 {len(raw_records)} 条数据")
+
+         for record in raw_records:
+             item = self.item_class()
+
+             for src_key, dst_key in self.field_mapping.items():
+                 value = record.get(src_key, '')
+                 if isinstance(value, str):
+                     value = strip_html(value)
+                 item[dst_key] = value
+
+             yield item
examples/gxb/spider/telecom_device_licenses.py ADDED
@@ -0,0 +1,129 @@
+ # -*- coding: utf-8 -*-
+ import json
+ from crawlo import Spider, Request
+ from crawlo.utils.log import get_logger
+
+ from examples.gxb.items import TelecomLicenseItem
+ from examples.gxb.settings import HEADERS, COOKIES
+
+
+ logger = get_logger(__name__)
+
+ class TelecomDeviceLicensesSpider(Spider):
+     name = 'telecom_device_licenses'
+     allowed_domains = ['ythzxfw.miit.gov.cn']
+     # Base URL of the API
+     base_api_url = 'https://ythzxfw.miit.gov.cn/oldyth/user-center/tbAppSearch/selectResult'
+
+     # Configuration: start and end page numbers
+     start_page = 1
+     end_page = 26405
+     data = {
+         "categoryId": "144",
+         "currentPage": 1,
+         "pageSize": 5,
+         "searchContent": ""
+     }
+
+
+     def start_requests(self):
+         """Start from page 1 and request the pages one by one."""
+
+         yield Request(
+             url=self.base_api_url,
+             method='POST',
+             headers=HEADERS,
+             cookies=COOKIES,
+             body=json.dumps(self.data),
+             callback=self.parse,
+             meta={'page': 1},
+             dont_filter=True
+         )
+
+
+     def parse(self, response):
+         """
+         Parse the API response
+         :param response: Response object
+         """
+         page = response.meta['page']
+         self.logger.info(f"正在解析第 {page} 页,状态码: {response.status_code}")
+
+         try:
+             json_data = response.json()
+
+             if not json_data.get('success'):
+                 self.logger.error(f"第 {page} 页请求失败: {json_data.get('msg', 'Unknown error')}")
+                 return
+
+             # Extract the total record count (optional, used for validation)
+             total_records = json_data.get("params", {}).get("tbAppArticle", {}).get("total", 0)
+             self.logger.info(f"第 {page} 页,总记录数: {total_records}")
+
+             article_list = json_data.get("params", {}).get("tbAppArticle", {}).get("list", [])
+
+             if not article_list:
+                 self.logger.warning(f"第 {page} 页未找到数据")
+                 return
+
+             self.logger.info(f"第 {page} 页成功解析到 {len(article_list)} 条记录")
+
+             # Yield each record as a separate item
+             for item in article_list:
+                 # Clean the data: strip HTML tags
+                 cleaned_item = self.clean_item(item)
+                 item = TelecomLicenseItem()
+                 item['license_number'] = cleaned_item.get('articleField01')
+                 item['device_name'] = cleaned_item.get('articleField02')
+                 item['device_model'] = cleaned_item.get('articleField03')
+                 item['applicant'] = cleaned_item.get('articleField04')
+                 item['manufacturer'] = cleaned_item.get('articleField05')
+                 item['issue_date'] = cleaned_item.get('articleField06')
+                 item['expiry_date'] = cleaned_item.get('articleField07')
+                 item['certificate_type'] = cleaned_item.get('articleField08')
+                 item['remarks'] = cleaned_item.get('articleField09')
+                 item['certificate_status'] = cleaned_item.get('articleField10')
+                 item['origin'] = cleaned_item.get('articleField11')
+                 item['article_id'] = cleaned_item.get('articleId')
+                 item['article_edit_date'] = cleaned_item.get('articleEdate')
+                 item['create_time'] = cleaned_item.get('createTime')
+                 yield item
+
+             # --- Automatic pagination ---
+             # Check whether there is a next page
+             # Method 1: compare the current page number with the preset last page
+             if page < self.end_page:
+                 next_page = page + 1
+                 self.data['currentPage'] = next_page
+                 self.logger.debug(f"准备爬取下一页: {next_page}")
+                 yield Request(
+                     url=self.base_api_url,
+                     method='POST',
+                     headers=HEADERS,
+                     cookies=COOKIES,
+                     body=json.dumps(self.data),
+                     callback=self.parse,
+                     meta={'page': next_page},
+                     dont_filter=True
+                 )
+
+         except Exception as e:
+             self.logger.error(f"解析第 {page} 页响应失败: {e}", exc_info=True)
+
+     @staticmethod
+     def clean_item(item: dict) -> dict:
+         """
+         Clean a single record: strip HTML tags, etc.
+         :param item: raw dict
+         :return: cleaned dict
+         """
+         import re
+         html_tag_re = re.compile(r'<[^>]+>')
+         cleaned = {}
+         for k, v in item.items():
+             if isinstance(v, str):
+                 # Strip HTML tags and trim leading/trailing whitespace
+                 cleaned[k] = html_tag_re.sub('', v).strip()
+             else:
+                 cleaned[k] = v
+         return cleaned
tests/__init__.py CHANGED
@@ -1,7 +1,7 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time : 2025-02-05 12:36
- # @Author : oscar
- # @Desc : None
- """
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-08-24 12:36
+ # @Author : crawl-coder
+ # @Desc : None
+ """
tests/test_proxy_health_check.py ADDED
@@ -0,0 +1,33 @@
+ # tests/test_proxy_health_check.py
+ import pytest
+ from unittest.mock import AsyncMock, patch
+ from crawlo.proxy.health_check import check_single_proxy
+ import httpx
+
+
+ @pytest.mark.asyncio
+ @patch('httpx.AsyncClient')
+ async def test_health_check_success(mock_client_class):
+     """Health check: success case."""
+     mock_resp = AsyncMock()
+     mock_resp.status_code = 200
+     mock_client_class.return_value.__aenter__.return_value.get.return_value = mock_resp
+
+     proxy_info = {'url': 'http://good:8080', 'healthy': False}
+     await check_single_proxy(proxy_info)
+
+     assert proxy_info['healthy'] is True
+     assert proxy_info['failures'] == 0
+
+
+ @pytest.mark.asyncio
+ @patch('httpx.AsyncClient')
+ async def test_health_check_failure(mock_client_class):
+     """Health check: failure case."""
+     mock_client_class.return_value.__aenter__.return_value.get.side_effect = httpx.ConnectError("Failed")
+
+     proxy_info = {'url': 'http://bad:8080', 'healthy': True, 'failures': 0}
+     await check_single_proxy(proxy_info)
+
+     assert proxy_info['healthy'] is False
+     assert proxy_info['failures'] == 1
tests/test_proxy_middleware_integration.py ADDED
@@ -0,0 +1,137 @@
+ # tests/test_proxy_middleware_integration.py
+ import pytest
+ import asyncio
+ import time
+ from unittest.mock import Mock, AsyncMock, patch
+ from crawlo import Request, Response, Spider
+ from crawlo.proxy.middleware import ProxyMiddleware
+ from crawlo.proxy.stats import ProxyStats
+
+
+ @pytest.fixture
+ def crawler():
+     class MockSettings:
+         def get(self, key, default=None):
+             defaults = {
+                 'PROXY_ENABLED': True,
+                 'PROXIES': ['http://p1:8080', 'http://p2:8080'],
+                 'PROXY_SELECTION_STRATEGY': 'random',
+                 'PROXY_REQUEST_DELAY_ENABLED': False,
+                 'PROXY_MAX_RETRY_COUNT': 1,
+             }
+             return defaults.get(key, default)
+
+         def get_bool(self, key, default=None):
+             return self.get(key, default)
+
+         def get_int(self, key, default=None):
+             return self.get(key, default)
+
+         def get_float(self, key, default=None):
+             return self.get(key, default)
+
+         def get_list(self, key, default=None):
+             return self.get(key, default)
+
+     class MockCrawler:
+         def __init__(self):
+             self.settings = MockSettings()
+
+     return MockCrawler()
+
+
+ @pytest.fixture
+ def middleware(crawler):
+     mw = ProxyMiddleware.create_instance(crawler)
+     mw._load_providers = Mock()
+     mw._update_proxy_pool = AsyncMock()
+     mw._health_check = AsyncMock()
+     mw.scheduler = None
+
+     mw.proxies = [
+         {
+             'url': 'http://p1:8080',
+             'healthy': True,
+             'failures': 0,
+             'last_health_check': 0,
+             'unhealthy_since': 0
+         },
+         {
+             'url': 'http://p2:8080',
+             'healthy': True,
+             'failures': 0,
+             'last_health_check': 0,
+             'unhealthy_since': 0
+         },
+     ]
+     mw.stats = ProxyStats()
+     for p in mw.proxies:
+         mw.stats.record(p['url'], 'total')
+
+     asyncio.get_event_loop().run_until_complete(mw._initial_setup())
+     return mw
+
+
+ @pytest.fixture
+ def spider():
+     return Mock(spec=Spider, logger=Mock())
+
+
+ def test_process_request_sets_proxy(middleware, spider):
+     request = Request("https://example.com")
+     result = asyncio.get_event_loop().run_until_complete(
+         middleware.process_request(request, spider)
+     )
+     assert result is None
+     assert hasattr(request, 'proxy')
+     assert request.proxy in ['http://p1:8080', 'http://p2:8080']
+
+
+ def test_process_response_records_success(middleware, spider):
+     request = Request("https://example.com")
+     request.proxy = 'http://p1:8080'
+     response = Response("https://example.com", body=b"ok", headers={})
+     middleware.stats.record(request.proxy, 'total')
+     middleware.process_response(request, response, spider)
+     assert middleware.stats.get(request.proxy)['success'] == 1
+
+
+ def test_process_exception_switches_proxy(middleware, spider):
+     request = Request("https://example.com")
+     request.proxy = 'http://p1:8080'
+     request.meta['proxy_retry_count'] = 0
+
+     result = middleware.process_exception(request, Exception("Timeout"), spider)
+     assert result is not None
+     assert result.proxy != 'http://p1:8080'
+     assert result.meta['proxy_retry_count'] == 1
+
+     final = middleware.process_exception(result, Exception("Timeout"), spider)
+     assert final is None
+
+
+ def test_mark_failure_disables_proxy(middleware):
+     proxy_url = 'http://p1:8080'
+     p = next(p for p in middleware.proxies if p['url'] == proxy_url)
+     p['failures'] = 2
+
+     middleware._mark_failure(proxy_url)
+     assert p['failures'] == 3
+     assert p['healthy'] is False
+     assert p['unhealthy_since'] > 0
+
+
+ @pytest.mark.asyncio
+ async def test_request_delay(middleware, spider):
+     """Request delay: verify that asyncio.sleep is called."""
+     with patch("crawlo.proxy.middleware.asyncio.sleep", new_callable=AsyncMock) as mock_sleep:
+         middleware.delay_enabled = True  # note: the attribute is delay_enabled, not request_delay_enabled
+         middleware.request_delay = 0.1
+         middleware._last_req_time = time.time() - 0.05  # 50 ms ago
+
+         request = Request("https://a.com")
+         await middleware.process_request(request, spider)
+
+         mock_sleep.assert_called_once()
+         delay = mock_sleep.call_args[0][0]
+         assert 0.04 <= delay <= 0.06
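Read together, the integration tests above indicate how the new ProxyMiddleware is driven: create_instance(crawler) builds it from settings, process_request assigns request.proxy from the healthy pool, process_response records a success, and process_exception can return a retry request with a different proxy. The sketch below restates that flow outside pytest; it is inferred from these tests only, and crawler and spider are stand-ins for real crawlo objects, not documented API.

    from crawlo import Request
    from crawlo.proxy.middleware import ProxyMiddleware

    async def fetch_through_proxy(crawler, spider):
        # Build the middleware the same way the test fixtures do.
        mw = ProxyMiddleware.create_instance(crawler)
        request = Request("https://example.com")
        # process_request picks a proxy, sets request.proxy, and returns None.
        await mw.process_request(request, spider)
        # After a successful download, process_response(request, response, spider)
        # records a success; on a network error, process_exception(request, exc, spider)
        # may return a retry request with a different proxy until
        # PROXY_MAX_RETRY_COUNT is exceeded.
        return request.proxy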
tests/test_proxy_providers.py ADDED
@@ -0,0 +1,57 @@
+ # tests/test_proxy_providers.py
+ import pytest
+ import pytest
+ import respx
+ from httpx import Response
+ from crawlo.proxy.providers import StaticProxyProvider, FileProxyProvider, APIProxyProvider
+ import tempfile
+ import os
+
+
+ @pytest.mark.asyncio
+ async def test_static_provider():
+     """Test the static proxy provider."""
+     provider = StaticProxyProvider(['http://1.1.1.1:8080', 'http://2.2.2.2:8080'])
+     proxies = await provider.fetch_proxies()
+     assert len(proxies) == 2
+     assert 'http://1.1.1.1:8080' in proxies
+     assert 'http://2.2.2.2:8080' in proxies
+
+
+ @pytest.mark.asyncio
+ async def test_file_provider():
+     """Test the file-based proxy provider."""
+     with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
+         f.write("http://a.com:8080\nhttp://b.com:8080\n")
+         temp_path = f.name
+     try:
+         provider = FileProxyProvider(temp_path)
+         proxies = await provider.fetch_proxies()
+         assert len(proxies) == 2
+         assert 'http://a.com:8080' in proxies
+         assert 'http://b.com:8080' in proxies
+     finally:
+         os.unlink(temp_path)
+
+
+ @pytest.mark.asyncio
+ @respx.mock
+ async def test_api_provider():
+     """Use respx to intercept the HTTP request; simpler and more reliable."""
+     # Intercept the GET request
+     respx.get("https://api.example.com").mock(
+         return_value=Response(
+             200,
+             json=[
+                 {"ip": "1.1.1.1", "port": 8080},
+                 {"ip": "2.2.2.2", "port": 8080}
+             ]
+         )
+     )
+
+     provider = APIProxyProvider(url="https://api.example.com")
+     proxies = await provider.fetch_proxies()
+
+     assert len(proxies) == 2
+     assert "http://1.1.1.1:8080" in proxies
+     assert "http://2.2.2.2:8080" in proxies
tests/test_proxy_stats.py ADDED
@@ -0,0 +1,20 @@
+ # tests/test_proxy_stats.py
+ from crawlo.proxy.stats import ProxyStats
+
+
+ def test_proxy_stats():
+     """Test proxy statistics bookkeeping."""
+     stats = ProxyStats()
+     url = 'http://proxy1:8080'
+
+     stats.record(url, 'success')
+     stats.record(url, 'success')
+     stats.record(url, 'failure')
+
+     assert stats.get(url)['success'] == 2
+     assert stats.get(url)['failure'] == 1
+     assert stats.get(url)['total'] == 3
+
+     all_data = stats.all()
+     assert url in all_data
+     assert all_data[url]['success'] == 2
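The provider and stats tests suggest a small standalone usage pattern as well: provider classes expose an async fetch_proxies(), and ProxyStats keeps per-proxy counters. A minimal sketch, again inferred only from the tests in this release and not from package documentation:

    import asyncio

    from crawlo.proxy.providers import StaticProxyProvider
    from crawlo.proxy.stats import ProxyStats

    async def main():
        provider = StaticProxyProvider(['http://1.1.1.1:8080', 'http://2.2.2.2:8080'])
        proxies = await provider.fetch_proxies()  # returns a list of proxy URLs
        stats = ProxyStats()
        for url in proxies:
            stats.record(url, 'total')  # counters used in the tests: total / success / failure
        print(stats.all())  # per-proxy counter dict keyed by proxy URL

    asyncio.run(main())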