crawlo 1.4.4__py3-none-any.whl → 1.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (120)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/startproject.py +24 -0
  4. crawlo/core/engine.py +2 -2
  5. crawlo/core/scheduler.py +4 -4
  6. crawlo/crawler.py +8 -7
  7. crawlo/downloader/__init__.py +5 -2
  8. crawlo/downloader/cffi_downloader.py +3 -1
  9. crawlo/extension/__init__.py +2 -2
  10. crawlo/filters/aioredis_filter.py +8 -1
  11. crawlo/filters/memory_filter.py +8 -1
  12. crawlo/initialization/built_in.py +13 -4
  13. crawlo/initialization/core.py +5 -4
  14. crawlo/interfaces.py +24 -0
  15. crawlo/middleware/__init__.py +7 -4
  16. crawlo/middleware/middleware_manager.py +15 -8
  17. crawlo/middleware/proxy.py +171 -348
  18. crawlo/mode_manager.py +45 -11
  19. crawlo/network/response.py +374 -69
  20. crawlo/pipelines/mysql_pipeline.py +340 -189
  21. crawlo/pipelines/pipeline_manager.py +2 -2
  22. crawlo/project.py +2 -4
  23. crawlo/settings/default_settings.py +42 -30
  24. crawlo/stats_collector.py +10 -1
  25. crawlo/task_manager.py +2 -2
  26. crawlo/templates/project/items.py.tmpl +2 -2
  27. crawlo/templates/project/middlewares.py.tmpl +9 -89
  28. crawlo/templates/project/pipelines.py.tmpl +8 -68
  29. crawlo/templates/project/settings.py.tmpl +10 -55
  30. crawlo/templates/project/settings_distributed.py.tmpl +20 -22
  31. crawlo/templates/project/settings_gentle.py.tmpl +5 -0
  32. crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
  33. crawlo/templates/project/settings_minimal.py.tmpl +25 -1
  34. crawlo/templates/project/settings_simple.py.tmpl +5 -0
  35. crawlo/templates/run.py.tmpl +1 -8
  36. crawlo/templates/spider/spider.py.tmpl +5 -108
  37. crawlo/tools/__init__.py +0 -11
  38. crawlo/utils/__init__.py +17 -1
  39. crawlo/utils/db_helper.py +226 -319
  40. crawlo/utils/error_handler.py +313 -67
  41. crawlo/utils/fingerprint.py +3 -4
  42. crawlo/utils/misc.py +82 -0
  43. crawlo/utils/request.py +55 -66
  44. crawlo/utils/selector_helper.py +138 -0
  45. crawlo/utils/spider_loader.py +185 -45
  46. crawlo/utils/text_helper.py +95 -0
  47. crawlo-1.4.6.dist-info/METADATA +329 -0
  48. {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/RECORD +110 -69
  49. tests/authenticated_proxy_example.py +10 -6
  50. tests/bug_check_test.py +251 -0
  51. tests/direct_selector_helper_test.py +97 -0
  52. tests/explain_mysql_update_behavior.py +77 -0
  53. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  54. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  55. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  56. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  57. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  58. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  59. tests/ofweek_scrapy/scrapy.cfg +11 -0
  60. tests/performance_comparison.py +4 -5
  61. tests/simple_crawlo_test.py +1 -2
  62. tests/simple_follow_test.py +39 -0
  63. tests/simple_response_selector_test.py +95 -0
  64. tests/simple_selector_helper_test.py +155 -0
  65. tests/simple_selector_test.py +208 -0
  66. tests/simple_url_test.py +74 -0
  67. tests/simulate_mysql_update_test.py +140 -0
  68. tests/test_asyncmy_usage.py +57 -0
  69. tests/test_crawler_process_import.py +39 -0
  70. tests/test_crawler_process_spider_modules.py +48 -0
  71. tests/test_crawlo_proxy_integration.py +8 -2
  72. tests/test_downloader_proxy_compatibility.py +24 -20
  73. tests/test_edge_cases.py +7 -5
  74. tests/test_encoding_core.py +57 -0
  75. tests/test_encoding_detection.py +127 -0
  76. tests/test_factory_compatibility.py +197 -0
  77. tests/test_mysql_pipeline_config.py +165 -0
  78. tests/test_mysql_pipeline_error.py +99 -0
  79. tests/test_mysql_pipeline_init_log.py +83 -0
  80. tests/test_mysql_pipeline_integration.py +133 -0
  81. tests/test_mysql_pipeline_refactor.py +144 -0
  82. tests/test_mysql_pipeline_refactor_simple.py +86 -0
  83. tests/test_mysql_pipeline_robustness.py +196 -0
  84. tests/test_mysql_pipeline_types.py +89 -0
  85. tests/test_mysql_update_columns.py +94 -0
  86. tests/test_optimized_selector_naming.py +101 -0
  87. tests/test_priority_behavior.py +18 -18
  88. tests/test_proxy_middleware.py +104 -8
  89. tests/test_proxy_middleware_enhanced.py +1 -5
  90. tests/test_proxy_middleware_integration.py +7 -2
  91. tests/test_proxy_middleware_refactored.py +25 -2
  92. tests/test_proxy_only.py +84 -0
  93. tests/test_proxy_with_downloader.py +153 -0
  94. tests/test_real_scenario_proxy.py +17 -17
  95. tests/test_response_follow.py +105 -0
  96. tests/test_response_selector_methods.py +93 -0
  97. tests/test_response_url_methods.py +71 -0
  98. tests/test_response_urljoin.py +87 -0
  99. tests/test_scrapy_style_encoding.py +113 -0
  100. tests/test_selector_helper.py +101 -0
  101. tests/test_selector_optimizations.py +147 -0
  102. tests/test_spider_loader.py +50 -0
  103. tests/test_spider_loader_comprehensive.py +70 -0
  104. tests/test_spiders/__init__.py +1 -0
  105. tests/test_spiders/test_spider.py +10 -0
  106. tests/verify_mysql_warnings.py +110 -0
  107. crawlo/middleware/simple_proxy.py +0 -65
  108. crawlo/tools/anti_crawler.py +0 -269
  109. crawlo/utils/class_loader.py +0 -26
  110. crawlo/utils/enhanced_error_handler.py +0 -357
  111. crawlo-1.4.4.dist-info/METADATA +0 -190
  112. tests/simple_log_test.py +0 -58
  113. tests/simple_test.py +0 -48
  114. tests/test_framework_logger.py +0 -67
  115. tests/test_framework_startup.py +0 -65
  116. tests/test_mode_change.py +0 -73
  117. {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
  118. {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
  119. {crawlo-1.4.4.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
  120. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/test_downloader_proxy_compatibility.py CHANGED
@@ -35,7 +35,7 @@ class MockCrawler:
         self.spider = MockSpider(self)  # add a spider attribute
 
 
-def create_test_settings(proxy_url=None):
+def create_test_settings(proxy_url=None, proxy_list=None):
     """Create test settings"""
     settings = SettingManager()
     settings.set("LOG_LEVEL", "DEBUG")
@@ -47,12 +47,13 @@ def create_test_settings(proxy_url=None):
 
     # Proxy-related settings
     if proxy_url:
-        settings.set("PROXY_ENABLED", True)
+        # Advanced proxy configuration (for ProxyMiddleware)
+        # The middleware enables itself automatically once a proxy API URL is configured
         settings.set("PROXY_API_URL", proxy_url)
-        settings.set("PROXY_REFRESH_INTERVAL", 60)
-        settings.set("PROXY_POOL_SIZE", 5)
-    else:
-        settings.set("PROXY_ENABLED", False)
+    elif proxy_list:
+        # Proxy configuration (for ProxyMiddleware)
+        # The middleware enables itself automatically once a proxy list is configured
+        settings.set("PROXY_LIST", proxy_list)
 
     return settings
 
@@ -65,7 +66,7 @@ async def test_aiohttp_with_proxy(proxy_url, target_url):
 
     try:
         # Create settings
-        settings = create_test_settings(proxy_url)
+        settings = create_test_settings(proxy_url=proxy_url)
         crawler = MockCrawler(settings)
 
         # Create the downloader
@@ -73,6 +74,7 @@ async def test_aiohttp_with_proxy(proxy_url, target_url):
         downloader.open()
 
         # Create the proxy middleware
+        from crawlo.middleware.proxy import ProxyMiddleware
         proxy_middleware = ProxyMiddleware(settings, "DEBUG")
 
         # Create the request
@@ -115,15 +117,15 @@ async def test_aiohttp_with_proxy(proxy_url, target_url):
         pass
 
 
-async def test_httpx_with_proxy_async(proxy_url, target_url):
+async def test_httpx_with_proxy_async(proxy_list, target_url):
     """Test httpx downloader compatibility with the proxy"""
     print(f"\n=== Testing httpx downloader with proxy ===")
-    print(f"Proxy URL: {proxy_url}")
+    print(f"Proxy list: {proxy_list}")
     print(f"Target URL: {target_url}")
 
     try:
         # Create settings
-        settings = create_test_settings(proxy_url)
+        settings = create_test_settings(proxy_list=proxy_list)
         crawler = MockCrawler(settings)
 
         # Create the downloader
@@ -131,7 +133,8 @@ async def test_httpx_with_proxy_async(proxy_url, target_url):
         downloader.open()
 
         # Create the proxy middleware
-        proxy_middleware = ProxyMiddleware(settings, "DEBUG")
+        from crawlo.middleware.simple_proxy import SimpleProxyMiddleware
+        proxy_middleware = SimpleProxyMiddleware(settings, "DEBUG")
 
         # Create the request
         request = Request(url=target_url)
@@ -168,7 +171,6 @@ async def test_httpx_with_proxy_async(proxy_url, target_url):
         # Clean up resources
         try:
             await downloader.close()
-            await proxy_middleware.close()
         except:
             pass
 
@@ -181,7 +183,7 @@ async def test_curl_cffi_with_proxy_async(proxy_url, target_url):
 
     try:
         # Create settings
-        settings = create_test_settings(proxy_url)
+        settings = create_test_settings(proxy_url=proxy_url)
         crawler = MockCrawler(settings)
 
         # Create the downloader
@@ -238,26 +240,28 @@ async def main():
     # Use a test proxy URL (a public test proxy here)
     # Note: in real use you need to replace this with a valid proxy URL
     test_proxy_url = "http://test.proxy.api:8080/proxy/getitem/"
+    test_proxy_list = ["http://proxy1:8080", "http://proxy2:8080"]
     test_target_url = "https://httpbin.org/ip"  # a test site that returns IP information
 
     print(f"Test proxy API: {test_proxy_url}")
+    print(f"Test proxy list: {test_proxy_list}")
    print(f"Test target URL: {test_target_url}")
 
-    # Test the aiohttp downloader
+    # Test the aiohttp downloader (advanced proxy)
     aiohttp_result = await test_aiohttp_with_proxy(test_proxy_url, test_target_url)
 
-    # Test the httpx downloader
-    httpx_result = await test_httpx_with_proxy_async(test_proxy_url, test_target_url)
+    # Test the httpx downloader (simple proxy)
+    httpx_result = await test_httpx_with_proxy_async(test_proxy_list, test_target_url)
 
-    # Test the curl-cffi downloader
+    # Test the curl-cffi downloader (advanced proxy)
     curl_cffi_result = await test_curl_cffi_with_proxy_async(test_proxy_url, test_target_url)
 
     # Summarize the results
     print("\n" + "="*50)
     print("Test result summary:")
-    print(f"aiohttp downloader: {'✓ passed' if aiohttp_result else '✗ failed'}")
-    print(f"httpx downloader: {'✓ passed' if httpx_result else '✗ failed'}")
-    print(f"curl-cffi downloader: {'✓ passed' if curl_cffi_result else '✗ failed'}")
+    print(f"aiohttp downloader (advanced proxy): {'✓ passed' if aiohttp_result else '✗ failed'}")
+    print(f"httpx downloader (simple proxy): {'✓ passed' if httpx_result else '✗ failed'}")
+    print(f"curl-cffi downloader (advanced proxy): {'✓ passed' if curl_cffi_result else '✗ failed'}")
 
     overall_result = all([aiohttp_result, httpx_result, curl_cffi_result])
     print(f"\nOverall result: {'✓ all downloaders work with the proxy middleware' if overall_result else '✗ some downloaders are incompatible'}")
tests/test_edge_cases.py CHANGED
@@ -112,13 +112,15 @@ async def test_redis_queue_edge_cases():
     print(" Special-character URL test passed")
 
     # 4. Test priority (the higher priority value should dequeue first)
-    high_priority_request = Request(url="https://high-priority.com", priority=1000)
-    low_priority_request = Request(url="https://low-priority.com", priority=-1000)
+    # Note: the Request constructor negates the priority value it is given,
+    # so a request with priority=1000 is stored as -1000 and one with priority=-1000 is stored as 1000
+    high_priority_request = Request(url="https://high-priority.com", priority=1000)  # stored as -1000
+    low_priority_request = Request(url="https://low-priority.com", priority=-1000)  # stored as 1000
 
-    await queue.put(high_priority_request)  # higher priority value
-    await queue.put(low_priority_request)  # lower priority value
+    await queue.put(high_priority_request, priority=high_priority_request.priority)  # use the stored priority value
+    await queue.put(low_priority_request, priority=low_priority_request.priority)  # use the stored priority value
 
-    # The higher priority value should dequeue first
+    # The higher priority value should dequeue first (score = priority, and the smallest score is dequeued first)
     first = await queue.get(timeout=1.0)
     assert first is not None and first.url == "https://high-priority.com", "the high-priority request should dequeue first"
     print(" Priority test passed")
tests/test_encoding_core.py ADDED
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Core tests for encoding detection
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.network.response import Response
+
+
+def test_encoding_detection():
+    """Test the core encoding-detection behaviour"""
+    print("Testing core encoding detection...")
+
+    # The encoding set on the Request takes priority
+    class MockRequest:
+        encoding = 'gbk'
+
+    response1 = Response(
+        url="https://example.com",
+        body=b'',
+        request=MockRequest()
+    )
+    print(f"Request encoding priority: {response1.encoding}")
+
+    # Encoding from the Content-Type header
+    response2 = Response(
+        url="https://example.com",
+        body=b'',
+        headers={"content-type": "text/html; charset=iso-8859-1"}
+    )
+    print(f"Content-Type encoding: {response2.encoding}")
+
+    # The declared-encoding helper
+    declared_enc = response2._declared_encoding()
+    print(f"Declared encoding: {declared_enc}")
+
+    # The default encoding
+    response3 = Response(
+        url="https://example.com",
+        body=b''
+    )
+    print(f"Default encoding: {response3.encoding}")
+
+    # Verify the results
+    assert response1.encoding == 'gbk', f"Expected 'gbk', got {response1.encoding}"
+    assert response2.encoding == 'iso-8859-1', f"Expected 'iso-8859-1', got {response2.encoding}"
+    assert declared_enc == 'iso-8859-1', f"Expected 'iso-8859-1', got {declared_enc}"
+    assert response3.encoding == 'utf-8', f"Expected 'utf-8', got {response3.encoding}"
+
+    print("All tests passed!")
+
+
+if __name__ == '__main__':
+    test_encoding_detection()
tests/test_encoding_detection.py ADDED
@@ -0,0 +1,127 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the optimised Response encoding detection
+"""
+import unittest
+
+# Mock part of the Response class for testing
+class MockResponse:
+    def __init__(self, body, headers=None, request=None):
+        self.body = body
+        self.headers = headers or {}
+        self.request = request
+        self._DEFAULT_ENCODING = "ascii"
+
+    def _determine_encoding(self):
+        """Simplified encoding detection"""
+        # 1. Prefer the declared encoding
+        declared_encoding = self._declared_encoding()
+        if declared_encoding:
+            return declared_encoding
+
+        # 2. Fall back to utf-8
+        return 'utf-8'
+
+    def _declared_encoding(self):
+        """Return the declared encoding"""
+        # 1. The encoding specified on the Request
+        if self.request and getattr(self.request, 'encoding', None):
+            return self.request.encoding
+
+        # 2. Detect it from the Content-Type header
+        content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+        if content_type:
+            import re
+            charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
+            if charset_match:
+                return charset_match.group(1).lower()
+
+        return None
+
+
+class TestDetermineEncoding(unittest.TestCase):
+    """Encoding detection test cases"""
+
+    def test_request_encoding_priority(self):
+        """The Request encoding takes priority"""
+        class MockRequest:
+            encoding = 'gbk'
+
+        response = MockResponse(b'', request=MockRequest())
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'gbk')
+
+    def test_content_type_encoding(self):
+        """Encoding is detected from the Content-Type header"""
+        response = MockResponse(
+            b'',
+            headers={"content-type": "text/html; charset=iso-8859-1"}
+        )
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'iso-8859-1')
+
+    def test_default_encoding(self):
+        """The default encoding is used as a fallback"""
+        response = MockResponse(b'')
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'utf-8')
+
+    def test_case_insensitive_content_type(self):
+        """The Content-Type header is matched case-insensitively"""
+        response = MockResponse(
+            b'',
+            headers={"Content-Type": "text/html; CHARSET=UTF-8"}
+        )
+        encoding = response._determine_encoding()
+        self.assertEqual(encoding, 'utf-8')
+
+    def test_declared_encoding_with_request(self):
+        """Declared encoding - the Request takes priority"""
+        class MockRequest:
+            encoding = 'gbk'
+
+        response = MockResponse(b'', request=MockRequest())
+        declared_encoding = response._declared_encoding()
+        self.assertEqual(declared_encoding, 'gbk')
+
+    def test_declared_encoding_with_content_type(self):
+        """Declared encoding - Content-Type"""
+        response = MockResponse(
+            b'',
+            headers={"content-type": "text/html; charset=iso-8859-1"}
+        )
+        declared_encoding = response._declared_encoding()
+        self.assertEqual(declared_encoding, 'iso-8859-1')
+
+
+def test_encoding_detection():
+    """Simple smoke test of encoding detection"""
+    print("Testing encoding detection...")
+
+    # The Request encoding takes priority
+    class MockRequest:
+        encoding = 'gbk'
+
+    response1 = MockResponse(b'', request=MockRequest())
+    encoding1 = response1._determine_encoding()
+    print(f"Request encoding priority: {encoding1}")
+
+    # Encoding from the Content-Type header
+    response2 = MockResponse(
+        b'',
+        headers={"content-type": "text/html; charset=iso-8859-1"}
+    )
+    encoding2 = response2._determine_encoding()
+    print(f"Content-Type encoding: {encoding2}")
+
+    # The default encoding
+    response3 = MockResponse(b'')
+    encoding3 = response3._determine_encoding()
+    print(f"Default encoding: {encoding3}")
+
+    print("Encoding detection test complete!")
+
+
+if __name__ == '__main__':
+    test_encoding_detection()
tests/test_factory_compatibility.py ADDED
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Tests for CrawloConfig factory-mode compatibility
+"""
+
+import sys
+import os
+import traceback
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.config import CrawloConfig
+
+
+def test_standalone_factory():
+    """Test the standalone-mode factory function"""
+    print("Testing the standalone-mode factory function...")
+
+    try:
+        # Create a standalone-mode configuration
+        config = CrawloConfig.standalone(
+            project_name='ofweek_standalone',
+            concurrency=8,
+            download_delay=1.0
+        )
+
+        print(f"Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+        # Verify the configuration
+        assert config.get('RUN_MODE') == 'standalone'
+        assert config.get('QUEUE_TYPE') == 'memory'
+        assert config.get('PROJECT_NAME') == 'ofweek_standalone'
+        assert config.get('CONCURRENCY') == 8
+        assert config.get('DOWNLOAD_DELAY') == 1.0
+
+        print("✅ Standalone-mode factory function test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Standalone-mode factory function test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_distributed_factory():
+    """Test the distributed-mode factory function"""
+    print("\nTesting the distributed-mode factory function...")
+
+    try:
+        # Create a distributed-mode configuration
+        config = CrawloConfig.distributed(
+            redis_host='127.0.0.1',
+            redis_port=6379,
+            project_name='ofweek_distributed',
+            concurrency=16,
+            download_delay=0.5
+        )
+
+        print(f"Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+        print(f"REDIS_HOST: {config.get('REDIS_HOST')}")
+        print(f"REDIS_PORT: {config.get('REDIS_PORT')}")
+
+        # Verify the configuration
+        assert config.get('RUN_MODE') == 'distributed'
+        assert config.get('QUEUE_TYPE') == 'redis'
+        assert config.get('PROJECT_NAME') == 'ofweek_distributed'
+        assert config.get('CONCURRENCY') == 16
+        assert config.get('DOWNLOAD_DELAY') == 0.5
+        assert config.get('REDIS_HOST') == '127.0.0.1'
+        assert config.get('REDIS_PORT') == 6379
+
+        print("✅ Distributed-mode factory function test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Distributed-mode factory function test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_auto_factory():
+    """Test the auto-mode factory function"""
+    print("\nTesting the auto-mode factory function...")
+
+    try:
+        # Create an auto-mode configuration
+        config = CrawloConfig.auto(
+            project_name='ofweek_auto',
+            concurrency=12,
+            download_delay=0.8
+        )
+
+        print(f"Configuration created successfully")
+        print(f"RUN_MODE: {config.get('RUN_MODE')}")
+        print(f"QUEUE_TYPE: {config.get('QUEUE_TYPE')}")
+        print(f"PROJECT_NAME: {config.get('PROJECT_NAME')}")
+        print(f"CONCURRENCY: {config.get('CONCURRENCY')}")
+        print(f"DOWNLOAD_DELAY: {config.get('DOWNLOAD_DELAY')}")
+
+        # Verify the configuration
+        assert config.get('RUN_MODE') == 'auto'
+        assert config.get('QUEUE_TYPE') == 'auto'
+        assert config.get('PROJECT_NAME') == 'ofweek_auto'
+        assert config.get('CONCURRENCY') == 12
+        assert config.get('DOWNLOAD_DELAY') == 0.8
+
+        print("✅ Auto-mode factory function test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Auto-mode factory function test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def test_config_to_dict():
+    """Test converting a configuration to a dict"""
+    print("\nTesting conversion of a configuration to a dict...")
+
+    try:
+        # Create a configuration
+        config = CrawloConfig.standalone(
+            project_name='test_project',
+            concurrency=4
+        )
+
+        # Convert it to a dict
+        config_dict = config.to_dict()
+
+        print(f"Dict conversion succeeded")
+        print(f"Number of keys: {len(config_dict)}")
+
+        # Verify the key configuration entries
+        assert 'RUN_MODE' in config_dict
+        assert 'QUEUE_TYPE' in config_dict
+        assert 'PROJECT_NAME' in config_dict
+        assert 'CONCURRENCY' in config_dict
+
+        print("✅ Config-to-dict test passed")
+        return True
+
+    except Exception as e:
+        print(f"❌ Config-to-dict test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """Main entry point"""
+    print("Starting the CrawloConfig factory-mode compatibility tests...")
+    print("=" * 50)
+
+    tests = [
+        test_standalone_factory,
+        test_distributed_factory,
+        test_auto_factory,
+        test_config_to_dict,
+    ]
+
+    passed = 0
+    total = len(tests)
+
+    for test_func in tests:
+        try:
+            if test_func():
+                passed += 1
+                print(f"✓ {test_func.__name__} passed")
+            else:
+                print(f"✗ {test_func.__name__} failed")
+        except Exception as e:
+            print(f"✗ {test_func.__name__} raised an exception: {e}")
+        print()
+
+    print("=" * 50)
+    print(f"Test results: {passed}/{total} passed")
+
+    if passed == total:
+        print("All tests passed! CrawloConfig factory-mode compatibility looks good.")
+        return 0
+    else:
+        print("Some tests failed; please check the implementation.")
+        return 1
+
+
+if __name__ == "__main__":
+    exit_code = main()
+    exit(exit_code)
+ exit(exit_code)