crawlo 1.2.2__py3-none-any.whl → 1.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (222) hide show
  1. crawlo/__init__.py +61 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +60 -60
  4. crawlo/cleaners/data_formatter.py +225 -225
  5. crawlo/cleaners/encoding_converter.py +125 -125
  6. crawlo/cleaners/text_cleaner.py +232 -232
  7. crawlo/cli.py +81 -81
  8. crawlo/commands/__init__.py +14 -14
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/help.py +144 -142
  12. crawlo/commands/list.py +155 -155
  13. crawlo/commands/run.py +323 -292
  14. crawlo/commands/startproject.py +420 -418
  15. crawlo/commands/stats.py +188 -188
  16. crawlo/commands/utils.py +186 -186
  17. crawlo/config.py +312 -312
  18. crawlo/config_validator.py +251 -252
  19. crawlo/core/__init__.py +2 -2
  20. crawlo/core/engine.py +354 -354
  21. crawlo/core/processor.py +40 -40
  22. crawlo/core/scheduler.py +143 -143
  23. crawlo/crawler.py +1110 -1027
  24. crawlo/data/__init__.py +6 -0
  25. crawlo/data/user_agents.py +108 -0
  26. crawlo/downloader/__init__.py +266 -266
  27. crawlo/downloader/aiohttp_downloader.py +220 -220
  28. crawlo/downloader/cffi_downloader.py +256 -256
  29. crawlo/downloader/httpx_downloader.py +259 -259
  30. crawlo/downloader/hybrid_downloader.py +212 -213
  31. crawlo/downloader/playwright_downloader.py +402 -402
  32. crawlo/downloader/selenium_downloader.py +472 -472
  33. crawlo/event.py +11 -11
  34. crawlo/exceptions.py +81 -81
  35. crawlo/extension/__init__.py +37 -37
  36. crawlo/extension/health_check.py +141 -141
  37. crawlo/extension/log_interval.py +57 -57
  38. crawlo/extension/log_stats.py +81 -81
  39. crawlo/extension/logging_extension.py +43 -43
  40. crawlo/extension/memory_monitor.py +104 -104
  41. crawlo/extension/performance_profiler.py +133 -133
  42. crawlo/extension/request_recorder.py +107 -107
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +280 -280
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/items/__init__.py +23 -23
  47. crawlo/items/base.py +21 -21
  48. crawlo/items/fields.py +52 -53
  49. crawlo/items/items.py +104 -104
  50. crawlo/middleware/__init__.py +21 -21
  51. crawlo/middleware/default_header.py +131 -131
  52. crawlo/middleware/download_delay.py +104 -104
  53. crawlo/middleware/middleware_manager.py +135 -135
  54. crawlo/middleware/offsite.py +114 -115
  55. crawlo/middleware/proxy.py +367 -366
  56. crawlo/middleware/request_ignore.py +86 -87
  57. crawlo/middleware/response_code.py +163 -164
  58. crawlo/middleware/response_filter.py +136 -137
  59. crawlo/middleware/retry.py +124 -124
  60. crawlo/mode_manager.py +211 -211
  61. crawlo/network/__init__.py +21 -21
  62. crawlo/network/request.py +338 -338
  63. crawlo/network/response.py +359 -359
  64. crawlo/pipelines/__init__.py +21 -21
  65. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  66. crawlo/pipelines/console_pipeline.py +39 -39
  67. crawlo/pipelines/csv_pipeline.py +316 -316
  68. crawlo/pipelines/database_dedup_pipeline.py +222 -224
  69. crawlo/pipelines/json_pipeline.py +218 -218
  70. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  71. crawlo/pipelines/mongo_pipeline.py +131 -131
  72. crawlo/pipelines/mysql_pipeline.py +317 -316
  73. crawlo/pipelines/pipeline_manager.py +61 -61
  74. crawlo/pipelines/redis_dedup_pipeline.py +165 -167
  75. crawlo/project.py +279 -187
  76. crawlo/queue/pqueue.py +37 -37
  77. crawlo/queue/queue_manager.py +337 -337
  78. crawlo/queue/redis_priority_queue.py +298 -298
  79. crawlo/settings/__init__.py +7 -7
  80. crawlo/settings/default_settings.py +217 -226
  81. crawlo/settings/setting_manager.py +122 -122
  82. crawlo/spider/__init__.py +639 -639
  83. crawlo/stats_collector.py +59 -59
  84. crawlo/subscriber.py +129 -130
  85. crawlo/task_manager.py +30 -30
  86. crawlo/templates/crawlo.cfg.tmpl +10 -10
  87. crawlo/templates/project/__init__.py.tmpl +3 -3
  88. crawlo/templates/project/items.py.tmpl +17 -17
  89. crawlo/templates/project/middlewares.py.tmpl +118 -118
  90. crawlo/templates/project/pipelines.py.tmpl +96 -96
  91. crawlo/templates/project/run.py.tmpl +47 -45
  92. crawlo/templates/project/settings.py.tmpl +350 -327
  93. crawlo/templates/project/settings_distributed.py.tmpl +160 -119
  94. crawlo/templates/project/settings_gentle.py.tmpl +133 -94
  95. crawlo/templates/project/settings_high_performance.py.tmpl +155 -151
  96. crawlo/templates/project/settings_simple.py.tmpl +108 -68
  97. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  98. crawlo/templates/spider/spider.py.tmpl +143 -143
  99. crawlo/tools/__init__.py +182 -182
  100. crawlo/tools/anti_crawler.py +268 -268
  101. crawlo/tools/authenticated_proxy.py +240 -240
  102. crawlo/tools/data_validator.py +180 -180
  103. crawlo/tools/date_tools.py +35 -35
  104. crawlo/tools/distributed_coordinator.py +386 -386
  105. crawlo/tools/retry_mechanism.py +220 -220
  106. crawlo/tools/scenario_adapter.py +262 -262
  107. crawlo/utils/__init__.py +35 -35
  108. crawlo/utils/batch_processor.py +259 -260
  109. crawlo/utils/controlled_spider_mixin.py +439 -439
  110. crawlo/utils/date_tools.py +290 -290
  111. crawlo/utils/db_helper.py +343 -343
  112. crawlo/utils/enhanced_error_handler.py +356 -359
  113. crawlo/utils/env_config.py +105 -105
  114. crawlo/utils/error_handler.py +123 -125
  115. crawlo/utils/func_tools.py +82 -82
  116. crawlo/utils/large_scale_config.py +286 -286
  117. crawlo/utils/large_scale_helper.py +344 -343
  118. crawlo/utils/log.py +128 -128
  119. crawlo/utils/performance_monitor.py +285 -284
  120. crawlo/utils/queue_helper.py +175 -175
  121. crawlo/utils/redis_connection_pool.py +334 -334
  122. crawlo/utils/redis_key_validator.py +198 -199
  123. crawlo/utils/request.py +267 -267
  124. crawlo/utils/request_serializer.py +218 -219
  125. crawlo/utils/spider_loader.py +61 -62
  126. crawlo/utils/system.py +11 -11
  127. crawlo/utils/tools.py +4 -4
  128. crawlo/utils/url.py +39 -39
  129. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/METADATA +764 -692
  130. crawlo-1.2.4.dist-info/RECORD +206 -0
  131. examples/__init__.py +7 -7
  132. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  133. tests/__init__.py +7 -7
  134. tests/advanced_tools_example.py +275 -275
  135. tests/authenticated_proxy_example.py +236 -236
  136. tests/cleaners_example.py +160 -160
  137. tests/config_validation_demo.py +102 -102
  138. tests/controlled_spider_example.py +205 -205
  139. tests/date_tools_example.py +180 -180
  140. tests/dynamic_loading_example.py +523 -523
  141. tests/dynamic_loading_test.py +104 -104
  142. tests/env_config_example.py +133 -133
  143. tests/error_handling_example.py +171 -171
  144. tests/redis_key_validation_demo.py +130 -130
  145. tests/response_improvements_example.py +144 -144
  146. tests/test_advanced_tools.py +148 -148
  147. tests/test_all_redis_key_configs.py +145 -145
  148. tests/test_authenticated_proxy.py +141 -141
  149. tests/test_cleaners.py +54 -54
  150. tests/test_comprehensive.py +146 -146
  151. tests/test_config_validator.py +193 -193
  152. tests/test_crawlo_proxy_integration.py +172 -172
  153. tests/test_date_tools.py +123 -123
  154. tests/test_default_header_middleware.py +158 -158
  155. tests/test_double_crawlo_fix.py +207 -207
  156. tests/test_double_crawlo_fix_simple.py +124 -124
  157. tests/test_download_delay_middleware.py +221 -221
  158. tests/test_downloader_proxy_compatibility.py +268 -268
  159. tests/test_dynamic_downloaders_proxy.py +124 -124
  160. tests/test_dynamic_proxy.py +92 -92
  161. tests/test_dynamic_proxy_config.py +146 -146
  162. tests/test_dynamic_proxy_real.py +109 -109
  163. tests/test_edge_cases.py +303 -303
  164. tests/test_enhanced_error_handler.py +270 -270
  165. tests/test_env_config.py +121 -121
  166. tests/test_error_handler_compatibility.py +112 -112
  167. tests/test_final_validation.py +153 -153
  168. tests/test_framework_env_usage.py +103 -103
  169. tests/test_integration.py +356 -356
  170. tests/test_item_dedup_redis_key.py +122 -122
  171. tests/test_offsite_middleware.py +221 -221
  172. tests/test_parsel.py +29 -29
  173. tests/test_performance.py +327 -327
  174. tests/test_proxy_api.py +264 -264
  175. tests/test_proxy_health_check.py +32 -32
  176. tests/test_proxy_middleware.py +121 -121
  177. tests/test_proxy_middleware_enhanced.py +216 -216
  178. tests/test_proxy_middleware_integration.py +136 -136
  179. tests/test_proxy_providers.py +56 -56
  180. tests/test_proxy_stats.py +19 -19
  181. tests/test_proxy_strategies.py +59 -59
  182. tests/test_queue_manager_double_crawlo.py +173 -173
  183. tests/test_queue_manager_redis_key.py +176 -176
  184. tests/test_real_scenario_proxy.py +195 -195
  185. tests/test_redis_config.py +28 -28
  186. tests/test_redis_connection_pool.py +294 -294
  187. tests/test_redis_key_naming.py +181 -181
  188. tests/test_redis_key_validator.py +123 -123
  189. tests/test_redis_queue.py +224 -224
  190. tests/test_request_ignore_middleware.py +182 -182
  191. tests/test_request_serialization.py +70 -70
  192. tests/test_response_code_middleware.py +349 -349
  193. tests/test_response_filter_middleware.py +427 -427
  194. tests/test_response_improvements.py +152 -152
  195. tests/test_retry_middleware.py +241 -241
  196. tests/test_scheduler.py +241 -241
  197. tests/test_simple_response.py +61 -61
  198. tests/test_telecom_spider_redis_key.py +205 -205
  199. tests/test_template_content.py +87 -87
  200. tests/test_template_redis_key.py +134 -134
  201. tests/test_tools.py +153 -153
  202. tests/tools_example.py +257 -257
  203. crawlo-1.2.2.dist-info/RECORD +0 -220
  204. examples/aiohttp_settings.py +0 -42
  205. examples/curl_cffi_settings.py +0 -41
  206. examples/default_header_middleware_example.py +0 -107
  207. examples/default_header_spider_example.py +0 -129
  208. examples/download_delay_middleware_example.py +0 -160
  209. examples/httpx_settings.py +0 -42
  210. examples/multi_downloader_proxy_example.py +0 -81
  211. examples/offsite_middleware_example.py +0 -55
  212. examples/offsite_spider_example.py +0 -107
  213. examples/proxy_spider_example.py +0 -166
  214. examples/request_ignore_middleware_example.py +0 -51
  215. examples/request_ignore_spider_example.py +0 -99
  216. examples/response_code_middleware_example.py +0 -52
  217. examples/response_filter_middleware_example.py +0 -67
  218. examples/tong_hua_shun_settings.py +0 -62
  219. examples/tong_hua_shun_spider.py +0 -170
  220. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/WHEEL +0 -0
  221. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/entry_points.txt +0 -0
  222. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/top_level.txt +0 -0
@@ -1,154 +1,154 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- """
4
- Crawlo Filters Module
5
- ====================
6
- 提供多种请求去重过滤器实现。
7
-
8
- 过滤器类型:
9
- - MemoryFilter: 基于内存的高效去重,适合单机模式
10
- - AioRedisFilter: 基于Redis的分布式去重,适合分布式模式
11
- - MemoryFileFilter: 内存+文件持久化,适合需要重启恢复的场景
12
-
13
- 核心接口:
14
- - BaseFilter: 所有过滤器的基类
15
- - requested(): 检查请求是否重复的主要方法
16
- """
17
- from abc import ABC, abstractmethod
18
- from typing import Optional
19
-
20
- from crawlo.utils.request import request_fingerprint
21
-
22
-
23
- class BaseFilter(ABC):
24
- """
25
- 请求去重过滤器基类
26
-
27
- 提供统一的去重接口和统计功能。
28
- 所有过滤器实现都应该继承此类。
29
- """
30
-
31
- def __init__(self, logger, stats, debug: bool = False):
32
- """
33
- 初始化过滤器
34
-
35
- :param logger: 日志器实例
36
- :param stats: 统计信息存储
37
- :param debug: 是否启用调试日志
38
- """
39
- self.logger = logger
40
- self.stats = stats
41
- self.debug = debug
42
- self._request_count = 0
43
- self._duplicate_count = 0
44
-
45
- @classmethod
46
- def create_instance(cls, *args, **kwargs) -> 'BaseFilter':
47
- return cls(*args, **kwargs)
48
-
49
- def requested(self, request) -> bool:
50
- """
51
- 检查请求是否重复(主要接口)
52
-
53
- :param request: 请求对象
54
- :return: True 表示重复,False 表示新请求
55
- """
56
- self._request_count += 1
57
- fp = request_fingerprint(request)
58
-
59
- if fp in self:
60
- self._duplicate_count += 1
61
- self.log_stats(request)
62
- return True
63
-
64
- self.add_fingerprint(fp)
65
- return False
66
-
67
- @abstractmethod
68
- def add_fingerprint(self, fp: str) -> None:
69
- """
70
- 添加请求指纹(子类必须实现)
71
-
72
- :param fp: 请求指纹字符串
73
- """
74
- pass
75
-
76
- @abstractmethod
77
- def __contains__(self, item: str) -> bool:
78
- """
79
- 检查指纹是否存在(支持 in 操作符)
80
-
81
- :param item: 要检查的指纹
82
- :return: 是否已存在
83
- """
84
- pass
85
-
86
- def log_stats(self, request) -> None:
87
- """
88
- 记录统计信息
89
-
90
- :param request: 重复的请求对象
91
- """
92
- if self.debug:
93
- self.logger.debug(f'Filtered duplicate request: {request}')
94
- self.stats.inc_value(f'{self}/filtered_count')
95
-
96
- def get_stats(self) -> dict:
97
- """
98
- 获取过滤器统计信息
99
-
100
- :return: 统计信息字典
101
- """
102
- return {
103
- 'total_requests': self._request_count,
104
- 'duplicate_requests': self._duplicate_count,
105
- 'unique_requests': self._request_count - self._duplicate_count,
106
- 'duplicate_rate': f"{self._duplicate_count / max(1, self._request_count) * 100:.2f}%"
107
- }
108
-
109
- def reset_stats(self) -> None:
110
- """重置统计信息"""
111
- self._request_count = 0
112
- self._duplicate_count = 0
113
-
114
- def close(self) -> None:
115
- """关闭过滤器并清理资源"""
116
- pass
117
-
118
- def __str__(self) -> str:
119
- return f'{self.__class__.__name__}'
120
-
121
-
122
- # 导出所有可用的过滤器
123
- __all__ = ['BaseFilter']
124
-
125
- # 动态导入具体实现
126
- try:
127
- from .memory_filter import MemoryFilter, MemoryFileFilter
128
- __all__.extend(['MemoryFilter', 'MemoryFileFilter'])
129
- except ImportError:
130
- MemoryFilter = None
131
- MemoryFileFilter = None
132
-
133
- try:
134
- from .aioredis_filter import AioRedisFilter
135
- __all__.append('AioRedisFilter')
136
- except ImportError:
137
- AioRedisFilter = None
138
-
139
- # 提供便捷的过滤器映射
140
- FILTER_MAP = {
141
- 'memory': MemoryFilter,
142
- 'memory_file': MemoryFileFilter,
143
- 'redis': AioRedisFilter,
144
- 'aioredis': AioRedisFilter, # 别名
145
- }
146
-
147
- # 过滤掉不可用的过滤器
148
- FILTER_MAP = {k: v for k, v in FILTER_MAP.items() if v is not None}
149
-
150
- def get_filter_class(name: str):
151
- """根据名称获取过滤器类"""
152
- if name in FILTER_MAP:
153
- return FILTER_MAP[name]
154
- raise ValueError(f"未知的过滤器类型: {name}。可用类型: {list(FILTER_MAP.keys())}")
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ Crawlo Filters Module
5
+ ====================
6
+ 提供多种请求去重过滤器实现。
7
+
8
+ 过滤器类型:
9
+ - MemoryFilter: 基于内存的高效去重,适合单机模式
10
+ - AioRedisFilter: 基于Redis的分布式去重,适合分布式模式
11
+ - MemoryFileFilter: 内存+文件持久化,适合需要重启恢复的场景
12
+
13
+ 核心接口:
14
+ - BaseFilter: 所有过滤器的基类
15
+ - requested(): 检查请求是否重复的主要方法
16
+ """
17
+ from abc import ABC, abstractmethod
18
+ from typing import Optional
19
+
20
+ from crawlo.utils.request import request_fingerprint
21
+
22
+
23
class BaseFilter(ABC):
    """
    Abstract base class for request de-duplication filters.

    Provides the shared duplicate-check entry point (``requested``) and two
    in-memory counters. Concrete filters supply the fingerprint storage by
    implementing ``add_fingerprint`` and ``__contains__``.
    """

    def __init__(self, logger, stats, debug: bool = False):
        """
        Store the collaborators and zero the counters.

        :param logger: logger object; its ``debug()`` is called when ``debug`` is true
        :param stats: stats object; its ``inc_value()`` is called for each duplicate
        :param debug: whether each filtered duplicate is logged
        """
        self.logger = logger
        self.stats = stats
        self.debug = debug
        self._request_count = self._duplicate_count = 0

    @classmethod
    def create_instance(cls, *args, **kwargs) -> 'BaseFilter':
        """Build an instance by forwarding all arguments to the constructor."""
        instance = cls(*args, **kwargs)
        return instance

    def requested(self, request) -> bool:
        """
        Report whether ``request`` has been seen before (main entry point).

        :param request: request object passed to ``request_fingerprint``
        :return: True for a duplicate, False for a new request
        """
        self._request_count += 1
        fingerprint = request_fingerprint(request)

        if fingerprint not in self:
            # First sighting: remember the fingerprint and let the request through.
            self.add_fingerprint(fingerprint)
            return False

        self._duplicate_count += 1
        self.log_stats(request)
        return True

    @abstractmethod
    def add_fingerprint(self, fp: str) -> None:
        """
        Record a request fingerprint (must be implemented by subclasses).

        :param fp: request fingerprint string
        """

    @abstractmethod
    def __contains__(self, item: str) -> bool:
        """
        Tell whether a fingerprint is already stored (backs the ``in`` operator).

        :param item: fingerprint to look up
        :return: whether it is already present
        """

    def log_stats(self, request) -> None:
        """
        Record that ``request`` was filtered out as a duplicate.

        :param request: the duplicate request object
        """
        if self.debug:
            self.logger.debug(f'Filtered duplicate request: {request}')
        # The counter key is prefixed with str(self), i.e. the concrete class name.
        counter_key = f'{self}/filtered_count'
        self.stats.inc_value(counter_key)

    def get_stats(self) -> dict:
        """
        Summarise the counters kept by this filter.

        :return: dict with total, duplicate and unique counts plus a
                 percentage string for the duplicate rate
        """
        total = self._request_count
        duplicates = self._duplicate_count
        # max(1, total) keeps the division defined before any request was seen.
        rate = duplicates / max(1, total) * 100
        return {
            'total_requests': total,
            'duplicate_requests': duplicates,
            'unique_requests': total - duplicates,
            'duplicate_rate': f"{rate:.2f}%",
        }

    def reset_stats(self) -> None:
        """Set both counters back to zero."""
        self._request_count = self._duplicate_count = 0

    def close(self) -> None:
        """Close the filter and release resources; the base class does nothing."""
        return None

    def __str__(self) -> str:
        """Return the name of the concrete class."""
        return self.__class__.__name__
120
+
121
+
122
+ # 导出所有可用的过滤器
123
__all__ = ['BaseFilter']

# Concrete implementations are optional: when an import fails, the name is
# bound to None and is not added to __all__.
try:
    from .memory_filter import MemoryFilter, MemoryFileFilter
except ImportError:
    MemoryFilter = None
    MemoryFileFilter = None
else:
    __all__.extend(['MemoryFilter', 'MemoryFileFilter'])

try:
    from .aioredis_filter import AioRedisFilter
except ImportError:
    AioRedisFilter = None
else:
    __all__.append('AioRedisFilter')

# Short-name lookup table for filter classes. Entries whose implementation
# could not be imported (value is None) are left out.
FILTER_MAP = {
    alias: filter_cls
    for alias, filter_cls in (
        ('memory', MemoryFilter),
        ('memory_file', MemoryFileFilter),
        ('redis', AioRedisFilter),
        ('aioredis', AioRedisFilter),  # alias of 'redis'
    )
    if filter_cls is not None
}
149
+
150
def get_filter_class(name: str):
    """
    Look up a filter class by its short name.

    :param name: key to look up in ``FILTER_MAP``
    :return: the class stored under ``name``
    :raises ValueError: if ``name`` is not a key of ``FILTER_MAP``
    """
    if name not in FILTER_MAP:
        available = list(FILTER_MAP.keys())
        raise ValueError(f"未知的过滤器类型: {name}。可用类型: {available}")
    return FILTER_MAP[name]