crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.

Potentially problematic release.

Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
@@ -30,12 +30,10 @@ locals().update(config.to_dict())
  # Spider module configuration
  SPIDER_MODULES = ['{{project_name}}.spiders']

- # Default request header configuration
- # Configure default request headers for DefaultHeaderMiddleware
+ # Default request headers
  # DEFAULT_REQUEST_HEADERS = {}

  # Allowed domains
- # Configure allowed domains for OffsiteMiddleware
  # ALLOWED_DOMAINS = []

  # Data pipelines
@@ -63,6 +61,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
  LOG_LEVEL = 'INFO'
  LOG_FILE = 'logs/{{project_name}}.log'
  LOG_ENCODING = 'utf-8'  # Explicitly specify the log file encoding
+ LOG_MAX_BYTES = 20 * 1024 * 1024  # 20 MB, recommended value
+ LOG_BACKUP_COUNT = 10  # 10 backup files, recommended value
+ # To disable log rotation, set LOG_MAX_BYTES = 0
+ # When LOG_MAX_BYTES or LOG_BACKUP_COUNT is 0, rotation is disabled and the log file grows without limit
  STATS_DUMP = True

  # Output configuration
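The new LOG_MAX_BYTES / LOG_BACKUP_COUNT settings describe size-based log rotation. As an illustration only (not crawlo's actual logging wiring), this is how such settings typically map onto the standard library's RotatingFileHandler; the setting names come from the template above, everything else is an assumption.

```python
# Illustrative sketch: mapping LOG_MAX_BYTES / LOG_BACKUP_COUNT onto the
# standard library's RotatingFileHandler. Crawlo's real logging setup may differ.
import logging
from logging.handlers import RotatingFileHandler

def build_file_handler(log_file: str, max_bytes: int, backup_count: int) -> logging.Handler:
    if max_bytes <= 0 or backup_count <= 0:
        # Rotation disabled: plain FileHandler, the file grows without limit.
        return logging.FileHandler(log_file, encoding="utf-8")
    # Rotation enabled: roll over at max_bytes, keep backup_count old files.
    return RotatingFileHandler(
        log_file, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8"
    )

handler = build_file_handler("logs/myproject.log", 20 * 1024 * 1024, 10)
logging.getLogger("myproject").addHandler(handler)
```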
@@ -108,10 +110,10 @@ MONGO_USE_BATCH = True  # Enable batch inserts

  # =================================== Browser Fingerprint Emulation ===================================

- # Browser fingerprint emulation (effective only with the CurlCffi downloader)
- CURL_BROWSER_TYPE = "chrome"  # Options: chrome, edge, safari, firefox, or a version such as chrome136
+ # Browser fingerprint emulation (effective only with CurlCffiDownloader)
+ CURL_BROWSER_TYPE = "chrome"  # Options: chrome/edge/safari/firefox

- # Custom browser version map (can override the default behavior)
+ # Custom browser version map
  CURL_BROWSER_VERSION_MAP = {
  "chrome": "chrome136",
  "edge": "edge101",
@@ -129,11 +131,11 @@ HEALTH_CHECK_INTERVAL = 60  # Health check interval (seconds)
  REQUEST_STATS_ENABLED = True  # Enable request statistics
  STATS_RESET_ON_START = False  # Reset statistics on startup

- # HttpX downloader-specific configuration
+ # HttpX-specific configuration
  HTTPX_HTTP2 = True  # Enable HTTP/2 support
  HTTPX_FOLLOW_REDIRECTS = True  # Automatically follow redirects

- # AioHttp downloader-specific configuration
+ # AioHttp-specific configuration
  AIOHTTP_AUTO_DECOMPRESS = True  # Automatically decompress responses
  AIOHTTP_FORCE_CLOSE = False  # Force-close connections

@@ -143,17 +145,16 @@ CONNECTION_KEEPALIVE_TIMEOUT = 15  # Keep-Alive timeout (seconds)

  # =================================== Proxy Configuration ===================================

- # Simplified proxy configuration (for SimpleProxyMiddleware)
- # As long as a proxy list is configured, the middleware is enabled automatically
+ # Simple proxy (SimpleProxyMiddleware)
+ # The middleware is enabled automatically once a proxy list is configured
  # PROXY_LIST = ["http://proxy1:8080", "http://proxy2:8080"]

- # Advanced proxy configuration (for ProxyMiddleware)
- # As long as a proxy API URL is configured, the middleware is enabled automatically
+ # Dynamic proxy (ProxyMiddleware)
+ # The middleware is enabled automatically once a proxy API URL is configured
  # PROXY_API_URL = "http://your-proxy-api.com/get-proxy"

  # =================================== Memory Monitor Configuration ===================================

- # The memory monitor extension is disabled by default; enable it in the project settings if needed
  MEMORY_MONITOR_ENABLED = False  # Enable memory monitoring
  MEMORY_MONITOR_INTERVAL = 60  # Memory monitor check interval (seconds)
  MEMORY_WARNING_THRESHOLD = 80.0  # Memory usage warning threshold (percent)
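CURL_BROWSER_TYPE and CURL_BROWSER_VERSION_MAP in the hunks above configure browser-fingerprint impersonation for the CurlCffi downloader. As a rough illustration (not crawlo's internal downloader code), curl_cffi itself exposes this through an `impersonate` argument; the sketch below simply reuses the version map shown in the template.

```python
# Illustrative sketch of resolving a browser-type setting to a curl_cffi
# impersonation target. Crawlo's CurlCffiDownloader may do this differently.
from curl_cffi import requests

CURL_BROWSER_VERSION_MAP = {
    "chrome": "chrome136",
    "edge": "edge101",
}

def fetch(url: str, browser_type: str = "chrome") -> str:
    # Fall back to the raw value if the type is not in the map (e.g. "chrome136").
    target = CURL_BROWSER_VERSION_MAP.get(browser_type, browser_type)
    resp = requests.get(url, impersonate=target)  # mimic the chosen browser's TLS fingerprint
    return resp.text

# html = fetch("https://example.com", "chrome")
```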
@@ -11,7 +11,7 @@
  # Basic project information
  PROJECT_NAME = '{{project_name}}'

- # Run mode
+ # Run mode: standalone/distributed/auto
  RUN_MODE = 'standalone'

  # Concurrency configuration
@@ -23,16 +23,16 @@ RANDOM_RANGE = [0.5, 1.5]  # Random delay range factor

  # =================================== Core Component Configuration ===================================

- # Downloader configuration
+ # Downloader: AioHttpDownloader/HttpXDownloader/CurlCffiDownloader
  DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'

- # Queue configuration
+ # Queue type: memory/redis/auto
  QUEUE_TYPE = 'memory'

- # Dedup filter
+ # Dedup filter: MemoryFilter/AioRedisFilter
  FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'

- # Default dedup pipeline
+ # Default dedup pipeline: MemoryDedupPipeline/RedisDedupPipeline
  DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'

  # =================================== Spider Configuration ===================================
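Settings such as DOWNLOADER, FILTER_CLASS and DEFAULT_DEDUP_PIPELINE are dotted import paths. A minimal sketch of how such a path is typically resolved to a class at runtime follows; the `load_object` helper is hypothetical and is not crawlo's actual component loader.

```python
# Hypothetical helper showing how a dotted-path setting is usually resolved.
# Crawlo has its own component factories; this only shows the general pattern.
from importlib import import_module

def load_object(dotted_path: str):
    module_path, _, class_name = dotted_path.rpartition(".")
    module = import_module(module_path)
    return getattr(module, class_name)

downloader_cls = load_object("crawlo.downloader.aiohttp_downloader.AioHttpDownloader")
filter_cls = load_object("crawlo.filters.memory_filter.MemoryFilter")
```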
@@ -40,12 +40,10 @@ DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipe
  # Spider module configuration
  SPIDER_MODULES = ['{{project_name}}.spiders']

- # Default request header configuration
- # Configure default request headers for DefaultHeaderMiddleware
+ # Default request headers
  # DEFAULT_REQUEST_HEADERS = {}

  # Allowed domains
- # Configure allowed domains for OffsiteMiddleware
  # ALLOWED_DOMAINS = []

  # Data pipelines
@@ -73,6 +71,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
  LOG_LEVEL = 'INFO'
  LOG_FILE = 'logs/{{project_name}}.log'
  LOG_ENCODING = 'utf-8'  # Explicitly specify the log file encoding
+ LOG_MAX_BYTES = 20 * 1024 * 1024  # 20 MB, recommended value
+ LOG_BACKUP_COUNT = 10  # 10 backup files, recommended value
+ # To disable log rotation, set LOG_MAX_BYTES = 0
+ # When LOG_MAX_BYTES or LOG_BACKUP_COUNT is 0, rotation is disabled and the log file grows without limit
  STATS_DUMP = True

  # Output configuration
@@ -119,28 +121,25 @@ MONGO_USE_BATCH = False  # Enable batch inserts
  # =================================== Network Configuration ===================================

  # Proxy configuration
- # Proxying is disabled by default; enable and configure it in the project settings if needed
  PROXY_ENABLED = False  # Enable proxying

- # Simplified proxy configuration (for SimpleProxyMiddleware)
- PROXY_LIST = []  # Proxy list, e.g. ["http://proxy1:8080", "http://proxy2:8080"]
+ # Simple proxy (SimpleProxyMiddleware)
+ PROXY_LIST = []  # Proxy list

- # Advanced proxy configuration (for ProxyMiddleware)
- PROXY_API_URL = ""  # Proxy fetch API (replace with a real address)
+ # Dynamic proxy (ProxyMiddleware)
+ PROXY_API_URL = ""  # Proxy API address

- # Proxy extraction method (supports a field path or a function)
- # Example: "proxy" for {"proxy": "http://1.1.1.1:8080"}
- # Example: "data.proxy" for {"data": {"proxy": "http://1.1.1.1:8080"}}
+ # Proxy extraction method: "proxy" or "data.proxy"
  PROXY_EXTRACTOR = "proxy"

  # Proxy refresh control
  PROXY_REFRESH_INTERVAL = 60  # Proxy refresh interval (seconds)
- PROXY_API_TIMEOUT = 10  # Timeout for requests to the proxy API
+ PROXY_API_TIMEOUT = 10  # API timeout

- # Browser fingerprint emulation (effective only with the CurlCffi downloader)
- CURL_BROWSER_TYPE = "chrome"  # Options: chrome, edge, safari, firefox, or a version such as chrome136
+ # Browser fingerprint emulation (effective only with CurlCffiDownloader)
+ CURL_BROWSER_TYPE = "chrome"  # Options: chrome/edge/safari/firefox

- # Custom browser version map (can override the default behavior)
+ # Custom browser version map
  CURL_BROWSER_VERSION_MAP = {
  "chrome": "chrome136",
  "edge": "edge101",
@@ -157,11 +156,11 @@ HEALTH_CHECK_INTERVAL = 60  # Health check interval (seconds)
  REQUEST_STATS_ENABLED = True  # Enable request statistics
  STATS_RESET_ON_START = False  # Reset statistics on startup

- # HttpX downloader-specific configuration
+ # HttpX-specific configuration
  HTTPX_HTTP2 = True  # Enable HTTP/2 support
  HTTPX_FOLLOW_REDIRECTS = True  # Automatically follow redirects

- # AioHttp downloader-specific configuration
+ # AioHttp-specific configuration
  AIOHTTP_AUTO_DECOMPRESS = True  # Automatically decompress responses
  AIOHTTP_FORCE_CLOSE = False  # Force-close connections

@@ -170,7 +169,6 @@ CONNECTION_TTL_DNS_CACHE = 300  # DNS cache TTL (seconds)
  CONNECTION_KEEPALIVE_TIMEOUT = 15  # Keep-Alive timeout (seconds)

  # Memory monitor configuration
- # The memory monitor extension is disabled by default; enable it in the project settings if needed
  MEMORY_MONITOR_ENABLED = False  # Enable memory monitoring
  MEMORY_MONITOR_INTERVAL = 60  # Memory monitor check interval (seconds)
  MEMORY_WARNING_THRESHOLD = 80.0  # Memory usage warning threshold (percent)
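PROXY_EXTRACTOR in the network-configuration hunk above takes a field path such as "proxy" or "data.proxy" for pulling the proxy address out of the proxy API's JSON response. A minimal, hypothetical sketch of that dotted-path lookup (not the ProxyMiddleware's actual implementation, which also supports functions):

```python
# Hypothetical sketch of a dotted field-path extractor for a proxy API response.
from typing import Any

def extract_proxy(payload: dict, extractor: str = "proxy") -> Any:
    value = payload
    for key in extractor.split("."):
        value = value[key]  # a KeyError here means the path does not match the response
    return value

assert extract_proxy({"proxy": "http://1.1.1.1:8080"}, "proxy") == "http://1.1.1.1:8080"
assert extract_proxy({"data": {"proxy": "http://1.1.1.1:8080"}}, "data.proxy") == "http://1.1.1.1:8080"
```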
@@ -11,7 +11,7 @@
  # Basic project information
  PROJECT_NAME = '{{project_name}}'

- # Run mode
+ # Run mode: standalone/distributed/auto
  RUN_MODE = 'standalone'

  # Concurrency configuration
@@ -22,17 +22,17 @@ RANDOMNESS = False  # Disable random delay for performance

  # =================================== Core Component Configuration ===================================

- # Downloader configuration
+ # Downloader: AioHttpDownloader/HttpXDownloader/CurlCffiDownloader
  DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'

- # Queue configuration
+ # Queue type: memory/redis/auto
  QUEUE_TYPE = 'auto'

- # Dedup filter
+ # Dedup filter: MemoryFilter/AioRedisFilter
  # In high-performance mode, Redis-based dedup is used when Redis is available, otherwise memory dedup
  FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'

- # Default dedup pipeline
+ # Default dedup pipeline: MemoryDedupPipeline/RedisDedupPipeline
  # In high-performance mode, Redis-based dedup is used when Redis is available, otherwise memory dedup
  DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
  # =================================== Spider Configuration ===================================
@@ -41,12 +41,10 @@ DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeli
  # Spider module configuration
  SPIDER_MODULES = ['{{project_name}}.spiders']

- # Default request header configuration
- # Configure default request headers for DefaultHeaderMiddleware
+ # Default request headers
  # DEFAULT_REQUEST_HEADERS = {}

  # Allowed domains
- # Configure allowed domains for OffsiteMiddleware
  # ALLOWED_DOMAINS = []

  # Data pipelines
@@ -74,6 +72,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
  LOG_LEVEL = 'INFO'
  LOG_FILE = 'logs/{{project_name}}.log'
  LOG_ENCODING = 'utf-8'  # Explicitly specify the log file encoding
+ LOG_MAX_BYTES = 50 * 1024 * 1024  # 50 MB, suited to high-load scenarios
+ LOG_BACKUP_COUNT = 20  # 20 backup files, suited to high-load scenarios
+ # To disable log rotation, set LOG_MAX_BYTES = 0
+ # When LOG_MAX_BYTES or LOG_BACKUP_COUNT is 0, rotation is disabled and the log file grows without limit
  STATS_DUMP = True

  # Output configuration
@@ -120,28 +122,25 @@ MONGO_USE_BATCH = True  # Enable batch inserts
  # =================================== Network Configuration ===================================

  # Proxy configuration
- # Proxying is disabled by default; enable and configure it in the project settings if needed
  PROXY_ENABLED = False  # Enable proxying

- # Simplified proxy configuration (for SimpleProxyMiddleware)
- PROXY_LIST = []  # Proxy list, e.g. ["http://proxy1:8080", "http://proxy2:8080"]
+ # Simple proxy (SimpleProxyMiddleware)
+ PROXY_LIST = []  # Proxy list

- # Advanced proxy configuration (for ProxyMiddleware)
- PROXY_API_URL = ""  # Proxy fetch API (replace with a real address)
+ # Dynamic proxy (ProxyMiddleware)
+ PROXY_API_URL = ""  # Proxy API address

- # Proxy extraction method (supports a field path or a function)
- # Example: "proxy" for {"proxy": "http://1.1.1.1:8080"}
- # Example: "data.proxy" for {"data": {"proxy": "http://1.1.1.1:8080"}}
+ # Proxy extraction method: "proxy" or "data.proxy"
  PROXY_EXTRACTOR = "proxy"

  # Proxy refresh control
  PROXY_REFRESH_INTERVAL = 60  # Proxy refresh interval (seconds)
- PROXY_API_TIMEOUT = 10  # Timeout for requests to the proxy API
+ PROXY_API_TIMEOUT = 10  # API timeout

- # Browser fingerprint emulation (effective only with the CurlCffi downloader)
- CURL_BROWSER_TYPE = "chrome"  # Options: chrome, edge, safari, firefox, or a version such as chrome136
+ # Browser fingerprint emulation (effective only with CurlCffiDownloader)
+ CURL_BROWSER_TYPE = "chrome"  # Options: chrome/edge/safari/firefox

- # Custom browser version map (can override the default behavior)
+ # Custom browser version map
  CURL_BROWSER_VERSION_MAP = {
  "chrome": "chrome136",
  "edge": "edge101",
@@ -158,11 +157,11 @@ HEALTH_CHECK_INTERVAL = 30  # Health check interval (seconds)
  REQUEST_STATS_ENABLED = True  # Enable request statistics
  STATS_RESET_ON_START = False  # Reset statistics on startup

- # HttpX downloader-specific configuration
+ # HttpX-specific configuration
  HTTPX_HTTP2 = True  # Enable HTTP/2 support
  HTTPX_FOLLOW_REDIRECTS = True  # Automatically follow redirects

- # AioHttp downloader-specific configuration
+ # AioHttp-specific configuration
  AIOHTTP_AUTO_DECOMPRESS = True  # Automatically decompress responses
  AIOHTTP_FORCE_CLOSE = False  # Force-close connections

@@ -171,7 +170,6 @@ CONNECTION_TTL_DNS_CACHE = 300  # DNS cache TTL (seconds)
  CONNECTION_KEEPALIVE_TIMEOUT = 15  # Keep-Alive timeout (seconds)

  # Memory monitor configuration
- # The memory monitor extension is disabled by default; enable it in the project settings if needed
  MEMORY_MONITOR_ENABLED = False  # Enable memory monitoring
  MEMORY_MONITOR_INTERVAL = 60  # Memory monitor check interval (seconds)
  MEMORY_WARNING_THRESHOLD = 80.0  # Memory usage warning threshold (percent)
@@ -11,7 +11,7 @@
  # Basic project information
  PROJECT_NAME = '{{project_name}}'

- # Run mode
+ # Run mode: standalone/distributed/auto
  RUN_MODE = 'standalone'

  # Concurrency configuration
@@ -21,16 +21,16 @@ DOWNLOAD_DELAY = 1.0

  # =================================== Core Component Configuration ===================================

- # Downloader configuration
+ # Downloader: AioHttpDownloader/HttpXDownloader/CurlCffiDownloader
  DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'

- # Queue configuration
+ # Queue type: memory/redis/auto
  QUEUE_TYPE = 'memory'

- # Dedup filter
+ # Dedup filter: MemoryFilter/AioRedisFilter
  FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'

- # Default dedup pipeline
+ # Default dedup pipeline: MemoryDedupPipeline/RedisDedupPipeline
  DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'

  # =================================== Spider Configuration ===================================
@@ -38,12 +38,10 @@ DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipe
  # Spider module configuration
  SPIDER_MODULES = ['{{project_name}}.spiders']

- # Default request header configuration
- # Configure default request headers for DefaultHeaderMiddleware
+ # Default request headers
  # DEFAULT_REQUEST_HEADERS = {}

  # Allowed domains
- # Configure allowed domains for OffsiteMiddleware
  # ALLOWED_DOMAINS = []

  # Data pipelines
@@ -71,6 +69,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
  LOG_LEVEL = 'INFO'
  LOG_FILE = 'logs/{{project_name}}.log'
  LOG_ENCODING = 'utf-8'  # Explicitly specify the log file encoding
+ LOG_MAX_BYTES = 20 * 1024 * 1024  # 20 MB, recommended value
+ LOG_BACKUP_COUNT = 10  # 10 backup files, recommended value
+ # To disable log rotation, set LOG_MAX_BYTES = 0
+ # When LOG_MAX_BYTES or LOG_BACKUP_COUNT is 0, rotation is disabled and the log file grows without limit
  STATS_DUMP = True

  # Output configuration
@@ -11,7 +11,7 @@
  # Basic project information
  PROJECT_NAME = '{{project_name}}'

- # Run mode
+ # Run mode: standalone/distributed/auto
  RUN_MODE = 'standalone'

  # Concurrency configuration
@@ -21,16 +21,16 @@ DOWNLOAD_DELAY = 1.0

  # =================================== Core Component Configuration ===================================

- # Downloader configuration
+ # Downloader: AioHttpDownloader/HttpXDownloader/CurlCffiDownloader
  DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'

- # Queue configuration
+ # Queue type: memory/redis/auto
  QUEUE_TYPE = 'memory'

- # Dedup filter
+ # Dedup filter: MemoryFilter/AioRedisFilter
  FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'

- # Default dedup pipeline
+ # Default dedup pipeline: MemoryDedupPipeline/RedisDedupPipeline
  DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'

  # =================================== Spider Configuration ===================================
@@ -38,12 +38,10 @@ DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipe
  # Spider module configuration
  SPIDER_MODULES = ['{{project_name}}.spiders']

- # Default request header configuration
- # Configure default request headers for DefaultHeaderMiddleware
+ # Default request headers
  # DEFAULT_REQUEST_HEADERS = {}

  # Allowed domains
- # Configure allowed domains for OffsiteMiddleware
  # ALLOWED_DOMAINS = []

  # Data pipelines
@@ -71,6 +69,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
  LOG_LEVEL = 'INFO'
  LOG_FILE = 'logs/{{project_name}}.log'
  LOG_ENCODING = 'utf-8'  # Explicitly specify the log file encoding
+ LOG_MAX_BYTES = 20 * 1024 * 1024  # 20 MB, recommended value
+ LOG_BACKUP_COUNT = 10  # 10 backup files, recommended value
+ # To disable log rotation, set LOG_MAX_BYTES = 0
+ # When LOG_MAX_BYTES or LOG_BACKUP_COUNT is 0, rotation is disabled and the log file grows without limit
  STATS_DUMP = True

  # Output configuration
@@ -117,28 +119,25 @@ MONGO_USE_BATCH = False  # Enable batch inserts
  # =================================== Network Configuration ===================================

  # Proxy configuration
- # Proxying is disabled by default; enable and configure it in the project settings if needed
  PROXY_ENABLED = False  # Enable proxying

- # Simplified proxy configuration (for SimpleProxyMiddleware)
- PROXY_LIST = []  # Proxy list, e.g. ["http://proxy1:8080", "http://proxy2:8080"]
+ # Simple proxy (SimpleProxyMiddleware)
+ PROXY_LIST = []  # Proxy list

- # Advanced proxy configuration (for ProxyMiddleware)
- PROXY_API_URL = ""  # Proxy fetch API (replace with a real address)
+ # Dynamic proxy (ProxyMiddleware)
+ PROXY_API_URL = ""  # Proxy API address

- # Proxy extraction method (supports a field path or a function)
- # Example: "proxy" for {"proxy": "http://1.1.1.1:8080"}
- # Example: "data.proxy" for {"data": {"proxy": "http://1.1.1.1:8080"}}
+ # Proxy extraction method: "proxy" or "data.proxy"
  PROXY_EXTRACTOR = "proxy"

  # Proxy refresh control
  PROXY_REFRESH_INTERVAL = 60  # Proxy refresh interval (seconds)
- PROXY_API_TIMEOUT = 10  # Timeout for requests to the proxy API
+ PROXY_API_TIMEOUT = 10  # API timeout

- # Browser fingerprint emulation (effective only with the CurlCffi downloader)
- CURL_BROWSER_TYPE = "chrome"  # Options: chrome, edge, safari, firefox, or a version such as chrome136
+ # Browser fingerprint emulation (effective only with CurlCffiDownloader)
+ CURL_BROWSER_TYPE = "chrome"  # Options: chrome/edge/safari/firefox

- # Custom browser version map (can override the default behavior)
+ # Custom browser version map
  CURL_BROWSER_VERSION_MAP = {
  "chrome": "chrome136",
  "edge": "edge101",
@@ -155,11 +154,11 @@ HEALTH_CHECK_INTERVAL = 60  # Health check interval (seconds)
  REQUEST_STATS_ENABLED = True  # Enable request statistics
  STATS_RESET_ON_START = False  # Reset statistics on startup

- # HttpX downloader-specific configuration
+ # HttpX-specific configuration
  HTTPX_HTTP2 = True  # Enable HTTP/2 support
  HTTPX_FOLLOW_REDIRECTS = True  # Automatically follow redirects

- # AioHttp downloader-specific configuration
+ # AioHttp-specific configuration
  AIOHTTP_AUTO_DECOMPRESS = True  # Automatically decompress responses
  AIOHTTP_FORCE_CLOSE = False  # Force-close connections

@@ -168,7 +167,6 @@ CONNECTION_TTL_DNS_CACHE = 300  # DNS cache TTL (seconds)
  CONNECTION_KEEPALIVE_TIMEOUT = 15  # Keep-Alive timeout (seconds)

  # Memory monitor configuration
- # The memory monitor extension is disabled by default; enable it in the project settings if needed
  MEMORY_MONITOR_ENABLED = False  # Enable memory monitoring
  MEMORY_MONITOR_INTERVAL = 60  # Memory monitor check interval (seconds)
  MEMORY_WARNING_THRESHOLD = 80.0  # Memory usage warning threshold (percent)
@@ -8,7 +8,7 @@ from crawlo.crawler import CrawlerProcess


  def main():
- """Main entry point: run the spider"""
+ """Run the spider"""
  try:
  # TODO: replace 'spider_name' with the name of the spider you actually want to run
  asyncio.run(CrawlerProcess().crawl('spider_name'))
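The run.py template boils down to a small asyncio entry point. Assembled from the pieces visible in this hunk (the CrawlerProcess import and the crawl call), a complete runner looks roughly like the sketch below; the spider name, error handling and main guard are placeholders, not the template's exact code.

```python
# Minimal runner assembled from the template fragments shown above.
# 'my_spider' is a placeholder spider name.
import asyncio

from crawlo.crawler import CrawlerProcess


def main():
    """Run the spider"""
    try:
        asyncio.run(CrawlerProcess().crawl('my_spider'))
    except KeyboardInterrupt:
        print("Interrupted by user")


if __name__ == '__main__':
    main()
```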
@@ -1,8 +1,6 @@
  # -*- coding: UTF-8 -*-
  """
- {{project_name}}.spiders.{{spider_name}}
- =======================================
- Spider generated by the `crawlo genspider` command.
+ Spider: {{spider_name}}
  """

  from crawlo.spider import Spider
@@ -11,9 +9,7 @@ from ..items import {{item_class}}


  class {{class_name}}(Spider):
- """
- Spider: {{spider_name}}
- """
+ """{{spider_name}} spider"""
  name = '{{spider_name}}'
  allowed_domains = ['{{domain}}']
  start_urls = ['https://{{domain}}/']
@@ -22,16 +18,12 @@ class {{class_name}}(Spider):
  custom_settings = {}

  def start_requests(self):
- """
- Generate the initial requests.
- """
+ """Generate initial requests"""
  for url in self.start_urls:
  yield Request(url=url, callback=self.parse)

  def parse(self, response):
- """
- The main method for parsing the response.
- """
+ """Parse the response"""
  self.logger.info(f'Parsing page: {response.url}')

  yield {
@@ -1,10 +1,5 @@
  # -*- coding: UTF-8 -*-
  """
- {{project_name}}.spiders
- ========================
- Holds all of the project's spiders.
-
- This file supports auto-importing every spider module so that spiders are registered correctly.
- The framework automatically scans and imports every Python file in this directory (except __init__.py).
- """
- # The framework handles spider module imports automatically; no manual imports are needed
+ Spider module directory
+ The framework automatically scans this directory and registers all spiders in it
+ """
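Both the old and the new `spiders/__init__.py` docstrings describe automatic discovery of the spider modules in the package. The framework's own loader lives elsewhere (crawlo/utils/spider_loader.py is also touched in this release); the snippet below is only a generic sketch of how such package scanning is commonly done with the standard library, not crawlo's implementation.

```python
# Generic sketch of auto-importing every module in a spiders package so that
# class-registration side effects run. Not crawlo's actual loader.
import importlib
import pkgutil


def import_spider_modules(package_name: str) -> list:
    package = importlib.import_module(package_name)
    imported = []
    for module_info in pkgutil.iter_modules(package.__path__):
        imported.append(importlib.import_module(f"{package_name}.{module_info.name}"))
    return imported

# import_spider_modules("myproject.spiders")
```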
crawlo/tools/__init__.py CHANGED
@@ -35,60 +35,6 @@ from .text_cleaner import (
  extract_urls
  )

- from .data_formatter import (
- DataFormatter,
- format_number,
- format_currency,
- format_percentage,
- format_phone_number,
- format_chinese_id_card,
- capitalize_words
- )
-
- from .encoding_converter import (
- EncodingConverter,
- detect_encoding,
- to_utf8,
- convert_encoding
- )
-
- # Data validation helpers
- from .data_validator import (
- DataValidator,
- validate_email,
- validate_phone,
- validate_url,
- validate_chinese_id_card,
- validate_date,
- validate_number_range,
- check_data_integrity
- )
-
- # Request handling helpers
- from .request_tools import (
- build_url,
- add_query_params,
- merge_headers
- )
-
- # Retry mechanism helpers
- from .retry_mechanism import (
- RetryMechanism,
- retry,
- should_retry,
- exponential_backoff
- )
-
- # Authenticated proxy helpers
- from .authenticated_proxy import (
- AuthenticatedProxy,
- create_proxy_config,
- format_proxy_for_request,
- parse_proxy_url,
- validate_proxy_url,
- get_proxy_info
- )
-
  # Distributed coordination tools
  from .distributed_coordinator import (
  TaskDistributor,
@@ -118,8 +64,6 @@ __all__ = [

  # Data cleaning tools
  "TextCleaner",
- "DataFormatter",
- "EncodingConverter",
  "remove_html_tags",
  "decode_html_entities",
  "remove_extra_whitespace",
@@ -129,53 +73,6 @@
  "extract_numbers",
  "extract_emails",
  "extract_urls",
- "format_number",
- "format_currency",
- "format_percentage",
- "format_phone_number",
- "format_chinese_id_card",
- "capitalize_words",
- "detect_encoding",
- "to_utf8",
- "convert_encoding",
-
- # Data validation tools
- "DataValidator",
- "validate_email",
- "validate_phone",
- "validate_url",
- "validate_chinese_id_card",
- "validate_date",
- "validate_number_range",
- "check_data_integrity",
-
- # Request handling tools
- "build_url",
- "add_query_params",
- "merge_headers",
-
- # Retry mechanism helpers
- "RetryMechanism",
- "retry",
- "should_retry",
- "exponential_backoff",
-
- # Anti-crawling countermeasure tools
- "ProxyPoolManager",
- "CaptchaHandler",
- "AntiCrawler",
- "get_random_user_agent",
- "rotate_proxy",
- "handle_captcha",
- "detect_rate_limiting",
-
- # Authenticated proxy tools
- "AuthenticatedProxy",
- "create_proxy_config",
- "format_proxy_for_request",
- "parse_proxy_url",
- "validate_proxy_url",
- "get_proxy_info",

  # Distributed coordination tools
  "TaskDistributor",
@@ -24,7 +24,7 @@ import re
  from typing import Dict, Any
  from urllib.parse import urlparse

- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger


  class DynamicLoadingScenarioAdapter: