crawlo-1.1.4-py3-none-any.whl → crawlo-1.1.6-py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

This version of crawlo has been flagged as potentially problematic.
Files changed (190)
  1. crawlo/__init__.py +61 -34
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +61 -0
  4. crawlo/cleaners/data_formatter.py +226 -0
  5. crawlo/cleaners/encoding_converter.py +126 -0
  6. crawlo/cleaners/text_cleaner.py +233 -0
  7. crawlo/cli.py +40 -40
  8. crawlo/commands/__init__.py +13 -13
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/list.py +155 -155
  12. crawlo/commands/run.py +292 -285
  13. crawlo/commands/startproject.py +419 -196
  14. crawlo/commands/stats.py +188 -188
  15. crawlo/commands/utils.py +186 -186
  16. crawlo/config.py +312 -279
  17. crawlo/config_validator.py +253 -0
  18. crawlo/core/__init__.py +2 -2
  19. crawlo/core/engine.py +346 -172
  20. crawlo/core/processor.py +40 -40
  21. crawlo/core/scheduler.py +137 -166
  22. crawlo/crawler.py +1027 -1027
  23. crawlo/downloader/__init__.py +266 -242
  24. crawlo/downloader/aiohttp_downloader.py +220 -212
  25. crawlo/downloader/cffi_downloader.py +256 -251
  26. crawlo/downloader/httpx_downloader.py +259 -259
  27. crawlo/downloader/hybrid_downloader.py +214 -0
  28. crawlo/downloader/playwright_downloader.py +403 -0
  29. crawlo/downloader/selenium_downloader.py +473 -0
  30. crawlo/event.py +11 -11
  31. crawlo/exceptions.py +81 -81
  32. crawlo/extension/__init__.py +37 -37
  33. crawlo/extension/health_check.py +141 -141
  34. crawlo/extension/log_interval.py +57 -57
  35. crawlo/extension/log_stats.py +81 -81
  36. crawlo/extension/logging_extension.py +43 -43
  37. crawlo/extension/memory_monitor.py +104 -88
  38. crawlo/extension/performance_profiler.py +133 -117
  39. crawlo/extension/request_recorder.py +107 -107
  40. crawlo/filters/__init__.py +154 -154
  41. crawlo/filters/aioredis_filter.py +281 -242
  42. crawlo/filters/memory_filter.py +269 -269
  43. crawlo/items/__init__.py +23 -23
  44. crawlo/items/base.py +21 -21
  45. crawlo/items/fields.py +53 -53
  46. crawlo/items/items.py +104 -104
  47. crawlo/middleware/__init__.py +21 -21
  48. crawlo/middleware/default_header.py +32 -32
  49. crawlo/middleware/download_delay.py +28 -28
  50. crawlo/middleware/middleware_manager.py +135 -135
  51. crawlo/middleware/proxy.py +272 -248
  52. crawlo/middleware/request_ignore.py +30 -30
  53. crawlo/middleware/response_code.py +18 -18
  54. crawlo/middleware/response_filter.py +26 -26
  55. crawlo/middleware/retry.py +124 -124
  56. crawlo/mode_manager.py +212 -201
  57. crawlo/network/__init__.py +21 -21
  58. crawlo/network/request.py +338 -311
  59. crawlo/network/response.py +360 -271
  60. crawlo/pipelines/__init__.py +21 -21
  61. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  62. crawlo/pipelines/console_pipeline.py +39 -39
  63. crawlo/pipelines/csv_pipeline.py +316 -316
  64. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  65. crawlo/pipelines/json_pipeline.py +218 -218
  66. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  67. crawlo/pipelines/mongo_pipeline.py +131 -131
  68. crawlo/pipelines/mysql_pipeline.py +316 -316
  69. crawlo/pipelines/pipeline_manager.py +61 -56
  70. crawlo/pipelines/redis_dedup_pipeline.py +167 -162
  71. crawlo/project.py +188 -153
  72. crawlo/queue/pqueue.py +37 -37
  73. crawlo/queue/queue_manager.py +334 -307
  74. crawlo/queue/redis_priority_queue.py +299 -209
  75. crawlo/settings/__init__.py +7 -7
  76. crawlo/settings/default_settings.py +219 -278
  77. crawlo/settings/setting_manager.py +123 -100
  78. crawlo/spider/__init__.py +639 -639
  79. crawlo/stats_collector.py +59 -59
  80. crawlo/subscriber.py +130 -130
  81. crawlo/task_manager.py +30 -30
  82. crawlo/templates/crawlo.cfg.tmpl +10 -10
  83. crawlo/templates/project/__init__.py.tmpl +3 -3
  84. crawlo/templates/project/items.py.tmpl +17 -17
  85. crawlo/templates/project/middlewares.py.tmpl +110 -110
  86. crawlo/templates/project/pipelines.py.tmpl +97 -97
  87. crawlo/templates/project/run.py.tmpl +251 -251
  88. crawlo/templates/project/settings.py.tmpl +326 -279
  89. crawlo/templates/project/settings_distributed.py.tmpl +120 -0
  90. crawlo/templates/project/settings_gentle.py.tmpl +95 -0
  91. crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
  92. crawlo/templates/project/settings_simple.py.tmpl +69 -0
  93. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  94. crawlo/templates/spider/spider.py.tmpl +141 -141
  95. crawlo/tools/__init__.py +183 -0
  96. crawlo/tools/anti_crawler.py +269 -0
  97. crawlo/tools/authenticated_proxy.py +241 -0
  98. crawlo/tools/data_validator.py +181 -0
  99. crawlo/tools/date_tools.py +36 -0
  100. crawlo/tools/distributed_coordinator.py +387 -0
  101. crawlo/tools/retry_mechanism.py +221 -0
  102. crawlo/tools/scenario_adapter.py +263 -0
  103. crawlo/utils/__init__.py +35 -7
  104. crawlo/utils/batch_processor.py +261 -0
  105. crawlo/utils/controlled_spider_mixin.py +439 -439
  106. crawlo/utils/date_tools.py +290 -233
  107. crawlo/utils/db_helper.py +343 -343
  108. crawlo/utils/enhanced_error_handler.py +360 -0
  109. crawlo/utils/env_config.py +106 -0
  110. crawlo/utils/error_handler.py +126 -0
  111. crawlo/utils/func_tools.py +82 -82
  112. crawlo/utils/large_scale_config.py +286 -286
  113. crawlo/utils/large_scale_helper.py +343 -343
  114. crawlo/utils/log.py +128 -128
  115. crawlo/utils/performance_monitor.py +285 -0
  116. crawlo/utils/queue_helper.py +175 -175
  117. crawlo/utils/redis_connection_pool.py +335 -0
  118. crawlo/utils/redis_key_validator.py +200 -0
  119. crawlo/utils/request.py +267 -267
  120. crawlo/utils/request_serializer.py +219 -219
  121. crawlo/utils/spider_loader.py +62 -62
  122. crawlo/utils/system.py +11 -11
  123. crawlo/utils/tools.py +4 -4
  124. crawlo/utils/url.py +39 -39
  125. {crawlo-1.1.4.dist-info → crawlo-1.1.6.dist-info}/METADATA +401 -403
  126. crawlo-1.1.6.dist-info/RECORD +189 -0
  127. examples/__init__.py +7 -7
  128. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +82 -0
  129. tests/__init__.py +7 -7
  130. tests/advanced_tools_example.py +276 -0
  131. tests/authenticated_proxy_example.py +237 -0
  132. tests/cleaners_example.py +161 -0
  133. tests/config_validation_demo.py +103 -0
  134. {examples → tests}/controlled_spider_example.py +205 -205
  135. tests/date_tools_example.py +181 -0
  136. tests/dynamic_loading_example.py +524 -0
  137. tests/dynamic_loading_test.py +105 -0
  138. tests/env_config_example.py +134 -0
  139. tests/error_handling_example.py +172 -0
  140. tests/redis_key_validation_demo.py +131 -0
  141. tests/response_improvements_example.py +145 -0
  142. tests/test_advanced_tools.py +149 -0
  143. tests/test_all_redis_key_configs.py +146 -0
  144. tests/test_authenticated_proxy.py +142 -0
  145. tests/test_cleaners.py +55 -0
  146. tests/test_comprehensive.py +147 -0
  147. tests/test_config_validator.py +194 -0
  148. tests/test_date_tools.py +124 -0
  149. tests/test_double_crawlo_fix.py +208 -0
  150. tests/test_double_crawlo_fix_simple.py +125 -0
  151. tests/test_dynamic_downloaders_proxy.py +125 -0
  152. tests/test_dynamic_proxy.py +93 -0
  153. tests/test_dynamic_proxy_config.py +147 -0
  154. tests/test_dynamic_proxy_real.py +110 -0
  155. tests/test_edge_cases.py +304 -0
  156. tests/test_enhanced_error_handler.py +271 -0
  157. tests/test_env_config.py +122 -0
  158. tests/test_error_handler_compatibility.py +113 -0
  159. tests/test_final_validation.py +153 -153
  160. tests/test_framework_env_usage.py +104 -0
  161. tests/test_integration.py +357 -0
  162. tests/test_item_dedup_redis_key.py +123 -0
  163. tests/test_parsel.py +30 -0
  164. tests/test_performance.py +328 -0
  165. tests/test_proxy_health_check.py +32 -32
  166. tests/test_proxy_middleware_integration.py +136 -136
  167. tests/test_proxy_providers.py +56 -56
  168. tests/test_proxy_stats.py +19 -19
  169. tests/test_proxy_strategies.py +59 -59
  170. tests/test_queue_manager_double_crawlo.py +231 -0
  171. tests/test_queue_manager_redis_key.py +177 -0
  172. tests/test_redis_config.py +28 -28
  173. tests/test_redis_connection_pool.py +295 -0
  174. tests/test_redis_key_naming.py +182 -0
  175. tests/test_redis_key_validator.py +124 -0
  176. tests/test_redis_queue.py +224 -224
  177. tests/test_request_serialization.py +70 -70
  178. tests/test_response_improvements.py +153 -0
  179. tests/test_scheduler.py +241 -241
  180. tests/test_simple_response.py +62 -0
  181. tests/test_telecom_spider_redis_key.py +206 -0
  182. tests/test_template_content.py +88 -0
  183. tests/test_template_redis_key.py +135 -0
  184. tests/test_tools.py +154 -0
  185. tests/tools_example.py +258 -0
  186. crawlo/core/enhanced_engine.py +0 -190
  187. crawlo-1.1.4.dist-info/RECORD +0 -117
  188. {crawlo-1.1.4.dist-info → crawlo-1.1.6.dist-info}/WHEEL +0 -0
  189. {crawlo-1.1.4.dist-info → crawlo-1.1.6.dist-info}/entry_points.txt +0 -0
  190. {crawlo-1.1.4.dist-info → crawlo-1.1.6.dist-info}/top_level.txt +0 -0
crawlo/templates/project/settings_distributed.py.tmpl
@@ -0,0 +1,120 @@
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project settings (distributed edition)
+ =============================
+ Distributed crawler project configuration built on the Crawlo framework.
+ Suited to large-scale data collection and multi-node deployment.
+ """
+
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project information ==============================
+ PROJECT_NAME = '{{project_name}}'
+ VERSION = '1.0.0'
+
+ # ============================== Distributed configuration ==============================
+ # Build the distributed configuration via the config factory
+ CONFIG = CrawloConfig.distributed(
+     redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
+     redis_port=int(os.getenv('REDIS_PORT', 6379)),
+     redis_password=os.getenv('REDIS_PASSWORD', ''),
+     project_name='{{project_name}}',
+     concurrency=16,
+     download_delay=1.0
+ )
+
+ # Load the generated settings into module scope
+ locals().update(CONFIG.to_dict())
+
+ # ============================== Network request settings ==============================
+ DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
+ DOWNLOAD_TIMEOUT = 60
+ VERIFY_SSL = True
+
+ # ============================== Concurrency settings ==============================
+ CONCURRENCY = 16
+ MAX_RUNNING_SPIDERS = 5
+ DOWNLOAD_DELAY = 1.0
+
+ # ============================== Queue settings ==============================
+ SCHEDULER_MAX_QUEUE_SIZE = 5000
+ QUEUE_MAX_RETRIES = 5
+ QUEUE_TIMEOUT = 300
+
+ # ============================== Redis settings ==============================
+ REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
+ REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
+ REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '')
+ REDIS_DB = int(os.getenv('REDIS_DB', 0))
+
+ # Build the URL according to whether a password is set
+ if REDIS_PASSWORD:
+     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+ else:
+     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+
+ # ============================== Data storage settings ==============================
+ # MySQL settings
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+ MYSQL_BATCH_SIZE = 100
+ MYSQL_USE_BATCH = True
+
+ # MongoDB settings
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = '{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+ MONGO_BATCH_SIZE = 100
+ MONGO_USE_BATCH = True
+
+ # ============================== Deduplication settings ==============================
+ REDIS_TTL = 0
+ CLEANUP_FP = 0
+ FILTER_DEBUG = True
+
+ # ============================== Middlewares and pipelines ==============================
+ MIDDLEWARES = [
+     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+     'crawlo.middleware.proxy.ProxyMiddleware',
+     'crawlo.middleware.retry.RetryMiddleware',
+     'crawlo.middleware.response_code.ResponseCodeMiddleware',
+     'crawlo.middleware.response_filter.ResponseFilterMiddleware',
+ ]
+
+ PIPELINES = [
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',
+     # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
+     # 'crawlo.pipelines.mongo_pipeline.MongoPipeline',
+ ]
+
+ # ============================== Extensions ==============================
+ EXTENSIONS = [
+     'crawlo.extension.log_interval.LogIntervalExtension',
+     'crawlo.extension.log_stats.LogStats',
+     'crawlo.extension.logging_extension.CustomLoggerExtension',
+     # 'crawlo.extension.memory_monitor.MemoryMonitorExtension',
+     # 'crawlo.extension.request_recorder.RequestRecorderExtension',
+ ]
+
+ # ============================== Logging settings ==============================
+ LOG_LEVEL = 'INFO'
+ LOG_FILE = f'logs/{{project_name}}.log'
+ STATS_DUMP = True
+
+ # ============================== Proxy settings ==============================
+ PROXY_ENABLED = False
+ PROXY_API_URL = ""
+ PROXY_EXTRACTOR = "proxy"
+ PROXY_REFRESH_INTERVAL = 60
+ PROXY_API_TIMEOUT = 10
+
+ # ============================== Custom settings ==============================
+ # Add project-specific settings here
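The if/else at the end of the Redis block encodes the redis:// URL convention for authentication: the password sits in the userinfo part of the URL after a leading colon (an empty username). A minimal illustration of the same rule, using a hypothetical build_redis_url helper that is not part of crawlo:

    # Hypothetical helper mirroring the template's REDIS_URL branch.
    def build_redis_url(host: str, port: int, db: int, password: str = "") -> str:
        # An empty username plus ":<password>@" is the redis:// auth form.
        auth = f":{password}@" if password else ""
        return f"redis://{auth}{host}:{port}/{db}"

    assert build_redis_url("127.0.0.1", 6379, 0) == "redis://127.0.0.1:6379/0"
    assert build_redis_url("10.0.0.5", 6379, 1, password="s3cret") == "redis://:s3cret@10.0.0.5:6379/1"

Note also the settings-merge pattern near the top of the template: locals().update(CONFIG.to_dict()) injects the factory-generated settings as module-level names, and a plain assignment further down (such as CONCURRENCY = 16) rebinds the name, so the final module globals win.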
crawlo/templates/project/settings_gentle.py.tmpl
@@ -0,0 +1,95 @@
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project settings (gentle edition)
+ =============================
+ Gentle crawler project configuration built on the Crawlo framework.
+ Suited to low-load crawling that stays friendly to the target site.
+ """
+
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project information ==============================
+ PROJECT_NAME = '{{project_name}}'
+ VERSION = '1.0.0'
+
+ # ============================== Gentle-mode configuration ==============================
+ # Build the gentle-mode configuration via the config factory
+ CONFIG = CrawloConfig.presets().gentle()
+
+ # Load the generated settings into module scope
+ locals().update(CONFIG.to_dict())
+
+ # ============================== Network request settings ==============================
+ DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
+ DOWNLOAD_TIMEOUT = 60
+ VERIFY_SSL = True
+
+ # ============================== Low-concurrency settings ==============================
+ CONCURRENCY = 2
+ MAX_RUNNING_SPIDERS = 1
+ DOWNLOAD_DELAY = 3.0
+ RANDOMNESS = True
+ RANDOM_RANGE = (2.0, 5.0)
+
+ # ============================== Connection pool settings ==============================
+ CONNECTION_POOL_LIMIT = 10
+ CONNECTION_POOL_LIMIT_PER_HOST = 5
+
+ # ============================== Retry settings ==============================
+ MAX_RETRY_TIMES = 3
+ RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
+ IGNORE_HTTP_CODES = [403, 404]
+
+ # ============================== Queue settings ==============================
+ SCHEDULER_MAX_QUEUE_SIZE = 1000
+ QUEUE_MAX_RETRIES = 3
+ QUEUE_TIMEOUT = 300
+
+ # ============================== Data storage settings ==============================
+ # MySQL settings
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+
+ # MongoDB settings
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = '{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+
+ # ============================== Deduplication settings ==============================
+ REDIS_TTL = 0
+ CLEANUP_FP = 0
+ FILTER_DEBUG = True
+
+ # ============================== Middlewares and pipelines ==============================
+ MIDDLEWARES = [
+     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+     'crawlo.middleware.retry.RetryMiddleware',
+     'crawlo.middleware.response_code.ResponseCodeMiddleware',
+ ]
+
+ PIPELINES = [
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',
+ ]
+
+ # ============================== Extensions ==============================
+ EXTENSIONS = [
+     'crawlo.extension.log_interval.LogIntervalExtension',
+     'crawlo.extension.log_stats.LogStats',
+     'crawlo.extension.logging_extension.CustomLoggerExtension',
+ ]
+
+ # ============================== Logging settings ==============================
+ LOG_LEVEL = 'INFO'
+ LOG_FILE = f'logs/{{project_name}}.log'
+ STATS_DUMP = True
+
+ # ============================== Custom settings ==============================
+ # Add project-specific settings here
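DOWNLOAD_DELAY, RANDOMNESS, and RANDOM_RANGE work together to pace requests. This diff does not show crawlo's actual jitter semantics, so the sketch below is only one plausible reading: it assumes RANDOM_RANGE scales the base delay, which would be consistent with the (0.8, 1.2) range in the high-performance template later in this diff.

    import random

    DOWNLOAD_DELAY = 3.0       # base delay, from the template above
    RANDOMNESS = True
    RANDOM_RANGE = (2.0, 5.0)  # assumed to be multiplier bounds (unverified)

    def next_delay(base: float = DOWNLOAD_DELAY) -> float:
        # With RANDOMNESS on, jitter the base delay uniformly within the range;
        # otherwise fall back to the fixed delay.
        if RANDOMNESS:
            return base * random.uniform(*RANDOM_RANGE)
        return base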
crawlo/templates/project/settings_high_performance.py.tmpl
@@ -0,0 +1,152 @@
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project settings (high-performance edition)
+ =============================
+ High-performance crawler project configuration built on the Crawlo framework.
+ Tuned for large-scale, high-concurrency scenarios.
+ """
+
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project information ==============================
+ PROJECT_NAME = '{{project_name}}'
+ VERSION = '1.0.0'
+
+ # ============================== High-performance configuration ==============================
+ # Build the high-performance configuration via the config factory
+ CONFIG = CrawloConfig.presets().large_scale(
+     redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
+     project_name='{{project_name}}'
+ )
+
+ # Load the generated settings into module scope
+ locals().update(CONFIG.to_dict())
+
+ # ============================== Network request settings ==============================
+ DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"
+ DOWNLOAD_TIMEOUT = 30
+ VERIFY_SSL = True
+ USE_SESSION = True
+
+ # ============================== High-concurrency settings ==============================
+ CONCURRENCY = 32
+ MAX_RUNNING_SPIDERS = 10
+ DOWNLOAD_DELAY = 0.5
+ RANDOMNESS = True
+ RANDOM_RANGE = (0.8, 1.2)
+
+ # ============================== Connection pool settings ==============================
+ CONNECTION_POOL_LIMIT = 100
+ CONNECTION_POOL_LIMIT_PER_HOST = 50
+
+ # ============================== Retry settings ==============================
+ MAX_RETRY_TIMES = 5
+ RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
+ IGNORE_HTTP_CODES = [403, 404]
+
+ # ============================== Queue settings ==============================
+ SCHEDULER_MAX_QUEUE_SIZE = 10000
+ SCHEDULER_QUEUE_NAME = f'crawlo:{{project_name}}:queue:requests'
+ QUEUE_MAX_RETRIES = 5
+ QUEUE_TIMEOUT = 300
+ LARGE_SCALE_BATCH_SIZE = 2000
+ LARGE_SCALE_CHECKPOINT_INTERVAL = 5000
+
+ # ============================== Redis settings ==============================
+ REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
+ REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
+ REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '')
+ REDIS_DB = int(os.getenv('REDIS_DB', 0))
+
+ # Build the URL according to whether a password is set
+ if REDIS_PASSWORD:
+     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+ else:
+     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+
+ # ============================== Data storage settings ==============================
+ # MySQL settings
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+ MYSQL_BATCH_SIZE = 200
+ MYSQL_USE_BATCH = True
+ MYSQL_POOL_MIN = 10
+ MYSQL_POOL_MAX = 50
+
+ # MongoDB settings
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = '{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+ MONGO_BATCH_SIZE = 200
+ MONGO_USE_BATCH = True
+ MONGO_MAX_POOL_SIZE = 300
+ MONGO_MIN_POOL_SIZE = 50
+
+ # ============================== Deduplication settings ==============================
+ REDIS_TTL = 0
+ CLEANUP_FP = 0
+ FILTER_DEBUG = False  # debug logging off in production
+
+ # ============================== Middlewares and pipelines ==============================
+ MIDDLEWARES = [
+     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+     'crawlo.middleware.proxy.ProxyMiddleware',
+     'crawlo.middleware.retry.RetryMiddleware',
+     'crawlo.middleware.response_code.ResponseCodeMiddleware',
+     'crawlo.middleware.response_filter.ResponseFilterMiddleware',
+ ]
+
+ PIPELINES = [
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',
+     # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
+     # 'crawlo.pipelines.mongo_pipeline.MongoPipeline',
+ ]
+
+ # ============================== Extensions ==============================
+ EXTENSIONS = [
+     'crawlo.extension.log_interval.LogIntervalExtension',
+     'crawlo.extension.log_stats.LogStats',
+     'crawlo.extension.logging_extension.CustomLoggerExtension',
+     # 'crawlo.extension.memory_monitor.MemoryMonitorExtension',
+     # 'crawlo.extension.request_recorder.RequestRecorderExtension',
+     # 'crawlo.extension.performance_profiler.PerformanceProfilerExtension',
+ ]
+
+ # ============================== Logging settings ==============================
+ LOG_LEVEL = 'INFO'
+ LOG_FILE = f'logs/{{project_name}}.log'
+ STATS_DUMP = True
+
+ # ============================== Proxy settings ==============================
+ PROXY_ENABLED = False
+ PROXY_API_URL = ""
+ PROXY_EXTRACTOR = "proxy"
+ PROXY_REFRESH_INTERVAL = 30
+ PROXY_API_TIMEOUT = 5
+
+ # ============================== Browser fingerprint settings ==============================
+ CURL_BROWSER_TYPE = "chrome"
+ CURL_BROWSER_VERSION_MAP = {
+     "chrome": "chrome136",
+     "edge": "edge101",
+     "safari": "safari184",
+     "firefox": "firefox135",
+ }
+
+ # ============================== Downloader optimization settings ==============================
+ HTTPX_HTTP2 = True
+ HTTPX_FOLLOW_REDIRECTS = True
+ AIOHTTP_AUTO_DECOMPRESS = True
+ CONNECTION_TTL_DNS_CACHE = 300
+ CONNECTION_KEEPALIVE_TIMEOUT = 15
+
+ # ============================== Custom settings ==============================
+ # Add project-specific settings here
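CURL_BROWSER_TYPE selects an entry from CURL_BROWSER_VERSION_MAP, which names the browser build the CurlCffiDownloader should mimic. A small sketch of the lookup, assuming (unverified from this diff alone) that the resolved value is ultimately handed to curl_cffi's impersonate parameter:

    CURL_BROWSER_TYPE = "chrome"
    CURL_BROWSER_VERSION_MAP = {
        "chrome": "chrome136",
        "edge": "edge101",
        "safari": "safari184",
        "firefox": "firefox135",
    }

    # Fall back to the raw type name if no versioned entry is mapped.
    impersonate = CURL_BROWSER_VERSION_MAP.get(CURL_BROWSER_TYPE, CURL_BROWSER_TYPE)
    assert impersonate == "chrome136"
    # A curl_cffi-based downloader would then issue something like:
    #   curl_cffi.requests.get(url, impersonate=impersonate)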
crawlo/templates/project/settings_simple.py.tmpl
@@ -0,0 +1,69 @@
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project settings (simple edition)
+ =============================
+ Minimal crawler project configuration built on the Crawlo framework.
+ Suited to quick starts and small projects.
+ """
+
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project information ==============================
+ PROJECT_NAME = '{{project_name}}'
+ VERSION = '1.0.0'
+
+ # ============================== Basic configuration ==============================
+ # Build the basic configuration via the config factory
+ CONFIG = CrawloConfig.standalone(
+     concurrency=4,
+     download_delay=1.0
+ )
+
+ # Load the generated settings into module scope
+ locals().update(CONFIG.to_dict())
+
+ # ============================== Network request settings ==============================
+ DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
+ DOWNLOAD_TIMEOUT = 30
+ VERIFY_SSL = True
+
+ # ============================== Concurrency settings ==============================
+ CONCURRENCY = 4
+ DOWNLOAD_DELAY = 1.0
+
+ # ============================== Data storage settings ==============================
+ # MySQL settings
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+
+ # MongoDB settings
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = '{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+
+ # ============================== Middlewares and pipelines ==============================
+ MIDDLEWARES = [
+     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+     'crawlo.middleware.retry.RetryMiddleware',
+     'crawlo.middleware.response_code.ResponseCodeMiddleware',
+ ]
+
+ PIPELINES = [
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',
+ ]
+
+ # ============================== Logging settings ==============================
+ LOG_LEVEL = 'INFO'
+ LOG_FILE = f'logs/{{project_name}}.log'
+ STATS_DUMP = True
+
+ # ============================== Custom settings ==============================
+ # Add project-specific settings here
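All four templates read connection details through os.getenv with an inline default, so deployments override them via the environment instead of editing the settings file. Environment values always arrive as strings, which is why the numeric settings wrap the lookup in int(); int() also tolerates the int default when the variable is unset:

    import os

    os.environ["MYSQL_PORT"] = "3307"                   # e.g. exported by the deployment
    assert int(os.getenv("MYSQL_PORT", 3306)) == 3307   # string from the environment

    del os.environ["MYSQL_PORT"]
    assert int(os.getenv("MYSQL_PORT", 3306)) == 3306   # int default, passed through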
crawlo/templates/project/spiders/__init__.py.tmpl
@@ -1,6 +1,6 @@
- # -*- coding: UTF-8 -*-
- """
- {{project_name}}.spiders
- ========================
- All of the project's spiders live here.
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}}.spiders
+ ========================
+ All of the project's spiders live here.
  """