crawlo 1.1.3-py3-none-any.whl → 1.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (115)
  1. crawlo/__init__.py +28 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +61 -0
  4. crawlo/cleaners/data_formatter.py +226 -0
  5. crawlo/cleaners/encoding_converter.py +126 -0
  6. crawlo/cleaners/text_cleaner.py +233 -0
  7. crawlo/commands/startproject.py +117 -13
  8. crawlo/config.py +30 -0
  9. crawlo/config_validator.py +253 -0
  10. crawlo/core/engine.py +185 -11
  11. crawlo/core/scheduler.py +49 -78
  12. crawlo/crawler.py +6 -6
  13. crawlo/downloader/__init__.py +24 -0
  14. crawlo/downloader/aiohttp_downloader.py +8 -0
  15. crawlo/downloader/cffi_downloader.py +5 -0
  16. crawlo/downloader/hybrid_downloader.py +214 -0
  17. crawlo/downloader/playwright_downloader.py +403 -0
  18. crawlo/downloader/selenium_downloader.py +473 -0
  19. crawlo/extension/__init__.py +17 -10
  20. crawlo/extension/health_check.py +142 -0
  21. crawlo/extension/log_interval.py +27 -18
  22. crawlo/extension/log_stats.py +62 -24
  23. crawlo/extension/logging_extension.py +18 -9
  24. crawlo/extension/memory_monitor.py +105 -0
  25. crawlo/extension/performance_profiler.py +134 -0
  26. crawlo/extension/request_recorder.py +108 -0
  27. crawlo/filters/aioredis_filter.py +50 -12
  28. crawlo/middleware/proxy.py +26 -2
  29. crawlo/mode_manager.py +24 -19
  30. crawlo/network/request.py +30 -3
  31. crawlo/network/response.py +114 -25
  32. crawlo/pipelines/mongo_pipeline.py +81 -66
  33. crawlo/pipelines/mysql_pipeline.py +165 -43
  34. crawlo/pipelines/redis_dedup_pipeline.py +7 -3
  35. crawlo/queue/queue_manager.py +15 -2
  36. crawlo/queue/redis_priority_queue.py +144 -76
  37. crawlo/settings/default_settings.py +93 -121
  38. crawlo/subscriber.py +62 -37
  39. crawlo/templates/project/items.py.tmpl +1 -1
  40. crawlo/templates/project/middlewares.py.tmpl +73 -49
  41. crawlo/templates/project/pipelines.py.tmpl +51 -295
  42. crawlo/templates/project/settings.py.tmpl +93 -17
  43. crawlo/templates/project/settings_distributed.py.tmpl +120 -0
  44. crawlo/templates/project/settings_gentle.py.tmpl +95 -0
  45. crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
  46. crawlo/templates/project/settings_simple.py.tmpl +69 -0
  47. crawlo/templates/spider/spider.py.tmpl +2 -38
  48. crawlo/tools/__init__.py +183 -0
  49. crawlo/tools/anti_crawler.py +269 -0
  50. crawlo/tools/authenticated_proxy.py +241 -0
  51. crawlo/tools/data_validator.py +181 -0
  52. crawlo/tools/date_tools.py +36 -0
  53. crawlo/tools/distributed_coordinator.py +387 -0
  54. crawlo/tools/retry_mechanism.py +221 -0
  55. crawlo/tools/scenario_adapter.py +263 -0
  56. crawlo/utils/__init__.py +29 -1
  57. crawlo/utils/batch_processor.py +261 -0
  58. crawlo/utils/date_tools.py +58 -1
  59. crawlo/utils/enhanced_error_handler.py +360 -0
  60. crawlo/utils/env_config.py +106 -0
  61. crawlo/utils/error_handler.py +126 -0
  62. crawlo/utils/performance_monitor.py +285 -0
  63. crawlo/utils/redis_connection_pool.py +335 -0
  64. crawlo/utils/redis_key_validator.py +200 -0
  65. crawlo-1.1.5.dist-info/METADATA +401 -0
  66. crawlo-1.1.5.dist-info/RECORD +185 -0
  67. tests/advanced_tools_example.py +276 -0
  68. tests/authenticated_proxy_example.py +237 -0
  69. tests/cleaners_example.py +161 -0
  70. tests/config_validation_demo.py +103 -0
  71. tests/date_tools_example.py +181 -0
  72. tests/dynamic_loading_example.py +524 -0
  73. tests/dynamic_loading_test.py +105 -0
  74. tests/env_config_example.py +134 -0
  75. tests/error_handling_example.py +172 -0
  76. tests/redis_key_validation_demo.py +131 -0
  77. tests/response_improvements_example.py +145 -0
  78. tests/test_advanced_tools.py +149 -0
  79. tests/test_all_redis_key_configs.py +146 -0
  80. tests/test_authenticated_proxy.py +142 -0
  81. tests/test_cleaners.py +55 -0
  82. tests/test_comprehensive.py +147 -0
  83. tests/test_config_validator.py +194 -0
  84. tests/test_date_tools.py +124 -0
  85. tests/test_dynamic_downloaders_proxy.py +125 -0
  86. tests/test_dynamic_proxy.py +93 -0
  87. tests/test_dynamic_proxy_config.py +147 -0
  88. tests/test_dynamic_proxy_real.py +110 -0
  89. tests/test_edge_cases.py +304 -0
  90. tests/test_enhanced_error_handler.py +271 -0
  91. tests/test_env_config.py +122 -0
  92. tests/test_error_handler_compatibility.py +113 -0
  93. tests/test_framework_env_usage.py +104 -0
  94. tests/test_integration.py +357 -0
  95. tests/test_item_dedup_redis_key.py +123 -0
  96. tests/test_parsel.py +30 -0
  97. tests/test_performance.py +328 -0
  98. tests/test_queue_manager_redis_key.py +177 -0
  99. tests/test_redis_connection_pool.py +295 -0
  100. tests/test_redis_key_naming.py +182 -0
  101. tests/test_redis_key_validator.py +124 -0
  102. tests/test_response_improvements.py +153 -0
  103. tests/test_simple_response.py +62 -0
  104. tests/test_telecom_spider_redis_key.py +206 -0
  105. tests/test_template_content.py +88 -0
  106. tests/test_template_redis_key.py +135 -0
  107. tests/test_tools.py +154 -0
  108. tests/tools_example.py +258 -0
  109. crawlo/core/enhanced_engine.py +0 -190
  110. crawlo-1.1.3.dist-info/METADATA +0 -635
  111. crawlo-1.1.3.dist-info/RECORD +0 -113
  112. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/WHEEL +0 -0
  113. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/entry_points.txt +0 -0
  114. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/top_level.txt +0 -0
  115. {examples → tests}/controlled_spider_example.py +0 -0
crawlo/templates/project/settings_distributed.py.tmpl
@@ -0,0 +1,120 @@
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project configuration (distributed edition)
+ =============================
+ Distributed crawler project configuration based on the Crawlo framework.
+ Suitable for large-scale data collection and multi-node deployment.
+ """
+
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project information ==============================
+ PROJECT_NAME = '{{project_name}}'
+ VERSION = '1.0.0'
+
+ # ============================== Distributed configuration ==============================
+ # Create the distributed configuration via the config factory
+ CONFIG = CrawloConfig.distributed(
+     redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
+     redis_port=int(os.getenv('REDIS_PORT', 6379)),
+     redis_password=os.getenv('REDIS_PASSWORD', ''),
+     project_name='{{project_name}}',
+     concurrency=16,
+     download_delay=1.0
+ )
+
+ # Apply the generated configuration
+ locals().update(CONFIG.to_dict())
+
+ # ============================== Network request configuration ==============================
+ DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
+ DOWNLOAD_TIMEOUT = 60
+ VERIFY_SSL = True
+
+ # ============================== Concurrency configuration ==============================
+ CONCURRENCY = 16
+ MAX_RUNNING_SPIDERS = 5
+ DOWNLOAD_DELAY = 1.0
+
+ # ============================== Queue configuration ==============================
+ SCHEDULER_MAX_QUEUE_SIZE = 5000
+ QUEUE_MAX_RETRIES = 5
+ QUEUE_TIMEOUT = 300
+
+ # ============================== Redis configuration ==============================
+ REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
+ REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
+ REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '')
+ REDIS_DB = int(os.getenv('REDIS_DB', 0))
+
+ # Build the URL depending on whether a password is set
+ if REDIS_PASSWORD:
+     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+ else:
+     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+
+ # ============================== Data storage configuration ==============================
+ # MySQL settings
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+ MYSQL_BATCH_SIZE = 100
+ MYSQL_USE_BATCH = True
+
+ # MongoDB settings
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = '{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+ MONGO_BATCH_SIZE = 100
+ MONGO_USE_BATCH = True
+
+ # ============================== Deduplication configuration ==============================
+ REDIS_TTL = 0
+ CLEANUP_FP = 0
+ FILTER_DEBUG = True
+
+ # ============================== Middlewares and pipelines ==============================
+ MIDDLEWARES = [
+     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+     'crawlo.middleware.proxy.ProxyMiddleware',
+     'crawlo.middleware.retry.RetryMiddleware',
+     'crawlo.middleware.response_code.ResponseCodeMiddleware',
+     'crawlo.middleware.response_filter.ResponseFilterMiddleware',
+ ]
+
+ PIPELINES = [
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',
+     # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
+     # 'crawlo.pipelines.mongo_pipeline.MongoPipeline',
+ ]
+
+ # ============================== Extensions ==============================
+ EXTENSIONS = [
+     'crawlo.extension.log_interval.LogIntervalExtension',
+     'crawlo.extension.log_stats.LogStats',
+     'crawlo.extension.logging_extension.CustomLoggerExtension',
+     # 'crawlo.extension.memory_monitor.MemoryMonitorExtension',
+     # 'crawlo.extension.request_recorder.RequestRecorderExtension',
+ ]
+
+ # ============================== Logging configuration ==============================
+ LOG_LEVEL = 'INFO'
+ LOG_FILE = f'logs/{{project_name}}.log'
+ STATS_DUMP = True
+
+ # ============================== Proxy configuration ==============================
+ PROXY_ENABLED = False
+ PROXY_API_URL = ""
+ PROXY_EXTRACTOR = "proxy"
+ PROXY_REFRESH_INTERVAL = 60
+ PROXY_API_TIMEOUT = 10
+
+ # ============================== Custom configuration ==============================
+ # Add project-specific settings here
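Note on the Redis URL assembly above: the template only prepends the ":password@" authority part when REDIS_PASSWORD is non-empty. A standalone sketch of that logic (build_redis_url is an illustrative helper, not part of the Crawlo API):

# Illustrative helper mirroring the template's conditional URL assembly.
def build_redis_url(host: str, port: int, db: int, password: str = "") -> str:
    if password:
        return f"redis://:{password}@{host}:{port}/{db}"
    return f"redis://{host}:{port}/{db}"

assert build_redis_url("127.0.0.1", 6379, 0) == "redis://127.0.0.1:6379/0"
assert build_redis_url("10.0.0.5", 6380, 1, "s3cret") == "redis://:s3cret@10.0.0.5:6380/1"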
crawlo/templates/project/settings_gentle.py.tmpl
@@ -0,0 +1,95 @@
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project configuration (gentle edition)
+ =============================
+ Gentle crawler project configuration based on the Crawlo framework.
+ Suitable for low-load crawling that stays friendly to the target site.
+ """
+
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project information ==============================
+ PROJECT_NAME = '{{project_name}}'
+ VERSION = '1.0.0'
+
+ # ============================== Gentle-mode configuration ==============================
+ # Create the gentle-mode configuration via the config factory
+ CONFIG = CrawloConfig.presets().gentle()
+
+ # Apply the generated configuration
+ locals().update(CONFIG.to_dict())
+
+ # ============================== Network request configuration ==============================
+ DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
+ DOWNLOAD_TIMEOUT = 60
+ VERIFY_SSL = True
+
+ # ============================== Low-concurrency configuration ==============================
+ CONCURRENCY = 2
+ MAX_RUNNING_SPIDERS = 1
+ DOWNLOAD_DELAY = 3.0
+ RANDOMNESS = True
+ RANDOM_RANGE = (2.0, 5.0)
+
+ # ============================== Connection pool configuration ==============================
+ CONNECTION_POOL_LIMIT = 10
+ CONNECTION_POOL_LIMIT_PER_HOST = 5
+
+ # ============================== Retry configuration ==============================
+ MAX_RETRY_TIMES = 3
+ RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
+ IGNORE_HTTP_CODES = [403, 404]
+
+ # ============================== Queue configuration ==============================
+ SCHEDULER_MAX_QUEUE_SIZE = 1000
+ QUEUE_MAX_RETRIES = 3
+ QUEUE_TIMEOUT = 300
+
+ # ============================== Data storage configuration ==============================
+ # MySQL settings
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+
+ # MongoDB settings
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = '{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+
+ # ============================== Deduplication configuration ==============================
+ REDIS_TTL = 0
+ CLEANUP_FP = 0
+ FILTER_DEBUG = True
+
+ # ============================== Middlewares and pipelines ==============================
+ MIDDLEWARES = [
+     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+     'crawlo.middleware.retry.RetryMiddleware',
+     'crawlo.middleware.response_code.ResponseCodeMiddleware',
+ ]
+
+ PIPELINES = [
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',
+ ]
+
+ # ============================== Extensions ==============================
+ EXTENSIONS = [
+     'crawlo.extension.log_interval.LogIntervalExtension',
+     'crawlo.extension.log_stats.LogStats',
+     'crawlo.extension.logging_extension.CustomLoggerExtension',
+ ]
+
+ # ============================== Logging configuration ==============================
+ LOG_LEVEL = 'INFO'
+ LOG_FILE = f'logs/{{project_name}}.log'
+ STATS_DUMP = True
+
+ # ============================== Custom configuration ==============================
+ # Add project-specific settings here
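The gentle template pairs conservative retry limits with explicit retry and ignore code lists. How Crawlo's RetryMiddleware consumes these settings is not shown in this diff, so the sketch below is illustrative only, assuming codes in RETRY_HTTP_CODES are retried up to MAX_RETRY_TIMES and codes in IGNORE_HTTP_CODES are dropped without an error:

# Illustrative decision function only; not the framework's RetryMiddleware.
MAX_RETRY_TIMES = 3
RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
IGNORE_HTTP_CODES = [403, 404]

def classify_response(status: int, attempts: int) -> str:
    if status in IGNORE_HTTP_CODES:
        return "ignore"        # drop silently
    if status in RETRY_HTTP_CODES and attempts < MAX_RETRY_TIMES:
        return "retry"         # schedule another attempt
    return "ok" if status < 400 else "fail"

assert classify_response(503, attempts=1) == "retry"
assert classify_response(404, attempts=0) == "ignore"
assert classify_response(200, attempts=0) == "ok"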
crawlo/templates/project/settings_high_performance.py.tmpl
@@ -0,0 +1,152 @@
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project configuration (high-performance edition)
+ =============================
+ High-performance crawler project configuration based on the Crawlo framework.
+ Optimized for large-scale, high-concurrency scenarios.
+ """
+
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project information ==============================
+ PROJECT_NAME = '{{project_name}}'
+ VERSION = '1.0.0'
+
+ # ============================== High-performance configuration ==============================
+ # Create the high-performance configuration via the config factory
+ CONFIG = CrawloConfig.presets().large_scale(
+     redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
+     project_name='{{project_name}}'
+ )
+
+ # Apply the generated configuration
+ locals().update(CONFIG.to_dict())
+
+ # ============================== Network request configuration ==============================
+ DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"
+ DOWNLOAD_TIMEOUT = 30
+ VERIFY_SSL = True
+ USE_SESSION = True
+
+ # ============================== High-concurrency configuration ==============================
+ CONCURRENCY = 32
+ MAX_RUNNING_SPIDERS = 10
+ DOWNLOAD_DELAY = 0.5
+ RANDOMNESS = True
+ RANDOM_RANGE = (0.8, 1.2)
+
+ # ============================== Connection pool configuration ==============================
+ CONNECTION_POOL_LIMIT = 100
+ CONNECTION_POOL_LIMIT_PER_HOST = 50
+
+ # ============================== Retry configuration ==============================
+ MAX_RETRY_TIMES = 5
+ RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
+ IGNORE_HTTP_CODES = [403, 404]
+
+ # ============================== Queue configuration ==============================
+ SCHEDULER_MAX_QUEUE_SIZE = 10000
+ SCHEDULER_QUEUE_NAME = f'crawlo:{{project_name}}:queue:requests'
+ QUEUE_MAX_RETRIES = 5
+ QUEUE_TIMEOUT = 300
+ LARGE_SCALE_BATCH_SIZE = 2000
+ LARGE_SCALE_CHECKPOINT_INTERVAL = 5000
+
+ # ============================== Redis configuration ==============================
+ REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
+ REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
+ REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '')
+ REDIS_DB = int(os.getenv('REDIS_DB', 0))
+
+ # Build the URL depending on whether a password is set
+ if REDIS_PASSWORD:
+     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+ else:
+     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
+
+ # ============================== Data storage configuration ==============================
+ # MySQL settings
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+ MYSQL_BATCH_SIZE = 200
+ MYSQL_USE_BATCH = True
+ MYSQL_POOL_MIN = 10
+ MYSQL_POOL_MAX = 50
+
+ # MongoDB settings
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = '{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+ MONGO_BATCH_SIZE = 200
+ MONGO_USE_BATCH = True
+ MONGO_MAX_POOL_SIZE = 300
+ MONGO_MIN_POOL_SIZE = 50
+
+ # ============================== Deduplication configuration ==============================
+ REDIS_TTL = 0
+ CLEANUP_FP = 0
+ FILTER_DEBUG = False  # disable debug logging in production
+
+ # ============================== Middlewares and pipelines ==============================
+ MIDDLEWARES = [
+     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+     'crawlo.middleware.proxy.ProxyMiddleware',
+     'crawlo.middleware.retry.RetryMiddleware',
+     'crawlo.middleware.response_code.ResponseCodeMiddleware',
+     'crawlo.middleware.response_filter.ResponseFilterMiddleware',
+ ]
+
+ PIPELINES = [
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',
+     # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
+     # 'crawlo.pipelines.mongo_pipeline.MongoPipeline',
+ ]
+
+ # ============================== Extensions ==============================
+ EXTENSIONS = [
+     'crawlo.extension.log_interval.LogIntervalExtension',
+     'crawlo.extension.log_stats.LogStats',
+     'crawlo.extension.logging_extension.CustomLoggerExtension',
+     # 'crawlo.extension.memory_monitor.MemoryMonitorExtension',
+     # 'crawlo.extension.request_recorder.RequestRecorderExtension',
+     # 'crawlo.extension.performance_profiler.PerformanceProfilerExtension',
+ ]
+
+ # ============================== Logging configuration ==============================
+ LOG_LEVEL = 'INFO'
+ LOG_FILE = f'logs/{{project_name}}.log'
+ STATS_DUMP = True
+
+ # ============================== Proxy configuration ==============================
+ PROXY_ENABLED = False
+ PROXY_API_URL = ""
+ PROXY_EXTRACTOR = "proxy"
+ PROXY_REFRESH_INTERVAL = 30
+ PROXY_API_TIMEOUT = 5
+
+ # ============================== Browser fingerprint configuration ==============================
+ CURL_BROWSER_TYPE = "chrome"
+ CURL_BROWSER_VERSION_MAP = {
+     "chrome": "chrome136",
+     "edge": "edge101",
+     "safari": "safari184",
+     "firefox": "firefox135",
+ }
+
+ # ============================== Downloader tuning ==============================
+ HTTPX_HTTP2 = True
+ HTTPX_FOLLOW_REDIRECTS = True
+ AIOHTTP_AUTO_DECOMPRESS = True
+ CONNECTION_TTL_DNS_CACHE = 300
+ CONNECTION_KEEPALIVE_TIMEOUT = 15
+
+ # ============================== Custom configuration ==============================
+ # Add project-specific settings here
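The high-performance template switches to CurlCffiDownloader and adds a browser fingerprint map. Presumably CURL_BROWSER_TYPE is resolved through CURL_BROWSER_VERSION_MAP to a curl_cffi impersonation target; that resolution step is an assumption here (the diff does not show how the downloader reads these settings), and the chosen target string must be supported by the installed curl_cffi version. A minimal sketch using curl_cffi directly:

# Assumption: the downloader maps CURL_BROWSER_TYPE through the version map
# to pick a curl_cffi impersonation target such as "chrome136".
from curl_cffi import requests

CURL_BROWSER_TYPE = "chrome"
CURL_BROWSER_VERSION_MAP = {"chrome": "chrome136", "edge": "edge101",
                            "safari": "safari184", "firefox": "firefox135"}

impersonate = CURL_BROWSER_VERSION_MAP[CURL_BROWSER_TYPE]
resp = requests.get("https://example.com", impersonate=impersonate, timeout=30)
print(resp.status_code)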
crawlo/templates/project/settings_simple.py.tmpl
@@ -0,0 +1,69 @@
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project configuration (simplified edition)
+ =============================
+ Simplified crawler project configuration based on the Crawlo framework.
+ Suitable for quick starts and small projects.
+ """
+
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project information ==============================
+ PROJECT_NAME = '{{project_name}}'
+ VERSION = '1.0.0'
+
+ # ============================== Basic configuration ==============================
+ # Create the basic configuration via the config factory
+ CONFIG = CrawloConfig.standalone(
+     concurrency=4,
+     download_delay=1.0
+ )
+
+ # Apply the generated configuration
+ locals().update(CONFIG.to_dict())
+
+ # ============================== Network request configuration ==============================
+ DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
+ DOWNLOAD_TIMEOUT = 30
+ VERIFY_SSL = True
+
+ # ============================== Concurrency configuration ==============================
+ CONCURRENCY = 4
+ DOWNLOAD_DELAY = 1.0
+
+ # ============================== Data storage configuration ==============================
+ # MySQL settings
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+
+ # MongoDB settings
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = '{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+
+ # ============================== Middlewares and pipelines ==============================
+ MIDDLEWARES = [
+     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+     'crawlo.middleware.retry.RetryMiddleware',
+     'crawlo.middleware.response_code.ResponseCodeMiddleware',
+ ]
+
+ PIPELINES = [
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',
+ ]
+
+ # ============================== Logging configuration ==============================
+ LOG_LEVEL = 'INFO'
+ LOG_FILE = f'logs/{{project_name}}.log'
+ STATS_DUMP = True
+
+ # ============================== Custom configuration ==============================
+ # Add project-specific settings here
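All four settings templates apply the factory output with locals().update(CONFIG.to_dict()). At module scope locals() is the module's global namespace, so every key in the returned dict becomes a module-level setting, which the literal assignments further down can then override. A minimal demonstration of the idiom, run at module scope, with a plain dict standing in for CONFIG.to_dict():

# Demo of the locals().update idiom used by the templates; the dict below is
# a stand-in for CONFIG.to_dict(), whose real contents are not shown here.
_generated = {"CONCURRENCY": 4, "DOWNLOAD_DELAY": 1.0}
locals().update(_generated)

assert CONCURRENCY == 4          # injected by the update above
assert DOWNLOAD_DELAY == 1.0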
crawlo/templates/spider/spider.py.tmpl
@@ -11,7 +11,7 @@
 
  from crawlo.spider import Spider
  from crawlo import Request
- # from {{project_name}}.items import {{item_class}}  # optional: import the item class
+ from ..items import ExampleItem
 
 
  class {{class_name}}(Spider):
@@ -139,40 +139,4 @@ class {{class_name}}(Spider):
          # 'publish_time': response.xpath('//time/@datetime').get(),
          # }
 
-         pass
-
-     def parse_error(self, failure):
-         """
-         Handle failed requests (optional).
-
-         Called when a request fails.
-         """
-         self.logger.error(f'Request failed: {failure.request.url} - {failure.value}')
-
-         # Optionally retry or record the failure
-         # yield {
-         #     'error_url': failure.request.url,
-         #     'error_message': str(failure.value),
-         #     'error_type': failure.type.__name__,
-         # }
-
-     def spider_opened(self, spider):
-         """
-         Callback invoked when the spider starts (optional).
-         """
-         self.logger.info(f'Spider {spider.name} started')
-
-         # Initialization, e.g. connecting to a database or loading configuration
-         # self.database = self.connect_database()
-         # self.cookies = self.load_cookies()
-
-     def spider_closed(self, spider, reason):
-         """
-         Callback invoked when the spider closes (optional).
-         """
-         self.logger.info(f'Spider {spider.name} closed, reason: {reason}')
-
-         # Cleanup, e.g. closing database connections or saving state
-         # if hasattr(self, 'database'):
-         #     self.database.close()
-         #     self.save_cookies()
+         pass
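With this change the generated spider imports ExampleItem from the project's items module instead of carrying the commented-out import and the optional lifecycle callbacks. A hedged sketch of how the new import might be used in a generated project; the start_requests/parse structure and the dict-style 'title' field follow common Scrapy-style conventions and are assumptions here, not part of the template shown above:

# Hypothetical usage of the new ExampleItem import; the field name and the
# dict-style assignment are assumptions about the generated items module.
from crawlo.spider import Spider
from crawlo import Request
from ..items import ExampleItem


class ExampleSpider(Spider):
    name = "example"

    def start_requests(self):
        yield Request("https://example.com", callback=self.parse)

    def parse(self, response):
        item = ExampleItem()
        item["title"] = response.xpath("//title/text()").get()
        yield item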
crawlo/tools/__init__.py
@@ -0,0 +1,183 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ # @Time   : 2025-09-10 22:00
+ # @Author : crawl-coder
+ # @Desc   : Crawlo framework toolkit collection
+ """
+
+ # Date/time helpers
+ from .date_tools import (
+     TimeUtils,
+     parse_time,
+     format_time,
+     time_diff,
+     to_timestamp,
+     to_datetime,
+     now,
+     to_timezone,
+     to_utc,
+     to_local,
+     from_timestamp_with_tz
+ )
+
+ # Data cleaning helpers
+ from ..cleaners import (
+     TextCleaner,
+     DataFormatter,
+     EncodingConverter,
+     remove_html_tags,
+     decode_html_entities,
+     remove_extra_whitespace,
+     remove_special_chars,
+     normalize_unicode,
+     clean_text,
+     extract_numbers,
+     extract_emails,
+     extract_urls,
+     format_number,
+     format_currency,
+     format_percentage,
+     format_phone_number,
+     format_chinese_id_card,
+     capitalize_words,
+     detect_encoding,
+     to_utf8,
+     convert_encoding
+ )
+
+ # Data validation helpers
+ from .data_validator import (
+     DataValidator,
+     validate_email,
+     validate_phone,
+     validate_url,
+     validate_chinese_id_card,
+     validate_date,
+     validate_number_range,
+     check_data_integrity
+ )
+
+ # Retry mechanism helpers
+ from .retry_mechanism import (
+     RetryMechanism,
+     retry,
+     should_retry,
+     exponential_backoff
+ )
+
+ # Anti-crawler countermeasure tools
+ from .anti_crawler import (
+     ProxyPoolManager,
+     CaptchaHandler,
+     AntiCrawler,
+     get_random_user_agent,
+     rotate_proxy,
+     handle_captcha,
+     detect_rate_limiting
+ )
+
+ # Authenticated proxy tools
+ from .authenticated_proxy import (
+     AuthenticatedProxy,
+     create_proxy_config,
+     format_proxy_for_request,
+     parse_proxy_url,
+     validate_proxy_url,
+     get_proxy_info
+ )
+
+ # Distributed coordination tools
+ from .distributed_coordinator import (
+     TaskDistributor,
+     DeduplicationTool,
+     DistributedCoordinator,
+     generate_task_id,
+     claim_task,
+     report_task_status,
+     get_cluster_info,
+     generate_pagination_tasks,
+     distribute_tasks
+ )
+
+ __all__ = [
+     # Date/time tools
+     "TimeUtils",
+     "parse_time",
+     "format_time",
+     "time_diff",
+     "to_timestamp",
+     "to_datetime",
+     "now",
+     "to_timezone",
+     "to_utc",
+     "to_local",
+     "from_timestamp_with_tz",
+
+     # Data cleaning tools
+     "TextCleaner",
+     "DataFormatter",
+     "EncodingConverter",
+     "remove_html_tags",
+     "decode_html_entities",
+     "remove_extra_whitespace",
+     "remove_special_chars",
+     "normalize_unicode",
+     "clean_text",
+     "extract_numbers",
+     "extract_emails",
+     "extract_urls",
+     "format_number",
+     "format_currency",
+     "format_percentage",
+     "format_phone_number",
+     "format_chinese_id_card",
+     "capitalize_words",
+     "detect_encoding",
+     "to_utf8",
+     "convert_encoding",
+
+     # Data validation tools
+     "DataValidator",
+     "validate_email",
+     "validate_phone",
+     "validate_url",
+     "validate_chinese_id_card",
+     "validate_date",
+     "validate_number_range",
+     "check_data_integrity",
+
+     # Retry mechanism
+     "RetryMechanism",
+     "retry",
+     "should_retry",
+     "exponential_backoff",
+
+     # Anti-crawler tools
+     "ProxyPoolManager",
+     "CaptchaHandler",
+     "AntiCrawler",
+     "get_random_user_agent",
+     "rotate_proxy",
+     "handle_captcha",
+     "detect_rate_limiting",
+
+     # Authenticated proxy tools
+     "AuthenticatedProxy",
+     "create_proxy_config",
+     "format_proxy_for_request",
+     "parse_proxy_url",
+     "validate_proxy_url",
+     "get_proxy_info",
+
+     # Distributed coordination tools
+     "TaskDistributor",
+     "DeduplicationTool",
+     "DistributedCoordinator",
+     "generate_task_id",
+     "claim_task",
+     "report_task_status",
+     "get_cluster_info",
+     "generate_pagination_tasks",
+     "distribute_tasks"
+ ]
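The new crawlo/tools package aggregates the date, cleaning, validation, retry, anti-crawler, proxy, and distributed-coordination helpers behind a single import path. The exact signatures are not visible in this diff, so the single-argument calls below are assumptions based only on the exported names; a rough usage sketch:

# Rough sketch; the argument and return conventions of these helpers are
# assumptions inferred from their names, not from the diff.
from crawlo.tools import clean_text, extract_emails, validate_email, now

text = clean_text("<p>Contact:  demo@example.com </p>")
for address in extract_emails(text):
    if validate_email(address):
        print(address, "seen at", now())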