crawlo 1.1.3-py3-none-any.whl → 1.1.4-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (118)
  1. crawlo/__init__.py +34 -34
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/list.py +155 -155
  8. crawlo/commands/run.py +285 -285
  9. crawlo/commands/startproject.py +196 -196
  10. crawlo/commands/stats.py +188 -188
  11. crawlo/commands/utils.py +186 -186
  12. crawlo/config.py +279 -279
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -171
  15. crawlo/core/enhanced_engine.py +189 -189
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +165 -165
  18. crawlo/crawler.py +1027 -1027
  19. crawlo/downloader/__init__.py +242 -242
  20. crawlo/downloader/aiohttp_downloader.py +212 -212
  21. crawlo/downloader/cffi_downloader.py +251 -251
  22. crawlo/downloader/httpx_downloader.py +259 -259
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +81 -81
  25. crawlo/extension/__init__.py +38 -31
  26. crawlo/extension/health_check.py +142 -0
  27. crawlo/extension/log_interval.py +58 -49
  28. crawlo/extension/log_stats.py +82 -44
  29. crawlo/extension/logging_extension.py +44 -35
  30. crawlo/extension/memory_monitor.py +89 -0
  31. crawlo/extension/performance_profiler.py +118 -0
  32. crawlo/extension/request_recorder.py +108 -0
  33. crawlo/filters/__init__.py +154 -154
  34. crawlo/filters/aioredis_filter.py +241 -241
  35. crawlo/filters/memory_filter.py +269 -269
  36. crawlo/items/__init__.py +23 -23
  37. crawlo/items/base.py +21 -21
  38. crawlo/items/fields.py +53 -53
  39. crawlo/items/items.py +104 -104
  40. crawlo/middleware/__init__.py +21 -21
  41. crawlo/middleware/default_header.py +32 -32
  42. crawlo/middleware/download_delay.py +28 -28
  43. crawlo/middleware/middleware_manager.py +135 -135
  44. crawlo/middleware/proxy.py +248 -248
  45. crawlo/middleware/request_ignore.py +30 -30
  46. crawlo/middleware/response_code.py +18 -18
  47. crawlo/middleware/response_filter.py +26 -26
  48. crawlo/middleware/retry.py +124 -124
  49. crawlo/mode_manager.py +200 -200
  50. crawlo/network/__init__.py +21 -21
  51. crawlo/network/request.py +311 -311
  52. crawlo/network/response.py +271 -271
  53. crawlo/pipelines/__init__.py +21 -21
  54. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  55. crawlo/pipelines/console_pipeline.py +39 -39
  56. crawlo/pipelines/csv_pipeline.py +316 -316
  57. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  58. crawlo/pipelines/json_pipeline.py +218 -218
  59. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  60. crawlo/pipelines/mongo_pipeline.py +132 -117
  61. crawlo/pipelines/mysql_pipeline.py +317 -195
  62. crawlo/pipelines/pipeline_manager.py +56 -56
  63. crawlo/pipelines/redis_dedup_pipeline.py +162 -162
  64. crawlo/project.py +153 -153
  65. crawlo/queue/pqueue.py +37 -37
  66. crawlo/queue/queue_manager.py +307 -307
  67. crawlo/queue/redis_priority_queue.py +208 -208
  68. crawlo/settings/__init__.py +7 -7
  69. crawlo/settings/default_settings.py +278 -244
  70. crawlo/settings/setting_manager.py +99 -99
  71. crawlo/spider/__init__.py +639 -639
  72. crawlo/stats_collector.py +59 -59
  73. crawlo/subscriber.py +131 -106
  74. crawlo/task_manager.py +30 -30
  75. crawlo/templates/crawlo.cfg.tmpl +10 -10
  76. crawlo/templates/project/__init__.py.tmpl +3 -3
  77. crawlo/templates/project/items.py.tmpl +17 -17
  78. crawlo/templates/project/middlewares.py.tmpl +111 -87
  79. crawlo/templates/project/pipelines.py.tmpl +97 -341
  80. crawlo/templates/project/run.py.tmpl +251 -251
  81. crawlo/templates/project/settings.py.tmpl +279 -250
  82. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  83. crawlo/templates/spider/spider.py.tmpl +142 -178
  84. crawlo/utils/__init__.py +7 -7
  85. crawlo/utils/controlled_spider_mixin.py +439 -439
  86. crawlo/utils/date_tools.py +233 -233
  87. crawlo/utils/db_helper.py +343 -343
  88. crawlo/utils/func_tools.py +82 -82
  89. crawlo/utils/large_scale_config.py +286 -286
  90. crawlo/utils/large_scale_helper.py +343 -343
  91. crawlo/utils/log.py +128 -128
  92. crawlo/utils/queue_helper.py +175 -175
  93. crawlo/utils/request.py +267 -267
  94. crawlo/utils/request_serializer.py +219 -219
  95. crawlo/utils/spider_loader.py +62 -62
  96. crawlo/utils/system.py +11 -11
  97. crawlo/utils/tools.py +4 -4
  98. crawlo/utils/url.py +39 -39
  99. crawlo-1.1.4.dist-info/METADATA +403 -0
  100. crawlo-1.1.4.dist-info/RECORD +117 -0
  101. examples/__init__.py +7 -7
  102. examples/controlled_spider_example.py +205 -205
  103. tests/__init__.py +7 -7
  104. tests/test_final_validation.py +153 -153
  105. tests/test_proxy_health_check.py +32 -32
  106. tests/test_proxy_middleware_integration.py +136 -136
  107. tests/test_proxy_providers.py +56 -56
  108. tests/test_proxy_stats.py +19 -19
  109. tests/test_proxy_strategies.py +59 -59
  110. tests/test_redis_config.py +28 -28
  111. tests/test_redis_queue.py +224 -224
  112. tests/test_request_serialization.py +70 -70
  113. tests/test_scheduler.py +241 -241
  114. crawlo-1.1.3.dist-info/METADATA +0 -635
  115. crawlo-1.1.3.dist-info/RECORD +0 -113
  116. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
  117. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
  118. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
crawlo/pipelines/redis_dedup_pipeline.py CHANGED
@@ -1,163 +1,163 @@
#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Redis-based item deduplication pipeline
=======================================
Provides item deduplication in distributed deployments so that duplicate records are not saved.

Features:
- Distributed: multiple nodes share the same dedup data
- High performance: fast lookups via a Redis set
- Configurable: custom Redis connection parameters are supported
- Fault tolerant: data is not lost on network errors
"""

import hashlib
from typing import Dict, Any, Optional
import redis

from crawlo import Item
from crawlo.spider import Spider
from crawlo.utils.log import get_logger
from crawlo.exceptions import DropItem


class RedisDedupPipeline:
    """Redis-based item deduplication pipeline"""

    def __init__(
        self,
        redis_host: str = 'localhost',
        redis_port: int = 6379,
        redis_db: int = 0,
        redis_password: Optional[str] = None,
        redis_key: str = 'crawlo:item_fingerprints',
        log_level: str = "INFO"
    ):
        """
        Initialize the Redis dedup pipeline.

        :param redis_host: Redis host
        :param redis_port: Redis port
        :param redis_db: Redis database number
        :param redis_password: Redis password
        :param redis_key: Redis key that stores the fingerprints
        :param log_level: log level
        """
        self.logger = get_logger(self.__class__.__name__, log_level)

        # Initialize the Redis connection
        try:
            self.redis_client = redis.Redis(
                host=redis_host,
                port=redis_port,
                db=redis_db,
                password=redis_password,
                decode_responses=True,
                socket_connect_timeout=5,
                socket_timeout=5
            )
            # Test the connection
            self.redis_client.ping()
            self.logger.info(f"Redis connected: {redis_host}:{redis_port}/{redis_db}")
        except Exception as e:
            self.logger.error(f"Redis connection failed: {e}")
            raise RuntimeError(f"Redis connection failed: {e}")

        self.redis_key = redis_key
        self.dropped_count = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Create a pipeline instance from the crawler settings."""
        settings = crawler.settings

        return cls(
            redis_host=settings.get('REDIS_HOST', 'localhost'),
            redis_port=settings.getint('REDIS_PORT', 6379),
            redis_db=settings.getint('REDIS_DB', 0),
            redis_password=settings.get('REDIS_PASSWORD') or None,
            redis_key=settings.get('REDIS_DEDUP_KEY', 'crawlo:item_fingerprints'),
            log_level=settings.get('LOG_LEVEL', 'INFO')
        )

    def process_item(self, item: Item, spider: Spider) -> Item:
        """
        Process an item and check it for duplicates.

        :param item: the item to process
        :param spider: the spider instance
        :return: the item, or raise DropItem for duplicates
        """
        try:
            # Generate the item fingerprint
            fingerprint = self._generate_item_fingerprint(item)

            # Use Redis SADD to check and add the fingerprint in one step:
            # SADD returns 0 if the fingerprint already exists, 1 if it is new
            is_new = self.redis_client.sadd(self.redis_key, fingerprint)

            if not is_new:
                # The fingerprint already exists, drop this item
                self.dropped_count += 1
                self.logger.debug(f"Dropping duplicate item: {fingerprint[:20]}...")
                raise DropItem(f"Duplicate item: {fingerprint}")
            else:
                # New item, keep processing
                self.logger.debug(f"Processing new item: {fingerprint[:20]}...")
                return item

        except redis.RedisError as e:
            self.logger.error(f"Redis error: {e}")
            # Keep processing on Redis errors so no data is lost
            return item
        except Exception as e:
            self.logger.error(f"Error while processing item: {e}")
            # Keep processing on other errors
            return item

    def _generate_item_fingerprint(self, item: Item) -> str:
        """
        Generate an item fingerprint.

        A unique fingerprint is built from all fields of the item and used
        for the duplicate check.

        :param item: the item
        :return: the fingerprint string
        """
        # Convert the item into a serializable dict
        try:
            item_dict = item.to_dict()
        except AttributeError:
            # Fall back for Item implementations without a to_dict method
            item_dict = dict(item)

        # Sort the dict so the fingerprint is deterministic
        sorted_items = sorted(item_dict.items())

        # Build the fingerprint string
        fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])

        # Use SHA-256 to produce a fixed-length fingerprint
        return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()

    def close_spider(self, spider: Spider) -> None:
        """
        Clean up when the spider closes.

        :param spider: the spider instance
        """
        try:
            # Collect dedup statistics
            total_items = self.redis_client.scard(self.redis_key)
            self.logger.info(f"Spider {spider.name} closed:")
            self.logger.info(f"  - duplicate items dropped: {self.dropped_count}")
            self.logger.info(f"  - fingerprints stored in Redis: {total_items}")

            # Note: fingerprints in Redis are not cleaned up by default;
            # enable cleanup via the settings if needed
            if spider.crawler.settings.getbool('REDIS_DEDUP_CLEANUP', False):
                deleted = self.redis_client.delete(self.redis_key)
                self.logger.info(f"  - fingerprints cleaned up: {deleted}")
        except Exception as e:
            self.logger.error(f"Error while closing spider: {e}")
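
The dedup check above hinges on the fingerprint built by _generate_item_fingerprint() and on Redis SADD returning 0 for an already-seen member (and 1 for a new one). Below is a minimal, self-contained sketch of that fingerprinting scheme; the field names are hypothetical examples, not taken from the package.

import hashlib

def item_fingerprint(item_dict: dict) -> str:
    # Sort the fields so identical data always hashes the same way, join the
    # non-None fields as "key=value" pairs, then digest the result with SHA-256.
    sorted_items = sorted(item_dict.items())
    fingerprint_string = '|'.join(f"{k}={v}" for k, v in sorted_items if v is not None)
    return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()

a = item_fingerprint({"url": "https://example.com", "title": "Example"})
b = item_fingerprint({"title": "Example", "url": "https://example.com"})
assert a == b  # field order does not affect the fingerprint

In the pipeline itself the digest is then SADDed into the configured Redis set (REDIS_DEDUP_KEY, default 'crawlo:item_fingerprints'), and from_crawler() reads REDIS_HOST, REDIS_PORT, REDIS_DB, REDIS_PASSWORD, REDIS_DEDUP_KEY and LOG_LEVEL from the crawler settings.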
crawlo/project.py CHANGED
@@ -1,153 +1,153 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Crawlo project bootstrap module.

Responsibilities:
1. Search upwards for the project root (via crawlo.cfg or settings.py)
2. Add the project root to sys.path
3. Load the settings module
4. Return a SettingManager instance
"""
import os
import sys
import configparser
from importlib import import_module
from inspect import iscoroutinefunction
from typing import Callable, Optional, Tuple

from crawlo.utils.log import get_logger
from crawlo.settings.setting_manager import SettingManager

logger = get_logger(__name__)


def _find_project_root(start_path: str = ".") -> Optional[str]:
    """
    Search upwards from the given path for the project root.
    A directory counts as the root when:
    1. it contains 'crawlo.cfg', or
    2. it contains both '__init__.py' and 'settings.py' (i.e. it is a Python package)
    """
    path = os.path.abspath(start_path)
    while True:
        cfg_file = os.path.join(path, "crawlo.cfg")
        if os.path.isfile(cfg_file):
            logger.info(f"✅ Found project config file: {cfg_file}")
            return path

        settings_file = os.path.join(path, "settings.py")
        init_file = os.path.join(path, "__init__.py")
        if os.path.isfile(settings_file) and os.path.isfile(init_file):
            logger.info(f"✅ Found project module: {path}")
            return path

        parent = os.path.dirname(path)
        if parent == path:
            break
        path = parent

    logger.warning("❌ Crawlo project root not found. Make sure you run from a directory containing 'crawlo.cfg' or 'settings.py'.")
    return None


def _get_settings_module_from_cfg(cfg_path: str) -> str:
    """Read the settings module path from crawlo.cfg."""
    config = configparser.ConfigParser()
    try:
        config.read(cfg_path, encoding="utf-8")
        if config.has_section("settings") and config.has_option("settings", "default"):
            module_path = config.get("settings", "default")
            logger.info(f"📄 Loading settings module from crawlo.cfg: {module_path}")
            return module_path
        else:
            raise RuntimeError(f"Config file is missing the [settings] section or the default option: {cfg_path}")
    except Exception as e:
        raise RuntimeError(f"Failed to parse crawlo.cfg: {e}")


def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
    """
    Get the settings manager instance (main entry point).

    Args:
        custom_settings: runtime overrides that take precedence over settings.py

    Returns:
        SettingManager: instance with the settings loaded
    """
    logger.info("🚀 Initializing Crawlo project settings...")

    # 1. Find the project root
    project_root = _find_project_root()
    if not project_root:
        raise RuntimeError("Crawlo project not found, please check the project layout")

    # 2. Determine the settings module
    settings_module_path = None
    cfg_file = os.path.join(project_root, "crawlo.cfg")

    if os.path.isfile(cfg_file):
        settings_module_path = _get_settings_module_from_cfg(cfg_file)
    else:
        # Infer: <project directory name>.settings
        project_name = os.path.basename(project_root)
        settings_module_path = f"{project_name}.settings"
        logger.warning(f"⚠️ crawlo.cfg not found, inferring settings module as: {settings_module_path}")

    # 3. Inject into sys.path
    project_root_str = os.path.abspath(project_root)
    if project_root_str not in sys.path:
        sys.path.insert(0, project_root_str)
        logger.info(f"📁 Project root added to sys.path: {project_root_str}")

    # 4. Load the SettingManager
    logger.info(f"⚙️ Loading settings module: {settings_module_path}")
    settings = SettingManager()

    try:
        settings.set_settings(settings_module_path)
        logger.info("✅ settings module loaded successfully")
    except Exception as e:
        raise ImportError(f"Failed to load settings module '{settings_module_path}': {e}")

    # 5. Merge runtime settings
    if custom_settings:
        settings.update_attributes(custom_settings)
        logger.info(f"🔧 Applied runtime custom settings: {list(custom_settings.keys())}")

    logger.info("🎉 Crawlo project settings initialized!")
    return settings


def load_class(_path):
    if not isinstance(_path, str):
        if callable(_path):
            return _path
        else:
            raise TypeError(f"args expect str or object, got {_path}")

    module_name, class_name = _path.rsplit('.', 1)
    module = import_module(module_name)

    try:
        cls = getattr(module, class_name)
    except AttributeError:
        raise NameError(f"Module {module_name!r} has no class named {class_name!r}")
    return cls


def merge_settings(spider, settings):
    spider_name = getattr(spider, 'name', 'UnknownSpider')
    if hasattr(spider, 'custom_settings'):
        custom_settings = getattr(spider, 'custom_settings')
        settings.update_attributes(custom_settings)
    else:
        logger.debug(f"Spider '{spider_name}' has no custom_settings, skipping merge")  # added logging


async def common_call(func: Callable, *args, **kwargs):
    if iscoroutinefunction(func):
        return await func(*args, **kwargs)
    else:
        return func(*args, **kwargs)
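
For reference, the two resolution steps above can be exercised on their own: _get_settings_module_from_cfg() expects a [settings] section with a 'default' option in crawlo.cfg, and load_class() turns a dotted path into a class object. The sketch below is illustrative only; 'myproject.settings' is a hypothetical project module, and the standard-library path in the last line is used purely so the snippet runs without a Crawlo project present.

import configparser
from importlib import import_module

# Minimal crawlo.cfg content: a [settings] section whose 'default' option
# points at the project's settings module.
CFG_TEXT = """
[settings]
default = myproject.settings
"""

config = configparser.ConfigParser()
config.read_string(CFG_TEXT)
print(config.get("settings", "default"))  # -> myproject.settings

def load_class(path: str):
    # Split 'package.module.ClassName' into a module path and a class name,
    # import the module, and return the attribute (simplified from load_class above).
    module_name, class_name = path.rsplit('.', 1)
    return getattr(import_module(module_name), class_name)

print(load_class("json.decoder.JSONDecoder"))  # any importable dotted path works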