crawlo 1.0.6__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__version__.py +1 -1
- crawlo/commands/check.py +107 -0
- crawlo/commands/list.py +92 -0
- crawlo/commands/run.py +109 -77
- crawlo/commands/stats.py +59 -0
- crawlo/crawler.py +340 -66
- crawlo/items/__init__.py +2 -1
- crawlo/items/base.py +1 -10
- crawlo/spider/__init__.py +91 -3
- crawlo/utils/project.py +14 -16
- {crawlo-1.0.6.dist-info → crawlo-1.0.8.dist-info}/METADATA +1 -1
- {crawlo-1.0.6.dist-info → crawlo-1.0.8.dist-info}/RECORD +15 -12
- {crawlo-1.0.6.dist-info → crawlo-1.0.8.dist-info}/WHEEL +0 -0
- {crawlo-1.0.6.dist-info → crawlo-1.0.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.0.6.dist-info → crawlo-1.0.8.dist-info}/top_level.txt +0 -0
crawlo/utils/project.py
CHANGED
@@ -19,8 +19,7 @@ from typing import Callable, Optional
 from crawlo.utils.log import get_logger
 from crawlo.settings.setting_manager import SettingManager
 
-
-logger =get_logger(__name__)
+logger = get_logger(__name__)
 
 
 def _find_project_root(start_path: str = '.') -> Optional[str]:
@@ -37,13 +36,11 @@ def _find_project_root(start_path: str = '.') -> Optional[str]:
         Optional[str]: 找到的项目根目录的绝对路径,如果未找到则返回 None。
     """
     path = os.path.abspath(start_path)
-    logger.info(f"开始向上搜索项目根目录,起始路径: {path}")
 
     while True:
         # 1. 检查是否存在 crawlo.cfg 文件
         cfg_file = os.path.join(path, 'crawlo.cfg')
         if os.path.isfile(cfg_file):
-            logger.info(f"在路径 {path} 找到 'crawlo.cfg' 文件,确定为项目根目录。")
             return path
 
         # 2. 检查是否存在 settings.py 文件,并且它位于一个 Python 包中
@@ -51,7 +48,6 @@ def _find_project_root(start_path: str = '.') -> Optional[str]:
         if os.path.isfile(settings_file):
             init_file = os.path.join(path, '__init__.py')
             if os.path.isfile(init_file):
-                logger.info(f"在路径 {path} 找到 'settings.py' 文件,确定为项目根目录。")
                 return path
             else:
                 logger.debug(f"在路径 {path} 找到 'settings.py',但缺少 '__init__.py',忽略。")
@@ -86,7 +82,7 @@ def _get_settings_module_from_cfg(cfg_path: str) -> str:
     config.read(cfg_path, encoding='utf-8')
     if config.has_section('settings') and config.has_option('settings', 'default'):
         module_path = config.get('settings', 'default')
-        logger.
+        logger.debug(f"从 'crawlo.cfg' 中读取到 settings 模块路径: {module_path}")
         return module_path
     else:
         error_msg = f"配置文件 '{cfg_path}' 缺少 '[settings]' 或 'default' 配置项。"
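For orientation, the `[settings]` lookup in the hunk above reads `crawlo.cfg` with configparser and returns the dotted settings module path. A minimal sketch of that lookup, assuming a config of the same shape (the `myproject` name is a placeholder, not taken from this package):

    # A minimal sketch of the lookup above; "myproject" is a hypothetical project name.
    import configparser

    CFG_TEXT = "[settings]\ndefault = myproject.settings\n"  # hypothetical crawlo.cfg body

    config = configparser.ConfigParser()
    config.read_string(CFG_TEXT)  # the real code uses config.read(cfg_path, encoding='utf-8')
    if config.has_section('settings') and config.has_option('settings', 'default'):
        print(config.get('settings', 'default'))  # -> myproject.settings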
@@ -113,7 +109,7 @@ def get_settings(custom_settings=None):
         RuntimeError: 当无法找到项目或配置文件时。
         ImportError: 当无法导入指定的 settings 模块时。
     """
-    logger.
+    logger.debug("正在初始化配置管理器...")
 
     # 1. 发现项目根目录
     project_root = _find_project_root()
@@ -122,7 +118,7 @@ def get_settings(custom_settings=None):
         logger.error(error_msg)
         raise RuntimeError(error_msg)
 
-    logger.
+    logger.debug(f"项目根目录已确定: {project_root}")
 
     # 2. 确定 settings 模块的导入路径
     settings_module_path = None
@@ -132,27 +128,27 @@ def get_settings(custom_settings=None):
     if os.path.isfile(cfg_file):
         settings_module_path = _get_settings_module_from_cfg(cfg_file)
     else:
-        logger.
+        logger.debug("未找到 'crawlo.cfg',尝试推断 settings 模块路径...")
         # 推断:项目目录名.settings
         project_name = os.path.basename(project_root)
         settings_module_path = f"{project_name}.settings"
-        logger.
+        logger.debug(f"推断 settings 模块路径为: {settings_module_path}")
 
     # 3. 将项目根目录添加到 Python 路径,确保可以成功导入
     if project_root not in sys.path:
         sys.path.insert(0, project_root)
-        logger.
+        logger.debug(f"已将项目根目录 '{project_root}' 添加到 Python 路径。")
     else:
         logger.debug(f"项目根目录 '{project_root}' 已在 Python 路径中。")
 
     # 4. 创建 SettingManager 并加载配置
-    logger.
+    logger.debug(f"正在加载 settings 模块: {settings_module_path}")
     settings = SettingManager()
 
     try:
         # 这会触发 SettingManager.set_settings(),从模块中加载所有大写常量
         settings.set_settings(settings_module_path)
-        logger.
+        logger.debug("settings 模块加载成功。")
     except Exception as e:
         error_msg = f"加载 settings 模块 '{settings_module_path}' 失败: {e}"
         logger.error(error_msg)
@@ -160,13 +156,14 @@ def get_settings(custom_settings=None):
 
     # 5. 应用运行时自定义设置
     if custom_settings:
-        logger.
+        logger.debug(f"正在应用运行时自定义设置: {custom_settings}")
         settings.update_attributes(custom_settings)
         logger.info("运行时自定义设置已应用。")
 
-    logger.
+    logger.debug("配置管理器初始化完成。")
     return settings
 
+
 def load_class(_path):
     if not isinstance(_path, str):
         if callable(_path):
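Net effect of the hunks above: `get_settings()` keeps the same five-step flow (find the project root, resolve the settings module, extend `sys.path`, load a `SettingManager`, apply runtime overrides) but logs the intermediate steps at DEBUG instead of INFO. A usage sketch, assuming it is run from inside a generated crawlo project (the `LOG_LEVEL` override key is illustrative, not taken from this diff):

    # Sketch only: get_settings() and its custom_settings parameter appear in
    # the diff above; the specific override key is an assumption.
    from crawlo.utils.project import get_settings

    # Walks upward for crawlo.cfg (or a settings.py package), imports the
    # project's settings module, then applies the runtime overrides.
    settings = get_settings(custom_settings={"LOG_LEVEL": "DEBUG"})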
@@ -183,13 +180,14 @@ def load_class(_path):
         raise NameError(f"Module {module_name!r} has no class named {class_name!r}")
     return cls
 
+
 def merge_settings(spider, settings):
     spider_name = getattr(spider, 'name', 'UnknownSpider')
     if hasattr(spider, 'custom_settings'):
         custom_settings = getattr(spider, 'custom_settings')
         settings.update_attributes(custom_settings)
     else:
-        logger.debug(f"爬虫 '{spider_name}' 无 custom_settings,跳过合并")
+        logger.debug(f"爬虫 '{spider_name}' 无 custom_settings,跳过合并")  # 添加日志
 
 
 async def common_call(func: Callable, *args, **kwargs):
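The `merge_settings()` change itself is only a trailing comment, but for context: it copies a spider's `custom_settings` into the shared settings object via `update_attributes()`. A minimal sketch, assuming `custom_settings` is a plain dict of overrides (the `DummySpider` class and `DOWNLOAD_DELAY` key are made up for illustration):

    # merge_settings() and SettingManager are taken from the diff above;
    # the spider class and setting key below are illustrative assumptions.
    from crawlo.settings.setting_manager import SettingManager
    from crawlo.utils.project import merge_settings


    class DummySpider:
        name = "dummy"
        custom_settings = {"DOWNLOAD_DELAY": 1}


    settings = SettingManager()
    merge_settings(DummySpider(), settings)  # applies custom_settings via update_attributes()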
{crawlo-1.0.6.dist-info → crawlo-1.0.8.dist-info}/RECORD
CHANGED
@@ -1,16 +1,19 @@
 crawlo/__init__.py,sha256=xpiIAZbSG3CzneJuDLPCbwfRcvw2wyHYl2kJjaNfNGY,584
-crawlo/__version__.py,sha256=
+crawlo/__version__.py,sha256=uyL3a6o1xccXPZ2OS65zqIN_lbEMT7PcCxErq7cuWwA,23
 crawlo/cli.py,sha256=hjAJKx9pba375sATvvcy-dtZyBIgXj8fRBq9RFIZHA4,1206
-crawlo/crawler.py,sha256=
+crawlo/crawler.py,sha256=AyKxUyJvCwb1u4d3Zn3vFmjH28ExWKIygfTICps-3yY,20026
 crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
 crawlo/exceptions.py,sha256=xdyZkvVcLEJ-19sWMHvn9IJsu30-hAY2jJhA2kYIims,1207
 crawlo/stats_collector.py,sha256=v4jC9BAe-23w93hWzbeMCCgQ9VuFPyxw5JV9ItbGH8w,1636
 crawlo/subscriber.py,sha256=udlHeTR0ymGQhCDxVUGwUzeeeR4TYCEJrJwFnkgr0cU,3836
 crawlo/task_manager.py,sha256=D9m-nqnGj-FZPtGk4CdwZX3Gw7IWyYvTS7CHpRGWc_w,748
 crawlo/commands/__init__.py,sha256=dRu3ipuhDM7M1eTb6zJtQZ_u7N_tZumGfH5_I92xno8,252
+crawlo/commands/check.py,sha256=Q8wFjIo43XW0wP93TTlM7HSShgytJsbSWHIlmkcNxz0,3585
 crawlo/commands/genspider.py,sha256=kSHYsAGHRoxU6Qf_MGpR_VS-Ua5NUGY2KGm_Wapn0sw,3529
-crawlo/commands/
+crawlo/commands/list.py,sha256=itR05muZlZs8FbRh88kOhcRbZc77OXiR6A86UnVhSMY,2974
+crawlo/commands/run.py,sha256=s6JJC8HNa-tBgPDB2BPUmj26D7PMckhlx4AOEz57ESY,6197
 crawlo/commands/startproject.py,sha256=1KOq_CALy01oklr0dAUYhGFzu4f7w45q2H0O3qafLX4,3494
+crawlo/commands/stats.py,sha256=rH0TlD0o-xUr9RxtvNYgnSjHHoRyma3rvx9Q9nIGDNg,1659
 crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
 crawlo/core/engine.py,sha256=JFHooPp-5cfHSyxEh87nOOR5NMaPLVDfNSqAsbtx4PM,6030
 crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
@@ -26,8 +29,8 @@ crawlo/extension/logging_extension.py,sha256=rty2_up53KV05nCazuBuz2ZapHKq0ti7mGV
 crawlo/filters/__init__.py,sha256=9fJQRVkxWWPChajYbAGe1O6UYB639xWt0hiLUGBs4hQ,1014
 crawlo/filters/aioredis_filter.py,sha256=phBFW9Z28oylbik9Kb2WHM65Wo5yRAH2w9Yz0_2HaOQ,5621
 crawlo/filters/memory_filter.py,sha256=L8XEJkObOxs4BzYpQvk9PVM969k2LE61VFsnEOTEf_E,6841
-crawlo/items/__init__.py,sha256=
-crawlo/items/base.py,sha256=
+crawlo/items/__init__.py,sha256=rFpx1qFBo0Ik7bSdnXC8EVTJUOQdoJYGVdhYjaH00nk,409
+crawlo/items/base.py,sha256=hwGJEdFWOdaZfalFX8umRkh_HUWLEbCjvq4j70fplMQ,598
 crawlo/items/fields.py,sha256=fpS0vlRPpZYjTaMDgI9Q8z_YQqruwf6fi4Dgm6R2oEk,1854
 crawlo/items/items.py,sha256=OmVEvMmgofMU95GkaiWkfNQ2fjsH2fY9sw3SKcmUhLs,3478
 crawlo/middleware/__init__.py,sha256=PSwpRLdBUopaQzBp1S0zK_TZbrRagQ4yzvgyLy4tBk8,570
@@ -51,7 +54,7 @@ crawlo/pipelines/pipeline_manager.py,sha256=k-Rg0os0Havrov99D-Jn3ROpnz154K30tf7a
 crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
 crawlo/settings/default_settings.py,sha256=urj4XJ--ZpVRbbo3fWUT71bYQLmElx43AC9KeHtqHBs,7310
 crawlo/settings/setting_manager.py,sha256=4xXOzKwZCgAp8ybwvVcs2R--CsOD7c6dBIkj6DJHB3c,2998
-crawlo/spider/__init__.py,sha256=
+crawlo/spider/__init__.py,sha256=IyQd4ufbAIhA_cvWrsNReRv3tj76CHc5Aef9c8KR-9s,3983
 crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
 crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
 crawlo/templates/project/items.py.tmpl,sha256=bXx-oCldMr2EgBKUAH9LH5gMnbyLiWX-EySAaMzcu2g,318
@@ -67,7 +70,7 @@ crawlo/utils/db_helper.py,sha256=ZqOt1d3mErVv4TOvoWlov0niUxORB9aHByTmMoNFIDw,109
 crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
 crawlo/utils/log.py,sha256=YD2FfXuuE2MC9ZdQQZ0H7KysE7l_LHZqQepaTPlcApo,4133
 crawlo/utils/pqueue.py,sha256=HDgX4HAkc7RqYUtX6q51tzI1ZRTACf8P_4jLqC4-uC0,5559
-crawlo/utils/project.py,sha256=
+crawlo/utils/project.py,sha256=hXSKV55OBUFjJi7TXekB4X3MmAgsqAeVTj5wPUWOizc,7394
 crawlo/utils/request.py,sha256=ejdKpTwc-HE04HQybafhOVywzz57IV3pY0YMkSLyGUo,9065
 crawlo/utils/spider_loader.py,sha256=V0CBTicJBYBZafhwLfDEfuEc_hJ2mSoiptT6qKufI9U,2249
 crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
@@ -87,8 +90,8 @@ tests/test_proxy_middleware_integration.py,sha256=mTPK_XvbmLCV_QoVZzA3ybWOOX6149
 tests/test_proxy_providers.py,sha256=u_R2fhab90vqvQEaOAztpAOe9tJXvUMIdoDxmStmXJ4,1749
 tests/test_proxy_stats.py,sha256=ES00CEoDITYPFBGPk8pecFzD3ItYIv6NSpcqNd8-kvo,526
 tests/test_proxy_strategies.py,sha256=9Z1pXmTNyw-eIhGXlf2abZbJx6igLohYq-_3hldQ5uE,1868
-crawlo-1.0.
-crawlo-1.0.
-crawlo-1.0.
-crawlo-1.0.
-crawlo-1.0.
+crawlo-1.0.8.dist-info/METADATA,sha256=ia-nA0g0Rl76iHFIlvaRbvUnjd88KEKoxIrJKcjtCyw,1825
+crawlo-1.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+crawlo-1.0.8.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+crawlo-1.0.8.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+crawlo-1.0.8.dist-info/RECORD,,
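Each RECORD line above follows the standard wheel format `path,sha256=<urlsafe-base64 digest without padding>,<size in bytes>`, so the new hashes can be checked against an extracted 1.0.8 wheel. A small verification sketch (the file path is wherever you unpacked the wheel):

    # Recompute a RECORD-style sha256 for a file extracted from the wheel and
    # compare it with the value listed in the diff above.
    import base64
    import hashlib
    from pathlib import Path

    def record_hash(path: str) -> str:
        digest = hashlib.sha256(Path(path).read_bytes()).digest()
        # RECORD entries use urlsafe base64 with the trailing '=' padding stripped
        return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # e.g. record_hash("crawlo/utils/project.py") should print
    # hXSKV55OBUFjJi7TXekB4X3MmAgsqAeVTj5wPUWOizc for the 1.0.8 file.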
{crawlo-1.0.6.dist-info → crawlo-1.0.8.dist-info}/WHEEL
File without changes
{crawlo-1.0.6.dist-info → crawlo-1.0.8.dist-info}/entry_points.txt
File without changes
{crawlo-1.0.6.dist-info → crawlo-1.0.8.dist-info}/top_level.txt
File without changes