crawlo 1.0.7__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__version__.py +1 -1
- crawlo/commands/check.py +107 -0
- crawlo/commands/list.py +92 -0
- crawlo/commands/run.py +109 -77
- crawlo/commands/stats.py +59 -0
- crawlo/crawler.py +340 -66
- crawlo/spider/__init__.py +91 -3
- crawlo/utils/project.py +14 -16
- {crawlo-1.0.7.dist-info → crawlo-1.0.8.dist-info}/METADATA +1 -1
- {crawlo-1.0.7.dist-info → crawlo-1.0.8.dist-info}/RECORD +13 -10
- {crawlo-1.0.7.dist-info → crawlo-1.0.8.dist-info}/WHEEL +0 -0
- {crawlo-1.0.7.dist-info → crawlo-1.0.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.0.7.dist-info → crawlo-1.0.8.dist-info}/top_level.txt +0 -0
crawlo/__version__.py
CHANGED
@@ -1 +1 @@
- __version__ = "1.0.7"
+ __version__ = "1.0.8"
crawlo/commands/check.py
ADDED
@@ -0,0 +1,107 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-08-31 22:35
+ # @Author : crawl-coder
+ # @Desc : 命令行入口:crawlo check, 检查所有爬虫定义是否合规。
+ """
+ import sys
+ import configparser
+
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.utils.project import get_settings
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+
+ def main(args):
+     if args:
+         print("Usage: crawlo check")
+         return 1
+
+     try:
+         project_root = get_settings().get('PROJECT_ROOT')
+         if not project_root:
+             print("❌ Error: Cannot determine project root.")
+             return 1
+
+         if str(project_root) not in sys.path:
+             sys.path.insert(0, str(project_root))
+
+         cfg_file = project_root / 'crawlo.cfg'
+         if not cfg_file.exists():
+             print(f"❌ Error: crawlo.cfg not found in {project_root}")
+             return 1
+
+         config = configparser.ConfigParser()
+         config.read(cfg_file, encoding='utf-8')
+
+         if not config.has_section('settings') or not config.has_option('settings', 'default'):
+             print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+             return 1
+
+         settings_module = config.get('settings', 'default')
+         project_package = settings_module.split('.')[0]
+
+         # 创建 CrawlerProcess 并发现爬虫
+         process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
+         spider_names = process.get_spider_names()
+
+         if not spider_names:
+             print("📭 No spiders found.")
+             return 1
+
+         print(f"🔍 Checking {len(spider_names)} spider(s)...")
+         print("-" * 60)
+
+         issues_found = False
+         for name in sorted(spider_names):
+             cls = process.get_spider_class(name)
+             issues = []
+
+             if not hasattr(cls, 'name') or not cls.name:
+                 issues.append("missing or empty 'name' attribute")
+             elif not isinstance(cls.name, str):
+                 issues.append("'name' is not a string")
+
+             if not callable(getattr(cls, 'start_requests', None)):
+                 issues.append("missing or non-callable 'start_requests' method")
+
+             if hasattr(cls, 'start_urls') and isinstance(cls.start_urls, str):
+                 issues.append("'start_urls' is a string, should be list/tuple")
+
+             # 实例化检查(轻量)
+             try:
+                 spider = cls.create_instance(None)
+                 if not callable(getattr(spider, 'parse', None)):
+                     issues.append("no 'parse' method defined (optional but recommended)")
+             except Exception as e:
+                 issues.append(f"failed to create instance: {e}")
+
+             if issues:
+                 print(f"❌ {name:<20} {cls.__name__}")
+                 for issue in issues:
+                     print(f" • {issue}")
+                 issues_found = True
+             else:
+                 print(f"✅ {name:<20} {cls.__name__} (OK)")
+
+         print("-" * 60)
+         if issues_found:
+             print("⚠️ Some spiders have issues. Please fix them.")
+             return 1
+         else:
+             print("🎉 All spiders are compliant!")
+             return 0
+
+     except Exception as e:
+         print(f"❌ Error during check: {e}")
+         import traceback
+         traceback.print_exc()
+         return 1
+
+
+ if __name__ == '__main__':
+     sys.exit(main(sys.argv[1:]))
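For orientation, `crawlo check` passes a spider class when it has a non-empty string `name`, a callable `start_requests`, a `start_urls` that is not a bare string, and (ideally) a `parse` method. A minimal sketch of a spider that would satisfy these checks; the package, module, and class names here are hypothetical and not part of this release:

    # myproject/spiders/example.py (hypothetical)
    from crawlo.spider import Spider

    class ExampleSpider(Spider):
        name = "example"                      # non-empty str: passes the name checks
        start_urls = ["https://example.com"]  # list/tuple, not a plain string

        def parse(self, response):
            # optional, but `crawlo check` flags its absence as a recommendation
            yield {"url": response.url}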
crawlo/commands/list.py
ADDED
@@ -0,0 +1,92 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-08-31 22:33
+ # @Author : crawl-coder
+ # @Desc : 命令行入口:crawlo list,用于列出所有已注册的爬虫
+ """
+ import sys
+ import configparser
+
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.utils.project import get_settings
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+
+ def main(args):
+     """
+     列出所有可用爬虫
+     用法: crawlo list
+     """
+     if args:
+         print("Usage: crawlo list")
+         return 1
+
+     try:
+         # 1. 获取项目根目录
+         project_root = get_settings().get('PROJECT_ROOT')
+         if not project_root:
+             print("❌ Error: Cannot determine project root.")
+             return 1
+
+         # 将项目根目录加入 sys.path
+         project_root_str = str(project_root)
+         if project_root_str not in sys.path:
+             sys.path.insert(0, project_root_str)
+
+         # 2. 读取 crawlo.cfg 获取项目包名
+         cfg_file = project_root / 'crawlo.cfg'
+         if not cfg_file.exists():
+             print(f"❌ Error: crawlo.cfg not found in {project_root}")
+             return 1
+
+         config = configparser.ConfigParser()
+         config.read(cfg_file, encoding='utf-8')
+
+         if not config.has_section('settings') or not config.has_option('settings', 'default'):
+             print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+             return 1
+
+         settings_module = config.get('settings', 'default')
+         project_package = settings_module.split('.')[0]
+
+         # 3. 创建 CrawlerProcess 并自动发现爬虫
+         spider_modules = [f"{project_package}.spiders"]
+         process = CrawlerProcess(spider_modules=spider_modules)
+
+         # 4. 获取所有爬虫信息
+         spider_names = process.get_spider_names()
+         if not spider_names:
+             print("📭 No spiders found.")
+             print("💡 Make sure:")
+             print(" - Your spider classes inherit from `Spider`")
+             print(" - They define a `name` attribute")
+             print(" - The modules are imported (e.g. via __init__.py)")
+             return 1
+
+         # 5. 输出爬虫列表
+         print(f"📋 Found {len(spider_names)} spider(s):")
+         print("-" * 50)
+         for name in sorted(spider_names):
+             cls = process.get_spider_class(name)
+             module = cls.__module__.replace(project_package + ".", "")  # 简化模块名
+             print(f"🕷️ {name:<20} {cls.__name__:<25} ({module})")
+         print("-" * 50)
+         return 0
+
+     except Exception as e:
+         print(f"❌ Error listing spiders: {e}")
+         import traceback
+         traceback.print_exc()
+         return 1
+
+
+ if __name__ == '__main__':
+     """
+     允许直接运行:
+         python -m crawlo.commands.list
+     """
+     sys.exit(main(sys.argv[1:]))
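Both new commands resolve the project package the same way: they read the `default` option from the `[settings]` section of `crawlo.cfg` and keep the first dotted component. A small sketch of that lookup, assuming a hypothetical project named `myproject`:

    # crawlo.cfg (hypothetical) contains:
    #   [settings]
    #   default = myproject.settings
    import configparser

    config = configparser.ConfigParser()
    config.read("crawlo.cfg", encoding="utf-8")
    settings_module = config.get("settings", "default")   # "myproject.settings"
    project_package = settings_module.split(".")[0]        # "myproject"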
crawlo/commands/run.py
CHANGED
@@ -1,14 +1,15 @@
-
+ """
+ 命令行入口:crawlo run <spider_name>
+ 用于运行指定名称的爬虫。
+ """
+
  import asyncio
- import importlib
- import sys
  from pathlib import Path
  import configparser

  from crawlo.crawler import CrawlerProcess
  from crawlo.utils.project import get_settings
  from crawlo.utils.log import get_logger
- from crawlo.utils.spider_loader import SpiderLoader

  logger = get_logger(__name__)

@@ -16,24 +17,30 @@ logger = get_logger(__name__)
  def main(args):
      """
      运行指定爬虫的主函数
-     用法:
+     用法:
+         crawlo run <spider_name>
+         crawlo run all
      """
      if len(args) < 1:
-         print("Usage: crawlo run <spider_name
-         print("
+         print("Usage: crawlo run <spider_name>|all")
+         print("Examples:")
+         print(" crawlo run baidu")
+         print(" crawlo run all")
          return 1

-
+     spider_arg = args[0]

      try:
          # 1. 获取项目根目录
-         project_root = get_settings()
+         project_root = get_settings().get('PROJECT_ROOT')
+         if not project_root:
+             print("❌ Error: Cannot determine project root.")
+             return 1

-         # 将项目根目录添加到 Python 路径
          if str(project_root) not in sys.path:
              sys.path.insert(0, str(project_root))

-         # 2.
+         # 2. 读取 crawlo.cfg 获取项目包名
          cfg_file = project_root / 'crawlo.cfg'
          if not cfg_file.exists():
              print(f"❌ Error: crawlo.cfg not found in {project_root}")
@@ -49,27 +56,60 @@ def main(args):
          settings_module = config.get('settings', 'default')
          project_package = settings_module.split('.')[0]

-         # 3.
-
-
+         # 3. 创建 CrawlerProcess 并自动发现爬虫模块
+         spider_modules = [f"{project_package}.spiders"]
+         settings = get_settings()
+         process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
+
+         # === 新增:支持 'all' ===
+         if spider_arg.lower() == "all":
+             spider_names = process.get_spider_names()
+             if not spider_names:
+                 print("❌ No spiders found. Make sure spiders are defined and imported.")
+                 return 1
+
+             print(f"🚀 Starting ALL {len(spider_names)} spiders:")
+             for name in sorted(spider_names):
+                 cls = process.get_spider_class(name)
+                 print(f" 🕷️ {name} ({cls.__name__})")
+             print("-" * 50)
+
+             # 启动所有爬虫
+             asyncio.run(process.crawl(spider_names))
+             return 0
+
+         # === 原有:启动单个爬虫 ===
+         spider_name = spider_arg
+         if not process.is_spider_registered(spider_name):
+             print(f"❌ Error: Spider with name '{spider_name}' not found.")
+             available_names = process.get_spider_names()
+             if available_names:
+                 print("💡 Available spiders:")
+                 for name in sorted(available_names):
+                     cls = process.get_spider_class(name)
+                     print(f" - {name} (class: {cls.__name__})")
+             else:
+                 print("💡 No spiders found. Make sure your spider classes are defined and imported.")
              return 1

-
-         settings = get_settings()
-         process = CrawlerProcess(settings)
+         spider_class = process.get_spider_class(spider_name)

-
+         # 打印启动信息
+         print(f"🚀 Starting spider: {spider_name}")
          print(f"📁 Project: {project_package}")
          print(f"🕷️ Class: {spider_class.__name__}")
          print("-" * 50)

-         #
-         asyncio.run(process.crawl(
+         # 启动爬虫
+         asyncio.run(process.crawl(spider_name))

          print("-" * 50)
          print("✅ Spider completed successfully!")
          return 0

+     except KeyboardInterrupt:
+         print("\n⚠️ Spider interrupted by user.")
+         return 1
      except Exception as e:
          print(f"❌ Error running spider: {e}")
          import traceback
@@ -77,73 +117,65 @@ def main(args):
          return 1


- def find_spider_by_name(project_package: str, target_spider_name: str):
-     """使用 SpiderLoader 查找爬虫"""
-     loader = SpiderLoader(project_package)
-     spider_class = loader.load(target_spider_name)
-
-     if spider_class is None:
-         print(f"❌ Error: Spider with name '{target_spider_name}' not found")
-         print("💡 Available spiders:")
-         available_spiders = loader.list()
-         for spider_name in available_spiders:
-             print(f" - {spider_name}")
-         return None
-
-     return spider_class
-
-
  def list_available_spiders(project_package: str):
      """
-
+     列出指定项目包中所有可用的爬虫(用于调试或命令行扩展)
      """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-             attr_value != Spider and
-             hasattr(attr_value, 'name')):
-         print(f" - {attr_value.name} (class: {attr_value.__name__}, module: {module_name})")
-         spider_count += 1
-
-     if spider_count == 0:
-         print(" No spiders found")
-
-
- def run_spider_by_name(spider_name: str, project_root: Path = None):
+     try:
+         # 临时创建一个 CrawlerProcess 来发现爬虫
+         process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
+         available_names = process.get_spider_names()
+
+         if not available_names:
+             print(" No spiders found. Make sure:")
+             print(" - spiders/ 目录存在")
+             print(" - 爬虫类继承 Spider 且定义了 name")
+             print(" - 模块被导入(可通过 __init__.py 触发)")
+             return
+
+         print(f"Found {len(available_names)} spider(s):")
+         for name in sorted(available_names):
+             cls = process.get_spider_class(name)
+             module = cls.__module__.replace(project_package + ".", "")
+             print(f" - {name} ({cls.__name__} @ {module})")
+     except Exception as e:
+         print(f"❌ Failed to list spiders: {e}")
+         import traceback
+         traceback.print_exc()
+
+
+ def run_spider_by_name(spider_name: str, project_package: str = None):
      """
-
+     在代码中直接运行某个爬虫(需提供 project_package)
      """
-     if
-
-
+     if project_package is None:
+         # 尝试从配置读取
+         cfg_file = Path('crawlo.cfg')
+         if cfg_file.exists():
+             config = configparser.ConfigParser()
+             config.read(cfg_file, encoding='utf-8')
+             if config.has_option('settings', 'default'):
+                 project_package = config.get('settings', 'default').split('.')[0]
+
+     if not project_package:
+         print("❌ Error: project_package is required.")
+         return 1
+
+     # 添加项目路径
+     project_root = get_settings().get('PROJECT_ROOT')
+     if project_root and str(project_root) not in sys.path:
+         sys.path.insert(0, str(project_root))

+     # 复用 main 函数逻辑
      args = [spider_name]
      return main(args)


  if __name__ == '__main__':
-
+     """
+     允许直接运行:
+         python -m crawlo.commands.run <spider_name>
+     """
      import sys

-     sys.exit(main(sys.argv[1:]))
+     sys.exit(main(sys.argv[1:]))
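Besides the CLI (`crawlo run <spider_name>` or `crawlo run all`), the rewritten module keeps a programmatic entry point, `run_spider_by_name`, which falls back to reading `crawlo.cfg` when no package is given. A hedged sketch of that call; the spider and package names are hypothetical and this assumes it is invoked from the project root:

    from crawlo.commands.run import run_spider_by_name

    # roughly equivalent to `crawlo run baidu` (hypothetical spider name)
    exit_code = run_spider_by_name("baidu", project_package="myproject")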
crawlo/commands/stats.py
ADDED
@@ -0,0 +1,59 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-08-31 22:36
+ # @Author : crawl-coder
+ # @Desc : 命令行入口:crawlo stats,查看最近运行的爬虫统计信息。
+ """
+ import sys
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+ # 保存最近运行的爬虫的统计(示例)
+ _LAST_RUN_STATS = {}
+
+
+ def record_stats(crawler):
+     """在爬虫关闭后记录统计(需在 close 中调用)"""
+     if crawler.stats and crawler.spider:
+         _LAST_RUN_STATS[crawler.spider.name] = crawler.stats.get_stats()
+
+
+ def main(args):
+     if len(args) == 0:
+         # 显示所有历史统计
+         if not _LAST_RUN_STATS:
+             print("📊 No stats available. Run a spider first.")
+             return 0
+
+         print("📊 Recent Spider Statistics:")
+         print("-" * 60)
+         for spider_name, stats in _LAST_RUN_STATS.items():
+             print(f"🕷️ {spider_name}")
+             for k, v in stats.items():
+                 print(f" {k:<30} {v}")
+             print()
+         return 0
+
+     elif len(args) == 1:
+         spider_name = args[0]
+         if spider_name not in _LAST_RUN_STATS:
+             print(f"📊 No stats found for spider '{spider_name}'")
+             return 1
+
+         stats = _LAST_RUN_STATS[spider_name]
+         print(f"📊 Stats for '{spider_name}':")
+         print("-" * 60)
+         for k, v in stats.items():
+             print(f" {k:<30} {v}")
+         return 0
+
+     else:
+         print("Usage: crawlo stats [spider_name]")
+         return 1
+
+
+ if __name__ == '__main__':
+     sys.exit(main(sys.argv[1:]))
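Note that `_LAST_RUN_STATS` is a module-level dict, so `crawlo stats` only sees spiders recorded in the same process via `record_stats(crawler)` (which `Crawler.close()` now calls, see crawler.py below). A minimal in-process sketch with stand-in objects; every name below is hypothetical:

    from crawlo.commands import stats

    class _FakeStats:
        def get_stats(self):
            return {"request_count": 10, "item_count": 7}

    class _FakeSpider:
        name = "example"

    class _FakeCrawler:
        stats = _FakeStats()
        spider = _FakeSpider()

    stats.record_stats(_FakeCrawler())  # what Crawler.close() does after this change
    stats.main([])                      # print stats for every recorded spider
    stats.main(["example"])             # print stats for one spider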
crawlo/crawler.py
CHANGED
@@ -1,19 +1,18 @@
  #!/usr/bin/python
  # -*- coding: UTF-8 -*-
+ from __future__ import annotations
  import asyncio
  import signal
- from typing import Type, Optional, Set, List
-
- from crawlo.spider import Spider
- from crawlo.core.engine import Engine
- from crawlo.utils.log import get_logger
- from crawlo.subscriber import Subscriber
- from crawlo.extension import ExtensionManager
- from crawlo.exceptions import SpiderTypeError
- from crawlo.stats_collector import StatsCollector
- from crawlo.event import spider_opened, spider_closed
- from crawlo.settings.setting_manager import SettingManager
- from crawlo.utils.project import merge_settings, get_settings
+ from typing import Type, Optional, Set, List, Union, Dict
+ from .spider import Spider, get_global_spider_registry
+ from .core.engine import Engine
+ from .utils.log import get_logger
+ from .subscriber import Subscriber
+ from .extension import ExtensionManager
+ from .stats_collector import StatsCollector
+ from .event import spider_opened, spider_closed
+ from .settings.setting_manager import SettingManager
+ from .utils.project import merge_settings, get_settings


  logger = get_logger(__name__)
@@ -30,7 +29,7 @@ class Crawler:
          self.subscriber: Optional[Subscriber] = None
          self.extension: Optional[ExtensionManager] = None
          self.settings: SettingManager = settings.copy()
-         self._closed = False  # 新增状态
+         self._closed = False
          self._close_lock = asyncio.Lock()

      async def crawl(self):
@@ -40,7 +39,6 @@ class Crawler:
          self.engine = self._create_engine()
          self.stats = self._create_stats()
          self.extension = self._create_extension()
-
          await self.engine.start_spider(self.spider)

      @staticmethod
@@ -50,7 +48,6 @@ class Crawler:
      def _create_spider(self) -> Spider:
          spider = self.spider_cls.create_instance(self)

-         # --- 关键属性检查 ---
          if not getattr(spider, 'name', None):
              raise AttributeError(f"爬虫类 '{self.spider_cls.__name__}' 必须定义 'name' 属性。")

@@ -92,19 +89,36 @@ class Crawler:
              await self.subscriber.notify(spider_closed)
              if self.stats and self.spider:
                  self.stats.close_spider(spider=self.spider, reason=reason)
+                 from crawlo.commands.stats import record_stats
+                 record_stats(self)


  class CrawlerProcess:
      """
-     爬虫进程管理器,支持多爬虫并发调度、信号量控制、实时日志与优雅关闭
+     爬虫进程管理器,支持:
+     - 自动发现爬虫模块
+     - 通过 name 或类启动爬虫
+     - 并发控制
+     - 优雅关闭
      """

-     def __init__(self, settings: Optional[SettingManager] = None, max_concurrency: Optional[int] = None):
+     def __init__(
+             self,
+             settings: Optional[SettingManager] = None,
+             max_concurrency: Optional[int] = None,
+             spider_modules: Optional[List[str]] = None
+     ):
          self.settings: SettingManager = settings or self._get_default_settings()
          self.crawlers: Set[Crawler] = set()
          self._active_tasks: Set[asyncio.Task] = set()

-         # 使用专用配置,降级使用 CONCURRENCY
+         # 自动发现并导入爬虫模块
+         if spider_modules:
+             self.auto_discover(spider_modules)
+
+         # 使用全局注册表的快照(避免后续导入影响)
+         self._spider_registry: Dict[str, Type[Spider]] = get_global_spider_registry()
+
          self.max_concurrency: int = (
              max_concurrency
              or self.settings.get('MAX_RUNNING_SPIDERS')
@@ -117,80 +131,120 @@ class CrawlerProcess:
          signal.signal(signal.SIGTERM, self._shutdown)
          logger.info(f"CrawlerProcess 初始化完成,最大并行爬虫数: {self.max_concurrency}")

-     async def crawl(self, spiders):
-         """
-         启动一个或多个爬虫,流式调度,支持实时进度反馈
-         """
-         spider_classes = self._normalize_spiders(spiders)
-         total = len(spider_classes)
+     def auto_discover(self, modules: List[str]):
+         """自动导入模块,触发 Spider 类定义和注册"""
+         import importlib
+         import pkgutil
+         for module_name in modules:
+             try:
+                 module = importlib.import_module(module_name)
+                 if hasattr(module, '__path__'):
+                     for _, name, _ in pkgutil.walk_packages(module.__path__, module.__name__ + "."):
+                         importlib.import_module(name)
+                 else:
+                     importlib.import_module(module_name)
+                 logger.debug(f"已扫描模块: {module_name}")
+             except Exception as e:
+                 logger.error(f"扫描模块 {module_name} 失败: {e}", exc_info=True)
+
+     # === 公共只读接口:避免直接访问 _spider_registry ===
+
+     def get_spider_names(self) -> List[str]:
+         """获取所有已注册的爬虫名称"""
+         return list(self._spider_registry.keys())
+
+     def get_spider_class(self, name: str) -> Optional[Type[Spider]]:
+         """根据 name 获取爬虫类"""
+         return self._spider_registry.get(name)
+
+     def is_spider_registered(self, name: str) -> bool:
+         """检查某个 name 是否已注册"""
+         return name in self._spider_registry
+
+     async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
+         """启动一个或多个爬虫"""
+         spider_classes_to_run = self._resolve_spiders_to_run(spiders)
+         total = len(spider_classes_to_run)

          if total == 0:
-             raise ValueError("至少需要提供一个爬虫类")
-
-         # 按名称排序
-         spider_classes.sort(key=lambda cls: cls.__name__.lower())
+             raise ValueError("至少需要提供一个爬虫类或名称")

+         # 按类名排序,保证启动顺序可预测
+         spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())
          logger.info(f"启动 {total} 个爬虫.")

-         # 流式启动所有爬虫任务
+         # 流式启动
          tasks = [
              asyncio.create_task(self._run_spider_with_limit(spider_cls, index + 1, total))
-             for index, spider_cls in enumerate(spider_classes)
+             for index, spider_cls in enumerate(spider_classes_to_run)
          ]

-         # 等待所有任务完成(失败不中断)
+         # 等待完成(失败不中断)
          results = await asyncio.gather(*tasks, return_exceptions=True)
-
-         # 统计异常
          failed = [i for i, r in enumerate(results) if isinstance(r, Exception)]
          if failed:
-             logger.error(f"共 {len(failed)} 个爬虫执行异常: {[spider_classes[i].__name__ for i in failed]}")
-
-     @staticmethod
-     def _normalize_spiders(spiders) -> List[Type[Spider]]:
-         """标准化输入为爬虫类列表"""
-         if isinstance(spiders, type) and issubclass(spiders, Spider):
-             return [spiders]
-         elif isinstance(spiders, (list, tuple)):
-             return list(spiders)
+             logger.error(f"共 {len(failed)} 个爬虫执行异常: {[spider_classes_to_run[i].__name__ for i in failed]}")
+
+     def _resolve_spiders_to_run(
+             self,
+             spiders_input: Union[Type[Spider], str, List[Union[Type[Spider], str]]]
+     ) -> List[Type[Spider]]:
+         """解析输入为爬虫类列表"""
+         inputs = self._normalize_inputs(spiders_input)
+         seen_spider_names: Set[str] = set()
+         spider_classes: List[Type[Spider]] = []
+
+         for item in inputs:
+             spider_cls = self._resolve_spider_class(item)
+             spider_name = spider_cls.name
+
+             if spider_name in seen_spider_names:
+                 raise ValueError(f"本次运行中爬虫名称 '{spider_name}' 重复。")
+
+             seen_spider_names.add(spider_name)
+             spider_classes.append(spider_cls)
+
+         return spider_classes
+
+     def _normalize_inputs(self, spiders_input) -> List[Union[Type[Spider], str]]:
+         """标准化输入为列表"""
+         if isinstance(spiders_input, (type, str)):
+             return [spiders_input]
+         elif isinstance(spiders_input, (list, tuple)):
+             return list(spiders_input)
          else:
-             raise TypeError("spiders 必须是爬虫类或爬虫类列表/元组")
+             raise TypeError("spiders 必须是爬虫类、name 字符串,或它们的列表/元组")
+
+     def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
+         """解析单个输入项为爬虫类"""
+         if isinstance(item, type) and issubclass(item, Spider):
+             return item
+         elif isinstance(item, str):
+             spider_cls = self._spider_registry.get(item)
+             if not spider_cls:
+                 raise ValueError(f"未找到名为 '{item}' 的爬虫。")
+             return spider_cls
+         else:
+             raise TypeError(f"无效类型 {type(item)}。必须是 Spider 类或字符串 name。")

      async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
-         """
-         受信号量限制的爬虫运行函数,带进度日志
-         """
+         """受信号量限制的爬虫运行函数"""
          task = asyncio.current_task()
          self._active_tasks.add(task)
-
          try:
-             # 获取并发许可
              await self.semaphore.acquire()
-
-             start_msg = f"[{seq}/{total}] 启动爬虫: {spider_cls.__name__}"
-             logger.info(start_msg)
-
-             # 创建并运行爬虫
-             crawler = self._create_crawler(spider_cls)
+             logger.info(f"[{seq}/{total}] 启动爬虫: {spider_cls.__name__}")
+             crawler = Crawler(spider_cls, self.settings)
              self.crawlers.add(crawler)
              await crawler.crawl()
-
-             end_msg = f"[{seq}/{total}] 爬虫完成: {spider_cls.__name__}"
-             logger.info(end_msg)
-
+             logger.info(f"[{seq}/{total}] 爬虫完成: {spider_cls.__name__}")
          except Exception as e:
              logger.error(f"爬虫 {spider_cls.__name__} 执行失败: {e}", exc_info=True)
              raise
          finally:
              if task in self._active_tasks:
                  self._active_tasks.remove(task)
-             self.semaphore.release()
-
-     def _create_crawler(self, spider_cls: Type[Spider]) -> Crawler:
-         """创建爬虫实例"""
-         if isinstance(spider_cls, str):
-             raise SpiderTypeError(f"不支持字符串形式的爬虫: {spider_cls}")
-         return Crawler(spider_cls, self.settings)
+             self.semaphore.release()

      def _shutdown(self, _signum, _frame):
          """优雅关闭信号处理"""
@@ -216,4 +270,224 @@ class CrawlerProcess:
              return get_settings()
          except Exception as e:
              logger.warning(f"无法加载默认配置: {e}")
-             return SettingManager()
+             return SettingManager()
+
+ # #!/usr/bin/python
+ # # -*- coding: UTF-8 -*-
+ # import asyncio
+ # import signal
+ # from typing import Type, Optional, Set, List
+ #
+ # from crawlo.spider import Spider
+ # from crawlo.core.engine import Engine
+ # from crawlo.utils.log import get_logger
+ # from crawlo.subscriber import Subscriber
+ # from crawlo.extension import ExtensionManager
+ # from crawlo.exceptions import SpiderTypeError
+ # from crawlo.stats_collector import StatsCollector
+ # from crawlo.event import spider_opened, spider_closed
+ # from crawlo.settings.setting_manager import SettingManager
+ # from crawlo.utils.project import merge_settings, get_settings
+ #
+ #
+ # logger = get_logger(__name__)
+ #
+ #
+ # class Crawler:
+ #     """单个爬虫运行实例,绑定 Spider 与引擎"""
+ #
+ #     def __init__(self, spider_cls: Type[Spider], settings: SettingManager):
+ #         self.spider_cls = spider_cls
+ #         self.spider: Optional[Spider] = None
+ #         self.engine: Optional[Engine] = None
+ #         self.stats: Optional[StatsCollector] = None
+ #         self.subscriber: Optional[Subscriber] = None
+ #         self.extension: Optional[ExtensionManager] = None
+ #         self.settings: SettingManager = settings.copy()
+ #         self._closed = False  # 新增状态
+ #         self._close_lock = asyncio.Lock()
+ #
+ #     async def crawl(self):
+ #         """启动爬虫核心流程"""
+ #         self.subscriber = self._create_subscriber()
+ #         self.spider = self._create_spider()
+ #         self.engine = self._create_engine()
+ #         self.stats = self._create_stats()
+ #         self.extension = self._create_extension()
+ #
+ #         await self.engine.start_spider(self.spider)
+ #
+ #     @staticmethod
+ #     def _create_subscriber() -> Subscriber:
+ #         return Subscriber()
+ #
+ #     def _create_spider(self) -> Spider:
+ #         spider = self.spider_cls.create_instance(self)
+ #
+ #         # --- 关键属性检查 ---
+ #         if not getattr(spider, 'name', None):
+ #             raise AttributeError(f"爬虫类 '{self.spider_cls.__name__}' 必须定义 'name' 属性。")
+ #
+ #         if not callable(getattr(spider, 'start_requests', None)):
+ #             raise AttributeError(f"爬虫 '{spider.name}' 必须实现可调用的 'start_requests' 方法。")
+ #
+ #         start_urls = getattr(spider, 'start_urls', [])
+ #         if isinstance(start_urls, str):
+ #             raise TypeError(f"爬虫 '{spider.name}' 的 'start_urls' 必须是列表或元组,不能是字符串。")
+ #
+ #         if not callable(getattr(spider, 'parse', None)):
+ #             logger.warning(
+ #                 f"爬虫 '{spider.name}' 未定义 'parse' 方法。请确保所有 Request 都指定了回调函数,否则响应将被忽略。")
+ #
+ #         self._set_spider(spider)
+ #         return spider
+ #
+ #     def _create_engine(self) -> Engine:
+ #         engine = Engine(self)
+ #         engine.engine_start()
+ #         return engine
+ #
+ #     def _create_stats(self) -> StatsCollector:
+ #         return StatsCollector(self)
+ #
+ #     def _create_extension(self) -> ExtensionManager:
+ #         return ExtensionManager.create_instance(self)
+ #
+ #     def _set_spider(self, spider: Spider):
+ #         self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
+ #         self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
+ #         merge_settings(spider, self.settings)
+ #
+ #     async def close(self, reason='finished') -> None:
+ #         async with self._close_lock:
+ #             if self._closed:
+ #                 return
+ #             self._closed = True
+ #             await self.subscriber.notify(spider_closed)
+ #             if self.stats and self.spider:
+ #                 self.stats.close_spider(spider=self.spider, reason=reason)
+ #
+ #
+ # class CrawlerProcess:
+ #     """
+ #     爬虫进程管理器,支持多爬虫并发调度、信号量控制、实时日志与优雅关闭
+ #     """
+ #
+ #     def __init__(self, settings: Optional[SettingManager] = None, max_concurrency: Optional[int] = None):
+ #         self.settings: SettingManager = settings or self._get_default_settings()
+ #         self.crawlers: Set[Crawler] = set()
+ #         self._active_tasks: Set[asyncio.Task] = set()
+ #
+ #         # 使用专用配置,降级使用 CONCURRENCY
+ #         self.max_concurrency: int = (
+ #             max_concurrency
+ #             or self.settings.get('MAX_RUNNING_SPIDERS')
+ #             or self.settings.get('CONCURRENCY', 3)
+ #         )
+ #         self.semaphore = asyncio.Semaphore(self.max_concurrency)
+ #
+ #         # 注册信号量
+ #         signal.signal(signal.SIGINT, self._shutdown)
+ #         signal.signal(signal.SIGTERM, self._shutdown)
+ #         logger.info(f"CrawlerProcess 初始化完成,最大并行爬虫数: {self.max_concurrency}")
+ #
+ #     async def crawl(self, spiders):
+ #         """
+ #         启动一个或多个爬虫,流式调度,支持实时进度反馈
+ #         """
+ #         spider_classes = self._normalize_spiders(spiders)
+ #         total = len(spider_classes)
+ #
+ #         if total == 0:
+ #             raise ValueError("至少需要提供一个爬虫类")
+ #
+ #         # 按名称排序
+ #         spider_classes.sort(key=lambda cls: cls.__name__.lower())
+ #
+ #         logger.info(f"启动 {total} 个爬虫.")
+ #
+ #         # 流式启动所有爬虫任务
+ #         tasks = [
+ #             asyncio.create_task(self._run_spider_with_limit(spider_cls, index + 1, total))
+ #             for index, spider_cls in enumerate(spider_classes)
+ #         ]
+ #
+ #         # 等待所有任务完成(失败不中断)
+ #         results = await asyncio.gather(*tasks, return_exceptions=True)
+ #
+ #         # 统计异常
+ #         failed = [i for i, r in enumerate(results) if isinstance(r, Exception)]
+ #         if failed:
+ #             logger.error(f"共 {len(failed)} 个爬虫执行异常: {[spider_classes[i].__name__ for i in failed]}")
+ #
+ #     @staticmethod
+ #     def _normalize_spiders(spiders) -> List[Type[Spider]]:
+ #         """标准化输入为爬虫类列表"""
+ #         if isinstance(spiders, type) and issubclass(spiders, Spider):
+ #             return [spiders]
+ #         elif isinstance(spiders, (list, tuple)):
+ #             return list(spiders)
+ #         else:
+ #             raise TypeError("spiders 必须是爬虫类或爬虫类列表/元组")
+ #
+ #     async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
+ #         """
+ #         受信号量限制的爬虫运行函数,带进度日志
+ #         """
+ #         task = asyncio.current_task()
+ #         self._active_tasks.add(task)
+ #
+ #         try:
+ #             # 获取并发许可
+ #             await self.semaphore.acquire()
+ #
+ #             start_msg = f"[{seq}/{total}] 启动爬虫: {spider_cls.__name__}"
+ #             logger.info(start_msg)
+ #
+ #             # 创建并运行爬虫
+ #             crawler = self._create_crawler(spider_cls)
+ #             self.crawlers.add(crawler)
+ #             await crawler.crawl()
+ #
+ #             end_msg = f"[{seq}/{total}] 爬虫完成: {spider_cls.__name__}"
+ #             logger.info(end_msg)
+ #
+ #         except Exception as e:
+ #             logger.error(f"爬虫 {spider_cls.__name__} 执行失败: {e}", exc_info=True)
+ #             raise
+ #         finally:
+ #             if task in self._active_tasks:
+ #                 self._active_tasks.remove(task)
+ #             self.semaphore.release()  # 必须释放
+ #
+ #     def _create_crawler(self, spider_cls: Type[Spider]) -> Crawler:
+ #         """创建爬虫实例"""
+ #         if isinstance(spider_cls, str):
+ #             raise SpiderTypeError(f"不支持字符串形式的爬虫: {spider_cls}")
+ #         return Crawler(spider_cls, self.settings)
+ #
+ #     def _shutdown(self, _signum, _frame):
+ #         """优雅关闭信号处理"""
+ #         logger.warning("收到关闭信号,正在停止所有爬虫...")
+ #         for crawler in list(self.crawlers):
+ #             if crawler.engine:
+ #                 crawler.engine.running = False
+ #                 crawler.engine.normal = False
+ #         asyncio.create_task(self._wait_for_shutdown())
+ #
+ #     async def _wait_for_shutdown(self):
+ #         """等待所有活跃任务完成"""
+ #         pending = [t for t in self._active_tasks if not t.done()]
+ #         if pending:
+ #             logger.info(f"等待 {len(pending)} 个活跃任务完成...")
+ #             await asyncio.gather(*pending, return_exceptions=True)
+ #         logger.info("所有爬虫已优雅关闭")
+ #
+ #     @classmethod
+ #     def _get_default_settings(cls) -> SettingManager:
+ #         """加载默认配置"""
+ #         try:
+ #             return get_settings()
+ #         except Exception as e:
+ #             logger.warning(f"无法加载默认配置: {e}")
+ #             return SettingManager()
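Taken together, the new `CrawlerProcess` keyword arguments and read-only helpers let spiders be discovered from a package and started by name. A minimal sketch of that flow, assuming a hypothetical `myproject.spiders` package and spider names:

    import asyncio
    from crawlo.crawler import CrawlerProcess

    # importing the listed modules fills the global spider registry
    process = CrawlerProcess(spider_modules=["myproject.spiders"])

    print(process.get_spider_names())        # e.g. ['baidu', 'news'] (hypothetical)
    if process.is_spider_registered("baidu"):
        asyncio.run(process.crawl("baidu"))  # accepts a name, a class, or a list of either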
crawlo/spider/__init__.py
CHANGED
@@ -1,11 +1,44 @@
  #!/usr/bin/python
  # -*- coding:UTF-8 -*-
+ from __future__ import annotations
+ from typing import Type, Any, Optional, List, Dict
  from ..network.request import Request
  from ..utils.log import get_logger


- class Spider(object):
-     name = None
+ # 全局注册表
+ _DEFAULT_SPIDER_REGISTRY: dict[str, Type[Spider]] = {}
+
+
+ class SpiderMeta(type):
+     def __new__(mcs, name: str, bases: tuple[type], namespace: dict[str, Any], **kwargs):
+         cls = super().__new__(mcs, name, bases, namespace)
+
+         is_spider_subclass = any(
+             base is Spider or (isinstance(base, type) and issubclass(base, Spider))
+             for base in bases
+         )
+         if not is_spider_subclass:
+             return cls
+
+         spider_name = namespace.get('name')
+         if not isinstance(spider_name, str):
+             raise AttributeError(f"爬虫类 '{cls.__name__}' 必须定义字符串类型的 'name' 属性。")
+
+         if spider_name in _DEFAULT_SPIDER_REGISTRY:
+             raise ValueError(
+                 f"爬虫名称 '{spider_name}' 已被 {_DEFAULT_SPIDER_REGISTRY[spider_name].__name__} 占用。"
+                 f"请确保每个爬虫的 name 属性全局唯一。"
+             )
+
+         _DEFAULT_SPIDER_REGISTRY[spider_name] = cls
+         get_logger(__name__).debug(f"自动注册爬虫: {spider_name} -> {cls.__name__}")
+
+         return cls
+
+
+ class Spider(metaclass=SpiderMeta):
+     name: str = None

      def __init__(self, name=None, **kwargs):
          if not hasattr(self, 'start_urls'):
@@ -15,7 +48,7 @@ class Spider(object):
          self.logger = get_logger(self.name or self.__class__.__name__)

      @classmethod
-     def create_instance(cls, crawler):
+     def create_instance(cls, crawler) -> Spider:
          o = cls()
          o.crawler = crawler
          return o
@@ -39,3 +72,58 @@ class Spider(object):

      def __str__(self):
          return self.__class__.__name__
+
+
+ # === 公共只读接口 ===
+ def get_global_spider_registry() -> dict[str, Type[Spider]]:
+     return _DEFAULT_SPIDER_REGISTRY.copy()
+
+
+ def get_spider_by_name(name: str) -> Optional[Type[Spider]]:
+     return _DEFAULT_SPIDER_REGISTRY.get(name)
+
+
+ def get_all_spider_classes() -> list[Type[Spider]]:
+     return list(set(_DEFAULT_SPIDER_REGISTRY.values()))
+
+ # #!/usr/bin/python
+ # # -*- coding:UTF-8 -*-
+ # from ..network.request import Request
+ # from ..utils.log import get_logger
+ #
+ #
+ # class Spider(object):
+ #     name = None
+ #
+ #     def __init__(self, name=None, **kwargs):
+ #         if not hasattr(self, 'start_urls'):
+ #             self.start_urls = []
+ #         self.crawler = None
+ #         self.name = name or self.name
+ #         self.logger = get_logger(self.name or self.__class__.__name__)
+ #
+ #     @classmethod
+ #     def create_instance(cls, crawler):
+ #         o = cls()
+ #         o.crawler = crawler
+ #         return o
+ #
+ #     def start_requests(self):
+ #         if self.start_urls:
+ #             for url in self.start_urls:
+ #                 yield Request(url=url, dont_filter=True)
+ #         else:
+ #             if hasattr(self, 'start_url') and isinstance(getattr(self, 'start_url'), str):
+ #                 yield Request(getattr(self, 'start_url'), dont_filter=True)
+ #
+ #     def parse(self, response):
+ #         raise NotImplementedError
+ #
+ #     async def spider_opened(self):
+ #         pass
+ #
+ #     async def spider_closed(self):
+ #         pass
+ #
+ #     def __str__(self):
+ #         return self.__class__.__name__
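The effect of `SpiderMeta` is that defining a subclass is enough to register it; a missing, non-string, or duplicate `name` raises at class-creation time. A short sketch of that behaviour; the class and name below are hypothetical:

    from crawlo.spider import Spider, get_spider_by_name, get_global_spider_registry

    class NewsSpider(Spider):   # SpiderMeta registers the class as it is created
        name = "news"           # must be a unique str, otherwise the metaclass raises

        def parse(self, response):
            yield {"url": response.url}

    assert get_spider_by_name("news") is NewsSpider
    assert "news" in get_global_spider_registry()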
crawlo/utils/project.py
CHANGED
@@ -19,8 +19,7 @@ from typing import Callable, Optional
  from crawlo.utils.log import get_logger
  from crawlo.settings.setting_manager import SettingManager

-
- logger =get_logger(__name__)
+ logger = get_logger(__name__)


  def _find_project_root(start_path: str = '.') -> Optional[str]:
@@ -37,13 +36,11 @@ def _find_project_root(start_path: str = '.') -> Optional[str]:
          Optional[str]: 找到的项目根目录的绝对路径,如果未找到则返回 None。
      """
      path = os.path.abspath(start_path)
-     logger.info(f"开始向上搜索项目根目录,起始路径: {path}")

      while True:
          # 1. 检查是否存在 crawlo.cfg 文件
          cfg_file = os.path.join(path, 'crawlo.cfg')
          if os.path.isfile(cfg_file):
-             logger.info(f"在路径 {path} 找到 'crawlo.cfg' 文件,确定为项目根目录。")
              return path

          # 2. 检查是否存在 settings.py 文件,并且它位于一个 Python 包中
@@ -51,7 +48,6 @@ def _find_project_root(start_path: str = '.') -> Optional[str]:
          if os.path.isfile(settings_file):
              init_file = os.path.join(path, '__init__.py')
              if os.path.isfile(init_file):
-                 logger.info(f"在路径 {path} 找到 'settings.py' 文件,确定为项目根目录。")
                  return path
              else:
                  logger.debug(f"在路径 {path} 找到 'settings.py',但缺少 '__init__.py',忽略。")
@@ -86,7 +82,7 @@ def _get_settings_module_from_cfg(cfg_path: str) -> str:
      config.read(cfg_path, encoding='utf-8')
      if config.has_section('settings') and config.has_option('settings', 'default'):
          module_path = config.get('settings', 'default')
-         logger.info(f"从 'crawlo.cfg' 中读取到 settings 模块路径: {module_path}")
+         logger.debug(f"从 'crawlo.cfg' 中读取到 settings 模块路径: {module_path}")
          return module_path
      else:
          error_msg = f"配置文件 '{cfg_path}' 缺少 '[settings]' 或 'default' 配置项。"
@@ -113,7 +109,7 @@ def get_settings(custom_settings=None):
          RuntimeError: 当无法找到项目或配置文件时。
          ImportError: 当无法导入指定的 settings 模块时。
      """
-     logger.info("正在初始化配置管理器...")
+     logger.debug("正在初始化配置管理器...")

      # 1. 发现项目根目录
      project_root = _find_project_root()
@@ -122,7 +118,7 @@ def get_settings(custom_settings=None):
          logger.error(error_msg)
          raise RuntimeError(error_msg)

-     logger.info(f"项目根目录已确定: {project_root}")
+     logger.debug(f"项目根目录已确定: {project_root}")

      # 2. 确定 settings 模块的导入路径
      settings_module_path = None
@@ -132,27 +128,27 @@ def get_settings(custom_settings=None):
      if os.path.isfile(cfg_file):
          settings_module_path = _get_settings_module_from_cfg(cfg_file)
      else:
-         logger.info("未找到 'crawlo.cfg',尝试推断 settings 模块路径...")
+         logger.debug("未找到 'crawlo.cfg',尝试推断 settings 模块路径...")
          # 推断:项目目录名.settings
          project_name = os.path.basename(project_root)
          settings_module_path = f"{project_name}.settings"
-         logger.info(f"推断 settings 模块路径为: {settings_module_path}")
+         logger.debug(f"推断 settings 模块路径为: {settings_module_path}")

      # 3. 将项目根目录添加到 Python 路径,确保可以成功导入
      if project_root not in sys.path:
          sys.path.insert(0, project_root)
-         logger.info(f"已将项目根目录 '{project_root}' 添加到 Python 路径。")
+         logger.debug(f"已将项目根目录 '{project_root}' 添加到 Python 路径。")
      else:
          logger.debug(f"项目根目录 '{project_root}' 已在 Python 路径中。")

      # 4. 创建 SettingManager 并加载配置
-     logger.info(f"正在加载 settings 模块: {settings_module_path}")
+     logger.debug(f"正在加载 settings 模块: {settings_module_path}")
      settings = SettingManager()

      try:
          # 这会触发 SettingManager.set_settings(),从模块中加载所有大写常量
          settings.set_settings(settings_module_path)
-         logger.info("settings 模块加载成功。")
+         logger.debug("settings 模块加载成功。")
      except Exception as e:
          error_msg = f"加载 settings 模块 '{settings_module_path}' 失败: {e}"
          logger.error(error_msg)
@@ -160,13 +156,14 @@ def get_settings(custom_settings=None):

      # 5. 应用运行时自定义设置
      if custom_settings:
-         logger.info(f"正在应用运行时自定义设置: {custom_settings}")
+         logger.debug(f"正在应用运行时自定义设置: {custom_settings}")
          settings.update_attributes(custom_settings)
          logger.info("运行时自定义设置已应用。")

-     logger.info("配置管理器初始化完成。")
+     logger.debug("配置管理器初始化完成。")
      return settings

+
  def load_class(_path):
      if not isinstance(_path, str):
          if callable(_path):
@@ -183,13 +180,14 @@ def load_class(_path):
              raise NameError(f"Module {module_name!r} has no class named {class_name!r}")
      return cls

+
  def merge_settings(spider, settings):
      spider_name = getattr(spider, 'name', 'UnknownSpider')
      if hasattr(spider, 'custom_settings'):
          custom_settings = getattr(spider, 'custom_settings')
          settings.update_attributes(custom_settings)
      else:
-         logger.debug(f"爬虫 '{spider_name}' 无 custom_settings,跳过合并")
+         logger.debug(f"爬虫 '{spider_name}' 无 custom_settings,跳过合并")  # 添加日志


  async def common_call(func: Callable, *args, **kwargs):
{crawlo-1.0.7.dist-info → crawlo-1.0.8.dist-info}/RECORD
CHANGED

@@ -1,16 +1,19 @@
  crawlo/__init__.py,sha256=xpiIAZbSG3CzneJuDLPCbwfRcvw2wyHYl2kJjaNfNGY,584
- crawlo/__version__.py,sha256=
+ crawlo/__version__.py,sha256=uyL3a6o1xccXPZ2OS65zqIN_lbEMT7PcCxErq7cuWwA,23
  crawlo/cli.py,sha256=hjAJKx9pba375sATvvcy-dtZyBIgXj8fRBq9RFIZHA4,1206
- crawlo/crawler.py,sha256=
+ crawlo/crawler.py,sha256=AyKxUyJvCwb1u4d3Zn3vFmjH28ExWKIygfTICps-3yY,20026
  crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
  crawlo/exceptions.py,sha256=xdyZkvVcLEJ-19sWMHvn9IJsu30-hAY2jJhA2kYIims,1207
  crawlo/stats_collector.py,sha256=v4jC9BAe-23w93hWzbeMCCgQ9VuFPyxw5JV9ItbGH8w,1636
  crawlo/subscriber.py,sha256=udlHeTR0ymGQhCDxVUGwUzeeeR4TYCEJrJwFnkgr0cU,3836
  crawlo/task_manager.py,sha256=D9m-nqnGj-FZPtGk4CdwZX3Gw7IWyYvTS7CHpRGWc_w,748
  crawlo/commands/__init__.py,sha256=dRu3ipuhDM7M1eTb6zJtQZ_u7N_tZumGfH5_I92xno8,252
+ crawlo/commands/check.py,sha256=Q8wFjIo43XW0wP93TTlM7HSShgytJsbSWHIlmkcNxz0,3585
  crawlo/commands/genspider.py,sha256=kSHYsAGHRoxU6Qf_MGpR_VS-Ua5NUGY2KGm_Wapn0sw,3529
- crawlo/commands/
+ crawlo/commands/list.py,sha256=itR05muZlZs8FbRh88kOhcRbZc77OXiR6A86UnVhSMY,2974
+ crawlo/commands/run.py,sha256=s6JJC8HNa-tBgPDB2BPUmj26D7PMckhlx4AOEz57ESY,6197
  crawlo/commands/startproject.py,sha256=1KOq_CALy01oklr0dAUYhGFzu4f7w45q2H0O3qafLX4,3494
+ crawlo/commands/stats.py,sha256=rH0TlD0o-xUr9RxtvNYgnSjHHoRyma3rvx9Q9nIGDNg,1659
  crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
  crawlo/core/engine.py,sha256=JFHooPp-5cfHSyxEh87nOOR5NMaPLVDfNSqAsbtx4PM,6030
  crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
@@ -51,7 +54,7 @@ crawlo/pipelines/pipeline_manager.py,sha256=k-Rg0os0Havrov99D-Jn3ROpnz154K30tf7a
  crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
  crawlo/settings/default_settings.py,sha256=urj4XJ--ZpVRbbo3fWUT71bYQLmElx43AC9KeHtqHBs,7310
  crawlo/settings/setting_manager.py,sha256=4xXOzKwZCgAp8ybwvVcs2R--CsOD7c6dBIkj6DJHB3c,2998
- crawlo/spider/__init__.py,sha256=
+ crawlo/spider/__init__.py,sha256=IyQd4ufbAIhA_cvWrsNReRv3tj76CHc5Aef9c8KR-9s,3983
  crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
  crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
  crawlo/templates/project/items.py.tmpl,sha256=bXx-oCldMr2EgBKUAH9LH5gMnbyLiWX-EySAaMzcu2g,318
@@ -67,7 +70,7 @@ crawlo/utils/db_helper.py,sha256=ZqOt1d3mErVv4TOvoWlov0niUxORB9aHByTmMoNFIDw,109
  crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
  crawlo/utils/log.py,sha256=YD2FfXuuE2MC9ZdQQZ0H7KysE7l_LHZqQepaTPlcApo,4133
  crawlo/utils/pqueue.py,sha256=HDgX4HAkc7RqYUtX6q51tzI1ZRTACf8P_4jLqC4-uC0,5559
- crawlo/utils/project.py,sha256=
+ crawlo/utils/project.py,sha256=hXSKV55OBUFjJi7TXekB4X3MmAgsqAeVTj5wPUWOizc,7394
  crawlo/utils/request.py,sha256=ejdKpTwc-HE04HQybafhOVywzz57IV3pY0YMkSLyGUo,9065
  crawlo/utils/spider_loader.py,sha256=V0CBTicJBYBZafhwLfDEfuEc_hJ2mSoiptT6qKufI9U,2249
  crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
@@ -87,8 +90,8 @@ tests/test_proxy_middleware_integration.py,sha256=mTPK_XvbmLCV_QoVZzA3ybWOOX6149
  tests/test_proxy_providers.py,sha256=u_R2fhab90vqvQEaOAztpAOe9tJXvUMIdoDxmStmXJ4,1749
  tests/test_proxy_stats.py,sha256=ES00CEoDITYPFBGPk8pecFzD3ItYIv6NSpcqNd8-kvo,526
  tests/test_proxy_strategies.py,sha256=9Z1pXmTNyw-eIhGXlf2abZbJx6igLohYq-_3hldQ5uE,1868
- crawlo-1.0.
- crawlo-1.0.
- crawlo-1.0.
- crawlo-1.0.
- crawlo-1.0.
+ crawlo-1.0.8.dist-info/METADATA,sha256=ia-nA0g0Rl76iHFIlvaRbvUnjd88KEKoxIrJKcjtCyw,1825
+ crawlo-1.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ crawlo-1.0.8.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+ crawlo-1.0.8.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+ crawlo-1.0.8.dist-info/RECORD,,

{crawlo-1.0.7.dist-info → crawlo-1.0.8.dist-info}/WHEEL
File without changes

{crawlo-1.0.7.dist-info → crawlo-1.0.8.dist-info}/entry_points.txt
File without changes

{crawlo-1.0.7.dist-info → crawlo-1.0.8.dist-info}/top_level.txt
File without changes