crawlo 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__version__.py +1 -1
- crawlo/cli.py +41 -0
- crawlo/commands/__init__.py +10 -0
- crawlo/commands/genspider.py +111 -0
- crawlo/commands/run.py +149 -0
- crawlo/commands/startproject.py +101 -0
- crawlo/crawler.py +1 -206
- crawlo/exceptions.py +5 -0
- crawlo/items/__init__.py +17 -56
- crawlo/items/base.py +22 -0
- crawlo/items/fields.py +54 -0
- crawlo/items/items.py +10 -20
- crawlo/settings/default_settings.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +11 -0
- crawlo/templates/project/__init__.py.tmpl +4 -0
- crawlo/templates/project/items.py.tmpl +18 -0
- crawlo/templates/project/middlewares.py.tmpl +76 -0
- crawlo/templates/project/pipelines.py.tmpl +64 -0
- crawlo/templates/project/settings.py.tmpl +54 -0
- crawlo/templates/project/spiders/__init__.py.tmpl +6 -0
- crawlo/templates/spider/spider.py.tmpl +32 -0
- crawlo/utils/project.py +159 -19
- crawlo/utils/spider_loader.py +63 -0
- {crawlo-1.0.5.dist-info → crawlo-1.0.7.dist-info}/METADATA +1 -1
- {crawlo-1.0.5.dist-info → crawlo-1.0.7.dist-info}/RECORD +32 -22
- crawlo-1.0.7.dist-info/entry_points.txt +2 -0
- examples/gxb/items.py +1 -1
- examples/gxb/run.py +2 -1
- examples/gxb/settings.py +2 -1
- examples/gxb/spider/{telecom_device_licenses.py → telecom_device.py} +1 -1
- crawlo/templates/item_template.tmpl +0 -22
- crawlo/templates/project_template/items/__init__.py +0 -0
- crawlo/templates/project_template/main.py +0 -33
- crawlo/templates/project_template/setting.py +0 -190
- crawlo/templates/project_template/spiders/__init__.py +0 -0
- crawlo/templates/spider_template.tmpl +0 -31
- crawlo-1.0.5.dist-info/entry_points.txt +0 -2
- {crawlo-1.0.5.dist-info → crawlo-1.0.7.dist-info}/WHEEL +0 -0
- {crawlo-1.0.5.dist-info → crawlo-1.0.7.dist-info}/top_level.txt +0 -0
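The new cli.py, the commands/ package (startproject, genspider, run), and the two-line entry_points.txt indicate that 1.0.7 gains a console entry point for scaffolding and running projects. The sketch below shows one way such a dispatcher could be wired; the command names match the modules in commands/, but the crawlo program name, the argument names, and the dispatch details are assumptions, not confirmed by this diff:

import argparse

def main():
    # Hypothetical front-end for crawlo/commands/{startproject,genspider,run}.py
    parser = argparse.ArgumentParser(prog="crawlo")
    sub = parser.add_subparsers(dest="command", required=True)
    sub.add_parser("startproject").add_argument("project_name")
    sub.add_parser("genspider").add_argument("spider_name")
    sub.add_parser("run").add_argument("spider_name")
    args = parser.parse_args()
    print(f"dispatching {args.command!r} ...")  # the real commands do the work

if __name__ == "__main__":
    main()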
crawlo/utils/project.py
CHANGED
@@ -1,39 +1,171 @@
 #!/usr/bin/python
-# -*- coding:UTF-8 -*-
+# -*- coding: UTF-8 -*-
+"""
+Auto-discover the project and create a SettingManager instance.
+
+This module is responsible for:
+1. Searching upward for the project root directory (via crawlo.cfg or settings.py)
+2. Adding the project root to the Python path (sys.path)
+3. Loading the specified settings module
+4. Returning a fully configured SettingManager instance
+"""
 import os
 import sys
+import configparser
 from importlib import import_module
 from inspect import iscoroutinefunction
-from typing import Callable
+from typing import Callable, Optional
 
+from crawlo.utils.log import get_logger
 from crawlo.settings.setting_manager import SettingManager
 
 
-
-    path = os.path.abspath(path)
-    return path
+logger = get_logger(__name__)
 
 
-def
-
-
-
-
-
+def _find_project_root(start_path: str = '.') -> Optional[str]:
+    """
+    Starting from the given path, walk up through parent directories to locate the project root.
+
+    Search criteria:
+    1. Look for a 'crawlo.cfg' file first.
+    2. If no cfg file is found, look for a 'settings.py' that lives inside a Python package (i.e. a directory containing __init__.py).
 
+    Args:
+        start_path (str): Directory to start searching from; defaults to the current working directory '.'.
 
-
-
-
-
-
+    Returns:
+        Optional[str]: Absolute path of the project root, or None if it cannot be found.
+    """
+    path = os.path.abspath(start_path)
+    logger.info(f"Searching upward for the project root, starting at: {path}")
 
+    while True:
+        # 1. Check whether a crawlo.cfg file exists
+        cfg_file = os.path.join(path, 'crawlo.cfg')
+        if os.path.isfile(cfg_file):
+            logger.info(f"Found 'crawlo.cfg' at {path}; treating it as the project root.")
+            return path
 
-
-
-
+        # 2. Check whether a settings.py exists and sits inside a Python package
+        settings_file = os.path.join(path, 'settings.py')
+        if os.path.isfile(settings_file):
+            init_file = os.path.join(path, '__init__.py')
+            if os.path.isfile(init_file):
+                logger.info(f"Found 'settings.py' at {path}; treating it as the project root.")
+                return path
+            else:
+                logger.debug(f"Found 'settings.py' at {path} but no '__init__.py'; ignoring.")
+
+        # Move up one directory
+        parent = os.path.dirname(path)
+        if parent == path:
+            # Already at the filesystem root
+            break
+        path = parent
+
+    logger.warning("Upward search finished; no project root found.")
+    return None
+
+
+def _get_settings_module_from_cfg(cfg_path: str) -> str:
+    """
+    Read the settings module path from the crawlo.cfg configuration file.
+
+    Args:
+        cfg_path (str): Full path to the crawlo.cfg file.
+
+    Returns:
+        str: Import path of the settings module, e.g. 'myproject.settings'.
+
+    Raises:
+        RuntimeError: If reading the file or parsing the configuration fails.
+    """
+    logger.info(f"Reading configuration file: {cfg_path}")
+    config = configparser.ConfigParser()
+    try:
+        config.read(cfg_path, encoding='utf-8')
+        if config.has_section('settings') and config.has_option('settings', 'default'):
+            module_path = config.get('settings', 'default')
+            logger.info(f"Settings module path read from 'crawlo.cfg': {module_path}")
+            return module_path
+        else:
+            error_msg = f"Configuration file '{cfg_path}' is missing the '[settings]' section or the 'default' option."
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+    except (configparser.Error, OSError) as e:
+        error_msg = f"Error reading or parsing configuration file '{cfg_path}': {e}"
+        logger.error(error_msg)
+        raise RuntimeError(error_msg)
+
+
+def get_settings(custom_settings=None):
+    """
+    Main entry point for obtaining a settings manager instance.
+    Auto-discovers the project, loads its configuration, and returns a configured SettingManager.
+
+    Args:
+        custom_settings (dict, optional): Runtime settings that override same-named values in settings.py.
+
+    Returns:
+        SettingManager: A SettingManager instance with all configuration loaded.
+
+    Raises:
+        RuntimeError: When no project or configuration file can be found.
+        ImportError: When the specified settings module cannot be imported.
+    """
+    logger.info("Initializing the settings manager...")
+
+    # 1. Discover the project root directory
+    project_root = _find_project_root()
+    if not project_root:
+        error_msg = "No Crawlo project found. Make sure you are running inside a project directory containing 'crawlo.cfg' or 'settings.py'."
+        logger.error(error_msg)
+        raise RuntimeError(error_msg)
+
+    logger.info(f"Project root determined: {project_root}")
+
+    # 2. Determine the import path of the settings module
+    settings_module_path = None
+
+    # Prefer reading it from crawlo.cfg
+    cfg_file = os.path.join(project_root, 'crawlo.cfg')
+    if os.path.isfile(cfg_file):
+        settings_module_path = _get_settings_module_from_cfg(cfg_file)
+    else:
+        logger.info("No 'crawlo.cfg' found; trying to infer the settings module path...")
+        # Inference: <project directory name>.settings
+        project_name = os.path.basename(project_root)
+        settings_module_path = f"{project_name}.settings"
+        logger.info(f"Inferred settings module path: {settings_module_path}")
+
+    # 3. Add the project root to the Python path so the import can succeed
+    if project_root not in sys.path:
+        sys.path.insert(0, project_root)
+        logger.info(f"Added project root '{project_root}' to the Python path.")
+    else:
+        logger.debug(f"Project root '{project_root}' is already on the Python path.")
+
+    # 4. Create the SettingManager and load the configuration
+    logger.info(f"Loading settings module: {settings_module_path}")
+    settings = SettingManager()
+
+    try:
+        # This triggers SettingManager.set_settings(), which loads every uppercase constant from the module
+        settings.set_settings(settings_module_path)
+        logger.info("Settings module loaded successfully.")
+    except Exception as e:
+        error_msg = f"Failed to load settings module '{settings_module_path}': {e}"
+        logger.error(error_msg)
+        raise ImportError(error_msg)
+
+    # 5. Apply runtime custom settings
+    if custom_settings:
+        logger.info(f"Applying runtime custom settings: {custom_settings}")
         settings.update_attributes(custom_settings)
+        logger.info("Runtime custom settings applied.")
 
+    logger.info("Settings manager initialization complete.")
+    return settings
 
 def load_class(_path):
     if not isinstance(_path, str):
@@ -51,6 +183,14 @@ def load_class(_path):
         raise NameError(f"Module {module_name!r} has no class named {class_name!r}")
     return cls
 
+def merge_settings(spider, settings):
+    spider_name = getattr(spider, 'name', 'UnknownSpider')
+    if hasattr(spider, 'custom_settings'):
+        custom_settings = getattr(spider, 'custom_settings')
+        settings.update_attributes(custom_settings)
+    else:
+        logger.debug(f"Spider '{spider_name}' has no custom_settings; skipping merge")  # log for visibility
+
 
 async def common_call(func: Callable, *args, **kwargs):
     if iscoroutinefunction(func):
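The rewritten project.py is the core of this release: configuration loading becomes an auto-discovery step. The cfg parsing above implies a crawlo.cfg of this shape (presumably what the new crawlo.cfg.tmpl renders):

[settings]
default = myproject.settings

A minimal usage sketch of the new API, run from inside a project directory; the CONCURRENCY override key is illustrative only, not taken from this diff:

from crawlo.utils.project import get_settings

# Walks upward to find crawlo.cfg (or a packaged settings.py), inserts the
# project root into sys.path, imports the settings module, then applies the
# runtime overrides via SettingManager.update_attributes().
settings = get_settings(custom_settings={"CONCURRENCY": 8})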
crawlo/utils/spider_loader.py
ADDED
@@ -0,0 +1,63 @@
+import importlib
+import inspect
+from pathlib import Path
+from typing import List, Type, Optional, Dict
+
+from crawlo.spider import Spider
+from crawlo.utils.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class SpiderLoader:
+    """Spider loader: discovers and loads spiders."""
+
+    def __init__(self, project_package: str):
+        self.project_package = project_package
+        self._spiders: Dict[str, Type[Spider]] = {}
+        self._load_spiders()
+
+    def _load_spiders(self):
+        """Load all spiders."""
+        spiders_dir = Path.cwd() / self.project_package / 'spiders'
+        if not spiders_dir.exists():
+            logger.warning(f"Spiders directory not found: {spiders_dir}")
+            return
+
+        for py_file in spiders_dir.glob("*.py"):
+            if py_file.name.startswith('_'):
+                continue
+
+            module_name = py_file.stem
+            spider_module_path = f"{self.project_package}.spiders.{module_name}"
+
+            try:
+                module = importlib.import_module(spider_module_path)
+            except ImportError as e:
+                logger.debug(f"Skip module {module_name}: {e}")
+                continue
+
+            # Find every Spider subclass defined in the module
+            for attr_name in dir(module):
+                attr_value = getattr(module, attr_name)
+                if (isinstance(attr_value, type) and
+                        issubclass(attr_value, Spider) and
+                        attr_value != Spider and
+                        hasattr(attr_value, 'name')):
+
+                    spider_name = getattr(attr_value, 'name')
+                    if spider_name in self._spiders:
+                        logger.warning(f"Duplicate spider name '{spider_name}' found")
+                    self._spiders[spider_name] = attr_value
+
+    def load(self, spider_name: str) -> Optional[Type[Spider]]:
+        """Load a spider class by its name."""
+        return self._spiders.get(spider_name)
+
+    def list(self) -> List[str]:
+        """List the names of all available spiders."""
+        return list(self._spiders.keys())
+
+    def get_all(self) -> Dict[str, Type[Spider]]:
+        """Return all spiders."""
+        return self._spiders.copy()
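A short usage sketch for the new loader, assuming a project package named myproject with a myproject/spiders/ directory under the current working directory (which is where _load_spiders() looks):

from crawlo.utils.spider_loader import SpiderLoader

loader = SpiderLoader("myproject")          # scans myproject/spiders/*.py, skipping _private modules
print(loader.list())                        # e.g. ['telecom_device']
spider_cls = loader.load("telecom_device")  # the Spider subclass, or None if the name is unknown

Note that the loader keys spiders by their class-level name attribute and only warns on duplicates rather than raising.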
{crawlo-1.0.5.dist-info → crawlo-1.0.7.dist-info}/RECORD
CHANGED
@@ -1,11 +1,16 @@
 crawlo/__init__.py,sha256=xpiIAZbSG3CzneJuDLPCbwfRcvw2wyHYl2kJjaNfNGY,584
-crawlo/__version__.py,sha256=
-crawlo/
+crawlo/__version__.py,sha256=xYbb67T0bE-s3OjgVTmJzJVvjxFP4YEjxAaBB2nj3zA,23
+crawlo/cli.py,sha256=hjAJKx9pba375sATvvcy-dtZyBIgXj8fRBq9RFIZHA4,1206
+crawlo/crawler.py,sha256=nfuA_f8QnuIp2F4ZbaJv8Fceo_QPwqV1jYdD_edkMjg,8527
 crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
-crawlo/exceptions.py,sha256=
+crawlo/exceptions.py,sha256=xdyZkvVcLEJ-19sWMHvn9IJsu30-hAY2jJhA2kYIims,1207
 crawlo/stats_collector.py,sha256=v4jC9BAe-23w93hWzbeMCCgQ9VuFPyxw5JV9ItbGH8w,1636
 crawlo/subscriber.py,sha256=udlHeTR0ymGQhCDxVUGwUzeeeR4TYCEJrJwFnkgr0cU,3836
 crawlo/task_manager.py,sha256=D9m-nqnGj-FZPtGk4CdwZX3Gw7IWyYvTS7CHpRGWc_w,748
+crawlo/commands/__init__.py,sha256=dRu3ipuhDM7M1eTb6zJtQZ_u7N_tZumGfH5_I92xno8,252
+crawlo/commands/genspider.py,sha256=kSHYsAGHRoxU6Qf_MGpR_VS-Ua5NUGY2KGm_Wapn0sw,3529
+crawlo/commands/run.py,sha256=Upv8K4sM0c0I1fIwTFK18VDcSHF7xabqfXtQ82fk56g,4628
+crawlo/commands/startproject.py,sha256=1KOq_CALy01oklr0dAUYhGFzu4f7w45q2H0O3qafLX4,3494
 crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
 crawlo/core/engine.py,sha256=JFHooPp-5cfHSyxEh87nOOR5NMaPLVDfNSqAsbtx4PM,6030
 crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
@@ -21,8 +26,10 @@ crawlo/extension/logging_extension.py,sha256=rty2_up53KV05nCazuBuz2ZapHKq0ti7mGV
 crawlo/filters/__init__.py,sha256=9fJQRVkxWWPChajYbAGe1O6UYB639xWt0hiLUGBs4hQ,1014
 crawlo/filters/aioredis_filter.py,sha256=phBFW9Z28oylbik9Kb2WHM65Wo5yRAH2w9Yz0_2HaOQ,5621
 crawlo/filters/memory_filter.py,sha256=L8XEJkObOxs4BzYpQvk9PVM969k2LE61VFsnEOTEf_E,6841
-crawlo/items/__init__.py,sha256=
-crawlo/items/
+crawlo/items/__init__.py,sha256=rFpx1qFBo0Ik7bSdnXC8EVTJUOQdoJYGVdhYjaH00nk,409
+crawlo/items/base.py,sha256=hwGJEdFWOdaZfalFX8umRkh_HUWLEbCjvq4j70fplMQ,598
+crawlo/items/fields.py,sha256=fpS0vlRPpZYjTaMDgI9Q8z_YQqruwf6fi4Dgm6R2oEk,1854
+crawlo/items/items.py,sha256=OmVEvMmgofMU95GkaiWkfNQ2fjsH2fY9sw3SKcmUhLs,3478
 crawlo/middleware/__init__.py,sha256=PSwpRLdBUopaQzBp1S0zK_TZbrRagQ4yzvgyLy4tBk8,570
 crawlo/middleware/default_header.py,sha256=OVW4vpRPp3Y6qYXtiEYlGqVjCYcbuv1Iecc7zEgwCsI,1099
 crawlo/middleware/download_delay.py,sha256=P2eyAJXwdLdC4yYuLhvKZVa1b5YQvQD0GpsR8aDW8-8,994
@@ -42,15 +49,17 @@ crawlo/pipelines/mysql_batch_pipline.py,sha256=Mj3PReDRw22JhJ5hZxnka4cirKq3kEbOC
 crawlo/pipelines/mysql_pipeline.py,sha256=bsAFqpxrCijzvX-IusxOtvTvQEUCt5uHNTyYMo_pIq4,8056
 crawlo/pipelines/pipeline_manager.py,sha256=k-Rg0os0Havrov99D-Jn3ROpnz154K30tf7aARE5W3k,2174
 crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
-crawlo/settings/default_settings.py,sha256=
+crawlo/settings/default_settings.py,sha256=urj4XJ--ZpVRbbo3fWUT71bYQLmElx43AC9KeHtqHBs,7310
 crawlo/settings/setting_manager.py,sha256=4xXOzKwZCgAp8ybwvVcs2R--CsOD7c6dBIkj6DJHB3c,2998
 crawlo/spider/__init__.py,sha256=lWi0bCR7HLT5bnj7_e9UIgFJjuqoeWtbwADfNkaajug,1139
-crawlo/templates/
-crawlo/templates/
-crawlo/templates/
-crawlo/templates/
-crawlo/templates/
-crawlo/templates/
+crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
+crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
+crawlo/templates/project/items.py.tmpl,sha256=bXx-oCldMr2EgBKUAH9LH5gMnbyLiWX-EySAaMzcu2g,318
+crawlo/templates/project/middlewares.py.tmpl,sha256=VAolmMTC6HBmJT5XvWB0ag6ig9iaDBS32adIQ1zPdw0,2177
+crawlo/templates/project/pipelines.py.tmpl,sha256=xK1Yl7wYxiyUCm07GZvMnCS_cxJ5LF7z1YBBdkLlWys,1880
+crawlo/templates/project/settings.py.tmpl,sha256=985Z-jiU6A31f5s1IVU4PvkC_QGlFlRRfTF6rZ_G4ek,1771
+crawlo/templates/project/spiders/__init__.py.tmpl,sha256=zMbePipgLsctQUEnda4WkHz8rDLUX--rc8ruI6zkpWc,111
+crawlo/templates/spider/spider.py.tmpl,sha256=SkNv1kOwet7ZdxoNXpj-o1iRETB30bcwPP16Uy8lyXg,869
 crawlo/utils/__init__.py,sha256=XCYumI8wJ1jU_Myn_K0LT-LVygPDUCdETCbXM3EWvlo,130
 crawlo/utils/concurrency_manager.py,sha256=o-_cfeUHdlBOM3eAXF857MtekSrRcVTBJ2jWZvY6weQ,5230
 crawlo/utils/date_tools.py,sha256=lcEFP2Z5b-6pUTHczrzCCuqiHP_4_2zamomMGPZrExo,7194
@@ -58,27 +67,28 @@ crawlo/utils/db_helper.py,sha256=ZqOt1d3mErVv4TOvoWlov0niUxORB9aHByTmMoNFIDw,109
 crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
 crawlo/utils/log.py,sha256=YD2FfXuuE2MC9ZdQQZ0H7KysE7l_LHZqQepaTPlcApo,4133
 crawlo/utils/pqueue.py,sha256=HDgX4HAkc7RqYUtX6q51tzI1ZRTACf8P_4jLqC4-uC0,5559
-crawlo/utils/project.py,sha256=
+crawlo/utils/project.py,sha256=qAiCmpIxiB7RxCLG-U5lGV6k4UCa21uRdykTfnAF834,7669
 crawlo/utils/request.py,sha256=ejdKpTwc-HE04HQybafhOVywzz57IV3pY0YMkSLyGUo,9065
+crawlo/utils/spider_loader.py,sha256=V0CBTicJBYBZafhwLfDEfuEc_hJ2mSoiptT6qKufI9U,2249
 crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
 crawlo/utils/tools.py,sha256=uy7qw5Z1BIhyEgiHENvtM7WoGCJxlS8EX3PmOA7ouCo,275
 crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
 examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 examples/gxb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-examples/gxb/items.py,sha256=
-examples/gxb/run.py,sha256=
-examples/gxb/settings.py,sha256=
+examples/gxb/items.py,sha256=3-1Lxpi7EqMzheDJoO0MPyHky5nHG_nqQGgKlm8y6mQ,989
+examples/gxb/run.py,sha256=9kJlR8f-tZ3BqP5PW7sCLTw6PAFWo3x4cG5lc-6GWqI,333
+examples/gxb/settings.py,sha256=_nbXj9HV2e0F6liUzK0ueygLcaMM_IUlkuwL6mJqUfc,2345
 examples/gxb/spider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 examples/gxb/spider/miit_spider.py,sha256=tcQnuyUHfu-Re1QbKKSI9DXW3Sp1vyBW8qBzKLf_RC4,6666
-examples/gxb/spider/
+examples/gxb/spider/telecom_device.py,sha256=58iG6BQtQjjDHOF7-DXH0u5_XnppP5AJTQwaVJVyBEo,4929
 tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
 tests/test_proxy_health_check.py,sha256=_tDlxa_6TdL3M5RLkHF82roXJ8WIuG5hELBp2GADyKQ,1123
 tests/test_proxy_middleware_integration.py,sha256=mTPK_XvbmLCV_QoVZzA3ybWOOX61493Ew78WfTp-bYQ,4441
 tests/test_proxy_providers.py,sha256=u_R2fhab90vqvQEaOAztpAOe9tJXvUMIdoDxmStmXJ4,1749
 tests/test_proxy_stats.py,sha256=ES00CEoDITYPFBGPk8pecFzD3ItYIv6NSpcqNd8-kvo,526
 tests/test_proxy_strategies.py,sha256=9Z1pXmTNyw-eIhGXlf2abZbJx6igLohYq-_3hldQ5uE,1868
-crawlo-1.0.
-crawlo-1.0.
-crawlo-1.0.
-crawlo-1.0.
-crawlo-1.0.
+crawlo-1.0.7.dist-info/METADATA,sha256=ZjDus84i4MbVr_-7BEStA2OmZHEGcdw-b1MGUosylOI,1825
+crawlo-1.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+crawlo-1.0.7.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+crawlo-1.0.7.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+crawlo-1.0.7.dist-info/RECORD,,
examples/gxb/items.py
CHANGED
examples/gxb/run.py
CHANGED
@@ -1,7 +1,7 @@
 import asyncio
 
 from crawlo.crawler import CrawlerProcess
-from examples.gxb.spider.
+from examples.gxb.spider.telecom_device import TelecomDeviceLicensesSpider
 
 async def main():
     process = CrawlerProcess()
@@ -10,6 +10,7 @@ async def main():
     )
 
 
+
 if __name__ == '__main__':
     asyncio.run(main())
 # 132023
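The hunks above show only the changed import and an added blank line; the viewer elides the unchanged body of main(). A plausible complete run.py consistent with these fragments (the process.crawl(...) call and its signature are assumptions, not shown in this diff):

import asyncio

from crawlo.crawler import CrawlerProcess
from examples.gxb.spider.telecom_device import TelecomDeviceLicensesSpider


async def main():
    process = CrawlerProcess()
    # Assumed API: register and run the spider through the process.
    await process.crawl(TelecomDeviceLicensesSpider)


if __name__ == '__main__':
    asyncio.run(main())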
examples/gxb/settings.py
CHANGED
examples/gxb/spider/{telecom_device_licenses.py → telecom_device.py}
RENAMED
@@ -10,7 +10,7 @@ from examples.gxb.settings import HEADERS, COOKIES
 logger = get_logger(__name__)
 
 class TelecomDeviceLicensesSpider(Spider):
-    name = '
+    name = 'telecom_device'
     allowed_domains = ['ythzxfw.miit.gov.cn']
     # Base URL of the API
     base_api_url = 'https://ythzxfw.miit.gov.cn/oldyth/user-center/tbAppSearch/selectResult'
crawlo/templates/item_template.tmpl
REMOVED
@@ -1,22 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary:
----------
-@author: {USER}
-"""
-
-from crawlo import Item
-
-
-class ${item_name}Item(Item):
-    """
-    This class was generated by feapder
-    command: feapder create -i ${command}
-    """
-
-    __table_name__ = "${table_name}"
-
-    def __init__(self, *args, **kwargs):
-        ${propertys}
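The removed template still advertised feapder (the framework this generator was evidently adapted from) and relied on __table_name__/${propertys} placeholders. Version 1.0.7 replaces it with templates/project/items.py.tmpl plus the new items/base.py and items/fields.py. A hypothetical item under the new layout, assuming items/fields.py exports a Scrapy-style Field descriptor (the Field name and import path are assumptions):

from crawlo import Item
from crawlo.items.fields import Field  # assumed export


class ProductItem(Item):
    # Declarative fields replace the old generated __init__/${propertys} body.
    title = Field()
    price = Field()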
crawlo/templates/project_template/items/__init__.py and crawlo/templates/project_template/spiders/__init__.py
Files without changes
crawlo/templates/project_template/main.py
REMOVED
@@ -1,33 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: spider entry point
----------
-@author: {USER}
-"""
-
-from crawlo import ArgumentParser
-
-from spiders import *
-
-
-
-def crawl_xxx():
-    """
-    Spider crawler
-    """
-    spider = xxx.XXXSpider(redis_key="xxx:xxx")
-    spider.start()
-
-
-
-if __name__ == "__main__":
-    parser = ArgumentParser(description="xxx spider")
-
-    parser.add_argument(
-        "--crawl_xxx", action="store_true", help="xxx spider", function=crawl_xxx
-    )
-    parser.start()
-
-# main.py is the unified entry point for launching spiders; it provides a command-line way to start multiple spiders