crawlo 1.0.5__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crawlo might be problematic.

Files changed (39)
  1. crawlo/__version__.py +1 -1
  2. crawlo/cli.py +41 -0
  3. crawlo/commands/__init__.py +10 -0
  4. crawlo/commands/genspider.py +111 -0
  5. crawlo/commands/run.py +149 -0
  6. crawlo/commands/startproject.py +101 -0
  7. crawlo/crawler.py +1 -206
  8. crawlo/exceptions.py +5 -0
  9. crawlo/items/__init__.py +18 -58
  10. crawlo/items/base.py +31 -0
  11. crawlo/items/fields.py +54 -0
  12. crawlo/items/items.py +10 -20
  13. crawlo/settings/default_settings.py +1 -1
  14. crawlo/templates/crawlo.cfg.tmpl +11 -0
  15. crawlo/templates/project/__init__.py.tmpl +4 -0
  16. crawlo/templates/project/items.py.tmpl +18 -0
  17. crawlo/templates/project/middlewares.py.tmpl +76 -0
  18. crawlo/templates/project/pipelines.py.tmpl +64 -0
  19. crawlo/templates/project/settings.py.tmpl +54 -0
  20. crawlo/templates/project/spiders/__init__.py.tmpl +6 -0
  21. crawlo/templates/spider/spider.py.tmpl +32 -0
  22. crawlo/utils/project.py +159 -19
  23. crawlo/utils/spider_loader.py +63 -0
  24. {crawlo-1.0.5.dist-info → crawlo-1.0.6.dist-info}/METADATA +1 -1
  25. {crawlo-1.0.5.dist-info → crawlo-1.0.6.dist-info}/RECORD +32 -22
  26. crawlo-1.0.6.dist-info/entry_points.txt +2 -0
  27. examples/gxb/items.py +1 -1
  28. examples/gxb/run.py +2 -1
  29. examples/gxb/settings.py +2 -1
  30. examples/gxb/spider/{telecom_device_licenses.py → telecom_device.py} +1 -1
  31. crawlo/templates/item_template.tmpl +0 -22
  32. crawlo/templates/project_template/items/__init__.py +0 -0
  33. crawlo/templates/project_template/main.py +0 -33
  34. crawlo/templates/project_template/setting.py +0 -190
  35. crawlo/templates/project_template/spiders/__init__.py +0 -0
  36. crawlo/templates/spider_template.tmpl +0 -31
  37. crawlo-1.0.5.dist-info/entry_points.txt +0 -2
  38. {crawlo-1.0.5.dist-info → crawlo-1.0.6.dist-info}/WHEEL +0 -0
  39. {crawlo-1.0.5.dist-info → crawlo-1.0.6.dist-info}/top_level.txt +0 -0
crawlo/utils/project.py CHANGED
@@ -1,39 +1,171 @@
 #!/usr/bin/python
-# -*- coding:UTF-8 -*-
+# -*- coding: UTF-8 -*-
+"""
+Auto-discover the project and create a SettingManager instance.
+
+This module is responsible for:
+1. Searching upward for the project root (via crawlo.cfg or settings.py)
+2. Adding the project root to the Python path (sys.path)
+3. Loading the specified settings module
+4. Returning a fully configured SettingManager instance
+"""
 import os
 import sys
+import configparser
 from importlib import import_module
 from inspect import iscoroutinefunction
-from typing import Callable
+from typing import Callable, Optional
 
+from crawlo.utils.log import get_logger
 from crawlo.settings.setting_manager import SettingManager
 
 
-def _get_closest(path='.'):
-    path = os.path.abspath(path)
-    return path
+logger = get_logger(__name__)
 
 
-def _init_env():
-    closest = _get_closest()
-    if closest:
-        sys.path.append(closest)
-        # project_dir = os.path.dirname(closest)
-        # sys.path.append(project_dir)
+def _find_project_root(start_path: str = '.') -> Optional[str]:
+    """
+    Starting from start_path, walk up through the parent directories to locate the project root.
+    Search criteria:
+    1. Prefer a directory containing a 'crawlo.cfg' file.
+    2. If no cfg file is found, look for a 'settings.py' that sits inside a Python package (a directory containing __init__.py).
 
+    Args:
+        start_path (str): Starting path for the search; defaults to the current working directory '.'.
 
-def get_settings(settings='settings'):
-    _settings = SettingManager()
-    _init_env()
-    _settings.set_settings(settings)
-    return _settings
+    Returns:
+        Optional[str]: Absolute path of the project root, or None if it cannot be found.
+    """
+    path = os.path.abspath(start_path)
+    logger.info(f"Searching upward for the project root, starting at: {path}")
 
+    while True:
+        # 1. Check for a crawlo.cfg file
+        cfg_file = os.path.join(path, 'crawlo.cfg')
+        if os.path.isfile(cfg_file):
+            logger.info(f"Found 'crawlo.cfg' at {path}; treating it as the project root.")
+            return path
 
-def merge_settings(spider, settings):
-    if hasattr(spider, 'custom_settings'):
-        custom_settings = getattr(spider, 'custom_settings')
+        # 2. Check for a settings.py that sits inside a Python package
+        settings_file = os.path.join(path, 'settings.py')
+        if os.path.isfile(settings_file):
+            init_file = os.path.join(path, '__init__.py')
+            if os.path.isfile(init_file):
+                logger.info(f"Found 'settings.py' at {path}; treating it as the project root.")
+                return path
+            else:
+                logger.debug(f"Found 'settings.py' at {path} but no '__init__.py'; ignoring.")
+
+        # Move up one directory
+        parent = os.path.dirname(path)
+        if parent == path:
+            # Reached the filesystem root
+            break
+        path = parent
+
+    logger.warning("Upward search finished without finding a project root.")
+    return None
+
+
+def _get_settings_module_from_cfg(cfg_path: str) -> str:
+    """
+    Read the settings module path from a crawlo.cfg configuration file.
+
+    Args:
+        cfg_path (str): Full path to the crawlo.cfg file.
+
+    Returns:
+        str: Import path of the settings module, e.g. 'myproject.settings'.
+
+    Raises:
+        RuntimeError: If reading the file or parsing the configuration fails.
+    """
+    logger.info(f"Reading configuration file: {cfg_path}")
+    config = configparser.ConfigParser()
+    try:
+        config.read(cfg_path, encoding='utf-8')
+        if config.has_section('settings') and config.has_option('settings', 'default'):
+            module_path = config.get('settings', 'default')
+            logger.info(f"Read settings module path from 'crawlo.cfg': {module_path}")
+            return module_path
+        else:
+            error_msg = f"Configuration file '{cfg_path}' is missing the '[settings]' section or the 'default' option."
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+    except (configparser.Error, OSError) as e:
+        error_msg = f"Error reading or parsing configuration file '{cfg_path}': {e}"
+        logger.error(error_msg)
+        raise RuntimeError(error_msg)
+
+
+def get_settings(custom_settings=None):
+    """
+    Main entry point for obtaining a settings manager.
+    Discovers the project automatically, loads its configuration, and returns a configured SettingManager.
+
+    Args:
+        custom_settings (dict, optional): Runtime settings that override same-named values from settings.py.
+
+    Returns:
+        SettingManager: A SettingManager instance with all configuration loaded.
+
+    Raises:
+        RuntimeError: If the project or its configuration file cannot be found.
+        ImportError: If the specified settings module cannot be imported.
+    """
+    logger.info("Initializing the settings manager...")
+
+    # 1. Discover the project root
+    project_root = _find_project_root()
+    if not project_root:
+        error_msg = "No Crawlo project found. Make sure you are running inside a project directory that contains 'crawlo.cfg' or 'settings.py'."
+        logger.error(error_msg)
+        raise RuntimeError(error_msg)
+
+    logger.info(f"Project root determined: {project_root}")
+
+    # 2. Determine the import path of the settings module
+    settings_module_path = None
+
+    # Prefer reading it from crawlo.cfg
+    cfg_file = os.path.join(project_root, 'crawlo.cfg')
+    if os.path.isfile(cfg_file):
+        settings_module_path = _get_settings_module_from_cfg(cfg_file)
+    else:
+        logger.info("'crawlo.cfg' not found; trying to infer the settings module path...")
+        # Inference: <project directory name>.settings
+        project_name = os.path.basename(project_root)
+        settings_module_path = f"{project_name}.settings"
+        logger.info(f"Inferred settings module path: {settings_module_path}")
+
+    # 3. Add the project root to the Python path so the import can succeed
+    if project_root not in sys.path:
+        sys.path.insert(0, project_root)
+        logger.info(f"Added project root '{project_root}' to the Python path.")
+    else:
+        logger.debug(f"Project root '{project_root}' is already on the Python path.")
+
+    # 4. Create the SettingManager and load the configuration
+    logger.info(f"Loading settings module: {settings_module_path}")
+    settings = SettingManager()
+
+    try:
+        # This triggers SettingManager.set_settings(), which loads all upper-case constants from the module
+        settings.set_settings(settings_module_path)
+        logger.info("Settings module loaded successfully.")
+    except Exception as e:
+        error_msg = f"Failed to load settings module '{settings_module_path}': {e}"
+        logger.error(error_msg)
+        raise ImportError(error_msg)
+
+    # 5. Apply runtime custom settings
+    if custom_settings:
+        logger.info(f"Applying runtime custom settings: {custom_settings}")
         settings.update_attributes(custom_settings)
+        logger.info("Runtime custom settings applied.")
 
+    logger.info("Settings manager initialization complete.")
+    return settings
 
 def load_class(_path):
     if not isinstance(_path, str):
@@ -51,6 +183,14 @@ def load_class(_path):
         raise NameError(f"Module {module_name!r} has no class named {class_name!r}")
     return cls
 
+def merge_settings(spider, settings):
+    spider_name = getattr(spider, 'name', 'UnknownSpider')
+    if hasattr(spider, 'custom_settings'):
+        custom_settings = getattr(spider, 'custom_settings')
+        settings.update_attributes(custom_settings)
+    else:
+        logger.debug(f"Spider '{spider_name}' has no custom_settings; skipping merge")  # logging only
+
 
 async def common_call(func: Callable, *args, **kwargs):
     if iscoroutinefunction(func):
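Taken together, the new get_settings() flow expects a layout like the one sketched below. This is a hedged usage sketch, not shipped code: the myproject name and the CONCURRENCY override key are illustrative, not taken from the diff.

# Layout _find_project_root() looks for (names hypothetical):
#
#   myproject/
#   ├── crawlo.cfg            # contains: [settings]
#   │                         #           default = myproject.settings
#   └── myproject/
#       ├── __init__.py
#       └── settings.py       # upper-case constants
#
# Run from anywhere inside that tree:
from crawlo.utils.project import get_settings

settings = get_settings(custom_settings={'CONCURRENCY': 8})  # override key is hypothetical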
crawlo/utils/spider_loader.py ADDED
@@ -0,0 +1,63 @@
+import importlib
+import inspect
+from pathlib import Path
+from typing import List, Type, Optional, Dict
+
+from crawlo.spider import Spider
+from crawlo.utils.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class SpiderLoader:
+    """Spider loader, responsible for discovering and loading spiders."""
+
+    def __init__(self, project_package: str):
+        self.project_package = project_package
+        self._spiders: Dict[str, Type[Spider]] = {}
+        self._load_spiders()
+
+    def _load_spiders(self):
+        """Load all spiders."""
+        spiders_dir = Path.cwd() / self.project_package / 'spiders'
+        if not spiders_dir.exists():
+            logger.warning(f"Spiders directory not found: {spiders_dir}")
+            return
+
+        for py_file in spiders_dir.glob("*.py"):
+            if py_file.name.startswith('_'):
+                continue
+
+            module_name = py_file.stem
+            spider_module_path = f"{self.project_package}.spiders.{module_name}"
+
+            try:
+                module = importlib.import_module(spider_module_path)
+            except ImportError as e:
+                logger.debug(f"Skip module {module_name}: {e}")
+                continue
+
+            # Find all Spider subclasses
+            for attr_name in dir(module):
+                attr_value = getattr(module, attr_name)
+                if (isinstance(attr_value, type) and
+                        issubclass(attr_value, Spider) and
+                        attr_value != Spider and
+                        hasattr(attr_value, 'name')):
+
+                    spider_name = getattr(attr_value, 'name')
+                    if spider_name in self._spiders:
+                        logger.warning(f"Duplicate spider name '{spider_name}' found")
+                    self._spiders[spider_name] = attr_value
+
+    def load(self, spider_name: str) -> Optional[Type[Spider]]:
+        """Load a spider class by name."""
+        return self._spiders.get(spider_name)
+
+    def list(self) -> List[str]:
+        """List the names of all available spiders."""
+        return list(self._spiders.keys())
+
+    def get_all(self) -> Dict[str, Type[Spider]]:
+        """Return all spiders."""
+        return self._spiders.copy()
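A hedged sketch of driving the new SpiderLoader. Note that it resolves <package>/spiders/ relative to Path.cwd(), so it must run from the project root; the gxb package name is borrowed from the examples in this release, while the spiders/ layout is what the loader expects (the shipped examples actually use a spider/ directory).

from crawlo.utils.spider_loader import SpiderLoader

loader = SpiderLoader(project_package='gxb')  # assumes ./gxb/spiders/*.py exists
print(loader.list())                          # e.g. ['telecom_device']
spider_cls = loader.load('telecom_device')    # -> Type[Spider] or None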
{crawlo-1.0.5.dist-info → crawlo-1.0.6.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.0.5
+Version: 1.0.6
 Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder
{crawlo-1.0.5.dist-info → crawlo-1.0.6.dist-info}/RECORD CHANGED
@@ -1,11 +1,16 @@
 crawlo/__init__.py,sha256=xpiIAZbSG3CzneJuDLPCbwfRcvw2wyHYl2kJjaNfNGY,584
-crawlo/__version__.py,sha256=lfZikIZ2prlMV6RkxhMRZj5dAeD0TCswIWS46kSjXw0,23
-crawlo/crawler.py,sha256=2izxy7-0yD8n_FsLSL_NaoFYdqQWIhm0hsoSLKgnPcA,16919
+crawlo/__version__.py,sha256=1HqFYnow__4MUVRI_OMjvzTBzKkReNozOdA96kH53cA,23
+crawlo/cli.py,sha256=hjAJKx9pba375sATvvcy-dtZyBIgXj8fRBq9RFIZHA4,1206
+crawlo/crawler.py,sha256=nfuA_f8QnuIp2F4ZbaJv8Fceo_QPwqV1jYdD_edkMjg,8527
 crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
-crawlo/exceptions.py,sha256=VUFSOS00BPWMcH8EW5MgMDhXlUlaFeEcsgqbS_e8MoU,1119
+crawlo/exceptions.py,sha256=xdyZkvVcLEJ-19sWMHvn9IJsu30-hAY2jJhA2kYIims,1207
 crawlo/stats_collector.py,sha256=v4jC9BAe-23w93hWzbeMCCgQ9VuFPyxw5JV9ItbGH8w,1636
 crawlo/subscriber.py,sha256=udlHeTR0ymGQhCDxVUGwUzeeeR4TYCEJrJwFnkgr0cU,3836
 crawlo/task_manager.py,sha256=D9m-nqnGj-FZPtGk4CdwZX3Gw7IWyYvTS7CHpRGWc_w,748
+crawlo/commands/__init__.py,sha256=dRu3ipuhDM7M1eTb6zJtQZ_u7N_tZumGfH5_I92xno8,252
+crawlo/commands/genspider.py,sha256=kSHYsAGHRoxU6Qf_MGpR_VS-Ua5NUGY2KGm_Wapn0sw,3529
+crawlo/commands/run.py,sha256=Upv8K4sM0c0I1fIwTFK18VDcSHF7xabqfXtQ82fk56g,4628
+crawlo/commands/startproject.py,sha256=1KOq_CALy01oklr0dAUYhGFzu4f7w45q2H0O3qafLX4,3494
 crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
 crawlo/core/engine.py,sha256=JFHooPp-5cfHSyxEh87nOOR5NMaPLVDfNSqAsbtx4PM,6030
 crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
@@ -21,8 +26,10 @@ crawlo/extension/logging_extension.py,sha256=rty2_up53KV05nCazuBuz2ZapHKq0ti7mGV
 crawlo/filters/__init__.py,sha256=9fJQRVkxWWPChajYbAGe1O6UYB639xWt0hiLUGBs4hQ,1014
 crawlo/filters/aioredis_filter.py,sha256=phBFW9Z28oylbik9Kb2WHM65Wo5yRAH2w9Yz0_2HaOQ,5621
 crawlo/filters/memory_filter.py,sha256=L8XEJkObOxs4BzYpQvk9PVM969k2LE61VFsnEOTEf_E,6841
-crawlo/items/__init__.py,sha256=o5BSpS1Byivr-bpdfFgc9GCoGi8ThNuPJiTW7lz85-I,2125
-crawlo/items/items.py,sha256=0jf-CdZFkgDAevYn8PmSgGhf6iYu3bx1sv87hJbFtF4,3891
+crawlo/items/__init__.py,sha256=HLDShSwAQUrgwt9_Ec2SIwzpIDZnNOCg9nSYqqEQdp8,407
+crawlo/items/base.py,sha256=DZG0qENdukJExRtKjqdNkSlzUoWR3ucjyF73LYLANFo,754
+crawlo/items/fields.py,sha256=fpS0vlRPpZYjTaMDgI9Q8z_YQqruwf6fi4Dgm6R2oEk,1854
+crawlo/items/items.py,sha256=OmVEvMmgofMU95GkaiWkfNQ2fjsH2fY9sw3SKcmUhLs,3478
 crawlo/middleware/__init__.py,sha256=PSwpRLdBUopaQzBp1S0zK_TZbrRagQ4yzvgyLy4tBk8,570
 crawlo/middleware/default_header.py,sha256=OVW4vpRPp3Y6qYXtiEYlGqVjCYcbuv1Iecc7zEgwCsI,1099
 crawlo/middleware/download_delay.py,sha256=P2eyAJXwdLdC4yYuLhvKZVa1b5YQvQD0GpsR8aDW8-8,994
@@ -42,15 +49,17 @@ crawlo/pipelines/mysql_batch_pipline.py,sha256=Mj3PReDRw22JhJ5hZxnka4cirKq3kEbOC
 crawlo/pipelines/mysql_pipeline.py,sha256=bsAFqpxrCijzvX-IusxOtvTvQEUCt5uHNTyYMo_pIq4,8056
 crawlo/pipelines/pipeline_manager.py,sha256=k-Rg0os0Havrov99D-Jn3ROpnz154K30tf7aARE5W3k,2174
 crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
-crawlo/settings/default_settings.py,sha256=CH95c2oBmxy6t6bGLUuqSL8vJ3Z5Psicdfpc9W0MG90,7309
+crawlo/settings/default_settings.py,sha256=urj4XJ--ZpVRbbo3fWUT71bYQLmElx43AC9KeHtqHBs,7310
 crawlo/settings/setting_manager.py,sha256=4xXOzKwZCgAp8ybwvVcs2R--CsOD7c6dBIkj6DJHB3c,2998
 crawlo/spider/__init__.py,sha256=lWi0bCR7HLT5bnj7_e9UIgFJjuqoeWtbwADfNkaajug,1139
-crawlo/templates/item_template.tmpl,sha256=0bGFnlwJRqstxMNEj1H_pEICybwoueRhs31QaDPXrS0,372
-crawlo/templates/spider_template.tmpl,sha256=JzphuA87Yl_F1xR9zOIi_ZSazyT8eSNPxYYPMv3Uiko,835
-crawlo/templates/project_template/main.py,sha256=BcCP294ycCPsHi_AMN7OAJtcrLvQdf91meH93PqbQgs,626
-crawlo/templates/project_template/setting.py,sha256=Ce4nMbrdhL1ioRdTcB0vV_vK_50cfnwVqSvt49QsNkA,9395
-crawlo/templates/project_template/items/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlo/templates/project_template/spiders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
+crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
+crawlo/templates/project/items.py.tmpl,sha256=bXx-oCldMr2EgBKUAH9LH5gMnbyLiWX-EySAaMzcu2g,318
+crawlo/templates/project/middlewares.py.tmpl,sha256=VAolmMTC6HBmJT5XvWB0ag6ig9iaDBS32adIQ1zPdw0,2177
+crawlo/templates/project/pipelines.py.tmpl,sha256=xK1Yl7wYxiyUCm07GZvMnCS_cxJ5LF7z1YBBdkLlWys,1880
+crawlo/templates/project/settings.py.tmpl,sha256=985Z-jiU6A31f5s1IVU4PvkC_QGlFlRRfTF6rZ_G4ek,1771
+crawlo/templates/project/spiders/__init__.py.tmpl,sha256=zMbePipgLsctQUEnda4WkHz8rDLUX--rc8ruI6zkpWc,111
+crawlo/templates/spider/spider.py.tmpl,sha256=SkNv1kOwet7ZdxoNXpj-o1iRETB30bcwPP16Uy8lyXg,869
 crawlo/utils/__init__.py,sha256=XCYumI8wJ1jU_Myn_K0LT-LVygPDUCdETCbXM3EWvlo,130
 crawlo/utils/concurrency_manager.py,sha256=o-_cfeUHdlBOM3eAXF857MtekSrRcVTBJ2jWZvY6weQ,5230
 crawlo/utils/date_tools.py,sha256=lcEFP2Z5b-6pUTHczrzCCuqiHP_4_2zamomMGPZrExo,7194
@@ -58,27 +67,28 @@ crawlo/utils/db_helper.py,sha256=ZqOt1d3mErVv4TOvoWlov0niUxORB9aHByTmMoNFIDw,109
 crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
 crawlo/utils/log.py,sha256=YD2FfXuuE2MC9ZdQQZ0H7KysE7l_LHZqQepaTPlcApo,4133
 crawlo/utils/pqueue.py,sha256=HDgX4HAkc7RqYUtX6q51tzI1ZRTACf8P_4jLqC4-uC0,5559
-crawlo/utils/project.py,sha256=FfBaMfxcau4yL59O-DfD7FAii8k6gXWQmQ1YU6aaUCE,1544
+crawlo/utils/project.py,sha256=qAiCmpIxiB7RxCLG-U5lGV6k4UCa21uRdykTfnAF834,7669
 crawlo/utils/request.py,sha256=ejdKpTwc-HE04HQybafhOVywzz57IV3pY0YMkSLyGUo,9065
+crawlo/utils/spider_loader.py,sha256=V0CBTicJBYBZafhwLfDEfuEc_hJ2mSoiptT6qKufI9U,2249
 crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
 crawlo/utils/tools.py,sha256=uy7qw5Z1BIhyEgiHENvtM7WoGCJxlS8EX3PmOA7ouCo,275
 crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
 examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 examples/gxb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-examples/gxb/items.py,sha256=s4uKo3kKlN2DC4Y4muwp_qzG6kdyhzOVLEjXv7Qvi7c,995
-examples/gxb/run.py,sha256=YLtlUB6GEAHLuLaTOt8HSOyAF1ZBdjSAwR9rJ2prUSs,340
-examples/gxb/settings.py,sha256=JqwnEkZ0wZZ1f43I2Ne9yu1LnEBBiH2rVG2iDKZC1Q8,2321
+examples/gxb/items.py,sha256=3-1Lxpi7EqMzheDJoO0MPyHky5nHG_nqQGgKlm8y6mQ,989
+examples/gxb/run.py,sha256=9kJlR8f-tZ3BqP5PW7sCLTw6PAFWo3x4cG5lc-6GWqI,333
+examples/gxb/settings.py,sha256=_nbXj9HV2e0F6liUzK0ueygLcaMM_IUlkuwL6mJqUfc,2345
 examples/gxb/spider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 examples/gxb/spider/miit_spider.py,sha256=tcQnuyUHfu-Re1QbKKSI9DXW3Sp1vyBW8qBzKLf_RC4,6666
-examples/gxb/spider/telecom_device_licenses.py,sha256=t-XFai7e4itfGR4zeTJVJ1ulhfj-92gIgISqqdOwdag,4938
+examples/gxb/spider/telecom_device.py,sha256=58iG6BQtQjjDHOF7-DXH0u5_XnppP5AJTQwaVJVyBEo,4929
 tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
 tests/test_proxy_health_check.py,sha256=_tDlxa_6TdL3M5RLkHF82roXJ8WIuG5hELBp2GADyKQ,1123
 tests/test_proxy_middleware_integration.py,sha256=mTPK_XvbmLCV_QoVZzA3ybWOOX61493Ew78WfTp-bYQ,4441
 tests/test_proxy_providers.py,sha256=u_R2fhab90vqvQEaOAztpAOe9tJXvUMIdoDxmStmXJ4,1749
 tests/test_proxy_stats.py,sha256=ES00CEoDITYPFBGPk8pecFzD3ItYIv6NSpcqNd8-kvo,526
 tests/test_proxy_strategies.py,sha256=9Z1pXmTNyw-eIhGXlf2abZbJx6igLohYq-_3hldQ5uE,1868
-crawlo-1.0.5.dist-info/METADATA,sha256=IC9lzZIPUOEZdBXsSZkkd0CpkFuYChtuNtSasgO-O6M,1825
-crawlo-1.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-crawlo-1.0.5.dist-info/entry_points.txt,sha256=GD9PBhKQN83EaxPYtz7NhcGeZeh3bdr2jWbTixOs-lw,59
-crawlo-1.0.5.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
-crawlo-1.0.5.dist-info/RECORD,,
+crawlo-1.0.6.dist-info/METADATA,sha256=_TDAivxDg2R8omq5gG1kUiODY2tZ3UEp5aH0SwshOjI,1825
+crawlo-1.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+crawlo-1.0.6.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+crawlo-1.0.6.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+crawlo-1.0.6.dist-info/RECORD,,
crawlo-1.0.6.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+[console_scripts]
+crawlo = crawlo.cli:main
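The new entry point wires a crawlo console script to crawlo.cli:main. As a hedged sketch (Python 3.10+, with the wheel installed), the mapping can be resolved programmatically:

from importlib.metadata import entry_points

# Select the console script declared in entry_points.txt above.
(ep,) = entry_points(group='console_scripts', name='crawlo')
main = ep.load()  # imports crawlo.cli and returns its main callable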
examples/gxb/items.py CHANGED
@@ -1,4 +1,4 @@
-from crawlo.items.items import Item, Field
+from crawlo.items import Item, Field
 
 class RadioApprovalItem(Item):
     approval_number = Field()
examples/gxb/run.py CHANGED
@@ -1,7 +1,7 @@
 import asyncio
 
 from crawlo.crawler import CrawlerProcess
-from examples.gxb.spider.telecom_device_licenses import TelecomDeviceLicensesSpider
+from examples.gxb.spider.telecom_device import TelecomDeviceLicensesSpider
 
 async def main():
     process = CrawlerProcess()
@@ -10,6 +10,7 @@ async def main():
     )
 
 
+
 if __name__ == '__main__':
     asyncio.run(main())
     # 132023
examples/gxb/settings.py CHANGED
@@ -1,6 +1,7 @@
 import platform
 
-PROXY_ENABLED = False
+PROXY_ENABLED = True
+PROJECT_PACKAGE = 'gxb'
 
 # API address
 PROXY_API_URL = 'http://123.56.42.142:5000/proxy/getitem/'
examples/gxb/spider/{telecom_device_licenses.py → telecom_device.py} CHANGED
@@ -10,7 +10,7 @@ from examples.gxb.settings import HEADERS, COOKIES
 logger = get_logger(__name__)
 
 class TelecomDeviceLicensesSpider(Spider):
-    name = 'telecom_device_licenses'
+    name = 'telecom_device'
     allowed_domains = ['ythzxfw.miit.gov.cn']
     # Base URL of the API
     base_api_url = 'https://ythzxfw.miit.gov.cn/oldyth/user-center/tbAppSearch/selectResult'
crawlo/templates/item_template.tmpl DELETED
@@ -1,22 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary:
----------
-@author: {USER}
-"""
-
-from crawlo import Item
-
-
-class ${item_name}Item(Item):
-    """
-    This class was generated by feapder
-    command: feapder create -i ${command}
-    """
-
-    __table_name__ = "${table_name}"
-
-    def __init__(self, *args, **kwargs):
-        ${propertys}
crawlo/templates/project_template/items/__init__.py DELETED (empty file, no content to show)
crawlo/templates/project_template/main.py DELETED
@@ -1,33 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: spider entry point
----------
-@author: {USER}
-"""
-
-from crawlo import ArgumentParser
-
-from spiders import *
-
-
-
-def crawl_xxx():
-    """
-    Spider crawler
-    """
-    spider = xxx.XXXSpider(redis_key="xxx:xxx")
-    spider.start()
-
-
-
-if __name__ == "__main__":
-    parser = ArgumentParser(description="xxx spider")
-
-    parser.add_argument(
-        "--crawl_xxx", action="store_true", help="xxx spider", function=crawl_xxx
-    )
-    parser.start()
-
-# main.py is the unified entry point for starting spiders, providing a command-line way to launch multiple spiders