crawlo 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (111)
  1. crawlo/__init__.py +33 -24
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -106
  6. crawlo/commands/genspider.py +125 -110
  7. crawlo/commands/list.py +147 -92
  8. crawlo/commands/run.py +286 -181
  9. crawlo/commands/startproject.py +111 -101
  10. crawlo/commands/stats.py +188 -59
  11. crawlo/core/__init__.py +2 -2
  12. crawlo/core/engine.py +158 -158
  13. crawlo/core/processor.py +40 -40
  14. crawlo/core/scheduler.py +57 -57
  15. crawlo/crawler.py +494 -492
  16. crawlo/downloader/__init__.py +78 -78
  17. crawlo/downloader/aiohttp_downloader.py +199 -199
  18. crawlo/downloader/cffi_downloader.py +242 -277
  19. crawlo/downloader/httpx_downloader.py +246 -246
  20. crawlo/event.py +11 -11
  21. crawlo/exceptions.py +78 -78
  22. crawlo/extension/__init__.py +31 -31
  23. crawlo/extension/log_interval.py +49 -49
  24. crawlo/extension/log_stats.py +44 -44
  25. crawlo/extension/logging_extension.py +34 -34
  26. crawlo/filters/__init__.py +37 -37
  27. crawlo/filters/aioredis_filter.py +150 -150
  28. crawlo/filters/memory_filter.py +202 -202
  29. crawlo/items/__init__.py +23 -23
  30. crawlo/items/base.py +21 -21
  31. crawlo/items/fields.py +53 -53
  32. crawlo/items/items.py +104 -104
  33. crawlo/middleware/__init__.py +21 -21
  34. crawlo/middleware/default_header.py +32 -32
  35. crawlo/middleware/download_delay.py +28 -28
  36. crawlo/middleware/middleware_manager.py +135 -135
  37. crawlo/middleware/proxy.py +245 -245
  38. crawlo/middleware/request_ignore.py +30 -30
  39. crawlo/middleware/response_code.py +18 -18
  40. crawlo/middleware/response_filter.py +26 -26
  41. crawlo/middleware/retry.py +90 -90
  42. crawlo/network/__init__.py +7 -7
  43. crawlo/network/request.py +203 -203
  44. crawlo/network/response.py +166 -166
  45. crawlo/pipelines/__init__.py +13 -13
  46. crawlo/pipelines/console_pipeline.py +39 -39
  47. crawlo/pipelines/mongo_pipeline.py +116 -116
  48. crawlo/pipelines/mysql_batch_pipline.py +272 -272
  49. crawlo/pipelines/mysql_pipeline.py +195 -195
  50. crawlo/pipelines/pipeline_manager.py +56 -56
  51. crawlo/project.py +153 -0
  52. crawlo/settings/__init__.py +7 -7
  53. crawlo/settings/default_settings.py +166 -168
  54. crawlo/settings/setting_manager.py +99 -99
  55. crawlo/spider/__init__.py +129 -129
  56. crawlo/stats_collector.py +59 -59
  57. crawlo/subscriber.py +106 -106
  58. crawlo/task_manager.py +27 -27
  59. crawlo/templates/crawlo.cfg.tmpl +10 -10
  60. crawlo/templates/project/__init__.py.tmpl +3 -3
  61. crawlo/templates/project/items.py.tmpl +17 -17
  62. crawlo/templates/project/middlewares.py.tmpl +75 -75
  63. crawlo/templates/project/pipelines.py.tmpl +63 -63
  64. crawlo/templates/project/settings.py.tmpl +54 -54
  65. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  66. crawlo/templates/spider/spider.py.tmpl +31 -31
  67. crawlo/utils/__init__.py +7 -7
  68. crawlo/utils/date_tools.py +233 -233
  69. crawlo/utils/db_helper.py +343 -343
  70. crawlo/utils/func_tools.py +82 -82
  71. crawlo/utils/log.py +128 -128
  72. crawlo/utils/pqueue.py +173 -173
  73. crawlo/utils/request.py +267 -267
  74. crawlo/utils/spider_loader.py +62 -62
  75. crawlo/utils/system.py +11 -11
  76. crawlo/utils/tools.py +4 -4
  77. crawlo/utils/url.py +39 -39
  78. crawlo-1.1.1.dist-info/METADATA +220 -0
  79. crawlo-1.1.1.dist-info/RECORD +100 -0
  80. examples/__init__.py +7 -0
  81. examples/baidu_spider/__init__.py +7 -0
  82. examples/baidu_spider/demo.py +94 -0
  83. examples/baidu_spider/items.py +46 -0
  84. examples/baidu_spider/middleware.py +49 -0
  85. examples/baidu_spider/pipeline.py +55 -0
  86. examples/baidu_spider/run.py +27 -0
  87. examples/baidu_spider/settings.py +121 -0
  88. examples/baidu_spider/spiders/__init__.py +7 -0
  89. examples/baidu_spider/spiders/bai_du.py +61 -0
  90. examples/baidu_spider/spiders/miit.py +159 -0
  91. examples/baidu_spider/spiders/sina.py +79 -0
  92. tests/__init__.py +7 -7
  93. tests/test_proxy_health_check.py +32 -32
  94. tests/test_proxy_middleware_integration.py +136 -136
  95. tests/test_proxy_providers.py +56 -56
  96. tests/test_proxy_stats.py +19 -19
  97. tests/test_proxy_strategies.py +59 -59
  98. crawlo/utils/concurrency_manager.py +0 -125
  99. crawlo/utils/project.py +0 -197
  100. crawlo-1.0.9.dist-info/METADATA +0 -49
  101. crawlo-1.0.9.dist-info/RECORD +0 -97
  102. examples/gxb/__init__.py +0 -0
  103. examples/gxb/items.py +0 -36
  104. examples/gxb/run.py +0 -16
  105. examples/gxb/settings.py +0 -72
  106. examples/gxb/spider/__init__.py +0 -0
  107. examples/gxb/spider/miit_spider.py +0 -180
  108. examples/gxb/spider/telecom_device.py +0 -129
  109. {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/WHEEL +0 -0
  110. {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/entry_points.txt +0 -0
  111. {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/top_level.txt +0 -0
crawlo/utils/project.py DELETED
@@ -1,197 +0,0 @@
- #!/usr/bin/python
- # -*- coding: UTF-8 -*-
- """
- Automatically discover the project and create a SettingManager instance.
-
- This module is responsible for:
- 1. Searching upward for the project root directory (via crawlo.cfg or settings.py)
- 2. Adding the project root to the Python path (sys.path)
- 3. Loading the specified settings module
- 4. Returning a fully configured SettingManager instance
- """
- import os
- import sys
- import configparser
- from importlib import import_module
- from inspect import iscoroutinefunction
- from typing import Callable, Optional
-
- from crawlo.utils.log import get_logger
- from crawlo.settings.setting_manager import SettingManager
-
- logger = get_logger(__name__)
-
-
- def _find_project_root(start_path: str = '.') -> Optional[str]:
-     """
-     Starting from the given path, search upward through parent directories for the project root.
-     Search criteria:
-     1. Look for a 'crawlo.cfg' file first.
-     2. If no cfg file is found, look for a 'settings.py' file located inside a Python package (i.e. a directory containing __init__.py).
-
-     Args:
-         start_path (str): Starting path for the search; defaults to the current working directory '.'.
-
-     Returns:
-         Optional[str]: Absolute path of the project root if found, otherwise None.
-     """
-     path = os.path.abspath(start_path)
-
-     while True:
-         # 1. Check for a crawlo.cfg file
-         cfg_file = os.path.join(path, 'crawlo.cfg')
-         if os.path.isfile(cfg_file):
-             return path
-
-         # 2. Check for a settings.py file that lives inside a Python package
-         settings_file = os.path.join(path, 'settings.py')
-         if os.path.isfile(settings_file):
-             init_file = os.path.join(path, '__init__.py')
-             if os.path.isfile(init_file):
-                 return path
-             else:
-                 logger.debug(f"Found 'settings.py' at {path}, but '__init__.py' is missing; ignoring.")
-
-         # Move up to the parent directory
-         parent = os.path.dirname(path)
-         if parent == path:
-             # Reached the filesystem root
-             break
-         path = parent
-
-     logger.warning("Upward search finished; no project root found.")
-     return None
-
-
- def _get_settings_module_from_cfg(cfg_path: str) -> str:
-     """
-     Read the settings module path from the crawlo.cfg configuration file.
-
-     Args:
-         cfg_path (str): Full path to the crawlo.cfg file.
-
-     Returns:
-         str: Import path of the settings module, e.g. 'myproject.settings'.
-
-     Raises:
-         RuntimeError: If reading the file or parsing the configuration fails.
-     """
-     logger.info(f"Reading configuration file: {cfg_path}")
-     config = configparser.ConfigParser()
-     try:
-         config.read(cfg_path, encoding='utf-8')
-         if config.has_section('settings') and config.has_option('settings', 'default'):
-             module_path = config.get('settings', 'default')
-             logger.debug(f"Settings module path read from 'crawlo.cfg': {module_path}")
-             return module_path
-         else:
-             error_msg = f"Configuration file '{cfg_path}' is missing the '[settings]' section or the 'default' option."
-             logger.error(error_msg)
-             raise RuntimeError(error_msg)
-     except (configparser.Error, OSError) as e:
-         error_msg = f"Error while reading or parsing configuration file '{cfg_path}': {e}"
-         logger.error(error_msg)
-         raise RuntimeError(error_msg)
-
-
- def get_settings(custom_settings=None):
-     """
-     Main entry point for obtaining a settings manager instance.
-     It discovers the project, loads its configuration, and returns a configured SettingManager.
-
-     Args:
-         custom_settings (dict, optional): Custom settings passed in at runtime; they override same-named values from settings.py.
-
-     Returns:
-         SettingManager: A SettingManager instance with all configuration loaded.
-
-     Raises:
-         RuntimeError: If the project or its configuration file cannot be found.
-         ImportError: If the specified settings module cannot be imported.
-     """
-     logger.debug("Initializing the settings manager...")
-
-     # 1. Discover the project root
-     project_root = _find_project_root()
-     if not project_root:
-         error_msg = "No Crawlo project found. Make sure you run inside a project directory containing 'crawlo.cfg' or 'settings.py'."
-         logger.error(error_msg)
-         raise RuntimeError(error_msg)
-
-     logger.debug(f"Project root determined: {project_root}")
-
-     # 2. Determine the import path of the settings module
-     settings_module_path = None
-
-     # Prefer reading it from crawlo.cfg
-     cfg_file = os.path.join(project_root, 'crawlo.cfg')
-     if os.path.isfile(cfg_file):
-         settings_module_path = _get_settings_module_from_cfg(cfg_file)
-     else:
-         logger.debug("'crawlo.cfg' not found, trying to infer the settings module path...")
-         # Inference: <project directory name>.settings
-         project_name = os.path.basename(project_root)
-         settings_module_path = f"{project_name}.settings"
-         logger.debug(f"Inferred settings module path: {settings_module_path}")
-
-     # 3. Add the project root to the Python path so the import can succeed
-     if project_root not in sys.path:
-         sys.path.insert(0, project_root)
-         logger.debug(f"Added project root '{project_root}' to the Python path.")
-     else:
-         logger.debug(f"Project root '{project_root}' is already on the Python path.")
-
-     # 4. Create the SettingManager and load the configuration
-     logger.debug(f"Loading settings module: {settings_module_path}")
-     settings = SettingManager()
-
-     try:
-         # This triggers SettingManager.set_settings(), which loads all upper-case constants from the module
-         settings.set_settings(settings_module_path)
-         logger.debug("Settings module loaded successfully.")
-     except Exception as e:
-         error_msg = f"Failed to load settings module '{settings_module_path}': {e}"
-         logger.error(error_msg)
-         raise ImportError(error_msg)
-
-     # 5. Apply runtime custom settings
-     if custom_settings:
-         logger.debug(f"Applying runtime custom settings: {custom_settings}")
-         settings.update_attributes(custom_settings)
-         logger.info("Runtime custom settings applied.")
-
-     logger.debug("Settings manager initialization complete.")
-     return settings
-
-
- def load_class(_path):
-     if not isinstance(_path, str):
-         if callable(_path):
-             return _path
-         else:
-             raise TypeError(f"args expect str or object, got {_path}")
-
-     module_name, class_name = _path.rsplit('.', 1)
-     module = import_module(module_name)
-
-     try:
-         cls = getattr(module, class_name)
-     except AttributeError:
-         raise NameError(f"Module {module_name!r} has no class named {class_name!r}")
-     return cls
-
-
- def merge_settings(spider, settings):
-     spider_name = getattr(spider, 'name', 'UnknownSpider')
-     if hasattr(spider, 'custom_settings'):
-         custom_settings = getattr(spider, 'custom_settings')
-         settings.update_attributes(custom_settings)
-     else:
-         logger.debug(f"Spider '{spider_name}' has no custom_settings, skipping merge")  # added logging
-
-
- async def common_call(func: Callable, *args, **kwargs):
-     if iscoroutinefunction(func):
-         return await func(*args, **kwargs)
-     else:
-         return func(*args, **kwargs)
crawlo-1.0.9.dist-info/METADATA DELETED
@@ -1,49 +0,0 @@
- Metadata-Version: 2.4
- Name: crawlo
- Version: 1.0.9
- Summary: Crawlo is a high-performance Python crawler framework built on asynchronous I/O, with support for distributed crawling.
- Home-page: https://github.com/crawl-coder/Crawlo.git
- Author: crawl-coder
- Author-email: crawlo@qq.com
- License: MIT
- Classifier: Programming Language :: Python :: 3
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Requires-Python: >=3.6
- Description-Content-Type: text/markdown
- Requires-Dist: aiohttp>=3.12.14
- Requires-Dist: aiomysql>=0.2.0
- Requires-Dist: aioredis>=2.0.1
- Requires-Dist: asyncmy>=0.2.10
- Requires-Dist: cssselect>=1.2.0
- Requires-Dist: dateparser>=1.2.2
- Requires-Dist: httpx[http2]>=0.27.0
- Requires-Dist: curl-cffi>=0.13.0
- Requires-Dist: lxml>=5.2.1
- Requires-Dist: motor>=3.7.0
- Requires-Dist: parsel>=1.9.1
- Requires-Dist: pydantic>=2.11.7
- Requires-Dist: pymongo>=4.11
- Requires-Dist: PyMySQL>=1.1.1
- Requires-Dist: python-dateutil>=2.9.0.post0
- Requires-Dist: redis>=6.2.0
- Requires-Dist: requests>=2.32.4
- Requires-Dist: six>=1.17.0
- Requires-Dist: ujson>=5.9.0
- Requires-Dist: urllib3>=2.5.0
- Requires-Dist: w3lib>=2.1.2
- Provides-Extra: render
- Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
- Requires-Dist: playwright; extra == "render"
- Requires-Dist: selenium>=3.141.0; extra == "render"
- Provides-Extra: all
- Requires-Dist: bitarray>=1.5.3; extra == "all"
- Requires-Dist: PyExecJS>=1.5.1; extra == "all"
- Requires-Dist: pymongo>=3.10.1; extra == "all"
- Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
- Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
- Requires-Dist: playwright; extra == "all"
- Requires-Dist: selenium>=3.141.0; extra == "all"
-
- # Crawlo
- Crawlo is a high-performance Python crawler framework built on asynchronous I/O, with support for distributed crawling and data pipelines.
crawlo-1.0.9.dist-info/RECORD DELETED
@@ -1,97 +0,0 @@
- crawlo/__init__.py,sha256=xpiIAZbSG3CzneJuDLPCbwfRcvw2wyHYl2kJjaNfNGY,584
- crawlo/__version__.py,sha256=5fdKqtSBBDvdwuTWCGoh62x6-wR269e8DEQnOPkCHWg,23
- crawlo/cli.py,sha256=hjAJKx9pba375sATvvcy-dtZyBIgXj8fRBq9RFIZHA4,1206
- crawlo/crawler.py,sha256=AyKxUyJvCwb1u4d3Zn3vFmjH28ExWKIygfTICps-3yY,20026
- crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
- crawlo/exceptions.py,sha256=xdyZkvVcLEJ-19sWMHvn9IJsu30-hAY2jJhA2kYIims,1207
- crawlo/stats_collector.py,sha256=v4jC9BAe-23w93hWzbeMCCgQ9VuFPyxw5JV9ItbGH8w,1636
- crawlo/subscriber.py,sha256=udlHeTR0ymGQhCDxVUGwUzeeeR4TYCEJrJwFnkgr0cU,3836
- crawlo/task_manager.py,sha256=D9m-nqnGj-FZPtGk4CdwZX3Gw7IWyYvTS7CHpRGWc_w,748
- crawlo/commands/__init__.py,sha256=kZ3qATqDPmMUCNUQSFfBfIA8fp_1dgBwIAWbmFN3_To,355
- crawlo/commands/check.py,sha256=Q8wFjIo43XW0wP93TTlM7HSShgytJsbSWHIlmkcNxz0,3585
- crawlo/commands/genspider.py,sha256=kSHYsAGHRoxU6Qf_MGpR_VS-Ua5NUGY2KGm_Wapn0sw,3529
- crawlo/commands/list.py,sha256=itR05muZlZs8FbRh88kOhcRbZc77OXiR6A86UnVhSMY,2974
- crawlo/commands/run.py,sha256=s6JJC8HNa-tBgPDB2BPUmj26D7PMckhlx4AOEz57ESY,6197
- crawlo/commands/startproject.py,sha256=1KOq_CALy01oklr0dAUYhGFzu4f7w45q2H0O3qafLX4,3494
- crawlo/commands/stats.py,sha256=rH0TlD0o-xUr9RxtvNYgnSjHHoRyma3rvx9Q9nIGDNg,1659
- crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
- crawlo/core/engine.py,sha256=JFHooPp-5cfHSyxEh87nOOR5NMaPLVDfNSqAsbtx4PM,6030
- crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
- crawlo/core/scheduler.py,sha256=ZMPs4LSs69FsFfDTvaOMJKqpSQQGvIEE9pMyYVVAA64,1948
- crawlo/downloader/__init__.py,sha256=72u2Hef4HaMfs9VCqEjbMtiaRXbaXmgNiJn6qy09LHs,2384
- crawlo/downloader/aiohttp_downloader.py,sha256=YfvYCDp3y0OsVyfdYX1XJC3EcCrbNLKOcFY8b7JC3_w,7675
- crawlo/downloader/cffi_downloader.py,sha256=QthBmZOE0cjYNRTM-449EuaFuqxxdc19kp93iqOlwB8,12678
- crawlo/downloader/httpx_downloader.py,sha256=yshb1JZa4B6hcVwIT97SrxCXkj3HJqT9IDpxSxjkJm4,11754
- crawlo/extension/__init__.py,sha256=O2BVK1U3WwmurZb-PaYVz3g1tZ_iYUjCwilmUKf6844,1170
- crawlo/extension/log_interval.py,sha256=FOWeTOuWtOpCz2UPV5F_--QIa8yomltSpjxbw3F7bkU,1971
- crawlo/extension/log_stats.py,sha256=JFJsdK7UWkhP4TEAF-H-S7SpQbDpBryS0AT6e6jZCBo,1721
- crawlo/extension/logging_extension.py,sha256=rty2_up53KV05nCazuBuz2ZapHKq0ti7mGVBzMTr0ak,1236
- crawlo/filters/__init__.py,sha256=9fJQRVkxWWPChajYbAGe1O6UYB639xWt0hiLUGBs4hQ,1014
- crawlo/filters/aioredis_filter.py,sha256=phBFW9Z28oylbik9Kb2WHM65Wo5yRAH2w9Yz0_2HaOQ,5621
- crawlo/filters/memory_filter.py,sha256=L8XEJkObOxs4BzYpQvk9PVM969k2LE61VFsnEOTEf_E,6841
- crawlo/items/__init__.py,sha256=rFpx1qFBo0Ik7bSdnXC8EVTJUOQdoJYGVdhYjaH00nk,409
- crawlo/items/base.py,sha256=hwGJEdFWOdaZfalFX8umRkh_HUWLEbCjvq4j70fplMQ,598
- crawlo/items/fields.py,sha256=fpS0vlRPpZYjTaMDgI9Q8z_YQqruwf6fi4Dgm6R2oEk,1854
- crawlo/items/items.py,sha256=OmVEvMmgofMU95GkaiWkfNQ2fjsH2fY9sw3SKcmUhLs,3478
- crawlo/middleware/__init__.py,sha256=PSwpRLdBUopaQzBp1S0zK_TZbrRagQ4yzvgyLy4tBk8,570
- crawlo/middleware/default_header.py,sha256=OVW4vpRPp3Y6qYXtiEYlGqVjCYcbuv1Iecc7zEgwCsI,1099
- crawlo/middleware/download_delay.py,sha256=P2eyAJXwdLdC4yYuLhvKZVa1b5YQvQD0GpsR8aDW8-8,994
- crawlo/middleware/middleware_manager.py,sha256=Vfkasi8YaLxzGrOrFYfxOMEGRS8XocqeQMruLtVxL_c,6360
- crawlo/middleware/proxy.py,sha256=PiIfhRXfcMzBtW_p7jfR8rGxcM4VT68Mk54swbaV2H4,9801
- crawlo/middleware/request_ignore.py,sha256=jdybWFVXuA5YsAPfZJFzLTWkYhEAewNgxuhFqczPW9M,1027
- crawlo/middleware/response_code.py,sha256=vgXWv3mMu_v9URvhKA9myIFH4u6L4EwNme80wL4DCGc,677
- crawlo/middleware/response_filter.py,sha256=O2gkV_Yjart8kmmXTGzrtZnb_Uuefap4uL2Cu01iRs4,863
- crawlo/middleware/retry.py,sha256=a2EmigYFzL8oxd50JhrSe5XbYJyx8yDjOjE5fXAOFhY,3459
- crawlo/network/__init__.py,sha256=DVz1JpasjxCgOlXvm76gz-S18OXr4emG_J39yi5iVuA,130
- crawlo/network/request.py,sha256=qd50mmrXS6yZKmAb6ERAMHzm2Ln80Wu5NSMwx_t1IGc,7247
- crawlo/network/response.py,sha256=z2Owti_9ds567jLvfuX8hrfdQL8JKn5lkt2QOc-Gi3Y,6200
- crawlo/pipelines/__init__.py,sha256=IbXJ6B8LqxVVjeLNgL_12AxV6zbV8hNRQxAfMLjjSaw,273
- crawlo/pipelines/console_pipeline.py,sha256=bwe5hZgaVSWmh3R8XpOaaeAjJme-Ttrpo6G6f1cnLIg,1287
- crawlo/pipelines/mongo_pipeline.py,sha256=Yr48D0T61-_Y-EpgWXf7BUn9w8e-Pj5P07QDSPZ0pYU,4558
- crawlo/pipelines/mysql_batch_pipline.py,sha256=Mj3PReDRw22JhJ5hZxnka4cirKq3kEbOCNhgpq1gvfA,10611
- crawlo/pipelines/mysql_pipeline.py,sha256=bsAFqpxrCijzvX-IusxOtvTvQEUCt5uHNTyYMo_pIq4,8056
- crawlo/pipelines/pipeline_manager.py,sha256=k-Rg0os0Havrov99D-Jn3ROpnz154K30tf7aARE5W3k,2174
- crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
- crawlo/settings/default_settings.py,sha256=urj4XJ--ZpVRbbo3fWUT71bYQLmElx43AC9KeHtqHBs,7310
- crawlo/settings/setting_manager.py,sha256=4xXOzKwZCgAp8ybwvVcs2R--CsOD7c6dBIkj6DJHB3c,2998
- crawlo/spider/__init__.py,sha256=IyQd4ufbAIhA_cvWrsNReRv3tj76CHc5Aef9c8KR-9s,3983
- crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
- crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
- crawlo/templates/project/items.py.tmpl,sha256=bXx-oCldMr2EgBKUAH9LH5gMnbyLiWX-EySAaMzcu2g,318
- crawlo/templates/project/middlewares.py.tmpl,sha256=VAolmMTC6HBmJT5XvWB0ag6ig9iaDBS32adIQ1zPdw0,2177
- crawlo/templates/project/pipelines.py.tmpl,sha256=xK1Yl7wYxiyUCm07GZvMnCS_cxJ5LF7z1YBBdkLlWys,1880
- crawlo/templates/project/settings.py.tmpl,sha256=985Z-jiU6A31f5s1IVU4PvkC_QGlFlRRfTF6rZ_G4ek,1771
- crawlo/templates/project/spiders/__init__.py.tmpl,sha256=zMbePipgLsctQUEnda4WkHz8rDLUX--rc8ruI6zkpWc,111
- crawlo/templates/spider/spider.py.tmpl,sha256=SkNv1kOwet7ZdxoNXpj-o1iRETB30bcwPP16Uy8lyXg,869
- crawlo/utils/__init__.py,sha256=XCYumI8wJ1jU_Myn_K0LT-LVygPDUCdETCbXM3EWvlo,130
- crawlo/utils/concurrency_manager.py,sha256=o-_cfeUHdlBOM3eAXF857MtekSrRcVTBJ2jWZvY6weQ,5230
- crawlo/utils/date_tools.py,sha256=lcEFP2Z5b-6pUTHczrzCCuqiHP_4_2zamomMGPZrExo,7194
- crawlo/utils/db_helper.py,sha256=ZqOt1d3mErVv4TOvoWlov0niUxORB9aHByTmMoNFIDw,10902
- crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
- crawlo/utils/log.py,sha256=YD2FfXuuE2MC9ZdQQZ0H7KysE7l_LHZqQepaTPlcApo,4133
- crawlo/utils/pqueue.py,sha256=HDgX4HAkc7RqYUtX6q51tzI1ZRTACf8P_4jLqC4-uC0,5559
- crawlo/utils/project.py,sha256=hXSKV55OBUFjJi7TXekB4X3MmAgsqAeVTj5wPUWOizc,7394
- crawlo/utils/request.py,sha256=ejdKpTwc-HE04HQybafhOVywzz57IV3pY0YMkSLyGUo,9065
- crawlo/utils/spider_loader.py,sha256=V0CBTicJBYBZafhwLfDEfuEc_hJ2mSoiptT6qKufI9U,2249
- crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
- crawlo/utils/tools.py,sha256=uy7qw5Z1BIhyEgiHENvtM7WoGCJxlS8EX3PmOA7ouCo,275
- crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
- examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- examples/gxb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- examples/gxb/items.py,sha256=3-1Lxpi7EqMzheDJoO0MPyHky5nHG_nqQGgKlm8y6mQ,989
- examples/gxb/run.py,sha256=9kJlR8f-tZ3BqP5PW7sCLTw6PAFWo3x4cG5lc-6GWqI,333
- examples/gxb/settings.py,sha256=_nbXj9HV2e0F6liUzK0ueygLcaMM_IUlkuwL6mJqUfc,2345
- examples/gxb/spider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- examples/gxb/spider/miit_spider.py,sha256=tcQnuyUHfu-Re1QbKKSI9DXW3Sp1vyBW8qBzKLf_RC4,6666
- examples/gxb/spider/telecom_device.py,sha256=58iG6BQtQjjDHOF7-DXH0u5_XnppP5AJTQwaVJVyBEo,4929
- tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
- tests/test_proxy_health_check.py,sha256=_tDlxa_6TdL3M5RLkHF82roXJ8WIuG5hELBp2GADyKQ,1123
- tests/test_proxy_middleware_integration.py,sha256=mTPK_XvbmLCV_QoVZzA3ybWOOX61493Ew78WfTp-bYQ,4441
- tests/test_proxy_providers.py,sha256=u_R2fhab90vqvQEaOAztpAOe9tJXvUMIdoDxmStmXJ4,1749
- tests/test_proxy_stats.py,sha256=ES00CEoDITYPFBGPk8pecFzD3ItYIv6NSpcqNd8-kvo,526
- tests/test_proxy_strategies.py,sha256=9Z1pXmTNyw-eIhGXlf2abZbJx6igLohYq-_3hldQ5uE,1868
- crawlo-1.0.9.dist-info/METADATA,sha256=yvso4RU1U2ntTLfw9hVj_AkZmD5ygve40Oo0haLkbHw,1825
- crawlo-1.0.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- crawlo-1.0.9.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
- crawlo-1.0.9.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
- crawlo-1.0.9.dist-info/RECORD,,
examples/gxb/__init__.py DELETED
File without changes
examples/gxb/items.py DELETED
@@ -1,36 +0,0 @@
- from crawlo.items import Item, Field
-
- class RadioApprovalItem(Item):
-     approval_number = Field()
-     device_name = Field()
-     device_model = Field()
-     applicant = Field()
-     remarks = Field()
-     validity_period = Field()
-     frequency_tolerance = Field()
-     frequency_range = Field()
-     transmit_power = Field()
-     occupied_bandwidth = Field()
-     spurious_emission_limit = Field()
-     issue_date = Field()
-     approval_code = Field()
-     cmiit_id = Field()
-     modulation_mode = Field()
-     technology_system = Field()
-     mid = Field()
-
- class TelecomLicenseItem(Item):
-     license_number = Field()
-     device_name = Field()
-     device_model = Field()
-     applicant = Field()
-     manufacturer = Field()
-     issue_date = Field()
-     expiry_date = Field()
-     certificate_type = Field()
-     remarks = Field()
-     certificate_status = Field()
-     origin = Field()
-     article_id = Field()
-     article_edit_date = Field()
-     create_time = Field()
examples/gxb/run.py DELETED
@@ -1,16 +0,0 @@
- import asyncio
-
- from crawlo.crawler import CrawlerProcess
- from examples.gxb.spider.telecom_device import TelecomDeviceLicensesSpider
-
- async def main():
-     process = CrawlerProcess()
-     await process.crawl(
-         [TelecomDeviceLicensesSpider]
-     )
-
-
-
- if __name__ == '__main__':
-     asyncio.run(main())
-     # 132023
examples/gxb/settings.py DELETED
@@ -1,72 +0,0 @@
- import platform
-
- PROXY_ENABLED = True
- PROJECT_PACKAGE = 'gxb'
-
- # API address
- PROXY_API_URL = 'http://123.56.42.142:5000/proxy/getitem/'
-
- # Extraction method (choose according to the actual response structure)
- PROXY_EXTRACTOR = "proxy"
- # or
- # from utils.proxy_extractors import custom_extractor_proxy
- # PROXY_EXTRACTOR = custom_extractor_proxy
-
- # Refresh interval
- PROXY_REFRESH_INTERVAL = 5
-
- CONCURRENCY = 3
-
- # Timeout
- PROXY_API_TIMEOUT = 10
-
- if platform.system() == "Windows":
-     MYSQL_HOST = "pc-2ze9oh2diu5e5firh.rwlb.rds.aliyuncs.com"
- else:
-     MYSQL_HOST = "tianmai-k8s-dmadmin-x.rwlb.rds.aliyuncs.com"
-
- # Database port
- MYSQL_PORT = 3306
- # Database username
- MYSQL_USER = "data_collection"
- # Database password
- MYSQL_PASSWORD = "CRNabzFQ2H"
- # Database name
- MYSQL_DB = "cxzx_xm"
- # Database table
- MYSQL_TABLE = "telecom_device_licenses_v4"
-
- MYSQL_BATCH_SIZE = 100
-
- PIPELINES = [
-     'crawlo.pipelines.console_pipeline.ConsolePipeline',
-     # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # Optional: store to MySQL
- ]
-
-
- HEADERS = {
-     "Accept": "application/json, text/plain, */*",
-     "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-     "Authorization": "null",
-     "Cache-Control": "no-cache",
-     "Connection": "keep-alive",
-     "Content-Type": "application/json;charset=UTF-8",
-     "Origin": "https://ythzxfw.miit.gov.cn",
-     "Pragma": "no-cache",
-     "Referer": "https://ythzxfw.miit.gov.cn/oldyth/resultQuery",
-     "Sec-Fetch-Dest": "empty",
-     "Sec-Fetch-Mode": "cors",
-     "Sec-Fetch-Site": "same-origin",
-     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
-     "sec-ch-ua": '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
-     "sec-ch-ua-mobile": "?0",
-     "sec-ch-ua-platform": '"Windows"'
- }
-
- COOKIES = {
-     "wzws_sessionid": "oGivsIOAMjQwZTozYjM6MzBiMjo3MWMwOjg0NmY6MzQ4OTozNWZjOjEyMTGBOGY2OTQ2gjdjYmMyNQ==",
-     "ariauseGraymode": "false",
-     "Hm_lvt_a73626d298a849004aacc34159f68abd": "1755909741,1756084244,1756256541,1756344453",
-     "Hm_lpvt_a73626d298a849004aacc34159f68abd": "1756344453",
-     "HMACCOUNT": "08DF0D235A291EAA"
- }
examples/gxb/spider/__init__.py DELETED
File without changes
examples/gxb/spider/miit_spider.py DELETED
@@ -1,180 +0,0 @@
- import json
- import logging
- import re
-
- from crawlo import Request, Spider
-
- from examples.gxb.items import RadioApprovalItem, TelecomLicenseItem
-
- logger = logging.getLogger(__name__)
-
- # Base configuration
- BASE_URL = "https://ythzxfw.miit.gov.cn"
- API_URL = BASE_URL + "/oldyth/user-center/tbAppSearch/selectResult"
-
- # Task configuration
- TASKS = {
-     "radio_approval": {
-         "name": "Radio equipment type approval",
-         "category_id": "352",
-         "item_class": RadioApprovalItem,
-         "table": "radio_equipment_approval_new",
-         "field_mapping": {
-             'articleField01': 'approval_number',
-             'articleField02': 'device_name',
-             'articleField03': 'device_model',
-             'articleField04': 'applicant',
-             'articleField05': 'remarks',
-             'articleField06': 'validity_period',
-             'articleField07': 'frequency_tolerance',
-             'articleField08': 'frequency_range',
-             'articleField09': 'transmit_power',
-             'articleField10': 'occupied_bandwidth',
-             'articleField11': 'spurious_emission_limit',
-             'articleField12': 'issue_date',
-             'articleField13': 'approval_code',
-             'articleField14': 'cmiit_id',
-             'articleField15': 'modulation_mode',
-             'articleField16': 'technology_system',
-         }
-     },
-     "telecom_license": {
-         "name": "Telecom equipment network access license",
-         "category_id": "144",
-         "item_class": TelecomLicenseItem,
-         "table": "telecom_device_licenses_new",
-         "field_mapping": {
-             'articleField01': 'license_number',
-             'articleField02': 'device_name',
-             'articleField03': 'device_model',
-             'articleField04': 'applicant',
-             'articleField05': 'manufacturer',
-             'articleField06': 'issue_date',
-             'articleField07': 'expiry_date',
-             'articleField08': 'certificate_type',
-             'articleField09': 'remarks',
-             'articleField10': 'certificate_status',
-             'articleField11': 'origin',
-         }
-     }
- }
-
- def strip_html(text: str) -> str:
-     """Strip HTML tags."""
-     if not text or not isinstance(text, str):
-         return text
-     return re.sub(r'<[^>]+>', '', text)
-
- class MiitSpider(Spider):
-     name = "miit_spider"
-     custom_settings = {
-         'DOWNLOAD_DELAY': 0.5,
-         'CONCURRENT_REQUESTS': 5,
-         'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
-         'COOKIES_ENABLED': True,
-         'RETRY_TIMES': 3,
-         'DEFAULT_REQUEST_HEADERS': {
-             "Accept": "application/json, text/plain, */*",
-             "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-             "Authorization": "null",
-             "Cache-Control": "no-cache",
-             "Connection": "keep-alive",
-             "Content-Type": "application/json;charset=UTF-8",
-             "Origin": BASE_URL,
-             "Pragma": "no-cache",
-             "Referer": f"{BASE_URL}/oldyth/resultQuery",
-             "Sec-Fetch-Dest": "empty",
-             "Sec-Fetch-Mode": "cors",
-             "Sec-Fetch-Site": "same-origin",
-             "sec-ch-ua": '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
-             "sec-ch-ua-mobile": "?0",
-             "sec-ch-ua-platform": '"Windows"'
-         },
-         'COOKIES_DEBUG': False,
-         'LOG_LEVEL': 'INFO',
-         'ITEM_PIPELINES': {
-             'kyqb_scrapy.pipelines.DedupAndMySQLPipeline': 300,
-         },
-         'DOWNLOADER_MIDDLEWARES': {
-             'kyqb_scrapy.middlewares.RandomUserAgentMiddleware': 400,
-         }
-     }
-
-     def __init__(self, task='telecom_license', start_page=1, end_page=100, *args, **kwargs):
-         super(MiitSpider, self).__init__(*args, **kwargs)
-
-         if task not in TASKS:
-             raise ValueError(f"Unsupported task: {task}")
-
-         self.task_config = TASKS[task]
-         self.category_id = self.task_config["category_id"]
-         self.item_class = self.task_config["item_class"]
-         self.table_name = self.task_config["table"]
-         self.field_mapping = self.task_config["field_mapping"]
-
-         self.start_page = int(start_page)
-         self.end_page = int(end_page)
-         self.page_size = 5
-
-         # Set the table name in custom_settings (dynamic)
-         self.custom_settings['MYSQL_TABLE'] = self.table_name
-
-         logger.info(f"🚀 Starting task: {self.task_config['name']}, pages {self.start_page} ~ {self.end_page}")
-
-     def start_requests(self):
-         for page in range(self.start_page, self.end_page + 1):
-             data = {
-                 "categoryId": self.category_id,
-                 "currentPage": page,
-                 "pageSize": self.page_size,
-                 "searchContent": ""
-             }
-             yield Request(
-                 url=API_URL,
-                 method='POST',
-                 body=json.dumps(data, separators=(',', ':')),
-                 headers={'Content-Type': 'application/json;charset=UTF-8'},
-                 callback=self.parse,
-                 meta={'page': page},
-                 dont_filter=True
-             )
-
-     def parse(self, response):
-         page = response.meta['page']
-
-         # Check the response
-         if response.status_code != 200:
-             self.logger.error(f"❌ Page {page} request failed: HTTP {response.status_code}")
-             return
-
-         try:
-             result = json.loads(response.text)
-         except json.JSONDecodeError:
-             text = response.text
-             if "升级浏览器" in text or "请尝试" in text:
-                 self.logger.error(f"⚠️ Anti-bot page detected (browser-upgrade prompt). Response snippet: {text[:300]}")
-             else:
-                 self.logger.error(f"JSON parsing failed: {text[:300]}")
-             return
-
-         if not result.get("success"):
-             msg = result.get("msg", "Unknown error")
-             if "升级浏览器" in msg or "请尝试" in msg:
-                 self.logger.error(f"⚠️ Anti-bot prompt: {msg}")
-             else:
-                 self.logger.error(f"API request failed: {msg}")
-             return
-
-         raw_records = result["params"]["tbAppArticle"]["list"]
-         self.logger.info(f"✅ Page {page} returned {len(raw_records)} records")
-
-         for record in raw_records:
-             item = self.item_class()
-
-             for src_key, dst_key in self.field_mapping.items():
-                 value = record.get(src_key, '')
-                 if isinstance(value, str):
-                     value = strip_html(value)
-                 item[dst_key] = value
-
-             yield item