crawlo 1.0.4__tar.gz → 1.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- {crawlo-1.0.4 → crawlo-1.0.6}/LICENSE +22 -22
- {crawlo-1.0.4 → crawlo-1.0.6}/MANIFEST.in +16 -16
- {crawlo-1.0.4/crawlo.egg-info → crawlo-1.0.6}/PKG-INFO +49 -48
- {crawlo-1.0.4 → crawlo-1.0.6}/README.md +2 -2
- crawlo-1.0.6/crawlo/__init__.py +25 -0
- crawlo-1.0.6/crawlo/__version__.py +1 -0
- crawlo-1.0.6/crawlo/cli.py +41 -0
- crawlo-1.0.6/crawlo/commands/__init__.py +10 -0
- crawlo-1.0.6/crawlo/commands/genspider.py +111 -0
- crawlo-1.0.6/crawlo/commands/run.py +149 -0
- crawlo-1.0.6/crawlo/commands/startproject.py +101 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/core/__init__.py +2 -2
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/core/engine.py +158 -158
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/core/processor.py +40 -40
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/core/scheduler.py +57 -57
- crawlo-1.0.6/crawlo/crawler.py +219 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/downloader/__init__.py +78 -78
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/downloader/aiohttp_downloader.py +200 -259
- crawlo-1.0.6/crawlo/downloader/cffi_downloader.py +277 -0
- crawlo-1.0.6/crawlo/downloader/httpx_downloader.py +246 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/event.py +11 -11
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/exceptions.py +78 -64
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/extension/__init__.py +31 -31
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/extension/log_interval.py +49 -49
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/extension/log_stats.py +44 -44
- crawlo-1.0.6/crawlo/extension/logging_extension.py +35 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/filters/__init__.py +37 -37
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/filters/aioredis_filter.py +150 -150
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/filters/memory_filter.py +202 -202
- crawlo-1.0.6/crawlo/items/__init__.py +22 -0
- crawlo-1.0.6/crawlo/items/base.py +31 -0
- crawlo-1.0.4/crawlo/items/__init__.py → crawlo-1.0.6/crawlo/items/fields.py +54 -62
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/items/items.py +105 -119
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/middleware/__init__.py +21 -21
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/middleware/default_header.py +32 -32
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/middleware/download_delay.py +28 -28
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/middleware/middleware_manager.py +135 -140
- crawlo-1.0.6/crawlo/middleware/proxy.py +246 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/middleware/request_ignore.py +30 -30
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/middleware/response_code.py +18 -18
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/middleware/response_filter.py +26 -26
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/middleware/retry.py +90 -90
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/network/__init__.py +7 -7
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/network/request.py +203 -204
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/network/response.py +166 -166
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/pipelines/__init__.py +13 -13
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/pipelines/console_pipeline.py +39 -39
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo-1.0.6/crawlo/pipelines/mysql_batch_pipline.py +273 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/pipelines/mysql_pipeline.py +195 -195
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/pipelines/pipeline_manager.py +56 -56
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/settings/__init__.py +7 -7
- crawlo-1.0.6/crawlo/settings/default_settings.py +169 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/settings/setting_manager.py +99 -99
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/spider/__init__.py +41 -36
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/stats_collector.py +59 -59
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/subscriber.py +106 -106
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/task_manager.py +27 -27
- crawlo-1.0.6/crawlo/templates/crawlo.cfg.tmpl +11 -0
- crawlo-1.0.6/crawlo/templates/project/__init__.py.tmpl +4 -0
- crawlo-1.0.6/crawlo/templates/project/items.py.tmpl +18 -0
- crawlo-1.0.6/crawlo/templates/project/middlewares.py.tmpl +76 -0
- crawlo-1.0.6/crawlo/templates/project/pipelines.py.tmpl +64 -0
- crawlo-1.0.6/crawlo/templates/project/settings.py.tmpl +54 -0
- crawlo-1.0.6/crawlo/templates/project/spiders/__init__.py.tmpl +6 -0
- crawlo-1.0.6/crawlo/templates/spider/spider.py.tmpl +32 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/utils/__init__.py +7 -7
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/utils/concurrency_manager.py +124 -124
- crawlo-1.0.6/crawlo/utils/date_tools.py +233 -0
- crawlo-1.0.6/crawlo/utils/db_helper.py +344 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/utils/func_tools.py +82 -82
- crawlo-1.0.6/crawlo/utils/log.py +129 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/utils/pqueue.py +173 -173
- crawlo-1.0.6/crawlo/utils/project.py +199 -0
- crawlo-1.0.6/crawlo/utils/request.py +267 -0
- crawlo-1.0.6/crawlo/utils/spider_loader.py +63 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/utils/system.py +11 -11
- crawlo-1.0.6/crawlo/utils/tools.py +5 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo/utils/url.py +39 -39
- {crawlo-1.0.4 → crawlo-1.0.6/crawlo.egg-info}/PKG-INFO +49 -48
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo.egg-info/SOURCES.txt +33 -18
- crawlo-1.0.6/crawlo.egg-info/entry_points.txt +2 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo.egg-info/requires.txt +1 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo.egg-info/top_level.txt +1 -0
- crawlo-1.0.6/examples/gxb/items.py +36 -0
- crawlo-1.0.6/examples/gxb/run.py +16 -0
- crawlo-1.0.6/examples/gxb/settings.py +72 -0
- crawlo-1.0.6/examples/gxb/spider/__init__.py +0 -0
- crawlo-1.0.6/examples/gxb/spider/miit_spider.py +180 -0
- crawlo-1.0.6/examples/gxb/spider/telecom_device.py +129 -0
- {crawlo-1.0.4 → crawlo-1.0.6}/pyproject.toml +2 -2
- {crawlo-1.0.4 → crawlo-1.0.6}/requirements.txt +28 -21
- {crawlo-1.0.4 → crawlo-1.0.6}/setup.cfg +68 -67
- {crawlo-1.0.4 → crawlo-1.0.6}/tests/__init__.py +7 -7
- crawlo-1.0.6/tests/test_proxy_health_check.py +33 -0
- crawlo-1.0.6/tests/test_proxy_middleware_integration.py +137 -0
- crawlo-1.0.6/tests/test_proxy_providers.py +57 -0
- crawlo-1.0.6/tests/test_proxy_stats.py +20 -0
- crawlo-1.0.6/tests/test_proxy_strategies.py +60 -0
- crawlo-1.0.4/crawlo/__init__.py +0 -9
- crawlo-1.0.4/crawlo/__version__.py +0 -1
- crawlo-1.0.4/crawlo/crawler.py +0 -242
- crawlo-1.0.4/crawlo/downloader/httpx_downloader.py +0 -187
- crawlo-1.0.4/crawlo/downloader/playwright_downloader.py +0 -161
- crawlo-1.0.4/crawlo/pipelines/mysql_batch_pipline.py +0 -134
- crawlo-1.0.4/crawlo/settings/default_settings.py +0 -94
- crawlo-1.0.4/crawlo/templates/item_template.tmpl +0 -22
- crawlo-1.0.4/crawlo/templates/project_template/main.py +0 -33
- crawlo-1.0.4/crawlo/templates/project_template/setting.py +0 -190
- crawlo-1.0.4/crawlo/templates/spider_template.tmpl +0 -31
- crawlo-1.0.4/crawlo/utils/date_tools.py +0 -177
- crawlo-1.0.4/crawlo/utils/log.py +0 -39
- crawlo-1.0.4/crawlo/utils/project.py +0 -59
- crawlo-1.0.4/crawlo/utils/request.py +0 -122
- crawlo-1.0.4/crawlo/utils/tools.py +0 -303
- crawlo-1.0.4/crawlo.egg-info/entry_points.txt +0 -2
- crawlo-1.0.4/tests/baidu_spider/__init__.py +0 -7
- crawlo-1.0.4/tests/baidu_spider/demo.py +0 -94
- crawlo-1.0.4/tests/baidu_spider/items.py +0 -25
- crawlo-1.0.4/tests/baidu_spider/middleware.py +0 -49
- crawlo-1.0.4/tests/baidu_spider/pipeline.py +0 -55
- crawlo-1.0.4/tests/baidu_spider/request_fingerprints.txt +0 -9
- crawlo-1.0.4/tests/baidu_spider/run.py +0 -27
- crawlo-1.0.4/tests/baidu_spider/settings.py +0 -80
- crawlo-1.0.4/tests/baidu_spider/spiders/__init__.py +0 -7
- crawlo-1.0.4/tests/baidu_spider/spiders/bai_du.py +0 -61
- crawlo-1.0.4/tests/baidu_spider/spiders/sina.py +0 -79
- {crawlo-1.0.4 → crawlo-1.0.6}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.0.4/crawlo/templates/project_template/items → crawlo-1.0.6/examples}/__init__.py +0 -0
- {crawlo-1.0.4/crawlo/templates/project_template/spiders → crawlo-1.0.6/examples/gxb}/__init__.py +0 -0
{crawlo-1.0.4 → crawlo-1.0.6}/LICENSE

@@ -1,23 +1,23 @@
-MIT License
-
-Modifications:
-
-Copyright (c) 2020 crawl-coder <2251018029@qq.com>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+MIT License
+
+Modifications:
+
+Copyright (c) 2020 crawl-coder <2251018029@qq.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
{crawlo-1.0.4 → crawlo-1.0.6}/MANIFEST.in

@@ -1,17 +1,17 @@
-include README.md
-include LICENSE
-include requirements.txt # if there is a global requirements.txt in the project root
-include VERSION # if there is a global VERSION file in the project root
-
-# Files included inside the package
-recursive-include crawlo/utils/js *
-recursive-include crawlo/templates *
-
-# Test files (if tests should be shipped in the distribution)
-recursive-include tests *
-
-# Exclusions
-global-exclude __pycache__ *.py[cod] .DS_Store *.so
-global-exclude *.bak *.swp *.orig *.rej
-prune samples # exclude the samples directory
+include README.md
+include LICENSE
+include requirements.txt # if there is a global requirements.txt in the project root
+include VERSION # if there is a global VERSION file in the project root
+
+# Files included inside the package
+recursive-include crawlo/utils/js *
+recursive-include crawlo/templates *
+
+# Test files (if tests should be shipped in the distribution)
+recursive-include tests *
+
+# Exclusions
+global-exclude __pycache__ *.py[cod] .DS_Store *.so
+global-exclude *.bak *.swp *.orig *.rej
+prune samples # exclude the samples directory
 prune docs # exclude the docs directory
{crawlo-1.0.4/crawlo.egg-info → crawlo-1.0.6}/PKG-INFO

@@ -1,48 +1,49 @@
-Metadata-Version: 2.4
-Name: crawlo
-Version: 1.0.4
-Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
-Home-page: https://github.com/crawl-coder/Crawlo.git
-Author: crawl-coder
-Author-email: crawlo@qq.com
-License: MIT
-Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.6
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.12.14
-Requires-Dist: aiomysql>=0.2.0
-Requires-Dist: aioredis>=2.0.1
-Requires-Dist: asyncmy>=0.2.10
-Requires-Dist: cssselect>=1.2.0
-Requires-Dist: dateparser>=1.2.2
-Requires-Dist: httpx[http2]>=0.27.0
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-
-
-Requires-Dist:
-Requires-Dist:
-
-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-
-
-Crawlo
+Metadata-Version: 2.4
+Name: crawlo
+Version: 1.0.6
+Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
+Home-page: https://github.com/crawl-coder/Crawlo.git
+Author: crawl-coder
+Author-email: crawlo@qq.com
+License: MIT
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.12.14
+Requires-Dist: aiomysql>=0.2.0
+Requires-Dist: aioredis>=2.0.1
+Requires-Dist: asyncmy>=0.2.10
+Requires-Dist: cssselect>=1.2.0
+Requires-Dist: dateparser>=1.2.2
+Requires-Dist: httpx[http2]>=0.27.0
+Requires-Dist: curl-cffi>=0.13.0
+Requires-Dist: lxml>=5.2.1
+Requires-Dist: motor>=3.7.0
+Requires-Dist: parsel>=1.9.1
+Requires-Dist: pydantic>=2.11.7
+Requires-Dist: pymongo>=4.11
+Requires-Dist: PyMySQL>=1.1.1
+Requires-Dist: python-dateutil>=2.9.0.post0
+Requires-Dist: redis>=6.2.0
+Requires-Dist: requests>=2.32.4
+Requires-Dist: six>=1.17.0
+Requires-Dist: ujson>=5.9.0
+Requires-Dist: urllib3>=2.5.0
+Requires-Dist: w3lib>=2.1.2
+Provides-Extra: render
+Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
+Requires-Dist: playwright; extra == "render"
+Requires-Dist: selenium>=3.141.0; extra == "render"
+Provides-Extra: all
+Requires-Dist: bitarray>=1.5.3; extra == "all"
+Requires-Dist: PyExecJS>=1.5.1; extra == "all"
+Requires-Dist: pymongo>=3.10.1; extra == "all"
+Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
+Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
+Requires-Dist: playwright; extra == "all"
+Requires-Dist: selenium>=3.141.0; extra == "all"
+
+# Crawlo
+Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling and data pipelines.
{crawlo-1.0.4 → crawlo-1.0.6}/README.md

@@ -1,2 +1,2 @@
-# Crawlo
-Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling and data pipelines.
+# Crawlo
+Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling and data pipelines.
crawlo-1.0.6/crawlo/__init__.py

@@ -0,0 +1,25 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+Crawlo - an asynchronous crawler framework
+"""
+from crawlo.spider import Spider
+from crawlo.items.items import Item
+from crawlo.network.request import Request
+from crawlo.network.response import Response
+from crawlo.downloader import DownloaderBase
+from crawlo.middleware import BaseMiddleware
+
+# Version number
+from crawlo.__version__ import __version__
+
+# Optional: define the publicly exposed interface
+__all__ = [
+    'Spider',
+    'Item',
+    'Request',
+    'Response',
+    'DownloaderBase',
+    'BaseMiddleware',
+    '__version__',
+]
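The names re-exported here are the framework's public surface. A minimal usage sketch, assuming nothing beyond what the `__all__` list above exports:

from crawlo import Spider, Item, Request, Response, __version__

# For this release the version string comes from crawlo/__version__.py below.
print(f"crawlo {__version__}")  # -> "crawlo 1.0.6"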
crawlo-1.0.6/crawlo/__version__.py

@@ -0,0 +1 @@
+__version__ = "1.0.6"
crawlo-1.0.6/crawlo/cli.py

@@ -0,0 +1,41 @@
+# crawlo/cli.py
+# !/usr/bin/python
+# -*- coding: UTF-8 -*-
+import sys
+import argparse
+from crawlo.commands import get_commands
+
+
+def main():
+    # Collect all available commands
+    commands = get_commands()
+
+    parser = argparse.ArgumentParser(
+        description="Crawlo: A lightweight web crawler framework.",
+        usage="crawlo <command> [options]"
+    )
+    parser.add_argument('command', help='Available commands: ' + ', '.join(commands.keys()))
+    # Note: no command-specific arguments are added here; sub-commands parse their own
+
+    # Parse only the command name
+    args, unknown = parser.parse_known_args()
+
+    if args.command not in commands:
+        print(f"Unknown command: {args.command}")
+        print(f"Available commands: {', '.join(commands.keys())}")
+        sys.exit(1)
+
+    # Dynamically import and execute the command
+    try:
+        module = __import__(commands[args.command], fromlist=['main'])
+        sys.exit(module.main(unknown))
+    except ImportError as e:
+        print(f"Failed to load command '{args.command}': {e}")
+        sys.exit(1)
+    except Exception as e:
+        print(f"Command '{args.command}' failed: {e}")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
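The command registry that `cli.main()` relies on lives in crawlo/commands/__init__.py (+10 lines in the file list above), which is not shown in this excerpt. From how it is consumed, `__import__(commands[args.command], fromlist=['main'])` followed by `module.main(unknown)`, `get_commands()` presumably maps command names to importable module paths. A hypothetical sketch for orientation only, not the actual file:

# Hypothetical reconstruction of crawlo/commands/__init__.py (illustration only).
def get_commands():
    # Each value must be an importable module exposing a main(args) function,
    # matching how cli.main() dispatches commands.
    return {
        'startproject': 'crawlo.commands.startproject',
        'genspider': 'crawlo.commands.genspider',
        'run': 'crawlo.commands.run',
    }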
crawlo-1.0.6/crawlo/commands/genspider.py

@@ -0,0 +1,111 @@
+import os
+import sys
+from pathlib import Path
+import configparser
+import importlib
+
+TEMPLATES_DIR = Path(__file__).parent.parent / 'templates'
+
+
+def _render_template(tmpl_path, context):
+    """Read a template file and replace {{key}} with the corresponding value from context."""
+    with open(tmpl_path, 'r', encoding='utf-8') as f:
+        content = f.read()
+    for key, value in context.items():
+        content = content.replace(f'{{{{{key}}}}}', str(value))
+    return content
+
+
+def main(args):
+    if len(args) < 2:
+        print("Usage: crawlo genspider <spider_name> <domain>")
+        return 1
+
+    spider_name = args[0]
+    domain = args[1]
+
+    # Locate the project root directory
+    project_root = None
+    current = Path.cwd()
+    while True:
+        cfg_file = current / 'crawlo.cfg'
+        if cfg_file.exists():
+            project_root = current
+            break
+        parent = current.parent
+        if parent == current:
+            break
+        current = parent
+
+    if not project_root:
+        print("Error: Not a crawlo project. crawlo.cfg not found.")
+        return 1
+
+    # Add the project root to sys.path
+    if str(project_root) not in sys.path:
+        sys.path.insert(0, str(project_root))
+
+    # Read the settings module from crawlo.cfg to get the project package name
+    config = configparser.ConfigParser()
+    try:
+        config.read(cfg_file, encoding='utf-8')
+        settings_module = config.get('settings', 'default')
+        project_package = settings_module.split('.')[0]  # e.g., myproject.settings -> myproject
+    except Exception as e:
+        print(f"Error reading crawlo.cfg: {e}")
+        return 1
+
+    # Determine the path of the items module
+    items_module_path = f"{project_package}.items"
+
+    # Try to import the items module
+    try:
+        items_module = importlib.import_module(items_module_path)
+        # Collect all classes in the module whose names start with an uppercase letter
+        item_classes = [cls for cls in items_module.__dict__.values()
+                        if isinstance(cls, type) and cls.__name__.isupper()]
+
+        # If classes were found, use the first one as the default
+        if item_classes:
+            default_item_class = item_classes[0].__name__
+        else:
+            default_item_class = "ExampleItem"  # fall back to the example
+    except ImportError as e:
+        print(f"Error importing items module '{items_module_path}': {e}")
+        default_item_class = "ExampleItem"
+
+    # Create the spider file
+    spiders_dir = project_root / project_package / 'spiders'
+    if not spiders_dir.exists():
+        spiders_dir.mkdir(parents=True)
+
+    spider_file = spiders_dir / f'{spider_name}.py'
+    if spider_file.exists():
+        print(f"Error: Spider '{spider_name}' already exists.")
+        return 1
+
+    # ✅ Corrected template path
+    tmpl_path = TEMPLATES_DIR / 'spider' / 'spider.py.tmpl'
+
+    if not tmpl_path.exists():
+        print(f"Error: Template file not found at {tmpl_path}")
+        return 1
+
+    # ✅ Build the proper class name
+    class_name = f"{spider_name.capitalize()}Spider"
+
+    context = {
+        'spider_name': spider_name,
+        'domain': domain,
+        'project_name': project_package,
+        'item_class': default_item_class,
+        'class_name': class_name  # ✅ add the prepared class name
+    }
+
+    content = _render_template(tmpl_path, context)
+
+    with open(spider_file, 'w', encoding='utf-8') as f:
+        f.write(content)
+
+    print(f"Spider '{spider_name}' created in {spider_file}")
+    return 0
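The `_render_template()` helper above is plain string substitution: every `{{key}}` placeholder is replaced with the corresponding context value. A self-contained illustration of the same mechanics; the template text here is made up, since spider.py.tmpl itself is not part of this excerpt:

# Stand-alone demo of the {{key}} substitution used by _render_template().
# The template string is hypothetical; only the replacement logic mirrors the code above.
template = (
    "# {{project_name}}/spiders/{{spider_name}}.py\n"
    "class {{class_name}}(Spider):\n"
    "    name = '{{spider_name}}'\n"
)
context = {'project_name': 'myproject', 'spider_name': 'example', 'class_name': 'ExampleSpider'}
for key, value in context.items():
    template = template.replace(f'{{{{{key}}}}}', str(value))
print(template)
# -> # myproject/spiders/example.py
#    class ExampleSpider(Spider):
#        name = 'example'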
crawlo-1.0.6/crawlo/commands/run.py

@@ -0,0 +1,149 @@
+# crawlo/commands/run.py
+import asyncio
+import importlib
+import sys
+from pathlib import Path
+import configparser
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.utils.project import get_settings
+from crawlo.utils.log import get_logger
+from crawlo.utils.spider_loader import SpiderLoader
+
+logger = get_logger(__name__)
+
+
+def main(args):
+    """
+    Entry point for running the specified spider.
+    Usage: crawlo run <spider_name>
+    """
+    if len(args) < 1:
+        print("Usage: crawlo run <spider_name>")
+        print("Example: crawlo run baidu")
+        return 1
+
+    spider_name = args[0]
+
+    try:
+        # 1. Get the project root directory
+        project_root = get_settings()
+
+        # Add the project root to the Python path
+        if str(project_root) not in sys.path:
+            sys.path.insert(0, str(project_root))
+
+        # 2. Read the config file to get the project package name
+        cfg_file = project_root / 'crawlo.cfg'
+        if not cfg_file.exists():
+            print(f"❌ Error: crawlo.cfg not found in {project_root}")
+            return 1
+
+        config = configparser.ConfigParser()
+        config.read(cfg_file, encoding='utf-8')
+
+        if not config.has_section('settings') or not config.has_option('settings', 'default'):
+            print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+            return 1
+
+        settings_module = config.get('settings', 'default')
+        project_package = settings_module.split('.')[0]
+
+        # 3. Find and load the Spider with the given name
+        spider_class = find_spider_by_name(project_package, spider_name)
+        if spider_class is None:
+            return 1
+
+        # 4. Create a CrawlerProcess and run the single spider
+        settings = get_settings()
+        process = CrawlerProcess(settings)
+
+        print(f"🚀 Starting spider: {spider_class.name}")
+        print(f"📁 Project: {project_package}")
+        print(f"🕷️ Class: {spider_class.__name__}")
+        print("-" * 50)
+
+        # Run the single spider
+        asyncio.run(process.crawl(spider_class))
+
+        print("-" * 50)
+        print("✅ Spider completed successfully!")
+        return 0
+
+    except Exception as e:
+        print(f"❌ Error running spider: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+
+def find_spider_by_name(project_package: str, target_spider_name: str):
+    """Look up a spider by name using SpiderLoader."""
+    loader = SpiderLoader(project_package)
+    spider_class = loader.load(target_spider_name)
+
+    if spider_class is None:
+        print(f"❌ Error: Spider with name '{target_spider_name}' not found")
+        print("💡 Available spiders:")
+        available_spiders = loader.list()
+        for spider_name in available_spiders:
+            print(f"  - {spider_name}")
+        return None
+
+    return spider_class
+
+
+def list_available_spiders(project_package: str):
+    """
+    List all available spiders.
+    """
+    spiders_dir = Path.cwd() / project_package / 'spiders'
+    if not spiders_dir.exists():
+        print("  No spiders directory found")
+        return
+
+    spider_count = 0
+    for py_file in spiders_dir.glob("*.py"):
+        if py_file.name.startswith('_'):
+            continue
+
+        module_name = py_file.stem
+        spider_module_path = f"{project_package}.spiders.{module_name}"
+
+        try:
+            module = importlib.import_module(spider_module_path)
+        except ImportError:
+            continue
+
+        # Find all Spider subclasses in the module
+        from crawlo.spider import Spider
+        for attr_name in dir(module):
+            attr_value = getattr(module, attr_name)
+            if (isinstance(attr_value, type) and
+                    issubclass(attr_value, Spider) and
+                    attr_value != Spider and
+                    hasattr(attr_value, 'name')):
+                print(f"  - {attr_value.name} (class: {attr_value.__name__}, module: {module_name})")
+                spider_count += 1
+
+    if spider_count == 0:
+        print("  No spiders found")
+
+
+def run_spider_by_name(spider_name: str, project_root: Path = None):
+    """
+    Run a spider by name directly from code.
+    """
+    if project_root:
+        if str(project_root) not in sys.path:
+            sys.path.insert(0, str(project_root))
+
+    args = [spider_name]
+    return main(args)
+
+
+if __name__ == '__main__':
+    # Allow running directly: python -m crawlo.commands.run <spider_name>
+    import sys
+
+    sys.exit(main(sys.argv[1:]))
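Besides invoking `crawlo run <spider_name>` from inside a project, the `run_spider_by_name()` helper defined above allows the same thing from code. A minimal sketch; the project path is a placeholder, not a real location:

# Programmatic equivalent of running `crawlo run example` from a project directory.
# '/path/to/myproject' stands in for a real crawlo project root containing crawlo.cfg.
from pathlib import Path
from crawlo.commands.run import run_spider_by_name

exit_code = run_spider_by_name('example', project_root=Path('/path/to/myproject'))
print('ok' if exit_code == 0 else 'failed')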
crawlo-1.0.6/crawlo/commands/startproject.py

@@ -0,0 +1,101 @@
+# crawlo/commands/startproject.py
+import os
+import shutil
+from pathlib import Path
+
+TEMPLATES_DIR = Path(__file__).parent.parent / 'templates'
+
+
+def _render_template(tmpl_path, context):
+    """Read a template file and replace {{key}} with the corresponding value from context."""
+    with open(tmpl_path, 'r', encoding='utf-8') as f:
+        content = f.read()
+    for key, value in context.items():
+        content = content.replace(f'{{{{{key}}}}}', str(value))
+    return content
+
+
+def _copytree_with_templates(src, dst, context):
+    """
+    Recursively copy a directory: .tmpl files are rendered and copied with the .tmpl suffix removed; all other files are copied as-is.
+    """
+    src_path = Path(src)
+    dst_path = Path(dst)
+    dst_path.mkdir(parents=True, exist_ok=True)
+
+    for item in src_path.rglob('*'):
+        rel_path = item.relative_to(src_path)
+        dst_item = dst_path / rel_path
+
+        if item.is_dir():
+            # Create the target directory
+            dst_item.mkdir(parents=True, exist_ok=True)
+        else:
+            if item.suffix == '.tmpl':
+                # Render the template file and drop the .tmpl suffix
+                rendered_content = _render_template(item, context)
+                final_dst = dst_item.with_suffix('')  # drop .tmpl
+                final_dst.parent.mkdir(parents=True, exist_ok=True)  # make sure the parent directory exists
+                with open(final_dst, 'w', encoding='utf-8') as f:
+                    f.write(rendered_content)
+            else:
+                # Regular file: copy it directly
+                shutil.copy2(item, dst_item)
+
+
+def main(args):
+    if len(args) != 1:
+        print("Usage: crawlo startproject <project_name>")
+        return 1
+
+    project_name = args[0]
+    project_dir = Path(project_name)
+
+    if project_dir.exists():
+        print(f"Error: Directory '{project_dir}' already exists.")
+        return 1
+
+    context = {'project_name': project_name}
+    template_dir = TEMPLATES_DIR / 'project'
+
+    try:
+        # 1. Create the project root directory
+        project_dir.mkdir()
+
+        # 2. Handle crawlo.cfg.tmpl: render it separately and write it to the project root
+        cfg_template = TEMPLATES_DIR / 'crawlo.cfg.tmpl'  # ✅ use the template under templates/
+        if cfg_template.exists():
+            cfg_content = _render_template(cfg_template, context)
+            (project_dir / 'crawlo.cfg').write_text(cfg_content, encoding='utf-8')
+        else:
+            print("Warning: crawlo.cfg.tmpl not found in templates.")
+
+        # 3. Copy all other template files into the project package (project_dir / project_name)
+        package_dir = project_dir / project_name
+        # This copies __init__.py.tmpl, items.py.tmpl, settings.py.tmpl, spiders/, etc.
+        # and renders them into .py files
+        _copytree_with_templates(template_dir, package_dir, context)
+
+        # 4. Create the logs directory
+        (project_dir / 'logs').mkdir(exist_ok=True)
+
+        print(f"""
+✔ Project '{project_name}' created successfully!
+
+Enter the project directory:
+  cd {project_name}
+
+Create a spider:
+  crawlo genspider example example.com
+
+Run the spider:
+  crawlo run example
+""")
+        return 0
+
+    except Exception as e:
+        print(f"Error creating project: {e}")
+        # On error, try to clean up the directory that was created
+        if project_dir.exists():
+            shutil.rmtree(project_dir, ignore_errors=True)
+        return 1
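Combining the steps above with the templates/project/*.tmpl entries in the file list at the top of this diff, a freshly generated project should look roughly like this (inferred from the code and template names, not an authoritative listing):

myproject/
    crawlo.cfg            (rendered from crawlo.cfg.tmpl)
    logs/
    myproject/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py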
{crawlo-1.0.4 → crawlo-1.0.6}/crawlo/core/__init__.py

@@ -1,2 +1,2 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-