scrapy-cffi 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapy_cffi-0.1.0/LICENSE +38 -0
- scrapy_cffi-0.1.0/MANIFEST.in +4 -0
- scrapy_cffi-0.1.0/PKG-INFO +78 -0
- scrapy_cffi-0.1.0/README.md +49 -0
- scrapy_cffi-0.1.0/pyproject.toml +49 -0
- scrapy_cffi-0.1.0/scrapy_cffi/__init__.py +13 -0
- scrapy_cffi-0.1.0/scrapy_cffi/commands/__init__.py +0 -0
- scrapy_cffi-0.1.0/scrapy_cffi/commands/base.py +10 -0
- scrapy_cffi-0.1.0/scrapy_cffi/commands/demo.py +112 -0
- scrapy_cffi-0.1.0/scrapy_cffi/commands/genspider.py +83 -0
- scrapy_cffi-0.1.0/scrapy_cffi/commands/main.py +43 -0
- scrapy_cffi-0.1.0/scrapy_cffi/commands/startproject.py +47 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/__init__.py +10 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/api.py +11 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/__init__.py +12 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/fetch.py +165 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/internet/__init__.py +11 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/internet/registry.py +23 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/internet/request.py +268 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/internet/response.py +112 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/selector.py +247 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/engine.py +253 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/scheduler/__init__.py +6 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/scheduler/base.py +208 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/sessions.py +439 -0
- scrapy_cffi-0.1.0/scrapy_cffi/core/tasks.py +136 -0
- scrapy_cffi-0.1.0/scrapy_cffi/crawler.py +290 -0
- scrapy_cffi-0.1.0/scrapy_cffi/databases/__init__.py +5 -0
- scrapy_cffi-0.1.0/scrapy_cffi/databases/mongodb.py +0 -0
- scrapy_cffi-0.1.0/scrapy_cffi/databases/mysql.py +0 -0
- scrapy_cffi-0.1.0/scrapy_cffi/databases/redis.py +136 -0
- scrapy_cffi-0.1.0/scrapy_cffi/exceptions/__init__.py +1 -0
- scrapy_cffi-0.1.0/scrapy_cffi/exceptions/base.py +48 -0
- scrapy_cffi-0.1.0/scrapy_cffi/extensions/__init__.py +7 -0
- scrapy_cffi-0.1.0/scrapy_cffi/extensions/base.py +12 -0
- scrapy_cffi-0.1.0/scrapy_cffi/extensions/signal_manager.py +104 -0
- scrapy_cffi-0.1.0/scrapy_cffi/extensions/signals.py +23 -0
- scrapy_cffi-0.1.0/scrapy_cffi/hooks/__init__.py +1 -0
- scrapy_cffi-0.1.0/scrapy_cffi/hooks/base.py +63 -0
- scrapy_cffi-0.1.0/scrapy_cffi/hooks/interceptors.py +13 -0
- scrapy_cffi-0.1.0/scrapy_cffi/hooks/pipelines.py +17 -0
- scrapy_cffi-0.1.0/scrapy_cffi/hooks/signals.py +10 -0
- scrapy_cffi-0.1.0/scrapy_cffi/hooks/spiders.py +7 -0
- scrapy_cffi-0.1.0/scrapy_cffi/interceptors/__init__.py +11 -0
- scrapy_cffi-0.1.0/scrapy_cffi/interceptors/api.py +1 -0
- scrapy_cffi-0.1.0/scrapy_cffi/interceptors/base.py +75 -0
- scrapy_cffi-0.1.0/scrapy_cffi/interceptors/chains.py +338 -0
- scrapy_cffi-0.1.0/scrapy_cffi/interceptors/spiders.py +187 -0
- scrapy_cffi-0.1.0/scrapy_cffi/internet/__init__.py +10 -0
- scrapy_cffi-0.1.0/scrapy_cffi/item/__init__.py +6 -0
- scrapy_cffi-0.1.0/scrapy_cffi/item/base.py +135 -0
- scrapy_cffi-0.1.0/scrapy_cffi/models/__init__.py +1 -0
- scrapy_cffi-0.1.0/scrapy_cffi/models/api.py +4 -0
- scrapy_cffi-0.1.0/scrapy_cffi/models/base.py +35 -0
- scrapy_cffi-0.1.0/scrapy_cffi/models/component.py +43 -0
- scrapy_cffi-0.1.0/scrapy_cffi/models/databases.py +27 -0
- scrapy_cffi-0.1.0/scrapy_cffi/models/media.py +43 -0
- scrapy_cffi-0.1.0/scrapy_cffi/models/settings.py +121 -0
- scrapy_cffi-0.1.0/scrapy_cffi/models/singal.py +21 -0
- scrapy_cffi-0.1.0/scrapy_cffi/pipelines/__init__.py +5 -0
- scrapy_cffi-0.1.0/scrapy_cffi/pipelines/api.py +44 -0
- scrapy_cffi-0.1.0/scrapy_cffi/pipelines/base.py +39 -0
- scrapy_cffi-0.1.0/scrapy_cffi/scheduler/__init__.py +6 -0
- scrapy_cffi-0.1.0/scrapy_cffi/spiders/__init__.py +7 -0
- scrapy_cffi-0.1.0/scrapy_cffi/spiders/base.py +150 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/demo_spider/customRedisSpider.py +50 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/demo_spider/customSpider.py +67 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/demo_spider/studentSpider.py +69 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/js_path/js_action.js +3 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/server/demo_server/fastApiServer.py +142 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/server/demo_server/ws_server.py +53 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/server/readme.txt +12 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spider.py.j2 +12 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/extensions/__init__.py +1 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/extensions/extension.py +59 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/interceptors/__init__.py +1 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/interceptors/interceptors.py +31 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/items/__init__.py +1 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/items/item.py +9 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/pipelines/__init__.py +1 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/pipelines/pipeline.py +28 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/runner.py +118 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/settings.py +21 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/spiders/__init__.py +0 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/_action_core/__init__.py +7 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/_action_core/baseManager.py +32 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/_action_core/coreBase.py +33 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/_action_core/coreFlow.py +47 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/_action_core/reqBase.py +156 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_base/__init__.py +2 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_base/collect_base.py +27 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_base/function_base.py +36 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_flow/__init__.py +2 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_flow/collect.py +37 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_flow/function_flow.py +29 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/component/__init__.py +5 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/component/midManager.py +86 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/component/someManager.py +3 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/config.py +12 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/export_interface.py +49 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/manager.py +160 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/models/__init__.py +1 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/models/data_model.py +4 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/utils/__init__.py +2 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/utils/algorithm.py +7 -0
- scrapy_cffi-0.1.0/scrapy_cffi/templates/task/utils/common.py +42 -0
- scrapy_cffi-0.1.0/scrapy_cffi/utils/__init__.py +3 -0
- scrapy_cffi-0.1.0/scrapy_cffi/utils/common.py +472 -0
- scrapy_cffi-0.1.0/scrapy_cffi/utils/log.py +152 -0
- scrapy_cffi-0.1.0/scrapy_cffi/utils/media.py +72 -0
- scrapy_cffi-0.1.0/scrapy_cffi/utils/robot.py +119 -0
- scrapy_cffi-0.1.0/scrapy_cffi/utils/scrapyRunner.py +55 -0
- scrapy_cffi-0.1.0/scrapy_cffi.egg-info/PKG-INFO +78 -0
- scrapy_cffi-0.1.0/scrapy_cffi.egg-info/SOURCES.txt +118 -0
- scrapy_cffi-0.1.0/scrapy_cffi.egg-info/dependency_links.txt +1 -0
- scrapy_cffi-0.1.0/scrapy_cffi.egg-info/entry_points.txt +2 -0
- scrapy_cffi-0.1.0/scrapy_cffi.egg-info/requires.txt +19 -0
- scrapy_cffi-0.1.0/scrapy_cffi.egg-info/top_level.txt +1 -0
- scrapy_cffi-0.1.0/setup.cfg +4 -0
- scrapy_cffi-0.1.0/tests/test.py +2 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
Portions of this project are derived from the Scrapy project (https://github.com/scrapy/scrapy),
|
|
2
|
+
which is licensed under the BSD 3-Clause License.
|
|
3
|
+
|
|
4
|
+
In particular, the `item.py` module is adapted from Scrapy's `scrapy/item.py`
|
|
5
|
+
with necessary adjustments for integration with the asyncio-based engine.
|
|
6
|
+
Original copyright:
|
|
7
|
+
Copyright (c) Scrapy developers. All rights reserved.
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
BSD 3-Clause License
|
|
12
|
+
|
|
13
|
+
Copyright (c) 2025, aFunnyStrange
|
|
14
|
+
|
|
15
|
+
Redistribution and use in source and binary forms, with or without
|
|
16
|
+
modification, are permitted provided that the following conditions are met:
|
|
17
|
+
|
|
18
|
+
1. Redistributions of source code must retain the above copyright notice,
|
|
19
|
+
this list of conditions, and the following disclaimer.
|
|
20
|
+
|
|
21
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
22
|
+
this list of conditions, and the following disclaimer in the documentation
|
|
23
|
+
and/or other materials provided with the distribution.
|
|
24
|
+
|
|
25
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
26
|
+
contributors may be used to endorse or promote products derived from
|
|
27
|
+
this software without specific prior written permission.
|
|
28
|
+
|
|
29
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
30
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
31
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
32
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
33
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
34
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
35
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
36
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
37
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
38
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scrapy_cffi
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An asyncio-style web scraping framework inspired by Scrapy, powered by curl_cffi.
|
|
5
|
+
Author: aFunnyStrange
|
|
6
|
+
License: BSD-3-Clause
|
|
7
|
+
Project-URL: Homepage, https://github.com/aFunnyStrange/scrapy_cffi
|
|
8
|
+
Requires-Python: >=3.7
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: curl_cffi
|
|
12
|
+
Requires-Dist: PyExecjs
|
|
13
|
+
Requires-Dist: orjson
|
|
14
|
+
Requires-Dist: json5
|
|
15
|
+
Requires-Dist: bbpb
|
|
16
|
+
Requires-Dist: toml
|
|
17
|
+
Requires-Dist: pydantic>=2.0.0
|
|
18
|
+
Requires-Dist: jinja2
|
|
19
|
+
Requires-Dist: tenacity
|
|
20
|
+
Requires-Dist: redis>=5.0.0
|
|
21
|
+
Requires-Dist: parsel
|
|
22
|
+
Requires-Dist: Pillow
|
|
23
|
+
Requires-Dist: hachoir
|
|
24
|
+
Provides-Extra: windows
|
|
25
|
+
Requires-Dist: python-magic-bin; extra == "windows"
|
|
26
|
+
Provides-Extra: unix
|
|
27
|
+
Requires-Dist: python-magic; extra == "unix"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
## scrapy_cffi
|
|
31
|
+
|
|
32
|
+
> An asyncio-style web scraping framework inspired by Scrapy, powered by `curl_cffi`.
|
|
33
|
+
|
|
34
|
+
`scrapy_cffi` is a lightweight asynchronous crawling framework that mimics the Scrapy architecture while replacing Twisted with `curl_cffi` as the underlying HTTP/WebSocket client. It is designed to be efficient, extensible, and suitable for both simple tasks and complex distributed crawlers.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## ✨ Features
|
|
39
|
+
|
|
40
|
+
- Familiar Scrapy-style components: spiders, items, interceptors, pipelines
|
|
41
|
+
- Fully asyncio-based engine
|
|
42
|
+
- Built-in support for HTTP and WebSocket requests
|
|
43
|
+
- Lightweight signal system
|
|
44
|
+
- Plug-in ready interceptor and task manager design
|
|
45
|
+
- Redis-compatible scheduler (optional)
|
|
46
|
+
- Designed for high-concurrency crawling
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## 📦 Installation
|
|
51
|
+
#### From PyPI
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install scrapy_cffi
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
#### From source
|
|
58
|
+
```bash
|
|
59
|
+
git clone https://github.com/aFunnyStrange/scrapy_cffi.git
|
|
60
|
+
cd scrapy_cffi
|
|
61
|
+
pip install -e .
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## 🚀 Quick Start
|
|
65
|
+
```bash
|
|
66
|
+
scrapy_cffi startproject <project_name>
|
|
67
|
+
cd <project_name>
|
|
68
|
+
scrapy_cffi genspider <spider_name> <domain>
|
|
69
|
+
python runner.py
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## 📖 Documentation
|
|
73
|
+
Technical module-level documentation can be found in the [`docs/`](https://github.com/aFunnyStrange/scrapy_cffi/tree/main/docs/usage) directory on GitHub.
|
|
74
|
+
Each core component (engine, downloader, middleware, etc.) has its own `.md` file.
|
|
75
|
+
|
|
76
|
+
## 📄 License
|
|
77
|
+
This project is licensed under the BSD 3-Clause License. Portions of the code (specifically item.py) are adapted from the Scrapy project.
|
|
78
|
+
See LICENSE for details.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
## scrapy_cffi
|
|
2
|
+
|
|
3
|
+
> An asyncio-style web scraping framework inspired by Scrapy, powered by `curl_cffi`.
|
|
4
|
+
|
|
5
|
+
`scrapy_cffi` is a lightweight asynchronous crawling framework that mimics the Scrapy architecture while replacing Twisted with `curl_cffi` as the underlying HTTP/WebSocket client. It is designed to be efficient, extensible, and suitable for both simple tasks and complex distributed crawlers.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## ✨ Features
|
|
10
|
+
|
|
11
|
+
- Familiar Scrapy-style components: spiders, items, interceptors, pipelines
|
|
12
|
+
- Fully asyncio-based engine
|
|
13
|
+
- Built-in support for HTTP and WebSocket requests
|
|
14
|
+
- Lightweight signal system
|
|
15
|
+
- Plug-in ready interceptor and task manager design
|
|
16
|
+
- Redis-compatible scheduler (optional)
|
|
17
|
+
- Designed for high-concurrency crawling
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## 📦 Installation
|
|
22
|
+
#### From PyPI
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install scrapy_cffi
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
#### From source
|
|
29
|
+
```bash
|
|
30
|
+
git clone https://github.com/aFunnyStrange/scrapy_cffi.git
|
|
31
|
+
cd scrapy_cffi
|
|
32
|
+
pip install -e .
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## 🚀 Quick Start
|
|
36
|
+
```bash
|
|
37
|
+
scrapy_cffi startproject <project_name>
|
|
38
|
+
cd <project_name>
|
|
39
|
+
scrapy_cffi genspider <spider_name> <domain>
|
|
40
|
+
python runner.py
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## 📖 Documentation
|
|
44
|
+
Technical module-level documentation can be found in the [`docs/`](https://github.com/aFunnyStrange/scrapy_cffi/tree/main/docs/usage) directory on GitHub.
|
|
45
|
+
Each core component (engine, downloader, middleware, etc.) has its own `.md` file.
|
|
46
|
+
|
|
47
|
+
## 📄 License
|
|
48
|
+
This project is licensed under the BSD 3-Clause License. Portions of the code (specifically item.py) are adapted from the Scrapy project.
|
|
49
|
+
See LICENSE for details.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "scrapy_cffi"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "An asyncio-style web scraping framework inspired by Scrapy, powered by curl_cffi."
|
|
9
|
+
authors = [{name = "aFunnyStrange"}]
|
|
10
|
+
license = {text = "BSD-3-Clause"}
|
|
11
|
+
readme = "README.md"
|
|
12
|
+
requires-python = ">=3.7"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"curl_cffi",
|
|
15
|
+
"PyExecjs",
|
|
16
|
+
"orjson",
|
|
17
|
+
"json5",
|
|
18
|
+
"bbpb",
|
|
19
|
+
"toml",
|
|
20
|
+
"pydantic>=2.0.0",
|
|
21
|
+
"jinja2",
|
|
22
|
+
"tenacity",
|
|
23
|
+
"redis>=5.0.0",
|
|
24
|
+
"parsel",
|
|
25
|
+
"Pillow",
|
|
26
|
+
"hachoir",
|
|
27
|
+
]
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
windows = ["python-magic-bin"]
|
|
30
|
+
unix = ["python-magic"]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://github.com/aFunnyStrange/scrapy_cffi"
|
|
34
|
+
|
|
35
|
+
[tool.setuptools.packages.find]
|
|
36
|
+
include = ["scrapy_cffi", "scrapy_cffi.*"]
|
|
37
|
+
exclude = ["docs*", "tests*", "examples*"]
|
|
38
|
+
|
|
39
|
+
[project.scripts]
|
|
40
|
+
scrapy_cffi = "scrapy_cffi.commands.main:main"
|
|
41
|
+
|
|
42
|
+
[tool.setuptools]
|
|
43
|
+
include-package-data = true
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.package-data]
|
|
46
|
+
"scrapy_cffi" = ["templates/*"]
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.exclude-package-data]
|
|
49
|
+
"*" = ["__pycache__/*", "*.py[cod]"]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
2
|
+
|
|
3
|
+
from .crawler import run_spider, run_all_spiders, run_spider_sync, run_all_spiders_sync, cleanup_loop
|
|
4
|
+
from .utils import load_settings_with_path, init_logger, run_coroutine_in_thread, run_coroutine_in_new_loop, ProcessTaskManager
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"run_spider",
|
|
8
|
+
"run_all_spiders",
|
|
9
|
+
"run_spider_sync",
|
|
10
|
+
"run_all_spiders_sync",
|
|
11
|
+
"load_settings_with_path",
|
|
12
|
+
"init_logger"
|
|
13
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
def find_project_root(start: "Path | str | None" = None, is_demo=False) -> Path:
    """Return the directory that contains ``scrapy_cffi.toml``.

    Args:
        start: Directory to start searching from. Defaults to the current
            working directory. Strings are accepted and converted to ``Path``.
        is_demo: When true, only ``start / "demo"`` is checked. Otherwise
            ``start`` and each of its parents are checked, nearest first.

    Returns:
        The first candidate directory holding a ``scrapy_cffi.toml`` entry.

    Raises:
        FileNotFoundError: If no candidate contains ``scrapy_cffi.toml``.
    """
    # Coerce so callers may pass a plain string path as well as a Path.
    start = Path.cwd() if start is None else Path(start)
    candidates = [start / "demo"] if is_demo else [start, *start.parents]
    for candidate in candidates:
        if (candidate / "scrapy_cffi.toml").exists():
            return candidate
    raise FileNotFoundError("Project root not found (missing scrapy_cffi.toml)")
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import shutil, os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
def copytree_merge(src: Path, dst: Path):
    """Recursively copy ``src`` into ``dst``, merging with existing content.

    Unlike ``shutil.copytree``, ``dst`` may already exist: files with the
    same name are overwritten and other existing entries are left alone.

    Args:
        src: Source directory (``Path`` or string).
        dst: Destination directory; created, with parents, when missing.

    Raises:
        ValueError: If ``src`` is not a directory.
    """
    src = Path(src)
    dst = Path(dst)
    if not src.is_dir():
        raise ValueError(f"not dir: {src}")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    dst.mkdir(parents=True, exist_ok=True)

    for entry in src.iterdir():
        destination = dst / entry.name
        if entry.is_dir():
            copytree_merge(entry, destination)
        else:
            shutil.copy2(entry, destination)
|
|
21
|
+
|
|
22
|
+
def run(use_task: bool, use_redis: bool):
    """Populate the ``demo`` project in the current directory with demo content.

    Operates on ``<cwd>/demo``: un-comments the sample extension, pipeline
    and interceptor entries in ``settings.py``, merges the bundled
    ``templates/server`` files into the project, copies the demo spiders and,
    in Redis mode, rewrites ``runner.py``.

    Args:
        use_task: Use the task-manager layout, where the spider project
            lives under ``demo/spiders``.
        use_redis: Install the Redis demo spider and switch the project
            configuration to Redis.
    """
    package_root = Path(__file__).parent.parent  # scrapy_cffi
    template_dir = package_root / "templates"
    target: Path = Path.cwd() / "demo"

    # Un-comment the sample component entries in settings.py.
    if use_task:
        settings_path = target / "spiders" / "settings.py"
    else:
        settings_path = target / "settings.py"
    settings_code = settings_path.read_text(encoding='utf-8')
    for old, new in (
        ('# settings.EXTENSIONS_PATH', 'settings.EXTENSIONS_PATH'),
        ('# settings.ITEM_PIPELINES_PATH', 'settings.ITEM_PIPELINES_PATH'),
        ('# "interceptors.CustomDownloadInterceptor1"', '"interceptors.CustomDownloadInterceptor1"'),
        ('# "interceptors.CustomDownloadInterceptor2"', '"interceptors.CustomDownloadInterceptor2"'),
    ):
        settings_code = settings_code.replace(old, new)
    settings_path.write_text(settings_code, encoding='utf-8')

    if use_task:
        spider_dir = target / "spiders" / "spiders"
    else:
        spider_dir = target / "spiders"
    demo_spiders_dir = template_dir / "demo_spider"

    # demo_server: merge the server templates, then adjust the readme.
    copytree_merge(template_dir / "server", target)
    readme_path = target / "readme.txt"
    readme_code = readme_path.read_text(encoding='utf-8')
    if use_task:
        readme_code = readme_code.replace('2.run runner.py', '2.run manager.py')
    if use_redis:
        readme_code = readme_code + '\n3.redis-cli\n4.RPUSH customRedisSpider_test http://127.0.0.1:8002\r\n'
    readme_path.write_text(readme_code, encoding='utf-8')

    def _copy_demo_spiders(names):
        # Copy each demo spider template verbatim into the project.
        for name in names:
            source_code = (demo_spiders_dir / f'{name}.py').read_text(encoding='utf-8')
            destination = spider_dir / f'{name}.py'
            destination.parent.mkdir(parents=True, exist_ok=True)
            destination.write_text(source_code, encoding='utf-8')

    if use_redis:
        from .base import find_project_root
        from .genspider import check_use_redis
        project_path = find_project_root(is_demo=True)
        use_task = check_use_redis(project_path, use_redis, use_task)

        demo_spider_files = ["customRedisSpider", "studentSpider"]
        _copy_demo_spiders(demo_spider_files)

        update_spiders_path(
            project_path=target,
            demo_spiders_dir=demo_spiders_dir,
            demo_spider_files=demo_spider_files,
            spider_dir=spider_dir,
            use_task=use_task,
            use_redis=use_redis
        )

        # runner.py update spider_path; module path with `spiders`
        spiders_dir = target / "spiders" if use_task else target
        runner_path = spiders_dir / "runner.py"
        runner_code = runner_path.read_text(encoding='utf-8')
        runner_edits = [
            ('crawler, engine_task = await advance_main()', '# crawler, engine_task = await advance_main()'),
            ('# crawler, engine_task = await advance_main_all()', 'crawler, engine_task = await advance_main_all()'),
            ('import threading', '# import threading'),
            ('t = threading.Thread(', '# t = threading.Thread('),
            ('t.start()', '# t.start()'),
            ('t.join()', '# t.join()'),
            (' main()', ' # main()'),
        ]
        if use_task:
            runner_edits.append(('spider_path="spiders.CustomSpider"', 'spider_path="spiders.spiders.CustomRedisSpider"'))
            runner_edits.append(('get_run_py_dir() / "spiders"', 'get_run_py_dir() / "spiders" / "spiders"'))
        # Replacements are applied strictly in the listed order.
        for old, new in runner_edits:
            runner_code = runner_code.replace(old, new)
        runner_path.write_text(runner_code, encoding='utf-8')
    else:
        spider_dir.mkdir(parents=True, exist_ok=True)
        demo_spider_files = ["customSpider", "studentSpider"]
        _copy_demo_spiders(demo_spider_files)
        update_spiders_path(project_path=target, demo_spiders_dir=demo_spiders_dir, demo_spider_files=demo_spider_files, spider_dir=spider_dir, use_task=use_task, use_redis=use_redis)

    if use_task:
        print("Project 'demo' with tasks manager created.")
    else:
        print("Project 'demo' created.")
|
|
99
|
+
|
|
100
|
+
def update_spiders_path(project_path: Path, demo_spiders_dir: Path, demo_spider_files: List, spider_dir: Path, use_task: bool, use_redis: bool):
    """Write the demo spiders into the project and register their imports.

    For each name in ``demo_spider_files`` the template
    ``demo_spiders_dir/<name>.py`` is read, its item import is rewritten for
    the task layout when ``use_task`` is true, and the result is written to
    ``spider_dir/<name>.py``. The spider class, named as the file name with
    its first letter upper-cased, is then added to the spiders package
    ``__init__.py`` through ``update_spiders_init``.

    ``use_redis`` is accepted but not used by this function.
    """
    for spider_name in demo_spider_files:
        source = (demo_spiders_dir / f"{spider_name}.py").read_text('utf-8')
        if use_task:
            source = source.replace("from items.item import CustomItem", "from spiders.items.item import CustomItem")
        (spider_dir / f"{spider_name}.py").write_text(source, encoding='utf-8')

        # Upper-case only the first character; an empty name stays empty.
        cls_name = spider_name[:1].upper() + spider_name[1:]

        # update __init__.py
        from .genspider import update_spiders_init
        update_spiders_init(project_path=project_path, class_name=cls_name, spider_name=spider_name, use_task=use_task)
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import toml
|
|
2
|
+
from ..utils import load_settings_from_py
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from jinja2 import Template
|
|
5
|
+
|
|
6
|
+
def check_use_redis(project_path: Path, use_redis: bool=False, use_task: bool=False):
    """Resolve ``use_task`` from the project config and enable Redis on request.

    Reads ``scrapy_cffi.toml`` under ``project_path``. If ``use_task`` is
    false it is taken from the ``[default]`` table. When ``use_redis`` is
    requested and not yet recorded there, the flag is persisted, the
    ``create_settings`` signature in ``settings.py`` is switched to
    ``user_redis=True`` and, for task projects, a default ``redis_url`` is
    appended to an existing ``config.py`` that does not define one.

    Returns:
        The effective ``use_task`` value.
    """
    config_path = project_path / "scrapy_cffi.toml"
    config_data = toml.load(config_path)

    defaults = config_data.get("default")
    if not defaults:
        return use_task

    if not use_task:
        use_task = defaults.get("use_task", False)

    # Nothing to do unless Redis is requested and not yet enabled.
    if not use_redis or defaults.get("use_redis", False):
        return use_task

    defaults["use_redis"] = True  # same dict object as config_data["default"]
    with config_path.open("w", encoding="utf-8") as f:
        toml.dump(config_data, f)

    # Scheduler -> RedisScheduler
    if use_task:
        settings_file = project_path / "spiders" / "settings.py"
    else:
        settings_file = project_path / "settings.py"
    settings_data = settings_file.read_text(encoding='utf-8')
    signature_off = 'def create_settings(spider_path, user_redis=False, *args, **kwargs):'
    signature_on = 'def create_settings(spider_path, user_redis=True, *args, **kwargs):'
    if signature_off in settings_data:
        settings_file.write_text(settings_data.replace(signature_off, signature_on), encoding="utf-8")

    # update config.py
    if use_task:
        task_config_path = project_path / "config.py"
        if task_config_path.exists():
            task_config = load_settings_from_py(task_config_path, auto_upper=False)
            if not task_config.get("redis_url"):
                with task_config_path.open("a", encoding="utf-8") as f:
                    f.write('\nredis_url = "redis://127.0.0.1:6379"\n')
    return use_task
|
|
33
|
+
|
|
34
|
+
def run(spider_name: str, allow_domain: str, use_redis: bool, is_demo=False):
    """Generate a spider module from ``templates/spider.py.j2``.

    Finds the project root, resolves the layout through
    ``check_use_redis``, renders the template and writes
    ``<spider_name>.py`` into the project's spiders package. The new class
    is then registered in that package's ``__init__.py``.

    Args:
        spider_name: Module name of the spider; also passed to the template.
        allow_domain: Domain passed to the template and used for the
            default ``start_urls`` of non-Redis spiders.
        use_redis: Generate a ``RedisSpider`` instead of a ``Spider``.
        is_demo: Suppress the "Spider created" message.
    """
    from .base import find_project_root
    project_path = find_project_root()
    use_task = check_use_redis(project_path, use_redis)

    if use_redis:
        base_class = "RedisSpider"
        start_urls = 'redis_key = ""'
    else:
        base_class = "Spider"
        start_urls = f'start_urls = ["https://{allow_domain}"]'

    package_root = Path(__file__).parent.parent  # scrapy_cffi
    template_file = package_root / "templates" / "spider.py.j2"
    with open(template_file, "r", encoding="utf-8") as handle:
        template: Template = Template(handle.read())

    class_name = snake_to_camel(spider_name)
    code = template.render(
        class_name=class_name,
        spider_name=spider_name,
        domain=allow_domain,
        base_class=base_class,
        base_import="scrapy_cffi.spiders",
        start_urls=start_urls,
    )

    # Task projects keep the spiders package one level deeper.
    spiders_pkg = project_path / "spiders"
    if use_task:
        spiders_pkg = spiders_pkg / "spiders"
    target_file = spiders_pkg / f"{spider_name}.py"  # use abspath
    target_file.parent.mkdir(parents=True, exist_ok=True)
    target_file.write_text(code, encoding="utf-8")

    # To avoid overwriting user-defined content, only spider templates are
    # regenerated; other files are appended to or updated dynamically.
    update_spiders_init(project_path, class_name, spider_name, use_task)
    if not is_demo:
        print(f"Spider created: {target_file}")
|
|
65
|
+
|
|
66
|
+
# Use this to automatically convert snake_case to camelCase.
|
|
67
|
+
def snake_to_camel(name: str) -> str:
    """Build a spider class name from a snake_case spider name.

    Each underscore-separated part is passed through ``str.capitalize``
    (first letter upper-cased, the rest lower-cased), the parts are joined,
    and ``"Spider"`` is appended, e.g. ``"my_news"`` -> ``"MyNewsSpider"``.
    """
    parts = name.split('_')
    return ''.join(map(str.capitalize, parts)) + "Spider"
|
|
69
|
+
|
|
70
|
+
# auto import
|
|
71
|
+
def update_spiders_init(project_path: Path, class_name: str, spider_name: str, use_task: bool):
    """Ensure the spiders package ``__init__.py`` imports the given spider.

    The import ``from .<spider_name> import <class_name>`` is added to
    ``spiders/spiders/__init__.py`` (task layout) or ``spiders/__init__.py``.
    Existing content is never rewritten; the import is appended only when
    no line of the file already consists of that statement.

    Args:
        project_path: Project root directory.
        class_name: Spider class to import.
        spider_name: Module name of the spider, without ``.py``.
        use_task: Whether the project uses the task-manager layout.
    """
    spiders_pkg = project_path / "spiders"
    if use_task:
        spiders_pkg = spiders_pkg / "spiders"
    init_path = spiders_pkg / "__init__.py"
    import_line = f"from .{spider_name} import {class_name}\n"

    if not init_path.exists():
        # The package directory may not exist yet on a fresh project.
        init_path.parent.mkdir(parents=True, exist_ok=True)
        init_path.write_text(import_line, encoding="utf-8")
        return

    init_data = init_path.read_text(encoding='utf-8')
    # Compare whole lines so that an import on the final line is detected
    # even when the file does not end with a newline.
    statement = import_line.strip()
    if any(line.strip() == statement for line in init_data.splitlines()):
        return

    # Start on a fresh line; otherwise the new import would be glued onto
    # an unterminated last line and break the module.
    separator = "" if not init_data or init_data.endswith("\n") else "\n"
    with open(init_path, "a", encoding="utf-8") as f:
        f.write(separator + import_line)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from . import startproject, genspider, demo
|
|
3
|
+
|
|
4
|
+
def main(argv=None):
    """Entry point of the ``scrapy_cffi`` command line tool.

    Subcommands:
        startproject <name> [-t]        create a new project
        genspider [-r] <name> <domain>  generate a spider in a project
        demo [-t] [-r]                  create a ready-to-run demo project

    Args:
        argv: Argument list to parse. ``None`` (the default, used by the
            console-script entry point) makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(prog="scrapy_cffi", description="scrapy_cffi CLI tool")
    subparsers = parser.add_subparsers(dest="command")

    # startproject
    sp = subparsers.add_parser("startproject", help="Create a new project")
    sp.add_argument("name", help="Project name")
    sp.add_argument("-t", "--task", action="store_true", help="Create a new project with tasks manager")

    # genspider
    gp = subparsers.add_parser("genspider", help="Generate a new spider")
    gp.add_argument("-r", "--redis", action="store_true", help="Use RedisSpider")
    gp.add_argument("name", help="Spider name")
    gp.add_argument("domain", help="Target domain")

    # demo project
    demo_p = subparsers.add_parser("demo", help="Create a demo project")
    demo_p.add_argument("-t", "--task", action="store_true", help="with tasks manager")
    demo_p.add_argument("-r", "--redis", action="store_true", help="Use RedisSpider")

    # export
    # ep = subparsers.add_parser("export", help="Export files")
    # ep.add_argument("name", help="Filename")

    args = parser.parse_args(argv)

    if args.command == "startproject":
        startproject.run(args.name, args.task)
    elif args.command == "genspider":
        genspider.run(args.name, args.domain, args.redis)
    # elif args.command == "export":
    #     export.run(args.name)
    elif args.command == "demo":
        # startproject.run returns a non-None value only when it refused to
        # create the project; in that case there is nothing to populate.
        result = startproject.run("demo", args.task, is_demo=True)
        if result is not None:
            return
        demo.run(args.task, args.redis)
    else:
        # argparse rejects unknown subcommands itself, so this branch is
        # reached only when no subcommand was given: just show the usage.
        parser.print_help()
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import shutil, toml
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
def run(project_name, use_task: bool, is_demo=False):
    """Create a new project directory from the bundled templates.

    Copies ``templates/spiders`` and ``templates/js_path`` into
    ``<cwd>/<project_name>``. With ``use_task`` the ``templates/task``
    skeleton is copied first and the spider project is placed under its
    ``spiders`` sub-directory, with module paths in ``runner.py`` and
    ``settings.py`` prefixed accordingly. Finally ``scrapy_cffi.toml`` is
    written with the project name and layout.

    Args:
        project_name: Name of the directory to create.
        use_task: Create the task-manager layout.
        is_demo: Suppress the closing usage hints.

    Returns:
        ``False`` when the target directory already exists, otherwise ``None``.
    """
    templates = Path(__file__).parent.parent / "templates"  # scrapy_cffi/templates
    target: Path = Path.cwd() / project_name

    if target.exists():
        print(f"Error: Project '{project_name}' already exists.")
        return False

    if use_task:
        shutil.copytree(templates / "task", target)
        spiders_dir = target / "spiders"
    else:
        spiders_dir = target
    shutil.copytree(templates / "spiders", spiders_dir)
    shutil.copytree(templates / "js_path", target / "js_path")

    if use_task:
        # module path with `spiders`
        runner_path = spiders_dir / "runner.py"
        runner_code = runner_path.read_text(encoding='utf-8')
        runner_code = runner_code.replace('from settings import create_settings', 'from spiders.settings import create_settings')
        runner_path.write_text(runner_code, encoding='utf-8')

        settings_path = spiders_dir / "settings.py"
        settings_code = settings_path.read_text(encoding='utf-8')
        for old, new in (
            ('"extensions.CustomExtension"', '"spiders.extensions.CustomExtension"'),
            ('"pipelines.CustomPipeline2"', '"spiders.pipelines.CustomPipeline2"'),
            ('"pipelines.CustomPipeline1"', '"spiders.pipelines.CustomPipeline1"'),
            ('"interceptors.CustomDownloadInterceptor1"', '"spiders.interceptors.CustomDownloadInterceptor1"'),
            ('"interceptors.CustomDownloadInterceptor2"', '"spiders.interceptors.CustomDownloadInterceptor2"'),
        ):
            settings_code = settings_code.replace(old, new)
        settings_path.write_text(settings_code, encoding='utf-8')

    config_data = {"default": {"project_name": project_name, "use_task": use_task}}
    with (target / "scrapy_cffi.toml").open("w", encoding="utf-8") as f:
        toml.dump(config_data, f)

    if is_demo:
        return
    print(f"Project '{project_name}' created.")
    print(f"\tcd {project_name}")
    print("\tscrapy_cffi genspider <spider_name> <domain>")
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from .fetch import Downloader
|
|
2
|
+
from .internet import Request, HttpRequest, WebSocketRequest, Response, HttpResponse, WebSocketResponse
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"Downloader",
|
|
6
|
+
"Request",
|
|
7
|
+
"HttpRequest",
|
|
8
|
+
"WebSocketRequest",
|
|
9
|
+
"Response",
|
|
10
|
+
"HttpResponse",
|
|
11
|
+
"WebSocketResponse",
|
|
12
|
+
]
|