scrapy-cffi 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. scrapy_cffi-0.1.0/LICENSE +38 -0
  2. scrapy_cffi-0.1.0/MANIFEST.in +4 -0
  3. scrapy_cffi-0.1.0/PKG-INFO +78 -0
  4. scrapy_cffi-0.1.0/README.md +49 -0
  5. scrapy_cffi-0.1.0/pyproject.toml +49 -0
  6. scrapy_cffi-0.1.0/scrapy_cffi/__init__.py +13 -0
  7. scrapy_cffi-0.1.0/scrapy_cffi/commands/__init__.py +0 -0
  8. scrapy_cffi-0.1.0/scrapy_cffi/commands/base.py +10 -0
  9. scrapy_cffi-0.1.0/scrapy_cffi/commands/demo.py +112 -0
  10. scrapy_cffi-0.1.0/scrapy_cffi/commands/genspider.py +83 -0
  11. scrapy_cffi-0.1.0/scrapy_cffi/commands/main.py +43 -0
  12. scrapy_cffi-0.1.0/scrapy_cffi/commands/startproject.py +47 -0
  13. scrapy_cffi-0.1.0/scrapy_cffi/core/__init__.py +10 -0
  14. scrapy_cffi-0.1.0/scrapy_cffi/core/api.py +11 -0
  15. scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/__init__.py +12 -0
  16. scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/fetch.py +165 -0
  17. scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/internet/__init__.py +11 -0
  18. scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/internet/registry.py +23 -0
  19. scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/internet/request.py +268 -0
  20. scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/internet/response.py +112 -0
  21. scrapy_cffi-0.1.0/scrapy_cffi/core/downloader/selector.py +247 -0
  22. scrapy_cffi-0.1.0/scrapy_cffi/core/engine.py +253 -0
  23. scrapy_cffi-0.1.0/scrapy_cffi/core/scheduler/__init__.py +6 -0
  24. scrapy_cffi-0.1.0/scrapy_cffi/core/scheduler/base.py +208 -0
  25. scrapy_cffi-0.1.0/scrapy_cffi/core/sessions.py +439 -0
  26. scrapy_cffi-0.1.0/scrapy_cffi/core/tasks.py +136 -0
  27. scrapy_cffi-0.1.0/scrapy_cffi/crawler.py +290 -0
  28. scrapy_cffi-0.1.0/scrapy_cffi/databases/__init__.py +5 -0
  29. scrapy_cffi-0.1.0/scrapy_cffi/databases/mongodb.py +0 -0
  30. scrapy_cffi-0.1.0/scrapy_cffi/databases/mysql.py +0 -0
  31. scrapy_cffi-0.1.0/scrapy_cffi/databases/redis.py +136 -0
  32. scrapy_cffi-0.1.0/scrapy_cffi/exceptions/__init__.py +1 -0
  33. scrapy_cffi-0.1.0/scrapy_cffi/exceptions/base.py +48 -0
  34. scrapy_cffi-0.1.0/scrapy_cffi/extensions/__init__.py +7 -0
  35. scrapy_cffi-0.1.0/scrapy_cffi/extensions/base.py +12 -0
  36. scrapy_cffi-0.1.0/scrapy_cffi/extensions/signal_manager.py +104 -0
  37. scrapy_cffi-0.1.0/scrapy_cffi/extensions/signals.py +23 -0
  38. scrapy_cffi-0.1.0/scrapy_cffi/hooks/__init__.py +1 -0
  39. scrapy_cffi-0.1.0/scrapy_cffi/hooks/base.py +63 -0
  40. scrapy_cffi-0.1.0/scrapy_cffi/hooks/interceptors.py +13 -0
  41. scrapy_cffi-0.1.0/scrapy_cffi/hooks/pipelines.py +17 -0
  42. scrapy_cffi-0.1.0/scrapy_cffi/hooks/signals.py +10 -0
  43. scrapy_cffi-0.1.0/scrapy_cffi/hooks/spiders.py +7 -0
  44. scrapy_cffi-0.1.0/scrapy_cffi/interceptors/__init__.py +11 -0
  45. scrapy_cffi-0.1.0/scrapy_cffi/interceptors/api.py +1 -0
  46. scrapy_cffi-0.1.0/scrapy_cffi/interceptors/base.py +75 -0
  47. scrapy_cffi-0.1.0/scrapy_cffi/interceptors/chains.py +338 -0
  48. scrapy_cffi-0.1.0/scrapy_cffi/interceptors/spiders.py +187 -0
  49. scrapy_cffi-0.1.0/scrapy_cffi/internet/__init__.py +10 -0
  50. scrapy_cffi-0.1.0/scrapy_cffi/item/__init__.py +6 -0
  51. scrapy_cffi-0.1.0/scrapy_cffi/item/base.py +135 -0
  52. scrapy_cffi-0.1.0/scrapy_cffi/models/__init__.py +1 -0
  53. scrapy_cffi-0.1.0/scrapy_cffi/models/api.py +4 -0
  54. scrapy_cffi-0.1.0/scrapy_cffi/models/base.py +35 -0
  55. scrapy_cffi-0.1.0/scrapy_cffi/models/component.py +43 -0
  56. scrapy_cffi-0.1.0/scrapy_cffi/models/databases.py +27 -0
  57. scrapy_cffi-0.1.0/scrapy_cffi/models/media.py +43 -0
  58. scrapy_cffi-0.1.0/scrapy_cffi/models/settings.py +121 -0
  59. scrapy_cffi-0.1.0/scrapy_cffi/models/singal.py +21 -0
  60. scrapy_cffi-0.1.0/scrapy_cffi/pipelines/__init__.py +5 -0
  61. scrapy_cffi-0.1.0/scrapy_cffi/pipelines/api.py +44 -0
  62. scrapy_cffi-0.1.0/scrapy_cffi/pipelines/base.py +39 -0
  63. scrapy_cffi-0.1.0/scrapy_cffi/scheduler/__init__.py +6 -0
  64. scrapy_cffi-0.1.0/scrapy_cffi/spiders/__init__.py +7 -0
  65. scrapy_cffi-0.1.0/scrapy_cffi/spiders/base.py +150 -0
  66. scrapy_cffi-0.1.0/scrapy_cffi/templates/demo_spider/customRedisSpider.py +50 -0
  67. scrapy_cffi-0.1.0/scrapy_cffi/templates/demo_spider/customSpider.py +67 -0
  68. scrapy_cffi-0.1.0/scrapy_cffi/templates/demo_spider/studentSpider.py +69 -0
  69. scrapy_cffi-0.1.0/scrapy_cffi/templates/js_path/js_action.js +3 -0
  70. scrapy_cffi-0.1.0/scrapy_cffi/templates/server/demo_server/fastApiServer.py +142 -0
  71. scrapy_cffi-0.1.0/scrapy_cffi/templates/server/demo_server/ws_server.py +53 -0
  72. scrapy_cffi-0.1.0/scrapy_cffi/templates/server/readme.txt +12 -0
  73. scrapy_cffi-0.1.0/scrapy_cffi/templates/spider.py.j2 +12 -0
  74. scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/extensions/__init__.py +1 -0
  75. scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/extensions/extension.py +59 -0
  76. scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/interceptors/__init__.py +1 -0
  77. scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/interceptors/interceptors.py +31 -0
  78. scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/items/__init__.py +1 -0
  79. scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/items/item.py +9 -0
  80. scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/pipelines/__init__.py +1 -0
  81. scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/pipelines/pipeline.py +28 -0
  82. scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/runner.py +118 -0
  83. scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/settings.py +21 -0
  84. scrapy_cffi-0.1.0/scrapy_cffi/templates/spiders/spiders/__init__.py +0 -0
  85. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/_action_core/__init__.py +7 -0
  86. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/_action_core/baseManager.py +32 -0
  87. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/_action_core/coreBase.py +33 -0
  88. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/_action_core/coreFlow.py +47 -0
  89. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/_action_core/reqBase.py +156 -0
  90. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_base/__init__.py +2 -0
  91. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_base/collect_base.py +27 -0
  92. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_base/function_base.py +36 -0
  93. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_flow/__init__.py +2 -0
  94. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_flow/collect.py +37 -0
  95. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/action_flow/function_flow.py +29 -0
  96. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/component/__init__.py +5 -0
  97. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/component/midManager.py +86 -0
  98. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/component/someManager.py +3 -0
  99. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/config.py +12 -0
  100. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/export_interface.py +49 -0
  101. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/manager.py +160 -0
  102. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/models/__init__.py +1 -0
  103. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/models/data_model.py +4 -0
  104. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/utils/__init__.py +2 -0
  105. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/utils/algorithm.py +7 -0
  106. scrapy_cffi-0.1.0/scrapy_cffi/templates/task/utils/common.py +42 -0
  107. scrapy_cffi-0.1.0/scrapy_cffi/utils/__init__.py +3 -0
  108. scrapy_cffi-0.1.0/scrapy_cffi/utils/common.py +472 -0
  109. scrapy_cffi-0.1.0/scrapy_cffi/utils/log.py +152 -0
  110. scrapy_cffi-0.1.0/scrapy_cffi/utils/media.py +72 -0
  111. scrapy_cffi-0.1.0/scrapy_cffi/utils/robot.py +119 -0
  112. scrapy_cffi-0.1.0/scrapy_cffi/utils/scrapyRunner.py +55 -0
  113. scrapy_cffi-0.1.0/scrapy_cffi.egg-info/PKG-INFO +78 -0
  114. scrapy_cffi-0.1.0/scrapy_cffi.egg-info/SOURCES.txt +118 -0
  115. scrapy_cffi-0.1.0/scrapy_cffi.egg-info/dependency_links.txt +1 -0
  116. scrapy_cffi-0.1.0/scrapy_cffi.egg-info/entry_points.txt +2 -0
  117. scrapy_cffi-0.1.0/scrapy_cffi.egg-info/requires.txt +19 -0
  118. scrapy_cffi-0.1.0/scrapy_cffi.egg-info/top_level.txt +1 -0
  119. scrapy_cffi-0.1.0/setup.cfg +4 -0
  120. scrapy_cffi-0.1.0/tests/test.py +2 -0
@@ -0,0 +1,38 @@
1
+ Portions of this project are derived from the Scrapy project (https://github.com/scrapy/scrapy),
2
+ which is licensed under the BSD 3-Clause License.
3
+
4
+ In particular, the `item.py` module is adapted from Scrapy's `scrapy/item.py`
5
+ with necessary adjustments for integration with the asyncio-based engine.
6
+ Original copyright:
7
+ Copyright (c) Scrapy developers. All rights reserved.
8
+
9
+ ---
10
+
11
+ BSD 3-Clause License
12
+
13
+ Copyright (c) 2025, aFunnyStrange
14
+
15
+ Redistribution and use in source and binary forms, with or without
16
+ modification, are permitted provided that the following conditions are met:
17
+
18
+ 1. Redistributions of source code must retain the above copyright notice,
19
+ this list of conditions, and the following disclaimer.
20
+
21
+ 2. Redistributions in binary form must reproduce the above copyright notice,
22
+ this list of conditions, and the following disclaimer in the documentation
23
+ and/or other materials provided with the distribution.
24
+
25
+ 3. Neither the name of the copyright holder nor the names of its
26
+ contributors may be used to endorse or promote products derived from
27
+ this software without specific prior written permission.
28
+
29
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
32
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
33
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
35
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
36
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
37
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
38
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,4 @@
1
+ recursive-include scrapy_cffi/templates *
2
+
3
+ global-exclude __pycache__/
4
+ global-exclude *.py[cod]
@@ -0,0 +1,78 @@
1
+ Metadata-Version: 2.4
2
+ Name: scrapy_cffi
3
+ Version: 0.1.0
4
+ Summary: An asyncio-style web scraping framework inspired by Scrapy, powered by curl_cffi.
5
+ Author: aFunnyStrange
6
+ License: BSD-3-Clause
7
+ Project-URL: Homepage, https://github.com/aFunnyStrange/scrapy_cffi
8
+ Requires-Python: >=3.7
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: curl_cffi
12
+ Requires-Dist: PyExecjs
13
+ Requires-Dist: orjson
14
+ Requires-Dist: json5
15
+ Requires-Dist: bbpb
16
+ Requires-Dist: toml
17
+ Requires-Dist: pydantic>=2.0.0
18
+ Requires-Dist: jinja2
19
+ Requires-Dist: tenacity
20
+ Requires-Dist: redis>=5.0.0
21
+ Requires-Dist: parsel
22
+ Requires-Dist: Pillow
23
+ Requires-Dist: hachoir
24
+ Provides-Extra: windows
25
+ Requires-Dist: python-magic-bin; extra == "windows"
26
+ Provides-Extra: unix
27
+ Requires-Dist: python-magic; extra == "unix"
28
+ Dynamic: license-file
29
+
30
+ ## scrapy_cffi
31
+
32
+ > An asyncio-style web scraping framework inspired by Scrapy, powered by `curl_cffi`.
33
+
34
+ `scrapy_cffi` is a lightweight asynchronous crawling framework that mimics the Scrapy architecture while replacing Twisted with `curl_cffi` as the underlying HTTP/WebSocket client. It is designed to be efficient, extensible, and suitable for both simple tasks and complex distributed crawlers.
35
+
36
+ ---
37
+
38
+ ## ✨ Features
39
+
40
+ - Familiar Scrapy-style components: spiders, items, interceptors, pipelines
41
+ - Fully asyncio-based engine
42
+ - Built-in support for HTTP and WebSocket requests
43
+ - Lightweight signal system
44
+ - Plug-in ready interceptor and task manager design
45
+ - Redis-compatible scheduler (optional)
46
+ - Designed for high-concurrency crawling
47
+
48
+ ---
49
+
50
+ ## 📦 Installation
51
+ #### From PyPI
52
+
53
+ ```bash
54
+ pip install scrapy_cffi
55
+ ```
56
+
57
+ #### From source
58
+ ```bash
59
+ git clone https://github.com/aFunnyStrange/scrapy_cffi.git
60
+ cd curl_cffi
61
+ pip install -e .
62
+ ```
63
+
64
+ ## 🚀 Quick Start
65
+ ```bash
66
+ scrapy_cffi startproject <project_name>
67
+ cd <project_name>
68
+ scrapy_cffi genspider <spider_name> <domain>
69
+ python runner.py
70
+ ```
71
+
72
+ ## 📖 Documentation
73
+ Technical module-level documentation can be found in the [`docs/`](https://github.com/aFunnyStrange/scrapy_cffi/tree/main/docs/usage) directory on GitHub.
74
+ Each core component (engine, downloader, middleware, etc.) has its own `.md` file.
75
+
76
+ ## 📄 License
77
+ This project is licensed under the BSD 3-Clause License. Portions of the code (specifically item.py) are adapted from the Scrapy project.
78
+ See LICENSE for details.
@@ -0,0 +1,49 @@
1
+ ## scrapy_cffi
2
+
3
+ > An asyncio-style web scraping framework inspired by Scrapy, powered by `curl_cffi`.
4
+
5
+ `scrapy_cffi` is a lightweight asynchronous crawling framework that mimics the Scrapy architecture while replacing Twisted with `curl_cffi` as the underlying HTTP/WebSocket client. It is designed to be efficient, extensible, and suitable for both simple tasks and complex distributed crawlers.
6
+
7
+ ---
8
+
9
+ ## ✨ Features
10
+
11
+ - Familiar Scrapy-style components: spiders, items, interceptors, pipelines
12
+ - Fully asyncio-based engine
13
+ - Built-in support for HTTP and WebSocket requests
14
+ - Lightweight signal system
15
+ - Plug-in ready interceptor and task manager design
16
+ - Redis-compatible scheduler (optional)
17
+ - Designed for high-concurrency crawling
18
+
19
+ ---
20
+
21
+ ## 📦 Installation
22
+ #### From PyPI
23
+
24
+ ```bash
25
+ pip install scrapy_cffi
26
+ ```
27
+
28
+ #### From source
29
+ ```bash
30
+ git clone https://github.com/aFunnyStrange/scrapy_cffi.git
31
+ cd curl_cffi
32
+ pip install -e .
33
+ ```
34
+
35
+ ## 🚀 Quick Start
36
+ ```bash
37
+ scrapy_cffi startproject <project_name>
38
+ cd <project_name>
39
+ scrapy_cffi genspider <spider_name> <domain>
40
+ python runner.py
41
+ ```
42
+
43
+ ## 📖 Documentation
44
+ Technical module-level documentation can be found in the [`docs/`](https://github.com/aFunnyStrange/scrapy_cffi/tree/main/docs/usage) directory on GitHub.
45
+ Each core component (engine, downloader, middleware, etc.) has its own `.md` file.
46
+
47
+ ## 📄 License
48
+ This project is licensed under the BSD 3-Clause License. Portions of the code (specifically item.py) are adapted from the Scrapy project.
49
+ See LICENSE for details.
@@ -0,0 +1,49 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "scrapy_cffi"
7
+ version = "0.1.0"
8
+ description = "An asyncio-style web scraping framework inspired by Scrapy, powered by curl_cffi."
9
+ authors = [{name = "aFunnyStrange"}]
10
+ license = {text = "BSD-3-Clause"}
11
+ readme = "README.md"
12
+ requires-python = ">=3.7"
13
+ dependencies = [
14
+ "curl_cffi",
15
+ "PyExecjs",
16
+ "orjson",
17
+ "json5",
18
+ "bbpb",
19
+ "toml",
20
+ "pydantic>=2.0.0",
21
+ "jinja2",
22
+ "tenacity",
23
+ "redis>=5.0.0",
24
+ "parsel",
25
+ "Pillow",
26
+ "hachoir",
27
+ ]
28
+ [project.optional-dependencies]
29
+ windows = ["python-magic-bin"]
30
+ unix = ["python-magic"]
31
+
32
+ [project.urls]
33
+ Homepage = "https://github.com/aFunnyStrange/scrapy_cffi"
34
+
35
+ [tool.setuptools.packages.find]
36
+ include = ["scrapy_cffi", "scrapy_cffi.*"]
37
+ exclude = ["docs*", "tests*", "examples*"]
38
+
39
+ [project.scripts]
40
+ scrapy_cffi = "scrapy_cffi.commands.main:main"
41
+
42
+ [tool.setuptools]
43
+ include-package-data = true
44
+
45
+ [tool.setuptools.package-data]
46
+ "scrapy_cffi" = ["templates/*"]
47
+
48
+ [tool.setuptools.exclude-package-data]
49
+ "*" = ["__pycache__/*", "*.py[cod]"]
@@ -0,0 +1,13 @@
1
+ __version__ = "0.1.0"
2
+
3
+ from .crawler import run_spider, run_all_spiders, run_spider_sync, run_all_spiders_sync, cleanup_loop
4
+ from .utils import load_settings_with_path, init_logger, run_coroutine_in_thread, run_coroutine_in_new_loop, ProcessTaskManager
5
+
6
+ __all__ = [
7
+ "run_spider",
8
+ "run_all_spiders",
9
+ "run_spider_sync",
10
+ "run_all_spiders_sync",
11
+ "load_settings_with_path",
12
+ "init_logger"
13
+ ]
File without changes
@@ -0,0 +1,10 @@
1
+ from pathlib import Path
2
+
3
def find_project_root(start: Path = None, is_demo=False) -> Path:
    """Return the directory that contains ``scrapy_cffi.toml``.

    Args:
        start: Directory to begin from (``Path`` or ``str``). Defaults to the
            current working directory.
        is_demo: When true, only ``<start>/demo`` is inspected; otherwise
            ``start`` and each of its ancestors are checked, nearest first.

    Returns:
        The first inspected directory that holds ``scrapy_cffi.toml``.

    Raises:
        FileNotFoundError: No inspected directory contains the marker file.
    """
    # Resolve first: a relative path such as Path(".") has no parents, so
    # without this the upward search would stop at the working directory.
    origin = Path.cwd() if start is None else Path(start).resolve()
    candidates = [origin / "demo"] if is_demo else [origin, *origin.parents]
    for candidate in candidates:
        if (candidate / "scrapy_cffi.toml").exists():
            return candidate
    raise FileNotFoundError("Project root not found (missing scrapy_cffi.toml)")
@@ -0,0 +1,112 @@
1
+ import shutil, os
2
+ from pathlib import Path
3
+ from typing import List
4
+
5
def copytree_merge(src: Path, dst: Path):
    """Recursively copy ``src`` into ``dst``, merging with existing content.

    Files already present in ``dst`` under the same name are overwritten;
    everything else in ``dst`` is left untouched.

    Args:
        src: Directory to copy from.
        dst: Directory to copy into; created (with parents) when missing.

    Raises:
        ValueError: ``src`` is not a directory.
    """
    src = Path(src)
    dst = Path(dst)
    if not src.is_dir():
        raise ValueError(f"not dir: {src}")

    # exist_ok removes the check-then-create window of exists() + makedirs().
    dst.mkdir(parents=True, exist_ok=True)

    for entry in src.iterdir():
        destination = dst / entry.name
        if entry.is_dir():
            copytree_merge(entry, destination)
        else:
            # copy2 also carries over file metadata such as timestamps.
            shutil.copy2(entry, destination)
21
+
22
def run(use_task: bool, use_redis: bool):
    """Fill the ``demo`` project in the working directory with demo content.

    Expects ``./demo`` to have been generated already (its settings file and
    runner are read here). The steps are: un-comment the optional components
    in the settings file, merge the demo server files into the project, copy
    the demo spiders, and - for the redis flavour - rewrite ``runner.py``.

    Args:
        use_task: The project uses the task-manager layout, i.e. spider code
            lives under ``demo/spiders/`` rather than directly in ``demo/``.
        use_redis: Install the redis demo spider instead of the plain one.
    """
    base = Path(__file__).parent.parent  # scrapy_cffi package directory
    template_dir = base / "templates"
    target: Path = Path.cwd() / "demo"

    # Switch on the components that are commented out in the settings file.
    # NOTE(review): in task mode startproject has already rewritten the quoted
    # interceptor paths to "spiders.interceptors...", so the two interceptor
    # patterns below presumably no longer match there - confirm intent.
    settings_path = target / "spiders" / "settings.py" if use_task else target / "settings.py"
    settings_code = settings_path.read_text(encoding='utf-8')
    for disabled in (
        '# settings.EXTENSIONS_PATH',
        '# settings.ITEM_PIPELINES_PATH',
        '# "interceptors.CustomDownloadInterceptor1"',
        '# "interceptors.CustomDownloadInterceptor2"',
    ):
        # Dropping the leading "# " un-comments the entry.
        settings_code = settings_code.replace(disabled, disabled[2:])
    settings_path.write_text(settings_code, encoding='utf-8')

    # Computed from the caller's use_task, before check_use_redis can change it.
    spider_dir = target / "spiders" / "spiders" if use_task else target / "spiders"
    demo_spiders_dir = template_dir / "demo_spider"

    # demo_server
    copytree_merge(template_dir / "server", target)
    readme_path = target / "readme.txt"
    readme_code = readme_path.read_text(encoding='utf-8')
    if use_task:
        readme_code = readme_code.replace('2.run runner.py', '2.run manager.py')
    if use_redis:
        readme_code = readme_code + '\n3.redis-cli\n4.RPUSH customRedisSpider_test http://127.0.0.1:8002\r\n'
    readme_path.write_text(readme_code, encoding='utf-8')

    if use_redis:
        from .base import find_project_root
        from .genspider import check_use_redis
        project_path = find_project_root(is_demo=True)
        use_task = check_use_redis(project_path, use_redis, use_task)
        demo_spider_files = ["customRedisSpider", "studentSpider"]
    else:
        demo_spider_files = ["customSpider", "studentSpider"]

    _copy_demo_spiders(demo_spiders_dir, spider_dir, demo_spider_files)
    update_spiders_path(
        project_path=target,
        demo_spiders_dir=demo_spiders_dir,
        demo_spider_files=demo_spider_files,
        spider_dir=spider_dir,
        use_task=use_task,
        use_redis=use_redis
    )

    if use_redis:
        _patch_runner_for_redis(target, use_task)

    if use_task:
        print("Project 'demo' with tasks manager created.")
    else:
        print("Project 'demo' created.")


def _copy_demo_spiders(demo_spiders_dir: Path, spider_dir: Path, demo_spider_files: List):
    """Copy each named demo spider module verbatim into ``spider_dir``."""
    spider_dir.mkdir(parents=True, exist_ok=True)
    for demo_spider in demo_spider_files:
        demo_spider_code = (demo_spiders_dir / f'{demo_spider}.py').read_text(encoding='utf-8')
        (spider_dir / f'{demo_spider}.py').write_text(demo_spider_code, encoding='utf-8')


def _patch_runner_for_redis(target: Path, use_task: bool):
    """Rewrite ``runner.py`` so the redis demo uses ``advance_main_all``."""
    # module path with `spiders`
    spiders_dir = target / "spiders" if use_task else target
    runner_path = spiders_dir / "runner.py"
    runner_code = runner_path.read_text(encoding='utf-8')

    # Applied strictly in this order; each pair is (text to find, replacement).
    # NOTE(review): ' main()' is reproduced with the single leading space
    # shown in the reviewed text; that form would also match `def main()`.
    # Confirm the original literal's indentation width.
    replacements = [
        ('crawler, engine_task = await advance_main()', '# crawler, engine_task = await advance_main()'),
        ('# crawler, engine_task = await advance_main_all()', 'crawler, engine_task = await advance_main_all()'),
        ('import threading', '# import threading'),
        ('t = threading.Thread(', '# t = threading.Thread('),
        ('t.start()', '# t.start()'),
        ('t.join()', '# t.join()'),
        (' main()', ' # main()'),
    ]
    if use_task:
        # Task layout nests the spiders package one level deeper.
        replacements += [
            ('spider_path="spiders.CustomSpider"', 'spider_path="spiders.spiders.CustomRedisSpider"'),
            ('get_run_py_dir() / "spiders"', 'get_run_py_dir() / "spiders" / "spiders"'),
        ]
    for old, new in replacements:
        runner_code = runner_code.replace(old, new)
    runner_path.write_text(runner_code, encoding='utf-8')
99
+
100
def update_spiders_path(project_path: Path, demo_spiders_dir: Path, demo_spider_files: List, spider_dir: Path, use_task: bool, use_redis: bool):
    """Install the demo spiders and register them in the spiders package.

    For every name in ``demo_spider_files`` the template module is read from
    ``demo_spiders_dir``, its item import is re-pointed at ``spiders.items``
    when ``use_task`` is set, the result is written to ``spider_dir`` and an
    import line is added through ``update_spiders_init``. ``use_redis`` is
    accepted but not used here.
    """
    # NOTE(review): indentation is lost in the reviewed text; the write below
    # is done for every spider, which yields the same final files whether or
    # not the original only wrote in task mode (the caller pre-copies them).
    for spider_name in demo_spider_files:
        file_name = f"{spider_name}.py"
        spider_code = (demo_spiders_dir / file_name).read_text('utf-8')
        if use_task:
            spider_code = spider_code.replace("from items.item import CustomItem", "from spiders.items.item import CustomItem")
        (spider_dir / file_name).write_text(spider_code, encoding='utf-8')

        # Class name is the file name with only its first letter upper-cased.
        if spider_name:
            cls_name = spider_name[0].upper() + spider_name[1:]
        else:
            cls_name = spider_name
        # update __init__.py
        from .genspider import update_spiders_init
        update_spiders_init(project_path=project_path, class_name=cls_name, spider_name=spider_name, use_task=use_task)
@@ -0,0 +1,83 @@
1
+ import toml
2
+ from ..utils import load_settings_from_py
3
+ from pathlib import Path
4
+ from jinja2 import Template
5
+
6
def check_use_redis(project_path: Path, use_redis: bool=False, use_task: bool=False):
    """Resolve the task-layout flag and switch the project to redis on first use.

    Reads ``scrapy_cffi.toml``. If ``use_task`` is false it is taken from the
    file's ``default`` table. The first time ``use_redis`` is requested for a
    project that is not yet flagged, the flag is stored in the toml file, the
    ``create_settings`` default in ``settings.py`` is flipped to
    ``user_redis=True`` and, for task projects with a ``config.py`` lacking
    ``redis_url``, a default ``redis_url`` is appended.

    Returns:
        The effective ``use_task`` value.
    """
    config_path = project_path / "scrapy_cffi.toml"
    config_data = toml.load(config_path)

    defaults = config_data.get("default")
    if not defaults:
        return use_task

    if not use_task:
        use_task = defaults.get("use_task", False)

    # Nothing more to do unless redis is requested and not yet enabled.
    if not use_redis or defaults.get("use_redis", False):
        return use_task

    defaults["use_redis"] = True  # same table object as config_data["default"]
    with config_path.open("w", encoding="utf-8") as f:
        toml.dump(config_data, f)

    # Scheduler -> RedisScheduler
    if use_task:
        settings_file = project_path / "spiders" / "settings.py"
    else:
        settings_file = project_path / "settings.py"
    old_signature = 'def create_settings(spider_path, user_redis=False, *args, **kwargs):'
    new_signature = 'def create_settings(spider_path, user_redis=True, *args, **kwargs):'
    settings_data = settings_file.read_text(encoding='utf-8')
    if old_signature in settings_data:
        settings_file.write_text(settings_data.replace(old_signature, new_signature), encoding="utf-8")

    # update config.py
    if not use_task:
        return use_task
    task_config_path = project_path / "config.py"
    if task_config_path.exists():
        task_config = load_settings_from_py(task_config_path, auto_upper=False)
        if not task_config.get("redis_url"):
            with task_config_path.open("a", encoding="utf-8") as f:
                f.write('\nredis_url = "redis://127.0.0.1:6379"\n')
    return use_task
33
+
34
def run(spider_name: str, allow_domain: str, use_redis: bool, is_demo=False):
    """Render a new spider module from ``spider.py.j2`` into the project.

    The project root is located from the working directory. The module is
    written to ``spiders/<spider_name>.py`` (``spiders/spiders/...`` in the
    task layout), overwriting an existing file of that name, and the spiders
    package ``__init__`` gets a matching import. The confirmation message is
    suppressed when ``is_demo`` is true.
    """
    from .base import find_project_root

    project_path = find_project_root()
    use_task = check_use_redis(project_path, use_redis)

    if use_redis:
        base_class = "RedisSpider"
        start_urls = 'redis_key = ""'
    else:
        base_class = "Spider"
        start_urls = f'start_urls = ["https://{allow_domain}"]'
    class_name = snake_to_camel(spider_name)

    # Templates ship inside the scrapy_cffi package directory.
    template_path = Path(__file__).parent.parent / "templates" / "spider.py.j2"
    template: Template = Template(template_path.read_text(encoding="utf-8"))
    code = template.render(
        class_name=class_name,
        spider_name=spider_name,
        domain=allow_domain,
        base_class=base_class,
        base_import="scrapy_cffi.spiders",
        start_urls=start_urls
    )

    spiders_pkg = project_path / "spiders"
    if use_task:
        spiders_pkg = spiders_pkg / "spiders"
    target_file = spiders_pkg / f"{spider_name}.py"
    target_file.parent.mkdir(parents=True, exist_ok=True)
    target_file.write_text(code, encoding="utf-8")

    # Only the spider module itself is regenerated; other files are appended
    # to or updated in place so user-defined content is not overwritten.
    update_spiders_init(project_path, class_name, spider_name, use_task)
    if not is_demo:
        print(f"Spider created: {target_file}")
65
+
66
+ # Use this to automatically convert snake_case to camelCase.
67
+ def snake_to_camel(name: str) -> str:
68
+ return ''.join(word.capitalize() for word in name.split('_')) + "Spider"
69
+
70
+ # auto import
71
def update_spiders_init(project_path: Path, class_name: str, spider_name: str, use_task: bool):
    """Ensure the spiders package ``__init__.py`` imports the given spider.

    Args:
        project_path: Project root directory.
        class_name: Spider class to import.
        spider_name: Module name (file stem) that defines the class.
        use_task: Task layout; the package is ``spiders/spiders`` instead of
            ``spiders``.

    The file is created when missing. An import that is already present as a
    line of its own is not added again.
    """
    spiders_pkg = project_path / "spiders" / "spiders" if use_task else project_path / "spiders"
    init_path = spiders_pkg / "__init__.py"
    import_stmt = f"from .{spider_name} import {class_name}"

    if not init_path.exists():
        init_path.write_text(import_stmt + "\n", encoding="utf-8")
        return

    init_data = init_path.read_text(encoding='utf-8')
    # Compare whole lines so an import sitting on a final line without a
    # trailing newline is still recognised as present.
    if any(line.strip() == import_stmt for line in init_data.splitlines()):
        return

    # Start on a fresh line; otherwise the import would be glued onto the
    # last existing line when the file does not end with a newline.
    separator = "" if not init_data or init_data.endswith("\n") else "\n"
    with init_path.open("a", encoding="utf-8") as f:
        f.write(f"{separator}{import_stmt}\n")
@@ -0,0 +1,43 @@
1
+ import argparse
2
+ from . import startproject, genspider, demo
3
+
4
def main(argv=None):
    """Entry point of the ``scrapy_cffi`` command-line tool.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the console script relies on.

    Sub-commands: ``startproject <name> [-t]``, ``genspider [-r] <name>
    <domain>`` and ``demo [-t] [-r]``. Without a sub-command the help text is
    printed.
    """
    parser = argparse.ArgumentParser(prog="scrapy_cffi", description="scrapy_cffi CLI tool")
    subparsers = parser.add_subparsers(dest="command")

    # startproject
    sp = subparsers.add_parser("startproject", help="Create a new project")
    sp.add_argument("name", help="Project name")
    sp.add_argument("-t", "--task", action="store_true", help="Create a new project with tasks manager")

    # genspider
    gp = subparsers.add_parser("genspider", help="Generate a new spider")
    gp.add_argument("-r", "--redis", action="store_true", help="Use RedisSpider")
    gp.add_argument("name", help="Spider name")
    gp.add_argument("domain", help="Target domain")

    # demo project
    demo_p = subparsers.add_parser("demo", help="Create a demo project")
    demo_p.add_argument("-t", "--task", action="store_true", help="with tasks manager")
    demo_p.add_argument("-r", "--redis", action="store_true", help="Use RedisSpider")

    # export
    # ep = subparsers.add_parser("export", help="Export files")
    # ep.add_argument("name", help="Filename")

    args = parser.parse_args(argv)

    if args.command == "startproject":
        startproject.run(args.name, args.task)
    elif args.command == "genspider":
        genspider.run(args.name, args.domain, args.redis)
    # elif args.command == "export":
    #     export.run(args.name)
    elif args.command == "demo":
        # Any non-None result from project creation stops the demo set-up.
        if startproject.run("demo", args.task, is_demo=True) is not None:
            return
        demo.run(args.task, args.redis)
    else:
        # argparse itself rejects unknown sub-command names, so this branch
        # is only reached when no sub-command was given; show the usage
        # instead of reporting "Unknown command: None".
        parser.print_help()
@@ -0,0 +1,47 @@
1
+ import shutil, toml
2
+ from pathlib import Path
3
+
4
def run(project_name, use_task: bool, is_demo=False):
    """Create a new project directory from the bundled templates.

    Args:
        project_name: Name of the directory to create under the working
            directory.
        use_task: Use the task-manager layout: the ``task`` template becomes
            the project root and the spider template goes into ``spiders/``,
            with module paths in ``runner.py`` and ``settings.py`` prefixed
            accordingly.
        is_demo: Suppress the "next steps" output.

    Returns:
        ``False`` when the target directory already exists, otherwise ``None``.
    """
    template_dir = Path(__file__).parent.parent / "templates"  # inside scrapy_cffi
    target: Path = Path.cwd() / project_name

    if target.exists():
        print(f"Error: Project '{project_name}' already exists.")
        return False

    if use_task:
        shutil.copytree(template_dir / "task", target)
        spiders_dir = target / "spiders"
    else:
        spiders_dir = target
    shutil.copytree(template_dir / "spiders", spiders_dir)
    shutil.copytree(template_dir / "js_path", target / "js_path")

    if use_task:
        # module path with `spiders`
        runner_path = spiders_dir / "runner.py"
        runner_code = runner_path.read_text(encoding='utf-8')
        runner_path.write_text(
            runner_code.replace('from settings import create_settings', 'from spiders.settings import create_settings'),
            encoding='utf-8',
        )

        settings_path = spiders_dir / "settings.py"
        settings_code = settings_path.read_text(encoding='utf-8')
        for dotted in (
            "extensions.CustomExtension",
            "pipelines.CustomPipeline2",
            "pipelines.CustomPipeline1",
            "interceptors.CustomDownloadInterceptor1",
            "interceptors.CustomDownloadInterceptor2",
        ):
            # Quoted component paths gain the `spiders.` package prefix.
            settings_code = settings_code.replace(f'"{dotted}"', f'"spiders.{dotted}"')
        settings_path.write_text(settings_code, encoding='utf-8')

    # Marker file that later commands use to find the project root.
    config_path = target / "scrapy_cffi.toml"
    with config_path.open("w", encoding="utf-8") as f:
        toml.dump({"default": {"project_name": project_name, "use_task": use_task}}, f)

    if is_demo:
        return
    print(f"Project '{project_name}' created.")
    print(f"\tcd {project_name}")
    print("\tscrapy_cffi genspider <spider_name> <domain>")
@@ -0,0 +1,10 @@
1
+ from .downloader import Request, HttpRequest, WebSocketRequest, Response, HttpResponse, WebSocketResponse
2
+
3
+ __all__ = [
4
+ "Request",
5
+ "HttpRequest",
6
+ "WebSocketRequest",
7
+ "Response",
8
+ "HttpResponse",
9
+ "WebSocketResponse",
10
+ ]
@@ -0,0 +1,11 @@
1
+ from .sessions import SessionManager
2
+ from .engine import Engine
3
+ from .tasks import TaskManager
4
+ from .downloader import Downloader
5
+
6
+ __all__ = [
7
+ "SessionManager",
8
+ "Engine",
9
+ "TaskManager",
10
+ "Downloader"
11
+ ]
@@ -0,0 +1,12 @@
1
+ from .fetch import Downloader
2
+ from .internet import Request, HttpRequest, WebSocketRequest, Response, HttpResponse, WebSocketResponse
3
+
4
+ __all__ = [
5
+ "Downloader",
6
+ "Request",
7
+ "HttpRequest",
8
+ "WebSocketRequest",
9
+ "Response",
10
+ "HttpResponse",
11
+ "WebSocketResponse",
12
+ ]