bricks-py 0.1.4__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. bricks_py-0.2.0/.gitignore +14 -0
  2. bricks_py-0.2.0/PKG-INFO +257 -0
  3. bricks_py-0.2.0/README.md +219 -0
  4. bricks_py-0.2.0/bricks/__main__.py +4 -0
  5. bricks_py-0.2.0/bricks/client/cli.py +199 -0
  6. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/client/runner.py +1 -1
  7. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/core/context.py +1 -0
  8. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/core/dispatch.py +210 -118
  9. bricks_py-0.2.0/bricks/core/events.py +459 -0
  10. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/core/genesis.py +8 -4
  11. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/db/sqlite.py +1 -1
  12. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/downloader/curl.py +1 -1
  13. bricks_py-0.2.0/bricks/downloader/niquests_.py +176 -0
  14. bricks_py-0.2.0/bricks/downloader/primp_.py +237 -0
  15. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/extractors.py +1 -1
  16. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/queues/redis_.py +30 -17
  17. bricks_py-0.2.0/bricks/lib/queues/sqlite_.py +789 -0
  18. bricks_py-0.2.0/bricks/rpc/grpc_/generic.proto +24 -0
  19. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/spider/air.py +90 -80
  20. bricks_py-0.2.0/bricks/tpls/spider/air.tpl +55 -0
  21. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/codes.py +8 -10
  22. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/convert.py +1 -1
  23. bricks_py-0.2.0/bricks/utils/fake/tls.py +608 -0
  24. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/pandora.py +137 -22
  25. bricks_py-0.2.0/pyproject.toml +64 -0
  26. bricks_py-0.1.4/MANIFEST.in +0 -1
  27. bricks_py-0.1.4/PKG-INFO +0 -105
  28. bricks_py-0.1.4/README.md +0 -73
  29. bricks_py-0.1.4/bricks/core/events.py +0 -299
  30. bricks_py-0.1.4/bricks/lib/queues/sqlite_.py +0 -785
  31. bricks_py-0.1.4/bricks/utils/media_downloader.py +0 -877
  32. bricks_py-0.1.4/bricks_py.egg-info/PKG-INFO +0 -105
  33. bricks_py-0.1.4/bricks_py.egg-info/SOURCES.txt +0 -101
  34. bricks_py-0.1.4/bricks_py.egg-info/dependency_links.txt +0 -1
  35. bricks_py-0.1.4/bricks_py.egg-info/requires.txt +0 -8
  36. bricks_py-0.1.4/bricks_py.egg-info/top_level.txt +0 -2
  37. bricks_py-0.1.4/example/__init__.py +0 -4
  38. bricks_py-0.1.4/example/generator/__init__.py +0 -0
  39. bricks_py-0.1.4/example/generator/spider_generator.py +0 -33
  40. bricks_py-0.1.4/setup.cfg +0 -4
  41. bricks_py-0.1.4/setup.py +0 -48
  42. {bricks_py-0.1.4 → bricks_py-0.2.0}/LICENSE +0 -0
  43. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/__init__.py +0 -0
  44. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/client/__init__.py +0 -0
  45. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/client/manage.py +0 -0
  46. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/client/server/__init__.py +0 -0
  47. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/client/server/sanic_.py +0 -0
  48. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/client/server/starlette_.py +0 -0
  49. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/core/__init__.py +0 -0
  50. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/core/signals.py +0 -0
  51. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/db/__init__.py +0 -0
  52. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/db/mongo.py +0 -0
  53. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/db/redis_.py +0 -0
  54. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/db/rocksdb.py +0 -0
  55. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/downloader/__init__.py +0 -0
  56. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/downloader/cffi.py +0 -0
  57. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/downloader/dp.py +0 -0
  58. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/downloader/go_requests.py +0 -0
  59. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/downloader/httpx_.py +0 -0
  60. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/downloader/playwright_.py +0 -0
  61. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/downloader/pyhttpx_.py +0 -0
  62. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/downloader/requests_.py +0 -0
  63. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/downloader/tls_client_.py +0 -0
  64. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/__init__.py +0 -0
  65. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/cookies.py +0 -0
  66. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/counter.py +0 -0
  67. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/headers.py +0 -0
  68. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/items.py +0 -0
  69. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/nodes.py +0 -0
  70. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/proxies.py +0 -0
  71. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/queues/__init__.py +0 -0
  72. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/queues/cache.py +0 -0
  73. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/queues/local.py +0 -0
  74. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/queues/rocksdb_.py +0 -0
  75. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/queues/smart.py +0 -0
  76. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/request.py +0 -0
  77. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/response.py +0 -0
  78. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/lib/variable.py +0 -0
  79. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/plugins/__init__.py +0 -0
  80. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/plugins/make_seeds.py +0 -0
  81. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/plugins/on_request.py +0 -0
  82. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/plugins/scripts.py +0 -0
  83. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/plugins/storage.py +0 -0
  84. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/__init__.py +0 -0
  85. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/common.py +0 -0
  86. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/grpc_/__init__.py +0 -0
  87. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/grpc_/generic_pb2.py +0 -0
  88. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/grpc_/generic_pb2.pyi +0 -0
  89. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/grpc_/generic_pb2_grpc.py +0 -0
  90. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/grpc_/service.py +0 -0
  91. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/http_/__init__.py +0 -0
  92. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/http_/service.py +0 -0
  93. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/redis_/__init__.py +0 -0
  94. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/redis_/service.py +0 -0
  95. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/socket_/__init__.py +0 -0
  96. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/socket_/service.py +0 -0
  97. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/websocket_/__init__.py +0 -0
  98. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/rpc/websocket_/service.py +0 -0
  99. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/spider/__init__.py +0 -0
  100. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/spider/addon.py +0 -0
  101. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/spider/form.py +0 -0
  102. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/spider/template.py +0 -0
  103. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/state.py +0 -0
  104. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/tpls/__init__.py +0 -0
  105. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/tpls/spider/__init__.py +0 -0
  106. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/tpls/spider/form.tpl +0 -0
  107. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/tpls/spider/template.tpl +0 -0
  108. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/__init__.py +0 -0
  109. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/arrow.py +0 -0
  110. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/compress.py +0 -0
  111. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/csv_.py +0 -0
  112. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/fake/__init__.py +0 -0
  113. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/fake/stochastic.py +0 -0
  114. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/fake/user_agent.py +0 -0
  115. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/package.py +0 -0
  116. {bricks_py-0.1.4 → bricks_py-0.2.0}/bricks/utils/scheduler.py +0 -0
@@ -0,0 +1,14 @@
1
+ venv
2
+ /.svn
3
+ /.idea
4
+ __pycache__
5
+ /logs
6
+ /Files
7
+ /test
8
+ /venv
9
+ /no_views
10
+ .vscode/launch.json
11
+ .gitignore
12
+ .python-version
13
+ .gitignore
14
+ uv.lock
@@ -0,0 +1,257 @@
1
+ Metadata-Version: 2.4
2
+ Name: bricks-py
3
+ Version: 0.2.0
4
+ Summary: quickly build your crawler
5
+ Project-URL: Homepage, https://github.com/KKKKKKKEM/bricks
6
+ Project-URL: Repository, https://github.com/KKKKKKKEM/bricks.git
7
+ Project-URL: Documentation, https://kkkkkkkem.vercel.app/bricks
8
+ Project-URL: Bug Tracker, https://github.com/KKKKKKKEM/bricks/issues
9
+ Author-email: Kem <531144129@qq.com>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: bricks,crawler,scraper,spider
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Internet :: WWW/HTTP
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Requires-Python: >=3.10
23
+ Requires-Dist: better-exceptions==0.3.3
24
+ Requires-Dist: curl-cffi==0.15.0
25
+ Requires-Dist: jmespath-community==1.1.3
26
+ Requires-Dist: jsonpath==0.82.2
27
+ Requires-Dist: loguru==0.7.3
28
+ Requires-Dist: lxml==5.3.1
29
+ Requires-Dist: redis==5.2.1
30
+ Requires-Dist: w3lib==2.3.1
31
+ Provides-Extra: httpx
32
+ Requires-Dist: httpx>=0.24.0; extra == 'httpx'
33
+ Provides-Extra: playwright
34
+ Requires-Dist: playwright>=1.40.0; extra == 'playwright'
35
+ Provides-Extra: requests
36
+ Requires-Dist: requests>=2.28.0; extra == 'requests'
37
+ Description-Content-Type: text/markdown
38
+
39
+ [license]: /LICENSE
40
+ [license-badge]: https://img.shields.io/github/license/KKKKKKKEM/bricks?style=flat-square
41
+ [prs]: https://github.com/KKKKKKKEM/bricks
42
+ [prs-badge]: https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square
43
+ [issues]: https://github.com/KKKKKKKEM/bricks/issues/new
44
+ [issues-badge]: https://img.shields.io/badge/Issues-welcome-brightgreen.svg?style=flat-square
45
+ [release]: https://github.com/KKKKKKKEM/bricks/releases/latest
46
+ [release-badge]: https://img.shields.io/github/v/release/KKKKKKKEM/bricks?style=flat-square
47
+ [pypi]: https://pypi.org/project/bricks-py/
48
+ [pypi-badge]: https://img.shields.io/pypi/v/bricks-py?style=flat-square
49
+ [python-badge]: https://img.shields.io/pypi/pyversions/bricks-py?style=flat-square
50
+
51
+ <div align="center">
52
+
53
+ # 🧱 Bricks
54
+
55
+ **像搭积木一样构建爬虫**
56
+
57
+ [![license][license-badge]][license]
58
+ [![release][release-badge]][release]
59
+ [![prs][prs-badge]][prs]
60
+ [![issues][issues-badge]][issues]
61
+ ![python][python-badge]
62
+
63
+ </div>
64
+
65
+ ---
66
+
67
+ ## 简介
68
+
69
+ `Bricks` 是一个模块化、事件驱动的 Python 爬虫框架,旨在让爬虫开发变得像搭建积木一样简单而有趣。框架提供了从 **纯代码式** 到 **零代码配置式** 的多层次开发体验,让新手快速上手,让专家灵活掌控。
70
+
71
+ 无论是简单的单页抓取、多步骤链式请求,还是分布式大规模爬取,`Bricks` 都能以一致的编程模型优雅地处理。
72
+
73
+ ---
74
+
75
+ ## ✨ 核心特性
76
+
77
+ | 特性 | 说明 |
78
+ |---|---|
79
+ | **事件驱动架构** | 在请求前后、存储前后等生命周期节点注册事件钩子,无需修改核心逻辑即可扩展行为 |
80
+ | **三种爬虫基类** | `air`(纯代码)、`form`(自定义流程配置)、`template`(固定流程配置),按复杂度选择 |
81
+ | **丰富的解析器** | 内置 `json` / `xpath` / `jsonpath` / `regex` 解析,声明规则即可完成数据提取 |
82
+ | **多种下载器** | 默认使用 `curl-cffi`,可选 `requests` / `httpx` / `playwright` / `pycurl` 等,支持自定义 |
83
+ | **弹性调度器** | 可伸缩线程池,同步/异步任务统一调度,自动根据任务量调节 Worker 数量 |
84
+ | **多种任务队列** | 内置 `Local`(单机)和 `Redis`(分布式)队列,接口统一,支持自定义扩展 |
85
+ | **爬虫 API 化** | 内置 `rpc` 模式,一键将爬虫转化为可远程调用的 API |
86
+ | **代理管理** | 内置 `ApiProxy` / `RedisProxy` / `ClashProxy` 等代理管理器,支持自动轮换、阈值回收 |
87
+
88
+ ---
89
+
90
+ ## 🚀 快速开始
91
+
92
+ ### 安装
93
+
94
+ ```bash
95
+ # 安装正式版
96
+ pip install -U bricks-py
97
+
98
+ # 安装最新开发版
99
+ pip install -U git+https://github.com/KKKKKKKEM/bricks.git
100
+
101
+ # 安装测试版(beta)
102
+ pip install -i https://test.pypi.org/simple/ -U bricks-py
103
+ ```
104
+
105
+ 可选下载器依赖:
106
+
107
+ ```bash
108
+ pip install bricks-py[requests] # requests 下载器
109
+ pip install bricks-py[httpx] # httpx 下载器
110
+ pip install bricks-py[playwright] # playwright 下载器
111
+ ```
112
+
113
+ ### 最简示例(air 爬虫)
114
+
115
+ ```python
116
+ from bricks import Request, const
117
+ from bricks.core import events, signals
118
+ from bricks.spider import air
119
+ from bricks.spider.air import Context
120
+
121
+
122
+ class MySpider(air.Spider):
123
+
124
+ def make_seeds(self, context: Context, **kwargs):
125
+ # 返回要爬取的种子列表
126
+ return [{"page": 1}, {"page": 2}, {"page": 3}]
127
+
128
+ def make_request(self, context: Context) -> Request:
129
+ seeds = context.seeds
130
+ return Request(
131
+ url="https://api.example.com/list",
132
+ params={"page": seeds["page"]},
133
+ )
134
+
135
+ def parse(self, context: Context):
136
+ return context.response.extract(
137
+ engine="json",
138
+ rules={"data.list": {"id": "id", "name": "name"}},
139
+ )
140
+
141
+ def item_pipeline(self, context: Context):
142
+ print(context.items)
143
+ context.success() # 标记种子处理完成
144
+
145
+ @staticmethod
146
+ @events.on(const.AFTER_REQUEST)
147
+ def check_response(context: Context):
148
+ if context.response.get("code") != 0:
149
+ raise signals.Retry # 触发重试
150
+
151
+
152
+ if __name__ == "__main__":
153
+ spider = MySpider()
154
+ spider.run()
155
+ ```
156
+
157
+ ### 配置式示例(form 爬虫)
158
+
159
+ ```python
160
+ from bricks.spider import form
161
+
162
+
163
+ class MySpider(form.Spider):
164
+
165
+ @property
166
+ def config(self) -> form.Config:
167
+ return form.Config(
168
+ init=[form.Init(func=lambda: {"page": 1})],
169
+ spider=[
170
+ form.Download(
171
+ url="https://api.example.com/list",
172
+ params={"page": "{page}"},
173
+ ),
174
+ form.Parse(
175
+ func="json",
176
+ kwargs={"rules": {"data.list": {"id": "id", "name": "name"}}},
177
+ ),
178
+ form.Pipeline(
179
+ func=lambda context: print(context.items),
180
+ success=True,
181
+ ),
182
+ ],
183
+ )
184
+
185
+
186
+ if __name__ == "__main__":
187
+ MySpider().run()
188
+ ```
189
+
190
+ ---
191
+
192
+ ## 📖 文档
193
+
194
+ | 文档 | 描述 |
195
+ |---|---|
196
+ | [快速入门](docs/quickstart.md) | 5 分钟了解 Bricks 的核心概念和使用方式 |
197
+ | [爬虫基类](docs/spiders.md) | `air` / `form` / `template` 三种爬虫的详细说明 |
198
+ | [事件系统](docs/events.md) | 生命周期钩子、事件注册与触发机制 |
199
+ | [解析器](docs/parsers.md) | JSON / XPath / JSONPath / Regex 解析规则详解 |
200
+ | [下载器](docs/downloaders.md) | 各类下载器的使用与自定义扩展 |
201
+ | [任务队列](docs/queues.md) | Local / Redis 队列与分布式爬虫 |
202
+ | [代理管理](docs/proxies.md) | 代理池配置与自动轮换策略 |
203
+ | [信号机制](docs/signals.md) | Retry / Success / Failure 等控制信号 |
204
+ | [RPC 模式](docs/rpc.md) | 将爬虫暴露为远程 API |
205
+ | [存储插件](docs/storage.md) | 内置 SQLite / MongoDB / Redis / CSV 存储 |
206
+
207
+ > 完整在线文档:[https://kkkkkkkem.vercel.app/bricks](https://kkkkkkkem.vercel.app/bricks)
208
+
209
+ ---
210
+
211
+ ## 🏗️ 架构概览
212
+
213
+ ```
214
+ bricks/
215
+ ├── spider/ # 爬虫基类
216
+ │ ├── air.py # 纯代码式爬虫
217
+ │ ├── form.py # 自定义流程配置式爬虫
218
+ │ └── template.py # 固定流程配置式爬虫
219
+ ├── core/ # 核心机制
220
+ │ ├── context.py # 上下文 / 流程控制
221
+ │ ├── events.py # 事件管理器
222
+ │ ├── genesis.py # 基础类 Chaos / Pangu
223
+ │ ├── dispatch.py # 调度器
224
+ │ └── signals.py # 控制信号
225
+ ├── downloader/ # 下载器
226
+ ├── lib/ # 基础库(Request / Response / Queue / Proxy 等)
227
+ ├── plugins/ # 内置插件(storage / scripts 等)
228
+ └── rpc/ # RPC 模式
229
+ ```
230
+
231
+ **爬虫生命周期**:
232
+
233
+ ```
234
+ make_seeds → [BEFORE_PUT_SEEDS] → put_seeds → [AFTER_PUT_SEEDS]
235
+
236
+ [BEFORE_GET_SEEDS] → get_seeds → [AFTER_GET_SEEDS]
237
+
238
+ [BEFORE_MAKE_REQUEST] → make_request → [AFTER_MAKE_REQUEST]
239
+
240
+ [BEFORE_REQUEST] → on_request → [AFTER_REQUEST]
241
+
242
+ on_response (parse)
243
+
244
+ [BEFORE_PIPELINE] → on_pipeline → [AFTER_PIPELINE]
245
+ ```
246
+
247
+ ---
248
+
249
+ ## 🤝 贡献
250
+
251
+ 欢迎提交 [Issue][issues] 和 [Pull Request][prs]。
252
+
253
+ ---
254
+
255
+ ## 📄 License
256
+
257
+ [MIT](LICENSE) © Kem
@@ -0,0 +1,219 @@
1
+ [license]: /LICENSE
2
+ [license-badge]: https://img.shields.io/github/license/KKKKKKKEM/bricks?style=flat-square
3
+ [prs]: https://github.com/KKKKKKKEM/bricks
4
+ [prs-badge]: https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square
5
+ [issues]: https://github.com/KKKKKKKEM/bricks/issues/new
6
+ [issues-badge]: https://img.shields.io/badge/Issues-welcome-brightgreen.svg?style=flat-square
7
+ [release]: https://github.com/KKKKKKKEM/bricks/releases/latest
8
+ [release-badge]: https://img.shields.io/github/v/release/KKKKKKKEM/bricks?style=flat-square
9
+ [pypi]: https://pypi.org/project/bricks-py/
10
+ [pypi-badge]: https://img.shields.io/pypi/v/bricks-py?style=flat-square
11
+ [python-badge]: https://img.shields.io/pypi/pyversions/bricks-py?style=flat-square
12
+
13
+ <div align="center">
14
+
15
+ # 🧱 Bricks
16
+
17
+ **像搭积木一样构建爬虫**
18
+
19
+ [![license][license-badge]][license]
20
+ [![release][release-badge]][release]
21
+ [![prs][prs-badge]][prs]
22
+ [![issues][issues-badge]][issues]
23
+ ![python][python-badge]
24
+
25
+ </div>
26
+
27
+ ---
28
+
29
+ ## 简介
30
+
31
+ `Bricks` 是一个模块化、事件驱动的 Python 爬虫框架,旨在让爬虫开发变得像搭建积木一样简单而有趣。框架提供了从 **纯代码式** 到 **零代码配置式** 的多层次开发体验,让新手快速上手,让专家灵活掌控。
32
+
33
+ 无论是简单的单页抓取、多步骤链式请求,还是分布式大规模爬取,`Bricks` 都能以一致的编程模型优雅地处理。
34
+
35
+ ---
36
+
37
+ ## ✨ 核心特性
38
+
39
+ | 特性 | 说明 |
40
+ |---|---|
41
+ | **事件驱动架构** | 在请求前后、存储前后等生命周期节点注册事件钩子,无需修改核心逻辑即可扩展行为 |
42
+ | **三种爬虫基类** | `air`(纯代码)、`form`(自定义流程配置)、`template`(固定流程配置),按复杂度选择 |
43
+ | **丰富的解析器** | 内置 `json` / `xpath` / `jsonpath` / `regex` 解析,声明规则即可完成数据提取 |
44
+ | **多种下载器** | 默认使用 `curl-cffi`,可选 `requests` / `httpx` / `playwright` / `pycurl` 等,支持自定义 |
45
+ | **弹性调度器** | 可伸缩线程池,同步/异步任务统一调度,自动根据任务量调节 Worker 数量 |
46
+ | **多种任务队列** | 内置 `Local`(单机)和 `Redis`(分布式)队列,接口统一,支持自定义扩展 |
47
+ | **爬虫 API 化** | 内置 `rpc` 模式,一键将爬虫转化为可远程调用的 API |
48
+ | **代理管理** | 内置 `ApiProxy` / `RedisProxy` / `ClashProxy` 等代理管理器,支持自动轮换、阈值回收 |
49
+
50
+ ---
51
+
52
+ ## 🚀 快速开始
53
+
54
+ ### 安装
55
+
56
+ ```bash
57
+ # 安装正式版
58
+ pip install -U bricks-py
59
+
60
+ # 安装最新开发版
61
+ pip install -U git+https://github.com/KKKKKKKEM/bricks.git
62
+
63
+ # 安装测试版(beta)
64
+ pip install -i https://test.pypi.org/simple/ -U bricks-py
65
+ ```
66
+
67
+ 可选下载器依赖:
68
+
69
+ ```bash
70
+ pip install bricks-py[requests] # requests 下载器
71
+ pip install bricks-py[httpx] # httpx 下载器
72
+ pip install bricks-py[playwright] # playwright 下载器
73
+ ```
74
+
75
+ ### 最简示例(air 爬虫)
76
+
77
+ ```python
78
+ from bricks import Request, const
79
+ from bricks.core import events, signals
80
+ from bricks.spider import air
81
+ from bricks.spider.air import Context
82
+
83
+
84
+ class MySpider(air.Spider):
85
+
86
+ def make_seeds(self, context: Context, **kwargs):
87
+ # 返回要爬取的种子列表
88
+ return [{"page": 1}, {"page": 2}, {"page": 3}]
89
+
90
+ def make_request(self, context: Context) -> Request:
91
+ seeds = context.seeds
92
+ return Request(
93
+ url="https://api.example.com/list",
94
+ params={"page": seeds["page"]},
95
+ )
96
+
97
+ def parse(self, context: Context):
98
+ return context.response.extract(
99
+ engine="json",
100
+ rules={"data.list": {"id": "id", "name": "name"}},
101
+ )
102
+
103
+ def item_pipeline(self, context: Context):
104
+ print(context.items)
105
+ context.success() # 标记种子处理完成
106
+
107
+ @staticmethod
108
+ @events.on(const.AFTER_REQUEST)
109
+ def check_response(context: Context):
110
+ if context.response.get("code") != 0:
111
+ raise signals.Retry # 触发重试
112
+
113
+
114
+ if __name__ == "__main__":
115
+ spider = MySpider()
116
+ spider.run()
117
+ ```
118
+
119
+ ### 配置式示例(form 爬虫)
120
+
121
+ ```python
122
+ from bricks.spider import form
123
+
124
+
125
+ class MySpider(form.Spider):
126
+
127
+ @property
128
+ def config(self) -> form.Config:
129
+ return form.Config(
130
+ init=[form.Init(func=lambda: {"page": 1})],
131
+ spider=[
132
+ form.Download(
133
+ url="https://api.example.com/list",
134
+ params={"page": "{page}"},
135
+ ),
136
+ form.Parse(
137
+ func="json",
138
+ kwargs={"rules": {"data.list": {"id": "id", "name": "name"}}},
139
+ ),
140
+ form.Pipeline(
141
+ func=lambda context: print(context.items),
142
+ success=True,
143
+ ),
144
+ ],
145
+ )
146
+
147
+
148
+ if __name__ == "__main__":
149
+ MySpider().run()
150
+ ```
151
+
152
+ ---
153
+
154
+ ## 📖 文档
155
+
156
+ | 文档 | 描述 |
157
+ |---|---|
158
+ | [快速入门](docs/quickstart.md) | 5 分钟了解 Bricks 的核心概念和使用方式 |
159
+ | [爬虫基类](docs/spiders.md) | `air` / `form` / `template` 三种爬虫的详细说明 |
160
+ | [事件系统](docs/events.md) | 生命周期钩子、事件注册与触发机制 |
161
+ | [解析器](docs/parsers.md) | JSON / XPath / JSONPath / Regex 解析规则详解 |
162
+ | [下载器](docs/downloaders.md) | 各类下载器的使用与自定义扩展 |
163
+ | [任务队列](docs/queues.md) | Local / Redis 队列与分布式爬虫 |
164
+ | [代理管理](docs/proxies.md) | 代理池配置与自动轮换策略 |
165
+ | [信号机制](docs/signals.md) | Retry / Success / Failure 等控制信号 |
166
+ | [RPC 模式](docs/rpc.md) | 将爬虫暴露为远程 API |
167
+ | [存储插件](docs/storage.md) | 内置 SQLite / MongoDB / Redis / CSV 存储 |
168
+
169
+ > 完整在线文档:[https://kkkkkkkem.vercel.app/bricks](https://kkkkkkkem.vercel.app/bricks)
170
+
171
+ ---
172
+
173
+ ## 🏗️ 架构概览
174
+
175
+ ```
176
+ bricks/
177
+ ├── spider/ # 爬虫基类
178
+ │ ├── air.py # 纯代码式爬虫
179
+ │ ├── form.py # 自定义流程配置式爬虫
180
+ │ └── template.py # 固定流程配置式爬虫
181
+ ├── core/ # 核心机制
182
+ │ ├── context.py # 上下文 / 流程控制
183
+ │ ├── events.py # 事件管理器
184
+ │ ├── genesis.py # 基础类 Chaos / Pangu
185
+ │ ├── dispatch.py # 调度器
186
+ │ └── signals.py # 控制信号
187
+ ├── downloader/ # 下载器
188
+ ├── lib/ # 基础库(Request / Response / Queue / Proxy 等)
189
+ ├── plugins/ # 内置插件(storage / scripts 等)
190
+ └── rpc/ # RPC 模式
191
+ ```
192
+
193
+ **爬虫生命周期**:
194
+
195
+ ```
196
+ make_seeds → [BEFORE_PUT_SEEDS] → put_seeds → [AFTER_PUT_SEEDS]
197
+
198
+ [BEFORE_GET_SEEDS] → get_seeds → [AFTER_GET_SEEDS]
199
+
200
+ [BEFORE_MAKE_REQUEST] → make_request → [AFTER_MAKE_REQUEST]
201
+
202
+ [BEFORE_REQUEST] → on_request → [AFTER_REQUEST]
203
+
204
+ on_response (parse)
205
+
206
+ [BEFORE_PIPELINE] → on_pipeline → [AFTER_PIPELINE]
207
+ ```
208
+
209
+ ---
210
+
211
+ ## 🤝 贡献
212
+
213
+ 欢迎提交 [Issue][issues] 和 [Pull Request][prs]。
214
+
215
+ ---
216
+
217
+ ## 📄 License
218
+
219
+ [MIT](LICENSE) © Kem
@@ -0,0 +1,4 @@
1
+ from bricks.client.cli import main
2
+
3
+ if __name__ == "__main__":
4
+ main()
@@ -0,0 +1,199 @@
1
+ # -*- coding: utf-8 -*-
2
+ # @Desc : CLI 入口
3
+ import argparse
4
+ import inspect
5
+ import os
6
+ import sys
7
+
8
+ from loguru import logger
9
+
10
+
11
def _find_spider_class(module):
    """Return the Spider subclass defined in ``module``, or ``None``.

    A class qualifies when it is a strict subclass of ``air.Spider`` and its
    ``__module__`` equals ``module.__name__``; classes whose ``__module__``
    differs are ignored. When several classes qualify, the one whose source
    starts on the lowest line number is returned.

    Args:
        module: The loaded module object to search.

    Returns:
        The selected class, or ``None`` when no class qualifies.
    """
    from bricks.spider import air

    def _definition_line(cls):
        # inspect raises OSError when the source cannot be retrieved and
        # TypeError when it cannot map the class to a source file; rank such
        # classes last instead of aborting the whole lookup.
        try:
            return inspect.getsourcelines(cls)[1]
        except (OSError, TypeError):
            return float("inf")

    candidates = [
        obj
        for obj in (getattr(module, name) for name in dir(module))
        if isinstance(obj, type)
        and issubclass(obj, air.Spider)
        and obj is not air.Spider
        and obj.__module__ == module.__name__
    ]
    if not candidates:
        return None

    # min() returns the first of equally ranked items, which matches taking
    # element 0 after a stable sort on the same key.
    return min(candidates, key=_definition_line)
35
+
36
+
37
def _load_spider(filepath, concurrency=None):
    """Load the spider file at ``filepath`` and return an instantiated Spider.

    The file is loaded with ``pandora.load_objects`` and searched with
    ``_find_spider_class``. When no Spider subclass is found, an error is
    logged and the process exits with status 1.

    Args:
        filepath: Path of the spider file to load.
        concurrency: Passed to the Spider constructor as ``concurrency`` only
            when it is not ``None``.

    Returns:
        An instance of the discovered Spider class.
    """
    from bricks.utils import pandora

    spider_cls = _find_spider_class(pandora.load_objects(filepath))
    if spider_cls is None:
        logger.error(f"未在 {filepath} 中找到 Spider 子类")
        sys.exit(1)

    init_kwargs = {} if concurrency is None else {"concurrency": concurrency}
    return spider_cls(**init_kwargs)
52
+
53
+
54
def cmd_new(args):
    """Generate a spider source file from a curl command or a bundled template.

    Reads ``name``, ``type``, ``output`` and ``curl`` from ``args``. When
    ``curl`` is set, generation is delegated to ``curl2spider``. Otherwise the
    ``<type>.tpl`` file under ``tpls/spider`` (resolved relative to this
    package) is formatted with placeholder request settings and written to
    the output path, creating the target directory if needed.

    Args:
        args: Parsed arguments of the ``new`` sub-command.

    Exits with status 1 (after logging an error) when the template file does
    not exist.
    """
    name = args.name
    form = args.type
    output = args.output or f"./{name}.py"
    curl = args.curl

    if curl:
        from bricks.utils.convert import curl2spider

        curl2spider(curl=curl, path=output, name=name, form=form)
    else:
        tpl_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), "tpls", "spider", form + ".tpl"
        )
        if not os.path.exists(tpl_path):
            logger.error(f"模板文件不存在: {tpl_path}")
            sys.exit(1)

        # Read with an explicit encoding so the result does not depend on the
        # platform's locale default (e.g. a non-UTF-8 code page on Windows).
        with open(tpl_path, encoding="utf-8") as f:
            tpl = f.read()

        content = tpl.format(
            SPIDER=name,
            URL="https://example.com",
            METHOD="GET",
            PARAMS=None,
            BODY=None,
            HEADERS=None,
            COOKIES=None,
            OPTIONS=None,
            ALLOW_REDIRECTS=True,
            PROXIES=None,
            PROXY=None,
            MAX_RETRY=5,
            USE_SESSION=False,
        )

        target_dir = os.path.dirname(output)
        if target_dir:
            # exist_ok=True already tolerates an existing directory.
            os.makedirs(target_dir, exist_ok=True)

        # Write with the same explicit encoding used for reading the template.
        with open(output, "w", encoding="utf-8") as f:
            f.write(content)

    # NOTE(review): the reviewed listing had lost its indentation; this message
    # is assumed to be emitted for both branches — confirm against the source.
    logger.debug(f"生成成功, 路径为: {output}")
101
+
102
+
103
def cmd_run(args):
    """Run a spider.

    Loads the spider from ``args.file`` (forwarding ``args.concurrency``) and
    calls its ``run`` method with ``task_name=args.task``.
    """
    _load_spider(args.file, concurrency=args.concurrency).run(task_name=args.task)
107
+
108
+
109
def cmd_serve(args):
    """Start an RPC service for a spider.

    Loads the spider from ``args.file``, wraps ``spider.run`` in an
    ``RpcProxy``, binds the spider to it, registers a ``get_stats`` adapter
    that reports the spider's counters, and starts the proxy with
    ``mode=args.mode`` and ``ident=args.port``.
    """
    from bricks.client.runner import RpcProxy

    spider = _load_spider(args.file, concurrency=args.concurrency)

    def get_stats():
        # Counters are read at call time so every request sees current values.
        return {
            "number_of_total_requests": spider.number_of_total_requests.value,
            "number_of_failure_requests": spider.number_of_failure_requests.value,
            "number_of_new_seeds": spider.number_of_new_seeds.value,
            "number_of_seeds_obtained": spider.number_of_seeds_obtained.value,
            "number_of_seeds_pending": spider.number_of_seeds_pending,
        }

    proxy = RpcProxy(spider.run)
    proxy.bind(spider)
    proxy.register_adapter("get_stats", get_stats)
    proxy.start(mode=args.mode, ident=args.port)
124
+
125
+
126
def cmd_status(args):
    """Query the status of an RPC service.

    Builds the endpoint from ``args.host`` and ``args.port``, sends ``PING``
    through the HTTP RPC client and prints the reply; on failure an error is
    logged and the process exits with status 1. It then requests
    ``get_stats`` and prints the result; a failure there is only logged as a
    warning.
    """
    from bricks.rpc.http_.service import Client

    endpoint = f"{args.host}:{args.port}"
    client = Client(endpoint)

    # Step 1: connectivity check — fatal when it fails.
    try:
        pong = client.rpc("PING")
        print(f"连接成功: {endpoint}")
        print(f"PING -> {pong.data}")
    except Exception as e:
        logger.error(f"无法连接到 {endpoint}: {e}")
        sys.exit(1)

    # Step 2: statistics — best effort.
    try:
        stats = client.rpc("get_stats").data
        if not isinstance(stats, dict):
            print(f"统计数据: {stats}")
        else:
            print("\n--- 运行统计 ---")
            for key in stats:
                print(f"  {key}: {stats[key]}")
    except Exception as e:
        logger.warning(f"获取统计数据失败: {e}")
152
+
153
+
154
def main(argv=None):
    """Entry point of the ``bricks`` command-line interface.

    Defines the ``new``, ``run``, ``serve`` and ``status`` sub-commands,
    parses the arguments and dispatches to the matching ``cmd_*`` handler.
    When no sub-command is given, the help text is printed instead.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``main()`` callers are
            unaffected; passing a list allows programmatic invocation.
    """
    parser = argparse.ArgumentParser(
        prog="bricks",
        description="Bricks 爬虫框架 CLI",
    )
    subparsers = parser.add_subparsers(dest="command")

    # bricks new
    p_new = subparsers.add_parser("new", help="生成爬虫模板文件")
    p_new.add_argument("name", help="爬虫类名")
    p_new.add_argument(
        "-t",
        "--type",
        default="air",
        choices=["air", "form", "template"],
        help="模板类型 (默认: air)",
    )
    p_new.add_argument("--curl", help="curl 命令字符串,自动填充请求配置")
    p_new.add_argument("-o", "--output", help="输出路径 (默认: ./<name>.py)")

    # bricks run
    p_run = subparsers.add_parser("run", help="运行爬虫")
    p_run.add_argument("file", help="爬虫 Python 文件路径")
    p_run.add_argument("-c", "--concurrency", type=int, help="并发数")
    p_run.add_argument("--task", default="all", help="任务名 (默认: all)")

    # bricks serve
    p_serve = subparsers.add_parser("serve", help="启动 RPC 服务")
    p_serve.add_argument("file", help="爬虫文件路径")
    p_serve.add_argument(
        "-m",
        "--mode",
        default="http",
        choices=["http", "websocket", "socket", "grpc", "redis"],
        help="RPC 模式 (默认: http)",
    )
    p_serve.add_argument("-p", "--port", type=int, default=8080, help="端口 (默认: 8080)")
    p_serve.add_argument("-c", "--concurrency", type=int, help="并发数")

    # bricks status
    p_status = subparsers.add_parser("status", help="查询 RPC 服务状态")
    p_status.add_argument("--host", default="localhost", help="主机 (默认: localhost)")
    p_status.add_argument("-p", "--port", type=int, default=8080, help="端口 (默认: 8080)")

    args = parser.parse_args(argv)

    handlers = {
        "new": cmd_new,
        "run": cmd_run,
        "serve": cmd_serve,
        "status": cmd_status,
    }

    handler = handlers.get(args.command)
    if handler is None:
        # No sub-command was supplied: show usage rather than failing.
        parser.print_help()
        return
    handler(args)