spidermanager-sdk 0.1.3.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spidermanager_sdk-0.1.3.dev0/.github/workflows/publish.yml +46 -0
- spidermanager_sdk-0.1.3.dev0/.gitignore +15 -0
- spidermanager_sdk-0.1.3.dev0/LICENSE +21 -0
- spidermanager_sdk-0.1.3.dev0/PKG-INFO +97 -0
- spidermanager_sdk-0.1.3.dev0/README.md +82 -0
- spidermanager_sdk-0.1.3.dev0/pyproject.toml +31 -0
- spidermanager_sdk-0.1.3.dev0/setup.cfg +4 -0
- spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk/__init__.py +24 -0
- spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk/_version.py +34 -0
- spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk/aio.py +213 -0
- spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk/buffer.py +126 -0
- spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk/client.py +238 -0
- spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk/transport.py +119 -0
- spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk.egg-info/PKG-INFO +97 -0
- spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk.egg-info/SOURCES.txt +16 -0
- spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk.egg-info/dependency_links.txt +1 -0
- spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk.egg-info/requires.txt +5 -0
- spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- 'v*' # 当你推送以 v 开头的标签时触发,如 v0.1.0
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build-n-publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
# 这一步是为了安全,建议使用 PyPI 官方推荐的受信任发布者(Trusted Publishers)
|
|
12
|
+
permissions:
|
|
13
|
+
id-token: write
|
|
14
|
+
contents: write
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- name: Checkout code
|
|
18
|
+
uses: actions/checkout@v4
|
|
19
|
+
with:
|
|
20
|
+
fetch-depth: 0 # 必须设为 0,setuptools-scm 需要完整的 Git 历史来计算版本
|
|
21
|
+
|
|
22
|
+
- name: Set up Python
|
|
23
|
+
uses: actions/setup-python@v5
|
|
24
|
+
with:
|
|
25
|
+
python-version: '3.10'
|
|
26
|
+
|
|
27
|
+
- name: Install dependencies
|
|
28
|
+
run: |
|
|
29
|
+
python -m pip install --upgrade pip
|
|
30
|
+
pip install build
|
|
31
|
+
|
|
32
|
+
- name: Build package
|
|
33
|
+
run: python -m build
|
|
34
|
+
|
|
35
|
+
- name: Publish to PyPI
|
|
36
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
37
|
+
# 推荐使用 Trusted Publishers,不再需要手动在 Secrets 里存 Token
|
|
38
|
+
# 只需要在 PyPI 后台关联这个 GitHub 仓库即可
|
|
39
|
+
|
|
40
|
+
- name: Create GitHub Release
|
|
41
|
+
uses: softprops/action-gh-release@v2
|
|
42
|
+
with:
|
|
43
|
+
files: dist/*
|
|
44
|
+
generate_release_notes: true # 自动根据 commit 记录生成更新日志
|
|
45
|
+
env:
|
|
46
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Rosia
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: spidermanager-sdk
|
|
3
|
+
Version: 0.1.3.dev0
|
|
4
|
+
Summary: 极简 Python SDK,将爬虫采集数据通过 HTTP 异步中转至 SpiderManager 后端
|
|
5
|
+
Author: SpiderManager Team
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: httpx>=0.25.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
13
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
|
|
14
|
+
Dynamic: license-file
|
|
15
|
+
|
|
16
|
+
# SpiderManager SDK
|
|
17
|
+
|
|
18
|
+
极简 Python SDK,将爬虫采集数据通过 HTTP 异步中转至 SpiderManager 后端,实现爬虫逻辑与数据存储的完全解耦。
|
|
19
|
+
|
|
20
|
+
## 特性
|
|
21
|
+
|
|
22
|
+
- **零配置启动**:自动从环境变量 `TASK_ID` / `SPIDER_API_URL` 读取配置
|
|
23
|
+
- **异步缓冲**:内存 buffer 按阈值(20条)或时间窗口(3秒)批量上报,减少网络 IO
|
|
24
|
+
- **容器安全**:通过 `atexit` + `SIGTERM` 双保险,确保 Docker 销毁前 flush 全部数据
|
|
25
|
+
- **最小依赖**:仅依赖 `httpx`,不侵入爬虫业务代码
|
|
26
|
+
|
|
27
|
+
## 安装
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install spidermanager-sdk
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## 快速开始
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from spidermanager_sdk import sdk
|
|
37
|
+
|
|
38
|
+
# 初始化(容器中自动读取环境变量,本地开发可手动指定)
|
|
39
|
+
sdk.init(api_url="http://localhost:8000", task_id="task-001")
|
|
40
|
+
|
|
41
|
+
# 上报数据(会自动缓冲、批量上报)
|
|
42
|
+
sdk.insert("articles", {"title": "Hello", "url": "https://example.com"})
|
|
43
|
+
|
|
44
|
+
# 批量上报
|
|
45
|
+
sdk.insert("articles", [
|
|
46
|
+
{"title": "A", "url": "https://a.com"},
|
|
47
|
+
{"title": "B", "url": "https://b.com"},
|
|
48
|
+
])
|
|
49
|
+
|
|
50
|
+
# 程序结束时自动 flush,也可手动触发
|
|
51
|
+
sdk.flush()
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## 配置选项
|
|
55
|
+
|
|
56
|
+
| 参数 | 环境变量 | 默认值 | 说明 |
|
|
57
|
+
|------|----------|--------|------|
|
|
58
|
+
| `api_url` | `SPIDER_API_URL` | — | 后端地址 |
|
|
59
|
+
| `task_id` | `TASK_ID` | — | 任务 ID |
|
|
60
|
+
| `buffer_size` | — | `20` | 缓冲条数阈值 |
|
|
61
|
+
| `flush_interval` | — | `3.0` | 时间窗口(秒) |
|
|
62
|
+
|
|
63
|
+
## 异步 API (Asyncio)
|
|
64
|
+
|
|
65
|
+
由于爬虫开发经常使用 `httpx`, `aiohttp`, `Playwright` 等异步工具,SDK 也提供了原生的 Async 接口。
|
|
66
|
+
推荐使用 `async with` 上下文管理器,离开上下文时将自动触发 flush。
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import asyncio
|
|
70
|
+
from spidermanager_sdk.aio import async_sdk
|
|
71
|
+
|
|
72
|
+
async def main():
|
|
73
|
+
# 自动读取环境变量并初始化,退出时自动 flush
|
|
74
|
+
async with async_sdk:
|
|
75
|
+
await async_sdk.insert("articles", {"title": "Async Data", "url": "https://a.com"})
|
|
76
|
+
|
|
77
|
+
# 批量插入
|
|
78
|
+
await async_sdk.insert("articles", [
|
|
79
|
+
{"title": "B", "url": "https://b.com"},
|
|
80
|
+
{"title": "C", "url": "https://c.com"},
|
|
81
|
+
])
|
|
82
|
+
|
|
83
|
+
if __name__ == "__main__":
|
|
84
|
+
asyncio.run(main())
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## 架构
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
sdk.insert()
|
|
91
|
+
↓
|
|
92
|
+
FlushBuffer (内存缓冲, 线程安全)
|
|
93
|
+
↓ 条数阈值 / 定时器触发
|
|
94
|
+
HttpTransport.send_batch()
|
|
95
|
+
↓
|
|
96
|
+
POST /api/v1/tasks/data/ingest?task_id=xxx
|
|
97
|
+
```
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# SpiderManager SDK
|
|
2
|
+
|
|
3
|
+
极简 Python SDK,将爬虫采集数据通过 HTTP 异步中转至 SpiderManager 后端,实现爬虫逻辑与数据存储的完全解耦。
|
|
4
|
+
|
|
5
|
+
## 特性
|
|
6
|
+
|
|
7
|
+
- **零配置启动**:自动从环境变量 `TASK_ID` / `SPIDER_API_URL` 读取配置
|
|
8
|
+
- **异步缓冲**:内存 buffer 按阈值(20条)或时间窗口(3秒)批量上报,减少网络 IO
|
|
9
|
+
- **容器安全**:通过 `atexit` + `SIGTERM` 双保险,确保 Docker 销毁前 flush 全部数据
|
|
10
|
+
- **最小依赖**:仅依赖 `httpx`,不侵入爬虫业务代码
|
|
11
|
+
|
|
12
|
+
## 安装
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install spidermanager-sdk
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## 快速开始
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from spidermanager_sdk import sdk
|
|
22
|
+
|
|
23
|
+
# 初始化(容器中自动读取环境变量,本地开发可手动指定)
|
|
24
|
+
sdk.init(api_url="http://localhost:8000", task_id="task-001")
|
|
25
|
+
|
|
26
|
+
# 上报数据(会自动缓冲、批量上报)
|
|
27
|
+
sdk.insert("articles", {"title": "Hello", "url": "https://example.com"})
|
|
28
|
+
|
|
29
|
+
# 批量上报
|
|
30
|
+
sdk.insert("articles", [
|
|
31
|
+
{"title": "A", "url": "https://a.com"},
|
|
32
|
+
{"title": "B", "url": "https://b.com"},
|
|
33
|
+
])
|
|
34
|
+
|
|
35
|
+
# 程序结束时自动 flush,也可手动触发
|
|
36
|
+
sdk.flush()
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## 配置选项
|
|
40
|
+
|
|
41
|
+
| 参数 | 环境变量 | 默认值 | 说明 |
|
|
42
|
+
|------|----------|--------|------|
|
|
43
|
+
| `api_url` | `SPIDER_API_URL` | — | 后端地址 |
|
|
44
|
+
| `task_id` | `TASK_ID` | — | 任务 ID |
|
|
45
|
+
| `buffer_size` | — | `20` | 缓冲条数阈值 |
|
|
46
|
+
| `flush_interval` | — | `3.0` | 时间窗口(秒) |
|
|
47
|
+
|
|
48
|
+
## 异步 API (Asyncio)
|
|
49
|
+
|
|
50
|
+
由于爬虫开发经常使用 `httpx`, `aiohttp`, `Playwright` 等异步工具,SDK 也提供了原生的 Async 接口。
|
|
51
|
+
推荐使用 `async with` 上下文管理器,离开上下文时将自动触发 flush。
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
import asyncio
|
|
55
|
+
from spidermanager_sdk.aio import async_sdk
|
|
56
|
+
|
|
57
|
+
async def main():
|
|
58
|
+
# 自动读取环境变量并初始化,退出时自动 flush
|
|
59
|
+
async with async_sdk:
|
|
60
|
+
await async_sdk.insert("articles", {"title": "Async Data", "url": "https://a.com"})
|
|
61
|
+
|
|
62
|
+
# 批量插入
|
|
63
|
+
await async_sdk.insert("articles", [
|
|
64
|
+
{"title": "B", "url": "https://b.com"},
|
|
65
|
+
{"title": "C", "url": "https://c.com"},
|
|
66
|
+
])
|
|
67
|
+
|
|
68
|
+
if __name__ == "__main__":
|
|
69
|
+
asyncio.run(main())
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## 架构
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
sdk.insert()
|
|
76
|
+
↓
|
|
77
|
+
FlushBuffer (内存缓冲, 线程安全)
|
|
78
|
+
↓ 条数阈值 / 定时器触发
|
|
79
|
+
HttpTransport.send_batch()
|
|
80
|
+
↓
|
|
81
|
+
POST /api/v1/tasks/data/ingest?task_id=xxx
|
|
82
|
+
```
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "setuptools_scm>=8.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "spidermanager-sdk"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "极简 Python SDK,将爬虫采集数据通过 HTTP 异步中转至 SpiderManager 后端"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "SpiderManager Team"},
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"httpx>=0.25.0",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.optional-dependencies]
|
|
20
|
+
dev = [
|
|
21
|
+
"pytest>=7.0",
|
|
22
|
+
"pytest-asyncio>=0.21",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[tool.setuptools.packages.find]
|
|
26
|
+
where = ["src"]
|
|
27
|
+
|
|
28
|
+
[tool.setuptools_scm]
|
|
29
|
+
version_file = "src/spidermanager_sdk/_version.py"
|
|
30
|
+
# 强制不显示 +xxx 这种本地版本后缀,确保符合 PyPI 标准
|
|
31
|
+
local_scheme = "no-local-version"
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SpiderManager SDK — 极简爬虫数据上报库
|
|
3
|
+
|
|
4
|
+
典型用法::
|
|
5
|
+
|
|
6
|
+
from spidermanager_sdk import sdk
|
|
7
|
+
|
|
8
|
+
sdk.init() # 自动读取环境变量
|
|
9
|
+
sdk.insert("articles", {"title": "..."}) # 内部缓冲, 达到阈值后异步上报
|
|
10
|
+
sdk.flush() # 手动强制上报(一般不需要)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from spidermanager_sdk.client import SpiderManagerClient
|
|
14
|
+
from spidermanager_sdk.aio import async_sdk, AsyncSpiderManagerClient
|
|
15
|
+
|
|
16
|
+
# ── 全局单例,用户直接操作此对象 ──
|
|
17
|
+
sdk: SpiderManagerClient = SpiderManagerClient()
|
|
18
|
+
|
|
19
|
+
__all__ = ["sdk", "SpiderManagerClient", "async_sdk", "AsyncSpiderManagerClient"]
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
from spidermanager_sdk._version import version as __version__
|
|
23
|
+
except ImportError:
|
|
24
|
+
__version__ = "0.0.0"
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
TYPE_CHECKING = False
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from typing import Tuple
|
|
16
|
+
from typing import Union
|
|
17
|
+
|
|
18
|
+
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
20
|
+
else:
|
|
21
|
+
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
23
|
+
|
|
24
|
+
version: str
|
|
25
|
+
__version__: str
|
|
26
|
+
__version_tuple__: VERSION_TUPLE
|
|
27
|
+
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
30
|
+
|
|
31
|
+
__version__ = version = '0.1.3.dev0'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 3, 'dev0')
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = 'g2385ac666'
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SpiderManager SDK 异步客户端 (asyncio)
|
|
3
|
+
|
|
4
|
+
提供原生的 async/await 支持,适用于各类异步爬虫框架 (如 httpx, aiohttp, playwright)。
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import logging
|
|
11
|
+
import os
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from typing import Any, Callable
|
|
15
|
+
|
|
16
|
+
import httpx
|
|
17
|
+
|
|
18
|
+
from spidermanager_sdk.buffer import BufferEntry
|
|
19
|
+
from spidermanager_sdk.client import _DEFAULT_BUFFER_SIZE, _DEFAULT_FLUSH_INTERVAL
|
|
20
|
+
from spidermanager_sdk.transport import _INGEST_PATH, _DEFAULT_TIMEOUT
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger("spidermanager_sdk.aio")
|
|
23
|
+
|
|
24
|
+
@dataclass
class AsyncHttpTransport:
    """Async HTTP transport that POSTs record batches to the backend.

    Attributes
    ----------
    api_url:
        Backend base URL, e.g. ``http://backend:8000``.
    task_id:
        Task identifier sent as a query parameter on every request.
    """

    api_url: str = ""
    task_id: str = ""
    _client: httpx.AsyncClient | None = field(default=None, init=False, repr=False)

    async def open(self) -> None:
        """Create the underlying httpx client (idempotent)."""
        if self._client is None:
            self._client = httpx.AsyncClient(
                base_url=self.api_url,
                timeout=_DEFAULT_TIMEOUT,
                headers={"Content-Type": "application/json"},
            )

    async def close(self) -> None:
        """Dispose of the client, swallowing any shutdown error."""
        client, self._client = self._client, None
        if client is not None:
            try:
                await client.aclose()
            except Exception:
                pass

    async def send_batch(self, table_name: str, records: list[dict[str, Any]]) -> bool:
        """POST one batch of records destined for ``table_name``.

        Returns ``True`` on HTTP 200; ``False`` on any other status or a
        transport-level exception (failures are logged, never raised).
        """
        if not self._client:
            await self.open()
        assert self._client is not None

        payload: dict[str, Any] = {"table_name": table_name, "data": records}
        params: dict[str, str] = {"task_id": self.task_id}

        try:
            response = await self._client.post(_INGEST_PATH, json=payload, params=params)
        except Exception as exc:
            logger.error("上报异常: %s", exc)
            return False
        if response.status_code == 200:
            return True
        logger.warning("上报失败 HTTP %d", response.status_code)
        return False
|
|
66
|
+
|
|
67
|
+
@dataclass
class AsyncFlushBuffer:
    """Asyncio-native buffer mirroring the threaded ``FlushBuffer``.

    Entries accumulate under an ``asyncio.Lock`` and are handed to the
    ``on_flush`` callback (sync or async) when the size threshold is
    reached or the periodic background task fires.
    """

    max_size: int = 20
    flush_interval: float = 3.0
    on_flush: Callable[[list[BufferEntry]], Any] | None = None

    # ── internal state ──
    _entries: list[BufferEntry] = field(default_factory=list, init=False, repr=False)
    _lock: asyncio.Lock = field(default_factory=asyncio.Lock, init=False, repr=False)
    _task: asyncio.Task[None] | None = field(default=None, init=False, repr=False)
    _started: bool = field(default=False, init=False, repr=False)

    async def start(self) -> None:
        """Spawn the periodic flush task (idempotent)."""
        if not self._started:
            self._started = True
            self._task = asyncio.create_task(self._loop())

    async def stop(self) -> None:
        """Cancel the periodic task, then flush any remaining entries."""
        self._started = False
        task, self._task = self._task, None
        if task:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
        await self.flush()

    async def add(self, entry: BufferEntry) -> None:
        """Append one record; flush immediately once the threshold is hit."""
        async with self._lock:
            self._entries.append(entry)
            pending = len(self._entries)

        # flush() re-acquires the (non-reentrant) lock, so call it here.
        if pending >= self.max_size:
            await self.flush()

    async def flush(self) -> None:
        """Drain the buffer and call/await ``on_flush`` with the batch."""
        async with self._lock:
            if not self._entries:
                return
            batch = list(self._entries)
            self._entries.clear()

        if self.on_flush:
            try:
                result = self.on_flush(batch)
                if asyncio.iscoroutine(result):
                    await result
            except Exception:
                logger.exception("flush 处理失败")

    async def _loop(self) -> None:
        """Periodic flush loop; exits cleanly on cancellation."""
        while self._started:
            try:
                await asyncio.sleep(self.flush_interval)
                await self.flush()
            except asyncio.CancelledError:
                break
            except Exception:
                logger.exception("异步缓冲区定时器异常")
|
|
127
|
+
|
|
128
|
+
class AsyncSpiderManagerClient:
    """Asyncio counterpart of the synchronous ``SpiderManagerClient``.

    Usage::

        async with async_sdk:
            await async_sdk.insert("articles", {...})

    Entering the context auto-initializes from the environment
    (``SPIDER_API_URL`` / ``TASK_ID``); leaving it flushes and shuts down.
    """

    def __init__(self) -> None:
        self._api_url: str = ""
        self._task_id: str = ""
        self._initialized: bool = False
        self._transport: AsyncHttpTransport | None = None
        self._buffer: AsyncFlushBuffer | None = None

    async def init(
        self,
        api_url: str | None = None,
        task_id: str | None = None,
        *,
        buffer_size: int = _DEFAULT_BUFFER_SIZE,
        flush_interval: float = _DEFAULT_FLUSH_INTERVAL,
    ) -> None:
        """Resolve configuration, open the transport, start the buffer.

        Raises
        ------
        ValueError
            If neither the arguments nor the environment variables
            provide ``api_url`` / ``task_id``.
        """
        self._api_url = api_url or os.environ.get("SPIDER_API_URL", "")
        self._task_id = task_id or os.environ.get("TASK_ID", "")
        if not self._api_url or not self._task_id:
            raise ValueError("api_url 或 task_id 未配置 (或环境变量缺失)")
        self._api_url = self._api_url.rstrip("/")

        self._transport = AsyncHttpTransport(api_url=self._api_url, task_id=self._task_id)
        await self._transport.open()

        self._buffer = AsyncFlushBuffer(
            max_size=buffer_size,
            flush_interval=flush_interval,
            on_flush=self._handle_flush,
        )
        await self._buffer.start()
        self._initialized = True
        logger.info("Async SDK 初始化: url=%s task=%s", self._api_url, self._task_id)

    async def insert(self, table_name: str, data: dict[str, Any] | list[dict[str, Any]]) -> None:
        """Buffer one record (dict) or several records (list of dicts)."""
        self._ensure_initialized()
        records = [data] if isinstance(data, dict) else data
        if not records:
            return
        assert self._buffer is not None
        for record in records:
            await self._buffer.add(BufferEntry(table_name=table_name, data=record))

    async def flush(self) -> None:
        """Force an immediate upload of everything currently buffered."""
        if self._buffer:
            await self._buffer.flush()

    async def shutdown(self) -> None:
        """Stop the buffer (flushing leftovers) and close the transport."""
        if self._buffer:
            await self._buffer.stop()
        if self._transport:
            await self._transport.close()
        self._initialized = False

    async def __aenter__(self):
        # Auto-initialize on first entry for convenience.
        if not self._initialized:
            await self.init()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self._initialized:
            await self.shutdown()

    def _ensure_initialized(self) -> None:
        if not self._initialized:
            raise RuntimeError("SDK 尚未初始化")

    async def _handle_flush(self, entries: list[BufferEntry]) -> None:
        """Group a drained batch by table and upload the groups concurrently."""
        if not self._transport:
            return
        grouped: dict[str, list[dict[str, Any]]] = defaultdict(list)
        for entry in entries:
            grouped[entry.table_name].append(entry.data)

        # Fan out one request per table.
        sends = [
            self._transport.send_batch(table, records)
            for table, records in grouped.items()
        ]
        if sends:
            await asyncio.gather(*sends)


# Module-level async singleton.
async_sdk = AsyncSpiderManagerClient()
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""
|
|
2
|
+
异步缓冲区模块
|
|
3
|
+
|
|
4
|
+
维护内存 buffer,在满足条件(数据量阈值 或 时间窗口)时
|
|
5
|
+
触发一次批量 HTTP POST,减少爬虫端网络 IO 开销。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import threading
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Callable
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger("spidermanager_sdk.buffer")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class BufferEntry:
    """A single buffered record, produced by one ``sdk.insert()`` call."""

    # Destination table on the backend.
    table_name: str
    # Raw record payload exactly as supplied by the caller.
    data: dict[str, object]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class FlushBuffer:
    """Thread-safe in-memory buffer that batches records for upload.

    Records accumulate until either ``max_size`` entries are pending or
    ``flush_interval`` seconds elapse; the whole batch is then handed to
    the ``on_flush`` callback injected by the client layer.

    Parameters
    ----------
    max_size : int
        Entry-count threshold that triggers an immediate flush.
    flush_interval : float
        Time window in seconds; a flush fires even below ``max_size``.
    on_flush : Callable[[list[BufferEntry]], None] | None
        Callback performing the actual upload, injected by the client.
    """

    max_size: int = 20
    flush_interval: float = 3.0
    on_flush: Callable[[list[BufferEntry]], None] | None = None

    # ── internal state ──
    _entries: list[BufferEntry] = field(default_factory=list, init=False, repr=False)
    _lock: threading.Lock = field(default_factory=threading.Lock, init=False, repr=False)
    _timer: threading.Timer | None = field(default=None, init=False, repr=False)
    _started: bool = field(default=False, init=False, repr=False)

    # ──────────────────────────────────────────────
    # Public API
    # ──────────────────────────────────────────────

    def start(self) -> None:
        """Begin the periodic flush cycle (idempotent)."""
        if self._started:
            return
        self._started = True
        self._schedule_timer()

    def stop(self) -> None:
        """Cancel the timer and flush whatever is still buffered."""
        self._started = False
        self._cancel_timer()
        self.flush()

    def add(self, entry: BufferEntry) -> None:
        """Append one record; flush immediately once the threshold is hit."""
        with self._lock:
            self._entries.append(entry)
            pending = len(self._entries)

        # flush() re-acquires the (non-reentrant) lock, so call it here.
        if pending >= self.max_size:
            self.flush()

    def flush(self) -> None:
        """Drain the buffer and pass the whole batch to ``on_flush``.

        Thread-safe; may be invoked by the timer, the size threshold,
        or an atexit hook, in any combination.
        """
        with self._lock:
            if not self._entries:
                return
            batch = list(self._entries)
            self._entries.clear()

        if not self.on_flush:
            logger.warning("on_flush 回调未设置,%d 条数据被丢弃", len(batch))
            return

        try:
            self.on_flush(batch)
        except Exception:
            logger.exception("flush 回调执行失败,%d 条数据可能丢失", len(batch))

    @property
    def pending_count(self) -> int:
        """Number of entries currently waiting to be flushed."""
        with self._lock:
            return len(self._entries)

    # ──────────────────────────────────────────────
    # Internal helpers
    # ──────────────────────────────────────────────

    def _schedule_timer(self) -> None:
        """Arm the next periodic flush, unless the buffer was stopped."""
        if not self._started:
            return
        timer = threading.Timer(self.flush_interval, self._on_timer_tick)
        timer.daemon = True  # never keep the interpreter alive
        timer.start()
        self._timer = timer

    def _cancel_timer(self) -> None:
        if self._timer is not None:
            self._timer.cancel()
            self._timer = None

    def _on_timer_tick(self) -> None:
        """Timer callback: flush, then re-arm the timer."""
        self.flush()
        self._schedule_timer()
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SpiderManager SDK 客户端
|
|
3
|
+
|
|
4
|
+
对外暴露三个核心方法::
|
|
5
|
+
|
|
6
|
+
sdk.init(api_url=None, task_id=None)
|
|
7
|
+
sdk.insert(table_name, data)
|
|
8
|
+
sdk.flush()
|
|
9
|
+
|
|
10
|
+
内部协调 Buffer ↔ Transport 之间的数据流转,
|
|
11
|
+
并通过 atexit + signal 保证容器销毁前数据完整性。
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import atexit
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
import signal
|
|
20
|
+
import sys
|
|
21
|
+
from collections import defaultdict
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
from spidermanager_sdk.buffer import BufferEntry, FlushBuffer
|
|
25
|
+
from spidermanager_sdk.transport import HttpTransport
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger("spidermanager_sdk.client")
|
|
28
|
+
|
|
29
|
+
# ── 默认缓冲参数 ──
|
|
30
|
+
_DEFAULT_BUFFER_SIZE: int = 20
|
|
31
|
+
_DEFAULT_FLUSH_INTERVAL: float = 3.0
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class SpiderManagerClient:
    """Main synchronous SDK client.

    Responsibilities:
    1. Hold the resolved configuration (``api_url`` / ``task_id``).
    2. Own the in-memory flush buffer.
    3. Register exit hooks so no buffered data is lost on teardown.
    4. Turn user ``insert()`` calls into batched HTTP POSTs.
    """

    def __init__(self) -> None:
        self._api_url: str = ""
        self._task_id: str = ""
        self._initialized: bool = False
        self._transport: HttpTransport | None = None
        self._buffer: FlushBuffer | None = None
        self._atexit_registered: bool = False

    # ──────────────────────────────────────────────
    # Public API
    # ──────────────────────────────────────────────

    def init(
        self,
        api_url: str | None = None,
        task_id: str | None = None,
        *,
        buffer_size: int = _DEFAULT_BUFFER_SIZE,
        flush_interval: float = _DEFAULT_FLUSH_INTERVAL,
    ) -> None:
        """Initialize the SDK.

        Parameters
        ----------
        api_url : str | None
            Backend address; falls back to env var ``SPIDER_API_URL``.
        task_id : str | None
            Task identifier; falls back to env var ``TASK_ID``.
        buffer_size : int
            Entry-count threshold that triggers a flush (default 20).
        flush_interval : float
            Periodic flush window in seconds (default 3.0).

        Raises
        ------
        ValueError
            If either setting is missing from both arguments and env.
        """
        # 1. Resolve configuration.
        self._api_url = api_url or os.environ.get("SPIDER_API_URL", "")
        self._task_id = task_id or os.environ.get("TASK_ID", "")

        if not self._api_url:
            raise ValueError(
                "api_url 未指定且环境变量 SPIDER_API_URL 未设置,"
                "请通过 sdk.init(api_url='...') 或设置环境变量来配置后端地址。"
            )
        if not self._task_id:
            raise ValueError(
                "task_id 未指定且环境变量 TASK_ID 未设置,"
                "请通过 sdk.init(task_id='...') 或设置环境变量来配置任务 ID。"
            )

        # Normalize: drop any trailing slash.
        self._api_url = self._api_url.rstrip("/")

        # 2. Transport layer.
        self._transport = HttpTransport(api_url=self._api_url, task_id=self._task_id)
        self._transport.open()

        # 3. Buffer layer.
        self._buffer = FlushBuffer(
            max_size=buffer_size,
            flush_interval=flush_interval,
            on_flush=self._handle_flush,
        )
        self._buffer.start()

        # 4. Exit hooks (atexit + SIGTERM).
        self._register_exit_hooks()

        self._initialized = True
        logger.info(
            "SDK 初始化完成: api_url=%s, task_id=%s, buffer_size=%d, interval=%.1fs",
            self._api_url, self._task_id, buffer_size, flush_interval,
        )

    def insert(self, table_name: str, data: dict[str, Any] | list[dict[str, Any]]) -> None:
        """Queue one record (dict) or many records (list of dicts).

        Parameters
        ----------
        table_name : str
            Destination table on the backend.
        data : dict | list[dict]
            One record or a batch of records.
        """
        self._ensure_initialized()

        records = [data] if isinstance(data, dict) else data
        if not records:
            return

        assert self._buffer is not None
        for record in records:
            self._buffer.add(BufferEntry(table_name=table_name, data=record))

    def flush(self) -> None:
        """Immediately push everything buffered to the backend.

        Typically called just before program exit, or whenever the
        caller needs prompt visibility of the data.
        """
        if self._buffer:
            self._buffer.flush()

    def shutdown(self) -> None:
        """Graceful teardown: stop timer → flush leftovers → close HTTP."""
        logger.info("SDK 正在关闭...")
        if self._buffer:
            self._buffer.stop()
        if self._transport:
            self._transport.close()
        self._initialized = False
        logger.info("SDK 已关闭")

    @property
    def is_initialized(self) -> bool:
        return self._initialized

    @property
    def pending_count(self) -> int:
        """Entries currently buffered and awaiting upload."""
        return self._buffer.pending_count if self._buffer else 0

    # ──────────────────────────────────────────────
    # Internal helpers
    # ──────────────────────────────────────────────

    def _ensure_initialized(self) -> None:
        if not self._initialized:
            raise RuntimeError(
                "SDK 尚未初始化,请先调用 sdk.init() 进行配置。"
            )

    def _handle_flush(self, entries: list[BufferEntry]) -> None:
        """FlushBuffer callback: group entries by table and upload each group."""
        if not self._transport:
            logger.warning("Transport 未初始化,%d 条数据被丢弃", len(entries))
            return

        grouped: dict[str, list[dict[str, Any]]] = defaultdict(list)
        for entry in entries:
            grouped[entry.table_name].append(entry.data)

        for table_name, records in grouped.items():
            if not self._transport.send_batch(table_name, records):
                logger.error(
                    "表 '%s' 的 %d 条数据上报失败", table_name, len(records),
                )

    def _register_exit_hooks(self) -> None:
        """Install atexit + SIGTERM hooks so buffered data survives teardown."""
        if self._atexit_registered:
            return

        # Normal interpreter exit.
        atexit.register(self.shutdown)

        # Docker sends SIGTERM on stop; handlers may only be installed
        # from the main thread, hence the guard below.
        if _can_register_signal():
            previous = signal.getsignal(signal.SIGTERM)

            def _sigterm_handler(signum: int, frame: object) -> None:
                logger.info("收到 SIGTERM 信号,正在 flush 缓冲数据...")
                self.shutdown()
                # Defer to any pre-existing handler before exiting.
                if callable(previous):
                    previous(signum, frame)
                sys.exit(0)

            signal.signal(signal.SIGTERM, _sigterm_handler)

        self._atexit_registered = True
        logger.debug("退出钩子已注册")
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _can_register_signal() -> bool:
|
|
233
|
+
"""
|
|
234
|
+
判断当前环境是否允许注册信号处理。
|
|
235
|
+
只有主线程才能注册 signal handler。
|
|
236
|
+
"""
|
|
237
|
+
import threading
|
|
238
|
+
return threading.current_thread() is threading.main_thread()
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HTTP 传输层
|
|
3
|
+
|
|
4
|
+
负责将序列化后的数据批量 POST 到 SpiderManager 后端。
|
|
5
|
+
使用 httpx 同步客户端,确保在 atexit / 信号处理等
|
|
6
|
+
非 async 上下文中也能可靠发送。
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import httpx
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger("spidermanager_sdk.transport")
|
|
18
|
+
|
|
19
|
+
# 上报接口固定路径
|
|
20
|
+
_INGEST_PATH: str = "/api/v1/tasks/data/ingest"
|
|
21
|
+
|
|
22
|
+
# 默认超时配置(连接 / 读取 / 写入 / 连接池获取)
_DEFAULT_TIMEOUT = httpx.Timeout(connect=5.0, read=10.0, write=10.0, pool=30.0)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class HttpTransport:
    """
    Synchronous HTTP transport client.

    A blocking client is used deliberately so sends remain reliable in
    non-async contexts such as ``atexit`` and signal handlers.

    Parameters
    ----------
    api_url : str
        Root URL of the SpiderManager backend, e.g. ``http://backend:8000``.
    task_id : str
        Current task ID, passed as a query parameter on every request.
    """
    api_url: str = ""
    task_id: str = ""

    # Lazily-created connection pool; excluded from __init__ and repr.
    _client: httpx.Client | None = field(default=None, init=False, repr=False)

    # ──────────────────────────────────────────────
    # Lifecycle
    # ──────────────────────────────────────────────

    def open(self) -> None:
        """Initialise the underlying httpx connection pool (idempotent)."""
        if self._client is not None:
            return
        self._client = httpx.Client(
            base_url=self.api_url,
            timeout=_DEFAULT_TIMEOUT,
            headers={"Content-Type": "application/json"},
        )
        logger.debug("HTTP transport opened → %s", self.api_url)

    def close(self) -> None:
        """Close the connection pool and release resources (safe to call twice)."""
        if self._client is not None:
            try:
                self._client.close()
            except Exception:
                # Best effort: closing must never raise during exit hooks.
                logger.debug("关闭 HTTP 客户端时出现异常", exc_info=True)
            finally:
                self._client = None

    # ──────────────────────────────────────────────
    # Data reporting
    # ──────────────────────────────────────────────

    def send_batch(self, table_name: str, records: list[dict[str, Any]]) -> bool:
        """
        Send one batch of records to the backend.

        Parameters
        ----------
        table_name : str
            Target table name.
        records : list[dict[str, Any]]
            The data records to ingest.

        Returns
        -------
        bool
            True on success, False otherwise. Never raises — the caller
            decides how to react to failures.
        """
        if not self._client:
            self.open()
        assert self._client is not None  # narrowed by open()

        url = _INGEST_PATH
        payload: dict[str, Any] = {
            "table_name": table_name,
            "data": records,
        }
        params: dict[str, str] = {"task_id": self.task_id}

        try:
            response = self._client.post(url, json=payload, params=params)
            # Accept any 2xx status: an ingest endpoint may legitimately
            # answer 201/202/204, not only 200.
            if response.is_success:
                logger.debug(
                    "上报成功: table=%s, count=%d", table_name, len(records),
                )
                return True
            logger.warning(
                "上报失败 HTTP %d: %s", response.status_code, response.text[:200],
            )
            return False
        except httpx.TimeoutException:
            logger.error("上报超时: table=%s, count=%d", table_name, len(records))
            return False
        except httpx.ConnectError as exc:
            logger.error("连接后端失败 (%s): %s", self.api_url, exc)
            return False
        except Exception:
            logger.exception("上报数据时发生未知异常")
            return False
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: spidermanager-sdk
|
|
3
|
+
Version: 0.1.3.dev0
|
|
4
|
+
Summary: 极简 Python SDK,将爬虫采集数据通过 HTTP 异步中转至 SpiderManager 后端
|
|
5
|
+
Author: SpiderManager Team
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: httpx>=0.25.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
13
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
|
|
14
|
+
Dynamic: license-file
|
|
15
|
+
|
|
16
|
+
# SpiderManager SDK
|
|
17
|
+
|
|
18
|
+
极简 Python SDK,将爬虫采集数据通过 HTTP 异步中转至 SpiderManager 后端,实现爬虫逻辑与数据存储的完全解耦。
|
|
19
|
+
|
|
20
|
+
## 特性
|
|
21
|
+
|
|
22
|
+
- **零配置启动**:自动从环境变量 `TASK_ID` / `SPIDER_API_URL` 读取配置
|
|
23
|
+
- **异步缓冲**:内存 buffer 按阈值(20条)或时间窗口(3秒)批量上报,减少网络 IO
|
|
24
|
+
- **容器安全**:通过 `atexit` + `SIGTERM` 双保险,确保 Docker 销毁前 flush 全部数据
|
|
25
|
+
- **最小依赖**:仅依赖 `httpx`,不侵入爬虫业务代码
|
|
26
|
+
|
|
27
|
+
## 安装
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install spidermanager-sdk
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## 快速开始
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from spidermanager_sdk import sdk
|
|
37
|
+
|
|
38
|
+
# 初始化(容器中自动读取环境变量,本地开发可手动指定)
|
|
39
|
+
sdk.init(api_url="http://localhost:8000", task_id="task-001")
|
|
40
|
+
|
|
41
|
+
# 上报数据(会自动缓冲、批量上报)
|
|
42
|
+
sdk.insert("articles", {"title": "Hello", "url": "https://example.com"})
|
|
43
|
+
|
|
44
|
+
# 批量上报
|
|
45
|
+
sdk.insert("articles", [
|
|
46
|
+
{"title": "A", "url": "https://a.com"},
|
|
47
|
+
{"title": "B", "url": "https://b.com"},
|
|
48
|
+
])
|
|
49
|
+
|
|
50
|
+
# 程序结束时自动 flush,也可手动触发
|
|
51
|
+
sdk.flush()
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## 配置选项
|
|
55
|
+
|
|
56
|
+
| 参数 | 环境变量 | 默认值 | 说明 |
|
|
57
|
+
|------|----------|--------|------|
|
|
58
|
+
| `api_url` | `SPIDER_API_URL` | — | 后端地址 |
|
|
59
|
+
| `task_id` | `TASK_ID` | — | 任务 ID |
|
|
60
|
+
| `buffer_size` | — | `20` | 缓冲条数阈值 |
|
|
61
|
+
| `flush_interval` | — | `3.0` | 时间窗口(秒) |
|
|
62
|
+
|
|
63
|
+
## 异步 API (Asyncio)
|
|
64
|
+
|
|
65
|
+
由于爬虫开发经常使用 `httpx`, `aiohttp`, `Playwright` 等异步工具,SDK 也提供了原生的 Async 接口。
|
|
66
|
+
推荐使用 `async with` 上下文管理器,离开上下文时将自动触发 flush。
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import asyncio
|
|
70
|
+
from spidermanager_sdk.aio import async_sdk
|
|
71
|
+
|
|
72
|
+
async def main():
|
|
73
|
+
# 自动读取环境变量并初始化,退出时自动 flush
|
|
74
|
+
async with async_sdk:
|
|
75
|
+
await async_sdk.insert("articles", {"title": "Async Data", "url": "https://a.com"})
|
|
76
|
+
|
|
77
|
+
# 批量插入
|
|
78
|
+
await async_sdk.insert("articles", [
|
|
79
|
+
{"title": "B", "url": "https://b.com"},
|
|
80
|
+
{"title": "C", "url": "https://c.com"},
|
|
81
|
+
])
|
|
82
|
+
|
|
83
|
+
if __name__ == "__main__":
|
|
84
|
+
asyncio.run(main())
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## 架构
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
sdk.insert()
|
|
91
|
+
↓
|
|
92
|
+
FlushBuffer (内存缓冲, 线程安全)
|
|
93
|
+
↓ 条数阈值 / 定时器触发
|
|
94
|
+
HttpTransport.send_batch()
|
|
95
|
+
↓
|
|
96
|
+
POST /api/v1/tasks/data/ingest?task_id=xxx
|
|
97
|
+
```
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
.gitignore
|
|
2
|
+
LICENSE
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
.github/workflows/publish.yml
|
|
6
|
+
src/spidermanager_sdk/__init__.py
|
|
7
|
+
src/spidermanager_sdk/_version.py
|
|
8
|
+
src/spidermanager_sdk/aio.py
|
|
9
|
+
src/spidermanager_sdk/buffer.py
|
|
10
|
+
src/spidermanager_sdk/client.py
|
|
11
|
+
src/spidermanager_sdk/transport.py
|
|
12
|
+
src/spidermanager_sdk.egg-info/PKG-INFO
|
|
13
|
+
src/spidermanager_sdk.egg-info/SOURCES.txt
|
|
14
|
+
src/spidermanager_sdk.egg-info/dependency_links.txt
|
|
15
|
+
src/spidermanager_sdk.egg-info/requires.txt
|
|
16
|
+
src/spidermanager_sdk.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
spidermanager_sdk
|