spidermanager-sdk 0.1.3.dev0__tar.gz → 0.1.5.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk.egg-info → spidermanager_sdk-0.1.5.dev0}/PKG-INFO +2 -2
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/README.md +1 -1
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/_version.py +3 -3
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/aio.py +47 -8
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/buffer.py +10 -1
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/client.py +36 -10
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/transport.py +11 -2
- spidermanager_sdk-0.1.5.dev0/src/spidermanager_sdk/utils.py +26 -0
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0/src/spidermanager_sdk.egg-info}/PKG-INFO +2 -2
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk.egg-info/SOURCES.txt +3 -0
- spidermanager_sdk-0.1.5.dev0/test_robustness.py +52 -0
- spidermanager_sdk-0.1.5.dev0/test_transport.py +27 -0
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/.github/workflows/publish.yml +0 -0
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/.gitignore +0 -0
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/LICENSE +0 -0
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/pyproject.toml +0 -0
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/setup.cfg +0 -0
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/__init__.py +0 -0
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk.egg-info/dependency_links.txt +0 -0
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk.egg-info/requires.txt +0 -0
- {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: spidermanager-sdk
|
|
3
|
-
Version: 0.1.3.dev0
|
|
3
|
+
Version: 0.1.5.dev0
|
|
4
4
|
Summary: 极简 Python SDK,将爬虫采集数据通过 HTTP 异步中转至 SpiderManager 后端
|
|
5
5
|
Author: SpiderManager Team
|
|
6
6
|
License: MIT
|
|
@@ -93,5 +93,5 @@ FlushBuffer (内存缓冲, 线程安全)
|
|
|
93
93
|
↓ 条数阈值 / 定时器触发
|
|
94
94
|
HttpTransport.send_batch()
|
|
95
95
|
↓
|
|
96
|
-
POST /api/
|
|
96
|
+
POST /api/tasks/data/ingest?task_id=xxx
|
|
97
97
|
```
|
{spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/_version.py
RENAMED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.1.
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 1,
|
|
31
|
+
__version__ = version = '0.1.5.dev0'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 5, 'dev0')
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'g5c067d7e2'
|
|
@@ -17,7 +17,7 @@ import httpx
|
|
|
17
17
|
|
|
18
18
|
from spidermanager_sdk.buffer import BufferEntry
|
|
19
19
|
from spidermanager_sdk.client import _DEFAULT_BUFFER_SIZE, _DEFAULT_FLUSH_INTERVAL
|
|
20
|
-
from spidermanager_sdk.transport import _INGEST_PATH, _DEFAULT_TIMEOUT
|
|
20
|
+
from spidermanager_sdk.transport import _INGEST_PATH, _DEFAULT_TIMEOUT, _DEFAULT_LIMITS
|
|
21
21
|
|
|
22
22
|
logger = logging.getLogger("spidermanager_sdk.aio")
|
|
23
23
|
|
|
@@ -25,15 +25,21 @@ logger = logging.getLogger("spidermanager_sdk.aio")
|
|
|
25
25
|
class AsyncHttpTransport:
|
|
26
26
|
api_url: str = ""
|
|
27
27
|
task_id: str = ""
|
|
28
|
+
host_header: str | None = None
|
|
28
29
|
_client: httpx.AsyncClient | None = field(default=None, init=False, repr=False)
|
|
29
30
|
|
|
30
31
|
async def open(self) -> None:
|
|
31
32
|
if self._client is not None:
|
|
32
33
|
return
|
|
34
|
+
headers = {"Content-Type": "application/json"}
|
|
35
|
+
if self.host_header:
|
|
36
|
+
headers["Host"] = self.host_header
|
|
37
|
+
|
|
33
38
|
self._client = httpx.AsyncClient(
|
|
34
39
|
base_url=self.api_url,
|
|
35
40
|
timeout=_DEFAULT_TIMEOUT,
|
|
36
|
-
|
|
41
|
+
limits=_DEFAULT_LIMITS,
|
|
42
|
+
headers=headers,
|
|
37
43
|
)
|
|
38
44
|
|
|
39
45
|
async def close(self) -> None:
|
|
@@ -100,6 +106,14 @@ class AsyncFlushBuffer:
|
|
|
100
106
|
if current_size >= self.max_size:
|
|
101
107
|
await self.flush()
|
|
102
108
|
|
|
109
|
+
async def rollback(self, entries: list[BufferEntry]) -> None:
|
|
110
|
+
"""
|
|
111
|
+
异步回滚:将失败数据写回缓冲区头部。
|
|
112
|
+
"""
|
|
113
|
+
async with self._lock:
|
|
114
|
+
self._entries = entries + self._entries
|
|
115
|
+
logger.debug("已异步回滚 %d 条数据至缓冲区", len(entries))
|
|
116
|
+
|
|
103
117
|
async def flush(self) -> None:
|
|
104
118
|
async with self._lock:
|
|
105
119
|
if not self._entries:
|
|
@@ -129,6 +143,7 @@ class AsyncSpiderManagerClient:
|
|
|
129
143
|
def __init__(self) -> None:
|
|
130
144
|
self._api_url: str = ""
|
|
131
145
|
self._task_id: str = ""
|
|
146
|
+
self._host_header: str | None = None
|
|
132
147
|
self._initialized: bool = False
|
|
133
148
|
self._transport: AsyncHttpTransport | None = None
|
|
134
149
|
self._buffer: AsyncFlushBuffer | None = None
|
|
@@ -138,16 +153,26 @@ class AsyncSpiderManagerClient:
|
|
|
138
153
|
api_url: str | None = None,
|
|
139
154
|
task_id: str | None = None,
|
|
140
155
|
*,
|
|
141
|
-
buffer_size: int =
|
|
156
|
+
buffer_size: int = 50,
|
|
142
157
|
flush_interval: float = _DEFAULT_FLUSH_INTERVAL,
|
|
158
|
+
resolve_dns: bool = True,
|
|
143
159
|
) -> None:
|
|
144
|
-
self._api_url = api_url or os.environ.get("SPIDER_API_URL", "")
|
|
145
|
-
self._task_id = task_id or os.environ.get("TASK_ID", "")
|
|
160
|
+
self._api_url = (api_url or os.environ.get("SPIDER_API_URL", "")).strip()
|
|
161
|
+
self._task_id = (task_id or os.environ.get("TASK_ID", "")).strip()
|
|
146
162
|
if not self._api_url or not self._task_id:
|
|
147
163
|
raise ValueError("api_url 或 task_id 未配置 (或环境变量缺失)")
|
|
148
164
|
self._api_url = self._api_url.rstrip("/")
|
|
149
165
|
|
|
150
|
-
|
|
166
|
+
# ── 1.5 DNS 预解析 ──
|
|
167
|
+
if resolve_dns:
|
|
168
|
+
from spidermanager_sdk.utils import resolve_provider_url
|
|
169
|
+
self._api_url, self._host_header = resolve_provider_url(self._api_url)
|
|
170
|
+
|
|
171
|
+
self._transport = AsyncHttpTransport(
|
|
172
|
+
api_url=self._api_url,
|
|
173
|
+
task_id=self._task_id,
|
|
174
|
+
host_header=self._host_header
|
|
175
|
+
)
|
|
151
176
|
await self._transport.open()
|
|
152
177
|
|
|
153
178
|
self._buffer = AsyncFlushBuffer(
|
|
@@ -206,8 +231,22 @@ class AsyncSpiderManagerClient:
|
|
|
206
231
|
for table_name, records in grouped.items():
|
|
207
232
|
tasks.append(self._transport.send_batch(table_name, records))
|
|
208
233
|
|
|
209
|
-
if tasks:
|
|
210
|
-
|
|
234
|
+
if not tasks:
|
|
235
|
+
return
|
|
236
|
+
|
|
237
|
+
try:
|
|
238
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
239
|
+
for res in results:
|
|
240
|
+
if res is not True:
|
|
241
|
+
# 发生异常或返回 False,触发回退
|
|
242
|
+
logger.warning("异步上报部分表失败,触发回退")
|
|
243
|
+
if self._buffer:
|
|
244
|
+
await self._buffer.rollback(entries)
|
|
245
|
+
break
|
|
246
|
+
except Exception:
|
|
247
|
+
logger.exception("异步上报发生未知异常,触发回退")
|
|
248
|
+
if self._buffer:
|
|
249
|
+
await self._buffer.rollback(entries)
|
|
211
250
|
|
|
212
251
|
# 默认全局异步单例
|
|
213
252
|
async_sdk = AsyncSpiderManagerClient()
|
{spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/buffer.py
RENAMED
|
@@ -38,7 +38,7 @@ class FlushBuffer:
|
|
|
38
38
|
on_flush : Callable[[list[BufferEntry]], None]
|
|
39
39
|
实际的 flush 回调,由 Client 层注入。
|
|
40
40
|
"""
|
|
41
|
-
max_size: int =
|
|
41
|
+
max_size: int = 50
|
|
42
42
|
flush_interval: float = 3.0
|
|
43
43
|
on_flush: Callable[[list[BufferEntry]], None] | None = None
|
|
44
44
|
|
|
@@ -77,6 +77,15 @@ class FlushBuffer:
|
|
|
77
77
|
if current_size >= self.max_size:
|
|
78
78
|
self.flush()
|
|
79
79
|
|
|
80
|
+
def rollback(self, entries: list[BufferEntry]) -> None:
|
|
81
|
+
"""
|
|
82
|
+
将上报失败的数据重新写回缓冲区头部,等待下次 flush。
|
|
83
|
+
"""
|
|
84
|
+
with self._lock:
|
|
85
|
+
# 将失败的数据插到现有数据之前
|
|
86
|
+
self._entries = entries + self._entries
|
|
87
|
+
logger.debug("已回滚 %d 条数据至缓冲区", len(entries))
|
|
88
|
+
|
|
80
89
|
def flush(self) -> None:
|
|
81
90
|
"""
|
|
82
91
|
将缓冲区中所有数据取出,交给 on_flush 回调处理。
|
{spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/client.py
RENAMED
|
@@ -45,6 +45,7 @@ class SpiderManagerClient:
|
|
|
45
45
|
def __init__(self) -> None:
|
|
46
46
|
self._api_url: str = ""
|
|
47
47
|
self._task_id: str = ""
|
|
48
|
+
self._host_header: str | None = None
|
|
48
49
|
self._initialized: bool = False
|
|
49
50
|
self._transport: HttpTransport | None = None
|
|
50
51
|
self._buffer: FlushBuffer | None = None
|
|
@@ -59,8 +60,9 @@ class SpiderManagerClient:
|
|
|
59
60
|
api_url: str | None = None,
|
|
60
61
|
task_id: str | None = None,
|
|
61
62
|
*,
|
|
62
|
-
buffer_size: int =
|
|
63
|
+
buffer_size: int = 50,
|
|
63
64
|
flush_interval: float = _DEFAULT_FLUSH_INTERVAL,
|
|
65
|
+
resolve_dns: bool = True,
|
|
64
66
|
) -> None:
|
|
65
67
|
"""
|
|
66
68
|
初始化 SDK。
|
|
@@ -77,8 +79,8 @@ class SpiderManagerClient:
|
|
|
77
79
|
时间窗口(秒),默认 3.0。
|
|
78
80
|
"""
|
|
79
81
|
# ── 1. 解析配置 ──
|
|
80
|
-
self._api_url = api_url or os.environ.get("SPIDER_API_URL", "")
|
|
81
|
-
self._task_id = task_id or os.environ.get("TASK_ID", "")
|
|
82
|
+
self._api_url = (api_url or os.environ.get("SPIDER_API_URL", "")).strip()
|
|
83
|
+
self._task_id = (task_id or os.environ.get("TASK_ID", "")).strip()
|
|
82
84
|
|
|
83
85
|
if not self._api_url:
|
|
84
86
|
raise ValueError(
|
|
@@ -94,8 +96,17 @@ class SpiderManagerClient:
|
|
|
94
96
|
# 去除尾部斜杠
|
|
95
97
|
self._api_url = self._api_url.rstrip("/")
|
|
96
98
|
|
|
99
|
+
# ── 1.5 DNS 预解析 ──
|
|
100
|
+
if resolve_dns:
|
|
101
|
+
from spidermanager_sdk.utils import resolve_provider_url
|
|
102
|
+
self._api_url, self._host_header = resolve_provider_url(self._api_url)
|
|
103
|
+
|
|
97
104
|
# ── 2. 初始化传输层 ──
|
|
98
|
-
self._transport = HttpTransport(
|
|
105
|
+
self._transport = HttpTransport(
|
|
106
|
+
api_url=self._api_url,
|
|
107
|
+
task_id=self._task_id,
|
|
108
|
+
host_header=self._host_header
|
|
109
|
+
)
|
|
99
110
|
self._transport.open()
|
|
100
111
|
|
|
101
112
|
# ── 3. 初始化缓冲区 ──
|
|
@@ -148,11 +159,21 @@ class SpiderManagerClient:
|
|
|
148
159
|
|
|
149
160
|
def shutdown(self) -> None:
|
|
150
161
|
"""
|
|
151
|
-
优雅关闭 SDK:停止定时器 → flush
|
|
162
|
+
优雅关闭 SDK:停止定时器 → 多轮强行 flush → 关闭连接。
|
|
152
163
|
"""
|
|
153
164
|
logger.info("SDK 正在关闭...")
|
|
154
165
|
if self._buffer:
|
|
155
166
|
self._buffer.stop()
|
|
167
|
+
|
|
168
|
+
# “夺命连环报”:如果停止后还有残留(由于 rollback 可能导致残留),再尝试最后一搏
|
|
169
|
+
# 这对处理 atexit 时的重试很有用
|
|
170
|
+
for attempt in range(3):
|
|
171
|
+
if self._buffer.pending_count > 0:
|
|
172
|
+
logger.info("正在执行进程退出前的最后一轮数据冲刷 (Round %d)...", attempt + 1)
|
|
173
|
+
self._buffer.flush()
|
|
174
|
+
else:
|
|
175
|
+
break
|
|
176
|
+
|
|
156
177
|
if self._transport:
|
|
157
178
|
self._transport.close()
|
|
158
179
|
self._initialized = False
|
|
@@ -194,11 +215,16 @@ class SpiderManagerClient:
|
|
|
194
215
|
grouped[entry.table_name].append(entry.data)
|
|
195
216
|
|
|
196
217
|
for table_name, records in grouped.items():
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
"表 '%s'
|
|
201
|
-
|
|
218
|
+
try:
|
|
219
|
+
success = self._transport.send_batch(table_name, records)
|
|
220
|
+
if not success:
|
|
221
|
+
logger.error("表 '%s' 上报失败,触发回退", table_name)
|
|
222
|
+
self._buffer.rollback(entries)
|
|
223
|
+
break
|
|
224
|
+
except Exception:
|
|
225
|
+
logger.exception("上报过程发生异常,触发回退")
|
|
226
|
+
self._buffer.rollback(entries)
|
|
227
|
+
break
|
|
202
228
|
|
|
203
229
|
def _register_exit_hooks(self) -> None:
|
|
204
230
|
"""
|
{spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/transport.py
RENAMED
|
@@ -17,11 +17,14 @@ import httpx
|
|
|
17
17
|
logger = logging.getLogger("spidermanager_sdk.transport")
|
|
18
18
|
|
|
19
19
|
# 上报接口固定路径
|
|
20
|
-
_INGEST_PATH: str = "/api/
|
|
20
|
+
_INGEST_PATH: str = "/api/tasks/data/ingest"
|
|
21
21
|
|
|
22
22
|
# 默认超时配置(连接 / 读取 / 写入 / 总计)
|
|
23
23
|
_DEFAULT_TIMEOUT = httpx.Timeout(connect=5.0, read=10.0, write=10.0, pool=30.0)
|
|
24
24
|
|
|
25
|
+
# 默认连接池限制:增加保活连接数,减少高频上报下的 DNS 解析压力
|
|
26
|
+
_DEFAULT_LIMITS = httpx.Limits(max_connections=50, max_keepalive_connections=20)
|
|
27
|
+
|
|
25
28
|
|
|
26
29
|
@dataclass
|
|
27
30
|
class HttpTransport:
|
|
@@ -37,6 +40,7 @@ class HttpTransport:
|
|
|
37
40
|
"""
|
|
38
41
|
api_url: str = ""
|
|
39
42
|
task_id: str = ""
|
|
43
|
+
host_header: str | None = None
|
|
40
44
|
|
|
41
45
|
_client: httpx.Client | None = field(default=None, init=False, repr=False)
|
|
42
46
|
|
|
@@ -48,10 +52,15 @@ class HttpTransport:
|
|
|
48
52
|
"""初始化底层 httpx 连接池。"""
|
|
49
53
|
if self._client is not None:
|
|
50
54
|
return
|
|
55
|
+
headers = {"Content-Type": "application/json"}
|
|
56
|
+
if self.host_header:
|
|
57
|
+
headers["Host"] = self.host_header
|
|
58
|
+
|
|
51
59
|
self._client = httpx.Client(
|
|
52
60
|
base_url=self.api_url,
|
|
53
61
|
timeout=_DEFAULT_TIMEOUT,
|
|
54
|
-
|
|
62
|
+
limits=_DEFAULT_LIMITS,
|
|
63
|
+
headers=headers,
|
|
55
64
|
)
|
|
56
65
|
logger.debug("HTTP transport opened → %s", self.api_url)
|
|
57
66
|
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import socket
|
|
2
|
+
import logging
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger("spidermanager_sdk.utils")
|
|
6
|
+
|
|
7
|
+
def resolve_provider_url(api_url: str) -> tuple[str, str | None]:
|
|
8
|
+
"""
|
|
9
|
+
将 URL 中的域名解析为 IP,绕过 Docker DNS 抖动。
|
|
10
|
+
返回: (基于IP的URL, 原始Hostname)
|
|
11
|
+
"""
|
|
12
|
+
parsed = urlparse(api_url)
|
|
13
|
+
hostname = parsed.hostname
|
|
14
|
+
if not hostname:
|
|
15
|
+
return api_url, None
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
# 仅在初始化时执行一次同步解析
|
|
19
|
+
ip = socket.gethostbyname(hostname)
|
|
20
|
+
port = f":{parsed.port}" if parsed.port else ""
|
|
21
|
+
# 重新构建 URL,保留协议和路径,替换域名为 IP
|
|
22
|
+
resolved_url = f"{parsed.scheme}://{ip}{port}"
|
|
23
|
+
return resolved_url, hostname
|
|
24
|
+
except Exception as e:
|
|
25
|
+
logger.warning(f"DNS 预解析失败: {e},将回退到原始地址")
|
|
26
|
+
return api_url, hostname
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: spidermanager-sdk
|
|
3
|
-
Version: 0.1.3.dev0
|
|
3
|
+
Version: 0.1.5.dev0
|
|
4
4
|
Summary: 极简 Python SDK,将爬虫采集数据通过 HTTP 异步中转至 SpiderManager 后端
|
|
5
5
|
Author: SpiderManager Team
|
|
6
6
|
License: MIT
|
|
@@ -93,5 +93,5 @@ FlushBuffer (内存缓冲, 线程安全)
|
|
|
93
93
|
↓ 条数阈值 / 定时器触发
|
|
94
94
|
HttpTransport.send_batch()
|
|
95
95
|
↓
|
|
96
|
-
POST /api/
|
|
96
|
+
POST /api/tasks/data/ingest?task_id=xxx
|
|
97
97
|
```
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
LICENSE
|
|
3
3
|
README.md
|
|
4
4
|
pyproject.toml
|
|
5
|
+
test_robustness.py
|
|
6
|
+
test_transport.py
|
|
5
7
|
.github/workflows/publish.yml
|
|
6
8
|
src/spidermanager_sdk/__init__.py
|
|
7
9
|
src/spidermanager_sdk/_version.py
|
|
@@ -9,6 +11,7 @@ src/spidermanager_sdk/aio.py
|
|
|
9
11
|
src/spidermanager_sdk/buffer.py
|
|
10
12
|
src/spidermanager_sdk/client.py
|
|
11
13
|
src/spidermanager_sdk/transport.py
|
|
14
|
+
src/spidermanager_sdk/utils.py
|
|
12
15
|
src/spidermanager_sdk.egg-info/PKG-INFO
|
|
13
16
|
src/spidermanager_sdk.egg-info/SOURCES.txt
|
|
14
17
|
src/spidermanager_sdk.egg-info/dependency_links.txt
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
from unittest.mock import MagicMock, patch
|
|
3
|
+
from spidermanager_sdk.buffer import BufferEntry, FlushBuffer
|
|
4
|
+
from spidermanager_sdk.client import SpiderManagerClient
|
|
5
|
+
|
|
6
|
+
class TestRobustness(unittest.TestCase):
    """Tests for rollback-on-failure and the DNS pre-resolution wiring."""

    def test_buffer_rollback(self):
        """Rolled-back entries are queued ahead of entries already buffered."""
        buffer = FlushBuffer(max_size=10)
        entries = [BufferEntry("test", {"id": i}) for i in range(5)]

        # Five entries stay below max_size, so add() does not flush on its own.
        for e in entries:
            buffer.add(e)
        self.assertEqual(buffer.pending_count, 5)

        # Roll back one "failed" entry on top of the pending ones.
        failed_entries = [BufferEntry("test", {"id": "failed"})]
        buffer.rollback(failed_entries)

        self.assertEqual(buffer.pending_count, 6)

        # rollback() prepends, so the failed entry must now be at the head.
        self.assertIs(buffer._entries[0], failed_entries[0])

    @patch("spidermanager_sdk.transport.HttpTransport.send_batch")
    def test_client_rollback_on_failure(self, mock_send):
        """A failed send_batch puts the flushed entry back into the buffer."""
        mock_send.return_value = False  # Simulate failure

        client = SpiderManagerClient()
        # Only send_batch is patched; resolve_dns=False skips the DNS lookup.
        # NOTE(review): the client is never shut down here - confirm that no
        # background timer or exit hook outlives the test.
        client.init(api_url="http://localhost:8000", task_id="test", resolve_dns=False)

        client.insert("test", {"data": "foo"})
        client.flush()

        # Since it failed, data should be rolled back to buffer
        self.assertEqual(client.pending_count, 1)

    @patch("spidermanager_sdk.utils.resolve_provider_url")
    def test_dns_resolution_integration(self, mock_resolve):
        """init() stores the resolved URL and passes the Host value to the transport."""
        mock_resolve.return_value = ("http://1.2.3.4:8000", "my-host")

        client = SpiderManagerClient()
        client.init(api_url="http://my-host:8000", task_id="test", resolve_dns=True)

        self.assertEqual(client._api_url, "http://1.2.3.4:8000")
        self.assertEqual(client._host_header, "my-host")
        self.assertEqual(client._transport.host_header, "my-host")
|
|
50
|
+
|
|
51
|
+
if __name__ == "__main__":
|
|
52
|
+
unittest.main()
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
from unittest.mock import patch, MagicMock
|
|
3
|
+
from spidermanager_sdk.transport import HttpTransport, _INGEST_PATH
|
|
4
|
+
|
|
5
|
+
class TestTransport(unittest.TestCase):
    """Checks the request that HttpTransport.send_batch hands to httpx."""

    @patch("httpx.Client.post")
    def test_send_batch_url(self, mock_post):
        """send_batch posts to the ingest path with task_id as a query parameter."""
        # Stub the HTTP layer with a 200 response.
        mock_post.return_value = MagicMock(status_code=200)

        sender = HttpTransport(api_url="http://test_backend:8000", task_id="test_task_123")
        outcome = sender.send_batch("test_table", [{"col1": "val1"}])

        self.assertTrue(outcome)
        mock_post.assert_called_once()

        # Inspect the single recorded call: path, JSON body, query string.
        positional, keyword = mock_post.call_args
        expected_body = {"table_name": "test_table", "data": [{"col1": "val1"}]}
        expected_query = {"task_id": "test_task_123"}

        self.assertEqual(positional[0], "/api/tasks/data/ingest")
        self.assertEqual(keyword["json"], expected_body)
        self.assertEqual(keyword["params"], expected_query)
        print("Test passed: URL is correctly formed as /api/tasks/data/ingest")
|
|
25
|
+
|
|
26
|
+
if __name__ == "__main__":
|
|
27
|
+
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|