spidermanager-sdk 0.1.3.dev0__tar.gz → 0.1.5.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21) hide show
  1. {spidermanager_sdk-0.1.3.dev0/src/spidermanager_sdk.egg-info → spidermanager_sdk-0.1.5.dev0}/PKG-INFO +2 -2
  2. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/README.md +1 -1
  3. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/_version.py +3 -3
  4. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/aio.py +47 -8
  5. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/buffer.py +10 -1
  6. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/client.py +36 -10
  7. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/transport.py +11 -2
  8. spidermanager_sdk-0.1.5.dev0/src/spidermanager_sdk/utils.py +26 -0
  9. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0/src/spidermanager_sdk.egg-info}/PKG-INFO +2 -2
  10. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk.egg-info/SOURCES.txt +3 -0
  11. spidermanager_sdk-0.1.5.dev0/test_robustness.py +52 -0
  12. spidermanager_sdk-0.1.5.dev0/test_transport.py +27 -0
  13. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/.github/workflows/publish.yml +0 -0
  14. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/.gitignore +0 -0
  15. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/LICENSE +0 -0
  16. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/pyproject.toml +0 -0
  17. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/setup.cfg +0 -0
  18. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk/__init__.py +0 -0
  19. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk.egg-info/dependency_links.txt +0 -0
  20. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk.egg-info/requires.txt +0 -0
  21. {spidermanager_sdk-0.1.3.dev0 → spidermanager_sdk-0.1.5.dev0}/src/spidermanager_sdk.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: spidermanager-sdk
3
- Version: 0.1.3.dev0
3
+ Version: 0.1.5.dev0
4
4
  Summary: 极简 Python SDK,将爬虫采集数据通过 HTTP 异步中转至 SpiderManager 后端
5
5
  Author: SpiderManager Team
6
6
  License: MIT
@@ -93,5 +93,5 @@ FlushBuffer (内存缓冲, 线程安全)
93
93
  ↓ 条数阈值 / 定时器触发
94
94
  HttpTransport.send_batch()
95
95
 
96
- POST /api/v1/tasks/data/ingest?task_id=xxx
96
+ POST /api/tasks/data/ingest?task_id=xxx
97
97
  ```
@@ -78,5 +78,5 @@ FlushBuffer (内存缓冲, 线程安全)
78
78
  ↓ 条数阈值 / 定时器触发
79
79
  HttpTransport.send_batch()
80
80
 
81
- POST /api/v1/tasks/data/ingest?task_id=xxx
81
+ POST /api/tasks/data/ingest?task_id=xxx
82
82
  ```
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.1.3.dev0'
32
- __version_tuple__ = version_tuple = (0, 1, 3, 'dev0')
31
+ __version__ = version = '0.1.5.dev0'
32
+ __version_tuple__ = version_tuple = (0, 1, 5, 'dev0')
33
33
 
34
- __commit_id__ = commit_id = 'g2385ac666'
34
+ __commit_id__ = commit_id = 'g5c067d7e2'
@@ -17,7 +17,7 @@ import httpx
17
17
 
18
18
  from spidermanager_sdk.buffer import BufferEntry
19
19
  from spidermanager_sdk.client import _DEFAULT_BUFFER_SIZE, _DEFAULT_FLUSH_INTERVAL
20
- from spidermanager_sdk.transport import _INGEST_PATH, _DEFAULT_TIMEOUT
20
+ from spidermanager_sdk.transport import _INGEST_PATH, _DEFAULT_TIMEOUT, _DEFAULT_LIMITS
21
21
 
22
22
  logger = logging.getLogger("spidermanager_sdk.aio")
23
23
 
@@ -25,15 +25,21 @@ logger = logging.getLogger("spidermanager_sdk.aio")
25
25
  class AsyncHttpTransport:
26
26
  api_url: str = ""
27
27
  task_id: str = ""
28
+ host_header: str | None = None
28
29
  _client: httpx.AsyncClient | None = field(default=None, init=False, repr=False)
29
30
 
30
31
  async def open(self) -> None:
31
32
  if self._client is not None:
32
33
  return
34
+ headers = {"Content-Type": "application/json"}
35
+ if self.host_header:
36
+ headers["Host"] = self.host_header
37
+
33
38
  self._client = httpx.AsyncClient(
34
39
  base_url=self.api_url,
35
40
  timeout=_DEFAULT_TIMEOUT,
36
- headers={"Content-Type": "application/json"},
41
+ limits=_DEFAULT_LIMITS,
42
+ headers=headers,
37
43
  )
38
44
 
39
45
  async def close(self) -> None:
@@ -100,6 +106,14 @@ class AsyncFlushBuffer:
100
106
  if current_size >= self.max_size:
101
107
  await self.flush()
102
108
 
109
+ async def rollback(self, entries: list[BufferEntry]) -> None:
110
+ """
111
+ 异步回滚:将失败数据写回缓冲区头部。
112
+ """
113
+ async with self._lock:
114
+ self._entries = entries + self._entries
115
+ logger.debug("已异步回滚 %d 条数据至缓冲区", len(entries))
116
+
103
117
  async def flush(self) -> None:
104
118
  async with self._lock:
105
119
  if not self._entries:
@@ -129,6 +143,7 @@ class AsyncSpiderManagerClient:
129
143
  def __init__(self) -> None:
130
144
  self._api_url: str = ""
131
145
  self._task_id: str = ""
146
+ self._host_header: str | None = None
132
147
  self._initialized: bool = False
133
148
  self._transport: AsyncHttpTransport | None = None
134
149
  self._buffer: AsyncFlushBuffer | None = None
@@ -138,16 +153,26 @@ class AsyncSpiderManagerClient:
138
153
  api_url: str | None = None,
139
154
  task_id: str | None = None,
140
155
  *,
141
- buffer_size: int = _DEFAULT_BUFFER_SIZE,
156
+ buffer_size: int = 50,
142
157
  flush_interval: float = _DEFAULT_FLUSH_INTERVAL,
158
+ resolve_dns: bool = True,
143
159
  ) -> None:
144
- self._api_url = api_url or os.environ.get("SPIDER_API_URL", "")
145
- self._task_id = task_id or os.environ.get("TASK_ID", "")
160
+ self._api_url = (api_url or os.environ.get("SPIDER_API_URL", "")).strip()
161
+ self._task_id = (task_id or os.environ.get("TASK_ID", "")).strip()
146
162
  if not self._api_url or not self._task_id:
147
163
  raise ValueError("api_url 或 task_id 未配置 (或环境变量缺失)")
148
164
  self._api_url = self._api_url.rstrip("/")
149
165
 
150
- self._transport = AsyncHttpTransport(api_url=self._api_url, task_id=self._task_id)
166
+ # ── 1.5 DNS 预解析 ──
167
+ if resolve_dns:
168
+ from spidermanager_sdk.utils import resolve_provider_url
169
+ self._api_url, self._host_header = resolve_provider_url(self._api_url)
170
+
171
+ self._transport = AsyncHttpTransport(
172
+ api_url=self._api_url,
173
+ task_id=self._task_id,
174
+ host_header=self._host_header
175
+ )
151
176
  await self._transport.open()
152
177
 
153
178
  self._buffer = AsyncFlushBuffer(
@@ -206,8 +231,22 @@ class AsyncSpiderManagerClient:
206
231
  for table_name, records in grouped.items():
207
232
  tasks.append(self._transport.send_batch(table_name, records))
208
233
 
209
- if tasks:
210
- await asyncio.gather(*tasks)
234
+ if not tasks:
235
+ return
236
+
237
+ try:
238
+ results = await asyncio.gather(*tasks, return_exceptions=True)
239
+ for res in results:
240
+ if res is not True:
241
+ # 发生异常或返回 False,触发回退
242
+ logger.warning("异步上报部分表失败,触发回退")
243
+ if self._buffer:
244
+ await self._buffer.rollback(entries)
245
+ break
246
+ except Exception:
247
+ logger.exception("异步上报发生未知异常,触发回退")
248
+ if self._buffer:
249
+ await self._buffer.rollback(entries)
211
250
 
212
251
  # 默认全局异步单例
213
252
  async_sdk = AsyncSpiderManagerClient()
@@ -38,7 +38,7 @@ class FlushBuffer:
38
38
  on_flush : Callable[[list[BufferEntry]], None]
39
39
  实际的 flush 回调,由 Client 层注入。
40
40
  """
41
- max_size: int = 20
41
+ max_size: int = 50
42
42
  flush_interval: float = 3.0
43
43
  on_flush: Callable[[list[BufferEntry]], None] | None = None
44
44
 
@@ -77,6 +77,15 @@ class FlushBuffer:
77
77
  if current_size >= self.max_size:
78
78
  self.flush()
79
79
 
80
+ def rollback(self, entries: list[BufferEntry]) -> None:
81
+ """
82
+ 将上报失败的数据重新写回缓冲区头部,等待下次 flush。
83
+ """
84
+ with self._lock:
85
+ # 将失败的数据插到现有数据之前
86
+ self._entries = entries + self._entries
87
+ logger.debug("已回滚 %d 条数据至缓冲区", len(entries))
88
+
80
89
  def flush(self) -> None:
81
90
  """
82
91
  将缓冲区中所有数据取出,交给 on_flush 回调处理。
@@ -45,6 +45,7 @@ class SpiderManagerClient:
45
45
  def __init__(self) -> None:
46
46
  self._api_url: str = ""
47
47
  self._task_id: str = ""
48
+ self._host_header: str | None = None
48
49
  self._initialized: bool = False
49
50
  self._transport: HttpTransport | None = None
50
51
  self._buffer: FlushBuffer | None = None
@@ -59,8 +60,9 @@ class SpiderManagerClient:
59
60
  api_url: str | None = None,
60
61
  task_id: str | None = None,
61
62
  *,
62
- buffer_size: int = _DEFAULT_BUFFER_SIZE,
63
+ buffer_size: int = 50,
63
64
  flush_interval: float = _DEFAULT_FLUSH_INTERVAL,
65
+ resolve_dns: bool = True,
64
66
  ) -> None:
65
67
  """
66
68
  初始化 SDK。
@@ -77,8 +79,8 @@ class SpiderManagerClient:
77
79
  时间窗口(秒),默认 3.0。
78
80
  """
79
81
  # ── 1. 解析配置 ──
80
- self._api_url = api_url or os.environ.get("SPIDER_API_URL", "")
81
- self._task_id = task_id or os.environ.get("TASK_ID", "")
82
+ self._api_url = (api_url or os.environ.get("SPIDER_API_URL", "")).strip()
83
+ self._task_id = (task_id or os.environ.get("TASK_ID", "")).strip()
82
84
 
83
85
  if not self._api_url:
84
86
  raise ValueError(
@@ -94,8 +96,17 @@ class SpiderManagerClient:
94
96
  # 去除尾部斜杠
95
97
  self._api_url = self._api_url.rstrip("/")
96
98
 
99
+ # ── 1.5 DNS 预解析 ──
100
+ if resolve_dns:
101
+ from spidermanager_sdk.utils import resolve_provider_url
102
+ self._api_url, self._host_header = resolve_provider_url(self._api_url)
103
+
97
104
  # ── 2. 初始化传输层 ──
98
- self._transport = HttpTransport(api_url=self._api_url, task_id=self._task_id)
105
+ self._transport = HttpTransport(
106
+ api_url=self._api_url,
107
+ task_id=self._task_id,
108
+ host_header=self._host_header
109
+ )
99
110
  self._transport.open()
100
111
 
101
112
  # ── 3. 初始化缓冲区 ──
@@ -148,11 +159,21 @@ class SpiderManagerClient:
148
159
 
149
160
  def shutdown(self) -> None:
150
161
  """
151
- 优雅关闭 SDK:停止定时器 → flush 剩余数据 关闭 HTTP 连接。
162
+ 优雅关闭 SDK:停止定时器 → 多轮强行 flush → 关闭连接。
152
163
  """
153
164
  logger.info("SDK 正在关闭...")
154
165
  if self._buffer:
155
166
  self._buffer.stop()
167
+
168
+ # “夺命连环报”:如果停止后还有残留(由于 rollback 可能导致残留),再尝试最后一搏
169
+ # 这对处理 atexit 时的重试很有用
170
+ for attempt in range(3):
171
+ if self._buffer.pending_count > 0:
172
+ logger.info("正在执行进程退出前的最后一轮数据冲刷 (Round %d)...", attempt + 1)
173
+ self._buffer.flush()
174
+ else:
175
+ break
176
+
156
177
  if self._transport:
157
178
  self._transport.close()
158
179
  self._initialized = False
@@ -194,11 +215,16 @@ class SpiderManagerClient:
194
215
  grouped[entry.table_name].append(entry.data)
195
216
 
196
217
  for table_name, records in grouped.items():
197
- success = self._transport.send_batch(table_name, records)
198
- if not success:
199
- logger.error(
200
- "表 '%s' 的 %d 条数据上报失败", table_name, len(records),
201
- )
218
+ try:
219
+ success = self._transport.send_batch(table_name, records)
220
+ if not success:
221
+ logger.error("表 '%s' 上报失败,触发回退", table_name)
222
+ self._buffer.rollback(entries)
223
+ break
224
+ except Exception:
225
+ logger.exception("上报过程发生异常,触发回退")
226
+ self._buffer.rollback(entries)
227
+ break
202
228
 
203
229
  def _register_exit_hooks(self) -> None:
204
230
  """
@@ -17,11 +17,14 @@ import httpx
17
17
  logger = logging.getLogger("spidermanager_sdk.transport")
18
18
 
19
19
  # 上报接口固定路径
20
- _INGEST_PATH: str = "/api/v1/tasks/data/ingest"
20
+ _INGEST_PATH: str = "/api/tasks/data/ingest"
21
21
 
22
22
  # 默认超时配置(连接 / 读取 / 写入 / 总计)
23
23
  _DEFAULT_TIMEOUT = httpx.Timeout(connect=5.0, read=10.0, write=10.0, pool=30.0)
24
24
 
25
+ # 默认连接池限制:增加保活连接数,减少高频上报下的 DNS 解析压力
26
+ _DEFAULT_LIMITS = httpx.Limits(max_connections=50, max_keepalive_connections=20)
27
+
25
28
 
26
29
  @dataclass
27
30
  class HttpTransport:
@@ -37,6 +40,7 @@ class HttpTransport:
37
40
  """
38
41
  api_url: str = ""
39
42
  task_id: str = ""
43
+ host_header: str | None = None
40
44
 
41
45
  _client: httpx.Client | None = field(default=None, init=False, repr=False)
42
46
 
@@ -48,10 +52,15 @@ class HttpTransport:
48
52
  """初始化底层 httpx 连接池。"""
49
53
  if self._client is not None:
50
54
  return
55
+ headers = {"Content-Type": "application/json"}
56
+ if self.host_header:
57
+ headers["Host"] = self.host_header
58
+
51
59
  self._client = httpx.Client(
52
60
  base_url=self.api_url,
53
61
  timeout=_DEFAULT_TIMEOUT,
54
- headers={"Content-Type": "application/json"},
62
+ limits=_DEFAULT_LIMITS,
63
+ headers=headers,
55
64
  )
56
65
  logger.debug("HTTP transport opened → %s", self.api_url)
57
66
 
@@ -0,0 +1,26 @@
1
+ import socket
2
+ import logging
3
+ from urllib.parse import urlparse
4
+
5
+ logger = logging.getLogger("spidermanager_sdk.utils")
6
+
7
+ def resolve_provider_url(api_url: str) -> tuple[str, str | None]:
8
+ """
9
+ 将 URL 中的域名解析为 IP,绕过 Docker DNS 抖动。
10
+ 返回: (基于IP的URL, 原始Hostname)
11
+ """
12
+ parsed = urlparse(api_url)
13
+ hostname = parsed.hostname
14
+ if not hostname:
15
+ return api_url, None
16
+
17
+ try:
18
+ # 仅在初始化时执行一次同步解析
19
+ ip = socket.gethostbyname(hostname)
20
+ port = f":{parsed.port}" if parsed.port else ""
21
+ # 重新构建 URL,保留协议和路径,替换域名为 IP
22
+ resolved_url = f"{parsed.scheme}://{ip}{port}"
23
+ return resolved_url, hostname
24
+ except Exception as e:
25
+ logger.warning(f"DNS 预解析失败: {e},将回退到原始地址")
26
+ return api_url, hostname
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: spidermanager-sdk
3
- Version: 0.1.3.dev0
3
+ Version: 0.1.5.dev0
4
4
  Summary: 极简 Python SDK,将爬虫采集数据通过 HTTP 异步中转至 SpiderManager 后端
5
5
  Author: SpiderManager Team
6
6
  License: MIT
@@ -93,5 +93,5 @@ FlushBuffer (内存缓冲, 线程安全)
93
93
  ↓ 条数阈值 / 定时器触发
94
94
  HttpTransport.send_batch()
95
95
 
96
- POST /api/v1/tasks/data/ingest?task_id=xxx
96
+ POST /api/tasks/data/ingest?task_id=xxx
97
97
  ```
@@ -2,6 +2,8 @@
2
2
  LICENSE
3
3
  README.md
4
4
  pyproject.toml
5
+ test_robustness.py
6
+ test_transport.py
5
7
  .github/workflows/publish.yml
6
8
  src/spidermanager_sdk/__init__.py
7
9
  src/spidermanager_sdk/_version.py
@@ -9,6 +11,7 @@ src/spidermanager_sdk/aio.py
9
11
  src/spidermanager_sdk/buffer.py
10
12
  src/spidermanager_sdk/client.py
11
13
  src/spidermanager_sdk/transport.py
14
+ src/spidermanager_sdk/utils.py
12
15
  src/spidermanager_sdk.egg-info/PKG-INFO
13
16
  src/spidermanager_sdk.egg-info/SOURCES.txt
14
17
  src/spidermanager_sdk.egg-info/dependency_links.txt
@@ -0,0 +1,52 @@
1
+ import unittest
2
+ from unittest.mock import MagicMock, patch
3
+ from spidermanager_sdk.buffer import BufferEntry, FlushBuffer
4
+ from spidermanager_sdk.client import SpiderManagerClient
5
+
6
+ class TestRobustness(unittest.TestCase):
7
+ def test_buffer_rollback(self):
8
+ buffer = FlushBuffer(max_size=10)
9
+ entries = [BufferEntry("test", {"id": i}) for i in range(5)]
10
+
11
+ # Initial entries
12
+ for e in entries:
13
+ buffer.add(e)
14
+ self.assertEqual(buffer.pending_count, 5)
15
+
16
+ # Rollback some failed entries
17
+ failed_entries = [BufferEntry("test", {"id": "failed"})]
18
+ buffer.rollback(failed_entries)
19
+
20
+ # Failed entries should be at the front
21
+ self.assertEqual(buffer.pending_count, 6)
22
+
23
+ # Verify ordering (simplistic check)
24
+ # In a real test we'd access private _entries but let's just check count
25
+
26
+ @patch("spidermanager_sdk.transport.HttpTransport.send_batch")
27
+ def test_client_rollback_on_failure(self, mock_send):
28
+ mock_send.return_value = False # Simulate failure
29
+
30
+ client = SpiderManagerClient()
31
+ # Mocking transport and buffer to avoid real IO
32
+ client.init(api_url="http://localhost:8000", task_id="test", resolve_dns=False)
33
+
34
+ client.insert("test", {"data": "foo"})
35
+ client.flush()
36
+
37
+ # Since it failed, data should be rolled back to buffer
38
+ self.assertEqual(client.pending_count, 1)
39
+
40
+ @patch("spidermanager_sdk.utils.resolve_provider_url")
41
+ def test_dns_resolution_integration(self, mock_resolve):
42
+ mock_resolve.return_value = ("http://1.2.3.4:8000", "my-host")
43
+
44
+ client = SpiderManagerClient()
45
+ client.init(api_url="http://my-host:8000", task_id="test", resolve_dns=True)
46
+
47
+ self.assertEqual(client._api_url, "http://1.2.3.4:8000")
48
+ self.assertEqual(client._host_header, "my-host")
49
+ self.assertEqual(client._transport.host_header, "my-host")
50
+
51
+ if __name__ == "__main__":
52
+ unittest.main()
@@ -0,0 +1,27 @@
1
+ import unittest
2
+ from unittest.mock import patch, MagicMock
3
+ from spidermanager_sdk.transport import HttpTransport, _INGEST_PATH
4
+
5
+ class TestTransport(unittest.TestCase):
6
+ @patch("httpx.Client.post")
7
+ def test_send_batch_url(self, mock_post):
8
+ mock_response = MagicMock()
9
+ mock_response.status_code = 200
10
+ mock_post.return_value = mock_response
11
+
12
+ transport = HttpTransport(api_url="http://test_backend:8000", task_id="test_task_123")
13
+
14
+ # 验证发送批量数据时使用的 URL 是否正确
15
+ success = transport.send_batch("test_table", [{"col1": "val1"}])
16
+
17
+ self.assertTrue(success)
18
+ mock_post.assert_called_once()
19
+ args, kwargs = mock_post.call_args
20
+
21
+ self.assertEqual(args[0], "/api/tasks/data/ingest")
22
+ self.assertEqual(kwargs["json"], {"table_name": "test_table", "data": [{"col1": "val1"}]})
23
+ self.assertEqual(kwargs["params"], {"task_id": "test_task_123"})
24
+ print("Test passed: URL is correctly formed as /api/tasks/data/ingest")
25
+
26
+ if __name__ == "__main__":
27
+ unittest.main()