undetecta 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- undetecta-0.1.0/.gitignore +55 -0
- undetecta-0.1.0/DESIGN.md +581 -0
- undetecta-0.1.0/PKG-INFO +375 -0
- undetecta-0.1.0/README.md +351 -0
- undetecta-0.1.0/pyproject.toml +77 -0
- undetecta-0.1.0/src/undetecta/__init__.py +120 -0
- undetecta-0.1.0/src/undetecta/_transport.py +217 -0
- undetecta-0.1.0/src/undetecta/client.py +307 -0
- undetecta-0.1.0/src/undetecta/errors.py +203 -0
- undetecta-0.1.0/src/undetecta/types.py +493 -0
- undetecta-0.1.0/tests/__init__.py +1 -0
- undetecta-0.1.0/tests/conftest.py +6 -0
- undetecta-0.1.0/tests/test_client.py +498 -0
- undetecta-0.1.0/tests/test_errors.py +320 -0
- undetecta-0.1.0/tests/test_transport.py +337 -0
- undetecta-0.1.0/tests/test_types.py +605 -0
- undetecta-0.1.0/uv.lock +966 -0
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Dependencies
|
|
2
|
+
node_modules/
|
|
3
|
+
.pnpm-store/
|
|
4
|
+
|
|
5
|
+
# Build outputs
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
|
|
9
|
+
# Python
|
|
10
|
+
__pycache__/
|
|
11
|
+
*.py[cod]
|
|
12
|
+
.venv/
|
|
13
|
+
*.egg-info/
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
.coverage
|
|
16
|
+
htmlcov/
|
|
17
|
+
.ruff_cache/
|
|
18
|
+
.mypy_cache/
|
|
19
|
+
|
|
20
|
+
# Environment
|
|
21
|
+
.env
|
|
22
|
+
.env.local
|
|
23
|
+
.env.prod
|
|
24
|
+
.env.*.local
|
|
25
|
+
|
|
26
|
+
# IDE
|
|
27
|
+
.idea/
|
|
28
|
+
.vscode/
|
|
29
|
+
*.code-workspace
|
|
30
|
+
|
|
31
|
+
# OS
|
|
32
|
+
.DS_Store
|
|
33
|
+
Thumbs.db
|
|
34
|
+
|
|
35
|
+
# Turbo
|
|
36
|
+
.turbo/
|
|
37
|
+
|
|
38
|
+
# Logs
|
|
39
|
+
*.log
|
|
40
|
+
logs/
|
|
41
|
+
|
|
42
|
+
# Temp
|
|
43
|
+
tmp/
|
|
44
|
+
.tmp/
|
|
45
|
+
*.tmp
|
|
46
|
+
|
|
47
|
+
# Project specific
|
|
48
|
+
browser_profiles/
|
|
49
|
+
screenshots/
|
|
50
|
+
.browser/
|
|
51
|
+
user_data_dir/
|
|
52
|
+
|
|
53
|
+
# Git worktrees
|
|
54
|
+
.worktrees/
|
|
55
|
+
worktrees/
|
|
@@ -0,0 +1,581 @@
|
|
|
1
|
+
# Undetecta Python SDK Architecture Design
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
The `undetecta` Python package provides a type-safe, ergonomic interface for the Undetecta API. It mirrors the JavaScript SDK's API surface while following Python best practices (PEP 517, PEP 621, type hints).
|
|
6
|
+
|
|
7
|
+
## Design Principles
|
|
8
|
+
|
|
9
|
+
1. **Type Safety**: Full type hints using Python's `typing` module with Pydantic for runtime validation
|
|
10
|
+
2. **Ergonomics**: Simple, intuitive API that follows common Python client patterns (requests, httpx)
|
|
11
|
+
3. **Async Support**: First-class async support with httpx for modern Python applications
|
|
12
|
+
4. **Error Handling**: Structured exception hierarchy for predictable error handling
|
|
13
|
+
5. **Modern Packaging**: `pyproject.toml` with PEP 517/621 compliance
|
|
14
|
+
6. **Minimal Dependencies**: Only essential dependencies (httpx, pydantic)
|
|
15
|
+
|
|
16
|
+
## API Surface Analysis
|
|
17
|
+
|
|
18
|
+
The Undetecta API has the following public endpoints:
|
|
19
|
+
|
|
20
|
+
| Method | Endpoint | Description |
|
|
21
|
+
|--------|----------|-------------|
|
|
22
|
+
| POST | `/v1/scrape` | Scrape a URL (sync response) |
|
|
23
|
+
| POST | `/v1/search` | Web search (sync response) |
|
|
24
|
+
| GET | `/v1/health` | Health check |
|
|
25
|
+
|
|
26
|
+
**Note**: Admin endpoints (users, API keys, proxies) are internal and not exposed in the public client.
|
|
27
|
+
|
|
28
|
+
### Request/Response Pattern
|
|
29
|
+
|
|
30
|
+
All responses follow the `result()` wrapper pattern:
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
# Success response
|
|
34
|
+
{"success": True, "data": T}
|
|
35
|
+
|
|
36
|
+
# Error response
|
|
37
|
+
{"success": False, "error": {"code": str, "message": str}}
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Authentication
|
|
41
|
+
|
|
42
|
+
Authentication is via `x-api-key` header.
|
|
43
|
+
|
|
44
|
+
## Package Structure
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
undetecta/
|
|
48
|
+
├── pyproject.toml # PEP 621 package configuration
|
|
49
|
+
├── README.md # Package documentation
|
|
50
|
+
├── LICENSE # MIT License
|
|
51
|
+
├── src/
|
|
52
|
+
│ └── undetecta/
|
|
53
|
+
│ ├── __init__.py # Main entry point, exports
|
|
54
|
+
│ ├── _client.py # UndetectaClient class
|
|
55
|
+
│ ├── _types.py # Type definitions and Pydantic models
|
|
56
|
+
│ ├── _errors.py # Exception classes
|
|
57
|
+
│ ├── _scrape.py # Scrape API methods
|
|
58
|
+
│ ├── _search.py # Search API methods
|
|
59
|
+
│ └── _transport/
|
|
60
|
+
│ ├── __init__.py
|
|
61
|
+
│ ├── _http.py # HTTP transport using httpx
|
|
62
|
+
│ └── _retry.py # Retry logic with exponential backoff
|
|
63
|
+
├── tests/
|
|
64
|
+
│ ├── __init__.py
|
|
65
|
+
│ ├── test_client.py
|
|
66
|
+
│ ├── test_scrape.py
|
|
67
|
+
│ ├── test_search.py
|
|
68
|
+
│ └── test_errors.py
|
|
69
|
+
├── .gitignore
|
|
70
|
+
├── .python-version # Python version (3.10+)
|
|
71
|
+
└── CHANGELOG.md
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
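**Note**: Leading underscores on modules (`_client.py`) indicate private/internal modules. The public API is exported through `__init__.py`. The released 0.1.0 source tree differs from this plan: it ships `client.py`, `types.py`, `errors.py`, and a single `_transport.py` module.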
|
|
75
|
+
|
|
76
|
+
## Client Architecture
|
|
77
|
+
|
|
78
|
+
### Main Client Class
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
class UndetectaClient:
|
|
82
|
+
"""Undetecta API client."""
|
|
83
|
+
|
|
84
|
+
def __init__(
|
|
85
|
+
self,
|
|
86
|
+
*,
|
|
87
|
+
api_key: str,
|
|
88
|
+
base_url: str = "https://api.undetecta.com",
|
|
89
|
+
timeout: float = 60.0,
|
|
90
|
+
max_retries: int = 3,
|
|
91
|
+
) -> None: ...
|
|
92
|
+
|
|
93
|
+
# Core methods
|
|
94
|
+
async def scrape(self, request: ScrapeRequest) -> ScrapeResponse: ...
|
|
95
|
+
async def search(self, request: SearchRequest) -> SearchResponse: ...
|
|
96
|
+
async def health(self) -> HealthResponse: ...
|
|
97
|
+
|
|
98
|
+
# Sync variants (optional, convenience)
|
|
99
|
+
def scrape_sync(self, request: ScrapeRequest) -> ScrapeResponse: ...
|
|
100
|
+
def search_sync(self, request: SearchRequest) -> SearchResponse: ...
|
|
101
|
+
|
|
102
|
+
# Configuration (read-only)
|
|
103
|
+
@property
|
|
104
|
+
def api_key(self) -> str: ...
|
|
105
|
+
@property
|
|
106
|
+
def base_url(self) -> str: ...
|
|
107
|
+
@property
|
|
108
|
+
def timeout(self) -> float: ...
|
|
109
|
+
@property
|
|
110
|
+
def max_retries(self) -> int: ...
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Synchronous Support
|
|
114
|
+
|
|
115
|
+
For users who prefer synchronous code, the SDK provides blocking methods via `httpx.Client`:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
# Using async (default)
|
|
119
|
+
client = UndetectaClient(api_key="sk_...")
|
|
120
|
+
result = await client.scrape(ScrapeRequest(url="https://example.com"))
|
|
121
|
+
|
|
122
|
+
# Using sync convenience methods
|
|
123
|
+
result = client.scrape_sync(ScrapeRequest(url="https://example.com"))
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Type System
|
|
127
|
+
|
|
128
|
+
All types use Python's `typing` module with Pydantic for validation:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
# Enums (StrEnum requires Python 3.11+; on Python 3.10 use `class X(str, Enum)` instead)
|
|
132
|
+
class ScrapeFormat(StrEnum):
|
|
133
|
+
HTML = "html"
|
|
134
|
+
RAW_HTML = "rawHtml"
|
|
135
|
+
MARKDOWN = "markdown"
|
|
136
|
+
LINKS = "links"
|
|
137
|
+
SCREENSHOT = "screenshot"
|
|
138
|
+
BRANDING = "branding"
|
|
139
|
+
|
|
140
|
+
class WaitUntil(StrEnum):
|
|
141
|
+
LOAD = "load"
|
|
142
|
+
DOM_CONTENT_LOADED = "domcontentloaded"
|
|
143
|
+
NETWORK_IDLE = "networkidle"
|
|
144
|
+
|
|
145
|
+
class JobStatus(StrEnum):
|
|
146
|
+
PENDING = "pending"
|
|
147
|
+
RUNNING = "running"
|
|
148
|
+
COMPLETED = "completed"
|
|
149
|
+
FAILED = "failed"
|
|
150
|
+
STOPPED = "stopped"
|
|
151
|
+
|
|
152
|
+
# Request models (Pydantic)
|
|
153
|
+
class ScrapeRequest(BaseModel):
|
|
154
|
+
url: str
|
|
155
|
+
formats: list[ScrapeFormat] = [ScrapeFormat.MARKDOWN]
|
|
156
|
+
timeout: int = 30000
|
|
157
|
+
wait_until: WaitUntil = WaitUntil.DOM_CONTENT_LOADED
|
|
158
|
+
# ... 30+ other optional fields
|
|
159
|
+
|
|
160
|
+
# Response models (Pydantic)
|
|
161
|
+
class ScrapeMetadata(BaseModel):
|
|
162
|
+
status_code: int | None = None
|
|
163
|
+
url: str | None = None
|
|
164
|
+
title: str | None = None
|
|
165
|
+
description: str | None = None
|
|
166
|
+
# ...
|
|
167
|
+
|
|
168
|
+
class ScrapeResponse(BaseModel):
|
|
169
|
+
id: str
|
|
170
|
+
status: JobStatus
|
|
171
|
+
created_at: str # ISO 8601 datetime
|
|
172
|
+
completed_at: str | None = None
|
|
173
|
+
metadata: ScrapeMetadata | None = None
|
|
174
|
+
markdown: str | None = None
|
|
175
|
+
html: str | None = None
|
|
176
|
+
# ...
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Export Structure
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
# src/undetecta/__init__.py
|
|
183
|
+
"""Undetecta API Client."""
|
|
184
|
+
|
|
185
|
+
from undetecta._client import UndetectaClient
|
|
186
|
+
from undetecta._types import (
|
|
187
|
+
ScrapeFormat,
|
|
188
|
+
ScrapeRequest,
|
|
189
|
+
ScrapeResponse,
|
|
190
|
+
SearchRequest,
|
|
191
|
+
SearchResponse,
|
|
192
|
+
WaitUntil,
|
|
193
|
+
# ... other types
|
|
194
|
+
)
|
|
195
|
+
from undetecta._errors import (
|
|
196
|
+
UndetectaError,
|
|
197
|
+
ApiKeyError,
|
|
198
|
+
RateLimitError,
|
|
199
|
+
ValidationError,
|
|
200
|
+
NetworkError,
|
|
201
|
+
TimeoutError,
|
|
202
|
+
NotFoundError,
|
|
203
|
+
ServerError,
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
__all__ = [
|
|
207
|
+
# Client
|
|
208
|
+
"UndetectaClient",
|
|
209
|
+
# Types
|
|
210
|
+
"ScrapeFormat",
|
|
211
|
+
"ScrapeRequest",
|
|
212
|
+
"ScrapeResponse",
|
|
213
|
+
"SearchRequest",
|
|
214
|
+
"SearchResponse",
|
|
215
|
+
"WaitUntil",
|
|
216
|
+
# ...
|
|
217
|
+
# Errors
|
|
218
|
+
"UndetectaError",
|
|
219
|
+
"ApiKeyError",
|
|
220
|
+
"RateLimitError",
|
|
221
|
+
"ValidationError",
|
|
222
|
+
"NetworkError",
|
|
223
|
+
"TimeoutError",
|
|
224
|
+
"NotFoundError",
|
|
225
|
+
"ServerError",
|
|
226
|
+
]
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## Error Handling Strategy
|
|
230
|
+
|
|
231
|
+
### Exception Hierarchy
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
class UndetectaError(Exception):
|
|
235
|
+
"""Base class for all Undetecta errors."""
|
|
236
|
+
|
|
237
|
+
def __init__(self, code: str, message: str, status_code: int | None = None) -> None: ...
|
|
238
|
+
|
|
239
|
+
class ApiKeyError(UndetectaError):
|
|
240
|
+
"""Raised when API key is invalid or missing (401)."""
|
|
241
|
+
|
|
242
|
+
class RateLimitError(UndetectaError):
|
|
243
|
+
"""Raised when rate limit is exceeded (429)."""
|
|
244
|
+
|
|
245
|
+
class ValidationError(UndetectaError):
|
|
246
|
+
"""Raised when request validation fails (400)."""
|
|
247
|
+
|
|
248
|
+
class NotFoundError(UndetectaError):
|
|
249
|
+
"""Raised when a resource is not found (404)."""
|
|
250
|
+
|
|
251
|
+
class ServerError(UndetectaError):
|
|
252
|
+
"""Raised when the server returns a 5xx error."""
|
|
253
|
+
|
|
254
|
+
class NetworkError(UndetectaError):
|
|
255
|
+
"""Raised when network request fails."""
|
|
256
|
+
|
|
257
|
+
class TimeoutError(UndetectaError):
|
|
258
|
+
"""Raised when request times out."""
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
### Error Handling Pattern
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
from undetecta import UndetectaClient, ScrapeRequest, ApiKeyError, RateLimitError, ValidationError, UndetectaError
|
|
265
|
+
|
|
266
|
+
client = UndetectaClient(api_key="sk_...")
|
|
267
|
+
|
|
268
|
+
try:
|
|
269
|
+
result = await client.scrape(ScrapeRequest(url="https://example.com"))
|
|
270
|
+
except ApiKeyError as e:
|
|
271
|
+
print(f"Invalid API key: {e.message}")
|
|
272
|
+
except RateLimitError as e:
|
|
273
|
+
print(f"Rate limited, retry later: {e.message}")
|
|
274
|
+
except ValidationError as e:
|
|
275
|
+
print(f"Invalid request: {e.message}")
|
|
276
|
+
except UndetectaError as e:
|
|
277
|
+
print(f"API error: {e.code} - {e.message}")
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
## HTTP Transport
|
|
281
|
+
|
|
282
|
+
### Using httpx
|
|
283
|
+
|
|
284
|
+
The SDK uses `httpx` for HTTP requests:
|
|
285
|
+
|
|
286
|
+
1. **Async-first**: Primary API uses `httpx.AsyncClient`
|
|
287
|
+
2. **Sync fallback**: Sync variants use `httpx.Client`
|
|
288
|
+
3. **Automatic retries**: Exponential backoff with jitter for retryable status codes (408, 429, 5xx) and network errors
|
|
289
|
+
4. **Timeout handling**: Per-request timeout with `httpx.Timeout`
|
|
290
|
+
5. **Headers injection**: `x-api-key`, `Content-Type`, `User-Agent`
|
|
291
|
+
|
|
292
|
+
```python
|
|
293
|
+
import httpx
|
|
294
|
+
|
|
295
|
+
class _HttpTransport:
|
|
296
|
+
"""Internal HTTP transport with retry logic."""
|
|
297
|
+
|
|
298
|
+
async def _request(
|
|
299
|
+
self,
|
|
300
|
+
method: str,
|
|
301
|
+
url: str,
|
|
302
|
+
*,
|
|
303
|
+
json: dict[str, Any] | None = None,
|
|
304
|
+
max_retries: int = 3,
|
|
305
|
+
) -> dict[str, Any]: ...
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
### Retry Strategy
|
|
309
|
+
|
|
310
|
+
- **Retryable status codes**: 408, 429, 500-599
|
|
311
|
+
- **Backoff**: Exponential with jitter (base: 1s, max: 30s)
|
|
312
|
+
- **Max retries**: Configurable (default: 3)
|
|
313
|
+
- **Non-retryable**: 4xx errors (except 408 and 429)
|
|
314
|
+
|
|
315
|
+
## Configuration
|
|
316
|
+
|
|
317
|
+
### pyproject.toml
|
|
318
|
+
|
|
319
|
+
```toml
|
|
320
|
+
[build-system]
|
|
321
|
+
requires = ["hatchling"]
|
|
322
|
+
build-backend = "hatchling.build"
|
|
323
|
+
|
|
324
|
+
[project]
|
|
325
|
+
name = "undetecta"
|
|
326
|
+
version = "0.1.0"
|
|
327
|
+
description = "Python SDK for the Undetecta API"
|
|
328
|
+
readme = "README.md"
|
|
329
|
+
requires-python = ">=3.10"
|
|
330
|
+
license = { text = "MIT" }
|
|
331
|
+
authors = [
|
|
332
|
+
{ name = "Undetecta", email = "sdk@undetecta.com" }
|
|
333
|
+
]
|
|
334
|
+
keywords = ["scraping", "automation", "anti-detection"]
|
|
335
|
+
classifiers = [
|
|
336
|
+
"Development Status :: 4 - Beta",
|
|
337
|
+
"Intended Audience :: Developers",
|
|
338
|
+
"License :: OSI Approved :: MIT License",
|
|
339
|
+
"Programming Language :: Python :: 3.10",
|
|
340
|
+
"Programming Language :: Python :: 3.11",
|
|
341
|
+
"Programming Language :: Python :: 3.12",
|
|
342
|
+
]
|
|
343
|
+
|
|
344
|
+
dependencies = [
|
|
345
|
+
"httpx>=0.27.0",
|
|
346
|
+
"pydantic>=2.0.0",
|
|
347
|
+
]
|
|
348
|
+
|
|
349
|
+
[project.optional-dependencies]
|
|
350
|
+
dev = [
|
|
351
|
+
"pytest>=8.0.0",
|
|
352
|
+
"pytest-asyncio>=0.24.0",
|
|
353
|
+
"pytest-cov>=5.0.0",
|
|
354
|
+
"ruff>=0.8.0",
|
|
355
|
+
"mypy>=1.0.0",
|
|
356
|
+
]
|
|
357
|
+
|
|
358
|
+
[project.urls]
|
|
359
|
+
Homepage = "https://undetecta.com"
|
|
360
|
+
Documentation = "https://github.com/undetecta/sdk-python"
|
|
361
|
+
Repository = "https://github.com/undetecta/sdk-python"
|
|
362
|
+
Issues = "https://github.com/undetecta/sdk-python/issues"
|
|
363
|
+
|
|
364
|
+
[tool.hatch.build.targets.wheel]
|
|
365
|
+
packages = ["src/undetecta"]
|
|
366
|
+
|
|
367
|
+
[tool.ruff]
|
|
368
|
+
line-length = 100
|
|
369
|
+
target-version = "py310"
|
|
370
|
+
|
|
371
|
+
[tool.ruff.lint]
|
|
372
|
+
select = ["E", "F", "I", "N", "W", "UP"]
|
|
373
|
+
ignore = ["E501"]
|
|
374
|
+
|
|
375
|
+
[tool.mypy]
|
|
376
|
+
python_version = "3.10"
|
|
377
|
+
strict = true
|
|
378
|
+
warn_return_any = true
|
|
379
|
+
warn_unused_ignores = true
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
## Usage Examples
|
|
383
|
+
|
|
384
|
+
### Basic Scraping
|
|
385
|
+
|
|
386
|
+
```python
|
|
387
|
+
import asyncio
|
|
388
|
+
from undetecta import UndetectaClient, ScrapeRequest, ScrapeFormat
|
|
389
|
+
|
|
390
|
+
async def main():
|
|
391
|
+
client = UndetectaClient(api_key="sk_...")
|
|
392
|
+
|
|
393
|
+
result = await client.scrape(ScrapeRequest(
|
|
394
|
+
url="https://example.com",
|
|
395
|
+
formats=[ScrapeFormat.MARKDOWN]
|
|
396
|
+
))
|
|
397
|
+
|
|
398
|
+
print(result.markdown)
|
|
399
|
+
|
|
400
|
+
asyncio.run(main())
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
### Synchronous Usage
|
|
404
|
+
|
|
405
|
+
```python
|
|
406
|
+
from undetecta import UndetectaClient, ScrapeRequest
|
|
407
|
+
|
|
408
|
+
client = UndetectaClient(api_key="sk_...")
|
|
409
|
+
result = client.scrape_sync(ScrapeRequest(url="https://example.com"))
|
|
410
|
+
print(result.markdown)
|
|
411
|
+
```
|
|
412
|
+
|
|
413
|
+
### Advanced Scraping
|
|
414
|
+
|
|
415
|
+
```python
|
|
416
|
+
result = await client.scrape(ScrapeRequest(
|
|
417
|
+
url="https://example.com",
|
|
418
|
+
formats=[ScrapeFormat.MARKDOWN, ScrapeFormat.SCREENSHOT, ScrapeFormat.LINKS],
|
|
419
|
+
screenshot_options=ScreenshotOptions(
|
|
420
|
+
full_page=True,
|
|
421
|
+
format="png"
|
|
422
|
+
),
|
|
423
|
+
wait_for_selector=".main-content",
|
|
424
|
+
actions=[
|
|
425
|
+
Action(type="click", selector=".cookie-accept")
|
|
426
|
+
]
|
|
427
|
+
))
|
|
428
|
+
```
|
|
429
|
+
|
|
430
|
+
### Search
|
|
431
|
+
|
|
432
|
+
```python
|
|
433
|
+
result = await client.search(SearchRequest(
|
|
434
|
+
query="python web scraping",
|
|
435
|
+
limit=10,
|
|
436
|
+
sources=[SearchSource.WEB],
|
|
437
|
+
scrape_options=SearchScrapeOptions(
|
|
438
|
+
formats=[ScrapeFormat.MARKDOWN]
|
|
439
|
+
)
|
|
440
|
+
))
|
|
441
|
+
|
|
442
|
+
for item in result.web or []:
|
|
443
|
+
print(f"{item.title}: {item.url}")
|
|
444
|
+
if item.markdown:
|
|
445
|
+
print(item.markdown[:200])
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
### Custom Configuration
|
|
449
|
+
|
|
450
|
+
```python
|
|
451
|
+
client = UndetectaClient(
|
|
452
|
+
api_key="sk_...",
|
|
453
|
+
base_url="https://api.undetecta.loc:6363", # Local development
|
|
454
|
+
timeout=120.0,
|
|
455
|
+
max_retries=5
|
|
456
|
+
)
|
|
457
|
+
```
|
|
458
|
+
|
|
459
|
+
## Dependencies
|
|
460
|
+
|
|
461
|
+
### Runtime
|
|
462
|
+
- `httpx>=0.27.0`: Async HTTP client with sync support
|
|
463
|
+
- `pydantic>=2.0.0`: Runtime validation and type definitions
|
|
464
|
+
|
|
465
|
+
### Development
|
|
466
|
+
- `pytest>=8.0.0`: Testing framework
|
|
467
|
+
- `pytest-asyncio>=0.24.0`: Async test support
|
|
468
|
+
- `pytest-cov>=5.0.0`: Coverage reporting
|
|
469
|
+
- `ruff>=0.8.0`: Fast Python linter
|
|
470
|
+
- `mypy>=1.0.0`: Static type checking
|
|
471
|
+
|
|
472
|
+
## Type Definitions Mapping
|
|
473
|
+
|
|
474
|
+
The Python SDK types mirror the JavaScript SDK and Zod schemas:
|
|
475
|
+
|
|
476
|
+
| JavaScript/TS | Python | Pydantic |
|
|
477
|
+
|---------------|--------|----------|
|
|
478
|
+
| `type ScrapeFormat = ...` | `class ScrapeFormat(StrEnum)` | N/A |
|
|
479
|
+
| `interface ScrapeOptions` | `class ScrapeRequest(BaseModel)` | `BaseModel` |
|
|
480
|
+
| `interface ScrapeJobResponse` | `class ScrapeResponse(BaseModel)` | `BaseModel` |
|
|
481
|
+
| `enum WaitUntil` | `class WaitUntil(StrEnum)` | N/A |
|
|
482
|
+
| `type JobStatus = ...` | `class JobStatus(StrEnum)` | N/A |
|
|
483
|
+
|
|
484
|
+
### Naming Conventions
|
|
485
|
+
|
|
486
|
+
- **Request types**: `{Operation}Request` (e.g., `ScrapeRequest`)
|
|
487
|
+
- **Response types**: `{Operation}Response` (e.g., `ScrapeResponse`)
|
|
488
|
+
- **Enums**: `PascalCase` with `StrEnum` for value serialization
|
|
489
|
+
- **Fields**: `snake_case` (Python convention)
|
|
490
|
+
- JS `waitForSelector` → Python `wait_for_selector`
|
|
491
|
+
- JS `onlyMainContent` → Python `only_main_content`
|
|
492
|
+
|
|
493
|
+
## Publishing to PyPI
|
|
494
|
+
|
|
495
|
+
### Release Process
|
|
496
|
+
|
|
497
|
+
1. **Version bump**: Update `version` in `pyproject.toml`
|
|
498
|
+
2. **Build**: `python -m build`
|
|
499
|
+
3. **Check**: `twine check dist/*`
|
|
500
|
+
4. **Upload**: `twine upload dist/*`
|
|
501
|
+
|
|
502
|
+
### Trusted Publishers (Recommended)
|
|
503
|
+
|
|
504
|
+
Configure PyPI to use trusted publishers (no tokens needed):
|
|
505
|
+
|
|
506
|
+
```toml
|
|
507
|
+
[tool.hatch.publish.index]
|
|
508
|
+
disable = true
|
|
509
|
+
|
|
510
|
+
# Or via PyPI dashboard: GitHub Actions -> OIDC
|
|
511
|
+
```
|
|
512
|
+
|
|
513
|
+
### CI/CD
|
|
514
|
+
|
|
515
|
+
```yaml
|
|
516
|
+
# .github/workflows/release.yml
|
|
517
|
+
name: Release to PyPI
|
|
518
|
+
|
|
519
|
+
on:
|
|
520
|
+
release:
|
|
521
|
+
types: [published]
|
|
522
|
+
|
|
523
|
+
jobs:
|
|
524
|
+
publish:
|
|
525
|
+
runs-on: ubuntu-latest
|
|
526
|
+
permissions:
|
|
527
|
+
id-token: write # Required for trusted publishing
|
|
528
|
+
steps:
|
|
529
|
+
- uses: actions/checkout@v4
|
|
530
|
+
- uses: actions/setup-python@v5
|
|
531
|
+
with:
|
|
532
|
+
python-version: "3.12"
|
|
533
|
+
- run: pip install build
|
|
534
|
+
- run: python -m build
|
|
535
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
536
|
+
```
|
|
537
|
+
|
|
538
|
+
## Testing Strategy
|
|
539
|
+
|
|
540
|
+
### Test Structure
|
|
541
|
+
|
|
542
|
+
```python
|
|
543
|
+
# tests/test_client.py
|
|
544
|
+
import pytest
|
|
545
|
+
from undetecta import UndetectaClient, ScrapeRequest
|
|
546
|
+
from unittest.mock import AsyncMock, patch
|
|
547
|
+
|
|
548
|
+
@pytest.mark.asyncio
|
|
549
|
+
async def test_scrape_success():
|
|
550
|
+
client = UndetectaClient(api_key="test-key")
|
|
551
|
+
# Mock HTTP response
|
|
552
|
+
# Assert response structure
|
|
553
|
+
```
|
|
554
|
+
|
|
555
|
+
### Test Coverage
|
|
556
|
+
|
|
557
|
+
- Unit tests for each module
|
|
558
|
+
- HTTP client mocking
|
|
559
|
+
- Error handling tests
|
|
560
|
+
- Type validation tests
|
|
561
|
+
|
|
562
|
+
## Implementation Order
|
|
563
|
+
|
|
564
|
+
1. **Package setup**: `pyproject.toml`, directory structure
|
|
565
|
+
2. **Type definitions**: `_types.py` with Pydantic models
|
|
566
|
+
3. **Error classes**: `_errors.py`
|
|
567
|
+
4. **HTTP transport**: `_transport/_http.py`, `_transport/_retry.py`
|
|
568
|
+
5. **Client**: `_client.py`
|
|
569
|
+
6. **API methods**: `_scrape.py`, `_search.py`
|
|
570
|
+
7. **Entry point**: `__init__.py` with exports
|
|
571
|
+
8. **Tests**: Unit tests for each module
|
|
572
|
+
9. **Documentation**: README with examples
|
|
573
|
+
|
|
574
|
+
## Future Enhancements (Out of Scope for MVP)
|
|
575
|
+
|
|
576
|
+
1. Webhook support for async jobs
|
|
577
|
+
2. Streaming responses
|
|
578
|
+
3. Batch operations
|
|
579
|
+
4. Context manager support
|
|
580
|
+
5. OpenTelemetry integration
|
|
581
|
+
6. Sphinx documentation
|