datablue 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datablue-1.0.0/.gitignore +137 -0
- datablue-1.0.0/PKG-INFO +375 -0
- datablue-1.0.0/README.md +346 -0
- datablue-1.0.0/datablue/__init__.py +79 -0
- datablue-1.0.0/datablue/_batch.py +60 -0
- datablue-1.0.0/datablue/_http.py +201 -0
- datablue-1.0.0/datablue/_utils.py +20 -0
- datablue-1.0.0/datablue/_version.py +1 -0
- datablue-1.0.0/datablue/async_client.py +377 -0
- datablue-1.0.0/datablue/client.py +369 -0
- datablue-1.0.0/datablue/config.py +59 -0
- datablue-1.0.0/datablue/exceptions.py +148 -0
- datablue-1.0.0/datablue/models/__init__.py +20 -0
- datablue-1.0.0/datablue/models/common.py +91 -0
- datablue-1.0.0/datablue/models/crawl.py +41 -0
- datablue-1.0.0/datablue/models/map.py +27 -0
- datablue-1.0.0/datablue/models/scrape.py +15 -0
- datablue-1.0.0/datablue/models/search.py +65 -0
- datablue-1.0.0/keywords_1000.txt +1000 -0
- datablue-1.0.0/pyproject.toml +49 -0
- datablue-1.0.0/tests/conftest.py +13 -0
- datablue-1.0.0/tests/test_batch.py +101 -0
- datablue-1.0.0/tests/test_config.py +65 -0
- datablue-1.0.0/tests/test_crawl.py +88 -0
- datablue-1.0.0/tests/test_exceptions.py +71 -0
- datablue-1.0.0/tests/test_http_retry.py +33 -0
- datablue-1.0.0/tests/test_models.py +159 -0
- datablue-1.0.0/tests/test_scrape.py +72 -0
- datablue-1.0.0/tests/test_search.py +59 -0
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
ADMIN_CLI.txt
|
|
2
|
+
|
|
3
|
+
# Python
|
|
4
|
+
__pycache__/
|
|
5
|
+
*.py[cod]
|
|
6
|
+
*$py.class
|
|
7
|
+
*.so
|
|
8
|
+
.Python
|
|
9
|
+
env/
|
|
10
|
+
venv/
|
|
11
|
+
.venv/
|
|
12
|
+
*.egg-info/
|
|
13
|
+
dist/
|
|
14
|
+
build/
|
|
15
|
+
.eggs/
|
|
16
|
+
|
|
17
|
+
# Node
|
|
18
|
+
node_modules/
|
|
19
|
+
.next/
|
|
20
|
+
out/
|
|
21
|
+
.turbo/
|
|
22
|
+
|
|
23
|
+
# Environment
|
|
24
|
+
.env
|
|
25
|
+
.env.local
|
|
26
|
+
.env.*.local
|
|
27
|
+
|
|
28
|
+
# IDE
|
|
29
|
+
.vscode/
|
|
30
|
+
.idea/
|
|
31
|
+
*.swp
|
|
32
|
+
*.swo
|
|
33
|
+
*~
|
|
34
|
+
|
|
35
|
+
# OS
|
|
36
|
+
.DS_Store
|
|
37
|
+
Thumbs.db
|
|
38
|
+
|
|
39
|
+
# Docker
|
|
40
|
+
docker-compose.override.yml
|
|
41
|
+
|
|
42
|
+
# Database
|
|
43
|
+
*.sqlite3
|
|
44
|
+
*.db
|
|
45
|
+
|
|
46
|
+
# Logs
|
|
47
|
+
*.log
|
|
48
|
+
logs/
|
|
49
|
+
|
|
50
|
+
# Coverage
|
|
51
|
+
htmlcov/
|
|
52
|
+
.coverage
|
|
53
|
+
coverage/
|
|
54
|
+
.nyc_output/
|
|
55
|
+
|
|
56
|
+
# Celery
|
|
57
|
+
celerybeat-schedule
|
|
58
|
+
celerybeat.pid
|
|
59
|
+
|
|
60
|
+
# AI tools
|
|
61
|
+
.agent/
|
|
62
|
+
.agents/
|
|
63
|
+
.windsurf/
|
|
64
|
+
|
|
65
|
+
# TypeScript build artifacts
|
|
66
|
+
*.tsbuildinfo
|
|
67
|
+
.claude/skills/
|
|
68
|
+
.playwright-mcp/
|
|
69
|
+
|
|
70
|
+
# Landing page prototypes
|
|
71
|
+
datablue-landing-v2/
|
|
72
|
+
.gitnexus
|
|
73
|
+
|
|
74
|
+
# Deploy secrets (never commit env files with credentials)
|
|
75
|
+
deploy/*.env
|
|
76
|
+
deploy/generate-env.sh
|
|
77
|
+
|
|
78
|
+
# Screenshots & design iterations (bloat)
|
|
79
|
+
/*.png
|
|
80
|
+
/logo/*.png
|
|
81
|
+
|
|
82
|
+
# Embedded repos
|
|
83
|
+
new-ui/
|
|
84
|
+
|
|
85
|
+
# Research & benchmarking (ephemeral work)
|
|
86
|
+
alt_engines_research/
|
|
87
|
+
bing_research/
|
|
88
|
+
brave_research/
|
|
89
|
+
ddg_research/
|
|
90
|
+
startpage_research/
|
|
91
|
+
pow_capture/
|
|
92
|
+
bench_results/
|
|
93
|
+
|
|
94
|
+
# Temp scripts & result files at root
|
|
95
|
+
/test_*.py
|
|
96
|
+
/bench_*.py
|
|
97
|
+
/analyze_*.py
|
|
98
|
+
/capture_*.py
|
|
99
|
+
/scrape_*.py
|
|
100
|
+
/find_*.py
|
|
101
|
+
/keyword_*.py
|
|
102
|
+
/engine_*.py
|
|
103
|
+
/cookie_replay*.py
|
|
104
|
+
/pow_*.py
|
|
105
|
+
/pow_*.js
|
|
106
|
+
/extract_script*.py
|
|
107
|
+
/*.csv
|
|
108
|
+
/*.json
|
|
109
|
+
/*.xlsx
|
|
110
|
+
/cookie_test_*.html
|
|
111
|
+
/google_js_shell_sample.html
|
|
112
|
+
/datablue_*.xlsx
|
|
113
|
+
/pow_shell_raw.html
|
|
114
|
+
/client_scrape_export.json
|
|
115
|
+
|
|
116
|
+
# Output & temp directories
|
|
117
|
+
tmp/
|
|
118
|
+
output/
|
|
119
|
+
|
|
120
|
+
# MCP config (local)
|
|
121
|
+
.mcp.json
|
|
122
|
+
|
|
123
|
+
# Report/research docs (not product docs)
|
|
124
|
+
BOT_DETECTION_RESEARCH.md
|
|
125
|
+
COOKIE_REPLAY_REPORT.md
|
|
126
|
+
MICROBROWSER_INTEGRATION.md
|
|
127
|
+
MICROBROWSER_OSINT_REPORT.md
|
|
128
|
+
MICROBROWSER_PLAN.md
|
|
129
|
+
MULTI_ENGINE_DESIGN.md
|
|
130
|
+
SEARCH_ENGINE_RESEARCH.md
|
|
131
|
+
SERP_MISSION_REPORT.md
|
|
132
|
+
V8_MINIRACER_REPORT.md
|
|
133
|
+
MIGRATION_RUNBOOK.md
|
|
134
|
+
datablue-architecture.html
|
|
135
|
+
|
|
136
|
+
# Test results
|
|
137
|
+
test-results/
|
datablue-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datablue
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Python SDK for the DataBlue web scraping platform — scrape, crawl, search, and map
|
|
5
|
+
Project-URL: Homepage, https://datablue.dev
|
|
6
|
+
Project-URL: Documentation, https://docs.datablue.dev
|
|
7
|
+
Project-URL: Repository, https://github.com/datablue-dev/datablue-python
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: crawling,datablue,firecrawl-alternative,scraping,web-data
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Classifier: Typing :: Typed
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: httpx>=0.27.0
|
|
23
|
+
Requires-Dist: pydantic>=2.0.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: respx>=0.21; extra == 'dev'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# DataBlue Python SDK
|
|
31
|
+
|
|
32
|
+
The official Python SDK for [DataBlue](https://datablue.dev) — a self-hosted web scraping platform with anti-bot bypass, structured data extraction, and site crawling.
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install datablue
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from datablue import DataBlue
|
|
44
|
+
|
|
45
|
+
client = DataBlue(
|
|
46
|
+
api_url="https://api.datablue.dev",
|
|
47
|
+
api_key="your_api_key",
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
# Scrape a page
|
|
51
|
+
result = client.scrape("https://example.com")
|
|
52
|
+
print(result.data.markdown)
|
|
53
|
+
|
|
54
|
+
client.close()
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Context Manager
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
with DataBlue(api_key="your_api_key") as client:
|
|
61
|
+
result = client.scrape("https://example.com")
|
|
62
|
+
print(result.data.markdown)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Environment Variables
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
export DATABLUE_API_KEY=your_api_key
|
|
69
|
+
export DATABLUE_API_URL=https://api.datablue.dev
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
client = DataBlue.from_env()
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Scrape
|
|
77
|
+
|
|
78
|
+
Scrape a single URL and get structured content back.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
result = client.scrape("https://example.com")
|
|
82
|
+
|
|
83
|
+
# Access content
|
|
84
|
+
print(result.data.markdown) # Markdown content
|
|
85
|
+
print(result.data.html) # HTML content
|
|
86
|
+
print(result.data.links) # Extracted links
|
|
87
|
+
print(result.data.metadata.title) # Page title
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Advanced Scraping
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
result = client.scrape(
|
|
94
|
+
"https://example.com",
|
|
95
|
+
formats=["markdown", "html", "links", "screenshot"],
|
|
96
|
+
only_main_content=True,
|
|
97
|
+
wait_for=2000, # Wait 2s for JS to load
|
|
98
|
+
timeout=30000,           # 30s timeout (value is in milliseconds)
|
|
99
|
+
css_selector=".article", # Target specific element
|
|
100
|
+
mobile=True, # Mobile viewport
|
|
101
|
+
headers={"Accept-Language": "en-US"},
|
|
102
|
+
cookies={"session": "abc123"},
|
|
103
|
+
)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Browser Actions
|
|
107
|
+
|
|
108
|
+
Execute browser actions before scraping:
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
result = client.scrape(
|
|
112
|
+
"https://example.com",
|
|
113
|
+
actions=[
|
|
114
|
+
{"type": "click", "selector": "#load-more"},
|
|
115
|
+
{"type": "wait", "milliseconds": 2000},
|
|
116
|
+
{"type": "scroll", "direction": "down", "amount": 3},
|
|
117
|
+
{"type": "screenshot"},
|
|
118
|
+
],
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### LLM Extraction
|
|
123
|
+
|
|
124
|
+
Extract structured data using AI:
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
result = client.scrape(
|
|
128
|
+
"https://example.com/product",
|
|
129
|
+
extract={
|
|
130
|
+
"prompt": "Extract the product name, price, and rating",
|
|
131
|
+
"schema": {
|
|
132
|
+
"type": "object",
|
|
133
|
+
"properties": {
|
|
134
|
+
"name": {"type": "string"},
|
|
135
|
+
"price": {"type": "number"},
|
|
136
|
+
"rating": {"type": "number"},
|
|
137
|
+
},
|
|
138
|
+
},
|
|
139
|
+
},
|
|
140
|
+
)
|
|
141
|
+
print(result.data.extract)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Crawl
|
|
145
|
+
|
|
146
|
+
Crawl an entire website. Results are collected as pages are discovered; you can either block until the crawl completes or start a job and poll its status manually.
|
|
147
|
+
|
|
148
|
+
### Blocking (waits for completion)
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
result = client.crawl(
|
|
152
|
+
"https://example.com",
|
|
153
|
+
max_pages=50,
|
|
154
|
+
max_depth=3,
|
|
155
|
+
timeout=300,
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
for page in result.data:
|
|
159
|
+
print(page.url, len(page.markdown or ""))
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Non-blocking (manual polling)
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
job = client.start_crawl("https://example.com", max_pages=100)
|
|
166
|
+
print(f"Job started: {job.job_id}")
|
|
167
|
+
|
|
168
|
+
# Poll for status
|
|
169
|
+
status = client.get_crawl_status(job.job_id)
|
|
170
|
+
print(f"Progress: {status.completed_pages}/{status.total_pages}")
|
|
171
|
+
|
|
172
|
+
# Cancel if needed
|
|
173
|
+
client.cancel_crawl(job.job_id)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### Crawl Options
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
result = client.crawl(
|
|
180
|
+
"https://example.com",
|
|
181
|
+
max_pages=200,
|
|
182
|
+
max_depth=5,
|
|
183
|
+
concurrency=5,
|
|
184
|
+
crawl_strategy="bfs", # bfs, dfs, or bff (best-first)
|
|
185
|
+
include_paths=["/blog/*"], # Only crawl matching paths
|
|
186
|
+
exclude_paths=["/admin/*"], # Skip these paths
|
|
187
|
+
allow_external_links=False,
|
|
188
|
+
respect_robots_txt=True,
|
|
189
|
+
scrape_options={
|
|
190
|
+
"formats": ["markdown"],
|
|
191
|
+
"only_main_content": True,
|
|
192
|
+
},
|
|
193
|
+
)
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## Search
|
|
197
|
+
|
|
198
|
+
Search the web and scrape each result page.
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
result = client.search(
|
|
202
|
+
"best python web scraping libraries",
|
|
203
|
+
num_results=10,
|
|
204
|
+
formats=["markdown"],
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
for item in result.data:
|
|
208
|
+
print(f"{item.title}: {item.url}")
|
|
209
|
+
print((item.markdown or "")[:200])
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Search Engines
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
# Default: Google (via SearXNG)
|
|
216
|
+
result = client.search("query", engine="google")
|
|
217
|
+
|
|
218
|
+
# DuckDuckGo
|
|
219
|
+
result = client.search("query", engine="duckduckgo")
|
|
220
|
+
|
|
221
|
+
# Brave (requires API key)
|
|
222
|
+
result = client.search("query", engine="brave", brave_api_key="...")
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## Map
|
|
226
|
+
|
|
227
|
+
Discover all URLs on a website using sitemaps and link crawling.
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
result = client.map("https://example.com", limit=500)
|
|
231
|
+
|
|
232
|
+
for link in result.links:
|
|
233
|
+
print(f"{link.url} - {link.title}")
|
|
234
|
+
|
|
235
|
+
# Just the URLs
|
|
236
|
+
print(result.urls)
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Filter URLs
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
result = client.map(
|
|
243
|
+
"https://example.com",
|
|
244
|
+
search="blog", # Filter by keyword
|
|
245
|
+
include_subdomains=True,
|
|
246
|
+
use_sitemap=True,
|
|
247
|
+
limit=1000,
|
|
248
|
+
)
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
## Batch Scrape
|
|
252
|
+
|
|
253
|
+
Scrape multiple URLs efficiently.
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
results = client.batch_scrape(
|
|
257
|
+
[
|
|
258
|
+
"https://example.com/page1",
|
|
259
|
+
"https://example.com/page2",
|
|
260
|
+
"https://example.com/page3",
|
|
261
|
+
],
|
|
262
|
+
scrape_options={"formats": ["markdown"], "only_main_content": True},
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
for r in results:
|
|
266
|
+
if r.success:
|
|
267
|
+
print(r.data.metadata.title)
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
## Async Client
|
|
271
|
+
|
|
272
|
+
Full async support for high-performance applications.
|
|
273
|
+
|
|
274
|
+
```python
|
|
275
|
+
import asyncio
|
|
276
|
+
from datablue import AsyncDataBlue
|
|
277
|
+
|
|
278
|
+
async def main():
|
|
279
|
+
async with AsyncDataBlue(api_key="your_key") as client:
|
|
280
|
+
# Scrape
|
|
281
|
+
result = await client.scrape("https://example.com")
|
|
282
|
+
|
|
283
|
+
# Crawl
|
|
284
|
+
crawl = await client.crawl("https://example.com", max_pages=50)
|
|
285
|
+
|
|
286
|
+
# Search
|
|
287
|
+
search = await client.search("python scraping", num_results=5)
|
|
288
|
+
|
|
289
|
+
# Map
|
|
290
|
+
sitemap = await client.map("https://example.com")
|
|
291
|
+
|
|
292
|
+
# Batch scrape (concurrent); `urls` is your own list of URL strings
|
|
293
|
+
results = await client.batch_scrape(urls, concurrency=10)
|
|
294
|
+
|
|
295
|
+
# Streaming batch (yields as completed)
|
|
296
|
+
async for result in client.batch_scrape_iter(urls, concurrency=10):
|
|
297
|
+
print(result.data.url)
|
|
298
|
+
|
|
299
|
+
asyncio.run(main())
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
## Error Handling
|
|
303
|
+
|
|
304
|
+
```python
|
|
305
|
+
from datablue import (
|
|
306
|
+
DataBlueError,
|
|
307
|
+
AuthenticationError,
|
|
308
|
+
RateLimitError,
|
|
309
|
+
NotFoundError,
|
|
310
|
+
ServerError,
|
|
311
|
+
JobFailedError,
|
|
312
|
+
TimeoutError,
|
|
313
|
+
)
|
|
314
|
+
|
|
315
|
+
try:
|
|
316
|
+
result = client.scrape("https://example.com")
|
|
317
|
+
except AuthenticationError:
|
|
318
|
+
print("Bad API key")
|
|
319
|
+
except RateLimitError as e:
|
|
320
|
+
print(f"Rate limited. Retry after {e.retry_after}s")
|
|
321
|
+
except TimeoutError as e:
|
|
322
|
+
print(f"Job timed out after {e.elapsed}s")
|
|
323
|
+
except JobFailedError as e:
|
|
324
|
+
print(f"Job {e.job_id} failed")
|
|
325
|
+
except ServerError:
|
|
326
|
+
print("Server error (auto-retried)")
|
|
327
|
+
except DataBlueError as e:
|
|
328
|
+
print(f"Error {e.status_code}: {e.message}")
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
All errors include:
|
|
332
|
+
- `e.message` — human-readable description
|
|
333
|
+
- `e.status_code` — HTTP status code
|
|
334
|
+
- `e.is_retryable` — whether the request can be retried
|
|
335
|
+
- `e.retry_after` — seconds to wait (for 429s)
|
|
336
|
+
- `e.docs_url` — link to error documentation
|
|
337
|
+
|
|
338
|
+
## Configuration
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
client = DataBlue(
|
|
342
|
+
api_url="https://api.datablue.dev", # API base URL
|
|
343
|
+
api_key="your_key", # API key
|
|
344
|
+
timeout=60.0, # Request timeout (seconds)
|
|
345
|
+
max_retries=3, # Retry count for 5xx/429
|
|
346
|
+
)
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
### Self-Hosted
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
client = DataBlue(
|
|
353
|
+
api_url="http://localhost:8000",
|
|
354
|
+
api_key="your_key",
|
|
355
|
+
)
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
### Login with Email/Password
|
|
359
|
+
|
|
360
|
+
```python
|
|
361
|
+
client = DataBlue(api_url="https://api.datablue.dev")
|
|
362
|
+
client.login("you@email.com", "password")
|
|
363
|
+
# JWT token is stored automatically
|
|
364
|
+
result = client.scrape("https://example.com")
|
|
365
|
+
```
|
|
366
|
+
|
|
367
|
+
## Requirements
|
|
368
|
+
|
|
369
|
+
- Python 3.10+
|
|
370
|
+
- httpx
|
|
371
|
+
- pydantic v2
|
|
372
|
+
|
|
373
|
+
## License
|
|
374
|
+
|
|
375
|
+
MIT
|