datablue 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
1
+ ADMIN_CLI.txt
2
+
3
+ # Python
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+ *.so
8
+ .Python
9
+ env/
10
+ venv/
11
+ .venv/
12
+ *.egg-info/
13
+ dist/
14
+ build/
15
+ .eggs/
16
+
17
+ # Node
18
+ node_modules/
19
+ .next/
20
+ out/
21
+ .turbo/
22
+
23
+ # Environment
24
+ .env
25
+ .env.local
26
+ .env.*.local
27
+
28
+ # IDE
29
+ .vscode/
30
+ .idea/
31
+ *.swp
32
+ *.swo
33
+ *~
34
+
35
+ # OS
36
+ .DS_Store
37
+ Thumbs.db
38
+
39
+ # Docker
40
+ docker-compose.override.yml
41
+
42
+ # Database
43
+ *.sqlite3
44
+ *.db
45
+
46
+ # Logs
47
+ *.log
48
+ logs/
49
+
50
+ # Coverage
51
+ htmlcov/
52
+ .coverage
53
+ coverage/
54
+ .nyc_output/
55
+
56
+ # Celery
57
+ celerybeat-schedule
58
+ celerybeat.pid
59
+
60
+ # AI tools
61
+ .agent/
62
+ .agents/
63
+ .windsurf/
64
+
65
+ # TypeScript build artifacts
66
+ *.tsbuildinfo
67
+ .claude/skills/
68
+ .playwright-mcp/
69
+
70
+ # Landing page prototypes
71
+ datablue-landing-v2/
72
+ .gitnexus
73
+
74
+ # Deploy secrets (never commit env files with credentials)
75
+ deploy/*.env
76
+ deploy/generate-env.sh
77
+
78
+ # Screenshots & design iterations (bloat)
79
+ /*.png
80
+ /logo/*.png
81
+
82
+ # Embedded repos
83
+ new-ui/
84
+
85
+ # Research & benchmarking (ephemeral work)
86
+ alt_engines_research/
87
+ bing_research/
88
+ brave_research/
89
+ ddg_research/
90
+ startpage_research/
91
+ pow_capture/
92
+ bench_results/
93
+
94
+ # Temp scripts & result files at root
95
+ /test_*.py
96
+ /bench_*.py
97
+ /analyze_*.py
98
+ /capture_*.py
99
+ /scrape_*.py
100
+ /find_*.py
101
+ /keyword_*.py
102
+ /engine_*.py
103
+ /cookie_replay*.py
104
+ /pow_*.py
105
+ /pow_*.js
106
+ /extract_script*.py
107
+ /*.csv
108
+ /*.json
109
+ /*.xlsx
110
+ /cookie_test_*.html
111
+ /google_js_shell_sample.html
112
+ /datablue_*.xlsx
113
+ /pow_shell_raw.html
114
+ /client_scrape_export.json
115
+
116
+ # Output & temp directories
117
+ tmp/
118
+ output/
119
+
120
+ # MCP config (local)
121
+ .mcp.json
122
+
123
+ # Report/research docs (not product docs)
124
+ BOT_DETECTION_RESEARCH.md
125
+ COOKIE_REPLAY_REPORT.md
126
+ MICROBROWSER_INTEGRATION.md
127
+ MICROBROWSER_OSINT_REPORT.md
128
+ MICROBROWSER_PLAN.md
129
+ MULTI_ENGINE_DESIGN.md
130
+ SEARCH_ENGINE_RESEARCH.md
131
+ SERP_MISSION_REPORT.md
132
+ V8_MINIRACER_REPORT.md
133
+ MIGRATION_RUNBOOK.md
134
+ datablue-architecture.html
135
+
136
+ # Test results
137
+ test-results/
@@ -0,0 +1,375 @@
1
+ Metadata-Version: 2.4
2
+ Name: datablue
3
+ Version: 1.0.0
4
+ Summary: Python SDK for the DataBlue web scraping platform — scrape, crawl, search, and map
5
+ Project-URL: Homepage, https://datablue.dev
6
+ Project-URL: Documentation, https://docs.datablue.dev
7
+ Project-URL: Repository, https://github.com/datablue-dev/datablue-python
8
+ License-Expression: MIT
9
+ Keywords: crawling,datablue,firecrawl-alternative,scraping,web-data
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Internet :: WWW/HTTP
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Typing :: Typed
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: httpx>=0.27.0
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
26
+ Requires-Dist: pytest>=8.0; extra == 'dev'
27
+ Requires-Dist: respx>=0.21; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # DataBlue Python SDK
31
+
32
+ The official Python SDK for [DataBlue](https://datablue.dev) — a self-hosted web scraping platform with anti-bot bypass, structured data extraction, and site crawling.
33
+
34
+ ## Installation
35
+
36
+ ```bash
37
+ pip install datablue
38
+ ```
39
+
40
+ ## Quick Start
41
+
42
+ ```python
43
+ from datablue import DataBlue
44
+
45
+ client = DataBlue(
46
+ api_url="https://api.datablue.dev",
47
+ api_key="your_api_key",
48
+ )
49
+
50
+ # Scrape a page
51
+ result = client.scrape("https://example.com")
52
+ print(result.data.markdown)
53
+
54
+ client.close()
55
+ ```
56
+
57
+ ### Context Manager
58
+
59
+ ```python
60
+ with DataBlue(api_key="your_api_key") as client:
61
+ result = client.scrape("https://example.com")
62
+ print(result.data.markdown)
63
+ ```
64
+
65
+ ### Environment Variables
66
+
67
+ ```bash
68
+ export DATABLUE_API_KEY=your_api_key
69
+ export DATABLUE_API_URL=https://api.datablue.dev
70
+ ```
71
+
72
+ ```python
73
+ client = DataBlue.from_env()
74
+ ```
75
+
76
+ ## Scrape
77
+
78
+ Scrape a single URL and get structured content back.
79
+
80
+ ```python
81
+ result = client.scrape("https://example.com")
82
+
83
+ # Access content
84
+ print(result.data.markdown) # Markdown content
85
+ print(result.data.html) # HTML content
86
+ print(result.data.links) # Extracted links
87
+ print(result.data.metadata.title) # Page title
88
+ ```
89
+
90
+ ### Advanced Scraping
91
+
92
+ ```python
93
+ result = client.scrape(
94
+ "https://example.com",
95
+ formats=["markdown", "html", "links", "screenshot"],
96
+ only_main_content=True,
97
+ wait_for=2000, # Wait 2s for JS to load
98
+ timeout=30000, # 30s timeout
99
+ css_selector=".article", # Target specific element
100
+ mobile=True, # Mobile viewport
101
+ headers={"Accept-Language": "en-US"},
102
+ cookies={"session": "abc123"},
103
+ )
104
+ ```
105
+
106
+ ### Browser Actions
107
+
108
+ Execute browser actions before scraping:
109
+
110
+ ```python
111
+ result = client.scrape(
112
+ "https://example.com",
113
+ actions=[
114
+ {"type": "click", "selector": "#load-more"},
115
+ {"type": "wait", "milliseconds": 2000},
116
+ {"type": "scroll", "direction": "down", "amount": 3},
117
+ {"type": "screenshot"},
118
+ ],
119
+ )
120
+ ```
121
+
122
+ ### LLM Extraction
123
+
124
+ Extract structured data using AI:
125
+
126
+ ```python
127
+ result = client.scrape(
128
+ "https://example.com/product",
129
+ extract={
130
+ "prompt": "Extract the product name, price, and rating",
131
+ "schema": {
132
+ "type": "object",
133
+ "properties": {
134
+ "name": {"type": "string"},
135
+ "price": {"type": "number"},
136
+ "rating": {"type": "number"},
137
+ },
138
+ },
139
+ },
140
+ )
141
+ print(result.data.extract)
142
+ ```
143
+
144
+ ## Crawl
145
+
146
+ Crawl an entire website. The blocking call waits for the crawl to finish and returns all pages; the non-blocking API starts a background job you can poll for results as pages are discovered.
147
+
148
+ ### Blocking (waits for completion)
149
+
150
+ ```python
151
+ result = client.crawl(
152
+ "https://example.com",
153
+ max_pages=50,
154
+ max_depth=3,
155
+ timeout=300,
156
+ )
157
+
158
+ for page in result.data:
159
+ print(page.url, len(page.markdown or ""))
160
+ ```
161
+
162
+ ### Non-blocking (manual polling)
163
+
164
+ ```python
165
+ job = client.start_crawl("https://example.com", max_pages=100)
166
+ print(f"Job started: {job.job_id}")
167
+
168
+ # Poll for status
169
+ status = client.get_crawl_status(job.job_id)
170
+ print(f"Progress: {status.completed_pages}/{status.total_pages}")
171
+
172
+ # Cancel if needed
173
+ client.cancel_crawl(job.job_id)
174
+ ```
175
+
176
+ ### Crawl Options
177
+
178
+ ```python
179
+ result = client.crawl(
180
+ "https://example.com",
181
+ max_pages=200,
182
+ max_depth=5,
183
+ concurrency=5,
184
+ crawl_strategy="bfs", # bfs, dfs, or bff (best-first)
185
+ include_paths=["/blog/*"], # Only crawl matching paths
186
+ exclude_paths=["/admin/*"], # Skip these paths
187
+ allow_external_links=False,
188
+ respect_robots_txt=True,
189
+ scrape_options={
190
+ "formats": ["markdown"],
191
+ "only_main_content": True,
192
+ },
193
+ )
194
+ ```
195
+
196
+ ## Search
197
+
198
+ Search the web and scrape each result page.
199
+
200
+ ```python
201
+ result = client.search(
202
+ "best python web scraping libraries",
203
+ num_results=10,
204
+ formats=["markdown"],
205
+ )
206
+
207
+ for item in result.data:
208
+ print(f"{item.title}: {item.url}")
209
+ print(item.markdown[:200])
210
+ ```
211
+
212
+ ### Search Engines
213
+
214
+ ```python
215
+ # Default: Google (via SearXNG)
216
+ result = client.search("query", engine="google")
217
+
218
+ # DuckDuckGo
219
+ result = client.search("query", engine="duckduckgo")
220
+
221
+ # Brave (requires API key)
222
+ result = client.search("query", engine="brave", brave_api_key="...")
223
+ ```
224
+
225
+ ## Map
226
+
227
+ Discover all URLs on a website using sitemaps and link crawling.
228
+
229
+ ```python
230
+ result = client.map("https://example.com", limit=500)
231
+
232
+ for link in result.links:
233
+ print(f"{link.url} - {link.title}")
234
+
235
+ # Just the URLs
236
+ print(result.urls)
237
+ ```
238
+
239
+ ### Filter URLs
240
+
241
+ ```python
242
+ result = client.map(
243
+ "https://example.com",
244
+ search="blog", # Filter by keyword
245
+ include_subdomains=True,
246
+ use_sitemap=True,
247
+ limit=1000,
248
+ )
249
+ ```
250
+
251
+ ## Batch Scrape
252
+
253
+ Scrape multiple URLs efficiently.
254
+
255
+ ```python
256
+ results = client.batch_scrape(
257
+ [
258
+ "https://example.com/page1",
259
+ "https://example.com/page2",
260
+ "https://example.com/page3",
261
+ ],
262
+ scrape_options={"formats": ["markdown"], "only_main_content": True},
263
+ )
264
+
265
+ for r in results:
266
+ if r.success:
267
+ print(r.data.metadata.title)
268
+ ```
269
+
270
+ ## Async Client
271
+
272
+ Full async support for high-performance applications.
273
+
274
+ ```python
275
+ import asyncio
276
+ from datablue import AsyncDataBlue
277
+
278
+ async def main():
279
+ async with AsyncDataBlue(api_key="your_key") as client:
280
+ # Scrape
281
+ result = await client.scrape("https://example.com")
282
+
283
+ # Crawl
284
+ crawl = await client.crawl("https://example.com", max_pages=50)
285
+
286
+ # Search
287
+ search = await client.search("python scraping", num_results=5)
288
+
289
+ # Map
290
+ sitemap = await client.map("https://example.com")
291
+
292
+ # Batch scrape (concurrent)
293
+ results = await client.batch_scrape(urls, concurrency=10)
294
+
295
+ # Streaming batch (yields as completed)
296
+ async for result in client.batch_scrape_iter(urls, concurrency=10):
297
+ print(result.data.url)
298
+
299
+ asyncio.run(main())
300
+ ```
301
+
302
+ ## Error Handling
303
+
304
+ ```python
305
+ from datablue import (
306
+ DataBlueError,
307
+ AuthenticationError,
308
+ RateLimitError,
309
+ NotFoundError,
310
+ ServerError,
311
+ JobFailedError,
312
+ TimeoutError,
313
+ )
314
+
315
+ try:
316
+ result = client.scrape("https://example.com")
317
+ except AuthenticationError:
318
+ print("Bad API key")
319
+ except RateLimitError as e:
320
+ print(f"Rate limited. Retry after {e.retry_after}s")
321
+ except TimeoutError as e:
322
+ print(f"Job timed out after {e.elapsed}s")
323
+ except JobFailedError as e:
324
+ print(f"Job {e.job_id} failed")
325
+ except ServerError:
326
+     print("Server error (after automatic retries were exhausted)")
327
+ except DataBlueError as e:
328
+ print(f"Error {e.status_code}: {e.message}")
329
+ ```
330
+
331
+ All errors include:
332
+ - `e.message` — human-readable description
333
+ - `e.status_code` — HTTP status code
334
+ - `e.is_retryable` — whether the request can be retried
335
+ - `e.retry_after` — seconds to wait (for 429s)
336
+ - `e.docs_url` — link to error documentation
337
+
338
+ ## Configuration
339
+
340
+ ```python
341
+ client = DataBlue(
342
+ api_url="https://api.datablue.dev", # API base URL
343
+ api_key="your_key", # API key
344
+ timeout=60.0, # Request timeout (seconds)
345
+ max_retries=3, # Retry count for 5xx/429
346
+ )
347
+ ```
348
+
349
+ ### Self-Hosted
350
+
351
+ ```python
352
+ client = DataBlue(
353
+ api_url="http://localhost:8000",
354
+ api_key="your_key",
355
+ )
356
+ ```
357
+
358
+ ### Login with Email/Password
359
+
360
+ ```python
361
+ client = DataBlue(api_url="https://api.datablue.dev")
362
+ client.login("you@email.com", "password")
363
+ # JWT token is stored automatically
364
+ result = client.scrape("https://example.com")
365
+ ```
366
+
367
+ ## Requirements
368
+
369
+ - Python 3.10+
370
+ - httpx
371
+ - pydantic v2
372
+
373
+ ## License
374
+
375
+ MIT