scrapedatshi 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapedatshi-0.1.0/.env.example +9 -0
- scrapedatshi-0.1.0/.gitignore +39 -0
- scrapedatshi-0.1.0/PKG-INFO +317 -0
- scrapedatshi-0.1.0/README.md +285 -0
- scrapedatshi-0.1.0/pyproject.toml +70 -0
- scrapedatshi-0.1.0/scrapedatshi/__init__.py +67 -0
- scrapedatshi-0.1.0/scrapedatshi/client.py +247 -0
- scrapedatshi-0.1.0/scrapedatshi/exceptions.py +41 -0
- scrapedatshi-0.1.0/scrapedatshi/models.py +175 -0
- scrapedatshi-0.1.0/scrapedatshi/pipeline.py +550 -0
- scrapedatshi-0.1.0/tests/__init__.py +0 -0
- scrapedatshi-0.1.0/tests/test_client.py +265 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# scrapedatshi SDK — environment variables
|
|
2
|
+
# Copy this file to .env and fill in your values.
|
|
3
|
+
# Never commit your actual .env file to version control.
|
|
4
|
+
|
|
5
|
+
# Your scrapedatshi API key (get one at https://scrapedatshi.com/portal/register)
|
|
6
|
+
SCRAPEDATSHI_API_KEY=sds_your_key_here
|
|
7
|
+
|
|
8
|
+
# Optional: override the API base URL (for self-hosted or staging environments)
|
|
9
|
+
# SCRAPEDATSHI_BASE_URL=https://api.scrapedatshi.com
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.pyo
|
|
5
|
+
*.pyd
|
|
6
|
+
.Python
|
|
7
|
+
*.egg
|
|
8
|
+
*.egg-info/
|
|
9
|
+
dist/
|
|
10
|
+
build/
|
|
11
|
+
.eggs/
|
|
12
|
+
*.whl
|
|
13
|
+
|
|
14
|
+
# Virtual environments
|
|
15
|
+
.venv/
|
|
16
|
+
venv/
|
|
17
|
+
env/
|
|
18
|
+
ENV/
|
|
19
|
+
|
|
20
|
+
# Environment variables
|
|
21
|
+
.env
|
|
22
|
+
|
|
23
|
+
# Testing
|
|
24
|
+
.pytest_cache/
|
|
25
|
+
.coverage
|
|
26
|
+
htmlcov/
|
|
27
|
+
.tox/
|
|
28
|
+
|
|
29
|
+
# Type checking
|
|
30
|
+
.mypy_cache/
|
|
31
|
+
|
|
32
|
+
# IDE
|
|
33
|
+
.vscode/
|
|
34
|
+
.idea/
|
|
35
|
+
*.swp
|
|
36
|
+
*.swo
|
|
37
|
+
|
|
38
|
+
# Distribution
|
|
39
|
+
dist/
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scrapedatshi
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Official Python SDK for the scrapedatshi RAG pipeline API
|
|
5
|
+
Project-URL: Homepage, https://scrapedatshi.com
|
|
6
|
+
Project-URL: Documentation, https://docs.scrapedatshi.com/sdk/python
|
|
7
|
+
Project-URL: Repository, https://github.com/mxchris18/scrapedatshi-py
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/mxchris18/scrapedatshi-py/issues
|
|
9
|
+
Author-email: scrapedatshi <hello@scrapedatshi.com>
|
|
10
|
+
License: MIT
|
|
11
|
+
Keywords: ai,chunking,embeddings,llm,rag,retrieval-augmented-generation,scraping,vector-database
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Requires-Dist: httpx>=0.27.0
|
|
24
|
+
Requires-Dist: pydantic>=2.0.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
27
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: respx>=0.21; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# scrapedatshi-py
|
|
34
|
+
|
|
35
|
+
Official Python SDK for the [scrapedatshi](https://scrapedatshi.com) RAG pipeline API.
|
|
36
|
+
|
|
37
|
+
Scrape URLs, chunk documents, embed content, and inject into vector databases — all from a clean, typed Python interface.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Installation
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install scrapedatshi
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Requires Python 3.10+.
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Quick Start
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from scrapedatshi import ScrapedatshiClient
|
|
55
|
+
|
|
56
|
+
client = ScrapedatshiClient(api_key="sds_...")
|
|
57
|
+
|
|
58
|
+
# Chunk a URL to JSON (all tiers — no embedding required)
|
|
59
|
+
result = client.pipeline.chunk_url("https://docs.example.com")
|
|
60
|
+
|
|
61
|
+
print(f"Got {result.total_chunks} chunks")
|
|
62
|
+
for chunk in result.chunks:
|
|
63
|
+
print(chunk.content[:80])
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## Authentication
|
|
69
|
+
|
|
70
|
+
Pass your API key directly or set the `SCRAPEDATSHI_API_KEY` environment variable:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
export SCRAPEDATSHI_API_KEY="sds_..."
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
# Explicit key
|
|
78
|
+
client = ScrapedatshiClient(api_key="sds_...")
|
|
79
|
+
|
|
80
|
+
# From environment variable
|
|
81
|
+
client = ScrapedatshiClient()
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Get your API key at [scrapedatshi.com/portal/register](https://scrapedatshi.com/portal/register).
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Pipeline Methods
|
|
89
|
+
|
|
90
|
+
### Chunk to JSON (all tiers)
|
|
91
|
+
|
|
92
|
+
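No embedding or vector DB required. Returns structured JSON chunks from any source. Chunking a URL or a local file is available on all tiers; sitemap crawling requires the Basic tier or higher (see [Tier Limits](#tier-limits)).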
|
|
93
|
+
|
|
94
|
+
#### Chunk a URL
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
result = client.pipeline.chunk_url("https://docs.example.com")
|
|
98
|
+
|
|
99
|
+
# result.chunks → list[Chunk]
|
|
100
|
+
# result.total_chunks → int
|
|
101
|
+
# result.source → str (the URL)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
#### Chunk a local file
|
|
105
|
+
|
|
106
|
+
Supports PDF, DOCX, TXT, MD, and HTML.
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
result = client.pipeline.chunk_file("./docs/manual.pdf")
|
|
110
|
+
|
|
111
|
+
print(f"Got {result.total_chunks} chunks from {result.source}")
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
#### Crawl a website (Basic tier+)
|
|
115
|
+
|
|
116
|
+
Crawls the site via its sitemap and chunks each page it finds, up to `max_pages` (capped by your tier's "Max pages / crawl" limit).
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
result = client.pipeline.crawl("https://example.com", max_pages=10)
|
|
120
|
+
|
|
121
|
+
print(f"Crawled {result.pages_crawled} pages → {result.total_chunks} chunks")
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
### Full Pipeline — Embed + Inject (Pro/Enterprise)
|
|
127
|
+
|
|
128
|
+
Scrape, embed, and inject directly into your vector database in one call.
|
|
129
|
+
|
|
130
|
+
#### Sync a URL
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
result = client.pipeline.sync(
|
|
134
|
+
url="https://docs.example.com",
|
|
135
|
+
embedding_provider="openai",
|
|
136
|
+
embedding_api_key="sk-...",
|
|
137
|
+
vector_db="pinecone",
|
|
138
|
+
vector_db_api_key="pc-...",
|
|
139
|
+
index_name="my-docs",
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
print(f"Upserted {result.vectors_upserted} vectors ({result.total_tokens} tokens)")
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
#### Ingest a local file
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
result = client.pipeline.ingest(
|
|
149
|
+
file_path="./docs/manual.pdf",
|
|
150
|
+
embedding_provider="openai",
|
|
151
|
+
embedding_api_key="sk-...",
|
|
152
|
+
vector_db="pinecone",
|
|
153
|
+
vector_db_api_key="pc-...",
|
|
154
|
+
index_name="my-docs",
|
|
155
|
+
)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
### Contextual Retrieval (RAG 2.0) — Basic tier+
|
|
161
|
+
|
|
162
|
+
Prepend an LLM-generated document summary to every chunk before embedding, dramatically improving retrieval accuracy.
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
result = client.pipeline.chunk_url(
|
|
166
|
+
"https://docs.example.com",
|
|
167
|
+
contextual_retrieval=True,
|
|
168
|
+
llm_provider="openai",
|
|
169
|
+
llm_api_key="sk-...",
|
|
170
|
+
llm_model="gpt-4o-mini",
|
|
171
|
+
)
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Supported LLM providers: `openai`, `anthropic`, `gemini`
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## Async Support
|
|
179
|
+
|
|
180
|
+
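All pipeline methods have an `_async` variant (for example, `chunk_url_async`) for use with `asyncio`. Use the client as an async context manager (`async with`), as shown below.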
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
import asyncio
|
|
184
|
+
from scrapedatshi import ScrapedatshiClient
|
|
185
|
+
|
|
186
|
+
async def main():
|
|
187
|
+
async with ScrapedatshiClient(api_key="sds_...") as client:
|
|
188
|
+
result = await client.pipeline.chunk_url_async("https://docs.example.com")
|
|
189
|
+
print(f"Got {result.total_chunks} chunks")
|
|
190
|
+
|
|
191
|
+
asyncio.run(main())
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
#### Parallel processing with `asyncio.gather`
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
async def main():
|
|
198
|
+
async with ScrapedatshiClient(api_key="sds_...") as client:
|
|
199
|
+
urls = [
|
|
200
|
+
"https://docs.example.com/page1",
|
|
201
|
+
"https://docs.example.com/page2",
|
|
202
|
+
"https://docs.example.com/page3",
|
|
203
|
+
]
|
|
204
|
+
results = await asyncio.gather(
|
|
205
|
+
*[client.pipeline.chunk_url_async(url) for url in urls]
|
|
206
|
+
)
|
|
207
|
+
total = sum(r.total_chunks for r in results)
|
|
208
|
+
print(f"Processed {len(urls)} URLs → {total} total chunks")
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Response Models
|
|
214
|
+
|
|
215
|
+
All methods return typed Pydantic models with full IDE autocomplete support.
|
|
216
|
+
|
|
217
|
+
### `ChunkResult`
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
result.chunks # list[Chunk]
|
|
221
|
+
result.total_chunks # int
|
|
222
|
+
result.source # str
|
|
223
|
+
result.contextual_retrieval_used # bool
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### `Chunk`
|
|
227
|
+
|
|
228
|
+
```python
|
|
229
|
+
chunk.content # str — the chunk text
|
|
230
|
+
chunk.token_estimate # int — estimated token count
|
|
231
|
+
chunk.metadata # dict — source URL, page number, etc.
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### `CrawlChunkResult`
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
result.chunks # list[Chunk]
|
|
238
|
+
result.total_chunks # int
|
|
239
|
+
result.pages_crawled # int
|
|
240
|
+
result.source_url # str
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### `SyncResult` / `IngestResult`
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
result.status # "success" | "error"
|
|
247
|
+
result.chunks_created # int
|
|
248
|
+
result.vectors_upserted # int
|
|
249
|
+
result.total_tokens # int
|
|
250
|
+
result.embedding_provider # str
|
|
251
|
+
result.vector_db_provider # str
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
## Error Handling
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
from scrapedatshi.exceptions import (
|
|
260
|
+
AuthError, # Invalid or missing API key (401/403)
|
|
261
|
+
TierError, # Feature not available on your plan (403)
|
|
262
|
+
RateLimitError, # Monthly or per-minute limit exceeded (429)
|
|
263
|
+
ValidationError, # Bad request payload (422)
|
|
264
|
+
ServerError, # API server error (5xx)
|
|
265
|
+
TimeoutError, # Request timed out
|
|
266
|
+
ScrapedatshiError # Base exception — catch-all
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
try:
|
|
270
|
+
result = client.pipeline.sync(
|
|
271
|
+
url="https://docs.example.com",
|
|
272
|
+
embedding_provider="openai",
|
|
273
|
+
embedding_api_key="sk-...",
|
|
274
|
+
vector_db="pinecone",
|
|
275
|
+
vector_db_api_key="pc-...",
|
|
276
|
+
index_name="my-docs",
|
|
277
|
+
)
|
|
278
|
+
except TierError as e:
|
|
279
|
+
print(f"Upgrade required: {e.message}")
|
|
280
|
+
except RateLimitError as e:
|
|
281
|
+
print(f"Rate limit hit: {e.message}")
|
|
282
|
+
except ScrapedatshiError as e:
|
|
283
|
+
print(f"API error {e.status_code}: {e.message}")
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
---
|
|
287
|
+
|
|
288
|
+
## Tier Limits
|
|
289
|
+
|
|
290
|
+
| Feature | Free | Basic | Pro | Enterprise |
|
|
291
|
+
|---|---|---|---|---|
|
|
292
|
+
| Price | $0/mo | $9/mo | $29/mo | $49/mo + usage |
|
|
293
|
+
| Chunk to JSON | ✓ | ✓ | ✓ | ✓ |
|
|
294
|
+
| Sitemap Crawl | — | ✓ | ✓ | ✓ |
|
|
295
|
+
| Contextual Retrieval | — | ✓ | ✓ | ✓ |
|
|
296
|
+
| Full Pipeline | — | — | ✓ | ✓ |
|
|
297
|
+
| Deep Spider Crawl | — | — | ✓ | ✓ |
|
|
298
|
+
| Max pages / crawl | 5 | 10 | 25 | 50 |
|
|
299
|
+
| Max chunks / request | 500 | 2,000 | 10,000 | Unlimited |
|
|
300
|
+
| Concurrent requests | 1 | 3 | 10 | 25 |
|
|
301
|
+
|
|
302
|
+
---
|
|
303
|
+
|
|
304
|
+
## Development
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
git clone https://github.com/mxchris18/scrapedatshi-py
|
|
308
|
+
cd scrapedatshi-py
|
|
309
|
+
pip install -e ".[dev]"
|
|
310
|
+
pytest
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
---
|
|
314
|
+
|
|
315
|
+
## License
|
|
316
|
+
|
|
317
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
# scrapedatshi-py
|
|
2
|
+
|
|
3
|
+
Official Python SDK for the [scrapedatshi](https://scrapedatshi.com) RAG pipeline API.
|
|
4
|
+
|
|
5
|
+
Scrape URLs, chunk documents, embed content, and inject into vector databases — all from a clean, typed Python interface.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install scrapedatshi
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Requires Python 3.10+.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Quick Start
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from scrapedatshi import ScrapedatshiClient
|
|
23
|
+
|
|
24
|
+
client = ScrapedatshiClient(api_key="sds_...")
|
|
25
|
+
|
|
26
|
+
# Chunk a URL to JSON (all tiers — no embedding required)
|
|
27
|
+
result = client.pipeline.chunk_url("https://docs.example.com")
|
|
28
|
+
|
|
29
|
+
print(f"Got {result.total_chunks} chunks")
|
|
30
|
+
for chunk in result.chunks:
|
|
31
|
+
print(chunk.content[:80])
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Authentication
|
|
37
|
+
|
|
38
|
+
Pass your API key directly or set the `SCRAPEDATSHI_API_KEY` environment variable:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
export SCRAPEDATSHI_API_KEY="sds_..."
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
# Explicit key
|
|
46
|
+
client = ScrapedatshiClient(api_key="sds_...")
|
|
47
|
+
|
|
48
|
+
# From environment variable
|
|
49
|
+
client = ScrapedatshiClient()
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Get your API key at [scrapedatshi.com/portal/register](https://scrapedatshi.com/portal/register).
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Pipeline Methods
|
|
57
|
+
|
|
58
|
+
### Chunk to JSON (all tiers)
|
|
59
|
+
|
|
60
|
+
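No embedding or vector DB required. Returns structured JSON chunks from any source. Chunking a URL or a local file is available on all tiers; sitemap crawling requires the Basic tier or higher (see [Tier Limits](#tier-limits)).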
|
|
61
|
+
|
|
62
|
+
#### Chunk a URL
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
result = client.pipeline.chunk_url("https://docs.example.com")
|
|
66
|
+
|
|
67
|
+
# result.chunks → list[Chunk]
|
|
68
|
+
# result.total_chunks → int
|
|
69
|
+
# result.source → str (the URL)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
#### Chunk a local file
|
|
73
|
+
|
|
74
|
+
Supports PDF, DOCX, TXT, MD, and HTML.
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
result = client.pipeline.chunk_file("./docs/manual.pdf")
|
|
78
|
+
|
|
79
|
+
print(f"Got {result.total_chunks} chunks from {result.source}")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
#### Crawl a website (Basic tier+)
|
|
83
|
+
|
|
84
|
+
Crawls the site via its sitemap and chunks each page it finds, up to `max_pages` (capped by your tier's "Max pages / crawl" limit).
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
result = client.pipeline.crawl("https://example.com", max_pages=10)
|
|
88
|
+
|
|
89
|
+
print(f"Crawled {result.pages_crawled} pages → {result.total_chunks} chunks")
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
### Full Pipeline — Embed + Inject (Pro/Enterprise)
|
|
95
|
+
|
|
96
|
+
Scrape, embed, and inject directly into your vector database in one call.
|
|
97
|
+
|
|
98
|
+
#### Sync a URL
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
result = client.pipeline.sync(
|
|
102
|
+
url="https://docs.example.com",
|
|
103
|
+
embedding_provider="openai",
|
|
104
|
+
embedding_api_key="sk-...",
|
|
105
|
+
vector_db="pinecone",
|
|
106
|
+
vector_db_api_key="pc-...",
|
|
107
|
+
index_name="my-docs",
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
print(f"Upserted {result.vectors_upserted} vectors ({result.total_tokens} tokens)")
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
#### Ingest a local file
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
result = client.pipeline.ingest(
|
|
117
|
+
file_path="./docs/manual.pdf",
|
|
118
|
+
embedding_provider="openai",
|
|
119
|
+
embedding_api_key="sk-...",
|
|
120
|
+
vector_db="pinecone",
|
|
121
|
+
vector_db_api_key="pc-...",
|
|
122
|
+
index_name="my-docs",
|
|
123
|
+
)
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
### Contextual Retrieval (RAG 2.0) — Basic tier+
|
|
129
|
+
|
|
130
|
+
Prepend an LLM-generated document summary to every chunk before embedding, dramatically improving retrieval accuracy.
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
result = client.pipeline.chunk_url(
|
|
134
|
+
"https://docs.example.com",
|
|
135
|
+
contextual_retrieval=True,
|
|
136
|
+
llm_provider="openai",
|
|
137
|
+
llm_api_key="sk-...",
|
|
138
|
+
llm_model="gpt-4o-mini",
|
|
139
|
+
)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Supported LLM providers: `openai`, `anthropic`, `gemini`
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
## Async Support
|
|
147
|
+
|
|
148
|
+
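All pipeline methods have an `_async` variant (for example, `chunk_url_async`) for use with `asyncio`. Use the client as an async context manager (`async with`), as shown below.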
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
import asyncio
|
|
152
|
+
from scrapedatshi import ScrapedatshiClient
|
|
153
|
+
|
|
154
|
+
async def main():
|
|
155
|
+
async with ScrapedatshiClient(api_key="sds_...") as client:
|
|
156
|
+
result = await client.pipeline.chunk_url_async("https://docs.example.com")
|
|
157
|
+
print(f"Got {result.total_chunks} chunks")
|
|
158
|
+
|
|
159
|
+
asyncio.run(main())
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
#### Parallel processing with `asyncio.gather`
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
async def main():
|
|
166
|
+
async with ScrapedatshiClient(api_key="sds_...") as client:
|
|
167
|
+
urls = [
|
|
168
|
+
"https://docs.example.com/page1",
|
|
169
|
+
"https://docs.example.com/page2",
|
|
170
|
+
"https://docs.example.com/page3",
|
|
171
|
+
]
|
|
172
|
+
results = await asyncio.gather(
|
|
173
|
+
*[client.pipeline.chunk_url_async(url) for url in urls]
|
|
174
|
+
)
|
|
175
|
+
total = sum(r.total_chunks for r in results)
|
|
176
|
+
print(f"Processed {len(urls)} URLs → {total} total chunks")
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
## Response Models
|
|
182
|
+
|
|
183
|
+
All methods return typed Pydantic models with full IDE autocomplete support.
|
|
184
|
+
|
|
185
|
+
### `ChunkResult`
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
result.chunks # list[Chunk]
|
|
189
|
+
result.total_chunks # int
|
|
190
|
+
result.source # str
|
|
191
|
+
result.contextual_retrieval_used # bool
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### `Chunk`
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
chunk.content # str — the chunk text
|
|
198
|
+
chunk.token_estimate # int — estimated token count
|
|
199
|
+
chunk.metadata # dict — source URL, page number, etc.
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### `CrawlChunkResult`
|
|
203
|
+
|
|
204
|
+
```python
|
|
205
|
+
result.chunks # list[Chunk]
|
|
206
|
+
result.total_chunks # int
|
|
207
|
+
result.pages_crawled # int
|
|
208
|
+
result.source_url # str
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### `SyncResult` / `IngestResult`
|
|
212
|
+
|
|
213
|
+
```python
|
|
214
|
+
result.status # "success" | "error"
|
|
215
|
+
result.chunks_created # int
|
|
216
|
+
result.vectors_upserted # int
|
|
217
|
+
result.total_tokens # int
|
|
218
|
+
result.embedding_provider # str
|
|
219
|
+
result.vector_db_provider # str
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## Error Handling
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
from scrapedatshi.exceptions import (
|
|
228
|
+
AuthError, # Invalid or missing API key (401/403)
|
|
229
|
+
TierError, # Feature not available on your plan (403)
|
|
230
|
+
RateLimitError, # Monthly or per-minute limit exceeded (429)
|
|
231
|
+
ValidationError, # Bad request payload (422)
|
|
232
|
+
ServerError, # API server error (5xx)
|
|
233
|
+
TimeoutError, # Request timed out
|
|
234
|
+
ScrapedatshiError # Base exception — catch-all
|
|
235
|
+
)
|
|
236
|
+
|
|
237
|
+
try:
|
|
238
|
+
result = client.pipeline.sync(
|
|
239
|
+
url="https://docs.example.com",
|
|
240
|
+
embedding_provider="openai",
|
|
241
|
+
embedding_api_key="sk-...",
|
|
242
|
+
vector_db="pinecone",
|
|
243
|
+
vector_db_api_key="pc-...",
|
|
244
|
+
index_name="my-docs",
|
|
245
|
+
)
|
|
246
|
+
except TierError as e:
|
|
247
|
+
print(f"Upgrade required: {e.message}")
|
|
248
|
+
except RateLimitError as e:
|
|
249
|
+
print(f"Rate limit hit: {e.message}")
|
|
250
|
+
except ScrapedatshiError as e:
|
|
251
|
+
print(f"API error {e.status_code}: {e.message}")
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
## Tier Limits
|
|
257
|
+
|
|
258
|
+
| Feature | Free | Basic | Pro | Enterprise |
|
|
259
|
+
|---|---|---|---|---|
|
|
260
|
+
| Price | $0/mo | $9/mo | $29/mo | $49/mo + usage |
|
|
261
|
+
| Chunk to JSON | ✓ | ✓ | ✓ | ✓ |
|
|
262
|
+
| Sitemap Crawl | — | ✓ | ✓ | ✓ |
|
|
263
|
+
| Contextual Retrieval | — | ✓ | ✓ | ✓ |
|
|
264
|
+
| Full Pipeline | — | — | ✓ | ✓ |
|
|
265
|
+
| Deep Spider Crawl | — | — | ✓ | ✓ |
|
|
266
|
+
| Max pages / crawl | 5 | 10 | 25 | 50 |
|
|
267
|
+
| Max chunks / request | 500 | 2,000 | 10,000 | Unlimited |
|
|
268
|
+
| Concurrent requests | 1 | 3 | 10 | 25 |
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
## Development
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
git clone https://github.com/mxchris18/scrapedatshi-py
|
|
276
|
+
cd scrapedatshi-py
|
|
277
|
+
pip install -e ".[dev]"
|
|
278
|
+
pytest
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
## License
|
|
284
|
+
|
|
285
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "scrapedatshi"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Official Python SDK for the scrapedatshi RAG pipeline API"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [{ name = "scrapedatshi", email = "hello@scrapedatshi.com" }]
|
|
12
|
+
keywords = [
|
|
13
|
+
"rag",
|
|
14
|
+
"retrieval-augmented-generation",
|
|
15
|
+
"scraping",
|
|
16
|
+
"chunking",
|
|
17
|
+
"embeddings",
|
|
18
|
+
"vector-database",
|
|
19
|
+
"ai",
|
|
20
|
+
"llm",
|
|
21
|
+
]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 4 - Beta",
|
|
24
|
+
"Intended Audience :: Developers",
|
|
25
|
+
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
31
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
32
|
+
"Typing :: Typed",
|
|
33
|
+
]
|
|
34
|
+
requires-python = ">=3.10"
|
|
35
|
+
dependencies = ["httpx>=0.27.0", "pydantic>=2.0.0"]
|
|
36
|
+
|
|
37
|
+
[project.optional-dependencies]
|
|
38
|
+
dev = [
|
|
39
|
+
"pytest>=8.0",
|
|
40
|
+
"pytest-asyncio>=0.23",
|
|
41
|
+
"respx>=0.21", # httpx mock transport for tests
|
|
42
|
+
"ruff>=0.4",
|
|
43
|
+
"mypy>=1.10",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
[project.urls]
|
|
47
|
+
Homepage = "https://scrapedatshi.com"
|
|
48
|
+
Documentation = "https://docs.scrapedatshi.com/sdk/python"
|
|
49
|
+
Repository = "https://github.com/mxchris18/scrapedatshi-py"
|
|
50
|
+
"Bug Tracker" = "https://github.com/mxchris18/scrapedatshi-py/issues"
|
|
51
|
+
|
|
52
|
+
[tool.hatch.build.targets.wheel]
|
|
53
|
+
packages = ["scrapedatshi"]
|
|
54
|
+
|
|
55
|
+
[tool.ruff]
|
|
56
|
+
line-length = 100
|
|
57
|
+
target-version = "py310"
|
|
58
|
+
|
|
59
|
+
[tool.ruff.lint]
|
|
60
|
+
select = ["E", "F", "I", "UP", "B"]
|
|
61
|
+
ignore = ["E501"]
|
|
62
|
+
|
|
63
|
+
[tool.mypy]
|
|
64
|
+
python_version = "3.10"
|
|
65
|
+
strict = true
|
|
66
|
+
ignore_missing_imports = true
|
|
67
|
+
|
|
68
|
+
[tool.pytest.ini_options]
|
|
69
|
+
asyncio_mode = "auto"
|
|
70
|
+
testpaths = ["tests"]
|