krira-augment 2.1.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- krira_augment/__init__.py +515 -0
- krira_augment/_python/__init__.py +14 -0
- krira_augment/_python/cleaning.py +394 -0
- krira_augment/_python/pipeline.py +738 -0
- krira_augment/_python/transformation.py +551 -0
- krira_augment/_rust.cp313-win_amd64.pyd +0 -0
- krira_augment-2.1.3.dist-info/METADATA +722 -0
- krira_augment-2.1.3.dist-info/RECORD +10 -0
- krira_augment-2.1.3.dist-info/WHEEL +4 -0
- krira_augment-2.1.3.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,722 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: krira-augment
|
|
3
|
+
Version: 2.1.3
|
|
4
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
5
|
+
Classifier: Intended Audience :: Developers
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Programming Language :: Rust
|
|
14
|
+
Classifier: Topic :: Text Processing
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Dist: openpyxl>=3.0 ; extra == 'xlsx'
|
|
17
|
+
Requires-Dist: pdfplumber>=0.10 ; extra == 'pdf'
|
|
18
|
+
Requires-Dist: python-docx>=0.8 ; extra == 'docx'
|
|
19
|
+
Requires-Dist: polars>=0.20 ; extra == 'csv'
|
|
20
|
+
Requires-Dist: openpyxl>=3.0 ; extra == 'all'
|
|
21
|
+
Requires-Dist: pdfplumber>=0.10 ; extra == 'all'
|
|
22
|
+
Requires-Dist: python-docx>=0.8 ; extra == 'all'
|
|
23
|
+
Requires-Dist: polars>=0.20 ; extra == 'all'
|
|
24
|
+
Requires-Dist: pytest>=7.0 ; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest-cov>=4.0 ; extra == 'dev'
|
|
26
|
+
Requires-Dist: black>=23.0 ; extra == 'dev'
|
|
27
|
+
Requires-Dist: mypy>=1.0 ; extra == 'dev'
|
|
28
|
+
Requires-Dist: ruff>=0.1 ; extra == 'dev'
|
|
29
|
+
Provides-Extra: xlsx
|
|
30
|
+
Provides-Extra: pdf
|
|
31
|
+
Provides-Extra: docx
|
|
32
|
+
Provides-Extra: csv
|
|
33
|
+
Provides-Extra: all
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
License-File: LICENSE
|
|
36
|
+
Summary: Production-grade document chunking library for RAG systems - Rust-powered Python library
|
|
37
|
+
Keywords: rag,chunking,nlp,document-processing,ai,rust,pyo3
|
|
38
|
+
Author-email: Krira Labs <contact@kriralabs.com>
|
|
39
|
+
License: MIT
|
|
40
|
+
Requires-Python: >=3.10
|
|
41
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
42
|
+
Project-URL: Homepage, https://github.com/Krira-Labs/krira-chunker
|
|
43
|
+
Project-URL: Repository, https://github.com/Krira-Labs/krira-chunker
|
|
44
|
+
Project-URL: Documentation, https://github.com/Krira-Labs/krira-chunker#readme
|
|
45
|
+
Project-URL: Issues, https://github.com/Krira-Labs/krira-chunker/issues
|
|
46
|
+
|
|
47
|
+
# Krira Augment
|
|
48
|
+
|
|
49
|
+
**High-Performance Rust Chunking Engine for RAG Pipelines**
|
|
50
|
+
|
|
51
|
+
[PyPI version](https://badge.fury.io/py/krira-augment)
|
|
52
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
53
|
+
[Built with Rust](https://www.rust-lang.org/)
|
|
54
|
+
|
|
55
|
+
Process gigabytes of text in seconds. **40x faster than LangChain** with **O(1) memory usage**.
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Performance Benchmarks
|
|
60
|
+
|
|
61
|
+
| Dataset Size | LangChain/Pandas | Krira (Rust) | Speedup |
|
|
62
|
+
|--------------|------------------|--------------|---------|
|
|
63
|
+
| 100 MB | ~45 sec | 0.8 sec | 56x |
|
|
64
|
+
| 1 GB | ~8.0 min | 12.0 sec | 40x |
|
|
65
|
+
| 10 GB | Timeout / OOM | 2.1 min | N/A (baseline did not finish; Krira stays stable) |
|
|
66
|
+
|
|
67
|
+
**Memory stays constant (O(1)) regardless of file size.**
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Installation
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install krira-augment
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Complete Example: OpenAI + Pinecone
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from krira_augment import Pipeline, PipelineConfig
|
|
83
|
+
import json
|
|
84
|
+
from openai import OpenAI
|
|
85
|
+
from pinecone import Pinecone
|
|
86
|
+
|
|
87
|
+
# API Keys
|
|
88
|
+
OPENAI_API_KEY = "sk-..." # https://platform.openai.com/api-keys
|
|
89
|
+
PINECONE_API_KEY = "pcone-..." # https://app.pinecone.io/
|
|
90
|
+
PINECONE_INDEX_NAME = "my-rag"
|
|
91
|
+
|
|
92
|
+
# Step 1: Chunk the file
|
|
93
|
+
config = PipelineConfig(chunk_size=512, chunk_overlap=50)
|
|
94
|
+
pipeline = Pipeline(config=config)
|
|
95
|
+
result = pipeline.process("sample.csv", output_path="chunks.jsonl")
|
|
96
|
+
|
|
97
|
+
print(f"Chunks Created: {result.chunks_created}")
|
|
98
|
+
print(f"Execution Time: {result.execution_time:.2f}s")
|
|
99
|
+
print(f"Throughput: {result.mb_per_second:.2f} MB/s")
|
|
100
|
+
print(f"Preview: {result.preview_chunks[:3]}")
|
|
101
|
+
|
|
102
|
+
# Step 2: Embed and store
|
|
103
|
+
client = OpenAI(api_key=OPENAI_API_KEY)
|
|
104
|
+
pc = Pinecone(api_key=PINECONE_API_KEY)
|
|
105
|
+
index = pc.Index(PINECONE_INDEX_NAME)
|
|
106
|
+
|
|
107
|
+
with open("chunks.jsonl", "r") as f:
|
|
108
|
+
for line_num, line in enumerate(f, 1):
|
|
109
|
+
chunk = json.loads(line)
|
|
110
|
+
|
|
111
|
+
response = client.embeddings.create(
|
|
112
|
+
input=chunk["text"],
|
|
113
|
+
model="text-embedding-3-small"
|
|
114
|
+
)
|
|
115
|
+
embedding = response.data[0].embedding
|
|
116
|
+
|
|
117
|
+
index.upsert(vectors=[(f"chunk_{line_num}", embedding, chunk.get("metadata", {}))])
|
|
118
|
+
|
|
119
|
+
if line_num % 100 == 0:
|
|
120
|
+
print(f"Processed {line_num} chunks...")
|
|
121
|
+
|
|
122
|
+
print("Done! All chunks embedded and stored in Pinecone.")
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## Other Integrations
|
|
128
|
+
|
|
129
|
+
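After running **Step 1** (chunking), replace **Step 2** with any of these integrations. Each snippet reads `chunks.jsonl` line by line, so keep the `import json` from the complete example above: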
|
|
130
|
+
|
|
131
|
+
### OpenAI + Qdrant
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from openai import OpenAI
|
|
135
|
+
from qdrant_client import QdrantClient
|
|
136
|
+
from qdrant_client.models import PointStruct
|
|
137
|
+
|
|
138
|
+
client = OpenAI(api_key="sk-...")
|
|
139
|
+
qdrant = QdrantClient(url="https://xyz.qdrant.io", api_key="qdrant-...")
|
|
140
|
+
|
|
141
|
+
with open("chunks.jsonl", "r") as f:
|
|
142
|
+
for line_num, line in enumerate(f, 1):
|
|
143
|
+
chunk = json.loads(line)
|
|
144
|
+
response = client.embeddings.create(input=chunk["text"], model="text-embedding-3-small")
|
|
145
|
+
embedding = response.data[0].embedding
|
|
146
|
+
qdrant.upsert(collection_name="my-chunks", points=[PointStruct(id=line_num, vector=embedding, payload=chunk.get("metadata", {}))])
|
|
147
|
+
|
|
148
|
+
if line_num % 100 == 0:
|
|
149
|
+
print(f"Processed {line_num} chunks...")
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### OpenAI + Weaviate
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
import weaviate
|
|
156
|
+
import weaviate.classes as wvc
|
|
157
|
+
from openai import OpenAI
|
|
158
|
+
|
|
159
|
+
# Connect to Weaviate Cloud
|
|
160
|
+
client_w = weaviate.connect_to_wcs(
|
|
161
|
+
cluster_url="https://xyz.weaviate.network",
|
|
162
|
+
auth_credentials=weaviate.auth.AuthApiKey("weaviate-...")
|
|
163
|
+
)
|
|
164
|
+
client_o = OpenAI(api_key="sk-...")
|
|
165
|
+
|
|
166
|
+
# Get collection
|
|
167
|
+
collection = client_w.collections.get("Chunk")
|
|
168
|
+
|
|
169
|
+
with open("chunks.jsonl", "r") as f:
|
|
170
|
+
for line_num, line in enumerate(f, 1):
|
|
171
|
+
chunk = json.loads(line)
|
|
172
|
+
response = client_o.embeddings.create(input=chunk["text"], model="text-embedding-3-small")
|
|
173
|
+
embedding = response.data[0].embedding
|
|
174
|
+
|
|
175
|
+
# Insert with vector
|
|
176
|
+
collection.data.insert(
|
|
177
|
+
properties={"text": chunk["text"], "metadata": str(chunk.get("metadata", {}))},
|
|
178
|
+
vector=embedding
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
if line_num % 100 == 0:
|
|
182
|
+
print(f"Processed {line_num} chunks...")
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Cohere + Pinecone
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
import cohere
|
|
189
|
+
from pinecone import Pinecone
|
|
190
|
+
|
|
191
|
+
co = cohere.Client("co-...")
|
|
192
|
+
pc = Pinecone(api_key="pcone-...")
|
|
193
|
+
index = pc.Index("my-rag")
|
|
194
|
+
|
|
195
|
+
with open("chunks.jsonl", "r") as f:
|
|
196
|
+
for line_num, line in enumerate(f, 1):
|
|
197
|
+
chunk = json.loads(line)
|
|
198
|
+
response = co.embed(texts=[chunk["text"]], model="embed-english-v3.0")
|
|
199
|
+
embedding = response.embeddings[0]
|
|
200
|
+
index.upsert(vectors=[(f"chunk_{line_num}", embedding, chunk.get("metadata", {}))])
|
|
201
|
+
|
|
202
|
+
if line_num % 100 == 0:
|
|
203
|
+
print(f"Processed {line_num} chunks...")
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### Cohere + Qdrant
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
import cohere
|
|
210
|
+
from qdrant_client import QdrantClient
|
|
211
|
+
from qdrant_client.models import PointStruct
|
|
212
|
+
|
|
213
|
+
co = cohere.Client("co-...")
|
|
214
|
+
qdrant = QdrantClient(url="https://xyz.qdrant.io", api_key="qdrant-...")
|
|
215
|
+
|
|
216
|
+
with open("chunks.jsonl", "r") as f:
|
|
217
|
+
for line_num, line in enumerate(f, 1):
|
|
218
|
+
chunk = json.loads(line)
|
|
219
|
+
response = co.embed(texts=[chunk["text"]], model="embed-english-v3.0")
|
|
220
|
+
embedding = response.embeddings[0]
|
|
221
|
+
qdrant.upsert(
|
|
222
|
+
collection_name="my-chunks",
|
|
223
|
+
points=[PointStruct(id=line_num, vector=embedding, payload=chunk.get("metadata", {}))]
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
if line_num % 100 == 0:
|
|
227
|
+
print(f"Processed {line_num} chunks...")
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Local (Sentence Transformers) + ChromaDB (FREE)
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
from sentence_transformers import SentenceTransformer
|
|
234
|
+
import chromadb
|
|
235
|
+
import json
|
|
236
|
+
|
|
237
|
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
238
|
+
client = chromadb.Client()
|
|
239
|
+
collection = client.get_or_create_collection("my_chunks")
|
|
240
|
+
|
|
241
|
+
with open("chunks.jsonl", "r") as f:
|
|
242
|
+
for line_num, line in enumerate(f, 1):
|
|
243
|
+
chunk = json.loads(line)
|
|
244
|
+
embedding = model.encode(chunk["text"])
|
|
245
|
+
|
|
246
|
+
# Handle empty metadata
|
|
247
|
+
meta = chunk.get("metadata")
|
|
248
|
+
if not meta:
|
|
249
|
+
meta = None
|
|
250
|
+
|
|
251
|
+
collection.add(
|
|
252
|
+
ids=[f"chunk_{line_num}"],
|
|
253
|
+
embeddings=[embedding.tolist()],
|
|
254
|
+
metadatas=[meta] if meta else None,
|
|
255
|
+
documents=[chunk["text"]]
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
if line_num % 100 == 0:
|
|
259
|
+
print(f"Processed {line_num} chunks...")
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
### Hugging Face + FAISS (FREE)
|
|
263
|
+
|
|
264
|
+
```python
|
|
265
|
+
from transformers import AutoTokenizer, AutoModel
|
|
266
|
+
import torch
|
|
267
|
+
import faiss
|
|
268
|
+
import numpy as np
|
|
269
|
+
import json
|
|
270
|
+
|
|
271
|
+
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
|
272
|
+
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
|
273
|
+
index = faiss.IndexFlatL2(384)
|
|
274
|
+
|
|
275
|
+
embeddings_list = []
|
|
276
|
+
with open("chunks.jsonl", "r") as f:
|
|
277
|
+
for line_num, line in enumerate(f, 1):
|
|
278
|
+
chunk = json.loads(line)
|
|
279
|
+
inputs = tokenizer(chunk["text"], return_tensors="pt", truncation=True, max_length=512)
|
|
280
|
+
with torch.no_grad():
|
|
281
|
+
outputs = model(**inputs)
|
|
282
|
+
embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
|
283
|
+
embeddings_list.append(embedding)
|
|
284
|
+
|
|
285
|
+
if line_num % 100 == 0:
|
|
286
|
+
print(f"Processed {line_num} chunks...")
|
|
287
|
+
|
|
288
|
+
embeddings_array = np.array(embeddings_list).astype('float32')
|
|
289
|
+
index.add(embeddings_array)
|
|
290
|
+
faiss.write_index(index, "my_vectors.index")
|
|
291
|
+
print("Done! Vectors saved to my_vectors.index")
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
---
|
|
295
|
+
|
|
296
|
+
## Streaming Mode (No Files)
|
|
297
|
+
|
|
298
|
+
Process chunks without saving to disk - maximum efficiency for real-time pipelines:
|
|
299
|
+
|
|
300
|
+
### Complete Example: OpenAI + Pinecone (Streaming)
|
|
301
|
+
|
|
302
|
+
```python
|
|
303
|
+
from krira_augment import Pipeline, PipelineConfig
|
|
304
|
+
from openai import OpenAI
|
|
305
|
+
from pinecone import Pinecone
|
|
306
|
+
|
|
307
|
+
# API Keys
|
|
308
|
+
OPENAI_API_KEY = "sk-..." # https://platform.openai.com/api-keys
|
|
309
|
+
PINECONE_API_KEY = "pcone-..." # https://app.pinecone.io/
|
|
310
|
+
PINECONE_INDEX_NAME = "my-rag"
|
|
311
|
+
|
|
312
|
+
# Initialize
|
|
313
|
+
client = OpenAI(api_key=OPENAI_API_KEY)
|
|
314
|
+
pc = Pinecone(api_key=PINECONE_API_KEY)
|
|
315
|
+
index = pc.Index(PINECONE_INDEX_NAME)
|
|
316
|
+
|
|
317
|
+
# Configure pipeline
|
|
318
|
+
config = PipelineConfig(chunk_size=512, chunk_overlap=50)
|
|
319
|
+
pipeline = Pipeline(config=config)
|
|
320
|
+
|
|
321
|
+
# Stream and embed (no file created)
|
|
322
|
+
chunk_count = 0
|
|
323
|
+
print("Starting streaming pipeline...")
|
|
324
|
+
|
|
325
|
+
for chunk in pipeline.process_stream("data.csv"):
|
|
326
|
+
chunk_count += 1
|
|
327
|
+
|
|
328
|
+
# Embed
|
|
329
|
+
response = client.embeddings.create(
|
|
330
|
+
input=chunk["text"],
|
|
331
|
+
model="text-embedding-3-small"
|
|
332
|
+
)
|
|
333
|
+
embedding = response.data[0].embedding
|
|
334
|
+
|
|
335
|
+
# Store immediately
|
|
336
|
+
index.upsert(vectors=[(
|
|
337
|
+
f"chunk_{chunk_count}",
|
|
338
|
+
embedding,
|
|
339
|
+
chunk["metadata"]
|
|
340
|
+
)])
|
|
341
|
+
|
|
342
|
+
# Progress
|
|
343
|
+
if chunk_count % 100 == 0:
|
|
344
|
+
print(f"Processed {chunk_count} chunks...")
|
|
345
|
+
|
|
346
|
+
print(f"Done! Embedded {chunk_count} chunks. No intermediate file created.")
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
---
|
|
350
|
+
|
|
351
|
+
## Other Streaming Integrations
|
|
352
|
+
|
|
353
|
+
Replace the embedding/storage logic with any of these:
|
|
354
|
+
|
|
355
|
+
### OpenAI + Qdrant (Streaming)
|
|
356
|
+
|
|
357
|
+
```python
|
|
358
|
+
from krira_augment import Pipeline, PipelineConfig
|
|
359
|
+
from openai import OpenAI
|
|
360
|
+
from qdrant_client import QdrantClient
|
|
361
|
+
from qdrant_client.models import PointStruct
|
|
362
|
+
|
|
363
|
+
# Initialize
|
|
364
|
+
client = OpenAI(api_key="sk-...")
|
|
365
|
+
qdrant = QdrantClient(url="https://xyz.qdrant.io", api_key="qdrant-...")
|
|
366
|
+
|
|
367
|
+
# Configure and stream
|
|
368
|
+
config = PipelineConfig(chunk_size=512, chunk_overlap=50)
|
|
369
|
+
pipeline = Pipeline(config=config)
|
|
370
|
+
|
|
371
|
+
chunk_count = 0
|
|
372
|
+
for chunk in pipeline.process_stream("data.csv"):
|
|
373
|
+
chunk_count += 1
|
|
374
|
+
|
|
375
|
+
# Embed
|
|
376
|
+
response = client.embeddings.create(input=chunk["text"], model="text-embedding-3-small")
|
|
377
|
+
embedding = response.data[0].embedding
|
|
378
|
+
|
|
379
|
+
# Store
|
|
380
|
+
qdrant.upsert(
|
|
381
|
+
collection_name="my-chunks",
|
|
382
|
+
points=[PointStruct(id=chunk_count, vector=embedding, payload=chunk["metadata"])]
|
|
383
|
+
)
|
|
384
|
+
|
|
385
|
+
if chunk_count % 100 == 0:
|
|
386
|
+
print(f"Processed {chunk_count} chunks...")
|
|
387
|
+
|
|
388
|
+
print(f"Done! {chunk_count} chunks embedded.")
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
### OpenAI + Weaviate (Streaming)
|
|
392
|
+
|
|
393
|
+
```python
|
|
394
|
+
from krira_augment import Pipeline, PipelineConfig
|
|
395
|
+
from openai import OpenAI
|
|
396
|
+
import weaviate
|
|
397
|
+
|
|
398
|
+
# Initialize
|
|
399
|
+
client_o = OpenAI(api_key="sk-...")
|
|
400
|
+
client_w = weaviate.connect_to_wcs(
|
|
401
|
+
cluster_url="https://xyz.weaviate.network",
|
|
402
|
+
auth_credentials=weaviate.auth.AuthApiKey("weaviate-...")
|
|
403
|
+
)
|
|
404
|
+
collection = client_w.collections.get("Chunk")
|
|
405
|
+
|
|
406
|
+
# Configure and stream
|
|
407
|
+
config = PipelineConfig(chunk_size=512, chunk_overlap=50)
|
|
408
|
+
pipeline = Pipeline(config=config)
|
|
409
|
+
|
|
410
|
+
chunk_count = 0
|
|
411
|
+
for chunk in pipeline.process_stream("data.csv"):
|
|
412
|
+
chunk_count += 1
|
|
413
|
+
|
|
414
|
+
# Embed
|
|
415
|
+
response = client_o.embeddings.create(input=chunk["text"], model="text-embedding-3-small")
|
|
416
|
+
embedding = response.data[0].embedding
|
|
417
|
+
|
|
418
|
+
# Store
|
|
419
|
+
collection.data.insert(
|
|
420
|
+
properties={"text": chunk["text"], "metadata": str(chunk["metadata"])},
|
|
421
|
+
vector=embedding
|
|
422
|
+
)
|
|
423
|
+
|
|
424
|
+
if chunk_count % 100 == 0:
|
|
425
|
+
print(f"Processed {chunk_count} chunks...")
|
|
426
|
+
|
|
427
|
+
print(f"Done! {chunk_count} chunks embedded.")
|
|
428
|
+
```
|
|
429
|
+
|
|
430
|
+
### Cohere + Pinecone (Streaming)
|
|
431
|
+
|
|
432
|
+
```python
|
|
433
|
+
from krira_augment import Pipeline, PipelineConfig
|
|
434
|
+
import cohere
|
|
435
|
+
from pinecone import Pinecone
|
|
436
|
+
|
|
437
|
+
# Initialize
|
|
438
|
+
co = cohere.Client("co-...")
|
|
439
|
+
pc = Pinecone(api_key="pcone-...")
|
|
440
|
+
index = pc.Index("my-rag")
|
|
441
|
+
|
|
442
|
+
# Configure and stream
|
|
443
|
+
config = PipelineConfig(chunk_size=512, chunk_overlap=50)
|
|
444
|
+
pipeline = Pipeline(config=config)
|
|
445
|
+
|
|
446
|
+
chunk_count = 0
|
|
447
|
+
for chunk in pipeline.process_stream("data.csv"):
|
|
448
|
+
chunk_count += 1
|
|
449
|
+
|
|
450
|
+
# Embed
|
|
451
|
+
response = co.embed(texts=[chunk["text"]], model="embed-english-v3.0")
|
|
452
|
+
embedding = response.embeddings[0]
|
|
453
|
+
|
|
454
|
+
# Store
|
|
455
|
+
index.upsert(vectors=[(f"chunk_{chunk_count}", embedding, chunk["metadata"])])
|
|
456
|
+
|
|
457
|
+
if chunk_count % 100 == 0:
|
|
458
|
+
print(f"Processed {chunk_count} chunks...")
|
|
459
|
+
|
|
460
|
+
print(f"Done! {chunk_count} chunks embedded.")
|
|
461
|
+
```
|
|
462
|
+
|
|
463
|
+
### Cohere + Qdrant (Streaming)
|
|
464
|
+
|
|
465
|
+
```python
|
|
466
|
+
from krira_augment import Pipeline, PipelineConfig
|
|
467
|
+
import cohere
|
|
468
|
+
from qdrant_client import QdrantClient
|
|
469
|
+
from qdrant_client.models import PointStruct
|
|
470
|
+
|
|
471
|
+
# Initialize
|
|
472
|
+
co = cohere.Client("co-...")
|
|
473
|
+
qdrant = QdrantClient(url="https://xyz.qdrant.io", api_key="qdrant-...")
|
|
474
|
+
|
|
475
|
+
# Configure and stream
|
|
476
|
+
config = PipelineConfig(chunk_size=512, chunk_overlap=50)
|
|
477
|
+
pipeline = Pipeline(config=config)
|
|
478
|
+
|
|
479
|
+
chunk_count = 0
|
|
480
|
+
for chunk in pipeline.process_stream("data.csv"):
|
|
481
|
+
chunk_count += 1
|
|
482
|
+
|
|
483
|
+
# Embed
|
|
484
|
+
response = co.embed(texts=[chunk["text"]], model="embed-english-v3.0")
|
|
485
|
+
embedding = response.embeddings[0]
|
|
486
|
+
|
|
487
|
+
# Store
|
|
488
|
+
qdrant.upsert(
|
|
489
|
+
collection_name="my-chunks",
|
|
490
|
+
points=[PointStruct(id=chunk_count, vector=embedding, payload=chunk["metadata"])]
|
|
491
|
+
)
|
|
492
|
+
|
|
493
|
+
if chunk_count % 100 == 0:
|
|
494
|
+
print(f"Processed {chunk_count} chunks...")
|
|
495
|
+
|
|
496
|
+
print(f"Done! {chunk_count} chunks embedded.")
|
|
497
|
+
```
|
|
498
|
+
|
|
499
|
+
### Local (Sentence Transformers) + ChromaDB (Streaming, FREE)
|
|
500
|
+
|
|
501
|
+
```python
|
|
502
|
+
from krira_augment import Pipeline, PipelineConfig
|
|
503
|
+
from sentence_transformers import SentenceTransformer
|
|
504
|
+
import chromadb
|
|
505
|
+
|
|
506
|
+
# Initialize (no API keys needed)
|
|
507
|
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
508
|
+
client = chromadb.Client()
|
|
509
|
+
collection = client.create_collection("my_chunks")
|
|
510
|
+
|
|
511
|
+
# Configure and stream
|
|
512
|
+
config = PipelineConfig(chunk_size=512, chunk_overlap=50)
|
|
513
|
+
pipeline = Pipeline(config=config)
|
|
514
|
+
|
|
515
|
+
chunk_count = 0
|
|
516
|
+
for chunk in pipeline.process_stream("data.csv"):
|
|
517
|
+
chunk_count += 1
|
|
518
|
+
|
|
519
|
+
# Embed locally (free, runs on your machine)
|
|
520
|
+
embedding = model.encode(chunk["text"])
|
|
521
|
+
|
|
522
|
+
# Store locally
|
|
523
|
+
collection.add(
|
|
524
|
+
ids=[f"chunk_{chunk_count}"],
|
|
525
|
+
embeddings=[embedding.tolist()],
|
|
526
|
+
metadatas=[chunk["metadata"]],
|
|
527
|
+
documents=[chunk["text"]]
|
|
528
|
+
)
|
|
529
|
+
|
|
530
|
+
if chunk_count % 100 == 0:
|
|
531
|
+
print(f"Processed {chunk_count} chunks...")
|
|
532
|
+
|
|
533
|
+
print(f"Done! {chunk_count} chunks embedded. All local, no API costs.")
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
### Hugging Face + FAISS (Streaming, FREE)
|
|
537
|
+
|
|
538
|
+
```python
|
|
539
|
+
from krira_augment import Pipeline, PipelineConfig
|
|
540
|
+
from transformers import AutoTokenizer, AutoModel
|
|
541
|
+
import torch
|
|
542
|
+
import faiss
|
|
543
|
+
import numpy as np
|
|
544
|
+
|
|
545
|
+
# Initialize (no API keys needed)
|
|
546
|
+
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
|
547
|
+
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
|
548
|
+
index = faiss.IndexFlatL2(384)
|
|
549
|
+
|
|
550
|
+
# Configure and stream
|
|
551
|
+
config = PipelineConfig(chunk_size=512, chunk_overlap=50)
|
|
552
|
+
pipeline = Pipeline(config=config)
|
|
553
|
+
|
|
554
|
+
chunk_count = 0
|
|
555
|
+
embeddings_batch = []
|
|
556
|
+
BATCH_SIZE = 100 # Process in batches for efficiency
|
|
557
|
+
|
|
558
|
+
for chunk in pipeline.process_stream("data.csv"):
|
|
559
|
+
chunk_count += 1
|
|
560
|
+
|
|
561
|
+
# Embed locally
|
|
562
|
+
inputs = tokenizer(chunk["text"], return_tensors="pt", truncation=True, max_length=512)
|
|
563
|
+
with torch.no_grad():
|
|
564
|
+
outputs = model(**inputs)
|
|
565
|
+
embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
|
566
|
+
|
|
567
|
+
embeddings_batch.append(embedding)
|
|
568
|
+
|
|
569
|
+
# Add to FAISS in batches
|
|
570
|
+
if len(embeddings_batch) >= BATCH_SIZE:
|
|
571
|
+
embeddings_array = np.array(embeddings_batch).astype('float32')
|
|
572
|
+
index.add(embeddings_array)
|
|
573
|
+
embeddings_batch = []
|
|
574
|
+
print(f"Processed {chunk_count} chunks...")
|
|
575
|
+
|
|
576
|
+
# Add remaining embeddings
|
|
577
|
+
if embeddings_batch:
|
|
578
|
+
embeddings_array = np.array(embeddings_batch).astype('float32')
|
|
579
|
+
index.add(embeddings_array)
|
|
580
|
+
|
|
581
|
+
# Save index
|
|
582
|
+
faiss.write_index(index, "my_vectors.index")
|
|
583
|
+
print(f"Done! {chunk_count} chunks embedded and saved to my_vectors.index")
|
|
584
|
+
```
|
|
585
|
+
|
|
586
|
+
---
|
|
587
|
+
|
|
588
|
+
## Streaming Mode Advantages
|
|
589
|
+
|
|
590
|
+
| Feature | File-Based | Streaming |
|
|
591
|
+
|---------|------------|-----------|
|
|
592
|
+
| **Disk I/O** | Creates chunks.jsonl | None |
|
|
593
|
+
| **Memory Usage** | O(1) constant | O(1) constant |
|
|
594
|
+
| **Speed** | Chunking, then embedding (sequential) | Chunking overlapped with embedding (faster) |
|
|
595
|
+
| **Use Case** | Large files, batch processing | Real-time, no storage |
|
|
596
|
+
| **Flexibility** | Can re-process chunks | Single pass only |
|
|
597
|
+
|
|
598
|
+
---
|
|
599
|
+
|
|
600
|
+
## When to Use Streaming vs File-Based
|
|
601
|
+
|
|
602
|
+
**Use Streaming When:**
|
|
603
|
+
- You want maximum speed (no disk writes)
|
|
604
|
+
- You don't need to save chunks for later
|
|
605
|
+
- You're building real-time pipelines
|
|
606
|
+
- You have limited disk space
|
|
607
|
+
|
|
608
|
+
**Use File-Based When:**
|
|
609
|
+
- You want to inspect/debug chunks
|
|
610
|
+
- You need to re-process with different embeddings
|
|
611
|
+
- You want to share chunks with your team
|
|
612
|
+
- You're experimenting with different models
|
|
613
|
+
|
|
614
|
+
---
|
|
615
|
+
|
|
616
|
+
## Error Handling (Production Ready)
|
|
617
|
+
|
|
618
|
+
```python
|
|
619
|
+
from krira_augment import Pipeline, PipelineConfig
|
|
620
|
+
from openai import OpenAI
|
|
621
|
+
from pinecone import Pinecone
|
|
622
|
+
import time
|
|
623
|
+
|
|
624
|
+
client = OpenAI(api_key="sk-...")
|
|
625
|
+
pc = Pinecone(api_key="pcone-...")
|
|
626
|
+
index = pc.Index("my-rag")
|
|
627
|
+
|
|
628
|
+
config = PipelineConfig(chunk_size=512, chunk_overlap=50)
|
|
629
|
+
pipeline = Pipeline(config=config)
|
|
630
|
+
|
|
631
|
+
chunk_count = 0
|
|
632
|
+
error_count = 0
|
|
633
|
+
|
|
634
|
+
for chunk in pipeline.process_stream("data.csv"):
|
|
635
|
+
chunk_count += 1
|
|
636
|
+
|
|
637
|
+
try:
|
|
638
|
+
# Embed
|
|
639
|
+
response = client.embeddings.create(input=chunk["text"], model="text-embedding-3-small")
|
|
640
|
+
embedding = response.data[0].embedding
|
|
641
|
+
|
|
642
|
+
# Store
|
|
643
|
+
index.upsert(vectors=[(f"chunk_{chunk_count}", embedding, chunk["metadata"])])
|
|
644
|
+
|
|
645
|
+
except Exception as e:
|
|
646
|
+
error_count += 1
|
|
647
|
+
print(f"Error on chunk {chunk_count}: {e}")
|
|
648
|
+
|
|
649
|
+
# Retry logic
|
|
650
|
+
if "rate_limit" in str(e).lower():
|
|
651
|
+
print("Rate limited, waiting 60 seconds...")
|
|
652
|
+
time.sleep(60)
|
|
653
|
+
# Retry (add your retry logic here)
|
|
654
|
+
|
|
655
|
+
if chunk_count % 100 == 0:
|
|
656
|
+
print(f"Processed {chunk_count} chunks, {error_count} errors")
|
|
657
|
+
|
|
658
|
+
print(f"Done! {chunk_count} chunks processed, {error_count} errors")
|
|
659
|
+
```
|
|
660
|
+
|
|
661
|
+
---
|
|
662
|
+
|
|
663
|
+
## Supported Formats
|
|
664
|
+
|
|
665
|
+
| Format | Extension | Method |
|
|
666
|
+
|--------|-----------|--------|
|
|
667
|
+
| **CSV** | `.csv` | Direct processing |
|
|
668
|
+
| **Text** | `.txt` | Direct processing |
|
|
669
|
+
| **JSONL** | `.jsonl` | Direct processing |
|
|
670
|
+
| **JSON** | `.json` | Auto-flattening |
|
|
671
|
+
| **PDF** | `.pdf` | pdfplumber extraction |
|
|
672
|
+
| **Word** | `.docx` | python-docx extraction |
|
|
673
|
+
| **Excel** | `.xlsx` | openpyxl extraction |
|
|
674
|
+
| **XML** | `.xml` | ElementTree parsing |
|
|
675
|
+
| **URLs** | `http://` | BeautifulSoup scraping |
|
|
676
|
+
|
|
677
|
+
---
|
|
678
|
+
|
|
679
|
+
## Provider Comparison
|
|
680
|
+
|
|
681
|
+
| Embedding | Vector Store | Cost | API Keys | Streaming Support |
|
|
682
|
+
|-----------|--------------|------|----------|-------------------|
|
|
683
|
+
| OpenAI | Pinecone | Paid | 2 | ✅ Yes |
|
|
684
|
+
| OpenAI | Qdrant | Paid | 2 | ✅ Yes |
|
|
685
|
+
| OpenAI | Weaviate | Paid | 2 | ✅ Yes |
|
|
686
|
+
| Cohere | Pinecone | Paid | 2 | ✅ Yes |
|
|
687
|
+
| Cohere | Qdrant | Paid | 2 | ✅ Yes |
|
|
688
|
+
| SentenceTransformers | ChromaDB | **FREE** | 0 | ✅ Yes |
|
|
689
|
+
| Hugging Face | FAISS | **FREE** | 0 | ✅ Yes |
|
|
690
|
+
|
|
691
|
+
---
|
|
692
|
+
|
|
693
|
+
## API Keys Setup
|
|
694
|
+
|
|
695
|
+
Get your keys from:
|
|
696
|
+
- **OpenAI:** https://platform.openai.com/api-keys
|
|
697
|
+
- **Cohere:** https://dashboard.cohere.com/api-keys
|
|
698
|
+
- **Pinecone:** https://app.pinecone.io/
|
|
699
|
+
- **Qdrant:** https://cloud.qdrant.io/
|
|
700
|
+
- **Weaviate:** https://console.weaviate.cloud/
|
|
701
|
+
|
|
702
|
+
---
|
|
703
|
+
|
|
704
|
+
## Development
|
|
705
|
+
|
|
706
|
+
1. **Clone the repo**
|
|
707
|
+
2. **Install Maturin**
|
|
708
|
+
```bash
|
|
709
|
+
pip install maturin
|
|
710
|
+
```
|
|
711
|
+
3. **Build and Install locally**
|
|
712
|
+
```bash
|
|
713
|
+
python -m build
|
|
714
|
+
pip install dist/*.whl --force-reinstall
|
|
715
|
+
```
|
|
716
|
+
|
|
717
|
+
---
|
|
718
|
+
|
|
719
|
+
## License
|
|
720
|
+
|
|
721
|
+
MIT License. (c) 2025 Krira Labs.
|
|
722
|
+
|