krira-augment 2.1.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- krira_augment/__init__.py +515 -0
- krira_augment/_python/__init__.py +14 -0
- krira_augment/_python/cleaning.py +394 -0
- krira_augment/_python/pipeline.py +738 -0
- krira_augment/_python/transformation.py +551 -0
- krira_augment/_rust.cp313-win_amd64.pyd +0 -0
- krira_augment-2.1.3.dist-info/METADATA +722 -0
- krira_augment-2.1.3.dist-info/RECORD +10 -0
- krira_augment-2.1.3.dist-info/WHEEL +4 -0
- krira_augment-2.1.3.dist-info/licenses/LICENSE +21 -0
krira_augment/__init__.py
@@ -0,0 +1,515 @@
"""
Krira Augment - High Performance RAG Framework

A production-grade Python library for document chunking in RAG pipelines,
backed by a highly optimized Rust core for maximum performance.
"""
import json
import os
from dataclasses import dataclass, asdict
from enum import Enum, auto
from typing import Optional, List, Dict, Any, Iterator, Generator
import tempfile
import shutil
from pathlib import Path

# Optional dependencies support
def _check_import(module_name: str, feature_name: str):
    import importlib
    try:
        return importlib.import_module(module_name)
    except ImportError:
        raise ImportError(f"Missing optional dependency '{module_name}' for {feature_name}. Install it with `pip install krira-augment[{feature_name}]` or `pip install {module_name}`.")
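
# ---------------------------------------------------------------------------
# Editor's note (illustration, not part of the packaged file): with this
# helper, a missing extra surfaces as, e.g. for the "pdf" feature:
#
#     ImportError: Missing optional dependency 'pdfplumber' for pdf.
#     Install it with `pip install krira-augment[pdf]` or `pip install pdfplumber`.
# ---------------------------------------------------------------------------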

# Import Rust functions
try:
    from ._rust import process_file_rust, process_stream as _rust_process_stream
except ImportError:
    try:
        from krira_augment._rust import process_file_rust, process_stream as _rust_process_stream
    except ImportError:
        def process_file_rust(*args, **kwargs):
            raise ImportError(
                "Rust extension not found. Please ensure the package is installed correctly "
                "or build in development mode with `maturin develop --release`."
            )
        def _rust_process_stream(*args, **kwargs):
            raise ImportError(
                "Rust extension not found. Please ensure the package is installed correctly "
                "or build in development mode with `maturin develop --release`."
            )
        print("WARNING: Rust extension not found. Chunking will fail.")

# =============================================================================
# Professional API (Matching README)
# =============================================================================

class SplitStrategy(Enum):
    """Chunking strategy enum."""
    FIXED = "fixed"
    SMART = "smart"  # Hybrid
    MARKDOWN = "markdown"

@dataclass
class PipelineConfig:
    """
    Configuration for the Krira Pipeline.
    """
    # Chunking
    chunk_size: int = 1000
    chunk_overlap: int = 100
    strategy: SplitStrategy = SplitStrategy.SMART

    # Cleaning
    clean_html: bool = True
    clean_unicode: bool = True
    min_chunk_len: int = 20

    # Performance
    threads: int = 8
    batch_size: int = 1000

    def to_json(self) -> str:
        """Serialize configuration for Rust backend."""
        # Map nice Python names to internal Rust names
        return json.dumps({
            "max_chars": self.chunk_size,
            # Current V2 Rust core mainly uses max_chars.
            # Future versions will use the rest.
        })
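
# ---------------------------------------------------------------------------
# Editor's note (illustration, not part of the packaged file): the mapping in
# to_json() above means only chunk_size crosses into the Rust core today.
# A quick check of the serialized payload:
#
#     >>> PipelineConfig(chunk_size=512, chunk_overlap=50).to_json()
#     '{"max_chars": 512}'
#
# The remaining fields (overlap, cleaning flags, threads, batch_size) stay on
# the Python side until future core versions consume them, per the comments
# above.
# ---------------------------------------------------------------------------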

@dataclass
class PipelineStats:
    """Statistics returned after processing a file."""
    chunks_created: int
    execution_time: float  # Time in seconds
    mb_per_second: float
    output_file: str
    preview_chunks: List[str]  # Top 3 chunks as preview

    def __str__(self) -> str:
        """Pretty print the stats."""
        lines = [
            f"\n{'='*60}",
            f"✅ KRIRA AUGMENT - Processing Complete",
            f"{'='*60}",
            f"📊 Chunks Created: {self.chunks_created:,}",
            f"⏱️ Execution Time: {self.execution_time:.2f} seconds",
            f"🚀 Throughput: {self.mb_per_second:.2f} MB/s",
            f"📄 Output File: {self.output_file}",
            f"{'='*60}",
        ]

        if self.preview_chunks:
            lines.append(f"\n📝 Preview (Top 3 Chunks):")
            lines.append(f"{'-'*60}")
            for i, chunk in enumerate(self.preview_chunks, 1):
                # Truncate long chunks for display
                display_text = chunk[:150] + "..." if len(chunk) > 150 else chunk
                lines.append(f"[{i}] {display_text}")
            lines.append(f"{'-'*60}")

        return "\n".join(lines)

class Pipeline:
    """
    Main entry point for Krira Augment.
    """

    def __init__(self, config: Optional[PipelineConfig] = None):
        self.config = config or PipelineConfig()

    def _convert_to_jsonl(self, input_path: str) -> str:
        """
        Convert various input formats to a temporary JSONL file that the Rust core can process.
        Returns the path to the temporary file.
        """
        base_ext = os.path.splitext(input_path)[1].lower()

        # 0. URL Handling
        if input_path.startswith("http://") or input_path.startswith("https://"):
            return self._process_url(input_path)

        # 1. TXT / JSONL / CSV pass through directly: the Rust core treats each
        #    line as text, which works for TXT and JSONL; CSV rows are seen as
        #    raw "col1,col2,col3" strings (convert here later if proper row
        #    handling is ever needed).
        if base_ext in ['.txt', '.jsonl', '.csv']:
            return input_path

        # 2. Complex Formats -> Start conversion
        temp_fd, temp_path = tempfile.mkstemp(suffix=".jsonl", prefix="krira_convert_")
        os.close(temp_fd)

        try:
            if base_ext == '.json':
                self._convert_json(input_path, temp_path)
            elif base_ext == '.pdf':
                self._convert_pdf(input_path, temp_path)
            elif base_ext == '.docx':
                self._convert_docx(input_path, temp_path)
            elif base_ext == '.xlsx':
                self._convert_xlsx(input_path, temp_path)
            elif base_ext == '.xml':
                self._convert_xml(input_path, temp_path)
            else:
                # Fallback: Treat as text (and discard the unused temp file)
                print(f"WARNING: Unknown extension {base_ext}, treating as text.")
                os.unlink(temp_path)
                return input_path

        except Exception as e:
            if os.path.exists(temp_path):
                os.unlink(temp_path)
            raise RuntimeError(f"Failed to convert {input_path}: {e}")

        return temp_path

    def _write_temp_jsonl(self, temp_path: str, generator):
        with open(temp_path, 'w', encoding='utf-8') as f:
            for item in generator:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")

    def _convert_json(self, input_path, temp_path):
        """Flatten JSON list or dict to JSONL."""
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        items = []
        if isinstance(data, list):
            items = data
        elif isinstance(data, dict):
            items = [data]
        else:
            raise ValueError("JSON must be a list or dict")

        # Ensure every record carries a text string
        final_items = []
        for item in items:
            if isinstance(item, str):
                final_items.append({"text": item})
            else:
                # Dump the object to a string if it's not already a string
                final_items.append({"text": json.dumps(item, ensure_ascii=False)})

        self._write_temp_jsonl(temp_path, final_items)

    def _convert_pdf(self, input_path, temp_path):
        pdfplumber = _check_import("pdfplumber", "pdf")

        items = []
        with pdfplumber.open(input_path) as pdf:
            for i, page in enumerate(pdf.pages):
                text = page.extract_text()
                if text:
                    items.append({
                        "text": text,
                        "metadata": {"page": i + 1, "source": input_path}
                    })
        self._write_temp_jsonl(temp_path, items)

    def _convert_docx(self, input_path, temp_path):
        docx = _check_import("docx", "docx")

        doc = docx.Document(input_path)
        items = []
        for para in doc.paragraphs:
            if para.text.strip():
                items.append({
                    "text": para.text,
                    "metadata": {"source": input_path}
                })
        self._write_temp_jsonl(temp_path, items)

    def _convert_xlsx(self, input_path, temp_path):
        openpyxl = _check_import("openpyxl", "xlsx")

        wb = openpyxl.load_workbook(input_path, read_only=True, data_only=True)
        items = []
        for sheet in wb:
            rows = sheet.values
            headers = next(rows, None)
            if not headers:
                continue

            headers = [str(h) for h in headers]
            for row in rows:
                # Convert the row to a "header: value" text representation
                row_dict = {h: str(v) if v is not None else "" for h, v in zip(headers, row)}
                text_rep = " | ".join(f"{k}: {v}" for k, v in row_dict.items() if v)
                if text_rep:
                    items.append({
                        "text": text_rep,
                        "metadata": {"sheet": sheet.title, "source": input_path}
                    })
        self._write_temp_jsonl(temp_path, items)

    def _convert_xml(self, input_path, temp_path):
        import xml.etree.ElementTree as ET
        tree = ET.parse(input_path)
        root = tree.getroot()

        # Naive XML handling: convert each child of the root to a string
        items = []
        for child in root:
            # Get all nested text recursively
            text = "".join(child.itertext()).strip()
            if text:
                items.append({"text": text, "metadata": {"tag": child.tag}})

        self._write_temp_jsonl(temp_path, items)

    def _process_url(self, url):
        requests = _check_import("requests", "url")
        bs4 = _check_import("bs4", "url")

        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = bs4.BeautifulSoup(response.text, "html.parser")

        # Kill all script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        text = soup.get_text(separator="\n")

        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # Break multi-headlines (separated by double spaces) into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)

        temp_fd, temp_path = tempfile.mkstemp(suffix=".jsonl", prefix="krira_url_")
        os.close(temp_fd)

        self._write_temp_jsonl(temp_path, [{"text": text, "metadata": {"url": url}}])
        return temp_path
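
    # -----------------------------------------------------------------------
    # Editor's note (illustration, not part of the packaged file): every
    # converter above writes one JSON object per line into the temporary JSONL
    # file, always with a "text" field and, where available, a "metadata"
    # dict. A single PDF page, for example, is written roughly as (values are
    # placeholders):
    #
    #     {"text": "Extracted page text ...", "metadata": {"page": 1, "source": "input.pdf"}}
    #
    # The Rust core then consumes these records line by line, as the comments
    # in _convert_to_jsonl describe.
    # -----------------------------------------------------------------------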

    def process(self, input_path: str, output_path: Optional[str] = None) -> PipelineStats:
        """
        Process a file using the Rust core engine.
        Automatically converts PDF, DOCX, XLSX, XML, JSON, and URLs to a format Rust can handle.
        """
        import time

        # Check input existence only if it's not a URL
        is_url = input_path.startswith("http://") or input_path.startswith("https://")
        if not is_url:
            if not os.path.exists(input_path):
                raise FileNotFoundError(f"Input file not found: {input_path}")

        # Determine output path if not provided
        if output_path is None:
            if is_url:
                # Use a safe filename based on a short URL hash
                import hashlib
                url_hash = hashlib.md5(input_path.encode()).hexdigest()[:8]
                output_path = f"url_output_{url_hash}.jsonl"
            else:
                base, _ = os.path.splitext(input_path)
                output_path = f"{base}_processed.jsonl"

        start_time = time.time()

        # Pre-process
        processed_input_path = self._convert_to_jsonl(input_path)
        is_temp = processed_input_path != input_path

        try:
            # Invoke Rust Core (which expects text-based files)
            process_file_rust(processed_input_path, output_path, self.config.to_json())
        finally:
            # Cleanup temp file if created
            if is_temp and os.path.exists(processed_input_path):
                try:
                    os.unlink(processed_input_path)
                except OSError:
                    pass

        duration = time.time() - start_time

        # Count chunks and get preview from output file
        chunks_created = 0
        preview_chunks = []

        try:
            if os.path.exists(output_path):
                with open(output_path, 'r', encoding='utf-8') as f:
                    for i, line in enumerate(f):
                        chunks_created += 1
                        # Collect first 3 chunks for preview
                        if i < 3:
                            try:
                                chunk_data = json.loads(line.strip())
                                text = chunk_data.get('text', str(chunk_data))
                                preview_chunks.append(text)
                            except json.JSONDecodeError:
                                preview_chunks.append(line.strip())
        except Exception:
            pass  # If reading fails, keep defaults

        # Calculate throughput based on input file size
        try:
            if not is_url and os.path.exists(input_path):
                file_size_mb = os.path.getsize(input_path) / (1024 * 1024)
            else:
                file_size_mb = 0
        except OSError:
            file_size_mb = 0

        throughput = file_size_mb / duration if duration > 0 else 0

        return PipelineStats(
            chunks_created=chunks_created,
            execution_time=duration,
            mb_per_second=throughput,
            output_file=output_path,
            preview_chunks=preview_chunks
        )
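
    # -----------------------------------------------------------------------
    # Editor's note (illustration, not part of the packaged file): typical
    # file-based usage of process(); "docs/report.pdf" is a placeholder path.
    #
    #     >>> pipeline = Pipeline(PipelineConfig(chunk_size=800, chunk_overlap=80))
    #     >>> stats = pipeline.process("docs/report.pdf")
    #     >>> print(stats)            # pretty summary via PipelineStats.__str__
    #     >>> stats.output_file       # 'docs/report_processed.jsonl' by default
    # -----------------------------------------------------------------------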

    def process_stream(self, input_path: str) -> Iterator[Dict[str, Any]]:
        """
        Stream chunks from a file without creating intermediate files.

        This method provides a memory-efficient way to process large files by yielding
        chunks one at a time. Each chunk can be embedded and stored immediately,
        eliminating the need for intermediate file storage.

        Args:
            input_path (str): Path to the input file. Supports CSV, TXT, JSON, JSONL,
                PDF, DOCX, XLSX, XML, and URLs.

        Yields:
            dict: A dictionary containing:
                - text (str): The chunk text content
                - metadata (dict): Metadata including:
                    - source (str): Original file path
                    - chunk_index (int): Sequential chunk number
                    - char_count (int): Number of characters in chunk

        Memory:
            O(1) - Constant memory usage regardless of file size.
            Maximum ~50MB for internal buffering.

        Performance:
            - Processes a 1GB file in ~12 seconds
            - Utilizes multi-core parallel processing
            - No disk I/O for intermediate files

        Example:
            Basic usage:
            >>> config = PipelineConfig(chunk_size=512, chunk_overlap=50)
            >>> pipeline = Pipeline(config=config)
            >>> for chunk in pipeline.process_stream("data.csv"):
            ...     print(chunk["text"][:50])

            With OpenAI embedding:
            >>> import openai
            >>> for chunk in pipeline.process_stream("data.csv"):
            ...     embedding = openai.Embedding.create(input=chunk["text"])
            ...     # Store embedding immediately

            With progress tracking:
            >>> chunk_count = 0
            >>> for chunk in pipeline.process_stream("data.csv"):
            ...     chunk_count += 1
            ...     if chunk_count % 100 == 0:
            ...         print(f"Processed {chunk_count} chunks")

        Raises:
            FileNotFoundError: If input_path does not exist
            ImportError: If required optional dependencies are not installed

        Note:
            - Chunks are processed sequentially for consistent ordering
            - The iterator cannot be restarted; create a new one if needed
            - For very large files (>50GB), consider using file-based `process()` mode
        """
        # Check input existence
        is_url = input_path.startswith("http://") or input_path.startswith("https://")
        if not is_url:
            if not os.path.exists(input_path):
                raise FileNotFoundError(f"Input file not found: {input_path}")

        # Pre-process the file if needed (PDF, DOCX, etc.)
        processed_input_path = self._convert_to_jsonl(input_path)
        is_temp = processed_input_path != input_path

        try:
            # Stream from the Rust core
            iterator = _rust_process_stream(
                processed_input_path,
                self.config.chunk_size,
                self.config.chunk_overlap
            )

            # Wrap to ensure cleanup
            for chunk in iterator:
                yield chunk

        finally:
            # Cleanup temp file if created
            if is_temp and os.path.exists(processed_input_path):
                try:
                    os.unlink(processed_input_path)
                except OSError:
                    pass

    def preview(self, n: int = 3) -> str:
        """
        Preview the first n chunks (deprecated; use process() and check preview_chunks).
        """
        return "Use pipeline.process(...).preview_chunks for preview"


# =============================================================================
# Streaming Utilities
# =============================================================================

class StreamingChunkIterator:
    """
    A wrapper for streaming chunk iteration with additional utilities.
    """
    def __init__(self, pipeline: Pipeline, input_path: str):
        self.pipeline = pipeline
        self.input_path = input_path
        self._iterator = None
        self._chunk_count = 0

    def __iter__(self):
        self._iterator = self.pipeline.process_stream(self.input_path)
        return self

    def __next__(self) -> Dict[str, Any]:
        if self._iterator is None:
            self._iterator = self.pipeline.process_stream(self.input_path)
        chunk = next(self._iterator)
        self._chunk_count += 1
        return chunk

    @property
    def chunks_processed(self) -> int:
        """Return the number of chunks processed so far."""
        return self._chunk_count
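
# ---------------------------------------------------------------------------
# Editor's note (illustration, not part of the packaged file): the wrapper
# above simply counts chunks as they pass through, e.g.:
#
#     >>> stream = StreamingChunkIterator(Pipeline(), "data.csv")
#     >>> for chunk in stream:
#     ...     pass
#     >>> stream.chunks_processed    # total number of chunks yielded so far
#
# "data.csv" is a placeholder input path.
# ---------------------------------------------------------------------------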


# =============================================================================
# Legacy & Exports
# =============================================================================

# For backward compatibility if needed
KriraLoader = Pipeline
TextSplitter = PipelineConfig

__all__ = [
    "Pipeline",
    "PipelineConfig",
    "SplitStrategy",
    "PipelineStats",
    "StreamingChunkIterator"
]
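
The streaming API above can feed an embedding or indexing step directly. A minimal end-to-end sketch (editorial, not part of the wheel; "data.csv" is a placeholder path, and only fields documented in the process_stream docstring are used):

    from krira_augment import Pipeline, PipelineConfig

    pipeline = Pipeline(PipelineConfig(chunk_size=512, chunk_overlap=50))
    for chunk in pipeline.process_stream("data.csv"):
        # Each yielded dict carries the chunk text plus source / chunk_index / char_count metadata.
        print(chunk["metadata"]["chunk_index"], len(chunk["text"]))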
krira_augment/_python/__init__.py
@@ -0,0 +1,14 @@
"""Pure Python fallback implementations."""

from .cleaning import CleaningConfig, DataCleaner
from .transformation import TransformConfig, DataTransformer
from .pipeline import PipelineConfig, KriraPipeline

__all__ = [
    "CleaningConfig",
    "DataCleaner",
    "TransformConfig",
    "DataTransformer",
    "PipelineConfig",
    "KriraPipeline",
]
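
For completeness, the fallback package simply re-exports the pure-Python classes listed above; a minimal import sketch (their behavior is defined in cleaning.py, transformation.py, and pipeline.py, which are not shown in this hunk):

    from krira_augment._python import CleaningConfig, DataCleaner, KriraPipeline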