krira-augment 2.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. krira_augment-2.0.4/Cargo.lock +400 -0
  2. krira_augment-2.0.4/Cargo.toml +42 -0
  3. krira_augment-2.0.4/PKG-INFO +209 -0
  4. krira_augment-2.0.4/README.md +163 -0
  5. krira_augment-2.0.4/build.bat +39 -0
  6. krira_augment-2.0.4/demo_usage.py +47 -0
  7. krira_augment-2.0.4/dist/krira_augment-2.0.2-cp313-cp313-win_amd64.whl +0 -0
  8. krira_augment-2.0.4/dist/krira_augment-2.0.2.tar.gz +0 -0
  9. krira_augment-2.0.4/dist/krira_augment-2.0.3-cp313-cp313-win_amd64.whl +0 -0
  10. krira_augment-2.0.4/dist/krira_augment-2.0.3.tar.gz +0 -0
  11. krira_augment-2.0.4/dist/krira_augment-2.0.4-cp313-cp313-win_amd64.whl +0 -0
  12. krira_augment-2.0.4/dist/krira_augment-2.0.4.tar.gz +0 -0
  13. krira_augment-2.0.4/examples/csv_advanced_pipeline.py +289 -0
  14. krira_augment-2.0.4/pyproject.toml +95 -0
  15. krira_augment-2.0.4/python/krira_augment/__init__.py +322 -0
  16. krira_augment-2.0.4/python/krira_augment/_python/__init__.py +14 -0
  17. krira_augment-2.0.4/python/krira_augment/_python/cleaning.py +394 -0
  18. krira_augment-2.0.4/python/krira_augment/_python/pipeline.py +738 -0
  19. krira_augment-2.0.4/python/krira_augment/_python/transformation.py +551 -0
  20. krira_augment-2.0.4/src/chunker.rs +81 -0
  21. krira_augment-2.0.4/src/cleaning.rs +38 -0
  22. krira_augment-2.0.4/src/config.rs +286 -0
  23. krira_augment-2.0.4/src/errors.rs +29 -0
  24. krira_augment-2.0.4/src/lib.rs +158 -0
  25. krira_augment-2.0.4/src/pipeline.rs +574 -0
  26. krira_augment-2.0.4/src/transformation.rs +571 -0
  27. krira_augment-2.0.4/test_data/test.csv +11 -0
  28. krira_augment-2.0.4/test_data/test.json +42 -0
  29. krira_augment-2.0.4/test_data/test.jsonl +10 -0
  30. krira_augment-2.0.4/test_data/test.txt +10 -0
  31. krira_augment-2.0.4/test_data_comprehensive/test.csv +11 -0
  32. krira_augment-2.0.4/test_data_comprehensive/test.json +42 -0
  33. krira_augment-2.0.4/test_data_comprehensive/test.jsonl +10 -0
  34. krira_augment-2.0.4/test_data_comprehensive/test.txt +20 -0
  35. krira_augment-2.0.4/test_data_comprehensive/test.xml +12 -0
  36. krira_augment-2.0.4/test_data_comprehensive/test_processed.jsonl +20 -0
  37. krira_augment-2.0.4/test_formats_basic.py +69 -0
  38. krira_augment-2.0.4/test_output/output_CSV.jsonl +11 -0
  39. krira_augment-2.0.4/test_output/output_JSON.jsonl +10 -0
  40. krira_augment-2.0.4/test_output/output_JSONL.jsonl +10 -0
  41. krira_augment-2.0.4/test_output/output_TXT.jsonl +10 -0
  42. krira_augment-2.0.4/test_output_comprehensive/output_jsonl.jsonl +10 -0
  43. krira_augment-2.0.4/tests/test_comprehensive.py +138 -0
@@ -0,0 +1,400 @@
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 4
4
+
5
+ [[package]]
6
+ name = "aho-corasick"
7
+ version = "1.1.4"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
10
+ dependencies = [
11
+ "memchr",
12
+ ]
13
+
14
+ [[package]]
15
+ name = "autocfg"
16
+ version = "1.5.0"
17
+ source = "registry+https://github.com/rust-lang/crates.io-index"
18
+ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
19
+
20
+ [[package]]
21
+ name = "cfg-if"
22
+ version = "1.0.4"
23
+ source = "registry+https://github.com/rust-lang/crates.io-index"
24
+ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
25
+
26
+ [[package]]
27
+ name = "crossbeam-deque"
28
+ version = "0.8.6"
29
+ source = "registry+https://github.com/rust-lang/crates.io-index"
30
+ checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
31
+ dependencies = [
32
+ "crossbeam-epoch",
33
+ "crossbeam-utils",
34
+ ]
35
+
36
+ [[package]]
37
+ name = "crossbeam-epoch"
38
+ version = "0.9.18"
39
+ source = "registry+https://github.com/rust-lang/crates.io-index"
40
+ checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
41
+ dependencies = [
42
+ "crossbeam-utils",
43
+ ]
44
+
45
+ [[package]]
46
+ name = "crossbeam-utils"
47
+ version = "0.8.21"
48
+ source = "registry+https://github.com/rust-lang/crates.io-index"
49
+ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
50
+
51
+ [[package]]
52
+ name = "csv"
53
+ version = "1.4.0"
54
+ source = "registry+https://github.com/rust-lang/crates.io-index"
55
+ checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938"
56
+ dependencies = [
57
+ "csv-core",
58
+ "itoa",
59
+ "ryu",
60
+ "serde_core",
61
+ ]
62
+
63
+ [[package]]
64
+ name = "csv-core"
65
+ version = "0.1.13"
66
+ source = "registry+https://github.com/rust-lang/crates.io-index"
67
+ checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782"
68
+ dependencies = [
69
+ "memchr",
70
+ ]
71
+
72
+ [[package]]
73
+ name = "either"
74
+ version = "1.15.0"
75
+ source = "registry+https://github.com/rust-lang/crates.io-index"
76
+ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
77
+
78
+ [[package]]
79
+ name = "heck"
80
+ version = "0.5.0"
81
+ source = "registry+https://github.com/rust-lang/crates.io-index"
82
+ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
83
+
84
+ [[package]]
85
+ name = "indoc"
86
+ version = "2.0.7"
87
+ source = "registry+https://github.com/rust-lang/crates.io-index"
88
+ checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
89
+ dependencies = [
90
+ "rustversion",
91
+ ]
92
+
93
+ [[package]]
94
+ name = "itoa"
95
+ version = "1.0.17"
96
+ source = "registry+https://github.com/rust-lang/crates.io-index"
97
+ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
98
+
99
+ [[package]]
100
+ name = "krira_augment"
101
+ version = "2.0.4"
102
+ dependencies = [
103
+ "csv",
104
+ "lazy_static",
105
+ "memmap2",
106
+ "pyo3",
107
+ "rayon",
108
+ "regex",
109
+ "serde",
110
+ "serde_json",
111
+ "thiserror",
112
+ ]
113
+
114
+ [[package]]
115
+ name = "lazy_static"
116
+ version = "1.5.0"
117
+ source = "registry+https://github.com/rust-lang/crates.io-index"
118
+ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
119
+
120
+ [[package]]
121
+ name = "libc"
122
+ version = "0.2.179"
123
+ source = "registry+https://github.com/rust-lang/crates.io-index"
124
+ checksum = "c5a2d376baa530d1238d133232d15e239abad80d05838b4b59354e5268af431f"
125
+
126
+ [[package]]
127
+ name = "memchr"
128
+ version = "2.7.6"
129
+ source = "registry+https://github.com/rust-lang/crates.io-index"
130
+ checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
131
+
132
+ [[package]]
133
+ name = "memmap2"
134
+ version = "0.9.9"
135
+ source = "registry+https://github.com/rust-lang/crates.io-index"
136
+ checksum = "744133e4a0e0a658e1374cf3bf8e415c4052a15a111acd372764c55b4177d490"
137
+ dependencies = [
138
+ "libc",
139
+ ]
140
+
141
+ [[package]]
142
+ name = "memoffset"
143
+ version = "0.9.1"
144
+ source = "registry+https://github.com/rust-lang/crates.io-index"
145
+ checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
146
+ dependencies = [
147
+ "autocfg",
148
+ ]
149
+
150
+ [[package]]
151
+ name = "once_cell"
152
+ version = "1.21.3"
153
+ source = "registry+https://github.com/rust-lang/crates.io-index"
154
+ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
155
+
156
+ [[package]]
157
+ name = "portable-atomic"
158
+ version = "1.13.0"
159
+ source = "registry+https://github.com/rust-lang/crates.io-index"
160
+ checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950"
161
+
162
+ [[package]]
163
+ name = "proc-macro2"
164
+ version = "1.0.104"
165
+ source = "registry+https://github.com/rust-lang/crates.io-index"
166
+ checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0"
167
+ dependencies = [
168
+ "unicode-ident",
169
+ ]
170
+
171
+ [[package]]
172
+ name = "pyo3"
173
+ version = "0.23.5"
174
+ source = "registry+https://github.com/rust-lang/crates.io-index"
175
+ checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872"
176
+ dependencies = [
177
+ "cfg-if",
178
+ "indoc",
179
+ "libc",
180
+ "memoffset",
181
+ "once_cell",
182
+ "portable-atomic",
183
+ "pyo3-build-config",
184
+ "pyo3-ffi",
185
+ "pyo3-macros",
186
+ "unindent",
187
+ ]
188
+
189
+ [[package]]
190
+ name = "pyo3-build-config"
191
+ version = "0.23.5"
192
+ source = "registry+https://github.com/rust-lang/crates.io-index"
193
+ checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb"
194
+ dependencies = [
195
+ "once_cell",
196
+ "target-lexicon",
197
+ ]
198
+
199
+ [[package]]
200
+ name = "pyo3-ffi"
201
+ version = "0.23.5"
202
+ source = "registry+https://github.com/rust-lang/crates.io-index"
203
+ checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d"
204
+ dependencies = [
205
+ "libc",
206
+ "pyo3-build-config",
207
+ ]
208
+
209
+ [[package]]
210
+ name = "pyo3-macros"
211
+ version = "0.23.5"
212
+ source = "registry+https://github.com/rust-lang/crates.io-index"
213
+ checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da"
214
+ dependencies = [
215
+ "proc-macro2",
216
+ "pyo3-macros-backend",
217
+ "quote",
218
+ "syn",
219
+ ]
220
+
221
+ [[package]]
222
+ name = "pyo3-macros-backend"
223
+ version = "0.23.5"
224
+ source = "registry+https://github.com/rust-lang/crates.io-index"
225
+ checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028"
226
+ dependencies = [
227
+ "heck",
228
+ "proc-macro2",
229
+ "pyo3-build-config",
230
+ "quote",
231
+ "syn",
232
+ ]
233
+
234
+ [[package]]
235
+ name = "quote"
236
+ version = "1.0.42"
237
+ source = "registry+https://github.com/rust-lang/crates.io-index"
238
+ checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
239
+ dependencies = [
240
+ "proc-macro2",
241
+ ]
242
+
243
+ [[package]]
244
+ name = "rayon"
245
+ version = "1.11.0"
246
+ source = "registry+https://github.com/rust-lang/crates.io-index"
247
+ checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
248
+ dependencies = [
249
+ "either",
250
+ "rayon-core",
251
+ ]
252
+
253
+ [[package]]
254
+ name = "rayon-core"
255
+ version = "1.13.0"
256
+ source = "registry+https://github.com/rust-lang/crates.io-index"
257
+ checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
258
+ dependencies = [
259
+ "crossbeam-deque",
260
+ "crossbeam-utils",
261
+ ]
262
+
263
+ [[package]]
264
+ name = "regex"
265
+ version = "1.12.2"
266
+ source = "registry+https://github.com/rust-lang/crates.io-index"
267
+ checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
268
+ dependencies = [
269
+ "aho-corasick",
270
+ "memchr",
271
+ "regex-automata",
272
+ "regex-syntax",
273
+ ]
274
+
275
+ [[package]]
276
+ name = "regex-automata"
277
+ version = "0.4.13"
278
+ source = "registry+https://github.com/rust-lang/crates.io-index"
279
+ checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
280
+ dependencies = [
281
+ "aho-corasick",
282
+ "memchr",
283
+ "regex-syntax",
284
+ ]
285
+
286
+ [[package]]
287
+ name = "regex-syntax"
288
+ version = "0.8.8"
289
+ source = "registry+https://github.com/rust-lang/crates.io-index"
290
+ checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
291
+
292
+ [[package]]
293
+ name = "rustversion"
294
+ version = "1.0.22"
295
+ source = "registry+https://github.com/rust-lang/crates.io-index"
296
+ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
297
+
298
+ [[package]]
299
+ name = "ryu"
300
+ version = "1.0.22"
301
+ source = "registry+https://github.com/rust-lang/crates.io-index"
302
+ checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984"
303
+
304
+ [[package]]
305
+ name = "serde"
306
+ version = "1.0.228"
307
+ source = "registry+https://github.com/rust-lang/crates.io-index"
308
+ checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
309
+ dependencies = [
310
+ "serde_core",
311
+ "serde_derive",
312
+ ]
313
+
314
+ [[package]]
315
+ name = "serde_core"
316
+ version = "1.0.228"
317
+ source = "registry+https://github.com/rust-lang/crates.io-index"
318
+ checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
319
+ dependencies = [
320
+ "serde_derive",
321
+ ]
322
+
323
+ [[package]]
324
+ name = "serde_derive"
325
+ version = "1.0.228"
326
+ source = "registry+https://github.com/rust-lang/crates.io-index"
327
+ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
328
+ dependencies = [
329
+ "proc-macro2",
330
+ "quote",
331
+ "syn",
332
+ ]
333
+
334
+ [[package]]
335
+ name = "serde_json"
336
+ version = "1.0.148"
337
+ source = "registry+https://github.com/rust-lang/crates.io-index"
338
+ checksum = "3084b546a1dd6289475996f182a22aba973866ea8e8b02c51d9f46b1336a22da"
339
+ dependencies = [
340
+ "itoa",
341
+ "memchr",
342
+ "serde",
343
+ "serde_core",
344
+ "zmij",
345
+ ]
346
+
347
+ [[package]]
348
+ name = "syn"
349
+ version = "2.0.112"
350
+ source = "registry+https://github.com/rust-lang/crates.io-index"
351
+ checksum = "21f182278bf2d2bcb3c88b1b08a37df029d71ce3d3ae26168e3c653b213b99d4"
352
+ dependencies = [
353
+ "proc-macro2",
354
+ "quote",
355
+ "unicode-ident",
356
+ ]
357
+
358
+ [[package]]
359
+ name = "target-lexicon"
360
+ version = "0.12.16"
361
+ source = "registry+https://github.com/rust-lang/crates.io-index"
362
+ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
363
+
364
+ [[package]]
365
+ name = "thiserror"
366
+ version = "1.0.69"
367
+ source = "registry+https://github.com/rust-lang/crates.io-index"
368
+ checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
369
+ dependencies = [
370
+ "thiserror-impl",
371
+ ]
372
+
373
+ [[package]]
374
+ name = "thiserror-impl"
375
+ version = "1.0.69"
376
+ source = "registry+https://github.com/rust-lang/crates.io-index"
377
+ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
378
+ dependencies = [
379
+ "proc-macro2",
380
+ "quote",
381
+ "syn",
382
+ ]
383
+
384
+ [[package]]
385
+ name = "unicode-ident"
386
+ version = "1.0.22"
387
+ source = "registry+https://github.com/rust-lang/crates.io-index"
388
+ checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
389
+
390
+ [[package]]
391
+ name = "unindent"
392
+ version = "0.2.4"
393
+ source = "registry+https://github.com/rust-lang/crates.io-index"
394
+ checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
395
+
396
+ [[package]]
397
+ name = "zmij"
398
+ version = "1.0.8"
399
+ source = "registry+https://github.com/rust-lang/crates.io-index"
400
+ checksum = "317f17ff091ac4515f17cc7a190d2769a8c9a96d227de5d64b500b01cda8f2cd"
@@ -0,0 +1,42 @@
1
+ [package]
2
+ name = "krira_augment"
3
+ version = "2.0.4"
4
+ edition = "2021"
5
+ authors = ["Krira Labs <contact@kriralabs.com>"]
6
+ description = "Production-grade document chunking library for RAG systems"
7
+ license = "MIT"
8
+ readme = "README.md"
9
+ repository = "https://github.com/Krira-Labs/krira-chunker"
10
+
11
+ [lib]
12
+ name = "krira_augment"
13
+ crate-type = ["cdylib"]
14
+
15
+ [dependencies]
16
+ # PyO3 for Python bindings
17
+ pyo3 = { version = "0.23.3", features = ["extension-module"] }
18
+
19
+ # Parallel processing
20
+ rayon = "1.8"
21
+
22
+ # Fast file reading (Zero-copy)
23
+ memmap2 = "0.9"
24
+
25
+ # Regex for cleaning
26
+ regex = "1.10"
27
+ lazy_static = "1.4"
28
+
29
+ # CSV and JSON
30
+ csv = "1.3"
31
+ serde = { version = "1.0", features = ["derive"] }
32
+ serde_json = "1.0"
33
+
34
+ # Error handling
35
+ thiserror = "1.0"
36
+
37
+ # Performance optimization
38
+ [profile.release]
39
+ opt-level = 3
40
+ lto = "fat"
41
+ codegen-units = 1
42
+ panic = "abort"
@@ -0,0 +1,209 @@
1
+ Metadata-Version: 2.4
2
+ Name: krira-augment
3
+ Version: 2.0.4
4
+ Classifier: Development Status :: 5 - Production/Stable
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Operating System :: OS Independent
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Programming Language :: Rust
14
+ Classifier: Topic :: Text Processing
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Dist: openpyxl>=3.0 ; extra == 'xlsx'
17
+ Requires-Dist: pdfplumber>=0.10 ; extra == 'pdf'
18
+ Requires-Dist: python-docx>=0.8 ; extra == 'docx'
19
+ Requires-Dist: polars>=0.20 ; extra == 'csv'
20
+ Requires-Dist: openpyxl>=3.0 ; extra == 'all'
21
+ Requires-Dist: pdfplumber>=0.10 ; extra == 'all'
22
+ Requires-Dist: python-docx>=0.8 ; extra == 'all'
23
+ Requires-Dist: polars>=0.20 ; extra == 'all'
24
+ Requires-Dist: pytest>=7.0 ; extra == 'dev'
25
+ Requires-Dist: pytest-cov>=4.0 ; extra == 'dev'
26
+ Requires-Dist: black>=23.0 ; extra == 'dev'
27
+ Requires-Dist: mypy>=1.0 ; extra == 'dev'
28
+ Requires-Dist: ruff>=0.1 ; extra == 'dev'
29
+ Provides-Extra: xlsx
30
+ Provides-Extra: pdf
31
+ Provides-Extra: docx
32
+ Provides-Extra: csv
33
+ Provides-Extra: all
34
+ Provides-Extra: dev
35
+ Summary: Production-grade document chunking library for RAG systems - Rust-powered Python library
36
+ Keywords: rag,chunking,nlp,document-processing,ai,rust,pyo3
37
+ Author-email: Krira Labs <contact@kriralabs.com>
38
+ License: MIT
39
+ Requires-Python: >=3.10
40
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
41
+ Project-URL: Homepage, https://github.com/Krira-Labs/krira-chunker
42
+ Project-URL: Repository, https://github.com/Krira-Labs/krira-chunker
43
+ Project-URL: Documentation, https://github.com/Krira-Labs/krira-chunker#readme
44
+ Project-URL: Issues, https://github.com/Krira-Labs/krira-chunker/issues
45
+
46
+ # Krira Augment ⚡🦀
47
+
48
+ **The High-Performance Rust Chunking Engine for RAG Pipelines**
49
+
50
+ [![PyPI version](https://badge.fury.io/py/krira-augment.svg)](https://badge.fury.io/py/krira-augment)
51
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
52
+ [![Rust](https://img.shields.io/badge/Built_with-Rust-orange)](https://www.rust-lang.org/)
53
+
54
+ **Krira Augment** is a production-grade Python library backed by a highly optimized Rust core. It is designed to replace slow, memory-intensive preprocessing steps in large-scale Retrieval Augmented Generation (RAG) systems.
55
+
56
+ It processes gigabytes of raw unstructured data (CSV, JSONL, TXT) into high-quality, clean chunks in seconds—utilizing **zero-copy memory mapping** and **parallel CPU execution**.
57
+
58
+ ---
59
+
60
+ ## 🚀 Performance Benchmarks
61
+
62
+ Benchmarks run on a standard 8-core machine (M2 Air equivalent).
63
+
64
+ | Dataset Size | Legacy (LangChain/Pandas) | Krira V2 (Rust Core) | Speedup |
65
+ | :--- | :--- | :--- | :--- |
66
+ | **100 MB** | ~45 sec | **~0.8 sec** | **56x** 🚀 |
67
+ | **1 GB** | ~8.0 min | **~12.0 sec** | **40x** 🚀 |
68
+ | **10 GB** | *Timeout / OOM* | **~2.1 min** | **Stable** ✅ |
69
+
70
+ > **Note:** Krira uses O(1) memory. Processing a 100GB file uses the same amount of RAM as a 10MB file.
71
+
72
+ ---
73
+
74
+ ## 📦 Installation
75
+
76
+ ```bash
77
+ pip install krira-augment
78
+ ```
79
+
80
+ *Requirements: Python 3.10+*
81
+
82
+ ---
83
+
84
+ ## 🛠️ Usage
85
+
86
+ ### 1. Quick Start
87
+ For standard use cases, use the default high-throughput pipeline.
88
+
89
+ ```python
90
+ from krira_augment import Pipeline
91
+
92
+ # Initialize the pipeline
93
+ pipeline = Pipeline()
94
+
95
+ # Process a 1GB file in seconds
96
+ stats = pipeline.process(
97
+ input_path="data/raw_knowledge_base.csv",
98
+ output_path="data/processed_chunks.jsonl"
99
+ )
100
+
101
+ print(f"✅ Processing complete chunking job.")
102
+ ```
103
+
104
+ ### 2. Advanced Configuration (Professional)
105
+ For production RAG, you need fine-grained control over chunking strategies, overlap, and data cleaning.
106
+
107
+ ```python
108
+ from krira_augment import Pipeline, PipelineConfig, SplitStrategy
109
+
110
+ # Define a robust configuration
111
+ config = PipelineConfig(
112
+ # Chunking Strategy
113
+ chunk_size=512, # Target characters per chunk
114
+ chunk_overlap=50, # Context overlap for better retrieval
115
+ strategy=SplitStrategy.SMART, # Respects sentence/paragraph boundaries
116
+
117
+ # Data Cleaning Rules (Rust-native regex)
118
+ clean_html=True, # Remove <div>, <br>, etc.
119
+ clean_unicode=True, # Normalize whitespace and emojis
120
+ min_chunk_len=20, # Discard garbage/empty chunks
121
+
122
+ # System Performance
123
+ threads=8, # Force usage of 8 CPU cores
124
+ batch_size=1000 # Write to disk every 1k chunks (Low RAM usage)
125
+ )
126
+
127
+ # Initialize with config
128
+ pipeline = Pipeline(config=config)
129
+
130
+ # Execute
131
+ result = pipeline.process(
132
+ input_path="large_corpus.csv",
133
+ output_path="corpus_vectors.jsonl"
134
+ )
135
+
136
+ print(f"Job ID: {result.job_id}")
137
+ print(f"Throughput: {result.mb_per_second:.2f} MB/s")
138
+ ```
139
+
140
+ ---
141
+
142
+ ## 📄 Output Format
143
+
144
+ The library outputs standard **JSONL** (JSON Lines), ready for direct ingestion into vector databases (Pinecone, Weaviate, Qdrant).
145
+
146
+ **`processed_chunks.jsonl`**:
147
+ ```json
148
+ {"text": "The mitochondria is the powerhouse...", "metadata": {"source": "doc1.csv", "row": 1, "chunk_index": 0}}
149
+ {"text": "It generates most of the chemical energy...", "metadata": {"source": "doc1.csv", "row": 1, "chunk_index": 1}}
150
+ ```
151
+
152
+ ---
153
+
154
+ ## 🏗️ Architecture
155
+
156
+ Krira differs from standard Python loaders by offloading the entire ETL process to a compiled Rust binary.
157
+
158
+ 1. **Memory Mapping (mmap):** The file is mapped directly from disk to virtual memory. No loading 1GB CSVs into Python RAM.
159
+ 2. **Rayon Parallelism:** The file is sliced into segments and processed across all available CPU cores simultaneously.
160
+ 3. **Serde Serialization:** Chunks are serialized to JSONL directly on the Rust thread, minimizing Python GIL interaction.
161
+
162
+ ---
163
+
164
+ ## 🤝 Integration Example
165
+
166
+ Seamlessly integrate with generic Python generators to feed embeddings.
167
+
168
+ ```python
169
+ import json
170
+ import openai
171
+
172
+ def stream_chunks(jsonl_path):
173
+ """Yields chunks efficiently for embedding API calls."""
174
+ with open(jsonl_path, 'r') as f:
175
+ for line in f:
176
+ yield json.loads(line)
177
+
178
+ # Use in your downstream application
179
+ for chunk in stream_chunks("processed_chunks.jsonl"):
180
+ # Mock embedding call
181
+ # embedding = openai.Embedding.create(input=chunk['text'])
182
+ pass
183
+
184
+ # Upsert to Vector DB (e.g., Pinecone)
185
+ # index.upsert(vectors=[(chunk['id'], embedding, chunk['metadata'])])
186
+ ```
187
+
188
+ ---
189
+
190
+ ## 🧑‍💻 Development
191
+
192
+ If you want to modify the Rust core:
193
+
194
+ 1. **Clone the repo**
195
+ 2. **Install Maturin** (Rust-Python bridge builder)
196
+ ```bash
197
+ pip install maturin
198
+ ```
199
+ 3. **Build and Install locally**
200
+ ```bash
201
+ maturin develop --release
202
+ ```
203
+
204
+ ---
205
+
206
+ ## License
207
+
208
+ MIT License.
209
+