py-chunks 0.2.3__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {py_chunks-0.2.3 → py_chunks-0.3.0}/Cargo.lock +47 -1
- {py_chunks-0.2.3 → py_chunks-0.3.0}/Cargo.toml +1 -0
- py_chunks-0.2.3/README.md → py_chunks-0.3.0/PKG-INFO +151 -7
- py_chunks-0.2.3/PKG-INFO → py_chunks-0.3.0/README.md +134 -21
- {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/__init__.py +53 -5
- {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/__init__.py +3 -0
- py_chunks-0.3.0/py_chunks/chunkers/xlsx.py +158 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/pyproject.toml +6 -3
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/mod.rs +1 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/semantic.rs +1 -1
- py_chunks-0.3.0/src/extensions/xlsx/common.rs +511 -0
- py_chunks-0.3.0/src/extensions/xlsx/mod.rs +19 -0
- py_chunks-0.3.0/src/extensions/xlsx/page_aware.rs +432 -0
- py_chunks-0.3.0/src/extensions/xlsx/row_document.rs +68 -0
- py_chunks-0.3.0/src/extensions/xlsx/semantic.rs +332 -0
- py_chunks-0.3.0/src/extensions/xlsx/sheet.rs +195 -0
- py_chunks-0.3.0/src/extensions/xlsx/sliding_window.rs +221 -0
- py_chunks-0.3.0/src/extensions/xlsx/stream_iter.rs +503 -0
- py_chunks-0.3.0/src/extensions/xlsx/table_region.rs +506 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/lib.rs +1 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/.github/workflows/release.yml +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/.gitignore +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/.pylintrc +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/LICENSE +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/docx.py +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/html.py +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/md.py +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/pdf.py +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/pptx.py +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/txt.py +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/common.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/mod.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/page_aware.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/section.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/semantic.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/sentence.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/sliding_window.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/structural.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/common.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/mod.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/page_aware.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/section.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/semantic.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/sentence.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/sliding_window.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/stream_iter.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/structural.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/common.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/mod.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/page_aware.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/section.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/semantic.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/sentence.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/sliding_window.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/stream_iter.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/structural.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pdf/common.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pdf/mod.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pdf/stream_iter.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pdf/structural.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/common.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/mod.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/page_aware.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/section.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/sentence.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/sliding_window.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/stream_iter.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/structural.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/shared.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/common.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/mod.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/page_aware.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/section.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/semantic.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/sentence.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/sliding_window.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/stream_iter.rs +0 -0
- {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/structural.rs +0 -0
|
@@ -107,6 +107,22 @@ dependencies = [
|
|
|
107
107
|
"pkg-config",
|
|
108
108
|
]
|
|
109
109
|
|
|
110
|
+
[[package]]
|
|
111
|
+
name = "calamine"
|
|
112
|
+
version = "0.26.1"
|
|
113
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
114
|
+
checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1"
|
|
115
|
+
dependencies = [
|
|
116
|
+
"byteorder",
|
|
117
|
+
"chrono",
|
|
118
|
+
"codepage",
|
|
119
|
+
"encoding_rs",
|
|
120
|
+
"log",
|
|
121
|
+
"quick-xml 0.31.0",
|
|
122
|
+
"serde",
|
|
123
|
+
"zip",
|
|
124
|
+
]
|
|
125
|
+
|
|
110
126
|
[[package]]
|
|
111
127
|
name = "cc"
|
|
112
128
|
version = "1.2.61"
|
|
@@ -134,6 +150,7 @@ dependencies = [
|
|
|
134
150
|
"iana-time-zone",
|
|
135
151
|
"js-sys",
|
|
136
152
|
"num-traits",
|
|
153
|
+
"serde",
|
|
137
154
|
"wasm-bindgen",
|
|
138
155
|
"windows-link",
|
|
139
156
|
]
|
|
@@ -148,6 +165,15 @@ dependencies = [
|
|
|
148
165
|
"inout",
|
|
149
166
|
]
|
|
150
167
|
|
|
168
|
+
[[package]]
|
|
169
|
+
name = "codepage"
|
|
170
|
+
version = "0.1.2"
|
|
171
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
172
|
+
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
|
|
173
|
+
dependencies = [
|
|
174
|
+
"encoding_rs",
|
|
175
|
+
]
|
|
176
|
+
|
|
151
177
|
[[package]]
|
|
152
178
|
name = "console_error_panic_hook"
|
|
153
179
|
version = "0.1.7"
|
|
@@ -283,6 +309,15 @@ version = "1.15.0"
|
|
|
283
309
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
284
310
|
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
|
285
311
|
|
|
312
|
+
[[package]]
|
|
313
|
+
name = "encoding_rs"
|
|
314
|
+
version = "0.8.35"
|
|
315
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
316
|
+
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
|
|
317
|
+
dependencies = [
|
|
318
|
+
"cfg-if",
|
|
319
|
+
]
|
|
320
|
+
|
|
286
321
|
[[package]]
|
|
287
322
|
name = "equivalent"
|
|
288
323
|
version = "1.0.2"
|
|
@@ -667,10 +702,11 @@ checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f"
|
|
|
667
702
|
name = "py_chunks"
|
|
668
703
|
version = "0.1.6"
|
|
669
704
|
dependencies = [
|
|
705
|
+
"calamine",
|
|
670
706
|
"pdfium-render",
|
|
671
707
|
"pyo3",
|
|
672
708
|
"pythonize",
|
|
673
|
-
"quick-xml",
|
|
709
|
+
"quick-xml 0.38.4",
|
|
674
710
|
"serde_json",
|
|
675
711
|
"zip",
|
|
676
712
|
]
|
|
@@ -748,6 +784,16 @@ dependencies = [
|
|
|
748
784
|
"serde",
|
|
749
785
|
]
|
|
750
786
|
|
|
787
|
+
[[package]]
|
|
788
|
+
name = "quick-xml"
|
|
789
|
+
version = "0.31.0"
|
|
790
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
791
|
+
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
|
|
792
|
+
dependencies = [
|
|
793
|
+
"encoding_rs",
|
|
794
|
+
"memchr",
|
|
795
|
+
]
|
|
796
|
+
|
|
751
797
|
[[package]]
|
|
752
798
|
name = "quick-xml"
|
|
753
799
|
version = "0.38.4"
|
|
@@ -1,18 +1,36 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: py-chunks
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Classifier: Programming Language :: Python :: 3
|
|
5
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
6
|
+
Classifier: Operating System :: OS Independent
|
|
7
|
+
Requires-Dist: pypdfium2
|
|
8
|
+
Requires-Dist: openpyxl>=3.1 ; extra == 'dev'
|
|
9
|
+
Provides-Extra: dev
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Summary: Rust-backed Python chunking library
|
|
12
|
+
Keywords: chunking,document,pdf,docx,xlsx,rust
|
|
13
|
+
License: MIT
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
16
|
+
|
|
1
17
|
# py-chunks
|
|
2
18
|
|
|
3
19
|
[](https://www.python.org/downloads/) [](LICENSE)
|
|
4
20
|
|
|
5
|
-
Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, and
|
|
21
|
+
Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, HTML, XLSX, and XLS files — optimised for production use.
|
|
6
22
|
|
|
7
23
|
## Features
|
|
8
24
|
|
|
9
|
-
- **
|
|
10
|
-
- **7 Chunking Modes
|
|
25
|
+
- **8 Supported File Formats**: PDF, DOCX, PPTX, Markdown, HTML, TXT, XLSX, XLS
|
|
26
|
+
- **7 Chunking Modes for document formats**: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`
|
|
27
|
+
- **6 Chunking Modes for spreadsheet formats** (XLSX / XLS): `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`
|
|
11
28
|
- **Streaming for every format** via a single `stream_chunks()` entry point
|
|
12
29
|
- PDF: background Rust thread + `mpsc` channel (all 7 modes, true one-chunk-at-a-time)
|
|
13
30
|
- Markdown / HTML / TXT: block-by-block state machine for `structural` + `semantic`; batch-drain for the rest
|
|
14
31
|
- DOCX: all 7 modes — `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode iterators for the remaining 5 modes (lazy chunk emission after a single upfront parse)
|
|
15
32
|
- PPTX: batch-drain (ZIP must be read upfront, then chunks are yielded one at a time)
|
|
33
|
+
- XLSX / XLS: `row` and `sliding_window` use true state machines (one chunk per `__next__`, O(parsed_rows) memory); `table`, `sheet`, `page_aware`, and `semantic` use batch-drain (global sheet analysis required before first chunk)
|
|
16
34
|
- **Multiple Input Sources**: local file paths, raw `bytes` / `bytearray` / `memoryview`, file-like objects (`BytesIO`, open files), FastAPI / Starlette `UploadFile`, HTTP(S) / S3 pre-signed URLs
|
|
17
35
|
- **Consistent Output Schema**: every chunk is a `dict` with `content`, `content_type`, and `metadata` keys
|
|
18
36
|
- **Minimal Python runtime dependencies**: all parsing happens in the Rust extension; the only declared Python requirement is `pypdfium2`, and the PDFium native binary is bundled inside the wheel
|
|
@@ -50,6 +68,8 @@ chunks = get_chunks("notes.md", mode="semantic")
|
|
|
50
68
|
chunks = get_chunks("page.html", mode="section")
|
|
51
69
|
chunks = get_chunks("deck.pptx", mode="sliding_window", window_size=3, overlap=1)
|
|
52
70
|
chunks = get_chunks("report.docx", mode="sentence", sentences_per_chunk=3)
|
|
71
|
+
chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
|
|
72
|
+
chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
|
|
53
73
|
|
|
54
74
|
for chunk in chunks:
|
|
55
75
|
print(chunk["content"])
|
|
@@ -195,6 +215,86 @@ These three formats also support **streaming in every mode** — see the Streami
|
|
|
195
215
|
|
|
196
216
|
---
|
|
197
217
|
|
|
218
|
+
### XLSX / XLS modes
|
|
219
|
+
|
|
220
|
+
Both `.xlsx` and `.xls` files are handled by the same chunker. All 6 modes are available for batch and streaming:
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
from py_chunks import get_chunks, stream_chunks
|
|
224
|
+
from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx
|
|
225
|
+
|
|
226
|
+
# Batch — via unified API
|
|
227
|
+
chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
|
|
228
|
+
chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
|
|
229
|
+
|
|
230
|
+
# Batch — via format-specific chunker (returns chunks + timing)
|
|
231
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
|
|
232
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
|
|
233
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
|
|
234
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
|
|
235
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
|
|
236
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
|
|
237
|
+
|
|
238
|
+
# Filter to specific sheets
|
|
239
|
+
chunks, _ = chunk_xlsx("data.xlsx", mode="row", sheet_names=["Sales", "Q4"])
|
|
240
|
+
|
|
241
|
+
# Streaming — identical output to batch
|
|
242
|
+
for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=5):
|
|
243
|
+
print(chunk["content"])
|
|
244
|
+
|
|
245
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1):
|
|
246
|
+
embed_and_store(chunk)
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
| Mode | `content_type` | Description |
|
|
250
|
+
|---|---|---|
|
|
251
|
+
| `row` | `row_document` | Groups N consecutive data rows into one chunk. Header row is auto-detected and excluded from content. Param: `rows_per_chunk` (default 1). |
|
|
252
|
+
| `table` | `table_region` | Named Excel tables (XLSX only) or heuristic contiguous data regions per sheet. For XLS and sheets without named tables, falls back to bounding-box detection. Param: `max_chunk_chars`. |
|
|
253
|
+
| `sheet` | `sheet` | One chunk per sheet (split by `max_chunk_chars` if needed). Includes named-table metadata. Param: `max_chunk_chars`. |
|
|
254
|
+
| `sliding_window` | `row_window` | Overlapping windows of N rows. Params: `window_size` (default 3), `overlap` (default 1, must be `< window_size`). |
|
|
255
|
+
| `page_aware` | `sheet_region` | Chunks by Excel print areas (XLSX only); falls back to the full sheet when no print area is defined. For XLS, always uses the full-sheet fallback. Param: `max_chunk_chars`. |
|
|
256
|
+
| `semantic` | `semantic_group` | Detects the column with the lowest cardinality of string values, sorts by it, and groups rows sharing the same category value. Falls back to fixed-size chunking when no suitable column is found. Param: `rows_per_chunk` (used for the fallback). |
|
|
257
|
+
|
|
258
|
+
**Parameters accepted by `chunk_xlsx` and `stream_chunk_xlsx`:**
|
|
259
|
+
|
|
260
|
+
| Parameter | Type | Default | Description |
|
|
261
|
+
|---|---|---|---|
|
|
262
|
+
| `file_path` | str | — | Path to `.xlsx` or `.xls` file |
|
|
263
|
+
| `mode` | str | `"row"` | One of the 6 modes above |
|
|
264
|
+
| `rows_per_chunk` | int | 1 | Rows per chunk (`row` mode and `semantic` fallback). Must be `> 0`. |
|
|
265
|
+
| `window_size` | int | 3 | Window size in rows (`sliding_window` mode). Must be `>= 1`. |
|
|
266
|
+
| `overlap` | int | 1 | Overlapping rows between consecutive windows (`sliding_window` mode). Must be `< window_size`. |
|
|
267
|
+
| `include_headers` | bool | True | Prefix each row value with its column header (`key: value` format). |
|
|
268
|
+
| `sheet_names` | list[str] \| None | None | Process only the named sheets; processes all sheets when `None` or `[]`. |
|
|
269
|
+
| `skip_empty_rows` | bool | True | Skip rows where every cell is empty. |
|
|
270
|
+
| `max_chunk_chars` | int | 2000 | Character limit per chunk (`table`, `sheet`, `page_aware` modes). |
|
|
271
|
+
|
|
272
|
+
**XLS vs XLSX differences:**
|
|
273
|
+
|
|
274
|
+
| Feature | XLSX | XLS |
|
|
275
|
+
|---|---|---|
|
|
276
|
+
| Named table detection (`table` mode) | ZIP XML (`table1.xml`) — full named-table metadata | Not available — heuristic bounding-box only; `is_named_table` is always `false` |
|
|
277
|
+
| Print area detection (`page_aware` mode) | Parsed from `xl/workbook.xml` | Not available — always uses full-sheet fallback; `has_print_area` is always `false` |
|
|
278
|
+
| Named table metadata in `sheet` mode | `has_named_tables: true/false`, `named_tables: [...]` | Always `has_named_tables: false`, `named_tables: []` |
|
|
279
|
+
| All other modes | Identical | Identical |
|
|
280
|
+
|
|
281
|
+
**XLSX / XLS metadata fields by mode:**
|
|
282
|
+
|
|
283
|
+
| Mode | Notable metadata keys |
|
|
284
|
+
|---|---|
|
|
285
|
+
| `row` | `sheet_name`, `sheet_index`, `row_index`, `header_row`, `col_count`, `rows_per_chunk`, `actual_row_count`, `chunk_index` |
|
|
286
|
+
| `table` | `sheet_name`, `sheet_index`, `table_name`, `is_named_table`, `header_row`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `chunk_index`, `is_split`, `split_part` |
|
|
287
|
+
| `sheet` | `sheet_name`, `sheet_index`, `row_count`, `col_count`, `header_row`, `has_named_tables`, `named_tables`, `chunk_index`, `is_split`, `split_part` |
|
|
288
|
+
| `sliding_window` | `sheet_name`, `sheet_index`, `window_size`, `overlap`, `actual_row_count`, `window_index`, `start_row`, `end_row`, `header_row`, `col_count`, `chunk_index` |
|
|
289
|
+
| `page_aware` | `sheet_name`, `sheet_index`, `has_print_area`, `print_area_ref`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `header_row`, `region_index`, `chunk_index`, `is_split`, `split_part` |
|
|
290
|
+
| `semantic` | `sheet_name`, `sheet_index`, `category_column`, `category_value`, `used_fallback`, `low_grouping_quality`, `avg_group_size`, `start_row`, `end_row`, `actual_row_count`, `header_row`, `col_count`, `group_index`, `chunk_index` |
|
|
291
|
+
|
|
292
|
+
> **Streaming memory profile**: `row` and `sliding_window` pre-parse all sheet data once (calamine reads the entire file on open — there is no incremental I/O at the format level), then build and yield one chunk per `__next__`. The other four modes require global sheet analysis before the first chunk can be emitted, so they materialise all chunks at construction time and drain them lazily. In both cases the streaming iterator yields one chunk at a time.
|
|
293
|
+
|
|
294
|
+
> **Header detection**: the first all-string row in each sheet is automatically detected as the header row and excluded from chunk content. Columns without a header label are named `Column 1`, `Column 2`, etc.
|
|
295
|
+
|
|
296
|
+
---
|
|
297
|
+
|
|
198
298
|
## Streaming
|
|
199
299
|
|
|
200
300
|
### When to use streaming
|
|
@@ -214,6 +314,7 @@ Use `stream_chunks` (or the `stream_chunks_from_*` variants) when:
|
|
|
214
314
|
| **TXT** | All 7 | Same as Markdown | Pure Rust, no threads. |
|
|
215
315
|
| **DOCX** | All 7 | `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode Rust iterators for the other 5 | Full document parsed once upfront; chunks emitted lazily. Peak memory ≈ file size + chunk vec. Output equals `get_chunks` for every mode. |
|
|
216
316
|
| **PPTX** | All 7 | Batch-drain | PPTX requires the full ZIP up front, so chunks are computed once at construction and yielded one per `__next__`. |
|
|
317
|
+
| **XLSX / XLS** | All 6 | State machine for `row` / `sliding_window`; batch-drain for `table` / `sheet` / `page_aware` / `semantic` | calamine reads the full file on open (no incremental I/O at format level). `row` and `sliding_window` build one chunk per `__next__` from pre-parsed row data. The other four modes require global analysis first and materialise all chunks at iterator construction. Output is identical to `chunk_xlsx` for every mode. |
|
|
217
318
|
|
|
218
319
|
> **Parity guarantee**: `list(stream_chunks(...))` equals `list(get_chunks(...))` for every format and every supported mode (this is exercised by `test_pdf_streaming.py` for PDF and by the tests in `py_chunks/tests/test_source_apis.py`).
|
|
219
320
|
|
|
@@ -246,6 +347,19 @@ for chunk in stream_chunks("document.docx", mode="page_aware", paragraphs_per_
|
|
|
246
347
|
for chunk in stream_chunks("deck.pptx", mode="semantic"):
|
|
247
348
|
...
|
|
248
349
|
|
|
350
|
+
# XLSX / XLS — all 6 modes
|
|
351
|
+
for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=10):
|
|
352
|
+
embed_and_index(chunk)
|
|
353
|
+
|
|
354
|
+
for chunk in stream_chunks("report.xls", mode="sliding_window", window_size=5, overlap=2):
|
|
355
|
+
process(chunk)
|
|
356
|
+
|
|
357
|
+
for chunk in stream_chunks("data.xlsx", mode="table", max_chunk_chars=3000):
|
|
358
|
+
store_in_db(chunk)
|
|
359
|
+
|
|
360
|
+
for chunk in stream_chunks("data.xlsx", mode="semantic", rows_per_chunk=20):
|
|
361
|
+
handle(chunk)
|
|
362
|
+
|
|
249
363
|
# From bytes (e.g. FastAPI body)
|
|
250
364
|
for chunk in stream_chunks(request_body, filename="report.pdf", mode="semantic"):
|
|
251
365
|
process(chunk)
|
|
@@ -298,8 +412,11 @@ Or use the explicit source-specific helpers:
|
|
|
298
412
|
| Markdown | `.md` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
299
413
|
| HTML | `.html`, `.htm` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
300
414
|
| Plain Text | `.txt` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
415
|
+
| Excel | `.xlsx`, `.xls` | All 6 | All 6 (`row` / `sliding_window` state machine; others batch-drain) |
|
|
416
|
+
|
|
417
|
+
The 7 document modes are: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`.
|
|
301
418
|
|
|
302
|
-
The
|
|
419
|
+
The 6 spreadsheet modes are: `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`.
|
|
303
420
|
|
|
304
421
|
---
|
|
305
422
|
|
|
@@ -378,6 +495,7 @@ from py_chunks.chunkers.pptx import chunk_pptx, stream_chunk_pptx, chunk_pptx_wi
|
|
|
378
495
|
from py_chunks.chunkers.html import chunk_html, stream_chunk_html
|
|
379
496
|
from py_chunks.chunkers.md import chunk_md, stream_chunk_md
|
|
380
497
|
from py_chunks.chunkers.txt import chunk_txt, stream_chunk_txt
|
|
498
|
+
from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx # handles both .xlsx and .xls
|
|
381
499
|
|
|
382
500
|
# Batch with timing
|
|
383
501
|
chunks, timing = chunk_pdf("file.pdf", mode="section")
|
|
@@ -401,6 +519,19 @@ for chunk in stream_chunk_md("book.md", mode="sentence", sentences_per_chunk=2):
|
|
|
401
519
|
for chunk in stream_chunk_html("page.html", mode="section"): ...
|
|
402
520
|
for chunk in stream_chunk_txt("log.txt", mode="page_aware", paragraphs_per_page=20): ...
|
|
403
521
|
for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
|
|
522
|
+
|
|
523
|
+
# XLSX / XLS — all 6 modes, batch and streaming
|
|
524
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
|
|
525
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
|
|
526
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
|
|
527
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
|
|
528
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
|
|
529
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
|
|
530
|
+
chunks, timing = chunk_xlsx("legacy.xls", mode="row", rows_per_chunk=5) # XLS works identically
|
|
531
|
+
|
|
532
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=10): ...
|
|
533
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1): ...
|
|
534
|
+
for chunk in stream_chunk_xlsx("legacy.xls", mode="semantic", rows_per_chunk=20): ...
|
|
404
535
|
```
|
|
405
536
|
|
|
406
537
|
---
|
|
@@ -433,7 +564,13 @@ for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
|
|
|
433
564
|
| `semantic` | Heuristic topic-continuity group (`semantic` mode) |
|
|
434
565
|
| `sliding_window` | Fixed-size overlapping window (`sliding_window` mode) |
|
|
435
566
|
| `sentence` | Sentence-count group (`sentence` mode) |
|
|
436
|
-
| `page_aware` | Page boundary group (`page_aware` mode) |
|
|
567
|
+
| `page_aware` | Page boundary group (`page_aware` mode for document formats) |
|
|
568
|
+
| `row_document` | XLSX/XLS: N consecutive data rows (`row` mode) |
|
|
569
|
+
| `table_region` | XLSX/XLS: named table or heuristic data region (`table` mode) |
|
|
570
|
+
| `sheet` | XLSX/XLS: full sheet or split part (`sheet` mode) |
|
|
571
|
+
| `row_window` | XLSX/XLS: overlapping row window (`sliding_window` mode) |
|
|
572
|
+
| `sheet_region` | XLSX/XLS: print area or full sheet (`page_aware` mode) |
|
|
573
|
+
| `semantic_group` | XLSX/XLS: category-grouped rows or fallback fixed-size group (`semantic` mode) |
|
|
437
574
|
|
|
438
575
|
### Metadata fields by mode
|
|
439
576
|
|
|
@@ -613,7 +750,8 @@ def process_document(file_path: str):
|
|
|
613
750
|
│ Format Dispatcher │
|
|
614
751
|
│ (py_chunks/chunkers/*.py) │
|
|
615
752
|
│ chunk_pdf / chunk_docx / chunk_pptx / │
|
|
616
|
-
│ chunk_md / chunk_html / chunk_txt
|
|
753
|
+
│ chunk_md / chunk_html / chunk_txt / │
|
|
754
|
+
│ chunk_xlsx (handles .xlsx + .xls) + │
|
|
617
755
|
│ matching stream_chunk_* variants │
|
|
618
756
|
└──────────────┬───────────────────────────────┘
|
|
619
757
|
│ validates args, dispatches to the right Rust function,
|
|
@@ -639,6 +777,11 @@ def process_document(file_path: str):
|
|
|
639
777
|
│ DOCX stream — DocxStructuralIterator (default/structural) + │
|
|
640
778
|
│ per-mode iterators for all other 5 modes │
|
|
641
779
|
│ PPTX stream — batch-drain (ZIP must be read upfront) │
|
|
780
|
+
│ XLSX/XLS — open_workbook_auto() handles both formats; │
|
|
781
|
+
│ row / sliding_window: state machine, one chunk per __next__ │
|
|
782
|
+
│ table / sheet / page_aware / semantic: batch-drain │
|
|
783
|
+
│ table mode: ZIP XML for named tables (XLSX) or heuristic │
|
|
784
|
+
│ page_aware: print-area XML (XLSX) or full-sheet fallback │
|
|
642
785
|
└──────────────────────────────────────────────────────────────────┘
|
|
643
786
|
```
|
|
644
787
|
|
|
@@ -667,7 +810,7 @@ except FileNotFoundError as e:
|
|
|
667
810
|
try:
|
|
668
811
|
chunks = get_chunks("image.png")
|
|
669
812
|
except ValueError as e:
|
|
670
|
-
print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt
|
|
813
|
+
print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt, .xls, .xlsx
|
|
671
814
|
|
|
672
815
|
# Scanned / image-only PDF (no text layer)
|
|
673
816
|
try:
|
|
@@ -742,3 +885,4 @@ MIT
|
|
|
742
885
|
---
|
|
743
886
|
|
|
744
887
|
Built with Rust (performance) + Python (simplicity)
|
|
888
|
+
|
|
@@ -1,31 +1,20 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: py-chunks
|
|
3
|
-
Version: 0.2.3
|
|
4
|
-
Classifier: Programming Language :: Python :: 3
|
|
5
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
6
|
-
Classifier: Operating System :: OS Independent
|
|
7
|
-
License-File: LICENSE
|
|
8
|
-
Summary: Rust-backed Python chunking library
|
|
9
|
-
Keywords: chunking,document,pdf,docx,rust
|
|
10
|
-
License: MIT
|
|
11
|
-
Requires-Python: >=3.9
|
|
12
|
-
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
13
|
-
|
|
14
1
|
# py-chunks
|
|
15
2
|
|
|
16
3
|
[](https://www.python.org/downloads/) [](LICENSE)
|
|
17
4
|
|
|
18
|
-
Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, and
|
|
5
|
+
Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, HTML, XLSX, and XLS files — optimised for production use.
|
|
19
6
|
|
|
20
7
|
## Features
|
|
21
8
|
|
|
22
|
-
- **
|
|
23
|
-
- **7 Chunking Modes
|
|
9
|
+
- **8 Supported File Formats**: PDF, DOCX, PPTX, Markdown, HTML, TXT, XLSX, XLS
|
|
10
|
+
- **7 Chunking Modes for document formats**: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`
|
|
11
|
+
- **6 Chunking Modes for spreadsheet formats** (XLSX / XLS): `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`
|
|
24
12
|
- **Streaming for every format** via a single `stream_chunks()` entry point
|
|
25
13
|
- PDF: background Rust thread + `mpsc` channel (all 7 modes, true one-chunk-at-a-time)
|
|
26
14
|
- Markdown / HTML / TXT: block-by-block state machine for `structural` + `semantic`; batch-drain for the rest
|
|
27
15
|
- DOCX: all 7 modes — `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode iterators for the remaining 5 modes (lazy chunk emission after a single upfront parse)
|
|
28
16
|
- PPTX: batch-drain (ZIP must be read upfront, then chunks are yielded one at a time)
|
|
17
|
+
- XLSX / XLS: `row` and `sliding_window` use true state machines (one chunk per `__next__`, O(parsed_rows) memory); `table`, `sheet`, `page_aware`, and `semantic` use batch-drain (global sheet analysis required before first chunk)
|
|
29
18
|
- **Multiple Input Sources**: local file paths, raw `bytes` / `bytearray` / `memoryview`, file-like objects (`BytesIO`, open files), FastAPI / Starlette `UploadFile`, HTTP(S) / S3 pre-signed URLs
|
|
30
19
|
- **Consistent Output Schema**: every chunk is a `dict` with `content`, `content_type`, and `metadata` keys
|
|
31
20
|
- **Zero Python runtime dependencies**: all parsing happens in the Rust extension; the PDFium native binary is bundled inside the wheel
|
|
@@ -63,6 +52,8 @@ chunks = get_chunks("notes.md", mode="semantic")
|
|
|
63
52
|
chunks = get_chunks("page.html", mode="section")
|
|
64
53
|
chunks = get_chunks("deck.pptx", mode="sliding_window", window_size=3, overlap=1)
|
|
65
54
|
chunks = get_chunks("report.docx", mode="sentence", sentences_per_chunk=3)
|
|
55
|
+
chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
|
|
56
|
+
chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
|
|
66
57
|
|
|
67
58
|
for chunk in chunks:
|
|
68
59
|
print(chunk["content"])
|
|
@@ -208,6 +199,86 @@ These three formats also support **streaming in every mode** — see the Streami
|
|
|
208
199
|
|
|
209
200
|
---
|
|
210
201
|
|
|
202
|
+
### XLSX / XLS modes
|
|
203
|
+
|
|
204
|
+
Both `.xlsx` and `.xls` files are handled by the same chunker. All 6 modes are available for batch and streaming:
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
from py_chunks import get_chunks, stream_chunks
|
|
208
|
+
from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx
|
|
209
|
+
|
|
210
|
+
# Batch — via unified API
|
|
211
|
+
chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
|
|
212
|
+
chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
|
|
213
|
+
|
|
214
|
+
# Batch — via format-specific chunker (returns chunks + timing)
|
|
215
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
|
|
216
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
|
|
217
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
|
|
218
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
|
|
219
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
|
|
220
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
|
|
221
|
+
|
|
222
|
+
# Filter to specific sheets
|
|
223
|
+
chunks, _ = chunk_xlsx("data.xlsx", mode="row", sheet_names=["Sales", "Q4"])
|
|
224
|
+
|
|
225
|
+
# Streaming — identical output to batch
|
|
226
|
+
for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=5):
|
|
227
|
+
print(chunk["content"])
|
|
228
|
+
|
|
229
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1):
|
|
230
|
+
embed_and_store(chunk)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
| Mode | `content_type` | Description |
|
|
234
|
+
|---|---|---|
|
|
235
|
+
| `row` | `row_document` | Groups N consecutive data rows into one chunk. Header row is auto-detected and excluded from content. Param: `rows_per_chunk` (default 1). |
|
|
236
|
+
| `table` | `table_region` | Named Excel tables (XLSX only) or heuristic contiguous data regions per sheet. For XLS and sheets without named tables, falls back to bounding-box detection. Param: `max_chunk_chars`. |
|
|
237
|
+
| `sheet` | `sheet` | One chunk per sheet (split by `max_chunk_chars` if needed). Includes named-table metadata. Param: `max_chunk_chars`. |
|
|
238
|
+
| `sliding_window` | `row_window` | Overlapping windows of N rows. Params: `window_size` (default 3), `overlap` (default 1, must be `< window_size`). |
|
|
239
|
+
| `page_aware` | `sheet_region` | Chunks by Excel print areas (XLSX only); falls back to the full sheet when no print area is defined. For XLS, always uses the full-sheet fallback. Param: `max_chunk_chars`. |
|
|
240
|
+
| `semantic` | `semantic_group` | Detects the column with the lowest cardinality of string values, sorts by it, and groups rows sharing the same category value. Falls back to fixed-size chunking when no suitable column is found. Param: `rows_per_chunk` (used for the fallback). |
|
|
241
|
+
|
|
242
|
+
**Parameters accepted by `chunk_xlsx` and `stream_chunk_xlsx`:**
|
|
243
|
+
|
|
244
|
+
| Parameter | Type | Default | Description |
|
|
245
|
+
|---|---|---|---|
|
|
246
|
+
| `file_path` | str | — | Path to `.xlsx` or `.xls` file |
|
|
247
|
+
| `mode` | str | `"row"` | One of the 6 modes above |
|
|
248
|
+
| `rows_per_chunk` | int | 1 | Rows per chunk (`row` mode and `semantic` fallback). Must be `> 0`. |
|
|
249
|
+
| `window_size` | int | 3 | Window size in rows (`sliding_window` mode). Must be `>= 1`. |
|
|
250
|
+
| `overlap` | int | 1 | Overlapping rows between windows. Must be `< window_size`. |
|
|
251
|
+
| `include_headers` | bool | True | Prefix each cell value with its column header (`key: value` format). |
|
|
252
|
+
| `sheet_names` | list[str] \| None | None | Process only the named sheets; processes all sheets when `None` or `[]`. |
|
|
253
|
+
| `skip_empty_rows` | bool | True | Skip rows where every cell is empty. |
|
|
254
|
+
| `max_chunk_chars` | int | 2000 | Character limit per chunk (`table`, `sheet`, `page_aware` modes). |
|
|
255
|
+
|
|
256
|
+
**XLS vs XLSX differences:**
|
|
257
|
+
|
|
258
|
+
| Feature | XLSX | XLS |
|
|
259
|
+
|---|---|---|
|
|
260
|
+
| Named table detection (`table` mode) | ZIP XML (`table1.xml`) — full named-table metadata | Not available — heuristic bounding-box only; `is_named_table` is always `false` |
|
|
261
|
+
| Print area detection (`page_aware` mode) | Parsed from `xl/workbook.xml` | Not available — always uses full-sheet fallback; `has_print_area` is always `false` |
|
|
262
|
+
| Named table metadata in `sheet` mode | `has_named_tables: true/false`, `named_tables: [...]` | Always `has_named_tables: false`, `named_tables: []` |
|
|
263
|
+
| All other modes | Identical | Identical |
|
|
264
|
+
|
|
265
|
+
**XLSX / XLS metadata fields by mode:**
|
|
266
|
+
|
|
267
|
+
| Mode | Notable metadata keys |
|
|
268
|
+
|---|---|
|
|
269
|
+
| `row` | `sheet_name`, `sheet_index`, `row_index`, `header_row`, `col_count`, `rows_per_chunk`, `actual_row_count`, `chunk_index` |
|
|
270
|
+
| `table` | `sheet_name`, `sheet_index`, `table_name`, `is_named_table`, `header_row`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `chunk_index`, `is_split`, `split_part` |
|
|
271
|
+
| `sheet` | `sheet_name`, `sheet_index`, `row_count`, `col_count`, `header_row`, `has_named_tables`, `named_tables`, `chunk_index`, `is_split`, `split_part` |
|
|
272
|
+
| `sliding_window` | `sheet_name`, `sheet_index`, `window_size`, `overlap`, `actual_row_count`, `window_index`, `start_row`, `end_row`, `header_row`, `col_count`, `chunk_index` |
|
|
273
|
+
| `page_aware` | `sheet_name`, `sheet_index`, `has_print_area`, `print_area_ref`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `header_row`, `region_index`, `chunk_index`, `is_split`, `split_part` |
|
|
274
|
+
| `semantic` | `sheet_name`, `sheet_index`, `category_column`, `category_value`, `used_fallback`, `low_grouping_quality`, `avg_group_size`, `start_row`, `end_row`, `actual_row_count`, `header_row`, `col_count`, `group_index`, `chunk_index` |
|
|
275
|
+
|
|
276
|
+
> **Streaming memory profile**: `row` and `sliding_window` pre-parse all sheet data once (calamine reads the entire file on open — there is no incremental I/O at the format level), then build and yield one chunk per `__next__`. The other four modes require global sheet analysis before the first chunk can be emitted, so they materialise all chunks at construction time and drain them lazily. In both cases the streaming iterator yields one chunk at a time.
|
|
277
|
+
|
|
278
|
+
> **Header detection**: the first all-string row in each sheet is automatically detected as the header row and excluded from chunk content. Columns without a header label are named `Column 1`, `Column 2`, etc.
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
211
282
|
## Streaming
|
|
212
283
|
|
|
213
284
|
### When to use streaming
|
|
@@ -227,6 +298,7 @@ Use `stream_chunks` (or the `stream_chunks_from_*` variants) when:
|
|
|
227
298
|
| **TXT** | All 7 | Same as Markdown | Pure Rust, no threads. |
|
|
228
299
|
| **DOCX** | All 7 | `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode Rust iterators for the other 5 | Full document parsed once upfront; chunks emitted lazily. Peak memory ≈ file size + chunk vec. Output equals `get_chunks` for every mode. |
|
|
229
300
|
| **PPTX** | All 7 | Batch-drain | PPTX requires the full ZIP up front, so chunks are computed once at construction and yielded one per `__next__`. |
|
|
301
|
+
| **XLSX / XLS** | All 6 | State machine for `row` / `sliding_window`; batch-drain for `table` / `sheet` / `page_aware` / `semantic` | calamine reads the full file on open (no incremental I/O at format level). `row` and `sliding_window` build one chunk per `__next__` from pre-parsed row data. The other four modes require global analysis first and materialise all chunks at iterator construction. The streamed chunks are identical to the chunks returned by `chunk_xlsx` for every mode. |
|
|
230
302
|
|
|
231
303
|
> **Parity guarantee**: `list(stream_chunks(...))` equals `get_chunks(...)` for every format and every supported mode (this is exercised by `test_pdf_streaming.py` for PDF and by the tests in `py_chunks/tests/test_source_apis.py`).
|
|
232
304
|
|
|
@@ -259,6 +331,19 @@ for chunk in stream_chunks("document.docx", mode="page_aware", paragraphs_per_
|
|
|
259
331
|
for chunk in stream_chunks("deck.pptx", mode="semantic"):
|
|
260
332
|
...
|
|
261
333
|
|
|
334
|
+
# XLSX / XLS — all 6 modes
|
|
335
|
+
for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=10):
|
|
336
|
+
embed_and_index(chunk)
|
|
337
|
+
|
|
338
|
+
for chunk in stream_chunks("report.xls", mode="sliding_window", window_size=5, overlap=2):
|
|
339
|
+
process(chunk)
|
|
340
|
+
|
|
341
|
+
for chunk in stream_chunks("data.xlsx", mode="table", max_chunk_chars=3000):
|
|
342
|
+
store_in_db(chunk)
|
|
343
|
+
|
|
344
|
+
for chunk in stream_chunks("data.xlsx", mode="semantic", rows_per_chunk=20):
|
|
345
|
+
handle(chunk)
|
|
346
|
+
|
|
262
347
|
# From bytes (e.g. FastAPI body)
|
|
263
348
|
for chunk in stream_chunks(request_body, filename="report.pdf", mode="semantic"):
|
|
264
349
|
process(chunk)
|
|
@@ -311,8 +396,11 @@ Or use the explicit source-specific helpers:
|
|
|
311
396
|
| Markdown | `.md` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
312
397
|
| HTML | `.html`, `.htm` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
313
398
|
| Plain Text | `.txt` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
399
|
+
| Excel | `.xlsx`, `.xls` | All 6 | All 6 (`row` / `sliding_window` state machine; others batch-drain) |
|
|
400
|
+
|
|
401
|
+
The 7 document modes are: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`.
|
|
314
402
|
|
|
315
|
-
The
|
|
403
|
+
The 6 spreadsheet modes are: `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`.
|
|
316
404
|
|
|
317
405
|
---
|
|
318
406
|
|
|
@@ -391,6 +479,7 @@ from py_chunks.chunkers.pptx import chunk_pptx, stream_chunk_pptx, chunk_pptx_wi
|
|
|
391
479
|
from py_chunks.chunkers.html import chunk_html, stream_chunk_html
|
|
392
480
|
from py_chunks.chunkers.md import chunk_md, stream_chunk_md
|
|
393
481
|
from py_chunks.chunkers.txt import chunk_txt, stream_chunk_txt
|
|
482
|
+
from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx # handles both .xlsx and .xls
|
|
394
483
|
|
|
395
484
|
# Batch with timing
|
|
396
485
|
chunks, timing = chunk_pdf("file.pdf", mode="section")
|
|
@@ -414,6 +503,19 @@ for chunk in stream_chunk_md("book.md", mode="sentence", sentences_per_chunk=2):
|
|
|
414
503
|
for chunk in stream_chunk_html("page.html", mode="section"): ...
|
|
415
504
|
for chunk in stream_chunk_txt("log.txt", mode="page_aware", paragraphs_per_page=20): ...
|
|
416
505
|
for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
|
|
506
|
+
|
|
507
|
+
# XLSX / XLS — all 6 modes, batch and streaming
|
|
508
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
|
|
509
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
|
|
510
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
|
|
511
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
|
|
512
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
|
|
513
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
|
|
514
|
+
chunks, timing = chunk_xlsx("legacy.xls", mode="row", rows_per_chunk=5) # XLS works identically
|
|
515
|
+
|
|
516
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=10): ...
|
|
517
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1): ...
|
|
518
|
+
for chunk in stream_chunk_xlsx("legacy.xls", mode="semantic", rows_per_chunk=20): ...
|
|
417
519
|
```
|
|
418
520
|
|
|
419
521
|
---
|
|
@@ -446,7 +548,13 @@ for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
|
|
|
446
548
|
| `semantic` | Heuristic topic-continuity group (`semantic` mode) |
|
|
447
549
|
| `sliding_window` | Fixed-size overlapping window (`sliding_window` mode) |
|
|
448
550
|
| `sentence` | Sentence-count group (`sentence` mode) |
|
|
449
|
-
| `page_aware` | Page boundary group (`page_aware` mode) |
|
|
551
|
+
| `page_aware` | Page boundary group (`page_aware` mode for document formats) |
|
|
552
|
+
| `row_document` | XLSX/XLS: N consecutive data rows (`row` mode) |
|
|
553
|
+
| `table_region` | XLSX/XLS: named table or heuristic data region (`table` mode) |
|
|
554
|
+
| `sheet` | XLSX/XLS: full sheet or split part (`sheet` mode) |
|
|
555
|
+
| `row_window` | XLSX/XLS: overlapping row window (`sliding_window` mode) |
|
|
556
|
+
| `sheet_region` | XLSX/XLS: print area or full sheet (`page_aware` mode) |
|
|
557
|
+
| `semantic_group` | XLSX/XLS: category-grouped rows or fallback fixed-size group (`semantic` mode) |
|
|
450
558
|
|
|
451
559
|
### Metadata fields by mode
|
|
452
560
|
|
|
@@ -626,7 +734,8 @@ def process_document(file_path: str):
|
|
|
626
734
|
│ Format Dispatcher │
|
|
627
735
|
│ (py_chunks/chunkers/*.py) │
|
|
628
736
|
│ chunk_pdf / chunk_docx / chunk_pptx / │
|
|
629
|
-
│ chunk_md / chunk_html / chunk_txt
|
|
737
|
+
│ chunk_md / chunk_html / chunk_txt / │
|
|
738
|
+
│ chunk_xlsx (handles .xlsx + .xls) + │
|
|
630
739
|
│ matching stream_chunk_* variants │
|
|
631
740
|
└──────────────┬───────────────────────────────┘
|
|
632
741
|
│ validates args, dispatches to the right Rust function,
|
|
@@ -652,6 +761,11 @@ def process_document(file_path: str):
|
|
|
652
761
|
│ DOCX stream — DocxStructuralIterator (default/structural) + │
|
|
653
762
|
│ per-mode iterators for all other 5 modes │
|
|
654
763
|
│ PPTX stream — batch-drain (ZIP must be read upfront) │
|
|
764
|
+
│ XLSX/XLS — open_workbook_auto() handles both formats; │
|
|
765
|
+
│ row / sliding_window: state machine, one chunk per __next__ │
|
|
766
|
+
│ table / sheet / page_aware / semantic: batch-drain │
|
|
767
|
+
│ table mode: ZIP XML for named tables (XLSX) or heuristic │
|
|
768
|
+
│ page_aware: print-area XML (XLSX) or full-sheet fallback │
|
|
655
769
|
└──────────────────────────────────────────────────────────────────┘
|
|
656
770
|
```
|
|
657
771
|
|
|
@@ -680,7 +794,7 @@ except FileNotFoundError as e:
|
|
|
680
794
|
try:
|
|
681
795
|
chunks = get_chunks("image.png")
|
|
682
796
|
except ValueError as e:
|
|
683
|
-
print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt
|
|
797
|
+
print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt, .xls, .xlsx
|
|
684
798
|
|
|
685
799
|
# Scanned / image-only PDF (no text layer)
|
|
686
800
|
try:
|
|
@@ -755,4 +869,3 @@ MIT
|
|
|
755
869
|
---
|
|
756
870
|
|
|
757
871
|
Built with Rust (performance) + Python (simplicity)
|
|
758
|
-
|