py-chunks 0.2.4__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {py_chunks-0.2.4 → py_chunks-0.3.0}/Cargo.lock +47 -1
- {py_chunks-0.2.4 → py_chunks-0.3.0}/Cargo.toml +1 -0
- py_chunks-0.2.4/README.md → py_chunks-0.3.0/PKG-INFO +151 -7
- py_chunks-0.2.4/PKG-INFO → py_chunks-0.3.0/README.md +134 -22
- {py_chunks-0.2.4 → py_chunks-0.3.0}/py_chunks/__init__.py +53 -5
- {py_chunks-0.2.4 → py_chunks-0.3.0}/py_chunks/chunkers/__init__.py +3 -0
- py_chunks-0.3.0/py_chunks/chunkers/xlsx.py +158 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/pyproject.toml +5 -2
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/mod.rs +1 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pptx/semantic.rs +1 -1
- py_chunks-0.3.0/src/extensions/xlsx/common.rs +511 -0
- py_chunks-0.3.0/src/extensions/xlsx/mod.rs +19 -0
- py_chunks-0.3.0/src/extensions/xlsx/page_aware.rs +432 -0
- py_chunks-0.3.0/src/extensions/xlsx/row_document.rs +68 -0
- py_chunks-0.3.0/src/extensions/xlsx/semantic.rs +332 -0
- py_chunks-0.3.0/src/extensions/xlsx/sheet.rs +195 -0
- py_chunks-0.3.0/src/extensions/xlsx/sliding_window.rs +221 -0
- py_chunks-0.3.0/src/extensions/xlsx/stream_iter.rs +503 -0
- py_chunks-0.3.0/src/extensions/xlsx/table_region.rs +506 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/lib.rs +1 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/.github/workflows/release.yml +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/.gitignore +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/.pylintrc +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/LICENSE +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/py_chunks/chunkers/docx.py +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/py_chunks/chunkers/html.py +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/py_chunks/chunkers/md.py +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/py_chunks/chunkers/pdf.py +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/py_chunks/chunkers/pptx.py +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/py_chunks/chunkers/txt.py +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/docx/common.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/docx/mod.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/docx/page_aware.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/docx/section.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/docx/semantic.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/docx/sentence.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/docx/sliding_window.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/docx/structural.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/html/common.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/html/mod.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/html/page_aware.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/html/section.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/html/semantic.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/html/sentence.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/html/sliding_window.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/html/stream_iter.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/html/structural.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/md/common.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/md/mod.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/md/page_aware.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/md/section.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/md/semantic.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/md/sentence.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/md/sliding_window.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/md/stream_iter.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/md/structural.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pdf/common.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pdf/mod.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pdf/stream_iter.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pdf/structural.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pptx/common.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pptx/mod.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pptx/page_aware.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pptx/section.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pptx/sentence.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pptx/sliding_window.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pptx/stream_iter.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/pptx/structural.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/shared.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/txt/common.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/txt/mod.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/txt/page_aware.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/txt/section.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/txt/semantic.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/txt/sentence.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/txt/sliding_window.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/txt/stream_iter.rs +0 -0
- {py_chunks-0.2.4 → py_chunks-0.3.0}/src/extensions/txt/structural.rs +0 -0
|
@@ -107,6 +107,22 @@ dependencies = [
|
|
|
107
107
|
"pkg-config",
|
|
108
108
|
]
|
|
109
109
|
|
|
110
|
+
[[package]]
|
|
111
|
+
name = "calamine"
|
|
112
|
+
version = "0.26.1"
|
|
113
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
114
|
+
checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1"
|
|
115
|
+
dependencies = [
|
|
116
|
+
"byteorder",
|
|
117
|
+
"chrono",
|
|
118
|
+
"codepage",
|
|
119
|
+
"encoding_rs",
|
|
120
|
+
"log",
|
|
121
|
+
"quick-xml 0.31.0",
|
|
122
|
+
"serde",
|
|
123
|
+
"zip",
|
|
124
|
+
]
|
|
125
|
+
|
|
110
126
|
[[package]]
|
|
111
127
|
name = "cc"
|
|
112
128
|
version = "1.2.61"
|
|
@@ -134,6 +150,7 @@ dependencies = [
|
|
|
134
150
|
"iana-time-zone",
|
|
135
151
|
"js-sys",
|
|
136
152
|
"num-traits",
|
|
153
|
+
"serde",
|
|
137
154
|
"wasm-bindgen",
|
|
138
155
|
"windows-link",
|
|
139
156
|
]
|
|
@@ -148,6 +165,15 @@ dependencies = [
|
|
|
148
165
|
"inout",
|
|
149
166
|
]
|
|
150
167
|
|
|
168
|
+
[[package]]
|
|
169
|
+
name = "codepage"
|
|
170
|
+
version = "0.1.2"
|
|
171
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
172
|
+
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
|
|
173
|
+
dependencies = [
|
|
174
|
+
"encoding_rs",
|
|
175
|
+
]
|
|
176
|
+
|
|
151
177
|
[[package]]
|
|
152
178
|
name = "console_error_panic_hook"
|
|
153
179
|
version = "0.1.7"
|
|
@@ -283,6 +309,15 @@ version = "1.15.0"
|
|
|
283
309
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
284
310
|
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
|
285
311
|
|
|
312
|
+
[[package]]
|
|
313
|
+
name = "encoding_rs"
|
|
314
|
+
version = "0.8.35"
|
|
315
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
316
|
+
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
|
|
317
|
+
dependencies = [
|
|
318
|
+
"cfg-if",
|
|
319
|
+
]
|
|
320
|
+
|
|
286
321
|
[[package]]
|
|
287
322
|
name = "equivalent"
|
|
288
323
|
version = "1.0.2"
|
|
@@ -667,10 +702,11 @@ checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f"
|
|
|
667
702
|
name = "py_chunks"
|
|
668
703
|
version = "0.1.6"
|
|
669
704
|
dependencies = [
|
|
705
|
+
"calamine",
|
|
670
706
|
"pdfium-render",
|
|
671
707
|
"pyo3",
|
|
672
708
|
"pythonize",
|
|
673
|
-
"quick-xml",
|
|
709
|
+
"quick-xml 0.38.4",
|
|
674
710
|
"serde_json",
|
|
675
711
|
"zip",
|
|
676
712
|
]
|
|
@@ -748,6 +784,16 @@ dependencies = [
|
|
|
748
784
|
"serde",
|
|
749
785
|
]
|
|
750
786
|
|
|
787
|
+
[[package]]
|
|
788
|
+
name = "quick-xml"
|
|
789
|
+
version = "0.31.0"
|
|
790
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
791
|
+
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
|
|
792
|
+
dependencies = [
|
|
793
|
+
"encoding_rs",
|
|
794
|
+
"memchr",
|
|
795
|
+
]
|
|
796
|
+
|
|
751
797
|
[[package]]
|
|
752
798
|
name = "quick-xml"
|
|
753
799
|
version = "0.38.4"
|
|
@@ -1,18 +1,36 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: py-chunks
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Classifier: Programming Language :: Python :: 3
|
|
5
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
6
|
+
Classifier: Operating System :: OS Independent
|
|
7
|
+
Requires-Dist: pypdfium2
|
|
8
|
+
Requires-Dist: openpyxl>=3.1 ; extra == 'dev'
|
|
9
|
+
Provides-Extra: dev
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Summary: Rust-backed Python chunking library
|
|
12
|
+
Keywords: chunking,document,pdf,docx,xlsx,rust
|
|
13
|
+
License: MIT
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
16
|
+
|
|
1
17
|
# py-chunks
|
|
2
18
|
|
|
3
19
|
[](https://www.python.org/downloads/) [](LICENSE)
|
|
4
20
|
|
|
5
|
-
Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, and HTML files — optimised for production use.
|
|
21
|
+
Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, HTML, XLSX, and XLS files — optimised for production use.
|
|
6
22
|
|
|
7
23
|
## Features
|
|
8
24
|
|
|
9
|
-
- **6 Document Formats**: PDF, DOCX, PPTX, Markdown, HTML, TXT
|
|
10
|
-
- **7 Chunking Modes**: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`
|
|
25
|
+
- **8 Document Formats**: PDF, DOCX, PPTX, Markdown, HTML, TXT, XLSX, XLS
|
|
26
|
+
- **7 Chunking Modes for document formats**: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`
|
|
27
|
+
- **6 Chunking Modes for spreadsheet formats** (XLSX / XLS): `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`
|
|
11
28
|
- **Streaming for every format** via a single `stream_chunks()` entry point
|
|
12
29
|
- PDF: background Rust thread + `mpsc` channel (all 7 modes, true one-chunk-at-a-time)
|
|
13
30
|
- Markdown / HTML / TXT: block-by-block state machine for `structural` + `semantic`; batch-drain for the rest
|
|
14
31
|
- DOCX: all 7 modes — `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode iterators for the remaining 5 modes (lazy chunk emission after a single upfront parse)
|
|
15
32
|
- PPTX: batch-drain (ZIP must be read upfront, then chunks are yielded one at a time)
|
|
33
|
+
- XLSX / XLS: `row` and `sliding_window` use true state machines (one chunk per `__next__`, O(parsed_rows) memory); `table`, `sheet`, `page_aware`, and `semantic` use batch-drain (global sheet analysis required before first chunk)
|
|
16
34
|
- **Multiple Input Sources**: local file paths, raw `bytes` / `bytearray` / `memoryview`, file-like objects (`BytesIO`, open files), FastAPI / Starlette `UploadFile`, HTTP(S) / S3 pre-signed URLs
|
|
17
35
|
- **Consistent Output Schema**: every chunk is a `dict` with `content`, `content_type`, and `metadata` keys
|
|
18
36
|
- **Zero Python runtime dependencies**: all parsing happens in the Rust extension; the PDFium native binary is bundled inside the wheel
|
|
@@ -50,6 +68,8 @@ chunks = get_chunks("notes.md", mode="semantic")
|
|
|
50
68
|
chunks = get_chunks("page.html", mode="section")
|
|
51
69
|
chunks = get_chunks("deck.pptx", mode="sliding_window", window_size=3, overlap=1)
|
|
52
70
|
chunks = get_chunks("report.docx", mode="sentence", sentences_per_chunk=3)
|
|
71
|
+
chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
|
|
72
|
+
chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
|
|
53
73
|
|
|
54
74
|
for chunk in chunks:
|
|
55
75
|
print(chunk["content"])
|
|
@@ -195,6 +215,86 @@ These three formats also support **streaming in every mode** — see the Streami
|
|
|
195
215
|
|
|
196
216
|
---
|
|
197
217
|
|
|
218
|
+
### XLSX / XLS modes
|
|
219
|
+
|
|
220
|
+
Both `.xlsx` and `.xls` files are handled by the same chunker. All 6 modes are available for batch and streaming:
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
from py_chunks import get_chunks, stream_chunks
|
|
224
|
+
from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx
|
|
225
|
+
|
|
226
|
+
# Batch — via unified API
|
|
227
|
+
chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
|
|
228
|
+
chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
|
|
229
|
+
|
|
230
|
+
# Batch — via format-specific chunker (returns chunks + timing)
|
|
231
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
|
|
232
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
|
|
233
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
|
|
234
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
|
|
235
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
|
|
236
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
|
|
237
|
+
|
|
238
|
+
# Filter to specific sheets
|
|
239
|
+
chunks, _ = chunk_xlsx("data.xlsx", mode="row", sheet_names=["Sales", "Q4"])
|
|
240
|
+
|
|
241
|
+
# Streaming — identical output to batch
|
|
242
|
+
for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=5):
|
|
243
|
+
print(chunk["content"])
|
|
244
|
+
|
|
245
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1):
|
|
246
|
+
embed_and_store(chunk)
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
| Mode | `content_type` | Description |
|
|
250
|
+
|---|---|---|
|
|
251
|
+
| `row` | `row_document` | Groups N consecutive data rows into one chunk. Header row is auto-detected and excluded from content. Param: `rows_per_chunk` (default 1). |
|
|
252
|
+
| `table` | `table_region` | Named Excel tables (XLSX only) or heuristic contiguous data regions per sheet. For XLS and sheets without named tables, falls back to bounding-box detection. Param: `max_chunk_chars`. |
|
|
253
|
+
| `sheet` | `sheet` | One chunk per sheet (split by `max_chunk_chars` if needed). Includes named-table metadata. Param: `max_chunk_chars`. |
|
|
254
|
+
| `sliding_window` | `row_window` | Overlapping windows of N rows. Params: `window_size` (default 3), `overlap` (default 1, must be `< window_size`). |
|
|
255
|
+
| `page_aware` | `sheet_region` | Chunks by Excel print areas (XLSX only); falls back to the full sheet when no print area is defined. For XLS, always uses the full-sheet fallback. Param: `max_chunk_chars`. |
|
|
256
|
+
| `semantic` | `semantic_group` | Detects the column with the lowest cardinality of string values, sorts by it, and groups rows sharing the same category value. Falls back to fixed-size chunking when no suitable column is found. Param: `rows_per_chunk` (used for the fallback). |
|
|
257
|
+
|
|
258
|
+
**Parameters accepted by `chunk_xlsx` and `stream_chunk_xlsx`:**
|
|
259
|
+
|
|
260
|
+
| Parameter | Type | Default | Description |
|
|
261
|
+
|---|---|---|---|
|
|
262
|
+
| `file_path` | str | — | Path to `.xlsx` or `.xls` file |
|
|
263
|
+
| `mode` | str | `"row"` | One of the 6 modes above |
|
|
264
|
+
| `rows_per_chunk` | int | 1 | Rows per chunk (`row` mode and `semantic` fallback). Must be `> 0`. |
|
|
265
|
+
| `window_size` | int | 3 | Window size in rows (`sliding_window` mode). Must be `>= 1`. |
|
|
266
|
+
| `overlap` | int | 1 | Overlapping rows between windows. Must be `< window_size`. |
|
|
267
|
+
| `include_headers` | bool | True | Prefix each row value with its column header (`key: value` format). |
|
|
268
|
+
| `sheet_names` | list[str] \| None | None | Process only the named sheets; processes all sheets when `None` or `[]`. |
|
|
269
|
+
| `skip_empty_rows` | bool | True | Skip rows where every cell is empty. |
|
|
270
|
+
| `max_chunk_chars` | int | 2000 | Character limit per chunk (`table`, `sheet`, `page_aware` modes). |
|
|
271
|
+
|
|
272
|
+
**XLS vs XLSX differences:**
|
|
273
|
+
|
|
274
|
+
| Feature | XLSX | XLS |
|
|
275
|
+
|---|---|---|
|
|
276
|
+
| Named table detection (`table` mode) | ZIP XML (`table1.xml`) — full named-table metadata | Not available — heuristic bounding-box only; `is_named_table` is always `false` |
|
|
277
|
+
| Print area detection (`page_aware` mode) | Parsed from `xl/workbook.xml` | Not available — always uses full-sheet fallback; `has_print_area` is always `false` |
|
|
278
|
+
| Named table metadata in `sheet` mode | `has_named_tables: true/false`, `named_tables: [...]` | Always `has_named_tables: false`, `named_tables: []` |
|
|
279
|
+
| All other modes | Identical | Identical |
|
|
280
|
+
|
|
281
|
+
**XLSX / XLS metadata fields by mode:**
|
|
282
|
+
|
|
283
|
+
| Mode | Notable metadata keys |
|
|
284
|
+
|---|---|
|
|
285
|
+
| `row` | `sheet_name`, `sheet_index`, `row_index`, `header_row`, `col_count`, `rows_per_chunk`, `actual_row_count`, `chunk_index` |
|
|
286
|
+
| `table` | `sheet_name`, `sheet_index`, `table_name`, `is_named_table`, `header_row`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `chunk_index`, `is_split`, `split_part` |
|
|
287
|
+
| `sheet` | `sheet_name`, `sheet_index`, `row_count`, `col_count`, `header_row`, `has_named_tables`, `named_tables`, `chunk_index`, `is_split`, `split_part` |
|
|
288
|
+
| `sliding_window` | `sheet_name`, `sheet_index`, `window_size`, `overlap`, `actual_row_count`, `window_index`, `start_row`, `end_row`, `header_row`, `col_count`, `chunk_index` |
|
|
289
|
+
| `page_aware` | `sheet_name`, `sheet_index`, `has_print_area`, `print_area_ref`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `header_row`, `region_index`, `chunk_index`, `is_split`, `split_part` |
|
|
290
|
+
| `semantic` | `sheet_name`, `sheet_index`, `category_column`, `category_value`, `used_fallback`, `low_grouping_quality`, `avg_group_size`, `start_row`, `end_row`, `actual_row_count`, `header_row`, `col_count`, `group_index`, `chunk_index` |
|
|
291
|
+
|
|
292
|
+
> **Streaming memory profile**: `row` and `sliding_window` pre-parse all sheet data once (calamine reads the entire file on open — there is no incremental I/O at the format level), then build and yield one chunk per `__next__`. The other four modes require global sheet analysis before the first chunk can be emitted, so they materialise all chunks at construction time and drain them lazily. In both cases the streaming iterator yields one chunk at a time.
|
|
293
|
+
|
|
294
|
+
> **Header detection**: the first all-string row in each sheet is automatically detected as the header row and excluded from chunk content. Columns without a header label are named `Column 1`, `Column 2`, etc.
|
|
295
|
+
|
|
296
|
+
---
|
|
297
|
+
|
|
198
298
|
## Streaming
|
|
199
299
|
|
|
200
300
|
### When to use streaming
|
|
@@ -214,6 +314,7 @@ Use `stream_chunks` (or the `stream_chunks_from_*` variants) when:
|
|
|
214
314
|
| **TXT** | All 7 | Same as Markdown | Pure Rust, no threads. |
|
|
215
315
|
| **DOCX** | All 7 | `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode Rust iterators for the other 5 | Full document parsed once upfront; chunks emitted lazily. Peak memory ≈ file size + chunk vec. Output equals `get_chunks` for every mode. |
|
|
216
316
|
| **PPTX** | All 7 | Batch-drain | PPTX requires the full ZIP up front, so chunks are computed once at construction and yielded one per `__next__`. |
|
|
317
|
+
| **XLSX / XLS** | All 6 | State machine for `row` / `sliding_window`; batch-drain for `table` / `sheet` / `page_aware` / `semantic` | calamine reads the full file on open (no incremental I/O at format level). `row` and `sliding_window` build one chunk per `__next__` from pre-parsed row data. The other four modes require global analysis first and materialise all chunks at iterator construction. Output is identical to `chunk_xlsx` for every mode. |
|
|
217
318
|
|
|
218
319
|
> **Parity guarantee**: streaming output equals `list(get_chunks(...))` for every format and every supported mode (this is exercised by `test_pdf_streaming.py` for PDF and by the tests in `py_chunks/tests/test_source_apis.py`).
|
|
219
320
|
|
|
@@ -246,6 +347,19 @@ for chunk in stream_chunks("document.docx", mode="page_aware", paragraphs_per_
|
|
|
246
347
|
for chunk in stream_chunks("deck.pptx", mode="semantic"):
|
|
247
348
|
...
|
|
248
349
|
|
|
350
|
+
# XLSX / XLS — all 6 modes
|
|
351
|
+
for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=10):
|
|
352
|
+
embed_and_index(chunk)
|
|
353
|
+
|
|
354
|
+
for chunk in stream_chunks("report.xls", mode="sliding_window", window_size=5, overlap=2):
|
|
355
|
+
process(chunk)
|
|
356
|
+
|
|
357
|
+
for chunk in stream_chunks("data.xlsx", mode="table", max_chunk_chars=3000):
|
|
358
|
+
store_in_db(chunk)
|
|
359
|
+
|
|
360
|
+
for chunk in stream_chunks("data.xlsx", mode="semantic", rows_per_chunk=20):
|
|
361
|
+
handle(chunk)
|
|
362
|
+
|
|
249
363
|
# From bytes (e.g. FastAPI body)
|
|
250
364
|
for chunk in stream_chunks(request_body, filename="report.pdf", mode="semantic"):
|
|
251
365
|
process(chunk)
|
|
@@ -298,8 +412,11 @@ Or use the explicit source-specific helpers:
|
|
|
298
412
|
| Markdown | `.md` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
299
413
|
| HTML | `.html`, `.htm` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
300
414
|
| Plain Text | `.txt` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
415
|
+
| Excel | `.xlsx`, `.xls` | All 6 | All 6 (`row` / `sliding_window` state machine; others batch-drain) |
|
|
416
|
+
|
|
417
|
+
The 7 document modes are: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`.
|
|
301
418
|
|
|
302
|
-
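The 7 modes are: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`.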
|
|
419
|
+
The 6 spreadsheet modes are: `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`.
|
|
303
420
|
|
|
304
421
|
---
|
|
305
422
|
|
|
@@ -378,6 +495,7 @@ from py_chunks.chunkers.pptx import chunk_pptx, stream_chunk_pptx, chunk_pptx_wi
|
|
|
378
495
|
from py_chunks.chunkers.html import chunk_html, stream_chunk_html
|
|
379
496
|
from py_chunks.chunkers.md import chunk_md, stream_chunk_md
|
|
380
497
|
from py_chunks.chunkers.txt import chunk_txt, stream_chunk_txt
|
|
498
|
+
from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx # handles both .xlsx and .xls
|
|
381
499
|
|
|
382
500
|
# Batch with timing
|
|
383
501
|
chunks, timing = chunk_pdf("file.pdf", mode="section")
|
|
@@ -401,6 +519,19 @@ for chunk in stream_chunk_md("book.md", mode="sentence", sentences_per_chunk=2):
|
|
|
401
519
|
for chunk in stream_chunk_html("page.html", mode="section"): ...
|
|
402
520
|
for chunk in stream_chunk_txt("log.txt", mode="page_aware", paragraphs_per_page=20): ...
|
|
403
521
|
for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
|
|
522
|
+
|
|
523
|
+
# XLSX / XLS — all 6 modes, batch and streaming
|
|
524
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
|
|
525
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
|
|
526
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
|
|
527
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
|
|
528
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
|
|
529
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
|
|
530
|
+
chunks, timing = chunk_xlsx("legacy.xls", mode="row", rows_per_chunk=5) # XLS works identically
|
|
531
|
+
|
|
532
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=10): ...
|
|
533
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1): ...
|
|
534
|
+
for chunk in stream_chunk_xlsx("legacy.xls", mode="semantic", rows_per_chunk=20): ...
|
|
404
535
|
```
|
|
405
536
|
|
|
406
537
|
---
|
|
@@ -433,7 +564,13 @@ for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
|
|
|
433
564
|
| `semantic` | Heuristic topic-continuity group (`semantic` mode) |
|
|
434
565
|
| `sliding_window` | Fixed-size overlapping window (`sliding_window` mode) |
|
|
435
566
|
| `sentence` | Sentence-count group (`sentence` mode) |
|
|
436
|
-
| `page_aware` | Page boundary group (`page_aware` mode) |
|
|
567
|
+
| `page_aware` | Page boundary group (`page_aware` mode for document formats) |
|
|
568
|
+
| `row_document` | XLSX/XLS: N consecutive data rows (`row` mode) |
|
|
569
|
+
| `table_region` | XLSX/XLS: named table or heuristic data region (`table` mode) |
|
|
570
|
+
| `sheet` | XLSX/XLS: full sheet or split part (`sheet` mode) |
|
|
571
|
+
| `row_window` | XLSX/XLS: overlapping row window (`sliding_window` mode) |
|
|
572
|
+
| `sheet_region` | XLSX/XLS: print area or full sheet (`page_aware` mode) |
|
|
573
|
+
| `semantic_group` | XLSX/XLS: category-grouped rows or fallback fixed-size group (`semantic` mode) |
|
|
437
574
|
|
|
438
575
|
### Metadata fields by mode
|
|
439
576
|
|
|
@@ -613,7 +750,8 @@ def process_document(file_path: str):
|
|
|
613
750
|
│ Format Dispatcher │
|
|
614
751
|
│ (py_chunks/chunkers/*.py) │
|
|
615
752
|
│ chunk_pdf / chunk_docx / chunk_pptx / │
|
|
616
|
-
│ chunk_md / chunk_html / chunk_txt
|
|
753
|
+
│ chunk_md / chunk_html / chunk_txt / │
|
|
754
|
+
│ chunk_xlsx (handles .xlsx + .xls) + │
|
|
617
755
|
│ matching stream_chunk_* variants │
|
|
618
756
|
└──────────────┬───────────────────────────────┘
|
|
619
757
|
│ validates args, dispatches to the right Rust function,
|
|
@@ -639,6 +777,11 @@ def process_document(file_path: str):
|
|
|
639
777
|
│ DOCX stream — DocxStructuralIterator (default/structural) + │
|
|
640
778
|
│ per-mode iterators for all other 5 modes │
|
|
641
779
|
│ PPTX stream — batch-drain (ZIP must be read upfront) │
|
|
780
|
+
│ XLSX/XLS — open_workbook_auto() handles both formats; │
|
|
781
|
+
│ row / sliding_window: state machine, one chunk per __next__ │
|
|
782
|
+
│ table / sheet / page_aware / semantic: batch-drain │
|
|
783
|
+
│ table mode: ZIP XML for named tables (XLSX) or heuristic │
|
|
784
|
+
│ page_aware: print-area XML (XLSX) or full-sheet fallback │
|
|
642
785
|
└──────────────────────────────────────────────────────────────────┘
|
|
643
786
|
```
|
|
644
787
|
|
|
@@ -667,7 +810,7 @@ except FileNotFoundError as e:
|
|
|
667
810
|
try:
|
|
668
811
|
chunks = get_chunks("image.png")
|
|
669
812
|
except ValueError as e:
|
|
670
|
-
print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt
|
|
813
|
+
print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt, .xls, .xlsx
|
|
671
814
|
|
|
672
815
|
# Scanned / image-only PDF (no text layer)
|
|
673
816
|
try:
|
|
@@ -742,3 +885,4 @@ MIT
|
|
|
742
885
|
---
|
|
743
886
|
|
|
744
887
|
Built with Rust (performance) + Python (simplicity)
|
|
888
|
+
|
|
@@ -1,32 +1,20 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: py-chunks
|
|
3
|
-
Version: 0.2.4
|
|
4
|
-
Classifier: Programming Language :: Python :: 3
|
|
5
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
6
|
-
Classifier: Operating System :: OS Independent
|
|
7
|
-
Requires-Dist: pypdfium2
|
|
8
|
-
License-File: LICENSE
|
|
9
|
-
Summary: Rust-backed Python chunking library
|
|
10
|
-
Keywords: chunking,document,pdf,docx,rust
|
|
11
|
-
License: MIT
|
|
12
|
-
Requires-Python: >=3.9
|
|
13
|
-
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
14
|
-
|
|
15
1
|
# py-chunks
|
|
16
2
|
|
|
17
3
|
[](https://www.python.org/downloads/) [](LICENSE)
|
|
18
4
|
|
|
19
|
-
Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, and HTML files — optimised for production use.
|
|
5
|
+
Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, HTML, XLSX, and XLS files — optimised for production use.
|
|
20
6
|
|
|
21
7
|
## Features
|
|
22
8
|
|
|
23
|
-
- **6 Document Formats**: PDF, DOCX, PPTX, Markdown, HTML, TXT
|
|
24
|
-
- **7 Chunking Modes**: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`
|
|
9
|
+
- **8 Document Formats**: PDF, DOCX, PPTX, Markdown, HTML, TXT, XLSX, XLS
|
|
10
|
+
- **7 Chunking Modes for document formats**: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`
|
|
11
|
+
- **6 Chunking Modes for spreadsheet formats** (XLSX / XLS): `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`
|
|
25
12
|
- **Streaming for every format** via a single `stream_chunks()` entry point
|
|
26
13
|
- PDF: background Rust thread + `mpsc` channel (all 7 modes, true one-chunk-at-a-time)
|
|
27
14
|
- Markdown / HTML / TXT: block-by-block state machine for `structural` + `semantic`; batch-drain for the rest
|
|
28
15
|
- DOCX: all 7 modes — `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode iterators for the remaining 5 modes (lazy chunk emission after a single upfront parse)
|
|
29
16
|
- PPTX: batch-drain (ZIP must be read upfront, then chunks are yielded one at a time)
|
|
17
|
+
- XLSX / XLS: `row` and `sliding_window` use true state machines (one chunk per `__next__`, O(parsed_rows) memory); `table`, `sheet`, `page_aware`, and `semantic` use batch-drain (global sheet analysis required before first chunk)
|
|
30
18
|
- **Multiple Input Sources**: local file paths, raw `bytes` / `bytearray` / `memoryview`, file-like objects (`BytesIO`, open files), FastAPI / Starlette `UploadFile`, HTTP(S) / S3 pre-signed URLs
|
|
31
19
|
- **Consistent Output Schema**: every chunk is a `dict` with `content`, `content_type`, and `metadata` keys
|
|
32
20
|
- **Zero Python runtime dependencies**: all parsing happens in the Rust extension; the PDFium native binary is bundled inside the wheel
|
|
@@ -64,6 +52,8 @@ chunks = get_chunks("notes.md", mode="semantic")
|
|
|
64
52
|
chunks = get_chunks("page.html", mode="section")
|
|
65
53
|
chunks = get_chunks("deck.pptx", mode="sliding_window", window_size=3, overlap=1)
|
|
66
54
|
chunks = get_chunks("report.docx", mode="sentence", sentences_per_chunk=3)
|
|
55
|
+
chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
|
|
56
|
+
chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
|
|
67
57
|
|
|
68
58
|
for chunk in chunks:
|
|
69
59
|
print(chunk["content"])
|
|
@@ -209,6 +199,86 @@ These three formats also support **streaming in every mode** — see the Streami
|
|
|
209
199
|
|
|
210
200
|
---
|
|
211
201
|
|
|
202
|
+
### XLSX / XLS modes
|
|
203
|
+
|
|
204
|
+
Both `.xlsx` and `.xls` files are handled by the same chunker. All 6 modes are available for batch and streaming:
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
from py_chunks import get_chunks, stream_chunks
|
|
208
|
+
from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx
|
|
209
|
+
|
|
210
|
+
# Batch — via unified API
|
|
211
|
+
chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
|
|
212
|
+
chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
|
|
213
|
+
|
|
214
|
+
# Batch — via format-specific chunker (returns chunks + timing)
|
|
215
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
|
|
216
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
|
|
217
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
|
|
218
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
|
|
219
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
|
|
220
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
|
|
221
|
+
|
|
222
|
+
# Filter to specific sheets
|
|
223
|
+
chunks, _ = chunk_xlsx("data.xlsx", mode="row", sheet_names=["Sales", "Q4"])
|
|
224
|
+
|
|
225
|
+
# Streaming — identical output to batch
|
|
226
|
+
for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=5):
|
|
227
|
+
print(chunk["content"])
|
|
228
|
+
|
|
229
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1):
|
|
230
|
+
embed_and_store(chunk)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
| Mode | `content_type` | Description |
|
|
234
|
+
|---|---|---|
|
|
235
|
+
| `row` | `row_document` | Groups N consecutive data rows into one chunk. Header row is auto-detected and excluded from content. Param: `rows_per_chunk` (default 1). |
|
|
236
|
+
| `table` | `table_region` | Named Excel tables (XLSX only) or heuristic contiguous data regions per sheet. For XLS and sheets without named tables, falls back to bounding-box detection. Param: `max_chunk_chars`. |
|
|
237
|
+
| `sheet` | `sheet` | One chunk per sheet (split by `max_chunk_chars` if needed). Includes named-table metadata. Param: `max_chunk_chars`. |
|
|
238
|
+
| `sliding_window` | `row_window` | Overlapping windows of N rows. Params: `window_size` (default 3), `overlap` (default 1, must be `< window_size`). |
|
|
239
|
+
| `page_aware` | `sheet_region` | Chunks by Excel print areas (XLSX only); falls back to the full sheet when no print area is defined. For XLS, always uses the full-sheet fallback. Param: `max_chunk_chars`. |
|
|
240
|
+
| `semantic` | `semantic_group` | Detects the column with the lowest cardinality of string values, sorts by it, and groups rows sharing the same category value. Falls back to fixed-size chunking when no suitable column is found. Param: `rows_per_chunk` (used for the fallback). |
|
|
241
|
+
|
|
242
|
+
**Parameters accepted by `chunk_xlsx` and `stream_chunk_xlsx`:**
|
|
243
|
+
|
|
244
|
+
| Parameter | Type | Default | Description |
|
|
245
|
+
|---|---|---|---|
|
|
246
|
+
| `file_path` | str | — | Path to `.xlsx` or `.xls` file |
|
|
247
|
+
| `mode` | str | `"row"` | One of the 6 modes above |
|
|
248
|
+
| `rows_per_chunk` | int | 1 | Rows per chunk (`row` mode and `semantic` fallback). Must be `> 0`. |
|
|
249
|
+
| `window_size` | int | 3 | Window size in rows (`sliding_window` mode). Must be `>= 1` and greater than `overlap`. |
|
|
250
|
+
| `overlap` | int | 1 | Overlapping rows between windows. Must be `< window_size`. |
|
|
251
|
+
| `include_headers` | bool | True | Prefix each row value with its column header (`key: value` format). |
|
|
252
|
+
| `sheet_names` | list[str] \| None | None | Process only the named sheets; processes all sheets when `None` or `[]`. |
|
|
253
|
+
| `skip_empty_rows` | bool | True | Skip rows where every cell is empty. |
|
|
254
|
+
| `max_chunk_chars` | int | 2000 | Character limit per chunk (`table`, `sheet`, `page_aware` modes). |
|
|
255
|
+
|
|
256
|
+
**XLS vs XLSX differences:**
|
|
257
|
+
|
|
258
|
+
| Feature | XLSX | XLS |
|
|
259
|
+
|---|---|---|
|
|
260
|
+
| Named table detection (`table` mode) | ZIP XML (`xl/tables/table*.xml`) — full named-table metadata | Not available — heuristic bounding-box only; `is_named_table` is always `false` |
|
|
261
|
+
| Print area detection (`page_aware` mode) | Parsed from `xl/workbook.xml` | Not available — always uses full-sheet fallback; `has_print_area` is always `false` |
|
|
262
|
+
| Named table metadata in `sheet` mode | `has_named_tables: true/false`, `named_tables: [...]` | Always `has_named_tables: false`, `named_tables: []` |
|
|
263
|
+
| All other modes | Identical | Identical |
|
|
264
|
+
|
|
265
|
+
**XLSX / XLS metadata fields by mode:**
|
|
266
|
+
|
|
267
|
+
| Mode | Notable metadata keys |
|
|
268
|
+
|---|---|
|
|
269
|
+
| `row` | `sheet_name`, `sheet_index`, `row_index`, `header_row`, `col_count`, `rows_per_chunk`, `actual_row_count`, `chunk_index` |
|
|
270
|
+
| `table` | `sheet_name`, `sheet_index`, `table_name`, `is_named_table`, `header_row`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `chunk_index`, `is_split`, `split_part` |
|
|
271
|
+
| `sheet` | `sheet_name`, `sheet_index`, `row_count`, `col_count`, `header_row`, `has_named_tables`, `named_tables`, `chunk_index`, `is_split`, `split_part` |
|
|
272
|
+
| `sliding_window` | `sheet_name`, `sheet_index`, `window_size`, `overlap`, `actual_row_count`, `window_index`, `start_row`, `end_row`, `header_row`, `col_count`, `chunk_index` |
|
|
273
|
+
| `page_aware` | `sheet_name`, `sheet_index`, `has_print_area`, `print_area_ref`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `header_row`, `region_index`, `chunk_index`, `is_split`, `split_part` |
|
|
274
|
+
| `semantic` | `sheet_name`, `sheet_index`, `category_column`, `category_value`, `used_fallback`, `low_grouping_quality`, `avg_group_size`, `start_row`, `end_row`, `actual_row_count`, `header_row`, `col_count`, `group_index`, `chunk_index` |
|
|
275
|
+
|
|
276
|
+
> **Streaming memory profile**: `row` and `sliding_window` pre-parse all sheet data once (calamine reads the entire file on open — there is no incremental I/O at the format level), then build and yield one chunk per `__next__`. The other four modes require global sheet analysis before the first chunk can be emitted, so they materialise all chunks at construction time and drain them lazily. In both cases the streaming iterator yields one chunk at a time.
|
|
277
|
+
|
|
278
|
+
> **Header detection**: the first all-string row in each sheet is automatically detected as the header row and is not emitted as a data row in chunk content (with `include_headers=True`, its labels are still used as the `key: value` prefixes). Columns without a header label are named `Column 1`, `Column 2`, etc.
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
212
282
|
## Streaming
|
|
213
283
|
|
|
214
284
|
### When to use streaming
|
|
@@ -228,6 +298,7 @@ Use `stream_chunks` (or the `stream_chunks_from_*` variants) when:
|
|
|
228
298
|
| **TXT** | All 7 | Same as Markdown | Pure Rust, no threads. |
|
|
229
299
|
| **DOCX** | All 7 | `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode Rust iterators for the other 5 | Full document parsed once upfront; chunks emitted lazily. Peak memory ≈ file size + chunk vec. Output equals `get_chunks` for every mode. |
|
|
230
300
|
| **PPTX** | All 7 | Batch-drain | PPTX requires the full ZIP up front, so chunks are computed once at construction and yielded one per `__next__`. |
|
|
301
|
+
| **XLSX / XLS** | All 6 | State machine for `row` / `sliding_window`; batch-drain for `table` / `sheet` / `page_aware` / `semantic` | calamine reads the full file on open (no incremental I/O at format level). `row` and `sliding_window` build one chunk per `__next__` from pre-parsed row data. The other four modes require global analysis first and materialise all chunks at iterator construction. The yielded chunks are identical to the chunks returned by `chunk_xlsx` for every mode. |
|
|
231
302
|
|
|
232
303
|
> **Parity guarantee**: `list(stream_chunks(...))` equals `list(get_chunks(...))` for every format and every supported mode (this is exercised by `test_pdf_streaming.py` for PDF and by the tests in `py_chunks/tests/test_source_apis.py`).
|
|
233
304
|
|
|
@@ -260,6 +331,19 @@ for chunk in stream_chunks("document.docx", mode="page_aware", paragraphs_per_
|
|
|
260
331
|
for chunk in stream_chunks("deck.pptx", mode="semantic"):
|
|
261
332
|
...
|
|
262
333
|
|
|
334
|
+
# XLSX / XLS — all 6 modes
|
|
335
|
+
for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=10):
|
|
336
|
+
embed_and_index(chunk)
|
|
337
|
+
|
|
338
|
+
for chunk in stream_chunks("report.xls", mode="sliding_window", window_size=5, overlap=2):
|
|
339
|
+
process(chunk)
|
|
340
|
+
|
|
341
|
+
for chunk in stream_chunks("data.xlsx", mode="table", max_chunk_chars=3000):
|
|
342
|
+
store_in_db(chunk)
|
|
343
|
+
|
|
344
|
+
for chunk in stream_chunks("data.xlsx", mode="semantic", rows_per_chunk=20):
|
|
345
|
+
handle(chunk)
|
|
346
|
+
|
|
263
347
|
# From bytes (e.g. FastAPI body)
|
|
264
348
|
for chunk in stream_chunks(request_body, filename="report.pdf", mode="semantic"):
|
|
265
349
|
process(chunk)
|
|
@@ -312,8 +396,11 @@ Or use the explicit source-specific helpers:
|
|
|
312
396
|
| Markdown | `.md` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
313
397
|
| HTML | `.html`, `.htm` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
314
398
|
| Plain Text | `.txt` | All 7 | All 7 (state machine for `structural` / `semantic`) |
|
|
399
|
+
| Excel | `.xlsx`, `.xls` | All 6 | All 6 (`row` / `sliding_window` state machine; others batch-drain) |
|
|
400
|
+
|
|
401
|
+
The 7 document modes are: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`.
|
|
315
402
|
|
|
316
|
-
The
|
|
403
|
+
The 6 spreadsheet modes are: `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`.
|
|
317
404
|
|
|
318
405
|
---
|
|
319
406
|
|
|
@@ -392,6 +479,7 @@ from py_chunks.chunkers.pptx import chunk_pptx, stream_chunk_pptx, chunk_pptx_wi
|
|
|
392
479
|
from py_chunks.chunkers.html import chunk_html, stream_chunk_html
|
|
393
480
|
from py_chunks.chunkers.md import chunk_md, stream_chunk_md
|
|
394
481
|
from py_chunks.chunkers.txt import chunk_txt, stream_chunk_txt
|
|
482
|
+
from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx # handles both .xlsx and .xls
|
|
395
483
|
|
|
396
484
|
# Batch with timing
|
|
397
485
|
chunks, timing = chunk_pdf("file.pdf", mode="section")
|
|
@@ -415,6 +503,19 @@ for chunk in stream_chunk_md("book.md", mode="sentence", sentences_per_chunk=2):
|
|
|
415
503
|
for chunk in stream_chunk_html("page.html", mode="section"): ...
|
|
416
504
|
for chunk in stream_chunk_txt("log.txt", mode="page_aware", paragraphs_per_page=20): ...
|
|
417
505
|
for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
|
|
506
|
+
|
|
507
|
+
# XLSX / XLS — all 6 modes, batch and streaming
|
|
508
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
|
|
509
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
|
|
510
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
|
|
511
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
|
|
512
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
|
|
513
|
+
chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
|
|
514
|
+
chunks, timing = chunk_xlsx("legacy.xls", mode="row", rows_per_chunk=5) # XLS works identically
|
|
515
|
+
|
|
516
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=10): ...
|
|
517
|
+
for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1): ...
|
|
518
|
+
for chunk in stream_chunk_xlsx("legacy.xls", mode="semantic", rows_per_chunk=20): ...
|
|
418
519
|
```
|
|
419
520
|
|
|
420
521
|
---
|
|
@@ -447,7 +548,13 @@ for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
|
|
|
447
548
|
| `semantic` | Heuristic topic-continuity group (`semantic` mode) |
|
|
448
549
|
| `sliding_window` | Fixed-size overlapping window (`sliding_window` mode) |
|
|
449
550
|
| `sentence` | Sentence-count group (`sentence` mode) |
|
|
450
|
-
| `page_aware` | Page boundary group (`page_aware` mode) |
|
|
551
|
+
| `page_aware` | Page boundary group (`page_aware` mode for document formats) |
|
|
552
|
+
| `row_document` | XLSX/XLS: N consecutive data rows (`row` mode) |
|
|
553
|
+
| `table_region` | XLSX/XLS: named table or heuristic data region (`table` mode) |
|
|
554
|
+
| `sheet` | XLSX/XLS: full sheet or split part (`sheet` mode) |
|
|
555
|
+
| `row_window` | XLSX/XLS: overlapping row window (`sliding_window` mode) |
|
|
556
|
+
| `sheet_region` | XLSX/XLS: print area or full sheet (`page_aware` mode) |
|
|
557
|
+
| `semantic_group` | XLSX/XLS: category-grouped rows or fallback fixed-size group (`semantic` mode) |
|
|
451
558
|
|
|
452
559
|
### Metadata fields by mode
|
|
453
560
|
|
|
@@ -627,7 +734,8 @@ def process_document(file_path: str):
|
|
|
627
734
|
│ Format Dispatcher │
|
|
628
735
|
│ (py_chunks/chunkers/*.py) │
|
|
629
736
|
│ chunk_pdf / chunk_docx / chunk_pptx / │
|
|
630
|
-
│ chunk_md / chunk_html / chunk_txt
|
|
737
|
+
│ chunk_md / chunk_html / chunk_txt / │
|
|
738
|
+
│ chunk_xlsx (handles .xlsx + .xls) + │
|
|
631
739
|
│ matching stream_chunk_* variants │
|
|
632
740
|
└──────────────┬───────────────────────────────┘
|
|
633
741
|
│ validates args, dispatches to the right Rust function,
|
|
@@ -653,6 +761,11 @@ def process_document(file_path: str):
|
|
|
653
761
|
│ DOCX stream — DocxStructuralIterator (default/structural) + │
|
|
654
762
|
│ per-mode iterators for all other 5 modes │
|
|
655
763
|
│ PPTX stream — batch-drain (ZIP must be read upfront) │
|
|
764
|
+
│ XLSX/XLS — open_workbook_auto() handles both formats; │
|
|
765
|
+
│ row / sliding_window: state machine, one chunk per __next__ │
|
|
766
|
+
│ table / sheet / page_aware / semantic: batch-drain │
|
|
767
|
+
│ table mode: ZIP XML for named tables (XLSX) or heuristic │
|
|
768
|
+
│ page_aware: print-area XML (XLSX) or full-sheet fallback │
|
|
656
769
|
└──────────────────────────────────────────────────────────────────┘
|
|
657
770
|
```
|
|
658
771
|
|
|
@@ -681,7 +794,7 @@ except FileNotFoundError as e:
|
|
|
681
794
|
try:
|
|
682
795
|
chunks = get_chunks("image.png")
|
|
683
796
|
except ValueError as e:
|
|
684
|
-
print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt
|
|
797
|
+
print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt, .xls, .xlsx
|
|
685
798
|
|
|
686
799
|
# Scanned / image-only PDF (no text layer)
|
|
687
800
|
try:
|
|
@@ -756,4 +869,3 @@ MIT
|
|
|
756
869
|
---
|
|
757
870
|
|
|
758
871
|
Built with Rust (performance) + Python (simplicity)
|
|
759
|
-
|