py-chunks 0.2.3__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. {py_chunks-0.2.3 → py_chunks-0.3.0}/Cargo.lock +47 -1
  2. {py_chunks-0.2.3 → py_chunks-0.3.0}/Cargo.toml +1 -0
  3. py_chunks-0.2.3/README.md → py_chunks-0.3.0/PKG-INFO +151 -7
  4. py_chunks-0.2.3/PKG-INFO → py_chunks-0.3.0/README.md +134 -21
  5. {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/__init__.py +53 -5
  6. {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/__init__.py +3 -0
  7. py_chunks-0.3.0/py_chunks/chunkers/xlsx.py +158 -0
  8. {py_chunks-0.2.3 → py_chunks-0.3.0}/pyproject.toml +6 -3
  9. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/mod.rs +1 -0
  10. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/semantic.rs +1 -1
  11. py_chunks-0.3.0/src/extensions/xlsx/common.rs +511 -0
  12. py_chunks-0.3.0/src/extensions/xlsx/mod.rs +19 -0
  13. py_chunks-0.3.0/src/extensions/xlsx/page_aware.rs +432 -0
  14. py_chunks-0.3.0/src/extensions/xlsx/row_document.rs +68 -0
  15. py_chunks-0.3.0/src/extensions/xlsx/semantic.rs +332 -0
  16. py_chunks-0.3.0/src/extensions/xlsx/sheet.rs +195 -0
  17. py_chunks-0.3.0/src/extensions/xlsx/sliding_window.rs +221 -0
  18. py_chunks-0.3.0/src/extensions/xlsx/stream_iter.rs +503 -0
  19. py_chunks-0.3.0/src/extensions/xlsx/table_region.rs +506 -0
  20. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/lib.rs +1 -0
  21. {py_chunks-0.2.3 → py_chunks-0.3.0}/.github/workflows/release.yml +0 -0
  22. {py_chunks-0.2.3 → py_chunks-0.3.0}/.gitignore +0 -0
  23. {py_chunks-0.2.3 → py_chunks-0.3.0}/.pylintrc +0 -0
  24. {py_chunks-0.2.3 → py_chunks-0.3.0}/LICENSE +0 -0
  25. {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/docx.py +0 -0
  26. {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/html.py +0 -0
  27. {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/md.py +0 -0
  28. {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/pdf.py +0 -0
  29. {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/pptx.py +0 -0
  30. {py_chunks-0.2.3 → py_chunks-0.3.0}/py_chunks/chunkers/txt.py +0 -0
  31. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/common.rs +0 -0
  32. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/mod.rs +0 -0
  33. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/page_aware.rs +0 -0
  34. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/section.rs +0 -0
  35. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/semantic.rs +0 -0
  36. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/sentence.rs +0 -0
  37. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/sliding_window.rs +0 -0
  38. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/docx/structural.rs +0 -0
  39. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/common.rs +0 -0
  40. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/mod.rs +0 -0
  41. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/page_aware.rs +0 -0
  42. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/section.rs +0 -0
  43. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/semantic.rs +0 -0
  44. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/sentence.rs +0 -0
  45. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/sliding_window.rs +0 -0
  46. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/stream_iter.rs +0 -0
  47. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/html/structural.rs +0 -0
  48. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/common.rs +0 -0
  49. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/mod.rs +0 -0
  50. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/page_aware.rs +0 -0
  51. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/section.rs +0 -0
  52. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/semantic.rs +0 -0
  53. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/sentence.rs +0 -0
  54. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/sliding_window.rs +0 -0
  55. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/stream_iter.rs +0 -0
  56. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/md/structural.rs +0 -0
  57. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pdf/common.rs +0 -0
  58. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pdf/mod.rs +0 -0
  59. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pdf/stream_iter.rs +0 -0
  60. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pdf/structural.rs +0 -0
  61. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/common.rs +0 -0
  62. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/mod.rs +0 -0
  63. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/page_aware.rs +0 -0
  64. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/section.rs +0 -0
  65. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/sentence.rs +0 -0
  66. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/sliding_window.rs +0 -0
  67. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/stream_iter.rs +0 -0
  68. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/pptx/structural.rs +0 -0
  69. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/shared.rs +0 -0
  70. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/common.rs +0 -0
  71. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/mod.rs +0 -0
  72. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/page_aware.rs +0 -0
  73. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/section.rs +0 -0
  74. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/semantic.rs +0 -0
  75. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/sentence.rs +0 -0
  76. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/sliding_window.rs +0 -0
  77. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/stream_iter.rs +0 -0
  78. {py_chunks-0.2.3 → py_chunks-0.3.0}/src/extensions/txt/structural.rs +0 -0
@@ -107,6 +107,22 @@ dependencies = [
107
107
  "pkg-config",
108
108
  ]
109
109
 
110
+ [[package]]
111
+ name = "calamine"
112
+ version = "0.26.1"
113
+ source = "registry+https://github.com/rust-lang/crates.io-index"
114
+ checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1"
115
+ dependencies = [
116
+ "byteorder",
117
+ "chrono",
118
+ "codepage",
119
+ "encoding_rs",
120
+ "log",
121
+ "quick-xml 0.31.0",
122
+ "serde",
123
+ "zip",
124
+ ]
125
+
110
126
  [[package]]
111
127
  name = "cc"
112
128
  version = "1.2.61"
@@ -134,6 +150,7 @@ dependencies = [
134
150
  "iana-time-zone",
135
151
  "js-sys",
136
152
  "num-traits",
153
+ "serde",
137
154
  "wasm-bindgen",
138
155
  "windows-link",
139
156
  ]
@@ -148,6 +165,15 @@ dependencies = [
148
165
  "inout",
149
166
  ]
150
167
 
168
+ [[package]]
169
+ name = "codepage"
170
+ version = "0.1.2"
171
+ source = "registry+https://github.com/rust-lang/crates.io-index"
172
+ checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
173
+ dependencies = [
174
+ "encoding_rs",
175
+ ]
176
+
151
177
  [[package]]
152
178
  name = "console_error_panic_hook"
153
179
  version = "0.1.7"
@@ -283,6 +309,15 @@ version = "1.15.0"
283
309
  source = "registry+https://github.com/rust-lang/crates.io-index"
284
310
  checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
285
311
 
312
+ [[package]]
313
+ name = "encoding_rs"
314
+ version = "0.8.35"
315
+ source = "registry+https://github.com/rust-lang/crates.io-index"
316
+ checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
317
+ dependencies = [
318
+ "cfg-if",
319
+ ]
320
+
286
321
  [[package]]
287
322
  name = "equivalent"
288
323
  version = "1.0.2"
@@ -667,10 +702,11 @@ checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f"
667
702
  name = "py_chunks"
668
703
  version = "0.1.6"
669
704
  dependencies = [
705
+ "calamine",
670
706
  "pdfium-render",
671
707
  "pyo3",
672
708
  "pythonize",
673
- "quick-xml",
709
+ "quick-xml 0.38.4",
674
710
  "serde_json",
675
711
  "zip",
676
712
  ]
@@ -748,6 +784,16 @@ dependencies = [
748
784
  "serde",
749
785
  ]
750
786
 
787
+ [[package]]
788
+ name = "quick-xml"
789
+ version = "0.31.0"
790
+ source = "registry+https://github.com/rust-lang/crates.io-index"
791
+ checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
792
+ dependencies = [
793
+ "encoding_rs",
794
+ "memchr",
795
+ ]
796
+
751
797
  [[package]]
752
798
  name = "quick-xml"
753
799
  version = "0.38.4"
@@ -18,3 +18,4 @@ quick-xml = "0.38"
18
18
  serde_json = "1"
19
19
  zip = "2"
20
20
  pdfium-render = { version = "0.8", features = ["sync"] }
21
+ calamine = { version = "0.26", features = ["dates"] }
@@ -1,18 +1,36 @@
1
+ Metadata-Version: 2.4
2
+ Name: py-chunks
3
+ Version: 0.3.0
4
+ Classifier: Programming Language :: Python :: 3
5
+ Classifier: License :: OSI Approved :: MIT License
6
+ Classifier: Operating System :: OS Independent
7
+ Requires-Dist: pypdfium2
8
+ Requires-Dist: openpyxl>=3.1 ; extra == 'dev'
9
+ Provides-Extra: dev
10
+ License-File: LICENSE
11
+ Summary: Rust-backed Python chunking library
12
+ Keywords: chunking,document,pdf,docx,xlsx,rust
13
+ License: MIT
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
16
+
1
17
  # py-chunks
2
18
 
3
19
  [![Python](https://img.shields.io/badge/python-3.9+-blue)](https://www.python.org/downloads/) [![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
4
20
 
5
- Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, and HTML files — optimised for production use.
21
+ Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, HTML, XLSX, and XLS files — optimised for production use.
6
22
 
7
23
  ## Features
8
24
 
9
- - **6 Document Formats**: PDF, DOCX, PPTX, Markdown, HTML, TXT
10
- - **7 Chunking Modes across every format**: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`
25
+ - **8 Supported Formats**: 6 document formats (PDF, DOCX, PPTX, Markdown, HTML, TXT) and 2 spreadsheet formats (XLSX, XLS)
26
+ - **7 Chunking Modes for document formats**: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`
27
+ - **6 Chunking Modes for spreadsheet formats** (XLSX / XLS): `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`
11
28
  - **Streaming for every format** via a single `stream_chunks()` entry point
12
29
  - PDF: background Rust thread + `mpsc` channel (all 7 modes, true one-chunk-at-a-time)
13
30
  - Markdown / HTML / TXT: block-by-block state machine for `structural` + `semantic`; batch-drain for the rest
14
31
  - DOCX: all 7 modes — `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode iterators for the remaining 5 modes (lazy chunk emission after a single upfront parse)
15
32
  - PPTX: batch-drain (ZIP must be read upfront, then chunks are yielded one at a time)
33
+ - XLSX / XLS: `row` and `sliding_window` use true state machines (one chunk per `__next__`, O(parsed_rows) memory); `table`, `sheet`, `page_aware`, and `semantic` use batch-drain (global sheet analysis required before first chunk)
16
34
  - **Multiple Input Sources**: local file paths, raw `bytes` / `bytearray` / `memoryview`, file-like objects (`BytesIO`, open files), FastAPI / Starlette `UploadFile`, HTTP(S) / S3 pre-signed URLs
17
35
  - **Consistent Output Schema**: every chunk is a `dict` with `content`, `content_type`, and `metadata` keys
18
36
  - **Minimal Python runtime dependencies**: all parsing happens in the Rust extension and the PDFium native binary is bundled inside the wheel; the only declared runtime requirement is `pypdfium2`
@@ -50,6 +68,8 @@ chunks = get_chunks("notes.md", mode="semantic")
50
68
  chunks = get_chunks("page.html", mode="section")
51
69
  chunks = get_chunks("deck.pptx", mode="sliding_window", window_size=3, overlap=1)
52
70
  chunks = get_chunks("report.docx", mode="sentence", sentences_per_chunk=3)
71
+ chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
72
+ chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
53
73
 
54
74
  for chunk in chunks:
55
75
  print(chunk["content"])
@@ -195,6 +215,86 @@ These three formats also support **streaming in every mode** — see the Streami
195
215
 
196
216
  ---
197
217
 
218
+ ### XLSX / XLS modes
219
+
220
+ Both `.xlsx` and `.xls` files are handled by the same chunker. All 6 modes are available for batch and streaming:
221
+
222
+ ```python
223
+ from py_chunks import get_chunks, stream_chunks
224
+ from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx
225
+
226
+ # Batch — via unified API
227
+ chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
228
+ chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
229
+
230
+ # Batch — via format-specific chunker (returns chunks + timing)
231
+ chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
232
+ chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
233
+ chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
234
+ chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
235
+ chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
236
+ chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
237
+
238
+ # Filter to specific sheets
239
+ chunks, _ = chunk_xlsx("data.xlsx", mode="row", sheet_names=["Sales", "Q4"])
240
+
241
+ # Streaming — identical output to batch
242
+ for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=5):
243
+ print(chunk["content"])
244
+
245
+ for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1):
246
+ embed_and_store(chunk)
247
+ ```
248
+
249
+ | Mode | `content_type` | Description |
250
+ |---|---|---|
251
+ | `row` | `row_document` | Groups N consecutive data rows into one chunk. Header row is auto-detected and excluded from content. Param: `rows_per_chunk` (default 1). |
252
+ | `table` | `table_region` | Named Excel tables (XLSX only) or heuristic contiguous data regions per sheet. For XLS and sheets without named tables, falls back to bounding-box detection. Param: `max_chunk_chars`. |
253
+ | `sheet` | `sheet` | One chunk per sheet (split by `max_chunk_chars` if needed). Includes named-table metadata. Param: `max_chunk_chars`. |
254
+ | `sliding_window` | `row_window` | Overlapping windows of N rows. Params: `window_size` (default 3), `overlap` (default 1, must be `< window_size`). |
255
+ | `page_aware` | `sheet_region` | Chunks by Excel print areas (XLSX only); falls back to the full sheet when no print area is defined. For XLS, always uses the full-sheet fallback. Param: `max_chunk_chars`. |
256
+ | `semantic` | `semantic_group` | Detects the column with the lowest cardinality of string values, sorts by it, and groups rows sharing the same category value. Falls back to fixed-size chunking when no suitable column is found. Param: `rows_per_chunk` (used for the fallback). |
257
+
258
+ **Parameters accepted by `chunk_xlsx` and `stream_chunk_xlsx`:**
259
+
260
+ | Parameter | Type | Default | Description |
261
+ |---|---|---|---|
262
+ | `file_path` | str | — | Path to `.xlsx` or `.xls` file |
263
+ | `mode` | str | `"row"` | One of the 6 modes above |
264
+ | `rows_per_chunk` | int | 1 | Rows per chunk (`row` mode and `semantic` fallback). Must be `> 0`. |
265
+ | `window_size` | int | 3 | Window size in rows (`sliding_window` mode). Must be `>= 1`. |
266
+ | `overlap` | int | 1 | Overlapping rows between windows. Must be `< window_size`. |
267
+ | `include_headers` | bool | True | Prefix each row value with its column header (`key: value` format). |
268
+ | `sheet_names` | list[str] \| None | None | Process only the named sheets; processes all sheets when `None` or `[]`. |
269
+ | `skip_empty_rows` | bool | True | Skip rows where every cell is empty. |
270
+ | `max_chunk_chars` | int | 2000 | Character limit per chunk (`table`, `sheet`, `page_aware` modes). |
271
+
272
+ **XLS vs XLSX differences:**
273
+
274
+ | Feature | XLSX | XLS |
275
+ |---|---|---|
276
+ | Named table detection (`table` mode) | ZIP XML (`table1.xml`) — full named-table metadata | Not available — heuristic bounding-box only; `is_named_table` is always `false` |
277
+ | Print area detection (`page_aware` mode) | Parsed from `xl/workbook.xml` | Not available — always uses full-sheet fallback; `has_print_area` is always `false` |
278
+ | Named table metadata in `sheet` mode | `has_named_tables: true/false`, `named_tables: [...]` | Always `has_named_tables: false`, `named_tables: []` |
279
+ | All other modes | Identical | Identical |
280
+
281
+ **XLSX / XLS metadata fields by mode:**
282
+
283
+ | Mode | Notable metadata keys |
284
+ |---|---|
285
+ | `row` | `sheet_name`, `sheet_index`, `row_index`, `header_row`, `col_count`, `rows_per_chunk`, `actual_row_count`, `chunk_index` |
286
+ | `table` | `sheet_name`, `sheet_index`, `table_name`, `is_named_table`, `header_row`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `chunk_index`, `is_split`, `split_part` |
287
+ | `sheet` | `sheet_name`, `sheet_index`, `row_count`, `col_count`, `header_row`, `has_named_tables`, `named_tables`, `chunk_index`, `is_split`, `split_part` |
288
+ | `sliding_window` | `sheet_name`, `sheet_index`, `window_size`, `overlap`, `actual_row_count`, `window_index`, `start_row`, `end_row`, `header_row`, `col_count`, `chunk_index` |
289
+ | `page_aware` | `sheet_name`, `sheet_index`, `has_print_area`, `print_area_ref`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `header_row`, `region_index`, `chunk_index`, `is_split`, `split_part` |
290
+ | `semantic` | `sheet_name`, `sheet_index`, `category_column`, `category_value`, `used_fallback`, `low_grouping_quality`, `avg_group_size`, `start_row`, `end_row`, `actual_row_count`, `header_row`, `col_count`, `group_index`, `chunk_index` |
291
+
292
+ > **Streaming memory profile**: `row` and `sliding_window` pre-parse all sheet data once (calamine reads the entire file on open — there is no incremental I/O at the format level), then build and yield one chunk per `__next__`. The other four modes require global sheet analysis before the first chunk can be emitted, so they materialise all chunks at construction time and drain them lazily. In both cases the streaming iterator yields one chunk at a time.
293
+
294
+ > **Header detection**: the first all-string row in each sheet is automatically detected as the header row and excluded from chunk content. Columns without a header label are named `Column 1`, `Column 2`, etc.
295
+
296
+ ---
297
+
198
298
  ## Streaming
199
299
 
200
300
  ### When to use streaming
@@ -214,6 +314,7 @@ Use `stream_chunks` (or the `stream_chunks_from_*` variants) when:
214
314
  | **TXT** | All 7 | Same as Markdown | Pure Rust, no threads. |
215
315
  | **DOCX** | All 7 | `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode Rust iterators for the other 5 | Full document parsed once upfront; chunks emitted lazily. Peak memory ≈ file size + chunk vec. Output equals `get_chunks` for every mode. |
216
316
  | **PPTX** | All 7 | Batch-drain | PPTX requires the full ZIP up front, so chunks are computed once at construction and yielded one per `__next__`. |
317
+ | **XLSX / XLS** | All 6 | State machine for `row` / `sliding_window`; batch-drain for `table` / `sheet` / `page_aware` / `semantic` | calamine reads the full file on open (no incremental I/O at format level). `row` and `sliding_window` build one chunk per `__next__` from pre-parsed row data. The other four modes require global analysis first and materialise all chunks at iterator construction. Output is identical to `chunk_xlsx` for every mode. |
217
318
 
218
319
  > **Parity guarantee**: `list(stream_chunks(...))` equals `list(get_chunks(...))` for every format and every supported mode (this is exercised by `test_pdf_streaming.py` for PDF and by the tests in `py_chunks/tests/test_source_apis.py`).
219
320
 
@@ -246,6 +347,19 @@ for chunk in stream_chunks("document.docx", mode="page_aware", paragraphs_per_
246
347
  for chunk in stream_chunks("deck.pptx", mode="semantic"):
247
348
  ...
248
349
 
350
+ # XLSX / XLS — all 6 modes
351
+ for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=10):
352
+ embed_and_index(chunk)
353
+
354
+ for chunk in stream_chunks("report.xls", mode="sliding_window", window_size=5, overlap=2):
355
+ process(chunk)
356
+
357
+ for chunk in stream_chunks("data.xlsx", mode="table", max_chunk_chars=3000):
358
+ store_in_db(chunk)
359
+
360
+ for chunk in stream_chunks("data.xlsx", mode="semantic", rows_per_chunk=20):
361
+ handle(chunk)
362
+
249
363
  # From bytes (e.g. FastAPI body)
250
364
  for chunk in stream_chunks(request_body, filename="report.pdf", mode="semantic"):
251
365
  process(chunk)
@@ -298,8 +412,11 @@ Or use the explicit source-specific helpers:
298
412
  | Markdown | `.md` | All 7 | All 7 (state machine for `structural` / `semantic`) |
299
413
  | HTML | `.html`, `.htm` | All 7 | All 7 (state machine for `structural` / `semantic`) |
300
414
  | Plain Text | `.txt` | All 7 | All 7 (state machine for `structural` / `semantic`) |
415
+ | Excel | `.xlsx`, `.xls` | All 6 | All 6 (`row` / `sliding_window` state machine; others batch-drain) |
416
+
417
+ The 7 document modes are: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`.
301
418
 
302
- The 7 modes are: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`.
419
+ The 6 spreadsheet modes are: `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`.
303
420
 
304
421
  ---
305
422
 
@@ -378,6 +495,7 @@ from py_chunks.chunkers.pptx import chunk_pptx, stream_chunk_pptx, chunk_pptx_wi
378
495
  from py_chunks.chunkers.html import chunk_html, stream_chunk_html
379
496
  from py_chunks.chunkers.md import chunk_md, stream_chunk_md
380
497
  from py_chunks.chunkers.txt import chunk_txt, stream_chunk_txt
498
+ from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx # handles both .xlsx and .xls
381
499
 
382
500
  # Batch with timing
383
501
  chunks, timing = chunk_pdf("file.pdf", mode="section")
@@ -401,6 +519,19 @@ for chunk in stream_chunk_md("book.md", mode="sentence", sentences_per_chunk=2):
401
519
  for chunk in stream_chunk_html("page.html", mode="section"): ...
402
520
  for chunk in stream_chunk_txt("log.txt", mode="page_aware", paragraphs_per_page=20): ...
403
521
  for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
522
+
523
+ # XLSX / XLS — all 6 modes, batch and streaming
524
+ chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
525
+ chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
526
+ chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
527
+ chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
528
+ chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
529
+ chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
530
+ chunks, timing = chunk_xlsx("legacy.xls", mode="row", rows_per_chunk=5) # XLS works identically
531
+
532
+ for chunk in stream_chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=10): ...
533
+ for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1): ...
534
+ for chunk in stream_chunk_xlsx("legacy.xls", mode="semantic", rows_per_chunk=20): ...
404
535
  ```
405
536
 
406
537
  ---
@@ -433,7 +564,13 @@ for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
433
564
  | `semantic` | Heuristic topic-continuity group (`semantic` mode) |
434
565
  | `sliding_window` | Fixed-size overlapping window (`sliding_window` mode) |
435
566
  | `sentence` | Sentence-count group (`sentence` mode) |
436
- | `page_aware` | Page boundary group (`page_aware` mode) |
567
+ | `page_aware` | Page boundary group (`page_aware` mode for document formats) |
568
+ | `row_document` | XLSX/XLS: N consecutive data rows (`row` mode) |
569
+ | `table_region` | XLSX/XLS: named table or heuristic data region (`table` mode) |
570
+ | `sheet` | XLSX/XLS: full sheet or split part (`sheet` mode) |
571
+ | `row_window` | XLSX/XLS: overlapping row window (`sliding_window` mode) |
572
+ | `sheet_region` | XLSX/XLS: print area or full sheet (`page_aware` mode) |
573
+ | `semantic_group` | XLSX/XLS: category-grouped rows or fallback fixed-size group (`semantic` mode) |
437
574
 
438
575
  ### Metadata fields by mode
439
576
 
@@ -613,7 +750,8 @@ def process_document(file_path: str):
613
750
  │ Format Dispatcher │
614
751
  │ (py_chunks/chunkers/*.py) │
615
752
  │ chunk_pdf / chunk_docx / chunk_pptx / │
616
- │ chunk_md / chunk_html / chunk_txt +
753
+ │ chunk_md / chunk_html / chunk_txt / │
754
+ │ chunk_xlsx (handles .xlsx + .xls) + │
617
755
  │ matching stream_chunk_* variants │
618
756
  └──────────────┬───────────────────────────────┘
619
757
  │ validates args, dispatches to the right Rust function,
@@ -639,6 +777,11 @@ def process_document(file_path: str):
639
777
  │ DOCX stream — DocxStructuralIterator (default/structural) + │
640
778
  │ per-mode iterators for all other 5 modes │
641
779
  │ PPTX stream — batch-drain (ZIP must be read upfront) │
780
+ │ XLSX/XLS — open_workbook_auto() handles both formats; │
781
+ │ row / sliding_window: state machine, one chunk per __next__ │
782
+ │ table / sheet / page_aware / semantic: batch-drain │
783
+ │ table mode: ZIP XML for named tables (XLSX) or heuristic │
784
+ │ page_aware: print-area XML (XLSX) or full-sheet fallback │
642
785
  └──────────────────────────────────────────────────────────────────┘
643
786
  ```
644
787
 
@@ -667,7 +810,7 @@ except FileNotFoundError as e:
667
810
  try:
668
811
  chunks = get_chunks("image.png")
669
812
  except ValueError as e:
670
- print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt
813
+ print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt, .xls, .xlsx
671
814
 
672
815
  # Scanned / image-only PDF (no text layer)
673
816
  try:
@@ -742,3 +885,4 @@ MIT
742
885
  ---
743
886
 
744
887
  Built with Rust (performance) + Python (simplicity)
888
+
@@ -1,31 +1,20 @@
1
- Metadata-Version: 2.4
2
- Name: py-chunks
3
- Version: 0.2.3
4
- Classifier: Programming Language :: Python :: 3
5
- Classifier: License :: OSI Approved :: MIT License
6
- Classifier: Operating System :: OS Independent
7
- License-File: LICENSE
8
- Summary: Rust-backed Python chunking library
9
- Keywords: chunking,document,pdf,docx,rust
10
- License: MIT
11
- Requires-Python: >=3.9
12
- Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
13
-
14
1
  # py-chunks
15
2
 
16
3
  [![Python](https://img.shields.io/badge/python-3.9+-blue)](https://www.python.org/downloads/) [![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
17
4
 
18
- Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, and HTML files — optimised for production use.
5
+ Fast, framework-agnostic document chunking library backed by Rust. Extract meaningful content segments from DOCX, PDF, PPTX, TXT, Markdown, HTML, XLSX, and XLS files — optimised for production use.
19
6
 
20
7
  ## Features
21
8
 
22
- - **6 Document Formats**: PDF, DOCX, PPTX, Markdown, HTML, TXT
23
- - **7 Chunking Modes across every format**: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`
9
+ - **8 Supported Formats**: 6 document formats (PDF, DOCX, PPTX, Markdown, HTML, TXT) and 2 spreadsheet formats (XLSX, XLS)
10
+ - **7 Chunking Modes for document formats**: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`
11
+ - **6 Chunking Modes for spreadsheet formats** (XLSX / XLS): `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`
24
12
  - **Streaming for every format** via a single `stream_chunks()` entry point
25
13
  - PDF: background Rust thread + `mpsc` channel (all 7 modes, true one-chunk-at-a-time)
26
14
  - Markdown / HTML / TXT: block-by-block state machine for `structural` + `semantic`; batch-drain for the rest
27
15
  - DOCX: all 7 modes — `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode iterators for the remaining 5 modes (lazy chunk emission after a single upfront parse)
28
16
  - PPTX: batch-drain (ZIP must be read upfront, then chunks are yielded one at a time)
17
+ - XLSX / XLS: `row` and `sliding_window` use true state machines (one chunk per `__next__`, O(parsed_rows) memory); `table`, `sheet`, `page_aware`, and `semantic` use batch-drain (global sheet analysis required before first chunk)
29
18
  - **Multiple Input Sources**: local file paths, raw `bytes` / `bytearray` / `memoryview`, file-like objects (`BytesIO`, open files), FastAPI / Starlette `UploadFile`, HTTP(S) / S3 pre-signed URLs
30
19
  - **Consistent Output Schema**: every chunk is a `dict` with `content`, `content_type`, and `metadata` keys
31
20
  - **Minimal Python runtime dependencies**: all parsing happens in the Rust extension and the PDFium native binary is bundled inside the wheel; the only declared runtime requirement is `pypdfium2`
@@ -63,6 +52,8 @@ chunks = get_chunks("notes.md", mode="semantic")
63
52
  chunks = get_chunks("page.html", mode="section")
64
53
  chunks = get_chunks("deck.pptx", mode="sliding_window", window_size=3, overlap=1)
65
54
  chunks = get_chunks("report.docx", mode="sentence", sentences_per_chunk=3)
55
+ chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
56
+ chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
66
57
 
67
58
  for chunk in chunks:
68
59
  print(chunk["content"])
@@ -208,6 +199,86 @@ These three formats also support **streaming in every mode** — see the Streami
208
199
 
209
200
  ---
210
201
 
202
+ ### XLSX / XLS modes
203
+
204
+ Both `.xlsx` and `.xls` files are handled by the same chunker. All 6 modes are available for batch and streaming:
205
+
206
+ ```python
207
+ from py_chunks import get_chunks, stream_chunks
208
+ from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx
209
+
210
+ # Batch — via unified API
211
+ chunks = get_chunks("data.xlsx", mode="row", rows_per_chunk=5)
212
+ chunks = get_chunks("legacy.xls", mode="row", rows_per_chunk=5)
213
+
214
+ # Batch — via format-specific chunker (returns chunks + timing)
215
+ chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
216
+ chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
217
+ chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
218
+ chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
219
+ chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
220
+ chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
221
+
222
+ # Filter to specific sheets
223
+ chunks, _ = chunk_xlsx("data.xlsx", mode="row", sheet_names=["Sales", "Q4"])
224
+
225
+ # Streaming — identical output to batch
226
+ for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=5):
227
+ print(chunk["content"])
228
+
229
+ for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1):
230
+ embed_and_store(chunk)
231
+ ```
232
+
233
+ | Mode | `content_type` | Description |
234
+ |---|---|---|
235
+ | `row` | `row_document` | Groups N consecutive data rows into one chunk. Header row is auto-detected and excluded from content. Param: `rows_per_chunk` (default 1). |
236
+ | `table` | `table_region` | Named Excel tables (XLSX only) or heuristic contiguous data regions per sheet. For XLS and sheets without named tables, falls back to bounding-box detection. Param: `max_chunk_chars`. |
237
+ | `sheet` | `sheet` | One chunk per sheet (split by `max_chunk_chars` if needed). Includes named-table metadata. Param: `max_chunk_chars`. |
238
+ | `sliding_window` | `row_window` | Overlapping windows of N rows. Params: `window_size` (default 3), `overlap` (default 1, must be `< window_size`). |
239
+ | `page_aware` | `sheet_region` | Chunks by Excel print areas (XLSX only); falls back to the full sheet when no print area is defined. For XLS, always uses the full-sheet fallback. Param: `max_chunk_chars`. |
240
+ | `semantic` | `semantic_group` | Detects the column with the lowest cardinality of string values, sorts by it, and groups rows sharing the same category value. Falls back to fixed-size chunking when no suitable column is found. Param: `rows_per_chunk` (used for the fallback). |
241
+
242
+ **Parameters accepted by `chunk_xlsx` and `stream_chunk_xlsx`:**
243
+
244
+ | Parameter | Type | Default | Description |
245
+ |---|---|---|---|
246
+ | `file_path` | str | — | Path to `.xlsx` or `.xls` file |
247
+ | `mode` | str | `"row"` | One of the 6 modes above |
248
+ | `rows_per_chunk` | int | 1 | Rows per chunk (`row` mode and `semantic` fallback). Must be `> 0`. |
249
+ | `window_size` | int | 3 | Window size in rows (`sliding_window` mode). Must be `>= 1`. |
250
+ | `overlap` | int | 1 | Overlapping rows between windows. Must be `< window_size`. |
251
+ | `include_headers` | bool | True | Prefix each row value with its column header (`key: value` format). |
252
+ | `sheet_names` | list[str] \| None | None | Process only the named sheets; all sheets are processed when `None` or `[]`. |
253
+ | `skip_empty_rows` | bool | True | Skip rows where every cell is empty. |
254
+ | `max_chunk_chars` | int | 2000 | Character limit per chunk (`table`, `sheet`, `page_aware` modes). |
255
+
256
+ **XLS vs XLSX differences:**
257
+
258
+ | Feature | XLSX | XLS |
259
+ |---|---|---|
260
+ | Named table detection (`table` mode) | Read from the table XML parts inside the XLSX ZIP archive (e.g. `table1.xml`) — full named-table metadata | Not available — heuristic bounding-box only; `is_named_table` is always `false` |
261
+ | Print area detection (`page_aware` mode) | Parsed from `xl/workbook.xml` | Not available — always uses full-sheet fallback; `has_print_area` is always `false` |
262
+ | Named table metadata in `sheet` mode | `has_named_tables: true/false`, `named_tables: [...]` | Always `has_named_tables: false`, `named_tables: []` |
263
+ | All other modes | Identical | Identical |
264
+
265
+ **XLSX / XLS metadata fields by mode:**
266
+
267
+ | Mode | Notable metadata keys |
268
+ |---|---|
269
+ | `row` | `sheet_name`, `sheet_index`, `row_index`, `header_row`, `col_count`, `rows_per_chunk`, `actual_row_count`, `chunk_index` |
270
+ | `table` | `sheet_name`, `sheet_index`, `table_name`, `is_named_table`, `header_row`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `chunk_index`, `is_split`, `split_part` |
271
+ | `sheet` | `sheet_name`, `sheet_index`, `row_count`, `col_count`, `header_row`, `has_named_tables`, `named_tables`, `chunk_index`, `is_split`, `split_part` |
272
+ | `sliding_window` | `sheet_name`, `sheet_index`, `window_size`, `overlap`, `actual_row_count`, `window_index`, `start_row`, `end_row`, `header_row`, `col_count`, `chunk_index` |
273
+ | `page_aware` | `sheet_name`, `sheet_index`, `has_print_area`, `print_area_ref`, `start_row`, `end_row`, `start_col`, `end_col`, `row_count`, `col_count`, `header_row`, `region_index`, `chunk_index`, `is_split`, `split_part` |
274
+ | `semantic` | `sheet_name`, `sheet_index`, `category_column`, `category_value`, `used_fallback`, `low_grouping_quality`, `avg_group_size`, `start_row`, `end_row`, `actual_row_count`, `header_row`, `col_count`, `group_index`, `chunk_index` |
275
+
276
+ > **Streaming memory profile**: `row` and `sliding_window` pre-parse all sheet data once (calamine reads the entire file on open — there is no incremental I/O at the format level), then build and yield one chunk per `__next__`. The other four modes require global sheet analysis before the first chunk can be emitted, so they materialise all chunks at construction time and drain them lazily. In both cases the streaming iterator yields one chunk at a time.
277
+
278
+ > **Header detection**: the first all-string row in each sheet is automatically detected as the header row and excluded from chunk content. Columns without a header label are named `Column 1`, `Column 2`, etc.
279
+
280
+ ---
281
+
211
282
  ## Streaming
212
283
 
213
284
  ### When to use streaming
@@ -227,6 +298,7 @@ Use `stream_chunks` (or the `stream_chunks_from_*` variants) when:
227
298
  | **TXT** | All 7 | Same as Markdown | Pure Rust, no threads. |
228
299
  | **DOCX** | All 7 | `DocxStructuralIterator` for `default`/`structural`; dedicated per-mode Rust iterators for the other 5 | Full document parsed once upfront; chunks emitted lazily. Peak memory ≈ file size + chunk vec. Output equals `get_chunks` for every mode. |
229
300
  | **PPTX** | All 7 | Batch-drain | PPTX requires the full ZIP up front, so chunks are computed once at construction and yielded one per `__next__`. |
301
+ | **XLSX / XLS** | All 6 | State machine for `row` / `sliding_window`; batch-drain for `table` / `sheet` / `page_aware` / `semantic` | calamine reads the full file on open (no incremental I/O at format level). `row` and `sliding_window` build one chunk per `__next__` from pre-parsed row data. The other four modes require global analysis first and materialise all chunks at iterator construction. Output is identical to `chunk_xlsx` for every mode. |
230
302
 
231
303
  > **Parity guarantee**: collecting the chunks yielded by `stream_chunks(...)` gives the same result as `list(get_chunks(...))` for every format and every supported mode (this is exercised by `test_pdf_streaming.py` for PDF and by the tests in `py_chunks/tests/test_source_apis.py`).
232
304
 
@@ -259,6 +331,19 @@ for chunk in stream_chunks("document.docx", mode="page_aware", paragraphs_per_
259
331
  for chunk in stream_chunks("deck.pptx", mode="semantic"):
260
332
  ...
261
333
 
334
+ # XLSX / XLS — all 6 modes
335
+ for chunk in stream_chunks("data.xlsx", mode="row", rows_per_chunk=10):
336
+ embed_and_index(chunk)
337
+
338
+ for chunk in stream_chunks("report.xls", mode="sliding_window", window_size=5, overlap=2):
339
+ process(chunk)
340
+
341
+ for chunk in stream_chunks("data.xlsx", mode="table", max_chunk_chars=3000):
342
+ store_in_db(chunk)
343
+
344
+ for chunk in stream_chunks("data.xlsx", mode="semantic", rows_per_chunk=20):
345
+ handle(chunk)
346
+
262
347
  # From bytes (e.g. FastAPI body)
263
348
  for chunk in stream_chunks(request_body, filename="report.pdf", mode="semantic"):
264
349
  process(chunk)
@@ -311,8 +396,11 @@ Or use the explicit source-specific helpers:
311
396
  | Markdown | `.md` | All 7 | All 7 (state machine for `structural` / `semantic`) |
312
397
  | HTML | `.html`, `.htm` | All 7 | All 7 (state machine for `structural` / `semantic`) |
313
398
  | Plain Text | `.txt` | All 7 | All 7 (state machine for `structural` / `semantic`) |
399
+ | Excel | `.xlsx`, `.xls` | All 6 | All 6 (`row` / `sliding_window` state machine; others batch-drain) |
400
+
401
+ The 7 document modes are: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`.
314
402
 
315
- The 7 modes are: `default`, `structural`, `section`, `semantic`, `sliding_window`, `sentence`, `page_aware`.
403
+ The 6 spreadsheet modes are: `row`, `table`, `sheet`, `sliding_window`, `page_aware`, `semantic`.
316
404
 
317
405
  ---
318
406
 
@@ -391,6 +479,7 @@ from py_chunks.chunkers.pptx import chunk_pptx, stream_chunk_pptx, chunk_pptx_wi
391
479
  from py_chunks.chunkers.html import chunk_html, stream_chunk_html
392
480
  from py_chunks.chunkers.md import chunk_md, stream_chunk_md
393
481
  from py_chunks.chunkers.txt import chunk_txt, stream_chunk_txt
482
+ from py_chunks.chunkers.xlsx import chunk_xlsx, stream_chunk_xlsx # handles both .xlsx and .xls
394
483
 
395
484
  # Batch with timing
396
485
  chunks, timing = chunk_pdf("file.pdf", mode="section")
@@ -414,6 +503,19 @@ for chunk in stream_chunk_md("book.md", mode="sentence", sentences_per_chunk=2):
414
503
  for chunk in stream_chunk_html("page.html", mode="section"): ...
415
504
  for chunk in stream_chunk_txt("log.txt", mode="page_aware", paragraphs_per_page=20): ...
416
505
  for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
506
+
507
+ # XLSX / XLS — all 6 modes, batch and streaming
508
+ chunks, timing = chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=5)
509
+ chunks, timing = chunk_xlsx("data.xlsx", mode="table", max_chunk_chars=3000)
510
+ chunks, timing = chunk_xlsx("data.xlsx", mode="sheet", max_chunk_chars=5000)
511
+ chunks, timing = chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1)
512
+ chunks, timing = chunk_xlsx("data.xlsx", mode="page_aware", max_chunk_chars=3000)
513
+ chunks, timing = chunk_xlsx("data.xlsx", mode="semantic", rows_per_chunk=10)
514
+ chunks, timing = chunk_xlsx("legacy.xls", mode="row", rows_per_chunk=5) # XLS works identically
515
+
516
+ for chunk in stream_chunk_xlsx("data.xlsx", mode="row", rows_per_chunk=10): ...
517
+ for chunk in stream_chunk_xlsx("data.xlsx", mode="sliding_window", window_size=4, overlap=1): ...
518
+ for chunk in stream_chunk_xlsx("legacy.xls", mode="semantic", rows_per_chunk=20): ...
417
519
  ```
418
520
 
419
521
  ---
@@ -446,7 +548,13 @@ for chunk in stream_chunk_pptx("deck.pptx", mode="semantic"): ...
446
548
  | `semantic` | Heuristic topic-continuity group (`semantic` mode) |
447
549
  | `sliding_window` | Fixed-size overlapping window (`sliding_window` mode) |
448
550
  | `sentence` | Sentence-count group (`sentence` mode) |
449
- | `page_aware` | Page boundary group (`page_aware` mode) |
551
+ | `page_aware` | Page boundary group (`page_aware` mode for document formats) |
552
+ | `row_document` | XLSX/XLS: N consecutive data rows (`row` mode) |
553
+ | `table_region` | XLSX/XLS: named table or heuristic data region (`table` mode) |
554
+ | `sheet` | XLSX/XLS: full sheet or split part (`sheet` mode) |
555
+ | `row_window` | XLSX/XLS: overlapping row window (`sliding_window` mode) |
556
+ | `sheet_region` | XLSX/XLS: print area or full sheet (`page_aware` mode) |
557
+ | `semantic_group` | XLSX/XLS: category-grouped rows or fallback fixed-size group (`semantic` mode) |
450
558
 
451
559
  ### Metadata fields by mode
452
560
 
@@ -626,7 +734,8 @@ def process_document(file_path: str):
626
734
  │ Format Dispatcher │
627
735
  │ (py_chunks/chunkers/*.py) │
628
736
  │ chunk_pdf / chunk_docx / chunk_pptx / │
629
- │ chunk_md / chunk_html / chunk_txt +
737
+ │ chunk_md / chunk_html / chunk_txt /
738
+ │ chunk_xlsx (handles .xlsx + .xls) + │
630
739
  │ matching stream_chunk_* variants │
631
740
  └──────────────┬───────────────────────────────┘
632
741
  │ validates args, dispatches to the right Rust function,
@@ -652,6 +761,11 @@ def process_document(file_path: str):
652
761
  │ DOCX stream — DocxStructuralIterator (default/structural) + │
653
762
  │ per-mode iterators for all other 5 modes │
654
763
  │ PPTX stream — batch-drain (ZIP must be read upfront) │
764
+ │ XLSX/XLS — open_workbook_auto() handles both formats; │
765
+ │ row / sliding_window: state machine, one chunk per __next__ │
766
+ │ table / sheet / page_aware / semantic: batch-drain │
767
+ │ table mode: ZIP XML for named tables (XLSX) or heuristic │
768
+ │ page_aware: print-area XML (XLSX) or full-sheet fallback │
655
769
  └──────────────────────────────────────────────────────────────────┘
656
770
  ```
657
771
 
@@ -680,7 +794,7 @@ except FileNotFoundError as e:
680
794
  try:
681
795
  chunks = get_chunks("image.png")
682
796
  except ValueError as e:
683
- print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt
797
+ print(e) # Unsupported file type '.png'. Supported: .docx, .htm, .html, .md, .pdf, .pptx, .txt, .xls, .xlsx
684
798
 
685
799
  # Scanned / image-only PDF (no text layer)
686
800
  try:
@@ -755,4 +869,3 @@ MIT
755
869
  ---
756
870
 
757
871
  Built with Rust (performance) + Python (simplicity)
758
-