py-chunks 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68):
  1. py_chunks-0.2.1/.github/workflows/release.yml +193 -0
  2. py_chunks-0.2.1/.gitignore +18 -0
  3. py_chunks-0.2.1/.pylintrc +8 -0
  4. py_chunks-0.2.1/Cargo.lock +1213 -0
  5. py_chunks-0.2.1/Cargo.toml +20 -0
  6. py_chunks-0.2.1/LICENSE +21 -0
  7. py_chunks-0.2.1/PKG-INFO +758 -0
  8. py_chunks-0.2.1/README.md +744 -0
  9. py_chunks-0.2.1/py_chunks/__init__.py +848 -0
  10. py_chunks-0.2.1/py_chunks/chunkers/__init__.py +23 -0
  11. py_chunks-0.2.1/py_chunks/chunkers/docx.py +182 -0
  12. py_chunks-0.2.1/py_chunks/chunkers/html.py +105 -0
  13. py_chunks-0.2.1/py_chunks/chunkers/md.py +126 -0
  14. py_chunks-0.2.1/py_chunks/chunkers/pdf.py +161 -0
  15. py_chunks-0.2.1/py_chunks/chunkers/pptx.py +158 -0
  16. py_chunks-0.2.1/py_chunks/chunkers/txt.py +108 -0
  17. py_chunks-0.2.1/pyproject.toml +23 -0
  18. py_chunks-0.2.1/src/extensions/docx/common.rs +897 -0
  19. py_chunks-0.2.1/src/extensions/docx/mod.rs +17 -0
  20. py_chunks-0.2.1/src/extensions/docx/page_aware.rs +331 -0
  21. py_chunks-0.2.1/src/extensions/docx/section.rs +389 -0
  22. py_chunks-0.2.1/src/extensions/docx/semantic.rs +515 -0
  23. py_chunks-0.2.1/src/extensions/docx/sentence.rs +339 -0
  24. py_chunks-0.2.1/src/extensions/docx/sliding_window.rs +192 -0
  25. py_chunks-0.2.1/src/extensions/docx/structural.rs +1508 -0
  26. py_chunks-0.2.1/src/extensions/html/common.rs +476 -0
  27. py_chunks-0.2.1/src/extensions/html/mod.rs +19 -0
  28. py_chunks-0.2.1/src/extensions/html/page_aware.rs +92 -0
  29. py_chunks-0.2.1/src/extensions/html/section.rs +179 -0
  30. py_chunks-0.2.1/src/extensions/html/semantic.rs +341 -0
  31. py_chunks-0.2.1/src/extensions/html/sentence.rs +128 -0
  32. py_chunks-0.2.1/src/extensions/html/sliding_window.rs +84 -0
  33. py_chunks-0.2.1/src/extensions/html/stream_iter.rs +324 -0
  34. py_chunks-0.2.1/src/extensions/html/structural.rs +174 -0
  35. py_chunks-0.2.1/src/extensions/md/common.rs +646 -0
  36. py_chunks-0.2.1/src/extensions/md/mod.rs +19 -0
  37. py_chunks-0.2.1/src/extensions/md/page_aware.rs +329 -0
  38. py_chunks-0.2.1/src/extensions/md/section.rs +302 -0
  39. py_chunks-0.2.1/src/extensions/md/semantic.rs +514 -0
  40. py_chunks-0.2.1/src/extensions/md/sentence.rs +417 -0
  41. py_chunks-0.2.1/src/extensions/md/sliding_window.rs +198 -0
  42. py_chunks-0.2.1/src/extensions/md/stream_iter.rs +806 -0
  43. py_chunks-0.2.1/src/extensions/md/structural.rs +250 -0
  44. py_chunks-0.2.1/src/extensions/mod.rs +7 -0
  45. py_chunks-0.2.1/src/extensions/pdf/common.rs +720 -0
  46. py_chunks-0.2.1/src/extensions/pdf/mod.rs +13 -0
  47. py_chunks-0.2.1/src/extensions/pdf/stream_iter.rs +1477 -0
  48. py_chunks-0.2.1/src/extensions/pdf/structural.rs +1872 -0
  49. py_chunks-0.2.1/src/extensions/pptx/common.rs +1012 -0
  50. py_chunks-0.2.1/src/extensions/pptx/mod.rs +19 -0
  51. py_chunks-0.2.1/src/extensions/pptx/page_aware.rs +87 -0
  52. py_chunks-0.2.1/src/extensions/pptx/section.rs +261 -0
  53. py_chunks-0.2.1/src/extensions/pptx/semantic.rs +408 -0
  54. py_chunks-0.2.1/src/extensions/pptx/sentence.rs +101 -0
  55. py_chunks-0.2.1/src/extensions/pptx/sliding_window.rs +142 -0
  56. py_chunks-0.2.1/src/extensions/pptx/stream_iter.rs +104 -0
  57. py_chunks-0.2.1/src/extensions/pptx/structural.rs +140 -0
  58. py_chunks-0.2.1/src/extensions/shared.rs +366 -0
  59. py_chunks-0.2.1/src/extensions/txt/common.rs +543 -0
  60. py_chunks-0.2.1/src/extensions/txt/mod.rs +19 -0
  61. py_chunks-0.2.1/src/extensions/txt/page_aware.rs +221 -0
  62. py_chunks-0.2.1/src/extensions/txt/section.rs +312 -0
  63. py_chunks-0.2.1/src/extensions/txt/semantic.rs +430 -0
  64. py_chunks-0.2.1/src/extensions/txt/sentence.rs +251 -0
  65. py_chunks-0.2.1/src/extensions/txt/sliding_window.rs +208 -0
  66. py_chunks-0.2.1/src/extensions/txt/stream_iter.rs +488 -0
  67. py_chunks-0.2.1/src/extensions/txt/structural.rs +263 -0
  68. py_chunks-0.2.1/src/lib.rs +16 -0
@@ -0,0 +1,193 @@
# Release workflow: builds platform wheels (macOS arm64/x64, Linux x86_64,
# Windows x64) plus an sdist, each wheel bundling a prebuilt PDFium shared
# library, and publishes everything to PyPI when the run was triggered by a tag.
name: Release

on:
  push:
    tags:
      - 'v*'
  # Manual runs build all artifacts; the publish job is skipped for them
  # because its `if:` only accepts tag refs.
  workflow_dispatch:

# PDFium build number from https://github.com/bblanchon/pdfium-binaries/releases
# Update this to pick up a newer PDFium build.
env:
  PDFIUM_VERSION: "6996"

jobs:

  # ── macOS ──────────────────────────────────────────────────────────────────
  build-macos:
    name: macOS (${{ matrix.target }})
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - os: macos-14          # Apple Silicon
            target: aarch64-apple-darwin
            pdfium_artifact: pdfium-mac-arm64
          - os: macos-15-intel    # Intel (macos-13 retired Dec 2025)
            target: x86_64-apple-darwin
            pdfium_artifact: pdfium-mac-x64

    steps:
      - uses: actions/checkout@v4

      - name: Download and bundle PDFium
        run: |
          # Download and unpack under $RUNNER_TEMP rather than in the checkout,
          # so top-level files shipped inside the archive can never overwrite
          # tracked project files that maturin later packages into the wheel.
          PDFIUM_DIR="$RUNNER_TEMP/pdfium"
          mkdir -p "$PDFIUM_DIR"
          curl -fL \
            "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F${{ env.PDFIUM_VERSION }}/${{ matrix.pdfium_artifact }}.tgz" \
            -o "$RUNNER_TEMP/pdfium.tgz"
          tar -xzf "$RUNNER_TEMP/pdfium.tgz" -C "$PDFIUM_DIR"
          # Search only the extraction directory, never the repository tree.
          LIB=$(find "$PDFIUM_DIR" -name "libpdfium.dylib" | head -1)
          [ -n "$LIB" ] || { echo "libpdfium.dylib not found in archive"; exit 1; }
          cp "$LIB" py_chunks/libpdfium.dylib
          # Ad-hoc sign so macOS will dlopen() it without quarantine issues
          codesign -s - --force py_chunks/libpdfium.dylib
          echo "Bundled $(du -sh py_chunks/libpdfium.dylib)"

      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - name: Build wheel
        uses: PyO3/maturin-action@v1
        with:
          command: build
          target: ${{ matrix.target }}
          args: --release --out target/wheels

      - uses: actions/upload-artifact@v4
        with:
          name: wheels-${{ matrix.target }}
          path: target/wheels

  # ── Linux x86_64 ───────────────────────────────────────────────────────────
  # Strategy: build the manylinux wheel first (so auditwheel can audit the
  # Rust extension cleanly), then inject the PDFium .so into the wheel zip
  # afterwards. auditwheel only audits ELF extension modules — a plain data
  # file added post-build is never inspected.
  build-linux:
    name: Linux x86_64
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - name: Download PDFium (save for post-build injection)
        run: |
          # Download and unpack under $RUNNER_TEMP rather than in the checkout,
          # so top-level files shipped inside the archive can never overwrite
          # tracked project files that maturin later packages into the wheel.
          PDFIUM_DIR="$RUNNER_TEMP/pdfium"
          mkdir -p "$PDFIUM_DIR"
          curl -fL \
            "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F${{ env.PDFIUM_VERSION }}/pdfium-linux-x64.tgz" \
            -o "$RUNNER_TEMP/pdfium.tgz"
          tar -xzf "$RUNNER_TEMP/pdfium.tgz" -C "$PDFIUM_DIR"
          # Search only the extraction directory, never the repository tree.
          LIB=$(find "$PDFIUM_DIR" -name "libpdfium.so" | head -1)
          [ -n "$LIB" ] || { echo "libpdfium.so not found in archive"; exit 1; }
          # Stash the library where the injection step below expects it.
          mkdir -p /tmp/pdfium_bins
          cp "$LIB" /tmp/pdfium_bins/libpdfium.so
          echo "Saved $(du -sh /tmp/pdfium_bins/libpdfium.so)"

      - name: Build manylinux wheel
        uses: PyO3/maturin-action@v1
        with:
          command: build
          target: x86_64-unknown-linux-gnu
          manylinux: "2_28"
          args: --release --out target/wheels

      - name: Inject PDFium into wheel
        run: |
          python3 - << 'PYEOF'
          import base64
          import glob
          import hashlib
          import os
          import zipfile

          SO_SOURCE = "/tmp/pdfium_bins/libpdfium.so"
          SO_ARCNAME = "py_chunks/libpdfium.so"

          wheels = [w for w in glob.glob("target/wheels/*.whl") if "linux" in w]
          if not wheels:
              raise FileNotFoundError("No linux wheel found in target/wheels/")

          # Pre-compute the RECORD entry for the injected library. The wheel
          # format lists every archived file as "path,sha256=<urlsafe-b64 digest
          # without padding>,<size>"; without this entry the wheel no longer
          # matches its own manifest.
          with open(SO_SOURCE, "rb") as fh:
              so_bytes = fh.read()
          digest = base64.urlsafe_b64encode(hashlib.sha256(so_bytes).digest())
          digest = digest.rstrip(b"=").decode("ascii")
          record_line = f"{SO_ARCNAME},sha256={digest},{len(so_bytes)}\n"

          for wheel_path in wheels:
              tmp = wheel_path + ".tmp"
              record_seen = False
              with zipfile.ZipFile(wheel_path, "r") as zin:
                  with zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as zout:
                      for item in zin.infolist():
                          data = zin.read(item.filename)
                          if item.filename.endswith(".dist-info/RECORD"):
                              # Append the new entry, keeping one entry per line.
                              text = data.decode("utf-8")
                              if text and not text.endswith("\n"):
                                  text += "\n"
                              data = (text + record_line).encode("utf-8")
                              record_seen = True
                          zout.writestr(item, data)
                      zout.write(SO_SOURCE, SO_ARCNAME)
              if not record_seen:
                  os.remove(tmp)
                  raise RuntimeError(f"No .dist-info/RECORD found in {wheel_path}")
              os.replace(tmp, wheel_path)
              print(f"Injected libpdfium.so into {os.path.basename(wheel_path)}")
          PYEOF

      - uses: actions/upload-artifact@v4
        with:
          name: wheels-linux-x86_64
          path: target/wheels

  # ── Windows x64 ────────────────────────────────────────────────────────────
  build-windows:
    name: Windows x64
    runs-on: windows-2022
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - name: Download and bundle PDFium
        shell: pwsh
        run: |
          Invoke-WebRequest `
            -Uri "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F${{ env.PDFIUM_VERSION }}/pdfium-win-x64.tgz" `
            -OutFile pdfium.tgz
          New-Item -ItemType Directory -Name pdfium_tmp -Force | Out-Null
          tar -xzf pdfium.tgz -C pdfium_tmp
          $dll = Get-ChildItem -Path pdfium_tmp -Recurse -Filter "pdfium.dll" | Select-Object -First 1
          if (-not $dll) { Write-Error "pdfium.dll not found in archive"; exit 1 }
          Copy-Item -Path $dll.FullName -Destination "py_chunks\pdfium.dll"
          Write-Host "Bundled $($dll.Name) ($('{0:N0}' -f (Get-Item py_chunks\pdfium.dll).Length) bytes)"

      - name: Build wheel
        uses: PyO3/maturin-action@v1
        with:
          command: build
          target: x86_64-pc-windows-msvc
          args: --release --out target/wheels

      - uses: actions/upload-artifact@v4
        with:
          name: wheels-windows-x86_64
          path: target/wheels

  # ── Source distribution ─────────────────────────────────────────────────────
  sdist:
    name: sdist
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Build sdist
        uses: PyO3/maturin-action@v1
        with:
          command: sdist
          args: --out target/wheels
      - uses: actions/upload-artifact@v4
        with:
          name: wheels-sdist
          path: target/wheels

  # ── Publish ─────────────────────────────────────────────────────────────────
  # Runs only for tag refs, after every build job has succeeded.
  publish:
    needs: [build-macos, build-linux, build-windows, sdist]
    runs-on: ubuntu-latest
    if: startsWith(github.ref, 'refs/tags/')
    permissions:
      id-token: write   # required for PyPI trusted publishing

    steps:
      # Collect every wheels-* artifact into a single dist/ directory.
      - uses: actions/download-artifact@v4
        with:
          pattern: wheels-*
          merge-multiple: true
          path: dist

      - name: List wheels
        run: ls -lh dist/

      - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,18 @@
# PDFium shared libraries bundled into the package: CI downloads them, so they are never committed
py_chunks/pdfium.dll
py_chunks/libpdfium.so
py_chunks/libpdfium.dylib

# Python bytecode, caches and packaging metadata
*.py[cod]
__pycache__/
*.egg-info/
.pytest_cache/

# Rust / Cargo build output
target/

# Virtual environments created locally
venv/
.venv/

# NOTE(review): the whole tests/ directory is ignored; confirm this is intentional
tests/
@@ -0,0 +1,8 @@
[MASTER]
# Extension module pylint is allowed to load so it can inspect its members.
extension-pkg-allow-list=py_chunks._rust

[MESSAGES CONTROL]
# Checks switched off for this project.
disable=duplicate-code,too-few-public-methods,missing-function-docstring