py-chunks 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_chunks-0.2.1/.github/workflows/release.yml +193 -0
- py_chunks-0.2.1/.gitignore +18 -0
- py_chunks-0.2.1/.pylintrc +8 -0
- py_chunks-0.2.1/Cargo.lock +1213 -0
- py_chunks-0.2.1/Cargo.toml +20 -0
- py_chunks-0.2.1/LICENSE +21 -0
- py_chunks-0.2.1/PKG-INFO +758 -0
- py_chunks-0.2.1/README.md +744 -0
- py_chunks-0.2.1/py_chunks/__init__.py +848 -0
- py_chunks-0.2.1/py_chunks/chunkers/__init__.py +23 -0
- py_chunks-0.2.1/py_chunks/chunkers/docx.py +182 -0
- py_chunks-0.2.1/py_chunks/chunkers/html.py +105 -0
- py_chunks-0.2.1/py_chunks/chunkers/md.py +126 -0
- py_chunks-0.2.1/py_chunks/chunkers/pdf.py +161 -0
- py_chunks-0.2.1/py_chunks/chunkers/pptx.py +158 -0
- py_chunks-0.2.1/py_chunks/chunkers/txt.py +108 -0
- py_chunks-0.2.1/pyproject.toml +23 -0
- py_chunks-0.2.1/src/extensions/docx/common.rs +897 -0
- py_chunks-0.2.1/src/extensions/docx/mod.rs +17 -0
- py_chunks-0.2.1/src/extensions/docx/page_aware.rs +331 -0
- py_chunks-0.2.1/src/extensions/docx/section.rs +389 -0
- py_chunks-0.2.1/src/extensions/docx/semantic.rs +515 -0
- py_chunks-0.2.1/src/extensions/docx/sentence.rs +339 -0
- py_chunks-0.2.1/src/extensions/docx/sliding_window.rs +192 -0
- py_chunks-0.2.1/src/extensions/docx/structural.rs +1508 -0
- py_chunks-0.2.1/src/extensions/html/common.rs +476 -0
- py_chunks-0.2.1/src/extensions/html/mod.rs +19 -0
- py_chunks-0.2.1/src/extensions/html/page_aware.rs +92 -0
- py_chunks-0.2.1/src/extensions/html/section.rs +179 -0
- py_chunks-0.2.1/src/extensions/html/semantic.rs +341 -0
- py_chunks-0.2.1/src/extensions/html/sentence.rs +128 -0
- py_chunks-0.2.1/src/extensions/html/sliding_window.rs +84 -0
- py_chunks-0.2.1/src/extensions/html/stream_iter.rs +324 -0
- py_chunks-0.2.1/src/extensions/html/structural.rs +174 -0
- py_chunks-0.2.1/src/extensions/md/common.rs +646 -0
- py_chunks-0.2.1/src/extensions/md/mod.rs +19 -0
- py_chunks-0.2.1/src/extensions/md/page_aware.rs +329 -0
- py_chunks-0.2.1/src/extensions/md/section.rs +302 -0
- py_chunks-0.2.1/src/extensions/md/semantic.rs +514 -0
- py_chunks-0.2.1/src/extensions/md/sentence.rs +417 -0
- py_chunks-0.2.1/src/extensions/md/sliding_window.rs +198 -0
- py_chunks-0.2.1/src/extensions/md/stream_iter.rs +806 -0
- py_chunks-0.2.1/src/extensions/md/structural.rs +250 -0
- py_chunks-0.2.1/src/extensions/mod.rs +7 -0
- py_chunks-0.2.1/src/extensions/pdf/common.rs +720 -0
- py_chunks-0.2.1/src/extensions/pdf/mod.rs +13 -0
- py_chunks-0.2.1/src/extensions/pdf/stream_iter.rs +1477 -0
- py_chunks-0.2.1/src/extensions/pdf/structural.rs +1872 -0
- py_chunks-0.2.1/src/extensions/pptx/common.rs +1012 -0
- py_chunks-0.2.1/src/extensions/pptx/mod.rs +19 -0
- py_chunks-0.2.1/src/extensions/pptx/page_aware.rs +87 -0
- py_chunks-0.2.1/src/extensions/pptx/section.rs +261 -0
- py_chunks-0.2.1/src/extensions/pptx/semantic.rs +408 -0
- py_chunks-0.2.1/src/extensions/pptx/sentence.rs +101 -0
- py_chunks-0.2.1/src/extensions/pptx/sliding_window.rs +142 -0
- py_chunks-0.2.1/src/extensions/pptx/stream_iter.rs +104 -0
- py_chunks-0.2.1/src/extensions/pptx/structural.rs +140 -0
- py_chunks-0.2.1/src/extensions/shared.rs +366 -0
- py_chunks-0.2.1/src/extensions/txt/common.rs +543 -0
- py_chunks-0.2.1/src/extensions/txt/mod.rs +19 -0
- py_chunks-0.2.1/src/extensions/txt/page_aware.rs +221 -0
- py_chunks-0.2.1/src/extensions/txt/section.rs +312 -0
- py_chunks-0.2.1/src/extensions/txt/semantic.rs +430 -0
- py_chunks-0.2.1/src/extensions/txt/sentence.rs +251 -0
- py_chunks-0.2.1/src/extensions/txt/sliding_window.rs +208 -0
- py_chunks-0.2.1/src/extensions/txt/stream_iter.rs +488 -0
- py_chunks-0.2.1/src/extensions/txt/structural.rs +263 -0
- py_chunks-0.2.1/src/lib.rs +16 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
name: Release

# Runs for every pushed tag matching v*, and on manual dispatch.
on:
  push:
    tags:
      - 'v*'
  workflow_dispatch:

# Least-privilege default for GITHUB_TOKEN: the build jobs only read the
# repository. The publish job declares its own permissions further down.
permissions:
  contents: read

# PDFium build number from https://github.com/bblanchon/pdfium-binaries/releases
# Update this to pick up a newer PDFium build.
env:
  PDFIUM_VERSION: "6996"

jobs:

  # ── macOS ──────────────────────────────────────────────────────────────────
  # Downloads the PDFium dylib into py_chunks/ before the build, then builds
  # one wheel per matrix target with maturin.
  build-macos:
    name: macOS (${{ matrix.target }})
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - os: macos-14  # Apple Silicon
            target: aarch64-apple-darwin
            pdfium_artifact: pdfium-mac-arm64
          - os: macos-15-intel  # Intel (macos-13 retired Dec 2025)
            target: x86_64-apple-darwin
            pdfium_artifact: pdfium-mac-x64

    steps:
      - uses: actions/checkout@v4

      - name: Download and bundle PDFium
        run: |
          # --retry keeps a transient download failure from aborting the release.
          curl -fL --retry 3 --retry-delay 5 \
            "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F${{ env.PDFIUM_VERSION }}/${{ matrix.pdfium_artifact }}.tgz" \
            -o pdfium.tgz
          tar -xzf pdfium.tgz
          # Take the first match; the archive layout is not assumed.
          LIB=$(find . -name "libpdfium.dylib" | head -n 1)
          [ -n "$LIB" ] || { echo "libpdfium.dylib not found in archive"; exit 1; }
          cp "$LIB" py_chunks/libpdfium.dylib
          # Ad-hoc sign so macOS will dlopen() it without quarantine issues
          codesign -s - --force py_chunks/libpdfium.dylib
          echo "Bundled $(du -sh py_chunks/libpdfium.dylib)"

      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - name: Build wheel
        uses: PyO3/maturin-action@v1
        with:
          command: build
          target: ${{ matrix.target }}
          args: --release --out target/wheels

      - uses: actions/upload-artifact@v4
        with:
          name: wheels-${{ matrix.target }}
          path: target/wheels

  # ── Linux x86_64 ───────────────────────────────────────────────────────────
  # Strategy: build the manylinux wheel first (so auditwheel can audit the
  # Rust extension cleanly), then inject the PDFium .so into the wheel zip
  # afterwards. auditwheel only audits ELF extension modules — a plain data
  # file added post-build is never inspected.
  build-linux:
    name: Linux x86_64
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - name: Download PDFium (save for post-build injection)
        run: |
          # --retry keeps a transient download failure from aborting the release.
          curl -fL --retry 3 --retry-delay 5 \
            "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F${{ env.PDFIUM_VERSION }}/pdfium-linux-x64.tgz" \
            -o pdfium.tgz
          tar -xzf pdfium.tgz
          # Take the first match; the archive layout is not assumed.
          LIB=$(find . -name "libpdfium.so" | head -n 1)
          [ -n "$LIB" ] || { echo "libpdfium.so not found in archive"; exit 1; }
          mkdir -p /tmp/pdfium_bins
          cp "$LIB" /tmp/pdfium_bins/libpdfium.so
          echo "Saved $(du -sh /tmp/pdfium_bins/libpdfium.so)"

      - name: Build manylinux wheel
        uses: PyO3/maturin-action@v1
        with:
          command: build
          target: x86_64-unknown-linux-gnu
          manylinux: "2_28"
          args: --release --out target/wheels

      - name: Inject PDFium into wheel
        run: |
          python3 - << 'PYEOF'
          import base64, glob, hashlib, os, zipfile

          PDFIUM_SRC = "/tmp/pdfium_bins/libpdfium.so"
          PDFIUM_DST = "py_chunks/libpdfium.so"

          # Every file in a wheel must be listed in *.dist-info/RECORD as
          # "path,sha256=<urlsafe-base64 digest without padding>,<size>".
          with open(PDFIUM_SRC, "rb") as fh:
              payload = fh.read()
          digest = base64.urlsafe_b64encode(hashlib.sha256(payload).digest())
          digest = digest.rstrip(b"=").decode("ascii")
          record_line = f"{PDFIUM_DST},sha256={digest},{len(payload)}\n"

          wheels = glob.glob("target/wheels/*linux*.whl")
          if not wheels:
              raise FileNotFoundError("No linux wheel found in target/wheels/")

          for wheel_path in wheels:
              tmp = wheel_path + ".tmp"
              record_info = None
              with zipfile.ZipFile(wheel_path, "r") as zin, \
                      zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as zout:
                  for item in zin.infolist():
                      # Hold RECORD back: it is rewritten below with the new entry.
                      if item.filename.endswith(".dist-info/RECORD"):
                          record_info = item
                          continue
                      zout.writestr(item, zin.read(item.filename))
                  if record_info is None:
                      raise RuntimeError(f"No RECORD file found in {wheel_path}")
                  zout.write(PDFIUM_SRC, PDFIUM_DST)
                  record = zin.read(record_info.filename).decode("utf-8")
                  if record and not record.endswith("\n"):
                      record += "\n"
                  zout.writestr(record_info, record + record_line)
              # Swap the rebuilt archive in only after it was written completely.
              os.replace(tmp, wheel_path)
              print(f"Injected libpdfium.so into {os.path.basename(wheel_path)}")
          PYEOF

      - uses: actions/upload-artifact@v4
        with:
          name: wheels-linux-x86_64
          path: target/wheels

  # ── Windows x64 ────────────────────────────────────────────────────────────
  # Downloads pdfium.dll into py_chunks\ before the build, then builds the
  # wheel with maturin.
  build-windows:
    name: Windows x64
    runs-on: windows-2022
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - name: Download and bundle PDFium
        shell: pwsh
        run: |
          # Retry so a transient download failure does not abort the release.
          Invoke-WebRequest `
            -Uri "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F${{ env.PDFIUM_VERSION }}/pdfium-win-x64.tgz" `
            -OutFile pdfium.tgz `
            -MaximumRetryCount 3 `
            -RetryIntervalSec 5
          New-Item -ItemType Directory -Name pdfium_tmp -Force | Out-Null
          tar -xzf pdfium.tgz -C pdfium_tmp
          $dll = Get-ChildItem -Path pdfium_tmp -Recurse -Filter "pdfium.dll" | Select-Object -First 1
          if (-not $dll) { Write-Error "pdfium.dll not found in archive"; exit 1 }
          Copy-Item -Path $dll.FullName -Destination "py_chunks\pdfium.dll"
          Write-Host "Bundled $($dll.Name) ($('{0:N0}' -f (Get-Item py_chunks\pdfium.dll).Length) bytes)"

      - name: Build wheel
        uses: PyO3/maturin-action@v1
        with:
          command: build
          target: x86_64-pc-windows-msvc
          args: --release --out target/wheels

      - uses: actions/upload-artifact@v4
        with:
          name: wheels-windows-x86_64
          path: target/wheels

  # ── Source distribution ─────────────────────────────────────────────────────
  sdist:
    name: sdist
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Build sdist
        uses: PyO3/maturin-action@v1
        with:
          command: sdist
          args: --out target/wheels
      - uses: actions/upload-artifact@v4
        with:
          name: wheels-sdist
          path: target/wheels

  # ── Publish ─────────────────────────────────────────────────────────────────
  # Collects every wheels-* artifact into dist/ and uploads it to PyPI.
  # Skipped unless the workflow runs on a tag ref.
  publish:
    needs: [build-macos, build-linux, build-windows, sdist]
    runs-on: ubuntu-latest
    if: startsWith(github.ref, 'refs/tags/')
    permissions:
      id-token: write  # required for PyPI trusted publishing

    steps:
      - uses: actions/download-artifact@v4
        with:
          pattern: wheels-*
          merge-multiple: true
          path: dist

      - name: List wheels
        run: ls -lh dist/

      - uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Bundled PDFium binaries — downloaded by CI, never committed
py_chunks/libpdfium.dylib
py_chunks/libpdfium.so
py_chunks/pdfium.dll

# Python artifacts
__pycache__/
*.py[cod]
.pytest_cache/
*.egg-info/

# Rust build artifacts
target/

# Local virtual environments
.venv/
venv/

# Test suite
# NOTE(review): this keeps the whole tests/ directory out of version control —
# confirm that is intentional.
tests/
|