py-chunks 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {py_chunks-0.2.1 → py_chunks-0.2.3}/.github/workflows/release.yml +12 -1
- {py_chunks-0.2.1 → py_chunks-0.2.3}/.gitignore +3 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/PKG-INFO +1 -1
- {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/__init__.py +25 -5
- {py_chunks-0.2.1 → py_chunks-0.2.3}/pyproject.toml +1 -1
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pdf/structural.rs +47 -1
- {py_chunks-0.2.1 → py_chunks-0.2.3}/.pylintrc +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/Cargo.lock +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/Cargo.toml +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/LICENSE +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/README.md +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/__init__.py +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/docx.py +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/html.py +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/md.py +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/pdf.py +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/pptx.py +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/txt.py +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/common.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/mod.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/page_aware.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/section.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/semantic.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/sentence.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/sliding_window.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/structural.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/common.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/mod.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/page_aware.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/section.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/semantic.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/sentence.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/sliding_window.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/stream_iter.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/structural.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/common.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/mod.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/page_aware.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/section.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/semantic.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/sentence.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/sliding_window.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/stream_iter.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/structural.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/mod.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pdf/common.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pdf/mod.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pdf/stream_iter.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/common.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/mod.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/page_aware.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/section.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/semantic.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/sentence.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/sliding_window.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/stream_iter.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/structural.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/shared.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/common.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/mod.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/page_aware.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/section.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/semantic.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/sentence.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/sliding_window.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/stream_iter.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/structural.rs +0 -0
- {py_chunks-0.2.1 → py_chunks-0.2.3}/src/lib.rs +0 -0
|
@@ -131,7 +131,7 @@ jobs:
|
|
|
131
131
|
with:
|
|
132
132
|
python-version: "3.13"
|
|
133
133
|
|
|
134
|
-
- name: Download and bundle PDFium
|
|
134
|
+
- name: Download and bundle PDFium + MSVC runtime
|
|
135
135
|
shell: pwsh
|
|
136
136
|
run: |
|
|
137
137
|
Invoke-WebRequest `
|
|
@@ -144,6 +144,17 @@ jobs:
|
|
|
144
144
|
Copy-Item -Path $dll.FullName -Destination "py_chunks\pdfium.dll"
|
|
145
145
|
Write-Host "Bundled $($dll.Name) ($('{0:N0}' -f (Get-Item py_chunks\pdfium.dll).Length) bytes)"
|
|
146
146
|
|
|
147
|
+
# Bundle MSVC runtime DLLs alongside pdfium.dll so the Rust
|
|
148
|
+
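# LOAD_WITH_ALTERED_SEARCH_PATH preload finds them in py_chunks/. A runtime DLL that is absent from System32 is skipped silently (no error, no log line), so the wheel may ship without it.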
|
|
149
|
+
$runtimeDlls = @("vcruntime140.dll", "msvcp140.dll", "vcruntime140_1.dll")
|
|
150
|
+
foreach ($rt in $runtimeDlls) {
|
|
151
|
+
$src = "C:\Windows\System32\$rt"
|
|
152
|
+
if (Test-Path $src) {
|
|
153
|
+
Copy-Item -Path $src -Destination "py_chunks\$rt"
|
|
154
|
+
Write-Host "Bundled runtime $rt"
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
|
|
147
158
|
- name: Build wheel
|
|
148
159
|
uses: PyO3/maturin-action@v1
|
|
149
160
|
with:
|
|
@@ -6,6 +6,7 @@ URLs.
|
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
import os
|
|
9
|
+
import sys
|
|
9
10
|
import tempfile
|
|
10
11
|
from os import PathLike, fspath
|
|
11
12
|
from pathlib import Path
|
|
@@ -13,11 +14,30 @@ from typing import Any
|
|
|
13
14
|
from urllib.parse import urlparse
|
|
14
15
|
from urllib.request import urlopen
|
|
15
16
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
#
|
|
19
|
-
|
|
20
|
-
|
|
17
|
+
_pkg_dir = Path(__file__).parent
|
|
18
|
+
|
|
19
|
+
# Tell the Rust layer where to find the bundled PDFium binary.
|
|
20
|
+
os.environ.setdefault("PY_CHUNKS_PACKAGE_DIR", str(_pkg_dir))
|
|
21
|
+
|
|
22
|
+
# Directly resolve the bundled binary and set PDFIUM_LIBRARY_PATH to its
|
|
23
|
+
# absolute path. This hits the highest-priority branch in the Rust resolver
|
|
24
|
+
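# so no directory scanning is needed. NOTE(review): on a platform missing from _PDFIUM_NAMES the "" fallback makes this path the package directory itself, which passes exists() — PDFIUM_LIBRARY_PATH would then name a directory; consider is_file().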
|
|
25
|
+
_PDFIUM_NAMES = {
|
|
26
|
+
"win32": "pdfium.dll",
|
|
27
|
+
"darwin": "libpdfium.dylib",
|
|
28
|
+
"linux": "libpdfium.so",
|
|
29
|
+
}
|
|
30
|
+
_pdfium_bin = _pkg_dir / _PDFIUM_NAMES.get(sys.platform, "")
|
|
31
|
+
if _pdfium_bin.exists():
|
|
32
|
+
os.environ.setdefault("PDFIUM_LIBRARY_PATH", str(_pdfium_bin))
|
|
33
|
+
|
|
34
|
+
# On Windows, register the package directory as a DLL search directory so
|
|
35
|
+
# pdfium.dll's own dependencies (vcruntime140.dll, msvcp140.dll …) are found
|
|
36
|
+
# in py_chunks/ rather than failing with LoadLibrary error 126.
|
|
37
|
+
# os.add_dll_directory() wraps AddDllDirectory() — available on Python 3.8+,
|
|
38
|
+
# which we always satisfy (requires-python = ">=3.9").
|
|
39
|
+
if sys.platform == "win32" and hasattr(os, "add_dll_directory"):
|
|
40
|
+
os.add_dll_directory(str(_pkg_dir))
|
|
21
41
|
|
|
22
42
|
from .chunkers.docx import chunk_docx, stream_chunk_docx
|
|
23
43
|
from .chunkers.html import chunk_html, stream_chunk_html
|
|
@@ -434,6 +434,14 @@ pub(super) fn get_pdfium() -> Result<Pdfium, String> {
|
|
|
434
434
|
add_library_search_path(parent);
|
|
435
435
|
}
|
|
436
436
|
|
|
437
|
+
// On Windows, pre-load with LOAD_WITH_ALTERED_SEARCH_PATH so the
|
|
438
|
+
// loader searches pdfium.dll's own directory for its dependencies
|
|
439
|
+
// (e.g. bundled vcruntime140.dll). If successful the module is
|
|
440
|
+
// cached; pdfium-render's subsequent LoadLibraryExW returns the
|
|
441
|
+
// already-loaded handle without a new search.
|
|
442
|
+
#[cfg(windows)]
|
|
443
|
+
windows_preload_pdfium(&path);
|
|
444
|
+
|
|
437
445
|
if let Ok(bindings) = Pdfium::bind_to_library(&path) {
|
|
438
446
|
return Ok(Pdfium::new(bindings));
|
|
439
447
|
}
|
|
@@ -443,12 +451,50 @@ pub(super) fn get_pdfium() -> Result<Pdfium, String> {
|
|
|
443
451
|
.map(|b| Pdfium::new(b))
|
|
444
452
|
.map_err(|e| {
|
|
445
453
|
format!(
|
|
446
|
-
"PDFium native library not found or could not be loaded.
|
|
454
|
+
"PDFium native library not found or could not be loaded. \
|
|
455
|
+
The py-chunks wheel bundles pdfium.dll — if you see this on \
|
|
456
|
+
Windows, install the Visual C++ Redistributable 2022 (x64): \
|
|
457
|
+
https://aka.ms/vs/17/release/vc_redist.x64.exe \
|
|
458
|
+
Or set PDFIUM_LIBRARY_PATH to point at a working pdfium.dll. \
|
|
459
|
+
Loader error: {}",
|
|
447
460
|
e
|
|
448
461
|
)
|
|
449
462
|
})
|
|
450
463
|
}
|
|
451
464
|
|
|
465
|
+
#[cfg(windows)]
|
|
466
|
+
/// Best-effort preload of the PDFium DLL at `path` via `LoadLibraryExW` with
/// the `LOAD_WITH_ALTERED_SEARCH_PATH` flag.
///
/// `path` is re-encoded as a NUL-terminated wide (u16) string for the call.
/// The returned module handle is discarded: a failed load is not reported
/// here, and nothing in this function releases the handle after a successful
/// load.
///
/// NOTE(review): Microsoft's `LoadLibraryExW` documentation defines this flag
/// only for absolute paths — confirm every caller passes one.
fn windows_preload_pdfium(path: &str) {
|
|
467
|
+
use std::ffi::OsStr;
|
|
468
|
+
use std::os::windows::ffi::OsStrExt;
|
|
469
|
+
|
|
470
|
+
extern "system" {
|
|
471
|
+
fn LoadLibraryExW(
|
|
472
|
+
lp_file_name: *const u16,
|
|
473
|
+
h_file: *mut std::ffi::c_void,
|
|
474
|
+
dw_flags: u32,
|
|
475
|
+
) -> *mut std::ffi::c_void;
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
// LOAD_WITH_ALTERED_SEARCH_PATH: use the DLL's own directory as the
|
|
479
|
+
// base when resolving its import dependencies.
|
|
480
|
+
const LOAD_WITH_ALTERED_SEARCH_PATH: u32 = 0x0000_0008;
|
|
481
|
+
|
|
482
|
+
// Build the wide-string argument: the u16 units of `path` followed by a
// single terminating 0.
let wide: Vec<u16> = OsStr::new(path)
|
|
483
|
+
.encode_wide()
|
|
484
|
+
.chain(std::iter::once(0u16))
|
|
485
|
+
.collect();
|
|
486
|
+
|
|
487
|
+
// Ignore the return value — on failure we fall through to pdfium-render's
|
|
488
|
+
// own load attempt which will surface the OS error code.
|
|
489
|
+
// SAFETY: `wide` ends with the 0 appended above and is still in scope for
// the whole call, so the pointer refers to a live, terminated buffer; the
// file-handle argument is null.
unsafe {
|
|
490
|
+
LoadLibraryExW(
|
|
491
|
+
wide.as_ptr(),
|
|
492
|
+
std::ptr::null_mut(),
|
|
493
|
+
LOAD_WITH_ALTERED_SEARCH_PATH,
|
|
494
|
+
);
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
|
|
452
498
|
fn collect_paragraph_records(
|
|
453
499
|
grouped_paragraphs: Vec<(String, bool, usize, f32)>,
|
|
454
500
|
) -> Vec<ParagraphRecord> {
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|