py-chunks 0.2.1__tar.gz → 0.2.3__tar.gz

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (68)
  1. {py_chunks-0.2.1 → py_chunks-0.2.3}/.github/workflows/release.yml +12 -1
  2. {py_chunks-0.2.1 → py_chunks-0.2.3}/.gitignore +3 -0
  3. {py_chunks-0.2.1 → py_chunks-0.2.3}/PKG-INFO +1 -1
  4. {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/__init__.py +25 -5
  5. {py_chunks-0.2.1 → py_chunks-0.2.3}/pyproject.toml +1 -1
  6. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pdf/structural.rs +47 -1
  7. {py_chunks-0.2.1 → py_chunks-0.2.3}/.pylintrc +0 -0
  8. {py_chunks-0.2.1 → py_chunks-0.2.3}/Cargo.lock +0 -0
  9. {py_chunks-0.2.1 → py_chunks-0.2.3}/Cargo.toml +0 -0
  10. {py_chunks-0.2.1 → py_chunks-0.2.3}/LICENSE +0 -0
  11. {py_chunks-0.2.1 → py_chunks-0.2.3}/README.md +0 -0
  12. {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/__init__.py +0 -0
  13. {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/docx.py +0 -0
  14. {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/html.py +0 -0
  15. {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/md.py +0 -0
  16. {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/pdf.py +0 -0
  17. {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/pptx.py +0 -0
  18. {py_chunks-0.2.1 → py_chunks-0.2.3}/py_chunks/chunkers/txt.py +0 -0
  19. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/common.rs +0 -0
  20. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/mod.rs +0 -0
  21. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/page_aware.rs +0 -0
  22. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/section.rs +0 -0
  23. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/semantic.rs +0 -0
  24. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/sentence.rs +0 -0
  25. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/sliding_window.rs +0 -0
  26. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/docx/structural.rs +0 -0
  27. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/common.rs +0 -0
  28. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/mod.rs +0 -0
  29. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/page_aware.rs +0 -0
  30. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/section.rs +0 -0
  31. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/semantic.rs +0 -0
  32. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/sentence.rs +0 -0
  33. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/sliding_window.rs +0 -0
  34. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/stream_iter.rs +0 -0
  35. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/html/structural.rs +0 -0
  36. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/common.rs +0 -0
  37. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/mod.rs +0 -0
  38. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/page_aware.rs +0 -0
  39. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/section.rs +0 -0
  40. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/semantic.rs +0 -0
  41. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/sentence.rs +0 -0
  42. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/sliding_window.rs +0 -0
  43. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/stream_iter.rs +0 -0
  44. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/md/structural.rs +0 -0
  45. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/mod.rs +0 -0
  46. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pdf/common.rs +0 -0
  47. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pdf/mod.rs +0 -0
  48. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pdf/stream_iter.rs +0 -0
  49. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/common.rs +0 -0
  50. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/mod.rs +0 -0
  51. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/page_aware.rs +0 -0
  52. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/section.rs +0 -0
  53. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/semantic.rs +0 -0
  54. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/sentence.rs +0 -0
  55. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/sliding_window.rs +0 -0
  56. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/stream_iter.rs +0 -0
  57. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/pptx/structural.rs +0 -0
  58. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/shared.rs +0 -0
  59. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/common.rs +0 -0
  60. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/mod.rs +0 -0
  61. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/page_aware.rs +0 -0
  62. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/section.rs +0 -0
  63. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/semantic.rs +0 -0
  64. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/sentence.rs +0 -0
  65. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/sliding_window.rs +0 -0
  66. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/stream_iter.rs +0 -0
  67. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/extensions/txt/structural.rs +0 -0
  68. {py_chunks-0.2.1 → py_chunks-0.2.3}/src/lib.rs +0 -0
@@ -131,7 +131,7 @@ jobs:
131
131
  with:
132
132
  python-version: "3.13"
133
133
 
134
- - name: Download and bundle PDFium
134
+ - name: Download and bundle PDFium + MSVC runtime
135
135
  shell: pwsh
136
136
  run: |
137
137
  Invoke-WebRequest `
@@ -144,6 +144,17 @@ jobs:
144
144
  Copy-Item -Path $dll.FullName -Destination "py_chunks\pdfium.dll"
145
145
  Write-Host "Bundled $($dll.Name) ($('{0:N0}' -f (Get-Item py_chunks\pdfium.dll).Length) bytes)"
146
146
 
147
+ # Bundle MSVC runtime DLLs alongside pdfium.dll so the Rust
148
+ # LOAD_WITH_ALTERED_SEARCH_PATH preload finds them in py_chunks/.
149
+ $runtimeDlls = @("vcruntime140.dll", "msvcp140.dll", "vcruntime140_1.dll")
150
+ foreach ($rt in $runtimeDlls) {
151
+ $src = "C:\Windows\System32\$rt"
152
+ if (Test-Path $src) {
153
+ Copy-Item -Path $src -Destination "py_chunks\$rt"
154
+ Write-Host "Bundled runtime $rt"
155
+ }
156
+ }
157
+
147
158
  - name: Build wheel
148
159
  uses: PyO3/maturin-action@v1
149
160
  with:
@@ -2,6 +2,9 @@
2
2
  py_chunks/libpdfium.dylib
3
3
  py_chunks/libpdfium.so
4
4
  py_chunks/pdfium.dll
5
+ py_chunks/vcruntime140.dll
6
+ py_chunks/msvcp140.dll
7
+ py_chunks/vcruntime140_1.dll
5
8
 
6
9
  # Python artifacts
7
10
  __pycache__/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: py-chunks
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Classifier: Programming Language :: Python :: 3
5
5
  Classifier: License :: OSI Approved :: MIT License
6
6
  Classifier: Operating System :: OS Independent
@@ -6,6 +6,7 @@ URLs.
6
6
  """
7
7
 
8
8
  import os
9
+ import sys
9
10
  import tempfile
10
11
  from os import PathLike, fspath
11
12
  from pathlib import Path
@@ -13,11 +14,30 @@ from typing import Any
13
14
  from urllib.parse import urlparse
14
15
  from urllib.request import urlopen
15
16
 
16
- # Register the package directory so the Rust layer can find a bundled
17
- # libpdfium binary (libpdfium.dylib / pdfium.dll / libpdfium.so) placed
18
- # alongside _rust.so inside this directory. Must be set before any Rust
19
- # symbol that calls get_pdfium() is first invoked.
20
- os.environ.setdefault("PY_CHUNKS_PACKAGE_DIR", str(Path(__file__).parent))
17
+ _pkg_dir = Path(__file__).parent
18
+
19
+ # Tell the Rust layer where to find the bundled PDFium binary.
20
+ os.environ.setdefault("PY_CHUNKS_PACKAGE_DIR", str(_pkg_dir))
21
+
22
+ # Directly resolve the bundled binary and set PDFIUM_LIBRARY_PATH to its
23
+ # absolute path. This hits the highest-priority branch in the Rust resolver
24
+ # so no directory scanning is needed — the path is always exact.
25
+ _PDFIUM_NAMES = {
26
+ "win32": "pdfium.dll",
27
+ "darwin": "libpdfium.dylib",
28
+ "linux": "libpdfium.so",
29
+ }
30
+ _pdfium_bin = _pkg_dir / _PDFIUM_NAMES.get(sys.platform, "")
31
+ if _pdfium_bin.exists():
32
+ os.environ.setdefault("PDFIUM_LIBRARY_PATH", str(_pdfium_bin))
33
+
34
+ # On Windows, register the package directory as a DLL search directory so
35
+ # pdfium.dll's own dependencies (vcruntime140.dll, msvcp140.dll …) are found
36
+ # in py_chunks/ rather than failing with LoadLibrary error 126.
37
+ # os.add_dll_directory() wraps AddDllDirectory() — available on Python 3.8+,
38
+ # which we always satisfy (requires-python = ">=3.9").
39
+ if sys.platform == "win32" and hasattr(os, "add_dll_directory"):
40
+ os.add_dll_directory(str(_pkg_dir))
21
41
 
22
42
  from .chunkers.docx import chunk_docx, stream_chunk_docx
23
43
  from .chunkers.html import chunk_html, stream_chunk_html
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "py-chunks"
7
- version = "0.2.1"
7
+ version = "0.2.3"
8
8
  description = "Rust-backed Python chunking library"
9
9
  requires-python = ">=3.9"
10
10
  dependencies = []
@@ -434,6 +434,14 @@ pub(super) fn get_pdfium() -> Result<Pdfium, String> {
434
434
  add_library_search_path(parent);
435
435
  }
436
436
 
437
+ // On Windows, pre-load with LOAD_WITH_ALTERED_SEARCH_PATH so the
438
+ // loader searches pdfium.dll's own directory for its dependencies
439
+ // (e.g. bundled vcruntime140.dll). If successful the module is
440
+ // cached; pdfium-render's subsequent LoadLibraryExW returns the
441
+ // already-loaded handle without a new search.
442
+ #[cfg(windows)]
443
+ windows_preload_pdfium(&path);
444
+
437
445
  if let Ok(bindings) = Pdfium::bind_to_library(&path) {
438
446
  return Ok(Pdfium::new(bindings));
439
447
  }
@@ -443,12 +451,50 @@ pub(super) fn get_pdfium() -> Result<Pdfium, String> {
443
451
  .map(|b| Pdfium::new(b))
444
452
  .map_err(|e| {
445
453
  format!(
446
- "PDFium native library not found or could not be loaded. The py-chunks wheel expects pypdfium2 to supply a native PDFium binary automatically. If this still fails, verify that the installed pypdfium2 package is intact and that your environment has access to the bundled pdfium library. On Windows, ensure pdfium.dll is present on PATH. Loader error: {}",
454
+ "PDFium native library not found or could not be loaded. \
455
+ The py-chunks wheel bundles pdfium.dll — if you see this on \
456
+ Windows, install the Visual C++ Redistributable 2022 (x64): \
457
+ https://aka.ms/vs/17/release/vc_redist.x64.exe \
458
+ Or set PDFIUM_LIBRARY_PATH to point at a working pdfium.dll. \
459
+ Loader error: {}",
447
460
  e
448
461
  )
449
462
  })
450
463
  }
451
464
 
465
+ #[cfg(windows)]
466
+ fn windows_preload_pdfium(path: &str) {
467
+ use std::ffi::OsStr;
468
+ use std::os::windows::ffi::OsStrExt;
469
+
470
+ extern "system" {
471
+ fn LoadLibraryExW(
472
+ lp_file_name: *const u16,
473
+ h_file: *mut std::ffi::c_void,
474
+ dw_flags: u32,
475
+ ) -> *mut std::ffi::c_void;
476
+ }
477
+
478
+ // LOAD_WITH_ALTERED_SEARCH_PATH: use the DLL's own directory as the
479
+ // base when resolving its import dependencies.
480
+ const LOAD_WITH_ALTERED_SEARCH_PATH: u32 = 0x0000_0008;
481
+
482
+ let wide: Vec<u16> = OsStr::new(path)
483
+ .encode_wide()
484
+ .chain(std::iter::once(0u16))
485
+ .collect();
486
+
487
+ // Ignore the return value — on failure we fall through to pdfium-render's
488
+ // own load attempt which will surface the OS error code.
489
+ unsafe {
490
+ LoadLibraryExW(
491
+ wide.as_ptr(),
492
+ std::ptr::null_mut(),
493
+ LOAD_WITH_ALTERED_SEARCH_PATH,
494
+ );
495
+ }
496
+ }
497
+
452
498
  fn collect_paragraph_records(
453
499
  grouped_paragraphs: Vec<(String, bool, usize, f32)>,
454
500
  ) -> Vec<ParagraphRecord> {
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes