speedy-utils 1.1.28__py3-none-any.whl → 1.1.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: speedy-utils
3
- Version: 1.1.28
3
+ Version: 1.1.29
4
4
  Summary: Fast and easy-to-use package for data science
5
5
  Project-URL: Homepage, https://github.com/anhvth/speedy
6
6
  Project-URL: Repository, https://github.com/anhvth/speedy
@@ -48,10 +48,10 @@ speedy_utils/scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
48
48
  speedy_utils/scripts/mpython.py,sha256=aZvusJLKa3APVhabbFUAEo873VBm8Fym7HKGmVW4LyE,3843
49
49
  speedy_utils/scripts/openapi_client_codegen.py,sha256=GModmmhkvGnxljK4KczyixKDrk-VEcLaW5I0XT6tzWo,9657
50
50
  vision_utils/README.md,sha256=AIDZZj8jo_QNrEjFyHwd00iOO431s-js-M2dLtVTn3I,5740
51
- vision_utils/__init__.py,sha256=FjorBD123F8WmPgCDJJO3UA52anam6hJxcv8ngvnM4Q,220
52
- vision_utils/io_utils.py,sha256=eMJmCaDuymuYSGLoG32KfCvX3S2e8i1G10BrjtzKgtc,15619
51
+ vision_utils/__init__.py,sha256=XsLxy1Fn33Zxu6hTFl3NEWfxGjuQQ-0Wmoh6lU9NZ_o,257
52
+ vision_utils/io_utils.py,sha256=1FkG6k7uwZALh3-JkWXEHoGQJhjTqG1jC20SxObPRS0,25921
53
53
  vision_utils/plot.py,sha256=3NW2atYPdcs6-Qq8N179Pwji-p5ZUN6ddC0zE2WZb_8,11984
54
- speedy_utils-1.1.28.dist-info/METADATA,sha256=KjwVNg8n43ltnYrVLlN7rMlk0qO7nQElEdl6VjfzP8E,8028
55
- speedy_utils-1.1.28.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
- speedy_utils-1.1.28.dist-info/entry_points.txt,sha256=1rrFMfqvaMUE9hvwGiD6vnVh98kmgy0TARBj-v0Lfhs,244
57
- speedy_utils-1.1.28.dist-info/RECORD,,
54
+ speedy_utils-1.1.29.dist-info/METADATA,sha256=FcgtzlK1CJlS3sWFYi3Apkx4qqTcTRoLOaHz6YW8Wuc,8028
55
+ speedy_utils-1.1.29.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
+ speedy_utils-1.1.29.dist-info/entry_points.txt,sha256=1rrFMfqvaMUE9hvwGiD6vnVh98kmgy0TARBj-v0Lfhs,244
57
+ speedy_utils-1.1.29.dist-info/RECORD,,
vision_utils/__init__.py CHANGED
@@ -1,5 +1,4 @@
1
- from .io_utils import read_images, read_images_cpu, read_images_gpu, ImageMmap
1
+ from .io_utils import read_images, read_images_cpu, read_images_gpu, ImageMmap, ImageMmapDynamic
2
2
  from .plot import plot_images_notebook
3
3
 
4
-
5
- __all__ = ['plot_images_notebook', 'read_images_cpu', 'read_images_gpu', 'read_images', 'ImageMmap']
4
+ __all__ = ['plot_images_notebook', 'read_images_cpu', 'read_images_gpu', 'read_images', 'ImageMmap', 'ImageMmapDynamic']
vision_utils/io_utils.py CHANGED
@@ -466,5 +466,270 @@ class ImageMmap(Dataset):
466
466
  assert summary > 0, f"Image at {image_path} appears to be all zeros"
467
467
  return img
468
468
 
469
+ class ImageMmapDynamic(Dataset):
470
+ """
471
+ Dynamic-shape mmap dataset.
472
+
473
+ - First run (no mmap/meta or hash mismatch): read all img_paths, keep original H/W,
474
+ append flattened bytes sequentially into a flat mmap file.
475
+ - Also writes a .meta file with mapping:
476
+ img_path -> [offset, H, W, C]
477
+ - Next runs: only open mmap + meta and do constant-time slice + reshape.
478
+ """
479
+
480
+ def __init__(
481
+ self,
482
+ img_paths: Sequence[str | os.PathLike],
483
+ mmap_path: str | os.PathLike | None = None,
484
+ dtype: np.dtype | str = np.uint8,
485
+ safe: bool = True,
486
+ ) -> None:
487
+ self.img_paths = [str(p) for p in img_paths]
488
+ self.imgpath2idx = {p: i for i, p in enumerate(self.img_paths)}
489
+ self.n = len(self.img_paths)
490
+ if self.n == 0:
491
+ raise ValueError('Cannot create ImageMmapDynamic with empty img_paths list')
492
+
493
+ self.dtype = np.dtype(dtype)
494
+ self.safe = safe
495
+
496
+ # Default path if not provided
497
+ if mmap_path is None:
498
+ hash_idx = identify(''.join(self.img_paths))
499
+ mmap_path = Path('.cache') / f'mmap_dynamic_{hash_idx}.dat'
500
+
501
+ self.mmap_path = Path(mmap_path)
502
+ self.meta_path = Path(str(self.mmap_path) + '.meta')
503
+ self.hash_path = Path(str(self.mmap_path) + '.hash')
504
+ self.lock_path = Path(str(self.mmap_path) + '.lock')
505
+
506
+ # Hash of the path list to detect changes
507
+ current_hash = identify(self.img_paths)
508
+ needs_rebuild = False
509
+
510
+ if not self.mmap_path.exists() or not self.meta_path.exists():
511
+ needs_rebuild = True
512
+ print('Dynamic mmap or meta file does not exist, building cache...')
513
+ elif not self.hash_path.exists():
514
+ needs_rebuild = True
515
+ print('Hash file does not exist for dynamic mmap, rebuilding cache...')
516
+ else:
517
+ stored_hash = self.hash_path.read_text().strip()
518
+ if stored_hash != current_hash:
519
+ needs_rebuild = True
520
+ print(
521
+ f'Dynamic mmap hash mismatch '
522
+ f'(stored: {stored_hash[:16]}..., current: {current_hash[:16]}...), '
523
+ 'rebuilding cache...'
524
+ )
525
+ else:
526
+ # Check size vs meta
527
+ import json
528
+
529
+ try:
530
+ with open(self.meta_path, 'r') as f:
531
+ meta = json.load(f)
532
+ meta_dtype = np.dtype(meta.get('dtype', 'uint8'))
533
+ total_elems = int(meta['total_elems'])
534
+ expected_bytes = total_elems * meta_dtype.itemsize
535
+ actual_bytes = self.mmap_path.stat().st_size
536
+ if actual_bytes != expected_bytes:
537
+ needs_rebuild = True
538
+ print(
539
+ 'Dynamic mmap file size mismatch '
540
+ f'(expected: {expected_bytes}, got: {actual_bytes}), '
541
+ 'rebuilding cache...'
542
+ )
543
+ except Exception as e:
544
+ needs_rebuild = True
545
+ print(f'Failed to read dynamic mmap meta ({e}), rebuilding cache...')
546
+
547
+ if needs_rebuild:
548
+ self._build_cache_with_lock(current_hash)
549
+
550
+ # After build (or if cache was already OK), load meta + mmap
551
+ self._load_metadata()
552
+ self.data = np.memmap(
553
+ self.mmap_path,
554
+ dtype=self.dtype,
555
+ mode='r',
556
+ shape=(self.total_elems,),
557
+ )
558
+
559
+ # ------------------------------------------------------------------ #
560
+ # Build phase with lock (same pattern as ImageMmap)
561
+ # ------------------------------------------------------------------ #
562
+ def _build_cache_with_lock(self, current_hash: str) -> None:
563
+ """Build dynamic mmap with a lock file to prevent concurrent writes."""
564
+ self.mmap_path.parent.mkdir(parents=True, exist_ok=True)
565
+
566
+ lock_fd = None
567
+ try:
568
+ import fcntl # POSIX only, same as ImageMmap
569
+
570
+ lock_fd = open(self.lock_path, 'w')
571
+ fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
572
+
573
+ # We got the lock -> build cache
574
+ self._build_cache(current_hash)
575
+ except BlockingIOError:
576
+ # Another process is building -> wait
577
+ print('Another process is building the dynamic mmap cache, waiting...')
578
+ if lock_fd:
579
+ lock_fd.close()
580
+ lock_fd = open(self.lock_path, 'w')
581
+ import fcntl as _fcntl
582
+
583
+ _fcntl.flock(lock_fd.fileno(), _fcntl.LOCK_EX) # block until released
584
+ print('Dynamic mmap cache built by another process!')
585
+ finally:
586
+ if lock_fd:
587
+ lock_fd.close()
588
+ if self.lock_path.exists():
589
+ try:
590
+ self.lock_path.unlink()
591
+ except Exception:
592
+ pass
593
+
594
+ def _build_cache(self, current_hash: str) -> None:
595
+ """
596
+ Build the flat mmap + .meta file.
597
+
598
+ Layout:
599
+ - data file: concatenated flattened images in path order
600
+ - meta: JSON with offsets, shapes, dtype, total_elems, paths, n
601
+ """
602
+ from tqdm import tqdm
603
+ import json
604
+
605
+ print(f'Building dynamic mmap cache for {self.n} images...')
606
+ # We don't know total size up front -> write sequentially
607
+ offsets = np.zeros(self.n, dtype=np.int64)
608
+ shapes = np.zeros((self.n, 3), dtype=np.int64)
609
+
610
+ batch_size = 4096
611
+ num_batches = (self.n + batch_size - 1) // batch_size
612
+
613
+ current_offset = 0 # in elements, not bytes
614
+
615
+ with open(self.mmap_path, 'wb') as f, tqdm(
616
+ total=self.n, desc='Processing images (dynamic)', unit='img'
617
+ ) as pbar:
618
+ for batch_idx in range(num_batches):
619
+ start_idx = batch_idx * batch_size
620
+ end_idx = min(start_idx + batch_size, self.n)
621
+ batch_paths = self.img_paths[start_idx:end_idx]
622
+
623
+ images_dict = read_images(
624
+ batch_paths,
625
+ hw=None, # keep original size
626
+ batch_size=32,
627
+ num_threads=max(1, cpu_count() - 1),
628
+ )
629
+
630
+ for local_idx, path in enumerate(batch_paths):
631
+ global_idx = start_idx + local_idx
632
+ img = images_dict.get(path)
633
+
634
+ if img is None:
635
+ if self.safe:
636
+ raise ValueError(f'Failed to load image: {path}')
637
+ else:
638
+ print(
639
+ f'Warning: Failed to load {path}, storing 1x1x3 zeros'
640
+ )
641
+ img = np.zeros((1, 1, 3), dtype=self.dtype)
642
+
643
+ if img.dtype != self.dtype:
644
+ img = img.astype(self.dtype)
645
+
646
+ if img.ndim != 3:
647
+ raise ValueError(
648
+ f'Expected image with 3 dims (H,W,C), got shape {img.shape} '
649
+ f'for path {path}'
650
+ )
651
+
652
+ h, w, c = img.shape
653
+ shapes[global_idx] = (h, w, c)
654
+ offsets[global_idx] = current_offset
655
+
656
+ flat = img.reshape(-1)
657
+ f.write(flat.tobytes())
658
+
659
+ current_offset += flat.size
660
+ pbar.update(1)
661
+
662
+ total_elems = int(current_offset)
663
+ self.total_elems = total_elems
664
+
665
+ meta = {
666
+ 'version': 1,
667
+ 'dtype': self.dtype.name,
668
+ 'n': self.n,
669
+ 'paths': self.img_paths,
670
+ 'offsets': offsets.tolist(),
671
+ 'shapes': shapes.tolist(),
672
+ 'total_elems': total_elems,
673
+ }
674
+
675
+ with open(self.meta_path, 'w') as mf:
676
+ json.dump(meta, mf)
677
+
678
+ self.hash_path.write_text(current_hash)
679
+ print(
680
+ f'Dynamic mmap cache built successfully! '
681
+ f'Meta saved to {self.meta_path}, total_elems={total_elems}'
682
+ )
683
+
684
+ # ------------------------------------------------------------------ #
685
+ # Metadata loader
686
+ # ------------------------------------------------------------------ #
687
+ def _load_metadata(self) -> None:
688
+ import json
689
+
690
+ with open(self.meta_path, 'r') as f:
691
+ meta = json.load(f)
692
+
693
+ # If paths order changed without hash mismatch, this will still keep
694
+ # the meta-consistent order (but hash comparison should prevent that).
695
+ self.img_paths = [str(p) for p in meta['paths']]
696
+ self.imgpath2idx = {p: i for i, p in enumerate(self.img_paths)}
697
+ self.n = int(meta['n'])
698
+ self.dtype = np.dtype(meta.get('dtype', 'uint8'))
699
+ self.offsets = np.asarray(meta['offsets'], dtype=np.int64)
700
+ self.shapes = np.asarray(meta['shapes'], dtype=np.int64)
701
+ self.total_elems = int(meta['total_elems'])
702
+
703
+ assert len(self.offsets) == self.n
704
+ assert self.shapes.shape == (self.n, 3)
705
+
706
+ # ------------------------------------------------------------------ #
707
+ # Dataset API
708
+ # ------------------------------------------------------------------ #
709
+ def __len__(self) -> int:
710
+ return self.n
711
+
712
+ def _get_flat_slice(self, idx: int) -> np.ndarray:
713
+ """Return flat view for image idx (no copy)."""
714
+ offset = int(self.offsets[idx])
715
+ h, w, c = [int(x) for x in self.shapes[idx]]
716
+ num_elems = h * w * c
717
+ flat = self.data[offset : offset + num_elems]
718
+ return flat, h, w, c
719
+
720
+ def __getitem__(self, idx: int) -> np.ndarray:
721
+ flat, h, w, c = self._get_flat_slice(idx)
722
+ img = np.array(flat).reshape(h, w, c) # copy to normal ndarray
723
+ return img
724
+
725
+ def imread(self, image_path: str | os.PathLike) -> np.ndarray:
726
+ idx = self.imgpath2idx.get(str(image_path))
727
+ if idx is None:
728
+ raise ValueError(f'Image path {image_path} not found in dynamic dataset')
729
+ img = self[idx]
730
+ if self.safe:
731
+ summary = img.sum()
732
+ assert summary > 0, f'Image at {image_path} appears to be all zeros'
733
+ return img
469
734
 
470
- __all__ = ['read_images', 'read_images_cpu', 'read_images_gpu', 'ImageMmap']
735
+ __all__ = ['read_images', 'read_images_cpu', 'read_images_gpu', 'ImageMmap', 'ImageMmapDynamic']