PyPI - tfd-utils - Versions diffs - 0.1.0__py3-none-any.whl - Mend

tfd-utils 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

tfd_utils/README.md +168 -0
tfd_utils/__init__.py +6 -0
tfd_utils/py.typed +0 -0
tfd_utils/random_access.py +306 -0
tfd_utils-0.1.0.dist-info/METADATA +90 -0
tfd_utils-0.1.0.dist-info/RECORD +7 -0
tfd_utils-0.1.0.dist-info/WHEEL +4 -0

tfd_utils/README.md ADDED Viewed

@@ -0,0 +1,168 @@
+# TFRecord Random Access
+This module provides the `TFRecordRandomAccess` class for efficient random access to TFRecord files with automatic index caching.
+## Features
+- **Efficient Random Access**: Build an index once, then access any record by key in O(1) time
+- **Automatic Caching**: Index is built on first access and cached for subsequent uses
+- **Multiple File Support**: Works with single files, lists of files, or glob patterns
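+- **Flexible Key Types**: Supports string, integer, and float key features; integer and float keys are stored in the index as strings, so look them up by their string form (e.g. `"42"`)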
+- **Progress Tracking**: Shows progress during index building
+- **Memory Efficient**: Only loads the index, not the entire dataset
+## Installation
+The package is managed by `uv`. Make sure you have the required dependencies:
+```bash
+uv sync
+```
+## Usage
+### Basic Usage
+```python
+from tfd_utils import TFRecordRandomAccess
+# Create a random access reader
+reader = TFRecordRandomAccess(
+    tfrecord_path="/path/to/your/file.tfrecord",
+    key_feature_name="key"  # Name of the feature containing the record key
+)
+# Access a record by key
+example = reader.get_record("your_key")
+# Get a specific feature from a record
+image_bytes = reader.get_feature("your_key", "image")
+# Dictionary-like access
+example = reader["your_key"]
+# Check if key exists
+if "your_key" in reader:
+    print("Key exists!")
+```
+### Multiple Files
+```python
+# Using glob pattern
+reader = TFRecordRandomAccess(
+    tfrecord_path="/path/to/files/*.tfrecord",
+    key_feature_name="key"
+)
+# Using list of files
+reader = TFRecordRandomAccess(
+    tfrecord_path=[
+        "/path/to/file1.tfrecord",
+        "/path/to/file2.tfrecord"
+    ],
+    key_feature_name="key"
+)
+```
+### Custom Index Location
+```python
+reader = TFRecordRandomAccess(
+    tfrecord_path="/path/to/your/file.tfrecord",
+    key_feature_name="key",
+    index_file="/custom/path/to/index.pkl"
+)
+```
+### Advanced Usage
+```python
+# Get statistics
+stats = reader.get_stats()
+print(f"Total records: {stats['total_records']}")
+print(f"Records per file: {stats['records_per_file']}")
+# Get all keys
+all_keys = reader.get_keys()
+# Get raw record bytes
+raw_bytes = reader.get_raw_record("your_key")
+# Force rebuild index
+reader.rebuild_index()
+# Get number of records
+num_records = len(reader)
+```
+## API Reference
+### TFRecordRandomAccess
+#### Constructor
+```python
+TFRecordRandomAccess(
+    tfrecord_path: Union[str, Path, List[str], List[Path]],
+    key_feature_name: str = 'key',
+    index_file: Optional[Union[str, Path]] = None,
+    progress_interval: int = 1000
+)
+```
+**Parameters:**
+- `tfrecord_path`: Path to TFRecord file(s). Can be a single file, list of files, or glob pattern.
+- `key_feature_name`: Name of the feature containing the record key (default: 'key')
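+- `index_file`: Optional path to save/load the index cache. If None, it is auto-generated next to the first TFRecord file (`<stem>.index` for a single file, `<stem>_unified.index` for multiple files).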
+- `progress_interval`: Print progress every N records during indexing (default: 1000)
+#### Methods
+- `get_record(key: str) -> Optional[tf.train.Example]`: Get a TFRecord by key
+- `get_raw_record(key: str) -> Optional[bytes]`: Get raw record bytes by key
+- `get_feature(key: str, feature_name: str) -> Optional[Any]`: Get the first value of a specific feature (bytes, int, or float); returns `None` if the key or the feature is missing
+- `contains_key(key: str) -> bool`: Check if key exists
+- `get_keys() -> List[str]`: Get all available keys
+- `get_stats() -> Dict[str, Any]`: Get statistics about indexed records
+- `rebuild_index() -> None`: Force rebuild the index
+#### Special Methods
+- `len(reader)`: Get number of records
+- `key in reader`: Check if key exists
+- `reader[key]`: Get record by key (raises KeyError if not found)
+## Index File Format
+The index is stored as a pickled dictionary with the following structure (because it is loaded with `pickle`, only use index files from sources you trust):
+```python
+{
+    "key1": {
+        "file": "/path/to/file.tfrecord",
+        "offset": 1234,
+        "length": 5678
+    },
+    "key2": {
+        "file": "/path/to/file.tfrecord",
+        "offset": 5912,
+        "length": 2345
+    }
+}
+```
+## Performance
+- **Index Building**: O(n) where n is the total number of records
+- **Record Access**: O(1) after index is built
+- **Memory Usage**: Only the index is kept in memory (~50-100 bytes per record)
+## Examples
+See `example_usage.py` and `test_with_experimental_data.py` for complete examples.
+## Requirements
+- Python >= 3.10
+- TensorFlow >= 2.13.0

tfd_utils/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+from .random_access import TFRecordRandomAccess
+def hello() -> str:
+    """Return the package's greeting string."""
+    greeting = "Hello from tfd-utils!"
+    return greeting
+__all__ = ['TFRecordRandomAccess', 'hello']

tfd_utils/py.typed ADDED Viewed

File without changes

tfd_utils/random_access.py ADDED Viewed

@@ -0,0 +1,306 @@
+"""
+TFRecord Random Access Reader
+This module provides a class for efficient random access to TFRecord files.
+It builds an index on first access and caches it for subsequent lookups.
+"""
+import os
+import pickle
+import glob
+import tensorflow as tf
+from typing import Dict, Any, Optional, List, Union
+from pathlib import Path
+class TFRecordRandomAccess:
+    """
+    Random access to TFRecord files through a cached key -> location index.
+    The index maps each record key (always stored as ``str``) to the file that
+    holds the record, the byte offset of the record header and the payload
+    length. It is built lazily on first access, pickled to ``index_file`` and
+    reused afterwards. The cache is not invalidated when the TFRecord files
+    change; call ``rebuild_index()`` in that case.
+    """
+    # TFRecord framing: 8-byte little-endian payload length, 4-byte CRC of the
+    # length, the payload, then a 4-byte CRC of the payload.
+    _LENGTH_BYTES = 8
+    _CRC_BYTES = 4
+    def __init__(self,
+                 tfrecord_path: Union[str, Path, List[str], List[Path]],
+                 key_feature_name: str = 'key',
+                 index_file: Optional[Union[str, Path]] = None,
+                 progress_interval: int = 1000):
+        """
+        Initialize the TFRecord random access reader.
+        Args:
+            tfrecord_path: Path to TFRecord file(s). Can be:
+                - Single file path (str or Path)
+                - List of file paths
+                - Glob pattern (str) for multiple files
+            key_feature_name: Name of the feature containing the record key
+            index_file: Optional path to save/load the index cache. If None,
+                       will be auto-generated based on tfrecord_path
+            progress_interval: Print progress every N records during indexing;
+                       0 disables the progress messages
+        Raises:
+            ValueError: If tfrecord_path does not resolve to any file.
+        """
+        self.key_feature_name = key_feature_name
+        self.progress_interval = progress_interval
+        # Resolve TFRecord files
+        self.tfrecord_files = self._resolve_tfrecord_files(tfrecord_path)
+        if not self.tfrecord_files:
+            raise ValueError(f"No TFRecord files found for path: {tfrecord_path}")
+        # Set up index file path
+        self.index_file = self._get_index_file_path(index_file)
+        # Filled lazily by the ``index`` property
+        self._index: Optional[Dict[str, Dict[str, Any]]] = None
+    def _resolve_tfrecord_files(self, tfrecord_path: Union[str, Path, List[str], List[Path]]) -> List[str]:
+        """
+        Resolve the input path(s) to a sorted list of TFRecord file paths.
+        Each entry that exists on disk is used as-is; any other entry is
+        expanded as a glob pattern (and contributes nothing if it matches no
+        file).
+        """
+        if isinstance(tfrecord_path, (list, tuple)):
+            entries = list(tfrecord_path)
+        else:
+            entries = [tfrecord_path]
+        files: List[str] = []
+        for entry in entries:
+            path_str = str(entry)
+            if os.path.exists(path_str):
+                files.append(path_str)
+            else:
+                # Try as glob pattern
+                files.extend(glob.glob(path_str))
+        return sorted(files)
+    def _get_index_file_path(self, index_file: Optional[Union[str, Path]]) -> str:
+        """
+        Return the index cache path, generating one if none was provided.
+        The generated path sits next to the first TFRecord file:
+        ``<stem>.index`` for a single file, ``<stem>_unified.index`` otherwise.
+        """
+        if index_file is not None:
+            return str(index_file)
+        first_file = Path(self.tfrecord_files[0])
+        if len(self.tfrecord_files) == 1:
+            return str(first_file.with_suffix('.index'))
+        return str(first_file.parent / f"{first_file.stem}_unified.index")
+    @classmethod
+    def _read_framed_record(cls, f: Any) -> Optional[bytes]:
+        """
+        Read one record payload from the current position of a binary file.
+        Both CRC fields are skipped, not verified.
+        Args:
+            f: Binary file object positioned at the start of a record header
+        Returns:
+            The payload bytes, or None when the file is already at its end
+        Raises:
+            EOFError: If the file ends in the middle of the header or payload
+        """
+        len_bytes = f.read(cls._LENGTH_BYTES)
+        if not len_bytes:
+            return None
+        if len(len_bytes) != cls._LENGTH_BYTES:
+            # A partial header would otherwise be decoded as a bogus length
+            raise EOFError("truncated record header")
+        length = int.from_bytes(len_bytes, 'little')
+        # Skip the CRC checksum for the length
+        f.seek(cls._CRC_BYTES, os.SEEK_CUR)
+        record_bytes = f.read(length)
+        if len(record_bytes) != length:
+            raise EOFError(f"truncated record data (expected {length} bytes, got {len(record_bytes)})")
+        # Skip the CRC checksum for the record so the next read starts on a header
+        f.seek(cls._CRC_BYTES, os.SEEK_CUR)
+        return record_bytes
+    @staticmethod
+    def _first_feature_value(feature: Any) -> Optional[Any]:
+        """Return the first bytes/int64/float value of a feature, or None if it holds no value."""
+        if feature.bytes_list.value:
+            return feature.bytes_list.value[0]
+        if feature.int64_list.value:
+            return feature.int64_list.value[0]
+        if feature.float_list.value:
+            return feature.float_list.value[0]
+        return None
+    def _extract_key(self, record_bytes: bytes) -> str:
+        """
+        Parse a serialized example and return its key as a string.
+        Bytes keys are decoded as UTF-8; integer and float keys are converted
+        with ``str()``.
+        Raises:
+            ValueError: If the key feature is missing or holds no value
+        """
+        features = tf.train.Example.FromString(record_bytes).features.feature
+        if self.key_feature_name not in features:
+            raise ValueError(f"Feature '{self.key_feature_name}' not found in record")
+        value = self._first_feature_value(features[self.key_feature_name])
+        if value is None:
+            raise ValueError(f"Unsupported feature type for key: {self.key_feature_name}")
+        return value.decode('utf-8') if isinstance(value, bytes) else str(value)
+    def _build_index(self) -> Dict[str, Dict[str, Any]]:
+        """
+        Scan every TFRecord file, build the key index and save it to the cache.
+        Indexing is best effort: the first unreadable record in a file is
+        reported and the rest of that file is skipped. If a key occurs more
+        than once, the last occurrence wins.
+        Returns:
+            Mapping of key to ``{'file', 'offset', 'length'}``
+        """
+        print(f"Building index for {len(self.tfrecord_files)} TFRecord file(s)...")
+        index: Dict[str, Dict[str, Any]] = {}
+        total_records = 0
+        for tfrecord_file in self.tfrecord_files:
+            file_name = os.path.basename(tfrecord_file)
+            print(f"Processing {file_name}...")
+            file_records = 0
+            with open(tfrecord_file, 'rb') as f:
+                while True:
+                    offset = f.tell()
+                    try:
+                        record_bytes = self._read_framed_record(f)
+                        if record_bytes is None:
+                            break
+                        key = self._extract_key(record_bytes)
+                    except Exception as e:
+                        # Deliberately broad: a corrupt or unparsable record must not
+                        # abort the whole build, only the remainder of this file.
+                        print(f"Error reading record at offset {offset} in {tfrecord_file}: {e}")
+                        break
+                    # Store file path and offset in the index
+                    index[key] = {
+                        'file': tfrecord_file,
+                        'offset': offset,
+                        'length': len(record_bytes)
+                    }
+                    file_records += 1
+                    total_records += 1
+                    # Guard against progress_interval == 0, which would otherwise
+                    # raise ZeroDivisionError after the first record.
+                    if self.progress_interval and file_records % self.progress_interval == 0:
+                        print(f"  Processed {file_records} records from {file_name}")
+            print(f"  Completed {file_name}: {file_records} records")
+        print(f"Total records indexed: {total_records}")
+        # Write to a temporary file first so an interrupted save never leaves a
+        # truncated cache behind.
+        tmp_path = f"{self.index_file}.tmp"
+        with open(tmp_path, 'wb') as f:
+            pickle.dump(index, f)
+        os.replace(tmp_path, self.index_file)
+        print(f"Index saved to {self.index_file}")
+        return index
+    def _load_index(self) -> Dict[str, Dict[str, Any]]:
+        """
+        Load the index from the cache file, building it if needed.
+        A cache file that cannot be unpickled is rebuilt from the TFRecord
+        files instead of failing.
+        """
+        if not os.path.exists(self.index_file):
+            return self._build_index()
+        print(f"Loading index from {self.index_file}")
+        try:
+            with open(self.index_file, 'rb') as f:
+                # NOTE(security): unpickling can execute arbitrary code; only
+                # point index_file at cache files from a trusted source.
+                return pickle.load(f)
+        except (pickle.UnpicklingError, EOFError) as e:
+            print(f"Index file {self.index_file} is unreadable ({e}); rebuilding")
+            return self._build_index()
+    @property
+    def index(self) -> Dict[str, Dict[str, Any]]:
+        """Get the index, loading or building it on first access."""
+        if self._index is None:
+            self._index = self._load_index()
+        return self._index
+    def _read_record_bytes(self, key: str) -> Optional[bytes]:
+        """
+        Read the serialized payload of the record stored under key.
+        Returns:
+            Payload bytes as stored in the file, None if the key is not indexed
+        Raises:
+            EOFError: If no complete record exists at the indexed offset
+        """
+        record_info = self.index.get(key)
+        if record_info is None:
+            return None
+        tfrecord_file = record_info['file']
+        offset = record_info['offset']
+        with open(tfrecord_file, 'rb') as f:
+            f.seek(offset)
+            try:
+                record_bytes = self._read_framed_record(f)
+            except EOFError as e:
+                raise EOFError(
+                    f"Incomplete record at offset {offset} in {tfrecord_file} ({e}); "
+                    "the index may be stale, call rebuild_index()"
+                ) from e
+        if record_bytes is None:
+            # Offset is at or past the end of the file
+            raise EOFError(
+                f"No record at offset {offset} in {tfrecord_file}; "
+                "the index may be stale, call rebuild_index()"
+            )
+        return record_bytes
+    def get_record(self, key: str) -> Optional[tf.train.Example]:
+        """
+        Get a TFRecord by key.
+        Args:
+            key: The key to lookup
+        Returns:
+            tf.train.Example if found, None otherwise
+        Raises:
+            EOFError: If no complete record exists at the indexed offset
+        """
+        record_bytes = self._read_record_bytes(key)
+        if record_bytes is None:
+            return None
+        return tf.train.Example.FromString(record_bytes)
+    def get_raw_record(self, key: str) -> Optional[bytes]:
+        """
+        Get raw record bytes by key.
+        The bytes are returned exactly as stored in the TFRecord file, without
+        parsing and re-serializing the example.
+        Args:
+            key: The key to lookup
+        Returns:
+            Raw record bytes if found, None otherwise
+        Raises:
+            EOFError: If no complete record exists at the indexed offset
+        """
+        return self._read_record_bytes(key)
+    def get_feature(self, key: str, feature_name: str) -> Optional[Any]:
+        """
+        Get the first value of a specific feature from a record.
+        Args:
+            key: The key to lookup
+            feature_name: Name of the feature to extract
+        Returns:
+            First bytes/int/float value of the feature, or None if the key or
+            the feature is missing or the feature holds no value
+        """
+        example = self.get_record(key)
+        if example is None:
+            return None
+        features = example.features.feature
+        # Check membership first: the lookup below must only run for features
+        # that are present.
+        if feature_name not in features:
+            return None
+        return self._first_feature_value(features[feature_name])
+    def contains_key(self, key: str) -> bool:
+        """Check if a key exists in the index."""
+        return key in self.index
+    def get_keys(self) -> List[str]:
+        """Get all available keys."""
+        return list(self.index)
+    def get_stats(self) -> Dict[str, Any]:
+        """
+        Get statistics about the indexed records.
+        Returns:
+            Dict with 'total_records', 'total_files', 'records_per_file'
+            (keyed by file base name, so files sharing a base name are counted
+            together) and 'index_file'
+        """
+        file_counts: Dict[str, int] = {}
+        for info in self.index.values():
+            file_name = os.path.basename(info['file'])
+            file_counts[file_name] = file_counts.get(file_name, 0) + 1
+        return {
+            'total_records': len(self.index),
+            'total_files': len(self.tfrecord_files),
+            'records_per_file': file_counts,
+            'index_file': self.index_file
+        }
+    def rebuild_index(self) -> None:
+        """Force rebuild the index and overwrite the cache file."""
+        self._index = self._build_index()
+    def __len__(self) -> int:
+        """Return the number of records in the index."""
+        return len(self.index)
+    def __contains__(self, key: str) -> bool:
+        """Check if key exists using 'in' operator."""
+        return self.contains_key(key)
+    def __getitem__(self, key: str) -> tf.train.Example:
+        """Get record using [] operator; raises KeyError if the key is not indexed."""
+        result = self.get_record(key)
+        if result is None:
+            raise KeyError(f"Key '{key}' not found")
+        return result

tfd_utils-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,90 @@
+Metadata-Version: 2.4
+Name: tfd-utils
+Version: 0.1.0
+Summary: TensorFlow utilities for efficient TFRecord processing and random access
+Author-email: Haobo Yuan <haoboyuan@ucmerced.edu>
+Requires-Python: >=3.10
+Requires-Dist: tensorflow-cpu>=2.13.0
+Provides-Extra: dev
+Requires-Dist: pillow>=9.0.0; extra == 'dev'
+Requires-Dist: pytest>=7.0.0; extra == 'dev'
+Requires-Dist: requests>=2.25.0; extra == 'dev'
+Description-Content-Type: text/markdown
+# TFD Utils
+A Python library for efficient TensorFlow TFRecord processing and random access.
+## Features
+- **Random Access to TFRecord Files**: Efficiently access specific records in TFRecord files without reading the entire file
+- **Automatic Index Caching**: Builds and caches an index on first access for fast subsequent lookups
+- **Multiple File Support**: Handle single files, lists of files, or glob patterns
+- **Flexible Key Types**: Support for string, integer, and float key features (integer and float keys are indexed by their string form)
+- **Memory Efficient**: Only loads requested records into memory
+## Quick Start
+```python
+from tfd_utils.random_access import TFRecordRandomAccess
+# Initialize with a single file
+reader = TFRecordRandomAccess("path/to/your/file.tfrecord")
+# Or with multiple files
+reader = TFRecordRandomAccess([
+    "path/to/file1.tfrecord",
+    "path/to/file2.tfrecord"
+])
+# Or with a glob pattern
+reader = TFRecordRandomAccess("path/to/data_*.tfrecord")
+# Get a record by key
+record = reader.get_record("your_key")
+# Get a specific feature from a record
+image_bytes = reader.get_feature("your_key", "image")
+# Check if key exists
+if "your_key" in reader:
+    print("Key exists!")
+# Get statistics
+stats = reader.get_stats()
+print(f"Total records: {stats['total_records']}")
+```
+## Advanced Usage
+### Custom Key Feature
+By default, the library looks for keys in a feature named 'key'. You can specify a different feature name:
+```python
+# Use 'id' feature as the key
+reader = TFRecordRandomAccess("file.tfrecord", key_feature_name="id")
+```
+### Custom Index File
+You can specify where to save the index cache:
+```python
+reader = TFRecordRandomAccess(
+    "file.tfrecord",
+    index_file="my_custom_index.cache"
+)
+```
+### Rebuilding Index
+The cached index is not refreshed automatically. If your TFRecord files change, rebuild the index explicitly:
+```python
+reader.rebuild_index()
+```
+## License
+MIT License

tfd_utils-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+tfd_utils/README.md,sha256=AIOekmH2cAFHRCWe7VsminL-s_9TQHFU5AJsneZnmfE,4265
+tfd_utils/__init__.py,sha256=9q6PcLW0t0QMpPMvlXAKvEDrGTNYSMZA4NINihGXMd8,149
+tfd_utils/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tfd_utils/random_access.py,sha256=95pFx950yjW5CeoqAemblqeP4MoEGXKzPf7i8hviwW4,11601
+tfd_utils-0.1.0.dist-info/METADATA,sha256=cejJLhHYeCApBhi09kZj4muJ0TSb867LVoVRUT2ilEU,2295
+tfd_utils-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+tfd_utils-0.1.0.dist-info/RECORD,,

tfd_utils-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.27.0
+Root-Is-Purelib: true
+Tag: py3-none-any