tfd-utils 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tfd_utils/README.md ADDED
@@ -0,0 +1,168 @@
1
+ # TFRecord Random Access
2
+
3
+ This module provides the `TFRecordRandomAccess` class for efficient random access to TFRecord files with automatic index caching.
4
+
5
+ ## Features
6
+
7
+ - **Efficient Random Access**: Build an index once, then access any record by key in O(1) time
8
+ - **Automatic Caching**: Index is built on first access and cached for subsequent uses
9
+ - **Multiple File Support**: Works with single files, lists of files, or glob patterns
10
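+ - **Flexible Key Types**: Supports string, integer, and float key features (integer and float keys are converted to strings in the index, so look them up by their string form, e.g. `"42"`)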
11
+ - **Progress Tracking**: Shows progress during index building
12
+ - **Memory Efficient**: Only loads the index, not the entire dataset
13
+
14
+ ## Installation
15
+
16
+ The package is managed by `uv`. Make sure you have the required dependencies:
17
+
18
+ ```bash
19
+ uv sync
20
+ ```
21
+
22
+ ## Usage
23
+
24
+ ### Basic Usage
25
+
26
+ ```python
27
+ from tfd_utils import TFRecordRandomAccess
28
+
29
+ # Create a random access reader
30
+ reader = TFRecordRandomAccess(
31
+ tfrecord_path="/path/to/your/file.tfrecord",
32
+ key_feature_name="key" # Name of the feature containing the record key
33
+ )
34
+
35
+ # Access a record by key
36
+ example = reader.get_record("your_key")
37
+
38
+ # Get a specific feature from a record
39
+ image_bytes = reader.get_feature("your_key", "image")
40
+
41
+ # Dictionary-like access
42
+ example = reader["your_key"]
43
+
44
+ # Check if key exists
45
+ if "your_key" in reader:
46
+ print("Key exists!")
47
+ ```
48
+
49
+ ### Multiple Files
50
+
51
+ ```python
52
+ # Using glob pattern
53
+ reader = TFRecordRandomAccess(
54
+ tfrecord_path="/path/to/files/*.tfrecord",
55
+ key_feature_name="key"
56
+ )
57
+
58
+ # Using list of files
59
+ reader = TFRecordRandomAccess(
60
+ tfrecord_path=[
61
+ "/path/to/file1.tfrecord",
62
+ "/path/to/file2.tfrecord"
63
+ ],
64
+ key_feature_name="key"
65
+ )
66
+ ```
67
+
68
+ ### Custom Index Location
69
+
70
+ ```python
71
+ reader = TFRecordRandomAccess(
72
+ tfrecord_path="/path/to/your/file.tfrecord",
73
+ key_feature_name="key",
74
+ index_file="/custom/path/to/index.pkl"
75
+ )
76
+ ```
77
+
78
+ ### Advanced Usage
79
+
80
+ ```python
81
+ # Get statistics
82
+ stats = reader.get_stats()
83
+ print(f"Total records: {stats['total_records']}")
84
+ print(f"Records per file: {stats['records_per_file']}")
85
+
86
+ # Get all keys
87
+ all_keys = reader.get_keys()
88
+
89
+ # Get raw record bytes
90
+ raw_bytes = reader.get_raw_record("your_key")
91
+
92
+ # Force rebuild index
93
+ reader.rebuild_index()
94
+
95
+ # Get number of records
96
+ num_records = len(reader)
97
+ ```
98
+
99
+ ## API Reference
100
+
101
+ ### TFRecordRandomAccess
102
+
103
+ #### Constructor
104
+
105
+ ```python
106
+ TFRecordRandomAccess(
107
+ tfrecord_path: Union[str, Path, List[str], List[Path]],
108
+ key_feature_name: str = 'key',
109
+ index_file: Optional[Union[str, Path]] = None,
110
+ progress_interval: int = 1000
111
+ )
112
+ ```
113
+
114
+ **Parameters:**
115
+ - `tfrecord_path`: Path to TFRecord file(s). Can be a single file, list of files, or glob pattern.
116
+ - `key_feature_name`: Name of the feature containing the record key (default: 'key')
117
+ - `index_file`: Optional path to save/load the index cache. Auto-generated if None.
118
+ - `progress_interval`: Print progress every N records during indexing (default: 1000)
119
+
120
+ #### Methods
121
+
122
+ - `get_record(key: str) -> Optional[tf.train.Example]`: Get a TFRecord by key
123
+ - `get_raw_record(key: str) -> Optional[bytes]`: Get raw record bytes by key
124
+ - `get_feature(key: str, feature_name: str) -> Optional[Any]`: Get the first value of the named feature (`None` if the record or the feature is missing)
125
+ - `contains_key(key: str) -> bool`: Check if key exists
126
+ - `get_keys() -> List[str]`: Get all available keys
127
+ - `get_stats() -> Dict[str, Any]`: Get statistics about indexed records
128
+ - `rebuild_index() -> None`: Force rebuild the index
129
+
130
+ #### Special Methods
131
+
132
+ - `len(reader)`: Get number of records
133
+ - `key in reader`: Check if key exists
134
+ - `reader[key]`: Get record by key (raises KeyError if not found)
135
+
136
+ ## Index File Format
137
+
138
+ The index is stored as a pickled dictionary (only load index files you trust, because unpickling can execute arbitrary code) with the following structure:
139
+
140
+ ```python
141
+ {
142
+ "key1": {
143
+ "file": "/path/to/file.tfrecord",
144
+ "offset": 1234,
145
+ "length": 5678
146
+ },
147
+ "key2": {
148
+ "file": "/path/to/file.tfrecord",
149
+ "offset": 5912,
150
+ "length": 2345
151
+ }
152
+ }
153
+ ```
154
+
155
+ ## Performance
156
+
157
+ - **Index Building**: O(n) where n is the total number of records
158
+ - **Record Access**: O(1) after index is built
159
+ - **Memory Usage**: Only the index is kept in memory (~50-100 bytes per record)
160
+
161
+ ## Examples
162
+
163
+ See `example_usage.py` and `test_with_experimental_data.py` for complete examples.
164
+
165
+ ## Requirements
166
+
167
+ - Python >= 3.10
168
+ - TensorFlow >= 2.13.0
tfd_utils/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .random_access import TFRecordRandomAccess
2
+
3
def hello() -> str:
    """Return the package's fixed greeting string."""
    greeting = "Hello from tfd-utils!"
    return greeting
5
+
6
+ __all__ = ['TFRecordRandomAccess', 'hello']
tfd_utils/py.typed ADDED
File without changes
@@ -0,0 +1,306 @@
1
+ """
2
+ TFRecord Random Access Reader
3
+
4
+ This module provides a class for efficient random access to TFRecord files.
5
+ It builds an index on first access and caches it for subsequent lookups.
6
+ """
7
+
8
+ import os
9
+ import pickle
10
+ import glob
11
+ import tensorflow as tf
12
+ from typing import Dict, Any, Optional, List, Union
13
+ from pathlib import Path
14
+
15
+
16
class TFRecordRandomAccess:
    """
    Random access to TFRecord files through a cached key -> position index.

    The index maps each record key (always stored as a string) to the file
    that holds the record, the byte offset at which the record starts, and
    the length of its payload. It is built lazily on first access and
    pickled to ``index_file`` so later instances can reuse it.

    Each record is read as ``[length][length_crc][data][data_crc]`` with an
    8-byte little-endian length and two 4-byte CRC fields. The CRC fields are
    skipped, not verified.
    """

    # Bytes occupied by the length field plus its CRC, i.e. the distance from
    # a record's start offset to the first byte of its payload.
    _HEADER_BYTES = 8 + 4
    # Bytes occupied by the CRC that follows the payload.
    _FOOTER_BYTES = 4

    def __init__(self,
                 tfrecord_path: Union[str, Path, List[str], List[Path]],
                 key_feature_name: str = 'key',
                 index_file: Optional[Union[str, Path]] = None,
                 progress_interval: int = 1000):
        """
        Initialize the TFRecord random access reader.

        Args:
            tfrecord_path: Path to TFRecord file(s). Can be:
                - Single file path (str or Path)
                - List of file paths
                - Glob pattern (str) for multiple files
            key_feature_name: Name of the feature containing the record key
            index_file: Optional path to save/load the index cache. If None,
                it is derived from the first resolved TFRecord file
            progress_interval: Print progress every N records during
                indexing. Values <= 0 disable progress output.

        Raises:
            ValueError: If no TFRecord file matches ``tfrecord_path``.
        """
        self.key_feature_name = key_feature_name
        self.progress_interval = progress_interval

        # Resolve TFRecord files
        self.tfrecord_files = self._resolve_tfrecord_files(tfrecord_path)
        if not self.tfrecord_files:
            raise ValueError(f"No TFRecord files found for path: {tfrecord_path}")

        # Set up index file path
        self.index_file = self._get_index_file_path(index_file)

        # The index is loaded or built lazily by the ``index`` property.
        self._index: Optional[Dict[str, Dict[str, Any]]] = None

    def _resolve_tfrecord_files(self, tfrecord_path: Union[str, Path, List[str], List[Path]]) -> List[str]:
        """
        Resolve the input path(s) to a sorted list of unique file paths.

        Every entry that exists on disk is used as-is; any other entry is
        expanded as a glob pattern. Duplicates (for example an explicit path
        that is also matched by a pattern) are collapsed so that no file is
        indexed twice.
        """
        if isinstance(tfrecord_path, (list, tuple)):
            candidates = [str(path) for path in tfrecord_path]
        else:
            candidates = [str(tfrecord_path)]

        files = set()
        for path_str in candidates:
            if os.path.exists(path_str):
                files.add(path_str)
            else:
                # Try as glob pattern
                files.update(glob.glob(path_str))
        return sorted(files)

    def _get_index_file_path(self, index_file: Optional[Union[str, Path]]) -> str:
        """
        Return the index cache path, deriving one if none was provided.

        For a single TFRecord file the file's suffix is replaced by
        ``.index``; for several files the cache is placed next to the first
        file and named ``<stem>_unified.index``.
        """
        if index_file is not None:
            return str(index_file)

        first_file = Path(self.tfrecord_files[0])
        if len(self.tfrecord_files) == 1:
            return str(first_file.with_suffix('.index'))
        return str(first_file.parent / f"{first_file.stem}_unified.index")

    def _extract_key(self, record_bytes: bytes) -> str:
        """
        Parse a serialized example and return its key as a string.

        Bytes keys are decoded as UTF-8; integer and float keys are converted
        with ``str``. Only the first value of the key feature is used.

        Raises:
            ValueError: If the key feature is missing or holds no value.
        """
        example = tf.train.Example.FromString(record_bytes)

        if self.key_feature_name not in example.features.feature:
            raise ValueError(f"Feature '{self.key_feature_name}' not found in record")

        feature = example.features.feature[self.key_feature_name]
        if feature.bytes_list.value:
            return feature.bytes_list.value[0].decode('utf-8')
        if feature.int64_list.value:
            return str(feature.int64_list.value[0])
        if feature.float_list.value:
            return str(feature.float_list.value[0])
        raise ValueError(f"Unsupported feature type for key: {self.key_feature_name}")

    def _build_index(self) -> Dict[str, Dict[str, Any]]:
        """
        Scan every TFRecord file, build the key index and save it to disk.

        Scanning a file stops at end of data, at a truncated record, or at
        the first record that cannot be parsed (the error is printed);
        records indexed before that point are kept. If a key occurs more than
        once, the last occurrence wins.

        Returns:
            Mapping of key -> ``{'file', 'offset', 'length'}``.
        """
        print(f"Building index for {len(self.tfrecord_files)} TFRecord file(s)...")

        index: Dict[str, Dict[str, Any]] = {}
        total_records = 0
        interval = self.progress_interval

        for tfrecord_file in self.tfrecord_files:
            file_name = os.path.basename(tfrecord_file)
            print(f"Processing {file_name}...")
            file_records = 0

            with open(tfrecord_file, 'rb') as f:
                while True:
                    offset = f.tell()
                    try:
                        # Read TFRecord format: [length][length_crc][data][data_crc]
                        len_bytes = f.read(8)
                        if len(len_bytes) < 8:
                            # Zero bytes is a clean end of file; 1-7 bytes is
                            # a truncated header that must not be decoded.
                            break

                        length = int.from_bytes(len_bytes, 'little')

                        # Skip the CRC checksum for the length
                        f.seek(4, os.SEEK_CUR)

                        record_bytes = f.read(length)
                        if len(record_bytes) != length:
                            break

                        # Skip the CRC checksum for the record
                        f.seek(self._FOOTER_BYTES, os.SEEK_CUR)

                        key = self._extract_key(record_bytes)
                    except Exception as e:
                        # Best effort: keep what was indexed so far and move
                        # on to the next file.
                        print(f"Error reading record at offset {offset} in {tfrecord_file}: {e}")
                        break

                    index[key] = {
                        'file': tfrecord_file,
                        'offset': offset,
                        'length': length
                    }

                    file_records += 1
                    total_records += 1

                    # Guard against a zero/negative interval, which would
                    # otherwise raise on the modulo and abort indexing.
                    if interval and interval > 0 and file_records % interval == 0:
                        print(f"  Processed {file_records} records from {file_name}")

            print(f"  Completed {file_name}: {file_records} records")

        print(f"Total records indexed: {total_records}")

        # Write to a temporary file first and rename it into place, so an
        # interrupted save never leaves a half-written cache behind.
        tmp_path = f"{self.index_file}.tmp"
        with open(tmp_path, 'wb') as f:
            pickle.dump(index, f)
        os.replace(tmp_path, self.index_file)
        print(f"Index saved to {self.index_file}")

        return index

    def _load_index(self) -> Dict[str, Dict[str, Any]]:
        """
        Load the index from the cache file, building it when necessary.

        The index is rebuilt when the cache file does not exist or cannot be
        unpickled (for example because it is empty or truncated).
        """
        if os.path.exists(self.index_file):
            print(f"Loading index from {self.index_file}")
            try:
                with open(self.index_file, 'rb') as f:
                    # SECURITY: pickle can execute arbitrary code while
                    # loading; only point index_file at trusted files.
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError) as e:
                print(f"Index file {self.index_file} is unreadable ({e}); rebuilding")
        return self._build_index()

    @property
    def index(self) -> Dict[str, Dict[str, Any]]:
        """Get the index, loading or building it on first access."""
        if self._index is None:
            self._index = self._load_index()
        return self._index

    def _read_record_bytes(self, key: str) -> Optional[bytes]:
        """
        Read the serialized payload stored for ``key`` straight from disk.

        Returns:
            The payload bytes, or None if ``key`` is not in the index.

        Raises:
            OSError: If the file holds fewer bytes than the index recorded,
                which means the file changed after the index was built.
        """
        record_info = self.index.get(key)
        if record_info is None:
            return None

        length = record_info['length']
        with open(record_info['file'], 'rb') as f:
            # Jump past the length field and its CRC to the payload itself.
            f.seek(record_info['offset'] + self._HEADER_BYTES)
            record_bytes = f.read(length)

        if len(record_bytes) != length:
            raise OSError(
                f"Truncated record for key '{key}' in {record_info['file']}: "
                f"expected {length} bytes, got {len(record_bytes)}"
            )
        return record_bytes

    def get_record(self, key: str) -> Optional["tf.train.Example"]:
        """
        Get a TFRecord by key.

        Args:
            key: The key to lookup

        Returns:
            tf.train.Example if found, None otherwise

        Raises:
            OSError: If the record on disk is shorter than the index says.
        """
        record_bytes = self._read_record_bytes(key)
        if record_bytes is None:
            return None
        return tf.train.Example.FromString(record_bytes)

    def get_raw_record(self, key: str) -> Optional[bytes]:
        """
        Get raw record bytes by key.

        The bytes are returned exactly as stored in the file, without being
        parsed and re-serialized.

        Args:
            key: The key to lookup

        Returns:
            Raw record bytes if found, None otherwise

        Raises:
            OSError: If the record on disk is shorter than the index says.
        """
        return self._read_record_bytes(key)

    def get_feature(self, key: str, feature_name: str) -> Optional[Any]:
        """
        Get the first value of a specific feature from a record.

        Args:
            key: The key to lookup
            feature_name: Name of the feature to extract

        Returns:
            The first bytes, int64 or float value of the feature; None if the
            record or feature is missing or the feature holds no values
        """
        example = self.get_record(key)
        if example is None:
            return None

        if feature_name not in example.features.feature:
            return None

        feature = example.features.feature[feature_name]

        # Return the appropriate value based on feature type
        if feature.bytes_list.value:
            return feature.bytes_list.value[0]
        if feature.int64_list.value:
            return feature.int64_list.value[0]
        if feature.float_list.value:
            return feature.float_list.value[0]
        return None

    def contains_key(self, key: str) -> bool:
        """Check if a key exists in the index."""
        return key in self.index

    def get_keys(self) -> List[str]:
        """Get all available keys."""
        return list(self.index)

    def get_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the indexed records.

        Returns:
            Dict with 'total_records', 'total_files', 'records_per_file'
            (keyed by file base name) and 'index_file'.
        """
        file_counts: Dict[str, int] = {}
        for info in self.index.values():
            file_name = os.path.basename(info['file'])
            file_counts[file_name] = file_counts.get(file_name, 0) + 1

        return {
            'total_records': len(self.index),
            'total_files': len(self.tfrecord_files),
            'records_per_file': file_counts,
            'index_file': self.index_file
        }

    def rebuild_index(self) -> None:
        """Force a rebuild of the index, replacing any cached index file."""
        self._index = self._build_index()

    def __len__(self) -> int:
        """Return the number of records in the index."""
        return len(self.index)

    def __contains__(self, key: str) -> bool:
        """Check if key exists using 'in' operator."""
        return self.contains_key(key)

    def __getitem__(self, key: str) -> "tf.train.Example":
        """Get record using [] operator; raise KeyError if the key is absent."""
        result = self.get_record(key)
        if result is None:
            raise KeyError(f"Key '{key}' not found")
        return result
@@ -0,0 +1,90 @@
1
+ Metadata-Version: 2.4
2
+ Name: tfd-utils
3
+ Version: 0.1.0
4
+ Summary: TensorFlow utilities for efficient TFRecord processing and random access
5
+ Author-email: Haobo Yuan <haoboyuan@ucmerced.edu>
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: tensorflow-cpu>=2.13.0
8
+ Provides-Extra: dev
9
+ Requires-Dist: pillow>=9.0.0; extra == 'dev'
10
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
11
+ Requires-Dist: requests>=2.25.0; extra == 'dev'
12
+ Description-Content-Type: text/markdown
13
+
14
+ # TFD Utils
15
+
16
+ A Python library for efficient TensorFlow TFRecord processing and random access.
17
+
18
+ ## Features
19
+
20
+ - **Random Access to TFRecord Files**: Efficiently access specific records in TFRecord files without reading the entire file
21
+ - **Automatic Index Caching**: Builds and caches an index on first access for fast subsequent lookups
22
+ - **Multiple File Support**: Handle single files, lists of files, or glob patterns
23
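+ - **Flexible Key Types**: Support for string, integer, and float key features (integer and float keys are converted to strings in the index, so look them up by their string form)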
24
+ - **Memory Efficient**: Only loads requested records into memory
25
+
26
+ ## Quick Start
27
+
28
+ ```python
29
+ from tfd_utils.random_access import TFRecordRandomAccess
30
+
31
+ # Initialize with a single file
32
+ reader = TFRecordRandomAccess("path/to/your/file.tfrecord")
33
+
34
+ # Or with multiple files
35
+ reader = TFRecordRandomAccess([
36
+ "path/to/file1.tfrecord",
37
+ "path/to/file2.tfrecord"
38
+ ])
39
+
40
+ # Or with a glob pattern
41
+ reader = TFRecordRandomAccess("path/to/data_*.tfrecord")
42
+
43
+ # Get a record by key
44
+ record = reader.get_record("your_key")
45
+
46
+ # Get a specific feature from a record
47
+ image_bytes = reader.get_feature("your_key", "image")
48
+
49
+ # Check if key exists
50
+ if "your_key" in reader:
51
+ print("Key exists!")
52
+
53
+ # Get statistics
54
+ stats = reader.get_stats()
55
+ print(f"Total records: {stats['total_records']}")
56
+ ```
57
+
58
+ ## Advanced Usage
59
+
60
+ ### Custom Key Feature
61
+
62
+ By default, the library looks for keys in a feature named 'key'. You can specify a different feature name:
63
+
64
+ ```python
65
+ # Use 'id' feature as the key
66
+ reader = TFRecordRandomAccess("file.tfrecord", key_feature_name="id")
67
+ ```
68
+
69
+ ### Custom Index File
70
+
71
+ You can specify where to save the index cache:
72
+
73
+ ```python
74
+ reader = TFRecordRandomAccess(
75
+ "file.tfrecord",
76
+ index_file="my_custom_index.cache"
77
+ )
78
+ ```
79
+
80
+ ### Rebuilding Index
81
+
82
+ If your TFRecord files change, you can rebuild the index:
83
+
84
+ ```python
85
+ reader.rebuild_index()
86
+ ```
87
+
88
+ ## License
89
+
90
+ MIT License
@@ -0,0 +1,7 @@
1
+ tfd_utils/README.md,sha256=AIOekmH2cAFHRCWe7VsminL-s_9TQHFU5AJsneZnmfE,4265
2
+ tfd_utils/__init__.py,sha256=9q6PcLW0t0QMpPMvlXAKvEDrGTNYSMZA4NINihGXMd8,149
3
+ tfd_utils/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ tfd_utils/random_access.py,sha256=95pFx950yjW5CeoqAemblqeP4MoEGXKzPf7i8hviwW4,11601
5
+ tfd_utils-0.1.0.dist-info/METADATA,sha256=cejJLhHYeCApBhi09kZj4muJ0TSb867LVoVRUT2ilEU,2295
6
+ tfd_utils-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
7
+ tfd_utils-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any