tfd-utils 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tfd_utils-0.1.0/.github/workflows/publish.yml +28 -0
- tfd_utils-0.1.0/.gitignore +10 -0
- tfd_utils-0.1.0/.python-version +1 -0
- tfd_utils-0.1.0/PKG-INFO +90 -0
- tfd_utils-0.1.0/README.md +77 -0
- tfd_utils-0.1.0/pyproject.toml +23 -0
- tfd_utils-0.1.0/src/tfd_utils/README.md +168 -0
- tfd_utils-0.1.0/src/tfd_utils/__init__.py +6 -0
- tfd_utils-0.1.0/src/tfd_utils/py.typed +0 -0
- tfd_utils-0.1.0/src/tfd_utils/random_access.py +306 -0
- tfd_utils-0.1.0/tests/__init__.py +5 -0
- tfd_utils-0.1.0/tests/conftest.py +21 -0
- tfd_utils-0.1.0/tests/generate_test_data.py +146 -0
- tfd_utils-0.1.0/tests/integration_test.py +112 -0
- tfd_utils-0.1.0/tests/test_random_access.py +245 -0
- tfd_utils-0.1.0/uv.lock +1025 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build-and-publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
permissions:
|
|
11
|
+
id-token: write # For trusted publishing
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Install uv
|
|
17
|
+
uses: astral-sh/setup-uv@v2
|
|
18
|
+
with:
|
|
19
|
+
version: "latest"
|
|
20
|
+
|
|
21
|
+
- name: Set up Python
|
|
22
|
+
run: uv python install 3.10
|
|
23
|
+
|
|
24
|
+
- name: Build package
|
|
25
|
+
run: uv build
|
|
26
|
+
|
|
27
|
+
- name: Publish to PyPI
|
|
28
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.10
|
tfd_utils-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tfd-utils
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: TensorFlow utilities for efficient TFRecord processing and random access
|
|
5
|
+
Author-email: Haobo Yuan <haoboyuan@ucmerced.edu>
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: tensorflow-cpu>=2.13.0
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: pillow>=9.0.0; extra == 'dev'
|
|
10
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
11
|
+
Requires-Dist: requests>=2.25.0; extra == 'dev'
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# TFD Utils
|
|
15
|
+
|
|
16
|
+
A Python library for efficient TensorFlow TFRecord processing and random access.
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- **Random Access to TFRecord Files**: Efficiently access specific records in TFRecord files without reading the entire file
|
|
21
|
+
- **Automatic Index Caching**: Builds and caches an index on first access for fast subsequent lookups
|
|
22
|
+
- **Multiple File Support**: Handle single files, lists of files, or glob patterns
|
|
23
|
+
- **Flexible Key Types**: Support for string, integer, and float keys
|
|
24
|
+
- **Memory Efficient**: Only loads requested records into memory
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from tfd_utils.random_access import TFRecordRandomAccess
|
|
30
|
+
|
|
31
|
+
# Initialize with a single file
|
|
32
|
+
reader = TFRecordRandomAccess("path/to/your/file.tfrecord")
|
|
33
|
+
|
|
34
|
+
# Or with multiple files
|
|
35
|
+
reader = TFRecordRandomAccess([
|
|
36
|
+
"path/to/file1.tfrecord",
|
|
37
|
+
"path/to/file2.tfrecord"
|
|
38
|
+
])
|
|
39
|
+
|
|
40
|
+
# Or with a glob pattern
|
|
41
|
+
reader = TFRecordRandomAccess("path/to/data_*.tfrecord")
|
|
42
|
+
|
|
43
|
+
# Get a record by key
|
|
44
|
+
record = reader.get_record("your_key")
|
|
45
|
+
|
|
46
|
+
# Get a specific feature from a record
|
|
47
|
+
image_bytes = reader.get_feature("your_key", "image")
|
|
48
|
+
|
|
49
|
+
# Check if key exists
|
|
50
|
+
if "your_key" in reader:
|
|
51
|
+
print("Key exists!")
|
|
52
|
+
|
|
53
|
+
# Get statistics
|
|
54
|
+
stats = reader.get_stats()
|
|
55
|
+
print(f"Total records: {stats['total_records']}")
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Advanced Usage
|
|
59
|
+
|
|
60
|
+
### Custom Key Feature
|
|
61
|
+
|
|
62
|
+
By default, the library looks for keys in a feature named 'key'. You can specify a different feature name:
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
# Use 'id' feature as the key
|
|
66
|
+
reader = TFRecordRandomAccess("file.tfrecord", key_feature_name="id")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Custom Index File
|
|
70
|
+
|
|
71
|
+
You can specify where to save the index cache:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
reader = TFRecordRandomAccess(
|
|
75
|
+
"file.tfrecord",
|
|
76
|
+
index_file="my_custom_index.cache"
|
|
77
|
+
)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Rebuilding Index
|
|
81
|
+
|
|
82
|
+
If your TFRecord files change, you can rebuild the index:
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
reader.rebuild_index()
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## License
|
|
89
|
+
|
|
90
|
+
MIT License
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# TFD Utils
|
|
2
|
+
|
|
3
|
+
A Python library for efficient TensorFlow TFRecord processing and random access.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Random Access to TFRecord Files**: Efficiently access specific records in TFRecord files without reading the entire file
|
|
8
|
+
- **Automatic Index Caching**: Builds and caches an index on first access for fast subsequent lookups
|
|
9
|
+
- **Multiple File Support**: Handle single files, lists of files, or glob patterns
|
|
10
|
+
- **Flexible Key Types**: Support for string, integer, and float keys
|
|
11
|
+
- **Memory Efficient**: Only loads requested records into memory
|
|
12
|
+
|
|
13
|
+
## Quick Start
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from tfd_utils.random_access import TFRecordRandomAccess
|
|
17
|
+
|
|
18
|
+
# Initialize with a single file
|
|
19
|
+
reader = TFRecordRandomAccess("path/to/your/file.tfrecord")
|
|
20
|
+
|
|
21
|
+
# Or with multiple files
|
|
22
|
+
reader = TFRecordRandomAccess([
|
|
23
|
+
"path/to/file1.tfrecord",
|
|
24
|
+
"path/to/file2.tfrecord"
|
|
25
|
+
])
|
|
26
|
+
|
|
27
|
+
# Or with a glob pattern
|
|
28
|
+
reader = TFRecordRandomAccess("path/to/data_*.tfrecord")
|
|
29
|
+
|
|
30
|
+
# Get a record by key
|
|
31
|
+
record = reader.get_record("your_key")
|
|
32
|
+
|
|
33
|
+
# Get a specific feature from a record
|
|
34
|
+
image_bytes = reader.get_feature("your_key", "image")
|
|
35
|
+
|
|
36
|
+
# Check if key exists
|
|
37
|
+
if "your_key" in reader:
|
|
38
|
+
print("Key exists!")
|
|
39
|
+
|
|
40
|
+
# Get statistics
|
|
41
|
+
stats = reader.get_stats()
|
|
42
|
+
print(f"Total records: {stats['total_records']}")
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Advanced Usage
|
|
46
|
+
|
|
47
|
+
### Custom Key Feature
|
|
48
|
+
|
|
49
|
+
By default, the library looks for keys in a feature named 'key'. You can specify a different feature name:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
# Use 'id' feature as the key
|
|
53
|
+
reader = TFRecordRandomAccess("file.tfrecord", key_feature_name="id")
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Custom Index File
|
|
57
|
+
|
|
58
|
+
You can specify where to save the index cache:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
reader = TFRecordRandomAccess(
|
|
62
|
+
"file.tfrecord",
|
|
63
|
+
index_file="my_custom_index.cache"
|
|
64
|
+
)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Rebuilding Index
|
|
68
|
+
|
|
69
|
+
If your TFRecord files change, you can rebuild the index:
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
reader.rebuild_index()
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## License
|
|
76
|
+
|
|
77
|
+
MIT License
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "tfd-utils"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "TensorFlow utilities for efficient TFRecord processing and random access"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Haobo Yuan", email = "haoboyuan@ucmerced.edu" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"tensorflow-cpu>=2.13.0",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[project.optional-dependencies]
|
|
15
|
+
dev = [
|
|
16
|
+
"pytest>=7.0.0",
|
|
17
|
+
"requests>=2.25.0",
|
|
18
|
+
"Pillow>=9.0.0",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["hatchling"]
|
|
23
|
+
build-backend = "hatchling.build"
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# TFRecord Random Access
|
|
2
|
+
|
|
3
|
+
This module provides the `TFRecordRandomAccess` class for efficient random access to TFRecord files with automatic index caching.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Efficient Random Access**: Build an index once, then access any record by key in O(1) time
|
|
8
|
+
- **Automatic Caching**: Index is built on first access and cached for subsequent uses
|
|
9
|
+
- **Multiple File Support**: Works with single files, lists of files, or glob patterns
|
|
10
|
+
- **Flexible Key Types**: Supports string, integer, and float keys
|
|
11
|
+
- **Progress Tracking**: Shows progress during index building
|
|
12
|
+
- **Memory Efficient**: Only loads the index, not the entire dataset
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
This project is managed with `uv`. Install the required dependencies by running the following from the repository root:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
uv sync
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
### Basic Usage
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from tfd_utils import TFRecordRandomAccess
|
|
28
|
+
|
|
29
|
+
# Create a random access reader
|
|
30
|
+
reader = TFRecordRandomAccess(
|
|
31
|
+
tfrecord_path="/path/to/your/file.tfrecord",
|
|
32
|
+
key_feature_name="key" # Name of the feature containing the record key
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# Access a record by key
|
|
36
|
+
example = reader.get_record("your_key")
|
|
37
|
+
|
|
38
|
+
# Get a specific feature from a record
|
|
39
|
+
image_bytes = reader.get_feature("your_key", "image")
|
|
40
|
+
|
|
41
|
+
# Dictionary-like access
|
|
42
|
+
example = reader["your_key"]
|
|
43
|
+
|
|
44
|
+
# Check if key exists
|
|
45
|
+
if "your_key" in reader:
|
|
46
|
+
print("Key exists!")
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Multiple Files
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
# Using glob pattern
|
|
53
|
+
reader = TFRecordRandomAccess(
|
|
54
|
+
tfrecord_path="/path/to/files/*.tfrecord",
|
|
55
|
+
key_feature_name="key"
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Using list of files
|
|
59
|
+
reader = TFRecordRandomAccess(
|
|
60
|
+
tfrecord_path=[
|
|
61
|
+
"/path/to/file1.tfrecord",
|
|
62
|
+
"/path/to/file2.tfrecord"
|
|
63
|
+
],
|
|
64
|
+
key_feature_name="key"
|
|
65
|
+
)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Custom Index Location
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
reader = TFRecordRandomAccess(
|
|
72
|
+
tfrecord_path="/path/to/your/file.tfrecord",
|
|
73
|
+
key_feature_name="key",
|
|
74
|
+
index_file="/custom/path/to/index.pkl"
|
|
75
|
+
)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Advanced Usage
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
# Get statistics
|
|
82
|
+
stats = reader.get_stats()
|
|
83
|
+
print(f"Total records: {stats['total_records']}")
|
|
84
|
+
print(f"Records per file: {stats['records_per_file']}")
|
|
85
|
+
|
|
86
|
+
# Get all keys
|
|
87
|
+
all_keys = reader.get_keys()
|
|
88
|
+
|
|
89
|
+
# Get raw record bytes
|
|
90
|
+
raw_bytes = reader.get_raw_record("your_key")
|
|
91
|
+
|
|
92
|
+
# Force rebuild index
|
|
93
|
+
reader.rebuild_index()
|
|
94
|
+
|
|
95
|
+
# Get number of records
|
|
96
|
+
num_records = len(reader)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## API Reference
|
|
100
|
+
|
|
101
|
+
### TFRecordRandomAccess
|
|
102
|
+
|
|
103
|
+
#### Constructor
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
TFRecordRandomAccess(
|
|
107
|
+
tfrecord_path: Union[str, Path, List[str], List[Path]],
|
|
108
|
+
key_feature_name: str = 'key',
|
|
109
|
+
index_file: Optional[Union[str, Path]] = None,
|
|
110
|
+
progress_interval: int = 1000
|
|
111
|
+
)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
**Parameters:**
|
|
115
|
+
- `tfrecord_path`: Path to TFRecord file(s). Can be a single file, list of files, or glob pattern.
|
|
116
|
+
- `key_feature_name`: Name of the feature containing the record key (default: 'key')
|
|
117
|
+
- `index_file`: Optional path to save/load the index cache. Auto-generated if None.
|
|
118
|
+
- `progress_interval`: Print progress every N records during indexing (default: 1000)
|
|
119
|
+
|
|
120
|
+
#### Methods
|
|
121
|
+
|
|
122
|
+
- `get_record(key: str) -> Optional[tf.train.Example]`: Get a TFRecord by key
|
|
123
|
+
- `get_raw_record(key: str) -> Optional[bytes]`: Get raw record bytes by key
|
|
124
|
+
- `get_feature(key: str, feature_name: str) -> Optional[Any]`: Get specific feature value
|
|
125
|
+
- `contains_key(key: str) -> bool`: Check if key exists
|
|
126
|
+
- `get_keys() -> List[str]`: Get all available keys
|
|
127
|
+
- `get_stats() -> Dict[str, Any]`: Get statistics about indexed records
|
|
128
|
+
- `rebuild_index() -> None`: Force rebuild the index
|
|
129
|
+
|
|
130
|
+
#### Special Methods
|
|
131
|
+
|
|
132
|
+
- `len(reader)`: Get number of records
|
|
133
|
+
- `key in reader`: Check if key exists
|
|
134
|
+
- `reader[key]`: Get record by key (raises KeyError if not found)
|
|
135
|
+
|
|
136
|
+
## Index File Format
|
|
137
|
+
|
|
138
|
+
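The index is stored as a pickled dictionary with the following structure (because unpickling can execute arbitrary code, only load index files that you created yourself or that come from a trusted source):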
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
{
|
|
142
|
+
"key1": {
|
|
143
|
+
"file": "/path/to/file.tfrecord",
|
|
144
|
+
"offset": 1234,
|
|
145
|
+
"length": 5678
|
|
146
|
+
},
|
|
147
|
+
"key2": {
|
|
148
|
+
"file": "/path/to/file.tfrecord",
|
|
149
|
+
"offset": 5912,
|
|
150
|
+
"length": 2345
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Performance
|
|
156
|
+
|
|
157
|
+
- **Index Building**: O(n) where n is the total number of records
|
|
158
|
+
- **Record Access**: O(1) after index is built
|
|
159
|
+
- **Memory Usage**: Only the index is kept in memory (~50-100 bytes per record)
|
|
160
|
+
|
|
161
|
+
## Examples
|
|
162
|
+
|
|
163
|
+
See `tests/integration_test.py` and `tests/test_random_access.py` for complete examples.
|
|
164
|
+
|
|
165
|
+
## Requirements
|
|
166
|
+
|
|
167
|
+
- Python >= 3.10
|
|
168
|
+
- TensorFlow >= 2.13.0 (installed via the `tensorflow-cpu` package)
|
|
File without changes
|