datashelf-py 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datashelf_py-0.1.0/LICENSE +21 -0
- datashelf_py-0.1.0/PKG-INFO +317 -0
- datashelf_py-0.1.0/README.md +265 -0
- datashelf_py-0.1.0/datashelf/__init__.py +9 -0
- datashelf_py-0.1.0/datashelf/checkout.py +33 -0
- datashelf_py-0.1.0/datashelf/cli.py +240 -0
- datashelf_py-0.1.0/datashelf/core/__init__.py +0 -0
- datashelf_py-0.1.0/datashelf/core/config.py +52 -0
- datashelf_py-0.1.0/datashelf/core/directory.py +46 -0
- datashelf_py-0.1.0/datashelf/core/hashing.py +77 -0
- datashelf_py-0.1.0/datashelf/core/metadata.py +152 -0
- datashelf_py-0.1.0/datashelf/init.py +36 -0
- datashelf_py-0.1.0/datashelf/inspect.py +170 -0
- datashelf_py-0.1.0/datashelf/load.py +93 -0
- datashelf_py-0.1.0/datashelf/save.py +120 -0
- datashelf_py-0.1.0/datashelf_py.egg-info/PKG-INFO +317 -0
- datashelf_py-0.1.0/datashelf_py.egg-info/SOURCES.txt +26 -0
- datashelf_py-0.1.0/datashelf_py.egg-info/dependency_links.txt +1 -0
- datashelf_py-0.1.0/datashelf_py.egg-info/entry_points.txt +2 -0
- datashelf_py-0.1.0/datashelf_py.egg-info/requires.txt +7 -0
- datashelf_py-0.1.0/datashelf_py.egg-info/top_level.txt +1 -0
- datashelf_py-0.1.0/pyproject.toml +57 -0
- datashelf_py-0.1.0/setup.cfg +4 -0
- datashelf_py-0.1.0/tests/test_checkout.py +13 -0
- datashelf_py-0.1.0/tests/test_init copy.py +41 -0
- datashelf_py-0.1.0/tests/test_inspect.py +31 -0
- datashelf_py-0.1.0/tests/test_load copy.py +43 -0
- datashelf_py-0.1.0/tests/test_save copy.py +58 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Rohan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datashelf-py
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A local dataset tracking tool.
|
|
5
|
+
Author-email: Rohan Krishnan <krishnan.rohan@outlook.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 Rohan
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/r0hankrishnan/datashelf
|
|
29
|
+
Project-URL: Repository, https://github.com/r0hankrishnan/datashelf
|
|
30
|
+
Project-URL: Issues, https://github.com/r0hankrishnan/datashelf/issues
|
|
31
|
+
Keywords: data,datasets,tracking,versioning,cli
|
|
32
|
+
Classifier: Development Status :: 3 - Alpha
|
|
33
|
+
Classifier: Intended Audience :: Science/Research
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
39
|
+
Classifier: Operating System :: OS Independent
|
|
40
|
+
Classifier: Topic :: Scientific/Engineering
|
|
41
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
42
|
+
Requires-Python: >=3.10
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
License-File: LICENSE
|
|
45
|
+
Requires-Dist: pandas<3.0,>=1.5
|
|
46
|
+
Requires-Dist: PyYAML<7.0,>=6.0
|
|
47
|
+
Requires-Dist: fastparquet>=2023.1.0
|
|
48
|
+
Requires-Dist: openpyxl<4.0,>=3.0
|
|
49
|
+
Provides-Extra: dev
|
|
50
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
51
|
+
Dynamic: license-file
|
|
52
|
+
|
|
53
|
+
# Datashelf
|
|
54
|
+
|
|
55
|
+

|
|
56
|
+
|
|
57
|
+
Datashelf is a lightweight **local dataset tracking tool** for data science projects.
|
|
58
|
+
|
|
59
|
+
It stores tabular datasets as immutable artifacts, tracks metadata, and lets you retrieve them later by **name or hash**. The goal is to make experiments easier to reproduce without introducing heavy infrastructure.
|
|
60
|
+
|
|
61
|
+
## Example
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
$ datashelf init
|
|
65
|
+
Initialized DataShelf at .datashelf/
|
|
66
|
+
|
|
67
|
+
$ datashelf save data/people.csv people_raw --message "tiny dataset" --tag raw
|
|
68
|
+
Successfully saved 'people_raw' with hash c8a2f8e1
|
|
69
|
+
|
|
70
|
+
$ datashelf list
|
|
71
|
+
|
|
72
|
+
Hash Name Tag Message
|
|
73
|
+
-----------------------------------------
|
|
74
|
+
c8a2f8e1 people_raw raw tiny dataset
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
# Why Datashelf?
|
|
78
|
+
|
|
79
|
+
Many data science workflows struggle with dataset organization:
|
|
80
|
+
|
|
81
|
+
* intermediate datasets get overwritten
|
|
82
|
+
* multiple versions accumulate
|
|
83
|
+
* experiments become difficult to reproduce
|
|
84
|
+
|
|
85
|
+
Instead of accumulating files like:
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
data.csv
|
|
89
|
+
data_clean.csv
|
|
90
|
+
data_final_v2.csv
|
|
91
|
+
data_final_really_final.csv
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Datashelf stores datasets using **content hashes** and maintains a metadata registry so artifacts can always be located again.
|
|
95
|
+
|
|
96
|
+
Key ideas:
|
|
97
|
+
|
|
98
|
+
* **content-addressed storage** (SHA256)
|
|
99
|
+
* **metadata registry** for datasets
|
|
100
|
+
* lookup by **name or hash prefix**
|
|
101
|
+
* **CLI + Python API**
|
|
102
|
+
* **opinionated dataset tags** based on Cookiecutter Data Science
|
|
103
|
+
|
|
104
|
+
It is intentionally **local and lightweight**, designed for individual projects rather than large data pipelines.
|
|
105
|
+
|
|
106
|
+
# Features
|
|
107
|
+
|
|
108
|
+
* Local dataset artifact storage
|
|
109
|
+
* SHA256 content hashing
|
|
110
|
+
* Metadata registry (name, tag, message, timestamp)
|
|
111
|
+
* Lookup by dataset name or hash prefix
|
|
112
|
+
* Optional dataset tags and messages
|
|
113
|
+
* CLI + Python API
|
|
114
|
+
* Automatic normalization to Parquet
|
|
115
|
+
* Duplicate dataset detection
|
|
116
|
+
* Basic unit test coverage
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# Installation
|
|
120
|
+
|
|
121
|
+
Clone the repository and install locally:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
git clone <repo-url>
|
|
125
|
+
cd datashelf
|
|
126
|
+
pip install -e .
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
I am also working on getting it published on PyPI!
|
|
130
|
+
|
|
131
|
+
# Quick Start
|
|
132
|
+
|
|
133
|
+
Initialize a Datashelf repository in your project directory:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
datashelf init
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
This creates a hidden directory used to store artifacts and metadata:
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
.datashelf/
|
|
143
|
+
├── config.yaml
|
|
144
|
+
├── metadata.json
|
|
145
|
+
└── artifacts/
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
# Example Workflow
|
|
149
|
+
|
|
150
|
+
Save a dataset:
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
datashelf save data/people.csv people_raw --message "tiny dataset" --tag raw
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
List stored datasets:
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
datashelf list
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
Inspect metadata:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
datashelf show people_raw
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Load the stored dataset path:
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
datashelf load people_raw
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Load directly into pandas:
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
datashelf load people_raw --df
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Export the artifact to another location:
|
|
181
|
+
|
|
182
|
+
```bash
|
|
183
|
+
datashelf checkout people_raw exports/people.parquet
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
# Python API
|
|
187
|
+
|
|
188
|
+
Datashelf can also be used directly from Python:
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
import datashelf as ds
|
|
192
|
+
|
|
193
|
+
ds.init()
|
|
194
|
+
|
|
195
|
+
ds.save(
|
|
196
|
+
data="data.csv",
|
|
197
|
+
name="training_data",
|
|
198
|
+
message="clean dataset",
|
|
199
|
+
tag="processed"
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
df = ds.load("training_data", to_df=True)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
# Architecture
|
|
206
|
+
|
|
207
|
+
Datashelf separates **user commands** from **internal system services**.
|
|
208
|
+
|
|
209
|
+
```
|
|
210
|
+
User / CLI
|
|
211
|
+
│
|
|
212
|
+
▼
|
|
213
|
+
Command Layer
|
|
214
|
+
(init, save, load, inspect, checkout)
|
|
215
|
+
│
|
|
216
|
+
▼
|
|
217
|
+
Core Services
|
|
218
|
+
(directory, hashing, metadata, config)
|
|
219
|
+
│
|
|
220
|
+
▼
|
|
221
|
+
.datashelf/
|
|
222
|
+
artifacts + metadata registry
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Command Layer
|
|
226
|
+
|
|
227
|
+
Handles user workflows such as saving, loading, inspecting, and exporting datasets.
|
|
228
|
+
|
|
229
|
+
### Core Layer
|
|
230
|
+
|
|
231
|
+
Implements internal functionality including:
|
|
232
|
+
|
|
233
|
+
* content hashing
|
|
234
|
+
* metadata management
|
|
235
|
+
* artifact storage
|
|
236
|
+
* configuration management
|
|
237
|
+
|
|
238
|
+
This separation keeps command modules simple and makes core logic easier to test and maintain.
|
|
239
|
+
|
|
240
|
+
# How Artifacts Are Stored
|
|
241
|
+
|
|
242
|
+
Datasets are stored using their **SHA256 hash**:
|
|
243
|
+
|
|
244
|
+
```
|
|
245
|
+
.datashelf/artifacts/<hash>.parquet
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
Metadata is stored in a registry:
|
|
249
|
+
|
|
250
|
+
```json
|
|
251
|
+
{
|
|
252
|
+
"file_hash": "c8a2f8e1...",
|
|
253
|
+
"name": "people_raw",
|
|
254
|
+
"tag": "raw",
|
|
255
|
+
"message": "tiny dataset",
|
|
256
|
+
"stored_path": "artifacts/c8a2f8e1.parquet",
|
|
257
|
+
"datetime_added": "2026-03-10T12:30:00"
|
|
258
|
+
}
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
This ensures datasets can always be referenced reliably.
|
|
262
|
+
|
|
263
|
+
# Comparison
|
|
264
|
+
|
|
265
|
+
Datashelf focuses on **simple, local dataset tracking**.
|
|
266
|
+
|
|
267
|
+
| Tool | Purpose |
|
|
268
|
+
| --------- | --------------------------------------------------- |
|
|
269
|
+
| DataShelf | Lightweight local dataset tracking for tabular data |
|
|
270
|
+
| DVC | Full data version control with remote storage |
|
|
271
|
+
| Git LFS | Large file versioning inside Git |
|
|
272
|
+
|
|
273
|
+
Datashelf intentionally avoids:
|
|
274
|
+
|
|
275
|
+
* Git integration
|
|
276
|
+
* remote storage
|
|
277
|
+
* pipeline orchestration
|
|
278
|
+
|
|
279
|
+
This keeps the tool simple and easy to use for smaller data science projects.
|
|
280
|
+
|
|
281
|
+
# Running Tests
|
|
282
|
+
|
|
283
|
+
Run tests with:
|
|
284
|
+
|
|
285
|
+
```bash
|
|
286
|
+
pytest
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
Tests cover repository initialization, dataset saving, loading, metadata inspection, and artifact checkout.
|
|
290
|
+
|
|
291
|
+
# Future Work
|
|
292
|
+
|
|
293
|
+
Possible extensions include:
|
|
294
|
+
|
|
295
|
+
* dataset diffing
|
|
296
|
+
* experiment tracking
|
|
297
|
+
* dataset lineage tracking
|
|
298
|
+
* remote artifact storage
|
|
299
|
+
* richer filtering and search
|
|
300
|
+
|
|
301
|
+
# License
|
|
302
|
+
|
|
303
|
+
MIT License.
|
|
304
|
+
|
|
305
|
+
# About This Project
|
|
306
|
+
|
|
307
|
+
Datashelf was built as a personal project to **make something that I thought would be useful in my day-to-day work at school**.
|
|
308
|
+
|
|
309
|
+
The project demonstrates:
|
|
310
|
+
|
|
311
|
+
* CLI tool development
|
|
312
|
+
* artifact-based dataset management
|
|
313
|
+
* modular Python package architecture
|
|
314
|
+
* reproducible data pipelines
|
|
315
|
+
* test-driven development
|
|
316
|
+
|
|
317
|
+
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
# Datashelf
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
Datashelf is a lightweight **local dataset tracking tool** for data science projects.
|
|
6
|
+
|
|
7
|
+
It stores tabular datasets as immutable artifacts, tracks metadata, and lets you retrieve them later by **name or hash**. The goal is to make experiments easier to reproduce without introducing heavy infrastructure.
|
|
8
|
+
|
|
9
|
+
## Example
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
$ datashelf init
|
|
13
|
+
Initialized DataShelf at .datashelf/
|
|
14
|
+
|
|
15
|
+
$ datashelf save data/people.csv people_raw --message "tiny dataset" --tag raw
|
|
16
|
+
Successfully saved 'people_raw' with hash c8a2f8e1
|
|
17
|
+
|
|
18
|
+
$ datashelf list
|
|
19
|
+
|
|
20
|
+
Hash Name Tag Message
|
|
21
|
+
-----------------------------------------
|
|
22
|
+
c8a2f8e1 people_raw raw tiny dataset
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
# Why Datashelf?
|
|
26
|
+
|
|
27
|
+
Many data science workflows struggle with dataset organization:
|
|
28
|
+
|
|
29
|
+
* intermediate datasets get overwritten
|
|
30
|
+
* multiple versions accumulate
|
|
31
|
+
* experiments become difficult to reproduce
|
|
32
|
+
|
|
33
|
+
Instead of accumulating files like:
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
data.csv
|
|
37
|
+
data_clean.csv
|
|
38
|
+
data_final_v2.csv
|
|
39
|
+
data_final_really_final.csv
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Datashelf stores datasets using **content hashes** and maintains a metadata registry so artifacts can always be located again.
|
|
43
|
+
|
|
44
|
+
Key ideas:
|
|
45
|
+
|
|
46
|
+
* **content-addressed storage** (SHA256)
|
|
47
|
+
* **metadata registry** for datasets
|
|
48
|
+
* lookup by **name or hash prefix**
|
|
49
|
+
* **CLI + Python API**
|
|
50
|
+
* **opinionated dataset tags** based on Cookiecutter Data Science
|
|
51
|
+
|
|
52
|
+
It is intentionally **local and lightweight**, designed for individual projects rather than large data pipelines.
|
|
53
|
+
|
|
54
|
+
# Features
|
|
55
|
+
|
|
56
|
+
* Local dataset artifact storage
|
|
57
|
+
* SHA256 content hashing
|
|
58
|
+
* Metadata registry (name, tag, message, timestamp)
|
|
59
|
+
* Lookup by dataset name or hash prefix
|
|
60
|
+
* Optional dataset tags and messages
|
|
61
|
+
* CLI + Python API
|
|
62
|
+
* Automatic normalization to Parquet
|
|
63
|
+
* Duplicate dataset detection
|
|
64
|
+
* Basic unit test coverage
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Installation
|
|
68
|
+
|
|
69
|
+
Clone the repository and install locally:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
git clone <repo-url>
|
|
73
|
+
cd datashelf
|
|
74
|
+
pip install -e .
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
I am also working on getting it published on PyPI!
|
|
78
|
+
|
|
79
|
+
# Quick Start
|
|
80
|
+
|
|
81
|
+
Initialize a Datashelf repository in your project directory:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
datashelf init
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
This creates a hidden directory used to store artifacts and metadata:
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
.datashelf/
|
|
91
|
+
├── config.yaml
|
|
92
|
+
├── metadata.json
|
|
93
|
+
└── artifacts/
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
# Example Workflow
|
|
97
|
+
|
|
98
|
+
Save a dataset:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
datashelf save data/people.csv people_raw --message "tiny dataset" --tag raw
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
List stored datasets:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
datashelf list
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Inspect metadata:
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
datashelf show people_raw
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Load the stored dataset path:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
datashelf load people_raw
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Load directly into pandas:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
datashelf load people_raw --df
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Export the artifact to another location:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
datashelf checkout people_raw exports/people.parquet
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
# Python API
|
|
135
|
+
|
|
136
|
+
Datashelf can also be used directly from Python:
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
import datashelf as ds
|
|
140
|
+
|
|
141
|
+
ds.init()
|
|
142
|
+
|
|
143
|
+
ds.save(
|
|
144
|
+
data="data.csv",
|
|
145
|
+
name="training_data",
|
|
146
|
+
message="clean dataset",
|
|
147
|
+
tag="processed"
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
df = ds.load("training_data", to_df=True)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
# Architecture
|
|
154
|
+
|
|
155
|
+
Datashelf separates **user commands** from **internal system services**.
|
|
156
|
+
|
|
157
|
+
```
|
|
158
|
+
User / CLI
|
|
159
|
+
│
|
|
160
|
+
▼
|
|
161
|
+
Command Layer
|
|
162
|
+
(init, save, load, inspect, checkout)
|
|
163
|
+
│
|
|
164
|
+
▼
|
|
165
|
+
Core Services
|
|
166
|
+
(directory, hashing, metadata, config)
|
|
167
|
+
│
|
|
168
|
+
▼
|
|
169
|
+
.datashelf/
|
|
170
|
+
artifacts + metadata registry
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Command Layer
|
|
174
|
+
|
|
175
|
+
Handles user workflows such as saving, loading, inspecting, and exporting datasets.
|
|
176
|
+
|
|
177
|
+
### Core Layer
|
|
178
|
+
|
|
179
|
+
Implements internal functionality including:
|
|
180
|
+
|
|
181
|
+
* content hashing
|
|
182
|
+
* metadata management
|
|
183
|
+
* artifact storage
|
|
184
|
+
* configuration management
|
|
185
|
+
|
|
186
|
+
This separation keeps command modules simple and makes core logic easier to test and maintain.
|
|
187
|
+
|
|
188
|
+
# How Artifacts Are Stored
|
|
189
|
+
|
|
190
|
+
Datasets are stored using their **SHA256 hash**:
|
|
191
|
+
|
|
192
|
+
```
|
|
193
|
+
.datashelf/artifacts/<hash>.parquet
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Metadata is stored in a registry:
|
|
197
|
+
|
|
198
|
+
```json
|
|
199
|
+
{
|
|
200
|
+
"file_hash": "c8a2f8e1...",
|
|
201
|
+
"name": "people_raw",
|
|
202
|
+
"tag": "raw",
|
|
203
|
+
"message": "tiny dataset",
|
|
204
|
+
"stored_path": "artifacts/c8a2f8e1.parquet",
|
|
205
|
+
"datetime_added": "2026-03-10T12:30:00"
|
|
206
|
+
}
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
This ensures datasets can always be referenced reliably.
|
|
210
|
+
|
|
211
|
+
# Comparison
|
|
212
|
+
|
|
213
|
+
Datashelf focuses on **simple, local dataset tracking**.
|
|
214
|
+
|
|
215
|
+
| Tool | Purpose |
|
|
216
|
+
| --------- | --------------------------------------------------- |
|
|
217
|
+
| DataShelf | Lightweight local dataset tracking for tabular data |
|
|
218
|
+
| DVC | Full data version control with remote storage |
|
|
219
|
+
| Git LFS | Large file versioning inside Git |
|
|
220
|
+
|
|
221
|
+
Datashelf intentionally avoids:
|
|
222
|
+
|
|
223
|
+
* Git integration
|
|
224
|
+
* remote storage
|
|
225
|
+
* pipeline orchestration
|
|
226
|
+
|
|
227
|
+
This keeps the tool simple and easy to use for smaller data science projects.
|
|
228
|
+
|
|
229
|
+
# Running Tests
|
|
230
|
+
|
|
231
|
+
Run tests with:
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
pytest
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
Tests cover repository initialization, dataset saving, loading, metadata inspection, and artifact checkout.
|
|
238
|
+
|
|
239
|
+
# Future Work
|
|
240
|
+
|
|
241
|
+
Possible extensions include:
|
|
242
|
+
|
|
243
|
+
* dataset diffing
|
|
244
|
+
* experiment tracking
|
|
245
|
+
* dataset lineage tracking
|
|
246
|
+
* remote artifact storage
|
|
247
|
+
* richer filtering and search
|
|
248
|
+
|
|
249
|
+
# License
|
|
250
|
+
|
|
251
|
+
MIT License.
|
|
252
|
+
|
|
253
|
+
# About This Project
|
|
254
|
+
|
|
255
|
+
Datashelf was built as a personal project to **make something that I thought would be useful in my day-to-day work at school**.
|
|
256
|
+
|
|
257
|
+
The project demonstrates:
|
|
258
|
+
|
|
259
|
+
* CLI tool development
|
|
260
|
+
* artifact-based dataset management
|
|
261
|
+
* modular Python package architecture
|
|
262
|
+
* reproducible data pipelines
|
|
263
|
+
* test-driven development
|
|
264
|
+
|
|
265
|
+
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from datashelf.load import load
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def checkout(lookup_key: str, dest: str | Path) -> Path:
    """Export a tracked artifact out of the datashelf store.

    Resolves *lookup_key* to a stored artifact file, then copies that
    file to *dest*, creating any missing parent directories first.

    Args:
        lookup_key (str): Dataset name, full hash, or unique hash prefix.
        dest (str | Path): Destination file path to copy the artifact to.

    Raises:
        TypeError: If the destination file does not have a .parquet suffix.
        FileExistsError: If the destination file already exists.

    Returns:
        Path: The path to the copied artifact.
    """
    # Resolve the lookup first, so an unknown key raises before any
    # destination-path validation happens.
    artifact = load(lookup_key=lookup_key)

    target = Path(dest).resolve()

    # Artifacts are stored as parquet, so only .parquet destinations are valid.
    if target.suffix != ".parquet":
        raise TypeError(f"{dest} is invalid. Make sure file has .parquet suffix.")

    # Never clobber an existing file at the destination.
    if target.exists():
        raise FileExistsError(f"Destination already exists: {target}")

    target.parent.mkdir(parents=True, exist_ok=True)
    # copy2 preserves file metadata (e.g. timestamps) along with content.
    shutil.copy2(str(artifact), str(target))

    print(f"Checked out artifact to {target}")
    return target