datashelf-py 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Rohan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,317 @@
1
+ Metadata-Version: 2.4
2
+ Name: datashelf-py
3
+ Version: 0.1.0
4
+ Summary: A local dataset tracking tool.
5
+ Author-email: Rohan Krishnan <krishnan.rohan@outlook.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Rohan
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/r0hankrishnan/datashelf
29
+ Project-URL: Repository, https://github.com/r0hankrishnan/datashelf
30
+ Project-URL: Issues, https://github.com/r0hankrishnan/datashelf/issues
31
+ Keywords: data,datasets,tracking,versioning,cli
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Science/Research
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Programming Language :: Python :: 3
37
+ Classifier: Programming Language :: Python :: 3.10
38
+ Classifier: Programming Language :: Python :: 3.11
39
+ Classifier: Operating System :: OS Independent
40
+ Classifier: Topic :: Scientific/Engineering
41
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
42
+ Requires-Python: >=3.10
43
+ Description-Content-Type: text/markdown
44
+ License-File: LICENSE
45
+ Requires-Dist: pandas<3.0,>=1.5
46
+ Requires-Dist: PyYAML<7.0,>=6.0
47
+ Requires-Dist: fastparquet>=2023.1.0
48
+ Requires-Dist: openpyxl<4.0,>=3.0
49
+ Provides-Extra: dev
50
+ Requires-Dist: pytest>=7.0; extra == "dev"
51
+ Dynamic: license-file
52
+
53
+ # Datashelf
54
+
55
+ ![datashelf logo](./assets/ds.svg)
56
+
57
+ Datashelf is a lightweight **local dataset tracking tool** for data science projects.
58
+
59
+ It stores tabular datasets as immutable artifacts, tracks metadata, and lets you retrieve them later by **name or hash**. The goal is to make experiments easier to reproduce without introducing heavy infrastructure.
60
+
61
+ ## Example
62
+
63
+ ```bash
64
+ $ datashelf init
65
+ Initialized DataShelf at .datashelf/
66
+
67
+ $ datashelf save data/people.csv people_raw --message "tiny dataset" --tag raw
68
+ Successfully saved 'people_raw' with hash c8a2f8e1
69
+
70
+ $ datashelf list
71
+
72
+ Hash Name Tag Message
73
+ -----------------------------------------
74
+ c8a2f8e1 people_raw raw tiny dataset
75
+ ```
76
+
77
+ # Why Datashelf?
78
+
79
+ Many data science workflows struggle with dataset organization:
80
+
81
+ * intermediate datasets get overwritten
82
+ * multiple versions accumulate
83
+ * experiments become difficult to reproduce
84
+
85
+ Instead of accumulating files like:
86
+
87
+ ```
88
+ data.csv
89
+ data_clean.csv
90
+ data_final_v2.csv
91
+ data_final_really_final.csv
92
+ ```
93
+
94
+ Datashelf stores datasets using **content hashes** and maintains a metadata registry so artifacts can always be located again.
95
+
96
+ Key ideas:
97
+
98
+ * **content-addressed storage** (SHA256)
99
+ * **metadata registry** for datasets
100
+ * lookup by **name or hash prefix**
101
+ * **CLI + Python API**
102
+ * **opinionated dataset tags** based on Cookiecutter Data Science
103
+
104
+ It is intentionally **local and lightweight**, designed for individual projects rather than large data pipelines.
105
+
106
+ # Features
107
+
108
+ * Local dataset artifact storage
109
+ * SHA256 content hashing
110
+ * Metadata registry (name, tag, message, timestamp)
111
+ * Lookup by dataset name or hash prefix
112
+ * Optional dataset tags and messages
113
+ * CLI + Python API
114
+ * Automatic normalization to Parquet
115
+ * Duplicate dataset detection
116
+ * Basic unit test coverage
117
+
118
+
119
+ # Installation
120
+
121
+ Clone the repository and install locally:
122
+
123
+ ```bash
124
+ git clone <repo-url>
125
+ cd datashelf
126
+ pip install -e .
127
+ ```
128
+
129
+ I am also working on getting it published on PyPI!
130
+
131
+ # Quick Start
132
+
133
+ Initialize a Datashelf repository in your project directory:
134
+
135
+ ```bash
136
+ datashelf init
137
+ ```
138
+
139
+ This creates a hidden directory used to store artifacts and metadata:
140
+
141
+ ```
142
+ .datashelf/
143
+ ├── config.yaml
144
+ ├── metadata.json
145
+ └── artifacts/
146
+ ```
147
+
148
+ # Example Workflow
149
+
150
+ Save a dataset:
151
+
152
+ ```bash
153
+ datashelf save data/people.csv people_raw --message "tiny dataset" --tag raw
154
+ ```
155
+
156
+ List stored datasets:
157
+
158
+ ```bash
159
+ datashelf list
160
+ ```
161
+
162
+ Inspect metadata:
163
+
164
+ ```bash
165
+ datashelf show people_raw
166
+ ```
167
+
168
+ Load the stored dataset path:
169
+
170
+ ```bash
171
+ datashelf load people_raw
172
+ ```
173
+
174
+ Load directly into pandas:
175
+
176
+ ```bash
177
+ datashelf load people_raw --df
178
+ ```
179
+
180
+ Export the artifact to another location:
181
+
182
+ ```bash
183
+ datashelf checkout people_raw exports/people.parquet
184
+ ```
185
+
186
+ # Python API
187
+
188
+ Datashelf can also be used directly from Python:
189
+
190
+ ```python
191
+ import datashelf as ds
192
+
193
+ ds.init()
194
+
195
+ ds.save(
196
+ data="data.csv",
197
+ name="training_data",
198
+ message="clean dataset",
199
+ tag="processed"
200
+ )
201
+
202
+ df = ds.load("training_data", to_df=True)
203
+ ```
204
+
205
+ # Architecture
206
+
207
+ Datashelf separates **user commands** from **internal system services**.
208
+
209
+ ```
210
+ User / CLI
211
+
212
+
213
+ Command Layer
214
+ (init, save, load, inspect, checkout)
215
+
216
+
217
+ Core Services
218
+ (directory, hashing, metadata, config)
219
+
220
+
221
+ .datashelf/
222
+ artifacts + metadata registry
223
+ ```
224
+
225
+ ### Command Layer
226
+
227
+ Handles user workflows such as saving, loading, inspecting, and exporting datasets.
228
+
229
+ ### Core Layer
230
+
231
+ Implements internal functionality including:
232
+
233
+ * content hashing
234
+ * metadata management
235
+ * artifact storage
236
+ * configuration management
237
+
238
+ This separation keeps command modules simple and makes core logic easier to test and maintain.
239
+
240
+ # How Artifacts Are Stored
241
+
242
+ Datasets are stored using their **SHA256 hash**:
243
+
244
+ ```
245
+ .datashelf/artifacts/<hash>.parquet
246
+ ```
247
+
248
+ Metadata is stored in a registry:
249
+
250
+ ```json
251
+ {
252
+ "file_hash": "c8a2f8e1...",
253
+ "name": "people_raw",
254
+ "tag": "raw",
255
+ "message": "tiny dataset",
256
+ "stored_path": "artifacts/c8a2f8e1.parquet",
257
+ "datetime_added": "2026-03-10T12:30:00"
258
+ }
259
+ ```
260
+
261
+ This ensures datasets can always be referenced reliably.
262
+
263
+ # Comparison
264
+
265
+ Datashelf focuses on **simple, local dataset tracking**.
266
+
267
+ | Tool | Purpose |
268
+ | --------- | --------------------------------------------------- |
269
+ | DataShelf | Lightweight local dataset tracking for tabular data |
270
+ | DVC | Full data version control with remote storage |
271
+ | Git LFS | Large file versioning inside Git |
272
+
273
+ Datashelf intentionally avoids:
274
+
275
+ * Git integration
276
+ * remote storage
277
+ * pipeline orchestration
278
+
279
+ This keeps the tool simple and easy to use for smaller data science projects.
280
+
281
+ # Running Tests
282
+
283
+ Run tests with:
284
+
285
+ ```bash
286
+ pytest
287
+ ```
288
+
289
+ Tests cover repository initialization, dataset saving, loading, metadata inspection, and artifact checkout.
290
+
291
+ # Future Work
292
+
293
+ Possible extensions include:
294
+
295
+ * dataset diffing
296
+ * experiment tracking
297
+ * dataset lineage tracking
298
+ * remote artifact storage
299
+ * richer filtering and search
300
+
301
+ # License
302
+
303
+ MIT License.
304
+
305
+ # About This Project
306
+
307
+ Datashelf was built as a personal project to **make something that I thought would be useful in my day-to-day work at school**.
308
+
309
+ The project demonstrates:
310
+
311
+ * CLI tool development
312
+ * artifact-based dataset management
313
+ * modular Python package architecture
314
+ * reproducible data pipelines
315
+ * test-driven development
316
+
317
+
@@ -0,0 +1,265 @@
1
+ # Datashelf
2
+
3
+ ![datashelf logo](./assets/ds.svg)
4
+
5
+ Datashelf is a lightweight **local dataset tracking tool** for data science projects.
6
+
7
+ It stores tabular datasets as immutable artifacts, tracks metadata, and lets you retrieve them later by **name or hash**. The goal is to make experiments easier to reproduce without introducing heavy infrastructure.
8
+
9
+ ## Example
10
+
11
+ ```bash
12
+ $ datashelf init
13
+ Initialized DataShelf at .datashelf/
14
+
15
+ $ datashelf save data/people.csv people_raw --message "tiny dataset" --tag raw
16
+ Successfully saved 'people_raw' with hash c8a2f8e1
17
+
18
+ $ datashelf list
19
+
20
+ Hash Name Tag Message
21
+ -----------------------------------------
22
+ c8a2f8e1 people_raw raw tiny dataset
23
+ ```
24
+
25
+ # Why Datashelf?
26
+
27
+ Many data science workflows struggle with dataset organization:
28
+
29
+ * intermediate datasets get overwritten
30
+ * multiple versions accumulate
31
+ * experiments become difficult to reproduce
32
+
33
+ Instead of accumulating files like:
34
+
35
+ ```
36
+ data.csv
37
+ data_clean.csv
38
+ data_final_v2.csv
39
+ data_final_really_final.csv
40
+ ```
41
+
42
+ Datashelf stores datasets using **content hashes** and maintains a metadata registry so artifacts can always be located again.
43
+
44
+ Key ideas:
45
+
46
+ * **content-addressed storage** (SHA256)
47
+ * **metadata registry** for datasets
48
+ * lookup by **name or hash prefix**
49
+ * **CLI + Python API**
50
+ * **opinionated dataset tags** based on Cookiecutter Data Science
51
+
52
+ It is intentionally **local and lightweight**, designed for individual projects rather than large data pipelines.
53
+
54
+ # Features
55
+
56
+ * Local dataset artifact storage
57
+ * SHA256 content hashing
58
+ * Metadata registry (name, tag, message, timestamp)
59
+ * Lookup by dataset name or hash prefix
60
+ * Optional dataset tags and messages
61
+ * CLI + Python API
62
+ * Automatic normalization to Parquet
63
+ * Duplicate dataset detection
64
+ * Basic unit test coverage
65
+
66
+
67
+ # Installation
68
+
69
+ Clone the repository and install locally:
70
+
71
+ ```bash
72
+ git clone <repo-url>
73
+ cd datashelf
74
+ pip install -e .
75
+ ```
76
+
77
+ I am also working on getting it published on PyPI!
78
+
79
+ # Quick Start
80
+
81
+ Initialize a Datashelf repository in your project directory:
82
+
83
+ ```bash
84
+ datashelf init
85
+ ```
86
+
87
+ This creates a hidden directory used to store artifacts and metadata:
88
+
89
+ ```
90
+ .datashelf/
91
+ ├── config.yaml
92
+ ├── metadata.json
93
+ └── artifacts/
94
+ ```
95
+
96
+ # Example Workflow
97
+
98
+ Save a dataset:
99
+
100
+ ```bash
101
+ datashelf save data/people.csv people_raw --message "tiny dataset" --tag raw
102
+ ```
103
+
104
+ List stored datasets:
105
+
106
+ ```bash
107
+ datashelf list
108
+ ```
109
+
110
+ Inspect metadata:
111
+
112
+ ```bash
113
+ datashelf show people_raw
114
+ ```
115
+
116
+ Load the stored dataset path:
117
+
118
+ ```bash
119
+ datashelf load people_raw
120
+ ```
121
+
122
+ Load directly into pandas:
123
+
124
+ ```bash
125
+ datashelf load people_raw --df
126
+ ```
127
+
128
+ Export the artifact to another location:
129
+
130
+ ```bash
131
+ datashelf checkout people_raw exports/people.parquet
132
+ ```
133
+
134
+ # Python API
135
+
136
+ Datashelf can also be used directly from Python:
137
+
138
+ ```python
139
+ import datashelf as ds
140
+
141
+ ds.init()
142
+
143
+ ds.save(
144
+ data="data.csv",
145
+ name="training_data",
146
+ message="clean dataset",
147
+ tag="processed"
148
+ )
149
+
150
+ df = ds.load("training_data", to_df=True)
151
+ ```
152
+
153
+ # Architecture
154
+
155
+ Datashelf separates **user commands** from **internal system services**.
156
+
157
+ ```
158
+ User / CLI
159
+
160
+
161
+ Command Layer
162
+ (init, save, load, inspect, checkout)
163
+
164
+
165
+ Core Services
166
+ (directory, hashing, metadata, config)
167
+
168
+
169
+ .datashelf/
170
+ artifacts + metadata registry
171
+ ```
172
+
173
+ ### Command Layer
174
+
175
+ Handles user workflows such as saving, loading, inspecting, and exporting datasets.
176
+
177
+ ### Core Layer
178
+
179
+ Implements internal functionality including:
180
+
181
+ * content hashing
182
+ * metadata management
183
+ * artifact storage
184
+ * configuration management
185
+
186
+ This separation keeps command modules simple and makes core logic easier to test and maintain.
187
+
188
+ # How Artifacts Are Stored
189
+
190
+ Datasets are stored using their **SHA256 hash**:
191
+
192
+ ```
193
+ .datashelf/artifacts/<hash>.parquet
194
+ ```
195
+
196
+ Metadata is stored in a registry:
197
+
198
+ ```json
199
+ {
200
+ "file_hash": "c8a2f8e1...",
201
+ "name": "people_raw",
202
+ "tag": "raw",
203
+ "message": "tiny dataset",
204
+ "stored_path": "artifacts/c8a2f8e1.parquet",
205
+ "datetime_added": "2026-03-10T12:30:00"
206
+ }
207
+ ```
208
+
209
+ This ensures datasets can always be referenced reliably.
210
+
211
+ # Comparison
212
+
213
+ Datashelf focuses on **simple, local dataset tracking**.
214
+
215
+ | Tool | Purpose |
216
+ | --------- | --------------------------------------------------- |
217
+ | DataShelf | Lightweight local dataset tracking for tabular data |
218
+ | DVC | Full data version control with remote storage |
219
+ | Git LFS | Large file versioning inside Git |
220
+
221
+ Datashelf intentionally avoids:
222
+
223
+ * Git integration
224
+ * remote storage
225
+ * pipeline orchestration
226
+
227
+ This keeps the tool simple and easy to use for smaller data science projects.
228
+
229
+ # Running Tests
230
+
231
+ Run tests with:
232
+
233
+ ```bash
234
+ pytest
235
+ ```
236
+
237
+ Tests cover repository initialization, dataset saving, loading, metadata inspection, and artifact checkout.
238
+
239
+ # Future Work
240
+
241
+ Possible extensions include:
242
+
243
+ * dataset diffing
244
+ * experiment tracking
245
+ * dataset lineage tracking
246
+ * remote artifact storage
247
+ * richer filtering and search
248
+
249
+ # License
250
+
251
+ MIT License.
252
+
253
+ # About This Project
254
+
255
+ Datashelf was built as a personal project to **make something that I thought would be useful in my day-to-day work at school**.
256
+
257
+ The project demonstrates:
258
+
259
+ * CLI tool development
260
+ * artifact-based dataset management
261
+ * modular Python package architecture
262
+ * reproducible data pipelines
263
+ * test-driven development
264
+
265
+
@@ -0,0 +1,9 @@
# Public package API for datashelf: re-export the user-facing commands so
# callers can write `import datashelf as ds; ds.save(...)`.
from .init import init
from .save import save
from .inspect import ls, show
from .load import load
from .checkout import checkout

# Distribution version; matches the 0.1.0 declared in the package metadata.
__version__ = "0.1.0"

# Names exported by `from datashelf import *`.
__all__ = ["init", "save", "ls", "show", "load", "checkout"]
@@ -0,0 +1,33 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ from datashelf.load import load
4
+
5
+
def checkout(lookup_key: str, dest: str | Path) -> Path:
    """Export a stored datashelf artifact to a user-chosen destination.

    Args:
        lookup_key (str): Dataset name, full hash, or unique hash prefix.
        dest (str | Path): Destination file path for the copied artifact.

    Raises:
        TypeError: If the destination file does not have a .parquet suffix.
        FileExistsError: If the destination file already exists.

    Returns:
        Path: The resolved path of the copy that was written.
    """
    # Locate the artifact inside the shelf via the shared lookup logic.
    artifact_path = load(lookup_key=lookup_key)

    target = Path(dest).resolve()

    # Artifacts are stored as Parquet, so the exported copy must keep the suffix.
    if target.suffix != ".parquet":
        raise TypeError(f"{dest} is invalid. Make sure file has .parquet suffix.")

    # Never clobber an existing file — stored datasets are treated as immutable.
    if target.exists():
        raise FileExistsError(f"Destination already exists: {target}")

    # Create any missing parent directories, then copy with metadata preserved.
    target.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(str(artifact_path), str(target))

    print(f"Checked out artifact to {target}")
    return target