datafolio 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datafolio-1.0.0/PKG-INFO +390 -0
- datafolio-1.0.0/README.md +371 -0
- datafolio-1.0.0/pyproject.toml +112 -0
- datafolio-1.0.0/src/datafolio/__init__.py +7 -0
- datafolio-1.0.0/src/datafolio/accessors.py +338 -0
- datafolio-1.0.0/src/datafolio/base/__init__.py +25 -0
- datafolio-1.0.0/src/datafolio/base/handler.py +220 -0
- datafolio-1.0.0/src/datafolio/base/registry.py +228 -0
- datafolio-1.0.0/src/datafolio/cache/__init__.py +29 -0
- datafolio-1.0.0/src/datafolio/cache/config.py +167 -0
- datafolio-1.0.0/src/datafolio/cache/manager.py +519 -0
- datafolio-1.0.0/src/datafolio/cache/metadata.py +218 -0
- datafolio-1.0.0/src/datafolio/cache/validation.py +162 -0
- datafolio-1.0.0/src/datafolio/cli/__init__.py +5 -0
- datafolio-1.0.0/src/datafolio/cli/main.py +929 -0
- datafolio-1.0.0/src/datafolio/display.py +653 -0
- datafolio-1.0.0/src/datafolio/folio.py +4851 -0
- datafolio-1.0.0/src/datafolio/handlers/__init__.py +64 -0
- datafolio-1.0.0/src/datafolio/handlers/arrays.py +148 -0
- datafolio-1.0.0/src/datafolio/handlers/artifacts.py +143 -0
- datafolio-1.0.0/src/datafolio/handlers/json_data.py +151 -0
- datafolio-1.0.0/src/datafolio/handlers/sklearn_models.py +260 -0
- datafolio-1.0.0/src/datafolio/handlers/tables.py +285 -0
- datafolio-1.0.0/src/datafolio/handlers/timestamps.py +160 -0
- datafolio-1.0.0/src/datafolio/metadata.py +94 -0
- datafolio-1.0.0/src/datafolio/py.typed +0 -0
- datafolio-1.0.0/src/datafolio/readers.py +191 -0
- datafolio-1.0.0/src/datafolio/storage/__init__.py +22 -0
- datafolio-1.0.0/src/datafolio/storage/backend.py +597 -0
- datafolio-1.0.0/src/datafolio/storage/categories.py +111 -0
- datafolio-1.0.0/src/datafolio/utils.py +600 -0
datafolio-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: datafolio
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Lightweight wrapping of dataframes, models, and metadata to track analyses.
|
|
5
|
+
Author: Casey Schneider-Mizell
|
|
6
|
+
Author-email: Casey Schneider-Mizell <caseysm@gmail.com>
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Requires-Dist: pandas>=2.0.0
|
|
9
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
10
|
+
Requires-Dist: joblib>=1.3.0
|
|
11
|
+
Requires-Dist: skops>=0.10.0
|
|
12
|
+
Requires-Dist: orjson>=3.9.0
|
|
13
|
+
Requires-Dist: cloud-files>=5.8.1
|
|
14
|
+
Requires-Dist: click>=8.1.0
|
|
15
|
+
Requires-Dist: rich>=13.0.0
|
|
16
|
+
Requires-Dist: filelock>=3.12.0
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# DataFolio
|
|
21
|
+
|
|
22
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
23
|
+
[Tests](tests/)
|
|
24
|
+
[Coverage](tests/)
|
|
25
|
+
|
|
26
|
+
**A lightweight, filesystem-based data versioning and experiment tracking library for Python.**
|
|
27
|
+
|
|
28
|
+
DataFolio helps you organize, version, and track your data science experiments by storing datasets, models, and artifacts in a simple, transparent directory structure. Everything is saved as plain files (Parquet, JSON, etc) that you can inspect, version with git, or backup to any storage system.
|
|
29
|
+
|
|
30
|
+
Note: DataFolio has been an exercise in how extensively I can use Claude Code. Currently all work has been done via Mr Claude, but now that it's getting very useful for workflows I might transition over to more manual curation.
|
|
31
|
+
|
|
32
|
+
## Features
|
|
33
|
+
|
|
34
|
+
- **Universal Data Management**: Single `add_data()` method automatically handles DataFrames, numpy arrays, dicts, lists, and scalars
|
|
35
|
+
- **Model Support**: Save and load scikit-learn models with full metadata tracking
|
|
36
|
+
- **Data Lineage**: Track inputs and dependencies between datasets and models
|
|
37
|
+
- **External References**: Point to data stored externally (S3, local paths) without copying
|
|
38
|
+
- **Multi-Instance Sync**: Automatic refresh when multiple notebooks/processes access the same bundle
|
|
39
|
+
- **Autocomplete Access**: IDE-friendly `folio.data.item_name.content` syntax with full autocomplete support
|
|
40
|
+
- **Smart Metadata Display**: Automatic metadata truncation and formatting in `describe()`
|
|
41
|
+
- **Item Management**: Delete items with dependency tracking and warnings
|
|
42
|
+
- **Git-Friendly**: All data stored as standard file formats in a simple directory structure
|
|
43
|
+
- **Type-Safe**: Full type hints and comprehensive error handling
|
|
44
|
+
- **Snapshots**: Create immutable checkpoints of your experiments with copy-on-write versioning
|
|
45
|
+
- **CLI Tools**: Command-line interface for snapshot management and bundle operations
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from datafolio import DataFolio
|
|
51
|
+
import pandas as pd
|
|
52
|
+
import numpy as np
|
|
53
|
+
|
|
54
|
+
# Create a new folio
|
|
55
|
+
folio = DataFolio('experiments/my_experiment')
|
|
56
|
+
|
|
57
|
+
# Add any type of data with a single method
|
|
58
|
+
folio.add_data('results', df) # DataFrame
|
|
59
|
+
folio.add_data('embeddings', np.array([1, 2, 3])) # Numpy array
|
|
60
|
+
folio.add_data('config', {'lr': 0.01}) # Dict/JSON
|
|
61
|
+
folio.add_data('accuracy', 0.95) # Scalar
|
|
62
|
+
|
|
63
|
+
# Retrieve data (automatically returns correct type)
|
|
64
|
+
df = folio.get_data('results') # Returns DataFrame
|
|
65
|
+
arr = folio.get_data('embeddings') # Returns numpy array
|
|
66
|
+
config = folio.get_data('config') # Returns dict
|
|
67
|
+
|
|
68
|
+
# Or use autocomplete-friendly access
|
|
69
|
+
df = folio.data.results.content # Same as get_data()
|
|
70
|
+
arr = folio.data.embeddings.content
|
|
71
|
+
config = folio.data.config.content
|
|
72
|
+
|
|
73
|
+
# View everything (including custom metadata)
|
|
74
|
+
folio.describe()
|
|
75
|
+
|
|
76
|
+
# Clean up temporary items
|
|
77
|
+
folio.delete('temp_data')
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Installation
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
pip install datafolio
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
This includes the `datafolio` command-line tool for snapshot management and bundle operations.
|
|
87
|
+
|
|
88
|
+
## Core Concepts
|
|
89
|
+
|
|
90
|
+
### Generic Data Methods
|
|
91
|
+
|
|
92
|
+
The `add_data()` and `get_data()` methods provide a unified interface for all data types:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
# add_data() automatically detects type and uses the appropriate method
|
|
96
|
+
folio.add_data('my_data', data) # Works with DataFrame, array, dict, list, scalar
|
|
97
|
+
|
|
98
|
+
# get_data() automatically detects stored type and returns correct format
|
|
99
|
+
data = folio.get_data('my_data') # Returns original type
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Supported data types:
|
|
103
|
+
|
|
104
|
+
- **DataFrames** (`pd.DataFrame`) → stored as Parquet
|
|
105
|
+
- **Numpy arrays** (`np.ndarray`) → stored as `.npy`
|
|
106
|
+
- **JSON data** (`dict`, `list`, `int`, `float`, `str`, `bool`, `None`) → stored as JSON
|
|
107
|
+
- **External references** → metadata only, data stays in original location
|
|
108
|
+
|
|
109
|
+
### Multi-Instance Access
|
|
110
|
+
|
|
111
|
+
DataFolio automatically keeps multiple instances synchronized when accessing the same bundle:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
# Notebook 1: Create and update bundle
|
|
115
|
+
folio1 = DataFolio('experiments/shared')
|
|
116
|
+
folio1.add_data('results', df)
|
|
117
|
+
|
|
118
|
+
# Notebook 2: Open same bundle
|
|
119
|
+
folio2 = DataFolio('experiments/shared')
|
|
120
|
+
|
|
121
|
+
# Notebook 1: Add more data
|
|
122
|
+
folio1.add_data('analysis', new_df)
|
|
123
|
+
|
|
124
|
+
# Notebook 2: Automatically sees new data!
|
|
125
|
+
folio2.describe() # Shows both 'results' and 'analysis'
|
|
126
|
+
analysis = folio2.get_data('analysis') # Works immediately ✅
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
All read operations (`describe()`, `list_contents()`, `get_*()` methods, and `folio.data` accessors) automatically refresh from disk when changes are detected, ensuring you always see the latest data without manual intervention.
|
|
130
|
+
|
|
131
|
+
### Data Lineage
|
|
132
|
+
|
|
133
|
+
Track dependencies between datasets and models:
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
# Create dependency chain
|
|
137
|
+
folio.reference_table('raw', reference='s3://bucket/raw.parquet')
|
|
138
|
+
folio.add_table('clean', cleaned_df, inputs=['raw'])
|
|
139
|
+
folio.add_table('features', feature_df, inputs=['clean'])
|
|
140
|
+
folio.add_model('model', clf, inputs=['features'])
|
|
141
|
+
|
|
142
|
+
# Lineage is preserved in metadata and shown in describe()
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Autocomplete-Friendly Access
|
|
146
|
+
|
|
147
|
+
Access your data with autocomplete support using the `folio.data` property:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
# Attribute-style access (autocomplete-friendly!)
|
|
151
|
+
df = folio.data.results.content # Get DataFrame
|
|
152
|
+
desc = folio.data.results.description # Get description
|
|
153
|
+
type_str = folio.data.results.type # Get item type
|
|
154
|
+
inputs = folio.data.results.inputs # Get lineage inputs
|
|
155
|
+
|
|
156
|
+
# Works for all data types
|
|
157
|
+
arr = folio.data.embeddings.content # numpy array
|
|
158
|
+
cfg = folio.data.config.content # dict
|
|
159
|
+
model = folio.data.classifier.content # model object
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
In IPython/Jupyter, `folio.data.<TAB>` shows all available items with autocomplete!
|
|
163
|
+
|
|
164
|
+
## Directory Structure
|
|
165
|
+
|
|
166
|
+
DataFolio creates a transparent directory structure:
|
|
167
|
+
|
|
168
|
+
```text
|
|
169
|
+
experiments/my_experiment/
|
|
170
|
+
├── metadata.json # Folio metadata
|
|
171
|
+
├── items.json # Unified manifest with versioning
|
|
172
|
+
├── snapshots.json # Snapshot registry (when using snapshots)
|
|
173
|
+
├── tables/
|
|
174
|
+
│ └── results.parquet # DataFrame storage
|
|
175
|
+
├── models/
|
|
176
|
+
│ ├── classifier.joblib # Sklearn model (v1)
|
|
177
|
+
│ └── classifier_v2.joblib # Version 2 (when snapshot exists)
|
|
178
|
+
└── artifacts/
|
|
179
|
+
├── embeddings.npy # Numpy arrays
|
|
180
|
+
├── config.json # JSON data
|
|
181
|
+
└── plot.png # Any file type
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## Snapshots: Version Control for Experiments
|
|
185
|
+
|
|
186
|
+
Snapshots let you create immutable checkpoints of your experiments, making it easy to track different versions, compare results, and return to previous states without duplicating data.
|
|
187
|
+
|
|
188
|
+
### Why Snapshots?
|
|
189
|
+
|
|
190
|
+
**The Problem**: You train a model with 89% accuracy, then experiment with improvements. The new version gets 85%—worse! But you've already overwritten your good model. You need to recreate it from git history.
|
|
191
|
+
|
|
192
|
+
**The Solution**: Create snapshots before experimenting. Snapshots preserve exact states while sharing unchanged data to save disk space.
|
|
193
|
+
|
|
194
|
+
### Quick Start with Snapshots
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
from datafolio import DataFolio
|
|
198
|
+
|
|
199
|
+
# Create your experiment
|
|
200
|
+
folio = DataFolio('experiments/classifier')
|
|
201
|
+
folio.add_data('train_data', train_df)
|
|
202
|
+
folio.add_model('model', baseline_model)
|
|
203
|
+
folio.metadata['accuracy'] = 0.89
|
|
204
|
+
|
|
205
|
+
# Create a snapshot before experimenting
|
|
206
|
+
folio.create_snapshot('v1.0-baseline',
|
|
207
|
+
description='Baseline random forest model',
|
|
208
|
+
tags=['baseline', 'production'])
|
|
209
|
+
|
|
210
|
+
# Experiment freely - the snapshot is preserved
|
|
211
|
+
folio.add_model('model', experimental_model, overwrite=True)
|
|
212
|
+
folio.metadata['accuracy'] = 0.85 # Worse!
|
|
213
|
+
|
|
214
|
+
# Load the original version
|
|
215
|
+
baseline = DataFolio.load_snapshot('experiments/classifier', 'v1.0-baseline')
|
|
216
|
+
model = baseline.get_model('model') # Original model with 89% accuracy!
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### CLI for Snapshot Management
|
|
220
|
+
|
|
221
|
+
DataFolio includes a command-line tool for easy snapshot operations:
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
# Create a snapshot
|
|
225
|
+
datafolio snapshot create v1.0 -d "Baseline model" -t baseline
|
|
226
|
+
|
|
227
|
+
# List all snapshots
|
|
228
|
+
datafolio snapshot list
|
|
229
|
+
|
|
230
|
+
# Show snapshot details
|
|
231
|
+
datafolio snapshot show v1.0
|
|
232
|
+
|
|
233
|
+
# Compare two snapshots
|
|
234
|
+
datafolio snapshot compare v1.0 v2.0
|
|
235
|
+
|
|
236
|
+
# Delete old snapshots and cleanup
|
|
237
|
+
datafolio snapshot delete experimental-v5 --cleanup
|
|
238
|
+
|
|
239
|
+
# Show reproduction instructions
|
|
240
|
+
datafolio snapshot reproduce v1.0
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### Key Features
|
|
244
|
+
|
|
245
|
+
- **Immutable**: Once created, snapshots never change—guaranteed reproducibility
|
|
246
|
+
- **Space-efficient**: Uses copy-on-write versioning—only changed items create new files
|
|
247
|
+
- **Git integration**: Automatically captures commit hash, branch, and dirty status
|
|
248
|
+
- **Environment tracking**: Records Python version and dependencies for full reproducibility
|
|
249
|
+
- **Metadata preservation**: Snapshots include complete metadata state at that moment
|
|
250
|
+
- **Multiple snapshots**: Load different versions simultaneously for comparison
|
|
251
|
+
|
|
252
|
+
### Use Cases
|
|
253
|
+
|
|
254
|
+
**Paper Submission**: Snapshot your exact code, data, and model state when submitting. Months later, you can reproduce those exact results.
|
|
255
|
+
|
|
256
|
+
**A/B Testing**: Create snapshots for baseline and experimental versions, deploy both, and compare performance metrics.
|
|
257
|
+
|
|
258
|
+
**Hyperparameter Tuning**: Snapshot each configuration, then compare results to find the best settings.
|
|
259
|
+
|
|
260
|
+
**Production Deployment**: Tag production-ready snapshots and deploy specific versions with confidence.
|
|
261
|
+
|
|
262
|
+
For complete snapshot documentation, see [snapshots.md](snapshots.md).
|
|
263
|
+
|
|
264
|
+
## Examples
|
|
265
|
+
|
|
266
|
+
### Complete ML Workflow
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
from datafolio import DataFolio
|
|
270
|
+
import pandas as pd
|
|
271
|
+
import numpy as np
|
|
272
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
273
|
+
|
|
274
|
+
# Initialize
|
|
275
|
+
folio = DataFolio('experiments/classifier_v1')
|
|
276
|
+
|
|
277
|
+
# Reference external data
|
|
278
|
+
folio.add_data('raw', reference='s3://bucket/raw.csv',
|
|
279
|
+
description='Raw training data from database')
|
|
280
|
+
|
|
281
|
+
# Add processed data
|
|
282
|
+
folio.add_data('clean', cleaned_df,
|
|
283
|
+
description='Cleaned and preprocessed data',
|
|
284
|
+
inputs=['raw'])
|
|
285
|
+
|
|
286
|
+
# Add features
|
|
287
|
+
folio.add_data('features', feature_df,
|
|
288
|
+
description='Engineered features',
|
|
289
|
+
inputs=['clean'])
|
|
290
|
+
|
|
291
|
+
# Train and save model
|
|
292
|
+
clf = RandomForestClassifier(n_estimators=100)
|
|
293
|
+
clf.fit(X_train, y_train)
|
|
294
|
+
|
|
295
|
+
folio.add_model('classifier', clf,
|
|
296
|
+
description='Random forest classifier',
|
|
297
|
+
inputs=['features'])
|
|
298
|
+
|
|
299
|
+
# Save metrics
|
|
300
|
+
folio.add_data('metrics', {
|
|
301
|
+
'accuracy': 0.95,
|
|
302
|
+
'f1': 0.92,
|
|
303
|
+
'precision': 0.94
|
|
304
|
+
})
|
|
305
|
+
|
|
306
|
+
# Add custom metadata to the folio itself
|
|
307
|
+
folio.metadata['experiment_name'] = 'rf_baseline'
|
|
308
|
+
folio.metadata['tags'] = ['classification', 'production']
|
|
309
|
+
|
|
310
|
+
# View summary (shows data and custom metadata)
|
|
311
|
+
folio.describe()
|
|
312
|
+
|
|
313
|
+
# Access data with autocomplete
|
|
314
|
+
config = folio.data.config.content
|
|
315
|
+
metrics = folio.data.metrics.content
|
|
316
|
+
trained_model = folio.data.classifier.content
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
## Best Practices
|
|
320
|
+
|
|
321
|
+
1. **Use descriptive names**: `add_data('training_features', ...)` not `add_data('data1', ...)`
|
|
322
|
+
2. **Track lineage**: Always specify `inputs` to track data dependencies
|
|
323
|
+
3. **Add descriptions**: Help future you understand what each item contains
|
|
324
|
+
4. **Use custom metadata**: Store experiment context in `folio.metadata` for better tracking
|
|
325
|
+
5. **Leverage autocomplete**: Use `folio.data.item_name.content` for cleaner, more discoverable code
|
|
326
|
+
6. **Clean up regularly**: Use `delete()` to remove temporary or obsolete items
|
|
327
|
+
7. **Version control**: Commit your folio directories to git (data is stored efficiently)
|
|
328
|
+
8. **Use references**: For large external datasets, use `reference` to avoid copying
|
|
329
|
+
9. **Check describe()**: Regularly review your folio with `folio.describe()` to see data and metadata
|
|
330
|
+
10. **Share across notebooks**: Multiple DataFolio instances can safely access the same bundle - changes are automatically detected and synchronized
|
|
331
|
+
11. **Snapshot before major changes**: Create snapshots before experimenting with new approaches—it's free insurance
|
|
332
|
+
12. **Tag snapshots meaningfully**: Use tags like `baseline`, `production`, `paper` to organize versions
|
|
333
|
+
|
|
334
|
+
## Development
|
|
335
|
+
|
|
336
|
+
```bash
|
|
337
|
+
# Clone the repo
|
|
338
|
+
git clone https://github.com/caseysm/datafolio.git
|
|
339
|
+
cd datafolio
|
|
340
|
+
|
|
341
|
+
# Install with dev dependencies
|
|
342
|
+
uv sync
|
|
343
|
+
|
|
344
|
+
# Run tests
|
|
345
|
+
poe test
|
|
346
|
+
|
|
347
|
+
# Preview documentation
|
|
348
|
+
poe doc-preview
|
|
349
|
+
|
|
350
|
+
# Lint
|
|
351
|
+
uv run ruff check src/ tests/
|
|
352
|
+
|
|
353
|
+
# Bump version
|
|
354
|
+
poe bump patch # or minor, major
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
## Documentation
|
|
358
|
+
|
|
359
|
+
For complete API documentation and detailed guides, see the [full documentation](docs/index.md).
|
|
360
|
+
|
|
361
|
+
## Requirements
|
|
362
|
+
|
|
363
|
+
- Python 3.10+
|
|
364
|
+
- pandas >= 2.0.0
|
|
365
|
+
- pyarrow >= 14.0.0
|
|
366
|
+
- joblib >= 1.3.0
- skops >= 0.10.0
|
|
367
|
+
- orjson >= 3.9.0
|
|
368
|
+
- cloud-files >= 5.8.1
|
|
369
|
+
- click >= 8.1.0 (for CLI)
|
|
370
|
+
- rich >= 13.0.0 (for CLI formatting)
- filelock >= 3.12.0
|
|
371
|
+
|
|
372
|
+
## License
|
|
373
|
+
|
|
374
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
375
|
+
|
|
376
|
+
## Contributing
|
|
377
|
+
|
|
378
|
+
Contributions welcome! Please:
|
|
379
|
+
|
|
380
|
+
1. Fork the repository
|
|
381
|
+
2. Create a feature branch
|
|
382
|
+
3. Add tests for new functionality
|
|
383
|
+
4. Ensure all tests pass (`poe test`)
|
|
384
|
+
5. Submit a pull request
|
|
385
|
+
|
|
386
|
+
See [CLAUDE.md](CLAUDE.md) for development guidelines.
|
|
387
|
+
|
|
388
|
+
---
|
|
389
|
+
|
|
390
|
+
Made with ❤️ for data scientists who need simple, lightweight experiment tracking.
|