datafolio 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datafolio/__init__.py ADDED
@@ -0,0 +1,7 @@
"""datafolio: Lightweight wrapping of dataframes, models, and metadata to track analyses."""

# Package version string; also exported through __all__ below.
__version__ = "1.1.0"

from datafolio.folio import DataFolio

# Public API: the DataFolio entry point plus the version string.
__all__ = ["DataFolio", "__version__"]
datafolio/accessors.py ADDED
@@ -0,0 +1,338 @@
1
+ """Data accessors for convenient item access.
2
+
3
+ This module provides ItemProxy and DataAccessor classes that enable
4
+ autocomplete-friendly access to DataFolio items using both attribute
5
+ and dictionary-style syntax.
6
+ """
7
+
8
+ from typing import TYPE_CHECKING, Any, Dict, Optional
9
+
10
+ if TYPE_CHECKING:
11
+ from datafolio.folio import DataFolio
12
+
13
+
14
class ItemProxy:
    """Proxy giving property-based access to one DataFolio item.

    Returned by DataAccessor for both attribute-style and dict-style
    lookups; exposes the item's content, metadata, and lineage through
    autocomplete-friendly properties.
    """

    def __init__(self, folio: "DataFolio", name: str):
        """Bind this proxy to the item *name* inside *folio*.

        Args:
            folio: Parent DataFolio instance
            name: Name of the item
        """
        self._folio = folio
        self._name = name

    def _record(self) -> Dict[str, Any]:
        """Refresh the folio if stale and return the raw metadata record."""
        self._folio._refresh_if_needed()
        return self._folio._items[self._name]

    @property
    def content(self) -> Any:
        """Load and return the stored object for this item.

        The return type depends on the item type:
        tables -> DataFrame; numpy arrays -> ndarray; JSON data ->
        dict/list/scalar; timestamps -> UTC-aware datetime; models ->
        loaded model object; artifacts -> file path string (use with
        open()).

        Raises:
            ValueError: If the recorded item type is not recognized.

        Examples:
            >>> df = folio.data.results.content  # DataFrame
            >>> cfg = folio.data.config.content  # dict
            >>> with open(folio.data.plot.content, 'rb') as f:  # file path
            ...     img = f.read()
        """
        kind = self._record().get("item_type")
        # Map each item type onto the name of the folio getter that loads it.
        getters = {
            "referenced_table": "get_table",
            "included_table": "get_table",
            "numpy_array": "get_numpy",
            "json_data": "get_json",
            "timestamp": "get_timestamp",
            "model": "get_model",
            "pytorch_model": "get_model",
            "artifact": "get_artifact_path",
        }
        getter = getters.get(kind)
        if getter is None:
            raise ValueError(f"Unknown item type: {kind}")
        return getattr(self._folio, getter)(self._name)

    @property
    def description(self) -> Optional[str]:
        """Description string for this item, or None if not set."""
        return self._record().get("description")

    @property
    def type(self) -> str:
        """Item type string, e.g. 'included_table' or 'numpy_array'.

        Falls back to 'unknown' when no type is recorded.
        """
        return self._record().get("item_type", "unknown")

    @property
    def path(self) -> Optional[str]:
        """File path for this item, when one applies.

        Referenced tables yield their external path, artifacts their
        local artifact path; every other type yields None.

        Examples:
            >>> folio.data.external_data.path  # 's3://bucket/data.parquet'
            >>> folio.data.plot.path  # '/path/to/bundle/artifacts/plot.png'
        """
        record = self._record()
        kind = record.get("item_type")
        if kind == "referenced_table":
            return record.get("path")
        if kind == "artifact":
            return self._folio.get_artifact_path(self._name)
        return None

    @property
    def inputs(self) -> list[str]:
        """Names of the items this item was derived from (lineage)."""
        return self._folio.get_inputs(self._name)

    @property
    def dependents(self) -> list[str]:
        """Names of the items that use this item as an input (lineage)."""
        return self._folio.get_dependents(self._name)

    @property
    def metadata(self) -> Dict[str, Any]:
        """Shallow copy of the full metadata dict for this item."""
        return dict(self._record())

    def __repr__(self) -> str:
        """Summarize the item as name, type, and optional description."""
        record = self._record()
        kind = record.get("item_type", "unknown")
        desc = record.get("description", "")
        suffix = f": {desc}" if desc else ""
        return f"ItemProxy('{self._name}', type='{kind}'{suffix})"
166
+
167
+
168
class DataAccessor:
    """Accessor for autocomplete-friendly item access.

    Supports both attribute-style (folio.data.my_item) and
    dictionary-style (folio.data['my_item']) access.
    """

    def __init__(self, folio: "DataFolio"):
        """Initialize DataAccessor.

        Args:
            folio: Parent DataFolio instance
        """
        self._folio = folio

        # Mirror the folio's items as class-level properties so IDE
        # autocomplete (Jedi, etc.) can see them.
        self._setup_dynamic_attributes()

    def _setup_dynamic_attributes(self) -> None:
        """Set up dynamic attributes on the class for autocomplete support.

        Attributes are installed on the class itself (not just the
        instance), which makes them more visible to IDE autocomplete.
        """
        # Get current items
        self._folio._refresh_if_needed()

        cls = self.__class__

        # Track which attributes we've added. Check cls.__dict__ rather
        # than hasattr(): hasattr() sees inherited attributes, so a
        # subclass would silently share (and mutate) its parent's set.
        if "_dynamic_attrs" not in cls.__dict__:
            cls._dynamic_attrs = set()

        current_items = set(self._folio._items.keys())

        # Remove attributes for items that no longer exist. Only delete
        # names that live in this class's own __dict__; delattr() on an
        # inherited attribute would raise AttributeError.
        for attr in list(cls._dynamic_attrs):
            if attr not in current_items:
                if attr in cls.__dict__:
                    delattr(cls, attr)
                cls._dynamic_attrs.discard(attr)

        # Add new attributes as properties on the class
        for item_name in current_items:
            if item_name not in cls._dynamic_attrs:
                # Factory captures item_name in a closure via a parameter.
                # The annotation is a string (forward ref) so the property
                # can be created without ItemProxy eagerly in scope.
                def make_property(name: str):
                    def item_property(self) -> "ItemProxy":
                        return ItemProxy(self._folio, name)

                    return property(item_property)

                setattr(cls, item_name, make_property(item_name))
                cls._dynamic_attrs.add(item_name)

    def _sync_items(self) -> None:
        """Sync item attributes with current folio state.

        This method is called when items may have changed.
        """
        self._setup_dynamic_attributes()

    def __getattr__(self, name: str) -> "ItemProxy":
        """Get item by attribute access.

        Args:
            name: Item name

        Returns:
            ItemProxy for the item

        Raises:
            AttributeError: If item doesn't exist
        """
        if name.startswith("_"):
            # Never treat private/dunder names as items (also protects
            # copy/pickle protocols that probe for optional dunders).
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}'"
            )

        # Auto-refresh and re-sync items if needed
        self._folio._refresh_if_needed()
        self._sync_items()

        # Try again after syncing; object.__getattribute__ avoids recursion.
        try:
            return object.__getattribute__(self, name)
        except AttributeError:
            # 'from None': the internal lookup failure is an implementation
            # detail; chaining it would bury the helpful message below.
            raise AttributeError(
                f"Item '{name}' not found in DataFolio. "
                f"Available items: {', '.join(sorted(self._folio._items.keys()))}"
            ) from None

    def __getitem__(self, name: str) -> "ItemProxy":
        """Get item by dictionary access.

        Args:
            name: Item name

        Returns:
            ItemProxy for the item

        Raises:
            KeyError: If item doesn't exist
        """
        # Auto-refresh before accessing
        self._folio._refresh_if_needed()

        if name not in self._folio._items:
            raise KeyError(f"Item '{name}' not found in DataFolio")

        return ItemProxy(self._folio, name)

    def __dir__(self) -> list[str]:
        """Return list of item names for autocomplete.

        Returns:
            List combining standard attributes and item names
        """
        # Auto-refresh before accessing
        self._folio._refresh_if_needed()

        # Combine standard object attributes with item names
        standard_attrs = list(object.__dir__(self))
        item_names = list(self._folio._items.keys())
        return sorted(set(standard_attrs + item_names))

    def _ipython_key_completions_(self) -> list[str]:
        """Provide key completions for IPython/Jupyter.

        IPython calls this to autocomplete folio.data['<TAB>'] keys.

        Returns:
            Sorted list of all item names
        """
        # Auto-refresh before accessing
        self._folio._refresh_if_needed()

        return sorted(self._folio._items.keys())

    def __repr__(self) -> str:
        """Return string representation.

        Returns:
            String showing available items grouped by category
        """
        # Auto-refresh for consistency with every other accessor method.
        self._folio._refresh_if_needed()

        items = sorted(self._folio._items.keys())
        if not items:
            return "DataAccessor(no items)"

        # Group by type
        contents = self._folio.list_contents()
        lines = ["DataAccessor:"]

        for category, item_list in [
            ("Tables", contents["referenced_tables"] + contents["included_tables"]),
            ("Models", contents["models"]),
            ("Numpy Arrays", contents["numpy_arrays"]),
            ("JSON Data", contents["json_data"]),
            ("Artifacts", contents["artifacts"]),
        ]:
            if item_list:
                lines.append(f"  {category}: {', '.join(sorted(item_list))}")

        return "\n".join(lines)
@@ -0,0 +1,25 @@
1
+ """Base classes and utilities for datafolio handler system.
2
+
3
+ This module provides the foundation for the handler-based architecture:
4
+ - BaseHandler: Abstract base class for all data type handlers
5
+ - HandlerRegistry: Registry for managing handlers
6
+ - Convenience functions: register_handler, get_handler, detect_handler
7
+ """
8
+
9
+ from datafolio.base.handler import BaseHandler
10
+ from datafolio.base.registry import (
11
+ HandlerRegistry,
12
+ detect_handler,
13
+ get_handler,
14
+ get_registry,
15
+ register_handler,
16
+ )
17
+
18
+ __all__ = [
19
+ "BaseHandler",
20
+ "HandlerRegistry",
21
+ "register_handler",
22
+ "get_handler",
23
+ "detect_handler",
24
+ "get_registry",
25
+ ]
@@ -0,0 +1,220 @@
1
+ """Base handler interface for data type handlers.
2
+
3
+ This module defines the abstract base class that all data type handlers must implement.
4
+ Handlers are responsible for serialization, deserialization, and metadata management
5
+ for specific data types (e.g., pandas DataFrames, numpy arrays, PyTorch models).
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from typing import TYPE_CHECKING, Any, Dict, Optional
10
+
11
+ from datafolio.storage.categories import StorageCategory, get_storage_category
12
+
13
+ if TYPE_CHECKING:
14
+ from datafolio.folio import DataFolio
15
+
16
+
17
class BaseHandler(ABC):
    """Abstract interface that every data type handler implements.

    A handler owns the full lifecycle of one item type: detecting data it
    can serialize (can_handle), persisting it (add), loading it back (get),
    and removing its files (delete). The storage location is derived
    automatically from the handler's item_type through the
    ITEM_TYPE_TO_CATEGORY mapping, so subclasses normally implement only
    item_type, can_handle, add, and get.

    Examples:
        A minimal concrete handler:
        >>> class MyHandler(BaseHandler):
        ...     @property
        ...     def item_type(self) -> str:
        ...         return "my_type"  # must exist in ITEM_TYPE_TO_CATEGORY
        ...
        ...     def can_handle(self, data: Any) -> bool:
        ...         return isinstance(data, MyDataType)
        ...
        ...     def add(self, folio, name, data, **kwargs):
        ...         ...  # write data, return metadata dict
        ...
        ...     def get(self, folio, name, **kwargs):
        ...         ...  # read data back
        ...
        ...     # get_storage_subdir() is derived automatically.
    """

    @property
    @abstractmethod
    def item_type(self) -> str:
        """Unique identifier for this item type.

        Must match the 'item_type' value stored in items.json.

        Returns:
            Item type string (e.g. 'included_table', 'numpy_array',
            'pytorch_model')

        Examples:
            >>> PandasHandler().item_type
            'included_table'
        """
        ...

    @abstractmethod
    def can_handle(self, data: Any) -> bool:
        """Report whether this handler can serialize *data*.

        Called by add_data() during auto-detection. Handlers that require
        explicit parameters (reference tables, models) should return False
        so they never win auto-detection.

        Args:
            data: Candidate object to check

        Returns:
            True if this handler supports this data type

        Examples:
            >>> PandasHandler().can_handle(pd.DataFrame())
            True
            >>> PandasHandler().can_handle(np.array([1, 2, 3]))
            False
        """
        ...

    @abstractmethod
    def add(
        self,
        folio: "DataFolio",
        name: str,
        data: Any,
        description: Optional[str] = None,
        inputs: Optional[list[str]] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Persist *data* into the folio and build its metadata record.

        Implementations validate the data type, write the data to storage,
        and return the metadata dict that is stored in folio._items[name].
        The dict must include at least 'name', 'item_type', and
        'created_at', plus 'description' and 'inputs' when provided.

        Args:
            folio: DataFolio instance (for storage, bundle_dir, etc.)
            name: Item name
            data: Data to store
            description: Optional description
            inputs: Optional lineage inputs
            **kwargs: Handler-specific options

        Returns:
            Complete metadata dict for the item

        Raises:
            TypeError: If data is the wrong type for this handler

        Examples:
            >>> metadata = handler.add(folio, 'results', df, description='Results')
            >>> # {'name': 'results', 'item_type': 'included_table', ...}
        """
        ...

    @abstractmethod
    def get(self, folio: "DataFolio", name: str, **kwargs) -> Any:
        """Load and return the stored object named *name*.

        Args:
            folio: DataFolio instance
            name: Item name
            **kwargs: Handler-specific options (passed to the reader)

        Returns:
            The deserialized data object

        Raises:
            KeyError: If the item doesn't exist

        Examples:
            >>> df = handler.get(folio, 'results')
            >>> df = handler.get(folio, 'results', columns=['col1', 'col2'])
        """
        ...

    def delete(self, folio: "DataFolio", name: str) -> None:
        """Remove this item's data file from storage.

        Default implementation deletes
        {bundle_dir}/{storage subdir}/{filename} when the item's metadata
        records a 'filename'. Override for items that keep no local files
        (e.g. external references).

        Args:
            folio: DataFolio instance
            name: Item name
        """
        record = folio._items[name]
        if "filename" not in record:
            # Nothing stored locally (e.g. an external reference).
            return
        target = folio._storage.join_paths(
            folio._bundle_dir, self.get_storage_subdir(), record["filename"]
        )
        folio._storage.delete_file(target)

    def get_storage_category(self) -> StorageCategory:
        """Look up the storage category for this handler's item type.

        Resolved from ITEM_TYPE_TO_CATEGORY using item_type. Override only
        when the standard mapping is wrong for your type.

        Returns:
            StorageCategory enum value

        Examples:
            >>> PandasHandler().get_storage_category()
            <StorageCategory.TABLES: 'tables'>
            >>> NumpyHandler().get_storage_category()
            <StorageCategory.ARTIFACTS: 'artifacts'>
        """
        # Resolves the *module-level* function of the same name.
        return get_storage_category(self.item_type)

    def get_storage_subdir(self) -> str:
        """Return the bundle subdirectory name for this data type.

        Derived from get_storage_category(); override that method rather
        than this one to customize the storage location.

        Returns:
            Subdirectory name: 'tables', 'models', or 'artifacts'

        Examples:
            >>> PandasHandler().get_storage_subdir()
            'tables'
            >>> SklearnHandler().get_storage_subdir()
            'models'
        """
        return self.get_storage_category().directory