datamint 2.3.3__py3-none-any.whl → 2.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. datamint/__init__.py +1 -3
  2. datamint/api/__init__.py +0 -3
  3. datamint/api/base_api.py +286 -54
  4. datamint/api/client.py +76 -13
  5. datamint/api/endpoints/__init__.py +2 -2
  6. datamint/api/endpoints/annotations_api.py +186 -28
  7. datamint/api/endpoints/deploy_model_api.py +78 -0
  8. datamint/api/endpoints/models_api.py +1 -0
  9. datamint/api/endpoints/projects_api.py +38 -7
  10. datamint/api/endpoints/resources_api.py +227 -100
  11. datamint/api/entity_base_api.py +66 -7
  12. datamint/apihandler/base_api_handler.py +0 -1
  13. datamint/apihandler/dto/annotation_dto.py +2 -0
  14. datamint/client_cmd_tools/datamint_config.py +0 -1
  15. datamint/client_cmd_tools/datamint_upload.py +3 -1
  16. datamint/configs.py +11 -7
  17. datamint/dataset/base_dataset.py +24 -4
  18. datamint/dataset/dataset.py +1 -1
  19. datamint/entities/__init__.py +1 -1
  20. datamint/entities/annotations/__init__.py +13 -0
  21. datamint/entities/{annotation.py → annotations/annotation.py} +81 -47
  22. datamint/entities/annotations/image_classification.py +12 -0
  23. datamint/entities/annotations/image_segmentation.py +252 -0
  24. datamint/entities/annotations/volume_segmentation.py +273 -0
  25. datamint/entities/base_entity.py +100 -6
  26. datamint/entities/cache_manager.py +129 -15
  27. datamint/entities/datasetinfo.py +60 -65
  28. datamint/entities/deployjob.py +18 -0
  29. datamint/entities/project.py +39 -0
  30. datamint/entities/resource.py +310 -46
  31. datamint/lightning/__init__.py +1 -0
  32. datamint/lightning/datamintdatamodule.py +103 -0
  33. datamint/mlflow/__init__.py +65 -0
  34. datamint/mlflow/artifact/__init__.py +1 -0
  35. datamint/mlflow/artifact/datamint_artifacts_repo.py +8 -0
  36. datamint/mlflow/env_utils.py +131 -0
  37. datamint/mlflow/env_vars.py +5 -0
  38. datamint/mlflow/flavors/__init__.py +17 -0
  39. datamint/mlflow/flavors/datamint_flavor.py +150 -0
  40. datamint/mlflow/flavors/model.py +877 -0
  41. datamint/mlflow/lightning/callbacks/__init__.py +1 -0
  42. datamint/mlflow/lightning/callbacks/modelcheckpoint.py +410 -0
  43. datamint/mlflow/models/__init__.py +93 -0
  44. datamint/mlflow/tracking/datamint_store.py +76 -0
  45. datamint/mlflow/tracking/default_experiment.py +27 -0
  46. datamint/mlflow/tracking/fluent.py +91 -0
  47. datamint/utils/env.py +27 -0
  48. datamint/utils/visualization.py +21 -13
  49. datamint-2.9.0.dist-info/METADATA +220 -0
  50. datamint-2.9.0.dist-info/RECORD +73 -0
  51. {datamint-2.3.3.dist-info → datamint-2.9.0.dist-info}/WHEEL +1 -1
  52. datamint-2.9.0.dist-info/entry_points.txt +18 -0
  53. datamint/apihandler/exp_api_handler.py +0 -204
  54. datamint/experiment/__init__.py +0 -1
  55. datamint/experiment/_patcher.py +0 -570
  56. datamint/experiment/experiment.py +0 -1049
  57. datamint-2.3.3.dist-info/METADATA +0 -125
  58. datamint-2.3.3.dist-info/RECORD +0 -54
  59. datamint-2.3.3.dist-info/entry_points.txt +0 -4
datamint/entities/cache_manager.py
@@ -105,7 +105,11 @@ class CacheManager(Generic[T]):
         Returns:
             Path to the data file
         """
-        return self._get_entity_cache_dir(entity_id) / f"{data_key}.pkl"
+
+        datapath = self._get_entity_cache_dir(entity_id) / f"{data_key}"
+        if datapath.with_suffix('.pkl').exists():
+            return datapath.with_suffix('.pkl')
+        return datapath
 
     def _compute_version_hash(self, version_info: dict[str, Any]) -> str:
         """Compute a hash from version information.
@@ -120,13 +124,13 @@ class CacheManager(Generic[T]):
         sorted_info = json.dumps(version_info, sort_keys=True)
         return hashlib.sha256(sorted_info.encode()).hexdigest()
 
-    def get(
+    def _get_validated_metadata(
         self,
         entity_id: str,
         data_key: str,
         version_info: dict[str, Any] | None = None
-    ) -> T | None:
-        """Retrieve cached data for an entity.
+    ) -> tuple['CacheManager.ItemMetadata', Path] | tuple[None, None]:
+        """Get and validate cached metadata for an entity.
 
         Args:
             entity_id: Unique identifier for the entity
@@ -134,42 +138,152 @@ class CacheManager(Generic[T]):
             version_info: Optional version information from server to validate cache
 
         Returns:
-            Cached data if valid, None if cache miss or invalid
+            Tuple of (metadata, data_path) if valid, (None, None) if cache miss or invalid
         """
         metadata_path = self._get_metadata_path(entity_id)
-        data_path = self._get_data_path(entity_id, data_key)
 
-        # Check if cache exists
-        if not metadata_path.exists() or not data_path.exists():
-            _LOGGER.debug(f"Cache miss for {entity_id}/{data_key}")
-            return None
+        if not metadata_path.exists():
+            _LOGGER.debug(f"Cache miss for {entity_id}/{data_key} - no metadata")
+            return None, None
 
         try:
-            # Load or create metadata
+            # Read metadata first to get the actual data path (could be external)
            with open(metadata_path, 'r') as f:
                jsondata = f.read()
            cached_metadata = CacheManager.ItemMetadata.model_validate_json(jsondata)
+
+            # Use the data_path from metadata (supports external file locations)
+            data_path = Path(cached_metadata.data_path)
+
+            # Check if the actual data file exists
+            if not data_path.exists():
+                _LOGGER.debug(f"Cache miss for {entity_id}/{data_key} - data file not found at {data_path}")
+                return None, None
 
             # Validate version if provided
             if version_info is not None:
                 server_version = self._compute_version_hash(version_info)
-
                 if server_version != cached_metadata.version_hash:
                     _LOGGER.debug(
                         f"Cache version mismatch for {entity_id}/{data_key}. "
                         f"Server: {server_version}, Cached: {cached_metadata.version_hash}"
                     )
-                    return None
+                    return None, None
 
-            data = self._load_data(cached_metadata)
+            return cached_metadata, data_path
+        except Exception as e:
+            _LOGGER.warning(f"Error reading cache metadata for {entity_id}/{data_key}: {e}")
+            return None, None
 
+    def get(
+        self,
+        entity_id: str,
+        data_key: str,
+        version_info: dict[str, Any] | None = None
+    ) -> T | None:
+        """Retrieve cached data for an entity.
+
+        Args:
+            entity_id: Unique identifier for the entity
+            data_key: Key identifying the type of data
+            version_info: Optional version information from server to validate cache
+
+        Returns:
+            Cached data if valid, None if cache miss or invalid
+        """
+        cached_metadata, data_path = self._get_validated_metadata(entity_id, data_key, version_info)
+
+        if cached_metadata is None:
+            return None
+
+        try:
+            data = self._load_data(cached_metadata)
             _LOGGER.debug(f"Cache hit for {entity_id}/{data_key}")
             return data
-
         except Exception as e:
            _LOGGER.warning(f"Error reading cache for {entity_id}/{data_key}: {e}")
            return None
 
+    def get_path(
+        self,
+        entity_id: str,
+        data_key: str,
+        version_info: dict[str, Any] | None = None
+    ) -> Path | None:
+        """Get the path to cached data for an entity if valid.
+
+        Args:
+            entity_id: Unique identifier for the entity
+            data_key: Key identifying the type of data
+            version_info: Optional version information from server to validate cache
+
+        Returns:
+            Path to cached data if valid, None if cache miss or invalid
+        """
+        cached_metadata, data_path = self._get_validated_metadata(entity_id, data_key, version_info)
+        return data_path
+
+    def get_expected_path(self, entity_id: str, data_key: str) -> Path:
+        """Get the expected cache path for an entity (even if not yet cached).
+
+        This is useful for downloading directly to the cache location.
+
+        Args:
+            entity_id: Unique identifier for the entity
+            data_key: Key identifying the type of data
+
+        Returns:
+            Path where data will be cached
+        """
+        return self._get_data_path(entity_id, data_key)
+
+    def register_file_location(
+        self,
+        entity_id: str,
+        data_key: str,
+        file_path: str | Path,
+        version_info: dict[str, Any] | None = None,
+        mimetype: str = 'application/octet-stream'
+    ) -> None:
+        """Register an external file location in cache metadata without copying data.
+
+        This allows tracking a file stored at an arbitrary location (e.g., user's save_path)
+        while keeping version metadata in the cache directory.
+
+        Args:
+            entity_id: Unique identifier for the entity
+            data_key: Key identifying the type of data
+            file_path: Path to the external file to register
+            version_info: Optional version information from server
+            mimetype: MIME type of the file data
+        """
+        metadata_path = self._get_metadata_path(entity_id)
+        file_path = Path(file_path).resolve().absolute()
+
+        if not file_path.exists():
+            raise FileNotFoundError(f"Cannot register non-existent file: {file_path}")
+
+        try:
+            metadata = CacheManager.ItemMetadata(
+                cached_at=datetime.now(),
+                data_path=str(file_path),
+                data_type='bytes',
+                mimetype=mimetype,
+                entity_id=entity_id
+            )
+
+            if version_info is not None:
+                metadata.version_hash = self._compute_version_hash(version_info)
+                metadata.version_info = version_info
+
+            with open(metadata_path, 'w') as f:
+                f.write(metadata.model_dump_json(indent=2))
+
+            _LOGGER.debug(f"Registered external file for {entity_id}/{data_key}: {file_path}")
+
+        except Exception as e:
+            _LOGGER.warning(f"Error registering file location for {entity_id}/{data_key}: {e}")
+
     def set(
         self,
         entity_id: str,
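
For orientation, here is a minimal sketch of how the new cache methods compose. The CacheManager constructor arguments, the version_info payload, and the download_to helper are assumptions for illustration, not part of the diff:

    from pathlib import Path

    cache = CacheManager(cache_dir=Path.home() / ".cache" / "datamint")  # assumed constructor
    version_info = {"modified_at": "2024-01-01T00:00:00Z"}  # invented version payload

    # Reuse a valid cached file without deserializing it into memory.
    path = cache.get_path("resource-123", "file_data", version_info=version_info)
    if path is None:
        # Cache miss: download straight into the canonical cache location...
        target = cache.get_expected_path("resource-123", "file_data")
        download_to(target)  # hypothetical helper that writes the file
        # ...then record the location and version hash so later get_path() calls validate it.
        cache.register_file_location("resource-123", "file_data", target,
                                     version_info=version_info,
                                     mimetype="application/dicom")

The same register_file_location call also covers files saved to a user-chosen path outside the cache directory, since the metadata stores an absolute data_path.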
datamint/entities/datasetinfo.py
@@ -1,8 +1,8 @@
 """Dataset entity module for DataMint API."""
 
-from datetime import datetime
 import logging
 from typing import TYPE_CHECKING, Sequence
+from pydantic import PrivateAttr
 
 from .base_entity import BaseEntity, MISSING_FIELD
 
@@ -10,6 +10,7 @@ if TYPE_CHECKING:
     from datamint.api.client import Api
     from .resource import Resource
     from .project import Project
+    from datamint.api.endpoints.datasetsinfo_api import DatasetsInfoApi
 
 logger = logging.getLogger(__name__)
 
@@ -31,92 +32,86 @@ class DatasetInfo(BaseEntity):
     total_resource: int
     resource_ids: list[str]
 
+    _api: 'DatasetsInfoApi' = PrivateAttr()
+
     def __init__(self, **data):
         """Initialize the dataset info entity."""
         super().__init__(**data)
-        self._manager: EntityManager['DatasetInfo'] = EntityManager(self)
 
         # Cache for lazy-loaded data
-        self._resources_cache: Sequence['Resource'] | None = None
-        self._projects_cache: Sequence['Project'] | None = None
+        # self._resources_cache: Sequence['Resource'] | None = None
+        # self._projects_cache: Sequence['Project'] | None = None
 
-    def _inject_api(self, api: 'Api') -> None:
-        """Inject API client into this dataset (called automatically by Api class)."""
-        self._manager.set_api(api)
-
-    def get_resources(
-        self,
-        refresh: bool = False,
-        limit: int | None = None
-    ) -> Sequence['Resource']:
-        """Get all resources in this dataset.
+    # def get_resources(
+    #     self,
+    #     refresh: bool = False,
+    #     limit: int | None = None
+    # ) -> Sequence['Resource']:
+    #     """Get all resources in this dataset.
 
-        Results are cached after the first call unless refresh=True.
+    #     Results are cached after the first call unless refresh=True.
 
-        Args:
-            api: Optional API client. Uses the one from set_api() if not provided.
-            refresh: If True, bypass cache and fetch fresh data
+    #     Args:
+    #         api: Optional API client. Uses the one from set_api() if not provided.
+    #         refresh: If True, bypass cache and fetch fresh data
 
-        Returns:
-            List of Resource instances in this dataset
+    #     Returns:
+    #         List of Resource instances in this dataset
 
-        Raises:
-            RuntimeError: If no API client is available
+    #     Raises:
+    #         RuntimeError: If no API client is available
 
-        Example:
-            >>> dataset = api._datasetsinfo.get_by_id("dataset-id")
-            >>> dataset.set_api(api)
-            >>> resources = dataset.get_resources()
-        """
-        if refresh or self._resources_cache is None:
-            api_client = self._manager._ensure_api(api)
+    #     Example:
+    #         >>> dataset = api._datasetsinfo.get_by_id("dataset-id")
+    #         >>> dataset.set_api(api)
+    #         >>> resources = dataset.get_resources()
+    #     """
+    #     if refresh or self._resources_cache is None:
+    #         # Fetch resources by their IDs
+    #         resources = []
+    #         for resource_id in self.resource_ids:
+    #             try:
+    #                 resource = self._api.get.get_by_id(resource_id)
+    #                 resource.set_api(self._api)
+    #                 resources.append(resource)
+    #             except Exception as e:
+    #                 logger.warning(f"Failed to fetch resource {resource_id}: {e}")
 
-            # Fetch resources by their IDs
-            resources = []
-            for resource_id in self.resource_ids:
-                try:
-                    resource = api_client.resources.get_by_id(resource_id)
-                    resource.set_api(api_client)
-                    resources.append(resource)
-                except Exception as e:
-                    logger.warning(f"Failed to fetch resource {resource_id}: {e}")
-
-            self._resources_cache = resources
+    #         self._resources_cache = resources
 
-        return self._resources_cache
+    #     return self._resources_cache
 
-    def get_projects(
-        self,
-        api: 'Api | None' = None,
-        refresh: bool = False
-    ) -> Sequence['Project']:
-        """Get all projects associated with this dataset.
+    # def get_projects(
+    #     self,
+    #     api: 'Api | None' = None,
+    #     refresh: bool = False
+    # ) -> Sequence['Project']:
+    #     """Get all projects associated with this dataset.
 
-        Results are cached after the first call unless refresh=True.
+    #     Results are cached after the first call unless refresh=True.
 
-        Args:
-            refresh: If True, bypass cache and fetch fresh data
+    #     Args:
+    #         refresh: If True, bypass cache and fetch fresh data
 
-        Returns:
-            List of Project instances
+    #     Returns:
+    #         List of Project instances
 
-        Raises:
-            RuntimeError: If no API client is available
+    #     Raises:
+    #         RuntimeError: If no API client is available
 
-        Example:
-            >>> dataset = api.datasetsinfo.get_by_id("dataset-id")
-            >>> projects = dataset.get_projects()
-        """
-        if refresh or self._projects_cache is None:
-            api_client = self._manager.api
+    #     Example:
+    #         >>> dataset = api.datasetsinfo.get_by_id("dataset-id")
+    #         >>> projects = dataset.get_projects()
+    #     """
+    #     if refresh or self._projects_cache is None:
 
-            # Get all projects and filter by dataset_id
-            all_projects = api_client.projects.get_all()
-            projects = [p for p in all_projects if p.dataset_id == self.id]
+    #         # Get all projects and filter by dataset_id
+    #         all_projects = api_client.projects.get_all()
+    #         projects = [p for p in all_projects if p.dataset_id == self.id]
 
-            self._projects_cache = projects
+    #         self._projects_cache = projects
 
-        return self._projects_cache
+    #     return self._projects_cache
 
     def invalidate_cache(self) -> None:
         """Invalidate all cached relationship data.
datamint/entities/deployjob.py
@@ -0,0 +1,18 @@
+from datamint.entities.base_entity import BaseEntity
+
+
+class DeployJob(BaseEntity):
+    id: str
+    status: str
+    model_name: str
+    model_version: int | None = None
+    model_alias: str | None = None
+    image_name: str | None = None
+    image_tag: str | None = None
+    error_message: str | None = None
+    progress_percentage: int = 0
+    current_step: str | None = None
+    with_gpu: bool = False
+    recent_logs: list[str] | None = None
+    started_at: str | None = None
+    completed_at: str | None = None
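
DeployJob subclasses the package's pydantic-based BaseEntity, so optional fields default as declared. A hedged construction example (the field values are invented, and this assumes BaseEntity imposes no additional required fields):

    job = DeployJob(
        id="job-42",
        status="running",
        model_name="chest-xray-classifier",
        progress_percentage=40,
        current_step="building image",
    )
    assert job.model_version is None   # unset optionals fall back to None
    assert job.with_gpu is False       # declared defaults apply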
datamint/entities/project.py
@@ -75,6 +75,45 @@ class Project(BaseEntity):
         """
         return self._api.get_project_resources(self.id)
 
+    def download_resources_datas(self, progress_bar: bool = True) -> None:
+        """Downloads all project resources in parallel for faster subsequent access.
+
+        This method downloads and caches all resource file data concurrently,
+        skipping resources that are already cached. This dramatically improves
+        performance when working with large projects.
+
+        Args:
+            progress_bar: Whether to show a progress bar. Default is True.
+
+        Example:
+            >>> proj = api.projects.get_by_name("My Project")
+            >>> proj.download_resources_datas()  # Cache all resources in parallel
+            >>> # Now fetch_file_data() will be instantaneous for cached resources
+            >>> for res in proj.fetch_resources():
+            ...     data = res.fetch_file_data(use_cache=True)
+        """
+        return self.cache_resources(progress_bar=progress_bar)
+
+    def cache_resources(self, progress_bar: bool = True) -> None:
+        """Cache all project resources in parallel for faster subsequent access.
+
+        This method downloads and caches all resource file data concurrently,
+        skipping resources that are already cached. This dramatically improves
+        performance when working with large projects.
+
+        Args:
+            progress_bar: Whether to show a progress bar. Default is True.
+
+        Example:
+            >>> proj = api.projects.get_by_name("My Project")
+            >>> proj.cache_resources()  # Cache all resources in parallel
+            >>> # Now fetch_file_data() will be instantaneous for cached resources
+            >>> for res in proj.fetch_resources():
+            ...     data = res.fetch_file_data(use_cache=True)
+        """
+        resources = self.fetch_resources()
+        self._api.resources_api.cache_resources(resources, progress_bar=progress_bar)
+
     def set_work_status(self, resource: 'Resource', status: Literal['opened', 'annotated', 'closed']) -> None:
         """Set the status of a resource.
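
Pulled together from the docstrings above, the intended warm-the-cache workflow looks like this (construction of the api client is assumed):

    proj = api.projects.get_by_name("My Project")   # api: an authenticated client, assumed
    proj.cache_resources()                          # download all resource data in parallel
    for res in proj.fetch_resources():
        data = res.fetch_file_data(use_cache=True)  # now served from the local cache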