datamint 2.3.1__py3-none-any.whl → 2.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,302 @@
+ """Cache manager for storing and retrieving entity-related data locally.
+
+ This module provides caching functionality for resource data (images, segmentations, etc.)
+ with automatic validation against server versions to ensure data freshness.
+ """
+
+ import hashlib
+ import json
+ import logging
+ import pickle
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, TypeVar, Generic
+ from pydantic import BaseModel
+ # import appdirs
+ import datamint.configs
+
+ _LOGGER = logging.getLogger(__name__)
+
+ T = TypeVar('T')
+
+
+ class CacheManager(Generic[T]):
+     """Manages local caching of entity data with versioning support.
+
+     This class handles storing and retrieving cached data with automatic
+     validation against server versions to ensure data consistency.
+
+     The cache uses a directory structure:
+     - cache_root/
+         - resources/
+             - {resource_id}/
+                 - image_data.pkl
+                 - metadata.json
+         - annotations/
+             - {annotation_id}/
+                 - segmentation_data.pkl
+                 - metadata.json
+
+     Attributes:
+         cache_root: Root directory for cache storage
+         entity_type: Type of entity being cached (e.g., 'resources', 'annotations')
+     """
+
+     class ItemMetadata(BaseModel):
+         cached_at: datetime
+         data_path: str
+         data_type: str
+         mimetype: str
+         version_hash: str | None = None
+         version_info: dict | None = None
+         entity_id: str | None = None
+
+     def __init__(self, entity_type: str, cache_root: Path | str | None = None):
+         """Initialize the cache manager.
+
+         Args:
+             entity_type: Type of entity (e.g., 'resources', 'annotations')
+             cache_root: Root directory for cache. If None, uses system cache directory.
+         """
+         self.entity_type = entity_type
+
+         if cache_root is None:
+             # Use platform-specific cache directory
+             # app_cache_dir = appdirs.user_cache_dir('datamint', 'sonance')
+             # cache_root = Path(app_cache_dir) / 'entity_cache'
+             cache_root = Path(datamint.configs.DATAMINT_DATA_DIR)
+         else:
+             cache_root = Path(cache_root)
+
+         self.cache_root = cache_root / entity_type
+
+     def _get_entity_cache_dir(self, entity_id: str) -> Path:
+         """Get the cache directory for a specific entity.
+
+         Args:
+             entity_id: Unique identifier for the entity
+
+         Returns:
+             Path to the entity's cache directory
+         """
+         entity_dir = self.cache_root / entity_id
+         entity_dir = entity_dir.resolve().absolute()
+         entity_dir.mkdir(parents=True, exist_ok=True)
+         return entity_dir
+
+     def _get_metadata_path(self, entity_id: str) -> Path:
+         """Get the path to the metadata file for an entity.
+
+         Args:
+             entity_id: Unique identifier for the entity
+
+         Returns:
+             Path to the metadata file
+         """
+         return self._get_entity_cache_dir(entity_id) / 'metadata.json'
+
+     def _get_data_path(self, entity_id: str, data_key: str) -> Path:
+         """Get the path to a data file for an entity.
+
+         Args:
+             entity_id: Unique identifier for the entity
+             data_key: Key identifying the type of data (e.g., 'image_data', 'segmentation')
+
+         Returns:
+             Path to the data file
+         """
+         return self._get_entity_cache_dir(entity_id) / f"{data_key}.pkl"
+
+     def _compute_version_hash(self, version_info: dict[str, Any]) -> str:
+         """Compute a hash from version information.
+
+         Args:
+             version_info: Dictionary containing version information (e.g., updated_at, size)
+
+         Returns:
+             Hash string representing the version
+         """
+         # Sort keys for consistent hashing
+         sorted_info = json.dumps(version_info, sort_keys=True)
+         return hashlib.sha256(sorted_info.encode()).hexdigest()
+
+     def get(
+         self,
+         entity_id: str,
+         data_key: str,
+         version_info: dict[str, Any] | None = None
+     ) -> T | None:
+         """Retrieve cached data for an entity.
+
+         Args:
+             entity_id: Unique identifier for the entity
+             data_key: Key identifying the type of data
+             version_info: Optional version information from server to validate cache
+
+         Returns:
+             Cached data if valid, None if cache miss or invalid
+         """
+         metadata_path = self._get_metadata_path(entity_id)
+         data_path = self._get_data_path(entity_id, data_key)
+
+         # Check if cache exists
+         if not metadata_path.exists() or not data_path.exists():
+             _LOGGER.debug(f"Cache miss for {entity_id}/{data_key}")
+             return None
+
+         try:
+             # Load cached metadata
+             with open(metadata_path, 'r') as f:
+                 jsondata = f.read()
+             cached_metadata = CacheManager.ItemMetadata.model_validate_json(jsondata)
+
+             # Validate version if provided
+             if version_info is not None:
+                 server_version = self._compute_version_hash(version_info)
+
+                 if server_version != cached_metadata.version_hash:
+                     _LOGGER.debug(
+                         f"Cache version mismatch for {entity_id}/{data_key}. "
+                         f"Server: {server_version}, Cached: {cached_metadata.version_hash}"
+                     )
+                     return None
+
+             data = self._load_data(cached_metadata)
+
+             _LOGGER.debug(f"Cache hit for {entity_id}/{data_key}")
+             return data
+
+         except Exception as e:
+             _LOGGER.warning(f"Error reading cache for {entity_id}/{data_key}: {e}")
+             return None
+
+     def set(
+         self,
+         entity_id: str,
+         data_key: str,
+         data: T,
+         version_info: dict[str, Any] | None = None
+     ) -> None:
+         """Store data in cache for an entity.
+
+         Args:
+             entity_id: Unique identifier for the entity
+             data_key: Key identifying the type of data
+             data: Data to cache
+             version_info: Optional version information from server
+         """
+         metadata_path = self._get_metadata_path(entity_id)
+         data_path = self._get_data_path(entity_id, data_key)
+
+         try:
+             mimetype = self._save_data(data_path, data)
+
+             # Build the metadata record for this data key
+             metadata = CacheManager.ItemMetadata(
+                 cached_at=datetime.now(),
+                 data_path=str(data_path.absolute()),
+                 data_type=type(data).__name__,
+                 mimetype=mimetype,
+                 entity_id=entity_id
+             )
+
+             if version_info is not None:
+                 metadata.version_hash = self._compute_version_hash(version_info)
+                 # Keep the raw version information alongside its hash
+                 metadata.version_info = version_info
+
+             # Save metadata
+             with open(metadata_path, 'w') as f:
+                 f.write(metadata.model_dump_json(indent=2))
+
+             _LOGGER.debug(f"Cached data for {entity_id}/{data_key}")
+
+         except Exception as e:
+             _LOGGER.warning(f"Error writing cache for {entity_id}/{data_key}: {e}")
+
+     def _load_data(self,
+                    metadata: 'CacheManager.ItemMetadata') -> T:
+         path = metadata.data_path
+         if metadata.mimetype == 'application/octet-stream':
+             with open(path, 'rb') as f:
+                 return f.read()
+         else:
+             with open(path, 'rb') as f:
+                 return pickle.load(f)
+
+     def _save_data(self, path: Path, data: T) -> str:
+         """Save data and return its mimetype."""
+         if isinstance(data, bytes):
+             with open(path, 'wb') as f:
+                 f.write(data)
+             return 'application/octet-stream'
+         else:
+             with open(path, 'wb') as f:
+                 pickle.dump(data, f)
+             return 'application/x-python-serialize'
+
+     def invalidate(self, entity_id: str, data_key: str | None = None) -> None:
+         """Invalidate cached data for an entity.
+
+         Args:
+             entity_id: Unique identifier for the entity
+             data_key: Optional key for specific data. If None, invalidates all data for entity.
+         """
+         if data_key is None:
+             # Invalidate entire entity cache
+             entity_dir = self._get_entity_cache_dir(entity_id)
+             if entity_dir.exists():
+                 import shutil
+                 shutil.rmtree(entity_dir)
+                 _LOGGER.debug(f"Invalidated all cache for {entity_id}")
+         else:
+             # Invalidate specific data
+             data_path = self._get_data_path(entity_id, data_key)
+             if data_path.exists():
+                 data_path.unlink()
+                 _LOGGER.debug(f"Invalidated cache for {entity_id}/{data_key}")
+
+             # Update metadata
+             metadata_path = self._get_metadata_path(entity_id)
+             if metadata_path.exists():
+                 with open(metadata_path, 'r') as f:
+                     metadata = json.load(f)
+
+                 if data_key in metadata:
+                     del metadata[data_key]
+
+                 with open(metadata_path, 'w') as f:
+                     json.dump(metadata, f, indent=2)
+
+     def clear_all(self) -> None:
+         """Clear all cached data for this entity type."""
+         if self.cache_root.exists():
+             import shutil
+             shutil.rmtree(self.cache_root)
+         self.cache_root.mkdir(parents=True, exist_ok=True)
+         _LOGGER.info(f"Cleared all cache for {self.entity_type}")
+
+     def get_cache_info(self, entity_id: str) -> dict[str, Any]:
+         """Get information about cached data for an entity.
+
+         Args:
+             entity_id: Unique identifier for the entity
+
+         Returns:
+             Dictionary containing cache information
+         """
+         metadata_path = self._get_metadata_path(entity_id)
+
+         if not metadata_path.exists():
+             return {}
+
+         try:
+             with open(metadata_path, 'r') as f:
+                 return json.load(f)
+         except Exception as e:
+             _LOGGER.warning(f"Error reading cache info for {entity_id}: {e}")
+             return {}
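
The new CacheManager is generic over the cached payload type: entries are keyed by entity id and data key, bytes payloads are written raw while everything else is pickled, and freshness is checked by hashing server-supplied version metadata. A minimal usage sketch, assuming the module is importable from the package (the import path, entity id, and version dict below are illustrative placeholders, not taken from the diff):

    from datamint.api.cache_manager import CacheManager  # hypothetical import path

    cache: CacheManager[bytes] = CacheManager('resources')
    version = {'updated_at': '2024-01-01T00:00:00Z'}  # any JSON-serializable dict works

    cache.set('abc123', 'image_data', b'...raw bytes...', version_info=version)
    hit = cache.get('abc123', 'image_data', version_info=version)  # same hash -> cached bytes
    miss = cache.get('abc123', 'image_data',
                     version_info={'updated_at': '2024-06-01T00:00:00Z'})  # hash differs -> None
    cache.invalidate('abc123')  # drops everything cached for that entity

Passing no version_info skips the freshness check entirely, so callers that cannot cheaply obtain server metadata still get plain key-based caching.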
@@ -1,14 +1,24 @@
- """Project entity module for DataMint API."""
+ """Dataset entity module for DataMint API."""
  
  from datetime import datetime
  import logging
+ from typing import TYPE_CHECKING, Sequence
+
  from .base_entity import BaseEntity, MISSING_FIELD
  
+ if TYPE_CHECKING:
+     from datamint.api.client import Api
+     from .resource import Resource
+     from .project import Project
+
  logger = logging.getLogger(__name__)
  
  
  class DatasetInfo(BaseEntity):
      """Pydantic Model representing a DataMint dataset.
+
+     This class provides access to dataset information and related entities
+     like resources and projects.
      """
  
      id: str
@@ -20,3 +30,100 @@ class DatasetInfo(BaseEntity):
      updated_at: str | None
      total_resource: int
      resource_ids: list[str]
+
+     def __init__(self, **data):
+         """Initialize the dataset info entity."""
+         super().__init__(**data)
+         self._manager: EntityManager['DatasetInfo'] = EntityManager(self)
+
+         # Cache for lazy-loaded data
+         self._resources_cache: Sequence['Resource'] | None = None
+         self._projects_cache: Sequence['Project'] | None = None
+
+     def _inject_api(self, api: 'Api') -> None:
+         """Inject API client into this dataset (called automatically by Api class)."""
+         self._manager.set_api(api)
+
+     def get_resources(
+         self,
+         api: 'Api | None' = None,
+         refresh: bool = False,
+         limit: int | None = None
+     ) -> Sequence['Resource']:
+         """Get all resources in this dataset.
+
+         Results are cached after the first call unless refresh=True.
+
+         Args:
+             api: Optional API client. Uses the injected one if not provided.
+             refresh: If True, bypass cache and fetch fresh data
+             limit: Optional maximum number of resources to fetch
+
+         Returns:
+             List of Resource instances in this dataset
+
+         Raises:
+             RuntimeError: If no API client is available
+
+         Example:
+             >>> dataset = api.datasetsinfo.get_by_id("dataset-id")
+             >>> resources = dataset.get_resources()
+         """
+         if refresh or self._resources_cache is None:
+             api_client = self._manager._ensure_api(api)
+
+             # Fetch resources by their IDs, honoring the optional limit
+             resources = []
+             for resource_id in self.resource_ids[:limit]:
+                 try:
+                     resource = api_client.resources.get_by_id(resource_id)
+                     resource.set_api(api_client)
+                     resources.append(resource)
+                 except Exception as e:
+                     logger.warning(f"Failed to fetch resource {resource_id}: {e}")
+
+             self._resources_cache = resources
+
+         return self._resources_cache
+
+     def get_projects(
+         self,
+         api: 'Api | None' = None,
+         refresh: bool = False
+     ) -> Sequence['Project']:
+         """Get all projects associated with this dataset.
+
+         Results are cached after the first call unless refresh=True.
+
+         Args:
+             api: Optional API client. Uses the injected one if not provided.
+             refresh: If True, bypass cache and fetch fresh data
+
+         Returns:
+             List of Project instances
+
+         Raises:
+             RuntimeError: If no API client is available
+
+         Example:
+             >>> dataset = api.datasetsinfo.get_by_id("dataset-id")
+             >>> projects = dataset.get_projects()
+         """
+         if refresh or self._projects_cache is None:
+             api_client = self._manager._ensure_api(api)
+
+             # Get all projects and filter by dataset_id
+             all_projects = api_client.projects.get_all()
+             projects = [p for p in all_projects if p.dataset_id == self.id]
+
+             self._projects_cache = projects
+
+         return self._projects_cache
+
+     def invalidate_cache(self) -> None:
+         """Invalidate all cached relationship data.
+
+         This forces fresh data fetches on the next access.
+         """
+         self._resources_cache = None
+         self._projects_cache = None
+         logger.debug(f"Invalidated cache for dataset {self.id}")
+
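
DatasetInfo now lazy-loads its related entities and memoizes them per instance: the first call fetches from the server, later calls return the cached list until refresh=True or invalidate_cache() is used. A short sketch of the intended flow (the dataset id is a placeholder; the datasetsinfo accessor name is taken from the docstring examples above):

    dataset = api.datasetsinfo.get_by_id("dataset-id")

    resources = dataset.get_resources()            # first call hits the server, result memoized
    resources_again = dataset.get_resources()      # served from the in-memory cache
    projects = dataset.get_projects(refresh=True)  # bypasses the memoized list

    dataset.invalidate_cache()  # next get_resources()/get_projects() call refetches

Note that this per-instance memoization is independent of the on-disk CacheManager above: it only avoids repeat API calls within the lifetime of one DatasetInfo object.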
@@ -1,9 +1,15 @@
  """Project entity module for DataMint API."""
-
  from datetime import datetime
  import logging
+ from typing import Sequence, Literal, TYPE_CHECKING
  from .base_entity import BaseEntity, MISSING_FIELD
  from typing import Any
+ import webbrowser
+ from pydantic import PrivateAttr
+
+ if TYPE_CHECKING:
+     from datamint.api.endpoints.projects_api import ProjectsApi
+     from .resource import Resource
  
  logger = logging.getLogger(__name__)
  
@@ -36,7 +42,7 @@ class Project(BaseEntity):
      """
      id: str
      name: str
-     created_at: str  # ISO timestamp string
+     created_at: str
      created_by: str
      dataset_id: str
      worklist_id: str
@@ -50,17 +56,52 @@ class Project(BaseEntity):
      ai_model_id: str | None = MISSING_FIELD
      closed_resources_count: int = MISSING_FIELD
      resources_to_annotate_count: int = MISSING_FIELD
-     most_recent_experiment: str | None = MISSING_FIELD  # ISO timestamp string
+     most_recent_experiment: str | None = MISSING_FIELD
      annotators: list[dict] = MISSING_FIELD
-     customer_id: str | None = MISSING_FIELD
      archived_on: str | None = MISSING_FIELD
      archived_by: str | None = MISSING_FIELD
      is_active_learning: bool = MISSING_FIELD
      two_up_display: bool = MISSING_FIELD
      require_review: bool = MISSING_FIELD
  
+     _api: 'ProjectsApi' = PrivateAttr()
+
+     def fetch_resources(self) -> Sequence['Resource']:
+         """Fetch the resources associated with this project from the API.
+
+         Note that this always fetches fresh data from the server.
+
+         Returns:
+             List of Resource instances associated with the project.
+         """
+         return self._api.get_project_resources(self.id)
+
+     def set_work_status(self, resource: 'Resource', status: Literal['opened', 'annotated', 'closed']) -> None:
+         """Set the work status of a resource in this project.
+
+         Args:
+             resource: The resource whose status to update.
+             status: The new status to set.
+         """
+         return self._api.set_work_status(self, resource, status)
+
      @property
      def url(self) -> str:
          """Get the URL to access this project in the DataMint web application."""
-         base_url = "https://app.datamint.io/projects/edit"
-         return f"{base_url}/{self.id}"
+         base_url = self._api.config.web_app_url
+         return f'{base_url}/projects/edit/{self.id}'
+
+     def show(self) -> None:
+         """Open the project in the default web browser."""
+         webbrowser.open(self.url)
+
+     def as_torch_dataset(self,
+                          root_dir: str | None = None,
+                          auto_update: bool = True,
+                          return_as_semantic_segmentation: bool = False):
+         """Load this project as a torch-compatible datamint.dataset.Dataset."""
+         from datamint.dataset import Dataset
+         return Dataset(project_name=self.name,
+                        root=root_dir,
+                        auto_update=auto_update,
+                        return_as_semantic_segmentation=return_as_semantic_segmentation,
+                        all_annotations=True)
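
Taken together, the Project additions wire the entity to its endpoint through the _api private attribute, so a fetched project can act on the server directly. A sketch of the new surface, assuming projects are obtained via api.projects.get_all() as in the DatasetInfo code above (the project name below is a placeholder):

    project = next(p for p in api.projects.get_all() if p.name == "my-project")

    resources = project.fetch_resources()               # always fresh from the server
    project.set_work_status(resources[0], 'annotated')  # one of 'opened', 'annotated', 'closed'
    print(project.url)                                  # {web_app_url}/projects/edit/{id}
    project.show()                                      # opens that URL in the browser

    # Train-ready view: wraps the project as a torch-style dataset
    torch_ds = project.as_torch_dataset(return_as_semantic_segmentation=True)

The url property now derives its base from self._api.config.web_app_url instead of a hardcoded app.datamint.io address, so links follow whatever deployment the client is configured against.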