datamint 2.3.5__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


datamint/api/base_api.py CHANGED
@@ -61,22 +61,56 @@ class BaseApi:
        client: Optional HTTP client instance. If None, a new one will be created.
        """
        self.config = config
-       self.client = client or self._create_client()
+       self._owns_client = client is None  # Track if we created the client
+       self.client = client or BaseApi._create_client(config)
        self.semaphore = asyncio.Semaphore(20)
        self._api_instance: 'Api | None' = None  # Injected by Api class

-   def _create_client(self) -> httpx.Client:
-       """Create and configure HTTP client with authentication and timeouts."""
-       headers = None
-       if self.config.api_key:
-           headers = {"apikey": self.config.api_key}
+   @staticmethod
+   def _create_client(config: ApiConfig) -> httpx.Client:
+       """Create and configure HTTP client with authentication and timeouts.
+
+       The client is designed to be long-lived and reused across multiple requests.
+       It maintains connection pooling for improved performance.
+       Default limits: max_keepalive_connections=20, max_connections=100
+       """
+       headers = {"apikey": config.api_key} if config.api_key else None

        return httpx.Client(
-           base_url=self.config.server_url,
+           base_url=config.server_url,
            headers=headers,
-           timeout=self.config.timeout
+           timeout=config.timeout,
+           limits=httpx.Limits(
+               max_keepalive_connections=5,  # Decreased from the default of 20
+               max_connections=20,           # Decreased from the default of 100
+               keepalive_expiry=8
+           )
        )

+   def close(self) -> None:
+       """Close the HTTP client and release resources.
+
+       Should be called when the API instance is no longer needed.
+       Only closes the client if it was created by this instance.
+       """
+       if self._owns_client and self.client is not None:
+           self.client.close()
+
+   def __enter__(self):
+       """Context manager entry."""
+       return self
+
+   def __exit__(self, exc_type, exc_val, exc_tb):
+       """Context manager exit - ensures client is closed."""
+       self.close()
+
+   def __del__(self):
+       """Destructor - ensures client is closed when instance is garbage collected."""
+       try:
+           self.close()
+       except Exception:
+           pass  # Ignore errors during cleanup
+
    def _stream_request(self, method: str, endpoint: str, **kwargs):
        """Make streaming HTTP request with error handling.
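
A minimal usage sketch of the new client lifecycle, assuming `BaseApi` (or a subclass) is constructed as `BaseApi(config, client=None)` and that `ApiConfig` carries the `server_url`, `api_key`, and `timeout` fields referenced above; the constructor call and config values are illustrative, not taken from this diff:

from datamint.api.base_api import BaseApi  # module path as shown in this diff

# ApiConfig construction is illustrative; import it from wherever the package defines it.
config = ApiConfig(server_url="https://api.datamint.io", api_key="...", timeout=60)

# The context manager closes the internally created httpx.Client on exit.
with BaseApi(config) as api:
    ...  # issue requests through api.client

# An externally supplied client is not closed by close(), since _owns_client is False.
shared_client = BaseApi._create_client(config)
api = BaseApi(config, client=shared_client)
api.close()            # no-op: this instance does not own the client
shared_client.close()  # the caller remains responsible for it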
datamint/api/client.py CHANGED
@@ -68,6 +68,8 @@ class Api:
                f" Please check your api_key and/or other configurations. {e}")

    def _get_endpoint(self, name: str):
+       if self._client is None:
+           self._client = BaseApi._create_client(self.config)
        if name not in self._endpoints:
            api_class = self._API_MAP[name]
            endpoint = api_class(self.config, self._client)
@@ -25,6 +25,7 @@ import nest_asyncio # For running asyncio in jupyter notebooks
from PIL import Image
import io
from datamint.types import ImagingData
+from collections import defaultdict


_LOGGER = logging.getLogger(__name__)
@@ -279,7 +280,6 @@ class ResourcesApi(CreatableEntityApi[Resource], DeletableEntityApi[Resource]):
        _LOGGER.warning(msg)
        _USER_LOGGER.warning(msg)

-
        mimetype = standardize_mimetype(mimetype)

        if is_a_dicom_file == True or is_dicom(file_path):
@@ -440,12 +440,12 @@ class ResourcesApi(CreatableEntityApi[Resource], DeletableEntityApi[Resource]):

        try:
            tasks = [__upload_single_resource(f, segfiles, metadata_file)
-                    for f, segfiles, metadata_file in zip(files_path, segmentation_files, metadata_files)]
+                     for f, segfiles, metadata_file in zip(files_path, segmentation_files, metadata_files)]
        except ValueError:
            msg = f"Error preparing upload tasks. Try `assemble_dicom=False`."
            _LOGGER.error(msg)
            _USER_LOGGER.error(msg)
-           raise
+           raise
        return await asyncio.gather(*tasks, return_exceptions=on_error == 'skip')

    def upload_resources(self,
@@ -996,22 +996,28 @@ class ResourcesApi(CreatableEntityApi[Resource], DeletableEntityApi[Resource]):
            raise

    def set_tags(self,
-                resource: str | Resource,
+                resource: str | Resource | Sequence[str | Resource],
                 tags: Sequence[str],
                 ):
        """
        Set tags for a resource, IMPORTANT: This replaces all existing tags.
        Args:
-           resource: The resource unique id or Resource object.
+           resource: The resource object or a list of resources.
            tags: The tags to set.
        """
        data = {'tags': tags}
-       resource_id = self._entid(resource)
-
-       response = self._make_entity_request('PUT',
-                                            resource_id,
-                                            add_path='tags',
-                                            json=data)
+       if isinstance(resource, Sequence):
+           resource_ids = [self._entid(res) for res in resource]
+           response = self._make_request('PUT',
+                                         f'{self.endpoint_base}/tags',
+                                         json={'resource_ids': resource_ids,
+                                               'tags': tags})
+       else:
+           resource_id = self._entid(resource)
+           response = self._make_entity_request('PUT',
+                                                resource_id,
+                                                add_path='tags',
+                                                json=data)
        return response

    # def get_projects(self, resource: Resource) -> Sequence[Project]:
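
A hedged sketch of the extended `set_tags`, assuming the `ResourcesApi` instance is reachable as `api.resources` on a configured `Api` object (that attribute name is an assumption, not shown in this diff); note that `set_tags` replaces all existing tags on the targeted resources:

# Batch form added in 2.4.1: a single PUT to <endpoint_base>/tags with resource_ids and tags.
api.resources.set_tags(["res-id-1", "res-id-2"], ["reviewed"])  # ids and tags are illustrative

# A single Resource object still goes through the per-entity tags path, as before.
resource = api.resources.get_by_id("res-id-1")
api.resources.set_tags(resource, ["reviewed", "chest-xray"])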
@@ -1029,7 +1035,7 @@ class ResourcesApi(CreatableEntityApi[Resource], DeletableEntityApi[Resource]):
        # return [proj for proj in self.projects_api.get_all() if proj.id in proj_ids]

    def add_tags(self,
-                resource: str | Resource,
+                resource: str | Resource | Sequence[str | Resource],
                 tags: Sequence[str],
                 ):
        """
@@ -1040,8 +1046,26 @@ class ResourcesApi(CreatableEntityApi[Resource], DeletableEntityApi[Resource]):
        """
        if isinstance(resource, str):
            resource = self.get_by_id(resource)
+       elif isinstance(resource, Sequence):
+           # Transform every str to Resource first.
+           resources = [self.get_by_id(res) if isinstance(res, str) else res for res in resource]
+
+           # group resources having the exact same tags to minimize requests
+           tag_map: dict[tuple, list[Resource]] = defaultdict(list)
+           for res in resources:
+               old_tags = res.tags if res.tags is not None else []
+               # key = tuple(sorted(old_tags))
+               key = tuple(old_tags)  # keep order, assuming order matters for tags
+               tag_map[key].append(res)
+
+           # finally, set tags for each group
+           for old_tags_tuple, res_group in tag_map.items():
+               old_tags = list(old_tags_tuple)
+               self.set_tags(res_group, old_tags + list(tags))
+           return
+
        old_tags = resource.tags if resource.tags is not None else []
-       return self.set_tags(resource, old_tags + list(tags))
+       self.set_tags(resource, old_tags + list(tags))

    def bulk_delete(self, entities: Sequence[str | Resource]) -> None:
        """Delete multiple entities. Faster than deleting them one by one.
@@ -30,7 +30,6 @@ ResourceFields: TypeAlias = Literal['modality', 'created_by', 'published_by', 'p

_PAGE_LIMIT = 5000

-
@deprecated(reason="Please use `from datamint import Api` instead.", version="2.0.0")
class BaseAPIHandler:
    """
@@ -178,6 +178,8 @@ class CreateAnnotationDto:
        if model_id is not None:
            if is_model == False:
                raise ValueError("model_id==False while self.model_id is provided.")
+           if not isinstance(model_id, str):
+               raise ValueError("model_id must be a string if provided.")
            is_model = True
        self.is_model = is_model
        self.geometry = geometry
@@ -307,6 +307,10 @@ class DatamintBaseDataset:
        self.image_lsets, self.image_lcodes = self._get_labels_set(framed=False)
        worklist_id = self.get_info()['worklist_id']
        groups: dict[str, dict] = self.api.annotationsets.get_segmentation_group(worklist_id)['groups']
+       if not groups:
+           self.seglabel_list = []
+           self.seglabel2code = {}
+           return
        # order by 'index' key
        max_index = max([g['index'] for g in groups.values()])
        self.seglabel_list : list[str] = ['UNKNOWN'] * max_index  # 1-based
@@ -0,0 +1 @@
+from .datamintdatamodule import DatamintDataModule
@@ -0,0 +1,103 @@
+from torch.utils.data import DataLoader
+from datamint import Dataset
+import lightning as L
+from typing import Any
+from copy import copy
+import numpy as np
+
+
+class DatamintDataModule(L.LightningDataModule):
+    """
+    LightningDataModule for Datamint datasets with train/val split.
+    TODO: Add support for test and predict dataloaders.
+    """
+
+    def __init__(
+        self,
+        project_name: str = "./",
+        batch_size: int = 32,
+        image_transform=None,
+        mask_transform=None,
+        alb_transform=None,
+        alb_train_transform=None,
+        alb_val_transform=None,
+        train_split: float = 0.9,
+        val_split: float = 0.1,
+        seed: int = 42,
+        num_workers: int = 4,
+        **dataset_kwargs: Any,
+    ):
+        super().__init__()
+        self.project_name = project_name
+        self.batch_size = batch_size
+        self.image_transform = image_transform
+        self.mask_transform = mask_transform
+
+        if alb_transform is not None and (alb_train_transform is not None or alb_val_transform is not None):
+            raise ValueError("You cannot specify both `alb_transform` and `alb_train_transform`/`alb_val_transform`.")
+
+        # Handle backward compatibility for alb_transform
+        if alb_transform is not None:
+            self.alb_train_transform = alb_transform
+            self.alb_val_transform = alb_transform
+        else:
+            self.alb_train_transform = alb_train_transform
+            self.alb_val_transform = alb_val_transform
+
+        self.train_split = train_split
+        self.val_split = val_split
+        self.seed = seed
+        self.dataset_kwargs = dataset_kwargs
+        self.num_workers = num_workers
+
+        self.dataset = None
+
+    def prepare_data(self) -> None:
+        """Download or update data if needed."""
+        Dataset(
+            project_name=self.project_name,
+            auto_update=True,
+        )
+
+    def setup(self, stage: str = None) -> None:
+        """Set up datasets and perform train/val split."""
+        if self.dataset is None:
+            # Create base dataset for getting indices
+            self.dataset = Dataset(
+                return_as_semantic_segmentation=True,
+                semantic_seg_merge_strategy="union",
+                return_frame_by_frame=True,
+                include_unannotated=False,
+                project_name=self.project_name,
+                image_transform=self.image_transform,
+                mask_transform=self.mask_transform,
+                alb_transform=None,  # No transform for base dataset
+                auto_update=False,
+                **self.dataset_kwargs,
+            )
+
+        indices = list(copy(self.dataset.subset_indices))
+        rs = np.random.RandomState(self.seed)
+        rs.shuffle(indices)
+        train_end = int(self.train_split * len(indices))
+        train_idx = indices[:train_end]
+        val_idx = indices[train_end:]
+
+        self.train_dataset = copy(self.dataset).subset(train_idx)
+        self.train_dataset.alb_transform = self.alb_train_transform
+        self.val_dataset = copy(self.dataset).subset(val_idx)
+        self.val_dataset.alb_transform = self.alb_val_transform
+
+    def train_dataloader(self) -> DataLoader:
+        return self.train_dataset.get_dataloader(batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)
+
+    def val_dataloader(self) -> DataLoader:
+        return self.val_dataset.get_dataloader(batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
+
+    def test_dataloader(self):
+        # Use the same dataloader as validation for testing, because we have so few samples
+        return self.val_dataset.get_dataloader(batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
+
+    def predict_dataloader(self):
+        # Use the same dataloader as validation for testing, because we have so few samples
+        return self.val_dataset.get_dataloader(batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
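
A minimal training sketch for the new `DatamintDataModule` with Lightning; the import path and `MySegmentationModel` are placeholders, since this diff does not show the package location of the module or any model code:

import lightning as L
from datamint.lightning import DatamintDataModule  # assumed import path, exposed via the __init__ above

datamodule = DatamintDataModule(
    project_name="my-project",  # illustrative project name
    batch_size=8,
    train_split=0.9,
    seed=42,
)
model = MySegmentationModel()   # placeholder for a user-defined LightningModule
trainer = L.Trainer(max_epochs=10)
trainer.fit(model, datamodule=datamodule)  # runs prepare_data(), setup(), then the dataloaders above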
@@ -0,0 +1,46 @@
+# Monkey patch mlflow.tracking._tracking_service.utils.get_tracking_uri
+from .tracking.fluent import set_project
+import mlflow.tracking._tracking_service.utils as mlflow_utils
+from functools import wraps
+import logging
+from .env_utils import setup_mlflow_environment, ensure_mlflow_configured
+
+_LOGGER = logging.getLogger(__name__)
+
+# Store reference to original function
+_original_get_tracking_uri = mlflow_utils.get_tracking_uri
+_SETUP_CALLED_SUCCESSFULLY = False
+
+
+@wraps(_original_get_tracking_uri)
+def _patched_get_tracking_uri(*args, **kwargs):
+    """Patched version of get_tracking_uri that ensures MLflow environment is set up first.
+
+    This wrapper ensures that setup_mlflow_environment is called before any tracking
+    URI operations, guaranteeing proper MLflow configuration.
+
+    Args:
+        *args: Arguments passed to the original get_tracking_uri function.
+        **kwargs: Keyword arguments passed to the original get_tracking_uri function.
+
+    Returns:
+        The result of the original get_tracking_uri function.
+    """
+    global _SETUP_CALLED_SUCCESSFULLY
+    if _SETUP_CALLED_SUCCESSFULLY:
+        return _original_get_tracking_uri(*args, **kwargs)
+    try:
+        _SETUP_CALLED_SUCCESSFULLY = setup_mlflow_environment(set_mlflow=True)
+    except Exception as e:
+        _SETUP_CALLED_SUCCESSFULLY = False
+        _LOGGER.error("Failed to set up MLflow environment: %s", e)
+    ret = _original_get_tracking_uri(*args, **kwargs)
+    return ret
+
+
+setup_mlflow_environment(set_mlflow=False)
+# Replace the original function with our patched version
+mlflow_utils.get_tracking_uri = _patched_get_tracking_uri
+
+
+__all__ = ['set_project', 'setup_mlflow_environment', 'ensure_mlflow_configured']
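
Since importing this module patches `mlflow_utils.get_tracking_uri` at import time, a plain MLflow workflow picks up the Datamint-derived tracking URI and token without extra calls. A hedged sketch; the `datamint.mlflow` import path is an assumption, as the diff does not show where this `__init__` lives:

import datamint.mlflow  # assumed import path; importing applies the monkey patch
import mlflow

# The first tracking-URI lookup triggers setup_mlflow_environment(set_mlflow=True).
with mlflow.start_run():
    mlflow.log_metric("val_dice", 0.87)  # illustrative metric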
@@ -0,0 +1 @@
+from .datamint_artifacts_repo import DatamintArtifactsRepository
@@ -0,0 +1,8 @@
+from mlflow.store.artifact.mlflow_artifacts_repo import MlflowArtifactsRepository
+
+
+class DatamintArtifactsRepository(MlflowArtifactsRepository):
+    @classmethod
+    def resolve_uri(cls, artifact_uri, tracking_uri):
+        tracking_uri = tracking_uri.split('datamint://', maxsplit=1)[-1]
+        return super().resolve_uri(artifact_uri, tracking_uri)
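
The override only strips a leading `datamint://` scheme from the tracking URI before delegating to MLflow's standard resolver; a small illustration with made-up URIs:

# Equivalent to MlflowArtifactsRepository.resolve_uri(..., tracking_uri="http://localhost:5000")
DatamintArtifactsRepository.resolve_uri(
    artifact_uri="mlflow-artifacts:/1/run-id/artifacts",  # illustrative artifact URI
    tracking_uri="datamint://http://localhost:5000",      # 'datamint://' prefix is removed
)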
@@ -0,0 +1,109 @@
+"""
+Utility functions for automatically configuring MLflow environment variables
+based on Datamint configuration.
+"""
+
+import os
+import logging
+from typing import Optional
+from urllib.parse import urlparse
+from datamint import configs
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def get_datamint_api_url() -> Optional[str]:
+    """Get the Datamint API URL from configuration or environment variables."""
+    # First check environment variable
+    api_url = os.getenv('DATAMINT_API_URL')
+    if api_url:
+        return api_url
+
+    # Then check configuration
+    api_url = configs.get_value(configs.APIURL_KEY)
+    if api_url:
+        return api_url
+
+    return None
+
+
+def get_datamint_api_key() -> Optional[str]:
+    """Get the Datamint API key from configuration or environment variables."""
+    # First check environment variable
+    api_key = os.getenv('DATAMINT_API_KEY')
+    if api_key:
+        return api_key
+
+    # Then check configuration
+    api_key = configs.get_value(configs.APIKEY_KEY)
+    if api_key:
+        return api_key
+
+    return None
+
+
+def _get_mlflowdatamint_uri() -> Optional[str]:
+    api_url = get_datamint_api_url()
+    if not api_url:
+        return None
+    _LOGGER.debug(f"Retrieved Datamint API URL: {api_url}")
+
+    # Remove trailing slash if present
+    api_url = api_url.rstrip('/')
+    # api_url samples:
+    # https://api.datamint.io
+    # http://localhost:3001
+
+    parsed_url = urlparse(api_url)
+    base_url = f"{parsed_url.scheme}://{parsed_url.hostname}"
+    _LOGGER.debug(f"Derived base URL for MLflow Datamint: {base_url}")
+    # FIXME: It should work with https or datamint-api server should forward https requests.
+    base_url = base_url.replace('https://', 'http://')
+    if len(base_url.replace('http:', '')) == 0:
+        return None
+
+    mlflow_uri = f"{base_url}:5000"
+    return mlflow_uri
+
+
+def setup_mlflow_environment(overwrite: bool = False,
+                             set_mlflow: bool = True) -> bool:
+    """
+    Automatically set up MLflow environment variables based on Datamint configuration.
+
+    Returns:
+        bool: True if MLflow environment was successfully configured, False otherwise.
+    """
+    _LOGGER.debug("Setting up MLflow environment variables from Datamint configuration")
+    api_key = get_datamint_api_key()
+    mlflow_uri = _get_mlflowdatamint_uri()
+    if not mlflow_uri or not api_key:
+        _LOGGER.warning("Datamint configuration incomplete, cannot auto-configure MLflow")
+        return False
+
+    if overwrite or not os.getenv('MLFLOW_TRACKING_TOKEN'):
+        os.environ['MLFLOW_TRACKING_TOKEN'] = api_key
+    if overwrite or not os.getenv('MLFLOW_TRACKING_URI'):
+        os.environ['MLFLOW_TRACKING_URI'] = mlflow_uri
+
+    if set_mlflow:
+        import mlflow
+        mlflow.set_tracking_uri(mlflow_uri)
+
+    return True
+
+
+def ensure_mlflow_configured() -> None:
+    """
+    Ensure MLflow environment is properly configured.
+    Raises an exception if configuration is incomplete.
+    """
+    if not setup_mlflow_environment():
+        if not os.getenv('MLFLOW_TRACKING_URI') or not os.getenv('MLFLOW_TRACKING_TOKEN'):
+            raise ValueError(
+                "MLflow environment not configured. Please either:\n"
+                "1. Run 'datamint-config' to set up Datamint configuration, or\n"
+                "2. Set DATAMINT_API_URL and DATAMINT_API_KEY environment variables, or\n"
+                "3. Manually set MLFLOW_TRACKING_URI and MLFLOW_TRACKING_TOKEN environment variables"
+            )
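
Given `_get_mlflowdatamint_uri` above, a worked example of the URI derivation with an illustrative API URL, repeating the same string handling outside the module:

from urllib.parse import urlparse

# Illustrative DATAMINT_API_URL value.
api_url = "https://api.datamint.io/".rstrip('/')
parsed = urlparse(api_url)
base_url = f"{parsed.scheme}://{parsed.hostname}".replace('https://', 'http://')
mlflow_uri = f"{base_url}:5000"
print(mlflow_uri)  # -> http://api.datamint.io:5000

# setup_mlflow_environment() then exports, unless already set:
#   MLFLOW_TRACKING_URI   = mlflow_uri
#   MLFLOW_TRACKING_TOKEN = <the Datamint API key>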
@@ -0,0 +1,5 @@
+from enum import Enum
+
+class EnvVars(Enum):
+    DATAMINT_PROJECT_ID = "DATAMINT_PROJECT_ID"
+    DATAMINT_PROJECT_NAME = "DATAMINT_PROJECT_NAME"
@@ -0,0 +1 @@
+from .modelcheckpoint import MLFlowModelCheckpoint