deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. deriva_ml/__init__.py +69 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +31 -0
  7. deriva_ml/catalog/clone.py +1939 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +845 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.12.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,145 @@
1
+ """Path builder mixin for DerivaML.
2
+
3
+ This module provides the PathBuilderMixin class which handles
4
+ catalog path building and table access utilities.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Any, Iterable
11
+
12
+ # Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
13
+ import importlib
14
+ datapath = importlib.import_module("deriva.core.datapath")
15
+ _ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
16
+ _ermrest_model = importlib.import_module("deriva.core.ermrest_model")
17
+
18
+ SchemaWrapper = datapath._SchemaWrapper
19
+ ErmrestCatalog = _ermrest_catalog.ErmrestCatalog
20
+ ErmrestSnapshot = _ermrest_catalog.ErmrestSnapshot
21
+ Table = _ermrest_model.Table
22
+
23
+ import pandas as pd
24
+
25
+ from deriva_ml.dataset.upload import table_path as _table_path
26
+
27
+ if TYPE_CHECKING:
28
+ from deriva_ml.model.catalog import DerivaModel
29
+
30
+
31
class PathBuilderMixin:
    """Mixin providing path building and table access utilities.

    This mixin requires the host class to have:
        - catalog: ErmrestCatalog or ErmrestSnapshot instance
        - domain_schemas: frozenset[str] - names of the domain schemas
        - default_schema: str | None - default domain schema, if one is set
        - model: DerivaModel instance
        - working_dir: Path - working directory path

    Methods:
        pathBuilder: Get catalog path builder for queries
        domain_path: Get path builder scoped to one domain schema
        table_path: Get local filesystem path for table CSV files
        get_table_as_dataframe: Get table contents as pandas DataFrame
        get_table_as_dict: Get table contents as dictionaries
    """

    # Type hints for IDE support - actual attributes come from the host class
    catalog: ErmrestCatalog | ErmrestSnapshot
    domain_schemas: frozenset[str]
    default_schema: str | None
    model: "DerivaModel"
    working_dir: Path

    def pathBuilder(self) -> datapath._CatalogWrapper:
        """Return the catalog path builder used for queries.

        The path builder provides a fluent interface for constructing queries
        against the catalog. Other methods of this mixin use it to reach
        schemas and tables.

        Returns:
            datapath._CatalogWrapper: The object returned by
                ``self.catalog.getPathBuilder()``.

        Example:
            >>> path = ml.pathBuilder().schemas['my_schema'].tables['my_table']
            >>> results = path.entities().fetch()
        """
        return self.catalog.getPathBuilder()

    def domain_path(self, schema: str | None = None) -> SchemaWrapper:
        """Return the path builder scoped to a domain schema.

        Args:
            schema: Schema name to get the path builder for. If None (or
                empty), the value of ``self.model._require_default_schema()``
                is used.

        Returns:
            The ``schemas[schema]`` entry of the catalog path builder.

        Raises:
            DerivaMLException: Presumably raised by
                ``model._require_default_schema()`` when no schema is given
                and no default schema is set - confirm against DerivaModel.
            KeyError: If the path builder has no schema with that name
                (assuming ``schemas`` behaves like a mapping).

        Example:
            >>> domain = ml.domain_path()  # Uses default schema
            >>> results = domain.my_table.entities().fetch()
            >>> # Or with explicit schema:
            >>> domain = ml.domain_path("my_schema")
        """
        schema = schema or self.model._require_default_schema()
        return self.pathBuilder().schemas[schema]

    def table_path(self, table: str | Table, schema: str | None = None) -> Path:
        """Return a local filesystem path for a table's CSV file.

        The path is computed by ``deriva_ml.dataset.upload.table_path`` from
        the working directory, the schema name and the resolved table name.

        Args:
            table: Name of the table or Table object to get the path for.
            schema: Schema name for the path. If None (or empty), the schema
                of the resolved table is used.

        Returns:
            Path: Filesystem path where the CSV file should be placed.

        Example:
            >>> path = ml.table_path("experiment_results")
            >>> df.to_csv(path)  # Save data for upload
        """
        table_obj = self.model.name_to_table(table)
        # An explicitly provided schema wins; otherwise fall back to the
        # schema that owns the resolved table.
        schema = schema or table_obj.schema.name
        return _table_path(
            self.working_dir,
            schema=schema,
            table=table_obj.name,
        )

    def get_table_as_dataframe(self, table: str | Table) -> pd.DataFrame:
        """Get table contents as a pandas DataFrame.

        Fully consumes get_table_as_dict() and builds one DataFrame row per
        returned dictionary.

        Args:
            table: Name of the table (or Table object) to retrieve.

        Returns:
            DataFrame containing all table contents.
        """
        return pd.DataFrame(list(self.get_table_as_dict(table)))

    def get_table_as_dict(self, table: str | Table) -> Iterable[dict[str, Any]]:
        """Get table contents as dictionaries.

        This is a generator function: the table lookup and the catalog query
        are only executed once iteration starts, so errors for an unknown
        table surface at that point rather than at call time.

        Args:
            table: Name of the table (or Table object) to retrieve.

        Returns:
            Iterable yielding one dictionary per row.
        """
        table_obj = self.model.name_to_table(table)
        pb = self.pathBuilder()
        yield from pb.schemas[table_obj.schema.name].tables[table_obj.name].entities().fetch()
@@ -0,0 +1,204 @@
1
+ """RID resolution mixin for DerivaML.
2
+
3
+ This module provides the RidResolutionMixin class which handles
4
+ Resource Identifier (RID) resolution and retrieval operations.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ # Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
13
+ import importlib
14
+ _datapath = importlib.import_module("deriva.core.datapath")
15
+ _ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
16
+ _ermrest_model = importlib.import_module("deriva.core.ermrest_model")
17
+
18
+ AnyQuantifier = _datapath.Any
19
+ ErmrestCatalog = _ermrest_catalog.ErmrestCatalog
20
+ ErmrestSnapshot = _ermrest_catalog.ErmrestSnapshot
21
+ ResolveRidResult = _ermrest_catalog.ResolveRidResult
22
+ Table = _ermrest_model.Table
23
+
24
+ from deriva_ml.core.definitions import RID
25
+ from deriva_ml.core.exceptions import DerivaMLException
26
+
27
+ if TYPE_CHECKING:
28
+ from deriva_ml.model.catalog import DerivaModel
29
+
30
+
31
@dataclass
class BatchRidResult:
    """Location of one RID found by a batch resolution.

    One instance is produced per resolved RID and records which table (and
    schema) the RID was found in.

    Attributes:
        rid: The resolved RID as returned by the catalog query.
        table: The Table object in which the RID was found.
        table_name: Name of that table.
        schema_name: Name of the schema that owns the table.
    """

    # Field order is part of the positional constructor signature.
    rid: RID
    table: Table
    table_name: str
    schema_name: str
46
+
47
+
48
class RidResolutionMixin:
    """Mixin providing RID resolution and retrieval operations.

    This mixin requires the host class to have:
        - catalog: ErmrestCatalog or ErmrestSnapshot instance
        - model: DerivaModel instance (with .model attribute for ermrest model)
        - pathBuilder(): method returning catalog path builder

    Methods:
        resolve_rid: Resolve a RID to its catalog location
        resolve_rids: Batch resolve multiple RIDs efficiently
        retrieve_rid: Retrieve the complete record for a RID
    """

    # Type hints for IDE support - actual attributes come from the host class
    catalog: ErmrestCatalog | ErmrestSnapshot
    model: "DerivaModel"
    pathBuilder: Any  # Callable returning path builder

    def resolve_rid(self, rid: RID) -> ResolveRidResult:
        """Resolve a RID to its catalog location.

        Delegates to ``self.catalog.resolve_rid(rid, self.model.model)``.

        Args:
            rid: Resource Identifier to resolve.

        Returns:
            ResolveRidResult: The value returned by the catalog. Its
                ``datapath`` attribute is a path builder for the entity (it is
                what retrieve_rid() uses).
                NOTE(review): check the deriva-py documentation for the other
                fields of ResolveRidResult before relying on them.

        Raises:
            DerivaMLException: If the catalog lookup raises KeyError, i.e. the
                RID could not be resolved. The KeyError is chained as the
                cause.

        Examples:
            >>> result = ml.resolve_rid("1-abc123")
            >>> data = result.datapath.entities().fetch()
        """
        try:
            return self.catalog.resolve_rid(rid, self.model.model)
        except KeyError as err:
            # Chain the original error so the underlying lookup failure stays
            # visible in tracebacks.
            raise DerivaMLException(f"Invalid RID {rid}") from err

    def retrieve_rid(self, rid: RID) -> dict[str, Any]:
        """Retrieve the complete record for a RID.

        Resolves the RID, fetches the entities on the resulting datapath and
        returns the first one.

        Args:
            rid: Resource Identifier of the record to retrieve.

        Returns:
            dict[str, Any]: Dictionary containing the column values of the entity.

        Raises:
            DerivaMLException: If the RID cannot be resolved (see resolve_rid).

        Example:
            >>> record = ml.retrieve_rid("1-abc123")
            >>> print(f"Name: {record['name']}, Created: {record['creation_date']}")
        """
        # Resolve RID and fetch the first (expected to be the only) matching record
        return self.resolve_rid(rid).datapath.entities().fetch()[0]

    def resolve_rids(
        self,
        rids: set[RID] | list[RID],
        candidate_tables: list[Table] | None = None,
    ) -> dict[RID, BatchRidResult]:
        """Batch resolve multiple RIDs.

        Instead of one lookup per RID, this issues at most one query per
        candidate table, asking for all still-unresolved RIDs at once, and
        stops as soon as every RID has been found.

        Args:
            rids: Set or list of RIDs to resolve. Duplicates are ignored.
            candidate_tables: Optional list of Table objects to search in.
                If None, all tables of the model's domain schemas and ML
                schema are searched.

        Returns:
            dict[RID, BatchRidResult]: Mapping from each resolved RID to its
                BatchRidResult containing table information. Empty if ``rids``
                is empty.

        Raises:
            DerivaMLException: If any RID was not found in the tables that
                could be queried. Tables whose query fails are skipped (and
                logged at debug level), so a RID living only in such a table
                is reported as invalid.

        Example:
            >>> results = ml.resolve_rids(["1-ABC", "2-DEF", "3-GHI"])
            >>> for rid, info in results.items():
            ...     print(f"{rid} is in table {info.table_name}")
        """
        # Local import keeps this block self-contained; logging is stdlib.
        import logging

        remaining_rids = set(rids)
        if not remaining_rids:
            return {}

        results: dict[RID, BatchRidResult] = {}

        # Determine which tables to search
        if candidate_tables is None:
            # Search all tables in domain and ML schemas
            candidate_tables = []
            for schema_name in [*self.model.domain_schemas, self.model.ml_schema]:
                schema = self.model.model.schemas.get(schema_name)
                if schema:
                    candidate_tables.extend(schema.tables.values())

        pb = self.pathBuilder()

        # Query each candidate table for matching RIDs
        for table in candidate_tables:
            if not remaining_rids:
                break

            schema_name = table.schema.name
            table_name = table.name
            table_path = pb.schemas[schema_name].tables[table_name]

            try:
                # Filter: RID = any(rid1, rid2, ...) - ERMrest's way of doing
                # an IN clause. Only the RID column is requested to minimize
                # data transfer.
                found_entities = list(
                    table_path.filter(table_path.RID == AnyQuantifier(*remaining_rids))
                    .attributes(table_path.RID)
                    .fetch()
                )
            except Exception:
                # Best effort: the table might not support this query, so skip
                # it - but leave a trace, otherwise a connectivity or
                # permission failure only shows up as "Invalid RIDs" below.
                logging.getLogger(__name__).debug(
                    "Skipping table %s.%s during batch RID resolution",
                    schema_name,
                    table_name,
                    exc_info=True,
                )
                continue

            # Record every returned RID that is still unresolved
            for entity in found_entities:
                rid = entity["RID"]
                if rid in remaining_rids:
                    results[rid] = BatchRidResult(
                        rid=rid,
                        table=table,
                        table_name=table_name,
                        schema_name=schema_name,
                    )
                    remaining_rids.remove(rid)

        # Check if any RIDs were not found
        if remaining_rids:
            raise DerivaMLException(f"Invalid RIDs: {remaining_rids}")

        return results