dsgrid_toolkit-0.3.3-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,19 @@
+ """Constructs a report instance from a report type."""
+
+ from dsgrid.query.models import ReportType
+ from dsgrid.query.report_peak_load import PeakLoadReport
+ from dsgrid.query.reports_base import ReportsBase
+
+
+ _TYPE_TO_CLASS = {
+     ReportType.PEAK_LOAD: PeakLoadReport,
+ }
+
+
+ def make_report(report_type: ReportType) -> ReportsBase:
+     """Make a report instance from a report_type."""
+     cls = _TYPE_TO_CLASS.get(report_type)
+     if cls is None:
+         msg = str(report_type)
+         raise NotImplementedError(msg)
+     return cls()
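For orientation, a minimal usage sketch of the factory above; obtaining a query result to feed the report is out of scope here.

```python
# Sketch: construct a report object from its enum type.
from dsgrid.query.models import ReportType
from dsgrid.query.report_factory import make_report

report = make_report(ReportType.PEAK_LOAD)  # returns a PeakLoadReport instance
# A ReportType without an entry in _TYPE_TO_CLASS raises NotImplementedError.
```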
@@ -0,0 +1,70 @@
+ import logging
+ from pathlib import Path
+
+ from dsgrid.common import VALUE_COLUMN
+ from dsgrid.data_models import DSGBaseModel
+ from dsgrid.dataset.models import ValueFormat
+ from dsgrid.dimension.base_models import DimensionType
+ from dsgrid.exceptions import DSGInvalidQuery
+ from dsgrid.query.models import ProjectQueryModel
+ from dsgrid.spark.functions import join_multiple_columns
+ from dsgrid.spark.types import F
+ from dsgrid.utils.dataset import ordered_subset_columns
+ from dsgrid.utils.files import delete_if_exists
+ from dsgrid.utils.spark import read_dataframe
+ from .query_context import QueryContext
+ from .reports_base import ReportsBase
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class PeakLoadInputModel(DSGBaseModel):
+     group_by_columns: list[str]
+
+
+ class PeakLoadReport(ReportsBase):
+     """Find peak load in a derived dataset."""
+
+     REPORT_FILENAME = "peak_load.parquet"
+
+     def check_query(self, query: ProjectQueryModel) -> None:
+         if query.result.table_format.format_type != ValueFormat.STACKED:
+             msg = "The PeakLoadReport requires the value format to be stacked."
+             raise DSGInvalidQuery(msg)
+
+     def generate(
+         self,
+         filename: Path,
+         output_dir: Path,
+         context: QueryContext,
+         inputs: PeakLoadInputModel,
+     ) -> Path:
+         value_columns = [VALUE_COLUMN]
+         metric_columns = context.get_dimension_column_names(DimensionType.METRIC)
+         if len(metric_columns) > 1:
+             msg = f"Bug: {metric_columns=}"
+             raise Exception(msg)
+         metric_column = next(iter(metric_columns))
+         group_by_columns = inputs.group_by_columns[:]
+         if metric_column not in group_by_columns:
+             group_by_columns.append(metric_column)
+
+         df = read_dataframe(filename)
+         expr = [F.max(x).alias(x) for x in value_columns]
+         peak_load = df.groupBy(*group_by_columns).agg(*expr)
+         join_cols = group_by_columns + value_columns
+         time_columns = context.get_dimension_column_names(DimensionType.TIME)
+         diff = time_columns.difference(df.columns)
+         if diff:
+             msg = f"BUG: expected time column(s) {diff} are not present in table"
+             raise Exception(msg)
+         columns = ordered_subset_columns(df, time_columns) + join_cols
+         with_time = join_multiple_columns(peak_load, df.select(*columns), join_cols).sort(
+             *group_by_columns
+         )
+         output_file = output_dir / PeakLoadReport.REPORT_FILENAME
+         delete_if_exists(output_file)
+         with_time.write.parquet(str(output_file))
+         logger.info("Wrote Peak Load Report to %s", output_file)
+         return output_file
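The generate method above takes a group-by max and then joins back to the original table on the group columns plus the value column, so that the timestamps at which each peak occurs survive the aggregation. A plain-Python sketch of that pattern, with hypothetical column names, for readers unfamiliar with the Spark idiom:

```python
# Sketch of the peak-load pattern: max per group, then join back on the
# group columns plus the value column to recover the peak timestamps.
rows = [
    {"county": "A", "end_use": "cooling", "timestamp": "2030-07-01 17:00", "value": 5.0},
    {"county": "A", "end_use": "cooling", "timestamp": "2030-07-02 17:00", "value": 7.5},
    {"county": "B", "end_use": "cooling", "timestamp": "2030-07-01 17:00", "value": 3.2},
]
group_by = ("county", "end_use")

# Equivalent of df.groupBy(*group_by_columns).agg(F.max(...)).
peaks: dict[tuple, float] = {}
for row in rows:
    key = tuple(row[c] for c in group_by)
    peaks[key] = max(peaks.get(key, float("-inf")), row["value"])

# Equivalent of the join back on join_cols: keep rows whose value equals the
# group's peak, which carries the timestamp column along.
peak_rows = [r for r in rows if r["value"] == peaks[tuple(r[c] for c in group_by)]]
print(peak_rows)  # one peak row per (county, end_use) group, timestamps included
```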
@@ -0,0 +1,20 @@
+ import abc
+ from pathlib import Path
+ from typing import Any
+
+ from dsgrid.query.models import ProjectQueryModel
+ from dsgrid.query.query_context import QueryContext
+
+
+ class ReportsBase(abc.ABC):
+     """Base class for pre-defined reports"""
+
+     @abc.abstractmethod
+     def check_query(self, query: ProjectQueryModel) -> None:
+         """Check compatibility of the user query with the report."""
+
+     @abc.abstractmethod
+     def generate(
+         self, filename: Path, output_dir: Path, context: QueryContext, inputs: Any
+     ) -> Path:
+         """Generate the report from the table in filename into output_dir."""
File without changes
@@ -0,0 +1,165 @@
+ import getpass
+ import logging
+ import shutil
+ from pathlib import Path
+ from uuid import uuid4
+
+ from dsgrid.config.registration_models import RegistrationModel, RegistrationJournal
+ from dsgrid.dimension.base_models import DatasetDimensionRequirements
+ from dsgrid.registry.registry_manager import RegistryManager
+ from dsgrid.utils.id_remappings import (
+     map_dimension_ids_to_names,
+     map_dimension_names_to_ids,
+     map_dimension_mapping_names_to_ids,
+     replace_dimension_mapping_names_with_current_ids,
+     replace_dimension_names_with_current_ids,
+ )
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def bulk_register(
+     registry_manager: RegistryManager,
+     registration_file: Path,
+     data_base_dir: Path | None = None,
+     missing_associations_base_dir: Path | None = None,
+     repo_base_dir: Path | None = None,
+     journal_file: Path | None = None,
+     dataset_dimension_requirements: DatasetDimensionRequirements | None = None,
+ ):
+     """Bulk register projects, datasets, and their dimensions. If any failure occurs, the code
+     records successfully registered project and dataset IDs to a journal file and prints its
+     filename to the console. Users can pass that filename with the --journal-file option to
+     avoid re-registering those projects and datasets on subsequent attempts.
+
+     The JSON/JSON5 file must match the data model defined by this documentation:
+
+     https://dsgrid.github.io/dsgrid/reference/data_models/project.html#dsgrid.config.registration_models.RegistrationModel
+     """
+     registration = RegistrationModel.from_file(registration_file)
+     tmp_files = []
+     if journal_file is None:
+         journal_file = Path(f"journal__{uuid4()}.json5")
+         journal = RegistrationJournal()
+     else:
+         journal = RegistrationJournal.from_file(journal_file)
+         registration = registration.filter_by_journal(journal)
+     failure_occurred = False
+     try:
+         return _run_bulk_registration(
+             registry_manager,
+             registration,
+             tmp_files,
+             data_base_dir,
+             missing_associations_base_dir,
+             repo_base_dir,
+             journal,
+             dataset_dimension_requirements,
+         )
+     except Exception:
+         failure_occurred = True
+         raise
+     finally:
+         if failure_occurred and journal.has_entries():
+             journal_file.write_text(journal.model_dump_json(indent=2), encoding="utf-8")
+             logger.info(
+                 "Recorded successfully registered projects and datasets to %s. "
+                 "Pass this file to the `--journal-file` option of this command to skip those IDs "
+                 "on subsequent attempts.",
+                 journal_file,
+             )
+         elif journal_file.exists():
+             journal_file.unlink()
+             logger.info("Deleted journal file %s after successful registration.", journal_file)
+         for path in tmp_files:
+             path.unlink()
+
+
+ def _run_bulk_registration(
+     mgr: RegistryManager,
+     registration: RegistrationModel,
+     tmp_files: list[Path],
+     data_base_dir: Path | None,
+     missing_associations_base_dir: Path | None,
+     base_repo_dir: Path | None,
+     journal: RegistrationJournal,
+     dataset_dimension_requirements: DatasetDimensionRequirements | None,
+ ):
+     user = getpass.getuser()
+     project_mgr = mgr.project_manager
+     dataset_mgr = mgr.dataset_manager
+     dim_mgr = mgr.dimension_manager
+     dim_mapping_mgr = mgr.dimension_mapping_manager
+
+     if base_repo_dir is not None:
+         for project in registration.projects:
+             if not project.config_file.is_absolute():
+                 project.config_file = base_repo_dir / project.config_file
+         for dataset in registration.datasets:
+             if not dataset.config_file.is_absolute():
+                 dataset.config_file = base_repo_dir / dataset.config_file
+         for dataset in registration.dataset_submissions:
+             for field in (
+                 "dimension_mapping_file",
+                 "dimension_mapping_references_file",
+             ):
+                 path = getattr(dataset, field)
+                 if path is not None and not path.is_absolute():
+                     setattr(dataset, field, base_repo_dir / path)
+
+     for project in registration.projects:
+         assert project.log_message is not None
+         project_mgr.register(project.config_file, user, project.log_message)
+         journal.add_project(project.project_id)
+
+     for dataset in registration.datasets:
+         config_file = None
+         if dataset.replace_dimension_names_with_ids:
+             mappings = map_dimension_names_to_ids(dim_mgr)
+             orig = dataset.config_file
+             config_file = orig.with_stem(orig.name + "__tmp")
+             shutil.copyfile(orig, config_file)
+             tmp_files.append(config_file)
+             replace_dimension_names_with_current_ids(config_file, mappings)
+         else:
+             config_file = dataset.config_file
+
+         assert dataset.log_message is not None
+         dataset_mgr.register(
+             config_file,
+             user,
+             dataset.log_message,
+             data_base_dir=data_base_dir,
+             missing_associations_base_dir=missing_associations_base_dir,
+             requirements=dataset_dimension_requirements,
+         )
+         journal.add_dataset(dataset.dataset_id)
+
+     for dataset in registration.dataset_submissions:
+         refs_file = None
+         if (
+             dataset.replace_dimension_mapping_names_with_ids
+             and dataset.dimension_mapping_references_file is not None
+         ):
+             dim_id_to_name = map_dimension_ids_to_names(mgr.dimension_manager)
+             mappings = map_dimension_mapping_names_to_ids(dim_mapping_mgr, dim_id_to_name)
+             orig = dataset.dimension_mapping_references_file
+             refs_file = orig.with_stem(orig.name + "__tmp")
+             shutil.copyfile(orig, refs_file)
+             tmp_files.append(refs_file)
+             replace_dimension_mapping_names_with_current_ids(refs_file, mappings)
+         else:
+             refs_file = dataset.dimension_mapping_references_file
+
+         assert dataset.log_message is not None
+         project_mgr.submit_dataset(
+             dataset.project_id,
+             dataset.dataset_id,
+             user,
+             dataset.log_message,
+             dimension_mapping_file=dataset.dimension_mapping_file,
+             dimension_mapping_references_file=refs_file,
+             autogen_reverse_supplemental_mappings=dataset.autogen_reverse_supplemental_mappings,
+         )
+         journal.add_submitted_dataset(dataset.dataset_id, dataset.project_id)
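As a sketch of the input, the content of a registration file for bulk_register might look like the following, shown here as the equivalent Python dict. The top-level keys and field names mirror the attributes accessed above; the IDs and paths are hypothetical, and the authoritative schema is the RegistrationModel documentation linked in the docstring.

```python
# Hypothetical registration input (a JSON5 file with this structure).
registration_content = {
    "projects": [
        {
            "project_id": "my_project",
            "config_file": "project/project.json5",
            "log_message": "Register my_project",
        },
    ],
    "datasets": [
        {
            "dataset_id": "my_dataset",
            "config_file": "datasets/my_dataset/dataset.json5",
            "replace_dimension_names_with_ids": True,
            "log_message": "Register my_dataset",
        },
    ],
    "dataset_submissions": [
        {
            "project_id": "my_project",
            "dataset_id": "my_dataset",
            "log_message": "Submit my_dataset to my_project",
        },
    ],
}
```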
@@ -0,0 +1,287 @@
+ """Common definitions for registry components"""
+
+ import logging
+ import re
+ from collections import namedtuple
+ from datetime import datetime
+ from enum import StrEnum
+ from pathlib import Path
+
+ from pydantic import Field
+
+ from dsgrid.data_models import DSGBaseModel
+ from dsgrid.exceptions import DSGInvalidParameter
+ from dsgrid.utils.versioning import make_version
+
+
+ REGISTRY_LOG_FILE = "dsgrid_registry.log"
+ # Allows letters, numbers, underscores, spaces, dashes
+ REGEX_VALID_REGISTRY_NAME = re.compile(r"^[\w -]+$")
+ # Allows letters, numbers, underscores, dashes, forward slashes
+ REGEX_VALID_REGISTRY_CONFIG_ID_LOOSE = re.compile(r"^[\w/-]+$")
+ # Allows letters, numbers, underscores.
+ # dataset_id cannot start with a number because of uses in DatasetExpressionHandler.
+ # It's likely a good rule everywhere else.
+ REGEX_VALID_REGISTRY_CONFIG_ID_STRICT = re.compile(r"^[a-zA-Z][\w]+$")
+
+ REGISTRY_ID_DELIMITER = "__"
+
+ logger = logging.getLogger(__name__)
+
+
+ def check_config_id_loose(config_id, tag):
+     # Raises ValueError because this is used in Pydantic models.
+     if not REGEX_VALID_REGISTRY_CONFIG_ID_LOOSE.search(config_id):
+         msg = f"{tag} ID={config_id} is invalid. Restricted to letters, numbers, underscores, and dashes."
+         raise ValueError(msg)
+
+
+ def check_config_id_strict(config_id, tag):
+     # Raises ValueError because this is used in Pydantic models.
+     if not REGEX_VALID_REGISTRY_CONFIG_ID_STRICT.search(config_id):
+         msg = (
+             f"{tag} ID={config_id} is invalid. Restricted to letters, numbers, and underscores. "
+             "Cannot start with a number."
+         )
+         raise ValueError(msg)
+
+
+ class DatabaseConnection(DSGBaseModel):
+     """Input information to connect to a registry database"""
+
+     url: str
+     # These attributes are commented-out because the registry is currently only
+     # supported in SQLite. If/when we add Postgres support, these can be added back.
+     # database: str = "dsgrid"
+     # hostname: str = "localhost"
+     # port: int = 8529
+     # username: str = "root"
+     # password: str = DEFAULT_DB_PASSWORD
+
+     # @classmethod
+     # def from_url(cls, url, **kwargs):
+     #     """Create a connection from a URL."""
+     #     regex = re.compile(r"http://(.*):(\d+)")
+     #     match = regex.search(url)
+     #     if match is None:
+     #         raise DSGInvalidParameter(f"Invalid URL format: {url}")
+     #     hostname = match.group(1)
+     #     port = match.group(2)
+     #     return cls(hostname=hostname, port=port, **kwargs)
+
+     def get_filename(self) -> Path:
+         """Return the filename from the URL. Only valid for SQLite databases.
+
+         Raises
+         ------
+         DSGInvalidParameter
+             Raised if the URL does not conform to the SQLite format.
+         """
+         # All call sites will need to be changed if/when we support Postgres.
+         filename = self.try_get_filename()
+         if filename is None:
+             msg = (
+                 f"Failed to parse '{self.url}' into a SQLite URL. "
+                 "The SQLite file path must be specified in the format 'sqlite:///</path/to/db_file.db>'."
+             )
+             raise DSGInvalidParameter(msg)
+         return filename
+
+     def try_get_filename(self) -> Path | None:
+         """Return the filename from the URL, if file-based, otherwise None."""
+         regex = re.compile(r"sqlite:\/\/\/(.*)")
+         match = regex.search(self.url)
+         if not match:
+             return None
+         return Path(match.group(1))
+
+
+ class DataStoreType(StrEnum):
+     """Specifies the type of data store used for the registry."""
+
+     FILESYSTEM = "filesystem"
+     DUCKDB = "duckdb"
+
+
+ class RegistryType(StrEnum):
+     """Registry types"""
+
+     DATASET = "dataset"
+     DIMENSION = "dimension"
+     DIMENSION_MAPPING = "dimension_mapping"
+     PROJECT = "project"
+
+
+ MODEL_TYPE_TO_ID_FIELD_MAPPING = {
+     RegistryType.PROJECT: "project_id",
+     RegistryType.DATASET: "dataset_id",
+     RegistryType.DIMENSION: "dimension_id",
+     RegistryType.DIMENSION_MAPPING: "mapping_id",
+ }
+
+
+ class RegistryTables(StrEnum):
+     """Registry tables"""
+
+     KEY_VALUE = "key_value"
+     CURRENT_VERSIONS = "current_versions"
+     MODELS = "models"
+     REGISTRATIONS = "registrations"
+     CONTAINS = "contains"
+
+
+ class DatasetRegistryStatus(StrEnum):
+     """Statuses for a dataset within a project"""
+
+     UNREGISTERED = "Unregistered"
+     REGISTERED = "Registered"
+
+
+ class ProjectRegistryStatus(StrEnum):
+     """Statuses for a project within the dsgrid registry"""
+
+     INITIAL_REGISTRATION = "Initial Registration"
+     IN_PROGRESS = "In Progress"
+     COMPLETE = "Complete"
+     PUBLISHED = "Published"
+     DEPRECATED = "Deprecated"
+
+
+ class VersionUpdateType(StrEnum):
+     """Types of updates that can be made to projects, datasets, and dimensions"""
+
+     # TODO: we need to find general version update types that can be mapped to
+     # major, minor, and patch, e.g., replace input_dataset, fix project_config.
+     MAJOR = "major"
+     MINOR = "minor"
+     PATCH = "patch"
+
+
+ # These keys are used to store references to project/dataset configs and dimensions
+ # in dictionaries.
+ ConfigKey = namedtuple("ConfigKey", ["id", "version"])
+
+ # Convenience container to be shared among the registry managers.
+ # Obviates the need to pass parameters to many constructors.
+ RegistryManagerParams = namedtuple(
+     "RegistryManagerParams",
+     [
+         "base_path",
+         "remote_path",
+         "use_remote_data",
+         "fs_interface",
+         "cloud_interface",
+         "offline",
+         "scratch_dir",
+     ],
+ )
+
+
+ class RegistrationModel(DSGBaseModel):
+     """Registration fields required by the ProjectConfig and DatasetConfig"""
+
+     id: int | None = Field(default=None, description="database ID of the registration")
+     timestamp: datetime = Field(
+         title="timestamp",
+         description="Registration timestamp",
+     )
+     submitter: str = Field(
+         title="submitter",
+         description="Username that submitted the registration",
+     )
+     log_message: str | None = Field(
+         default=None,
+         title="log_message",
+         description="Reason for the update",
+     )
+     update_type: VersionUpdateType = Field(
+         title="update_type",
+         description="Type of update",
+     )
+
+
+ def get_version_from_filename(filename):
+     """Return the handle and version from a registry file."""
+     regex = re.compile(r"(?P<handle>\w+)-v(?P<version>[\d\.]+)\.json5")
+     match = regex.search(filename)
+     assert match, filename
+     return match.group("handle"), make_version(match.group("version"))
+
+
+ def make_filename_from_version(handle, version):
+     """Make a filename with the handle and version."""
+     return f"{handle}-v{version}.json5"
+
+
+ # def update_version(id_handle, update, registry_path):
+ #     """Determine registration or project version for registration.
+ #
+ #     TODO: Current solution is a quick hack. This needs to be better/formalized.
+ #     - Need smarter version updating / checks; use semver packages
+ #     - Set to work with some central version (like S3)
+ #     - Currently only updating major version
+ #     - NOTE: not currently utilizing the update_type in RegistrationModel.
+ #       Could use this to set major/minor/patch update decisions.
+ #
+ #     Args:
+ #         registry_type (RegistryType): type of registry (e.g., Project, Dataset)
+ #         id_handle (str): ID handle is either the project_id or dataset_id
+ #         update (bool): config registration update setting
+ #     """
+ #
+ #     # TODO: remove when done. project path should be set somewhere else
+ #     if not os.path.exists(registry_path):
+ #         raise ValueError(f"Path does not exist: {registry_path}")
+ #
+ #     # if config.update is False, then assume major=1, minor=0, patch=0
+ #     if not update:
+ #         version = VersionInfo(major=1)
+ #         registry_file = Path(registry_path) / make_filename_from_version(id_handle, version)
+ #         # Raise error if v1.0.0 registry exists for project_id
+ #         if os.path.exists(registry_file):
+ #             raise ValueError(
+ #                 f'{registry_type} registry for "{registry_file}" already '
+ #                 f"exists. If you want to update the project registration"
+ #                 f" with a new {registry_type} version, then you will need to"
+ #                 f" set update=True in {registry_type} config. Alternatively, "
+ #                 f"if you want to initiate a new dsgrid {registry_type}, you "
+ #                 "will need to specify a new version handle in the "
+ #                 f"{registry_type} config."
+ #             )
+ #     # if update is true...
+ #     else:
+ #         # list existing project registries
+ #         existing_versions = []
+ #         for f in os.listdir(registry_path):
+ #             handle, version = get_version_from_filename(f)
+ #             if handle == id_handle:
+ #                 existing_versions.append(version)
+ #         # check for existing project registries
+ #         if not existing_versions:
+ #             raise ValueError(
+ #                 "Registration.update=True, however, no updates can be made "
+ #                 f"because there are no existing registries for {registry_type}"
+ #                 f" ID = {id_handle}. Check project_id or set "
+ #                 f"Registration.update=False in the {registry_type} Config."
+ #             )
+ #         # find the latest registry version
+ #         # NOTE: this is currently based on major version only
+ #         last_version = sorted(existing_versions)[-1]
+ #         old_project_version = make_filename_from_version(id_handle, last_version)
+ #         old_registry_file = os.path.join(registry_path, old_project_version)
+ #
+ #         # deprecate old project registry
+ #         t = deserialize_registry(old_registry_file)
+ #         # DT: Can we use an enum here? Spelling/capitalization mistakes could be costly.
+ #         # Deprecated is a project status.
+ #         t["status"] = "Deprecated"
+ #         # DT: can we use version
+ #         t["version"] = last_version.bump_major()
+ #         # TODO: deserialize_registry should have returned a Pydantic model
+ #         serialize_registry(t, make_filename_from_version(id_handle, t["version"]))
+ #
+ #     return version
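A short sketch of the SQLite URL handling in DatabaseConnection above; the path is made up.

```python
from dsgrid.registry.common import DatabaseConnection

conn = DatabaseConnection(url="sqlite:///registries/standard.db")
print(conn.try_get_filename())  # Path("registries/standard.db")

other = DatabaseConnection(url="http://localhost:8529")
print(other.try_get_filename())  # None; get_filename() would raise DSGInvalidParameter
```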
@@ -0,0 +1,63 @@
+ import abc
+ import logging
+
+ from dsgrid.exceptions import DSGInvalidOperation
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class ConfigUpdateCheckerBase(abc.ABC):
+     """Base class for updating all config models"""
+
+     def __init__(self, old_model, new_model):
+         self._old_model = old_model
+         self._new_model = new_model
+         assert type(self._old_model) is type(self._new_model)  # noqa: E721
+         self._type = type(self._old_model)
+         self._changed_fields = set()
+
+     def _check_common(self):
+         for field, attrs in self._type.model_fields.items():
+             old = getattr(self._old_model, field)
+             new = getattr(self._new_model, field)
+             if old != new:
+                 extra = attrs.json_schema_extra
+                 if extra and not extra.get("updateable", True):
+                     msg = f"{self._type}.{field} cannot be updated"
+                     raise DSGInvalidOperation(msg)
+                 self._changed_fields.add(field)
+                 logger.info("%s %s changed from %s to %s.", self._type, field, old, new)
+         # FUTURE: We could recurse into each dsgrid pydantic model and check each individual
+         # field. Would also need to handle lists and dicts of models.
+         # This would allow more precise control of changed fields and much better logging.
+
+     @abc.abstractmethod
+     def check_preconditions(self):
+         """Check preconditions for performing an update.
+
+         Raises
+         ------
+         DSGInvalidRegistryState
+             Raised if a precondition is violated.
+         """
+
+     @abc.abstractmethod
+     def handle_postconditions(self):
+         """Handle any required postconditions."""
+
+     def run(self):
+         """Run all checks.
+
+         Raises
+         ------
+         DSGInvalidOperation
+             Raised if the user is changing an immutable field.
+         DSGInvalidRegistryState
+             Raised if a precondition is violated.
+         """
+         self.check_preconditions()
+         self._check_common()
+         self.handle_postconditions()
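A minimal sketch of how the updateable flag in json_schema_extra interacts with _check_common; the model and checker subclass here are hypothetical.

```python
from pydantic import Field

from dsgrid.data_models import DSGBaseModel
from dsgrid.exceptions import DSGInvalidOperation
from dsgrid.registry.config_update_checker_base import ConfigUpdateCheckerBase


class ExampleModel(DSGBaseModel):
    name: str
    # updateable=False makes _check_common reject any change to this field.
    config_id: str = Field(json_schema_extra={"updateable": False})


class ExampleChecker(ConfigUpdateCheckerBase):
    def check_preconditions(self):
        pass

    def handle_postconditions(self):
        pass


old = ExampleModel(name="a", config_id="id1")
new = ExampleModel(name="a", config_id="id2")
try:
    ExampleChecker(old, new).run()
except DSGInvalidOperation as exc:
    print(exc)  # config_id cannot be updated
```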
@@ -0,0 +1,34 @@
+ from pathlib import Path
+
+ from dsgrid.registry.common import DataStoreType
+ from dsgrid.registry.data_store_interface import DataStoreInterface
+ from dsgrid.registry.duckdb_data_store import DuckDbDataStore
+ from dsgrid.registry.filesystem_data_store import FilesystemDataStore
+
+
+ def make_data_store(
+     base_path: Path, data_store_type: DataStoreType, initialize: bool = False
+ ) -> DataStoreInterface:
+     """Factory function to create a data store.
+
+     Parameters
+     ----------
+     base_path : Path
+         The base path for the data store.
+     data_store_type : DataStoreType
+         The type of data store to create.
+     initialize : bool
+         Whether to initialize the data store.
+     """
+     match data_store_type:
+         case DataStoreType.FILESYSTEM:
+             cls = FilesystemDataStore
+         case DataStoreType.DUCKDB:
+             cls = DuckDbDataStore
+         case _:
+             msg = f"Unsupported data store type: {data_store_type}"
+             raise NotImplementedError(msg)
+
+     if initialize:
+         return cls.create(base_path)
+     return cls.load(base_path)
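A usage sketch of the factory, with a made-up base path: initialize=True creates a new store via cls.create, while the default re-opens an existing one via cls.load.

```python
from pathlib import Path

from dsgrid.registry.common import DataStoreType
from dsgrid.registry.data_store_factory import make_data_store

base = Path("registry-data")  # hypothetical location
store = make_data_store(base, DataStoreType.DUCKDB, initialize=True)  # create new
store = make_data_store(base, DataStoreType.DUCKDB)  # re-open existing
```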