dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,46 @@
1
+ from sqlalchemy import Connection
2
+
3
+ from dsgrid.config.dataset_config import DatasetConfig
4
+ from dsgrid.dataset.models import TableFormat
5
+ from dsgrid.dataset.dataset_schema_handler_two_table import TwoTableDatasetSchemaHandler
6
+ from dsgrid.dataset.dataset_schema_handler_one_table import OneTableDatasetSchemaHandler
7
+ from dsgrid.registry.data_store_interface import DataStoreInterface
8
+ from dsgrid.registry.dimension_registry_manager import DimensionRegistryManager
9
+ from dsgrid.registry.dimension_mapping_registry_manager import DimensionMappingRegistryManager
10
+ from dsgrid.config.dimension_mapping_base import DimensionMappingReferenceModel
11
+ from dsgrid.utils.scratch_dir_context import ScratchDirContext
12
+
13
+
14
def make_dataset_schema_handler(
    conn: Connection | None,
    config: DatasetConfig,
    dimension_mgr: DimensionRegistryManager,
    dimension_mapping_mgr: DimensionMappingRegistryManager,
    store: DataStoreInterface | None = None,
    mapping_references: list[DimensionMappingReferenceModel] | None = None,
    scratch_dir_context: ScratchDirContext | None = None,
):
    """Load the dataset schema handler that matches the config's table format.

    The handler class is selected from ``config.get_table_format()`` and then
    constructed through its ``load`` classmethod; all remaining arguments are
    forwarded unchanged.

    Raises
    ------
    NotImplementedError
        Raised if the table format is neither TWO_TABLE nor ONE_TABLE.

    """
    table_format = config.get_table_format()
    if table_format == TableFormat.TWO_TABLE:
        handler_cls = TwoTableDatasetSchemaHandler
    elif table_format == TableFormat.ONE_TABLE:
        handler_cls = OneTableDatasetSchemaHandler
    else:
        msg = f"Unsupported table format: {config.get_table_format()}"
        raise NotImplementedError(msg)

    # Both handlers share the same load() signature, so one call site suffices.
    return handler_cls.load(
        config,
        conn,
        dimension_mgr,
        dimension_mapping_mgr,
        store=store,
        mapping_references=mapping_references,
        scratch_dir_context=scratch_dir_context,
    )
@@ -0,0 +1,136 @@
1
+ import logging
2
+ from datetime import datetime, timedelta, tzinfo
3
+ from zoneinfo import ZoneInfo
4
+
5
+ import pandas as pd
6
+
7
+ import chronify
8
+
9
+ from dsgrid.dimension.time import TimeZoneFormat, TimeIntervalType
10
+ from .dimensions import DateTimeDimensionModel
11
+ from .time_dimension_base_config import TimeDimensionBaseConfig
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class DateTimeDimensionConfig(TimeDimensionBaseConfig):
17
+ """Provides an interface to a DateTimeDimensionModel."""
18
+
19
+ @staticmethod
20
+ def model_class() -> DateTimeDimensionModel:
21
+ return DateTimeDimensionModel
22
+
23
+ def supports_chronify(self) -> bool:
24
+ return True
25
+
26
+ def to_chronify(self) -> chronify.DatetimeRange:
27
+ time_cols = self.get_load_data_time_columns()
28
+ assert len(self._model.ranges) == 1
29
+ assert len(time_cols) == 1
30
+ # TODO: issue #341: this is actually tied to the weather_year problem #340
31
+ # If there are no ranges, all of this must be dynamic.
32
+ # The two issues should be solved together.
33
+ datetime_type = self._get_datetime_type()
34
+
35
+ if datetime_type == "tz_aware_datetime_single_tz":
36
+ return chronify.DatetimeRange(
37
+ time_column=time_cols[0],
38
+ start=pd.Timestamp(self.get_start_times()[0]),
39
+ length=self.get_lengths()[0],
40
+ resolution=self.get_frequency(),
41
+ measurement_type=self._model.measurement_type,
42
+ interval_type=self._model.time_interval_type,
43
+ )
44
+ if datetime_type == "tz_naive_datetime_single_tz":
45
+ # localize to time zones, may do this outside of Chronify
46
+ msg = "dsgrid does not support NTZ datetime in dataframe yet"
47
+ raise NotImplementedError(msg)
48
+ if datetime_type == "tz_aware_datetime_multiple_tz":
49
+ return chronify.DatetimeRangeWithTZColumn(
50
+ time_column=time_cols[0],
51
+ start=pd.Timestamp(self.get_start_times()[0]),
52
+ length=self.get_lengths()[0],
53
+ resolution=self.get_frequency(),
54
+ time_zone_column="time_zone",
55
+ time_zones=self.get_time_zones(),
56
+ measurement_type=self._model.measurement_type,
57
+ interval_type=self._model.time_interval_type,
58
+ )
59
+ if datetime_type == "tz_naive_datetime_multiple_tz":
60
+ # localize to time zones, may do this outside of Chronify
61
+ msg = "dsgrid does not support NTZ datetime in dataframe yet"
62
+ raise NotImplementedError(msg)
63
+
64
+ def get_frequency(self) -> timedelta:
65
+ freqs = [trange.frequency for trange in self.model.ranges]
66
+ if len(set(freqs)) > 1:
67
+ msg = f"DateTimeDimensionConfig.get_frequency found multiple frequencies: {freqs}"
68
+ raise ValueError(msg)
69
+ return freqs[0]
70
+
71
+ def get_start_times(self) -> list[pd.Timestamp]:
72
+ tz = self.get_tzinfo()
73
+ start_times = []
74
+ for trange in self.model.ranges:
75
+ start = datetime.strptime(trange.start, trange.str_format)
76
+ assert start.tzinfo is None
77
+ start_times.append(start.replace(tzinfo=tz))
78
+ return start_times
79
+
80
+ def get_lengths(self) -> list[int]:
81
+ tz = self.get_tzinfo()
82
+ lengths = []
83
+ for trange in self.model.ranges:
84
+ start = datetime.strptime(trange.start, trange.str_format)
85
+ end = datetime.strptime(trange.end, trange.str_format)
86
+ assert start.tzinfo is None
87
+ assert end.tzinfo is None
88
+ start_utc = start.replace(tzinfo=tz).astimezone(tz=ZoneInfo("UTC"))
89
+ end_utc = end.replace(tzinfo=tz).astimezone(tz=ZoneInfo("UTC"))
90
+ freq = trange.frequency
91
+ length = (end_utc - start_utc) / freq + 1
92
+ assert length % 1 == 0, f"{length=} is not a whole number"
93
+ lengths.append(int(length))
94
+ return lengths
95
+
96
+ def get_load_data_time_columns(self) -> list[str]:
97
+ return [self.model.time_column]
98
+
99
+ def get_time_zone(self) -> str | None:
100
+ if self.model.time_zone_format.format_type == TimeZoneFormat.ALIGNED_IN_ABSOLUTE_TIME:
101
+ return self.model.time_zone_format.time_zone
102
+ return None
103
+
104
+ def get_time_zones(self) -> list[str]:
105
+ if self.model.time_zone_format.format_type == TimeZoneFormat.ALIGNED_IN_ABSOLUTE_TIME:
106
+ return [self.model.time_zone_format.time_zone]
107
+ if self.model.time_zone_format.format_type == TimeZoneFormat.ALIGNED_IN_CLOCK_TIME:
108
+ return self.model.time_zone_format.time_zones
109
+ return []
110
+
111
+ def get_tzinfo(self) -> tzinfo | None:
112
+ time_zone = self.get_time_zone()
113
+ if time_zone is None:
114
+ return None
115
+ return ZoneInfo(time_zone)
116
+
117
+ def get_time_interval_type(self) -> TimeIntervalType:
118
+ return self.model.time_interval_type
119
+
120
+ def _get_datetime_type(self) -> str:
121
+ """Return a string representing the datetime type for this dimension."""
122
+ match (self.model.time_zone_format.format_type, self.model.localize_to_time_zone):
123
+ case (TimeZoneFormat.ALIGNED_IN_ABSOLUTE_TIME, True):
124
+ return "tz_aware_datetime_single_tz"
125
+ case (TimeZoneFormat.ALIGNED_IN_ABSOLUTE_TIME, False):
126
+ return "tz_naive_datetime_single_tz"
127
+ case (TimeZoneFormat.ALIGNED_IN_CLOCK_TIME, True):
128
+ return "tz_aware_datetime_multiple_tz"
129
+ case (TimeZoneFormat.ALIGNED_IN_CLOCK_TIME, False):
130
+ return "tz_naive_datetime_multiple_tz"
131
+ case _:
132
+ msg = (
133
+ f"Unsupported combination of format_type {self.model.time_zone_format.format_type} "
134
+ f"and localize_to_time_zone {self.model.localize_to_time_zone}"
135
+ )
136
+ raise ValueError(msg)
@@ -0,0 +1,54 @@
1
+ import abc
2
+ import logging
3
+ from typing import Union
4
+
5
+ from .config_base import ConfigBase, ConfigWithRecordFileBase
6
+ from .dimensions import DimensionModel
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class DimensionBaseConfigWithFiles(ConfigWithRecordFileBase, abc.ABC):
    """Abstract base for dimension configs whose model exposes records."""

    @staticmethod
    def config_filename():
        return "dimension.json5"

    @property
    def config_id(self):
        return self.model.dimension_id

    def get_unique_ids(self) -> set[str]:
        """Collect the distinct ``id`` values across the model's records.

        Returns
        -------
        set
            set of str

        """
        records = self.model.records
        return set(record.id for record in records)
32
+
33
+
34
class DimensionBaseConfigWithoutFiles(ConfigBase, abc.ABC):
    """Base class for dimension configs that derive from ConfigBase directly.

    Unlike DimensionBaseConfigWithFiles, this class does not define
    ``get_unique_ids``.
    """

    @staticmethod
    def config_filename() -> str:
        """Return the file name used for dimension configs."""
        return "dimension.json5"

    @property
    def config_id(self):
        """Return the model's ``dimension_id``."""
        return self.model.dimension_id
44
+
45
+
46
class DimensionConfig(DimensionBaseConfigWithFiles):
    """Provides an interface to a DimensionModel."""

    @staticmethod
    def model_class() -> type[DimensionModel]:
        """Return the model class (not an instance) that backs this config."""
        return DimensionModel
52
+
53
+
54
# Type alias covering both dimension config base classes defined in this module.
DimensionBaseConfig = Union[DimensionBaseConfigWithFiles, DimensionBaseConfigWithoutFiles]
@@ -0,0 +1,65 @@
1
+ from dsgrid.dimension.time import TimeDimensionType
2
+ from dsgrid.utils.files import load_data
3
+ from .date_time_dimension_config import DateTimeDimensionConfig
4
+ from .annual_time_dimension_config import AnnualTimeDimensionConfig
5
+ from .noop_time_dimension_config import NoOpTimeDimensionConfig
6
+ from .index_time_dimension_config import IndexTimeDimensionConfig
7
+ from .dimension_config import DimensionConfig
8
+ from .representative_period_time_dimension_config import RepresentativePeriodTimeDimensionConfig
9
+ from .dimensions import (
10
+ DateTimeDimensionModel,
11
+ DimensionModel,
12
+ DimensionType,
13
+ AnnualTimeDimensionModel,
14
+ RepresentativePeriodTimeDimensionModel,
15
+ NoOpTimeDimensionModel,
16
+ IndexTimeDimensionModel,
17
+ )
18
+
19
+
20
def get_dimension_config(model):
    """Wrap a dimension model in the config class that matches its type.

    Parameters
    ----------
    model
        Instance of one of the dimension model classes imported by this module.

    Returns
    -------
    The config object constructed from ``model``.

    Raises
    ------
    AssertionError
        Raised if ``model`` is not an instance of any supported model class.

    """
    # Checked in order and the first match wins. The order is kept from the
    # original isinstance chain in case any of these classes are related by
    # inheritance.
    config_class_by_model_class = (
        (DateTimeDimensionModel, DateTimeDimensionConfig),
        (AnnualTimeDimensionModel, AnnualTimeDimensionConfig),
        (RepresentativePeriodTimeDimensionModel, RepresentativePeriodTimeDimensionConfig),
        (DimensionModel, DimensionConfig),
        (NoOpTimeDimensionModel, NoOpTimeDimensionConfig),
        (IndexTimeDimensionModel, IndexTimeDimensionConfig),
    )
    for model_class, config_class in config_class_by_model_class:
        if isinstance(model, model_class):
            return config_class(model)

    # Raise explicitly instead of `assert False`: assert statements are removed
    # under `python -O`, which would make this function silently return None.
    msg = f"Unsupported dimension model type: {type(model)}"
    raise AssertionError(msg)
35
+
36
+
37
+ def load_dimension_config(filename):
38
+ """Loads a dimension config file before the exact type is known.
39
+
40
+ Parameters
41
+ ----------
42
+ filename : Path
43
+
44
+ Returns
45
+ -------
46
+ DimensionBaseConfig
47
+
48
+ """
49
+ data = load_data(filename)
50
+ if data["type"] == DimensionType.TIME.value:
51
+ if data["time_type"] == TimeDimensionType.DATETIME.value:
52
+ return DateTimeDimensionConfig.load(filename)
53
+ elif data["time_type"] == TimeDimensionType.ANNUAL.value:
54
+ return AnnualTimeDimensionConfig.load(filename)
55
+ elif data["time_type"] == TimeDimensionType.REPRESENTATIVE_PERIOD.value:
56
+ return RepresentativePeriodTimeDimensionConfig.load(filename)
57
+ elif data["time_type"] == TimeDimensionType.NOOP.value:
58
+ return NoOpTimeDimensionConfig.load(filename)
59
+ elif data["time_type"] == TimeDimensionType.INDEX.value:
60
+ return IndexTimeDimensionConfig.load(filename)
61
+ else:
62
+ msg = f"time_type={data['time_type']} not supported"
63
+ raise ValueError(msg)
64
+
65
+ return DimensionConfig.load(filename)
@@ -0,0 +1,350 @@
1
+ import logging
2
+
3
+
4
+ from pydantic import Field, ValidationInfo, field_validator, model_validator
5
+
6
+ from dsgrid.data_models import DSGBaseDatabaseModel, DSGBaseModel, DSGEnum, EnumValue
7
+ from dsgrid.dimension.base_models import DimensionType
8
+ from dsgrid.exceptions import DSGInvalidDimensionMapping
9
+ from .dimensions import DimensionReferenceModel
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class DimensionMappingType(DSGEnum):
    """Defines the operation dsgrid will apply to the data during a mapping.

    Members are grouped by how the from_fraction column is treated. Each type
    is paired with a DimensionMappingArchetype in
    DimensionMappingBaseModel.check_archetype.
    """

    # optional from_fraction col, FRACTION_SUM_EQ1 when grouped by from_id
    ONE_TO_ONE = "one_to_one"  # includes rename, down-selection
    MANY_TO_ONE_AGGREGATION = "many_to_one_aggregation"
    MANY_TO_ONE_REASSIGNMENT = "many_to_one_reassignment"

    # optional from_fraction col, no FRACTION_SUM check
    DUPLICATION = "duplication"

    # required from_fraction col, FRACTION_SUM_EQ1 when grouped by from_id
    ONE_TO_MANY_DISAGGREGATION = "one_to_many_disaggregation"
    MANY_TO_MANY_AGGREGATION = "many_to_many_aggregation"
    MANY_TO_MANY_DISAGGREGATION = "many_to_many_disaggregation"

    # required from_fraction col, FRACTION_SUM_EQ1 when grouped by to_id
    MANY_TO_ONE_ASSIGNMENT = "many_to_one_assignment"
    ONE_TO_MANY_ASSIGNMENT = "one_to_many_assignment"
    MANY_TO_MANY_ASSIGNMENT = "many_to_many_assignment"

    # required from_fraction col, no FRACTION_SUM check
    ONE_TO_ONE_EXPLICIT_MULTIPLIERS = "one_to_one_explicit_multipliers"
    ONE_TO_MANY_EXPLICIT_MULTIPLIERS = "one_to_many_explicit_multipliers"
    MANY_TO_ONE_EXPLICIT_MULTIPLIERS = "many_to_one_explicit_multipliers"
    MANY_TO_MANY_EXPLICIT_MULTIPLIERS = "many_to_many_explicit_multipliers"
40
+
41
+
42
class DimensionMappingArchetype(DSGEnum):
    """Dimension mapping archetype, used to check whether duplicates are allowed in from/to
    dimensions and apply rules about the sum of the from_fraction column.

    Each member carries four flags: allow_dup_from_records, allow_dup_to_records,
    check_fraction_sum_eq1_from_id and check_fraction_sum_eq1_to_id. The members
    cover every one/many cardinality for each of three sum rules: grouped by
    from_id, grouped by to_id, and no sum check.
    """

    # --- from_fraction must sum to 1 when grouped by from_id ---
    ONE_TO_ONE_MAP_FRACTION_SUM_FROM_ID_EQ1 = EnumValue(
        value="one_to_one_map_fraction_sum_from_id_eq1",
        description="One-to-one dimension mapping with sum of from_fraction = 1 when grouped by from_id",
        allow_dup_from_records=False,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=True,
        check_fraction_sum_eq1_to_id=False,
    )
    ONE_TO_MANY_MAP_FRACTION_SUM_FROM_ID_EQ1 = EnumValue(
        value="one_to_many_map_fraction_sum_from_id_eq1",
        description="One-to-many dimension mapping with sum of from_fraction = 1 when grouped by from_id",
        allow_dup_from_records=True,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=True,
        check_fraction_sum_eq1_to_id=False,
    )
    MANY_TO_ONE_MAP_FRACTION_SUM_FROM_ID_EQ1 = EnumValue(
        value="many_to_one_map_fraction_sum_from_id_eq1",
        description="Many-to-one dimension mapping with sum of from_fraction = 1 when grouped by from_id",
        allow_dup_from_records=False,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=True,
        check_fraction_sum_eq1_to_id=False,
    )
    MANY_TO_MANY_MAP_FRACTION_SUM_FROM_ID_EQ1 = EnumValue(
        value="many_to_many_map_fraction_sum_from_id_eq1",
        description="Many-to-many dimension mapping with sum of from_fraction = 1 when grouped by from_id",
        allow_dup_from_records=True,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=True,
        check_fraction_sum_eq1_to_id=False,
    )

    # --- from_fraction must sum to 1 when grouped by to_id ---
    ONE_TO_ONE_MAP_FRACTION_SUM_TO_ID_EQ1 = EnumValue(
        value="one_to_one_map_fraction_sum_to_id_eq1",
        description="One-to-one dimension mapping with sum of from_fraction = 1 when grouped by to_id",
        allow_dup_from_records=False,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=True,
    )
    ONE_TO_MANY_MAP_FRACTION_SUM_TO_ID_EQ1 = EnumValue(
        value="one_to_many_map_fraction_sum_to_id_eq1",
        description="One-to-many dimension mapping with sum of from_fraction = 1 when grouped by to_id",
        allow_dup_from_records=True,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=True,
    )
    MANY_TO_ONE_MAP_FRACTION_SUM_TO_ID_EQ1 = EnumValue(
        value="many_to_one_map_fraction_sum_to_id_eq1",
        description="Many-to-one dimension mapping with sum of from_fraction = 1 when grouped by to_id",
        allow_dup_from_records=False,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=True,
    )
    MANY_TO_MANY_MAP_FRACTION_SUM_TO_ID_EQ1 = EnumValue(
        value="many_to_many_map_fraction_sum_to_id_eq1",
        description="Many-to-many dimension mapping with sum of from_fraction = 1 when grouped by to_id",
        allow_dup_from_records=True,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=True,
    )

    # --- no from_fraction sum check ---
    ONE_TO_ONE_MAP = EnumValue(
        value="one_to_one_map",
        description="One-to-one dimension mapping with no from_fraction sum check",
        allow_dup_from_records=False,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=False,
    )
    ONE_TO_MANY_MAP = EnumValue(
        value="one_to_many_map",
        description="One-to-many dimension mapping with no from_fraction sum check",
        allow_dup_from_records=True,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=False,
    )
    MANY_TO_ONE_MAP = EnumValue(
        value="many_to_one_map",
        description="Many-to-one dimension mapping with no from_fraction sum check",
        allow_dup_from_records=False,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=False,
    )
    MANY_TO_MANY_MAP = EnumValue(
        value="many_to_many_map",
        description="Many-to-many dimension mapping with no from_fraction sum check",
        allow_dup_from_records=True,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=False,
    )
145
+
146
+
147
class DimensionMappingBaseModel(DSGBaseDatabaseModel):
    """Base class for mapping dimensions"""

    mapping_type: DimensionMappingType = Field(
        title="mapping_type",
        description="Type/purpose of the dimension mapping",
        default="many_to_one_aggregation",
        json_schema_extra={
            "options": DimensionMappingType.format_for_docs(),
        },
    )
    archetype: DimensionMappingArchetype | None = Field(
        default=None,
        title="archetype",
        description="Dimension mapping archetype, determined based on mapping_type",
        json_schema_extra={
            "dsgrid_internal": True,
            "options": DimensionMappingArchetype.format_for_docs(),
        },
    )
    from_dimension: DimensionReferenceModel = Field(
        title="from_dimension",
        description="From dimension",
    )
    to_dimension: DimensionReferenceModel = Field(
        title="to_dimension",
        description="To dimension",
    )
    from_fraction_tolerance: float = Field(
        title="from_fraction_tolerance",
        description="Tolerance to apply when checking from_fraction column sums",
        default=1e-6,
    )
    to_fraction_tolerance: float = Field(
        title="to_fraction_tolerance",
        description="Tolerance to apply when checking to_fraction column sums",
        default=1e-6,
    )
    description: str | None = Field(
        default=None,
        title="description",
        description="Description of dimension mapping",
    )
    mapping_id: str | None = Field(
        default=None,
        title="mapping_id",
        description="Unique dimension mapping identifier, generated by dsgrid",
        json_schema_extra={
            "dsgrid_internal": True,
            "updateable": False,
        },
    )

    @field_validator("archetype")
    @classmethod
    def check_archetype(cls, archetype, info: ValidationInfo):
        """Assign the archetype implied by mapping_type, or verify a supplied one.

        If ``mapping_type`` is not in ``info.data`` the archetype is returned
        unchanged. If ``archetype`` is None, the archetype paired with the
        mapping type is returned.

        Raises
        ------
        DSGInvalidDimensionMapping
            Raised if an explicit archetype does not correspond to mapping_type.

        """
        if "mapping_type" not in info.data:
            return archetype

        archetype_assignment = {
            # optional from_fraction col, FRACTION_SUM_EQ1 when grouped by from_id
            DimensionMappingType.ONE_TO_ONE: DimensionMappingArchetype.ONE_TO_ONE_MAP_FRACTION_SUM_FROM_ID_EQ1,
            DimensionMappingType.MANY_TO_ONE_AGGREGATION: DimensionMappingArchetype.MANY_TO_ONE_MAP_FRACTION_SUM_FROM_ID_EQ1,
            DimensionMappingType.MANY_TO_ONE_REASSIGNMENT: DimensionMappingArchetype.MANY_TO_ONE_MAP_FRACTION_SUM_FROM_ID_EQ1,
            # optional from_fraction col, no FRACTION_SUM check
            DimensionMappingType.DUPLICATION: DimensionMappingArchetype.ONE_TO_MANY_MAP,
            # required from_fraction col, FRACTION_SUM_EQ1 when grouped by from_id
            DimensionMappingType.ONE_TO_MANY_DISAGGREGATION: DimensionMappingArchetype.ONE_TO_MANY_MAP_FRACTION_SUM_FROM_ID_EQ1,
            DimensionMappingType.MANY_TO_MANY_AGGREGATION: DimensionMappingArchetype.MANY_TO_MANY_MAP_FRACTION_SUM_FROM_ID_EQ1,
            DimensionMappingType.MANY_TO_MANY_DISAGGREGATION: DimensionMappingArchetype.MANY_TO_MANY_MAP_FRACTION_SUM_FROM_ID_EQ1,
            # required from_fraction col, FRACTION_SUM_EQ1 when grouped by to_id
            DimensionMappingType.ONE_TO_MANY_ASSIGNMENT: DimensionMappingArchetype.ONE_TO_MANY_MAP_FRACTION_SUM_TO_ID_EQ1,
            DimensionMappingType.MANY_TO_ONE_ASSIGNMENT: DimensionMappingArchetype.MANY_TO_ONE_MAP_FRACTION_SUM_TO_ID_EQ1,
            DimensionMappingType.MANY_TO_MANY_ASSIGNMENT: DimensionMappingArchetype.MANY_TO_MANY_MAP_FRACTION_SUM_TO_ID_EQ1,
            # required from_fraction col, no FRACTION_SUM check
            DimensionMappingType.ONE_TO_ONE_EXPLICIT_MULTIPLIERS: DimensionMappingArchetype.ONE_TO_ONE_MAP,
            DimensionMappingType.ONE_TO_MANY_EXPLICIT_MULTIPLIERS: DimensionMappingArchetype.ONE_TO_MANY_MAP,
            DimensionMappingType.MANY_TO_ONE_EXPLICIT_MULTIPLIERS: DimensionMappingArchetype.MANY_TO_ONE_MAP,
            DimensionMappingType.MANY_TO_MANY_EXPLICIT_MULTIPLIERS: DimensionMappingArchetype.MANY_TO_MANY_MAP,
        }
        # NOTE(review): this validator previously paired MANY_TO_ONE_ASSIGNMENT
        # with the many-to-many archetype. That pairing is still accepted when
        # given explicitly so that configs written under the old behavior keep
        # validating — confirm whether stored configs need migrating.
        legacy_archetype_assignment = {
            DimensionMappingType.MANY_TO_ONE_ASSIGNMENT: DimensionMappingArchetype.MANY_TO_MANY_MAP_FRACTION_SUM_TO_ID_EQ1,
        }

        mapping_type = info.data["mapping_type"]
        assigned_archetype = archetype_assignment[mapping_type]
        if archetype is None:
            return assigned_archetype
        if archetype == assigned_archetype:
            return archetype
        if legacy_archetype_assignment.get(mapping_type) == archetype:
            logger.warning(
                "Accepting legacy archetype %s for mapping_type %s; the current archetype is %s.",
                archetype,
                mapping_type,
                assigned_archetype,
            )
            return archetype

        msg = (
            '"mapping_type" and "archetype" are both defined AND they DO NOT correspond to each other. '
            "archetype can be removed from config so that it can be assigned automatically based on mapping_type. "
            f"Otherwise, {mapping_type=} should have archetype={assigned_archetype} "
        )
        raise DSGInvalidDimensionMapping(msg)
240
+
241
+
242
class DimensionMappingPreRegisteredBaseModel(DSGBaseModel):
    """Base class for mapping soon-to-be registered dimensions. As soon as the dimensions
    are registered this will be converted to a DimensionMappingBaseModel and then registered.

    Compared with DimensionMappingBaseModel, this model has no from_dimension,
    to_dimension or mapping_id fields and adds project_base_dimension_name.
    """

    mapping_type: DimensionMappingType = Field(
        title="mapping_type",
        description="Type/purpose of the dimension mapping",
        default="many_to_one_aggregation",
        json_schema_extra={
            "options": DimensionMappingType.format_for_docs(),
        },
    )
    # NOTE(review): no archetype validator is defined on this class, so the
    # archetype is presumably resolved after conversion to
    # DimensionMappingBaseModel — confirm.
    archetype: DimensionMappingArchetype | None = Field(
        default=None,
        title="archetype",
        description="Dimension mapping archetype, determined based on mapping_type",
        json_schema_extra={
            "dsgrid_internal": True,
            "options": DimensionMappingArchetype.format_for_docs(),
        },
    )
    description: str | None = Field(
        default=None,
        title="description",
        description="Description of dimension mapping",
    )
    from_fraction_tolerance: float = Field(
        title="from_fraction_tolerance",
        description="Tolerance value to apply to the from_fraction column",
        default=1e-6,
    )
    to_fraction_tolerance: float = Field(
        title="to_fraction_tolerance",
        description="Tolerance value to apply to the to_fraction column",
        default=1e-6,
    )
    project_base_dimension_name: str | None = Field(
        default=None,
        description="Name of the base dimension for which the mapping is being registered. "
        "This is required in cases where the project has multiple base dimensions of the same "
        "type. If None, there must only be one base dimension of this type in the project.",
    )
285
+
286
+
287
class DimensionMappingDatasetToProjectBaseModel(DimensionMappingPreRegisteredBaseModel):
    """Base class for mapping soon-to-be registered dimensions for a dataset. Used when
    automatically registering mappings while submitting a dataset to a project.

    Adds a required dimension_type field to the pre-registered mapping fields.
    """

    dimension_type: DimensionType = Field(
        title="dimension_type",
        description="Dimension types that will be mapped",
    )
296
+
297
+
298
class DimensionMappingReferenceModel(DSGBaseModel):
    """Reference to a dimension mapping stored in the registry.

    The DimensionMappingReferenceModel is utilized by the project configuration (project.json5) as well as by the
    dimension mapping reference configuration (dimension_mapping_references.json5) that may be required when submitting a dataset to a project.
    """

    from_dimension_type: DimensionType = Field(
        title="from_dimension_type",
        description="Dimension Type",
        json_schema_extra={
            "options": DimensionType.format_for_docs(),
        },
    )
    to_dimension_type: DimensionType = Field(
        title="to_dimension_type",
        description="Dimension Type",
        json_schema_extra={
            "options": DimensionType.format_for_docs(),
        },
    )
    mapping_id: str = Field(
        title="mapping_id",
        description="Unique ID of the dimension mapping",
        json_schema_extra={
            "updateable": False,
        },
    )
    version: str = Field(
        title="version",
        description="Version of the dimension",
    )

    # This function can be deleted once all dataset repositories have been updated.
    @model_validator(mode="before")
    @classmethod
    def handle_legacy_fields(cls, values):
        """Drop the deprecated ``required_for_validation`` field before validation.

        A "before" validator receives the raw input, which is not guaranteed
        to be a dict, so anything else is passed through untouched. The
        caller's dict is never mutated; a filtered copy is returned instead.
        """
        if isinstance(values, dict) and "required_for_validation" in values:
            logger.warning(
                "Removing deprecated required_for_validation field from a dimension mapping reference."
            )
            values = {
                key: value for key, value in values.items() if key != "required_for_validation"
            }

        return values
342
+
343
+
344
class DimensionMappingReferenceListModel(DSGBaseModel):
    """List of dimension mapping references used by the dimensions_mappings.json5 config"""

    # Each entry is validated as a DimensionMappingReferenceModel.
    references: list[DimensionMappingReferenceModel] = Field(
        title="references",
        description="List of dimension mapping references",
    )