dsgrid-toolkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dsgrid-toolkit might be problematic. Click here for more details.

Files changed (152)
  1. dsgrid/__init__.py +22 -0
  2. dsgrid/api/__init__.py +0 -0
  3. dsgrid/api/api_manager.py +179 -0
  4. dsgrid/api/app.py +420 -0
  5. dsgrid/api/models.py +60 -0
  6. dsgrid/api/response_models.py +116 -0
  7. dsgrid/apps/__init__.py +0 -0
  8. dsgrid/apps/project_viewer/app.py +216 -0
  9. dsgrid/apps/registration_gui.py +444 -0
  10. dsgrid/chronify.py +22 -0
  11. dsgrid/cli/__init__.py +0 -0
  12. dsgrid/cli/common.py +120 -0
  13. dsgrid/cli/config.py +177 -0
  14. dsgrid/cli/download.py +13 -0
  15. dsgrid/cli/dsgrid.py +142 -0
  16. dsgrid/cli/dsgrid_admin.py +349 -0
  17. dsgrid/cli/install_notebooks.py +62 -0
  18. dsgrid/cli/query.py +711 -0
  19. dsgrid/cli/registry.py +1773 -0
  20. dsgrid/cloud/__init__.py +0 -0
  21. dsgrid/cloud/cloud_storage_interface.py +140 -0
  22. dsgrid/cloud/factory.py +31 -0
  23. dsgrid/cloud/fake_storage_interface.py +37 -0
  24. dsgrid/cloud/s3_storage_interface.py +156 -0
  25. dsgrid/common.py +35 -0
  26. dsgrid/config/__init__.py +0 -0
  27. dsgrid/config/annual_time_dimension_config.py +187 -0
  28. dsgrid/config/common.py +131 -0
  29. dsgrid/config/config_base.py +148 -0
  30. dsgrid/config/dataset_config.py +684 -0
  31. dsgrid/config/dataset_schema_handler_factory.py +41 -0
  32. dsgrid/config/date_time_dimension_config.py +108 -0
  33. dsgrid/config/dimension_config.py +54 -0
  34. dsgrid/config/dimension_config_factory.py +65 -0
  35. dsgrid/config/dimension_mapping_base.py +349 -0
  36. dsgrid/config/dimension_mappings_config.py +48 -0
  37. dsgrid/config/dimensions.py +775 -0
  38. dsgrid/config/dimensions_config.py +71 -0
  39. dsgrid/config/index_time_dimension_config.py +76 -0
  40. dsgrid/config/input_dataset_requirements.py +31 -0
  41. dsgrid/config/mapping_tables.py +209 -0
  42. dsgrid/config/noop_time_dimension_config.py +42 -0
  43. dsgrid/config/project_config.py +1457 -0
  44. dsgrid/config/registration_models.py +199 -0
  45. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  46. dsgrid/config/simple_models.py +49 -0
  47. dsgrid/config/supplemental_dimension.py +29 -0
  48. dsgrid/config/time_dimension_base_config.py +200 -0
  49. dsgrid/data_models.py +155 -0
  50. dsgrid/dataset/__init__.py +0 -0
  51. dsgrid/dataset/dataset.py +123 -0
  52. dsgrid/dataset/dataset_expression_handler.py +86 -0
  53. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  54. dsgrid/dataset/dataset_schema_handler_base.py +899 -0
  55. dsgrid/dataset/dataset_schema_handler_one_table.py +196 -0
  56. dsgrid/dataset/dataset_schema_handler_standard.py +303 -0
  57. dsgrid/dataset/growth_rates.py +162 -0
  58. dsgrid/dataset/models.py +44 -0
  59. dsgrid/dataset/table_format_handler_base.py +257 -0
  60. dsgrid/dataset/table_format_handler_factory.py +17 -0
  61. dsgrid/dataset/unpivoted_table.py +121 -0
  62. dsgrid/dimension/__init__.py +0 -0
  63. dsgrid/dimension/base_models.py +218 -0
  64. dsgrid/dimension/dimension_filters.py +308 -0
  65. dsgrid/dimension/standard.py +213 -0
  66. dsgrid/dimension/time.py +531 -0
  67. dsgrid/dimension/time_utils.py +88 -0
  68. dsgrid/dsgrid_rc.py +88 -0
  69. dsgrid/exceptions.py +105 -0
  70. dsgrid/filesystem/__init__.py +0 -0
  71. dsgrid/filesystem/cloud_filesystem.py +32 -0
  72. dsgrid/filesystem/factory.py +32 -0
  73. dsgrid/filesystem/filesystem_interface.py +136 -0
  74. dsgrid/filesystem/local_filesystem.py +74 -0
  75. dsgrid/filesystem/s3_filesystem.py +118 -0
  76. dsgrid/loggers.py +132 -0
  77. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +950 -0
  78. dsgrid/notebooks/registration.ipynb +48 -0
  79. dsgrid/notebooks/start_notebook.sh +11 -0
  80. dsgrid/project.py +451 -0
  81. dsgrid/query/__init__.py +0 -0
  82. dsgrid/query/dataset_mapping_plan.py +142 -0
  83. dsgrid/query/derived_dataset.py +384 -0
  84. dsgrid/query/models.py +726 -0
  85. dsgrid/query/query_context.py +287 -0
  86. dsgrid/query/query_submitter.py +847 -0
  87. dsgrid/query/report_factory.py +19 -0
  88. dsgrid/query/report_peak_load.py +70 -0
  89. dsgrid/query/reports_base.py +20 -0
  90. dsgrid/registry/__init__.py +0 -0
  91. dsgrid/registry/bulk_register.py +161 -0
  92. dsgrid/registry/common.py +287 -0
  93. dsgrid/registry/config_update_checker_base.py +63 -0
  94. dsgrid/registry/data_store_factory.py +34 -0
  95. dsgrid/registry/data_store_interface.py +69 -0
  96. dsgrid/registry/dataset_config_generator.py +156 -0
  97. dsgrid/registry/dataset_registry_manager.py +734 -0
  98. dsgrid/registry/dataset_update_checker.py +16 -0
  99. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  100. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  101. dsgrid/registry/dimension_registry_manager.py +413 -0
  102. dsgrid/registry/dimension_update_checker.py +16 -0
  103. dsgrid/registry/duckdb_data_store.py +185 -0
  104. dsgrid/registry/filesystem_data_store.py +141 -0
  105. dsgrid/registry/filter_registry_manager.py +123 -0
  106. dsgrid/registry/project_config_generator.py +57 -0
  107. dsgrid/registry/project_registry_manager.py +1616 -0
  108. dsgrid/registry/project_update_checker.py +48 -0
  109. dsgrid/registry/registration_context.py +223 -0
  110. dsgrid/registry/registry_auto_updater.py +316 -0
  111. dsgrid/registry/registry_database.py +662 -0
  112. dsgrid/registry/registry_interface.py +446 -0
  113. dsgrid/registry/registry_manager.py +544 -0
  114. dsgrid/registry/registry_manager_base.py +367 -0
  115. dsgrid/registry/versioning.py +92 -0
  116. dsgrid/spark/__init__.py +0 -0
  117. dsgrid/spark/functions.py +545 -0
  118. dsgrid/spark/types.py +50 -0
  119. dsgrid/tests/__init__.py +0 -0
  120. dsgrid/tests/common.py +139 -0
  121. dsgrid/tests/make_us_data_registry.py +204 -0
  122. dsgrid/tests/register_derived_datasets.py +103 -0
  123. dsgrid/tests/utils.py +25 -0
  124. dsgrid/time/__init__.py +0 -0
  125. dsgrid/time/time_conversions.py +80 -0
  126. dsgrid/time/types.py +67 -0
  127. dsgrid/units/__init__.py +0 -0
  128. dsgrid/units/constants.py +113 -0
  129. dsgrid/units/convert.py +71 -0
  130. dsgrid/units/energy.py +145 -0
  131. dsgrid/units/power.py +87 -0
  132. dsgrid/utils/__init__.py +0 -0
  133. dsgrid/utils/dataset.py +612 -0
  134. dsgrid/utils/files.py +179 -0
  135. dsgrid/utils/filters.py +125 -0
  136. dsgrid/utils/id_remappings.py +100 -0
  137. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  138. dsgrid/utils/py_expression_eval/README.md +8 -0
  139. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  140. dsgrid/utils/py_expression_eval/tests.py +283 -0
  141. dsgrid/utils/run_command.py +70 -0
  142. dsgrid/utils/scratch_dir_context.py +64 -0
  143. dsgrid/utils/spark.py +918 -0
  144. dsgrid/utils/spark_partition.py +98 -0
  145. dsgrid/utils/timing.py +239 -0
  146. dsgrid/utils/utilities.py +184 -0
  147. dsgrid/utils/versioning.py +36 -0
  148. dsgrid_toolkit-0.2.0.dist-info/METADATA +216 -0
  149. dsgrid_toolkit-0.2.0.dist-info/RECORD +152 -0
  150. dsgrid_toolkit-0.2.0.dist-info/WHEEL +4 -0
  151. dsgrid_toolkit-0.2.0.dist-info/entry_points.txt +4 -0
  152. dsgrid_toolkit-0.2.0.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,41 @@
1
+ from sqlalchemy import Connection
2
+
3
+ from dsgrid.config.dataset_config import DataSchemaType, DatasetConfig
4
+ from dsgrid.dataset.dataset_schema_handler_standard import StandardDatasetSchemaHandler
5
+ from dsgrid.dataset.dataset_schema_handler_one_table import OneTableDatasetSchemaHandler
6
+ from dsgrid.registry.data_store_interface import DataStoreInterface
7
+ from dsgrid.registry.dimension_registry_manager import DimensionRegistryManager
8
+ from dsgrid.registry.dimension_mapping_registry_manager import DimensionMappingRegistryManager
9
+ from dsgrid.config.dimension_mapping_base import DimensionMappingReferenceModel
10
+
11
+
12
def make_dataset_schema_handler(
    conn: Connection | None,
    config: DatasetConfig,
    dimension_mgr: DimensionRegistryManager,
    dimension_mapping_mgr: DimensionMappingRegistryManager,
    store: DataStoreInterface | None = None,
    mapping_references: list[DimensionMappingReferenceModel] | None = None,
):
    """Load the dataset schema handler that matches the dataset's data schema type.

    Parameters
    ----------
    conn : Connection | None
        Forwarded to the handler's ``load`` method.
    config : DatasetConfig
        Dataset config; its data schema type selects the handler class.
    dimension_mgr : DimensionRegistryManager
    dimension_mapping_mgr : DimensionMappingRegistryManager
    store : DataStoreInterface | None
        Forwarded to the handler's ``load`` method.
    mapping_references : list[DimensionMappingReferenceModel] | None
        Forwarded to the handler's ``load`` method.

    Returns
    -------
    Whatever ``StandardDatasetSchemaHandler.load`` or
    ``OneTableDatasetSchemaHandler.load`` returns.

    Raises
    ------
    NotImplementedError
        Raised if the schema type is neither STANDARD nor ONE_TABLE.

    """
    schema_type = config.get_data_schema_type()
    if schema_type == DataSchemaType.STANDARD:
        handler_cls = StandardDatasetSchemaHandler
    elif schema_type == DataSchemaType.ONE_TABLE:
        handler_cls = OneTableDatasetSchemaHandler
    else:
        msg = f"{config.model.data_schema.data_schema_type=}"
        raise NotImplementedError(msg)

    # Both handler classes are loaded with the same argument list.
    return handler_cls.load(
        config,
        conn,
        dimension_mgr,
        dimension_mapping_mgr,
        store=store,
        mapping_references=mapping_references,
    )
@@ -0,0 +1,108 @@
1
+ import logging
2
+ from datetime import datetime, timedelta, tzinfo
3
+ from zoneinfo import ZoneInfo
4
+
5
+ import pandas as pd
6
+
7
+ import chronify
8
+
9
+ from dsgrid.dimension.time import TimeZone
10
+ from dsgrid.spark.types import DataFrame, F
11
+ from dsgrid.time.types import DatetimeTimestampType
12
+ from dsgrid.dimension.time import DatetimeFormat, TimeIntervalType
13
+ from .dimensions import DateTimeDimensionModel
14
+ from .time_dimension_base_config import TimeDimensionBaseConfig
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class DateTimeDimensionConfig(TimeDimensionBaseConfig):
    """Provides an interface to a DateTimeDimensionModel."""

    @staticmethod
    def model_class() -> type[DateTimeDimensionModel]:
        """Return the model class (the class itself, not an instance)."""
        return DateTimeDimensionModel

    def supports_chronify(self) -> bool:
        """Return True: this config can be converted with :meth:`to_chronify`."""
        return True

    def to_chronify(self) -> chronify.DatetimeRange:
        """Build a chronify.DatetimeRange from the model's single time range.

        Exactly one range and exactly one load-data time column are required;
        both conditions are asserted.
        """
        time_cols = self.get_load_data_time_columns()
        assert len(self._model.ranges) == 1
        assert len(time_cols) == 1
        # TODO: issue #341: this is actually tied to the weather_year problem #340
        # If there are no ranges, all of this must be dynamic.
        # The two issues should be solved together.
        return chronify.DatetimeRange(
            time_column=time_cols[0],
            start=pd.Timestamp(self.get_start_times()[0]),
            length=self.get_lengths()[0],
            resolution=self._model.frequency,
            measurement_type=self._model.measurement_type,
            interval_type=self._model.time_interval_type,
        )

    def get_frequency(self) -> timedelta:
        """Return the model's frequency."""
        return self.model.frequency

    def get_start_times(self) -> list[datetime]:
        """Return the start of each range, with this config's tzinfo attached.

        Each range's ``start`` string is parsed with the model's ``str_format``
        and must parse to a naive datetime. The tzinfo from :meth:`get_tzinfo`
        (possibly None) is then set on it without shifting the clock time.
        """
        tz = self.get_tzinfo()
        start_times = []
        for trange in self.model.ranges:
            start = datetime.strptime(trange.start, self.model.str_format)
            assert start.tzinfo is None
            start_times.append(start.replace(tzinfo=tz))
        return start_times

    def get_lengths(self) -> list[int]:
        """Return the number of timestamps in each range, both endpoints included.

        Start and end are converted to UTC before differencing, so the count is
        based on elapsed time rather than on wall-clock time.
        """
        tz = self.get_tzinfo()
        utc = ZoneInfo("UTC")
        # The frequency does not depend on the range; look it up once.
        freq = self.get_frequency()
        lengths = []
        for trange in self.model.ranges:
            start = datetime.strptime(trange.start, self.model.str_format)
            end = datetime.strptime(trange.end, self.model.str_format)
            assert start.tzinfo is None
            assert end.tzinfo is None
            start_utc = start.replace(tzinfo=tz).astimezone(tz=utc)
            end_utc = end.replace(tzinfo=tz).astimezone(tz=utc)
            # +1 because the end timestamp is part of the range.
            length = (end_utc - start_utc) / freq + 1
            assert length % 1 == 0, f"{length=} is not a whole number"
            lengths.append(int(length))
        return lengths

    def get_load_data_time_columns(self) -> list[str]:
        """Return the field names of DatetimeTimestampType as a list."""
        return list(DatetimeTimestampType._fields)

    def get_time_zone(self) -> TimeZone | None:
        """Return the time zone implied by the model's datetime format.

        Returns the configured time zone for the ALIGNED format and None for
        LOCAL_AS_STRINGS.

        Raises
        ------
        NotImplementedError
            Raised for any other format type.

        """
        if self.model.datetime_format.format_type == DatetimeFormat.ALIGNED:
            return self.model.datetime_format.timezone
        if self.model.datetime_format.format_type in [DatetimeFormat.LOCAL_AS_STRINGS]:
            return None
        msg = f"Undefined time zone for {self.model.datetime_format.format_type=}"
        raise NotImplementedError(msg)

    def get_tzinfo(self) -> tzinfo | None:
        """Return the ``tz`` attribute of the time zone, or None if there is no time zone."""
        time_zone = self.get_time_zone()
        if time_zone is None:
            return None
        return time_zone.tz

    def get_time_interval_type(self) -> TimeIntervalType:
        """Return the model's time interval type."""
        return self.model.time_interval_type

    def convert_time_format(self, df: DataFrame, update_model: bool = False) -> DataFrame:
        """Convert a string time column to timestamps for LOCAL_AS_STRINGS data.

        For any other format type, ``df`` is returned unchanged. Otherwise the
        single time column is parsed with the model's ``data_str_format``.

        Raises
        ------
        NotImplementedError
            Raised if a conversion is needed and ``update_model`` is True.

        """
        if self.model.datetime_format.format_type != DatetimeFormat.LOCAL_AS_STRINGS:
            return df
        time_cols = self.get_load_data_time_columns()
        assert len(time_cols) == 1, time_cols
        time_col = time_cols[0]
        df = df.withColumn(
            time_col,
            F.to_timestamp(time_col, self.model.datetime_format.data_str_format),
        )
        if update_model:
            # TODO: The code doesn't support DatetimeFormat.LOCAL.
            # self.model.datetime_format.format_type = DatetimeFormat.LOCAL
            msg = "convert_time_format DatetimeFormat.LOCAL_AS_STRINGS update_model=True"
            raise NotImplementedError(msg)
        return df
@@ -0,0 +1,54 @@
1
+ import abc
2
+ import logging
3
+ from typing import Union
4
+
5
+ from .config_base import ConfigBase, ConfigWithRecordFileBase
6
+ from .dimensions import DimensionModel
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class DimensionBaseConfigWithFiles(ConfigWithRecordFileBase, abc.ABC):
    """Base class for dimension configs that are built on ConfigWithRecordFileBase."""

    @staticmethod
    def config_filename():
        """Return the file name used for a dimension config."""
        return "dimension.json5"

    @property
    def config_id(self):
        """The model's ``dimension_id``."""
        return self.model.dimension_id

    def get_unique_ids(self) -> set[str]:
        """Collect the ``id`` of every record in the model.

        Returns
        -------
        set
            set of str

        """
        unique_ids = set()
        for record in self.model.records:
            unique_ids.add(record.id)
        return unique_ids
32
+
33
+
34
class DimensionBaseConfigWithoutFiles(ConfigBase, abc.ABC):
    """Base class for dimension configs that are built directly on ConfigBase."""

    @staticmethod
    def config_filename():
        """Return the file name used for a dimension config."""
        return "dimension.json5"

    @property
    def config_id(self):
        """The model's ``dimension_id``."""
        return self.model.dimension_id
44
+
45
+
46
class DimensionConfig(DimensionBaseConfigWithFiles):
    """Config wrapper for a DimensionModel."""

    @staticmethod
    def model_class():
        """Return the model class for this config: DimensionModel."""
        return DimensionModel
52
+
53
+
54
+ DimensionBaseConfig = Union[DimensionBaseConfigWithFiles, DimensionBaseConfigWithoutFiles]  # Type alias: either dimension config base class, with or without record files.
@@ -0,0 +1,65 @@
1
+ from dsgrid.dimension.time import TimeDimensionType
2
+ from dsgrid.utils.files import load_data
3
+ from .date_time_dimension_config import DateTimeDimensionConfig
4
+ from .annual_time_dimension_config import AnnualTimeDimensionConfig
5
+ from .noop_time_dimension_config import NoOpTimeDimensionConfig
6
+ from .index_time_dimension_config import IndexTimeDimensionConfig
7
+ from .dimension_config import DimensionConfig
8
+ from .representative_period_time_dimension_config import RepresentativePeriodTimeDimensionConfig
9
+ from .dimensions import (
10
+ DateTimeDimensionModel,
11
+ DimensionModel,
12
+ DimensionType,
13
+ AnnualTimeDimensionModel,
14
+ RepresentativePeriodTimeDimensionModel,
15
+ NoOpTimeDimensionModel,
16
+ IndexTimeDimensionModel,
17
+ )
18
+
19
+
20
def get_dimension_config(model):
    """Wrap a dimension model in the config class that corresponds to its type.

    Parameters
    ----------
    model
        An instance of one of the dimension model classes listed below.

    Returns
    -------
    An instance of the matching config class, constructed from ``model``.

    Raises
    ------
    AssertionError
        Raised if ``model`` is not an instance of any supported model class.
        The exception argument is ``type(model)``.

    """
    # Checked in this order; the first matching model class wins.
    config_class_by_model_class = (
        (DateTimeDimensionModel, DateTimeDimensionConfig),
        (AnnualTimeDimensionModel, AnnualTimeDimensionConfig),
        (RepresentativePeriodTimeDimensionModel, RepresentativePeriodTimeDimensionConfig),
        (DimensionModel, DimensionConfig),
        (NoOpTimeDimensionModel, NoOpTimeDimensionConfig),
        (IndexTimeDimensionModel, IndexTimeDimensionConfig),
    )
    for model_cls, config_cls in config_class_by_model_class:
        if isinstance(model, model_cls):
            return config_cls(model)

    # Raise explicitly instead of `assert False`: assert statements are removed
    # under `python -O`, which would let an unsupported model fall through and
    # return None.
    raise AssertionError(type(model))
35
+
36
+
37
def load_dimension_config(filename):
    """Load a dimension config file whose concrete config type is not yet known.

    The file is read once to inspect its "type" (and, for time dimensions, its
    "time_type"), then loaded again through the matching config class.

    Parameters
    ----------
    filename : Path

    Returns
    -------
    DimensionBaseConfig

    Raises
    ------
    ValueError
        Raised if the file describes a time dimension with an unsupported time_type.

    """
    data = load_data(filename)
    is_time_dimension = data["type"] == DimensionType.TIME.value
    if not is_time_dimension:
        return DimensionConfig.load(filename)

    time_config_classes = (
        (TimeDimensionType.DATETIME, DateTimeDimensionConfig),
        (TimeDimensionType.ANNUAL, AnnualTimeDimensionConfig),
        (TimeDimensionType.REPRESENTATIVE_PERIOD, RepresentativePeriodTimeDimensionConfig),
        (TimeDimensionType.NOOP, NoOpTimeDimensionConfig),
        (TimeDimensionType.INDEX, IndexTimeDimensionConfig),
    )
    for time_type, config_cls in time_config_classes:
        if data["time_type"] == time_type.value:
            return config_cls.load(filename)

    msg = f"time_type={data['time_type']} not supported"
    raise ValueError(msg)
@@ -0,0 +1,349 @@
1
+ import logging
2
+
3
+
4
+ from pydantic import Field, ValidationInfo, field_validator
5
+
6
+ from dsgrid.data_models import DSGBaseDatabaseModel, DSGBaseModel, DSGEnum, EnumValue
7
+ from dsgrid.dimension.base_models import DimensionType
8
+ from dsgrid.exceptions import DSGInvalidDimensionMapping
9
+ from .dimensions import DimensionReferenceModel
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class DimensionMappingType(DSGEnum):
    """Defines the operation dsgrid will apply to the data during a mapping."""

    # Group 1: from_fraction column is optional; fractions must sum to 1 per from_id.
    ONE_TO_ONE = "one_to_one"  # also covers renames and down-selection
    MANY_TO_ONE_AGGREGATION = "many_to_one_aggregation"
    MANY_TO_ONE_REASSIGNMENT = "many_to_one_reassignment"

    # Group 2: from_fraction column is optional; fraction sums are not checked.
    DUPLICATION = "duplication"

    # Group 3: from_fraction column is required; fractions must sum to 1 per from_id.
    ONE_TO_MANY_DISAGGREGATION = "one_to_many_disaggregation"
    MANY_TO_MANY_AGGREGATION = "many_to_many_aggregation"
    MANY_TO_MANY_DISAGGREGATION = "many_to_many_disaggregation"

    # Group 4: from_fraction column is required; fractions must sum to 1 per to_id.
    MANY_TO_ONE_ASSIGNMENT = "many_to_one_assignment"
    ONE_TO_MANY_ASSIGNMENT = "one_to_many_assignment"
    MANY_TO_MANY_ASSIGNMENT = "many_to_many_assignment"

    # Group 5: from_fraction column is required; fraction sums are not checked.
    ONE_TO_ONE_EXPLICIT_MULTIPLIERS = "one_to_one_explicit_multipliers"
    ONE_TO_MANY_EXPLICIT_MULTIPLIERS = "one_to_many_explicit_multipliers"
    MANY_TO_ONE_EXPLICIT_MULTIPLIERS = "many_to_one_explicit_multipliers"
    MANY_TO_MANY_EXPLICIT_MULTIPLIERS = "many_to_many_explicit_multipliers"
40
+
41
+
42
class DimensionMappingArchetype(DSGEnum):
    """Dimension mapping archetype.

    Each member records whether duplicate from/to records are allowed and which
    sum rule, if any, applies to the from_fraction column.
    """

    # --- from_fraction must sum to 1 when grouped by from_id ---
    ONE_TO_ONE_MAP_FRACTION_SUM_FROM_ID_EQ1 = EnumValue(
        value="one_to_one_map_fraction_sum_from_id_eq1",
        description="One-to-one dimension mapping with sum of from_fraction = 1 when grouped by from_id",
        allow_dup_from_records=False,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=True,
        check_fraction_sum_eq1_to_id=False,
    )
    ONE_TO_MANY_MAP_FRACTION_SUM_FROM_ID_EQ1 = EnumValue(
        value="one_to_many_map_fraction_sum_from_id_eq1",
        description="One-to-many dimension mapping with sum of from_fraction = 1 when grouped by from_id",
        allow_dup_from_records=True,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=True,
        check_fraction_sum_eq1_to_id=False,
    )
    MANY_TO_ONE_MAP_FRACTION_SUM_FROM_ID_EQ1 = EnumValue(
        value="many_to_one_map_fraction_sum_from_id_eq1",
        description="Many-to-one dimension mapping with sum of from_fraction = 1 when grouped by from_id",
        allow_dup_from_records=False,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=True,
        check_fraction_sum_eq1_to_id=False,
    )
    MANY_TO_MANY_MAP_FRACTION_SUM_FROM_ID_EQ1 = EnumValue(
        value="many_to_many_map_fraction_sum_from_id_eq1",
        description="Many-to-many dimension mapping with sum of from_fraction = 1 when grouped by from_id",
        allow_dup_from_records=True,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=True,
        check_fraction_sum_eq1_to_id=False,
    )

    # --- from_fraction must sum to 1 when grouped by to_id ---
    ONE_TO_ONE_MAP_FRACTION_SUM_TO_ID_EQ1 = EnumValue(
        value="one_to_one_map_fraction_sum_to_id_eq1",
        description="One-to-one dimension mapping with sum of from_fraction = 1 when grouped by to_id",
        allow_dup_from_records=False,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=True,
    )
    ONE_TO_MANY_MAP_FRACTION_SUM_TO_ID_EQ1 = EnumValue(
        value="one_to_many_map_fraction_sum_to_id_eq1",
        description="One-to-many dimension mapping with sum of from_fraction = 1 when grouped by to_id",
        allow_dup_from_records=True,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=True,
    )
    MANY_TO_ONE_MAP_FRACTION_SUM_TO_ID_EQ1 = EnumValue(
        value="many_to_one_map_fraction_sum_to_id_eq1",
        description="Many-to-one dimension mapping with sum of from_fraction = 1 when grouped by to_id",
        allow_dup_from_records=False,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=True,
    )
    MANY_TO_MANY_MAP_FRACTION_SUM_TO_ID_EQ1 = EnumValue(
        value="many_to_many_map_fraction_sum_to_id_eq1",
        description="Many-to-many dimension mapping with sum of from_fraction = 1 when grouped by to_id",
        allow_dup_from_records=True,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=True,
    )

    # --- no from_fraction sum check ---
    ONE_TO_ONE_MAP = EnumValue(
        value="one_to_one_map",
        description="One-to-one dimension mapping with no from_fraction sum check",
        allow_dup_from_records=False,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=False,
    )
    ONE_TO_MANY_MAP = EnumValue(
        value="one_to_many_map",
        description="One-to-many dimension mapping with no from_fraction sum check",
        allow_dup_from_records=True,
        allow_dup_to_records=False,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=False,
    )
    MANY_TO_ONE_MAP = EnumValue(
        value="many_to_one_map",
        description="Many-to-one dimension mapping with no from_fraction sum check",
        allow_dup_from_records=False,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=False,
    )
    MANY_TO_MANY_MAP = EnumValue(
        value="many_to_many_map",
        description="Many-to-many dimension mapping with no from_fraction sum check",
        allow_dup_from_records=True,
        allow_dup_to_records=True,
        check_fraction_sum_eq1_from_id=False,
        check_fraction_sum_eq1_to_id=False,
    )
145
+
146
+
147
class DimensionMappingBaseModel(DSGBaseDatabaseModel):
    """Base class for mapping dimensions"""

    mapping_type: DimensionMappingType = Field(
        title="mapping_type",
        description="Type/purpose of the dimension mapping",
        default="many_to_one_aggregation",
        json_schema_extra={
            "options": DimensionMappingType.format_for_docs(),
        },
    )
    archetype: DimensionMappingArchetype | None = Field(
        default=None,
        title="archetype",
        description="Dimension mapping archetype, determined based on mapping_type",
        json_schema_extra={
            "dsgrid_internal": True,
            "options": DimensionMappingArchetype.format_for_docs(),
        },
    )
    from_dimension: DimensionReferenceModel = Field(
        title="from_dimension",
        description="From dimension",
    )
    to_dimension: DimensionReferenceModel = Field(
        title="to_dimension",
        description="To dimension",
    )
    from_fraction_tolerance: float = Field(
        title="from_fraction_tolerance",
        description="Tolerance to apply when checking from_fraction column sums",
        default=1e-6,
    )
    to_fraction_tolerance: float = Field(
        title="to_fraction_tolerance",
        description="Tolerance to apply when checking to_fraction column sums",
        default=1e-6,
    )
    description: str = Field(
        title="description",
        description="Description of dimension mapping",
    )
    mapping_id: str | None = Field(
        default=None,
        title="mapping_id",
        description="Unique dimension mapping identifier, generated by dsgrid",
        json_schema_extra={
            "dsgrid_internal": True,
            "updateable": False,
        },
    )

    @field_validator("archetype")
    @classmethod
    def check_archetype(cls, archetype, info: ValidationInfo):
        """Derive the archetype from mapping_type, or verify a user-supplied one.

        If ``mapping_type`` is not present in the validated data, ``archetype``
        is returned untouched. If ``archetype`` is None, the archetype assigned
        to the mapping type is returned.

        Raises
        ------
        DSGInvalidDimensionMapping
            Raised if ``archetype`` is set and differs from the archetype
            assigned to ``mapping_type``.

        """
        if "mapping_type" not in info.data:
            return archetype

        archetype_assignment = {
            # optional from_fraction col, FRACTION_SUM_EQ1 when grouped by from_id
            DimensionMappingType.ONE_TO_ONE: DimensionMappingArchetype.ONE_TO_ONE_MAP_FRACTION_SUM_FROM_ID_EQ1,
            DimensionMappingType.MANY_TO_ONE_AGGREGATION: DimensionMappingArchetype.MANY_TO_ONE_MAP_FRACTION_SUM_FROM_ID_EQ1,
            DimensionMappingType.MANY_TO_ONE_REASSIGNMENT: DimensionMappingArchetype.MANY_TO_ONE_MAP_FRACTION_SUM_FROM_ID_EQ1,
            # optional from_fraction col, no FRACTION_SUM check
            DimensionMappingType.DUPLICATION: DimensionMappingArchetype.ONE_TO_MANY_MAP,
            # required from_fraction col, FRACTION_SUM_EQ1 when grouped by from_id
            DimensionMappingType.ONE_TO_MANY_DISAGGREGATION: DimensionMappingArchetype.ONE_TO_MANY_MAP_FRACTION_SUM_FROM_ID_EQ1,
            DimensionMappingType.MANY_TO_MANY_AGGREGATION: DimensionMappingArchetype.MANY_TO_MANY_MAP_FRACTION_SUM_FROM_ID_EQ1,
            DimensionMappingType.MANY_TO_MANY_DISAGGREGATION: DimensionMappingArchetype.MANY_TO_MANY_MAP_FRACTION_SUM_FROM_ID_EQ1,
            # required from_fraction col, FRACTION_SUM_EQ1 when grouped by to_id
            DimensionMappingType.ONE_TO_MANY_ASSIGNMENT: DimensionMappingArchetype.ONE_TO_MANY_MAP_FRACTION_SUM_TO_ID_EQ1,
            # A many-to-one type takes the many-to-one archetype, like the other
            # MANY_TO_ONE_* entries in this table.
            DimensionMappingType.MANY_TO_ONE_ASSIGNMENT: DimensionMappingArchetype.MANY_TO_ONE_MAP_FRACTION_SUM_TO_ID_EQ1,
            DimensionMappingType.MANY_TO_MANY_ASSIGNMENT: DimensionMappingArchetype.MANY_TO_MANY_MAP_FRACTION_SUM_TO_ID_EQ1,
            # required from_fraction col, no FRACTION_SUM check
            DimensionMappingType.ONE_TO_ONE_EXPLICIT_MULTIPLIERS: DimensionMappingArchetype.ONE_TO_ONE_MAP,
            DimensionMappingType.ONE_TO_MANY_EXPLICIT_MULTIPLIERS: DimensionMappingArchetype.ONE_TO_MANY_MAP,
            DimensionMappingType.MANY_TO_ONE_EXPLICIT_MULTIPLIERS: DimensionMappingArchetype.MANY_TO_ONE_MAP,
            DimensionMappingType.MANY_TO_MANY_EXPLICIT_MULTIPLIERS: DimensionMappingArchetype.MANY_TO_MANY_MAP,
        }

        mapping_type = info.data["mapping_type"]
        assigned_archetype = archetype_assignment[mapping_type]
        if archetype is None:
            archetype = assigned_archetype
        elif archetype != assigned_archetype:
            msg = (
                '"mapping_type" and "archetype" are both defined AND they DO NOT correspond to each other. '
                "archetype can be removed from config so that it can be assigned automatically based on mapping_type. "
                f"Otherwise, {mapping_type=} should have archetype={assigned_archetype} "
            )
            raise DSGInvalidDimensionMapping(msg)
        return archetype
239
+
240
+
241
class DimensionMappingPreRegisteredBaseModel(DSGBaseModel):
    """Base class for mappings whose dimensions have not been registered yet.

    Once the dimensions are registered, this is converted to a
    DimensionMappingBaseModel and then registered.
    """

    mapping_type: DimensionMappingType = Field(
        default="many_to_one_aggregation",
        title="mapping_type",
        description="Type/purpose of the dimension mapping",
        json_schema_extra={"options": DimensionMappingType.format_for_docs()},
    )
    archetype: DimensionMappingArchetype | None = Field(
        default=None,
        title="archetype",
        description="Dimension mapping archetype, determined based on mapping_type",
        json_schema_extra={
            "dsgrid_internal": True,
            "options": DimensionMappingArchetype.format_for_docs(),
        },
    )
    description: str = Field(
        title="description",
        description="Description of dimension mapping",
    )
    from_fraction_tolerance: float = Field(
        default=1e-6,
        title="from_fraction_tolerance",
        description="Tolerance value to apply to the from_fraction column",
    )
    to_fraction_tolerance: float = Field(
        default=1e-6,
        title="to_fraction_tolerance",
        description="Tolerance value to apply to the to_fraction column",
    )
    project_base_dimension_name: str | None = Field(
        default=None,
        description="Name of the base dimension for which the mapping is being registered. "
        "This is required in cases where the project has multiple base dimensions of the same "
        "type. If None, there must only be one base dimension of this type in the project.",
    )
283
+
284
+
285
class DimensionMappingDatasetToProjectBaseModel(DimensionMappingPreRegisteredBaseModel):
    """Pre-registration mapping model for a dataset's dimensions.

    Used when mappings are registered automatically while a dataset is being
    submitted to a project.
    """

    dimension_type: DimensionType = Field(
        description="Dimension types that will be mapped",
        title="dimension_type",
    )
294
+
295
+
296
class DimensionMappingReferenceModel(DSGBaseModel):
    """Reference to a dimension mapping stored in the registry.

    The DimensionMappingReferenceModel is utilized by the project configuration (project.json5)
    as well as by the dimension mapping reference configuration
    (dimension_mapping_references.json5) that may be required when submitting a dataset to a
    project.
    """

    from_dimension_type: DimensionType = Field(
        title="from_dimension_type",
        description="Dimension Type",
        json_schema_extra={
            "options": DimensionType.format_for_docs(),
        },
    )
    to_dimension_type: DimensionType = Field(
        title="to_dimension_type",
        description="Dimension Type",
        json_schema_extra={
            "options": DimensionType.format_for_docs(),
        },
    )
    mapping_id: str = Field(
        title="mapping_id",
        description="Unique ID of the dimension mapping",
        json_schema_extra={
            "updateable": False,
        },
    )
    version: str = Field(
        title="version",
        description="Version of the dimension",
        # TODO: add notes about warnings for outdated versions DSGRID-189 & DSGRID-148
    )
    required_for_validation: bool = Field(
        # The title names this field; it is what appears in the generated JSON schema.
        title="required_for_validation",
        description="Set to False if a given dimension association is NOT required for input dataset validation; default is True",
        default=True,
    )

    # TODO: add a field validator for required_for_validation that raises an error
    # for base_to_supplemental mappings.
341
+
342
+
343
class DimensionMappingReferenceListModel(DSGBaseModel):
    """Container for the dimension mapping references in a dimensions_mappings.json5 config"""

    references: list[DimensionMappingReferenceModel] = Field(
        description="List of dimension mapping references",
        title="references",
    )
@@ -0,0 +1,48 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ from pydantic import Field
5
+
6
+ from dsgrid.data_models import DSGBaseModel
7
+ from .mapping_tables import MappingTableModel
8
+ from .config_base import ConfigBase
9
+
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class DimensionMappingsConfigModel(DSGBaseModel):
    """Model holding a list of dimension mapping configurations"""

    # The element type may become a Union if more mapping model subclasses are added.
    mappings: list[MappingTableModel] = Field(
        description="dimension mappings between and within projects and datasets",
        title="mappings",
    )
22
+
23
+
24
class DimensionMappingsConfig(ConfigBase):
    """Config wrapper for a DimensionMappingsConfigModel."""

    def __init__(self, *args, **kwargs):
        # Pure pass-through to ConfigBase.
        super().__init__(*args, **kwargs)

    @staticmethod
    def config_filename():
        """Return the file name used for a dimension mappings config."""
        return "dimension_mappings.json5"

    @property
    def config_id(self):
        """The model's ``dimension_mapping_id``."""
        # NOTE(review): DimensionMappingsConfigModel, as defined in this module,
        # declares only `mappings` — confirm that `dimension_mapping_id` is
        # actually available on the model.
        return self._model.dimension_mapping_id

    @staticmethod
    def model_class():
        """Return the model class for this config: DimensionMappingsConfigModel."""
        return DimensionMappingsConfigModel

    @classmethod
    def load(cls, config_filename: Path, *args, **kwargs):
        """Load the config from ``config_filename`` by delegating to the parent class."""
        return super().load(config_filename, *args, **kwargs)

    @classmethod
    def load_from_model(cls, model):
        """Construct the config directly from an existing model instance."""
        return cls(model)