dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,74 @@
1
+ import abc
2
+ from pathlib import Path
3
+ from typing import Self
4
+
5
+ from dsgrid.spark.types import DataFrame
6
+
7
+
8
class DataStoreInterface(abc.ABC):
    """Abstract contract that every data store implementation must satisfy.

    The only concrete state held here is the base path given to the constructor;
    all reading, writing, and removal of tables is delegated to subclasses.
    """

    def __init__(self, base_path: Path):
        self._base_path = base_path

    @property
    def base_path(self) -> Path:
        """Return the base path that this data store was constructed with."""
        return self._base_path

    # -- Alternate constructors -------------------------------------------------

    @classmethod
    @abc.abstractmethod
    def create(
        cls,
        base_path: Path,
    ) -> Self:
        """Create a new data store and return it."""

    @classmethod
    @abc.abstractmethod
    def load(
        cls,
        base_path: Path,
    ) -> Self:
        """Load a data store that already exists and return it."""

    # -- Data tables ------------------------------------------------------------

    @abc.abstractmethod
    def read_table(
        self,
        dataset_id: str,
        version: str,
    ) -> DataFrame:
        """Read the table for the given dataset ID and version."""

    @abc.abstractmethod
    def replace_table(
        self,
        df: DataFrame,
        dataset_id: str,
        version: str,
    ) -> None:
        """Replace the table for the given dataset ID and version with ``df``."""

    @abc.abstractmethod
    def write_table(
        self,
        df: DataFrame,
        dataset_id: str,
        version: str,
        overwrite: bool = False,
    ) -> None:
        """Write ``df`` as the table for the given dataset ID and version.

        NOTE(review): the effect of ``overwrite`` is defined by each implementation;
        presumably it permits replacing an existing table.
        """

    # -- Lookup tables ----------------------------------------------------------

    @abc.abstractmethod
    def read_lookup_table(
        self,
        dataset_id: str,
        version: str,
    ) -> DataFrame:
        """Read the lookup table for the given dataset ID and version."""

    @abc.abstractmethod
    def replace_lookup_table(
        self,
        df: DataFrame,
        dataset_id: str,
        version: str,
    ) -> None:
        """Replace the lookup table for the given dataset ID and version with ``df``."""

    @abc.abstractmethod
    def write_lookup_table(
        self,
        df: DataFrame,
        dataset_id: str,
        version: str,
        overwrite: bool = False,
    ) -> None:
        """Write ``df`` as the lookup table for the given dataset ID and version.

        NOTE(review): the effect of ``overwrite`` is defined by each implementation;
        presumably it permits replacing an existing lookup table.
        """

    # -- Missing-association tables ---------------------------------------------

    @abc.abstractmethod
    def write_missing_associations_tables(
        self,
        dfs: dict[str, DataFrame],
        dataset_id: str,
        version: str,
        overwrite: bool = False,
    ) -> None:
        """Write a set of tables of missing dimension associations to the data store.

        The keys of ``dfs`` should be human-readable tags for the contents of the
        tables, but are not otherwise significant.
        """

    @abc.abstractmethod
    def read_missing_associations_tables(
        self,
        dataset_id: str,
        version: str,
    ) -> dict[str, DataFrame]:
        """Read the missing dimension association tables from the data store.

        The result maps a tag to the table stored under that tag.
        """

    # -- Removal ----------------------------------------------------------------

    @abc.abstractmethod
    def remove_tables(
        self,
        dataset_id: str,
        version: str,
    ) -> None:
        """Remove the data and lookup tables from the data store."""
@@ -0,0 +1,158 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Iterable
4
+
5
+ from chronify.utils.path_utils import check_overwrite
6
+
7
+ from dsgrid.config.dataset_config import (
8
+ get_unique_dimension_record_ids,
9
+ make_unvalidated_dataset_config,
10
+ )
11
+ from dsgrid.dataset.models import TableFormat
12
+ from dsgrid.config.project_config import ProjectConfig
13
+ from dsgrid.dimension.base_models import DimensionType
14
+ from dsgrid.dimension.time import TimeDimensionType
15
+ from dsgrid.registry.dimension_registry_manager import DimensionRegistryManager
16
+ from dsgrid.registry.registry_manager import RegistryManager
17
+ from dsgrid.utils.files import dump_data
18
+ from dsgrid.config.dimensions import DimensionReferenceModel
19
+ from dsgrid.config.dimension_config import DimensionBaseConfigWithFiles
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def generate_config_from_dataset(
    registry_manager: RegistryManager,
    dataset_id: str,
    dataset_path: Path,
    table_format: TableFormat,
    metric_type: str,
    pivoted_dimension_type: DimensionType | None = None,
    time_type: TimeDimensionType | None = None,
    time_columns: set[str] | None = None,
    output_directory: Path | None = None,
    project_id: str | None = None,
    overwrite: bool = False,
    no_prompts: bool = False,
) -> None:
    """Generate dataset config files from a dataset table.

    Fill out the dimension record files based on the unique values in the dataset.

    Look for matches for dimensions in the registry, checking for project base dimensions
    first. Prompt the user for confirmation unless --no-prompts is set. If --no-prompts is
    set, the first match is automatically accepted.

    Files are written under ``<output_directory>/<dataset_id>``: the dataset config goes
    to ``dataset.json5`` and, for every dimension type without an accepted match, a
    records file goes to ``dimensions/<dimension type value>.csv``.

    Parameters
    ----------
    registry_manager
        Supplies the project manager (used only when ``project_id`` is given) and the
        dimension manager that is searched for matching dimensions.
    dataset_id
        ID of the dataset. Also used as the name of the output subdirectory.
    dataset_path
        Location of the dataset table; forwarded to ``get_unique_dimension_record_ids``.
    table_format
        Forwarded to ``get_unique_dimension_record_ids``.
    metric_type
        Forwarded to ``make_unvalidated_dataset_config``.
    pivoted_dimension_type
        Forwarded to ``get_unique_dimension_record_ids``.
    time_type
        Forwarded to ``make_unvalidated_dataset_config``.
    time_columns
        Names of the time columns in the table. Defaults to ``{"timestamp"}``.
    output_directory
        Parent directory of the generated output. Defaults to the current directory.
        Missing parent directories are created.
    project_id
        If set, base dimensions of this project are checked for matches before the
        rest of the registry.
    overwrite
        Forwarded to chronify's ``check_overwrite`` for the output directory.
        NOTE(review): presumably an existing directory is an error unless this is
        True - confirm against chronify.
    no_prompts
        If True, accept the first matching dimension without asking the user.
    """
    # Resolve the project first so that a bad project_id fails before anything is
    # created on disk.
    project_config = (
        None if project_id is None else registry_manager.project_manager.get_by_id(project_id)
    )
    output_dir = (output_directory or Path()) / dataset_id
    check_overwrite(output_dir, overwrite)
    # parents=True: a not-yet-existing output_directory used to abort here with
    # FileNotFoundError. exist_ok is left at its default so that an unexpected
    # pre-existing directory is still reported.
    output_dir.mkdir(parents=True)
    dimensions_dir = output_dir / "dimensions"
    dimensions_dir.mkdir()
    dataset_file = output_dir / "dataset.json5"
    time_cols = time_columns or {"timestamp"}

    dimension_references: list[DimensionReferenceModel] = []
    for dim_type, ids in get_unique_dimension_record_ids(
        dataset_path, table_format, pivoted_dimension_type, time_cols
    ).items():
        # Project base dimensions take precedence over other registry dimensions.
        ref, checked_project_dim_ids = find_matching_project_base_dimension(
            project_config, ids, dim_type, no_prompts=no_prompts
        )
        if ref is None:
            # Dimensions already examined as project base dimensions are skipped here.
            ref = find_matching_registry_dimensions(
                registry_manager.dimension_manager,
                ids,
                dim_type,
                checked_project_dim_ids,
                no_prompts=no_prompts,
            )
        if ref is None:
            # No accepted match: emit a records file for the user to complete.
            write_dimension_records(ids, dimensions_dir / f"{dim_type.value}.csv")
        else:
            dimension_references.append(ref)

    config = make_unvalidated_dataset_config(
        dataset_id,
        metric_type,
        dimension_references=dimension_references,
        time_type=time_type,
    )
    dump_data(config, dataset_file, indent=2)
    logger.info("Wrote dataset config to %s", dataset_file)
86
+
87
+
88
def write_dimension_records(ids: Iterable[str], filename: Path) -> None:
    """Write a dimension records CSV file with the columns ``id`` and ``name``.

    Each record's name is derived from its ID by title-casing it and replacing
    underscores with spaces, e.g. ``new_york`` becomes ``New York``.

    Parameters
    ----------
    ids
        Record IDs, one per output row. Each value is converted with ``str``.
    filename
        Path of the CSV file to create. An existing file is overwritten.
    """
    # Function-scope import so that this fix does not depend on the file's import block.
    import csv

    # csv.writer quotes fields that contain commas, quotes, or newlines; joining the
    # values with "," by hand produced a corrupt file for such IDs.
    # newline="" is what the csv module requires of the file object, and
    # lineterminator="\n" keeps the rows LF-terminated.
    with open(filename, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(("id", "name"))
        for id_ in ids:
            str_id = str(id_)
            writer.writerow((str_id, str_id.title().replace("_", " ")))
    logger.info("Wrote dimension records to %s", filename)
99
+
100
+
101
def find_matching_project_base_dimension(
    project_config: ProjectConfig | None,
    sorted_record_ids: list[str],
    dimension_type: DimensionType,
    no_prompts: bool = False,
) -> tuple[DimensionReferenceModel | None, set[str]]:
    """Look for a project base dimension whose records equal the dataset's records.

    Parameters
    ----------
    project_config
        Project to search. If None, nothing is searched.
    sorted_record_ids
        The dataset's record IDs. Must already be sorted: they are compared by list
        equality against the sorted record IDs of each candidate dimension.
    dimension_type
        Type of dimension to search for. Time dimensions are never matched.
    no_prompts
        If True, accept the first match without asking the user.

    Returns
    -------
    tuple
        A reference to the accepted dimension, or None if there is none, followed by
        the IDs of all project base dimensions that were examined. Dimensions that the
        user declined are included in that set.
    """
    examined_ids: set[str] = set()
    if project_config is None or dimension_type == DimensionType.TIME:
        return None, examined_ids

    candidates = project_config.list_base_dimensions_with_records(dimension_type=dimension_type)
    for candidate in candidates:
        candidate_records = sorted(candidate.get_unique_ids())
        examined_ids.add(candidate.model.dimension_id)
        if sorted_record_ids != candidate_records:
            continue
        if no_prompts or get_user_input_on_dimension_match(candidate, "project base dimension"):
            return make_dimension_ref(candidate), examined_ids

    return None, examined_ids
124
+
125
+
126
def find_matching_registry_dimensions(
    dimension_manager: DimensionRegistryManager,
    ids: list[str],
    dimension_type: DimensionType,
    checked_project_dim_ids: set[str],
    no_prompts: bool = False,
) -> DimensionReferenceModel | None:
    """Look for a dimension in the registry that matches the dataset's records.

    Parameters
    ----------
    dimension_manager
        Manager whose ``find_matching_dimensions`` method supplies the candidates.
    ids
        The dataset's record IDs for this dimension type.
    dimension_type
        Type of dimension to search for.
    checked_project_dim_ids
        IDs of dimensions that were already examined; candidates with these IDs are
        skipped.
    no_prompts
        If True, accept the first eligible candidate without asking the user.

    Returns
    -------
    DimensionReferenceModel | None
        A reference to the accepted dimension, or None if no candidate was accepted.
    """
    for candidate in dimension_manager.find_matching_dimensions(ids, dimension_type):
        if candidate.model.dimension_id in checked_project_dim_ids:
            continue
        accepted = no_prompts or get_user_input_on_dimension_match(
            candidate, "dimension from the registry"
        )
        if accepted:
            return make_dimension_ref(candidate)
    return None
139
+
140
+
141
def get_user_input_on_dimension_match(dim: DimensionBaseConfigWithFiles, tag: str) -> bool:
    """Ask the user on stdin whether a matching dimension should be used.

    Parameters
    ----------
    dim
        The matching dimension; its type, name, description, and ID are shown.
    tag
        Short description of where the dimension was found; inserted into the prompt.

    Returns
    -------
    bool
        True only if the answer, lower-cased and stripped of whitespace, is ``y``.
    """
    prompt = (
        f"Found a {tag} with matching records:\n"
        f" Dimension type: {dim.model.dimension_type.value}\n"
        f" Name: {dim.model.name}\n"
        f" Description: {dim.model.description}\n"
        f" Dimension ID: {dim.model.dimension_id}\n"
        "Do you want to use it? (y/n) >>> "
    )
    answer = input(prompt)
    normalized = answer.lower().strip()
    return normalized == "y"
151
+
152
+
153
def make_dimension_ref(dim: DimensionBaseConfigWithFiles) -> DimensionReferenceModel:
    """Build a reference model from the ID, type, and version of a dimension config."""
    model = dim.model
    reference = DimensionReferenceModel(
        dimension_id=model.dimension_id,
        type=model.dimension_type,
        version=model.version,
    )
    return reference