dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Self
|
|
4
|
+
|
|
5
|
+
from dsgrid.spark.types import DataFrame
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DataStoreInterface(abc.ABC):
|
|
9
|
+
"""Base class for data stores."""
|
|
10
|
+
|
|
11
|
+
def __init__(self, base_path: Path):
|
|
12
|
+
self._base_path = base_path
|
|
13
|
+
|
|
14
|
+
@classmethod
|
|
15
|
+
@abc.abstractmethod
|
|
16
|
+
def create(cls, base_path: Path) -> Self:
|
|
17
|
+
"""Create the data store."""
|
|
18
|
+
|
|
19
|
+
@classmethod
|
|
20
|
+
@abc.abstractmethod
|
|
21
|
+
def load(cls, base_path: Path) -> Self:
|
|
22
|
+
"""Load an existing data store."""
|
|
23
|
+
|
|
24
|
+
@property
|
|
25
|
+
def base_path(self) -> Path:
|
|
26
|
+
"""Return the base path of the data store."""
|
|
27
|
+
return self._base_path
|
|
28
|
+
|
|
29
|
+
@abc.abstractmethod
|
|
30
|
+
def read_table(self, dataset_id: str, version: str) -> DataFrame:
|
|
31
|
+
"""Read a table from the data store."""
|
|
32
|
+
|
|
33
|
+
@abc.abstractmethod
|
|
34
|
+
def replace_table(self, df: DataFrame, dataset_id: str, version: str) -> None:
|
|
35
|
+
"""Replace a table in the data store."""
|
|
36
|
+
|
|
37
|
+
@abc.abstractmethod
|
|
38
|
+
def read_lookup_table(self, dataset_id: str, version: str) -> DataFrame:
|
|
39
|
+
"""Read a lookup table from the data store."""
|
|
40
|
+
|
|
41
|
+
@abc.abstractmethod
|
|
42
|
+
def replace_lookup_table(self, df: DataFrame, dataset_id: str, version: str) -> None:
|
|
43
|
+
"""Replace a lookup table in the data store."""
|
|
44
|
+
|
|
45
|
+
@abc.abstractmethod
|
|
46
|
+
def write_table(
|
|
47
|
+
self, df: DataFrame, dataset_id: str, version: str, overwrite: bool = False
|
|
48
|
+
) -> None:
|
|
49
|
+
"""Write a table to the data store."""
|
|
50
|
+
|
|
51
|
+
@abc.abstractmethod
|
|
52
|
+
def write_lookup_table(
|
|
53
|
+
self, df: DataFrame, dataset_id: str, version: str, overwrite: bool = False
|
|
54
|
+
) -> None:
|
|
55
|
+
"""Write a lookup table to the data store."""
|
|
56
|
+
|
|
57
|
+
@abc.abstractmethod
|
|
58
|
+
def write_missing_associations_tables(
|
|
59
|
+
self, dfs: dict[str, DataFrame], dataset_id: str, version: str, overwrite: bool = False
|
|
60
|
+
) -> None:
|
|
61
|
+
"""Write a set of tables of missing dimension associations to the data store.
|
|
62
|
+
The dictionary keys of the dfs argument should human-readable tags for the contents of
|
|
63
|
+
the tables, but are not otherwise significant.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
@abc.abstractmethod
|
|
67
|
+
def read_missing_associations_tables(
|
|
68
|
+
self, dataset_id: str, version: str
|
|
69
|
+
) -> dict[str, DataFrame]:
|
|
70
|
+
"""Read a missing dimensions association tables from the data store."""
|
|
71
|
+
|
|
72
|
+
@abc.abstractmethod
|
|
73
|
+
def remove_tables(self, dataset_id: str, version: str) -> None:
|
|
74
|
+
"""Remove the data and lookup tables from the data store."""
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Iterable
|
|
4
|
+
|
|
5
|
+
from chronify.utils.path_utils import check_overwrite
|
|
6
|
+
|
|
7
|
+
from dsgrid.config.dataset_config import (
|
|
8
|
+
get_unique_dimension_record_ids,
|
|
9
|
+
make_unvalidated_dataset_config,
|
|
10
|
+
)
|
|
11
|
+
from dsgrid.dataset.models import TableFormat
|
|
12
|
+
from dsgrid.config.project_config import ProjectConfig
|
|
13
|
+
from dsgrid.dimension.base_models import DimensionType
|
|
14
|
+
from dsgrid.dimension.time import TimeDimensionType
|
|
15
|
+
from dsgrid.registry.dimension_registry_manager import DimensionRegistryManager
|
|
16
|
+
from dsgrid.registry.registry_manager import RegistryManager
|
|
17
|
+
from dsgrid.utils.files import dump_data
|
|
18
|
+
from dsgrid.config.dimensions import DimensionReferenceModel
|
|
19
|
+
from dsgrid.config.dimension_config import DimensionBaseConfigWithFiles
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def generate_config_from_dataset(
    registry_manager: RegistryManager,
    dataset_id: str,
    dataset_path: Path,
    table_format: TableFormat,
    metric_type: str,
    pivoted_dimension_type: DimensionType | None = None,
    time_type: TimeDimensionType | None = None,
    time_columns: set[str] | None = None,
    output_directory: Path | None = None,
    project_id: str | None = None,
    overwrite: bool = False,
    no_prompts: bool = False,
) -> None:
    """Generate dataset config files from a dataset table.

    Fill out the dimension record files based on the unique values in the dataset.

    Look for matches for dimensions in the registry, checking for project base dimensions
    first. Prompt the user for confirmation unless --no-prompts is set. If --no-prompts is
    set, the first match is automatically accepted.

    Parameters
    ----------
    registry_manager
        Provides the project manager and dimension manager used to look up matches.
    dataset_id
        ID of the dataset. Also used as the name of the output subdirectory.
    dataset_path
        Path to the dataset table whose unique dimension record IDs are read.
    table_format
        Format of the dataset table.
    metric_type
        Metric type passed through to the generated dataset config.
    pivoted_dimension_type
        Dimension type of the pivoted columns, if any.
    time_type
        Time dimension type passed through to the generated dataset config.
    time_columns
        Names of the time columns in the table. Defaults to {"timestamp"}.
    output_directory
        Directory in which to create the ``dataset_id`` subdirectory. Defaults to the
        current directory. Missing parent directories are created.
    project_id
        If set, base dimensions of this project are checked for matches first.
    overwrite
        Passed to ``check_overwrite`` for the output subdirectory.
    no_prompts
        If True, accept the first matching dimension without asking the user.
    """
    project_config = (
        None if project_id is None else registry_manager.project_manager.get_by_id(project_id)
    )
    output_dir = (output_directory or Path()) / dataset_id
    check_overwrite(output_dir, overwrite)
    # parents=True: a user-supplied output directory that does not exist yet is created
    # instead of failing with FileNotFoundError.
    output_dir.mkdir(parents=True)
    dimensions_dir = output_dir / "dimensions"
    dimensions_dir.mkdir()
    dataset_file = output_dir / "dataset.json5"
    time_cols = time_columns or {"timestamp"}

    dimension_references: list[DimensionReferenceModel] = []
    for dim_type, ids in get_unique_dimension_record_ids(
        dataset_path, table_format, pivoted_dimension_type, time_cols
    ).items():
        # Project base dimensions take priority over other registry dimensions.
        ref, checked_project_dim_ids = find_matching_project_base_dimension(
            project_config, ids, dim_type, no_prompts=no_prompts
        )
        if ref is None:
            # Skip the dimensions already offered above so the user is not asked twice.
            ref = find_matching_registry_dimensions(
                registry_manager.dimension_manager,
                ids,
                dim_type,
                checked_project_dim_ids,
                no_prompts=no_prompts,
            )
        if ref is None:
            # No registered dimension was accepted; write a new records file instead.
            write_dimension_records(ids, dimensions_dir / f"{dim_type.value}.csv")
        else:
            dimension_references.append(ref)

    config = make_unvalidated_dataset_config(
        dataset_id,
        metric_type,
        dimension_references=dimension_references,
        time_type=time_type,
    )
    dump_data(config, dataset_file, indent=2)
    logger.info("Wrote dataset config to %s", dataset_file)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def write_dimension_records(ids: Iterable[str], filename: Path) -> None:
    """Write a dimension records CSV file with ``id`` and ``name`` columns.

    Each name is derived from its ID by title-casing it and replacing underscores with
    spaces. Rows are emitted through the ``csv`` module so that IDs containing commas,
    quotes, or newlines are quoted instead of corrupting the file.

    Parameters
    ----------
    ids
        Record IDs to write. Each one is converted with ``str()``.
    filename
        Path of the CSV file to create or overwrite.
    """
    # Imported locally so the fix does not rely on a change to the module-level imports.
    import csv
    import os

    # newline="" hands line-ending control to the csv writer (per the csv docs);
    # os.linesep keeps the row terminator that the previous text-mode writes produced.
    with open(filename, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator=os.linesep)
        writer.writerow(["id", "name"])
        for id_ in ids:
            str_id = str(id_)
            writer.writerow([str_id, str_id.title().replace("_", " ")])
    logger.info("Wrote dimension records to %s", filename)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def find_matching_project_base_dimension(
    project_config: ProjectConfig | None,
    sorted_record_ids: list[str],
    dimension_type: DimensionType,
    no_prompts: bool = False,
) -> tuple[DimensionReferenceModel | None, set[str]]:
    """Look for a project base dimension whose record IDs equal the dataset's.

    Returns a tuple of the reference to the accepted dimension (or None if there is no
    project, the dimension type is time, or nothing was accepted) and the set of
    dimension IDs that were examined, whether or not they matched.
    """
    examined_ids: set[str] = set()
    # Nothing to search without a project; time dimensions are never matched here.
    if project_config is None or dimension_type == DimensionType.TIME:
        return None, examined_ids

    candidates = project_config.list_base_dimensions_with_records(dimension_type=dimension_type)
    for candidate in candidates:
        candidate_ids = sorted(candidate.get_unique_ids())
        # Recorded before the comparison so that non-matching dimensions are included too.
        examined_ids.add(candidate.model.dimension_id)
        is_match = sorted_record_ids == candidate_ids
        if not is_match:
            continue
        if no_prompts or get_user_input_on_dimension_match(candidate, "project base dimension"):
            return make_dimension_ref(candidate), examined_ids

    return None, examined_ids
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def find_matching_registry_dimensions(
    dimension_manager: DimensionRegistryManager,
    ids: list[str],
    dimension_type: DimensionType,
    checked_project_dim_ids: set[str],
    no_prompts: bool = False,
) -> DimensionReferenceModel | None:
    """Look for a registry dimension that matches the given record IDs.

    Dimensions whose IDs are in checked_project_dim_ids are skipped. Returns a reference
    to the first remaining match that is accepted (automatically when no_prompts is True,
    otherwise by the user), or None if there is none.
    """
    for candidate in dimension_manager.find_matching_dimensions(ids, dimension_type):
        if candidate.model.dimension_id in checked_project_dim_ids:
            continue
        if no_prompts or get_user_input_on_dimension_match(
            candidate, "dimension from the registry"
        ):
            return make_dimension_ref(candidate)
    return None
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def get_user_input_on_dimension_match(dim: DimensionBaseConfigWithFiles, tag: str) -> bool:
    """Describe a matching dimension on the console and ask whether to use it.

    Returns True only if the reply, ignoring case and surrounding whitespace, is "y".
    """
    prompt = (
        f"Found a {tag} with matching records:\n"
        f" Dimension type: {dim.model.dimension_type.value}\n"
        f" Name: {dim.model.name}\n"
        f" Description: {dim.model.description}\n"
        f" Dimension ID: {dim.model.dimension_id}\n"
        "Do you want to use it? (y/n) >>> "
    )
    answer = input(prompt)
    normalized = answer.lower().strip()
    return normalized == "y"
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def make_dimension_ref(dim: DimensionBaseConfigWithFiles) -> DimensionReferenceModel:
    """Build a reference model from a dimension config's ID, type, and version."""
    model = dim.model
    reference_fields = {
        "dimension_id": model.dimension_id,
        "type": model.dimension_type,
        "version": model.version,
    }
    return DimensionReferenceModel(**reference_fields)
|