dsgrid-toolkit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dsgrid-toolkit has been flagged for review; consult the registry's advisory page for details.
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +420 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +22 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +177 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +142 -0
- dsgrid/cli/dsgrid_admin.py +349 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +711 -0
- dsgrid/cli/registry.py +1773 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +35 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +187 -0
- dsgrid/config/common.py +131 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +684 -0
- dsgrid/config/dataset_schema_handler_factory.py +41 -0
- dsgrid/config/date_time_dimension_config.py +108 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +349 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +775 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/index_time_dimension_config.py +76 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1457 -0
- dsgrid/config/registration_models.py +199 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +200 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +899 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +196 -0
- dsgrid/dataset/dataset_schema_handler_standard.py +303 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +44 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +218 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +213 -0
- dsgrid/dimension/time.py +531 -0
- dsgrid/dimension/time_utils.py +88 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +950 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +384 -0
- dsgrid/query/models.py +726 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +847 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +161 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +69 -0
- dsgrid/registry/dataset_config_generator.py +156 -0
- dsgrid/registry/dataset_registry_manager.py +734 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +185 -0
- dsgrid/registry/filesystem_data_store.py +141 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1616 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +662 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +544 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +545 -0
- dsgrid/spark/types.py +50 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +139 -0
- dsgrid/tests/make_us_data_registry.py +204 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +612 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +64 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +184 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.2.0.dist-info/METADATA +216 -0
- dsgrid_toolkit-0.2.0.dist-info/RECORD +152 -0
- dsgrid_toolkit-0.2.0.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.2.0.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.2.0.dist-info/licenses/LICENSE +29 -0
dsgrid/cli/query.py
ADDED
|
@@ -0,0 +1,711 @@
|
|
|
1
|
+
"""Runs dsgrid queries."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import rich_click as click
|
|
8
|
+
from chronify.utils.path_utils import check_overwrite
|
|
9
|
+
from pydantic import ValidationError
|
|
10
|
+
|
|
11
|
+
from dsgrid.common import REMOTE_REGISTRY
|
|
12
|
+
from dsgrid.cli.common import (
|
|
13
|
+
check_output_directory,
|
|
14
|
+
get_value_from_context,
|
|
15
|
+
handle_dsgrid_exception,
|
|
16
|
+
path_callback,
|
|
17
|
+
)
|
|
18
|
+
from dsgrid.dimension.base_models import DimensionType
|
|
19
|
+
from dsgrid.dimension.dimension_filters import (
|
|
20
|
+
DimensionFilterType,
|
|
21
|
+
DimensionFilterExpressionModel,
|
|
22
|
+
DimensionFilterExpressionRawModel,
|
|
23
|
+
DimensionFilterBetweenColumnOperatorModel,
|
|
24
|
+
DimensionFilterColumnOperatorModel,
|
|
25
|
+
SubsetDimensionFilterModel,
|
|
26
|
+
SupplementalDimensionFilterColumnOperatorModel,
|
|
27
|
+
)
|
|
28
|
+
from dsgrid.filesystem.factory import make_filesystem_interface
|
|
29
|
+
from dsgrid.query.dataset_mapping_plan import DatasetMappingPlan
|
|
30
|
+
from dsgrid.query.derived_dataset import create_derived_dataset_config_from_query
|
|
31
|
+
from dsgrid.query.models import (
|
|
32
|
+
AggregationModel,
|
|
33
|
+
DatasetQueryModel,
|
|
34
|
+
DimensionNamesModel,
|
|
35
|
+
ProjectQueryModel,
|
|
36
|
+
ProjectQueryParamsModel,
|
|
37
|
+
CreateCompositeDatasetQueryModel,
|
|
38
|
+
CompositeDatasetQueryModel,
|
|
39
|
+
StandaloneDatasetModel,
|
|
40
|
+
ColumnType,
|
|
41
|
+
DatasetModel,
|
|
42
|
+
make_query_for_standalone_dataset,
|
|
43
|
+
)
|
|
44
|
+
from dsgrid.query.query_submitter import (
|
|
45
|
+
DatasetQuerySubmitter,
|
|
46
|
+
ProjectQuerySubmitter,
|
|
47
|
+
) # , CompositeDatasetQuerySubmitter
|
|
48
|
+
from dsgrid.registry.common import DatabaseConnection
|
|
49
|
+
from dsgrid.registry.registry_manager import RegistryManager
|
|
50
|
+
from dsgrid.utils.files import dump_json_file
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Default directory (relative to the current working directory) where query
# results are written when the user does not pass --output.
QUERY_OUTPUT_DIR = "query_output"

logger = logging.getLogger(__name__)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def add_options(options):
    """Return a decorator that applies a sequence of click option decorators.

    Options are applied in reverse so they appear on the CLI in the order
    they were declared (click decorators prepend).
    """

    def _apply(func):
        for opt in options[::-1]:
            func = opt(func)
        return func

    return _apply
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Options shared by every command that needs access to a dsgrid registry;
# applied via add_options().
_COMMON_REGISTRY_OPTIONS = (
    click.option(
        "--remote-path",
        default=REMOTE_REGISTRY,
        show_default=True,
        help="Path to dsgrid remote registry",
    ),
)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# Options shared by the commands that execute queries and write result
# directories; applied via add_options().
_COMMON_RUN_OPTIONS = (
    click.option(
        "-o",
        "--output",
        default=QUERY_OUTPUT_DIR,
        show_default=True,
        type=str,
        help="Output directory for query results",
        callback=path_callback,
    ),
    click.option(
        "--load-cached-table/--no-load-cached-table",
        is_flag=True,
        default=True,
        show_default=True,
        help="Try to load a cached table if one exists.",
    ),
    click.option(
        "--overwrite",
        is_flag=True,
        default=False,
        show_default=True,
        help="Overwrite results directory if it exists.",
    ),
)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
_create_project_query_epilog = """
|
|
105
|
+
Examples:\n
|
|
106
|
+
$ dsgrid query project create my_query_result_name my_project_id my_dataset_id\n
|
|
107
|
+
$ dsgrid query project create --default-result-aggregation my_query_result_name my_project_id my_dataset_id\n
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@click.command("create", epilog=_create_project_query_epilog)
|
|
112
|
+
@click.argument("query_name")
|
|
113
|
+
@click.argument("project_id")
|
|
114
|
+
@click.argument("dataset_id")
|
|
115
|
+
@click.option(
|
|
116
|
+
"-F",
|
|
117
|
+
"--filters",
|
|
118
|
+
type=click.Choice([x.value for x in DimensionFilterType]),
|
|
119
|
+
multiple=True,
|
|
120
|
+
help="Add a dimension filter. Requires user customization.",
|
|
121
|
+
)
|
|
122
|
+
@click.option(
|
|
123
|
+
"-a",
|
|
124
|
+
"--aggregation-function",
|
|
125
|
+
default="sum",
|
|
126
|
+
show_default=True,
|
|
127
|
+
help="Aggregation function for any included default aggregations.",
|
|
128
|
+
)
|
|
129
|
+
@click.option(
|
|
130
|
+
"-f",
|
|
131
|
+
"--query-file",
|
|
132
|
+
default="query.json5",
|
|
133
|
+
show_default=True,
|
|
134
|
+
help="Query file to create.",
|
|
135
|
+
callback=path_callback,
|
|
136
|
+
)
|
|
137
|
+
@click.option(
|
|
138
|
+
"-r",
|
|
139
|
+
"--default-result-aggregation",
|
|
140
|
+
is_flag=True,
|
|
141
|
+
default=False,
|
|
142
|
+
show_default=True,
|
|
143
|
+
help="Add default result aggregration.",
|
|
144
|
+
)
|
|
145
|
+
@click.option(
|
|
146
|
+
"--overwrite",
|
|
147
|
+
is_flag=True,
|
|
148
|
+
default=False,
|
|
149
|
+
show_default=True,
|
|
150
|
+
help="Overwrite query file if it exists.",
|
|
151
|
+
)
|
|
152
|
+
@add_options(_COMMON_REGISTRY_OPTIONS)
|
|
153
|
+
@click.pass_context
|
|
154
|
+
def create_project_query(
|
|
155
|
+
ctx,
|
|
156
|
+
query_name,
|
|
157
|
+
project_id,
|
|
158
|
+
dataset_id,
|
|
159
|
+
filters,
|
|
160
|
+
aggregation_function,
|
|
161
|
+
query_file,
|
|
162
|
+
default_result_aggregation,
|
|
163
|
+
overwrite,
|
|
164
|
+
remote_path,
|
|
165
|
+
):
|
|
166
|
+
"""Create a default query file for a dsgrid project."""
|
|
167
|
+
check_overwrite(query_file, overwrite)
|
|
168
|
+
conn = DatabaseConnection(
|
|
169
|
+
url=get_value_from_context(ctx, "url"),
|
|
170
|
+
)
|
|
171
|
+
registry_manager = RegistryManager.load(
|
|
172
|
+
conn,
|
|
173
|
+
remote_path=remote_path,
|
|
174
|
+
offline_mode=get_value_from_context(ctx, "offline"),
|
|
175
|
+
)
|
|
176
|
+
project = registry_manager.project_manager.load_project(project_id)
|
|
177
|
+
query = ProjectQueryModel(
|
|
178
|
+
name=query_name,
|
|
179
|
+
project=ProjectQueryParamsModel(
|
|
180
|
+
project_id=project_id,
|
|
181
|
+
dataset=DatasetModel(
|
|
182
|
+
dataset_id=dataset_id,
|
|
183
|
+
source_datasets=[
|
|
184
|
+
StandaloneDatasetModel(dataset_id=x)
|
|
185
|
+
for x in project.config.list_registered_dataset_ids()
|
|
186
|
+
],
|
|
187
|
+
),
|
|
188
|
+
),
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
for dim_filter in filters:
|
|
192
|
+
filter_type = DimensionFilterType(dim_filter)
|
|
193
|
+
match filter_type:
|
|
194
|
+
case DimensionFilterType.EXPRESSION:
|
|
195
|
+
flt = DimensionFilterExpressionModel(
|
|
196
|
+
dimension_type=DimensionType.GEOGRAPHY,
|
|
197
|
+
dimension_name="county",
|
|
198
|
+
operator="==",
|
|
199
|
+
value="",
|
|
200
|
+
)
|
|
201
|
+
case DimensionFilterType.BETWEEN_COLUMN_OPERATOR:
|
|
202
|
+
flt = DimensionFilterBetweenColumnOperatorModel(
|
|
203
|
+
dimension_type=DimensionType.TIME,
|
|
204
|
+
dimension_name="time_est",
|
|
205
|
+
lower_bound="",
|
|
206
|
+
upper_bound="",
|
|
207
|
+
)
|
|
208
|
+
case DimensionFilterType.COLUMN_OPERATOR:
|
|
209
|
+
flt = DimensionFilterColumnOperatorModel(
|
|
210
|
+
dimension_type=DimensionType.GEOGRAPHY,
|
|
211
|
+
dimension_name="county",
|
|
212
|
+
value="",
|
|
213
|
+
operator="contains",
|
|
214
|
+
)
|
|
215
|
+
case DimensionFilterType.SUPPLEMENTAL_COLUMN_OPERATOR:
|
|
216
|
+
flt = SupplementalDimensionFilterColumnOperatorModel(
|
|
217
|
+
dimension_type=DimensionType.GEOGRAPHY,
|
|
218
|
+
dimension_name="state",
|
|
219
|
+
)
|
|
220
|
+
case DimensionFilterType.EXPRESSION_RAW:
|
|
221
|
+
flt = DimensionFilterExpressionRawModel(
|
|
222
|
+
dimension_type=DimensionType.GEOGRAPHY,
|
|
223
|
+
dimension_name="county",
|
|
224
|
+
value="== '06037'",
|
|
225
|
+
)
|
|
226
|
+
case DimensionFilterType.SUBSET:
|
|
227
|
+
flt = SubsetDimensionFilterModel(
|
|
228
|
+
dimension_type=DimensionType.SUBSECTOR,
|
|
229
|
+
dimension_names=["commercial_subsectors", "residential_subsectors"],
|
|
230
|
+
)
|
|
231
|
+
case _:
|
|
232
|
+
msg = f"Bug: {filter_type}"
|
|
233
|
+
raise NotImplementedError(msg)
|
|
234
|
+
query.project.dataset.params.dimension_filters.append(flt)
|
|
235
|
+
|
|
236
|
+
if default_result_aggregation:
|
|
237
|
+
default_aggs = {
|
|
238
|
+
k.value: v for k, v in project.config.get_dimension_type_to_base_name_mapping().items()
|
|
239
|
+
}
|
|
240
|
+
if default_result_aggregation:
|
|
241
|
+
query.result.aggregations = [
|
|
242
|
+
AggregationModel(
|
|
243
|
+
dimensions=DimensionNamesModel(**default_aggs),
|
|
244
|
+
aggregation_function=aggregation_function,
|
|
245
|
+
),
|
|
246
|
+
]
|
|
247
|
+
|
|
248
|
+
query_file.write_text(query.model_dump_json(indent=2))
|
|
249
|
+
print(f"Wrote query to {query_file}", file=sys.stderr)
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
@click.command("validate")
@click.argument("query_file", type=click.Path(exists=True), callback=path_callback)
def validate_project_query(query_file):
    """Validate a project query file."""
    # Fix: the original command had no docstring, so `--help` showed no
    # description, inconsistent with every other command in this module.
    try:
        ProjectQueryModel.from_file(query_file)
    except ValidationError:
        print(f"Failed to validate query file {query_file}", file=sys.stderr)
        raise
    else:
        print(f"Validated {query_file}", file=sys.stderr)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
_run_project_query_epilog = """
|
|
264
|
+
Examples:\n
|
|
265
|
+
$ dsgrid query project run query.json5
|
|
266
|
+
"""
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
@click.command("run", epilog=_run_project_query_epilog)
@click.argument("query_definition_file", type=click.Path(exists=True))
@click.option(
    "-c",
    "--checkpoint-file",
    type=click.Path(exists=True),
    callback=path_callback,
    help="Checkpoint file created by a previous map operation. If passed, the code will "
    "read it and resume from the last persisted file.",
)
@click.option(
    "--persist-intermediate-table/--no-persist-intermediate-table",
    is_flag=True,
    default=True,
    show_default=True,
    help="Persist the intermediate table to the filesystem to allow for reuse.",
)
@click.option(
    "-z",
    "--zip-file",
    is_flag=True,
    default=False,
    show_default=True,
    help="Create a zip file containing all output files.",
)
@add_options(_COMMON_REGISTRY_OPTIONS)
@add_options(_COMMON_RUN_OPTIONS)
@click.pass_context
def run_project_query(
    ctx: click.Context,
    query_definition_file: Path,
    checkpoint_file: Path | None,
    persist_intermediate_table: bool,
    zip_file: bool,
    remote_path: str,
    output: Path,
    load_cached_table: bool,
    overwrite: bool,
):
    """Run a query on a dsgrid project."""
    # Parse and validate the query definition, then delegate execution to the
    # shared helper used by both this command and map-dataset.
    query = ProjectQueryModel.from_file(query_definition_file)
    _run_project_query(
        ctx=ctx,
        query=query,
        checkpoint_file=checkpoint_file,
        persist_intermediate_table=persist_intermediate_table,
        zip_file=zip_file,
        remote_path=remote_path,
        output=output,
        load_cached_table=load_cached_table,
        overwrite=overwrite,
    )
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _run_project_query(
    ctx: click.Context,
    query: ProjectQueryModel,
    checkpoint_file: Path | None,
    persist_intermediate_table: bool,
    zip_file: bool,
    remote_path,
    output: Path,
    load_cached_table: bool,
    overwrite: bool,
) -> None:
    """Load the registry and project, submit the query, and exit the CLI
    with a non-zero code if submission fails."""
    connection = DatabaseConnection(
        url=get_value_from_context(ctx, "url"),
    )
    scratch_dir = get_value_from_context(ctx, "scratch_dir")
    manager = RegistryManager.load(
        connection,
        remote_path=remote_path,
        offline_mode=get_value_from_context(ctx, "offline"),
    )
    project = manager.project_manager.load_project(query.project.project_id)
    filesystem = make_filesystem_interface(output)
    submitter = ProjectQuerySubmitter(project, filesystem.path(output))
    # handle_dsgrid_exception returns a tuple whose second element is the
    # return code.
    outcome = handle_dsgrid_exception(
        ctx,
        submitter.submit,
        query,
        scratch_dir,
        checkpoint_file=checkpoint_file,
        persist_intermediate_table=persist_intermediate_table,
        load_cached_table=load_cached_table,
        zip_file=zip_file,
        overwrite=overwrite,
    )
    if outcome[1] != 0:
        ctx.exit(outcome[1])
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
_map_dataset_epilog = """
|
|
362
|
+
Examples:\n
|
|
363
|
+
$ dsgrid query project map_dataset project_id dataset_id
|
|
364
|
+
"""
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
@click.command("map-dataset", epilog=_map_dataset_epilog)
@click.argument("project-id")
@click.argument("dataset-id")
@click.option(
    "-c",
    "--checkpoint-file",
    type=click.Path(exists=True),
    callback=path_callback,
    help="Checkpoint file created by a previous map operation. If passed, the code will "
    "read it and resume from the last persisted file.",
)
@click.option(
    "-p",
    "--mapping-plan",
    type=click.Path(exists=True),
    help="Path to a mapping plan file. If not provided, the default mapping plan will be used.",
    callback=path_callback,
)
@click.option(
    "--persist-intermediate-table/--no-persist-intermediate-table",
    is_flag=True,
    default=True,
    show_default=True,
    help="Persist the intermediate table to the filesystem to allow for reuse.",
)
@click.option(
    "-t",
    "--column-type",
    type=click.Choice([x.value for x in ColumnType]),
    default=ColumnType.DIMENSION_NAMES.value,
    callback=lambda *x: ColumnType(x[2]),
)
@click.option(
    "-z",
    "--zip-file",
    is_flag=True,
    default=False,
    show_default=True,
    help="Create a zip file containing all output files.",
)
@add_options(_COMMON_REGISTRY_OPTIONS)
@add_options(_COMMON_RUN_OPTIONS)
@click.pass_context
def map_dataset(
    ctx: click.Context,
    project_id: str,
    dataset_id: str,
    checkpoint_file: Path | None,
    mapping_plan: Path | None,
    persist_intermediate_table: bool,
    remote_path,
    output: Path,
    load_cached_table: bool,
    overwrite: bool,
    column_type: ColumnType,
    zip_file: bool,
):
    """Map a dataset to the project's base dimensions."""
    # Build a standalone-dataset query, optionally driven by a custom
    # mapping plan, then run it through the shared project-query path.
    plan = None
    if mapping_plan:
        plan = DatasetMappingPlan.from_file(mapping_plan)
    query = make_query_for_standalone_dataset(
        project_id, dataset_id, plan, column_type=column_type
    )
    _run_project_query(
        ctx=ctx,
        query=query,
        checkpoint_file=checkpoint_file,
        persist_intermediate_table=persist_intermediate_table,
        zip_file=zip_file,
        remote_path=remote_path,
        output=output,
        load_cached_table=load_cached_table,
        overwrite=overwrite,
    )
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
@click.command("create-query")
@click.argument("name", type=str)
@click.argument("dataset_id", type=str)
@click.option(
    "-f",
    "--query-file",
    default="dataset_query.json5",
    show_default=True,
    help="Query file to create.",
    callback=path_callback,
)
@click.option(
    "--overwrite",
    is_flag=True,
    default=False,
    show_default=True,
    help="Overwrite query file if it exists.",
)
@click.pass_context
def create_dataset_query(
    ctx,
    name: str,
    dataset_id: str,
    query_file: Path,
    overwrite: bool,
):
    """Create a query file to be used for mapping a dataset to an arbitrary list of dimensions."""
    query = DatasetQueryModel(name=name, dataset_id=dataset_id, to_dimension_references=[])
    check_overwrite(query_file, overwrite)
    serialized = query.model_dump(mode="json")
    # Strip fields that dataset queries do not support so the generated
    # template stays minimal.
    del serialized["version"]
    for unsupported in (
        "column_type",
        "replace_ids_with_names",
        "aggregations",
        "aggregate_each_dataset",
        "reports",
        "dimension_filters",
        "time_zone",
    ):
        del serialized["result"][unsupported]

    dump_json_file(serialized, query_file, indent=2)
    print(f"Wrote query to {query_file}", file=sys.stderr)
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
_run_dataset_query_epilog = """
|
|
490
|
+
Examples:\n
|
|
491
|
+
$ dsgrid query dataset run query.json5
|
|
492
|
+
"""
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
@click.command("run", epilog=_run_dataset_query_epilog)
@click.argument("query_definition_file", type=click.Path(exists=True))
@click.option(
    "-c",
    "--checkpoint-file",
    type=click.Path(exists=True),
    callback=path_callback,
    help="Checkpoint file created by a previous map operation. If passed, the code will "
    "read it and resume from the last persisted file.",
)
@click.option(
    "-o",
    "--output",
    default=QUERY_OUTPUT_DIR,
    show_default=True,
    type=str,
    help="Output directory for query results",
    callback=path_callback,
)
@click.option(
    "--overwrite",
    is_flag=True,
    default=False,
    show_default=True,
    help="Overwrite results directory if it exists.",
)
@add_options(_COMMON_REGISTRY_OPTIONS)
@click.pass_context
def run_dataset_query(
    ctx: click.Context,
    query_definition_file: Path,
    checkpoint_file: Path | None,
    output: Path,
    overwrite: bool,
    remote_path: str,
):
    """Run a query on a dsgrid dataset."""
    # Parse and validate the query definition, then delegate execution.
    query = DatasetQueryModel.from_file(query_definition_file)
    _run_dataset_query(
        ctx=ctx,
        query=query,
        checkpoint_file=checkpoint_file,
        remote_path=remote_path,
        output=output,
        overwrite=overwrite,
    )
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
def _run_dataset_query(
    ctx: click.Context,
    query: DatasetQueryModel,
    checkpoint_file: Path | None,
    remote_path,
    output: Path,
    overwrite: bool,
) -> None:
    """Load the registry, submit the dataset query, and exit the CLI with a
    non-zero code if submission fails."""
    # NOTE(review): checkpoint_file is accepted but never forwarded to
    # submit() — confirm whether DatasetQuerySubmitter supports checkpoints.
    connection = DatabaseConnection(
        url=get_value_from_context(ctx, "url"),
    )
    scratch_dir = get_value_from_context(ctx, "scratch_dir")
    manager = RegistryManager.load(
        connection,
        remote_path=remote_path,
        offline_mode=get_value_from_context(ctx, "offline"),
    )
    filesystem = make_filesystem_interface(output)
    submitter = DatasetQuerySubmitter(filesystem.path(output))
    # handle_dsgrid_exception returns a tuple whose second element is the
    # return code.
    outcome = handle_dsgrid_exception(
        ctx,
        submitter.submit,
        query,
        manager,
        scratch_dir,
        overwrite=overwrite,
    )
    if outcome[1] != 0:
        ctx.exit(outcome[1])
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
@click.command("create_dataset")
@click.argument("query_definition_file", type=click.Path(exists=True))
@add_options(_COMMON_REGISTRY_OPTIONS)
@add_options(_COMMON_RUN_OPTIONS)
@click.pass_context
def create_composite_dataset(
    ctx,
    query_definition_file,
    remote_path,
    output,
    load_cached_table,
    overwrite,
):
    """Run a query to create a composite dataset."""
    # Fix: the function signature expects remote_path, but the original did
    # not apply _COMMON_REGISTRY_OPTIONS, so click would fail to supply it
    # at invocation time.
    # Validate the query file even though execution is not implemented yet.
    CreateCompositeDatasetQueryModel.from_file(query_definition_file)
    # TODO: load the registry and project, then run the query through
    # CompositeDatasetQuerySubmitter once it is supported.
    print("not implemented yet")
    # Fix: click ignores a command's return value in standalone mode, so the
    # original `return 1` produced exit code 0; use ctx.exit instead.
    ctx.exit(1)
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
@click.command("run")
@click.argument("query_definition_file", type=click.Path(exists=True))
@add_options(_COMMON_REGISTRY_OPTIONS)
@add_options(_COMMON_RUN_OPTIONS)
@click.pass_context
def query_composite_dataset(
    ctx,
    query_definition_file,
    remote_path,
    output,
    load_cached_table,
    overwrite,
):
    """Run a query on a composite dataset."""
    # Fix: the function signature expects remote_path, but the original did
    # not apply _COMMON_REGISTRY_OPTIONS, so click would fail to supply it
    # at invocation time.
    # Validate the query file even though execution is not implemented yet.
    CompositeDatasetQueryModel.from_file(query_definition_file)
    # TODO: load the registry and project, then run the query through
    # CompositeDatasetQuerySubmitter once it is supported.
    print("not implemented yet")
    # Fix: click ignores a command's return value in standalone mode, so the
    # original `return 1` produced exit code 0; use ctx.exit instead.
    ctx.exit(1)
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
_create_derived_dataset_config_epilog = f"""
|
|
639
|
+
Examples:\n
|
|
640
|
+
$ dsgrid query project create-derived-dataset-config {QUERY_OUTPUT_DIR}/my_query_result_name my_dataset_config\n
|
|
641
|
+
"""
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
@click.command(epilog=_create_derived_dataset_config_epilog)
@click.argument("src")
@click.argument("dst")
@add_options(_COMMON_REGISTRY_OPTIONS)
@click.option(
    "--overwrite",
    is_flag=True,
    default=False,
    show_default=True,
    help="Overwrite results directory if it exists.",
)
@click.pass_context
def create_derived_dataset_config(ctx, src, dst, remote_path, overwrite):
    """Create a derived dataset configuration and dimensions from a query result."""
    fs_interface = make_filesystem_interface(src)
    src_path = fs_interface.path(src)
    if not src_path.exists():
        print(f"{src} does not exist", file=sys.stderr)
        # Fix: click ignores a command's return value in standalone mode, so
        # the original `return 1` produced exit code 0; ctx.exit sets the
        # process exit code correctly.
        ctx.exit(1)
    dst_path = fs_interface.path(dst)
    check_output_directory(dst_path, fs_interface, overwrite)

    conn = DatabaseConnection(
        url=get_value_from_context(ctx, "url"),
    )
    registry_manager = RegistryManager.load(
        conn,
        remote_path=remote_path,
        offline_mode=get_value_from_context(ctx, "offline"),
    )
    result = create_derived_dataset_config_from_query(src_path, dst_path, registry_manager)
    if not result:
        logger.error("The query defined in %s does not support a derived dataset.", src)
        ctx.exit(1)
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
@click.group()
def query():
    """Query group commands"""
    # Container group only; subgroups are attached with add_command.
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
@click.group()
def project():
    """Project group commands"""
    # Container group only; subcommands are attached with add_command.
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
@click.group()
def dataset():
    """Dataset group commands"""
    # Container group only; subcommands are attached with add_command.
|
|
693
|
+
|
|
694
|
+
|
|
695
|
+
@click.group()
def composite_dataset():
    """Composite dataset group commands"""
    # Container group only; subcommands are attached with add_command.
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
# Assemble the CLI hierarchy: the `query` group holds the composite_dataset,
# project, and dataset subgroups, each of which receives its commands here.
query.add_command(composite_dataset)
query.add_command(project)
query.add_command(dataset)
project.add_command(create_project_query)
project.add_command(validate_project_query)
project.add_command(run_project_query)
project.add_command(create_derived_dataset_config)
project.add_command(map_dataset)
dataset.add_command(create_dataset_query)
dataset.add_command(run_dataset_query)
composite_dataset.add_command(create_composite_dataset)
composite_dataset.add_command(query_composite_dataset)
|