dsgrid-toolkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dsgrid-toolkit might be problematic. Click here for more details.

Files changed (152) hide show
  1. dsgrid/__init__.py +22 -0
  2. dsgrid/api/__init__.py +0 -0
  3. dsgrid/api/api_manager.py +179 -0
  4. dsgrid/api/app.py +420 -0
  5. dsgrid/api/models.py +60 -0
  6. dsgrid/api/response_models.py +116 -0
  7. dsgrid/apps/__init__.py +0 -0
  8. dsgrid/apps/project_viewer/app.py +216 -0
  9. dsgrid/apps/registration_gui.py +444 -0
  10. dsgrid/chronify.py +22 -0
  11. dsgrid/cli/__init__.py +0 -0
  12. dsgrid/cli/common.py +120 -0
  13. dsgrid/cli/config.py +177 -0
  14. dsgrid/cli/download.py +13 -0
  15. dsgrid/cli/dsgrid.py +142 -0
  16. dsgrid/cli/dsgrid_admin.py +349 -0
  17. dsgrid/cli/install_notebooks.py +62 -0
  18. dsgrid/cli/query.py +711 -0
  19. dsgrid/cli/registry.py +1773 -0
  20. dsgrid/cloud/__init__.py +0 -0
  21. dsgrid/cloud/cloud_storage_interface.py +140 -0
  22. dsgrid/cloud/factory.py +31 -0
  23. dsgrid/cloud/fake_storage_interface.py +37 -0
  24. dsgrid/cloud/s3_storage_interface.py +156 -0
  25. dsgrid/common.py +35 -0
  26. dsgrid/config/__init__.py +0 -0
  27. dsgrid/config/annual_time_dimension_config.py +187 -0
  28. dsgrid/config/common.py +131 -0
  29. dsgrid/config/config_base.py +148 -0
  30. dsgrid/config/dataset_config.py +684 -0
  31. dsgrid/config/dataset_schema_handler_factory.py +41 -0
  32. dsgrid/config/date_time_dimension_config.py +108 -0
  33. dsgrid/config/dimension_config.py +54 -0
  34. dsgrid/config/dimension_config_factory.py +65 -0
  35. dsgrid/config/dimension_mapping_base.py +349 -0
  36. dsgrid/config/dimension_mappings_config.py +48 -0
  37. dsgrid/config/dimensions.py +775 -0
  38. dsgrid/config/dimensions_config.py +71 -0
  39. dsgrid/config/index_time_dimension_config.py +76 -0
  40. dsgrid/config/input_dataset_requirements.py +31 -0
  41. dsgrid/config/mapping_tables.py +209 -0
  42. dsgrid/config/noop_time_dimension_config.py +42 -0
  43. dsgrid/config/project_config.py +1457 -0
  44. dsgrid/config/registration_models.py +199 -0
  45. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  46. dsgrid/config/simple_models.py +49 -0
  47. dsgrid/config/supplemental_dimension.py +29 -0
  48. dsgrid/config/time_dimension_base_config.py +200 -0
  49. dsgrid/data_models.py +155 -0
  50. dsgrid/dataset/__init__.py +0 -0
  51. dsgrid/dataset/dataset.py +123 -0
  52. dsgrid/dataset/dataset_expression_handler.py +86 -0
  53. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  54. dsgrid/dataset/dataset_schema_handler_base.py +899 -0
  55. dsgrid/dataset/dataset_schema_handler_one_table.py +196 -0
  56. dsgrid/dataset/dataset_schema_handler_standard.py +303 -0
  57. dsgrid/dataset/growth_rates.py +162 -0
  58. dsgrid/dataset/models.py +44 -0
  59. dsgrid/dataset/table_format_handler_base.py +257 -0
  60. dsgrid/dataset/table_format_handler_factory.py +17 -0
  61. dsgrid/dataset/unpivoted_table.py +121 -0
  62. dsgrid/dimension/__init__.py +0 -0
  63. dsgrid/dimension/base_models.py +218 -0
  64. dsgrid/dimension/dimension_filters.py +308 -0
  65. dsgrid/dimension/standard.py +213 -0
  66. dsgrid/dimension/time.py +531 -0
  67. dsgrid/dimension/time_utils.py +88 -0
  68. dsgrid/dsgrid_rc.py +88 -0
  69. dsgrid/exceptions.py +105 -0
  70. dsgrid/filesystem/__init__.py +0 -0
  71. dsgrid/filesystem/cloud_filesystem.py +32 -0
  72. dsgrid/filesystem/factory.py +32 -0
  73. dsgrid/filesystem/filesystem_interface.py +136 -0
  74. dsgrid/filesystem/local_filesystem.py +74 -0
  75. dsgrid/filesystem/s3_filesystem.py +118 -0
  76. dsgrid/loggers.py +132 -0
  77. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +950 -0
  78. dsgrid/notebooks/registration.ipynb +48 -0
  79. dsgrid/notebooks/start_notebook.sh +11 -0
  80. dsgrid/project.py +451 -0
  81. dsgrid/query/__init__.py +0 -0
  82. dsgrid/query/dataset_mapping_plan.py +142 -0
  83. dsgrid/query/derived_dataset.py +384 -0
  84. dsgrid/query/models.py +726 -0
  85. dsgrid/query/query_context.py +287 -0
  86. dsgrid/query/query_submitter.py +847 -0
  87. dsgrid/query/report_factory.py +19 -0
  88. dsgrid/query/report_peak_load.py +70 -0
  89. dsgrid/query/reports_base.py +20 -0
  90. dsgrid/registry/__init__.py +0 -0
  91. dsgrid/registry/bulk_register.py +161 -0
  92. dsgrid/registry/common.py +287 -0
  93. dsgrid/registry/config_update_checker_base.py +63 -0
  94. dsgrid/registry/data_store_factory.py +34 -0
  95. dsgrid/registry/data_store_interface.py +69 -0
  96. dsgrid/registry/dataset_config_generator.py +156 -0
  97. dsgrid/registry/dataset_registry_manager.py +734 -0
  98. dsgrid/registry/dataset_update_checker.py +16 -0
  99. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  100. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  101. dsgrid/registry/dimension_registry_manager.py +413 -0
  102. dsgrid/registry/dimension_update_checker.py +16 -0
  103. dsgrid/registry/duckdb_data_store.py +185 -0
  104. dsgrid/registry/filesystem_data_store.py +141 -0
  105. dsgrid/registry/filter_registry_manager.py +123 -0
  106. dsgrid/registry/project_config_generator.py +57 -0
  107. dsgrid/registry/project_registry_manager.py +1616 -0
  108. dsgrid/registry/project_update_checker.py +48 -0
  109. dsgrid/registry/registration_context.py +223 -0
  110. dsgrid/registry/registry_auto_updater.py +316 -0
  111. dsgrid/registry/registry_database.py +662 -0
  112. dsgrid/registry/registry_interface.py +446 -0
  113. dsgrid/registry/registry_manager.py +544 -0
  114. dsgrid/registry/registry_manager_base.py +367 -0
  115. dsgrid/registry/versioning.py +92 -0
  116. dsgrid/spark/__init__.py +0 -0
  117. dsgrid/spark/functions.py +545 -0
  118. dsgrid/spark/types.py +50 -0
  119. dsgrid/tests/__init__.py +0 -0
  120. dsgrid/tests/common.py +139 -0
  121. dsgrid/tests/make_us_data_registry.py +204 -0
  122. dsgrid/tests/register_derived_datasets.py +103 -0
  123. dsgrid/tests/utils.py +25 -0
  124. dsgrid/time/__init__.py +0 -0
  125. dsgrid/time/time_conversions.py +80 -0
  126. dsgrid/time/types.py +67 -0
  127. dsgrid/units/__init__.py +0 -0
  128. dsgrid/units/constants.py +113 -0
  129. dsgrid/units/convert.py +71 -0
  130. dsgrid/units/energy.py +145 -0
  131. dsgrid/units/power.py +87 -0
  132. dsgrid/utils/__init__.py +0 -0
  133. dsgrid/utils/dataset.py +612 -0
  134. dsgrid/utils/files.py +179 -0
  135. dsgrid/utils/filters.py +125 -0
  136. dsgrid/utils/id_remappings.py +100 -0
  137. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  138. dsgrid/utils/py_expression_eval/README.md +8 -0
  139. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  140. dsgrid/utils/py_expression_eval/tests.py +283 -0
  141. dsgrid/utils/run_command.py +70 -0
  142. dsgrid/utils/scratch_dir_context.py +64 -0
  143. dsgrid/utils/spark.py +918 -0
  144. dsgrid/utils/spark_partition.py +98 -0
  145. dsgrid/utils/timing.py +239 -0
  146. dsgrid/utils/utilities.py +184 -0
  147. dsgrid/utils/versioning.py +36 -0
  148. dsgrid_toolkit-0.2.0.dist-info/METADATA +216 -0
  149. dsgrid_toolkit-0.2.0.dist-info/RECORD +152 -0
  150. dsgrid_toolkit-0.2.0.dist-info/WHEEL +4 -0
  151. dsgrid_toolkit-0.2.0.dist-info/entry_points.txt +4 -0
  152. dsgrid_toolkit-0.2.0.dist-info/licenses/LICENSE +29 -0
dsgrid/cli/query.py ADDED
@@ -0,0 +1,711 @@
1
+ """Runs dsgrid queries."""
2
+
3
+ import logging
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import rich_click as click
8
+ from chronify.utils.path_utils import check_overwrite
9
+ from pydantic import ValidationError
10
+
11
+ from dsgrid.common import REMOTE_REGISTRY
12
+ from dsgrid.cli.common import (
13
+ check_output_directory,
14
+ get_value_from_context,
15
+ handle_dsgrid_exception,
16
+ path_callback,
17
+ )
18
+ from dsgrid.dimension.base_models import DimensionType
19
+ from dsgrid.dimension.dimension_filters import (
20
+ DimensionFilterType,
21
+ DimensionFilterExpressionModel,
22
+ DimensionFilterExpressionRawModel,
23
+ DimensionFilterBetweenColumnOperatorModel,
24
+ DimensionFilterColumnOperatorModel,
25
+ SubsetDimensionFilterModel,
26
+ SupplementalDimensionFilterColumnOperatorModel,
27
+ )
28
+ from dsgrid.filesystem.factory import make_filesystem_interface
29
+ from dsgrid.query.dataset_mapping_plan import DatasetMappingPlan
30
+ from dsgrid.query.derived_dataset import create_derived_dataset_config_from_query
31
+ from dsgrid.query.models import (
32
+ AggregationModel,
33
+ DatasetQueryModel,
34
+ DimensionNamesModel,
35
+ ProjectQueryModel,
36
+ ProjectQueryParamsModel,
37
+ CreateCompositeDatasetQueryModel,
38
+ CompositeDatasetQueryModel,
39
+ StandaloneDatasetModel,
40
+ ColumnType,
41
+ DatasetModel,
42
+ make_query_for_standalone_dataset,
43
+ )
44
+ from dsgrid.query.query_submitter import (
45
+ DatasetQuerySubmitter,
46
+ ProjectQuerySubmitter,
47
+ ) # , CompositeDatasetQuerySubmitter
48
+ from dsgrid.registry.common import DatabaseConnection
49
+ from dsgrid.registry.registry_manager import RegistryManager
50
+ from dsgrid.utils.files import dump_json_file
51
+
52
+
53
# Default directory for query results; also referenced in help epilog text below.
QUERY_OUTPUT_DIR = "query_output"

# Module-level logger, following the one-logger-per-module convention.
logger = logging.getLogger(__name__)
56
+
57
+
58
def add_options(options):
    """Build a decorator that applies every click option in *options* to a command.

    Options are applied in reverse so they appear in declared order in --help.
    """

    def _decorate(command):
        decorated = command
        for opt in options[::-1]:
            decorated = opt(decorated)
        return decorated

    return _decorate
65
+
66
+
67
# Registry-connection options shared by several commands in this module;
# applied to a command via the add_options() helper above.
_COMMON_REGISTRY_OPTIONS = (
    click.option(
        "--remote-path",
        default=REMOTE_REGISTRY,
        show_default=True,
        help="Path to dsgrid remote registry",
    ),
)
75
+
76
+
77
# Options shared by the "run"-style commands (output location, caching,
# overwrite behavior); applied via the add_options() helper above.
_COMMON_RUN_OPTIONS = (
    click.option(
        "-o",
        "--output",
        default=QUERY_OUTPUT_DIR,
        show_default=True,
        type=str,
        help="Output directory for query results",
        # path_callback converts the string into a Path-like object.
        callback=path_callback,
    ),
    click.option(
        "--load-cached-table/--no-load-cached-table",
        is_flag=True,
        default=True,
        show_default=True,
        help="Try to load a cached table if one exists.",
    ),
    click.option(
        "--overwrite",
        is_flag=True,
        default=False,
        show_default=True,
        help="Overwrite results directory if it exists.",
    ),
)
102
+
103
+
104
# Epilog appended to `dsgrid query project create --help` output.
_create_project_query_epilog = """
Examples:\n
$ dsgrid query project create my_query_result_name my_project_id my_dataset_id\n
$ dsgrid query project create --default-result-aggregation my_query_result_name my_project_id my_dataset_id\n
"""
109
+
110
+
111
+ @click.command("create", epilog=_create_project_query_epilog)
112
+ @click.argument("query_name")
113
+ @click.argument("project_id")
114
+ @click.argument("dataset_id")
115
+ @click.option(
116
+ "-F",
117
+ "--filters",
118
+ type=click.Choice([x.value for x in DimensionFilterType]),
119
+ multiple=True,
120
+ help="Add a dimension filter. Requires user customization.",
121
+ )
122
+ @click.option(
123
+ "-a",
124
+ "--aggregation-function",
125
+ default="sum",
126
+ show_default=True,
127
+ help="Aggregation function for any included default aggregations.",
128
+ )
129
+ @click.option(
130
+ "-f",
131
+ "--query-file",
132
+ default="query.json5",
133
+ show_default=True,
134
+ help="Query file to create.",
135
+ callback=path_callback,
136
+ )
137
+ @click.option(
138
+ "-r",
139
+ "--default-result-aggregation",
140
+ is_flag=True,
141
+ default=False,
142
+ show_default=True,
143
+ help="Add default result aggregration.",
144
+ )
145
+ @click.option(
146
+ "--overwrite",
147
+ is_flag=True,
148
+ default=False,
149
+ show_default=True,
150
+ help="Overwrite query file if it exists.",
151
+ )
152
+ @add_options(_COMMON_REGISTRY_OPTIONS)
153
+ @click.pass_context
154
+ def create_project_query(
155
+ ctx,
156
+ query_name,
157
+ project_id,
158
+ dataset_id,
159
+ filters,
160
+ aggregation_function,
161
+ query_file,
162
+ default_result_aggregation,
163
+ overwrite,
164
+ remote_path,
165
+ ):
166
+ """Create a default query file for a dsgrid project."""
167
+ check_overwrite(query_file, overwrite)
168
+ conn = DatabaseConnection(
169
+ url=get_value_from_context(ctx, "url"),
170
+ )
171
+ registry_manager = RegistryManager.load(
172
+ conn,
173
+ remote_path=remote_path,
174
+ offline_mode=get_value_from_context(ctx, "offline"),
175
+ )
176
+ project = registry_manager.project_manager.load_project(project_id)
177
+ query = ProjectQueryModel(
178
+ name=query_name,
179
+ project=ProjectQueryParamsModel(
180
+ project_id=project_id,
181
+ dataset=DatasetModel(
182
+ dataset_id=dataset_id,
183
+ source_datasets=[
184
+ StandaloneDatasetModel(dataset_id=x)
185
+ for x in project.config.list_registered_dataset_ids()
186
+ ],
187
+ ),
188
+ ),
189
+ )
190
+
191
+ for dim_filter in filters:
192
+ filter_type = DimensionFilterType(dim_filter)
193
+ match filter_type:
194
+ case DimensionFilterType.EXPRESSION:
195
+ flt = DimensionFilterExpressionModel(
196
+ dimension_type=DimensionType.GEOGRAPHY,
197
+ dimension_name="county",
198
+ operator="==",
199
+ value="",
200
+ )
201
+ case DimensionFilterType.BETWEEN_COLUMN_OPERATOR:
202
+ flt = DimensionFilterBetweenColumnOperatorModel(
203
+ dimension_type=DimensionType.TIME,
204
+ dimension_name="time_est",
205
+ lower_bound="",
206
+ upper_bound="",
207
+ )
208
+ case DimensionFilterType.COLUMN_OPERATOR:
209
+ flt = DimensionFilterColumnOperatorModel(
210
+ dimension_type=DimensionType.GEOGRAPHY,
211
+ dimension_name="county",
212
+ value="",
213
+ operator="contains",
214
+ )
215
+ case DimensionFilterType.SUPPLEMENTAL_COLUMN_OPERATOR:
216
+ flt = SupplementalDimensionFilterColumnOperatorModel(
217
+ dimension_type=DimensionType.GEOGRAPHY,
218
+ dimension_name="state",
219
+ )
220
+ case DimensionFilterType.EXPRESSION_RAW:
221
+ flt = DimensionFilterExpressionRawModel(
222
+ dimension_type=DimensionType.GEOGRAPHY,
223
+ dimension_name="county",
224
+ value="== '06037'",
225
+ )
226
+ case DimensionFilterType.SUBSET:
227
+ flt = SubsetDimensionFilterModel(
228
+ dimension_type=DimensionType.SUBSECTOR,
229
+ dimension_names=["commercial_subsectors", "residential_subsectors"],
230
+ )
231
+ case _:
232
+ msg = f"Bug: {filter_type}"
233
+ raise NotImplementedError(msg)
234
+ query.project.dataset.params.dimension_filters.append(flt)
235
+
236
+ if default_result_aggregation:
237
+ default_aggs = {
238
+ k.value: v for k, v in project.config.get_dimension_type_to_base_name_mapping().items()
239
+ }
240
+ if default_result_aggregation:
241
+ query.result.aggregations = [
242
+ AggregationModel(
243
+ dimensions=DimensionNamesModel(**default_aggs),
244
+ aggregation_function=aggregation_function,
245
+ ),
246
+ ]
247
+
248
+ query_file.write_text(query.model_dump_json(indent=2))
249
+ print(f"Wrote query to {query_file}", file=sys.stderr)
250
+
251
+
252
@click.command("validate")
@click.argument("query_file", type=click.Path(exists=True), callback=path_callback)
def validate_project_query(query_file):
    """Validate a project query file, printing the outcome and re-raising on failure."""
    try:
        ProjectQueryModel.from_file(query_file)
    except ValidationError:
        print(f"Failed to validate query file {query_file}", file=sys.stderr)
        raise
    else:
        print(f"Validated {query_file}", file=sys.stderr)
261
+
262
+
263
# Epilog appended to `dsgrid query project run --help` output.
_run_project_query_epilog = """
Examples:\n
$ dsgrid query project run query.json5
"""
267
+
268
+
269
@click.command("run", epilog=_run_project_query_epilog)
@click.argument("query_definition_file", type=click.Path(exists=True))
@click.option(
    "-c",
    "--checkpoint-file",
    type=click.Path(exists=True),
    callback=path_callback,
    help="Checkpoint file created by a previous map operation. If passed, the code will "
    "read it and resume from the last persisted file.",
)
@click.option(
    "--persist-intermediate-table/--no-persist-intermediate-table",
    is_flag=True,
    default=True,
    show_default=True,
    help="Persist the intermediate table to the filesystem to allow for reuse.",
)
@click.option(
    "-z",
    "--zip-file",
    is_flag=True,
    default=False,
    show_default=True,
    help="Create a zip file containing all output files.",
)
@add_options(_COMMON_REGISTRY_OPTIONS)
@add_options(_COMMON_RUN_OPTIONS)
@click.pass_context
def run_project_query(
    ctx: click.Context,
    query_definition_file: Path,
    checkpoint_file: Path | None,
    persist_intermediate_table: bool,
    zip_file: bool,
    remote_path: str,
    output: Path,
    load_cached_table: bool,
    overwrite: bool,
):
    """Run a query on a dsgrid project."""
    # Parse/validate the query file, then delegate to the shared helper so
    # map_dataset can reuse the identical execution path.
    query = ProjectQueryModel.from_file(query_definition_file)
    _run_project_query(
        ctx,
        query,
        checkpoint_file,
        persist_intermediate_table,
        zip_file,
        remote_path,
        output,
        load_cached_table,
        overwrite,
    )
321
+
322
+
323
def _run_project_query(
    ctx: click.Context,
    query: ProjectQueryModel,
    checkpoint_file: Path | None,
    persist_intermediate_table: bool,
    zip_file: bool,
    remote_path,
    output: Path,
    load_cached_table: bool,
    overwrite: bool,
) -> None:
    """Load the registry, then submit *query* against its project.

    Exits the click context with a nonzero code if submission fails.
    """
    scratch_dir = get_value_from_context(ctx, "scratch_dir")
    connection = DatabaseConnection(
        url=get_value_from_context(ctx, "url"),
    )
    manager = RegistryManager.load(
        connection,
        remote_path=remote_path,
        offline_mode=get_value_from_context(ctx, "offline"),
    )
    target_project = manager.project_manager.load_project(query.project.project_id)
    fs_intf = make_filesystem_interface(output)
    submitter = ProjectQuerySubmitter(target_project, fs_intf.path(output))
    # handle_dsgrid_exception returns a (result, return_code) pair.
    outcome = handle_dsgrid_exception(
        ctx,
        submitter.submit,
        query,
        scratch_dir,
        checkpoint_file=checkpoint_file,
        persist_intermediate_table=persist_intermediate_table,
        load_cached_table=load_cached_table,
        zip_file=zip_file,
        overwrite=overwrite,
    )
    exit_code = outcome[1]
    if exit_code != 0:
        ctx.exit(exit_code)
359
+
360
+
361
# Epilog appended to `dsgrid query project map-dataset --help` output.
# Fixed: the example previously used `map_dataset`, but the command is
# registered as "map-dataset" (see @click.command("map-dataset") below).
_map_dataset_epilog = """
Examples:\n
$ dsgrid query project map-dataset project_id dataset_id
"""
365
+
366
+
367
@click.command("map-dataset", epilog=_map_dataset_epilog)
@click.argument("project-id")
@click.argument("dataset-id")
@click.option(
    "-c",
    "--checkpoint-file",
    type=click.Path(exists=True),
    callback=path_callback,
    help="Checkpoint file created by a previous map operation. If passed, the code will "
    "read it and resume from the last persisted file.",
)
@click.option(
    "-p",
    "--mapping-plan",
    type=click.Path(exists=True),
    help="Path to a mapping plan file. If not provided, the default mapping plan will be used.",
    callback=path_callback,
)
@click.option(
    "--persist-intermediate-table/--no-persist-intermediate-table",
    is_flag=True,
    default=True,
    show_default=True,
    help="Persist the intermediate table to the filesystem to allow for reuse.",
)
@click.option(
    "-t",
    "--column-type",
    type=click.Choice([x.value for x in ColumnType]),
    default=ColumnType.DIMENSION_NAMES.value,
    # click callbacks receive (ctx, param, value); x[2] is the raw string value.
    callback=lambda *x: ColumnType(x[2]),
)
@click.option(
    "-z",
    "--zip-file",
    is_flag=True,
    default=False,
    show_default=True,
    help="Create a zip file containing all output files.",
)
@add_options(_COMMON_REGISTRY_OPTIONS)
@add_options(_COMMON_RUN_OPTIONS)
@click.pass_context
def map_dataset(
    ctx: click.Context,
    project_id: str,
    dataset_id: str,
    checkpoint_file: Path | None,
    mapping_plan: Path | None,
    persist_intermediate_table: bool,
    remote_path,
    output: Path,
    load_cached_table: bool,
    overwrite: bool,
    column_type: ColumnType,
    zip_file: bool,
):
    """Map a dataset to the project's base dimensions."""
    # Build a synthetic project query targeting a single standalone dataset,
    # then reuse the same execution path as `dsgrid query project run`.
    plan = DatasetMappingPlan.from_file(mapping_plan) if mapping_plan else None
    query = make_query_for_standalone_dataset(
        project_id, dataset_id, plan, column_type=column_type
    )
    _run_project_query(
        ctx=ctx,
        query=query,
        checkpoint_file=checkpoint_file,
        persist_intermediate_table=persist_intermediate_table,
        zip_file=zip_file,
        remote_path=remote_path,
        output=output,
        load_cached_table=load_cached_table,
        overwrite=overwrite,
    )
440
+
441
+
442
@click.command("create-query")
@click.argument("name", type=str)
@click.argument("dataset_id", type=str)
@click.option(
    "-f",
    "--query-file",
    default="dataset_query.json5",
    show_default=True,
    help="Query file to create.",
    callback=path_callback,
)
@click.option(
    "--overwrite",
    is_flag=True,
    default=False,
    show_default=True,
    help="Overwrite query file if it exists.",
)
@click.pass_context
def create_dataset_query(
    ctx,
    name: str,
    dataset_id: str,
    query_file: Path,
    overwrite: bool,
):
    """Create a query file to be used for mapping a dataset to an arbitrary list of dimensions."""
    # Fixed: check the overwrite condition before building the query so the
    # command fails fast, matching create_project_query's ordering.
    check_overwrite(query_file, overwrite)
    query = DatasetQueryModel(name=name, dataset_id=dataset_id, to_dimension_references=[])
    data = query.model_dump(mode="json")
    # These result fields are not supported for standalone dataset queries,
    # so strip them from the template written to disk.
    unsupported_result_fields = (
        "column_type",
        "replace_ids_with_names",
        "aggregations",
        "aggregate_each_dataset",
        "reports",
        "dimension_filters",
        "time_zone",
    )
    data.pop("version")
    for field in unsupported_result_fields:
        data["result"].pop(field)

    dump_json_file(data, query_file, indent=2)
    print(f"Wrote query to {query_file}", file=sys.stderr)
487
+
488
+
489
# Epilog appended to `dsgrid query dataset run --help` output.
_run_dataset_query_epilog = """
Examples:\n
$ dsgrid query dataset run query.json5
"""
493
+
494
+
495
@click.command("run", epilog=_run_dataset_query_epilog)
@click.argument("query_definition_file", type=click.Path(exists=True))
@click.option(
    "-c",
    "--checkpoint-file",
    type=click.Path(exists=True),
    callback=path_callback,
    help="Checkpoint file created by a previous map operation. If passed, the code will "
    "read it and resume from the last persisted file.",
)
@click.option(
    "-o",
    "--output",
    default=QUERY_OUTPUT_DIR,
    show_default=True,
    type=str,
    help="Output directory for query results",
    callback=path_callback,
)
@click.option(
    "--overwrite",
    is_flag=True,
    default=False,
    show_default=True,
    help="Overwrite results directory if it exists.",
)
@add_options(_COMMON_REGISTRY_OPTIONS)
@click.pass_context
def run_dataset_query(
    ctx: click.Context,
    query_definition_file: Path,
    checkpoint_file: Path | None,
    output: Path,
    overwrite: bool,
    remote_path: str,
):
    """Run a query on a dsgrid dataset."""
    # Parse/validate the query file, then delegate to the shared helper.
    query = DatasetQueryModel.from_file(query_definition_file)
    _run_dataset_query(
        ctx,
        query,
        checkpoint_file,
        remote_path,
        output,
        overwrite,
    )
541
+
542
+
543
def _run_dataset_query(
    ctx: click.Context,
    query: DatasetQueryModel,
    checkpoint_file: Path | None,
    remote_path,
    output: Path,
    overwrite: bool,
) -> None:
    """Load the registry and submit the standalone dataset *query*.

    Exits the click context with a nonzero code if submission fails.

    NOTE(review): checkpoint_file is accepted but never forwarded to
    submit() -- confirm whether DatasetQuerySubmitter supports checkpoints.
    """
    scratch_dir = get_value_from_context(ctx, "scratch_dir")
    connection = DatabaseConnection(
        url=get_value_from_context(ctx, "url"),
    )
    manager = RegistryManager.load(
        connection,
        remote_path=remote_path,
        offline_mode=get_value_from_context(ctx, "offline"),
    )
    fs_intf = make_filesystem_interface(output)
    submitter = DatasetQuerySubmitter(fs_intf.path(output))
    # handle_dsgrid_exception returns a (result, return_code) pair.
    outcome = handle_dsgrid_exception(
        ctx,
        submitter.submit,
        query,
        manager,
        scratch_dir,
        overwrite=overwrite,
    )
    exit_code = outcome[1]
    if exit_code != 0:
        ctx.exit(exit_code)
572
+
573
+
574
@click.command("create_dataset")
@click.argument("query_definition_file", type=click.Path(exists=True))
# Fixed: the function signature expects remote_path, but only
# _COMMON_RUN_OPTIONS was applied, so click would invoke the callback without
# that parameter and crash. Add the registry options like the other commands.
@add_options(_COMMON_REGISTRY_OPTIONS)
@add_options(_COMMON_RUN_OPTIONS)
@click.pass_context
def create_composite_dataset(
    ctx,
    query_definition_file,
    remote_path,
    output,
    load_cached_table,
    overwrite,
):
    """Run a query to create a composite dataset."""
    # Validate the query file even though execution is not implemented yet.
    CreateCompositeDatasetQueryModel.from_file(query_definition_file)
    # conn = DatabaseConnection.from_url(
    #     get_value_from_context(ctx, "url"),
    #     database=get_value_from_context(ctx, "database_name"),
    #     username=get_value_from_context(ctx, "username"),
    #     password=get_value_from_context(ctx, "password"),
    # )
    # TODO
    print("not implemented yet")
    # Fixed: `return 1` from a click command does not set the process exit
    # code; use ctx.exit so callers/scripts see the failure.
    ctx.exit(1)
    # registry_manager = RegistryManager.load(
    #     conn,
    #     remote_path=remote_path,
    #     offline_mode=get_value_from_context(ctx, "offline"),
    # )
    # project = registry_manager.project_manager.load_project(query.project.project_id)
    # CompositeDatasetQuerySubmitter.submit(project, output).submit(query, force=overwrite)
604
+
605
+
606
@click.command("run")
@click.argument("query_definition_file", type=click.Path(exists=True))
# Fixed: the function signature expects remote_path, but only
# _COMMON_RUN_OPTIONS was applied, so click would invoke the callback without
# that parameter and crash. Add the registry options like the other commands.
@add_options(_COMMON_REGISTRY_OPTIONS)
@add_options(_COMMON_RUN_OPTIONS)
@click.pass_context
def query_composite_dataset(
    ctx,
    query_definition_file,
    remote_path,
    output,
    load_cached_table,
    overwrite,
):
    """Run a query on a composite dataset."""
    # Validate the query file even though execution is not implemented yet.
    CompositeDatasetQueryModel.from_file(query_definition_file)
    # conn = DatabaseConnection.from_url(
    #     get_value_from_context(ctx, "url"),
    #     database=get_value_from_context(ctx, "database_name"),
    #     username=get_value_from_context(ctx, "username"),
    #     password=get_value_from_context(ctx, "password"),
    # )
    # TODO
    print("not implemented yet")
    # Fixed: `return 1` from a click command does not set the process exit
    # code; use ctx.exit so callers/scripts see the failure.
    ctx.exit(1)
    # registry_manager = RegistryManager.load(
    #     registry_path,
    #     remote_path=remote_path,
    #     offline_mode=get_value_from_context(ctx, "offline"),
    # )
    # project = registry_manager.project_manager.load_project(query.project.project_id)
    # CompositeDatasetQuerySubmitter.submit(project, output).submit(query, overwrite=overwrite)
636
+
637
+
638
# Epilog appended to the create-derived-dataset-config --help output.
_create_derived_dataset_config_epilog = f"""
Examples:\n
$ dsgrid query project create-derived-dataset-config {QUERY_OUTPUT_DIR}/my_query_result_name my_dataset_config\n
"""
642
+
643
+
644
@click.command(epilog=_create_derived_dataset_config_epilog)
@click.argument("src")
@click.argument("dst")
@add_options(_COMMON_REGISTRY_OPTIONS)
@click.option(
    "--overwrite",
    is_flag=True,
    default=False,
    show_default=True,
    help="Overwrite results directory if it exists.",
)
@click.pass_context
def create_derived_dataset_config(ctx, src, dst, remote_path, overwrite):
    """Create a derived dataset configuration and dimensions from a query result."""
    fs_interface = make_filesystem_interface(src)
    src_path = fs_interface.path(src)
    if not src_path.exists():
        print(f"{src} does not exist", file=sys.stderr)
        # Fixed: `return 1` from a click command does not set the process exit
        # code; use ctx.exit so callers/scripts see the failure.
        ctx.exit(1)
    dst_path = fs_interface.path(dst)
    check_output_directory(dst_path, fs_interface, overwrite)

    conn = DatabaseConnection(
        url=get_value_from_context(ctx, "url"),
    )
    registry_manager = RegistryManager.load(
        conn,
        remote_path=remote_path,
        offline_mode=get_value_from_context(ctx, "offline"),
    )
    result = create_derived_dataset_config_from_query(src_path, dst_path, registry_manager)
    if not result:
        logger.error("The query defined in %s does not support a derived dataset.", src)
        ctx.exit(1)
678
+
679
+
680
# Top-level click groups; the command tree is assembled below at import time.
@click.group()
def query():
    """Query group commands"""


@click.group()
def project():
    """Project group commands"""


@click.group()
def dataset():
    """Dataset group commands"""


@click.group()
def composite_dataset():
    """Composite dataset group commands"""
698
+
699
+
700
+ query.add_command(composite_dataset)
701
+ query.add_command(project)
702
+ query.add_command(dataset)
703
+ project.add_command(create_project_query)
704
+ project.add_command(validate_project_query)
705
+ project.add_command(run_project_query)
706
+ project.add_command(create_derived_dataset_config)
707
+ project.add_command(map_dataset)
708
+ dataset.add_command(create_dataset_query)
709
+ dataset.add_command(run_dataset_query)
710
+ composite_dataset.add_command(create_composite_dataset)
711
+ composite_dataset.add_command(query_composite_dataset)