dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/api/app.py
ADDED
@@ -0,0 +1,419 @@
import os
import sys
from tempfile import NamedTemporaryFile
from pathlib import Path

from fastapi import FastAPI, HTTPException, BackgroundTasks, Query

from fastapi.middleware.gzip import GZipMiddleware
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import Response, FileResponse

from dsgrid.common import REMOTE_REGISTRY
from dsgrid.dataset.models import ValueFormat
from dsgrid.config.dimensions import create_dimension_common_model, create_project_dimension_model
from dsgrid.dimension.base_models import DimensionType, DimensionCategory
from dsgrid.dsgrid_rc import DsgridRuntimeConfig
from dsgrid.exceptions import DSGValueNotStored
from dsgrid.loggers import setup_logging
from dsgrid.query.models import ReportType
from dsgrid.registry.registry_database import DatabaseConnection
from dsgrid.registry.registry_manager import RegistryManager
from dsgrid.utils.run_command import run_command
from dsgrid.utils.spark import init_spark, read_parquet
from .api_manager import ApiManager
from .models import (
    AsyncTaskStatus,
    AsyncTaskType,
    ProjectQueryAsyncResultModel,
    SparkSubmitProjectQueryRequest,
)
from .response_models import (
    GetAsyncTaskResponse,
    GetDatasetResponse,
    GetDimensionResponse,
    GetProjectBaseDimensionNameResponse,
    GetProjectDimensionNamesResponse,
    ListProjectDimensionsResponse,
    GetProjectResponse,
    ListAsyncTasksResponse,
    ListDatasetsResponse,
    ListDimensionRecordsResponse,
    ListDimensionTypesResponse,
    ListDimensionsResponse,
    ListProjectSupplementalDimensionNames,
    ListProjectsResponse,
    ListReportTypesResponse,
    ListValueFormatsResponse,
    SparkSubmitProjectQueryResponse,
)


logger = setup_logging(__name__, "dsgrid_api.log")
DSGRID_REGISTRY_DATABASE_URL = os.environ.get("DSGRID_REGISTRY_DATABASE_URL")
if DSGRID_REGISTRY_DATABASE_URL is None:
    msg = "The environment variable DSGRID_REGISTRY_DATABASE_URL must be set."
    raise Exception(msg)
if "DSGRID_QUERY_OUTPUT_DIR" not in os.environ:
    msg = "The environment variable DSGRID_QUERY_OUTPUT_DIR must be set."
    raise Exception(msg)
QUERY_OUTPUT_DIR = os.environ["DSGRID_QUERY_OUTPUT_DIR"]
API_SERVER_STORE_DIR = os.environ.get("DSGRID_API_SERVER_STORE_DIR")
if API_SERVER_STORE_DIR is None:
    msg = "The environment variable DSGRID_API_SERVER_STORE_DIR must be set."
    raise Exception(msg)

offline_mode = True
no_prompts = True
# There could be collisions on the only-allowed SparkSession between the main process and
# subprocesses that run queries.
# If both processes try to use the Hive metastore, a crash will occur.
spark = init_spark("dsgrid_api", check_env=False)
dsgrid_config = DsgridRuntimeConfig.load()
conn = DatabaseConnection(
    url=DSGRID_REGISTRY_DATABASE_URL,
    # username=dsgrid_config.database_user,
    # password=dsgrid_config.database_password,
)
manager = RegistryManager.load(
    conn, REMOTE_REGISTRY, offline_mode=offline_mode, no_prompts=no_prompts
)
api_mgr = ApiManager(API_SERVER_STORE_DIR, manager)

# Current limitations:
# This can only run in one process. State is tracked in memory. This could be solved by
# storing state in a database like Redis or MongoDB.
# Deployment strategy is TBD.
app = FastAPI(swagger_ui_parameters={"tryItOutEnabled": True})
app.add_middleware(GZipMiddleware, minimum_size=1024)
origins = [
    "http://localhost",
    "https://localhost",
    "http://localhost:8000",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/")
async def root():
    return {"message": "Welcome to the dsgrid API!"}


# TODO: Filtering?
@app.get("/projects", response_model=ListProjectsResponse)
async def list_projects():
    """List the projects."""
    mgr = manager.project_manager
    return ListProjectsResponse(
        projects=[mgr.get_by_id(x).model for x in mgr.list_ids()],
    )


@app.get("/projects/{project_id}", response_model=GetProjectResponse)
async def get_project(project_id: str):
    """Return the project with project_id."""
    mgr = manager.project_manager
    return GetProjectResponse(
        project=mgr.get_by_id(project_id).model,
    )


@app.get(
    "/projects/{project_id}/dimensions",
    response_model=ListProjectDimensionsResponse,
)
async def list_project_dimensions(project_id: str):
    """List the project's dimensions."""
    mgr = manager.project_manager
    project = mgr.get_by_id(project_id)
    dimensions = []
    for item in project.get_dimension_names_model().model_dump().values():
        for query_name in item["base"]:
            dimension = create_project_dimension_model(
                project.get_dimension(query_name).model, DimensionCategory.BASE
            )
            dimensions.append(dimension)
        for query_name in item["subset"]:
            dimension = create_project_dimension_model(
                project.get_dimension(query_name).model, DimensionCategory.SUBSET
            )
            dimensions.append(dimension)
        for query_name in item["supplemental"]:
            dimension = create_project_dimension_model(
                project.get_dimension(query_name).model, DimensionCategory.SUPPLEMENTAL
            )
            dimensions.append(dimension)

    return ListProjectDimensionsResponse(project_id=project_id, dimensions=dimensions)


@app.get(
    "/projects/{project_id}/dimensions/dimension_names",
    response_model=GetProjectDimensionNamesResponse,
)
async def get_project_dimension_names(project_id: str):
    """List the base and supplemental dimension query names for the project by type."""
    mgr = manager.project_manager
    project = mgr.get_by_id(project_id)
    return GetProjectDimensionNamesResponse(
        project_id=project_id,
        dimension_names=project.get_dimension_names_model(),
    )


@app.get(
    "/projects/{project_id}/dimensions/base_dimension_name/{dimension_type}",
    response_model=GetProjectBaseDimensionNameResponse,
)
async def get_project_base_dimension_name(project_id: str, dimension_type: DimensionType):
    """Get the project's base dimension query name for the given dimension type."""
    mgr = manager.project_manager
    config = mgr.get_by_id(project_id)
    return GetProjectBaseDimensionNameResponse(
        project_id=project_id,
        dimension_type=dimension_type,
        dimension_name=config.get_base_dimension(dimension_type).model.name,
    )


@app.get(
    "/projects/{project_id}/dimensions/supplemental_dimension_names/{dimension_type}",
    response_model=ListProjectSupplementalDimensionNames,
)
async def list_project_supplemental_dimension_names(
    project_id: str, dimension_type: DimensionType
):
    """List the project's supplemental dimension query names for the given dimension type."""
    mgr = manager.project_manager
    config = mgr.get_by_id(project_id)
    return ListProjectSupplementalDimensionNames(
        project_id=project_id,
        dimension_type=dimension_type,
        dimension_names=[
            x.model.name
            for x in config.list_supplemental_dimensions(dimension_type, sort_by="name")
        ],
    )


@app.get(
    "/projects/{project_id}/dimensions/dimensions_by_name/{dimension_name}",
    response_model=GetDimensionResponse,
)
async def get_project_dimension(project_id: str, dimension_name: str):
    """Get the project's dimension for the given dimension query name."""
    mgr = manager.project_manager
    config = mgr.get_by_id(project_id)
    return GetDimensionResponse(
        dimension=create_dimension_common_model(config.get_dimension(dimension_name).model)
    )


# TODO: Add filtering by project_id
@app.get("/datasets", response_model=ListDatasetsResponse)
async def list_datasets():
    """List the datasets."""
    mgr = manager.dataset_manager
    return ListDatasetsResponse(
        datasets=[mgr.get_by_id(x).model for x in mgr.list_ids()],
    )


@app.get("/datasets/{dataset_id}", response_model=GetDatasetResponse)
async def get_dataset(dataset_id: str):
    """Return the dataset with dataset_id."""
    mgr = manager.dataset_manager
    return GetDatasetResponse(dataset=mgr.get_by_id(dataset_id).model)


@app.get("/dimensions/types", response_model=ListDimensionTypesResponse)
async def list_dimension_types():
    """List the dimension types."""
    return ListDimensionTypesResponse(types=_list_enums(DimensionType))


# TODO: Add filtering for dimension IDs
@app.get("/dimensions", response_model=ListDimensionsResponse)
async def list_dimensions(dimension_type: DimensionType | None = None):
    """List the dimensions for the given type."""
    mgr = manager.dimension_manager
    return ListDimensionsResponse(
        dimensions=[
            create_dimension_common_model(mgr.get_by_id(x).model)
            for x in mgr.list_ids(dimension_type=dimension_type)
        ],
    )


@app.get("/dimensions/{dimension_id}", response_model=GetDimensionResponse)
async def get_dimension(dimension_id: str):
    """Get the dimension for the dimension_id."""
    mgr = manager.dimension_manager
    return GetDimensionResponse(
        dimension=create_dimension_common_model(mgr.get_by_id(dimension_id).model)
    )


@app.get("/dimensions/records/{dimension_id}", response_model=ListDimensionRecordsResponse)
async def list_dimension_records(dimension_id: str):
    """List the records for the dimension ID."""
    mgr = manager.dimension_manager
    model = mgr.get_by_id(dimension_id).model
    records = (
        []
        if model.dimension_type == DimensionType.TIME
        else [x.model_dump() for x in model.records]
    )
    return ListDimensionRecordsResponse(records=records)


@app.get("/reports/types", response_model=ListReportTypesResponse)
async def list_report_types():
    """List the report types available for queries."""
    return ListReportTypesResponse(types=_list_enums(ReportType))


@app.get("/value_formats", response_model=ListValueFormatsResponse)
async def list_value_formats():
    """List the value formats available for query results."""
    return ListValueFormatsResponse(formats=_list_enums(ValueFormat))


@app.post("/queries/projects", response_model=SparkSubmitProjectQueryResponse)
async def submit_project_query(
    query: SparkSubmitProjectQueryRequest, background_tasks: BackgroundTasks
):
    """Submit a project query for execution."""
    if not api_mgr.can_start_new_async_task():
        # TODO: queue the task and run it later.
        raise HTTPException(422, "Too many async tasks are already running")
    async_task_id = api_mgr.initialize_async_task(AsyncTaskType.PROJECT_QUERY)
    # TODO: how to handle the output directory on the server?
    # TODO: force should not be True
    # TODO: how do we manage the number of background tasks?
    background_tasks.add_task(_submit_project_query, query, async_task_id)
    return SparkSubmitProjectQueryResponse(async_task_id=async_task_id)


@app.get("/async_tasks/status", response_model=ListAsyncTasksResponse)
def list_async_tasks(
    async_task_ids: list[int] | None = Query(default=None), status: AsyncTaskStatus | None = None
):
    """Return the async tasks. Filter results by async task ID or status."""
    return ListAsyncTasksResponse(
        async_tasks=api_mgr.list_async_tasks(async_task_ids=async_task_ids, status=status)
    )


@app.get("/async_tasks/status/{async_task_id}", response_model=GetAsyncTaskResponse)
def get_async_task_status(async_task_id: int):
    """Return the async task."""
    try:
        result = api_mgr.list_async_tasks(async_task_ids=[async_task_id])
        assert len(result) == 1
        return GetAsyncTaskResponse(async_task=result[0])
    except DSGValueNotStored as e:
        raise HTTPException(404, detail=str(e))


@app.get("/async_tasks/data/{async_task_id}")
def get_async_task_data(async_task_id: int):
    """Return the data for a completed async task."""
    task = api_mgr.get_async_task_status(async_task_id)
    if task.status != AsyncTaskStatus.COMPLETE:
        msg = f"Data can only be read for completed tasks: async_task_id={async_task_id} status={task.status}"
        raise HTTPException(422, detail=msg)
    if task.task_type == AsyncTaskType.PROJECT_QUERY:
        if not task.result.data_file:
            msg = f"{task.result.data_file=} is invalid"
            raise HTTPException(400, msg)
        # TODO: Sending data this way has major limitations. We lose all the benefits of Parquet and
        # compression.
        # We should also check how much data we can read through the Spark driver.
        text = (
            read_parquet(str(task.result.data_file))
            .toPandas()
            .to_json(orient="split", index=False)
        )
    else:
        msg = f"task type {task.task_type} is not implemented"
        raise NotImplementedError(msg)

    return Response(content=text, media_type="application/json")


@app.get("/async_tasks/archive_file/{async_task_id}", response_class=FileResponse)
def download_async_task_archive_file(async_task_id: int):
    """Download the archive file for a completed async task."""
    task = api_mgr.get_async_task_status(async_task_id)
    if task.status != AsyncTaskStatus.COMPLETE:
        msg = f"Data can only be downloaded for completed tasks: async_task_id={async_task_id} status={task.status}"
        raise HTTPException(422, detail=msg)
    return FileResponse(task.result.archive_file)


def _submit_project_query(spark_query: SparkSubmitProjectQueryRequest, async_task_id):
    with NamedTemporaryFile(mode="w", suffix=".json") as fp:
        query = spark_query.query
        fp.write(query.model_dump_json())
        fp.write("\n")
        fp.flush()
        output_dir = Path(QUERY_OUTPUT_DIR)
        dsgrid_exec = "dsgrid-cli.py"
        base_cmd = (
            f"--url={DSGRID_REGISTRY_DATABASE_URL} "
            f"query project run "
            f"--output={output_dir} --zip-file --overwrite {fp.name}"
        )
        if spark_query.use_spark_submit:
            # Need to find the full path to pass to spark-submit.
            dsgrid_exec = _find_exec(dsgrid_exec)
            spark_cmd = "spark-submit"
            if spark_query.spark_submit_options:
                spark_cmd += " " + " ".join(
                    (f"{k} {v}" for k, v in spark_query.spark_submit_options.items())
                )
            cmd = f"{spark_cmd} {dsgrid_exec} {base_cmd}"
        else:
            cmd = f"{dsgrid_exec} {base_cmd}"
        logger.info(f"Submitting project query command: {cmd}")
        ret = run_command(cmd)
        if ret == 0:
            data_dir = output_dir / query.name / "table.parquet"
            zip_filename = str(output_dir / query.name) + ".zip"
            result = ProjectQueryAsyncResultModel(
                # metadata=load_data(output_dir / query.name / "metadata.json"),
                data_file=str(data_dir),
                archive_file=str(zip_filename),
                archive_file_size_mb=os.stat(zip_filename).st_size / 1_000_000,
            )
        else:
            logger.error("Failed to submit a project query: return_code=%s", ret)
            result = ProjectQueryAsyncResultModel(
                # metadata={},
                data_file="",
                archive_file="",
                archive_file_size_mb=0,
            )

    api_mgr.complete_async_task(async_task_id, ret, result=result)


def _find_exec(name):
    for path in sys.path:
        exec_path = Path(path) / name
        if exec_path.exists():
            return exec_path
    msg = f"Did not find {name}"
    raise Exception(msg)


def _list_enums(enum_type):
    return sorted([x.value for x in enum_type])
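Editor's note: the module above requires the DSGRID_REGISTRY_DATABASE_URL, DSGRID_QUERY_OUTPUT_DIR, and DSGRID_API_SERVER_STORE_DIR environment variables at import time and exposes an asynchronous query workflow: a client POSTs a query, receives an async_task_id, polls its status, and downloads the zipped result. The sketch below is not part of the package; it illustrates that workflow against the routes defined above. The server address, the use of the requests library, the polling interval, and the contents of query_payload (a ProjectQueryModel, defined in dsgrid/query/models.py) are assumptions for illustration.

# Hypothetical client sketch for the async query workflow in dsgrid/api/app.py.
# Assumes the API is served at http://localhost:8000 and that query_payload is a
# valid ProjectQueryModel serialized as a dict (fields not shown here).
import time
import requests

BASE_URL = "http://localhost:8000"  # assumed server address
query_payload = {}  # placeholder: fill with a ProjectQueryModel dict

# Submit the project query; the response carries an async task ID.
resp = requests.post(
    f"{BASE_URL}/queries/projects",
    json={"use_spark_submit": False, "spark_submit_options": {}, "query": query_payload},
)
resp.raise_for_status()
task_id = resp.json()["async_task_id"]

# Poll the task status until it reports completion.
# The serialized enum value is assumed to be the string "complete".
while True:
    task = requests.get(f"{BASE_URL}/async_tasks/status/{task_id}").json()["async_task"]
    if task["status"] == "complete":
        break
    time.sleep(5)

# Download the zipped query result produced by the background task.
archive = requests.get(f"{BASE_URL}/async_tasks/archive_file/{task_id}")
with open("query_result.zip", "wb") as f:
    f.write(archive.content)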
dsgrid/api/models.py
ADDED
@@ -0,0 +1,60 @@
import enum
from datetime import datetime

from pydantic import Field

from dsgrid.data_models import DSGBaseModel
from dsgrid.query.models import ProjectQueryModel


class AsyncTaskStatus(enum.Enum):
    """Statuses for async operations"""

    QUEUED = "queued"  # not used yet
    IN_PROGRESS = "in_progress"
    COMPLETE = "complete"
    CANCELED = "canceled"  # not used yet


class AsyncTaskType(enum.Enum):
    """Asynchronous task types"""

    PROJECT_QUERY = "project_query"


class ProjectQueryAsyncResultModel(DSGBaseModel):
    # metadata: DatasetMetadataModel  # TODO: not sure if we need this
    data_file: str
    archive_file: str
    archive_file_size_mb: float


class AsyncTaskModel(DSGBaseModel):
    """Tracks an asynchronous operation."""

    async_task_id: int
    task_type: AsyncTaskType
    status: AsyncTaskStatus
    return_code: int | None = None
    result: ProjectQueryAsyncResultModel | None = None  # eventually, union of all result types
    start_time: datetime
    completion_time: datetime | None = None


class StoreModel(DSGBaseModel):
    next_async_task_id: int = 1
    async_tasks: dict[int, AsyncTaskModel] = {}
    outstanding_async_tasks: set[int] = set()


class SparkSubmitProjectQueryRequest(DSGBaseModel):
    use_spark_submit: bool = Field(
        default=True,
        description="If True, run the query command through spark-submit. If False, run the "
        "command directly in dsgrid.",
    )
    spark_submit_options: dict[str, str] = Field(
        default={},
        description="Options to forward to the spark-submit command (e.g., --master spark://hostname:7077)",
    )
    query: ProjectQueryModel
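Editor's note: per _submit_project_query in app.py, each key/value pair in spark_submit_options is joined with a space and appended to the spark-submit invocation. A small illustration follows; the option names and values are hypothetical examples of standard spark-submit flags, not values shipped with the package.

# Illustration of how spark_submit_options is expanded into the command line
# (keys and values are simply joined with spaces, as in app.py).
opts = {"--master": "spark://hostname:7077", "--driver-memory": "8g"}
spark_cmd = "spark-submit " + " ".join(f"{k} {v}" for k, v in opts.items())
print(spark_cmd)
# spark-submit --master spark://hostname:7077 --driver-memory 8g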
dsgrid/api/response_models.py
ADDED
@@ -0,0 +1,116 @@
from dsgrid.data_models import DSGBaseModel
from dsgrid.config.dataset_config import DatasetConfigModel
from dsgrid.config.dimensions import DimensionCommonModel, ProjectDimensionModel
from dsgrid.config.project_config import ProjectConfigModel, ProjectDimensionNamesModel
from dsgrid.dataset.models import ValueFormat
from dsgrid.dimension.base_models import DimensionType
from dsgrid.query.models import ReportType
from .models import AsyncTaskModel


class ListProjectsResponse(DSGBaseModel):
    """Defines the response to the list_projects command."""

    projects: list[ProjectConfigModel]


class GetProjectResponse(DSGBaseModel):
    """Defines the response to the get_project command."""

    project: ProjectConfigModel


class ListDatasetsResponse(DSGBaseModel):
    """Defines the response to the list_datasets command."""

    datasets: list[DatasetConfigModel]


class GetDatasetResponse(DSGBaseModel):
    """Defines the response to the get_dataset command."""

    dataset: DatasetConfigModel


class ListProjectDimensionsResponse(DSGBaseModel):
    """Defines the response to the list_project_dimensions command."""

    project_id: str
    dimensions: list[ProjectDimensionModel]


class GetProjectDimensionNamesResponse(DSGBaseModel):
    """Defines the response to the get_project_dimension_names command."""

    project_id: str
    dimension_names: ProjectDimensionNamesModel


class GetProjectBaseDimensionNameResponse(DSGBaseModel):
    """Defines the response to the get_project_dimension_name command."""

    project_id: str
    dimension_type: DimensionType
    dimension_name: str


class ListProjectSupplementalDimensionNames(DSGBaseModel):
    """Defines the response to the list_project_supplemental_dimension_names command."""

    project_id: str
    dimension_type: DimensionType
    dimension_names: list[str]


class ListDimensionTypesResponse(DSGBaseModel):
    """Defines the response to the list_dimension_types command."""

    types: list[DimensionType]


class ListDimensionsResponse(DSGBaseModel):
    """Defines the response to the list_dimensions command."""

    dimensions: list[DimensionCommonModel]


class GetDimensionResponse(DSGBaseModel):
    """Defines the response to the get_dimension command."""

    dimension: DimensionCommonModel


class ListDimensionRecordsResponse(DSGBaseModel):
    """Defines the response to the list_dimension_records command."""

    records: list[dict]


class ListReportTypesResponse(DSGBaseModel):
    """Defines the response to the list_report_types command."""

    types: list[ReportType]


class ListValueFormatsResponse(DSGBaseModel):
    """Defines the response to the list_value_formats command."""

    formats: list[ValueFormat]


class SparkSubmitProjectQueryResponse(DSGBaseModel):
    """Defines the response to the submit_project_query command."""

    async_task_id: int


class ListAsyncTasksResponse(DSGBaseModel):
    """Defines the response to the list_async_tasks command."""

    async_tasks: list[AsyncTaskModel]


class GetAsyncTaskResponse(DSGBaseModel):
    """Defines the response to the get_async_task_status command."""

    async_task: AsyncTaskModel
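Editor's note: combining GetAsyncTaskResponse with AsyncTaskModel and ProjectQueryAsyncResultModel from models.py, a completed project-query task would serialize roughly as the sketch below. The concrete values, paths, and timestamp format are illustrative assumptions, not output produced by the package.

# Illustrative shape of a serialized GetAsyncTaskResponse for a finished
# project query (values are hypothetical; pydantic emits ISO 8601 timestamps).
example_get_async_task_response = {
    "async_task": {
        "async_task_id": 1,
        "task_type": "project_query",
        "status": "complete",
        "return_code": 0,
        "result": {
            "data_file": "/data/queries/my_query/table.parquet",
            "archive_file": "/data/queries/my_query.zip",
            "archive_file_size_mb": 42.0,
        },
        "start_time": "2024-01-01T00:00:00",
        "completion_time": "2024-01-01T00:05:00",
    }
}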
dsgrid/apps/__init__.py
ADDED
File without changes