mainsequence-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mainsequence/__init__.py +0 -0
- mainsequence/__main__.py +9 -0
- mainsequence/cli/__init__.py +1 -0
- mainsequence/cli/api.py +157 -0
- mainsequence/cli/cli.py +442 -0
- mainsequence/cli/config.py +78 -0
- mainsequence/cli/ssh_utils.py +126 -0
- mainsequence/client/__init__.py +17 -0
- mainsequence/client/base.py +431 -0
- mainsequence/client/data_sources_interfaces/__init__.py +0 -0
- mainsequence/client/data_sources_interfaces/duckdb.py +1468 -0
- mainsequence/client/data_sources_interfaces/timescale.py +479 -0
- mainsequence/client/models_helpers.py +113 -0
- mainsequence/client/models_report_studio.py +412 -0
- mainsequence/client/models_tdag.py +2276 -0
- mainsequence/client/models_vam.py +1983 -0
- mainsequence/client/utils.py +387 -0
- mainsequence/dashboards/__init__.py +0 -0
- mainsequence/dashboards/streamlit/__init__.py +0 -0
- mainsequence/dashboards/streamlit/assets/config.toml +12 -0
- mainsequence/dashboards/streamlit/assets/favicon.png +0 -0
- mainsequence/dashboards/streamlit/assets/logo.png +0 -0
- mainsequence/dashboards/streamlit/core/__init__.py +0 -0
- mainsequence/dashboards/streamlit/core/theme.py +212 -0
- mainsequence/dashboards/streamlit/pages/__init__.py +0 -0
- mainsequence/dashboards/streamlit/scaffold.py +220 -0
- mainsequence/instrumentation/__init__.py +7 -0
- mainsequence/instrumentation/utils.py +101 -0
- mainsequence/instruments/__init__.py +1 -0
- mainsequence/instruments/data_interface/__init__.py +10 -0
- mainsequence/instruments/data_interface/data_interface.py +361 -0
- mainsequence/instruments/instruments/__init__.py +3 -0
- mainsequence/instruments/instruments/base_instrument.py +85 -0
- mainsequence/instruments/instruments/bond.py +447 -0
- mainsequence/instruments/instruments/european_option.py +74 -0
- mainsequence/instruments/instruments/interest_rate_swap.py +217 -0
- mainsequence/instruments/instruments/json_codec.py +585 -0
- mainsequence/instruments/instruments/knockout_fx_option.py +146 -0
- mainsequence/instruments/instruments/position.py +475 -0
- mainsequence/instruments/instruments/ql_fields.py +239 -0
- mainsequence/instruments/instruments/vanilla_fx_option.py +107 -0
- mainsequence/instruments/pricing_models/__init__.py +0 -0
- mainsequence/instruments/pricing_models/black_scholes.py +49 -0
- mainsequence/instruments/pricing_models/bond_pricer.py +182 -0
- mainsequence/instruments/pricing_models/fx_option_pricer.py +90 -0
- mainsequence/instruments/pricing_models/indices.py +350 -0
- mainsequence/instruments/pricing_models/knockout_fx_pricer.py +209 -0
- mainsequence/instruments/pricing_models/swap_pricer.py +502 -0
- mainsequence/instruments/settings.py +175 -0
- mainsequence/instruments/utils.py +29 -0
- mainsequence/logconf.py +284 -0
- mainsequence/reportbuilder/__init__.py +0 -0
- mainsequence/reportbuilder/__main__.py +0 -0
- mainsequence/reportbuilder/examples/ms_template_report.py +706 -0
- mainsequence/reportbuilder/model.py +713 -0
- mainsequence/reportbuilder/slide_templates.py +532 -0
- mainsequence/tdag/__init__.py +8 -0
- mainsequence/tdag/__main__.py +0 -0
- mainsequence/tdag/config.py +129 -0
- mainsequence/tdag/data_nodes/__init__.py +12 -0
- mainsequence/tdag/data_nodes/build_operations.py +751 -0
- mainsequence/tdag/data_nodes/data_nodes.py +1292 -0
- mainsequence/tdag/data_nodes/persist_managers.py +812 -0
- mainsequence/tdag/data_nodes/run_operations.py +543 -0
- mainsequence/tdag/data_nodes/utils.py +24 -0
- mainsequence/tdag/future_registry.py +25 -0
- mainsequence/tdag/utils.py +40 -0
- mainsequence/virtualfundbuilder/__init__.py +45 -0
- mainsequence/virtualfundbuilder/__main__.py +235 -0
- mainsequence/virtualfundbuilder/agent_interface.py +77 -0
- mainsequence/virtualfundbuilder/config_handling.py +86 -0
- mainsequence/virtualfundbuilder/contrib/__init__.py +0 -0
- mainsequence/virtualfundbuilder/contrib/apps/__init__.py +8 -0
- mainsequence/virtualfundbuilder/contrib/apps/etf_replicator_app.py +164 -0
- mainsequence/virtualfundbuilder/contrib/apps/generate_report.py +292 -0
- mainsequence/virtualfundbuilder/contrib/apps/load_external_portfolio.py +107 -0
- mainsequence/virtualfundbuilder/contrib/apps/news_app.py +437 -0
- mainsequence/virtualfundbuilder/contrib/apps/portfolio_report_app.py +91 -0
- mainsequence/virtualfundbuilder/contrib/apps/portfolio_table.py +95 -0
- mainsequence/virtualfundbuilder/contrib/apps/run_named_portfolio.py +45 -0
- mainsequence/virtualfundbuilder/contrib/apps/run_portfolio.py +40 -0
- mainsequence/virtualfundbuilder/contrib/apps/templates/base.html +147 -0
- mainsequence/virtualfundbuilder/contrib/apps/templates/report.html +77 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/__init__.py +5 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/external_weights.py +61 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/intraday_trend.py +149 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/market_cap.py +310 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/mock_signal.py +78 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/portfolio_replicator.py +269 -0
- mainsequence/virtualfundbuilder/contrib/prices/__init__.py +1 -0
- mainsequence/virtualfundbuilder/contrib/prices/data_nodes.py +810 -0
- mainsequence/virtualfundbuilder/contrib/prices/utils.py +11 -0
- mainsequence/virtualfundbuilder/contrib/rebalance_strategies/__init__.py +1 -0
- mainsequence/virtualfundbuilder/contrib/rebalance_strategies/rebalance_strategies.py +313 -0
- mainsequence/virtualfundbuilder/data_nodes.py +637 -0
- mainsequence/virtualfundbuilder/enums.py +23 -0
- mainsequence/virtualfundbuilder/models.py +282 -0
- mainsequence/virtualfundbuilder/notebook_handling.py +42 -0
- mainsequence/virtualfundbuilder/portfolio_interface.py +272 -0
- mainsequence/virtualfundbuilder/resource_factory/__init__.py +0 -0
- mainsequence/virtualfundbuilder/resource_factory/app_factory.py +170 -0
- mainsequence/virtualfundbuilder/resource_factory/base_factory.py +238 -0
- mainsequence/virtualfundbuilder/resource_factory/rebalance_factory.py +101 -0
- mainsequence/virtualfundbuilder/resource_factory/signal_factory.py +183 -0
- mainsequence/virtualfundbuilder/utils.py +381 -0
- mainsequence-2.0.0.dist-info/METADATA +105 -0
- mainsequence-2.0.0.dist-info/RECORD +110 -0
- mainsequence-2.0.0.dist-info/WHEEL +5 -0
- mainsequence-2.0.0.dist-info/licenses/LICENSE +40 -0
- mainsequence-2.0.0.dist-info/top_level.txt +1 -0
mainsequence/client/models_tdag.py
@@ -0,0 +1,2276 @@
from importlib.metadata import metadata

import yaml

from .base import BasePydanticModel, BaseObjectOrm, TDAG_ENDPOINT
from .data_sources_interfaces.duckdb import DuckDBInterface
from .utils import (is_process_running, get_network_ip, DateInfo,
                    TDAG_CONSTANTS, DataFrequency, UniqueIdentifierRangeMap,
                    DATE_FORMAT, AuthLoaders, make_request, set_types_in_table, request_to_datetime, serialize_to_json,
                    bios_uuid)
import copy
import datetime
import pytz
import requests
import pandas as pd
import json
from typing import Union
import time
import os
from mainsequence.logconf import logger

from pydantic import BaseModel, Field, field_validator
from typing import Optional, List, Dict, Any, TypedDict, Tuple
from .data_sources_interfaces import timescale as TimeScaleInterface
from functools import wraps
import math
import gzip
import base64
import numpy as np
import concurrent.futures

_default_data_source = None  # Module-level cache

JSON_COMPRESSED_PREFIX = ["json_compressed", "jcomp_"]

loaders = AuthLoaders()

# Global executor (or you could define one on your class)
_executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
DUCK_DB = "duck_db"


class AlreadyExist(Exception):
    pass


def build_session(loaders):
    from requests.adapters import HTTPAdapter, Retry
    s = requests.Session()
    s.headers.update(loaders.auth_headers)
    retries = Retry(total=2, backoff_factor=2, )
    s.mount('http://', HTTPAdapter(max_retries=retries))
    return s


session = build_session(loaders=loaders)


class SchedulerDoesNotExist(Exception):
    pass


class LocalTimeSeriesDoesNotExist(Exception):
    pass


class DynamicTableDoesNotExist(Exception):
    pass


class SourceTableConfigurationDoesNotExist(Exception):
    pass


class ColumnMetaData(BasePydanticModel, BaseObjectOrm):
    source_config_id: int = Field(
        ...,
        alias="source_config",
        description="Primary key of the related SourceTableConfiguration"
    )
    column_name: str = Field(
        ...,
        max_length=63,
        description="Name of the column (must match column_dtypes_map key)"
    )
    dtype: str = Field(
        ...,
        max_length=100,
        description="Data type (will be synced from the configuration’s dtype map)"
    )
    label: str = Field(
        ...,
        max_length=250,
        description="Human-readable label"
    )
    description: str = Field(
        ...,
        description="Longer description of the column"
    )


class SourceTableConfiguration(BasePydanticModel, BaseObjectOrm):
    id: Optional[int] = Field(None, description="Primary key, auto-incremented ID")
    related_table: Union[int, "DynamicTableMetaData"]
    time_index_name: str = Field(..., max_length=100, description="Time index name")
    column_dtypes_map: Dict[str, Any] = Field(..., description="Column data types map")
    index_names: List
    last_time_index_value: Optional[datetime.datetime] = Field(None, description="Last time index value")
    earliest_index_value: Optional[datetime.datetime] = Field(None, description="Earliest index value")

    # multi_index_stats: Optional[Dict[str, Any]] = Field(None, description="Multi-index statistics JSON field")
    # multi_index_column_stats: Optional[Dict[str, Any]] = Field(None, description="Multi-index statistics JSON field column based")

    table_partition: Dict[str, Any] = Field(..., description="Table partition settings")
    open_for_everyone: bool = Field(default=False, description="Whether the table configuration is open for everyone")
    columns_metadata: Optional[List[ColumnMetaData]] = None

    # todo remove
    column_index_names: Optional[list] = [None]

    def get_data_updates(self):
        max_per_asset = None

        url = self.get_object_url() + f"/{self.related_table}/get_stats/"
        s = self.build_session()
        r = make_request(s=s, loaders=self.LOADERS, r_type="GET", url=url, accept_gzip=True)
        if r.status_code != 200:
            raise Exception(r.text)
        data = r.json()
        multi_index_stats = data["multi_index_stats"]
        multi_index_column_stats = data["multi_index_column_stats"]
        max_time_index_value = self.last_time_index_value
        if multi_index_stats is not None:
            max_per_asset = multi_index_stats["max_per_asset_symbol"]
            max_per_asset = {k: request_to_datetime(v) for k, v in max_per_asset.items()}
            max_time_index_value = np.max(list(max_per_asset.values()))

        du = UpdateStatistics(
            max_time_index_value=max_time_index_value,
            asset_time_statistics=max_per_asset,
            multi_index_column_stats=multi_index_column_stats
        )

        du._max_time_in_update_statistics = max_time_index_value
        return du

    def get_time_scale_extra_table_indices(self) -> dict:
        url = self.get_object_url() + f"/{self.related_table}/get_time_scale_extra_table_indices/"
        s = self.build_session()
        r = make_request(s=s, loaders=self.LOADERS, r_type="GET", url=url, )
        if r.status_code != 200:
            raise Exception(r.text)
        return r.json()

    def set_or_update_columns_metadata(self, columns_metadata: List[ColumnMetaData],
                                       timeout=None) -> None:
        """
        """

        columns_metadata = [
            c.model_dump(exclude={'orm_class'})
            for c in columns_metadata
        ]
        url = self.get_object_url() + f"/{self.related_table}/set_or_update_columns_metadata/"
        s = self.build_session()
        r = make_request(s=s, loaders=self.LOADERS, r_type="POST",
                         time_out=timeout,
                         url=url, payload={"json": {"columns_metadata": columns_metadata}})
        if r.status_code not in [200, 201]:
            raise Exception(r.text)
        return r.json()

    def patch(self, *args, **kwargs):
        # related table is the primary key of this model
        if isinstance(self.related_table, int):
            id = self.related_table
        else:
            id = self.related_table.id
        return self.__class__.patch_by_id(id, *args, **kwargs)


class ColumnMetaData(BasePydanticModel):
    source_config_id: Optional[int] = Field(None, description="FK to SourceTableConfiguration")
    column_name: str = Field(..., max_length=63, description="Name of the column")
    dtype: str = Field(..., max_length=100, description="Data type of the column")
    label: str = Field(..., max_length=255, description="Human-readable label")
    description: str = Field(..., description="Detailed description")


class LocalTimeSerie(BasePydanticModel, BaseObjectOrm):
    id: Optional[int] = Field(None, description="Primary key, auto-incremented ID")
    update_hash: str = Field(..., max_length=63, description="Max length of PostgreSQL table name")
    remote_table: Union[int, "DynamicTableMetaData"]
    build_configuration: Dict[str, Any] = Field(..., description="Configuration in JSON format")
    build_meta_data: Optional[Dict[str, Any]] = Field(None, description="Optional YAML metadata")
    ogm_dependencies_linked: bool = Field(default=False, description="OGM dependencies linked flag")
    tags: Optional[list[str]] = Field(default=[], description="List of tags")
    description: Optional[str] = Field(None, description="Optional HTML description")
    localtimeserieupdatedetails: Optional[Union["LocalTimeSerieUpdateDetails", int]] = None
    run_configuration: Optional["RunConfiguration"] = None
    open_for_everyone: bool = Field(default=False, description="Whether the ts is open for everyone")

    @property
    def data_source_id(self):
        if isinstance(self.remote_table.data_source, int):
            return self.remote_table.data_source
        else:
            return self.remote_table.data_source.id

    @classmethod
    def get_or_create(cls, **kwargs):
        url = cls.get_object_url() + "/get_or_create/"
        kwargs = serialize_to_json(kwargs)

        payload = {"json": kwargs}
        s = cls.build_session()
        r = make_request(s=s, loaders=cls.LOADERS, r_type="POST", url=url, payload=payload)
        if r.status_code not in [200, 201]:
            raise Exception(r.text)
        data = r.json()

        return cls(**data)

    def add_tags(self, tags: list, timeout=None):
        base_url = self.get_object_url()
        s = self.build_session()
        payload = {"json": {"tags": tags}}
        # r = self.s.get(, )
        url = f"{base_url}/{self.id}/add_tags/"
        r = make_request(s=s, loaders=self.LOADERS, r_type="PATCH", url=url,
                         payload=payload,
                         time_out=timeout)
        if r.status_code != 200:
            raise Exception(f"Error in request {r.json()}")
        return r.json()

    @classmethod
    def filter_by_hash_id(cls, local_hash_id_list: list, timeout=None):
        s = cls.build_session()
        base_url = cls.get_object_url()
        url = f"{base_url}/filter_by_hash_id/"
        payload = {"json": {"local_hash_id__in": local_hash_id_list}, }
        r = make_request(s=s, loaders=cls.LOADERS, r_type="POST", url=url, payload=payload, time_out=timeout)
        if r.status_code != 200:
            raise Exception(f"{r.text}")
        all_metadatas = {m["update_hash"]: m for m in r.json()}
        return all_metadatas

    def set_start_of_execution(self, **kwargs):
        s = self.build_session()
        base_url = self.get_object_url()
        payload = {"json": kwargs}
        url = f"{base_url}/{self.id}/set_start_of_execution/"
        r = make_request(s=s, loaders=self.LOADERS, r_type="PATCH", url=url, payload=payload,
                         accept_gzip=True)
        if r.status_code != 201:
            raise Exception(f"Error in request {r.text}")

        def _recurse_to_datetime(node):
            if isinstance(node, dict):
                return {k: _recurse_to_datetime(v) for k, v in node.items()}
            # leaf: assume it’s your timestamp string
            return request_to_datetime(node)

        result = r.json()
        if result["last_time_index_value"] is not None:
            datetime.datetime.fromtimestamp(result["last_time_index_value"], tz=pytz.utc)

        if result['asset_time_statistics'] is not None:
            result['asset_time_statistics'] = _recurse_to_datetime(
                result['asset_time_statistics']
            )

        hu = LocalTimeSeriesHistoricalUpdate(
            **result["historical_update"],
            update_statistics=UpdateStatistics(
                asset_time_statistics=result['asset_time_statistics'],
                max_time_index_value=result["last_time_index_value"],
                multi_index_column_stats=result["multi_index_column_stats"],
            ),
            must_update=result["must_update"],
            direct_dependencies_ids=result["direct_dependencies_ids"]
        )
        return hu

    def set_end_of_execution(
            self,
            historical_update_id: int,
            timeout=None, threaded_request=True,
            **kwargs
    ):
        s = self.build_session()
        url = self.get_object_url() + f"/{self.id}/set_end_of_execution/"
        kwargs.update(dict(historical_update_id=historical_update_id))
        payload = {"json": kwargs}

        def _do_request():
            r = make_request(s=s, loaders=self.LOADERS, r_type="PATCH", url=url, payload=payload, time_out=timeout)
            if r.status_code != 200:
                raise Exception("Error in request")
            return r

        if threaded_request:
            # Submit the request to an executor. The returned Future will be non-blocking.
            future = _executor.submit(_do_request)

            # Optionally, attach a callback to log failures. (Exceptions will also be
            # re-raised when someone calls future.result().)
            def _handle_exception(fut):
                try:
                    fut.result()  # This will re-raise any exception caught in _do_request.
                except Exception as e:
                    logger.error("set_end_of_execution: request failed: %s", e)

            future.add_done_callback(_handle_exception)
            return future
        else:
            # Synchronous execution that will raise exceptions inline.
            return _do_request()

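    # Illustrative usage sketch (hypothetical; the LocalTimeSerie instance `lts`
    # and the historical update id are assumptions). set_end_of_execution()
    # returns a concurrent.futures.Future when threaded_request=True, so a
    # caller can fire the PATCH and keep working, or block on .result():
    #
    #     future = lts.set_end_of_execution(historical_update_id=123)
    #     ...                                   # continue with other work
    #     response = future.result(timeout=30)  # re-raises any request error
    #
    #     # or run it inline so errors surface immediately:
    #     lts.set_end_of_execution(historical_update_id=123, threaded_request=False)
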
    @classmethod
    def batch_set_end_of_execution(cls, update_map: dict, timeout=None):
        s = cls.build_session()
        url = f"{cls.get_object_url()}/batch_set_end_of_execution/"
        payload = {"json": {"update_map": update_map}}
        r = make_request(s=s, loaders=cls.LOADERS, r_type="PATCH", url=url, payload=payload, time_out=timeout)
        if r.status_code != 200:
            raise Exception(f"Error in request ")

    @classmethod
    def set_last_update_index_time(cls, metadata, timeout=None):
        s = cls.build_session()
        url = cls.get_object_url() + f"/{metadata['id']}/set_last_update_index_time/"
        r = make_request(s=s, loaders=cls.LOADERS, r_type="GET", url=url, time_out=timeout)

        if r.status_code == 404:
            raise SourceTableConfigurationDoesNotExist

        if r.status_code != 200:
            raise Exception(f"{metadata['update_hash']}{r.text}")
        return r

    def set_last_update_index_time_from_update_stats(
            self,
            last_time_index_value: float,
            max_per_asset_symbol,
            multi_index_column_stats,
            timeout=None
    ) -> "LocalTimeSerie":
        s = self.build_session()
        url = self.get_object_url() + f"/{self.id}/set_last_update_index_time_from_update_stats/"

        data_to_comp = {
            "last_time_index_value": last_time_index_value,
            "max_per_asset_symbol": max_per_asset_symbol,
            "multi_index_column_stats": multi_index_column_stats,
        }
        chunk_json_str = json.dumps(data_to_comp)
        compressed = gzip.compress(chunk_json_str.encode('utf-8'))
        compressed_b64 = base64.b64encode(compressed).decode('utf-8')
        payload = dict(json={
            "data": compressed_b64,  # compressed payload
        })

        r = make_request(s=s, loaders=self.LOADERS, payload=payload, r_type="POST", url=url, time_out=timeout)

        if r.status_code == 404:
            raise SourceTableConfigurationDoesNotExist

        if r.status_code != 200:
            raise Exception(f"{self.update_hash}{r.text}")
        return LocalTimeSerie(**r.json())

    @classmethod
    def create_historical_update(cls, *args, **kwargs):
        s = cls.build_session()
        base_url = cls.ENDPOINT["LocalTimeSerieHistoricalUpdate"]
        data = serialize_to_json(kwargs)
        payload = {"json": data, }
        r = make_request(s=s, loaders=cls.LOADERS, r_type="POST", url=f"{base_url}/", payload=payload)
        if r.status_code != 201:
            raise Exception(f"Error in request {r.url} {r.text}")

    @classmethod
    def get_mermaid_dependency_diagram(cls, update_hash, data_source_id, desc=True, timeout=None) -> dict:
        s = cls.build_session()
        url = cls.get_object_url(
            "DataNode") + f"/{update_hash}/dependencies_graph_mermaid?desc={desc}&data_source_id={data_source_id}"
        r = make_request(s=s, loaders=cls.LOADERS, r_type="GET", url=url,
                         time_out=timeout)
        if r.status_code != 200:
            raise Exception(f"Error in request {r.text}")

        return r.json()

    def get_all_dependencies_update_priority(self, timeout=None) -> pd.DataFrame:
        s = self.build_session()
        url = self.get_object_url() + f"/{self.id}/get_all_dependencies_update_priority/"
        r = make_request(s=s, loaders=self.LOADERS, r_type="GET", url=url, time_out=timeout)
        if r.status_code != 200:
            raise Exception(f"Error in request {r.text}")

        depth_df = pd.DataFrame(r.json())
        return depth_df

    @classmethod
    def get_upstream_nodes(cls, storage_hash, data_source_id, timeout=None):
        s = cls.build_session()
        url = cls.get_object_url("DataNode") + f"/{storage_hash}/get_upstream_nodes?data_source_id={data_source_id}"
        r = make_request(s=s, loaders=cls.LOADERS, r_type="GET", url=url, time_out=timeout)
        if r.status_code != 200:
            raise Exception(f"Error in request {r.text}")

        depth_df = pd.DataFrame(r.json())
        return depth_df

    @classmethod
    def create(cls, timeout=None, *args, **kwargs):
        url = cls.get_object_url("DataNode") + "/"
        payload = {"json": serialize_to_json(kwargs)}
        s = cls.build_session()
        r = make_request(s=s, loaders=cls.LOADERS, r_type="POST", url=url, payload=payload, time_out=timeout)
        if r.status_code != 201:
            raise Exception(f"Error in request {r.text}")
        instance = cls(**r.json())
        return instance

    def verify_if_direct_dependencies_are_updated(self) -> dict:
        """
        Response({
            "error_on_update_dependencies": False,
            "updated": all_success,
        })
        """
        s = self.build_session()
        url = self.get_object_url() + f"/{self.id}/verify_if_direct_dependencies_are_updated/"
        r = make_request(s=s, loaders=None, r_type="GET", url=url)
        if r.status_code != 200:
            raise Exception(f"Error in request: {r.text}")
        return r.json()

    def get_data_between_dates_from_api(
            self,
            *args, **kwargs
    ):

        return self.remote_table.get_data_between_dates_from_api(*args, **kwargs)

    @classmethod
    def insert_data_into_table(cls, local_metadata_id, records: List[dict],
                               overwrite=True, add_insertion_time=False):
        s = cls.build_session()
        url = cls.get_object_url() + f"/{local_metadata_id}/insert_data_into_table/"

        chunk_json_str = json.dumps(records)
        compressed = gzip.compress(chunk_json_str.encode('utf-8'))
        compressed_b64 = base64.b64encode(compressed).decode('utf-8')

        payload = dict(json={
            "data": compressed_b64,  # compressed JSON data
            "chunk_stats": None,
            "overwrite": overwrite,
            "chunk_index": 0,
            "total_chunks": 1,
        })

        try:
            r = make_request(s=s, loaders=None, payload=payload, r_type="POST", url=url, time_out=60 * 15)
            if r.status_code not in [200, 204]:
                logger.warning(f"Error in request: {r.text}")
            logger.info(f"Chunk uploaded successfully.")
        except requests.exceptions.RequestException as e:
            logger.exception(f"Error uploading chunk : {e}")
            # Optionally, you could retry or break here
            raise e
        if r.status_code not in [200, 204]:
            raise Exception(r.text)

    @classmethod
    def post_data_frame_in_chunks(
            cls,
            serialized_data_frame: pd.DataFrame,
            chunk_size: int = 50_000,
            local_metadata: dict = None,
            data_source: str = None,
            index_names: list = None,
            time_index_name: str = 'timestamp',
            overwrite: bool = False,
    ):
        """
        Sends a large DataFrame to a Django backend in multiple chunks.
        If a chunk is too large (HTTP 413), it's automatically split in half and retried.
        """
        s = cls.build_session()
        url = cls.get_object_url() + f"/{local_metadata.id}/insert_data_into_table/"

        def _send_chunk_recursively(df_chunk: pd.DataFrame, chunk_idx: int, total_chunks: int,
                                    is_sub_chunk: bool = False):
            """
            Internal helper to send a chunk. If it receives a 413 error, it splits
            the chunk and calls itself on the two halves.
            """
            if df_chunk.empty:
                return

            part_label = f"{chunk_idx + 1}/{total_chunks}" if not is_sub_chunk else f"sub-chunk of {chunk_idx + 1}"

            # Prepare the payload
            chunk_stats, _ = get_chunk_stats(
                chunk_df=df_chunk, index_names=index_names, time_index_name=time_index_name
            )
            chunk_json_str = df_chunk.to_json(orient="records", date_format="iso")
            compressed = gzip.compress(chunk_json_str.encode('utf-8'))
            compressed_b64 = base64.b64encode(compressed).decode('utf-8')

            # For sub-chunks, we treat it as a new, single-chunk upload.
            payload = dict(json={
                "data": compressed_b64,
                "chunk_stats": chunk_stats,
                "overwrite": overwrite,
                "chunk_index": 0 if is_sub_chunk else chunk_idx,
                "total_chunks": 1 if is_sub_chunk else total_chunks,
            })

            try:
                r = make_request(s=s, loaders=None, payload=payload, r_type="POST", url=url, time_out=60 * 15)

                if r.status_code in [200, 204]:
                    logger.info(f"Chunk {part_label} ({len(df_chunk)} rows) uploaded successfully.")
                    return

                if r.status_code == 413:
                    logger.warning(
                        f"Chunk {part_label} ({len(df_chunk)} rows) is too large (413). "
                        f"Splitting in half and retrying as new uploads."
                    )
                    if len(df_chunk) <= 1:
                        logger.error(
                            f"A single row is too large to upload (from chunk {part_label}). Cannot split further.")
                        raise Exception(f"A single row from chunk {part_label} is too large to upload.")

                    mid_point = len(df_chunk) // 2
                    first_half = df_chunk.iloc[:mid_point]
                    second_half = df_chunk.iloc[mid_point:]

                    # Recursively call for each half, marking them as sub-chunks.
                    _send_chunk_recursively(first_half, chunk_idx, total_chunks, is_sub_chunk=True)
                    _send_chunk_recursively(second_half, chunk_idx, total_chunks, is_sub_chunk=True)
                    return

                logger.warning(f"Error in request for chunk {part_label}: {r.text}")
                raise Exception(r.text)

            except requests.exceptions.RequestException as e:
                logger.exception(f"Network error uploading chunk {part_label}: {e}")
                raise e

        total_rows = len(serialized_data_frame)
        if total_rows == 0:
            logger.info("DataFrame is empty, nothing to upload.")
            return

        total_chunks = math.ceil(total_rows / chunk_size) if chunk_size > 0 else 1
        logger.info(f"Starting upload of {total_rows} rows in {total_chunks} initial chunk(s).")

        for i in range(total_chunks):
            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, total_rows)
            chunk_df = serialized_data_frame.iloc[start_idx:end_idx]

            _send_chunk_recursively(chunk_df, i, total_chunks)

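    # Illustrative usage sketch (hypothetical; `df`, `lts` and the index names
    # are assumptions). post_data_frame_in_chunks() slices the serialized frame
    # into chunk_size-row blocks, gzips each block, and halves any block that the
    # backend rejects with HTTP 413 before retrying:
    #
    #     LocalTimeSerie.post_data_frame_in_chunks(
    #         serialized_data_frame=df,           # frame already passed through _break_pandas_dataframe
    #         chunk_size=50_000,
    #         local_metadata=lts,                 # object exposing a valid .id
    #         index_names=["time_index", "unique_identifier"],
    #         time_index_name="time_index",
    #         overwrite=True,
    #     )
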
    @classmethod
    def get_metadatas_and_set_updates(
            cls,
            local_time_series_ids: list,
            update_details_kwargs,
            update_priority_dict
    ):
        """
        {'local_hash_id__in': [{'update_hash': 'alpacaequitybarstest_97018e7280c1bad321b3f4153cc7e986', 'data_source_id': 1},
        :param local_hash_id__in:
        :param multi_index_asset_symbols_filter:
        :param update_details_kwargs:
        :param update_priority_dict:
        :return:
        """
        base_url = cls.get_object_url()
        s = cls.build_session()
        payload = {"json": dict(local_time_series_ids=local_time_series_ids,
                                update_details_kwargs=update_details_kwargs,
                                update_priority_dict=update_priority_dict,
                                )}
        # r = self.s.post(f"{base_url}/get_metadatas_and_set_updates/", **payload)
        url = f"{base_url}/get_metadatas_and_set_updates/"
        r = make_request(s=s, loaders=cls.LOADERS, r_type="POST", url=url, payload=payload)
        if r.status_code != 200:
            raise Exception(f"Error in request {r.text}")
        r = r.json()
        r["source_table_config_map"] = {int(k): SourceTableConfiguration(**v) if v is not None else v for k, v in
                                        r["source_table_config_map"].items()}
        r["state_data"] = {int(k): LocalTimeSerieUpdateDetails(**v) for k, v in r["state_data"].items()}
        r["all_index_stats"] = {int(k): v for k, v in r["all_index_stats"].items()}
        r["local_metadatas"] = [LocalTimeSerie(**v) for v in r["local_metadatas"]]
        return r

    def depends_on_connect(self, target_time_serie_id
                           ):

        url = self.get_object_url() + f"/{self.id}/depends_on_connect/"
        s = self.build_session()
        payload = dict(json={
            "target_time_serie_id": target_time_serie_id,
        })
        r = make_request(s=s, loaders=self.LOADERS, r_type="PATCH", url=url, payload=payload)
        if r.status_code != 204:
            raise Exception(f"Error in request {r.text}")

    def depends_on_connect_to_api_table(self, target_table_id,
                                        timeout=None):

        url = self.get_object_url() + f"/{self.id}/depends_on_connect_to_api_table/"
        s = self.build_session()
        payload = dict(json={
            "target_table_id": target_table_id,
        })
        r = make_request(s=s, loaders=self.LOADERS, r_type="PATCH", url=url,
                         time_out=timeout,
                         payload=payload)
        if r.status_code != 204:
            raise Exception(f"Error in request {r.text}")

    @classmethod
    def _break_pandas_dataframe(cls, data_frame: pd.DataFrame, time_index_name: Union[str, None] = None):
        if time_index_name == None:
            time_index_name = data_frame.index.names[0]
            if time_index_name is None:
                time_index_name = "time_index"
        names = [c if i != 0 else time_index_name for i, c in
                 enumerate(data_frame.index.names)]
        data_frame.index.names = names

        time_col_loc = data_frame.index.names.index(time_index_name)
        index_names = data_frame.index.names
        data_frame = data_frame.reset_index()
        data_frame.columns = [str(c) for c in data_frame.columns]
        data_frame = data_frame.rename(columns={data_frame.columns[time_col_loc]: time_index_name})
        column_dtypes_map = {key: str(value) for key, value in data_frame.dtypes.to_dict().items()}

        data_frame = data_frame.replace({np.nan: None})

        return data_frame, index_names, column_dtypes_map, time_index_name

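    # Illustrative sketch (hypothetical input). _break_pandas_dataframe() renames
    # the first index level to the time index, resets the index into ordinary
    # columns, captures dtypes, and maps NaN to None:
    #
    #     idx = pd.MultiIndex.from_tuples(
    #         [(pd.Timestamp("2024-01-01", tz="UTC"), "AAPL")],
    #         names=[None, "unique_identifier"],
    #     )
    #     df = pd.DataFrame({"close": [100.0]}, index=idx)
    #     frame, index_names, dtypes, time_name = LocalTimeSerie._break_pandas_dataframe(df)
    #     # time_name == "time_index"; list(index_names) == ["time_index", "unique_identifier"]
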
    def upsert_data_into_table(
            self,
            data: pd.DataFrame,
            data_source: "DynamicTableDataSource",
    ):

        overwrite = True  # ALWAYS OVERWRITE
        metadata = self.remote_table

        data, index_names, column_dtypes_map, time_index_name = self._break_pandas_dataframe(
            data)

        # overwrite the original data frame to release memory
        if not data[time_index_name].is_monotonic_increasing:
            data = data.sort_values(time_index_name)

        metadata.handle_source_table_configuration_creation(
            column_dtypes_map=column_dtypes_map,
            index_names=index_names,
            time_index_name=time_index_name,
            data=data,
            overwrite=overwrite
        )

        duplicates_exist = data.duplicated(subset=index_names).any()
        if duplicates_exist:
            raise Exception(f"Duplicates found in columns: {index_names}")

        global_stats, grouped_dates = get_chunk_stats(
            chunk_df=data,
            index_names=index_names,
            time_index_name=time_index_name
        )
        multi_index_column_stats = {}
        column_names = [c for c in data.columns if c not in index_names]
        for c in column_names:
            multi_index_column_stats[c] = global_stats["_PER_ASSET_"]

        data_source.insert_data_into_table(
            serialized_data_frame=data,
            local_metadata=self,
            overwrite=overwrite,
            time_index_name=time_index_name,
            index_names=index_names,
            grouped_dates=grouped_dates,
        )

        min_d, last_time_index_value = global_stats["_GLOBAL_"]["min"], global_stats["_GLOBAL_"]["max"]
        max_per_asset_symbol = None

        def extract_max(node):
            # Leaf case: a dict with 'min' and 'max'
            if isinstance(node, dict) and "min" in node and "max" in node:
                return node["max"]
            # Otherwise recurse
            return {k: extract_max(v) for k, v in node.items()}

        if len(index_names) > 1:
            max_per_asset_symbol = {
                uid: extract_max(stats)
                for uid, stats in global_stats["_PER_ASSET_"].items()
            }
        local_metadata = self.set_last_update_index_time_from_update_stats(
            max_per_asset_symbol=max_per_asset_symbol,
            last_time_index_value=last_time_index_value,
            multi_index_column_stats=multi_index_column_stats
        )
        return local_metadata

    def get_node_time_to_wait(self):

        next_update = self.localtimeserieupdatedetails.next_update
        time_to_wait = 0.0
        if next_update is not None:
            time_to_wait = (pd.to_datetime(next_update) - datetime.datetime.now(pytz.utc)).total_seconds()
            time_to_wait = max(0, time_to_wait)
        return time_to_wait, next_update

    def wait_for_update_time(self, ):
        time_to_wait, next_update = self.get_node_time_to_wait()
        if time_to_wait > 0:

            logger.info(f"Scheduler Waiting for ts update time at {next_update} {time_to_wait}")
            time.sleep(time_to_wait)
        else:
            time_to_wait = max(0, 60 - datetime.datetime.now(pytz.utc).second)
            logger.info(f"Scheduler Waiting for ts update at start of minute")
            time.sleep(time_to_wait)


class TableMetaData(BaseModel):
    identifier: str = None
    description: Optional[str] = None
    data_frequency_id: Optional[DataFrequency] = None


class DynamicTableMetaData(BasePydanticModel, BaseObjectOrm):
    id: int = Field(None, description="Primary key, auto-incremented ID")
    storage_hash: str = Field(..., max_length=63, description="Max length of PostgreSQL table name")
    table_name: Optional[str] = Field(None, max_length=63, description="Max length of PostgreSQL table name")
    creation_date: datetime.datetime = Field(..., description="Creation timestamp")
    created_by_user: Optional[int] = Field(None, description="Foreign key reference to User")
    organization_owner: int = Field(None, description="Foreign key reference to Organization")
    open_for_everyone: bool = Field(default=False, description="Whether the table is open for everyone")
    data_source_open_for_everyone: bool = Field(default=False,
                                                description="Whether the data source is open for everyone")
    build_configuration: Optional[Dict[str, Any]] = Field(None, description="Configuration in JSON format")
    build_meta_data: Optional[Dict[str, Any]] = Field(None, description="Optional YAML metadata")
    time_serie_source_code_git_hash: Optional[str] = Field(None, max_length=255,
                                                           description="Git hash of the time series source code")
    time_serie_source_code: Optional[str] = Field(None, description="File path for time series source code")
    protect_from_deletion: bool = Field(default=False, description="Flag to protect the record from deletion")
    data_source: Union[int, "DynamicTableDataSource"]
    source_class_name: str
    sourcetableconfiguration: Optional[SourceTableConfiguration] = None
    table_index_names: Optional[Dict] = None

    # TS specific
    compression_policy_config: Optional[Dict] = None
    retention_policy_config: Optional[Dict] = None

    # MetaData
    identifier: Optional[str] = None
    description: Optional[str] = None
    data_frequency_id: Optional[DataFrequency] = None

    _drop_indices: bool = False  # for direct insertion we can pass these values
    _rebuild_indices: bool = False  # for direct insertion we can pass these values

    def patch(self, time_out: Union[None, int] = None, *args, **kwargs, ):
        url = self.get_object_url() + f"/{self.id}/"
        payload = {"json": serialize_to_json(kwargs)}
        s = self.build_session()
        r = make_request(s=s, loaders=self.LOADERS, r_type="PATCH", url=url, payload=payload, time_out=time_out)
        if r.status_code != 200:
            raise Exception(f"Error in request {r.text}")
        return self.__class__(**r.json())

    @classmethod
    def patch_by_hash(cls, storage_hash: str, *args, **kwargs):
        metadata = cls.get(storage_hash=storage_hash)
        metadata.patch(*args, **kwargs)

    @classmethod
    def get_or_create(cls, **kwargs):
        kwargs = serialize_to_json(kwargs)
        url = cls.get_object_url() + "/get_or_create/"
        payload = {"json": kwargs}
        s = cls.build_session()
        r = make_request(s=s, loaders=cls.LOADERS, r_type="POST", url=url, payload=payload)
        if r.status_code not in [201, 200]:
            raise Exception(r.text)
        data = r.json()
        return cls(**data)

    def build_or_update_update_details(self, *args, **kwargs):
        base_url = self.get_object_url()
        payload = {"json": kwargs}
        s = self.build_session()
        url = f"{base_url}/{self.id}/build_or_update_update_details/"
        r = make_request(r_type="PATCH", url=url, payload=payload, s=s, loaders=self.LOADERS, )
        if r.status_code != 202:
            raise Exception(f"Error in request {r.text}")

    @classmethod
    def patch_build_configuration(
            cls,
            remote_table_patch: Union[dict, None],
            build_meta_data: dict,
            data_source_id: int,
            local_table_patch: dict,
    ):

        logger.warning("TODO Fix Patch Build Configuration")
        # url = cls.get_object_url() + "/patch_build_configuration"
        # payload = {"json": {"remote_table_patch": remote_table_patch, "local_table_patch": local_table_patch,
        #                     "build_meta_data": build_meta_data, "data_source_id": data_source_id,
        #                     }}
        # s = cls.build_session()
        # r = make_request(s=s, loaders=cls.LOADERS, r_type="POST", url=url, payload=payload,
        #
        #                  )
        # if r.status_code != 200:
        #     raise Exception(r.text)

    def delete_table(self):
        data_source = PodDataSource._get_duck_db()
        duckdb_dynamic_data_source = DynamicTableDataSource.get_or_create_duck_db(
            related_resource=data_source.id,
        )
        if (isinstance(self.data_source, int) and self.data_source.id == duckdb_dynamic_data_source.id) or \
                (not isinstance(self.data_source, int) and self.data_source.related_resource.class_type == DUCK_DB):
            db_interface = DuckDBInterface()
            db_interface.drop_table(self.table_name)

        self.delete()

    def handle_source_table_configuration_creation(self,
                                                   column_dtypes_map: dict,
                                                   index_names: List[str],
                                                   time_index_name,
                                                   data,
                                                   overwrite=False
                                                   ):
        """
        Handles the creation or retrieval of the source table configuration.

        Parameters:
        ----------
        metadata : dict
            Metadata dictionary containing "sourcetableconfiguration" and "id".
        column_dtypes_map : dict
            Mapping of column names to their data types.
        index_names : list
            List of index names.
        time_index_name : str
            Name of the time index column.
        data : DataFrame
            The input DataFrame.
        overwrite : bool, optional
            Whether to overwrite existing configurations (default is False).

        Returns:
        -------
        dict or None
            Updated metadata with the source table configuration, and potentially filtered data.
        """
        stc = self.sourcetableconfiguration

        if stc is None:
            try:
                stc = SourceTableConfiguration.create(
                    column_dtypes_map=column_dtypes_map,
                    index_names=index_names,
                    time_index_name=time_index_name,
                    metadata_id=self.id
                )
                self.sourcetableconfiguration = stc
            except AlreadyExist:

                if not overwrite:
                    raise NotImplementedError("TODO Needs to remove values per asset")
                    # Filter the data based on time_index_name and last_time_index_value

    def get_data_between_dates_from_api(
            self,
            start_date: datetime.datetime = None,
            end_date: datetime.datetime = None,
            great_or_equal: bool = None,
            less_or_equal: bool = None,
            unique_identifier_list: list = None,
            columns: list = None,
            unique_identifier_range_map: Union[None, UniqueIdentifierRangeMap] = None,
            column_range_descriptor: Union[None, UniqueIdentifierRangeMap] = None
    ):
        """ Helper function to make a single batch request (or multiple paged requests if next_offset). """

        def fetch_one_batch(chunk_range_map):
            all_results_chunk = []
            offset = 0
            while True:
                payload = {
                    "json": {
                        "start_date": start_date.timestamp() if start_date else None,
                        "end_date": end_date.timestamp() if end_date else None,
                        "great_or_equal": great_or_equal,
                        "less_or_equal": less_or_equal,
                        "unique_identifier_list": unique_identifier_list,
                        "columns": columns,
                        "offset": offset,  # pagination offset
                        "unique_identifier_range_map": chunk_range_map,
                    }
                }

                # Perform the POST request
                r = make_request(s=s, loaders=self.LOADERS, payload=payload, r_type="POST", url=url)
                if r.status_code != 200:
                    logger.warning(f"Error in request: {r.text}")
                    return []

                response_data = r.json()
                # Accumulate results
                chunk = response_data.get("results", [])
                all_results_chunk.extend(chunk)

                # Retrieve next offset; if None, we've got all the data in this chunk
                next_offset = response_data.get("next_offset")
                if not next_offset:
                    break

                # Update offset for the next iteration
                offset = next_offset

            return all_results_chunk

        s = self.build_session()
        url = self.get_object_url() + f"/{self.id}/get_data_between_dates_from_remote/"

        unique_identifier_range_map = copy.deepcopy(unique_identifier_range_map)
        if unique_identifier_range_map is not None:
            for unique_identifier, date_info in unique_identifier_range_map.items():
                # Convert start_date if present
                if 'start_date' in date_info and isinstance(date_info['start_date'], datetime.datetime):
                    date_info['start_date'] = int(date_info['start_date'].timestamp())

                # Convert end_date if present
                if 'end_date' in date_info and isinstance(date_info['end_date'], datetime.datetime):
                    date_info['end_date'] = int(date_info['end_date'].timestamp())

        all_results = []
        if unique_identifier_range_map:
            keys = list(unique_identifier_range_map.keys())
            chunk_size = 100
            for start_idx in range(0, len(keys), chunk_size):
                key_chunk = keys[start_idx: start_idx + chunk_size]

                # Build sub-dictionary for this chunk
                chunk_map = {
                    k: unique_identifier_range_map[k] for k in key_chunk
                }

                # Fetch data (including any pagination via next_offset)
                chunk_results = fetch_one_batch(chunk_map)
                all_results.extend(chunk_results)
        else:
            # If unique_identifier_range_map is None, do a single batch with offset-based pagination.
            chunk_results = fetch_one_batch(None)
            all_results.extend(chunk_results)

        return pd.DataFrame(all_results)


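# Illustrative usage sketch (hypothetical; `table_metadata` and the identifiers
# are assumptions). get_data_between_dates_from_api() batches the keys of
# unique_identifier_range_map 100 at a time, follows next_offset pagination, and
# returns everything as a single DataFrame:
#
#     rng = {
#         "AAPL": {"start_date": datetime.datetime(2024, 1, 1, tzinfo=pytz.utc)},
#     }
#     df = table_metadata.get_data_between_dates_from_api(
#         unique_identifier_range_map=rng,
#         columns=["close"],
#     )
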
class Scheduler(BasePydanticModel, BaseObjectOrm):
    id: Optional[int] = None
    name: str
    is_running: bool
    running_process_pid: Optional[int]
    running_in_debug_mode: bool
    updates_halted: bool
    host: Optional[str]
    api_address: Optional[str]
    api_port: Optional[int]
    last_heart_beat: Optional[datetime.datetime] = None
    pre_loads_in_tree: Optional[List[LocalTimeSerie]] = None  # Assuming this is a list of strings
    in_active_tree: Optional[List[LocalTimeSerie]] = None  # Assuming this is a list of strings
    schedules_to: Optional[List[LocalTimeSerie]] = None
    # for heartbeat
    _stop_heart_beat: bool = False
    _executor: Optional[object] = None

    @classmethod
    def get_scheduler_for_ts(cls, ts_id: int):
        """
        GET /schedulers/for-ts/?ts_id=<LocalTimeSerie PK>
        """
        s = cls.build_session()
        url = cls.get_object_url() + "/for-ts/"
        r = make_request(
            s=s,
            r_type="GET",
            url=url,
            payload={"params": {"ts_id": ts_id}},
            loaders=cls.LOADERS,
        )
        if r.status_code == 404:
            raise SchedulerDoesNotExist(r.json().get("detail", r.text))
        r.raise_for_status()
        return cls(**r.json())

    @classmethod
    def initialize_debug_for_ts(
            cls,
            time_serie_id: int,
            name_suffix: Union[str, None] = None,
    ):
        """
        POST /schedulers/initialize-debug/
        body: { time_serie_id, name_suffix? }
        """
        s = cls.build_session()
        url = cls.get_object_url() + "/initialize-debug/"
        payload = {
            "json": {
                "time_serie_id": time_serie_id,
                **({"name_suffix": name_suffix} if name_suffix is not None else {}),
            }
        }
        r = make_request(s=s, r_type="POST", url=url, payload=payload, loaders=cls.LOADERS)
        r.raise_for_status()
        return cls(**r.json())

    @classmethod
    def build_and_assign_to_ts(
            cls,
            scheduler_name: str,
            time_serie_ids: List[int],
            delink_all_ts: bool = False,
            remove_from_other_schedulers: bool = True,
            timeout=None,
            **kwargs,
    ):
        """
        POST /schedulers/build-and-assign/
        body: {
            scheduler_name, time_serie_ids, delink_all_ts?,
            remove_from_other_schedulers?, scheduler_kwargs?
        }
        """
        s = cls.build_session()
        url = cls.get_object_url() + "/build_and_assign_to_ts/"
        payload = {
            "json": {
                "scheduler_name": scheduler_name,
                "time_serie_ids": time_serie_ids,
                "delink_all_ts": delink_all_ts,
                "remove_from_other_schedulers": remove_from_other_schedulers,
                "scheduler_kwargs": kwargs or {},
            }
        }
        r = make_request(s=s, r_type="POST", url=url, payload=payload,
                         time_out=timeout,
                         loaders=cls.LOADERS)
        if r.status_code not in [200, 201]:
            r.raise_for_status()
        return cls(**r.json())

    def in_active_tree_connect(self, local_time_series_ids: List[int]):
        """
        PATCH /schedulers/{id}/in-active-tree/
        body: { time_serie_ids }
        """
        s = self.build_session()
        url = f"{self.get_object_url()}/{self.id}/in-active-tree/"
        r = make_request(
            s=s,
            r_type="PATCH",
            url=url,
            payload={"json": {"time_serie_ids": local_time_series_ids}},
            loaders=self.LOADERS,
        )
        if r.status_code not in (200, 204):
            raise Exception(f"Error in request {r.text}")

    def assign_to_scheduler(self, time_serie_ids: List[int]):
        """
        PATCH /schedulers/{id}/assign/
        body: { time_serie_ids }
        """
        s = self.build_session()
        url = f"{self.get_object_url()}/{self.id}/assign/"
        r = make_request(
            s=s,
            r_type="PATCH",
            url=url,
            payload={"json": {"time_serie_ids": time_serie_ids}},
            loaders=self.LOADERS,
        )
        r.raise_for_status()
        return Scheduler(**r.json())

    def is_scheduler_running_in_process(self):
        # test call
        if self.is_running == True and hasattr(self, "api_address"):
            # verify scheduler host is the same
            if self.api_address == get_network_ip() and is_process_running(self.running_process_pid) == True:
                return True
        return False

    def _heart_beat_patch(self):
        try:
            scheduler = self.patch(is_running=True,
                                   running_process_pid=os.getpid(),
                                   running_in_debug_mode=self.running_in_debug_mode,
                                   last_heart_beat=datetime.datetime.utcnow().replace(
                                       tzinfo=pytz.utc).timestamp(),
                                   )
            for field, value in scheduler.__dict__.items():
                setattr(self, field, value)
        except Exception as e:
            logger.error(e)

    def _heartbeat_runner(self, run_interval):
        """
        Runs forever (until the main thread ends),
        calling _scheduler_heart_beat_patch every 30 seconds.
        """
        logger.debug("Heartbeat thread started with interval = %d seconds", run_interval)

        while True:
            self._heart_beat_patch()
            # Sleep in a loop so that if we ever decide to
            # add a cancellation event, we can check it in smaller intervals
            for _ in range(run_interval):
                # could check for a stop event here if not daemon
                if self._stop_heart_beat == True:
                    return
                time.sleep(1)

    def start_heart_beat(self):
        from concurrent.futures import ThreadPoolExecutor

        if self._executor is None:
            self._executor = ThreadPoolExecutor(max_workers=1)
        run_interval = TDAG_CONSTANTS.SCHEDULER_HEART_BEAT_FREQUENCY_SECONDS
        self._heartbeat_future = self._executor.submit(self._heartbeat_runner, run_interval)

    def stop_heart_beat(self):
        """
        Stop the heartbeat gracefully.
        """
        # Signal the runner loop to exit
        self._stop_heart_beat = True

        # Optionally wait for the future to complete
        if hasattr(self, "_heartbeat_future") and self._heartbeat_future:
            logger.info("Waiting for the heartbeat thread to finish...")
            self._heartbeat_future.result()  # or .cancel() if you prefer

        # Shut down the executor if no longer needed
        if self._executor:
            self._executor.shutdown(wait=True)
            self._executor = None

        logger.info("Heartbeat thread stopped.")


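# Illustrative usage sketch (hypothetical; the scheduler name and time-series
# ids are assumptions). The heartbeat runs _heartbeat_runner() on a single-worker
# ThreadPoolExecutor and keeps patching is_running/last_heart_beat until
# _stop_heart_beat is set:
#
#     scheduler = Scheduler.build_and_assign_to_ts(
#         scheduler_name="nightly-updates",
#         time_serie_ids=[101, 102],
#     )
#     scheduler.start_heart_beat()
#     try:
#         ...                  # run updates while the heartbeat keeps reporting
#     finally:
#         scheduler.stop_heart_beat()
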
|
1182
|
+
class RunConfiguration(BasePydanticModel, BaseObjectOrm):
|
1183
|
+
local_time_serie_update_details: Optional[int] = None
|
1184
|
+
retry_on_error: int = 0
|
1185
|
+
seconds_wait_on_retry: float = 50
|
1186
|
+
required_cpus: int = 1
|
1187
|
+
required_gpus: int = 0
|
1188
|
+
execution_time_out_seconds: float = 50
|
1189
|
+
update_schedule: str = "*/1 * * * *"
|
1190
|
+
|
1191
|
+
@classmethod
|
1192
|
+
@property
|
1193
|
+
def ROOT_URL(cls):
|
1194
|
+
return None
|
1195
|
+
|
1196
|
+
|
1197
|
+
class LocalTimeSerieUpdateDetails(BasePydanticModel, BaseObjectOrm):
|
1198
|
+
related_table: Union[int, LocalTimeSerie]
|
1199
|
+
active_update: bool = Field(default=False, description="Flag to indicate if update is active")
|
1200
|
+
update_pid: int = Field(default=0, description="Process ID of the update")
|
1201
|
+
error_on_last_update: bool = Field(default=False,
|
1202
|
+
description="Flag to indicate if there was an error in the last update")
|
1203
|
+
last_update: Optional[datetime.datetime] = Field(None, description="Timestamp of the last update")
|
1204
|
+
next_update: Optional[datetime.datetime] = Field(None, description="Timestamp of the next update")
|
1205
|
+
update_statistics: Optional[Dict[str, Any]] = Field(None, description="JSON field for update statistics")
|
1206
|
+
active_update_status: str = Field(default="Q", max_length=20, description="Current update status")
|
1207
|
+
active_update_scheduler: Optional[Union[int, Scheduler]] = Field(None,
|
1208
|
+
description="Scheduler for active update")
|
1209
|
+
update_priority: int = Field(default=0, description="Priority level of the update")
|
1210
|
+
last_updated_by_user: Optional[int] = Field(None, description="Foreign key reference to User")
|
1211
|
+
|
1212
|
+
run_configuration: Optional["RunConfiguration"] = None
|
1213
|
+
|
1214
|
+
@staticmethod
|
1215
|
+
def _parse_parameters_filter(parameters):
|
1216
|
+
for key, value in parameters.items():
|
1217
|
+
if "__in" in key:
|
1218
|
+
assert isinstance(value, list)
|
1219
|
+
parameters[key] = ",".join(value)
|
1220
|
+
return parameters
|
1221
|
+
|
1222
|
+
|
1223
|
+
class UpdateStatistics(BaseModel):
|
1224
|
+
"""
|
1225
|
+
This class contains the update details of the table in the main sequence engine
|
1226
|
+
"""
|
1227
|
+
asset_time_statistics: Optional[Dict[str, Union[datetime.datetime, None, Dict]]] = None
|
1228
|
+
|
1229
|
+
max_time_index_value: Optional[datetime.datetime] = None # does not include filter
|
1230
|
+
asset_list: Optional[List] = None
|
1231
|
+
limit_update_time: Optional[datetime.datetime] = None # flag to limit the update of data node
|
1232
|
+
_max_time_in_update_statistics: Optional[datetime.datetime] = None # include filter
|
1233
|
+
_initial_fallback_date: Optional[datetime.datetime] = None
|
1234
|
+
|
1235
|
+
# when working with DuckDB and column-based storage we also want stats by column
|
1236
|
+
multi_index_column_stats: Optional[Dict[str, Any]] = None
|
1237
|
+
is_backfill: bool = False
|
1238
|
+
|
1239
|
+
class Config:
|
1240
|
+
arbitrary_types_allowed = True
|
1241
|
+
|
1242
|
+
@staticmethod
|
1243
|
+
def _to_utc_datetime(value: Any):
|
1244
|
+
# pandas / numpy friendly path first
|
1245
|
+
if hasattr(value, "to_pydatetime"): # pandas.Timestamp
|
1246
|
+
value = value.to_pydatetime()
|
1247
|
+
# Handle numpy.datetime64 without importing numpy explicitly
|
1248
|
+
if type(value).__name__ == "datetime64":
|
1249
|
+
try:
|
1250
|
+
import pandas as pd # only if available
|
1251
|
+
value = pd.to_datetime(value).to_pydatetime()
|
1252
|
+
except Exception:
|
1253
|
+
return value
|
1254
|
+
|
1255
|
+
if isinstance(value, datetime.datetime):
|
1256
|
+
return value.astimezone(datetime.timezone.utc) if value.tzinfo else value.replace(
|
1257
|
+
tzinfo=datetime.timezone.utc)
|
1258
|
+
|
1259
|
+
if isinstance(value, (int, float)):
|
1260
|
+
v = float(value)
|
1261
|
+
# seconds / ms / µs / ns heuristics by magnitude
|
1262
|
+
if v > 1e17: # ns
|
1263
|
+
v /= 1e9
|
1264
|
+
elif v > 1e14: # µs
|
1265
|
+
v /= 1e6
|
1266
|
+
elif v > 1e11: # ms
|
1267
|
+
v /= 1e3
|
1268
|
+
return datetime.datetime.fromtimestamp(v, tz=datetime.timezone.utc)
|
1269
|
+
|
1270
|
+
if isinstance(value, str):
|
1271
|
+
s = value.strip()
|
1272
|
+
if s.endswith("Z"): # ISO Z suffix
|
1273
|
+
s = s[:-1] + "+00:00"
|
1274
|
+
try:
|
1275
|
+
dt = datetime.datetime.fromisoformat(s)
|
1276
|
+
return dt.astimezone(datetime.timezone.utc) if dt.tzinfo else dt.replace(tzinfo=datetime.timezone.utc)
|
1277
|
+
except ValueError:
|
1278
|
+
return value
|
1279
|
+
|
1280
|
+
return value
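# Standalone restatement, for illustration only, of the epoch-magnitude heuristic used
# by _to_utc_datetime above: numeric inputs are interpreted as seconds, milliseconds,
# microseconds or nanoseconds depending on their size and normalized to aware UTC.
import datetime

def epoch_to_utc(v: float) -> datetime.datetime:
    v = float(v)
    if v > 1e17:      # nanoseconds
        v /= 1e9
    elif v > 1e14:    # microseconds
        v /= 1e6
    elif v > 1e11:    # milliseconds
        v /= 1e3
    return datetime.datetime.fromtimestamp(v, tz=datetime.timezone.utc)

# 1_700_000_000 s, and the same instant in ms, µs and ns, all normalize identically.
assert len({epoch_to_utc(1_700_000_000 * 10 ** k) for k in (0, 3, 6, 9)}) == 1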
|
1281
|
+
|
1282
|
+
@classmethod
|
1283
|
+
def _normalize_nested(cls, obj: Any):
|
1284
|
+
if obj is None:
|
1285
|
+
return None
|
1286
|
+
if isinstance(obj, dict):
|
1287
|
+
return {k: cls._normalize_nested(v) for k, v in obj.items()}
|
1288
|
+
return cls._to_utc_datetime(obj)
|
1289
|
+
|
1290
|
+
@field_validator("multi_index_column_stats", mode="before")
|
1291
|
+
@classmethod
|
1292
|
+
def _coerce_multi_index_column_stats(cls, v):
|
1293
|
+
# Normalize before standard parsing so ints/strings become datetimes
|
1294
|
+
return cls._normalize_nested(v)
|
1295
|
+
|
1296
|
+
@classmethod
|
1297
|
+
def return_empty(cls):
|
1298
|
+
return cls()
|
1299
|
+
|
1300
|
+
def pretty_print(self):
|
1301
|
+
print(f"{self.__class__.__name__} summary:")
|
1302
|
+
|
1303
|
+
# asset_list
|
1304
|
+
if self.asset_list is None:
|
1305
|
+
print(" asset_list: None")
|
1306
|
+
else:
|
1307
|
+
print(f" asset_list: {len(self.asset_list)} assets")
|
1308
|
+
|
1309
|
+
# DataFrame
|
1310
|
+
last_observation = getattr(self, "last_observation", None)
if last_observation is None or last_observation.empty:
|
1311
|
+
print(" last_observation: empty DataFrame")
|
1312
|
+
else:
|
1313
|
+
rows, cols = last_observation.shape
|
1314
|
+
print(f" last_observation: DataFrame with {rows} rows × {cols} columns")
|
1315
|
+
|
1316
|
+
# Other attributes
|
1317
|
+
print(f" max_time_index_value: {self.max_time_index_value}")
|
1318
|
+
print(f" _max_time_in_update_statistics: {self._max_time_in_update_statistics}")
|
1319
|
+
|
1320
|
+
def is_empty(self):
|
1321
|
+
return self.asset_time_statistics is None and self.max_time_index_value is None
|
1322
|
+
|
1323
|
+
def asset_identifier(self):
|
1324
|
+
return list(self.asset_time_statistics.keys())
|
1325
|
+
|
1326
|
+
def get_max_time_in_update_statistics(self):
|
1327
|
+
if hasattr(self, "_max_time_in_update_statistics") == False:
|
1328
|
+
self._max_time_in_update_statistics = self.max_time_index_value or self._initial_fallback_date
|
1329
|
+
if self._max_time_in_update_statistics is None and self.asset_time_statistics is not None:
|
1330
|
+
new_update_statistics, _max_time_in_asset_time_statistics = self._get_update_statistics(
|
1331
|
+
|
1332
|
+
asset_list=None, unique_identifier_list=None
|
1333
|
+
)
|
1334
|
+
self._max_time_in_update_statistics = _max_time_in_asset_time_statistics
|
1335
|
+
|
1336
|
+
return self._max_time_in_update_statistics
|
1337
|
+
|
1338
|
+
def get_update_range_map_great_or_equal_columnar(self, extra_time_delta: Optional[datetime.timedelta] = None,
|
1339
|
+
column_filter: Optional[List[str]] = None,
|
1340
|
+
):
|
1341
|
+
fallback = {c: {a.unique_identifier: {"min": self._initial_fallback_date,
|
1342
|
+
"max": self._initial_fallback_date,
|
1343
|
+
} for a in self.asset_list} for c in (column_filter or [])}
|
1344
|
+
|
1345
|
+
multi_index_column_stats = self.multi_index_column_stats or {}
|
1346
|
+
fallback.update(multi_index_column_stats)
|
1347
|
+
|
1348
|
+
def _start_dt(bounds):
|
1349
|
+
dt = (bounds or {}).get("max") or self._initial_fallback_date
|
1350
|
+
if extra_time_delta:
|
1351
|
+
dt = dt + extra_time_delta
|
1352
|
+
return dt
|
1353
|
+
|
1354
|
+
target_cols = fallback.keys() if column_filter is None else column_filter
|
1355
|
+
|
1356
|
+
range_map = {
|
1357
|
+
col: {
|
1358
|
+
asset_id: DateInfo({
|
1359
|
+
"start_date_operand": ">=",
|
1360
|
+
"start_date": _start_dt(bounds),
|
1361
|
+
})
|
1362
|
+
for asset_id, bounds in col_stats.items()
|
1363
|
+
}
|
1364
|
+
for col, col_stats in fallback.items() if col in target_cols
|
1365
|
+
}
|
1366
|
+
|
1367
|
+
return range_map
|
1368
|
+
|
1369
|
+
def get_update_range_map_great_or_equal(self,
|
1370
|
+
extra_time_delta: Optional[datetime.timedelta] = None,
|
1371
|
+
):
|
1372
|
+
|
1373
|
+
if extra_time_delta is None:
|
1374
|
+
range_map = {k: DateInfo({"start_date_operand": ">=", "start_date": v or self._initial_fallback_date}) for
|
1375
|
+
k, v in self.asset_time_statistics.items()}
|
1376
|
+
else:
|
1377
|
+
range_map = {k: DateInfo(
|
1378
|
+
{"start_date_operand": ">=", "start_date": (v or self._initial_fallback_date) + extra_time_delta}) for
|
1379
|
+
k, v in self.asset_time_statistics.items()}
|
1380
|
+
return range_map
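# Illustrative sketch, not part of the diff: the per-asset range map built above is
# typically fed to a data source to request "everything at or after what we already
# have" per unique identifier. `stats` stands in for a populated UpdateStatistics
# instance from the caller's context; the one-hour extra_time_delta is an arbitrary
# example value.
import datetime

range_map = stats.get_update_range_map_great_or_equal(
    extra_time_delta=datetime.timedelta(hours=1)
)
for unique_identifier, date_info in range_map.items():
    # each DateInfo carries a ">=" operand plus the start date to query from
    print(unique_identifier, date_info)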
|
1381
|
+
|
1382
|
+
def get_last_update_index_2d(self, uid):
|
1383
|
+
return self.asset_time_statistics[uid] or self._initial_fallback_date
|
1384
|
+
|
1385
|
+
def get_asset_earliest_multiindex_update(self, asset):
|
1386
|
+
stats = self.asset_time_statistics.get(asset.unique_identifier)
|
1387
|
+
if not stats:
|
1388
|
+
return self._initial_fallback_date
|
1389
|
+
|
1390
|
+
def _min_in_nested(node):
|
1391
|
+
# If this is a dict, recurse into its values
|
1392
|
+
if isinstance(node, dict):
|
1393
|
+
m = None
|
1394
|
+
for v in node.values():
|
1395
|
+
cand = _min_in_nested(v)
|
1396
|
+
if cand is not None and (m is None or cand < m):
|
1397
|
+
m = cand
|
1398
|
+
return m
|
1399
|
+
# Leaf: assume it’s a timestamp (datetime or numeric)
|
1400
|
+
return node
|
1401
|
+
|
1402
|
+
return _min_in_nested(stats)
|
1403
|
+
|
1404
|
+
def filter_assets_by_level(self,
|
1405
|
+
level: int,
|
1406
|
+
filters: List,
|
1407
|
+
):
|
1408
|
+
"""
|
1409
|
+
Prune `self.asset_time_statistics` so that at the specified index level
|
1410
|
+
only the given keys remain. Works for any depth of nesting.
|
1411
|
+
|
1412
|
+
Parameters
|
1413
|
+
----------
|
1414
|
+
level : int
|
1415
|
+
The 1-based position of the index level to filter on
|
1416
|
+
(1 == unique_identifier, 2 == the first nested level, and so on).
|
1417
|
+
filters : List
|
1418
|
+
The allowed values at that level. Any branches whose key at
|
1419
|
+
that level is not in this list will be removed.
|
1420
|
+
|
1421
|
+
Returns
|
1422
|
+
-------
|
1423
|
+
self
|
1424
|
+
(Allows method chaining.)
|
1425
|
+
"""
|
1426
|
+
# Grab the full list of index names, in order
|
1427
|
+
|
1428
|
+
# Determine the numeric depth of the target level
|
1429
|
+
# 0 == unique_identifier, 1 == first nested level, etc.
|
1430
|
+
target_depth = level - 1
|
1431
|
+
|
1432
|
+
# Special‐case: filtering on unique_identifier itself
|
1433
|
+
if target_depth == 0:
|
1434
|
+
self.asset_time_statistics = {
|
1435
|
+
asset: stats
|
1436
|
+
for asset, stats in self.asset_time_statistics.items()
|
1437
|
+
if asset in filters
|
1438
|
+
}
|
1439
|
+
return self
|
1440
|
+
|
1441
|
+
allowed = set(filters)
|
1442
|
+
default = self._initial_fallback_date
|
1443
|
+
|
1444
|
+
def _prune(node: Any, current_depth: int) -> Any:
|
1445
|
+
# leaf timestamp
|
1446
|
+
if not isinstance(node, dict):
|
1447
|
+
return node
|
1448
|
+
|
1449
|
+
# we've reached the level to filter
|
1450
|
+
if current_depth == target_depth:
|
1451
|
+
out: Dict[str, Any] = {}
|
1452
|
+
for key in allowed:
|
1453
|
+
if key in node:
|
1454
|
+
out[key] = node[key]
|
1455
|
+
else:
|
1456
|
+
# missing filter → assign fallback date
|
1457
|
+
out[key] = default
|
1458
|
+
return out
|
1459
|
+
|
1460
|
+
# otherwise recurse deeper
|
1461
|
+
pruned: Dict[str, Any] = {}
|
1462
|
+
for key, subnode in node.items():
|
1463
|
+
new_sub = _prune(subnode, current_depth + 1)
|
1464
|
+
# keep non-empty dicts or valid leaves
|
1465
|
+
if isinstance(new_sub, dict):
|
1466
|
+
if new_sub:
|
1467
|
+
pruned[key] = new_sub
|
1468
|
+
elif new_sub is not None:
|
1469
|
+
pruned[key] = new_sub
|
1470
|
+
return pruned
|
1471
|
+
|
1472
|
+
new_stats: Dict[str, Any] = {}
|
1473
|
+
# stats dict sits at depth=1 under each asset
|
1474
|
+
for asset, stats in self.asset_time_statistics.items():
|
1475
|
+
if stats is None:
|
1476
|
+
new_stats[asset] = {f: self._initial_fallback_date for f in allowed}
|
1477
|
+
else:
|
1478
|
+
pr = _prune(stats, current_depth=1)
|
1479
|
+
new_stats[asset] = pr or None
|
1480
|
+
|
1481
|
+
self.asset_time_statistics = new_stats
|
1482
|
+
return self
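# Standalone usage example, for illustration only (assumes the UpdateStatistics class
# above is importable). Level 1 filters on the unique identifiers themselves; deeper
# levels prune the nested keys and backfill missing ones with the fallback date. The
# identifiers and dates below are made up.
import datetime

utc = datetime.timezone.utc
stats = UpdateStatistics(
    asset_time_statistics={
        "ASSET_A": {"close": datetime.datetime(2024, 1, 1, tzinfo=utc)},
        "ASSET_B": {"open": datetime.datetime(2024, 1, 2, tzinfo=utc)},
    }
)
stats.filter_assets_by_level(level=1, filters=["ASSET_A"])
assert list(stats.keys()) == ["ASSET_A"]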
|
1483
|
+
|
1484
|
+
def _get_update_statistics(self,
|
1485
|
+
asset_list: Optional[List],
|
1486
|
+
unique_identifier_list: Union[list, None], init_fallback_date=None):
|
1487
|
+
new_update_statistics = {}
|
1488
|
+
if asset_list is None and unique_identifier_list is None:
|
1489
|
+
assert self.asset_time_statistics is not None
|
1490
|
+
unique_identifier_list = list(self.asset_time_statistics.keys())
|
1491
|
+
|
1492
|
+
else:
|
1493
|
+
unique_identifier_list = [a.unique_identifier for a in
|
1494
|
+
asset_list] if unique_identifier_list is None else unique_identifier_list
|
1495
|
+
|
1496
|
+
for unique_identifier in unique_identifier_list:
|
1497
|
+
|
1498
|
+
if self.asset_time_statistics and unique_identifier in self.asset_time_statistics:
|
1499
|
+
new_update_statistics[unique_identifier] = self.asset_time_statistics[unique_identifier]
|
1500
|
+
else:
|
1501
|
+
|
1502
|
+
new_update_statistics[unique_identifier] = init_fallback_date
|
1503
|
+
|
1504
|
+
def _max_in_nested(d):
|
1505
|
+
"""
|
1506
|
+
Recursively find the max leaf value in a nested dict-of-dicts,
|
1507
|
+
where the leaves are comparable (e.g. datetime objects).
|
1508
|
+
Returns None if there are no leaves.
|
1509
|
+
"""
|
1510
|
+
max_val = None
|
1511
|
+
for v in d.values():
|
1512
|
+
if isinstance(v, dict):
|
1513
|
+
candidate = _max_in_nested(v)
|
1514
|
+
else:
|
1515
|
+
candidate = v
|
1516
|
+
if candidate is not None and (max_val is None or candidate > max_val):
|
1517
|
+
max_val = candidate
|
1518
|
+
return max_val
|
1519
|
+
|
1520
|
+
_max_time_in_asset_time_statistics = _max_in_nested(new_update_statistics) if len(
|
1521
|
+
new_update_statistics) > 0 else init_fallback_date
|
1522
|
+
|
1523
|
+
return new_update_statistics, _max_time_in_asset_time_statistics
|
1524
|
+
|
1525
|
+
def update_assets(
|
1526
|
+
self,
|
1527
|
+
asset_list: Optional[List],
|
1528
|
+
*,
|
1529
|
+
init_fallback_date: Optional[datetime.datetime] = None,
|
1530
|
+
unique_identifier_list: Union[list, None] = None
|
1531
|
+
):
|
1532
|
+
self.asset_list = asset_list
|
1533
|
+
new_update_statistics = self.asset_time_statistics
|
1534
|
+
if asset_list is not None or unique_identifier_list is not None:
|
1535
|
+
new_update_statistics, _max_time_in_asset_time_statistics = self._get_update_statistics(
|
1536
|
+
unique_identifier_list=unique_identifier_list,
|
1537
|
+
asset_list=asset_list, init_fallback_date=init_fallback_date,
|
1538
|
+
)
|
1539
|
+
|
1540
|
+
else:
|
1541
|
+
_max_time_in_asset_time_statistics = self.max_time_index_value or init_fallback_date
|
1542
|
+
|
1543
|
+
new_multi_index_column_stats = self.multi_index_column_stats
|
1544
|
+
if self.max_time_index_value is not None and self.multi_index_column_stats is not None:
|
1545
|
+
new_multi_index_column_stats = {k: v for k, v in self.multi_index_column_stats.items() if
|
1546
|
+
k in new_update_statistics.keys()}
|
1547
|
+
|
1548
|
+
du = UpdateStatistics(
|
1549
|
+
asset_time_statistics=new_update_statistics,
|
1550
|
+
max_time_index_value=self.max_time_index_value,
|
1551
|
+
asset_list=asset_list,
|
1552
|
+
multi_index_column_stats=new_multi_index_column_stats
|
1553
|
+
)
|
1554
|
+
du._max_time_in_update_statistics = _max_time_in_asset_time_statistics
|
1555
|
+
du._initial_fallback_date = init_fallback_date
|
1556
|
+
return du
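# Illustrative sketch, not part of the diff: update_assets narrows the statistics to the
# identifiers about to be updated and records a fallback date for identifiers that have
# never been stored. `stats` is assumed to come from the caller's context; the
# identifiers and the fallback date are example values.
import datetime

narrowed = stats.update_assets(
    asset_list=None,
    unique_identifier_list=["ASSET_A", "ASSET_NEW"],
    init_fallback_date=datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc),
)
# ASSET_NEW was unknown, so it falls back to the supplied initial date
print(narrowed.get_last_update_index_2d("ASSET_NEW"))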
|
1557
|
+
|
1558
|
+
def is_empty(self):
|
1559
|
+
return self.max_time_index_value is None
|
1560
|
+
|
1561
|
+
def __getitem__(self, key: str) -> Any:
|
1562
|
+
if self.asset_time_statistics is None:
|
1563
|
+
raise KeyError(f"{key} not found (asset_time_statistics is None).")
|
1564
|
+
return self.asset_time_statistics[key]
|
1565
|
+
|
1566
|
+
def __setitem__(self, key: str, value: Any) -> None:
|
1567
|
+
if self.asset_time_statistics is None:
|
1568
|
+
self.asset_time_statistics = {}
|
1569
|
+
self.asset_time_statistics[key] = value
|
1570
|
+
|
1571
|
+
def __delitem__(self, key: str) -> None:
|
1572
|
+
if not self.asset_time_statistics or key not in self.asset_time_statistics:
|
1573
|
+
raise KeyError(f"{key} not found in asset_time_statistics.")
|
1574
|
+
del self.asset_time_statistics[key]
|
1575
|
+
|
1576
|
+
def __iter__(self):
|
1577
|
+
"""Iterate over keys."""
|
1578
|
+
if self.asset_time_statistics is None:
|
1579
|
+
return iter([])
|
1580
|
+
return iter(self.asset_time_statistics)
|
1581
|
+
|
1582
|
+
def __len__(self) -> int:
|
1583
|
+
if not self.asset_time_statistics:
|
1584
|
+
return 0
|
1585
|
+
return len(self.asset_time_statistics)
|
1586
|
+
|
1587
|
+
def keys(self):
|
1588
|
+
if not self.asset_time_statistics:
|
1589
|
+
return []
|
1590
|
+
return self.asset_time_statistics.keys()
|
1591
|
+
|
1592
|
+
def values(self):
|
1593
|
+
if not self.asset_time_statistics:
|
1594
|
+
return []
|
1595
|
+
return self.asset_time_statistics.values()
|
1596
|
+
|
1597
|
+
def items(self):
|
1598
|
+
if not self.asset_time_statistics:
|
1599
|
+
return []
|
1600
|
+
return self.asset_time_statistics.items()
|
1601
|
+
|
1602
|
+
def filter_df_by_latest_value(self, df: pd.DataFrame) -> pd.DataFrame:
|
1603
|
+
if self.is_empty():
|
1604
|
+
return df
|
1605
|
+
|
1606
|
+
# Single-index time series fallback
|
1607
|
+
if (
|
1608
|
+
(self.asset_time_statistics is None or "unique_identifier" not in df.index.names)
|
1609
|
+
and self.max_time_index_value is not None
|
1610
|
+
):
|
1611
|
+
return df[df.index >= self.max_time_index_value]
|
1612
|
+
|
1613
|
+
names = df.index.names
|
1614
|
+
time_level = names[0]
|
1615
|
+
|
1616
|
+
grouping_levels = [n for n in names if n != time_level]
|
1617
|
+
|
1618
|
+
# Build a mask by iterating over each row tuple + its timestamp
|
1619
|
+
mask = []
|
1620
|
+
for idx_tuple, ts in zip(df.index, df.index.get_level_values(time_level)):
|
1621
|
+
# map level names → values
|
1622
|
+
level_vals = dict(zip(names, idx_tuple))
|
1623
|
+
asset = level_vals["unique_identifier"]
|
1624
|
+
|
1625
|
+
# fetch this asset’s nested stats
|
1626
|
+
stats = self.asset_time_statistics.get(asset)
|
1627
|
+
if stats is None:
|
1628
|
+
# no prior stats for this asset → keep row
|
1629
|
+
mask.append(True)
|
1630
|
+
continue
|
1631
|
+
|
1632
|
+
# drill into the nested stats for the remaining levels
|
1633
|
+
nested = stats
|
1634
|
+
for lvl in grouping_levels[1:]: # skip 'unique_identifier'
|
1635
|
+
key = level_vals[lvl]
|
1636
|
+
if not isinstance(nested, dict) or key not in nested:
|
1637
|
+
# no prior stats for this subgroup → keep row
|
1638
|
+
nested = None
|
1639
|
+
break
|
1640
|
+
nested = nested[key]
|
1641
|
+
|
1642
|
+
# if we couldn’t find a prior timestamp, or this ts is newer, keep it
|
1643
|
+
if nested is None or ts > nested:
|
1644
|
+
mask.append(True)
|
1645
|
+
else:
|
1646
|
+
# ts ≤ last seen → filter out
|
1647
|
+
mask.append(False)
|
1648
|
+
|
1649
|
+
# apply the mask
|
1650
|
+
df = df[mask]
|
1651
|
+
|
1652
|
+
# drop any exact duplicate multi‐index rows that remain
|
1653
|
+
dup = df.index.duplicated(keep="first")
|
1654
|
+
if dup.any():
|
1655
|
+
n = dup.sum()
|
1656
|
+
logger.warning(f"Removed {n} duplicated rows after filtering.")
|
1657
|
+
df = df[~dup]
|
1658
|
+
return df
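# Standalone example, for illustration only (assumes the UpdateStatistics class above is
# importable): filter_df_by_latest_value keeps only rows strictly newer than the
# per-asset timestamps already recorded. The (time_index, unique_identifier) MultiIndex
# layout mirrors what the engine stores; all values are made up.
import datetime
import pandas as pd

utc = datetime.timezone.utc
t0 = datetime.datetime(2024, 1, 1, tzinfo=utc)
t1 = datetime.datetime(2024, 1, 2, tzinfo=utc)
df = pd.DataFrame(
    {"price": [1.0, 2.0, 3.0]},
    index=pd.MultiIndex.from_tuples(
        [(t0, "ASSET_A"), (t1, "ASSET_A"), (t1, "ASSET_B")],
        names=["time_index", "unique_identifier"],
    ),
)
stats = UpdateStatistics(asset_time_statistics={"ASSET_A": t0}, max_time_index_value=t0)
filtered = stats.filter_df_by_latest_value(df)
# the (t0, ASSET_A) row is dropped; ASSET_B has no prior stats, so its row is kept
assert len(filtered) == 2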
|
1659
|
+
|
1660
|
+
|
1661
|
+
def get_chunk_stats(chunk_df, time_index_name, index_names):
|
1662
|
+
chunk_stats = {
|
1663
|
+
"_GLOBAL_": {
|
1664
|
+
"max": chunk_df[time_index_name].max().timestamp(),
|
1665
|
+
"min": chunk_df[time_index_name].min().timestamp()
|
1666
|
+
}
|
1667
|
+
}
|
1668
|
+
chunk_stats["_PER_ASSET_"] = {}
|
1669
|
+
grouped_dates = None
|
1670
|
+
if len(index_names) > 1:
|
1671
|
+
grouped_dates = chunk_df.groupby(index_names[1:])[
|
1672
|
+
time_index_name].agg(
|
1673
|
+
["min", "max"])
|
1674
|
+
|
1675
|
+
# 2) decompose the grouped index names
|
1676
|
+
first, *rest = grouped_dates.index.names
|
1677
|
+
|
1678
|
+
# 3) reset to a flat DataFrame for easy iteration
|
1679
|
+
df = grouped_dates.reset_index()
|
1680
|
+
|
1681
|
+
# 4) build the nested dict
|
1682
|
+
per_asset: dict = {}
|
1683
|
+
for _, row in df.iterrows():
|
1684
|
+
uid = row[first] # e.g. the unique_identifier
|
1685
|
+
# only one extra level beyond uid?
|
1686
|
+
if len(rest) == 0:
|
1687
|
+
|
1688
|
+
per_asset[uid] = {
|
1689
|
+
"min": row["min"].timestamp(),
|
1690
|
+
"max": row["max"].timestamp(),
|
1691
|
+
}
|
1692
|
+
else:
|
1693
|
+
# multiple extra levels → walk a path of dicts
|
1694
|
+
keys = [row[level] for level in rest]
|
1695
|
+
sub = per_asset.setdefault(uid, {})
|
1696
|
+
for key in keys[:-1]:
|
1697
|
+
sub = sub.setdefault(key, {})
|
1698
|
+
sub[keys[-1]] = {
|
1699
|
+
"min": row["min"].timestamp(),
|
1700
|
+
"max": row["max"].timestamp(),
|
1701
|
+
}
|
1702
|
+
# 5) assign into your stats structure
|
1703
|
+
chunk_stats["_PER_ASSET_"] = per_asset
|
1704
|
+
return chunk_stats, grouped_dates
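# Illustrative example, not part of the diff, of the structure returned by
# get_chunk_stats for a small two-asset chunk: a _GLOBAL_ min/max plus per-asset bounds,
# all expressed as epoch seconds. The values are made up.
import datetime
import pandas as pd

utc = datetime.timezone.utc
chunk = pd.DataFrame({
    "time_index": [datetime.datetime(2024, 1, 1, tzinfo=utc),
                   datetime.datetime(2024, 1, 2, tzinfo=utc)],
    "unique_identifier": ["ASSET_A", "ASSET_B"],
    "price": [1.0, 2.0],
})
chunk_stats, grouped = get_chunk_stats(
    chunk_df=chunk,
    time_index_name="time_index",
    index_names=["time_index", "unique_identifier"],
)
# chunk_stats == {"_GLOBAL_": {"max": ..., "min": ...},
#                 "_PER_ASSET_": {"ASSET_A": {"min": ..., "max": ...},
#                                 "ASSET_B": {"min": ..., "max": ...}}}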
|
1705
|
+
|
1706
|
+
|
1707
|
+
class LocalTimeSeriesHistoricalUpdate(BasePydanticModel, BaseObjectOrm):
|
1708
|
+
id: Optional[int] = None
|
1709
|
+
related_table: int # Assuming you're using the ID of the related table
|
1710
|
+
update_time_start: datetime.datetime
|
1711
|
+
update_time_end: Optional[datetime.datetime] = None
|
1712
|
+
error_on_update: bool = False
|
1713
|
+
trace_id: Optional[str] = Field(default=None, max_length=255)
|
1714
|
+
updated_by_user: Optional[int] = None # Assuming you're using the ID of the user
|
1715
|
+
|
1716
|
+
last_time_index_value: Optional[datetime.datetime] = None
|
1717
|
+
|
1718
|
+
# extra fields for local control
|
1719
|
+
update_statistics: Optional[UpdateStatistics]
|
1720
|
+
must_update: Optional[bool]
|
1721
|
+
direct_dependencies_ids: Optional[List[int]]
|
1722
|
+
|
1723
|
+
|
1724
|
+
class DataSource(BasePydanticModel, BaseObjectOrm):
|
1725
|
+
id: Optional[int] = Field(None, description="The unique identifier of the Local Disk Source Lake")
|
1726
|
+
display_name: str
|
1727
|
+
organization: Optional[int] = Field(None, description="The unique identifier of the organization that owns this data source")
|
1728
|
+
class_type: str
|
1729
|
+
status: str
|
1730
|
+
extra_arguments: Optional[Dict] = None
|
1731
|
+
|
1732
|
+
@classmethod
|
1733
|
+
def get_or_create_duck_db(cls, time_out=None, *args, **kwargs):
|
1734
|
+
url = cls.get_object_url() + f"/get_or_create_duck_db/"
|
1735
|
+
payload = {"json": serialize_to_json(kwargs)}
|
1736
|
+
s = cls.build_session()
|
1737
|
+
r = make_request(s=s, loaders=cls.LOADERS, r_type="POST", url=url, payload=payload, time_out=time_out)
|
1738
|
+
if r.status_code not in [200, 201]:
|
1739
|
+
raise Exception(f"Error in request {r.text}")
|
1740
|
+
return cls(**r.json())
|
1741
|
+
|
1742
|
+
def insert_data_into_table(
|
1743
|
+
self,
|
1744
|
+
serialized_data_frame: pd.DataFrame,
|
1745
|
+
local_metadata: LocalTimeSerie,
|
1746
|
+
overwrite: bool,
|
1747
|
+
time_index_name: str,
|
1748
|
+
index_names: list,
|
1749
|
+
grouped_dates: dict,
|
1750
|
+
):
|
1751
|
+
|
1752
|
+
if self.class_type == DUCK_DB:
|
1753
|
+
DuckDBInterface().upsert(
|
1754
|
+
df=serialized_data_frame,
|
1755
|
+
table=local_metadata.remote_table.table_name
|
1756
|
+
)
|
1757
|
+
else:
|
1758
|
+
LocalTimeSerie.post_data_frame_in_chunks(
|
1759
|
+
serialized_data_frame=serialized_data_frame,
|
1760
|
+
local_metadata=local_metadata,
|
1761
|
+
data_source=self,
|
1762
|
+
index_names=index_names,
|
1763
|
+
time_index_name=time_index_name,
|
1764
|
+
overwrite=overwrite,
|
1765
|
+
)
|
1766
|
+
|
1767
|
+
def insert_data_into_local_table(
|
1768
|
+
self,
|
1769
|
+
serialized_data_frame: pd.DataFrame,
|
1770
|
+
local_metadata: LocalTimeSerie,
|
1771
|
+
overwrite: bool,
|
1772
|
+
time_index_name: str,
|
1773
|
+
index_names: list,
|
1774
|
+
grouped_dates: dict,
|
1775
|
+
):
|
1776
|
+
|
1777
|
+
# LocalTimeSerie.post_data_frame_in_chunks(
|
1778
|
+
# serialized_data_frame=serialized_data_frame,
|
1779
|
+
# local_metadata=local_metadata,
|
1780
|
+
# data_source=self,
|
1781
|
+
# index_names=index_names,
|
1782
|
+
# time_index_name=time_index_name,
|
1783
|
+
# overwrite=overwrite,
|
1784
|
+
# )
|
1785
|
+
raise NotImplementedError
|
1786
|
+
|
1787
|
+
def get_data_by_time_index(
|
1788
|
+
self,
|
1789
|
+
local_metadata: dict,
|
1790
|
+
start_date: Optional[datetime.datetime] = None,
|
1791
|
+
end_date: Optional[datetime.datetime] = None,
|
1792
|
+
great_or_equal: bool = True,
|
1793
|
+
less_or_equal: bool = True,
|
1794
|
+
columns: Optional[List[str]] = None,
|
1795
|
+
unique_identifier_list: Optional[List[str]] = None,
|
1796
|
+
unique_identifier_range_map: Optional[UniqueIdentifierRangeMap] = None,
|
1797
|
+
column_range_descriptor: Optional[Dict[str, UniqueIdentifierRangeMap]] = None,
|
1798
|
+
) -> pd.DataFrame:
|
1799
|
+
|
1800
|
+
logger.warning("EXTEND THE CONSTRAIN READ HERE!!")
|
1801
|
+
if self.class_type == DUCK_DB:
|
1802
|
+
db_interface = DuckDBInterface()
|
1803
|
+
table_name = local_metadata.remote_table.table_name
|
1804
|
+
|
1805
|
+
adjusted_start, adjusted_end, adjusted_uirm, _ = db_interface.constrain_read(
|
1806
|
+
table=table_name,
|
1807
|
+
start=start_date,
|
1808
|
+
end=end_date,
|
1809
|
+
ids=unique_identifier_list,
|
1810
|
+
unique_identifier_range_map=unique_identifier_range_map,
|
1811
|
+
)
|
1812
|
+
if unique_identifier_range_map is not None and adjusted_end is not None:
|
1813
|
+
adjusted_end = datetime.datetime(adjusted_end.year, adjusted_end.month, adjusted_end.day,
|
1814
|
+
tzinfo=datetime.timezone.utc)
|
1815
|
+
for v in unique_identifier_range_map.values():
|
1816
|
+
v["end_date"] = adjusted_end
|
1817
|
+
v["end_date_operand"] = "<="
|
1818
|
+
|
1819
|
+
df = db_interface.read(
|
1820
|
+
table=table_name,
|
1821
|
+
start=start_date,
|
1822
|
+
end=end_date,
|
1823
|
+
great_or_equal=great_or_equal,
|
1824
|
+
less_or_equal=less_or_equal,
|
1825
|
+
ids=unique_identifier_list,
|
1826
|
+
columns=columns,
|
1827
|
+
unique_identifier_range_map=unique_identifier_range_map, # Pass range map
|
1828
|
+
)
|
1829
|
+
|
1830
|
+
|
1831
|
+
else:
|
1832
|
+
if column_range_descriptor is not None:
|
1833
|
+
raise Exception("On this data source do not use column_range_descriptor")
|
1834
|
+
df = local_metadata.get_data_between_dates_from_api(
|
1835
|
+
start_date=start_date,
|
1836
|
+
end_date=end_date,
|
1837
|
+
great_or_equal=great_or_equal,
|
1838
|
+
less_or_equal=less_or_equal,
|
1839
|
+
unique_identifier_list=unique_identifier_list,
|
1840
|
+
columns=columns,
|
1841
|
+
unique_identifier_range_map=unique_identifier_range_map
|
1842
|
+
)
|
1843
|
+
if len(df) == 0:
|
1844
|
+
logger.warning(
|
1845
|
+
f"No data returned from remote API for {local_metadata.update_hash}"
|
1846
|
+
)
|
1847
|
+
return df
|
1848
|
+
|
1849
|
+
stc = local_metadata.remote_table.sourcetableconfiguration
|
1850
|
+
df[stc.time_index_name] = pd.to_datetime(df[stc.time_index_name], format='ISO8601')
|
1854
|
+
columns_to_loop = columns or stc.column_dtypes_map.keys()
|
1855
|
+
for c, c_type in stc.column_dtypes_map.items():
|
1856
|
+
if c not in columns_to_loop:
|
1857
|
+
continue
|
1858
|
+
if c != stc.time_index_name:
|
1859
|
+
if c_type == "object":
|
1860
|
+
c_type = "str"
|
1861
|
+
df[c] = df[c].astype(c_type)
|
1862
|
+
df = df.set_index(stc.index_names)
|
1863
|
+
return df
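# Illustrative sketch, not part of the diff: reading a window of data through a
# DataSource. `data_source` and `local_metadata` are assumed to be already-resolved
# DataSource / LocalTimeSerie objects from the caller's context; the dates and the
# column name are arbitrary example values.
import datetime

df = data_source.get_data_by_time_index(
    local_metadata=local_metadata,
    start_date=datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc),
    end_date=datetime.datetime(2024, 2, 1, tzinfo=datetime.timezone.utc),
    great_or_equal=True,
    less_or_equal=False,
    columns=["close"],
)
# the frame comes back indexed by the table's configured index_names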
|
1864
|
+
|
1865
|
+
def get_earliest_value(self,
|
1866
|
+
local_metadata: LocalTimeSerie,
|
1867
|
+
) -> Tuple[Optional[pd.Timestamp], Dict[Any, Optional[pd.Timestamp]]]:
|
1868
|
+
if self.class_type == DUCK_DB:
|
1869
|
+
db_interface = DuckDBInterface()
|
1870
|
+
table_name = local_metadata.remote_table.table_name
|
1871
|
+
return db_interface.time_index_minima(table=table_name)
|
1872
|
+
|
1873
|
+
|
1874
|
+
else:
|
1875
|
+
raise NotImplementedError
|
1876
|
+
|
1877
|
+
|
1878
|
+
class DynamicTableDataSource(BasePydanticModel, BaseObjectOrm):
|
1879
|
+
id: int
|
1880
|
+
related_resource: DataSource
|
1881
|
+
related_resource_class_type: str
|
1882
|
+
|
1883
|
+
class Config:
|
1884
|
+
use_enum_values = True # This ensures that enums are stored as their values (e.g., 'TEXT')
|
1885
|
+
|
1886
|
+
def model_dump_json(self, **json_dumps_kwargs) -> str:
|
1887
|
+
"""
|
1888
|
+
Dump the current instance to a JSON string,
|
1889
|
+
ensuring that the dependent `related_resource` is also properly dumped.
|
1890
|
+
"""
|
1891
|
+
# Obtain the dictionary representation using Pydantic's model_dump
|
1892
|
+
dump = self.model_dump()
|
1893
|
+
# Properly dump the dependent resource if it supports model_dump
|
1894
|
+
dump["related_resource"] = self.related_resource.model_dump()
|
1895
|
+
# Convert the dict to a JSON string
|
1896
|
+
return json.dumps(dump, **json_dumps_kwargs)
|
1897
|
+
|
1898
|
+
@classmethod
|
1899
|
+
def get_default_data_source_for_token(cls):
|
1900
|
+
global _default_data_source
|
1901
|
+
if _default_data_source is not None:
|
1902
|
+
return _default_data_source # Return cached result if already set
|
1903
|
+
url = cls.ROOT_URL + "/get_default_data_source_for_token/"
|
1904
|
+
|
1905
|
+
s = cls.build_session()
|
1906
|
+
r = make_request(s=s, loaders=cls.LOADERS, r_type="GET", url=url, payload={})
|
1907
|
+
|
1908
|
+
if r.status_code != 200:
|
1909
|
+
raise Exception(f"Error in request {r.text}")
|
1910
|
+
data = r.json()
|
1911
|
+
|
1912
|
+
return cls(**data)
|
1913
|
+
|
1914
|
+
def persist_to_pickle(self, path):
|
1915
|
+
import cloudpickle
|
1916
|
+
os.makedirs(os.path.dirname(path), exist_ok=True)
|
1917
|
+
with open(path, 'wb') as handle:
|
1918
|
+
cloudpickle.dump(self, handle)
|
1919
|
+
|
1920
|
+
@classmethod
|
1921
|
+
def get_or_create_duck_db(cls, *args, **kwargs):
|
1922
|
+
url = cls.get_object_url() + "/get_or_create_duck_db/"
|
1923
|
+
s = cls.build_session()
|
1924
|
+
r = make_request(s=s, loaders=cls.LOADERS, r_type="POST", url=url, payload={"json": kwargs})
|
1925
|
+
if r.status_code not in [200, 201]:
|
1926
|
+
raise Exception(f"Error in request {r.text}")
|
1927
|
+
return cls(**r.json())
|
1928
|
+
|
1929
|
+
def has_direct_postgres_connection(self):
|
1930
|
+
return self.related_resource.class_type == 'direct'
|
1931
|
+
|
1932
|
+
def get_data_by_time_index(self, *args, **kwargs):
|
1933
|
+
if self.has_direct_postgres_connection():
|
1934
|
+
stc = kwargs["local_metadata"].remote_table.sourcetableconfiguration
|
1935
|
+
|
1936
|
+
df = TimeScaleInterface.direct_data_from_db(
|
1937
|
+
connection_uri=self.related_resource.get_connection_uri(),
|
1938
|
+
*args, **kwargs,
|
1939
|
+
|
1940
|
+
)
|
1941
|
+
df = set_types_in_table(df, stc.column_dtypes_map)
|
1942
|
+
return df
|
1943
|
+
else:
|
1944
|
+
return self.related_resource.get_data_by_time_index(*args, **kwargs)
|
1945
|
+
|
1946
|
+
def insert_data_into_table(self, *args, **kwargs):
|
1947
|
+
if self.has_direct_postgres_connection():
|
1948
|
+
TimeScaleInterface.process_and_update_table(
|
1949
|
+
data_source=self.related_resource,
|
1950
|
+
*args, **kwargs,
|
1951
|
+
)
|
1952
|
+
|
1953
|
+
else:
|
1954
|
+
self.related_resource.insert_data_into_table(*args, **kwargs)
|
1955
|
+
|
1956
|
+
|
1957
|
+
class Project(BasePydanticModel, BaseObjectOrm):
|
1958
|
+
id: int
|
1959
|
+
project_name: str
|
1960
|
+
data_source: DynamicTableDataSource
|
1961
|
+
git_ssh_url: Optional[str] = None
|
1962
|
+
|
1963
|
+
@classmethod
|
1964
|
+
def get_user_default_project(cls):
|
1965
|
+
url = cls.get_object_url() + "/get_user_default_project/"
|
1966
|
+
|
1967
|
+
s = cls.build_session()
|
1968
|
+
r = make_request(s=s, loaders=cls.LOADERS, r_type="GET", url=url, )
|
1969
|
+
if r.status_code == 404:
|
1970
|
+
raise Exception(r.text)
|
1971
|
+
if r.status_code != 200:
|
1972
|
+
raise Exception(f"Error in request {r.text}")
|
1973
|
+
return cls(**r.json())
|
1974
|
+
|
1975
|
+
def __str__(self):
|
1976
|
+
return yaml.safe_dump(
|
1977
|
+
self.model_dump(),
|
1978
|
+
sort_keys=False,
|
1979
|
+
default_flow_style=False,
|
1980
|
+
)
|
1981
|
+
|
1982
|
+
|
1983
|
+
class TimeScaleDB(DataSource):
|
1984
|
+
database_user: str
|
1985
|
+
password: str
|
1986
|
+
host: str
|
1987
|
+
database_name: str
|
1988
|
+
port: int
|
1989
|
+
|
1990
|
+
def get_connection_uri(self):
|
1991
|
+
password = self.password # Decrypt password if necessary
|
1992
|
+
return f"postgresql://{self.database_user}:{password}@{self.host}:{self.port}/{self.database_name}"
|
1993
|
+
|
1994
|
+
def insert_data_into_table(
|
1995
|
+
self,
|
1996
|
+
serialized_data_frame: pd.DataFrame,
|
1997
|
+
local_metadata: dict,
|
1998
|
+
overwrite: bool,
|
1999
|
+
time_index_name: str,
|
2000
|
+
index_names: list,
|
2001
|
+
grouped_dates: dict,
|
2002
|
+
):
|
2003
|
+
|
2004
|
+
LocalTimeSerie.post_data_frame_in_chunks(
|
2005
|
+
serialized_data_frame=serialized_data_frame,
|
2006
|
+
local_metadata=local_metadata,
|
2007
|
+
data_source=self,
|
2008
|
+
index_names=index_names,
|
2009
|
+
time_index_name=time_index_name,
|
2010
|
+
overwrite=overwrite,
|
2011
|
+
)
|
2012
|
+
|
2013
|
+
def filter_by_assets_ranges(
|
2014
|
+
self,
|
2015
|
+
asset_ranges_map: dict,
|
2016
|
+
metadata: dict,
|
2017
|
+
update_hash: str,
|
2018
|
+
has_direct_connection: bool
|
2019
|
+
):
|
2020
|
+
table_name = metadata.table_name
|
2021
|
+
index_names = metadata.sourcetableconfiguration.index_names
|
2022
|
+
column_types = metadata.sourcetableconfiguration.column_dtypes_map
|
2023
|
+
if has_direct_connection:
|
2024
|
+
df = TimeScaleInterface.filter_by_assets_ranges(
|
2025
|
+
table_name=table_name,
|
2026
|
+
asset_ranges_map=asset_ranges_map,
|
2027
|
+
index_names=index_names,
|
2028
|
+
data_source=self,
|
2029
|
+
column_types=column_types
|
2030
|
+
)
|
2031
|
+
else:
|
2032
|
+
df = LocalTimeSerie.get_data_between_dates_from_api(
|
2033
|
+
update_hash=update_hash,
|
2034
|
+
data_source_id=self.id,
|
2035
|
+
start_date=None,
|
2036
|
+
end_date=None,
|
2037
|
+
great_or_equal=True,
|
2038
|
+
less_or_equal=True,
|
2039
|
+
asset_symbols=None,
|
2040
|
+
columns=None,
|
2041
|
+
execution_venue_symbols=None,
|
2042
|
+
symbol_range_map=asset_ranges_map, # <-- key for applying ranges
|
2043
|
+
)
|
2044
|
+
return df
|
2045
|
+
|
2046
|
+
def get_data_by_time_index(
|
2047
|
+
self,
|
2048
|
+
local_metadata: dict,
|
2049
|
+
start_date: Optional[datetime.datetime] = None,
|
2050
|
+
end_date: Optional[datetime.datetime] = None,
|
2051
|
+
great_or_equal: bool = True,
|
2052
|
+
less_or_equal: bool = True,
|
2053
|
+
columns: Optional[List[str]] = None,
|
2054
|
+
unique_identifier_list: Optional[List[str]] = None,
|
2055
|
+
|
2056
|
+
) -> pd.DataFrame:
|
2057
|
+
|
2058
|
+
metadata = local_metadata.remote_table
|
2059
|
+
|
2060
|
+
df = local_metadata.get_data_between_dates_from_api(
|
2061
|
+
|
2062
|
+
start_date=start_date,
|
2063
|
+
end_date=end_date,
|
2064
|
+
great_or_equal=great_or_equal,
|
2065
|
+
less_or_equal=less_or_equal,
|
2066
|
+
unique_identifier_list=unique_identifier_list,
|
2067
|
+
columns=columns,
|
2068
|
+
)
|
2069
|
+
if len(df) == 0:
|
2070
|
+
if logger:
|
2071
|
+
logger.warning(
|
2072
|
+
f"No data returned from remote API for {local_metadata.update_hash}"
|
2073
|
+
)
|
2074
|
+
return df
|
2075
|
+
|
2076
|
+
stc = local_metadata.remote_table.sourcetableconfiguration
|
2077
|
+
df[stc.time_index_name] = pd.to_datetime(df[stc.time_index_name])
|
2078
|
+
for c, c_type in stc.column_dtypes_map.items():
|
2079
|
+
if c != stc.time_index_name:
|
2080
|
+
if c_type == "object":
|
2081
|
+
c_type = "str"
|
2082
|
+
df[c] = df[c].astype(c_type)
|
2083
|
+
df = df.set_index(stc.index_names)
|
2084
|
+
return df
|
2085
|
+
|
2086
|
+
|
2087
|
+
class DynamicResource(BasePydanticModel, BaseObjectOrm):
|
2088
|
+
id: Optional[int] = None
|
2089
|
+
name: str
|
2090
|
+
type: str
|
2091
|
+
object_signature: dict
|
2092
|
+
attributes: Optional[dict]
|
2093
|
+
|
2094
|
+
created_at: datetime.datetime
|
2095
|
+
updated_at: datetime.datetime
|
2096
|
+
is_production: bool
|
2097
|
+
pod: int
|
2098
|
+
|
2099
|
+
|
2100
|
+
def create_configuration_for_strategy(json_payload: dict, timeout=None):
|
2101
|
+
url = TDAG_ENDPOINT + "/orm/api/tdag-gpt/create_configuration_for_strategy/"
|
2102
|
+
from requests.adapters import HTTPAdapter, Retry
|
2103
|
+
s = requests.Session()
|
2104
|
+
s.headers.update(loaders.auth_headers)
|
2105
|
+
retries = Retry(total=2, backoff_factor=2)
|
2106
|
+
s.mount('http://', HTTPAdapter(max_retries=retries))
|
2107
|
+
|
2108
|
+
r = make_request(s=s, r_type="POST", url=url, payload={"json": json_payload},
|
2109
|
+
loaders=loaders, time_out=200)
|
2110
|
+
return r
|
2111
|
+
|
2112
|
+
|
2113
|
+
def query_agent(json_payload: dict, timeout=None):
|
2114
|
+
url = TDAG_ENDPOINT + "/orm/api/tdag-gpt/query_agent/"
|
2115
|
+
from requests.adapters import HTTPAdapter, Retry
|
2116
|
+
s = requests.Session()
|
2117
|
+
s.headers.update(loaders.auth_headers)
|
2118
|
+
retries = Retry(total=2, backoff_factor=2)
|
2119
|
+
s.mount('http://', HTTPAdapter(max_retries=retries))
|
2120
|
+
|
2121
|
+
r = make_request(s=s, r_type="POST", url=url, payload={"json": json_payload},
|
2122
|
+
loaders=loaders, time_out=200)
|
2123
|
+
return r
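# Sketch, not part of the diff: create_configuration_for_strategy and query_agent build
# the same retrying session inline. A small factory like this could remove the
# duplication; it relies only on requests/urllib3 plus the module's own `loaders`,
# `TDAG_ENDPOINT` and `make_request`.
import requests
from requests.adapters import HTTPAdapter, Retry

def _build_retrying_session(total: int = 2, backoff_factor: float = 2) -> requests.Session:
    s = requests.Session()
    s.headers.update(loaders.auth_headers)
    s.mount("http://", HTTPAdapter(max_retries=Retry(total=total, backoff_factor=backoff_factor)))
    return s

def _post_tdag_gpt(endpoint: str, json_payload: dict, time_out: int = 200):
    url = TDAG_ENDPOINT + f"/orm/api/tdag-gpt/{endpoint}/"
    return make_request(s=_build_retrying_session(), r_type="POST", url=url,
                        payload={"json": json_payload}, loaders=loaders, time_out=time_out)

# query_agent(payload) would then reduce to _post_tdag_gpt("query_agent", payload)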
|
2124
|
+
|
2125
|
+
|
2126
|
+
def add_created_object_to_jobrun(model_name: str, app_label: str, object_id: int,
|
2127
|
+
timeout: Optional[int] = None) -> dict:
|
2128
|
+
"""
|
2129
|
+
Logs a new object that was created by this JobRun instance.
|
2130
|
+
|
2131
|
+
Args:
|
2132
|
+
model_name: The string name of the created model (e.g., "Project").
|
2133
|
+
app_label: The Django app label where the model is defined (e.g., "pod_manager").
|
2134
|
+
object_id: The primary key of the created object instance.
|
2135
|
+
timeout: Optional request timeout in seconds.
|
2136
|
+
|
2137
|
+
Returns:
|
2138
|
+
A dictionary representing the created record.
|
2139
|
+
"""
|
2140
|
+
url = TDAG_ENDPOINT + f"/orm/api/pods/job-run/{os.getenv('JOB_RUN_ID')}/add_created_object/"
|
2141
|
+
s = requests.Session()
|
2142
|
+
payload = {
|
2143
|
+
"json": {
|
2144
|
+
"app_label": app_label,
|
2145
|
+
"model_name": model_name,
|
2146
|
+
"object_id": object_id
|
2147
|
+
}
|
2148
|
+
}
|
2149
|
+
r = make_request(
|
2150
|
+
s=s,
|
2151
|
+
loaders=loaders,
|
2152
|
+
r_type="POST",
|
2153
|
+
url=url,
|
2154
|
+
payload=payload,
|
2155
|
+
time_out=timeout
|
2156
|
+
)
|
2157
|
+
if r.status_code not in [200, 201]:
|
2158
|
+
raise Exception(f"Failed to add created object: {r.status_code} - {r.text}")
|
2159
|
+
return r.json()
|
2160
|
+
|
2161
|
+
|
2162
|
+
class Artifact(BasePydanticModel, BaseObjectOrm):
|
2163
|
+
id: Optional[int]
|
2164
|
+
name: str
|
2165
|
+
created_by_resource_name: str
|
2166
|
+
bucket_name: str
|
2167
|
+
content: Any
|
2168
|
+
|
2169
|
+
@classmethod
|
2170
|
+
def upload_file(cls, filepath, name, created_by_resource_name, bucket_name=None):
|
2171
|
+
bucket_name = bucket_name or "default_bucket"
|
2172
|
+
return cls.get_or_create(filepath=filepath, name=name, created_by_resource_name=created_by_resource_name,
|
2173
|
+
bucket_name=bucket_name)
|
2174
|
+
|
2175
|
+
@classmethod
|
2176
|
+
def get_or_create(cls, filepath, name, created_by_resource_name, bucket_name):
|
2177
|
+
url = cls.get_object_url() + "/get_or_create/"
|
2178
|
+
s = cls.build_session()
|
2179
|
+
with open(filepath, "rb") as f:
|
2180
|
+
data = {
|
2181
|
+
"name": name,
|
2182
|
+
"created_by_resource_name": created_by_resource_name,
|
2183
|
+
"bucket_name": bucket_name if bucket_name else "default_bucket",
|
2184
|
+
}
|
2185
|
+
files = {"content": (str(filepath), f, "application/pdf")}
|
2186
|
+
payload = {
|
2187
|
+
"json": data,
|
2188
|
+
"files": files
|
2189
|
+
}
|
2190
|
+
r = make_request(s=s, loaders=cls.LOADERS, r_type="POST", url=url, payload=payload)
|
2191
|
+
|
2192
|
+
if r.status_code not in [200, 201]:
|
2193
|
+
raise Exception(f"Failed to get artifact: {r.status_code} - {r.text}")
|
2194
|
+
|
2195
|
+
return cls(**r.json())
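# Illustrative sketch, not part of the diff: registering a locally generated file as an
# Artifact. The path, names and bucket are made-up example values; upload_file simply
# forwards to get_or_create as defined above.
artifact = Artifact.upload_file(
    filepath="reports/monthly_report.pdf",
    name="monthly_report",
    created_by_resource_name="report_builder",
    bucket_name=None,  # falls back to "default_bucket"
)
print(artifact.id, artifact.bucket_name)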
|
2196
|
+
|
2197
|
+
|
2198
|
+
try:
|
2199
|
+
POD_PROJECT = Project.get_user_default_project()
|
2200
|
+
except Exception as e:
|
2201
|
+
POD_PROJECT = None
|
2202
|
+
logger.exception(f"Could not retrive pod project {e}")
|
2203
|
+
raise e
|
2204
|
+
|
2205
|
+
|
2206
|
+
class PodDataSource:
|
2207
|
+
def set_remote_db(self):
|
2208
|
+
self.data_source = POD_PROJECT.data_source
|
2209
|
+
logger.info(f"Set remote data source to {self.data_source.related_resource}")
|
2210
|
+
|
2211
|
+
if self.data_source.related_resource.status != "AVAILABLE":
|
2212
|
+
raise Exception(f"Project Database {self.data_source} is not available")
|
2213
|
+
|
2214
|
+
@staticmethod
|
2215
|
+
def _get_duck_db():
|
2216
|
+
host_uid = bios_uuid()
|
2217
|
+
data_source = DataSource.get_or_create_duck_db(
|
2218
|
+
display_name=f"DuckDB_{host_uid}",
|
2219
|
+
host_mac_address=host_uid
|
2220
|
+
)
|
2221
|
+
return data_source
|
2222
|
+
|
2223
|
+
@property
|
2224
|
+
def is_local_duck_db(self):
|
2225
|
+
return self.data_source.related_resource.class_type == DUCK_DB
|
2226
|
+
|
2227
|
+
def set_local_db(self):
|
2228
|
+
data_source = self._get_duck_db()
|
2229
|
+
|
2230
|
+
duckdb_dynamic_data_source = DynamicTableDataSource.get_or_create_duck_db(
|
2231
|
+
related_resource=data_source.id,
|
2232
|
+
)
|
2233
|
+
|
2234
|
+
# drop local tables that are no longer registered in the backend (they have probably been deleted)
|
2235
|
+
remote_tables = DynamicTableMetaData.filter(data_source__id=duckdb_dynamic_data_source.id, list_tables=True)
|
2236
|
+
remote_table_names = [t.table_name for t in remote_tables]
|
2237
|
+
from mainsequence.client.data_sources_interfaces.duckdb import DuckDBInterface
|
2238
|
+
from mainsequence.client.utils import DataFrequency
|
2239
|
+
db_interface = DuckDBInterface()
|
2240
|
+
local_table_names = db_interface.list_tables()
|
2241
|
+
|
2242
|
+
tables_to_delete_locally = set(local_table_names) - set(remote_table_names)
|
2243
|
+
for table_name in tables_to_delete_locally:
|
2244
|
+
logger.debug(f"Deleting table in local duck db {table_name}")
|
2245
|
+
db_interface.drop_table(table_name)
|
2246
|
+
|
2247
|
+
tables_to_delete_remotely = set(remote_table_names) - set(local_table_names)
|
2248
|
+
for remote_table in remote_tables:
|
2249
|
+
if remote_table.table_name in tables_to_delete_remotely:
|
2250
|
+
logger.debug(f"Deleting table remotely {remote_table.table_name}")
|
2251
|
+
if remote_table.protect_from_deletion:
|
2252
|
+
remote_table.patch(protect_from_deletion=False)
|
2253
|
+
|
2254
|
+
remote_table.delete()
|
2255
|
+
|
2256
|
+
self.data_source = duckdb_dynamic_data_source
|
2257
|
+
|
2258
|
+
physical_ds = self.data_source.related_resource
|
2259
|
+
banner = (
|
2260
|
+
"─" * 40 + "\n"
|
2261
|
+
f"LOCAL: {physical_ds.display_name} (engine={physical_ds.class_type})\n\n"
|
2262
|
+
"import duckdb, pathlib\n"
|
2263
|
+
f"path = pathlib.Path('{db_interface.db_path}') / 'duck_meta.duckdb'\n"
|
2264
|
+
"conn = duckdb.connect(':memory:')\n"
|
2265
|
+
"conn.execute(f\"ATTACH '{path}' AS ro (READ_ONLY)\")\n"
|
2266
|
+
"conn.execute('INSTALL ui; LOAD ui; CALL start_ui();')\n"
|
2267
|
+
+ "─" * 40
|
2268
|
+
)
|
2269
|
+
logger.info(banner)
|
2270
|
+
|
2271
|
+
def __repr__(self):
|
2272
|
+
return f"{self.data_source.related_resource}"
|
2273
|
+
|
2274
|
+
|
2275
|
+
SessionDataSource = PodDataSource()
|
2276
|
+
SessionDataSource.set_remote_db()
|