tdfs4ds 0.2.4.41__py3-none-any.whl → 0.2.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +214 -38
- tdfs4ds/feature_store/feature_data_processing.py +7 -5
- tdfs4ds/genai/__init__.py +27 -0
- tdfs4ds/genai/documentation.py +1878 -0
- tdfs4ds/process_store/process_store_catalog_management.py +77 -24
- tdfs4ds/utils/filter_management.py +40 -13
- tdfs4ds/utils/time_management.py +28 -11
- {tdfs4ds-0.2.4.41.dist-info → tdfs4ds-0.2.5.1.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.41.dist-info → tdfs4ds-0.2.5.1.dist-info}/RECORD +11 -17
- tdfs/__init__.py +0 -1
- tdfs/data/curves.csv +0 -5086
- tdfs/datasets.py +0 -27
- tdfs/feature_store.py +0 -723
- tdfs4ds/feature_engineering.py +0 -152
- tdfs4ds/feature_store.py +0 -1529
- tdfs4ds/process_store.py +0 -387
- tdfs4ds/utils.py +0 -579
- {tdfs4ds-0.2.4.41.dist-info → tdfs4ds-0.2.5.1.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.41.dist-info → tdfs4ds-0.2.5.1.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
@@ -1,4 +1,5 @@
-__version__ = '0.2.4.41'
+__version__ = '0.2.5.1'
+import difflib
 import logging
 import json
 
@@ -23,6 +24,7 @@ logger = logging.getLogger(__name__)
 from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
 from tdfs4ds.process_store.process_followup import follow_up_report
 from tdfs4ds.dataset.dataset_catalog import DatasetCatalog, Dataset
+from . import genai
 
 DATA_DOMAIN = None
 SCHEMA = None
@@ -55,6 +57,18 @@ FEATURE_PARTITION_EACH = 1
 
 VARCHAR_SIZE = 1024
 
+INSTRUCT_MODEL_URL = None
+INSTRUCT_MODEL_API_KEY = None
+INSTRUCT_MODEL_MODEL = None
+INSTRUCT_MODEL_PROVIDER = None
+
+DOCUMENTATION_PROCESS_BUSINESS_LOGIC = 'FS_PROCESS_DOCUMENTATION_BUSINESS_LOGIC'
+DOCUMENTATION_PROCESS_FEATURES = 'FS_PROCESS_DOCUMENTATION_FEATURES'
+DOCUMENTATION_PROCESS_BUSINESS_LOGIC_VIEW = 'FS_V_PROCESS_DOCUMENTATION_BUSINESS_LOGIC'
+DOCUMENTATION_PROCESS_FEATURES_VIEW = 'FS_V_PROCESS_DOCUMENTATION_FEATURES'
+DOCUMENTATION_PROCESS_EXPLAIN = 'FS_PROCESS_DOCUMENTATION_EXPLAIN'
+DOCUMENTATION_PROCESS_EXPLAIN_VIEW = 'FS_V_PROCESS_DOCUMENTATION_EXPLAIN'
+
 import warnings
 warnings.filterwarnings('ignore')
 
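These new module-level settings appear to hold the LLM endpoint configuration used by the tdfs4ds.genai documentation helpers, plus the names of the documentation tables and views. A minimal sketch of setting them, assuming an OpenAI-compatible endpoint; the provider, model, URL and key values below are purely illustrative and not defined by this diff:

import tdfs4ds

tdfs4ds.INSTRUCT_MODEL_PROVIDER = 'openai'                      # illustrative value
tdfs4ds.INSTRUCT_MODEL_MODEL    = 'gpt-4o-mini'                 # illustrative value
tdfs4ds.INSTRUCT_MODEL_URL      = 'https://llm.example.com/v1'  # illustrative value
tdfs4ds.INSTRUCT_MODEL_API_KEY  = 'REDACTED'                    # illustrative value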
@@ -152,6 +166,11 @@ def setup(database, if_exists='fail'):
     logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
 
     logger_safe("info", "Setup complete.")
+    try:
+        tdfs4ds.genai.documentations_tables_creation()
+        logger_safe("info", "Documentation tables created successfully.")
+    except Exception as e:
+        logger_safe("error", "Documentation tables creation failed: %s", str(e).split('\n')[0])
     return
 
 
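With this change, setup also provisions the documentation tables, logging (rather than raising) any failure. A minimal usage sketch, assuming an open Teradata connection; the database name is a placeholder:

import tdfs4ds

tdfs4ds.setup(database='FEATURE_STORE_DB')   # placeholder database name; if_exists defaults to 'fail'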
@@ -165,6 +184,9 @@ def connect(
     feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
     process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
     dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
+    documentation_process_business_logic = tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC,
+    documentation_process_features = tdfs4ds.DOCUMENTATION_PROCESS_FEATURES,
+    documentation_process_explain = tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN,
     create_if_missing = False
 ):
     if database is None:
@@ -179,20 +201,31 @@ def connect(
     distrib_exists = data_distribution_name.lower() in tables
     filter_manager_exists = filter_manager_name.lower() in tables
     followup_name_exists = followup_name.lower() in tables
+    documentation_process_business_logic_exist = documentation_process_business_logic.lower() in tables
+    documentation_process_features_exist = documentation_process_features.lower() in tables
+    documentation_process_explain_exist = documentation_process_explain.lower() in tables
 
-    if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
+
+    if not (feature_exists and process_exists and distrib_exists and filter_manager_exists and documentation_process_business_logic_exist and documentation_process_features_exist):
         if not create_if_missing:
             logger_safe("warning", "Feature store components missing and create_if_missing=False")
             return False
         logger_safe("info", "Missing components detected; creating missing parts...")
         if not feature_exists:
+            logger_safe("info", "Creating feature catalog: %s", feature_catalog_name)
             tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
         if not process_exists:
+            logger_safe("info", "Creating process catalog: %s", process_catalog_name)
             tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
         if not distrib_exists:
+            logger_safe("info", "Creating data distribution table: %s", data_distribution_name)
             tdfs4ds.data_distribution.data_distribution_catalog_creation()
         if not filter_manager_exists:
+            logger_safe("info", "Creating filter manager table: %s", filter_manager_name)
             tdfs4ds.filter_manager.filter_manager_catalog_creation()
+        if not documentation_process_business_logic_exist or not documentation_process_features_exist or not documentation_process_explain_exist:
+            logger_safe("info", "Creating documentation tables.")
+            tdfs4ds.genai.documentation_tables_creation()
 
     if not followup_name_exists:
         logger_safe("info", "Creating follow-up table: %s", followup_name)
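connect now also checks for the documentation tables and, when create_if_missing=True, creates them alongside the other catalogs. A minimal usage sketch; the database name is a placeholder:

import tdfs4ds

connected = tdfs4ds.connect(database='FEATURE_STORE_DB', create_if_missing=True)  # placeholder database name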
@@ -229,12 +262,111 @@ def connect(
         schema_name=tdfs4ds.SCHEMA,
         object_type='table'
     )
+
+    query_data_domain = f"""
+    SELECT DISTINCT DATA_DOMAIN
+    FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+    UNION
+    SELECT DISTINCT DATA_DOMAIN
+    FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
+    """
+    data_domains = tdml.DataFrame.from_query(query_data_domain).to_pandas()['DATA_DOMAIN'].tolist()
+    logger_safe("info", "Data domains in feature store: %s", data_domains)
 
     tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
     logger_safe("info", "Connected to feature store successfully.")
     return True
 
+def get_data_domains(verbose=True):
+    """
+    Retrieve and display all data domains available in the feature store.
+    This function queries the feature store to obtain a list of all distinct data domains
+    that have been defined within the system. It combines data domains from both the process
+    catalog and the feature catalog, ensuring a comprehensive overview. The current data
+    domain in use is highlighted for easy identification.
+    Parameters:
+    - verbose (bool): If True, prints the list of data domains with the current one marked.
+    Returns:
+    - str: The current data domain in use.
+    """
 
+    query_data_domain = f"""
+    SELECT DISTINCT DATA_DOMAIN
+    FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+    UNION
+    SELECT DISTINCT DATA_DOMAIN
+    FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
+    """
+    data_domains = tdml.DataFrame.from_query(query_data_domain).to_pandas()['DATA_DOMAIN'].tolist()
+
+    if verbose:
+        print("Data Domains in Feature Store:")
+        for d in data_domains:
+            if d != tdfs4ds.DATA_DOMAIN:
+                print('\t'+d)
+            else:
+                print('*\t'+d)
+        if tdfs4ds.DATA_DOMAIN not in data_domains and tdfs4ds.DATA_DOMAIN is not None:
+            print("\nCurrent data domain (%s) not available yet in feature store. It may be a new one" % tdfs4ds.DATA_DOMAIN)
+            return
+    return data_domains
+
+def select_data_domain(data_domain):
+    """
+    Set the active data domain for feature store operations.
+
+    This function allows users to specify which data domain should be considered
+    as the current context for subsequent feature store operations. By setting
+    the data domain, users can ensure that all feature queries, registrations,
+    and other interactions with the feature store are scoped appropriately.
+    This is particularly useful in environments where multiple data domains
+    exist, allowing for clear separation and organization of features.
+
+    Parameters:
+    - data_domain (str): The name of the data domain to set as active.
+
+    Returns:
+    - str: The data domain that has been set as active.
+    """
+    data_domains = get_data_domains(verbose=False)
+    if data_domain not in data_domains:
+        logger_safe("error", "Data domain '%s' not found in feature store.", data_domain)
+        raise ValueError(f"Data domain '{data_domain}' not found in feature store.")
+    # suggest a data domain closest to the requested one
+    closest_domain = difflib.get_close_matches(data_domain, data_domains, n=1)
+    if data_domain in data_domains:
+        tdfs4ds.DATA_DOMAIN = data_domain
+    elif closest_domain:
+        logger_safe("info", "Did you mean '%s'?", closest_domain[0])
+        return
+    tdfs4ds.DATA_DOMAIN = data_domain
+    logger_safe("info", "Data domain set to: %s", data_domain)
+    return
+
+def create_data_domain(data_domain):
+    """
+    Create a new data domain in the feature store.
+
+    This function facilitates the creation of a new data domain within the feature store.
+    A data domain serves as a logical grouping for features, allowing for better organization
+    and management. By creating a new data domain, users can segregate features based on
+    specific criteria, such as business units, projects, or data types. This helps in
+    maintaining clarity and structure within the feature store, especially in environments
+    with diverse datasets and use cases.
+
+    Parameters:
+    - data_domain (str): The name of the new data domain to be created.
+
+    Returns:
+    - str: The name of the newly created data domain.
+    """
+    existing_domains = get_data_domains(verbose=False)
+    if data_domain in existing_domains:
+        logger_safe("warning", "Data domain '%s' already exists in feature store.", data_domain)
+        return data_domain
+    tdfs4ds.DATA_DOMAIN = data_domain
+    logger_safe("info", "Data domain '%s' created in locally.", data_domain)
+    return
 
 
 def feature_catalog():
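A short usage sketch of the three new data-domain helpers, assuming connect has already succeeded; the domain names are illustrative:

import tdfs4ds

tdfs4ds.get_data_domains()               # prints all domains, the active one marked with '*'
tdfs4ds.select_data_domain('CHURN')      # illustrative domain name; must already exist in the store
tdfs4ds.create_data_domain('MARKETING')  # illustrative domain name; only set locally if it is new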
@@ -287,7 +419,7 @@ def get_dataset_entity(dataset_id = None):
 def get_dataset_features(dataset_id = None):
     return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
-def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None):
+def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None, dataset_view_name=None):
     """
     Executes a specific process from the feature store identified by the process ID.
     Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
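The new dataset_view_name argument is threaded from run down to _upload_features and build_dataset. A minimal sketch of the updated call, with a placeholder process id and an illustrative view name:

dataset = tdfs4ds.run(
    process_id = '0000-aaaa-1111',             # placeholder process id
    return_dataset = True,
    dataset_view_name = 'MY_VALIDATION_VIEW'   # illustrative view name, new in 0.2.5.1
)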
@@ -383,7 +515,8 @@ def run(process_id, return_dataset=False, force_compute=False, force_varchar_len
         entity_null_substitute=entity_null_substitute,
         process_id=process_id,
         force_compute=force_compute,
-        force_varchar_length=force_varchar_length
+        force_varchar_length=force_varchar_length,
+        dataset_view_name = dataset_view_name
     )
 
     # Handling 'tdstone2 view' process type
@@ -413,7 +546,8 @@ def upload_features(
     filtermanager=None,
     entity_null_substitute={},
     force_compute=True,
-    force_varchar_length=1024
+    force_varchar_length=1024,
+    dataset_view_name = None
 ):
     """
     Uploads feature data from a DataFrame to the feature store for a specified entity.
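upload_features gains the same optional dataset_view_name argument. A sketch of how it might be passed; df, the entity column and the feature names follow the existing upload_features API and are illustrative here, not taken from this diff:

tdfs4ds.upload_features(
    df,                                           # a teradataml DataFrame (illustrative)
    entity_id = ['customer_id'],                  # illustrative entity column(s)
    feature_names = ['tx_count', 'avg_basket'],   # illustrative feature columns
    dataset_view_name = 'CUSTOMER_FEATURES_V'     # illustrative view name (new in 0.2.5.1)
)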
@@ -518,19 +652,21 @@ def upload_features(
     try:
         if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
             dataset = run(
-                process_id=process_id,
-                return_dataset=True,
-                force_compute=force_compute,
-                force_varchar_length=force_varchar_length
+                process_id = process_id,
+                return_dataset = True,
+                force_compute = force_compute,
+                force_varchar_length = force_varchar_length,
+                dataset_view_name = dataset_view_name
             )
             logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
             return dataset
         else:
             run(
-                process_id=process_id,
-                return_dataset=False,
-                force_compute=force_compute,
-                force_varchar_length=force_varchar_length
+                process_id = process_id,
+                return_dataset = False,
+                force_compute = force_compute,
+                force_varchar_length = force_varchar_length,
+                dataset_view_name = dataset_view_name
             )
             logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
             return
@@ -539,10 +675,10 @@ def upload_features(
         # Keep your existing follow-up close behavior, but ensure the error is logged.
         try:
             tdfs4ds.process_store.process_followup.followup_close(
-                run_id
-                process_type
-                process_id
-                status
+                run_id = tdfs4ds.RUN_ID,
+                process_type = tdfs4ds.PROCESS_TYPE,
+                process_id = process_id,
+                status = 'FAILED,' + str(e).split('\n')[0]
             )
         finally:
             logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
@@ -557,11 +693,12 @@ def upload_features(
 
 def _upload_features(
     df, entity_id, feature_names,
-    feature_versions=FEATURE_VERSION_DEFAULT,
-    primary_index=None, partitioning='',
-    filtermanager=None, entity_null_substitute={},
-    process_id=None, force_compute=False,
-    force_varchar_length=None
+    feature_versions = FEATURE_VERSION_DEFAULT,
+    primary_index = None, partitioning = '',
+    filtermanager = None, entity_null_substitute = {},
+    process_id = None, force_compute = False,
+    force_varchar_length = None,
+    dataset_view_name = None
 ):
     """
     Uploads a set of features into the Feature Store for a given entity.
@@ -708,6 +845,7 @@ def _upload_features(
     ]
 
     if filtermanager is None:
+        dataset_created = False
         do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
         if not do_compute and not force_compute:
             logger_safe(
@@ -730,8 +868,12 @@ def _upload_features(
             entity_null_substitute=entity_null_substitute,
             partitioning=partitioning
         )
-        store_feature(entity_id, volatile_table, entity_null_substitute,
+
+        count_rows = store_feature(entity_id, volatile_table, entity_null_substitute,
                       primary_index, partitioning, features_infos)
+
+
+
         apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
         tdfs4ds.process_store.process_followup.followup_close(
@@ -740,6 +882,20 @@ def _upload_features(
             process_id=process_id
         )
         logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
+        # Build dataset for validation if enabled
+        if tdfs4ds.BUILD_DATASET_AT_UPLOAD or dataset_view_name is not None:
+            logger_safe("info", "Building dataset for validation...")
+            try:
+                dataset = build_dataset(
+                    entity_id, selected_features,
+                    view_name = dataset_view_name
+                )
+                dataset_created = True
+            except Exception as e:
+                logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+                logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+        else:
+            logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
 
     except Exception as e:
         logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
@@ -762,7 +918,7 @@ def _upload_features(
             unit="filter",
             leave=False
         )
-
+        dataset_created = False
         for i in pbar:
             filter_id = i + 1
             filtermanager.update(filter_id)
@@ -834,7 +990,7 @@ def _upload_features(
                     partitioning = partitioning
                 )
 
-                store_feature(entity_id, volatile_table, entity_null_substitute,
+                count_rows = store_feature(entity_id, volatile_table, entity_null_substitute,
                               primary_index, partitioning, features_infos)
 
                 something_computed = True
@@ -846,6 +1002,21 @@ def _upload_features(
                     filtermanager = filtermanager
                 )
 
+                # Build dataset for validation if enabled
+                if (tdfs4ds.BUILD_DATASET_AT_UPLOAD or dataset_view_name is not None) and dataset_created==False:
+                    logger_safe("info", "Building dataset for validation...")
+                    try:
+                        dataset = build_dataset(
+                            entity_id, selected_features,
+                            view_name = dataset_view_name
+                        )
+                        dataset_created = True
+                    except Exception as e:
+                        logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+                        logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+                else:
+                    logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
+
             except Exception as e:
                 logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
                 tdfs4ds.process_store.process_followup.followup_close(
@@ -860,19 +1031,24 @@ def _upload_features(
     if something_computed:
         apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if dataset_created == False and tdfs4ds.BUILD_DATASET_AT_UPLOAD and dataset_view_name == None:
+        logger_safe("info", "Building dataset for validation...")
+        try:
+            dataset = build_dataset(
+                entity_id, selected_features,
+                view_name = dataset_view_name
+            )
+            return dataset
+        except Exception as e:
+            logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+            logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+    else:
+        if tdfs4ds.BUILD_DATASET_AT_UPLOAD == False:
+            logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
+        else:
+            return
+
+
     return
 
 
tdfs4ds/feature_store/feature_data_processing.py
CHANGED
@@ -287,10 +287,10 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
     # Execute: create volatile table and test unicity
     try:
         tdml.DataFrame.from_query(nested_query).to_sql(
-            table_name=volatile_table_name,
-            temporary=True,
-            primary_index=primary_index.split(','),
-            if_exists='replace'
+            table_name = volatile_table_name,
+            temporary = True,
+            primary_index = primary_index.split(','),
+            if_exists = 'replace'
         )
         nb_duplicates = tdml.execute_sql(query_test_unicity).fetchall()[0][0]
         if nb_duplicates is not None and nb_duplicates > 0:
@@ -731,6 +731,8 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
         ).fetchall(),
         columns=['NB_ROWS']
     )
+    # log the number of rows obtained after transformations
+    logger_safe("info", f"{count_features.NB_ROWS.values[0]} rows of features")
 
     if getattr(tdfs4ds, "DEBUG_MODE", False):
         logger_safe("debug", "count_features=%s", count_features)
@@ -858,7 +860,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
         logger_safe("exception", "Feature storage (merge) failed: %s", str(e).split('\n')[0])
         raise
 
-    return
+    return count_features.NB_ROWS.values[0]
 
 def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},primary_index=None,
                   partitioning='', features_infos = None, **kwargs):
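Since _store_feature_merge now returns the row count instead of None, callers such as _upload_features can capture it, as the tdfs4ds/__init__.py hunks above do. A minimal sketch; the logging line is illustrative, not part of the diff:

count_rows = store_feature(entity_id, volatile_table, entity_null_substitute,
                           primary_index, partitioning, features_infos)
logger_safe("info", "%s feature rows stored", count_rows)   # illustrative follow-up use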
tdfs4ds/genai/__init__.py
ADDED
@@ -0,0 +1,27 @@
+from .documentation import (
+    document_sql_query_columns,
+    document_process,
+    documentation_tables_creation,
+    document_sql_query_explain,
+    build_explain_documentation_chain,
+    run_explain_documentation,
+    build_sql_documentation_chain,
+    run_sql_documentation,
+    build_llm,
+    get_the_explain,
+    display_process_info
+)
+
+__all__ = [
+    "document_sql_query_columns",
+    "document_process",
+    "documentation_tables_creation",
+    "document_sql_query_explain",
+    "build_explain_documentation_chain",
+    "run_explain_documentation",
+    "build_sql_documentation_chain",
+    "run_sql_documentation",
+    "build_llm",
+    "get_the_explain",
+    "display_process_info"
+]