tdfs4ds 0.2.4.41__py3-none-any.whl → 0.2.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,4 +1,5 @@
- __version__ = '0.2.4.41'
+ __version__ = '0.2.5.1'
+ import difflib
  import logging
  import json
 
@@ -23,6 +24,7 @@ logger = logging.getLogger(__name__)
  from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
  from tdfs4ds.process_store.process_followup import follow_up_report
  from tdfs4ds.dataset.dataset_catalog import DatasetCatalog, Dataset
+ from . import genai
 
  DATA_DOMAIN = None
  SCHEMA = None
@@ -55,6 +57,18 @@ FEATURE_PARTITION_EACH = 1
 
  VARCHAR_SIZE = 1024
 
+ INSTRUCT_MODEL_URL = None
+ INSTRUCT_MODEL_API_KEY = None
+ INSTRUCT_MODEL_MODEL = None
+ INSTRUCT_MODEL_PROVIDER = None
+
+ DOCUMENTATION_PROCESS_BUSINESS_LOGIC = 'FS_PROCESS_DOCUMENTATION_BUSINESS_LOGIC'
+ DOCUMENTATION_PROCESS_FEATURES = 'FS_PROCESS_DOCUMENTATION_FEATURES'
+ DOCUMENTATION_PROCESS_BUSINESS_LOGIC_VIEW = 'FS_V_PROCESS_DOCUMENTATION_BUSINESS_LOGIC'
+ DOCUMENTATION_PROCESS_FEATURES_VIEW = 'FS_V_PROCESS_DOCUMENTATION_FEATURES'
+ DOCUMENTATION_PROCESS_EXPLAIN = 'FS_PROCESS_DOCUMENTATION_EXPLAIN'
+ DOCUMENTATION_PROCESS_EXPLAIN_VIEW = 'FS_V_PROCESS_DOCUMENTATION_EXPLAIN'
+
  import warnings
  warnings.filterwarnings('ignore')
 
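The new INSTRUCT_MODEL_* globals hold the connection details of the instruct LLM used by the genai documentation helpers, and the DOCUMENTATION_PROCESS_* constants name the tables and views where generated documentation is stored. A minimal sketch of how a session might configure them; the endpoint, key and model values are placeholders, only the attribute names come from this diff:

    import tdfs4ds

    # Placeholder values: point the documentation helpers at an LLM endpoint.
    tdfs4ds.INSTRUCT_MODEL_PROVIDER = 'openai'
    tdfs4ds.INSTRUCT_MODEL_URL      = 'https://llm.example.com/v1'
    tdfs4ds.INSTRUCT_MODEL_API_KEY  = 'my-api-key'
    tdfs4ds.INSTRUCT_MODEL_MODEL    = 'my-instruct-model'

    # Default names of the documentation tables and views, as defined above.
    print(tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC, tdfs4ds.DOCUMENTATION_PROCESS_FEATURES_VIEW)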
@@ -152,6 +166,11 @@ def setup(database, if_exists='fail'):
      logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
 
      logger_safe("info", "Setup complete.")
+     try:
+         tdfs4ds.genai.documentations_tables_creation()
+         logger_safe("info", "Documentation tables created successfully.")
+     except Exception as e:
+         logger_safe("error", "Documentation tables creation failed: %s", str(e).split('\n')[0])
      return
 
 
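setup() now also tries to create the genai documentation tables and only logs a failure instead of raising. A minimal usage sketch, assuming an open teradataml connection and a database you own (the name is a placeholder):

    import tdfs4ds

    # Creates the catalogs, the dataset catalog and, with this release, the documentation tables.
    tdfs4ds.setup(database='MY_FS_DB')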
@@ -165,6 +184,9 @@ def connect(
      feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
      process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
      dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
+     documentation_process_business_logic = tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC,
+     documentation_process_features = tdfs4ds.DOCUMENTATION_PROCESS_FEATURES,
+     documentation_process_explain = tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN,
      create_if_missing = False
  ):
      if database is None:
@@ -179,20 +201,31 @@ def connect(
      distrib_exists = data_distribution_name.lower() in tables
      filter_manager_exists = filter_manager_name.lower() in tables
      followup_name_exists = followup_name.lower() in tables
+     documentation_process_business_logic_exist = documentation_process_business_logic.lower() in tables
+     documentation_process_features_exist = documentation_process_features.lower() in tables
+     documentation_process_explain_exist = documentation_process_explain.lower() in tables
 
-     if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
+
+     if not (feature_exists and process_exists and distrib_exists and filter_manager_exists and documentation_process_business_logic_exist and documentation_process_features_exist):
          if not create_if_missing:
              logger_safe("warning", "Feature store components missing and create_if_missing=False")
              return False
          logger_safe("info", "Missing components detected; creating missing parts...")
          if not feature_exists:
+             logger_safe("info", "Creating feature catalog: %s", feature_catalog_name)
              tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
          if not process_exists:
+             logger_safe("info", "Creating process catalog: %s", process_catalog_name)
              tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
          if not distrib_exists:
+             logger_safe("info", "Creating data distribution table: %s", data_distribution_name)
              tdfs4ds.data_distribution.data_distribution_catalog_creation()
          if not filter_manager_exists:
+             logger_safe("info", "Creating filter manager table: %s", filter_manager_name)
              tdfs4ds.filter_manager.filter_manager_catalog_creation()
+         if not documentation_process_business_logic_exist or not documentation_process_features_exist or not documentation_process_explain_exist:
+             logger_safe("info", "Creating documentation tables.")
+             tdfs4ds.genai.documentation_tables_creation()
 
      if not followup_name_exists:
          logger_safe("info", "Creating follow-up table: %s", followup_name)
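connect() now also checks for the documentation tables and, when create_if_missing=True, creates them together with the other missing components. A short sketch, with a placeholder database name:

    import tdfs4ds

    # Returns False when components are missing and create_if_missing is left at False.
    ok = tdfs4ds.connect(database='MY_FS_DB', create_if_missing=True)
    if not ok:
        raise RuntimeError('feature store components are missing')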
@@ -229,12 +262,111 @@ def connect(
          schema_name=tdfs4ds.SCHEMA,
          object_type='table'
      )
+
+     query_data_domain = f"""
+     SELECT DISTINCT DATA_DOMAIN
+     FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+     UNION
+     SELECT DISTINCT DATA_DOMAIN
+     FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
+     """
+     data_domains = tdml.DataFrame.from_query(query_data_domain).to_pandas()['DATA_DOMAIN'].tolist()
+     logger_safe("info", "Data domains in feature store: %s", data_domains)
 
      tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
      logger_safe("info", "Connected to feature store successfully.")
      return True
 
+ def get_data_domains(verbose=True):
+     """
+     Retrieve and display all data domains available in the feature store.
+     This function queries the feature store to obtain a list of all distinct data domains
+     that have been defined within the system. It combines data domains from both the process
+     catalog and the feature catalog, ensuring a comprehensive overview. The current data
+     domain in use is highlighted for easy identification.
+     Parameters:
+     - verbose (bool): If True, prints the list of data domains with the current one marked.
+     Returns:
+     - str: The current data domain in use.
+     """
 
+     query_data_domain = f"""
+     SELECT DISTINCT DATA_DOMAIN
+     FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+     UNION
+     SELECT DISTINCT DATA_DOMAIN
+     FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
+     """
+     data_domains = tdml.DataFrame.from_query(query_data_domain).to_pandas()['DATA_DOMAIN'].tolist()
+
+     if verbose:
+         print("Data Domains in Feature Store:")
+         for d in data_domains:
+             if d != tdfs4ds.DATA_DOMAIN:
+                 print('\t'+d)
+             else:
+                 print('*\t'+d)
+         if tdfs4ds.DATA_DOMAIN not in data_domains and tdfs4ds.DATA_DOMAIN is not None:
+             print("\nCurrent data domain (%s) not available yet in feature store. It may be a new one" % tdfs4ds.DATA_DOMAIN)
+         return
+     return data_domains
+
+ def select_data_domain(data_domain):
+     """
+     Set the active data domain for feature store operations.
+
+     This function allows users to specify which data domain should be considered
+     as the current context for subsequent feature store operations. By setting
+     the data domain, users can ensure that all feature queries, registrations,
+     and other interactions with the feature store are scoped appropriately.
+     This is particularly useful in environments where multiple data domains
+     exist, allowing for clear separation and organization of features.
+
+     Parameters:
+     - data_domain (str): The name of the data domain to set as active.
+
+     Returns:
+     - str: The data domain that has been set as active.
+     """
+     data_domains = get_data_domains(verbose=False)
+     if data_domain not in data_domains:
+         logger_safe("error", "Data domain '%s' not found in feature store.", data_domain)
+         raise ValueError(f"Data domain '{data_domain}' not found in feature store.")
+     #suggest a data domain closest to the requested one
+     closest_domain = difflib.get_close_matches(data_domain, data_domains, n=1)
+     if data_domain in data_domains:
+         tdfs4ds.DATA_DOMAIN = data_domain
+     elif closest_domain:
+         logger_safe("info", "Did you mean '%s'?", closest_domain[0])
+         return
+     tdfs4ds.DATA_DOMAIN = data_domain
+     logger_safe("info", "Data domain set to: %s", data_domain)
+     return
+
+ def create_data_domain(data_domain):
+     """
+     Create a new data domain in the feature store.
+
+     This function facilitates the creation of a new data domain within the feature store.
+     A data domain serves as a logical grouping for features, allowing for better organization
+     and management. By creating a new data domain, users can segregate features based on
+     specific criteria, such as business units, projects, or data types. This helps in
+     maintaining clarity and structure within the feature store, especially in environments
+     with diverse datasets and use cases.
+
+     Parameters:
+     - data_domain (str): The name of the new data domain to be created.
+
+     Returns:
+     - str: The name of the newly created data domain.
+     """
+     existing_domains = get_data_domains(verbose=False)
+     if data_domain in existing_domains:
+         logger_safe("warning", "Data domain '%s' already exists in feature store.", data_domain)
+         return data_domain
+     tdfs4ds.DATA_DOMAIN = data_domain
+     logger_safe("info", "Data domain '%s' created in locally.", data_domain)
+     return
 
 
  def feature_catalog():
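The three new module-level helpers make the data-domain context explicit. A usage sketch with placeholder domain names:

    import tdfs4ds

    tdfs4ds.get_data_domains()                          # prints all domains, the current one starred
    domains = tdfs4ds.get_data_domains(verbose=False)   # returns the list instead of printing

    tdfs4ds.select_data_domain('SALES')      # raises ValueError if 'SALES' is not in the catalogs
    tdfs4ds.create_data_domain('MARKETING')  # only sets tdfs4ds.DATA_DOMAIN locally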
@@ -287,7 +419,7 @@ def get_dataset_entity(dataset_id = None):
  def get_dataset_features(dataset_id = None):
      return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
- def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None):
+ def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None, dataset_view_name=None):
      """
      Executes a specific process from the feature store identified by the process ID.
      Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
@@ -383,7 +515,8 @@ def run(process_id, return_dataset=False, force_compute=False, force_varchar_len
              entity_null_substitute=entity_null_substitute,
              process_id=process_id,
              force_compute=force_compute,
-             force_varchar_length=force_varchar_length
+             force_varchar_length=force_varchar_length,
+             dataset_view_name = dataset_view_name
          )
 
      # Handling 'tdstone2 view' process type
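run() now accepts an optional dataset_view_name that is forwarded to the ingestion path, so the validation dataset can be published under a chosen view name. A sketch with placeholder identifiers:

    import tdfs4ds

    dataset = tdfs4ds.run(
        process_id='a1b2c3d4-...-process-id',      # placeholder: an existing process id
        return_dataset=True,
        dataset_view_name='MY_VALIDATION_DATASET'  # new in this release
    )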
@@ -413,7 +546,8 @@ def upload_features(
      filtermanager=None,
      entity_null_substitute={},
      force_compute=True,
-     force_varchar_length=1024
+     force_varchar_length=1024,
+     dataset_view_name = None
  ):
      """
      Uploads feature data from a DataFrame to the feature store for a specified entity.
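upload_features() gains the same dataset_view_name pass-through. A sketch assuming its leading parameters are df, entity_id and feature_names, as in _upload_features further down (all names are placeholders):

    import tdfs4ds

    dataset = tdfs4ds.upload_features(
        df,                                       # a teradataml DataFrame holding the features
        entity_id=['CUSTOMER_ID'],                # assumed entity-key specification
        feature_names=['TOTAL_SPEND', 'NB_ORDERS'],
        dataset_view_name='CUSTOMER_FEATURES_V1'  # new in this release
    )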
@@ -518,19 +652,21 @@ def upload_features(
      try:
          if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
              dataset = run(
-                 process_id=process_id,
-                 return_dataset=True,
-                 force_compute=force_compute,
-                 force_varchar_length=force_varchar_length
+                 process_id = process_id,
+                 return_dataset = True,
+                 force_compute = force_compute,
+                 force_varchar_length = force_varchar_length,
+                 dataset_view_name = dataset_view_name
              )
              logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
              return dataset
          else:
              run(
-                 process_id=process_id,
-                 return_dataset=False,
-                 force_compute=force_compute,
-                 force_varchar_length=force_varchar_length
+                 process_id = process_id,
+                 return_dataset = False,
+                 force_compute = force_compute,
+                 force_varchar_length = force_varchar_length,
+                 dataset_view_name = dataset_view_name
              )
              logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
              return
@@ -539,10 +675,10 @@ def upload_features(
          # Keep your existing follow-up close behavior, but ensure the error is logged.
          try:
              tdfs4ds.process_store.process_followup.followup_close(
-                 run_id = tdfs4ds.RUN_ID,
-                 process_type = tdfs4ds.PROCESS_TYPE,
-                 process_id = process_id,
-                 status = 'FAILED,' + str(e).split('\n')[0]
+                 run_id = tdfs4ds.RUN_ID,
+                 process_type = tdfs4ds.PROCESS_TYPE,
+                 process_id = process_id,
+                 status = 'FAILED,' + str(e).split('\n')[0]
              )
          finally:
              logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
@@ -557,11 +693,12 @@ def upload_features(
 
  def _upload_features(
      df, entity_id, feature_names,
-     feature_versions=FEATURE_VERSION_DEFAULT,
-     primary_index=None, partitioning='',
-     filtermanager=None, entity_null_substitute={},
-     process_id=None, force_compute=False,
-     force_varchar_length=None
+     feature_versions = FEATURE_VERSION_DEFAULT,
+     primary_index = None, partitioning = '',
+     filtermanager = None, entity_null_substitute = {},
+     process_id = None, force_compute = False,
+     force_varchar_length = None,
+     dataset_view_name = None
  ):
      """
      Uploads a set of features into the Feature Store for a given entity.
@@ -708,6 +845,7 @@ def _upload_features(
      ]
 
      if filtermanager is None:
+         dataset_created = False
          do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
          if not do_compute and not force_compute:
              logger_safe(
@@ -730,8 +868,12 @@ def _upload_features(
                  entity_null_substitute=entity_null_substitute,
                  partitioning=partitioning
              )
-             store_feature(entity_id, volatile_table, entity_null_substitute,
+
+             count_rows = store_feature(entity_id, volatile_table, entity_null_substitute,
                  primary_index, partitioning, features_infos)
+
+
+
              apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
              tdfs4ds.process_store.process_followup.followup_close(
@@ -740,6 +882,20 @@ def _upload_features(
                  process_id=process_id
              )
              logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
+             # Build dataset for validation if enabled
+             if tdfs4ds.BUILD_DATASET_AT_UPLOAD or dataset_view_name is not None:
+                 logger_safe("info", "Building dataset for validation...")
+                 try:
+                     dataset = build_dataset(
+                         entity_id, selected_features,
+                         view_name = dataset_view_name
+                     )
+                     dataset_created = True
+                 except Exception as e:
+                     logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+                     logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+             else:
+                 logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
 
          except Exception as e:
              logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
@@ -762,7 +918,7 @@ def _upload_features(
              unit="filter",
              leave=False
          )
-
+         dataset_created = False
          for i in pbar:
              filter_id = i + 1
              filtermanager.update(filter_id)
@@ -834,7 +990,7 @@ def _upload_features(
                      partitioning = partitioning
                  )
 
-                 store_feature(entity_id, volatile_table, entity_null_substitute,
+                 count_rows = store_feature(entity_id, volatile_table, entity_null_substitute,
                      primary_index, partitioning, features_infos)
 
                  something_computed = True
@@ -846,6 +1002,21 @@ def _upload_features(
                      filtermanager = filtermanager
                  )
 
+                 # Build dataset for validation if enabled
+                 if (tdfs4ds.BUILD_DATASET_AT_UPLOAD or dataset_view_name is not None) and dataset_created==False:
+                     logger_safe("info", "Building dataset for validation...")
+                     try:
+                         dataset = build_dataset(
+                             entity_id, selected_features,
+                             view_name = dataset_view_name
+                         )
+                         dataset_created = True
+                     except Exception as e:
+                         logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+                         logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+                 else:
+                     logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
+
              except Exception as e:
                  logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
                  tdfs4ds.process_store.process_followup.followup_close(
@@ -860,19 +1031,24 @@ def _upload_features(
          if something_computed:
              apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
-         if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
-             logger_safe("info", "Building dataset for validation...")
-             try:
-                 return build_dataset(
-                     entity_id, selected_features,
-                     view_name=None,
-                     entity_null_substitute=entity_null_substitute
-                 )
-             except Exception as e:
-                 logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
-                 logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
-         else:
-             logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False)")
+         if dataset_created == False and tdfs4ds.BUILD_DATASET_AT_UPLOAD and dataset_view_name == None:
+             logger_safe("info", "Building dataset for validation...")
+             try:
+                 dataset = build_dataset(
+                     entity_id, selected_features,
+                     view_name = dataset_view_name
+                 )
+                 return dataset
+             except Exception as e:
+                 logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+                 logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+         else:
+             if tdfs4ds.BUILD_DATASET_AT_UPLOAD == False:
+                 logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
+             else:
+                 return
+
+
      return
 
 
@@ -287,10 +287,10 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
      # Execute: create volatile table and test unicity
      try:
          tdml.DataFrame.from_query(nested_query).to_sql(
-             table_name=volatile_table_name,
-             temporary=True,
-             primary_index=primary_index.split(','),
-             if_exists='replace'
+             table_name = volatile_table_name,
+             temporary = True,
+             primary_index = primary_index.split(','),
+             if_exists = 'replace'
          )
          nb_duplicates = tdml.execute_sql(query_test_unicity).fetchall()[0][0]
          if nb_duplicates is not None and nb_duplicates > 0:
@@ -731,6 +731,8 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
          ).fetchall(),
          columns=['NB_ROWS']
      )
+     # log the number of rows obtained after transformations
+     logger_safe("info", f"{count_features.NB_ROWS.values[0]} rows of features")
 
      if getattr(tdfs4ds, "DEBUG_MODE", False):
          logger_safe("debug", "count_features=%s", count_features)
@@ -858,7 +860,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
          logger_safe("exception", "Feature storage (merge) failed: %s", str(e).split('\n')[0])
          raise
 
-     return
+     return count_features.NB_ROWS.values[0]
 
  def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},primary_index=None,
                    partitioning='', features_infos = None, **kwargs):
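_store_feature_merge() now logs the number of merged feature rows and returns it, which is what the new count_rows assignments in _upload_features capture. A sketch of how a caller can use it, assuming store_feature propagates that return value:

    count_rows = store_feature(entity_id, volatile_table, entity_null_substitute,
                               primary_index, partitioning, features_infos)
    logger_safe("info", "Stored %s feature rows", count_rows)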
@@ -0,0 +1,27 @@
+ from .documentation import (
+     document_sql_query_columns,
+     document_process,
+     documentation_tables_creation,
+     document_sql_query_explain,
+     build_explain_documentation_chain,
+     run_explain_documentation,
+     build_sql_documentation_chain,
+     run_sql_documentation,
+     build_llm,
+     get_the_explain,
+     display_process_info
+ )
+
+ __all__ = [
+     "document_sql_query_columns",
+     "document_process",
+     "documentation_tables_creation",
+     "document_sql_query_explain",
+     "build_explain_documentation_chain",
+     "run_explain_documentation",
+     "build_sql_documentation_chain",
+     "run_sql_documentation",
+     "build_llm",
+     "get_the_explain",
+     "display_process_info"
+ ]
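Judging by its imports, this added file is the __init__ of the new genai subpackage; it re-exports the documentation helpers from tdfs4ds.genai.documentation. Only documentation_tables_creation is exercised elsewhere in this diff, so the other calls below are assumptions about the API:

    import tdfs4ds

    # Seen above: setup()/connect() call this to create the documentation tables.
    tdfs4ds.genai.documentation_tables_creation()

    # Assumed usage, signatures not shown in this diff:
    # llm = tdfs4ds.genai.build_llm()
    # tdfs4ds.genai.document_process(process_id='a1b2c3d4-...-process-id')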