teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release is flagged as potentially problematic on the registry.

Files changed (108)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +71 -0
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +51 -24
  6. teradataml/analytics/json_parser/utils.py +11 -17
  7. teradataml/automl/__init__.py +103 -48
  8. teradataml/automl/data_preparation.py +55 -37
  9. teradataml/automl/data_transformation.py +131 -69
  10. teradataml/automl/feature_engineering.py +117 -185
  11. teradataml/automl/feature_exploration.py +9 -2
  12. teradataml/automl/model_evaluation.py +13 -25
  13. teradataml/automl/model_training.py +214 -75
  14. teradataml/catalog/model_cataloging_utils.py +1 -1
  15. teradataml/clients/auth_client.py +133 -0
  16. teradataml/common/aed_utils.py +3 -2
  17. teradataml/common/constants.py +11 -6
  18. teradataml/common/garbagecollector.py +5 -0
  19. teradataml/common/messagecodes.py +3 -1
  20. teradataml/common/messages.py +2 -1
  21. teradataml/common/utils.py +6 -0
  22. teradataml/context/context.py +49 -29
  23. teradataml/data/advertising.csv +201 -0
  24. teradataml/data/bank_marketing.csv +11163 -0
  25. teradataml/data/bike_sharing.csv +732 -0
  26. teradataml/data/boston2cols.csv +721 -0
  27. teradataml/data/breast_cancer.csv +570 -0
  28. teradataml/data/customer_segmentation_test.csv +2628 -0
  29. teradataml/data/customer_segmentation_train.csv +8069 -0
  30. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
  31. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
  32. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
  33. teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
  34. teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
  35. teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
  36. teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
  37. teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
  38. teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
  39. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
  40. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
  41. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
  42. teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
  43. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
  44. teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
  45. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
  46. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
  47. teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
  48. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
  49. teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
  50. teradataml/data/glm_example.json +28 -1
  51. teradataml/data/housing_train_segment.csv +201 -0
  52. teradataml/data/insect2Cols.csv +61 -0
  53. teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
  54. teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
  55. teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
  56. teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
  57. teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
  58. teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
  59. teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
  60. teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
  61. teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
  62. teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
  63. teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
  64. teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
  65. teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
  66. teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
  67. teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
  68. teradataml/data/kmeans_example.json +5 -0
  69. teradataml/data/kmeans_table.csv +10 -0
  70. teradataml/data/onehot_encoder_train.csv +4 -0
  71. teradataml/data/openml_example.json +29 -0
  72. teradataml/data/scale_attributes.csv +3 -0
  73. teradataml/data/scale_example.json +52 -1
  74. teradataml/data/scale_input_part_sparse.csv +31 -0
  75. teradataml/data/scale_input_partitioned.csv +16 -0
  76. teradataml/data/scale_input_sparse.csv +11 -0
  77. teradataml/data/scale_parameters.csv +3 -0
  78. teradataml/data/scripts/deploy_script.py +20 -1
  79. teradataml/data/scripts/sklearn/sklearn_fit.py +23 -27
  80. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +20 -28
  81. teradataml/data/scripts/sklearn/sklearn_function.template +13 -18
  82. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
  83. teradataml/data/scripts/sklearn/sklearn_neighbors.py +18 -27
  84. teradataml/data/scripts/sklearn/sklearn_score.py +20 -29
  85. teradataml/data/scripts/sklearn/sklearn_transform.py +30 -38
  86. teradataml/data/teradataml_example.json +77 -0
  87. teradataml/data/ztest_example.json +16 -0
  88. teradataml/dataframe/copy_to.py +8 -3
  89. teradataml/dataframe/data_transfer.py +120 -61
  90. teradataml/dataframe/dataframe.py +102 -17
  91. teradataml/dataframe/dataframe_utils.py +47 -9
  92. teradataml/dataframe/fastload.py +272 -89
  93. teradataml/dataframe/sql.py +84 -0
  94. teradataml/dbutils/dbutils.py +2 -2
  95. teradataml/lib/aed_0_1.dll +0 -0
  96. teradataml/opensource/sklearn/_sklearn_wrapper.py +102 -55
  97. teradataml/options/__init__.py +13 -4
  98. teradataml/options/configure.py +27 -6
  99. teradataml/scriptmgmt/UserEnv.py +19 -16
  100. teradataml/scriptmgmt/lls_utils.py +117 -14
  101. teradataml/table_operators/Script.py +2 -3
  102. teradataml/table_operators/TableOperator.py +58 -10
  103. teradataml/utils/validators.py +40 -2
  104. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/METADATA +78 -6
  105. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/RECORD +108 -90
  106. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/WHEEL +0 -0
  107. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/top_level.txt +0 -0
  108. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/zip-safe +0 -0
@@ -24,7 +24,7 @@ from teradataml.dataframe.dataframe import DataFrame
  from teradataml.dataframe.copy_to import copy_to_sql
  from teradataml import Antiselect
  from teradataml import BincodeFit, BincodeTransform
- from teradataml import ColumnSummary, CategoricalSummary, GetFutileColumns, FillRowId
+ from teradataml import CategoricalSummary, ColumnSummary, ConvertTo, GetFutileColumns, FillRowId
  from teradataml import Fit, Transform
  from teradataml import NonLinearCombineFit, NonLinearCombineTransform
  from teradataml import NumApply
@@ -36,6 +36,8 @@ from teradataml import TargetEncodingFit, TargetEncodingTransform
  from sqlalchemy import literal_column
  from teradatasqlalchemy import INTEGER
  from teradataml import display
+ from teradataml.common.garbagecollector import GarbageCollector
+ from teradataml.dataframe.sql_functions import case
  from teradataml.hyperparameter_tuner.utils import _ProgressBar
  from teradataml.utils.validators import _Validators

@@ -61,12 +63,12 @@ class _FeatureEngineering:
  Types: teradataml Dataframe

  target_column:
- Required Arugment.
+ Required Argument.
  Specifies the name of the target column in "data"..
  Types: str

  model_list:
- Required Arugment.
+ Required Argument.
  Specifies the list of models to be used for model training.
  Types: list

@@ -81,7 +83,7 @@ class _FeatureEngineering:
  Types: int

  task_type:
- Required Arugment.
+ Required Argument.
  Specifies the task type for AutoML, whether to apply regresion OR classification
  on the provived dataset.
  Default Value: "Regression"
@@ -89,7 +91,7 @@ class _FeatureEngineering:
  Types: str

  custom_data:
- Optional Arugment.
+ Optional Argument.
  Specifies json object containing user customized input.
  Types: json object
  """
@@ -120,7 +122,7 @@ class _FeatureEngineering:

  PARAMETERS:
  auto:
- Optional Arugment.
+ Optional Argument.
  Specifies whether to run AutoML in custom mode or auto mode.
  When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
  Default Value: True
@@ -255,7 +257,7 @@ class _FeatureEngineering:
  f"Remaining Columns in the data: {self.data.shape[1]}",
  progress_bar=self.progress_bar)
  else:
- self._display_msg(inline_msg="Analysis complete. No action taken.",
+ self._display_msg(inline_msg="Analysis completed. No action taken.",
  progress_bar=self.progress_bar)

  end_time = time.time()
@@ -333,7 +335,7 @@ class _FeatureEngineering:
  f_cols = [row[0] for row in gfc_out.result.itertuples()]

  if len(f_cols) == 0:
- self._display_msg(inline_msg="All categorical columns seem to be significant.",
+ self._display_msg(inline_msg="Analysis indicates all categorical columns are significant. No action Needed.",
  progress_bar=self.progress_bar)
  else:

@@ -350,128 +352,68 @@ class _FeatureEngineering:
  self._display_msg(msg="Total time to handle less significant features: {:.2f} sec ".format( end_time - start_time),
  progress_bar=self.progress_bar,
  show_data=True)
-
- def _handle_date_component(self,
- date_component_columns,
- date_component):

+ def _fetch_date_component(self):
  """
  DESCRIPTION:
- Function to handle newly generated date components, i.e., day , month and year diff.
- Based on their distinct values, binning is done with predefined prefix.
- Binned component is used further as categorical features.
-
- PARAMETERS:
- date_component_columns:
- Required Argument.
- Specifies the list of newly generated differnt component of date features.
- Types: list
-
- date_component:
- Required Argument.
- Specifies identifier for the differnt component of date features, i.e., D - Days , M - Months and Y - Year diffs.
- Types: str
-
- """
- # Check for day
- if date_component == "D":
- prefix_value = "Day_"
- # Check for month
- elif date_component == "M":
- prefix_value = "Month_"
- # Check for year diff
- elif date_component == "Y":
- prefix_value = "Year_diff_"
+ Function to fetch day of week, week of month, month of quarter, quarter of year
+ component from date column. Generate weekend and month half details from day of week and
+ week of month columns respectively. Convert quarter of year and month of quarter
+ component columns to VARCHAR.

- # Deciding bins based on distinct value of date component features.
- for col in date_component_columns:
- data_size = self.data.drop_duplicate(col).size
- if data_size < 4:
- num_bins = data_size
- else:
- num_bins = 4
- # Performing bincode for converting date component to specific labels
- fit_params = {
- "data": self.data,
- "target_columns": col,
- "method_type":"Equal-Width",
- "nbins": num_bins,
- "label_prefix" : prefix_value
- }
- bin_code_fit = BincodeFit(**fit_params)
-
- fit_params_map = {"D": "day_component_fit_object",
- "M": "month_component_fit_object",
- "Y": "year_diff_component_fit_object"}
-
- # Storing fit object for each date component in data transform dictionary
- self.data_transform_dict[fit_params_map[date_component]][col] = bin_code_fit.output
-
- accumulate_columns = self._extract_list(self.data.columns, [col])
- transform_params = {
- "data": self.data,
- "object": bin_code_fit.output,
- "accumulate": accumulate_columns,
- "persist": True
- }
- self.data = BincodeTransform(**transform_params).result
-
- def _fetch_date_component(self,
- process,
- regex_str,
- columns,
- date_component):
-
- """
- DESCRIPTION:
- Function to fetch newly generated date component features.
- Passing ahead for performing binning.
-
- PARAMETERS:
- process:
- Required Argument.
- Specifies date component of date feature which is going to be fetched and handled.
- Types: str
-
- regex_str:
- Required Argument.
- Specifies regular expression for identifying newly generated date component features.
- Types: str
-
- columns:
- Required Argument.
- Specifies list of newly generated date component features.
- Types: list
-
- date_component:
- Required Argument.
- Specifies identifier for the differnt component of date features, i.e., D - Days , M - Months and Y - Year diffs.
- Types: str
-
+ RETURNS:
+ List of newly generated date component features.
  """
- date_component_columns = [col for col in columns if re.search(regex_str+"$", col)]
- if len(date_component_columns) != 0:
- self._handle_date_component(date_component_columns,date_component)
- self._display_msg(msg="Useful {} features:".format(process),
- col_lst=date_component_columns,
- progress_bar=self.progress_bar)
- self._display_msg(msg="Updated dataset sample:",
- data=self.data,
- progress_bar=self.progress_bar)
-
- else:
- self._display_msg("\nNo useful feature found for {} component:".format(process),
- progress_bar=self.progress_bar)
+ # List for storing newly generated date component features
+ new_date_components=[]
+ # Extracting weekend, month, quarter details information from date columns
+ date_component_param={}
+ for col in self.date_column_list:
+ # Generating new column names for extracted date components
+ weekend_col = f'{col}_weekend'
+ month_half_col = f'{col}_month_half'
+ month_of_quarter_col=f'{col}_month_of_quarter'
+ quarter_of_year_col=f'{col}_quarter_of_year'

- return date_component_columns
+ date_component_param = {
+ **date_component_param,
+ weekend_col: case([(self.data[col].day_of_week().isin([1, 7]), 'yes')], else_='no'),
+ month_half_col: case([(self.data[col].week_of_month().isin([1, 2]), 'first_half')], else_='second_half'),
+ month_of_quarter_col: self.data[col].month_of_quarter(),
+ quarter_of_year_col: self.data[col].quarter_of_year()
+ }
+ # Storing newly generated date component month and quarter columns.
+ # Skipping day of week and week of month columns as they will be used
+ # later for extracting weekend and month part details.
+ new_date_components.extend([weekend_col, month_half_col, month_of_quarter_col, quarter_of_year_col])
+ # Adding new date component columns to dataset
+ self.data=self.data.assign(**date_component_param)
+ # Dropping date columns as different component columns are extracted.
+ self.data = self.data.drop(self.date_column_list, axis=1)
+
+ # Converting remaining component columns to VARCHAR
+ # So that it will be treated as categorical columns
+ remaining_component_columns = [col for col in self.data.columns if re.search('month_of_quarter|quarter_of_year'+"$", col)]
+ accumulate_columns = self._extract_list(self.data.columns, remaining_component_columns)
+ convertto_params = {
+ "data" : self.data,
+ "target_columns" : remaining_component_columns,
+ "target_datatype" : ["VARCHAR(charlen=20,charset=UNICODE,casespecific=NO)"],
+ "accumulate" : accumulate_columns,
+ "persist" : True
+ }
+ # returning dataset after performing string manipulation
+ self.data = ConvertTo(**convertto_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
+ return new_date_components

  def _handle_date_columns_helper(self):

  """
  DESCRIPTION:
- Function for dropping irrelevent date features.
- Extracting day, month and year component from revelent date features.
- Passing extracted component for performing binning.
+ Function for dropping irrelevent date features. Perform Extraction of different
+ component from revelent date features and transform them.
  """

  # Dropping missing value for all date columns
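The new _fetch_date_component above replaces the old day/month/year-diff binning with weekend, month-half, month-of-quarter, and quarter-of-year features. The sketch below is a minimal standalone version of that flow, assuming a connected teradataml session and a hypothetical table "sales" with a DATE column "order_date"; the column methods (day_of_week, week_of_month, month_of_quarter, quarter_of_year), the case helper, and the ConvertTo parameters are taken from the added lines above rather than independently verified.

# Minimal sketch of the new date-component flow (hypothetical table/column names).
from teradataml import ConvertTo, DataFrame
from teradataml.dataframe.sql_functions import case

df = DataFrame("sales")  # assumed table with a DATE column "order_date"

date_components = {
    # 'yes' when day_of_week is 1 or 7, the values the diff treats as weekend
    "order_date_weekend": case([(df["order_date"].day_of_week().isin([1, 7]), 'yes')], else_='no'),
    "order_date_month_half": case([(df["order_date"].week_of_month().isin([1, 2]), 'first_half')], else_='second_half'),
    "order_date_month_of_quarter": df["order_date"].month_of_quarter(),
    "order_date_quarter_of_year": df["order_date"].quarter_of_year(),
}
# Attach the derived columns and drop the original date column, as the diff does.
df = df.assign(**date_components).drop(["order_date"], axis=1)

# Cast the numeric month/quarter components to VARCHAR so later encoding steps
# treat them as categorical features (parameters copied from the hunk above).
to_varchar = ["order_date_month_of_quarter", "order_date_quarter_of_year"]
df = ConvertTo(data=df,
               target_columns=to_varchar,
               target_datatype=["VARCHAR(charlen=20,charset=UNICODE,casespecific=NO)"],
               accumulate=[c for c in df.columns if c not in to_varchar],
               persist=True).result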
@@ -484,7 +426,7 @@ class _FeatureEngineering:
  # Date columns list eligible for dropping from dataset
  drop_date_cols = []

- # Checking for single valued date columns
+ # Checking for unique valued date columns
  for col in self.date_column_list:
  if self.data.drop_duplicate(col).size == self.data.shape[0]:
  drop_date_cols.append(col)
@@ -496,46 +438,18 @@ class _FeatureEngineering:
  self._display_msg(msg='Dropping date features with all unique value:',
  col_lst = drop_date_cols,
  progress_bar=self.progress_bar)
-
- # Updated date columns list
- self.date_column_list = [item for item in self.date_column_list if item not in drop_date_cols]
+ # Updated date column list after dropping irrelevant date columns
+ self.date_column_list = [item for item in self.date_column_list if item not in drop_date_cols]

- # List for storing newly generated date component features
- new_columns=[]
-
- # Extracting day, month and year difference from date columns
  if len(self.date_column_list) != 0:

- component_param={}
- for col in self.date_column_list:
-
- day_column=str(col)+"_day_comp"
- month_column=str(col)+"_month_comp"
- year_diff_column=str(col)+"_year_diff_comp"
- new_columns.extend([day_column,month_column,year_diff_column])
- day_query=("EXTRACT(DAY FROM {0})".format(col))
- month_query=("EXTRACT(MONTH FROM {0})".format(col))
- year_query=("EXTRACT(YEAR FROM CURRENT_DATE) - EXTRACT(YEAR FROM {0})".format(col))
- component_param[day_column]=literal_column(day_query,INTEGER())
- component_param[month_column]=literal_column(month_query,INTEGER())
- component_param[year_diff_column]=literal_column(year_query,INTEGER())
-
- self.data=self.data.assign(**component_param)
- # Storing newly generated date component list along with parameters in data transform dictionary
- self.data_transform_dict['extract_date_comp_col'] = self.date_column_list
- self.data_transform_dict['extract_date_comp_param'] = component_param
-
- # Dropping date columns as we have already extracted day, month and year in new columns
- self.data = self.data.drop(self.date_column_list, axis=1)
+ # List for storing newly generated date component features
+ new_columns=self._fetch_date_component()
  self._display_msg(msg='List of newly generated features from existing date features:',
  col_lst=new_columns,
  progress_bar=self.progress_bar)
- self._display_msg(msg='List of newly generated features from existing date features:',
- data=self.data,
- progress_bar=self.progress_bar)
-
+ # Dropping columns with all unique values or single value
  drop_cols=[]
-
  for col in new_columns:
  distinct_rows = self.data.drop_duplicate(col).size
  if distinct_rows == self.data.shape[0]:
@@ -555,21 +469,11 @@ class _FeatureEngineering:
  self.data = self.data.drop(drop_cols, axis=1)
  # Storing extract date component list for drop in data transform dictionary
  self.data_transform_dict['drop_extract_date_columns'] = drop_cols
-
- # Extracting all newly generated columns
- new_columns = [item for item in new_columns if item not in drop_cols]
+ # Extracting all newly generated columns
+ new_columns = [item for item in new_columns if item not in drop_cols]

- # Storing each date component transformation fit object in data transform dictionary
- self.data_transform_dict = {**self.data_transform_dict,
- 'day_component_fit_object': {},
- 'month_component_fit_object': {},
- 'year_diff_component_fit_object': {}}
- # Grouping date components based on types i.e., day, month, and year_diff for performing binning
- if len(new_columns) != 0:
- self.day_columns = self._fetch_date_component("day", "_day_comp", new_columns, "D")
- self.month_columns = self._fetch_date_component("month", "_month_comp", new_columns, "M")
- self.year_diff_columns = self._fetch_date_component("year_diff", "_year_diff_comp", new_columns, "Y")
- self._display_msg(inline_msg="No useful date component found",
+ self._display_msg(msg='Updated list of newly generated features from existing date features :',
+ col_lst=new_columns,
  progress_bar=self.progress_bar)

  self._display_msg(msg='Updated dataset sample after handling date features:',
@@ -595,7 +499,7 @@ class _FeatureEngineering:
  if d_type in ["datetime.date","datetime.datetime"]]

  if len(self.date_column_list) == 0:
- self._display_msg(inline_msg="Dataset does not contain any feature related to dates.",
+ self._display_msg(inline_msg="Analysis Completed. Dataset does not contain any feature related to dates. No action needed.",
  progress_bar=self.progress_bar)
  else:
  # Storing date column list in data transform dictionary
@@ -622,8 +526,7 @@ class _FeatureEngineering:
  self.data = self.data.dropna(subset=[self.target_column])

  obj = ColumnSummary(data=self.data,
- target_columns=self.data.columns,
- volatile=True)
+ target_columns=self.data.columns)

  cols_miss_val={}
  # Iterating over each row in the column summary result
@@ -705,11 +608,15 @@ class _FeatureEngineering:
  self.data_transform_dict['imputation_columns'] = self.imputation_cols

  if len(delete_rows) != 0:
+ rows = self.data.shape[0]
  self.data = self.data.dropna(subset=delete_rows)
  msg_val_found=1
  self._display_msg(msg='Deleting rows of these columns for handling missing values:',
  col_lst=delete_rows,
  progress_bar=self.progress_bar)
+ self._display_msg(msg=f'Sample of dataset after removing {rows-self.data.shape[0]} rows:',
+ data=self.data,
+ progress_bar=self.progress_bar)

  if len(drop_cols) != 0:
  self.data = self.data.drop(drop_cols, axis=1)
@@ -719,9 +626,12 @@ class _FeatureEngineering:
  self._display_msg(msg='Dropping these columns for handling missing values:',
  col_lst=drop_cols,
  progress_bar=self.progress_bar)
+ self._display_msg(msg=f'Sample of dataset after removing {len(drop_cols)} columns:',
+ data=self.data,
+ progress_bar=self.progress_bar)

  if len(self.imputation_cols) == 0 and msg_val_found ==0:
- self._display_msg(inline_msg="No Missing Values Detected.",
+ self._display_msg(inline_msg="Analysis Completed. No Missing Values Detected.",
  progress_bar=self.progress_bar)

  end_time = time.time()
@@ -787,21 +697,19 @@ class _FeatureEngineering:

  fit_obj = SimpleImputeFit(data=self.data,
  stats_columns=col_stat,
- stats=stat,
- volatile=True)
+ stats=stat)

  # Storing fit object for imputation in data transform dictionary
  self.data_transform_dict['imputation_fit_object'] = fit_obj.output
  sm = SimpleImputeTransform(data=self.data,
- object=fit_obj,
- volatile=True)
+ object=fit_obj)

  self.data = sm.result
- self._display_msg(msg="Sample of Data after Imputation:",
+ self._display_msg(msg="Sample of dataset after Imputation:",
  data=self.data,
  progress_bar=self.progress_bar)
  else:
- self._display_msg(inline_msg="No imputation is Required.",
+ self._display_msg(inline_msg="Analysis completed. No imputation required.",
  progress_bar=self.progress_bar)

  end_time = time.time()
@@ -898,6 +806,8 @@ class _FeatureEngineering:
  }
  # Updating dataset with transform result
  self.data = SimpleImputeTransform(**transform_param).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
  self._display_msg(msg="Updated dataset sample after performing customized missing value imputation:",
  data=self.data,
  progress_bar=self.progress_bar)
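This hunk and several that follow repeat one new pattern: the table backing each transform result is handed to teradataml's internal GarbageCollector so it can be dropped during cleanup. A minimal sketch of that pattern, assuming an existing DataFrame df and a SimpleImputeFit result fit_obj; persist=True, the private _add_to_garbagecollector helper, and the _table_name attribute are used exactly as they appear in the diff, not as documented public API.

from teradataml import SimpleImputeTransform
from teradataml.common.garbagecollector import GarbageCollector

# Assembled earlier in the real code; persist (assumed here) keeps the result
# in a regular table whose name is reachable via _table_name.
transform_param = {"data": df, "object": fit_obj, "persist": True}
df = SimpleImputeTransform(**transform_param).result
# Register the result table so teradataml drops it when it cleans up.
GarbageCollector._add_to_garbagecollector(df._table_name)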
@@ -987,6 +897,8 @@ class _FeatureEngineering:
  "persist" : True,
  }
  self.data = BincodeTransform(**eql_transform_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
  self._display_msg(msg="\nUpdated dataset sample after performing Equal-Width binning :-",
  data=self.data,
  progress_bar=self.progress_bar)
@@ -1026,6 +938,8 @@ class _FeatureEngineering:
  "persist" : True
  }
  self.data = BincodeTransform(**var_transform_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
  self._display_msg(msg="Updated dataset sample after performing Variable-Width binning:",
  data=self.data,
  progress_bar=self.progress_bar)
@@ -1125,7 +1039,10 @@ class _FeatureEngineering:
  "string_length" : string_length}

  # returning dataset after performing string manipulation
- return StrApply(**fit_params).result
+ transform_output = StrApply(**fit_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+ return transform_output

  def _one_hot_encoding(self,
  one_hot_columns,
@@ -1173,8 +1090,10 @@ class _FeatureEngineering:
  "persist" : True
  }
  # Performing one hot encoding transformation
- transform_obj = OneHotEncodingTransform(**transform_params)
- self.data = transform_obj.result.drop(drop_lst, axis=1)
+ transform_output = OneHotEncodingTransform(**transform_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+ self.data = transform_output.drop(drop_lst, axis=1)

  def _ordinal_encoding(self,
  ordinal_columns):
@@ -1191,8 +1110,7 @@ class _FeatureEngineering:
  # Adding fit parameters for performing encoding
  fit_params = {
  "data" : self.data,
- "target_column" : ordinal_columns,
- "volatile" : True
+ "target_column" : ordinal_columns
  }
  # Performing ordinal encoding fit on target columns
  ord_fit_obj = OrdinalEncodingFit(**fit_params)
@@ -1214,6 +1132,8 @@ class _FeatureEngineering:
  }
  # Performing ordinal encoding transformation
  self.data = OrdinalEncodingTransform(**transform_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)

  if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
  self.target_label = ord_fit_obj
@@ -1276,6 +1196,8 @@ class _FeatureEngineering:
  }
  # Performing ordinal encoding transformation
  self.data = TargetEncodingTransform(**transform_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)

  def _encoding_categorical_columns(self):
  """
@@ -1308,8 +1230,11 @@ class _FeatureEngineering:
  self._display_msg(msg="ONE HOT Encoding these Columns:",
  col_lst=ohe_col,
  progress_bar=self.progress_bar)
+ self._display_msg(msg="Sample of dataset after performing one hot encoding:",
+ data=self.data,
+ progress_bar=self.progress_bar)
  else:
- self._display_msg(inline_msg="Encoding not required.",
+ self._display_msg(inline_msg="Analysis completed. No categorical columns were found.",
  progress_bar=self.progress_bar)

  # List of columns after one hot
@@ -1434,7 +1359,10 @@ class _FeatureEngineering:
  sigmoid_style=transform_val["sigmoid_style"]
  fit_params = {**fit_params, "sigmoid_style" : sigmoid_style}
  # Performing transformation on target columns
- return NumApply(**fit_params).result
+ transform_output = NumApply(**fit_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+ return transform_output

  def _numerical_transformation(self, target_columns, num_transform_data):
  """
@@ -1465,7 +1393,9 @@ class _FeatureEngineering:
  "persist" :True
  }
  # Peforming transformation on target columns
- self.data = Transform(**transform_params).result
+ self.data = Transform(**transform_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
  self._display_msg(msg="Updated dataset sample after applying numerical transformation:",
  data=self.data,
  progress_bar=self.progress_bar)
@@ -1595,6 +1525,8 @@ class _FeatureEngineering:
  "persist" : True
  }
  self.data = NonLinearCombineTransform(**transform_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
  else:
  self._display_msg(inline_msg="Combinations are not as per expectation.",
  progress_bar=self.progress_bar)
@@ -21,6 +21,7 @@ from teradataml import ColumnSummary, CategoricalSummary, GetFutileColumns
  from teradataml import OutlierFilterFit, OutlierFilterTransform
  from teradataml.hyperparameter_tuner.utils import _ProgressBar
  from teradataml.common.messages import Messages, MessageCodes
+ from teradataml import display as dp

  def _is_terminal():
  """
@@ -158,13 +159,14 @@ class _FeatureExplore:
  Internal function displays the column summary of categorical column such as
  datatype, null count, non null count, zero count.
  """
+ dp.max_rows = self.data.shape[1]
  # Column Summary of all columns of dataset
  obj = ColumnSummary(data=self.data,
- target_columns=self.data.columns,
- volatile=True)
+ target_columns=self.data.columns)
  self._display_msg(msg='\nColumn Summary:',
  data=obj.result,
  show_data=True)
+ dp.max_rows = 10

  def _categorical_summary(self,
  categorical_columns=None):
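The dp.max_rows change above widens teradataml's display row limit so the whole column summary prints, then restores the default of 10. A minimal standalone sketch of the same pattern, assuming a connected session and an existing teradataml DataFrame df; the ColumnSummary arguments mirror the hunk above.

from teradataml import ColumnSummary
from teradataml import display as dp

dp.max_rows = len(df.columns)   # one summary row per column, as in the hunk
summary = ColumnSummary(data=df, target_columns=df.columns).result
print(summary)
dp.max_rows = 10                # restore the default row limit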
@@ -503,6 +505,11 @@ class _FeatureExplore:
  progress_bar.update(msg=msg, data=col_lst if col_lst else data if data is not None else None,
  progress=False,
  ipython=not self.terminal_print)
+ # Displaying shape of data
+ if data is not None:
+ progress_bar.update(msg=f'{data.shape[0]} rows X {data.shape[1]} columns',
+ progress=False,
+ ipython=not self.terminal_print)
  # If an inline message is provided instead
  elif inline_msg:
  # Update the progress bar with the inline message
@@ -18,6 +18,7 @@ import time

  # Teradata libraries
  from teradataml.dataframe.dataframe import DataFrame
+ from teradataml.automl.model_training import _ModelTraining


  class _ModelEvaluator:
@@ -38,12 +39,12 @@ class _ModelEvaluator:
  Types: teradataml Dataframe

  target_column:
- Required Arugment.
+ Required Argument.
  Specifies the target column present inside the dataset.
  Types: str

  task_type:
- Required Arugment.
+ Required Argument.
  Specifies the task type for AutoML, whether to apply regresion OR classification
  on the provived dataset.
  Default Value: "Regression"
@@ -115,37 +116,24 @@ class _ModelEvaluator:
  model = self.model_info.loc[rank]

  # Defining eval_params
- eval_params = {"id_column": "id",
- "accumulate": self.target_column}
-
- # eval_params for Classification
- if self.task_type != "Regression":
- # XGboost
- if model['Name'] == 'xgboost':
- eval_params['model_type'] = 'Classification'
- eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter','class_num', 'tree_order']
- else:
- # DF,KNN,SVM,GLM
- eval_params['output_prob'] = True
- else:
- # eval_params for Regression in XGboost
- if model['Name'] == 'xgboost':
- eval_params['model_type'] = 'Regression'
- eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter', 'tree_order']
-
+ eval_params = _ModelTraining._eval_params_generation(model['Name'],
+ self.target_column,
+ self.task_type)

  # Test Data
- test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature selection'])])
+ test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature-Selection'])])

  # Getting test data from table
  if not self.test_data_ind:
  # Test Data
- test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature selection'])])
+ test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature-Selection'])])
  else:
- test = DataFrame(self.table_name_mapping['{}_new_test'.format(model['Feature selection'])])
+ test = DataFrame(self.table_name_mapping['{}_new_test'.format(model['Feature-Selection'])])
+
+ print("\nFollowing model is being used for generating prediction :")
+ print("Model ID :", model['Model-ID'],
+ "\nFeature Selection Method :",model['Feature-Selection'])

- print(model['Name'], model['Feature selection'])
-
  # Evaluation and predictions
  if model['Name'] == 'knn':
  metrics = model['model-obj'].evaluate(test_data=test)
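The branching deleted above has been folded into a shared _ModelTraining._eval_params_generation helper (model_training.py also changed in this release). The plain-Python sketch below reconstructs what that helper presumably returns, based only on the deleted lines; the actual implementation and signature in model_training.py are not visible in this diff.

def _eval_params_generation(model_name, target_column, task_type):
    # Reconstructed from the removed _ModelEvaluator code above; an assumption,
    # not the verified implementation in model_training.py.
    eval_params = {"id_column": "id", "accumulate": target_column}
    if task_type != "Regression":
        if model_name == "xgboost":
            eval_params["model_type"] = "Classification"
            eval_params["object_order_column"] = [
                "task_index", "tree_num", "iter", "class_num", "tree_order"]
        else:
            # DF, KNN, SVM and GLM classifiers return class probabilities instead
            eval_params["output_prob"] = True
    elif model_name == "xgboost":
        eval_params["model_type"] = "Regression"
        eval_params["object_order_column"] = [
            "task_index", "tree_num", "iter", "tree_order"]
    return eval_params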