teradataml 20.0.0.4__py3-none-any.whl → 20.0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of teradataml might be problematic. Click here for more details.

Files changed (131) hide show
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +182 -13
  3. teradataml/__init__.py +2 -1
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +8 -13
  6. teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
  7. teradataml/analytics/sqle/__init__.py +16 -1
  8. teradataml/analytics/utils.py +60 -1
  9. teradataml/automl/__init__.py +290 -106
  10. teradataml/automl/autodataprep/__init__.py +471 -0
  11. teradataml/automl/data_preparation.py +29 -10
  12. teradataml/automl/data_transformation.py +11 -0
  13. teradataml/automl/feature_engineering.py +64 -4
  14. teradataml/automl/feature_exploration.py +639 -25
  15. teradataml/automl/model_training.py +1 -1
  16. teradataml/clients/auth_client.py +12 -8
  17. teradataml/clients/keycloak_client.py +165 -0
  18. teradataml/common/constants.py +71 -26
  19. teradataml/common/exceptions.py +32 -0
  20. teradataml/common/messagecodes.py +28 -0
  21. teradataml/common/messages.py +13 -4
  22. teradataml/common/sqlbundle.py +3 -2
  23. teradataml/common/utils.py +345 -45
  24. teradataml/context/context.py +259 -93
  25. teradataml/data/apriori_example.json +22 -0
  26. teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
  27. teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
  28. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
  29. teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
  30. teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
  31. teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
  32. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
  33. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
  34. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
  35. teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
  36. teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
  37. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
  38. teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
  39. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
  40. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
  41. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
  42. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
  43. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  44. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
  45. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
  46. teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
  47. teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
  48. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  49. teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
  50. teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
  51. teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
  52. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  53. teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
  54. teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
  55. teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
  56. teradataml/data/jsons/byom/onnxembeddings.json +1 -0
  57. teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
  58. teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
  59. teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
  60. teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
  61. teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
  62. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
  63. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
  64. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
  65. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
  66. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
  67. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
  68. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
  69. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
  70. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
  71. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
  72. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
  73. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
  74. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +2 -2
  75. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +1 -1
  76. teradataml/data/ner_dict.csv +8 -0
  77. teradataml/data/ner_input_eng.csv +7 -0
  78. teradataml/data/ner_rule.csv +5 -0
  79. teradataml/data/pattern_matching_data.csv +11 -0
  80. teradataml/data/pos_input.csv +40 -0
  81. teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
  82. teradataml/data/tdnerextractor_example.json +14 -0
  83. teradataml/data/teradataml_example.json +21 -1
  84. teradataml/data/textmorph_example.json +5 -0
  85. teradataml/data/to_num_data.csv +4 -0
  86. teradataml/data/tochar_data.csv +5 -0
  87. teradataml/data/trans_dense.csv +16 -0
  88. teradataml/data/trans_sparse.csv +55 -0
  89. teradataml/data/url_data.csv +10 -9
  90. teradataml/dataframe/copy_to.py +38 -27
  91. teradataml/dataframe/data_transfer.py +61 -45
  92. teradataml/dataframe/dataframe.py +1110 -132
  93. teradataml/dataframe/dataframe_utils.py +73 -27
  94. teradataml/dataframe/functions.py +1070 -9
  95. teradataml/dataframe/sql.py +750 -959
  96. teradataml/dbutils/dbutils.py +33 -13
  97. teradataml/dbutils/filemgr.py +14 -10
  98. teradataml/hyperparameter_tuner/utils.py +4 -2
  99. teradataml/lib/aed_0_1.dll +0 -0
  100. teradataml/opensource/_base.py +12 -157
  101. teradataml/options/configure.py +24 -9
  102. teradataml/scriptmgmt/UserEnv.py +317 -39
  103. teradataml/scriptmgmt/lls_utils.py +456 -135
  104. teradataml/sdk/README.md +79 -0
  105. teradataml/sdk/__init__.py +4 -0
  106. teradataml/sdk/_auth_modes.py +422 -0
  107. teradataml/sdk/_func_params.py +487 -0
  108. teradataml/sdk/_json_parser.py +453 -0
  109. teradataml/sdk/_openapi_spec_constants.py +249 -0
  110. teradataml/sdk/_utils.py +236 -0
  111. teradataml/sdk/api_client.py +897 -0
  112. teradataml/sdk/constants.py +62 -0
  113. teradataml/sdk/modelops/__init__.py +98 -0
  114. teradataml/sdk/modelops/_client.py +406 -0
  115. teradataml/sdk/modelops/_constants.py +304 -0
  116. teradataml/sdk/modelops/models.py +2308 -0
  117. teradataml/sdk/spinner.py +107 -0
  118. teradataml/store/__init__.py +1 -1
  119. teradataml/table_operators/Apply.py +16 -1
  120. teradataml/table_operators/Script.py +20 -1
  121. teradataml/table_operators/query_generator.py +4 -21
  122. teradataml/table_operators/table_operator_util.py +58 -9
  123. teradataml/utils/dtypes.py +4 -2
  124. teradataml/utils/internal_buffer.py +22 -2
  125. teradataml/utils/utils.py +0 -1
  126. teradataml/utils/validators.py +318 -58
  127. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/METADATA +188 -14
  128. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/RECORD +131 -84
  129. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/WHEEL +0 -0
  130. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/top_level.txt +0 -0
  131. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/zip-safe +0 -0
@@ -13,6 +13,11 @@
13
13
  # Function Version: 1.0
14
14
  # ##################################################################
15
15
 
16
+ # Python Libraries
17
+ import pandas as pd
18
+ import matplotlib.pyplot as plt
19
+ import seaborn as sns
20
+ import math
16
21
 
17
22
  # Teradata libraries
18
23
  from teradataml.dataframe.dataframe import DataFrame
@@ -22,6 +27,8 @@ from teradataml import OutlierFilterFit, OutlierFilterTransform
22
27
  from teradataml.hyperparameter_tuner.utils import _ProgressBar
23
28
  from teradataml.common.messages import Messages, MessageCodes
24
29
  from teradataml import display as dp
30
+ from teradataml.utils.validators import _Validators
31
+ from teradataml.common.utils import UtilFuncs
25
32
 
26
33
  def _is_terminal():
27
34
  """
@@ -54,7 +61,8 @@ class _FeatureExplore:
54
61
  def __init__(self,
55
62
  data=None,
56
63
  target_column=None,
57
- verbose=0):
64
+ verbose=0,
65
+ task_type='regression'):
58
66
  """
59
67
  DESCRIPTION:
60
68
  Internal function initializes the data, target column for feature exploration.
@@ -79,14 +87,25 @@ class _FeatureExplore:
79
87
  * 1: prints the execution steps of AutoML.
80
88
  * 2: prints the intermediate data between the execution of each step of AutoML.
81
89
  Types: int
90
+
91
+ task_type:
92
+ Optional Argument.
93
+ Specifies the task type of the data.
94
+ Default Value: 'regression'
95
+ Permitted Values:
96
+ * 'regression'
97
+ * 'classification'
98
+ Types: str
82
99
  """
83
100
  self.data = data
84
101
  self.target_column = target_column
85
102
  self.verbose = verbose
86
103
  self.terminal_print = _is_terminal()
87
104
  self.style = self._common_style()
105
+ self.task_type = task_type
88
106
 
89
- def _exploration(self):
107
+ def _exploration(self,
108
+ **kwargs):
90
109
  """
91
110
  DESCRIPTION:
92
111
  Internal function performs following operations:
@@ -101,7 +120,9 @@ class _FeatureExplore:
101
120
  categorical_columns= []
102
121
  date_column_list = []
103
122
 
104
- self._display_heading(phase=0)
123
+ aml_phases = kwargs.get('automl_phases', None)
124
+ self._display_heading(phase=0,
125
+ automl_phases=aml_phases)
105
126
  self._display_msg(msg='Feature Exploration started ...')
106
127
 
107
128
  # Detecting numerical and categorical column
@@ -227,33 +248,22 @@ class _FeatureExplore:
227
248
  data=gfc_out.result,
228
249
  show_data=True)
229
250
 
230
- def _target_column_details(self,
231
- plot_data = None):
251
+ def _target_column_details(self):
232
252
  """
233
253
  DESCRIPTION:
234
254
  Internal function displays the target column distribution of Target column/ Response column.
235
255
 
236
256
  PARAMETERS:
237
- plot_data:
238
- Required Argument.
239
- Specifies the input teradataml DataFrame for plotting distribution.
240
- Types: teradataml Dataframe
257
+ None
241
258
  """
242
259
  if self._check_visualization_libraries() and not _is_terminal():
243
- import matplotlib.pyplot as plt
244
- import seaborn as sns
245
- if plot_data is None:
246
- target_data = self.data.select([self.target_column]).to_pandas()
247
- else:
248
- target_data = plot_data[[self.target_column]]
260
+ # Plotting target column distribution
249
261
  self._display_msg(msg='\nTarget Column Distribution:',
250
262
  show_data=True)
251
- plt.figure(figsize=(8, 6))
252
- # Ploting a histogram for target column
253
- plt.hist(target_data, bins=10, density=True, edgecolor='black')
254
- plt.xlabel(self.target_column)
255
- plt.ylabel('Density')
256
- plt.show()
263
+ _FeatureExplore._visualize(data=self.data,
264
+ target_column=self.target_column,
265
+ plot_type=["target"],
266
+ problem_type=self.task_type)
257
267
 
258
268
  def _check_visualization_libraries(self):
259
269
  """
@@ -308,6 +318,9 @@ class _FeatureExplore:
308
318
  Pandas DataFrame containing, column name with outlier percentage.
309
319
 
310
320
  """
321
+ # Removing target column from the list of columns
322
+ column_list = [col for col in column_list if col != self.target_column]
323
+
311
324
  # Performing outlier fit on the data for replacing outliers with NULL value
312
325
  fit_params = {
313
326
  "data" : self.data,
@@ -379,7 +392,8 @@ class _FeatureExplore:
379
392
 
380
393
  def _display_heading(self,
381
394
  phase=0,
382
- progress_bar=None):
395
+ progress_bar=None,
396
+ **kwargs):
383
397
  """
384
398
  DESCRIPTION:
385
399
  Internal function to print the phase of AutoML that
@@ -399,9 +413,14 @@ class _FeatureExplore:
399
413
  RETURNS:
400
414
  None.
401
415
  """
402
- # Phases of automl
403
- steps = ["1. Feature Exploration ->", " 2. Feature Engineering ->",
416
+ phases = ["1. Feature Exploration ->", " 2. Feature Engineering ->",
404
417
  " 3. Data Preparation ->", " 4. Model Training & Evaluation"]
418
+ # Phases of automl
419
+ if kwargs.get('automl_phases', None) is not None:
420
+ steps = kwargs.get('automl_phases')
421
+ else:
422
+ steps = phases
423
+
405
424
  # Check verbose > 0
406
425
  if self.verbose > 0:
407
426
 
@@ -551,4 +570,599 @@ class _FeatureExplore:
551
570
  # If data is provided
552
571
  if data is not None:
553
572
  # Print the data if terminal_print is True, else display the data
554
- print(data) if self.terminal_print else display(data)
573
+ print(data) if self.terminal_print else display(data)
574
+
575
@staticmethod
def _visualize(data,
               target_column,
               plot_type=None,
               length=10,
               breadth=8,
               max_features=10,
               columns=None,
               problem_type=None):
    """
    DESCRIPTION:
        Internal function to visualize the data using various plots such as heatmap,
        pair plot, density, count plot, box plot, and target distribution.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input teradataml DataFrame for plotting.
            Types: teradataml Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Types: str

        plot_type:
            Optional Argument.
            Specifies the type of plot(s) to be displayed. Values are
            matched case-insensitively. When "all" is passed, either on
            its own or as an element of a list, every plot is displayed.
            Default Value: ["target"] (also used when None is passed)
            Permitted Values:
                * "heatmap": Displays a heatmap of feature correlations.
                * "pair": Displays a pair plot of features.
                * "density": Displays a density plot of features.
                * "count": Displays a count plot of categorical features.
                * "box": Displays a box plot of numerical features.
                * "target": Displays the distribution of the target variable.
                * "all": Displays all the plots.
            Types: str, list of str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        max_features:
            Optional Argument.
            Specifies the maximum number of features to be used for plotting.
            Default Value: 10
            Note:
                * It applies separately to categorical and numerical features.
            Types: int

        columns:
            Optional Argument.
            Specifies the column names to be used for plotting. When omitted,
            numerical features are ranked by absolute correlation with the
            target and categorical features are taken in column order.
            Types: str or list of string

        problem_type:
            Optional Argument.
            Specifies the type of problem.
            Permitted Values:
                * 'regression'
                * 'classification'
            Types: str

    RETURNS:
        None

    RAISES:
        TeradataMlException, ValueError, TypeError

    EXAMPLES:
        >>> _FeatureExplore._visualize(data=data,
                                       target_column="target",
                                       plot_type="heatmap",
                                       length=10,
                                       breadth=8,
                                       max_features=10,
                                       columns=["feature1", "feature2"],
                                       problem_type="regression")
    """
    # None stands in for the default so no mutable list is shared between calls.
    if plot_type is None:
        plot_type = ["target"]

    # Appending arguments to list for validation
    arg_info_matrix = [
        ["data", data, False, (DataFrame)],
        ["target_column", target_column, False, (str)],
        ["plot_type", plot_type, True, (str, list), True, ["heatmap", "pair", "all",
                                                           "density", "count", "box", "target"]],
        ["length", length, True, (int)],
        ["breadth", breadth, True, (int)],
        ["max_features", max_features, True, (int)],
        ["problem_type", problem_type, True, (str), True, ["regression", "classification"]],
        ["columns", columns, True, (str, list)],
    ]

    # Validate argument types
    _Validators._validate_function_arguments(arg_info_matrix)

    # Validate that data has the required columns
    _Validators._validate_dataframe_has_argument_columns(target_column, "target_column", data, "data")
    _Validators._validate_dataframe_has_argument_columns(columns, "columns", data, "data")

    # Convert the teradataml DataFrame to pandas, keeping only the original
    # columns so that the index materialised by reset_index() is not plotted.
    cols = data.columns
    data = data.to_pandas().reset_index()
    data = data[cols]

    # Display order: univariate plots first, then bivariate, then multivariate.
    available_plots = ["target", "density", "count", "box", "pair", "heatmap"]

    # Label-encode the target only when its values are not already numeric.
    # Encoding a numeric target would replace the real values with category
    # codes and distort the target distribution and the correlation ranking.
    target_values = data[target_column]
    if pd.api.types.is_bool_dtype(target_values) or \
            not pd.api.types.is_numeric_dtype(target_values):
        data[target_column] = target_values.astype("category").cat.codes

    # Normalise the requested plots; "all" may arrive as a string or inside a list.
    requested_plots = [str(plot).lower() for plot in UtilFuncs._as_list(plot_type)]
    if "all" in requested_plots:
        requested_plots = list(available_plots)

    # Identify numerical and categorical columns (target is never a feature)
    numerical_features = [col for col in data.select_dtypes(include=['number']).columns
                          if col != target_column]
    categorical_features = data.select_dtypes(include=['object', 'category']).columns.tolist()

    if columns:
        # Honour the caller's selection, capped per feature kind.
        selected_columns = UtilFuncs._as_list(columns)
        selected_num_features = [col for col in selected_columns if col in numerical_features][:max_features]
        selected_cat_features = [col for col in selected_columns if col in categorical_features][:max_features]
    else:
        # Rank numerical features by absolute correlation with the target.
        # Skipped when there is nothing to correlate.
        if numerical_features and pd.api.types.is_numeric_dtype(data[target_column]):
            selected_num_features = (
                data[numerical_features]
                .corrwith(data[target_column])
                .abs()
                .nlargest(max_features)
                .index.tolist()
            )
        else:
            selected_num_features = numerical_features[:max_features]

        # Select top categorical features based on appearance
        selected_cat_features = categorical_features[:max_features]

    # Each plotter returns a message when its plot is not applicable, else None.
    plotters = {
        "target": lambda: _FeatureExplore._target_distribution(data=data,
                                                               target_column=target_column,
                                                               problem_type=problem_type,
                                                               length=length,
                                                               breadth=breadth),
        "density": lambda: _FeatureExplore._density_plot(data=data,
                                                         length=length,
                                                         breadth=breadth,
                                                         numerical_features=selected_num_features),
        "count": lambda: _FeatureExplore._count_plot(data=data,
                                                     length=length,
                                                     breadth=breadth,
                                                     categorical_features=selected_cat_features),
        "box": lambda: _FeatureExplore._box_plot(data=data,
                                                 length=length,
                                                 breadth=breadth,
                                                 numerical_features=selected_num_features),
        "pair": lambda: _FeatureExplore._pair_plot(data=data,
                                                   target_column=target_column,
                                                   length=length,
                                                   breadth=breadth,
                                                   numerical_features=selected_num_features,
                                                   categorical_features=selected_cat_features),
        "heatmap": lambda: _FeatureExplore._heatmap(data=data,
                                                    target_column=target_column,
                                                    length=length,
                                                    breadth=breadth,
                                                    numerical_features=selected_num_features),
    }

    irrelevant_plot = []
    for plot in sorted(requested_plots, key=available_plots.index):
        msg = plotters[plot]()
        if msg:
            irrelevant_plot.append(msg)

    # Report the plots that could not be drawn only after all plots are shown.
    for msg in irrelevant_plot:
        print(msg)
778
+
779
@staticmethod
def _heatmap(data,
             target_column,
             length=10,
             breadth=8,
             numerical_features=[]):
    """
    DESCRIPTION:
        Internal function to draw a correlation heatmap of the given
        numerical features together with the target column.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Types: str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            Types: list of str

    RETURNS:
        str describing why the plot was skipped when no numerical
        features are given, otherwise None.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._heatmap(data=data,
                                     target_column="target",
                                     length=10,
                                     breadth=8,
                                     numerical_features=["feature1", "feature2"])

    """
    # Nothing to correlate: report instead of plotting.
    if len(numerical_features) < 1:
        return "Plot type 'heatmap' is not applicable as no numerical features are available."

    plt.figure(figsize=(length, breadth))
    # The target is appended so its correlation with each feature is shown too.
    correlation_matrix = data[numerical_features + [target_column]].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
    plt.title("Feature Correlation Heatmap")
    plt.show()
838
+
839
@staticmethod
def _pair_plot(data,
               target_column,
               length=10,
               breadth=8,
               numerical_features=[],
               categorical_features=[]):
    """
    DESCRIPTION:
        Internal function to draw a pair plot of the given numerical
        features together with the target column.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Types: str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Note:
                * Accepted for signature parity with the other plot
                  helpers; this function does not use it.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Note:
                * Accepted for signature parity with the other plot
                  helpers; this function does not use it.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            Types: list of str

        categorical_features:
            Optional Argument.
            Specifies the list of categorical features. The target column
            is used as hue only when it is present in this list.
            Types: list of str

    RETURNS:
        str describing why the plot was skipped when no numerical
        features are given, otherwise None.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._pair_plot(data=data,
                                       target_column="target",
                                       length=10,
                                       breadth=8,
                                       numerical_features=["feature1", "feature2"],
                                       categorical_features=[])

    """
    # Nothing to pair up: report instead of plotting.
    if len(numerical_features) < 1:
        return "Plot type 'pair' is not applicable as no numerical features are available."

    plot_columns = numerical_features + [target_column]

    # Colour by target only when the target is one of the categorical features.
    hue_column = None
    if target_column in categorical_features:
        hue_column = target_column

    grid = sns.pairplot(data[plot_columns], hue=hue_column)

    # Add a centered title slightly above the grid
    grid.figure.suptitle("pair Plot", fontsize=16, y=1.02)
    plt.show()
906
+
907
@staticmethod
def _density_plot(data,
                  length=10,
                  breadth=8,
                  numerical_features=[]):
    """
    DESCRIPTION:
        Internal function to draw one density (KDE) plot per numerical
        feature, arranged in a grid that is three plots wide.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            Types: list of str

    RETURNS:
        str describing why the plot was skipped when no numerical
        features are given, otherwise None.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._density_plot(data=data,
                                          length=10,
                                          breadth=8,
                                          numerical_features=["feature1", "feature2"])

    """
    feature_count = len(numerical_features)
    if feature_count < 1:
        return "Plot type 'density' is not applicable as no numerical features are available."

    # Three plots per row; the last row may be partially filled.
    grid_rows = math.ceil(feature_count / 3)
    fig, axes = plt.subplots(grid_rows, 3, figsize=(length, breadth))
    axes = axes.flatten()
    fig.suptitle("Density plot", fontsize=14)

    for axis, feature in zip(axes, numerical_features):
        sns.kdeplot(data[feature], fill=True, color="green", alpha=0.6, ax=axis)

    # Hide the grid cells that did not receive a plot
    for spare_axis in axes[feature_count:]:
        spare_axis.axis('off')

    plt.tight_layout()
    plt.show()
    return None
970
+
971
@staticmethod
def _target_distribution(data,
                         target_column,
                         problem_type=None,
                         length=10,
                         breadth=8):
    """
    DESCRIPTION:
        Function visualizes the target distribution, as a count plot for
        a categorical target or a histogram with KDE for a numerical one.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Types: str

        problem_type:
            Optional Argument.
            Specifies the type of problem. When not specified, the target
            is treated as categorical if it has at most 20 distinct values.
            Permitted Values:
                * 'regression'
                * 'classification'
            Types: str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

    RETURNS:
        None

    EXAMPLES:
        >>> _FeatureExplore._target_distribution(data=data,
                                                 target_column="target",
                                                 problem_type="classification",
                                                 length=10,
                                                 breadth=8)

    """
    plt.figure(figsize=(length, breadth))

    # Decide how to treat the target: infer from cardinality when the
    # problem type is unknown, otherwise trust the declared problem type.
    if problem_type is None:
        treat_as_categorical = data[target_column].nunique() <= 20
    else:
        treat_as_categorical = bool(problem_type) and problem_type.lower() == 'classification'

    if treat_as_categorical:
        # Categorical Target
        sns.countplot(x=target_column,
                      data=data,
                      palette="coolwarm",
                      hue=target_column,
                      legend=False)
    else:
        # Numerical Target
        sns.histplot(data[target_column], kde=True, color="blue")

    plt.title("Target Distribution")
    plt.tight_layout()
    plt.show()
1028
+
1029
+
1030
@staticmethod
def _count_plot(data,
                length=10,
                breadth=8,
                categorical_features=[]):
    """
    DESCRIPTION:
        Internal function to draw one count plot per categorical feature,
        arranged in a grid that is three plots wide. Each plot shows at
        most the 25 most frequent categories of the feature.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Note:
                * This function does not use it; the figure height is
                  derived from the number of grid rows instead.
            Default Value: 8
            Types: int

        categorical_features:
            Optional Argument.
            Specifies the list of categorical features to be plotted.
            Types: list of str

    RETURNS:
        str describing why the plot was skipped when no categorical
        features are given, otherwise None.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._count_plot(data=data,
                                        length=10,
                                        breadth=8,
                                        categorical_features=["feature1", "feature2"])
    """
    feature_count = len(categorical_features)
    if feature_count < 1:
        return "Plot type 'count' is not applicable as no categorical features are available."

    # Three plots per row; each row gets 5 units of figure height.
    grid_rows = math.ceil(feature_count / 3)
    fig, axes = plt.subplots(grid_rows, 3, figsize=(length, grid_rows * 5))
    axes = axes.flatten()
    fig.suptitle("Count plot", fontsize=14)

    for axis, feature in zip(axes, categorical_features):
        # Keep only the 25 most frequent categories
        frequent = data[feature].value_counts().nlargest(25)

        sns.barplot(x=frequent.index,
                    y=frequent.values,
                    hue=frequent.index,
                    palette="coolwarm",
                    legend=False,
                    ax=axis)

        # Rotate labels for readability
        axis.tick_params(axis='x', rotation=90)

    # Hide the grid cells that did not receive a plot
    for spare_axis in axes[feature_count:]:
        spare_axis.axis('off')

    # Leave room between rows for the rotated category labels
    plt.subplots_adjust(hspace=1.5, wspace=0.3)
    plt.show()
1104
+
1105
@staticmethod
def _box_plot(data,
              length=10,
              breadth=8,
              numerical_features=None):
    """
    DESCRIPTION:
        Internal function to draw one box plot per numerical feature,
        arranged in a grid that is three plots wide.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            When not specified, it is treated as an empty list.
            Types: list of str

    RETURNS:
        str describing why the plot was skipped when no numerical
        features are given, otherwise None.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._box_plot(data=data,
                                      length=10,
                                      breadth=8,
                                      numerical_features=["feature1", "feature2"])

    """
    # None stands in for "no features" so no mutable default is shared.
    if numerical_features is None:
        numerical_features = []

    feature_count = len(numerical_features)
    if feature_count < 1:
        return "Plot type 'box' is not applicable as no numerical features are available."

    # Three plots per row; the last row may be partially filled.
    rows = math.ceil(feature_count / 3)
    fig, axes = plt.subplots(rows, 3, figsize=(length, breadth))
    axes = axes.flatten()
    fig.suptitle("Box plot", fontsize=14)

    for axis, feature in zip(axes, numerical_features):
        # No hue is used; the feature values go on the y axis
        sns.boxplot(y=data[feature], data=data, ax=axis, legend=False)

    # Adjust layout once, after every box plot is drawn, to prevent
    # label overlap (recomputing it per feature is redundant work).
    plt.tight_layout()

    # Hide the grid cells that did not receive a plot
    for spare_axis in axes[feature_count:]:
        spare_axis.axis('off')

    plt.show()