teradataml 20.0.0.4__py3-none-any.whl → 20.0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +182 -13
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +8 -13
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +16 -1
- teradataml/analytics/utils.py +60 -1
- teradataml/automl/__init__.py +290 -106
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +29 -10
- teradataml/automl/data_transformation.py +11 -0
- teradataml/automl/feature_engineering.py +64 -4
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +1 -1
- teradataml/clients/auth_client.py +12 -8
- teradataml/clients/keycloak_client.py +165 -0
- teradataml/common/constants.py +71 -26
- teradataml/common/exceptions.py +32 -0
- teradataml/common/messagecodes.py +28 -0
- teradataml/common/messages.py +13 -4
- teradataml/common/sqlbundle.py +3 -2
- teradataml/common/utils.py +345 -45
- teradataml/context/context.py +259 -93
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +1 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +1 -1
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pattern_matching_data.csv +11 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +21 -1
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/url_data.csv +10 -9
- teradataml/dataframe/copy_to.py +38 -27
- teradataml/dataframe/data_transfer.py +61 -45
- teradataml/dataframe/dataframe.py +1110 -132
- teradataml/dataframe/dataframe_utils.py +73 -27
- teradataml/dataframe/functions.py +1070 -9
- teradataml/dataframe/sql.py +750 -959
- teradataml/dbutils/dbutils.py +33 -13
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/hyperparameter_tuner/utils.py +4 -2
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/_base.py +12 -157
- teradataml/options/configure.py +24 -9
- teradataml/scriptmgmt/UserEnv.py +317 -39
- teradataml/scriptmgmt/lls_utils.py +456 -135
- teradataml/sdk/README.md +79 -0
- teradataml/sdk/__init__.py +4 -0
- teradataml/sdk/_auth_modes.py +422 -0
- teradataml/sdk/_func_params.py +487 -0
- teradataml/sdk/_json_parser.py +453 -0
- teradataml/sdk/_openapi_spec_constants.py +249 -0
- teradataml/sdk/_utils.py +236 -0
- teradataml/sdk/api_client.py +897 -0
- teradataml/sdk/constants.py +62 -0
- teradataml/sdk/modelops/__init__.py +98 -0
- teradataml/sdk/modelops/_client.py +406 -0
- teradataml/sdk/modelops/_constants.py +304 -0
- teradataml/sdk/modelops/models.py +2308 -0
- teradataml/sdk/spinner.py +107 -0
- teradataml/store/__init__.py +1 -1
- teradataml/table_operators/Apply.py +16 -1
- teradataml/table_operators/Script.py +20 -1
- teradataml/table_operators/query_generator.py +4 -21
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/internal_buffer.py +22 -2
- teradataml/utils/utils.py +0 -1
- teradataml/utils/validators.py +318 -58
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/METADATA +188 -14
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/RECORD +131 -84
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/zip-safe +0 -0
|
@@ -13,6 +13,11 @@
|
|
|
13
13
|
# Function Version: 1.0
|
|
14
14
|
# ##################################################################
|
|
15
15
|
|
|
16
|
+
# Python Libraries
|
|
17
|
+
import pandas as pd
|
|
18
|
+
import matplotlib.pyplot as plt
|
|
19
|
+
import seaborn as sns
|
|
20
|
+
import math
|
|
16
21
|
|
|
17
22
|
# Teradata libraries
|
|
18
23
|
from teradataml.dataframe.dataframe import DataFrame
|
|
@@ -22,6 +27,8 @@ from teradataml import OutlierFilterFit, OutlierFilterTransform
|
|
|
22
27
|
from teradataml.hyperparameter_tuner.utils import _ProgressBar
|
|
23
28
|
from teradataml.common.messages import Messages, MessageCodes
|
|
24
29
|
from teradataml import display as dp
|
|
30
|
+
from teradataml.utils.validators import _Validators
|
|
31
|
+
from teradataml.common.utils import UtilFuncs
|
|
25
32
|
|
|
26
33
|
def _is_terminal():
|
|
27
34
|
"""
|
|
@@ -54,7 +61,8 @@ class _FeatureExplore:
|
|
|
54
61
|
def __init__(self,
|
|
55
62
|
data=None,
|
|
56
63
|
target_column=None,
|
|
57
|
-
verbose=0
|
|
64
|
+
verbose=0,
|
|
65
|
+
task_type='regression'):
|
|
58
66
|
"""
|
|
59
67
|
DESCRIPTION:
|
|
60
68
|
Internal function initializes the data, target column for feature exploration.
|
|
@@ -79,14 +87,25 @@ class _FeatureExplore:
|
|
|
79
87
|
* 1: prints the execution steps of AutoML.
|
|
80
88
|
* 2: prints the intermediate data between the execution of each step of AutoML.
|
|
81
89
|
Types: int
|
|
90
|
+
|
|
91
|
+
task_type:
|
|
92
|
+
Optional Argument.
|
|
93
|
+
Specifies the task type of the data.
|
|
94
|
+
Default Value: 'regression'
|
|
95
|
+
Permitted Values:
|
|
96
|
+
* 'regression'
|
|
97
|
+
* 'classification'
|
|
98
|
+
Types: str
|
|
82
99
|
"""
|
|
83
100
|
self.data = data
|
|
84
101
|
self.target_column = target_column
|
|
85
102
|
self.verbose = verbose
|
|
86
103
|
self.terminal_print = _is_terminal()
|
|
87
104
|
self.style = self._common_style()
|
|
105
|
+
self.task_type = task_type
|
|
88
106
|
|
|
89
|
-
def _exploration(self
|
|
107
|
+
def _exploration(self,
|
|
108
|
+
**kwargs):
|
|
90
109
|
"""
|
|
91
110
|
DESCRIPTION:
|
|
92
111
|
Internal function performs following operations:
|
|
@@ -101,7 +120,9 @@ class _FeatureExplore:
|
|
|
101
120
|
categorical_columns= []
|
|
102
121
|
date_column_list = []
|
|
103
122
|
|
|
104
|
-
|
|
123
|
+
aml_phases = kwargs.get('automl_phases', None)
|
|
124
|
+
self._display_heading(phase=0,
|
|
125
|
+
automl_phases=aml_phases)
|
|
105
126
|
self._display_msg(msg='Feature Exploration started ...')
|
|
106
127
|
|
|
107
128
|
# Detecting numerical and categorical column
|
|
@@ -227,33 +248,22 @@ class _FeatureExplore:
|
|
|
227
248
|
data=gfc_out.result,
|
|
228
249
|
show_data=True)
|
|
229
250
|
|
|
230
|
-
def _target_column_details(self
|
|
231
|
-
plot_data = None):
|
|
251
|
+
def _target_column_details(self):
|
|
232
252
|
"""
|
|
233
253
|
DESCRIPTION:
|
|
234
254
|
Internal function displays the target column distribution of Target column/ Response column.
|
|
235
255
|
|
|
236
256
|
PARAMETERS:
|
|
237
|
-
|
|
238
|
-
Required Argument.
|
|
239
|
-
Specifies the input teradataml DataFrame for plotting distribution.
|
|
240
|
-
Types: teradataml Dataframe
|
|
257
|
+
None
|
|
241
258
|
"""
|
|
242
259
|
if self._check_visualization_libraries() and not _is_terminal():
|
|
243
|
-
|
|
244
|
-
import seaborn as sns
|
|
245
|
-
if plot_data is None:
|
|
246
|
-
target_data = self.data.select([self.target_column]).to_pandas()
|
|
247
|
-
else:
|
|
248
|
-
target_data = plot_data[[self.target_column]]
|
|
260
|
+
# Plotting target column distribution
|
|
249
261
|
self._display_msg(msg='\nTarget Column Distribution:',
|
|
250
262
|
show_data=True)
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
plt.ylabel('Density')
|
|
256
|
-
plt.show()
|
|
263
|
+
_FeatureExplore._visualize(data=self.data,
|
|
264
|
+
target_column=self.target_column,
|
|
265
|
+
plot_type=["target"],
|
|
266
|
+
problem_type=self.task_type)
|
|
257
267
|
|
|
258
268
|
def _check_visualization_libraries(self):
|
|
259
269
|
"""
|
|
@@ -308,6 +318,9 @@ class _FeatureExplore:
|
|
|
308
318
|
Pandas DataFrame containing, column name with outlier percentage.
|
|
309
319
|
|
|
310
320
|
"""
|
|
321
|
+
# Removing target column from the list of columns
|
|
322
|
+
column_list = [col for col in column_list if col != self.target_column]
|
|
323
|
+
|
|
311
324
|
# Performing outlier fit on the data for replacing outliers with NULL value
|
|
312
325
|
fit_params = {
|
|
313
326
|
"data" : self.data,
|
|
@@ -379,7 +392,8 @@ class _FeatureExplore:
|
|
|
379
392
|
|
|
380
393
|
def _display_heading(self,
|
|
381
394
|
phase=0,
|
|
382
|
-
progress_bar=None
|
|
395
|
+
progress_bar=None,
|
|
396
|
+
**kwargs):
|
|
383
397
|
"""
|
|
384
398
|
DESCRIPTION:
|
|
385
399
|
Internal function to print the phase of AutoML that
|
|
@@ -399,9 +413,14 @@ class _FeatureExplore:
|
|
|
399
413
|
RETURNS:
|
|
400
414
|
None.
|
|
401
415
|
"""
|
|
402
|
-
|
|
403
|
-
steps = ["1. Feature Exploration ->", " 2. Feature Engineering ->",
|
|
416
|
+
phases = ["1. Feature Exploration ->", " 2. Feature Engineering ->",
|
|
404
417
|
" 3. Data Preparation ->", " 4. Model Training & Evaluation"]
|
|
418
|
+
# Phases of automl
|
|
419
|
+
if kwargs.get('automl_phases', None) is not None:
|
|
420
|
+
steps = kwargs.get('automl_phases')
|
|
421
|
+
else:
|
|
422
|
+
steps = phases
|
|
423
|
+
|
|
405
424
|
# Check verbose > 0
|
|
406
425
|
if self.verbose > 0:
|
|
407
426
|
|
|
@@ -551,4 +570,599 @@ class _FeatureExplore:
|
|
|
551
570
|
# If data is provided
|
|
552
571
|
if data is not None:
|
|
553
572
|
# Print the data if terminal_print is True, else display the data
|
|
554
|
-
print(data) if self.terminal_print else display(data)
|
|
573
|
+
print(data) if self.terminal_print else display(data)
|
|
574
|
+
|
|
575
|
+
@staticmethod
def _visualize(data,
               target_column,
               plot_type="target",
               length=10,
               breadth=8,
               max_features=10,
               columns=None,
               problem_type=None):
    """
    DESCRIPTION:
        Internal function to visualize the data using various plots such as heatmap,
        pair plot, density, count plot, box plot, and target distribution.
        Requested plots are always drawn in the order: target, density, count,
        box, pair, heatmap. Plots that cannot be drawn for the selected features
        are reported with a printed message after all other plots are shown.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input teradataml DataFrame for plotting.
            Types: teradataml Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Note:
                * A non-numeric (or boolean) target is replaced by its integer
                  category codes before plotting. A numeric target is plotted
                  with its original values.
            Types: str

        plot_type:
            Optional Argument.
            Specifies the type of plot to be displayed.
            Default Value: "target"
            Permitted Values:
                * "heatmap": Displays a heatmap of feature correlations.
                * "pair": Displays a pair plot of features.
                * "density": Displays a density plot of features.
                * "count": Displays a count plot of categorical features.
                * "box": Displays a box plot of numerical features.
                * "target": Displays the distribution of the target variable.
                * "all": Displays all the plots.
            Types: str, list of str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        max_features:
            Optional Argument.
            Specifies the maximum number of features to be used for plotting.
            Default Value: 10
            Note:
                * It applies separately to categorical and numerical features.
            Types: int

        columns:
            Optional Argument.
            Specifies the column names to be used for plotting.
            When omitted, numerical features are chosen by absolute correlation
            with the target and categorical features by order of appearance.
            Types: str or list of string

        problem_type:
            Optional Argument.
            Specifies the type of problem.
            Permitted Values:
                * 'regression'
                * 'classification'
            Types: str

    RETURNS:
        None

    RAISES:
        TeradataMlException, ValueError, TypeError

    EXAMPLES:
        >>> _FeatureExplore._visualize(data=data,
                                       target_column="target",
                                       plot_type="heatmap",
                                       length=10,
                                       breadth=8,
                                       max_features=10,
                                       columns=["feature1", "feature2"],
                                       problem_type="regression")
    """
    # Appending arguments to list for validation
    arg_info_matrix = [
        ["data", data, False, (DataFrame)],
        ["target_column", target_column, False, (str)],
        ["plot_type", plot_type, True, (str, list), True,
         ["heatmap", "pair", "all", "density", "count", "box", "target"]],
        ["length", length, True, (int)],
        ["breadth", breadth, True, (int)],
        ["max_features", max_features, True, (int)],
        ["problem_type", problem_type, True, (str), True, ["regression", "classification"]],
        ["columns", columns, True, (str, list)],
    ]

    # Validate argument types
    _Validators._validate_function_arguments(arg_info_matrix)

    # Validate that data has the required columns
    _Validators._validate_dataframe_has_argument_columns(target_column, "target_column", data, "data")
    _Validators._validate_dataframe_has_argument_columns(columns, "columns", data, "data")

    # Convert the teradataml DataFrame to pandas; re-selecting the original
    # columns drops the extra column introduced by reset_index().
    cols = data.columns
    data = data.to_pandas().reset_index()
    data = data[cols]

    available_plots = ["target", "density", "count", "box", "pair", "heatmap"]

    # Encode the target only when its values are not already numeric.
    # The earlier check tested the type of the column *name*, which is always
    # str, so numeric regression targets were silently turned into codes.
    target_values = data[target_column]
    if pd.api.types.is_bool_dtype(target_values) or \
            not pd.api.types.is_numeric_dtype(target_values):
        data[target_column] = target_values.astype("category").cat.codes

    # Normalise the requested plots. "all" expands to every available plot,
    # whether it is passed on its own or inside a list.
    if plot_type is None:
        plot_type = "target"
    requested_plots = [name.lower() for name in UtilFuncs._as_list(plot_type)]
    if "all" in requested_plots:
        requested_plots = list(available_plots)

    # Identify numerical and categorical columns (target is never a feature)
    numerical_features = [col for col in data.select_dtypes(include=['number']).columns
                          if col != target_column]
    categorical_features = data.select_dtypes(include=['object', 'category']).columns.tolist()

    # Handle selected_columns input
    if columns:
        selected_columns = UtilFuncs._as_list(columns)
        selected_num_features = [col for col in selected_columns
                                 if col in numerical_features][:max_features]
        selected_cat_features = [col for col in selected_columns
                                 if col in categorical_features][:max_features]
    else:
        # Compute correlation with target and select top correlated numerical features
        if target_column in data.columns and pd.api.types.is_numeric_dtype(data[target_column]):
            correlations = data[numerical_features].corrwith(data[target_column]).abs()
            selected_num_features = correlations.nlargest(max_features).index.tolist()
        else:
            selected_num_features = numerical_features[:max_features]

        # Select top categorical features based on appearance
        selected_cat_features = categorical_features[:max_features]

    # One zero-argument callable per plot; each returns None on success or a
    # message explaining why the plot is not applicable.
    plotters = {
        "target": lambda: _FeatureExplore._target_distribution(data=data,
                                                               target_column=target_column,
                                                               problem_type=problem_type,
                                                               length=length,
                                                               breadth=breadth),
        "density": lambda: _FeatureExplore._density_plot(data=data,
                                                         length=length,
                                                         breadth=breadth,
                                                         numerical_features=selected_num_features),
        "count": lambda: _FeatureExplore._count_plot(data=data,
                                                     length=length,
                                                     breadth=breadth,
                                                     categorical_features=selected_cat_features),
        "box": lambda: _FeatureExplore._box_plot(data=data,
                                                 length=length,
                                                 breadth=breadth,
                                                 numerical_features=selected_num_features),
        "pair": lambda: _FeatureExplore._pair_plot(data=data,
                                                   target_column=target_column,
                                                   length=length,
                                                   breadth=breadth,
                                                   numerical_features=selected_num_features,
                                                   categorical_features=selected_cat_features),
        "heatmap": lambda: _FeatureExplore._heatmap(data=data,
                                                    target_column=target_column,
                                                    length=length,
                                                    breadth=breadth,
                                                    numerical_features=selected_num_features),
    }

    # Display univariate plots first, then bivariate, and finally multivariate
    irrelevant_plot = []
    for plot in sorted(requested_plots, key=available_plots.index):
        msg = plotters[plot]()
        if msg:
            irrelevant_plot.append(msg)

    # Report skipped plots only after every applicable plot has been shown
    for msg in irrelevant_plot:
        print(msg)
|
|
778
|
+
|
|
779
|
+
@staticmethod
def _heatmap(data,
             target_column,
             length=10,
             breadth=8,
             numerical_features=None):
    """
    DESCRIPTION:
        Internal function to visualize the data using heatmap.
        The heatmap shows the correlation matrix of the given numerical
        features together with the target column.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Types: str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            When omitted or empty, no plot is drawn.
            Types: list of str

    RETURNS:
        None when the plot is drawn, otherwise a str explaining why the
        plot is not applicable.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._heatmap(data=data,
                                     target_column="target",
                                     length=10,
                                     breadth=8,
                                     numerical_features=["feature1", "feature2"])

    """
    # None stands in for "no features" so that no list object is shared
    # between calls through the default argument.
    features = [] if numerical_features is None else list(numerical_features)
    if not features:
        return "Plot type 'heatmap' is not applicable as no numerical features are available."

    plt.figure(figsize=(length, breadth))
    sns.heatmap(data[features + [target_column]].corr(), annot=True, cmap="coolwarm")
    plt.title("Feature Correlation Heatmap")
    plt.show()
    return None
|
|
838
|
+
|
|
839
|
+
@staticmethod
def _pair_plot(data,
               target_column,
               length=10,
               breadth=8,
               numerical_features=None,
               categorical_features=None):
    """
    DESCRIPTION:
        Internal function to visualize the data using pair plot.
        The pair plot covers the given numerical features together with the
        target column.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Types: str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Note:
                * Accepted for signature parity with the other plot helpers;
                  it is not applied to the pair plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Note:
                * Accepted for signature parity with the other plot helpers;
                  it is not applied to the pair plot.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            When omitted or empty, no plot is drawn.
            Types: list of str

        categorical_features:
            Optional Argument.
            Specifies the list of categorical features. The target column is
            used as the hue only when it is listed here.
            Types: list of str

    RETURNS:
        None when the plot is drawn, otherwise a str explaining why the
        plot is not applicable.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._pair_plot(data=data,
                                       target_column="target",
                                       length=10,
                                       breadth=8,
                                       numerical_features=["feature1", "feature2"])

    """
    # None stands in for "no features" so that no list object is shared
    # between calls through the default arguments.
    num_features = [] if numerical_features is None else list(numerical_features)
    cat_features = [] if categorical_features is None else list(categorical_features)

    if not num_features:
        return "Plot type 'pair' is not applicable as no numerical features are available."

    hue_column = target_column if target_column in cat_features else None
    pair = sns.pairplot(data[num_features + [target_column]], hue=hue_column)

    # Add a centered title slightly above the grid
    pair.figure.suptitle("pair Plot", fontsize=16, y=1.02)
    plt.show()
    return None
|
|
906
|
+
|
|
907
|
+
@staticmethod
def _density_plot(data,
                  length=10,
                  breadth=8,
                  numerical_features=None):
    """
    DESCRIPTION:
        Internal function to visualize the data using density plot.
        One density curve is drawn per feature on a grid that is three
        subplots wide.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            When omitted or empty, no plot is drawn.
            Types: list of str

    RETURNS:
        None when the plot is drawn, otherwise a str explaining why the
        plot is not applicable.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._density_plot(data=data,
                                          length=10,
                                          breadth=8,
                                          numerical_features=["feature1", "feature2"])

    """
    # None stands in for "no features" so that no list object is shared
    # between calls through the default argument.
    features = [] if numerical_features is None else list(numerical_features)
    if not features:
        return "Plot type 'density' is not applicable as no numerical features are available."

    # Three plots per row; the last row may be partially filled
    rows = math.ceil(len(features) / 3)
    fig, axes = plt.subplots(rows, 3, figsize=(length, breadth))
    axes = axes.flatten()
    fig.suptitle("Density plot", fontsize=14)

    for axis, feature in zip(axes, features):
        sns.kdeplot(data[feature], fill=True, color="green", alpha=0.6, ax=axis)

    # Hide any empty subplots
    for axis in axes[len(features):]:
        axis.axis('off')

    plt.tight_layout()
    plt.show()
    return None
|
|
970
|
+
|
|
971
|
+
@staticmethod
def _target_distribution(data,
                         target_column,
                         problem_type=None,
                         length=10,
                         breadth=8):
    """
    DESCRIPTION:
        Function visualizes the target distribution.
        A count plot is drawn for a categorical target and a histogram with
        a density curve for a numerical target. When "problem_type" is not
        given, a target with at most 20 distinct values is treated as
        categorical.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Types: str

        problem_type:
            Optional Argument.
            Specifies the type of problem.
            Permitted Values:
                * 'regression'
                * 'classification'
            Types: str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

    """
    plt.figure(figsize=(length, breadth))

    # Decide how the target is drawn; distinct values are counted only when
    # the caller did not state the problem type.
    if problem_type is None:
        categorical_target = data[target_column].nunique() <= 20
    else:
        categorical_target = bool(problem_type) and problem_type.lower() == 'classification'

    if categorical_target:
        # Categorical Target
        sns.countplot(x=target_column,
                      data=data,
                      palette="coolwarm",
                      hue=target_column,
                      legend=False)
    else:
        # Numerical Target
        sns.histplot(data[target_column], kde=True, color="blue")

    plt.title("Target Distribution")
    plt.tight_layout()
    plt.show()
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
@staticmethod
def _count_plot(data,
                length=10,
                breadth=8,
                categorical_features=None):
    """
    DESCRIPTION:
        Internal function to visualize the data using count plot.
        One bar chart is drawn per feature, on a grid that is three subplots
        wide, showing at most the 25 most frequent categories of the feature.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Note:
                * Not applied here; the figure height is 5 units per
                  row of subplots.
            Default Value: 8
            Types: int

        categorical_features:
            Optional Argument.
            Specifies the list of categorical features to be plotted.
            When omitted or empty, no plot is drawn.
            Types: list of str

    RETURNS:
        None when the plot is drawn, otherwise a str explaining why the
        plot is not applicable.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._count_plot(data=data,
                                        length=10,
                                        breadth=8,
                                        categorical_features=["feature1", "feature2"])
    """
    # None stands in for "no features" so that no list object is shared
    # between calls through the default argument.
    features = [] if categorical_features is None else list(categorical_features)
    if not features:
        return "Plot type 'count' is not applicable as no categorical features are available."

    # Three plots per row; height grows with the number of rows
    rows = math.ceil(len(features) / 3)
    fig, axes = plt.subplots(rows, 3, figsize=(length, rows * 5))
    axes = axes.flatten()
    fig.suptitle("Count plot", fontsize=14)

    for axis, feature in zip(axes, features):
        # Keep only the 25 most frequent categories so the axis stays readable
        top_categories = data[feature].value_counts().nlargest(25)

        sns.barplot(x=top_categories.index,
                    y=top_categories.values,
                    hue=top_categories.index,
                    palette="coolwarm",
                    legend=False,
                    ax=axis)

        # Rotate labels for readability
        axis.tick_params(axis='x', rotation=90)

    # Hide empty subplots
    for axis in axes[len(features):]:
        axis.axis('off')

    # Adjust layout spacing
    plt.subplots_adjust(hspace=1.5, wspace=0.3)
    plt.show()
    return None
|
|
1104
|
+
|
|
1105
|
+
@staticmethod
def _box_plot(data,
              length=10,
              breadth=8,
              numerical_features=None):
    """
    DESCRIPTION:
        Internal function to visualize the data using box plot.
        One box plot is drawn per feature on a grid that is three subplots
        wide.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            When omitted or empty, no plot is drawn.
            Types: list of str

    RETURNS:
        None when the plot is drawn, otherwise a str explaining why the
        plot is not applicable.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._box_plot(data=data,
                                      length=10,
                                      breadth=8,
                                      numerical_features=["feature1", "feature2"])

    """
    # None stands in for "no features" so that no list object is shared
    # between calls through the default argument.
    features = [] if numerical_features is None else list(numerical_features)
    if not features:
        return "Plot type 'box' is not applicable as no numerical features are available."

    # Three plots per row; the last row may be partially filled
    rows = math.ceil(len(features) / 3)
    fig, axes = plt.subplots(rows, 3, figsize=(length, breadth))
    axes = axes.flatten()
    fig.suptitle("Box plot", fontsize=14)

    for axis, feature in zip(axes, features):
        # No hue is used; the feature values are plotted on the y axis
        sns.boxplot(y=data[feature], data=data, ax=axis, legend=False)

    # Adjust layout once, after every box plot is drawn, to prevent label overlap
    plt.tight_layout()

    # Hide any empty subplots
    for axis in axes[len(features):]:
        axis.axis('off')

    plt.show()
    return None
|