teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (108)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +71 -0
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +51 -24
  6. teradataml/analytics/json_parser/utils.py +11 -17
  7. teradataml/automl/__init__.py +103 -48
  8. teradataml/automl/data_preparation.py +55 -37
  9. teradataml/automl/data_transformation.py +131 -69
  10. teradataml/automl/feature_engineering.py +117 -185
  11. teradataml/automl/feature_exploration.py +9 -2
  12. teradataml/automl/model_evaluation.py +13 -25
  13. teradataml/automl/model_training.py +214 -75
  14. teradataml/catalog/model_cataloging_utils.py +1 -1
  15. teradataml/clients/auth_client.py +133 -0
  16. teradataml/common/aed_utils.py +3 -2
  17. teradataml/common/constants.py +11 -6
  18. teradataml/common/garbagecollector.py +5 -0
  19. teradataml/common/messagecodes.py +3 -1
  20. teradataml/common/messages.py +2 -1
  21. teradataml/common/utils.py +6 -0
  22. teradataml/context/context.py +49 -29
  23. teradataml/data/advertising.csv +201 -0
  24. teradataml/data/bank_marketing.csv +11163 -0
  25. teradataml/data/bike_sharing.csv +732 -0
  26. teradataml/data/boston2cols.csv +721 -0
  27. teradataml/data/breast_cancer.csv +570 -0
  28. teradataml/data/customer_segmentation_test.csv +2628 -0
  29. teradataml/data/customer_segmentation_train.csv +8069 -0
  30. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
  31. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
  32. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
  33. teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
  34. teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
  35. teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
  36. teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
  37. teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
  38. teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
  39. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
  40. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
  41. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
  42. teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
  43. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
  44. teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
  45. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
  46. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
  47. teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
  48. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
  49. teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
  50. teradataml/data/glm_example.json +28 -1
  51. teradataml/data/housing_train_segment.csv +201 -0
  52. teradataml/data/insect2Cols.csv +61 -0
  53. teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
  54. teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
  55. teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
  56. teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
  57. teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
  58. teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
  59. teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
  60. teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
  61. teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
  62. teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
  63. teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
  64. teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
  65. teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
  66. teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
  67. teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
  68. teradataml/data/kmeans_example.json +5 -0
  69. teradataml/data/kmeans_table.csv +10 -0
  70. teradataml/data/onehot_encoder_train.csv +4 -0
  71. teradataml/data/openml_example.json +29 -0
  72. teradataml/data/scale_attributes.csv +3 -0
  73. teradataml/data/scale_example.json +52 -1
  74. teradataml/data/scale_input_part_sparse.csv +31 -0
  75. teradataml/data/scale_input_partitioned.csv +16 -0
  76. teradataml/data/scale_input_sparse.csv +11 -0
  77. teradataml/data/scale_parameters.csv +3 -0
  78. teradataml/data/scripts/deploy_script.py +20 -1
  79. teradataml/data/scripts/sklearn/sklearn_fit.py +23 -27
  80. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +20 -28
  81. teradataml/data/scripts/sklearn/sklearn_function.template +13 -18
  82. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
  83. teradataml/data/scripts/sklearn/sklearn_neighbors.py +18 -27
  84. teradataml/data/scripts/sklearn/sklearn_score.py +20 -29
  85. teradataml/data/scripts/sklearn/sklearn_transform.py +30 -38
  86. teradataml/data/teradataml_example.json +77 -0
  87. teradataml/data/ztest_example.json +16 -0
  88. teradataml/dataframe/copy_to.py +8 -3
  89. teradataml/dataframe/data_transfer.py +120 -61
  90. teradataml/dataframe/dataframe.py +102 -17
  91. teradataml/dataframe/dataframe_utils.py +47 -9
  92. teradataml/dataframe/fastload.py +272 -89
  93. teradataml/dataframe/sql.py +84 -0
  94. teradataml/dbutils/dbutils.py +2 -2
  95. teradataml/lib/aed_0_1.dll +0 -0
  96. teradataml/opensource/sklearn/_sklearn_wrapper.py +102 -55
  97. teradataml/options/__init__.py +13 -4
  98. teradataml/options/configure.py +27 -6
  99. teradataml/scriptmgmt/UserEnv.py +19 -16
  100. teradataml/scriptmgmt/lls_utils.py +117 -14
  101. teradataml/table_operators/Script.py +2 -3
  102. teradataml/table_operators/TableOperator.py +58 -10
  103. teradataml/utils/validators.py +40 -2
  104. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/METADATA +78 -6
  105. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/RECORD +108 -90
  106. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/WHEEL +0 -0
  107. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/top_level.txt +0 -0
  108. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/zip-safe +0 -0
teradataml/automl/data_preparation.py

@@ -28,9 +28,14 @@ from teradataml import OutlierFilterFit, OutlierFilterTransform
  from teradataml import RoundColumns, TeradataMlException
  from teradataml import ScaleFit, ScaleTransform
  from teradataml import TrainTestSplit, UtilFuncs, TeradataConstants
+ from teradataml.common.garbagecollector import GarbageCollector
  from teradataml.common.messages import Messages, MessageCodes
  from teradataml.utils.validators import _Validators
+ from teradataml import INTEGER

+ # Control Randomnes
+ random.seed(42)
+ np.random.seed(42)

  class _DataPreparation:

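Context for the hunk above: the seeding added at module level replaces the per-instance seeding removed in the @@ -106,8 hunk below, pinning Python's and NumPy's global random state once per import. A trivial standalone sketch of the effect (standard library and numpy only; not the package's code):

    import random
    import numpy as np

    # Seeding the global generators once, at import time, makes every
    # downstream call that relies on them reproducible across runs.
    random.seed(42)
    np.random.seed(42)
    print(random.random())   # same value on every run
    print(np.random.rand())  # same value on every run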
@@ -54,7 +59,7 @@ class _DataPreparation:
  Types: teradataml Dataframe

  target_column:
- Required Arugment.
+ Required Argument.
  Specifies the name of the target column in "data".
  Types: str

@@ -69,22 +74,22 @@ class _DataPreparation:
  Types: int

  excluded_columns:
- Required Arugment.
+ Required Argument.
  Specifies the columns should be excluded from any processing.
  Types: str or list of strings (str)

  custom_data:
- Optional Arugment.
+ Optional Argument.
  Specifies json object containing user customized input.
  Types: json object

  data_transform_dict:
- Optional Arugment.
+ Optional Argument.
  Specifies the parameters for data transformation.
  Types: dict

  task_type:
- Required Arugment.
+ Required Argument.
  Specifies the task type for AutoML, whether to apply regresion OR classification
  on the provived dataset.
  Default Value: "Regression"
@@ -106,8 +111,6 @@ class _DataPreparation:
  self._scale_method_cls = "RANGE"
  self.table_name_mapping = {}

- random.seed(42)
- np.random.seed(42)
  self.data_types = {key: value for key, value in self.data._column_names_and_types}


@@ -123,7 +126,7 @@ class _DataPreparation:

  PARAMETERS:
  auto:
- Optional Arugment.
+ Optional Argument.
  Specifies whether to run AutoML in custom mode or auto mode.
  When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
  Default Value: True
@@ -163,6 +166,10 @@ class _DataPreparation:
  train = self._data_sampling(train)
  self.progress_bar.update()

+ # Sorting the data based on id to
+ # remove any shuffling done by sampling
+ train = train.sort_values(by='id')
+
  # Performing feature selection using lasso followed by scaling
  self._feature_selection_Lasso(train, test)
  self._scaling_features(feature_selection_mtd="lasso")
@@ -375,6 +382,8 @@ class _DataPreparation:
  "persist" : True
  }
  self.train_df = OutlierFilterTransform(**transform_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.train_df._table_name)

  def _outlier_processing(self):
  """
@@ -400,6 +409,9 @@ class _DataPreparation:
  target_columns=columns_to_drop_rows
  replacement_strategy = "DELETE"
  self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+ self._display_msg(msg="Sample of training dataset after removing outlier rows:",
+ data=self.train_df,
+ progress_bar=self.progress_bar)

  # Imputing Median value in place of outliers
  if len(columns_to_impute) != 0:
@@ -409,6 +421,13 @@ class _DataPreparation:
  target_columns=columns_to_impute
  replacement_strategy = "MEDIAN"
  self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+ self._display_msg(msg="Sample of training dataset after performing MEDIAN inplace:",
+ data=self.train_df,
+ progress_bar=self.progress_bar)
+
+ if len(columns_to_drop_rows) == 0 and len(columns_to_impute) == 0:
+ self._display_msg(msg='Analysis indicates not outlier in the dataset. No Action Taken.',
+ progress_bar=self.progress_bar)

  end_time = time.time()
  self._display_msg("Time Taken by Outlier processing: {:.2f} sec ".format(end_time - start_time),
@@ -557,10 +576,6 @@ class _DataPreparation:
  progress_bar=self.progress_bar,
  show_data=True)

- if self.is_classification_type():
- train_df[self.target_column] = train_df[self.target_column].astype('int')
- test_df[self.target_column] = test_df[self.target_column].astype('int')
-
  # Pushing the data in database
  self.copy_dataframe_to_sql(train_df, test_df, 'pca')

@@ -590,7 +605,7 @@ class _DataPreparation:
  # Required imports for RFE
  from sklearn.feature_selection import RFECV
  from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
- from sklearn.model_selection import StratifiedKFold,KFold
+ from sklearn.model_selection import StratifiedKFold

  start_time = time.time()
  # Regression
@@ -606,9 +621,9 @@ class _DataPreparation:
  score = 'r2' if not self.is_classification_type() \
  else 'roc_auc' if self.data.drop_duplicate(self.target_column).size == 2 else 'f1_macro'

- # Instantiate StratifiedKFold with shuffling for classification
+ # # Instantiate StratifiedKFold with shuffling for classification
  cv = folds if not self.is_classification_type() \
- else StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
+ else StratifiedKFold(n_splits=folds, shuffle=False)

  # Define the RFE with cross-validation
  rfecv = RFECV(rf, cv=cv, scoring=score)
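Context for the RFE hunk above: the change drops shuffling from StratifiedKFold, so feature selection depends only on row order rather than a randomized split. A minimal, self-contained sklearn sketch of that setup (synthetic data via make_classification; the estimator, fold count, and scoring here are illustrative, not the package's own values):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import RFECV
    from sklearn.model_selection import StratifiedKFold

    # Synthetic stand-in for the AutoML training frame
    X, y = make_classification(n_samples=200, n_features=10, random_state=42)

    rf = RandomForestClassifier(random_state=42)
    # shuffle=False: folds are determined purely by row order, so repeated runs
    # over identically ordered data produce identical feature rankings
    cv = StratifiedKFold(n_splits=5, shuffle=False)
    rfecv = RFECV(rf, cv=cv, scoring="roc_auc")
    rfecv.fit(X, y)
    print(rfecv.n_features_)   # number of selected features
    print(rfecv.support_)      # boolean mask of selected columns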
@@ -682,7 +697,8 @@ class _DataPreparation:
  from sklearn.model_selection import GridSearchCV
  from sklearn.linear_model import Lasso
  from sklearn.linear_model import LogisticRegression
-
+ from sklearn.model_selection import StratifiedKFold
+
  # Getting the value k in k-fold cross-validation
  num_folds = self._num_of_folds(train.shape[0])

@@ -696,15 +712,21 @@ class _DataPreparation:
  scoring_metric = 'roc_auc'
  else:
  scoring_metric = 'f1_macro'
- estimator = LogisticRegression(penalty='l1', solver='liblinear', multi_class='auto')
+ estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=42)
  parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
  else:
- estimator = Lasso()
+ estimator = Lasso(random_state=42)
  parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
  scoring_metric = "r2"

+ if self.is_classification_type():
+ cv = StratifiedKFold(n_splits=5, shuffle=False)
+ else:
+ cv = num_folds
+
  # Applying hyperparameter tuning and optimizing score
- hyperparameter_search = GridSearchCV(estimator, parameters, cv=num_folds, scoring=scoring_metric, verbose=0)
+ hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
+ scoring=scoring_metric, verbose=0)

  # Fitting the best result from hyperparameter
  hyperparameter_search.fit(train_features, train_target)
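Context for the hyperparameter-search hunk above: the classification estimator moves to saga/L2, the CV splitter becomes a deterministic StratifiedKFold, and refit is made explicit. A minimal sklearn sketch of the resulting search (synthetic data; the parameter grid is shortened and the fold count is illustrative):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, StratifiedKFold

    X, y = make_classification(n_samples=300, n_features=8, random_state=42)

    estimator = LogisticRegression(solver="saga", penalty="l2", random_state=42)
    parameters = {"C": [0.001, 0.01, 0.1, 1, 10], "max_iter": [100, 500]}
    # Deterministic folds (shuffle=False); refit=True re-trains the best model
    # on the full training data so best_estimator_ is ready to use afterwards
    cv = StratifiedKFold(n_splits=5, shuffle=False)
    search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
                          scoring="roc_auc", verbose=0)
    search.fit(X, y)
    print(search.best_params_, round(search.best_score_, 4))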
@@ -775,8 +797,12 @@ class _DataPreparation:
  self.table_name_mapping['{}_test'.format(prefix)] = test_table_name

  # Pushing data into database
- copy_to_sql(df=train, table_name=train_table_name, if_exists="replace")
- copy_to_sql(df=test, table_name=test_table_name, if_exists="replace")
+ if self.is_classification_type():
+ copy_to_sql(df=train, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+ copy_to_sql(df=test, table_name=test_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+ else:
+ copy_to_sql(df=train, table_name=train_table_name, if_exists="replace")
+ copy_to_sql(df=test, table_name=test_table_name, if_exists="replace")


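Context for the copy_to_sql hunk above: instead of casting the label column with pandas astype('int') after the fact (the helper removed in the hunks below), the new code declares the column's database type at load time. A minimal sketch, assuming an active teradataml connection; the table name and sample frame are illustrative, while the types argument and INTEGER type are the ones used in the diff:

    import pandas as pd
    from teradataml import copy_to_sql, INTEGER

    train = pd.DataFrame({"id": [1, 2, 3], "target": [0, 1, 1]})
    # Declaring the target column as INTEGER at load time keeps classification
    # labels integral in the database instead of converting them afterwards.
    copy_to_sql(df=train, table_name="automl_lasso_train_demo",
                if_exists="replace", types={"target": INTEGER})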
@@ -901,9 +927,6 @@ class _DataPreparation:
  else:
  self._display_msg(msg="No columns to scale.",
  progress_bar=self.progress_bar)
-
- if self.is_classification_type():
- train, test = self._bigint_to_int(train, test)

  self.copy_dataframe_to_sql(train, test, feature_selection_mtd)

@@ -911,15 +934,6 @@ class _DataPreparation:
  self._display_msg(msg="Total time taken by feature scaling: {:.2f} sec".format( end_time - start_time),
  progress_bar=self.progress_bar,
  show_data=True)
-
- def _bigint_to_int(self, train, test):
- tr = train.to_pandas()
- tr[self.target_column] = tr[self.target_column].astype('int')
-
- ts = test.to_pandas()
- ts[self.target_column] = ts[self.target_column].astype('int')
-
- return tr, ts

  def _set_custom_scaling_method(self):
  """
@@ -987,7 +1001,11 @@ class _DataPreparation:
  "precision_digit" : 4,
  "accumulate" : accumulate_columns,
  "persist" : True}
-
- obj = RoundColumns(**fit_params).result
- df = obj.to_pandas()
- return df.reset_index()
+
+ transform_output = RoundColumns(**fit_params).result
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+ cols = transform_output.columns
+ df = transform_output.to_pandas().reset_index()
+ df = df[cols]
+ return df
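Context for the RoundColumns hunk above: the persisted result table is registered with the garbage collector (the same pattern as the OutlierFilterTransform hunk earlier), and the pandas round-trip now reselects the original column list. A minimal pandas-only sketch of the second point (sample data is illustrative; to_pandas typically returns the index column as the frame's index, so reset_index brings it back as a leading column and the reselect keeps the layout stable):

    import pandas as pd

    # Stand-in for transform_output.to_pandas(): the index column ("id" here)
    # comes back as the pandas index rather than a regular column.
    result = pd.DataFrame({"price": [9.99, 4.50]}, index=pd.Index([2, 1], name="id"))
    cols = ["id", "price"]        # column order reported by the teradataml result

    df = result.reset_index()     # index returns as an ordinary column
    df = df[cols]                 # keep only the expected columns, in order
    print(df)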