teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +183 -0
- teradataml/__init__.py +6 -3
- teradataml/_version.py +2 -2
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +275 -40
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +1 -0
- teradataml/analytics/json_parser/utils.py +17 -21
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +10 -2
- teradataml/analytics/table_operator/__init__.py +3 -2
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +62 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1553 -319
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +276 -319
- teradataml/automl/data_transformation.py +163 -81
- teradataml/automl/feature_engineering.py +402 -239
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +48 -51
- teradataml/automl/model_training.py +291 -189
- teradataml/catalog/byom.py +8 -8
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +48 -6
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +156 -120
- teradataml/common/messagecodes.py +6 -1
- teradataml/common/messages.py +3 -1
- teradataml/common/sqlbundle.py +1 -1
- teradataml/common/utils.py +103 -11
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +121 -31
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +10 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/glm_example.json +28 -1
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +21 -2
- teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
- teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
- teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
- teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/templates/open_source_ml.json +2 -1
- teradataml/data/teradataml_example.json +97 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +9 -4
- teradataml/dataframe/data_transfer.py +125 -64
- teradataml/dataframe/dataframe.py +575 -57
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +273 -90
- teradataml/dataframe/functions.py +339 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +740 -18
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +324 -18
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
- teradataml/options/__init__.py +16 -5
- teradataml/options/configure.py +39 -6
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +26 -19
- teradataml/scriptmgmt/lls_utils.py +120 -16
- teradataml/table_operators/Script.py +4 -5
- teradataml/table_operators/TableOperator.py +160 -26
- teradataml/table_operators/table_operator_util.py +88 -41
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/validators.py +41 -3
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
@@ -24,6 +24,7 @@ import teradataml.context.context as tdmlctx
 from collections import OrderedDict, namedtuple
 from sqlalchemy.sql import ClauseElement
 from teradataml import execute_sql
+from teradataml import GarbageCollector
 from teradataml.dataframe.sql import _MetaExpression
 from teradataml.dataframe.sql_interfaces import ColumnExpression
 from teradataml.dataframe.sql_functions import case
@@ -41,6 +42,7 @@ from teradataml.dataframe.indexer import _LocationIndexer
 from teradataml.common.aed_utils import AedUtils
 from teradataml.options.display import display
 from teradataml.dataframe.copy_to import copy_to_sql
+from teradataml.dataframe.row import _Row
 from teradataml.dataframe.setop import concat
 from teradataml.plot.plot import _Plot
 from teradataml.scriptmgmt.UserEnv import UserEnv
@@ -52,7 +54,9 @@ from teradatasql import OperationalError
 from teradataml.dataframe.window import Window
 from teradataml.dataframe.data_transfer import _DataTransferUtils
 from teradataml.common.bulk_exposed_utils import _validate_unimplemented_function
-from
+from teradataml.telemetry_utils.queryband import collect_queryband
+from teradataml.options.configure import configure
+from teradataml.utils.internal_buffer import _InternalBuffer

 # TODO use logger when available on master branch
 # logger = teradatapylog.getLogger()
@@ -150,6 +154,11 @@ class DataFrame():
         # This attribute added to add setter for columns property,
         # it is required when setting columns from groupby
         self._columns = None
+        # This attribute stores the internal AED query and avoid multiple
+        # calls to AED utility function aed_show_query()
+        self._aed_query = None
+        # This attribute stores the type of query stored in self._aed_query.
+        self._is_full_query = None

         # Property to determine if table is an ART table or not.
         self._is_art = None
@@ -417,6 +426,130 @@ class DataFrame():

         return df

+    def create_temp_view(self, name):
+        """
+        DESCRIPTION:
+            Creates a temporary view for session on the DataFrame.
+
+        PARAMETERS:
+            name:
+                Required Argument.
+                Specifies the name of the temporary view.
+                Type: str
+
+        RETURNS:
+            None
+
+        RAISES:
+            OperationalError (When view already exists).
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("dataframe", "admissions_train")
+            >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            38     yes  2.65  Advanced    Beginner         1
+            7      yes  2.33    Novice      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            17      no  3.83  Advanced    Advanced         1
+            34     yes  3.85  Advanced    Beginner         0
+            13      no  4.00  Advanced      Novice         1
+            32     yes  3.46  Advanced    Beginner         0
+            11      no  3.13  Advanced    Advanced         1
+            15     yes  4.00  Advanced    Advanced         1
+            36      no  3.00  Advanced      Novice         0
+
+            # Example 1: Create view 'new_admissions'.
+            >>> df.create_temp_view("new_admissions")
+            >>> new_df = DataFrame("new_admissions")
+            >>> new_df
+               masters   gpa     stats programming  admitted
+            id
+            38     yes  2.65  Advanced    Beginner         1
+            7      yes  2.33    Novice      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            17      no  3.83  Advanced    Advanced         1
+            34     yes  3.85  Advanced    Beginner         0
+            13      no  4.00  Advanced      Novice         1
+            32     yes  3.46  Advanced    Beginner         0
+            11      no  3.13  Advanced    Advanced         1
+            15     yes  4.00  Advanced    Advanced         1
+            36      no  3.00  Advanced      Novice         0
+        """
+        # Validating Arguments
+        arg_type_matrix = []
+        arg_type_matrix.append(["name", name, False, (str), True])
+        _Validators._validate_function_arguments(arg_type_matrix)
+
+        GarbageCollector._add_to_garbagecollector(name, TeradataConstants.TERADATA_VIEW)
+        UtilFuncs._create_view(name, self.show_query())
+
+    def materialize(self):
+        """
+        DESCRIPTION:
+            Method to materialize teradataml DataFrame into a database object.
+            Notes:
+                * DataFrames are materialized in either view/table/volatile table,
+                  which is decided and taken care by teradataml.
+                * If user wants to materialize object into specific database object
+                  such as table/volatile table, use 'to_sql()' or 'copy_to_sql()' or
+                  'fastload()' functions.
+                * Materialized object is garbage collected at the end of the session.
+
+        PARAMETERS:
+            None
+
+        RETURNS:
+            DataFrame
+
+        EXAMPLES:
+            >>> load_example_data("dataframe", "admissions_train")
+            >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            13      no  4.00  Advanced      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            5       no  3.44    Novice      Novice         0
+            19     yes  1.98  Advanced    Advanced         0
+            15     yes  4.00  Advanced    Advanced         1
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            36      no  3.00  Advanced      Novice         0
+            38     yes  2.65  Advanced    Beginner         1
+
+            # Example 1: Perform operations on teradataml DataFrame
+            # and materializeit in a database object.
+            >>> df2 = df.get([["id", "masters", "gpa"]])
+
+            # Initially table_name will be None.
+            >>> df2._table_name
+
+            >>> df2.materialize()
+               masters   gpa
+            id
+            15     yes  4.00
+            7      yes  2.33
+            22     yes  3.46
+            17      no  3.83
+            13      no  4.00
+            38     yes  2.65
+            26     yes  3.57
+            5       no  3.44
+            34     yes  3.85
+            40     yes  3.95
+
+            # After materialize(), view name will be assigned.
+            >>> df2._table_name
+            '"ALICE"."ml__select__172077355985236"'
+            >>>
+        """
+        self.__execute_node_and_set_table_name(self._nodeid, self._metaexpr)
+        return self
+
     @collect_queryband(queryband="DF_fillna")
     def fillna(self, value=None, columns=None, literal_value=False):
         """
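The two methods added above are thin conveniences over the lazy-evaluation machinery. A minimal end-to-end sketch, assuming an active Vantage connection (host and credentials below are placeholders):

```python
# Sketch of the new create_temp_view() and materialize() methods,
# based on the docstrings in this hunk; connection details are placeholders.
from teradataml import create_context, DataFrame, load_example_data

create_context(host="<host>", username="<user>", password="<password>")
load_example_data("dataframe", "admissions_train")

df = DataFrame("admissions_train")

# Register the DataFrame's SQL as a session-scoped view; the view is added
# to the garbage collector, so it is dropped when the session ends.
df.create_temp_view("new_admissions")
print(DataFrame("new_admissions"))

# Force the lazy node tree to execute and bind the result to a DB object;
# materialize() returns the same DataFrame, so calls can be chained.
df2 = df.get([["id", "masters", "gpa"]]).materialize()
print(df2._table_name)  # now populated, e.g. '"ALICE"."ml__select__..."'
```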
@@ -5017,7 +5150,7 @@ class DataFrame():
                     'median', 'var'

                 Acceptable formats for function(s) are
-                string, dictionary
+                string, dictionary, list of strings/functions/ColumnExpression or ColumnExpression.

                 Accepted combinations are:
                 1. String function name
@@ -5025,12 +5158,57 @@ class DataFrame():
                 3. Dictionary containing column name as key and
                    aggregate function name (string or list of
                    strings) as value
+                4. ColumnExpression built using the aggregate functions.
+                5. List of ColumnExpression built using the aggregate functions.
+
+                Note:
+                    * The name of the output columns are generated based on aggregate functions and column names.
+                      For Example,
+                          1. "func" passed as a string.
+                             >>> df.agg('mean')
+                             Assume that the column names of the dataframe are employee_no, first_name, marks, dob, joined_date.
+                             After the above operation, the output column names are:
+                             mean_employee_no, mean_marks, mean_dob, mean_joined_date
+
+                          2. "func" passed as a list of string functions.
+                             >>> df.agg(['min', 'sum'])
+                             Assume that the column names of the dataframe are employee_no, first_name, marks, dob, joined_date.
+                             After the above operation, the output column names are:
+                             min_employee_no, sum_employee_no, min_first_name, min_marks, sum_marks, min_dob, min_joined_date
+
+                          3. "func" passed as a dictionary containing column name as key and aggregate function name as value.
+                             >>> df.agg({'employee_no' : ['min', 'sum', 'var'], 'first_name' : ['min']})
+                             Output column names after the above operation are:
+                             min_employee_no, sum_employee_no, var_employee_no, min_first_name
+
+                          4. "func" passed as a ColumnExpression built using the aggregate functions.
+                             >>> df.agg(df.first_name.count())
+                             Output column name after the above operation is:
+                             count(first_name)
+
+                          5. "func" passed as a list of ColumnExpression built using the aggregate functions.
+                             >>> df.agg([df.employee_no.min(), df.first_name.count()])
+                             Output column names after the above operation are:
+                             min(employee_no), count(first_name)
+
+                    * On ColumnExpression or list of ColumnExpression alias() can be used to
+                      return the output columns with aliased name.
+                      For Example,
+                          >>> df.agg(df.first_name.count().alias("total_names"))
+                          Output column name after the above operation is:
+                          total_names
+
+                          >>> df.agg([df.joined_date.min().alias("min_date"), df.first_name.count().alias("total_names")])
+                          Output column names after the above operation are:
+                          min_date, total_names

         RETURNS:
             teradataml DataFrame object with operations
             mentioned in parameter 'func' performed on specified
             columns.
+
         RAISES:
             TeradataMLException
                 1. TDMLDF_AGGREGATE_FAILED - If operations on given columns
@@ -5072,8 +5250,8 @@ class DataFrame():
                    valid datatype.

                 Possible error message:
-                Invalid type(s) passed to argument 'func', should be:
-
+                Invalid type(s) passed to argument 'func', should be:
+                ['str, dict, ColumnExpression or list of values of type(s): str, ColumnExpression'].

         EXAMPLES :
             # Load the data to run the example.
@@ -5090,21 +5268,49 @@ class DataFrame():
            112          None       None   None  18/12/05
            >>>

-           #
+           # Get the minimum, sum and variance of employee number and minimum and mean of name,
+           # by passing dictionary of column names to string function/list of string functions as parameter.
            >>> df.agg({'employee_no' : ['min', 'sum', 'var'], 'first_name' : ['min', 'mean']})
-
-
+              min_employee_no  sum_employee_no  var_employee_no min_first_name
+           0              100              313        44.333333           abcd

-           #
+           # Get the minimum and sum of all the columns in the dataframe,
+           # by passing list of string functions as parameter.
            >>> df.agg(['min', 'sum'])
-
-
+              min_employee_no  sum_employee_no min_first_name min_marks sum_marks min_dob min_joined_date
+           0              100              313           abcd      None      None    None      1902-05-12

-           #
+           # Get the mean of all the columns in the dataframe, by passing string function as parameter.
            >>> df.agg('mean')
               mean_employee_no mean_marks mean_dob mean_joined_date
            0        104.333333       None     None         60/12/04

+           # Get the total names in the dataframe, by running count() on the "first_name"
+           # and passing ColumnExpression as parameter.
+           >>> df.agg(df.first_name.count())
+              count(first_name)
+           0                  2
+
+           # Get the minimum of joining date and total of names in the dataframe,
+           # by running min() on joined_date and count() on the "first_name"
+           # and passing list of ColumnExpression as parameter.
+           >>> df.agg([df.employee_no.min(), df.first_name.count()])
+              min(employee_no)  count(first_name)
+           0               100                  2
+
+           # Get the total names in the dataframe, by running count() on the "first_name" and
+           # use alias() to have the output column named as "total_names".
+           >>> df.agg(df.first_name.count().alias("total_names"))
+              total_names
+           0            2
+
+           # Get the minimum of joining date and total names in the dataframe,
+           # by running min() on joined_date and count() on the "first_name" and
+           # use alias() to have the output column named as "min_date" and "total_names".
+           >>> df.agg([df.joined_date.min().alias("min_date"), df.first_name.count().alias("total_names")])
+              min_date  total_names
+           0  02/12/05            2
+
            # Select only subset of columns from the DataFrame.
            >>> df1 = df.select(['employee_no', 'first_name', 'joined_date'])

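Taken together, the extended agg() now accepts ColumnExpressions alongside the existing string and dict forms. A compact sketch of all five accepted shapes, assuming a table with the columns used in the docstring above (employee_no, first_name, marks, dob, joined_date; the table name is illustrative):

```python
# Sketch of the extended agg() signatures from this hunk; "employee_info"
# is a hypothetical table with the columns named in the docstring above.
from teradataml import DataFrame

df = DataFrame("employee_info")

df.agg('mean')                                   # 1. string function name
df.agg(['min', 'sum'])                           # 2. list of string functions
df.agg({'employee_no': ['min', 'sum', 'var']})   # 3. dict: column -> function(s)
df.agg(df.first_name.count())                    # 4. single ColumnExpression
df.agg([df.employee_no.min(),                    # 5. list of ColumnExpressions,
        df.first_name.count().alias("total_names")])  # optionally aliased
```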
@@ -5145,9 +5351,9 @@ class DataFrame():
            raise TeradataMlException(Messages.get_message(MessageCodes.MISSING_ARGS, "func"),
                                      MessageCodes.MISSING_ARGS)

-        if not isinstance(func, str
+        if not isinstance(func, (str, list, dict, ColumnExpression)):
            raise TeradataMlException(Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE,
-
+                                     'func', ['str, dict, ColumnExpression or list of values of type(s): str, ColumnExpression']),
                                      MessageCodes.UNSUPPORTED_DATATYPE)

        return self._get_dataframe_aggregate(func)
@@ -5169,6 +5375,8 @@ class DataFrame():
                 3. Dictionary containing column name as key and
                    aggregate function name (string or list of
                    strings) as value
+                4. ColumnExpression built using the aggregate functions.
+                5. List of ColumnExpression built using the aggregate functions.

             **kwargs: Keyword arguments. Mainly used for Time Series Aggragates.

@@ -5345,7 +5553,9 @@ class DataFrame():
                result = self._check_numeric_overflow(agg_df)
        """
        try:
-
+            # Printing the DF will actually run underlying select query and
+            # will brought up numeric overflow if any. Only materializing won't work.
+            print(result_df)
            return False
        except TeradataMlException as tme:
            if "Numeric overflow occurred during computation" in str(tme):
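The comment added in this hunk records a subtle evaluation detail: materializing the frame never runs the SELECT, so the check prints it to force execution. A minimal standalone sketch of that pattern (function name is illustrative):

```python
# Sketch of the evaluation trick _check_numeric_overflow() relies on:
# printing a teradataml DataFrame executes the underlying SELECT, which is
# what surfaces a numeric overflow; materializing alone would not.
from teradataml.common.exceptions import TeradataMlException

def has_numeric_overflow(agg_df) -> bool:
    try:
        print(agg_df)  # triggers the SELECT against Vantage
        return False
    except TeradataMlException as tme:
        if "Numeric overflow occurred during computation" in str(tme):
            return True
        raise  # any other failure is not an overflow; re-raise it
```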
@@ -5481,18 +5691,73 @@ class DataFrame():
        EXAMPLES:
            self.__get_data_columns()
        """
-
-
-
+        if not self._table_name:
+            if not self._aed_query:
+                self.__generate_aed_query()
+            # TODO: Check the length of query and if it fails, create a view in catch block.
+            # Address in this JIRA: https://teradata-pe.atlassian.net/browse/ELE-6922
+            query = repr(self._metaexpr) + ' FROM ( ' + self._aed_query + ' ) as temp_table'
+        else:
+            query = repr(self._metaexpr) + ' FROM ' + self._table_name

        if self._orderby is not None:
            query += ' ORDER BY ' + self._orderby

+        query += ';'
        # Execute the query and get the results in a list.
        self.__data, self.__data_columns = UtilFuncs._execute_query(query=query, fetchWarnings=True)

        return self.__data, self.__data_columns

+    def __generate_aed_query(self, full_query=False):
+        """
+        DESCRIPTION:
+            Internal function to return underlying SQL for the teradataml
+            DataFrame. It is the same SQL that is used to view the data for
+            a teradataml DataFrame.
+
+        PARAMETERS:
+            full_query:
+                Optional Argument.
+                Specifies if the complete query for the dataframe should be returned.
+                When this parameter is set to True, query for the dataframe is returned
+                with respect to the base dataframe's table (from_table() or from_query())
+                or from the output tables of analytical functions (if there are any in the
+                workflow). This query may or may not be directly used to retrieve data
+                for the dataframe upon which the function is called.
+                When this parameter is not used, string returned is the query already used
+                or will be used to retrieve data for the teradataml DataFrame.
+                Default Value: False
+                Types: bool
+
+        RETURNS:
+            String representing the underlying SQL query for the teradataml DataFrame.
+
+        RAISES:
+            None.
+
+        EXAMPLES:
+            self.__generate_aed_query()
+        """
+        # Run aed call only when _aed_query is None or
+        # the type of current stored query (full/short) is not matching
+        # with asked query type.
+        if (not self._aed_query) or (not self._is_full_query == full_query):
+            node_id = self._nodeid
+
+            if isinstance(self, (DataFrameGroupBy, DataFrameGroupByTime)):
+                # If dataframe is either of type groupby or groupbytime
+                # then get its parent dataframe nodeid and return queries
+                # for the same
+                node_id = self._aed_utils._aed_get_parent_nodeids(self._nodeid)[0]

+            queries = self._aed_utils._aed_show_query(node_id, query_with_reference_to_top=full_query)
+            # Store query and type of query in class attributes to avoid future runs.
+            self._aed_query = queries[0][0]
+            self._is_full_query = full_query
+
+        return self._aed_query
+
     @collect_queryband(queryband="DF_select")
     def select(self, select_expression):
        """
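The caching that __generate_aed_query() introduces is a memoize-by-flag pattern: recompute only when the cache is empty or the cached flavor (full vs. short query) differs from the one requested. A self-contained sketch of just that pattern, with illustrative names rather than the library's internals:

```python
# Standalone sketch of the query-caching pattern added above; QueryCache
# and its builder callable are illustrative stand-ins, not teradataml API.
class QueryCache:
    def __init__(self, builder):
        self._builder = builder      # callable: (full_query: bool) -> str
        self._query = None           # cached SQL text
        self._is_full_query = None   # flavor of the cached text

    def get(self, full_query=False):
        # Rebuild only on a cache miss or a flavor mismatch.
        if self._query is None or self._is_full_query != full_query:
            self._query = self._builder(full_query)
            self._is_full_query = full_query
        return self._query

cache = QueryCache(lambda full: "SELECT * FROM base_t" if full else "SELECT * FROM t")
assert cache.get() == cache.get()            # second call hits the cache
assert cache.get(True) != cache.get(False)   # flavor change triggers a rebuild
```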
@@ -7032,6 +7297,97 @@ class DataFrame():
        if function_name is None or function_name in VANTAGE_FUNCTION_ARGTYPE_DEPENDENT_MAPPER:
            self.__execute_node_and_set_table_name(self._nodeid)
        return True
+
+    def _assign_udf(self, udf_expr):
+        """
+        DESCRIPTION:
+            Internal function for DataFrame.assign() to execute the udf using
+            Script Table Operator and create new column for teradataml DataFrame.
+
+        PARAMETER:
+            udf_expr:
+                Required Argument.
+                Specifies a dictionary of column name to UDF expressions.
+                Types: dict
+
+        RETURNS:
+            teradataml DataFrame
+
+        RAISES:
+            None.
+
+        EXAMPLES:
+            self._assign_udf(udf_expr)
+        """
+
+        df = self
+        env_name = None
+        # Create a dictionary of env_name to list of output columns to be run on that env.
+        env_mapper = OrderedDict()
+
+        exec_mode = 'REMOTE' if UtilFuncs._is_lake() else 'IN-DB'
+        if exec_mode == 'REMOTE':
+            if _InternalBuffer.get("auth_token") is None:
+                raise TeradataMlException(Messages.get_message(
+                    MessageCodes.FUNC_EXECUTION_FAILED, "'udf'", 'Authentication token is required to run udf. Set token using set_auth_token().'),
+                    MessageCodes.FUNC_EXECUTION_FAILED)
+            else:
+                for colname, col in udf_expr.items():
+                    env_name = UtilFuncs._get_env_name(col)
+                    # Store the env_name and its corresponding output column
+                    if env_name in env_mapper:
+                        env_mapper[env_name].append(colname)
+                    else:
+                        env_mapper[env_name] = [colname]
+        else:
+            env_mapper[env_name] = udf_expr.keys()
+
+        for env_name, cols in env_mapper.items():
+            # Create a dictionary of output columns to column type.
+            returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
+            # Store the udf functions
+            user_function = []
+            # Create a dictionary of output column name to udf name
+            columns_definitions = {}
+            # Create a dictionary of output column name to udf arguments
+            function_args = {}
+            for colname, col in udf_expr.items():
+                delimiter = col._delimiter
+                quotechar = col._quotechar
+                if colname in cols:
+                    user_function.append(col._udf)
+                    function_args[colname] = col._udf_args if col._udf_args else ()
+                    returns[colname] = col.type
+                    columns_definitions[colname] = col._udf.__name__
+
+            tbl_operators = _TableOperatorUtils([],
+                                                df,
+                                                "udf",
+                                                user_function,
+                                                exec_mode,
+                                                chunk_size=None,
+                                                returns=returns,
+                                                delimiter=delimiter,
+                                                quotechar=quotechar,
+                                                num_rows=1,
+                                                auth=None,
+                                                data_partition_column=None,
+                                                data_hash_column=None,
+                                                data_order_column=None,
+                                                is_local_order=None,
+                                                nulls_first=None,
+                                                sort_ascending=None,
+                                                charset=None,
+                                                env_name = env_name,
+                                                style = "csv",
+                                                function_args=function_args,
+                                                columns_definitions=columns_definitions,
+                                                output_type_converters={
+                                                    col_name: _Dtypes._teradata_type_to_python_type(col_type)
+                                                    for col_name, col_type in returns.items()})
+
+            df = tbl_operators.execute()
+        return df

     @collect_queryband(queryband="DF_assign")
     def assign(self, drop_columns=False, **kwargs):
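Before launching one table-operator call per environment, _assign_udf() groups output columns by the environment each UDF targets. The grouping itself is the standard group-by-key dict idiom; an equivalent minimal sketch with illustrative data:

```python
# Equivalent of the env_mapper grouping built in _assign_udf(), written with
# dict.setdefault(); the column-to-environment mapping here is illustrative.
from collections import OrderedDict

udf_envs = {"upper_stats": "test_udf", "len_sum": "test_udf", "flag": "other_env"}

env_mapper = OrderedDict()
for colname, env_name in udf_envs.items():
    env_mapper.setdefault(env_name, []).append(colname)

assert env_mapper == {"test_udf": ["upper_stats", "len_sum"],
                      "other_env": ["flag"]}
```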
@@ -7043,10 +7399,12 @@ class DataFrame():
             drop_columns:
                 Optional Argument.
                 If True, drop columns that are not specified in assign.
-
-                When DataFrame.assign() is run on DataFrame.groupby(), this argument
-
-
+                Notes:
+                    1. When DataFrame.assign() is run on DataFrame.groupby(), this argument
+                       is ignored. In such cases, all columns are dropped and only new columns
+                       and grouping columns are returned.
+                    2. Argument is ignored for UDF functions.
+
                 Default Value: False
                 Types: bool

@@ -7062,6 +7420,7 @@ class DataFrame():
                 * SQLAlchemy ClauseElements.
                   (See teradataml extension with SQLAlchemy in teradataml User Guide
                   and Function reference guide for more details)
+                * Function - udf.


         RETURNS:
@@ -7087,6 +7446,16 @@ class DataFrame():
                used, but the column used in such function must be a part of group by columns.
                See examples for teradataml extension with SQLAlchemy on using various
                functions with DataFrame.assign().
+            6. UDF expressions can run on both Vantage Cloud Lake leveraging Apply Table Operator
+               of Open Analytics Framework and Enterprise leveraging Vantage's Script Table Operator.
+            7. One can pass both regular expressions and udf expressions to this API.
+               However, regular expressions are computed first followed by udf expressions.
+               Hence the order of columns also maintained in same order.
+               Look at Example 18 to understand more.
+            8. While passing multiple udf expressions, one can not pass one column output
+               as another column input in the same ``assign`` call.
+            9. If user pass multiple udf expressions, delimiter and quotechar specified in
+               last udf expression are considered for processing.

        RAISES:
            1. ValueError - When a callable is passed as a value, or columns from different
@@ -7348,6 +7717,134 @@ class DataFrame():
            1  Advanced  2.886226  3.508750      84.21
            2    Novice  6.377775  3.559091      39.15
            >>>
+
+            #
+            # Executing user defined function (UDF) with assign()
+            #
+            # Example 15: Create two user defined functions to 'to_upper' and 'sum',
+            #             'to_upper' to get the values in 'accounts' to upper case and
+            #             'sum' to add length of string values in column 'accounts'
+            #             with column 'Feb' and store the result in Integer type column.
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            >>> from teradatasqlalchemy.types import INTEGER
+            >>> @udf(returns=INTEGER())
+            ... def sum(x, y):
+            ...     return len(x)+y
+            >>>
+            # Assign both Column Expressions returned by user defined functions
+            # to the DataFrame.
+            >>> res = df.assign(upper_stats = to_upper('accounts'), len_sum = sum('accounts', 'Feb'))
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime upper_stats  len_sum
+            accounts
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04    BLUE INC       98
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04     RED INC      207
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  YELLOW INC      100
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04   JONES LLC      209
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  ORANGE INC      220
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    ALPHA CO      218
+            >>>
+
+            # Example 16: Create a user defined function to add 4 to the 'datetime' column
+            #             and store the result in DATE type column.
+            >>> from teradatasqlalchemy.types import DATE
+            >>> import datetime
+            >>> @udf(returns=DATE())
+            ... def add_date(x, y):
+            ...     return (datetime.datetime.strptime(x, "%y/%m/%d")+datetime.timedelta(y)).strftime("%y/%m/%d")
+            >>>
+            # Assign the Column Expression returned by user defined function
+            # to the DataFrame.
+            >>> res = df.assign(new_date = add_date('datetime', 4))
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime  new_date
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04  17/01/08
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04  17/01/08
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04  17/01/08
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  17/01/08
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  17/01/08
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04  17/01/08
+            >>>
+
+            # Example 17: Create a user defined functions to 'to_upper' to get
+            #             the values in 'accounts' to upper case and create a
+            #             new column with a string literal value.
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            # Assign both expressions to the DataFrame.
+            >>> res = df.assign(upper_stats = to_upper('accounts'), new_col = 'string')
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime new_col upper_stats
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04  string    ALPHA CO
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04  string    BLUE INC
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  string  YELLOW INC
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04  string   JONES LLC
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04  string     RED INC
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  string  ORANGE INC
+            >>>
+
+            # Example 18: Create two user defined functions to 'to_upper' and 'sum'
+            #             and create new columns with string literal value and
+            #             arithmetic operation on column 'Feb'.
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            >>> from teradatasqlalchemy.types import INTEGER
+            >>> @udf(returns=INTEGER())
+            ... def sum(x, y):
+            ...     return len(x)+y
+            >>>
+            # Assign all expressions to the DataFrame.
+            >>> res = df.assign(upper_stats = to_upper('accounts'),new_col = 'abc',
+            ...                 len_sum = sum('accounts', 'Feb'), col_sum = df.Feb+1)
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime  col_sum new_col upper_stats  len_sum
+            accounts
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04     91.0     abc    BLUE INC       98
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    211.0     abc    ALPHA CO      218
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04    201.0     abc   JONES LLC      209
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04     91.0     abc  YELLOW INC      100
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04    211.0     abc  ORANGE INC      220
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04    201.0     abc     RED INC      207
+            >>>
+
+            # Example 19: Convert the values is 'accounts' column to upper case using a user
+            #             defined function on Vantage Cloud Lake.
+            # Create a Python 3.10.5 environment with given name and description in Vantage.
+            >>> env = create_env('test_udf', 'python_3.10.5', 'Test environment for UDF')
+            User environment 'test_udf' created.
+            >>>
+            # Create a user defined functions to 'to_upper' to get the values in upper case
+            # and pass the user env to run it on.
+            >>> from teradataml.dataframe.functions import udf
+            >>> @udf(env_name = env)
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            # Assign the Column Expression returned by user defined function
+            # to the DataFrame.
+            >>> df.assign(upper_stats = to_upper('accounts'))
+                          Feb    Jan    Mar    Apr  datetime upper_stats
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    ALPHA CO
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04    BLUE INC
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  YELLOW INC
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04   JONES LLC
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  ORANGE INC
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04     RED INC
+            >>>
         """
         # Argument validations
         awu_matrix = []
@@ -7393,13 +7890,35 @@ class DataFrame():
            msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
            raise TeradataMlException(msg, MessageCodes.TDMLDF_INFO_ERROR)

-
-
-
-
-
-
-
+        # Create a dictionary of column name to udf expressions and
+        # column name to normal/regular expressions.
+        udf_expr = {}
+        regular_expr = {}
+        for colname, col in kwargs.items():
+            # If value passed in kwargs is a ColumnExpression and is a udf, store it.
+            if isinstance(col, ColumnExpression) and col._udf:
+                udf_expr[colname] = col
+            else:
+                regular_expr[colname] = col
+        df = self
+
+        # If kwargs contains both regular and udf expressions, first create new columns
+        # from normal/regular expressions then on the output dataframe create new columns
+        # from udf expression.
+        if bool(regular_expr):
+            try:
+                (new_meta, new_nodeid) = df._generate_assign_metaexpr_aed_nodeid(drop_columns, **regular_expr)
+                df = df._create_dataframe_from_node(new_nodeid, new_meta, df._index_label)
+            except Exception as err:
+                errcode = MessageCodes.TDMLDF_INFO_ERROR
+                msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
+                raise TeradataMlException(msg, errcode) from err
+
+        if bool(udf_expr):
+            df = df._assign_udf(udf_expr)
+
+        return df
+

     @collect_queryband(queryband="DF_get")
     def get(self, key):
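The new assign() body is a two-phase pipeline: partition kwargs into regular and UDF-backed expressions, apply the regular ones first, then run the UDFs on the result (which is why note 7 above fixes the column order). The partitioning step in isolation, with illustrative stand-ins for the library's expression types:

```python
# Sketch of the kwargs partitioning assign() now performs; the is_udf
# predicate and the string-tagged values are illustrative stand-ins.
def split_exprs(kwargs, is_udf):
    udf_expr, regular_expr = {}, {}
    for name, expr in kwargs.items():
        (udf_expr if is_udf(expr) else regular_expr)[name] = expr
    return regular_expr, udf_expr

regular, udfs = split_exprs(
    {"col_sum": 1, "upper_stats": "udf:to_upper"},
    lambda e: isinstance(e, str) and e.startswith("udf:"))
assert regular == {"col_sum": 1} and udfs == {"upper_stats": "udf:to_upper"}
```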
@@ -10013,9 +10532,10 @@ class DataFrame():
        case_when_then = {}
        list_of_fracs = frac

-        # When stratify column is passed for sample
-        #
-
+        # When stratify column is passed for sample or when seed is passed for
+        # reproducibilty of result then
+        # perform TrainTestSplit for data sampling.
        if stratify_column is not None or seed is not None:
            # Local import TrainTestSplit function.
            from teradataml.analytics.sqle import TrainTestSplit
@@ -10029,7 +10549,16 @@ class DataFrame():
                                           train_size=list_of_fracs[0],
                                           test_size=list_of_fracs[1],
                                           stratify_column=stratify_column,
-                                          seed=seed
+                                          seed=seed,
+                                          persist=True,
+                                          display_table_name=False)
+
+            # Retrieve the table name from TrainTestSplit_out object.
+            table_name = TrainTestSplit_out.result._table_name
+
+            # Add the table to garbage collector.
+            table_added = GarbageCollector._add_to_garbagecollector(table_name)
+
            # Retrieve the sampled result and updated the column name and values
            # for backward compatibility.
            _sampled_df = TrainTestSplit_out.result
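With this change, sample() delegates to TrainTestSplit whenever a stratify column or a seed is supplied, persisting the split table and registering it for garbage collection so repeated reads of the result are stable. A hedged usage sketch, assuming the connection and example data from earlier:

```python
# Sketch of reproducible sampling via the new TrainTestSplit path; assumes
# an active connection and the admissions_train example table loaded above.
from teradataml import DataFrame

df = DataFrame("admissions_train")

# 75/25 split, stratified on 'admitted' and reproducible across runs
# because seed routes the call through TD_TrainTestSplit with persist=True.
sampled = df.sample(frac=[0.75, 0.25], stratify_column="admitted", seed=42)
```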
@@ -10133,10 +10662,10 @@ class DataFrame():

        # Make this non-lazy. Added this in order to fix https://teradata-pe.atlassian.net/browse/ELE-6368
        # Cannot use __execute_node_and_set_table_name because self points to original df.
-        # Hence, setting the
+        # Hence, setting the _table_name with _execute_node_return_db_object_name.

        df = self._create_dataframe_from_node(sample_node_id, new_metaexpr, self._index_label)
-        df.
+        df._table_name = df_utils._execute_node_return_db_object_name(sample_node_id, new_metaexpr)

        return df

@@ -10267,26 +10796,14 @@ class DataFrame():
                    where admitted > 0) as temp_table SAMPLE 0.9'

        """
+        # Argument validations
+        awu_matrix = []
+        awu_matrix.append(["full_query", full_query, False, (bool)])
+        # Validate argument types
+        _Validators._validate_function_arguments(awu_matrix)

        try:
-
-            awu_matrix = []
-            awu_matrix.append(["full_query", full_query, False, (bool)])
-            # Validate argument types
-            _Validators._validate_function_arguments(awu_matrix)
-
-            node_id = self._nodeid
-
-            if isinstance(self, (DataFrameGroupBy, DataFrameGroupByTime)):
-                # If dataframe is either of type groupby or groupbytime
-                # then get it's parent dataframe nodeid and return queries
-                # for the same
-                node_id = self._aed_utils._aed_get_parent_nodeids(self._nodeid)[0]
-
-            queries = self._aed_utils._aed_show_query(node_id, query_with_reference_to_top=full_query)
-
-            return queries[0][0]
-
+            return self.__generate_aed_query(full_query)
        except TeradataMlException:
            raise

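show_query() is now a thin wrapper over the cached __generate_aed_query(), so repeated calls with the same full_query flag reuse the stored SQL text. A short usage sketch built on the docstring's own filter/sample example:

```python
# Sketch of show_query() after the refactor; mirrors the docstring example
# above ('where admitted > 0 ... SAMPLE 0.9') and assumes an active session.
from teradataml import DataFrame

df = DataFrame("admissions_train")
subset = df[df.admitted > 0].sample(frac=0.9)

print(subset.show_query())                 # SQL as it will actually run
print(subset.show_query(full_query=True))  # expanded back to the base table
```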
@@ -10296,7 +10813,7 @@ class DataFrame():
        except Exception as err:
            errcode = MessageCodes.TDMLDF_INFO_ERROR
            msg = Messages.get_message(errcode)
-           raise TeradataMlException(msg, errcode) from err
+            raise TeradataMlException(msg, errcode) from err

    @collect_queryband(queryband="DF_mapRow")
    def map_row(self,
@@ -13755,7 +14272,7 @@ class DataFrame():
                Types: int OR NoneType

        RETURNS:
-           iterator, an object to iterate over
+           iterator, an object to iterate over row in the DataFrame.

        RAISES:
            None
@@ -13804,9 +14321,10 @@ class DataFrame():
        cur = execute_sql(query)

        if name:
+            columns = [column[0] for column in cur.description]
            for rec in cur:
-
-                yield
+                row = _Row(columns=columns, values=rec)
+                yield row
        else:
            for rec in cur:
                yield rec
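The _Row wiring above backs the DataFrame row iterator: with a name set, each cursor record is wrapped in a _Row built from the cursor's column descriptions and values. A hedged sketch, assuming the iterator is DataFrame.itertuples() with name/num_rows parameters (the method name is not shown in this hunk):

```python
# Sketch of row iteration backed by the new _Row class; assumes the public
# entry point is DataFrame.itertuples(name=..., num_rows=...), which this
# hunk's context suggests but does not show directly.
from teradataml import DataFrame

df = DataFrame("admissions_train")
for row in df.itertuples(name="Row", num_rows=3):
    print(row)  # a _Row combining cursor column names with the record values
```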
@@ -16626,7 +17144,7 @@ class _TDUAF(DataFrame):
        # UAF Functions do not accept double quotes.
        db_name = UtilFuncs._extract_db_name(table_name)
        if db_name:
-            table_name = "{}.{}".format(db_name, UtilFuncs._extract_table_name(table_name))
+            table_name = '"{}"."{}"'.format(db_name, UtilFuncs._extract_table_name(table_name))
        else:
            table_name = UtilFuncs._extract_table_name(table_name)

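The fix quotes each part of the two-part name separately, so database or table names containing mixed case or special characters survive round-tripping. A quick standalone check of the new format string:

```python
# Quick check of the new two-part quoting for UAF table names; the sample
# names are illustrative.
db, tbl = "MyDB", "my table"
assert '"{}"."{}"'.format(db, tbl) == '"MyDB"."my table"'   # new behavior
assert "{}.{}".format(db, tbl) == "MyDB.my table"           # old, unquoted
```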