teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of teradataml has been flagged as possibly problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +306 -0
- teradataml/__init__.py +10 -3
- teradataml/_version.py +1 -1
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +299 -16
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +13 -3
- teradataml/analytics/json_parser/utils.py +13 -6
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +11 -2
- teradataml/analytics/table_operator/__init__.py +4 -3
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +66 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1502 -323
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +247 -307
- teradataml/automl/data_transformation.py +32 -12
- teradataml/automl/feature_engineering.py +325 -86
- teradataml/automl/model_evaluation.py +44 -35
- teradataml/automl/model_training.py +122 -153
- teradataml/catalog/byom.py +8 -8
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/__init__.py +2 -1
- teradataml/common/constants.py +72 -0
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +152 -120
- teradataml/common/messagecodes.py +11 -2
- teradataml/common/messages.py +4 -1
- teradataml/common/sqlbundle.py +26 -4
- teradataml/common/utils.py +225 -14
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +82 -2
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/dataframe_example.json +27 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scripts/deploy_script.py +1 -1
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
- teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
- teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -1
- teradataml/data/teradataml_example.json +20 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/dataframe/copy_to.py +1 -1
- teradataml/dataframe/data_transfer.py +5 -3
- teradataml/dataframe/dataframe.py +1002 -201
- teradataml/dataframe/fastload.py +3 -3
- teradataml/dataframe/functions.py +867 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +840 -33
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +878 -34
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
- teradataml/opensource/_lightgbm.py +950 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
- teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
- teradataml/options/__init__.py +9 -23
- teradataml/options/configure.py +42 -4
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +13 -9
- teradataml/scriptmgmt/lls_utils.py +77 -23
- teradataml/store/__init__.py +13 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2223 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/store/vector_store/__init__.py +1586 -0
- teradataml/table_operators/Script.py +2 -2
- teradataml/table_operators/TableOperator.py +106 -20
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +102 -56
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/validators.py +34 -2
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
@@ -19,6 +19,7 @@ import pandas as pd
 import re
 import sqlalchemy
 import sys
+import urllib.parse
 import teradataml.context.context as tdmlctx
 
 from collections import OrderedDict, namedtuple
@@ -42,7 +43,9 @@ from teradataml.dataframe.indexer import _LocationIndexer
 from teradataml.common.aed_utils import AedUtils
 from teradataml.options.display import display
 from teradataml.dataframe.copy_to import copy_to_sql
+from teradataml.dataframe.row import _Row
 from teradataml.dataframe.setop import concat
+from teradataml.dbutils.dbutils import list_td_reserved_keywords
 from teradataml.plot.plot import _Plot
 from teradataml.scriptmgmt.UserEnv import UserEnv
 from teradataml.utils.dtypes import _Dtypes, _ListOf, _TupleOf
@@ -53,7 +56,10 @@ from teradatasql import OperationalError
 from teradataml.dataframe.window import Window
 from teradataml.dataframe.data_transfer import _DataTransferUtils
 from teradataml.common.bulk_exposed_utils import _validate_unimplemented_function
-from […]
+from teradataml.telemetry_utils.queryband import collect_queryband
+from teradataml.options.configure import configure
+from teradataml.utils.internal_buffer import _InternalBuffer
+from teradataml.common.constants import OutputStyle
 
 # TODO use logger when available on master branch
 # logger = teradatapylog.getLogger()
@@ -151,6 +157,11 @@ class DataFrame():
         # This attribute added to add setter for columns property,
         # it is required when setting columns from groupby
         self._columns = None
+        # This attribute stores the internal AED query and avoid multiple
+        # calls to AED utility function aed_show_query()
+        self._aed_query = None
+        # This attribute stores the type of query stored in self._aed_query.
+        self._is_full_query = None
 
         # Property to determine if table is an ART table or not.
         self._is_art = None
@@ -221,7 +232,7 @@ class DataFrame():
 
                 self._nodeid = self._aed_utils._aed_query(self._query, temp_table_name)
             else:
-                if inspect.stack()[1][3] not in ['_from_node', '__init__']:
+                if inspect.stack()[1][3] not in ['_from_node', '__init__', 'alias']:
                     raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
                                               MessageCodes.TDMLDF_CREATE_FAIL)
 
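The guard above allows DataFrame construction only from whitelisted call sites, now including alias(). It relies on inspect.stack()[1][3], the name of the function one frame up the call stack. A minimal standalone sketch of that pattern (the function names below are illustrative, not teradataml internals):

import inspect

def _guarded_init():
    # inspect.stack()[1][3] is the function name one frame above this one.
    caller = inspect.stack()[1][3]
    if caller not in ['_from_node', '__init__', 'alias']:
        raise RuntimeError("construction not allowed from: " + caller)
    return "constructed"

def alias():
    # Allowed call site: its name matches the whitelist checked above.
    return _guarded_init()

print(alias())  # constructed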
@@ -233,6 +244,7 @@ class DataFrame():
             self._iloc = _LocationIndexer(self, integer_indexing=True)
             self.__data = None
             self.__data_columns = None
+            self._alias = None
 
         except TeradataMlException:
             raise
@@ -242,6 +254,100 @@ class DataFrame():
             raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
                                       MessageCodes.TDMLDF_CREATE_FAIL) from err
 
+    @property
+    def db_object_name(self):
+        """
+        DESCRIPTION:
+            Get the underlying database object name, on which DataFrame is
+            created.
+
+        RETURNS:
+            str representing object name of DataFrame
+
+        EXAMPLES:
+            >>> load_example_data("dataframe", "sales")
+            >>> df = DataFrame('sales')
+            >>> df.db_object_name
+            '"sales"'
+        """
+        if self._table_name is not None:
+            return self._table_name
+        else:
+            msg = "Object name is available once DataFrame is materialized. " \
+                  "Use DataFrame.materialize() to materialize DataFrame."
+            print(msg)
+
+    def alias(self, alias_name):
+        """
+        DESCRIPTION:
+            Method to create an aliased teradataml DataFrame.
+            Note:
+                * This method is recommended to be used before performing
+                  self join using DataFrame's join() API.
+
+        PARAMETERS:
+            alias_name:
+                Required Argument.
+                Specifies the alias name to be assigned to a teradataml DataFrame.
+                Types: str
+
+        RETURNS:
+            teradataml DataFrame
+
+        EXAMPLES:
+            >>> load_example_data("dataframe", "admissions_train")
+            >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            13      no  4.00  Advanced      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            5       no  3.44    Novice      Novice         0
+            19     yes  1.98  Advanced    Advanced         0
+            15     yes  4.00  Advanced    Advanced         1
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            36      no  3.00  Advanced      Novice         0
+            38     yes  2.65  Advanced    Beginner         1
+
+            # Example 1: Create an alias of teradataml DataFrame.
+            >>> df2 = df.alias("adm_trn")
+
+            # Print aliased DataFrame.
+            >>> df2
+               masters   gpa     stats programming  admitted
+            id
+            13      no  4.00  Advanced      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            5       no  3.44    Novice      Novice         0
+            19     yes  1.98  Advanced    Advanced         0
+            15     yes  4.00  Advanced    Advanced         1
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            36      no  3.00  Advanced      Novice         0
+            38     yes  2.65  Advanced    Beginner         1
+        """
+        arg_info_matrix = [["alias_name", alias_name, False, (str), True]]
+        _Validators._validate_function_arguments(arg_info_matrix)
+        try:
+            alias_df = self._from_node(self._nodeid, self._metaexpr, self._index_label,
+                                       reuse_metaexpr=False)
+            # Assigning self attributes to newly created alias dataframe.
+            alias_df._table_name = self._table_name
+            alias_df._index = self._index
+            alias_df._index_label = self._index_label
+            setattr(alias_df._metaexpr.t, "table_alias", alias_name)
+            alias_df._alias = alias_name
+            return alias_df
+        except Exception as err:
+            error_code = MessageCodes.EXECUTION_FAILED
+            error_msg = Messages.get_message(
+                error_code, "create alias dataFrame", '{}'.format(str(err)))
+            raise TeradataMlException(error_msg, error_code)
+
     @classmethod
     @collect_queryband(queryband="DF_fromTable")
     def from_table(cls, table_name, index=True, index_label=None):
@@ -356,7 +462,7 @@ class DataFrame():
         return cls(index=index, index_label=index_label, query=query, materialize=materialize)
 
     @classmethod
-    def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None):
+    def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True):
         """
         Private class method for creating a DataFrame from a nodeid and parent metadata.
 
@@ -377,6 +483,12 @@ class DataFrame():
                 Optional Argument.
                 List specifying index column(s) to be retained as columns for printing.
 
+            reuse_metaexpr:
+                Optional Argument.
+                Specifies the flag to decide whether to use same _MetaExpression object or not.
+                Default Value: True
+                Types: bool
+
         EXAMPLES:
             from teradataml.dataframe.dataframe import DataFrame
             df = DataFrame._from_node(1234, metaexpr)
@@ -392,32 +504,171 @@ class DataFrame():
         df = cls()
         df._nodeid = nodeid
         df._source_type = SourceType.TABLE.value
-
+
+        if not reuse_metaexpr:
+            # Create new _MetaExpression object using reference metaExpression
+            # for newly created DataFrame.
+            df._metaexpr = UtilFuncs._get_metaexpr_using_parent_metaexpr(nodeid, metaexpr)
+            # When metaexpression is created using only column information from parent DataFrame,
+            # underlying SQLAlchemy table is created with '' string as Table name.
+            # Assign name from reference mataexpression here.
+            df._metaexpr.t.name = metaexpr.t.name
+            # Populate corresponding information into newly created DataFrame object
+            # using newly created metaExpression.
+            df._get_metadata_from_metaexpr(df._metaexpr)
+        else:
+            # Populate corresponding information into newly created DataFrame object
+            # using reference metaExpression.
+            df._get_metadata_from_metaexpr(metaexpr)
 
         if isinstance(index_label, str):
             index_label = [index_label]
 
-        if index_label is not None and all(elem in [col.name for col in […]
+        if index_label is not None and all(elem in [col.name for col in df._metaexpr.c] for elem in index_label):
             df._index_label = index_label
         elif index_label is not None and all(UtilFuncs._teradata_quote_arg(elem, "\"", False)
-                                             in [col.name for col in […]
+                                             in [col.name for col in df._metaexpr.c] for elem in index_label):
             df._index_label = index_label
 
         # Set the flag suggesting that the _index_label is set,
-        # and that a database lookup […]
+        # and that a database lookup won't be required even when it is None.
         df._index_query_required = False
 
         if isinstance(undropped_index, str):
             undropped_index = [undropped_index]
 
-        if undropped_index is not None and all(elem in [col.name for col in […]
+        if undropped_index is not None and all(elem in [col.name for col in df._metaexpr.c] for elem in undropped_index):
             df._undropped_index = undropped_index
         elif undropped_index is not None and all(UtilFuncs._teradata_quote_arg(elem, "\"", False)
-                                                 in [col.name for col in […]
+                                                 in [col.name for col in df._metaexpr.c] for elem in undropped_index):
             df._undropped_index = undropped_index
 
         return df
 
+    def create_temp_view(self, name):
+        """
+        DESCRIPTION:
+            Creates a temporary view for session on the DataFrame.
+
+        PARAMETERS:
+            name:
+                Required Argument.
+                Specifies the name of the temporary view.
+                Type: str
+
+        RETURNS:
+            None
+
+        RAISES:
+            OperationalError (When view already exists).
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("dataframe", "admissions_train")
+            >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            38     yes  2.65  Advanced    Beginner         1
+            7      yes  2.33    Novice      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            17      no  3.83  Advanced    Advanced         1
+            34     yes  3.85  Advanced    Beginner         0
+            13      no  4.00  Advanced      Novice         1
+            32     yes  3.46  Advanced    Beginner         0
+            11      no  3.13  Advanced    Advanced         1
+            15     yes  4.00  Advanced    Advanced         1
+            36      no  3.00  Advanced      Novice         0
+
+            # Example 1: Create view 'new_admissions'.
+            >>> df.create_temp_view("new_admissions")
+            >>> new_df = DataFrame("new_admissions")
+            >>> new_df
+               masters   gpa     stats programming  admitted
+            id
+            38     yes  2.65  Advanced    Beginner         1
+            7      yes  2.33    Novice      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            17      no  3.83  Advanced    Advanced         1
+            34     yes  3.85  Advanced    Beginner         0
+            13      no  4.00  Advanced      Novice         1
+            32     yes  3.46  Advanced    Beginner         0
+            11      no  3.13  Advanced    Advanced         1
+            15     yes  4.00  Advanced    Advanced         1
+            36      no  3.00  Advanced      Novice         0
+        """
+        # Validating Arguments
+        arg_type_matrix = []
+        arg_type_matrix.append(["name", name, False, (str), True])
+        _Validators._validate_function_arguments(arg_type_matrix)
+
+        GarbageCollector._add_to_garbagecollector(name, TeradataConstants.TERADATA_VIEW)
+        UtilFuncs._create_view(name, self.show_query())
+
+    def materialize(self):
+        """
+        DESCRIPTION:
+            Method to materialize teradataml DataFrame into a database object.
+            Notes:
+                * DataFrames are materialized in either view/table/volatile table,
+                  which is decided and taken care by teradataml.
+                * If user wants to materialize object into specific database object
+                  such as table/volatile table, use 'to_sql()' or 'copy_to_sql()' or
+                  'fastload()' functions.
+                * Materialized object is garbage collected at the end of the session.
+
+        PARAMETERS:
+            None
+
+        RETURNS:
+            DataFrame
+
+        EXAMPLES:
+            >>> load_example_data("dataframe", "admissions_train")
+            >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            13      no  4.00  Advanced      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            5       no  3.44    Novice      Novice         0
+            19     yes  1.98  Advanced    Advanced         0
+            15     yes  4.00  Advanced    Advanced         1
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            36      no  3.00  Advanced      Novice         0
+            38     yes  2.65  Advanced    Beginner         1
+
+            # Example 1: Perform operations on teradataml DataFrame
+            # and materialize it in a database object.
+            >>> df2 = df.get([["id", "masters", "gpa"]])
+
+            # Initially table_name will be None.
+            >>> df2._table_name
+
+            >>> df2.materialize()
+               masters   gpa
+            id
+            15     yes  4.00
+            7      yes  2.33
+            22     yes  3.46
+            17      no  3.83
+            13      no  4.00
+            38     yes  2.65
+            26     yes  3.57
+            5       no  3.44
+            34     yes  3.85
+            40     yes  3.95
+
+            # After materialize(), view name will be assigned.
+            >>> df2._table_name
+            '"ALICE"."ml__select__172077355985236"'
+            >>>
+        """
+        self.__execute_node_and_set_table_name(self._nodeid, self._metaexpr)
+        return self
+
     @collect_queryband(queryband="DF_fillna")
     def fillna(self, value=None, columns=None, literal_value=False):
         """
@@ -657,7 +908,10 @@ class DataFrame():
         Private method for setting _metaexpr and retrieving column names and types.
 
         PARAMETERS:
-            metaexpr
+            metaexpr:
+                Required Argument.
+                Specifies parent meta data (_MetaExpression object).
+                Types: _MetaExpression
 
         RETURNS:
             None
@@ -670,7 +924,8 @@ class DataFrame():
         self._column_names_and_types = []
         self._td_column_names_and_types = []
         self._td_column_names_and_sqlalchemy_types = {}
-        […]
+
+        for col in self._metaexpr.c:
             if isinstance(col.type, sqlalchemy.sql.sqltypes.NullType):
                 tdtype = TeradataTypes.TD_NULL_TYPE.value
             else:
@@ -1934,7 +2189,7 @@ class DataFrame():
         else:
             col_filters = col_names
 
-        col_filters_decode = ["[…]
+        col_filters_decode = ["CASE WHEN \"{}\" IS NULL THEN 0 ELSE 1 END".format(col_name) for col_name in col_filters]
         fmt_filter = " + ".join(col_filters_decode)
 
         if thresh is not None:
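The rewritten comprehension above emits one CASE expression per column; summing them gives a per-row count of non-null values that a dropna-style thresh can filter on. A self-contained sketch of the SQL fragment it builds (column names are illustrative):

# Build the non-null-count expression the comprehension produces.
col_filters = ["gpa", "stats", "programming"]
col_filters_decode = ['CASE WHEN "{}" IS NULL THEN 0 ELSE 1 END'.format(col_name)
                      for col_name in col_filters]
fmt_filter = " + ".join(col_filters_decode)
# A thresh of 2 keeps rows with at least 2 non-null values among these columns.
print("{} >= 2".format(fmt_filter))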
@@ -5421,6 +5676,8 @@ class DataFrame():
             result = self._check_numeric_overflow(agg_df)
         """
         try:
+            # Printing the DF will actually run underlying select query and
+            # will brought up numeric overflow if any. Only materializing won't work.
             repr(result_df)
             return False
         except TeradataMlException as tme:
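The new comment captures a subtle point: the result object is lazy, and only rendering it executes the underlying SELECT, which is what surfaces a numeric overflow. A loose standalone illustration of that deferred-execution idea (LazyResult is a stand-in, not a teradataml class):

class LazyResult:
    def __init__(self, compute):
        self._compute = compute  # deferred work, e.g. running a SELECT

    def __repr__(self):
        return str(self._compute())  # executes only when rendered

overflowing = LazyResult(lambda: float(10 ** 400))  # stands in for an overflowing aggregate
try:
    repr(overflowing)
    print("no overflow")
except OverflowError:
    print("overflow surfaced only on rendering")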
@@ -5557,18 +5814,73 @@ class DataFrame():
         EXAMPLES:
             self.__get_data_columns()
         """
-        […]
-        […]
-        […]
+        if not self._table_name:
+            if not self._aed_query:
+                self.__generate_aed_query()
+            # TODO: Check the length of query and if it fails, create a view in catch block.
+            # Address in this JIRA: https://teradata-pe.atlassian.net/browse/ELE-6922
+            query = repr(self._metaexpr) + ' FROM ( ' + self._aed_query + ' ) as temp_table'
+        else:
+            query = repr(self._metaexpr) + ' FROM ' + self._table_name
 
         if self._orderby is not None:
             query += ' ORDER BY ' + self._orderby
 
+        query += ';'
         # Execute the query and get the results in a list.
         self.__data, self.__data_columns = UtilFuncs._execute_query(query=query, fetchWarnings=True)
 
         return self.__data, self.__data_columns
 
+    def __generate_aed_query(self, full_query=False):
+        """
+        DESCRIPTION:
+            Internal function to return underlying SQL for the teradataml
+            DataFrame. It is the same SQL that is used to view the data for
+            a teradataml DataFrame.
+
+        PARAMETERS:
+            full_query:
+                Optional Argument.
+                Specifies if the complete query for the dataframe should be returned.
+                When this parameter is set to True, query for the dataframe is returned
+                with respect to the base dataframe's table (from_table() or from_query())
+                or from the output tables of analytical functions (if there are any in the
+                workflow). This query may or may not be directly used to retrieve data
+                for the dataframe upon which the function is called.
+                When this parameter is not used, string returned is the query already used
+                or will be used to retrieve data for the teradataml DataFrame.
+                Default Value: False
+                Types: bool
+
+        RETURNS:
+            String representing the underlying SQL query for the teradataml DataFrame.
+
+        RAISES:
+            None.
+
+        EXAMPLES:
+            self.__generate_aed_query()
+        """
+        # Run aed call only when _aed_query is None or
+        # the type of current stored query (full/short) is not matching
+        # with asked query type.
+        if (not self._aed_query) or (not self._is_full_query == full_query):
+            node_id = self._nodeid
+
+            if isinstance(self, (DataFrameGroupBy, DataFrameGroupByTime)):
+                # If dataframe is either of type groupby or groupbytime
+                # then get its parent dataframe nodeid and return queries
+                # for the same
+                node_id = self._aed_utils._aed_get_parent_nodeids(self._nodeid)[0]
+
+            queries = self._aed_utils._aed_show_query(node_id, query_with_reference_to_top=full_query)
+            # Store query and type of query in class attributes to avoid future runs.
+            self._aed_query = queries[0][0]
+            self._is_full_query = full_query
+
+        return self._aed_query
+
     @collect_queryband(queryband="DF_select")
     def select(self, select_expression):
         """
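__generate_aed_query above regenerates the SQL only when nothing is cached or when the cached variant (full vs. short) differs from the one requested, then stores both the string and its kind. A minimal sketch of that two-slot memoization, with QueryCache and _build standing in for the AED utilities:

class QueryCache:
    def __init__(self):
        self._query = None      # cached SQL text
        self._is_full = None    # which variant the cache holds

    def get(self, full_query=False):
        # Rebuild only on a cache miss or a variant mismatch.
        if self._query is None or self._is_full != full_query:
            self._query = self._build(full_query)
            self._is_full = full_query
        return self._query

    def _build(self, full_query):
        # Placeholder for the expensive aed_show_query() call.
        return "SELECT ... /* full */" if full_query else "SELECT ..."

cache = QueryCache()
cache.get()                  # builds the short query
cache.get()                  # served from cache
cache.get(full_query=True)   # variant mismatch, rebuilds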
@@ -5830,6 +6142,8 @@ class DataFrame():
                 * "open_sessions" specifies the number of Teradata data transfer
                   sessions to be opened for fastexport. This argument is only applicable
                   in fastexport mode.
+                * Function returns the pandas dataframe with Decimal columns types as float instead of object.
+                  If user want datatype to be object, set argument "coerce_float" to False.
 
             Notes:
                 1. For additional information about "coerce_float" and
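The new bullet documents pandas' coerce_float behavior for Decimal columns. The same effect can be reproduced with plain pandas, as in this sketch (the data is made up):

import decimal
import pandas as pd

rows = [(1, decimal.Decimal("3.95")), (2, decimal.Decimal("3.76"))]
# coerce_float=True converts Decimal values to float64 ...
df_float = pd.DataFrame.from_records(rows, columns=["id", "gpa"], coerce_float=True)
print(df_float.dtypes["gpa"])  # float64
# ... while coerce_float=False leaves them as Python objects.
df_obj = pd.DataFrame.from_records(rows, columns=["id", "gpa"], coerce_float=False)
print(df_obj.dtypes["gpa"])    # object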
@@ -6145,15 +6459,22 @@ class DataFrame():
             Supported join operators are =, ==, <, <=, >, >=, <> and != (= and <> operators are
             not supported when using DataFrame columns as operands).
 
-            […]
-            1. When multiple join conditions are given […]
-
-            2. […]
-
-            […]
+            Notes:
+                1. When multiple join conditions are given as a list string/ColumnExpression,
+                   they are joined using AND operator.
+                2. Two or more on conditions can be combined using & and | operators
+                   and can be passed as single ColumnExpression.
+                   You can use (df1.a == df1.b) & (df1.c == df1.d) in place of
+                   [df1.a == df1.b, df1.c == df1.d].
+                3. Two or more on conditions can not be combined using pythonic 'and'
+                   and 'or'.
+                   You can use (df1.a == df1.b) & (df1.c == df1.d) in place of
+                   [df1.a == df1.b and df1.c == df1.d].
+                4. Performing self join using same DataFrame object in 'other'
+                   argument is not supported. In order to perform self join,
+                   first create aliased DataFrame using alias() API and pass it
+                   for 'other' argument. Refer to Example 10 in EXAMPLES section.
 
-            You can use [df1.a == df1.b, df1.c == df1.d] in place of
-            [(df1.a == df1.b) & (df1.c == df1.d)].
 
         PARAMETERS:
 
@@ -6181,15 +6502,20 @@ class DataFrame():
                   is the column of left dataframe df1 and col2 is the column of right
                   dataframe df2.
                   Examples:
-                    1. [df1.a == df2.a, df1.b == df2.b] indicates df1.a = df2.a […]
-                    2. [df1.a == df2.b, df1.c == df2.d] indicates df1.a = df2.b […]
-                    3. [df1.a <= df2.b […]
-                    4. [df1.a < df2.b […]
+                    1. [df1.a == df2.a, df1.b == df2.b] indicates df1.a = df2.a AND df1.b = df2.b.
+                    2. [df1.a == df2.b, df1.c == df2.d] indicates df1.a = df2.b AND df1.c = df2.d.
+                    3. [df1.a <= df2.b & df1.c > df2.d] indicates df1.a <= df2.b AND df1.c > df2.d.
+                    4. [df1.a < df2.b | df1.c >= df2.d] indicates df1.a < df2.b OR df1.c >= df2.d.
                     5. df1.a != df2.b indicates df1.a != df2.b.
                 • The combination of both string comparisons and comparisons as column expressions.
                   Examples:
-                    1. ["a", df1.b == df2.b] indicates df1.a = df2.a […]
-                    2. [df1.a <= df2.b, "c > d"] indicates df1.a <= df2.b […]
+                    1. ["a", df1.b == df2.b] indicates df1.a = df2.a AND df1.b = df2.b.
+                    2. [df1.a <= df2.b, "c > d"] indicates df1.a <= df2.b AND df1.c > df2.d.
+                • ColumnExpressions containing FunctionExpressions which represent SQL functions
+                  invoked on DataFrame Columns.
+                  Examples:
+                    1. (df1.a.round(1) - df2.a.round(1)).mod(2.5) > 2
+                    2. df1.a.floor() - df2.b.floor() > 2
 
                 Types: str (or) ColumnExpression (or) List of strings(str) or ColumnExpressions
 
@@ -6211,7 +6537,7 @@ class DataFrame():
                 Specifies the suffix to be added to the right table columns.
                 Default Value: None.
                 Types: str
-
+
             lprefix:
                 Optional Argument.
                 Specifies the prefix to be added to the left table columns.
@@ -6261,7 +6587,7 @@ class DataFrame():
             0        2        2  analytics      2.3      2.3    b  analytics    b
             1        1        1   teradata      1.3      1.3    a   teradata    a
 
-            # Example 2: One "on" argument condition is ColumnExpression and other is string having two
+            # Example 2: One "on" argument condition is ColumnExpression and other is string having two
             # columns with left outer join.
             >>> df1.join(df2, on = [df1.col2 == df2.col4,"col5 = col7"], how = "left", lprefix = "t1", rprefix = "t2")
               t1_col1  t2_col1       col2  t1_col3  t2_col3 col5       col4 col7
@@ -6275,7 +6601,7 @@ class DataFrame():
             0        2        2  analytics      2.3      2.3    b  analytics    b
             1        1        1   teradata      1.3      1.3    a   teradata    a
 
-            # Example 4: One "on" argument condition is ColumnExpression and other is string having two
+            # Example 4: One "on" argument condition is ColumnExpression and other is string having two
             # columns with full join.
             >>> df1.join(other = df2, on = ["col2=col4",df1.col5 == df2.col7], how = "full", lprefix = "t1", rprefix = "t2")
               t1_col1  t2_col1       col2  t1_col3  t2_col3 col5       col4 col7
@@ -6353,7 +6679,53 @@ class DataFrame():
             3  Beginner  Beginner  1  3.95  Beginner  3.70  Novice  0  1  no  yes
             3  Beginner  Beginner  2  3.76  Beginner  3.70  Novice  0  1  no  yes
             3  Beginner    Novice  3  3.70  Beginner  3.70  Novice  1  1  no   no
+
+            # Example 10: Perform self join using aliased DataFrame.
+            # Create an aliased DataFrame.
+            >>> lhs = DataFrame("admissions_train").head(3).sort("id")
+            >>> rhs = lhs.alias("rhs")
+            # Use aliased DataFrame for self join.
+            >>> joined_df = lhs.join(other=rhs, how="cross", lprefix="l", rprefix="r")
+            >>> joined_df
+               l_id  r_id l_masters r_masters  l_gpa  r_gpa   l_stats   r_stats l_programming r_programming  l_admitted  r_admitted
+            0     1     3       yes        no   3.95   3.70  Beginner    Novice      Beginner      Beginner           0           1
+            1     2     2       yes       yes   3.76   3.76  Beginner  Beginner      Beginner      Beginner           0           0
+            2     2     3       yes        no   3.76   3.70  Beginner    Novice      Beginner      Beginner           0           1
+            3     3     1        no       yes   3.70   3.95    Novice  Beginner      Beginner      Beginner           1           0
+            4     3     3        no        no   3.70   3.70    Novice    Novice      Beginner      Beginner           1           1
+            5     3     2        no       yes   3.70   3.76    Novice  Beginner      Beginner      Beginner           1           0
+            6     2     1       yes       yes   3.76   3.95  Beginner  Beginner      Beginner      Beginner           0           0
+            7     1     2       yes       yes   3.95   3.76  Beginner  Beginner      Beginner      Beginner           0           0
+            8     1     1       yes       yes   3.95   3.95  Beginner  Beginner      Beginner      Beginner           0           0
+
+            # Example 11: Perform join with compound 'on' condition having
+            # more than one binary operator.
+            >>> rhs_2 = lhs.assign(double_gpa=lhs.gpa * 2)
+            >>> joined_df_2 = lhs.join(rhs_2, on=rhs_2.double_gpa == lhs.gpa * 2, how="left", lprefix="l", rprefix="r")
+            >>> joined_df_2
+               l_id  r_id l_masters r_masters  l_gpa  r_gpa   l_stats   r_stats l_programming r_programming  l_admitted  r_admitted  double_gpa
+            0     3     3        no        no   3.70   3.70    Novice    Novice      Beginner      Beginner           1           1        7.40
+            1     2     2       yes       yes   3.76   3.76  Beginner  Beginner      Beginner      Beginner           0           0        7.52
+            2     1     1       yes       yes   3.95   3.95  Beginner  Beginner      Beginner      Beginner           0           0        7.90
+
+            # Example 12: Perform join on DataFrames with 'on' condition
+            # having FunctionExpression.
+            >>> df = DataFrame("admissions_train")
+            >>> df2 = df.alias("rhs_df")
+            >>> joined_df_3 = df.join(df2, on=(df.gpa.round(1) - df2.gpa.round(1)).mod(2.5) > 2,
+            ...                       how="inner", lprefix="l")
+            >>> joined_df_3.sort(["id", "l_id"])
+               l_id  id l_masters masters  l_gpa   gpa   l_stats     stats l_programming programming  l_admitted  admitted
+            0     1  24       yes      no   3.95  1.87  Beginner  Advanced      Beginner      Novice           0         1
+            1    13  24        no      no   4.0   1.87  Advanced  Advanced        Novice      Novice           1         1
+            2    15  24       yes      no   4.0   1.87  Advanced  Advanced      Advanced      Novice           1         1
+            3    25  24        no      no   3.96  1.87  Advanced  Advanced      Advanced      Novice           1         1
+            4    27  24       yes      no   3.96  1.87  Advanced  Advanced      Advanced      Novice           0         1
+            5    29  24       yes      no   4.0   1.87    Novice  Advanced      Beginner      Novice           0         1
+            6    40  24       yes      no   3.95  1.87    Novice  Advanced      Beginner      Novice           0         1
         """
+
         # Argument validations
         awu_matrix = []
         awu_matrix.append(["other", other, False, (DataFrame)])
@@ -6367,17 +6739,11 @@ class DataFrame():
         # Validate argument types
         _Validators._validate_function_arguments(awu_matrix)
 
-        # If […]
-        # […]
-        […]
-        […]
-        […]
-            raffix = rsuffix
-            affix_type = "suffix"
-        else:
-            laffix = lprefix
-            raffix = rprefix
-            affix_type = "prefix"
+        # If self and other DataFrames are pointing to same Table object,
+        # raise error.
+        if self._metaexpr.t is other._metaexpr.t:
+            raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_ALIAS_REQUIRED, "join"),
+                                      MessageCodes.TDMLDF_ALIAS_REQUIRED)
 
         how_lc = how.lower()
 
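The new guard detects an un-aliased self join by object identity: a DataFrame joined with itself shares one underlying SQLAlchemy Table object, which `is` catches, while alias() produces a fresh object. A toy illustration of the identity test (Tbl is a stand-in class):

class Tbl:
    pass

t = Tbl()
lhs_table, rhs_table = t, t       # self join without alias()
print(lhs_table is rhs_table)     # True  -> alias required, error raised
rhs_table = Tbl()                 # alias() yields a distinct table object
print(lhs_table is rhs_table)     # False -> join may proceed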
@@ -6395,12 +6761,33 @@ class DataFrame():
         for col in other.columns:
             other_columns_lower_actual_map[col.lower()] = col
 
-        […]
-        […]
-        […]
-        […]
-        […]
-        […]
+        # Set the affix variables (laffix and raffix) with provided value(s)
+        # of lsuffix, rsuffix, lprefix and rprefix.
+        # Also set affix_type appropriately.
+        laffix = None
+        raffix = None
+        affix_type = None
+        if lsuffix is not None or rsuffix is not None:
+            laffix = lsuffix
+            raffix = rsuffix
+            affix_type = "suffix"
+        elif lprefix is not None or rprefix is not None:
+            laffix = lprefix
+            raffix = rprefix
+            affix_type = "prefix"
+
+        # Same column names can be present in two dataframes involved
+        # in join operation in below two cases:
+        # Case 1: Self join.
+        # Case 2: Two tables having common column names.
+        # In any case, at least one kind of affix is required to generate
+        # distinct column names in resultant table. Throw error if no affix
+        # is available.
+        if not set(self_columns_lower_actual_map.keys()).isdisjoint(other_columns_lower_actual_map.keys()):
+            if affix_type is None:
+                raise TeradataMlException(
+                    Messages.get_message(MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS),
+                    MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS)
 
         # Both affixes should not be equal to perform join.
         if laffix == raffix and laffix is not None:
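The overlap check above only demands an affix when the two column sets intersect case-insensitively. A standalone sketch of that test (column names are invented):

left_cols = {"id", "gpa", "Masters"}
right_cols = {"ID", "dept"}
left_lower = {c.lower() for c in left_cols}
right_lower = {c.lower() for c in right_cols}
if not left_lower.isdisjoint(right_lower):
    # Overlapping names need a prefix/suffix to stay distinct in the result.
    print("overlap:", left_lower & right_lower)  # {'id'}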
@@ -6409,115 +6796,158 @@ class DataFrame():
                     "'l{affix_type}' and 'r{affix_type}'".format(affix_type=affix_type)),
                 MessageCodes.TDMLDF_INVALID_TABLE_ALIAS)
 
-        […]
-        # […]
-        […]
-            df2_column_with_affix = self.__check_and_return_new_column_name(raffix, column,
-                                                                            self_columns_lower_actual_map.keys(),
-                                                                            "left", affix_type)
-            select_columns.append("{0} as {1}".format(
-                self.__get_fully_qualified_col_name(column, "df2" if raffix is None else raffix),
-                df2_column_with_affix))
-
-            # As we are creating new column name, adding it to new metadata dict for new dataframe from join.
-            self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
-                                                UtilFuncs._teradata_unquote_arg(df1_column_with_affix, "\""),
-                                                column, df1_columns_types)
-
-            self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
-                                                UtilFuncs._teradata_unquote_arg(df2_column_with_affix, "\""),
-                                                other_column, df2_columns_types)
+        try:
+            # Set an attribute named '_join_alias' to underlying SQLAlchemy table objects
+            # and use it as default alias for compiling.
+            setattr(self._metaexpr.t, "_join_alias", "lhs")
+            setattr(other._metaexpr.t, "_join_alias", "rhs")
+            lhs_alias = "lhs"
+            rhs_alias = "rhs"
+
+            # Step 1: Generate the on clause string.
+            if how_lc != "cross":
+                on = UtilFuncs._as_list(on)
+
+                all_join_conditions = []
+                invalid_join_conditions = []
+                # Forming join condition
+                for condition in on:
+                    # Process only when the on condition is either a string or a ColumnExpression.
+                    if not isinstance(condition, (ColumnExpression, str)):
+                        invalid_join_conditions.append(condition)
+                        continue
+
+                    # Generate final on clause string from string representation of condition.
+                    if isinstance(condition, str):
+                        # Process the string manually.
+                        # 1. Parse the string to get operator.
+                        for op in TeradataConstants.TERADATA_JOIN_OPERATORS.value:
+                            if op in condition:
+                                conditional_separator = op
+                                break
+                        else:
+                            # If no join condition is mentioned, then string represents the column.
+                            # In this case, default operator is taken as equal.
+                            # If on is ['a'], then it is equal to 'lhs.a = rhs.a'
+                            columns = [condition, condition]
+                            condition = "{0} = {0}".format(condition)
+                            conditional_separator = "="
+                        # 2. Split the string using operator and extract LHS and RHS
+                        #    columns from a binary expression.
+                        columns = [column.strip() for column in condition.split(sep=conditional_separator)
+                                   if len(column) > 0]
+
+                        if len(columns) != 2:
+                            invalid_join_conditions.append(condition)
+                            # TODO: Raise exception here only.
+                        else:
+                            # 3. Generate fully qualified names using affix and table alias
+                            #    and create final on clause condition string.
+                            left_col = self.__add_alias_to_column(columns[0], self, lhs_alias)
+                            right_col = self.__add_alias_to_column(columns[1], other, rhs_alias)
+                            if conditional_separator == "!=":
+                                # "!=" is python way of expressing 'not equal to'. "<>" is Teradata way of
+                                # expressing 'not equal to'. Adding support for "!=".
+                                conditional_separator = "<>"
+                            all_join_conditions.append(
+                                '{0} {1} {2}'.format(left_col, conditional_separator, right_col))
+
+                    # Generate on clause string from column expression.
+                    if isinstance(condition, ColumnExpression):
+                        compiled_condition = condition.compile(compile_kwargs={'include_table': True,
+                                                                               'literal_binds': True,
+                                                                               'table_name_kind': '_join_alias',
+                                                                               'compile_with_caller_table': True})
+
+                        all_join_conditions.append(compiled_condition)
+
+                # Raise error if invalid on conditions are passed.
+                if len(invalid_join_conditions) > 0:
+                    raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_INVALID_JOIN_CONDITION,
+                                                                   ", ".join(invalid_join_conditions)),
+                                              MessageCodes.TDMLDF_INVALID_JOIN_CONDITION)
+
+                # Generate final on condition.
+                join_condition = " and ".join(all_join_conditions)
             else:
-            # […]
-            […]
-            select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+                # In case of cross join no need of condition.
+                join_condition = ""
 
-        […]
-        […]
-            join_condition, "df1" if laffix is None else laffix,
-            "df2" if raffix is None else raffix)
+            # Step 2: Generate the select clause string.
+            # Generate new column names for overlapping column names using lsuffix, rsuffix, lprefix, rprefix.
+            # Also, use table alias while addressing overlapping column names.
+            lhs_columns_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr)
+            rhs_columns_types = df_utils._get_required_columns_types_from_metaexpr(other._metaexpr)
 
-        […]
-        […]
+            select_columns = []
+            new_metaexpr_columns_types = OrderedDict()
 
+            # Processing columns in LHS DF/ self DF.
+            for column in self.columns:
+                if df_utils._check_column_exists(column.lower(), other_columns_lower_actual_map.keys()):
+                    # Check if column found in other DataFrame has same case or different.
+                    # Return the column name from the other DataFrame.
+                    other_column = other_columns_lower_actual_map[column.lower()]
+
+                    # Check if column name in LHS dataframe is same as that of in RHS dataframe.
+                    # If so, generate new name for LHS DF column using provided affix.
+                    df1_column_with_affix = self.__check_and_return_new_column_name(laffix, other_column,
+                                                                                    other_columns_lower_actual_map.keys(),
+                                                                                    "right", affix_type)
+
+                    # Generate select clause string for current column and append to list.
+                    select_columns.append("{0} as {1}".format(
+                        self.__get_fully_qualified_col_name(other_column, lhs_alias),
+                        df1_column_with_affix))
+
+                    # Check if column name in RHS dataframe is same as that of in LHS dataframe.
+                    # If so, generate new name for RHS DF column using provided affix.
+                    df2_column_with_affix = self.__check_and_return_new_column_name(raffix, column,
+                                                                                    self_columns_lower_actual_map.keys(),
+                                                                                    "left", affix_type)
+                    # Generate select clause string for current column and append to list.
+                    select_columns.append("{0} as {1}".format(
+                        self.__get_fully_qualified_col_name(column, rhs_alias),
+                        df2_column_with_affix))
+
+                    # As we are creating new column name, adding it to new metadata dict for new dataframe from join.
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
+                                                        UtilFuncs._teradata_unquote_arg(df1_column_with_affix, "\""),
+                                                        column, lhs_columns_types)
+
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
+                                                        UtilFuncs._teradata_unquote_arg(df2_column_with_affix, "\""),
+                                                        other_column, rhs_columns_types)
+
+                else:
+                    # As column with same name is not present in RHS DataFrame now,
+                    # directly adding column to new metadata dict.
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, lhs_columns_types)
+                    select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+
+            # Processing columns in RHS DF/ other DF.
+            # Here we will only be processing columns which are not overlapping.
+            for column in other.columns:
+                if not df_utils._check_column_exists(column.lower(), self_columns_lower_actual_map.keys()):
+                    # As column not present in left DataFrame, directly adding column to new metadata dict.
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, rhs_columns_types)
+                    select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+
+            # Step 3: Create a node in AED using _aed_join using appropriate alias for involved tables.
+            join_node_id = self._aed_utils._aed_join(self._nodeid, other._nodeid, ", ".join(select_columns),
+                                                     how_lc, join_condition, lhs_alias, rhs_alias)
+
+            # Step 4: Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid
+            # and underlying table name.
+            new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items())
+
+            # Return a new joined dataframe.
+            return self._create_dataframe_from_node(join_node_id, new_metaexpr, self._index_label)
+        finally:
+            # Delete the '_join_alias' attribute attached to underlying
+            # SQLALchemy table objects.
+            delattr(self._metaexpr.t, "_join_alias")
+            delattr(other._metaexpr.t, "_join_alias")
 
     def __add_alias_to_column(self, column, df, alias):
         """
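Inside the rewritten body, a string 'on' condition is parsed by scanning for the first matching operator, splitting the expression into its two column names, and translating Python's '!=' into Teradata's '<>'; a bare column name becomes an equi-join. A simplified sketch of that step; the operator list and its ordering here are illustrative, not the actual TeradataConstants value:

JOIN_OPERATORS = ["<=", ">=", "<>", "!=", "==", "=", "<", ">"]  # two-char operators first

def parse_condition(condition):
    for op in JOIN_OPERATORS:
        if op in condition:
            lhs, rhs = [c.strip() for c in condition.split(op) if c.strip()]
            return lhs, "<>" if op == "!=" else op, rhs
    # No operator: the string is a column name joined on equality in both tables.
    return condition, "=", condition

print(parse_condition("col5 = col7"))  # ('col5', '=', 'col7')
print(parse_condition("a != b"))       # ('a', '<>', 'b')
print(parse_condition("dept"))         # ('dept', '=', 'dept')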
@@ -6577,7 +7007,7 @@ class DataFrame():
         return "{0}.{1}".format(UtilFuncs._teradata_quote_arg(alias, "\"", False),
                                 UtilFuncs._teradata_quote_arg(column, "\"", False))
 
-    def __check_and_return_new_column_name(self, affix, column, col_list, […]
+    def __check_and_return_new_column_name(self, affix, column, col_list, other_df_side, affix_type):
         """
         Check new column name alias with column exists in col_list or not, if exists throws exception else
         returns new column name.
@@ -6586,7 +7016,7 @@ class DataFrame():
|
|
|
6586
7016
|
affix - affix to be added to column.
|
|
6587
7017
|
column - column name.
|
|
6588
7018
|
col_list - list of columns to check in which new column is exists or not.
|
|
6589
|
-
|
|
7019
|
+
other_df_side - Side on which the other dataframe in current join operation resides.
|
|
6590
7020
|
affix_type - Type of affix. Either "prefix" or "suffix".
|
|
6591
7021
|
|
|
6592
7022
|
EXAMPLES:
|
|
@@ -6600,19 +7030,19 @@ class DataFrame():
|
|
|
6600
7030
|
return UtilFuncs._teradata_quote_arg(column, "\"", False)
|
|
6601
7031
|
|
|
6602
7032
|
# If Prefix, affix is added before column name else it is appended.
|
|
6603
|
-
|
|
6604
|
-
|
|
6605
|
-
|
|
6606
|
-
if df_utils._check_column_exists(
|
|
6607
|
-
if
|
|
6608
|
-
|
|
7033
|
+
column_with_affix = "{0}_{1}" if affix_type == "prefix" else "{1}_{0}"
|
|
7034
|
+
column_with_affix = column_with_affix.format(affix,
|
|
7035
|
+
UtilFuncs._teradata_unquote_arg(column, "\""))
|
|
7036
|
+
if df_utils._check_column_exists(column_with_affix.lower(), col_list):
|
|
7037
|
+
if other_df_side == "right":
|
|
7038
|
+
affix_type = "l{}".format(affix_type)
|
|
6609
7039
|
else:
|
|
6610
|
-
|
|
7040
|
+
affix_type = "r{}".format(affix_type)
|
|
6611
7041
|
raise TeradataMlException(
|
|
6612
|
-
Messages.get_message(MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS,
|
|
6613
|
-
|
|
7042
|
+
Messages.get_message(MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS, column_with_affix, other_df_side,
|
|
7043
|
+
affix_type),
|
|
6614
7044
|
MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS)
|
|
6615
|
-
return UtilFuncs._teradata_quote_arg(
|
|
7045
|
+
return UtilFuncs._teradata_quote_arg(column_with_affix, "\"", False)
|
|
6616
7046
|
|
|
6617
7047
|
def __add_column_type_item_to_dict(self, new_metadata_dict, new_column, column, column_types):
|
|
6618
7048
|
"""
|
|
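The renaming rule introduced above is a one-line template choice: a prefix goes before the column name and a suffix after it, and the candidate is then checked for collisions before being quoted. A small runnable sketch of just that logic, under the same "prefix"/"suffix" convention:

    def column_with_affix(affix, column, affix_type):
        # "prefix" -> "<affix>_<column>", anything else -> "<column>_<affix>".
        template = "{0}_{1}" if affix_type == "prefix" else "{1}_{0}"
        return template.format(affix, column)

    print(column_with_affix("r", "id", "prefix"))  # r_id
    print(column_with_affix("r", "id", "suffix"))  # id_r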
@@ -7108,6 +7538,184 @@ class DataFrame():
         if function_name is None or function_name in VANTAGE_FUNCTION_ARGTYPE_DEPENDENT_MAPPER:
             self.__execute_node_and_set_table_name(self._nodeid)
         return True
+
+    def _assign_udf(self, udf_expr):
+        """
+        DESCRIPTION:
+            Internal function for DataFrame.assign() to execute the udf using
+            Script Table Operator and create new column for teradataml DataFrame.
+
+        PARAMETER:
+            udf_expr:
+                Required Argument.
+                Specifies a dictionary of column name to UDF expressions.
+                Types: dict
+
+        RETURNS:
+            teradataml DataFrame
+
+        RAISES:
+            None.
+
+        EXAMPLES:
+            self._assign_udf(udf_expr)
+        """
+
+        df = self
+        env_name = None
+        # Create a dictionary of env_name to list of output columns to be run on that env.
+        env_mapper = OrderedDict()
+
+        exec_mode = 'REMOTE' if UtilFuncs._is_lake() else 'IN-DB'
+        if exec_mode == 'REMOTE':
+            _Validators._check_auth_token("udf")
+            for colname, col in udf_expr.items():
+                env_name = UtilFuncs._get_env_name(col)
+                # Store the env_name and its corresponding output column.
+                if env_name in env_mapper:
+                    env_mapper[env_name].append(colname)
+                else:
+                    env_mapper[env_name] = [colname]
+        else:
+            env_mapper[env_name] = udf_expr.keys()
+
+        for env_name, cols in env_mapper.items():
+            # Create a dictionary of output columns to column type.
+            returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
+            # Store the udf functions.
+            user_function = []
+            # Create a dictionary of output column name to udf name.
+            columns_definitions = {}
+            # Create a dictionary of output column name to udf arguments.
+            function_args = {}
+            for colname, col in udf_expr.items():
+                delimiter = col._delimiter
+                quotechar = col._quotechar
+                if colname in cols:
+                    user_function.append(col._udf)
+                    function_args[colname] = col._udf_args if col._udf_args else ()
+                    returns[colname] = col.type
+                    columns_definitions[colname] = col._udf.__name__
+
+            tbl_operators = _TableOperatorUtils([],
+                                                df,
+                                                "udf",
+                                                user_function,
+                                                exec_mode,
+                                                chunk_size=None,
+                                                returns=returns,
+                                                delimiter=delimiter,
+                                                quotechar=quotechar,
+                                                num_rows=1,
+                                                auth=None,
+                                                data_partition_column=None,
+                                                data_hash_column=None,
+                                                data_order_column=None,
+                                                is_local_order=None,
+                                                nulls_first=None,
+                                                sort_ascending=None,
+                                                charset=None,
+                                                env_name=env_name,
+                                                style="csv",
+                                                function_args=function_args,
+                                                columns_definitions=columns_definitions,
+                                                output_type_converters={
+                                                    col_name: _Dtypes._teradata_type_to_python_type(col_type)
+                                                    for col_name, col_type in returns.items()})
+
+            df = tbl_operators.execute()
+        return df
+
+    def _assign_call_udf(self, call_udf_expr):
+        """
+        DESCRIPTION:
+            Internal function for DataFrame.assign() to execute the call_udf using
+            Script/Apply Table Operator and create new column for teradataml DataFrame.
+
+        PARAMETER:
+            call_udf_expr:
+                Required Argument.
+                Specifies a dictionary of column name to call_udf expressions.
+                Types: dict
+
+        RETURNS:
+            teradataml DataFrame
+
+        RAISES:
+            None.
+
+        EXAMPLES:
+            # call_udf_expr is a dictionary of column names to call_udf expressions.
+            call_udf_expr = {'upper_col': <teradataml.dataframe.sql._SQLColumnExpression object at 0x0000028E59C44310>,
+                             'sum_col': <teradataml.dataframe.sql._SQLColumnExpression object at 0x0000028E59C41690>}
+            self._assign_call_udf(call_udf_expr)
+        """
+        df = self
+        # Create a dictionary of output columns to column type (teradata type).
+        returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
+        # Create a dictionary of output columns to column type (python types).
+        output_type_converters = {col_name: _Dtypes._teradata_type_to_python_type(col_type) \
+                                  for col_name, col_type in returns.items()}
+
+        for colname, col in call_udf_expr.items():
+            returns[colname] = col.type
+            output_type_converters[colname] = _Dtypes._teradata_type_to_python_type(col.type)
+            script_name = col._udf_script
+            delimiter = col._delimiter
+            quotechar = col._quotechar
+
+        # Create a dictionary of arguments to be passed to the script.
+        script_data = {}
+        script_data['input_cols'] = df.columns
+        script_data['output_cols'] = list(returns.keys())
+        script_data['output_type_converters'] = output_type_converters
+        script_data['function_args'] = {colname: col._udf_args}
+        script_data['delimiter'] = delimiter
+        script_data['qoutechar'] = quotechar
+
+        # Convert the dictionary to a string.
+        # The string is URL encoded to pass it as a parameter to the script.
+        script_data = urllib.parse.quote_plus(json.dumps(script_data))
+
+        if UtilFuncs._is_lake():
+            from teradataml.table_operators.Apply import Apply
+            apply_op_obj = Apply(data=df,
+                                 script_name=script_name,
+                                 env_name=col._env_name,
+                                 returns=returns,
+                                 delimiter=delimiter,
+                                 quotechar=quotechar,
+                                 files_local_path=GarbageCollector._get_temp_dir_name(),
+                                 apply_command="python3 {} {}".format(script_name, script_data)
+                                 )
+            try:
+                df = apply_op_obj.execute_script(
+                    output_style=OutputStyle.OUTPUT_TABLE.value)
+            except Exception:
+                raise
+        else:
+            import teradataml.context.context as context
+            database = context._get_current_databasename()
+
+            check_reserved_keyword = False if sorted(list(returns.keys())) == sorted(df.columns) else True
+
+            from teradataml.table_operators.Script import Script
+            table_op_obj = Script(data=df,
+                                  script_name=script_name,
+                                  files_local_path=GarbageCollector._get_temp_dir_name(),
+                                  script_command="{}/bin/python3 ./{}/{} {}".format(
+                                      configure.indb_install_location, database, script_name, script_data),
+                                  returns=returns,
+                                  quotechar=quotechar,
+                                  delimiter=delimiter
+                                  )
+            table_op_obj.check_reserved_keyword = check_reserved_keyword
+            try:
+                df = table_op_obj.execute_script(
+                    output_style=OutputStyle.OUTPUT_TABLE.value)
+            except Exception:
+                raise
+        return df
 
     @collect_queryband(queryband="DF_assign")
     def assign(self, drop_columns=False, **kwargs):
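_assign_udf first buckets the requested output columns by the user environment their UDF should execute in, then issues one table-operator call per environment. A plain-Python sketch of that grouping step; the env names and columns below are hypothetical stand-ins for UtilFuncs._get_env_name results:

    from collections import OrderedDict

    # Column name -> env its UDF runs in (illustrative values).
    col_to_env = {"upper_col": "env_a", "len_col": "env_a", "sum_col": "env_b"}

    env_mapper = OrderedDict()
    for colname, env_name in col_to_env.items():
        # Group every output column under the env that must execute its UDF.
        env_mapper.setdefault(env_name, []).append(colname)

    print(env_mapper)
    # OrderedDict([('env_a', ['upper_col', 'len_col']), ('env_b', ['sum_col'])])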
@@ -7119,10 +7727,12 @@ class DataFrame():
             drop_columns:
                 Optional Argument.
                 If True, drop columns that are not specified in assign.
-
-                When DataFrame.assign() is run on DataFrame.groupby(), this argument
-
-
+                Notes:
+                    1. When DataFrame.assign() is run on DataFrame.groupby(), this argument
+                       is ignored. In such cases, all columns are dropped and only new columns
+                       and grouping columns are returned.
+                    2. Argument is ignored for UDF functions.
+
                 Default Value: False
                 Types: bool
 
@@ -7138,6 +7748,7 @@ class DataFrame():
                     * SQLAlchemy ClauseElements.
                       (See teradataml extension with SQLAlchemy in teradataml User Guide
                       and Function reference guide for more details)
+                    * Function - udf, call_udf.
 
 
         RETURNS:
@@ -7163,6 +7774,16 @@ class DataFrame():
                used, but the column used in such function must be a part of group by columns.
                See examples for teradataml extension with SQLAlchemy on using various
                functions with DataFrame.assign().
+            6. UDF expressions can run on both Vantage Cloud Lake, leveraging the Apply Table
+               Operator of the Open Analytics Framework, and Enterprise, leveraging Vantage's
+               Script Table Operator.
+            7. One can pass both regular expressions and udf expressions to this API.
+               However, regular expressions are computed first, followed by udf expressions,
+               and the order of the resulting columns is maintained accordingly.
+               Look at Example 18 to understand more.
+            8. While passing multiple udf expressions, one cannot pass one column's output
+               as another column's input in the same ``assign`` call.
+            9. If the user passes multiple udf expressions, the delimiter and quotechar
+               specified in the last udf expression are considered for processing.
 
         RAISES:
             1. ValueError - When a callable is passed as a value, or columns from different
@@ -7424,6 +8045,158 @@ class DataFrame():
               1    Advanced  2.886226  3.508750   84.21
               2      Novice  6.377775  3.559091   39.15
             >>>
+
+            #
+            # Executing user defined function (UDF) with assign()
+            #
+            # Example 15: Create two user defined functions, 'to_upper' and 'sum':
+            #             'to_upper' to convert the values in 'accounts' to upper case, and
+            #             'sum' to add the length of the string values in column 'accounts'
+            #             to column 'Feb' and store the result in an Integer type column.
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            >>> from teradatasqlalchemy.types import INTEGER
+            >>> @udf(returns=INTEGER())
+            ... def sum(x, y):
+            ...     return len(x)+y
+            >>>
+            # Assign both Column Expressions returned by user defined functions
+            # to the DataFrame.
+            >>> res = df.assign(upper_stats = to_upper('accounts'), len_sum = sum('accounts', 'Feb'))
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime upper_stats  len_sum
+            accounts
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04    BLUE INC       98
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04     RED INC      207
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  YELLOW INC      100
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04   JONES LLC      209
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  ORANGE INC      220
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    ALPHA CO      218
+            >>>
+
+            # Example 16: Create a user defined function to add 4 to the 'datetime' column
+            #             and store the result in a DATE type column.
+            >>> from teradatasqlalchemy.types import DATE
+            >>> import datetime
+            >>> @udf(returns=DATE())
+            ... def add_date(x, y):
+            ...     return (datetime.datetime.strptime(x, "%y/%m/%d")+datetime.timedelta(y)).strftime("%y/%m/%d")
+            >>>
+            # Assign the Column Expression returned by user defined function
+            # to the DataFrame.
+            >>> res = df.assign(new_date = add_date('datetime', 4))
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime  new_date
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04  17/01/08
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04  17/01/08
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04  17/01/08
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  17/01/08
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  17/01/08
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04  17/01/08
+            >>>
+
+            # Example 17: Create a user defined function 'to_upper' to convert the
+            #             values in 'accounts' to upper case, and create a
+            #             new column with a string literal value.
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            # Assign both expressions to the DataFrame.
+            >>> res = df.assign(upper_stats = to_upper('accounts'), new_col = 'string')
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime new_col upper_stats
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04  string    ALPHA CO
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04  string    BLUE INC
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  string  YELLOW INC
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04  string   JONES LLC
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04  string     RED INC
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  string  ORANGE INC
+            >>>
+
+            # Example 18: Create two user defined functions, 'to_upper' and 'sum',
+            #             and create new columns with a string literal value and an
+            #             arithmetic operation on column 'Feb'.
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            >>> from teradatasqlalchemy.types import INTEGER
+            >>> @udf(returns=INTEGER())
+            ... def sum(x, y):
+            ...     return len(x)+y
+            >>>
+            # Assign all expressions to the DataFrame.
+            >>> res = df.assign(upper_stats = to_upper('accounts'), new_col = 'abc',
+            ...                 len_sum = sum('accounts', 'Feb'), col_sum = df.Feb+1)
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime  col_sum new_col upper_stats  len_sum
+            accounts
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04     91.0     abc    BLUE INC       98
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    211.0     abc    ALPHA CO      218
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04    201.0     abc   JONES LLC      209
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04     91.0     abc  YELLOW INC      100
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04    211.0     abc  ORANGE INC      220
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04    201.0     abc     RED INC      207
+            >>>
+
+            # Example 19: Convert the values in the 'accounts' column to upper case using a user
+            #             defined function on Vantage Cloud Lake.
+            # Create a Python 3.10.5 environment with given name and description in Vantage.
+            >>> env = create_env('test_udf', 'python_3.10.5', 'Test environment for UDF')
+            User environment 'test_udf' created.
+            >>>
+            # Create a user defined function 'to_upper' to get the values in upper case
+            # and pass the user env to run it on.
+            >>> from teradataml.dataframe.functions import udf
+            >>> @udf(env_name = env)
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            # Assign the Column Expression returned by user defined function
+            # to the DataFrame.
+            >>> df.assign(upper_stats = to_upper('accounts'))
+                          Feb    Jan    Mar    Apr  datetime upper_stats
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    ALPHA CO
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04    BLUE INC
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  YELLOW INC
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04   JONES LLC
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  ORANGE INC
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04     RED INC
+            >>>
+
+            # Example 20: Register and call the user defined function to get the values in upper case.
+            >>> from teradataml.dataframe.functions import udf, register, call_udf
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            # Register the created user defined function with name "upper".
+            >>> register("upper", to_upper)
+            >>>
+            # Call the user defined function registered with name "upper" and assign the
+            # ColumnExpression returned to the DataFrame.
+            >>> res = df.assign(upper_col = call_udf("upper", ('accounts',)))
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime   upper_col
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    ALPHA CO
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04    BLUE INC
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  YELLOW INC
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04   JONES LLC
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  ORANGE INC
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04     RED INC
+            >>>
         """
         # Argument validations
         awu_matrix = []
@@ -7469,13 +8242,42 @@ class DataFrame():
             msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
             raise TeradataMlException(msg, MessageCodes.TDMLDF_INFO_ERROR)
 
-
-
-
-
-
-
-
+        # Create a dictionary of column name to udf expressions and
+        # column name to normal/regular expressions.
+        udf_expr = {}
+        regular_expr = {}
+        call_udf_expr = {}
+        for colname, col in kwargs.items():
+            # If value passed in kwargs is a ColumnExpression and is a udf, store it.
+            if isinstance(col, ColumnExpression) and col._udf:
+                udf_expr[colname] = col
+            # If value passed in kwargs is a ColumnExpression and is a registered udf script, store it.
+            elif isinstance(col, ColumnExpression) and col._udf_script:
+                call_udf_expr[colname] = col
+            else:
+                regular_expr[colname] = col
+        df = self
+
+        # If kwargs contains both regular and udf expressions, first create new columns
+        # from normal/regular expressions, then on the output dataframe create new columns
+        # from udf expressions.
+        if bool(regular_expr):
+            try:
+                (new_meta, new_nodeid) = df._generate_assign_metaexpr_aed_nodeid(drop_columns, **regular_expr)
+                df = df._create_dataframe_from_node(new_nodeid, new_meta, df._index_label)
+            except Exception as err:
+                errcode = MessageCodes.TDMLDF_INFO_ERROR
+                msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
+                raise TeradataMlException(msg, errcode) from err
+
+        if bool(udf_expr):
+            df = df._assign_udf(udf_expr)
+
+        if bool(call_udf_expr):
+            df = df._assign_call_udf(call_udf_expr)
+
+        return df
+
 
     @collect_queryband(queryband="DF_get")
     def get(self, key):
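assign() now routes each keyword argument into one of three buckets, regular column expressions, udf expressions, and registered-udf (call_udf) expressions, and applies them in that order. A self-contained sketch of the classification, using a stand-in class instead of teradataml's ColumnExpression:

    class FakeColumnExpression:
        # Stand-in carrying the two markers the dispatch above inspects.
        def __init__(self, udf=None, udf_script=None):
            self._udf = udf
            self._udf_script = udf_script

    kwargs = {"a": FakeColumnExpression(udf=str.upper),
              "b": FakeColumnExpression(udf_script="upper.py"),
              "c": 42}

    udf_expr, call_udf_expr, regular_expr = {}, {}, {}
    for name, value in kwargs.items():
        if isinstance(value, FakeColumnExpression) and value._udf:
            udf_expr[name] = value
        elif isinstance(value, FakeColumnExpression) and value._udf_script:
            call_udf_expr[name] = value
        else:
            regular_expr[name] = value

    print(list(udf_expr), list(call_udf_expr), list(regular_expr))  # ['a'] ['b'] ['c']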
@@ -9110,6 +9912,12 @@ class DataFrame():
         # Validate argument types
         _Validators._validate_function_arguments(awu_matrix)
 
+        # If self and right DataFrames are pointing to same Table object,
+        # raise error.
+        if self._metaexpr.t is right._metaexpr.t:
+            raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_ALIAS_REQUIRED, "merge"),
+                                      MessageCodes.TDMLDF_ALIAS_REQUIRED)
+
         if (right_on is not None and left_on is None) or (right_on is None and left_on is not None):
             raise TeradataMlException(
                 Messages.get_message(MessageCodes.MUST_PASS_ARGUMENT, "left_on", "right_on"),
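The new merge guard is a pure object-identity test: two DataFrames wrapping the very same SQLAlchemy Table object cannot be merged without an alias, since the generated join would reference one table twice ambiguously. The check in miniature, with plain objects as stand-ins:

    # Both "DataFrames" share one underlying table object, so `is` matches.
    shared_table = object()
    lhs_table = rhs_table = shared_table

    if lhs_table is rhs_table:
        # Mirrors the TDMLDF_ALIAS_REQUIRED error raised above.
        print("merge on the same table object requires an alias")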
@@ -10107,7 +10915,8 @@ class DataFrame():
                                              test_size=list_of_fracs[1],
                                              stratify_column=stratify_column,
                                              seed=seed,
-                                             persist=True
+                                             persist=True,
+                                             display_table_name=False)
 
         # Retrieve the table name from TrainTestSplit_out object.
         table_name = TrainTestSplit_out.result._table_name
@@ -10218,10 +11027,10 @@ class DataFrame():
 
         # Make this non-lazy. Added this in order to fix https://teradata-pe.atlassian.net/browse/ELE-6368
         # Cannot use __execute_node_and_set_table_name because self points to original df.
-        # Hence, setting the
+        # Hence, setting the _table_name with _execute_node_return_db_object_name.
 
         df = self._create_dataframe_from_node(sample_node_id, new_metaexpr, self._index_label)
-        df.
+        df._table_name = df_utils._execute_node_return_db_object_name(sample_node_id, new_metaexpr)
 
         return df
 
@@ -10352,26 +11161,14 @@ class DataFrame():
                 where admitted > 0) as temp_table SAMPLE 0.9'
 
         """
+        # Argument validations
+        awu_matrix = []
+        awu_matrix.append(["full_query", full_query, False, (bool)])
+        # Validate argument types
+        _Validators._validate_function_arguments(awu_matrix)
 
         try:
-
-            awu_matrix = []
-            awu_matrix.append(["full_query", full_query, False, (bool)])
-            # Validate argument types
-            _Validators._validate_function_arguments(awu_matrix)
-
-            node_id = self._nodeid
-
-            if isinstance(self, (DataFrameGroupBy, DataFrameGroupByTime)):
-                # If dataframe is either of type groupby or groupbytime
-                # then get it's parent dataframe nodeid and return queries
-                # for the same
-                node_id = self._aed_utils._aed_get_parent_nodeids(self._nodeid)[0]
-
-            queries = self._aed_utils._aed_show_query(node_id, query_with_reference_to_top=full_query)
-
-            return queries[0][0]
-
+            return self.__generate_aed_query(full_query)
         except TeradataMlException:
             raise
 
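The inlined logic removed here has evidently been extracted into a private __generate_aed_query helper whose body is not shown in this hunk. A sketch of what that helper presumably contains, reassembled from the removed lines above (all names come from the removed code):

    def __generate_aed_query(self, full_query):
        node_id = self._nodeid
        # For grouped DataFrames, resolve queries against the parent node.
        if isinstance(self, (DataFrameGroupBy, DataFrameGroupByTime)):
            node_id = self._aed_utils._aed_get_parent_nodeids(self._nodeid)[0]
        queries = self._aed_utils._aed_show_query(node_id, query_with_reference_to_top=full_query)
        return queries[0][0]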
@@ -10381,7 +11178,7 @@ class DataFrame():
         except Exception as err:
             errcode = MessageCodes.TDMLDF_INFO_ERROR
             msg = Messages.get_message(errcode)
-            raise TeradataMlException(msg, errcode) from err
+            raise TeradataMlException(msg, errcode) from err
 
     @collect_queryband(queryband="DF_mapRow")
     def map_row(self,
@@ -11899,6 +12696,9 @@ class DataFrame():
             _Validators._validate_column_exists_in_dataframe(column_names, self._metaexpr,
                                                              False)
         column_names = list(dict.fromkeys(column_names))
+
+        if list_td_reserved_keywords(column_names):
+            column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)
 
         col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
         sel_nodeid = self._aed_utils._aed_select(self._nodeid, ','.join(column_names), True)
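The new guard quotes every selected column as soon as any of them collides with a Teradata reserved keyword. A stand-in sketch of that all-or-nothing quoting; the reserved-word set here is an illustrative subset, not the real list behind list_td_reserved_keywords:

    reserved = {"current", "title"}  # illustrative subset only
    column_names = ["id", "title"]

    if any(c.lower() in reserved for c in column_names):
        # Mirrors UtilFuncs._teradata_quote_arg: quote all names, not just offenders.
        column_names = ['"{0}"'.format(c) for c in column_names]

    print(column_names)  # ['"id"', '"title"']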
@@ -13840,7 +14640,7 @@ class DataFrame():
                 Types: int OR NoneType
 
         RETURNS:
-            iterator, an object to iterate over
+            iterator, an object to iterate over rows in the DataFrame.
 
         RAISES:
             None
@@ -13889,9 +14689,10 @@ class DataFrame():
         cur = execute_sql(query)
 
         if name:
+            columns = [column[0] for column in cur.description]
             for rec in cur:
-
-                yield
+                row = _Row(columns=columns, values=rec)
+                yield row
         else:
             for rec in cur:
                 yield rec
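With a non-None name, itertuples() now pairs each fetched record with the column names taken from the cursor description instead of yielding bare tuples. The pairing in miniature, using a dict as a stand-in for the _Row object:

    columns = ["accounts", "Feb"]   # from cur.description in the real code
    rec = ("Blue Inc", 90.0)        # one fetched record

    row = dict(zip(columns, rec))   # _Row presumably offers similar name-based access
    print(row["accounts"], row["Feb"])  # Blue Inc 90.0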