teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +183 -0
- teradataml/__init__.py +6 -3
- teradataml/_version.py +2 -2
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +275 -40
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +1 -0
- teradataml/analytics/json_parser/utils.py +17 -21
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +10 -2
- teradataml/analytics/table_operator/__init__.py +3 -2
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +62 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1553 -319
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +276 -319
- teradataml/automl/data_transformation.py +163 -81
- teradataml/automl/feature_engineering.py +402 -239
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +48 -51
- teradataml/automl/model_training.py +291 -189
- teradataml/catalog/byom.py +8 -8
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +48 -6
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +156 -120
- teradataml/common/messagecodes.py +6 -1
- teradataml/common/messages.py +3 -1
- teradataml/common/sqlbundle.py +1 -1
- teradataml/common/utils.py +103 -11
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +121 -31
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +10 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/glm_example.json +28 -1
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +21 -2
- teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
- teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
- teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
- teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/templates/open_source_ml.json +2 -1
- teradataml/data/teradataml_example.json +97 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +9 -4
- teradataml/dataframe/data_transfer.py +125 -64
- teradataml/dataframe/dataframe.py +575 -57
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +273 -90
- teradataml/dataframe/functions.py +339 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +740 -18
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +324 -18
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
- teradataml/options/__init__.py +16 -5
- teradataml/options/configure.py +39 -6
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +26 -19
- teradataml/scriptmgmt/lls_utils.py +120 -16
- teradataml/table_operators/Script.py +4 -5
- teradataml/table_operators/TableOperator.py +160 -26
- teradataml/table_operators/table_operator_util.py +88 -41
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/validators.py +41 -3
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
teradataml/dataframe/sql.py
CHANGED
@@ -40,7 +40,7 @@ import sqlalchemy as sqlalc
 
 import re
 
-from teradatasqlalchemy.dialect import dialect as td_dialect, compiler as td_compiler
+from teradatasqlalchemy.dialect import dialect as td_dialect, compiler as td_compiler, TeradataTypeCompiler as td_type_compiler
 from teradatasqlalchemy import (INTEGER, SMALLINT, BIGINT, BYTEINT, DECIMAL, FLOAT, NUMBER)
 from teradatasqlalchemy import (DATE, TIME, TIMESTAMP)
 from teradatasqlalchemy import (BYTE, VARBYTE, BLOB)
@@ -52,7 +52,7 @@ from teradatasqlalchemy import (INTERVAL_DAY, INTERVAL_DAY_TO_HOUR, INTERVAL_DAY
                                 INTERVAL_YEAR_TO_MONTH)
 from teradatasqlalchemy import (PERIOD_DATE, PERIOD_TIME, PERIOD_TIMESTAMP)
 from teradatasqlalchemy import XML, GEOMETRY
-from
+from teradataml.telemetry_utils.queryband import collect_queryband
 import decimal
 import datetime as dt
 from teradataml.dataframe.window import Window
@@ -612,8 +612,9 @@ class _SQLTableExpression(_PandasTableExpression):
             expression = display_number(c.expression)
         elif isinstance(c.type, tuple(datetime_period_types)):
             expression = cast_expr(c.expression, 30)
+        # Change the size as INTERVAL_DAY_TO_SECOND(4, 6) is failing.
         elif isinstance(c.type, tuple(interval_types)):
-            expression = cast_expr(c.expression,
+            expression = cast_expr(c.expression, 25)
         elif isinstance(c.type, GEOMETRY):
             expression = cast_expr(c.expression, display.geometry_column_length) if \
                 display.geometry_column_length is not None else c.expression.label(c.name)
@@ -1618,6 +1619,8 @@ class _ArithmeticColumnExpression(ColumnExpression):
     def __sub__(self, other):
         """
         Compute the difference between two ColumnExpressions using -
+        Note:
+            * Difference between two timestamp columns return value in seconds.
 
         PARAMETERS:
             other:
@@ -1644,6 +1647,15 @@ class _ArithmeticColumnExpression(ColumnExpression):
         2  67/06/30  07/07/10  421.0  465.0  179.0
         3  67/06/30  07/07/10  434.0  485.0  185.0
         5  67/06/30  07/07/10  459.0  509.0  211.0
+        >>> load_example_data("uaf", "Convolve2RealsLeft")
+        >>> timestamp_df = DataFrame("Convolve2RealsLeft")
+        >>> timestamp_df
+           row_seq                  row_i_time  col_seq               column_i_time    A     B     C     D
+        id
+        1        1  2018-08-08 08:02:00.000000        0  2018-08-08 08:00:00.000000  1.3  10.3  20.3  30.3
+        1        1  2018-08-08 08:02:00.000000        1  2018-08-08 08:02:00.000000  1.4  10.4  20.4  30.4
+        1        0  2018-08-08 08:00:00.000000        1  2018-08-08 08:02:00.000000  1.2  10.2  20.2  30.2
+        1        0  2018-08-08 08:00:00.000000        0  2018-08-08 08:00:00.000000  1.1  10.1  20.1  30.1
 
         # Example 1: Subtract 100 from the income amount and assign the final amount
         # to new column 'remaining_income'.
@@ -1666,7 +1678,26 @@ class _ArithmeticColumnExpression(ColumnExpression):
         1  67/06/30  07/07/10  415.0  451.0  180.0  271.0
         5  67/06/30  07/07/10  459.0  509.0  211.0  298.0
         4  67/06/30  07/07/10  448.0  493.0  192.0  301.0
+
+        # Example 3: Subtract 2 timestamp columns and assign to new column 'seconds'.
+        >>> timestamp_df.assign(seconds = timestamp_df.row_i_time-timestamp_df.column_i_time)
+           row_seq                  row_i_time  col_seq               column_i_time    A     B     C     D  seconds
+        id
+        1        1  2018-08-08 08:02:00.000000        0  2018-08-08 08:00:00.000000  1.3  10.3  20.3  30.3    120.0
+        1        1  2018-08-08 08:02:00.000000        1  2018-08-08 08:02:00.000000  1.4  10.4  20.4  30.4      0.0
+        1        0  2018-08-08 08:00:00.000000        1  2018-08-08 08:02:00.000000  1.2  10.2  20.2  30.2   -120.0
+        1        0  2018-08-08 08:00:00.000000        0  2018-08-08 08:00:00.000000  1.1  10.1  20.1  30.1      0.0
+
         """
+        if isinstance(self._type, TIMESTAMP) and isinstance(other._type, TIMESTAMP):
+            s = """
+                (CAST((CAST({0} AS DATE)-CAST({1} AS DATE)) AS FLOAT) * 86400) +
+                ((EXTRACT(HOUR FROM {0}) - EXTRACT(HOUR FROM {1})) * 3600) +
+                ((EXTRACT(MINUTE FROM {0}) - EXTRACT(MINUTE FROM {1})) * 60) +
+                ((EXTRACT(SECOND FROM {0}) - EXTRACT(SECOND FROM {1})))
+                """.format(self.compile(), other.compile())
+            return _SQLColumnExpression(literal_column(s, type_ = FLOAT))
+
         expr = other.expression if isinstance(other, _SQLColumnExpression) else other
         res = _SQLColumnExpression(self.expression - expr)
         return res
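The new timestamp branch above is worth a note: rather than relying on native interval subtraction, whose result type depends on the declared interval precision, it assembles a plain FLOAT seconds delta from the date difference plus the hour, minute, and second component differences. A standard-library sketch of the same arithmetic, for local verification only (this is not teradataml code):

    from datetime import datetime

    def ts_diff_seconds(a, b):
        # Mirror the generated SQL: date delta * 86400 plus component deltas.
        return ((a.date() - b.date()).days * 86400.0
                + (a.hour - b.hour) * 3600
                + (a.minute - b.minute) * 60
                + (a.second - b.second))

    row_i = datetime(2018, 8, 8, 8, 2, 0)
    col_i = datetime(2018, 8, 8, 8, 0, 0)
    print(ts_diff_seconds(row_i, col_i))  # 120.0, matching the 'seconds' column in Example 3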
@@ -5431,12 +5462,18 @@ class _SQLColumnExpression(_LogicalColumnExpression,
             expression = literal_column(expression)
         self.kw = kw
         self.expression = expression
-        self.type = kw.get("type", expression.type)
+        self.type = kw.get("type", expression.type if expression is not None else kw.get("udf_type"))
         # Initial ColumnExpression has only one dataframe and hence
         # __has_multiple_dataframes = False.
         # eg: df1.col1, df2.col2
         self.__has_multiple_dataframes = False
         self.__names = []
+        self._udf = kw.get("udf", None)
+        self._udf_args = kw.get("udf_args", None)
+        self._env_name = kw.get("env_name", None)
+        self._delimiter = kw.get("delimiter", None)
+        self._quotechar = kw.get("quotechar", None)
+        self.alias_name = self.compile() if self._udf is None else None
 
     @property
     def expression(self):
@@ -5801,7 +5838,7 @@ class _SQLColumnExpression(_LogicalColumnExpression,
         return _SQLColumnExpression(func.concat(*columns_))
 
     @collect_queryband(queryband="DFC_cast")
-    def cast(self, type_ = None):
+    def cast(self, type_ = None, format = None, timezone = None):
         """
         DESCRIPTION:
             Apply the CAST SQL function to the column with the type specified.
@@ -5817,6 +5854,32 @@ class _SQLColumnExpression(_LogicalColumnExpression,
                 Default value: None
                 Types: teradatasqlalchemy type or object of teradatasqlalchemy type
 
+            format:
+                Optional Argument.
+                Specifies a variable length string containing formatting characters
+                that define the display format for the data type.
+                Formats can be specified for columns that have character, numeric, byte,
+                DateTime, Period or UDT data types.
+                Note:
+                    * Teradata supports different formats. Look at 'Formats' section in
+                      "SQL-Data-Types-and-Literals" in Vantage documentation for additional
+                      details.
+                Default value: None
+                Types: str
+
+            timezone:
+                Optional Argument.
+                Specifies the timezone string.
+                Check "SQL-Date-and-Time-Functions-and-Expressions" in
+                Vantage documentation for supported timezones.
+                Type: ColumnExpression or str.
+
+        RETURNS:
+            ColumnExpression
+
+        RAISES:
+            TeradataMlException
+
         EXAMPLES:
             >>> load_example_data("dataframe","admissions_train")
             >>> df = DataFrame('admissions_train')
@@ -5841,8 +5904,24 @@ class _SQLColumnExpression(_LogicalColumnExpression,
             programming     str
             admitted        int
 
-            >>>
-            >>>
+            >>> dataframe_dict = {"id": [100, 200,300],
+            >>>                   "timestamp_col": ['1000-01-10 23:00:12-02:00', '2015-01-08 13:00:00+12:00', '2014-12-10 10:00:35-08:00'],
+            >>>                   "timezone_col": ["GMT", "America Pacific", "GMT+10"]}
+            >>> pandas_df = pd.DataFrame(dataframe_dict)
+            >>> copy_to_sql(pandas_df, table_name = 'new_table', if_exists = 'replace')
+            >>> df1 = DataFrame("new_table")
+            >>> df1
+                id              timestamp_col     timezone_col
+               300  2014-12-10 10:00:35-08:00           GMT+10
+               200  2015-01-08 13:00:00+12:00  America Pacific
+               100  1000-01-10 23:00:12-02:00              GMT
+            >>> df1.dtypes
+            id              int
+            timestamp_col   str
+            timezone_col    str
+
+            # Example 1: Let's try creating a new DataFrame casting 'id' column (of type INTEGER) to VARCHAR(5),
+            # an object of a teradatasqlalchemy type.
             >>> from teradatasqlalchemy import VARCHAR
             >>> new_df = df.assign(char_id = df.id.cast(type_=VARCHAR(5)))
             >>> new_df
@@ -5867,8 +5946,8 @@ class _SQLColumnExpression(_LogicalColumnExpression,
             admitted        int
             char_id         str
 
-
-
+            # Example 2: Now let's try creating a new DataFrame casting 'id' column (of type INTEGER) to VARCHAR,
+            # a teradatasqlalchemy type.
             >>> new_df_2 = df.assign(char_id = df.id.cast(type_=VARCHAR))
             >>> new_df_2
                masters   gpa     stats programming  admitted char_id
@@ -5892,25 +5971,65 @@ class _SQLColumnExpression(_LogicalColumnExpression,
             admitted        int
             char_id         str
 
-
-
+            # Example 3: Let's try filtering some data with a match on a column cast to another type,
+            # an object of a teradatasqlalchemy type.
             >>> df[df.id.cast(VARCHAR(5)) == '1']
               masters   gpa     stats programming  admitted
            id
            1      yes  3.95  Beginner    Beginner         0
 
-
+            # Example 4: Now let's try the same, this time using a teradatasqlalchemy type.
             >>> df[df.id.cast(VARCHAR) == '1']
               masters   gpa     stats programming  admitted
            id
            1      yes  3.95  Beginner    Beginner         0
 
-
-
+            # Example 5: Let's try creating a new DataFrame casting 'timestamp_col' column (of type VARCHAR) to TIMESTAMP,
+            # using format.
+            >>> new_df1 = df1.assign(new_col = df1.timestamp_col.cast(TIMESTAMP, format='Y4-MM-DDBHH:MI:SSBZ'))
+                id              timestamp_col     timezone_col              new_col
+               300  2014-12-10 10:00:35-08:00           GMT+10  2014-12-10 18:00:35
+               200  2015-01-08 13:00:00+12:00  America Pacific  2015-01-08 01:00:00
+               100  1000-01-10 23:00:12-02:00              GMT  1000-01-11 01:00:12
+            >>> new_df1.tdtypes
+            id                             int
+            timestamp_col                  str
+            timezone_col                   str
+            new_col          datetime.datetime
+
+            # Example 6: Let's try creating a new DataFrame casting 'id' column (of type INTEGER) to VARCHAR,
+            # using format.
+            >>> new_df2 = df1.assign(new_col = df1.id.cast(VARCHAR, format='zzz.zz'))
+                id              timestamp_col     timezone_col new_col
+               300  2014-12-10 10:00:35-08:00           GMT+10  300.00
+               200  2015-01-08 13:00:00+12:00  America Pacific  200.00
+               100  1000-01-10 23:00:12-02:00              GMT  100.00
+            >>> new_df2.dtypes
+            id              int
+            timestamp_col   str
+            timezone_col    str
+            new_col         str
+
+            # Example 7: Let's try creating a new DataFrame casting 'timestamp_with_timezone' column (of type TIMESTAMP) to
+            # TIMESTAMP WITH TIMEZONE, with offset 'GMT+10'.
+            >>> new_df3 = new_df1.assign(timestamp_with_timezone = new_df1.new_col.cast(TIMESTAMP(timezone=True), timezone='GMT+10'))
+                id              timestamp_col     timezone_col              new_col           timestamp_with_timezone
+               300  2014-12-10 10:00:35-08:00           GMT+10  2014-12-10 18:00:35  2014-12-11 04:00:35.000000+10:00
+               200  2015-01-08 13:00:00+12:00  America Pacific  2015-01-08 01:00:00  2015-01-08 11:00:00.000000+10:00
+               100  1000-01-10 23:00:12-02:00              GMT  1000-01-11 01:00:12  1000-01-11 11:00:12.000000+10:00
+            >>> new_df3.dtypes
+            id                                     int
+            timestamp_col                          str
+            timezone_col                           str
+            new_col                  datetime.datetime
+            timestamp_with_timezone  datetime.datetime
+        """
+        # Validating Arguments
+        arg_type_matrix = []
+        arg_type_matrix.append(["format", format , True, (str), True])
+        arg_type_matrix.append(["timezone", timezone, True, (str, ColumnExpression, int, float), True])
+        _Validators._validate_function_arguments(arg_type_matrix)
 
-        RAISES:
-            TeradataMlException
-        """
         # If type_ is None or not specified, raise an Exception
         if type_ is None:
             raise TeradataMlException(Messages.get_message(MessageCodes.MISSING_ARGS, 'type_'),
@@ -5921,8 +6040,26 @@ class _SQLColumnExpression(_LogicalColumnExpression,
             raise TeradataMlException(Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE, 'type_',
                                                            'a valid teradatasqlalchemy type'),
                                       MessageCodes.UNSUPPORTED_DATATYPE)
-
         expression = func.cast(self.expression, type_=type_).label(self.name)
+        if format or timezone:
+            # Casting to VARCHAR or CHAR with format require this type of query
+            # CAST((CAST (F1 AS FORMAT 'format_str')) AS [CHAR|VARCHAR])
+            if isinstance(type_, (VARCHAR, CHAR)) or (isinstance(type_, type) and issubclass(type_, (VARCHAR, CHAR))):
+                expression = func.cast(literal_column("""CAST({} AS FORMAT '{}')""".format(self.compile(), format)), type_=type_)
+            else:
+                # Compile _TDType to string
+                type_compiler = td_type_compiler(td_dialect)
+                type_expression = type_compiler.process(type_) if not isinstance(type_, type) else type_compiler.process(type_())
+                # Create a query with format and timezone string
+                # CAST(TIMESTAMP "column_name" AS "_TDType" FORMAT "format" AT TIMEZONE "timezone_str")
+                format = " FORMAT '{}'".format(format) if format else ""
+                if timezone and isinstance(timezone, _SQLColumnExpression):
+                    timezone = _SQLColumnExpression(literal_column(f' AT TIME ZONE {timezone.compile()}')).compile()
+                elif timezone:
+                    timezone = _SQLColumnExpression(literal_column(_SQLColumnExpression._timezone_string(timezone))).compile()
+                else:
+                    timezone = ""
+                expression = literal_column("""CAST({} AS {}{}{})""".format(self.compile(), type_expression, timezone, format), type_=type_)
         return _SQLColumnExpression(expression)
 
     def __hash__(self):
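Depending on the target type, the new format/timezone path emits one of two SQL shapes: a nested CAST(... AS FORMAT ...) for VARCHAR/CHAR targets, or a single CAST with optional AT TIME ZONE and FORMAT clauses for everything else. A minimal sketch of the string construction, with illustrative names only (cast_sql is not part of teradataml):

    def cast_sql(col, type_sql, format=None, timezone=None):
        # VARCHAR/CHAR targets nest the FORMAT cast:
        # CAST(CAST(col AS FORMAT 'fmt') AS VARCHAR(n))
        if type_sql.upper().startswith(('VARCHAR', 'CHAR')):
            return "CAST(CAST({} AS FORMAT '{}') AS {})".format(col, format, type_sql)
        # Other targets append AT TIME ZONE and FORMAT clauses:
        tz = " AT TIME ZONE '{}'".format(timezone) if timezone else ""
        fmt = " FORMAT '{}'".format(format) if format else ""
        return "CAST({} AS {}{}{})".format(col, type_sql, tz, fmt)

    print(cast_sql("timestamp_col", "TIMESTAMP", format='Y4-MM-DDBHH:MI:SSBZ'))
    # CAST(timestamp_col AS TIMESTAMP FORMAT 'Y4-MM-DDBHH:MI:SSBZ')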
@@ -10088,3 +10225,588 @@ class _SQLColumnExpression(_LogicalColumnExpression,
             return list(set(result))
 
         return []
+
+    def alias(self, name):
+        """
+        DESCRIPTION:
+            Function to returns this column with aliased name.
+
+        PARAMETERS:
+            name:
+                Required Argument.
+                Specifies the column name.
+                Type: str
+
+        RAISES:
+            TypeError, ValueError
+
+        RETURNS:
+            ColumnExpression
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("dataframe", "admissions_train")
+
+            # Create a DataFrame on 'admissions_train' table.
+            >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            38     yes  2.65  Advanced    Beginner         1
+            7      yes  2.33    Novice      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            5       no  3.44    Novice      Novice         0
+            3       no  3.70    Novice    Beginner         1
+            22     yes  3.46    Novice    Beginner         0
+            24      no  1.87  Advanced      Novice         1
+            36      no  3.00  Advanced      Novice         0
+            19     yes  1.98  Advanced    Advanced         0
+            40     yes  3.95    Novice    Beginner         0
+
+            # Example 1: Alias the resultant column after aggregation with "count_program".
+            >>> res = df.agg(df.programming.count().alias("count_program"))
+            >>> res
+               count_program
+            0             40
+
+        """
+
+        # Validate argument types
+        arg_type_matrix = [["name", name , True, (str), True]]
+        _Validators._validate_function_arguments(arg_type_matrix)
+
+        self.alias_name = name
+        return self
+
10281
|
+
@staticmethod
|
|
10282
|
+
def _timezone_string(value):
|
|
10283
|
+
"""
|
|
10284
|
+
DESCRIPTION:
|
|
10285
|
+
Function to return timezone string in correct format.
|
|
10286
|
+
|
|
10287
|
+
PARAMETERS:
|
|
10288
|
+
value:
|
|
10289
|
+
Required Argument.
|
|
10290
|
+
Specifies timezone string.
|
|
10291
|
+
Types: str, int , float
|
|
10292
|
+
|
|
10293
|
+
RETURNS:
|
|
10294
|
+
bool
|
|
10295
|
+
"""
|
|
10296
|
+
if isinstance(value, (float, int)):
|
|
10297
|
+
return " AT TIME ZONE {}".format(value)
|
|
10298
|
+
if value.upper() not in ['LOCAL']:
|
|
10299
|
+
return " AT TIME ZONE '{}'".format(value)
|
|
10300
|
+
return " AT {}".format(value)
|
|
10301
|
+
|
|
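_timezone_string produces one of three clause shapes: an unquoted numeric offset, a quoted zone name, or the bare AT LOCAL form. Reproduced standalone for illustration (same branching as the helper above, not teradataml code):

    def timezone_clause(value):
        if isinstance(value, (float, int)):
            return " AT TIME ZONE {}".format(value)     # numeric offset, unquoted
        if value.upper() != 'LOCAL':
            return " AT TIME ZONE '{}'".format(value)   # named zone, quoted
        return " AT {}".format(value)                    # AT LOCAL

    print(timezone_clause('GMT+10'))  # AT TIME ZONE 'GMT+10'
    print(timezone_clause(9.5))       # AT TIME ZONE 9.5
    print(timezone_clause('LOCAL'))   # AT LOCAL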
10302
|
+
def to_timestamp(self, format=None, type_=TIMESTAMP, timezone=None):
|
|
10303
|
+
"""
|
|
10304
|
+
DESCRIPTION:
|
|
10305
|
+
Converts string or integer to a TIMESTAMP data type or TIMESTAMP WITH
|
|
10306
|
+
TIME ZONE data type.
|
|
10307
|
+
Note:
|
|
10308
|
+
* POSIX epoch conversion is implicit in the "to_timestamp" when column
|
|
10309
|
+
is integer type. POSIX epoch is the number of seconds that have elapsed
|
|
10310
|
+
since midnight Coordinated Universal Time (UTC) of January 1, 1970.
|
|
10311
|
+
|
|
10312
|
+
PARAMETERS:
|
|
10313
|
+
format:
|
|
10314
|
+
Specifies the format of string column.
|
|
10315
|
+
Argument is not required when column is integer type, Otherwise Required.
|
|
10316
|
+
For valid 'format' values, see documentation on
|
|
10317
|
+
"to_date" or "help(df.col_name.to_date)".
|
|
10318
|
+
Type: ColumnExpression or str
|
|
10319
|
+
|
|
10320
|
+
type_:
|
|
10321
|
+
Optional Argument.
|
|
10322
|
+
Specifies a TIMESTAMP type or an object of a
|
|
10323
|
+
TIMESTAMP type that the column needs to be cast to.
|
|
10324
|
+
Default value: TIMESTAMP
|
|
10325
|
+
Permitted Values: TIMESTAMP data type
|
|
10326
|
+
Types: teradatasqlalchemy type or object of teradatasqlalchemy type
|
|
10327
|
+
|
|
10328
|
+
timezone:
|
|
10329
|
+
Optional Argument.
|
|
10330
|
+
Specifies the timezone string.
|
|
10331
|
+
For valid timezone strings, user should check Vantage documentation.
|
|
10332
|
+
Type: ColumnExpression or str.
|
|
10333
|
+
|
|
10334
|
+
RETURNS:
|
|
10335
|
+
ColumnExpression
|
|
10336
|
+
|
|
10337
|
+
EXAMPLES:
|
|
10338
|
+
# Load the data to run the example.
|
|
10339
|
+
>>> load_example_data("teradataml", "timestamp_data")
|
|
10340
|
+
|
|
10341
|
+
# Create a DataFrame on 'timestamp_data' table.
|
|
10342
|
+
>>> df = DataFrame("timestamp_data")
|
|
10343
|
+
>>> df
|
|
10344
|
+
id timestamp_col timestamp_col1 format_col timezone_col
|
|
10345
|
+
2 2015-01-08 00:00:12.2+10:00 45678910234 YYYY-MM-DD HH24:MI:SS.FF6 TZH:TZM GMT+10
|
|
10346
|
+
1 2015-01-08 13:00 878986 YYYY-MM-DD HH24:MI America Pacific
|
|
10347
|
+
0 2015-01-08 00:00:12.2 123456 YYYY-MM-DD HH24:MI:SS.FF6 GMT
|
|
10348
|
+
|
|
10349
|
+
>>> df.tdtypes
|
|
10350
|
+
id INTEGER()
|
|
10351
|
+
timestamp_col VARCHAR(length=30, charset='LATIN')
|
|
10352
|
+
timestamp_col1 BIGINT()
|
|
10353
|
+
format_col VARCHAR(length=30, charset='LATIN')
|
|
10354
|
+
timezone_col VARCHAR(length=30, charset='LATIN')
|
|
10355
|
+
|
|
10356
|
+
# Example 1: Convert Epoch seconds to timestamp.
|
|
10357
|
+
>>> df.select(['id','timestamp_col1']).assign(col = df.timestamp_col1.to_timestamp())
|
|
10358
|
+
id timestamp_col1 col
|
|
10359
|
+
2 45678910234 3417-07-05 02:10:34.000000
|
|
10360
|
+
1 878986 1970-01-11 04:09:46.000000
|
|
10361
|
+
0 123456 1970-01-02 10:17:36.000000
|
|
10362
|
+
|
|
10363
|
+
# Example 2: Convert timestamp string to timestamp with timezone in
|
|
10364
|
+
# format mentioned in column "format_col".
|
|
10365
|
+
>>> df.select(['id', 'timestamp_col', 'format_col']).assign(col = df.timestamp_col.to_timestamp(df.format_col, TIMESTAMP(timezone=True)))
|
|
10366
|
+
id timestamp_col format_col col
|
|
10367
|
+
2 2015-01-08 00:00:12.2+10:00 YYYY-MM-DD HH24:MI:SS.FF6 TZH:TZM 2015-01-08 00:00:12.200000+10:00
|
|
10368
|
+
1 2015-01-08 13:00 YYYY-MM-DD HH24:MI 2015-01-08 13:00:00.000000+00:00
|
|
10369
|
+
0 2015-01-08 00:00:12.2 YYYY-MM-DD HH24:MI:SS.FF6 2015-01-08 00:00:12.200000+00:00
|
|
10370
|
+
|
|
10371
|
+
# Example 3: Convert Epoch seconds to timestamp with timezone in 'GMT+2' location.
|
|
10372
|
+
>>> df.select(['id', 'timestamp_col1', 'format_col']).assign(col = df.timestamp_col1.to_timestamp(df.format_col, TIMESTAMP(timezone=True), 'GMT+2'))
|
|
10373
|
+
id timestamp_col1 format_col col
|
|
10374
|
+
2 45678910234 YYYY-MM-DD HH24:MI:SS.FF6 TZH:TZM 3417-07-05 04:10:34.000000+02:00
|
|
10375
|
+
1 878986 YYYY-MM-DD HH24:MI 1970-01-11 06:09:46.000000+02:00
|
|
10376
|
+
0 123456 YYYY-MM-DD HH24:MI:SS.FF6 1970-01-02 12:17:36.000000+02:00
|
|
10377
|
+
|
|
10378
|
+
"""
|
|
10379
|
+
# Validating Arguments
|
|
10380
|
+
arg_type_matrix = []
|
|
10381
|
+
arg_type_matrix.append(["format", format , True, (str, ColumnExpression), True])
|
|
10382
|
+
arg_type_matrix.append(["timezone", timezone, True, (str, ColumnExpression, int, float), True])
|
|
10383
|
+
_Validators._validate_function_arguments(arg_type_matrix)
|
|
10384
|
+
|
|
10385
|
+
if not UtilFuncs._is_valid_td_type(type_):
|
|
10386
|
+
raise TeradataMlException(Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE, 'type_',
|
|
10387
|
+
'a valid teradatasqlalchemy type'),
|
|
10388
|
+
MessageCodes.UNSUPPORTED_DATATYPE)
|
|
10389
|
+
|
|
10390
|
+
_format = format.expression if isinstance(format, _SQLColumnExpression) else format
|
|
10391
|
+
_params = [self.expression, _format]
|
|
10392
|
+
# format is not required when column is of below types.
|
|
10393
|
+
if isinstance(self._type, (BYTEINT, SMALLINT, INTEGER, BIGINT)):
|
|
10394
|
+
_params.pop()
|
|
10395
|
+
# Use to_timestamp_tz when below 3 conditions are true.
|
|
10396
|
+
# Resultant query will be Example:
|
|
10397
|
+
# TO_TIMESTAMP('2015-10-08 00:00:12.2') or TO_TIMESTAMP_TZ('2015-10-08 00:00:12.2+03:00') based on type_
|
|
10398
|
+
_fun = getattr(func, "to_timestamp_tz") if isinstance(type_, TIMESTAMP) and type_.timezone and len(_params) == 2 \
|
|
10399
|
+
else getattr(func, "to_timestamp")
|
|
10400
|
+
if not timezone:
|
|
10401
|
+
return _SQLColumnExpression(_fun(*_params), type=type_)
|
|
10402
|
+
|
|
10403
|
+
# If user uses timezone generate query with time zone.
|
|
10404
|
+
# Resultant query will be Example:
|
|
10405
|
+
# TO_TIMESTAMP('2015-10-08 00:00:12.2') at time zone 'America Alaska',
|
|
10406
|
+
# TO_TIMESTAMP_TZ('2015-10-08 00:00:12.2+03:00') at time zone 'America Alaska'.
|
|
10407
|
+
if isinstance(timezone, _SQLColumnExpression):
|
|
10408
|
+
_timezone_expr = _SQLColumnExpression(literal_column(f' AT TIME ZONE {timezone.compile()}')).compile()
|
|
10409
|
+
else:
|
|
10410
|
+
_timezone_expr = _SQLColumnExpression(literal_column(_SQLColumnExpression._timezone_string(timezone))).compile()
|
|
10411
|
+
return _SQLColumnExpression(_SQLColumnExpression(_fun(*_params)).compile() + _timezone_expr, type=type_)
|
|
10412
|
+
|
|
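The implicit POSIX-epoch conversion documented for integer columns can be sanity-checked off-platform; the values from Example 1 line up with plain UTC arithmetic (standard-library sketch, independent of Vantage):

    from datetime import datetime, timezone

    for secs in (123456, 878986, 45678910234):
        # Epoch seconds -> UTC timestamp, as in Example 1.
        print(secs, datetime.fromtimestamp(secs, tz=timezone.utc))
    # 123456      -> 1970-01-02 10:17:36+00:00
    # 878986      -> 1970-01-11 04:09:46+00:00
    # 45678910234 -> 3417-07-05 02:10:34+00:00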
10413
|
+
def extract(self, value, timezone=None):
|
|
10414
|
+
"""
|
|
10415
|
+
DESCRIPTION:
|
|
10416
|
+
Extracts a single specified field from any DateTime, Interval or timestamp value,
|
|
10417
|
+
converting it to an exact numeric value.
|
|
10418
|
+
|
|
10419
|
+
PARAMETERS:
|
|
10420
|
+
value:
|
|
10421
|
+
Required Argument.
|
|
10422
|
+
Specifies the field which needs to be extracted.
|
|
10423
|
+
Permitted Values: YEAR, MONTH, DAY, HOUR, MINUTE, SECOND, TIMEZONE_HOUR, TIMEZONE_MINUTE
|
|
10424
|
+
Note:
|
|
10425
|
+
* Permitted Values are case insensitive.
|
|
10426
|
+
Type: str
|
|
10427
|
+
|
|
10428
|
+
timezone:
|
|
10429
|
+
Optional Argument.
|
|
10430
|
+
Specifies the timezone string.
|
|
10431
|
+
For valid timezone strings, user should check Vantage documentation.
|
|
10432
|
+
Type: ColumnExpression or str.
|
|
10433
|
+
|
|
10434
|
+
RETURNS:
|
|
10435
|
+
ColumnExpression
|
|
10436
|
+
|
|
10437
|
+
EXAMPLES:
|
|
10438
|
+
# Load the data to run the example.
|
|
10439
|
+
>>> load_example_data("uaf", "Traindata")
|
|
10440
|
+
|
|
10441
|
+
# Create a DataFrame on 'Traindata' table.
|
|
10442
|
+
|
|
10443
|
+
>>> temp_df = DataFrame("Traindata")
|
|
10444
|
+
>>> df = temp_df.select(["seq_no", "schedule_date", "arrivalTime"])
|
|
10445
|
+
>>> df
|
|
10446
|
+
schedule_date arrivalTime
|
|
10447
|
+
seq_no
|
|
10448
|
+
26 16/03/26 2016-03-26 12:33:05
|
|
10449
|
+
24 16/03/26 2016-03-26 12:25:06
|
|
10450
|
+
3 16/03/26 2016-03-26 10:52:05
|
|
10451
|
+
22 16/03/26 2016-03-26 12:18:01
|
|
10452
|
+
20 16/03/26 2016-03-26 12:10:06
|
|
10453
|
+
18 16/03/26 2016-03-26 12:04:01
|
|
10454
|
+
8 16/03/26 2016-03-26 11:15:06
|
|
10455
|
+
17 16/03/26 2016-03-26 11:56:06
|
|
10456
|
+
15 16/03/26 2016-03-26 11:45:00
|
|
10457
|
+
13 16/03/26 2016-03-26 11:33:00
|
|
10458
|
+
11 16/03/26 2016-03-26 11:26:00
|
|
10459
|
+
|
|
10460
|
+
# Example 1: Extract year from column 'schedule_date'.
|
|
10461
|
+
>>> df.assign(col = df.schedule_date.extract('YEAR'))
|
|
10462
|
+
schedule_date arrivalTime col
|
|
10463
|
+
seq_no
|
|
10464
|
+
26 16/03/26 2016-03-26 12:33:05 2016
|
|
10465
|
+
24 16/03/26 2016-03-26 12:25:06 2016
|
|
10466
|
+
3 16/03/26 2016-03-26 10:52:05 2016
|
|
10467
|
+
22 16/03/26 2016-03-26 12:18:01 2016
|
|
10468
|
+
20 16/03/26 2016-03-26 12:10:06 2016
|
|
10469
|
+
18 16/03/26 2016-03-26 12:04:01 2016
|
|
10470
|
+
8 16/03/26 2016-03-26 11:15:06 2016
|
|
10471
|
+
17 16/03/26 2016-03-26 11:56:06 2016
|
|
10472
|
+
15 16/03/26 2016-03-26 11:45:00 2016
|
|
10473
|
+
13 16/03/26 2016-03-26 11:33:00 2016
|
|
10474
|
+
11 16/03/26 2016-03-26 11:26:00 2016
|
|
10475
|
+
|
|
10476
|
+
# Example 2: Extract hour from column 'arrivalTime'.
|
|
10477
|
+
>>> df.assign(col = df.arrivalTime.extract('HOUR'))
|
|
10478
|
+
schedule_date arrivalTime col
|
|
10479
|
+
seq_no
|
|
10480
|
+
26 16/03/26 2016-03-26 12:33:05 12
|
|
10481
|
+
24 16/03/26 2016-03-26 12:25:06 12
|
|
10482
|
+
3 16/03/26 2016-03-26 10:52:05 10
|
|
10483
|
+
22 16/03/26 2016-03-26 12:18:01 12
|
|
10484
|
+
20 16/03/26 2016-03-26 12:10:06 12
|
|
10485
|
+
18 16/03/26 2016-03-26 12:04:01 12
|
|
10486
|
+
8 16/03/26 2016-03-26 11:15:06 11
|
|
10487
|
+
17 16/03/26 2016-03-26 11:56:06 11
|
|
10488
|
+
15 16/03/26 2016-03-26 11:45:00 11
|
|
10489
|
+
|
|
10490
|
+
# Example 3: Extract hour from column 'arrivalTime' with offset '-11:00'.
|
|
10491
|
+
>>> df.assign(col = df.arrivalTime.extract('HOUR', '-11:00'))
|
|
10492
|
+
schedule_date arrivalTime col
|
|
10493
|
+
seq_no
|
|
10494
|
+
26 16/03/26 2016-03-26 12:33:05 1
|
|
10495
|
+
24 16/03/26 2016-03-26 12:25:06 1
|
|
10496
|
+
3 16/03/26 2016-03-26 10:52:05 23
|
|
10497
|
+
22 16/03/26 2016-03-26 12:18:01 1
|
|
10498
|
+
20 16/03/26 2016-03-26 12:10:06 1
|
|
10499
|
+
18 16/03/26 2016-03-26 12:04:01 1
|
|
10500
|
+
8 16/03/26 2016-03-26 11:15:06 0
|
|
10501
|
+
17 16/03/26 2016-03-26 11:56:06 0
|
|
10502
|
+
15 16/03/26 2016-03-26 11:45:00 0
|
|
10503
|
+
|
|
10504
|
+
# Example 4: Extract hour from column 'arrivalTime' with offset 10.
|
|
10505
|
+
>>> df.assign(col = df.arrivalTime.extract('HOUR', 10))
|
|
10506
|
+
schedule_date arrivalTime col
|
|
10507
|
+
seq_no
|
|
10508
|
+
26 16/03/26 2016-03-26 12:33:05 22
|
|
10509
|
+
24 16/03/26 2016-03-26 12:25:06 22
|
|
10510
|
+
3 16/03/26 2016-03-26 10:52:05 20
|
|
10511
|
+
22 16/03/26 2016-03-26 12:18:01 22
|
|
10512
|
+
20 16/03/26 2016-03-26 12:10:06 22
|
|
10513
|
+
18 16/03/26 2016-03-26 12:04:01 22
|
|
10514
|
+
8 16/03/26 2016-03-26 11:15:06 21
|
|
10515
|
+
17 16/03/26 2016-03-26 11:56:06 21
|
|
10516
|
+
15 16/03/26 2016-03-26 11:45:00 21
|
|
10517
|
+
13 16/03/26 2016-03-26 11:33:00 21
|
|
10518
|
+
11 16/03/26 2016-03-26 11:26:00 21
|
|
10519
|
+
"""
|
|
10520
|
+
# Validating Arguments
|
|
10521
|
+
arg_type_matrix = []
|
|
10522
|
+
arg_type_matrix.append(["value", value , True, (str), True])
|
|
10523
|
+
arg_type_matrix.append(["timezone", timezone, True, (str, ColumnExpression, int, float), True])
|
|
10524
|
+
_Validators._validate_function_arguments(arg_type_matrix)
|
|
10525
|
+
|
|
10526
|
+
# If user doesn't provide timezone simply use extract functionality.
|
|
10527
|
+
if not timezone:
|
|
10528
|
+
return _SQLColumnExpression(func.extract(value, self.expression))
|
|
10529
|
+
|
|
10530
|
+
# If user uses timezone generate query with time zone.
|
|
10531
|
+
if isinstance(timezone, _SQLColumnExpression):
|
|
10532
|
+
_timezone_expr = _SQLColumnExpression(literal_column(f' AT TIME ZONE {timezone.compile()}')).compile()
|
|
10533
|
+
else:
|
|
10534
|
+
_timezone_expr = _SQLColumnExpression(literal_column(_SQLColumnExpression._timezone_string(timezone))).compile()
|
|
10535
|
+
return _SQLColumnExpression(func.extract(value, literal_column('({}{})'.format(self.compile(), _timezone_expr))))
|
|
10536
|
+
|
|
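With a timezone argument, extract shifts the instant before pulling the field, which is why Examples 3 and 4 report hours 1 and 22 for the same 12:33:05 arrival under offsets '-11:00' and 10. The shift itself is ordinary offset arithmetic (a sketch, assuming the stored timestamps carry no zone of their own):

    from datetime import datetime, timedelta

    arrival = datetime(2016, 3, 26, 12, 33, 5)
    print((arrival + timedelta(hours=-11)).hour)  # 1, as in Example 3
    print((arrival + timedelta(hours=10)).hour)   # 22, as in Example 4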
10537
|
+
def to_interval(self, value=None, type_=INTERVAL_DAY_TO_SECOND):
|
|
10538
|
+
"""
|
|
10539
|
+
DESCRIPTION:
|
|
10540
|
+
Converts a numeric value or string value into an INTERVAL_DAY_TO_SECOND or INTERVAL_YEAR_TO_MONTH value.
|
|
10541
|
+
|
|
10542
|
+
PARAMETERS:
|
|
10543
|
+
value:
|
|
10544
|
+
Optional, when column type is VARCHAR or CHAR, otherwise required.
|
|
10545
|
+
Specifies the unit of value for numeric value.
|
|
10546
|
+
when type_ is INTERVAL_DAY_TO_SECOND permitted values:
|
|
10547
|
+
* DAY, HOUR, MINUTE, SECOND
|
|
10548
|
+
when type_ is INTERVAL_YEAR_TO_MONTH permitted values:
|
|
10549
|
+
* YEAR, MONTH
|
|
10550
|
+
Note:
|
|
10551
|
+
* Permitted Values are case insensitive.
|
|
10552
|
+
Type: str or ColumnExpression
|
|
10553
|
+
|
|
10554
|
+
type_:
|
|
10555
|
+
Optional Argument.
|
|
10556
|
+
Specifies a teradatasqlalchemy type or an object of a teradatasqlalchemy type
|
|
10557
|
+
that the column needs to be cast to.
|
|
10558
|
+
Default value: TIMESTAMP
|
|
10559
|
+
Permitted Values: INTERVAL_DAY_TO_SECOND or INTERVAL_YEAR_TO_MONTH type.
|
|
10560
|
+
Types: teradatasqlalchemy type or object of teradatasqlalchemy type
|
|
10561
|
+
|
|
10562
|
+
Returns:
|
|
10563
|
+
ColumnExpression
|
|
10564
|
+
|
|
10565
|
+
EXAMPLES:
|
|
10566
|
+
# Load the data to run the example.
|
|
10567
|
+
>>> load_example_data("teradataml", "interval_data")
|
|
10568
|
+
|
|
10569
|
+
# Create a DataFrame on 'interval_data' table.
|
|
10570
|
+
>>> df = DataFrame("interval_data")
|
|
10571
|
+
>>> df
|
|
10572
|
+
id int_col value_col value_col1 str_col1 str_col2
|
|
10573
|
+
2 657 MINUTE MONTH PT73H -P14M
|
|
10574
|
+
3 1234 SECOND MONTH 100 04:23:59 06-10
|
|
10575
|
+
1 240 HOUR YEAR P100DT4H23M59S P100Y4M
|
|
10576
|
+
0 20 DAY YEAR 100 04:23:59 04-10
|
|
10577
|
+
|
|
10578
|
+
>>> df.tdtypes
|
|
10579
|
+
id INTEGER()
|
|
10580
|
+
int_col BIGINT()
|
|
10581
|
+
value_col VARCHAR(length=30, charset='LATIN')
|
|
10582
|
+
value_col1 VARCHAR(length=30, charset='LATIN')
|
|
10583
|
+
str_col1 VARCHAR(length=30, charset='LATIN')
|
|
10584
|
+
str_col2 VARCHAR(length=30, charset='LATIN')
|
|
10585
|
+
|
|
10586
|
+
|
|
10587
|
+
# Example 1: Convert "int_col" column to INTERVAL_DAY_TO_SECOND with value
|
|
10588
|
+
# provided in "value_col".
|
|
10589
|
+
>>> df.assign(col = df.int_col.to_interval(df.value_col))
|
|
10590
|
+
id int_col value_col value_col1 str_col1 str_col2 col
|
|
10591
|
+
2 657 MINUTE MONTH PT73H -P14M 0 10:57:00.000000
|
|
10592
|
+
3 1234 SECOND MONTH 100 04:23:59 06-10 0 00:20:34.000000
|
|
10593
|
+
1 240 HOUR YEAR P100DT4H23M59S P100Y4M 10 00:00:00.000000
|
|
10594
|
+
0 20 DAY YEAR 100 04:23:59 04-10 20 00:00:00.000000
|
|
10595
|
+
|
|
10596
|
+
# Example 2: Convert int_col to INTERVAL_YEAR_TO_MONTH when value = 'MONTH'.
|
|
10597
|
+
>>> df.assign(col = df.int_col.to_interval('MONTH', INTERVAL_YEAR_TO_MONTH))
|
|
10598
|
+
id int_col value_col value_col1 str_col1 str_col2 col
|
|
10599
|
+
2 657 MINUTE MONTH PT73H -P14M 54-09
|
|
10600
|
+
3 1234 SECOND MONTH 100 04:23:59 06-10 102-10
|
|
10601
|
+
1 240 HOUR YEAR P100DT4H23M59S P100Y4M 20-00
|
|
10602
|
+
0 20 DAY YEAR 100 04:23:59 04-10 1-08
|
|
10603
|
+
|
|
10604
|
+
# Example 3: Convert string column "str_col1" to INTERVAL_DAY_TO_SECOND.
|
|
10605
|
+
>>> df.assign(col = df.str_col1.to_interval())
|
|
10606
|
+
id int_col value_col value_col1 str_col1 str_col2 col
|
|
10607
|
+
2 657 MINUTE MONTH PT73H -P14M 3 01:00:00.000000
|
|
10608
|
+
3 1234 SECOND MONTH 100 04:23:59 06-10 100 04:23:59.000000
|
|
10609
|
+
1 240 HOUR YEAR P100DT4H23M59S P100Y4M 100 04:23:59.000000
|
|
10610
|
+
0 20 DAY YEAR 100 04:23:59 04-10 100 04:23:59.000000
|
|
10611
|
+
|
|
10612
|
+
# Example 4: Convert string column "str_col2" to INTERVAL_DAY_TO_MONTH.
|
|
10613
|
+
>>> df.assign(col = df.str_col2.to_interval(type_=INTERVAL_YEAR_TO_MONTH))
|
|
10614
|
+
id int_col value_col value_col1 str_col1 str_col2 col
|
|
10615
|
+
2 657 MINUTE MONTH PT73H -P14M -1-02
|
|
10616
|
+
3 1234 SECOND MONTH 100 04:23:59 06-10 6-10
|
|
10617
|
+
1 240 HOUR YEAR P100DT4H23M59S P100Y4M 100-04
|
|
10618
|
+
0 20 DAY YEAR 100 04:23:59 04-10 4-10
|
|
10619
|
+
|
|
10620
|
+
"""
|
|
10621
|
+
# Validating Arguments
|
|
10622
|
+
arg_type_matrix = []
|
|
10623
|
+
arg_type_matrix.append(["value", value , True, (str, ColumnExpression), True])
|
|
10624
|
+
_Validators._validate_function_arguments(arg_type_matrix)
|
|
10625
|
+
|
|
10626
|
+
if not UtilFuncs._is_valid_td_type(type_):
|
|
10627
|
+
raise TeradataMlException(Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE, 'type_',
|
|
10628
|
+
'a valid teradatasqlalchemy type'),
|
|
10629
|
+
MessageCodes.UNSUPPORTED_DATATYPE)
|
|
10630
|
+
|
|
10631
|
+
# When column type is string, use either to_dsinterval or to_yminterval function based on "type_".
|
|
10632
|
+
if isinstance(self._type, (VARCHAR, CHAR)):
|
|
10633
|
+
_fun = (getattr(func, "to_dsinterval")) if isinstance(type_, INTERVAL_DAY_TO_SECOND)\
|
|
10634
|
+
or (isinstance(type_, type) and issubclass(type_, INTERVAL_DAY_TO_SECOND)) \
|
|
10635
|
+
else (getattr(func, "to_yminterval"))
|
|
10636
|
+
return _SQLColumnExpression(_fun(self.expression), type=type_)
|
|
10637
|
+
|
|
10638
|
+
# When column type is integer or float type, use either numtodsinterval or numtoyminterval
|
|
10639
|
+
# function based on "type_".
|
|
10640
|
+
_fun = (getattr(func, "numtodsinterval")) if isinstance(type_, INTERVAL_DAY_TO_SECOND) \
|
|
10641
|
+
or (isinstance(type_, type) and issubclass(type_, INTERVAL_DAY_TO_SECOND))\
|
|
10642
|
+
else (getattr(func, "numtoyminterval"))
|
|
10643
|
+
value = value.expression if isinstance(value, _SQLColumnExpression) else value
|
|
10644
|
+
return _SQLColumnExpression(_fun(self.expression, value), type=type_)
|
|
10645
|
+
|
|
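For numeric inputs, to_interval is a unit conversion: Example 1 turns 657 with unit MINUTE into '0 10:57:00.000000' and 240 with unit HOUR into '10 00:00:00.000000'. The day/time split can be confirmed with timedelta (standard-library check, not teradataml code):

    from datetime import timedelta

    print(timedelta(minutes=657))  # 10:57:00          -> interval '0 10:57:00.000000'
    print(timedelta(hours=240))    # 10 days, 0:00:00  -> interval '10 00:00:00.000000'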
10646
|
+
def parse_url(self, url_part):
|
|
10647
|
+
"""
|
|
10648
|
+
DESCRIPTION:
|
|
10649
|
+
Extracts a specific part from the URL.
|
|
10650
|
+
|
|
10651
|
+
PARAMETERS:
|
|
10652
|
+
url_part:
|
|
10653
|
+
Required Argument.
|
|
10654
|
+
Specifies which part to be extracted.
|
|
10655
|
+
Permitted Values: HOST, PATH, QUERY, REF, PROTOCOL, FILE, AUTHORITY, USERINFO
|
|
10656
|
+
Type: str or ColumnExpression
|
|
10657
|
+
|
|
10658
|
+
Returns:
|
|
10659
|
+
ColumnExpression
|
|
10660
|
+
|
|
10661
|
+
EXAMPLES:
|
|
10662
|
+
# Load the data to run the example.
|
|
10663
|
+
>>> load_example_data("teradataml", "url_data")
|
|
10664
|
+
|
|
10665
|
+
# Create a DataFrame on 'url_data' table.
|
|
10666
|
+
>>> df = DataFrame("url_data")
|
|
10667
|
+
>>> df
|
|
10668
|
+
urls part
|
|
10669
|
+
id
|
|
10670
|
+
3 https://www.facebook.com HOST
|
|
10671
|
+
6 smtp://user:password@smtp.example.com:21/file.txt USERINFO
|
|
10672
|
+
4 https://teracloud-pod-services-pod-account-service.dummyvalu QUERY
|
|
10673
|
+
2 https://example.net/path4/path5/path6?query4=value4#fragment REF
|
|
10674
|
+
0 http://example.com:8080/path FILE
|
|
10675
|
+
1 ftp://example.net:21/path PATH
|
|
10676
|
+
5 http://pg.example.ml/path150#fragment90 AUTHORITY
|
|
10677
|
+
7 https://www.google.com PROTOCOL
|
|
10678
|
+
|
|
10679
|
+
# Example 1: Extract components from column 'urls' using column 'part'
|
|
10680
|
+
>>> df.assign(col = df.urls.parse_url(df.part))
|
|
10681
|
+
urls part col
|
|
10682
|
+
id
|
|
10683
|
+
3 https://www.facebook.com HOST www.facebook.com
|
|
10684
|
+
6 smtp://user:password@smtp.example.com:21/file.txt USERINFO user:password
|
|
10685
|
+
4 https://teracloud-pod-services-pod-account-service.dummyvalu QUERY None
|
|
10686
|
+
2 https://example.net/path4/path5/path6?query4=value4#fragment REF fragment
|
|
10687
|
+
0 http://example.com:8080/path FILE /path
|
|
10688
|
+
1 ftp://example.net:21/path PATH /path
|
|
10689
|
+
5 http://pg.example.ml/path150#fragment90 AUTHORITY pg.example.ml
|
|
10690
|
+
7 https://www.google.com PROTOCOL https
|
|
10691
|
+
>>>
|
|
10692
|
+
"""
|
|
10693
|
+
|
|
10694
|
+
# Validating Arguments
|
|
10695
|
+
arg_type_matrix = []
|
|
10696
|
+
arg_type_matrix.append(["url_part", url_part, False, (str, ColumnExpression), True])
|
|
10697
|
+
_Validators._validate_function_arguments(arg_type_matrix)
|
|
10698
|
+
|
|
10699
|
+
# Regex pattern used to extract 'url_part' is '^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'.
|
|
10700
|
+
# teradataml does not support regex grouping hence in some cases first used 'regex_replace' and
|
|
10701
|
+
# then 'regex_substr' or vice-versa.
|
|
10702
|
+
_part_to_extract_dict = {'HOST': _SQLColumnExpression(
|
|
10703
|
+
func.regexp_replace(func.regexp_substr(self.expression, '//([^/?#]*)'), '(//[^/?#]+@)|(//)|(:\d+)', ''),
|
|
10704
|
+
type=VARCHAR()),
|
|
10705
|
+
'PATH': _SQLColumnExpression(func.regexp_substr(
|
|
10706
|
+
func.regexp_replace(self.expression, '^(([^:/?#]+):)?(//([^/?#]*))?', ''),
|
|
10707
|
+
'([^?#]*)'), type=VARCHAR()),
|
|
10708
|
+
'QUERY': _SQLColumnExpression(func.ltrim(func.regexp_substr(
|
|
10709
|
+
func.regexp_replace(self.expression, '^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)', ''),
|
|
10710
|
+
'\?([^#]*)'), '?'), type=VARCHAR()),
|
|
10711
|
+
'REF': _SQLColumnExpression(func.ltrim(func.regexp_substr(
|
|
10712
|
+
func.regexp_replace(self.expression,
|
|
10713
|
+
'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?', ''),
|
|
10714
|
+
'(#(.*))'), '#'), type=VARCHAR()),
|
|
10715
|
+
'PROTOCOL': _SQLColumnExpression(
|
|
10716
|
+
func.rtrim(func.regexp_substr(self.expression, '^(([^:/?#]+):)'), ':'),
|
|
10717
|
+
type=VARCHAR()),
|
|
10718
|
+
'FILE': _SQLColumnExpression(func.regexp_substr(
|
|
10719
|
+
func.regexp_replace(self.expression, '^(([^:/?#]+):)?(//([^/?#]*))?', ''),
|
|
10720
|
+
'([^?#]*)(\?([^#]*))?'), type=VARCHAR()),
|
|
10721
|
+
'AUTHORITY': _SQLColumnExpression(
|
|
10722
|
+
func.ltrim(func.regexp_substr(self.expression, '//([^/?#]*)'), '//'),
|
|
10723
|
+
type=VARCHAR()),
|
|
10724
|
+
'USERINFO': _SQLColumnExpression(func.rtrim(func.ltrim(
|
|
10725
|
+
func.regexp_substr(func.regexp_substr(self.expression, '//([^/?#]*)'),
|
|
10726
|
+
'//[^/?#]+@'), '/'), '@'), type=VARCHAR())
|
|
10727
|
+
}
|
|
10728
|
+
|
|
10729
|
+
if isinstance(url_part, str):
|
|
10730
|
+
return _part_to_extract_dict[url_part]
|
|
10731
|
+
|
|
10732
|
+
whens = [(url_part == 'HOST', _part_to_extract_dict['HOST']),
|
|
10733
|
+
(url_part == 'PATH', _part_to_extract_dict['PATH'] ),
|
|
10734
|
+
(url_part == 'QUERY', _part_to_extract_dict['QUERY']),
|
|
10735
|
+
(url_part == 'REF', _part_to_extract_dict['REF']),
|
|
10736
|
+
(url_part == 'PROTOCOL', _part_to_extract_dict['PROTOCOL']),
|
|
10737
|
+
(url_part == 'FILE', _part_to_extract_dict['FILE']),
|
|
10738
|
+
(url_part == 'AUTHORITY', _part_to_extract_dict['AUTHORITY']),
|
|
10739
|
+
(url_part == 'USERINFO', _part_to_extract_dict['USERINFO'])]
|
|
10740
|
+
|
|
10741
|
+
from teradataml.dataframe.sql_functions import case
|
|
10742
|
+
return case(whens)
|
|
10743
|
+
|
|
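The comment in parse_url quotes an RFC 3986-style pattern but notes that the wrapped SQL regex functions cannot address capture groups, hence the per-part regexp_substr/regexp_replace compositions. For comparison, the grouping approach in plain Python, using the very pattern quoted in the source comment:

    import re

    # Pattern quoted in the parse_url comment (RFC 3986 appendix-B style).
    pattern = re.compile(r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?')

    m = pattern.match('https://example.net/path4/path5/path6?query4=value4#fragment')
    print(m.group(2))  # PROTOCOL:  https
    print(m.group(4))  # authority: example.net
    print(m.group(5))  # PATH:      /path4/path5/path6
    print(m.group(7))  # QUERY:     query4=value4
    print(m.group(9))  # REF:       fragment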
10744
|
+
def log(self, base):
|
|
10745
|
+
"""
|
|
10746
|
+
DESCRIPTION:
|
|
10747
|
+
Returns the logarithm value of the column with respect to 'base'.
|
|
10748
|
+
|
|
10749
|
+
PARAMETERS:
|
|
10750
|
+
base:
|
|
10751
|
+
Required Argument.
|
|
10752
|
+
Specifies base of logarithm.
|
|
10753
|
+
Type: int or float or ColumnExpression
|
|
10754
|
+
|
|
10755
|
+
Returns:
|
|
10756
|
+
ColumnExpression
|
|
10757
|
+
|
|
10758
|
+
EXAMPLES:
|
|
10759
|
+
# Load the data to run the example.
|
|
10760
|
+
>>> load_example_data("teradataml", "titanic")
|
|
10761
|
+
|
|
10762
|
+
# Create a DataFrame on 'titanic' table.
|
|
10763
|
+
>>> titanic = DataFrame.from_table('titanic')
|
|
10764
|
+
>>> df = titanic.select(["passenger", "age", "fare"])
|
|
10765
|
+
>>> print(df)
|
|
10766
|
+
age fare
|
|
10767
|
+
passenger
|
|
10768
|
+
326 36.0 135.6333
|
|
10769
|
+
183 9.0 31.3875
|
|
10770
|
+
652 18.0 23.0000
|
|
10771
|
+
265 NaN 7.7500
|
|
10772
|
+
530 23.0 11.5000
|
|
10773
|
+
122 NaN 8.0500
|
|
10774
|
+
591 35.0 7.1250
|
|
10775
|
+
387 1.0 46.9000
|
|
10776
|
+
734 23.0 13.0000
|
|
10777
|
+
795 25.0 7.8958
|
|
10778
|
+
>>>
|
|
10779
|
+
|
|
10780
|
+
# Example 1: Compute log values for column 'fare' using base as column 'age'.
|
|
10781
|
+
>>> log_df = df.assign(fare_log=df.fare.log(df.age))
|
|
10782
|
+
>>> print(log_df)
|
|
10783
|
+
age fare fare_log
|
|
10784
|
+
passenger
|
|
10785
|
+
326 36.0 135.6333 1.370149
|
|
10786
|
+
183 9.0 31.3875 1.568529
|
|
10787
|
+
652 18.0 23.0000 1.084807
|
|
10788
|
+
40 14.0 11.2417 0.916854
|
|
10789
|
+
774 NaN 7.2250 NaN
|
|
10790
|
+
366 30.0 7.2500 0.582442
|
|
10791
|
+
509 28.0 22.5250 0.934704
|
|
10792
|
+
795 25.0 7.8958 0.641942
|
|
10793
|
+
61 22.0 7.2292 0.639955
|
|
10794
|
+
469 NaN 7.7250 NaN
|
|
10795
|
+
>>>
|
|
10796
|
+
"""
|
|
10797
|
+
# Validating Arguments
|
|
10798
|
+
arg_type_matrix = []
|
|
10799
|
+
arg_type_matrix.append(["base", base, False, (int, float, ColumnExpression), True])
|
|
10800
|
+
_Validators._validate_function_arguments(arg_type_matrix)
|
|
10801
|
+
|
|
10802
|
+
# Handling cases when 'base' or 'self' column values are zero or when denominator is zero
|
|
10803
|
+
from teradataml.dataframe.sql_functions import case
|
|
10804
|
+
|
|
10805
|
+
if not isinstance(base, _SQLColumnExpression):
|
|
10806
|
+
whens = case([((self != 0) & (_SQLColumnExpression(literal(base)).ln() != 0),
|
|
10807
|
+
(self.ln() / _SQLColumnExpression(literal(base)).ln()).cast(FLOAT))])
|
|
10808
|
+
else:
|
|
10809
|
+
whens = case([((self != 0) & (base != 0) & (base.ln() != 0),
|
|
10810
|
+
(self.ln() / base.ln()).cast(FLOAT))])
|
|
10811
|
+
|
|
10812
|
+
return whens
|
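log applies the change-of-base identity ln(x)/ln(base) inside a CASE guard, so a zero column value, a zero base, or ln(base) == 0 (base 1) yields NULL instead of a division error. The first row of Example 1 checks out locally:

    import math

    # Change-of-base identity used by log(): ln(x) / ln(base).
    print(math.log(135.6333) / math.log(36.0))  # ~1.370149, the fare_log for passenger 326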