teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of teradataml might be problematic.
Files changed (200)
  1. teradataml/LICENSE.pdf +0 -0
  2. teradataml/README.md +112 -0
  3. teradataml/__init__.py +6 -3
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/__init__.py +3 -2
  6. teradataml/analytics/analytic_function_executor.py +224 -16
  7. teradataml/analytics/analytic_query_generator.py +92 -0
  8. teradataml/analytics/byom/__init__.py +3 -2
  9. teradataml/analytics/json_parser/metadata.py +1 -0
  10. teradataml/analytics/json_parser/utils.py +6 -4
  11. teradataml/analytics/meta_class.py +40 -1
  12. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  13. teradataml/analytics/sqle/__init__.py +10 -2
  14. teradataml/analytics/table_operator/__init__.py +3 -2
  15. teradataml/analytics/uaf/__init__.py +21 -2
  16. teradataml/analytics/utils.py +62 -1
  17. teradataml/analytics/valib.py +1 -1
  18. teradataml/automl/__init__.py +1502 -323
  19. teradataml/automl/custom_json_utils.py +139 -61
  20. teradataml/automl/data_preparation.py +245 -306
  21. teradataml/automl/data_transformation.py +32 -12
  22. teradataml/automl/feature_engineering.py +313 -82
  23. teradataml/automl/model_evaluation.py +44 -35
  24. teradataml/automl/model_training.py +109 -146
  25. teradataml/catalog/byom.py +8 -8
  26. teradataml/clients/pkce_client.py +1 -1
  27. teradataml/common/constants.py +37 -0
  28. teradataml/common/deprecations.py +13 -7
  29. teradataml/common/garbagecollector.py +151 -120
  30. teradataml/common/messagecodes.py +4 -1
  31. teradataml/common/messages.py +2 -1
  32. teradataml/common/sqlbundle.py +1 -1
  33. teradataml/common/utils.py +97 -11
  34. teradataml/common/wrapper_utils.py +1 -1
  35. teradataml/context/context.py +72 -2
  36. teradataml/data/complaints_test_tokenized.csv +353 -0
  37. teradataml/data/complaints_tokens_model.csv +348 -0
  38. teradataml/data/covid_confirm_sd.csv +83 -0
  39. teradataml/data/dataframe_example.json +10 -0
  40. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  41. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  42. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  43. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  44. teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
  45. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  46. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  47. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  48. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  49. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  50. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  51. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  52. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  53. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  54. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  55. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  56. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  57. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  58. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  59. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  60. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  61. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  62. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  63. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  64. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  65. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  66. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  67. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  68. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  69. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  70. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  71. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  72. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  73. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  74. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  75. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  76. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  77. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  78. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  79. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  80. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  81. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  82. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  83. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  84. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  85. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  86. teradataml/data/dwt2d_dataTable.csv +65 -0
  87. teradataml/data/dwt_dataTable.csv +8 -0
  88. teradataml/data/dwt_filterTable.csv +3 -0
  89. teradataml/data/finance_data4.csv +13 -0
  90. teradataml/data/grocery_transaction.csv +19 -0
  91. teradataml/data/idwt2d_dataTable.csv +5 -0
  92. teradataml/data/idwt_dataTable.csv +8 -0
  93. teradataml/data/idwt_filterTable.csv +3 -0
  94. teradataml/data/interval_data.csv +5 -0
  95. teradataml/data/jsons/paired_functions.json +14 -0
  96. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  97. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  98. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  99. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  100. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  101. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  102. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  103. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  104. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  105. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  106. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  107. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  108. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  109. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  110. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  111. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  112. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  113. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  114. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  115. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  116. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  117. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  118. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  119. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  120. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  121. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  122. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  123. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  124. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  125. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  126. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  127. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  128. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  129. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  130. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  131. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  132. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  133. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  134. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  135. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  136. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  137. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  138. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  139. teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
  140. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  141. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  142. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  143. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  144. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  145. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
  146. teradataml/data/load_example_data.py +8 -2
  147. teradataml/data/naivebayestextclassifier_example.json +1 -1
  148. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  149. teradataml/data/peppers.png +0 -0
  150. teradataml/data/real_values.csv +14 -0
  151. teradataml/data/sax_example.json +8 -0
  152. teradataml/data/scripts/deploy_script.py +1 -1
  153. teradataml/data/scripts/sklearn/sklearn_fit.py +17 -10
  154. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +2 -2
  155. teradataml/data/scripts/sklearn/sklearn_function.template +30 -7
  156. teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
  157. teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
  158. teradataml/data/scripts/sklearn/sklearn_transform.py +55 -4
  159. teradataml/data/star_pivot.csv +8 -0
  160. teradataml/data/templates/open_source_ml.json +2 -1
  161. teradataml/data/teradataml_example.json +20 -1
  162. teradataml/data/timestamp_data.csv +4 -0
  163. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  164. teradataml/data/uaf_example.json +55 -1
  165. teradataml/data/unpivot_example.json +15 -0
  166. teradataml/data/url_data.csv +9 -0
  167. teradataml/data/windowdfft.csv +16 -0
  168. teradataml/dataframe/copy_to.py +1 -1
  169. teradataml/dataframe/data_transfer.py +5 -3
  170. teradataml/dataframe/dataframe.py +474 -41
  171. teradataml/dataframe/fastload.py +3 -3
  172. teradataml/dataframe/functions.py +339 -0
  173. teradataml/dataframe/row.py +160 -0
  174. teradataml/dataframe/setop.py +2 -2
  175. teradataml/dataframe/sql.py +658 -20
  176. teradataml/dataframe/window.py +1 -1
  177. teradataml/dbutils/dbutils.py +322 -16
  178. teradataml/geospatial/geodataframe.py +1 -1
  179. teradataml/geospatial/geodataframecolumn.py +1 -1
  180. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  181. teradataml/lib/aed_0_1.dll +0 -0
  182. teradataml/opensource/sklearn/_sklearn_wrapper.py +154 -69
  183. teradataml/options/__init__.py +3 -1
  184. teradataml/options/configure.py +14 -2
  185. teradataml/options/display.py +2 -2
  186. teradataml/plot/axis.py +4 -4
  187. teradataml/scriptmgmt/UserEnv.py +10 -6
  188. teradataml/scriptmgmt/lls_utils.py +3 -2
  189. teradataml/table_operators/Script.py +2 -2
  190. teradataml/table_operators/TableOperator.py +106 -20
  191. teradataml/table_operators/table_operator_util.py +88 -41
  192. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  193. teradataml/telemetry_utils/__init__.py +0 -0
  194. teradataml/telemetry_utils/queryband.py +52 -0
  195. teradataml/utils/validators.py +1 -1
  196. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +115 -2
  197. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +200 -140
  198. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
  199. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
  200. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
teradataml/dataframe/dataframe.py
@@ -42,6 +42,7 @@ from teradataml.dataframe.indexer import _LocationIndexer
 from teradataml.common.aed_utils import AedUtils
 from teradataml.options.display import display
 from teradataml.dataframe.copy_to import copy_to_sql
+from teradataml.dataframe.row import _Row
 from teradataml.dataframe.setop import concat
 from teradataml.plot.plot import _Plot
 from teradataml.scriptmgmt.UserEnv import UserEnv
@@ -53,7 +54,9 @@ from teradatasql import OperationalError
 from teradataml.dataframe.window import Window
 from teradataml.dataframe.data_transfer import _DataTransferUtils
 from teradataml.common.bulk_exposed_utils import _validate_unimplemented_function
-from teradatasqlalchemy.telemetry.queryband import collect_queryband
+from teradataml.telemetry_utils.queryband import collect_queryband
+from teradataml.options.configure import configure
+from teradataml.utils.internal_buffer import _InternalBuffer

 # TODO use logger when available on master branch
 # logger = teradatapylog.getLogger()
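
The telemetry decorator now ships inside teradataml itself (teradataml/telemetry_utils/queryband.py, new in this release) instead of being imported from teradatasqlalchemy. As a rough, hedged sketch of what a queryband-collecting decorator of this kind does (the registry and internals below are illustrative assumptions, not the package's actual code):

    from functools import wraps

    # Illustrative in-process sink standing in for the real telemetry plumbing.
    _QUERYBAND_LOG = []

    def collect_queryband(queryband):
        """Tag the wrapped API so the SQL it emits can be attributed to it."""
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                # Real code would fold this label into the session's QUERY_BAND.
                _QUERYBAND_LOG.append(queryband)
                return func(*args, **kwargs)
            return wrapper
        return decorator

    @collect_queryband(queryband="DF_fillna")
    def fillna():
        return "ok"

    fillna()
    print(_QUERYBAND_LOG)  # ['DF_fillna']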
@@ -151,6 +154,11 @@ class DataFrame():
         # This attribute added to add setter for columns property,
         # it is required when setting columns from groupby
         self._columns = None
+        # This attribute stores the internal AED query and avoids multiple
+        # calls to the AED utility function aed_show_query().
+        self._aed_query = None
+        # This attribute stores the type of query stored in self._aed_query.
+        self._is_full_query = None

         # Property to determine if table is an ART table or not.
         self._is_art = None
@@ -418,6 +426,130 @@ class DataFrame():

         return df

+    def create_temp_view(self, name):
+        """
+        DESCRIPTION:
+            Creates a temporary view on the DataFrame for the session.
+
+        PARAMETERS:
+            name:
+                Required Argument.
+                Specifies the name of the temporary view.
+                Type: str
+
+        RETURNS:
+            None
+
+        RAISES:
+            OperationalError (when the view already exists).
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("dataframe", "admissions_train")
+            >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            38     yes  2.65  Advanced    Beginner         1
+            7      yes  2.33    Novice      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            17      no  3.83  Advanced    Advanced         1
+            34     yes  3.85  Advanced    Beginner         0
+            13      no  4.00  Advanced      Novice         1
+            32     yes  3.46  Advanced    Beginner         0
+            11      no  3.13  Advanced    Advanced         1
+            15     yes  4.00  Advanced    Advanced         1
+            36      no  3.00  Advanced      Novice         0
+
+            # Example 1: Create view 'new_admissions'.
+            >>> df.create_temp_view("new_admissions")
+            >>> new_df = DataFrame("new_admissions")
+            >>> new_df
+               masters   gpa     stats programming  admitted
+            id
+            38     yes  2.65  Advanced    Beginner         1
+            7      yes  2.33    Novice      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            17      no  3.83  Advanced    Advanced         1
+            34     yes  3.85  Advanced    Beginner         0
+            13      no  4.00  Advanced      Novice         1
+            32     yes  3.46  Advanced    Beginner         0
+            11      no  3.13  Advanced    Advanced         1
+            15     yes  4.00  Advanced    Advanced         1
+            36      no  3.00  Advanced      Novice         0
+        """
+        # Validating Arguments
+        arg_type_matrix = []
+        arg_type_matrix.append(["name", name, False, (str), True])
+        _Validators._validate_function_arguments(arg_type_matrix)
+
+        GarbageCollector._add_to_garbagecollector(name, TeradataConstants.TERADATA_VIEW)
+        UtilFuncs._create_view(name, self.show_query())
+
+    def materialize(self):
+        """
+        DESCRIPTION:
+            Method to materialize a teradataml DataFrame into a database object.
+            Notes:
+                * DataFrames are materialized as either a view, a table, or a
+                  volatile table; the choice is decided and taken care of by
+                  teradataml.
+                * If the user wants to materialize the object into a specific
+                  database object such as a table or volatile table, use the
+                  'to_sql()', 'copy_to_sql()', or 'fastload()' functions.
+                * The materialized object is garbage collected at the end of
+                  the session.
+
+        PARAMETERS:
+            None
+
+        RETURNS:
+            DataFrame
+
+        EXAMPLES:
+            >>> load_example_data("dataframe", "admissions_train")
+            >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            13      no  4.00  Advanced      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            5       no  3.44    Novice      Novice         0
+            19     yes  1.98  Advanced    Advanced         0
+            15     yes  4.00  Advanced    Advanced         1
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            36      no  3.00  Advanced      Novice         0
+            38     yes  2.65  Advanced    Beginner         1
+
+            # Example 1: Perform operations on a teradataml DataFrame
+            #            and materialize it in a database object.
+            >>> df2 = df.get([["id", "masters", "gpa"]])
+
+            # Initially _table_name will be None.
+            >>> df2._table_name
+
+            >>> df2.materialize()
+               masters   gpa
+            id
+            15     yes  4.00
+            7      yes  2.33
+            22     yes  3.46
+            17      no  3.83
+            13      no  4.00
+            38     yes  2.65
+            26     yes  3.57
+            5       no  3.44
+            34     yes  3.85
+            40     yes  3.95
+
+            # After materialize(), the view name will be assigned.
+            >>> df2._table_name
+            '"ALICE"."ml__select__172077355985236"'
+            >>>
+        """
+        self.__execute_node_and_set_table_name(self._nodeid, self._metaexpr)
+        return self
+

     @collect_queryband(queryband="DF_fillna")
     def fillna(self, value=None, columns=None, literal_value=False):
         """
@@ -5421,7 +5553,9 @@ class DataFrame():
            result = self._check_numeric_overflow(agg_df)
        """
        try:
-           repr(result_df)
+           # Printing the DF actually runs the underlying SELECT query and
+           # brings up a numeric overflow, if any. Materializing alone won't.
+           print(result_df)
            return False
        except TeradataMlException as tme:
            if "Numeric overflow occurred during computation" in str(tme):
@@ -5557,18 +5691,73 @@
        EXAMPLES:
            self.__get_data_columns()
        """
-        self.__execute_node_and_set_table_name(self._nodeid, self._metaexpr)
-
-        query = repr(self._metaexpr) + ' FROM ' + self._table_name
+        if not self._table_name:
+            if not self._aed_query:
+                self.__generate_aed_query()
+            # TODO: Check the length of the query and, if it fails, create a view in the catch block.
+            # Address in this JIRA: https://teradata-pe.atlassian.net/browse/ELE-6922
+            query = repr(self._metaexpr) + ' FROM ( ' + self._aed_query + ' ) as temp_table'
+        else:
+            query = repr(self._metaexpr) + ' FROM ' + self._table_name

        if self._orderby is not None:
            query += ' ORDER BY ' + self._orderby

+        query += ';'
        # Execute the query and get the results in a list.
        self.__data, self.__data_columns = UtilFuncs._execute_query(query=query, fetchWarnings=True)

        return self.__data, self.__data_columns

+    def __generate_aed_query(self, full_query=False):
+        """
+        DESCRIPTION:
+            Internal function to return the underlying SQL for the teradataml
+            DataFrame. It is the same SQL that is used to view the data for
+            a teradataml DataFrame.
+
+        PARAMETERS:
+            full_query:
+                Optional Argument.
+                Specifies whether the complete query for the DataFrame should be
+                returned. When this parameter is set to True, the query for the
+                DataFrame is returned with respect to the base DataFrame's table
+                (from_table() or from_query()) or the output tables of analytic
+                functions (if there are any in the workflow). This query may or
+                may not be directly usable to retrieve data for the DataFrame
+                upon which the function is called.
+                When this parameter is not used, the string returned is the query
+                already used, or that will be used, to retrieve data for the
+                teradataml DataFrame.
+                Default Value: False
+                Types: bool
+
+        RETURNS:
+            String representing the underlying SQL query for the teradataml DataFrame.
+
+        RAISES:
+            None.
+
+        EXAMPLES:
+            self.__generate_aed_query()
+        """
+        # Run the AED call only when _aed_query is None or the type of the
+        # currently stored query (full/short) does not match the requested
+        # query type.
+        if (not self._aed_query) or (not self._is_full_query == full_query):
+            node_id = self._nodeid
+
+            if isinstance(self, (DataFrameGroupBy, DataFrameGroupByTime)):
+                # If the dataframe is of type groupby or groupbytime,
+                # get its parent dataframe's nodeid and return queries
+                # for the same.
+                node_id = self._aed_utils._aed_get_parent_nodeids(self._nodeid)[0]
+
+            queries = self._aed_utils._aed_show_query(node_id, query_with_reference_to_top=full_query)
+            # Store the query and its type in class attributes to avoid future runs.
+            self._aed_query = queries[0][0]
+            self._is_full_query = full_query
+
+        return self._aed_query
+

    @collect_queryband(queryband="DF_select")
    def select(self, select_expression):
        """
@@ -7108,6 +7297,97 @@
        if function_name is None or function_name in VANTAGE_FUNCTION_ARGTYPE_DEPENDENT_MAPPER:
            self.__execute_node_and_set_table_name(self._nodeid)
        return True
+
+    def _assign_udf(self, udf_expr):
+        """
+        DESCRIPTION:
+            Internal function for DataFrame.assign() to execute the udf using
+            the Script Table Operator and create new columns for the teradataml
+            DataFrame.
+
+        PARAMETER:
+            udf_expr:
+                Required Argument.
+                Specifies a dictionary of column name to UDF expressions.
+                Types: dict
+
+        RETURNS:
+            teradataml DataFrame
+
+        RAISES:
+            None.
+
+        EXAMPLES:
+            self._assign_udf(udf_expr)
+        """
+
+        df = self
+        env_name = None
+        # Create a dictionary of env_name to the list of output columns to be run on that env.
+        env_mapper = OrderedDict()
+
+        exec_mode = 'REMOTE' if UtilFuncs._is_lake() else 'IN-DB'
+        if exec_mode == 'REMOTE':
+            if _InternalBuffer.get("auth_token") is None:
+                raise TeradataMlException(Messages.get_message(
+                    MessageCodes.FUNC_EXECUTION_FAILED, "'udf'",
+                    'Authentication token is required to run udf. Set token using set_auth_token().'),
+                    MessageCodes.FUNC_EXECUTION_FAILED)
+            else:
+                for colname, col in udf_expr.items():
+                    env_name = UtilFuncs._get_env_name(col)
+                    # Store the env_name and its corresponding output column.
+                    if env_name in env_mapper:
+                        env_mapper[env_name].append(colname)
+                    else:
+                        env_mapper[env_name] = [colname]
+        else:
+            env_mapper[env_name] = udf_expr.keys()
+
+        for env_name, cols in env_mapper.items():
+            # Create a dictionary of output columns to column type.
+            returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
+            # Store the udf functions.
+            user_function = []
+            # Create a dictionary of output column name to udf name.
+            columns_definitions = {}
+            # Create a dictionary of output column name to udf arguments.
+            function_args = {}
+            for colname, col in udf_expr.items():
+                delimiter = col._delimiter
+                quotechar = col._quotechar
+                if colname in cols:
+                    user_function.append(col._udf)
+                    function_args[colname] = col._udf_args if col._udf_args else ()
+                    returns[colname] = col.type
+                    columns_definitions[colname] = col._udf.__name__
+
+            tbl_operators = _TableOperatorUtils([],
+                                                df,
+                                                "udf",
+                                                user_function,
+                                                exec_mode,
+                                                chunk_size=None,
+                                                returns=returns,
+                                                delimiter=delimiter,
+                                                quotechar=quotechar,
+                                                num_rows=1,
+                                                auth=None,
+                                                data_partition_column=None,
+                                                data_hash_column=None,
+                                                data_order_column=None,
+                                                is_local_order=None,
+                                                nulls_first=None,
+                                                sort_ascending=None,
+                                                charset=None,
+                                                env_name=env_name,
+                                                style="csv",
+                                                function_args=function_args,
+                                                columns_definitions=columns_definitions,
+                                                output_type_converters={
+                                                    col_name: _Dtypes._teradata_type_to_python_type(col_type)
+                                                    for col_name, col_type in returns.items()})
+
+            df = tbl_operators.execute()
+        return df

    @collect_queryband(queryband="DF_assign")
    def assign(self, drop_columns=False, **kwargs):
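
The heart of _assign_udf() is the env-to-columns grouping: on Lake, each UDF column is bucketed by the user environment it must run in, so one table-operator call can serve every column that shares an environment. A standalone sketch of just that grouping step (hypothetical data, not teradataml internals):

    from collections import OrderedDict

    # Hypothetical mapping of new column name -> environment its UDF runs in.
    udf_envs = {"upper_stats": "test_udf", "len_sum": "test_udf", "score": "ml_env"}

    env_mapper = OrderedDict()
    for colname, env_name in udf_envs.items():
        # One bucket per environment; each bucket becomes one Apply/Script call.
        env_mapper.setdefault(env_name, []).append(colname)

    print(env_mapper)
    # OrderedDict([('test_udf', ['upper_stats', 'len_sum']), ('ml_env', ['score'])])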
@@ -7119,10 +7399,12 @@
            drop_columns:
                Optional Argument.
                If True, drop columns that are not specified in assign.
-               Note:
-                   When DataFrame.assign() is run on DataFrame.groupby(), this argument
-                   is ignored. In such cases, all columns are dropped and only new columns
-                   and grouping columns are returned.
+               Notes:
+                   1. When DataFrame.assign() is run on DataFrame.groupby(), this argument
+                      is ignored. In such cases, all columns are dropped and only new columns
+                      and grouping columns are returned.
+                   2. The argument is ignored for UDF functions.
+
                Default Value: False
                Types: bool

@@ -7138,6 +7420,7 @@
                * SQLAlchemy ClauseElements.
                  (See teradataml extension with SQLAlchemy in teradataml User Guide
                  and Function reference guide for more details)
+               * Function - udf.


        RETURNS:
@@ -7163,6 +7446,16 @@
               used, but the column used in such function must be a part of group by columns.
               See examples for teradataml extension with SQLAlchemy on using various
               functions with DataFrame.assign().
+            6. UDF expressions can run both on Vantage Cloud Lake, leveraging the Apply
+               Table Operator of the Open Analytics Framework, and on Enterprise,
+               leveraging Vantage's Script Table Operator.
+            7. One can pass both regular expressions and udf expressions to this API.
+               However, regular expressions are computed first, followed by udf
+               expressions; the resulting columns appear in that same order.
+               Look at Example 18 to understand more.
+            8. While passing multiple udf expressions, one cannot pass one column's
+               output as another column's input in the same ``assign`` call.
+            9. If the user passes multiple udf expressions, the delimiter and quotechar
+               specified in the last udf expression are used for processing.

        RAISES:
            1. ValueError - When a callable is passed as a value, or columns from different
@@ -7424,6 +7717,134 @@
            1  Advanced  2.886226  3.508750   84.21
            2    Novice  6.377775  3.559091   39.15
            >>>
+
+            #
+            # Executing user defined functions (UDFs) with assign()
+            #
+            # Example 15: Create two user defined functions, 'to_upper' and 'sum':
+            #             'to_upper' to convert the values in 'accounts' to upper case, and
+            #             'sum' to add the length of the string values in column 'accounts'
+            #             to column 'Feb' and store the result in an Integer type column.
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            >>> from teradatasqlalchemy.types import INTEGER
+            >>> @udf(returns=INTEGER())
+            ... def sum(x, y):
+            ...     return len(x)+y
+            >>>
+            # Assign both Column Expressions returned by the user defined functions
+            # to the DataFrame.
+            >>> res = df.assign(upper_stats = to_upper('accounts'), len_sum = sum('accounts', 'Feb'))
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime upper_stats  len_sum
+            accounts
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04    BLUE INC       98
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04     RED INC      207
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  YELLOW INC      100
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04   JONES LLC      209
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  ORANGE INC      220
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    ALPHA CO      218
+            >>>
+
+            # Example 16: Create a user defined function to add 4 to the 'datetime' column
+            #             and store the result in a DATE type column.
+            >>> from teradatasqlalchemy.types import DATE
+            >>> import datetime
+            >>> @udf(returns=DATE())
+            ... def add_date(x, y):
+            ...     return (datetime.datetime.strptime(x, "%y/%m/%d")+datetime.timedelta(y)).strftime("%y/%m/%d")
+            >>>
+            # Assign the Column Expression returned by the user defined function
+            # to the DataFrame.
+            >>> res = df.assign(new_date = add_date('datetime', 4))
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime  new_date
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04  17/01/08
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04  17/01/08
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04  17/01/08
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  17/01/08
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  17/01/08
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04  17/01/08
+            >>>
+
+            # Example 17: Create a user defined function 'to_upper' to convert
+            #             the values in 'accounts' to upper case, and create a
+            #             new column with a string literal value.
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            # Assign both expressions to the DataFrame.
+            >>> res = df.assign(upper_stats = to_upper('accounts'), new_col = 'string')
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime new_col upper_stats
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04  string    ALPHA CO
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04  string    BLUE INC
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  string  YELLOW INC
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04  string   JONES LLC
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04  string     RED INC
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  string  ORANGE INC
+            >>>
+
+            # Example 18: Create two user defined functions, 'to_upper' and 'sum',
+            #             and create new columns with a string literal value and
+            #             an arithmetic operation on column 'Feb'.
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            >>> from teradatasqlalchemy.types import INTEGER
+            >>> @udf(returns=INTEGER())
+            ... def sum(x, y):
+            ...     return len(x)+y
+            >>>
+            # Assign all expressions to the DataFrame.
+            >>> res = df.assign(upper_stats = to_upper('accounts'), new_col = 'abc',
+            ...                 len_sum = sum('accounts', 'Feb'), col_sum = df.Feb+1)
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime  col_sum new_col upper_stats  len_sum
+            accounts
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04     91.0     abc    BLUE INC       98
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    211.0     abc    ALPHA CO      218
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04    201.0     abc   JONES LLC      209
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04     91.0     abc  YELLOW INC      100
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04    211.0     abc  ORANGE INC      220
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04    201.0     abc     RED INC      207
+            >>>
+
+            # Example 19: Convert the values in the 'accounts' column to upper case using a
+            #             user defined function on Vantage Cloud Lake.
+            # Create a Python 3.10.5 environment with the given name and description in Vantage.
+            >>> env = create_env('test_udf', 'python_3.10.5', 'Test environment for UDF')
+            User environment 'test_udf' created.
+            >>>
+            # Create a user defined function 'to_upper' to convert the values to upper case,
+            # and pass the user env to run it in.
+            >>> from teradataml.dataframe.functions import udf
+            >>> @udf(env_name = env)
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            # Assign the Column Expression returned by the user defined function
+            # to the DataFrame.
+            >>> df.assign(upper_stats = to_upper('accounts'))
+                          Feb    Jan    Mar    Apr  datetime upper_stats
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    ALPHA CO
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04    BLUE INC
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  YELLOW INC
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04   JONES LLC
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  ORANGE INC
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04     RED INC
+            >>>
        """
        # Argument validations
        awu_matrix = []
@@ -7469,13 +7890,35 @@
            msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
            raise TeradataMlException(msg, MessageCodes.TDMLDF_INFO_ERROR)

-       try:
-           (new_meta, new_nodeid) = self._generate_assign_metaexpr_aed_nodeid(drop_columns, **kwargs)
-           return self._create_dataframe_from_node(new_nodeid, new_meta, self._index_label)
-       except Exception as err:
-           errcode = MessageCodes.TDMLDF_INFO_ERROR
-           msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
-           raise TeradataMlException(msg, errcode) from err
+       # Create dictionaries of column name to udf expressions and of
+       # column name to normal/regular expressions.
+       udf_expr = {}
+       regular_expr = {}
+       for colname, col in kwargs.items():
+           # If the value passed in kwargs is a ColumnExpression and is a udf, store it.
+           if isinstance(col, ColumnExpression) and col._udf:
+               udf_expr[colname] = col
+           else:
+               regular_expr[colname] = col
+       df = self
+
+       # If kwargs contains both regular and udf expressions, first create new columns
+       # from the normal/regular expressions, then create new columns from the udf
+       # expressions on the resulting dataframe.
+       if bool(regular_expr):
+           try:
+               (new_meta, new_nodeid) = df._generate_assign_metaexpr_aed_nodeid(drop_columns, **regular_expr)
+               df = df._create_dataframe_from_node(new_nodeid, new_meta, df._index_label)
+           except Exception as err:
+               errcode = MessageCodes.TDMLDF_INFO_ERROR
+               msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
+               raise TeradataMlException(msg, errcode) from err
+
+       if bool(udf_expr):
+           df = df._assign_udf(udf_expr)
+
+       return df
+

    @collect_queryband(queryband="DF_get")
    def get(self, key):
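
The rewritten body partitions kwargs into SQL-expressible assignments and UDF-backed assignments, applies the SQL batch first through the existing AED path, then feeds the UDF batch to _assign_udf() on the intermediate result. A stripped-down, standalone sketch of that flow (all names below are illustrative stand-ins, not teradataml internals):

    class ColumnExpression:
        """Stand-in: real teradataml ColumnExpressions carry a _udf attribute."""
        def __init__(self, udf=None):
            self._udf = udf

    def apply_sql_expressions(df, exprs):
        # Stand-in for the AED/SQL path (_generate_assign_metaexpr_aed_nodeid).
        return {**df, **{k: "via-sql" for k in exprs}}

    def apply_udf_expressions(df, exprs):
        # Stand-in for the table-operator path (_assign_udf).
        return {**df, **{k: "via-udf" for k in exprs}}

    def assign(df, **kwargs):
        udf_expr, regular_expr = {}, {}
        for colname, col in kwargs.items():
            bucket = udf_expr if isinstance(col, ColumnExpression) and col._udf else regular_expr
            bucket[colname] = col
        if regular_expr:              # SQL columns are computed first...
            df = apply_sql_expressions(df, regular_expr)
        if udf_expr:                  # ...then UDF columns on the result.
            df = apply_udf_expressions(df, udf_expr)
        return df

    print(assign({"id": "base"}, doubled=ColumnExpression(),
                 upper=ColumnExpression(udf=lambda s: s.upper())))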
@@ -10107,7 +10550,8 @@
                                           test_size=list_of_fracs[1],
                                           stratify_column=stratify_column,
                                           seed=seed,
-                                          persist=True)
+                                          persist=True,
+                                          display_table_name=False)

        # Retrieve the table name from TrainTestSplit_out object.
        table_name = TrainTestSplit_out.result._table_name
@@ -10218,10 +10662,10 @@

        # Make this non-lazy. Added this in order to fix https://teradata-pe.atlassian.net/browse/ELE-6368
        # Cannot use __execute_node_and_set_table_name because self points to original df.
-       # Hence, setting the __table_name with _execute_node_return_db_object_name.
+       # Hence, setting the _table_name with _execute_node_return_db_object_name.

        df = self._create_dataframe_from_node(sample_node_id, new_metaexpr, self._index_label)
-       df.__table_name = df_utils._execute_node_return_db_object_name(sample_node_id, new_metaexpr)
+       df._table_name = df_utils._execute_node_return_db_object_name(sample_node_id, new_metaexpr)

        return df

@@ -10352,26 +10796,14 @@
                   where admitted > 0) as temp_table SAMPLE 0.9'

        """
+       # Argument validations
+       awu_matrix = []
+       awu_matrix.append(["full_query", full_query, False, (bool)])
+       # Validate argument types
+       _Validators._validate_function_arguments(awu_matrix)

        try:
-           # Argument validations
-           awu_matrix = []
-           awu_matrix.append(["full_query", full_query, False, (bool)])
-           # Validate argument types
-           _Validators._validate_function_arguments(awu_matrix)
-
-           node_id = self._nodeid
-
-           if isinstance(self, (DataFrameGroupBy, DataFrameGroupByTime)):
-               # If dataframe is either of type groupby or groupbytime
-               # then get it's parent dataframe nodeid and return queries
-               # for the same
-               node_id = self._aed_utils._aed_get_parent_nodeids(self._nodeid)[0]
-
-           queries = self._aed_utils._aed_show_query(node_id, query_with_reference_to_top=full_query)
-
-           return queries[0][0]
-
+           return self.__generate_aed_query(full_query)
        except TeradataMlException:
            raise

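show_query() keeps its public contract but now delegates to the cached __generate_aed_query(), so repeated calls no longer re-enter the AED layer. Usage is unchanged; a short sketch based on the docstring fragment above (the filter and sampling calls are illustrative):

    from teradataml import DataFrame, load_example_data

    load_example_data("dataframe", "admissions_train")
    df = DataFrame("admissions_train")
    sampled = df[df.admitted > 0].sample(frac=0.9)

    # SQL that is (or will be) used to fetch this DataFrame's data.
    print(sampled.show_query())

    # Complete query, expressed against the base table instead of
    # intermediate views; may not be directly executable as-is.
    print(sampled.show_query(full_query=True))
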
@@ -10381,7 +10813,7 @@
        except Exception as err:
            errcode = MessageCodes.TDMLDF_INFO_ERROR
            msg = Messages.get_message(errcode)
-           raise TeradataMlException(msg, errcode) from err
+           raise TeradataMlException(msg, errcode) from err

    @collect_queryband(queryband="DF_mapRow")
    def map_row(self,
@@ -13840,7 +14272,7 @@
                Types: int OR NoneType

        RETURNS:
-           iterator, an object to iterate over namedtuples for each row in the DataFrame.
+           iterator, an object to iterate over rows in the DataFrame.

        RAISES:
            None
@@ -13889,9 +14321,10 @@
        cur = execute_sql(query)

        if name:
+           columns = [column[0] for column in cur.description]
            for rec in cur:
-               Row = namedtuple(name, [column[0] for column in cur.description])
-               yield Row(*rec)
+               row = _Row(columns=columns, values=rec)
+               yield row
        else:
            for rec in cur:
                yield rec
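
itertuples() previously rebuilt a namedtuple type on every iteration; the new code hoists the column list out of the loop and wraps each record in the _Row class added in teradataml/dataframe/row.py. That class is not shown in this diff; a minimal, hedged sketch of what such a row wrapper plausibly offers (attribute and key access over one record) could be:

    class _Row:
        """Hedged sketch only: pairs a shared column list with one record."""

        def __init__(self, columns, values):
            # Stored once per row; the columns list itself is shared by all rows.
            self.__dict__["_mapping"] = dict(zip(columns, values))

        def __getattr__(self, name):          # row.gpa
            try:
                return self._mapping[name]
            except KeyError:
                raise AttributeError(name)

        def __getitem__(self, key):           # row["gpa"]
            return self._mapping[key]

        def __repr__(self):
            return "Row(%s)" % ", ".join("%s=%r" % kv for kv in self._mapping.items())

    row = _Row(columns=["id", "gpa"], values=(13, 4.00))
    print(row.id, row["gpa"])   # 13 4.0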
teradataml/dataframe/fastload.py
@@ -30,7 +30,7 @@ from teradataml.dataframe.copy_to import copy_to_sql, \
     _create_pti_table_object, _extract_column_info, \
     _check_columns_insertion_compatible
 from teradataml.dataframe.data_transfer import _DataTransferUtils
-from teradatasqlalchemy.telemetry.queryband import collect_queryband
+from teradataml.telemetry_utils.queryband import collect_queryband


 @collect_queryband(queryband="fstLd")
@@ -348,11 +348,11 @@ def fastload(df, table_name, schema_name=None, if_exists='replace', index=False,
        308 2014-03-06 10:01:20.000000

        # Validate error and warning tables.
-       >>> DataFrame("fld_errors")
+       >>> DataFrame(in_schema("stage_db", "fld_errors"))
        batch_no       error_message
        1              [Session 14527] [Teradata Database] [Error 2673] FastLoad failed to insert 1 of 9 batched rows. Batched row 3 failed to insert because of Teradata Database error 2673 in "target_db"."fastload_with_err_warn_tbl_stag_db"."C_timestamp"

-       >>> DataFrame("fld_warnings")
+       >>> DataFrame(in_schema("stage_db", "fld_warnings"))
        batch_no       error_message
        batch_summary  [Session 14526] [Teradata SQL Driver] [Warning 518] Found 1 duplicate or faulty row(s) while ending FastLoad of database table "target_db"."fastload_with_err_warn_tbl_stag_db": expected a row count of 8, got a row count of 7
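
The docstring correction reflects that FastLoad's persisted error and warning tables live in the staging database, so they must be opened with in_schema() rather than by bare table name. A hedged sketch of the round trip (schema and table names follow the docstring; exact error-table options vary by version, so check help(fastload)):

    import pandas as pd
    from teradataml import DataFrame, fastload, in_schema

    pdf = pd.DataFrame({"id": [1, 2, 3],
                        "ts": ["2014-03-06 10:01:20"] * 3})

    # save_errors=True asks fastload to keep the error/warning tables
    # instead of dropping them after the load.
    fastload(df=pdf, table_name="target_tbl", schema_name="target_db",
             save_errors=True)

    # Open the persisted tables via in_schema(), as the corrected example shows.
    print(DataFrame(in_schema("stage_db", "fld_errors")))
    print(DataFrame(in_schema("stage_db", "fld_warnings")))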