teradataml 17.20.0.7__py3-none-any.whl → 20.0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of teradataml has been flagged as potentially problematic.
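Because this diff spans a major-version jump (17.20.x to 20.0.x), it is worth confirming which build is actually installed before comparing behavior. A minimal sketch, assuming the conventional __version__ attribute that teradataml exposes via teradataml/_version.py (entry 5 in the listing below):

    # Minimal sketch: check the installed teradataml build before reading this diff.
    # Assumes the standard __version__ attribute defined in teradataml/_version.py
    # and re-exported from the package root.
    import teradataml

    print(teradataml.__version__)  # e.g. a 20.0.x string after upgrading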

Files changed (1303)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +1935 -1640
  4. teradataml/__init__.py +70 -60
  5. teradataml/_version.py +11 -11
  6. teradataml/analytics/Transformations.py +2995 -2995
  7. teradataml/analytics/__init__.py +81 -83
  8. teradataml/analytics/analytic_function_executor.py +2040 -2010
  9. teradataml/analytics/analytic_query_generator.py +958 -958
  10. teradataml/analytics/byom/H2OPredict.py +514 -514
  11. teradataml/analytics/byom/PMMLPredict.py +437 -437
  12. teradataml/analytics/byom/__init__.py +14 -14
  13. teradataml/analytics/json_parser/__init__.py +130 -130
  14. teradataml/analytics/json_parser/analytic_functions_argument.py +1707 -1707
  15. teradataml/analytics/json_parser/json_store.py +191 -191
  16. teradataml/analytics/json_parser/metadata.py +1637 -1637
  17. teradataml/analytics/json_parser/utils.py +798 -803
  18. teradataml/analytics/meta_class.py +196 -196
  19. teradataml/analytics/sqle/DecisionTreePredict.py +455 -470
  20. teradataml/analytics/sqle/NaiveBayesPredict.py +419 -428
  21. teradataml/analytics/sqle/__init__.py +97 -110
  22. teradataml/analytics/sqle/json/decisiontreepredict_sqle.json +78 -78
  23. teradataml/analytics/sqle/json/naivebayespredict_sqle.json +62 -62
  24. teradataml/analytics/table_operator/__init__.py +10 -10
  25. teradataml/analytics/uaf/__init__.py +63 -63
  26. teradataml/analytics/utils.py +693 -692
  27. teradataml/analytics/valib.py +1603 -1600
  28. teradataml/automl/__init__.py +1683 -0
  29. teradataml/automl/custom_json_utils.py +1270 -0
  30. teradataml/automl/data_preparation.py +1011 -0
  31. teradataml/automl/data_transformation.py +789 -0
  32. teradataml/automl/feature_engineering.py +1580 -0
  33. teradataml/automl/feature_exploration.py +554 -0
  34. teradataml/automl/model_evaluation.py +151 -0
  35. teradataml/automl/model_training.py +1026 -0
  36. teradataml/catalog/__init__.py +1 -3
  37. teradataml/catalog/byom.py +1759 -1716
  38. teradataml/catalog/function_argument_mapper.py +859 -861
  39. teradataml/catalog/model_cataloging_utils.py +491 -1510
  40. teradataml/clients/auth_client.py +133 -0
  41. teradataml/clients/pkce_client.py +481 -481
  42. teradataml/common/aed_utils.py +7 -2
  43. teradataml/common/bulk_exposed_utils.py +111 -111
  44. teradataml/common/constants.py +1438 -1441
  45. teradataml/common/deprecations.py +160 -0
  46. teradataml/common/exceptions.py +73 -73
  47. teradataml/common/formula.py +742 -742
  48. teradataml/common/garbagecollector.py +597 -635
  49. teradataml/common/messagecodes.py +424 -431
  50. teradataml/common/messages.py +228 -231
  51. teradataml/common/sqlbundle.py +693 -693
  52. teradataml/common/td_coltype_code_to_tdtype.py +48 -48
  53. teradataml/common/utils.py +2424 -2500
  54. teradataml/common/warnings.py +25 -25
  55. teradataml/common/wrapper_utils.py +1 -110
  56. teradataml/config/dummy_file1.cfg +4 -4
  57. teradataml/config/dummy_file2.cfg +2 -2
  58. teradataml/config/sqlengine_alias_definitions_v1.0 +13 -13
  59. teradataml/config/sqlengine_alias_definitions_v1.1 +19 -19
  60. teradataml/config/sqlengine_alias_definitions_v1.3 +18 -18
  61. teradataml/context/aed_context.py +217 -217
  62. teradataml/context/context.py +1091 -999
  63. teradataml/data/A_loan.csv +19 -19
  64. teradataml/data/BINARY_REALS_LEFT.csv +11 -11
  65. teradataml/data/BINARY_REALS_RIGHT.csv +11 -11
  66. teradataml/data/B_loan.csv +49 -49
  67. teradataml/data/BuoyData2.csv +17 -17
  68. teradataml/data/CONVOLVE2_COMPLEX_LEFT.csv +5 -5
  69. teradataml/data/CONVOLVE2_COMPLEX_RIGHT.csv +5 -5
  70. teradataml/data/Convolve2RealsLeft.csv +5 -5
  71. teradataml/data/Convolve2RealsRight.csv +5 -5
  72. teradataml/data/Convolve2ValidLeft.csv +11 -11
  73. teradataml/data/Convolve2ValidRight.csv +11 -11
  74. teradataml/data/DFFTConv_Real_8_8.csv +65 -65
  75. teradataml/data/Orders1_12mf.csv +24 -24
  76. teradataml/data/Pi_loan.csv +7 -7
  77. teradataml/data/SMOOTHED_DATA.csv +7 -7
  78. teradataml/data/TestDFFT8.csv +9 -9
  79. teradataml/data/TestRiver.csv +109 -109
  80. teradataml/data/Traindata.csv +28 -28
  81. teradataml/data/acf.csv +17 -17
  82. teradataml/data/adaboost_example.json +34 -34
  83. teradataml/data/adaboostpredict_example.json +24 -24
  84. teradataml/data/additional_table.csv +10 -10
  85. teradataml/data/admissions_test.csv +21 -21
  86. teradataml/data/admissions_train.csv +41 -41
  87. teradataml/data/admissions_train_nulls.csv +41 -41
  88. teradataml/data/advertising.csv +201 -0
  89. teradataml/data/ageandheight.csv +13 -13
  90. teradataml/data/ageandpressure.csv +31 -31
  91. teradataml/data/antiselect_example.json +36 -36
  92. teradataml/data/antiselect_input.csv +8 -8
  93. teradataml/data/antiselect_input_mixed_case.csv +8 -8
  94. teradataml/data/applicant_external.csv +6 -6
  95. teradataml/data/applicant_reference.csv +6 -6
  96. teradataml/data/arima_example.json +9 -9
  97. teradataml/data/assortedtext_input.csv +8 -8
  98. teradataml/data/attribution_example.json +33 -33
  99. teradataml/data/attribution_sample_table.csv +27 -27
  100. teradataml/data/attribution_sample_table1.csv +6 -6
  101. teradataml/data/attribution_sample_table2.csv +11 -11
  102. teradataml/data/bank_churn.csv +10001 -0
  103. teradataml/data/bank_marketing.csv +11163 -0
  104. teradataml/data/bank_web_clicks1.csv +42 -42
  105. teradataml/data/bank_web_clicks2.csv +91 -91
  106. teradataml/data/bank_web_url.csv +85 -85
  107. teradataml/data/barrier.csv +2 -2
  108. teradataml/data/barrier_new.csv +3 -3
  109. teradataml/data/betweenness_example.json +13 -13
  110. teradataml/data/bike_sharing.csv +732 -0
  111. teradataml/data/bin_breaks.csv +8 -8
  112. teradataml/data/bin_fit_ip.csv +3 -3
  113. teradataml/data/binary_complex_left.csv +11 -11
  114. teradataml/data/binary_complex_right.csv +11 -11
  115. teradataml/data/binary_matrix_complex_left.csv +21 -21
  116. teradataml/data/binary_matrix_complex_right.csv +21 -21
  117. teradataml/data/binary_matrix_real_left.csv +21 -21
  118. teradataml/data/binary_matrix_real_right.csv +21 -21
  119. teradataml/data/blood2ageandweight.csv +26 -26
  120. teradataml/data/bmi.csv +501 -0
  121. teradataml/data/boston.csv +507 -507
  122. teradataml/data/boston2cols.csv +721 -0
  123. teradataml/data/breast_cancer.csv +570 -0
  124. teradataml/data/buoydata_mix.csv +11 -11
  125. teradataml/data/burst_data.csv +5 -5
  126. teradataml/data/burst_example.json +20 -20
  127. teradataml/data/byom_example.json +17 -17
  128. teradataml/data/bytes_table.csv +3 -3
  129. teradataml/data/cal_housing_ex_raw.csv +70 -70
  130. teradataml/data/callers.csv +7 -7
  131. teradataml/data/calls.csv +10 -10
  132. teradataml/data/cars_hist.csv +33 -33
  133. teradataml/data/cat_table.csv +24 -24
  134. teradataml/data/ccm_example.json +31 -31
  135. teradataml/data/ccm_input.csv +91 -91
  136. teradataml/data/ccm_input2.csv +13 -13
  137. teradataml/data/ccmexample.csv +101 -101
  138. teradataml/data/ccmprepare_example.json +8 -8
  139. teradataml/data/ccmprepare_input.csv +91 -91
  140. teradataml/data/cfilter_example.json +12 -12
  141. teradataml/data/changepointdetection_example.json +18 -18
  142. teradataml/data/changepointdetectionrt_example.json +8 -8
  143. teradataml/data/chi_sq.csv +2 -2
  144. teradataml/data/churn_data.csv +14 -14
  145. teradataml/data/churn_emission.csv +35 -35
  146. teradataml/data/churn_initial.csv +3 -3
  147. teradataml/data/churn_state_transition.csv +5 -5
  148. teradataml/data/citedges_2.csv +745 -745
  149. teradataml/data/citvertices_2.csv +1210 -1210
  150. teradataml/data/clicks2.csv +16 -16
  151. teradataml/data/clickstream.csv +12 -12
  152. teradataml/data/clickstream1.csv +11 -11
  153. teradataml/data/closeness_example.json +15 -15
  154. teradataml/data/complaints.csv +21 -21
  155. teradataml/data/complaints_mini.csv +3 -3
  156. teradataml/data/complaints_testtoken.csv +224 -224
  157. teradataml/data/complaints_tokens_test.csv +353 -353
  158. teradataml/data/complaints_traintoken.csv +472 -472
  159. teradataml/data/computers_category.csv +1001 -1001
  160. teradataml/data/computers_test1.csv +1252 -1252
  161. teradataml/data/computers_train1.csv +5009 -5009
  162. teradataml/data/computers_train1_clustered.csv +5009 -5009
  163. teradataml/data/confusionmatrix_example.json +9 -9
  164. teradataml/data/conversion_event_table.csv +3 -3
  165. teradataml/data/corr_input.csv +17 -17
  166. teradataml/data/correlation_example.json +11 -11
  167. teradataml/data/coxhazardratio_example.json +39 -39
  168. teradataml/data/coxph_example.json +15 -15
  169. teradataml/data/coxsurvival_example.json +28 -28
  170. teradataml/data/cpt.csv +41 -41
  171. teradataml/data/credit_ex_merged.csv +45 -45
  172. teradataml/data/customer_loyalty.csv +301 -301
  173. teradataml/data/customer_loyalty_newseq.csv +31 -31
  174. teradataml/data/customer_segmentation_test.csv +2628 -0
  175. teradataml/data/customer_segmentation_train.csv +8069 -0
  176. teradataml/data/dataframe_example.json +146 -146
  177. teradataml/data/decisionforest_example.json +37 -37
  178. teradataml/data/decisionforestpredict_example.json +38 -38
  179. teradataml/data/decisiontree_example.json +21 -21
  180. teradataml/data/decisiontreepredict_example.json +45 -45
  181. teradataml/data/dfft2_size4_real.csv +17 -17
  182. teradataml/data/dfft2_test_matrix16.csv +17 -17
  183. teradataml/data/dfft2conv_real_4_4.csv +65 -65
  184. teradataml/data/diabetes.csv +443 -443
  185. teradataml/data/diabetes_test.csv +89 -89
  186. teradataml/data/dict_table.csv +5 -5
  187. teradataml/data/docperterm_table.csv +4 -4
  188. teradataml/data/docs/__init__.py +1 -1
  189. teradataml/data/docs/byom/docs/DataRobotPredict.py +180 -180
  190. teradataml/data/docs/byom/docs/DataikuPredict.py +177 -177
  191. teradataml/data/docs/byom/docs/H2OPredict.py +324 -324
  192. teradataml/data/docs/byom/docs/ONNXPredict.py +283 -283
  193. teradataml/data/docs/byom/docs/PMMLPredict.py +277 -277
  194. teradataml/data/docs/sqle/docs_17_10/Antiselect.py +82 -82
  195. teradataml/data/docs/sqle/docs_17_10/Attribution.py +199 -199
  196. teradataml/data/docs/sqle/docs_17_10/BincodeFit.py +171 -171
  197. teradataml/data/docs/sqle/docs_17_10/BincodeTransform.py +131 -130
  198. teradataml/data/docs/sqle/docs_17_10/CategoricalSummary.py +86 -86
  199. teradataml/data/docs/sqle/docs_17_10/ChiSq.py +90 -90
  200. teradataml/data/docs/sqle/docs_17_10/ColumnSummary.py +85 -85
  201. teradataml/data/docs/sqle/docs_17_10/ConvertTo.py +95 -95
  202. teradataml/data/docs/sqle/docs_17_10/DecisionForestPredict.py +139 -139
  203. teradataml/data/docs/sqle/docs_17_10/DecisionTreePredict.py +151 -151
  204. teradataml/data/docs/sqle/docs_17_10/FTest.py +160 -160
  205. teradataml/data/docs/sqle/docs_17_10/FillRowId.py +82 -82
  206. teradataml/data/docs/sqle/docs_17_10/Fit.py +87 -87
  207. teradataml/data/docs/sqle/docs_17_10/GLMPredict.py +144 -144
  208. teradataml/data/docs/sqle/docs_17_10/GetRowsWithMissingValues.py +84 -84
  209. teradataml/data/docs/sqle/docs_17_10/GetRowsWithoutMissingValues.py +81 -81
  210. teradataml/data/docs/sqle/docs_17_10/Histogram.py +164 -164
  211. teradataml/data/docs/sqle/docs_17_10/MovingAverage.py +134 -134
  212. teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +208 -208
  213. teradataml/data/docs/sqle/docs_17_10/NPath.py +265 -265
  214. teradataml/data/docs/sqle/docs_17_10/NaiveBayesPredict.py +116 -116
  215. teradataml/data/docs/sqle/docs_17_10/NaiveBayesTextClassifierPredict.py +176 -176
  216. teradataml/data/docs/sqle/docs_17_10/NumApply.py +147 -147
  217. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +134 -132
  218. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +109 -103
  219. teradataml/data/docs/sqle/docs_17_10/OutlierFilterFit.py +165 -165
  220. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +105 -101
  221. teradataml/data/docs/sqle/docs_17_10/Pack.py +128 -128
  222. teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesFit.py +111 -111
  223. teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesTransform.py +102 -102
  224. teradataml/data/docs/sqle/docs_17_10/QQNorm.py +104 -104
  225. teradataml/data/docs/sqle/docs_17_10/RoundColumns.py +109 -109
  226. teradataml/data/docs/sqle/docs_17_10/RowNormalizeFit.py +117 -117
  227. teradataml/data/docs/sqle/docs_17_10/RowNormalizeTransform.py +99 -98
  228. teradataml/data/docs/sqle/docs_17_10/SVMSparsePredict.py +152 -152
  229. teradataml/data/docs/sqle/docs_17_10/ScaleFit.py +197 -197
  230. teradataml/data/docs/sqle/docs_17_10/ScaleTransform.py +99 -98
  231. teradataml/data/docs/sqle/docs_17_10/Sessionize.py +113 -113
  232. teradataml/data/docs/sqle/docs_17_10/SimpleImputeFit.py +116 -116
  233. teradataml/data/docs/sqle/docs_17_10/SimpleImputeTransform.py +98 -98
  234. teradataml/data/docs/sqle/docs_17_10/StrApply.py +187 -187
  235. teradataml/data/docs/sqle/docs_17_10/StringSimilarity.py +145 -145
  236. teradataml/data/docs/sqle/docs_17_10/Transform.py +105 -104
  237. teradataml/data/docs/sqle/docs_17_10/UnivariateStatistics.py +141 -141
  238. teradataml/data/docs/sqle/docs_17_10/Unpack.py +214 -214
  239. teradataml/data/docs/sqle/docs_17_10/WhichMax.py +83 -83
  240. teradataml/data/docs/sqle/docs_17_10/WhichMin.py +83 -83
  241. teradataml/data/docs/sqle/docs_17_10/ZTest.py +155 -155
  242. teradataml/data/docs/sqle/docs_17_20/ANOVA.py +186 -126
  243. teradataml/data/docs/sqle/docs_17_20/Antiselect.py +82 -82
  244. teradataml/data/docs/sqle/docs_17_20/Attribution.py +200 -200
  245. teradataml/data/docs/sqle/docs_17_20/BincodeFit.py +171 -171
  246. teradataml/data/docs/sqle/docs_17_20/BincodeTransform.py +139 -138
  247. teradataml/data/docs/sqle/docs_17_20/CategoricalSummary.py +86 -86
  248. teradataml/data/docs/sqle/docs_17_20/ChiSq.py +90 -90
  249. teradataml/data/docs/sqle/docs_17_20/ClassificationEvaluator.py +166 -166
  250. teradataml/data/docs/sqle/docs_17_20/ColumnSummary.py +85 -85
  251. teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +245 -243
  252. teradataml/data/docs/sqle/docs_17_20/ConvertTo.py +113 -113
  253. teradataml/data/docs/sqle/docs_17_20/DecisionForest.py +279 -279
  254. teradataml/data/docs/sqle/docs_17_20/DecisionForestPredict.py +144 -144
  255. teradataml/data/docs/sqle/docs_17_20/DecisionTreePredict.py +135 -135
  256. teradataml/data/docs/sqle/docs_17_20/FTest.py +239 -160
  257. teradataml/data/docs/sqle/docs_17_20/FillRowId.py +82 -82
  258. teradataml/data/docs/sqle/docs_17_20/Fit.py +87 -87
  259. teradataml/data/docs/sqle/docs_17_20/GLM.py +541 -380
  260. teradataml/data/docs/sqle/docs_17_20/GLMPerSegment.py +414 -414
  261. teradataml/data/docs/sqle/docs_17_20/GLMPredict.py +144 -144
  262. teradataml/data/docs/sqle/docs_17_20/GLMPredictPerSegment.py +233 -234
  263. teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +125 -123
  264. teradataml/data/docs/sqle/docs_17_20/GetRowsWithMissingValues.py +108 -108
  265. teradataml/data/docs/sqle/docs_17_20/GetRowsWithoutMissingValues.py +105 -105
  266. teradataml/data/docs/sqle/docs_17_20/Histogram.py +223 -223
  267. teradataml/data/docs/sqle/docs_17_20/KMeans.py +251 -204
  268. teradataml/data/docs/sqle/docs_17_20/KMeansPredict.py +144 -143
  269. teradataml/data/docs/sqle/docs_17_20/KNN.py +214 -214
  270. teradataml/data/docs/sqle/docs_17_20/MovingAverage.py +134 -134
  271. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +208 -208
  272. teradataml/data/docs/sqle/docs_17_20/NPath.py +265 -265
  273. teradataml/data/docs/sqle/docs_17_20/NaiveBayesPredict.py +116 -116
  274. teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierPredict.py +177 -176
  275. teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierTrainer.py +126 -126
  276. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +118 -117
  277. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineTransform.py +112 -112
  278. teradataml/data/docs/sqle/docs_17_20/NumApply.py +147 -147
  279. teradataml/data/docs/sqle/docs_17_20/OneClassSVM.py +307 -307
  280. teradataml/data/docs/sqle/docs_17_20/OneClassSVMPredict.py +185 -184
  281. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +230 -225
  282. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +121 -115
  283. teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingFit.py +219 -219
  284. teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingTransform.py +127 -127
  285. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +189 -189
  286. teradataml/data/docs/sqle/docs_17_20/OutlierFilterTransform.py +117 -112
  287. teradataml/data/docs/sqle/docs_17_20/Pack.py +128 -128
  288. teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesFit.py +111 -111
  289. teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesTransform.py +112 -111
  290. teradataml/data/docs/sqle/docs_17_20/QQNorm.py +104 -104
  291. teradataml/data/docs/sqle/docs_17_20/ROC.py +164 -163
  292. teradataml/data/docs/sqle/docs_17_20/RandomProjectionFit.py +154 -154
  293. teradataml/data/docs/sqle/docs_17_20/RandomProjectionMinComponents.py +106 -106
  294. teradataml/data/docs/sqle/docs_17_20/RandomProjectionTransform.py +120 -120
  295. teradataml/data/docs/sqle/docs_17_20/RegressionEvaluator.py +211 -211
  296. teradataml/data/docs/sqle/docs_17_20/RoundColumns.py +108 -108
  297. teradataml/data/docs/sqle/docs_17_20/RowNormalizeFit.py +117 -117
  298. teradataml/data/docs/sqle/docs_17_20/RowNormalizeTransform.py +111 -110
  299. teradataml/data/docs/sqle/docs_17_20/SVM.py +413 -413
  300. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +213 -202
  301. teradataml/data/docs/sqle/docs_17_20/SVMSparsePredict.py +152 -152
  302. teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +315 -197
  303. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +202 -109
  304. teradataml/data/docs/sqle/docs_17_20/SentimentExtractor.py +206 -206
  305. teradataml/data/docs/sqle/docs_17_20/Sessionize.py +113 -113
  306. teradataml/data/docs/sqle/docs_17_20/Silhouette.py +152 -152
  307. teradataml/data/docs/sqle/docs_17_20/SimpleImputeFit.py +116 -116
  308. teradataml/data/docs/sqle/docs_17_20/SimpleImputeTransform.py +109 -108
  309. teradataml/data/docs/sqle/docs_17_20/StrApply.py +187 -187
  310. teradataml/data/docs/sqle/docs_17_20/StringSimilarity.py +145 -145
  311. teradataml/data/docs/sqle/docs_17_20/TDDecisionForestPredict.py +207 -207
  312. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +333 -171
  313. teradataml/data/docs/sqle/docs_17_20/TargetEncodingFit.py +266 -266
  314. teradataml/data/docs/sqle/docs_17_20/TargetEncodingTransform.py +141 -140
  315. teradataml/data/docs/sqle/docs_17_20/TextParser.py +172 -172
  316. teradataml/data/docs/sqle/docs_17_20/TrainTestSplit.py +159 -159
  317. teradataml/data/docs/sqle/docs_17_20/Transform.py +123 -123
  318. teradataml/data/docs/sqle/docs_17_20/UnivariateStatistics.py +141 -141
  319. teradataml/data/docs/sqle/docs_17_20/Unpack.py +214 -214
  320. teradataml/data/docs/sqle/docs_17_20/VectorDistance.py +168 -168
  321. teradataml/data/docs/sqle/docs_17_20/WhichMax.py +83 -83
  322. teradataml/data/docs/sqle/docs_17_20/WhichMin.py +83 -83
  323. teradataml/data/docs/sqle/docs_17_20/WordEmbeddings.py +236 -236
  324. teradataml/data/docs/sqle/docs_17_20/XGBoost.py +361 -353
  325. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +281 -275
  326. teradataml/data/docs/sqle/docs_17_20/ZTest.py +220 -155
  327. teradataml/data/docs/tableoperator/docs_17_00/ReadNOS.py +429 -429
  328. teradataml/data/docs/tableoperator/docs_17_05/ReadNOS.py +429 -429
  329. teradataml/data/docs/tableoperator/docs_17_05/WriteNOS.py +347 -347
  330. teradataml/data/docs/tableoperator/docs_17_10/ReadNOS.py +428 -428
  331. teradataml/data/docs/tableoperator/docs_17_10/WriteNOS.py +347 -347
  332. teradataml/data/docs/tableoperator/docs_17_20/ReadNOS.py +439 -439
  333. teradataml/data/docs/tableoperator/docs_17_20/WriteNOS.py +386 -386
  334. teradataml/data/docs/uaf/docs_17_20/ACF.py +195 -195
  335. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +369 -369
  336. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +142 -142
  337. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +159 -159
  338. teradataml/data/docs/uaf/docs_17_20/BinaryMatrixOp.py +247 -247
  339. teradataml/data/docs/uaf/docs_17_20/BinarySeriesOp.py +252 -252
  340. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +177 -177
  341. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +174 -174
  342. teradataml/data/docs/uaf/docs_17_20/Convolve.py +226 -226
  343. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +214 -214
  344. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +183 -183
  345. teradataml/data/docs/uaf/docs_17_20/DFFT.py +203 -203
  346. teradataml/data/docs/uaf/docs_17_20/DFFT2.py +216 -216
  347. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +215 -215
  348. teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +191 -191
  349. teradataml/data/docs/uaf/docs_17_20/DTW.py +179 -179
  350. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +144 -144
  351. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +183 -183
  352. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +184 -184
  353. teradataml/data/docs/uaf/docs_17_20/FitMetrics.py +172 -172
  354. teradataml/data/docs/uaf/docs_17_20/GenseriesFormula.py +205 -205
  355. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +142 -142
  356. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +258 -258
  357. teradataml/data/docs/uaf/docs_17_20/IDFFT.py +164 -164
  358. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +198 -198
  359. teradataml/data/docs/uaf/docs_17_20/InputValidator.py +120 -120
  360. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +155 -155
  361. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +214 -214
  362. teradataml/data/docs/uaf/docs_17_20/MAMean.py +173 -173
  363. teradataml/data/docs/uaf/docs_17_20/MInfo.py +133 -133
  364. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +135 -135
  365. teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +190 -190
  366. teradataml/data/docs/uaf/docs_17_20/PACF.py +158 -158
  367. teradataml/data/docs/uaf/docs_17_20/Portman.py +216 -216
  368. teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +154 -154
  369. teradataml/data/docs/uaf/docs_17_20/Resample.py +228 -228
  370. teradataml/data/docs/uaf/docs_17_20/SInfo.py +122 -122
  371. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +165 -165
  372. teradataml/data/docs/uaf/docs_17_20/SelectionCriteria.py +173 -173
  373. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +170 -170
  374. teradataml/data/docs/uaf/docs_17_20/SignifResidmean.py +163 -163
  375. teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +179 -179
  376. teradataml/data/docs/uaf/docs_17_20/Smoothma.py +207 -207
  377. teradataml/data/docs/uaf/docs_17_20/TrackingOp.py +150 -150
  378. teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +171 -171
  379. teradataml/data/docs/uaf/docs_17_20/Unnormalize.py +201 -201
  380. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +169 -169
  381. teradataml/data/dtw_example.json +17 -17
  382. teradataml/data/dtw_t1.csv +11 -11
  383. teradataml/data/dtw_t2.csv +4 -4
  384. teradataml/data/dwt2d_example.json +15 -15
  385. teradataml/data/dwt_example.json +14 -14
  386. teradataml/data/dwt_filter_dim.csv +5 -5
  387. teradataml/data/emission.csv +9 -9
  388. teradataml/data/emp_table_by_dept.csv +19 -19
  389. teradataml/data/employee_info.csv +4 -4
  390. teradataml/data/employee_table.csv +6 -6
  391. teradataml/data/excluding_event_table.csv +2 -2
  392. teradataml/data/finance_data.csv +6 -6
  393. teradataml/data/finance_data2.csv +61 -61
  394. teradataml/data/finance_data3.csv +93 -93
  395. teradataml/data/fish.csv +160 -0
  396. teradataml/data/fm_blood2ageandweight.csv +26 -26
  397. teradataml/data/fmeasure_example.json +11 -11
  398. teradataml/data/followers_leaders.csv +10 -10
  399. teradataml/data/fpgrowth_example.json +12 -12
  400. teradataml/data/frequentpaths_example.json +29 -29
  401. teradataml/data/friends.csv +9 -9
  402. teradataml/data/fs_input.csv +33 -33
  403. teradataml/data/fs_input1.csv +33 -33
  404. teradataml/data/genData.csv +513 -513
  405. teradataml/data/geodataframe_example.json +39 -39
  406. teradataml/data/glass_types.csv +215 -0
  407. teradataml/data/glm_admissions_model.csv +12 -12
  408. teradataml/data/glm_example.json +56 -29
  409. teradataml/data/glml1l2_example.json +28 -28
  410. teradataml/data/glml1l2predict_example.json +54 -54
  411. teradataml/data/glmpredict_example.json +54 -54
  412. teradataml/data/gq_t1.csv +21 -21
  413. teradataml/data/hconvolve_complex_right.csv +5 -5
  414. teradataml/data/hconvolve_complex_rightmulti.csv +5 -5
  415. teradataml/data/histogram_example.json +11 -11
  416. teradataml/data/hmmdecoder_example.json +78 -78
  417. teradataml/data/hmmevaluator_example.json +24 -24
  418. teradataml/data/hmmsupervised_example.json +10 -10
  419. teradataml/data/hmmunsupervised_example.json +7 -7
  420. teradataml/data/house_values.csv +12 -12
  421. teradataml/data/house_values2.csv +13 -13
  422. teradataml/data/housing_cat.csv +7 -7
  423. teradataml/data/housing_data.csv +9 -9
  424. teradataml/data/housing_test.csv +47 -47
  425. teradataml/data/housing_test_binary.csv +47 -47
  426. teradataml/data/housing_train.csv +493 -493
  427. teradataml/data/housing_train_attribute.csv +4 -4
  428. teradataml/data/housing_train_binary.csv +437 -437
  429. teradataml/data/housing_train_parameter.csv +2 -2
  430. teradataml/data/housing_train_response.csv +493 -493
  431. teradataml/data/housing_train_segment.csv +201 -0
  432. teradataml/data/ibm_stock.csv +370 -370
  433. teradataml/data/ibm_stock1.csv +370 -370
  434. teradataml/data/identitymatch_example.json +21 -21
  435. teradataml/data/idf_table.csv +4 -4
  436. teradataml/data/impressions.csv +101 -101
  437. teradataml/data/inflation.csv +21 -21
  438. teradataml/data/initial.csv +3 -3
  439. teradataml/data/insect2Cols.csv +61 -0
  440. teradataml/data/insect_sprays.csv +12 -12
  441. teradataml/data/insurance.csv +1339 -1339
  442. teradataml/data/interpolator_example.json +12 -12
  443. teradataml/data/iris_altinput.csv +481 -481
  444. teradataml/data/iris_attribute_output.csv +8 -8
  445. teradataml/data/iris_attribute_test.csv +121 -121
  446. teradataml/data/iris_attribute_train.csv +481 -481
  447. teradataml/data/iris_category_expect_predict.csv +31 -31
  448. teradataml/data/iris_data.csv +151 -0
  449. teradataml/data/iris_input.csv +151 -151
  450. teradataml/data/iris_response_train.csv +121 -121
  451. teradataml/data/iris_test.csv +31 -31
  452. teradataml/data/iris_train.csv +121 -121
  453. teradataml/data/join_table1.csv +4 -4
  454. teradataml/data/join_table2.csv +4 -4
  455. teradataml/data/jsons/anly_function_name.json +6 -6
  456. teradataml/data/jsons/byom/dataikupredict.json +147 -147
  457. teradataml/data/jsons/byom/datarobotpredict.json +146 -146
  458. teradataml/data/jsons/byom/h2opredict.json +194 -194
  459. teradataml/data/jsons/byom/onnxpredict.json +186 -186
  460. teradataml/data/jsons/byom/pmmlpredict.json +146 -146
  461. teradataml/data/jsons/paired_functions.json +435 -435
  462. teradataml/data/jsons/sqle/16.20/Antiselect.json +56 -56
  463. teradataml/data/jsons/sqle/16.20/Attribution.json +249 -249
  464. teradataml/data/jsons/sqle/16.20/DecisionForestPredict.json +156 -156
  465. teradataml/data/jsons/sqle/16.20/DecisionTreePredict.json +170 -170
  466. teradataml/data/jsons/sqle/16.20/GLMPredict.json +122 -122
  467. teradataml/data/jsons/sqle/16.20/MovingAverage.json +367 -367
  468. teradataml/data/jsons/sqle/16.20/NGramSplitter.json +239 -239
  469. teradataml/data/jsons/sqle/16.20/NaiveBayesPredict.json +136 -136
  470. teradataml/data/jsons/sqle/16.20/NaiveBayesTextClassifierPredict.json +235 -235
  471. teradataml/data/jsons/sqle/16.20/Pack.json +98 -98
  472. teradataml/data/jsons/sqle/16.20/SVMSparsePredict.json +162 -162
  473. teradataml/data/jsons/sqle/16.20/Sessionize.json +105 -105
  474. teradataml/data/jsons/sqle/16.20/StringSimilarity.json +86 -86
  475. teradataml/data/jsons/sqle/16.20/Unpack.json +166 -166
  476. teradataml/data/jsons/sqle/16.20/nPath.json +269 -269
  477. teradataml/data/jsons/sqle/17.00/Antiselect.json +56 -56
  478. teradataml/data/jsons/sqle/17.00/Attribution.json +249 -249
  479. teradataml/data/jsons/sqle/17.00/DecisionForestPredict.json +156 -156
  480. teradataml/data/jsons/sqle/17.00/DecisionTreePredict.json +170 -170
  481. teradataml/data/jsons/sqle/17.00/GLMPredict.json +122 -122
  482. teradataml/data/jsons/sqle/17.00/MovingAverage.json +367 -367
  483. teradataml/data/jsons/sqle/17.00/NGramSplitter.json +239 -239
  484. teradataml/data/jsons/sqle/17.00/NaiveBayesPredict.json +136 -136
  485. teradataml/data/jsons/sqle/17.00/NaiveBayesTextClassifierPredict.json +235 -235
  486. teradataml/data/jsons/sqle/17.00/Pack.json +98 -98
  487. teradataml/data/jsons/sqle/17.00/SVMSparsePredict.json +162 -162
  488. teradataml/data/jsons/sqle/17.00/Sessionize.json +105 -105
  489. teradataml/data/jsons/sqle/17.00/StringSimilarity.json +86 -86
  490. teradataml/data/jsons/sqle/17.00/Unpack.json +166 -166
  491. teradataml/data/jsons/sqle/17.00/nPath.json +269 -269
  492. teradataml/data/jsons/sqle/17.05/Antiselect.json +56 -56
  493. teradataml/data/jsons/sqle/17.05/Attribution.json +249 -249
  494. teradataml/data/jsons/sqle/17.05/DecisionForestPredict.json +156 -156
  495. teradataml/data/jsons/sqle/17.05/DecisionTreePredict.json +170 -170
  496. teradataml/data/jsons/sqle/17.05/GLMPredict.json +122 -122
  497. teradataml/data/jsons/sqle/17.05/MovingAverage.json +367 -367
  498. teradataml/data/jsons/sqle/17.05/NGramSplitter.json +239 -239
  499. teradataml/data/jsons/sqle/17.05/NaiveBayesPredict.json +136 -136
  500. teradataml/data/jsons/sqle/17.05/NaiveBayesTextClassifierPredict.json +235 -235
  501. teradataml/data/jsons/sqle/17.05/Pack.json +98 -98
  502. teradataml/data/jsons/sqle/17.05/SVMSparsePredict.json +162 -162
  503. teradataml/data/jsons/sqle/17.05/Sessionize.json +105 -105
  504. teradataml/data/jsons/sqle/17.05/StringSimilarity.json +86 -86
  505. teradataml/data/jsons/sqle/17.05/Unpack.json +166 -166
  506. teradataml/data/jsons/sqle/17.05/nPath.json +269 -269
  507. teradataml/data/jsons/sqle/17.10/Antiselect.json +56 -56
  508. teradataml/data/jsons/sqle/17.10/Attribution.json +249 -249
  509. teradataml/data/jsons/sqle/17.10/DecisionForestPredict.json +185 -185
  510. teradataml/data/jsons/sqle/17.10/DecisionTreePredict.json +171 -171
  511. teradataml/data/jsons/sqle/17.10/GLMPredict.json +151 -151
  512. teradataml/data/jsons/sqle/17.10/MovingAverage.json +368 -368
  513. teradataml/data/jsons/sqle/17.10/NGramSplitter.json +239 -239
  514. teradataml/data/jsons/sqle/17.10/NaiveBayesPredict.json +149 -149
  515. teradataml/data/jsons/sqle/17.10/NaiveBayesTextClassifierPredict.json +288 -288
  516. teradataml/data/jsons/sqle/17.10/Pack.json +133 -133
  517. teradataml/data/jsons/sqle/17.10/SVMSparsePredict.json +193 -193
  518. teradataml/data/jsons/sqle/17.10/Sessionize.json +105 -105
  519. teradataml/data/jsons/sqle/17.10/StringSimilarity.json +86 -86
  520. teradataml/data/jsons/sqle/17.10/TD_BinCodeFit.json +239 -239
  521. teradataml/data/jsons/sqle/17.10/TD_BinCodeTransform.json +70 -70
  522. teradataml/data/jsons/sqle/17.10/TD_CategoricalSummary.json +53 -53
  523. teradataml/data/jsons/sqle/17.10/TD_Chisq.json +67 -67
  524. teradataml/data/jsons/sqle/17.10/TD_ColumnSummary.json +53 -53
  525. teradataml/data/jsons/sqle/17.10/TD_ConvertTo.json +68 -68
  526. teradataml/data/jsons/sqle/17.10/TD_FTest.json +187 -187
  527. teradataml/data/jsons/sqle/17.10/TD_FillRowID.json +51 -51
  528. teradataml/data/jsons/sqle/17.10/TD_FunctionFit.json +46 -46
  529. teradataml/data/jsons/sqle/17.10/TD_FunctionTransform.json +72 -71
  530. teradataml/data/jsons/sqle/17.10/TD_GetRowsWithMissingValues.json +52 -52
  531. teradataml/data/jsons/sqle/17.10/TD_GetRowsWithoutMissingValues.json +52 -52
  532. teradataml/data/jsons/sqle/17.10/TD_Histogram.json +132 -132
  533. teradataml/data/jsons/sqle/17.10/TD_NumApply.json +147 -147
  534. teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingFit.json +182 -182
  535. teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingTransform.json +65 -64
  536. teradataml/data/jsons/sqle/17.10/TD_OutlierFilterFit.json +196 -196
  537. teradataml/data/jsons/sqle/17.10/TD_OutlierFilterTransform.json +48 -47
  538. teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesFit.json +114 -114
  539. teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesTransform.json +72 -71
  540. teradataml/data/jsons/sqle/17.10/TD_QQNorm.json +111 -111
  541. teradataml/data/jsons/sqle/17.10/TD_RoundColumns.json +93 -93
  542. teradataml/data/jsons/sqle/17.10/TD_RowNormalizeFit.json +127 -127
  543. teradataml/data/jsons/sqle/17.10/TD_RowNormalizeTransform.json +70 -69
  544. teradataml/data/jsons/sqle/17.10/TD_ScaleFit.json +156 -156
  545. teradataml/data/jsons/sqle/17.10/TD_ScaleTransform.json +70 -69
  546. teradataml/data/jsons/sqle/17.10/TD_SimpleImputeFit.json +147 -147
  547. teradataml/data/jsons/sqle/17.10/TD_SimpleImputeTransform.json +48 -47
  548. teradataml/data/jsons/sqle/17.10/TD_StrApply.json +240 -240
  549. teradataml/data/jsons/sqle/17.10/TD_UnivariateStatistics.json +118 -118
  550. teradataml/data/jsons/sqle/17.10/TD_WhichMax.json +52 -52
  551. teradataml/data/jsons/sqle/17.10/TD_WhichMin.json +52 -52
  552. teradataml/data/jsons/sqle/17.10/TD_ZTest.json +171 -171
  553. teradataml/data/jsons/sqle/17.10/Unpack.json +188 -188
  554. teradataml/data/jsons/sqle/17.10/nPath.json +269 -269
  555. teradataml/data/jsons/sqle/17.20/Antiselect.json +56 -56
  556. teradataml/data/jsons/sqle/17.20/Attribution.json +249 -249
  557. teradataml/data/jsons/sqle/17.20/DecisionForestPredict.json +185 -185
  558. teradataml/data/jsons/sqle/17.20/DecisionTreePredict.json +172 -172
  559. teradataml/data/jsons/sqle/17.20/GLMPredict.json +151 -151
  560. teradataml/data/jsons/sqle/17.20/MovingAverage.json +367 -367
  561. teradataml/data/jsons/sqle/17.20/NGramSplitter.json +239 -239
  562. teradataml/data/jsons/sqle/17.20/NaiveBayesPredict.json +149 -149
  563. teradataml/data/jsons/sqle/17.20/NaiveBayesTextClassifierPredict.json +287 -287
  564. teradataml/data/jsons/sqle/17.20/Pack.json +133 -133
  565. teradataml/data/jsons/sqle/17.20/SVMSparsePredict.json +192 -192
  566. teradataml/data/jsons/sqle/17.20/Sessionize.json +105 -105
  567. teradataml/data/jsons/sqle/17.20/StringSimilarity.json +86 -86
  568. teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +148 -76
  569. teradataml/data/jsons/sqle/17.20/TD_BinCodeFit.json +239 -239
  570. teradataml/data/jsons/sqle/17.20/TD_BinCodeTransform.json +71 -71
  571. teradataml/data/jsons/sqle/17.20/TD_CategoricalSummary.json +53 -53
  572. teradataml/data/jsons/sqle/17.20/TD_Chisq.json +67 -67
  573. teradataml/data/jsons/sqle/17.20/TD_ClassificationEvaluator.json +145 -145
  574. teradataml/data/jsons/sqle/17.20/TD_ColumnSummary.json +53 -53
  575. teradataml/data/jsons/sqle/17.20/TD_ColumnTransformer.json +218 -218
  576. teradataml/data/jsons/sqle/17.20/TD_ConvertTo.json +92 -92
  577. teradataml/data/jsons/sqle/17.20/TD_DecisionForest.json +259 -259
  578. teradataml/data/jsons/sqle/17.20/TD_DecisionForestPredict.json +139 -139
  579. teradataml/data/jsons/sqle/17.20/TD_FTest.json +269 -186
  580. teradataml/data/jsons/sqle/17.20/TD_FillRowID.json +52 -52
  581. teradataml/data/jsons/sqle/17.20/TD_FunctionFit.json +46 -46
  582. teradataml/data/jsons/sqle/17.20/TD_FunctionTransform.json +72 -72
  583. teradataml/data/jsons/sqle/17.20/TD_GLM.json +507 -431
  584. teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +168 -125
  585. teradataml/data/jsons/sqle/17.20/TD_GLMPerSegment.json +411 -411
  586. teradataml/data/jsons/sqle/17.20/TD_GLMPredictPerSegment.json +146 -146
  587. teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +93 -91
  588. teradataml/data/jsons/sqle/17.20/TD_GetRowsWithMissingValues.json +76 -76
  589. teradataml/data/jsons/sqle/17.20/TD_GetRowsWithoutMissingValues.json +76 -76
  590. teradataml/data/jsons/sqle/17.20/TD_Histogram.json +152 -152
  591. teradataml/data/jsons/sqle/17.20/TD_KMeans.json +231 -211
  592. teradataml/data/jsons/sqle/17.20/TD_KMeansPredict.json +86 -86
  593. teradataml/data/jsons/sqle/17.20/TD_KNN.json +262 -262
  594. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesTextClassifierTrainer.json +137 -137
  595. teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +102 -101
  596. teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineTransform.json +71 -71
  597. teradataml/data/jsons/sqle/17.20/TD_NumApply.json +147 -147
  598. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +315 -315
  599. teradataml/data/jsons/sqle/17.20/TD_OneClassSVMPredict.json +123 -123
  600. teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingFit.json +271 -271
  601. teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingTransform.json +65 -65
  602. teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingFit.json +229 -229
  603. teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingTransform.json +75 -75
  604. teradataml/data/jsons/sqle/17.20/TD_OutlierFilterFit.json +217 -217
  605. teradataml/data/jsons/sqle/17.20/TD_OutlierFilterTransform.json +48 -48
  606. teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesFit.json +114 -114
  607. teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesTransform.json +72 -72
  608. teradataml/data/jsons/sqle/17.20/TD_QQNorm.json +111 -111
  609. teradataml/data/jsons/sqle/17.20/TD_ROC.json +178 -177
  610. teradataml/data/jsons/sqle/17.20/TD_RandomProjectionFit.json +178 -178
  611. teradataml/data/jsons/sqle/17.20/TD_RandomProjectionMinComponents.json +73 -73
  612. teradataml/data/jsons/sqle/17.20/TD_RandomProjectionTransform.json +74 -74
  613. teradataml/data/jsons/sqle/17.20/TD_RegressionEvaluator.json +137 -137
  614. teradataml/data/jsons/sqle/17.20/TD_RoundColumns.json +93 -93
  615. teradataml/data/jsons/sqle/17.20/TD_RowNormalizeFit.json +127 -127
  616. teradataml/data/jsons/sqle/17.20/TD_RowNormalizeTransform.json +70 -70
  617. teradataml/data/jsons/sqle/17.20/TD_SVM.json +389 -389
  618. teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +142 -124
  619. teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +309 -156
  620. teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +119 -70
  621. teradataml/data/jsons/sqle/17.20/TD_SentimentExtractor.json +193 -193
  622. teradataml/data/jsons/sqle/17.20/TD_Silhouette.json +142 -142
  623. teradataml/data/jsons/sqle/17.20/TD_SimpleImputeFit.json +147 -147
  624. teradataml/data/jsons/sqle/17.20/TD_SimpleImputeTransform.json +48 -48
  625. teradataml/data/jsons/sqle/17.20/TD_StrApply.json +240 -240
  626. teradataml/data/jsons/sqle/17.20/TD_TargetEncodingFit.json +248 -248
  627. teradataml/data/jsons/sqle/17.20/TD_TargetEncodingTransform.json +75 -75
  628. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +192 -192
  629. teradataml/data/jsons/sqle/17.20/TD_TrainTestSplit.json +142 -142
  630. teradataml/data/jsons/sqle/17.20/TD_UnivariateStatistics.json +117 -117
  631. teradataml/data/jsons/sqle/17.20/TD_VectorDistance.json +182 -182
  632. teradataml/data/jsons/sqle/17.20/TD_WhichMax.json +52 -52
  633. teradataml/data/jsons/sqle/17.20/TD_WhichMin.json +52 -52
  634. teradataml/data/jsons/sqle/17.20/TD_WordEmbeddings.json +241 -241
  635. teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +330 -312
  636. teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +195 -182
  637. teradataml/data/jsons/sqle/17.20/TD_ZTest.json +247 -170
  638. teradataml/data/jsons/sqle/17.20/Unpack.json +188 -188
  639. teradataml/data/jsons/sqle/17.20/nPath.json +269 -269
  640. teradataml/data/jsons/tableoperator/17.00/read_nos.json +197 -197
  641. teradataml/data/jsons/tableoperator/17.05/read_nos.json +197 -197
  642. teradataml/data/jsons/tableoperator/17.05/write_nos.json +194 -194
  643. teradataml/data/jsons/tableoperator/17.10/read_nos.json +183 -183
  644. teradataml/data/jsons/tableoperator/17.10/write_nos.json +194 -194
  645. teradataml/data/jsons/tableoperator/17.20/read_nos.json +182 -182
  646. teradataml/data/jsons/tableoperator/17.20/write_nos.json +223 -223
  647. teradataml/data/jsons/uaf/17.20/TD_ACF.json +149 -149
  648. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +409 -409
  649. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +79 -79
  650. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +151 -151
  651. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +109 -109
  652. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +107 -107
  653. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +87 -87
  654. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +106 -106
  655. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +80 -80
  656. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +67 -67
  657. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +91 -91
  658. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +136 -136
  659. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +148 -148
  660. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +108 -108
  661. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +109 -109
  662. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +86 -86
  663. teradataml/data/jsons/uaf/17.20/TD_DIFF.json +91 -91
  664. teradataml/data/jsons/uaf/17.20/TD_DTW.json +116 -116
  665. teradataml/data/jsons/uaf/17.20/TD_DURBIN_WATSON.json +100 -100
  666. teradataml/data/jsons/uaf/17.20/TD_EXTRACT_RESULTS.json +38 -38
  667. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +100 -100
  668. teradataml/data/jsons/uaf/17.20/TD_GENSERIES4FORMULA.json +84 -84
  669. teradataml/data/jsons/uaf/17.20/TD_GENSERIES4SINUSOIDS.json +70 -70
  670. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +152 -152
  671. teradataml/data/jsons/uaf/17.20/TD_HOLT_WINTERS_FORECAST.json +313 -313
  672. teradataml/data/jsons/uaf/17.20/TD_IDFFT.json +57 -57
  673. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +94 -94
  674. teradataml/data/jsons/uaf/17.20/TD_INPUTVALIDATOR.json +63 -63
  675. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +181 -181
  676. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +102 -102
  677. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +182 -182
  678. teradataml/data/jsons/uaf/17.20/TD_MATRIXMULTIPLY.json +67 -67
  679. teradataml/data/jsons/uaf/17.20/TD_MINFO.json +66 -66
  680. teradataml/data/jsons/uaf/17.20/TD_MULTIVAR_REGR.json +178 -178
  681. teradataml/data/jsons/uaf/17.20/TD_PACF.json +114 -114
  682. teradataml/data/jsons/uaf/17.20/TD_PORTMAN.json +118 -118
  683. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +175 -175
  684. teradataml/data/jsons/uaf/17.20/TD_POWERTRANSFORM.json +97 -97
  685. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +173 -173
  686. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +136 -136
  687. teradataml/data/jsons/uaf/17.20/TD_SELECTION_CRITERIA.json +89 -89
  688. teradataml/data/jsons/uaf/17.20/TD_SIGNIF_PERIODICITIES.json +79 -79
  689. teradataml/data/jsons/uaf/17.20/TD_SIGNIF_RESIDMEAN.json +67 -67
  690. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +184 -184
  691. teradataml/data/jsons/uaf/17.20/TD_SINFO.json +57 -57
  692. teradataml/data/jsons/uaf/17.20/TD_SMOOTHMA.json +162 -162
  693. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +100 -100
  694. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +111 -111
  695. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +95 -95
  696. teradataml/data/jsons/uaf/17.20/TD_WHITES_GENERAL.json +77 -77
  697. teradataml/data/kmeans_example.json +22 -17
  698. teradataml/data/kmeans_table.csv +10 -0
  699. teradataml/data/kmeans_us_arrests_data.csv +0 -0
  700. teradataml/data/knn_example.json +18 -18
  701. teradataml/data/knnrecommender_example.json +6 -6
  702. teradataml/data/knnrecommenderpredict_example.json +12 -12
  703. teradataml/data/lar_example.json +17 -17
  704. teradataml/data/larpredict_example.json +30 -30
  705. teradataml/data/lc_new_predictors.csv +5 -5
  706. teradataml/data/lc_new_reference.csv +9 -9
  707. teradataml/data/lda_example.json +8 -8
  708. teradataml/data/ldainference_example.json +14 -14
  709. teradataml/data/ldatopicsummary_example.json +8 -8
  710. teradataml/data/levendist_input.csv +13 -13
  711. teradataml/data/levenshteindistance_example.json +10 -10
  712. teradataml/data/linreg_example.json +9 -9
  713. teradataml/data/load_example_data.py +326 -323
  714. teradataml/data/loan_prediction.csv +295 -295
  715. teradataml/data/lungcancer.csv +138 -138
  716. teradataml/data/mappingdata.csv +12 -12
  717. teradataml/data/milk_timeseries.csv +157 -157
  718. teradataml/data/min_max_titanic.csv +4 -4
  719. teradataml/data/minhash_example.json +6 -6
  720. teradataml/data/ml_ratings.csv +7547 -7547
  721. teradataml/data/ml_ratings_10.csv +2445 -2445
  722. teradataml/data/model1_table.csv +5 -5
  723. teradataml/data/model2_table.csv +5 -5
  724. teradataml/data/models/iris_db_glm_model.pmml +56 -56
  725. teradataml/data/models/iris_db_xgb_model.pmml +4471 -4471
  726. teradataml/data/modularity_example.json +12 -12
  727. teradataml/data/movavg_example.json +7 -7
  728. teradataml/data/mtx1.csv +7 -7
  729. teradataml/data/mtx2.csv +13 -13
  730. teradataml/data/multi_model_classification.csv +401 -0
  731. teradataml/data/multi_model_regression.csv +401 -0
  732. teradataml/data/mvdfft8.csv +9 -9
  733. teradataml/data/naivebayes_example.json +9 -9
  734. teradataml/data/naivebayespredict_example.json +19 -19
  735. teradataml/data/naivebayestextclassifier2_example.json +6 -6
  736. teradataml/data/naivebayestextclassifier_example.json +8 -8
  737. teradataml/data/naivebayestextclassifierpredict_example.json +20 -20
  738. teradataml/data/name_Find_configure.csv +10 -10
  739. teradataml/data/namedentityfinder_example.json +14 -14
  740. teradataml/data/namedentityfinderevaluator_example.json +10 -10
  741. teradataml/data/namedentityfindertrainer_example.json +6 -6
  742. teradataml/data/nb_iris_input_test.csv +31 -31
  743. teradataml/data/nb_iris_input_train.csv +121 -121
  744. teradataml/data/nbp_iris_model.csv +13 -13
  745. teradataml/data/ner_extractor_text.csv +2 -2
  746. teradataml/data/ner_sports_test2.csv +29 -29
  747. teradataml/data/ner_sports_train.csv +501 -501
  748. teradataml/data/nerevaluator_example.json +5 -5
  749. teradataml/data/nerextractor_example.json +18 -18
  750. teradataml/data/nermem_sports_test.csv +17 -17
  751. teradataml/data/nermem_sports_train.csv +50 -50
  752. teradataml/data/nertrainer_example.json +6 -6
  753. teradataml/data/ngrams_example.json +6 -6
  754. teradataml/data/notebooks/sqlalchemy/Teradata Vantage Aggregate Functions using SQLAlchemy.ipynb +1455 -1455
  755. teradataml/data/notebooks/sqlalchemy/Teradata Vantage Arithmetic Functions Using SQLAlchemy.ipynb +1993 -1993
  756. teradataml/data/notebooks/sqlalchemy/Teradata Vantage Bit-Byte Manipulation Functions using SQLAlchemy.ipynb +1492 -1492
  757. teradataml/data/notebooks/sqlalchemy/Teradata Vantage Built-in functions using SQLAlchemy.ipynb +536 -536
  758. teradataml/data/notebooks/sqlalchemy/Teradata Vantage Regular Expressions Using SQLAlchemy.ipynb +570 -570
  759. teradataml/data/notebooks/sqlalchemy/Teradata Vantage String Functions Using SQLAlchemy.ipynb +2559 -2559
  760. teradataml/data/notebooks/sqlalchemy/Teradata Vantage Window Aggregate Functions using SQLAlchemy.ipynb +2911 -2911
  761. teradataml/data/notebooks/sqlalchemy/Using Generic SQLAlchemy ClauseElements teradataml DataFrame assign method.ipynb +698 -698
  762. teradataml/data/notebooks/sqlalchemy/teradataml filtering using SQLAlchemy ClauseElements.ipynb +784 -784
  763. teradataml/data/npath_example.json +23 -23
  764. teradataml/data/ntree_example.json +14 -14
  765. teradataml/data/numeric_strings.csv +4 -4
  766. teradataml/data/numerics.csv +4 -4
  767. teradataml/data/ocean_buoy.csv +17 -17
  768. teradataml/data/ocean_buoy2.csv +17 -17
  769. teradataml/data/ocean_buoys.csv +27 -27
  770. teradataml/data/ocean_buoys2.csv +10 -10
  771. teradataml/data/ocean_buoys_nonpti.csv +28 -28
  772. teradataml/data/ocean_buoys_seq.csv +29 -29
  773. teradataml/data/onehot_encoder_train.csv +4 -0
  774. teradataml/data/openml_example.json +92 -0
  775. teradataml/data/optional_event_table.csv +4 -4
  776. teradataml/data/orders1.csv +11 -11
  777. teradataml/data/orders1_12.csv +12 -12
  778. teradataml/data/orders_ex.csv +4 -4
  779. teradataml/data/pack_example.json +8 -8
  780. teradataml/data/package_tracking.csv +19 -19
  781. teradataml/data/package_tracking_pti.csv +18 -18
  782. teradataml/data/pagerank_example.json +13 -13
  783. teradataml/data/paragraphs_input.csv +6 -6
  784. teradataml/data/pathanalyzer_example.json +7 -7
  785. teradataml/data/pathgenerator_example.json +7 -7
  786. teradataml/data/phrases.csv +7 -7
  787. teradataml/data/pivot_example.json +8 -8
  788. teradataml/data/pivot_input.csv +22 -22
  789. teradataml/data/playerRating.csv +31 -31
  790. teradataml/data/postagger_example.json +6 -6
  791. teradataml/data/posttagger_output.csv +44 -44
  792. teradataml/data/production_data.csv +16 -16
  793. teradataml/data/production_data2.csv +7 -7
  794. teradataml/data/randomsample_example.json +31 -31
  795. teradataml/data/randomwalksample_example.json +8 -8
  796. teradataml/data/rank_table.csv +6 -6
  797. teradataml/data/ref_mobile_data.csv +4 -4
  798. teradataml/data/ref_mobile_data_dense.csv +2 -2
  799. teradataml/data/ref_url.csv +17 -17
  800. teradataml/data/restaurant_reviews.csv +7 -7
  801. teradataml/data/river_data.csv +145 -145
  802. teradataml/data/roc_example.json +7 -7
  803. teradataml/data/roc_input.csv +101 -101
  804. teradataml/data/rule_inputs.csv +6 -6
  805. teradataml/data/rule_table.csv +2 -2
  806. teradataml/data/sales.csv +7 -7
  807. teradataml/data/sales_transaction.csv +501 -501
  808. teradataml/data/salesdata.csv +342 -342
  809. teradataml/data/sample_cities.csv +2 -2
  810. teradataml/data/sample_shapes.csv +10 -10
  811. teradataml/data/sample_streets.csv +2 -2
  812. teradataml/data/sampling_example.json +15 -15
  813. teradataml/data/sax_example.json +8 -8
  814. teradataml/data/scale_attributes.csv +3 -0
  815. teradataml/data/scale_example.json +74 -23
  816. teradataml/data/scale_housing.csv +11 -11
  817. teradataml/data/scale_housing_test.csv +6 -6
  818. teradataml/data/scale_input_part_sparse.csv +31 -0
  819. teradataml/data/scale_input_partitioned.csv +16 -0
  820. teradataml/data/scale_input_sparse.csv +11 -0
  821. teradataml/data/scale_parameters.csv +3 -0
  822. teradataml/data/scale_stat.csv +11 -11
  823. teradataml/data/scalebypartition_example.json +13 -13
  824. teradataml/data/scalemap_example.json +13 -13
  825. teradataml/data/scalesummary_example.json +12 -12
  826. teradataml/data/score_category.csv +101 -101
  827. teradataml/data/score_summary.csv +4 -4
  828. teradataml/data/script_example.json +9 -9
  829. teradataml/data/scripts/deploy_script.py +84 -0
  830. teradataml/data/scripts/mapper.R +20 -0
  831. teradataml/data/scripts/mapper.py +15 -15
  832. teradataml/data/scripts/mapper_replace.py +15 -15
  833. teradataml/data/scripts/sklearn/__init__.py +0 -0
  834. teradataml/data/scripts/sklearn/sklearn_fit.py +171 -0
  835. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +127 -0
  836. teradataml/data/scripts/sklearn/sklearn_function.template +108 -0
  837. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +148 -0
  838. teradataml/data/scripts/sklearn/sklearn_neighbors.py +143 -0
  839. teradataml/data/scripts/sklearn/sklearn_score.py +119 -0
  840. teradataml/data/scripts/sklearn/sklearn_transform.py +171 -0
  841. teradataml/data/seeds.csv +10 -10
  842. teradataml/data/sentenceextractor_example.json +6 -6
  843. teradataml/data/sentiment_extract_input.csv +11 -11
  844. teradataml/data/sentiment_train.csv +16 -16
  845. teradataml/data/sentiment_word.csv +20 -20
  846. teradataml/data/sentiment_word_input.csv +19 -19
  847. teradataml/data/sentimentextractor_example.json +24 -24
  848. teradataml/data/sentimenttrainer_example.json +8 -8
  849. teradataml/data/sequence_table.csv +10 -10
  850. teradataml/data/seriessplitter_example.json +7 -7
  851. teradataml/data/sessionize_example.json +17 -17
  852. teradataml/data/sessionize_table.csv +116 -116
  853. teradataml/data/setop_test1.csv +24 -24
  854. teradataml/data/setop_test2.csv +22 -22
  855. teradataml/data/soc_nw_edges.csv +10 -10
  856. teradataml/data/soc_nw_vertices.csv +7 -7
  857. teradataml/data/souvenir_timeseries.csv +167 -167
  858. teradataml/data/sparse_iris_attribute.csv +5 -5
  859. teradataml/data/sparse_iris_test.csv +121 -121
  860. teradataml/data/sparse_iris_train.csv +601 -601
  861. teradataml/data/star1.csv +6 -6
  862. teradataml/data/state_transition.csv +5 -5
  863. teradataml/data/stock_data.csv +53 -53
  864. teradataml/data/stock_movement.csv +11 -11
  865. teradataml/data/stock_vol.csv +76 -76
  866. teradataml/data/stop_words.csv +8 -8
  867. teradataml/data/store_sales.csv +37 -37
  868. teradataml/data/stringsimilarity_example.json +7 -7
  869. teradataml/data/strsimilarity_input.csv +13 -13
  870. teradataml/data/students.csv +101 -101
  871. teradataml/data/svm_iris_input_test.csv +121 -121
  872. teradataml/data/svm_iris_input_train.csv +481 -481
  873. teradataml/data/svm_iris_model.csv +7 -7
  874. teradataml/data/svmdense_example.json +9 -9
  875. teradataml/data/svmdensepredict_example.json +18 -18
  876. teradataml/data/svmsparse_example.json +7 -7
  877. teradataml/data/svmsparsepredict_example.json +13 -13
  878. teradataml/data/svmsparsesummary_example.json +7 -7
  879. teradataml/data/target_mobile_data.csv +13 -13
  880. teradataml/data/target_mobile_data_dense.csv +5 -5
  881. teradataml/data/templatedata.csv +1201 -1201
  882. teradataml/data/templates/open_source_ml.json +9 -0
  883. teradataml/data/teradataml_example.json +150 -1
  884. teradataml/data/test_classification.csv +101 -0
  885. teradataml/data/test_loan_prediction.csv +53 -53
  886. teradataml/data/test_pacf_12.csv +37 -37
  887. teradataml/data/test_prediction.csv +101 -0
  888. teradataml/data/test_regression.csv +101 -0
  889. teradataml/data/test_river2.csv +109 -109
  890. teradataml/data/text_inputs.csv +6 -6
  891. teradataml/data/textchunker_example.json +7 -7
  892. teradataml/data/textclassifier_example.json +6 -6
  893. teradataml/data/textclassifier_input.csv +7 -7
  894. teradataml/data/textclassifiertrainer_example.json +6 -6
  895. teradataml/data/textmorph_example.json +5 -5
  896. teradataml/data/textparser_example.json +15 -15
  897. teradataml/data/texttagger_example.json +11 -11
  898. teradataml/data/texttokenizer_example.json +6 -6
  899. teradataml/data/texttrainer_input.csv +11 -11
  900. teradataml/data/tf_example.json +6 -6
  901. teradataml/data/tfidf_example.json +13 -13
  902. teradataml/data/tfidf_input1.csv +201 -201
  903. teradataml/data/tfidf_train.csv +6 -6
  904. teradataml/data/time_table1.csv +535 -535
  905. teradataml/data/time_table2.csv +14 -14
  906. teradataml/data/timeseriesdata.csv +1601 -1601
  907. teradataml/data/timeseriesdatasetsd4.csv +105 -105
  908. teradataml/data/titanic.csv +892 -892
  909. teradataml/data/token_table.csv +696 -696
  910. teradataml/data/train_multiclass.csv +101 -0
  911. teradataml/data/train_regression.csv +101 -0
  912. teradataml/data/train_regression_multiple_labels.csv +101 -0
  913. teradataml/data/train_tracking.csv +27 -27
  914. teradataml/data/transformation_table.csv +5 -5
  915. teradataml/data/transformation_table_new.csv +1 -1
  916. teradataml/data/tv_spots.csv +16 -16
  917. teradataml/data/twod_climate_data.csv +117 -117
  918. teradataml/data/uaf_example.json +475 -475
  919. teradataml/data/univariatestatistics_example.json +8 -8
  920. teradataml/data/unpack_example.json +9 -9
  921. teradataml/data/unpivot_example.json +9 -9
  922. teradataml/data/unpivot_input.csv +8 -8
  923. teradataml/data/us_air_pass.csv +36 -36
  924. teradataml/data/us_population.csv +624 -624
  925. teradataml/data/us_states_shapes.csv +52 -52
  926. teradataml/data/varmax_example.json +17 -17
  927. teradataml/data/vectordistance_example.json +25 -25
  928. teradataml/data/ville_climatedata.csv +121 -121
  929. teradataml/data/ville_tempdata.csv +12 -12
  930. teradataml/data/ville_tempdata1.csv +12 -12
  931. teradataml/data/ville_temperature.csv +11 -11
  932. teradataml/data/waveletTable.csv +1605 -1605
  933. teradataml/data/waveletTable2.csv +1605 -1605
  934. teradataml/data/weightedmovavg_example.json +8 -8
  935. teradataml/data/wft_testing.csv +5 -5
  936. teradataml/data/wine_data.csv +1600 -0
  937. teradataml/data/word_embed_input_table1.csv +5 -5
  938. teradataml/data/word_embed_input_table2.csv +4 -4
  939. teradataml/data/word_embed_model.csv +22 -22
  940. teradataml/data/words_input.csv +13 -13
  941. teradataml/data/xconvolve_complex_left.csv +6 -6
  942. teradataml/data/xconvolve_complex_leftmulti.csv +6 -6
  943. teradataml/data/xgboost_example.json +35 -35
  944. teradataml/data/xgboostpredict_example.json +31 -31
  945. teradataml/data/ztest_example.json +16 -0
  946. teradataml/dataframe/copy_to.py +1769 -1698
  947. teradataml/dataframe/data_transfer.py +2812 -2745
  948. teradataml/dataframe/dataframe.py +17630 -16946
  949. teradataml/dataframe/dataframe_utils.py +1875 -1740
  950. teradataml/dataframe/fastload.py +794 -603
  951. teradataml/dataframe/indexer.py +424 -424
  952. teradataml/dataframe/setop.py +1179 -1166
  953. teradataml/dataframe/sql.py +10174 -6432
  954. teradataml/dataframe/sql_function_parameters.py +439 -388
  955. teradataml/dataframe/sql_functions.py +652 -652
  956. teradataml/dataframe/sql_interfaces.py +220 -220
  957. teradataml/dataframe/vantage_function_types.py +674 -630
  958. teradataml/dataframe/window.py +693 -692
  959. teradataml/dbutils/__init__.py +3 -3
  960. teradataml/dbutils/dbutils.py +1167 -1150
  961. teradataml/dbutils/filemgr.py +267 -267
  962. teradataml/gen_ai/__init__.py +2 -2
  963. teradataml/gen_ai/convAI.py +472 -472
  964. teradataml/geospatial/__init__.py +3 -3
  965. teradataml/geospatial/geodataframe.py +1105 -1094
  966. teradataml/geospatial/geodataframecolumn.py +392 -387
  967. teradataml/geospatial/geometry_types.py +925 -925
  968. teradataml/hyperparameter_tuner/__init__.py +1 -1
  969. teradataml/hyperparameter_tuner/optimizer.py +3783 -2993
  970. teradataml/hyperparameter_tuner/utils.py +281 -187
  971. teradataml/lib/aed_0_1.dll +0 -0
  972. teradataml/lib/libaed_0_1.dylib +0 -0
  973. teradataml/lib/libaed_0_1.so +0 -0
  974. teradataml/libaed_0_1.dylib +0 -0
  975. teradataml/libaed_0_1.so +0 -0
  976. teradataml/opensource/__init__.py +1 -0
  977. teradataml/opensource/sklearn/__init__.py +1 -0
  978. teradataml/opensource/sklearn/_class.py +255 -0
  979. teradataml/opensource/sklearn/_sklearn_wrapper.py +1715 -0
  980. teradataml/opensource/sklearn/_wrapper_utils.py +268 -0
  981. teradataml/opensource/sklearn/constants.py +54 -0
  982. teradataml/options/__init__.py +130 -124
  983. teradataml/options/configure.py +358 -336
  984. teradataml/options/display.py +176 -176
  985. teradataml/plot/__init__.py +2 -2
  986. teradataml/plot/axis.py +1388 -1388
  987. teradataml/plot/constants.py +15 -15
  988. teradataml/plot/figure.py +398 -398
  989. teradataml/plot/plot.py +760 -760
  990. teradataml/plot/query_generator.py +83 -83
  991. teradataml/plot/subplot.py +216 -216
  992. teradataml/scriptmgmt/UserEnv.py +3791 -3761
  993. teradataml/scriptmgmt/__init__.py +3 -3
  994. teradataml/scriptmgmt/lls_utils.py +1719 -1604
  995. teradataml/series/series.py +532 -532
  996. teradataml/series/series_utils.py +71 -71
  997. teradataml/table_operators/Apply.py +949 -917
  998. teradataml/table_operators/Script.py +1718 -1982
  999. teradataml/table_operators/TableOperator.py +1255 -1616
  1000. teradataml/table_operators/__init__.py +2 -3
  1001. teradataml/table_operators/apply_query_generator.py +262 -262
  1002. teradataml/table_operators/query_generator.py +507 -507
  1003. teradataml/table_operators/table_operator_query_generator.py +460 -460
  1004. teradataml/table_operators/table_operator_util.py +631 -639
  1005. teradataml/table_operators/templates/dataframe_apply.template +184 -184
  1006. teradataml/table_operators/templates/dataframe_map.template +176 -176
  1007. teradataml/table_operators/templates/script_executor.template +170 -170
  1008. teradataml/utils/dtypes.py +684 -684
  1009. teradataml/utils/internal_buffer.py +84 -84
  1010. teradataml/utils/print_versions.py +205 -205
  1011. teradataml/utils/utils.py +410 -410
  1012. teradataml/utils/validators.py +2277 -2115
  1013. {teradataml-17.20.0.7.dist-info → teradataml-20.0.0.1.dist-info}/METADATA +346 -45
  1014. teradataml-20.0.0.1.dist-info/RECORD +1056 -0
  1015. {teradataml-17.20.0.7.dist-info → teradataml-20.0.0.1.dist-info}/WHEEL +1 -1
  1016. {teradataml-17.20.0.7.dist-info → teradataml-20.0.0.1.dist-info}/zip-safe +1 -1
  1017. teradataml/analytics/mle/AdaBoost.py +0 -651
  1018. teradataml/analytics/mle/AdaBoostPredict.py +0 -564
  1019. teradataml/analytics/mle/Antiselect.py +0 -342
  1020. teradataml/analytics/mle/Arima.py +0 -641
  1021. teradataml/analytics/mle/ArimaPredict.py +0 -477
  1022. teradataml/analytics/mle/Attribution.py +0 -1070
  1023. teradataml/analytics/mle/Betweenness.py +0 -658
  1024. teradataml/analytics/mle/Burst.py +0 -711
  1025. teradataml/analytics/mle/CCM.py +0 -600
  1026. teradataml/analytics/mle/CCMPrepare.py +0 -324
  1027. teradataml/analytics/mle/CFilter.py +0 -460
  1028. teradataml/analytics/mle/ChangePointDetection.py +0 -572
  1029. teradataml/analytics/mle/ChangePointDetectionRT.py +0 -477
  1030. teradataml/analytics/mle/Closeness.py +0 -737
  1031. teradataml/analytics/mle/ConfusionMatrix.py +0 -420
  1032. teradataml/analytics/mle/Correlation.py +0 -477
  1033. teradataml/analytics/mle/Correlation2.py +0 -573
  1034. teradataml/analytics/mle/CoxHazardRatio.py +0 -679
  1035. teradataml/analytics/mle/CoxPH.py +0 -556
  1036. teradataml/analytics/mle/CoxSurvival.py +0 -478
  1037. teradataml/analytics/mle/CumulativeMovAvg.py +0 -363
  1038. teradataml/analytics/mle/DTW.py +0 -623
  1039. teradataml/analytics/mle/DWT.py +0 -564
  1040. teradataml/analytics/mle/DWT2D.py +0 -599
  1041. teradataml/analytics/mle/DecisionForest.py +0 -716
  1042. teradataml/analytics/mle/DecisionForestEvaluator.py +0 -363
  1043. teradataml/analytics/mle/DecisionForestPredict.py +0 -561
  1044. teradataml/analytics/mle/DecisionTree.py +0 -830
  1045. teradataml/analytics/mle/DecisionTreePredict.py +0 -528
  1046. teradataml/analytics/mle/ExponentialMovAvg.py +0 -418
  1047. teradataml/analytics/mle/FMeasure.py +0 -402
  1048. teradataml/analytics/mle/FPGrowth.py +0 -734
  1049. teradataml/analytics/mle/FrequentPaths.py +0 -695
  1050. teradataml/analytics/mle/GLM.py +0 -558
  1051. teradataml/analytics/mle/GLML1L2.py +0 -547
  1052. teradataml/analytics/mle/GLML1L2Predict.py +0 -519
  1053. teradataml/analytics/mle/GLMPredict.py +0 -529
  1054. teradataml/analytics/mle/HMMDecoder.py +0 -945
  1055. teradataml/analytics/mle/HMMEvaluator.py +0 -901
  1056. teradataml/analytics/mle/HMMSupervised.py +0 -521
  1057. teradataml/analytics/mle/HMMUnsupervised.py +0 -572
  1058. teradataml/analytics/mle/Histogram.py +0 -561
  1059. teradataml/analytics/mle/IDWT.py +0 -476
  1060. teradataml/analytics/mle/IDWT2D.py +0 -493
  1061. teradataml/analytics/mle/IdentityMatch.py +0 -763
  1062. teradataml/analytics/mle/Interpolator.py +0 -918
  1063. teradataml/analytics/mle/KMeans.py +0 -485
  1064. teradataml/analytics/mle/KNN.py +0 -627
  1065. teradataml/analytics/mle/KNNRecommender.py +0 -488
  1066. teradataml/analytics/mle/KNNRecommenderPredict.py +0 -581
  1067. teradataml/analytics/mle/LAR.py +0 -439
  1068. teradataml/analytics/mle/LARPredict.py +0 -478
  1069. teradataml/analytics/mle/LDA.py +0 -548
  1070. teradataml/analytics/mle/LDAInference.py +0 -492
  1071. teradataml/analytics/mle/LDATopicSummary.py +0 -464
  1072. teradataml/analytics/mle/LevenshteinDistance.py +0 -450
  1073. teradataml/analytics/mle/LinReg.py +0 -433
  1074. teradataml/analytics/mle/LinRegPredict.py +0 -438
  1075. teradataml/analytics/mle/MinHash.py +0 -544
  1076. teradataml/analytics/mle/Modularity.py +0 -587
  1077. teradataml/analytics/mle/NEREvaluator.py +0 -410
  1078. teradataml/analytics/mle/NERExtractor.py +0 -595
  1079. teradataml/analytics/mle/NERTrainer.py +0 -458
  1080. teradataml/analytics/mle/NGrams.py +0 -570
  1081. teradataml/analytics/mle/NPath.py +0 -634
  1082. teradataml/analytics/mle/NTree.py +0 -549
  1083. teradataml/analytics/mle/NaiveBayes.py +0 -462
  1084. teradataml/analytics/mle/NaiveBayesPredict.py +0 -513
  1085. teradataml/analytics/mle/NaiveBayesTextClassifier.py +0 -607
  1086. teradataml/analytics/mle/NaiveBayesTextClassifier2.py +0 -531
  1087. teradataml/analytics/mle/NaiveBayesTextClassifierPredict.py +0 -799
  1088. teradataml/analytics/mle/NamedEntityFinder.py +0 -529
  1089. teradataml/analytics/mle/NamedEntityFinderEvaluator.py +0 -414
  1090. teradataml/analytics/mle/NamedEntityFinderTrainer.py +0 -396
  1091. teradataml/analytics/mle/POSTagger.py +0 -417
  1092. teradataml/analytics/mle/Pack.py +0 -411
  1093. teradataml/analytics/mle/PageRank.py +0 -535
  1094. teradataml/analytics/mle/PathAnalyzer.py +0 -426
  1095. teradataml/analytics/mle/PathGenerator.py +0 -367
  1096. teradataml/analytics/mle/PathStart.py +0 -464
  1097. teradataml/analytics/mle/PathSummarizer.py +0 -470
  1098. teradataml/analytics/mle/Pivot.py +0 -471
  1099. teradataml/analytics/mle/ROC.py +0 -425
  1100. teradataml/analytics/mle/RandomSample.py +0 -637
  1101. teradataml/analytics/mle/RandomWalkSample.py +0 -490
  1102. teradataml/analytics/mle/SAX.py +0 -779
  1103. teradataml/analytics/mle/SVMDense.py +0 -677
  1104. teradataml/analytics/mle/SVMDensePredict.py +0 -536
  1105. teradataml/analytics/mle/SVMDenseSummary.py +0 -437
  1106. teradataml/analytics/mle/SVMSparse.py +0 -557
  1107. teradataml/analytics/mle/SVMSparsePredict.py +0 -553
  1108. teradataml/analytics/mle/SVMSparseSummary.py +0 -435
  1109. teradataml/analytics/mle/Sampling.py +0 -549
  1110. teradataml/analytics/mle/Scale.py +0 -565
  1111. teradataml/analytics/mle/ScaleByPartition.py +0 -496
  1112. teradataml/analytics/mle/ScaleMap.py +0 -378
  1113. teradataml/analytics/mle/ScaleSummary.py +0 -320
  1114. teradataml/analytics/mle/SentenceExtractor.py +0 -363
  1115. teradataml/analytics/mle/SentimentEvaluator.py +0 -432
  1116. teradataml/analytics/mle/SentimentExtractor.py +0 -578
  1117. teradataml/analytics/mle/SentimentTrainer.py +0 -405
  1118. teradataml/analytics/mle/SeriesSplitter.py +0 -641
  1119. teradataml/analytics/mle/Sessionize.py +0 -475
  1120. teradataml/analytics/mle/SimpleMovAvg.py +0 -397
  1121. teradataml/analytics/mle/StringSimilarity.py +0 -425
  1122. teradataml/analytics/mle/TF.py +0 -389
  1123. teradataml/analytics/mle/TFIDF.py +0 -504
  1124. teradataml/analytics/mle/TextChunker.py +0 -414
  1125. teradataml/analytics/mle/TextClassifier.py +0 -399
  1126. teradataml/analytics/mle/TextClassifierEvaluator.py +0 -413
  1127. teradataml/analytics/mle/TextClassifierTrainer.py +0 -565
  1128. teradataml/analytics/mle/TextMorph.py +0 -494
  1129. teradataml/analytics/mle/TextParser.py +0 -623
  1130. teradataml/analytics/mle/TextTagger.py +0 -530
  1131. teradataml/analytics/mle/TextTokenizer.py +0 -502
  1132. teradataml/analytics/mle/UnivariateStatistics.py +0 -488
  1133. teradataml/analytics/mle/Unpack.py +0 -526
  1134. teradataml/analytics/mle/Unpivot.py +0 -438
  1135. teradataml/analytics/mle/VarMax.py +0 -776
  1136. teradataml/analytics/mle/VectorDistance.py +0 -762
  1137. teradataml/analytics/mle/WeightedMovAvg.py +0 -400
  1138. teradataml/analytics/mle/XGBoost.py +0 -842
  1139. teradataml/analytics/mle/XGBoostPredict.py +0 -627
  1140. teradataml/analytics/mle/__init__.py +0 -123
  1141. teradataml/analytics/mle/json/adaboost_mle.json +0 -135
  1142. teradataml/analytics/mle/json/adaboostpredict_mle.json +0 -85
  1143. teradataml/analytics/mle/json/antiselect_mle.json +0 -34
  1144. teradataml/analytics/mle/json/antiselect_mle_mle.json +0 -34
  1145. teradataml/analytics/mle/json/arima_mle.json +0 -172
  1146. teradataml/analytics/mle/json/arimapredict_mle.json +0 -52
  1147. teradataml/analytics/mle/json/attribution_mle_mle.json +0 -143
  1148. teradataml/analytics/mle/json/betweenness_mle.json +0 -97
  1149. teradataml/analytics/mle/json/burst_mle.json +0 -140
  1150. teradataml/analytics/mle/json/ccm_mle.json +0 -124
  1151. teradataml/analytics/mle/json/ccmprepare_mle.json +0 -14
  1152. teradataml/analytics/mle/json/cfilter_mle.json +0 -93
  1153. teradataml/analytics/mle/json/changepointdetection_mle.json +0 -92
  1154. teradataml/analytics/mle/json/changepointdetectionrt_mle.json +0 -78
  1155. teradataml/analytics/mle/json/closeness_mle.json +0 -104
  1156. teradataml/analytics/mle/json/confusionmatrix_mle.json +0 -79
  1157. teradataml/analytics/mle/json/correlation_mle.json +0 -86
  1158. teradataml/analytics/mle/json/correlationreduce_mle.json +0 -49
  1159. teradataml/analytics/mle/json/coxhazardratio_mle.json +0 -89
  1160. teradataml/analytics/mle/json/coxph_mle.json +0 -98
  1161. teradataml/analytics/mle/json/coxsurvival_mle.json +0 -79
  1162. teradataml/analytics/mle/json/cumulativemovavg_mle.json +0 -34
  1163. teradataml/analytics/mle/json/decisionforest_mle.json +0 -167
  1164. teradataml/analytics/mle/json/decisionforestevaluator_mle.json +0 -33
  1165. teradataml/analytics/mle/json/decisionforestpredict_mle_mle.json +0 -74
  1166. teradataml/analytics/mle/json/decisiontree_mle.json +0 -194
  1167. teradataml/analytics/mle/json/decisiontreepredict_mle_mle.json +0 -86
  1168. teradataml/analytics/mle/json/dtw_mle.json +0 -97
  1169. teradataml/analytics/mle/json/dwt2d_mle.json +0 -116
  1170. teradataml/analytics/mle/json/dwt_mle.json +0 -101
  1171. teradataml/analytics/mle/json/exponentialmovavg_mle.json +0 -55
  1172. teradataml/analytics/mle/json/fmeasure_mle.json +0 -58
  1173. teradataml/analytics/mle/json/fpgrowth_mle.json +0 -159
  1174. teradataml/analytics/mle/json/frequentpaths_mle.json +0 -129
  1175. teradataml/analytics/mle/json/glm_mle.json +0 -111
  1176. teradataml/analytics/mle/json/glml1l2_mle.json +0 -106
  1177. teradataml/analytics/mle/json/glml1l2predict_mle.json +0 -57
  1178. teradataml/analytics/mle/json/glmpredict_mle_mle.json +0 -74
  1179. teradataml/analytics/mle/json/histogram_mle.json +0 -100
  1180. teradataml/analytics/mle/json/hmmdecoder_mle.json +0 -192
  1181. teradataml/analytics/mle/json/hmmevaluator_mle.json +0 -206
  1182. teradataml/analytics/mle/json/hmmsupervised_mle.json +0 -91
  1183. teradataml/analytics/mle/json/hmmunsupervised_mle.json +0 -114
  1184. teradataml/analytics/mle/json/identitymatch_mle.json +0 -88
  1185. teradataml/analytics/mle/json/idwt2d_mle.json +0 -73
  1186. teradataml/analytics/mle/json/idwt_mle.json +0 -66
  1187. teradataml/analytics/mle/json/interpolator_mle.json +0 -151
  1188. teradataml/analytics/mle/json/kmeans_mle.json +0 -97
  1189. teradataml/analytics/mle/json/knn_mle.json +0 -141
  1190. teradataml/analytics/mle/json/knnrecommender_mle.json +0 -111
  1191. teradataml/analytics/mle/json/knnrecommenderpredict_mle.json +0 -75
  1192. teradataml/analytics/mle/json/lar_mle.json +0 -78
  1193. teradataml/analytics/mle/json/larpredict_mle.json +0 -69
  1194. teradataml/analytics/mle/json/lda_mle.json +0 -130
  1195. teradataml/analytics/mle/json/ldainference_mle.json +0 -78
  1196. teradataml/analytics/mle/json/ldatopicsummary_mle.json +0 -64
  1197. teradataml/analytics/mle/json/levenshteindistance_mle.json +0 -92
  1198. teradataml/analytics/mle/json/linreg_mle.json +0 -42
  1199. teradataml/analytics/mle/json/linregpredict_mle.json +0 -56
  1200. teradataml/analytics/mle/json/minhash_mle.json +0 -113
  1201. teradataml/analytics/mle/json/modularity_mle.json +0 -91
  1202. teradataml/analytics/mle/json/naivebayespredict_mle_mle.json +0 -85
  1203. teradataml/analytics/mle/json/naivebayesreduce_mle.json +0 -52
  1204. teradataml/analytics/mle/json/naivebayestextclassifierpredict_mle_mle.json +0 -147
  1205. teradataml/analytics/mle/json/naivebayestextclassifiertrainer2_mle.json +0 -108
  1206. teradataml/analytics/mle/json/naivebayestextclassifiertrainer_mle.json +0 -102
  1207. teradataml/analytics/mle/json/namedentityfinder_mle.json +0 -84
  1208. teradataml/analytics/mle/json/namedentityfinderevaluatorreduce_mle.json +0 -43
  1209. teradataml/analytics/mle/json/namedentityfindertrainer_mle.json +0 -64
  1210. teradataml/analytics/mle/json/nerevaluator_mle.json +0 -54
  1211. teradataml/analytics/mle/json/nerextractor_mle.json +0 -87
  1212. teradataml/analytics/mle/json/nertrainer_mle.json +0 -89
  1213. teradataml/analytics/mle/json/ngrams_mle.json +0 -137
  1214. teradataml/analytics/mle/json/ngramsplitter_mle_mle.json +0 -137
  1215. teradataml/analytics/mle/json/npath@coprocessor_mle.json +0 -73
  1216. teradataml/analytics/mle/json/ntree@coprocessor_mle.json +0 -123
  1217. teradataml/analytics/mle/json/pack_mle.json +0 -58
  1218. teradataml/analytics/mle/json/pack_mle_mle.json +0 -58
  1219. teradataml/analytics/mle/json/pagerank_mle.json +0 -81
  1220. teradataml/analytics/mle/json/pathanalyzer_mle.json +0 -63
  1221. teradataml/analytics/mle/json/pathgenerator_mle.json +0 -40
  1222. teradataml/analytics/mle/json/pathstart_mle.json +0 -62
  1223. teradataml/analytics/mle/json/pathsummarizer_mle.json +0 -72
  1224. teradataml/analytics/mle/json/pivoting_mle.json +0 -71
  1225. teradataml/analytics/mle/json/postagger_mle.json +0 -51
  1226. teradataml/analytics/mle/json/randomsample_mle.json +0 -131
  1227. teradataml/analytics/mle/json/randomwalksample_mle.json +0 -85
  1228. teradataml/analytics/mle/json/roc_mle.json +0 -73
  1229. teradataml/analytics/mle/json/sampling_mle.json +0 -75
  1230. teradataml/analytics/mle/json/sax_mle.json +0 -154
  1231. teradataml/analytics/mle/json/scale_mle.json +0 -93
  1232. teradataml/analytics/mle/json/scalebypartition_mle.json +0 -89
  1233. teradataml/analytics/mle/json/scalemap_mle.json +0 -44
  1234. teradataml/analytics/mle/json/scalesummary_mle.json +0 -14
  1235. teradataml/analytics/mle/json/sentenceextractor_mle.json +0 -41
  1236. teradataml/analytics/mle/json/sentimentevaluator_mle.json +0 -43
  1237. teradataml/analytics/mle/json/sentimentextractor_mle.json +0 -100
  1238. teradataml/analytics/mle/json/sentimenttrainer_mle.json +0 -68
  1239. teradataml/analytics/mle/json/seriessplitter_mle.json +0 -133
  1240. teradataml/analytics/mle/json/sessionize_mle_mle.json +0 -62
  1241. teradataml/analytics/mle/json/simplemovavg_mle.json +0 -48
  1242. teradataml/analytics/mle/json/stringsimilarity_mle.json +0 -50
  1243. teradataml/analytics/mle/json/stringsimilarity_mle_mle.json +0 -50
  1244. teradataml/analytics/mle/json/svmdense_mle.json +0 -165
  1245. teradataml/analytics/mle/json/svmdensepredict_mle.json +0 -95
  1246. teradataml/analytics/mle/json/svmdensesummary_mle.json +0 -58
  1247. teradataml/analytics/mle/json/svmsparse_mle.json +0 -148
  1248. teradataml/analytics/mle/json/svmsparsepredict_mle_mle.json +0 -103
  1249. teradataml/analytics/mle/json/svmsparsesummary_mle.json +0 -57
  1250. teradataml/analytics/mle/json/textchunker_mle.json +0 -40
  1251. teradataml/analytics/mle/json/textclassifier_mle.json +0 -51
  1252. teradataml/analytics/mle/json/textclassifierevaluator_mle.json +0 -43
  1253. teradataml/analytics/mle/json/textclassifiertrainer_mle.json +0 -103
  1254. teradataml/analytics/mle/json/textmorph_mle.json +0 -63
  1255. teradataml/analytics/mle/json/textparser_mle.json +0 -166
  1256. teradataml/analytics/mle/json/texttagger_mle.json +0 -81
  1257. teradataml/analytics/mle/json/texttokenizer_mle.json +0 -91
  1258. teradataml/analytics/mle/json/tf_mle.json +0 -33
  1259. teradataml/analytics/mle/json/tfidf_mle.json +0 -34
  1260. teradataml/analytics/mle/json/univariatestatistics_mle.json +0 -81
  1261. teradataml/analytics/mle/json/unpack_mle.json +0 -91
  1262. teradataml/analytics/mle/json/unpack_mle_mle.json +0 -91
  1263. teradataml/analytics/mle/json/unpivoting_mle.json +0 -63
  1264. teradataml/analytics/mle/json/varmax_mle.json +0 -176
  1265. teradataml/analytics/mle/json/vectordistance_mle.json +0 -179
  1266. teradataml/analytics/mle/json/weightedmovavg_mle.json +0 -48
  1267. teradataml/analytics/mle/json/xgboost_mle.json +0 -178
  1268. teradataml/analytics/mle/json/xgboostpredict_mle.json +0 -104
  1269. teradataml/analytics/sqle/Antiselect.py +0 -321
  1270. teradataml/analytics/sqle/Attribution.py +0 -603
  1271. teradataml/analytics/sqle/DecisionForestPredict.py +0 -408
  1272. teradataml/analytics/sqle/GLMPredict.py +0 -430
  1273. teradataml/analytics/sqle/MovingAverage.py +0 -543
  1274. teradataml/analytics/sqle/NGramSplitter.py +0 -548
  1275. teradataml/analytics/sqle/NPath.py +0 -632
  1276. teradataml/analytics/sqle/NaiveBayesTextClassifierPredict.py +0 -515
  1277. teradataml/analytics/sqle/Pack.py +0 -388
  1278. teradataml/analytics/sqle/SVMSparsePredict.py +0 -464
  1279. teradataml/analytics/sqle/Sessionize.py +0 -390
  1280. teradataml/analytics/sqle/StringSimilarity.py +0 -400
  1281. teradataml/analytics/sqle/Unpack.py +0 -503
  1282. teradataml/analytics/sqle/json/antiselect_sqle.json +0 -21
  1283. teradataml/analytics/sqle/json/attribution_sqle.json +0 -92
  1284. teradataml/analytics/sqle/json/decisionforestpredict_sqle.json +0 -48
  1285. teradataml/analytics/sqle/json/glmpredict_sqle.json +0 -48
  1286. teradataml/analytics/sqle/json/h2opredict_sqle.json +0 -63
  1287. teradataml/analytics/sqle/json/movingaverage_sqle.json +0 -58
  1288. teradataml/analytics/sqle/json/naivebayestextclassifierpredict_sqle.json +0 -76
  1289. teradataml/analytics/sqle/json/ngramsplitter_sqle.json +0 -126
  1290. teradataml/analytics/sqle/json/npath_sqle.json +0 -67
  1291. teradataml/analytics/sqle/json/pack_sqle.json +0 -47
  1292. teradataml/analytics/sqle/json/pmmlpredict_sqle.json +0 -55
  1293. teradataml/analytics/sqle/json/sessionize_sqle.json +0 -43
  1294. teradataml/analytics/sqle/json/stringsimilarity_sqle.json +0 -39
  1295. teradataml/analytics/sqle/json/svmsparsepredict_sqle.json +0 -74
  1296. teradataml/analytics/sqle/json/unpack_sqle.json +0 -80
  1297. teradataml/catalog/model_cataloging.py +0 -980
  1298. teradataml/config/mlengine_alias_definitions_v1.0 +0 -118
  1299. teradataml/config/mlengine_alias_definitions_v1.1 +0 -127
  1300. teradataml/config/mlengine_alias_definitions_v1.3 +0 -129
  1301. teradataml/table_operators/sandbox_container_util.py +0 -643
  1302. teradataml-17.20.0.7.dist-info/RECORD +0 -1280
  1303. {teradataml-17.20.0.7.dist-info → teradataml-20.0.0.1.dist-info}/top_level.txt +0 -0
@@ -1,1740 +1,1875 @@
- # -*- coding: utf-8 -*-
- """
-
- Unpublished work.
- Copyright (c) 2018 by Teradata Corporation. All rights reserved.
- TERADATA CORPORATION CONFIDENTIAL AND TRADE SECRET
-
- Primary Owner: mark.sandan@teradata.com
- Secondary Owner:
-
- This file implements util functions of data frame.
- """
-
- import numbers
- import pandas as pd
- from collections import OrderedDict
-
- from teradataml.common.utils import UtilFuncs
- from teradataml.common.aed_utils import AedUtils
- from teradataml.common.constants import AEDConstants, PTITableConstants, \
-     SQLPattern, PythonTypes
- from teradataml.common.sqlbundle import SQLBundle
- from teradataml.common.exceptions import TeradataMlException
- from teradataml.common.messages import Messages
- from teradataml.common.messagecodes import MessageCodes
-
- from teradataml.context.context import get_context, get_connection
- from teradataml.context.context import _get_current_databasename
- from teradataml.dbutils.dbutils import _execute_query_and_generate_pandas_df
-
- from teradataml.options.display import display
- from teradataml.options.configure import configure
- from teradataml.utils.utils import execute_sql
-
- from teradatasqlalchemy.types import FLOAT, NUMBER, DECIMAL, PERIOD_TIMESTAMP
- from teradatasqlalchemy.dialect import preparer, dialect as td_dialect
- import teradataml.dataframe as tdmldf
-
- from sqlalchemy.sql import select
- from sqlalchemy.sql.expression import text
- from sqlalchemy import table, column, func
- from datetime import datetime, date, time
- from decimal import Decimal
-
- # TODO - Need to write unit testcases for these functions
- class DataFrameUtils():
-
-     @staticmethod
-     def _execute_node_return_db_object_name(nodeid, metaexpression = None):
-         """
-         Fetches queries and view names from the AED node and creates views from the queries.
-         Additionally inspects the metaexpression for consistency.
-
-         PARAMETERS:
-             nodeid: nodeid to execute
-             metaexpression: (optional) updated _metaexpr to validate
-
-         EXAMPLES:
-             _execute_node_return_db_object_name(nodeid)
-             _execute_node_return_db_object_name(nodeid, metaexpr)
-
-         RETURNS:
-             Top level view name.
-
-         """
-         aed_obj = AedUtils()
-         if not aed_obj._aed_is_node_executed(nodeid):
-
-             view_query_node_type_list = aed_obj._aed_get_exec_query(nodeid)
-             view_names, queries, node_query_types, node_ids = view_query_node_type_list
-
-             # Executing Nodes / Creating Views
-             for index in range(len(queries) - 1, -1, -1):
-                 is_persist = False
-                 if metaexpression and metaexpression._is_persist:
-                     is_persist = True
-
-                 try:
-                     if node_query_types[index] == AEDConstants.AED_QUERY_NODE_TYPE_ML_QUERY_MULTI_OUTPUT.value or \
-                             ("OUT TABLE " in queries[index] and SQLPattern.SQLMR.value.match(queries[index])) or \
-                             is_persist:
-                         # TODO:: OR condition in above needs to be removed once AED support is added.
-                         UtilFuncs._create_table(view_names[index], queries[index])
-
-                     elif node_query_types[index] in ['groupby', 'groupbytime']:
-                         # If query_type is either groupby or groupbytime, get its parent
-                         # nodeid and execute queries for the same
-                         parent_nodeid = aed_obj._aed_get_parent_nodeids(nodeid)[0]
-                         DataFrameUtils._execute_node_return_db_object_name(parent_nodeid)
-
-                     elif node_query_types[index] == AEDConstants.AED_QUERY_NODE_TYPE_REFERENCE.value:
-                         # Reference nodes - To be ignored.
-                         pass
-
-                     else:
-                         UtilFuncs._create_view(view_names[index], queries[index])
-
-                     # Updating Node Status for executed Node
-                     aed_obj._aed_update_node_state_single(node_ids[index], AEDConstants.AED_NODE_EXECUTED.value)
-
-                 except Exception as emsg:
-                     # TODO:: Append node execution details to emsg.
-                     # Node description, such as nodeType or node operation, should be added
-                     # here in 'emsg' to give away more information about where exactly
-                     # node execution failed.
-                     raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_EXEC_SQL_FAILED, str(emsg)),
-                                               MessageCodes.TDMLDF_EXEC_SQL_FAILED)
-
-         # Setting New Table name retrieved to TDML DF
-         result_table_view_name = aed_obj._aed_get_tablename(nodeid)
-         # validate the metaexpression
-         if configure._validate_metaexpression:
-             DataFrameUtils._validate_metaexpression(result_table_view_name, metaexpression)
-
-         return result_table_view_name
-
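The loop above materializes the AED query list from the deepest dependency up to the top-level node, so every view or table exists before anything that selects from it. A minimal standalone sketch of that ordering, with a hypothetical query list and a print-only create_view stand-in (not part of the package):

    # Each entry may reference names that appear later in the list,
    # which is why execution walks the list in reverse.
    queries = [("v_top", "SELECT a FROM v_mid"),        # hypothetical
               ("v_mid", "SELECT a FROM base_table")]   # hypothetical

    def create_view(name, query):
        # Stand-in for UtilFuncs._create_view; only shows the order.
        print("CREATE VIEW {} AS ({})".format(name, query))

    for name, query in reversed(queries):
        create_view(name, query)   # v_mid is created first, then v_top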
116
-
117
- @staticmethod
118
- def _validate_metaexpression(result_table_view_name, metaexpression):
119
- """
120
- Inspects the metaexpression for consistency with the underlying table/view
121
-
122
- PARAMETERS:
123
- result_table_view_name: a string representing the table/view name to check column metadata
124
- metaexpression: the metaexpr of the DataFrame to compare against the result_table_view_name
125
-
126
- EXAMPLES:
127
- _validate_metaexpression('t1', df._metaexpr)
128
- _execute_node_return_db_object_name(nodeid, metaexpr)
129
-
130
- RETURNS:
131
- None
132
- Outputs RuntimeWarnings if mismatches are found
133
-
134
- """
135
- # metaexpression should have already been updated
136
- if metaexpression is not None:
137
-
138
- name = lambda x: x[0]
139
- type_ = lambda x: x[1]
140
-
141
- # compare sorted by name of column
142
- df = sorted(UtilFuncs._describe_column(DataFrameUtils._get_metadata_from_table(result_table_view_name)), key = lambda x: x[0])
143
- meta = sorted(metaexpression.c, key = lambda x: x.name)
144
-
145
- # check length
146
- if len(df) == len(meta):
147
- for i in range(len(df)):
148
-
149
- # map Teradata type to python type
150
- meta_type = UtilFuncs._teradata_type_to_python_type(meta[i].type)
151
-
152
- # compare column names and types
153
- if meta[i].name != name(df[i]) or meta_type != type_(df[i]):
154
- err_msg = "[Mismatch when checking %s]\n\t[Table/View] %s %s\n\t[MetaExpression] %s %s (mapped from => %s)\n"
155
- raise RuntimeError(err_msg % (result_table_view_name,
156
- name(df[i]), type_(df[i]),
157
- meta[i].name, meta_type, meta[i].type))
158
- else:
159
- err_msg = "[Length mismatch when checking %s]\nSource Table/View has length %s but MetaExpression has length %s"
160
- raise RuntimeError(err_msg % (result_table_view_name, len(df), len(meta)))
161
-
162
- @staticmethod
163
- def _get_dataframe_print_string(table_name, index_label, orderby=None, undropped_index=None):
164
- """
165
- Builds string output for teradataml DataFrame
166
-
167
- PARAMETERS:
168
- table_name - Name of the database table to read from.
169
- index_label - String/List specifying column to use as index.
170
- orderby - order expression to sort returned rows
171
-
172
- EXAMPLES:
173
- _get_dataframe_print_string('table_name', None, None)
174
-
175
- RETURNS:
176
- String representation of a pandas DataFrame.
177
-
178
- """
179
- read_query = SQLBundle._build_top_n_print_query(table_name, display.max_rows, orderby)
180
-
181
- if index_label is not None:
182
- pandas_df = _execute_query_and_generate_pandas_df(read_query, index=index_label)
183
- else:
184
- pandas_df = _execute_query_and_generate_pandas_df(read_query)
185
-
186
- return pandas_df.to_string()
187
-
188
- @staticmethod
189
- def _get_pprint_dtypes(column_names_and_types, null_count=False):
190
- """
191
- returns a string containing the column names and types.
192
- If null_count is not None, the string will also contain
193
- the number of non-null values for each column.
194
-
195
- PARAMETERS:
196
- column_names_and_types - List of column names and types.
197
- null_count(optional) - List of the non-null count for each column.
198
-
199
- EXAMPLES:
200
- >>>print(_get_pprint_dtypes(column_names_and_types)
201
- accounts str
202
- Feb float
203
- Jan int
204
- Mar int
205
- Apr int
206
- datetime str
207
-
208
- >>>print(_get_pprint_dtypes(column_names_and_types, null_count)
209
- accounts 3 non-null str
210
- Feb 3 non-null float
211
- Jan 3 non-null int
212
- Mar 3 non-null int
213
- Apr 3 non-null int
214
- datetime 3 non-null str
215
-
216
- RAISES:
217
-
218
- """
219
-
220
- col_names = [i[0] for i in column_names_and_types]
221
- col_types = [i[1] for i in column_names_and_types]
222
- max_col_names = len(max(col_names, key=len)) + 4
223
- max_col_types = len(max(col_types, key=len))
224
- dtypes_string = ""
225
- if not null_count:
226
- for colname, coltype in column_names_and_types:
227
- dtypes_string += "{0: <{name_width}}{1: >{type_width}}\n".format(colname, coltype,
228
- name_width=max_col_names,
229
- type_width=max_col_types)
230
- else:
231
- null_count = [i[2] for i in column_names_and_types]
232
- max_null_count = len(str(max(null_count, key=len)))
233
- for colname, coltype, num_nulls in column_names_and_types:
234
- dtypes_string += "{0: <{name_width}}{1: <{count_width}} non-null {2: <{type_width}}\n".format(colname,
235
- num_nulls,
236
- coltype,
237
- name_width=max_col_names,
238
- count_width=max_null_count,
239
- type_width=max_col_types)
240
- # Remove last new line character.
241
- dtypes_string = dtypes_string[:-1]
242
- return dtypes_string
243
-
244
- @staticmethod
245
- def _get_metadata_from_table(table_name):
246
- """
247
- Retrieves column metadata by executing a HELP COLUMN command.
248
-
249
- PARAMETERS:
250
- table_name - The table name or view name.
251
-
252
- RETURNS:
253
- returns the result set (column information) from HELP COLUMN.
254
-
255
- RAISES:
256
- Database error if an error occurred while executing the HELP COLUMN.
257
-
258
- EXAMPLES:
259
- df = DataFrame.from_table('mytab')
260
- metadata = _get_metadata_from_table(df._table_name)
261
- """
262
- # Construct HELP COLUMN command.
263
- help_col_sql = SQLBundle._build_help_column(table_name)
264
- # Execute HELP COLUMN command.
265
- return UtilFuncs._execute_query(help_col_sql)
266
-
267
- @staticmethod
268
- def _extract_select_string(select_expression):
269
- """
270
- Takes in a string/list representing a Pandas selection clause of any of the forms (only):
271
- a) "col1" or 'col1'
272
- b) ["col 1"] or ['col 1']
273
- c) ["col1", "col2", "col3"] or ['col1', 'col2', 'col3']
274
- d) [['col1', 'col2', 'col3']] or [["col1", "col2", "col3"]]
275
-
276
- And returns a list with column strings representing the selection of the form:
277
- a) ['col1']
278
- b) ['col 1']
279
- c) ['col1','col2','col3']
280
- d) ['col1','col2','col3']
281
-
282
- Column Names ("col1", "col2"..) are Strings representing database table Columns.
283
- All Standard Teradata Data-Types for columns supported: INTEGER, VARCHAR(5), FLOAT.
284
-
285
- PARAMETERS:
286
- selection_expression - Expression representing column selection
287
- Type - String or List of Strings or List of List (Single level only)
288
- Required - Yes
289
-
290
- EXAMPLES:
291
- UtilFuncs._extract_select_string([['col1', 'col2']])
292
- UtilFuncs._extract_select_string("col1")
293
- UtilFuncs._extract_select_string(["col1"])
294
- UtilFuncs._extract_select_string(["col1","col2","col3"])
295
-
296
- RETURNS:
297
- List of Strings representing column names.
298
-
299
- RAISES:
300
- TeradataMlException
301
- """
302
- tdp = preparer(td_dialect)
303
- column_list = []
304
-
305
- # Single String column
306
- if isinstance(select_expression, str):
307
- # Error handling - Empty String
308
- if select_expression == "":
309
- raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY),
310
- MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY)
311
- else:
312
- column_list.append(tdp.quote("{0}".format(select_expression.strip())))
313
-
314
- # Error Handling - [], [""], [None], ["None"], ['col1', None], ['col1', '']
315
- elif isinstance(select_expression, list) and (len(select_expression) == 0 or
316
- any(element in [None, "None", ""] for element in select_expression)):
317
- raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY),
318
- MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY)
319
-
320
- # List - ["col1"] or ["col1", "col2", "col3"]
321
- elif isinstance(select_expression, list) and all(isinstance(element, str) for element in select_expression):
322
- if len(select_expression) == 1:
323
- column_list.append(tdp.quote("{0}".format(select_expression[0].strip())))
324
- else:
325
- column_list = [tdp.quote("{0}".format(element.strip())) for element in select_expression]
326
-
327
- # List of List (Single level only - Pandas Syntax) - [["col1", "col2", "col3"]]
328
- elif isinstance(select_expression, list) and isinstance(select_expression[0], list):
329
- # Error Handling - [[]], [[""]], [[None]], [['col1', None]], [['col1', "None"]], ["col1", ""]
330
- if len(select_expression[0]) == 0 or any(element in [None, "None", ""] for element in select_expression[0]):
331
- raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY),
332
- MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY)
333
-
334
- else:
335
- column_list = [tdp.quote("{0}".format(element.strip())) for element in select_expression[0]]
336
-
337
- # Any other Format - Raise Format Exception
338
- else:
339
- raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_SELECT_INVALID_FORMAT),
340
- MessageCodes.TDMLDF_SELECT_INVALID_FORMAT)
341
- return column_list
342
-
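A condensed, dependency-free sketch of the normalization performed by _extract_select_string, substituting naive double-quoting for the teradatasqlalchemy preparer (illustrative only, with error handling reduced to a single check):

    def extract_select_sketch(expr):
        # Unwrap the single-level nested-list form: [["c1", "c2"]] -> ["c1", "c2"].
        if isinstance(expr, list) and expr and isinstance(expr[0], list):
            expr = expr[0]
        cols = [expr] if isinstance(expr, str) else list(expr)
        if not cols or any(c in (None, "None", "") for c in cols):
            raise ValueError("selection must name at least one column")
        return ['"{}"'.format(c.strip()) for c in cols]

    # extract_select_sketch("col1")                 -> ['"col1"']
    # extract_select_sketch([["col1", "col 2"]])    -> ['"col1"', '"col 2"']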
-     @staticmethod
-     def _get_primary_index_from_table(table_name):
-         """
-         Retrieves the primary index by executing a HELP INDEX command.
-         PARAMETERS:
-             table_name - The table name or volatile table name.
-         RETURNS:
-             Returns a list containing the primary index columns from HELP INDEX.
-             If there is no primary index (NoPI table), returns None.
-         RAISES:
-             Database error if an error occurred while executing the HELP INDEX.
-         EXAMPLES:
-             df = DataFrame.from_table('mytab')
-             index_labels = DataFrameUtils._get_primary_index_from_table(df._table_name)
-         """
-         # Construct HELP INDEX command.
-         help_index_sql = SQLBundle._build_help_index(table_name)
-
-         # Execute HELP INDEX command.
-         rows = UtilFuncs._execute_query(help_index_sql)
-         index_labels = []
-         for row in rows:
-             # row[1] specifies whether the index is Primary or Secondary.
-             if row[1].rstrip() == 'P':
-                 # row[2] specifies a string of comma separated column names that form the primary index
-                 if "," in row[2]:
-                     index_cols = row[2].split(',')
-                 else:
-                     index_cols = [row[2]]
-                 for index_col in index_cols:
-                     # Since the TD_TIMEBUCKET column in PTI tables is not functionally available, it can be ignored
-                     # from the index information as well (else a warning is generated by SQLAlchemy).
-                     # row[12] corresponds to the 'Timebucket' column in the results of the 'help index' SQL command,
-                     # which is available only when the version supports PTI tables.
-                     if index_col == PTITableConstants.TD_TIMEBUCKET.value and len(row) > 12 and row[12] is not None:
-                         continue
-                     else:
-                         index_labels.append(index_col)
-
-         if len(index_labels) > 0:
-             return index_labels
-         else:
-             return None
-
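A compact illustration of the primary-index extraction above, using fabricated HELP INDEX rows (real rows carry many more fields, including the timebucket column handled above):

    # Fabricated rows: (index_number, index_kind, column_names)
    rows = [(1, 'P ', 'accounts,txn_date'),
            (2, 'S ', 'region')]

    index_labels = [col
                    for _, kind, cols in rows
                    if kind.rstrip() == 'P'        # primary index rows only
                    for col in cols.split(',')]
    # -> ['accounts', 'txn_date']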
-     @staticmethod
-     def __validate_sort_type_raise_exception(sort_col_type):
-         """
-         Raises a TeradataMlException for an invalid/incorrect
-         "sort_col_type" encountered in the "_validate_sort_col_type" function.
-
-         PARAMETERS:
-             sort_col_type: The sort column type.
-
-         RETURNS:
-             None
-
-         RAISES:
-             TeradataMlException
-
-         EXAMPLES:
-             df_utils.__validate_sort_type_raise_exception(PythonTypes.PY_STRING_TYPE.value)
-         """
-         msg = Messages.get_message(MessageCodes.TDMLDF_DROP_INVALID_INDEX_TYPE).format(sort_col_type)
-         raise TeradataMlException(msg, MessageCodes.TDMLDF_DROP_INVALID_INDEX_TYPE)
-
-     @staticmethod
-     def _validate_sort_col_type(sort_col_type, sort_col_values):
-         """
-         Validates a list of sort column values against the sort column type.
-
-         PARAMETERS:
-             sort_col_type - The sort column type.
-             sort_col_values - A single value or list-like values.
-
-         RETURNS:
-             None
-
-         RAISES:
-             TeradataMlException
-
-         EXAMPLES:
-             df_utils._validate_sort_col_type(PythonTypes.PY_STRING_TYPE.value, ["Jan", "Feb"])
-             df_utils._validate_sort_col_type(PythonTypes.PY_STRING_TYPE.value, "Jan")
-             df_utils._validate_sort_col_type(PythonTypes.PY_INT_TYPE.value, [1, 2])
-         """
-         if isinstance(sort_col_values, list):
-             if sort_col_type == PythonTypes.PY_STRING_TYPE.value:
-                 if not all(isinstance(i, str) for i in sort_col_values):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_FLOAT_TYPE.value:
-                 if not all(isinstance(i, float) for i in sort_col_values):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_DECIMAL_TYPE.value:
-                 if not all(isinstance(i, Decimal) for i in sort_col_values):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_DATETIME_TYPE.value:
-                 if not all(isinstance(i, datetime) for i in sort_col_values):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_TIME_TYPE.value:
-                 if not all(isinstance(i, time) for i in sort_col_values):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_DATE_TYPE.value:
-                 if not all(isinstance(i, date) for i in sort_col_values):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_BYTES_TYPE.value:
-                 if not all(isinstance(i, bytes) for i in sort_col_values):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             else:  # numeric type
-                 if not all(isinstance(i, numbers.Integral) for i in sort_col_values):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-         elif isinstance(sort_col_values, (tuple, dict)):
-             raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_DROP_ARGS),
-                                       MessageCodes.TDMLDF_DROP_ARGS)
-         else:
-             if sort_col_type == PythonTypes.PY_STRING_TYPE.value:
-                 if not isinstance(sort_col_values, str):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_FLOAT_TYPE.value:
-                 if not isinstance(sort_col_values, float):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_DECIMAL_TYPE.value:
-                 if not isinstance(sort_col_values, Decimal):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_DATETIME_TYPE.value:
-                 if not isinstance(sort_col_values, datetime):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_TIME_TYPE.value:
-                 if not isinstance(sort_col_values, time):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_DATE_TYPE.value:
-                 if not isinstance(sort_col_values, date):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             elif sort_col_type == PythonTypes.PY_BYTES_TYPE.value:
-                 if not isinstance(sort_col_values, bytes):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-             else:  # numeric type
-                 if not isinstance(sort_col_values, numbers.Integral):
-                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
-
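The chain above dispatches on a type-name string to a per-value isinstance check; a table-driven sketch of the same idea (the name strings here are hypothetical abbreviations, not the actual PythonTypes values):

    import numbers
    from datetime import datetime, date, time
    from decimal import Decimal

    _SORT_TYPE_CHECKS = {              # hypothetical name -> expected type
        "str": str, "float": float, "decimal": Decimal,
        "datetime": datetime, "time": time, "date": date, "bytes": bytes,
    }

    def validate_sort_values_sketch(type_name, values):
        # Unknown names fall back to the integral check, as in the chain above.
        expected = _SORT_TYPE_CHECKS.get(type_name, numbers.Integral)
        values = values if isinstance(values, list) else [values]
        if not all(isinstance(v, expected) for v in values):
            raise TypeError("sort values do not match {}".format(type_name))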
-     @staticmethod
-     def _get_required_columns_types_from_metaexpr(metaexpr, col_list = None):
-         """
-         Retrieves column names and types from a meta expression. To get types for only
-         some columns, pass those columns to the 'col_list' argument.
-
-         PARAMETERS:
-             metaexpr - Meta expression from which columns and types are to be retrieved.
-             col_list - Column list for which to get types.
-
-         RETURNS:
-             Dictionary: key as column name and datatype as value.
-
-         EXAMPLES:
-             df = DataFrame.from_table('mytab')
-             metadata = _get_required_columns_types_from_metaexpr(df._metaexpr)
-         """
-
-         if isinstance(col_list, str):
-             col_list = [col_list]
-
-         if col_list is not None and not isinstance(col_list, list):
-             return None
-
-         meta_cols = metaexpr.t.c
-         meta_columns = [c.name for c in meta_cols]
-         col_names = []
-         col_types = []
-
-         # When column list to retrieve is not provided, return meta-data for all columns.
-         if col_list is None:
-             for col_name in meta_columns:
-                 col_names.append(meta_cols[col_name].name)
-                 col_types.append(meta_cols[col_name].type)
-
-         # Return meta-data for only requested columns otherwise.
-         else:
-             for col_name in col_list:
-                 if DataFrameUtils._check_column_exists(col_name, meta_columns):
-                     # _metaexpr saves columns without quotes, so unquoting.
-                     unquoted_col_name = col_name.replace('"', "")
-                     col_names.append(meta_cols[unquoted_col_name].name)
-                     col_types.append(meta_cols[unquoted_col_name].type)
-
-         return OrderedDict(zip(col_names, col_types))
-
-     @staticmethod
-     def _check_column_exists(column_name, df_columns):
-         """
-         Checks whether the provided column is present in the list of columns.
-         Note:
-             It is the calling function's responsibility to send the column and column list in the proper case.
-             By default the lookup is case-sensitive. For a case-insensitive lookup,
-             send the column_name and df_columns list in lower case.
-
-         PARAMETERS:
-             column_name - Column name to check for.
-             df_columns - List of columns in which to check.
-
-         RETURNS:
-             True if the column exists, otherwise False.
-
-         EXAMPLES:
-             df = DataFrame.from_table('mytab')
-             metadata = _check_column_exists("col1", df.columns)
-         """
-         unquoted_df_columns = [column.replace('"', "") for column in df_columns]
-         if column_name.replace('"', "") in unquoted_df_columns:
-             return True
-         else:
-             return False
-
-     @staticmethod
-     def _validate_agg_function(func, col_names):
-         """
-         Internal function to validate column names against actual
-         column names passed as parameter, and aggregate operations
-         against valid aggregate operations.
-
-         PARAMETERS:
-             func - (Required) Specifies the function(s) to be
-                 applied on teradataml DataFrame columns.
-                 Acceptable formats for function(s) are string,
-                 dictionary or list of strings/functions.
-                 Accepted combinations are:
-                     1. String function name
-                     2. List of string functions
-                     3. Dictionary of column names -> string function
-                        (or list of string functions)
-             col_names - List. Names of the columns in the DataFrame.
-
-         RETURNS:
-             operations - dict of columns -> aggregate operations
-                 Unified dictionary, similar to func, even for string and
-                 list of strings or functions.
-
-         RAISES:
-             1. TDMLDF_INVALID_AGGREGATE_OPERATION - If the aggregate
-                operation(s) received in parameter 'func' is/are
-                invalid.
-
-                Possible Value :
-                Invalid aggregate operation(s): minimum, counter.
-                Valid aggregate operation(s): count, max, mean, min,
-                std, sum.
-
-             2. TDMLDF_AGGREGATE_INVALID_COLUMN - If any of the columns
-                specified in 'func' is not present in the dataframe.
-
-                Possible Value :
-                Invalid column(s) given in parameter func: col1.
-                Valid column(s) : A, B, C, D.
-
-         EXAMPLES:
-             Let the dataframe contain 2 columns, col1 and col2.
-
-             VALID EXAMPLES:
-                 1. operations = DataFrameUtils._validate_agg_function(
-                        'mean', ['col1', 'col2'])
-
-                 2. operations = DataFrameUtils._validate_agg_function(
-                        ['mean', 'min'], ['col1', 'col2'])
-
-                 3. operations = DataFrameUtils._validate_agg_function(
-                        {'col1' : ['mean', 'min'], 'col2' : 'count'},
-                        ['col1', 'col2'])
-
-             INVALID EXAMPLES:
-                 1. operations = DataFrameUtils._validate_agg_function(
-                        'counter', ['col1', 'col2'])
-
-                 2. operations = DataFrameUtils._validate_agg_function(
-                        {'col1' : ['mean', 'min'], 'col55' : 'count'},
-                        ['col1', 'col2'])
-         """
-         operations = OrderedDict()
-
-         valid_aggregate_operations = UtilFuncs._get_valid_aggregate_operations()
-
-         if isinstance(func, str):
-             for column in col_names:
-                 operations[column] = [func]
-         elif isinstance(func, list):
-             for column in col_names:
-                 operations[column] = func
-         else:
-             for column in func:
-                 if isinstance(func[column], str):
-                     func[column] = [func[column]]  # Converts a string inside the dict to a list
-             operations = func
-
-         given_columns = operations.keys()
-         invalid_columns = []
-         all_operations = []
-         for col in given_columns:
-             all_operations.extend(operations[col])
-             if col not in col_names:
-                 invalid_columns.append(col)
-         if len(invalid_columns) > 0:  # If any of the columns specified is not present in the dataframe
-             col_names.sort()
-             invalid_columns.sort()
-             msg = Messages.get_message(MessageCodes.TDMLDF_AGGREGATE_INVALID_COLUMN). \
-                 format(", ".join(invalid_columns), 'func', ", ".join(col_names))
-             raise TeradataMlException(msg, MessageCodes.TDMLDF_AGGREGATE_INVALID_COLUMN)
-
-         all_operations = list(set(all_operations))
-         invalid_aggregates = []
-         for operation in all_operations:
-             if operation not in valid_aggregate_operations \
-                     and operation not in UtilFuncs._get_valid_time_series_aggregate_operations():
-                 invalid_aggregates.append(operation)
-         if len(invalid_aggregates) > 0:  # If any of the aggregate operations specified is not valid
-             # To raise the error message, let's add the other time series aggregate operations that can be
-             # used with the DataFrame.agg() method.
-             valid_aggregate_operations = valid_aggregate_operations + ['first', 'last', 'mode']
-             valid_aggregate_operations.sort()
-             invalid_aggregates.sort()
-             msg = Messages.get_message(MessageCodes.TDMLDF_INVALID_AGGREGATE_OPERATION). \
-                 format(", ".join(invalid_aggregates), ", ".join(valid_aggregate_operations))
-             raise TeradataMlException(msg, MessageCodes.TDMLDF_INVALID_AGGREGATE_OPERATION)
-
-         return operations
-
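A condensed restatement of the normalization performed above, showing how every accepted shape of 'func' collapses to a column -> list-of-operations mapping (validation omitted; sketch only):

    from collections import OrderedDict

    def normalize_agg_sketch(func, col_names):
        if isinstance(func, str):
            return OrderedDict((c, [func]) for c in col_names)
        if isinstance(func, list):
            return OrderedDict((c, list(func)) for c in col_names)
        # dict: wrap any bare string operation into a single-item list
        return OrderedDict((c, [ops] if isinstance(ops, str) else list(ops))
                           for c, ops in func.items())

    # normalize_agg_sketch('mean', ['col1', 'col2'])
    #     -> {'col1': ['mean'], 'col2': ['mean']}
    # normalize_agg_sketch({'col1': ['mean', 'min'], 'col2': 'count'}, ['col1', 'col2'])
    #     -> {'col1': ['mean', 'min'], 'col2': ['count']}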
664
- @staticmethod
665
- def _generate_aggregate_column_expression(df, column, operation, describe_op, tdp, **kwargs):
666
- """
667
- Function generate the aggregate column expression for the provided column
668
- and aggregate function.
669
-
670
- PARAMETERS:
671
- df:
672
- Required Argument.
673
- Specifies teradataml DataFrame which is to be used to get the
674
- desired aggregate column expression.
675
- Types: teradataml DataFrame
676
-
677
- column:
678
- Required Argument.
679
- Specifies the column name for which desired aggregate operation is
680
- to be used.
681
- Types: str
682
-
683
- operation:
684
- Required Argument.
685
- Specifies the aggregate operation.
686
- Types: str
687
-
688
- describe_op:
689
- Required Argument.
690
- Specifies a boolean flag, that will decide whether the aggregate
691
- operation is being performed for DataFrame.describe() or not.
692
- Types: bool
693
-
694
- tdp:
695
- Required Argument.
696
- Specifies a TeradataIdentifierPreparer object. It is required for
697
- quoting.
698
- Types: TeradataIdentifierPreparer
699
-
700
- kwargs:
701
- Specifies miscellaneous keyword arguments that can be passed to
702
- aggregate functions.
703
-
704
- RAISES:
705
- AttributeError - In case ColumnExpression does not have desired aggregate
706
- function implemnted.
707
-
708
- RETURNS:
709
- A boolean stating whether column is supported or not, New column name,
710
- New column type, A string representing column aggregate expression,
711
- invalid column information in case column has unsupported type for an
712
- aggregate operation.
713
-
714
- EXAMPLES:
715
- column_supported, new_column_name, new_column_type, column_aggr_expr, invalid_column_str = \
716
- DataFrameUtils._generate_aggregate_column_expression(df=df, column=column, operation=func,
717
- describe_op=describe_op, percentile=percentile,
718
- tdp=tdp, **kwargs)
719
- """
720
- try:
721
- key_to_process = ""
722
- # quote column names same as that of the Teradata reserved keywords.
723
- if "sort_columns" in kwargs:
724
- key_to_process = "sort_columns"
725
- elif "sort_column" in kwargs:
726
- key_to_process = "sort_column"
727
-
728
- if key_to_process:
729
- quoted_columns = UtilFuncs._process_for_teradata_keyword(kwargs[key_to_process])
730
- kwargs[key_to_process] = quoted_columns
731
-
732
- func_expression = getattr(df[column], operation)(describe_op=describe_op, **kwargs)
733
- new_column_name = column if describe_op else "{1}_{0}".format(column, operation)
734
- # column_supported, new_column_name, new_column_type, column_aggr_expr, invalid_column_str
735
- return True, new_column_name, NUMBER() if describe_op else func_expression.type, \
736
- func_expression.compile_label(new_column_name), None
737
- except AttributeError:
738
- # We are here means, provided operation is invalid and is not supported.
739
- # This if for internal purpose only.
740
- # Validation of operations for "agg" should be done in "agg" only.
741
- raise RuntimeError("Invalid aggregate function: {}".format(operation))
742
- except RuntimeError:
743
- # We are here means, column does not support the provided operation.
744
- # We will ignore this and add the column to invalid column list.
745
- # invalid_columns[operation].append("({0} - {1})".format(column, column_type)) OR
746
- # We will raise Generic message, mentioning DF does not have any column with type
747
- # supported to perform an operation.
748
- if describe_op:
749
- return True, tdp.quote(column), NUMBER(), 'null as {}'.format(tdp.quote(column)), None
750
- else:
751
- return False, None, None, None, "({0} - {1})".format(column, df[column].type)
752
- except Exception:
753
- raise
754
-
755
-     @staticmethod
-     def _construct_sql_expression_for_aggregations(df, column_names, column_types, func, percentile=.5,
-                                                    describe_op=False, **kwargs):
-         """
-         Internal function to create and return the SQL expression
-         corresponding to the given operation, column_names and
-         column_types.
-
-         Column_types are used to check whether all the datatypes are
-         valid types for the given operation; an exception is thrown if they
-         are not.
-
-         PARAMETERS:
-             df:
-                 Required Argument.
-                 Specifies the teradataml DataFrame which is to be used to get the desired
-                 aggregate column expression.
-                 Types: teradataml DataFrame
-
-             column_names:
-                 Required Argument.
-                 Specifies the column names for which the desired aggregate operation is
-                 to be executed.
-                 Types: List of strings
-
-             column_types:
-                 Required Argument.
-                 Specifies the respective column types for the column names.
-                 Types: List of teradatasqlalchemy types
-
-             func:
-                 Required Argument.
-                 Specifies the aggregate function(s) to be applied on teradataml
-                 DataFrame columns.
-                 Types: string, dictionary or list of strings/functions.
-                 Accepted combinations are:
-                     1. String function name
-                     2. List of functions
-                     3. Dictionary containing column name as key and aggregate
-                        function name (string or list of strings) as value
-
-             percentile:
-                 Optional Argument.
-                 Specifies a value between 0 and 1 that can only be used with func = 'percentile'.
-                 The default is .5, which returns the 50th percentile.
-                 Types: float
-
-             describe_op:
-                 Optional Argument.
-                 Specifies a boolean flag that decides whether the aggregate operation being
-                 performed is for DataFrame.describe() or not.
-                 Types: bool
-
-             kwargs:
-                 Specifies miscellaneous keyword arguments that can be passed to aggregate functions.
-
-         RETURNS:
-             a) SQL expression, such as
-                1. 'min(col1) as min_col1, min(col2) as min_col2' if
-                   col1 and col2 are the columns in the DataFrame and the
-                   operation is 'min'
-                2. 'max(col1) as max_col1, max(col2) as max_col2' if
-                   col1 and col2 are the columns in the DataFrame and the
-                   operation is 'max'
-                3. 'min(col1) as min_col1, stddev_samp(col2) as
-                   std_col2' if col1, col2 are the columns in the
-                   DataFrame and the operations are min, std.
-                etc...
-             b) new columns' names (e.g. min_col1, min_col2 ...)
-             c) new columns' types
-
-         RAISES:
-             TeradataMLException
-             1. TDMLDF_AGGREGATE_COMBINED_ERR - If the provided
-                aggregate operations do not support specified columns.
-
-                Possible Value:
-                No results. Below is/are the error message(s):
-                All selected columns [(col1 - VARCHAR)] is/are
-                unsupported for 'sum' operation.
-
-             2. TDMLDF_INVALID_AGGREGATE_OPERATION - If the aggregate
-                operation(s) received in parameter 'func' is/are
-                invalid.
-
-                Possible Value:
-                Invalid aggregate operation(s): minimum, counter.
-                Valid aggregate operation(s): count, max, mean, min,
-                std, sum.
-
-             3. TDMLDF_AGGREGATE_INVALID_COLUMN - If any of the columns
-                specified in func is not present in the dataframe.
-
-                Possible Value:
-                Invalid column(s) given in parameter func: col1.
-                Valid column(s): A, B, C, D.
-
-         EXAMPLES:
-             col_names, col_types = \
-                 df_utils._get_column_names_and_types_from_metaexpr(
-                     self._metaexpr)
-             expr, new_col_names, new_col_types = \
-                 df_utils._construct_sql_expression_for_aggregations(
-                     df, col_names, col_types, 'min')
-
-             expr1, new_col_names1, new_col_types1 = \
-                 df_utils._construct_sql_expression_for_aggregations(
-                     df, col_names, col_types, ['min', 'sum'])
-
-             expr2, new_col_names2, new_col_types2 = \
-                 df_utils._construct_sql_expression_for_aggregations(
-                     df, col_names, col_types, {'col1' : ['min', 'sum'],
-                     'col2' : 'mean'})
-
-         """
-
-         # e.g. of column_types: [VARCHAR(length=13), INTEGER(), VARCHAR(length=60), VARCHAR(length=5),
-         #                        FLOAT(precision=0)]
-
-         # e.g. the types of each column are <class 'teradatasqlalchemy.types.VARCHAR'>,
-         # <class 'teradatasqlalchemy.types.INTEGER'>, <class 'teradatasqlalchemy.types.FLOAT'>,
-         # <class 'teradatasqlalchemy.types.INTERVAL_MINUTE_TO_SECOND'> etc..
-
-         # If the function is a time series aggregate, we process the aggregation differently.
-         if not isinstance(func, str):
-             # If func is not an instance of string, the call is from DataFrame.agg()
-             # and is made to process multiple functions.
-             # We process this differently, as we need to map and serialize the
-             # column names and the aggregate functions that operate on them.
-             # If we have just one function to be executed on the complete DataFrame, then we don't need
-             # this extra processing. Also, if the call is from DataFrame.agg(), the time series aggregate
-             # check is not required, as special Time Series aggregate functions cannot be used in
-             # DataFrame.agg().
-             return DataFrameUtils._construct_sql_expression_for_aggregations_for_agg(df, column_names, column_types,
-                                                                                      func, percentile, describe_op,
-                                                                                      **kwargs)
-
-         as_time_series_aggregate = False
-         if "as_time_series_aggregate" in kwargs.keys():
-             as_time_series_aggregate = kwargs["as_time_series_aggregate"]
-
-         if as_time_series_aggregate and func in ['bottom', 'bottom with ties', 'delta_t', 'mad', 'top',
-                                                  'top with ties']:
-             return DataFrameUtils._construct_sql_expression_for_time_series_aggregations(df, column_names,
-                                                                                          column_types, func, **kwargs)
-
-         tdp = preparer(td_dialect)
-
-         # This variable is used to decide whether the DataFrame has all columns unsupported
-         # for the provided operations.
-         all_unsupported_columns = True
-         valid_columns = []
-         invalid_columns = []
-         new_column_names = []
-         new_column_types = []
-         for column in column_names:
-             column_supported, new_column_name, new_column_type, column_aggr_expr, invalid_column_str = \
-                 DataFrameUtils._generate_aggregate_column_expression(df=df, column=column, operation=func,
-                                                                      describe_op=describe_op, percentile=percentile,
-                                                                      tdp=tdp, **kwargs)
-
-             if column_supported:
-                 all_unsupported_columns = False
-                 new_column_names.append(new_column_name)
-                 new_column_types.append(new_column_type)
-                 valid_columns.append(column_aggr_expr)
-             else:
-                 invalid_columns.append("({0} - {1})".format(column, df[column].type))
-
-         if all_unsupported_columns:
-             error_msgs = []
-             invalid_columns.sort()  # Helps in catching the columns in lexicographic order
-             error = MessageCodes.TDMLDF_AGGREGATE_UNSUPPORTED.value.format(", ".join(invalid_columns),
-                                                                            func)
-             error_msgs.append(error)
-
-             if len(valid_columns) == 0:  # No supported columns in the given list of columns
-                 raise TeradataMlException(Messages.get_message(
-                     MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR).format("\n".join(error_msgs)),
-                     MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR)
-
-         # Quote column names same as that of the Teradata reserved keywords.
-         quote_column_name = [UtilFuncs._process_for_teradata_keyword(col) for col in column_names]
-
-         # Actual columns should be retained if "drop_columns" is set to False.
-         if kwargs.get("drop_columns") is False:
-             valid_columns = quote_column_name + valid_columns
-             new_column_names = column_names + new_column_names
-             new_column_types = column_types + new_column_types
-
-         aggregate_expr = ", ".join(valid_columns)
-         return aggregate_expr, new_column_names, new_column_types
-
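
The assembly this helper performs can be illustrated standalone. The following is a minimal sketch, not the teradataml implementation; the names build_aggregate_select and SQL_FUNC_MAP are hypothetical. It only shows how per-column expressions of the form func("col") as "func_col" are joined into the comma-separated aggregate_expr returned above.

    # Simplified illustration of the expression assembly; not the teradataml implementation.
    SQL_FUNC_MAP = {"mean": "avg", "std": "stddev_samp", "min": "min", "max": "max", "sum": "sum"}

    def build_aggregate_select(columns, func):
        """Return ('avg("a") as "mean_a", ...', ['mean_a', ...]) for the given columns."""
        sql_func = SQL_FUNC_MAP[func]
        exprs, new_names = [], []
        for col in columns:
            new_name = "{0}_{1}".format(func, col)
            exprs.append('{0}("{1}") as "{2}"'.format(sql_func, col, new_name))
            new_names.append(new_name)
        return ", ".join(exprs), new_names

    expr, names = build_aggregate_select(["Feb", "Jan"], "mean")
    # expr == 'avg("Feb") as "mean_Feb", avg("Jan") as "mean_Jan"'
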
-     @staticmethod
-     def _construct_sql_expression_for_aggregations_for_agg(df, column_names, column_types, func, percentile=.5,
-                                                            describe_op=False, **kwargs):
-         """
-         Internal function to create and return the SQL expression
-         corresponding to the given operation, column_names and
-         column_types.
-
-         Column_types are used to check whether all the datatypes are
-         valid types for the given operation; an exception is thrown if they
-         are not.
-
-         PARAMETERS:
-             df:
-                 Required Argument.
-                 Specifies the teradataml DataFrame which is to be used to get the desired
-                 aggregate column expression.
-                 Types: teradataml DataFrame
-
-             column_names:
-                 Required Argument.
-                 Specifies the column names for which the desired aggregate operation is
-                 to be executed.
-                 Types: List of strings
-
-             column_types:
-                 Required Argument.
-                 Specifies the respective column types for the column names.
-                 Types: List of teradatasqlalchemy types
-
-             func:
-                 Required Argument.
-                 Specifies the aggregate function(s) to be applied on teradataml
-                 DataFrame columns.
-                 Types: string, dictionary or list of strings/functions.
-                 Accepted combinations are:
-                     1. String function name
-                     2. List of functions
-                     3. Dictionary containing column name as key and aggregate
-                        function name (string or list of strings) as value
-
-             percentile:
-                 Optional Argument.
-                 Specifies a value between 0 and 1 that can only be used with func = 'percentile'.
-                 The default is .5, which returns the 50th percentile.
-                 Types: float
-
-             describe_op:
-                 Optional Argument.
-                 Specifies a boolean flag that decides whether the aggregate operation being
-                 performed is for DataFrame.describe() or not.
-                 Types: bool
-
-             kwargs:
-                 Specifies miscellaneous keyword arguments that can be passed to aggregate functions.
-
-         RETURNS:
-             a) SQL expression, such as
-                1. 'min(col1) as min_col1, min(col2) as min_col2' if
-                   col1 and col2 are the columns in the DataFrame and the
-                   operation is 'min'
-                2. 'max(col1) as max_col1, max(col2) as max_col2' if
-                   col1 and col2 are the columns in the DataFrame and the
-                   operation is 'max'
-                3. 'min(col1) as min_col1, stddev_samp(col2) as
-                   std_col2' if col1, col2 are the columns in the
-                   DataFrame and the operations are min, std.
-                etc...
-             b) new columns' names (e.g. min_col1, min_col2 ...)
-             c) new columns' types
-
-         RAISES:
-             TeradataMLException
-             1. TDMLDF_AGGREGATE_COMBINED_ERR - If the provided
-                aggregate operations do not support specified columns.
-
-                Possible Value:
-                No results. Below is/are the error message(s):
-                All selected columns [(col1 - VARCHAR)] is/are
-                unsupported for 'sum' operation.
-
-             2. TDMLDF_INVALID_AGGREGATE_OPERATION - If the aggregate
-                operation(s) received in parameter 'func' is/are
-                invalid.
-
-                Possible Value:
-                Invalid aggregate operation(s): minimum, counter.
-                Valid aggregate operation(s): count, max, mean, min,
-                std, sum.
-
-             3. TDMLDF_AGGREGATE_INVALID_COLUMN - If any of the columns
-                specified in func is not present in the dataframe.
-
-                Possible Value:
-                Invalid column(s) given in parameter func: col1.
-                Valid column(s): A, B, C, D.
-
-         EXAMPLES:
-             col_names, col_types = \
-                 df_utils._get_column_names_and_types_from_metaexpr(
-                     self._metaexpr)
-             expr, new_col_names, new_col_types = \
-                 df_utils._construct_sql_expression_for_aggregations_for_agg(
-                     df, col_names, col_types, 'min')
-
-             expr1, new_col_names1, new_col_types1 = \
-                 df_utils._construct_sql_expression_for_aggregations_for_agg(
-                     df, col_names, col_types, ['min', 'sum'])
-
-             expr2, new_col_names2, new_col_types2 = \
-                 df_utils._construct_sql_expression_for_aggregations_for_agg(
-                     df, col_names, col_types, {'col1' : ['min', 'sum'],
-                     'col2' : 'mean'})
-
-         """
-         # If the function is a time series aggregate, we process the aggregation differently.
-         # Also, one is not supposed to pass the below time series aggregates to DataFrame.agg():
-         #     ['bottom', 'bottom with ties', 'delta_t', 'mad', 'top', 'top with ties']
-         # Thus, no extra processing is required for time series aggregates over here.
-
-         # 'operations' contains a dict of columns -> list of aggregate operations
-         operations = DataFrameUtils._validate_agg_function(func, column_names)
-
-         all_valid_columns = []
-         all_invalid_columns = {}
-         all_new_column_names = []
-         all_new_column_types = []
-
-         # For each column, the value is True if there is at least one valid operation (operation on valid datatype)
-         column_supported = {}
-         tdp = preparer(td_dialect)
-         for column in operations:
-             column_supported[column] = False
-             valid_columns = []
-             invalid_columns = {}
-             new_column_names = []
-             new_column_types = []
-             for operation in operations[column]:
-                 is_colop_supported, new_col, new_coltype, column_aggr_expr, invalid_column_info = \
-                     DataFrameUtils._generate_aggregate_column_expression(df=df, column=column, operation=operation,
-                                                                          describe_op=describe_op, percentile=percentile,
-                                                                          tdp=tdp, **kwargs)
-                 if is_colop_supported:
-                     column_supported[column] = is_colop_supported
-                     new_column_names.append(new_col)
-                     new_column_types.append(new_coltype)
-                     valid_columns.append(column_aggr_expr)
-                 else:
-                     if operation in invalid_columns:
-                         invalid_columns[operation].append(invalid_column_info)
-                     else:
-                         invalid_columns[operation] = [invalid_column_info]
-
-             all_valid_columns.extend(valid_columns)
-             all_new_column_names.extend(new_column_names)
-             all_new_column_types.extend(new_column_types)
-
-             for operation in invalid_columns:
-                 if operation in all_invalid_columns:
-                     all_invalid_columns[operation].extend(invalid_columns[operation])
-                 else:
-                     all_invalid_columns[operation] = invalid_columns[operation]
-
-         unsupported_columns = [col for col in column_supported if not column_supported[col]]
-         unsupported_columns.sort()  # Helps in catching the columns in lexicographic order
-
-         error_msgs = []
-         for operation in sorted(all_invalid_columns):
-             all_invalid_columns[operation].sort()  # Helps in catching the columns in
-                                                    # lexicographic order
-             error = MessageCodes.TDMLDF_AGGREGATE_UNSUPPORTED.value.format(
-                 ", ".join(all_invalid_columns[operation]), operation)
-             error_msgs.append(error)
-
-         if not all(column_supported[oper] for oper in column_supported):
-             new_msg = MessageCodes.TDMLDF_AGGREGATE_AGG_DICT_ERR.value.format(", ".join(unsupported_columns))
-             error_msgs.append(new_msg)
-             msg = Messages.get_message(MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR).format("\n".join(error_msgs))
-             raise TeradataMlException(msg, MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR)
-
-         elif len(all_valid_columns) == 0:  # No supported columns in the given list of columns
-             raise TeradataMlException(Messages.get_message(
-                 MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR).format("\n".join(error_msgs)),
-                 MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR)
-
-         aggregate_expr = ", ".join(all_valid_columns)
-         return aggregate_expr, all_new_column_names, all_new_column_types
-
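
DataFrame.agg() relies on _validate_agg_function (shown later in this diff) to unify the three accepted shapes of func into a dictionary of column -> list of operations. A minimal sketch of that normalization, assuming only the three documented shapes of func, might look like:

    # Hypothetical sketch of the func normalization; teradataml performs this
    # inside _validate_agg_function along with validation.
    def normalize_func(func, column_names):
        """Return a dict mapping each column to a list of operation names."""
        if isinstance(func, str):
            return {col: [func] for col in column_names}
        if isinstance(func, list):
            return {col: list(func) for col in column_names}
        # dict: values may be a single name or a list of names
        return {col: ([ops] if isinstance(ops, str) else list(ops))
                for col, ops in func.items()}

    normalize_func({'col1': ['min', 'sum'], 'col2': 'mean'}, ['col1', 'col2'])
    # {'col1': ['min', 'sum'], 'col2': ['mean']}
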
-     @staticmethod
-     def _construct_sql_expression_for_time_series_aggregations(df, column_names, column_types, func, **kwargs):
-         """
-         Internal function to create and return the SQL expression
-         corresponding to the given time series function, column_names and
-         column_types.
-
-         Column_types are used to check whether all the datatypes are
-         valid types for the given operation; an exception is thrown if they
-         are not.
-
-         NOTE:
-             This function should be used only for time series aggregates.
-
-         PARAMETERS:
-             df:
-                 Required Argument.
-                 Specifies the teradataml DataFrame which is to be used to get the desired
-                 aggregate column expression.
-                 Types: teradataml DataFrame
-
-             column_names:
-                 Required Argument.
-                 Specifies the column names for which the desired aggregate operation is
-                 to be executed.
-                 Types: List of strings
-
-             column_types:
-                 Required Argument.
-                 Specifies the respective column types for the column names.
-                 Types: List of teradatasqlalchemy types
-
-             func:
-                 Required Argument.
-                 Specifies the aggregate function(s) to be applied on teradataml
-                 DataFrame columns. For Time Series aggregates it is usually a string.
-                 Types: str
-
-             kwargs:
-                 Specifies miscellaneous keyword arguments that can be passed to aggregate functions.
-
-         RETURNS:
-             a) SQL expression, such as
-                1. 'bottom(2, "col1") as "bottom2col1"' if
-                   col1 and col2 are the columns in the DataFrame and the
-                   operation is 'bottom'
-                etc...
-             b) new columns' names (e.g. bottom2col1 ...)
-             c) new columns' types
-
-         RAISES:
-             None.
-
-         EXAMPLES:
-             colname_to_numvalues = {"col1" : 2, "col2": 3}
-             kwargs = {"colname_to_numvalues": colname_to_numvalues}
-             aggregate_expr, column_names, column_types = \
-                 df_utils._construct_sql_expression_for_time_series_aggregations(df, column_names, column_types,
-                                                                                 func, **kwargs)
-
-         """
-
-         # e.g. of column_types: [VARCHAR(length=13), INTEGER(), VARCHAR(length=60), VARCHAR(length=5),
-         #                        FLOAT(precision=0)]
-
-         # e.g. the types of each column are <class 'teradatasqlalchemy.types.VARCHAR'>,
-         # <class 'teradatasqlalchemy.types.INTEGER'>, <class 'teradatasqlalchemy.types.FLOAT'>,
-         # <class 'teradatasqlalchemy.types.INTERVAL_MINUTE_TO_SECOND'> etc..
-
-         col_names_and_types = dict(zip(column_names, column_types))
-         tdp = preparer(td_dialect)
-
-         select_columns = []
-         new_column_names = []
-         new_column_types = []
-         if func in ["bottom", "bottom with ties", "top", "top with ties"]:
-             # Processing for bottom and top.
-             # Function name to be used in column aliasing.
-             column_alias_func = func.replace(" ", "_")
-             bottom_col_val = kwargs["colname_to_numvalues"]
-             for column in sorted(list(bottom_col_val.keys())):
-                 new_col_name = "{2}{0}{1}".format(bottom_col_val[column], column, column_alias_func)
-                 quoted_parent_column_name = tdp.quote("{0}".format(column))
-                 quoted_new_column_name = tdp.quote(new_col_name)
-                 select_columns.append("{0}({1}, {2}) as {3}".format(func, bottom_col_val[column],
-                                                                     quoted_parent_column_name, quoted_new_column_name))
-                 new_column_names.append(new_col_name)
-                 new_column_types.append(col_names_and_types[column])
-
-         if func == "delta_t":
-             # Argument processing for DELTA-T
-             new_column_names.append("delta_t_td_timecode")
-             quoted_new_column_name = tdp.quote(new_column_names[0])
-             new_column_types.append(PERIOD_TIMESTAMP)
-             select_columns.append("{0}((WHERE {1}), (WHERE {2})) as {3}".format(func, kwargs["start_condition"],
-                                                                                 kwargs["end_condition"],
-                                                                                 quoted_new_column_name))
-
-         if func == 'mad':
-             # Processing for Median Absolute Deviation.
-             # Function name to be used in column aliasing.
-             column_alias_func = func.replace(" ", "_")
-             bottom_col_val = kwargs["colname_to_numvalues"]
-             for column in sorted(list(bottom_col_val.keys())):
-                 new_col_name = "{2}{0}{1}".format(bottom_col_val[column], column, column_alias_func)
-                 quoted_parent_column_name = tdp.quote("{0}".format(column))
-                 quoted_new_column_name = tdp.quote(new_col_name)
-                 select_columns.append("{0}({1}, {2}) as {3}".format(func, bottom_col_val[column],
-                                                                     quoted_parent_column_name, quoted_new_column_name))
-                 new_column_names.append(new_col_name)
-                 if type(col_names_and_types[column]) in [DECIMAL, NUMBER]:
-                     # If the column type is DECIMAL or NUMBER, then the output column type should also be the same.
-                     # Otherwise, it is FLOAT.
-                     new_column_types.append(col_names_and_types[column])
-                 else:
-                     new_column_types.append(FLOAT())
-
-         if "default_constant_for_columns" in kwargs.keys():
-             column_names = kwargs["default_constant_for_columns"]
-             column_types = [col_names_and_types[column] for column in column_names]
-             if len(column_names) > 0:
-                 aggregate_expr, all_new_column_names, all_new_column_types = \
-                     DataFrameUtils._construct_sql_expression_for_aggregations(df=df, column_names=column_names,
-                                                                               column_types=column_types, func=func,
-                                                                               )
-                 aggregate_expr_default_column_list = [col.strip() for col in aggregate_expr.split(",")]
-                 select_columns = select_columns + aggregate_expr_default_column_list
-                 new_column_names = new_column_names + all_new_column_names
-                 new_column_types = new_column_types + all_new_column_types
-
-         aggregate_expr = ", ".join(select_columns)
-         return aggregate_expr, new_column_names, new_column_types
-
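
A condensed, standalone illustration of the 'bottom'/'top' expression construction above (illustrative only; quoting via tdp.quote is replaced with literal double quotes), for func = "bottom" and kwargs = {"colname_to_numvalues": {"col1": 2}}:

    func = "bottom"
    colname_to_numvalues = {"col1": 2}
    select_columns = []
    for col in sorted(colname_to_numvalues):
        n = colname_to_numvalues[col]
        alias = "{0}{1}{2}".format(func.replace(" ", "_"), n, col)
        select_columns.append('{0}({1}, "{2}") as "{3}"'.format(func, n, col, alias))
    # select_columns == ['bottom(2, "col1") as "bottom2col1"']
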
-     @staticmethod
-     def _construct_describe_query(df, metaexpr, percentiles, function_label, groupby_column_list=None,
-                                   include=None, is_time_series_aggregate=False, verbose=False, distinct=False,
-                                   statistics=None, **kwargs):
-         """
-         Internal function to create the SQL query for describe().
-
-         PARAMETERS:
-             df:
-                 Required Argument.
-                 Specifies the teradataml DataFrame we are collecting statistics for.
-                 Types: teradataml DataFrame
-
-             metaexpr:
-                 Required Argument.
-                 Specifies the meta expression for the dataframe.
-                 Types: _MetaExpression
-
-             percentiles:
-                 Required Argument.
-                 Specifies a list of values between 0 and 1.
-                 Types: List of floats
-
-             function_label:
-                 Required Argument.
-                 Specifies a string value used as the label for the aggregate function column.
-                 Types: str
-
-             groupby_column_list:
-                 Optional Argument.
-                 Specifies the group by columns for the dataframe.
-                 Default Values: None.
-                 Types: str or List of strings (str)
-
-             include:
-                 Optional Argument.
-                 Specifies a string that must be "all" or None. If "all", then all columns will be included.
-                 Otherwise, only numeric columns are used for collecting statistics.
-                 Default Values: None.
-                 Types: str
-
-             is_time_series_aggregate:
-                 Optional Argument.
-                 Specifies a flag stating whether the describe operation is a time series aggregate or not.
-                 Default Values: False.
-                 Types: bool
-
-             verbose:
-                 Optional Argument.
-                 Specifies a flag stating whether the DESCRIBE VERBOSE option for time series aggregate is to be
-                 performed or not.
-                 Default Values: False.
-                 Types: bool
-
-             distinct:
-                 Optional Argument.
-                 Specifies a flag that decides whether to consider duplicate rows in the calculation or not.
-                 Default Values: False
-                 Types: bool
-
-             statistics:
-                 Optional Argument.
-                 Specifies the list of aggregate operations to be used in place of the default
-                 operators. Used only when include is None.
-                 Default Values: None.
-                 Types: list of str
-
-             kwargs:
-                 Optional Arguments.
-                 Keyword arguments for time series aggregate functions.
-
-         RETURNS:
-             A SQL query like:
-                 select 'count' as "func", cast(count("Feb") as Number) as "Feb", cast(count(accounts) as Number) as accounts from "PYUSER"."salesview"
-                 union all
-                 select 'mean' as "func", cast(avg("Feb") as Number) as "Feb", null as accounts from "PYUSER"."salesview"
-                 union all
-                 select 'std' as "func", cast(stddev_samp("Feb") as Number) as "Feb", null as accounts from "PYUSER"."salesview"
-                 union all
-                 select 'min' as "func", cast(min("Feb") as Number) as "Feb", cast(min(accounts) as Number) as accounts from "PYUSER"."salesview"
-                 union all
-                 select '25%' as "func", percentile_cont(0.25) within group(order by cast("Feb" as Number) ) as "Feb", null as accounts from "PYUSER"."salesview"
-                 union all
-                 select '50%' as "func", percentile_cont(0.5) within group(order by cast("Feb" as Number) ) as "Feb", null as accounts from "PYUSER"."salesview"
-                 union all
-                 select '75%' as "func", percentile_cont(0.75) within group(order by cast("Feb" as Number) ) as "Feb", null as accounts from "PYUSER"."salesview"
-                 union all
-                 select 'max' as "func", cast(max("Feb") as Number) as "Feb", cast(max(accounts) as Number) as accounts from "PYUSER"."salesview"
-
-         RAISES:
-             TeradataMLException
-
-         EXAMPLES:
-             agg_query = \
-                 df_utils._construct_describe_query(df, df._metaexpr, [.25, .5, .75], "func", groupby_column_list)
-             agg_query = \
-                 df_utils._construct_describe_query(df, df._metaexpr, [.3, .6], "func", groupby_column_list, include="all")
-
-         """
-         table_name = df._table_name
-         operators = ["count", "mean", "std", "min", "percentile", "max"]
-         all_operators = ["count", "unique", "mean", "std", "min", "percentile", "max"]
-
-         if is_time_series_aggregate and verbose:
-             # Time Series Aggregate Operators for Vantage DESCRIBE function with verbose
-             operators = ['max', 'mean', 'median', 'min', 'mode', "percentile", 'std']
-         elif is_time_series_aggregate and not verbose:
-             # Time Series Aggregate Operators for Vantage DESCRIBE function.
-             operators = ['max', 'mean', 'min', 'std']
-
-         col_names = []
-         col_types = []
-         sel_agg_stmts = []
-         tdp = preparer(td_dialect)
-         quoted_function_label = tdp.quote(function_label)
-
-         if include is not None and include == 'all' and not is_time_series_aggregate:
-             operators = all_operators
-
-         if include is None and statistics is not None:
-             operators = statistics
-
-         table_name, sel_groupby, groupby = DataFrameUtils()._process_groupby_clause(table_name, groupby_column_list,
-                                                                                     is_time_series_aggregate, **kwargs)
-
-         for col in metaexpr.c:
-             if (include is None and type(col.type) in UtilFuncs()._get_numeric_datatypes()) or include == 'all' or statistics is not None:
-                 if not (groupby is not None and col.name in groupby_column_list):
-                     col_names.append(col.name)
-                     col_types.append(col.type)
-
-         if len(col_names) == 0:
-             raise TeradataMlException(
-                 Messages.get_message(MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR,
-                                      "The DataFrame does not contain numeric columns"),
-                 MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR)
-
-         for op in operators:
-             if op == "percentile":
-                 for p in percentiles:
-                     agg_expr, new_col_names, new_col_types = \
-                         DataFrameUtils._construct_sql_expression_for_aggregations(df,
-                             col_names, col_types, op, percentile=p, describe_op=True, distinct=distinct,
-                             as_time_series_aggregate=is_time_series_aggregate)
-                     sel_agg_stmts.append("SELECT \n\t{4} \n\tcast('{0}%' as varchar(6)) as \"{1}\", {2} from {3} ".format(
-                         int(p*100), quoted_function_label, agg_expr, table_name, sel_groupby))
-             else:
-                 agg_expr, new_col_names, new_col_types = \
-                     DataFrameUtils._construct_sql_expression_for_aggregations(df,
-                         col_names, col_types, op, describe_op=True, distinct=distinct,
-                         as_time_series_aggregate=is_time_series_aggregate)
-                 sel_agg_stmts.append("SELECT \n\t{4} \n\tcast('{0}' as varchar(6)) as \"{1}\", \n\t{2} \nfrom \n\t{3} ".format(
-                     op, quoted_function_label, agg_expr, table_name, sel_groupby))
-         return " \nunion all\n ".join(sel_agg_stmts)
-
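
The stitching of one SELECT per statistic into a single UNION ALL query can be illustrated standalone. This is a simplified, hypothetical skeleton (a single hard-coded column and table, and only the cast-based branch), not the actual implementation:

    # Illustrative skeleton of the describe-query assembly above.
    stats = ["count", "mean", "min", "max"]
    table_name = '"PYUSER"."salesview"'   # hypothetical table
    selects = []
    for op in stats:
        sql_func = "avg" if op == "mean" else op
        agg_expr = 'cast({0}("Feb") as Number) as "Feb"'.format(sql_func)
        selects.append('SELECT cast(\'{0}\' as varchar(6)) as "func", {1} from {2}'.format(
            op, agg_expr, table_name))
    query = " \nunion all\n ".join(selects)
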
-     @staticmethod
-     def _process_groupby_clause(table_name, groupby_column_list, is_time_series_aggregate, **kwargs):
-         """
-         Internal function used to process and generate the GROUP BY or GROUP BY TIME clause required for
-         the query to be run for the describe operation.
-
-         PARAMETERS:
-             table_name:
-                 Required Argument.
-                 Specifies the table name to be used for forming the describe query.
-                 Types: str
-
-             groupby_column_list:
-                 Required Argument.
-                 Specifies the list of column names involved in the Group By.
-                 Types: List of Strings.
-
-             is_time_series_aggregate:
-                 Required Argument.
-                 Specifies a boolean stating whether the GROUP BY clause to be formed is for a
-                 Time series aggregate or not.
-                 Types: bool
-
-             kwargs:
-                 Optional Arguments.
-                 Keyword arguments for time series aggregate functions.
-
-         RETURNS:
-             1. Table Name appended with the GROUP BY clause.
-             2. Column projection string for the GROUP BY columns.
-             3. Group By Clause.
-
-         RAISES:
-             None.
-
-         EXAMPLES:
-             table_name, sel_groupby, groupby = DataFrameUtils()._process_groupby_clause(table_name, groupby_column_list,
-                                                                                         is_time_series_aggregate, **kwargs)
-
-         """
-         sel_groupby = ""
-         grp_by_clause = None
-
-         if is_time_series_aggregate:
-             # For a time series aggregate, timebucket_duration is a must, so it will always be present in kwargs.
-             grp_by_clause = "GROUP BY TIME ({0}".format(kwargs['timebucket_duration'])
-
-             # Add columns in value expression to GROUP BY TIME
-             if 'value_expression' in kwargs and \
-                     kwargs['value_expression'] is not None and \
-                     len(kwargs['value_expression']) > 0:
-                 grp_by_clause = "{0} and {1}".format(grp_by_clause, ", ".join(kwargs['value_expression']))
-
-             # Complete the parenthesis for GROUP BY TIME
-             grp_by_clause = "{0})".format(grp_by_clause)
-
-             # Add Time code column information.
-             if 'timecode_column' in kwargs and \
-                     kwargs['timecode_column'] is not None and \
-                     len(kwargs['timecode_column']) > 0:
-                 if 'sequence_column' in kwargs and \
-                         kwargs['sequence_column'] is not None and \
-                         len(kwargs['sequence_column']) > 0:
-                     grp_by_clause = "{0} USING TIMECODE({1}, {2})".format(grp_by_clause, kwargs['timecode_column'],
-                                                                           kwargs['sequence_column'])
-                 else:
-                     grp_by_clause = "{0} USING TIMECODE({1})".format(grp_by_clause, kwargs['timecode_column'])
-
-             # Add Fill information
-             if 'fill' in kwargs and kwargs['fill'] is not None and len(kwargs['fill']) > 0:
-                 grp_by_clause = "{0} FILL({1})".format(grp_by_clause, kwargs['fill'])
-
-         else:
-             if groupby_column_list is not None:
-                 grp_by_clause = "GROUP BY {0}".format(",".join(groupby_column_list))
-
-         if grp_by_clause is not None:
-             table_name = "{0} \n{1}".format(table_name, grp_by_clause)
-             tdp = preparer(td_dialect)
-             for g in groupby_column_list:
-                 if is_time_series_aggregate:
-                     if g == "TIMECODE_RANGE":
-                         g = "$TD_TIMECODE_RANGE"
-
-                     if "GROUP BY TIME" in g:
-                         g = "$TD_GROUP_BY_TIME"
-
-                 quoted_name = tdp.quote(g)
-                 sel_groupby += "{0}, ".format(quoted_name)
-
-         return table_name, sel_groupby, grp_by_clause
-
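
Tracing the time series branch end to end helps: the clause is grown in place by repeated string formatting. A minimal sketch with hypothetical kwargs values:

    kwargs = {"timebucket_duration": "MINUTES(10)",
              "value_expression": ["region"],
              "timecode_column": "TD_TIMECODE",
              "fill": "NULLS"}
    clause = "GROUP BY TIME ({0}".format(kwargs["timebucket_duration"])
    if kwargs.get("value_expression"):
        clause = "{0} and {1}".format(clause, ", ".join(kwargs["value_expression"]))
    clause = "{0})".format(clause)
    if kwargs.get("timecode_column"):
        clause = "{0} USING TIMECODE({1})".format(clause, kwargs["timecode_column"])
    if kwargs.get("fill"):
        clause = "{0} FILL({1})".format(clause, kwargs["fill"])
    # clause == 'GROUP BY TIME (MINUTES(10) and region) USING TIMECODE(TD_TIMECODE) FILL(NULLS)'
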
-     @staticmethod
-     def _get_column_names_and_types_from_metaexpr(metaexpr):
-         """
-         Internal function to return column names and respective types
-         given _metaexpr.
-
-         PARAMETERS:
-             metaexpr:
-                 Required Argument.
-                 Dataframe's metaexpr. It is used to get column names and types.
-                 Types: MetaExpression
-
-         RETURNS:
-             Two lists - one for column names and another for column types
-
-         RAISES:
-             None
-
-         EXAMPLES:
-             dfUtils._get_column_names_and_types_from_metaexpr(
-                 df._metaexpr)
-         """
-         # Constructing New Column names & Types for selected columns ONLY using Parent _metaexpr
-         col_names = []
-         col_types = []
-         for c in metaexpr.c:
-             col_names.append(c.name)
-             col_types.append(c.type)
-
-         return col_names, col_types
-
-     @staticmethod
-     def _insert_all_from_table(to_table_name, from_table_name, column_list, schema_name,
-                                temporary=False):
-         """
-         Inserts all records from one table into the second, using columns ordered by the column list.
-
-         PARAMETERS:
-             to_table_name   - String specifying the name of the SQL table to insert to.
-             from_table_name - String specifying the name of the SQL table to insert from.
-             column_list     - List of strings specifying the column names used in the insertion.
-             schema_name     - Name of the database schema to insert table data into.
-             temporary       - Specifies whether to create Vantage tables as permanent or volatile.
-                               Default: False
-                               Note: When True:
-                                     1. volatile tables are created, and
-                                     2. schema_name is ignored.
-                               When False, permanent tables are created.
-
-         RETURNS:
-             None
-
-         RAISES:
-             Database error if an error occurred while executing the insert command.
-
-         EXAMPLES:
-             df_utils._insert_all_from_table('table1_name', 'table2_name', ['col1', 'col2', 'col3'], None)
-         """
-         tdp = preparer(td_dialect)
-
-         # Construct INSERT command.
-         column_order_string = ', '.join([tdp.quote("{0}".format(element)) for element in column_list])
-
-         if schema_name:
-             full_to_table_name = tdp.quote(schema_name) + "." + tdp.quote(to_table_name)
-         elif temporary:
-             full_to_table_name = tdp.quote(to_table_name)
-         else:
-             full_to_table_name = tdp.quote(_get_current_databasename()) + "." + tdp.quote(
-                 to_table_name)
-
-         insert_sql = SQLBundle._build_insert_from_table_query(full_to_table_name, from_table_name, column_order_string)
-
-         # Execute INSERT command.
-         return UtilFuncs._execute_ddl_statement(insert_sql)
-
-     @staticmethod
-     def _dataframe_has_column(data, column):
-         """
-         Function to check whether the given column name is present in the given DataFrame or not.
-         This function is currently used only for Analytics wrappers.
-
-         PARAMETERS:
-             data   - teradataml DataFrame to check against for column existence.
-             column - Column name (a string).
-
-         RAISES:
-             None
-
-         EXAMPLES:
-             DataFrameUtils._dataframe_has_column(data, col)
-         """
-         if column in [c.name for c in data._metaexpr.c]:
-             return True
-
-         return False
-
-     @staticmethod
-     def _get_row_count(table_name):
-         """
-         Function to return the row count of a teradataml Dataframe.
-         This function is used currently to determine the shape/size of a dataframe.
-
-         PARAMETERS:
-             table_name - Name of the table to get the row count for.
-
-         RAISES:
-             TeradataMlException (TDMLDF_INFO_ERROR)
-
-         EXAMPLES:
-             DataFrameUtils._get_row_count(table_name)
-         """
-         # Construct COUNT(*) Query
-         try:
-             row_count_query = SQLBundle._build_nrows_print_query(table_name)
-             res = execute_sql(row_count_query)
-             return res.fetchone()[0]
-
-         except TeradataMlException:
-             raise
-
-         except Exception as err:
-             # TODO Better handle the level of information being presented to the user with logging
-             raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR) + str(err),
-                                       MessageCodes.TDMLDF_INFO_ERROR) from err
-
-     @staticmethod
-     def _get_scalar_value(table_name):
-         """
-         Function to return the only 1x1 (scalar) value from a teradataml Dataframe.
-
-         PARAMETERS:
-             table_name - Name of the table to get the value from.
-
-         RAISES:
-             TeradataMlException (TDMLDF_INFO_ERROR)
-
-         EXAMPLES:
-             DataFrameUtils._get_scalar_value(table_name)
-         """
-         # Construct the base Query
-         try:
-             select_query = SQLBundle._build_base_query(table_name)
-             res = execute_sql(select_query)
-             return res.fetchone()[0]
-
-         except TeradataMlException:
-             raise
-
-         except Exception as err:
-             # TODO Better handle the level of information being presented to the user with logging
-             raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR) + str(err),
-                                       MessageCodes.TDMLDF_INFO_ERROR) from err
-
-     @staticmethod
-     def _get_sorted_nrow(df, n, sort_col, asc=True):
-         """
-         Internal utility function that returns a teradataml DataFrame containing n rows
-         of the DataFrame, sorted on the given sort column.
-
-         PARAMETERS:
-             df: teradataml DataFrame
-             n: Specifies the number of rows to select.
-                Type: int
-             sort_col: The column to sort on.
-                Type: str
-             asc: (optional) - Specifies the sort order.
-                If True, sort in ascending order.
-                If False, sort in descending order.
-                The default value is True.
-                Type: boolean
-
-         RETURNS:
-             teradataml DataFrame
-
-         EXAMPLES:
-             DataFrameUtils._get_sorted_nrow(df, 10, "col1")
-             DataFrameUtils._get_sorted_nrow(df, 20, "col1", asc=True)
-             DataFrameUtils._get_sorted_nrow(df, 30, "col1", asc=False)
-
-         """
-         # TODO: implement and use this in teradatasqlalchemy
-         tdp = preparer(td_dialect)
-         aed_utils = AedUtils()
-
-         sort_order = "asc"
-         if not asc:
-             sort_order = "desc"
-
-         quoted_cols = [tdp.quote(c) for c in df.columns]
-         sel_cols_str = ",".join(quoted_cols)
-         sel_row_num = "row_number() over (order by \"{0}\" {1}) - 1 as tdml_row_num, {2}".format(sort_col, sort_order, sel_cols_str)
-         filter_str = "tdml_row_num < {0}".format(n)
-         sel_nodeid = aed_utils._aed_select(df._nodeid, sel_row_num)
-         fil_nodeid = aed_utils._aed_filter(sel_nodeid, filter_str)
-         sel2_nodeid = aed_utils._aed_select(fil_nodeid, sel_cols_str)
-         col_names, col_types = __class__._get_column_names_and_types_from_metaexpr(df._metaexpr)
-         new_metaexpr = UtilFuncs._get_metaexpr_using_columns(df._nodeid, zip(col_names, col_types))
-         # Call the function from_node from the appropriate class, either DataFrame or GeoDataFrame
-         new_df = df.__class__._from_node(sel2_nodeid, new_metaexpr, df._index_label)
-         new_df._orderby = df._orderby
-         new_df._metaexpr._n_rows = n
-         return new_df
-
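
The projection and filter strings built above are what drive the generated SQL. A small reconstruction (illustrative only; column names are hypothetical and quoting is hard-coded):

    sort_col, sort_order, n = "id", "asc", 10
    sel_cols_str = ",".join('"{0}"'.format(c) for c in ["id", "Feb", "Jan"])
    sel_row_num = 'row_number() over (order by "{0}" {1}) - 1 as tdml_row_num, {2}'.format(
        sort_col, sort_order, sel_cols_str)
    filter_str = "tdml_row_num < {0}".format(n)
    # The node tree then selects sel_row_num, filters on filter_str, and
    # re-selects the original columns, dropping tdml_row_num.
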
-     @staticmethod
-     def _get_database_names(connection, schema_name):
-         """
-         Function to return a list of valid database names for a given sqlalchemy connection.
-         This function is used to determine whether the database used is valid in user APIs such as copy_to_sql.
-
-         PARAMETERS:
-             connection: Required Argument.
-                 A SQLAlchemy connection object.
-
-             schema_name: Required Argument.
-                 String specifying the requested schema name.
-
-         RAISES:
-             TeradataMlException (TDMLDF_INFO_ERROR)
-
-         EXAMPLES:
-             DataFrameUtils._get_database_names(get_connection(), schema_name)
-         """
-         # TODO: implement and use this in teradatasqlalchemy
-         table_obj = table('databasesV', column('databasename'), schema='dbc')
-         stmt = select(text(str(func.lower(table_obj.c.databasename)) + ' as databasename')).where(
-             text('databasename (NOT CASESPECIFIC) = {} (NOT CASESPECIFIC)'.format(':schema_name')))
-         stmt = text(str(stmt))
-         stmt = stmt.bindparams(schema_name=schema_name)
-         res = connection.execute(stmt).fetchall()
-         return [name.databasename for name in res]
+ # -*- coding: utf-8 -*-
+ """
+
+ Unpublished work.
+ Copyright (c) 2018 by Teradata Corporation. All rights reserved.
+ TERADATA CORPORATION CONFIDENTIAL AND TRADE SECRET
+
+ Primary Owner: mark.sandan@teradata.com
+ Secondary Owner:
+
+ This file implements utility functions for DataFrame.
+ """
+
+ import numbers
+ import pandas as pd
+ from collections import OrderedDict
+
+ from teradataml.common.utils import UtilFuncs
+ from teradataml.common.aed_utils import AedUtils
+ from teradataml.common.constants import AEDConstants, PTITableConstants, \
+     SQLPattern, PythonTypes
+ from teradataml.common.sqlbundle import SQLBundle
+ from teradataml.common.exceptions import TeradataMlException
+ from teradataml.common.messages import Messages
+ from teradataml.common.messagecodes import MessageCodes
+
+ from teradataml.context.context import get_context, get_connection
+ from teradataml.context.context import _get_current_databasename
+ from teradataml.dbutils.dbutils import _execute_query_and_generate_pandas_df
+
+ from teradataml.options.display import display
+ from teradataml.options.configure import configure
+ from teradataml.utils.utils import execute_sql
+
+ from teradatasqlalchemy.types import FLOAT, NUMBER, DECIMAL, PERIOD_TIMESTAMP
+ from teradatasqlalchemy.dialect import preparer, dialect as td_dialect
+ import teradataml.dataframe as tdmldf
+ from teradataml.dataframe.sql_interfaces import ColumnExpression
+
+ from sqlalchemy.sql import select
+ from sqlalchemy.sql.expression import text
+ from sqlalchemy import table, column, func
+ from datetime import datetime, date, time
+ from decimal import Decimal
+
+ # TODO - Need to write unit testcases for these functions
+ class DataFrameUtils():
+
+     @staticmethod
+     def _execute_node_return_db_object_name(nodeid, metaexpression=None):
+         """
+         Fetches queries and view names from the AED node and creates views from the queries.
+         Additionally inspects the metaexpression for consistency.
+
+         PARAMETERS:
+             nodeid: nodeid to execute
+             metaexpression: (optional) updated _metaexpr to validate
+
+         EXAMPLES:
+             _execute_node_return_db_object_name(nodeid)
+             _execute_node_return_db_object_name(nodeid, metaexpr)
+
+         RETURNS:
+             Top level view name.
+
+         """
+         aed_obj = AedUtils()
+         if not aed_obj._aed_is_node_executed(nodeid):
+             view_query_node_type_list = aed_obj._aed_get_exec_query(nodeid)
+             view_names, queries, node_query_types, node_ids = view_query_node_type_list
+
+             # Executing Nodes / Creating Views
+             for index in range(len(queries) - 1, -1, -1):
+                 is_persist = False
+                 if metaexpression and metaexpression._is_persist:
+                     is_persist = True
+
+                 try:
+                     if node_query_types[index] == AEDConstants.AED_QUERY_NODE_TYPE_ML_QUERY_MULTI_OUTPUT.value or \
+                        ("OUT TABLE " in queries[index] and SQLPattern.SQLMR.value.match(queries[index])) or \
+                        is_persist:
+                         # TODO:: OR condition in above needs to be removed once AED support is added.
+                         UtilFuncs._create_table(view_names[index], queries[index])
+
+                     elif node_query_types[index] in ['groupby', 'groupbytime']:
+                         # If the query type is either groupby or groupbytime, get its parent
+                         # nodeid and execute queries for the same.
+                         parent_nodeid = aed_obj._aed_get_parent_nodeids(nodeid)[0]
+                         DataFrameUtils._execute_node_return_db_object_name(parent_nodeid)
+
+                     elif node_query_types[index] == AEDConstants.AED_QUERY_NODE_TYPE_REFERENCE.value:
+                         # Reference nodes - To be ignored.
+                         pass
+
+                     else:
+                         UtilFuncs._create_view(view_names[index], queries[index])
+
+                     # Updating Node Status for executed Node
+                     aed_obj._aed_update_node_state_single(node_ids[index], AEDConstants.AED_NODE_EXECUTED.value)
+
+                 except Exception as emsg:
+                     # TODO:: Append node execution details to emsg.
+                     # Node description, such as nodeType or node operation, should be added
+                     # here in 'emsg' to give away more information on where exactly
+                     # node execution failed.
+                     raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_EXEC_SQL_FAILED, str(emsg)),
+                                               MessageCodes.TDMLDF_EXEC_SQL_FAILED)
+
+         # Setting New Table name retrieved to TDML DF
+         result_table_view_name = aed_obj._aed_get_tablename(nodeid)
+         # validate the metaexpression
+         if configure._validate_metaexpression:
+             DataFrameUtils._validate_metaexpression(result_table_view_name, metaexpression)
+
+         return result_table_view_name
+
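
The loop above walks the pending queries from the last element to the first, so the deepest dependency is created before anything that references it. A toy illustration of that bottom-up order:

    queries = ["create top-level view v1", "create view v2", "create view v3 (deepest dependency)"]
    for index in range(len(queries) - 1, -1, -1):
        print(queries[index])   # prints v3, then v2, then v1: dependencies first
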
+     @staticmethod
+     def _validate_metaexpression(result_table_view_name, metaexpression):
+         """
+         Inspects the metaexpression for consistency with the underlying table/view.
+
+         PARAMETERS:
+             result_table_view_name: a string representing the table/view name to check column metadata
+             metaexpression: the metaexpr of the DataFrame to compare against the result_table_view_name
+
+         EXAMPLES:
+             _validate_metaexpression('t1', df._metaexpr)
+
+         RETURNS:
+             None
+
+         RAISES:
+             RuntimeError if mismatches are found
+
+         """
+         # metaexpression should have already been updated
+         if metaexpression is not None:
+             name = lambda x: x[0]
+             type_ = lambda x: x[1]
+
+             # compare sorted by name of column
+             df = sorted(UtilFuncs._describe_column(DataFrameUtils._get_metadata_from_table(result_table_view_name)), key=lambda x: x[0])
+             meta = sorted(metaexpression.c, key=lambda x: x.name)
+
+             # check length
+             if len(df) == len(meta):
+                 for i in range(len(df)):
+                     # map Teradata type to python type
+                     meta_type = UtilFuncs._teradata_type_to_python_type(meta[i].type)
+
+                     # compare column names and types
+                     if meta[i].name != name(df[i]) or meta_type != type_(df[i]):
+                         err_msg = "[Mismatch when checking %s]\n\t[Table/View] %s %s\n\t[MetaExpression] %s %s (mapped from => %s)\n"
+                         raise RuntimeError(err_msg % (result_table_view_name,
+                                                       name(df[i]), type_(df[i]),
+                                                       meta[i].name, meta_type, meta[i].type))
+             else:
+                 err_msg = "[Length mismatch when checking %s]\nSource Table/View has length %s but MetaExpression has length %s"
+                 raise RuntimeError(err_msg % (result_table_view_name, len(df), len(meta)))
+
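
The core of the check is a pairwise comparison of two name-sorted column lists. A minimal, self-contained sketch of the same idea (hypothetical literal data in place of the HELP COLUMN results and the metaexpression):

    table_cols = sorted([("Feb", "float"), ("accounts", "str")])
    meta_cols = sorted([("accounts", "str"), ("Feb", "float")])
    if len(table_cols) != len(meta_cols):
        raise RuntimeError("length mismatch")
    for (t_name, t_type), (m_name, m_type) in zip(table_cols, meta_cols):
        if t_name != m_name or t_type != m_type:
            raise RuntimeError("mismatch: {0} vs {1}".format((t_name, t_type), (m_name, m_type)))
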
+     @staticmethod
+     def _get_dataframe_print_string(table_name, index_label, orderby=None, undropped_index=None):
+         """
+         Builds string output for teradataml DataFrame.
+
+         PARAMETERS:
+             table_name - Name of the database table to read from.
+             index_label - String/List specifying column to use as index.
+             orderby - order expression to sort returned rows
+
+         EXAMPLES:
+             _get_dataframe_print_string('table_name', None, None)
+
+         RETURNS:
+             String representation of a pandas DataFrame.
+
+         """
+         read_query = SQLBundle._build_top_n_print_query(table_name, display.max_rows, orderby)
+
+         if index_label is not None:
+             pandas_df = _execute_query_and_generate_pandas_df(read_query, index=index_label)
+         else:
+             pandas_df = _execute_query_and_generate_pandas_df(read_query)
+
+         return pandas_df.to_string()
+
+     @staticmethod
+     def _get_pprint_dtypes(column_names_and_types, null_count=False):
+         """
+         Returns a string containing the column names and types.
+         If null_count is True, the string will also contain
+         the number of non-null values for each column.
+
+         PARAMETERS:
+             column_names_and_types - List of column names and types.
+             null_count (optional)  - Specifies whether to include the non-null count
+                                      for each column; when True, each entry in
+                                      column_names_and_types also carries that count.
+
+         EXAMPLES:
+             >>> print(_get_pprint_dtypes(column_names_and_types))
+             accounts      str
+             Feb         float
+             Jan           int
+             Mar           int
+             Apr           int
+             datetime      str
+
+             >>> print(_get_pprint_dtypes(column_names_and_types, null_count=True))
+             accounts    3 non-null str
+             Feb         3 non-null float
+             Jan         3 non-null int
+             Mar         3 non-null int
+             Apr         3 non-null int
+             datetime    3 non-null str
+
+         RAISES:
+
+         """
+         col_names = [i[0] for i in column_names_and_types]
+         col_types = [i[1] for i in column_names_and_types]
+         max_col_names = len(max(col_names, key=len)) + 4
+         max_col_types = len(max(col_types, key=len))
+         dtypes_string = ""
+         if not null_count:
+             for colname, coltype in column_names_and_types:
+                 dtypes_string += "{0: <{name_width}}{1: >{type_width}}\n".format(colname, coltype,
+                                                                                  name_width=max_col_names,
+                                                                                  type_width=max_col_types)
+         else:
+             null_count = [i[2] for i in column_names_and_types]
+             max_null_count = len(str(max(null_count, key=len)))
+             for colname, coltype, num_nulls in column_names_and_types:
+                 dtypes_string += "{0: <{name_width}}{1: <{count_width}} non-null {2: <{type_width}}\n".format(colname,
+                                                                                                               num_nulls,
+                                                                                                               coltype,
+                                                                                                               name_width=max_col_names,
+                                                                                                               count_width=max_null_count,
+                                                                                                               type_width=max_col_types)
+         # Remove last new line character.
+         dtypes_string = dtypes_string[:-1]
+         return dtypes_string
+
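
The width computation pads every name to the longest name plus four spaces and right-aligns the types. A condensed, standalone version of the no-null-count branch:

    column_names_and_types = [("accounts", "str"), ("Feb", "float"), ("Jan", "int")]
    name_width = len(max((c[0] for c in column_names_and_types), key=len)) + 4
    type_width = len(max((c[1] for c in column_names_and_types), key=len))
    lines = ["{0: <{nw}}{1: >{tw}}".format(n, t, nw=name_width, tw=type_width)
             for n, t in column_names_and_types]
    print("\n".join(lines))
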
+     @staticmethod
+     def _get_metadata_from_table(table_name):
+         """
+         Retrieves column metadata by executing a HELP COLUMN command.
+
+         PARAMETERS:
+             table_name - The table name or view name.
+
+         RETURNS:
+             Returns the result set (column information) from HELP COLUMN.
+
+         RAISES:
+             Database error if an error occurred while executing the HELP COLUMN.
+
+         EXAMPLES:
+             df = DataFrame.from_table('mytab')
+             metadata = _get_metadata_from_table(df._table_name)
+         """
+         # Construct HELP COLUMN command.
+         help_col_sql = SQLBundle._build_help_column(table_name)
+         # Execute HELP COLUMN command.
+         return UtilFuncs._execute_query(help_col_sql)
+
+     @staticmethod
+     def _extract_select_string(select_expression):
+         """
+         Takes in a string/list representing a Pandas selection clause of any of the forms (only):
+             a) "col1" or 'col1'
+             b) ["col 1"] or ['col 1']
+             c) ["col1", "col2", "col3"] or ['col1', 'col2', 'col3']
+             d) [['col1', 'col2', 'col3']] or [["col1", "col2", "col3"]]
+
+         And returns a list with column strings representing the selection of the form:
+             a) ['col1']
+             b) ['col 1']
+             c) ['col1','col2','col3']
+             d) ['col1','col2','col3']
+
+         Column Names ("col1", "col2"..) are Strings representing database table Columns.
+         All standard Teradata data types for columns are supported, e.g. INTEGER, VARCHAR(5), FLOAT.
+
+         PARAMETERS:
+             select_expression - Expression representing column selection
+                                 Type - String or List of Strings or List of List (Single level only)
+                                 Required - Yes
+
+         EXAMPLES:
+             DataFrameUtils._extract_select_string([['col1', 'col2']])
+             DataFrameUtils._extract_select_string("col1")
+             DataFrameUtils._extract_select_string(["col1"])
+             DataFrameUtils._extract_select_string(["col1","col2","col3"])
+
+         RETURNS:
+             List of Strings representing column names.
+
+         RAISES:
+             TeradataMlException
+         """
+         tdp = preparer(td_dialect)
+         column_list = []
+
+         # Single String column
+         if isinstance(select_expression, str):
+             # Error handling - Empty String
+             if select_expression == "":
+                 raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY),
+                                           MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY)
+             else:
+                 column_list.append(tdp.quote("{0}".format(select_expression.strip())))
+
+         # Error Handling - [], [""], [None], ["None"], ['col1', None], ['col1', '']
+         elif isinstance(select_expression, list) and (len(select_expression) == 0 or
+                 any(element in [None, "None", ""] for element in select_expression)):
+             raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY),
+                                       MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY)
+
+         # List - ["col1"] or ["col1", "col2", "col3"]
+         elif isinstance(select_expression, list) and all(isinstance(element, str) for element in select_expression):
+             if len(select_expression) == 1:
+                 column_list.append(tdp.quote("{0}".format(select_expression[0].strip())))
+             else:
+                 column_list = [tdp.quote("{0}".format(element.strip())) for element in select_expression]
+
+         # List of List (Single level only - Pandas Syntax) - [["col1", "col2", "col3"]]
+         elif isinstance(select_expression, list) and isinstance(select_expression[0], list):
+             # Error Handling - [[]], [[""]], [[None]], [['col1', None]], [['col1', "None"]], ["col1", ""]
+             if len(select_expression[0]) == 0 or any(element in [None, "None", ""] for element in select_expression[0]):
+                 raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY),
+                                           MessageCodes.TDMLDF_SELECT_NONE_OR_EMPTY)
+
+             else:
+                 column_list = [tdp.quote("{0}".format(element.strip())) for element in select_expression[0]]
+
+         # Any other Format - Raise Format Exception
+         else:
+             raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_SELECT_INVALID_FORMAT),
+                                       MessageCodes.TDMLDF_SELECT_INVALID_FORMAT)
+         return column_list
+
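
Stripped of quoting and error handling, the branching above reduces to a small normalization. A hypothetical condensed equivalent (normalize_selection is illustrative, not part of teradataml):

    def normalize_selection(expr):
        """Condensed sketch of the accepted-shape handling above (no validation)."""
        if isinstance(expr, str):
            return [expr.strip()]
        if expr and isinstance(expr[0], list):   # [["col1", "col2"]]
            expr = expr[0]
        return [e.strip() for e in expr]

    normalize_selection([["col1", "col2"]])   # ['col1', 'col2']
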
+     @staticmethod
+     def _get_primary_index_from_table(table_name):
+         """
+         Retrieves the primary index by executing a HELP INDEX command.
+
+         PARAMETERS:
+             table_name - The table name or volatile table name.
+
+         RETURNS:
+             Returns a list containing the primary index columns from HELP INDEX.
+             If there is no primary index (NoPI table), then returns None.
+
+         RAISES:
+             Database error if an error occurred while executing the HELP INDEX.
+
+         EXAMPLES:
+             df = DataFrame.from_table('mytab')
+             index_labels = DataFrameUtils._get_primary_index_from_table(df._table_name)
+         """
+         # Construct HELP INDEX command.
+         help_index_sql = SQLBundle._build_help_index(table_name)
+
+         # Execute HELP INDEX command.
+         rows = UtilFuncs._execute_query(help_index_sql)
+         index_labels = []
+         for row in rows:
+             # row[1] specifies whether the Index is 'Primary or Secondary?'
+             if row[1].rstrip() == 'P':
+                 # row[2] specifies a string of comma separated column names that form the primary index
+                 if "," in row[2]:
+                     index_cols = row[2].split(',')
+                 else:
+                     index_cols = [row[2]]
+                 for index_col in index_cols:
+                     # Since the TD_TIMEBUCKET column in PTI tables is not functionally available, it can be ignored
+                     # from the index information as well (else a warning is generated by SQLAlchemy).
+                     # row[12] corresponds to the 'Timebucket' column in the results of the 'help index' SQL command,
+                     # which is available only when the version supports PTI tables.
+                     if index_col == PTITableConstants.TD_TIMEBUCKET.value and len(row) > 12 and row[12] is not None:
+                         continue
+                     else:
+                         index_labels.append(index_col)
+
+         if len(index_labels) > 0:
+             return index_labels
+         else:
+             return None
+
+     @staticmethod
+     def __validate_sort_type_raise_exception(sort_col_type):
+         """
+         Function to raise TeradataMlException for errors encountered for an invalid/incorrect
+         "sort_col_type" in the "_validate_sort_col_type" function.
+
+         PARAMETERS:
+             sort_col_type: The sort column type.
+
+         RETURNS:
+             None
+
+         RAISES:
+             TeradataMlException
+
+         EXAMPLES:
+             df_utils.__validate_sort_type_raise_exception(PythonTypes.PY_STRING_TYPE.value)
+         """
+         msg = Messages.get_message(MessageCodes.TDMLDF_DROP_INVALID_INDEX_TYPE).format(sort_col_type)
+         raise TeradataMlException(msg, MessageCodes.TDMLDF_DROP_INVALID_INDEX_TYPE)
+
+     @staticmethod
+     def _validate_sort_col_type(sort_col_type, sort_col_values):
+         """
+         Validates a list of sort column values with the sort column type.
+
+         PARAMETERS:
+             sort_col_type - The sort column type.
+             sort_col_values - A single value or list-like values
+
+         RETURNS:
+             None
+
+         RAISES:
+             TeradataMlException
+
+         EXAMPLES:
+             df_utils._validate_sort_col_type(PythonTypes.PY_STRING_TYPE.value, ["Jan", "Feb"])
+             df_utils._validate_sort_col_type(PythonTypes.PY_STRING_TYPE.value, "Jan")
+             df_utils._validate_sort_col_type(PythonTypes.PY_INT_TYPE.value, [1, 2])
+         """
+         if isinstance(sort_col_values, list):
+             if sort_col_type == PythonTypes.PY_STRING_TYPE.value:
+                 if not all(isinstance(i, str) for i in sort_col_values):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_FLOAT_TYPE.value:
+                 if not all(isinstance(i, float) for i in sort_col_values):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_DECIMAL_TYPE.value:
+                 if not all(isinstance(i, Decimal) for i in sort_col_values):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_DATETIME_TYPE.value:
+                 if not all(isinstance(i, datetime) for i in sort_col_values):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_TIME_TYPE.value:
+                 if not all(isinstance(i, time) for i in sort_col_values):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_DATE_TYPE.value:
+                 if not all(isinstance(i, date) for i in sort_col_values):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_BYTES_TYPE.value:
+                 if not all(isinstance(i, bytes) for i in sort_col_values):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             else:  # numeric type
+                 if not all(isinstance(i, numbers.Integral) for i in sort_col_values):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+         elif isinstance(sort_col_values, (tuple, dict)):
+             raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_DROP_ARGS),
+                                       MessageCodes.TDMLDF_DROP_ARGS)
+         else:
+             if sort_col_type == PythonTypes.PY_STRING_TYPE.value:
+                 if not isinstance(sort_col_values, str):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_FLOAT_TYPE.value:
+                 if not isinstance(sort_col_values, float):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_DECIMAL_TYPE.value:
+                 if not isinstance(sort_col_values, Decimal):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_DATETIME_TYPE.value:
+                 if not isinstance(sort_col_values, datetime):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_TIME_TYPE.value:
+                 if not isinstance(sort_col_values, time):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_DATE_TYPE.value:
+                 if not isinstance(sort_col_values, date):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             elif sort_col_type == PythonTypes.PY_BYTES_TYPE.value:
+                 if not isinstance(sort_col_values, bytes):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+             else:  # numeric type
+                 if not isinstance(sort_col_values, numbers.Integral):
+                     DataFrameUtils.__validate_sort_type_raise_exception(sort_col_type)
+
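
The long if/elif chain above maps a type-name string to an isinstance check. A table-driven sketch of the same validation (illustrative; the CHECKS keys are assumed type-name strings and may not match the actual PythonTypes enum values):

    import numbers
    from datetime import datetime, date, time
    from decimal import Decimal

    # Assumed mapping of type-name string -> Python type to check with isinstance.
    CHECKS = {"str": str, "float": float, "decimal.Decimal": Decimal,
              "datetime.datetime": datetime, "datetime.time": time,
              "datetime.date": date, "bytes": bytes, "int": numbers.Integral}

    def values_match_type(type_name, values):
        expected = CHECKS.get(type_name, numbers.Integral)   # numeric fallback, as above
        values = values if isinstance(values, list) else [values]
        return all(isinstance(v, expected) for v in values)
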
+    def _get_required_columns_types_from_metaexpr(metaexpr, col_list=None):
+        """
+        Retrieves column names and types from a meta expression. To get types for
+        only a subset of columns, pass those columns in the 'col_list' argument.
+
+        PARAMETERS:
+            metaexpr - Meta expression from which columns and types are to be retrieved.
+            col_list - Column list for which types are to be retrieved.
+
+        RETURNS:
+            Dictionary with column name as key and datatype as value.
+
+        EXAMPLES:
+            df = DataFrame.from_table('mytab')
+            metadata = DataFrameUtils._get_required_columns_types_from_metaexpr(df._metaexpr)
+        """
+
+        if isinstance(col_list, str):
+            col_list = [col_list]
+
+        if col_list is not None and not isinstance(col_list, list):
+            return None
+
+        meta_cols = metaexpr.t.c
+        meta_columns = [c.name for c in meta_cols]
+        col_names = []
+        col_types = []
+
+        # When the column list to retrieve is not provided, return metadata for all columns.
+        if col_list is None:
+            for col_name in meta_columns:
+                col_names.append(meta_cols[col_name].name)
+                col_types.append(meta_cols[col_name].type)
+
+        # Otherwise return metadata for only the requested columns.
+        else:
+            for col_name in col_list:
+                if DataFrameUtils._check_column_exists(col_name, meta_columns):
+                    # _metaexpr saves columns without quotes, so unquote first.
+                    unquoted_col_name = col_name.replace('"', "")
+                    col_names.append(meta_cols[unquoted_col_name].name)
+                    col_types.append(meta_cols[unquoted_col_name].type)
+
+        return OrderedDict(zip(col_names, col_types))
+
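The return value is simply an OrderedDict zipped from two parallel lists, preserving the metaexpr column order. A minimal illustration of the result's shape, with placeholder strings standing in for teradatasqlalchemy type objects:

    from collections import OrderedDict

    col_names = ["accounts", "Feb"]
    col_types = ["VARCHAR(length=20)", "FLOAT()"]  # placeholders for teradatasqlalchemy types
    metadata = OrderedDict(zip(col_names, col_types))
    print(metadata)
    # OrderedDict([('accounts', 'VARCHAR(length=20)'), ('Feb', 'FLOAT()')])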
+    @staticmethod
+    def _check_column_exists(column_name, df_columns):
+        """
+        Checks whether the provided column is present in the given list of columns.
+        Note:
+            It is the calling function's responsibility to send the column and the column
+            list in the proper case. By default the lookup is case-sensitive. For a
+            case-insensitive lookup, pass both column_name and df_columns in lower case.
+
+        PARAMETERS:
+            column_name - Column name to check for.
+            df_columns  - List of columns to check against.
+
+        RETURNS:
+            True if the column exists, otherwise False.
+
+        EXAMPLES:
+            df = DataFrame.from_table('mytab')
+            metadata = DataFrameUtils._check_column_exists("col1", df.columns)
+        """
+        unquoted_df_columns = [column.replace('"', "") for column in df_columns]
+        return column_name.replace('"', "") in unquoted_df_columns
+
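Because both sides are unquoted before comparison, quoted and unquoted spellings of the same name compare equal. A standalone restatement of the lookup, runnable without a Vantage connection:

    def column_exists(column_name, df_columns):
        # Same unquoting as above: '"col1"' and 'col1' compare equal.
        unquoted = [c.replace('"', "") for c in df_columns]
        return column_name.replace('"', "") in unquoted

    assert column_exists('"col1"', ["col1", "col2"])
    assert not column_exists("col3", ["col1", "col2"])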
+    @staticmethod
+    def _validate_agg_function(func, col_names):
+        """
+        Internal function to validate column names against the actual
+        column names passed as parameter, and aggregate operations
+        against the valid aggregate operations.
+
+        PARAMETERS:
+            func - (Required) Specifies the function(s) to be
+                applied on teradataml DataFrame columns.
+                Acceptable formats for function(s) are string,
+                dictionary or list of strings/functions.
+                Accepted combinations are:
+                    1. String function name
+                    2. List of string functions
+                    3. Dictionary of column names -> string function
+                       (or list of string functions)
+            col_names - List. Names of the columns in the DataFrame.
+
+        RETURNS:
+            operations - dict of columns -> aggregate operations.
+            A unified dictionary, similar to func, even when func is a
+            string or a list of strings or functions.
+
+        RAISES:
+            1. TDMLDF_INVALID_AGGREGATE_OPERATION - If the aggregate
+               operation(s) received in parameter 'func' is/are invalid.
+
+               Possible Value:
+               Invalid aggregate operation(s): minimum, counter.
+               Valid aggregate operation(s): count, max, mean, min,
+               std, sum.
+
+            2. TDMLDF_AGGREGATE_INVALID_COLUMN - If any of the columns
+               specified in 'func' is not present in the dataframe.
+
+               Possible Value:
+               Invalid column(s) given in parameter func: col1.
+               Valid column(s): A, B, C, D.
+
+        EXAMPLES:
+            Let the dataframe contain two columns, col1 and col2.
+
+            VALID EXAMPLES:
+                1. operations = DataFrameUtils._validate_agg_function(
+                       'mean', ['col1', 'col2'])
+
+                2. operations = DataFrameUtils._validate_agg_function(
+                       ['mean', 'min'], ['col1', 'col2'])
+
+                3. operations = DataFrameUtils._validate_agg_function(
+                       {'col1': ['mean', 'min'], 'col2': 'count'},
+                       ['col1', 'col2'])
+
+            INVALID EXAMPLES:
+                1. operations = DataFrameUtils._validate_agg_function(
+                       'counter', ['col1', 'col2'])
+
+                2. operations = DataFrameUtils._validate_agg_function(
+                       {'col1': ['mean', 'min'], 'col55': 'count'},
+                       ['col1', 'col2'])
+        """
+        operations = OrderedDict()
+
+        valid_aggregate_operations = UtilFuncs._get_valid_aggregate_operations()
+
+        if isinstance(func, str):
+            for column in col_names:
+                operations[column] = [func]
+        elif isinstance(func, list):
+            for column in col_names:
+                operations[column] = func
+        else:
+            for column in func:
+                if isinstance(func[column], str):
+                    func[column] = [func[column]]  # Convert a string inside the dict to a list.
+            operations = func
+
+        given_columns = operations.keys()
+        invalid_columns = []
+        all_operations = []
+        for col in given_columns:
+            all_operations.extend(operations[col])
+            if col not in col_names:
+                invalid_columns.append(col)
+        if len(invalid_columns) > 0:  # If any of the specified columns is not present in the dataframe.
+            col_names.sort()
+            invalid_columns.sort()
+            msg = Messages.get_message(MessageCodes.TDMLDF_AGGREGATE_INVALID_COLUMN). \
+                format(", ".join(invalid_columns), 'func', ", ".join(col_names))
+            raise TeradataMlException(msg, MessageCodes.TDMLDF_AGGREGATE_INVALID_COLUMN)
+
+        all_operations = list(set(all_operations))
+        invalid_aggregates = []
+        for operation in all_operations:
+            if operation not in valid_aggregate_operations \
+                    and operation not in UtilFuncs._get_valid_time_series_aggregate_operations():
+                invalid_aggregates.append(operation)
+        if len(invalid_aggregates) > 0:  # If any of the specified aggregate operations is not valid.
+            # For the error message, add the other time series aggregate operations that
+            # can be used with the DataFrame.agg() method.
+            valid_aggregate_operations = valid_aggregate_operations + ['first', 'last', 'mode']
+            valid_aggregate_operations.sort()
+            invalid_aggregates.sort()
+            msg = Messages.get_message(MessageCodes.TDMLDF_INVALID_AGGREGATE_OPERATION). \
+                format(", ".join(invalid_aggregates), ", ".join(valid_aggregate_operations))
+            raise TeradataMlException(msg, MessageCodes.TDMLDF_INVALID_AGGREGATE_OPERATION)
+
+        return operations
+
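Whatever shape 'func' arrives in, the normalization above always yields a dictionary mapping each column to a list of operation names. A pure-Python sketch of just that normalization step, with validation omitted:

    from collections import OrderedDict

    def normalize_func(func, col_names):
        if isinstance(func, str):
            return OrderedDict((c, [func]) for c in col_names)
        if isinstance(func, list):
            return OrderedDict((c, list(func)) for c in col_names)
        # dict: wrap any bare string values in a list.
        return OrderedDict((c, [ops] if isinstance(ops, str) else ops)
                           for c, ops in func.items())

    print(normalize_func('mean', ['col1', 'col2']))
    # OrderedDict([('col1', ['mean']), ('col2', ['mean'])])
    print(normalize_func({'col1': ['mean', 'min'], 'col2': 'count'}, ['col1', 'col2']))
    # OrderedDict([('col1', ['mean', 'min']), ('col2', ['count'])])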
+    @staticmethod
+    def _generate_aggregate_column_expression(df, column, operation, describe_op, tdp, **kwargs):
+        """
+        Function generates the aggregate column expression for the provided column
+        and aggregate function.
+
+        PARAMETERS:
+            df:
+                Required Argument.
+                Specifies the teradataml DataFrame which is to be used to get the
+                desired aggregate column expression.
+                Types: teradataml DataFrame
+
+            column:
+                Required Argument.
+                Specifies the column name for which the desired aggregate operation
+                is to be used.
+                Types: str
+
+            operation:
+                Required Argument.
+                Specifies the aggregate operation.
+                Types: str
+
+            describe_op:
+                Required Argument.
+                Specifies a boolean flag that decides whether the aggregate
+                operation is being performed for DataFrame.describe() or not.
+                Types: bool
+
+            tdp:
+                Required Argument.
+                Specifies a TeradataIdentifierPreparer object. It is required for
+                quoting.
+                Types: TeradataIdentifierPreparer
+
+            kwargs:
+                Specifies miscellaneous keyword arguments that can be passed to
+                aggregate functions.
+
+        RAISES:
+            AttributeError - In case ColumnExpression does not have the desired
+            aggregate function implemented.
+
+        RETURNS:
+            A boolean stating whether the column is supported or not, the new column name,
+            the new column type, a string representing the column aggregate expression, and
+            invalid column information in case the column has an unsupported type for the
+            aggregate operation.
+
+        EXAMPLES:
+            column_supported, new_column_name, new_column_type, column_aggr_expr, invalid_column_str = \
+                DataFrameUtils._generate_aggregate_column_expression(df=df, column=column, operation=func,
+                                                                     describe_op=describe_op, percentile=percentile,
+                                                                     tdp=tdp, **kwargs)
+        """
+        try:
+            key_to_process = ""
+            # Quote column names that clash with Teradata reserved keywords.
+            if "sort_columns" in kwargs:
+                key_to_process = "sort_columns"
+            elif "sort_column" in kwargs:
+                key_to_process = "sort_column"
+
+            if key_to_process:
+                quoted_columns = UtilFuncs._process_for_teradata_keyword(kwargs[key_to_process])
+                kwargs[key_to_process] = quoted_columns
+
+            func_expression = getattr(df[column], operation)(describe_op=describe_op, **kwargs)
+            new_column_name = column if describe_op else "{1}_{0}".format(column, operation)
+            # column_supported, new_column_name, new_column_type, column_aggr_expr, invalid_column_str
+            return True, new_column_name, NUMBER() if describe_op else func_expression.type, \
+                func_expression.compile_label(new_column_name), None
+        except AttributeError:
+            # Reaching here means the provided operation is invalid and is not supported.
+            # This is for internal purposes only.
+            # Validation of operations for "agg" should be done in "agg" only.
+            raise RuntimeError("Invalid aggregate function: {}".format(operation))
+        except RuntimeError:
+            # Reaching here means the column does not support the provided operation.
+            # We ignore this and add the column to the invalid column list:
+            #     invalid_columns[operation].append("({0} - {1})".format(column, column_type))
+            # OR we raise a generic message stating the DataFrame does not have any column
+            # with a type supported for the operation.
+            if describe_op:
+                return True, tdp.quote(column), NUMBER(), 'null as {}'.format(tdp.quote(column)), None
+            else:
+                return False, None, None, None, "({0} - {1})".format(column, df[column].type)
+        except Exception:
+            raise
+
+    @staticmethod
+    def _construct_sql_expression_for_aggregations(df, column_names, column_types, func, percentile=.5,
+                                                   describe_op=False, **kwargs):
+        """
+        Internal function to create and return the sql expression
+        corresponding to the given operation, column_names and
+        column_types.
+
+        Column_types are used to check whether all the datatypes are
+        valid types for the given operation, and an exception is thrown
+        if they are not.
+
+        PARAMETERS:
+            df:
+                Required Argument.
+                Specifies the teradataml DataFrame which is to be used to get the desired
+                aggregate column expression.
+                Types: teradataml DataFrame
+
+            column_names:
+                Required Argument.
+                Specifies the column names for which the desired aggregate operation is
+                to be executed.
+                Types: List of strings
+
+            column_types:
+                Required Argument.
+                Specifies the respective column types for the column names.
+                Types: List of teradatasqlalchemy types
+
+            func:
+                Required Argument.
+                Specifies the aggregate function(s) to be applied on teradataml
+                DataFrame columns.
+                Types: string, dictionary or list of strings/functions.
+                Accepted combinations are:
+                    1. String function name
+                    2. List of functions
+                    3. Dictionary containing column name as key and aggregate
+                       function name (string or list of strings) as value
+                    4. ColumnExpression built using the aggregate functions.
+                    5. List of ColumnExpressions built using the aggregate functions.
+
+            percentile:
+                Optional Argument.
+                Specifies a value between 0 and 1 that can only be used with func = 'percentile'.
+                The default is .5, which returns the 50th percentile.
+                Types: float
+
+            describe_op:
+                Optional Argument.
+                Specifies a boolean flag that decides whether the aggregate operation being
+                performed is for DataFrame.describe() or not.
+                Types: bool
+
+            kwargs:
+                Specifies miscellaneous keyword arguments that can be passed to aggregate functions.
+
+        RETURNS:
+            a) sql expression, such as
+               1. 'min(col1) as min_col1, min(col2) as min_col2' if
+                  col1 and col2 are the columns in the DataFrame and the
+                  operation is 'min'
+               2. 'max(col1) as max_col1, max(col2) as max_col2' if
+                  col1 and col2 are the columns in the DataFrame and the
+                  operation is 'max'
+               3. 'min(col1) as min_col1, stddev_samp(col2) as std_col2'
+                  if col1, col2 are the columns in the DataFrame and the
+                  operations are min, std.
+               etc...
+            b) new columns' names (e.g. min_col1, min_col2 ...)
+            c) new columns' types
+
+        RAISES:
+            TeradataMLException
+            1. TDMLDF_AGGREGATE_COMBINED_ERR - If the provided
+               aggregate operations do not support the specified columns.
+
+               Possible Value:
+               No results. Below is/are the error message(s):
+               All selected columns [(col1 - VARCHAR)] is/are
+               unsupported for 'sum' operation.
+
+            2. TDMLDF_INVALID_AGGREGATE_OPERATION - If the aggregate
+               operation(s) received in parameter 'func' is/are invalid.
+
+               Possible Value:
+               Invalid aggregate operation(s): minimum, counter.
+               Valid aggregate operation(s): count, max, mean, min,
+               std, sum.
+
+            3. TDMLDF_AGGREGATE_INVALID_COLUMN - If any of the columns
+               specified in func is not present in the dataframe.
+
+               Possible Value:
+               Invalid column(s) given in parameter func: col1.
+               Valid column(s): A, B, C, D.
+
+        EXAMPLES:
+            col_names, col_types = \
+                df_utils._get_column_names_and_types_from_metaexpr(self._metaexpr)
+            expr, new_col_names, new_col_types = \
+                df_utils._construct_sql_expression_for_aggregations(
+                    df, col_names, col_types, 'min')
+
+            expr1, new_col_names1, new_col_types1 = \
+                df_utils._construct_sql_expression_for_aggregations(
+                    df, col_names, col_types, ['min', 'sum'])
+
+            expr2, new_col_names2, new_col_types2 = \
+                df_utils._construct_sql_expression_for_aggregations(
+                    df, col_names, col_types, {'col1': ['min', 'sum'],
+                                               'col2': 'mean'})
+        """
+
+        # e.g. of column_types: [VARCHAR(length=13), INTEGER(), VARCHAR(length=60), VARCHAR(length=5),
+        #                        FLOAT(precision=0)]
+
+        # e.g. of types of each column: <class 'teradatasqlalchemy.types.VARCHAR'>,
+        # <class 'teradatasqlalchemy.types.INTEGER'>, <class 'teradatasqlalchemy.types.FLOAT'>,
+        # <class 'teradatasqlalchemy.types.INTERVAL_MINUTE_TO_SECOND'> etc.
+
+        # If the function is a time series aggregate, we process the aggregation differently.
+        if not isinstance(func, str):
+            # If func is not an instance of string, the call is from DataFrame.agg()
+            # and is made to process multiple functions. We process this differently,
+            # as we need to map and serialize the column names and the aggregate
+            # functions that operate on them. If there is just one function to be
+            # executed on the complete DataFrame, this extra processing is not needed.
+            # Also, if the call is from DataFrame.agg(), the time series aggregate check
+            # is not required, as special Time Series aggregate functions cannot be used
+            # in DataFrame.agg().
+            return DataFrameUtils._construct_sql_expression_for_aggregations_for_agg(df, column_names, column_types,
+                                                                                     func, percentile, describe_op,
+                                                                                     **kwargs)
+
+        as_time_series_aggregate = False
+        if "as_time_series_aggregate" in kwargs.keys():
+            as_time_series_aggregate = kwargs["as_time_series_aggregate"]
+
+        if as_time_series_aggregate and func in ['bottom', 'bottom with ties', 'delta_t', 'mad', 'top',
+                                                 'top with ties']:
+            return DataFrameUtils._construct_sql_expression_for_time_series_aggregations(df, column_names,
+                                                                                         column_types, func,
+                                                                                         **kwargs)
+
+        tdp = preparer(td_dialect)
+
+        # This variable is used to decide whether the DataFrame has all columns unsupported
+        # for the provided operations.
+        all_unsupported_columns = True
+        valid_columns = []
+        invalid_columns = []
+        new_column_names = []
+        new_column_types = []
+        for column in column_names:
+            column_supported, new_column_name, new_column_type, column_aggr_expr, invalid_column_str = \
+                DataFrameUtils._generate_aggregate_column_expression(df=df, column=column, operation=func,
+                                                                     describe_op=describe_op, percentile=percentile,
+                                                                     tdp=tdp, **kwargs)
+            if column_supported:
+                all_unsupported_columns = False
+                new_column_names.append(new_column_name)
+                new_column_types.append(new_column_type)
+                valid_columns.append(column_aggr_expr)
+            else:
+                invalid_columns.append("({0} - {1})".format(column, df[column].type))
+
+        if all_unsupported_columns:
+            error_msgs = []
+            invalid_columns.sort()  # Helps in reporting the columns in lexicographic order.
+            error = MessageCodes.TDMLDF_AGGREGATE_UNSUPPORTED.value.format(", ".join(invalid_columns),
+                                                                           func)
+            error_msgs.append(error)
+
+            if len(valid_columns) == 0:  # No supported columns in the given list of columns.
+                raise TeradataMlException(Messages.get_message(
+                    MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR).format("\n".join(error_msgs)),
+                    MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR)
+
+        # Quote column names that clash with Teradata reserved keywords.
+        quote_column_name = [UtilFuncs._process_for_teradata_keyword(col) for col in column_names]
+
+        # Actual columns should be retained if "drop_columns" is set to False.
+        if kwargs.get("drop_columns") is False:
+            valid_columns = quote_column_name + valid_columns
+            new_column_names = column_names + new_column_names
+            new_column_types = column_types + new_column_types
+
+        aggregate_expr = ", ".join(valid_columns)
+        return aggregate_expr, new_column_names, new_column_types
+
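For a single string function, the returned expression is a comma-joined projection of per-column aggregates. The real expression is compiled via ColumnExpression objects; this sketch only shows the target shape for func='min':

    # Illustrative only: the real expression is compiled by ColumnExpression.
    columns = ["col1", "col2"]
    aggregate_expr = ", ".join('min("{0}") as "min_{0}"'.format(c) for c in columns)
    print(aggregate_expr)
    # min("col1") as "min_col1", min("col2") as "min_col2"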
+    @staticmethod
+    def _construct_sql_expression_for_aggregations_for_agg(df, column_names, column_types, func, percentile=.5,
+                                                           describe_op=False, **kwargs):
+        """
+        Internal function to create and return the sql expression
+        corresponding to the given operation, column_names and
+        column_types.
+
+        Column_types are used to check whether all the datatypes are
+        valid types for the given operation, and an exception is thrown
+        if they are not.
+
+        PARAMETERS:
+            df:
+                Required Argument.
+                Specifies the teradataml DataFrame which is to be used to get the desired
+                aggregate column expression.
+                Types: teradataml DataFrame
+
+            column_names:
+                Required Argument.
+                Specifies the column names for which the desired aggregate operation is
+                to be executed.
+                Types: List of strings
+
+            column_types:
+                Required Argument.
+                Specifies the respective column types for the column names.
+                Types: List of teradatasqlalchemy types
+
+            func:
+                Required Argument.
+                Specifies the aggregate function(s) to be applied on teradataml
+                DataFrame columns.
+                Types: string, dictionary or list of strings/functions.
+                Accepted combinations are:
+                    1. String function name
+                    2. List of functions
+                    3. Dictionary containing column name as key and aggregate
+                       function name (string or list of strings) as value
+                    4. ColumnExpression built using the aggregate functions.
+                    5. List of ColumnExpressions built using the aggregate functions.
+
+            percentile:
+                Optional Argument.
+                Specifies a value between 0 and 1 that can only be used with func = 'percentile'.
+                The default is .5, which returns the 50th percentile.
+                Types: float
+
+            describe_op:
+                Optional Argument.
+                Specifies a boolean flag that decides whether the aggregate operation being
+                performed is for DataFrame.describe() or not.
+                Types: bool
+
+            kwargs:
+                Specifies miscellaneous keyword arguments that can be passed to aggregate functions.
+
+        RETURNS:
+            a) sql expression, such as
+               1. 'min(col1) as min_col1, min(col2) as min_col2' if
+                  col1 and col2 are the columns in the DataFrame and the
+                  operation is 'min'
+               2. 'max(col1) as max_col1, max(col2) as max_col2' if
+                  col1 and col2 are the columns in the DataFrame and the
+                  operation is 'max'
+               3. 'min(col1) as min_col1, stddev_samp(col2) as std_col2'
+                  if col1, col2 are the columns in the DataFrame and the
+                  operations are min, std.
+               etc...
+            b) new columns' names (e.g. min_col1, min_col2 ...)
+            c) new columns' types
+
+        RAISES:
+            TeradataMLException
+            1. TDMLDF_AGGREGATE_COMBINED_ERR - If the provided
+               aggregate operations do not support the specified columns.
+
+               Possible Value:
+               No results. Below is/are the error message(s):
+               All selected columns [(col1 - VARCHAR)] is/are
+               unsupported for 'sum' operation.
+
+            2. TDMLDF_INVALID_AGGREGATE_OPERATION - If the aggregate
+               operation(s) received in parameter 'func' is/are invalid.
+
+               Possible Value:
+               Invalid aggregate operation(s): minimum, counter.
+               Valid aggregate operation(s): count, max, mean, min,
+               std, sum.
+
+            3. TDMLDF_AGGREGATE_INVALID_COLUMN - If any of the columns
+               specified in func is not present in the dataframe.
+
+               Possible Value:
+               Invalid column(s) given in parameter func: col1.
+               Valid column(s): A, B, C, D.
+
+        EXAMPLES:
+            col_names, col_types = \
+                df_utils._get_column_names_and_types_from_metaexpr(self._metaexpr)
+            expr, new_col_names, new_col_types = \
+                df_utils._construct_sql_expression_for_aggregations_for_agg(
+                    df, col_names, col_types, 'min')
+
+            expr1, new_col_names1, new_col_types1 = \
+                df_utils._construct_sql_expression_for_aggregations_for_agg(
+                    df, col_names, col_types, ['min', 'sum'])
+
+            expr2, new_col_names2, new_col_types2 = \
+                df_utils._construct_sql_expression_for_aggregations_for_agg(
+                    df, col_names, col_types, {'col1': ['min', 'sum'],
+                                               'col2': 'mean'})
+        """
+        # If the function is a time series aggregate, we process the aggregation differently.
+        # Also, one is not supposed to pass the below time series aggregates to DataFrame.agg():
+        #     ['bottom', 'bottom with ties', 'delta_t', 'mad', 'top', 'top with ties']
+        # Thus, no extra processing is required for time series aggregates over here.
+
+        if isinstance(func, ColumnExpression) or (isinstance(func, list) and isinstance(func[0], ColumnExpression)):
+            column_agg_expr = []
+            new_column_names = []
+            new_column_types = []
+            if isinstance(func, ColumnExpression):
+                func = UtilFuncs._as_list(func)
+
+            # Validate that func is a list of ColumnExpressions.
+            for expr in func:
+                if not isinstance(expr, ColumnExpression):
+                    raise TeradataMlException(Messages.get_message(
+                        MessageCodes.UNSUPPORTED_DATATYPE, 'func',
+                        ['str, dict, ColumnExpression or list of values of type(s): str, ColumnExpression']),
+                        MessageCodes.UNSUPPORTED_DATATYPE)
+
+            for operations in func:
+                alias = operations.alias_name
+                column_agg_expr.append(operations.compile_label(alias))
+                new_column_names.append(alias)
+                new_column_types.append(operations.type)
+            aggregate_expr = ", ".join(column_agg_expr)
+            return aggregate_expr, new_column_names, new_column_types
+
+        # 'operations' contains a dict of columns -> list of aggregate operations.
+        operations = DataFrameUtils._validate_agg_function(func, column_names)
+
+        all_valid_columns = []
+        all_invalid_columns = {}
+        all_new_column_names = []
+        all_new_column_types = []
+
+        # For each column, the value is True if there is at least one valid operation
+        # (an operation on a valid datatype).
+        column_supported = {}
+        tdp = preparer(td_dialect)
+        for column in operations:
+            column_supported[column] = False
+            valid_columns = []
+            invalid_columns = {}
+            new_column_names = []
+            new_column_types = []
+            for operation in operations[column]:
+                is_colop_supported, new_col, new_coltype, column_aggr_expr, invalid_column_info = \
+                    DataFrameUtils._generate_aggregate_column_expression(df=df, column=column, operation=operation,
+                                                                         describe_op=describe_op,
+                                                                         percentile=percentile,
+                                                                         tdp=tdp, **kwargs)
+                if is_colop_supported:
+                    column_supported[column] = is_colop_supported
+                    new_column_names.append(new_col)
+                    new_column_types.append(new_coltype)
+                    valid_columns.append(column_aggr_expr)
+                else:
+                    if operation in invalid_columns:
+                        invalid_columns[operation].append(invalid_column_info)
+                    else:
+                        invalid_columns[operation] = [invalid_column_info]
+
+            all_valid_columns.extend(valid_columns)
+            all_new_column_names.extend(new_column_names)
+            all_new_column_types.extend(new_column_types)
+
+            for operation in invalid_columns:
+                if operation in all_invalid_columns:
+                    all_invalid_columns[operation].extend(invalid_columns[operation])
+                else:
+                    all_invalid_columns[operation] = invalid_columns[operation]
+
+        unsupported_columns = [col for col in column_supported if not column_supported[col]]
+        unsupported_columns.sort()  # Helps in reporting the columns in lexicographic order.
+
+        error_msgs = []
+        for operation in sorted(all_invalid_columns):
+            # Sorting helps in reporting the columns in lexicographic order.
+            all_invalid_columns[operation].sort()
+            error = MessageCodes.TDMLDF_AGGREGATE_UNSUPPORTED.value.format(
+                ", ".join(all_invalid_columns[operation]), operation)
+            error_msgs.append(error)
+
+        if not all(column_supported[oper] for oper in column_supported):
+            new_msg = MessageCodes.TDMLDF_AGGREGATE_AGG_DICT_ERR.value.format(", ".join(unsupported_columns))
+            error_msgs.append(new_msg)
+            msg = Messages.get_message(MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR).format("\n".join(error_msgs))
+            raise TeradataMlException(msg, MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR)
+
+        elif len(all_valid_columns) == 0:  # No supported columns in the given list of columns.
+            raise TeradataMlException(Messages.get_message(
+                MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR).format("\n".join(error_msgs)),
+                MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR)
+
+        aggregate_expr = ", ".join(all_valid_columns)
+        return aggregate_expr, all_new_column_names, all_new_column_types
+
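The unsupported-column bookkeeping above reduces to grouping '(column - TYPE)' strings per failing operation. A compact equivalent of that merge using dict.setdefault:

    # Group invalid '(column - TYPE)' strings per failing operation.
    all_invalid_columns = {}
    failures = [("sum", "(col1 - VARCHAR)"), ("sum", "(col3 - CLOB)"), ("mean", "(col1 - VARCHAR)")]
    for operation, info in failures:
        all_invalid_columns.setdefault(operation, []).append(info)
    print(all_invalid_columns)
    # {'sum': ['(col1 - VARCHAR)', '(col3 - CLOB)'], 'mean': ['(col1 - VARCHAR)']}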
+    @staticmethod
+    def _construct_sql_expression_for_time_series_aggregations(df, column_names, column_types, func, **kwargs):
+        """
+        Internal function to create and return the sql expression
+        corresponding to the given time series function, column_names and
+        column_types.
+
+        Column_types are used to check whether all the datatypes are
+        valid types for the given operation, and an exception is thrown
+        if they are not.
+
+        NOTE:
+            This function should be used only for time series aggregates.
+
+        PARAMETERS:
+            df:
+                Required Argument.
+                Specifies the teradataml DataFrame which is to be used to get the desired
+                aggregate column expression.
+                Types: teradataml DataFrame
+
+            column_names:
+                Required Argument.
+                Specifies the column names for which the desired aggregate operation is
+                to be executed.
+                Types: List of strings
+
+            column_types:
+                Required Argument.
+                Specifies the respective column types for the column names.
+                Types: List of teradatasqlalchemy types
+
+            func:
+                Required Argument.
+                Specifies the aggregate function(s) to be applied on teradataml
+                DataFrame columns. For Time Series aggregates it is usually a string.
+                Types: str
+
+            kwargs:
+                Specifies miscellaneous keyword arguments that can be passed to aggregate functions.
+
+        RETURNS:
+            a) sql expression, such as
+               1. 'bottom(2, "col1") as "bottom2col1"' if col1 is a column
+                  in the DataFrame and the operation is 'bottom'
+               etc...
+            b) new columns' names (e.g. bottom2col1 ...)
+            c) new columns' types
+
+        RAISES:
+            None.
+
+        EXAMPLES:
+            colname_to_numvalues = {"col1": 2, "col2": 3}
+            kwargs = {"colname_to_numvalues": colname_to_numvalues}
+            aggregate_expr, column_names, column_types = \
+                df_utils._construct_sql_expression_for_time_series_aggregations(df, column_names, column_types,
+                                                                                func, **kwargs)
+        """
+
+        # e.g. of column_types: [VARCHAR(length=13), INTEGER(), VARCHAR(length=60), VARCHAR(length=5),
+        #                        FLOAT(precision=0)]
+
+        # e.g. of types of each column: <class 'teradatasqlalchemy.types.VARCHAR'>,
+        # <class 'teradatasqlalchemy.types.INTEGER'>, <class 'teradatasqlalchemy.types.FLOAT'>,
+        # <class 'teradatasqlalchemy.types.INTERVAL_MINUTE_TO_SECOND'> etc.
+
+        col_names_and_types = dict(zip(column_names, column_types))
+        tdp = preparer(td_dialect)
+
+        select_columns = []
+        new_column_names = []
+        new_column_types = []
+        if func in ["bottom", "bottom with ties", "top", "top with ties"]:
+            # Processing for bottom and top.
+            # Function name to be used in column aliasing.
+            column_alias_func = func.replace(" ", "_")
+            bottom_col_val = kwargs["colname_to_numvalues"]
+            for column in sorted(list(bottom_col_val.keys())):
+                new_col_name = "{2}{0}{1}".format(bottom_col_val[column], column, column_alias_func)
+                quoted_parent_column_name = tdp.quote("{0}".format(column))
+                quoted_new_column_name = tdp.quote(new_col_name)
+                select_columns.append("{0}({1}, {2}) as {3}".format(func, bottom_col_val[column],
+                                                                    quoted_parent_column_name,
+                                                                    quoted_new_column_name))
+                new_column_names.append(new_col_name)
+                new_column_types.append(col_names_and_types[column])
+
+        if func == "delta_t":
+            # Argument processing for DELTA_T.
+            new_column_names.append("delta_t_td_timecode")
+            quoted_new_column_name = tdp.quote(new_column_names[0])
+            new_column_types.append(PERIOD_TIMESTAMP)
+            select_columns.append("{0}((WHERE {1}), (WHERE {2})) as {3}".format(func, kwargs["start_condition"],
+                                                                                kwargs["end_condition"],
+                                                                                quoted_new_column_name))
+
+        if func == 'mad':
+            # Processing for Median Absolute Deviation.
+            # Function name to be used in column aliasing.
+            column_alias_func = func.replace(" ", "_")
+            bottom_col_val = kwargs["colname_to_numvalues"]
+            for column in sorted(list(bottom_col_val.keys())):
+                new_col_name = "{2}{0}{1}".format(bottom_col_val[column], column, column_alias_func)
+                quoted_parent_column_name = tdp.quote("{0}".format(column))
+                quoted_new_column_name = tdp.quote(new_col_name)
+                select_columns.append("{0}({1}, {2}) as {3}".format(func, bottom_col_val[column],
+                                                                    quoted_parent_column_name,
+                                                                    quoted_new_column_name))
+                new_column_names.append(new_col_name)
+                if type(col_names_and_types[column]) in [DECIMAL, NUMBER]:
+                    # If the column type is DECIMAL or NUMBER, then the output column type
+                    # should be the same. Otherwise, it is FLOAT.
+                    new_column_types.append(col_names_and_types[column])
+                else:
+                    new_column_types.append(FLOAT())
+
+            if "default_constant_for_columns" in kwargs.keys():
+                column_names = kwargs["default_constant_for_columns"]
+                column_types = [col_names_and_types[column] for column in column_names]
+                if len(column_names) > 0:
+                    aggregate_expr, all_new_column_names, all_new_column_types = \
+                        DataFrameUtils._construct_sql_expression_for_aggregations(df=df, column_names=column_names,
+                                                                                  column_types=column_types,
+                                                                                  func=func)
+                    aggregate_expr_default_column_list = [col.strip() for col in aggregate_expr.split(",")]
+                    select_columns = select_columns + aggregate_expr_default_column_list
+                    new_column_names = new_column_names + all_new_column_names
+                    new_column_types = new_column_types + all_new_column_types
+
+        aggregate_expr = ", ".join(select_columns)
+        return aggregate_expr, new_column_names, new_column_types
+
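Per the docstring, a 'bottom' aggregate with colname_to_numvalues={'col1': 2} compiles to a projection item like the one below (quoting shown inline here; the preparer normally applies it):

    func, n, col = "bottom", 2, "col1"
    select_item = '{0}({1}, "{2}") as "{0}{1}{2}"'.format(func, n, col)
    print(select_item)  # bottom(2, "col1") as "bottom2col1"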
+    @staticmethod
+    def _invalid_describe_column(df, columns, metaexpr, groupby_column_list):
+        """
+        Internal function to validate whether the columns provided to describe() are correct,
+        when the DataFrame is the output of groupby or groupby_time.
+
+        PARAMETERS:
+            df:
+                Required Argument.
+                Specifies the teradataml DataFrame we are collecting statistics for.
+                Types: teradataml DataFrame
+
+            columns:
+                Optional Argument.
+                Specifies the name(s) of the columns we are collecting statistics for.
+                Types: str or List of strings (str)
+
+            metaexpr:
+                Required Argument.
+                Specifies the meta expression for the dataframe.
+                Types: _MetaExpression
+
+            groupby_column_list:
+                Optional Argument.
+                Specifies the group by columns for the dataframe.
+                Default Values: None.
+                Types: str or List of strings (str)
+
+        RETURNS:
+            None
+
+        RAISES:
+            TeradataMlException
+        """
+        invalid_columns = [_column for _column in groupby_column_list if columns is not None
+                           and _column in columns]
+        if len(invalid_columns) > 0:
+            all_columns = [col.name for col in metaexpr.c]
+            valid_columns = [item for item in all_columns if item not in groupby_column_list]
+            msg = Messages.get_message(MessageCodes.TDMLDF_AGGREGATE_INVALID_COLUMN). \
+                format(", ".join(invalid_columns), 'columns', ", ".join(valid_columns))
+            raise TeradataMlException(msg, MessageCodes.TDMLDF_AGGREGATE_INVALID_COLUMN)
+
+    @staticmethod
+    def _construct_describe_query(df, columns, metaexpr, percentiles, function_label, groupby_column_list=None,
+                                  include=None, is_time_series_aggregate=False, verbose=False, distinct=False,
+                                  statistics=None, **kwargs):
+        """
+        Internal function to create the sql query for describe().
+
+        PARAMETERS:
+            df:
+                Required Argument.
+                Specifies the teradataml DataFrame we are collecting statistics for.
+                Types: teradataml DataFrame
+
+            columns:
+                Optional Argument.
+                Specifies the name(s) of the columns we are collecting statistics for.
+                Types: str or List of strings (str)
+
+            metaexpr:
+                Required Argument.
+                Specifies the meta expression for the dataframe.
+                Types: _MetaExpression
+
+            percentiles:
+                Required Argument.
+                Specifies a list of values between 0 and 1.
+                Types: List of floats
+
+            function_label:
+                Required Argument.
+                Specifies a string value used as the label for the aggregate function column.
+                Types: str
+
+            groupby_column_list:
+                Optional Argument.
+                Specifies the group by columns for the dataframe.
+                Default Values: None.
+                Types: str or List of strings (str)
+
+            include:
+                Optional Argument.
+                Specifies a string that must be "all" or None. If "all", then all columns are
+                included. Otherwise, only numeric columns are used for collecting statistics.
+                Default Values: None.
+                Types: str
+
+            is_time_series_aggregate:
+                Optional Argument.
+                Specifies a flag stating whether the describe operation is a time series
+                aggregate or not.
+                Default Values: False.
+                Types: bool
+
+            verbose:
+                Optional Argument.
+                Specifies a flag stating whether the DESCRIBE VERBOSE option for time series
+                aggregates is to be performed or not.
+                Default Values: False.
+                Types: bool
+
+            distinct:
+                Optional Argument.
+                Specifies a flag that decides whether to consider duplicate rows in the
+                calculation or not.
+                Default Values: False
+                Types: bool
+
+            statistics:
+                Optional Argument.
+                Specifies the aggregate operations to be used in place of the default
+                operators; honored only when 'include' is None.
+                Default Values: None.
+                Types: list of strings (str)
+
+            kwargs:
+                Optional Arguments.
+                Keyword arguments for time series aggregate functions.
+
+        RETURNS:
+            A SQL query like:
+                select 'count' as "func", cast(count("Feb") as Number) as "Feb", cast(count(accounts) as Number) as accounts from "PYUSER"."salesview"
+                union all
+                select 'mean' as "func", cast(avg("Feb") as Number) as "Feb", null as accounts from "PYUSER"."salesview"
+                union all
+                select 'std' as "func", cast(stddev_samp("Feb") as Number) as "Feb", null as accounts from "PYUSER"."salesview"
+                union all
+                select 'min' as "func", cast(min("Feb") as Number) as "Feb", cast(min(accounts) as Number) as accounts from "PYUSER"."salesview"
+                union all
+                select '25%' as "func", percentile_cont(0.25) within group(order by cast("Feb" as Number) ) as "Feb", null as accounts from "PYUSER"."salesview"
+                union all
+                select '50%' as "func", percentile_cont(0.5) within group(order by cast("Feb" as Number) ) as "Feb", null as accounts from "PYUSER"."salesview"
+                union all
+                select '75%' as "func", percentile_cont(0.75) within group(order by cast("Feb" as Number) ) as "Feb", null as accounts from "PYUSER"."salesview"
+                union all
+                select 'max' as "func", cast(max("Feb") as Number) as "Feb", cast(max(accounts) as Number) as accounts from "PYUSER"."salesview"
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            agg_query = \
+                df_utils._construct_describe_query(df, None, self._metaexpr, [.25, .5, .75], "func",
+                                                   self.groupby_column_list)
+            agg_query = \
+                df_utils._construct_describe_query(df, None, self._metaexpr, [.3, .6], "func",
+                                                   self.groupby_column_list, include="all")
+        """
+        table_name = df._table_name
+        operators = ["count", "mean", "std", "min", "percentile", "max"]
+        all_operators = ["count", "unique", "mean", "std", "min", "percentile", "max"]
+
+        if is_time_series_aggregate and verbose:
+            # Time Series Aggregate Operators for the Vantage DESCRIBE function with verbose.
+            operators = ['max', 'mean', 'median', 'min', 'mode', "percentile", 'std']
+        elif is_time_series_aggregate and not verbose:
+            # Time Series Aggregate Operators for the Vantage DESCRIBE function.
+            operators = ['max', 'mean', 'min', 'std']
+
+        col_names = []
+        col_types = []
+        sel_agg_stmts = []
+        tdp = preparer(td_dialect)
+        quoted_function_label = tdp.quote(function_label)
+
+        if include is not None and include == 'all' and not is_time_series_aggregate:
+            operators = all_operators
+
+        if include is None and statistics is not None:
+            operators = statistics
+
+        table_name, sel_groupby, groupby = DataFrameUtils()._process_groupby_clause(table_name, groupby_column_list,
+                                                                                    is_time_series_aggregate,
+                                                                                    **kwargs)
+
+        for col in metaexpr.c:
+            if (include is None and type(col.type) in UtilFuncs()._get_numeric_datatypes()) \
+                    or include == 'all' or statistics is not None:
+                if not (groupby is not None and col.name in groupby_column_list):
+                    if columns is None or col.name in columns:
+                        col_names.append(col.name)
+                        col_types.append(col.type)
+
+        if len(col_names) == 0:
+            raise TeradataMlException(
+                Messages.get_message(MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR,
+                                     "The DataFrame does not contain numeric columns"),
+                MessageCodes.TDMLDF_AGGREGATE_COMBINED_ERR)
+        for op in operators:
+            if op == "percentile":
+                for p in percentiles:
+                    agg_expr, new_col_names, new_col_types = \
+                        DataFrameUtils._construct_sql_expression_for_aggregations(
+                            df, col_names, col_types, op, percentile=p, describe_op=True, distinct=distinct,
+                            as_time_series_aggregate=is_time_series_aggregate)
+                    sel_agg_stmts.append("SELECT \n\t{4} \n\tcast('{0}%' as varchar(6)) as \"{1}\", {2} from {3} ".format(
+                        int(p * 100), quoted_function_label, agg_expr, table_name, sel_groupby))
+            else:
+                agg_expr, new_col_names, new_col_types = \
+                    DataFrameUtils._construct_sql_expression_for_aggregations(
+                        df, col_names, col_types, op, describe_op=True, distinct=distinct,
+                        as_time_series_aggregate=is_time_series_aggregate)
+                sel_agg_stmts.append("SELECT \n\t{4} \n\tcast('{0}' as varchar(6)) as \"{1}\", \n\t{2} \nfrom \n\t{3} ".format(
+                    op, quoted_function_label, agg_expr, table_name, sel_groupby))
+        return " \nunion all\n ".join(sel_agg_stmts)
+
+    @staticmethod
+    def _process_groupby_clause(table_name, groupby_column_list, is_time_series_aggregate, **kwargs):
+        """
+        Internal function used to process and generate the GROUP BY or GROUP BY TIME clause
+        required for the query to be run for the describe operation.
+
+        PARAMETERS:
+            table_name:
+                Required Argument.
+                Specifies the table name to be used for forming the describe query.
+                Types: str
+
+            groupby_column_list:
+                Required Argument.
+                Specifies the list of column names involved in the Group By.
+                Types: List of Strings.
+
+            is_time_series_aggregate:
+                Required Argument.
+                Specifies a boolean stating whether the GROUP BY clause to be formed is for a
+                time series aggregate or not.
+                Types: bool
+
+            kwargs:
+                Optional Arguments.
+                Keyword arguments for time series aggregate functions.
+
+        RETURNS:
+            1. Table name appended with the GROUP BY clause.
+            2. Column projection string for the GROUP BY columns.
+            3. The GROUP BY clause.
+
+        RAISES:
+            None.
+
+        EXAMPLES:
+            table_name, sel_groupby, groupby = DataFrameUtils()._process_groupby_clause(table_name,
+                                                                                        groupby_column_list,
+                                                                                        is_time_series_aggregate,
+                                                                                        **kwargs)
+        """
+        sel_groupby = ""
+        grp_by_clause = None
+
+        if is_time_series_aggregate:
+            # For a time series aggregate, timebucket_duration is mandatory, so it will
+            # always be present in kwargs.
+            grp_by_clause = "GROUP BY TIME ({0}".format(kwargs['timebucket_duration'])
+
+            # Add the columns in the value expression to GROUP BY TIME.
+            if 'value_expression' in kwargs and \
+                    kwargs['value_expression'] is not None and \
+                    len(kwargs['value_expression']) > 0:
+                grp_by_clause = "{0} and {1}".format(grp_by_clause, ", ".join(kwargs['value_expression']))
+
+            # Complete the parenthesis for GROUP BY TIME.
+            grp_by_clause = "{0})".format(grp_by_clause)
+
+            # Add timecode column information.
+            if 'timecode_column' in kwargs and \
+                    kwargs['timecode_column'] is not None and \
+                    len(kwargs['timecode_column']) > 0:
+                if 'sequence_column' in kwargs and \
+                        kwargs['sequence_column'] is not None and \
+                        len(kwargs['sequence_column']) > 0:
+                    grp_by_clause = "{0} USING TIMECODE({1}, {2})".format(grp_by_clause, kwargs['timecode_column'],
+                                                                          kwargs['sequence_column'])
+                else:
+                    grp_by_clause = "{0} USING TIMECODE({1})".format(grp_by_clause, kwargs['timecode_column'])
+
+            # Add FILL information.
+            if 'fill' in kwargs and kwargs['fill'] is not None and len(kwargs['fill']) > 0:
+                grp_by_clause = "{0} FILL({1})".format(grp_by_clause, kwargs['fill'])
+
+        else:
+            if groupby_column_list is not None:
+                grp_by_clause = "GROUP BY {0}".format(",".join(groupby_column_list))
+
+        if grp_by_clause is not None:
+            table_name = "{0} \n{1}".format(table_name, grp_by_clause)
+            tdp = preparer(td_dialect)
+            for g in groupby_column_list:
+                if is_time_series_aggregate:
+                    if g == "TIMECODE_RANGE":
+                        g = "$TD_TIMECODE_RANGE"
+
+                    if "GROUP BY TIME" in g:
+                        g = "$TD_GROUP_BY_TIME"
+
+                quoted_name = tdp.quote(g)
+                sel_groupby += "{0}, ".format(quoted_name)
+
+        return table_name, sel_groupby, grp_by_clause
+
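Putting the branches above together for a time series case: with assumed inputs for timebucket_duration, value_expression, timecode_column and fill, the clause is assembled incrementally, mirroring the string building above (a sketch, not a call into teradataml):

    timebucket_duration = "MINUTES(10)"     # assumed inputs
    value_expression = ["buoyid"]
    timecode_column, fill = "TD_TIMECODE", "PREV"

    clause = "GROUP BY TIME ({0}".format(timebucket_duration)
    if value_expression:
        clause = "{0} and {1}".format(clause, ", ".join(value_expression))
    clause = "{0})".format(clause)
    if timecode_column:
        clause = "{0} USING TIMECODE({1})".format(clause, timecode_column)
    if fill:
        clause = "{0} FILL({1})".format(clause, fill)
    print(clause)
    # GROUP BY TIME (MINUTES(10) and buoyid) USING TIMECODE(TD_TIMECODE) FILL(PREV)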
+    @staticmethod
+    def _get_column_names_and_types_from_metaexpr(metaexpr):
+        """
+        Internal function to return column names and their respective types,
+        given a _metaexpr.
+
+        PARAMETERS:
+            metaexpr:
+                Required Argument.
+                The dataframe's metaexpr. It is used to get column names and types.
+                Types: MetaExpression
+
+        RETURNS:
+            Two lists - one for column names and another for column types.
+
+        RAISES:
+            None
+
+        EXAMPLES:
+            df_utils._get_column_names_and_types_from_metaexpr(df._metaexpr)
+        """
+        # Construct new column names and types for the selected columns ONLY, using the parent _metaexpr.
+        col_names = []
+        col_types = []
+        for c in metaexpr.c:
+            col_names.append(c.name)
+            col_types.append(c.type)
+
+        return col_names, col_types
+
+    @staticmethod
+    def _insert_all_from_table(to_table_name, from_table_name, column_list, to_schema_name=None,
+                               from_schema_name=None, temporary=False):
+        """
+        Inserts all records from one table into the second, using columns ordered by the column list.
+
+        PARAMETERS:
+            to_table_name    - String specifying the name of the SQL table to insert into.
+            from_table_name  - String specifying the name of the SQL table to insert from.
+            column_list      - List of strings specifying the column names used in the insertion.
+            to_schema_name   - Name of the database schema to insert table data into.
+            from_schema_name - Name of the database schema to insert table data from.
+            temporary        - Specifies whether to create Vantage tables as permanent or volatile.
+                               Default: False
+                               Note: When True:
+                                   1. volatile tables are created, and
+                                   2. schema_name is ignored.
+                               When False, permanent tables are created.
+
+        RETURNS:
+            None
+
+        RAISES:
+            Database error if an error occurred while executing the insert command.
+
+        EXAMPLES:
+            df_utils._insert_all_from_table('table1_name', 'table2_name', ['col1', 'col2', 'col3'])
+        """
+        tdp = preparer(td_dialect)
+
+        # Construct the INSERT command.
+        column_order_string = ', '.join([tdp.quote("{0}".format(element)) for element in column_list])
+
+        # Generate the full name of the destination table.
+        if to_schema_name:
+            full_to_table_name = tdp.quote(to_schema_name) + "." + tdp.quote(to_table_name)
+        elif temporary:
+            full_to_table_name = tdp.quote(to_table_name)
+        else:
+            full_to_table_name = tdp.quote(_get_current_databasename()) + "." + tdp.quote(to_table_name)
+
+        # Generate the full name of the source table.
+        if from_schema_name:
+            full_from_table_name = tdp.quote(from_schema_name) + "." + tdp.quote(from_table_name)
+        else:
+            full_from_table_name = tdp.quote(_get_current_databasename()) + "." + tdp.quote(from_table_name)
+
+        insert_sql = SQLBundle._build_insert_from_table_query(full_to_table_name,
+                                                              full_from_table_name,
+                                                              column_order_string)
+        # Execute the INSERT command.
+        return UtilFuncs._execute_ddl_statement(insert_sql)
+
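The INSERT built here follows the usual insert-select pattern. A plausible shape of the generated statement, assuming SQLBundle._build_insert_from_table_query emits a standard INSERT ... SELECT (the exact template lives in sqlbundle.py and may differ):

    # Plausible shape only; the exact template lives in SQLBundle.
    to_tab, from_tab = '"mydb"."t1"', '"mydb"."t2"'
    cols = ", ".join(['"col1"', '"col2"', '"col3"'])
    insert_sql = "INSERT INTO {0} ({1}) SELECT {1} FROM {2}".format(to_tab, cols, from_tab)
    print(insert_sql)
    # INSERT INTO "mydb"."t1" ("col1", "col2", "col3") SELECT "col1", "col2", "col3" FROM "mydb"."t2"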
+    @staticmethod
+    def _dataframe_has_column(data, column):
+        """
+        Function to check whether the given column is present in the given dataframe or not.
+        This function is currently used only for Analytics wrappers.
+
+        PARAMETERS:
+            data   - teradataml DataFrame to check against for column existence.
+            column - Column name (a string).
+
+        RAISES:
+            None
+
+        EXAMPLES:
+            DataFrameUtils._dataframe_has_column(data, col)
+        """
+        return column in [c.name for c in data._metaexpr.c]
+
+    @staticmethod
+    def _get_row_count(table_name):
+        """
+        Function to return the row count of a teradataml DataFrame.
+        This function is currently used to determine the shape/size of a dataframe.
+
+        PARAMETERS:
+            table_name - Name of the table to get the row count for.
+
+        RAISES:
+            TeradataMlException (TDMLDF_INFO_ERROR)
+
+        EXAMPLES:
+            DataFrameUtils._get_row_count(table_name)
+        """
+        # Construct the COUNT(*) query.
+        try:
+            row_count_query = SQLBundle._build_nrows_print_query(table_name)
+            res = execute_sql(row_count_query)
+            return res.fetchone()[0]
+
+        except TeradataMlException:
+            raise
+
+        except Exception as err:
+            # TODO: Better handle the level of information being presented to the user, with logging.
+            raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR) + str(err),
+                                      MessageCodes.TDMLDF_INFO_ERROR) from err
+
+    @staticmethod
+    def _get_scalar_value(table_name):
+        """
+        Function to return the only 1x1 (scalar) value from a teradataml DataFrame.
+
+        PARAMETERS:
+            table_name - Name of the table to get the value from.
+
+        RAISES:
+            TeradataMlException (TDMLDF_INFO_ERROR)
+
+        EXAMPLES:
+            DataFrameUtils._get_scalar_value(table_name)
+        """
+        # Construct the base query.
+        try:
+            select_query = SQLBundle._build_base_query(table_name)
+            res = execute_sql(select_query)
+            return res.fetchone()[0]
+
+        except TeradataMlException:
+            raise
+
+        except Exception as err:
+            # TODO: Better handle the level of information being presented to the user, with logging.
+            raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR) + str(err),
+                                      MessageCodes.TDMLDF_INFO_ERROR) from err
+
+    @staticmethod
+    def _get_sorted_nrow(df, n, sort_col, asc=True):
+        """
+        Internal utility function that returns a teradataml DataFrame containing n rows
+        of the DataFrame, sorted on the column specified in sort_col (typically the index
+        column, or the first column when there is no index column).
+
+        PARAMETERS:
+            df: teradataml DataFrame
+            n: Specifies the number of rows to select.
+               Type: int
+            sort_col: The column to sort on.
+               Type: str
+            asc: (optional) - Specifies the sort order.
+               If True, sort in ascending order.
+               If False, sort in descending order.
+               The default value is True.
+               Type: boolean
+
+        RETURNS:
+            teradataml DataFrame
+
+        EXAMPLES:
+            DataFrameUtils._get_sorted_nrow(df, 10, 'id')
+            DataFrameUtils._get_sorted_nrow(df, 20, 'id', asc=True)
+            DataFrameUtils._get_sorted_nrow(df, 30, 'id', asc=False)
+        """
+        # TODO: implement and use this in teradatasqlalchemy.
+        tdp = preparer(td_dialect)
+        aed_utils = AedUtils()
+
+        sort_order = "asc"
+        if not asc:
+            sort_order = "desc"
+
+        quoted_cols = [tdp.quote(c) for c in df.columns]
+        sel_cols_str = ",".join(quoted_cols)
+        sel_row_num = "row_number() over (order by \"{0}\" {1}) - 1 as tdml_row_num, {2}".format(sort_col,
+                                                                                                 sort_order,
+                                                                                                 sel_cols_str)
+        filter_str = "tdml_row_num < {0}".format(n)
+        sel_nodeid = aed_utils._aed_select(df._nodeid, sel_row_num)
+        fil_nodeid = aed_utils._aed_filter(sel_nodeid, filter_str)
+        sel2_nodeid = aed_utils._aed_select(fil_nodeid, sel_cols_str)
+        col_names, col_types = __class__._get_column_names_and_types_from_metaexpr(df._metaexpr)
+        new_metaexpr = UtilFuncs._get_metaexpr_using_columns(df._nodeid, zip(col_names, col_types))
+        # Call _from_node from the appropriate class, either DataFrame or GeoDataFrame.
+        new_df = df.__class__._from_node(sel2_nodeid, new_metaexpr, df._index_label)
+        new_df._orderby = df._orderby
+        new_df._metaexpr._n_rows = n
+        return new_df
+
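The select/filter/select node chain corresponds roughly to a row_number window query of the following shape (illustrative SQL only; the actual query text is generated by the AED layer, and 'mytab' is a placeholder):

    sort_col, n = "accounts", 10          # assumed inputs
    inner = ('select row_number() over (order by "{0}" asc) - 1 as tdml_row_num, * '
             'from mytab'.format(sort_col))
    query = 'select * from ({0}) d where tdml_row_num < {1}'.format(inner, n)
    print(query)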
+    @staticmethod
+    def _get_database_names(connection, schema_name):
+        """
+        Function to return a list of valid database names for a given sqlalchemy connection.
+        This function is used to determine whether the database used is valid in user APIs
+        such as copy_to_sql.
+
+        PARAMETERS:
+            connection: Required Argument.
+                A SQLAlchemy connection object.
+
+            schema_name: Required Argument.
+                String specifying the requested schema name.
+
+        RAISES:
+            TeradataMlException (TDMLDF_INFO_ERROR)
+
+        EXAMPLES:
+            DataFrameUtils._get_database_names(get_connection(), schema_name)
+        """
+        # TODO: implement and use this in teradatasqlalchemy.
+        table_obj = table('databasesV', column('databasename'), schema='dbc')
+        stmt = select(text(str(func.lower(table_obj.c.databasename)) + ' as databasename')).where(
+            text('databasename (NOT CASESPECIFIC) = {} (NOT CASESPECIFIC)'.format(':schema_name')))
+        stmt = text(str(stmt))
+        stmt = stmt.bindparams(schema_name=schema_name)
+        res = connection.execute(stmt).fetchall()
+        return [name.databasename for name in res]
+
+    @staticmethod
+    def _get_common_parent_df_from_dataframes(dfs):
+        """
+        Internal function to return the common parent dataframe from a given list of dataframes.
+        """
+        from teradataml import DataFrame, in_schema
+        aed_utils = AedUtils()
+        if len(dfs) == 1:
+            operation = aed_utils._aed_get_node_query_type(dfs[0]._nodeid)
+            if operation in ["table", "assign"]:
+                # Assign might have removed some columns; if there is only one dataframe,
+                # return that same dataframe. Also return the same dataframe if it is a
+                # DataFrame object created from a table.
+                return dfs[0]
+
+            # If it is a select node or any other node, get the parent node and execute it.
+            pids = aed_utils._aed_get_parent_nodeids(dfs[0]._nodeid)
+            if not aed_utils._aed_is_node_executed(pids[0]):
+                _ = DataFrameUtils._execute_node_return_db_object_name(pids[0])
+
+            tab_name_first = aed_utils._aed_get_source_tablename(pids[0])
+
+            db_schema = UtilFuncs._extract_db_name(tab_name_first)
+            db_table_name = UtilFuncs._extract_table_name(tab_name_first)
+
+            return DataFrame(in_schema(db_schema, db_table_name))
+
+        pids_first = None
+        parent_df = None
+        for i in range(len(dfs)):
+            pids = aed_utils._aed_get_parent_nodeids(dfs[i]._nodeid)
+
+            if parent_df is None:
+                if not aed_utils._aed_is_node_executed(pids[0]):
+                    _ = DataFrameUtils._execute_node_return_db_object_name(pids[0])
+
+                tab_name_first = aed_utils._aed_get_source_tablename(pids[0])
+
+                db_schema = UtilFuncs._extract_db_name(tab_name_first)
+                db_table_name = UtilFuncs._extract_table_name(tab_name_first)
+
+                parent_df = DataFrame(in_schema(db_schema, db_table_name))
+                pids_first = pids
+            else:
+                if pids_first != pids:
+                    raise TeradataMlException(Messages.get_message(MessageCodes.DFS_NO_COMMON_PARENT),
+                                              MessageCodes.DFS_NO_COMMON_PARENT)
+
+        return parent_df