teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml might be problematic.
Files changed (263)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +183 -0
  4. teradataml/__init__.py +6 -3
  5. teradataml/_version.py +2 -2
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +275 -40
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +1 -0
  11. teradataml/analytics/json_parser/utils.py +17 -21
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +10 -2
  15. teradataml/analytics/table_operator/__init__.py +3 -2
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +62 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1553 -319
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +276 -319
  22. teradataml/automl/data_transformation.py +163 -81
  23. teradataml/automl/feature_engineering.py +402 -239
  24. teradataml/automl/feature_exploration.py +9 -2
  25. teradataml/automl/model_evaluation.py +48 -51
  26. teradataml/automl/model_training.py +291 -189
  27. teradataml/catalog/byom.py +8 -8
  28. teradataml/catalog/model_cataloging_utils.py +1 -1
  29. teradataml/clients/auth_client.py +133 -0
  30. teradataml/clients/pkce_client.py +1 -1
  31. teradataml/common/aed_utils.py +3 -2
  32. teradataml/common/constants.py +48 -6
  33. teradataml/common/deprecations.py +13 -7
  34. teradataml/common/garbagecollector.py +156 -120
  35. teradataml/common/messagecodes.py +6 -1
  36. teradataml/common/messages.py +3 -1
  37. teradataml/common/sqlbundle.py +1 -1
  38. teradataml/common/utils.py +103 -11
  39. teradataml/common/wrapper_utils.py +1 -1
  40. teradataml/context/context.py +121 -31
  41. teradataml/data/advertising.csv +201 -0
  42. teradataml/data/bank_marketing.csv +11163 -0
  43. teradataml/data/bike_sharing.csv +732 -0
  44. teradataml/data/boston2cols.csv +721 -0
  45. teradataml/data/breast_cancer.csv +570 -0
  46. teradataml/data/complaints_test_tokenized.csv +353 -0
  47. teradataml/data/complaints_tokens_model.csv +348 -0
  48. teradataml/data/covid_confirm_sd.csv +83 -0
  49. teradataml/data/customer_segmentation_test.csv +2628 -0
  50. teradataml/data/customer_segmentation_train.csv +8069 -0
  51. teradataml/data/dataframe_example.json +10 -0
  52. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
  53. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
  54. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
  55. teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
  56. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  57. teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
  58. teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
  59. teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
  60. teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
  61. teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
  62. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  63. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
  64. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
  65. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
  66. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  67. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  68. teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
  69. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
  70. teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
  71. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
  72. teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
  73. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
  74. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  75. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  76. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  77. teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
  78. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
  79. teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
  80. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  81. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  82. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  83. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  84. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  85. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  86. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  87. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  88. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  89. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  90. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  91. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  92. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  93. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  94. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  95. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  96. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  97. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  98. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  99. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  100. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  101. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  102. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  103. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  104. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  105. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  106. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  107. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  108. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  109. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  110. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  111. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  112. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  113. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  114. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  115. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  116. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  117. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  118. teradataml/data/dwt2d_dataTable.csv +65 -0
  119. teradataml/data/dwt_dataTable.csv +8 -0
  120. teradataml/data/dwt_filterTable.csv +3 -0
  121. teradataml/data/finance_data4.csv +13 -0
  122. teradataml/data/glm_example.json +28 -1
  123. teradataml/data/grocery_transaction.csv +19 -0
  124. teradataml/data/housing_train_segment.csv +201 -0
  125. teradataml/data/idwt2d_dataTable.csv +5 -0
  126. teradataml/data/idwt_dataTable.csv +8 -0
  127. teradataml/data/idwt_filterTable.csv +3 -0
  128. teradataml/data/insect2Cols.csv +61 -0
  129. teradataml/data/interval_data.csv +5 -0
  130. teradataml/data/jsons/paired_functions.json +14 -0
  131. teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
  132. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  133. teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
  134. teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
  135. teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
  136. teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
  137. teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
  138. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  139. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  140. teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
  141. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  142. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  143. teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
  144. teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
  145. teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
  146. teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
  147. teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
  148. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  149. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  150. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  151. teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
  152. teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
  153. teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
  154. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  155. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  156. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  157. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  158. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  159. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  160. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  161. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  162. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  163. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  164. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  165. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  166. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  167. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  168. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  169. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  170. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  171. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  172. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  173. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  174. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  175. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  176. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  177. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  178. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  179. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  180. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  181. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  182. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  183. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  184. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  185. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  186. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  187. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  188. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  189. teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
  190. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  191. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  192. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  193. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  194. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  195. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
  196. teradataml/data/kmeans_example.json +5 -0
  197. teradataml/data/kmeans_table.csv +10 -0
  198. teradataml/data/load_example_data.py +8 -2
  199. teradataml/data/naivebayestextclassifier_example.json +1 -1
  200. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  201. teradataml/data/onehot_encoder_train.csv +4 -0
  202. teradataml/data/openml_example.json +29 -0
  203. teradataml/data/peppers.png +0 -0
  204. teradataml/data/real_values.csv +14 -0
  205. teradataml/data/sax_example.json +8 -0
  206. teradataml/data/scale_attributes.csv +3 -0
  207. teradataml/data/scale_example.json +52 -1
  208. teradataml/data/scale_input_part_sparse.csv +31 -0
  209. teradataml/data/scale_input_partitioned.csv +16 -0
  210. teradataml/data/scale_input_sparse.csv +11 -0
  211. teradataml/data/scale_parameters.csv +3 -0
  212. teradataml/data/scripts/deploy_script.py +21 -2
  213. teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
  214. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
  215. teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
  216. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
  217. teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
  218. teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
  219. teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
  220. teradataml/data/star_pivot.csv +8 -0
  221. teradataml/data/templates/open_source_ml.json +2 -1
  222. teradataml/data/teradataml_example.json +97 -1
  223. teradataml/data/timestamp_data.csv +4 -0
  224. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  225. teradataml/data/uaf_example.json +55 -1
  226. teradataml/data/unpivot_example.json +15 -0
  227. teradataml/data/url_data.csv +9 -0
  228. teradataml/data/windowdfft.csv +16 -0
  229. teradataml/data/ztest_example.json +16 -0
  230. teradataml/dataframe/copy_to.py +9 -4
  231. teradataml/dataframe/data_transfer.py +125 -64
  232. teradataml/dataframe/dataframe.py +575 -57
  233. teradataml/dataframe/dataframe_utils.py +47 -9
  234. teradataml/dataframe/fastload.py +273 -90
  235. teradataml/dataframe/functions.py +339 -0
  236. teradataml/dataframe/row.py +160 -0
  237. teradataml/dataframe/setop.py +2 -2
  238. teradataml/dataframe/sql.py +740 -18
  239. teradataml/dataframe/window.py +1 -1
  240. teradataml/dbutils/dbutils.py +324 -18
  241. teradataml/geospatial/geodataframe.py +1 -1
  242. teradataml/geospatial/geodataframecolumn.py +1 -1
  243. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  244. teradataml/lib/aed_0_1.dll +0 -0
  245. teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
  246. teradataml/options/__init__.py +16 -5
  247. teradataml/options/configure.py +39 -6
  248. teradataml/options/display.py +2 -2
  249. teradataml/plot/axis.py +4 -4
  250. teradataml/scriptmgmt/UserEnv.py +26 -19
  251. teradataml/scriptmgmt/lls_utils.py +120 -16
  252. teradataml/table_operators/Script.py +4 -5
  253. teradataml/table_operators/TableOperator.py +160 -26
  254. teradataml/table_operators/table_operator_util.py +88 -41
  255. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  256. teradataml/telemetry_utils/__init__.py +0 -0
  257. teradataml/telemetry_utils/queryband.py +52 -0
  258. teradataml/utils/validators.py +41 -3
  259. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
  260. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
  261. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
  262. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
  263. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
teradataml/data/openml_example.json
@@ -59,5 +59,34 @@
     "group_column" : "integer",
     "partition_column_1" : "integer",
     "partition_column_2" : "integer"
+  },
+  "onehot_encoder_train": {
+    "gender" : "varchar(20)",
+    "numb" : "integer"
+  },
+  "customer_segmentation_train": {
+    "ID" : "integer",
+    "Gender" : "varchar(10)",
+    "Ever_Married" : "varchar(10)",
+    "Age" : "integer",
+    "Graduated" : "varchar(10)",
+    "Profession" : "varchar(30)",
+    "Work_Experience" : "integer",
+    "Spending_Score" : "varchar(10)",
+    "Family_Size": "integer",
+    "Var_1": "varchar(10)",
+    "Segmentation": "varchar(2)"
+  },
+  "customer_segmentation_test": {
+    "ID" : "integer",
+    "Gender" : "varchar(10)",
+    "Ever_Married" : "varchar(10)",
+    "Age" : "integer",
+    "Graduated" : "varchar(10)",
+    "Profession" : "varchar(30)",
+    "Work_Experience" : "integer",
+    "Spending_Score" : "varchar(10)",
+    "Family_Size": "integer",
+    "Var_1": "varchar(10)"
   }
 }
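A minimal sketch of pulling the new example tables into Vantage. load_example_data() and DataFrame() are standard teradataml APIs; the "openml" load key is an assumption based on the JSON file these schemas appear to extend (openml_example.json), so swap in the correct key if it differs.

from teradataml import create_context, load_example_data, DataFrame

create_context(host="<host>", username="<user>", password="<password>")
# Load both segmentation tables added by this release.
load_example_data("openml", ["customer_segmentation_train",
                             "customer_segmentation_test"])
train = DataFrame("customer_segmentation_train")
print(train.shape)  # roughly 8068 rows x 11 columns per the CSV added above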
teradataml/data/peppers.png
Binary file
teradataml/data/real_values.csv
@@ -0,0 +1,14 @@
+"TD_TIMECODE","id","val"
+2020-01-01 08:00:00,33,1.2e+02
+2020-02-01 08:00:00,33,1.95e+02
+2020-03-01 08:00:00,33,8e+02
+2020-04-01 08:00:00,33,6.6e+01
+2020-05-01 08:00:00,33,1.44e+02
+2020-06-01 08:00:00,33,2.1e+04
+2020-07-01 08:00:00,33,3.2e+02
+2020-08-01 08:00:00,33,1.44e+02
+2020-09-01 08:00:00,33,2.2e+02
+2020-10-01 08:00:00,33,2.1e+02
+2020-11-01 08:00:00,33,1.34e+02
+2020-12-01 08:00:00,33,1.84e+02
+2020-12-02 08:00:00,33,1.98e+02
teradataml/data/sax_example.json
@@ -5,5 +5,13 @@
     "expenditure": "integer",
     "income": "integer",
     "investment": "integer"
+  },
+  "finance_data4":{
+    "id": "integer",
+    "period": "integer",
+    "expenditure": "float",
+    "income": "float",
+    "investment": "float"
+
   }
 }
teradataml/data/scale_attributes.csv
@@ -0,0 +1,3 @@
+pid,attribute_column
+1,fare
+2,age
teradataml/data/scale_example.json
@@ -19,5 +19,56 @@
     "bathrms" : "real",
     "stories" : "real"
 
-  }
+  },
+  "scale_attributes":{
+
+    "pid" : "integer",
+    "attribute_column" : "varchar(150)"
+
+  },
+  "scale_parameters":{
+
+    "pid" : "integer",
+    "parameter_column" : "varchar(150)",
+    "value_column" : "varchar(150)"
+
+  },
+  "scale_input_partitioned":{
+    "passenger" : "integer",
+    "pid" : "integer",
+    "survived" : "integer",
+    "pclass" : "integer",
+    "name" : "varchar(90)",
+    "gender" : "varchar(10)",
+    "age" : "integer",
+    "sibsp" : "integer",
+    "parch" : "integer",
+    "ticket" : "varchar(20)",
+    "fare" : "integer",
+    "cabin" : "varchar(20)",
+    "embarked" : "varchar(10)"
+
+
+  },
+
+  "scale_input_sparse":
+  {
+
+    "passenger" : "integer",
+    "attribute_column" : "varchar(20)",
+    "attribute_value" : "real"
+
+  },
+
+  "scale_input_part_sparse":
+  {
+
+    "pid" : "integer",
+    "passenger" : "integer",
+    "attribute_column" : "varchar(20)",
+    "attribute_value" : "real"
+
   }
+
+
+}
teradataml/data/scale_input_part_sparse.csv
@@ -0,0 +1,31 @@
+pid,passenger,attribute_column,attribute_value
+3,56,age,
+3,56,fare,35.5
+3,63,age,45.0
+3,63,fare,83.475
+3,67,age,29.0
+3,67,fare,10.5
+3,76,age,25.0
+3,76,fare,7.65
+3,93,age,46.0
+3,93,fare,61.175
+1,2,age,38.0
+1,2,fare,71.2833
+1,4,age,35.0
+1,4,fare,53.1
+1,7,age,54.0
+1,7,fare,51.8625
+1,11,age,4.0
+1,11,fare,16.7
+1,12,age,58.0
+1,12,fare,26.55
+2,22,age,34.0
+2,22,fare,13.0
+2,24,age,28.0
+2,24,fare,35.5
+2,32,age,
+2,32,fare,146.5208
+2,53,age,49.0
+2,53,fare,76.7292
+2,55,age,65.0
+2,55,fare,61.9792
teradataml/data/scale_input_partitioned.csv
@@ -0,0 +1,16 @@
+passenger,pid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
+76,3,0,3,Moen; Mr. Sigurd Hansen,male,25.0,0,0,348123,7.65,F G73,S
+32,2,1,1,Spencer; Mrs. William Augustus (Marie Eugenie),female,,1,0,PC 17569,146.5208,B78,C
+55,2,0,1,Ostby; Mr. Engelhart Cornelius,male,65.0,0,1,113509,61.9792,B30,C
+53,2,1,1,Harper; Mrs. Henry Sleeper (Myna Haxtun),female,49.0,1,0,PC 17572,76.7292,D33,C
+93,3,0,1,Chaffee; Mr. Herbert Fuller,male,46.0,1,0,W.E.P. 5734,61.175,E31,S
+11,1,1,3,Sandstrom; Miss. Marguerite Rut,female,4.0,1,1,PP 9549,16.7,G6,S
+7,1,0,1,McCarthy; Mr. Timothy J,male,54.0,0,0,17463,51.8625,E46,S
+24,2,1,1,Sloper; Mr. William Thompson,male,28.0,0,0,113788,35.5,A6,S
+63,3,0,1,Harris; Mr. Henry Birkhardt,male,45.0,1,0,36973,83.475,C83,S
+22,2,1,2,Beesley; Mr. Lawrence,male,34.0,0,0,248698,13.0,D56,S
+56,3,1,1,Woolner; Mr. Hugh,male,,0,0,19947,35.5,C52,S
+12,1,1,1,Bonnell; Miss. Elizabeth,female,58.0,0,0,113783,26.55,C103,S
+2,1,1,1,Cumings; Mrs. John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C
+67,3,1,2,Nye; Mrs. (Elizabeth Ramell),female,29.0,0,0,C.A. 29395,10.5,F33,S
+4,1,1,1,Futrelle; Mrs. Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,C123,S
teradataml/data/scale_input_sparse.csv
@@ -0,0 +1,11 @@
+passenger,attribute_column,attribute_value
+873,age,33.0
+631,age,80.0
+97,age,71.0
+873,fare,5.0
+631,fare,30.0
+97,fare,34.6542
+488,age,58.0
+488,fare,29.7
+505,age,16.0
+505,fare,86.5
teradataml/data/scale_parameters.csv
@@ -0,0 +1,3 @@
+pid,parameter_column,value_column
+1,scalemethod,midrange
+2,scalemethod,range
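The five scale_* files above feed the expanded ScaleFit/ScaleTransform examples (ScaleFit.py and TD_ScaleFit.json both grow substantially in this release). A hedged sketch of the sparse-input flow they suggest; the "scale" load key and the attribute_data/parameter_data argument names are assumptions inferred from these tables, not confirmed by this diff.

from teradataml import DataFrame, ScaleFit, load_example_data

# Assumed load key "scale"; the tables themselves are added above.
load_example_data("scale", ["scale_input_sparse", "scale_attributes",
                            "scale_parameters"])

# Hypothetical sparse-format call: the attribute table names the columns
# to scale, the parameter table supplies the scale method per partition.
fit_obj = ScaleFit(data=DataFrame("scale_input_sparse"),
                   attribute_data=DataFrame("scale_attributes"),
                   parameter_data=DataFrame("scale_parameters"),
                   attribute_name_column="attribute_column",
                   attribute_value_column="attribute_value")
print(fit_obj)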
teradataml/data/scripts/deploy_script.py
@@ -27,6 +27,11 @@ def get_values_list(values, ignore_none=True):
 
     return ret_vals
 
+if len(sys.argv) != 2:
+    sys.exit("Script command format: python deploy_script.py <enterprise/lake>")
+
+vantage_type = sys.argv[1]
+
 data_partition_column_values = []
 data_partition_column_indices = [5, 6]
 
@@ -55,11 +60,25 @@ if not len(features):
     sys.exit(0)
 
 X = np.array(features)
-y = np.array(labels)
+y = np.array(labels).ravel()
 
 clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
 clf.fit(X, y)
 
-model = base64.b64encode(pickle.dumps(clf))
+model_str = pickle.dumps(clf)
+
+# Prepare the corresponding model file name and extract model.
+partition_join = "_".join([str(x) for x in data_partition_column_values])
+# Replace '-' with '_' as '-' because partition_columns can be negative.
+partition_join = partition_join.replace("-", "_")
+
+if vantage_type == "lake":
+    model = f"/tmp/sklearn_model_{partition_join}.pickle"
+    with open(model, "wb") as fp:
+        fp.write(model_str)
+elif vantage_type == "enterprise":
+    model = base64.b64encode(model_str)
+else:
+    sys.exit("Invalid vantage type. Use either 'lake' or 'enterprise'.")
 
 print(*(data_partition_column_values + [model]), sep=DELIMITER)
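The pattern this change introduces, pulled out as a standalone sketch: on a Lake system the pickled model is written under /tmp and the file path is emitted with the output row, while on Enterprise the pickle is base64-encoded inline. serialize_model() is a hypothetical helper for illustration, not part of the package.

import base64
import pickle

def serialize_model(model, vantage_type, partition_join):
    # partition_join is the '_'-joined partition values with '-' mapped to '_'.
    model_str = pickle.dumps(model)
    if vantage_type == "lake":
        path = f"/tmp/sklearn_model_{partition_join}.pickle"
        with open(path, "wb") as fp:
            fp.write(model_str)  # the file path travels in the output row
        return path
    if vantage_type == "enterprise":
        return base64.b64encode(model_str)  # model bytes travel inline
    raise ValueError("Invalid vantage type. Use either 'lake' or 'enterprise'.")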
teradataml/data/scripts/sklearn/sklearn_fit.py
@@ -5,35 +5,24 @@ import base64
 
 DELIMITER = '\t'
 
-def get_value(value):
-    ret_val = value
-    try:
-        ret_val = float(value.replace(' ', ''))
-    except Exception as ex:
-        # If the value can't be converted to float, then it is string.
-        pass
-    return ret_val
-
-
-def get_values_list(values, ignore_none=True):
+def get_values_list(values, types, model_obj):
     ret_vals = []
-    for val in values:
-        if val == "" and ignore_none:
-            # Empty cell value in the database table.
+    for i, val in enumerate(values):
+        if type(model_obj).__name__ == "MultiLabelBinarizer" and val == "":
             continue
-        ret_vals.append(get_value(val))
-
+        ret_vals.append(convert_to_type(val, types[i]))
     return ret_vals
 
 def convert_to_type(val, typee):
     if typee == 'int':
-        return int(val)
+        return int(val) if val != "" else np.nan
     if typee == 'float':
-        val = get_value(val)
-        return float(val)
+        if isinstance(val, str):
+            val = val.replace(' ', '')
+        return float(val) if val != "" else np.nan
     if typee == 'bool':
-        return bool(val)
-    return str(val)
+        return eval(val) if val != "" else None
+    return str(val) if val != "" else None
 
 def get_classes_as_list(classes, actual_type):
     if classes == "None":
@@ -66,14 +55,14 @@ if len(sys.argv) != 10:
     # 3. No of feature columns.
     # 4. No of class labels.
     # 5. Comma separated indices of partition columns.
-    # 6. Comma separated types of the partition columns.
+    # 6. Comma separated types of all the data columns.
     # 7. Model file prefix to generated model file using partition columns.
     # 8. classes (separated by '--') - should be converted to list. "None" if no classes exists.
     # 9. type of elements in passed in classes. "None" if no classes exists.
    # 10. Flag to check the system type. True, means Lake, Enterprise otherwise
     sys.exit("10 arguments command line arguments should be passed: file to be run,"
              " function name, no of feature columns, no of class labels, comma separated indices"
-             " and types of partition columns, model file prefix ,"
+             " of partition columns, comma separated types of all columns, model file prefix ,"
              " classes, type of elements in classes and flag to check lake or enterprise.")
 
 is_lake_system = eval(sys.argv[9])
@@ -82,12 +71,14 @@ if not is_lake_system:
 function_name = sys.argv[1]
 n_f_cols = int(sys.argv[2])
 n_c_labels = int(sys.argv[3])
-data_partition_column_types = splitter(sys.argv[5])
+data_column_types = splitter(sys.argv[5], delim="--")
 data_partition_column_indices = splitter(sys.argv[4], convert_to="int") # indices are integers.
 model_file_prefix = sys.argv[6]
 class_type = sys.argv[8]
 classes = get_classes_as_list(sys.argv[7], class_type)
 
+data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
+
 model = None
 
 # Data Format (n_features, k_labels, one data_partition_column):
@@ -108,9 +99,7 @@ while 1:
             break
         else:
            values = line.split(DELIMITER)
-            features.append(get_values_list(values[:n_f_cols]))
-            if n_c_labels > 0:
-                labels.append(get_values_list(values[n_f_cols:(n_f_cols+n_c_labels)]))
+
            if not data_partition_column_values:
                 # Partition column values is same for all rows. Hence, only read once.
                 for i, val in enumerate(data_partition_column_indices):
@@ -133,6 +122,13 @@
            if model is None:
                 sys.exit("Model file is not installed in Vantage.")
 
+            values = get_values_list(values, data_column_types, model)
+            values = values[:-len(data_partition_column_indices)] # Already processed partition columns.
+            features.append(values[:n_f_cols])
+            if n_c_labels > 0:
+                labels.append(values[n_f_cols:(n_f_cols+n_c_labels)])
+
+
     except EOFError: # Exit if reached EOF or CTRL-D
         break
 
@@ -142,22 +138,29 @@ if not len(features):
 # Fit/partial_fit the model to the data.
 if function_name == "partial_fit":
     if labels and classes:
-        model.partial_fit(np.array(features), np.array(labels), classes=classes)
+        model.partial_fit(features, labels, classes=classes)
     elif labels:
-        model.partial_fit(np.array(features), np.array(labels))
+        model.partial_fit(features, labels)
     elif classes:
-        model.partial_fit(np.array(features), classes=classes)
+        model.partial_fit(features, classes=classes)
     else:
-        model.partial_fit(np.array(features))
+        model.partial_fit(features)
 elif function_name == "fit":
-    # For IsotonicRegression, fit() accepts training target as
-    # y: array-like of shape (n_samples,).
+    model_name = model.__class__.__name__
+    np_func_list = ["OneVsRestClassifier", "LabelBinarizer", "TSNE"]
     if labels:
-        labels = np.array(labels).reshape(-1) \
-            if model.__class__.__name__ == "IsotonicRegression" else np.array(labels)
-        model.fit(np.array(features), labels)
+        # For IsotonicRegression, fit() accepts training target as
+        # y: array-like of shape (n_samples,).
+        if model_name in ["IsotonicRegression", "LinearSVC"]:
+            labels = np.array(labels).reshape(-1)
+        if model_name in np_func_list:
+            labels = np.array(labels)
+            features = np.array(features)
+        model.fit(features, labels)
     else:
-        model.fit(np.array(features))
+        if model_name in np_func_list:
+            features = np.array(features)
+        model.fit(features)
 
 model_str = pickle.dumps(model)
 
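A minimal, self-contained illustration of the NULL handling these scripts now share: an empty string (how a SQL NULL arrives on stdin) maps to np.nan for numeric types and None otherwise, instead of the row value being silently skipped as before. The assertions below are illustrative, not taken from the package's tests.

import numpy as np

def convert_to_type(val, typee):
    # Empty string means a NULL cell streamed from the database.
    if typee == 'int':
        return int(val) if val != "" else np.nan
    if typee == 'float':
        if isinstance(val, str):
            val = val.replace(' ', '')
        return float(val) if val != "" else np.nan
    if typee == 'bool':
        return eval(val) if val != "" else None  # expects "True"/"False"
    return str(val) if val != "" else None

assert np.isnan(convert_to_type("", "float"))
assert convert_to_type("1 000.5", "float") == 1000.5
assert convert_to_type("True", "bool") is True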
teradataml/data/scripts/sklearn/sklearn_fit_predict.py
@@ -5,33 +5,22 @@ import math
 
 DELIMITER = '\t'
 
-def get_value(value):
-    ret_val = value
-    try:
-        ret_val = float(value.replace(' ', ''))
-    except Exception as ex:
-        # If the value can't be converted to float, then it is string.
-        pass
-    return ret_val
-
-def get_values_list(values, ignore_none=True):
+def get_values_list(values, types):
     ret_vals = []
-    for val in values:
-        if val == "" and ignore_none:
-            # Empty cell value in the database table.
-            continue
-        ret_vals.append(get_value(val))
-
+    for i, val in enumerate(values):
+        ret_vals.append(convert_to_type(val, types[i]))
     return ret_vals
 
 def convert_to_type(val, typee):
     if typee == 'int':
-        return int(val)
+        return int(val) if val != "" else np.nan
     if typee == 'float':
-        return float(val)
+        if isinstance(val, str):
+            val = val.replace(' ', '')
+        return float(val) if val != "" else np.nan
     if typee == 'bool':
-        return bool(val)
-    return str(val)
+        return eval(val) if val != "" else None
+    return str(val) if val != "" else None
 
 def splitter(strr, delim=",", convert_to="str"):
     """
@@ -48,13 +37,13 @@ if len(sys.argv) != 7:
     # 2. No of feature columns.
     # 3. No of class labels.
     # 4. Comma separated indices of partition columns.
-    # 5. Comma separated types of the partition columns.
+    # 5. Comma separated types of all the data columns.
     # 6. Model file prefix to generated model file using partition columns.
     # 7. Flag to check the system type. True, means Lake, Enterprise otherwise.
     sys.exit("7 arguments should be passed to this file - file to be run, "\
-             "no of feature columns, no of class labels, comma separated indices and types of "\
-             "partition columns, model file prefix to generate model file using partition "\
-             "columns and flag to check lake or enterprise.")
+             "no of feature columns, no of class labels, comma separated indices of partition "
+             "columns, comma separated types of all columns, model file prefix to generate model "
+             "file using partition columns and flag to check lake or enterprise.")
 
 is_lake_system = eval(sys.argv[6])
 if not is_lake_system:
@@ -62,9 +51,11 @@ if not is_lake_system:
 n_f_cols = int(sys.argv[1])
 n_c_labels = int(sys.argv[2])
 model_file_prefix = sys.argv[5]
-data_partition_column_types = splitter(sys.argv[4])
+data_column_types = splitter(sys.argv[4], delim="--")
 data_partition_column_indices = splitter(sys.argv[3], convert_to="int") # indices are integers.
 
+data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
+
 model = None
 
 # Data Format (n_features, k_labels, one data_partition_columns):
@@ -85,9 +76,10 @@ while 1:
             break
         else:
            values = line.split(DELIMITER)
-            features.append(get_values_list(values[:n_f_cols]))
+            values = get_values_list(values, data_column_types)
+            features.append(values[:n_f_cols])
            if n_c_labels > 0:
-                labels.append(get_values_list(values[n_f_cols:(n_f_cols+n_c_labels)]))
+                labels.append(values[n_f_cols:(n_f_cols+n_c_labels)])
            if not data_partition_column_values:
                 # Partition column values is same for all rows. Hence, only read once.
                 for i, val in enumerate(data_partition_column_indices):
@@ -118,9 +110,9 @@ if not len(features):
 
 # write code to call fit_predict with features and labels when n_c_labels > 0
 if n_c_labels > 0:
-    predictions = model.fit_predict(np.array(features), np.array(labels))
+    predictions = model.fit_predict(features, labels)
 else:
-    predictions = model.fit_predict(np.array(features))
+    predictions = model.fit_predict(features)
 
 # Export results to to the Databse through standard output
 for i in range(len(predictions)):
@@ -130,6 +122,6 @@
     else:
         result_list = features[i] + [predictions[i]]
     print(*(data_partition_column_values +
-            ['' if (val is None or math.isnan(val) or math.isinf(val))
+            ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val))))
             else val for val in result_list]),
           sep= DELIMITER)
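The last hunk guards the NaN/inf masking against string outputs: math.isnan() and math.isinf() raise TypeError on a str, so string values now pass through untouched. A tiny sketch of the fixed predicate, with illustrative assertions:

import math

def mask(val):
    # Blank out None and non-string NaN/inf; keep strings and finite numbers.
    return '' if (val is None or
                  (not isinstance(val, str) and
                   (math.isnan(val) or math.isinf(val)))) else val

assert mask(float("nan")) == ''
assert mask("male") == "male"  # previously raised TypeError
assert mask(3.5) == 3.5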
teradataml/data/scripts/sklearn/sklearn_function.template
@@ -8,23 +8,16 @@ params = json.loads('<params>')
 
 DELIMITER = '\t'
 
-def get_value(value):
-    ret_val = value
-    try:
-        ret_val = float(value.replace(' ', ''))
-    except Exception as ex:
-        # If the value can't be converted to float, then it is string.
-        pass
-    return ret_val
-
 def convert_to_type(val, typee):
     if typee == 'int':
-        return int(val)
+        return int(val) if val != "" else np.nan
     if typee == 'float':
-        return get_value(val)
+        if isinstance(val, str):
+            val = val.replace(' ', '')
+        return float(val) if val != "" else np.nan
     if typee == 'bool':
-        return bool(val)
-    return str(val)
+        return eval(val) if val != "" else None
+    return str(val) if val != "" else None
 
 def splitter(strr, delim=",", convert_to="str"):
     """
@@ -35,21 +28,30 @@ def splitter(strr, delim=",", convert_to="str"):
     return [convert_to_type(i, convert_to) for i in strr.split(delim)]
 
 # Arguments to the Script.
-if len(sys.argv) != 4:
-    # 4 arguments command line arguments should be passed to this file.
+if len(sys.argv) != 6:
+    # 5 arguments command line arguments should be passed to this file.
     # 1: file to be run
     # 2. Comma separated indices of partition columns.
-    # 3. Comma separated types of the partition columns.
+    # 3. Comma separated types of all the data columns.
     # 4. Data columns information separted by "--" where each data column information is in the form
     #    "<arg_name>-<comma separated data indices>-<comma separated data types>".
-    sys.exit("4 arguments command line arguments should be passed: file to be run,"
-             " comma separated indices and types of partition columns, data columns information"
-             " separated by '--' where each data column information is in the form"
-             " '<arg_name>-<comma separated data indices>-<comma separated data types>'.")
-
-db = sys.argv[0].split("/")[1]
+    # 5. Flag to check the system type. True, means Lake, Enterprise otherwise.
+    # 6. Model file prefix for lake system, None otherwise.
+    sys.exit("5 arguments command line arguments should be passed: file to be run,"
+             " comma separated indices of partition columns, comma separated types of all columns,"
+             " data columns information separated by '--' where each data column information is"
+             " in the form '<arg_name>-<comma separated data indices>-<comma separated data types>',"
+             " flag to check lake or enterprise and model file prefix used only for lake system.")
+
+is_lake_system = eval(sys.argv[4])
+if not is_lake_system:
+    db = sys.argv[0].split("/")[1]
+else:
+    model_file_prefix = sys.argv[5]
 data_partition_column_indices = splitter(sys.argv[1], convert_to="int") # indices are integers.
-data_partition_column_types = splitter(sys.argv[2])
+data_column_types = splitter(sys.argv[2], delim="--")
+
+data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
 
 # Data related arguments information of indices and types.
 data_args_indices_types = OrderedDict()
@@ -84,6 +86,11 @@ while 1:
                 data_partition_column_values.append(
                     convert_to_type(values[val], typee=data_partition_column_types[i])
                 )
+
+        # Prepare the corresponding model file name and extract model.
+        partition_join = "_".join([str(x) for x in data_partition_column_values])
+        # Replace '-' with '_' as '-' because partition_columns can be negative.
+        partition_join = partition_join.replace("-", "_")
 
         # Prepare data dictionary containing only arguments related to data.
         for arg_name in data_args_values:
@@ -110,4 +117,15 @@ all_args = {**data_args_values, **params}
 module_ = importlib.import_module(module_name)
 sklearn_model = getattr(module_, func_name)(**all_args)
 
-print(*(data_partition_column_values + [base64.b64encode(pickle.dumps(sklearn_model))]), sep=DELIMITER)
+model_str = pickle.dumps(sklearn_model)
+
+if is_lake_system:
+    model_file_path = f"/tmp/{model_file_prefix}_{partition_join}.pickle"
+
+    # Write to file in Vantage, to be used in predict/scoring.
+    with open(model_file_path, "wb") as fp:
+        fp.write(model_str)
+
+model_data = model_file_path if is_lake_system else base64.b64encode(model_str)
+
+print(*(data_partition_column_values + [model_data]), sep=DELIMITER)
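For context, the instantiation step this template builds up to: the '<params>' placeholder is substituted with a JSON dict of constructor arguments, and the estimator class is resolved by name at run time via importlib. A hedged sketch with literal stand-ins for the substituted values; the module and class names below are hypothetical examples:

import importlib
import json

params = json.loads('{"C": 1.0, "kernel": "rbf"}')  # stands in for '<params>'
module_name, func_name = "sklearn.svm", "SVC"       # hypothetical resolution
module_ = importlib.import_module(module_name)
sklearn_model = getattr(module_, func_name)(**params)
print(sklearn_model)  # SVC(C=1.0)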