oracle-ads 2.13.7__py3-none-any.whl → 2.13.9rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (857)
  1. {oracle_ads-2.13.7.dist-info → oracle_ads-2.13.9rc0.dist-info}/METADATA +151 -151
  2. oracle_ads-2.13.9rc0.dist-info/RECORD +9 -0
  3. {oracle_ads-2.13.7.dist-info → oracle_ads-2.13.9rc0.dist-info}/WHEEL +2 -1
  4. {oracle_ads-2.13.7.dist-info → oracle_ads-2.13.9rc0.dist-info}/entry_points.txt +1 -2
  5. oracle_ads-2.13.9rc0.dist-info/top_level.txt +1 -0
  6. ads/aqua/__init__.py +0 -40
  7. ads/aqua/app.py +0 -506
  8. ads/aqua/cli.py +0 -96
  9. ads/aqua/client/__init__.py +0 -3
  10. ads/aqua/client/client.py +0 -836
  11. ads/aqua/client/openai_client.py +0 -305
  12. ads/aqua/common/__init__.py +0 -5
  13. ads/aqua/common/decorator.py +0 -125
  14. ads/aqua/common/entities.py +0 -266
  15. ads/aqua/common/enums.py +0 -122
  16. ads/aqua/common/errors.py +0 -109
  17. ads/aqua/common/utils.py +0 -1285
  18. ads/aqua/config/__init__.py +0 -4
  19. ads/aqua/config/container_config.py +0 -248
  20. ads/aqua/config/evaluation/__init__.py +0 -4
  21. ads/aqua/config/evaluation/evaluation_service_config.py +0 -147
  22. ads/aqua/config/utils/__init__.py +0 -4
  23. ads/aqua/config/utils/serializer.py +0 -339
  24. ads/aqua/constants.py +0 -114
  25. ads/aqua/data.py +0 -14
  26. ads/aqua/dummy_data/icon.txt +0 -1
  27. ads/aqua/dummy_data/oci_model_deployments.json +0 -56
  28. ads/aqua/dummy_data/oci_models.json +0 -1
  29. ads/aqua/dummy_data/readme.md +0 -26
  30. ads/aqua/evaluation/__init__.py +0 -8
  31. ads/aqua/evaluation/constants.py +0 -53
  32. ads/aqua/evaluation/entities.py +0 -186
  33. ads/aqua/evaluation/errors.py +0 -70
  34. ads/aqua/evaluation/evaluation.py +0 -1814
  35. ads/aqua/extension/__init__.py +0 -42
  36. ads/aqua/extension/aqua_ws_msg_handler.py +0 -76
  37. ads/aqua/extension/base_handler.py +0 -90
  38. ads/aqua/extension/common_handler.py +0 -121
  39. ads/aqua/extension/common_ws_msg_handler.py +0 -36
  40. ads/aqua/extension/deployment_handler.py +0 -298
  41. ads/aqua/extension/deployment_ws_msg_handler.py +0 -54
  42. ads/aqua/extension/errors.py +0 -30
  43. ads/aqua/extension/evaluation_handler.py +0 -129
  44. ads/aqua/extension/evaluation_ws_msg_handler.py +0 -61
  45. ads/aqua/extension/finetune_handler.py +0 -96
  46. ads/aqua/extension/model_handler.py +0 -390
  47. ads/aqua/extension/models/__init__.py +0 -0
  48. ads/aqua/extension/models/ws_models.py +0 -145
  49. ads/aqua/extension/models_ws_msg_handler.py +0 -50
  50. ads/aqua/extension/ui_handler.py +0 -282
  51. ads/aqua/extension/ui_websocket_handler.py +0 -130
  52. ads/aqua/extension/utils.py +0 -133
  53. ads/aqua/finetuning/__init__.py +0 -7
  54. ads/aqua/finetuning/constants.py +0 -23
  55. ads/aqua/finetuning/entities.py +0 -181
  56. ads/aqua/finetuning/finetuning.py +0 -731
  57. ads/aqua/model/__init__.py +0 -8
  58. ads/aqua/model/constants.py +0 -60
  59. ads/aqua/model/entities.py +0 -306
  60. ads/aqua/model/enums.py +0 -30
  61. ads/aqua/model/model.py +0 -2079
  62. ads/aqua/modeldeployment/__init__.py +0 -8
  63. ads/aqua/modeldeployment/constants.py +0 -10
  64. ads/aqua/modeldeployment/deployment.py +0 -1324
  65. ads/aqua/modeldeployment/entities.py +0 -653
  66. ads/aqua/modeldeployment/inference.py +0 -74
  67. ads/aqua/modeldeployment/utils.py +0 -543
  68. ads/aqua/resources/gpu_shapes_index.json +0 -94
  69. ads/aqua/server/__init__.py +0 -4
  70. ads/aqua/server/__main__.py +0 -24
  71. ads/aqua/server/app.py +0 -47
  72. ads/aqua/server/aqua_spec.yml +0 -1291
  73. ads/aqua/training/__init__.py +0 -4
  74. ads/aqua/training/exceptions.py +0 -476
  75. ads/aqua/ui.py +0 -499
  76. ads/automl/__init__.py +0 -9
  77. ads/automl/driver.py +0 -330
  78. ads/automl/provider.py +0 -975
  79. ads/bds/__init__.py +0 -5
  80. ads/bds/auth.py +0 -127
  81. ads/bds/big_data_service.py +0 -255
  82. ads/catalog/__init__.py +0 -19
  83. ads/catalog/model.py +0 -1576
  84. ads/catalog/notebook.py +0 -461
  85. ads/catalog/project.py +0 -468
  86. ads/catalog/summary.py +0 -178
  87. ads/common/__init__.py +0 -11
  88. ads/common/analyzer.py +0 -65
  89. ads/common/artifact/.model-ignore +0 -63
  90. ads/common/artifact/__init__.py +0 -10
  91. ads/common/auth.py +0 -1122
  92. ads/common/card_identifier.py +0 -83
  93. ads/common/config.py +0 -647
  94. ads/common/data.py +0 -165
  95. ads/common/decorator/__init__.py +0 -9
  96. ads/common/decorator/argument_to_case.py +0 -88
  97. ads/common/decorator/deprecate.py +0 -69
  98. ads/common/decorator/require_nonempty_arg.py +0 -65
  99. ads/common/decorator/runtime_dependency.py +0 -178
  100. ads/common/decorator/threaded.py +0 -97
  101. ads/common/decorator/utils.py +0 -35
  102. ads/common/dsc_file_system.py +0 -303
  103. ads/common/error.py +0 -14
  104. ads/common/extended_enum.py +0 -81
  105. ads/common/function/__init__.py +0 -5
  106. ads/common/function/fn_util.py +0 -142
  107. ads/common/function/func_conf.yaml +0 -25
  108. ads/common/ipython.py +0 -76
  109. ads/common/model.py +0 -679
  110. ads/common/model_artifact.py +0 -1759
  111. ads/common/model_artifact_schema.json +0 -107
  112. ads/common/model_export_util.py +0 -664
  113. ads/common/model_metadata.py +0 -24
  114. ads/common/object_storage_details.py +0 -296
  115. ads/common/oci_client.py +0 -175
  116. ads/common/oci_datascience.py +0 -46
  117. ads/common/oci_logging.py +0 -1144
  118. ads/common/oci_mixin.py +0 -957
  119. ads/common/oci_resource.py +0 -136
  120. ads/common/serializer.py +0 -559
  121. ads/common/utils.py +0 -1852
  122. ads/common/word_lists.py +0 -1491
  123. ads/common/work_request.py +0 -189
  124. ads/data_labeling/__init__.py +0 -13
  125. ads/data_labeling/boundingbox.py +0 -253
  126. ads/data_labeling/constants.py +0 -47
  127. ads/data_labeling/data_labeling_service.py +0 -244
  128. ads/data_labeling/interface/__init__.py +0 -5
  129. ads/data_labeling/interface/loader.py +0 -16
  130. ads/data_labeling/interface/parser.py +0 -16
  131. ads/data_labeling/interface/reader.py +0 -23
  132. ads/data_labeling/loader/__init__.py +0 -5
  133. ads/data_labeling/loader/file_loader.py +0 -241
  134. ads/data_labeling/metadata.py +0 -110
  135. ads/data_labeling/mixin/__init__.py +0 -5
  136. ads/data_labeling/mixin/data_labeling.py +0 -232
  137. ads/data_labeling/ner.py +0 -129
  138. ads/data_labeling/parser/__init__.py +0 -5
  139. ads/data_labeling/parser/dls_record_parser.py +0 -388
  140. ads/data_labeling/parser/export_metadata_parser.py +0 -94
  141. ads/data_labeling/parser/export_record_parser.py +0 -473
  142. ads/data_labeling/reader/__init__.py +0 -5
  143. ads/data_labeling/reader/dataset_reader.py +0 -574
  144. ads/data_labeling/reader/dls_record_reader.py +0 -121
  145. ads/data_labeling/reader/export_record_reader.py +0 -62
  146. ads/data_labeling/reader/jsonl_reader.py +0 -75
  147. ads/data_labeling/reader/metadata_reader.py +0 -203
  148. ads/data_labeling/reader/record_reader.py +0 -263
  149. ads/data_labeling/record.py +0 -52
  150. ads/data_labeling/visualizer/__init__.py +0 -5
  151. ads/data_labeling/visualizer/image_visualizer.py +0 -525
  152. ads/data_labeling/visualizer/text_visualizer.py +0 -357
  153. ads/database/__init__.py +0 -5
  154. ads/database/connection.py +0 -338
  155. ads/dataset/__init__.py +0 -10
  156. ads/dataset/capabilities.md +0 -51
  157. ads/dataset/classification_dataset.py +0 -339
  158. ads/dataset/correlation.py +0 -226
  159. ads/dataset/correlation_plot.py +0 -563
  160. ads/dataset/dask_series.py +0 -173
  161. ads/dataset/dataframe_transformer.py +0 -110
  162. ads/dataset/dataset.py +0 -1979
  163. ads/dataset/dataset_browser.py +0 -360
  164. ads/dataset/dataset_with_target.py +0 -995
  165. ads/dataset/exception.py +0 -25
  166. ads/dataset/factory.py +0 -987
  167. ads/dataset/feature_engineering_transformer.py +0 -35
  168. ads/dataset/feature_selection.py +0 -107
  169. ads/dataset/forecasting_dataset.py +0 -26
  170. ads/dataset/helper.py +0 -1450
  171. ads/dataset/label_encoder.py +0 -99
  172. ads/dataset/mixin/__init__.py +0 -5
  173. ads/dataset/mixin/dataset_accessor.py +0 -134
  174. ads/dataset/pipeline.py +0 -58
  175. ads/dataset/plot.py +0 -710
  176. ads/dataset/progress.py +0 -86
  177. ads/dataset/recommendation.py +0 -297
  178. ads/dataset/recommendation_transformer.py +0 -502
  179. ads/dataset/regression_dataset.py +0 -14
  180. ads/dataset/sampled_dataset.py +0 -1050
  181. ads/dataset/target.py +0 -98
  182. ads/dataset/timeseries.py +0 -18
  183. ads/dbmixin/__init__.py +0 -5
  184. ads/dbmixin/db_pandas_accessor.py +0 -153
  185. ads/environment/__init__.py +0 -9
  186. ads/environment/ml_runtime.py +0 -66
  187. ads/evaluations/README.md +0 -14
  188. ads/evaluations/__init__.py +0 -109
  189. ads/evaluations/evaluation_plot.py +0 -983
  190. ads/evaluations/evaluator.py +0 -1334
  191. ads/evaluations/statistical_metrics.py +0 -543
  192. ads/experiments/__init__.py +0 -9
  193. ads/experiments/capabilities.md +0 -0
  194. ads/explanations/__init__.py +0 -21
  195. ads/explanations/base_explainer.py +0 -142
  196. ads/explanations/capabilities.md +0 -83
  197. ads/explanations/explainer.py +0 -190
  198. ads/explanations/mlx_global_explainer.py +0 -1050
  199. ads/explanations/mlx_interface.py +0 -386
  200. ads/explanations/mlx_local_explainer.py +0 -287
  201. ads/explanations/mlx_whatif_explainer.py +0 -201
  202. ads/feature_engineering/__init__.py +0 -20
  203. ads/feature_engineering/accessor/__init__.py +0 -5
  204. ads/feature_engineering/accessor/dataframe_accessor.py +0 -535
  205. ads/feature_engineering/accessor/mixin/__init__.py +0 -5
  206. ads/feature_engineering/accessor/mixin/correlation.py +0 -166
  207. ads/feature_engineering/accessor/mixin/eda_mixin.py +0 -266
  208. ads/feature_engineering/accessor/mixin/eda_mixin_series.py +0 -85
  209. ads/feature_engineering/accessor/mixin/feature_types_mixin.py +0 -211
  210. ads/feature_engineering/accessor/mixin/utils.py +0 -65
  211. ads/feature_engineering/accessor/series_accessor.py +0 -431
  212. ads/feature_engineering/adsimage/__init__.py +0 -5
  213. ads/feature_engineering/adsimage/image.py +0 -192
  214. ads/feature_engineering/adsimage/image_reader.py +0 -170
  215. ads/feature_engineering/adsimage/interface/__init__.py +0 -5
  216. ads/feature_engineering/adsimage/interface/reader.py +0 -19
  217. ads/feature_engineering/adsstring/__init__.py +0 -7
  218. ads/feature_engineering/adsstring/oci_language/__init__.py +0 -8
  219. ads/feature_engineering/adsstring/string/__init__.py +0 -8
  220. ads/feature_engineering/data_schema.json +0 -57
  221. ads/feature_engineering/dataset/__init__.py +0 -5
  222. ads/feature_engineering/dataset/zip_code_data.py +0 -42062
  223. ads/feature_engineering/exceptions.py +0 -40
  224. ads/feature_engineering/feature_type/__init__.py +0 -133
  225. ads/feature_engineering/feature_type/address.py +0 -184
  226. ads/feature_engineering/feature_type/adsstring/__init__.py +0 -5
  227. ads/feature_engineering/feature_type/adsstring/common_regex_mixin.py +0 -164
  228. ads/feature_engineering/feature_type/adsstring/oci_language.py +0 -93
  229. ads/feature_engineering/feature_type/adsstring/parsers/__init__.py +0 -5
  230. ads/feature_engineering/feature_type/adsstring/parsers/base.py +0 -47
  231. ads/feature_engineering/feature_type/adsstring/parsers/nltk_parser.py +0 -96
  232. ads/feature_engineering/feature_type/adsstring/parsers/spacy_parser.py +0 -221
  233. ads/feature_engineering/feature_type/adsstring/string.py +0 -258
  234. ads/feature_engineering/feature_type/base.py +0 -58
  235. ads/feature_engineering/feature_type/boolean.py +0 -183
  236. ads/feature_engineering/feature_type/category.py +0 -146
  237. ads/feature_engineering/feature_type/constant.py +0 -137
  238. ads/feature_engineering/feature_type/continuous.py +0 -151
  239. ads/feature_engineering/feature_type/creditcard.py +0 -314
  240. ads/feature_engineering/feature_type/datetime.py +0 -190
  241. ads/feature_engineering/feature_type/discrete.py +0 -134
  242. ads/feature_engineering/feature_type/document.py +0 -43
  243. ads/feature_engineering/feature_type/gis.py +0 -251
  244. ads/feature_engineering/feature_type/handler/__init__.py +0 -5
  245. ads/feature_engineering/feature_type/handler/feature_validator.py +0 -524
  246. ads/feature_engineering/feature_type/handler/feature_warning.py +0 -319
  247. ads/feature_engineering/feature_type/handler/warnings.py +0 -128
  248. ads/feature_engineering/feature_type/integer.py +0 -142
  249. ads/feature_engineering/feature_type/ip_address.py +0 -144
  250. ads/feature_engineering/feature_type/ip_address_v4.py +0 -138
  251. ads/feature_engineering/feature_type/ip_address_v6.py +0 -138
  252. ads/feature_engineering/feature_type/lat_long.py +0 -256
  253. ads/feature_engineering/feature_type/object.py +0 -43
  254. ads/feature_engineering/feature_type/ordinal.py +0 -132
  255. ads/feature_engineering/feature_type/phone_number.py +0 -135
  256. ads/feature_engineering/feature_type/string.py +0 -171
  257. ads/feature_engineering/feature_type/text.py +0 -93
  258. ads/feature_engineering/feature_type/unknown.py +0 -43
  259. ads/feature_engineering/feature_type/zip_code.py +0 -164
  260. ads/feature_engineering/feature_type_manager.py +0 -406
  261. ads/feature_engineering/schema.py +0 -795
  262. ads/feature_engineering/utils.py +0 -245
  263. ads/feature_store/.readthedocs.yaml +0 -19
  264. ads/feature_store/README.md +0 -65
  265. ads/feature_store/__init__.py +0 -9
  266. ads/feature_store/common/__init__.py +0 -0
  267. ads/feature_store/common/enums.py +0 -339
  268. ads/feature_store/common/exceptions.py +0 -18
  269. ads/feature_store/common/spark_session_singleton.py +0 -125
  270. ads/feature_store/common/utils/__init__.py +0 -0
  271. ads/feature_store/common/utils/base64_encoder_decoder.py +0 -72
  272. ads/feature_store/common/utils/feature_schema_mapper.py +0 -283
  273. ads/feature_store/common/utils/transformation_utils.py +0 -82
  274. ads/feature_store/common/utils/utility.py +0 -403
  275. ads/feature_store/data_validation/__init__.py +0 -0
  276. ads/feature_store/data_validation/great_expectation.py +0 -129
  277. ads/feature_store/dataset.py +0 -1230
  278. ads/feature_store/dataset_job.py +0 -530
  279. ads/feature_store/docs/Dockerfile +0 -7
  280. ads/feature_store/docs/Makefile +0 -44
  281. ads/feature_store/docs/conf.py +0 -28
  282. ads/feature_store/docs/requirements.txt +0 -14
  283. ads/feature_store/docs/source/ads.feature_store.query.rst +0 -20
  284. ads/feature_store/docs/source/cicd.rst +0 -137
  285. ads/feature_store/docs/source/conf.py +0 -86
  286. ads/feature_store/docs/source/data_versioning.rst +0 -33
  287. ads/feature_store/docs/source/dataset.rst +0 -388
  288. ads/feature_store/docs/source/dataset_job.rst +0 -27
  289. ads/feature_store/docs/source/demo.rst +0 -70
  290. ads/feature_store/docs/source/entity.rst +0 -78
  291. ads/feature_store/docs/source/feature_group.rst +0 -624
  292. ads/feature_store/docs/source/feature_group_job.rst +0 -29
  293. ads/feature_store/docs/source/feature_store.rst +0 -122
  294. ads/feature_store/docs/source/feature_store_class.rst +0 -123
  295. ads/feature_store/docs/source/feature_validation.rst +0 -66
  296. ads/feature_store/docs/source/figures/cicd.png +0 -0
  297. ads/feature_store/docs/source/figures/data_validation.png +0 -0
  298. ads/feature_store/docs/source/figures/data_versioning.png +0 -0
  299. ads/feature_store/docs/source/figures/dataset.gif +0 -0
  300. ads/feature_store/docs/source/figures/dataset.png +0 -0
  301. ads/feature_store/docs/source/figures/dataset_lineage.png +0 -0
  302. ads/feature_store/docs/source/figures/dataset_statistics.png +0 -0
  303. ads/feature_store/docs/source/figures/dataset_statistics_viz.png +0 -0
  304. ads/feature_store/docs/source/figures/dataset_validation_results.png +0 -0
  305. ads/feature_store/docs/source/figures/dataset_validation_summary.png +0 -0
  306. ads/feature_store/docs/source/figures/drift_monitoring.png +0 -0
  307. ads/feature_store/docs/source/figures/entity.png +0 -0
  308. ads/feature_store/docs/source/figures/feature_group.png +0 -0
  309. ads/feature_store/docs/source/figures/feature_group_lineage.png +0 -0
  310. ads/feature_store/docs/source/figures/feature_group_statistics_viz.png +0 -0
  311. ads/feature_store/docs/source/figures/feature_store_deployment.png +0 -0
  312. ads/feature_store/docs/source/figures/feature_store_overview.png +0 -0
  313. ads/feature_store/docs/source/figures/featuregroup.gif +0 -0
  314. ads/feature_store/docs/source/figures/lineage_d1.png +0 -0
  315. ads/feature_store/docs/source/figures/lineage_d2.png +0 -0
  316. ads/feature_store/docs/source/figures/lineage_fg.png +0 -0
  317. ads/feature_store/docs/source/figures/logo-dark-mode.png +0 -0
  318. ads/feature_store/docs/source/figures/logo-light-mode.png +0 -0
  319. ads/feature_store/docs/source/figures/overview.png +0 -0
  320. ads/feature_store/docs/source/figures/resource_manager.png +0 -0
  321. ads/feature_store/docs/source/figures/resource_manager_feature_store_stack.png +0 -0
  322. ads/feature_store/docs/source/figures/resource_manager_home.png +0 -0
  323. ads/feature_store/docs/source/figures/stats_1.png +0 -0
  324. ads/feature_store/docs/source/figures/stats_2.png +0 -0
  325. ads/feature_store/docs/source/figures/stats_d.png +0 -0
  326. ads/feature_store/docs/source/figures/stats_fg.png +0 -0
  327. ads/feature_store/docs/source/figures/transformation.png +0 -0
  328. ads/feature_store/docs/source/figures/transformations.gif +0 -0
  329. ads/feature_store/docs/source/figures/validation.png +0 -0
  330. ads/feature_store/docs/source/figures/validation_fg.png +0 -0
  331. ads/feature_store/docs/source/figures/validation_results.png +0 -0
  332. ads/feature_store/docs/source/figures/validation_summary.png +0 -0
  333. ads/feature_store/docs/source/index.rst +0 -81
  334. ads/feature_store/docs/source/module.rst +0 -8
  335. ads/feature_store/docs/source/notebook.rst +0 -94
  336. ads/feature_store/docs/source/overview.rst +0 -47
  337. ads/feature_store/docs/source/quickstart.rst +0 -176
  338. ads/feature_store/docs/source/release_notes.rst +0 -194
  339. ads/feature_store/docs/source/setup_feature_store.rst +0 -81
  340. ads/feature_store/docs/source/statistics.rst +0 -58
  341. ads/feature_store/docs/source/transformation.rst +0 -199
  342. ads/feature_store/docs/source/ui.rst +0 -65
  343. ads/feature_store/docs/source/user_guides.setup.feature_store_operator.rst +0 -66
  344. ads/feature_store/docs/source/user_guides.setup.helm_chart.rst +0 -192
  345. ads/feature_store/docs/source/user_guides.setup.terraform.rst +0 -338
  346. ads/feature_store/entity.py +0 -718
  347. ads/feature_store/execution_strategy/__init__.py +0 -0
  348. ads/feature_store/execution_strategy/delta_lake/__init__.py +0 -0
  349. ads/feature_store/execution_strategy/delta_lake/delta_lake_service.py +0 -375
  350. ads/feature_store/execution_strategy/engine/__init__.py +0 -0
  351. ads/feature_store/execution_strategy/engine/spark_engine.py +0 -316
  352. ads/feature_store/execution_strategy/execution_strategy.py +0 -113
  353. ads/feature_store/execution_strategy/execution_strategy_provider.py +0 -47
  354. ads/feature_store/execution_strategy/spark/__init__.py +0 -0
  355. ads/feature_store/execution_strategy/spark/spark_execution.py +0 -618
  356. ads/feature_store/feature.py +0 -192
  357. ads/feature_store/feature_group.py +0 -1494
  358. ads/feature_store/feature_group_expectation.py +0 -346
  359. ads/feature_store/feature_group_job.py +0 -602
  360. ads/feature_store/feature_lineage/__init__.py +0 -0
  361. ads/feature_store/feature_lineage/graphviz_service.py +0 -180
  362. ads/feature_store/feature_option_details.py +0 -50
  363. ads/feature_store/feature_statistics/__init__.py +0 -0
  364. ads/feature_store/feature_statistics/statistics_service.py +0 -99
  365. ads/feature_store/feature_store.py +0 -699
  366. ads/feature_store/feature_store_registrar.py +0 -518
  367. ads/feature_store/input_feature_detail.py +0 -149
  368. ads/feature_store/mixin/__init__.py +0 -4
  369. ads/feature_store/mixin/oci_feature_store.py +0 -145
  370. ads/feature_store/model_details.py +0 -73
  371. ads/feature_store/query/__init__.py +0 -0
  372. ads/feature_store/query/filter.py +0 -266
  373. ads/feature_store/query/generator/__init__.py +0 -0
  374. ads/feature_store/query/generator/query_generator.py +0 -298
  375. ads/feature_store/query/join.py +0 -161
  376. ads/feature_store/query/query.py +0 -403
  377. ads/feature_store/query/validator/__init__.py +0 -0
  378. ads/feature_store/query/validator/query_validator.py +0 -57
  379. ads/feature_store/response/__init__.py +0 -0
  380. ads/feature_store/response/response_builder.py +0 -68
  381. ads/feature_store/service/__init__.py +0 -0
  382. ads/feature_store/service/oci_dataset.py +0 -139
  383. ads/feature_store/service/oci_dataset_job.py +0 -199
  384. ads/feature_store/service/oci_entity.py +0 -125
  385. ads/feature_store/service/oci_feature_group.py +0 -164
  386. ads/feature_store/service/oci_feature_group_job.py +0 -214
  387. ads/feature_store/service/oci_feature_store.py +0 -182
  388. ads/feature_store/service/oci_lineage.py +0 -87
  389. ads/feature_store/service/oci_transformation.py +0 -104
  390. ads/feature_store/statistics/__init__.py +0 -0
  391. ads/feature_store/statistics/abs_feature_value.py +0 -49
  392. ads/feature_store/statistics/charts/__init__.py +0 -0
  393. ads/feature_store/statistics/charts/abstract_feature_plot.py +0 -37
  394. ads/feature_store/statistics/charts/box_plot.py +0 -148
  395. ads/feature_store/statistics/charts/frequency_distribution.py +0 -65
  396. ads/feature_store/statistics/charts/probability_distribution.py +0 -68
  397. ads/feature_store/statistics/charts/top_k_frequent_elements.py +0 -98
  398. ads/feature_store/statistics/feature_stat.py +0 -126
  399. ads/feature_store/statistics/generic_feature_value.py +0 -33
  400. ads/feature_store/statistics/statistics.py +0 -41
  401. ads/feature_store/statistics_config.py +0 -101
  402. ads/feature_store/templates/feature_store_template.yaml +0 -45
  403. ads/feature_store/transformation.py +0 -499
  404. ads/feature_store/validation_output.py +0 -57
  405. ads/hpo/__init__.py +0 -9
  406. ads/hpo/_imports.py +0 -91
  407. ads/hpo/ads_search_space.py +0 -439
  408. ads/hpo/distributions.py +0 -325
  409. ads/hpo/objective.py +0 -280
  410. ads/hpo/search_cv.py +0 -1657
  411. ads/hpo/stopping_criterion.py +0 -75
  412. ads/hpo/tuner_artifact.py +0 -413
  413. ads/hpo/utils.py +0 -91
  414. ads/hpo/validation.py +0 -140
  415. ads/hpo/visualization/__init__.py +0 -5
  416. ads/hpo/visualization/_contour.py +0 -23
  417. ads/hpo/visualization/_edf.py +0 -20
  418. ads/hpo/visualization/_intermediate_values.py +0 -21
  419. ads/hpo/visualization/_optimization_history.py +0 -25
  420. ads/hpo/visualization/_parallel_coordinate.py +0 -169
  421. ads/hpo/visualization/_param_importances.py +0 -26
  422. ads/jobs/__init__.py +0 -53
  423. ads/jobs/ads_job.py +0 -663
  424. ads/jobs/builders/__init__.py +0 -5
  425. ads/jobs/builders/base.py +0 -156
  426. ads/jobs/builders/infrastructure/__init__.py +0 -6
  427. ads/jobs/builders/infrastructure/base.py +0 -165
  428. ads/jobs/builders/infrastructure/dataflow.py +0 -1252
  429. ads/jobs/builders/infrastructure/dsc_job.py +0 -1894
  430. ads/jobs/builders/infrastructure/dsc_job_runtime.py +0 -1233
  431. ads/jobs/builders/infrastructure/utils.py +0 -65
  432. ads/jobs/builders/runtimes/__init__.py +0 -5
  433. ads/jobs/builders/runtimes/artifact.py +0 -338
  434. ads/jobs/builders/runtimes/base.py +0 -325
  435. ads/jobs/builders/runtimes/container_runtime.py +0 -242
  436. ads/jobs/builders/runtimes/python_runtime.py +0 -1016
  437. ads/jobs/builders/runtimes/pytorch_runtime.py +0 -204
  438. ads/jobs/cli.py +0 -104
  439. ads/jobs/env_var_parser.py +0 -131
  440. ads/jobs/extension.py +0 -160
  441. ads/jobs/schema/__init__.py +0 -5
  442. ads/jobs/schema/infrastructure_schema.json +0 -116
  443. ads/jobs/schema/job_schema.json +0 -42
  444. ads/jobs/schema/runtime_schema.json +0 -183
  445. ads/jobs/schema/validator.py +0 -141
  446. ads/jobs/serializer.py +0 -296
  447. ads/jobs/templates/__init__.py +0 -5
  448. ads/jobs/templates/container.py +0 -6
  449. ads/jobs/templates/driver_notebook.py +0 -177
  450. ads/jobs/templates/driver_oci.py +0 -500
  451. ads/jobs/templates/driver_python.py +0 -48
  452. ads/jobs/templates/driver_pytorch.py +0 -852
  453. ads/jobs/templates/driver_utils.py +0 -615
  454. ads/jobs/templates/hostname_from_env.c +0 -55
  455. ads/jobs/templates/oci_metrics.py +0 -181
  456. ads/jobs/utils.py +0 -104
  457. ads/llm/__init__.py +0 -28
  458. ads/llm/autogen/__init__.py +0 -2
  459. ads/llm/autogen/constants.py +0 -15
  460. ads/llm/autogen/reports/__init__.py +0 -2
  461. ads/llm/autogen/reports/base.py +0 -67
  462. ads/llm/autogen/reports/data.py +0 -103
  463. ads/llm/autogen/reports/session.py +0 -526
  464. ads/llm/autogen/reports/templates/chat_box.html +0 -13
  465. ads/llm/autogen/reports/templates/chat_box_lt.html +0 -5
  466. ads/llm/autogen/reports/templates/chat_box_rt.html +0 -6
  467. ads/llm/autogen/reports/utils.py +0 -56
  468. ads/llm/autogen/v02/__init__.py +0 -4
  469. ads/llm/autogen/v02/client.py +0 -295
  470. ads/llm/autogen/v02/log_handlers/__init__.py +0 -2
  471. ads/llm/autogen/v02/log_handlers/oci_file_handler.py +0 -83
  472. ads/llm/autogen/v02/loggers/__init__.py +0 -6
  473. ads/llm/autogen/v02/loggers/metric_logger.py +0 -320
  474. ads/llm/autogen/v02/loggers/session_logger.py +0 -580
  475. ads/llm/autogen/v02/loggers/utils.py +0 -86
  476. ads/llm/autogen/v02/runtime_logging.py +0 -163
  477. ads/llm/chain.py +0 -268
  478. ads/llm/chat_template.py +0 -31
  479. ads/llm/deploy.py +0 -63
  480. ads/llm/guardrails/__init__.py +0 -5
  481. ads/llm/guardrails/base.py +0 -442
  482. ads/llm/guardrails/huggingface.py +0 -44
  483. ads/llm/langchain/__init__.py +0 -5
  484. ads/llm/langchain/plugins/__init__.py +0 -5
  485. ads/llm/langchain/plugins/chat_models/__init__.py +0 -5
  486. ads/llm/langchain/plugins/chat_models/oci_data_science.py +0 -1027
  487. ads/llm/langchain/plugins/embeddings/__init__.py +0 -4
  488. ads/llm/langchain/plugins/embeddings/oci_data_science_model_deployment_endpoint.py +0 -184
  489. ads/llm/langchain/plugins/llms/__init__.py +0 -5
  490. ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py +0 -979
  491. ads/llm/requirements.txt +0 -3
  492. ads/llm/serialize.py +0 -219
  493. ads/llm/serializers/__init__.py +0 -0
  494. ads/llm/serializers/retrieval_qa.py +0 -153
  495. ads/llm/serializers/runnable_parallel.py +0 -27
  496. ads/llm/templates/score_chain.jinja2 +0 -155
  497. ads/llm/templates/tool_chat_template_hermes.jinja +0 -130
  498. ads/llm/templates/tool_chat_template_mistral_parallel.jinja +0 -94
  499. ads/model/__init__.py +0 -52
  500. ads/model/artifact.py +0 -573
  501. ads/model/artifact_downloader.py +0 -254
  502. ads/model/artifact_uploader.py +0 -267
  503. ads/model/base_properties.py +0 -238
  504. ads/model/common/.model-ignore +0 -66
  505. ads/model/common/__init__.py +0 -5
  506. ads/model/common/utils.py +0 -142
  507. ads/model/datascience_model.py +0 -2635
  508. ads/model/deployment/__init__.py +0 -20
  509. ads/model/deployment/common/__init__.py +0 -5
  510. ads/model/deployment/common/utils.py +0 -308
  511. ads/model/deployment/model_deployer.py +0 -466
  512. ads/model/deployment/model_deployment.py +0 -1846
  513. ads/model/deployment/model_deployment_infrastructure.py +0 -671
  514. ads/model/deployment/model_deployment_properties.py +0 -493
  515. ads/model/deployment/model_deployment_runtime.py +0 -838
  516. ads/model/extractor/__init__.py +0 -5
  517. ads/model/extractor/automl_extractor.py +0 -74
  518. ads/model/extractor/embedding_onnx_extractor.py +0 -80
  519. ads/model/extractor/huggingface_extractor.py +0 -88
  520. ads/model/extractor/keras_extractor.py +0 -84
  521. ads/model/extractor/lightgbm_extractor.py +0 -93
  522. ads/model/extractor/model_info_extractor.py +0 -114
  523. ads/model/extractor/model_info_extractor_factory.py +0 -105
  524. ads/model/extractor/pytorch_extractor.py +0 -87
  525. ads/model/extractor/sklearn_extractor.py +0 -112
  526. ads/model/extractor/spark_extractor.py +0 -89
  527. ads/model/extractor/tensorflow_extractor.py +0 -85
  528. ads/model/extractor/xgboost_extractor.py +0 -94
  529. ads/model/framework/__init__.py +0 -5
  530. ads/model/framework/automl_model.py +0 -178
  531. ads/model/framework/embedding_onnx_model.py +0 -438
  532. ads/model/framework/huggingface_model.py +0 -399
  533. ads/model/framework/lightgbm_model.py +0 -266
  534. ads/model/framework/pytorch_model.py +0 -266
  535. ads/model/framework/sklearn_model.py +0 -250
  536. ads/model/framework/spark_model.py +0 -326
  537. ads/model/framework/tensorflow_model.py +0 -254
  538. ads/model/framework/xgboost_model.py +0 -258
  539. ads/model/generic_model.py +0 -3518
  540. ads/model/model_artifact_boilerplate/README.md +0 -381
  541. ads/model/model_artifact_boilerplate/__init__.py +0 -5
  542. ads/model/model_artifact_boilerplate/artifact_introspection_test/__init__.py +0 -5
  543. ads/model/model_artifact_boilerplate/artifact_introspection_test/model_artifact_validate.py +0 -427
  544. ads/model/model_artifact_boilerplate/artifact_introspection_test/requirements.txt +0 -2
  545. ads/model/model_artifact_boilerplate/runtime.yaml +0 -7
  546. ads/model/model_artifact_boilerplate/score.py +0 -61
  547. ads/model/model_file_description_schema.json +0 -68
  548. ads/model/model_introspect.py +0 -331
  549. ads/model/model_metadata.py +0 -1810
  550. ads/model/model_metadata_mixin.py +0 -460
  551. ads/model/model_properties.py +0 -63
  552. ads/model/model_version_set.py +0 -739
  553. ads/model/runtime/__init__.py +0 -5
  554. ads/model/runtime/env_info.py +0 -306
  555. ads/model/runtime/model_deployment_details.py +0 -37
  556. ads/model/runtime/model_provenance_details.py +0 -58
  557. ads/model/runtime/runtime_info.py +0 -81
  558. ads/model/runtime/schemas/inference_env_info_schema.yaml +0 -16
  559. ads/model/runtime/schemas/model_provenance_schema.yaml +0 -36
  560. ads/model/runtime/schemas/training_env_info_schema.yaml +0 -16
  561. ads/model/runtime/utils.py +0 -201
  562. ads/model/serde/__init__.py +0 -5
  563. ads/model/serde/common.py +0 -40
  564. ads/model/serde/model_input.py +0 -547
  565. ads/model/serde/model_serializer.py +0 -1184
  566. ads/model/service/__init__.py +0 -5
  567. ads/model/service/oci_datascience_model.py +0 -1076
  568. ads/model/service/oci_datascience_model_deployment.py +0 -500
  569. ads/model/service/oci_datascience_model_version_set.py +0 -176
  570. ads/model/transformer/__init__.py +0 -5
  571. ads/model/transformer/onnx_transformer.py +0 -324
  572. ads/mysqldb/__init__.py +0 -5
  573. ads/mysqldb/mysql_db.py +0 -227
  574. ads/opctl/__init__.py +0 -18
  575. ads/opctl/anomaly_detection.py +0 -11
  576. ads/opctl/backend/__init__.py +0 -5
  577. ads/opctl/backend/ads_dataflow.py +0 -353
  578. ads/opctl/backend/ads_ml_job.py +0 -710
  579. ads/opctl/backend/ads_ml_pipeline.py +0 -164
  580. ads/opctl/backend/ads_model_deployment.py +0 -209
  581. ads/opctl/backend/base.py +0 -146
  582. ads/opctl/backend/local.py +0 -1053
  583. ads/opctl/backend/marketplace/__init__.py +0 -9
  584. ads/opctl/backend/marketplace/helm_helper.py +0 -173
  585. ads/opctl/backend/marketplace/local_marketplace.py +0 -271
  586. ads/opctl/backend/marketplace/marketplace_backend_runner.py +0 -71
  587. ads/opctl/backend/marketplace/marketplace_operator_interface.py +0 -44
  588. ads/opctl/backend/marketplace/marketplace_operator_runner.py +0 -24
  589. ads/opctl/backend/marketplace/marketplace_utils.py +0 -212
  590. ads/opctl/backend/marketplace/models/__init__.py +0 -5
  591. ads/opctl/backend/marketplace/models/bearer_token.py +0 -94
  592. ads/opctl/backend/marketplace/models/marketplace_type.py +0 -70
  593. ads/opctl/backend/marketplace/models/ocir_details.py +0 -56
  594. ads/opctl/backend/marketplace/prerequisite_checker.py +0 -238
  595. ads/opctl/cli.py +0 -707
  596. ads/opctl/cmds.py +0 -869
  597. ads/opctl/conda/__init__.py +0 -5
  598. ads/opctl/conda/cli.py +0 -193
  599. ads/opctl/conda/cmds.py +0 -749
  600. ads/opctl/conda/config.yaml +0 -34
  601. ads/opctl/conda/manifest_template.yaml +0 -13
  602. ads/opctl/conda/multipart_uploader.py +0 -188
  603. ads/opctl/conda/pack.py +0 -89
  604. ads/opctl/config/__init__.py +0 -5
  605. ads/opctl/config/base.py +0 -57
  606. ads/opctl/config/diagnostics/__init__.py +0 -5
  607. ads/opctl/config/diagnostics/distributed/default_requirements_config.yaml +0 -62
  608. ads/opctl/config/merger.py +0 -255
  609. ads/opctl/config/resolver.py +0 -297
  610. ads/opctl/config/utils.py +0 -79
  611. ads/opctl/config/validator.py +0 -17
  612. ads/opctl/config/versioner.py +0 -68
  613. ads/opctl/config/yaml_parsers/__init__.py +0 -7
  614. ads/opctl/config/yaml_parsers/base.py +0 -58
  615. ads/opctl/config/yaml_parsers/distributed/__init__.py +0 -7
  616. ads/opctl/config/yaml_parsers/distributed/yaml_parser.py +0 -201
  617. ads/opctl/constants.py +0 -66
  618. ads/opctl/decorator/__init__.py +0 -5
  619. ads/opctl/decorator/common.py +0 -129
  620. ads/opctl/diagnostics/__init__.py +0 -5
  621. ads/opctl/diagnostics/__main__.py +0 -25
  622. ads/opctl/diagnostics/check_distributed_job_requirements.py +0 -212
  623. ads/opctl/diagnostics/check_requirements.py +0 -144
  624. ads/opctl/diagnostics/requirement_exception.py +0 -9
  625. ads/opctl/distributed/README.md +0 -109
  626. ads/opctl/distributed/__init__.py +0 -5
  627. ads/opctl/distributed/certificates.py +0 -32
  628. ads/opctl/distributed/cli.py +0 -207
  629. ads/opctl/distributed/cmds.py +0 -731
  630. ads/opctl/distributed/common/__init__.py +0 -5
  631. ads/opctl/distributed/common/abstract_cluster_provider.py +0 -449
  632. ads/opctl/distributed/common/abstract_framework_spec_builder.py +0 -88
  633. ads/opctl/distributed/common/cluster_config_helper.py +0 -103
  634. ads/opctl/distributed/common/cluster_provider_factory.py +0 -21
  635. ads/opctl/distributed/common/cluster_runner.py +0 -54
  636. ads/opctl/distributed/common/framework_factory.py +0 -29
  637. ads/opctl/docker/Dockerfile.job +0 -103
  638. ads/opctl/docker/Dockerfile.job.arm +0 -107
  639. ads/opctl/docker/Dockerfile.job.gpu +0 -175
  640. ads/opctl/docker/base-env.yaml +0 -13
  641. ads/opctl/docker/cuda.repo +0 -6
  642. ads/opctl/docker/operator/.dockerignore +0 -0
  643. ads/opctl/docker/operator/Dockerfile +0 -41
  644. ads/opctl/docker/operator/Dockerfile.gpu +0 -85
  645. ads/opctl/docker/operator/cuda.repo +0 -6
  646. ads/opctl/docker/operator/environment.yaml +0 -8
  647. ads/opctl/forecast.py +0 -11
  648. ads/opctl/index.yaml +0 -3
  649. ads/opctl/model/__init__.py +0 -5
  650. ads/opctl/model/cli.py +0 -65
  651. ads/opctl/model/cmds.py +0 -73
  652. ads/opctl/operator/README.md +0 -4
  653. ads/opctl/operator/__init__.py +0 -31
  654. ads/opctl/operator/cli.py +0 -344
  655. ads/opctl/operator/cmd.py +0 -596
  656. ads/opctl/operator/common/__init__.py +0 -5
  657. ads/opctl/operator/common/backend_factory.py +0 -460
  658. ads/opctl/operator/common/const.py +0 -27
  659. ads/opctl/operator/common/data/synthetic.csv +0 -16001
  660. ads/opctl/operator/common/dictionary_merger.py +0 -148
  661. ads/opctl/operator/common/errors.py +0 -42
  662. ads/opctl/operator/common/operator_config.py +0 -99
  663. ads/opctl/operator/common/operator_loader.py +0 -811
  664. ads/opctl/operator/common/operator_schema.yaml +0 -130
  665. ads/opctl/operator/common/operator_yaml_generator.py +0 -152
  666. ads/opctl/operator/common/utils.py +0 -208
  667. ads/opctl/operator/lowcode/__init__.py +0 -5
  668. ads/opctl/operator/lowcode/anomaly/MLoperator +0 -16
  669. ads/opctl/operator/lowcode/anomaly/README.md +0 -207
  670. ads/opctl/operator/lowcode/anomaly/__init__.py +0 -5
  671. ads/opctl/operator/lowcode/anomaly/__main__.py +0 -103
  672. ads/opctl/operator/lowcode/anomaly/cmd.py +0 -35
  673. ads/opctl/operator/lowcode/anomaly/const.py +0 -167
  674. ads/opctl/operator/lowcode/anomaly/environment.yaml +0 -10
  675. ads/opctl/operator/lowcode/anomaly/model/__init__.py +0 -5
  676. ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py +0 -146
  677. ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py +0 -162
  678. ads/opctl/operator/lowcode/anomaly/model/automlx.py +0 -99
  679. ads/opctl/operator/lowcode/anomaly/model/autots.py +0 -115
  680. ads/opctl/operator/lowcode/anomaly/model/base_model.py +0 -404
  681. ads/opctl/operator/lowcode/anomaly/model/factory.py +0 -110
  682. ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +0 -78
  683. ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +0 -78
  684. ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +0 -120
  685. ads/opctl/operator/lowcode/anomaly/model/tods.py +0 -119
  686. ads/opctl/operator/lowcode/anomaly/operator_config.py +0 -127
  687. ads/opctl/operator/lowcode/anomaly/schema.yaml +0 -401
  688. ads/opctl/operator/lowcode/anomaly/utils.py +0 -88
  689. ads/opctl/operator/lowcode/common/__init__.py +0 -5
  690. ads/opctl/operator/lowcode/common/const.py +0 -10
  691. ads/opctl/operator/lowcode/common/data.py +0 -116
  692. ads/opctl/operator/lowcode/common/errors.py +0 -47
  693. ads/opctl/operator/lowcode/common/transformations.py +0 -296
  694. ads/opctl/operator/lowcode/common/utils.py +0 -293
  695. ads/opctl/operator/lowcode/feature_store_marketplace/MLoperator +0 -13
  696. ads/opctl/operator/lowcode/feature_store_marketplace/README.md +0 -30
  697. ads/opctl/operator/lowcode/feature_store_marketplace/__init__.py +0 -5
  698. ads/opctl/operator/lowcode/feature_store_marketplace/__main__.py +0 -116
  699. ads/opctl/operator/lowcode/feature_store_marketplace/cmd.py +0 -85
  700. ads/opctl/operator/lowcode/feature_store_marketplace/const.py +0 -15
  701. ads/opctl/operator/lowcode/feature_store_marketplace/environment.yaml +0 -0
  702. ads/opctl/operator/lowcode/feature_store_marketplace/models/__init__.py +0 -4
  703. ads/opctl/operator/lowcode/feature_store_marketplace/models/apigw_config.py +0 -32
  704. ads/opctl/operator/lowcode/feature_store_marketplace/models/db_config.py +0 -43
  705. ads/opctl/operator/lowcode/feature_store_marketplace/models/mysql_config.py +0 -120
  706. ads/opctl/operator/lowcode/feature_store_marketplace/models/serializable_yaml_model.py +0 -34
  707. ads/opctl/operator/lowcode/feature_store_marketplace/operator_utils.py +0 -386
  708. ads/opctl/operator/lowcode/feature_store_marketplace/schema.yaml +0 -160
  709. ads/opctl/operator/lowcode/forecast/MLoperator +0 -25
  710. ads/opctl/operator/lowcode/forecast/README.md +0 -209
  711. ads/opctl/operator/lowcode/forecast/__init__.py +0 -5
  712. ads/opctl/operator/lowcode/forecast/__main__.py +0 -89
  713. ads/opctl/operator/lowcode/forecast/cmd.py +0 -40
  714. ads/opctl/operator/lowcode/forecast/const.py +0 -92
  715. ads/opctl/operator/lowcode/forecast/environment.yaml +0 -20
  716. ads/opctl/operator/lowcode/forecast/errors.py +0 -26
  717. ads/opctl/operator/lowcode/forecast/model/__init__.py +0 -5
  718. ads/opctl/operator/lowcode/forecast/model/arima.py +0 -279
  719. ads/opctl/operator/lowcode/forecast/model/automlx.py +0 -542
  720. ads/opctl/operator/lowcode/forecast/model/autots.py +0 -312
  721. ads/opctl/operator/lowcode/forecast/model/base_model.py +0 -863
  722. ads/opctl/operator/lowcode/forecast/model/factory.py +0 -106
  723. ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +0 -492
  724. ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +0 -243
  725. ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +0 -486
  726. ads/opctl/operator/lowcode/forecast/model/prophet.py +0 -445
  727. ads/opctl/operator/lowcode/forecast/model_evaluator.py +0 -244
  728. ads/opctl/operator/lowcode/forecast/operator_config.py +0 -234
  729. ads/opctl/operator/lowcode/forecast/schema.yaml +0 -506
  730. ads/opctl/operator/lowcode/forecast/utils.py +0 -413
  731. ads/opctl/operator/lowcode/forecast/whatifserve/__init__.py +0 -7
  732. ads/opctl/operator/lowcode/forecast/whatifserve/deployment_manager.py +0 -285
  733. ads/opctl/operator/lowcode/forecast/whatifserve/score.py +0 -246
  734. ads/opctl/operator/lowcode/pii/MLoperator +0 -17
  735. ads/opctl/operator/lowcode/pii/README.md +0 -208
  736. ads/opctl/operator/lowcode/pii/__init__.py +0 -5
  737. ads/opctl/operator/lowcode/pii/__main__.py +0 -78
  738. ads/opctl/operator/lowcode/pii/cmd.py +0 -39
  739. ads/opctl/operator/lowcode/pii/constant.py +0 -84
  740. ads/opctl/operator/lowcode/pii/environment.yaml +0 -17
  741. ads/opctl/operator/lowcode/pii/errors.py +0 -27
  742. ads/opctl/operator/lowcode/pii/model/__init__.py +0 -5
  743. ads/opctl/operator/lowcode/pii/model/factory.py +0 -82
  744. ads/opctl/operator/lowcode/pii/model/guardrails.py +0 -167
  745. ads/opctl/operator/lowcode/pii/model/pii.py +0 -145
  746. ads/opctl/operator/lowcode/pii/model/processor/__init__.py +0 -34
  747. ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py +0 -34
  748. ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py +0 -35
  749. ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py +0 -225
  750. ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py +0 -73
  751. ads/opctl/operator/lowcode/pii/model/processor/remover.py +0 -26
  752. ads/opctl/operator/lowcode/pii/model/report.py +0 -487
  753. ads/opctl/operator/lowcode/pii/operator_config.py +0 -95
  754. ads/opctl/operator/lowcode/pii/schema.yaml +0 -108
  755. ads/opctl/operator/lowcode/pii/utils.py +0 -43
  756. ads/opctl/operator/lowcode/recommender/MLoperator +0 -16
  757. ads/opctl/operator/lowcode/recommender/README.md +0 -206
  758. ads/opctl/operator/lowcode/recommender/__init__.py +0 -5
  759. ads/opctl/operator/lowcode/recommender/__main__.py +0 -82
  760. ads/opctl/operator/lowcode/recommender/cmd.py +0 -33
  761. ads/opctl/operator/lowcode/recommender/constant.py +0 -30
  762. ads/opctl/operator/lowcode/recommender/environment.yaml +0 -11
  763. ads/opctl/operator/lowcode/recommender/model/base_model.py +0 -212
  764. ads/opctl/operator/lowcode/recommender/model/factory.py +0 -56
  765. ads/opctl/operator/lowcode/recommender/model/recommender_dataset.py +0 -25
  766. ads/opctl/operator/lowcode/recommender/model/svd.py +0 -106
  767. ads/opctl/operator/lowcode/recommender/operator_config.py +0 -81
  768. ads/opctl/operator/lowcode/recommender/schema.yaml +0 -265
  769. ads/opctl/operator/lowcode/recommender/utils.py +0 -13
  770. ads/opctl/operator/runtime/__init__.py +0 -5
  771. ads/opctl/operator/runtime/const.py +0 -17
  772. ads/opctl/operator/runtime/container_runtime_schema.yaml +0 -50
  773. ads/opctl/operator/runtime/marketplace_runtime.py +0 -50
  774. ads/opctl/operator/runtime/python_marketplace_runtime_schema.yaml +0 -21
  775. ads/opctl/operator/runtime/python_runtime_schema.yaml +0 -21
  776. ads/opctl/operator/runtime/runtime.py +0 -115
  777. ads/opctl/schema.yaml.yml +0 -36
  778. ads/opctl/script.py +0 -40
  779. ads/opctl/spark/__init__.py +0 -5
  780. ads/opctl/spark/cli.py +0 -43
  781. ads/opctl/spark/cmds.py +0 -147
  782. ads/opctl/templates/diagnostic_report_template.jinja2 +0 -102
  783. ads/opctl/utils.py +0 -344
  784. ads/oracledb/__init__.py +0 -5
  785. ads/oracledb/oracle_db.py +0 -346
  786. ads/pipeline/__init__.py +0 -39
  787. ads/pipeline/ads_pipeline.py +0 -2279
  788. ads/pipeline/ads_pipeline_run.py +0 -772
  789. ads/pipeline/ads_pipeline_step.py +0 -605
  790. ads/pipeline/builders/__init__.py +0 -5
  791. ads/pipeline/builders/infrastructure/__init__.py +0 -5
  792. ads/pipeline/builders/infrastructure/custom_script.py +0 -32
  793. ads/pipeline/cli.py +0 -119
  794. ads/pipeline/extension.py +0 -291
  795. ads/pipeline/schema/__init__.py +0 -5
  796. ads/pipeline/schema/cs_step_schema.json +0 -35
  797. ads/pipeline/schema/ml_step_schema.json +0 -31
  798. ads/pipeline/schema/pipeline_schema.json +0 -71
  799. ads/pipeline/visualizer/__init__.py +0 -5
  800. ads/pipeline/visualizer/base.py +0 -570
  801. ads/pipeline/visualizer/graph_renderer.py +0 -272
  802. ads/pipeline/visualizer/text_renderer.py +0 -84
  803. ads/secrets/__init__.py +0 -11
  804. ads/secrets/adb.py +0 -386
  805. ads/secrets/auth_token.py +0 -86
  806. ads/secrets/big_data_service.py +0 -365
  807. ads/secrets/mysqldb.py +0 -149
  808. ads/secrets/oracledb.py +0 -160
  809. ads/secrets/secrets.py +0 -407
  810. ads/telemetry/__init__.py +0 -7
  811. ads/telemetry/base.py +0 -69
  812. ads/telemetry/client.py +0 -125
  813. ads/telemetry/telemetry.py +0 -257
  814. ads/templates/dataflow_pyspark.jinja2 +0 -13
  815. ads/templates/dataflow_sparksql.jinja2 +0 -22
  816. ads/templates/func.jinja2 +0 -20
  817. ads/templates/schemas/openapi.json +0 -1740
  818. ads/templates/score-pkl.jinja2 +0 -173
  819. ads/templates/score.jinja2 +0 -322
  820. ads/templates/score_embedding_onnx.jinja2 +0 -202
  821. ads/templates/score_generic.jinja2 +0 -165
  822. ads/templates/score_huggingface_pipeline.jinja2 +0 -217
  823. ads/templates/score_lightgbm.jinja2 +0 -185
  824. ads/templates/score_onnx.jinja2 +0 -407
  825. ads/templates/score_onnx_new.jinja2 +0 -473
  826. ads/templates/score_oracle_automl.jinja2 +0 -185
  827. ads/templates/score_pyspark.jinja2 +0 -154
  828. ads/templates/score_pytorch.jinja2 +0 -219
  829. ads/templates/score_scikit-learn.jinja2 +0 -184
  830. ads/templates/score_tensorflow.jinja2 +0 -184
  831. ads/templates/score_xgboost.jinja2 +0 -178
  832. ads/text_dataset/__init__.py +0 -5
  833. ads/text_dataset/backends.py +0 -211
  834. ads/text_dataset/dataset.py +0 -445
  835. ads/text_dataset/extractor.py +0 -207
  836. ads/text_dataset/options.py +0 -53
  837. ads/text_dataset/udfs.py +0 -22
  838. ads/text_dataset/utils.py +0 -49
  839. ads/type_discovery/__init__.py +0 -9
  840. ads/type_discovery/abstract_detector.py +0 -21
  841. ads/type_discovery/constant_detector.py +0 -41
  842. ads/type_discovery/continuous_detector.py +0 -54
  843. ads/type_discovery/credit_card_detector.py +0 -99
  844. ads/type_discovery/datetime_detector.py +0 -92
  845. ads/type_discovery/discrete_detector.py +0 -118
  846. ads/type_discovery/document_detector.py +0 -146
  847. ads/type_discovery/ip_detector.py +0 -68
  848. ads/type_discovery/latlon_detector.py +0 -90
  849. ads/type_discovery/phone_number_detector.py +0 -63
  850. ads/type_discovery/type_discovery_driver.py +0 -87
  851. ads/type_discovery/typed_feature.py +0 -594
  852. ads/type_discovery/unknown_detector.py +0 -41
  853. ads/type_discovery/zipcode_detector.py +0 -48
  854. ads/vault/__init__.py +0 -7
  855. ads/vault/vault.py +0 -237
  856. oracle_ads-2.13.7.dist-info/RECORD +0 -858
  857. {oracle_ads-2.13.7.dist-info → oracle_ads-2.13.9rc0.dist-info}/licenses/LICENSE.txt +0 -0
ads/dataset/helper.py DELETED
@@ -1,1450 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8; -*-
3
-
4
- # Copyright (c) 2020, 2023 Oracle and/or its affiliates.
5
- # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
6
-
7
- import ast
8
- import base64
9
- import html
10
- import io
11
- import math
12
- import os
13
- import warnings
14
- import re
15
- from collections import defaultdict
16
- import inspect
17
- import importlib
18
- from typing import Callable, List, Tuple, Union
19
- import fsspec
20
-
21
- # from pandas.io.common import _compression_to_extension
22
-
23
- from numbers import Number
24
- from urllib.parse import urlparse
25
-
26
- import numpy as np
27
- import pandas as pd
28
-
29
- from pandas.core.dtypes.common import (
30
- is_numeric_dtype,
31
- is_bool_dtype,
32
- is_categorical_dtype,
33
- is_datetime64_any_dtype,
34
- is_float_dtype,
35
- )
36
-
37
- from ads.common.decorator.runtime_dependency import (
38
- runtime_dependency,
39
- OptionalDependency,
40
- )
41
- from ads.common import utils
42
- from ads.dataset import logger
43
- from ads.type_discovery.type_discovery_driver import TypeDiscoveryDriver
44
- from ads.type_discovery.typed_feature import (
45
- ContinuousTypedFeature,
46
- DateTimeTypedFeature,
47
- CategoricalTypedFeature,
48
- GISTypedFeature,
49
- TypedFeature,
50
- UnknownTypedFeature,
51
- OrdinalTypedFeature,
52
- DocumentTypedFeature,
53
- )
54
-
55
-
56
class DatasetDefaults:
    """Default sampling parameters shared by the dataset helper functions."""

    # Confidence level (percent) used when deriving a sample size.
    sampling_confidence_level = 95
    # Margin of error (percent) used when deriving a sample size.
    sampling_confidence_interval = 1.0


# URL schemes treated as database connections rather than file paths.
_known_db_protocols = {"sqlite", "ADB", "oracle+cx_oracle"}
62
-
63
-
64
def concatenate(X, y):
    """Append the target column ``y`` to the feature set ``X``.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, or any object exposing ``assign``
        The feature data.
    y : pandas.Series
        The target column; its ``name`` becomes the new column label.

    Returns
    -------
    A new object combining ``X`` and ``y`` column-wise.
    """
    # Idiom fix: single isinstance call with a tuple of types.
    if isinstance(X, (pd.DataFrame, pd.Series)):
        return pd.concat([X, y], axis=1)
    # Non-pandas frames (e.g. dask) take the assign path.
    return X.assign(**{y.name: y})
69
-
70
-
71
def fix_column_names(X):
    """Normalize column labels in place: cast to str, strip whitespace,
    and replace interior spaces with underscores. Returns ``X`` for chaining."""
    cleaned = X.columns.astype("str").str.strip().str.replace(" ", "_")
    X.columns = cleaned
    return X
74
-
75
-
76
def convert_columns(df, feature_metadata=None, dtypes=None):
    """Cast the columns of ``df`` to new dtypes.

    When ``feature_metadata`` (a mapping of column name -> typed feature) is
    given, it takes precedence: a dtype map is derived from it via
    ``get_dtype`` and any ``dtypes`` argument is ignored. Columns whose
    feature type has no dtype mapping are left unchanged.
    """
    if feature_metadata is not None:
        dtypes = {}
        for feature in feature_metadata:
            mapped = get_dtype(feature_metadata[feature], df[feature].dtype)
            if mapped is not None:
                dtypes[feature] = mapped
    return df.astype(dtypes)
84
-
85
-
86
def get_dtype(feature_type, dtype):
    """Map an ADS typed feature to a pandas dtype string (or dtype object).

    Parameters
    ----------
    feature_type : TypedFeature
        The discovered feature type for a column.
    dtype : numpy/pandas dtype
        The column's current dtype.

    Returns
    -------
    The target dtype for ``df.astype``, or ``None`` when the feature type has
    no mapping (callers skip those columns).
    """
    # Idiom fix: isinstance with a tuple instead of chained isinstance calls.
    if isinstance(feature_type, (ContinuousTypedFeature, OrdinalTypedFeature)):
        # Numeric columns keep their concrete dtype; otherwise coerce to float.
        return dtype.name if is_numeric_dtype(dtype) else "float"
    if isinstance(feature_type, DateTimeTypedFeature):
        return "datetime64[ns]" if not dtype.name.startswith("datetime") else dtype
    if isinstance(feature_type, CategoricalTypedFeature):
        return "bool" if is_bool_dtype(dtype) else "category"
    # Explicit for readability: unmapped feature types yield None.
    return None
95
-
96
-
97
def get_feature_type(name, series):
    """Classify ``series`` into a typed-feature wrapper based on its dtype.

    Precedence: boolean/categorical -> categorical; numeric -> continuous
    (float) or ordinal (integer-like); datetime -> datetime; anything else
    falls through to the unknown type.
    """
    if is_bool_dtype(series) or is_categorical_dtype(series):
        return CategoricalTypedFeature.build(name, series)
    if is_numeric_dtype(series):
        builder = (
            ContinuousTypedFeature if is_float_dtype(series) else OrdinalTypedFeature
        )
        return builder.build(name, series)
    if is_datetime64_any_dtype(series):
        return DateTimeTypedFeature.build(name, series)
    return UnknownTypedFeature.build(name, series)
109
-
110
-
111
def convert_to_html(plot):
    """Render a matplotlib figure/pyplot module as an inline HTML ``<img>``.

    The plot is serialized to PNG in memory and embedded as a base64 data URI.

    Parameters
    ----------
    plot : object with a ``savefig(buffer, format=..., bbox_inches=...)`` method

    Returns
    -------
    str : an ``<img>`` tag followed by a horizontal rule and line break.
    """
    buffer = io.BytesIO()
    plot.savefig(buffer, format="png", bbox_inches="tight")
    buffer.seek(0)
    encoded = base64.b64encode(buffer.getvalue())
    # Fix: the width attribute was malformed ('<img width=95%"...') — the
    # opening quote was missing, producing invalid HTML.
    return '<img width="95%" src="data:image/png;base64, {}"><hr><br>'.format(
        encoded.decode("utf-8")
    )
119
-
120
-
121
def _num_partitions_for_dataframe(df):
    """Heuristic partition count for a pandas dataframe: one partition per CPU
    when there are more than 1000 rows per available CPU, otherwise a single
    partition."""
    cpu_count = utils.get_cpu_count()
    return cpu_count if df.shape[0] > 1000 * cpu_count else 1
124
-
125
-
126
class ElaboratedPath:
    """
    The Elaborated Path class unifies all of the operations and information
    related to a path or path list.

    An ElaboratedPath can accept any of the following as a valid source:
    * A single path
    * A glob pattern path
    * A directory
    * A list of paths (Note: all of these paths must be from the same
      filesystem AND have the same format)
    * A sqlalchemy connection url
    """

    def __init__(
        self,
        source: Union[str, List[str]],
        format: str = None,
        name: str = None,
        **kwargs,
    ):
        """Resolve ``source`` into concrete paths, a format key, and a name.

        Parameters
        ----------
        source : str or list of str
            A path, glob pattern, directory, list of paths, or DB url.
        format : str, optional
            Explicit format key (e.g. "csv"); inferred from paths when None
            or "infer".
        name : str, optional
            Display name; derived from the source when None.
        kwargs :
            May carry ``fs`` (a pre-built fsspec filesystem) and
            ``storage_options`` used for filesystem construction.

        Raises
        ------
        ValueError
            If the source or its protocol cannot be understood.
        FileNotFoundError
            If no files match the source.
        """
        self._kwargs = kwargs
        self._format = format
        self._name = name
        if isinstance(source, str):
            self._original_source = source
            self._determine_protocol_type()
            if self._type == "db":
                # A DB url is its own single "path"; nothing to glob.
                self._paths = [self._original_source]
            else:
                self._elaborate_path()
        elif isinstance(source, list) and all(isinstance(file, str) for file in source):
            assert len(source) > 0, "Error, the source you passed in was an empty list."
            self._original_source = source[0]
            self._paths = source
            self._type = "list"
        else:
            raise ValueError(f"Source argument not understood: {source}")
        if self.num_paths == 0:
            raise FileNotFoundError(
                f"Error: We could not find any files associated with the source: "
                f"{source}. Double check that this source is a valid glob pattern,"
                f" directory, or path."
            )
        self._determine_format()
        self._determine_name()

    @property
    def paths(self) -> List[str]:
        """All resolved paths; each element is a valid path string."""
        return self._paths

    @property
    def num_paths(self) -> int:
        """Number of paths resolved from the original glob, folder, or path."""
        return len(self._paths)

    @property
    def name(self) -> str:
        """Display name derived from (or supplied with) the source."""
        return self._name

    @property
    def format(self) -> str:
        """Lower-cased format key, e.g. "csv" or "parquet"."""
        return self._format

    def _determine_name(self):
        # Derive a human-readable name when the caller did not supply one.
        if self._name is None:
            if self._type == "list":
                self._name = (
                    f"DataFrame from [{os.path.basename(self._original_source)}, ...]"
                )
            elif self._type == "glob":
                self._name = f"DataFrame from {os.path.basename(self._original_source)}"
            else:
                self._name = f"DataFrame from {urlparse(self._original_source).scheme}"

    def _determine_format(self):
        """Infer the format key from the resolved paths.

        Samples up to five paths (compression extensions stripped) and
        requires them all to agree; raises ValueError otherwise. An explicit
        format is simply lower-cased.
        """
        if self._format in [None, "infer"]:
            format_keys = []
            for i in range(min(self.num_paths, 5)):
                format_keys.append(self._remove_compressions(self.paths[i]))
            if len(format_keys) == 0:
                raise ValueError(
                    f"Could not determine the format key for source: {self._original_source}"
                )
            if format_keys.count(format_keys[0]) != len(format_keys):
                raise ValueError(
                    f"Got multiple formats from the source: {self._original_source}. Run again "
                    f'using the format parameter. Ex: format=<your format key, like: "csv", "hdf", etc.>'
                )
            self._format = format_keys[0]
        else:
            self._format = self._format.lower()

    def _elaborate_path(self):
        # Expand the glob against the resolved filesystem and re-attach the
        # protocol prefix that fsspec strips from the results.
        self._paths = self._fs.glob(self._original_source)
        if self._protocol != "":
            self._paths = [f"{self._protocol}://{p}" for p in self._paths]

    def _determine_protocol_type(self):
        """Classify the string source as a filesystem glob or a DB url."""
        self._protocol = urlparse(self._original_source).scheme

        if self._kwargs.get("fs") is not None:
            self._fs = self._kwargs.pop("fs")
            self._type = "glob"
        elif self._original_source.startswith("oracle+cx_oracle://"):
            self._protocol = "oracle+cx_oracle"
            self._type = "db"
        else:
            try:
                self._fs = fsspec.filesystem(
                    self._protocol, **self._kwargs.get("storage_options", {})
                )
                self._type = "glob"
            except ValueError:
                try:
                    self.engine = utils.get_sqlalchemy_engine(
                        self._original_source, **self._kwargs
                    )
                    self._type = "db"
                except Exception:
                    # Fix: was a bare ``except:``, which also swallowed
                    # SystemExit/KeyboardInterrupt.
                    if self._protocol in _known_db_protocols:
                        self._type = "db"
                    else:
                        raise ValueError(
                            f"Error in trying to understand the protocol for source: "
                            f"{self._original_source}. The protocol found: {self._protocol} is not "
                            f"registered with fsspec or sqlalchemy"
                        )

    @staticmethod
    def _remove_compressions(filename: str):
        """Return the format extension of ``filename`` with any (possibly
        nested) compression extensions stripped; None when no extension
        remains."""
        _compression_to_extension = [
            ".gz",
            ".bz2",
            ".zip",
            ".xz",
            ".zst",
            ".tar",
            ".tar.gz",
            ".tar.xz",
            ".tar.bz2",
        ]
        for compression in _compression_to_extension:
            if filename.strip().endswith(compression):
                # Recurse to peel nested extensions like ".tar.gz".
                return ElaboratedPath._remove_compressions(
                    os.path.splitext(filename.rstrip("/*"))[0]
                )
        format = os.path.splitext(filename.rstrip("/*"))[1][1:].lower()
        return format.lower() if format != "" else None
303
-
304
-
305
class DatasetLoadException(BaseException):
    """Raised when a dataset cannot be loaded.

    # NOTE(review): subclasses BaseException rather than Exception, so a
    # generic ``except Exception`` will NOT catch it — preserved as-is since
    # callers may rely on that.
    """

    def __init__(self, exc_msg):
        # The message shown by str(); stored on the instance for callers.
        self.exc_msg = exc_msg

    def __str__(self):
        return self.exc_msg
311
-
312
-
313
def _get_dtype_from_error(e):
    """Extract a dtype-override dict from a dask type-inference error.

    Returns the dict to retry with, or None when the error is unrelated or no
    dtype information could be recovered.
    """
    error_string = str(e)

    if "mismatched dtypes" not in error_string.lower():
        return None

    # Depending on where inference fails, dask reports the dtypes either as a
    # literal ``dtype={...}`` argument in the message or as a
    # Column/Found/Expected table; handle both.
    dict_matches = re.findall(r"dtype=({[^{}]+})", error_string, re.MULTILINE)
    if dict_matches:
        found_dtype_dict = ast.literal_eval(dict_matches[0])
    else:
        found_dtype_dict = _find_dtypes_from_table(error_string)

    if not found_dtype_dict:
        return None

    logger.warning(
        "Dask type-inference/coercion failed. Retrying with "
        f"dtype={found_dtype_dict}.",
        exc_info=True,
    )
    return found_dtype_dict
335
-
336
-
337
def _find_dtypes_from_table(error_string):
    """Parse a '| Column | Found | Expected |' table out of a dask error.

    Returns a dict mapping each column name to its *found* dtype; empty when
    no table is present.
    """
    # Matches the table header row '| Column | Found | Expected |'.
    header_pattern = re.compile(
        "\\s*\\|\\s*Column\\s*\\|\\s*Found\\s*\\|\\s*Expected\\s*\\|\\s*"
    )
    # Extracts column name and found dtype from '| <name> | <found> | ... |'.
    row_pattern = re.compile("\\s*\\|([^\\|]+)\\|([^\\|]+)\\|.*")

    error_lines = error_string.splitlines()
    dtypes = {}
    for idx, line in enumerate(error_lines):
        if not re.match(header_pattern, line):
            continue
        # Skip the separator row directly below the header, then consume data
        # rows until a line no longer matches.
        for row in error_lines[idx + 2 :]:
            match_groups = re.match(row_pattern, row)
            if match_groups is None:
                break
            dtypes[match_groups.group(1).strip()] = match_groups.group(2).strip()
    return dtypes
354
-
355
-
356
def rename_duplicate_cols(original_cols):
    """Return a copy of ``original_cols`` with repeated labels disambiguated.

    The first occurrence keeps its name; the k-th duplicate becomes
    ``"<name>.<k>"`` (1-based), mirroring pandas' mangle-dupe convention.

    Fix: a previous revision called ``col.replace(" ", "_")`` here and
    discarded the result (str is immutable, so it was a no-op) — the dead
    statement was removed; spaces in names are intentionally left untouched
    to preserve behavior.
    """
    seen_col_names = defaultdict(int)
    new_cols = []
    for col in original_cols:
        if col not in seen_col_names:
            new_cols.append(col)
        else:
            dup_count = seen_col_names[col]
            new_cols.append(f"{col}.{dup_count}")
        seen_col_names[col] += 1
    assert len(new_cols) == len(
        original_cols
    ), "There has been an error in re-naming duplicate columns"
    return new_cols
373
-
374
-
375
def write_parquet(
    path,
    data,
    engine="fastparquet",
    metadata_dict=None,
    compression=None,
    storage_options=None,
):
    """Write a pandas dataframe to parquet.

    Parameters
    ----------
    path : str
        Path to write to.
    data : pandas.DataFrame
        The frame to serialize.
    engine : str
        Parquet engine, "fastparquet" by default.
    metadata_dict : deprecated
        Ignored; kept for backward compatibility (emits a DeprecationWarning).
    compression : {'snappy', 'gzip', 'brotli', None}
        Name of the compression to use.
    storage_options : dict, optional
        Storage arguments required to write to the path.

    Returns
    -------
    str : the file path the parquet was written to
    """
    assert isinstance(data, pd.DataFrame)

    if metadata_dict is not None:
        warnings.warn(
            "The `metadata_dict` argument is deprecated and has no effect on this method.",
            DeprecationWarning,
            stacklevel=2,
        )

    data.to_parquet(
        path, engine=engine, compression=compression, storage_options=storage_options
    )
    return path
417
-
418
-
419
def is_text_data(df, target=None):
    """Heuristically decide whether ``df`` holds a text (document) dataset.

    Only one- and two-column frames are considered; with two columns and a
    known ``target``, the feature is the other column. Returns True when the
    discovered feature type is a document feature.
    """
    columns = df.columns.values
    if len(columns) == 2:
        feature_name = (
            list(set(columns) ^ set([target]))[0]
            if target
            else list(set(columns))[0]
        )
    elif len(columns) == 1:
        # Fix: was ``len(df.columns.values == 1)`` — the comparison produced a
        # boolean array whose *length* is the column count, so the branch fired
        # for any non-empty frame instead of only single-column frames.
        feature_name = columns[0]
    else:
        return False
    return isinstance(
        TypeDiscoveryDriver().discover(feature_name, df[feature_name]),
        DocumentTypedFeature,
    )
434
-
435
-
436
def generate_sample(
    df: pd.DataFrame,
    n: int,
    confidence_level: int = DatasetDefaults.sampling_confidence_level,
    confidence_interval: float = DatasetDefaults.sampling_confidence_interval,
    **kwargs,
):
    """Return a (possibly) down-sampled frame for graphing.

    With no ``sample_max_rows`` kwarg the frame is returned untouched. A
    negative ``sample_max_rows`` requests a statistically derived sample size;
    otherwise the requested size is clamped between a minimum sample size and
    the full population ``n``.
    """
    min_size_to_sample = min(n, 10000)
    sample_size = None

    if "sample_max_rows" in kwargs:
        requested_sample_size = int(kwargs["sample_max_rows"])
        if requested_sample_size < 0:
            # Negative request: derive the size from the confidence settings.
            sample_size = calculate_sample_size(
                n, min_size_to_sample, confidence_level, confidence_interval
            )
        elif min_size_to_sample < requested_sample_size < n:
            logger.info(
                f"Downsampling from {n} rows, to the user specified {requested_sample_size} rows for graphing."
            )
            sample_size = requested_sample_size
        elif requested_sample_size >= n:
            logger.info(f"Using the entire dataset of {n} rows for graphing.")
            sample_size = n
        else:
            sample_size = min_size_to_sample
            logger.info(
                f"Downsampling from {n} rows, to {sample_size} rows for graphing."
            )

    if not sample_size or len(df) <= sample_size:
        return df

    # Sample slightly more than needed (5% head-room), then trim to size.
    frac = min(1.0, sample_size * 1.05 / n)
    df = df.sample(frac=frac, random_state=42)
    return df.head(sample_size) if len(df) > sample_size else df
475
-
476
-
477
def calculate_sample_size(
    population_size, min_size_to_sample, confidence_level=95, confidence_interval=1.0
):
    """Find a sample size for a population using Cochran's Sample Size Formula.

    ``confidence_level`` is a percentage (default 95%) and
    ``confidence_interval`` the margin of error in percent (default 1%).

    SUPPORTED CONFIDENCE LEVELS: 50%, 68%, 90%, 95%, and 99% *ONLY* — the
    Z-score lookup is table based and only covers common confidence levels.

    Returns None when the population is already below the minimum sample size.
    """
    if population_size < min_size_to_sample:
        return None

    # Z-score lookup table for the supported confidence levels.
    confidence_level_constant = {
        50: 0.67,
        68: 0.99,
        90: 1.64,
        95: 1.96,
        99: 2.57,
        99.5: 2.807,
        99.9: 3.291,
    }
    # NOTE(review): unsupported levels fall back to Z=99, which produces an
    # enormous n_0 — preserved as-is to keep caller-visible behavior.
    z_score = confidence_level_constant.get(confidence_level, 99)

    proportion = 0.5
    margin = confidence_interval / 100.0

    # Cochran's formula, then the finite-population correction.
    n_0 = ((z_score**2) * proportion * (1 - proportion)) / (margin**2)
    corrected = n_0 / (1 + ((n_0 - 1) / float(population_size)))

    sample_size = max(int(math.ceil(corrected)), min_size_to_sample)

    logger.info(f"Downsampling from {population_size} rows to {sample_size} rows.")

    return sample_size
518
-
519
-
520
def map_types(types):
    """Rewrite abstract feature-type names to concrete pandas dtype strings.

    Mutates ``types`` in place (unknown names are left untouched) and returns
    it for chaining.
    """
    dtype_for = {
        "continuous": "float64",
        "ordinal": "int64",
        "categorical": "category",
        "datetime": "datetime64[ns]",
    }
    for column in types:
        types[column] = dtype_for.get(types[column], types[column])
    return types
531
-
532
-
533
@runtime_dependency(module="IPython", install_from=OptionalDependency.NOTEBOOK)
@runtime_dependency(module="graphviz", install_from=OptionalDependency.VIZ)
def visualize_transformation(transformer_pipeline, text=None):
    """Render a transformer pipeline as an SVG flow graph in the notebook.

    Each pipeline step becomes a node; ADS-internal transformers are drawn
    highlighted with their repr, other steps as plain named nodes. Edges are
    labeled with the transformer (or wrapped function) name. The graph is
    displayed inline via IPython; nothing is returned.

    Parameters
    ----------
    transformer_pipeline :
        Object with a scikit-learn style ``steps`` list of (name, transformer).
    text : str, optional
        If given, shown as an extra "partitions" annotation node.
    """
    dot = graphviz.Digraph()

    # show a single node for partitions
    dot.attr(
        "node",
        shape="tab",
        style="filled",
        fontname="courier",
        fontsize="12",
        fontcolor="white",
        resolution="144",
    )
    if text:
        dot.node("partitions", text, margin="0.25", fillcolor="dimgray")

    # Switch node styling for the pipeline steps themselves.
    dot.attr(
        "node",
        shape="component",
        style="filled",
        fontname="courier",
        fontsize="10",
        fontcolor="black",
        resolution="144",
    )
    for step in transformer_pipeline.steps:
        # NOTE: clazz/is_ads are unpacked but unused below; kept as-is.
        name, clazz, clazzname, is_ads = (
            step[0],
            step[1],
            step[1].__class__.__name__,
            "ads" in str(step[1].__class__),
        )
        # ADS-internal transformers get a highlighted node showing their repr.
        ads_node = str(step[1].__class__.__name__) in [
            "AutoMLPreprocessingTransformer",
            "DataFrameTransformer",
            "RecommendationTransformer",
            "AutoMLFeatureSelection",
            "FeatureEngineeringTransformer",
        ]
        if ads_node:
            # Escape the repr and convert newlines for graphviz HTML-like labels.
            text2html = "< {} >".format(
                html.escape(step[1].__repr__()).replace("\n", "<br/>")
            )
            dot.node(name, text2html, margin="0.25", fillcolor="gold2")
        else:
            dot.node(name, name.rsplit("/")[0], fillcolor="azure")

    def format_label(stage):
        # Edge label: the wrapped function's name for FunctionTransformer
        # steps, otherwise the transformer's class name.
        if "FunctionTransformer" in str(transformer_pipeline.steps[stage][1].__class__):
            return "<<font face='courier' point-size='10'>&nbsp;<b>{}</b>&nbsp;</font>>".format(
                html.escape(transformer_pipeline.steps[stage][1].func.__name__)
            )
        else:
            is_ads = "ads" in str(transformer_pipeline.steps[stage][1].__class__)
            return "<<font face='courier' point-size='10'>&nbsp;<b>{}</b>&nbsp;</font>>".format(
                transformer_pipeline.steps[stage][1].__class__.__name__
            )

    # Connect consecutive steps.
    edges = [x[0] for x in transformer_pipeline.steps]
    for i, edge in enumerate(list(zip(edges[:-1], edges[1:]))):
        dot.edge(*edge, len="1.00", label=format_label(i))

    # terminus node
    dot.node("terminus", "", shape="terminator", fillcolor="white")
    dot.edge(edges[-1], "terminus", len="1.00", label=format_label(len(edges) - 1))

    graph = graphviz.Source(dot)

    from IPython.core.display import display, SVG

    display(SVG(graph.pipe(format="svg")))
606
-
607
-
608
def up_sample(df, target, sampler="default", feature_types=None):
    """
    Fixes imbalanced dataset by up-sampling

    Parameters
    ----------
    df : Union[pandas.DataFrame, dask.dataframe.core.DataFrame]
    target : name of the target column in df
    sampler: Should implement fit_resample(X,y) method
    fillna: a dictionary contains the column name as well as the fill value,
        only needed when the column has missing values

    Returns
    -------
    upsampled_df : Union[pandas.DataFrame, dask.dataframe.core.DataFrame]
    """
    # Validate a user-supplied sampler up front: it must expose an
    # imblearn-style `fit_resample` accepting exactly the two positional
    # arguments X and y.
    if sampler != "default":
        if inspect.getattr_static(sampler, "fit_resample", None) is None:
            raise AttributeError("`sampler` object must has method `fit_resample`.")
        else:
            # exactly two input args X, y will be passed to fit_resample()
            # check signature of fit_sample
            num_no_default_params = 0
            sig = inspect.signature(sampler.fit_resample)
            for param in sig.parameters.values():
                if param.default is param.empty:
                    num_no_default_params += 1
            if len(sig.parameters) < 2 or num_no_default_params > 2:
                raise RuntimeError(
                    "The signature for `sampler.fit_resample` has to be `fit_resample(X, y)`."
                )

    X = df.drop(target, axis=1)
    y = df[target]

    feature_types = feature_types if feature_types is not None else {}

    # Samplers generally cannot handle NaNs, so impute affected columns from
    # the recorded feature stats, preferring mode, then mean, then median.
    columns_with_nans = X.columns.values[X.isna().any()]
    if len(columns_with_nans) > 0:
        fill_nan_dict = {}
        for column in columns_with_nans:
            if column in feature_types and "mode" in feature_types[column]["stats"]:
                fill_nan_dict[column] = feature_types[column]["stats"]["mode"]
            elif column in feature_types and "mean" in feature_types[column]["stats"]:
                fill_nan_dict[column] = feature_types[column]["stats"]["mean"]
            elif column in feature_types and "median" in feature_types[column]["stats"]:
                fill_nan_dict[column] = feature_types[column]["stats"]["median"]
            else:
                # No usable statistic for this column — warn and leave NaNs.
                logger.warning(
                    "Sampling from a column that has missing values may cause an error."
                )
        X = X.fillna(fill_nan_dict)

    if sampler == "default":
        # The default path needs the optional `imblearn` package.
        imblearn_found = importlib.util.find_spec("imblearn") is not None
        if not imblearn_found:
            raise ModuleNotFoundError(
                """
            Required package for up-sampling `imblearn` not found.
            Install `imblearn` with `pip install imbalanced-learn`
            and rerun to enable up-sampling.
            """
            )
        else:
            sampler = _get_imblearn_sampler(X, y)
    return _sample(sampler, X, y)
674
-
675
-
676
def _get_imblearn_sampler(X, y):
    """Pick a default imblearn over-sampler for (X, y).

    Returns RandomOverSampler when X has categorical/object/datetime columns
    (SMOTE cannot interpolate those) or when some class has only one sample;
    otherwise returns SMOTE with k_neighbors shrunk to fit the rarest class.
    """
    from imblearn.over_sampling import SMOTE, RandomOverSampler

    # Column positions of features SMOTE cannot interpolate.
    categorical_feature_indices = [
        X.columns.get_loc(c)
        for c in X.select_dtypes(
            include=["category", "object", "datetime64"]
        ).columns.values
    ]

    if len(categorical_feature_indices) > 0:
        logger.info(
            """
            Using the default `RandomOverSampler` sampler. Use `sample` to specify a sampler.
            Classes will be equalized.
            You can also pass in other samplers such as `imblearn.SMOTENC` instead, e.g.
            sampler = SMOTENC(categorical_features=categorical_feature_indices)
            ds.up_sample(sampler=sampler)
            """
        )
        return RandomOverSampler(random_state=42)

    # Size of the rarest class bounds SMOTE's k_neighbors (needs k < class size).
    min_sample_size = y.value_counts().min()

    k_neighbors = min(min_sample_size - 1, 5)
    if k_neighbors == 0:
        # A class with a single sample has no neighbors to interpolate from.
        logger.warning(
            f"""k_neighbors is 0 as in the target there exists a class label that appeared only once.
            SMOTE will fail. Default to RandomOverSampler.
            """
        )
        return RandomOverSampler(random_state=42)
    else:
        if 5 > k_neighbors > 0:
            logger.info(
                f"`k_neighbors()` of SMOTE has changed to {k_neighbors}"
                " as the target has at least one class which appeared "
                f"only {min_sample_size} times in the data. "
            )
        logger.info("Using SMOTE for over sampling. Classes will be equalized.")
        return SMOTE(random_state=42, k_neighbors=k_neighbors)
717
-
718
-
719
def down_sample(df, target):
    """
    Fixes imbalanced dataset by down-sampling

    Parameters
    ----------
    df : pandas.DataFrame
    target : name of the target column in df

    Returns
    -------
    downsampled_df : pandas.DataFrame
    """
    counts = df[target].value_counts()
    # The rarest class sets the per-class budget; ties keep the first label.
    minority_label, minority_count = min(counts.items(), key=lambda kv: kv[1])
    pieces = []
    for label, count in counts.items():
        if label == minority_label:
            continue
        # Randomly keep only enough rows of this class to match the minority.
        keep_frac = 1 - ((count - minority_count) / count)
        pieces.append(df[df[target] == label].sample(frac=keep_frac))
    # Minority class is kept in full, appended last.
    pieces.append(df[df[target] == minority_label])
    return pd.concat(pieces)
742
-
743
-
744
def _sample(sampler, X, y):
    """Run ``sampler.fit_resample`` and return a rebalanced DataFrame.

    Boolean targets are resampled as ints (SMOTE cannot interpolate bools);
    the resampled pieces are coerced back to pandas objects with the original
    feature dtypes before being concatenated into one frame.
    """
    is_bool_target = isinstance(y, pd.Series) and isinstance(y[0], (bool, np.bool_))
    y_input = y.astype(int) if is_bool_target else y
    X_resampled, y_resampled = sampler.fit_resample(X=X, y=y_input)

    # Some samplers return ndarrays; restore pandas containers and names.
    if not isinstance(X_resampled, pd.DataFrame):
        X_resampled = pd.DataFrame(X_resampled, columns=X.columns.values)
    if not isinstance(y_resampled, pd.Series):
        y_resampled = pd.DataFrame(y_resampled, columns=[y.name])[y.name]

    # Resampling may have widened dtypes; restore the originals column-wise.
    for column in X.dtypes.keys():
        X_resampled[column] = X_resampled[column].astype(X.dtypes[column].name)
    return concatenate(X_resampled, y_resampled)
762
-
763
-
764
def get_fill_val(feature_types, column, action, constant="constant"):
    """Resolve the replacement value for a "Fill missing values with ..." action.

    Parameters
    ----------
    feature_types : mapping
        Maps a column name to a typed-feature object whose
        ``meta_data["stats"]`` dict may hold "mean"/"median"/"mode".
    column : str
        Column whose fill value is requested.
    action : str
        One of "Fill missing values with mean" / "... median" /
        "... frequent" / "... constant"; only the last word is inspected.
    constant : Any
        Value returned when the action is "constant".

    Returns
    -------
    The fill value, rounded to 4 decimals when numeric, or ``None`` when the
    requested statistic is not available for the column.
    """
    # action can be one of the following
    # "Fill missing values with mean", "Fill missing values with median",
    # "Fill missing values with frequent", "Fill missing values with constant"
    action_ = action.split(" ")[-1]
    # The UI wording is "frequent" but the stats dict keys it as "mode".
    fill_type = "mode" if action_ == "frequent" else action_
    try:
        fill_val = (
            feature_types[column].meta_data["stats"][fill_type]
            if action_ != "constant"
            else constant
        )
        fill_val = round(fill_val, 4) if isinstance(fill_val, Number) else fill_val
    except (KeyError, AttributeError, TypeError):
        # BUG FIX: previously a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. A missing column, missing statistic,
        # or malformed feature object still degrades to None as before.
        fill_val = None
    return fill_val
780
-
781
-
782
def parse_apache_log_str(x):
    """
    Returns the string delimited by two characters.

    Source: https://mmas.github.io/read-apache-access-log-pandas
    Example:
        `>>> parse_str('[my string]')`
        `'my string'`
    """
    # Missing fields become NaN so pandas treats them as missing data.
    if x is None:
        return np.nan
    # Drop the single delimiter character from each end (e.g. [...] or "...").
    return x[1:-1]
794
-
795
-
796
def parse_apache_log_datetime(x):
    """
    Parses datetime with timezone formatted as:
    `[day/month/year:hour:minute:second zone]`

    Source: https://mmas.github.io/read-apache-access-log-pandas
    Example:
        `>>> parse_datetime('[13/Nov/2015:11:45:42 +0000]')`
        `datetime.datetime(2015, 11, 13, 11, 45, 42, tzinfo=<UTC>)`

    Due to problems parsing the timezone (`%z`) with `datetime.strptime`, the
    timezone will be obtained using the `pytz` library.
    """
    import pytz
    from datetime import datetime

    dt = datetime.strptime(x[1:-7], "%d/%b/%Y:%H:%M:%S")
    # BUG FIX: the offset used to be int(x[-6:-3]) * 60 + int(x[-3:-1]),
    # which applies the sign only to the hours — "-0530" yielded -270
    # minutes instead of -330. Apply the sign to hours AND minutes.
    sign = -1 if x[-6] == "-" else 1
    offset_minutes = sign * (int(x[-5:-3]) * 60 + int(x[-3:-1]))
    return dt.replace(tzinfo=pytz.FixedOffset(offset_minutes))
815
-
816
-
817
def deprecate_variable(old_var, new_var, warning_msg, warning_type):
    """Prefer a deprecated value when it was supplied, warning the caller.

    Returns ``old_var`` (after emitting ``warning_msg``) when it is not None,
    otherwise the replacement ``new_var``.
    """
    if old_var is None:
        return new_var
    warnings.warn(warning_msg, warning_type)
    return old_var
822
-
823
-
824
def deprecate_default_value(var, old_value, new_value, warning_msg, warning_type):
    """Swap a deprecated default for its replacement.

    When ``var`` equals the deprecated ``old_value``, emit ``warning_msg``
    and return ``new_value``; otherwise pass ``var`` through unchanged.
    """
    if var != old_value:
        return var
    warnings.warn(warning_msg, warning_type)
    return new_value
830
-
831
-
832
def _log_yscale_not_set():
    # Informational nudge emitted by plotting helpers when the caller did
    # not choose a y-axis scale.
    logger.info(
        "`yscale` parameter is not set. Valid values are `'linear'`, `'log'`, `'symlog'`."
    )
836
-
837
-
838
def infer_target_type(target, target_series, discover_target_type=True):
    """Determine the typed feature describing the target column.

    With type discovery enabled, the TypeDiscoveryDriver inspects the values
    themselves; otherwise the type is derived from the pandas dtype alone.
    """
    # if type discovery is turned off, infer type from pandas dtype
    if not discover_target_type:
        return get_feature_type(target, target_series)
    return TypeDiscoveryDriver().discover(target, target_series, is_target=True)
847
-
848
-
849
def get_target_type(target, sampled_df, **init_kwargs):
    """Resolve the target's typed feature, honoring user-declared types.

    A dtype declared for the target in ``init_kwargs["types"]`` is applied to
    the sampled frame and disables type discovery for that column.
    """
    declared_types = init_kwargs.get("types", {})
    discover = init_kwargs.get("type_discovery", True)
    if target in declared_types:
        sampled_df[target] = sampled_df[target].astype(declared_types[target])
        discover = False
    return infer_target_type(target, sampled_df[target], discover)
855
-
856
-
857
def get_dataset(
    df: pd.DataFrame,
    sampled_df: pd.DataFrame,
    target: str,
    target_type: TypedFeature,
    shape: Tuple[int, int],
    positive_class=None,
    **init_kwargs,
):
    """Instantiate the ADSDataset subclass matching the target's typed feature.

    Dispatch order: continuous -> regression; datetime target or datetime
    index -> forecasting; categorical/ordinal -> binary or multiclass
    classification (text variants when the data looks like text); document
    and GIS targets are rejected; a boolean low-level type falls back to
    binary classification.

    Raises
    ------
    ValueError
        When the target column is empty, is a document/GIS feature, or the
        problem type cannot be identified.
    """
    from ads.dataset.classification_dataset import (
        BinaryClassificationDataset,
        BinaryTextClassificationDataset,
        MultiClassClassificationDataset,
        MultiClassTextClassificationDataset,
    )
    from ads.dataset.forecasting_dataset import ForecastingDataset
    from ads.dataset.regression_dataset import RegressionDataset

    # An all-NaN target cannot define a learning task.
    if len(df[target].dropna()) == 0:
        logger.warning(
            "It is not recommended to use an empty column as the target variable."
        )
        raise ValueError(f"We do not support using empty columns as the chosen target")
    if utils.is_same_class(target_type, ContinuousTypedFeature):
        return RegressionDataset(
            df=df,
            sampled_df=sampled_df,
            target=target,
            target_type=target_type,
            shape=shape,
            **init_kwargs,
        )
    elif utils.is_same_class(
        target_type, DateTimeTypedFeature
    ) or df.index.dtype.name.startswith("datetime"):
        return ForecastingDataset(
            df=df,
            sampled_df=sampled_df,
            target=target,
            target_type=target_type,
            shape=shape,
            **init_kwargs,
        )

    # Adding ordinal typed feature, but ultimately we should rethink how we want to model this type
    elif utils.is_same_class(
        target_type, CategoricalTypedFeature
    ) or utils.is_same_class(target_type, OrdinalTypedFeature):
        # Exactly two unique values -> binary classification.
        if target_type.meta_data["internal"]["unique"] == 2:
            if is_text_data(sampled_df, target):
                return BinaryTextClassificationDataset(
                    df=df,
                    sampled_df=sampled_df,
                    target=target,
                    shape=shape,
                    target_type=target_type,
                    positive_class=positive_class,
                    **init_kwargs,
                )

            return BinaryClassificationDataset(
                df=df,
                sampled_df=sampled_df,
                target=target,
                shape=shape,
                target_type=target_type,
                positive_class=positive_class,
                **init_kwargs,
            )
        else:
            if is_text_data(sampled_df, target):
                return MultiClassTextClassificationDataset(
                    df=df,
                    sampled_df=sampled_df,
                    target=target,
                    target_type=target_type,
                    shape=shape,
                    **init_kwargs,
                )
            return MultiClassClassificationDataset(
                df=df,
                sampled_df=sampled_df,
                target=target,
                target_type=target_type,
                shape=shape,
                **init_kwargs,
            )
    elif (
        # BUG FIX: this check used to pass the target *name* (a str) to
        # is_same_class instead of the discovered target_type, so the
        # DocumentTypedFeature guard could never match on type.
        utils.is_same_class(target_type, DocumentTypedFeature)
        or "text" in target_type["type"]
        or "text" in target
    ):
        raise ValueError(f"The column {target} cannot be used as the target column.")
    elif (
        utils.is_same_class(target_type, GISTypedFeature)
        or "coord" in target_type["type"]
        or "coord" in target
    ):
        raise ValueError(f"The column {target} cannot be used as the target column.")
    # This is to catch constant columns that are boolean. Added as a fix for pd.isnull(), and datasets with a
    # binary target, but only data on one instance
    elif target_type["low_level_type"] == "bool":
        return BinaryClassificationDataset(
            df=df,
            sampled_df=sampled_df,
            target=target,
            shape=shape,
            target_type=target_type,
            positive_class=positive_class,
            **init_kwargs,
        )
    raise ValueError(
        f"Unable to identify problem type. Specify the data type of {target} using 'types'. "
        f"For example, types = {{{target}: 'category'}}"
    )
972
-
973
-
974
def open(
    source,
    target=None,
    format="infer",
    reader_fn: Callable = None,
    name: str = None,
    description="",
    npartitions: int = None,
    type_discovery=True,
    html_table_index=None,
    column_names="infer",
    sample_max_rows=10000,
    positive_class=None,
    transformer_pipeline=None,
    types=None,
    **kwargs,
):
    """
    Returns an object of ADSDataset or ADSDatasetWithTarget read from the given path

    .. deprecated:: 2.6.6
        "Deprecated in favor of using Pandas. Pandas supports reading from object storage directly.
        Check https://accelerated-data-science.readthedocs.io/en/latest/user_guide/loading_data/connect.html",

    Parameters
    ----------
    source: Union[str, pandas.DataFrame, h2o.DataFrame, pyspark.sql.dataframe.DataFrame]
        If str, URI for the dataset. The dataset could be read from local or network file system, hdfs, s3, gcs and optionally pyspark in pyspark
        conda env
    target: str, optional
        Name of the target in dataset.
        If set an ADSDatasetWithTarget object is returned, otherwise an ADSDataset object is returned which can be
        used to understand the dataset through visualizations
    format: str, default: infer
        Format of the dataset.
        Supported formats: CSV, TSV, Parquet, libsvm, JSON, XLS/XLSX (Excel), HDF5, SQL, XML,
        Apache server log files (clf, log), ARFF.
        By default, the format would be inferred from the ending of the dataset file path.
    reader_fn: Callable, default: None
        The user may pass in their own custom reader function.
        It must accept `(path, **kwarg)` and return a pandas DataFrame
    name: str, optional default: ""
    description: str, optional default: ""
        Text describing the dataset
    npartitions: int, deprecated
        Number of partitions to split the data
        By default this is set to the max number of cores supported by the backend compute accelerator
    type_discovery: bool, default: True
        If false, the data types of the dataframe are used as such.
        By default, the dataframe columns are associated with the best suited data types. Associating the features
        with the disovered datatypes would impact visualizations and model prediction.
    html_table_index: int, optional
        The index of the dataframe table in html content. This is used when the format of dataset is html
    column_names: 'infer', list of str or None, default: 'infer'
        Supported only for CSV and TSV.
        List of column names to use.
        By default, column names are inferred from the first line of the file.
        If set to None, column names would be auto-generated instead of inferring from file.
        If the file already contains a column header, specify header=0 to ignore the existing column names.
    sample_max_rows: int, default: 10000, use -1 auto calculate sample size, use 0 (zero) for no sampling
        Sample size of the dataframe to use for visualization and optimization.
    positive_class: Any, optional
        Label in target for binary classification problems which should be identified as positive for modeling.
        By default, the first unique value is considered as the positive label.
    types: dict, optional
        Dictionary of <feature_name> : <data_type> to override the data type of features.
    transformer_pipeline: datasets.pipeline.TransformerPipeline, optional
        A pipeline of transformations done outside the sdk and need to be applied at the time of scoring
    storage_options: dict, default: varies by source type
        Parameters passed on to the backend filesystem class.
    sep: str
        Delimiting character for parsing the input file.
    kwargs: additional keyword arguments that would be passed to underlying dataframe read API
        based on the format of the dataset

    Returns
    -------
    dataset : An instance of ADSDataset
    (or)
    dataset_with_target : An instance of ADSDatasetWithTarget
    """
    # BUG FIX: `types` used to default to a shared mutable dict ({}); use a
    # None sentinel and normalize to a fresh dict per call instead.
    types = {} if types is None else types

    if npartitions:
        warnings.warn(
            "Variable `npartitions` is deprecated and will not be used",
            DeprecationWarning,
            stacklevel=2,
        )
    # Default to resource-principal-free OCI auth config for oci:// URIs
    # unless the caller supplied storage options explicitly.
    if "storage_options" not in kwargs and isinstance(source, str) and source.startswith(
        "oci://"
    ):
        kwargs["storage_options"] = {"config": {}}

    if isinstance(source, (str, list)):
        progress = utils.get_progress_bar(4)
        progress.update("Opening data")
        path = ElaboratedPath(source, format=format, **kwargs)
        reader_fn = (
            get_format_reader(path=path, **kwargs) if reader_fn is None else reader_fn
        )
        df = load_dataset(path=path, reader_fn=reader_fn, **kwargs)
        name = path.name
    elif isinstance(source, pd.DataFrame):
        progress = utils.get_progress_bar(4)
        progress.update("Partitioning data")
        df = source
        name = "User Provided DataFrame" if name is None else name
    else:
        raise TypeError(
            f"The Source type: {type(source)} is not supported for DatasetFactory."
        )
    shape = df.shape
    return build_dataset(
        df=df,
        shape=shape,
        target=target,
        sample_max_rows=sample_max_rows,
        type_discovery=type_discovery,
        types=types,
        positive_class=positive_class,
        name=name,
        transformer_pipeline=transformer_pipeline,
        description=description,
        progress=progress,
        **utils.inject_and_copy_kwargs(
            kwargs,
            **{"html_table_index": html_table_index, "column_names": column_names},
        ),
    )
1105
-
1106
-
1107
def build_dataset(
    df: pd.DataFrame,
    shape: Tuple[int, int],
    target: str = None,
    progress=None,
    **kwargs,
):
    """Construct an ADSDataset (or target-typed subclass) from a DataFrame.

    A downsampled copy of ``df`` is generated first for visualization and
    type discovery. Without a ``target`` a plain ADSDataset is returned;
    otherwise the target's type is inferred (honoring ``kwargs["types"]``
    and ``kwargs["type_discovery"]``) and `get_dataset` dispatches to the
    matching dataset class.

    Parameters
    ----------
    df : pd.DataFrame
        The full dataset.
    shape : Tuple[int, int]
        Shape of ``df`` (rows, columns); rows drive sample-size calculation.
    target : str, optional
        Target column name; None builds an untyped dataset.
    progress : optional
        Progress bar supporting ``update(message)``; may be None.
    kwargs :
        Forwarded to sampling and the dataset constructors; must contain
        "type_discovery" and "types" when ``target`` is given.
    """
    from ads.dataset.dataset import ADSDataset

    n = shape[0]
    if progress:
        progress.update("Generating data sample")

    sampled_df = generate_sample(
        df,
        n,
        DatasetDefaults.sampling_confidence_level,
        DatasetDefaults.sampling_confidence_interval,
        **kwargs,
    )

    if target is None:
        if progress:
            progress.update("Building the dataset with no target.")
        result = ADSDataset(df=df, sampled_df=sampled_df, shape=shape, **kwargs)
        if progress:
            progress.update("Done")
        logger.info(
            "Use `set_target()` to type the dataset for a particular learning task."
        )
        return result

    if progress:
        progress.update("Building dataset")

    # A user-declared dtype for the target overrides type discovery.
    discover_target_type = kwargs["type_discovery"]
    if target in kwargs["types"]:
        sampled_df[target] = sampled_df[target].astype(kwargs["types"][target])
        discover_target_type = False

    # if type discovery is turned off, infer type from pandas dtype
    target_type = infer_target_type(target, sampled_df[target], discover_target_type)

    result = get_dataset(
        df=df,
        sampled_df=sampled_df,
        target=target,
        target_type=target_type,
        shape=shape,
        **kwargs,
    )
    if progress:
        progress.update("Done")
    logger.info(
        "Use `suggest_recommendations()` to view and apply recommendations for dataset optimization."
    )
    return result
1164
-
1165
-
1166
class CustomFormatReaders:
    """Readers for file formats that need more than a bare pandas call.

    Every reader accepts ``(path, **kwargs)`` and returns a pandas DataFrame,
    matching the contract expected by `load_dataset` / `reader_fns`.
    """

    @staticmethod
    def read_tsv(path: str, **kwargs) -> pd.DataFrame:
        """Read a tab-separated file; caller kwargs win over the default sep."""
        return pd.read_csv(
            path, **utils.inject_and_copy_kwargs(kwargs, **{"sep": "\t"})
        )

    @staticmethod
    def read_json(path: str, **kwargs) -> pd.DataFrame:
        """Read JSON; fall back to JSON-lines when document mode fails."""
        try:
            return pd.read_json(path, **kwargs)
        except ValueError:
            # Newline-delimited records raise ValueError in document mode;
            # retry with lines=True (unless the caller already set it).
            return pd.read_json(
                path, **utils.inject_and_copy_kwargs(kwargs, **{"lines": True})
            )

    @staticmethod
    def read_libsvm(path: str, **kwargs) -> pd.DataFrame:
        """Read a libsvm file into a dense DataFrame with a 'target' column."""
        from sklearn.datasets import load_svmlight_file
        from joblib import Memory

        # Disk-cache the parsed result; libsvm parsing is expensive.
        mem = Memory("./mycache")

        @mem.cache
        def get_data(path):
            X, y = load_svmlight_file(path)
            df = pd.DataFrame(X.todense())
            df["target"] = y
            return df

        return get_data(path)

    @staticmethod
    @runtime_dependency(
        module="pandavro", object="read_avro", install_from=OptionalDependency.DATA
    )
    def read_avro(path: str, **kwargs) -> pd.DataFrame:
        """Read an Avro file via the optional pandavro package."""
        return read_avro(path, **kwargs)

    # Defaults used when reading from a SQL source.
    DEFAULT_SQL_CHUNKSIZE = 12007
    DEFAULT_SQL_ARRAYSIZE = 50000
    DEFAULT_SQL_MIL = 128
    DEFAULT_SQL_CTU = False

    @classmethod
    def read_sql(cls, path: str, table: str = None, **kwargs) -> pd.DataFrame:
        """

        :param path: str
            This is the connection URL that gets passed to sqlalchemy's create_engine method
        :param table: str
            This is either the name of a table to select * from or a sql query to be run
        :param kwargs:
        :return: pd.DataFrame
        """
        if table is None:
            raise ValueError(
                "In order to read from a database you need to specify the table using the `table` "
                "argument."
            )
        # check if it's oracle dialect
        if str(path).lower().startswith("oracle"):
            kwargs = utils.inject_and_copy_kwargs(
                kwargs,
                **{
                    "arraysize": cls.DEFAULT_SQL_ARRAYSIZE,
                    "max_identifier_length": cls.DEFAULT_SQL_MIL,
                    "coerce_to_unicode": cls.DEFAULT_SQL_CTU,
                },
            )
        engine = utils.get_sqlalchemy_engine(path, **kwargs)

        table_name = table.strip()
        with engine.connect() as connection:
            # if it's a query expression:
            if table_name.lower().startswith("select"):
                sql_query = table_name
            else:
                sql_query = f"select * from {table_name}"

            chunks = pd.read_sql_query(
                sql_query,
                con=connection,
                **validate_kwargs(
                    pd.read_sql_query,
                    utils.inject_and_copy_kwargs(
                        kwargs, **{"chunksize": cls.DEFAULT_SQL_CHUNKSIZE}
                    ),
                ),
            )
            from tqdm import tqdm

            # PERF FIX: accumulate chunks and concat once instead of
            # pd.concat inside the loop (which is quadratic in row count).
            frames = []
            with tqdm(chunks, unit=" rows") as t:
                for chunk in chunks:
                    frames.append(chunk)
                    t.update(len(chunk))
            df = pd.concat(frames) if frames else pd.DataFrame()

            df = df.reset_index(drop=True)
            if df.shape[0] == 0:
                logger.warning(
                    "The SQL expression returned zero rows. Therefore, no `ADSdataset` object was created."
                )
                raise Exception("The SQL expression returned no rows")
        return df

    @staticmethod
    def read_log(path, **kwargs):
        """Read an Apache access log (common/combined format) into a DataFrame."""
        from ads.dataset.helper import parse_apache_log_str, parse_apache_log_datetime

        df = pd.read_csv(
            path,
            # Split on whitespace that is outside quotes and outside brackets.
            sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
            engine="python",
            na_values="-",
            header=None,
            names=[
                "host",
                "identity",
                "user",
                "time",
                "request",
                "http_code",
                "response_bytes",
                "referer",
                "user_agent",
                "unknown",
            ],
            converters={
                "time": parse_apache_log_datetime,
                "request": parse_apache_log_str,
                "status": int,
                "size": int,
                "referer": parse_apache_log_str,
                "user_agent": parse_apache_log_str,
            },
            **kwargs,
        )
        return df

    @staticmethod
    def read_html(path, html_table_index: int = None, **kwargs):
        """Read HTML tables; one by index, or all tables concatenated."""
        if html_table_index is None:
            return pd.concat(df for df in pd.read_html(path, **kwargs))
        else:
            return pd.read_html(path, **kwargs)[html_table_index]

    @staticmethod
    @runtime_dependency(module="scipy", install_from=OptionalDependency.VIZ)
    def read_arff(path, **kwargs):
        """Read an ARFF file from a local path or an HTTP(S) URL."""
        from scipy.io import arff
        import requests
        from io import BytesIO, TextIOWrapper

        data = None
        if os.path.isfile(path):
            data, _ = arff.loadarff(path)
        else:
            with requests.get(path) as r:
                if r.status_code == requests.codes.ok:
                    f = TextIOWrapper(BytesIO(r.content))
                    data, _ = arff.loadarff(f)
        return pd.DataFrame(data)

    @staticmethod
    def read_xml(path: str, **kwargs) -> pd.DataFrame:
        """
        Load data from xml file.

        Parameters
        ----------
        path: str
            Path to XML file
        storage_options: dict, optional
            Storage options passed to Pandas to read the file.

        Returns
        -------
        dataframe : pandas.DataFrame
        """
        import xml.etree.cElementTree as et

        def get_children(df, node, parent, i):
            # Flatten attributes and leaf children into row i; nested
            # elements extend the column name with "tag/" prefixes.
            for name in node.attrib.keys():
                df.at[i, parent + name] = node.attrib[name]
            for child in list(node):
                if len(list(child)) > 0:
                    get_children(df, child, parent + child.tag + "/", i)
                else:
                    df.at[i, parent + child.tag] = child.text

        storage_options = kwargs.get("storage_options", {})

        file_handles = fsspec.open_files(path, mode="rb", **storage_options)
        ret_df = pd.DataFrame()
        last_i = 0
        for file_handle in file_handles:
            with file_handle:
                # BUG FIX: previously re-parsed `path` here, ignoring the
                # open fsspec handle — storage_options never applied and a
                # glob re-read the same location for every match. Parse the
                # handle instead.
                parsed_xml = et.parse(file_handle)
                rows_added = 0
                for i, node in enumerate(parsed_xml.getroot()):
                    get_children(ret_df, node, node.tag + "/", last_i + i)
                    rows_added = i + 1
                # BUG FIX: `last_i = i` made the next file overwrite the last
                # row of this one; advance past every row actually written.
                last_i += rows_added
        return ret_df
1370
-
1371
-
1372
# Registry mapping a format key (usually the file extension) to the callable
# used to load it into a pandas DataFrame. Looked up by `get_format_reader`;
# every entry follows the `(path, **kwargs) -> pd.DataFrame` contract.
reader_fns = {
    "csv": pd.read_csv,
    "tsv": CustomFormatReaders.read_tsv,
    "json": CustomFormatReaders.read_json,
    "jsonl": CustomFormatReaders.read_json,
    "excel": pd.read_excel,
    "xls": pd.read_excel,
    "xlsx": pd.read_excel,
    "parquet": pd.read_parquet,
    "libsvm": CustomFormatReaders.read_libsvm,
    "hdf": pd.read_hdf,  # Todo: re.match(format, "hdf\d*") or format == "h5"
    "hdf3": pd.read_hdf,
    "hdf4": pd.read_hdf,
    "h5": pd.read_hdf,
    "avro": CustomFormatReaders.read_avro,
    "avsc": CustomFormatReaders.read_avro,
    "sql": CustomFormatReaders.read_sql,
    "db": CustomFormatReaders.read_sql,
    "log": CustomFormatReaders.read_log,
    "clf": CustomFormatReaders.read_log,
    "html": CustomFormatReaders.read_html,
    "arff": CustomFormatReaders.read_arff,
    "xml": CustomFormatReaders.read_xml,
}
1396
-
1397
-
1398
def validate_kwargs(func: Callable, kwargs):
    """Filter ``kwargs`` down to the parameters ``func`` actually accepts.

    When ``func`` takes a ``**kwargs`` catch-all, everything passes through
    unchanged; otherwise unknown keys are silently dropped.
    """
    accepted = inspect.signature(func).parameters
    if "kwargs" in accepted:
        return kwargs
    return {name: value for name, value in kwargs.items() if name in accepted}
1404
-
1405
-
1406
def get_format_reader(path: ElaboratedPath, **kwargs) -> Callable:
    """Resolve the reader callable registered for ``path.format``.

    Raises
    ------
    ValueError
        When no reader is registered for the detected format.
    """
    format_key = path.format
    try:
        return reader_fns[format_key]
    except (KeyError, NameError):
        raise ValueError(
            f"We were unable to load the specified dataset. We have interpreted the format "
            f"as {format_key}, if this is not correct, call again and set the `format` parameter = "
            f"to the desired format. Read more here: https://docs.cloud.oracle.com/en-us/iaas/tools/ads"
            f"-sdk/latest/user_guide/loading_data/loading_data.html#specify-data-types-in-load-dataset"
        )
1419
-
1420
-
1421
def load_dataset(path: ElaboratedPath, reader_fn: Callable, **kwargs) -> pd.DataFrame:
    """Read every file behind ``path`` with ``reader_fn`` and concatenate.

    Each file must yield a pandas DataFrame; kwargs are filtered to what the
    reader accepts. Raises when nothing was read or the result is empty.
    """
    frames = []
    for filename in path.paths:
        frame = reader_fn(filename, **validate_kwargs(reader_fn, kwargs))
        if not isinstance(frame, pd.DataFrame):
            fn_name = f"{reader_fn.__module__}.{reader_fn.__qualname__}"
            raise ValueError(
                f"{fn_name} is used to load the data. "
                f"However, {fn_name} returned {type(frame)} instead of pandas DataFrame. "
                f"Refer to the usage of {fn_name} to set the correct arguments."
            )
        frames.append(frame)
    if not frames:
        raise ValueError(
            f"We were unable to load the specified dataset. Read more here: "
            f"https://docs.cloud.oracle.com/en-us/iaas/tools/ads"
            f"-sdk/latest/user_guide/loading_data/loading_data.html#specify-data-types-in-load-dataset"
        )

    df = pd.concat(frames)

    if df is None:
        raise ValueError(
            f"We were unable to load the specified dataset. Read more here: "
            f"https://docs.cloud.oracle.com/en-us/iaas/tools/ads"
            f"-sdk/latest/user_guide/loading_data/loading_data.html#specify-data-types-in-load-dataset"
        )
    if df.empty:
        raise DatasetLoadException("Empty DataFrame, not producing a ADSDataset")
    return df