oracle-ads 2.13.9rc0__py3-none-any.whl → 2.13.9rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (857)
  1. ads/aqua/__init__.py +40 -0
  2. ads/aqua/app.py +506 -0
  3. ads/aqua/cli.py +96 -0
  4. ads/aqua/client/__init__.py +3 -0
  5. ads/aqua/client/client.py +836 -0
  6. ads/aqua/client/openai_client.py +305 -0
  7. ads/aqua/common/__init__.py +5 -0
  8. ads/aqua/common/decorator.py +125 -0
  9. ads/aqua/common/entities.py +269 -0
  10. ads/aqua/common/enums.py +122 -0
  11. ads/aqua/common/errors.py +109 -0
  12. ads/aqua/common/utils.py +1285 -0
  13. ads/aqua/config/__init__.py +4 -0
  14. ads/aqua/config/container_config.py +248 -0
  15. ads/aqua/config/evaluation/__init__.py +4 -0
  16. ads/aqua/config/evaluation/evaluation_service_config.py +147 -0
  17. ads/aqua/config/utils/__init__.py +4 -0
  18. ads/aqua/config/utils/serializer.py +339 -0
  19. ads/aqua/constants.py +116 -0
  20. ads/aqua/data.py +14 -0
  21. ads/aqua/dummy_data/icon.txt +1 -0
  22. ads/aqua/dummy_data/oci_model_deployments.json +56 -0
  23. ads/aqua/dummy_data/oci_models.json +1 -0
  24. ads/aqua/dummy_data/readme.md +26 -0
  25. ads/aqua/evaluation/__init__.py +8 -0
  26. ads/aqua/evaluation/constants.py +53 -0
  27. ads/aqua/evaluation/entities.py +186 -0
  28. ads/aqua/evaluation/errors.py +70 -0
  29. ads/aqua/evaluation/evaluation.py +1814 -0
  30. ads/aqua/extension/__init__.py +42 -0
  31. ads/aqua/extension/aqua_ws_msg_handler.py +76 -0
  32. ads/aqua/extension/base_handler.py +90 -0
  33. ads/aqua/extension/common_handler.py +121 -0
  34. ads/aqua/extension/common_ws_msg_handler.py +36 -0
  35. ads/aqua/extension/deployment_handler.py +298 -0
  36. ads/aqua/extension/deployment_ws_msg_handler.py +54 -0
  37. ads/aqua/extension/errors.py +30 -0
  38. ads/aqua/extension/evaluation_handler.py +129 -0
  39. ads/aqua/extension/evaluation_ws_msg_handler.py +61 -0
  40. ads/aqua/extension/finetune_handler.py +96 -0
  41. ads/aqua/extension/model_handler.py +390 -0
  42. ads/aqua/extension/models/__init__.py +0 -0
  43. ads/aqua/extension/models/ws_models.py +145 -0
  44. ads/aqua/extension/models_ws_msg_handler.py +50 -0
  45. ads/aqua/extension/ui_handler.py +282 -0
  46. ads/aqua/extension/ui_websocket_handler.py +130 -0
  47. ads/aqua/extension/utils.py +133 -0
  48. ads/aqua/finetuning/__init__.py +7 -0
  49. ads/aqua/finetuning/constants.py +23 -0
  50. ads/aqua/finetuning/entities.py +181 -0
  51. ads/aqua/finetuning/finetuning.py +749 -0
  52. ads/aqua/model/__init__.py +8 -0
  53. ads/aqua/model/constants.py +60 -0
  54. ads/aqua/model/entities.py +385 -0
  55. ads/aqua/model/enums.py +32 -0
  56. ads/aqua/model/model.py +2114 -0
  57. ads/aqua/modeldeployment/__init__.py +8 -0
  58. ads/aqua/modeldeployment/constants.py +10 -0
  59. ads/aqua/modeldeployment/deployment.py +1326 -0
  60. ads/aqua/modeldeployment/entities.py +653 -0
  61. ads/aqua/modeldeployment/inference.py +74 -0
  62. ads/aqua/modeldeployment/utils.py +543 -0
  63. ads/aqua/resources/gpu_shapes_index.json +94 -0
  64. ads/aqua/server/__init__.py +4 -0
  65. ads/aqua/server/__main__.py +24 -0
  66. ads/aqua/server/app.py +47 -0
  67. ads/aqua/server/aqua_spec.yml +1291 -0
  68. ads/aqua/training/__init__.py +4 -0
  69. ads/aqua/training/exceptions.py +476 -0
  70. ads/aqua/ui.py +499 -0
  71. ads/automl/__init__.py +9 -0
  72. ads/automl/driver.py +330 -0
  73. ads/automl/provider.py +975 -0
  74. ads/bds/__init__.py +5 -0
  75. ads/bds/auth.py +127 -0
  76. ads/bds/big_data_service.py +255 -0
  77. ads/catalog/__init__.py +19 -0
  78. ads/catalog/model.py +1576 -0
  79. ads/catalog/notebook.py +461 -0
  80. ads/catalog/project.py +468 -0
  81. ads/catalog/summary.py +178 -0
  82. ads/common/__init__.py +11 -0
  83. ads/common/analyzer.py +65 -0
  84. ads/common/artifact/.model-ignore +63 -0
  85. ads/common/artifact/__init__.py +10 -0
  86. ads/common/auth.py +1122 -0
  87. ads/common/card_identifier.py +83 -0
  88. ads/common/config.py +647 -0
  89. ads/common/data.py +165 -0
  90. ads/common/decorator/__init__.py +9 -0
  91. ads/common/decorator/argument_to_case.py +88 -0
  92. ads/common/decorator/deprecate.py +69 -0
  93. ads/common/decorator/require_nonempty_arg.py +65 -0
  94. ads/common/decorator/runtime_dependency.py +178 -0
  95. ads/common/decorator/threaded.py +97 -0
  96. ads/common/decorator/utils.py +35 -0
  97. ads/common/dsc_file_system.py +303 -0
  98. ads/common/error.py +14 -0
  99. ads/common/extended_enum.py +81 -0
  100. ads/common/function/__init__.py +5 -0
  101. ads/common/function/fn_util.py +142 -0
  102. ads/common/function/func_conf.yaml +25 -0
  103. ads/common/ipython.py +76 -0
  104. ads/common/model.py +679 -0
  105. ads/common/model_artifact.py +1759 -0
  106. ads/common/model_artifact_schema.json +107 -0
  107. ads/common/model_export_util.py +664 -0
  108. ads/common/model_metadata.py +24 -0
  109. ads/common/object_storage_details.py +296 -0
  110. ads/common/oci_client.py +175 -0
  111. ads/common/oci_datascience.py +46 -0
  112. ads/common/oci_logging.py +1144 -0
  113. ads/common/oci_mixin.py +957 -0
  114. ads/common/oci_resource.py +136 -0
  115. ads/common/serializer.py +559 -0
  116. ads/common/utils.py +1852 -0
  117. ads/common/word_lists.py +1491 -0
  118. ads/common/work_request.py +189 -0
  119. ads/data_labeling/__init__.py +13 -0
  120. ads/data_labeling/boundingbox.py +253 -0
  121. ads/data_labeling/constants.py +47 -0
  122. ads/data_labeling/data_labeling_service.py +244 -0
  123. ads/data_labeling/interface/__init__.py +5 -0
  124. ads/data_labeling/interface/loader.py +16 -0
  125. ads/data_labeling/interface/parser.py +16 -0
  126. ads/data_labeling/interface/reader.py +23 -0
  127. ads/data_labeling/loader/__init__.py +5 -0
  128. ads/data_labeling/loader/file_loader.py +241 -0
  129. ads/data_labeling/metadata.py +110 -0
  130. ads/data_labeling/mixin/__init__.py +5 -0
  131. ads/data_labeling/mixin/data_labeling.py +232 -0
  132. ads/data_labeling/ner.py +129 -0
  133. ads/data_labeling/parser/__init__.py +5 -0
  134. ads/data_labeling/parser/dls_record_parser.py +388 -0
  135. ads/data_labeling/parser/export_metadata_parser.py +94 -0
  136. ads/data_labeling/parser/export_record_parser.py +473 -0
  137. ads/data_labeling/reader/__init__.py +5 -0
  138. ads/data_labeling/reader/dataset_reader.py +574 -0
  139. ads/data_labeling/reader/dls_record_reader.py +121 -0
  140. ads/data_labeling/reader/export_record_reader.py +62 -0
  141. ads/data_labeling/reader/jsonl_reader.py +75 -0
  142. ads/data_labeling/reader/metadata_reader.py +203 -0
  143. ads/data_labeling/reader/record_reader.py +263 -0
  144. ads/data_labeling/record.py +52 -0
  145. ads/data_labeling/visualizer/__init__.py +5 -0
  146. ads/data_labeling/visualizer/image_visualizer.py +525 -0
  147. ads/data_labeling/visualizer/text_visualizer.py +357 -0
  148. ads/database/__init__.py +5 -0
  149. ads/database/connection.py +338 -0
  150. ads/dataset/__init__.py +10 -0
  151. ads/dataset/capabilities.md +51 -0
  152. ads/dataset/classification_dataset.py +339 -0
  153. ads/dataset/correlation.py +226 -0
  154. ads/dataset/correlation_plot.py +563 -0
  155. ads/dataset/dask_series.py +173 -0
  156. ads/dataset/dataframe_transformer.py +110 -0
  157. ads/dataset/dataset.py +1979 -0
  158. ads/dataset/dataset_browser.py +360 -0
  159. ads/dataset/dataset_with_target.py +995 -0
  160. ads/dataset/exception.py +25 -0
  161. ads/dataset/factory.py +987 -0
  162. ads/dataset/feature_engineering_transformer.py +35 -0
  163. ads/dataset/feature_selection.py +107 -0
  164. ads/dataset/forecasting_dataset.py +26 -0
  165. ads/dataset/helper.py +1450 -0
  166. ads/dataset/label_encoder.py +99 -0
  167. ads/dataset/mixin/__init__.py +5 -0
  168. ads/dataset/mixin/dataset_accessor.py +134 -0
  169. ads/dataset/pipeline.py +58 -0
  170. ads/dataset/plot.py +710 -0
  171. ads/dataset/progress.py +86 -0
  172. ads/dataset/recommendation.py +297 -0
  173. ads/dataset/recommendation_transformer.py +502 -0
  174. ads/dataset/regression_dataset.py +14 -0
  175. ads/dataset/sampled_dataset.py +1050 -0
  176. ads/dataset/target.py +98 -0
  177. ads/dataset/timeseries.py +18 -0
  178. ads/dbmixin/__init__.py +5 -0
  179. ads/dbmixin/db_pandas_accessor.py +153 -0
  180. ads/environment/__init__.py +9 -0
  181. ads/environment/ml_runtime.py +66 -0
  182. ads/evaluations/README.md +14 -0
  183. ads/evaluations/__init__.py +109 -0
  184. ads/evaluations/evaluation_plot.py +983 -0
  185. ads/evaluations/evaluator.py +1334 -0
  186. ads/evaluations/statistical_metrics.py +543 -0
  187. ads/experiments/__init__.py +9 -0
  188. ads/experiments/capabilities.md +0 -0
  189. ads/explanations/__init__.py +21 -0
  190. ads/explanations/base_explainer.py +142 -0
  191. ads/explanations/capabilities.md +83 -0
  192. ads/explanations/explainer.py +190 -0
  193. ads/explanations/mlx_global_explainer.py +1050 -0
  194. ads/explanations/mlx_interface.py +386 -0
  195. ads/explanations/mlx_local_explainer.py +287 -0
  196. ads/explanations/mlx_whatif_explainer.py +201 -0
  197. ads/feature_engineering/__init__.py +20 -0
  198. ads/feature_engineering/accessor/__init__.py +5 -0
  199. ads/feature_engineering/accessor/dataframe_accessor.py +535 -0
  200. ads/feature_engineering/accessor/mixin/__init__.py +5 -0
  201. ads/feature_engineering/accessor/mixin/correlation.py +166 -0
  202. ads/feature_engineering/accessor/mixin/eda_mixin.py +266 -0
  203. ads/feature_engineering/accessor/mixin/eda_mixin_series.py +85 -0
  204. ads/feature_engineering/accessor/mixin/feature_types_mixin.py +211 -0
  205. ads/feature_engineering/accessor/mixin/utils.py +65 -0
  206. ads/feature_engineering/accessor/series_accessor.py +431 -0
  207. ads/feature_engineering/adsimage/__init__.py +5 -0
  208. ads/feature_engineering/adsimage/image.py +192 -0
  209. ads/feature_engineering/adsimage/image_reader.py +170 -0
  210. ads/feature_engineering/adsimage/interface/__init__.py +5 -0
  211. ads/feature_engineering/adsimage/interface/reader.py +19 -0
  212. ads/feature_engineering/adsstring/__init__.py +7 -0
  213. ads/feature_engineering/adsstring/oci_language/__init__.py +8 -0
  214. ads/feature_engineering/adsstring/string/__init__.py +8 -0
  215. ads/feature_engineering/data_schema.json +57 -0
  216. ads/feature_engineering/dataset/__init__.py +5 -0
  217. ads/feature_engineering/dataset/zip_code_data.py +42062 -0
  218. ads/feature_engineering/exceptions.py +40 -0
  219. ads/feature_engineering/feature_type/__init__.py +133 -0
  220. ads/feature_engineering/feature_type/address.py +184 -0
  221. ads/feature_engineering/feature_type/adsstring/__init__.py +5 -0
  222. ads/feature_engineering/feature_type/adsstring/common_regex_mixin.py +164 -0
  223. ads/feature_engineering/feature_type/adsstring/oci_language.py +93 -0
  224. ads/feature_engineering/feature_type/adsstring/parsers/__init__.py +5 -0
  225. ads/feature_engineering/feature_type/adsstring/parsers/base.py +47 -0
  226. ads/feature_engineering/feature_type/adsstring/parsers/nltk_parser.py +96 -0
  227. ads/feature_engineering/feature_type/adsstring/parsers/spacy_parser.py +221 -0
  228. ads/feature_engineering/feature_type/adsstring/string.py +258 -0
  229. ads/feature_engineering/feature_type/base.py +58 -0
  230. ads/feature_engineering/feature_type/boolean.py +183 -0
  231. ads/feature_engineering/feature_type/category.py +146 -0
  232. ads/feature_engineering/feature_type/constant.py +137 -0
  233. ads/feature_engineering/feature_type/continuous.py +151 -0
  234. ads/feature_engineering/feature_type/creditcard.py +314 -0
  235. ads/feature_engineering/feature_type/datetime.py +190 -0
  236. ads/feature_engineering/feature_type/discrete.py +134 -0
  237. ads/feature_engineering/feature_type/document.py +43 -0
  238. ads/feature_engineering/feature_type/gis.py +251 -0
  239. ads/feature_engineering/feature_type/handler/__init__.py +5 -0
  240. ads/feature_engineering/feature_type/handler/feature_validator.py +524 -0
  241. ads/feature_engineering/feature_type/handler/feature_warning.py +319 -0
  242. ads/feature_engineering/feature_type/handler/warnings.py +128 -0
  243. ads/feature_engineering/feature_type/integer.py +142 -0
  244. ads/feature_engineering/feature_type/ip_address.py +144 -0
  245. ads/feature_engineering/feature_type/ip_address_v4.py +138 -0
  246. ads/feature_engineering/feature_type/ip_address_v6.py +138 -0
  247. ads/feature_engineering/feature_type/lat_long.py +256 -0
  248. ads/feature_engineering/feature_type/object.py +43 -0
  249. ads/feature_engineering/feature_type/ordinal.py +132 -0
  250. ads/feature_engineering/feature_type/phone_number.py +135 -0
  251. ads/feature_engineering/feature_type/string.py +171 -0
  252. ads/feature_engineering/feature_type/text.py +93 -0
  253. ads/feature_engineering/feature_type/unknown.py +43 -0
  254. ads/feature_engineering/feature_type/zip_code.py +164 -0
  255. ads/feature_engineering/feature_type_manager.py +406 -0
  256. ads/feature_engineering/schema.py +795 -0
  257. ads/feature_engineering/utils.py +245 -0
  258. ads/feature_store/.readthedocs.yaml +19 -0
  259. ads/feature_store/README.md +65 -0
  260. ads/feature_store/__init__.py +9 -0
  261. ads/feature_store/common/__init__.py +0 -0
  262. ads/feature_store/common/enums.py +339 -0
  263. ads/feature_store/common/exceptions.py +18 -0
  264. ads/feature_store/common/spark_session_singleton.py +125 -0
  265. ads/feature_store/common/utils/__init__.py +0 -0
  266. ads/feature_store/common/utils/base64_encoder_decoder.py +72 -0
  267. ads/feature_store/common/utils/feature_schema_mapper.py +283 -0
  268. ads/feature_store/common/utils/transformation_utils.py +82 -0
  269. ads/feature_store/common/utils/utility.py +403 -0
  270. ads/feature_store/data_validation/__init__.py +0 -0
  271. ads/feature_store/data_validation/great_expectation.py +129 -0
  272. ads/feature_store/dataset.py +1230 -0
  273. ads/feature_store/dataset_job.py +530 -0
  274. ads/feature_store/docs/Dockerfile +7 -0
  275. ads/feature_store/docs/Makefile +44 -0
  276. ads/feature_store/docs/conf.py +28 -0
  277. ads/feature_store/docs/requirements.txt +14 -0
  278. ads/feature_store/docs/source/ads.feature_store.query.rst +20 -0
  279. ads/feature_store/docs/source/cicd.rst +137 -0
  280. ads/feature_store/docs/source/conf.py +86 -0
  281. ads/feature_store/docs/source/data_versioning.rst +33 -0
  282. ads/feature_store/docs/source/dataset.rst +388 -0
  283. ads/feature_store/docs/source/dataset_job.rst +27 -0
  284. ads/feature_store/docs/source/demo.rst +70 -0
  285. ads/feature_store/docs/source/entity.rst +78 -0
  286. ads/feature_store/docs/source/feature_group.rst +624 -0
  287. ads/feature_store/docs/source/feature_group_job.rst +29 -0
  288. ads/feature_store/docs/source/feature_store.rst +122 -0
  289. ads/feature_store/docs/source/feature_store_class.rst +123 -0
  290. ads/feature_store/docs/source/feature_validation.rst +66 -0
  291. ads/feature_store/docs/source/figures/cicd.png +0 -0
  292. ads/feature_store/docs/source/figures/data_validation.png +0 -0
  293. ads/feature_store/docs/source/figures/data_versioning.png +0 -0
  294. ads/feature_store/docs/source/figures/dataset.gif +0 -0
  295. ads/feature_store/docs/source/figures/dataset.png +0 -0
  296. ads/feature_store/docs/source/figures/dataset_lineage.png +0 -0
  297. ads/feature_store/docs/source/figures/dataset_statistics.png +0 -0
  298. ads/feature_store/docs/source/figures/dataset_statistics_viz.png +0 -0
  299. ads/feature_store/docs/source/figures/dataset_validation_results.png +0 -0
  300. ads/feature_store/docs/source/figures/dataset_validation_summary.png +0 -0
  301. ads/feature_store/docs/source/figures/drift_monitoring.png +0 -0
  302. ads/feature_store/docs/source/figures/entity.png +0 -0
  303. ads/feature_store/docs/source/figures/feature_group.png +0 -0
  304. ads/feature_store/docs/source/figures/feature_group_lineage.png +0 -0
  305. ads/feature_store/docs/source/figures/feature_group_statistics_viz.png +0 -0
  306. ads/feature_store/docs/source/figures/feature_store_deployment.png +0 -0
  307. ads/feature_store/docs/source/figures/feature_store_overview.png +0 -0
  308. ads/feature_store/docs/source/figures/featuregroup.gif +0 -0
  309. ads/feature_store/docs/source/figures/lineage_d1.png +0 -0
  310. ads/feature_store/docs/source/figures/lineage_d2.png +0 -0
  311. ads/feature_store/docs/source/figures/lineage_fg.png +0 -0
  312. ads/feature_store/docs/source/figures/logo-dark-mode.png +0 -0
  313. ads/feature_store/docs/source/figures/logo-light-mode.png +0 -0
  314. ads/feature_store/docs/source/figures/overview.png +0 -0
  315. ads/feature_store/docs/source/figures/resource_manager.png +0 -0
  316. ads/feature_store/docs/source/figures/resource_manager_feature_store_stack.png +0 -0
  317. ads/feature_store/docs/source/figures/resource_manager_home.png +0 -0
  318. ads/feature_store/docs/source/figures/stats_1.png +0 -0
  319. ads/feature_store/docs/source/figures/stats_2.png +0 -0
  320. ads/feature_store/docs/source/figures/stats_d.png +0 -0
  321. ads/feature_store/docs/source/figures/stats_fg.png +0 -0
  322. ads/feature_store/docs/source/figures/transformation.png +0 -0
  323. ads/feature_store/docs/source/figures/transformations.gif +0 -0
  324. ads/feature_store/docs/source/figures/validation.png +0 -0
  325. ads/feature_store/docs/source/figures/validation_fg.png +0 -0
  326. ads/feature_store/docs/source/figures/validation_results.png +0 -0
  327. ads/feature_store/docs/source/figures/validation_summary.png +0 -0
  328. ads/feature_store/docs/source/index.rst +81 -0
  329. ads/feature_store/docs/source/module.rst +8 -0
  330. ads/feature_store/docs/source/notebook.rst +94 -0
  331. ads/feature_store/docs/source/overview.rst +47 -0
  332. ads/feature_store/docs/source/quickstart.rst +176 -0
  333. ads/feature_store/docs/source/release_notes.rst +194 -0
  334. ads/feature_store/docs/source/setup_feature_store.rst +81 -0
  335. ads/feature_store/docs/source/statistics.rst +58 -0
  336. ads/feature_store/docs/source/transformation.rst +199 -0
  337. ads/feature_store/docs/source/ui.rst +65 -0
  338. ads/feature_store/docs/source/user_guides.setup.feature_store_operator.rst +66 -0
  339. ads/feature_store/docs/source/user_guides.setup.helm_chart.rst +192 -0
  340. ads/feature_store/docs/source/user_guides.setup.terraform.rst +338 -0
  341. ads/feature_store/entity.py +718 -0
  342. ads/feature_store/execution_strategy/__init__.py +0 -0
  343. ads/feature_store/execution_strategy/delta_lake/__init__.py +0 -0
  344. ads/feature_store/execution_strategy/delta_lake/delta_lake_service.py +375 -0
  345. ads/feature_store/execution_strategy/engine/__init__.py +0 -0
  346. ads/feature_store/execution_strategy/engine/spark_engine.py +316 -0
  347. ads/feature_store/execution_strategy/execution_strategy.py +113 -0
  348. ads/feature_store/execution_strategy/execution_strategy_provider.py +47 -0
  349. ads/feature_store/execution_strategy/spark/__init__.py +0 -0
  350. ads/feature_store/execution_strategy/spark/spark_execution.py +618 -0
  351. ads/feature_store/feature.py +192 -0
  352. ads/feature_store/feature_group.py +1494 -0
  353. ads/feature_store/feature_group_expectation.py +346 -0
  354. ads/feature_store/feature_group_job.py +602 -0
  355. ads/feature_store/feature_lineage/__init__.py +0 -0
  356. ads/feature_store/feature_lineage/graphviz_service.py +180 -0
  357. ads/feature_store/feature_option_details.py +50 -0
  358. ads/feature_store/feature_statistics/__init__.py +0 -0
  359. ads/feature_store/feature_statistics/statistics_service.py +99 -0
  360. ads/feature_store/feature_store.py +699 -0
  361. ads/feature_store/feature_store_registrar.py +518 -0
  362. ads/feature_store/input_feature_detail.py +149 -0
  363. ads/feature_store/mixin/__init__.py +4 -0
  364. ads/feature_store/mixin/oci_feature_store.py +145 -0
  365. ads/feature_store/model_details.py +73 -0
  366. ads/feature_store/query/__init__.py +0 -0
  367. ads/feature_store/query/filter.py +266 -0
  368. ads/feature_store/query/generator/__init__.py +0 -0
  369. ads/feature_store/query/generator/query_generator.py +298 -0
  370. ads/feature_store/query/join.py +161 -0
  371. ads/feature_store/query/query.py +403 -0
  372. ads/feature_store/query/validator/__init__.py +0 -0
  373. ads/feature_store/query/validator/query_validator.py +57 -0
  374. ads/feature_store/response/__init__.py +0 -0
  375. ads/feature_store/response/response_builder.py +68 -0
  376. ads/feature_store/service/__init__.py +0 -0
  377. ads/feature_store/service/oci_dataset.py +139 -0
  378. ads/feature_store/service/oci_dataset_job.py +199 -0
  379. ads/feature_store/service/oci_entity.py +125 -0
  380. ads/feature_store/service/oci_feature_group.py +164 -0
  381. ads/feature_store/service/oci_feature_group_job.py +214 -0
  382. ads/feature_store/service/oci_feature_store.py +182 -0
  383. ads/feature_store/service/oci_lineage.py +87 -0
  384. ads/feature_store/service/oci_transformation.py +104 -0
  385. ads/feature_store/statistics/__init__.py +0 -0
  386. ads/feature_store/statistics/abs_feature_value.py +49 -0
  387. ads/feature_store/statistics/charts/__init__.py +0 -0
  388. ads/feature_store/statistics/charts/abstract_feature_plot.py +37 -0
  389. ads/feature_store/statistics/charts/box_plot.py +148 -0
  390. ads/feature_store/statistics/charts/frequency_distribution.py +65 -0
  391. ads/feature_store/statistics/charts/probability_distribution.py +68 -0
  392. ads/feature_store/statistics/charts/top_k_frequent_elements.py +98 -0
  393. ads/feature_store/statistics/feature_stat.py +126 -0
  394. ads/feature_store/statistics/generic_feature_value.py +33 -0
  395. ads/feature_store/statistics/statistics.py +41 -0
  396. ads/feature_store/statistics_config.py +101 -0
  397. ads/feature_store/templates/feature_store_template.yaml +45 -0
  398. ads/feature_store/transformation.py +499 -0
  399. ads/feature_store/validation_output.py +57 -0
  400. ads/hpo/__init__.py +9 -0
  401. ads/hpo/_imports.py +91 -0
  402. ads/hpo/ads_search_space.py +439 -0
  403. ads/hpo/distributions.py +325 -0
  404. ads/hpo/objective.py +280 -0
  405. ads/hpo/search_cv.py +1657 -0
  406. ads/hpo/stopping_criterion.py +75 -0
  407. ads/hpo/tuner_artifact.py +413 -0
  408. ads/hpo/utils.py +91 -0
  409. ads/hpo/validation.py +140 -0
  410. ads/hpo/visualization/__init__.py +5 -0
  411. ads/hpo/visualization/_contour.py +23 -0
  412. ads/hpo/visualization/_edf.py +20 -0
  413. ads/hpo/visualization/_intermediate_values.py +21 -0
  414. ads/hpo/visualization/_optimization_history.py +25 -0
  415. ads/hpo/visualization/_parallel_coordinate.py +169 -0
  416. ads/hpo/visualization/_param_importances.py +26 -0
  417. ads/jobs/__init__.py +53 -0
  418. ads/jobs/ads_job.py +663 -0
  419. ads/jobs/builders/__init__.py +5 -0
  420. ads/jobs/builders/base.py +156 -0
  421. ads/jobs/builders/infrastructure/__init__.py +6 -0
  422. ads/jobs/builders/infrastructure/base.py +165 -0
  423. ads/jobs/builders/infrastructure/dataflow.py +1252 -0
  424. ads/jobs/builders/infrastructure/dsc_job.py +1894 -0
  425. ads/jobs/builders/infrastructure/dsc_job_runtime.py +1233 -0
  426. ads/jobs/builders/infrastructure/utils.py +65 -0
  427. ads/jobs/builders/runtimes/__init__.py +5 -0
  428. ads/jobs/builders/runtimes/artifact.py +338 -0
  429. ads/jobs/builders/runtimes/base.py +325 -0
  430. ads/jobs/builders/runtimes/container_runtime.py +242 -0
  431. ads/jobs/builders/runtimes/python_runtime.py +1016 -0
  432. ads/jobs/builders/runtimes/pytorch_runtime.py +204 -0
  433. ads/jobs/cli.py +104 -0
  434. ads/jobs/env_var_parser.py +131 -0
  435. ads/jobs/extension.py +160 -0
  436. ads/jobs/schema/__init__.py +5 -0
  437. ads/jobs/schema/infrastructure_schema.json +116 -0
  438. ads/jobs/schema/job_schema.json +42 -0
  439. ads/jobs/schema/runtime_schema.json +183 -0
  440. ads/jobs/schema/validator.py +141 -0
  441. ads/jobs/serializer.py +296 -0
  442. ads/jobs/templates/__init__.py +5 -0
  443. ads/jobs/templates/container.py +6 -0
  444. ads/jobs/templates/driver_notebook.py +177 -0
  445. ads/jobs/templates/driver_oci.py +500 -0
  446. ads/jobs/templates/driver_python.py +48 -0
  447. ads/jobs/templates/driver_pytorch.py +852 -0
  448. ads/jobs/templates/driver_utils.py +615 -0
  449. ads/jobs/templates/hostname_from_env.c +55 -0
  450. ads/jobs/templates/oci_metrics.py +181 -0
  451. ads/jobs/utils.py +104 -0
  452. ads/llm/__init__.py +28 -0
  453. ads/llm/autogen/__init__.py +2 -0
  454. ads/llm/autogen/constants.py +15 -0
  455. ads/llm/autogen/reports/__init__.py +2 -0
  456. ads/llm/autogen/reports/base.py +67 -0
  457. ads/llm/autogen/reports/data.py +103 -0
  458. ads/llm/autogen/reports/session.py +526 -0
  459. ads/llm/autogen/reports/templates/chat_box.html +13 -0
  460. ads/llm/autogen/reports/templates/chat_box_lt.html +5 -0
  461. ads/llm/autogen/reports/templates/chat_box_rt.html +6 -0
  462. ads/llm/autogen/reports/utils.py +56 -0
  463. ads/llm/autogen/v02/__init__.py +4 -0
  464. ads/llm/autogen/v02/client.py +295 -0
  465. ads/llm/autogen/v02/log_handlers/__init__.py +2 -0
  466. ads/llm/autogen/v02/log_handlers/oci_file_handler.py +83 -0
  467. ads/llm/autogen/v02/loggers/__init__.py +6 -0
  468. ads/llm/autogen/v02/loggers/metric_logger.py +320 -0
  469. ads/llm/autogen/v02/loggers/session_logger.py +580 -0
  470. ads/llm/autogen/v02/loggers/utils.py +86 -0
  471. ads/llm/autogen/v02/runtime_logging.py +163 -0
  472. ads/llm/chain.py +268 -0
  473. ads/llm/chat_template.py +31 -0
  474. ads/llm/deploy.py +63 -0
  475. ads/llm/guardrails/__init__.py +5 -0
  476. ads/llm/guardrails/base.py +442 -0
  477. ads/llm/guardrails/huggingface.py +44 -0
  478. ads/llm/langchain/__init__.py +5 -0
  479. ads/llm/langchain/plugins/__init__.py +5 -0
  480. ads/llm/langchain/plugins/chat_models/__init__.py +5 -0
  481. ads/llm/langchain/plugins/chat_models/oci_data_science.py +1027 -0
  482. ads/llm/langchain/plugins/embeddings/__init__.py +4 -0
  483. ads/llm/langchain/plugins/embeddings/oci_data_science_model_deployment_endpoint.py +184 -0
  484. ads/llm/langchain/plugins/llms/__init__.py +5 -0
  485. ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py +979 -0
  486. ads/llm/requirements.txt +3 -0
  487. ads/llm/serialize.py +219 -0
  488. ads/llm/serializers/__init__.py +0 -0
  489. ads/llm/serializers/retrieval_qa.py +153 -0
  490. ads/llm/serializers/runnable_parallel.py +27 -0
  491. ads/llm/templates/score_chain.jinja2 +155 -0
  492. ads/llm/templates/tool_chat_template_hermes.jinja +130 -0
  493. ads/llm/templates/tool_chat_template_mistral_parallel.jinja +94 -0
  494. ads/model/__init__.py +52 -0
  495. ads/model/artifact.py +573 -0
  496. ads/model/artifact_downloader.py +254 -0
  497. ads/model/artifact_uploader.py +267 -0
  498. ads/model/base_properties.py +238 -0
  499. ads/model/common/.model-ignore +66 -0
  500. ads/model/common/__init__.py +5 -0
  501. ads/model/common/utils.py +142 -0
  502. ads/model/datascience_model.py +2635 -0
  503. ads/model/deployment/__init__.py +20 -0
  504. ads/model/deployment/common/__init__.py +5 -0
  505. ads/model/deployment/common/utils.py +308 -0
  506. ads/model/deployment/model_deployer.py +466 -0
  507. ads/model/deployment/model_deployment.py +1846 -0
  508. ads/model/deployment/model_deployment_infrastructure.py +671 -0
  509. ads/model/deployment/model_deployment_properties.py +493 -0
  510. ads/model/deployment/model_deployment_runtime.py +838 -0
  511. ads/model/extractor/__init__.py +5 -0
  512. ads/model/extractor/automl_extractor.py +74 -0
  513. ads/model/extractor/embedding_onnx_extractor.py +80 -0
  514. ads/model/extractor/huggingface_extractor.py +88 -0
  515. ads/model/extractor/keras_extractor.py +84 -0
  516. ads/model/extractor/lightgbm_extractor.py +93 -0
  517. ads/model/extractor/model_info_extractor.py +114 -0
  518. ads/model/extractor/model_info_extractor_factory.py +105 -0
  519. ads/model/extractor/pytorch_extractor.py +87 -0
  520. ads/model/extractor/sklearn_extractor.py +112 -0
  521. ads/model/extractor/spark_extractor.py +89 -0
  522. ads/model/extractor/tensorflow_extractor.py +85 -0
  523. ads/model/extractor/xgboost_extractor.py +94 -0
  524. ads/model/framework/__init__.py +5 -0
  525. ads/model/framework/automl_model.py +178 -0
  526. ads/model/framework/embedding_onnx_model.py +438 -0
  527. ads/model/framework/huggingface_model.py +399 -0
  528. ads/model/framework/lightgbm_model.py +266 -0
  529. ads/model/framework/pytorch_model.py +266 -0
  530. ads/model/framework/sklearn_model.py +250 -0
  531. ads/model/framework/spark_model.py +326 -0
  532. ads/model/framework/tensorflow_model.py +254 -0
  533. ads/model/framework/xgboost_model.py +258 -0
  534. ads/model/generic_model.py +3518 -0
  535. ads/model/model_artifact_boilerplate/README.md +381 -0
  536. ads/model/model_artifact_boilerplate/__init__.py +5 -0
  537. ads/model/model_artifact_boilerplate/artifact_introspection_test/__init__.py +5 -0
  538. ads/model/model_artifact_boilerplate/artifact_introspection_test/model_artifact_validate.py +427 -0
  539. ads/model/model_artifact_boilerplate/artifact_introspection_test/requirements.txt +2 -0
  540. ads/model/model_artifact_boilerplate/runtime.yaml +7 -0
  541. ads/model/model_artifact_boilerplate/score.py +61 -0
  542. ads/model/model_file_description_schema.json +68 -0
  543. ads/model/model_introspect.py +331 -0
  544. ads/model/model_metadata.py +1810 -0
  545. ads/model/model_metadata_mixin.py +460 -0
  546. ads/model/model_properties.py +63 -0
  547. ads/model/model_version_set.py +739 -0
  548. ads/model/runtime/__init__.py +5 -0
  549. ads/model/runtime/env_info.py +306 -0
  550. ads/model/runtime/model_deployment_details.py +37 -0
  551. ads/model/runtime/model_provenance_details.py +58 -0
  552. ads/model/runtime/runtime_info.py +81 -0
  553. ads/model/runtime/schemas/inference_env_info_schema.yaml +16 -0
  554. ads/model/runtime/schemas/model_provenance_schema.yaml +36 -0
  555. ads/model/runtime/schemas/training_env_info_schema.yaml +16 -0
  556. ads/model/runtime/utils.py +201 -0
  557. ads/model/serde/__init__.py +5 -0
  558. ads/model/serde/common.py +40 -0
  559. ads/model/serde/model_input.py +547 -0
  560. ads/model/serde/model_serializer.py +1184 -0
  561. ads/model/service/__init__.py +5 -0
  562. ads/model/service/oci_datascience_model.py +1076 -0
  563. ads/model/service/oci_datascience_model_deployment.py +500 -0
  564. ads/model/service/oci_datascience_model_version_set.py +176 -0
  565. ads/model/transformer/__init__.py +5 -0
  566. ads/model/transformer/onnx_transformer.py +324 -0
  567. ads/mysqldb/__init__.py +5 -0
  568. ads/mysqldb/mysql_db.py +227 -0
  569. ads/opctl/__init__.py +18 -0
  570. ads/opctl/anomaly_detection.py +11 -0
  571. ads/opctl/backend/__init__.py +5 -0
  572. ads/opctl/backend/ads_dataflow.py +353 -0
  573. ads/opctl/backend/ads_ml_job.py +710 -0
  574. ads/opctl/backend/ads_ml_pipeline.py +164 -0
  575. ads/opctl/backend/ads_model_deployment.py +209 -0
  576. ads/opctl/backend/base.py +146 -0
  577. ads/opctl/backend/local.py +1053 -0
  578. ads/opctl/backend/marketplace/__init__.py +9 -0
  579. ads/opctl/backend/marketplace/helm_helper.py +173 -0
  580. ads/opctl/backend/marketplace/local_marketplace.py +271 -0
  581. ads/opctl/backend/marketplace/marketplace_backend_runner.py +71 -0
  582. ads/opctl/backend/marketplace/marketplace_operator_interface.py +44 -0
  583. ads/opctl/backend/marketplace/marketplace_operator_runner.py +24 -0
  584. ads/opctl/backend/marketplace/marketplace_utils.py +212 -0
  585. ads/opctl/backend/marketplace/models/__init__.py +5 -0
  586. ads/opctl/backend/marketplace/models/bearer_token.py +94 -0
  587. ads/opctl/backend/marketplace/models/marketplace_type.py +70 -0
  588. ads/opctl/backend/marketplace/models/ocir_details.py +56 -0
  589. ads/opctl/backend/marketplace/prerequisite_checker.py +238 -0
  590. ads/opctl/cli.py +707 -0
  591. ads/opctl/cmds.py +869 -0
  592. ads/opctl/conda/__init__.py +5 -0
  593. ads/opctl/conda/cli.py +193 -0
  594. ads/opctl/conda/cmds.py +749 -0
  595. ads/opctl/conda/config.yaml +34 -0
  596. ads/opctl/conda/manifest_template.yaml +13 -0
  597. ads/opctl/conda/multipart_uploader.py +188 -0
  598. ads/opctl/conda/pack.py +89 -0
  599. ads/opctl/config/__init__.py +5 -0
  600. ads/opctl/config/base.py +57 -0
  601. ads/opctl/config/diagnostics/__init__.py +5 -0
  602. ads/opctl/config/diagnostics/distributed/default_requirements_config.yaml +62 -0
  603. ads/opctl/config/merger.py +255 -0
  604. ads/opctl/config/resolver.py +297 -0
  605. ads/opctl/config/utils.py +79 -0
  606. ads/opctl/config/validator.py +17 -0
  607. ads/opctl/config/versioner.py +68 -0
  608. ads/opctl/config/yaml_parsers/__init__.py +7 -0
  609. ads/opctl/config/yaml_parsers/base.py +58 -0
  610. ads/opctl/config/yaml_parsers/distributed/__init__.py +7 -0
  611. ads/opctl/config/yaml_parsers/distributed/yaml_parser.py +201 -0
  612. ads/opctl/constants.py +66 -0
  613. ads/opctl/decorator/__init__.py +5 -0
  614. ads/opctl/decorator/common.py +129 -0
  615. ads/opctl/diagnostics/__init__.py +5 -0
  616. ads/opctl/diagnostics/__main__.py +25 -0
  617. ads/opctl/diagnostics/check_distributed_job_requirements.py +212 -0
  618. ads/opctl/diagnostics/check_requirements.py +144 -0
  619. ads/opctl/diagnostics/requirement_exception.py +9 -0
  620. ads/opctl/distributed/README.md +109 -0
  621. ads/opctl/distributed/__init__.py +5 -0
  622. ads/opctl/distributed/certificates.py +32 -0
  623. ads/opctl/distributed/cli.py +207 -0
  624. ads/opctl/distributed/cmds.py +731 -0
  625. ads/opctl/distributed/common/__init__.py +5 -0
  626. ads/opctl/distributed/common/abstract_cluster_provider.py +449 -0
  627. ads/opctl/distributed/common/abstract_framework_spec_builder.py +88 -0
  628. ads/opctl/distributed/common/cluster_config_helper.py +103 -0
  629. ads/opctl/distributed/common/cluster_provider_factory.py +21 -0
  630. ads/opctl/distributed/common/cluster_runner.py +54 -0
  631. ads/opctl/distributed/common/framework_factory.py +29 -0
  632. ads/opctl/docker/Dockerfile.job +103 -0
  633. ads/opctl/docker/Dockerfile.job.arm +107 -0
  634. ads/opctl/docker/Dockerfile.job.gpu +175 -0
  635. ads/opctl/docker/base-env.yaml +13 -0
  636. ads/opctl/docker/cuda.repo +6 -0
  637. ads/opctl/docker/operator/.dockerignore +0 -0
  638. ads/opctl/docker/operator/Dockerfile +41 -0
  639. ads/opctl/docker/operator/Dockerfile.gpu +85 -0
  640. ads/opctl/docker/operator/cuda.repo +6 -0
  641. ads/opctl/docker/operator/environment.yaml +8 -0
  642. ads/opctl/forecast.py +11 -0
  643. ads/opctl/index.yaml +3 -0
  644. ads/opctl/model/__init__.py +5 -0
  645. ads/opctl/model/cli.py +65 -0
  646. ads/opctl/model/cmds.py +73 -0
  647. ads/opctl/operator/README.md +4 -0
  648. ads/opctl/operator/__init__.py +31 -0
  649. ads/opctl/operator/cli.py +344 -0
  650. ads/opctl/operator/cmd.py +596 -0
  651. ads/opctl/operator/common/__init__.py +5 -0
  652. ads/opctl/operator/common/backend_factory.py +460 -0
  653. ads/opctl/operator/common/const.py +27 -0
  654. ads/opctl/operator/common/data/synthetic.csv +16001 -0
  655. ads/opctl/operator/common/dictionary_merger.py +148 -0
  656. ads/opctl/operator/common/errors.py +42 -0
  657. ads/opctl/operator/common/operator_config.py +99 -0
  658. ads/opctl/operator/common/operator_loader.py +811 -0
  659. ads/opctl/operator/common/operator_schema.yaml +130 -0
  660. ads/opctl/operator/common/operator_yaml_generator.py +152 -0
  661. ads/opctl/operator/common/utils.py +208 -0
  662. ads/opctl/operator/lowcode/__init__.py +5 -0
  663. ads/opctl/operator/lowcode/anomaly/MLoperator +16 -0
  664. ads/opctl/operator/lowcode/anomaly/README.md +207 -0
  665. ads/opctl/operator/lowcode/anomaly/__init__.py +5 -0
  666. ads/opctl/operator/lowcode/anomaly/__main__.py +103 -0
  667. ads/opctl/operator/lowcode/anomaly/cmd.py +35 -0
  668. ads/opctl/operator/lowcode/anomaly/const.py +167 -0
  669. ads/opctl/operator/lowcode/anomaly/environment.yaml +10 -0
  670. ads/opctl/operator/lowcode/anomaly/model/__init__.py +5 -0
  671. ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py +146 -0
  672. ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py +162 -0
  673. ads/opctl/operator/lowcode/anomaly/model/automlx.py +99 -0
  674. ads/opctl/operator/lowcode/anomaly/model/autots.py +115 -0
  675. ads/opctl/operator/lowcode/anomaly/model/base_model.py +404 -0
  676. ads/opctl/operator/lowcode/anomaly/model/factory.py +110 -0
  677. ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +78 -0
  678. ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +78 -0
  679. ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +120 -0
  680. ads/opctl/operator/lowcode/anomaly/model/tods.py +119 -0
  681. ads/opctl/operator/lowcode/anomaly/operator_config.py +127 -0
  682. ads/opctl/operator/lowcode/anomaly/schema.yaml +401 -0
  683. ads/opctl/operator/lowcode/anomaly/utils.py +88 -0
  684. ads/opctl/operator/lowcode/common/__init__.py +5 -0
  685. ads/opctl/operator/lowcode/common/const.py +10 -0
  686. ads/opctl/operator/lowcode/common/data.py +116 -0
  687. ads/opctl/operator/lowcode/common/errors.py +47 -0
  688. ads/opctl/operator/lowcode/common/transformations.py +296 -0
  689. ads/opctl/operator/lowcode/common/utils.py +384 -0
  690. ads/opctl/operator/lowcode/feature_store_marketplace/MLoperator +13 -0
  691. ads/opctl/operator/lowcode/feature_store_marketplace/README.md +30 -0
  692. ads/opctl/operator/lowcode/feature_store_marketplace/__init__.py +5 -0
  693. ads/opctl/operator/lowcode/feature_store_marketplace/__main__.py +116 -0
  694. ads/opctl/operator/lowcode/feature_store_marketplace/cmd.py +85 -0
  695. ads/opctl/operator/lowcode/feature_store_marketplace/const.py +15 -0
  696. ads/opctl/operator/lowcode/feature_store_marketplace/environment.yaml +0 -0
  697. ads/opctl/operator/lowcode/feature_store_marketplace/models/__init__.py +4 -0
  698. ads/opctl/operator/lowcode/feature_store_marketplace/models/apigw_config.py +32 -0
  699. ads/opctl/operator/lowcode/feature_store_marketplace/models/db_config.py +43 -0
  700. ads/opctl/operator/lowcode/feature_store_marketplace/models/mysql_config.py +120 -0
  701. ads/opctl/operator/lowcode/feature_store_marketplace/models/serializable_yaml_model.py +34 -0
  702. ads/opctl/operator/lowcode/feature_store_marketplace/operator_utils.py +386 -0
  703. ads/opctl/operator/lowcode/feature_store_marketplace/schema.yaml +160 -0
  704. ads/opctl/operator/lowcode/forecast/MLoperator +25 -0
  705. ads/opctl/operator/lowcode/forecast/README.md +209 -0
  706. ads/opctl/operator/lowcode/forecast/__init__.py +5 -0
  707. ads/opctl/operator/lowcode/forecast/__main__.py +89 -0
  708. ads/opctl/operator/lowcode/forecast/cmd.py +40 -0
  709. ads/opctl/operator/lowcode/forecast/const.py +92 -0
  710. ads/opctl/operator/lowcode/forecast/environment.yaml +20 -0
  711. ads/opctl/operator/lowcode/forecast/errors.py +26 -0
  712. ads/opctl/operator/lowcode/forecast/model/__init__.py +5 -0
  713. ads/opctl/operator/lowcode/forecast/model/arima.py +279 -0
  714. ads/opctl/operator/lowcode/forecast/model/automlx.py +553 -0
  715. ads/opctl/operator/lowcode/forecast/model/autots.py +312 -0
  716. ads/opctl/operator/lowcode/forecast/model/base_model.py +875 -0
  717. ads/opctl/operator/lowcode/forecast/model/factory.py +106 -0
  718. ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +492 -0
  719. ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +243 -0
  720. ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +482 -0
  721. ads/opctl/operator/lowcode/forecast/model/prophet.py +445 -0
  722. ads/opctl/operator/lowcode/forecast/model_evaluator.py +244 -0
  723. ads/opctl/operator/lowcode/forecast/operator_config.py +234 -0
  724. ads/opctl/operator/lowcode/forecast/schema.yaml +506 -0
  725. ads/opctl/operator/lowcode/forecast/utils.py +397 -0
  726. ads/opctl/operator/lowcode/forecast/whatifserve/__init__.py +7 -0
  727. ads/opctl/operator/lowcode/forecast/whatifserve/deployment_manager.py +285 -0
  728. ads/opctl/operator/lowcode/forecast/whatifserve/score.py +246 -0
  729. ads/opctl/operator/lowcode/pii/MLoperator +17 -0
  730. ads/opctl/operator/lowcode/pii/README.md +208 -0
  731. ads/opctl/operator/lowcode/pii/__init__.py +5 -0
  732. ads/opctl/operator/lowcode/pii/__main__.py +78 -0
  733. ads/opctl/operator/lowcode/pii/cmd.py +39 -0
  734. ads/opctl/operator/lowcode/pii/constant.py +84 -0
  735. ads/opctl/operator/lowcode/pii/environment.yaml +17 -0
  736. ads/opctl/operator/lowcode/pii/errors.py +27 -0
  737. ads/opctl/operator/lowcode/pii/model/__init__.py +5 -0
  738. ads/opctl/operator/lowcode/pii/model/factory.py +82 -0
  739. ads/opctl/operator/lowcode/pii/model/guardrails.py +167 -0
  740. ads/opctl/operator/lowcode/pii/model/pii.py +145 -0
  741. ads/opctl/operator/lowcode/pii/model/processor/__init__.py +34 -0
  742. ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py +34 -0
  743. ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py +35 -0
  744. ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py +225 -0
  745. ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py +73 -0
  746. ads/opctl/operator/lowcode/pii/model/processor/remover.py +26 -0
  747. ads/opctl/operator/lowcode/pii/model/report.py +487 -0
  748. ads/opctl/operator/lowcode/pii/operator_config.py +95 -0
  749. ads/opctl/operator/lowcode/pii/schema.yaml +108 -0
  750. ads/opctl/operator/lowcode/pii/utils.py +43 -0
  751. ads/opctl/operator/lowcode/recommender/MLoperator +16 -0
  752. ads/opctl/operator/lowcode/recommender/README.md +206 -0
  753. ads/opctl/operator/lowcode/recommender/__init__.py +5 -0
  754. ads/opctl/operator/lowcode/recommender/__main__.py +82 -0
  755. ads/opctl/operator/lowcode/recommender/cmd.py +33 -0
  756. ads/opctl/operator/lowcode/recommender/constant.py +30 -0
  757. ads/opctl/operator/lowcode/recommender/environment.yaml +11 -0
  758. ads/opctl/operator/lowcode/recommender/model/base_model.py +212 -0
  759. ads/opctl/operator/lowcode/recommender/model/factory.py +56 -0
  760. ads/opctl/operator/lowcode/recommender/model/recommender_dataset.py +25 -0
  761. ads/opctl/operator/lowcode/recommender/model/svd.py +106 -0
  762. ads/opctl/operator/lowcode/recommender/operator_config.py +81 -0
  763. ads/opctl/operator/lowcode/recommender/schema.yaml +265 -0
  764. ads/opctl/operator/lowcode/recommender/utils.py +13 -0
  765. ads/opctl/operator/runtime/__init__.py +5 -0
  766. ads/opctl/operator/runtime/const.py +17 -0
  767. ads/opctl/operator/runtime/container_runtime_schema.yaml +50 -0
  768. ads/opctl/operator/runtime/marketplace_runtime.py +50 -0
  769. ads/opctl/operator/runtime/python_marketplace_runtime_schema.yaml +21 -0
  770. ads/opctl/operator/runtime/python_runtime_schema.yaml +21 -0
  771. ads/opctl/operator/runtime/runtime.py +115 -0
  772. ads/opctl/schema.yaml.yml +36 -0
  773. ads/opctl/script.py +40 -0
  774. ads/opctl/spark/__init__.py +5 -0
  775. ads/opctl/spark/cli.py +43 -0
  776. ads/opctl/spark/cmds.py +147 -0
  777. ads/opctl/templates/diagnostic_report_template.jinja2 +102 -0
  778. ads/opctl/utils.py +344 -0
  779. ads/oracledb/__init__.py +5 -0
  780. ads/oracledb/oracle_db.py +346 -0
  781. ads/pipeline/__init__.py +39 -0
  782. ads/pipeline/ads_pipeline.py +2279 -0
  783. ads/pipeline/ads_pipeline_run.py +772 -0
  784. ads/pipeline/ads_pipeline_step.py +605 -0
  785. ads/pipeline/builders/__init__.py +5 -0
  786. ads/pipeline/builders/infrastructure/__init__.py +5 -0
  787. ads/pipeline/builders/infrastructure/custom_script.py +32 -0
  788. ads/pipeline/cli.py +119 -0
  789. ads/pipeline/extension.py +291 -0
  790. ads/pipeline/schema/__init__.py +5 -0
  791. ads/pipeline/schema/cs_step_schema.json +35 -0
  792. ads/pipeline/schema/ml_step_schema.json +31 -0
  793. ads/pipeline/schema/pipeline_schema.json +71 -0
  794. ads/pipeline/visualizer/__init__.py +5 -0
  795. ads/pipeline/visualizer/base.py +570 -0
  796. ads/pipeline/visualizer/graph_renderer.py +272 -0
  797. ads/pipeline/visualizer/text_renderer.py +84 -0
  798. ads/secrets/__init__.py +11 -0
  799. ads/secrets/adb.py +386 -0
  800. ads/secrets/auth_token.py +86 -0
  801. ads/secrets/big_data_service.py +365 -0
  802. ads/secrets/mysqldb.py +149 -0
  803. ads/secrets/oracledb.py +160 -0
  804. ads/secrets/secrets.py +407 -0
  805. ads/telemetry/__init__.py +7 -0
  806. ads/telemetry/base.py +69 -0
  807. ads/telemetry/client.py +125 -0
  808. ads/telemetry/telemetry.py +257 -0
  809. ads/templates/dataflow_pyspark.jinja2 +13 -0
  810. ads/templates/dataflow_sparksql.jinja2 +22 -0
  811. ads/templates/func.jinja2 +20 -0
  812. ads/templates/schemas/openapi.json +1740 -0
  813. ads/templates/score-pkl.jinja2 +173 -0
  814. ads/templates/score.jinja2 +322 -0
  815. ads/templates/score_embedding_onnx.jinja2 +202 -0
  816. ads/templates/score_generic.jinja2 +165 -0
  817. ads/templates/score_huggingface_pipeline.jinja2 +217 -0
  818. ads/templates/score_lightgbm.jinja2 +185 -0
  819. ads/templates/score_onnx.jinja2 +407 -0
  820. ads/templates/score_onnx_new.jinja2 +473 -0
  821. ads/templates/score_oracle_automl.jinja2 +185 -0
  822. ads/templates/score_pyspark.jinja2 +154 -0
  823. ads/templates/score_pytorch.jinja2 +219 -0
  824. ads/templates/score_scikit-learn.jinja2 +184 -0
  825. ads/templates/score_tensorflow.jinja2 +184 -0
  826. ads/templates/score_xgboost.jinja2 +178 -0
  827. ads/text_dataset/__init__.py +5 -0
  828. ads/text_dataset/backends.py +211 -0
  829. ads/text_dataset/dataset.py +445 -0
  830. ads/text_dataset/extractor.py +207 -0
  831. ads/text_dataset/options.py +53 -0
  832. ads/text_dataset/udfs.py +22 -0
  833. ads/text_dataset/utils.py +49 -0
  834. ads/type_discovery/__init__.py +9 -0
  835. ads/type_discovery/abstract_detector.py +21 -0
  836. ads/type_discovery/constant_detector.py +41 -0
  837. ads/type_discovery/continuous_detector.py +54 -0
  838. ads/type_discovery/credit_card_detector.py +99 -0
  839. ads/type_discovery/datetime_detector.py +92 -0
  840. ads/type_discovery/discrete_detector.py +118 -0
  841. ads/type_discovery/document_detector.py +146 -0
  842. ads/type_discovery/ip_detector.py +68 -0
  843. ads/type_discovery/latlon_detector.py +90 -0
  844. ads/type_discovery/phone_number_detector.py +63 -0
  845. ads/type_discovery/type_discovery_driver.py +87 -0
  846. ads/type_discovery/typed_feature.py +594 -0
  847. ads/type_discovery/unknown_detector.py +41 -0
  848. ads/type_discovery/zipcode_detector.py +48 -0
  849. ads/vault/__init__.py +7 -0
  850. ads/vault/vault.py +237 -0
  851. {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.9rc1.dist-info}/METADATA +150 -150
  852. oracle_ads-2.13.9rc1.dist-info/RECORD +858 -0
  853. {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.9rc1.dist-info}/WHEEL +1 -2
  854. {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.9rc1.dist-info}/entry_points.txt +2 -1
  855. oracle_ads-2.13.9rc0.dist-info/RECORD +0 -9
  856. oracle_ads-2.13.9rc0.dist-info/top_level.txt +0 -1
  857. {oracle_ads-2.13.9rc0.dist-info → oracle_ads-2.13.9rc1.dist-info}/licenses/LICENSE.txt +0 -0
ads/dataset/dataset.py ADDED
@@ -0,0 +1,1979 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*--
3
+
4
+ # Copyright (c) 2020, 2024 Oracle and/or its affiliates.
5
+ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
6
+
7
+ from __future__ import print_function, absolute_import, division
8
+
9
+ import copy
10
+ import datetime
11
+ import fsspec
12
+ import numpy as np
13
+ import os
14
+ import pandas as pd
15
+ import uuid
16
+
17
+ from collections import Counter
18
+ from sklearn.preprocessing import FunctionTransformer
19
+ from typing import Iterable, Tuple, Union
20
+
21
+ from ads import set_documentation_mode
22
+ from ads.common import utils
23
+ from ads.common.decorator.deprecate import deprecated
24
+ from ads.dataset import helper, logger
25
+ from ads.dataset.dataframe_transformer import DataFrameTransformer
26
+ from ads.dataset.exception import ValidationError
27
+ from ads.dataset.helper import (
28
+ convert_columns,
29
+ fix_column_names,
30
+ generate_sample,
31
+ DatasetDefaults,
32
+ deprecate_default_value,
33
+ deprecate_variable,
34
+ get_dataset,
35
+ infer_target_type,
36
+ )
37
+ from ads.dataset.label_encoder import DataFrameLabelEncoder
38
+ from ads.dataset.pipeline import TransformerPipeline
39
+ from ads.dataset.progress import DummyProgressBar
40
+ from ads.dataset.sampled_dataset import PandasDataset
41
+ from ads.type_discovery.type_discovery_driver import TypeDiscoveryDriver
42
+ from ads.dataset.helper import get_feature_type
43
+ from ads.dataset.correlation_plot import plot_correlation_heatmap
44
+ from ads.dataset.correlation import (
45
+ _cat_vs_cts,
46
+ _cat_vs_cat,
47
+ _get_columns_by_type,
48
+ _validate_correlation_methods,
49
+ )
50
+ from ads.common.decorator.runtime_dependency import (
51
+ runtime_dependency,
52
+ OptionalDependency,
53
+ )
54
+
55
+ N_Features_Wide_Dataset = 64
56
+
57
+
58
+ pd.set_option("display.max_colwidth", None)
59
+
60
+
61
+ class ADSDataset(PandasDataset):
62
+ """
63
+ An ADSDataset Object.
64
+
65
+ The ADSDataset object cannot be used for classification or regression problems until a
66
+ target has been set using `set_target`. To see some rows in the data use any of the usual
67
+ Pandas functions like `head()`. There are also a variety of converters, to_dask,
68
+ to_pandas, to_h2o, to_xgb, to_csv, to_parquet, to_json & to_hdf .
69
+ """
70
+
71
+ df_read_functions = ["head", "describe", "_get_numeric_data"]
72
+
73
+ def __init__(
74
+ self,
75
+ df,
76
+ sampled_df=None,
77
+ shape=None,
78
+ name="",
79
+ description=None,
80
+ type_discovery=True,
81
+ types={},
82
+ metadata=None,
83
+ progress=DummyProgressBar(),
84
+ transformer_pipeline=None,
85
+ interactive=False,
86
+ **kwargs,
87
+ ):
88
+ #
89
+ # to keep performance high and linear no matter the size of the distributed dataset we
90
+ # create a pandas df that's used internally because this has a fixed upper size.
91
+ #
92
+ if shape is None:
93
+ shape = df.shape
94
+
95
+ if sampled_df is None:
96
+ sampled_df = generate_sample(
97
+ df,
98
+ shape[0],
99
+ DatasetDefaults.sampling_confidence_level,
100
+ DatasetDefaults.sampling_confidence_interval,
101
+ **kwargs,
102
+ )
103
+ super().__init__(
104
+ sampled_df,
105
+ type_discovery=type_discovery,
106
+ types=types,
107
+ metadata=metadata,
108
+ progress=progress,
109
+ )
110
+ self.df = fix_column_names(df)
111
+
112
+ self.name = name
113
+ self.description = description
114
+ self.shape = shape
115
+ # store these args to reapply when building a new dataset for delegate operations on dataframe
116
+ self.init_kwargs = {**kwargs, "type_discovery": type_discovery}
117
+ if transformer_pipeline is None:
118
+ # Update transformer pipeline to convert column types and fix names
119
+ self.transformer_pipeline = TransformerPipeline(
120
+ steps=[
121
+ (
122
+ "prepare",
123
+ FunctionTransformer(func=fix_column_names, validate=False),
124
+ )
125
+ ]
126
+ )
127
+ self.transformer_pipeline = self._update_transformer_pipeline(
128
+ steps=[
129
+ (
130
+ "type_discovery",
131
+ FunctionTransformer(
132
+ func=convert_columns,
133
+ validate=False,
134
+ kw_args={"dtypes": self.sampled_df.dtypes},
135
+ ),
136
+ )
137
+ ]
138
+ )
139
+ else:
140
+ self.transformer_pipeline = transformer_pipeline
141
+
142
+ def __repr__(self):
143
+ rows, cols = self.shape
144
+ return f"{rows:,} rows, {cols:,} columns"
145
+
146
+ def __len__(self):
147
+ return self.shape[0]
148
+
149
+ @staticmethod
150
+ def from_dataframe(
151
+ df,
152
+ sampled_df=None,
153
+ shape=None,
154
+ name="",
155
+ description=None,
156
+ type_discovery=True,
157
+ types={},
158
+ metadata=None,
159
+ progress=DummyProgressBar(),
160
+ transformer_pipeline=None,
161
+ interactive=False,
162
+ **kwargs,
163
+ ) -> "ADSDataset":
164
+ return ADSDataset(
165
+ df=df,
166
+ sampled_df=sampled_df,
167
+ shape=shape,
168
+ name=name,
169
+ description=description,
170
+ type_discovery=type_discovery,
171
+ types=types,
172
+ metadata=metadata,
173
+ progress=progress,
174
+ transformer_pipeline=transformer_pipeline,
175
+ interactive=interactive,
176
+ **kwargs,
177
+ )
178
+
179
+ @property
180
+ @deprecated(
181
+ "2.5.2", details="The ddf attribute is deprecated. Use the df attribute."
182
+ )
183
+ def ddf(self):
184
+ return self.df
185
+
186
+ @deprecated(
187
+ "2.5.2", details="The compute method is deprecated. Use the df attribute."
188
+ )
189
+ def compute(self):
190
+ return self.df
191
+
192
+ @runtime_dependency(
193
+ module="ipywidgets", object="HTML", install_from=OptionalDependency.NOTEBOOK
194
+ )
195
+ @runtime_dependency(module="IPython", install_from=OptionalDependency.NOTEBOOK)
196
+ def _repr_html_(self):
197
+ from IPython.core.display import display, HTML
198
+
199
+ display(
200
+ HTML(
201
+ utils.horizontal_scrollable_div(
202
+ self.sampled_df.head(5)
203
+ .style.set_table_styles(utils.get_dataframe_styles())
204
+ .set_table_attributes("class=table")
205
+ .hide()
206
+ .to_html()
207
+ )
208
+ )
209
+ )
210
+
211
+ def _head(self, n=5):
212
+ """
213
+ Return the first `n` rows of the dataset.
214
+
215
+ Parameters
216
+ ----------
217
+ n : int, default 5
218
+ Number of rows to select.
219
+
220
+ Returns
221
+ -------
222
+ dataset_head : pandas.DataFrame
223
+ The first `n` rows of the dataset
224
+
225
+ Examples
226
+ --------
227
+ >>> import pandas as pd
228
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("classfication_data.csv"))
229
+ >>> ds.head()
230
+ * displays the first 5 rows of the dataset, just as the traditional head() function would *
231
+ """
232
+ df = self.df.head(n=n)
233
+
234
+ #
235
+ # we could just return the above but, jupyterlab doesn't render these well
236
+ # when the width exceeds the screen area. To address that we wrap the dataframe
237
+ # with a class that has an optimized _repr_html_ handler, this object
238
+ # extends the pandas dataframe so it can still be used as-a dataframe
239
+ #
240
+ class FormattedDataFrame(pd.DataFrame):
241
+ def __init__(self, *args, **kwargs):
242
+ super(FormattedDataFrame, self).__init__(*args, **kwargs)
243
+
244
+ @property
245
+ def _constructor(self):
246
+ return FormattedDataFrame
247
+
248
+ @runtime_dependency(
249
+ module="ipywidgets",
250
+ object="HTML",
251
+ install_from=OptionalDependency.NOTEBOOK,
252
+ )
253
+ @runtime_dependency(
254
+ module="IPython", install_from=OptionalDependency.NOTEBOOK
255
+ )
256
+ def _repr_html_(self):
257
+ from IPython.core.display import display, HTML
258
+
259
+ display(
260
+ HTML(
261
+ utils.horizontal_scrollable_div(
262
+ self.style.set_table_styles(utils.get_dataframe_styles())
263
+ .set_table_attributes("class=table")
264
+ .hide()
265
+ .to_html()
266
+ )
267
+ )
268
+ )
269
+ return None
270
+
271
+ def __repr__(self):
272
+ return "{} rows, {} columns".format(*self.shape)
273
+
274
+ return FormattedDataFrame(df)
275
+
276
+ def call(self, func, *args, sample_size=None, **kwargs):
277
+ r"""
278
+ Runs a custom function on dataframe
279
+
280
+ func will receive the pandas dataframe (which represents the dataset) as an argument named 'df' by default.
281
+ This can be overridden by specifying the dataframe argument name in a tuple (func, dataframe_name).
282
+
283
+ Parameters
284
+ ----------
285
+ func: Union[callable, tuple]
286
+ Custom function that takes pandas dataframe as input
287
+ Alternatively a (callable, data) tuple where data is a string indicating the keyword of callable
288
+ that expects the dataframe name
289
+ args: iterable, optional
290
+ Positional arguments passed into func
291
+ sample_size: int, Optional
292
+ To use a sampled dataframe
293
+ kwargs: mapping, optional
294
+ A dictionary of keyword arguments passed into func
295
+
296
+ Returns
297
+ -------
298
+ func: function
299
+ a plotting function that contains `*args` and `**kwargs`
300
+
301
+ Examples
302
+ --------
303
+ >>> import pandas as pd
304
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("classfication_data.csv"))
305
+ >>> def f1(df):
306
+ ... return(sum(df), axis=0)
307
+ >>> sum_ds = ds.call(f1)
308
+ """
309
+
310
+ data = "df"
311
+ if isinstance(func, tuple):
312
+ func, data = func
313
+ if data in kwargs:
314
+ raise ValueError(
315
+ "'%s' is both the data argument and a keyword argument" % data
316
+ )
317
+
318
+ if sample_size is None:
319
+ # user has asked not to do sampling
320
+ df = self.df.copy()
321
+ else:
322
+ df = self.df.sample(n=sample_size)
323
+ kwargs[data] = df
324
+ return func(*args, **kwargs)
325
+
326
+ def set_target(self, target, type_discovery=True, target_type=None):
327
+ """
328
+ Returns a dataset tagged based on the type of target.
329
+
330
+ Parameters
331
+ ----------
332
+ target: str
333
+ name of the feature to use as target.
334
+ type_discovery: bool
335
+ This is set as True by default.
336
+ target_type: type
337
+ If provided, then the target will be typed with the provided value.
338
+
339
+ Returns
340
+ -------
341
+ ds: ADSDataset
342
+ tagged according to the type of the target column.
343
+
344
+ Examples
345
+ --------
346
+ >>> import pandas as pd
347
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("classfication_data.csv"))
348
+ >>> ds_with_target= ds.set_target("target_class")
349
+ """
350
+ if target_type:
351
+ target_series = self.sampled_df[target].astype(target_type)
352
+ else:
353
+ target_series = self.sampled_df[target]
354
+ return get_dataset(
355
+ self.df,
356
+ self.sampled_df,
357
+ target,
358
+ infer_target_type(target, target_series, type_discovery),
359
+ self.shape,
360
+ **self.init_kwargs,
361
+ )
362
+
363
+ @deprecated("2.5.2", details="Instead use `to_pandas`.")
364
+ def to_pandas_dataframe(
365
+ self, filter=None, frac=None, include_transformer_pipeline=False
366
+ ):
367
+ return self.to_pandas(
368
+ filter=filter,
369
+ frac=frac,
370
+ include_transformer_pipeline=include_transformer_pipeline,
371
+ )
372
+
373
+ def to_pandas(self, filter=None, frac=None, include_transformer_pipeline=False):
374
+ """
375
+ Returns a copy of the data as pandas.DataFrame, and a sklearn pipeline optionally that holds the
376
+ transformations run so far on the data.
377
+
378
+ The pipeline returned can be updated with the transformations done offline and passed along with the
379
+ dataframe to Dataset.open API if the transformations need to be reproduced at the time of scoring.
380
+
381
+ Parameters
382
+ ----------
383
+ filter: str, optional
384
+ The query string to filter the dataframe, for example
385
+ ds.to_pandas(filter="age > 50 and location == 'san francisco")
386
+ See also https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
387
+ frac: float, optional
388
+ fraction of original data to return.
389
+ include_transformer_pipeline: bool, default: False
390
+ If True, (dataframe, transformer_pipeline) is returned as a tuple
391
+
392
+ Returns
393
+ -------
394
+ dataframe : pandas.DataFrame
395
+ if include_transformer_pipeline is False.
396
+ (data, transformer_pipeline): tuple of pandas.DataFrame and dataset.pipeline.TransformerPipeline
397
+ if include_transformer_pipeline is True.
398
+
399
+ Examples
400
+ --------
401
+ >>> import pandas as pd
402
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
403
+ >>> ds_as_df = ds.to_pandas()
404
+
405
+ Notes
406
+ -----
407
+ See also https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
408
+ """
409
+ df = self.df.query(filter) if filter is not None else self.df.copy()
410
+ if frac is not None:
411
+ df = df.sample(frac=frac)
412
+ return (
413
+ (df, copy.deepcopy(self.transformer_pipeline))
414
+ if include_transformer_pipeline
415
+ else df
416
+ )
417
+
418
+ @deprecated("2.5.2", details="Instead use `to_dask`.")
419
+ def to_dask_dataframe(
420
+ self,
421
+ filter=None,
422
+ frac=None,
423
+ npartitions=None,
424
+ include_transformer_pipeline=False,
425
+ ):
426
+ return self.to_dask(
427
+ filter=filter,
428
+ frac=frac,
429
+ npartitions=npartitions,
430
+ include_transformer_pipeline=include_transformer_pipeline,
431
+ )
432
+
433
+ @runtime_dependency(module="dask.dataframe", short_name="dd")
434
+ def to_dask(
435
+ self,
436
+ filter=None,
437
+ frac=None,
438
+ npartitions=None,
439
+ include_transformer_pipeline=False,
440
+ ):
441
+ """
442
+ Returns a copy of the data as dask.dataframe.core.DataFrame, and a sklearn pipeline optionally that holds the
443
+ transformations run so far on the data.
444
+
445
+ The pipeline returned can be updated with the transformations done offline and passed along with the
446
+ dataframe to Dataset.open API if the transformations need to be reproduced at the time of scoring.
447
+
448
+ Parameters
449
+ ----------
450
+ filter: str, optional
451
+ The query string to filter the dataframe, for example
452
+ ds.to_dask(filter="age > 50 and location == 'san francisco")
453
+ See also https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
454
+ frac: float, optional
455
+ fraction of original data to return.
456
+ include_transformer_pipeline: bool, default: False
457
+ If True, (dataframe, transformer_pipeline) is returned as a tuple.
458
+
459
+ Returns
460
+ -------
461
+ dataframe : dask.dataframe.core.DataFrame
462
+ if include_transformer_pipeline is False.
463
+ (data, transformer_pipeline): tuple of dask.dataframe.core.DataFrame and dataset.pipeline.TransformerPipeline
464
+ if include_transformer_pipeline is True.
465
+
466
+ Examples
467
+ --------
468
+ >>> import pandas as pd
469
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
470
+ >>> ds_dask = ds.to_dask()
471
+
472
+ Notes
473
+ -----
474
+ See also http://docs.dask.org/en/latest/dataframe-api.html#dataframe and
475
+ https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
476
+
477
+ """
478
+ res = self.to_pandas(
479
+ filter=filter,
480
+ frac=frac,
481
+ include_transformer_pipeline=include_transformer_pipeline,
482
+ )
483
+ return (
484
+ (dd.from_pandas(res[0], npartitions=npartitions), res[1])
485
+ if include_transformer_pipeline
486
+ else dd.from_pandas(res, npartitions=npartitions)
487
+ )
488
+
489
+ @deprecated("2.5.2", details="Instead use `to_h2o`.")
490
+ def to_h2o_dataframe(
491
+ self, filter=None, frac=None, include_transformer_pipeline=False
492
+ ):
493
+ return self.to_h2o(
494
+ filter=filter,
495
+ frac=frac,
496
+ include_transformer_pipeline=include_transformer_pipeline,
497
+ )
498
+
499
+ @runtime_dependency(module="h2o")
500
+ def to_h2o(self, filter=None, frac=None, include_transformer_pipeline=False):
501
+ """
502
+ Returns a copy of the data as h2o.H2OFrame, and a sklearn pipeline optionally that holds the
503
+ transformations run so far on the data.
504
+
505
+ The pipeline returned can be updated with the transformations done offline and passed along with the
506
+ dataframe to Dataset.open API if the transformations need to be reproduced at the time of scoring.
507
+
508
+ Parameters
509
+ ----------
510
+ filter: str, optional
511
+ The query string to filter the dataframe, for example
512
+ ds.to_h2o(filter="age > 50 and location == 'san francisco")
513
+ See also https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
514
+ frac: float, optional
515
+ fraction of original data to return.
516
+ include_transformer_pipeline: bool, default: False
517
+ If True, (dataframe, transformer_pipeline) is returned as a tuple.
518
+
519
+ Returns
520
+ -------
521
+ dataframe : h2o.H2OFrame
522
+ if include_transformer_pipeline is False.
523
+ (data, transformer_pipeline): tuple of h2o.H2OFrame and dataset.pipeline.TransformerPipeline
524
+ if include_transformer_pipeline is True.
525
+
526
+ Examples
527
+ --------
528
+ >>> import pandas as pd
529
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
530
+ >>> ds_as_h2o = ds.to_h2o()
531
+
532
+ Notes
533
+ -----
534
+ See also https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
535
+ """
536
+ res = self.to_pandas(
537
+ filter=filter,
538
+ frac=frac,
539
+ include_transformer_pipeline=include_transformer_pipeline,
540
+ )
541
+ return (
542
+ (h2o.H2OFrame(res[0]), res[1])
543
+ if include_transformer_pipeline
544
+ else h2o.H2OFrame(res)
545
+ )
546
+
547
+ @deprecated("2.5.2", details="Instead use `to_xgb`.")
548
+ def to_xgb_dmatrix(
549
+ self, filter=None, frac=None, include_transformer_pipeline=False
550
+ ):
551
+ return self.to_xgb(
552
+ filter=filter,
553
+ frac=frac,
554
+ include_transformer_pipeline=include_transformer_pipeline,
555
+ )
556
+
557
+ @runtime_dependency(module="xgboost", install_from=OptionalDependency.BOOSTED)
558
+ def to_xgb(self, filter=None, frac=None, include_transformer_pipeline=False):
559
+ """
560
+ Returns a copy of the data as xgboost.DMatrix, and a sklearn pipeline optionally that holds the
561
+ transformations run so far on the data.
562
+
563
+ The pipeline returned can be updated with the transformations done offline and passed along with the
564
+ dataframe to Dataset.open API if the transformations need to be reproduced at the time of scoring.
565
+
566
+ Parameters
567
+ ----------
568
+ filter: str, optional
569
+ The query string to filter the dataframe, for example
570
+ ds.to_xgb(filter="age > 50 and location == 'san francisco")
571
+ See also https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
572
+ frac: float, optional
573
+ fraction of original data to return.
574
+ include_transformer_pipeline: bool, default: False
575
+ If True, (dataframe, transformer_pipeline) is returned as a tuple.
576
+
577
+ Returns
578
+ -------
579
+ dataframe : xgboost.DMatrix
580
+ if include_transformer_pipeline is False.
581
+ (data, transformer_pipeline): tuple of xgboost.DMatrix and dataset.pipeline.TransformerPipeline
582
+ if include_transformer_pipeline is True.
583
+
584
+ Examples
585
+ --------
586
+ >>> import pandas as pd
587
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
588
+ >>> xgb_dmat = ds.to_xgb()
589
+
590
+ Notes
591
+ -----
592
+ See also https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
593
+ """
594
+ res = self.to_pandas(
595
+ filter=filter,
596
+ frac=frac,
597
+ include_transformer_pipeline=include_transformer_pipeline,
598
+ )
599
+ df = res[0] if include_transformer_pipeline else res
600
+ le = DataFrameLabelEncoder()
601
+ df = le.fit_transform(df)
602
+ if include_transformer_pipeline:
603
+ res[1].add(le)
604
+ xgb_matrix = xgboost.DMatrix(df)
605
+ return (xgb_matrix, res[1]) if include_transformer_pipeline else xgb_matrix
606
+
607
+ def sample(self, frac=None, random_state=utils.random_state):
608
+ """
609
+ Returns random sample of dataset.
610
+
611
+ Parameters
612
+ ----------
613
+ frac : float, optional
614
+ Fraction of axis items to return.
615
+ random_state : int or ``np.random.RandomState``
616
+ If int we create a new RandomState with this as the seed
617
+ Otherwise we draw from the passed RandomState
618
+
619
+ Returns
620
+ -------
621
+ sampled_dataset: ADSDataset
622
+ An ADSDataset which was randomly sampled.
623
+
624
+ Examples
625
+ --------
626
+ >>> import pandas as pd
627
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
628
+ >>> ds_sample = ds.sample()
629
+ """
630
+ df = self.df.sample(frac=frac, random_state=random_state)
631
+ return self._build_new_dataset(df)
632
+
633
+ def drop_columns(self, columns):
634
+ """
635
+ Return new dataset with specified columns removed.
636
+
637
+ Parameters
638
+ ----------
639
+ columns : str or list
640
+ columns to drop.
641
+
642
+ Returns
643
+ -------
644
+ dataset: same type as the caller
645
+ a dataset with specified columns dropped.
646
+
647
+ Raises
648
+ ------
649
+ ValidationError
650
+ If any of the feature names is not found in the dataset.
651
+
652
+ Examples
653
+ --------
654
+ >>> import pandas as pd
655
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
656
+ >>> ds_smaller = ds.drop_columns(['col1', 'col2'])
657
+ """
658
+ self._validate_feature(columns)
659
+ return self.drop(columns, axis=1)
660
+
661
    def assign_column(self, column, arg):
        """
        Return new dataset with new column or values of the existing column mapped according to input correspondence.

        Used for adding a new column or substituting each value in a column with another value, that may be derived from
        a function, a :class:`pandas.Series` or a :class:`pandas.DataFrame`.

        Parameters
        ----------
        column : str
            Name of the feature to update.
        arg : function, dict, Series or DataFrame
            Mapping correspondence.

        Returns
        -------
        dataset: same type as the caller
            a dataset with the specified column assigned.

        Examples
        --------
        >>> import pandas as pd
        >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
        >>> ds_same_size = ds.assign_column('target', lambda x: x > 15 if x is not None else None)
        >>> ds_bigger = ds.assign_column('new_col', np.arange(ds.shape[0]))
        """
        # Only targeted dataset subclasses carry `self.target`; for a plain
        # ADSDataset there is no target column to preserve.
        target_name = (
            self.target.name if not utils.is_same_class(self, ADSDataset) else None
        )
        if isinstance(arg, Iterable) or isinstance(arg, ADSDataset):
            df = self.df.copy()
            # Normalize every supported column-like input into a dataframe;
            # single columns are temporarily named "new_col" and renamed below.
            if type(arg) == pd.DataFrame:
                col_to_add = arg
            elif type(arg) == ADSDataset:
                col_to_add = arg.df
            elif type(arg) == dict:
                col_to_add = pd.DataFrame.from_dict(arg)
            elif type(arg) in [list, np.ndarray]:
                col_to_add = pd.DataFrame(arg, columns=["new_col"])
            elif type(arg) == pd.Series:
                col_to_add = arg.rename("new_col").to_frame()
            elif utils._is_dask_dataframe(arg):
                col_to_add = arg.compute()
            elif utils._is_dask_series(arg):
                col_to_add = arg.compute().rename("new_col").to_frame()
            else:
                raise ValueError(
                    f"assign_column currently does not support arg of type {type(arg)}. Reformat "
                    f"as types: Pandas, numpy, list, or dict"
                )
            # When replacing an existing column, drop it first so the concat
            # below does not produce a duplicate column name.
            if column in df.columns:
                df = df.drop(columns=column)
            new_df = pd.concat([df, col_to_add], axis=1).rename(
                columns={"new_col": column}
            )
            return self._build_new_dataset(new_df)

        else:
            # `arg` is applied element-wise (function/mapping); keep both the
            # full dataframe and the cached sample in sync.
            sampled_df = self.sampled_df.copy()
            df = self.df.copy()
            sampled_df[column] = sampled_df[column].apply(arg)
            df[column] = df[column].apply(arg)
            if column == target_name:
                # The mapping may have changed the target's feature type, so
                # re-discover it from the transformed sample.
                target_type = get_feature_type(target_name, sampled_df[target_name])
                return self._build_new_dataset(
                    df, sampled_df, target=target_name, target_type=target_type
                )
            else:
                return self._build_new_dataset(
                    df,
                    sampled_df,
                    target=target_name,
                    target_type=self.target.type
                    if target_name != column and target_name is not None
                    else None,
                )
737
+
738
+ def rename_columns(self, columns):
739
+ """
740
+ Returns a new dataset with altered column names.
741
+
742
+ dict values must be unique (1-to-1). Labels not contained in a dict will be left as-is.
743
+ Extra labels listed don't throw an error.
744
+
745
+ Parameters
746
+ ----------
747
+ columns: dict-like or function or list of str
748
+ dict to rename columns selectively, or list of names to rename all columns, or a function like
749
+ str.upper
750
+
751
+ Returns
752
+ -------
753
+ dataset: same type as the caller
754
+ A dataset with specified columns renamed.
755
+
756
+ Examples
757
+ --------
758
+ >>> import pandas as pd
759
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
760
+ >>> ds_renamed = ds.rename_columns({'col1': 'target'})
761
+ """
762
+ if isinstance(columns, list):
763
+ assert len(columns) == len(
764
+ self.columns.values
765
+ ), "columns length do not match the dataset"
766
+ columns = dict(zip(self.columns.values, columns))
767
+ return self.rename(columns=columns)
768
+
769
+ def set_name(self, name):
770
+ """
771
+ Sets name for the dataset.
772
+
773
+ This name will be used to filter the datasets returned by ds.list() API.
774
+ Calling this API is optional. By default name of the dataset is set to empty.
775
+
776
+ Parameters
777
+ ----------
778
+ name: str
779
+ Name of the dataset.
780
+
781
+ Examples
782
+ --------
783
+ >>> import pandas as pd
784
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data1.csv"))
785
+ >>> ds_renamed = ds.set_name("dataset1")
786
+ """
787
+ self.name = name
788
+
789
+ def set_description(self, description):
790
+ """
791
+ Sets description for the dataset.
792
+
793
+ Give your dataset a description.
794
+
795
+ Parameters
796
+ ----------
797
+ description: str
798
+ Description of the dataset.
799
+
800
+ Examples
801
+ --------
802
+ >>> import pandas as pd
803
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data1.csv"))
804
+ >>> ds_renamed = ds.set_description("dataset1 is from "data1.csv"")
805
+ """
806
+ self.description = description
807
+
808
    def snapshot(self, snapshot_dir=None, name="", storage_options=None):
        """
        Snapshot the dataset with modifications made so far.

        Optionally caller can invoke ds.set_name() before saving to identify the dataset uniquely at the time of
        using ds.list().

        The snapshot can be reloaded by providing the URI returned by this API to DatasetFactory.open()

        Parameters
        ----------
        snapshot_dir: str, optional
            Directory path under which dataset snapshot will be created.
            Defaults to snapshots_dir set using DatasetFactory.set_default_storage().
        name: str, optional, default: ""
            Name to uniquely identify the snapshot using DatasetFactory.list_snapshots().
            If not provided, an auto-generated name is used.
        storage_options: dict, optional
            Parameters passed on to the backend filesystem class.
            Defaults to storage_options set using DatasetFactory.set_default_storage().

        Returns
        -------
        p_str: str
            the URI to access the snapshotted dataset.

        Examples
        --------
        >>> import pandas as pd
        >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
        >>> ds_uri = ds.snapshot()
        """
        # Fall back to the factory-level default snapshot directory; imported
        # locally to avoid a circular import with ads.dataset.factory.
        if snapshot_dir is None:
            import ads.dataset.factory as factory

            snapshot_dir = factory.default_snapshots_dir
            if snapshot_dir is None:
                raise ValueError(
                    "Specify snapshot_dir or use DatasetFactory.set_default_storage() to set default \
                    storage options"
                )
            else:
                logger.info("Using default snapshots dir %s" % snapshot_dir)
        # Suffix the caller-supplied name with a unique id so repeated
        # snapshots never collide.
        name = self._get_unique_name(name)
        if not snapshot_dir.endswith("/"):
            snapshot_dir = snapshot_dir + "/"
        parquet_file = "%s%s.parquet" % (snapshot_dir, name)
        # NOTE(review): makedirs is called even for remote (oci://) paths —
        # presumably harmless for fsspec-style URIs, but confirm.
        os.makedirs(snapshot_dir, exist_ok=True)
        # Only object-storage ("oci...") targets pick up the factory's default
        # storage options; local paths need none.
        if storage_options is None and parquet_file[:3] == "oci":
            import ads.dataset.factory as factory

            storage_options = factory.default_storage_options
            logger.info("Using default storage options.")

        # Persist the dataframe plus feature-type metadata and the transformer
        # pipeline so the snapshot can be reopened with its transformations.
        return helper.write_parquet(
            path=parquet_file,
            data=self.df,
            metadata_dict={
                "metadata": self.feature_types,
                "transformer": self.transformer_pipeline,
            },
            storage_options=storage_options,
        )
871
+
872
+ def to_csv(self, path, storage_options=None, **kwargs):
873
+ """
874
+ Save the materialized dataframe to csv file.
875
+
876
+ Parameters
877
+ ----------
878
+ path: str
879
+ Location to write to. If there are more than one partitions in df, should include a glob character to
880
+ expand into a set of file names, or provide a `name_function=parameter`.
881
+ Supports protocol specifications such as `"oci://"`, `"s3://"`.
882
+ storage_options: dict, optional
883
+ Parameters passed on to the backend filesystem class.
884
+ Defaults to storage_options set using DatasetFactory.set_default_storage().
885
+ kwargs: dict, optional
886
+
887
+ Examples
888
+ --------
889
+ >>> import pandas as pd
890
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
891
+ >>> [ds_link] = ds.to_csv("my/path.csv")
892
+ """
893
+ if storage_options is None:
894
+ import ads.dataset.factory as factory
895
+
896
+ storage_options = factory.default_storage_options
897
+ logger.info("Using default storage options")
898
+ return self.df.to_csv(path, storage_options=storage_options, **kwargs)
899
+
900
+ def to_parquet(self, path, storage_options=None, **kwargs):
901
+ """
902
+ Save data to parquet file.
903
+
904
+ Parameters
905
+ ----------
906
+ path: str
907
+ Location to write to. If there are more than one partitions in df, should include a glob character to
908
+ expand into a set of file names, or provide a `name_function=parameter`.
909
+ Supports protocol specifications such as `"oci://"`, `"s3://"`.
910
+ storage_options: dict, optional
911
+ Parameters passed on to the backend filesystem class.
912
+ Defaults to storage_options set using DatasetFactory.set_default_storage().
913
+ kwargs: dict, optional
914
+
915
+ Examples
916
+ --------
917
+ >>> import pandas as pd
918
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
919
+ >>> ds.to_parquet("my/path")
920
+ """
921
+ if storage_options is None:
922
+ import ads.dataset.factory as factory
923
+
924
+ storage_options = factory.default_storage_options
925
+ logger.info("Using default storage options")
926
+ return self.df.to_parquet(path, storage_options=storage_options, **kwargs)
927
+
928
+ def to_json(self, path, storage_options=None, **kwargs):
929
+ """
930
+ Save data to JSON files.
931
+
932
+ Parameters
933
+ ----------
934
+ path: str
935
+ Location to write to. If there are more than one partitions in df, should include a glob character to
936
+ expand into a set of file names, or provide a `name_function=parameter`.
937
+ Supports protocol specifications such as `"oci://"`, `"s3://"`.
938
+ storage_options: dict, optional
939
+ Parameters passed on to the backend filesystem class.
940
+ Defaults to storage_options set using DatasetFactory.set_default_storage().
941
+ kwargs: dict, optional
942
+
943
+ Examples
944
+ --------
945
+ >>> import pandas as pd
946
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
947
+ >>> ds.to_json("my/path.json")
948
+ """
949
+ if storage_options is None:
950
+ import ads.dataset.factory as factory
951
+
952
+ storage_options = factory.default_storage_options
953
+ logger.info("Using default storage options")
954
+
955
+ return self.df.to_json(path, storage_options=storage_options, **kwargs)
956
+
957
    def to_hdf(
        self, path: str, key: str, storage_options: dict = None, **kwargs
    ) -> str:
        """
        Save data to Hierarchical Data Format (HDF) files.

        Parameters
        ----------
        path : string
            Path to a target filename; a ``*`` in the name is replaced with ``0``
            and a ``.h5`` suffix is appended when missing.
        key : string
            Datapath within the files.
        storage_options: dict, optional
            Parameters passed to the backend filesystem class.
            Defaults to storage_options set using DatasetFactory.set_default_storage().
        kwargs: dict, optional

        Returns
        -------
        str
            The filename of the HDF5 file created.

        Examples
        --------
        >>> import pandas as pd
        >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
        >>> ds.to_hdf(path="my/path.h5", key="df")
        """
        if storage_options is None:
            import ads.dataset.factory as factory

            storage_options = factory.default_storage_options
            logger.info("Using default storage options")

        # Build the HDF5 file entirely in memory: the CORE driver with backing
        # store disabled never touches disk, and get_file_image() yields the
        # raw bytes so they can be written through fsspec to any filesystem.
        with pd.HDFStore(
            "memory",
            mode="w",
            driver="H5FD_CORE",
            driver_core_backing_store=0,
        ) as hdf_store:
            hdf_store.put(key, self.df, format=kwargs.get("hdf5_format", "fixed"))
            data = hdf_store._handle.get_file_image()

        # Normalize the target name: replace any glob '*' and ensure a .h5 suffix.
        new_path = (
            path.replace("*", "0")
            if path[-3:] == ".h5"
            else path.replace("*", "0") + ".h5"
        )

        # NOTE(review): kwargs is forwarded both to the put() format selection
        # above (via "hdf5_format") and to fsspec.open() here; an
        # "hdf5_format" kwarg would also reach fsspec — confirm intended.
        with fsspec.open(
            urlpath=new_path, mode="wb", storage_options=storage_options, **kwargs
        ) as fo:
            fo.write(data)

        return new_path
1012
+
1013
    @runtime_dependency(module="fastavro", install_from=OptionalDependency.DATA)
    def to_avro(self, path, schema=None, storage_options=None, **kwargs):
        """
        Save data to Avro files.
        Avro is a remote procedure call and data serialization framework developed within Apache's Hadoop project. It
        uses JSON for defining data types and protocols, and serializes data in a compact binary format.

        Parameters
        ----------
        path : string
            Path to a target filename. May contain a ``*`` to denote many filenames.
        schema : dict
            Avro schema dictionary, see below.
        storage_options: dict, optional
            Parameters passed to the backend filesystem class.
            Defaults to storage_options set using DatasetFactory.set_default_storage().
        kwargs: dict, optional
            See https://fastavro.readthedocs.io/en/latest/writer.html

        Notes
        -----
        Avro schema is a complex dictionary describing the data,
        see https://avro.apache.org/docs/1.8.2/gettingstartedpython.html#Defining+a+schema
        and https://fastavro.readthedocs.io/en/latest/writer.html.
        Its structure is as follows::

            {'name': 'Test',
             'namespace': 'Test',
             'doc': 'Descriptive text',
             'type': 'record',
             'fields': [
                 {'name': 'a', 'type': 'int'},
             ]}

        where the "name" field is required, but "namespace" and "doc" are optional
        descriptors; "type" must always be "record". The list of fields should
        have an entry for every key of the input records, and the types are
        like the primitive, complex or logical types of the Avro spec
        (https://avro.apache.org/docs/1.8.2/spec.html).

        Examples
        --------
        >>> import pandas
        >>> import fastavro
        >>> with open("data.avro", "rb") as fp:
        >>>     reader = fastavro.reader(fp)
        >>>     records = [r for r in reader]
        >>>     df = pandas.DataFrame.from_records(records)
        >>> ds = ADSDataset.from_dataframe(df)
        >>> ds.to_avro("my/path.avro")
        """
        # fastavro writes record-by-record, so convert each dataframe row to a dict.
        # Get the row by row formatting
        data_row_by_row = []
        for i, row in self.df.iterrows():
            data_row_by_row.append(row.to_dict())
        # Try to auto-generate schema when the caller did not supply one:
        # every column becomes a nullable field of the mapped Avro primitive.
        if schema is None:
            avro_types = self._convert_dtypes_to_avro_types()
            schema = {"name": self.name, "doc": self.description, "type": "record"}
            fields = []
            ## Add vars
            for col, dtype in avro_types:
                fields.append({"name": col, "type": ["null", dtype]})
            schema["fields"] = fields

        parsed_schema = fastavro.parse_schema(schema=schema)
        # Normalize the target name: replace any glob '*' and ensure a .avro suffix.
        new_path = (
            path.replace("*", "0")
            if path[-5:] == ".avro"
            else path.replace("*", "0") + ".avro"
        )
        with fsspec.open(
            new_path, "wb", storage_options=storage_options, **kwargs
        ) as fo:
            fastavro.writer(fo, parsed_schema, data_row_by_row)
        return new_path
1089
+
1090
+ def _convert_dtypes_to_avro_types(self):
1091
+ avro_types = []
1092
+ for name, dtype in zip(self.dtypes.index, self.dtypes.values):
1093
+ if dtype == np.int64:
1094
+ avro_dtype = "long"
1095
+ elif "int" in str(dtype):
1096
+ avro_dtype = "int"
1097
+ elif dtype == np.float64:
1098
+ avro_dtype = "double"
1099
+ elif "float" in str(dtype):
1100
+ avro_dtype = "float"
1101
+ elif dtype == np.bool_:
1102
+ avro_dtype = "boolean"
1103
+ else:
1104
+ avro_dtype = "string"
1105
+ avro_types.append((name, avro_dtype))
1106
+ return avro_types
1107
+
1108
+ def astype(self, types):
1109
+ """
1110
+ Convert data type of features.
1111
+
1112
+ Parameters
1113
+ ----------
1114
+ types: dict
1115
+ key is the existing feature name
1116
+ value is the data type to which the values of the feature should be converted.
1117
+ Valid data types: All numpy datatypes (Example: np.float64, np.int64, ...)
1118
+ or one of categorical, continuous, ordinal or datetime.
1119
+
1120
+ Returns
1121
+ -------
1122
+ updated_dataset: `ADSDataset`
1123
+ an ADSDataset with new data types
1124
+
1125
+ Examples
1126
+ --------
1127
+ >>> import pandas as pd
1128
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
1129
+ >>> ds_reformatted = ds.astype({"target": "categorical"})
1130
+ """
1131
+ return self.__getattr__("astype")(helper.map_types(types))
1132
+
1133
+ def merge(self, data, **kwargs):
1134
+ """
1135
+ Merges this dataset with another ADSDataset or pandas dataframe.
1136
+
1137
+ Parameters
1138
+ ----------
1139
+ data : Union[ADSDataset, pandas.DataFrame]
1140
+ Data to merge.
1141
+ kwargs : dict, optional
1142
+ additional keyword arguments that would be passed to underlying dataframe's merge API.
1143
+
1144
+ Examples
1145
+ --------
1146
+ >>> import pandas as pd
1147
+ >>> df1 = pd.read_csv("data1.csv")
1148
+ >>> df2 = pd.read_csv("data2.csv")
1149
+ >>> ds = ADSDataset.from_dataframe(df1.merge(df2))
1150
+ >>> ds_12 = ds1.merge(ds2)
1151
+ """
1152
+ assert isinstance(data, pd.DataFrame) or isinstance(
1153
+ data, ADSDataset
1154
+ ), "Can only merge datasets if they are of the types pandas or ads"
1155
+ df = self.df.merge(data.df if isinstance(data, ADSDataset) else data, **kwargs)
1156
+ return self._build_new_dataset(df, progress=utils.get_progress_bar(3))
1157
+
1158
+ """
1159
+ Internal methods
1160
+ """
1161
+
1162
+ def __getattr__(self, item):
1163
+ attr = getattr(self.df, item)
1164
+ if callable(attr):
1165
+ return self._apply(attr)
1166
+ else:
1167
+ return attr
1168
+
1169
+ def __getitem__(self, key):
1170
+ if isinstance(key, str) or isinstance(key, (tuple, str)):
1171
+ return self.df[key]
1172
+ else:
1173
+ return self._build_new_dataset(self.df[key])
1174
+
1175
    def _apply(self, func):
        """Wrap a delegated dataframe method so dataframe results come back as datasets.

        ``func`` is a callable obtained from ``self.df`` (via ``__getattr__``);
        the returned ``df_func`` forwards the call and, when the result is a new
        dataframe produced by a transforming method, rebuilds a dataset around
        it, recording the transformation in the pipeline when it is
        reproducible at scoring time.
        """

        def df_func(*args, _new_target=None, **kwargs):
            has_dataframe_arg = False
            args = list(args)
            for i, arg in enumerate(args):
                if isinstance(arg, ADSDataset) or isinstance(arg, pd.DataFrame):
                    has_dataframe_arg = True
                    # convert any argument that is of type ADSDataset to dataframe. This is useful in delegate calls
                    # like dataset1.concat(dataset2)
                    args[i] = arg.df if isinstance(arg, ADSDataset) else arg

            result = func(*args, **kwargs)

            # Only wrap dataframe results of transforming methods; non-dataframe
            # results and read functions (such as head) are returned as-is below.
            if (
                isinstance(result, pd.DataFrame)
                and func.__name__ not in self.df_read_functions
            ):
                target_name = None
                target_sample_val = None
                if not utils.is_same_class(self, ADSDataset):
                    target_name = (
                        self.target.name if _new_target is None else _new_target
                    )
                    target_sample_val = (
                        self.sampled_df[self.target.name].dropna().values[0]
                    )

                df = result
                n = len(df)
                trans_df = None
                transformed = False
                transformers = []

                # The sampled dataframe needs to be re-generated when this operation involves another dataframe.
                # Also, this kind of transformations cannot be reproduced at the time of scoring.
                if not has_dataframe_arg:
                    ft = DataFrameTransformer(
                        func_name=func.__name__,
                        target_name=target_name,
                        target_sample_val=target_sample_val,
                        args=args,
                        kw_args=kwargs,
                    ).fit(result)
                    # `transformed` is False when the method fails to run on the
                    # pandas sample; in that case a new sample is generated below.
                    trans_df, transformed = ft._transform(self.sampled_df.copy())
                    # If the dataset length changes as a result of the transformation, the operation need not be
                    # added to the pipeline as it does not need to be reproduced at the time of scoring.
                    transformers = (func.__name__, ft) if n == self.shape[0] else []

                init_kwargs = self.init_kwargs.copy()
                if func.__name__ == "astype":
                    # NOTE(review): accumulates the astype mapping in
                    # init_kwargs["types"] via `+`; the stored value is
                    # presumably a concatenable type — confirm against
                    # helper.map_types' return type.
                    if "types" in init_kwargs:
                        init_kwargs["types"] = init_kwargs["types"] + args[0]
                    else:
                        init_kwargs["types"] = args[0]

                # If the transforming function is not supported on the pandas sample,
                # sample the dataframe again to get a new representation.
                return self._build_new_dataset(
                    df,
                    sampled_df=df,
                    target=target_name,
                    target_type=TypeDiscoveryDriver().discover(
                        target_name, df[target_name]
                    )
                    if target_name is not None and target_name in df
                    else None,
                    sample=not transformed,
                    transformers=transformers,
                    **init_kwargs,
                )
            return result

        return df_func
1251
+
1252
+ def _handle_key_error(self, args):
1253
+ raise ValidationError("Column %s does not exist in data" % str(args))
1254
+
1255
    def _build_new_dataset(
        self,
        df,
        sampled_df=None,
        target=None,
        target_type=None,
        transformers=[],
        sample=False,
        progress=DummyProgressBar(),
        n=None,
        **init_kwargs,
    ):
        """Construct a new dataset around ``df``, carrying over state from ``self``.

        Re-samples the dataframe when no (valid) sample is supplied, preserves
        the target when it is still present in the data, and extends the
        transformer pipeline with ``transformers``.

        NOTE(review): `transformers=[]` and `progress=DummyProgressBar()` are
        evaluated once at definition time and shared across calls — the list is
        never mutated here, but these defaults are a known Python pitfall;
        consider None sentinels.
        """
        prev_doc_mode = utils.is_documentation_mode()

        # Temporarily disable documentation mode while rebuilding the dataset.
        set_documentation_mode(False)

        # Inherit this dataset's init kwargs unless the caller supplied some.
        init_kwargs = (
            self.init_kwargs
            if init_kwargs is None or len(init_kwargs) == 0
            else init_kwargs.copy()
        )
        n = len(df) if n is None else n

        # re-calculate sample df if not provided
        if sampled_df is None or sample:
            if progress:
                progress.update("Sampling data")
            sampled_df = generate_sample(
                df,
                n,
                DatasetDefaults.sampling_confidence_level,
                DatasetDefaults.sampling_confidence_interval,
                **init_kwargs,
            )
        else:
            if progress:
                progress.update()
        shape = (n, len(df.columns))
        # Targeted subclasses keep their target unless one was passed in.
        if not utils.is_same_class(self, ADSDataset) and target is None:
            target = self.target.name

        set_documentation_mode(prev_doc_mode)

        # Build a targeted dataset only when the target column survived the
        # operation; otherwise fall back to a plain ADSDataset below.
        if target in sampled_df.columns:
            if progress:
                progress.update("Building new dataset")
            target_type = self.target.type if target_type is None else target_type

            new_ds = get_dataset(
                df,
                sampled_df,
                target,
                target_type,
                shape,
                progress=progress,
                **init_kwargs,
            )

            new_ds.transformer_pipeline = self._update_transformer_pipeline(
                transformers
            )
            return new_ds
        else:
            if target is not None and not isinstance(progress, DummyProgressBar):
                logger.info(
                    "The target variable does not exist. Use `set_target()` to specify the target."
                )
            if progress:
                progress.update("Building the dataset with no target.")
            dsp = ADSDataset(
                df,
                sampled_df,
                shape,
                progress=progress,
                interactive=False,
                **init_kwargs,
            )
            dsp.transformer_pipeline = self._update_transformer_pipeline(transformers)
            return dsp
1335
+
1336
+ def _validate_feature(self, feature_names):
1337
+ if np.isscalar(feature_names):
1338
+ feature_names = [feature_names]
1339
+ for feature in feature_names:
1340
+ if feature not in self.df.columns:
1341
+ self._handle_key_error(feature)
1342
+
1343
+ def _update_transformer_pipeline(self, steps=[]):
1344
+ if isinstance(steps, tuple):
1345
+ steps = [steps]
1346
+ if steps is None or len(steps) == 0:
1347
+ return copy.deepcopy(self.transformer_pipeline)
1348
+ if self.transformer_pipeline is not None:
1349
+ transformer_pipeline = TransformerPipeline(
1350
+ steps=self.transformer_pipeline.steps + steps
1351
+ )
1352
+ else:
1353
+ transformer_pipeline = TransformerPipeline(steps=steps)
1354
+ return transformer_pipeline
1355
+
1356
+ def _get_unique_name(self, name):
1357
+ id = (
1358
+ uuid.uuid4().hex + "_" + datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
1359
+ )
1360
+ if name == "":
1361
+ return id
1362
+ return name + "_" + id
1363
+
1364
+ def corr(
1365
+ self,
1366
+ correlation_methods: Union[list, str] = "pearson",
1367
+ frac: float = 1.0,
1368
+ sample_size: float = 1.0,
1369
+ nan_threshold: float = 0.8,
1370
+ overwrite: bool = None,
1371
+ force_recompute: bool = False,
1372
+ ):
1373
+ """
1374
+ Compute pairwise correlation of numeric and categorical columns, output a matrix or a list of matrices computed
1375
+ using the correlation methods passed in.
1376
+
1377
+ Parameters
1378
+ ----------
1379
+ correlation_methods: Union[list, str], default to 'pearson'
1380
+
1381
+ - 'pearson': Use Pearson's Correlation between continuous features,
1382
+ - 'cramers v': Use Cramer's V correlations between categorical features,
1383
+ - 'correlation ratio': Use Correlation Ratio Correlation between categorical and continuous features,
1384
+ - 'all': Is equivalent to ['pearson', 'cramers v', 'correlation ratio'].
1385
+
1386
+ Or a list containing any combination of these methods, for example, ['pearson', 'cramers v'].
1387
+ frac:
1388
+ Is deprecated and replaced by sample_size.
1389
+ sample_size: float, defaults to 1.0. Float, Range -> (0, 1]
1390
+ What fraction of the data should be used in the calculation?
1391
+ nan_threshold: float, default to 0.8, Range -> [0, 1]
1392
+ Only compute a correlation when the proportion of the values, in a column, is less than or equal to nan_threshold.
1393
+ overwrite:
1394
+ Is deprecated and replaced by force_recompute.
1395
+ force_recompute: bool, default to be False
1396
+
1397
+ - If False, it calculates the correlation matrix if there is no cached correlation matrix. Otherwise,
1398
+ it returns the cached correlation matrix.
1399
+ - If True, it calculates the correlation matrix regardless whether there is cached result or not.
1400
+
1401
+ Returns
1402
+ -------
1403
+ correlation: Union[list, pandas.DataFrame]
1404
+ The pairwise correlations as a matrix (DataFrame) or list of matrices
1405
+ """
1406
+ frac = deprecate_default_value(
1407
+ frac,
1408
+ None,
1409
+ 1,
1410
+ "<code>frac=None</code> is superseded by <code>sample_size=1.0</code>.",
1411
+ FutureWarning,
1412
+ )
1413
+
1414
+ if frac != 1.0:
1415
+ deprecate_frac = deprecate_variable(
1416
+ frac,
1417
+ sample_size,
1418
+ "<code>frac</code> is superseded by <code>sample_size</code>.",
1419
+ DeprecationWarning,
1420
+ )
1421
+ if sample_size == 1.0:
1422
+ sample_size = deprecate_frac
1423
+
1424
+ force_recompute = deprecate_variable(
1425
+ overwrite,
1426
+ force_recompute,
1427
+ f"<code>overwrite=None</code> is deprecated. Use <code>force_recompute</code> instead.",
1428
+ DeprecationWarning,
1429
+ )
1430
+ if sample_size > 1 or sample_size <= 0:
1431
+ logger.error("`sample_size` must to be in the range of (0, 1].")
1432
+ return
1433
+ if nan_threshold > 1 or nan_threshold < 0:
1434
+ logger.error("`nan_threshold` must be between 0 and 1 (exclusive).")
1435
+ return
1436
+ return self._compute_correlation(
1437
+ frac=sample_size,
1438
+ threshold=nan_threshold,
1439
+ force_recompute=force_recompute,
1440
+ correlation_methods=correlation_methods,
1441
+ )
1442
+
1443
+ def _compute_correlation(
1444
+ self,
1445
+ frac=1.0,
1446
+ threshold=0.8,
1447
+ include_n_features=16,
1448
+ correlation_methods="pearson",
1449
+ force_recompute=False,
1450
+ ):
1451
+ """
1452
+ returns a list of correlation matrix/matrices
1453
+ """
1454
+
1455
+ # validate the correlation methods
1456
+ correlation_methods = _validate_correlation_methods(correlation_methods)
1457
+
1458
+ # if users choose to sample a frac of the data
1459
+ corr_df = self.df if not frac else self.df.sample(frac=frac)
1460
+
1461
+ # return columns by type and filter by threshold
1462
+ threshold = threshold * 100
1463
+ feature_types_df = pd.DataFrame.from_dict(self.feature_types).T
1464
+
1465
+ # reduce the dim of wide data
1466
+ n_rows, n_columns = self.shape
1467
+
1468
+ is_wide_dataset = n_columns >= N_Features_Wide_Dataset
1469
+
1470
+ if is_wide_dataset and include_n_features:
1471
+ corr_df, feature_types_df = self._reduce_dim_for_wide_dataset(
1472
+ corr_df, feature_types_df, include_n_features
1473
+ )
1474
+
1475
+ categorical_columns, continuous_columns, _ = _get_columns_by_type(
1476
+ feature_types_df, threshold=threshold
1477
+ )
1478
+
1479
+ # get the correlation
1480
+ correlation_list = []
1481
+ for method in correlation_methods:
1482
+ correlation_list.append(
1483
+ self._return_correlation(
1484
+ corr_df,
1485
+ method,
1486
+ categorical_columns,
1487
+ continuous_columns,
1488
+ force_recompute,
1489
+ )
1490
+ )
1491
+ return correlation_list[0] if len(correlation_list) == 1 else correlation_list
1492
+
1493
+ def _calc_pearson(self, df: pd.DataFrame, continuous_columns: list) -> pd.DataFrame:
1494
+ self._pearson = (
1495
+ df[continuous_columns].corr()
1496
+ if len(continuous_columns) > 1
1497
+ else pd.DataFrame()
1498
+ )
1499
+ return self._pearson
1500
+
1501
+ def _calc_cramers_v(
1502
+ self, df: pd.DataFrame, categorical_columns: list
1503
+ ) -> pd.DataFrame:
1504
+ self._cramers_v = _cat_vs_cat(df, categorical_columns)
1505
+ return self._cramers_v
1506
+
1507
+ def _calc_correlation_ratio(
1508
+ self,
1509
+ df: pd.core.frame.DataFrame,
1510
+ categorical_columns: list,
1511
+ continuous_columns: list,
1512
+ ) -> pd.DataFrame:
1513
+ self._correlation_ratio = _cat_vs_cts(
1514
+ df, categorical_columns, continuous_columns
1515
+ )
1516
+ return self._correlation_ratio
1517
+
1518
+ def _return_correlation(
1519
+ self,
1520
+ corr_df,
1521
+ method,
1522
+ categorical_columns,
1523
+ continuous_columns,
1524
+ force_recompute,
1525
+ ):
1526
+ if not force_recompute and hasattr(self, "_" + "_".join(method.split())):
1527
+ logger.info(
1528
+ f"Using cached results for {method} correlation. Use"
1529
+ " `force_recompute=True` to override."
1530
+ )
1531
+ return getattr(self, "_" + "_".join(method.split()))
1532
+ else:
1533
+ if method == "pearson":
1534
+ self._calc_pearson(corr_df, continuous_columns)
1535
+ return self._pearson
1536
+ elif method == "cramers v":
1537
+ self._calc_cramers_v(corr_df, categorical_columns)
1538
+ return self._cramers_v
1539
+ elif method == "correlation ratio":
1540
+ self._calc_correlation_ratio(
1541
+ corr_df, categorical_columns, continuous_columns
1542
+ )
1543
+ return self._correlation_ratio
1544
+ else:
1545
+ raise ValueError(f"The {method} method is not supported.")
1546
+
1547
    @runtime_dependency(module="IPython", install_from=OptionalDependency.NOTEBOOK)
    def _reduce_dim_for_wide_dataset(
        self, corr_df: pd.DataFrame, feature_types_df: pd.DataFrame, include_n_features
    ):
        """
        Reduce a wide dataset to at most ``include_n_features`` columns before
        correlation computation, emitting progress notes as HTML list items.

        Returns ``(corr_df, feature_types_df)`` reduced to the kept columns, or
        ``(None, None)`` when too few CPU cores are available to attempt the
        calculation at all.
        """
        min_cores_for_correlation = 2
        n_rows, n_columns = self.shape

        from IPython.core.display import display, HTML

        # bail out entirely on small machines: correlation on a wide dataset
        # is too expensive with <= 2 cores
        if utils.get_cpu_count() <= min_cores_for_correlation:
            msg = (
                f"Not attempting to calculate correlations, too few cores ({utils.get_cpu_count()}) "
                f"for wide dataset ({n_columns} columns)"
            )
            display(HTML(f"<li>{msg}</li>"))
            return None, None

        display(HTML(f"<li>detected wide dataset ({n_columns} columns)</li>"))

        if "target" in self.__dict__:
            # a target is set: pick the most informative feature subset
            # (mutual-information based, per the displayed note)
            display(
                HTML(
                    f"<li>feature reduction using mutual information (max {include_n_features} columns)</li>"
                )
            )
            logger.info("Set `include_n_features=None` to include all features.")
            corr_sampled_df = self._find_feature_subset(
                self.sampled_df, self.target.name, include_n_features=include_n_features
            )
            corr_df, feature_types_df = self._update_dataframes(
                corr_sampled_df, corr_df, feature_types_df
            )
        else:
            #
            # in the absence of a target we simply use the first_n
            #
            logger.info(
                f"To include the first {include_n_features} features based on the feature"
                f"importance, use `.set_target`()."
            )
            # keep only correlatable column types that are present in corr_df
            feature_types_df = feature_types_df[
                (feature_types_df.index.isin(corr_df.columns.values))
                & feature_types_df.type.isin(
                    ["categorical", "ordinal", "continuous", "zipcode"]
                )
            ]
            corr_df = corr_df[feature_types_df.index[:include_n_features]]
            feature_types_df = feature_types_df.iloc[:include_n_features, :]
        return corr_df, feature_types_df
1596
+
1597
+ def _update_dataframes(self, corr_sampled_df, corr_df, feature_types_df):
1598
+ """
1599
+ update the dataframe and feature types based on the reduced dataframe
1600
+ """
1601
+ cols = corr_sampled_df.columns.tolist()
1602
+ cols.insert(0, cols.pop(cols.index(self.target.name)))
1603
+ corr_df_reduced = corr_df[[*cols]]
1604
+ feature_types_df_reduced = feature_types_df[feature_types_df.index.isin(cols)]
1605
+ return corr_df_reduced, feature_types_df_reduced
1606
+
1607
    def show_corr(
        self,
        frac: float = 1.0,
        sample_size: float = 1.0,
        nan_threshold: float = 0.8,
        overwrite: bool = None,
        force_recompute: bool = False,
        correlation_target: str = None,
        plot_type: str = "heatmap",
        correlation_threshold: float = -1,
        correlation_methods="pearson",
        **kwargs,
    ):
        """
        Show heatmap or barplot of pairwise correlation of numeric and categorical columns, output three tabs
        which are heatmap or barplot of correlation matrix of numeric columns vs numeric columns using pearson
        correlation method, categorical columns vs categorical columns using Cramer's V method,
        and numeric vs categorical columns, excluding NA/null values and columns which have more than
        80% of NA/null values. By default, only 'pearson' correlation is calculated and shown in the first tab.
        Set correlation_methods='all' to show all correlation charts.

        Parameters
        ----------
        frac: Is superseded by sample_size
        sample_size: float, defaults to 1.0. Float, Range -> (0, 1]
            What fraction of the data should be used in the calculation?
        nan_threshold: float, defaults to 0.8, Range -> [0, 1]
            In the default case, it will only calculate the correlation of the columns which has less than or equal to
            80% of missing values.
        overwrite:
            Is deprecated and replaced by force_recompute.
        force_recompute: bool, default to be False.

            - If False, it calculates the correlation matrix if there is no cached correlation matrix. Otherwise,
              it returns the cached correlation matrix.
            - If True, it calculates the correlation matrix regardless whether there is cached result or not.

        plot_type: str, default to "heatmap"
            It can only be "heatmap" or "bar". Note that if "bar" is chosen, correlation_target also has to be set and
            the bar chart will only show the correlation values of the pairs which have the target in them.
        correlation_target: str, default to None
            It can be any columns of type continuous, ordinal, categorical or zipcode. When correlation_target is set,
            only pairs that contains correlation_target will show.
        correlation_threshold: float, default to -1
            It can be any number between -1 and 1.
        correlation_methods: Union[list, str], defaults to 'pearson'

            - 'pearson': Use Pearson's Correlation between continuous features,
            - 'cramers v': Use Cramer's V correlations between categorical features,
            - 'correlation ratio': Use Correlation Ratio Correlation between categorical and continuous features,
            - 'all': Is equivalent to ['pearson', 'cramers v', 'correlation ratio'].

            Or a list containing any combination of these methods, for example, ['pearson', 'cramers v'].

        Returns
        -------
        None
        """
        # warn when the deprecated `frac` argument is supplied explicitly
        frac = deprecate_default_value(
            frac,
            None,
            1,
            "<code>frac=None</code> is superseded by <code>sample_size=1.0</code>.",
            FutureWarning,
        )
        if frac != 1.0:
            deprecate_frac = deprecate_variable(
                frac,
                sample_size,
                "<code>frac</code> is deprecated. Use <code>sample_size</code> instead.",
                DeprecationWarning,
            )
            # only honor `frac` when `sample_size` was left at its default
            if sample_size == 1.0:
                sample_size = deprecate_frac

        # columns whose feature type supports correlation computation
        feature_types_df = pd.DataFrame.from_dict(self.feature_types).loc["type", :]
        features_list = list(
            feature_types_df[
                feature_types_df.isin(
                    ["categorical", "zipcode", "continuous", "ordinal"]
                )
            ].index
        )
        if plot_type not in ["heatmap", "bar"]:
            raise ValueError('plot_type has to be "heatmap" ' 'or "bar"')

        if plot_type == "bar" and correlation_target is None:
            raise ValueError('correlation_target has to be set when plot_type="bar".')

        if correlation_target:
            if correlation_target not in features_list:
                raise ValueError(
                    "correlation_target has to be in {}.".format(features_list)
                )

        # map deprecated `overwrite` onto `force_recompute`
        force_recompute = deprecate_variable(
            overwrite,
            force_recompute,
            f"<code>overwrite=None</code> is deprecated. Use <code>force_recompute</code> instead.",
            DeprecationWarning,
        )

        # delegate the actual computation and rendering to the plotting helper
        plot_correlation_heatmap(
            ds=self,
            frac=sample_size,
            force_recompute=force_recompute,
            correlation_target=correlation_target,
            plot_type=plot_type,
            correlation_threshold=correlation_threshold,
            nan_threshold=nan_threshold,
            correlation_methods=correlation_methods,
            **kwargs,
        )
1720
+
1721
    @runtime_dependency(module="IPython", install_from=OptionalDependency.NOTEBOOK)
    @runtime_dependency(module="ipywidgets", install_from=OptionalDependency.NOTEBOOK)
    def show_in_notebook(
        self,
        correlation_threshold=-1,
        selected_index=0,
        sample_size=0,
        visualize_features=True,
        correlation_methods="pearson",
        **kwargs,
    ):
        """
        Provide visualization of dataset.

        - Display feature distribution. The data table display will show a maximum of 8 digits,
        - Plot the correlation between the dataset features (as a heatmap) only when all the features are
          continuous or ordinal,
        - Display data head.

        Parameters
        ----------
        correlation_threshold : int, default -1
            The correlation threshold to select, which only show features that have larger or equal
            correlation values than the threshold.
        selected_index: int, str, default 0
            The displayed output is stacked into an accordion widget, use selected_index to force the display to open
            a specific element, use the (zero offset) index or any prefix string of the name (eg, 'corr' for
            correlations)
        sample_size: int, default 0
            The size (in rows) to sample for visualizations
        visualize_features: bool default False
            For the "Features" section control if feature visualizations are shown or not. If not only
            a summary of the numeric statistics is shown. The numeric statistics are also always shown
            for wide (>64 features) datasets
        correlation_methods: Union[list, str], default to 'pearson'

            - 'pearson': Use Pearson's Correlation between continuous features,
            - 'cramers v': Use Cramer's V correlations between categorical features,
            - 'correlation ratio': Use Correlation Ratio Correlation between categorical and continuous features,
            - 'all': Is equivalent to ['pearson', 'cramers v', 'correlation ratio'].

            Or a list containing any combination of these methods, for example, ['pearson', 'cramers v'].
        """

        # no-op outside of a notebook: widgets/HTML output has nowhere to render
        if not utils.is_notebook():
            print("show_in_notebook called but not in notebook environment")
            return

        n_rows, n_columns = self.shape

        # floor applied to an explicit sample_size request
        min_sample_size = 10000
        if sample_size == 0:
            sub_samp_size = len(self.sampled_df)
            sub_samp_df = self.sampled_df
        else:
            sub_samp_size = max(min(sample_size, len(self.sampled_df)), min_sample_size)
            sub_samp_df = self.sampled_df.sample(n=sub_samp_size)

        # build the "Summary" tab HTML
        html_summary = ""
        if self.name:
            html_summary += "<h1>Name: %s</h1>" % (self.name)

        # dataset type (problem type)
        html_summary += "<h3>Type: %s</h3>" % self.__class__.__name__

        if self.description:
            html_summary += "<pre>%s</pre>" % self.description
        html_summary += "<hr>"

        html_summary += "<h3>{:,} Rows, {:,} Columns</h3>".format(n_rows, n_columns)
        html_summary += "<h4>Column Types:</h4><UL>"

        # feature-type histogram, most common types first
        for group in Counter(
            [self.feature_types[k].meta_data["type"] for k in self.feature_types]
        ).most_common():
            html_summary += "<LI><b>%s:</b> %d features" % (group[0], group[1])

        html_summary += "</UL>"

        html_summary += """
        <p><b>
        Note: Visualizations use a sampled subset of the dataset, this is to
        improve plotting performance. The sample size is calculated to be statistically
        significant within the confidence level: {} and confidence interval: {}.

        The sampled data has {:,} rows
        </b>
        </p>

        <ul>
        <li>The confidence <i>level</i> refers to the long-term success rate of the
        method, that is, how often this type of interval will capture the parameter
        of interest.
        </li>

        <li>A specific confidence <i>interval</i> gives a range of plausible values for
        the parameter of interest
        </li>
        </ul>

        """.format(
            DatasetDefaults.sampling_confidence_level,
            DatasetDefaults.sampling_confidence_interval,
            sub_samp_df.shape[0],
        )

        html_summary += "</UL>"

        from ipywidgets import widgets

        summary = widgets.HTML(html_summary)

        # placeholder widgets; filled in asynchronously below
        features = widgets.HTML()
        correlations = widgets.Output()
        warningz = widgets.HTML()

        warningz.value = "Analyzing for warnings..."
        features.value = "Calculating full statistical info..."

        # with correlations:
        #     display(HTML("<li>calculating...</li>"))

        accordion = widgets.Accordion(
            children=[summary, features, correlations, warningz]
        )
        accordion.set_title(0, "Summary")
        accordion.set_title(1, "Features")
        accordion.set_title(2, "Correlations")
        accordion.set_title(3, "Warnings")

        if isinstance(selected_index, str):
            # lookup by title
            possible_titles = [
                accordion.get_title(i) for i in range(len(accordion.children))
            ]
            for i, title in enumerate(possible_titles):
                if title.lower().startswith(selected_index.lower()):
                    selected_index = i
                    break

            if isinstance(selected_index, str):
                # failed to match a title; fall back to the first tab
                logger.info(
                    "`selected_index` should be one of: {}.".format(
                        ", ".join(possible_titles)
                    )
                )
                selected_index = 0

        accordion.selected_index = selected_index

        is_wide_dataset = n_columns >= N_Features_Wide_Dataset

        #
        # set up dataframe to use for correlation calculations
        #

        self.df_stats = self._calculate_dataset_statistics(
            is_wide_dataset, [features, warningz]
        )

        with correlations:
            feature_types_df = pd.DataFrame.from_dict(self.feature_types).loc["type", :]
            if not is_wide_dataset:
                # drop columns that are entirely missing
                feature_types_df = feature_types_df[
                    self.df_stats["missing"] < len(self.df)
                ]

            # pull correlation-related options out of kwargs (with deprecation
            # mapping of `overwrite` to `force_recompute`)
            frac = kwargs.pop("frac", 1.0)
            overwrite = kwargs.pop("overwrite", None)
            force_recompute = kwargs.pop("force_recompute", False)
            force_recompute = deprecate_variable(
                overwrite,
                force_recompute,
                f"<code>overwrite=None</code> is deprecated. Use <code>force_recompute</code> instead.",
                DeprecationWarning,
            )
            plot_type = kwargs.pop("plot_type", "heatmap")
            correlation_target = kwargs.pop("correlation_target", None)
            nan_threshold = kwargs.pop("nan_threshold", 0.8)
            self.show_corr(
                correlation_threshold=correlation_threshold,
                sample_size=frac,
                force_recompute=force_recompute,
                plot_type=plot_type,
                correlation_target=correlation_target,
                nan_threshold=nan_threshold,
                correlation_methods=correlation_methods,
                **kwargs,
            )

        from IPython.core.display import display

        display(accordion)

        # generate html for feature_distribution & warnings

        accordion.set_title(
            1, f"Features ({n_columns})"
        )  # adjust for datasets with target

        #
        # compute missing value statistics
        # not done for wide datasets
        #

        features.value = self._generate_features_html(
            is_wide_dataset,
            n_columns,
            self.df_stats,
            visualizations_follow=bool(visualize_features),
        )

        warningz.value = self._generate_warnings_html(
            is_wide_dataset, n_rows, n_columns, self.df_stats, warningz, accordion
        )

        if visualize_features and not is_wide_dataset:
            self._visualize_feature_distribution(features)
1940
+
1941
+ def get_recommendations(self, *args, **kwargs): # real signature may change
1942
+ """
1943
+ Returns user-friendly error message to set target variable before invoking this API.
1944
+
1945
+ Parameters
1946
+ ----------
1947
+ kwargs
1948
+
1949
+ Returns
1950
+ -------
1951
+ NotImplementedError
1952
+ raises NotImplementedError, if target parameter value not provided
1953
+
1954
+ """
1955
+ raise NotImplementedError(
1956
+ "Please set the target using set_target() before invoking this API. See "
1957
+ "https://accelerated-data-science.readthedocs.io/en/latest/ads.dataset.html#ads.dataset.dataset.ADSDataset.set_target "
1958
+ "for the API usage."
1959
+ )
1960
+
1961
+ def suggest_recommendations(self, *args, **kwargs): # real signature may change
1962
+ """
1963
+ Returns user-friendly error message to set target variable before invoking this API.
1964
+
1965
+ Parameters
1966
+ ----------
1967
+ kwargs
1968
+
1969
+ Returns
1970
+ -------
1971
+ NotImplementedError
1972
+ raises NotImplementedError, if target parameter value not provided
1973
+
1974
+ """
1975
+ raise NotImplementedError(
1976
+ "Please set the target using set_target() before invoking this API. See "
1977
+ "https://accelerated-data-science.readthedocs.io/en/latest/ads.dataset.html#ads.dataset.dataset.ADSDataset.set_target "
1978
+ "for the API usage."
1979
+ )