workbench 0.8.89__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (243) hide show
  1. workbench/__init__.py +45 -0
  2. workbench/algorithms/__init__.py +0 -0
  3. workbench/algorithms/dataframe/Readme.md +3 -0
  4. workbench/algorithms/dataframe/__init__.py +12 -0
  5. workbench/algorithms/dataframe/aggregation.py +99 -0
  6. workbench/algorithms/dataframe/data_source_eda.py +46 -0
  7. workbench/algorithms/dataframe/dimensionality_reduction.py +145 -0
  8. workbench/algorithms/dataframe/feature_resolution.py +243 -0
  9. workbench/algorithms/dataframe/feature_space_proximity.py +335 -0
  10. workbench/algorithms/dataframe/feature_spider.py +363 -0
  11. workbench/algorithms/dataframe/knn_spider_deprecated.py +283 -0
  12. workbench/algorithms/dataframe/quantile_regression.py +373 -0
  13. workbench/algorithms/dataframe/residuals_calculator.py +218 -0
  14. workbench/algorithms/dataframe/row_tagger.py +192 -0
  15. workbench/algorithms/dataframe/target_gradients.py +145 -0
  16. workbench/algorithms/graph/__init__.py +10 -0
  17. workbench/algorithms/graph/heavy/Readme.md +2 -0
  18. workbench/algorithms/graph/light/Readme.md +2 -0
  19. workbench/algorithms/graph/light/__init__.py +0 -0
  20. workbench/algorithms/graph/light/proximity_graph.py +171 -0
  21. workbench/algorithms/spark/Readme.md +9 -0
  22. workbench/algorithms/sql/Readme.md +9 -0
  23. workbench/algorithms/sql/__init__.py +15 -0
  24. workbench/algorithms/sql/column_stats.py +146 -0
  25. workbench/algorithms/sql/correlations.py +116 -0
  26. workbench/algorithms/sql/descriptive_stats.py +109 -0
  27. workbench/algorithms/sql/outliers.py +248 -0
  28. workbench/algorithms/sql/sample_rows.py +78 -0
  29. workbench/algorithms/sql/value_counts.py +92 -0
  30. workbench/api/__init__.py +32 -0
  31. workbench/api/data_source.py +225 -0
  32. workbench/api/df_store.py +194 -0
  33. workbench/api/endpoint.py +94 -0
  34. workbench/api/feature_set.py +146 -0
  35. workbench/api/meta.py +273 -0
  36. workbench/api/model.py +78 -0
  37. workbench/api/monitor.py +163 -0
  38. workbench/api/parameter_store.py +241 -0
  39. workbench/api/pipeline.py +188 -0
  40. workbench/cached/__init__.py +11 -0
  41. workbench/cached/cached_data_source.py +87 -0
  42. workbench/cached/cached_endpoint.py +86 -0
  43. workbench/cached/cached_feature_set.py +88 -0
  44. workbench/cached/cached_meta.py +383 -0
  45. workbench/cached/cached_model.py +139 -0
  46. workbench/cached/cached_pipeline.py +64 -0
  47. workbench/core/__init__.py +0 -0
  48. workbench/core/artifacts/__init__.py +21 -0
  49. workbench/core/artifacts/artifact.py +486 -0
  50. workbench/core/artifacts/athena_source.py +676 -0
  51. workbench/core/artifacts/cached_artifact_mixin.py +92 -0
  52. workbench/core/artifacts/data_source_abstract.py +320 -0
  53. workbench/core/artifacts/data_source_factory.py +70 -0
  54. workbench/core/artifacts/endpoint_core.py +994 -0
  55. workbench/core/artifacts/feature_set_core.py +694 -0
  56. workbench/core/artifacts/graph_core.py +257 -0
  57. workbench/core/artifacts/model_core.py +1081 -0
  58. workbench/core/artifacts/monitor_core.py +504 -0
  59. workbench/core/cloud_platform/aws/README.md +2 -0
  60. workbench/core/cloud_platform/aws/aws_account_clamp.py +154 -0
  61. workbench/core/cloud_platform/aws/aws_df_store.py +349 -0
  62. workbench/core/cloud_platform/aws/aws_meta.py +773 -0
  63. workbench/core/cloud_platform/aws/aws_session.py +160 -0
  64. workbench/core/cloud_platform/aws/cache_dataframe.py +51 -0
  65. workbench/core/cloud_platform/azure/README.md +2 -0
  66. workbench/core/cloud_platform/cloud_meta.py +281 -0
  67. workbench/core/cloud_platform/gcp/README.md +2 -0
  68. workbench/core/pipelines/pipeline_executor.py +162 -0
  69. workbench/core/transforms/Readme.md +55 -0
  70. workbench/core/transforms/__init__.py +0 -0
  71. workbench/core/transforms/data_loaders/__init__.py +0 -0
  72. workbench/core/transforms/data_loaders/heavy/__init__.py +10 -0
  73. workbench/core/transforms/data_loaders/heavy/s3_heavy_to_data_source.py +245 -0
  74. workbench/core/transforms/data_loaders/light/__init__.py +14 -0
  75. workbench/core/transforms/data_loaders/light/csv_to_data_source.py +81 -0
  76. workbench/core/transforms/data_loaders/light/json_to_data_source.py +79 -0
  77. workbench/core/transforms/data_loaders/light/s3_to_data_source_light.py +106 -0
  78. workbench/core/transforms/data_to_data/__init__.py +0 -0
  79. workbench/core/transforms/data_to_data/heavy/__init__.py +0 -0
  80. workbench/core/transforms/data_to_data/heavy/emr/Readme.md +2 -0
  81. workbench/core/transforms/data_to_data/heavy/glue/Readme.md +2 -0
  82. workbench/core/transforms/data_to_data/light/__init__.py +0 -0
  83. workbench/core/transforms/data_to_data/light/clean_data.py +63 -0
  84. workbench/core/transforms/data_to_data/light/data_to_data_light.py +68 -0
  85. workbench/core/transforms/data_to_features/__init__.py +0 -0
  86. workbench/core/transforms/data_to_features/heavy/__init__.py +0 -0
  87. workbench/core/transforms/data_to_features/heavy/chunk/__init__.py +0 -0
  88. workbench/core/transforms/data_to_features/heavy/chunk/data_to_features_chunk.py +120 -0
  89. workbench/core/transforms/data_to_features/heavy/emr/Readme.md +2 -0
  90. workbench/core/transforms/data_to_features/heavy/glue/Readme.md +2 -0
  91. workbench/core/transforms/data_to_features/heavy/storage/data_to_features_heavy_old.py +161 -0
  92. workbench/core/transforms/data_to_features/light/__init__.py +0 -0
  93. workbench/core/transforms/data_to_features/light/data_to_features_light.py +83 -0
  94. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +64 -0
  95. workbench/core/transforms/features_to_features/__init__.py +0 -0
  96. workbench/core/transforms/features_to_features/heavy/emr/Readme.md +2 -0
  97. workbench/core/transforms/features_to_features/heavy/glue/Readme.md +2 -0
  98. workbench/core/transforms/features_to_model/__init__.py +0 -0
  99. workbench/core/transforms/features_to_model/features_to_model.py +389 -0
  100. workbench/core/transforms/model_to_endpoint/__init__.py +0 -0
  101. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +120 -0
  102. workbench/core/transforms/pandas_transforms/__init__.py +18 -0
  103. workbench/core/transforms/pandas_transforms/data_to_pandas.py +100 -0
  104. workbench/core/transforms/pandas_transforms/features_to_pandas.py +93 -0
  105. workbench/core/transforms/pandas_transforms/pandas_to_data.py +214 -0
  106. workbench/core/transforms/pandas_transforms/pandas_to_features.py +450 -0
  107. workbench/core/transforms/pandas_transforms/pandas_to_features_chunked.py +111 -0
  108. workbench/core/transforms/transform.py +147 -0
  109. workbench/core/views/__init__.py +24 -0
  110. workbench/core/views/column_subset_view.py +113 -0
  111. workbench/core/views/computation_view.py +64 -0
  112. workbench/core/views/create_view.py +52 -0
  113. workbench/core/views/display_view.py +70 -0
  114. workbench/core/views/mdq_view.py +142 -0
  115. workbench/core/views/pandas_to_view.py +158 -0
  116. workbench/core/views/training_view.py +160 -0
  117. workbench/core/views/view.py +327 -0
  118. workbench/core/views/view_utils.py +314 -0
  119. workbench/model_scripts/custom_models/chem_info/Readme.md +10 -0
  120. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +60 -0
  121. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +62 -0
  122. workbench/model_scripts/custom_models/chem_info/requirements.txt +4 -0
  123. workbench/model_scripts/custom_models/chem_info/tautomerize.py +62 -0
  124. workbench/model_scripts/custom_models/network_security/Readme.md +7 -0
  125. workbench/model_scripts/custom_script_example/custom_model_script.py +138 -0
  126. workbench/model_scripts/custom_script_example/requirements.txt +2 -0
  127. workbench/model_scripts/light_quant_regression/quant_regression.template +297 -0
  128. workbench/model_scripts/light_quant_regression/requirements.txt +2 -0
  129. workbench/model_scripts/light_scikit_learn/requirements.txt +3 -0
  130. workbench/model_scripts/light_scikit_learn/scikit_learn.template +293 -0
  131. workbench/model_scripts/light_xgb_model/generated_model_script.py +368 -0
  132. workbench/model_scripts/light_xgb_model/requirements.txt +2 -0
  133. workbench/model_scripts/light_xgb_model/xgb_model.template +368 -0
  134. workbench/model_scripts/script_generation.py +190 -0
  135. workbench/plugins/pages/mdq_page.py +127 -0
  136. workbench/plugins/web_components/endpoint_turbo.py +106 -0
  137. workbench/repl/__init__.py +0 -0
  138. workbench/repl/workbench_shell.py +514 -0
  139. workbench/resources/open_source_api.key +1 -0
  140. workbench/resources/signature_verify_pub.pem +6 -0
  141. workbench/scripts/monitor_cloud_watch.py +349 -0
  142. workbench/scripts/redis_expire.py +41 -0
  143. workbench/scripts/redis_report.py +38 -0
  144. workbench/scripts/show_config.py +20 -0
  145. workbench/themes/dark/base_css.url +1 -0
  146. workbench/themes/dark/custom.css +70 -0
  147. workbench/themes/dark/dark.json +763 -0
  148. workbench/themes/light/base_css.url +1 -0
  149. workbench/themes/light/custom.css +80 -0
  150. workbench/themes/light/light.json +753 -0
  151. workbench/themes/quartz/base_css.url +1 -0
  152. workbench/themes/quartz/custom.css +69 -0
  153. workbench/themes/quartz/quartz.json +728 -0
  154. workbench/themes/quartz_dark/base_css.url +1 -0
  155. workbench/themes/quartz_dark/custom.css +71 -0
  156. workbench/themes/quartz_dark/quartz_dark.json +728 -0
  157. workbench/utils/__init__.py +0 -0
  158. workbench/utils/athena_utils.py +133 -0
  159. workbench/utils/aws_utils.py +678 -0
  160. workbench/utils/bulk_utils.py +35 -0
  161. workbench/utils/cache.py +195 -0
  162. workbench/utils/chem_utils.py +335 -0
  163. workbench/utils/cloudwatch_handler.py +149 -0
  164. workbench/utils/config_manager.py +445 -0
  165. workbench/utils/dashboard_metrics.py +194 -0
  166. workbench/utils/datetime_utils.py +137 -0
  167. workbench/utils/deprecated_utils.py +115 -0
  168. workbench/utils/df_to_endpoint.py +155 -0
  169. workbench/utils/ecs_info.py +73 -0
  170. workbench/utils/endpoint_metrics.py +188 -0
  171. workbench/utils/endpoint_utils.py +177 -0
  172. workbench/utils/execution_environment.py +211 -0
  173. workbench/utils/extract_model_artifact.py +242 -0
  174. workbench/utils/fast_inference.py +65 -0
  175. workbench/utils/glue_utils.py +68 -0
  176. workbench/utils/ipython_utils.py +26 -0
  177. workbench/utils/json_utils.py +50 -0
  178. workbench/utils/lambda_utils.py +65 -0
  179. workbench/utils/license_manager.py +177 -0
  180. workbench/utils/log_utils.py +43 -0
  181. workbench/utils/markdown_utils.py +42 -0
  182. workbench/utils/pandas_utils.py +614 -0
  183. workbench/utils/performance_utils.py +52 -0
  184. workbench/utils/pipeline_utils.py +72 -0
  185. workbench/utils/plugin_manager.py +356 -0
  186. workbench/utils/redis_cache.py +333 -0
  187. workbench/utils/repl_utils.py +148 -0
  188. workbench/utils/resource_utils.py +39 -0
  189. workbench/utils/s3_utils.py +202 -0
  190. workbench/utils/shapley_values.py +101 -0
  191. workbench/utils/symbols.py +43 -0
  192. workbench/utils/test_data_generator.py +314 -0
  193. workbench/utils/theme_manager.py +241 -0
  194. workbench/utils/trace_calls.py +65 -0
  195. workbench/utils/type_abbrev.py +48 -0
  196. workbench/utils/workbench_cache.py +159 -0
  197. workbench/utils/workbench_event_bridge.py +115 -0
  198. workbench/utils/workbench_logging.py +266 -0
  199. workbench/utils/workbench_sqs.py +67 -0
  200. workbench/web_interface/components/component_interface.py +228 -0
  201. workbench/web_interface/components/correlation_matrix.py +158 -0
  202. workbench/web_interface/components/data_details_markdown.py +251 -0
  203. workbench/web_interface/components/endpoint_metric_plots.py +85 -0
  204. workbench/web_interface/components/experiments/color_maps.py +25 -0
  205. workbench/web_interface/components/experiments/compound_details.py +49 -0
  206. workbench/web_interface/components/experiments/dashboard_metric_plots.py +85 -0
  207. workbench/web_interface/components/experiments/data_table.py +65 -0
  208. workbench/web_interface/components/experiments/graph_test.py +83 -0
  209. workbench/web_interface/components/experiments/hello.py +50 -0
  210. workbench/web_interface/components/experiments/histogram.py +47 -0
  211. workbench/web_interface/components/experiments/line_chart.py +30 -0
  212. workbench/web_interface/components/experiments/outlier_plot.py +103 -0
  213. workbench/web_interface/components/experiments/plugin_callbacks.py +60 -0
  214. workbench/web_interface/components/experiments/scatter_plot.py +65 -0
  215. workbench/web_interface/components/model_plot.py +65 -0
  216. workbench/web_interface/components/plugin_interface.py +219 -0
  217. workbench/web_interface/components/plugin_unit_test.py +149 -0
  218. workbench/web_interface/components/plugins/ag_table.py +94 -0
  219. workbench/web_interface/components/plugins/confusion_matrix.py +188 -0
  220. workbench/web_interface/components/plugins/dashboard_status.py +111 -0
  221. workbench/web_interface/components/plugins/data_details.py +270 -0
  222. workbench/web_interface/components/plugins/endpoint_details.py +114 -0
  223. workbench/web_interface/components/plugins/graph_plot.py +279 -0
  224. workbench/web_interface/components/plugins/license_details.py +138 -0
  225. workbench/web_interface/components/plugins/model_details.py +227 -0
  226. workbench/web_interface/components/plugins/pipeline_details.py +157 -0
  227. workbench/web_interface/components/plugins/scatter_plot.py +270 -0
  228. workbench/web_interface/components/regression_plot.py +103 -0
  229. workbench/web_interface/components/table.py +185 -0
  230. workbench/web_interface/components/violin_plots.py +149 -0
  231. workbench/web_interface/page_views/data_sources_page_view.py +126 -0
  232. workbench/web_interface/page_views/endpoints_page_view.py +82 -0
  233. workbench/web_interface/page_views/feature_sets_page_view.py +124 -0
  234. workbench/web_interface/page_views/main_page.py +226 -0
  235. workbench/web_interface/page_views/models_page_view.py +83 -0
  236. workbench/web_interface/page_views/page_view.py +16 -0
  237. workbench/web_interface/page_views/pipelines_page_view.py +80 -0
  238. workbench-0.8.89.dist-info/LICENSE +21 -0
  239. workbench-0.8.89.dist-info/METADATA +191 -0
  240. workbench-0.8.89.dist-info/RECORD +243 -0
  241. workbench-0.8.89.dist-info/WHEEL +5 -0
  242. workbench-0.8.89.dist-info/entry_points.txt +4 -0
  243. workbench-0.8.89.dist-info/top_level.txt +1 -0
workbench/__init__.py ADDED
@@ -0,0 +1,45 @@
1
+ # Copyright (c) 2021-2024 SuperCowPowers LLC
2
+
3
+ """
4
+ Workbench Main Classes
5
+ - Artifacts
6
+ - DataSource
7
+ - FeatureSet
8
+ - Model
9
+ - Endpoint
10
+ - Transforms
11
+ - DataLoaders
12
+ - DataToData
13
+ - DataToFeatures
14
+ - FeaturesToModel
15
+ - ModelToEndpoint
16
+
17
+ For help on particular classes you can do this
18
+ - from workbench.core.transforms.data_loaders.light.json_to_data_source import JSONToDataSource
19
+ - help(JSONToDataSource)
20
+
21
+
22
+ class JSONToDataSource(workbench.core.transforms.transform.Transform)
23
+ | JSONToDataSource(json_file_path: str, data_uuid: str)
24
+ |
25
+ | JSONToDataSource: Class to move local JSON Files into a Workbench DataSource
26
+ |
27
+ | Common Usage:
28
+ | json_to_data = JSONToDataSource(json_file_path, data_uuid)
29
+ | json_to_data.set_output_tags(["abalone", "json", "whatever"])
30
+ | json_to_data.transform()
31
+ """
32
+ import os
33
+ from importlib.metadata import version
34
+
35
+ try:
36
+ __version__ = version("workbench")
37
+ except Exception:
38
+ __version__ = "unknown"
39
+
40
+ # Workbench Logging
41
+ from workbench.utils.workbench_logging import logging_setup
42
+
43
+ # Check the environment variable to decide whether to set up logging
44
+ if os.getenv("WORKBENCH_SKIP_LOGGING", "False").lower() != "true":
45
+ logging_setup()
File without changes
@@ -0,0 +1,3 @@
1
+ # Algorithms: Dataframe
2
+ - **Dataframes:** These algorithms take Pandas DataFrames as inputs and typically (not always) produce DataFrames as outputs. Depending on the algorithm, they might instead output dictionaries or lists.
3
+ - **Light:** These algorithms are considered **light** algorithms since they are constrained to a single process where the data is in memory.
@@ -0,0 +1,12 @@
1
+ """Welcome to the Dataframe Algorithm Classes
2
+
3
+ These classes provide functionality for Pandas Dataframes
4
+
5
+ - TBD: TBD
6
+ """
7
+
8
+ from .feature_space_proximity import FeatureSpaceProximity
9
+ from .residuals_calculator import ResidualsCalculator
10
+ from .dimensionality_reduction import DimensionalityReduction
11
+
12
+ __all__ = ["FeatureSpaceProximity", "ResidualsCalculator", "DimensionalityReduction"]
@@ -0,0 +1,99 @@
1
+ """Aggregation: Perform Row Aggregation on a DataFrame"""
2
+
3
+ import pandas as pd
4
+ import logging
5
+
6
# Workbench Logger
log = logging.getLogger("workbench")


def aggregate(df: pd.DataFrame, group_column: str, features: list = None) -> pd.DataFrame:
    """Group the rows of a DataFrame and average the numeric feature columns
    Args:
        df: Pandas DataFrame
        group_column: The column to aggregate/group on
        features: List of column names (numeric) to perform aggregation on (default: None,
                  which selects every numeric column whose name does not end in "id")
    Returns:
        Pandas DataFrame with one row per group: the group column, the mean of each
        feature, and a "group_count" column holding the number of rows in the group.
        The input DataFrame is returned untouched if it is empty or a feature is missing.
    """

    # If no features are given, identify all numeric columns
    if features is None:
        features = [
            x
            for x in df.select_dtypes(include="number").columns.tolist()
            if not x.endswith("id") and x != group_column
        ]
        log.info("No features given, auto identifying numeric columns...")
        log.info(f"{features}")

    # The group column can never be one of the averaged features: selecting it twice
    # below would produce duplicate columns and make the groupby fail
    features = [feature for feature in features if feature != group_column]

    # Sanity checks
    if not all(column in df.columns for column in features):
        log.critical("Some features are missing in the DataFrame")
        return df
    if df.empty:
        log.critical("DataFrame is empty")
        return df

    # Now aggregate the DataFrame
    log.info(f"Aggregating dataframe, averaging {features}...")

    # Subset the DataFrame to only the features and group column (copy so the caller's
    # DataFrame does not pick up the helper column)
    df = df[features + [group_column]].copy()
    df["group_count"] = 1

    # Average every feature; "size" on the helper column gives the rows per group
    agg_methods = {feature: "mean" for feature in features}
    agg_methods["group_count"] = "size"

    # Group by the group column and perform the aggregation
    return df.groupby(group_column).agg(agg_methods).reset_index()
50
+
51
+
52
def test():
    """Smoke test: aggregate a small fake DataFrame by its outlier group and print the result"""
    # Widen the pandas display so every column gets printed
    for option, value in (("display.max_columns", None), ("display.width", 1000)):
        pd.set_option(option, value)

    # Fake data: ten rows, every other one belonging to the "sample" group
    ids = ["id_0", "id_0", "id_2", "id_3", "id_4", "id_5", "id_6", "id_7", "id_8", "id_9"]
    groups = ["sample", "a_low", "sample", "b_high", "sample", "c_high", "sample", "d_low", "sample", "e_high"]
    data_df = pd.DataFrame(
        {
            "ID": ids,
            "feat1": [1.0, 1.0, 1.1, 3.0, 4.0, 1.0, 1.0, 1.1, 3.0, 4.0],
            "feat2": [1.0, 1.0, 1.1, 3.0, 4.0, 1.0, 1.0, 1.1, 3.0, 4.0],
            "feat3": [0.1, 0.1, 0.2, 1.6, 2.5, 0.1, 0.1, 0.2, 1.6, 2.5],
            "price": [31, 60, 62, 40, 20, 31, 61, 60, 40, 20],
            "outlier_group": groups,
        }
    )

    # Aggregate on the outlier group and show the result
    print(aggregate(data_df, group_column="outlier_group"))
96
+
97
+
98
+ if __name__ == "__main__":
99
+ test()
@@ -0,0 +1,46 @@
1
+ """DataSourceEDA: Provide basic EDA (Exploratory Data Analysis) for a DataFrame"""
2
+
3
+ import pandas as pd
4
+ import logging
5
+
6
+ # Local Imports
7
+ from workbench.core.transforms.pandas_transforms.data_to_pandas import DataToPandas
8
+ from workbench.utils import pandas_utils
9
+
10
+
11
class DataSourceEDA:
    """Basic EDA (Exploratory Data Analysis) on a DataSource that is pulled into a DataFrame"""

    def __init__(self, data_source_uuid: str):
        """DataSourceEDA: Provide basic EDA (Exploratory Data Analysis) for a DataSource
        Args:
            data_source_uuid (str): UUID of the DataSource for Exploratory Data Analysis"""
        self.log = logging.getLogger("workbench")
        self.data_source_uuid = data_source_uuid

        # Run the DataToPandas transform and keep its output DataFrame around
        to_pandas = DataToPandas(data_source_uuid)
        self.data_to_pandas = to_pandas
        self.log.info(f"Getting DataFrame from {data_source_uuid}...")
        to_pandas.transform()
        self.df = to_pandas.get_output()

    def get_column_info(self):
        """Return the Column Information (pandas_utils.info) for the DataSource"""
        return pandas_utils.info(self.df)

    def get_numeric_stats(self):
        """Return the Numeric Statistics (pandas_utils.numeric_stats) for the DataSource"""
        return pandas_utils.numeric_stats(self.df)
34
+
35
+
36
+ if __name__ == "__main__":
37
+ """Exercise the DataSourceEDA Class"""
38
+
39
+ # Set some pandas options
40
+ pd.set_option("display.max_columns", None)
41
+ pd.set_option("display.width", 1000)
42
+
43
+ # Spin up the class and get the EDA output
44
+ my_eda = DataSourceEDA("abalone_data")
45
+ print(my_eda.get_column_info())
46
+ print(my_eda.get_numeric_stats())
@@ -0,0 +1,145 @@
1
+ """DimensionalityReduction: Perform Dimensionality Reduction on a DataFrame"""
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import logging
6
+ from sklearn.manifold import TSNE, MDS
7
+ from sklearn.decomposition import PCA
8
+ from sklearn.preprocessing import StandardScaler
9
+
10
+
11
+ # Dimensionality Reduction Class
12
class DimensionalityReduction:
    def __init__(self):
        """DimensionalityReduction: Perform Dimensionality Reduction on a DataFrame"""
        self.log = logging.getLogger("workbench")
        self.projection_model = None
        self.features = None

    def fit_transform(self, df: pd.DataFrame, features: list = None, projection: str = "TSNE") -> pd.DataFrame:
        """Fit and Transform the DataFrame

        Note: the given DataFrame is modified in place. The feature columns are NaN-filled
        and standardized, and the projection is stored in new "x" and "y" columns.

        Args:
            df: Pandas DataFrame
            features: List of feature column names (default: None, which selects every numeric
                      column whose name does not end in "id", excluding "group_count")
            projection: The projection model to use (TSNE, MDS or PCA, default: TSNE)
        Returns:
            Pandas DataFrame with new columns x and y. If a sanity check fails (missing
            features, fewer than two features, empty DataFrame, unknown projection) the
            DataFrame is returned unmodified.
        """

        # If no features are given, identify all numeric columns
        if features is None:
            features = [x for x in df.select_dtypes(include="number").columns.tolist() if not x.endswith("id")]
            # Also drop group_count if it exists
            features = [x for x in features if x != "group_count"]
            self.log.info("No features given, auto identifying numeric columns...")
            self.log.info(f"{features}")
        self.features = features

        # Sanity checks (all done before the DataFrame is touched)
        if not all(column in df.columns for column in self.features):
            self.log.critical("Some features are missing in the DataFrame")
            return df
        if len(self.features) < 2:
            self.log.critical("At least two features are required")
            return df
        if df.empty:
            self.log.critical("DataFrame is empty")
            return df
        if projection not in ("TSNE", "MDS", "PCA"):
            # Without this check an unknown name would leave a None (or stale) model behind
            self.log.critical(f"Unknown projection '{projection}' (expected TSNE, MDS or PCA)")
            return df

        # Most projection models will fail if there are any NaNs in the data
        # So we'll fill NaNs with the mean value for that column. Assign the result back:
        # an inplace fillna on df[col] is chained assignment and may not update df
        for col in self.features:
            df[col] = df[col].fillna(df[col].mean())

        # Normalize the features
        scaler = StandardScaler()
        df[self.features] = scaler.fit_transform(df[self.features])

        # Project the multidimensional features onto an x,y plane
        self.log.info("Projecting features onto an x,y plane...")

        # Pick the projection model
        if projection == "TSNE":
            # Perplexity is a hyperparameter that controls the number of neighbors used to compute the manifold
            # The number of neighbors should be less than the number of samples
            perplexity = min(40, len(df) - 1)
            self.log.info(f"Perplexity: {perplexity}")
            self.projection_model = TSNE(perplexity=perplexity)
        elif projection == "MDS":
            self.projection_model = MDS(n_components=2, random_state=0)
        else:  # "PCA"
            self.projection_model = PCA(n_components=2)

        # Fit the projection model
        coordinates = self.projection_model.fit_transform(df[self.features])

        # Put the projection results back into the given DataFrame
        df["x"] = coordinates[:, 0]  # Projection X Column
        df["y"] = coordinates[:, 1]  # Projection Y Column

        # Note: jitter for coincident points is available through resolve_coincident_points()
        # but is not applied here

        # Return the DataFrame with the new columns
        return df

    @staticmethod
    def resolve_coincident_points(df: pd.DataFrame):
        """Resolve coincident points in a DataFrame by adding random jitter to x and y
        Args:
            df(pd.DataFrame): The DataFrame to resolve coincident points in (modified in place)
        Returns:
            pd.DataFrame: The DataFrame with resolved coincident points
        """
        # Jitter spread is 10% of each axis range
        x_scale = (df["x"].max() - df["x"].min()) * 0.1
        y_scale = (df["y"].max() - df["y"].min()) * 0.1

        # np.random.normal takes (mean, standard deviation, size): the noise must be centered
        # on zero, otherwise every point is shifted by the scale instead of being jittered
        df["x"] += np.random.normal(0.0, x_scale, len(df))
        df["y"] += np.random.normal(0.0, y_scale, len(df))
        return df
102
+
103
+
104
def test():
    """Smoke test: project a small fake DataFrame with TSNE and check that x/y columns appear"""
    # Widen the pandas display so every column gets printed
    for option, value in (("display.max_columns", None), ("display.width", 1000)):
        pd.set_option(option, value)

    # Fake data: ten rows with three float features and a price
    ids = ["id_0", "id_0", "id_2", "id_3", "id_4", "id_5", "id_6", "id_7", "id_8", "id_9"]
    data_df = pd.DataFrame(
        {
            "ID": ids,
            "feat1": [1.0, 1.0, 1.1, 3.0, 4.0, 1.0, 1.0, 1.1, 3.0, 4.0],
            "feat2": [1.0, 1.0, 1.1, 3.0, 4.0, 1.0, 1.0, 1.1, 3.0, 4.0],
            "feat3": [0.1, 0.1, 0.2, 1.6, 2.5, 0.1, 0.1, 0.2, 1.6, 2.5],
            "price": [31, 60, 62, 40, 20, 31, 61, 60, 40, 20],
        }
    )

    # Run the dimensionality reduction on the three feature columns
    reducer = DimensionalityReduction()
    new_df = reducer.fit_transform(data_df, features=["feat1", "feat2", "feat3"], projection="TSNE")

    # Both projection columns must have been added
    for axis in ("x", "y"):
        assert axis in new_df.columns

    # Output the DataFrame
    print(new_df)
142
+
143
+
144
+ if __name__ == "__main__":
145
+ test()
@@ -0,0 +1,243 @@
1
+ """FeatureResolution: Report on Feature Space Resolution Issues"""
2
+
3
+ import logging
4
+ from typing import Union
5
+ import pandas as pd
6
+ from sklearn.neighbors import KNeighborsRegressor
7
+ from sklearn.preprocessing import StandardScaler
8
+
9
+ # Workbench Imports
10
+ from workbench.utils.pandas_utils import DataFrameBuilder
11
+
12
+
13
+ # Feature Resolution Class
14
+ class FeatureResolution:
15
+ def __init__(
16
+ self,
17
+ input_df: pd.DataFrame,
18
+ features: list,
19
+ target_column: str,
20
+ id_column: str,
21
+ distance_metric: str = "minkowski",
22
+ ):
23
+ """Initialize the FeatureResolution object
24
+
25
+ Args:
26
+ distance_metric: Distance metric to use (default: "minkowski")
27
+ """
28
+ self.log = logging.getLogger("workbench")
29
+ self.df = input_df.copy().reset_index(drop=True)
30
+ self.features = features
31
+ self.target_column = target_column
32
+ self.id_column = id_column
33
+ self.n_neighbors = 10
34
+ self.scalar = StandardScaler()
35
+ self.knn = KNeighborsRegressor(metric=distance_metric, n_neighbors=self.n_neighbors, weights="distance")
36
+ self.dataframe_builder = DataFrameBuilder()
37
+ self.recursive_df_list = []
38
+
39
+ def compute(
40
+ self, within_distance: float, min_target_difference: float, output_columns: list = [], verbose=True
41
+ ) -> Union[pd.DataFrame, None]:
42
+ """FeatureResolution: Compute Feature Space to Target Resolution and Report Issues
43
+
44
+ Args:
45
+ within_distance: Features within this distance should have similar target values
46
+ min_target_difference: Minimum target difference to consider
47
+ output_columns: List of additional columns to output (default: []])
48
+ verbose: Whether to print out the resolution issues (default: True)
49
+
50
+ Returns:
51
+ Pandas DataFrame of Feature Space to Target Resolution Issues
52
+ Includes any additional output_columns if specified
53
+ """
54
+
55
+ # Check for expected columns
56
+ for column in [self.target_column] + self.features:
57
+ if column not in self.df.columns:
58
+ self.log.error(f"DataFrame does not have required {column} Column!")
59
+ return
60
+
61
+ # Set up the output columns (add id and target columns if they are not already included)
62
+ output_columns = list(set(output_columns).union({self.id_column, self.target_column}))
63
+
64
+ # Check the output columns
65
+ if output_columns is not None:
66
+ for column in output_columns:
67
+ if column not in self.df.columns:
68
+ self.log.error(f"DataFrame does not have required {column} Column!")
69
+ return
70
+
71
+ # Check for NaNs in the features and log the percentage
72
+ for feature in self.features:
73
+ nan_count = self.df[feature].isna().sum()
74
+ if nan_count > 0:
75
+ print(f"Feature '{feature}' has {nan_count} NaNs ({nan_count / len(self.df) * 100:.2f}%).")
76
+
77
+ # Remove and NaNs or INFs in the features
78
+ self.log.info(f"Dataframe Shape before NaN/INF removal {self.df.shape}")
79
+ self.df = self.df.replace([float("inf"), float("-inf")], pd.NA).dropna().reset_index(drop=True)
80
+ self.log.info(f"Dataframe Shape after NaN/INF removal {self.df.shape}")
81
+
82
+ # Standardize the features
83
+ X = self.scalar.fit_transform(self.df[self.features])
84
+ y = self.df[self.target_column]
85
+
86
+ # Fit the KNN model
87
+ self.knn.fit(X, y)
88
+
89
+ # Compute the feature space to target resolution to the nearest neighbors
90
+ output_count = 0
91
+ for my_index, row in enumerate(X):
92
+ # Find the nearest neighbors
93
+ distances, indices = self.knn.kneighbors([row])
94
+ distances = distances[0] # Returns a list within a list so grab the inner list
95
+ indices = indices[0]
96
+ target_values = y[indices]
97
+
98
+ # Grab the info for this observation
99
+ my_id = self.df.iloc[my_index][self.id_column]
100
+ my_output_data = self.df.iloc[my_index][output_columns]
101
+ my_target = y[my_index]
102
+
103
+ # Loop through the neighbors
104
+ for n_index, n_distance, n_target in zip(indices, distances, target_values):
105
+ # Skip myself
106
+ if n_index == my_index:
107
+ continue
108
+
109
+ # Compute the difference in feature space and target space
110
+ feature_diff = n_distance
111
+ target_diff = abs(my_target - n_target)
112
+
113
+ # Compute target differences `within_distance` feature space
114
+ if feature_diff <= within_distance and target_diff >= min_target_difference:
115
+ # Gather info about the neighbor
116
+ neighbor_id = self.df.iloc[n_index][self.id_column]
117
+ neighbor_output_data = self.df.iloc[n_index][output_columns]
118
+
119
+ # Add to the output DataFrame
120
+ row_data = my_output_data.to_dict()
121
+ row_data["feature_diff"] = feature_diff
122
+ row_data["target_diff"] = target_diff
123
+ row_data["n_id"] = neighbor_id
124
+ self.dataframe_builder.add_row(row_data)
125
+
126
+ # Print out the resolution issue (if verbose)
127
+ if verbose:
128
+ print(f"{output_count} Feature Diff: {feature_diff} Target Diff: {target_diff}")
129
+ print(f"\t{my_id}: {my_target:.3f} {list(my_output_data)}")
130
+ print(f"\t{neighbor_id}: {n_target:.3f} {list(neighbor_output_data)}")
131
+ # Increment the output count
132
+ output_count += 1
133
+
134
+ # Return the output DataFrame
135
+ return self.dataframe_builder.build()
136
+
137
def recursive_compute(
    self, within_distance: float, min_target_difference: float, output_columns: list = None, verbose=True
) -> pd.DataFrame:
    """Compute Feature Resolution Issues, remove the issues, and repeat until no issues are found

    Each round calls `compute()`, stores its result on `self.recursive_df_list`, and then drops
    from `self.df` every observation that appears in the result, either as the observation
    itself (`self.id_column`) or as its neighbor (`n_id`). The rounds stop as soon as
    `compute()` returns an empty DataFrame.

    Args:
        within_distance (float): Feature space distance threshold passed through to `compute()`
        min_target_difference (float): Target difference threshold passed through to `compute()`
        output_columns (list): Columns passed through to `compute()` (default: None, treated as [])
        verbose (bool): Passed through to `compute()` (default: True)

    Returns:
        pd.DataFrame: Concatenation of every DataFrame stored in `self.recursive_df_list`

    Note:
        `self.df` and `self.dataframe_builder` are modified in place, and
        `self.recursive_df_list` is not cleared here, so results from earlier invocations
        on the same instance are included in the returned DataFrame.
    """

    # Use a fresh list per call rather than a mutable default shared across all calls
    output_columns = [] if output_columns is None else output_columns

    # Iterate instead of self-recursing so the number of rounds is not bounded by the
    # interpreter recursion limit (each round does exactly what one recursive call did)
    while True:
        # Compute the resolution issues
        resolution_df = self.compute(within_distance, min_target_difference, output_columns, verbose)
        self.recursive_df_list.append(resolution_df)

        # If there are no resolution issues, return the combined DataFrame
        if len(resolution_df) == 0:
            return pd.concat(self.recursive_df_list)

        # Gather all IDs to be removed (both sides of every flagged pair)
        ids_to_remove = set(list(resolution_df[self.id_column]) + list(resolution_df["n_id"]))

        # Remove the rows of the observations that had issues
        print("Removing IDs: ", ids_to_remove)
        self.df = self.df[~self.df[self.id_column].isin(ids_to_remove)]

        # Reset the positional index and start the next round with an empty builder
        print("Recursing...")
        self.df = self.df.reset_index(drop=True)
        self.dataframe_builder = DataFrameBuilder()
162
+
163
+
164
+ # Test the FeatureResolution Class
165
def simple_unit_test():
    """Smoke test for the FeatureResolution class on a small, hand-built dataset"""
    # Widen the pandas display settings
    for option_name, option_value in (("display.max_columns", None), ("display.width", 1000)):
        pd.set_option(option_name, option_value)

    # Fake data: ten observations, where feat1/feat2 repeat the same five values twice
    # while the prices differ between the two halves
    repeated_values = [1.0, 1.0, 1.1, 3.0, 4.0] * 2
    data_df = pd.DataFrame(
        {
            "ID": [f"id_{i}" for i in range(10)],
            "feat1": list(repeated_values),
            "feat2": list(repeated_values),
            "feat3": [0.1, 0.2, 0.2, 1.6, 2.5, 0.1, 0.3, 0.2, 1.6, 2.5],
            "price": [10, 11, 12, 40, 20, 35, 61, 60, 40, 20],
        }
    )

    # Build the FeatureResolution object and run the report
    feature_names = ["feat1", "feat2", "feat3"]
    resolution = FeatureResolution(data_df, features=feature_names, target_column="price", id_column="ID")
    resolution.compute(within_distance=0.1, min_target_difference=10)
195
+
196
+
197
def unit_test():
    """Run FeatureResolution.compute() on the 'aqsol_mol_descriptors' FeatureSet and print the result"""
    from workbench.api.feature_set import FeatureSet
    from workbench.api.model import Model

    # Pull the test dataframe from the FeatureSet
    feature_set = FeatureSet("aqsol_mol_descriptors")
    aqsol_df = feature_set.pull_dataframe()

    # The target and feature columns come from the regression model
    model = Model("aqsol-mol-regression")
    target = model.target()
    features = model.features()

    # Build the FeatureResolution object and print the report
    resolution = FeatureResolution(
        aqsol_df,
        features=features,
        target_column=target,
        id_column=feature_set.id_column,
    )
    print(resolution.compute(within_distance=0.01, min_target_difference=1.0))
217
+
218
+
219
def recursive_test():
    """Run FeatureResolution.recursive_compute() on the 'aqsol_mol_descriptors' FeatureSet and print the result"""
    from workbench.api.feature_set import FeatureSet
    from workbench.api.model import Model

    # Pull the test dataframe from the FeatureSet
    feature_set = FeatureSet("aqsol_mol_descriptors")
    aqsol_df = feature_set.pull_dataframe()

    # The target and feature columns come from the regression model
    model = Model("aqsol-mol-regression")
    target = model.target()
    features = model.features()

    # Build the FeatureResolution object and print the combined report of all rounds
    resolution = FeatureResolution(
        aqsol_df,
        features=features,
        target_column=target,
        id_column=feature_set.id_column,
    )
    print(resolution.recursive_compute(within_distance=0.01, min_target_difference=1.0))
238
+
239
+
240
if __name__ == "__main__":
    # Run the tests in order when this module is executed as a script
    for test_function in (simple_unit_test, unit_test):
        test_function()
    # recursive_test()