workbench 0.8.89__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/__init__.py +45 -0
- workbench/algorithms/__init__.py +0 -0
- workbench/algorithms/dataframe/Readme.md +3 -0
- workbench/algorithms/dataframe/__init__.py +12 -0
- workbench/algorithms/dataframe/aggregation.py +99 -0
- workbench/algorithms/dataframe/data_source_eda.py +46 -0
- workbench/algorithms/dataframe/dimensionality_reduction.py +145 -0
- workbench/algorithms/dataframe/feature_resolution.py +243 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +335 -0
- workbench/algorithms/dataframe/feature_spider.py +363 -0
- workbench/algorithms/dataframe/knn_spider_deprecated.py +283 -0
- workbench/algorithms/dataframe/quantile_regression.py +373 -0
- workbench/algorithms/dataframe/residuals_calculator.py +218 -0
- workbench/algorithms/dataframe/row_tagger.py +192 -0
- workbench/algorithms/dataframe/target_gradients.py +145 -0
- workbench/algorithms/graph/__init__.py +10 -0
- workbench/algorithms/graph/heavy/Readme.md +2 -0
- workbench/algorithms/graph/light/Readme.md +2 -0
- workbench/algorithms/graph/light/__init__.py +0 -0
- workbench/algorithms/graph/light/proximity_graph.py +171 -0
- workbench/algorithms/spark/Readme.md +9 -0
- workbench/algorithms/sql/Readme.md +9 -0
- workbench/algorithms/sql/__init__.py +15 -0
- workbench/algorithms/sql/column_stats.py +146 -0
- workbench/algorithms/sql/correlations.py +116 -0
- workbench/algorithms/sql/descriptive_stats.py +109 -0
- workbench/algorithms/sql/outliers.py +248 -0
- workbench/algorithms/sql/sample_rows.py +78 -0
- workbench/algorithms/sql/value_counts.py +92 -0
- workbench/api/__init__.py +32 -0
- workbench/api/data_source.py +225 -0
- workbench/api/df_store.py +194 -0
- workbench/api/endpoint.py +94 -0
- workbench/api/feature_set.py +146 -0
- workbench/api/meta.py +273 -0
- workbench/api/model.py +78 -0
- workbench/api/monitor.py +163 -0
- workbench/api/parameter_store.py +241 -0
- workbench/api/pipeline.py +188 -0
- workbench/cached/__init__.py +11 -0
- workbench/cached/cached_data_source.py +87 -0
- workbench/cached/cached_endpoint.py +86 -0
- workbench/cached/cached_feature_set.py +88 -0
- workbench/cached/cached_meta.py +383 -0
- workbench/cached/cached_model.py +139 -0
- workbench/cached/cached_pipeline.py +64 -0
- workbench/core/__init__.py +0 -0
- workbench/core/artifacts/__init__.py +21 -0
- workbench/core/artifacts/artifact.py +486 -0
- workbench/core/artifacts/athena_source.py +676 -0
- workbench/core/artifacts/cached_artifact_mixin.py +92 -0
- workbench/core/artifacts/data_source_abstract.py +320 -0
- workbench/core/artifacts/data_source_factory.py +70 -0
- workbench/core/artifacts/endpoint_core.py +994 -0
- workbench/core/artifacts/feature_set_core.py +694 -0
- workbench/core/artifacts/graph_core.py +257 -0
- workbench/core/artifacts/model_core.py +1081 -0
- workbench/core/artifacts/monitor_core.py +504 -0
- workbench/core/cloud_platform/aws/README.md +2 -0
- workbench/core/cloud_platform/aws/aws_account_clamp.py +154 -0
- workbench/core/cloud_platform/aws/aws_df_store.py +349 -0
- workbench/core/cloud_platform/aws/aws_meta.py +773 -0
- workbench/core/cloud_platform/aws/aws_session.py +160 -0
- workbench/core/cloud_platform/aws/cache_dataframe.py +51 -0
- workbench/core/cloud_platform/azure/README.md +2 -0
- workbench/core/cloud_platform/cloud_meta.py +281 -0
- workbench/core/cloud_platform/gcp/README.md +2 -0
- workbench/core/pipelines/pipeline_executor.py +162 -0
- workbench/core/transforms/Readme.md +55 -0
- workbench/core/transforms/__init__.py +0 -0
- workbench/core/transforms/data_loaders/__init__.py +0 -0
- workbench/core/transforms/data_loaders/heavy/__init__.py +10 -0
- workbench/core/transforms/data_loaders/heavy/s3_heavy_to_data_source.py +245 -0
- workbench/core/transforms/data_loaders/light/__init__.py +14 -0
- workbench/core/transforms/data_loaders/light/csv_to_data_source.py +81 -0
- workbench/core/transforms/data_loaders/light/json_to_data_source.py +79 -0
- workbench/core/transforms/data_loaders/light/s3_to_data_source_light.py +106 -0
- workbench/core/transforms/data_to_data/__init__.py +0 -0
- workbench/core/transforms/data_to_data/heavy/__init__.py +0 -0
- workbench/core/transforms/data_to_data/heavy/emr/Readme.md +2 -0
- workbench/core/transforms/data_to_data/heavy/glue/Readme.md +2 -0
- workbench/core/transforms/data_to_data/light/__init__.py +0 -0
- workbench/core/transforms/data_to_data/light/clean_data.py +63 -0
- workbench/core/transforms/data_to_data/light/data_to_data_light.py +68 -0
- workbench/core/transforms/data_to_features/__init__.py +0 -0
- workbench/core/transforms/data_to_features/heavy/__init__.py +0 -0
- workbench/core/transforms/data_to_features/heavy/chunk/__init__.py +0 -0
- workbench/core/transforms/data_to_features/heavy/chunk/data_to_features_chunk.py +120 -0
- workbench/core/transforms/data_to_features/heavy/emr/Readme.md +2 -0
- workbench/core/transforms/data_to_features/heavy/glue/Readme.md +2 -0
- workbench/core/transforms/data_to_features/heavy/storage/data_to_features_heavy_old.py +161 -0
- workbench/core/transforms/data_to_features/light/__init__.py +0 -0
- workbench/core/transforms/data_to_features/light/data_to_features_light.py +83 -0
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +64 -0
- workbench/core/transforms/features_to_features/__init__.py +0 -0
- workbench/core/transforms/features_to_features/heavy/emr/Readme.md +2 -0
- workbench/core/transforms/features_to_features/heavy/glue/Readme.md +2 -0
- workbench/core/transforms/features_to_model/__init__.py +0 -0
- workbench/core/transforms/features_to_model/features_to_model.py +389 -0
- workbench/core/transforms/model_to_endpoint/__init__.py +0 -0
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +120 -0
- workbench/core/transforms/pandas_transforms/__init__.py +18 -0
- workbench/core/transforms/pandas_transforms/data_to_pandas.py +100 -0
- workbench/core/transforms/pandas_transforms/features_to_pandas.py +93 -0
- workbench/core/transforms/pandas_transforms/pandas_to_data.py +214 -0
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +450 -0
- workbench/core/transforms/pandas_transforms/pandas_to_features_chunked.py +111 -0
- workbench/core/transforms/transform.py +147 -0
- workbench/core/views/__init__.py +24 -0
- workbench/core/views/column_subset_view.py +113 -0
- workbench/core/views/computation_view.py +64 -0
- workbench/core/views/create_view.py +52 -0
- workbench/core/views/display_view.py +70 -0
- workbench/core/views/mdq_view.py +142 -0
- workbench/core/views/pandas_to_view.py +158 -0
- workbench/core/views/training_view.py +160 -0
- workbench/core/views/view.py +327 -0
- workbench/core/views/view_utils.py +314 -0
- workbench/model_scripts/custom_models/chem_info/Readme.md +10 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +60 -0
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +62 -0
- workbench/model_scripts/custom_models/chem_info/requirements.txt +4 -0
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +62 -0
- workbench/model_scripts/custom_models/network_security/Readme.md +7 -0
- workbench/model_scripts/custom_script_example/custom_model_script.py +138 -0
- workbench/model_scripts/custom_script_example/requirements.txt +2 -0
- workbench/model_scripts/light_quant_regression/quant_regression.template +297 -0
- workbench/model_scripts/light_quant_regression/requirements.txt +2 -0
- workbench/model_scripts/light_scikit_learn/requirements.txt +3 -0
- workbench/model_scripts/light_scikit_learn/scikit_learn.template +293 -0
- workbench/model_scripts/light_xgb_model/generated_model_script.py +368 -0
- workbench/model_scripts/light_xgb_model/requirements.txt +2 -0
- workbench/model_scripts/light_xgb_model/xgb_model.template +368 -0
- workbench/model_scripts/script_generation.py +190 -0
- workbench/plugins/pages/mdq_page.py +127 -0
- workbench/plugins/web_components/endpoint_turbo.py +106 -0
- workbench/repl/__init__.py +0 -0
- workbench/repl/workbench_shell.py +514 -0
- workbench/resources/open_source_api.key +1 -0
- workbench/resources/signature_verify_pub.pem +6 -0
- workbench/scripts/monitor_cloud_watch.py +349 -0
- workbench/scripts/redis_expire.py +41 -0
- workbench/scripts/redis_report.py +38 -0
- workbench/scripts/show_config.py +20 -0
- workbench/themes/dark/base_css.url +1 -0
- workbench/themes/dark/custom.css +70 -0
- workbench/themes/dark/dark.json +763 -0
- workbench/themes/light/base_css.url +1 -0
- workbench/themes/light/custom.css +80 -0
- workbench/themes/light/light.json +753 -0
- workbench/themes/quartz/base_css.url +1 -0
- workbench/themes/quartz/custom.css +69 -0
- workbench/themes/quartz/quartz.json +728 -0
- workbench/themes/quartz_dark/base_css.url +1 -0
- workbench/themes/quartz_dark/custom.css +71 -0
- workbench/themes/quartz_dark/quartz_dark.json +728 -0
- workbench/utils/__init__.py +0 -0
- workbench/utils/athena_utils.py +133 -0
- workbench/utils/aws_utils.py +678 -0
- workbench/utils/bulk_utils.py +35 -0
- workbench/utils/cache.py +195 -0
- workbench/utils/chem_utils.py +335 -0
- workbench/utils/cloudwatch_handler.py +149 -0
- workbench/utils/config_manager.py +445 -0
- workbench/utils/dashboard_metrics.py +194 -0
- workbench/utils/datetime_utils.py +137 -0
- workbench/utils/deprecated_utils.py +115 -0
- workbench/utils/df_to_endpoint.py +155 -0
- workbench/utils/ecs_info.py +73 -0
- workbench/utils/endpoint_metrics.py +188 -0
- workbench/utils/endpoint_utils.py +177 -0
- workbench/utils/execution_environment.py +211 -0
- workbench/utils/extract_model_artifact.py +242 -0
- workbench/utils/fast_inference.py +65 -0
- workbench/utils/glue_utils.py +68 -0
- workbench/utils/ipython_utils.py +26 -0
- workbench/utils/json_utils.py +50 -0
- workbench/utils/lambda_utils.py +65 -0
- workbench/utils/license_manager.py +177 -0
- workbench/utils/log_utils.py +43 -0
- workbench/utils/markdown_utils.py +42 -0
- workbench/utils/pandas_utils.py +614 -0
- workbench/utils/performance_utils.py +52 -0
- workbench/utils/pipeline_utils.py +72 -0
- workbench/utils/plugin_manager.py +356 -0
- workbench/utils/redis_cache.py +333 -0
- workbench/utils/repl_utils.py +148 -0
- workbench/utils/resource_utils.py +39 -0
- workbench/utils/s3_utils.py +202 -0
- workbench/utils/shapley_values.py +101 -0
- workbench/utils/symbols.py +43 -0
- workbench/utils/test_data_generator.py +314 -0
- workbench/utils/theme_manager.py +241 -0
- workbench/utils/trace_calls.py +65 -0
- workbench/utils/type_abbrev.py +48 -0
- workbench/utils/workbench_cache.py +159 -0
- workbench/utils/workbench_event_bridge.py +115 -0
- workbench/utils/workbench_logging.py +266 -0
- workbench/utils/workbench_sqs.py +67 -0
- workbench/web_interface/components/component_interface.py +228 -0
- workbench/web_interface/components/correlation_matrix.py +158 -0
- workbench/web_interface/components/data_details_markdown.py +251 -0
- workbench/web_interface/components/endpoint_metric_plots.py +85 -0
- workbench/web_interface/components/experiments/color_maps.py +25 -0
- workbench/web_interface/components/experiments/compound_details.py +49 -0
- workbench/web_interface/components/experiments/dashboard_metric_plots.py +85 -0
- workbench/web_interface/components/experiments/data_table.py +65 -0
- workbench/web_interface/components/experiments/graph_test.py +83 -0
- workbench/web_interface/components/experiments/hello.py +50 -0
- workbench/web_interface/components/experiments/histogram.py +47 -0
- workbench/web_interface/components/experiments/line_chart.py +30 -0
- workbench/web_interface/components/experiments/outlier_plot.py +103 -0
- workbench/web_interface/components/experiments/plugin_callbacks.py +60 -0
- workbench/web_interface/components/experiments/scatter_plot.py +65 -0
- workbench/web_interface/components/model_plot.py +65 -0
- workbench/web_interface/components/plugin_interface.py +219 -0
- workbench/web_interface/components/plugin_unit_test.py +149 -0
- workbench/web_interface/components/plugins/ag_table.py +94 -0
- workbench/web_interface/components/plugins/confusion_matrix.py +188 -0
- workbench/web_interface/components/plugins/dashboard_status.py +111 -0
- workbench/web_interface/components/plugins/data_details.py +270 -0
- workbench/web_interface/components/plugins/endpoint_details.py +114 -0
- workbench/web_interface/components/plugins/graph_plot.py +279 -0
- workbench/web_interface/components/plugins/license_details.py +138 -0
- workbench/web_interface/components/plugins/model_details.py +227 -0
- workbench/web_interface/components/plugins/pipeline_details.py +157 -0
- workbench/web_interface/components/plugins/scatter_plot.py +270 -0
- workbench/web_interface/components/regression_plot.py +103 -0
- workbench/web_interface/components/table.py +185 -0
- workbench/web_interface/components/violin_plots.py +149 -0
- workbench/web_interface/page_views/data_sources_page_view.py +126 -0
- workbench/web_interface/page_views/endpoints_page_view.py +82 -0
- workbench/web_interface/page_views/feature_sets_page_view.py +124 -0
- workbench/web_interface/page_views/main_page.py +226 -0
- workbench/web_interface/page_views/models_page_view.py +83 -0
- workbench/web_interface/page_views/page_view.py +16 -0
- workbench/web_interface/page_views/pipelines_page_view.py +80 -0
- workbench-0.8.89.dist-info/LICENSE +21 -0
- workbench-0.8.89.dist-info/METADATA +191 -0
- workbench-0.8.89.dist-info/RECORD +243 -0
- workbench-0.8.89.dist-info/WHEEL +5 -0
- workbench-0.8.89.dist-info/entry_points.txt +4 -0
- workbench-0.8.89.dist-info/top_level.txt +1 -0
workbench/__init__.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Copyright (c) 2021-2024 SuperCowPowers LLC
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Workbench Main Classes
|
|
5
|
+
- Artifacts
|
|
6
|
+
- DataSource
|
|
7
|
+
- FeatureSet
|
|
8
|
+
- Model
|
|
9
|
+
- Endpoint
|
|
10
|
+
- Transforms
|
|
11
|
+
- DataLoaders
|
|
12
|
+
- DataToData
|
|
13
|
+
- DataToFeatures
|
|
14
|
+
- FeaturesToModel
|
|
15
|
+
- ModelToEndpoint
|
|
16
|
+
|
|
17
|
+
For help on particular classes you can do this
|
|
18
|
+
- from workbench.core.transforms.data_loaders.light.json_to_data_source import JSONToDataSource
|
|
19
|
+
- help(JSONToDataSource)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class JSONToDataSource(workbench.core.transforms.transform.Transform)
|
|
23
|
+
| JSONToDataSource(json_file_path: str, data_uuid: str)
|
|
24
|
+
|
|
|
25
|
+
| JSONToDataSource: Class to move local JSON Files into a Workbench DataSource
|
|
26
|
+
|
|
|
27
|
+
| Common Usage:
|
|
28
|
+
| json_to_data = JSONToDataSource(json_file_path, data_uuid)
|
|
29
|
+
| json_to_data.set_output_tags(["abalone", "json", "whatever"])
|
|
30
|
+
| json_to_data.transform()
|
|
31
|
+
"""
|
|
32
|
+
import os
|
|
33
|
+
from importlib.metadata import version
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
__version__ = version("workbench")
|
|
37
|
+
except Exception:
|
|
38
|
+
__version__ = "unknown"
|
|
39
|
+
|
|
40
|
+
# Workbench Logging
|
|
41
|
+
from workbench.utils.workbench_logging import logging_setup
|
|
42
|
+
|
|
43
|
+
# Check the environment variable to decide whether to set up logging
|
|
44
|
+
if os.getenv("WORKBENCH_SKIP_LOGGING", "False").lower() != "true":
|
|
45
|
+
logging_setup()
|
|
File without changes
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
# Algorithms: Dataframe
|
|
2
|
+
- **Dataframes:** These algorithms take Pandas DataFrames as inputs and typically (not always) produce DataFrames as outputs. Depending on the algorithm, they might instead output dictionaries or lists.
|
|
3
|
+
- **Light:** These algorithms are considered **light** algorithms since they are constrained to a single process where the data is held in memory.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Welcome to the Dataframe Algorithm Classes
|
|
2
|
+
|
|
3
|
+
These classes provide functionality for Pandas Dataframes
|
|
4
|
+
|
|
5
|
+
- TBD: TBD
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .feature_space_proximity import FeatureSpaceProximity
|
|
9
|
+
from .residuals_calculator import ResidualsCalculator
|
|
10
|
+
from .dimensionality_reduction import DimensionalityReduction
|
|
11
|
+
|
|
12
|
+
__all__ = ["FeatureSpaceProximity", "ResidualsCalculator", "DimensionalityReduction"]
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Aggregation: Perform Row Aggregation on a DataFrame"""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
# Workbench Logger
|
|
7
|
+
log = logging.getLogger("workbench")


def aggregate(df: pd.DataFrame, group_column: str, features: list = None) -> pd.DataFrame:
    """Group the rows of a DataFrame and average the numeric feature columns

    Args:
        df: Pandas DataFrame (not modified, the aggregation works on a copy)
        group_column: The column to aggregate/group on
        features: List of column names (numeric) to perform aggregation on (default: None).
            When None, every numeric column whose name does not end in "id" is used.
            The group column itself is never treated as a feature.
    Returns:
        Pandas DataFrame with one row per group: the group column, the mean of every
        feature and a "group_count" column with the number of rows in the group.
        If a sanity check fails the input DataFrame is returned as is.
    """

    # If no features are given, identify all numeric columns
    auto_detected = features is None
    if auto_detected:
        features = [
            column
            for column in df.select_dtypes(include="number").columns.tolist()
            if not str(column).endswith("id")  # str() so non-string column labels don't blow up
        ]

    # The group column must not also be a feature: selecting it twice below would
    # create a duplicated column and make the groupby fail (this also avoids
    # mutating a list owned by the caller)
    features = [column for column in features if column != group_column]

    if auto_detected:
        log.info("No features given, auto identifying numeric columns...")
        log.info(f"{features}")

    # Sanity checks
    if group_column not in df.columns:
        log.critical(f"Group column '{group_column}' is missing in the DataFrame")
        return df
    if not all(column in df.columns for column in features):
        log.critical("Some features are missing in the DataFrame")
        return df
    if df.empty:
        log.critical("DataFrame is empty")
        return df

    # Now aggregate the DataFrame
    log.info(f"Aggregating dataframe, averaging {features}...")

    # Subset the DataFrame to only the features and group column
    df = df[features + [group_column]].copy()
    df["group_count"] = 1  # placeholder column, replaced by the group size below

    # Define the aggregation methods for each column
    agg_methods = {feature: "mean" for feature in features}
    agg_methods["group_count"] = "size"

    # Group by the group column and perform the aggregation
    return df.groupby(group_column).agg(agg_methods).reset_index()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test():
    """Smoke test: aggregate a small hand-built DataFrame and print the result"""
    # Print every column on one wide line
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Hand-built sample data, one list per column
    ids = ["id_0", "id_0", "id_2", "id_3", "id_4", "id_5", "id_6", "id_7", "id_8", "id_9"]
    outlier_groups = [
        "sample", "a_low", "sample", "b_high", "sample",
        "c_high", "sample", "d_low", "sample", "e_high",
    ]
    data_df = pd.DataFrame(
        {
            "ID": ids,
            "feat1": [1.0, 1.0, 1.1, 3.0, 4.0, 1.0, 1.0, 1.1, 3.0, 4.0],
            "feat2": [1.0, 1.0, 1.1, 3.0, 4.0, 1.0, 1.0, 1.1, 3.0, 4.0],
            "feat3": [0.1, 0.1, 0.2, 1.6, 2.5, 0.1, 0.1, 0.2, 1.6, 2.5],
            "price": [31, 60, 62, 40, 20, 31, 61, 60, 40, 20],
            "outlier_group": outlier_groups,
        }
    )

    # Aggregate on the outlier group and show the result
    print(aggregate(data_df, group_column="outlier_group"))
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
if __name__ == "__main__":
|
|
99
|
+
test()
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""DataSourceEDA: Provide basic EDA (Exploratory Data Analysis) for a DataFrame"""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
# Local Imports
|
|
7
|
+
from workbench.core.transforms.pandas_transforms.data_to_pandas import DataToPandas
|
|
8
|
+
from workbench.utils import pandas_utils
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DataSourceEDA:
    """Basic EDA (Exploratory Data Analysis) helper working on the DataFrame of a DataSource"""

    def __init__(self, data_source_uuid: str):
        """Pull the DataSource into a DataFrame so the EDA methods can work on it

        Args:
            data_source_uuid (str): Identifier of the DataSource to run the analysis on
        """
        self.log = logging.getLogger("workbench")
        self.data_source_uuid = data_source_uuid

        # Run the DataToPandas transform and keep its output DataFrame
        self.data_to_pandas = DataToPandas(self.data_source_uuid)
        self.log.info(f"Getting DataFrame from {self.data_source_uuid}...")
        self.data_to_pandas.transform()
        self.df = self.data_to_pandas.get_output()

    def get_column_info(self):
        """Return the result of pandas_utils.info() for the DataSource DataFrame"""
        return pandas_utils.info(self.df)

    def get_numeric_stats(self):
        """Return the result of pandas_utils.numeric_stats() for the DataSource DataFrame"""
        return pandas_utils.numeric_stats(self.df)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
if __name__ == "__main__":
|
|
37
|
+
"""Exercise the DataSourceEDA Class"""
|
|
38
|
+
|
|
39
|
+
# Set some pandas options
|
|
40
|
+
pd.set_option("display.max_columns", None)
|
|
41
|
+
pd.set_option("display.width", 1000)
|
|
42
|
+
|
|
43
|
+
# Spin up the class and get the EDA output
|
|
44
|
+
my_eda = DataSourceEDA("abalone_data")
|
|
45
|
+
print(my_eda.get_column_info())
|
|
46
|
+
print(my_eda.get_numeric_stats())
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""DimensionalityReduction: Perform Dimensionality Reduction on a DataFrame"""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import logging
|
|
6
|
+
from sklearn.manifold import TSNE, MDS
|
|
7
|
+
from sklearn.decomposition import PCA
|
|
8
|
+
from sklearn.preprocessing import StandardScaler
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Dimensionality Reduction Class
|
|
12
|
+
class DimensionalityReduction:
    """Project the feature columns of a DataFrame onto a 2D (x, y) plane"""

    # Values accepted by the `projection` argument of fit_transform()
    SUPPORTED_PROJECTIONS = ("TSNE", "MDS", "PCA")

    def __init__(self):
        """DimensionalityReduction: Perform Dimensionality Reduction on a DataFrame"""
        self.log = logging.getLogger("workbench")
        self.projection_model = None
        self.features = None

    def fit_transform(self, df: pd.DataFrame, features: list = None, projection: str = "TSNE") -> pd.DataFrame:
        """Fit and Transform the DataFrame

        Args:
            df: Pandas DataFrame. It is modified in place: NaNs in the feature columns are
                filled with the column mean, the feature columns are standardized and the
                x and y columns are added.
            features: List of feature column names (default: None, which selects every numeric
                column whose name does not end in "id", excluding "group_count")
            projection: The projection model to use (TSNE, MDS or PCA, default: TSNE)
        Returns:
            Pandas DataFrame with new columns x and y. If a sanity check fails the DataFrame
            is returned untouched, without the new columns.
        """

        # If no features are given, identify all numeric columns
        if features is None:
            features = [
                column
                for column in df.select_dtypes(include="number").columns.tolist()
                # str() so non-string column labels don't blow up; also drop group_count if it exists
                if not str(column).endswith("id") and column != "group_count"
            ]
            self.log.info("No features given, auto identifying numeric columns...")
            self.log.info(f"{features}")
        self.features = features

        # Sanity checks
        if not all(column in df.columns for column in self.features):
            self.log.critical("Some features are missing in the DataFrame")
            return df
        if len(self.features) < 2:
            self.log.critical("At least two features are required")
            return df
        if df.empty:
            self.log.critical("DataFrame is empty")
            return df
        if projection not in self.SUPPORTED_PROJECTIONS:
            # Checked before touching the data so a bad argument neither modifies df
            # nor silently reuses a model left over from a previous call
            self.log.critical(f"Unknown projection '{projection}', expected one of {self.SUPPORTED_PROJECTIONS}")
            return df

        # Most projection models will fail if there are any NaNs in the data
        # So we'll fill NaNs with the mean value for that column
        # (assign the result back: an in-place fillna on df[col] is a chained
        #  operation that is not guaranteed to write through to df)
        for col in self.features:
            df[col] = df[col].fillna(df[col].mean())

        # Normalize the features
        scaler = StandardScaler()
        df[self.features] = scaler.fit_transform(df[self.features])

        # Project the multidimensional features onto an x,y plane
        self.log.info("Projecting features onto an x,y plane...")

        # Build the projection model
        if projection == "TSNE":
            # Perplexity is a hyperparameter that controls the number of neighbors used to compute the manifold
            # The number of neighbors should be less than the number of samples
            perplexity = min(40, len(df) - 1)
            self.log.info(f"Perplexity: {perplexity}")
            self.projection_model = TSNE(perplexity=perplexity)
        elif projection == "MDS":
            self.projection_model = MDS(n_components=2, random_state=0)
        elif projection == "PCA":
            self.projection_model = PCA(n_components=2)

        # Fit the projection model and compute the 2D coordinates
        embedding = self.projection_model.fit_transform(df[self.features])

        # Put the projection results back into the given DataFrame
        df["x"] = embedding[:, 0]  # Projection X Column
        df["y"] = embedding[:, 1]  # Projection Y Column

        # Jittering the data to resolve coincident points is currently disabled
        # df = self.resolve_coincident_points(df)

        # Return the DataFrame with the new columns
        return df

    @staticmethod
    def resolve_coincident_points(df: pd.DataFrame):
        """Resolve coincident points in a DataFrame by adding random jitter to x and y

        Args:
            df(pd.DataFrame): The DataFrame to resolve coincident points in (needs x and y
                columns, which are modified in place)
        Returns:
            pd.DataFrame: The DataFrame with resolved coincident points
        """
        # Jitter with a standard deviation of 10% of each axis range
        x_scale = (df["x"].max() - df["x"].min()) * 0.1
        y_scale = (df["y"].max() - df["y"].min()) * 0.1

        # np.random.normal takes (mean, std, size): the jitter is centered on zero so
        # the points are spread around their position instead of all being shifted
        df["x"] += np.random.normal(0.0, x_scale, len(df))
        df["y"] += np.random.normal(0.0, y_scale, len(df))
        return df
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def test():
    """Smoke test: run a TSNE projection on a small hand-built DataFrame"""
    # Print every column on one wide line
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Hand-built sample data, one list per column
    ids = ["id_0", "id_0", "id_2", "id_3", "id_4", "id_5", "id_6", "id_7", "id_8", "id_9"]
    data_df = pd.DataFrame(
        {
            "ID": ids,
            "feat1": [1.0, 1.0, 1.1, 3.0, 4.0, 1.0, 1.0, 1.1, 3.0, 4.0],
            "feat2": [1.0, 1.0, 1.1, 3.0, 4.0, 1.0, 1.0, 1.1, 3.0, 4.0],
            "feat3": [0.1, 0.1, 0.2, 1.6, 2.5, 0.1, 0.1, 0.2, 1.6, 2.5],
            "price": [31, 60, 62, 40, 20, 31, 61, 60, 40, 20],
        }
    )
    feature_columns = ["feat1", "feat2", "feat3"]

    # Run the dimensionality reduction
    reducer = DimensionalityReduction()
    new_df = reducer.fit_transform(data_df, features=feature_columns, projection="TSNE")

    # Both projection columns must have been added
    for axis in ("x", "y"):
        assert axis in new_df.columns

    # Output the DataFrame
    print(new_df)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
if __name__ == "__main__":
|
|
145
|
+
test()
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""FeatureResolution: Report on Feature Space Resolution Issues"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Union
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.neighbors import KNeighborsRegressor
|
|
7
|
+
from sklearn.preprocessing import StandardScaler
|
|
8
|
+
|
|
9
|
+
# Workbench Imports
|
|
10
|
+
from workbench.utils.pandas_utils import DataFrameBuilder
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Feature Resolution Class
|
|
14
|
+
class FeatureResolution:
|
|
15
|
+
def __init__(
|
|
16
|
+
self,
|
|
17
|
+
input_df: pd.DataFrame,
|
|
18
|
+
features: list,
|
|
19
|
+
target_column: str,
|
|
20
|
+
id_column: str,
|
|
21
|
+
distance_metric: str = "minkowski",
|
|
22
|
+
):
|
|
23
|
+
"""Initialize the FeatureResolution object
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
distance_metric: Distance metric to use (default: "minkowski")
|
|
27
|
+
"""
|
|
28
|
+
self.log = logging.getLogger("workbench")
|
|
29
|
+
self.df = input_df.copy().reset_index(drop=True)
|
|
30
|
+
self.features = features
|
|
31
|
+
self.target_column = target_column
|
|
32
|
+
self.id_column = id_column
|
|
33
|
+
self.n_neighbors = 10
|
|
34
|
+
self.scalar = StandardScaler()
|
|
35
|
+
self.knn = KNeighborsRegressor(metric=distance_metric, n_neighbors=self.n_neighbors, weights="distance")
|
|
36
|
+
self.dataframe_builder = DataFrameBuilder()
|
|
37
|
+
self.recursive_df_list = []
|
|
38
|
+
|
|
39
|
+
def compute(
|
|
40
|
+
self, within_distance: float, min_target_difference: float, output_columns: list = [], verbose=True
|
|
41
|
+
) -> Union[pd.DataFrame, None]:
|
|
42
|
+
"""FeatureResolution: Compute Feature Space to Target Resolution and Report Issues
|
|
43
|
+
|
|
44
|
+
Args:
|
|
45
|
+
within_distance: Features within this distance should have similar target values
|
|
46
|
+
min_target_difference: Minimum target difference to consider
|
|
47
|
+
output_columns: List of additional columns to output (default: []])
|
|
48
|
+
verbose: Whether to print out the resolution issues (default: True)
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
Pandas DataFrame of Feature Space to Target Resolution Issues
|
|
52
|
+
Includes any additional output_columns if specified
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
# Check for expected columns
|
|
56
|
+
for column in [self.target_column] + self.features:
|
|
57
|
+
if column not in self.df.columns:
|
|
58
|
+
self.log.error(f"DataFrame does not have required {column} Column!")
|
|
59
|
+
return
|
|
60
|
+
|
|
61
|
+
# Set up the output columns (add id and target columns if they are not already included)
|
|
62
|
+
output_columns = list(set(output_columns).union({self.id_column, self.target_column}))
|
|
63
|
+
|
|
64
|
+
# Check the output columns
|
|
65
|
+
if output_columns is not None:
|
|
66
|
+
for column in output_columns:
|
|
67
|
+
if column not in self.df.columns:
|
|
68
|
+
self.log.error(f"DataFrame does not have required {column} Column!")
|
|
69
|
+
return
|
|
70
|
+
|
|
71
|
+
# Check for NaNs in the features and log the percentage
|
|
72
|
+
for feature in self.features:
|
|
73
|
+
nan_count = self.df[feature].isna().sum()
|
|
74
|
+
if nan_count > 0:
|
|
75
|
+
print(f"Feature '{feature}' has {nan_count} NaNs ({nan_count / len(self.df) * 100:.2f}%).")
|
|
76
|
+
|
|
77
|
+
# Remove and NaNs or INFs in the features
|
|
78
|
+
self.log.info(f"Dataframe Shape before NaN/INF removal {self.df.shape}")
|
|
79
|
+
self.df = self.df.replace([float("inf"), float("-inf")], pd.NA).dropna().reset_index(drop=True)
|
|
80
|
+
self.log.info(f"Dataframe Shape after NaN/INF removal {self.df.shape}")
|
|
81
|
+
|
|
82
|
+
# Standardize the features
|
|
83
|
+
X = self.scalar.fit_transform(self.df[self.features])
|
|
84
|
+
y = self.df[self.target_column]
|
|
85
|
+
|
|
86
|
+
# Fit the KNN model
|
|
87
|
+
self.knn.fit(X, y)
|
|
88
|
+
|
|
89
|
+
# Compute the feature space to target resolution to the nearest neighbors
|
|
90
|
+
output_count = 0
|
|
91
|
+
for my_index, row in enumerate(X):
|
|
92
|
+
# Find the nearest neighbors
|
|
93
|
+
distances, indices = self.knn.kneighbors([row])
|
|
94
|
+
distances = distances[0] # Returns a list within a list so grab the inner list
|
|
95
|
+
indices = indices[0]
|
|
96
|
+
target_values = y[indices]
|
|
97
|
+
|
|
98
|
+
# Grab the info for this observation
|
|
99
|
+
my_id = self.df.iloc[my_index][self.id_column]
|
|
100
|
+
my_output_data = self.df.iloc[my_index][output_columns]
|
|
101
|
+
my_target = y[my_index]
|
|
102
|
+
|
|
103
|
+
# Loop through the neighbors
|
|
104
|
+
for n_index, n_distance, n_target in zip(indices, distances, target_values):
|
|
105
|
+
# Skip myself
|
|
106
|
+
if n_index == my_index:
|
|
107
|
+
continue
|
|
108
|
+
|
|
109
|
+
# Compute the difference in feature space and target space
|
|
110
|
+
feature_diff = n_distance
|
|
111
|
+
target_diff = abs(my_target - n_target)
|
|
112
|
+
|
|
113
|
+
# Compute target differences `within_distance` feature space
|
|
114
|
+
if feature_diff <= within_distance and target_diff >= min_target_difference:
|
|
115
|
+
# Gather info about the neighbor
|
|
116
|
+
neighbor_id = self.df.iloc[n_index][self.id_column]
|
|
117
|
+
neighbor_output_data = self.df.iloc[n_index][output_columns]
|
|
118
|
+
|
|
119
|
+
# Add to the output DataFrame
|
|
120
|
+
row_data = my_output_data.to_dict()
|
|
121
|
+
row_data["feature_diff"] = feature_diff
|
|
122
|
+
row_data["target_diff"] = target_diff
|
|
123
|
+
row_data["n_id"] = neighbor_id
|
|
124
|
+
self.dataframe_builder.add_row(row_data)
|
|
125
|
+
|
|
126
|
+
# Print out the resolution issue (if verbose)
|
|
127
|
+
if verbose:
|
|
128
|
+
print(f"{output_count} Feature Diff: {feature_diff} Target Diff: {target_diff}")
|
|
129
|
+
print(f"\t{my_id}: {my_target:.3f} {list(my_output_data)}")
|
|
130
|
+
print(f"\t{neighbor_id}: {n_target:.3f} {list(neighbor_output_data)}")
|
|
131
|
+
# Increment the output count
|
|
132
|
+
output_count += 1
|
|
133
|
+
|
|
134
|
+
# Return the output DataFrame
|
|
135
|
+
return self.dataframe_builder.build()
|
|
136
|
+
|
|
137
|
+
def recursive_compute(
    self, within_distance: float, min_target_difference: float, output_columns: list = None, verbose=True
) -> pd.DataFrame:
    """Compute Feature Resolution Issues, remove the issues, and repeat until no issues are found

    Each pass calls `compute()`, stores its result on `self.recursive_df_list`, drops every
    row of `self.df` whose ID shows up in the result (either as the observation or as the
    neighbor in `n_id`), and then runs again on the reduced DataFrame.

    Args:
        within_distance (float): Feature-space distance threshold, passed through to `compute()`
        min_target_difference (float): Target difference threshold, passed through to `compute()`
        output_columns (list): Extra columns to carry into the output (default: None, treated as [])
        verbose (bool): Passed through to `compute()` (default: True)

    Returns:
        pd.DataFrame: Concatenation of every DataFrame held in `self.recursive_df_list`

    Note:
        This method mutates `self.df` (rows are removed and the index is reset) and replaces
        `self.dataframe_builder` between passes. `self.recursive_df_list` is not cleared here,
        so anything already in that list is included in the returned DataFrame.
    """
    # Avoid a shared mutable default: a fresh list is created for every call
    output_columns = [] if output_columns is None else output_columns

    # Iterate instead of recursing: identical sequence of operations, no stack growth per pass
    while True:
        # Compute the resolution issues for the current contents of self.df
        resolution_df = self.compute(within_distance, min_target_difference, output_columns, verbose)
        self.recursive_df_list.append(resolution_df)

        # If there are no resolution issues, return the combined DataFrame
        if len(resolution_df) == 0:
            return pd.concat(self.recursive_df_list)

        # Gather all IDs to be removed (both sides of each flagged pair)
        ids_to_remove = set(resolution_df[self.id_column]) | set(resolution_df["n_id"])

        # Remove the rows of the observations that had issues
        print("Removing IDs: ", ids_to_remove)
        self.df = self.df[~self.df[self.id_column].isin(ids_to_remove)]

        # Prepare for the next pass: positional index again, and a fresh builder so
        # rows from this pass are not emitted a second time
        print("Recursing...")
        self.df = self.df.reset_index(drop=True)
        self.dataframe_builder = DataFrameBuilder()
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# Test the FeatureResolution Class
|
|
165
|
+
def simple_unit_test():
    """Smoke test for the FeatureResolution Class using a small hand-made DataFrame"""
    # Show every column and use a wide console when DataFrames are printed
    for option_name, option_value in (("display.max_columns", None), ("display.width", 1000)):
        pd.set_option(option_name, option_value)

    # Ten fake observations: the second five repeat the feature values of the
    # first five (feat3 of id_6 aside) but carry different prices
    fake_rows = pd.DataFrame(
        {
            "ID": [f"id_{i}" for i in range(10)],
            "feat1": [1.0, 1.0, 1.1, 3.0, 4.0, 1.0, 1.0, 1.1, 3.0, 4.0],
            "feat2": [1.0, 1.0, 1.1, 3.0, 4.0, 1.0, 1.0, 1.1, 3.0, 4.0],
            "feat3": [0.1, 0.2, 0.2, 1.6, 2.5, 0.1, 0.3, 0.2, 1.6, 2.5],
            "price": [10, 11, 12, 40, 20, 35, 61, 60, 40, 20],
        }
    )

    # Build the class on the fake data and run the report
    feature_names = ["feat1", "feat2", "feat3"]
    checker = FeatureResolution(fake_rows, features=feature_names, target_column="price", id_column="ID")
    checker.compute(within_distance=0.1, min_target_difference=10)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def unit_test():
    """Run FeatureResolution.compute() on the "aqsol_mol_descriptors" FeatureSet and print the result"""
    from workbench.api.feature_set import FeatureSet
    from workbench.api.model import Model

    # Pull the test data out of the FeatureSet
    feature_set = FeatureSet("aqsol_mol_descriptors")
    aqsol_df = feature_set.pull_dataframe()

    # The model tells us which columns are the target and the features
    model = Model("aqsol-mol-regression")
    target_name = model.target()
    feature_names = model.features()

    # Build the class, run the report, and show what came back
    checker = FeatureResolution(
        aqsol_df,
        features=feature_names,
        target_column=target_name,
        id_column=feature_set.id_column,
    )
    issues_df = checker.compute(within_distance=0.01, min_target_difference=1.0)
    print(issues_df)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def recursive_test():
    """Run FeatureResolution.recursive_compute() on the "aqsol_mol_descriptors" FeatureSet and print the result"""
    from workbench.api.feature_set import FeatureSet
    from workbench.api.model import Model

    # Pull the test data out of the FeatureSet
    feature_set = FeatureSet("aqsol_mol_descriptors")
    aqsol_df = feature_set.pull_dataframe()

    # The model tells us which columns are the target and the features
    model = Model("aqsol-mol-regression")
    target_name = model.target()
    feature_names = model.features()

    # Build the class, run the repeated report, and show the combined output
    checker = FeatureResolution(
        aqsol_df,
        features=feature_names,
        target_column=target_name,
        id_column=feature_set.id_column,
    )
    combined_df = checker.recursive_compute(within_distance=0.01, min_target_difference=1.0)
    print(combined_df)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
if __name__ == "__main__":
    # Run the local smoke test first, then the FeatureSet/Model backed test
    for _test_function in (simple_unit_test, unit_test):
        _test_function()
    # recursive_test()
|