pandas-survey-toolkit 1.0.4__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,113 +1,151 @@
1
- import warnings
2
- from typing import List, Union
3
-
4
- import numpy as np
5
- import pandas as pd
6
- import pandas_flavor as pf
7
- import umap
8
- from sentence_transformers import SentenceTransformer
9
- from sklearn.cluster import HDBSCAN
10
- from sklearn.preprocessing import StandardScaler
11
-
12
- from pandas_survey_toolkit.utils import combine_results, create_masked_df
13
-
14
-
15
@pf.register_dataframe_method
def fit_umap(
    df,
    input_columns: Union[List[str], str],
    output_columns=["umap_x", "umap_y"],
    target_y: str = None,
    embeddings_in_list=False,
    **kwargs
):
    """Apply UMAP to columns of the dataframe and append the x and y co-ordinates.

    Parameters:
    df (pandas.DataFrame): The input dataframe.
    input_columns (list or str): Column name(s) holding the data to reduce. A single
        name is wrapped in a list.
    output_columns (list): Names of the two coordinate columns to add.
        Default is ["umap_x", "umap_y"]. Only the first two names are written.
    target_y (str): Optional name of a column passed to UMAP as the target (y)
        for a supervised fit. Default is None (unsupervised).
    embeddings_in_list (bool): Set to True if the embeddings are a list of values in a
        single column; otherwise each input column is treated as one dimension.
    **kwargs: Passed to umap.UMAP. The most important is n_neighbors (default 15 here,
        note american spelling); it is lowered with a warning for small datasets.

    Returns: modified dataframe

    Raises:
    KeyError: if target_y is given but is not a column of the dataframe.
    ValueError: if embeddings_in_list is True and more than one input column is given.
    """
    if isinstance(input_columns, str):
        input_columns = [input_columns]  # ensure consistent handling in code

    # One explicit "is not None" decision drives validation, masking and the fit.
    # The previous truthiness test skipped validation/masking for falsy labels
    # (e.g. "" or 0) even though the later lookup still used them.
    supervised = target_y is not None

    columns_to_mask = list(input_columns)
    if supervised:
        if target_y not in df.columns:
            raise KeyError(f"Your target_y value {target_y} should be the name of a column in the dataframe.")
        columns_to_mask.append(target_y)

    # NOTE(review): create_masked_df is a project helper; presumably it returns the
    # rows without NaN in these columns plus the mask used - confirm in utils.
    masked_df, mask = create_masked_df(df, columns_to_mask)

    if embeddings_in_list:
        if len(input_columns) > 1:
            raise ValueError("If your embeddings are in a list, they should be in a single column.")
        # Each cell holds one embedding; stack them into a 2-D array.
        embedding_data = np.array(masked_df[input_columns[0]].tolist())
    else:
        embedding_data = masked_df[input_columns].values

    # Adjust n_neighbors if the dataset is too small
    original_n_neighbors = kwargs.get('n_neighbors', 15)
    adjusted_n_neighbors = min(original_n_neighbors, max(2, embedding_data.shape[0] - 1))
    if adjusted_n_neighbors != original_n_neighbors:
        warnings.warn(f"n_neighbors adjusted from {original_n_neighbors} to {adjusted_n_neighbors} due to small dataset size.")
    kwargs['n_neighbors'] = adjusted_n_neighbors

    reducer = umap.UMAP(**kwargs)
    # Keep the column name and the target values in separate variables.
    target_values = masked_df[target_y].values if supervised else None
    umap_coordinates = reducer.fit_transform(embedding_data, target_values)

    # Append UMAP coordinates to DataFrame
    masked_df[output_columns[0]] = umap_coordinates[:, 0]
    masked_df[output_columns[1]] = umap_coordinates[:, 1]

    return combine_results(df, masked_df, mask, output_columns)
67
-
68
@pf.register_dataframe_method
def fit_cluster_hdbscan(df, input_columns=['umap_x', 'umap_y'], output_columns=["cluster", "cluster_probability"], min_cluster_size=5, min_samples=None,
                        cluster_selection_epsilon=0.0, metric='euclidean', cluster_selection_method='eom',
                        allow_single_cluster=False):
    """
    Apply HDBSCAN clustering to the specified columns of the DataFrame.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    input_columns (list or str): Column name(s) to use for clustering. Default is ['umap_x', 'umap_y'].
        A single name is wrapped in a list.
    output_columns (list): Names of the two columns to add: the cluster label first, then the
        cluster membership probability. Default is ["cluster", "cluster_probability"].
    min_cluster_size (int): The minimum size of clusters. Default is 5.
    min_samples (int): The number of samples in a neighborhood for a point to be considered a core point. Default is None.
    cluster_selection_epsilon (float): A distance threshold. Clusters below this value will be merged. Default is 0.0.
        Higher epsilon = fewer, larger clusters.
    metric (str): The metric to use for distance computation. Default is 'euclidean'.
    cluster_selection_method (str): The method to select clusters. Either 'eom' or 'leaf'. Default is 'eom'.
    allow_single_cluster (bool): Whether to allow a single cluster. Default is False.

    Returns:
    pandas.DataFrame: The input DataFrame with two additional columns containing the cluster labels
    and the cluster probabilities.
    """
    # Accept a bare column name, mirroring fit_umap; a 1-D selection would be
    # rejected by the estimator, which expects a 2-D feature array.
    if isinstance(input_columns, str):
        input_columns = [input_columns]

    # NOTE(review): create_masked_df is a project helper; presumably it returns the
    # rows without NaN in these columns plus the mask used - confirm in utils.
    masked_df, mask = create_masked_df(df, input_columns)

    # Extract the specified columns for clustering
    X = masked_df[input_columns].values

    # Initialize and fit HDBSCAN
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=cluster_selection_epsilon,
        metric=metric,
        cluster_selection_method=cluster_selection_method,
        allow_single_cluster=allow_single_cluster,
    )
    cluster_labels = clusterer.fit_predict(X)

    # Add cluster labels, then the per-row probabilities exposed by the fitted estimator
    masked_df[output_columns[0]] = cluster_labels
    masked_df[output_columns[1]] = clusterer.probabilities_

    return combine_results(df, masked_df, mask, output_columns)
112
-
113
-
1
+ import warnings
2
+ from typing import List, Union
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pandas_flavor as pf
7
+ import umap
8
+ from sentence_transformers import SentenceTransformer
9
+ from sklearn.cluster import HDBSCAN
10
+ from sklearn.preprocessing import StandardScaler
11
+
12
+ from pandas_survey_toolkit.utils import combine_results, create_masked_df
13
+
14
+
15
@pf.register_dataframe_method
def fit_umap(
    df,
    input_columns: Union[List[str], str],
    output_columns=["umap_x", "umap_y"],
    target_y: str = None,
    embeddings_in_list=False,
    **kwargs
):
    """Apply UMAP to the columns in the dataframe.

    This function applies UMAP dimensionality reduction to the specified columns
    and appends the x and y coordinates to the dataframe as new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The input dataframe to transform.
    input_columns : Union[List[str], str]
        Column name(s) containing the data to reduce. A single name is
        wrapped in a list.
    output_columns : list, optional
        Names for the output coordinate columns, by default ["umap_x", "umap_y"].
        Only the first two names are written: the first and second UMAP
        coordinate respectively.
    target_y : str, optional
        Name of a column to use as the target variable for supervised UMAP,
        by default None (unsupervised). Any value other than None is treated
        as a column label and validated.
    embeddings_in_list : bool, optional
        Set to True if embeddings are a list of values in a single column,
        False if each column is a separate dimension, by default False
    **kwargs
        Additional arguments to pass to UMAP. Most important is n_neighbors
        (15 is assumed when not given). It is capped at ``max(2, n_rows - 1)``
        and a warning is emitted when the cap changes the value.

    Returns
    -------
    pandas.DataFrame
        The input dataframe with added UMAP coordinate columns.

    Raises
    ------
    KeyError
        If the specified target_y is not a column in the dataframe.
    ValueError
        If embeddings_in_list is True but multiple input columns are provided.
    """
    if isinstance(input_columns, str):
        input_columns = [input_columns]  # ensure consistent handling in code

    # Decide once, with an explicit None test, whether this is a supervised fit.
    # The earlier truthiness check skipped validation and NaN masking for falsy
    # labels (e.g. "" or 0) while the later `is not None` check still used them.
    supervised = target_y is not None

    columns_to_mask = list(input_columns)
    if supervised:
        if target_y not in df.columns:
            raise KeyError(f"Your target_y value {target_y} should be the name of a column in the dataframe.")
        columns_to_mask.append(target_y)

    # NOTE(review): create_masked_df is a project helper; presumably it returns
    # the rows without NaN in these columns plus the mask used - confirm in utils.
    masked_df, mask = create_masked_df(df, columns_to_mask)

    if embeddings_in_list:
        if len(input_columns) > 1:
            raise ValueError("If your embeddings are in a list, they should be in a single column.")
        # Each cell holds one embedding; stack them into a 2-D array.
        embedding_data = np.array(masked_df[input_columns[0]].tolist())
    else:
        embedding_data = masked_df[input_columns].values

    # Adjust n_neighbors if the dataset is too small
    original_n_neighbors = kwargs.get('n_neighbors', 15)
    adjusted_n_neighbors = min(original_n_neighbors, max(2, embedding_data.shape[0] - 1))
    if adjusted_n_neighbors != original_n_neighbors:
        warnings.warn(f"n_neighbors adjusted from {original_n_neighbors} to {adjusted_n_neighbors} due to small dataset size.")
    kwargs['n_neighbors'] = adjusted_n_neighbors

    reducer = umap.UMAP(**kwargs)
    # Keep the column label and the target values in separate variables
    # instead of rebinding the `target_y` parameter.
    target_values = masked_df[target_y].values if supervised else None
    umap_coordinates = reducer.fit_transform(embedding_data, target_values)

    # Append UMAP coordinates to DataFrame
    masked_df[output_columns[0]] = umap_coordinates[:, 0]
    masked_df[output_columns[1]] = umap_coordinates[:, 1]

    return combine_results(df, masked_df, mask, output_columns)
92
+
93
@pf.register_dataframe_method
def fit_cluster_hdbscan(df, input_columns=['umap_x', 'umap_y'], output_columns=["cluster", "cluster_probability"], min_cluster_size=5, min_samples=None,
                        cluster_selection_epsilon=0.0, metric='euclidean', cluster_selection_method='eom',
                        allow_single_cluster=False):
    """Apply HDBSCAN clustering to the specified columns of the DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame.
    input_columns : list or str, optional
        Column name(s) to use for clustering, by default ['umap_x', 'umap_y'].
        A single name is wrapped in a list.
    output_columns : list, optional
        Names for the output columns, by default ["cluster", "cluster_probability"].
        The first name receives the cluster label, the second the probability.
    min_cluster_size : int, optional
        The minimum size of clusters, by default 5
    min_samples : int, optional
        The number of samples in a neighborhood for a point to be considered a core point, by default None
    cluster_selection_epsilon : float, optional
        A distance threshold. Clusters below this value will be merged.
        Higher epsilon means fewer, larger clusters, by default 0.0
    metric : str, optional
        The metric to use for distance computation, by default 'euclidean'
    cluster_selection_method : str, optional
        The method to select clusters. Either 'eom' or 'leaf', by default 'eom'
    allow_single_cluster : bool, optional
        Whether to allow a single cluster, by default False

    Returns
    -------
    pandas.DataFrame
        The input DataFrame with additional columns containing cluster labels and probabilities.
    """
    # Accept a bare column name, mirroring fit_umap; selecting with a plain
    # string would yield a 1-D array, which the estimator does not accept.
    if isinstance(input_columns, str):
        input_columns = [input_columns]

    # NOTE(review): create_masked_df is a project helper; presumably it returns
    # the rows without NaN in these columns plus the mask used - confirm in utils.
    masked_df, mask = create_masked_df(df, input_columns)

    # Extract the specified columns for clustering
    features = masked_df[input_columns].values

    # Initialize and fit HDBSCAN
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=cluster_selection_epsilon,
        metric=metric,
        cluster_selection_method=cluster_selection_method,
        allow_single_cluster=allow_single_cluster,
    )
    cluster_labels = clusterer.fit_predict(features)

    # Add cluster labels, then the per-row probabilities exposed by the fitted estimator
    masked_df[output_columns[0]] = cluster_labels
    masked_df[output_columns[1]] = clusterer.probabilities_

    return combine_results(df, masked_df, mask, output_columns)
150
+
151
+