pandas-survey-toolkit 1.0.4__py3-none-any.whl → 1.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pandas_survey_toolkit/analytics.py +151 -113
- pandas_survey_toolkit/nlp.py +997 -824
- pandas_survey_toolkit/utils.py +120 -88
- pandas_survey_toolkit/vis.py +198 -760
- {pandas_survey_toolkit-1.0.4.dist-info → pandas_survey_toolkit-1.0.10.dist-info}/METADATA +77 -73
- pandas_survey_toolkit-1.0.10.dist-info/RECORD +10 -0
- {pandas_survey_toolkit-1.0.4.dist-info → pandas_survey_toolkit-1.0.10.dist-info}/WHEEL +1 -1
- {pandas_survey_toolkit-1.0.4.dist-info → pandas_survey_toolkit-1.0.10.dist-info}/licenses/LICENSE +21 -21
- pandas_survey_toolkit-1.0.4.dist-info/RECORD +0 -10
- {pandas_survey_toolkit-1.0.4.dist-info → pandas_survey_toolkit-1.0.10.dist-info}/top_level.txt +0 -0
@@ -1,113 +1,151 @@
|
|
1
|
-
import warnings
|
2
|
-
from typing import List, Union
|
3
|
-
|
4
|
-
import numpy as np
|
5
|
-
import pandas as pd
|
6
|
-
import pandas_flavor as pf
|
7
|
-
import umap
|
8
|
-
from sentence_transformers import SentenceTransformer
|
9
|
-
from sklearn.cluster import HDBSCAN
|
10
|
-
from sklearn.preprocessing import StandardScaler
|
11
|
-
|
12
|
-
from pandas_survey_toolkit.utils import combine_results, create_masked_df
|
13
|
-
|
14
|
-
|
15
|
-
@pf.register_dataframe_method
|
16
|
-
def fit_umap(
|
17
|
-
df, input_columns: Union[List[str], str], output_columns=["umap_x", "umap_y"], target_y:str=None, embeddings_in_list=False, **kwargs
|
18
|
-
):
|
19
|
-
"""
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
masked_df
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
1
|
+
import warnings
|
2
|
+
from typing import List, Union
|
3
|
+
|
4
|
+
import numpy as np
|
5
|
+
import pandas as pd
|
6
|
+
import pandas_flavor as pf
|
7
|
+
import umap
|
8
|
+
from sentence_transformers import SentenceTransformer
|
9
|
+
from sklearn.cluster import HDBSCAN
|
10
|
+
from sklearn.preprocessing import StandardScaler
|
11
|
+
|
12
|
+
from pandas_survey_toolkit.utils import combine_results, create_masked_df
|
13
|
+
|
14
|
+
|
15
|
+
@pf.register_dataframe_method
def fit_umap(
    df, input_columns: Union[List[str], str], output_columns=["umap_x", "umap_y"], target_y:str=None, embeddings_in_list=False, **kwargs
):
    """Apply UMAP to the columns in the dataframe.

    This function applies UMAP dimensionality reduction to the specified columns
    and appends the x and y coordinates to the dataframe as new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The input dataframe to transform.
    input_columns : Union[List[str], str]
        Column name(s) containing the data to reduce. A single string is
        treated as a one-element list.
    output_columns : list, optional
        Names for the output coordinate columns, by default ["umap_x", "umap_y"].
        Only the first two names are filled, from the first two UMAP components.
    target_y : str, optional
        Name of a column to use as the target variable for supervised UMAP, by default None
    embeddings_in_list : bool, optional
        Set to True if embeddings are a list of values in a single column,
        False if each column is a separate dimension, by default False
    **kwargs
        Additional arguments to pass to UMAP. Most important is n_neighbors (default is 15).
        n_neighbors is capped at max(2, number_of_fitted_rows - 1); a warning
        is emitted when the requested value is changed.

    Returns
    -------
    pandas.DataFrame
        The result of combine_results(df, masked_df, mask, output_columns),
        i.e. the input dataframe with added UMAP coordinate columns.

    Raises
    ------
    KeyError
        If the specified target_y is not a column in the dataframe.
    ValueError
        If embeddings_in_list is True but multiple input columns are provided.
    """
    # NOTE: the list default for output_columns is only read, never mutated,
    # so sharing it between calls is safe.

    if isinstance(input_columns, str):
        input_columns = [input_columns]  # ensure consistent handling in code

    # Decide "supervised" once, using the same test everywhere. The previous
    # mix of `if target_y:` and `if target_y is not None:` let a falsy but
    # non-None name (e.g. "" or 0) skip validation and masking while still
    # being used as the fit target.
    supervised = target_y is not None

    columns_to_mask = input_columns
    if supervised:
        if target_y not in df.columns:
            raise KeyError(f"Your target_y value {target_y} should be the name of a column in the dataframe.")
        columns_to_mask = input_columns + [target_y]

    # Validate cheap argument constraints before doing any dataframe work.
    if embeddings_in_list and len(input_columns) > 1:
        raise ValueError("If your embeddings are in a list, they should be in a single column.")

    # Propagate NaN: presumably create_masked_df keeps only the rows usable for
    # fitting and returns the mask needed to align results back - see utils.
    masked_df, mask = create_masked_df(df, columns_to_mask)

    if embeddings_in_list:
        # One column whose cells are sequences -> stack into a 2-D array.
        embedding_data = np.array(masked_df[input_columns[0]].tolist())
    else:
        embedding_data = masked_df[input_columns].values

    # Adjust n_neighbors if the dataset is too small
    original_n_neighbors = kwargs.get('n_neighbors', 15)
    adjusted_n_neighbors = min(original_n_neighbors, max(2, embedding_data.shape[0] - 1))

    if adjusted_n_neighbors != original_n_neighbors:
        warnings.warn(f"n_neighbors adjusted from {original_n_neighbors} to {adjusted_n_neighbors} due to small dataset size.")

    kwargs['n_neighbors'] = adjusted_n_neighbors

    reducer = umap.UMAP(**kwargs)

    # Keep the column name and the target values in separate variables rather
    # than rebinding the `target_y` parameter to an array.
    target_values = masked_df[target_y].values if supervised else None

    umap_coordinates = reducer.fit_transform(embedding_data, target_values)

    # Append UMAP coordinates to DataFrame.
    # NOTE: only components 0 and 1 are stored, even if n_components > 2 was
    # passed through kwargs.
    masked_df[output_columns[0]] = umap_coordinates[:, 0]
    masked_df[output_columns[1]] = umap_coordinates[:, 1]

    return combine_results(df, masked_df, mask, output_columns)
|
92
|
+
|
93
|
+
@pf.register_dataframe_method
def fit_cluster_hdbscan(df, input_columns=['umap_x', 'umap_y'], output_columns=["cluster", "cluster_probability"], min_cluster_size=5, min_samples=None,
                        cluster_selection_epsilon=0.0, metric='euclidean', cluster_selection_method='eom',
                        allow_single_cluster=False):
    """Apply HDBSCAN clustering to the specified columns of the DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame.
    input_columns : list or str, optional
        Column name(s) to use for clustering, by default ['umap_x', 'umap_y'].
        A single string is treated as a one-element list, matching fit_umap.
    output_columns : list, optional
        Names for the output columns, by default ["cluster", "cluster_probability"].
        The first name receives the labels, the second the probabilities.
    min_cluster_size : int, optional
        The minimum size of clusters, by default 5
    min_samples : int, optional
        The number of samples in a neighborhood for a point to be considered a core point, by default None
    cluster_selection_epsilon : float, optional
        A distance threshold. Clusters below this value will be merged.
        Higher epsilon means fewer, larger clusters, by default 0.0
    metric : str, optional
        The metric to use for distance computation, by default 'euclidean'
    cluster_selection_method : str, optional
        The method to select clusters. Either 'eom' or 'leaf', by default 'eom'
    allow_single_cluster : bool, optional
        Whether to allow a single cluster, by default False

    Returns
    -------
    pandas.DataFrame
        The result of combine_results(df, masked_df, mask, output_columns),
        i.e. the input DataFrame with additional columns containing cluster
        labels and probabilities.
    """
    # NOTE: the list defaults are only read, never mutated, so sharing them
    # between calls is safe.

    # A bare column name would otherwise select a Series and yield a 1-D
    # array, which is not a valid samples-by-features matrix for HDBSCAN.
    if isinstance(input_columns, str):
        input_columns = [input_columns]

    # Presumably restricts to rows usable for clustering and returns the mask
    # needed to align results back onto df - see utils.create_masked_df.
    masked_df, mask = create_masked_df(df, input_columns)

    # Extract the specified columns for clustering
    X = masked_df[input_columns].values

    # Initialize and fit HDBSCAN
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=cluster_selection_epsilon,
        metric=metric,
        cluster_selection_method=cluster_selection_method,
        allow_single_cluster=allow_single_cluster,
    )
    cluster_labels = clusterer.fit_predict(X)

    label_column, probability_column = output_columns[0], output_columns[1]

    # Add cluster labels to the DataFrame
    masked_df[label_column] = cluster_labels

    # Add cluster probabilities to the DataFrame (populated by the fit above)
    masked_df[probability_column] = clusterer.probabilities_

    return combine_results(df, masked_df, mask, output_columns)
|
150
|
+
|
151
|
+
|