pandas-survey-toolkit 1.0.4__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pandas_survey_toolkit/analytics.py +151 -113
- pandas_survey_toolkit/nlp.py +997 -824
- pandas_survey_toolkit/utils.py +120 -88
- pandas_survey_toolkit/vis.py +198 -760
- {pandas_survey_toolkit-1.0.4.dist-info → pandas_survey_toolkit-1.0.9.dist-info}/METADATA +76 -73
- pandas_survey_toolkit-1.0.9.dist-info/RECORD +10 -0
- {pandas_survey_toolkit-1.0.4.dist-info → pandas_survey_toolkit-1.0.9.dist-info}/WHEEL +1 -1
- {pandas_survey_toolkit-1.0.4.dist-info → pandas_survey_toolkit-1.0.9.dist-info}/licenses/LICENSE +21 -21
- pandas_survey_toolkit-1.0.4.dist-info/RECORD +0 -10
- {pandas_survey_toolkit-1.0.4.dist-info → pandas_survey_toolkit-1.0.9.dist-info}/top_level.txt +0 -0
pandas_survey_toolkit/utils.py
CHANGED
@@ -1,89 +1,121 @@
|
|
1
|
-
from typing import List, Union
|
2
|
-
|
3
|
-
import numpy as np
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
"""
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
"""
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
1
|
+
from typing import List, Tuple, Union
|
2
|
+
|
3
|
+
import numpy as np
|
4
|
+
import pandas as pd
|
5
|
+
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
|
6
|
+
|
7
|
+
|
8
|
+
def create_masked_df(df: pd.DataFrame, input_columns: Union[List[str], str]) -> Tuple[pd.DataFrame, pd.Series]:
    """Create a masked DataFrame excluding rows with NaN values in specified columns.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.
    input_columns : Union[List[str], str]
        List of column names, or the name of a single column, to check for
        NaN values.

    Returns
    -------
    Tuple[pd.DataFrame, pd.Series]
        A tuple containing:

        - masked_df : pd.DataFrame
            Copy of the rows of ``df`` that have no NaN in any of
            ``input_columns``.
        - mask : pd.Series
            Boolean mask, indexed like ``df``, that is True for the rows
            that were kept.
    """
    # A bare string would select a Series, and Series.all(axis=1) fails
    # because a Series has no axis 1. Wrapping it keeps the selection a
    # DataFrame so the row-wise reduction below works for one column too.
    if isinstance(input_columns, str):
        input_columns = [input_columns]

    mask = df[input_columns].notna().all(axis=1)
    # Copy so that later edits to the masked frame never touch ``df``.
    masked_df = df[mask].copy()
    return masked_df, mask
|
32
|
+
|
33
|
+
def combine_results(original_df: pd.DataFrame, result_df: pd.DataFrame,
                    mask: pd.Series, output_columns: Union[List[str], str]) -> pd.DataFrame:
    """Write selected result columns back into a copy of the original DataFrame.

    Parameters
    ----------
    original_df : pd.DataFrame
        The original input DataFrame. It is copied, not modified.
    result_df : pd.DataFrame
        The DataFrame holding the computed values to merge in.
    mask : pd.Series
        Boolean mask selecting the rows to assign.
    output_columns : Union[List[str], str]
        Column name, or list of column names, to copy from ``result_df``.

    Returns
    -------
    pd.DataFrame
        A copy of ``original_df`` in which the rows selected by ``mask`` have
        been assigned the ``output_columns`` values taken from ``result_df``.
    """
    # A bare column name is wrapped so that .loc always receives a list.
    target_columns = [output_columns] if isinstance(output_columns, str) else output_columns

    # Work on a copy so the caller's DataFrame is left untouched.
    merged = original_df.copy()
    merged.loc[mask, target_columns] = result_df.loc[mask, target_columns]
    return merged
|
62
|
+
|
63
|
+
def apply_vectorizer(df: pd.DataFrame, input_column: str,
                     vectorizer_name: str = 'TfidfVectorizer',
                     feature_prefix: str = 'vect_features_',
                     **vectorizer_kwargs) -> Tuple[pd.DataFrame, object, np.ndarray]:
    """Fit a text vectorizer on one column and return its features as a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.
    input_column : str
        Name of the column containing the text to vectorize. Missing values
        are replaced with empty strings before fitting.
    vectorizer_name : str, optional
        Which vectorizer to build: 'CountVectorizer' or 'TfidfVectorizer'.
        Default is 'TfidfVectorizer'.
    feature_prefix : str, optional
        String prepended to every feature name to form the output column
        names. Default is 'vect_features_'.
    **vectorizer_kwargs
        Extra keyword arguments forwarded to the vectorizer constructor.

    Returns
    -------
    Tuple[pd.DataFrame, object, np.ndarray]
        A tuple containing:

        - feature_df : pd.DataFrame
            The vectorized features, indexed like ``df``, with one column
            per feature named ``feature_prefix + feature_name``.
        - vectorizer : object
            The fitted vectorizer instance.
        - feature_names : np.ndarray
            The feature names as returned by
            ``vectorizer.get_feature_names_out()``.

    Raises
    ------
    ValueError
        If ``vectorizer_name`` is not one of the supported names.
    """
    # Guard clause: reject unknown names before building anything.
    if vectorizer_name not in ('CountVectorizer', 'TfidfVectorizer'):
        raise ValueError("Unsupported vectorizer. Use 'CountVectorizer' or 'TfidfVectorizer'.")

    vectorizer_cls = CountVectorizer if vectorizer_name == 'CountVectorizer' else TfidfVectorizer
    vectorizer = vectorizer_cls(**vectorizer_kwargs)

    # Missing entries become empty strings before the column is vectorized.
    texts = df[input_column].fillna('')
    feature_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()

    # Convert the matrix to a dense array and label each column with the prefix.
    prefixed_columns = [f"{feature_prefix}{name}" for name in feature_names]
    feature_df = pd.DataFrame(
        feature_matrix.toarray(),
        columns=prefixed_columns,
        index=df.index,
    )
    return feature_df, vectorizer, feature_names
|