pandas-survey-toolkit 1.0.3__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,89 +1,121 @@
1
- from typing import List, Union
2
- import pandas as pd
3
- import numpy as np
4
-
5
- def create_masked_df(df, input_columns):
6
- """
7
- Create a masked DataFrame excluding rows with NaN values in specified columns.
8
-
9
- Parameters:
10
- df (pandas.DataFrame): The input DataFrame.
11
- input_columns (list): List of column names to check for NaN values.
12
-
13
- Returns:
14
- tuple: (masked_df, mask)
15
- masked_df (pandas.DataFrame): DataFrame with NaN rows removed.
16
- mask (pandas.Series): Boolean mask indicating non-NaN rows.
17
- """
18
- mask = df[input_columns].notna().all(axis=1)
19
- masked_df = df[mask].copy()
20
- return masked_df, mask
21
-
22
- def combine_results(original_df, result_df, mask, output_columns:Union[List[str], str]):
23
- """
24
- Combine the results from a function applied to a masked DataFrame
25
- back into the original DataFrame.
26
-
27
- Parameters:
28
- original_df (pandas.DataFrame): The original input DataFrame.
29
- result_df (pandas.DataFrame): The DataFrame with results to be combined.
30
- mask (pandas.Series): Boolean mask indicating which rows to update.
31
- output_columns (list or str): List of column names (or name of single column) for the output.
32
-
33
- Returns:
34
- pandas.DataFrame: The original DataFrame updated with new results.
35
- """
36
-
37
- df = original_df.copy()
38
-
39
- if isinstance(output_columns, str):
40
- output_columns = [output_columns]
41
-
42
- df.loc[mask, output_columns] = result_df.loc[mask, output_columns]
43
-
44
- return df
45
-
46
- from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
47
- import pandas as pd
48
- from typing import Tuple
49
-
50
- def apply_vectorizer(df: pd.DataFrame, input_column: str, vectorizer_name: str = 'TfidfVectorizer', feature_prefix: str = 'vect_features_', **vectorizer_kwargs) -> Tuple[pd.DataFrame, object, np.ndarray]:
51
- """
52
- Apply a vectorizer to a text column in a DataFrame and return a new DataFrame with the vectorized features,
53
- along with the fitted vectorizer and feature names.
54
-
55
- Parameters:
56
- df (pandas.DataFrame): The input DataFrame.
57
- input_column (str): Name of the column containing text to vectorize.
58
- vectorizer_name (str): Name of the vectorizer to use ('CountVectorizer' or 'TfidfVectorizer'). Default is 'TfidfVectorizer'.
59
- feature_prefix (str): Prefix for the feature column names. Default is 'vect_features_'.
60
- **vectorizer_kwargs: Additional keyword arguments to pass to the vectorizer.
61
-
62
- Returns:
63
- Tuple[pandas.DataFrame, object, np.ndarray]:
64
- - A new DataFrame containing the vectorized features
65
- - The fitted vectorizer object
66
- - An array of feature names
67
- """
68
- # Select the appropriate vectorizer
69
- if vectorizer_name == 'CountVectorizer':
70
- vectorizer = CountVectorizer(**vectorizer_kwargs)
71
- elif vectorizer_name == 'TfidfVectorizer':
72
- vectorizer = TfidfVectorizer(**vectorizer_kwargs)
73
- else:
74
- raise ValueError("Unsupported vectorizer. Use 'CountVectorizer' or 'TfidfVectorizer'.")
75
-
76
- # Fit and transform the input text
77
- feature_matrix = vectorizer.fit_transform(df[input_column].fillna(''))
78
-
79
- # Get feature names
80
- feature_names = vectorizer.get_feature_names_out()
81
-
82
- # Create a DataFrame with the vectorized features
83
- feature_df = pd.DataFrame(
84
- feature_matrix.toarray(),
85
- columns=[f"{feature_prefix}{name}" for name in feature_names],
86
- index=df.index
87
- )
88
-
1
+ from typing import List, Tuple, Union
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
6
+
7
+
8
def create_masked_df(df: pd.DataFrame,
                     input_columns: Union[List[str], str]) -> Tuple[pd.DataFrame, pd.Series]:
    """Create a masked DataFrame excluding rows with NaN values in specified columns.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame. It is not modified.
    input_columns : Union[List[str], str]
        List of column names, or the name of a single column, to check for
        NaN values.

    Returns
    -------
    Tuple[pd.DataFrame, pd.Series]
        A tuple containing:

        - masked_df : pd.DataFrame
            Copy of the rows of ``df`` that have no NaN in any of the
            checked columns.
        - mask : pd.Series
            Boolean mask over the rows of ``df``; True marks rows that have
            no NaN in the checked columns.
    """
    # A bare column name would make ``df[input_columns]`` a Series, on which
    # ``.all(axis=1)`` raises; wrap it so a single name behaves like a
    # one-element list.
    if isinstance(input_columns, str):
        input_columns = [input_columns]

    mask = df[input_columns].notna().all(axis=1)
    masked_df = df[mask].copy()
    return masked_df, mask
32
+
33
def combine_results(original_df: pd.DataFrame, result_df: pd.DataFrame,
                    mask: pd.Series, output_columns: Union[List[str], str]) -> pd.DataFrame:
    """Write results computed on a masked DataFrame back into a copy of the original.

    Parameters
    ----------
    original_df : pd.DataFrame
        The original input DataFrame. It is copied, not modified.
    result_df : pd.DataFrame
        The DataFrame holding the results to be combined.
    mask : pd.Series
        Boolean mask selecting the rows to update.
    output_columns : Union[List[str], str]
        Column names, or the name of a single column, to copy across.

    Returns
    -------
    pd.DataFrame
        A copy of ``original_df`` whose masked rows carry the values from
        ``result_df`` in ``output_columns``.
    """
    # Normalise a single column name to a one-element list.
    columns = [output_columns] if isinstance(output_columns, str) else output_columns

    updated = original_df.copy()
    updated.loc[mask, columns] = result_df.loc[mask, columns]
    return updated
62
+
63
def apply_vectorizer(df: pd.DataFrame, input_column: str,
                     vectorizer_name: str = 'TfidfVectorizer',
                     feature_prefix: str = 'vect_features_',
                     **vectorizer_kwargs) -> Tuple[pd.DataFrame, object, np.ndarray]:
    """Vectorize a text column and return the features as a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.
    input_column : str
        Name of the column containing the text to vectorize. Missing values
        are replaced with empty strings before fitting.
    vectorizer_name : str, optional
        Which vectorizer to use: 'CountVectorizer' or 'TfidfVectorizer'.
        Default is 'TfidfVectorizer'.
    feature_prefix : str, optional
        Prefix prepended to every feature column name.
        Default is 'vect_features_'.
    **vectorizer_kwargs
        Extra keyword arguments forwarded to the vectorizer constructor.

    Returns
    -------
    Tuple[pd.DataFrame, object, np.ndarray]
        A tuple containing:

        - feature_df : pd.DataFrame
            Dense feature matrix indexed like ``df``, one prefixed column
            per feature.
        - vectorizer : object
            The fitted vectorizer object.
        - feature_names : np.ndarray
            The feature names as reported by the vectorizer (unprefixed).

    Raises
    ------
    ValueError
        If an unsupported vectorizer name is provided.
    """
    # Reject unknown names up front, before anything is constructed.
    if vectorizer_name not in ('CountVectorizer', 'TfidfVectorizer'):
        raise ValueError("Unsupported vectorizer. Use 'CountVectorizer' or 'TfidfVectorizer'.")

    vectorizer_cls = CountVectorizer if vectorizer_name == 'CountVectorizer' else TfidfVectorizer
    vectorizer = vectorizer_cls(**vectorizer_kwargs)

    # Fit on the text column with missing entries treated as empty documents.
    texts = df[input_column].fillna('')
    feature_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()

    # Build the output frame on the caller's index so rows stay aligned.
    prefixed_columns = [f"{feature_prefix}{name}" for name in feature_names]
    feature_df = pd.DataFrame(
        feature_matrix.toarray(),
        columns=prefixed_columns,
        index=df.index,
    )

    return feature_df, vectorizer, feature_names