pandas-survey-toolkit 1.0.3 → 1.0.9 (py3-none-any.whl)

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.
@@ -1,824 +1,997 @@
1
- import re
2
- import warnings
3
- from collections import defaultdict
4
- from typing import List, Tuple, Union
5
-
6
- import numpy as np
7
- import pandas as pd
8
- import pandas_flavor as pf
9
- import spacy
10
- from gensim.parsing.preprocessing import (
11
- remove_stopwords,
12
- strip_multiple_whitespaces,
13
- strip_numeric,
14
- strip_tags,
15
- )
16
- from scipy.special import softmax
17
- from sentence_transformers import SentenceTransformer
18
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
19
-
20
- from pandas_survey_toolkit.analytics import fit_cluster_hdbscan, fit_umap
21
- from pandas_survey_toolkit.utils import (
22
- apply_vectorizer,
23
- combine_results,
24
- create_masked_df,
25
- )
26
-
27
-
28
- @pf.register_dataframe_method
29
- def cluster_questions(
30
- df,
31
- columns=None,
32
- pattern=None,
33
- likert_mapping=None,
34
- umap_n_neighbors=15,
35
- umap_min_dist=0.1,
36
- hdbscan_min_cluster_size=20,
37
- hdbscan_min_samples=None,
38
- cluster_selection_epsilon=0.4,
39
- ):
40
- """
41
- Cluster Likert scale questions based on response patterns.
42
-
43
- Parameters:
44
- df (pandas.DataFrame): The input DataFrame.
45
- columns (list): List of column names to cluster. If None, all columns matching the pattern will be used.
46
- pattern (str): Regex pattern to match column names. Used if columns is None.
47
- likert_mapping (dict): Custom mapping for Likert scale responses. If None, default mapping is used.
48
- umap_n_neighbors (int): The size of local neighborhood for UMAP. Default is 15.
49
- umap_min_dist (float): The minimum distance between points in UMAP. Default is 0.1.
50
- umap_n_components (int): The number of dimensions for UMAP output. Default is 2.
51
- hdbscan_min_cluster_size (int): The minimum size of clusters for HDBSCAN. Default is 5.
52
- hdbscan_min_samples (int): The number of samples in a neighborhood for a core point in HDBSCAN. Default is None.
53
- cluster_selection_epsilon (float): A distance threshold. Clusters below this value will be merged. Default is 0.0. higher epslion = fewer, larger clusters
54
-
55
- Returns:
56
- pandas.DataFrame: The input DataFrame with additional columns for encoded Likert responses, UMAP coordinates, and cluster IDs.
57
- """
58
-
59
- # Select columns
60
- if columns is None and pattern is None:
61
- raise ValueError("Either 'columns' or 'pattern' must be provided.")
62
- elif columns is None:
63
- columns = df.filter(regex=pattern).columns.tolist()
64
-
65
- # Encode Likert scales
66
- df = df.encode_likert(columns, custom_mapping=likert_mapping)
67
- encoded_columns = [f"likert_encoded_{col}" for col in columns]
68
-
69
- # Apply UMAP
70
- df = df.fit_umap(
71
- input_columns=encoded_columns,
72
- output_columns=["likert_umap_x", "likert_umap_y"],
73
- n_neighbors=umap_n_neighbors,
74
- min_dist=umap_min_dist,
75
- metric="cosine",
76
- )
77
-
78
- # Apply HDBSCAN
79
- df = df.fit_cluster_hdbscan(
80
- input_columns=["likert_umap_x", "likert_umap_y"],
81
- output_columns=["question_cluster_id", "question_cluster_probability"],
82
- min_cluster_size=hdbscan_min_cluster_size,
83
- min_samples=hdbscan_min_samples,
84
- cluster_selection_epsilon=cluster_selection_epsilon,
85
- )
86
-
87
- return df
88
-
89
-
90
- @pf.register_dataframe_method
91
- def encode_likert(
92
- df, likert_columns, output_prefix="likert_encoded_", custom_mapping=None, debug=True
93
- ):
94
- """
95
- Encode Likert scale responses to numeric values.
96
-
97
- Parameters:
98
- df (pandas.DataFrame): The input DataFrame.
99
- likert_columns (list): List of column names containing Likert scale responses.
100
- output_prefix (str): Prefix for the new encoded columns. Default is 'likert_encoded_'.
101
- custom_mapping (dict): Optional custom mapping for Likert scale responses.
102
- debug (bool): Prints out the mappings
103
-
104
- Returns:
105
- pandas.DataFrame: The input DataFrame with additional columns for encoded Likert responses.
106
- """
107
-
108
- def default_mapping(response):
109
- if pd.isna(response):
110
- return pd.NA
111
- response = str(response).lower().strip()
112
-
113
- # Neutral / Neither / Unsure / Don't know (0)
114
- if re.search(r"\b(neutral|neither|unsure|know)\b", response) or re.search(
115
- r"neither\s+agree\s+nor\s+disagree", response
116
- ):
117
- return 0
118
-
119
- # Disagree / Dissatisfied (-1)
120
- if re.search(r"\b(disagree)\b", response) or re.search(
121
- r"\b(dis|not|no)[-]{0,1}\s*(agree|satisf)", response
122
- ):
123
- return -1
124
-
125
- # Agree / Satisfied (1)
126
- if re.search(r"\bagree\b", response) or re.search(r"satisf", response):
127
- return 1
128
-
129
- # Unable to classify
130
- return None
131
-
132
- conversion_summary = defaultdict(int)
133
- unconverted_phrases = set()
134
-
135
- if custom_mapping is None:
136
- mapping_func = default_mapping
137
- if debug:
138
- print("Using default mapping:")
139
- print("-1: Phrases containing 'disagree', 'do not agree', etc.")
140
- print(" 0: Phrases containing 'neutral', 'neither', 'unsure', etc.")
141
- print("+1: Phrases containing 'agree' (but not 'disagree' or 'not agree')")
142
- print("NaN: NaN values are preserved")
143
- else:
144
-
145
- def mapping_func(response):
146
- if pd.isna(response):
147
- return pd.NA
148
- converted = custom_mapping.get(str(response).lower().strip())
149
- if converted is None:
150
- unconverted_phrases.add(str(response))
151
- return pd.NA
152
- return converted
153
-
154
- if debug:
155
- print("Using custom mapping:", custom_mapping)
156
- print("NaN: NaN values are preserved")
157
-
158
- for column in likert_columns:
159
- output_column = f"{output_prefix}{column}"
160
- df[output_column] = df[column].apply(lambda x: mapping_func(x))
161
-
162
- # Update conversion summary
163
- for original, converted in zip(df[column], df[output_column]):
164
- conversion_summary[f"{original} -> {converted}"] += 1
165
-
166
- if debug:
167
- for conversion, count in conversion_summary.items():
168
- print(f" {conversion}: {count} times")
169
-
170
- # Alert about unconverted phrases
171
- if unconverted_phrases:
172
- warnings.warn(
173
- f"The following phrases were not converted (mapped to NaN): {', '.join(unconverted_phrases)}"
174
- )
175
-
176
- # Alert if default mapping didn't convert everything
177
- if custom_mapping is None:
178
- all_responses = set()
179
- for column in likert_columns:
180
- all_responses.update(df[column].dropna().unique())
181
- unconverted = [
182
- resp for resp in all_responses if default_mapping(resp) not in [-1, 0, 1]
183
- ]
184
- if unconverted:
185
- warnings.warn(
186
- f"The default mapping didn't convert the following responses: {', '.join(unconverted)}"
187
- )
188
-
189
- return df
190
-
191
-
192
- @pf.register_dataframe_method
193
- def extract_keywords(
194
- df: pd.DataFrame,
195
- input_column: str,
196
- output_column: str = "keywords",
197
- preprocessed_column: str = "preprocessed_text",
198
- spacy_column: str = "spacy_output",
199
- lemma_column: str = "lemmatized_text",
200
- top_n: int = 3,
201
- threshold: float = 0.4,
202
- ngram_range: Tuple[int, int] = (1, 1),
203
- min_df: int = 5,
204
- min_count: int = None,
205
- min_proportion_with_keywords: float = 0.95,
206
- **kwargs,
207
- ) -> pd.DataFrame:
208
- """
209
- Apply a pipeline of text preprocessing, spaCy processing, lemmatization, and TF-IDF
210
- to extract keywords from the specified column.
211
-
212
- Parameters:
213
- df (pandas.DataFrame): The input DataFrame.
214
- input_column (str): Name of the column containing text to process.
215
- output_column (str): Name of the column to store the extracted keywords. Default is 'keywords'.
216
- preprocessed_column (str): Name of the column to store preprocessed text. Default is 'preprocessed_text'.
217
- spacy_column (str): Name of the column to store spaCy output. Default is 'spacy_output'.
218
- lemma_column (str): Name of the column to store lemmatized text. Default is 'lemmatized_text'.
219
- top_n (int): Number of top keywords to extract for each document. Default is 3.
220
- threshold (float): Minimum TF-IDF score for a keyword to be included. Default is 0.0.
221
- ngram_range (tuple): The lower and upper boundary of the range of n-values for different n-grams to be extracted.
222
- Default is (1, 1) which means only unigrams.
223
- **kwargs: Additional keyword arguments to pass to the preprocessing, spaCy, lemmatization, or TF-IDF functions.
224
-
225
- Returns:
226
- pandas.DataFrame: The input DataFrame with additional columns for preprocessed text,
227
- spaCy output, lemmatized text, and extracted keywords.
228
- """
229
-
230
- df_temp = df.copy()
231
- # Step 1: Preprocess text
232
- df_temp = df_temp.preprocess_text(
233
- input_column=input_column,
234
- output_column=preprocessed_column,
235
- **kwargs.get("preprocess_kwargs", {}),
236
- )
237
-
238
- df_temp = df_temp.remove_short_comments(
239
- input_column=input_column, min_comment_length=5
240
- )
241
-
242
- # Step 2: Apply spaCy
243
- df_temp = df_temp.fit_spacy(
244
- input_column=preprocessed_column, output_column=spacy_column
245
- )
246
-
247
- # Step 3: Get lemmatized text
248
- df_temp = df_temp.get_lemma(
249
- input_column=spacy_column,
250
- output_column=lemma_column,
251
- **kwargs.get("lemma_kwargs", {}),
252
- )
253
-
254
- # Step 4: Apply TF-IDF and extract keywords
255
- df_temp = df_temp.fit_tfidf(
256
- input_column=lemma_column,
257
- output_column=output_column,
258
- top_n=top_n,
259
- threshold=threshold,
260
- ngram_range=ngram_range,
261
- min_df=min_df,
262
- **kwargs.get("tfidf_kwargs", {}),
263
- )
264
-
265
- df_temp = df_temp.refine_keywords(
266
- keyword_column=output_column,
267
- text_column=lemma_column,
268
- min_proportion=min_proportion_with_keywords,
269
- output_column="refined_keywords",
270
- min_count=min_count,
271
- )
272
-
273
- return df_temp
274
-
275
-
276
- @pf.register_dataframe_method
277
- def refine_keywords(
278
- df: pd.DataFrame,
279
- keyword_column: str = "keywords",
280
- text_column: str = "lemmatized_text",
281
- min_count: Union[int, None] = None,
282
- min_proportion: float = 0.95,
283
- output_column: str = None,
284
- debug: bool = True,
285
- ) -> pd.DataFrame:
286
- """
287
- Refine keywords by replacing rare keywords with more common ones based on the text content.
288
-
289
- Parameters:
290
- df (pd.DataFrame): The input DataFrame.
291
- keyword_column (str): Name of the column containing keyword lists.
292
- text_column (str): Name of the column containing the original text.
293
- min_count (int, optional): Minimum count for a keyword to be considered common. If None, it will be determined automatically.
294
- min_proportion (float): Minimum proportion of rows that should have keywords after refinement. Used only if min_count is None. Default is 0.95.
295
- output_column (str): Column name for the refined keyword output. If it is None, then the keyword_column is over-written.
296
- debug (bool): If True, print detailed statistics about the refinement process. Default is True.
297
-
298
- Returns:
299
- pd.DataFrame: The input DataFrame with refined keywords.
300
- """
301
- if output_column is None:
302
- output_column = keyword_column
303
-
304
- # Create masked DataFrame
305
- masked_df, mask = create_masked_df(df, [keyword_column, text_column])
306
-
307
- # Step 1 & 2: Collect all keywords and count them
308
- all_keywords = [
309
- keyword
310
- for keywords in masked_df[keyword_column]
311
- if isinstance(keywords, list)
312
- for keyword in keywords
313
- ]
314
- keyword_counts = pd.Series(all_keywords).value_counts()
315
-
316
- def refine_row_keywords(row, common_keywords):
317
- if pd.isna(row[text_column]) or not isinstance(row[keyword_column], list):
318
- return []
319
-
320
- text = str(row[text_column]).lower()
321
- current_keywords = row[keyword_column]
322
- refined_keywords = []
323
-
324
- for keyword in current_keywords:
325
- if keyword in common_keywords:
326
- refined_keywords.append(keyword)
327
- else:
328
- # Find a replacement from common keywords
329
- for common_keyword in sorted(
330
- common_keywords, key=lambda k: (-keyword_counts[k], len(k))
331
- ):
332
- if (
333
- common_keyword in text
334
- and common_keyword not in refined_keywords
335
- ):
336
- refined_keywords.append(common_keyword)
337
- break
338
-
339
- # Ensure correct ordering based on appearance in the original text
340
- return (
341
- sorted(refined_keywords, key=lambda k: text.index(k))
342
- if refined_keywords
343
- else []
344
- )
345
-
346
- if min_count is None:
347
- # Determine min_count automatically
348
- def get_proportion_with_keywords(count):
349
- common_keywords = set(keyword_counts[keyword_counts >= count].index)
350
- refined_keywords = masked_df.apply(
351
- lambda row: refine_row_keywords(row, common_keywords), axis=1
352
- )
353
- return (refined_keywords.str.len() > 0).mean()
354
-
355
- min_count = 1
356
- while get_proportion_with_keywords(min_count) > min_proportion:
357
- min_count += 1
358
- min_count -= 1 # Go back one step to ensure we're above the min_proportion
359
-
360
- # Separate common and rare keywords
361
- common_keywords = set(keyword_counts[keyword_counts >= min_count].index)
362
-
363
- # Apply the refinement to each row
364
- masked_df[output_column] = masked_df.apply(
365
- lambda row: refine_row_keywords(row, common_keywords), axis=1
366
- )
367
-
368
- # Combine results
369
- df_to_return = combine_results(df, masked_df, mask, [output_column])
370
-
371
- if debug:
372
- # Calculate statistics
373
- original_keyword_count = masked_df[keyword_column].apply(
374
- lambda x: len(x) if isinstance(x, list) else 0
375
- )
376
- refined_keyword_count = masked_df[output_column].apply(len)
377
-
378
- original_unique_keywords = set(
379
- keyword
380
- for keywords in masked_df[keyword_column]
381
- if isinstance(keywords, list)
382
- for keyword in keywords
383
- )
384
- refined_unique_keywords = set(
385
- keyword for keywords in masked_df[output_column] for keyword in keywords
386
- )
387
-
388
- print(f"Refinement complete. Min count used: {min_count}")
389
- print(f"Original average keywords per row: {original_keyword_count.mean():.2f}")
390
- print(f"Refined average keywords per row: {refined_keyword_count.mean():.2f}")
391
- print(
392
- f"Proportion of rows with keywords after refinement: {(refined_keyword_count > 0).mean():.2%}"
393
- )
394
- print(
395
- f"Total unique keywords before refinement: {len(original_unique_keywords)}"
396
- )
397
- print(f"Total unique keywords after refinement: {len(refined_unique_keywords)}")
398
- print(
399
- f"Reduction in unique keywords: {(1 - len(refined_unique_keywords) / len(original_unique_keywords)):.2%}"
400
- )
401
-
402
- return df_to_return
403
-
404
-
405
- @pf.register_dataframe_method
406
- def remove_short_comments(
407
- df: pd.DataFrame, input_column: str, min_comment_length: int = 5
408
- ) -> pd.DataFrame:
409
- """
410
- Replace comments shorter than the specified minimum length with NaN.
411
-
412
- Parameters:
413
- df (pandas.DataFrame): The input DataFrame.
414
- input_column (str): Name of the column containing text to process.
415
- min_comment_length (int): Minimum length of comment to keep. Default is 5.
416
-
417
- Returns:
418
- pandas.DataFrame: The input DataFrame with short comments replaced by NaN.
419
- """
420
- # Create a copy of the DataFrame to avoid modifying the original
421
- df_copy = df.copy()
422
-
423
- # Replace short comments with NaN
424
- df_copy[input_column] = df_copy[input_column].apply(
425
- lambda x: x if isinstance(x, str) and len(x) >= min_comment_length else np.nan
426
- )
427
-
428
- return df_copy
429
-
430
-
431
- @pf.register_dataframe_method
432
- def fit_sentence_transformer(
433
- df,
434
- input_column: str,
435
- model_name="all-MiniLM-L6-v2",
436
- output_column="sentence_embedding",
437
- ):
438
- """Adds a list of vector embeddings for each string in the input column. These can then be used for downstream
439
- tasks like clustering"""
440
- # Initialize the sentence transformer model
441
- masked_df, mask = create_masked_df(df, [input_column])
442
- model = SentenceTransformer(model_name)
443
-
444
- # Create sentence embeddings
445
- embeddings = model.encode(masked_df[input_column].tolist())
446
-
447
- # Convert embeddings to a list of numpy arrays
448
- embeddings_list = [embedding for embedding in embeddings]
449
-
450
- # Add the embeddings as a new column in the dataframe
451
- masked_df[output_column] = embeddings_list
452
- df_to_return = combine_results(df, masked_df, mask, output_column)
453
-
454
- return df_to_return
455
-
456
-
457
- @pf.register_dataframe_method
458
- def extract_sentiment(
459
- df,
460
- input_column: str,
461
- output_columns=["positive", "neutral", "negative", "sentiment"],
462
- ):
463
- """
464
- Extract sentiment from text using the cardiffnlp/twitter-roberta-base-sentiment model.
465
-
466
- Parameters:
467
- df (pandas.DataFrame): The input DataFrame.
468
- input_column (str): Name of the column containing text to analyze.
469
- output_columns (list): List of column names for the output. Default is ["positive", "neutral", "negative", "sentiment"].
470
-
471
- Returns:
472
- pandas.DataFrame: The input DataFrame with additional columns for sentiment scores and labels.
473
- """
474
- MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
475
- tokenizer = AutoTokenizer.from_pretrained(MODEL)
476
- model = AutoModelForSequenceClassification.from_pretrained(MODEL)
477
-
478
- masked_df, mask = create_masked_df(df, [input_column])
479
-
480
- def analyze_sentiment(text):
481
- encoded_input = tokenizer(
482
- text, return_tensors="pt", truncation=True, max_length=512, padding=True
483
- )
484
- output = model(**encoded_input)
485
- scores = output.logits[0].detach().numpy()
486
- scores = softmax(scores)
487
- return scores
488
-
489
- sentiment_scores = masked_df[input_column].apply(analyze_sentiment)
490
-
491
- masked_df[output_columns[0]] = sentiment_scores.apply(lambda x: x[2]) # Positive
492
- masked_df[output_columns[1]] = sentiment_scores.apply(lambda x: x[1]) # Neutral
493
- masked_df[output_columns[2]] = sentiment_scores.apply(lambda x: x[0]) # Negative
494
-
495
- masked_df[output_columns[3]] = masked_df[
496
- [output_columns[0], output_columns[1], output_columns[2]]
497
- ].idxmax(axis=1)
498
- masked_df[output_columns[3]] = masked_df[output_columns[3]].map(
499
- {
500
- output_columns[0]: "positive",
501
- output_columns[1]: "neutral",
502
- output_columns[2]: "negative",
503
- }
504
- )
505
-
506
- df_to_return = combine_results(df, masked_df, mask, output_columns)
507
- return df_to_return
508
-
509
-
510
- @pf.register_dataframe_method
511
- def cluster_comments(
512
- df: pd.DataFrame,
513
- input_column: str,
514
- output_columns: str = ["cluster", "cluster_probability"],
515
- min_cluster_size=5,
516
- cluster_selection_epsilon: float = 0.2,
517
- n_neighbors: int = 15,
518
- ):
519
- """applies a pipeline of 1) vector embeddings 2) dimensional reduction 3) clustering
520
- to assign each row a cluster ID so that similar free text comments (found in the input_column) can be grouped together.
521
- Returns a modified dataframe. If you want control over parameters for the various functions,
522
- then apply them separately. The defaults should be OK in most cases."""
523
-
524
- df_temp = (
525
- df.fit_sentence_transformer(
526
- input_column=input_column, output_column="sentence_embedding"
527
- )
528
- .fit_umap(
529
- input_columns="sentence_embedding",
530
- embeddings_in_list=True,
531
- n_neighbors=n_neighbors,
532
- )
533
- .fit_cluster_hdbscan(
534
- output_columns=output_columns,
535
- min_cluster_size=min_cluster_size,
536
- cluster_selection_epsilon=cluster_selection_epsilon,
537
- )
538
- )
539
-
540
- return df_temp
541
-
542
-
543
- @pf.register_dataframe_method
544
- def fit_tfidf(
545
- df: pd.DataFrame,
546
- input_column: str,
547
- output_column: str = "keywords",
548
- top_n: int = 3,
549
- threshold: float = 0.6,
550
- append_features: bool = False,
551
- ngram_range: Tuple[int, int] = (1, 1),
552
- **tfidf_kwargs,
553
- ) -> pd.DataFrame:
554
- """
555
- Apply TF-IDF vectorization to a text column and extract top N keywords for each document,
556
- while preserving NaN values in the original DataFrame. Supports n-gram extraction.
557
-
558
- Parameters:
559
- df (pandas.DataFrame): The input DataFrame.
560
- input_column (str): Name of the column containing text to vectorize.
561
- output_column (str): Name of the column to store the extracted keywords. Default is 'keywords'.
562
- top_n (int): Number of top keywords to extract for each document. Default is 5.
563
- threshold (float): Minimum TF-IDF score for a keyword to be included. Default is 0.0.
564
- append_features (bool): If True, append all TF-IDF features to the DataFrame. Default is False.
565
- ngram_range (tuple): The lower and upper boundary of the range of n-values for different n-grams to be extracted.
566
- Default is (1, 1) which means only unigrams. Set to (1, 2) for unigrams and bigrams, and so on.
567
- **tfidf_kwargs: Additional keyword arguments to pass to TfidfVectorizer.
568
-
569
- Returns:
570
- pandas.DataFrame: The input DataFrame with an additional column containing the top keywords.
571
- """
572
- # Create a masked DataFrame
573
- masked_df, mask = create_masked_df(df, [input_column])
574
-
575
- # Ensure ngram_range is included in the TfidfVectorizer parameters
576
- tfidf_kwargs["ngram_range"] = ngram_range
577
- # Inside fit_tfidf function
578
- tfidf_kwargs["min_df"] = tfidf_kwargs.get("min_df", 1)
579
-
580
- # Apply TF-IDF vectorization to the masked DataFrame
581
- tfidf_features, _, feature_names = apply_vectorizer(
582
- masked_df, input_column, vectorizer_name="TfidfVectorizer", **tfidf_kwargs
583
- )
584
-
585
- def extract_top_keywords(row: pd.Series) -> List[str]:
586
- # Get indices of top N TF-IDF scores
587
- top_indices = row.nlargest(top_n).index
588
-
589
- # Get the original text for this row
590
- original_text = masked_df.loc[row.name, input_column].lower()
591
-
592
- # Filter based on threshold, presence in original text, and get the corresponding feature names
593
- top_keywords = [
594
- feature_names[i]
595
- for i, idx in enumerate(tfidf_features.columns)
596
- if idx in top_indices
597
- and row[idx] >= threshold
598
- and feature_names[i].lower() in original_text
599
- ]
600
-
601
- # Sort keywords based on their order in the original text
602
- return sorted(top_keywords, key=lambda x: original_text.index(x.lower()))
603
-
604
- # Extract top keywords for each document
605
- masked_df[output_column] = tfidf_features.apply(extract_top_keywords, axis=1)
606
-
607
- # Combine the results back into the original DataFrame
608
- result_df = combine_results(df, masked_df, mask, [output_column])
609
-
610
- # Optionally append all TF-IDF features
611
- if append_features:
612
- # We need to handle NaN values in the features as well
613
- feature_columns = tfidf_features.columns.tolist()
614
- masked_df = pd.concat([masked_df, tfidf_features], axis=1)
615
- result_df = combine_results(result_df, masked_df, mask, feature_columns)
616
-
617
- return result_df
618
-
619
-
620
- @pf.register_dataframe_method
621
- def fit_spacy(df, input_column: str, output_column: str = "spacy_output"):
622
- """
623
- Apply the en_core_web_md spaCy model to the specified column of the DataFrame.
624
-
625
- Parameters:
626
- df (pandas.DataFrame): The input DataFrame.
627
- input_column (str): Name of the column containing text to analyze.
628
- output_column (str): Name of the output column. Default is "spacy_output".
629
-
630
- Returns:
631
- pandas.DataFrame: The input DataFrame with an additional column containing spaCy doc objects.
632
- """
633
- # Check if the model is downloaded, if not, download it
634
- try:
635
- nlp = spacy.load("en_core_web_md")
636
- except OSError:
637
- print("Downloading en_core_web_md model...")
638
- spacy.cli.download("en_core_web_md")
639
- nlp = spacy.load("en_core_web_md")
640
-
641
- # Create masked DataFrame
642
- masked_df, mask = create_masked_df(df, [input_column])
643
-
644
- # Apply spaCy model
645
- masked_df[output_column] = masked_df[input_column].apply(nlp)
646
-
647
- # Combine results
648
- df_to_return = combine_results(df, masked_df, mask, output_column)
649
-
650
- return df_to_return
651
-
652
-
653
- @pf.register_dataframe_method
654
- def get_lemma(
655
- df: pd.DataFrame,
656
- input_column: str = "spacy_output",
657
- output_column: str = "lemmatized_text",
658
- text_pos: List[str] = ["PRON"],
659
- remove_punct: bool = True,
660
- remove_space: bool = True,
661
- remove_stop: bool = True,
662
- keep_tokens: Union[List[str], None] = None,
663
- keep_pos: Union[List[str], None] = None,
664
- keep_dep: Union[List[str], None] = ["neg"],
665
- join_tokens: bool = True,
666
- ) -> pd.DataFrame:
667
- """
668
- Extract lemmatized text from the spaCy doc objects in the specified column.
669
-
670
- Parameters:
671
- df (pandas.DataFrame): The input DataFrame.
672
- input_column (str): Name of the column containing spaCy doc objects. Default is 'spacy_output'.
673
- output_column (str): Name of the output column for lemmatized text. Default is 'lemmatized_text'.
674
- text_pos (List[str]): List of POS tags to exclude from lemmatization and return the text. Default is ['PRON'].
675
- remove_punct (bool): Whether to remove punctuation. Default is True.
676
- remove_space (bool): Whether to remove whitespace tokens. Default is True.
677
- remove_stop (bool): Whether to remove stop words. Default is True.
678
- keep_tokens (List[str]): List of token texts to always keep. Default is None.
679
- keep_pos (List[str]): List of POS tags to always keep. Default is None.
680
- keep_dep (List[str]): List of dependency labels to always keep. Default is None.
681
- join_tokens (bool): Whether to join tokens into a string. If False, returns a list of tokens. Default is True.
682
-
683
- Returns:
684
- pandas.DataFrame: The input DataFrame with an additional column containing lemmatized text or token list.
685
- """
686
- # Create masked DataFrame
687
- masked_df, mask = create_masked_df(df, [input_column])
688
-
689
- def remove_token(token):
690
- """
691
- Returns True if the token should be removed.
692
- """
693
- if (
694
- (keep_tokens and token.text in keep_tokens)
695
- or (keep_pos and token.pos_ in keep_pos)
696
- or (keep_dep and token.dep_ in keep_dep)
697
- ):
698
- return False
699
- return (
700
- (remove_punct and token.is_punct)
701
- or (remove_space and token.is_space)
702
- or (remove_stop and token.is_stop)
703
- )
704
-
705
- def process_text(doc):
706
- tokens = [
707
- token.text if token.pos_ in text_pos else token.lemma_
708
- for token in doc
709
- if not remove_token(token)
710
- ]
711
- return " ".join(tokens) if join_tokens else tokens
712
-
713
- # Apply processing
714
- masked_df[output_column] = masked_df[input_column].apply(process_text)
715
-
716
- # Combine results
717
- df_to_return = combine_results(df, masked_df, mask, output_column)
718
-
719
- return df_to_return
720
-
721
-
722
- @pf.register_dataframe_method
723
- def preprocess_text(
724
- df: pd.DataFrame,
725
- input_column: str,
726
- output_column: str = None,
727
- remove_html: bool = True,
728
- lower_case: bool = False,
729
- normalize_whitespace: bool = True,
730
- remove_numbers: bool = False,
731
- remove_stopwords: bool = False,
732
- flag_short_comments: bool = False,
733
- min_comment_length: int = 5,
734
- max_comment_length: int = None,
735
- remove_punctuation: bool = True,
736
- keep_sentence_punctuation: bool = True,
737
- comment_length_column: str = None,
738
- ) -> pd.DataFrame:
739
- """
740
- Preprocess text data in the specified column, tailored for survey responses.
741
-
742
- Parameters:
743
- df (pandas.DataFrame): The input DataFrame.
744
- input_column (str): Name of the column containing text to preprocess.
745
- output_column (str): Name of the output column. If None, overwrites the input column.
746
- remove_html (bool): Whether to remove unexpected HTML tags. Default is True.
747
- lower_case (bool): Whether to lowercase all words. Default is False
748
- normalize_whitespace (bool): Whether to normalize whitespace. Default is True.
749
- remove_numbers (bool): Whether to remove numbers. Default is False.
750
- remove_stopwords (bool): Whether to remove stop words. Default is False.
751
- flag_short_comments (bool): Whether to flag very short comments. Default is False.
752
- min_comment_length (int): Minimum length of comment to not be flagged as short. Default is 5.
753
- max_comment_length (int): Maximum length of comment to keep. If None, keeps full length. Default is None.
754
- remove_extra_punctuation (bool): Whether to remove extra punctuation. Default is True.
755
- keep_sentence_punctuation (bool): Whether to keep sentence-level punctuation. Default is True.
756
- comment_length_column (str): Name of the column to store comment lengths. If None, no column is added. Default is None.
757
-
758
- Returns:
759
- pandas.DataFrame: The input DataFrame with preprocessed text and optionally new columns for short comments, truncation info, and comment length.
760
- """
761
- output_column = output_column or input_column
762
-
763
- # Create masked DataFrame
764
- masked_df, mask = create_masked_df(df, [input_column])
765
-
766
- def process_text(text):
767
- if lower_case:
768
- text = text.lower()
769
- if remove_html:
770
- text = strip_tags(text)
771
-
772
- if normalize_whitespace:
773
- text = strip_multiple_whitespaces(text)
774
-
775
- if remove_numbers:
776
- text = strip_numeric(text)
777
-
778
- if remove_stopwords:
779
- text = remove_stopwords(text)
780
-
781
- if remove_punctuation:
782
- if keep_sentence_punctuation:
783
- # Remove all punctuation except .,!?'" and apostrophes
784
- text = re.sub(r"[^\w\s.,!?'\"]", "", text)
785
- # Remove spaces before punctuation, but not before apostrophes
786
- text = re.sub(r"\s([.,!?\"](?:\s|$))", r"\1", text)
787
- else:
788
- # Remove all punctuation except apostrophes
789
- text = re.sub(r"[^\w\s']", "", text)
790
-
791
- text = text.strip()
792
-
793
- if max_comment_length:
794
- text = text[:max_comment_length]
795
-
796
- return text
797
-
798
- # Apply processing
799
- masked_df[output_column] = masked_df[input_column].apply(process_text)
800
-
801
- columns_to_combine = [output_column]
802
-
803
- if flag_short_comments:
804
- short_comment_col = f"{output_column}_is_short"
805
- masked_df[short_comment_col] = (
806
- masked_df[output_column].str.len() < min_comment_length
807
- )
808
- columns_to_combine.append(short_comment_col)
809
-
810
- if max_comment_length:
811
- truncated_col = f"{output_column}_was_truncated"
812
- masked_df[truncated_col] = (
813
- masked_df[input_column].str.len() > max_comment_length
814
- )
815
- columns_to_combine.append(truncated_col)
816
-
817
- if comment_length_column:
818
- masked_df[comment_length_column] = masked_df[output_column].str.len()
819
- columns_to_combine.append(comment_length_column)
820
-
821
- # Combine results
822
- df_to_return = combine_results(df, masked_df, mask, columns_to_combine)
823
-
824
- return df_to_return
1
+ import re
2
+ import warnings
3
+ from collections import defaultdict
4
+ from typing import List, Tuple, Union
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import pandas_flavor as pf
9
+ import spacy
10
+ from gensim.parsing.preprocessing import (
11
+ remove_stopwords,
12
+ strip_multiple_whitespaces,
13
+ strip_numeric,
14
+ strip_tags,
15
+ )
16
+ from scipy.special import softmax
17
+ from sentence_transformers import SentenceTransformer
18
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
19
+
20
+ from pandas_survey_toolkit.analytics import fit_cluster_hdbscan, fit_umap
21
+ from pandas_survey_toolkit.utils import (
22
+ apply_vectorizer,
23
+ combine_results,
24
+ create_masked_df,
25
+ )
26
+
27
+
28
+ @pf.register_dataframe_method
29
+ def cluster_questions(
30
+ df,
31
+ columns=None,
32
+ pattern=None,
33
+ likert_mapping=None,
34
+ umap_n_neighbors=15,
35
+ umap_min_dist=0.1,
36
+ hdbscan_min_cluster_size=20,
37
+ hdbscan_min_samples=None,
38
+ cluster_selection_epsilon=0.4,
39
+ ):
40
+ """Cluster Likert scale questions based on response patterns.
41
+
42
+ Parameters
43
+ ----------
44
+ df : pandas.DataFrame
45
+ The input DataFrame.
46
+ columns : list, optional
47
+ List of column names to cluster. If None, all columns matching the pattern will be used.
48
+ pattern : str, optional
49
+ Regex pattern to match column names. Used if columns is None.
50
+ likert_mapping : dict, optional
51
+ Custom mapping for Likert scale responses. If None, default mapping is used.
52
+ umap_n_neighbors : int, optional
53
+ The size of local neighborhood for UMAP. Default is 15.
54
+ umap_min_dist : float, optional
55
+ The minimum distance between points in UMAP. Default is 0.1.
56
+ hdbscan_min_cluster_size : int, optional
57
+ The minimum size of clusters for HDBSCAN. Default is 20.
58
+ hdbscan_min_samples : int, optional
59
+ The number of samples in a neighborhood for a core point in HDBSCAN. Default is None.
60
+ cluster_selection_epsilon : float, optional
61
+ A distance threshold. Clusters below this value will be merged. Default is 0.4.
62
+ Higher epsilon means fewer, larger clusters.
63
+
64
+ Returns
65
+ -------
66
+ pandas.DataFrame
67
+ The input DataFrame with additional columns for encoded Likert responses,
68
+ UMAP coordinates, and cluster IDs.
69
+
70
+ Raises
71
+ ------
72
+ ValueError
73
+ If neither 'columns' nor 'pattern' is provided.
74
+ """
75
+ # Select columns
76
+ if columns is None and pattern is None:
77
+ raise ValueError("Either 'columns' or 'pattern' must be provided.")
78
+ elif columns is None:
79
+ columns = df.filter(regex=pattern).columns.tolist()
80
+
81
+ # Encode Likert scales
82
+ df = df.encode_likert(columns, custom_mapping=likert_mapping)
83
+ encoded_columns = [f"likert_encoded_{col}" for col in columns]
84
+
85
+ # Apply UMAP
86
+ df = df.fit_umap(
87
+ input_columns=encoded_columns,
88
+ output_columns=["likert_umap_x", "likert_umap_y"],
89
+ n_neighbors=umap_n_neighbors,
90
+ min_dist=umap_min_dist,
91
+ metric="cosine",
92
+ )
93
+
94
+ # Apply HDBSCAN
95
+ df = df.fit_cluster_hdbscan(
96
+ input_columns=["likert_umap_x", "likert_umap_y"],
97
+ output_columns=["question_cluster_id", "question_cluster_probability"],
98
+ min_cluster_size=hdbscan_min_cluster_size,
99
+ min_samples=hdbscan_min_samples,
100
+ cluster_selection_epsilon=cluster_selection_epsilon,
101
+ )
102
+
103
+ return df
104
+
105
+
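For orientation, a minimal usage sketch of the cluster_questions accessor follows. It is illustrative only: the module path pandas_survey_toolkit.nlp is an assumption (importing whichever module defines these functions registers them on DataFrames via pandas_flavor), the file and column names are invented, and UMAP/HDBSCAN need a reasonably large number of respondents before the clusters mean anything.

    import pandas as pd
    import pandas_survey_toolkit.nlp  # assumed module path; import registers df.cluster_questions

    survey_df = pd.read_csv("survey_responses.csv")   # hypothetical file, one row per respondent
    clustered = survey_df.cluster_questions(pattern=r"_likert$")
    # adds likert_encoded_* columns, likert_umap_x / likert_umap_y,
    # question_cluster_id and question_cluster_probability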
106
+ @pf.register_dataframe_method
107
+ def encode_likert(
108
+ df, likert_columns, output_prefix="likert_encoded_", custom_mapping=None, debug=True
109
+ ):
110
+ """Encode Likert scale responses to numeric values.
111
+
112
+ Parameters
113
+ ----------
114
+ df : pandas.DataFrame
115
+ The input DataFrame.
116
+ likert_columns : list
117
+ List of column names containing Likert scale responses.
118
+ output_prefix : str, optional
119
+ Prefix for the new encoded columns. Default is 'likert_encoded_'.
120
+ custom_mapping : dict, optional
121
+ Optional custom mapping for Likert scale responses.
122
+ debug : bool, optional
123
+ If True, prints out the mappings. Default is True.
124
+
125
+ Returns
126
+ -------
127
+ pandas.DataFrame
128
+ The input DataFrame with additional columns for encoded Likert responses.
129
+
130
+ Notes
131
+ -----
132
+ Default mapping:
133
+ - -1: Phrases containing 'disagree', 'do not agree', etc.
134
+ - 0: Phrases containing 'neutral', 'neither', 'unsure', etc.
135
+ - +1: Phrases containing 'agree' (but not 'disagree' or 'not agree')
136
+ - NaN: NaN values are preserved
137
+ """
138
+
139
+
140
+ def default_mapping(response):
141
+ if pd.isna(response):
142
+ return pd.NA
143
+ response = str(response).lower().strip()
144
+
145
+ # Neutral / Neither / Unsure / Don't know (0)
146
+ if re.search(r"\b(neutral|neither|unsure|know)\b", response) or re.search(
147
+ r"neither\s+agree\s+nor\s+disagree", response
148
+ ):
149
+ return 0
150
+
151
+ # Disagree / Dissatisfied (-1)
152
+ if re.search(r"\b(disagree)\b", response) or re.search(
153
+ r"\b(dis|not|no)[-]{0,1}\s*(agree|satisf)", response
154
+ ):
155
+ return -1
156
+
157
+ # Agree / Satisfied (1)
158
+ if re.search(r"\bagree\b", response) or re.search(r"satisf", response):
159
+ return 1
160
+
161
+ # Unable to classify
162
+ return None
163
+
164
+ conversion_summary = defaultdict(int)
165
+ unconverted_phrases = set()
166
+
167
+ if custom_mapping is None:
168
+ mapping_func = default_mapping
169
+ if debug:
170
+ print("Using default mapping:")
171
+ print("-1: Phrases containing 'disagree', 'do not agree', etc.")
172
+ print(" 0: Phrases containing 'neutral', 'neither', 'unsure', etc.")
173
+ print("+1: Phrases containing 'agree' (but not 'disagree' or 'not agree')")
174
+ print("NaN: NaN values are preserved")
175
+ else:
176
+
177
+ def mapping_func(response):
178
+ if pd.isna(response):
179
+ return pd.NA
180
+ converted = custom_mapping.get(str(response).lower().strip())
181
+ if converted is None:
182
+ unconverted_phrases.add(str(response))
183
+ return pd.NA
184
+ return converted
185
+
186
+ if debug:
187
+ print("Using custom mapping:", custom_mapping)
188
+ print("NaN: NaN values are preserved")
189
+
190
+ for column in likert_columns:
191
+ output_column = f"{output_prefix}{column}"
192
+ df[output_column] = df[column].apply(lambda x: mapping_func(x))
193
+
194
+ # Update conversion summary
195
+ for original, converted in zip(df[column], df[output_column]):
196
+ conversion_summary[f"{original} -> {converted}"] += 1
197
+
198
+ if debug:
199
+ for conversion, count in conversion_summary.items():
200
+ print(f" {conversion}: {count} times")
201
+
202
+ # Alert about unconverted phrases
203
+ if unconverted_phrases:
204
+ warnings.warn(
205
+ f"The following phrases were not converted (mapped to NaN): {', '.join(unconverted_phrases)}"
206
+ )
207
+
208
+ # Alert if default mapping didn't convert everything
209
+ if custom_mapping is None:
210
+ all_responses = set()
211
+ for column in likert_columns:
212
+ all_responses.update(df[column].dropna().unique())
213
+ unconverted = [
214
+ resp for resp in all_responses if default_mapping(resp) not in [-1, 0, 1]
215
+ ]
216
+ if unconverted:
217
+ warnings.warn(
218
+ f"The default mapping didn't convert the following responses: {', '.join(unconverted)}"
219
+ )
220
+
221
+ return df
222
+
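A sketch of encode_likert with a custom mapping; per the implementation above, mapping keys are compared against the lowercased, stripped response text, and missing responses stay missing. The column name and responses are invented.

    import pandas as pd
    import pandas_survey_toolkit.nlp  # assumed module path

    df = pd.DataFrame({"q1": ["Strongly agree", "Disagree", None, "Don't know"]})
    mapping = {"strongly agree": 1, "disagree": -1, "don't know": 0}
    df = df.encode_likert(["q1"], custom_mapping=mapping)
    # 'likert_encoded_q1' now holds 1, -1, <NA>, 0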
223
+ @pf.register_dataframe_method
224
+ def extract_keywords(
225
+ df: pd.DataFrame,
226
+ input_column: str,
227
+ output_column: str = "keywords",
228
+ preprocessed_column: str = "preprocessed_text",
229
+ spacy_column: str = "spacy_output",
230
+ lemma_column: str = "lemmatized_text",
231
+ top_n: int = 3,
232
+ threshold: float = 0.4,
233
+ ngram_range: Tuple[int, int] = (1, 1),
234
+ min_df: int = 5,
235
+ min_count: int = None,
236
+ min_proportion_with_keywords: float = 0.95,
237
+ **kwargs,
238
+ ) -> pd.DataFrame:
239
+ """Apply a pipeline of text preprocessing, spaCy processing, lemmatization, and TF-IDF
240
+ to extract keywords from the specified column.
241
+
242
+ Parameters
243
+ ----------
244
+ df : pandas.DataFrame
245
+ The input DataFrame.
246
+ input_column : str
247
+ Name of the column containing text to process.
248
+ output_column : str, optional
249
+ Name of the column to store the extracted keywords. Default is 'keywords'.
250
+ preprocessed_column : str, optional
251
+ Name of the column to store preprocessed text. Default is 'preprocessed_text'.
252
+ spacy_column : str, optional
253
+ Name of the column to store spaCy output. Default is 'spacy_output'.
254
+ lemma_column : str, optional
255
+ Name of the column to store lemmatized text. Default is 'lemmatized_text'.
256
+ top_n : int, optional
257
+ Number of top keywords to extract for each document. Default is 3.
258
+ threshold : float, optional
259
+ Minimum TF-IDF score for a keyword to be included. Default is 0.4.
260
+ ngram_range : tuple, optional
261
+ The lower and upper boundary of the range of n-values for different n-grams to be extracted.
262
+ Default is (1, 1) which means only unigrams.
263
+ min_df : int, optional
264
+ Minimum document frequency for TF-IDF. Default is 5.
265
+ min_count : int, optional
266
+ Minimum count for a keyword to be considered common in refinement. Default is None.
267
+ min_proportion_with_keywords : float, optional
268
+ Minimum proportion of rows that should have keywords after refinement. Default is 0.95.
269
+ **kwargs
270
+ Additional keyword arguments to pass to the preprocessing, spaCy,
271
+ lemmatization, or TF-IDF functions.
272
+
273
+ Returns
274
+ -------
275
+ pandas.DataFrame
276
+ The input DataFrame with additional columns for preprocessed text,
277
+ spaCy output, lemmatized text, and extracted keywords.
278
+ """
279
+ df_temp = df.copy()
280
+ # Step 1: Preprocess text
281
+ df_temp = df_temp.preprocess_text(
282
+ input_column=input_column,
283
+ output_column=preprocessed_column,
284
+ **kwargs.get("preprocess_kwargs", {}),
285
+ )
286
+
287
+ df_temp = df_temp.remove_short_comments(
288
+ input_column=input_column, min_comment_length=5
289
+ )
290
+
291
+ # Step 2: Apply spaCy
292
+ df_temp = df_temp.fit_spacy(
293
+ input_column=preprocessed_column, output_column=spacy_column
294
+ )
295
+
296
+ # Step 3: Get lemmatized text
297
+ df_temp = df_temp.get_lemma(
298
+ input_column=spacy_column,
299
+ output_column=lemma_column,
300
+ **kwargs.get("lemma_kwargs", {}),
301
+ )
302
+
303
+ # Step 4: Apply TF-IDF and extract keywords
304
+ df_temp = df_temp.fit_tfidf(
305
+ input_column=lemma_column,
306
+ output_column=output_column,
307
+ top_n=top_n,
308
+ threshold=threshold,
309
+ ngram_range=ngram_range,
310
+ min_df=min_df,
311
+ **kwargs.get("tfidf_kwargs", {}),
312
+ )
313
+
314
+ df_temp = df_temp.refine_keywords(
315
+ keyword_column=output_column,
316
+ text_column=lemma_column,
317
+ min_proportion=min_proportion_with_keywords,
318
+ output_column="refined_keywords",
319
+ min_count=min_count,
320
+ )
321
+
322
+ return df_temp
323
+
324
+
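A usage sketch for the full keyword pipeline, under the same assumptions as above. Note that min_df=5 means a term must occur in at least five responses before TF-IDF considers it, so tiny DataFrames tend to produce empty keyword lists.

    import pandas as pd
    import pandas_survey_toolkit.nlp  # assumed module path

    comments = pd.read_csv("survey_comments.csv")   # hypothetical file with a 'comment' column
    comments = comments.extract_keywords(input_column="comment")
    # adds preprocessed_text, spacy_output, lemmatized_text,
    # keywords and refined_keywords columns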
325
+ @pf.register_dataframe_method
326
+ def refine_keywords(
327
+ df: pd.DataFrame,
328
+ keyword_column: str = "keywords",
329
+ text_column: str = "lemmatized_text",
330
+ min_count: Union[int, None] = None,
331
+ min_proportion: float = 0.95,
332
+ output_column: str = None,
333
+ debug: bool = True,
334
+ ) -> pd.DataFrame:
335
+ """Refine keywords by replacing rare keywords with more common ones based on the text content.
336
+
337
+ Parameters
338
+ ----------
339
+ df : pd.DataFrame
340
+ The input DataFrame.
341
+ keyword_column : str, optional
342
+ Name of the column containing keyword lists. Default is 'keywords'.
343
+ text_column : str, optional
344
+ Name of the column containing the original text. Default is 'lemmatized_text'.
345
+ min_count : int, optional
346
+ Minimum count for a keyword to be considered common. If None,
347
+ it will be determined automatically. Default is None.
348
+ min_proportion : float, optional
349
+ Minimum proportion of rows that should have keywords after refinement.
350
+ Used only if min_count is None. Default is 0.95.
351
+ output_column : str, optional
352
+ Column name for the refined keyword output. If None, the keyword_column
353
+ is overwritten. Default is None.
354
+ debug : bool, optional
355
+ If True, print detailed statistics about the refinement process. Default is True.
356
+
357
+ Returns
358
+ -------
359
+ pd.DataFrame
360
+ The input DataFrame with refined keywords.
361
+ """
362
+ if output_column is None:
363
+ output_column = keyword_column
364
+
365
+ # Create masked DataFrame
366
+ masked_df, mask = create_masked_df(df, [keyword_column, text_column])
367
+
368
+ # Step 1 & 2: Collect all keywords and count them
369
+ all_keywords = [
370
+ keyword
371
+ for keywords in masked_df[keyword_column]
372
+ if isinstance(keywords, list)
373
+ for keyword in keywords
374
+ ]
375
+ keyword_counts = pd.Series(all_keywords).value_counts()
376
+
377
+ def refine_row_keywords(row, common_keywords):
378
+ if pd.isna(row[text_column]) or not isinstance(row[keyword_column], list):
379
+ return []
380
+
381
+ text = str(row[text_column]).lower()
382
+ current_keywords = row[keyword_column]
383
+ refined_keywords = []
384
+
385
+ for keyword in current_keywords:
386
+ if keyword in common_keywords:
387
+ refined_keywords.append(keyword)
388
+ else:
389
+ # Find a replacement from common keywords
390
+ for common_keyword in sorted(
391
+ common_keywords, key=lambda k: (-keyword_counts[k], len(k))
392
+ ):
393
+ if (
394
+ common_keyword in text
395
+ and common_keyword not in refined_keywords
396
+ ):
397
+ refined_keywords.append(common_keyword)
398
+ break
399
+
400
+ # Ensure correct ordering based on appearance in the original text
401
+ return (
402
+ sorted(refined_keywords, key=lambda k: text.index(k))
403
+ if refined_keywords
404
+ else []
405
+ )
406
+
407
+ if min_count is None:
408
+ # Determine min_count automatically
409
+ def get_proportion_with_keywords(count):
410
+ common_keywords = set(keyword_counts[keyword_counts >= count].index)
411
+ refined_keywords = masked_df.apply(
412
+ lambda row: refine_row_keywords(row, common_keywords), axis=1
413
+ )
414
+ return (refined_keywords.str.len() > 0).mean()
415
+
416
+ min_count = 1
417
+ while get_proportion_with_keywords(min_count) > min_proportion:
418
+ min_count += 1
419
+ min_count -= 1 # Go back one step to ensure we're above the min_proportion
420
+
421
+ # Separate common and rare keywords
422
+ common_keywords = set(keyword_counts[keyword_counts >= min_count].index)
423
+
424
+ # Apply the refinement to each row
425
+ masked_df[output_column] = masked_df.apply(
426
+ lambda row: refine_row_keywords(row, common_keywords), axis=1
427
+ )
428
+
429
+ # Combine results
430
+ df_to_return = combine_results(df, masked_df, mask, [output_column])
431
+
432
+ if debug:
433
+ # Calculate statistics
434
+ original_keyword_count = masked_df[keyword_column].apply(
435
+ lambda x: len(x) if isinstance(x, list) else 0
436
+ )
437
+ refined_keyword_count = masked_df[output_column].apply(len)
438
+
439
+ original_unique_keywords = set(
440
+ keyword
441
+ for keywords in masked_df[keyword_column]
442
+ if isinstance(keywords, list)
443
+ for keyword in keywords
444
+ )
445
+ refined_unique_keywords = set(
446
+ keyword for keywords in masked_df[output_column] for keyword in keywords
447
+ )
448
+
449
+ print(f"Refinement complete. Min count used: {min_count}")
450
+ print(f"Original average keywords per row: {original_keyword_count.mean():.2f}")
451
+ print(f"Refined average keywords per row: {refined_keyword_count.mean():.2f}")
452
+ print(
453
+ f"Proportion of rows with keywords after refinement: {(refined_keyword_count > 0).mean():.2%}"
454
+ )
455
+ print(
456
+ f"Total unique keywords before refinement: {len(original_unique_keywords)}"
457
+ )
458
+ print(f"Total unique keywords after refinement: {len(refined_unique_keywords)}")
459
+ print(
460
+ f"Reduction in unique keywords: {(1 - len(refined_unique_keywords) / len(original_unique_keywords)):.2%}"
461
+ )
462
+
463
+ return df_to_return
464
+
465
+
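refine_keywords can also be run on its own after fit_tfidf. Continuing the sketch above (the 'keywords' and 'lemmatized_text' columns already exist):

    comments = comments.refine_keywords(
        keyword_column="keywords",
        text_column="lemmatized_text",
        min_proportion=0.95,   # pick the largest min_count that still leaves ~95% of rows with a keyword
        output_column="refined_keywords",
    )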
466
+ @pf.register_dataframe_method
467
+ def remove_short_comments(
468
+ df: pd.DataFrame, input_column: str, min_comment_length: int = 5
469
+ ) -> pd.DataFrame:
470
+ """Replace comments shorter than the specified minimum length with NaN.
471
+
472
+ Parameters
473
+ ----------
474
+ df : pandas.DataFrame
475
+ The input DataFrame.
476
+ input_column : str
477
+ Name of the column containing text to process.
478
+ min_comment_length : int, optional
479
+ Minimum length of comment to keep. Default is 5.
480
+
481
+ Returns
482
+ -------
483
+ pandas.DataFrame
484
+ The input DataFrame with short comments replaced by NaN.
485
+ """
486
+ # Create a copy of the DataFrame to avoid modifying the original
487
+ df_copy = df.copy()
488
+
489
+ # Replace short comments with NaN
490
+ df_copy[input_column] = df_copy[input_column].apply(
491
+ lambda x: x if isinstance(x, str) and len(x) >= min_comment_length else np.nan
492
+ )
493
+
494
+ return df_copy
495
+
496
+
497
+ @pf.register_dataframe_method
498
+ def fit_sentence_transformer(
499
+ df,
500
+ input_column: str,
501
+ model_name="all-MiniLM-L6-v2",
502
+ output_column="sentence_embedding",
503
+ ):
504
+ """Add vector embeddings for each string in the input column.
505
+
506
+ Creates sentence embeddings that can be used for downstream tasks like clustering.
507
+
508
+ Parameters
509
+ ----------
510
+ df : pandas.DataFrame
511
+ The input DataFrame.
512
+ input_column : str
513
+ Name of the column containing text to embed.
514
+ model_name : str, optional
515
+ Name of the sentence transformer model to use. Default is 'all-MiniLM-L6-v2'.
516
+ output_column : str, optional
517
+ Name of the column to store embeddings. Default is 'sentence_embedding'.
518
+
519
+ Returns
520
+ -------
521
+ pandas.DataFrame
522
+ The input DataFrame with an additional column containing sentence embeddings.
523
+ """
524
+
525
+ # Initialize the sentence transformer model
526
+ masked_df, mask = create_masked_df(df, [input_column])
527
+ model = SentenceTransformer(model_name)
528
+
529
+ # Create sentence embeddings
530
+ embeddings = model.encode(masked_df[input_column].tolist())
531
+
532
+ # Convert embeddings to a list of numpy arrays
533
+ embeddings_list = [embedding for embedding in embeddings]
534
+
535
+ # Add the embeddings as a new column in the dataframe
536
+ masked_df[output_column] = embeddings_list
537
+ df_to_return = combine_results(df, masked_df, mask, output_column)
538
+
539
+ return df_to_return
540
+
541
+
542
+
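A sketch of the embedding step on its own; 'all-MiniLM-L6-v2' is the documented default model and is downloaded by sentence-transformers on first use. Rows whose input is missing are masked out and left missing in the output column.

    import pandas as pd
    import pandas_survey_toolkit.nlp  # assumed module path

    df = pd.DataFrame({"comment": ["Great service", "Far too slow", None]})
    df = df.fit_sentence_transformer(input_column="comment")
    # 'sentence_embedding' holds one numpy vector per non-missing comment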
543
+ @pf.register_dataframe_method
544
+ def extract_sentiment(
545
+ df,
546
+ input_column: str,
547
+ output_columns=["positive", "neutral", "negative", "sentiment"],
548
+ ):
549
+ """Extract sentiment from text using the cardiffnlp/twitter-roberta-base-sentiment model.
550
+
551
+ Parameters
552
+ ----------
553
+ df : pandas.DataFrame
554
+ The input DataFrame.
555
+ input_column : str
556
+ Name of the column containing text to analyze.
557
+ output_columns : list, optional
558
+ List of column names for the output.
559
+ Default is ["positive", "neutral", "negative", "sentiment"].
560
+
561
+ Returns
562
+ -------
563
+ pandas.DataFrame
564
+ The input DataFrame with additional columns for sentiment scores and labels.
565
+ """
566
+
567
+ MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
568
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
569
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
570
+
571
+ masked_df, mask = create_masked_df(df, [input_column])
572
+
573
+ def analyze_sentiment(text):
574
+ encoded_input = tokenizer(
575
+ text, return_tensors="pt", truncation=True, max_length=512, padding=True
576
+ )
577
+ output = model(**encoded_input)
578
+ scores = output.logits[0].detach().numpy()
579
+ scores = softmax(scores)
580
+ return scores
581
+
582
+ sentiment_scores = masked_df[input_column].apply(analyze_sentiment)
583
+
584
+ masked_df[output_columns[0]] = sentiment_scores.apply(lambda x: x[2]) # Positive
585
+ masked_df[output_columns[1]] = sentiment_scores.apply(lambda x: x[1]) # Neutral
586
+ masked_df[output_columns[2]] = sentiment_scores.apply(lambda x: x[0]) # Negative
587
+
588
+ masked_df[output_columns[3]] = masked_df[
589
+ [output_columns[0], output_columns[1], output_columns[2]]
590
+ ].idxmax(axis=1)
591
+ masked_df[output_columns[3]] = masked_df[output_columns[3]].map(
592
+ {
593
+ output_columns[0]: "positive",
594
+ output_columns[1]: "neutral",
595
+ output_columns[2]: "negative",
596
+ }
597
+ )
598
+
599
+ df_to_return = combine_results(df, masked_df, mask, output_columns)
600
+ return df_to_return
601
+
602
+
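A sketch of extract_sentiment; the cardiffnlp/twitter-roberta-base-sentiment model is fetched from Hugging Face on first call, so this needs network access or a local cache. Output column names are the documented defaults.

    import pandas as pd
    import pandas_survey_toolkit.nlp  # assumed module path

    df = pd.DataFrame({"comment": ["I love the new booking process", "This was a waste of time"]})
    df = df.extract_sentiment(input_column="comment")
    # adds 'positive', 'neutral' and 'negative' scores plus a 'sentiment' label per row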
603
+ @pf.register_dataframe_method
604
+ def cluster_comments(
605
+ df: pd.DataFrame,
606
+ input_column: str,
607
+ output_columns: str = ["cluster", "cluster_probability"],
608
+ min_cluster_size=5,
609
+ cluster_selection_epsilon: float = 0.2,
610
+ n_neighbors: int = 15,
611
+ ):
612
+ """Apply a pipeline for clustering text comments.
613
+
614
+ Applies a pipeline of:
615
+ 1) Vector embeddings
616
+ 2) Dimensional reduction
617
+ 3) Clustering
618
+
619
+ This assigns each row a cluster ID so that similar free text comments
620
+ (found in the input_column) can be grouped together.
621
+
622
+ Parameters
623
+ ----------
624
+ df : pandas.DataFrame
625
+ The input DataFrame.
626
+ input_column : str
627
+ Name of the column containing text to cluster.
628
+ output_columns : list, optional
629
+ Names for the output columns. Default is ["cluster", "cluster_probability"].
630
+ min_cluster_size : int, optional
631
+ The minimum size of clusters for HDBSCAN. Default is 5.
632
+ cluster_selection_epsilon : float, optional
633
+ Distance threshold for HDBSCAN. Higher epsilon means fewer, larger clusters.
634
+ Default is 0.2.
635
+ n_neighbors : int, optional
636
+ The size of local neighborhood for UMAP. Default is 15.
637
+
638
+ Returns
639
+ -------
640
+ pandas.DataFrame
641
+ The input DataFrame with additional columns for cluster IDs and probabilities.
642
+ """
643
+
644
+
645
+ df_temp = (
646
+ df.fit_sentence_transformer(
647
+ input_column=input_column, output_column="sentence_embedding"
648
+ )
649
+ .fit_umap(
650
+ input_columns="sentence_embedding",
651
+ embeddings_in_list=True,
652
+ n_neighbors=n_neighbors,
653
+ )
654
+ .fit_cluster_hdbscan(
655
+ output_columns=output_columns,
656
+ min_cluster_size=min_cluster_size,
657
+ cluster_selection_epsilon=cluster_selection_epsilon,
658
+ )
659
+ )
660
+
661
+ return df_temp
662
+
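cluster_comments simply chains fit_sentence_transformer, fit_umap and fit_cluster_hdbscan with the defaults shown above. Continuing the earlier sketch (and again assuming enough rows for UMAP/HDBSCAN to be meaningful):

    comments = comments.cluster_comments(input_column="comment", min_cluster_size=5)
    # 'cluster' holds the cluster id (-1 is HDBSCAN's noise label),
    # 'cluster_probability' the strength of cluster membership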
663
+ @pf.register_dataframe_method
664
+ def fit_tfidf(
665
+ df: pd.DataFrame,
666
+ input_column: str,
667
+ output_column: str = "keywords",
668
+ top_n: int = 3,
669
+ threshold: float = 0.6,
670
+ append_features: bool = False,
671
+ ngram_range: Tuple[int, int] = (1, 1),
672
+ **tfidf_kwargs,
673
+ ) -> pd.DataFrame:
674
+ """Apply TF-IDF vectorization to extract top keywords from text.
675
+
676
+ Parameters
677
+ ----------
678
+ df : pandas.DataFrame
679
+ The input DataFrame.
680
+ input_column : str
681
+ Name of the column containing text to vectorize.
682
+ output_column : str, optional
683
+ Name of the column to store the extracted keywords. Default is 'keywords'.
684
+ top_n : int, optional
685
+ Number of top keywords to extract for each document. Default is 3.
686
+ threshold : float, optional
687
+ Minimum TF-IDF score for a keyword to be included. Default is 0.6.
688
+ append_features : bool, optional
689
+ If True, append all TF-IDF features to the DataFrame (useful for downstream machine learning tasks). Default is False.
690
+ ngram_range : tuple, optional
691
+ The lower and upper boundary of the range of n-values for different
692
+ n-grams to be extracted. Default is (1, 1) which means only unigrams.
693
+ Set to (1, 2) for unigrams and bigrams, and so on.
694
+ **tfidf_kwargs
695
+ Additional keyword arguments to pass to TfidfVectorizer.
696
+
697
+ Returns
698
+ -------
699
+ pandas.DataFrame
700
+ The input DataFrame with an additional column containing the top keywords.
701
+ """
702
+ # Create a masked DataFrame
703
+ masked_df, mask = create_masked_df(df, [input_column])
704
+
705
+ # Ensure ngram_range is included in the TfidfVectorizer parameters
706
+ tfidf_kwargs["ngram_range"] = ngram_range
707
+ # Inside fit_tfidf function
708
+ tfidf_kwargs["min_df"] = tfidf_kwargs.get("min_df", 1)
709
+
710
+ # Apply TF-IDF vectorization to the masked DataFrame
711
+ tfidf_features, _, feature_names = apply_vectorizer(
712
+ masked_df, input_column, vectorizer_name="TfidfVectorizer", **tfidf_kwargs
713
+ )
714
+
715
+ def extract_top_keywords(row: pd.Series) -> List[str]:
716
+ # Get indices of top N TF-IDF scores
717
+ top_indices = row.nlargest(top_n).index
718
+
719
+ # Get the original text for this row
720
+ original_text = masked_df.loc[row.name, input_column].lower()
721
+
722
+ # Filter based on threshold, presence in original text, and get the corresponding feature names
723
+ top_keywords = [
724
+ feature_names[i]
725
+ for i, idx in enumerate(tfidf_features.columns)
726
+ if idx in top_indices
727
+ and row[idx] >= threshold
728
+ and feature_names[i].lower() in original_text
729
+ ]
730
+
731
+ # Sort keywords based on their order in the original text
732
+ return sorted(top_keywords, key=lambda x: original_text.index(x.lower()))
733
+
734
+ # Extract top keywords for each document
735
+ masked_df[output_column] = tfidf_features.apply(extract_top_keywords, axis=1)
736
+
737
+ # Combine the results back into the original DataFrame
738
+ result_df = combine_results(df, masked_df, mask, [output_column])
739
+
740
+ # Optionally append all TF-IDF features
741
+ if append_features:
742
+ # We need to handle NaN values in the features as well
743
+ feature_columns = tfidf_features.columns.tolist()
744
+ masked_df = pd.concat([masked_df, tfidf_features], axis=1)
745
+ result_df = combine_results(result_df, masked_df, mask, feature_columns)
746
+
747
+ return result_df
748
+
749
+
750
+ @pf.register_dataframe_method
751
+ def fit_spacy(df, input_column: str, output_column: str = "spacy_output"):
752
+ """Apply the en_core_web_md spaCy model to the specified column.
753
+
754
+ Parameters
755
+ ----------
756
+ df : pandas.DataFrame
757
+ The input DataFrame.
758
+ input_column : str
759
+ Name of the column containing text to analyze.
760
+ output_column : str, optional
761
+ Name of the output column. Default is "spacy_output".
762
+
763
+ Returns
764
+ -------
765
+ pandas.DataFrame
766
+ The input DataFrame with an additional column containing spaCy doc objects.
767
+
768
+ Notes
769
+ -----
770
+ If the spaCy model is not already downloaded, this function will attempt
771
+ to download it automatically.
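+
+ Examples
+ --------
+ Illustrative sketch only: the column name "comment" is hypothetical, and
+ this module must already be imported so the accessor method is registered.
+
+ >>> import pandas as pd
+ >>> df = pd.DataFrame({"comment": ["I would recommend this service", None]})
+ >>> df = df.fit_spacy(input_column="comment")
+ >>> docs = df["spacy_output"]  # spaCy Doc objects; missing responses are masked out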
772
+ """
773
+
774
+ # Check if the model is downloaded, if not, download it
775
+ try:
776
+ nlp = spacy.load("en_core_web_md")
777
+ except OSError:
778
+ print("Downloading en_core_web_md model...")
779
+ spacy.cli.download("en_core_web_md")
780
+ nlp = spacy.load("en_core_web_md")
781
+
782
+ # Create masked DataFrame
783
+ masked_df, mask = create_masked_df(df, [input_column])
784
+
785
+ # Apply spaCy model
786
+ masked_df[output_column] = masked_df[input_column].apply(nlp)
787
+
788
+ # Combine results
789
+ df_to_return = combine_results(df, masked_df, mask, output_column)
790
+
791
+ return df_to_return
792
+
793
+
794
+ @pf.register_dataframe_method
795
+ def get_lemma(
796
+ df: pd.DataFrame,
797
+ input_column: str = "spacy_output",
798
+ output_column: str = "lemmatized_text",
799
+ text_pos: List[str] = ["PRON"],
800
+ remove_punct: bool = True,
801
+ remove_space: bool = True,
802
+ remove_stop: bool = True,
803
+ keep_tokens: Union[List[str], None] = None,
804
+ keep_pos: Union[List[str], None] = None,
805
+ keep_dep: Union[List[str], None] = ["neg"],
806
+ join_tokens: bool = True,
807
+ ) -> pd.DataFrame:
808
+ """Extract lemmatized text from spaCy doc objects.
809
+
810
+ Parameters
811
+ ----------
812
+ df : pandas.DataFrame
813
+ The input DataFrame.
814
+ input_column : str, optional
815
+ Name of the column containing spaCy doc objects. Default is 'spacy_output'.
816
+ output_column : str, optional
817
+ Name of the output column for lemmatized text. Default is 'lemmatized_text'.
818
+ text_pos : List[str], optional
819
+ List of POS tags whose tokens keep their original text instead of the lemma. Default is ['PRON'].
820
+ remove_punct : bool, optional
821
+ Whether to remove punctuation. Default is True.
822
+ remove_space : bool, optional
823
+ Whether to remove whitespace tokens. Default is True.
824
+ remove_stop : bool, optional
825
+ Whether to remove stop words. Default is True.
826
+ keep_tokens : List[str], optional
827
+ List of token texts to always keep. Default is None.
828
+ keep_pos : List[str], optional
829
+ List of POS tags to always keep. Default is None.
830
+ keep_dep : List[str], optional
831
+ List of dependency labels to always keep. Default is ["neg"].
832
+ join_tokens : bool, optional
833
+ Whether to join tokens into a string. If False, returns a list of tokens. Default is True.
834
+
835
+ Returns
836
+ -------
837
+ pandas.DataFrame
838
+ The input DataFrame with an additional column containing lemmatized text or token list.
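+
+ Examples
+ --------
+ Illustrative sketch only: assumes fit_spacy has already been run so that
+ the "spacy_output" column exists; the column name "comment" is hypothetical.
+
+ >>> import pandas as pd
+ >>> df = pd.DataFrame({"comment": ["The rooms were not cleaned properly"]})
+ >>> df = df.fit_spacy(input_column="comment")
+ >>> df = df.get_lemma(remove_stop=True, keep_dep=["neg"])
+ >>> lemmas = df["lemmatized_text"]  # e.g. the negation "not" is kept despite stop-word removal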
839
+ """
840
+
841
+ # Create masked DataFrame
842
+ masked_df, mask = create_masked_df(df, [input_column])
843
+
844
+ def remove_token(token):
845
+ """
846
+ Returns True if the token should be removed.
847
+ """
848
+ if (
849
+ (keep_tokens and token.text in keep_tokens)
850
+ or (keep_pos and token.pos_ in keep_pos)
851
+ or (keep_dep and token.dep_ in keep_dep)
852
+ ):
853
+ return False
854
+ return (
855
+ (remove_punct and token.is_punct)
856
+ or (remove_space and token.is_space)
857
+ or (remove_stop and token.is_stop)
858
+ )
859
+
860
+ def process_text(doc):
861
+ tokens = [
862
+ token.text if token.pos_ in text_pos else token.lemma_
863
+ for token in doc
864
+ if not remove_token(token)
865
+ ]
866
+ return " ".join(tokens) if join_tokens else tokens
867
+
868
+ # Apply processing
869
+ masked_df[output_column] = masked_df[input_column].apply(process_text)
870
+
871
+ # Combine results
872
+ df_to_return = combine_results(df, masked_df, mask, output_column)
873
+
874
+ return df_to_return
875
+
876
+
877
+ @pf.register_dataframe_method
878
+ def preprocess_text(
879
+ df: pd.DataFrame,
880
+ input_column: str,
881
+ output_column: str = None,
882
+ remove_html: bool = True,
883
+ lower_case: bool = False,
884
+ normalize_whitespace: bool = True,
885
+ remove_numbers: bool = False,
886
+ remove_stopwords: bool = False,
887
+ flag_short_comments: bool = False,
888
+ min_comment_length: int = 5,
889
+ max_comment_length: int = None,
890
+ remove_punctuation: bool = True,
891
+ keep_sentence_punctuation: bool = True,
892
+ comment_length_column: str = None,
893
+ ) -> pd.DataFrame:
894
+ """Preprocess text data in the specified column, tailored for survey responses.
895
+
896
+ Parameters
897
+ ----------
898
+ df : pandas.DataFrame
899
+ The input DataFrame.
900
+ input_column : str
901
+ Name of the column containing text to preprocess.
902
+ output_column : str, optional
903
+ Name of the output column. If None, overwrites the input column.
904
+ remove_html : bool, optional
905
+ Whether to strip any HTML tags found in the text. Default is True.
906
+ lower_case : bool, optional
907
+ Whether to lowercase all words. Default is False.
908
+ normalize_whitespace : bool, optional
909
+ Whether to normalize whitespace. Default is True.
910
+ remove_numbers : bool, optional
911
+ Whether to remove numbers. Default is False.
912
+ remove_stopwords : bool, optional
913
+ Whether to remove stop words. Default is False.
914
+ flag_short_comments : bool, optional
915
+ Whether to flag very short comments. Default is False.
916
+ min_comment_length : int, optional
917
+ Minimum length of comment to not be flagged as short. Default is 5.
918
+ max_comment_length : int, optional
919
+ Maximum length of comment to keep. If None, keeps full length. Default is None.
920
+ remove_punctuation : bool, optional
921
+ Whether to remove punctuation. Default is True.
922
+ keep_sentence_punctuation : bool, optional
923
+ Whether to keep sentence-level punctuation. Default is True.
924
+ comment_length_column : str, optional
925
+ Name of the column to store comment lengths. If None, no column is added. Default is None.
926
+
927
+ Returns
928
+ -------
929
+ pandas.DataFrame
930
+ The input DataFrame with preprocessed text and optionally new columns for
931
+ short comments, truncation info, and comment length.
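+
+ Examples
+ --------
+ Illustrative sketch only: the column name "comment" is hypothetical.
+
+ >>> import pandas as pd
+ >>> df = pd.DataFrame({"comment": ["<p>Great   value for money!!</p>", None]})
+ >>> df = df.preprocess_text(input_column="comment",
+ ...                         output_column="comment_clean",
+ ...                         comment_length_column="comment_length")
+ >>> cleaned = df["comment_clean"]  # HTML stripped, whitespace normalized, sentence punctuation kept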
932
+ """
933
+
934
+ output_column = output_column or input_column
935
+
936
+ # Create masked DataFrame
937
+ masked_df, mask = create_masked_df(df, [input_column])
938
+
939
+ def process_text(text):
940
+ if lower_case:
941
+ text = text.lower()
942
+ if remove_html:
943
+ text = strip_tags(text)
944
+
945
+ if normalize_whitespace:
946
+ text = strip_multiple_whitespaces(text)
947
+
948
+ if remove_numbers:
949
+ text = strip_numeric(text)
950
+
951
+ if remove_stopwords:
+ # the boolean parameter shadows gensim's remove_stopwords import,
+ # so call the function through an alias to avoid a TypeError
+ from gensim.parsing.preprocessing import remove_stopwords as gensim_remove_stopwords
+ text = gensim_remove_stopwords(text)
953
+
954
+ if remove_punctuation:
955
+ if keep_sentence_punctuation:
956
+ # Remove all punctuation except .,!?'" and apostrophes
957
+ text = re.sub(r"[^\w\s.,!?'\"]", "", text)
958
+ # Remove spaces before punctuation, but not before apostrophes
959
+ text = re.sub(r"\s([.,!?\"](?:\s|$))", r"\1", text)
960
+ else:
961
+ # Remove all punctuation except apostrophes
962
+ text = re.sub(r"[^\w\s']", "", text)
963
+
964
+ text = text.strip()
965
+
966
+ if max_comment_length:
967
+ text = text[:max_comment_length]
968
+
969
+ return text
970
+
971
+ # Apply processing
972
+ masked_df[output_column] = masked_df[input_column].apply(process_text)
973
+
974
+ columns_to_combine = [output_column]
975
+
976
+ if flag_short_comments:
977
+ short_comment_col = f"{output_column}_is_short"
978
+ masked_df[short_comment_col] = (
979
+ masked_df[output_column].str.len() < min_comment_length
980
+ )
981
+ columns_to_combine.append(short_comment_col)
982
+
983
+ if max_comment_length:
984
+ truncated_col = f"{output_column}_was_truncated"
985
+ masked_df[truncated_col] = (
986
+ masked_df[input_column].str.len() > max_comment_length
987
+ )
988
+ columns_to_combine.append(truncated_col)
989
+
990
+ if comment_length_column:
991
+ masked_df[comment_length_column] = masked_df[output_column].str.len()
992
+ columns_to_combine.append(comment_length_column)
993
+
994
+ # Combine results
995
+ df_to_return = combine_results(df, masked_df, mask, columns_to_combine)
996
+
997
+ return df_to_return