ssdiff 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ssdiff-0.2.0/ssdiff.egg-info → ssdiff-0.2.1}/PKG-INFO +44 -17
- {ssdiff-0.2.0 → ssdiff-0.2.1}/README.md +43 -16
- {ssdiff-0.2.0 → ssdiff-0.2.1}/pyproject.toml +1 -1
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff/lexicon.py +280 -104
- {ssdiff-0.2.0 → ssdiff-0.2.1/ssdiff.egg-info}/PKG-INFO +44 -17
- {ssdiff-0.2.0 → ssdiff-0.2.1}/LICENSE +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/setup.cfg +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff/__init__.py +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff/clusters.py +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff/core.py +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff/crossgroup.py +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff/io_utils.py +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff/preprocess.py +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff/snippets.py +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff/sweep.py +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff/utils.py +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff.egg-info/SOURCES.txt +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff.egg-info/dependency_links.txt +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff.egg-info/requires.txt +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/ssdiff.egg-info/top_level.txt +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/tests/test_basic_pipeline.py +0 -0
- {ssdiff-0.2.0 → ssdiff-0.2.1}/tests/test_imports.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ssdiff
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Supervised Semantic Differential (SSD): interpretable, embedding-based analysis of concept meaning in text.
|
|
5
5
|
Author-email: Hubert Plisiecki <hplisiecki@gmail.com>, Paweł Lenartowicz <pawellenartowicz@europe.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -271,49 +271,75 @@ These helpers make lexicon selection transparent and data-driven (you can also h
|
|
|
271
271
|
|
|
272
272
|
### `suggest_lexicon(...)`
|
|
273
273
|
|
|
274
|
-
Rank tokens by balanced coverage with a mild penalty for strong
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
274
|
+
Rank tokens by balanced coverage with a mild penalty for strong association with the outcome.
|
|
275
|
+
|
|
276
|
+
All three lexicon utilities accept `var_type='continuous'` (default) or `var_type='categorical'`:
|
|
277
|
+
|
|
278
|
+
| | `var_type='continuous'` | `var_type='categorical'` |
|
|
279
|
+
|---|---|---|
|
|
280
|
+
| `cov_bal` | average presence across 𝑛 quantile bins of 𝑦 | average presence across group labels |
|
|
281
|
+
| `corr` | Pearson correlation between 0/1 presence and standardized 𝑦 | Cramér's V between 0/1 presence and group label |
|
|
282
|
+
| `q1` / `q4` | coverage in lowest / highest 𝑦 quantile bin | min / max group coverage |
|
|
283
|
+
| `rank` | `cov_bal * (1 - min(1, \|corr\|/corr_cap))` | same formula (Cramér's V replaces Pearson) |
|
|
279
284
|
|
|
280
285
|
Accepts a DataFrame (`text_col`, `score_col`) or a `(texts, y)` tuple where texts can be raw strings or token lists.
|
|
281
286
|
|
|
282
287
|
```python
|
|
283
288
|
from ssdiff import suggest_lexicon
|
|
284
289
|
|
|
285
|
-
#
|
|
290
|
+
# Continuous outcome (default)
|
|
286
291
|
cands_df = suggest_lexicon(df, text_col="lemmatized", score_col="questionnaire_result", top_k=150)
|
|
287
292
|
|
|
288
293
|
# Or using a tuple (texts, y)
|
|
289
294
|
texts = [" ".join(doc) for doc in docs]
|
|
290
295
|
cands_df2 = suggest_lexicon((docs, y), top_k=150)
|
|
296
|
+
|
|
297
|
+
# Categorical groups
|
|
298
|
+
cands_cat = suggest_lexicon(df, text_col="lemmatized", score_col="diagnosis", top_k=150, var_type="categorical")
|
|
299
|
+
cands_cat2 = suggest_lexicon((docs, groups), top_k=150, var_type="categorical")
|
|
291
300
|
```
|
|
292
301
|
### `token_presence_stats(...)`
|
|
293
302
|
|
|
294
|
-
Per-token coverage &
|
|
303
|
+
Per-token coverage & association diagnostics:
|
|
295
304
|
```python
|
|
296
305
|
from ssdiff import token_presence_stats
|
|
297
|
-
|
|
298
|
-
|
|
306
|
+
|
|
307
|
+
# Continuous
|
|
308
|
+
stats = token_presence_stats(texts, y, token="concept_keyword_1", n_bins=4, verbose=True)
|
|
309
|
+
print(stats) # dict: token, docs, cov_all, cov_bal, corr, rank, q1, q4
|
|
310
|
+
|
|
311
|
+
# Categorical — output also includes group_cov (per-group coverage dict)
|
|
312
|
+
stats = token_presence_stats(texts, groups, token="concept_keyword_1", var_type="categorical", verbose=True)
|
|
313
|
+
print(stats["group_cov"]) # e.g. {"control": 0.45, "depression": 0.62}
|
|
299
314
|
```
|
|
300
315
|
|
|
301
316
|
### `coverage_by_lexicon(...)`
|
|
302
317
|
|
|
303
318
|
Summary for your chosen lexicon:
|
|
304
|
-
- `summary` : `docs_any`, `cov_all`, `q1
|
|
305
|
-
- `q1` / `q4`: coverage within the lowest/highest 𝑦 bins (
|
|
306
|
-
- `
|
|
319
|
+
- `summary` : `docs_any`, `cov_all`, `q1`, `q4`, `corr_any`, `hits_mean`, `hits_median`, `types_mean`, `types_median`
|
|
320
|
+
- `q1` / `q4`: coverage within the lowest/highest 𝑦 bins (continuous) or min/max group coverage (categorical)
|
|
321
|
+
- when `var_type='categorical'`, summary also includes `group_cov` (per-group coverage dict)
|
|
322
|
+
- `per_token_df`: per-token stats
|
|
307
323
|
|
|
308
324
|
```python
|
|
309
325
|
from ssdiff import coverage_by_lexicon
|
|
310
326
|
|
|
327
|
+
# Continuous
|
|
311
328
|
summary, per_tok = coverage_by_lexicon(
|
|
312
329
|
(texts, y),
|
|
313
330
|
lexicon={"concept_keyword_1", "concept_keyword_2", "concept_keyword_3", "concept_keyword_4"},
|
|
314
331
|
n_bins=4,
|
|
315
|
-
verbose=True
|
|
332
|
+
verbose=True,
|
|
333
|
+
)
|
|
334
|
+
|
|
335
|
+
# Categorical
|
|
336
|
+
summary, per_tok = coverage_by_lexicon(
|
|
337
|
+
(texts, groups),
|
|
338
|
+
lexicon={"concept_keyword_1", "concept_keyword_2"},
|
|
339
|
+
var_type="categorical",
|
|
340
|
+
verbose=True,
|
|
316
341
|
)
|
|
342
|
+
print(summary["group_cov"]) # e.g. {"control": 0.80, "depression": 0.75}
|
|
317
343
|
```
|
|
318
344
|
|
|
319
345
|
---
|
|
@@ -746,9 +772,10 @@ Returned by `SSDGroup.get_contrast()`. Duck-types with `SSD` for interpretation:
|
|
|
746
772
|
- `build_docs_from_preprocessed(pre_docs)` → list[list[str]] (lemmas for modeling)
|
|
747
773
|
|
|
748
774
|
### Lexicon
|
|
749
|
-
- `suggest_lexicon(df_or_tuple, text_col=None, score_col=None, top_k=150, min_docs=5, n_bins=4, corr_cap=0.30)` → DataFrame
|
|
750
|
-
- `token_presence_stats(
|
|
751
|
-
- `coverage_by_lexicon(df_or_tuple, lexicon, n_bins=4, verbose=False)` → `(summary, per_token_df)`
|
|
775
|
+
- `suggest_lexicon(df_or_tuple, text_col=None, score_col=None, top_k=150, min_docs=5, n_bins=4, corr_cap=0.30, var_type='continuous')` → DataFrame
|
|
776
|
+
- `token_presence_stats(texts, y, token, n_bins=4, corr_cap=0.30, verbose=False, var_type='continuous')` → dict
|
|
777
|
+
- `coverage_by_lexicon(df_or_tuple, text_col=None, score_col=None, lexicon=(), n_bins=4, verbose=False, var_type='continuous')` → `(summary, per_token_df)`
|
|
778
|
+
- `var_type`: `'continuous'` (numeric outcome, default) or `'categorical'` (group labels). When categorical, `corr` is Cramér's V, `cov_bal` is balanced across groups, and `q1`/`q4` are min/max group coverage.
|
|
752
779
|
|
|
753
780
|
---
|
|
754
781
|
## Citing & License
|
|
@@ -230,49 +230,75 @@ These helpers make lexicon selection transparent and data-driven (you can also h
|
|
|
230
230
|
|
|
231
231
|
### `suggest_lexicon(...)`
|
|
232
232
|
|
|
233
|
-
Rank tokens by balanced coverage with a mild penalty for strong
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
233
|
+
Rank tokens by balanced coverage with a mild penalty for strong association with the outcome.
|
|
234
|
+
|
|
235
|
+
All three lexicon utilities accept `var_type='continuous'` (default) or `var_type='categorical'`:
|
|
236
|
+
|
|
237
|
+
| | `var_type='continuous'` | `var_type='categorical'` |
|
|
238
|
+
|---|---|---|
|
|
239
|
+
| `cov_bal` | average presence across 𝑛 quantile bins of 𝑦 | average presence across group labels |
|
|
240
|
+
| `corr` | Pearson correlation between 0/1 presence and standardized 𝑦 | Cramér's V between 0/1 presence and group label |
|
|
241
|
+
| `q1` / `q4` | coverage in lowest / highest 𝑦 quantile bin | min / max group coverage |
|
|
242
|
+
| `rank` | `cov_bal * (1 - min(1, \|corr\|/corr_cap))` | same formula (Cramér's V replaces Pearson) |
|
|
238
243
|
|
|
239
244
|
Accepts a DataFrame (`text_col`, `score_col`) or a `(texts, y)` tuple where texts can be raw strings or token lists.
|
|
240
245
|
|
|
241
246
|
```python
|
|
242
247
|
from ssdiff import suggest_lexicon
|
|
243
248
|
|
|
244
|
-
#
|
|
249
|
+
# Continuous outcome (default)
|
|
245
250
|
cands_df = suggest_lexicon(df, text_col="lemmatized", score_col="questionnaire_result", top_k=150)
|
|
246
251
|
|
|
247
252
|
# Or using a tuple (texts, y)
|
|
248
253
|
texts = [" ".join(doc) for doc in docs]
|
|
249
254
|
cands_df2 = suggest_lexicon((docs, y), top_k=150)
|
|
255
|
+
|
|
256
|
+
# Categorical groups
|
|
257
|
+
cands_cat = suggest_lexicon(df, text_col="lemmatized", score_col="diagnosis", top_k=150, var_type="categorical")
|
|
258
|
+
cands_cat2 = suggest_lexicon((docs, groups), top_k=150, var_type="categorical")
|
|
250
259
|
```
|
|
251
260
|
### `token_presence_stats(...)`
|
|
252
261
|
|
|
253
|
-
Per-token coverage &
|
|
262
|
+
Per-token coverage & association diagnostics:
|
|
254
263
|
```python
|
|
255
264
|
from ssdiff import token_presence_stats
|
|
256
|
-
|
|
257
|
-
|
|
265
|
+
|
|
266
|
+
# Continuous
|
|
267
|
+
stats = token_presence_stats(texts, y, token="concept_keyword_1", n_bins=4, verbose=True)
|
|
268
|
+
print(stats) # dict: token, docs, cov_all, cov_bal, corr, rank, q1, q4
|
|
269
|
+
|
|
270
|
+
# Categorical — output also includes group_cov (per-group coverage dict)
|
|
271
|
+
stats = token_presence_stats(texts, groups, token="concept_keyword_1", var_type="categorical", verbose=True)
|
|
272
|
+
print(stats["group_cov"]) # e.g. {"control": 0.45, "depression": 0.62}
|
|
258
273
|
```
|
|
259
274
|
|
|
260
275
|
### `coverage_by_lexicon(...)`
|
|
261
276
|
|
|
262
277
|
Summary for your chosen lexicon:
|
|
263
|
-
- `summary` : `docs_any`, `cov_all`, `q1
|
|
264
|
-
- `q1` / `q4`: coverage within the lowest/highest 𝑦 bins (
|
|
265
|
-
- `
|
|
278
|
+
- `summary` : `docs_any`, `cov_all`, `q1`, `q4`, `corr_any`, `hits_mean`, `hits_median`, `types_mean`, `types_median`
|
|
279
|
+
- `q1` / `q4`: coverage within the lowest/highest 𝑦 bins (continuous) or min/max group coverage (categorical)
|
|
280
|
+
- when `var_type='categorical'`, summary also includes `group_cov` (per-group coverage dict)
|
|
281
|
+
- `per_token_df`: per-token stats
|
|
266
282
|
|
|
267
283
|
```python
|
|
268
284
|
from ssdiff import coverage_by_lexicon
|
|
269
285
|
|
|
286
|
+
# Continuous
|
|
270
287
|
summary, per_tok = coverage_by_lexicon(
|
|
271
288
|
(texts, y),
|
|
272
289
|
lexicon={"concept_keyword_1", "concept_keyword_2", "concept_keyword_3", "concept_keyword_4"},
|
|
273
290
|
n_bins=4,
|
|
274
|
-
verbose=True
|
|
291
|
+
verbose=True,
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
# Categorical
|
|
295
|
+
summary, per_tok = coverage_by_lexicon(
|
|
296
|
+
(texts, groups),
|
|
297
|
+
lexicon={"concept_keyword_1", "concept_keyword_2"},
|
|
298
|
+
var_type="categorical",
|
|
299
|
+
verbose=True,
|
|
275
300
|
)
|
|
301
|
+
print(summary["group_cov"]) # e.g. {"control": 0.80, "depression": 0.75}
|
|
276
302
|
```
|
|
277
303
|
|
|
278
304
|
---
|
|
@@ -705,9 +731,10 @@ Returned by `SSDGroup.get_contrast()`. Duck-types with `SSD` for interpretation:
|
|
|
705
731
|
- `build_docs_from_preprocessed(pre_docs)` → list[list[str]] (lemmas for modeling)
|
|
706
732
|
|
|
707
733
|
### Lexicon
|
|
708
|
-
- `suggest_lexicon(df_or_tuple, text_col=None, score_col=None, top_k=150, min_docs=5, n_bins=4, corr_cap=0.30)` → DataFrame
|
|
709
|
-
- `token_presence_stats(
|
|
710
|
-
- `coverage_by_lexicon(df_or_tuple, lexicon, n_bins=4, verbose=False)` → `(summary, per_token_df)`
|
|
734
|
+
- `suggest_lexicon(df_or_tuple, text_col=None, score_col=None, top_k=150, min_docs=5, n_bins=4, corr_cap=0.30, var_type='continuous')` → DataFrame
|
|
735
|
+
- `token_presence_stats(texts, y, token, n_bins=4, corr_cap=0.30, verbose=False, var_type='continuous')` → dict
|
|
736
|
+
- `coverage_by_lexicon(df_or_tuple, text_col=None, score_col=None, lexicon=(), n_bins=4, verbose=False, var_type='continuous')` → `(summary, per_token_df)`
|
|
737
|
+
- `var_type`: `'continuous'` (numeric outcome, default) or `'categorical'` (group labels). When categorical, `corr` is Cramér's V, `cov_bal` is balanced across groups, and `q1`/`q4` are min/max group coverage.
|
|
711
738
|
|
|
712
739
|
---
|
|
713
740
|
## Citing & License
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "ssdiff"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.2.1"
|
|
8
8
|
description = "Supervised Semantic Differential (SSD): interpretable, embedding-based analysis of concept meaning in text."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -63,31 +63,69 @@ def _z(v: pd.Series | np.ndarray) -> np.ndarray:
|
|
|
63
63
|
mu = float(np.nanmean(arr))
|
|
64
64
|
return (arr - mu) / sd
|
|
65
65
|
|
|
66
|
+
def _validate_var_type(var_type: str) -> None:
|
|
67
|
+
if var_type not in ("continuous", "categorical"):
|
|
68
|
+
raise ValueError(
|
|
69
|
+
f"var_type must be 'continuous' or 'categorical', got {var_type!r}"
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
def _categorical_mask(y) -> np.ndarray:
|
|
73
|
+
"""Boolean mask: True for valid categorical entries (not None/NaN/empty)."""
|
|
74
|
+
arr = np.asarray(y, dtype=object)
|
|
75
|
+
return np.array([
|
|
76
|
+
g is not None and g != "" and (not isinstance(g, float) or np.isfinite(g))
|
|
77
|
+
for g in arr
|
|
78
|
+
], dtype=bool)
|
|
79
|
+
|
|
80
|
+
def _cramers_v(presence: np.ndarray, groups: np.ndarray) -> float:
|
|
81
|
+
"""Cramér's V between binary presence (0/1) and group labels."""
|
|
82
|
+
ct = pd.crosstab(presence, groups)
|
|
83
|
+
if ct.shape[0] < 2 or ct.shape[1] < 2:
|
|
84
|
+
return 0.0
|
|
85
|
+
n = ct.values.sum()
|
|
86
|
+
row_sums = ct.sum(axis=1).values
|
|
87
|
+
col_sums = ct.sum(axis=0).values
|
|
88
|
+
expected = np.outer(row_sums, col_sums) / n
|
|
89
|
+
chi2 = float(((ct.values - expected) ** 2 / expected).sum())
|
|
90
|
+
k = min(ct.shape) - 1
|
|
91
|
+
return float(np.sqrt(chi2 / (n * k))) if n * k > 0 else 0.0
|
|
92
|
+
|
|
66
93
|
def _rank_for_token_stats(
    presence_vec: np.ndarray,
    y: pd.Series | np.ndarray,
    n_bins: int = 4,
    corr_cap: float = 0.30,
    categorical: bool = False,
) -> tuple[float, float, float, float]:
    """
    Compute coverage, balanced coverage, association and rank for one token.

    presence_vec: 0/1 per document
    Returns: (cov_all, cov_bal, corr, rank)
    rank = balanced_coverage * (1 - min(1, |corr|/corr_cap))

    When categorical=True, bins are group labels and corr is Cramér's V.
    Otherwise bins come from ``_quantile_bins(y, n_bins)`` and corr is the
    Pearson correlation between presence and ``_z(y)``.
    """
    presence_vec = presence_vec.astype(float)
    cov_all = float(np.mean(presence_vec)) if len(presence_vec) else 0.0

    if categorical:
        groups = np.asarray(y, dtype=object)
        labels = set(groups)
        try:
            ordered = sorted(labels)
        except TypeError:
            # Mixed-type labels (e.g. ints and strings) are not orderable.
            # Order only affects the summation order of the mean below, so
            # fall back to a deterministic type-name/repr ordering.
            ordered = sorted(labels, key=lambda g: (type(g).__name__, repr(g)))

        # balanced coverage: mean coverage within each group
        cov_per_group = []
        for g in ordered:
            idx = np.where(groups == g)[0]
            cov_per_group.append(float(np.mean(presence_vec[idx])) if len(idx) else 0.0)
        cov_bal = float(np.mean(cov_per_group)) if cov_per_group else 0.0
        corr = _cramers_v(presence_vec.astype(int), groups)
    else:
        bins = _quantile_bins(y, n_bins=n_bins)
        # balanced coverage: mean coverage within each bin
        cov_per_bin = []
        for b in sorted(np.unique(bins)):
            idx = np.where(bins == b)[0]
            cov_per_bin.append(float(np.mean(presence_vec[idx])) if len(idx) else 0.0)
        cov_bal = float(np.mean(cov_per_bin)) if cov_per_bin else 0.0

        y_std = _z(y)
        # Guard zero variance on either side: np.corrcoef would otherwise
        # return NaN, which then leaks into the returned corr value.
        if np.std(presence_vec) > 0 and np.std(y_std) > 0:
            corr = float(np.corrcoef(presence_vec, y_std)[0, 1])
        else:
            corr = 0.0

    pen = min(1.0, abs(corr) / corr_cap)
    rank = cov_bal * (1.0 - pen)
    return cov_all, cov_bal, corr, rank
|
|
@@ -105,9 +143,11 @@ def suggest_lexicon(
|
|
|
105
143
|
min_docs: int = 5,
|
|
106
144
|
n_bins: int = 4,
|
|
107
145
|
corr_cap: float = 0.30,
|
|
146
|
+
var_type: str = "continuous",
|
|
108
147
|
) -> pd.DataFrame:
|
|
109
148
|
"""
|
|
110
|
-
Suggest candidate tokens ranked by coverage with a mild penalty for strong
|
|
149
|
+
Suggest candidate tokens ranked by coverage with a mild penalty for strong
|
|
150
|
+
association with y.
|
|
111
151
|
|
|
112
152
|
Parameters
|
|
113
153
|
----------
|
|
@@ -118,32 +158,54 @@ def suggest_lexicon(
|
|
|
118
158
|
text_col : str | None
|
|
119
159
|
Column name with preprocessed text (space-separated) if df provided.
|
|
120
160
|
score_col : str | None
|
|
121
|
-
Column name with
|
|
161
|
+
Column name with outcome variable if df provided (numeric for
|
|
162
|
+
continuous, any hashable for categorical).
|
|
163
|
+
var_type : str
|
|
164
|
+
``'continuous'`` (default) for numeric outcomes or ``'categorical'``
|
|
165
|
+
for group labels.
|
|
122
166
|
|
|
123
167
|
Returns
|
|
124
168
|
-------
|
|
125
169
|
DataFrame with columns: token, docs, cov_all, cov_bal, corr, rank (sorted desc).
|
|
170
|
+
When var_type='categorical', corr is Cramér's V and cov_bal is balanced
|
|
171
|
+
across group labels instead of quantile bins.
|
|
126
172
|
"""
|
|
173
|
+
_validate_var_type(var_type)
|
|
174
|
+
is_categorical = var_type == "categorical"
|
|
175
|
+
|
|
127
176
|
# Allow passing a tuple (texts, y) directly
|
|
128
177
|
if not isinstance(df_or_texts, pd.DataFrame):
|
|
129
178
|
if isinstance(df_or_texts, tuple) and len(df_or_texts) == 2:
|
|
130
179
|
texts, y = df_or_texts
|
|
131
180
|
texts = _texts_to_token_lists(texts)
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
181
|
+
if is_categorical:
|
|
182
|
+
y = np.asarray(y, dtype=object)
|
|
183
|
+
mask = _categorical_mask(y)
|
|
184
|
+
if not mask.all():
|
|
185
|
+
texts = [texts[i] for i in range(len(texts)) if mask[i]]
|
|
186
|
+
y = y[mask]
|
|
187
|
+
else:
|
|
188
|
+
y = _as_series_1d(y)
|
|
189
|
+
mask = ~y.isna()
|
|
190
|
+
if not mask.all():
|
|
191
|
+
texts = [texts[i] for i in range(len(texts)) if mask.iat[i]]
|
|
192
|
+
y = y[mask].reset_index(drop=True)
|
|
137
193
|
else:
|
|
138
194
|
raise ValueError("If not passing a DataFrame, pass (texts, y) as a tuple.")
|
|
139
195
|
else:
|
|
140
196
|
if not text_col or not score_col:
|
|
141
197
|
raise ValueError("Provide text_col and score_col when using a DataFrame.")
|
|
142
198
|
s = df_or_texts[text_col].fillna("").astype(str)
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
199
|
+
if is_categorical:
|
|
200
|
+
y = np.asarray(df_or_texts[score_col], dtype=object)
|
|
201
|
+
mask = _categorical_mask(y)
|
|
202
|
+
texts = _texts_to_token_lists(s[mask].tolist())
|
|
203
|
+
y = y[mask]
|
|
204
|
+
else:
|
|
205
|
+
y = _as_series_1d(df_or_texts[score_col])
|
|
206
|
+
mask = ~y.isna()
|
|
207
|
+
texts = _texts_to_token_lists(s[mask].tolist())
|
|
208
|
+
y = y[mask]
|
|
147
209
|
|
|
148
210
|
# Build doc-frequency counts
|
|
149
211
|
token_sets = _token_sets(texts)
|
|
@@ -156,10 +218,12 @@ def suggest_lexicon(
|
|
|
156
218
|
return pd.DataFrame(columns=["token", "docs", "cov_all", "cov_bal", "corr", "rank"])
|
|
157
219
|
|
|
158
220
|
rows = []
|
|
159
|
-
y_clean = y.reset_index(drop=True)
|
|
221
|
+
y_clean = y if is_categorical else y.reset_index(drop=True)
|
|
160
222
|
for t in vocab:
|
|
161
223
|
pres = np.fromiter((1 if t in ts else 0 for ts in token_sets), dtype=np.int8, count=len(token_sets))
|
|
162
|
-
cov_all, cov_bal, corr, rank = _rank_for_token_stats(
|
|
224
|
+
cov_all, cov_bal, corr, rank = _rank_for_token_stats(
|
|
225
|
+
pres, y_clean, n_bins=n_bins, corr_cap=corr_cap, categorical=is_categorical,
|
|
226
|
+
)
|
|
163
227
|
rows.append(dict(token=t, docs=int(pres.sum()), cov_all=cov_all, cov_bal=cov_bal, corr=corr, rank=rank))
|
|
164
228
|
|
|
165
229
|
out = pd.DataFrame(rows)
|
|
@@ -176,14 +240,26 @@ def token_presence_stats(
|
|
|
176
240
|
n_bins: int = 4,
|
|
177
241
|
corr_cap: float = 0.30,
|
|
178
242
|
verbose: bool = False,
|
|
243
|
+
var_type: str = "continuous",
|
|
179
244
|
) -> dict:
|
|
180
245
|
"""
|
|
181
246
|
Compute docs count, coverage, balanced coverage, correlation, and rank for a single token.
|
|
247
|
+
|
|
182
248
|
Accepts texts as:
|
|
183
249
|
- str → split() is used
|
|
184
250
|
- List[str] → treated as tokenized document
|
|
185
251
|
- List[List[str]] (or deeper) → treated as sentences of tokens (flattened)
|
|
252
|
+
|
|
253
|
+
Parameters
|
|
254
|
+
----------
|
|
255
|
+
var_type : str
|
|
256
|
+
``'continuous'`` (default) or ``'categorical'``. When categorical,
|
|
257
|
+
corr is Cramér's V, q1/q4 become min/max group coverage, and an
|
|
258
|
+
extra ``group_cov`` dict mapping each group label to its coverage
|
|
259
|
+
is included in the output.
|
|
186
260
|
"""
|
|
261
|
+
_validate_var_type(var_type)
|
|
262
|
+
is_categorical = var_type == "categorical"
|
|
187
263
|
|
|
188
264
|
def _doc_token_set(doc) -> set[str]:
|
|
189
265
|
# Fast, robust flattener to collect strings at any nesting depth.
|
|
@@ -210,48 +286,78 @@ def token_presence_stats(
|
|
|
210
286
|
|
|
211
287
|
# --- coerce inputs ---
|
|
212
288
|
token = str(token)
|
|
213
|
-
|
|
214
|
-
|
|
289
|
+
texts_list = list(texts)
|
|
290
|
+
|
|
291
|
+
if is_categorical:
|
|
292
|
+
y_arr = np.asarray(y, dtype=object)
|
|
293
|
+
mask = _categorical_mask(y_arr)
|
|
294
|
+
if not mask.all():
|
|
295
|
+
texts_list = [texts_list[i] for i in range(len(texts_list)) if mask[i]]
|
|
296
|
+
y_arr = y_arr[mask]
|
|
297
|
+
if len(texts_list) != len(y_arr):
|
|
298
|
+
raise ValueError(f"Length mismatch: texts={len(texts_list)} vs y={len(y_arr)}")
|
|
299
|
+
|
|
300
|
+
pres = np.fromiter(
|
|
301
|
+
(1 if token in _doc_token_set(doc) else 0 for doc in texts_list),
|
|
302
|
+
dtype=np.int8, count=len(texts_list),
|
|
303
|
+
)
|
|
304
|
+
|
|
305
|
+
cov_all, cov_bal, corr, rank = _rank_for_token_stats(
|
|
306
|
+
pres, y_arr, n_bins=n_bins, corr_cap=corr_cap, categorical=True,
|
|
307
|
+
)
|
|
308
|
+
|
|
309
|
+
# per-group coverage
|
|
310
|
+
group_labels = sorted(set(y_arr))
|
|
311
|
+
group_cov = {}
|
|
312
|
+
for g in group_labels:
|
|
313
|
+
idx = np.where(y_arr == g)[0]
|
|
314
|
+
group_cov[g] = float(pres[idx].mean()) if len(idx) else 0.0
|
|
315
|
+
q1 = min(group_cov.values()) if group_cov else 0.0
|
|
316
|
+
q4 = max(group_cov.values()) if group_cov else 0.0
|
|
317
|
+
|
|
318
|
+
out = dict(
|
|
319
|
+
token=token, docs=int(pres.sum()),
|
|
320
|
+
cov_all=float(cov_all), cov_bal=float(cov_bal),
|
|
321
|
+
corr=float(corr), rank=float(rank),
|
|
322
|
+
q1=q1, q4=q4,
|
|
323
|
+
group_cov=group_cov,
|
|
324
|
+
)
|
|
215
325
|
else:
|
|
216
|
-
|
|
326
|
+
if isinstance(y, np.ndarray):
|
|
327
|
+
y_series = pd.Series(y, dtype=float)
|
|
328
|
+
else:
|
|
329
|
+
y_series = y.copy()
|
|
217
330
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
y_series = y_series[mask].reset_index(drop=True)
|
|
331
|
+
mask = ~y_series.isna()
|
|
332
|
+
if not mask.all():
|
|
333
|
+
texts_list = [texts_list[i] for i in range(len(texts_list)) if mask.iat[i]]
|
|
334
|
+
y_series = y_series[mask].reset_index(drop=True)
|
|
223
335
|
|
|
224
|
-
|
|
225
|
-
|
|
336
|
+
if len(texts_list) != len(y_series):
|
|
337
|
+
raise ValueError(f"Length mismatch: texts={len(texts_list)} vs y={len(y_series)}")
|
|
226
338
|
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
)
|
|
339
|
+
pres = np.fromiter(
|
|
340
|
+
(1 if token in _doc_token_set(doc) else 0 for doc in texts_list),
|
|
341
|
+
dtype=np.int8, count=len(texts_list),
|
|
342
|
+
)
|
|
232
343
|
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
)
|
|
344
|
+
cov_all, cov_bal, corr, rank = _rank_for_token_stats(
|
|
345
|
+
pres, y_series, n_bins=n_bins, corr_cap=corr_cap, categorical=False,
|
|
346
|
+
)
|
|
237
347
|
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
rank=float(rank),
|
|
252
|
-
q1=q1,
|
|
253
|
-
q4=q4,
|
|
254
|
-
)
|
|
348
|
+
# quartiles (for interpretability)
|
|
349
|
+
bins = _quantile_bins(y_series, n_bins=n_bins)
|
|
350
|
+
low = np.where(bins == bins.min())[0]
|
|
351
|
+
high = np.where(bins == bins.max())[0]
|
|
352
|
+
q1 = float(pres[low].mean()) if len(low) else 0.0
|
|
353
|
+
q4 = float(pres[high].mean()) if len(high) else 0.0
|
|
354
|
+
|
|
355
|
+
out = dict(
|
|
356
|
+
token=token, docs=int(pres.sum()),
|
|
357
|
+
cov_all=float(cov_all), cov_bal=float(cov_bal),
|
|
358
|
+
corr=float(corr), rank=float(rank),
|
|
359
|
+
q1=q1, q4=q4,
|
|
360
|
+
)
|
|
255
361
|
|
|
256
362
|
if verbose:
|
|
257
363
|
print(
|
|
@@ -259,6 +365,9 @@ def token_presence_stats(
|
|
|
259
365
|
f"docs={out['docs']} | cov_all={out['cov_all']:.3f} | cov_bal={out['cov_bal']:.3f} | "
|
|
260
366
|
f"q1={out['q1']:.3f} | q4={out['q4']:.3f} | corr={out['corr']:.3f} | rank={out['rank']:.3f}"
|
|
261
367
|
)
|
|
368
|
+
if is_categorical and "group_cov" in out:
|
|
369
|
+
parts = " | ".join(f"{g}={v:.3f}" for g, v in out["group_cov"].items())
|
|
370
|
+
print(f" group_cov: {parts}")
|
|
262
371
|
|
|
263
372
|
return out
|
|
264
373
|
|
|
@@ -272,6 +381,7 @@ def coverage_by_lexicon(
|
|
|
272
381
|
*,
|
|
273
382
|
n_bins: int = 4,
|
|
274
383
|
verbose: bool = False,
|
|
384
|
+
var_type: str = "continuous",
|
|
275
385
|
) -> tuple[dict, pd.DataFrame]:
|
|
276
386
|
"""
|
|
277
387
|
Summarize coverage for a given lexicon.
|
|
@@ -283,24 +393,35 @@ def coverage_by_lexicon(
|
|
|
283
393
|
* profiles: List[List[str]] (multiple independent posts per unit)
|
|
284
394
|
- Tuple (texts, y), where texts is Sequence of the same forms above.
|
|
285
395
|
|
|
396
|
+
Parameters
|
|
397
|
+
----------
|
|
398
|
+
var_type : str
|
|
399
|
+
``'continuous'`` (default) or ``'categorical'``. When categorical,
|
|
400
|
+
q1/q4 become min/max group coverage, corr uses Cramér's V, and the
|
|
401
|
+
summary includes a ``group_cov`` dict.
|
|
402
|
+
|
|
286
403
|
Returns
|
|
287
404
|
-------
|
|
288
405
|
summary : dict(
|
|
289
406
|
docs_any, cov_all, q1, q4, corr_any,
|
|
290
407
|
hits_mean, hits_median, types_mean, types_median
|
|
408
|
+
[, group_cov] — only when var_type='categorical'
|
|
291
409
|
)
|
|
292
410
|
per_token_df : DataFrame(word, docs, cov_all, q1, q4, corr)
|
|
293
411
|
"""
|
|
294
412
|
import numpy as np
|
|
295
413
|
import pandas as pd
|
|
296
414
|
|
|
415
|
+
_validate_var_type(var_type)
|
|
416
|
+
is_categorical = var_type == "categorical"
|
|
417
|
+
|
|
297
418
|
# --- small internal adapters (robust to nested inputs) --------------------
|
|
298
|
-
def
|
|
419
|
+
def _local_as_series_1d(y_like) -> pd.Series:
|
|
299
420
|
if isinstance(y_like, pd.Series):
|
|
300
421
|
return y_like.reset_index(drop=True)
|
|
301
422
|
return pd.Series(y_like, dtype="float64")
|
|
302
423
|
|
|
303
|
-
def
|
|
424
|
+
def _local_z(s: pd.Series) -> np.ndarray:
|
|
304
425
|
s = s.astype(float)
|
|
305
426
|
mu = float(s.mean())
|
|
306
427
|
sd = float(s.std(ddof=0))
|
|
@@ -308,7 +429,7 @@ def coverage_by_lexicon(
|
|
|
308
429
|
return np.zeros(len(s), dtype=float)
|
|
309
430
|
return ((s - mu) / sd).to_numpy(dtype=float)
|
|
310
431
|
|
|
311
|
-
def
|
|
432
|
+
def _local_quantile_bins(y: pd.Series, n_bins: int = 4) -> np.ndarray:
|
|
312
433
|
q = pd.qcut(y.rank(method="average"), n_bins, labels=False, duplicates="drop")
|
|
313
434
|
return q.to_numpy(dtype=int)
|
|
314
435
|
|
|
@@ -339,49 +460,61 @@ def coverage_by_lexicon(
|
|
|
339
460
|
return out
|
|
340
461
|
return str(unit).split()
|
|
341
462
|
|
|
342
|
-
def
|
|
463
|
+
def _local_texts_to_token_lists(texts_like) -> list[list[str]]:
|
|
343
464
|
return [_to_unit_tokens(u) for u in texts_like]
|
|
344
465
|
|
|
345
|
-
def
|
|
466
|
+
def _local_token_sets(text_lists: list[list[str]]) -> list[set[str]]:
|
|
346
467
|
return [set(toks) if toks else set() for toks in text_lists]
|
|
347
468
|
|
|
348
469
|
# --- coerce inputs --------------------------------------------------------
|
|
349
470
|
if not isinstance(df_or_texts, pd.DataFrame):
|
|
350
471
|
if isinstance(df_or_texts, tuple) and len(df_or_texts) == 2:
|
|
351
472
|
texts, y = df_or_texts
|
|
352
|
-
texts =
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
473
|
+
texts = _local_texts_to_token_lists(texts)
|
|
474
|
+
if is_categorical:
|
|
475
|
+
y = np.asarray(y, dtype=object)
|
|
476
|
+
mask = _categorical_mask(y)
|
|
477
|
+
if not mask.all():
|
|
478
|
+
texts = [texts[i] for i in range(len(texts)) if mask[i]]
|
|
479
|
+
y = y[mask]
|
|
480
|
+
else:
|
|
481
|
+
y = _local_as_series_1d(y)
|
|
482
|
+
mask = ~y.isna()
|
|
483
|
+
if not mask.all():
|
|
484
|
+
texts = [texts[i] for i in range(len(texts)) if mask.iat[i]]
|
|
485
|
+
y = y[mask].reset_index(drop=True)
|
|
358
486
|
else:
|
|
359
487
|
raise ValueError("If not passing a DataFrame, pass (texts, y) as a tuple.")
|
|
360
488
|
else:
|
|
361
489
|
if not text_col or not score_col:
|
|
362
490
|
raise ValueError("Provide text_col and score_col when using a DataFrame.")
|
|
363
491
|
s = df_or_texts[text_col]
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
492
|
+
if is_categorical:
|
|
493
|
+
y = np.asarray(df_or_texts[score_col], dtype=object)
|
|
494
|
+
mask = _categorical_mask(y)
|
|
495
|
+
s = s[mask]
|
|
496
|
+
y = y[mask]
|
|
497
|
+
texts = _local_texts_to_token_lists(s.tolist())
|
|
498
|
+
else:
|
|
499
|
+
y = _local_as_series_1d(df_or_texts[score_col])
|
|
500
|
+
mask = ~y.isna()
|
|
501
|
+
s = s[mask]
|
|
502
|
+
y = y[mask].reset_index(drop=True)
|
|
503
|
+
texts = _local_texts_to_token_lists(s.tolist())
|
|
369
504
|
|
|
370
505
|
# guard: empty after filtering
|
|
371
506
|
if len(texts) == 0 or len(y) == 0:
|
|
372
507
|
summary = dict(
|
|
373
508
|
docs_any=0, cov_all=0.0, q1=0.0, q4=0.0, corr_any=0.0,
|
|
374
|
-
hits_mean=0.0, hits_median=0.0, types_mean=0.0, types_median=0.0
|
|
509
|
+
hits_mean=0.0, hits_median=0.0, types_mean=0.0, types_median=0.0,
|
|
375
510
|
)
|
|
511
|
+
if is_categorical:
|
|
512
|
+
summary["group_cov"] = {}
|
|
376
513
|
return summary, pd.DataFrame(columns=["word","docs","cov_all","q1","q4","corr"])
|
|
377
514
|
|
|
378
515
|
# --- prep features --------------------------------------------------------
|
|
379
|
-
bins = _quantile_bins(y, n_bins=n_bins)
|
|
380
|
-
low_idx = np.where(bins == bins.min())[0]
|
|
381
|
-
high_idx = np.where(bins == bins.max())[0]
|
|
382
|
-
|
|
383
516
|
lex = [str(w) for w in lexicon]
|
|
384
|
-
token_sets =
|
|
517
|
+
token_sets = _local_token_sets(texts)
|
|
385
518
|
|
|
386
519
|
# presence of ANY lexicon word per unit
|
|
387
520
|
pres_any = np.fromiter(
|
|
@@ -389,19 +522,73 @@ def coverage_by_lexicon(
|
|
|
389
522
|
dtype=np.int8,
|
|
390
523
|
count=len(token_sets),
|
|
391
524
|
)
|
|
392
|
-
y_std = _z(y)
|
|
393
|
-
corr_any = float(np.corrcoef(pres_any, y_std)[0, 1]) if pres_any.std() > 0 else 0.0
|
|
394
525
|
|
|
395
526
|
overall = float(pres_any.mean()) if len(pres_any) else 0.0
|
|
396
|
-
q1 = float(pres_any[low_idx].mean()) if len(low_idx) else 0.0
|
|
397
|
-
q4 = float(pres_any[high_idx].mean()) if len(high_idx) else 0.0
|
|
398
527
|
docs_any = int(pres_any.sum())
|
|
399
528
|
|
|
400
|
-
|
|
529
|
+
if is_categorical:
|
|
530
|
+
groups = y # already np.ndarray of object dtype
|
|
531
|
+
group_labels = sorted(set(groups))
|
|
532
|
+
|
|
533
|
+
# q1/q4 → min/max group coverage for the any-presence vector
|
|
534
|
+
group_cov_any = {}
|
|
535
|
+
for g in group_labels:
|
|
536
|
+
idx = np.where(groups == g)[0]
|
|
537
|
+
group_cov_any[g] = float(pres_any[idx].mean()) if len(idx) else 0.0
|
|
538
|
+
q1 = min(group_cov_any.values()) if group_cov_any else 0.0
|
|
539
|
+
q4 = max(group_cov_any.values()) if group_cov_any else 0.0
|
|
540
|
+
|
|
541
|
+
corr_any = _cramers_v(pres_any.astype(int), groups)
|
|
542
|
+
|
|
543
|
+
# per-token stats
|
|
544
|
+
rows = []
|
|
545
|
+
for w in lex:
|
|
546
|
+
pres = np.fromiter(((1 if w in ts else 0) for ts in token_sets),
|
|
547
|
+
dtype=np.int8, count=len(token_sets))
|
|
548
|
+
corr = _cramers_v(pres.astype(int), groups)
|
|
549
|
+
gc = {}
|
|
550
|
+
for g in group_labels:
|
|
551
|
+
idx = np.where(groups == g)[0]
|
|
552
|
+
gc[g] = float(pres[idx].mean()) if len(idx) else 0.0
|
|
553
|
+
rows.append(dict(
|
|
554
|
+
word=w,
|
|
555
|
+
docs=int(pres.sum()),
|
|
556
|
+
cov_all=float(pres.mean()) if len(pres) else 0.0,
|
|
557
|
+
q1=min(gc.values()) if gc else 0.0,
|
|
558
|
+
q4=max(gc.values()) if gc else 0.0,
|
|
559
|
+
corr=corr,
|
|
560
|
+
))
|
|
561
|
+
else:
|
|
562
|
+
bins = _local_quantile_bins(y, n_bins=n_bins)
|
|
563
|
+
low_idx = np.where(bins == bins.min())[0]
|
|
564
|
+
high_idx = np.where(bins == bins.max())[0]
|
|
565
|
+
|
|
566
|
+
y_std = _local_z(y)
|
|
567
|
+
corr_any = float(np.corrcoef(pres_any, y_std)[0, 1]) if pres_any.std() > 0 else 0.0
|
|
568
|
+
|
|
569
|
+
q1 = float(pres_any[low_idx].mean()) if len(low_idx) else 0.0
|
|
570
|
+
q4 = float(pres_any[high_idx].mean()) if len(high_idx) else 0.0
|
|
571
|
+
|
|
572
|
+
# per-token stats
|
|
573
|
+
rows = []
|
|
574
|
+
for w in lex:
|
|
575
|
+
pres = np.fromiter(((1 if w in ts else 0) for ts in token_sets),
|
|
576
|
+
dtype=np.int8, count=len(token_sets))
|
|
577
|
+
corr = float(np.corrcoef(pres, y_std)[0, 1]) if pres.std() > 0 else 0.0
|
|
578
|
+
rows.append(dict(
|
|
579
|
+
word=w,
|
|
580
|
+
docs=int(pres.sum()),
|
|
581
|
+
cov_all=float(pres.mean()) if len(pres) else 0.0,
|
|
582
|
+
q1=float(pres[low_idx].mean()) if len(low_idx) else 0.0,
|
|
583
|
+
q4=float(pres[high_idx].mean()) if len(high_idx) else 0.0,
|
|
584
|
+
corr=corr,
|
|
585
|
+
))
|
|
586
|
+
|
|
587
|
+
per_token = pd.DataFrame(rows, columns=["word", "docs", "cov_all", "q1", "q4", "corr"])
|
|
588
|
+
|
|
589
|
+
# --- whole-profile lexicon frequency stats (DV-agnostic) ------------------
|
|
401
590
|
lex_set = set(lex)
|
|
402
|
-
# total occurrences of any lexicon token in each profile/unit
|
|
403
591
|
hits_per_unit = np.array([sum(1 for t in toks if t in lex_set) for toks in texts], dtype=np.int32)
|
|
404
|
-
# number of unique lexicon types present in each unit
|
|
405
592
|
types_per_unit = np.array([len(set(toks) & lex_set) for toks in texts], dtype=np.int32)
|
|
406
593
|
|
|
407
594
|
hits_mean = float(hits_per_unit.mean()) if len(hits_per_unit) else 0.0
|
|
@@ -409,22 +596,6 @@ def coverage_by_lexicon(
|
|
|
409
596
|
types_mean = float(types_per_unit.mean()) if len(types_per_unit) else 0.0
|
|
410
597
|
types_median = float(np.median(types_per_unit)) if len(types_per_unit) else 0.0
|
|
411
598
|
|
|
412
|
-
# per-token stats (vectorized presence via set membership)
|
|
413
|
-
rows = []
|
|
414
|
-
for w in lex:
|
|
415
|
-
pres = np.fromiter(((1 if w in ts else 0) for ts in token_sets),
|
|
416
|
-
dtype=np.int8, count=len(token_sets))
|
|
417
|
-
corr = float(np.corrcoef(pres, y_std)[0, 1]) if pres.std() > 0 else 0.0
|
|
418
|
-
rows.append(dict(
|
|
419
|
-
word=w,
|
|
420
|
-
docs=int(pres.sum()),
|
|
421
|
-
cov_all=float(pres.mean()) if len(pres) else 0.0,
|
|
422
|
-
q1=float(pres[low_idx].mean()) if len(low_idx) else 0.0,
|
|
423
|
-
q4=float(pres[high_idx].mean()) if len(high_idx) else 0.0,
|
|
424
|
-
corr=corr,
|
|
425
|
-
))
|
|
426
|
-
per_token = pd.DataFrame(rows, columns=["word", "docs", "cov_all", "q1", "q4", "corr"])
|
|
427
|
-
|
|
428
599
|
summary = dict(
|
|
429
600
|
docs_any=docs_any,
|
|
430
601
|
cov_all=overall,
|
|
@@ -436,6 +607,8 @@ def coverage_by_lexicon(
|
|
|
436
607
|
types_mean=types_mean,
|
|
437
608
|
types_median=types_median,
|
|
438
609
|
)
|
|
610
|
+
if is_categorical:
|
|
611
|
+
summary["group_cov"] = group_cov_any
|
|
439
612
|
|
|
440
613
|
per_token = per_token.sort_values(
|
|
441
614
|
["cov_all", "docs"], ascending=[False, False]
|
|
@@ -448,6 +621,9 @@ def coverage_by_lexicon(
|
|
|
448
621
|
f"docs_any={docs_any} | cov_all={overall:.3f} | "
|
|
449
622
|
f"q1={q1:.3f} | q4={q4:.3f} | corr_any={corr_any:.3f}"
|
|
450
623
|
)
|
|
624
|
+
if is_categorical:
|
|
625
|
+
parts = " | ".join(f"{g}={v:.3f}" for g, v in group_cov_any.items())
|
|
626
|
+
print(f" group_cov: {parts}")
|
|
451
627
|
print(
|
|
452
628
|
f" hits_mean={hits_mean:.2f} | hits_median={hits_median:.2f} | "
|
|
453
629
|
f"types_mean={types_mean:.2f} | types_median={types_median:.2f}"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ssdiff
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Supervised Semantic Differential (SSD): interpretable, embedding-based analysis of concept meaning in text.
|
|
5
5
|
Author-email: Hubert Plisiecki <hplisiecki@gmail.com>, Paweł Lenartowicz <pawellenartowicz@europe.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -271,49 +271,75 @@ These helpers make lexicon selection transparent and data-driven (you can also h
|
|
|
271
271
|
|
|
272
272
|
### `suggest_lexicon(...)`
|
|
273
273
|
|
|
274
|
-
Rank tokens by balanced coverage with a mild penalty for strong
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
274
|
+
Rank tokens by balanced coverage with a mild penalty for strong association with the outcome.
|
|
275
|
+
|
|
276
|
+
All three lexicon utilities accept `var_type='continuous'` (default) or `var_type='categorical'`:
|
|
277
|
+
|
|
278
|
+
| | `var_type='continuous'` | `var_type='categorical'` |
|
|
279
|
+
|---|---|---|
|
|
280
|
+
| `cov_bal` | average presence across 𝑛 quantile bins of 𝑦 | average presence across group labels |
|
|
281
|
+
| `corr` | Pearson correlation between 0/1 presence and standardized 𝑦 | Cramér's V between 0/1 presence and group label |
|
|
282
|
+
| `q1` / `q4` | coverage in lowest / highest 𝑦 quantile bin | min / max group coverage |
|
|
283
|
+
| `rank` | `cov_bal * (1 - min(1, \|corr\|/corr_cap))` | same formula (Cramér's V replaces Pearson) |
|
|
279
284
|
|
|
280
285
|
Accepts a DataFrame (`text_col`, `score_col`) or a `(texts, y)` tuple where texts can be raw strings or token lists.
|
|
281
286
|
|
|
282
287
|
```python
|
|
283
288
|
from ssdiff import suggest_lexicon
|
|
284
289
|
|
|
285
|
-
#
|
|
290
|
+
# Continuous outcome (default)
|
|
286
291
|
cands_df = suggest_lexicon(df, text_col="lemmatized", score_col="questionnaire_result", top_k=150)
|
|
287
292
|
|
|
288
293
|
# Or using a tuple (texts, y)
|
|
289
294
|
texts = [" ".join(doc) for doc in docs]
|
|
290
295
|
cands_df2 = suggest_lexicon((docs, y), top_k=150)
|
|
296
|
+
|
|
297
|
+
# Categorical groups
|
|
298
|
+
cands_cat = suggest_lexicon(df, text_col="lemmatized", score_col="diagnosis", top_k=150, var_type="categorical")
|
|
299
|
+
cands_cat2 = suggest_lexicon((docs, groups), top_k=150, var_type="categorical")
|
|
291
300
|
```
|
|
292
301
|
### `token_presence_stats(...)`
|
|
293
302
|
|
|
294
|
-
Per-token coverage &
|
|
303
|
+
Per-token coverage & association diagnostics:
|
|
295
304
|
```python
|
|
296
305
|
from ssdiff import token_presence_stats
|
|
297
|
-
|
|
298
|
-
|
|
306
|
+
|
|
307
|
+
# Continuous
|
|
308
|
+
stats = token_presence_stats(texts, y, token="concept_keyword_1", n_bins=4, verbose=True)
|
|
309
|
+
print(stats) # dict: token, docs, cov_all, cov_bal, corr, rank, q1, q4
|
|
310
|
+
|
|
311
|
+
# Categorical — output also includes group_cov (per-group coverage dict)
|
|
312
|
+
stats = token_presence_stats(texts, groups, token="concept_keyword_1", var_type="categorical", verbose=True)
|
|
313
|
+
print(stats["group_cov"]) # e.g. {"control": 0.45, "depression": 0.62}
|
|
299
314
|
```
|
|
300
315
|
|
|
301
316
|
### `coverage_by_lexicon(...)`
|
|
302
317
|
|
|
303
318
|
Summary for your chosen lexicon:
|
|
304
|
-
- `summary` : `docs_any`, `cov_all`, `q1
|
|
305
|
-
- `q1` / `q4`: coverage within the lowest/highest 𝑦 bins (
|
|
306
|
-
- `
|
|
319
|
+
- `summary` : `docs_any`, `cov_all`, `q1`, `q4`, `corr_any`, `hits_mean`, `hits_median`, `types_mean`, `types_median`
|
|
320
|
+
- `q1` / `q4`: coverage within the lowest/highest 𝑦 bins (continuous) or min/max group coverage (categorical)
|
|
321
|
+
- when `var_type='categorical'`, summary also includes `group_cov` (per-group coverage dict)
|
|
322
|
+
- `per_token_df`: per-token stats
|
|
307
323
|
|
|
308
324
|
```python
|
|
309
325
|
from ssdiff import coverage_by_lexicon
|
|
310
326
|
|
|
327
|
+
# Continuous
|
|
311
328
|
summary, per_tok = coverage_by_lexicon(
|
|
312
329
|
(texts, y),
|
|
313
330
|
lexicon={"concept_keyword_1", "concept_keyword_2", "concept_keyword_3", "concept_keyword_4"},
|
|
314
331
|
n_bins=4,
|
|
315
|
-
verbose=True
|
|
332
|
+
verbose=True,
|
|
333
|
+
)
|
|
334
|
+
|
|
335
|
+
# Categorical
|
|
336
|
+
summary, per_tok = coverage_by_lexicon(
|
|
337
|
+
(texts, groups),
|
|
338
|
+
lexicon={"concept_keyword_1", "concept_keyword_2"},
|
|
339
|
+
var_type="categorical",
|
|
340
|
+
verbose=True,
|
|
316
341
|
)
|
|
342
|
+
print(summary["group_cov"]) # e.g. {"control": 0.80, "depression": 0.75}
|
|
317
343
|
```
|
|
318
344
|
|
|
319
345
|
---
|
|
@@ -746,9 +772,10 @@ Returned by `SSDGroup.get_contrast()`. Duck-types with `SSD` for interpretation:
|
|
|
746
772
|
- `build_docs_from_preprocessed(pre_docs)` → list[list[str]] (lemmas for modeling)
|
|
747
773
|
|
|
748
774
|
### Lexicon
|
|
749
|
-
- `suggest_lexicon(df_or_tuple, text_col=None, score_col=None, top_k=150, min_docs=5, n_bins=4, corr_cap=0.30)` → DataFrame
|
|
750
|
-
- `token_presence_stats(
|
|
751
|
-
- `coverage_by_lexicon(df_or_tuple, lexicon, n_bins=4, verbose=False)` → `(summary, per_token_df)`
|
|
775
|
+
- `suggest_lexicon(df_or_tuple, text_col=None, score_col=None, top_k=150, min_docs=5, n_bins=4, corr_cap=0.30, var_type='continuous')` → DataFrame
|
|
776
|
+
- `token_presence_stats(texts, y, token, n_bins=4, corr_cap=0.30, verbose=False, var_type='continuous')` → dict
|
|
777
|
+
- `coverage_by_lexicon(df_or_tuple, text_col=None, score_col=None, lexicon=(), n_bins=4, verbose=False, var_type='continuous')` → `(summary, per_token_df)`
|
|
778
|
+
- `var_type`: `'continuous'` (numeric outcome, default) or `'categorical'` (group labels). When categorical, `corr` is Cramér's V, `cov_bal` is balanced across groups, and `q1`/`q4` are min/max group coverage.
|
|
752
779
|
|
|
753
780
|
---
|
|
754
781
|
## Citing & License
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|