ssdiff 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ssdiff/__init__.py ADDED
@@ -0,0 +1,20 @@
1
# ssdiff/__init__.py
from .core import SSD
from .clusters import cluster_top_neighbors
from .utils import (
    load_embeddings, normalize_kv, compute_global_sif,
    build_doc_vectors, filtered_neighbors
)
from .lexicon import suggest_lexicon, coverage_by_lexicon, token_presence_stats
from .preprocess import load_spacy, load_stopwords, preprocess_texts, build_docs_from_preprocessed


# Public API: exactly one entry per name imported above (the earlier listing
# repeated the lexicon helpers and mixed quote styles).
__all__ = [
    "SSD",
    "cluster_top_neighbors",
    "load_embeddings", "normalize_kv", "compute_global_sif",
    "build_doc_vectors", "filtered_neighbors", "build_docs_from_preprocessed",
    "suggest_lexicon", "coverage_by_lexicon", "token_presence_stats",
    "load_spacy", "load_stopwords", "preprocess_texts",
]
ssdiff/clusters.py ADDED
@@ -0,0 +1,101 @@
1
# ssdiff/clusters.py
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import numpy as np

from .utils import filtered_neighbors

if TYPE_CHECKING:
    # Only used for hints; not imported at runtime → no circular import
    from .core import SSD

# Silence a known, benign sklearn warning (Windows + MKL memory-leak notice).
# Registered at module scope — NOT under TYPE_CHECKING — so the filter is
# actually installed before any KMeans call in this module runs.
warnings.filterwarnings(
    "ignore",
    message=r"KMeans is known to have a memory leak on Windows with MKL.*",
    category=UserWarning,
    module=r"sklearn\.cluster\._kmeans",
)
17
+
18
+ def _require_sklearn():
19
+ try:
20
+ import sklearn # noqa
21
+ from sklearn.cluster import KMeans # noqa
22
+ from sklearn.metrics import silhouette_score # noqa
23
+ except Exception:
24
+ raise ImportError("scikit-learn is required for clustering. Install: pip install scikit-learn")
25
+
26
def cluster_top_neighbors(
    ssd: SSD,
    *,
    topn: int = 100,
    k: int | None = None,
    k_min: int = 2,
    k_max: int = 10,
    restrict_vocab: int = 50000,
    random_state: int = 13,
    min_cluster_size: int = 2,
    side: str = "pos",  # "pos" → +β̂, "neg" → −β̂
):
    """Cluster the nearest embedding neighbors of the fitted direction β̂.

    Args:
        ssd: fitted model exposing ``kv`` (keyed vectors), ``beta``,
            ``beta_unit`` and ``use_unit_beta``.
        topn: number of nearest neighbors of ±β̂ to cluster.
        k: fixed cluster count; when None, k is chosen automatically by
            silhouette score over [k_min, k_max].
        restrict_vocab: neighbor search is limited to this many most
            frequent vocabulary entries.
        random_state: seed forwarded to KMeans for reproducibility.
        min_cluster_size: clusters smaller than this are dropped.
        side: "pos" clusters neighbors of +β̂, "neg" of −β̂.

    Returns:
        List of dicts sorted by descending ``centroid_cos_beta``, each with
        keys ``id``, ``size``, ``centroid_cos_beta``, ``coherence`` and
        ``words`` — (word, cos_to_centroid, cos_to_beta) triples sorted by
        similarity to the cluster centroid.

    Raises:
        ValueError: invalid ``side`` or too few neighbors to cluster.
        ImportError: scikit-learn is not installed.
    """
    _require_sklearn()
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    if side not in ("pos", "neg"):
        # Previously any unrecognized value was silently treated as "neg";
        # reject it explicitly instead.
        raise ValueError(f"side must be 'pos' or 'neg', got {side!r}")

    b = ssd.beta_unit if ssd.use_unit_beta else ssd.beta
    vec = b if side == "pos" else -b

    pairs = filtered_neighbors(ssd.kv, vec, topn=topn, restrict=restrict_vocab)
    words = [w for (w, _s) in pairs]
    if len(words) < max(2, k_min):
        raise ValueError("Not enough neighbors to cluster.")

    # One unit-norm row per neighbor word.
    W = np.vstack([ssd.kv.get_vector(w, norm=True).astype(np.float64) for w in words])

    def choose_k_auto(W, kmin, kmax):
        # Pick the k maximizing silhouette score; skip degenerate labelings
        # (a single cluster, or all singletons) which silhouette rejects.
        best_k, best_s = None, -1.0
        upper = min(kmax, max(kmin, W.shape[0] - 1))  # silhouette needs k < n samples
        for kk in range(max(2, kmin), max(2, upper) + 1):
            km = KMeans(n_clusters=kk, random_state=random_state, n_init="auto")
            labels = km.fit_predict(W)
            if len(set(labels)) <= 1 or np.max(np.bincount(labels)) <= 1:
                continue
            s = silhouette_score(W, labels)
            if s > best_s:
                best_s, best_k = s, kk
        return best_k if best_k is not None else max(2, kmin)

    k_use = int(k) if k is not None else choose_k_auto(W, k_min, k_max)
    km = KMeans(n_clusters=k_use, random_state=random_state, n_init="auto")
    labels = km.fit_predict(W)

    # All reported cosines are taken against the unit-normalized β so they
    # are comparable regardless of ssd.use_unit_beta; the 1e-12 floor
    # guards against division by a zero-norm vector.
    bu = b / max(float(np.linalg.norm(b)), 1e-12)
    clusters = []
    for cid in sorted(set(labels)):
        idx = np.where(labels == cid)[0]
        if len(idx) < min_cluster_size:
            continue
        Wc = W[idx]
        centroid = Wc.mean(axis=0)
        centroid /= max(float(np.linalg.norm(centroid)), 1e-12)
        cos_beta = float(centroid @ bu)
        cos_to_centroid = (Wc @ centroid).astype(float)
        coherence = float(np.mean(cos_to_centroid))  # mean member↔centroid cosine

        rows = []
        for j in idx:
            w = words[j]
            ccent = float(W[j] @ centroid)
            cbeta = float(W[j] @ bu)
            rows.append((w, ccent, cbeta))
        rows.sort(key=lambda t: t[1], reverse=True)

        clusters.append({
            "id": int(cid),
            "size": int(len(idx)),
            "centroid_cos_beta": cos_beta,
            "coherence": coherence,
            "words": rows,
        })

    clusters.sort(key=lambda C: C["centroid_cos_beta"], reverse=True)

    return clusters
101
+