ssdiff 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ssdiff/__init__.py +20 -0
- ssdiff/clusters.py +101 -0
- ssdiff/core.py +608 -0
- ssdiff/lexicon.py +320 -0
- ssdiff/preprocess.py +175 -0
- ssdiff/snippets.py +270 -0
- ssdiff/utils.py +157 -0
- ssdiff-0.1.0.dist-info/METADATA +500 -0
- ssdiff-0.1.0.dist-info/RECORD +12 -0
- ssdiff-0.1.0.dist-info/WHEEL +5 -0
- ssdiff-0.1.0.dist-info/licenses/LICENSE +21 -0
- ssdiff-0.1.0.dist-info/top_level.txt +1 -0
ssdiff/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# ssdiff/__init__.py
"""Public API for the ssdiff package.

Re-exports the main entry points from the submodules so callers can do
``from ssdiff import SSD, cluster_top_neighbors, ...``.
"""
from .core import SSD
from .clusters import cluster_top_neighbors
from .utils import (
    load_embeddings, normalize_kv, compute_global_sif,
    build_doc_vectors, filtered_neighbors
)
from .lexicon import suggest_lexicon, coverage_by_lexicon, token_presence_stats
from .preprocess import load_spacy, load_stopwords, preprocess_texts, build_docs_from_preprocessed


# NOTE: duplicates removed (suggest_lexicon / coverage_by_lexicon /
# token_presence_stats were listed twice) and quote style unified.
__all__ = [
    "SSD",
    "cluster_top_neighbors",
    "load_embeddings", "normalize_kv", "compute_global_sif",
    "build_doc_vectors", "filtered_neighbors", "build_docs_from_preprocessed",
    "suggest_lexicon", "coverage_by_lexicon", "token_presence_stats",
    "load_spacy", "load_stopwords", "preprocess_texts",
]
|
ssdiff/clusters.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# ssdiff/clusters.py
"""Clustering of the top vocabulary neighbors of the SSD direction vector."""
from __future__ import annotations

# All imports grouped at the top (stdlib / third-party / local); `import
# warnings` was previously buried after the TYPE_CHECKING block.
import warnings
from typing import TYPE_CHECKING

import numpy as np

from .utils import filtered_neighbors

if TYPE_CHECKING:
    # Only used for hints; not imported at runtime → no circular import
    from .core import SSD

# Silence a known, benign sklearn UserWarning emitted on Windows with MKL.
warnings.filterwarnings(
    "ignore",
    message=r"KMeans is known to have a memory leak on Windows with MKL.*",
    category=UserWarning,
    module=r"sklearn\.cluster\._kmeans"
)
|
|
17
|
+
|
|
18
|
+
def _require_sklearn():
|
|
19
|
+
try:
|
|
20
|
+
import sklearn # noqa
|
|
21
|
+
from sklearn.cluster import KMeans # noqa
|
|
22
|
+
from sklearn.metrics import silhouette_score # noqa
|
|
23
|
+
except Exception:
|
|
24
|
+
raise ImportError("scikit-learn is required for clustering. Install: pip install scikit-learn")
|
|
25
|
+
|
|
26
|
+
def cluster_top_neighbors(
    ssd: SSD,
    *,
    topn: int = 100,
    k: int | None = None,
    k_min: int = 2,
    k_max: int = 10,
    restrict_vocab: int = 50000,
    random_state: int = 13,
    min_cluster_size: int = 2,
    side: str = "pos",  # "pos" → +β̂, "neg" → −β̂
):
    """Cluster the top-``topn`` vocabulary neighbors of the SSD direction.

    The nearest neighbors of +β̂ (``side="pos"``) or −β̂ (``side="neg"``)
    are embedded as unit word vectors and grouped with KMeans. If ``k``
    is None, the cluster count is chosen by silhouette score over
    ``[k_min, k_max]``.

    Args:
        ssd: fitted SSD instance; provides ``.kv``, ``.beta``,
            ``.beta_unit`` and ``.use_unit_beta``.
        topn: number of nearest neighbors to cluster.
        k: fixed number of clusters; None → choose automatically.
        k_min: lower bound of the automatic k search.
        k_max: upper bound of the automatic k search.
        restrict_vocab: limit the neighbor search to this many
            most-frequent vocabulary entries.
        random_state: seed forwarded to KMeans for reproducibility.
        min_cluster_size: clusters smaller than this are dropped.
        side: "pos" clusters neighbors of +β̂, "neg" of −β̂.

    Returns:
        List of dicts sorted by ``centroid_cos_beta`` (descending), each
        with keys ``id``, ``size``, ``centroid_cos_beta``, ``coherence``
        and ``words`` — (word, cos-to-centroid, cos-to-β̂) triples sorted
        by cos-to-centroid.

    Raises:
        ValueError: if ``side`` is invalid or too few neighbors exist.
        ImportError: if scikit-learn is not installed.
    """
    _require_sklearn()
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    # Previously any value other than "pos" silently meant "neg"; a typo
    # like "positive" would flip the direction. Fail loudly instead.
    if side not in ("pos", "neg"):
        raise ValueError(f"side must be 'pos' or 'neg', got {side!r}")

    b = ssd.beta_unit if ssd.use_unit_beta else ssd.beta
    vec = b if side == "pos" else -b

    pairs = filtered_neighbors(ssd.kv, vec, topn=topn, restrict=restrict_vocab)
    words = [w for (w, _s) in pairs]
    if len(words) < max(2, k_min):
        raise ValueError("Not enough neighbors to cluster.")

    # Unit rows: cosine similarity reduces to a plain dot product below.
    W = np.vstack([ssd.kv.get_vector(w, norm=True).astype(np.float64) for w in words])

    def choose_k_auto(W, kmin, kmax):
        # Pick the k with the best silhouette score, skipping degenerate
        # labelings (single cluster, or every cluster a singleton).
        best_k, best_s = None, -1.0
        upper = min(kmax, max(kmin, W.shape[0] - 1))
        for kk in range(max(2, kmin), max(2, upper) + 1):
            km = KMeans(n_clusters=kk, random_state=random_state, n_init="auto")
            labels = km.fit_predict(W)
            if len(set(labels)) <= 1 or np.max(np.bincount(labels)) <= 1:
                continue
            s = silhouette_score(W, labels)
            if s > best_s:
                best_s, best_k = s, kk
        return best_k if best_k is not None else max(2, kmin)

    k_use = int(k) if k is not None else choose_k_auto(W, k_min, k_max)
    km = KMeans(n_clusters=k_use, random_state=random_state, n_init="auto")
    labels = km.fit_predict(W)

    # Cosines are always reported against the unit-normalized β,
    # regardless of whether β̂ or raw β was used for the neighbor query.
    bu = b / max(float(np.linalg.norm(b)), 1e-12)
    clusters = []
    for cid in sorted(set(labels)):
        idx = np.where(labels == cid)[0]
        if len(idx) < min_cluster_size:
            continue
        Wc = W[idx]
        centroid = Wc.mean(axis=0)
        centroid /= max(float(np.linalg.norm(centroid)), 1e-12)
        cos_beta = float(centroid @ bu)
        cos_to_centroid = (Wc @ centroid).astype(float)
        coherence = float(np.mean(cos_to_centroid))

        rows = []
        for j in idx:
            w = words[j]
            ccent = float(W[j] @ centroid)
            cbeta = float(W[j] @ bu)
            rows.append((w, ccent, cbeta))
        # Most central words first within each cluster.
        rows.sort(key=lambda t: t[1], reverse=True)

        clusters.append({
            "id": int(cid),
            "size": int(len(idx)),
            "centroid_cos_beta": cos_beta,
            "coherence": coherence,
            "words": rows,
        })

    # Clusters most aligned with the β direction first.
    clusters.sort(key=lambda C: C["centroid_cos_beta"], reverse=True)

    return clusters