spell_exploder-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spell_exploder/__init__.py +205 -0
- spell_exploder/_version.py +1 -0
- spell_exploder/analyzers/__init__.py +18 -0
- spell_exploder/analyzers/adaptive_evolution.py +453 -0
- spell_exploder/analyzers/complexity_index.py +224 -0
- spell_exploder/analyzers/keyword_erp.py +477 -0
- spell_exploder/analyzers/valence_model.py +523 -0
- spell_exploder/core/__init__.py +45 -0
- spell_exploder/core/compression.py +103 -0
- spell_exploder/core/entropy.py +203 -0
- spell_exploder/core/information.py +179 -0
- spell_exploder/core/nlp.py +107 -0
- spell_exploder/exceptions.py +25 -0
- spell_exploder/extractors/__init__.py +35 -0
- spell_exploder/extractors/action_frames.py +133 -0
- spell_exploder/extractors/noun_dependencies.py +96 -0
- spell_exploder/extractors/sentence_parser.py +168 -0
- spell_exploder/graphs/__init__.py +0 -0
- spell_exploder/io/__init__.py +14 -0
- spell_exploder/io/exporters.py +94 -0
- spell_exploder/io/readers.py +117 -0
- spell_exploder/results/__init__.py +44 -0
- spell_exploder/results/complexity.py +111 -0
- spell_exploder/results/evolution.py +136 -0
- spell_exploder/results/keyword.py +139 -0
- spell_exploder/results/valence.py +134 -0
- spell_exploder/utils/__init__.py +11 -0
- spell_exploder/utils/imports.py +48 -0
- spell_exploder/utils/smoothing.py +42 -0
- spell_exploder/utils/statistics.py +54 -0
- spell_exploder/visualization/__init__.py +27 -0
- spell_exploder/visualization/plots.py +562 -0
- spell_exploder-0.1.0.dist-info/METADATA +221 -0
- spell_exploder-0.1.0.dist-info/RECORD +37 -0
- spell_exploder-0.1.0.dist-info/WHEEL +5 -0
- spell_exploder-0.1.0.dist-info/licenses/LICENSE +21 -0
- spell_exploder-0.1.0.dist-info/top_level.txt +1 -0
spell_exploder/analyzers/valence_model.py
@@ -0,0 +1,523 @@
+"""
+Language Complexity Valence Model (LCVM) Analyzer.
+
+The most comprehensive Spellcaster analyzer, computing ~30 metrics per
+document across five analytical dimensions:
+
+1. **Variation** — Shannon entropy of token distributions.
+2. **Redundancy** — Multiscale entropy-collapse curves measuring local
+   repetition at different window sizes.
+3. **Organisation** — Mutual information between verbs and their
+   subjects/objects, capturing how tightly the action repertoire is
+   coupled.
+4. **Repertoire** — Action-frame density and verb diversity.
+5. **Semantic breadth** — Noun-dependency richness, schema-keyword
+   concentration, and valence entropy.
+
+Additionally computes pairwise Jensen–Shannon divergence across N texts
+and per-token channel capacities (Shannon–Hartley analogue).
+
+Example
+-------
+>>> from spell_exploder.analyzers.valence_model import ValenceModelAnalyzer
+>>> vm = ValenceModelAnalyzer()
+>>> result = vm.analyze(["essay_a.txt", "essay_b.txt"])
+>>> result.to_dataframe()
+"""
+
+from __future__ import annotations
+
+import collections
+import json
+import logging
+import math
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
+if TYPE_CHECKING:
+    import spacy
+
+from spell_exploder.core.compression import compressed_size
+from spell_exploder.core.entropy import (
+    multiscale_collapse_curve,
+    shannon_entropy,
+    summarize_multiscale_collapse,
+    window_collapse,
+)
+from spell_exploder.core.information import (
+    channel_capacity,
+    js_divergence_from_counters,
+    mutual_information,
+)
+from spell_exploder.core.nlp import get_nlp, tokenize
+from spell_exploder.extractors.action_frames import extract_action_frames, make_hashable_frame
+from spell_exploder.extractors.noun_dependencies import extract_noun_dependencies
+from spell_exploder.io.readers import TextDocument, load_texts, texts_from_strings
+from spell_exploder.results.valence import PostMetrics, ValenceModelResult
+from spell_exploder.utils.smoothing import per_1k
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Internal: parsed document bundle
+# ---------------------------------------------------------------------------
+
+class _ParsedPost:
+    """Intermediate representation of a single parsed document."""
+
+    __slots__ = ("path", "label", "text", "tokens", "frames", "noun_deps")
+
+    def __init__(
+        self,
+        path: str,
+        label: str,
+        text: str,
+        tokens: list[str],
+        frames: list[dict],
+        noun_deps: list[tuple[str, str, str]],
+    ):
+        self.path = path
+        self.label = label
+        self.text = text
+        self.tokens = tokens
+        self.frames = frames
+        self.noun_deps = noun_deps
+
+
+# ---------------------------------------------------------------------------
+# Analyzer
+# ---------------------------------------------------------------------------
+
+class ValenceModelAnalyzer:
+    """
+    Language Complexity Valence Model (LCVM) analyzer.
+
+    Parameters
+    ----------
+    window_sizes : tuple[int, ...]
+        Window sizes for multiscale collapse analysis.
+    top_k_schemas : int
+        Number of top schema keywords to track in detail.
+    model_name : str
+        spaCy model for tokenisation and dependency parsing.
+    nlp : spacy.Language or None
+        Pre-loaded pipeline (overrides *model_name*).
+    """
+
+    def __init__(
+        self,
+        window_sizes: tuple[int, ...] = (25, 50, 100, 250, 500),
+        top_k_schemas: int = 20,
+        model_name: str = "en_core_web_sm",
+        nlp: spacy.Language | None = None,
+    ):
+        self.window_sizes = window_sizes
+        self.top_k_schemas = top_k_schemas
+        self._model_name = model_name
+        self._nlp = nlp
+
+    # ── Public API ───────────────────────────────────────────────────
+
+    def analyze(
+        self,
+        texts_or_paths: list[str],
+        labels: list[str] | None = None,
+        from_files: bool = True,
+    ) -> ValenceModelResult:
+        """
+        Run the full LCVM pipeline on N texts.
+
+        Parameters
+        ----------
+        texts_or_paths : list[str]
+            File paths (when *from_files* is ``True``) or raw text
+            strings (when ``False``).
+        labels : list[str] or None
+            Human-readable labels.
+        from_files : bool
+            Whether to read from files or treat as raw strings.
+
+        Returns
+        -------
+        ValenceModelResult
+        """
+        # 1. Load
+        if from_files:
+            documents = load_texts(texts_or_paths, labels=labels)
+        else:
+            documents = texts_from_strings(texts_or_paths, labels=labels)
+
+        # 2. Parse all documents
+        posts = self._parse_all(documents)
+        logger.info("Parsed %d documents", len(posts))
+
+        # 3. Corpus-level entropy (for relative collapse)
+        corpus_tokens = [t for p in posts for t in p.tokens]
+        corpus_counter = Counter(corpus_tokens)
+        h_corpus = shannon_entropy(corpus_counter)
+
+        # 4. Per-document metrics
+        post_metrics: list[PostMetrics] = []
+        token_capacities: dict[str, pd.DataFrame] = {}
+
+        for p in posts:
+            pm, tc = self._compute_post_metrics(p, h_corpus)
+            post_metrics.append(pm)
+            token_capacities[p.label] = tc
+
+        # 5. Cross-document JS divergence matrix
+        js_matrix = self._compute_js_matrix(posts)
+
+        return ValenceModelResult(
+            posts=post_metrics,
+            js_divergence_matrix=js_matrix,
+            token_capacities=token_capacities,
+        )
+
+    def analyze_documents(
+        self,
+        documents: list[TextDocument],
+    ) -> ValenceModelResult:
+        """
+        Analyse pre-loaded :class:`~spell_exploder.io.readers.TextDocument` objects.
+        """
+        posts = self._parse_all(documents)
+        corpus_tokens = [t for p in posts for t in p.tokens]
+        h_corpus = shannon_entropy(Counter(corpus_tokens))
+
+        post_metrics = []
+        token_capacities = {}
+        for p in posts:
+            pm, tc = self._compute_post_metrics(p, h_corpus)
+            post_metrics.append(pm)
+            token_capacities[p.label] = tc
+
+        js_matrix = self._compute_js_matrix(posts)
+        return ValenceModelResult(
+            posts=post_metrics,
+            js_divergence_matrix=js_matrix,
+            token_capacities=token_capacities,
+        )
+
+    def build_complexity_profile(
+        self,
+        result: ValenceModelResult,
+    ) -> pd.DataFrame:
+        """
+        Extract a concise complexity profile from a full result.
+
+        Returns a DataFrame with one row per text, containing the key
+        metrics across all five dimensions plus (for N=2) the pairwise
+        JS divergence. Column names use descriptive ``"Section: Metric"``
+        labels.
+        """
+        df = result.to_dataframe()
+        wanted = [
+            "file",
+            "entropy_text",
+            "collapse_auc_norm",
+            "peak_win_size",
+            "mi_verb_subject",
+            "mi_verb_object",
+            "coupling_strength",
+            "coupling_orientation",
+            "frames_per_1k_tokens",
+            "verb_diversity",
+            "schema_keywords_per_1k_tokens",
+            "noun_deps_per_1k_tokens",
+            "schema_concentration_entropy",
+            "mean_schema_valence_entropy_topk",
+        ]
+        cols = [c for c in wanted if c in df.columns]
+        prof = df[cols].copy()
+
+        # Add JS divergence for the N=2 case
+        if (
+            result.js_divergence_matrix is not None
+            and result.js_divergence_matrix.shape == (2, 2)
+        ):
+            jsd = float(result.js_divergence_matrix[0, 1])
+            prof["js_divergence"] = jsd
+
+        rename = {
+            "entropy_text": "Variation: Text entropy (bits)",
+            "collapse_auc_norm": "Redundancy: Multiscale collapse AUC_norm",
+            "peak_win_size": "Redundancy: Peak scale (win size)",
+            "coupling_strength": "Organization: Coupling strength (bits)",
+            "coupling_orientation": "Organization: Orientation MI(V;O)-MI(V;S)",
+            "frames_per_1k_tokens": "Repertoire: Action density (frames/1k tokens)",
+            "verb_diversity": "Repertoire: Verb diversity (unique_verbs/frames)",
+            "schema_keywords_per_1k_tokens": "Semantic breadth: Schema keywords/1k tokens",
+            "js_divergence": "Distance: JS divergence (distance²)",
+        }
+        prof = prof.rename(columns=rename)
+        return prof
+
+    def profile_for_print(
+        self,
+        profile_df: pd.DataFrame,
+        label: str = "stem",
+        add_delta: bool = True,
+        group_sections: bool = True,
+    ) -> pd.DataFrame:
+        """
+        Transpose a complexity profile into a tall, print-friendly table.
+
+        Parameters
+        ----------
+        profile_df : DataFrame
+            Output of :meth:`build_complexity_profile`.
+        label : str
+            ``"stem"`` to use filename stems as column headers.
+        add_delta : bool
+            If ``True`` and exactly 2 texts, add Δ and %Δ columns.
+        group_sections : bool
+            If ``True``, split ``"Section: Metric"`` names into a MultiIndex.
+        """
+        df = profile_df.copy()
+
+        if "file" in df.columns:
+            if label == "stem":
+                df["Text"] = df["file"].map(lambda p: Path(str(p)).stem)
+            else:
+                df["Text"] = df["file"].astype(str)
+            df = df.drop(columns=["file"]).set_index("Text")
+
+        t = df.T
+
+        if group_sections:
+            parts = t.index.to_series().str.split(":", n=1, expand=True)
+            section = parts[0].fillna("Other").str.strip()
+            metric = parts[1].fillna(parts[0]).str.strip()
+            t.index = pd.MultiIndex.from_arrays(
+                [section, metric], names=["Section", "Metric"]
+            )
+
+        if add_delta and t.shape[1] == 2:
+            a, b = t.columns[0], t.columns[1]
+            va = pd.to_numeric(t[a], errors="coerce")
+            vb = pd.to_numeric(t[b], errors="coerce")
+            t["Δ (B − A)"] = vb - va
+            t["%Δ (vs A)"] = np.where(va != 0, (vb - va) / va * 100.0, np.nan)
+
+        return t.reset_index()
+
+    # ── Internal: parsing ────────────────────────────────────────────
+
+    def _get_nlp(self) -> spacy.Language:
+        if self._nlp is not None:
+            return self._nlp
+        return get_nlp(self._model_name, disable=["ner"])
+
+    def _get_nlp_tok(self) -> spacy.Language:
+        return get_nlp(self._model_name, disable=["parser", "ner"])
+
+    def _parse_all(self, documents: list[TextDocument]) -> list[_ParsedPost]:
+        nlp = self._get_nlp()
+        posts = []
+        for doc in documents:
+            frames = extract_action_frames(doc.text, nlp=nlp)
+            noun_deps = extract_noun_dependencies(doc.text, nlp=nlp)
+            tok_list = tokenize(doc.text, nlp=self._get_nlp_tok())
+            posts.append(_ParsedPost(
+                path=doc.path,
+                label=doc.label,
+                text=doc.text,
+                tokens=tok_list,
+                frames=frames,
+                noun_deps=noun_deps,
+            ))
+        return posts
+
+    # ── Internal: per-post metrics ───────────────────────────────────
+
+    def _compute_post_metrics(
+        self,
+        p: _ParsedPost,
+        h_corpus: float,
+    ) -> tuple[PostMetrics, pd.DataFrame]:
+        """Compute all metrics for a single document. Returns (PostMetrics, token_capacity_df)."""
+
+        tok_list = p.tokens
+        token_count = len(tok_list)
+        token_counter = Counter(tok_list)
+        h_text = shannon_entropy(token_counter)
+
+        # --- Variation: entropy deficit vs corpus ---
+        rel_entropy_deficit = (
+            (h_corpus - h_text) / h_corpus if h_corpus > 0 else None
+        )
+
+        # --- Redundancy: multiscale collapse ---
+        curve = multiscale_collapse_curve(tok_list, win_sizes=self.window_sizes)
+        red = summarize_multiscale_collapse(curve, x_scale="log")
+
+        collapses_250 = window_collapse(tok_list, win_size=250)
+        collapse_mean_250 = float(np.mean(collapses_250)) if collapses_250 else None
+        collapse_max_250 = float(np.max(collapses_250)) if collapses_250 else None
+        n_windows_250 = len(collapses_250)
+
+        # --- Repertoire: action frames ---
+        total_frames = len(p.frames)
+        verb_counter = Counter(f["verb"] for f in p.frames)
+        unique_verbs = len(verb_counter)
+        verb_diversity = (unique_verbs / total_frames) if total_frames > 0 else None
+        frames_1k = per_1k(total_frames, token_count)
+
+        top_verb, top_verb_count = (None, 0)
+        if verb_counter:
+            top_verb, top_verb_count = verb_counter.most_common(1)[0]
+
+        # --- Organisation: MI + frame entropy ---
+        hashable_frames = [make_hashable_frame(f) for f in p.frames]
+        h_frames = shannon_entropy(Counter(hashable_frames))
+
+        mi_vs = self._compute_mi_verb_role(p.frames, "subjects")
+        mi_vo = self._compute_mi_verb_role(p.frames, "objects")
+
+        coupling_strength = (
+            (mi_vs + mi_vo) / 2.0
+            if mi_vs is not None and mi_vo is not None
+            else None
+        )
+        coupling_orientation = (
+            (mi_vo - mi_vs)
+            if mi_vs is not None and mi_vo is not None
+            else None
+        )
+
+        # --- Semantic breadth: noun dependencies ---
+        noun_deps = p.noun_deps
+        total_noun_deps = len(noun_deps)
+        noun_deps_1k = per_1k(total_noun_deps, token_count)
+
+        schema_keywords = [dep[0] for dep in noun_deps]
+        schema_counter = Counter(schema_keywords)
+        unique_schemas = len(schema_counter)
+        schemas_1k = per_1k(unique_schemas, token_count)
+        schema_conc_entropy = shannon_entropy(schema_counter)
+
+        top_schemas = schema_counter.most_common(self.top_k_schemas)
+        top_schema_set = {k for k, _ in top_schemas}
+
+        valence_distrib: dict[str, list[str]] = defaultdict(list)
+        for sk, vk, _ in noun_deps:
+            if sk in top_schema_set:
+                valence_distrib[sk].append(vk)
+
+        schema_val_entropy: dict[str, float] = {}
+        for sk, valences in valence_distrib.items():
+            schema_val_entropy[sk] = round(shannon_entropy(Counter(valences)), 4)
+
+        mean_sv_entropy = (
+            float(np.mean(list(schema_val_entropy.values())))
+            if schema_val_entropy
+            else None
+        )
+
+        # --- Token channel capacities ---
+        tc_df = self._compute_token_capacities(p.frames)
+
+        # --- Assemble PostMetrics ---
+        def _r(v, d=4):
+            """Round if not None."""
+            return round(v, d) if v is not None and not (isinstance(v, float) and math.isnan(v)) else v
+
+        pm = PostMetrics(
+            file=p.label,
+            entropy_text=_r(h_text),
+            shannon_entropy_corpus=_r(rel_entropy_deficit),
+            shannon_entropy_avg=_r(collapse_mean_250),
+            shannon_entropy_max=_r(collapse_max_250),
+            number_of_windows=n_windows_250,
+            collapse_curve=curve,
+            collapse_auc=_r(red["collapse_auc"], 6),
+            collapse_auc_norm=_r(red["collapse_auc_norm"], 6),
+            peak_win_size=red["peak_win_size"],
+            peak_mean_collapse=_r(red["peak_mean_collapse"], 6),
+            token_count=token_count,
+            most_common_verb=top_verb,
+            most_common_verb_pattern_count=int(top_verb_count),
+            total_frames=total_frames,
+            unique_verbs=unique_verbs,
+            verb_diversity=_r(verb_diversity, 6),
+            frames_per_1k_tokens=_r(frames_1k, 6),
+            entropy_frames=_r(h_frames),
+            mi_verb_subject=_r(mi_vs, 6),
+            mi_verb_object=_r(mi_vo, 6),
+            coupling_strength=_r(coupling_strength, 6),
+            coupling_orientation=_r(coupling_orientation, 6),
+            total_noun_dependencies=total_noun_deps,
+            noun_deps_per_1k_tokens=_r(noun_deps_1k, 6),
+            unique_schema_keywords_in_deps=unique_schemas,
+            schema_keywords_per_1k_tokens=_r(schemas_1k, 6),
+            schema_concentration_entropy=_r(schema_conc_entropy, 6),
+            mean_schema_valence_entropy_topk=_r(mean_sv_entropy, 6),
+            top_schema_keywords=top_schemas,
+            schema_valence_entropy=schema_val_entropy,
+            valence_distributions=dict(valence_distrib),
+        )
+        return pm, tc_df
+
+    # ── Internal: mutual information for verb–role pairs ─────────────
+
+    @staticmethod
+    def _compute_mi_verb_role(
+        frames: list[dict],
+        role_key: str,
+    ) -> float | None:
+        """MI(Verb; Role) where role_key is 'subjects' or 'objects'."""
+        pairs = [(f["verb"], r) for f in frames for r in f.get(role_key, [])]
+        if not pairs:
+            return None
+        joint = Counter(pairs)
+        verb_marginal = Counter(v for v, _ in pairs)
+        role_marginal = Counter(r for _, r in pairs)
+        return mutual_information(joint, verb_marginal, role_marginal, len(pairs))
+
+    # ── Internal: token channel capacities ───────────────────────────
+
+    @staticmethod
+    def _compute_token_capacities(frames: list[dict]) -> pd.DataFrame:
+        """Shannon–Hartley channel capacity per unique token in action frames."""
+        frame_tokens: list[str] = []
+        for f in frames:
+            frame_tokens.append(f["verb"])
+            frame_tokens.extend(f.get("subjects", []))
+            frame_tokens.extend(f.get("objects", []))
+            frame_tokens.extend(dep[1] for dep in f.get("other_deps", []))
+
+        if not frame_tokens:
+            return pd.DataFrame(columns=["token", "channel_capacity"])
+
+        counter = Counter(frame_tokens)
+        total = len(frame_tokens)
+        rows = []
+        for token in sorted(set(frame_tokens)):
+            s = counter[token]
+            n = total - s
+            rows.append({"token": token, "channel_capacity": channel_capacity(s, max(n, 0))})
+
+        return pd.DataFrame(rows)
+
+    # ── Internal: JS divergence matrix ───────────────────────────────
+
+    def _compute_js_matrix(self, posts: list[_ParsedPost]) -> np.ndarray | None:
+        """N×N pairwise JS divergence matrix. Returns None for N < 2."""
+        n = len(posts)
+        if n < 2:
+            return None
+
+        counters = [Counter(p.tokens) for p in posts]
+        mat = np.zeros((n, n), dtype=float)
+        for i in range(n):
+            for j in range(i + 1, n):
+                d = js_divergence_from_counters(counters[i], counters[j])
+                mat[i, j] = d
+                mat[j, i] = d
+        return mat
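A minimal usage sketch of the profile workflow defined above, assuming two input files (the file names are placeholders; the exact columns depend on the PostMetrics fields defined in spell_exploder/results/valence.py):

    from spell_exploder.analyzers.valence_model import ValenceModelAnalyzer

    # Placeholder paths; analyze() reads them because from_files defaults to True.
    vm = ValenceModelAnalyzer(window_sizes=(25, 50, 100), top_k_schemas=10)
    result = vm.analyze(["essay_a.txt", "essay_b.txt"])

    # One row per text with "Section: Metric" column labels, then the tall,
    # print-friendly view with Δ and %Δ columns for the two-text case.
    profile = vm.build_complexity_profile(result)
    table = vm.profile_for_print(profile, add_delta=True)
    print(table.to_string(index=False))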
spell_exploder/core/__init__.py
@@ -0,0 +1,45 @@
+"""
+Core mathematical and NLP primitives used across all Spellcaster analyzers.
+"""
+
+from spell_exploder.core.compression import (
+    compressed_size,
+    ncd_similarity,
+    normalized_compression_distance,
+)
+from spell_exploder.core.entropy import (
+    multiscale_collapse_curve,
+    shannon_entropy,
+    summarize_multiscale_collapse,
+    window_collapse,
+)
+from spell_exploder.core.information import (
+    channel_capacity,
+    js_distance_from_counters,
+    js_divergence_from_counters,
+    js_divergence_matrix,
+    mutual_information,
+)
+from spell_exploder.core.nlp import clear_model_cache, get_nlp, tokenize
+
+__all__ = [
+    # entropy
+    "shannon_entropy",
+    "window_collapse",
+    "multiscale_collapse_curve",
+    "summarize_multiscale_collapse",
+    # compression
+    "compressed_size",
+    "normalized_compression_distance",
+    "ncd_similarity",
+    # information
+    "mutual_information",
+    "channel_capacity",
+    "js_divergence_from_counters",
+    "js_distance_from_counters",
+    "js_divergence_matrix",
+    # nlp
+    "get_nlp",
+    "clear_model_cache",
+    "tokenize",
+]
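A brief sketch of the re-exported surface above; the token lists are invented stand-ins for tokenize() output, and the two functions are called with the same Counter-based signatures that ValenceModelAnalyzer uses:

    from collections import Counter

    from spell_exploder.core import js_divergence_from_counters, shannon_entropy

    # Illustrative token lists standing in for tokenized documents.
    tokens_a = ["the", "cat", "sat", "on", "the", "mat"]
    tokens_b = ["the", "dog", "lay", "on", "the", "rug"]

    h_a = shannon_entropy(Counter(tokens_a))                                  # entropy of one distribution
    jsd = js_divergence_from_counters(Counter(tokens_a), Counter(tokens_b))   # pairwise divergence
    print(h_a, jsd)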
spell_exploder/core/compression.py
@@ -0,0 +1,103 @@
+"""
+Compression-based complexity measures.
+
+Uses ``zlib`` (LZ77-family) as an approximation of Kolmogorov complexity.
+Provides raw compressed size and the Normalized Compression Distance (NCD)
+for comparing structural similarity between two sequences.
+"""
+
+from __future__ import annotations
+
+import zlib
+
+
+def compressed_size(text: str) -> int:
+    """
+    Return the byte-length of *text* after zlib compression.
+
+    This serves as a practical upper-bound proxy for Kolmogorov complexity:
+    more compressible text → lower complexity.
+
+    Parameters
+    ----------
+    text : str
+        Raw input text.
+
+    Returns
+    -------
+    int
+        Size in bytes of the zlib-compressed UTF-8 encoding.
+        Returns ``0`` for empty input.
+    """
+    if not text:
+        return 0
+    return len(zlib.compress(text.encode("utf-8")))
+
+
+def normalized_compression_distance(
+    seq1: list[str],
+    seq2: list[str],
+) -> float:
+    """
+    Compute the Normalized Compression Distance (NCD) between two token sequences.
+
+    NCD is an approximation of normalized information distance based on
+    Kolmogorov complexity. Lower NCD means the two sequences share more
+    structural patterns.
+
+    .. math::
+        \\text{NCD}(x, y) = \\frac{C(xy) - \\min(C(x), C(y))}{\\max(C(x), C(y))}
+
+    Parameters
+    ----------
+    seq1, seq2 : list[str]
+        Token sequences (e.g. POS tags).
+
+    Returns
+    -------
+    float
+        NCD value in [0, 1]. 0 = identical structure, 1 = maximally distinct.
+    """
+    if not seq1 and not seq2:
+        return 0.0
+
+    s1 = " ".join(seq1).encode("utf-8")
+    s2 = " ".join(seq2).encode("utf-8")
+
+    if not s1 or not s2:
+        return 1.0  # One empty, one not → maximally distinct
+
+    c_x = len(zlib.compress(s1))
+    c_y = len(zlib.compress(s2))
+    # Average both concatenation orders to ensure symmetry
+    # (zlib's LZ77 window introduces order-dependent bias on short inputs)
+    c_xy = len(zlib.compress(s1 + b" " + s2))
+    c_yx = len(zlib.compress(s2 + b" " + s1))
+    c_concat = (c_xy + c_yx) / 2.0
+
+    max_c = max(c_x, c_y)
+    if max_c == 0:
+        return 0.0
+
+    ncd = (c_concat - min(c_x, c_y)) / max_c
+    return max(0.0, min(1.0, ncd))
+
+
+def ncd_similarity(seq1: list[str], seq2: list[str]) -> float:
+    """
+    Structural similarity score: ``1 - NCD``.
+
+    A convenience wrapper that returns 1.0 for identical structure and
+    0.0 for maximally distinct structure.
+
+    Parameters
+    ----------
+    seq1, seq2 : list[str]
+        Token sequences (e.g. POS tags).
+
+    Returns
+    -------
+    float
+        Similarity in [0, 1].
+    """
+    return 1.0 - normalized_compression_distance(seq1, seq2)
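A small sketch exercising the three helpers above; the POS-tag sequences are invented for illustration (sequences a and b share a structural template, c does not), so the printed values are only indicative:

    from spell_exploder.core.compression import (
        compressed_size,
        ncd_similarity,
        normalized_compression_distance,
    )

    pos_a = ["DET", "NOUN", "VERB", "DET", "NOUN"] * 40
    pos_b = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN"] * 40
    pos_c = ["INTJ", "NUM", "ADV", "ADJ", "PUNCT", "X"] * 40

    print(compressed_size(" ".join(pos_a)))               # zlib size as a complexity proxy
    print(normalized_compression_distance(pos_a, pos_b))  # lower: shared structure
    print(normalized_compression_distance(pos_a, pos_c))  # higher: little shared structure
    print(ncd_similarity(pos_a, pos_b))                   # 1 - NCD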