spell_exploder-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. spell_exploder/__init__.py +205 -0
  2. spell_exploder/_version.py +1 -0
  3. spell_exploder/analyzers/__init__.py +18 -0
  4. spell_exploder/analyzers/adaptive_evolution.py +453 -0
  5. spell_exploder/analyzers/complexity_index.py +224 -0
  6. spell_exploder/analyzers/keyword_erp.py +477 -0
  7. spell_exploder/analyzers/valence_model.py +523 -0
  8. spell_exploder/core/__init__.py +45 -0
  9. spell_exploder/core/compression.py +103 -0
  10. spell_exploder/core/entropy.py +203 -0
  11. spell_exploder/core/information.py +179 -0
  12. spell_exploder/core/nlp.py +107 -0
  13. spell_exploder/exceptions.py +25 -0
  14. spell_exploder/extractors/__init__.py +35 -0
  15. spell_exploder/extractors/action_frames.py +133 -0
  16. spell_exploder/extractors/noun_dependencies.py +96 -0
  17. spell_exploder/extractors/sentence_parser.py +168 -0
  18. spell_exploder/graphs/__init__.py +0 -0
  19. spell_exploder/io/__init__.py +14 -0
  20. spell_exploder/io/exporters.py +94 -0
  21. spell_exploder/io/readers.py +117 -0
  22. spell_exploder/results/__init__.py +44 -0
  23. spell_exploder/results/complexity.py +111 -0
  24. spell_exploder/results/evolution.py +136 -0
  25. spell_exploder/results/keyword.py +139 -0
  26. spell_exploder/results/valence.py +134 -0
  27. spell_exploder/utils/__init__.py +11 -0
  28. spell_exploder/utils/imports.py +48 -0
  29. spell_exploder/utils/smoothing.py +42 -0
  30. spell_exploder/utils/statistics.py +54 -0
  31. spell_exploder/visualization/__init__.py +27 -0
  32. spell_exploder/visualization/plots.py +562 -0
  33. spell_exploder-0.1.0.dist-info/METADATA +221 -0
  34. spell_exploder-0.1.0.dist-info/RECORD +37 -0
  35. spell_exploder-0.1.0.dist-info/WHEEL +5 -0
  36. spell_exploder-0.1.0.dist-info/licenses/LICENSE +21 -0
  37. spell_exploder-0.1.0.dist-info/top_level.txt +1 -0
spell_exploder/analyzers/valence_model.py
@@ -0,0 +1,523 @@
+ """
+ Language Complexity Valence Model (LCVM) Analyzer.
+
+ The most comprehensive Spellcaster analyzer, computing ~30 metrics per
+ document across five analytical dimensions:
+
+ 1. **Variation** — Shannon entropy of token distributions.
+ 2. **Redundancy** — Multiscale entropy-collapse curves measuring local
+    repetition at different window sizes.
+ 3. **Organisation** — Mutual information between verbs and their
+    subjects/objects, capturing how tightly the action repertoire is
+    coupled.
+ 4. **Repertoire** — Action-frame density and verb diversity.
+ 5. **Semantic breadth** — Noun-dependency richness, schema-keyword
+    concentration, and valence entropy.
+
+ Additionally computes pairwise Jensen–Shannon divergence across N texts
+ and per-token channel capacities (Shannon–Hartley analogue).
+
+ Example
+ -------
+ >>> from spell_exploder.analyzers.valence_model import ValenceModelAnalyzer
+ >>> vm = ValenceModelAnalyzer()
+ >>> result = vm.analyze(["essay_a.txt", "essay_b.txt"])
+ >>> result.to_dataframe()
+ """
+
+ from __future__ import annotations
+
+ import collections
+ import json
+ import logging
+ import math
+ from collections import Counter, defaultdict
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+ import pandas as pd
+
+ if TYPE_CHECKING:
+     import spacy
+
+ from spell_exploder.core.compression import compressed_size
+ from spell_exploder.core.entropy import (
+     multiscale_collapse_curve,
+     shannon_entropy,
+     summarize_multiscale_collapse,
+     window_collapse,
+ )
+ from spell_exploder.core.information import (
+     channel_capacity,
+     js_divergence_from_counters,
+     mutual_information,
+ )
+ from spell_exploder.core.nlp import get_nlp, tokenize
+ from spell_exploder.extractors.action_frames import extract_action_frames, make_hashable_frame
+ from spell_exploder.extractors.noun_dependencies import extract_noun_dependencies
+ from spell_exploder.io.readers import TextDocument, load_texts, texts_from_strings
+ from spell_exploder.results.valence import PostMetrics, ValenceModelResult
+ from spell_exploder.utils.smoothing import per_1k
+
+ logger = logging.getLogger(__name__)
+
+
+ # ---------------------------------------------------------------------------
+ # Internal: parsed document bundle
+ # ---------------------------------------------------------------------------
+
+ class _ParsedPost:
+     """Intermediate representation of a single parsed document."""
+
+     __slots__ = ("path", "label", "text", "tokens", "frames", "noun_deps")
+
+     def __init__(
+         self,
+         path: str,
+         label: str,
+         text: str,
+         tokens: list[str],
+         frames: list[dict],
+         noun_deps: list[tuple[str, str, str]],
+     ):
+         self.path = path
+         self.label = label
+         self.text = text
+         self.tokens = tokens
+         self.frames = frames
+         self.noun_deps = noun_deps
+
+
+ # ---------------------------------------------------------------------------
+ # Analyzer
+ # ---------------------------------------------------------------------------
+
+ class ValenceModelAnalyzer:
+     """
+     Language Complexity Valence Model (LCVM) analyzer.
+
+     Parameters
+     ----------
+     window_sizes : tuple[int, ...]
+         Window sizes for multiscale collapse analysis.
+     top_k_schemas : int
+         Number of top schema keywords to track in detail.
+     model_name : str
+         spaCy model for tokenisation and dependency parsing.
+     nlp : spacy.Language or None
+         Pre-loaded pipeline (overrides *model_name*).
+     """
+
+     def __init__(
+         self,
+         window_sizes: tuple[int, ...] = (25, 50, 100, 250, 500),
+         top_k_schemas: int = 20,
+         model_name: str = "en_core_web_sm",
+         nlp: spacy.Language | None = None,
+     ):
+         self.window_sizes = window_sizes
+         self.top_k_schemas = top_k_schemas
+         self._model_name = model_name
+         self._nlp = nlp
+
+     # ── Public API ───────────────────────────────────────────────────
+
+     def analyze(
+         self,
+         texts_or_paths: list[str],
+         labels: list[str] | None = None,
+         from_files: bool = True,
+     ) -> ValenceModelResult:
+         """
+         Run the full LCVM pipeline on N texts.
+
+         Parameters
+         ----------
+         texts_or_paths : list[str]
+             File paths (when *from_files* is ``True``) or raw text
+             strings (when ``False``).
+         labels : list[str] or None
+             Human-readable labels.
+         from_files : bool
+             Whether to read from files or treat as raw strings.
+
+         Returns
+         -------
+         ValenceModelResult
+         """
+         # 1. Load
+         if from_files:
+             documents = load_texts(texts_or_paths, labels=labels)
+         else:
+             documents = texts_from_strings(texts_or_paths, labels=labels)
+
+         # 2. Parse all documents
+         posts = self._parse_all(documents)
+         logger.info("Parsed %d documents", len(posts))
+
+         # 3. Corpus-level entropy (for relative collapse)
+         corpus_tokens = [t for p in posts for t in p.tokens]
+         corpus_counter = Counter(corpus_tokens)
+         h_corpus = shannon_entropy(corpus_counter)
+
+         # 4. Per-document metrics
+         post_metrics: list[PostMetrics] = []
+         token_capacities: dict[str, pd.DataFrame] = {}
+
+         for p in posts:
+             pm, tc = self._compute_post_metrics(p, h_corpus)
+             post_metrics.append(pm)
+             token_capacities[p.label] = tc
+
+         # 5. Cross-document JS divergence matrix
+         js_matrix = self._compute_js_matrix(posts)
+
+         return ValenceModelResult(
+             posts=post_metrics,
+             js_divergence_matrix=js_matrix,
+             token_capacities=token_capacities,
+         )
+
+     def analyze_documents(
+         self,
+         documents: list[TextDocument],
+     ) -> ValenceModelResult:
+         """
+         Analyse pre-loaded :class:`~spell_exploder.io.readers.TextDocument` objects.
+         """
+         posts = self._parse_all(documents)
+         corpus_tokens = [t for p in posts for t in p.tokens]
+         h_corpus = shannon_entropy(Counter(corpus_tokens))
+
+         post_metrics = []
+         token_capacities = {}
+         for p in posts:
+             pm, tc = self._compute_post_metrics(p, h_corpus)
+             post_metrics.append(pm)
+             token_capacities[p.label] = tc
+
+         js_matrix = self._compute_js_matrix(posts)
+         return ValenceModelResult(
+             posts=post_metrics,
+             js_divergence_matrix=js_matrix,
+             token_capacities=token_capacities,
+         )
+
+     def build_complexity_profile(
+         self,
+         result: ValenceModelResult,
+     ) -> pd.DataFrame:
+         """
+         Extract a concise complexity profile from a full result.
+
+         Returns a DataFrame with one row per text, containing the key
+         metrics across all five dimensions plus (for N=2) the pairwise
+         JS divergence. Column names use descriptive ``"Section: Metric"``
+         labels.
+         """
+         df = result.to_dataframe()
+         wanted = [
+             "file",
+             "entropy_text",
+             "collapse_auc_norm",
+             "peak_win_size",
+             "mi_verb_subject",
+             "mi_verb_object",
+             "coupling_strength",
+             "coupling_orientation",
+             "frames_per_1k_tokens",
+             "verb_diversity",
+             "schema_keywords_per_1k_tokens",
+             "noun_deps_per_1k_tokens",
+             "schema_concentration_entropy",
+             "mean_schema_valence_entropy_topk",
+         ]
+         cols = [c for c in wanted if c in df.columns]
+         prof = df[cols].copy()
+
+         # Add JS divergence for the N=2 case
+         if (
+             result.js_divergence_matrix is not None
+             and result.js_divergence_matrix.shape == (2, 2)
+         ):
+             jsd = float(result.js_divergence_matrix[0, 1])
+             prof["js_divergence"] = jsd
+
+         rename = {
+             "entropy_text": "Variation: Text entropy (bits)",
+             "collapse_auc_norm": "Redundancy: Multiscale collapse AUC_norm",
+             "peak_win_size": "Redundancy: Peak scale (win size)",
+             "coupling_strength": "Organization: Coupling strength (bits)",
+             "coupling_orientation": "Organization: Orientation MI(V;O)-MI(V;S)",
+             "frames_per_1k_tokens": "Repertoire: Action density (frames/1k tokens)",
+             "verb_diversity": "Repertoire: Verb diversity (unique_verbs/frames)",
+             "schema_keywords_per_1k_tokens": "Semantic breadth: Schema keywords/1k tokens",
+             "js_divergence": "Distance: JS divergence (distance²)",
+         }
+         prof = prof.rename(columns=rename)
+         return prof
+
+     def profile_for_print(
+         self,
+         profile_df: pd.DataFrame,
+         label: str = "stem",
+         add_delta: bool = True,
+         group_sections: bool = True,
+     ) -> pd.DataFrame:
+         """
+         Transpose a complexity profile into a tall, print-friendly table.
+
+         Parameters
+         ----------
+         profile_df : DataFrame
+             Output of :meth:`build_complexity_profile`.
+         label : str
+             ``"stem"`` to use filename stems as column headers.
+         add_delta : bool
+             If ``True`` and exactly 2 texts, add Δ and %Δ columns.
+         group_sections : bool
+             If ``True``, split ``"Section: Metric"`` names into a MultiIndex.
+         """
+         df = profile_df.copy()
+
+         if "file" in df.columns:
+             if label == "stem":
+                 df["Text"] = df["file"].map(lambda p: Path(str(p)).stem)
+             else:
+                 df["Text"] = df["file"].astype(str)
+             df = df.drop(columns=["file"]).set_index("Text")
+
+         t = df.T
+
+         if group_sections:
+             parts = t.index.to_series().str.split(":", n=1, expand=True)
+             section = parts[0].fillna("Other").str.strip()
+             metric = parts[1].fillna(parts[0]).str.strip()
+             t.index = pd.MultiIndex.from_arrays(
+                 [section, metric], names=["Section", "Metric"]
+             )
+
+         if add_delta and t.shape[1] == 2:
+             a, b = t.columns[0], t.columns[1]
+             va = pd.to_numeric(t[a], errors="coerce")
+             vb = pd.to_numeric(t[b], errors="coerce")
+             t["Δ (B − A)"] = vb - va
+             t["%Δ (vs A)"] = np.where(va != 0, (vb - va) / va * 100.0, np.nan)
+
+         return t.reset_index()
+
+     # ── Internal: parsing ────────────────────────────────────────────
+
+     def _get_nlp(self) -> spacy.Language:
+         if self._nlp is not None:
+             return self._nlp
+         return get_nlp(self._model_name, disable=["ner"])
+
+     def _get_nlp_tok(self) -> spacy.Language:
+         return get_nlp(self._model_name, disable=["parser", "ner"])
+
+     def _parse_all(self, documents: list[TextDocument]) -> list[_ParsedPost]:
+         nlp = self._get_nlp()
+         posts = []
+         for doc in documents:
+             frames = extract_action_frames(doc.text, nlp=nlp)
+             noun_deps = extract_noun_dependencies(doc.text, nlp=nlp)
+             tok_list = tokenize(doc.text, nlp=self._get_nlp_tok())
+             posts.append(_ParsedPost(
+                 path=doc.path,
+                 label=doc.label,
+                 text=doc.text,
+                 tokens=tok_list,
+                 frames=frames,
+                 noun_deps=noun_deps,
+             ))
+         return posts
+
+     # ── Internal: per-post metrics ───────────────────────────────────
+
+     def _compute_post_metrics(
+         self,
+         p: _ParsedPost,
+         h_corpus: float,
+     ) -> tuple[PostMetrics, pd.DataFrame]:
+         """Compute all metrics for a single document. Returns (PostMetrics, token_capacity_df)."""
+
+         tok_list = p.tokens
+         token_count = len(tok_list)
+         token_counter = Counter(tok_list)
+         h_text = shannon_entropy(token_counter)
+
+         # --- Variation: entropy deficit vs corpus ---
+         rel_entropy_deficit = (
+             (h_corpus - h_text) / h_corpus if h_corpus > 0 else None
+         )
+
+         # --- Redundancy: multiscale collapse ---
+         curve = multiscale_collapse_curve(tok_list, win_sizes=self.window_sizes)
+         red = summarize_multiscale_collapse(curve, x_scale="log")
+
+         collapses_250 = window_collapse(tok_list, win_size=250)
+         collapse_mean_250 = float(np.mean(collapses_250)) if collapses_250 else None
+         collapse_max_250 = float(np.max(collapses_250)) if collapses_250 else None
+         n_windows_250 = len(collapses_250)
+
+         # --- Repertoire: action frames ---
+         total_frames = len(p.frames)
+         verb_counter = Counter(f["verb"] for f in p.frames)
+         unique_verbs = len(verb_counter)
+         verb_diversity = (unique_verbs / total_frames) if total_frames > 0 else None
+         frames_1k = per_1k(total_frames, token_count)
+
+         top_verb, top_verb_count = (None, 0)
+         if verb_counter:
+             top_verb, top_verb_count = verb_counter.most_common(1)[0]
+
+         # --- Organisation: MI + frame entropy ---
+         hashable_frames = [make_hashable_frame(f) for f in p.frames]
+         h_frames = shannon_entropy(Counter(hashable_frames))
+
+         mi_vs = self._compute_mi_verb_role(p.frames, "subjects")
+         mi_vo = self._compute_mi_verb_role(p.frames, "objects")
+
+         coupling_strength = (
+             (mi_vs + mi_vo) / 2.0
+             if mi_vs is not None and mi_vo is not None
+             else None
+         )
+         coupling_orientation = (
+             (mi_vo - mi_vs)
+             if mi_vs is not None and mi_vo is not None
+             else None
+         )
+
+         # --- Semantic breadth: noun dependencies ---
+         noun_deps = p.noun_deps
+         total_noun_deps = len(noun_deps)
+         noun_deps_1k = per_1k(total_noun_deps, token_count)
+
+         schema_keywords = [dep[0] for dep in noun_deps]
+         schema_counter = Counter(schema_keywords)
+         unique_schemas = len(schema_counter)
+         schemas_1k = per_1k(unique_schemas, token_count)
+         schema_conc_entropy = shannon_entropy(schema_counter)
+
+         top_schemas = schema_counter.most_common(self.top_k_schemas)
+         top_schema_set = {k for k, _ in top_schemas}
+
+         valence_distrib: dict[str, list[str]] = defaultdict(list)
+         for sk, vk, _ in noun_deps:
+             if sk in top_schema_set:
+                 valence_distrib[sk].append(vk)
+
+         schema_val_entropy: dict[str, float] = {}
+         for sk, valences in valence_distrib.items():
+             schema_val_entropy[sk] = round(shannon_entropy(Counter(valences)), 4)
+
+         mean_sv_entropy = (
+             float(np.mean(list(schema_val_entropy.values())))
+             if schema_val_entropy
+             else None
+         )
+
+         # --- Token channel capacities ---
+         tc_df = self._compute_token_capacities(p.frames)
+
+         # --- Assemble PostMetrics ---
+         def _r(v, d=4):
+             """Round if not None."""
+             return round(v, d) if v is not None and not (isinstance(v, float) and math.isnan(v)) else v
+
+         pm = PostMetrics(
+             file=p.label,
+             entropy_text=_r(h_text),
+             shannon_entropy_corpus=_r(rel_entropy_deficit),
+             shannon_entropy_avg=_r(collapse_mean_250),
+             shannon_entropy_max=_r(collapse_max_250),
+             number_of_windows=n_windows_250,
+             collapse_curve=curve,
+             collapse_auc=_r(red["collapse_auc"], 6),
+             collapse_auc_norm=_r(red["collapse_auc_norm"], 6),
+             peak_win_size=red["peak_win_size"],
+             peak_mean_collapse=_r(red["peak_mean_collapse"], 6),
+             token_count=token_count,
+             most_common_verb=top_verb,
+             most_common_verb_pattern_count=int(top_verb_count),
+             total_frames=total_frames,
+             unique_verbs=unique_verbs,
+             verb_diversity=_r(verb_diversity, 6),
+             frames_per_1k_tokens=_r(frames_1k, 6),
+             entropy_frames=_r(h_frames),
+             mi_verb_subject=_r(mi_vs, 6),
+             mi_verb_object=_r(mi_vo, 6),
+             coupling_strength=_r(coupling_strength, 6),
+             coupling_orientation=_r(coupling_orientation, 6),
+             total_noun_dependencies=total_noun_deps,
+             noun_deps_per_1k_tokens=_r(noun_deps_1k, 6),
+             unique_schema_keywords_in_deps=unique_schemas,
+             schema_keywords_per_1k_tokens=_r(schemas_1k, 6),
+             schema_concentration_entropy=_r(schema_conc_entropy, 6),
+             mean_schema_valence_entropy_topk=_r(mean_sv_entropy, 6),
+             top_schema_keywords=top_schemas,
+             schema_valence_entropy=schema_val_entropy,
+             valence_distributions=dict(valence_distrib),
+         )
+         return pm, tc_df
+
+     # ── Internal: mutual information for verb–role pairs ─────────────
+
+     @staticmethod
+     def _compute_mi_verb_role(
+         frames: list[dict],
+         role_key: str,
+     ) -> float | None:
+         """MI(Verb; Role) where role_key is 'subjects' or 'objects'."""
+         pairs = [(f["verb"], r) for f in frames for r in f.get(role_key, [])]
+         if not pairs:
+             return None
+         joint = Counter(pairs)
+         verb_marginal = Counter(v for v, _ in pairs)
+         role_marginal = Counter(r for _, r in pairs)
+         return mutual_information(joint, verb_marginal, role_marginal, len(pairs))
+
+     # ── Internal: token channel capacities ───────────────────────────
+
+     @staticmethod
+     def _compute_token_capacities(frames: list[dict]) -> pd.DataFrame:
+         """Shannon–Hartley channel capacity per unique token in action frames."""
+         frame_tokens: list[str] = []
+         for f in frames:
+             frame_tokens.append(f["verb"])
+             frame_tokens.extend(f.get("subjects", []))
+             frame_tokens.extend(f.get("objects", []))
+             frame_tokens.extend(dep[1] for dep in f.get("other_deps", []))
+
+         if not frame_tokens:
+             return pd.DataFrame(columns=["token", "channel_capacity"])
+
+         counter = Counter(frame_tokens)
+         total = len(frame_tokens)
+         rows = []
+         for token in sorted(set(frame_tokens)):
+             s = counter[token]
+             n = total - s
+             rows.append({"token": token, "channel_capacity": channel_capacity(s, max(n, 0))})
+
+         return pd.DataFrame(rows)
+
+     # ── Internal: JS divergence matrix ───────────────────────────────
+
+     def _compute_js_matrix(self, posts: list[_ParsedPost]) -> np.ndarray | None:
+         """N×N pairwise JS divergence matrix. Returns None for N < 2."""
+         n = len(posts)
+         if n < 2:
+             return None
+
+         counters = [Counter(p.tokens) for p in posts]
+         mat = np.zeros((n, n), dtype=float)
+         for i in range(n):
+             for j in range(i + 1, n):
+                 d = js_divergence_from_counters(counters[i], counters[j])
+                 mat[i, j] = d
+                 mat[j, i] = d
+         return mat
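
A minimal usage sketch of the analyzer defined in the hunk above (illustrative only; it assumes spell_exploder and the en_core_web_sm spaCy model are installed, and the input strings and labels are made up for the example). It drives the pipeline from raw strings via from_files=False, then builds the print-friendly profile:

# Usage sketch for ValenceModelAnalyzer (illustrative inputs).
from spell_exploder.analyzers.valence_model import ValenceModelAnalyzer

vm = ValenceModelAnalyzer(window_sizes=(25, 50, 100), top_k_schemas=10)

# Raw strings instead of file paths: pass from_files=False.
result = vm.analyze(
    ["The cat chased the mouse. The mouse hid under the porch.",
     "Rain fell all night on the quiet town."],
    labels=["text_a", "text_b"],
    from_files=False,
)

profile = vm.build_complexity_profile(result)            # one row per text
table = vm.profile_for_print(profile, add_delta=True)    # tall table with Δ columns for N=2
print(table)
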
spell_exploder/core/__init__.py
@@ -0,0 +1,45 @@
+ """
+ Core mathematical and NLP primitives used across all Spellcaster analyzers.
+ """
+
+ from spell_exploder.core.compression import (
+     compressed_size,
+     ncd_similarity,
+     normalized_compression_distance,
+ )
+ from spell_exploder.core.entropy import (
+     multiscale_collapse_curve,
+     shannon_entropy,
+     summarize_multiscale_collapse,
+     window_collapse,
+ )
+ from spell_exploder.core.information import (
+     channel_capacity,
+     js_distance_from_counters,
+     js_divergence_from_counters,
+     js_divergence_matrix,
+     mutual_information,
+ )
+ from spell_exploder.core.nlp import clear_model_cache, get_nlp, tokenize
+
+ __all__ = [
+     # entropy
+     "shannon_entropy",
+     "window_collapse",
+     "multiscale_collapse_curve",
+     "summarize_multiscale_collapse",
+     # compression
+     "compressed_size",
+     "normalized_compression_distance",
+     "ncd_similarity",
+     # information
+     "mutual_information",
+     "channel_capacity",
+     "js_divergence_from_counters",
+     "js_distance_from_counters",
+     "js_divergence_matrix",
+     # nlp
+     "get_nlp",
+     "clear_model_cache",
+     "tokenize",
+ ]
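
The re-exports above let callers pull the core primitives from spell_exploder.core in one import. A toy sketch, assuming the entropy and divergence helpers accept collections.Counter inputs as they are used in valence_model.py above (the character strings are illustrative):

# Sketch: corpus-level helpers imported from the package's core namespace.
from collections import Counter
from spell_exploder.core import js_divergence_from_counters, shannon_entropy

a = Counter("abracadabra")   # toy frequency distributions
b = Counter("alakazam")
print(shannon_entropy(a))                       # entropy in bits
print(js_divergence_from_counters(a, b))        # pairwise JS divergence
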
spell_exploder/core/compression.py
@@ -0,0 +1,103 @@
+ """
+ Compression-based complexity measures.
+
+ Uses ``zlib`` (LZ77-family) as an approximation of Kolmogorov complexity.
+ Provides raw compressed size and the Normalized Compression Distance (NCD)
+ for comparing structural similarity between two sequences.
+ """
+
+ from __future__ import annotations
+
+ import zlib
+
+
+ def compressed_size(text: str) -> int:
+     """
+     Return the byte-length of *text* after zlib compression.
+
+     This serves as a practical upper-bound proxy for Kolmogorov complexity:
+     more compressible text → lower complexity.
+
+     Parameters
+     ----------
+     text : str
+         Raw input text.
+
+     Returns
+     -------
+     int
+         Size in bytes of the zlib-compressed UTF-8 encoding.
+         Returns ``0`` for empty input.
+     """
+     if not text:
+         return 0
+     return len(zlib.compress(text.encode("utf-8")))
+
+
+ def normalized_compression_distance(
+     seq1: list[str],
+     seq2: list[str],
+ ) -> float:
+     """
+     Compute the Normalized Compression Distance (NCD) between two token sequences.
+
+     NCD is an approximation of normalized information distance based on
+     Kolmogorov complexity. Lower NCD means the two sequences share more
+     structural patterns.
+
+     .. math::
+         \\text{NCD}(x, y) = \\frac{C(xy) - \\min(C(x), C(y))}{\\max(C(x), C(y))}
+
+     Parameters
+     ----------
+     seq1, seq2 : list[str]
+         Token sequences (e.g. POS tags).
+
+     Returns
+     -------
+     float
+         NCD value in [0, 1]. 0 = identical structure, 1 = maximally distinct.
+     """
+     if not seq1 and not seq2:
+         return 0.0
+
+     s1 = " ".join(seq1).encode("utf-8")
+     s2 = " ".join(seq2).encode("utf-8")
+
+     if not s1 or not s2:
+         return 1.0  # One empty, one not → maximally distinct
+
+     c_x = len(zlib.compress(s1))
+     c_y = len(zlib.compress(s2))
+     # Average both concatenation orders to ensure symmetry
+     # (zlib's LZ77 window introduces order-dependent bias on short inputs)
+     c_xy = len(zlib.compress(s1 + b" " + s2))
+     c_yx = len(zlib.compress(s2 + b" " + s1))
+     c_concat = (c_xy + c_yx) / 2.0
+
+     max_c = max(c_x, c_y)
+     if max_c == 0:
+         return 0.0
+
+     ncd = (c_concat - min(c_x, c_y)) / max_c
+     return max(0.0, min(1.0, ncd))
+
+
+ def ncd_similarity(seq1: list[str], seq2: list[str]) -> float:
+     """
+     Structural similarity score: ``1 - NCD``.
+
+     A convenience wrapper that returns 1.0 for identical structure and
+     0.0 for maximally distinct structure.
+
+     Parameters
+     ----------
+     seq1, seq2 : list[str]
+         Token sequences (e.g. POS tags).
+
+     Returns
+     -------
+     float
+         Similarity in [0, 1].
+     """
+     return 1.0 - normalized_compression_distance(seq1, seq2)
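
A small sketch of the two public helpers from the hunk above on toy POS-tag sequences (the sequences are illustrative; note that zlib header overhead means identical inputs score near, not exactly, 0 NCD):

# Sketch: compression-based structural comparison of two toy tag sequences.
from spell_exploder.core.compression import (
    compressed_size,
    ncd_similarity,
    normalized_compression_distance,
)

pos_a = ["DET", "NOUN", "VERB", "DET", "NOUN"]
pos_b = ["DET", "NOUN", "VERB", "DET", "NOUN", "ADP", "DET", "NOUN"]

print(compressed_size("the cat sat on the mat"))        # bytes after zlib compression
print(normalized_compression_distance(pos_a, pos_b))    # lower = more shared structure
print(ncd_similarity(pos_a, pos_b))                     # 1 - NCD, higher = more similar
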