spell-exploder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spell_exploder/__init__.py +205 -0
- spell_exploder/_version.py +1 -0
- spell_exploder/analyzers/__init__.py +18 -0
- spell_exploder/analyzers/adaptive_evolution.py +453 -0
- spell_exploder/analyzers/complexity_index.py +224 -0
- spell_exploder/analyzers/keyword_erp.py +477 -0
- spell_exploder/analyzers/valence_model.py +523 -0
- spell_exploder/core/__init__.py +45 -0
- spell_exploder/core/compression.py +103 -0
- spell_exploder/core/entropy.py +203 -0
- spell_exploder/core/information.py +179 -0
- spell_exploder/core/nlp.py +107 -0
- spell_exploder/exceptions.py +25 -0
- spell_exploder/extractors/__init__.py +35 -0
- spell_exploder/extractors/action_frames.py +133 -0
- spell_exploder/extractors/noun_dependencies.py +96 -0
- spell_exploder/extractors/sentence_parser.py +168 -0
- spell_exploder/graphs/__init__.py +0 -0
- spell_exploder/io/__init__.py +14 -0
- spell_exploder/io/exporters.py +94 -0
- spell_exploder/io/readers.py +117 -0
- spell_exploder/results/__init__.py +44 -0
- spell_exploder/results/complexity.py +111 -0
- spell_exploder/results/evolution.py +136 -0
- spell_exploder/results/keyword.py +139 -0
- spell_exploder/results/valence.py +134 -0
- spell_exploder/utils/__init__.py +11 -0
- spell_exploder/utils/imports.py +48 -0
- spell_exploder/utils/smoothing.py +42 -0
- spell_exploder/utils/statistics.py +54 -0
- spell_exploder/visualization/__init__.py +27 -0
- spell_exploder/visualization/plots.py +562 -0
- spell_exploder-0.1.0.dist-info/METADATA +221 -0
- spell_exploder-0.1.0.dist-info/RECORD +37 -0
- spell_exploder-0.1.0.dist-info/WHEEL +5 -0
- spell_exploder-0.1.0.dist-info/licenses/LICENSE +21 -0
- spell_exploder-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Spell Exploder
|
|
3
|
+
==============
|
|
4
|
+
|
|
5
|
+
Analyze natural language text through complex systems science,
|
|
6
|
+
information theory, information-theoretic physics analogues,
|
|
7
|
+
and evolutionary game theory.
|
|
8
|
+
|
|
9
|
+
Quick Start
|
|
10
|
+
-----------
|
|
11
|
+
>>> import spell_exploder
|
|
12
|
+
>>> result = spell_exploder.analyze_complexity("draft_a.txt", "draft_b.txt")
|
|
13
|
+
>>> result.to_dataframe()
|
|
14
|
+
|
|
15
|
+
Analyzers
|
|
16
|
+
---------
|
|
17
|
+
For full control, import the analyzer classes directly::
|
|
18
|
+
|
|
19
|
+
from spell_exploder.analyzers import (
|
|
20
|
+
TextComplexityAnalyzer, # LCX — compression, volatility, synergy
|
|
21
|
+
ValenceModelAnalyzer, # LCVM — entropy, MI, action frames, collapse
|
|
22
|
+
AdaptiveEvolutionAnalyzer,# APE — POS clustering, evolutionary dynamics
|
|
23
|
+
KeywordERPAnalyzer, # KEPM — keyword structural coherence
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
Results
|
|
27
|
+
-------
|
|
28
|
+
All analyzers return structured dataclass results with ``.to_dataframe()``
|
|
29
|
+
methods for easy integration with pandas, matplotlib, or any other tooling.
|
|
30
|
+
|
|
31
|
+
Export
|
|
32
|
+
------
|
|
33
|
+
>>> from spell_exploder.io import export_csv, export_json
|
|
34
|
+
>>> export_csv(result, "output.csv")
|
|
35
|
+
>>> export_json(result, "output.json")
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
from spell_exploder._version import __version__
|
|
39
|
+
|
|
40
|
+
from spell_exploder.analyzers.complexity_index import TextComplexityAnalyzer
|
|
41
|
+
from spell_exploder.analyzers.valence_model import ValenceModelAnalyzer
|
|
42
|
+
from spell_exploder.analyzers.adaptive_evolution import AdaptiveEvolutionAnalyzer
|
|
43
|
+
from spell_exploder.analyzers.keyword_erp import KeywordERPAnalyzer
|
|
44
|
+
|
|
45
|
+
from spell_exploder.io.readers import load_texts, texts_from_strings
|
|
46
|
+
from spell_exploder.io.exporters import export_csv, export_json
|
|
47
|
+
|
|
48
|
+
from spell_exploder.results.complexity import (
|
|
49
|
+
ComplexityComparisonResult,
|
|
50
|
+
ComplexityFlowResult,
|
|
51
|
+
)
|
|
52
|
+
from spell_exploder.results.valence import ValenceModelResult
|
|
53
|
+
from spell_exploder.results.evolution import EvolutionResult
|
|
54
|
+
from spell_exploder.results.keyword import KeywordERPResult
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# ── Convenience functions ────────────────────────────────────────────────────
|
|
58
|
+
|
|
59
|
+
def analyze_complexity(
    *texts_or_paths: str,
    labels: list[str] | None = None,
    from_files: bool = True,
) -> ComplexityComparisonResult:
    """
    One-liner complexity analysis.

    Builds a default :class:`TextComplexityAnalyzer` and runs its
    ``compare`` pipeline over the given inputs.

    Parameters
    ----------
    *texts_or_paths : str
        File paths (when *from_files* is True) or raw text strings.
    labels : list[str] or None
        Human-readable labels.
    from_files : bool
        Whether to read from files or treat as raw strings.

    Returns
    -------
    ComplexityComparisonResult
    """
    analyzer = TextComplexityAnalyzer()
    return analyzer.compare(
        list(texts_or_paths),
        labels=labels,
        from_files=from_files,
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def analyze_valence(
    *texts_or_paths: str,
    labels: list[str] | None = None,
    from_files: bool = True,
    window_sizes: tuple[int, ...] = (25, 50, 100, 250, 500),
) -> ValenceModelResult:
    """
    One-liner valence model analysis.

    Constructs a :class:`ValenceModelAnalyzer` configured with
    *window_sizes* and delegates to its ``analyze`` method.

    Parameters
    ----------
    *texts_or_paths : str
        File paths or raw text strings.
    labels : list[str] or None
        Human-readable labels.
    from_files : bool
        Whether to read from files.
    window_sizes : tuple[int, ...]
        Window sizes for multiscale collapse.

    Returns
    -------
    ValenceModelResult
    """
    analyzer = ValenceModelAnalyzer(window_sizes=window_sizes)
    return analyzer.analyze(
        list(texts_or_paths),
        labels=labels,
        from_files=from_files,
    )
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def analyze_evolution(
    *texts_or_paths: str,
    labels: list[str] | None = None,
    from_files: bool = True,
    use_embeddings: bool = True,
    alpha_semantic: float = 0.5,
) -> EvolutionResult:
    """
    One-liner adaptive evolution analysis.

    Documents should be in chronological order (earliest first).

    Parameters
    ----------
    *texts_or_paths : str
        File paths or raw text strings.
    labels : list[str] or None
        Human-readable labels.
    from_files : bool
        Whether to read from files.
    use_embeddings : bool
        Whether to use sentence-transformer embeddings.
    alpha_semantic : float
        Blend weight for semantic vs. structural distance.

    Returns
    -------
    EvolutionResult
    """
    analyzer = AdaptiveEvolutionAnalyzer(
        use_embeddings=use_embeddings,
        alpha_semantic=alpha_semantic,
    )
    return analyzer.analyze(
        list(texts_or_paths),
        labels=labels,
        from_files=from_files,
    )
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def analyze_keywords(
    *texts_or_paths: str,
    keywords: list[str],
    labels: list[str] | None = None,
    from_files: bool = True,
    context_window: int = 25,
) -> KeywordERPResult:
    """
    One-liner keyword ERP analysis.

    Parameters
    ----------
    *texts_or_paths : str
        File paths or raw text strings.
    keywords : list[str]
        Keywords to analyse.
    labels : list[str] or None
        Human-readable labels.
    from_files : bool
        Whether to read from files.
    context_window : int
        ±N sentences around each keyword mention.

    Returns
    -------
    KeywordERPResult
    """
    analyzer = KeywordERPAnalyzer(
        keywords=keywords,
        context_window=context_window,
    )
    return analyzer.analyze(
        list(texts_or_paths),
        labels=labels,
        from_files=from_files,
    )
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
__all__ = [
|
|
187
|
+
"__version__",
|
|
188
|
+
"analyze_complexity",
|
|
189
|
+
"analyze_valence",
|
|
190
|
+
"analyze_evolution",
|
|
191
|
+
"analyze_keywords",
|
|
192
|
+
"TextComplexityAnalyzer",
|
|
193
|
+
"ValenceModelAnalyzer",
|
|
194
|
+
"AdaptiveEvolutionAnalyzer",
|
|
195
|
+
"KeywordERPAnalyzer",
|
|
196
|
+
"load_texts",
|
|
197
|
+
"texts_from_strings",
|
|
198
|
+
"export_csv",
|
|
199
|
+
"export_json",
|
|
200
|
+
"ComplexityComparisonResult",
|
|
201
|
+
"ComplexityFlowResult",
|
|
202
|
+
"ValenceModelResult",
|
|
203
|
+
"EvolutionResult",
|
|
204
|
+
"KeywordERPResult",
|
|
205
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Canonical package version; re-exported as ``spell_exploder.__version__``.
__version__ = "0.1.0"
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
Spell Exploder analyzers.

Each analyzer encapsulates a complete analytical pipeline, from text
input to structured result objects.
"""

from spell_exploder.analyzers.adaptive_evolution import AdaptiveEvolutionAnalyzer
from spell_exploder.analyzers.complexity_index import TextComplexityAnalyzer
from spell_exploder.analyzers.keyword_erp import KeywordERPAnalyzer
from spell_exploder.analyzers.valence_model import ValenceModelAnalyzer

__all__ = [
    "TextComplexityAnalyzer",
    "ValenceModelAnalyzer",
    "AdaptiveEvolutionAnalyzer",
    "KeywordERPAnalyzer",
]
|
|
@@ -0,0 +1,453 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Adaptive POS Evolution (APE) Analyzer.
|
|
3
|
+
|
|
4
|
+
Treats syntactic structures as biological *species* competing for
|
|
5
|
+
"cognitive market share" across documents. The pipeline:
|
|
6
|
+
|
|
7
|
+
1. **Parse** — Segment each document into sentences with POS tags.
|
|
8
|
+
2. **Structural similarity** — Compute NCD-based structural similarity
|
|
9
|
+
between every pair of sentences (compression distance on POS-tag
|
|
10
|
+
sequences).
|
|
11
|
+
3. **Embed** — Optionally compute semantic embeddings (sentence-transformers).
|
|
12
|
+
4. **Hybrid distance** — Blend structural and semantic distances via a
|
|
13
|
+
configurable weight ``alpha``.
|
|
14
|
+
5. **Cluster** — Agglomerative clustering on the hybrid distance matrix
|
|
15
|
+
with a data-driven distance threshold.
|
|
16
|
+
6. **Evolutionary dynamics** — Track cluster (species) density across
|
|
17
|
+
documents to classify each as Emerging, Extinct, Thriving, Declining,
|
|
18
|
+
or Stable.
|
|
19
|
+
7. **POS composition** — Profile the syntactic makeup of each species.
|
|
20
|
+
|
|
21
|
+
Example
|
|
22
|
+
-------
|
|
23
|
+
>>> from spell_exploder.analyzers.adaptive_evolution import AdaptiveEvolutionAnalyzer
|
|
24
|
+
>>> ape = AdaptiveEvolutionAnalyzer()
|
|
25
|
+
>>> result = ape.analyze(["early_draft.txt", "final_draft.txt"])
|
|
26
|
+
>>> result.to_dataframe()
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
import logging
|
|
32
|
+
from collections import Counter
|
|
33
|
+
from typing import TYPE_CHECKING
|
|
34
|
+
|
|
35
|
+
import numpy as np
|
|
36
|
+
import pandas as pd
|
|
37
|
+
|
|
38
|
+
if TYPE_CHECKING:
|
|
39
|
+
import spacy
|
|
40
|
+
|
|
41
|
+
from spell_exploder.core.compression import ncd_similarity
|
|
42
|
+
from spell_exploder.extractors.sentence_parser import parse_sentences, ParsedSentence
|
|
43
|
+
from spell_exploder.io.readers import TextDocument, load_texts, texts_from_strings
|
|
44
|
+
from spell_exploder.results.evolution import (
|
|
45
|
+
EvolutionaryStatus,
|
|
46
|
+
EvolutionResult,
|
|
47
|
+
POSComposition,
|
|
48
|
+
SpeciesRecord,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
logger = logging.getLogger(__name__)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
# Analyzer
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
class AdaptiveEvolutionAnalyzer:
    """
    Adaptive POS Evolution (APE) analyzer.

    Clusters sentences across an ordered document sequence and treats
    each cluster as a *species* whose density can rise or fall over time.

    Parameters
    ----------
    alpha_semantic : float
        Weight for semantic (embedding) distance vs. structural (NCD)
        distance. 0.0 = pure structural, 1.0 = pure semantic,
        0.5 = equal blend. Ignored when *use_embeddings* is ``False``.
    status_threshold : float
        Minimum absolute density delta to classify a species as
        Thriving or Declining (default ``0.02`` = 2%).
    top_k_pos : int
        Number of POS tags to report per species cluster.
    embedding_model : str
        Sentence-transformer model name for semantic embeddings.
    use_embeddings : bool
        If ``False``, skip semantic embeddings entirely and cluster
        on structural (NCD) distance alone. This avoids the
        ``sentence-transformers`` dependency.
    model_name : str
        spaCy model for sentence parsing and POS tagging.
    nlp : spacy.Language or None
        Pre-loaded spaCy pipeline (overrides *model_name*).
    """

    def __init__(
        self,
        alpha_semantic: float = 0.5,
        status_threshold: float = 0.02,
        top_k_pos: int = 10,
        embedding_model: str = "all-MiniLM-L6-v2",
        use_embeddings: bool = True,
        model_name: str = "en_core_web_sm",
        nlp: spacy.Language | None = None,
    ):
        self.alpha_semantic = alpha_semantic
        self.status_threshold = status_threshold
        self.top_k_pos = top_k_pos
        self.embedding_model = embedding_model
        self.use_embeddings = use_embeddings
        self._model_name = model_name
        self._nlp = nlp

    # ── Public API ───────────────────────────────────────────────────

    def analyze(
        self,
        texts_or_paths: list[str],
        labels: list[str] | None = None,
        from_files: bool = True,
    ) -> EvolutionResult:
        """
        Run the full APE pipeline on N ordered documents.

        Documents should be in chronological order (earliest first).

        Parameters
        ----------
        texts_or_paths : list[str]
            File paths or raw text strings.
        labels : list[str] or None
            Human-readable labels.
        from_files : bool
            Whether to read from files or treat as raw strings.

        Returns
        -------
        EvolutionResult
        """
        if from_files:
            documents = load_texts(texts_or_paths, labels=labels)
        else:
            documents = texts_from_strings(texts_or_paths, labels=labels)

        return self.analyze_documents(documents)

    def analyze_documents(
        self,
        documents: list[TextDocument],
    ) -> EvolutionResult:
        """Analyse pre-loaded :class:`TextDocument` objects."""

        # 1. Parse sentences with POS tags
        parsed = self._prepare_data(documents)
        logger.info("Parsed %d sentences from %d documents", len(parsed["df"]), len(documents))

        df = parsed["df"]
        if len(df) < 2:
            # Clustering needs at least two sentences; return an empty
            # result that still records the document order.
            logger.warning("Fewer than 2 sentences — cannot cluster.")
            return EvolutionResult(document_order=parsed["doc_order"])

        # 2. Structural similarity matrix (NCD on POS tags)
        struct_sim = self._compute_structural_similarity(df)

        # 3. Build distance matrix (hybrid or pure structural)
        dist_matrix = self._build_distance_matrix(df, struct_sim)

        # 4. Cluster
        df = self._cluster(df, dist_matrix)

        # 5. Evolutionary dynamics
        species = self._compute_evolutionary_dynamics(df, parsed["doc_order"])

        return EvolutionResult(
            species=species,
            structural_similarity_matrix=struct_sim,
            cluster_assignments=df,
            document_order=parsed["doc_order"],
        )

    # ── Stage 1: Parse ───────────────────────────────────────────────

    def _prepare_data(
        self,
        documents: list[TextDocument],
    ) -> dict:
        """
        Parse all documents into a combined DataFrame of sentences.

        Documents whose spaCy parse raises are logged and skipped; they
        still appear in ``doc_order`` but contribute no sentence rows.
        """
        all_rows: list[dict] = []

        for doc in documents:
            try:
                sents = parse_sentences(
                    doc.text,
                    nlp=self._nlp,
                    model_name=self._model_name,
                )
            except Exception:
                # No fallback parser exists: the document is skipped so a
                # single bad input cannot abort the whole pipeline.
                logger.warning("spaCy parse failed for %s; skipping", doc.label)
                continue

            for s in sents:
                all_rows.append({
                    "Sentence": s.text,
                    "POS_Tags": s.pos_tags,
                    "Source_Document": doc.label,
                })

        df = pd.DataFrame(all_rows)
        doc_order = [d.label for d in documents]

        return {"df": df, "doc_order": doc_order}

    def prepare_data_from_parsed(
        self,
        sentences_per_doc: dict[str, list[ParsedSentence]],
        doc_order: list[str] | None = None,
    ) -> dict:
        """
        Build the internal DataFrame from pre-parsed sentences.

        Useful for testing or when sentences have already been parsed.

        Parameters
        ----------
        sentences_per_doc : dict
            Mapping of document label → list of :class:`ParsedSentence`.
        doc_order : list[str] or None
            Chronological order. Defaults to dict key order.

        Returns
        -------
        dict with ``"df"`` and ``"doc_order"`` keys.
        """
        rows = []
        for doc_label, sents in sentences_per_doc.items():
            for s in sents:
                rows.append({
                    "Sentence": s.text,
                    "POS_Tags": s.pos_tags,
                    "Source_Document": doc_label,
                })
        df = pd.DataFrame(rows)
        order = doc_order or list(sentences_per_doc.keys())
        return {"df": df, "doc_order": order}

    # ── Stage 2: Structural similarity ───────────────────────────────

    @staticmethod
    def _compute_structural_similarity(df: pd.DataFrame) -> np.ndarray:
        """
        NCD-based structural similarity matrix for all sentence pairs.

        O(n^2) pairwise comparison; the matrix is symmetric with a unit
        diagonal (every sentence is maximally similar to itself).
        """
        n = len(df)
        mat = np.eye(n, dtype=float)

        pos_lists = df["POS_Tags"].tolist()
        for i in range(n):
            for j in range(i + 1, n):
                sim = ncd_similarity(pos_lists[i], pos_lists[j])
                mat[i, j] = sim
                mat[j, i] = sim

        return mat

    # ── Stage 3: Distance matrix ─────────────────────────────────────

    def _build_distance_matrix(
        self,
        df: pd.DataFrame,
        structural_similarity: np.ndarray,
    ) -> np.ndarray:
        """
        Build a hybrid or pure-structural distance matrix.

        When *use_embeddings* is True, blends structural and semantic
        distances using *alpha_semantic*. The result is symmetrized and
        its diagonal zeroed so it is valid for precomputed clustering.
        """
        structural_distance = 1.0 - structural_similarity

        # alpha_semantic == 0.0 means embeddings would not contribute,
        # so skip the (expensive) semantic pass entirely.
        if not self.use_embeddings or self.alpha_semantic == 0.0:
            dist = structural_distance.copy()
        else:
            semantic_sim = self._compute_semantic_similarity(df)
            semantic_distance = 1.0 - semantic_sim
            dist = (
                self.alpha_semantic * semantic_distance
                + (1.0 - self.alpha_semantic) * structural_distance
            )

        # Ensure symmetry and zero diagonal
        dist = (dist + dist.T) / 2.0
        np.fill_diagonal(dist, 0.0)
        return dist

    def _compute_semantic_similarity(self, df: pd.DataFrame) -> np.ndarray:
        """Cosine similarity of sentence embeddings."""
        from spell_exploder.utils.imports import require_sentence_transformers, require_sklearn

        SentenceTransformer = require_sentence_transformers()
        # Called for its availability check only; the actual import of the
        # metric happens below once sklearn is known to be present.
        require_sklearn()
        from sklearn.metrics.pairwise import cosine_similarity

        logger.info("Computing sentence embeddings with %s", self.embedding_model)
        model = SentenceTransformer(self.embedding_model)
        embeddings = model.encode(df["Sentence"].tolist())
        return cosine_similarity(embeddings)

    # ── Stage 4: Cluster ─────────────────────────────────────────────

    @staticmethod
    def _cluster(
        df: pd.DataFrame,
        distance_matrix: np.ndarray,
        threshold_offset: float = 0.5,
    ) -> pd.DataFrame:
        """
        Agglomerative clustering with a data-driven distance threshold.

        Parameters
        ----------
        df : DataFrame
            Must include at least ``Sentence`` and ``Source_Document``.
        distance_matrix : np.ndarray
            Precomputed distance matrix.
        threshold_offset : float
            Number of standard deviations below the mean distance to
            set as the clustering threshold (tighter = more clusters).

        Returns
        -------
        DataFrame with an added ``Group_ID`` column.
        """
        from spell_exploder.utils.imports import require_sklearn
        require_sklearn()
        from sklearn.cluster import AgglomerativeClustering

        auto_threshold = float(
            np.mean(distance_matrix) - np.std(distance_matrix) * threshold_offset
        )
        auto_threshold = max(auto_threshold, 0.01)  # Floor: threshold must stay positive

        logger.info("Clustering with auto threshold=%.4f", auto_threshold)

        agg = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=auto_threshold,
            metric="precomputed",
            linkage="average",
        )
        # Copy so the caller's DataFrame is never mutated in place.
        df = df.copy()
        df["Group_ID"] = agg.fit_predict(distance_matrix)

        logger.info("Found %d clusters", df["Group_ID"].nunique())
        return df

    # ── Stage 5: Evolutionary dynamics ───────────────────────────────

    def _compute_evolutionary_dynamics(
        self,
        df: pd.DataFrame,
        doc_order: list[str],
    ) -> list[SpeciesRecord]:
        """
        Compute density, delta, status, and POS composition per cluster.

        Generalizes to N documents by comparing the earliest and latest
        in *doc_order*.
        """
        if "Group_ID" not in df.columns:
            return []

        # Density per (document, cluster)
        counts = (
            df.groupby(["Source_Document", "Group_ID"])
            .size()
            .unstack(fill_value=0)
        )
        density = counts.div(counts.sum(axis=1), axis=0)

        first_doc = doc_order[0]
        last_doc = doc_order[-1]

        # A document that produced no sentences is absent from the
        # density index; give it an all-zero row so lookups succeed.
        for d in [first_doc, last_doc]:
            if d not in density.index:
                density.loc[d] = 0.0

        species: list[SpeciesRecord] = []

        for cluster_id in sorted(density.columns):
            d_start = float(density.loc[first_doc, cluster_id])
            d_end = float(density.loc[last_doc, cluster_id])
            delta = d_end - d_start

            status = self._classify_status(d_start, d_end, delta)

            # Sample sentence
            cluster_rows = df[df["Group_ID"] == cluster_id]
            sample = (
                cluster_rows["Sentence"].iloc[0]
                if not cluster_rows.empty
                else ""
            )

            # POS composition
            pos_comp = self._compute_pos_composition(cluster_rows)

            species.append(SpeciesRecord(
                cluster_id=int(cluster_id),
                status=status,
                density_start=round(d_start, 6),
                density_end=round(d_end, 6),
                delta=round(delta, 6),
                sample_sentence=sample,
                pos_composition=pos_comp,
            ))

        # Sort by end-state density (descending)
        species.sort(key=lambda s: s.density_end, reverse=True)
        return species

    def _classify_status(
        self,
        d_start: float,
        d_end: float,
        delta: float,
    ) -> EvolutionaryStatus:
        """
        Classify a species' evolutionary trajectory.

        Absence at either endpoint (Emerging / Extinct) takes precedence
        over the magnitude-based Thriving / Declining / Stable split.
        """
        if d_start == 0:
            return EvolutionaryStatus.EMERGING
        if d_end == 0:
            return EvolutionaryStatus.EXTINCT
        if delta > self.status_threshold:
            return EvolutionaryStatus.THRIVING
        if delta < -self.status_threshold:
            return EvolutionaryStatus.DECLINING
        return EvolutionaryStatus.STABLE

    def _compute_pos_composition(
        self,
        cluster_df: pd.DataFrame,
    ) -> list[POSComposition]:
        """Top-K POS tag frequencies for a cluster."""
        if cluster_df.empty or "POS_Tags" not in cluster_df.columns:
            return []

        # Non-list cells (e.g. NaN from a failed parse) contribute no tags.
        all_tags = [
            tag
            for tags in cluster_df["POS_Tags"]
            for tag in (tags if isinstance(tags, list) else [])
        ]
        if not all_tags:
            return []

        counter = Counter(all_tags)
        total = sum(counter.values())

        return [
            POSComposition(
                tag=tag,
                percentage=round(count / total * 100, 2),
                count=count,
            )
            for tag, count in counter.most_common(self.top_k_pos)
        ]
|