spell-exploder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. spell_exploder/__init__.py +205 -0
  2. spell_exploder/_version.py +1 -0
  3. spell_exploder/analyzers/__init__.py +18 -0
  4. spell_exploder/analyzers/adaptive_evolution.py +453 -0
  5. spell_exploder/analyzers/complexity_index.py +224 -0
  6. spell_exploder/analyzers/keyword_erp.py +477 -0
  7. spell_exploder/analyzers/valence_model.py +523 -0
  8. spell_exploder/core/__init__.py +45 -0
  9. spell_exploder/core/compression.py +103 -0
  10. spell_exploder/core/entropy.py +203 -0
  11. spell_exploder/core/information.py +179 -0
  12. spell_exploder/core/nlp.py +107 -0
  13. spell_exploder/exceptions.py +25 -0
  14. spell_exploder/extractors/__init__.py +35 -0
  15. spell_exploder/extractors/action_frames.py +133 -0
  16. spell_exploder/extractors/noun_dependencies.py +96 -0
  17. spell_exploder/extractors/sentence_parser.py +168 -0
  18. spell_exploder/graphs/__init__.py +0 -0
  19. spell_exploder/io/__init__.py +14 -0
  20. spell_exploder/io/exporters.py +94 -0
  21. spell_exploder/io/readers.py +117 -0
  22. spell_exploder/results/__init__.py +44 -0
  23. spell_exploder/results/complexity.py +111 -0
  24. spell_exploder/results/evolution.py +136 -0
  25. spell_exploder/results/keyword.py +139 -0
  26. spell_exploder/results/valence.py +134 -0
  27. spell_exploder/utils/__init__.py +11 -0
  28. spell_exploder/utils/imports.py +48 -0
  29. spell_exploder/utils/smoothing.py +42 -0
  30. spell_exploder/utils/statistics.py +54 -0
  31. spell_exploder/visualization/__init__.py +27 -0
  32. spell_exploder/visualization/plots.py +562 -0
  33. spell_exploder-0.1.0.dist-info/METADATA +221 -0
  34. spell_exploder-0.1.0.dist-info/RECORD +37 -0
  35. spell_exploder-0.1.0.dist-info/WHEEL +5 -0
  36. spell_exploder-0.1.0.dist-info/licenses/LICENSE +21 -0
  37. spell_exploder-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,205 @@
1
+ """
2
+ Spell Exploder
3
+ ==============
4
+
5
+ Analyze natural language text through complex systems science,
6
+ information theory, information-theoretic physics analogues,
7
+ and evolutionary game theory.
8
+
9
+ Quick Start
10
+ -----------
11
+ >>> import spell_exploder
12
+ >>> result = spell_exploder.analyze_complexity("draft_a.txt", "draft_b.txt")
13
+ >>> result.to_dataframe()
14
+
15
+ Analyzers
16
+ ---------
17
+ For full control, import the analyzer classes directly::
18
+
19
+ from spell_exploder.analyzers import (
20
+ TextComplexityAnalyzer, # LCX — compression, volatility, synergy
21
+ ValenceModelAnalyzer, # LCVM — entropy, MI, action frames, collapse
22
+ AdaptiveEvolutionAnalyzer,# APE — POS clustering, evolutionary dynamics
23
+ KeywordERPAnalyzer, # KEPM — keyword structural coherence
24
+ )
25
+
26
+ Results
27
+ -------
28
+ All analyzers return structured dataclass results with ``.to_dataframe()``
29
+ methods for easy integration with pandas, matplotlib, or any other tooling.
30
+
31
+ Export
32
+ ------
33
+ >>> from spell_exploder.io import export_csv, export_json
34
+ >>> export_csv(result, "output.csv")
35
+ >>> export_json(result, "output.json")
36
+ """
37
+
38
+ from spell_exploder._version import __version__
39
+
40
+ from spell_exploder.analyzers.complexity_index import TextComplexityAnalyzer
41
+ from spell_exploder.analyzers.valence_model import ValenceModelAnalyzer
42
+ from spell_exploder.analyzers.adaptive_evolution import AdaptiveEvolutionAnalyzer
43
+ from spell_exploder.analyzers.keyword_erp import KeywordERPAnalyzer
44
+
45
+ from spell_exploder.io.readers import load_texts, texts_from_strings
46
+ from spell_exploder.io.exporters import export_csv, export_json
47
+
48
+ from spell_exploder.results.complexity import (
49
+ ComplexityComparisonResult,
50
+ ComplexityFlowResult,
51
+ )
52
+ from spell_exploder.results.valence import ValenceModelResult
53
+ from spell_exploder.results.evolution import EvolutionResult
54
+ from spell_exploder.results.keyword import KeywordERPResult
55
+
56
+
57
+ # ── Convenience functions ────────────────────────────────────────────────────
58
+
59
def analyze_complexity(
    *texts_or_paths: str,
    labels: list[str] | None = None,
    from_files: bool = True,
) -> ComplexityComparisonResult:
    """
    Run a full complexity comparison in a single call.

    Thin convenience wrapper around :class:`TextComplexityAnalyzer`.

    Parameters
    ----------
    *texts_or_paths : str
        File paths (when *from_files* is True) or raw text strings.
    labels : list[str] or None
        Human-readable labels.
    from_files : bool
        Whether to read from files or treat as raw strings.

    Returns
    -------
    ComplexityComparisonResult
    """
    analyzer = TextComplexityAnalyzer()
    inputs = list(texts_or_paths)
    return analyzer.compare(inputs, labels=labels, from_files=from_files)
83
+
84
+
85
def analyze_valence(
    *texts_or_paths: str,
    labels: list[str] | None = None,
    from_files: bool = True,
    window_sizes: tuple[int, ...] = (25, 50, 100, 250, 500),
) -> ValenceModelResult:
    """
    Run the full valence-model pipeline in a single call.

    Thin convenience wrapper around :class:`ValenceModelAnalyzer`.

    Parameters
    ----------
    *texts_or_paths : str
        File paths or raw text strings.
    labels : list[str] or None
        Human-readable labels.
    from_files : bool
        Whether to read from files.
    window_sizes : tuple[int, ...]
        Window sizes for multiscale collapse.

    Returns
    -------
    ValenceModelResult
    """
    analyzer = ValenceModelAnalyzer(window_sizes=window_sizes)
    inputs = list(texts_or_paths)
    return analyzer.analyze(inputs, labels=labels, from_files=from_files)
112
+
113
+
114
def analyze_evolution(
    *texts_or_paths: str,
    labels: list[str] | None = None,
    from_files: bool = True,
    use_embeddings: bool = True,
    alpha_semantic: float = 0.5,
) -> EvolutionResult:
    """
    Run the adaptive POS evolution pipeline in a single call.

    Thin convenience wrapper around :class:`AdaptiveEvolutionAnalyzer`.
    Documents should be in chronological order (earliest first).

    Parameters
    ----------
    *texts_or_paths : str
        File paths or raw text strings.
    labels : list[str] or None
        Human-readable labels.
    from_files : bool
        Whether to read from files.
    use_embeddings : bool
        Whether to use sentence-transformer embeddings.
    alpha_semantic : float
        Blend weight for semantic vs. structural distance.

    Returns
    -------
    EvolutionResult
    """
    analyzer = AdaptiveEvolutionAnalyzer(
        use_embeddings=use_embeddings,
        alpha_semantic=alpha_semantic,
    )
    inputs = list(texts_or_paths)
    return analyzer.analyze(inputs, labels=labels, from_files=from_files)
149
+
150
+
151
def analyze_keywords(
    *texts_or_paths: str,
    keywords: list[str],
    labels: list[str] | None = None,
    from_files: bool = True,
    context_window: int = 25,
) -> KeywordERPResult:
    """
    Run the keyword ERP pipeline in a single call.

    Thin convenience wrapper around :class:`KeywordERPAnalyzer`.

    Parameters
    ----------
    *texts_or_paths : str
        File paths or raw text strings.
    keywords : list[str]
        Keywords to analyse.
    labels : list[str] or None
        Human-readable labels.
    from_files : bool
        Whether to read from files.
    context_window : int
        ±N sentences around each keyword mention.

    Returns
    -------
    KeywordERPResult
    """
    analyzer = KeywordERPAnalyzer(
        keywords=keywords,
        context_window=context_window,
    )
    inputs = list(texts_or_paths)
    return analyzer.analyze(inputs, labels=labels, from_files=from_files)
184
+
185
+
186
# Public API re-exported at package top level: version, one-liner
# convenience functions, analyzer classes, IO helpers, and result types.
__all__ = [
    "__version__",
    "analyze_complexity",
    "analyze_valence",
    "analyze_evolution",
    "analyze_keywords",
    "TextComplexityAnalyzer",
    "ValenceModelAnalyzer",
    "AdaptiveEvolutionAnalyzer",
    "KeywordERPAnalyzer",
    "load_texts",
    "texts_from_strings",
    "export_csv",
    "export_json",
    "ComplexityComparisonResult",
    "ComplexityFlowResult",
    "ValenceModelResult",
    "EvolutionResult",
    "KeywordERPResult",
]
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,18 @@
1
+ """
2
+ Spell Exploder analyzers.
3
+
4
+ Each analyzer encapsulates a complete analytical pipeline, from text
5
+ input to structured result objects.
6
+ """
7
+
8
+ from spell_exploder.analyzers.complexity_index import TextComplexityAnalyzer
9
+ from spell_exploder.analyzers.valence_model import ValenceModelAnalyzer
10
+ from spell_exploder.analyzers.adaptive_evolution import AdaptiveEvolutionAnalyzer
11
+ from spell_exploder.analyzers.keyword_erp import KeywordERPAnalyzer
12
+
13
# Analyzer classes exported by ``spell_exploder.analyzers``.
__all__ = [
    "TextComplexityAnalyzer",
    "ValenceModelAnalyzer",
    "AdaptiveEvolutionAnalyzer",
    "KeywordERPAnalyzer",
]
@@ -0,0 +1,453 @@
1
+ """
2
+ Adaptive POS Evolution (APE) Analyzer.
3
+
4
+ Treats syntactic structures as biological *species* competing for
5
+ "cognitive market share" across documents. The pipeline:
6
+
7
+ 1. **Parse** — Segment each document into sentences with POS tags.
8
+ 2. **Structural similarity** — Compute NCD-based structural similarity
9
+ between every pair of sentences (compression distance on POS-tag
10
+ sequences).
11
+ 3. **Embed** — Optionally compute semantic embeddings (sentence-transformers).
12
+ 4. **Hybrid distance** — Blend structural and semantic distances via a
13
+ configurable weight ``alpha``.
14
+ 5. **Cluster** — Agglomerative clustering on the hybrid distance matrix
15
+ with a data-driven distance threshold.
16
+ 6. **Evolutionary dynamics** — Track cluster (species) density across
17
+ documents to classify each as Emerging, Extinct, Thriving, Declining,
18
+ or Stable.
19
+ 7. **POS composition** — Profile the syntactic makeup of each species.
20
+
21
+ Example
22
+ -------
23
+ >>> from spell_exploder.analyzers.adaptive_evolution import AdaptiveEvolutionAnalyzer
24
+ >>> ape = AdaptiveEvolutionAnalyzer()
25
+ >>> result = ape.analyze(["early_draft.txt", "final_draft.txt"])
26
+ >>> result.to_dataframe()
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import logging
32
+ from collections import Counter
33
+ from typing import TYPE_CHECKING
34
+
35
+ import numpy as np
36
+ import pandas as pd
37
+
38
+ if TYPE_CHECKING:
39
+ import spacy
40
+
41
+ from spell_exploder.core.compression import ncd_similarity
42
+ from spell_exploder.extractors.sentence_parser import parse_sentences, ParsedSentence
43
+ from spell_exploder.io.readers import TextDocument, load_texts, texts_from_strings
44
+ from spell_exploder.results.evolution import (
45
+ EvolutionaryStatus,
46
+ EvolutionResult,
47
+ POSComposition,
48
+ SpeciesRecord,
49
+ )
50
+
51
+ logger = logging.getLogger(__name__)
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Analyzer
56
+ # ---------------------------------------------------------------------------
57
+
58
class AdaptiveEvolutionAnalyzer:
    """
    Adaptive POS Evolution (APE) analyzer.

    Parameters
    ----------
    alpha_semantic : float
        Weight for semantic (embedding) distance vs. structural (NCD)
        distance. 0.0 = pure structural, 1.0 = pure semantic,
        0.5 = equal blend. Ignored when *use_embeddings* is ``False``.
    status_threshold : float
        Minimum absolute density delta to classify a species as
        Thriving or Declining (default ``0.02`` = 2%).
    top_k_pos : int
        Number of POS tags to report per species cluster.
    embedding_model : str
        Sentence-transformer model name for semantic embeddings.
    use_embeddings : bool
        If ``False``, skip semantic embeddings entirely and cluster
        on structural (NCD) distance alone. This avoids the
        ``sentence-transformers`` dependency.
    model_name : str
        spaCy model for sentence parsing and POS tagging.
    nlp : spacy.Language or None
        Pre-loaded spaCy pipeline (overrides *model_name*).
    """

    def __init__(
        self,
        alpha_semantic: float = 0.5,
        status_threshold: float = 0.02,
        top_k_pos: int = 10,
        embedding_model: str = "all-MiniLM-L6-v2",
        use_embeddings: bool = True,
        model_name: str = "en_core_web_sm",
        nlp: spacy.Language | None = None,
    ):
        self.alpha_semantic = alpha_semantic
        self.status_threshold = status_threshold
        self.top_k_pos = top_k_pos
        self.embedding_model = embedding_model
        self.use_embeddings = use_embeddings
        self._model_name = model_name
        self._nlp = nlp

    # ── Public API ───────────────────────────────────────────────────

    def analyze(
        self,
        texts_or_paths: list[str],
        labels: list[str] | None = None,
        from_files: bool = True,
    ) -> EvolutionResult:
        """
        Run the full APE pipeline on N ordered documents.

        Documents should be in chronological order (earliest first).

        Parameters
        ----------
        texts_or_paths : list[str]
            File paths or raw text strings.
        labels : list[str] or None
            Human-readable labels.
        from_files : bool
            Whether to read from files or treat as raw strings.

        Returns
        -------
        EvolutionResult
        """
        if from_files:
            documents = load_texts(texts_or_paths, labels=labels)
        else:
            documents = texts_from_strings(texts_or_paths, labels=labels)

        return self.analyze_documents(documents)

    def analyze_documents(
        self,
        documents: list[TextDocument],
    ) -> EvolutionResult:
        """Analyse pre-loaded :class:`TextDocument` objects."""

        # 1. Parse sentences with POS tags
        parsed = self._prepare_data(documents)
        logger.info("Parsed %d sentences from %d documents", len(parsed["df"]), len(documents))

        df = parsed["df"]
        if len(df) < 2:
            # Clustering needs at least a pair of sentences; return an
            # empty result rather than raising.
            logger.warning("Fewer than 2 sentences — cannot cluster.")
            return EvolutionResult(document_order=parsed["doc_order"])

        # 2. Structural similarity matrix (NCD on POS tags)
        struct_sim = self._compute_structural_similarity(df)

        # 3. Build distance matrix (hybrid or pure structural)
        dist_matrix = self._build_distance_matrix(df, struct_sim)

        # 4. Cluster
        df = self._cluster(df, dist_matrix)

        # 5. Evolutionary dynamics
        species = self._compute_evolutionary_dynamics(df, parsed["doc_order"])

        return EvolutionResult(
            species=species,
            structural_similarity_matrix=struct_sim,
            cluster_assignments=df,
            document_order=parsed["doc_order"],
        )

    # ── Stage 1: Parse ───────────────────────────────────────────────

    def _prepare_data(
        self,
        documents: list[TextDocument],
    ) -> dict:
        """
        Parse all documents into a combined DataFrame of sentences.

        Returns a dict with ``"df"`` (one row per sentence: Sentence,
        POS_Tags, Source_Document) and ``"doc_order"`` (labels in input
        order).
        """
        all_rows: list[dict] = []

        for doc in documents:
            try:
                sents = parse_sentences(
                    doc.text,
                    nlp=self._nlp,
                    model_name=self._model_name,
                )
            except Exception:
                # A document that fails to parse is skipped entirely —
                # there is no fallback segmentation, so its sentences
                # simply do not participate in clustering.
                logger.warning("spaCy parse failed for %s; skipping", doc.label)
                continue

            for s in sents:
                all_rows.append({
                    "Sentence": s.text,
                    "POS_Tags": s.pos_tags,
                    "Source_Document": doc.label,
                })

        df = pd.DataFrame(all_rows)
        doc_order = [d.label for d in documents]

        return {"df": df, "doc_order": doc_order}

    def prepare_data_from_parsed(
        self,
        sentences_per_doc: dict[str, list[ParsedSentence]],
        doc_order: list[str] | None = None,
    ) -> dict:
        """
        Build the internal DataFrame from pre-parsed sentences.

        Useful for testing or when sentences have already been parsed.

        Parameters
        ----------
        sentences_per_doc : dict
            Mapping of document label → list of :class:`ParsedSentence`.
        doc_order : list[str] or None
            Chronological order. Defaults to dict key order.

        Returns
        -------
        dict with ``"df"`` and ``"doc_order"`` keys.
        """
        rows = []
        for doc_label, sents in sentences_per_doc.items():
            for s in sents:
                rows.append({
                    "Sentence": s.text,
                    "POS_Tags": s.pos_tags,
                    "Source_Document": doc_label,
                })
        df = pd.DataFrame(rows)
        order = doc_order or list(sentences_per_doc.keys())
        return {"df": df, "doc_order": order}

    # ── Stage 2: Structural similarity ───────────────────────────────

    @staticmethod
    def _compute_structural_similarity(df: pd.DataFrame) -> np.ndarray:
        """
        NCD-based structural similarity matrix for all sentence pairs.

        Symmetric, with 1.0 on the diagonal (a sentence is maximally
        similar to itself). O(n²) pairwise compression comparisons.
        """
        n = len(df)
        mat = np.eye(n, dtype=float)

        pos_lists = df["POS_Tags"].tolist()
        for i in range(n):
            for j in range(i + 1, n):
                sim = ncd_similarity(pos_lists[i], pos_lists[j])
                mat[i, j] = sim
                mat[j, i] = sim

        return mat

    # ── Stage 3: Distance matrix ─────────────────────────────────────

    def _build_distance_matrix(
        self,
        df: pd.DataFrame,
        structural_similarity: np.ndarray,
    ) -> np.ndarray:
        """
        Build a hybrid or pure-structural distance matrix.

        When *use_embeddings* is True, blends structural and semantic
        distances using *alpha_semantic*.
        """
        structural_distance = 1.0 - structural_similarity

        if not self.use_embeddings or self.alpha_semantic == 0.0:
            # Pure structural: no embedding model needed at all.
            dist = structural_distance.copy()
        else:
            semantic_sim = self._compute_semantic_similarity(df)
            semantic_distance = 1.0 - semantic_sim
            dist = (
                self.alpha_semantic * semantic_distance
                + (1.0 - self.alpha_semantic) * structural_distance
            )

        # Ensure symmetry and zero diagonal (guards against float
        # asymmetry introduced by the embedding cosine matrix).
        dist = (dist + dist.T) / 2.0
        np.fill_diagonal(dist, 0.0)
        return dist

    def _compute_semantic_similarity(self, df: pd.DataFrame) -> np.ndarray:
        """Cosine similarity of sentence embeddings."""
        from spell_exploder.utils.imports import require_sentence_transformers, require_sklearn

        SentenceTransformer = require_sentence_transformers()
        # Called for its availability check only; the module itself is
        # imported via the ``sklearn.metrics`` path below.
        require_sklearn()
        from sklearn.metrics.pairwise import cosine_similarity

        logger.info("Computing sentence embeddings with %s", self.embedding_model)
        model = SentenceTransformer(self.embedding_model)
        embeddings = model.encode(df["Sentence"].tolist())
        return cosine_similarity(embeddings)

    # ── Stage 4: Cluster ─────────────────────────────────────────────

    @staticmethod
    def _cluster(
        df: pd.DataFrame,
        distance_matrix: np.ndarray,
        threshold_offset: float = 0.5,
    ) -> pd.DataFrame:
        """
        Agglomerative clustering with a data-driven distance threshold.

        Parameters
        ----------
        df : DataFrame
            Must include at least ``Sentence`` and ``Source_Document``.
        distance_matrix : np.ndarray
            Precomputed distance matrix.
        threshold_offset : float
            Number of standard deviations below the mean distance to
            set as the clustering threshold (tighter = more clusters).

        Returns
        -------
        DataFrame with an added ``Group_ID`` column.
        """
        from spell_exploder.utils.imports import require_sklearn
        require_sklearn()
        from sklearn.cluster import AgglomerativeClustering

        auto_threshold = float(
            np.mean(distance_matrix) - np.std(distance_matrix) * threshold_offset
        )
        # Floor keeps the threshold positive when distances are tightly
        # concentrated (mean − k·std could go to zero or below).
        auto_threshold = max(auto_threshold, 0.01)

        logger.info("Clustering with auto threshold=%.4f", auto_threshold)

        agg = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=auto_threshold,
            metric="precomputed",
            linkage="average",
        )
        df = df.copy()
        df["Group_ID"] = agg.fit_predict(distance_matrix)

        logger.info("Found %d clusters", df["Group_ID"].nunique())
        return df

    # ── Stage 5: Evolutionary dynamics ───────────────────────────────

    def _compute_evolutionary_dynamics(
        self,
        df: pd.DataFrame,
        doc_order: list[str],
    ) -> list[SpeciesRecord]:
        """
        Compute density, delta, status, and POS composition per cluster.

        Generalizes to N documents by comparing the earliest and latest
        in *doc_order*.
        """
        if "Group_ID" not in df.columns:
            return []

        # Density per (document, cluster): share of each document's
        # sentences that fall in each cluster.
        counts = (
            df.groupby(["Source_Document", "Group_ID"])
            .size()
            .unstack(fill_value=0)
        )
        density = counts.div(counts.sum(axis=1), axis=0)

        first_doc = doc_order[0]
        last_doc = doc_order[-1]

        # A document whose parse failed (or that had no sentences) is
        # absent from the density index; treat it as all-zero densities.
        for d in [first_doc, last_doc]:
            if d not in density.index:
                density.loc[d] = 0.0

        species: list[SpeciesRecord] = []

        for cluster_id in sorted(density.columns):
            d_start = float(density.loc[first_doc, cluster_id])
            d_end = float(density.loc[last_doc, cluster_id])
            delta = d_end - d_start

            status = self._classify_status(d_start, d_end, delta)

            # Representative sentence for human inspection.
            cluster_rows = df[df["Group_ID"] == cluster_id]
            sample = (
                cluster_rows["Sentence"].iloc[0]
                if not cluster_rows.empty
                else ""
            )

            # POS composition
            pos_comp = self._compute_pos_composition(cluster_rows)

            species.append(SpeciesRecord(
                cluster_id=int(cluster_id),
                status=status,
                density_start=round(d_start, 6),
                density_end=round(d_end, 6),
                delta=round(delta, 6),
                sample_sentence=sample,
                pos_composition=pos_comp,
            ))

        # Sort by end-state density (descending)
        species.sort(key=lambda s: s.density_end, reverse=True)
        return species

    def _classify_status(
        self,
        d_start: float,
        d_end: float,
        delta: float,
    ) -> EvolutionaryStatus:
        """Classify a species' evolutionary trajectory."""
        # NOTE(review): a species absent in BOTH the first and last
        # document (present only in between) falls into the EMERGING
        # branch because d_start == 0 is checked first — confirm that
        # is the intended label for transient species.
        if d_start == 0:
            return EvolutionaryStatus.EMERGING
        if d_end == 0:
            return EvolutionaryStatus.EXTINCT
        if delta > self.status_threshold:
            return EvolutionaryStatus.THRIVING
        if delta < -self.status_threshold:
            return EvolutionaryStatus.DECLINING
        return EvolutionaryStatus.STABLE

    def _compute_pos_composition(
        self,
        cluster_df: pd.DataFrame,
    ) -> list[POSComposition]:
        """Top-K POS tag frequencies for a cluster."""
        if cluster_df.empty or "POS_Tags" not in cluster_df.columns:
            return []

        # Non-list POS_Tags entries (e.g. NaN) contribute nothing.
        all_tags = [
            tag
            for tags in cluster_df["POS_Tags"]
            for tag in (tags if isinstance(tags, list) else [])
        ]
        if not all_tags:
            return []

        counter = Counter(all_tags)
        total = sum(counter.values())

        return [
            POSComposition(
                tag=tag,
                percentage=round(count / total * 100, 2),
                count=count,
            )
            for tag, count in counter.most_common(self.top_k_pos)
        ]