spell_exploder-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spell_exploder/__init__.py +205 -0
- spell_exploder/_version.py +1 -0
- spell_exploder/analyzers/__init__.py +18 -0
- spell_exploder/analyzers/adaptive_evolution.py +453 -0
- spell_exploder/analyzers/complexity_index.py +224 -0
- spell_exploder/analyzers/keyword_erp.py +477 -0
- spell_exploder/analyzers/valence_model.py +523 -0
- spell_exploder/core/__init__.py +45 -0
- spell_exploder/core/compression.py +103 -0
- spell_exploder/core/entropy.py +203 -0
- spell_exploder/core/information.py +179 -0
- spell_exploder/core/nlp.py +107 -0
- spell_exploder/exceptions.py +25 -0
- spell_exploder/extractors/__init__.py +35 -0
- spell_exploder/extractors/action_frames.py +133 -0
- spell_exploder/extractors/noun_dependencies.py +96 -0
- spell_exploder/extractors/sentence_parser.py +168 -0
- spell_exploder/graphs/__init__.py +0 -0
- spell_exploder/io/__init__.py +14 -0
- spell_exploder/io/exporters.py +94 -0
- spell_exploder/io/readers.py +117 -0
- spell_exploder/results/__init__.py +44 -0
- spell_exploder/results/complexity.py +111 -0
- spell_exploder/results/evolution.py +136 -0
- spell_exploder/results/keyword.py +139 -0
- spell_exploder/results/valence.py +134 -0
- spell_exploder/utils/__init__.py +11 -0
- spell_exploder/utils/imports.py +48 -0
- spell_exploder/utils/smoothing.py +42 -0
- spell_exploder/utils/statistics.py +54 -0
- spell_exploder/visualization/__init__.py +27 -0
- spell_exploder/visualization/plots.py +562 -0
- spell_exploder-0.1.0.dist-info/METADATA +221 -0
- spell_exploder-0.1.0.dist-info/RECORD +37 -0
- spell_exploder-0.1.0.dist-info/WHEEL +5 -0
- spell_exploder-0.1.0.dist-info/licenses/LICENSE +21 -0
- spell_exploder-0.1.0.dist-info/top_level.txt +1 -0
spell_exploder/core/entropy.py
@@ -0,0 +1,203 @@

```python
"""
Entropy-based measures for text analysis.

Provides Shannon entropy computation, windowed entropy collapse (measuring
local redundancy relative to a document), and multiscale collapse curves
that summarize redundancy structure across multiple window sizes.
"""

from __future__ import annotations

import math
from collections import Counter

import numpy as np


# ---------------------------------------------------------------------------
# Shannon entropy
# ---------------------------------------------------------------------------

def shannon_entropy(counter: Counter) -> float:
    """
    Compute Shannon entropy (in bits) from a frequency counter.

    Parameters
    ----------
    counter : Counter
        Token → count mapping.

    Returns
    -------
    float
        Entropy in bits. Returns ``0.0`` for an empty counter.

    Notes
    -----
    .. math::
        H = -\\sum_{i} p_i \\log_2 p_i
    """
    total = sum(counter.values())
    if total == 0:
        return 0.0
    probs = np.array(list(counter.values()), dtype=float) / total
    # Mask zeros to avoid log(0)
    probs = probs[probs > 0]
    return float(-(probs * np.log2(probs)).sum())


# ---------------------------------------------------------------------------
# Windowed entropy collapse
# ---------------------------------------------------------------------------

def window_collapse(
    tokens: list[str],
    win_size: int = 250,
) -> list[float]:
    """
    Compute per-window *entropy collapse* values for non-overlapping windows.

    Entropy collapse for a window is defined as the normalized deficit
    of the window's entropy relative to the whole-document entropy:

    .. math::
        \\text{collapse}_w = \\frac{H_{\\text{doc}} - H_w}{H_{\\text{doc}}}

    A value near 1 means the window is highly redundant (low local variety);
    a value near 0 means the window is as diverse as the full document.

    Parameters
    ----------
    tokens : list[str]
        Full document token sequence.
    win_size : int
        Non-overlapping window width (in tokens).

    Returns
    -------
    list[float]
        One collapse value per window. Tail chunks shorter than 2 tokens
        are dropped.
    """
    h_doc = shannon_entropy(Counter(tokens))
    if h_doc == 0:
        return []

    collapses: list[float] = []
    for start in range(0, len(tokens), win_size):
        chunk = tokens[start : start + win_size]
        if len(chunk) < 2:
            continue
        h_chunk = shannon_entropy(Counter(chunk))
        collapses.append((h_doc - h_chunk) / h_doc)

    return collapses


# ---------------------------------------------------------------------------
# Multiscale collapse
# ---------------------------------------------------------------------------

_DEFAULT_WIN_SIZES: tuple[int, ...] = (25, 50, 100, 250, 500)


def multiscale_collapse_curve(
    tokens: list[str],
    win_sizes: tuple[int, ...] = _DEFAULT_WIN_SIZES,
) -> list[dict]:
    """
    Compute mean and max entropy collapse at multiple window sizes.

    Parameters
    ----------
    tokens : list[str]
        Full document token sequence.
    win_sizes : tuple[int, ...]
        Window widths to evaluate.

    Returns
    -------
    list[dict]
        One dict per window size with keys:
        ``win_size``, ``n_windows``, ``mean_collapse``, ``max_collapse``.
    """
    curve: list[dict] = []
    for w in win_sizes:
        cs = window_collapse(tokens, win_size=w)
        curve.append({
            "win_size": int(w),
            "n_windows": len(cs),
            "mean_collapse": float(np.mean(cs)) if cs else float("nan"),
            "max_collapse": float(np.max(cs)) if cs else float("nan"),
        })
    return curve


def summarize_multiscale_collapse(
    curve: list[dict],
    x_scale: str = "log",
) -> dict:
    """
    Summarize a multiscale collapse curve into scalar metrics.

    Parameters
    ----------
    curve : list[dict]
        Output of :func:`multiscale_collapse_curve`.
    x_scale : str
        ``"log"`` for log-scaled x-axis (window size) in AUC integration,
        ``"linear"`` for raw window sizes.

    Returns
    -------
    dict
        ``collapse_auc`` — trapezoidal area under the mean-collapse curve.
        ``collapse_auc_norm`` — AUC divided by x-range (average collapse across scales).
        ``peak_win_size`` — window size with highest mean collapse.
        ``peak_mean_collapse`` — that maximum value.
    """
    pts = [
        (d["win_size"], d["mean_collapse"])
        for d in curve
        if not math.isnan(d["mean_collapse"])
    ]

    nan_result = dict(
        collapse_auc=float("nan"),
        collapse_auc_norm=float("nan"),
        peak_win_size=None,
        peak_mean_collapse=float("nan"),
    )

    if len(pts) == 0:
        return nan_result

    if len(pts) == 1:
        w, m = pts[0]
        return dict(
            collapse_auc=0.0,
            collapse_auc_norm=float(m),
            peak_win_size=int(w),
            peak_mean_collapse=float(m),
        )

    xs = np.array(
        [math.log(w) if x_scale == "log" else float(w) for w, _ in pts],
        dtype=float,
    )
    ys = np.array([m for _, m in pts], dtype=float)

    # np.trapezoid in numpy 2.x; np.trapz in earlier versions
    _trapz = getattr(np, "trapezoid", None) or np.trapz
    auc = float(_trapz(ys, xs))
    x_range = float(xs.max() - xs.min())
    auc_norm = float(auc / x_range) if x_range > 0 else float(np.mean(ys))

    peak_win, peak_mean = max(pts, key=lambda t: t[1])

    return dict(
        collapse_auc=auc,
        collapse_auc_norm=auc_norm,
        peak_win_size=int(peak_win),
        peak_mean_collapse=float(peak_mean),
    )
```
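Taken together, the three functions compose into a small pipeline: per-window scores from `window_collapse`, a curve over scales from `multiscale_collapse_curve`, and scalar summaries from `summarize_multiscale_collapse`. A minimal usage sketch, not part of the package, with synthetic tokens standing in for the spaCy-based tokenizer:

```python
# Minimal sketch (illustrative only): a document with one locally
# repetitive stretch. Windows covering the stretch collapse toward 1;
# diverse windows stay near 0.
from spell_exploder.core.entropy import (
    multiscale_collapse_curve,
    summarize_multiscale_collapse,
    window_collapse,
)

varied = [f"w{i}" for i in range(400)]               # 400 distinct tokens
tokens = varied[:200] + ["om"] * 100 + varied[200:]  # redundant middle stretch

print([round(c, 2) for c in window_collapse(tokens, win_size=100)])
# -> [0.13, 0.13, 1.0, 0.13, 0.13]  (third window is pure repetition)

curve = multiscale_collapse_curve(tokens, win_sizes=(25, 50, 100, 250))
summary = summarize_multiscale_collapse(curve, x_scale="log")
print(summary["peak_win_size"])  # smallest window isolates the redundancy best
```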
spell_exploder/core/information.py
@@ -0,0 +1,179 @@

```python
"""
Information-theoretic measures for text analysis.

Mutual information, channel capacity (Shannon–Hartley analogue),
and Jensen–Shannon divergence — with N-text generalizations.
"""

from __future__ import annotations

import math
from collections import Counter

import numpy as np
from scipy.spatial.distance import jensenshannon


# ---------------------------------------------------------------------------
# Mutual information
# ---------------------------------------------------------------------------

def mutual_information(
    joint: Counter,
    marginal_x: Counter,
    marginal_y: Counter,
    n: int | None = None,
) -> float:
    """
    Compute mutual information I(X; Y) from joint and marginal counters.

    .. math::
        I(X; Y) = \\sum_{x, y} p(x, y) \\log_2 \\frac{p(x, y)}{p(x)\\,p(y)}

    Parameters
    ----------
    joint : Counter
        Mapping of ``(x, y)`` pairs → counts.
    marginal_x : Counter
        Mapping of ``x`` → counts.
    marginal_y : Counter
        Mapping of ``y`` → counts.
    n : int or None
        Total number of observations. When ``None``, the sum of *joint*
        values is used.

    Returns
    -------
    float
        Mutual information in bits. Returns ``0.0`` when *n* is 0.
    """
    if n is None:
        n = sum(joint.values())
    if n == 0:
        return 0.0

    mi = 0.0
    for (x, y), count_xy in joint.items():
        p_xy = count_xy / n
        p_x = marginal_x[x] / n
        p_y = marginal_y[y] / n
        if p_xy > 0 and p_x > 0 and p_y > 0:
            mi += p_xy * math.log2(p_xy / (p_x * p_y))

    return mi


# ---------------------------------------------------------------------------
# Channel capacity (Shannon–Hartley analogue)
# ---------------------------------------------------------------------------

def channel_capacity(signal: float, noise: float) -> float:
    """
    Shannon–Hartley channel capacity with unit bandwidth.

    .. math::
        C = \\log_2(1 + S/N)

    Parameters
    ----------
    signal : float
        Signal power (e.g. token frequency).
    noise : float
        Noise power (e.g. total other-token frequency).

    Returns
    -------
    float
        Channel capacity in bits.
    """
    sn_ratio = signal / noise if noise > 0 else signal
    sn_ratio = max(0.0, sn_ratio)
    return math.log2(1 + sn_ratio)


# ---------------------------------------------------------------------------
# Jensen–Shannon divergence
# ---------------------------------------------------------------------------

def js_divergence_from_counters(
    c1: Counter,
    c2: Counter,
) -> float:
    """
    Jensen–Shannon *divergence* between two frequency counters.

    JS divergence is the square of the JS distance returned by
    ``scipy.spatial.distance.jensenshannon``.

    Parameters
    ----------
    c1, c2 : Counter
        Token frequency counters.

    Returns
    -------
    float
        JS divergence. Ranges from 0 (identical) to ``ln(2) ≈ 0.693``
        for fully disjoint distributions (scipy uses natural log internally).
        Returns ``NaN`` if either counter is empty.
    """
    vocab = sorted(set(c1.keys()) | set(c2.keys()))
    p = np.array([c1.get(t, 0) for t in vocab], dtype=float)
    q = np.array([c2.get(t, 0) for t in vocab], dtype=float)

    if p.sum() <= 0 or q.sum() <= 0:
        return float("nan")

    js_dist = jensenshannon(p, q)
    return float(js_dist ** 2)


def js_distance_from_counters(c1: Counter, c2: Counter) -> float:
    """
    Jensen–Shannon *distance* (the square root of JS divergence).

    Parameters
    ----------
    c1, c2 : Counter
        Token frequency counters.

    Returns
    -------
    float
        JS distance. Ranges from 0 (identical) to ``sqrt(ln 2) ≈ 0.833``
        for fully disjoint distributions (scipy's default natural-log base).
        Returns ``NaN`` if either counter is empty.
    """
    vocab = sorted(set(c1.keys()) | set(c2.keys()))
    p = np.array([c1.get(t, 0) for t in vocab], dtype=float)
    q = np.array([c2.get(t, 0) for t in vocab], dtype=float)

    if p.sum() <= 0 or q.sum() <= 0:
        return float("nan")

    return float(jensenshannon(p, q))


def js_divergence_matrix(counters: list[Counter]) -> np.ndarray:
    """
    Compute the pairwise JS divergence matrix for *N* frequency counters.

    Parameters
    ----------
    counters : list[Counter]
        One frequency counter per document.

    Returns
    -------
    np.ndarray
        Symmetric N×N matrix where entry ``[i, j]`` is the JS divergence
        between documents *i* and *j*. Diagonal is 0.
    """
    n = len(counters)
    mat = np.zeros((n, n), dtype=float)

    for i in range(n):
        for j in range(i + 1, n):
            d = js_divergence_from_counters(counters[i], counters[j])
            mat[i, j] = d
            mat[j, i] = d

    return mat
```
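A few illustrative checks of the documented ranges and the MI formula, using made-up counters (this snippet is not shipped in the wheel):

```python
from collections import Counter

from spell_exploder.core.information import (
    channel_capacity,
    js_divergence_from_counters,
    js_divergence_matrix,
    mutual_information,
)

a = Counter({"raven": 3, "moon": 1})
b = Counter({"sun": 2, "stone": 2})
print(js_divergence_from_counters(a, a))            # 0.0 (identical)
print(round(js_divergence_from_counters(a, b), 3))  # 0.693 (disjoint: ln 2)
print(js_divergence_matrix([a, a, b]))              # symmetric 3x3, zero diagonal

# X fully determines Y here, so I(X; Y) = H(Y) = 1 bit.
joint = Counter({("hot", "sun"): 4, ("cold", "moon"): 4})
mx = Counter({"hot": 4, "cold": 4})
my = Counter({"sun": 4, "moon": 4})
print(mutual_information(joint, mx, my))            # 1.0

print(channel_capacity(signal=3.0, noise=1.0))      # log2(1 + 3) = 2.0
```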
spell_exploder/core/nlp.py
@@ -0,0 +1,107 @@

```python
"""
Shared NLP infrastructure: spaCy model management and tokenization.

Models are loaded lazily (never at import time) and cached so that repeated
calls with the same configuration reuse the same ``spacy.Language`` instance.
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import spacy

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Model cache
# ---------------------------------------------------------------------------
_model_cache: dict[str, spacy.Language] = {}


def get_nlp(
    model_name: str = "en_core_web_sm",
    disable: list[str] | None = None,
) -> spacy.Language:
    """
    Load (and cache) a spaCy model.

    Parameters
    ----------
    model_name : str
        Any installed spaCy model name (e.g. ``"en_core_web_sm"``).
    disable : list[str] or None
        Pipeline components to disable (e.g. ``["ner"]``).

    Returns
    -------
    spacy.Language
        The loaded (or cached) pipeline.

    Raises
    ------
    spell_exploder.exceptions.ModelNotLoadedError
        If the requested model is not installed.
    """
    import spacy
    from spell_exploder.exceptions import ModelNotLoadedError

    disable = disable or []
    cache_key = f"{model_name}|{','.join(sorted(disable))}"

    if cache_key not in _model_cache:
        try:
            logger.debug("Loading spaCy model %r (disable=%s)", model_name, disable)
            _model_cache[cache_key] = spacy.load(model_name, disable=disable)
        except OSError as exc:
            raise ModelNotLoadedError(
                f"spaCy model '{model_name}' is not installed. "
                f"Run: python -m spacy download {model_name}"
            ) from exc

    return _model_cache[cache_key]


def clear_model_cache() -> None:
    """Remove all cached models (useful in tests or to reclaim memory)."""
    _model_cache.clear()


# ---------------------------------------------------------------------------
# Tokenization
# ---------------------------------------------------------------------------

def tokenize(
    text: str,
    model_name: str = "en_core_web_sm",
    nlp: spacy.Language | None = None,
) -> list[str]:
    """
    Tokenize *text* into lowercase lemmas, keeping only alphabetic tokens
    and discarding stop-words.

    This is the canonical tokenizer used across Spellcaster for
    entropy, information-theoretic, and frequency-based analyses.

    Parameters
    ----------
    text : str
        Raw input text.
    model_name : str
        spaCy model to use (ignored when *nlp* is provided).
    nlp : spacy.Language or None
        Pre-loaded pipeline. When ``None``, a model is loaded via
        :func:`get_nlp` with the parser and NER disabled for speed.

    Returns
    -------
    list[str]
        Ordered list of lowercase lemma strings.
    """
    if nlp is None:
        nlp = get_nlp(model_name, disable=["parser", "ner"])

    doc = nlp(text.lower())
    return [t.lemma_ for t in doc if t.is_alpha and not t.is_stop]
```
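A usage sketch for the cache and tokenizer (assumes `en_core_web_sm` is installed; otherwise `get_nlp` raises `ModelNotLoadedError` with install instructions, and the printed lemmas are indicative since they depend on the model version):

```python
from spell_exploder.core.nlp import clear_model_cache, get_nlp, tokenize

nlp = get_nlp("en_core_web_sm", disable=["parser", "ner"])
# The cache key sorts `disable`, so component order does not fragment the cache:
assert get_nlp("en_core_web_sm", disable=["ner", "parser"]) is nlp

tokens = tokenize("The ravens were circling the old tower.", nlp=nlp)
print(tokens)  # lemmas, stop-words dropped, e.g. ['raven', 'circle', 'old', 'tower']

clear_model_cache()  # e.g. between tests, or to reclaim memory
```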
spell_exploder/exceptions.py
@@ -0,0 +1,25 @@

```python
"""Spellcaster exception hierarchy."""


class SpellcasterError(Exception):
    """Base exception for all spell_exploder errors."""


class InsufficientDataError(SpellcasterError):
    """Raised when input data is too small or empty for meaningful analysis."""


class ModelNotLoadedError(SpellcasterError):
    """Raised when a required NLP model is not available."""


class OptionalDependencyError(SpellcasterError):
    """Raised when an optional dependency is required but not installed."""

    def __init__(self, package: str, extra: str):
        self.package = package
        self.extra = extra
        super().__init__(
            f"'{package}' is required for this feature. "
            f"Install it with: pip install spell-exploder[{extra}]"
        )
```
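Because the constructor builds its own install hint, call sites only pass the package and extra names. A sketch of the intended pattern; the `graphs` extra name here is a guess from the package layout (there is a `spell_exploder/graphs/` subpackage), not something this diff confirms:

```python
from spell_exploder.exceptions import OptionalDependencyError, SpellcasterError

try:
    raise OptionalDependencyError("networkx", "graphs")
except SpellcasterError as exc:  # the base class catches the whole hierarchy
    print(exc.package, exc.extra)  # networkx graphs
    print(exc)
    # 'networkx' is required for this feature. Install it with:
    # pip install spell-exploder[graphs]
```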
spell_exploder/extractors/__init__.py
@@ -0,0 +1,35 @@

```python
"""
NLP extraction modules for Spellcaster.

* :mod:`.action_frames` — Verb-centred action frame extraction.
* :mod:`.noun_dependencies` — Schema–valence noun dependency triples.
* :mod:`.sentence_parser` — Sentence segmentation with POS tags.
"""

from spell_exploder.extractors.action_frames import (
    ActionFrame,
    extract_action_frames,
    make_hashable_frame,
)
from spell_exploder.extractors.noun_dependencies import (
    NounDependency,
    extract_noun_dependencies,
)
from spell_exploder.extractors.sentence_parser import (
    DEFAULT_ABBREVIATIONS,
    ParsedSentence,
    parse_sentences,
    split_sentences_simple,
)

__all__ = [
    "ActionFrame",
    "extract_action_frames",
    "make_hashable_frame",
    "NounDependency",
    "extract_noun_dependencies",
    "ParsedSentence",
    "parse_sentences",
    "split_sentences_simple",
    "DEFAULT_ABBREVIATIONS",
]
```
spell_exploder/extractors/action_frames.py
@@ -0,0 +1,133 @@

```python
"""
Action frame extraction from text.

An *action frame* is a verb-centred structure capturing who did what
to whom, extracted from spaCy dependency parses. Each frame records
the verb lemma together with its nominal subjects, objects, and other
syntactic dependents.
"""

from __future__ import annotations

from typing import Any, TYPE_CHECKING

if TYPE_CHECKING:
    import spacy

from spell_exploder.core.nlp import get_nlp


# ---------------------------------------------------------------------------
# Public types
# ---------------------------------------------------------------------------

# An action frame is a plain dict for now; ``ActionFrame`` is a type alias
# documented below. Using a dict keeps it JSON-serializable and easy to
# work with in pandas.
ActionFrame = dict[str, Any]
"""
Keys
----
verb : str
    Verb lemma.
subjects : list[str]
    Nominal-subject lemmas (``nsubj``, ``nsubjpass``).
objects : list[str]
    Object / complement lemmas (``dobj``, ``pobj``, ``attr``,
    ``ccomp``, ``xcomp``).
other_deps : list[tuple[str, str]]
    ``(dep_label, lemma)`` for all other children.
"""

# Dependency labels grouped by role
_SUBJECT_DEPS = frozenset({"nsubj", "nsubjpass"})
_OBJECT_DEPS = frozenset({"dobj", "pobj", "attr", "ccomp", "xcomp"})


# ---------------------------------------------------------------------------
# Extraction
# ---------------------------------------------------------------------------

def extract_action_frames(
    text: str,
    *,
    nlp: spacy.Language | None = None,
    model_name: str = "en_core_web_sm",
) -> list[ActionFrame]:
    """
    Extract verb-centred action frames from *text*.

    Parameters
    ----------
    text : str
        Raw input text.
    nlp : spacy.Language or None
        Pre-loaded pipeline (must include the ``parser`` component).
        When ``None``, a model is loaded via :func:`~spell_exploder.core.nlp.get_nlp`
        with NER disabled for speed.
    model_name : str
        spaCy model name (used only when *nlp* is ``None``).

    Returns
    -------
    list[ActionFrame]
        One dict per verb token found in the text.

    Examples
    --------
    >>> frames = extract_action_frames("The cat chased the mouse.")
    >>> frames[0]["verb"]
    'chase'
    >>> frames[0]["subjects"]
    ['cat']
    >>> frames[0]["objects"]
    ['mouse']
    """
    if nlp is None:
        nlp = get_nlp(model_name, disable=["ner"])

    doc = nlp(text)
    frames: list[ActionFrame] = []

    for tok in doc:
        if tok.pos_ != "VERB":
            continue

        subjects = [
            c.lemma_ for c in tok.children
            if c.dep_.startswith("nsubj")
        ]
        objects = [
            c.lemma_ for c in tok.children
            if c.dep_ in _OBJECT_DEPS
        ]
        other_deps = [
            (c.dep_, c.lemma_) for c in tok.children
            if c.dep_ not in _SUBJECT_DEPS and c.dep_ not in _OBJECT_DEPS
        ]

        frames.append({
            "verb": tok.lemma_,
            "subjects": subjects,
            "objects": objects,
            "other_deps": other_deps,
        })

    return frames


def make_hashable_frame(frame: ActionFrame) -> tuple:
    """
    Convert an action frame dict into a hashable tuple suitable for
    counting in a :class:`~collections.Counter`.

    Returns
    -------
    tuple
        ``(verb, sorted_subjects, sorted_objects, sorted_other_deps)``
    """
    return (
        frame["verb"],
        tuple(sorted(frame["subjects"])),
        tuple(sorted(frame["objects"])),
        tuple(sorted(frame["other_deps"])),
    )
```
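Since `make_hashable_frame` exists to feed a `Counter`, here is a short sketch of that loop (not part of the package; assumes `en_core_web_sm` is installed, and the exact frame tuples depend on the parser):

```python
from collections import Counter

from spell_exploder.extractors import extract_action_frames, make_hashable_frame

text = "The cat chased the mouse. The cat chased the mouse. The dog slept."
frames = extract_action_frames(text)
counts = Counter(make_hashable_frame(f) for f in frames)

top_frame, n = counts.most_common(1)[0]
print(n, top_frame)  # e.g. 2 ('chase', ('cat',), ('mouse',), ...)
```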