spell-exploder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. spell_exploder/__init__.py +205 -0
  2. spell_exploder/_version.py +1 -0
  3. spell_exploder/analyzers/__init__.py +18 -0
  4. spell_exploder/analyzers/adaptive_evolution.py +453 -0
  5. spell_exploder/analyzers/complexity_index.py +224 -0
  6. spell_exploder/analyzers/keyword_erp.py +477 -0
  7. spell_exploder/analyzers/valence_model.py +523 -0
  8. spell_exploder/core/__init__.py +45 -0
  9. spell_exploder/core/compression.py +103 -0
  10. spell_exploder/core/entropy.py +203 -0
  11. spell_exploder/core/information.py +179 -0
  12. spell_exploder/core/nlp.py +107 -0
  13. spell_exploder/exceptions.py +25 -0
  14. spell_exploder/extractors/__init__.py +35 -0
  15. spell_exploder/extractors/action_frames.py +133 -0
  16. spell_exploder/extractors/noun_dependencies.py +96 -0
  17. spell_exploder/extractors/sentence_parser.py +168 -0
  18. spell_exploder/graphs/__init__.py +0 -0
  19. spell_exploder/io/__init__.py +14 -0
  20. spell_exploder/io/exporters.py +94 -0
  21. spell_exploder/io/readers.py +117 -0
  22. spell_exploder/results/__init__.py +44 -0
  23. spell_exploder/results/complexity.py +111 -0
  24. spell_exploder/results/evolution.py +136 -0
  25. spell_exploder/results/keyword.py +139 -0
  26. spell_exploder/results/valence.py +134 -0
  27. spell_exploder/utils/__init__.py +11 -0
  28. spell_exploder/utils/imports.py +48 -0
  29. spell_exploder/utils/smoothing.py +42 -0
  30. spell_exploder/utils/statistics.py +54 -0
  31. spell_exploder/visualization/__init__.py +27 -0
  32. spell_exploder/visualization/plots.py +562 -0
  33. spell_exploder-0.1.0.dist-info/METADATA +221 -0
  34. spell_exploder-0.1.0.dist-info/RECORD +37 -0
  35. spell_exploder-0.1.0.dist-info/WHEEL +5 -0
  36. spell_exploder-0.1.0.dist-info/licenses/LICENSE +21 -0
  37. spell_exploder-0.1.0.dist-info/top_level.txt +1 -0
spell_exploder/core/entropy.py
@@ -0,0 +1,203 @@
+ """
+ Entropy-based measures for text analysis.
+
+ Provides Shannon entropy computation, windowed entropy collapse (measuring
+ local redundancy relative to a document), and multiscale collapse curves
+ that summarize redundancy structure across multiple window sizes.
+ """
+
+ from __future__ import annotations
+
+ import math
+ from collections import Counter
+
+ import numpy as np
+
+
+ # ---------------------------------------------------------------------------
+ # Shannon entropy
+ # ---------------------------------------------------------------------------
+
+ def shannon_entropy(counter: Counter) -> float:
+     """
+     Compute Shannon entropy (in bits) from a frequency counter.
+
+     Parameters
+     ----------
+     counter : Counter
+         Token → count mapping.
+
+     Returns
+     -------
+     float
+         Entropy in bits. Returns ``0.0`` for an empty counter.
+
+     Notes
+     -----
+     .. math::
+         H = -\\sum_{i} p_i \\log_2 p_i
+     """
+     total = sum(counter.values())
+     if total == 0:
+         return 0.0
+     probs = np.array(list(counter.values()), dtype=float) / total
+     # Mask zeros to avoid log(0)
+     probs = probs[probs > 0]
+     return float(-(probs * np.log2(probs)).sum())
+
+
+ # ---------------------------------------------------------------------------
+ # Windowed entropy collapse
+ # ---------------------------------------------------------------------------
+
+ def window_collapse(
+     tokens: list[str],
+     win_size: int = 250,
+ ) -> list[float]:
+     """
+     Compute per-window *entropy collapse* values for non-overlapping windows.
+
+     Entropy collapse for a window is defined as the normalized deficit
+     of the window's entropy relative to the whole-document entropy:
+
+     .. math::
+         \\text{collapse}_w = \\frac{H_{\\text{doc}} - H_w}{H_{\\text{doc}}}
+
+     A value near 1 means the window is highly redundant (low local variety);
+     a value near 0 means the window is as diverse as the full document.
+
+     Parameters
+     ----------
+     tokens : list[str]
+         Full document token sequence.
+     win_size : int
+         Non-overlapping window width (in tokens).
+
+     Returns
+     -------
+     list[float]
+         One collapse value per window. Tail chunks shorter than 2 tokens
+         are dropped.
+     """
+     h_doc = shannon_entropy(Counter(tokens))
+     if h_doc == 0:
+         return []
+
+     collapses: list[float] = []
+     for start in range(0, len(tokens), win_size):
+         chunk = tokens[start : start + win_size]
+         if len(chunk) < 2:
+             continue
+         h_chunk = shannon_entropy(Counter(chunk))
+         collapses.append((h_doc - h_chunk) / h_doc)
+
+     return collapses
+
+
+ # ---------------------------------------------------------------------------
+ # Multiscale collapse
+ # ---------------------------------------------------------------------------
+
+ _DEFAULT_WIN_SIZES: tuple[int, ...] = (25, 50, 100, 250, 500)
+
+
+ def multiscale_collapse_curve(
+     tokens: list[str],
+     win_sizes: tuple[int, ...] = _DEFAULT_WIN_SIZES,
+ ) -> list[dict]:
+     """
+     Compute mean and max entropy collapse at multiple window sizes.
+
+     Parameters
+     ----------
+     tokens : list[str]
+         Full document token sequence.
+     win_sizes : tuple[int, ...]
+         Window widths to evaluate.
+
+     Returns
+     -------
+     list[dict]
+         One dict per window size with keys:
+         ``win_size``, ``n_windows``, ``mean_collapse``, ``max_collapse``.
+     """
+     curve: list[dict] = []
+     for w in win_sizes:
+         cs = window_collapse(tokens, win_size=w)
+         curve.append({
+             "win_size": int(w),
+             "n_windows": len(cs),
+             "mean_collapse": float(np.mean(cs)) if cs else float("nan"),
+             "max_collapse": float(np.max(cs)) if cs else float("nan"),
+         })
+     return curve
+
+
+ def summarize_multiscale_collapse(
+     curve: list[dict],
+     x_scale: str = "log",
+ ) -> dict:
+     """
+     Summarize a multiscale collapse curve into scalar metrics.
+
+     Parameters
+     ----------
+     curve : list[dict]
+         Output of :func:`multiscale_collapse_curve`.
+     x_scale : str
+         ``"log"`` for log-scaled x-axis (window size) in AUC integration,
+         ``"linear"`` for raw window sizes.
+
+     Returns
+     -------
+     dict
+         ``collapse_auc`` — trapezoidal area under the mean-collapse curve.
+         ``collapse_auc_norm`` — AUC divided by x-range (average collapse across scales).
+         ``peak_win_size`` — window size with highest mean collapse.
+         ``peak_mean_collapse`` — that maximum value.
+     """
+     pts = [
+         (d["win_size"], d["mean_collapse"])
+         for d in curve
+         if not math.isnan(d["mean_collapse"])
+     ]
+
+     nan_result = dict(
+         collapse_auc=float("nan"),
+         collapse_auc_norm=float("nan"),
+         peak_win_size=None,
+         peak_mean_collapse=float("nan"),
+     )
+
+     if len(pts) == 0:
+         return nan_result
+
+     if len(pts) == 1:
+         w, m = pts[0]
+         return dict(
+             collapse_auc=0.0,
+             collapse_auc_norm=float(m),
+             peak_win_size=int(w),
+             peak_mean_collapse=float(m),
+         )
+
+     xs = np.array(
+         [math.log(w) if x_scale == "log" else float(w) for w, _ in pts],
+         dtype=float,
+     )
+     ys = np.array([m for _, m in pts], dtype=float)
+
+     # np.trapezoid in numpy 2.x; np.trapz in earlier versions
+     _trapz = getattr(np, "trapezoid", None) or np.trapz
+     auc = float(_trapz(ys, xs))
+     x_range = float(xs.max() - xs.min())
+     auc_norm = float(auc / x_range) if x_range > 0 else float(np.mean(ys))
+
+     peak_win, peak_mean = max(pts, key=lambda t: t[1])
+
+     return dict(
+         collapse_auc=auc,
+         collapse_auc_norm=auc_norm,
+         peak_win_size=int(peak_win),
+         peak_mean_collapse=float(peak_mean),
+     )
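A minimal usage sketch for the entropy helpers above (not part of the package; the import path follows the file list entry spell_exploder/core/entropy.py, and the toy token list is invented for illustration):

    from collections import Counter

    from spell_exploder.core.entropy import (
        multiscale_collapse_curve,
        shannon_entropy,
        summarize_multiscale_collapse,
        window_collapse,
    )

    # Toy token sequence: a highly repetitive stretch followed by a varied one.
    tokens = ["ring"] * 40 + ["cast", "spell", "ward", "rune", "echo"] * 8

    print(shannon_entropy(Counter(tokens)))      # whole-document entropy in bits
    print(window_collapse(tokens, win_size=25))  # one collapse value per 25-token window

    curve = multiscale_collapse_curve(tokens, win_sizes=(10, 25, 50))
    summary = summarize_multiscale_collapse(curve, x_scale="log")
    print(summary["peak_win_size"], summary["collapse_auc_norm"])

The repetitive first half should produce windows with collapse values near 1, while the varied tail pulls the mean back down across scales.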
spell_exploder/core/information.py
@@ -0,0 +1,179 @@
+ """
+ Information-theoretic measures for text analysis.
+
+ Mutual information, channel capacity (Shannon–Hartley analogue),
+ and Jensen–Shannon divergence — with N-text generalizations.
+ """
+
+ from __future__ import annotations
+
+ import math
+ from collections import Counter
+
+ import numpy as np
+ from scipy.spatial.distance import jensenshannon
+
+
+ # ---------------------------------------------------------------------------
+ # Mutual information
+ # ---------------------------------------------------------------------------
+
+ def mutual_information(
+     joint: Counter,
+     marginal_x: Counter,
+     marginal_y: Counter,
+     n: int | None = None,
+ ) -> float:
+     """
+     Compute mutual information I(X; Y) from joint and marginal counters.
+
+     .. math::
+         I(X; Y) = \\sum_{x, y} p(x, y) \\log_2 \\frac{p(x, y)}{p(x)\\,p(y)}
+
+     Parameters
+     ----------
+     joint : Counter
+         Mapping of ``(x, y)`` pairs → counts.
+     marginal_x : Counter
+         Mapping of ``x`` → counts.
+     marginal_y : Counter
+         Mapping of ``y`` → counts.
+     n : int or None
+         Total number of observations. When ``None``, the sum of *joint*
+         values is used.
+
+     Returns
+     -------
+     float
+         Mutual information in bits. Returns ``0.0`` when *n* is 0.
+     """
+     if n is None:
+         n = sum(joint.values())
+     if n == 0:
+         return 0.0
+
+     mi = 0.0
+     for (x, y), count_xy in joint.items():
+         p_xy = count_xy / n
+         p_x = marginal_x[x] / n
+         p_y = marginal_y[y] / n
+         if p_xy > 0 and p_x > 0 and p_y > 0:
+             mi += p_xy * math.log2(p_xy / (p_x * p_y))
+
+     return mi
+
+
+ # ---------------------------------------------------------------------------
+ # Channel capacity (Shannon–Hartley analogue)
+ # ---------------------------------------------------------------------------
+
+ def channel_capacity(signal: float, noise: float) -> float:
+     """
+     Shannon–Hartley channel capacity with unit bandwidth.
+
+     .. math::
+         C = \\log_2(1 + S/N)
+
+     Parameters
+     ----------
+     signal : float
+         Signal power (e.g. token frequency).
+     noise : float
+         Noise power (e.g. total other-token frequency).
+
+     Returns
+     -------
+     float
+         Channel capacity in bits.
+     """
+     sn_ratio = signal / noise if noise > 0 else signal
+     sn_ratio = max(0.0, sn_ratio)
+     return math.log2(1 + sn_ratio)
+
+
+ # ---------------------------------------------------------------------------
+ # Jensen–Shannon divergence
+ # ---------------------------------------------------------------------------
+
+ def js_divergence_from_counters(
+     c1: Counter,
+     c2: Counter,
+ ) -> float:
+     """
+     Jensen–Shannon *divergence* between two frequency counters.
+
+     JS divergence is the square of the JS distance returned by
+     ``scipy.spatial.distance.jensenshannon``.
+
+     Parameters
+     ----------
+     c1, c2 : Counter
+         Token frequency counters.
+
+     Returns
+     -------
+     float
+         JS divergence. Ranges from 0 (identical) to ``ln(2) ≈ 0.693``
+         for fully disjoint distributions (scipy uses natural log internally).
+         Returns ``NaN`` if either counter is empty.
+     """
+     vocab = sorted(set(c1.keys()) | set(c2.keys()))
+     p = np.array([c1.get(t, 0) for t in vocab], dtype=float)
+     q = np.array([c2.get(t, 0) for t in vocab], dtype=float)
+
+     if p.sum() <= 0 or q.sum() <= 0:
+         return float("nan")
+
+     js_dist = jensenshannon(p, q)
+     return float(js_dist ** 2)
+
+
+ def js_distance_from_counters(c1: Counter, c2: Counter) -> float:
+     """
+     Jensen–Shannon *distance* (the square root of JS divergence).
+
+     Parameters
+     ----------
+     c1, c2 : Counter
+         Token frequency counters.
+
+     Returns
+     -------
+     float
+         JS distance; at most ``sqrt(ln 2) ≈ 0.833`` with scipy's natural-log base.
+     """
+     vocab = sorted(set(c1.keys()) | set(c2.keys()))
+     p = np.array([c1.get(t, 0) for t in vocab], dtype=float)
+     q = np.array([c2.get(t, 0) for t in vocab], dtype=float)
+
+     if p.sum() <= 0 or q.sum() <= 0:
+         return float("nan")
+
+     return float(jensenshannon(p, q))
+
+
+ def js_divergence_matrix(counters: list[Counter]) -> np.ndarray:
+     """
+     Compute the pairwise JS divergence matrix for *N* frequency counters.
+
+     Parameters
+     ----------
+     counters : list[Counter]
+         One frequency counter per document.
+
+     Returns
+     -------
+     np.ndarray
+         Symmetric N×N matrix where entry ``[i, j]`` is the JS divergence
+         between documents *i* and *j*. Diagonal is 0.
+     """
+     n = len(counters)
+     mat = np.zeros((n, n), dtype=float)
+
+     for i in range(n):
+         for j in range(i + 1, n):
+             d = js_divergence_from_counters(counters[i], counters[j])
+             mat[i, j] = d
+             mat[j, i] = d
+
+     return mat
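A minimal sketch of how these helpers compose (illustrative only; the import path mirrors spell_exploder/core/information.py from the file list, and the counters are toy data):

    from collections import Counter

    import numpy as np

    from spell_exploder.core.information import (
        channel_capacity,
        js_divergence_from_counters,
        js_divergence_matrix,
        mutual_information,
    )

    # Toy bigram observations: (left_token, right_token) pairs.
    pairs = [("dark", "forest"), ("dark", "forest"), ("dark", "cave"), ("old", "forest")]
    joint = Counter(pairs)
    marg_x = Counter(x for x, _ in pairs)
    marg_y = Counter(y for _, y in pairs)
    print(mutual_information(joint, marg_x, marg_y))  # bits

    print(channel_capacity(signal=12.0, noise=4.0))   # log2(1 + 3) = 2.0

    docs = [Counter("a b b c".split()), Counter("a a d d".split()), Counter("c c c".split())]
    print(js_divergence_from_counters(docs[0], docs[1]))
    print(np.round(js_divergence_matrix(docs), 3))    # symmetric 3×3, zero diagonal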
spell_exploder/core/nlp.py
@@ -0,0 +1,107 @@
+ """
+ Shared NLP infrastructure: spaCy model management and tokenization.
+
+ Models are loaded lazily (never at import time) and cached so that repeated
+ calls with the same configuration reuse the same ``spacy.Language`` instance.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     import spacy
+
+ logger = logging.getLogger(__name__)
+
+ # ---------------------------------------------------------------------------
+ # Model cache
+ # ---------------------------------------------------------------------------
+ _model_cache: dict[str, spacy.Language] = {}
+
+
+ def get_nlp(
+     model_name: str = "en_core_web_sm",
+     disable: list[str] | None = None,
+ ) -> spacy.Language:
+     """
+     Load (and cache) a spaCy model.
+
+     Parameters
+     ----------
+     model_name : str
+         Any installed spaCy model name (e.g. ``"en_core_web_sm"``).
+     disable : list[str] or None
+         Pipeline components to disable (e.g. ``["ner"]``).
+
+     Returns
+     -------
+     spacy.Language
+         The loaded (or cached) pipeline.
+
+     Raises
+     ------
+     spell_exploder.exceptions.ModelNotLoadedError
+         If the requested model is not installed.
+     """
+     import spacy
+     from spell_exploder.exceptions import ModelNotLoadedError
+
+     disable = disable or []
+     cache_key = f"{model_name}|{','.join(sorted(disable))}"
+
+     if cache_key not in _model_cache:
+         try:
+             logger.debug("Loading spaCy model %r (disable=%s)", model_name, disable)
+             _model_cache[cache_key] = spacy.load(model_name, disable=disable)
+         except OSError as exc:
+             raise ModelNotLoadedError(
+                 f"spaCy model '{model_name}' is not installed. "
+                 f"Run: python -m spacy download {model_name}"
+             ) from exc
+
+     return _model_cache[cache_key]
+
+
+ def clear_model_cache() -> None:
+     """Remove all cached models (useful in tests or to reclaim memory)."""
+     _model_cache.clear()
+
+
+ # ---------------------------------------------------------------------------
+ # Tokenization
+ # ---------------------------------------------------------------------------
+
+ def tokenize(
+     text: str,
+     model_name: str = "en_core_web_sm",
+     nlp: spacy.Language | None = None,
+ ) -> list[str]:
+     """
+     Tokenize *text* into lowercase lemmas, keeping only alphabetic tokens
+     and discarding stop-words.
+
+     This is the canonical tokenizer used across Spellcaster for
+     entropy, information-theoretic, and frequency-based analyses.
+
+     Parameters
+     ----------
+     text : str
+         Raw input text.
+     model_name : str
+         spaCy model to use (ignored when *nlp* is provided).
+     nlp : spacy.Language or None
+         Pre-loaded pipeline. When ``None``, a model is loaded via
+         :func:`get_nlp` with the parser and NER disabled for speed.
+
+     Returns
+     -------
+     list[str]
+         Ordered list of lowercase lemma strings.
+     """
+     if nlp is None:
+         nlp = get_nlp(model_name, disable=["parser", "ner"])
+
+     doc = nlp(text.lower())
+     return [t.lemma_ for t in doc if t.is_alpha and not t.is_stop]
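A minimal sketch of the lazy-loading pattern above (illustrative; it assumes en_core_web_sm is installed, otherwise get_nlp raises ModelNotLoadedError, and the sample sentence is made up):

    from spell_exploder.core.nlp import clear_model_cache, get_nlp, tokenize
    from spell_exploder.exceptions import ModelNotLoadedError

    try:
        # First call loads and caches the pipeline; later calls with the
        # same (model, disable) pair reuse the cached instance.
        nlp = get_nlp("en_core_web_sm", disable=["parser", "ner"])
        tokens = tokenize("The wizards were casting brighter spells.", nlp=nlp)
        print(tokens)  # lowercase lemmas, stop-words and non-alphabetic tokens removed
    except ModelNotLoadedError as err:
        print(err)  # message suggests: python -m spacy download en_core_web_sm

    clear_model_cache()  # e.g. between tests, or to reclaim memory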
spell_exploder/exceptions.py
@@ -0,0 +1,25 @@
+ """Spellcaster exception hierarchy."""
+
+
+ class SpellcasterError(Exception):
+     """Base exception for all spell_exploder errors."""
+
+
+ class InsufficientDataError(SpellcasterError):
+     """Raised when input data is too small or empty for meaningful analysis."""
+
+
+ class ModelNotLoadedError(SpellcasterError):
+     """Raised when a required NLP model is not available."""
+
+
+ class OptionalDependencyError(SpellcasterError):
+     """Raised when an optional dependency is required but not installed."""
+
+     def __init__(self, package: str, extra: str):
+         self.package = package
+         self.extra = extra
+         super().__init__(
+             f"'{package}' is required for this feature. "
+             f"Install it with: pip install spell-exploder[{extra}]"
+         )
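A short illustration of the OptionalDependencyError message format (the package and extra names here are hypothetical, chosen only to show the constructor):

    from spell_exploder.exceptions import OptionalDependencyError, SpellcasterError

    err = OptionalDependencyError(package="plotly", extra="viz")
    print(err.package, err.extra)
    print(err)  # "'plotly' is required for this feature. Install it with: pip install spell-exploder[viz]"
    assert isinstance(err, SpellcasterError)  # all package errors share one base class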
spell_exploder/extractors/__init__.py
@@ -0,0 +1,35 @@
+ """
+ NLP extraction modules for Spellcaster.
+
+ * :mod:`.action_frames` — Verb-centred action frame extraction.
+ * :mod:`.noun_dependencies` — Schema–valence noun dependency triples.
+ * :mod:`.sentence_parser` — Sentence segmentation with POS tags.
+ """
+
+ from spell_exploder.extractors.action_frames import (
+     ActionFrame,
+     extract_action_frames,
+     make_hashable_frame,
+ )
+ from spell_exploder.extractors.noun_dependencies import (
+     NounDependency,
+     extract_noun_dependencies,
+ )
+ from spell_exploder.extractors.sentence_parser import (
+     DEFAULT_ABBREVIATIONS,
+     ParsedSentence,
+     parse_sentences,
+     split_sentences_simple,
+ )
+
+ __all__ = [
+     "ActionFrame",
+     "extract_action_frames",
+     "make_hashable_frame",
+     "NounDependency",
+     "extract_noun_dependencies",
+     "ParsedSentence",
+     "parse_sentences",
+     "split_sentences_simple",
+     "DEFAULT_ABBREVIATIONS",
+ ]
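The re-exports mean downstream code can import from the subpackage directly rather than from the individual modules, e.g. (sketch):

    from spell_exploder.extractors import extract_action_frames, parse_sentences, split_sentences_simple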
spell_exploder/extractors/action_frames.py
@@ -0,0 +1,133 @@
+ """
+ Action frame extraction from text.
+
+ An *action frame* is a verb-centred structure capturing who did what
+ to whom, extracted from spaCy dependency parses. Each frame records
+ the verb lemma together with its nominal subjects, objects, and other
+ syntactic dependents.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     import spacy
+
+ from spell_exploder.core.nlp import get_nlp
+
+
+ # ---------------------------------------------------------------------------
+ # Public types
+ # ---------------------------------------------------------------------------
+
+ # An action frame is a plain dict for now, aliased as ``ActionFrame`` for clarity.
+ # Using a dict keeps it JSON-serializable and easy to work with in pandas.
+ ActionFrame = dict[str, Any]
+ """
+ Keys
+ ----
+ verb : str
+     Verb lemma.
+ subjects : list[str]
+     Nominal-subject lemmas (``nsubj``, ``nsubjpass``).
+ objects : list[str]
+     Object / complement lemmas (``dobj``, ``pobj``, ``attr``,
+     ``ccomp``, ``xcomp``).
+ other_deps : list[tuple[str, str]]
+     ``(dep_label, lemma)`` for all other children.
+ """
+
+ # Dependency labels grouped by role
+ _SUBJECT_DEPS = frozenset({"nsubj", "nsubjpass"})
+ _OBJECT_DEPS = frozenset({"dobj", "pobj", "attr", "ccomp", "xcomp"})
+
+
+ # ---------------------------------------------------------------------------
+ # Extraction
+ # ---------------------------------------------------------------------------
+
+ def extract_action_frames(
+     text: str,
+     *,
+     nlp: spacy.Language | None = None,
+     model_name: str = "en_core_web_sm",
+ ) -> list[ActionFrame]:
+     """
+     Extract verb-centred action frames from *text*.
+
+     Parameters
+     ----------
+     text : str
+         Raw input text.
+     nlp : spacy.Language or None
+         Pre-loaded pipeline (must include the ``parser`` component).
+         When ``None``, a model is loaded via :func:`~spell_exploder.core.nlp.get_nlp`
+         with NER disabled for speed.
+     model_name : str
+         spaCy model name (used only when *nlp* is ``None``).
+
+     Returns
+     -------
+     list[ActionFrame]
+         One dict per verb token found in the text.
+
+     Examples
+     --------
+     >>> frames = extract_action_frames("The cat chased the mouse.")
+     >>> frames[0]["verb"]
+     'chase'
+     >>> frames[0]["subjects"]
+     ['cat']
+     >>> frames[0]["objects"]
+     ['mouse']
+     """
+     if nlp is None:
+         nlp = get_nlp(model_name, disable=["ner"])
+
+     doc = nlp(text)
+     frames: list[ActionFrame] = []
+
+     for tok in doc:
+         if tok.pos_ != "VERB":
+             continue
+
+         subjects = [
+             c.lemma_ for c in tok.children
+             if c.dep_.startswith("nsubj")
+         ]
+         objects = [
+             c.lemma_ for c in tok.children
+             if c.dep_ in _OBJECT_DEPS
+         ]
+         other_deps = [
+             (c.dep_, c.lemma_) for c in tok.children
+             if c.dep_ not in _SUBJECT_DEPS and c.dep_ not in _OBJECT_DEPS
+         ]
+
+         frames.append({
+             "verb": tok.lemma_,
+             "subjects": subjects,
+             "objects": objects,
+             "other_deps": other_deps,
+         })
+
+     return frames
+
+
+ def make_hashable_frame(frame: ActionFrame) -> tuple:
+     """
+     Convert an action frame dict into a hashable tuple suitable for
+     counting in a :class:`~collections.Counter`.
+
+     Returns
+     -------
+     tuple
+         ``(verb, sorted_subjects, sorted_objects, sorted_other_deps)``
+     """
+     return (
+         frame["verb"],
+         tuple(sorted(frame["subjects"])),
+         tuple(sorted(frame["objects"])),
+         tuple(sorted(frame["other_deps"])),
+     )
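A minimal sketch combining the two functions above to count repeated frames (illustrative; it requires an installed spaCy model with a dependency parser, and the sample text extends the docstring example):

    from collections import Counter

    from spell_exploder.extractors.action_frames import (
        extract_action_frames,
        make_hashable_frame,
    )

    text = "The cat chased the mouse. The cat chased the mouse. The dog watched."
    frames = extract_action_frames(text)

    # Collapse each frame to a hashable tuple so identical frames can be counted.
    counts = Counter(make_hashable_frame(f) for f in frames)
    for frame_key, n in counts.most_common(3):
        verb, subjects, objects, _ = frame_key
        print(verb, subjects, objects, n)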