spell-exploder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spell_exploder/__init__.py +205 -0
- spell_exploder/_version.py +1 -0
- spell_exploder/analyzers/__init__.py +18 -0
- spell_exploder/analyzers/adaptive_evolution.py +453 -0
- spell_exploder/analyzers/complexity_index.py +224 -0
- spell_exploder/analyzers/keyword_erp.py +477 -0
- spell_exploder/analyzers/valence_model.py +523 -0
- spell_exploder/core/__init__.py +45 -0
- spell_exploder/core/compression.py +103 -0
- spell_exploder/core/entropy.py +203 -0
- spell_exploder/core/information.py +179 -0
- spell_exploder/core/nlp.py +107 -0
- spell_exploder/exceptions.py +25 -0
- spell_exploder/extractors/__init__.py +35 -0
- spell_exploder/extractors/action_frames.py +133 -0
- spell_exploder/extractors/noun_dependencies.py +96 -0
- spell_exploder/extractors/sentence_parser.py +168 -0
- spell_exploder/graphs/__init__.py +0 -0
- spell_exploder/io/__init__.py +14 -0
- spell_exploder/io/exporters.py +94 -0
- spell_exploder/io/readers.py +117 -0
- spell_exploder/results/__init__.py +44 -0
- spell_exploder/results/complexity.py +111 -0
- spell_exploder/results/evolution.py +136 -0
- spell_exploder/results/keyword.py +139 -0
- spell_exploder/results/valence.py +134 -0
- spell_exploder/utils/__init__.py +11 -0
- spell_exploder/utils/imports.py +48 -0
- spell_exploder/utils/smoothing.py +42 -0
- spell_exploder/utils/statistics.py +54 -0
- spell_exploder/visualization/__init__.py +27 -0
- spell_exploder/visualization/plots.py +562 -0
- spell_exploder-0.1.0.dist-info/METADATA +221 -0
- spell_exploder-0.1.0.dist-info/RECORD +37 -0
- spell_exploder-0.1.0.dist-info/WHEEL +5 -0
- spell_exploder-0.1.0.dist-info/licenses/LICENSE +21 -0
- spell_exploder-0.1.0.dist-info/top_level.txt +1 -0
spell_exploder/extractors/noun_dependencies.py
@@ -0,0 +1,96 @@
"""
Noun-dependency extraction (schema–valence pairs).

Extracts structured relationships between nouns and their modifiers
or governing verbs. In Spellcaster's terminology:

* **Schema keyword** — the noun lemma (the concept being modified).
* **Valence keyword** — the adjective or verb lemma that colours the
  noun's meaning in context.
* **Dependency type** — the syntactic relation (``amod``, ``nsubj``,
  ``dobj``).

These triples power the *valence entropy* and *semantic breadth*
metrics in the Valence Model analyzer.
"""

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import spacy

from spell_exploder.core.nlp import get_nlp


# Type alias for a single dependency triple
NounDependency = tuple[str, str, str]
"""``(schema_keyword, valence_keyword, dep_type)``"""


def extract_noun_dependencies(
    text: str,
    *,
    nlp: spacy.Language | None = None,
    model_name: str = "en_core_web_sm",
) -> list[NounDependency]:
    """
    Extract noun–adjective and noun–verb dependency triples from *text*.

    Three dependency patterns are captured:

    1. **amod** — An adjective modifying a noun
       (e.g. *"quick fox"* → ``("fox", "quick", "amod")``).
    2. **nsubj** — A noun serving as subject of a verb
       (e.g. *"The fox jumps"* → ``("fox", "jump", "nsubj")``).
    3. **dobj** — A noun serving as direct object of a verb
       (e.g. *"chase the mouse"* → ``("mouse", "chase", "dobj")``).

    Stop-word valence keywords are filtered out.

    Parameters
    ----------
    text : str
        Raw input text.
    nlp : spacy.Language or None
        Pre-loaded pipeline (must include ``parser``).
        When ``None``, loaded via :func:`~spell_exploder.core.nlp.get_nlp`.
    model_name : str
        spaCy model name (used only when *nlp* is ``None``).

    Returns
    -------
    list[NounDependency]
        List of ``(schema_keyword, valence_keyword, dep_type)`` tuples.
    """
    if nlp is None:
        nlp = get_nlp(model_name, disable=["ner"])

    doc = nlp(text)
    dependencies: list[NounDependency] = []

    for tok in doc:
        # --- Pattern 1: adjective modifier of a noun ---
        if tok.pos_ in {"NOUN", "PROPN"}:
            schema = tok.lemma_
            for child in tok.children:
                if child.dep_ == "amod":
                    valence = child.lemma_
                    if not nlp.vocab[valence].is_stop:
                        dependencies.append((schema, valence, "amod"))

        # --- Patterns 2 & 3: noun as subject or object of a verb ---
        if tok.pos_ == "VERB":
            valence = tok.lemma_
            if nlp.vocab[valence].is_stop:
                continue
            for child in tok.children:
                if child.pos_ in {"NOUN", "PROPN"}:
                    schema = child.lemma_
                    if child.dep_ == "nsubj":
                        dependencies.append((schema, valence, "nsubj"))
                    elif child.dep_ == "dobj":
                        dependencies.append((schema, valence, "dobj"))

    return dependencies
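A minimal usage sketch for the extractor above (hypothetical, not part of the package; the exact triples depend on the installed spaCy model, assumed here to be ``en_core_web_sm``):

    from spell_exploder.extractors.noun_dependencies import extract_noun_dependencies

    # Sample sentence; lemmas and dependency labels come from the model.
    triples = extract_noun_dependencies("The quick fox chased the lazy mouse.")
    for schema, valence, dep in triples:
        print(schema, valence, dep)
    # A typical English model should produce something like:
    #   fox quick amod
    #   fox chase nsubj
    #   mouse chase dobj
    #   mouse lazy amod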
spell_exploder/extractors/sentence_parser.py
@@ -0,0 +1,168 @@
"""
Sentence parsing with POS-tag extraction.

Provides sentence segmentation (with a custom abbreviation-aware
boundary detector) and per-sentence POS-tag sequences, which feed
into the Adaptive Evolution (APE) and Keyword ERP (KEPM) analyzers.
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import spacy

from spell_exploder.core.nlp import get_nlp


# ---------------------------------------------------------------------------
# Default abbreviation set for sentence boundary detection
# ---------------------------------------------------------------------------

DEFAULT_ABBREVIATIONS: frozenset[str] = frozenset({
    "mr.", "mrs.", "ms.", "dr.", "prof.", "rev.", "col.",
    "gen.", "maj.", "capt.", "lt.", "sgt.", "pvt.",
    "jr.", "sr.", "etc.", "e.g.", "i.e.",
})


# ---------------------------------------------------------------------------
# Result dataclass
# ---------------------------------------------------------------------------

@dataclass(frozen=True)
class ParsedSentence:
    """A single sentence with its POS tag sequence."""

    text: str
    """The raw sentence text (whitespace-stripped)."""

    pos_tags: list[str] = field(default_factory=list)
    """POS tags for every token in the sentence (e.g. ``['DET', 'NOUN', 'VERB', ...]``)."""


# ---------------------------------------------------------------------------
# Core parsing functions
# ---------------------------------------------------------------------------

def _register_boundary_detector(
    nlp: spacy.Language,
    abbreviations: frozenset[str],
) -> spacy.Language:
    """
    Register (once) a custom sentence-boundary component on *nlp* that
    respects abbreviations and newline boundaries.

    The component is inserted **before** the parser, so the parser
    honours the boundaries pre-set here and predicts any remaining
    ones itself.
    """
    from spacy.language import Language as SpacyLanguage

    component_name = "spell_exploder_sent_boundary"

    if component_name in nlp.pipe_names:
        return nlp

    @SpacyLanguage.component(component_name)
    def _boundary_detector(doc):
        if len(doc) == 0:
            return doc
        doc[0].is_sent_start = True

        for i, token in enumerate(doc[:-1]):
            nxt = doc[i + 1]
            # Period followed by a capitalised word starts a new sentence,
            # unless the period belongs to a known abbreviation. When the
            # tokenizer splits the abbreviation from its period, the
            # abbreviation text is the *previous* token, so that is what
            # must be checked against the abbreviation set.
            if (
                token.text == "."
                and nxt.text
                and nxt.text[0].isupper()
                and not nxt.is_space
                and not nxt.is_punct
                and (i == 0 or doc[i - 1].text.lower() + "." not in abbreviations)
            ):
                nxt.is_sent_start = True
            # Newline triggers new sentence
            if token.text in ("\n", "\r", "\r\n") and i + 1 < len(doc):
                nxt.is_sent_start = True

        return doc

    nlp.add_pipe(component_name, before="parser")
    return nlp


def parse_sentences(
    text: str,
    *,
    nlp: spacy.Language | None = None,
    model_name: str = "en_core_web_sm",
    abbreviations: frozenset[str] | None = None,
    use_custom_boundaries: bool = True,
) -> list[ParsedSentence]:
    """
    Segment *text* into sentences with per-token POS tags.

    Parameters
    ----------
    text : str
        Raw input text.
    nlp : spacy.Language or None
        Pre-loaded pipeline. When ``None``, loaded via
        :func:`~spell_exploder.core.nlp.get_nlp`.
    model_name : str
        spaCy model name (used only when *nlp* is ``None``).
    abbreviations : frozenset[str] or None
        Abbreviations that should **not** trigger sentence boundaries
        (e.g. ``"dr."``). Defaults to :data:`DEFAULT_ABBREVIATIONS`.
    use_custom_boundaries : bool
        If ``True``, register a custom boundary detector component
        on the pipeline.

    Returns
    -------
    list[ParsedSentence]
        Ordered list of non-empty sentences with POS tags.
    """
    if nlp is None:
        nlp = get_nlp(model_name)

    if use_custom_boundaries:
        abbrevs = abbreviations if abbreviations is not None else DEFAULT_ABBREVIATIONS
        nlp = _register_boundary_detector(nlp, abbrevs)

    doc = nlp(text)
    results: list[ParsedSentence] = []

    for sent in doc.sents:
        sent_text = sent.text.strip()
        if sent_text:
            pos_tags = [token.pos_ for token in sent]
            results.append(ParsedSentence(text=sent_text, pos_tags=pos_tags))

    return results


def split_sentences_simple(text: str) -> list[str]:
    """
    Lightweight sentence splitter using regex (no spaCy required).

    Splits on periods, newlines, and bullet characters. Used by the
    :class:`~spell_exploder.analyzers.complexity_index.TextComplexityAnalyzer`,
    which does not need POS tags.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list[str]
        Non-empty, whitespace-stripped sentence strings.
    """
    clean = re.sub(r"<[^>]*>", "", text)
    parts = re.split(r"[.\n•]+", clean)
    return [s.strip() for s in parts if s.strip()]
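A short sketch contrasting the two splitters (illustrative; ``parse_sentences`` needs a spaCy model, ``split_sentences_simple`` does not):

    from spell_exploder.extractors.sentence_parser import (
        parse_sentences,
        split_sentences_simple,
    )

    text = "Dr. Smith arrived. The experiment began.\nResults follow."

    # Abbreviation-aware path: "Dr." should not end a sentence here.
    for sent in parse_sentences(text):
        print(sent.text, sent.pos_tags)

    # Regex fallback: splits on every period, including the one in "Dr.".
    print(split_sentences_simple(text))
    # -> ['Dr', 'Smith arrived', 'The experiment began', 'Results follow']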
spell_exploder/graphs/__init__.py
File without changes
spell_exploder/io/__init__.py
@@ -0,0 +1,14 @@
"""
I/O utilities: text loading and result export.
"""

from spell_exploder.io.readers import TextDocument, load_texts, texts_from_strings
from spell_exploder.io.exporters import export_csv, export_json

__all__ = [
    "TextDocument",
    "load_texts",
    "texts_from_strings",
    "export_csv",
    "export_json",
]
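These re-exports make the whole I/O surface importable from the subpackage root, e.g.:

    from spell_exploder.io import load_texts, export_json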
spell_exploder/io/exporters.py
@@ -0,0 +1,94 @@
"""
Export utilities for Spellcaster results.

Provides generic serialisation of any result object to JSON or CSV,
plus a specialised APE evolutionary-dynamics JSON report.
"""

from __future__ import annotations

import json
import logging
from pathlib import Path

import pandas as pd

from spell_exploder.results.evolution import EvolutionResult

logger = logging.getLogger(__name__)


def export_csv(
    result,
    path: str,
    **kwargs,
) -> Path:
    """
    Export any result with a ``.to_dataframe()`` method to CSV.

    Parameters
    ----------
    result
        Any Spellcaster result object (``ComplexityComparisonResult``,
        ``ValenceModelResult``, ``EvolutionResult``, ``KeywordERPResult``).
    path : str
        Output file path.
    **kwargs
        Forwarded to :meth:`pandas.DataFrame.to_csv`.

    Returns
    -------
    Path
        The written file path.
    """
    df = result.to_dataframe()
    p = Path(path)
    df.to_csv(p, index=kwargs.pop("index", False), **kwargs)
    logger.info("Exported %d rows to %s", len(df), p)
    return p


def export_json(
    result,
    path: str,
    indent: int = 2,
) -> Path:
    """
    Export a result to JSON.

    For :class:`~spell_exploder.results.evolution.EvolutionResult`, uses the
    structured ``to_json()`` format. For all other result types, converts
    the ``.to_dataframe()`` output to JSON records.

    Parameters
    ----------
    result
        Any Spellcaster result object.
    path : str
        Output file path.
    indent : int
        JSON indentation.

    Returns
    -------
    Path
        The written file path.
    """
    p = Path(path)

    if isinstance(result, EvolutionResult):
        data = result.to_json()
    elif hasattr(result, "to_dataframe"):
        df = result.to_dataframe()
        data = json.loads(df.to_json(orient="records", default_handler=str))
    else:
        raise TypeError(
            f"Cannot export {type(result).__name__}: "
            "expected a Spellcaster result with .to_dataframe() or .to_json()"
        )

    with open(p, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=indent, ensure_ascii=False, default=str)

    logger.info("Exported JSON to %s", p)
    return p
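Both exporters are duck-typed on ``.to_dataframe()``, so anything that provides it can be exported. A minimal sketch with a hypothetical stand-in result (not a class from the package):

    import pandas as pd

    from spell_exploder.io.exporters import export_csv, export_json

    class StubResult:
        """Hypothetical stand-in exposing the interface the exporters expect."""
        def to_dataframe(self) -> pd.DataFrame:
            return pd.DataFrame({"keyword": ["fox"], "score": [0.7]})

    export_csv(StubResult(), "out.csv")    # written without the index column
    export_json(StubResult(), "out.json")  # a list of record dicts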
spell_exploder/io/readers.py
@@ -0,0 +1,117 @@
"""
Text loading utilities.

Provides a uniform way to load documents from file paths or raw strings,
producing :class:`TextDocument` instances that all analyzers accept.
"""

from __future__ import annotations

import pathlib
from dataclasses import dataclass


@dataclass
class TextDocument:
    """
    A loaded text document with provenance metadata.

    Attributes
    ----------
    path : str
        Original file path, or ``"<inline>"`` for text supplied directly.
    label : str
        Human-readable label (defaults to the filename stem).
    text : str
        Full text content.
    """

    path: str
    label: str
    text: str


def load_texts(
    file_paths: list[str],
    labels: list[str] | None = None,
    encoding: str = "utf-8",
) -> list[TextDocument]:
    """
    Load text files from disk and wrap them as :class:`TextDocument` objects.

    Parameters
    ----------
    file_paths : list[str]
        Paths to ``.txt`` files.
    labels : list[str] or None
        Human-readable labels, one per file. When ``None``, labels
        default to the file-name stems (e.g. ``"essay1"`` for
        ``"/data/essay1.txt"``).
    encoding : str
        Text encoding to use when reading files.

    Returns
    -------
    list[TextDocument]

    Raises
    ------
    FileNotFoundError
        If any file does not exist.
    ValueError
        If *labels* is provided but its length differs from *file_paths*.
    """
    if labels is not None and len(labels) != len(file_paths):
        raise ValueError(
            f"labels length ({len(labels)}) must match "
            f"file_paths length ({len(file_paths)})"
        )

    documents: list[TextDocument] = []
    for i, fp in enumerate(file_paths):
        p = pathlib.Path(fp)
        if not p.exists():
            raise FileNotFoundError(f"Text file not found: {fp}")

        text = p.read_text(encoding=encoding, errors="ignore")
        label = labels[i] if labels is not None else p.stem

        documents.append(TextDocument(path=str(fp), label=label, text=text))

    return documents


def texts_from_strings(
    texts: list[str],
    labels: list[str] | None = None,
) -> list[TextDocument]:
    """
    Wrap raw strings as :class:`TextDocument` objects (no file I/O).

    Useful when text is already in memory — for example, from a
    database or an API response.

    Parameters
    ----------
    texts : list[str]
        Raw text strings.
    labels : list[str] or None
        Human-readable labels. When ``None``, defaults to
        ``"text_0"``, ``"text_1"``, etc.

    Returns
    -------
    list[TextDocument]
    """
    if labels is not None and len(labels) != len(texts):
        raise ValueError(
            f"labels length ({len(labels)}) must match "
            f"texts length ({len(texts)})"
        )

    documents: list[TextDocument] = []
    for i, text in enumerate(texts):
        label = labels[i] if labels is not None else f"text_{i}"
        documents.append(TextDocument(path="<inline>", label=label, text=text))

    return documents
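Typical usage of both loaders (paths and labels are illustrative):

    from spell_exploder.io.readers import load_texts, texts_from_strings

    # From disk: labels default to the file-name stems ("essay1", "essay2").
    docs = load_texts(["data/essay1.txt", "data/essay2.txt"])

    # From memory: the path is recorded as "<inline>".
    docs = texts_from_strings(["First draft.", "Second draft."], labels=["a", "b"])
    print(docs[0].path, docs[0].label)  # <inline> a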
spell_exploder/results/__init__.py
@@ -0,0 +1,44 @@
"""
Structured result objects for all Spellcaster analyzers.

Every result class provides a ``.to_dataframe()`` method that produces
a flat :class:`pandas.DataFrame` for exploratory analysis.
"""

from spell_exploder.results.complexity import (
    ComplexityComparisonResult,
    ComplexityFlowResult,
    SentenceMetrics,
)
from spell_exploder.results.evolution import (
    EvolutionaryStatus,
    EvolutionResult,
    POSComposition,
    SpeciesRecord,
)
from spell_exploder.results.keyword import (
    CrossKeywordEntanglement,
    FileKeywordResult,
    KeywordERPResult,
    KeywordMeasures,
)
from spell_exploder.results.valence import (
    PostMetrics,
    ValenceModelResult,
)

__all__ = [
    "SentenceMetrics",
    "ComplexityFlowResult",
    "ComplexityComparisonResult",
    "PostMetrics",
    "ValenceModelResult",
    "EvolutionaryStatus",
    "SpeciesRecord",
    "POSComposition",
    "EvolutionResult",
    "KeywordMeasures",
    "CrossKeywordEntanglement",
    "FileKeywordResult",
    "KeywordERPResult",
]
spell_exploder/results/complexity.py
@@ -0,0 +1,111 @@
"""
Result dataclasses for the Complexity Index (LCX) analyzer.
"""

from __future__ import annotations

from dataclasses import dataclass, field

import numpy as np
import pandas as pd


@dataclass
class SentenceMetrics:
    """Metrics for a single sentence within a complexity flow."""

    text: str
    """Raw sentence text."""

    index: int
    """Zero-based position in the document."""

    k_hist: int
    """Cumulative compressed size of all text up to and including this sentence (bytes)."""

    volatility: int
    """Levenshtein edit distance from the previous sentence (0 for the first)."""

    synergy: float
    """Ratio of volatility to marginal compression cost (volatility / delta_k)."""


@dataclass
class ComplexityFlowResult:
    """
    Result of analysing a single text's complexity flow.

    Returned by :meth:`~spell_exploder.analyzers.complexity_index.TextComplexityAnalyzer.analyze_flow`.
    """

    label: str
    """User-supplied label for this text (e.g. file name or condition)."""

    sentences: list[SentenceMetrics] = field(default_factory=list)
    """Per-sentence metrics in document order."""

    def to_dataframe(self) -> pd.DataFrame:
        """
        Flat DataFrame with one row per sentence.

        Columns: ``index``, ``text``, ``k_hist``, ``volatility``, ``synergy``.
        """
        if not self.sentences:
            return pd.DataFrame(
                columns=["index", "text", "k_hist", "volatility", "synergy"]
            )
        return pd.DataFrame(
            {
                "index": [s.index for s in self.sentences],
                "text": [s.text for s in self.sentences],
                "k_hist": [s.k_hist for s in self.sentences],
                "volatility": [s.volatility for s in self.sentences],
                "synergy": [s.synergy for s in self.sentences],
            }
        )

    @property
    def k_hist_array(self) -> np.ndarray:
        """Cumulative complexity as a NumPy array."""
        return np.array([s.k_hist for s in self.sentences])

    @property
    def volatility_array(self) -> np.ndarray:
        """Volatility sequence as a NumPy array."""
        return np.array([s.volatility for s in self.sentences])

    @property
    def synergy_array(self) -> np.ndarray:
        """Synergy sequence as a NumPy array."""
        return np.array([s.synergy for s in self.sentences])


@dataclass
class ComplexityComparisonResult:
    """
    Result of comparing N texts via complexity flow analysis.

    Returned by :meth:`~spell_exploder.analyzers.complexity_index.TextComplexityAnalyzer.compare`.
    """

    flows: list[ComplexityFlowResult] = field(default_factory=list)
    """One :class:`ComplexityFlowResult` per input text."""

    def to_dataframe(self) -> pd.DataFrame:
        """
        Combined DataFrame with a ``label`` column distinguishing texts.
        """
        frames = []
        for flow in self.flows:
            df = flow.to_dataframe()
            df["label"] = flow.label
            frames.append(df)
        if not frames:
            return pd.DataFrame(
                columns=["index", "text", "k_hist", "volatility", "synergy", "label"]
            )
        return pd.concat(frames, ignore_index=True)

    @property
    def labels(self) -> list[str]:
        return [f.label for f in self.flows]
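A small round-trip sketch showing how these containers behave (the metric values are made up; real ones come from the analyzer):

    from spell_exploder.results.complexity import (
        ComplexityComparisonResult,
        ComplexityFlowResult,
        SentenceMetrics,
    )

    flow = ComplexityFlowResult(
        label="essay1",
        sentences=[
            SentenceMetrics("First sentence.", 0, k_hist=60, volatility=0, synergy=0.0),
            # synergy = volatility / delta_k = 12 / (95 - 60)
            SentenceMetrics("A different one.", 1, k_hist=95, volatility=12, synergy=12 / 35),
        ],
    )
    print(flow.to_dataframe())   # columns: index, text, k_hist, volatility, synergy
    print(flow.k_hist_array)     # array([60, 95])

    comparison = ComplexityComparisonResult(flows=[flow])
    print(comparison.labels)     # ['essay1']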