pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/syntactic/pos_ratios.py

@@ -1,10 +1,17 @@
-"""Part-of-Speech ratio analysis using spaCy."""
+"""Part-of-Speech ratio analysis using spaCy.
 
-from .._types import POSResult
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+from .._types import Distribution, POSResult, make_distribution
 from .._utils import check_optional_dependency
 
 
-def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
+def compute_pos_ratios(
+    text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+) -> POSResult:
     """
     Compute Part-of-Speech ratios and lexical density using spaCy.
 
@@ -18,6 +25,10 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     - Lexical density: (nouns + verbs + adjectives + adverbs) / total words
     - Function word ratio: (determiners + prepositions + conjunctions) / total words
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Biber, D. (1988). Variation across speech and writing.
         Cambridge University Press.
@@ -25,9 +36,13 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     Args:
         text: Input text to analyze
         model: spaCy model name (default: "en_core_web_sm")
+        chunk_size: Number of words per chunk (default: 1000).
+            Note: POS analysis is performed on the full text for accuracy,
+            so this parameter is included for API consistency but actual
+            results are from a single pass.
 
     Returns:
-        POSResult with all POS ratios and metadata
+        POSResult with all POS ratios, distributions, and metadata
 
     Raises:
         ImportError: If spaCy is not installed
@@ -47,8 +62,7 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
         nlp = spacy.load(model)
     except OSError:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download it with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
         )
 
     # Process text with spaCy
@@ -89,6 +103,14 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
 
     # Handle empty text
     if total_tokens == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return POSResult(
             noun_ratio=float("nan"),
             verb_ratio=float("nan"),
@@ -98,6 +120,16 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
             adjective_noun_ratio=float("nan"),
             lexical_density=float("nan"),
             function_word_ratio=float("nan"),
+            noun_ratio_dist=empty_dist,
+            verb_ratio_dist=empty_dist,
+            adjective_ratio_dist=empty_dist,
+            adverb_ratio_dist=empty_dist,
+            noun_verb_ratio_dist=empty_dist,
+            adjective_noun_ratio_dist=empty_dist,
+            lexical_density_dist=empty_dist,
+            function_word_ratio_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "model": model,
                 "token_count": 0,
@@ -129,6 +161,28 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     function_words = det_count + adp_count + conj_count
     function_word_ratio = function_words / total_tokens
 
+    # Create single-value distributions (POS analysis is done on full text)
+    noun_ratio_dist = make_distribution([noun_ratio])
+    verb_ratio_dist = make_distribution([verb_ratio])
+    adj_ratio_dist = make_distribution([adj_ratio])
+    adv_ratio_dist = make_distribution([adv_ratio])
+    noun_verb_dist = (
+        make_distribution([noun_verb_ratio])
+        if not (noun_verb_ratio != noun_verb_ratio)
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    adj_noun_dist = (
+        make_distribution([adj_noun_ratio])
+        if not (adj_noun_ratio != adj_noun_ratio)
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    lexical_density_dist = make_distribution([lexical_density])
+    function_word_dist = make_distribution([function_word_ratio])
+
     return POSResult(
         noun_ratio=noun_ratio,
         verb_ratio=verb_ratio,
@@ -138,6 +192,16 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
         adjective_noun_ratio=adj_noun_ratio,
         lexical_density=lexical_density,
         function_word_ratio=function_word_ratio,
+        noun_ratio_dist=noun_ratio_dist,
+        verb_ratio_dist=verb_ratio_dist,
+        adjective_ratio_dist=adj_ratio_dist,
+        adverb_ratio_dist=adv_ratio_dist,
+        noun_verb_ratio_dist=noun_verb_dist,
+        adjective_noun_ratio_dist=adj_noun_dist,
+        lexical_density_dist=lexical_density_dist,
+        function_word_ratio_dist=function_word_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata={
             "model": model,
             "token_count": total_tokens,
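The `not (x != x)` guards in the `@@ -129,6 +161,28 @@` hunk above are NaN tests: NaN is the only float that compares unequal to itself, so the `make_distribution` call is skipped exactly when the ratio is undefined (e.g. a zero denominator). A minimal sketch of the equivalence — the variable names here are illustrative, and `math.isnan` is the more readable spelling:

```python
import math

nan = float("nan")
ratio = 0.75  # an ordinary, well-defined ratio

# NaN is the only float for which x != x is True, so the expression
# `not (x != x)` reads as "x is a real (non-NaN) number".
assert (nan != nan) is True
assert math.isnan(nan)

assert (ratio != ratio) is False
assert not math.isnan(ratio)
```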
pystylometry/syntactic/sentence_stats.py

@@ -1,10 +1,17 @@
-"""Sentence-level statistics using spaCy."""
+"""Sentence-level statistics using spaCy.
 
-from .._types import SentenceStatsResult
-from .._utils import check_optional_dependency
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
+from .._types import Distribution, SentenceStatsResult, make_distribution
+from .._utils import check_optional_dependency
 
-def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
+
+def compute_sentence_stats(
+    text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+) -> SentenceStatsResult:
     """
     Compute sentence-level statistics using spaCy.
 
@@ -16,6 +23,10 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
     - Maximum sentence length
     - Total sentence count
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Hunt, K. W. (1965). Grammatical structures written at three grade levels.
         NCTE Research Report No. 3.
@@ -23,9 +34,13 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
     Args:
         text: Input text to analyze
         model: spaCy model name (default: "en_core_web_sm")
+        chunk_size: Number of words per chunk (default: 1000).
+            Note: Sentence analysis is performed on the full text for accuracy,
+            so this parameter is included for API consistency but actual
+            results are from a single pass.
 
     Returns:
-        SentenceStatsResult with sentence statistics and metadata
+        SentenceStatsResult with sentence statistics, distributions, and metadata
 
     Raises:
         ImportError: If spaCy is not installed
@@ -45,8 +60,7 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
         nlp = spacy.load(model)
     except OSError:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download it with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
        )
 
     # Process text with spaCy
@@ -62,13 +76,28 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
 
     # Handle empty text
     if len(sentence_lengths) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return SentenceStatsResult(
             mean_sentence_length=float("nan"),
             sentence_length_std=float("nan"),
-            sentence_length_range=0,
-            min_sentence_length=0,
-            max_sentence_length=0,
+            sentence_length_range=0.0,
+            min_sentence_length=0.0,
+            max_sentence_length=0.0,
             sentence_count=0,
+            mean_sentence_length_dist=empty_dist,
+            sentence_length_std_dist=empty_dist,
+            sentence_length_range_dist=empty_dist,
+            min_sentence_length_dist=empty_dist,
+            max_sentence_length_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "model": model,
             },
@@ -86,10 +115,17 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
     else:
         std_dev = 0.0
 
-    min_length = min(sentence_lengths)
-    max_length = max(sentence_lengths)
+    min_length = float(min(sentence_lengths))
+    max_length = float(max(sentence_lengths))
     length_range = max_length - min_length
 
+    # Create single-value distributions (sentence analysis is done on full text)
+    mean_dist = make_distribution([mean_length])
+    std_dist = make_distribution([std_dev])
+    range_dist = make_distribution([length_range])
+    min_dist = make_distribution([min_length])
+    max_dist = make_distribution([max_length])
+
     return SentenceStatsResult(
         mean_sentence_length=mean_length,
         sentence_length_std=std_dev,
@@ -97,6 +133,13 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
         min_sentence_length=min_length,
         max_sentence_length=max_length,
         sentence_count=len(sentence_lengths),
+        mean_sentence_length_dist=mean_dist,
+        sentence_length_std_dist=std_dist,
+        sentence_length_range_dist=range_dist,
+        min_sentence_length_dist=min_dist,
+        max_sentence_length_dist=max_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata={
             "model": model,
             "sentence_lengths": sentence_lengths,
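The net effect of these hunks is that every scalar statistic gains a `*_dist` companion plus `chunk_size`/`chunk_count` bookkeeping. A usage sketch based on the signature and field names visible in this diff — the import path and the single-value behavior of `make_distribution` (a one-element sample, so mean == median == the value with zero spread) are assumptions, since `_types.py` is not shown here:

```python
from pystylometry.syntactic import compute_sentence_stats  # import path assumed

result = compute_sentence_stats("One short sentence. Then a noticeably longer second sentence.")

# Scalar fields behave as in 1.0.0...
print(result.mean_sentence_length, result.sentence_count)

# ...and each now has a Distribution companion. Because this analysis
# runs in a single pass over the full text, the distribution wraps a
# single value and chunk_count is 1.
dist = result.mean_sentence_length_dist
print(dist.values, dist.mean, dist.median, dist.std, dist.iqr)
print(result.chunk_size, result.chunk_count)  # 1000, 1
```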
pystylometry/syntactic/sentence_types.py

@@ -27,13 +27,19 @@ References:
     Quirk, R., et al. (1985). A Comprehensive Grammar of the English Language. Longman.
 """
 
-from .._types import SentenceTypeResult
+from typing import Any
+
+from .._types import Distribution, SentenceTypeResult, make_distribution
 from .._utils import check_optional_dependency
 
+# Type alias for spaCy Span (loaded dynamically)
+_SpaCySpan = Any
+
 
 def compute_sentence_types(
     text: str,
     model: str = "en_core_web_sm",
+    chunk_size: int = 1000,
 ) -> SentenceTypeResult:
     """
     Classify sentences by structure and function.
@@ -193,8 +199,7 @@ def compute_sentence_types(
         nlp = spacy.load(model)
     except OSError as e:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download with: python -m spacy download {model}"
         ) from e
 
     # Parse text
@@ -203,6 +208,14 @@ def compute_sentence_types(
 
     # Handle empty text
     if len(sentences) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return SentenceTypeResult(
             simple_ratio=float("nan"),
             compound_ratio=float("nan"),
@@ -223,6 +236,18 @@ def compute_sentence_types(
             total_sentences=0,
             structural_diversity=float("nan"),
             functional_diversity=float("nan"),
+            simple_ratio_dist=empty_dist,
+            compound_ratio_dist=empty_dist,
+            complex_ratio_dist=empty_dist,
+            compound_complex_ratio_dist=empty_dist,
+            declarative_ratio_dist=empty_dist,
+            interrogative_ratio_dist=empty_dist,
+            imperative_ratio_dist=empty_dist,
+            exclamatory_ratio_dist=empty_dist,
+            structural_diversity_dist=empty_dist,
+            functional_diversity_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "warning": "Empty text or no sentences found",
             },
@@ -249,13 +274,15 @@ def compute_sentence_types(
         functional_counts[functional_type] += 1
 
         # Store classification
-        sentence_classifications.append(
-            {
-                "text": sent.text,
-                "structural_type": structural_type,
-                "functional_type": functional_type,
-            }
-        )
+        sentence_classifications.append(
+            {
+                "text": sent.text,
+                "structural_type": structural_type,
+                "functional_type": functional_type,
+                "independent_clauses": independent_count,
+                "dependent_clauses": dependent_count,
+            }
+        )
 
     # Calculate ratios
     total_sentences = len(sentences)
@@ -271,11 +298,28 @@ def compute_sentence_types(
 
     # Calculate diversity metrics
     structural_ratios = [simple_ratio, compound_ratio, complex_ratio, compound_complex_ratio]
-    functional_ratios = [declarative_ratio, interrogative_ratio, imperative_ratio, exclamatory_ratio]
+    functional_ratios = [
+        declarative_ratio,
+        interrogative_ratio,
+        imperative_ratio,
+        exclamatory_ratio,
+    ]
 
     structural_diversity = _calculate_shannon_entropy(structural_ratios)
     functional_diversity = _calculate_shannon_entropy(functional_ratios)
 
+    # Create single-value distributions (sentence analysis is done on full text)
+    simple_ratio_dist = make_distribution([simple_ratio])
+    compound_ratio_dist = make_distribution([compound_ratio])
+    complex_ratio_dist = make_distribution([complex_ratio])
+    compound_complex_ratio_dist = make_distribution([compound_complex_ratio])
+    declarative_ratio_dist = make_distribution([declarative_ratio])
+    interrogative_ratio_dist = make_distribution([interrogative_ratio])
+    imperative_ratio_dist = make_distribution([imperative_ratio])
+    exclamatory_ratio_dist = make_distribution([exclamatory_ratio])
+    structural_diversity_dist = make_distribution([structural_diversity])
+    functional_diversity_dist = make_distribution([functional_diversity])
+
     # Collect metadata
     metadata = {
         "sentence_count": total_sentences,
@@ -306,11 +350,23 @@ def compute_sentence_types(
         total_sentences=total_sentences,
         structural_diversity=structural_diversity,
         functional_diversity=functional_diversity,
+        simple_ratio_dist=simple_ratio_dist,
+        compound_ratio_dist=compound_ratio_dist,
+        complex_ratio_dist=complex_ratio_dist,
+        compound_complex_ratio_dist=compound_complex_ratio_dist,
+        declarative_ratio_dist=declarative_ratio_dist,
+        interrogative_ratio_dist=interrogative_ratio_dist,
+        imperative_ratio_dist=imperative_ratio_dist,
+        exclamatory_ratio_dist=exclamatory_ratio_dist,
+        structural_diversity_dist=structural_diversity_dist,
+        functional_diversity_dist=functional_diversity_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata=metadata,
     )
 
 
-def _count_independent_clauses(sent) -> int:
+def _count_independent_clauses(sent: _SpaCySpan) -> int:
     """
     Count independent clauses in a sentence.
 
@@ -336,7 +392,7 @@ def _count_independent_clauses(sent) -> int:
     return count
 
 
-def _count_dependent_clauses(sent) -> int:
+def _count_dependent_clauses(sent: _SpaCySpan) -> int:
     """
     Count dependent clauses in a sentence.
 
@@ -382,7 +438,7 @@ def _classify_structural(independent: int, dependent: int) -> str:
     return "simple"
 
 
-def _classify_functional(sent) -> str:
+def _classify_functional(sent: _SpaCySpan) -> str:
     """
     Classify sentence function based on punctuation and structure.
 
@@ -415,7 +471,7 @@ def _classify_functional(sent) -> str:
     return "declarative"
 
 
-def _is_imperative_structure(sent) -> bool:
+def _is_imperative_structure(sent: _SpaCySpan) -> bool:
     """
     Check if sentence has imperative structure.
 
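Both diversity metrics funnel through `_calculate_shannon_entropy`, whose body is not part of this diff. For orientation, a standard Shannon entropy over a list of type proportions looks like the sketch below — a plausible reading of that helper under the usual base-2 convention, not the package's verbatim code:

```python
import math


def shannon_entropy(proportions: list[float]) -> float:
    """H = -sum(p * log2(p)) over the nonzero proportions.

    0.0 when one sentence type dominates completely; log2(k) bits when
    all k types are equally frequent.
    """
    return -sum(p * math.log2(p) for p in proportions if p > 0)


# Four structural types, perfectly balanced -> 2.0 bits.
print(shannon_entropy([0.25, 0.25, 0.25, 0.25]))
# Only simple sentences -> 0.0 bits.
print(shannon_entropy([1.0, 0.0, 0.0, 0.0]))
```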
pystylometry/viz/README.md (new file)

@@ -0,0 +1,27 @@
+# viz
+
+
+
+
+Visualization for drift detection results. Two output modes: static PNG (matplotlib) and interactive HTML (React JSX).
+
+## Catalogue
+
+| File | Functions | Output |
+|------|-----------|--------|
+| `drift.py` | `plot_drift_timeline`, `plot_drift_scatter`, `plot_drift_report` | PNG via matplotlib/seaborn |
+| `jsx/report.py` | `export_drift_report_jsx` | Interactive HTML dashboard |
+| `jsx/timeline.py` | `export_drift_timeline_jsx` | Interactive HTML timeline |
+| `jsx/viewer.py` | `export_drift_viewer` | Standalone HTML viewer with file upload |
+| `jsx/_base.py` | _(internal)_ | React/JSX rendering base |
+
+## Install
+
+```
+pip install pystylometry[viz]  # For PNG output (matplotlib + seaborn)
+# JSX/HTML output requires no additional dependencies
+```
+
+## See Also
+
+- [`consistency/`](../consistency/) produces the `KilgarriffDriftResult` consumed by all viz functions
pystylometry/viz/__init__.py (new file)

@@ -0,0 +1,71 @@
+"""Visualization module for pystylometry.
+
+This module provides visualization functions for stylometric analysis results.
+
+Matplotlib Functions (PNG output):
+    Requires optional dependencies: pip install pystylometry[viz]
+
+    plot_drift_timeline: Line chart of chi-squared values over document
+    plot_drift_scatter: Scatter plot with reference zones (tic-tac-toe style)
+    plot_drift_report: Combined multi-panel visualization
+
+Interactive JSX Functions (HTML output):
+    No additional dependencies required (uses React via CDN)
+
+    export_drift_timeline_jsx: Interactive timeline chart
+    export_drift_report_jsx: Interactive multi-panel dashboard
+    export_drift_viewer: Standalone viewer with file upload
+
+Related GitHub Issues:
+    #38 - Visualization Options for Style Drift Detection
+    https://github.com/craigtrim/pystylometry/issues/38
+
+Example:
+    >>> from pystylometry.consistency import compute_kilgarriff_drift
+    >>> from pystylometry.viz import plot_drift_timeline, export_drift_timeline_jsx
+    >>>
+    >>> result = compute_kilgarriff_drift(text)
+    >>> plot_drift_timeline(result, output="timeline.png")  # Static PNG
+    >>> export_drift_timeline_jsx(result, "timeline.html")  # Interactive HTML
+"""
+
+from .drift import (  # noqa: E402
+    plot_drift_report,
+    plot_drift_scatter,
+    plot_drift_timeline,
+)
+from .jsx import (  # noqa: E402
+    export_drift_report_jsx,
+    export_drift_timeline_jsx,
+    export_drift_viewer,
+)
+
+try:
+    import matplotlib  # noqa: F401
+    import seaborn  # noqa: F401  # type: ignore[import-untyped]
+
+    _VIZ_AVAILABLE = True
+except ImportError:
+    _VIZ_AVAILABLE = False
+
+
+def _check_viz_available() -> None:
+    """Raise ImportError if visualization dependencies are not installed."""
+    if not _VIZ_AVAILABLE:
+        raise ImportError(
+            "Visualization requires optional dependencies. "
+            "Install with: pip install pystylometry[viz] or poetry install --with viz"
+        )
+
+
+__all__ = [
+    # Matplotlib (PNG)
+    "plot_drift_timeline",
+    "plot_drift_scatter",
+    "plot_drift_report",
+    # JSX (HTML)
+    "export_drift_timeline_jsx",
+    "export_drift_report_jsx",
+    # Standalone viewer
+    "export_drift_viewer",
+]
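The try/except import gate above lets the matplotlib path degrade gracefully while the JSX exporters stay dependency-free. A hedged usage sketch — it assumes, per `_check_viz_available`, that the PNG functions raise ImportError when the `viz` extras are absent, and the input file name is illustrative:

```python
from pystylometry.consistency import compute_kilgarriff_drift
from pystylometry.viz import export_drift_timeline_jsx, plot_drift_timeline

text = open("manuscript.txt").read()  # illustrative input
result = compute_kilgarriff_drift(text)

try:
    # Static PNG path: requires `pip install pystylometry[viz]`.
    plot_drift_timeline(result, output="timeline.png")
except ImportError:
    # HTML path needs no extra dependencies (React via CDN).
    export_drift_timeline_jsx(result, "timeline.html")
```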