pystylometry: 1.0.0-py3-none-any.whl → 1.1.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- pystylometry/__init__.py +29 -3
- pystylometry/_types.py +963 -259
- pystylometry/authorship/__init__.py +23 -2
- pystylometry/authorship/additional_methods.py +4 -29
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/RECORD +0 -46
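
The headline 1.1.0 additions are a command-line interface (`pystylometry/cli.py` plus an `entry_points.txt`), a `consistency` package for style-drift detection, a `dialect` detector with bundled marker data, and a `viz` package for rendering drift results. A minimal sketch of how the new drift and visualization pieces compose, using only names that appear in the `viz/__init__.py` docstring at the end of this diff (exact signatures are assumptions):

    from pystylometry.consistency import compute_kilgarriff_drift
    from pystylometry.viz import export_drift_timeline_jsx, plot_drift_timeline

    text = open("manuscript.txt", encoding="utf-8").read()
    result = compute_kilgarriff_drift(text)           # chunked chi-squared drift
    plot_drift_timeline(result, output="drift.png")   # static PNG; needs the [viz] extra
    export_drift_timeline_jsx(result, "drift.html")   # interactive HTML; no extras needed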
pystylometry/syntactic/advanced_syntactic.py (+76 -14)

@@ -28,13 +28,21 @@ References:
         of linguistic complexity. In Image, language, brain (pp. 95-126).
 """
 
-from .._types import AdvancedSyntacticResult
+from typing import Any
+
+from .._types import AdvancedSyntacticResult, Distribution, make_distribution
 from .._utils import check_optional_dependency
 
+# Type aliases for spaCy objects (loaded dynamically)
+_SpaCyToken = Any
+_SpaCyDoc = Any
+_SpaCySpan = Any
+
 
 def compute_advanced_syntactic(
     text: str,
     model: str = "en_core_web_sm",
+    chunk_size: int = 1000,
 ) -> AdvancedSyntacticResult:
     """
     Compute advanced syntactic complexity metrics using dependency parsing.

@@ -147,7 +155,6 @@ def compute_advanced_syntactic(
 
     try:
         import spacy  # type: ignore
-        from spacy.tokens import Doc, Span, Token  # type: ignore
     except ImportError as e:
         raise ImportError(
             "spaCy is required for advanced syntactic analysis. "

@@ -159,8 +166,7 @@
         nlp = spacy.load(model)
     except OSError as e:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download with: python -m spacy download {model}"
         ) from e
 
     # Parse text

@@ -169,6 +175,14 @@
 
     # Handle empty text
     if len(sentences) == 0 or len(doc) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return AdvancedSyntacticResult(
             mean_parse_tree_depth=float("nan"),
             max_parse_tree_depth=0,
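
`pystylometry/_types.py` (+963 -259) is not excerpted here, but the constructor calls above pin down the shape of the new `Distribution` dataclass. A minimal compatible sketch, with the field set inferred from these call sites (the real class may carry more):

    from dataclasses import dataclass

    @dataclass
    class Distribution:
        """Summary statistics over per-chunk metric values."""
        values: list[float]  # raw per-chunk values; [] when there is no text
        mean: float          # NaN when values is empty
        median: float        # NaN when values is empty
        std: float           # 0.0 when values is empty
        range: float         # max - min
        iqr: float           # interquartile range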
@@ -183,6 +197,20 @@
             dependency_distance=float("nan"),
             left_branching_ratio=float("nan"),
             right_branching_ratio=float("nan"),
+            mean_parse_tree_depth_dist=empty_dist,
+            max_parse_tree_depth_dist=empty_dist,
+            mean_t_unit_length_dist=empty_dist,
+            clausal_density_dist=empty_dist,
+            dependent_clause_ratio_dist=empty_dist,
+            passive_voice_ratio_dist=empty_dist,
+            subordination_index_dist=empty_dist,
+            coordination_index_dist=empty_dist,
+            sentence_complexity_score_dist=empty_dist,
+            dependency_distance_dist=empty_dist,
+            left_branching_ratio_dist=empty_dist,
+            right_branching_ratio_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "sentence_count": 0,
                 "word_count": 0,

@@ -229,9 +257,7 @@
     coordinate_clause_count = 0
 
     for sent in sentences:
-        sent_total, sent_dependent, sent_subordinate, sent_coordinate = _count_clauses(
-            sent
-        )
+        sent_total, sent_dependent, sent_subordinate, sent_coordinate = _count_clauses(sent)
        total_clauses += sent_total
        dependent_clause_count += sent_dependent
        subordinate_clause_count += sent_subordinate

@@ -279,14 +305,22 @@
     # Normalize individual metrics to 0-1 range
     normalized_parse_depth = min(mean_parse_tree_depth / 10, 1.0)
     normalized_clausal_density = (
-        min(clausal_density / 3, 1.0)
+        min(clausal_density / 3, 1.0)
+        if not isinstance(clausal_density, float) or not (clausal_density != clausal_density)
+        else 0.0
     )
     normalized_t_unit_length = (
-        min(mean_t_unit_length / 25, 1.0)
+        min(mean_t_unit_length / 25, 1.0)
+        if not isinstance(mean_t_unit_length, float)
+        or not (mean_t_unit_length != mean_t_unit_length)
+        else 0.0
     )
     normalized_dependency_distance = min(mean_dependency_distance / 5, 1.0)
     normalized_subordination = (
-        subordination_index
+        subordination_index
+        if not isinstance(subordination_index, float)
+        or not (subordination_index != subordination_index)
+        else 0.0
     )
 
     # Weighted combination
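
The new guards rely on the `x != x` test, which is true only for NaN (the one float value that is not equal to itself), so a metric that came back NaN on degenerate input contributes 0.0 to the complexity score instead of propagating NaN through the weighted sum. A two-line illustration of the idiom:

    import math

    x = float("nan")
    assert (x != x) and math.isnan(x)  # NaN fails self-equality; equivalent to isnan
    assert not (1.5 != 1.5)            # every ordinary float is equal to itself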
@@ -298,6 +332,20 @@
         + 0.1 * normalized_dependency_distance
     )
 
+    # Create single-value distributions (analysis is done on full text)
+    mean_parse_tree_depth_dist = make_distribution([mean_parse_tree_depth])
+    max_parse_tree_depth_dist = make_distribution([float(max_parse_tree_depth)])
+    mean_t_unit_length_dist = make_distribution([mean_t_unit_length])
+    clausal_density_dist = make_distribution([clausal_density])
+    dependent_clause_ratio_dist = make_distribution([dependent_clause_ratio])
+    passive_voice_ratio_dist = make_distribution([passive_voice_ratio])
+    subordination_index_dist = make_distribution([subordination_index])
+    coordination_index_dist = make_distribution([coordination_index])
+    sentence_complexity_score_dist = make_distribution([sentence_complexity_score])
+    dependency_distance_dist = make_distribution([mean_dependency_distance])
+    left_branching_ratio_dist = make_distribution([left_branching_ratio])
+    right_branching_ratio_dist = make_distribution([right_branching_ratio])
+
     # Collect metadata
     metadata = {
         "sentence_count": len(sentences),
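
`make_distribution` also lives in `_types.py` and is not shown in this diff. Assuming it computes the usual summary statistics, the single-element inputs used throughout this module (analysis is one pass over the full text) degenerate to mean = median = value and std = range = iqr = 0. A sketch consistent with that reading, using the `Distribution` sketch above:

    import statistics

    def make_distribution(values: list[float]) -> Distribution:
        # Sketch only; mirrors the fields the call sites above rely on.
        if not values:
            return Distribution([], float("nan"), float("nan"), 0.0, 0.0, 0.0)
        if len(values) == 1:
            return Distribution(values, values[0], values[0], 0.0, 0.0, 0.0)
        q1, _, q3 = statistics.quantiles(values, n=4)
        return Distribution(
            values=values,
            mean=statistics.fmean(values),
            median=statistics.median(values),
            std=statistics.pstdev(values),
            range=max(values) - min(values),
            iqr=q3 - q1,
        )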
@@ -331,11 +379,25 @@
         dependency_distance=mean_dependency_distance,
         left_branching_ratio=left_branching_ratio,
         right_branching_ratio=right_branching_ratio,
+        mean_parse_tree_depth_dist=mean_parse_tree_depth_dist,
+        max_parse_tree_depth_dist=max_parse_tree_depth_dist,
+        mean_t_unit_length_dist=mean_t_unit_length_dist,
+        clausal_density_dist=clausal_density_dist,
+        dependent_clause_ratio_dist=dependent_clause_ratio_dist,
+        passive_voice_ratio_dist=passive_voice_ratio_dist,
+        subordination_index_dist=subordination_index_dist,
+        coordination_index_dist=coordination_index_dist,
+        sentence_complexity_score_dist=sentence_complexity_score_dist,
+        dependency_distance_dist=dependency_distance_dist,
+        left_branching_ratio_dist=left_branching_ratio_dist,
+        right_branching_ratio_dist=right_branching_ratio_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata=metadata,
     )
 
 
-def _calculate_max_tree_depth(token) -> int:
+def _calculate_max_tree_depth(token: _SpaCyToken) -> int:
     """
     Calculate maximum depth of dependency tree starting from token.
 

@@ -352,7 +414,7 @@ def _calculate_max_tree_depth(token) -> int:
     return max(child_depths) + 1
 
 
-def _identify_t_units(doc) -> list:
+def _identify_t_units(doc: _SpaCyDoc) -> list[_SpaCySpan]:
     """
     Identify T-units (minimal terminable units) in document.
 

@@ -371,7 +433,7 @@ def _identify_t_units(doc) -> list:
     return list(doc.sents)
 
 
-def _count_clauses(sent) -> tuple[int, int, int, int]:
+def _count_clauses(sent: _SpaCySpan) -> tuple[int, int, int, int]:
     """
     Count different types of clauses in sentence.
 

@@ -406,7 +468,7 @@ def _count_clauses(sent) -> tuple[int, int, int, int]:
     return total, dependent, subordinate, coordinate
 
 
-def _is_passive_voice(sent) -> bool:
+def _is_passive_voice(sent: _SpaCySpan) -> bool:
     """
     Detect if sentence contains passive voice construction.
 
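
Net effect for this module: every 1.0.0 scalar field is preserved and gains a parallel `*_dist` field, plus `chunk_size`/`chunk_count` bookkeeping. A usage sketch against the new signature (import path follows the file layout in this diff; the sample text is illustrative):

    from pystylometry.syntactic.advanced_syntactic import compute_advanced_syntactic

    result = compute_advanced_syntactic("The cat sat. The dog, which barked, ran away.")
    result.mean_parse_tree_depth            # scalar, unchanged from 1.0.0
    result.mean_parse_tree_depth_dist       # new: single-value Distribution
    result.chunk_size, result.chunk_count   # (1000, 1): single-pass analysis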
pystylometry/syntactic/pos_ratios.py (+70 -6)

@@ -1,10 +1,17 @@
-"""Part-of-Speech ratio analysis using spaCy.
+"""Part-of-Speech ratio analysis using spaCy.
 
-
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+from .._types import Distribution, POSResult, make_distribution
 from .._utils import check_optional_dependency
 
 
-def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
+def compute_pos_ratios(
+    text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+) -> POSResult:
     """
     Compute Part-of-Speech ratios and lexical density using spaCy.
 

@@ -18,6 +25,10 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     - Lexical density: (nouns + verbs + adjectives + adverbs) / total words
     - Function word ratio: (determiners + prepositions + conjunctions) / total words
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Biber, D. (1988). Variation across speech and writing.
         Cambridge University Press.

@@ -25,9 +36,13 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     Args:
         text: Input text to analyze
         model: spaCy model name (default: "en_core_web_sm")
+        chunk_size: Number of words per chunk (default: 1000).
+            Note: POS analysis is performed on the full text for accuracy,
+            so this parameter is included for API consistency but actual
+            results are from a single pass.
 
     Returns:
-        POSResult with all POS ratios and metadata
+        POSResult with all POS ratios, distributions, and metadata
 
     Raises:
         ImportError: If spaCy is not installed

@@ -47,8 +62,7 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
         nlp = spacy.load(model)
     except OSError:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download it with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
         )
 
     # Process text with spaCy

@@ -89,6 +103,14 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
 
     # Handle empty text
     if total_tokens == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return POSResult(
             noun_ratio=float("nan"),
             verb_ratio=float("nan"),

@@ -98,6 +120,16 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
             adjective_noun_ratio=float("nan"),
             lexical_density=float("nan"),
             function_word_ratio=float("nan"),
+            noun_ratio_dist=empty_dist,
+            verb_ratio_dist=empty_dist,
+            adjective_ratio_dist=empty_dist,
+            adverb_ratio_dist=empty_dist,
+            noun_verb_ratio_dist=empty_dist,
+            adjective_noun_ratio_dist=empty_dist,
+            lexical_density_dist=empty_dist,
+            function_word_ratio_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "model": model,
                 "token_count": 0,

@@ -129,6 +161,28 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     function_words = det_count + adp_count + conj_count
     function_word_ratio = function_words / total_tokens
 
+    # Create single-value distributions (POS analysis is done on full text)
+    noun_ratio_dist = make_distribution([noun_ratio])
+    verb_ratio_dist = make_distribution([verb_ratio])
+    adj_ratio_dist = make_distribution([adj_ratio])
+    adv_ratio_dist = make_distribution([adv_ratio])
+    noun_verb_dist = (
+        make_distribution([noun_verb_ratio])
+        if not (noun_verb_ratio != noun_verb_ratio)
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    adj_noun_dist = (
+        make_distribution([adj_noun_ratio])
+        if not (adj_noun_ratio != adj_noun_ratio)
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    lexical_density_dist = make_distribution([lexical_density])
+    function_word_dist = make_distribution([function_word_ratio])
+
     return POSResult(
         noun_ratio=noun_ratio,
         verb_ratio=verb_ratio,

@@ -138,6 +192,16 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
         adjective_noun_ratio=adj_noun_ratio,
         lexical_density=lexical_density,
         function_word_ratio=function_word_ratio,
+        noun_ratio_dist=noun_ratio_dist,
+        verb_ratio_dist=verb_ratio_dist,
+        adjective_ratio_dist=adj_ratio_dist,
+        adverb_ratio_dist=adv_ratio_dist,
+        noun_verb_ratio_dist=noun_verb_dist,
+        adjective_noun_ratio_dist=adj_noun_dist,
+        lexical_density_dist=lexical_density_dist,
+        function_word_ratio_dist=function_word_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata={
             "model": model,
             "token_count": total_tokens,
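
The same pattern applied to POS ratios; the only NaN-aware cases are the two derived ratios (`noun_verb_ratio`, `adjective_noun_ratio`), which fall back to an empty Distribution when undefined. A usage sketch (import path per this diff; sample text illustrative):

    from pystylometry.syntactic.pos_ratios import compute_pos_ratios

    result = compute_pos_ratios("Quick brown foxes jump over the lazy dogs.")
    result.lexical_density        # (nouns + verbs + adjectives + adverbs) / tokens
    result.lexical_density_dist   # new: single-value Distribution
    result.chunk_count            # 1: POS analysis is a single pass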
pystylometry/syntactic/sentence_stats.py (+55 -12)

@@ -1,10 +1,17 @@
-"""Sentence-level statistics using spaCy.
+"""Sentence-level statistics using spaCy.
 
-
-
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
+from .._types import Distribution, SentenceStatsResult, make_distribution
+from .._utils import check_optional_dependency
 
-def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
+
+def compute_sentence_stats(
+    text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+) -> SentenceStatsResult:
     """
     Compute sentence-level statistics using spaCy.
 

@@ -16,6 +23,10 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
     - Maximum sentence length
     - Total sentence count
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Hunt, K. W. (1965). Grammatical structures written at three grade levels.
         NCTE Research Report No. 3.

@@ -23,9 +34,13 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
     Args:
         text: Input text to analyze
         model: spaCy model name (default: "en_core_web_sm")
+        chunk_size: Number of words per chunk (default: 1000).
+            Note: Sentence analysis is performed on the full text for accuracy,
+            so this parameter is included for API consistency but actual
+            results are from a single pass.
 
     Returns:
-        SentenceStatsResult with sentence statistics and metadata
+        SentenceStatsResult with sentence statistics, distributions, and metadata
 
     Raises:
         ImportError: If spaCy is not installed

@@ -45,8 +60,7 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
         nlp = spacy.load(model)
     except OSError:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download it with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
         )
 
     # Process text with spaCy

@@ -62,13 +76,28 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
 
     # Handle empty text
     if len(sentence_lengths) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return SentenceStatsResult(
             mean_sentence_length=float("nan"),
             sentence_length_std=float("nan"),
-            sentence_length_range=0,
-            min_sentence_length=0,
-            max_sentence_length=0,
+            sentence_length_range=0.0,
+            min_sentence_length=0.0,
+            max_sentence_length=0.0,
             sentence_count=0,
+            mean_sentence_length_dist=empty_dist,
+            sentence_length_std_dist=empty_dist,
+            sentence_length_range_dist=empty_dist,
+            min_sentence_length_dist=empty_dist,
+            max_sentence_length_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "model": model,
             },

@@ -86,10 +115,17 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
     else:
         std_dev = 0.0
 
-    min_length = min(sentence_lengths)
-    max_length = max(sentence_lengths)
+    min_length = float(min(sentence_lengths))
+    max_length = float(max(sentence_lengths))
     length_range = max_length - min_length
 
+    # Create single-value distributions (sentence analysis is done on full text)
+    mean_dist = make_distribution([mean_length])
+    std_dist = make_distribution([std_dev])
+    range_dist = make_distribution([length_range])
+    min_dist = make_distribution([min_length])
+    max_dist = make_distribution([max_length])
+
     return SentenceStatsResult(
         mean_sentence_length=mean_length,
         sentence_length_std=std_dev,

@@ -97,6 +133,13 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
         min_sentence_length=min_length,
         max_sentence_length=max_length,
         sentence_count=len(sentence_lengths),
+        mean_sentence_length_dist=mean_dist,
+        sentence_length_std_dist=std_dist,
+        sentence_length_range_dist=range_dist,
+        min_sentence_length_dist=min_dist,
+        max_sentence_length_dist=max_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata={
             "model": model,
             "sentence_lengths": sentence_lengths,
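
Note the type tightening here: `min_sentence_length`, `max_sentence_length`, and `sentence_length_range` are now consistently floats (the empty-text branch returned ints in 1.0.0, and `min()`/`max()` over integer sentence lengths returned ints too). A usage sketch (import path per this diff; sample text illustrative):

    from pystylometry.syntactic.sentence_stats import compute_sentence_stats

    result = compute_sentence_stats("One. Two words. Three words here.")
    result.max_sentence_length         # float now, e.g. 3.0
    result.mean_sentence_length_dist   # new: single-value Distribution of the mean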
pystylometry/syntactic/sentence_types.py (+71 -15)

@@ -27,13 +27,19 @@ References:
     Quirk, R., et al. (1985). A Comprehensive Grammar of the English Language. Longman.
 """
 
-from .._types import SentenceTypeResult
+from typing import Any
+
+from .._types import Distribution, SentenceTypeResult, make_distribution
 from .._utils import check_optional_dependency
 
+# Type alias for spaCy Span (loaded dynamically)
+_SpaCySpan = Any
+
 
 def compute_sentence_types(
     text: str,
     model: str = "en_core_web_sm",
+    chunk_size: int = 1000,
 ) -> SentenceTypeResult:
     """
     Classify sentences by structure and function.

@@ -193,8 +199,7 @@
         nlp = spacy.load(model)
     except OSError as e:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download with: python -m spacy download {model}"
         ) from e
 
     # Parse text

@@ -203,6 +208,14 @@
 
     # Handle empty text
     if len(sentences) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return SentenceTypeResult(
             simple_ratio=float("nan"),
             compound_ratio=float("nan"),

@@ -223,6 +236,18 @@
             total_sentences=0,
             structural_diversity=float("nan"),
             functional_diversity=float("nan"),
+            simple_ratio_dist=empty_dist,
+            compound_ratio_dist=empty_dist,
+            complex_ratio_dist=empty_dist,
+            compound_complex_ratio_dist=empty_dist,
+            declarative_ratio_dist=empty_dist,
+            interrogative_ratio_dist=empty_dist,
+            imperative_ratio_dist=empty_dist,
+            exclamatory_ratio_dist=empty_dist,
+            structural_diversity_dist=empty_dist,
+            functional_diversity_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "warning": "Empty text or no sentences found",
             },

@@ -249,13 +274,15 @@
         functional_counts[functional_type] += 1
 
         # Store classification
-        sentence_classifications.append(
-
-
-
-
-
-
+        sentence_classifications.append(
+            {
+                "text": sent.text,
+                "structural_type": structural_type,
+                "functional_type": functional_type,
+                "independent_clauses": independent_count,
+                "dependent_clauses": dependent_count,
+            }
+        )
 
     # Calculate ratios
     total_sentences = len(sentences)

@@ -271,11 +298,28 @@
 
     # Calculate diversity metrics
     structural_ratios = [simple_ratio, compound_ratio, complex_ratio, compound_complex_ratio]
-    functional_ratios = [
+    functional_ratios = [
+        declarative_ratio,
+        interrogative_ratio,
+        imperative_ratio,
+        exclamatory_ratio,
+    ]
 
     structural_diversity = _calculate_shannon_entropy(structural_ratios)
     functional_diversity = _calculate_shannon_entropy(functional_ratios)
 
+    # Create single-value distributions (sentence analysis is done on full text)
+    simple_ratio_dist = make_distribution([simple_ratio])
+    compound_ratio_dist = make_distribution([compound_ratio])
+    complex_ratio_dist = make_distribution([complex_ratio])
+    compound_complex_ratio_dist = make_distribution([compound_complex_ratio])
+    declarative_ratio_dist = make_distribution([declarative_ratio])
+    interrogative_ratio_dist = make_distribution([interrogative_ratio])
+    imperative_ratio_dist = make_distribution([imperative_ratio])
+    exclamatory_ratio_dist = make_distribution([exclamatory_ratio])
+    structural_diversity_dist = make_distribution([structural_diversity])
+    functional_diversity_dist = make_distribution([functional_diversity])
+
     # Collect metadata
     metadata = {
         "sentence_count": total_sentences,

@@ -306,11 +350,23 @@
         total_sentences=total_sentences,
         structural_diversity=structural_diversity,
         functional_diversity=functional_diversity,
+        simple_ratio_dist=simple_ratio_dist,
+        compound_ratio_dist=compound_ratio_dist,
+        complex_ratio_dist=complex_ratio_dist,
+        compound_complex_ratio_dist=compound_complex_ratio_dist,
+        declarative_ratio_dist=declarative_ratio_dist,
+        interrogative_ratio_dist=interrogative_ratio_dist,
+        imperative_ratio_dist=imperative_ratio_dist,
+        exclamatory_ratio_dist=exclamatory_ratio_dist,
+        structural_diversity_dist=structural_diversity_dist,
+        functional_diversity_dist=functional_diversity_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata=metadata,
     )
 
 
-def _count_independent_clauses(sent) -> int:
+def _count_independent_clauses(sent: _SpaCySpan) -> int:
     """
     Count independent clauses in a sentence.
 

@@ -336,7 +392,7 @@ def _count_independent_clauses(sent) -> int:
     return count
 
 
-def _count_dependent_clauses(sent) -> int:
+def _count_dependent_clauses(sent: _SpaCySpan) -> int:
     """
     Count dependent clauses in a sentence.
 

@@ -382,7 +438,7 @@ def _classify_structural(independent: int, dependent: int) -> str:
     return "simple"
 
 
-def _classify_functional(sent) -> str:
+def _classify_functional(sent: _SpaCySpan) -> str:
     """
     Classify sentence function based on punctuation and structure.
 

@@ -415,7 +471,7 @@ def _classify_functional(sent) -> str:
     return "declarative"
 
 
-def _is_imperative_structure(sent) -> bool:
+def _is_imperative_structure(sent: _SpaCySpan) -> bool:
     """
     Check if sentence has imperative structure.
 
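
A usage sketch for the reworked classifier (import path per this diff; sample text illustrative):

    from pystylometry.syntactic.sentence_types import compute_sentence_types

    result = compute_sentence_types("Stop! Is this a simple sentence? Yes, and it is short.")
    result.simple_ratio, result.compound_ratio   # structural mix
    result.interrogative_ratio                   # functional mix
    result.structural_diversity                  # Shannon entropy over the four structural ratios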
pystylometry/viz/__init__.py (+71 -0, new file)

@@ -0,0 +1,71 @@
+"""Visualization module for pystylometry.
+
+This module provides visualization functions for stylometric analysis results.
+
+Matplotlib Functions (PNG output):
+    Requires optional dependencies: pip install pystylometry[viz]
+
+    plot_drift_timeline: Line chart of chi-squared values over document
+    plot_drift_scatter: Scatter plot with reference zones (tic-tac-toe style)
+    plot_drift_report: Combined multi-panel visualization
+
+Interactive JSX Functions (HTML output):
+    No additional dependencies required (uses React via CDN)
+
+    export_drift_timeline_jsx: Interactive timeline chart
+    export_drift_report_jsx: Interactive multi-panel dashboard
+    export_drift_viewer: Standalone viewer with file upload
+
+Related GitHub Issues:
+    #38 - Visualization Options for Style Drift Detection
+    https://github.com/craigtrim/pystylometry/issues/38
+
+Example:
+    >>> from pystylometry.consistency import compute_kilgarriff_drift
+    >>> from pystylometry.viz import plot_drift_timeline, export_drift_timeline_jsx
+    >>>
+    >>> result = compute_kilgarriff_drift(text)
+    >>> plot_drift_timeline(result, output="timeline.png")  # Static PNG
+    >>> export_drift_timeline_jsx(result, "timeline.html")  # Interactive HTML
+"""
+
+from .drift import (  # noqa: E402
+    plot_drift_report,
+    plot_drift_scatter,
+    plot_drift_timeline,
+)
+from .jsx import (  # noqa: E402
+    export_drift_report_jsx,
+    export_drift_timeline_jsx,
+    export_drift_viewer,
+)
+
+try:
+    import matplotlib  # noqa: F401
+    import seaborn  # noqa: F401  # type: ignore[import-untyped]
+
+    _VIZ_AVAILABLE = True
+except ImportError:
+    _VIZ_AVAILABLE = False
+
+
+def _check_viz_available() -> None:
+    """Raise ImportError if visualization dependencies are not installed."""
+    if not _VIZ_AVAILABLE:
+        raise ImportError(
+            "Visualization requires optional dependencies. "
+            "Install with: pip install pystylometry[viz] or poetry install --with viz"
+        )
+
+
+__all__ = [
+    # Matplotlib (PNG)
+    "plot_drift_timeline",
+    "plot_drift_scatter",
+    "plot_drift_report",
+    # JSX (HTML)
+    "export_drift_timeline_jsx",
+    "export_drift_report_jsx",
+    # Standalone viewer
+    "export_drift_viewer",
+]