pystylometry 1.0.0-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. pystylometry/__init__.py +29 -3
  2. pystylometry/_types.py +963 -259
  3. pystylometry/authorship/__init__.py +23 -2
  4. pystylometry/authorship/additional_methods.py +4 -29
  5. pystylometry/authorship/kilgarriff.py +347 -0
  6. pystylometry/character/character_metrics.py +267 -179
  7. pystylometry/cli.py +427 -0
  8. pystylometry/consistency/__init__.py +57 -0
  9. pystylometry/consistency/_thresholds.py +162 -0
  10. pystylometry/consistency/drift.py +549 -0
  11. pystylometry/dialect/__init__.py +65 -0
  12. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  13. pystylometry/dialect/_loader.py +360 -0
  14. pystylometry/dialect/detector.py +533 -0
  15. pystylometry/lexical/advanced_diversity.py +61 -22
  16. pystylometry/lexical/function_words.py +255 -56
  17. pystylometry/lexical/hapax.py +182 -52
  18. pystylometry/lexical/mtld.py +108 -26
  19. pystylometry/lexical/ttr.py +76 -10
  20. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  21. pystylometry/lexical/yule.py +136 -50
  22. pystylometry/ngrams/entropy.py +150 -49
  23. pystylometry/readability/additional_formulas.py +1887 -762
  24. pystylometry/readability/ari.py +144 -82
  25. pystylometry/readability/coleman_liau.py +136 -109
  26. pystylometry/readability/flesch.py +177 -73
  27. pystylometry/readability/gunning_fog.py +165 -161
  28. pystylometry/readability/smog.py +123 -42
  29. pystylometry/syntactic/advanced_syntactic.py +76 -14
  30. pystylometry/syntactic/pos_ratios.py +70 -6
  31. pystylometry/syntactic/sentence_stats.py +55 -12
  32. pystylometry/syntactic/sentence_types.py +71 -15
  33. pystylometry/viz/__init__.py +71 -0
  34. pystylometry/viz/drift.py +589 -0
  35. pystylometry/viz/jsx/__init__.py +31 -0
  36. pystylometry/viz/jsx/_base.py +144 -0
  37. pystylometry/viz/jsx/report.py +677 -0
  38. pystylometry/viz/jsx/timeline.py +716 -0
  39. pystylometry/viz/jsx/viewer.py +1032 -0
  40. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
  41. pystylometry-1.1.0.dist-info/RECORD +63 -0
  42. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
  43. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  44. pystylometry-1.0.0.dist-info/RECORD +0 -46
@@ -28,13 +28,21 @@ References:
  of linguistic complexity. In Image, language, brain (pp. 95-126).
  """

- from .._types import AdvancedSyntacticResult
+ from typing import Any
+
+ from .._types import AdvancedSyntacticResult, Distribution, make_distribution
  from .._utils import check_optional_dependency

+ # Type aliases for spaCy objects (loaded dynamically)
+ _SpaCyToken = Any
+ _SpaCyDoc = Any
+ _SpaCySpan = Any
+

  def compute_advanced_syntactic(
  text: str,
  model: str = "en_core_web_sm",
+ chunk_size: int = 1000,
  ) -> AdvancedSyntacticResult:
  """
  Compute advanced syntactic complexity metrics using dependency parsing.
@@ -147,7 +155,6 @@ def compute_advanced_syntactic(

  try:
  import spacy # type: ignore
- from spacy.tokens import Doc, Span, Token # type: ignore
  except ImportError as e:
  raise ImportError(
  "spaCy is required for advanced syntactic analysis. "
@@ -159,8 +166,7 @@ def compute_advanced_syntactic(
  nlp = spacy.load(model)
  except OSError as e:
  raise OSError(
- f"spaCy model '{model}' not found. "
- f"Download with: python -m spacy download {model}"
+ f"spaCy model '{model}' not found. Download with: python -m spacy download {model}"
  ) from e

  # Parse text
@@ -169,6 +175,14 @@ def compute_advanced_syntactic(

  # Handle empty text
  if len(sentences) == 0 or len(doc) == 0:
+ empty_dist = Distribution(
+ values=[],
+ mean=float("nan"),
+ median=float("nan"),
+ std=0.0,
+ range=0.0,
+ iqr=0.0,
+ )
  return AdvancedSyntacticResult(
  mean_parse_tree_depth=float("nan"),
  max_parse_tree_depth=0,
@@ -183,6 +197,20 @@ def compute_advanced_syntactic(
  dependency_distance=float("nan"),
  left_branching_ratio=float("nan"),
  right_branching_ratio=float("nan"),
+ mean_parse_tree_depth_dist=empty_dist,
+ max_parse_tree_depth_dist=empty_dist,
+ mean_t_unit_length_dist=empty_dist,
+ clausal_density_dist=empty_dist,
+ dependent_clause_ratio_dist=empty_dist,
+ passive_voice_ratio_dist=empty_dist,
+ subordination_index_dist=empty_dist,
+ coordination_index_dist=empty_dist,
+ sentence_complexity_score_dist=empty_dist,
+ dependency_distance_dist=empty_dist,
+ left_branching_ratio_dist=empty_dist,
+ right_branching_ratio_dist=empty_dist,
+ chunk_size=chunk_size,
+ chunk_count=0,
  metadata={
  "sentence_count": 0,
  "word_count": 0,
@@ -229,9 +257,7 @@ def compute_advanced_syntactic(
  coordinate_clause_count = 0

  for sent in sentences:
- sent_total, sent_dependent, sent_subordinate, sent_coordinate = _count_clauses(
- sent
- )
+ sent_total, sent_dependent, sent_subordinate, sent_coordinate = _count_clauses(sent)
  total_clauses += sent_total
  dependent_clause_count += sent_dependent
  subordinate_clause_count += sent_subordinate
@@ -279,14 +305,22 @@ def compute_advanced_syntactic(
  # Normalize individual metrics to 0-1 range
  normalized_parse_depth = min(mean_parse_tree_depth / 10, 1.0)
  normalized_clausal_density = (
- min(clausal_density / 3, 1.0) if not isinstance(clausal_density, float) or not (clausal_density != clausal_density) else 0.0
+ min(clausal_density / 3, 1.0)
+ if not isinstance(clausal_density, float) or not (clausal_density != clausal_density)
+ else 0.0
  )
  normalized_t_unit_length = (
- min(mean_t_unit_length / 25, 1.0) if not isinstance(mean_t_unit_length, float) or not (mean_t_unit_length != mean_t_unit_length) else 0.0
+ min(mean_t_unit_length / 25, 1.0)
+ if not isinstance(mean_t_unit_length, float)
+ or not (mean_t_unit_length != mean_t_unit_length)
+ else 0.0
  )
  normalized_dependency_distance = min(mean_dependency_distance / 5, 1.0)
  normalized_subordination = (
- subordination_index if not isinstance(subordination_index, float) or not (subordination_index != subordination_index) else 0.0
+ subordination_index
+ if not isinstance(subordination_index, float)
+ or not (subordination_index != subordination_index)
+ else 0.0
  )

  # Weighted combination
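Note: the `not (value != value)` expressions reflowed in the hunk above are NaN guards. NaN is the only float that compares unequal to itself, so `value != value` is True exactly when `value` is NaN. A minimal illustration (not part of the diff; `math.isnan` is the standard-library equivalent):

```python
import math

x = float("nan")
assert x != x            # only NaN compares unequal to itself
assert math.isnan(x)     # equivalent, more readable spelling
assert not (1.0 != 1.0)  # ordinary floats are equal to themselves
```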
@@ -298,6 +332,20 @@ def compute_advanced_syntactic(
  + 0.1 * normalized_dependency_distance
  )

+ # Create single-value distributions (analysis is done on full text)
+ mean_parse_tree_depth_dist = make_distribution([mean_parse_tree_depth])
+ max_parse_tree_depth_dist = make_distribution([float(max_parse_tree_depth)])
+ mean_t_unit_length_dist = make_distribution([mean_t_unit_length])
+ clausal_density_dist = make_distribution([clausal_density])
+ dependent_clause_ratio_dist = make_distribution([dependent_clause_ratio])
+ passive_voice_ratio_dist = make_distribution([passive_voice_ratio])
+ subordination_index_dist = make_distribution([subordination_index])
+ coordination_index_dist = make_distribution([coordination_index])
+ sentence_complexity_score_dist = make_distribution([sentence_complexity_score])
+ dependency_distance_dist = make_distribution([mean_dependency_distance])
+ left_branching_ratio_dist = make_distribution([left_branching_ratio])
+ right_branching_ratio_dist = make_distribution([right_branching_ratio])
+
  # Collect metadata
  metadata = {
  "sentence_count": len(sentences),
@@ -331,11 +379,25 @@ def compute_advanced_syntactic(
  dependency_distance=mean_dependency_distance,
  left_branching_ratio=left_branching_ratio,
  right_branching_ratio=right_branching_ratio,
+ mean_parse_tree_depth_dist=mean_parse_tree_depth_dist,
+ max_parse_tree_depth_dist=max_parse_tree_depth_dist,
+ mean_t_unit_length_dist=mean_t_unit_length_dist,
+ clausal_density_dist=clausal_density_dist,
+ dependent_clause_ratio_dist=dependent_clause_ratio_dist,
+ passive_voice_ratio_dist=passive_voice_ratio_dist,
+ subordination_index_dist=subordination_index_dist,
+ coordination_index_dist=coordination_index_dist,
+ sentence_complexity_score_dist=sentence_complexity_score_dist,
+ dependency_distance_dist=dependency_distance_dist,
+ left_branching_ratio_dist=left_branching_ratio_dist,
+ right_branching_ratio_dist=right_branching_ratio_dist,
+ chunk_size=chunk_size,
+ chunk_count=1, # Single pass analysis
  metadata=metadata,
  )


- def _calculate_max_tree_depth(token) -> int:
+ def _calculate_max_tree_depth(token: _SpaCyToken) -> int:
  """
  Calculate maximum depth of dependency tree starting from token.

@@ -352,7 +414,7 @@ def _calculate_max_tree_depth(token) -> int:
  return max(child_depths) + 1


- def _identify_t_units(doc) -> list:
+ def _identify_t_units(doc: _SpaCyDoc) -> list[_SpaCySpan]:
  """
  Identify T-units (minimal terminable units) in document.

@@ -371,7 +433,7 @@ def _identify_t_units(doc) -> list:
  return list(doc.sents)


- def _count_clauses(sent) -> tuple[int, int, int, int]:
+ def _count_clauses(sent: _SpaCySpan) -> tuple[int, int, int, int]:
  """
  Count different types of clauses in sentence.

@@ -406,7 +468,7 @@ def _count_clauses(sent) -> tuple[int, int, int, int]:
  return total, dependent, subordinate, coordinate


- def _is_passive_voice(sent) -> bool:
+ def _is_passive_voice(sent: _SpaCySpan) -> bool:
  """
  Detect if sentence contains passive voice construction.

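Note: taken together, the hunks above add a `chunk_size` parameter, `chunk_count`, and `*_dist` companion fields to `AdvancedSyntacticResult` while leaving the scalar metrics unchanged. A usage sketch (illustrative only; it assumes spaCy and the `en_core_web_sm` model are installed and that the function is imported from the module path listed above, which may also be re-exported elsewhere):

```python
from pystylometry.syntactic.advanced_syntactic import compute_advanced_syntactic

result = compute_advanced_syntactic(
    "The cat sat. The dog, which was old, slept soundly.",
    model="en_core_web_sm",
    chunk_size=1000,  # accepted for API consistency; analysis is still a single pass
)
print(result.mean_parse_tree_depth)            # scalar metric (pre-existing field)
print(result.mean_parse_tree_depth_dist.mean)  # new Distribution companion field
print(result.chunk_count)                      # 1 for non-empty text, 0 for empty text
```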
@@ -1,10 +1,17 @@
- """Part-of-Speech ratio analysis using spaCy."""
+ """Part-of-Speech ratio analysis using spaCy.

- from .._types import POSResult
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+ """
+
+ from .._types import Distribution, POSResult, make_distribution
  from .._utils import check_optional_dependency


- def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
+ def compute_pos_ratios(
+ text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+ ) -> POSResult:
  """
  Compute Part-of-Speech ratios and lexical density using spaCy.

@@ -18,6 +25,10 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
  - Lexical density: (nouns + verbs + adjectives + adverbs) / total words
  - Function word ratio: (determiners + prepositions + conjunctions) / total words

+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
  References:
  Biber, D. (1988). Variation across speech and writing.
  Cambridge University Press.
@@ -25,9 +36,13 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
  Args:
  text: Input text to analyze
  model: spaCy model name (default: "en_core_web_sm")
+ chunk_size: Number of words per chunk (default: 1000).
+ Note: POS analysis is performed on the full text for accuracy,
+ so this parameter is included for API consistency but actual
+ results are from a single pass.

  Returns:
- POSResult with all POS ratios and metadata
+ POSResult with all POS ratios, distributions, and metadata

  Raises:
  ImportError: If spaCy is not installed
@@ -47,8 +62,7 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
  nlp = spacy.load(model)
  except OSError:
  raise OSError(
- f"spaCy model '{model}' not found. "
- f"Download it with: python -m spacy download {model}"
+ f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
  )

  # Process text with spaCy
@@ -89,6 +103,14 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:

  # Handle empty text
  if total_tokens == 0:
+ empty_dist = Distribution(
+ values=[],
+ mean=float("nan"),
+ median=float("nan"),
+ std=0.0,
+ range=0.0,
+ iqr=0.0,
+ )
  return POSResult(
  noun_ratio=float("nan"),
  verb_ratio=float("nan"),
@@ -98,6 +120,16 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
  adjective_noun_ratio=float("nan"),
  lexical_density=float("nan"),
  function_word_ratio=float("nan"),
+ noun_ratio_dist=empty_dist,
+ verb_ratio_dist=empty_dist,
+ adjective_ratio_dist=empty_dist,
+ adverb_ratio_dist=empty_dist,
+ noun_verb_ratio_dist=empty_dist,
+ adjective_noun_ratio_dist=empty_dist,
+ lexical_density_dist=empty_dist,
+ function_word_ratio_dist=empty_dist,
+ chunk_size=chunk_size,
+ chunk_count=0,
  metadata={
  "model": model,
  "token_count": 0,
@@ -129,6 +161,28 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
  function_words = det_count + adp_count + conj_count
  function_word_ratio = function_words / total_tokens

+ # Create single-value distributions (POS analysis is done on full text)
+ noun_ratio_dist = make_distribution([noun_ratio])
+ verb_ratio_dist = make_distribution([verb_ratio])
+ adj_ratio_dist = make_distribution([adj_ratio])
+ adv_ratio_dist = make_distribution([adv_ratio])
+ noun_verb_dist = (
+ make_distribution([noun_verb_ratio])
+ if not (noun_verb_ratio != noun_verb_ratio)
+ else Distribution(
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+ )
+ )
+ adj_noun_dist = (
+ make_distribution([adj_noun_ratio])
+ if not (adj_noun_ratio != adj_noun_ratio)
+ else Distribution(
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+ )
+ )
+ lexical_density_dist = make_distribution([lexical_density])
+ function_word_dist = make_distribution([function_word_ratio])
+
  return POSResult(
  noun_ratio=noun_ratio,
  verb_ratio=verb_ratio,
@@ -138,6 +192,16 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
  adjective_noun_ratio=adj_noun_ratio,
  lexical_density=lexical_density,
  function_word_ratio=function_word_ratio,
+ noun_ratio_dist=noun_ratio_dist,
+ verb_ratio_dist=verb_ratio_dist,
+ adjective_ratio_dist=adj_ratio_dist,
+ adverb_ratio_dist=adv_ratio_dist,
+ noun_verb_ratio_dist=noun_verb_dist,
+ adjective_noun_ratio_dist=adj_noun_dist,
+ lexical_density_dist=lexical_density_dist,
+ function_word_ratio_dist=function_word_dist,
+ chunk_size=chunk_size,
+ chunk_count=1, # Single pass analysis
  metadata={
  "model": model,
  "token_count": total_tokens,
@@ -1,10 +1,17 @@
- """Sentence-level statistics using spaCy."""
+ """Sentence-level statistics using spaCy.

- from .._types import SentenceStatsResult
- from .._utils import check_optional_dependency, split_sentences
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+ """

+ from .._types import Distribution, SentenceStatsResult, make_distribution
+ from .._utils import check_optional_dependency

- def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
+
+ def compute_sentence_stats(
+ text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+ ) -> SentenceStatsResult:
  """
  Compute sentence-level statistics using spaCy.

@@ -16,6 +23,10 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
  - Maximum sentence length
  - Total sentence count

+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
  References:
  Hunt, K. W. (1965). Grammatical structures written at three grade levels.
  NCTE Research Report No. 3.
@@ -23,9 +34,13 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
  Args:
  text: Input text to analyze
  model: spaCy model name (default: "en_core_web_sm")
+ chunk_size: Number of words per chunk (default: 1000).
+ Note: Sentence analysis is performed on the full text for accuracy,
+ so this parameter is included for API consistency but actual
+ results are from a single pass.

  Returns:
- SentenceStatsResult with sentence statistics and metadata
+ SentenceStatsResult with sentence statistics, distributions, and metadata

  Raises:
  ImportError: If spaCy is not installed
@@ -45,8 +60,7 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
  nlp = spacy.load(model)
  except OSError:
  raise OSError(
- f"spaCy model '{model}' not found. "
- f"Download it with: python -m spacy download {model}"
+ f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
  )

  # Process text with spaCy
@@ -62,13 +76,28 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence

  # Handle empty text
  if len(sentence_lengths) == 0:
+ empty_dist = Distribution(
+ values=[],
+ mean=float("nan"),
+ median=float("nan"),
+ std=0.0,
+ range=0.0,
+ iqr=0.0,
+ )
  return SentenceStatsResult(
  mean_sentence_length=float("nan"),
  sentence_length_std=float("nan"),
- sentence_length_range=0,
- min_sentence_length=0,
- max_sentence_length=0,
+ sentence_length_range=0.0,
+ min_sentence_length=0.0,
+ max_sentence_length=0.0,
  sentence_count=0,
+ mean_sentence_length_dist=empty_dist,
+ sentence_length_std_dist=empty_dist,
+ sentence_length_range_dist=empty_dist,
+ min_sentence_length_dist=empty_dist,
+ max_sentence_length_dist=empty_dist,
+ chunk_size=chunk_size,
+ chunk_count=0,
  metadata={
  "model": model,
  },
@@ -86,10 +115,17 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
  else:
  std_dev = 0.0

- min_length = min(sentence_lengths)
- max_length = max(sentence_lengths)
+ min_length = float(min(sentence_lengths))
+ max_length = float(max(sentence_lengths))
  length_range = max_length - min_length

+ # Create single-value distributions (sentence analysis is done on full text)
+ mean_dist = make_distribution([mean_length])
+ std_dist = make_distribution([std_dev])
+ range_dist = make_distribution([length_range])
+ min_dist = make_distribution([min_length])
+ max_dist = make_distribution([max_length])
+
  return SentenceStatsResult(
  mean_sentence_length=mean_length,
  sentence_length_std=std_dev,
@@ -97,6 +133,13 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
  min_sentence_length=min_length,
  max_sentence_length=max_length,
  sentence_count=len(sentence_lengths),
+ mean_sentence_length_dist=mean_dist,
+ sentence_length_std_dist=std_dist,
+ sentence_length_range_dist=range_dist,
+ min_sentence_length_dist=min_dist,
+ max_sentence_length_dist=max_dist,
+ chunk_size=chunk_size,
+ chunk_count=1, # Single pass analysis
  metadata={
  "model": model,
  "sentence_lengths": sentence_lengths,
@@ -27,13 +27,19 @@ References:
  Quirk, R., et al. (1985). A Comprehensive Grammar of the English Language. Longman.
  """

- from .._types import SentenceTypeResult
+ from typing import Any
+
+ from .._types import Distribution, SentenceTypeResult, make_distribution
  from .._utils import check_optional_dependency

+ # Type alias for spaCy Span (loaded dynamically)
+ _SpaCySpan = Any
+

  def compute_sentence_types(
  text: str,
  model: str = "en_core_web_sm",
+ chunk_size: int = 1000,
  ) -> SentenceTypeResult:
  """
  Classify sentences by structure and function.
@@ -193,8 +199,7 @@ def compute_sentence_types(
  nlp = spacy.load(model)
  except OSError as e:
  raise OSError(
- f"spaCy model '{model}' not found. "
- f"Download with: python -m spacy download {model}"
+ f"spaCy model '{model}' not found. Download with: python -m spacy download {model}"
  ) from e

  # Parse text
@@ -203,6 +208,14 @@ def compute_sentence_types(

  # Handle empty text
  if len(sentences) == 0:
+ empty_dist = Distribution(
+ values=[],
+ mean=float("nan"),
+ median=float("nan"),
+ std=0.0,
+ range=0.0,
+ iqr=0.0,
+ )
  return SentenceTypeResult(
  simple_ratio=float("nan"),
  compound_ratio=float("nan"),
@@ -223,6 +236,18 @@ def compute_sentence_types(
  total_sentences=0,
  structural_diversity=float("nan"),
  functional_diversity=float("nan"),
+ simple_ratio_dist=empty_dist,
+ compound_ratio_dist=empty_dist,
+ complex_ratio_dist=empty_dist,
+ compound_complex_ratio_dist=empty_dist,
+ declarative_ratio_dist=empty_dist,
+ interrogative_ratio_dist=empty_dist,
+ imperative_ratio_dist=empty_dist,
+ exclamatory_ratio_dist=empty_dist,
+ structural_diversity_dist=empty_dist,
+ functional_diversity_dist=empty_dist,
+ chunk_size=chunk_size,
+ chunk_count=0,
  metadata={
  "warning": "Empty text or no sentences found",
  },
@@ -249,13 +274,15 @@ def compute_sentence_types(
  functional_counts[functional_type] += 1

  # Store classification
- sentence_classifications.append({
- "text": sent.text,
- "structural_type": structural_type,
- "functional_type": functional_type,
- "independent_clauses": independent_count,
- "dependent_clauses": dependent_count,
- })
+ sentence_classifications.append(
+ {
+ "text": sent.text,
+ "structural_type": structural_type,
+ "functional_type": functional_type,
+ "independent_clauses": independent_count,
+ "dependent_clauses": dependent_count,
+ }
+ )

  # Calculate ratios
  total_sentences = len(sentences)
@@ -271,11 +298,28 @@ def compute_sentence_types(

  # Calculate diversity metrics
  structural_ratios = [simple_ratio, compound_ratio, complex_ratio, compound_complex_ratio]
- functional_ratios = [declarative_ratio, interrogative_ratio, imperative_ratio, exclamatory_ratio]
+ functional_ratios = [
+ declarative_ratio,
+ interrogative_ratio,
+ imperative_ratio,
+ exclamatory_ratio,
+ ]

  structural_diversity = _calculate_shannon_entropy(structural_ratios)
  functional_diversity = _calculate_shannon_entropy(functional_ratios)

+ # Create single-value distributions (sentence analysis is done on full text)
+ simple_ratio_dist = make_distribution([simple_ratio])
+ compound_ratio_dist = make_distribution([compound_ratio])
+ complex_ratio_dist = make_distribution([complex_ratio])
+ compound_complex_ratio_dist = make_distribution([compound_complex_ratio])
+ declarative_ratio_dist = make_distribution([declarative_ratio])
+ interrogative_ratio_dist = make_distribution([interrogative_ratio])
+ imperative_ratio_dist = make_distribution([imperative_ratio])
+ exclamatory_ratio_dist = make_distribution([exclamatory_ratio])
+ structural_diversity_dist = make_distribution([structural_diversity])
+ functional_diversity_dist = make_distribution([functional_diversity])
+
  # Collect metadata
  metadata = {
  "sentence_count": total_sentences,
@@ -306,11 +350,23 @@ def compute_sentence_types(
  total_sentences=total_sentences,
  structural_diversity=structural_diversity,
  functional_diversity=functional_diversity,
+ simple_ratio_dist=simple_ratio_dist,
+ compound_ratio_dist=compound_ratio_dist,
+ complex_ratio_dist=complex_ratio_dist,
+ compound_complex_ratio_dist=compound_complex_ratio_dist,
+ declarative_ratio_dist=declarative_ratio_dist,
+ interrogative_ratio_dist=interrogative_ratio_dist,
+ imperative_ratio_dist=imperative_ratio_dist,
+ exclamatory_ratio_dist=exclamatory_ratio_dist,
+ structural_diversity_dist=structural_diversity_dist,
+ functional_diversity_dist=functional_diversity_dist,
+ chunk_size=chunk_size,
+ chunk_count=1, # Single pass analysis
  metadata=metadata,
  )


- def _count_independent_clauses(sent) -> int:
+ def _count_independent_clauses(sent: _SpaCySpan) -> int:
  """
  Count independent clauses in a sentence.

@@ -336,7 +392,7 @@ def _count_independent_clauses(sent) -> int:
  return count


- def _count_dependent_clauses(sent) -> int:
+ def _count_dependent_clauses(sent: _SpaCySpan) -> int:
  """
  Count dependent clauses in a sentence.

@@ -382,7 +438,7 @@ def _classify_structural(independent: int, dependent: int) -> str:
  return "simple"


- def _classify_functional(sent) -> str:
+ def _classify_functional(sent: _SpaCySpan) -> str:
  """
  Classify sentence function based on punctuation and structure.

@@ -415,7 +471,7 @@ def _classify_functional(sent) -> str:
  return "declarative"


- def _is_imperative_structure(sent) -> bool:
+ def _is_imperative_structure(sent: _SpaCySpan) -> bool:
  """
  Check if sentence has imperative structure.

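Note: `structural_diversity` and `functional_diversity` are computed by `_calculate_shannon_entropy`, whose body is outside these hunks. For reference, Shannon entropy over a ratio vector is commonly computed as below; the log base and any normalization used by pystylometry are not shown in this diff, so treat this as an assumption rather than the package's implementation:

```python
import math


def shannon_entropy_sketch(ratios: list[float]) -> float:
    """Entropy (in bits) of a ratio vector; zero ratios contribute nothing."""
    return sum(-p * math.log2(p) for p in ratios if p > 0)


print(shannon_entropy_sketch([0.25, 0.25, 0.25, 0.25]))  # 2.0 bits: evenly mixed sentence types
print(shannon_entropy_sketch([1.0, 0.0, 0.0, 0.0]))      # 0.0 bits: every sentence the same type
```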
@@ -0,0 +1,71 @@
+ """Visualization module for pystylometry.
+
+ This module provides visualization functions for stylometric analysis results.
+
+ Matplotlib Functions (PNG output):
+ Requires optional dependencies: pip install pystylometry[viz]
+
+ plot_drift_timeline: Line chart of chi-squared values over document
+ plot_drift_scatter: Scatter plot with reference zones (tic-tac-toe style)
+ plot_drift_report: Combined multi-panel visualization
+
+ Interactive JSX Functions (HTML output):
+ No additional dependencies required (uses React via CDN)
+
+ export_drift_timeline_jsx: Interactive timeline chart
+ export_drift_report_jsx: Interactive multi-panel dashboard
+ export_drift_viewer: Standalone viewer with file upload
+
+ Related GitHub Issues:
+ #38 - Visualization Options for Style Drift Detection
+ https://github.com/craigtrim/pystylometry/issues/38
+
+ Example:
+ >>> from pystylometry.consistency import compute_kilgarriff_drift
+ >>> from pystylometry.viz import plot_drift_timeline, export_drift_timeline_jsx
+ >>>
+ >>> result = compute_kilgarriff_drift(text)
+ >>> plot_drift_timeline(result, output="timeline.png") # Static PNG
+ >>> export_drift_timeline_jsx(result, "timeline.html") # Interactive HTML
+ """
+
+ from .drift import ( # noqa: E402
+ plot_drift_report,
+ plot_drift_scatter,
+ plot_drift_timeline,
+ )
+ from .jsx import ( # noqa: E402
+ export_drift_report_jsx,
+ export_drift_timeline_jsx,
+ export_drift_viewer,
+ )
+
+ try:
+ import matplotlib # noqa: F401
+ import seaborn # noqa: F401 # type: ignore[import-untyped]
+
+ _VIZ_AVAILABLE = True
+ except ImportError:
+ _VIZ_AVAILABLE = False
+
+
+ def _check_viz_available() -> None:
+ """Raise ImportError if visualization dependencies are not installed."""
+ if not _VIZ_AVAILABLE:
+ raise ImportError(
+ "Visualization requires optional dependencies. "
+ "Install with: pip install pystylometry[viz] or poetry install --with viz"
+ )
+
+
+ __all__ = [
+ # Matplotlib (PNG)
+ "plot_drift_timeline",
+ "plot_drift_scatter",
+ "plot_drift_report",
+ # JSX (HTML)
+ "export_drift_timeline_jsx",
+ "export_drift_report_jsx",
+ # Standalone viewer
+ "export_drift_viewer",
+ ]
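Note: the `_VIZ_AVAILABLE` guard means matplotlib/seaborn are only needed for the PNG functions, while the JSX exporters emit plain HTML with no extra dependencies. A fallback sketch built from the calls shown in the docstring above (the try/except logic is illustrative and assumes the PNG path raises the ImportError from `_check_viz_available` when the `viz` extras are missing):

```python
from pystylometry.consistency import compute_kilgarriff_drift
from pystylometry.viz import export_drift_timeline_jsx, plot_drift_timeline

text = "Your document text goes here. " * 200  # placeholder input
result = compute_kilgarriff_drift(text)

try:
    plot_drift_timeline(result, output="timeline.png")  # PNG path: needs pystylometry[viz]
except ImportError:
    # matplotlib/seaborn missing: fall back to the dependency-free HTML export
    export_drift_timeline_jsx(result, "timeline.html")
```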