pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/lexical/advanced_diversity.py

@@ -32,7 +32,13 @@ References:
 import random
 from typing import Optional
 
-from .._types import HDDResult, MATTRResult, MSTTRResult, VocdDResult
+from .._types import (
+    HDDResult,
+    MATTRResult,
+    MSTTRResult,
+    VocdDResult,
+    make_distribution,
+)
 
 
 def _tokenize_for_diversity(text: str) -> list[str]:
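The newly imported make_distribution helper is the common thread of this release: every metric function now also returns Distribution objects built from one or more per-chunk values. The real implementation lives in pystylometry/_types.py and is not shown in this diff; the following is only a minimal sketch of a helper compatible with how it is used in these hunks, with field names (values, mean, median, std, range, iqr) taken from the Distribution(...) construction that appears later in function_words.py.

```python
# Minimal sketch, not the library's actual code: a make_distribution compatible
# with the calls added in this diff. Field names come from the Distribution(...)
# construction in function_words.py; everything else is an assumption.
from dataclasses import dataclass
import statistics


@dataclass
class Distribution:
    values: list[float]
    mean: float
    median: float
    std: float
    range: float
    iqr: float


def make_distribution(values: list[float]) -> Distribution:
    if not values:
        # Mirrors the empty-text edge case shown in function_words.py.
        return Distribution([], float("nan"), float("nan"), 0.0, 0.0, 0.0)
    # quantiles() needs at least two points; a single value has zero spread.
    q = statistics.quantiles(values, n=4) if len(values) >= 2 else [values[0]] * 3
    return Distribution(
        values=list(values),
        mean=statistics.fmean(values),
        median=statistics.median(values),
        std=statistics.pstdev(values),
        range=max(values) - min(values),
        iqr=q[2] - q[0],
    )
```

With a single-element input, which is what every call added by this diff passes, mean and median collapse to the value itself and std, range, and iqr are zero, so the new *_dist fields simply mirror the scalar scores for a single-pass analysis.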
@@ -61,13 +67,13 @@ def _tokenize_for_diversity(text: str) -> list[str]:
     raw_tokens = text_lower.split()
 
     # Comprehensive punctuation set for stripping
-    PUNCTUATION = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„""''‚'")
+    punctuation_chars = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„""''‚'")
 
     # Strip punctuation from each token
     tokens = []
     for token in raw_tokens:
         # Strip leading and trailing punctuation
-        clean_token = token.strip("".join(PUNCTUATION))
+        clean_token = token.strip("".join(punctuation_chars))
         if clean_token:  # Only add non-empty tokens
             tokens.append(clean_token)
 
@@ -80,6 +86,7 @@ def compute_vocd_d(
     num_samples: int = 100,
     min_tokens: int = 100,
     random_seed: Optional[int] = None,
+    chunk_size: int = 1000,
 ) -> VocdDResult:
     """
     Compute voc-D (vocabulary D) using curve-fitting approach.
@@ -167,9 +174,7 @@ def compute_vocd_d(
 
     # Step 2: Validate minimum length
     if total_tokens < min_tokens:
-        raise ValueError(
-            f"Text has {total_tokens} tokens, minimum {min_tokens} required for voc-D"
-        )
+        raise ValueError(f"Text has {total_tokens} tokens, minimum {min_tokens} required for voc-D")
 
     # Step 3: Determine sample sizes to test
     # Test from 10 tokens up to min(100, total_tokens - 10)
@@ -212,12 +217,12 @@
         numerator += ttr / (size**0.5)
         denominator += 1.0 / size
 
-    D = numerator / denominator if denominator > 0 else 0.0
+    d_param = numerator / denominator if denominator > 0 else 0.0
 
     # Step 6: Calculate R² (goodness of fit)
     # Predicted TTR = D / sqrt(sample_size)
     y_actual = list(sample_size_to_mean_ttr.values())
-    y_predicted = [D / (size**0.5) for size in sample_sizes]
+    y_predicted = [d_param / (size**0.5) for size in sample_sizes]
 
     # R² calculation
     mean_y = sum(y_actual) / len(y_actual)
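The rename from D to d_param does not change the estimator. The accumulation above is exactly the closed-form least-squares solution for the simplified model this code fits, TTR(n) ≈ D/√n, applied to the mean TTR observed at each sample size n_i; the R² block that follows (only partly visible in this hunk) appears to be the usual goodness-of-fit ratio:

$$
D = \frac{\sum_i \mathrm{TTR}_i / \sqrt{n_i}}{\sum_i 1 / n_i},
\qquad
R^2 = 1 - \frac{\sum_i \left(\mathrm{TTR}_i - D/\sqrt{n_i}\right)^2}{\sum_i \left(\mathrm{TTR}_i - \overline{\mathrm{TTR}}\right)^2}.
$$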
@@ -237,17 +242,25 @@ def compute_vocd_d(
         "random_seed": random_seed,
     }
 
-    # Step 8: Return result
+    # Step 8: Create distributions (single-pass analysis)
+    d_parameter_dist = make_distribution([d_param])
+    curve_fit_r_squared_dist = make_distribution([r_squared])
+
+    # Step 9: Return result
     return VocdDResult(
-        d_parameter=D,
+        d_parameter=d_param,
         curve_fit_r_squared=r_squared,
         sample_count=len(sample_sizes),
         optimal_sample_size=sample_size,  # Input parameter
+        d_parameter_dist=d_parameter_dist,
+        curve_fit_r_squared_dist=curve_fit_r_squared_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata=metadata,
     )
 
 
-def compute_mattr(text: str, window_size: int = 50) -> MATTRResult:
+def compute_mattr(text: str, window_size: int = 50, chunk_size: int = 1000) -> MATTRResult:
     """
     Compute Moving-Average Type-Token Ratio (MATTR).
 
@@ -360,7 +373,13 @@ def compute_mattr(text: str, window_size: int = 50) -> MATTRResult:
         "last_window_ttr": window_ttrs[-1],
     }
 
-    # Step 7: Return result
+    # Step 7: Create distributions (single-pass analysis)
+    mattr_score_dist = make_distribution([mattr_score])
+    ttr_std_dev_dist = make_distribution([ttr_std_dev])
+    min_ttr_dist = make_distribution([min_ttr])
+    max_ttr_dist = make_distribution([max_ttr])
+
+    # Step 8: Return result
     return MATTRResult(
         mattr_score=mattr_score,
         window_size=window_size,
@@ -368,11 +387,17 @@ def compute_mattr(text: str, window_size: int = 50) -> MATTRResult:
         ttr_std_dev=ttr_std_dev,
         min_ttr=min_ttr,
         max_ttr=max_ttr,
+        mattr_score_dist=mattr_score_dist,
+        ttr_std_dev_dist=ttr_std_dev_dist,
+        min_ttr_dist=min_ttr_dist,
+        max_ttr_dist=max_ttr_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata=metadata,
     )
 
 
-def compute_hdd(text: str, sample_size: int = 42) -> HDDResult:
+def compute_hdd(text: str, sample_size: int = 42, chunk_size: int = 1000) -> HDDResult:
     """
     Compute HD-D (Hypergeometric Distribution D).
 
@@ -451,9 +476,7 @@ def compute_hdd(text: str, sample_size: int = 42) -> HDDResult:
 
     # Step 2: Validate minimum length
     if total_tokens < sample_size:
-        raise ValueError(
-            f"Text has {total_tokens} tokens, minimum {sample_size} required for HD-D"
-        )
+        raise ValueError(f"Text has {total_tokens} tokens, minimum {sample_size} required for HD-D")
 
     # Step 3: Build frequency distribution
     type_counts: dict[str, int] = {}
@@ -485,17 +508,23 @@ def compute_hdd(text: str, sample_size: int = 42) -> HDDResult:
         "calculation_method": "simplified",
     }
 
-    # Step 6: Return result
+    # Step 6: Create distribution (single-pass analysis)
+    hdd_score_dist = make_distribution([hdd_sum])
+
+    # Step 7: Return result
     return HDDResult(
         hdd_score=hdd_sum,
         sample_size=sample_size,
         type_count=total_types,
         token_count=total_tokens,
+        hdd_score_dist=hdd_score_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata=metadata,
     )
 
 
-def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:
+def compute_msttr(text: str, segment_size: int = 100, chunk_size: int = 1000) -> MSTTRResult:
     """
     Compute Mean Segmental Type-Token Ratio (MSTTR).
 
@@ -604,9 +633,7 @@ def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:
 
     # Step 6: Calculate statistics
    # Standard deviation
-    variance = sum((ttr - msttr_score) ** 2 for ttr in segment_ttrs) / len(
-        segment_ttrs
-    )
+    variance = sum((ttr - msttr_score) ** 2 for ttr in segment_ttrs) / len(segment_ttrs)
     ttr_std_dev = variance**0.5
 
     # Min and max
@@ -628,7 +655,13 @@ def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:
         "last_segment_ttr": segment_ttrs[-1],
     }
 
-    # Step 9: Return result
+    # Step 9: Create distributions (single-pass analysis)
+    msttr_score_dist = make_distribution([msttr_score])
+    ttr_std_dev_dist = make_distribution([ttr_std_dev])
+    min_ttr_dist = make_distribution([min_ttr])
+    max_ttr_dist = make_distribution([max_ttr])
+
+    # Step 10: Return result
     return MSTTRResult(
         msttr_score=msttr_score,
         segment_size=segment_size,
@@ -637,5 +670,11 @@ def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:
         min_ttr=min_ttr,
         max_ttr=max_ttr,
         segment_ttrs=segment_ttrs,
+        msttr_score_dist=msttr_score_dist,
+        ttr_std_dev_dist=ttr_std_dev_dist,
+        min_ttr_dist=min_ttr_dist,
+        max_ttr_dist=max_ttr_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata=metadata,
     )
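That closes out the advanced_diversity.py changes, and they are additive: existing keyword arguments keep their defaults and the scalar scores are computed exactly as in 1.0.0, so only callers that opt into the new fields see a difference. A rough usage sketch under that assumption (the module path is taken from the file list above, attribute access on the result types is assumed, and the sample file name is a placeholder):

```python
from pystylometry.lexical.advanced_diversity import compute_hdd, compute_mattr, compute_msttr

# Placeholder: any English text long enough to satisfy the minimum-token checks.
text = open("sample.txt", encoding="utf-8").read()

mattr = compute_mattr(text, window_size=50, chunk_size=1000)
hdd = compute_hdd(text, sample_size=42, chunk_size=1000)
msttr = compute_msttr(text, segment_size=100, chunk_size=1000)

# Scalar scores keep their 1.0.0 meaning...
print(mattr.mattr_score, hdd.hdd_score, msttr.msttr_score)

# ...and each result now also carries single-value distributions plus chunk bookkeeping.
print(mattr.mattr_score_dist.mean, mattr.chunk_size, mattr.chunk_count)  # chunk_count == 1
```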
pystylometry/lexical/function_words.py

@@ -32,8 +32,7 @@ References:
     words for authorship attribution. ACH/ALLC.
 """
 
-from .._types import FunctionWordResult
-
+from .._types import Distribution, FunctionWordResult, make_distribution
 
 # Function word lists for English
 # GitHub Issue #13: https://github.com/craigtrim/pystylometry/issues/13
@@ -42,78 +41,249 @@ from .._types import FunctionWordResult
 
 # Determiners (articles, demonstratives, possessives, quantifiers)
 DETERMINERS = {
-    "the", "a", "an",  # Articles
-    "this", "that", "these", "those",  # Demonstratives
-    "my", "your", "his", "her", "its", "our", "their",  # Possessive determiners
-    "some", "any", "no", "every", "each", "either", "neither",  # Quantifiers
-    "much", "many", "more", "most", "few", "fewer", "less", "least",
-    "all", "both", "half", "several", "enough",
+    "the",
+    "a",
+    "an",  # Articles
+    "this",
+    "that",
+    "these",
+    "those",  # Demonstratives
+    "my",
+    "your",
+    "his",
+    "her",
+    "its",
+    "our",
+    "their",  # Possessive determiners
+    "some",
+    "any",
+    "no",
+    "every",
+    "each",
+    "either",
+    "neither",  # Quantifiers
+    "much",
+    "many",
+    "more",
+    "most",
+    "few",
+    "fewer",
+    "less",
+    "least",
+    "all",
+    "both",
+    "half",
+    "several",
+    "enough",
 }
 
 # Prepositions (locative, temporal, other)
 PREPOSITIONS = {
-    "in", "on", "at", "by", "for", "with", "from", "to", "of",
-    "about", "above", "across", "after", "against", "along", "among",
-    "around", "as", "before", "behind", "below", "beneath", "beside",
-    "between", "beyond", "but", "concerning", "considering", "despite",
-    "down", "during", "except", "inside", "into", "like", "near",
-    "off", "onto", "out", "outside", "over", "past", "regarding",
-    "since", "through", "throughout", "till", "toward", "under",
-    "underneath", "until", "up", "upon", "via", "within", "without",
+    "in",
+    "on",
+    "at",
+    "by",
+    "for",
+    "with",
+    "from",
+    "to",
+    "of",
+    "about",
+    "above",
+    "across",
+    "after",
+    "against",
+    "along",
+    "among",
+    "around",
+    "as",
+    "before",
+    "behind",
+    "below",
+    "beneath",
+    "beside",
+    "between",
+    "beyond",
+    "but",
+    "concerning",
+    "considering",
+    "despite",
+    "down",
+    "during",
+    "except",
+    "inside",
+    "into",
+    "like",
+    "near",
+    "off",
+    "onto",
+    "out",
+    "outside",
+    "over",
+    "past",
+    "regarding",
+    "since",
+    "through",
+    "throughout",
+    "till",
+    "toward",
+    "under",
+    "underneath",
+    "until",
+    "up",
+    "upon",
+    "via",
+    "within",
+    "without",
 }
 
 # Conjunctions (coordinating, subordinating, correlative)
 CONJUNCTIONS = {
     # Coordinating
-    "and", "but", "or", "nor", "for", "yet", "so",
+    "and",
+    "but",
+    "or",
+    "nor",
+    "for",
+    "yet",
+    "so",
     # Subordinating
-    "although", "because", "since", "unless", "while", "if", "when",
-    "where", "after", "before", "once", "until", "as", "though",
-    "even", "whereas", "wherever", "whenever",
+    "although",
+    "because",
+    "since",
+    "unless",
+    "while",
+    "if",
+    "when",
+    "where",
+    "after",
+    "before",
+    "once",
+    "until",
+    "as",
+    "though",
+    "even",
+    "whereas",
+    "wherever",
+    "whenever",
     # Correlative components
-    "either", "neither", "both", "whether",
+    "either",
+    "neither",
+    "both",
+    "whether",
 }
 
 # Pronouns (personal, possessive, reflexive, demonstrative, relative, indefinite)
 PRONOUNS = {
     # Personal (subject)
-    "i", "you", "he", "she", "it", "we", "they",
+    "i",
+    "you",
+    "he",
+    "she",
+    "it",
+    "we",
+    "they",
     # Personal (object)
-    "me", "him", "her", "us", "them",
+    "me",
+    "him",
+    "her",
+    "us",
+    "them",
     # Possessive
-    "mine", "yours", "his", "hers", "its", "ours", "theirs",
+    "mine",
+    "yours",
+    "his",
+    "hers",
+    "its",
+    "ours",
+    "theirs",
     # Reflexive
-    "myself", "yourself", "himself", "herself", "itself",
-    "ourselves", "yourselves", "themselves",
+    "myself",
+    "yourself",
+    "himself",
+    "herself",
+    "itself",
+    "ourselves",
+    "yourselves",
+    "themselves",
     # Demonstrative
-    "this", "that", "these", "those",
+    "this",
+    "that",
+    "these",
+    "those",
     # Relative
-    "who", "whom", "whose", "which", "that",
+    "who",
+    "whom",
+    "whose",
+    "which",
+    "that",
     # Indefinite
-    "anybody", "anyone", "anything", "everybody", "everyone",
-    "everything", "nobody", "no one", "nothing", "somebody",
-    "someone", "something", "one",
+    "anybody",
+    "anyone",
+    "anything",
+    "everybody",
+    "everyone",
+    "everything",
+    "nobody",
+    "no one",
+    "nothing",
+    "somebody",
+    "someone",
+    "something",
+    "one",
 }
 
 # Auxiliary verbs (modal, primary)
 AUXILIARIES = {
     # Modals
-    "can", "could", "may", "might", "must", "shall", "should",
-    "will", "would", "ought",
+    "can",
+    "could",
+    "may",
+    "might",
+    "must",
+    "shall",
+    "should",
+    "will",
+    "would",
+    "ought",
     # Primary auxiliaries (be, have, do)
-    "am", "is", "are", "was", "were", "be", "being", "been",
-    "have", "has", "had", "having",
-    "do", "does", "did", "doing",
+    "am",
+    "is",
+    "are",
+    "was",
+    "were",
+    "be",
+    "being",
+    "been",
+    "have",
+    "has",
+    "had",
+    "having",
+    "do",
+    "does",
+    "did",
+    "doing",
 }
 
 # Particles (often used with phrasal verbs)
 PARTICLES = {
-    "up", "down", "out", "off", "over", "in", "away",
-    "back", "on", "along", "forth", "apart", "aside",
+    "up",
+    "down",
+    "out",
+    "off",
+    "over",
+    "in",
+    "away",
+    "back",
+    "on",
+    "along",
+    "forth",
+    "apart",
+    "aside",
 }
 
 
-def compute_function_words(text: str) -> FunctionWordResult:
+def compute_function_words(text: str, chunk_size: int = 1000) -> FunctionWordResult:
     """
     Compute function word frequency profiles for authorship analysis.
 
@@ -180,18 +350,21 @@ def compute_function_words(text: str) -> FunctionWordResult:
         determiner and pronoun) - each category is counted independently
     """
     # Step 1: Create union set of all function words (for total ratio calculation)
-    ALL_FUNCTION_WORDS = (
-        DETERMINERS
-        | PREPOSITIONS
-        | CONJUNCTIONS
-        | PRONOUNS
-        | AUXILIARIES
-        | PARTICLES
+    all_function_words = (
+        DETERMINERS | PREPOSITIONS | CONJUNCTIONS | PRONOUNS | AUXILIARIES | PARTICLES
     )
 
     # Step 2: Tokenize text (lowercase, split on whitespace, strip punctuation)
     if not text or not text.strip():
         # Handle empty text edge case
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return FunctionWordResult(
             determiner_ratio=0.0,
             preposition_ratio=0.0,
@@ -204,6 +377,16 @@ def compute_function_words(text: str) -> FunctionWordResult:
             most_frequent_function_words=[],
             least_frequent_function_words=[],
             function_word_distribution={},
+            determiner_ratio_dist=empty_dist,
+            preposition_ratio_dist=empty_dist,
+            conjunction_ratio_dist=empty_dist,
+            pronoun_ratio_dist=empty_dist,
+            auxiliary_ratio_dist=empty_dist,
+            particle_ratio_dist=empty_dist,
+            total_function_word_ratio_dist=empty_dist,
+            function_word_diversity_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "total_word_count": 0,
                 "total_function_word_count": 0,
@@ -232,15 +415,13 @@ def compute_function_words(text: str) -> FunctionWordResult:
     raw_tokens = text_lower.split()
 
     # Comprehensive punctuation set for stripping
-    PUNCTUATION = set(
-        ".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„""''‚'"
-    )
+    punctuation_chars = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„''‚'")
 
     # Strip punctuation from each token
     tokens = []
     for token in raw_tokens:
         # Strip leading and trailing punctuation
-        clean_token = token.strip("".join(PUNCTUATION))
+        clean_token = token.strip("".join(punctuation_chars))
         if clean_token:  # Only add non-empty tokens
             tokens.append(clean_token)
 
@@ -272,7 +453,7 @@ def compute_function_words(text: str) -> FunctionWordResult:
     # Step 5: Build distribution (count each function word only once per token)
     function_word_counts: dict[str, int] = {}
     for token in tokens:
-        if token in ALL_FUNCTION_WORDS:
+        if token in all_function_words:
             function_word_counts[token] = function_word_counts.get(token, 0) + 1
 
     # Step 6: Calculate ratios
@@ -306,9 +487,7 @@ def compute_function_words(text: str) -> FunctionWordResult:
     # Step 8: Find most/least frequent function words
     if function_word_counts:
         # Sort by count descending
-        sorted_by_count = sorted(
-            function_word_counts.items(), key=lambda x: x[1], reverse=True
-        )
+        sorted_by_count = sorted(function_word_counts.items(), key=lambda x: x[1], reverse=True)
 
         # Top 10 most frequent
         most_frequent = sorted_by_count[:10]
@@ -353,7 +532,17 @@ def compute_function_words(text: str) -> FunctionWordResult:
 
     overlapping_words.sort()
 
-    # Step 11: Build metadata
+    # Step 11: Create single-value distributions (analysis is done on full text)
+    determiner_ratio_dist = make_distribution([determiner_ratio])
+    preposition_ratio_dist = make_distribution([preposition_ratio])
+    conjunction_ratio_dist = make_distribution([conjunction_ratio])
+    pronoun_ratio_dist = make_distribution([pronoun_ratio])
+    auxiliary_ratio_dist = make_distribution([auxiliary_ratio])
+    particle_ratio_dist = make_distribution([particle_ratio])
+    total_function_word_ratio_dist = make_distribution([total_function_word_ratio])
+    function_word_diversity_dist = make_distribution([function_word_diversity])
+
+    # Step 12: Build metadata
     metadata = {
         "total_word_count": total_words,
         "total_function_word_count": total_function_word_count,
@@ -374,7 +563,7 @@ def compute_function_words(text: str) -> FunctionWordResult:
         "overlapping_word_categories": overlapping_word_categories,
     }
 
-    # Step 12: Return result
+    # Step 13: Return result
     return FunctionWordResult(
         determiner_ratio=determiner_ratio,
         preposition_ratio=preposition_ratio,
@@ -387,5 +576,15 @@ def compute_function_words(text: str) -> FunctionWordResult:
         most_frequent_function_words=most_frequent,
         least_frequent_function_words=least_frequent,
         function_word_distribution=function_word_counts,
+        determiner_ratio_dist=determiner_ratio_dist,
+        preposition_ratio_dist=preposition_ratio_dist,
+        conjunction_ratio_dist=conjunction_ratio_dist,
+        pronoun_ratio_dist=pronoun_ratio_dist,
+        auxiliary_ratio_dist=auxiliary_ratio_dist,
+        particle_ratio_dist=particle_ratio_dist,
+        total_function_word_ratio_dist=total_function_word_ratio_dist,
+        function_word_diversity_dist=function_word_diversity_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
        metadata=metadata,
     )
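The function_words.py changes follow the same pattern as the lexical diversity metrics: compute_function_words gains a chunk_size parameter, and FunctionWordResult gains one *_dist field per ratio plus chunk_size and chunk_count bookkeeping (chunk_count is 0 for empty input, 1 otherwise). A hedged sketch of how the 1.3.0 result might be consumed, with the module path inferred from the file list above and attribute access on the result type assumed:

```python
from pystylometry.lexical.function_words import compute_function_words

result = compute_function_words("The cat sat on the mat, and then it slept.", chunk_size=1000)

# Scalar ratios keep their 1.0.0 meaning.
print(result.total_function_word_ratio, result.function_word_diversity)

# New in 1.3.0: single-value distributions mirror the scalars for a one-chunk analysis,
# and chunk_count distinguishes real analyses (1) from the empty-text edge case (0).
print(result.total_function_word_ratio_dist.mean)
print(result.chunk_size, result.chunk_count)
```

Populating single-value distributions here keeps the result schema uniform, presumably so that downstream consumers (for example the new consistency and viz modules listed above) can treat single-pass and chunked analyses the same way.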