pystylometry 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. pystylometry/__init__.py +29 -3
  2. pystylometry/_types.py +963 -259
  3. pystylometry/authorship/__init__.py +23 -2
  4. pystylometry/authorship/additional_methods.py +4 -29
  5. pystylometry/authorship/kilgarriff.py +347 -0
  6. pystylometry/character/character_metrics.py +267 -179
  7. pystylometry/cli.py +427 -0
  8. pystylometry/consistency/__init__.py +57 -0
  9. pystylometry/consistency/_thresholds.py +162 -0
  10. pystylometry/consistency/drift.py +549 -0
  11. pystylometry/dialect/__init__.py +65 -0
  12. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  13. pystylometry/dialect/_loader.py +360 -0
  14. pystylometry/dialect/detector.py +533 -0
  15. pystylometry/lexical/advanced_diversity.py +61 -22
  16. pystylometry/lexical/function_words.py +255 -56
  17. pystylometry/lexical/hapax.py +182 -52
  18. pystylometry/lexical/mtld.py +108 -26
  19. pystylometry/lexical/ttr.py +76 -10
  20. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  21. pystylometry/lexical/yule.py +136 -50
  22. pystylometry/ngrams/entropy.py +150 -49
  23. pystylometry/readability/additional_formulas.py +1887 -762
  24. pystylometry/readability/ari.py +144 -82
  25. pystylometry/readability/coleman_liau.py +136 -109
  26. pystylometry/readability/flesch.py +177 -73
  27. pystylometry/readability/gunning_fog.py +165 -161
  28. pystylometry/readability/smog.py +123 -42
  29. pystylometry/syntactic/advanced_syntactic.py +76 -14
  30. pystylometry/syntactic/pos_ratios.py +70 -6
  31. pystylometry/syntactic/sentence_stats.py +55 -12
  32. pystylometry/syntactic/sentence_types.py +71 -15
  33. pystylometry/viz/__init__.py +71 -0
  34. pystylometry/viz/drift.py +589 -0
  35. pystylometry/viz/jsx/__init__.py +31 -0
  36. pystylometry/viz/jsx/_base.py +144 -0
  37. pystylometry/viz/jsx/report.py +677 -0
  38. pystylometry/viz/jsx/timeline.py +716 -0
  39. pystylometry/viz/jsx/viewer.py +1032 -0
  40. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
  41. pystylometry-1.1.0.dist-info/RECORD +63 -0
  42. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
  43. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  44. pystylometry-1.0.0.dist-info/RECORD +0 -46
@@ -5,9 +5,9 @@ in writing style. Character-level metrics are fundamental for authorship
5
5
  attribution and can reveal distinctive patterns in punctuation usage,
6
6
  word construction, and formatting preferences.
7
7
 
8
- Related GitHub Issue:
8
+ Related GitHub Issues:
9
9
  #12 - Character-Level Metrics
10
- https://github.com/craigtrim/pystylometry/issues/12
10
+ #27 - Native chunked analysis with Distribution dataclass
11
11
 
12
12
  Features implemented:
13
13
  - Average word length (characters per word)
@@ -26,114 +26,80 @@ References:
26
26
  JASIST, 60(3), 538-556.
27
27
  """
28
28
 
29
- from .._types import CharacterMetricsResult
30
-
31
-
32
- def compute_character_metrics(text: str) -> CharacterMetricsResult:
29
+ import math
30
+
31
+ from .._types import CharacterMetricsResult, Distribution, chunk_text, make_distribution
32
+
33
# Character sets
#
# Every member of _PUNCTUATION must be exactly one character long: the metrics
# loop tests membership one character at a time (`char in _PUNCTUATION`), so a
# longer entry can never match anything.
#
# The typographic quotes are spelled as \u escapes on purpose. Written as
# literal glyphs they are easily flattened to ASCII quotes by an editor or a
# copy/paste, which silently turns `"<left quote>", "<right quote>",` into the
# triple-quoted string `", "` and drops curly quotes from the set.
_PUNCTUATION = {
    ".",
    ",",
    "!",
    "?",
    ";",
    ":",
    "-",
    "—",
    "–",  # Basic punctuation
    "'",
    '"',
    "\u201c",  # left double quotation mark
    "\u201d",  # right double quotation mark
    "\u2018",  # left single quotation mark
    "\u2019",  # right single quotation mark / typographic apostrophe
    "(",
    ")",
    "[",
    "]",
    "{",
    "}",  # Brackets
    "/",
    "\\",
    "|",  # Slashes
    "…",  # Ellipsis
    "*",
    "&",
    "@",
    "#",
    "$",
    "%",
    "^",
    "~",
    "`",  # Special symbols
}
# Lowercase ASCII vowels; callers lowercase a character before testing it.
_VOWELS = {"a", "e", "i", "o", "u"}
# The 26 lowercase ASCII letters that letter-frequency counting is restricted to.
_STANDARD_LETTERS = set("abcdefghijklmnopqrstuvwxyz")
71
+
72
+
73
+ def _compute_character_metrics_single(text: str) -> dict:
74
+ """Compute character-level metrics for a single chunk of text.
75
+
76
+ Returns a dict with all computed values, or values containing nan for empty text.
33
77
  """
34
- Compute character-level stylometric metrics.
35
-
36
- This function analyzes text at the character level to extract features
37
- related to word length, punctuation usage, letter distribution, and
38
- other low-level patterns that can be distinctive for authorship
39
- attribution and style analysis.
40
-
41
- Related GitHub Issue:
42
- #12 - Character-Level Metrics
43
- https://github.com/craigtrim/pystylometry/issues/12
44
-
45
- Character-level features are particularly valuable because:
46
- 1. They are language-independent (work across languages)
47
- 2. They capture subconscious writing patterns
48
- 3. They are resistant to topic variation
49
- 4. They complement higher-level metrics (words, syntax)
50
-
51
- Metrics computed:
52
- - Average word length: Mean characters per word
53
- - Average sentence length (chars): Mean characters per sentence
54
- - Punctuation density: Punctuation marks per 100 words
55
- - Punctuation variety: Count of unique punctuation types used
56
- - Letter frequency: Distribution of a-z (case-insensitive)
57
- - Vowel-to-consonant ratio: Ratio of vowels to consonants
58
- - Digit count/ratio: Numeric character usage
59
- - Uppercase ratio: Uppercase letters / total letters
60
- - Whitespace ratio: Whitespace characters / total characters
61
-
62
- Args:
63
- text: Input text to analyze. Should contain at least one sentence
64
- for meaningful results. Empty text will return NaN for ratios
65
- and 0 for counts.
66
-
67
- Returns:
68
- CharacterMetricsResult with all character-level features and metadata.
69
- For empty text, all ratios will be NaN and counts will be 0.
70
-
71
- Example:
72
- >>> result = compute_character_metrics("The quick brown fox jumps!")
73
- >>> print(f"Avg word length: {result.avg_word_length:.2f}")
74
- Avg word length: 4.17
75
- >>> print(f"Punctuation density: {result.punctuation_density:.2f}")
76
- Punctuation density: 16.67
77
- >>> print(f"Vowel/consonant ratio: {result.vowel_consonant_ratio:.2f}")
78
- Vowel/consonant ratio: 0.71
79
-
80
- >>> # Empty text handling
81
- >>> result = compute_character_metrics("")
82
- >>> import math
83
- >>> math.isnan(result.avg_word_length)
84
- True
85
- >>> result.digit_count
86
- 0
87
-
88
- Note:
89
- - Punctuation marks include: . , ! ? ; : - ' " ( ) [ ] { } ... etc.
90
- - Whitespace includes spaces, tabs, newlines
91
- - Letter frequency is case-insensitive (lowercase normalized)
92
- - Words are tokenized by whitespace for length calculation
93
- - Sentences are split using standard sentence delimiters (. ! ?)
94
- """
95
- # Define character sets
96
- # GitHub Issue #12: https://github.com/craigtrim/pystylometry/issues/12
97
- PUNCTUATION = {
98
- ".", ",", "!", "?", ";", ":", "-", "—", "–", # Basic punctuation
99
- "'", '"', """, """, "'", "'", # Quotes
100
- "(", ")", "[", "]", "{", "}", # Brackets
101
- "/", "\\", "|", # Slashes
102
- "…", # Ellipsis
103
- "*", "&", "@", "#", "$", "%", "^", "~", "`", # Special symbols
104
- }
105
- VOWELS = {"a", "e", "i", "o", "u"}
106
-
107
- # Handle empty text
108
78
  if not text:
109
- # Return NaN for all ratios, 0 for all counts
110
- empty_letter_freq = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
111
- return CharacterMetricsResult(
112
- avg_word_length=float("nan"),
113
- avg_sentence_length_chars=float("nan"),
114
- punctuation_density=float("nan"),
115
- punctuation_variety=0,
116
- letter_frequency=empty_letter_freq,
117
- vowel_consonant_ratio=float("nan"),
118
- digit_count=0,
119
- digit_ratio=float("nan"),
120
- uppercase_ratio=float("nan"),
121
- whitespace_ratio=float("nan"),
122
- metadata={
123
- "total_characters": 0,
124
- "total_letters": 0,
125
- "total_words": 0,
126
- "total_sentences": 0,
127
- "total_punctuation": 0,
128
- "total_whitespace": 0,
129
- "total_digits": 0,
130
- "punctuation_types": [],
131
- "vowel_count": 0,
132
- "consonant_count": 0,
133
- "uppercase_count": 0,
134
- "lowercase_count": 0,
135
- },
136
- )
79
+ return {
80
+ "avg_word_length": float("nan"),
81
+ "avg_sentence_length_chars": float("nan"),
82
+ "punctuation_density": float("nan"),
83
+ "punctuation_variety": 0,
84
+ "vowel_consonant_ratio": float("nan"),
85
+ "digit_count": 0,
86
+ "digit_ratio": float("nan"),
87
+ "uppercase_ratio": float("nan"),
88
+ "whitespace_ratio": float("nan"),
89
+ "letter_frequency": {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"},
90
+ "total_characters": 0,
91
+ "total_letters": 0,
92
+ "total_words": 0,
93
+ "total_sentences": 0,
94
+ "total_punctuation": 0,
95
+ "total_whitespace": 0,
96
+ "total_digits": 0,
97
+ "punctuation_types": [],
98
+ "vowel_count": 0,
99
+ "consonant_count": 0,
100
+ "uppercase_count": 0,
101
+ "lowercase_count": 0,
102
+ }
137
103
 
138
104
  # Initialize counters
139
105
  total_chars = len(text)
@@ -147,19 +113,18 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
147
113
  punctuation_count = 0
148
114
  punctuation_types = set()
149
115
 
150
- # Single pass through text to classify and count all characters
116
+ # Single pass through text
151
117
  for char in text:
152
118
  if char.isalpha():
153
- # Letter - update letter frequency (case-insensitive)
154
- letter_counts[char.lower()] += 1
119
+ lower_char = char.lower()
120
+ if lower_char in _STANDARD_LETTERS:
121
+ letter_counts[lower_char] += 1
155
122
 
156
- # Count vowels and consonants
157
- if char.lower() in VOWELS:
123
+ if lower_char in _VOWELS:
158
124
  vowel_count += 1
159
- else:
125
+ elif lower_char in _STANDARD_LETTERS:
160
126
  consonant_count += 1
161
127
 
162
- # Count uppercase and lowercase
163
128
  if char.isupper():
164
129
  uppercase_count += 1
165
130
  else:
@@ -167,44 +132,35 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
167
132
 
168
133
  elif char.isdigit():
169
134
  digit_count += 1
170
-
171
135
  elif char.isspace():
172
136
  whitespace_count += 1
173
-
174
- elif char in PUNCTUATION:
137
+ elif char in _PUNCTUATION:
175
138
  punctuation_count += 1
176
139
  punctuation_types.add(char)
177
140
 
178
141
  total_letters = vowel_count + consonant_count
179
142
 
180
- # Calculate letter frequency distribution (normalize to sum to 1.0)
143
+ # Letter frequency distribution
181
144
  if total_letters > 0:
182
- letter_frequency = {letter: count / total_letters for letter, count in letter_counts.items()}
145
+ letter_frequency = {
146
+ letter: count / total_letters for letter, count in letter_counts.items()
147
+ }
183
148
  else:
184
149
  letter_frequency = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
185
150
 
186
- # Tokenize into words (split on whitespace, then strip punctuation for length)
151
+ # Word metrics
187
152
  words = text.split()
188
153
  total_words = len(words)
189
154
 
190
- # Calculate average word length (count only letters and digits in words)
191
155
  if total_words > 0:
192
- word_lengths = []
193
- for word in words:
194
- # Count only alphanumeric characters for word length
195
- word_length = sum(1 for char in word if char.isalnum())
196
- if word_length > 0: # Only count words with at least one alphanumeric char
197
- word_lengths.append(word_length)
198
-
199
- if word_lengths:
200
- avg_word_length = sum(word_lengths) / len(word_lengths)
201
- else:
202
- avg_word_length = float("nan")
156
+ word_lengths = [
157
+ sum(1 for c in w if c.isalnum()) for w in words if any(c.isalnum() for c in w)
158
+ ]
159
+ avg_word_length = sum(word_lengths) / len(word_lengths) if word_lengths else float("nan")
203
160
  else:
204
161
  avg_word_length = float("nan")
205
162
 
206
- # Segment text into sentences (split on . ! ?)
207
- # Simple approach: split on sentence delimiters
163
+ # Sentence metrics
208
164
  sentence_delimiters = {".", "!", "?"}
209
165
  sentences = []
210
166
  current_sentence = []
@@ -212,13 +168,11 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
212
168
  for char in text:
213
169
  current_sentence.append(char)
214
170
  if char in sentence_delimiters:
215
- # End of sentence
216
171
  sentence_text = "".join(current_sentence).strip()
217
- if sentence_text: # Only add non-empty sentences
172
+ if sentence_text:
218
173
  sentences.append(sentence_text)
219
174
  current_sentence = []
220
175
 
221
- # Add any remaining text as a sentence if it's non-empty and doesn't end with delimiter
222
176
  if current_sentence:
223
177
  sentence_text = "".join(current_sentence).strip()
224
178
  if sentence_text:
@@ -226,52 +180,40 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
226
180
 
227
181
  total_sentences = len(sentences)
228
182
 
229
- # Calculate average sentence length in characters
230
183
  if total_sentences > 0:
231
184
  sentence_lengths = [len(sent) for sent in sentences]
232
185
  avg_sentence_length_chars = sum(sentence_lengths) / total_sentences
233
186
  else:
234
187
  avg_sentence_length_chars = float("nan")
235
188
 
236
- # Calculate punctuation density (per 100 words)
237
- if total_words > 0:
238
- punctuation_density = (punctuation_count / total_words) * 100
239
- else:
240
- punctuation_density = float("nan")
241
-
242
- # Punctuation variety (count of unique punctuation types)
189
+ # Ratios
190
+ punctuation_density = (
191
+ (punctuation_count / total_words * 100) if total_words > 0 else float("nan")
192
+ )
243
193
  punctuation_variety = len(punctuation_types)
244
194
 
245
- # Calculate vowel-to-consonant ratio
246
195
  if consonant_count > 0:
247
196
  vowel_consonant_ratio = vowel_count / consonant_count
248
197
  elif vowel_count > 0:
249
- # Vowels but no consonants - ratio is infinity
250
198
  vowel_consonant_ratio = float("inf")
251
199
  else:
252
- # No letters at all
253
200
  vowel_consonant_ratio = float("nan")
254
201
 
255
- # Calculate digit ratio
256
- if total_chars > 0:
257
- digit_ratio = digit_count / total_chars
258
- else:
259
- digit_ratio = float("nan")
260
-
261
- # Calculate uppercase ratio
262
- if total_letters > 0:
263
- uppercase_ratio = uppercase_count / total_letters
264
- else:
265
- uppercase_ratio = float("nan")
266
-
267
- # Calculate whitespace ratio
268
- if total_chars > 0:
269
- whitespace_ratio = whitespace_count / total_chars
270
- else:
271
- whitespace_ratio = float("nan")
272
-
273
- # Build metadata
274
- metadata = {
202
+ digit_ratio = digit_count / total_chars if total_chars > 0 else float("nan")
203
+ uppercase_ratio = uppercase_count / total_letters if total_letters > 0 else float("nan")
204
+ whitespace_ratio = whitespace_count / total_chars if total_chars > 0 else float("nan")
205
+
206
+ return {
207
+ "avg_word_length": avg_word_length,
208
+ "avg_sentence_length_chars": avg_sentence_length_chars,
209
+ "punctuation_density": punctuation_density,
210
+ "punctuation_variety": punctuation_variety,
211
+ "vowel_consonant_ratio": vowel_consonant_ratio,
212
+ "digit_count": digit_count,
213
+ "digit_ratio": digit_ratio,
214
+ "uppercase_ratio": uppercase_ratio,
215
+ "whitespace_ratio": whitespace_ratio,
216
+ "letter_frequency": letter_frequency,
275
217
  "total_characters": total_chars,
276
218
  "total_letters": total_letters,
277
219
  "total_words": total_words,
@@ -286,16 +228,162 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
286
228
  "lowercase_count": lowercase_count,
287
229
  }
288
230
 
231
+
232
def compute_character_metrics(text: str, chunk_size: int = 1000) -> CharacterMetricsResult:
    """
    Compute character-level stylometric metrics.

    The text is split with ``chunk_text(text, chunk_size)``, every chunk is
    measured on its own, and the per-chunk values are then summarised so that
    both the typical value and its spread across the text are available.

    Related GitHub Issues:
        #12 - Character-Level Metrics
        #27 - Native chunked analysis with Distribution dataclass

    How the result fields are derived:
        - ``avg_word_length``, ``avg_sentence_length_chars``,
          ``punctuation_density``, ``vowel_consonant_ratio``, ``digit_ratio``,
          ``uppercase_ratio`` and ``whitespace_ratio`` are the MEAN of the
          per-chunk values, not a ratio recomputed over the whole text.
          Chunks whose value is NaN are left out; for the vowel/consonant
          ratio, infinite values (vowels but no consonants) are left out too.
        - ``punctuation_variety`` is the mean number of distinct punctuation
          marks per chunk (a float). The distinct marks seen anywhere in the
          text are listed, sorted, in ``metadata["punctuation_types"]``.
        - ``letter_frequency`` pools the chunks, weighting each chunk's
          frequencies by its letter count.
        - ``digit_count`` and the numeric ``metadata`` entries are sums over
          all chunks.
        - Each ``*_dist`` field holds the distribution the matching scalar was
          taken from. When no chunk contributed a usable value, that
          distribution has NaN mean and median and 0.0 std, range and iqr.

    Args:
        text: Input text to analyze.
        chunk_size: Number of words per chunk (default: 1000). Passed
            unchanged to ``chunk_text`` and echoed back on the result.

    Returns:
        CharacterMetricsResult with the aggregated metrics, their per-chunk
        distributions, ``chunk_size``, ``chunk_count`` and ``metadata``.

    Example:
        Output values below are illustrative only.

        >>> result = compute_character_metrics("Long text...", chunk_size=1000)  # doctest: +SKIP
        >>> result.avg_word_length  # mean across chunks  # doctest: +SKIP
        4.5
        >>> result.avg_word_length_dist.std  # spread across chunks  # doctest: +SKIP
        0.3
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"

    chunks = chunk_text(text, chunk_size)
    chunk_results = [_compute_character_metrics_single(chunk) for chunk in chunks]

    def defined_values(key: str) -> list[float]:
        """Return the per-chunk values of ``key``, skipping NaN entries."""
        return [r[key] for r in chunk_results if not math.isnan(r[key])]

    def total_of(key: str) -> int:
        """Return the sum of the per-chunk counter ``key``."""
        return sum(r[key] for r in chunk_results)

    def safe_dist(values: list[float]) -> Distribution:
        """Summarise ``values``; an empty list gets an explicit placeholder."""
        if not values:
            return Distribution(
                values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
            )
        return make_distribution(values)

    # Per-chunk distributions. Punctuation variety is an integer count that is
    # never NaN, so it is only converted to float, not filtered.
    avg_word_length_dist = safe_dist(defined_values("avg_word_length"))
    avg_sentence_dist = safe_dist(defined_values("avg_sentence_length_chars"))
    punct_density_dist = safe_dist(defined_values("punctuation_density"))
    punct_variety_dist = safe_dist([float(r["punctuation_variety"]) for r in chunk_results])
    # A chunk with vowels but no consonants reports inf; one such chunk would
    # make the mean infinite, so those are excluded along with NaN.
    vc_ratio_dist = safe_dist(
        [v for v in defined_values("vowel_consonant_ratio") if not math.isinf(v)]
    )
    digit_ratio_dist = safe_dist(defined_values("digit_ratio"))
    uppercase_ratio_dist = safe_dist(defined_values("uppercase_ratio"))
    whitespace_ratio_dist = safe_dist(defined_values("whitespace_ratio"))

    # Whole-text totals
    total_digits = total_of("digit_count")
    total_letters = total_of("total_letters")

    all_punctuation_types = set()
    for r in chunk_results:
        all_punctuation_types.update(r["punctuation_types"])

    # Pooled letter frequency: turn each chunk's relative frequencies back into
    # counts (frequency * letters in that chunk) before renormalising, so large
    # chunks weigh more than small ones.
    pooled_letter_counts = dict.fromkeys(alphabet, 0)
    for r in chunk_results:
        if r["total_letters"] > 0:
            for letter, freq in r["letter_frequency"].items():
                pooled_letter_counts[letter] += freq * r["total_letters"]

    if total_letters > 0:
        letter_frequency = {
            letter: count / total_letters for letter, count in pooled_letter_counts.items()
        }
    else:
        letter_frequency = dict.fromkeys(alphabet, 0.0)

    return CharacterMetricsResult(
        avg_word_length=avg_word_length_dist.mean,
        avg_sentence_length_chars=avg_sentence_dist.mean,
        punctuation_density=punct_density_dist.mean,
        punctuation_variety=punct_variety_dist.mean,
        letter_frequency=letter_frequency,
        vowel_consonant_ratio=vc_ratio_dist.mean,
        digit_count=total_digits,
        digit_ratio=digit_ratio_dist.mean,
        uppercase_ratio=uppercase_ratio_dist.mean,
        whitespace_ratio=whitespace_ratio_dist.mean,
        avg_word_length_dist=avg_word_length_dist,
        avg_sentence_length_chars_dist=avg_sentence_dist,
        punctuation_density_dist=punct_density_dist,
        punctuation_variety_dist=punct_variety_dist,
        vowel_consonant_ratio_dist=vc_ratio_dist,
        digit_ratio_dist=digit_ratio_dist,
        uppercase_ratio_dist=uppercase_ratio_dist,
        whitespace_ratio_dist=whitespace_ratio_dist,
        chunk_size=chunk_size,
        chunk_count=len(chunks),
        metadata={
            "total_characters": total_of("total_characters"),
            "total_letters": total_letters,
            "total_words": total_of("total_words"),
            "total_sentences": total_of("total_sentences"),
            "total_punctuation": total_of("total_punctuation"),
            "total_whitespace": total_of("total_whitespace"),
            "total_digits": total_digits,
            "punctuation_types": sorted(all_punctuation_types),
            "vowel_count": total_of("vowel_count"),
            "consonant_count": total_of("consonant_count"),
            "uppercase_count": total_of("uppercase_count"),
            "lowercase_count": total_of("lowercase_count"),
        },
    )