pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/character/character_metrics.py
@@ -0,0 +1,389 @@
+ """Character-level metrics for stylometric analysis.
+
+ This module provides character-level features that capture low-level patterns
+ in writing style. Character-level metrics are fundamental for authorship
+ attribution and can reveal distinctive patterns in punctuation usage,
+ word construction, and formatting preferences.
+
+ Related GitHub Issues:
+     #12 - Character-Level Metrics
+     #27 - Native chunked analysis with Distribution dataclass
+
+ Features implemented:
+     - Average word length (characters per word)
+     - Average sentence length (characters per sentence)
+     - Punctuation density and variety
+     - Letter frequency distribution
+     - Vowel-to-consonant ratio
+     - Digit frequency and ratio
+     - Uppercase ratio
+     - Whitespace ratio
+
+ References:
+     Grieve, J. (2007). Quantitative authorship attribution: An evaluation
+         of techniques. Literary and Linguistic Computing, 22(3), 251-270.
+     Stamatatos, E. (2009). A survey of modern authorship attribution methods.
+         JASIST, 60(3), 538-556.
+ """
+
+ import math
+
+ from .._types import CharacterMetricsResult, Distribution, chunk_text, make_distribution
+
+ # Character sets
+ _PUNCTUATION = {
+     ".",
+     ",",
+     "!",
+     "?",
+     ";",
+     ":",
+     "-",
+     "—",
+     "–",  # Basic punctuation
+     "'",
+     '"',
+     "“", "”",
+     "‘",
+     "’",  # Quotes
+     "(",
+     ")",
+     "[",
+     "]",
+     "{",
+     "}",  # Brackets
+     "/",
+     "\\",
+     "|",  # Slashes
+     "…",  # Ellipsis
+     "*",
+     "&",
+     "@",
+     "#",
+     "$",
+     "%",
+     "^",
+     "~",
+     "`",  # Special symbols
+ }
+ _VOWELS = {"a", "e", "i", "o", "u"}
+ _STANDARD_LETTERS = set("abcdefghijklmnopqrstuvwxyz")
+
+
+ def _compute_character_metrics_single(text: str) -> dict:
+     """Compute character-level metrics for a single chunk of text.
+
+     Returns a dict with all computed values, or values containing nan for empty text.
+     """
+     if not text:
+         return {
+             "avg_word_length": float("nan"),
+             "avg_sentence_length_chars": float("nan"),
+             "punctuation_density": float("nan"),
+             "punctuation_variety": 0,
+             "vowel_consonant_ratio": float("nan"),
+             "digit_count": 0,
+             "digit_ratio": float("nan"),
+             "uppercase_ratio": float("nan"),
+             "whitespace_ratio": float("nan"),
+             "letter_frequency": {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"},
+             "total_characters": 0,
+             "total_letters": 0,
+             "total_words": 0,
+             "total_sentences": 0,
+             "total_punctuation": 0,
+             "total_whitespace": 0,
+             "total_digits": 0,
+             "punctuation_types": [],
+             "vowel_count": 0,
+             "consonant_count": 0,
+             "uppercase_count": 0,
+             "lowercase_count": 0,
+         }
+
+     # Initialize counters
+     total_chars = len(text)
+     letter_counts = {letter: 0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+     vowel_count = 0
+     consonant_count = 0
+     uppercase_count = 0
+     lowercase_count = 0
+     digit_count = 0
+     whitespace_count = 0
+     punctuation_count = 0
+     punctuation_types = set()
+
+     # Single pass through text
+     for char in text:
+         if char.isalpha():
+             lower_char = char.lower()
+             if lower_char in _STANDARD_LETTERS:
+                 letter_counts[lower_char] += 1
+
+             if lower_char in _VOWELS:
+                 vowel_count += 1
+             elif lower_char in _STANDARD_LETTERS:
+                 consonant_count += 1
+
+             if char.isupper():
+                 uppercase_count += 1
+             else:
+                 lowercase_count += 1
+
+         elif char.isdigit():
+             digit_count += 1
+         elif char.isspace():
+             whitespace_count += 1
+         elif char in _PUNCTUATION:
+             punctuation_count += 1
+             punctuation_types.add(char)
+
+     total_letters = vowel_count + consonant_count
+
+     # Letter frequency distribution
+     if total_letters > 0:
+         letter_frequency = {
+             letter: count / total_letters for letter, count in letter_counts.items()
+         }
+     else:
+         letter_frequency = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+
+     # Word metrics
+     words = text.split()
+     total_words = len(words)
+
+     if total_words > 0:
+         word_lengths = [
+             sum(1 for c in w if c.isalnum()) for w in words if any(c.isalnum() for c in w)
+         ]
+         avg_word_length = sum(word_lengths) / len(word_lengths) if word_lengths else float("nan")
+     else:
+         avg_word_length = float("nan")
+
+     # Sentence metrics
+     sentence_delimiters = {".", "!", "?"}
+     sentences = []
+     current_sentence = []
+
+     for char in text:
+         current_sentence.append(char)
+         if char in sentence_delimiters:
+             sentence_text = "".join(current_sentence).strip()
+             if sentence_text:
+                 sentences.append(sentence_text)
+             current_sentence = []
+
+     if current_sentence:
+         sentence_text = "".join(current_sentence).strip()
+         if sentence_text:
+             sentences.append(sentence_text)
+
+     total_sentences = len(sentences)
+
+     if total_sentences > 0:
+         sentence_lengths = [len(sent) for sent in sentences]
+         avg_sentence_length_chars = sum(sentence_lengths) / total_sentences
+     else:
+         avg_sentence_length_chars = float("nan")
+
+     # Ratios
+     punctuation_density = (
+         (punctuation_count / total_words * 100) if total_words > 0 else float("nan")
+     )
+     punctuation_variety = len(punctuation_types)
+
+     if consonant_count > 0:
+         vowel_consonant_ratio = vowel_count / consonant_count
+     elif vowel_count > 0:
+         vowel_consonant_ratio = float("inf")
+     else:
+         vowel_consonant_ratio = float("nan")
+
+     digit_ratio = digit_count / total_chars if total_chars > 0 else float("nan")
+     uppercase_ratio = uppercase_count / total_letters if total_letters > 0 else float("nan")
+     whitespace_ratio = whitespace_count / total_chars if total_chars > 0 else float("nan")
+
+     return {
+         "avg_word_length": avg_word_length,
+         "avg_sentence_length_chars": avg_sentence_length_chars,
+         "punctuation_density": punctuation_density,
+         "punctuation_variety": punctuation_variety,
+         "vowel_consonant_ratio": vowel_consonant_ratio,
+         "digit_count": digit_count,
+         "digit_ratio": digit_ratio,
+         "uppercase_ratio": uppercase_ratio,
+         "whitespace_ratio": whitespace_ratio,
+         "letter_frequency": letter_frequency,
+         "total_characters": total_chars,
+         "total_letters": total_letters,
+         "total_words": total_words,
+         "total_sentences": total_sentences,
+         "total_punctuation": punctuation_count,
+         "total_whitespace": whitespace_count,
+         "total_digits": digit_count,
+         "punctuation_types": sorted(list(punctuation_types)),
+         "vowel_count": vowel_count,
+         "consonant_count": consonant_count,
+         "uppercase_count": uppercase_count,
+         "lowercase_count": lowercase_count,
+     }
+
+
+ def compute_character_metrics(text: str, chunk_size: int = 1000) -> CharacterMetricsResult:
+     """
+     Compute character-level stylometric metrics.
+
+     This function uses native chunked analysis to capture variance and patterns
+     across the text, which is essential for stylometric fingerprinting.
+
+     Related GitHub Issues:
+         #12 - Character-Level Metrics
+         #27 - Native chunked analysis with Distribution dataclass
+
+     Character-level features are particularly valuable because:
+     1. They are language-independent (work across languages)
+     2. They capture subconscious writing patterns
+     3. They are resistant to topic variation
+     4. They complement higher-level metrics (words, syntax)
+
+     Metrics computed:
+     - Average word length: Mean characters per word
+     - Average sentence length (chars): Mean characters per sentence
+     - Punctuation density: Punctuation marks per 100 words
+     - Punctuation variety: Count of unique punctuation types used
+     - Letter frequency: Distribution of a-z (case-insensitive)
+     - Vowel-to-consonant ratio: Ratio of vowels to consonants
+     - Digit count/ratio: Numeric character usage
+     - Uppercase ratio: Uppercase letters / total letters
+     - Whitespace ratio: Whitespace characters / total characters
+
+     Args:
+         text: Input text to analyze
+         chunk_size: Number of words per chunk (default: 1000)
+
+     Returns:
+         CharacterMetricsResult with all character-level features, distributions,
+         and metadata.
+
+     Example:
+         >>> result = compute_character_metrics("Long text...", chunk_size=1000)
+         >>> result.avg_word_length  # Mean across chunks
+         4.5
+         >>> result.avg_word_length_dist.std  # Variance reveals fingerprint
+         0.3
+     """
+     # Chunk the text
+     chunks = chunk_text(text, chunk_size)
+
+     # Compute metrics per chunk
+     chunk_results = [_compute_character_metrics_single(chunk) for chunk in chunks]
+
+     # Collect values for distributions
+     avg_word_length_vals = [
+         r["avg_word_length"] for r in chunk_results if not math.isnan(r["avg_word_length"])
+     ]
+     avg_sentence_vals = [
+         r["avg_sentence_length_chars"]
+         for r in chunk_results
+         if not math.isnan(r["avg_sentence_length_chars"])
+     ]
+     punct_density_vals = [
+         r["punctuation_density"] for r in chunk_results if not math.isnan(r["punctuation_density"])
+     ]
+     punct_variety_vals = [float(r["punctuation_variety"]) for r in chunk_results]
+     vc_ratio_vals = [
+         r["vowel_consonant_ratio"]
+         for r in chunk_results
+         if not math.isnan(r["vowel_consonant_ratio"]) and not math.isinf(r["vowel_consonant_ratio"])
+     ]
+     digit_ratio_vals = [r["digit_ratio"] for r in chunk_results if not math.isnan(r["digit_ratio"])]
+     uppercase_ratio_vals = [
+         r["uppercase_ratio"] for r in chunk_results if not math.isnan(r["uppercase_ratio"])
+     ]
+     whitespace_ratio_vals = [
+         r["whitespace_ratio"] for r in chunk_results if not math.isnan(r["whitespace_ratio"])
+     ]
+
+     # Aggregate totals
+     total_digits = sum(r["digit_count"] for r in chunk_results)
+     total_characters = sum(r["total_characters"] for r in chunk_results)
+     total_letters = sum(r["total_letters"] for r in chunk_results)
+     total_words = sum(r["total_words"] for r in chunk_results)
+     total_sentences = sum(r["total_sentences"] for r in chunk_results)
+     total_punctuation = sum(r["total_punctuation"] for r in chunk_results)
+     total_whitespace = sum(r["total_whitespace"] for r in chunk_results)
+     total_vowel_count = sum(r["vowel_count"] for r in chunk_results)
+     total_consonant_count = sum(r["consonant_count"] for r in chunk_results)
+     total_uppercase_count = sum(r["uppercase_count"] for r in chunk_results)
+     total_lowercase_count = sum(r["lowercase_count"] for r in chunk_results)
+     all_punctuation_types = set()
+     for r in chunk_results:
+         all_punctuation_types.update(r["punctuation_types"])
+
+     # Aggregate letter frequency
+     total_letter_counts = {letter: 0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+     for r in chunk_results:
+         if r["total_letters"] > 0:
+             for letter, freq in r["letter_frequency"].items():
+                 total_letter_counts[letter] += freq * r["total_letters"]
+
+     if total_letters > 0:
+         letter_frequency = {
+             letter: count / total_letters for letter, count in total_letter_counts.items()
+         }
+     else:
+         letter_frequency = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+
+     # Build distributions (handle empty case)
+     def safe_dist(values: list[float]) -> Distribution:
+         if not values:
+             return Distribution(
+                 values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+             )
+         return make_distribution(values)
+
+     avg_word_length_dist = safe_dist(avg_word_length_vals)
+     avg_sentence_dist = safe_dist(avg_sentence_vals)
+     punct_density_dist = safe_dist(punct_density_vals)
+     punct_variety_dist = safe_dist(punct_variety_vals)
+     vc_ratio_dist = safe_dist(vc_ratio_vals)
+     digit_ratio_dist = safe_dist(digit_ratio_vals)
+     uppercase_ratio_dist = safe_dist(uppercase_ratio_vals)
+     whitespace_ratio_dist = safe_dist(whitespace_ratio_vals)
+
+     return CharacterMetricsResult(
+         avg_word_length=avg_word_length_dist.mean,
+         avg_sentence_length_chars=avg_sentence_dist.mean,
+         punctuation_density=punct_density_dist.mean,
+         punctuation_variety=punct_variety_dist.mean,
+         letter_frequency=letter_frequency,
+         vowel_consonant_ratio=vc_ratio_dist.mean,
+         digit_count=total_digits,
+         digit_ratio=digit_ratio_dist.mean,
+         uppercase_ratio=uppercase_ratio_dist.mean,
+         whitespace_ratio=whitespace_ratio_dist.mean,
+         avg_word_length_dist=avg_word_length_dist,
+         avg_sentence_length_chars_dist=avg_sentence_dist,
+         punctuation_density_dist=punct_density_dist,
+         punctuation_variety_dist=punct_variety_dist,
+         vowel_consonant_ratio_dist=vc_ratio_dist,
+         digit_ratio_dist=digit_ratio_dist,
+         uppercase_ratio_dist=uppercase_ratio_dist,
+         whitespace_ratio_dist=whitespace_ratio_dist,
+         chunk_size=chunk_size,
+         chunk_count=len(chunks),
+         metadata={
+             "total_characters": total_characters,
+             "total_letters": total_letters,
+             "total_words": total_words,
+             "total_sentences": total_sentences,
+             "total_punctuation": total_punctuation,
+             "total_whitespace": total_whitespace,
+             "total_digits": total_digits,
+             "punctuation_types": sorted(list(all_punctuation_types)),
+             "vowel_count": total_vowel_count,
+             "consonant_count": total_consonant_count,
+             "uppercase_count": total_uppercase_count,
+             "lowercase_count": total_lowercase_count,
+         },
+     )
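
A minimal usage sketch of the new entry point, based only on the signature and docstring in the diff above. The import path assumes compute_character_metrics is re-exported from pystylometry.character (this release adds a new pystylometry/character/__init__.py, but its contents are not shown here), and the sample text is illustrative:

from pystylometry.character import compute_character_metrics  # assumed re-export

# Illustrative input: ~5,400 words, so chunk_size=1000 yields several chunks.
sample = "The quick brown fox jumps over the lazy dog. " * 600

result = compute_character_metrics(sample, chunk_size=1000)

# Scalar fields are means across chunks.
print(result.avg_word_length)        # mean characters per word
print(result.punctuation_density)    # punctuation marks per 100 words
print(result.uppercase_ratio)        # uppercase letters / total letters

# Each scalar has a Distribution companion (values, mean, median, std,
# range, iqr) capturing chunk-to-chunk variance.
print(result.avg_word_length_dist.std)

# Aggregate counts are collected in metadata.
print(result.chunk_count, result.metadata["total_words"])

One design choice visible in the aggregation code: per-chunk letter frequencies are converted back to counts (frequency × chunk letters) and re-normalized over the whole document, so letter_frequency is weighted by letter count rather than being a naive mean of chunk frequencies, while the other scalar fields are plain means of the per-chunk values.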