pystylometry-1.0.0-py3-none-any.whl → pystylometry-1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/ngrams/extended_ngrams.py
@@ -26,7 +26,174 @@ References:
     attribution. PACLING.
     """
 
+from __future__ import annotations
+
+import math
+from collections import Counter
+from typing import Sequence
+
 from .._types import ExtendedNgramResult
+from .._utils import advanced_tokenize
+
+
+def _generate_ngrams(sequence: Sequence[str], n: int) -> list[tuple[str, ...]]:
+    """
+    Generate n-grams from a sequence.
+
+    Slides a window of size n across the sequence and yields tuples
+    of n consecutive elements.
+
+    Related GitHub Issue:
+        #19 - Extended N-gram Features
+        https://github.com/craigtrim/pystylometry/issues/19
+
+    Args:
+        sequence: List of tokens (words or characters)
+        n: Size of the n-gram (e.g., 3 for trigrams)
+
+    Returns:
+        List of n-gram tuples
+
+    Example:
+        >>> _generate_ngrams(["the", "quick", "brown", "fox"], 2)
+        [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]
+    """
+    if len(sequence) < n:
+        return []
+    return [tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1)]
+
+
+def _generate_skipgrams(sequence: Sequence[str], n: int, gap: int) -> list[tuple[str, ...]]:
+    """
+    Generate skipgrams (n-grams with gaps) from a sequence.
+
+    Skipgrams capture non-contiguous word patterns. For example, with n=2 and
+    gap=1, "the quick brown fox" yields ("the", "brown") and ("quick", "fox").
+    This captures syntactic frames independent of the specific intervening words.
+
+    Related GitHub Issue:
+        #19 - Extended N-gram Features
+        https://github.com/craigtrim/pystylometry/issues/19
+
+    References:
+        Guthrie, D., et al. (2006). A closer look at skip-gram modelling. LREC.
+
+    Args:
+        sequence: List of tokens
+        n: Number of words to include in each skipgram
+        gap: Number of words to skip after the first word (the remaining
+            n-1 words are taken contiguously)
+
+    Returns:
+        List of skipgram tuples
+
+    Example:
+        >>> _generate_skipgrams(["the", "quick", "brown", "fox"], 2, 1)
+        [('the', 'brown'), ('quick', 'fox')]
+        >>> _generate_skipgrams(["a", "b", "c", "d", "e"], 3, 1)
+        [('a', 'c', 'd'), ('b', 'd', 'e')]
+    """
+    if n < 2:
+        return [(s,) for s in sequence]
+
+    # Pattern: first word at position i, skip `gap` words, then take the
+    # remaining n-1 words contiguously.
+    # For n=2, gap=1: positions [i, i+2]      -> span of 3
+    # For n=3, gap=1: positions [i, i+2, i+3] -> span of 4
+    # Total span = 1 + gap + (n-1) = n + gap
+    total_span = n + gap
+    if len(sequence) < total_span:
+        return []
+
+    skipgrams = []
+    for i in range(len(sequence) - total_span + 1):
+        # First word, then skip `gap` words, then n-1 contiguous words
+        gram = [sequence[i]]
+        for j in range(n - 1):
+            gram.append(sequence[i + gap + 1 + j])
+        skipgrams.append(tuple(gram))
+
+    return skipgrams
+
+
+def _calculate_shannon_entropy(counter: Counter[tuple[str, ...]]) -> float:
+    """
+    Calculate Shannon entropy of a frequency distribution.
+
+    Shannon entropy measures the uncertainty or information content in a
+    distribution. Higher entropy indicates a more uniform (diverse)
+    distribution, while lower entropy indicates a few dominant n-grams.
+
+    Related GitHub Issue:
+        #19 - Extended N-gram Features
+        https://github.com/craigtrim/pystylometry/issues/19
+
+    Formula:
+        H = -Σ p(x) * log2(p(x))
+        where p(x) = count(x) / total
+
+    Args:
+        counter: Counter object with n-gram frequencies
+
+    Returns:
+        Shannon entropy in bits. Higher values indicate more diversity.
+
+    Example:
+        >>> from collections import Counter
+        >>> # Four equally likely outcomes -> maximum entropy of 2 bits
+        >>> _calculate_shannon_entropy(Counter({("a",): 1, ("b",): 1, ("c",): 1, ("d",): 1}))
+        2.0
+    """
+    if not counter:
+        return 0.0
+
+    total = sum(counter.values())
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in counter.values():
+        if count > 0:
+            p = count / total
+            entropy -= p * math.log2(p)
+
+    return entropy
+
+
+def _format_ngram(ngram: tuple[str, ...]) -> str:
+    """
+    Format an n-gram tuple as a readable string.
+
+    Args:
+        ngram: Tuple of tokens
+
+    Returns:
+        Tokens joined with single spaces (applies to both word and
+        character n-grams)
+
+    Example:
+        >>> _format_ngram(("the", "quick", "fox"))
+        'the quick fox'
+    """
+    return " ".join(ngram)
+
+
+def _get_top_ngrams(counter: Counter[tuple[str, ...]], n: int) -> list[tuple[str, int]]:
+    """
+    Get the n most frequent n-grams formatted as strings.
+
+    Args:
+        counter: Counter of n-gram tuples
+        n: Number of top items to return
+
+    Returns:
+        List of (ngram_string, count) tuples sorted by descending frequency
+    """
+    return [(_format_ngram(ngram), count) for ngram, count in counter.most_common(n)]
 
 
 def compute_extended_ngrams(
@@ -163,73 +330,151 @@ def compute_extended_ngrams(
         - Skipgrams can be very sparse (many unique patterns)
         - Entropy values higher for more diverse n-gram distributions
     """
-    # TODO: Implement extended n-gram analysis
-    # GitHub Issue #19: https://github.com/craigtrim/pystylometry/issues/19
-    #
-    # Implementation steps:
-    #
-    # Word N-grams:
-    # 1. Tokenize text into words (lowercase, basic cleaning)
-    # 2. Generate word trigrams:
-    #    - Slide window of size 3 across word list
-    #    - Create tuples of 3 consecutive words
-    #    - Count frequency of each trigram
-    # 3. Generate word 4-grams (similar, window size 4)
-    # 4. Sort by frequency, extract top_n for each
-    # 5. Calculate Shannon entropy for each distribution:
-    #    H = -sum(p * log2(p)) where p = freq / total
-    #
-    # Skipgrams:
-    # 6. Generate 2-skipgrams with gap 1:
-    #    - For each position i: (word[i], word[i+2])
-    #    - Skips middle word
-    #    - Count frequencies
-    # 7. Generate 3-skipgrams with gap 1:
-    #    - For each position i: (word[i], word[i+2], word[i+3])
-    #    - Pattern: word, skip, word, word
-    #    - Count frequencies
-    # 8. Sort and extract top_n skipgrams
-    #
-    # POS N-grams (if include_pos_ngrams):
-    # 9. Load spaCy model for POS tagging
-    # 10. Parse text to get POS tags for each word
-    # 11. Generate POS trigrams (same as word trigrams, but use POS tags)
-    # 12. Generate POS 4-grams
-    # 13. Count frequencies, extract top_n
-    # 14. Calculate Shannon entropy
-    #
-    # Character N-grams:
-    # 15. Generate character trigrams:
-    #     - Slide window of size 3 across character sequence
-    #     - Include spaces and punctuation
-    #     - Count frequencies
-    # 16. Generate character 4-grams (window size 4)
-    # 17. Sort and extract top_n for each
-    # 18. Calculate Shannon entropy
-    #
-    # Diversity Metrics:
-    # 19. Count total unique n-grams for each type
-    # 20. Calculate type-token ratios (unique / total)
-    #
-    # Metadata:
-    # 21. Store full frequency distributions (optional, can be large)
-    # 22. Store parameters: top_n, include_pos_ngrams, model
-    # 23. Store token/character counts
-    #
-    # Helper Functions Needed:
-    # - generate_ngrams(sequence, n) -> list[tuple]
-    # - generate_skipgrams(sequence, n, gap) -> list[tuple]
-    # - calculate_shannon_entropy(freq_dist) -> float
-    # - get_top_n(freq_dist, n) -> list[tuple]
-    #
-    # Return ExtendedNgramResult
-    #
-    # Optimization notes:
-    # - Use Counter from collections for frequency counting
-    # - Consider sampling for very long texts
-    # - Limit maximum n-gram types to prevent memory issues
-    # - POS tagging is slowest step - make it optional
-    raise NotImplementedError(
-        "Extended n-gram features not yet implemented. "
-        "See GitHub Issue #19: https://github.com/craigtrim/pystylometry/issues/19"
+    # =========================================================================
+    # TOKENIZATION
+    # =========================================================================
+
+    # Word tokenization: lowercase, strip punctuation for word n-grams
+    words = advanced_tokenize(text, lowercase=True, strip_punctuation=True)
+
+    # Character sequence: lowercase but preserve spaces (for character n-grams)
+    chars = list(text.lower())
+
+    # =========================================================================
+    # WORD N-GRAMS
+    # =========================================================================
+
+    # Generate word trigrams (3-grams)
+    word_trigrams = _generate_ngrams(words, 3)
+    word_trigram_counter: Counter[tuple[str, ...]] = Counter(word_trigrams)
+
+    # Generate word 4-grams
+    word_4grams = _generate_ngrams(words, 4)
+    word_4gram_counter: Counter[tuple[str, ...]] = Counter(word_4grams)
+
+    # =========================================================================
+    # SKIPGRAMS
+    # =========================================================================
+
+    # 2-skipgrams with gap of 1: (word1, word3) skipping word2
+    skipgrams_2_1 = _generate_skipgrams(words, 2, 1)
+    skipgram_2_1_counter: Counter[tuple[str, ...]] = Counter(skipgrams_2_1)
+
+    # 3-skipgrams with gap of 1: (word1, word3, word4) skipping word2
+    skipgrams_3_1 = _generate_skipgrams(words, 3, 1)
+    skipgram_3_1_counter: Counter[tuple[str, ...]] = Counter(skipgrams_3_1)
+
+    # =========================================================================
+    # POS N-GRAMS (optional, requires spaCy)
+    # =========================================================================
+
+    pos_trigram_counter: Counter[tuple[str, ...]] = Counter()
+    pos_4gram_counter: Counter[tuple[str, ...]] = Counter()
+    pos_trigram_entropy = 0.0
+
+    if include_pos_ngrams:
+        try:
+            import spacy
+        except ImportError:
+            # spaCy not installed - leave POS results empty
+            spacy = None
+
+        if spacy is not None:
+            # Load spaCy model
+            try:
+                nlp = spacy.load(spacy_model)
+            except OSError:
+                # Model not installed - raise an actionable install hint
+                raise ImportError(
+                    f"spaCy model '{spacy_model}' not found. "
+                    f"Install with: python -m spacy download {spacy_model}"
+                ) from None
+
+            # Process text and extract POS tags
+            doc = nlp(text)
+            pos_tags = [token.pos_ for token in doc if not token.is_space]
+
+            # Generate POS trigrams
+            pos_trigrams = _generate_ngrams(pos_tags, 3)
+            pos_trigram_counter = Counter(pos_trigrams)
+
+            # Generate POS 4-grams
+            pos_4grams = _generate_ngrams(pos_tags, 4)
+            pos_4gram_counter = Counter(pos_4grams)
+
+            pos_trigram_entropy = _calculate_shannon_entropy(pos_trigram_counter)
+
+    # =========================================================================
+    # CHARACTER N-GRAMS
+    # =========================================================================
+
+    # Character trigrams
+    char_trigrams = _generate_ngrams(chars, 3)
+    char_trigram_counter: Counter[tuple[str, ...]] = Counter(char_trigrams)
+
+    # Character 4-grams
+    char_4grams = _generate_ngrams(chars, 4)
+    char_4gram_counter: Counter[tuple[str, ...]] = Counter(char_4grams)
+
+    # =========================================================================
+    # ENTROPY CALCULATIONS
+    # =========================================================================
+
+    word_trigram_entropy = _calculate_shannon_entropy(word_trigram_counter)
+    word_4gram_entropy = _calculate_shannon_entropy(word_4gram_counter)
+    char_trigram_entropy = _calculate_shannon_entropy(char_trigram_counter)
+    char_4gram_entropy = _calculate_shannon_entropy(char_4gram_counter)
+
+    # =========================================================================
+    # BUILD RESULT
+    # =========================================================================
+
+    return ExtendedNgramResult(
+        # Word n-grams
+        top_word_trigrams=_get_top_ngrams(word_trigram_counter, top_n),
+        top_word_4grams=_get_top_ngrams(word_4gram_counter, top_n),
+        word_trigram_count=len(word_trigram_counter),
+        word_4gram_count=len(word_4gram_counter),
+        word_trigram_entropy=word_trigram_entropy,
+        word_4gram_entropy=word_4gram_entropy,
+        # Skipgrams
+        top_skipgrams_2_1=_get_top_ngrams(skipgram_2_1_counter, top_n),
+        top_skipgrams_3_1=_get_top_ngrams(skipgram_3_1_counter, top_n),
+        skipgram_2_1_count=len(skipgram_2_1_counter),
+        skipgram_3_1_count=len(skipgram_3_1_counter),
+        # POS n-grams
+        top_pos_trigrams=_get_top_ngrams(pos_trigram_counter, top_n),
+        top_pos_4grams=_get_top_ngrams(pos_4gram_counter, top_n),
+        pos_trigram_count=len(pos_trigram_counter),
+        pos_4gram_count=len(pos_4gram_counter),
+        pos_trigram_entropy=pos_trigram_entropy,
+        # Character n-grams
+        top_char_trigrams=_get_top_ngrams(char_trigram_counter, top_n),
+        top_char_4grams=_get_top_ngrams(char_4gram_counter, top_n),
+        char_trigram_entropy=char_trigram_entropy,
+        char_4gram_entropy=char_4gram_entropy,
+        # Metadata
+        metadata={
+            "parameters": {
+                "top_n": top_n,
+                "include_pos_ngrams": include_pos_ngrams,
+                "spacy_model": spacy_model if include_pos_ngrams else None,
+            },
+            "token_count": len(words),
+            "character_count": len(chars),
+            "word_trigram_tokens": len(word_trigrams),
+            "word_4gram_tokens": len(word_4grams),
+            "char_trigram_tokens": len(char_trigrams),
+            "char_4gram_tokens": len(char_4grams),
+            "full_distributions": {
+                "word_trigrams": dict(word_trigram_counter.most_common(100)),
+                "word_4grams": dict(word_4gram_counter.most_common(100)),
+                "skipgrams_2_1": dict(skipgram_2_1_counter.most_common(100)),
+                "skipgrams_3_1": dict(skipgram_3_1_counter.most_common(100)),
+                "pos_trigrams": dict(pos_trigram_counter.most_common(100)),
+                "pos_4grams": dict(pos_4gram_counter.most_common(100)),
+                "char_trigrams": dict(char_trigram_counter.most_common(100)),
+                "char_4grams": dict(char_4gram_counter.most_common(100)),
+            },
+        },
     )
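
To make the new API concrete, here is a minimal usage sketch. The keyword arguments (`top_n`, `include_pos_ngrams`, `spacy_model`) and the result fields are the ones visible in the hunks above; the import path and the exact public signature of `compute_extended_ngrams` are assumptions, since the parameter list falls outside the diff context.

```python
# Sketch under assumptions: compute_extended_ngrams is importable from the
# module shown in the diff, and ExtendedNgramResult exposes its constructor
# arguments as attributes (e.g., as a dataclass).
from pystylometry.ngrams.extended_ngrams import compute_extended_ngrams

text = "the quick brown fox jumps over the lazy dog. " * 20

# POS n-grams disabled, so no spaCy model is required
result = compute_extended_ngrams(text, top_n=5, include_pos_ngrams=False)

# Top contiguous word trigrams as (string, count) pairs
print(result.top_word_trigrams)

# 2-skipgrams with gap 1 pair each word with the word two positions later,
# e.g. ('the', 'brown'), ('quick', 'fox')
print(result.top_skipgrams_2_1)

# Shannon entropy in bits; higher means a more uniform trigram distribution
print(result.word_trigram_entropy)
```

With `include_pos_ngrams=True`, the spaCy model named by `spacy_model` must be installed (`python -m spacy download <model>`); per the code above, a missing model raises an ImportError with that hint, while an absent spaCy install simply leaves the POS fields empty.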
pystylometry/prosody/README.md (new file)
@@ -0,0 +1,17 @@
+# prosody
+
+![1 public function](https://img.shields.io/badge/functions-1-blue)
+![Requires spaCy](https://img.shields.io/badge/requires-spaCy-orange)
+
+Rhythm and stress pattern analysis for written text.
+
+## Catalogue
+
+| File | Function | What It Measures |
+|------|----------|-----------------|
+| `rhythm_prosody.py` | `compute_rhythm_prosody` | Syllable stress patterns, rhythm regularity, prose rhythm metrics |
+
+## See Also
+
+- [`readability/syllables.py`](../readability/) for the syllable counting engine
+- [`syntactic/`](../syntactic/) for sentence structure features that interact with prosodic rhythm