pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/readability/ari.py
@@ -1,25 +1,99 @@
-"""Automated Readability Index (ARI)."""
+"""Automated Readability Index (ARI).

-from .._types import ARIResult
+This module implements the ARI readability formula with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
+
+from .._types import ARIResult, Distribution, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize

+# Formula coefficients from Senter & Smith (1967)
+_CHARACTER_COEFFICIENT = 4.71
+_WORD_COEFFICIENT = 0.5
+_INTERCEPT = -21.43
+
+
+def _get_age_range(grade_level: float) -> str:
+    """Map grade level to age range."""
+    if grade_level <= 0:
+        return "5-6 years (Kindergarten)"
+    elif grade_level <= 5:
+        return "6-11 years (Elementary)"
+    elif grade_level <= 8:
+        return "11-14 years (Middle School)"
+    elif grade_level <= 12:
+        return "14-18 years (High School)"
+    elif grade_level <= 14:
+        return "18-22 years (College)"
+    else:
+        return "22+ years (Graduate)"
+
+
+def _compute_ari_single(text: str) -> tuple[float, float, dict]:
+    """Compute ARI metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (ari_score, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+    character_count = sum(1 for char in text if char.isalnum())
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "character_count": 0},
+        )
+
+    # Calculate ratios
+    chars_per_word = character_count / len(tokens)
+    words_per_sentence = len(tokens) / len(sentences)
+
+    # Apply ARI formula
+    ari_score = (
+        _CHARACTER_COEFFICIENT * chars_per_word
+        + _WORD_COEFFICIENT * words_per_sentence
+        + _INTERCEPT
+    )
+
+    grade_level = max(0, min(20, math.floor(ari_score + 0.5)))

-def compute_ari(text: str) -> ARIResult:
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(tokens),
+        "character_count": character_count,
+        "characters_per_word": chars_per_word,
+        "words_per_sentence": words_per_sentence,
+    }
+
+    return (ari_score, float(grade_level), metadata)
+
+
+def compute_ari(text: str, chunk_size: int = 1000) -> ARIResult:
     """
     Compute Automated Readability Index (ARI).

+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Formula:
         ARI = 4.71 × (characters/words) + 0.5 × (words/sentences) - 21.43

-    The ARI is designed to gauge the understandability of a text and produces
-    an approximate representation of the US grade level needed to comprehend the text.
+    The ARI uses character counts and word counts (similar to Coleman-Liau)
+    but adds sentence length as a factor. It produces an approximate
+    representation of the US grade level needed to comprehend the text.

-    Grade Level to Age mapping:
-        1-5: 5-11 years
-        6-8: 11-14 years
-        9-12: 14-18 years
-        13-14: 18-22 years
-        14+: 22+ years (college level)
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27

     References:
         Senter, R. J., & Smith, E. A. (1967). Automated readability index.
@@ -27,44 +101,108 @@ def compute_ari(text: str) -> ARIResult:

     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.

     Returns:
-        ARIResult with ARI score, grade level, and age range
+        ARIResult with:
+        - ari_score: Mean ARI score across chunks
+        - grade_level: Mean grade level across chunks
+        - age_range: Age range based on mean grade level
+        - ari_score_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed

     Example:
-        >>> result = compute_ari("The quick brown fox jumps over the lazy dog.")
-        >>> print(f"ARI Score: {result.ari_score:.1f}")
-        >>> print(f"Grade Level: {result.grade_level}")
-        >>> print(f"Age Range: {result.age_range}")
+        >>> result = compute_ari("Long text here...", chunk_size=1000)
+        >>> result.ari_score  # Mean across chunks
+        9.5
+        >>> result.ari_score_dist.std  # Variance reveals fingerprint
+        1.5
     """
-    sentences = split_sentences(text)
-    tokens = tokenize(text)
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)

-    if len(sentences) == 0 or len(tokens) == 0:
+    # Compute metrics per chunk
+    ari_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_chars = 0
+
+    for chunk in chunks:
+        ai, gl, meta = _compute_ari_single(chunk)
+        if not math.isnan(ai):
+            ari_values.append(ai)
+            grade_values.append(gl)
+        total_sentences += meta.get("sentence_count", 0)
+        total_words += meta.get("word_count", 0)
+        total_chars += meta.get("character_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not ari_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return ARIResult(
-            ari_score=0.0,
-            grade_level=0,
+            ari_score=float("nan"),
+            grade_level=float("nan"),
             age_range="Unknown",
-            metadata={"sentence_count": 0, "word_count": 0, "character_count": 0},
+            ari_score_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "character_count": 0,
+                "characters_per_word": 0.0,
+                "words_per_sentence": 0.0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_character_count": 0,
+                "reliable": False,
+            },
         )

-    # Count characters (letters, numbers, excluding spaces and punctuation)
-    character_count = sum(1 for char in text if char.isalnum())
+    # Build distributions
+    ari_dist = make_distribution(ari_values)
+    grade_dist = make_distribution(grade_values)
+
+    # Get age range from mean grade level
+    age_range = _get_age_range(grade_dist.mean)

-    # TODO: Implement ARI formula
-    ari_score = 0.0  # Placeholder
-    grade_level = 0  # Placeholder
-    age_range = "Unknown"  # Placeholder
+    # Reliability heuristic
+    reliable = total_words >= 100

     return ARIResult(
-        ari_score=ari_score,
-        grade_level=grade_level,
+        ari_score=ari_dist.mean,
+        grade_level=grade_dist.mean,
         age_range=age_range,
+        ari_score_dist=ari_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-            "sentence_count": len(sentences),
-            "word_count": len(tokens),
-            "character_count": character_count,
-            "characters_per_word": character_count / len(tokens) if tokens else 0,
-            "words_per_sentence": len(tokens) / len(sentences) if sentences else 0,
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "character_count": total_chars,
+            "characters_per_word": total_chars / total_words if total_words > 0 else 0,
+            "words_per_sentence": total_words / total_sentences if total_sentences > 0 else 0,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_character_count": total_chars,
+            "reliable": reliable,
         },
     )
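
To see what the new per-chunk arithmetic does, here is a small self-contained sketch mirroring _compute_ari_single above. The split_sentences and tokenize helpers live in pystylometry's private _utils module, which this diff does not show, so naive regex stand-ins are used here; exact scores from the library may differ slightly.

import math
import re


def naive_split_sentences(text: str) -> list[str]:
    # Stand-in for _utils.split_sentences (assumption: the real splitter
    # handles abbreviations and edge cases more carefully).
    return [s for s in re.split(r"[.!?]+", text) if s.strip()]


def naive_tokenize(text: str) -> list[str]:
    # Stand-in for _utils.tokenize.
    return re.findall(r"\w+", text)


def ari(text: str) -> tuple[float, int]:
    # Same arithmetic as _compute_ari_single above:
    # ARI = 4.71 * (chars/words) + 0.5 * (words/sentences) - 21.43
    sentences = naive_split_sentences(text)
    tokens = naive_tokenize(text)
    chars = sum(1 for c in text if c.isalnum())
    score = 4.71 * (chars / len(tokens)) + 0.5 * (len(tokens) / len(sentences)) - 21.43
    grade = max(0, min(20, math.floor(score + 0.5)))  # round, then clamp to [0, 20]
    return score, grade


score, grade = ari("The quick brown fox jumps over the lazy dog. It was not amused.")
print(f"ARI {score:.2f} -> grade {grade}")

Short, simple sentences like these land at or below grade 0 after clamping, which is why compute_ari only flags a result as reliable once the totals reach 100 words.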
pystylometry/readability/coleman_liau.py
@@ -1,13 +1,69 @@
-"""Coleman-Liau Index."""
+"""Coleman-Liau Index.

-from .._types import ColemanLiauResult
+This module implements the Coleman-Liau readability formula with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
+
+from .._types import ColemanLiauResult, Distribution, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize

+# Regression coefficients from Coleman & Liau (1975)
+_LETTER_COEFFICIENT = 0.0588
+_SENTENCE_COEFFICIENT = -0.296
+_INTERCEPT = -15.8
+

-def compute_coleman_liau(text: str) -> ColemanLiauResult:
+def _compute_coleman_liau_single(text: str) -> tuple[float, float, dict]:
+    """Compute Coleman-Liau metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (cli_index, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    all_tokens = tokenize(text)
+    tokens = [token for token in all_tokens if any(char.isalpha() for char in token)]
+    letter_count = sum(1 for token in tokens for char in token if char.isalpha())
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "letter_count": 0},
+        )
+
+    # Calculate per 100 words
+    L = (letter_count / len(tokens)) * 100  # noqa: N806
+    S = (len(sentences) / len(tokens)) * 100  # noqa: N806
+
+    # Compute Coleman-Liau Index
+    cli_index = _LETTER_COEFFICIENT * L + _SENTENCE_COEFFICIENT * S + _INTERCEPT
+    grade_level = max(0, math.floor(cli_index + 0.5))
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(tokens),
+        "letter_count": letter_count,
+        "letters_per_100_words": L,
+        "sentences_per_100_words": S,
+    }
+
+    return (cli_index, float(grade_level), metadata)
+
+
+def compute_coleman_liau(text: str, chunk_size: int = 1000) -> ColemanLiauResult:
     """
     Compute Coleman-Liau Index.

+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Formula:
         CLI = 0.0588 × L - 0.296 × S - 15.8

@@ -16,7 +72,11 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
         S = average number of sentences per 100 words

     The Coleman-Liau index relies on characters rather than syllables,
-    making it easier to compute and potentially more language-agnostic.
+    making it easier to compute and not requiring syllable-counting algorithms.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27

     References:
         Coleman, M., & Liau, T. L. (1975). A computer readability formula
@@ -24,44 +84,104 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:

     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.

     Returns:
-        ColemanLiauResult with CLI index and grade level
+        ColemanLiauResult with:
+        - cli_index: Mean CLI across chunks
+        - grade_level: Mean grade level across chunks
+        - cli_index_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed

     Example:
-        >>> result = compute_coleman_liau("The quick brown fox jumps over the lazy dog.")
-        >>> print(f"CLI Index: {result.cli_index:.1f}")
-        >>> print(f"Grade Level: {result.grade_level}")
+        >>> result = compute_coleman_liau("Long text here...", chunk_size=1000)
+        >>> result.cli_index  # Mean across chunks
+        8.5
+        >>> result.cli_index_dist.std  # Variance reveals fingerprint
+        1.2
     """
-    sentences = split_sentences(text)
-    tokens = tokenize(text)
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)

-    if len(sentences) == 0 or len(tokens) == 0:
+    # Compute metrics per chunk
+    cli_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_letters = 0
+
+    for chunk in chunks:
+        ci, gl, meta = _compute_coleman_liau_single(chunk)
+        if not math.isnan(ci):
+            cli_values.append(ci)
+            grade_values.append(gl)
+        total_sentences += meta.get("sentence_count", 0)
+        total_words += meta.get("word_count", 0)
+        total_letters += meta.get("letter_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not cli_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return ColemanLiauResult(
-            cli_index=0.0,
-            grade_level=0,
-            metadata={"sentence_count": 0, "word_count": 0, "letter_count": 0},
+            cli_index=float("nan"),
+            grade_level=float("nan"),
+            cli_index_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "letter_count": 0,
+                "letters_per_100_words": 0.0,
+                "sentences_per_100_words": 0.0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_letter_count": 0,
+                "reliable": False,
+            },
         )

-    # Count letters (excluding spaces and punctuation)
-    letter_count = sum(1 for char in text if char.isalpha())
-
-    # Calculate per 100 words
-    L = (letter_count / len(tokens)) * 100  # noqa: N806
-    S = (len(sentences) / len(tokens)) * 100  # noqa: N806
+    # Build distributions
+    cli_dist = make_distribution(cli_values)
+    grade_dist = make_distribution(grade_values)

-    # TODO: Implement Coleman-Liau formula
-    cli_index = 0.0  # Placeholder
-    grade_level = 0  # Placeholder
+    # Reliability heuristic
+    reliable = total_words >= 100

     return ColemanLiauResult(
-        cli_index=cli_index,
-        grade_level=grade_level,
+        cli_index=cli_dist.mean,
+        grade_level=grade_dist.mean,
+        cli_index_dist=cli_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-            "sentence_count": len(sentences),
-            "word_count": len(tokens),
-            "letter_count": letter_count,
-            "letters_per_100_words": L,
-            "sentences_per_100_words": S,
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "letter_count": total_letters,
+            "letters_per_100_words": (total_letters / total_words * 100) if total_words > 0 else 0,
+            "sentences_per_100_words": (total_sentences / total_words * 100)
+            if total_words > 0
+            else 0,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_letter_count": total_letters,
+            "reliable": reliable,
         },
     )
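
Both rewrites delegate chunking and aggregation to chunk_text and make_distribution from _types.py, which this diff lists (+1954 lines) but does not display. As a rough mental model only, the pattern they imply looks something like the sketch below; the names match the diff, but these bodies are guesses, and the real Distribution is a dataclass rather than a dict.

import statistics


def chunk_text(text: str, chunk_size: int) -> list[str]:
    # Hypothetical: group whitespace-separated words into fixed-size windows.
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


def make_distribution(values: list[float]) -> dict:
    # Hypothetical stand-in producing the stats the diff references:
    # per-chunk values plus mean, median, std, range, and iqr.
    if len(values) < 2:
        v = values[0] if values else float("nan")
        return {"values": values, "mean": v, "median": v, "std": 0.0, "range": 0.0, "iqr": 0.0}
    quartiles = statistics.quantiles(values, n=4)
    return {
        "values": values,
        "mean": statistics.fmean(values),
        "median": statistics.median(values),
        "std": statistics.stdev(values),  # sample vs. population std is the library's call
        "range": max(values) - min(values),
        "iqr": quartiles[2] - quartiles[0],
    }


# Per-chunk CLI values like those compute_coleman_liau aggregates:
dist = make_distribution([8.1, 9.0, 8.4, 8.8])
print(dist["mean"], dist["std"])  # the mean becomes cli_index; std is the per-chunk spread

With chunk_size=1000 and a 4,000-word document, compute_coleman_liau produces four per-chunk indices; cli_index is their mean, and cli_index_dist.std measures how much letter and sentence density drift across the document, which is the variance signal the stylometric fingerprinting relies on.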