pystylometry-1.0.0-py3-none-any.whl → pystylometry-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. pystylometry/__init__.py +29 -3
  2. pystylometry/_types.py +963 -259
  3. pystylometry/authorship/__init__.py +23 -2
  4. pystylometry/authorship/additional_methods.py +4 -29
  5. pystylometry/authorship/kilgarriff.py +347 -0
  6. pystylometry/character/character_metrics.py +267 -179
  7. pystylometry/cli.py +427 -0
  8. pystylometry/consistency/__init__.py +57 -0
  9. pystylometry/consistency/_thresholds.py +162 -0
  10. pystylometry/consistency/drift.py +549 -0
  11. pystylometry/dialect/__init__.py +65 -0
  12. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  13. pystylometry/dialect/_loader.py +360 -0
  14. pystylometry/dialect/detector.py +533 -0
  15. pystylometry/lexical/advanced_diversity.py +61 -22
  16. pystylometry/lexical/function_words.py +255 -56
  17. pystylometry/lexical/hapax.py +182 -52
  18. pystylometry/lexical/mtld.py +108 -26
  19. pystylometry/lexical/ttr.py +76 -10
  20. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  21. pystylometry/lexical/yule.py +136 -50
  22. pystylometry/ngrams/entropy.py +150 -49
  23. pystylometry/readability/additional_formulas.py +1887 -762
  24. pystylometry/readability/ari.py +144 -82
  25. pystylometry/readability/coleman_liau.py +136 -109
  26. pystylometry/readability/flesch.py +177 -73
  27. pystylometry/readability/gunning_fog.py +165 -161
  28. pystylometry/readability/smog.py +123 -42
  29. pystylometry/syntactic/advanced_syntactic.py +76 -14
  30. pystylometry/syntactic/pos_ratios.py +70 -6
  31. pystylometry/syntactic/sentence_stats.py +55 -12
  32. pystylometry/syntactic/sentence_types.py +71 -15
  33. pystylometry/viz/__init__.py +71 -0
  34. pystylometry/viz/drift.py +589 -0
  35. pystylometry/viz/jsx/__init__.py +31 -0
  36. pystylometry/viz/jsx/_base.py +144 -0
  37. pystylometry/viz/jsx/report.py +677 -0
  38. pystylometry/viz/jsx/timeline.py +716 -0
  39. pystylometry/viz/jsx/viewer.py +1032 -0
  40. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
  41. pystylometry-1.1.0.dist-info/RECORD +63 -0
  42. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
  43. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  44. pystylometry-1.0.0.dist-info/RECORD +0 -46
@@ -1,34 +1,26 @@
1
- """Automated Readability Index (ARI)."""
1
+ """Automated Readability Index (ARI).
2
+
3
+ This module implements the ARI readability formula with native chunked
4
+ analysis for stylometric fingerprinting.
5
+
6
+ Related GitHub Issue:
7
+ #27 - Native chunked analysis with Distribution dataclass
8
+ https://github.com/craigtrim/pystylometry/issues/27
9
+ """
2
10
 
3
11
  import math
4
12
 
5
- from .._types import ARIResult
13
+ from .._types import ARIResult, Distribution, chunk_text, make_distribution
6
14
  from .._utils import split_sentences, tokenize
7
15
 
8
16
  # Formula coefficients from Senter & Smith (1967)
9
- # Reference: Senter, R. J., & Smith, E. A. (1967). Automated readability index.
10
- # AMRL-TR-6620. Aerospace Medical Research Laboratories.
11
-
12
- # Coefficient for characters per word
13
17
  _CHARACTER_COEFFICIENT = 4.71
14
-
15
- # Coefficient for words per sentence
16
18
  _WORD_COEFFICIENT = 0.5
17
-
18
- # Intercept to calibrate scale to U.S. grade levels
19
19
  _INTERCEPT = -21.43
20
20
 
21
21
 
22
- def _get_age_range(grade_level: int) -> str:
23
- """
24
- Map grade level to age range.
25
-
26
- Args:
27
- grade_level: U.S. grade level (0-20+)
28
-
29
- Returns:
30
- Age range string
31
- """
22
+ def _get_age_range(grade_level: float) -> str:
23
+ """Map grade level to age range."""
32
24
  if grade_level <= 0:
33
25
  return "5-6 years (Kindergarten)"
34
26
  elif grade_level <= 5:
@@ -43,10 +35,55 @@ def _get_age_range(grade_level: int) -> str:
43
35
  return "22+ years (Graduate)"
44
36
 
45
37
 
46
- def compute_ari(text: str) -> ARIResult:
38
+ def _compute_ari_single(text: str) -> tuple[float, float, dict]:
39
+ """Compute ARI metrics for a single chunk of text.
40
+
41
+ Returns:
42
+ Tuple of (ari_score, grade_level, metadata_dict).
43
+ Returns (nan, nan, metadata) for empty/invalid input.
44
+ """
45
+ sentences = split_sentences(text)
46
+ tokens = tokenize(text)
47
+ character_count = sum(1 for char in text if char.isalnum())
48
+
49
+ if len(sentences) == 0 or len(tokens) == 0:
50
+ return (
51
+ float("nan"),
52
+ float("nan"),
53
+ {"sentence_count": 0, "word_count": 0, "character_count": 0},
54
+ )
55
+
56
+ # Calculate ratios
57
+ chars_per_word = character_count / len(tokens)
58
+ words_per_sentence = len(tokens) / len(sentences)
59
+
60
+ # Apply ARI formula
61
+ ari_score = (
62
+ _CHARACTER_COEFFICIENT * chars_per_word
63
+ + _WORD_COEFFICIENT * words_per_sentence
64
+ + _INTERCEPT
65
+ )
66
+
67
+ grade_level = max(0, min(20, math.floor(ari_score + 0.5)))
68
+
69
+ metadata = {
70
+ "sentence_count": len(sentences),
71
+ "word_count": len(tokens),
72
+ "character_count": character_count,
73
+ "characters_per_word": chars_per_word,
74
+ "words_per_sentence": words_per_sentence,
75
+ }
76
+
77
+ return (ari_score, float(grade_level), metadata)
78
+
79
+
80
+ def compute_ari(text: str, chunk_size: int = 1000) -> ARIResult:
47
81
  """
48
82
  Compute Automated Readability Index (ARI).
49
83
 
84
+ This function uses native chunked analysis to capture variance and patterns
85
+ across the text, which is essential for stylometric fingerprinting.
86
+
50
87
  Formula:
51
88
  ARI = 4.71 × (characters/words) + 0.5 × (words/sentences) - 21.43
52
89
 
@@ -54,18 +91,9 @@ def compute_ari(text: str) -> ARIResult:
54
91
  but adds sentence length as a factor. It produces an approximate
55
92
  representation of the US grade level needed to comprehend the text.
56
93
 
57
- **Implementation Notes:**
58
- - Grade levels are clamped to [0, 20] range
59
- - Uses round-half-up rounding for grade level calculation
60
- - Character count includes alphanumeric characters only (letters and digits)
61
- - Reliability heuristic: 100+ words recommended
62
-
63
- Grade Level to Age mapping:
64
- 1-5: 6-11 years (Elementary)
65
- 6-8: 11-14 years (Middle School)
66
- 9-12: 14-18 years (High School)
67
- 13-14: 18-22 years (College)
68
- 15+: 22+ years (Graduate)
94
+ Related GitHub Issue:
95
+ #27 - Native chunked analysis with Distribution dataclass
96
+ https://github.com/craigtrim/pystylometry/issues/27
69
97
 
70
98
  References:
71
99
  Senter, R. J., & Smith, E. A. (1967). Automated readability index.
@@ -73,74 +101,108 @@ def compute_ari(text: str) -> ARIResult:
73
101
 
74
102
  Args:
75
103
  text: Input text to analyze
104
+ chunk_size: Number of words per chunk (default: 1000).
105
+ The text is divided into chunks of this size, and metrics are
106
+ computed per-chunk.
76
107
 
77
108
  Returns:
78
- ARIResult with ARI score, grade level, and age range
109
+ ARIResult with:
110
+ - ari_score: Mean ARI score across chunks
111
+ - grade_level: Mean grade level across chunks
112
+ - age_range: Age range based on mean grade level
113
+ - ari_score_dist: Distribution with per-chunk values and stats
114
+ - grade_level_dist: Distribution with per-chunk values and stats
115
+ - chunk_size: The chunk size used
116
+ - chunk_count: Number of chunks analyzed
79
117
 
80
118
  Example:
81
- >>> result = compute_ari("The quick brown fox jumps over the lazy dog.")
82
- >>> print(f"ARI Score: {result.ari_score:.1f}")
83
- ARI Score: 0.1
84
- >>> print(f"Grade Level: {result.grade_level}")
85
- Grade Level: 0
86
- >>> print(f"Age Range: {result.age_range}")
87
- Age Range: 5-6 years (Kindergarten)
88
- >>> result.metadata["reliable"]
89
- False
119
+ >>> result = compute_ari("Long text here...", chunk_size=1000)
120
+ >>> result.ari_score # Mean across chunks
121
+ 9.5
122
+ >>> result.ari_score_dist.std # Variance reveals fingerprint
123
+ 1.5
90
124
  """
91
- sentences = split_sentences(text)
92
- tokens = tokenize(text)
93
-
94
- # Count characters (alphanumeric: letters and digits, excluding spaces/punctuation)
95
- # Computed before early return to ensure metadata consistency
96
- character_count = sum(1 for char in text if char.isalnum())
97
-
98
- if len(sentences) == 0 or len(tokens) == 0:
125
+ # Chunk the text
126
+ chunks = chunk_text(text, chunk_size)
127
+
128
+ # Compute metrics per chunk
129
+ ari_values = []
130
+ grade_values = []
131
+ total_sentences = 0
132
+ total_words = 0
133
+ total_chars = 0
134
+
135
+ for chunk in chunks:
136
+ ai, gl, meta = _compute_ari_single(chunk)
137
+ if not math.isnan(ai):
138
+ ari_values.append(ai)
139
+ grade_values.append(gl)
140
+ total_sentences += meta.get("sentence_count", 0)
141
+ total_words += meta.get("word_count", 0)
142
+ total_chars += meta.get("character_count", 0)
143
+
144
+ # Handle empty or all-invalid chunks
145
+ if not ari_values:
146
+ empty_dist = Distribution(
147
+ values=[],
148
+ mean=float("nan"),
149
+ median=float("nan"),
150
+ std=0.0,
151
+ range=0.0,
152
+ iqr=0.0,
153
+ )
99
154
  return ARIResult(
100
- ari_score=0.0,
101
- grade_level=0,
102
- age_range="5-6 years (Kindergarten)",
155
+ ari_score=float("nan"),
156
+ grade_level=float("nan"),
157
+ age_range="Unknown",
158
+ ari_score_dist=empty_dist,
159
+ grade_level_dist=empty_dist,
160
+ chunk_size=chunk_size,
161
+ chunk_count=len(chunks),
103
162
  metadata={
104
- "sentence_count": len(sentences),
105
- "word_count": len(tokens),
106
- "character_count": character_count,
163
+ # Backward-compatible keys
164
+ "sentence_count": 0,
165
+ "word_count": 0,
166
+ "character_count": 0,
107
167
  "characters_per_word": 0.0,
108
168
  "words_per_sentence": 0.0,
169
+ # New prefixed keys for consistency
170
+ "total_sentence_count": 0,
171
+ "total_word_count": 0,
172
+ "total_character_count": 0,
109
173
  "reliable": False,
110
174
  },
111
175
  )
112
176
 
113
- # Calculate ratios
114
- chars_per_word = character_count / len(tokens)
115
- words_per_sentence = len(tokens) / len(sentences)
116
-
117
- # Apply ARI formula
118
- ari_score = (
119
- _CHARACTER_COEFFICIENT * chars_per_word
120
- + _WORD_COEFFICIENT * words_per_sentence
121
- + _INTERCEPT
122
- )
123
-
124
- # Use round-half-up rounding and clamp to valid grade range [0, 20]
125
- # math.floor(x + 0.5) implements round-half-up for both positive and negative values
126
- grade_level = max(0, min(20, math.floor(ari_score + 0.5)))
177
+ # Build distributions
178
+ ari_dist = make_distribution(ari_values)
179
+ grade_dist = make_distribution(grade_values)
127
180
 
128
- # Get age range from grade level
129
- age_range = _get_age_range(grade_level)
181
+ # Get age range from mean grade level
182
+ age_range = _get_age_range(grade_dist.mean)
130
183
 
131
- # Reliability heuristic: like other readability metrics, 100+ words recommended
132
- reliable = len(tokens) >= 100
184
+ # Reliability heuristic
185
+ reliable = total_words >= 100
133
186
 
134
187
  return ARIResult(
135
- ari_score=ari_score,
136
- grade_level=grade_level,
188
+ ari_score=ari_dist.mean,
189
+ grade_level=grade_dist.mean,
137
190
  age_range=age_range,
191
+ ari_score_dist=ari_dist,
192
+ grade_level_dist=grade_dist,
193
+ chunk_size=chunk_size,
194
+ chunk_count=len(chunks),
138
195
  metadata={
139
- "sentence_count": len(sentences),
140
- "word_count": len(tokens),
141
- "character_count": character_count,
142
- "characters_per_word": chars_per_word,
143
- "words_per_sentence": words_per_sentence,
196
+ # Backward-compatible keys
197
+ "sentence_count": total_sentences,
198
+ "word_count": total_words,
199
+ "character_count": total_chars,
200
+ "characters_per_word": total_chars / total_words if total_words > 0 else 0,
201
+ "words_per_sentence": total_words / total_sentences if total_sentences > 0 else 0,
202
+ # New prefixed keys for consistency
203
+ "total_sentence_count": total_sentences,
204
+ "total_word_count": total_words,
205
+ "total_character_count": total_chars,
144
206
  "reliable": reliable,
145
207
  },
146
208
  )
@@ -1,31 +1,69 @@
1
- """Coleman-Liau Index."""
1
+ """Coleman-Liau Index.
2
+
3
+ This module implements the Coleman-Liau readability formula with native chunked
4
+ analysis for stylometric fingerprinting.
5
+
6
+ Related GitHub Issue:
7
+ #27 - Native chunked analysis with Distribution dataclass
8
+ https://github.com/craigtrim/pystylometry/issues/27
9
+ """
2
10
 
3
11
  import math
4
12
 
5
- from .._types import ColemanLiauResult
13
+ from .._types import ColemanLiauResult, Distribution, chunk_text, make_distribution
6
14
  from .._utils import split_sentences, tokenize
7
15
 
8
16
  # Regression coefficients from Coleman & Liau (1975)
9
- # Derived from empirical analysis of Cloze test results on graded texts
10
- # Reference: Coleman, M., & Liau, T. L. (1975). A computer readability formula
11
- # designed for machine scoring. Journal of Applied Psychology, 60(2), 283.
12
-
13
- # Coefficient for letters per 100 words
14
- # Represents impact of word length on reading difficulty
15
17
  _LETTER_COEFFICIENT = 0.0588
16
-
17
- # Coefficient for sentences per 100 words (negative: more sentences = easier)
18
- # Represents impact of sentence length on reading difficulty
19
18
  _SENTENCE_COEFFICIENT = -0.296
20
-
21
- # Intercept to calibrate scale to U.S. grade levels (1-16)
22
19
  _INTERCEPT = -15.8
23
20
 
24
21
 
25
- def compute_coleman_liau(text: str) -> ColemanLiauResult:
22
+ def _compute_coleman_liau_single(text: str) -> tuple[float, float, dict]:
23
+ """Compute Coleman-Liau metrics for a single chunk of text.
24
+
25
+ Returns:
26
+ Tuple of (cli_index, grade_level, metadata_dict).
27
+ Returns (nan, nan, metadata) for empty/invalid input.
28
+ """
29
+ sentences = split_sentences(text)
30
+ all_tokens = tokenize(text)
31
+ tokens = [token for token in all_tokens if any(char.isalpha() for char in token)]
32
+ letter_count = sum(1 for token in tokens for char in token if char.isalpha())
33
+
34
+ if len(sentences) == 0 or len(tokens) == 0:
35
+ return (
36
+ float("nan"),
37
+ float("nan"),
38
+ {"sentence_count": 0, "word_count": 0, "letter_count": 0},
39
+ )
40
+
41
+ # Calculate per 100 words
42
+ L = (letter_count / len(tokens)) * 100 # noqa: N806
43
+ S = (len(sentences) / len(tokens)) * 100 # noqa: N806
44
+
45
+ # Compute Coleman-Liau Index
46
+ cli_index = _LETTER_COEFFICIENT * L + _SENTENCE_COEFFICIENT * S + _INTERCEPT
47
+ grade_level = max(0, math.floor(cli_index + 0.5))
48
+
49
+ metadata = {
50
+ "sentence_count": len(sentences),
51
+ "word_count": len(tokens),
52
+ "letter_count": letter_count,
53
+ "letters_per_100_words": L,
54
+ "sentences_per_100_words": S,
55
+ }
56
+
57
+ return (cli_index, float(grade_level), metadata)
58
+
59
+
60
+ def compute_coleman_liau(text: str, chunk_size: int = 1000) -> ColemanLiauResult:
26
61
  """
27
62
  Compute Coleman-Liau Index.
28
63
 
64
+ This function uses native chunked analysis to capture variance and patterns
65
+ across the text, which is essential for stylometric fingerprinting.
66
+
29
67
  Formula:
30
68
  CLI = 0.0588 × L - 0.296 × S - 15.8
31
69
 
@@ -36,19 +74,9 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
36
74
  The Coleman-Liau index relies on characters rather than syllables,
37
75
  making it easier to compute and not requiring syllable-counting algorithms.
38
76
 
39
- **Implementation Notes:**
40
- - Grade levels are NOT clamped (removed upper bound of 20 per PR #2 review).
41
- The original Coleman & Liau (1975) paper calibrated to grades 1-16 but did not
42
- specify an upper bound. Post-graduate texts may exceed grade 20.
43
- - Uses round-half-up rounding (not banker's rounding) for grade level calculation
44
- - Letter counts (Unicode alphabetic characters only) computed from tokenized words
45
- to ensure measurement consistency. Both letter count and word count use identical
46
- tokenization logic, preventing divergence in edge cases (emails, URLs, hyphens).
47
- See PR #2 review discussion: https://github.com/craigtrim/pystylometry/pull/2
48
- - Reliability heuristic based on validation study passage lengths (~100 words);
49
- shorter texts flagged in metadata
50
- - English-centric sentence splitting and Unicode assumptions limit true
51
- cross-language applicability
77
+ Related GitHub Issue:
78
+ #27 - Native chunked analysis with Distribution dataclass
79
+ https://github.com/craigtrim/pystylometry/issues/27
52
80
 
53
81
  References:
54
82
  Coleman, M., & Liau, T. L. (1975). A computer readability formula
@@ -56,105 +84,104 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
56
84
 
57
85
  Args:
58
86
  text: Input text to analyze
87
+ chunk_size: Number of words per chunk (default: 1000).
88
+ The text is divided into chunks of this size, and metrics are
89
+ computed per-chunk.
59
90
 
60
91
  Returns:
61
- ColemanLiauResult with CLI index and grade level
92
+ ColemanLiauResult with:
93
+ - cli_index: Mean CLI across chunks
94
+ - grade_level: Mean grade level across chunks
95
+ - cli_index_dist: Distribution with per-chunk values and stats
96
+ - grade_level_dist: Distribution with per-chunk values and stats
97
+ - chunk_size: The chunk size used
98
+ - chunk_count: Number of chunks analyzed
62
99
 
63
100
  Example:
64
- >>> result = compute_coleman_liau("The quick brown fox jumps over the lazy dog.")
65
- >>> print(f"CLI Index: {result.cli_index:.1f}")
66
- CLI Index: 3.8
67
- >>> print(f"Grade Level: {result.grade_level}")
68
- Grade Level: 4
69
- >>> result.metadata["reliable"]
70
- False
101
+ >>> result = compute_coleman_liau("Long text here...", chunk_size=1000)
102
+ >>> result.cli_index # Mean across chunks
103
+ 8.5
104
+ >>> result.cli_index_dist.std # Variance reveals fingerprint
105
+ 1.2
71
106
  """
72
- sentences = split_sentences(text)
73
- all_tokens = tokenize(text)
74
-
75
- # Filter to only tokens that contain at least one alphabetic character
76
- # This excludes pure punctuation (. ! ?) but keeps words with mixed content
77
- # (Hello123, Test@example.com) to count their letters per Coleman-Liau spec.
78
- # This is different from Gunning Fog which uses stricter normalization.
79
- tokens = [token for token in all_tokens if any(char.isalpha() for char in token)]
80
-
81
- # CRITICAL: Count letters from tokenized words, NOT from raw text
82
- # ===============================================================
83
- # Coleman & Liau (1975) define L as "average number of letters per 100 words"
84
- # where both letters and words must be measured consistently from the same text units.
85
- #
86
- # Original implementation (buggy):
87
- # letter_count = sum(1 for char in text if char.isalpha())
88
- # This counted letters from RAW text but words from TOKENIZED text
89
- #
90
- # Problem cases (PR #2 review https://github.com/craigtrim/pystylometry/pull/2):
91
- # - "test@example.com" tokenizer may split into ['test', '@', 'example', '.', 'com']
92
- # Raw letter count: 15 letters, Token count: 5 tokens → wrong ratio
93
- # - "co-operate" → tokenizer may split into ['co', '-', 'operate']
94
- # Raw letter count: 9 letters, Token count: 3 tokens → wrong ratio
95
- # - URLs, special tokens, etc. → similar inconsistencies
96
- #
97
- # Fixed implementation:
98
- # Count only alphabetic characters that appear in valid word tokens (after normalization).
99
- # This ensures both letter count and word count use identical tokenization logic,
100
- # maintaining the mathematical integrity of the L term in the Coleman-Liau formula.
101
- letter_count = sum(1 for token in tokens for char in token if char.isalpha())
102
-
103
- if len(sentences) == 0 or len(tokens) == 0:
107
+ # Chunk the text
108
+ chunks = chunk_text(text, chunk_size)
109
+
110
+ # Compute metrics per chunk
111
+ cli_values = []
112
+ grade_values = []
113
+ total_sentences = 0
114
+ total_words = 0
115
+ total_letters = 0
116
+
117
+ for chunk in chunks:
118
+ ci, gl, meta = _compute_coleman_liau_single(chunk)
119
+ if not math.isnan(ci):
120
+ cli_values.append(ci)
121
+ grade_values.append(gl)
122
+ total_sentences += meta.get("sentence_count", 0)
123
+ total_words += meta.get("word_count", 0)
124
+ total_letters += meta.get("letter_count", 0)
125
+
126
+ # Handle empty or all-invalid chunks
127
+ if not cli_values:
128
+ empty_dist = Distribution(
129
+ values=[],
130
+ mean=float("nan"),
131
+ median=float("nan"),
132
+ std=0.0,
133
+ range=0.0,
134
+ iqr=0.0,
135
+ )
104
136
  return ColemanLiauResult(
105
- cli_index=0.0,
106
- grade_level=0,
137
+ cli_index=float("nan"),
138
+ grade_level=float("nan"),
139
+ cli_index_dist=empty_dist,
140
+ grade_level_dist=empty_dist,
141
+ chunk_size=chunk_size,
142
+ chunk_count=len(chunks),
107
143
  metadata={
108
- "sentence_count": len(sentences),
109
- "word_count": len(tokens),
110
- "letter_count": letter_count,
144
+ # Backward-compatible keys
145
+ "sentence_count": 0,
146
+ "word_count": 0,
147
+ "letter_count": 0,
111
148
  "letters_per_100_words": 0.0,
112
149
  "sentences_per_100_words": 0.0,
150
+ # New prefixed keys for consistency
151
+ "total_sentence_count": 0,
152
+ "total_word_count": 0,
153
+ "total_letter_count": 0,
113
154
  "reliable": False,
114
155
  },
115
156
  )
116
157
 
117
- # Calculate per 100 words
118
- L = (letter_count / len(tokens)) * 100 # noqa: N806
119
- S = (len(sentences) / len(tokens)) * 100 # noqa: N806
120
-
121
- # Compute Coleman-Liau Index using empirically-derived coefficients
122
- cli_index = _LETTER_COEFFICIENT * L + _SENTENCE_COEFFICIENT * S + _INTERCEPT
123
-
124
- # Grade Level Calculation and Bounds
125
- # ===================================
126
- # Round-half-up rounding (not Python's default banker's rounding):
127
- # 4.5 → 5 (always rounds up), not round-half-to-even
128
- # math.floor(x + 0.5) implements this for both positive and negative values
129
- #
130
- # Lower bound (0): Prevent negative grades for very simple texts
131
- # Coleman & Liau (1975) calibrated to U.S. grades 1-16, but simpler texts
132
- # (e.g., "Go. Run. Stop.") can produce negative CLI values. We clamp to 0
133
- # as there is no "negative grade level" in the educational system.
134
- #
135
- # Upper bound (REMOVED per PR #2 review):
136
- # Original implementation clamped at grade 20, but this was arbitrary.
137
- # Coleman & Liau (1975) did not specify an upper bound in their paper.
138
- # Clamping discards information: PhD dissertations (grade 25) and complex
139
- # legal documents (grade 30+) would both report as grade 20, making them
140
- # indistinguishable. The empirical formula should determine the full range.
141
- #
142
- # See PR #2 discussion: https://github.com/craigtrim/pystylometry/pull/2
143
- grade_level = max(0, math.floor(cli_index + 0.5))
158
+ # Build distributions
159
+ cli_dist = make_distribution(cli_values)
160
+ grade_dist = make_distribution(grade_values)
144
161
 
145
- # Reliability heuristic: validation study used ~100-word passages
146
- # Not a hard minimum, but shorter texts may deviate from expected behavior
147
- reliable = len(tokens) >= 100
162
+ # Reliability heuristic
163
+ reliable = total_words >= 100
148
164
 
149
165
  return ColemanLiauResult(
150
- cli_index=cli_index,
151
- grade_level=grade_level,
166
+ cli_index=cli_dist.mean,
167
+ grade_level=grade_dist.mean,
168
+ cli_index_dist=cli_dist,
169
+ grade_level_dist=grade_dist,
170
+ chunk_size=chunk_size,
171
+ chunk_count=len(chunks),
152
172
  metadata={
153
- "sentence_count": len(sentences),
154
- "word_count": len(tokens),
155
- "letter_count": letter_count,
156
- "letters_per_100_words": L,
157
- "sentences_per_100_words": S,
173
+ # Backward-compatible keys
174
+ "sentence_count": total_sentences,
175
+ "word_count": total_words,
176
+ "letter_count": total_letters,
177
+ "letters_per_100_words": (total_letters / total_words * 100) if total_words > 0 else 0,
178
+ "sentences_per_100_words": (total_sentences / total_words * 100)
179
+ if total_words > 0
180
+ else 0,
181
+ # New prefixed keys for consistency
182
+ "total_sentence_count": total_sentences,
183
+ "total_word_count": total_words,
184
+ "total_letter_count": total_letters,
158
185
  "reliable": reliable,
159
186
  },
160
187
  )