pystylometry 1.0.0-py3-none-any.whl → 1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/readability/ari.py
CHANGED
@@ -1,34 +1,26 @@
-"""Automated Readability Index (ARI)."""
+"""Automated Readability Index (ARI).
+
+This module implements the ARI readability formula with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
 import math
 
-from .._types import ARIResult
+from .._types import ARIResult, Distribution, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 
 # Formula coefficients from Senter & Smith (1967)
-# Reference: Senter, R. J., & Smith, E. A. (1967). Automated readability index.
-# AMRL-TR-6620. Aerospace Medical Research Laboratories.
-
-# Coefficient for characters per word
 _CHARACTER_COEFFICIENT = 4.71
-
-# Coefficient for words per sentence
 _WORD_COEFFICIENT = 0.5
-
-# Intercept to calibrate scale to U.S. grade levels
 _INTERCEPT = -21.43
 
 
-def _get_age_range(grade_level: int) -> str:
-    """
-    Map grade level to age range.
-
-    Args:
-        grade_level: U.S. grade level (0-20+)
-
-    Returns:
-        Age range string
-    """
+def _get_age_range(grade_level: float) -> str:
+    """Map grade level to age range."""
     if grade_level <= 0:
         return "5-6 years (Kindergarten)"
     elif grade_level <= 5:
@@ -43,10 +35,55 @@ def _get_age_range(grade_level: int) -> str:
         return "22+ years (Graduate)"
 
 
-def compute_ari(text: str) -> ARIResult:
+def _compute_ari_single(text: str) -> tuple[float, float, dict]:
+    """Compute ARI metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (ari_score, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+    character_count = sum(1 for char in text if char.isalnum())
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "character_count": 0},
+        )
+
+    # Calculate ratios
+    chars_per_word = character_count / len(tokens)
+    words_per_sentence = len(tokens) / len(sentences)
+
+    # Apply ARI formula
+    ari_score = (
+        _CHARACTER_COEFFICIENT * chars_per_word
+        + _WORD_COEFFICIENT * words_per_sentence
+        + _INTERCEPT
+    )
+
+    grade_level = max(0, min(20, math.floor(ari_score + 0.5)))
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(tokens),
+        "character_count": character_count,
+        "characters_per_word": chars_per_word,
+        "words_per_sentence": words_per_sentence,
+    }
+
+    return (ari_score, float(grade_level), metadata)
+
+
+def compute_ari(text: str, chunk_size: int = 1000) -> ARIResult:
     """
     Compute Automated Readability Index (ARI).
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Formula:
         ARI = 4.71 × (characters/words) + 0.5 × (words/sentences) - 21.43
 
@@ -54,18 +91,9 @@ def compute_ari(text: str) -> ARIResult:
     but adds sentence length as a factor. It produces an approximate
     representation of the US grade level needed to comprehend the text.
 
-
-
-
-    - Character count includes alphanumeric characters only (letters and digits)
-    - Reliability heuristic: 100+ words recommended
-
-    Grade Level to Age mapping:
-        1-5: 6-11 years (Elementary)
-        6-8: 11-14 years (Middle School)
-        9-12: 14-18 years (High School)
-        13-14: 18-22 years (College)
-        15+: 22+ years (Graduate)
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
 
     References:
         Senter, R. J., & Smith, E. A. (1967). Automated readability index.
@@ -73,74 +101,108 @@ def compute_ari(text: str) -> ARIResult:
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.
 
     Returns:
-        ARIResult with
+        ARIResult with:
+            - ari_score: Mean ARI score across chunks
+            - grade_level: Mean grade level across chunks
+            - age_range: Age range based on mean grade level
+            - ari_score_dist: Distribution with per-chunk values and stats
+            - grade_level_dist: Distribution with per-chunk values and stats
+            - chunk_size: The chunk size used
+            - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_ari("
-        >>>
-
-        >>>
-
-        >>> print(f"Age Range: {result.age_range}")
-        Age Range: 5-6 years (Kindergarten)
-        >>> result.metadata["reliable"]
-        False
+        >>> result = compute_ari("Long text here...", chunk_size=1000)
+        >>> result.ari_score  # Mean across chunks
+        9.5
+        >>> result.ari_score_dist.std  # Variance reveals fingerprint
+        1.5
     """
-
-
-
-    #
-
-
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    ari_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_chars = 0
+
+    for chunk in chunks:
+        ai, gl, meta = _compute_ari_single(chunk)
+        if not math.isnan(ai):
+            ari_values.append(ai)
+            grade_values.append(gl)
+        total_sentences += meta.get("sentence_count", 0)
+        total_words += meta.get("word_count", 0)
+        total_chars += meta.get("character_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not ari_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return ARIResult(
-            ari_score=
-            grade_level=
-            age_range="
+            ari_score=float("nan"),
+            grade_level=float("nan"),
+            age_range="Unknown",
+            ari_score_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
             metadata={
-
-                "
-                "
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "character_count": 0,
                 "characters_per_word": 0.0,
                 "words_per_sentence": 0.0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_character_count": 0,
                 "reliable": False,
             },
         )
 
-    #
-
-
-
-    # Apply ARI formula
-    ari_score = (
-        _CHARACTER_COEFFICIENT * chars_per_word
-        + _WORD_COEFFICIENT * words_per_sentence
-        + _INTERCEPT
-    )
-
-    # Use round-half-up rounding and clamp to valid grade range [0, 20]
-    # math.floor(x + 0.5) implements round-half-up for both positive and negative values
-    grade_level = max(0, min(20, math.floor(ari_score + 0.5)))
+    # Build distributions
+    ari_dist = make_distribution(ari_values)
+    grade_dist = make_distribution(grade_values)
 
-    # Get age range from grade level
-    age_range = _get_age_range(
+    # Get age range from mean grade level
+    age_range = _get_age_range(grade_dist.mean)
 
-    # Reliability heuristic
-    reliable =
+    # Reliability heuristic
+    reliable = total_words >= 100
 
     return ARIResult(
-        ari_score=
-        grade_level=
+        ari_score=ari_dist.mean,
+        grade_level=grade_dist.mean,
         age_range=age_range,
+        ari_score_dist=ari_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-
-            "
-            "
-            "
-            "
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "character_count": total_chars,
+            "characters_per_word": total_chars / total_words if total_words > 0 else 0,
+            "words_per_sentence": total_words / total_sentences if total_sentences > 0 else 0,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_character_count": total_chars,
            "reliable": reliable,
        },
    )
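Net effect of the ari.py change: the per-chunk formula is untouched, but compute_ari() now splits the input into chunks, aggregates per-chunk scores into Distribution objects, and reports means alongside spread. A minimal usage sketch, assuming only the names visible in the diff above (compute_ari, chunk_size, ari_score_dist, chunk_count, and the metadata keys); the file name and the numbers in the comments are illustrative, not actual output:

# Sketch of the new chunked API; "sample.txt" and the concrete numbers
# in the comments are illustrative assumptions, not actual library output.
from pystylometry.readability.ari import compute_ari

with open("sample.txt", encoding="utf-8") as handle:
    text = handle.read()

result = compute_ari(text, chunk_size=1000)

print(result.chunk_count)           # number of 1000-word chunks analyzed
print(result.ari_score)             # mean ARI across valid chunks
print(result.ari_score_dist.std)    # per-chunk spread: the stylometric signal
print(result.metadata["reliable"])  # True once total word count >= 100

# The per-chunk formula itself is unchanged:
#   ARI = 4.71 * (characters/word) + 0.5 * (words/sentence) - 21.43
# e.g. 5.0 characters/word at 20 words/sentence gives
#   4.71 * 5.0 + 0.5 * 20 - 21.43 = 12.12, roughly a 12th-grade text.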
pystylometry/readability/coleman_liau.py
CHANGED
@@ -1,31 +1,69 @@
-"""Coleman-Liau Index."""
+"""Coleman-Liau Index.
+
+This module implements the Coleman-Liau readability formula with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
 import math
 
-from .._types import ColemanLiauResult
+from .._types import ColemanLiauResult, Distribution, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 
 # Regression coefficients from Coleman & Liau (1975)
-# Derived from empirical analysis of Cloze test results on graded texts
-# Reference: Coleman, M., & Liau, T. L. (1975). A computer readability formula
-# designed for machine scoring. Journal of Applied Psychology, 60(2), 283.
-
-# Coefficient for letters per 100 words
-# Represents impact of word length on reading difficulty
 _LETTER_COEFFICIENT = 0.0588
-
-# Coefficient for sentences per 100 words (negative: more sentences = easier)
-# Represents impact of sentence length on reading difficulty
 _SENTENCE_COEFFICIENT = -0.296
-
-# Intercept to calibrate scale to U.S. grade levels (1-16)
 _INTERCEPT = -15.8
 
 
-def compute_coleman_liau(text: str) -> ColemanLiauResult:
+def _compute_coleman_liau_single(text: str) -> tuple[float, float, dict]:
+    """Compute Coleman-Liau metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (cli_index, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    all_tokens = tokenize(text)
+    tokens = [token for token in all_tokens if any(char.isalpha() for char in token)]
+    letter_count = sum(1 for token in tokens for char in token if char.isalpha())
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "letter_count": 0},
+        )
+
+    # Calculate per 100 words
+    L = (letter_count / len(tokens)) * 100  # noqa: N806
+    S = (len(sentences) / len(tokens)) * 100  # noqa: N806
+
+    # Compute Coleman-Liau Index
+    cli_index = _LETTER_COEFFICIENT * L + _SENTENCE_COEFFICIENT * S + _INTERCEPT
+    grade_level = max(0, math.floor(cli_index + 0.5))
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(tokens),
+        "letter_count": letter_count,
+        "letters_per_100_words": L,
+        "sentences_per_100_words": S,
+    }
+
+    return (cli_index, float(grade_level), metadata)
+
+
+def compute_coleman_liau(text: str, chunk_size: int = 1000) -> ColemanLiauResult:
     """
     Compute Coleman-Liau Index.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Formula:
         CLI = 0.0588 × L - 0.296 × S - 15.8
 
@@ -36,19 +74,9 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
     The Coleman-Liau index relies on characters rather than syllables,
     making it easier to compute and not requiring syllable-counting algorithms.
 
-
-
-
-      specify an upper bound. Post-graduate texts may exceed grade 20.
-    - Uses round-half-up rounding (not banker's rounding) for grade level calculation
-    - Letter counts (Unicode alphabetic characters only) computed from tokenized words
-      to ensure measurement consistency. Both letter count and word count use identical
-      tokenization logic, preventing divergence in edge cases (emails, URLs, hyphens).
-      See PR #2 review discussion: https://github.com/craigtrim/pystylometry/pull/2
-    - Reliability heuristic based on validation study passage lengths (~100 words);
-      shorter texts flagged in metadata
-    - English-centric sentence splitting and Unicode assumptions limit true
-      cross-language applicability
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
 
     References:
         Coleman, M., & Liau, T. L. (1975). A computer readability formula
@@ -56,105 +84,104 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.
 
     Returns:
-        ColemanLiauResult with
+        ColemanLiauResult with:
+            - cli_index: Mean CLI across chunks
+            - grade_level: Mean grade level across chunks
+            - cli_index_dist: Distribution with per-chunk values and stats
+            - grade_level_dist: Distribution with per-chunk values and stats
+            - chunk_size: The chunk size used
+            - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_coleman_liau("
-        >>>
-
-        >>>
-
-        >>> result.metadata["reliable"]
-        False
+        >>> result = compute_coleman_liau("Long text here...", chunk_size=1000)
+        >>> result.cli_index  # Mean across chunks
+        8.5
+        >>> result.cli_index_dist.std  # Variance reveals fingerprint
+        1.2
     """
-
-
-
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    #
-
-
-
-
-
-
-
-
-
-    letter_count = sum(1 for token in tokens for char in token if char.isalpha())
-
-    if len(sentences) == 0 or len(tokens) == 0:
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    cli_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_letters = 0
+
+    for chunk in chunks:
+        ci, gl, meta = _compute_coleman_liau_single(chunk)
+        if not math.isnan(ci):
+            cli_values.append(ci)
+            grade_values.append(gl)
+        total_sentences += meta.get("sentence_count", 0)
+        total_words += meta.get("word_count", 0)
+        total_letters += meta.get("letter_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not cli_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return ColemanLiauResult(
-            cli_index=
-            grade_level=
+            cli_index=float("nan"),
+            grade_level=float("nan"),
+            cli_index_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
             metadata={
-
-                "
-                "
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "letter_count": 0,
                 "letters_per_100_words": 0.0,
                 "sentences_per_100_words": 0.0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_letter_count": 0,
                 "reliable": False,
            },
        )
 
-    #
-
-
-
-    # Compute Coleman-Liau Index using empirically-derived coefficients
-    cli_index = _LETTER_COEFFICIENT * L + _SENTENCE_COEFFICIENT * S + _INTERCEPT
-
-    # Grade Level Calculation and Bounds
-    # ===================================
-    # Round-half-up rounding (not Python's default banker's rounding):
-    #   4.5 → 5 (always rounds up), not round-half-to-even
-    #   math.floor(x + 0.5) implements this for both positive and negative values
-    #
-    # Lower bound (0): Prevent negative grades for very simple texts
-    # Coleman & Liau (1975) calibrated to U.S. grades 1-16, but simpler texts
-    # (e.g., "Go. Run. Stop.") can produce negative CLI values. We clamp to 0
-    # as there is no "negative grade level" in the educational system.
-    #
-    # Upper bound (REMOVED per PR #2 review):
-    # Original implementation clamped at grade 20, but this was arbitrary.
-    # Coleman & Liau (1975) did not specify an upper bound in their paper.
-    # Clamping discards information: PhD dissertations (grade 25) and complex
-    # legal documents (grade 30+) would both report as grade 20, making them
-    # indistinguishable. The empirical formula should determine the full range.
-    #
-    # See PR #2 discussion: https://github.com/craigtrim/pystylometry/pull/2
-    grade_level = max(0, math.floor(cli_index + 0.5))
+    # Build distributions
+    cli_dist = make_distribution(cli_values)
+    grade_dist = make_distribution(grade_values)
 
-    # Reliability heuristic
-
-    reliable = len(tokens) >= 100
+    # Reliability heuristic
+    reliable = total_words >= 100
 
     return ColemanLiauResult(
-        cli_index=
-        grade_level=
+        cli_index=cli_dist.mean,
+        grade_level=grade_dist.mean,
+        cli_index_dist=cli_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-
-            "
-            "
-            "
-            "
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "letter_count": total_letters,
+            "letters_per_100_words": (total_letters / total_words * 100) if total_words > 0 else 0,
+            "sentences_per_100_words": (total_sentences / total_words * 100)
+            if total_words > 0
+            else 0,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_letter_count": total_letters,
            "reliable": reliable,
        },
    )