pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/readability/ari.py
CHANGED
@@ -1,25 +1,99 @@
-"""Automated Readability Index (ARI).
+"""Automated Readability Index (ARI).
 
-
+This module implements the ARI readability formula with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
+
+from .._types import ARIResult, Distribution, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 
+# Formula coefficients from Senter & Smith (1967)
+_CHARACTER_COEFFICIENT = 4.71
+_WORD_COEFFICIENT = 0.5
+_INTERCEPT = -21.43
+
+
+def _get_age_range(grade_level: float) -> str:
+    """Map grade level to age range."""
+    if grade_level <= 0:
+        return "5-6 years (Kindergarten)"
+    elif grade_level <= 5:
+        return "6-11 years (Elementary)"
+    elif grade_level <= 8:
+        return "11-14 years (Middle School)"
+    elif grade_level <= 12:
+        return "14-18 years (High School)"
+    elif grade_level <= 14:
+        return "18-22 years (College)"
+    else:
+        return "22+ years (Graduate)"
+
+
+def _compute_ari_single(text: str) -> tuple[float, float, dict]:
+    """Compute ARI metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (ari_score, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+    character_count = sum(1 for char in text if char.isalnum())
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "character_count": 0},
+        )
+
+    # Calculate ratios
+    chars_per_word = character_count / len(tokens)
+    words_per_sentence = len(tokens) / len(sentences)
+
+    # Apply ARI formula
+    ari_score = (
+        _CHARACTER_COEFFICIENT * chars_per_word
+        + _WORD_COEFFICIENT * words_per_sentence
+        + _INTERCEPT
+    )
+
+    grade_level = max(0, min(20, math.floor(ari_score + 0.5)))
 
-
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(tokens),
+        "character_count": character_count,
+        "characters_per_word": chars_per_word,
+        "words_per_sentence": words_per_sentence,
+    }
+
+    return (ari_score, float(grade_level), metadata)
+
+
+def compute_ari(text: str, chunk_size: int = 1000) -> ARIResult:
     """
     Compute Automated Readability Index (ARI).
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Formula:
         ARI = 4.71 × (characters/words) + 0.5 × (words/sentences) - 21.43
 
-    The ARI
-
+    The ARI uses character counts and word counts (similar to Coleman-Liau)
+    but adds sentence length as a factor. It produces an approximate
+    representation of the US grade level needed to comprehend the text.
 
-
-
-
-        9-12: 14-18 years
-        13-14: 18-22 years
-        14+: 22+ years (college level)
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
 
     References:
         Senter, R. J., & Smith, E. A. (1967). Automated readability index.
@@ -27,44 +101,108 @@ def compute_ari(text: str) -> ARIResult:
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.
 
     Returns:
-        ARIResult with
+        ARIResult with:
+        - ari_score: Mean ARI score across chunks
+        - grade_level: Mean grade level across chunks
+        - age_range: Age range based on mean grade level
+        - ari_score_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_ari("
-        >>>
-
-        >>>
+        >>> result = compute_ari("Long text here...", chunk_size=1000)
+        >>> result.ari_score  # Mean across chunks
+        9.5
+        >>> result.ari_score_dist.std  # Variance reveals fingerprint
+        1.5
     """
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
 
-
+    # Compute metrics per chunk
+    ari_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_chars = 0
+
+    for chunk in chunks:
+        ai, gl, meta = _compute_ari_single(chunk)
+        if not math.isnan(ai):
+            ari_values.append(ai)
+            grade_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_chars += meta.get("character_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not ari_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return ARIResult(
-            ari_score=
-            grade_level=
+            ari_score=float("nan"),
+            grade_level=float("nan"),
             age_range="Unknown",
-
+            ari_score_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "character_count": 0,
+                "characters_per_word": 0.0,
+                "words_per_sentence": 0.0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_character_count": 0,
+                "reliable": False,
+            },
        )
 
-    #
-
+    # Build distributions
+    ari_dist = make_distribution(ari_values)
+    grade_dist = make_distribution(grade_values)
+
+    # Get age range from mean grade level
+    age_range = _get_age_range(grade_dist.mean)
 
-    #
-
-    grade_level = 0  # Placeholder
-    age_range = "Unknown"  # Placeholder
+    # Reliability heuristic
+    reliable = total_words >= 100
 
     return ARIResult(
-        ari_score=
-        grade_level=
+        ari_score=ari_dist.mean,
+        grade_level=grade_dist.mean,
         age_range=age_range,
+        ari_score_dist=ari_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
        metadata={
-
-            "
-            "
-            "
-            "
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "character_count": total_chars,
+            "characters_per_word": total_chars / total_words if total_words > 0 else 0,
+            "words_per_sentence": total_words / total_sentences if total_sentences > 0 else 0,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_character_count": total_chars,
+            "reliable": reliable,
        },
    )
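Taken together, the new compute_ari above supports usage along the following lines. This is a minimal sketch, assuming the module path shown in the file list; the sample text, chunk size, and expected values are illustrative assumptions, not output captured from the package.

# Minimal usage sketch (assumed): exercises the chunked ARI API from the diff.
from pystylometry.readability.ari import compute_ari

text = "The quick brown fox jumps over the lazy dog. " * 300
result = compute_ari(text, chunk_size=500)

# Scalar fields are means across chunks; Distribution fields carry per-chunk stats.
print(result.ari_score, result.grade_level, result.age_range)
print(result.ari_score_dist.std, result.chunk_count)
print(result.metadata["reliable"])  # True once total word count >= 100

# Hand-check of the formula on assumed per-chunk ratios (not measured values):
# ARI = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
chars_per_word, words_per_sentence = 4.8, 15.0
print(4.71 * chars_per_word + 0.5 * words_per_sentence - 21.43)  # ≈ 8.678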
pystylometry/readability/coleman_liau.py
CHANGED
@@ -1,13 +1,69 @@
-"""Coleman-Liau Index.
+"""Coleman-Liau Index.
 
-
+This module implements the Coleman-Liau readability formula with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
+
+from .._types import ColemanLiauResult, Distribution, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 
+# Regression coefficients from Coleman & Liau (1975)
+_LETTER_COEFFICIENT = 0.0588
+_SENTENCE_COEFFICIENT = -0.296
+_INTERCEPT = -15.8
+
 
-def
+def _compute_coleman_liau_single(text: str) -> tuple[float, float, dict]:
+    """Compute Coleman-Liau metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (cli_index, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    all_tokens = tokenize(text)
+    tokens = [token for token in all_tokens if any(char.isalpha() for char in token)]
+    letter_count = sum(1 for token in tokens for char in token if char.isalpha())
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "letter_count": 0},
+        )
+
+    # Calculate per 100 words
+    L = (letter_count / len(tokens)) * 100  # noqa: N806
+    S = (len(sentences) / len(tokens)) * 100  # noqa: N806
+
+    # Compute Coleman-Liau Index
+    cli_index = _LETTER_COEFFICIENT * L + _SENTENCE_COEFFICIENT * S + _INTERCEPT
+    grade_level = max(0, math.floor(cli_index + 0.5))
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(tokens),
+        "letter_count": letter_count,
+        "letters_per_100_words": L,
+        "sentences_per_100_words": S,
+    }
+
+    return (cli_index, float(grade_level), metadata)
+
+
+def compute_coleman_liau(text: str, chunk_size: int = 1000) -> ColemanLiauResult:
     """
     Compute Coleman-Liau Index.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Formula:
         CLI = 0.0588 × L - 0.296 × S - 15.8
 
@@ -16,7 +72,11 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
         S = average number of sentences per 100 words
 
     The Coleman-Liau index relies on characters rather than syllables,
-    making it easier to compute and
+    making it easier to compute and not requiring syllable-counting algorithms.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
 
     References:
         Coleman, M., & Liau, T. L. (1975). A computer readability formula
@@ -24,44 +84,104 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.
 
     Returns:
-        ColemanLiauResult with
+        ColemanLiauResult with:
+        - cli_index: Mean CLI across chunks
+        - grade_level: Mean grade level across chunks
+        - cli_index_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_coleman_liau("
-        >>>
-
+        >>> result = compute_coleman_liau("Long text here...", chunk_size=1000)
+        >>> result.cli_index  # Mean across chunks
+        8.5
+        >>> result.cli_index_dist.std  # Variance reveals fingerprint
+        1.2
     """
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
 
-
+    # Compute metrics per chunk
+    cli_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_letters = 0
+
+    for chunk in chunks:
+        ci, gl, meta = _compute_coleman_liau_single(chunk)
+        if not math.isnan(ci):
+            cli_values.append(ci)
+            grade_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_letters += meta.get("letter_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not cli_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return ColemanLiauResult(
-            cli_index=
-            grade_level=
-
+            cli_index=float("nan"),
+            grade_level=float("nan"),
+            cli_index_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "letter_count": 0,
+                "letters_per_100_words": 0.0,
+                "sentences_per_100_words": 0.0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_letter_count": 0,
+                "reliable": False,
+            },
        )
 
-    #
-
-
-    # Calculate per 100 words
-    L = (letter_count / len(tokens)) * 100  # noqa: N806
-    S = (len(sentences) / len(tokens)) * 100  # noqa: N806
+    # Build distributions
+    cli_dist = make_distribution(cli_values)
+    grade_dist = make_distribution(grade_values)
 
-    #
-
-    grade_level = 0  # Placeholder
+    # Reliability heuristic
+    reliable = total_words >= 100
 
     return ColemanLiauResult(
-        cli_index=
-        grade_level=
+        cli_index=cli_dist.mean,
+        grade_level=grade_dist.mean,
+        cli_index_dist=cli_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-
-            "
-            "
-            "
-            "
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "letter_count": total_letters,
+            "letters_per_100_words": (total_letters / total_words * 100) if total_words > 0 else 0,
+            "sentences_per_100_words": (total_sentences / total_words * 100)
+            if total_words > 0
+            else 0,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_letter_count": total_letters,
+            "reliable": reliable,
        },
    )
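compute_coleman_liau follows the same chunked pattern. A minimal sketch under the same assumptions (module path per the file list; the per-100-word rates below are chosen only to exercise the arithmetic, not measured):

# Minimal usage sketch (assumed): exercises the chunked Coleman-Liau API.
from pystylometry.readability.coleman_liau import compute_coleman_liau

text = "Readability formulas estimate grade level from surface features. " * 200
result = compute_coleman_liau(text, chunk_size=500)
print(result.cli_index, result.grade_level)
print(result.cli_index_dist.mean, result.cli_index_dist.std)

# Hand-check of the formula:
# CLI = 0.0588 * L - 0.296 * S - 15.8
L, S = 520.0, 9.0  # assumed letters / sentences per 100 words
print(0.0588 * L - 0.296 * S - 15.8)  # ≈ 12.112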