pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/readability/flesch.py
@@ -1,17 +1,95 @@
-"""Flesch Reading Ease and Flesch-Kincaid Grade Level."""
+"""Flesch Reading Ease and Flesch-Kincaid Grade Level.
 
-from .._types import FleschResult
+This module implements the Flesch readability formulas with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+from .._normalize import normalize_for_readability
+from .._types import Distribution, FleschResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 from .syllables import count_syllables
 
 
-def compute_flesch(text: str) -> FleschResult:
+def _compute_flesch_single(text: str) -> tuple[float, float, dict]:
+    """Compute Flesch metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (reading_ease, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+
+    # Filter tokens to only valid words for syllable counting
+    word_tokens = normalize_for_readability(tokens)
+
+    if len(sentences) == 0 or len(word_tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "syllable_count": 0},
+        )
+
+    # Count syllables
+    total_syllables = sum(count_syllables(word) for word in word_tokens)
+
+    # Calculate metrics
+    words_per_sentence = len(word_tokens) / len(sentences)
+    syllables_per_word = total_syllables / len(word_tokens)
+
+    # Flesch Reading Ease
+    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
+
+    # Flesch-Kincaid Grade Level
+    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(word_tokens),
+        "syllable_count": total_syllables,
+        "words_per_sentence": words_per_sentence,
+        "syllables_per_word": syllables_per_word,
+    }
+
+    return (reading_ease, grade_level, metadata)
+
+
+def _get_difficulty(reading_ease: float) -> str:
+    """Determine difficulty rating based on reading ease score."""
+    import math
+
+    if math.isnan(reading_ease):
+        return "Unknown"
+    if reading_ease >= 90:
+        return "Very Easy"
+    if reading_ease >= 80:
+        return "Easy"
+    if reading_ease >= 70:
+        return "Fairly Easy"
+    if reading_ease >= 60:
+        return "Standard"
+    if reading_ease >= 50:
+        return "Fairly Difficult"
+    if reading_ease >= 30:
+        return "Difficult"
+    return "Very Difficult"
+
+
+def compute_flesch(text: str, chunk_size: int = 1000) -> FleschResult:
     """
     Compute Flesch Reading Ease and Flesch-Kincaid Grade Level.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Flesch Reading Ease:
         Score = 206.835 - 1.015 × (words/sentences) - 84.6 × (syllables/words)
-        Higher scores = easier to read (0-100 scale)
+        Higher scores = easier to read
+        Typical range: 0-100, but can exceed bounds
 
     Flesch-Kincaid Grade Level:
         Grade = 0.39 × (words/sentences) + 11.8 × (syllables/words) - 15.59
@@ -25,6 +103,10 @@ def compute_flesch(text: str) -> FleschResult:
         30-49: Difficult (College)
         0-29: Very Difficult (College graduate)
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Flesch, R. (1948). A new readability yardstick.
         Journal of Applied Psychology, 32(3), 221.
@@ -34,48 +116,115 @@ def compute_flesch(text: str) -> FleschResult:
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk. Use a large value (e.g., 1_000_000) for
+            single-chunk "aggregate" mode.
 
     Returns:
-        FleschResult with reading ease, grade level, and difficulty rating
+        FleschResult with:
+        - reading_ease: Mean reading ease across chunks
+        - grade_level: Mean grade level across chunks
+        - difficulty: Difficulty rating based on mean reading_ease
+        - reading_ease_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_flesch("The quick brown fox jumps over the lazy dog.")
-        >>> print(f"Reading Ease: {result.reading_ease:.1f}")
-        >>> print(f"Grade Level: {result.grade_level:.1f}")
-        >>> print(f"Difficulty: {result.difficulty}")
+        >>> result = compute_flesch("Long text here...", chunk_size=1000)
+        >>> result.reading_ease  # Mean across chunks
+        68.54
+        >>> result.reading_ease_dist.std  # Variance reveals fingerprint
+        4.2
+        >>> result.reading_ease_dist.values  # Per-chunk values
+        [65.2, 71.1, 68.8, ...]
+        >>> result.chunk_count
+        59
+
+        >>> # Single-chunk mode (no chunking)
+        >>> result = compute_flesch("Short text.", chunk_size=1_000_000)
+        >>> result.chunk_count
+        1
     """
-    sentences = split_sentences(text)
-    tokens = tokenize(text)
-
-    if len(sentences) == 0 or len(tokens) == 0:
+    import math
+
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    reading_ease_values = []
+    grade_level_values = []
+    total_sentences = 0
+    total_words = 0
+    total_syllables = 0
+
+    for chunk in chunks:
+        re, gl, meta = _compute_flesch_single(chunk)
+        if not math.isnan(re):  # Only include valid results
+            reading_ease_values.append(re)
+            grade_level_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_syllables += meta.get("syllable_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not reading_ease_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return FleschResult(
-            reading_ease=0.0,
-            grade_level=0.0,
+            reading_ease=float("nan"),
+            grade_level=float("nan"),
             difficulty="Unknown",
-            metadata={"sentence_count": 0, "word_count": 0, "syllable_count": 0},
+            reading_ease_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "syllable_count": 0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_syllable_count": 0,
+            },
         )
 
-    # Count syllables
-    total_syllables = sum(count_syllables(word) for word in tokens)
-
-    # Calculate metrics
-    words_per_sentence = len(tokens) / len(sentences)
-    syllables_per_word = total_syllables / len(tokens)
+    # Build distributions
+    reading_ease_dist = make_distribution(reading_ease_values)
+    grade_level_dist = make_distribution(grade_level_values)
 
-    # TODO: Implement Flesch formulas
-    reading_ease = 0.0  # Placeholder
-    grade_level = 0.0  # Placeholder
-    difficulty = "Unknown"  # Placeholder
+    # Use mean for convenient access
+    mean_reading_ease = reading_ease_dist.mean
+    mean_grade_level = grade_level_dist.mean
+    difficulty = _get_difficulty(mean_reading_ease)
 
     return FleschResult(
-        reading_ease=reading_ease,
-        grade_level=grade_level,
+        reading_ease=mean_reading_ease,
+        grade_level=mean_grade_level,
         difficulty=difficulty,
+        reading_ease_dist=reading_ease_dist,
+        grade_level_dist=grade_level_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-            "sentence_count": len(sentences),
-            "word_count": len(tokens),
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
             "syllable_count": total_syllables,
-            "words_per_sentence": words_per_sentence,
-            "syllables_per_word": syllables_per_word,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_syllable_count": total_syllables,
+            "words_per_sentence": total_words / total_sentences if total_sentences > 0 else 0,
+            "syllables_per_word": total_syllables / total_words if total_words > 0 else 0,
        },
    )
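
The reworked compute_flesch keeps the Flesch formulas intact but now reports chunk-level statistics. A minimal usage sketch, inferred only from the signature and docstring in this diff (the input file path is illustrative, and the import path simply mirrors the module path in the file list above):

    from pystylometry.readability.flesch import compute_flesch

    text = open("sample.txt").read()         # illustrative input; split into 1000-word chunks by default
    result = compute_flesch(text, chunk_size=1000)

    print(result.reading_ease)               # mean Flesch Reading Ease across chunks
    print(result.grade_level)                # mean Flesch-Kincaid grade level
    print(result.difficulty)                 # label derived from the mean reading ease
    print(result.reading_ease_dist.std)      # per-chunk spread, the stylometric signal
    print(result.reading_ease_dist.values)   # raw per-chunk scores
    print(result.chunk_count)                # number of chunks analyzed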
pystylometry/readability/gunning_fog.py
@@ -1,63 +1,236 @@
-"""Gunning Fog Index."""
+"""Gunning Fog Index with NLP-enhanced complex word detection.
 
-from .._types import GunningFogResult
+This module computes the Gunning Fog Index, a readability metric that
+estimates the years of formal education needed to understand text on first reading.
+
+This implementation includes native chunked analysis for stylometric fingerprinting.
+
+Related GitHub Issues:
+    #4 - NLP-enhanced complex word detection
+    #27 - Native chunked analysis with Distribution dataclass
+
+Historical Background:
+----------------------
+The Gunning Fog Index was developed by Robert Gunning in 1952 as part of his
+work helping businesses improve the clarity of their writing. The formula produces
+a U.S. grade-level score (e.g., 12 = high school senior reading level).
+
+Reference:
+    Gunning, R. (1952). The Technique of Clear Writing.
+    McGraw-Hill, New York.
+"""
+
+import math
+
+from .._normalize import normalize_for_readability
+from .._types import Distribution, GunningFogResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
-from .syllables import count_syllables
+from .complex_words import process_text_for_complex_words
+
+# Formula coefficient from Gunning (1952)
+_FOG_COEFFICIENT = 0.4
+
 
+def _compute_gunning_fog_single(text: str, spacy_model: str) -> tuple[float, float, dict]:
+    """Compute Gunning Fog metrics for a single chunk of text.
 
-def compute_gunning_fog(text: str) -> GunningFogResult:
+    Returns:
+        Tuple of (fog_index, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
     """
-    Compute Gunning Fog Index.
+    sentences = split_sentences(text)
+    all_tokens = tokenize(text)
+    tokens = normalize_for_readability(all_tokens)
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {
+                "sentence_count": 0,
+                "word_count": 0,
+                "complex_word_count": 0,
+                "complex_word_percentage": 0.0,
+            },
+        )
+
+    # Count complex words using NLP-enhanced detection
+    complex_word_count, detection_metadata = process_text_for_complex_words(
+        text, tokens, model=spacy_model
+    )
 
-    Formula:
+    # Calculate formula components
+    average_words_per_sentence = len(tokens) / len(sentences)
+    complex_word_percentage = (complex_word_count / len(tokens)) * 100
+
+    # Apply Gunning Fog formula
+    fog_index = _FOG_COEFFICIENT * (average_words_per_sentence + complex_word_percentage)
+    grade_level = max(0, min(20, round(fog_index)))
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(tokens),
+        "complex_word_count": complex_word_count,
+        "complex_word_percentage": complex_word_percentage,
+        "average_words_per_sentence": average_words_per_sentence,
+        **detection_metadata,
+    }
+
+    return (fog_index, float(grade_level), metadata)
+
+
+def compute_gunning_fog(
+    text: str, chunk_size: int = 1000, spacy_model: str = "en_core_web_sm"
+) -> GunningFogResult:
+    """
+    Compute Gunning Fog Index with NLP-enhanced complex word detection.
+
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
+    Formula (Gunning, 1952):
+    ------------------------
         Fog Index = 0.4 × [(words/sentences) + 100 × (complex words/words)]
 
-    Where complex words are defined as words with 3+ syllables,
-    excluding proper nouns, compound words, and common suffixes.
+    Where complex words are words with 3+ syllables, EXCLUDING:
+        1. Proper nouns (names, places, organizations)
+        2. Compound words (hyphenated)
+        3. Common verb forms (-es, -ed, -ing endings)
 
-    The index estimates years of formal education needed to understand the text
-    on first reading.
+    Related GitHub Issues:
+        #4 - NLP-enhanced complex word detection
+        #27 - Native chunked analysis with Distribution dataclass
 
-    References:
-        Gunning, R. (1952). The Technique of Clear Writing.
-        McGraw-Hill.
+    Reference:
+        Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.
+        spacy_model: spaCy model name for enhanced mode (default: "en_core_web_sm")
 
     Returns:
-        GunningFogResult with fog index and grade level
+        GunningFogResult with:
+        - fog_index: Mean Fog Index across chunks
+        - grade_level: Mean grade level across chunks
+        - fog_index_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_gunning_fog("The quick brown fox jumps over the lazy dog.")
-        >>> print(f"Fog Index: {result.fog_index:.1f}")
-        >>> print(f"Grade Level: {result.grade_level}")
+        >>> result = compute_gunning_fog("Long text here...", chunk_size=1000)
+        >>> result.fog_index  # Mean across chunks
+        12.5
+        >>> result.fog_index_dist.std  # Variance reveals fingerprint
+        2.1
     """
-    sentences = split_sentences(text)
-    tokens = tokenize(text)
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
 
-    if len(sentences) == 0 or len(tokens) == 0:
+    # Compute metrics per chunk
+    fog_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_complex = 0
+    detection_metadata: dict = {}
+
+    for chunk in chunks:
+        fi, gl, meta = _compute_gunning_fog_single(chunk, spacy_model)
+        if not math.isnan(fi):
+            fog_values.append(fi)
+            grade_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_complex += meta.get("complex_word_count", 0)
+            # Capture detection metadata from first chunk (same for all chunks)
+            if not detection_metadata and "mode" in meta:
+                detection_metadata = {
+                    "mode": meta.get("mode"),
+                    "proper_noun_detection": meta.get("proper_noun_detection"),
+                    "inflection_handling": meta.get("inflection_handling"),
+                }
+                if "spacy_model" in meta:
+                    detection_metadata["spacy_model"] = meta.get("spacy_model")
+
+    # Handle empty or all-invalid chunks
+    if not fog_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return GunningFogResult(
-            fog_index=0.0,
-            grade_level=0,
-            metadata={"sentence_count": 0, "word_count": 0, "complex_word_count": 0},
+            fog_index=float("nan"),
+            grade_level=float("nan"),
+            fog_index_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "complex_word_count": 0,
+                "complex_word_percentage": 0.0,
+                "average_words_per_sentence": 0.0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_complex_word_count": 0,
+                "reliable": False,
+                # Detection metadata
+                "mode": "none",
+                "proper_noun_detection": "none",
+                "inflection_handling": "none",
+            },
        )
 
-    # Count complex words (3+ syllables)
-    # TODO: Exclude proper nouns, compound words, and -es/-ed/-ing endings
-    complex_word_count = sum(1 for word in tokens if count_syllables(word) >= 3)
+    # Build distributions
+    fog_dist = make_distribution(fog_values)
+    grade_dist = make_distribution(grade_values)
+
+    # Reliability heuristic
+    reliable = total_words >= 100 and total_sentences >= 3
 
-    # TODO: Implement Gunning Fog formula
-    fog_index = 0.0  # Placeholder
-    grade_level = 0  # Placeholder
+    # Ensure detection metadata has defaults
+    if not detection_metadata:
+        detection_metadata = {
+            "mode": "none",
+            "proper_noun_detection": "none",
+            "inflection_handling": "none",
+        }
 
     return GunningFogResult(
-        fog_index=fog_index,
-        grade_level=grade_level,
+        fog_index=fog_dist.mean,
+        grade_level=grade_dist.mean,
+        fog_index_dist=fog_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-            "sentence_count": len(sentences),
-            "word_count": len(tokens),
-            "complex_word_count": complex_word_count,
-            "complex_word_percentage": (complex_word_count / len(tokens) * 100) if tokens else 0,
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "complex_word_count": total_complex,
+            "complex_word_percentage": (total_complex / total_words * 100)
+            if total_words > 0
+            else 0,
+            "average_words_per_sentence": total_words / total_sentences
+            if total_sentences > 0
+            else 0,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_complex_word_count": total_complex,
+            "reliable": reliable,
+            # Detection metadata
+            **detection_metadata,
         },
     )
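
compute_gunning_fog follows the same chunked pattern but adds spaCy-backed complex-word detection. Again a sketch based only on this diff; the model name is the function's documented default, and installing that spaCy model is assumed to be handled elsewhere:

    from pystylometry.readability.gunning_fog import compute_gunning_fog

    text = open("sample.txt").read()                     # same illustrative input as above
    result = compute_gunning_fog(text, chunk_size=1000, spacy_model="en_core_web_sm")

    print(result.fog_index)                              # mean Fog Index across chunks
    print(result.fog_index_dist.values)                  # per-chunk Fog Index values
    print(result.metadata["complex_word_percentage"])    # share of 3+ syllable words after exclusions
    print(result.metadata["reliable"])                   # False below ~100 words or 3 sentences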