pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
|
@@ -5,9 +5,9 @@ in writing style. Character-level metrics are fundamental for authorship
|
|
|
5
5
|
attribution and can reveal distinctive patterns in punctuation usage,
|
|
6
6
|
word construction, and formatting preferences.
|
|
7
7
|
|
|
8
|
-
Related GitHub
|
|
8
|
+
Related GitHub Issues:
|
|
9
9
|
#12 - Character-Level Metrics
|
|
10
|
-
|
|
10
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
11
11
|
|
|
12
12
|
Features implemented:
|
|
13
13
|
- Average word length (characters per word)
|
|
@@ -26,114 +26,80 @@ References:
|
|
|
26
26
|
JASIST, 60(3), 538-556.
|
|
27
27
|
"""
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
29
|
+
import math
|
|
30
|
+
|
|
31
|
+
from .._types import CharacterMetricsResult, Distribution, chunk_text, make_distribution
|
|
32
|
+
|
|
33
|
+
# Character sets
# Punctuation marks recognized when computing punctuation density/variety.
# NOTE(review): the six quote entries were reconstructed from a mangled diff
# view; they are intended to be the straight apostrophe, the straight double
# quote, and the four typographic (curly) quotes — confirm against upstream.
_PUNCTUATION = {
    ".",
    ",",
    "!",
    "?",
    ";",
    ":",
    "-",
    "—",
    "–",  # Basic punctuation
    "'",
    '"',
    "“",
    "”",
    "‘",
    "’",  # Quotes
    "(",
    ")",
    "[",
    "]",
    "{",
    "}",  # Brackets
    "/",
    "\\",
    "|",  # Slashes
    "…",  # Ellipsis
    "*",
    "&",
    "@",
    "#",
    "$",
    "%",
    "^",
    "~",
    "`",  # Special symbols
}
# Lowercase ASCII vowels, used for the vowel-to-consonant ratio.
_VOWELS = {"a", "e", "i", "o", "u"}
# The 26 ASCII letters; alphabetic characters outside this set (e.g. accented
# letters) are excluded from letter frequency and vowel/consonant counts.
_STANDARD_LETTERS = set("abcdefghijklmnopqrstuvwxyz")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _compute_character_metrics_single(text: str) -> dict:
|
|
74
|
+
"""Compute character-level metrics for a single chunk of text.
|
|
75
|
+
|
|
76
|
+
Returns a dict with all computed values, or values containing nan for empty text.
|
|
33
77
|
"""
|
|
34
|
-
Compute character-level stylometric metrics.
|
|
35
|
-
|
|
36
|
-
This function analyzes text at the character level to extract features
|
|
37
|
-
related to word length, punctuation usage, letter distribution, and
|
|
38
|
-
other low-level patterns that can be distinctive for authorship
|
|
39
|
-
attribution and style analysis.
|
|
40
|
-
|
|
41
|
-
Related GitHub Issue:
|
|
42
|
-
#12 - Character-Level Metrics
|
|
43
|
-
https://github.com/craigtrim/pystylometry/issues/12
|
|
44
|
-
|
|
45
|
-
Character-level features are particularly valuable because:
|
|
46
|
-
1. They are language-independent (work across languages)
|
|
47
|
-
2. They capture subconscious writing patterns
|
|
48
|
-
3. They are resistant to topic variation
|
|
49
|
-
4. They complement higher-level metrics (words, syntax)
|
|
50
|
-
|
|
51
|
-
Metrics computed:
|
|
52
|
-
- Average word length: Mean characters per word
|
|
53
|
-
- Average sentence length (chars): Mean characters per sentence
|
|
54
|
-
- Punctuation density: Punctuation marks per 100 words
|
|
55
|
-
- Punctuation variety: Count of unique punctuation types used
|
|
56
|
-
- Letter frequency: Distribution of a-z (case-insensitive)
|
|
57
|
-
- Vowel-to-consonant ratio: Ratio of vowels to consonants
|
|
58
|
-
- Digit count/ratio: Numeric character usage
|
|
59
|
-
- Uppercase ratio: Uppercase letters / total letters
|
|
60
|
-
- Whitespace ratio: Whitespace characters / total characters
|
|
61
|
-
|
|
62
|
-
Args:
|
|
63
|
-
text: Input text to analyze. Should contain at least one sentence
|
|
64
|
-
for meaningful results. Empty text will return NaN for ratios
|
|
65
|
-
and 0 for counts.
|
|
66
|
-
|
|
67
|
-
Returns:
|
|
68
|
-
CharacterMetricsResult with all character-level features and metadata.
|
|
69
|
-
For empty text, all ratios will be NaN and counts will be 0.
|
|
70
|
-
|
|
71
|
-
Example:
|
|
72
|
-
>>> result = compute_character_metrics("The quick brown fox jumps!")
|
|
73
|
-
>>> print(f"Avg word length: {result.avg_word_length:.2f}")
|
|
74
|
-
Avg word length: 4.17
|
|
75
|
-
>>> print(f"Punctuation density: {result.punctuation_density:.2f}")
|
|
76
|
-
Punctuation density: 16.67
|
|
77
|
-
>>> print(f"Vowel/consonant ratio: {result.vowel_consonant_ratio:.2f}")
|
|
78
|
-
Vowel/consonant ratio: 0.71
|
|
79
|
-
|
|
80
|
-
>>> # Empty text handling
|
|
81
|
-
>>> result = compute_character_metrics("")
|
|
82
|
-
>>> import math
|
|
83
|
-
>>> math.isnan(result.avg_word_length)
|
|
84
|
-
True
|
|
85
|
-
>>> result.digit_count
|
|
86
|
-
0
|
|
87
|
-
|
|
88
|
-
Note:
|
|
89
|
-
- Punctuation marks include: . , ! ? ; : - ' " ( ) [ ] { } ... etc.
|
|
90
|
-
- Whitespace includes spaces, tabs, newlines
|
|
91
|
-
- Letter frequency is case-insensitive (lowercase normalized)
|
|
92
|
-
- Words are tokenized by whitespace for length calculation
|
|
93
|
-
- Sentences are split using standard sentence delimiters (. ! ?)
|
|
94
|
-
"""
|
|
95
|
-
# Define character sets
|
|
96
|
-
# GitHub Issue #12: https://github.com/craigtrim/pystylometry/issues/12
|
|
97
|
-
PUNCTUATION = {
|
|
98
|
-
".", ",", "!", "?", ";", ":", "-", "—", "–", # Basic punctuation
|
|
99
|
-
"'", '"', """, """, "'", "'", # Quotes
|
|
100
|
-
"(", ")", "[", "]", "{", "}", # Brackets
|
|
101
|
-
"/", "\\", "|", # Slashes
|
|
102
|
-
"…", # Ellipsis
|
|
103
|
-
"*", "&", "@", "#", "$", "%", "^", "~", "`", # Special symbols
|
|
104
|
-
}
|
|
105
|
-
VOWELS = {"a", "e", "i", "o", "u"}
|
|
106
|
-
|
|
107
|
-
# Handle empty text
|
|
108
78
|
if not text:
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
"uppercase_count": 0,
|
|
134
|
-
"lowercase_count": 0,
|
|
135
|
-
},
|
|
136
|
-
)
|
|
79
|
+
return {
|
|
80
|
+
"avg_word_length": float("nan"),
|
|
81
|
+
"avg_sentence_length_chars": float("nan"),
|
|
82
|
+
"punctuation_density": float("nan"),
|
|
83
|
+
"punctuation_variety": 0,
|
|
84
|
+
"vowel_consonant_ratio": float("nan"),
|
|
85
|
+
"digit_count": 0,
|
|
86
|
+
"digit_ratio": float("nan"),
|
|
87
|
+
"uppercase_ratio": float("nan"),
|
|
88
|
+
"whitespace_ratio": float("nan"),
|
|
89
|
+
"letter_frequency": {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"},
|
|
90
|
+
"total_characters": 0,
|
|
91
|
+
"total_letters": 0,
|
|
92
|
+
"total_words": 0,
|
|
93
|
+
"total_sentences": 0,
|
|
94
|
+
"total_punctuation": 0,
|
|
95
|
+
"total_whitespace": 0,
|
|
96
|
+
"total_digits": 0,
|
|
97
|
+
"punctuation_types": [],
|
|
98
|
+
"vowel_count": 0,
|
|
99
|
+
"consonant_count": 0,
|
|
100
|
+
"uppercase_count": 0,
|
|
101
|
+
"lowercase_count": 0,
|
|
102
|
+
}
|
|
137
103
|
|
|
138
104
|
# Initialize counters
|
|
139
105
|
total_chars = len(text)
|
|
@@ -147,19 +113,18 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
|
|
|
147
113
|
punctuation_count = 0
|
|
148
114
|
punctuation_types = set()
|
|
149
115
|
|
|
150
|
-
# Single pass through text
|
|
116
|
+
# Single pass through text
|
|
151
117
|
for char in text:
|
|
152
118
|
if char.isalpha():
|
|
153
|
-
|
|
154
|
-
|
|
119
|
+
lower_char = char.lower()
|
|
120
|
+
if lower_char in _STANDARD_LETTERS:
|
|
121
|
+
letter_counts[lower_char] += 1
|
|
155
122
|
|
|
156
|
-
|
|
157
|
-
if char.lower() in VOWELS:
|
|
123
|
+
if lower_char in _VOWELS:
|
|
158
124
|
vowel_count += 1
|
|
159
|
-
|
|
125
|
+
elif lower_char in _STANDARD_LETTERS:
|
|
160
126
|
consonant_count += 1
|
|
161
127
|
|
|
162
|
-
# Count uppercase and lowercase
|
|
163
128
|
if char.isupper():
|
|
164
129
|
uppercase_count += 1
|
|
165
130
|
else:
|
|
@@ -167,44 +132,35 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
|
|
|
167
132
|
|
|
168
133
|
elif char.isdigit():
|
|
169
134
|
digit_count += 1
|
|
170
|
-
|
|
171
135
|
elif char.isspace():
|
|
172
136
|
whitespace_count += 1
|
|
173
|
-
|
|
174
|
-
elif char in PUNCTUATION:
|
|
137
|
+
elif char in _PUNCTUATION:
|
|
175
138
|
punctuation_count += 1
|
|
176
139
|
punctuation_types.add(char)
|
|
177
140
|
|
|
178
141
|
total_letters = vowel_count + consonant_count
|
|
179
142
|
|
|
180
|
-
#
|
|
143
|
+
# Letter frequency distribution
|
|
181
144
|
if total_letters > 0:
|
|
182
|
-
letter_frequency = {
|
|
145
|
+
letter_frequency = {
|
|
146
|
+
letter: count / total_letters for letter, count in letter_counts.items()
|
|
147
|
+
}
|
|
183
148
|
else:
|
|
184
149
|
letter_frequency = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
|
|
185
150
|
|
|
186
|
-
#
|
|
151
|
+
# Word metrics
|
|
187
152
|
words = text.split()
|
|
188
153
|
total_words = len(words)
|
|
189
154
|
|
|
190
|
-
# Calculate average word length (count only letters and digits in words)
|
|
191
155
|
if total_words > 0:
|
|
192
|
-
word_lengths = [
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
if word_length > 0: # Only count words with at least one alphanumeric char
|
|
197
|
-
word_lengths.append(word_length)
|
|
198
|
-
|
|
199
|
-
if word_lengths:
|
|
200
|
-
avg_word_length = sum(word_lengths) / len(word_lengths)
|
|
201
|
-
else:
|
|
202
|
-
avg_word_length = float("nan")
|
|
156
|
+
word_lengths = [
|
|
157
|
+
sum(1 for c in w if c.isalnum()) for w in words if any(c.isalnum() for c in w)
|
|
158
|
+
]
|
|
159
|
+
avg_word_length = sum(word_lengths) / len(word_lengths) if word_lengths else float("nan")
|
|
203
160
|
else:
|
|
204
161
|
avg_word_length = float("nan")
|
|
205
162
|
|
|
206
|
-
#
|
|
207
|
-
# Simple approach: split on sentence delimiters
|
|
163
|
+
# Sentence metrics
|
|
208
164
|
sentence_delimiters = {".", "!", "?"}
|
|
209
165
|
sentences = []
|
|
210
166
|
current_sentence = []
|
|
@@ -212,13 +168,11 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
|
|
|
212
168
|
for char in text:
|
|
213
169
|
current_sentence.append(char)
|
|
214
170
|
if char in sentence_delimiters:
|
|
215
|
-
# End of sentence
|
|
216
171
|
sentence_text = "".join(current_sentence).strip()
|
|
217
|
-
if sentence_text:
|
|
172
|
+
if sentence_text:
|
|
218
173
|
sentences.append(sentence_text)
|
|
219
174
|
current_sentence = []
|
|
220
175
|
|
|
221
|
-
# Add any remaining text as a sentence if it's non-empty and doesn't end with delimiter
|
|
222
176
|
if current_sentence:
|
|
223
177
|
sentence_text = "".join(current_sentence).strip()
|
|
224
178
|
if sentence_text:
|
|
@@ -226,52 +180,40 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
|
|
|
226
180
|
|
|
227
181
|
total_sentences = len(sentences)
|
|
228
182
|
|
|
229
|
-
# Calculate average sentence length in characters
|
|
230
183
|
if total_sentences > 0:
|
|
231
184
|
sentence_lengths = [len(sent) for sent in sentences]
|
|
232
185
|
avg_sentence_length_chars = sum(sentence_lengths) / total_sentences
|
|
233
186
|
else:
|
|
234
187
|
avg_sentence_length_chars = float("nan")
|
|
235
188
|
|
|
236
|
-
#
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
punctuation_density = float("nan")
|
|
241
|
-
|
|
242
|
-
# Punctuation variety (count of unique punctuation types)
|
|
189
|
+
# Ratios
|
|
190
|
+
punctuation_density = (
|
|
191
|
+
(punctuation_count / total_words * 100) if total_words > 0 else float("nan")
|
|
192
|
+
)
|
|
243
193
|
punctuation_variety = len(punctuation_types)
|
|
244
194
|
|
|
245
|
-
# Calculate vowel-to-consonant ratio
|
|
246
195
|
if consonant_count > 0:
|
|
247
196
|
vowel_consonant_ratio = vowel_count / consonant_count
|
|
248
197
|
elif vowel_count > 0:
|
|
249
|
-
# Vowels but no consonants - ratio is infinity
|
|
250
198
|
vowel_consonant_ratio = float("inf")
|
|
251
199
|
else:
|
|
252
|
-
# No letters at all
|
|
253
200
|
vowel_consonant_ratio = float("nan")
|
|
254
201
|
|
|
255
|
-
|
|
256
|
-
if
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
else:
|
|
271
|
-
whitespace_ratio = float("nan")
|
|
272
|
-
|
|
273
|
-
# Build metadata
|
|
274
|
-
metadata = {
|
|
202
|
+
digit_ratio = digit_count / total_chars if total_chars > 0 else float("nan")
|
|
203
|
+
uppercase_ratio = uppercase_count / total_letters if total_letters > 0 else float("nan")
|
|
204
|
+
whitespace_ratio = whitespace_count / total_chars if total_chars > 0 else float("nan")
|
|
205
|
+
|
|
206
|
+
return {
|
|
207
|
+
"avg_word_length": avg_word_length,
|
|
208
|
+
"avg_sentence_length_chars": avg_sentence_length_chars,
|
|
209
|
+
"punctuation_density": punctuation_density,
|
|
210
|
+
"punctuation_variety": punctuation_variety,
|
|
211
|
+
"vowel_consonant_ratio": vowel_consonant_ratio,
|
|
212
|
+
"digit_count": digit_count,
|
|
213
|
+
"digit_ratio": digit_ratio,
|
|
214
|
+
"uppercase_ratio": uppercase_ratio,
|
|
215
|
+
"whitespace_ratio": whitespace_ratio,
|
|
216
|
+
"letter_frequency": letter_frequency,
|
|
275
217
|
"total_characters": total_chars,
|
|
276
218
|
"total_letters": total_letters,
|
|
277
219
|
"total_words": total_words,
|
|
@@ -286,16 +228,162 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
|
|
|
286
228
|
"lowercase_count": lowercase_count,
|
|
287
229
|
}
|
|
288
230
|
|
|
231
|
+
|
|
232
|
+
def compute_character_metrics(text: str, chunk_size: int = 1000) -> CharacterMetricsResult:
    """Compute character-level stylometric metrics using chunked analysis.

    The text is split into chunks of ``chunk_size`` words, each chunk is
    analyzed independently, and the per-chunk values are folded into
    Distribution objects. The spread across chunks — not just the mean —
    is part of an author's stylometric fingerprint.

    Related GitHub Issues:
        #12 - Character-Level Metrics
        #27 - Native chunked analysis with Distribution dataclass

    Character-level features are particularly valuable because:
        1. They are language-independent (work across languages)
        2. They capture subconscious writing patterns
        3. They are resistant to topic variation
        4. They complement higher-level metrics (words, syntax)

    Args:
        text: Input text to analyze.
        chunk_size: Number of words per chunk (default: 1000).

    Returns:
        CharacterMetricsResult whose scalar fields are means across chunks,
        with a matching ``*_dist`` Distribution for each metric, plus
        aggregate whole-text totals in ``metadata``.

    Example:
        >>> result = compute_character_metrics("Long text...", chunk_size=1000)
        >>> result.avg_word_length  # Mean across chunks
        4.5
        >>> result.avg_word_length_dist.std  # Variance reveals fingerprint
        0.3
    """
    chunks = chunk_text(text, chunk_size)
    per_chunk = [_compute_character_metrics_single(chunk) for chunk in chunks]

    def _finite(key: str) -> list[float]:
        # Per-chunk values with NaN placeholders (from empty chunks) dropped.
        return [r[key] for r in per_chunk if not math.isnan(r[key])]

    def _total(key: str) -> int:
        # Whole-text total of an additive per-chunk counter.
        return sum(r[key] for r in per_chunk)

    # Values feeding the per-metric distributions.
    avg_word_length_vals = _finite("avg_word_length")
    avg_sentence_vals = _finite("avg_sentence_length_chars")
    punct_density_vals = _finite("punctuation_density")
    punct_variety_vals = [float(r["punctuation_variety"]) for r in per_chunk]
    # The vowel/consonant ratio can be inf (vowels, no consonants); exclude
    # both inf and NaN so the distribution statistics stay finite.
    vc_ratio_vals = [
        v
        for v in (r["vowel_consonant_ratio"] for r in per_chunk)
        if not (math.isnan(v) or math.isinf(v))
    ]
    digit_ratio_vals = _finite("digit_ratio")
    uppercase_ratio_vals = _finite("uppercase_ratio")
    whitespace_ratio_vals = _finite("whitespace_ratio")

    # Whole-text totals.
    total_digits = _total("digit_count")
    total_characters = _total("total_characters")
    total_letters = _total("total_letters")
    total_words = _total("total_words")
    total_sentences = _total("total_sentences")
    total_punctuation = _total("total_punctuation")
    total_whitespace = _total("total_whitespace")
    total_vowel_count = _total("vowel_count")
    total_consonant_count = _total("consonant_count")
    total_uppercase_count = _total("uppercase_count")
    total_lowercase_count = _total("lowercase_count")

    all_punctuation_types: set = set()
    for r in per_chunk:
        all_punctuation_types.update(r["punctuation_types"])

    # Recover absolute letter counts from each chunk's relative frequencies,
    # then renormalize over the whole text.
    total_letter_counts = dict.fromkeys("abcdefghijklmnopqrstuvwxyz", 0.0)
    for r in per_chunk:
        if r["total_letters"] > 0:
            for letter, freq in r["letter_frequency"].items():
                total_letter_counts[letter] += freq * r["total_letters"]

    if total_letters > 0:
        letter_frequency = {
            letter: count / total_letters for letter, count in total_letter_counts.items()
        }
    else:
        letter_frequency = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}

    def _as_dist(values: list[float]) -> Distribution:
        # make_distribution needs a non-empty sample; fall back to an
        # all-NaN/zero Distribution so callers always get the same shape.
        if values:
            return make_distribution(values)
        return Distribution(
            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
        )

    wl_dist = _as_dist(avg_word_length_vals)
    sl_dist = _as_dist(avg_sentence_vals)
    pd_dist = _as_dist(punct_density_vals)
    pv_dist = _as_dist(punct_variety_vals)
    vc_dist = _as_dist(vc_ratio_vals)
    dr_dist = _as_dist(digit_ratio_vals)
    uc_dist = _as_dist(uppercase_ratio_vals)
    ws_dist = _as_dist(whitespace_ratio_vals)

    return CharacterMetricsResult(
        avg_word_length=wl_dist.mean,
        avg_sentence_length_chars=sl_dist.mean,
        punctuation_density=pd_dist.mean,
        punctuation_variety=pv_dist.mean,
        letter_frequency=letter_frequency,
        vowel_consonant_ratio=vc_dist.mean,
        digit_count=total_digits,
        digit_ratio=dr_dist.mean,
        uppercase_ratio=uc_dist.mean,
        whitespace_ratio=ws_dist.mean,
        avg_word_length_dist=wl_dist,
        avg_sentence_length_chars_dist=sl_dist,
        punctuation_density_dist=pd_dist,
        punctuation_variety_dist=pv_dist,
        vowel_consonant_ratio_dist=vc_dist,
        digit_ratio_dist=dr_dist,
        uppercase_ratio_dist=uc_dist,
        whitespace_ratio_dist=ws_dist,
        chunk_size=chunk_size,
        chunk_count=len(chunks),
        metadata={
            "total_characters": total_characters,
            "total_letters": total_letters,
            "total_words": total_words,
            "total_sentences": total_sentences,
            "total_punctuation": total_punctuation,
            "total_whitespace": total_whitespace,
            "total_digits": total_digits,
            "punctuation_types": sorted(all_punctuation_types),
            "vowel_count": total_vowel_count,
            "consonant_count": total_consonant_count,
            "uppercase_count": total_uppercase_count,
            "lowercase_count": total_lowercase_count,
        },
    )
|