pystylometry-1.0.0-py3-none-any.whl → pystylometry-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +29 -3
- pystylometry/_types.py +963 -259
- pystylometry/authorship/__init__.py +23 -2
- pystylometry/authorship/additional_methods.py +4 -29
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/dialect/detector.py
@@ -0,0 +1,533 @@
+"""Dialect detection using extensible JSON markers.
+
+This module implements dialect detection for stylometric analysis, identifying
+regional linguistic preferences (British vs. American English) and measuring
+text markedness. The analysis uses native chunked analysis per Issue #27,
+computing metrics per chunk and providing distributions for fingerprinting.
+
+Related GitHub Issues:
+    #35 - Dialect detection with extensible JSON markers
+        https://github.com/craigtrim/pystylometry/issues/35
+    #30 - Whonix stylometry features (regional linguistic preferences)
+        https://github.com/craigtrim/pystylometry/issues/30
+    #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+Theoretical Background:
+    Dialectometry (Goebl, 1982; Nerbonne, 2009) provides the quantitative
+    framework for measuring dialect similarity. Rather than selecting individual
+    "characteristic" features, modern dialectometry quantifies holistically
+    across all available markers.
+
+    Markedness theory (Battistella, 1990) informs the markedness_score: marked
+    forms stand out against "standard" written English. High markedness suggests
+    intentional stylistic choice or strong dialect identity.
+
+    Eye dialect (spellings like "gonna" that look nonstandard but reflect
+    standard pronunciation) indicates informal register, not regional dialect
+    (Encyclopedia.com, "Slang, Dialect, and Marked Language").
+
+Detection Strategy:
+    1. Tokenize text and identify words
+    2. Match vocabulary (lexical level): flat/apartment, lorry/truck
+    3. Match spelling patterns (phonological/morphological): colour/color, -ise/-ize
+    4. Match grammar patterns (syntactic): have got/have, collective noun agreement
+    5. Count eye dialect markers separately (register, not dialect)
+    6. Apply feature weights from linguistic research
+    7. Compute scores and classify dialect
+
+Chunking:
+    Following Issue #27, the text is split into chunks (default 1000 words).
+    Each chunk is analyzed independently, then results are aggregated into
+    Distribution objects. This captures variance across the text, which can
+    reveal mixed authorship (e.g., human + AI-generated content).
+
+References:
+    Battistella, Edwin L. "Markedness: The Evaluative Superstructure of
+        Language." State University of New York Press, 1990.
+    Goebl, Hans. "Dialektometrie: Prinzipien und Methoden des Einsatzes der
+        numerischen Taxonomie im Bereich der Dialektgeographie." Verlag der
+        Österreichischen Akademie der Wissenschaften, 1982.
+    Labov, William. "The Social Stratification of English in New York City."
+        Cambridge University Press, 2006.
+    Nerbonne, John. "Data-Driven Dialectology." Language and Linguistics
+        Compass, vol. 3, no. 1, 2009, pp. 175-198.
+    Whonix Project. "Stylometry: Deanonymization Techniques." Whonix Wiki,
+        https://www.whonix.org/wiki/Stylometry
+"""
+
+from __future__ import annotations
+
+import re
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any
+
+from .._types import DialectResult, Distribution, chunk_text, make_distribution
+from ._loader import get_markers
+
+# Simple word tokenizer pattern
+_WORD_PATTERN = re.compile(r"\b[a-zA-Z]+(?:'[a-zA-Z]+)?\b")
+
+
+@dataclass
+class _ChunkAnalysis:
+    """Internal result from analyzing a single chunk.
+
+    This dataclass holds per-chunk metrics that will be aggregated into
+    distributions for the final DialectResult.
+
+    Attributes:
+        british_count: Weighted count of British markers
+        american_count: Weighted count of American markers
+        total_markers: Total unweighted marker count
+        word_count: Total words in chunk
+        eye_dialect_count: Eye dialect markers found
+        markers_by_level: Markers categorized by linguistic level
+        spelling_markers: Individual spelling markers found
+        vocabulary_markers: Individual vocabulary markers found
+        grammar_markers: Individual grammar markers found
+    """
+
+    british_count: float
+    american_count: float
+    total_markers: int
+    word_count: int
+    eye_dialect_count: int
+    markers_by_level: dict[str, dict[str, int]]
+    spelling_markers: dict[str, int]
+    vocabulary_markers: dict[str, int]
+    grammar_markers: dict[str, int]
+
+
+def _tokenize_words(text: str) -> list[str]:
+    """Extract words from text for analysis.
+
+    Uses a simple regex pattern that captures contractions (don't, I'm)
+    as single tokens. All words are lowercased for matching.
+
+    Args:
+        text: Input text
+
+    Returns:
+        List of lowercase words
+    """
+    return [match.group().lower() for match in _WORD_PATTERN.finditer(text)]
+
+
+def _compute_dialect_single(text: str) -> _ChunkAnalysis:
+    """Compute dialect metrics for a single chunk of text.
+
+    This is the core detection function, called once per chunk. It matches
+    vocabulary, spelling patterns, and grammar patterns against the text,
+    applying feature weights from the JSON database.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Args:
+        text: Single chunk of text to analyze
+
+    Returns:
+        _ChunkAnalysis with all metrics for this chunk
+    """
+    markers = get_markers()
+    words = _tokenize_words(text)
+    word_count = len(words)
+
+    # Initialize counters
+    british_count = 0.0
+    american_count = 0.0
+    total_markers = 0
+    eye_dialect_count = 0
+
+    markers_by_level: dict[str, dict[str, int]] = {
+        "phonological": {},
+        "morphological": {},
+        "lexical": {},
+        "syntactic": {},
+    }
+    spelling_markers: dict[str, int] = defaultdict(int)
+    vocabulary_markers: dict[str, int] = defaultdict(int)
+    grammar_markers: dict[str, int] = defaultdict(int)
+
+    # ===== Vocabulary matching (lexical level) =====
+    # Match against vocabulary pairs and exclusive vocabulary
+    for word in words:
+        if word in markers.british_vocabulary:
+            british_count += 1.0  # Default weight 1.0 for vocabulary
+            total_markers += 1
+            vocabulary_markers[word] += 1
+            markers_by_level["lexical"][word] = markers_by_level["lexical"].get(word, 0) + 1
+
+        if word in markers.american_vocabulary:
+            american_count += 1.0
+            total_markers += 1
+            vocabulary_markers[word] += 1
+            markers_by_level["lexical"][word] = markers_by_level["lexical"].get(word, 0) + 1
+
+    # ===== Standalone spelling matching (phonological level) =====
+    # Direct word pairs like grey/gray, cheque/check
+    for word in words:
+        if word in markers.british_spellings:
+            british_count += 0.9  # High weight for spelling differences
+            total_markers += 1
+            spelling_markers[word] += 1
+            markers_by_level["phonological"][word] = (
+                markers_by_level["phonological"].get(word, 0) + 1
+            )
+
+        if word in markers.american_spellings:
+            american_count += 0.9
+            total_markers += 1
+            spelling_markers[word] += 1
+            markers_by_level["phonological"][word] = (
+                markers_by_level["phonological"].get(word, 0) + 1
+            )
+
+    # ===== Regex spelling patterns (morphological level) =====
+    # Patterns like -ise/-ize, -our/-or with feature weights
+    text_lower = text.lower()
+    for pattern in markers.spelling_patterns:
+        weight = pattern.weight
+        feature_level = pattern.feature_level
+
+        # Match British pattern
+        if pattern.pattern_british:
+            for match in pattern.pattern_british.finditer(text_lower):
+                word = match.group().lower()
+                # Skip exceptions
+                if word not in pattern.exceptions:
+                    british_count += weight
+                    total_markers += 1
+                    spelling_markers[word] += 1
+                    markers_by_level[feature_level][word] = (
+                        markers_by_level[feature_level].get(word, 0) + 1
+                    )
+
+        # Match American pattern
+        if pattern.pattern_american:
+            for match in pattern.pattern_american.finditer(text_lower):
+                word = match.group().lower()
+                if word not in pattern.exceptions:
+                    american_count += weight
+                    total_markers += 1
+                    spelling_markers[word] += 1
+                    markers_by_level[feature_level][word] = (
+                        markers_by_level[feature_level].get(word, 0) + 1
+                    )
+
+    # ===== Grammar patterns (syntactic level) =====
+    # Patterns like "have got", "gotten", collective noun agreement
+    for grammar_pattern in markers.grammar_patterns:
+        weight = grammar_pattern.weight
+
+        # Match British grammar pattern
+        if grammar_pattern.pattern_british:
+            matches = list(grammar_pattern.pattern_british.finditer(text_lower))
+            if matches:
+                british_count += weight * len(matches)
+                total_markers += len(matches)
+                grammar_markers[grammar_pattern.name] = len(matches)
+                markers_by_level["syntactic"][grammar_pattern.name] = markers_by_level[
+                    "syntactic"
+                ].get(grammar_pattern.name, 0) + len(matches)
+
+        # Match American grammar pattern
+        if grammar_pattern.pattern_american:
+            matches = list(grammar_pattern.pattern_american.finditer(text_lower))
+            if matches:
+                american_count += weight * len(matches)
+                total_markers += len(matches)
+                grammar_markers[grammar_pattern.name] = grammar_markers.get(
+                    grammar_pattern.name, 0
+                ) + len(matches)
+                markers_by_level["syntactic"][grammar_pattern.name] = markers_by_level[
+                    "syntactic"
+                ].get(grammar_pattern.name, 0) + len(matches)
+
+    # ===== Eye dialect (register markers, not dialect) =====
+    # gonna, wanna, etc. indicate informal register
+    for word in words:
+        if word in markers.eye_dialect_words:
+            eye_dialect_count += 1
+
+    return _ChunkAnalysis(
+        british_count=british_count,
+        american_count=american_count,
+        total_markers=total_markers,
+        word_count=word_count,
+        eye_dialect_count=eye_dialect_count,
+        markers_by_level=dict(markers_by_level),
+        spelling_markers=dict(spelling_markers),
+        vocabulary_markers=dict(vocabulary_markers),
+        grammar_markers=dict(grammar_markers),
+    )
+
+
+def _classify_dialect(british_score: float, american_score: float) -> tuple[str, float]:
+    """Classify dialect based on scores.
+
+    Classification rules:
+    - If both scores are very low (< 0.1), classify as "neutral"
+    - If scores are close (within 20% of each other), classify as "mixed"
+    - Otherwise, classify as the dominant dialect
+
+    Args:
+        british_score: Normalized British marker score (0.0-1.0)
+        american_score: Normalized American marker score (0.0-1.0)
+
+    Returns:
+        Tuple of (dialect, confidence) where dialect is one of:
+        "british", "american", "mixed", "neutral"
+    """
+    # Both very low -> neutral
+    if british_score < 0.05 and american_score < 0.05:
+        return "neutral", 0.5
+
+    total = british_score + american_score
+    if total == 0:
+        return "neutral", 0.5
+
+    # Calculate ratio
+    british_ratio = british_score / total
+    american_ratio = american_score / total
+
+    # Close scores -> mixed
+    if abs(british_ratio - american_ratio) < 0.2:
+        confidence = 1.0 - abs(british_ratio - american_ratio)
+        return "mixed", confidence
+
+    # Dominant dialect
+    if british_ratio > american_ratio:
+        confidence = british_ratio
+        return "british", confidence
+    else:
+        confidence = american_ratio
+        return "american", confidence
+
+
+def _compute_markedness(
+    british_score: float, american_score: float, eye_dialect_ratio: float
+) -> float:
+    """Compute markedness score.
+
+    Markedness measures how far the text deviates from "unmarked" standard
+    English. High markedness suggests intentional stylistic choice or strong
+    dialect identity.
+
+    Following Battistella (1990), markedness is computed as the sum of:
+    - Dialect marker density (British + American)
+    - Eye dialect density (informal register markers)
+
+    Normalized to 0.0-1.0 range.
+
+    Args:
+        british_score: Normalized British score
+        american_score: Normalized American score
+        eye_dialect_ratio: Eye dialect per 1000 words
+
+    Returns:
+        Markedness score 0.0-1.0 (higher = more marked)
+    """
+    # Combine dialect markers and eye dialect
+    dialect_component = (british_score + american_score) / 2
+    register_component = min(eye_dialect_ratio / 10, 1.0)  # Cap at 10 per 1000 words
+
+    # Weighted combination (dialect matters more than register)
+    markedness = 0.7 * dialect_component + 0.3 * register_component
+
+    return min(markedness, 1.0)
+
+
+def compute_dialect(text: str, chunk_size: int = 1000) -> DialectResult:
+    """Compute dialect detection metrics for a text.
+
+    This function uses native chunked analysis per Issue #27, computing
+    metrics per chunk and aggregating into distributions. The variance
+    across chunks can reveal mixed authorship (e.g., UK writer using
+    ChatGPT-generated American English content).
+
+    Related GitHub Issues:
+        #35 - Dialect detection with extensible JSON markers
+            https://github.com/craigtrim/pystylometry/issues/35
+        #30 - Whonix stylometry features (regional linguistic preferences)
+            https://github.com/craigtrim/pystylometry/issues/30
+        #27 - Native chunked analysis with Distribution dataclass
+            https://github.com/craigtrim/pystylometry/issues/27
+
+    Detection Process:
+        1. Split text into chunks (default 1000 words)
+        2. For each chunk:
+           - Match vocabulary (lexical level)
+           - Match spelling patterns (phonological/morphological)
+           - Match grammar patterns (syntactic level)
+           - Count eye dialect markers (register indicator)
+           - Apply feature weights from linguistic research
+        3. Aggregate into distributions
+        4. Classify dialect and compute confidence
+
+    Args:
+        text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
+
+    Returns:
+        DialectResult with dialect classification, scores, distributions,
+        and detailed marker breakdowns
+
+    Example:
+        >>> result = compute_dialect("The colour of the programme was brilliant.")
+        >>> result.dialect
+        'british'
+        >>> result.british_score
+        0.85
+        >>> result.markedness_score
+        0.42
+
+        >>> # Detect mixed dialect
+        >>> result = compute_dialect("I love the color of autumn leaves in the neighbourhood.")
+        >>> result.dialect
+        'mixed'
+        >>> result.british_score_dist.std  # Low std = consistent markers
+        0.02
+    """
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Analyze each chunk
+    british_scores: list[float] = []
+    american_scores: list[float] = []
+    markedness_scores: list[float] = []
+
+    total_eye_dialect = 0
+    total_word_count = 0
+
+    # Aggregate markers across chunks
+    agg_markers_by_level: dict[str, dict[str, int]] = {
+        "phonological": {},
+        "morphological": {},
+        "lexical": {},
+        "syntactic": {},
+    }
+    agg_spelling: dict[str, int] = defaultdict(int)
+    agg_vocabulary: dict[str, int] = defaultdict(int)
+    agg_grammar: dict[str, int] = defaultdict(int)
+
+    for chunk in chunks:
+        analysis = _compute_dialect_single(chunk)
+
+        # Skip empty chunks
+        if analysis.word_count == 0:
+            continue
+
+        # Normalize scores to per-1000-words for comparability
+        normalizer = 1000.0 / analysis.word_count if analysis.word_count > 0 else 0
+
+        british_normalized = analysis.british_count * normalizer
+        american_normalized = analysis.american_count * normalizer
+        eye_dialect_ratio = analysis.eye_dialect_count * normalizer
+
+        # Convert to 0-1 scale (cap at reasonable maximum)
+        # Typical texts have 0-50 markers per 1000 words
+        british_score = min(british_normalized / 50, 1.0)
+        american_score = min(american_normalized / 50, 1.0)
+
+        british_scores.append(british_score)
+        american_scores.append(american_score)
+
+        # Compute markedness for this chunk
+        markedness = _compute_markedness(british_score, american_score, eye_dialect_ratio)
+        markedness_scores.append(markedness)
+
+        # Aggregate counts
+        total_eye_dialect += analysis.eye_dialect_count
+        total_word_count += analysis.word_count
+
+        # Aggregate markers
+        for level, markers in analysis.markers_by_level.items():
+            for marker, count in markers.items():
+                agg_markers_by_level[level][marker] = (
+                    agg_markers_by_level[level].get(marker, 0) + count
+                )
+
+        for marker, count in analysis.spelling_markers.items():
+            agg_spelling[marker] += count
+        for marker, count in analysis.vocabulary_markers.items():
+            agg_vocabulary[marker] += count
+        for marker, count in analysis.grammar_markers.items():
+            agg_grammar[marker] += count
+
+    # Handle empty text
+    if not british_scores:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+        return DialectResult(
+            dialect="neutral",
+            confidence=0.0,
+            british_score=float("nan"),
+            american_score=float("nan"),
+            markedness_score=float("nan"),
+            british_score_dist=empty_dist,
+            american_score_dist=empty_dist,
+            markedness_score_dist=empty_dist,
+            markers_by_level=agg_markers_by_level,
+            spelling_markers=dict(agg_spelling),
+            vocabulary_markers=dict(agg_vocabulary),
+            grammar_markers=dict(agg_grammar),
+            eye_dialect_count=0,
+            eye_dialect_ratio=0.0,
+            register_hints={},
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={"total_word_count": 0},
+        )
+
+    # Build distributions
+    british_dist = make_distribution(british_scores)
+    american_dist = make_distribution(american_scores)
+    markedness_dist = make_distribution(markedness_scores)
+
+    # Classify based on mean scores
+    dialect, confidence = _classify_dialect(british_dist.mean, american_dist.mean)
+
+    # Compute overall eye dialect ratio
+    eye_dialect_ratio = (
+        (total_eye_dialect / total_word_count * 1000) if total_word_count > 0 else 0.0
+    )
+
+    # Build register hints
+    register_hints: dict[str, Any] = {
+        "eye_dialect_density": eye_dialect_ratio,
+        "marker_density": (british_dist.mean + american_dist.mean) / 2,
+    }
+
+    return DialectResult(
+        dialect=dialect,
+        confidence=confidence,
+        british_score=british_dist.mean,
+        american_score=american_dist.mean,
+        markedness_score=markedness_dist.mean,
+        british_score_dist=british_dist,
+        american_score_dist=american_dist,
+        markedness_score_dist=markedness_dist,
+        markers_by_level=agg_markers_by_level,
+        spelling_markers=dict(agg_spelling),
+        vocabulary_markers=dict(agg_vocabulary),
+        grammar_markers=dict(agg_grammar),
+        eye_dialect_count=total_eye_dialect,
+        eye_dialect_ratio=eye_dialect_ratio,
+        register_hints=register_hints,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
+        metadata={
+            "total_word_count": total_word_count,
+            "markers_version": get_markers().version,
+        },
+    )
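For orientation, a minimal usage sketch of the new dialect module follows. It is not taken from the package's documentation: it assumes the 1.1.0 wheel is installed and imports compute_dialect directly from pystylometry.dialect.detector, the file added in the hunk above. The sample text is invented and the printed values are illustrative only, since actual scores depend on the bundled dialect_markers.json.

    # Usage sketch (illustrative, not from the package docs). Assumes:
    #   pip install pystylometry==1.1.0
    # The import path mirrors the new file in this diff:
    #   pystylometry/dialect/detector.py
    from pystylometry.dialect.detector import compute_dialect

    text = (
        "The colour of the programme was brilliant, though the "
        "neighbourhood favours a different flavour of humour."
    )

    result = compute_dialect(text, chunk_size=1000)

    # Fields below are the ones constructed by compute_dialect() above;
    # the printed values will vary with the bundled marker database.
    print(result.dialect)           # "british", "american", "mixed", or "neutral"
    print(result.confidence)        # dominance ratio from _classify_dialect()
    print(result.british_score)     # mean per-chunk score on a 0.0-1.0 scale
    print(result.markedness_score)  # weighted dialect + eye-dialect density
    print(result.spelling_markers)  # e.g. {"colour": 1, "neighbourhood": 1, ...}
    print(result.chunk_count)       # number of chunk_size-word chunks analyzed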