pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/dialect/detector.py
@@ -0,0 +1,533 @@
+ """Dialect detection using extensible JSON markers.
+
+ This module implements dialect detection for stylometric analysis, identifying
+ regional linguistic preferences (British vs. American English) and measuring
+ text markedness. The analysis uses native chunked analysis per Issue #27,
+ computing metrics per chunk and providing distributions for fingerprinting.
+
+ Related GitHub Issues:
+     #35 - Dialect detection with extensible JSON markers
+         https://github.com/craigtrim/pystylometry/issues/35
+     #30 - Whonix stylometry features (regional linguistic preferences)
+         https://github.com/craigtrim/pystylometry/issues/30
+     #27 - Native chunked analysis with Distribution dataclass
+         https://github.com/craigtrim/pystylometry/issues/27
+
+ Theoretical Background:
+     Dialectometry (Goebl, 1982; Nerbonne, 2009) provides the quantitative
+     framework for measuring dialect similarity. Rather than selecting individual
+     "characteristic" features, modern dialectometry quantifies holistically
+     across all available markers.
+
+     Markedness theory (Battistella, 1990) informs the markedness_score: marked
+     forms stand out against "standard" written English. High markedness suggests
+     intentional stylistic choice or strong dialect identity.
+
+     Eye dialect (spellings like "gonna" that look nonstandard but reflect
+     standard pronunciation) indicates informal register, not regional dialect
+     (Encyclopedia.com, "Slang, Dialect, and Marked Language").
+
+ Detection Strategy:
+     1. Tokenize text and identify words
+     2. Match vocabulary (lexical level): flat/apartment, lorry/truck
+     3. Match spelling patterns (phonological/morphological): colour/color, -ise/-ize
+     4. Match grammar patterns (syntactic): have got/have, collective noun agreement
+     5. Count eye dialect markers separately (register, not dialect)
+     6. Apply feature weights from linguistic research
+     7. Compute scores and classify dialect
+
+ Chunking:
+     Following Issue #27, the text is split into chunks (default 1000 words).
+     Each chunk is analyzed independently, then results are aggregated into
+     Distribution objects. This captures variance across the text, which can
+     reveal mixed authorship (e.g., human + AI-generated content).
+
+ References:
+     Battistella, Edwin L. "Markedness: The Evaluative Superstructure of
+         Language." State University of New York Press, 1990.
+     Goebl, Hans. "Dialektometrie: Prinzipien und Methoden des Einsatzes der
+         numerischen Taxonomie im Bereich der Dialektgeographie." Verlag der
+         Österreichischen Akademie der Wissenschaften, 1982.
+     Labov, William. "The Social Stratification of English in New York City."
+         Cambridge University Press, 2006.
+     Nerbonne, John. "Data-Driven Dialectology." Language and Linguistics
+         Compass, vol. 3, no. 1, 2009, pp. 175-198.
+     Whonix Project. "Stylometry: Deanonymization Techniques." Whonix Wiki,
+         https://www.whonix.org/wiki/Stylometry
+ """
+
+ from __future__ import annotations
+
+ import re
+ from collections import defaultdict
+ from dataclasses import dataclass
+ from typing import Any
+
+ from .._types import DialectResult, Distribution, chunk_text, make_distribution
+ from ._loader import get_markers
+
+ # Simple word tokenizer pattern
+ _WORD_PATTERN = re.compile(r"\b[a-zA-Z]+(?:'[a-zA-Z]+)?\b")
+
+
+ @dataclass
+ class _ChunkAnalysis:
+     """Internal result from analyzing a single chunk.
+
+     This dataclass holds per-chunk metrics that will be aggregated into
+     distributions for the final DialectResult.
+
+     Attributes:
+         british_count: Weighted count of British markers
+         american_count: Weighted count of American markers
+         total_markers: Total unweighted marker count
+         word_count: Total words in chunk
+         eye_dialect_count: Eye dialect markers found
+         markers_by_level: Markers categorized by linguistic level
+         spelling_markers: Individual spelling markers found
+         vocabulary_markers: Individual vocabulary markers found
+         grammar_markers: Individual grammar markers found
+     """
+
+     british_count: float
+     american_count: float
+     total_markers: int
+     word_count: int
+     eye_dialect_count: int
+     markers_by_level: dict[str, dict[str, int]]
+     spelling_markers: dict[str, int]
+     vocabulary_markers: dict[str, int]
+     grammar_markers: dict[str, int]
+
+
+ def _tokenize_words(text: str) -> list[str]:
+     """Extract words from text for analysis.
+
+     Uses a simple regex pattern that captures contractions (don't, I'm)
+     as single tokens. All words are lowercased for matching.
+
+     Args:
+         text: Input text
+
+     Returns:
+         List of lowercase words
+     """
+     return [match.group().lower() for match in _WORD_PATTERN.finditer(text)]
+
+
+ def _compute_dialect_single(text: str) -> _ChunkAnalysis:
+     """Compute dialect metrics for a single chunk of text.
+
+     This is the core detection function, called once per chunk. It matches
+     vocabulary, spelling patterns, and grammar patterns against the text,
+     applying feature weights from the JSON database.
+
+     Related GitHub Issue:
+         #27 - Native chunked analysis with Distribution dataclass
+         https://github.com/craigtrim/pystylometry/issues/27
+
+     Args:
+         text: Single chunk of text to analyze
+
+     Returns:
+         _ChunkAnalysis with all metrics for this chunk
+     """
+     markers = get_markers()
+     words = _tokenize_words(text)
+     word_count = len(words)
+
+     # Initialize counters
+     british_count = 0.0
+     american_count = 0.0
+     total_markers = 0
+     eye_dialect_count = 0
+
+     markers_by_level: dict[str, dict[str, int]] = {
+         "phonological": {},
+         "morphological": {},
+         "lexical": {},
+         "syntactic": {},
+     }
+     spelling_markers: dict[str, int] = defaultdict(int)
+     vocabulary_markers: dict[str, int] = defaultdict(int)
+     grammar_markers: dict[str, int] = defaultdict(int)
+
+     # ===== Vocabulary matching (lexical level) =====
+     # Match against vocabulary pairs and exclusive vocabulary
+     for word in words:
+         if word in markers.british_vocabulary:
+             british_count += 1.0  # Default weight 1.0 for vocabulary
+             total_markers += 1
+             vocabulary_markers[word] += 1
+             markers_by_level["lexical"][word] = markers_by_level["lexical"].get(word, 0) + 1
+
+         if word in markers.american_vocabulary:
+             american_count += 1.0
+             total_markers += 1
+             vocabulary_markers[word] += 1
+             markers_by_level["lexical"][word] = markers_by_level["lexical"].get(word, 0) + 1
+
+     # ===== Standalone spelling matching (phonological level) =====
+     # Direct word pairs like grey/gray, cheque/check
+     for word in words:
+         if word in markers.british_spellings:
+             british_count += 0.9  # High weight for spelling differences
+             total_markers += 1
+             spelling_markers[word] += 1
+             markers_by_level["phonological"][word] = (
+                 markers_by_level["phonological"].get(word, 0) + 1
+             )
+
+         if word in markers.american_spellings:
+             american_count += 0.9
+             total_markers += 1
+             spelling_markers[word] += 1
+             markers_by_level["phonological"][word] = (
+                 markers_by_level["phonological"].get(word, 0) + 1
+             )
+
+     # ===== Regex spelling patterns (morphological level) =====
+     # Patterns like -ise/-ize, -our/-or with feature weights
+     text_lower = text.lower()
+     for pattern in markers.spelling_patterns:
+         weight = pattern.weight
+         feature_level = pattern.feature_level
+
+         # Match British pattern
+         if pattern.pattern_british:
+             for match in pattern.pattern_british.finditer(text_lower):
+                 word = match.group().lower()
+                 # Skip exceptions
+                 if word not in pattern.exceptions:
+                     british_count += weight
+                     total_markers += 1
+                     spelling_markers[word] += 1
+                     markers_by_level[feature_level][word] = (
+                         markers_by_level[feature_level].get(word, 0) + 1
+                     )
+
+         # Match American pattern
+         if pattern.pattern_american:
+             for match in pattern.pattern_american.finditer(text_lower):
+                 word = match.group().lower()
+                 if word not in pattern.exceptions:
+                     american_count += weight
+                     total_markers += 1
+                     spelling_markers[word] += 1
+                     markers_by_level[feature_level][word] = (
+                         markers_by_level[feature_level].get(word, 0) + 1
+                     )
+
+     # ===== Grammar patterns (syntactic level) =====
+     # Patterns like "have got", "gotten", collective noun agreement
+     for grammar_pattern in markers.grammar_patterns:
+         weight = grammar_pattern.weight
+
+         # Match British grammar pattern
+         if grammar_pattern.pattern_british:
+             matches = list(grammar_pattern.pattern_british.finditer(text_lower))
+             if matches:
+                 british_count += weight * len(matches)
+                 total_markers += len(matches)
+                 grammar_markers[grammar_pattern.name] = len(matches)
+                 markers_by_level["syntactic"][grammar_pattern.name] = markers_by_level[
+                     "syntactic"
+                 ].get(grammar_pattern.name, 0) + len(matches)
+
+         # Match American grammar pattern
+         if grammar_pattern.pattern_american:
+             matches = list(grammar_pattern.pattern_american.finditer(text_lower))
+             if matches:
+                 american_count += weight * len(matches)
+                 total_markers += len(matches)
+                 grammar_markers[grammar_pattern.name] = grammar_markers.get(
+                     grammar_pattern.name, 0
+                 ) + len(matches)
+                 markers_by_level["syntactic"][grammar_pattern.name] = markers_by_level[
+                     "syntactic"
+                 ].get(grammar_pattern.name, 0) + len(matches)
+
+     # ===== Eye dialect (register markers, not dialect) =====
+     # gonna, wanna, etc. indicate informal register
+     for word in words:
+         if word in markers.eye_dialect_words:
+             eye_dialect_count += 1
+
+     return _ChunkAnalysis(
+         british_count=british_count,
+         american_count=american_count,
+         total_markers=total_markers,
+         word_count=word_count,
+         eye_dialect_count=eye_dialect_count,
+         markers_by_level=dict(markers_by_level),
+         spelling_markers=dict(spelling_markers),
+         vocabulary_markers=dict(vocabulary_markers),
+         grammar_markers=dict(grammar_markers),
+     )
+
+
+ def _classify_dialect(british_score: float, american_score: float) -> tuple[str, float]:
+     """Classify dialect based on scores.
+
+     Classification rules:
+     - If both scores are very low (< 0.1), classify as "neutral"
+     - If scores are close (within 20% of each other), classify as "mixed"
+     - Otherwise, classify as the dominant dialect
+
+     Args:
+         british_score: Normalized British marker score (0.0-1.0)
+         american_score: Normalized American marker score (0.0-1.0)
+
+     Returns:
+         Tuple of (dialect, confidence) where dialect is one of:
+         "british", "american", "mixed", "neutral"
+     """
+     # Both very low -> neutral
+     if british_score < 0.05 and american_score < 0.05:
+         return "neutral", 0.5
+
+     total = british_score + american_score
+     if total == 0:
+         return "neutral", 0.5
+
+     # Calculate ratio
+     british_ratio = british_score / total
+     american_ratio = american_score / total
+
+     # Close scores -> mixed
+     if abs(british_ratio - american_ratio) < 0.2:
+         confidence = 1.0 - abs(british_ratio - american_ratio)
+         return "mixed", confidence
+
+     # Dominant dialect
+     if british_ratio > american_ratio:
+         confidence = british_ratio
+         return "british", confidence
+     else:
+         confidence = american_ratio
+         return "american", confidence
+
+
+ def _compute_markedness(
+     british_score: float, american_score: float, eye_dialect_ratio: float
+ ) -> float:
+     """Compute markedness score.
+
+     Markedness measures how far the text deviates from "unmarked" standard
+     English. High markedness suggests intentional stylistic choice or strong
+     dialect identity.
+
+     Following Battistella (1990), markedness is computed as the sum of:
+     - Dialect marker density (British + American)
+     - Eye dialect density (informal register markers)
+
+     Normalized to 0.0-1.0 range.
+
+     Args:
+         british_score: Normalized British score
+         american_score: Normalized American score
+         eye_dialect_ratio: Eye dialect per 1000 words
+
+     Returns:
+         Markedness score 0.0-1.0 (higher = more marked)
+     """
+     # Combine dialect markers and eye dialect
+     dialect_component = (british_score + american_score) / 2
+     register_component = min(eye_dialect_ratio / 10, 1.0)  # Cap at 10 per 1000 words
+
+     # Weighted combination (dialect matters more than register)
+     markedness = 0.7 * dialect_component + 0.3 * register_component
+
+     return min(markedness, 1.0)
+
+
+ def compute_dialect(text: str, chunk_size: int = 1000) -> DialectResult:
+     """Compute dialect detection metrics for a text.
+
+     This function uses native chunked analysis per Issue #27, computing
+     metrics per chunk and aggregating into distributions. The variance
+     across chunks can reveal mixed authorship (e.g., UK writer using
+     ChatGPT-generated American English content).
+
+     Related GitHub Issues:
+         #35 - Dialect detection with extensible JSON markers
+             https://github.com/craigtrim/pystylometry/issues/35
+         #30 - Whonix stylometry features (regional linguistic preferences)
+             https://github.com/craigtrim/pystylometry/issues/30
+         #27 - Native chunked analysis with Distribution dataclass
+             https://github.com/craigtrim/pystylometry/issues/27
+
+     Detection Process:
+         1. Split text into chunks (default 1000 words)
+         2. For each chunk:
+            - Match vocabulary (lexical level)
+            - Match spelling patterns (phonological/morphological)
+            - Match grammar patterns (syntactic level)
+            - Count eye dialect markers (register indicator)
+            - Apply feature weights from linguistic research
+         3. Aggregate into distributions
+         4. Classify dialect and compute confidence
+
+     Args:
+         text: Input text to analyze
+         chunk_size: Number of words per chunk (default: 1000)
+
+     Returns:
+         DialectResult with dialect classification, scores, distributions,
+         and detailed marker breakdowns
+
+     Example:
+         >>> result = compute_dialect("The colour of the programme was brilliant.")
+         >>> result.dialect
+         'british'
+         >>> result.british_score
+         0.85
+         >>> result.markedness_score
+         0.42
+
+         >>> # Detect mixed dialect
+         >>> result = compute_dialect("I love the color of autumn leaves in the neighbourhood.")
+         >>> result.dialect
+         'mixed'
+         >>> result.british_score_dist.std  # Low std = consistent markers
+         0.02
+     """
+     # Chunk the text
+     chunks = chunk_text(text, chunk_size)
+
+     # Analyze each chunk
+     british_scores: list[float] = []
+     american_scores: list[float] = []
+     markedness_scores: list[float] = []
+
+     total_eye_dialect = 0
+     total_word_count = 0
+
+     # Aggregate markers across chunks
+     agg_markers_by_level: dict[str, dict[str, int]] = {
+         "phonological": {},
+         "morphological": {},
+         "lexical": {},
+         "syntactic": {},
+     }
+     agg_spelling: dict[str, int] = defaultdict(int)
+     agg_vocabulary: dict[str, int] = defaultdict(int)
+     agg_grammar: dict[str, int] = defaultdict(int)
+
+     for chunk in chunks:
+         analysis = _compute_dialect_single(chunk)
+
+         # Skip empty chunks
+         if analysis.word_count == 0:
+             continue
+
+         # Normalize scores to per-1000-words for comparability
+         normalizer = 1000.0 / analysis.word_count if analysis.word_count > 0 else 0
+
+         british_normalized = analysis.british_count * normalizer
+         american_normalized = analysis.american_count * normalizer
+         eye_dialect_ratio = analysis.eye_dialect_count * normalizer
+
+         # Convert to 0-1 scale (cap at reasonable maximum)
+         # Typical texts have 0-50 markers per 1000 words
+         british_score = min(british_normalized / 50, 1.0)
+         american_score = min(american_normalized / 50, 1.0)
+
+         british_scores.append(british_score)
+         american_scores.append(american_score)
+
+         # Compute markedness for this chunk
+         markedness = _compute_markedness(british_score, american_score, eye_dialect_ratio)
+         markedness_scores.append(markedness)
+
+         # Aggregate counts
+         total_eye_dialect += analysis.eye_dialect_count
+         total_word_count += analysis.word_count
+
+         # Aggregate markers
+         for level, markers in analysis.markers_by_level.items():
+             for marker, count in markers.items():
+                 agg_markers_by_level[level][marker] = (
+                     agg_markers_by_level[level].get(marker, 0) + count
+                 )
+
+         for marker, count in analysis.spelling_markers.items():
+             agg_spelling[marker] += count
+         for marker, count in analysis.vocabulary_markers.items():
+             agg_vocabulary[marker] += count
+         for marker, count in analysis.grammar_markers.items():
+             agg_grammar[marker] += count
+
+     # Handle empty text
+     if not british_scores:
+         empty_dist = Distribution(
+             values=[],
+             mean=float("nan"),
+             median=float("nan"),
+             std=0.0,
+             range=0.0,
+             iqr=0.0,
+         )
+         return DialectResult(
+             dialect="neutral",
+             confidence=0.0,
+             british_score=float("nan"),
+             american_score=float("nan"),
+             markedness_score=float("nan"),
+             british_score_dist=empty_dist,
+             american_score_dist=empty_dist,
+             markedness_score_dist=empty_dist,
+             markers_by_level=agg_markers_by_level,
+             spelling_markers=dict(agg_spelling),
+             vocabulary_markers=dict(agg_vocabulary),
+             grammar_markers=dict(agg_grammar),
+             eye_dialect_count=0,
+             eye_dialect_ratio=0.0,
+             register_hints={},
+             chunk_size=chunk_size,
+             chunk_count=len(chunks),
+             metadata={"total_word_count": 0},
+         )
+
+     # Build distributions
+     british_dist = make_distribution(british_scores)
+     american_dist = make_distribution(american_scores)
+     markedness_dist = make_distribution(markedness_scores)
+
+     # Classify based on mean scores
+     dialect, confidence = _classify_dialect(british_dist.mean, american_dist.mean)
+
+     # Compute overall eye dialect ratio
+     eye_dialect_ratio = (
+         (total_eye_dialect / total_word_count * 1000) if total_word_count > 0 else 0.0
+     )
+
+     # Build register hints
+     register_hints: dict[str, Any] = {
+         "eye_dialect_density": eye_dialect_ratio,
+         "marker_density": (british_dist.mean + american_dist.mean) / 2,
+     }
+
+     return DialectResult(
+         dialect=dialect,
+         confidence=confidence,
+         british_score=british_dist.mean,
+         american_score=american_dist.mean,
+         markedness_score=markedness_dist.mean,
+         british_score_dist=british_dist,
+         american_score_dist=american_dist,
+         markedness_score_dist=markedness_dist,
+         markers_by_level=agg_markers_by_level,
+         spelling_markers=dict(agg_spelling),
+         vocabulary_markers=dict(agg_vocabulary),
+         grammar_markers=dict(agg_grammar),
+         eye_dialect_count=total_eye_dialect,
+         eye_dialect_ratio=eye_dialect_ratio,
+         register_hints=register_hints,
+         chunk_size=chunk_size,
+         chunk_count=len(chunks),
+         metadata={
+             "total_word_count": total_word_count,
+             "markers_version": get_markers().version,
+         },
+     )
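
For orientation, a usage sketch of the new compute_dialect entry point follows; it is not part of the diff. It assumes the function is re-exported from pystylometry.dialect (the subpackage's __init__.py is also added in this release); if it is not, import it from pystylometry.dialect.detector. The 0.25 variance threshold and the worked markedness numbers are illustrative only, not values taken from the package.

    # Hedged sketch: exercises compute_dialect as documented in the docstring above.
    # Assumption: pystylometry.dialect re-exports compute_dialect; otherwise use
    # `from pystylometry.dialect.detector import compute_dialect`.
    from pystylometry.dialect import compute_dialect

    with open("manuscript.txt", encoding="utf-8") as fh:
        text = fh.read()

    result = compute_dialect(text, chunk_size=1000)
    print(result.dialect, round(result.confidence, 2))  # e.g. 'british' 0.78
    print(result.spelling_markers)                      # e.g. {'colour': 3, 'organise': 1}

    # Per-chunk distributions (Issue #27): a large spread across chunks can hint at
    # mixed authorship. The 0.25 threshold here is arbitrary, chosen for the example.
    if result.chunk_count > 1 and result.british_score_dist.std > 0.25:
        print("Dialect markers vary noticeably between chunks")

    # Worked markedness example using the weights in _compute_markedness above:
    #   dialect_component  = (0.30 + 0.05) / 2        = 0.175
    #   register_component = min(4.0 / 10, 1.0)       = 0.4    (4 eye-dialect hits per 1000 words)
    #   markedness         = 0.7 * 0.175 + 0.3 * 0.4  = 0.2425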
pystylometry/lexical/__init__.py
@@ -1,17 +1,24 @@
  """Lexical diversity metrics."""

- # Re-export from stylometry-ttr
- # from stylometry_ttr import compute_ttr, TTRResult
-
  # Local implementations
- from .hapax import compute_hapax_ratios
+ from .advanced_diversity import compute_hdd, compute_mattr, compute_msttr, compute_vocd_d
+ from .function_words import compute_function_words
+ from .hapax import compute_hapax_ratios, compute_hapax_with_lexicon_analysis
  from .mtld import compute_mtld
+ from .ttr import compute_ttr
+ from .word_frequency_sophistication import compute_word_frequency_sophistication
  from .yule import compute_yule

  __all__ = [
-     # "compute_ttr", # From stylometry-ttr
-     # "TTRResult", # From stylometry-ttr
+     "compute_ttr",
      "compute_mtld",
      "compute_yule",
      "compute_hapax_ratios",
+     "compute_hapax_with_lexicon_analysis",
+     "compute_function_words",
+     "compute_vocd_d",
+     "compute_mattr",
+     "compute_hdd",
+     "compute_msttr",
+     "compute_word_frequency_sophistication",
  ]
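
The re-exports above make the new lexical diversity metrics importable directly from pystylometry.lexical. A brief sketch follows; it is not part of the diff, and the call signatures are an assumption (the hunk only shows the names in __all__, not their parameters), so treat it as illustrative.

    # Hedged sketch of the expanded lexical API. Names come from the __all__ list
    # above; passing raw text is an assumed signature, not confirmed by this hunk.
    from pystylometry.lexical import compute_mattr, compute_mtld, compute_ttr

    sample = "the quick brown fox jumps over the lazy dog the quick brown fox"
    print(compute_ttr(sample))    # assumed: compute_ttr(text) -> result object
    print(compute_mtld(sample))   # assumed: compute_mtld(text) -> result object
    print(compute_mattr(sample))  # assumed: compute_mattr(text) may also take a window size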