pystylometry-1.0.0-py3-none-any.whl → pystylometry-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. pystylometry/__init__.py +29 -3
  2. pystylometry/_types.py +963 -259
  3. pystylometry/authorship/__init__.py +23 -2
  4. pystylometry/authorship/additional_methods.py +4 -29
  5. pystylometry/authorship/kilgarriff.py +347 -0
  6. pystylometry/character/character_metrics.py +267 -179
  7. pystylometry/cli.py +427 -0
  8. pystylometry/consistency/__init__.py +57 -0
  9. pystylometry/consistency/_thresholds.py +162 -0
  10. pystylometry/consistency/drift.py +549 -0
  11. pystylometry/dialect/__init__.py +65 -0
  12. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  13. pystylometry/dialect/_loader.py +360 -0
  14. pystylometry/dialect/detector.py +533 -0
  15. pystylometry/lexical/advanced_diversity.py +61 -22
  16. pystylometry/lexical/function_words.py +255 -56
  17. pystylometry/lexical/hapax.py +182 -52
  18. pystylometry/lexical/mtld.py +108 -26
  19. pystylometry/lexical/ttr.py +76 -10
  20. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  21. pystylometry/lexical/yule.py +136 -50
  22. pystylometry/ngrams/entropy.py +150 -49
  23. pystylometry/readability/additional_formulas.py +1887 -762
  24. pystylometry/readability/ari.py +144 -82
  25. pystylometry/readability/coleman_liau.py +136 -109
  26. pystylometry/readability/flesch.py +177 -73
  27. pystylometry/readability/gunning_fog.py +165 -161
  28. pystylometry/readability/smog.py +123 -42
  29. pystylometry/syntactic/advanced_syntactic.py +76 -14
  30. pystylometry/syntactic/pos_ratios.py +70 -6
  31. pystylometry/syntactic/sentence_stats.py +55 -12
  32. pystylometry/syntactic/sentence_types.py +71 -15
  33. pystylometry/viz/__init__.py +71 -0
  34. pystylometry/viz/drift.py +589 -0
  35. pystylometry/viz/jsx/__init__.py +31 -0
  36. pystylometry/viz/jsx/_base.py +144 -0
  37. pystylometry/viz/jsx/report.py +677 -0
  38. pystylometry/viz/jsx/timeline.py +716 -0
  39. pystylometry/viz/jsx/viewer.py +1032 -0
  40. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
  41. pystylometry-1.1.0.dist-info/RECORD +63 -0
  42. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
  43. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  44. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/dialect/_loader.py
@@ -0,0 +1,360 @@
+ """Dialect marker data loading and caching.
+
+ This module provides efficient loading and caching of the dialect markers JSON
+ database. The JSON file contains vocabulary pairs, spelling patterns, grammar
+ patterns, and other linguistic markers used for dialect detection.
+
+ Related GitHub Issues:
+     #35 - Dialect detection with extensible JSON markers
+         https://github.com/craigtrim/pystylometry/issues/35
+     #30 - Whonix stylometry features (regional linguistic preferences)
+         https://github.com/craigtrim/pystylometry/issues/30
+
+ Architecture:
+     The loader uses module-level caching to ensure the JSON file is read only
+     once per Python session. This is important for performance when analyzing
+     multiple texts, as the dialect markers database is moderately large (~50KB).
+
+     The loader also pre-compiles regex patterns from the JSON to avoid repeated
+     compilation overhead during detection.
+
+ Data Structure:
+     The dialect_markers.json file follows an extensible schema with:
+     - metadata: Version, sources, last updated date
+     - feature_levels: Linguistic level categorization (phonological, etc.)
+     - eye_dialect: Informal register markers (gonna, wanna)
+     - pragmatic_markers: Discourse and politeness markers
+     - vocabulary.pairs: American/British word pairs with categories
+     - vocabulary.exclusive: Region-specific vocabulary
+     - spelling_patterns.british_american: Regex patterns with weights
+     - spelling_patterns.standalone: Direct word pairs
+     - grammar_patterns: Grammar difference patterns
+     - punctuation_patterns: Punctuation conventions
+     - idiomatic_expressions: Idioms by dialect
+
+ References:
+     Nerbonne, John. "Data-Driven Dialectology." Language and Linguistics
+         Compass, vol. 3, no. 1, 2009, pp. 175-198.
+     Grieve, Jack. "Quantitative Authorship Attribution: An Evaluation of
+         Techniques." Literary and Linguistic Computing, vol. 22, no. 3,
+         2007, pp. 251-270.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import re
+ from dataclasses import dataclass, field
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Any
+
+ # Path to the dialect markers JSON file
+ _DATA_DIR = Path(__file__).parent / "_data"
+ _MARKERS_FILE = _DATA_DIR / "dialect_markers.json"
+
+
+ @dataclass
+ class CompiledSpellingPattern:
+     """Pre-compiled spelling pattern for efficient matching.
+
+     Spelling patterns are compiled once at load time to avoid repeated
+     regex compilation during detection. Each pattern includes metadata
+     for weighted scoring and linguistic level categorization.
+
+     Attributes:
+         name: Pattern identifier (e.g., "our_or", "ise_ize")
+         description: Human-readable description
+         pattern_british: Compiled regex for British variant
+         pattern_american: Compiled regex for American variant
+         weight: Diagnostic value 0.0-1.0 (higher = more distinctive)
+         feature_level: Linguistic level (phonological, morphological, etc.)
+         exceptions: Words that match the pattern but aren't dialect markers
+     """
+
+     name: str
+     description: str
+     pattern_british: re.Pattern | None
+     pattern_american: re.Pattern | None
+     weight: float
+     feature_level: str
+     exceptions: set[str] = field(default_factory=set)
+
+
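
[Reviewer note] The weight and exceptions fields exist to drive weighted scoring during detection. A minimal sketch of how such a pattern might be applied (the actual logic lives in pystylometry/dialect/detector.py and may differ; score_pattern here is a hypothetical helper): a token that appears in exceptions is skipped even if it matches, and each genuine hit contributes the pattern's weight.

    def score_pattern(pattern: CompiledSpellingPattern, tokens: list[str]) -> float:
        """Illustrative only: accumulate weighted hits for the British variant."""
        score = 0.0
        for token in tokens:
            if token in pattern.exceptions:
                continue  # e.g., "four" matches \b\w+our\b but is not dialectal
            if pattern.pattern_british and pattern.pattern_british.fullmatch(token):
                score += pattern.weight
        return score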
+ @dataclass
+ class CompiledGrammarPattern:
+     """Pre-compiled grammar pattern for efficient matching.
+
+     Grammar patterns detect syntactic differences like "have got" vs "have",
+     collective noun agreement, and shall/will usage.
+
+     Attributes:
+         name: Pattern identifier (e.g., "have_got", "gotten")
+         description: Human-readable description
+         pattern_british: Compiled regex for British variant (may be None)
+         pattern_american: Compiled regex for American variant (may be None)
+         weight: Diagnostic value 0.0-1.0
+     """
+
+     name: str
+     description: str
+     pattern_british: re.Pattern | None
+     pattern_american: re.Pattern | None
+     weight: float = 0.8
+
+
+ @dataclass
+ class DialectMarkers:
+     """Container for all loaded and compiled dialect markers.
+
+     This dataclass holds the complete dialect marker database after loading
+     and preprocessing. It includes both raw data (for inspection) and
+     pre-compiled patterns (for efficient detection).
+
+     Related GitHub Issue:
+         #35 - Dialect detection with extensible JSON markers
+         https://github.com/craigtrim/pystylometry/issues/35
+
+     Attributes:
+         version: Data version from metadata
+         vocabulary_pairs: List of American/British word pairs
+         vocabulary_exclusive: Region-specific vocabulary by dialect
+         spelling_patterns: Pre-compiled spelling patterns
+         standalone_spellings: Direct British/American spelling pairs
+         grammar_patterns: Pre-compiled grammar patterns
+         eye_dialect_words: Set of eye dialect markers (gonna, wanna)
+         pragmatic_markers: Discourse markers by dialect
+         idiomatic_expressions: Idioms by dialect
+         raw_data: Original JSON data for inspection
+     """
+
+     version: str
+     vocabulary_pairs: list[dict[str, str]]
+     vocabulary_exclusive: dict[str, list[str]]
+     spelling_patterns: list[CompiledSpellingPattern]
+     standalone_spellings: list[dict[str, str]]
+     grammar_patterns: list[CompiledGrammarPattern]
+     eye_dialect_words: set[str]
+     pragmatic_markers: dict[str, Any]
+     idiomatic_expressions: dict[str, list[dict[str, str]]]
+     raw_data: dict[str, Any]
+
+     # Pre-built lookup sets for fast matching
+     british_vocabulary: set[str] = field(default_factory=set)
+     american_vocabulary: set[str] = field(default_factory=set)
+     british_spellings: set[str] = field(default_factory=set)
+     american_spellings: set[str] = field(default_factory=set)
+
+
+ def _compile_spelling_pattern(pattern_data: dict[str, Any]) -> CompiledSpellingPattern:
+     """Compile a single spelling pattern from JSON data.
+
+     Args:
+         pattern_data: Dictionary from spelling_patterns.british_american
+
+     Returns:
+         CompiledSpellingPattern with pre-compiled regexes
+     """
+     pattern_british = None
+     pattern_american = None
+
+     # Compile British pattern if present and not null
+     british_pattern_str = pattern_data.get("pattern_british")
+     if british_pattern_str is not None and isinstance(british_pattern_str, str):
+         try:
+             pattern_british = re.compile(british_pattern_str, re.IGNORECASE)
+         except re.error:
+             pass  # Skip invalid patterns
+
+     # Compile American pattern if present and not null
+     american_pattern_str = pattern_data.get("pattern_american")
+     if american_pattern_str is not None and isinstance(american_pattern_str, str):
+         try:
+             pattern_american = re.compile(american_pattern_str, re.IGNORECASE)
+         except re.error:
+             pass
+
+     # Extract exceptions as a set for fast lookup
+     exceptions = set(pattern_data.get("exceptions", []))
+
+     return CompiledSpellingPattern(
+         name=pattern_data.get("name", "unknown"),
+         description=pattern_data.get("description", ""),
+         pattern_british=pattern_british,
+         pattern_american=pattern_american,
+         weight=pattern_data.get("weight", 0.8),
+         feature_level=pattern_data.get("feature_level", "morphological"),
+         exceptions=exceptions,
+     )
+
+
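
[Reviewer note] Two properties of this helper follow directly from the code above and are easy to check interactively: missing fields fall back to defaults, and an invalid regex is swallowed rather than raised, leaving that side of the pattern as None. The sample dict below is made up for illustration:

    p = _compile_spelling_pattern(
        {
            "name": "our_or",
            "pattern_british": r"\b\w+our\b",
            "pattern_american": "(",  # invalid regex: compilation fails silently
            "exceptions": ["four", "tour"],
        }
    )
    assert p.pattern_british is not None
    assert p.pattern_american is None  # invalid pattern was skipped
    assert p.weight == 0.8             # default weight applied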
+ def _compile_grammar_pattern(name: str, pattern_data: dict[str, Any]) -> CompiledGrammarPattern:
+     """Compile a single grammar pattern from JSON data.
+
+     Args:
+         name: Pattern name (key from grammar_patterns)
+         pattern_data: Dictionary with pattern details
+
+     Returns:
+         CompiledGrammarPattern with pre-compiled regexes
+     """
+     pattern_british = None
+     pattern_american = None
+
+     # Compile British pattern if present
+     if "british_pattern" in pattern_data:
+         try:
+             pattern_british = re.compile(pattern_data["british_pattern"], re.IGNORECASE)
+         except re.error:
+             pass
+
+     # Compile American pattern if present
+     if "american_pattern" in pattern_data:
+         try:
+             pattern_american = re.compile(pattern_data["american_pattern"], re.IGNORECASE)
+         except re.error:
+             pass
+
+     return CompiledGrammarPattern(
+         name=name,
+         description=pattern_data.get("description", ""),
+         pattern_british=pattern_british,
+         pattern_american=pattern_american,
+         weight=pattern_data.get("weight", 0.8),
+     )
+
+
+ def _build_vocabulary_sets(markers: DialectMarkers) -> None:
+     """Build fast lookup sets from vocabulary pairs.
+
+     Populates the british_vocabulary, american_vocabulary, british_spellings,
+     and american_spellings sets for O(1) word lookup during detection.
+
+     Args:
+         markers: DialectMarkers to populate (modified in place)
+     """
+     # Build vocabulary sets from pairs
+     for pair in markers.vocabulary_pairs:
+         if "british" in pair:
+             markers.british_vocabulary.add(pair["british"].lower())
+         if "american" in pair:
+             markers.american_vocabulary.add(pair["american"].lower())
+
+     # Add exclusive vocabulary
+     for word in markers.vocabulary_exclusive.get("british", []):
+         markers.british_vocabulary.add(word.lower())
+     for word in markers.vocabulary_exclusive.get("american", []):
+         markers.american_vocabulary.add(word.lower())
+
+     # Build spelling sets from standalone spellings
+     for pair in markers.standalone_spellings:
+         if "british" in pair:
+             markers.british_spellings.add(pair["british"].lower())
+         if "american" in pair:
+             markers.american_spellings.add(pair["american"].lower())
+
+
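
[Reviewer note] The payoff of these pre-built sets is that per-token detection reduces to a couple of O(1) membership tests. A minimal sketch of that access pattern, assuming entries like "lorry" and "colour" are present in the data and using naive whitespace tokenization purely for illustration:

    markers = get_markers()
    tokens = "The lorry driver studied the colour chart".lower().split()
    british_hits = sum(
        1
        for t in tokens
        if t in markers.british_vocabulary or t in markers.british_spellings
    )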
+ @lru_cache(maxsize=1)
+ def load_dialect_markers() -> DialectMarkers:
+     """Load and compile dialect markers from JSON file.
+
+     This function is cached with lru_cache to ensure the JSON file is loaded
+     only once per Python session. The cache has maxsize=1 since there's only
+     one dialect markers file.
+
+     Related GitHub Issue:
+         #35 - Dialect detection with extensible JSON markers
+         https://github.com/craigtrim/pystylometry/issues/35
+
+     Returns:
+         DialectMarkers with all data loaded and patterns compiled
+
+     Raises:
+         FileNotFoundError: If dialect_markers.json doesn't exist
+         json.JSONDecodeError: If JSON is malformed
+     """
+     with open(_MARKERS_FILE, encoding="utf-8") as f:
+         data = json.load(f)
+
+     # Extract metadata
+     metadata = data.get("metadata", {})
+     version = metadata.get("version", "unknown")
+
+     # Extract vocabulary
+     vocabulary = data.get("vocabulary", {})
+     vocabulary_pairs = vocabulary.get("pairs", [])
+     vocabulary_exclusive = vocabulary.get("exclusive", {})
+
+     # Compile spelling patterns
+     spelling_data = data.get("spelling_patterns", {})
+     compiled_spelling = [
+         _compile_spelling_pattern(p) for p in spelling_data.get("british_american", [])
+     ]
+     standalone_spellings = spelling_data.get("standalone", [])
+
+     # Compile grammar patterns
+     grammar_data = data.get("grammar_patterns", {})
+     compiled_grammar = [
+         _compile_grammar_pattern(name, pdata) for name, pdata in grammar_data.items()
+     ]
+
+     # Extract eye dialect words
+     eye_dialect_data = data.get("eye_dialect", {})
+     eye_dialect_words = set()
+     for word in eye_dialect_data.get("informal_contractions", []):
+         eye_dialect_words.add(word.lower())
+     for word in eye_dialect_data.get("phonetic_spellings", []):
+         eye_dialect_words.add(word.lower())
+
+     # Extract pragmatic markers
+     pragmatic_markers = data.get("pragmatic_markers", {})
+
+     # Extract idiomatic expressions
+     idiomatic = data.get("idiomatic_expressions", {})
+
+     # Build the markers container
+     markers = DialectMarkers(
+         version=version,
+         vocabulary_pairs=vocabulary_pairs,
+         vocabulary_exclusive=vocabulary_exclusive,
+         spelling_patterns=compiled_spelling,
+         standalone_spellings=standalone_spellings,
+         grammar_patterns=compiled_grammar,
+         eye_dialect_words=eye_dialect_words,
+         pragmatic_markers=pragmatic_markers,
+         idiomatic_expressions=idiomatic,
+         raw_data=data,
+     )
+
+     # Build lookup sets for fast matching
+     _build_vocabulary_sets(markers)
+
+     return markers
+
+
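
[Reviewer note] Because of @lru_cache(maxsize=1), repeated calls return the very same object, so the parse-and-compile cost is paid once per process:

    a = load_dialect_markers()
    b = load_dialect_markers()
    assert a is b  # cached: the JSON is read and compiled only once

A consequence worth keeping in mind: since callers share one instance, mutating its sets or lists affects every other caller in the process.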
+ def get_markers() -> DialectMarkers:
+     """Get the cached dialect markers.
+
+     This is the primary entry point for accessing dialect markers. It returns
+     the cached markers from load_dialect_markers(), ensuring efficient access.
+
+     Example:
+         >>> markers = get_markers()
+         >>> len(markers.vocabulary_pairs)
+         165
+         >>> "colour" in markers.british_spellings
+         True
+
+     Returns:
+         DialectMarkers with all data loaded and patterns compiled
+     """
+     return load_dialect_markers()
+
+
+ def clear_cache() -> None:
+     """Clear the dialect markers cache.
+
+     This forces a reload of the JSON file on the next get_markers() call.
+     Useful for testing or when the JSON file has been modified.
+     """
+     load_dialect_markers.cache_clear()
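
[Reviewer note] In a test that edits the JSON on disk, the reload cycle would look roughly like this (how the file is modified is up to the test harness):

    stale = get_markers()
    # ... modify pystylometry/dialect/_data/dialect_markers.json on disk ...
    clear_cache()
    fresh = get_markers()
    assert fresh is not stale  # a new object was loaded and compiled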