pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
"""Dialect marker data loading and caching.
|
|
2
|
+
|
|
3
|
+
This module provides efficient loading and caching of the dialect markers JSON
|
|
4
|
+
database. The JSON file contains vocabulary pairs, spelling patterns, grammar
|
|
5
|
+
patterns, and other linguistic markers used for dialect detection.
|
|
6
|
+
|
|
7
|
+
Related GitHub Issues:
|
|
8
|
+
#35 - Dialect detection with extensible JSON markers
|
|
9
|
+
https://github.com/craigtrim/pystylometry/issues/35
|
|
10
|
+
#30 - Whonix stylometry features (regional linguistic preferences)
|
|
11
|
+
https://github.com/craigtrim/pystylometry/issues/30
|
|
12
|
+
|
|
13
|
+
Architecture:
|
|
14
|
+
The loader uses module-level caching to ensure the JSON file is read only
|
|
15
|
+
once per Python session. This is important for performance when analyzing
|
|
16
|
+
multiple texts, as the dialect markers database is moderately large (~50KB).
|
|
17
|
+
|
|
18
|
+
The loader also pre-compiles regex patterns from the JSON to avoid repeated
|
|
19
|
+
compilation overhead during detection.
|
|
20
|
+
|
|
21
|
+
Data Structure:
|
|
22
|
+
The dialect_markers.json file follows an extensible schema with:
|
|
23
|
+
- metadata: Version, sources, last updated date
|
|
24
|
+
- feature_levels: Linguistic level categorization (phonological, etc.)
|
|
25
|
+
- eye_dialect: Informal register markers (gonna, wanna)
|
|
26
|
+
- pragmatic_markers: Discourse and politeness markers
|
|
27
|
+
- vocabulary.pairs: American/British word pairs with categories
|
|
28
|
+
- vocabulary.exclusive: Region-specific vocabulary
|
|
29
|
+
- spelling_patterns.british_american: Regex patterns with weights
|
|
30
|
+
- spelling_patterns.standalone: Direct word pairs
|
|
31
|
+
- grammar_patterns: Grammar difference patterns
|
|
32
|
+
- punctuation_patterns: Punctuation conventions
|
|
33
|
+
- idiomatic_expressions: Idioms by dialect
|
|
34
|
+
|
|
35
|
+
References:
|
|
36
|
+
Nerbonne, John. "Data-Driven Dialectology." Language and Linguistics
|
|
37
|
+
Compass, vol. 3, no. 1, 2009, pp. 175-198.
|
|
38
|
+
Grieve, Jack. "Quantitative Authorship Attribution: An Evaluation of
|
|
39
|
+
Techniques." Literary and Linguistic Computing, vol. 22, no. 3,
|
|
40
|
+
2007, pp. 251-270.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
import json
|
|
46
|
+
import re
|
|
47
|
+
from dataclasses import dataclass, field
|
|
48
|
+
from functools import lru_cache
|
|
49
|
+
from pathlib import Path
|
|
50
|
+
from typing import Any
|
|
51
|
+
|
|
52
|
+
# Location of the bundled dialect markers database. The JSON file ships with
# the package under dialect/_data/, so it is resolved relative to this module
# rather than the working directory.
_DATA_DIR = Path(__file__).parent / "_data"
_MARKERS_FILE = _DATA_DIR / "dialect_markers.json"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class CompiledSpellingPattern:
    """Pre-compiled spelling pattern for efficient matching.

    Spelling patterns are compiled once at load time so detection never pays
    repeated regex-compilation overhead. Each pattern carries metadata for
    weighted scoring and linguistic-level categorization.

    Attributes:
        name: Pattern identifier (e.g., "our_or", "ise_ize").
        description: Human-readable description.
        pattern_british: Compiled regex for the British variant, or None when
            the JSON entry had no (valid) British pattern.
        pattern_american: Compiled regex for the American variant, or None when
            the JSON entry had no (valid) American pattern.
        weight: Diagnostic value 0.0-1.0 (higher = more distinctive).
        feature_level: Linguistic level (phonological, morphological, etc.).
        exceptions: Words that match the pattern but aren't dialect markers;
            stored as a set for O(1) membership checks during detection.
    """

    name: str
    description: str
    pattern_british: re.Pattern | None
    pattern_american: re.Pattern | None
    weight: float
    feature_level: str
    # Mutable default must go through field(default_factory=...) per dataclass rules.
    exceptions: set[str] = field(default_factory=set)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass
class CompiledGrammarPattern:
    """Pre-compiled grammar pattern for efficient matching.

    Grammar patterns detect syntactic differences such as "have got" vs
    "have", collective-noun agreement, and shall/will usage. Like spelling
    patterns, they are compiled once at load time.

    Attributes:
        name: Pattern identifier (e.g., "have_got", "gotten").
        description: Human-readable description.
        pattern_british: Compiled regex for the British variant (may be None).
        pattern_american: Compiled regex for the American variant (may be None).
        weight: Diagnostic value 0.0-1.0; defaults to 0.8 when the JSON entry
            omits a weight.
    """

    name: str
    description: str
    pattern_british: re.Pattern | None
    pattern_american: re.Pattern | None
    weight: float = 0.8
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@dataclass
class DialectMarkers:
    """Container for all loaded and compiled dialect markers.

    Holds the complete dialect marker database after loading and
    preprocessing: both the raw JSON data (for inspection) and the
    pre-compiled patterns and lookup sets (for efficient detection).

    Related GitHub Issue:
        #35 - Dialect detection with extensible JSON markers
        https://github.com/craigtrim/pystylometry/issues/35

    Attributes:
        version: Data version string taken from the JSON metadata.
        vocabulary_pairs: List of American/British word-pair dicts.
        vocabulary_exclusive: Region-specific vocabulary keyed by dialect.
        spelling_patterns: Pre-compiled spelling patterns.
        standalone_spellings: Direct British/American spelling-pair dicts.
        grammar_patterns: Pre-compiled grammar patterns.
        eye_dialect_words: Set of eye-dialect markers (gonna, wanna).
        pragmatic_markers: Discourse markers keyed by dialect.
        idiomatic_expressions: Idioms keyed by dialect.
        raw_data: Original parsed JSON for inspection/debugging.
        british_vocabulary: Lowercased British words for O(1) lookup.
        american_vocabulary: Lowercased American words for O(1) lookup.
        british_spellings: Lowercased British spellings for O(1) lookup.
        american_spellings: Lowercased American spellings for O(1) lookup.
    """

    version: str
    vocabulary_pairs: list[dict[str, str]]
    vocabulary_exclusive: dict[str, list[str]]
    spelling_patterns: list[CompiledSpellingPattern]
    standalone_spellings: list[dict[str, str]]
    grammar_patterns: list[CompiledGrammarPattern]
    eye_dialect_words: set[str]
    pragmatic_markers: dict[str, Any]
    idiomatic_expressions: dict[str, list[dict[str, str]]]
    raw_data: dict[str, Any]

    # Pre-built lookup sets for fast matching; populated by
    # _build_vocabulary_sets() after construction, hence the empty defaults.
    british_vocabulary: set[str] = field(default_factory=set)
    american_vocabulary: set[str] = field(default_factory=set)
    british_spellings: set[str] = field(default_factory=set)
    american_spellings: set[str] = field(default_factory=set)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _compile_spelling_pattern(pattern_data: dict[str, Any]) -> CompiledSpellingPattern:
    """Compile a single spelling pattern from JSON data.

    Missing, null, non-string, or syntactically invalid regexes leave the
    corresponding compiled pattern as None rather than raising, so one bad
    entry in the JSON database cannot disable dialect detection.

    Args:
        pattern_data: Dictionary from spelling_patterns.british_american.

    Returns:
        CompiledSpellingPattern with pre-compiled, case-insensitive regexes.
    """

    def _compile(key: str) -> re.Pattern | None:
        # isinstance() already rejects None, so no separate null check is
        # needed (the original `is not None and isinstance(...)` was redundant).
        value = pattern_data.get(key)
        if isinstance(value, str):
            try:
                return re.compile(value, re.IGNORECASE)
            except re.error:
                pass  # Skip invalid patterns; treat as absent.
        return None

    return CompiledSpellingPattern(
        name=pattern_data.get("name", "unknown"),
        description=pattern_data.get("description", ""),
        pattern_british=_compile("pattern_british"),
        pattern_american=_compile("pattern_american"),
        weight=pattern_data.get("weight", 0.8),
        feature_level=pattern_data.get("feature_level", "morphological"),
        # Exceptions become a set for O(1) membership tests during detection.
        exceptions=set(pattern_data.get("exceptions", [])),
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _compile_grammar_pattern(name: str, pattern_data: dict[str, Any]) -> CompiledGrammarPattern:
    """Compile a single grammar pattern from JSON data.

    Mirrors _compile_spelling_pattern: missing, null, non-string, or
    syntactically invalid regexes leave the corresponding compiled pattern
    as None instead of raising, so one bad JSON entry cannot break loading.

    Args:
        name: Pattern name (key from grammar_patterns).
        pattern_data: Dictionary with pattern details.

    Returns:
        CompiledGrammarPattern with pre-compiled, case-insensitive regexes.
    """

    def _compile(key: str) -> re.Pattern | None:
        value = pattern_data.get(key)
        # Bug fix: the original passed any present value straight to
        # re.compile(); a null/non-string JSON value raised TypeError, which
        # `except re.error` did not catch. Guard on isinstance instead.
        if isinstance(value, str):
            try:
                return re.compile(value, re.IGNORECASE)
            except re.error:
                pass  # Skip invalid patterns; treat as absent.
        return None

    return CompiledGrammarPattern(
        name=name,
        description=pattern_data.get("description", ""),
        pattern_british=_compile("british_pattern"),
        pattern_american=_compile("american_pattern"),
        weight=pattern_data.get("weight", 0.8),
    )
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _build_vocabulary_sets(markers: DialectMarkers) -> None:
|
|
228
|
+
"""Build fast lookup sets from vocabulary pairs.
|
|
229
|
+
|
|
230
|
+
Populates the british_vocabulary, american_vocabulary, british_spellings,
|
|
231
|
+
and american_spellings sets for O(1) word lookup during detection.
|
|
232
|
+
|
|
233
|
+
Args:
|
|
234
|
+
markers: DialectMarkers to populate (modified in place)
|
|
235
|
+
"""
|
|
236
|
+
# Build vocabulary sets from pairs
|
|
237
|
+
for pair in markers.vocabulary_pairs:
|
|
238
|
+
if "british" in pair:
|
|
239
|
+
markers.british_vocabulary.add(pair["british"].lower())
|
|
240
|
+
if "american" in pair:
|
|
241
|
+
markers.american_vocabulary.add(pair["american"].lower())
|
|
242
|
+
|
|
243
|
+
# Add exclusive vocabulary
|
|
244
|
+
for word in markers.vocabulary_exclusive.get("british", []):
|
|
245
|
+
markers.british_vocabulary.add(word.lower())
|
|
246
|
+
for word in markers.vocabulary_exclusive.get("american", []):
|
|
247
|
+
markers.american_vocabulary.add(word.lower())
|
|
248
|
+
|
|
249
|
+
# Build spelling sets from standalone spellings
|
|
250
|
+
for pair in markers.standalone_spellings:
|
|
251
|
+
if "british" in pair:
|
|
252
|
+
markers.british_spellings.add(pair["british"].lower())
|
|
253
|
+
if "american" in pair:
|
|
254
|
+
markers.american_spellings.add(pair["american"].lower())
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
@lru_cache(maxsize=1)
def load_dialect_markers() -> DialectMarkers:
    """Load and compile dialect markers from the bundled JSON file.

    Cached with lru_cache so the JSON file is parsed at most once per Python
    session; maxsize=1 suffices because there is only one markers file.

    Related GitHub Issue:
        #35 - Dialect detection with extensible JSON markers
        https://github.com/craigtrim/pystylometry/issues/35

    Returns:
        DialectMarkers with all data loaded and patterns compiled

    Raises:
        FileNotFoundError: If dialect_markers.json doesn't exist
        json.JSONDecodeError: If JSON is malformed
    """
    data = json.loads(_MARKERS_FILE.read_text(encoding="utf-8"))

    vocabulary = data.get("vocabulary", {})
    spelling_data = data.get("spelling_patterns", {})

    # Eye dialect spans informal contractions ("gonna") and phonetic
    # spellings; both categories feed a single lowercase lookup set.
    eye_dialect_data = data.get("eye_dialect", {})
    eye_dialect_words = {
        word.lower()
        for category in ("informal_contractions", "phonetic_spellings")
        for word in eye_dialect_data.get(category, [])
    }

    markers = DialectMarkers(
        version=data.get("metadata", {}).get("version", "unknown"),
        vocabulary_pairs=vocabulary.get("pairs", []),
        vocabulary_exclusive=vocabulary.get("exclusive", {}),
        # Regexes are compiled here, once, so detection never recompiles.
        spelling_patterns=[
            _compile_spelling_pattern(entry)
            for entry in spelling_data.get("british_american", [])
        ],
        standalone_spellings=spelling_data.get("standalone", []),
        grammar_patterns=[
            _compile_grammar_pattern(key, value)
            for key, value in data.get("grammar_patterns", {}).items()
        ],
        eye_dialect_words=eye_dialect_words,
        pragmatic_markers=data.get("pragmatic_markers", {}),
        idiomatic_expressions=data.get("idiomatic_expressions", {}),
        raw_data=data,
    )

    # Populate the O(1) lookup sets before handing the container out.
    _build_vocabulary_sets(markers)
    return markers
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def get_markers() -> DialectMarkers:
    """Return the cached dialect markers.

    Primary entry point for accessing the dialect marker database. Simply
    delegates to load_dialect_markers(), whose lru_cache guarantees the JSON
    file is parsed at most once per session.

    Example:
        >>> markers = get_markers()
        >>> len(markers.vocabulary_pairs)
        165
        >>> "colour" in markers.british_spellings
        True

    Returns:
        DialectMarkers with all data loaded and patterns compiled
    """
    markers = load_dialect_markers()
    return markers
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def clear_cache() -> None:
    """Discard the cached dialect markers.

    The next get_markers() call will re-read and re-compile the JSON file.
    Useful in tests, or after the JSON database has been edited on disk.
    """
    # lru_cache exposes cache_clear() on the wrapped loader.
    load_dialect_markers.cache_clear()
|