pystylometry 1.3.1__py3-none-any.whl → 1.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +42 -3
- pystylometry/_types.py +53 -3
- pystylometry/cli.py +695 -0
- pystylometry/lexical/__init__.py +4 -1
- pystylometry/lexical/bnc_frequency.py +309 -0
- pystylometry/lexical/ttr.py +288 -97
- pystylometry/viz/jsx/__init__.py +2 -0
- pystylometry/viz/jsx/bnc_frequency.py +495 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/METADATA +16 -3
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/RECORD +13 -11
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/entry_points.txt +2 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/LICENSE +0 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/WHEEL +0 -0
pystylometry/lexical/__init__.py
CHANGED
@@ -2,20 +2,23 @@
 
 # Local implementations
 from .advanced_diversity import compute_hdd, compute_mattr, compute_msttr, compute_vocd_d
+from .bnc_frequency import compute_bnc_frequency
 from .function_words import compute_function_words
 from .hapax import compute_hapax_ratios, compute_hapax_with_lexicon_analysis
 from .mtld import compute_mtld
 from .repetition import compute_repetitive_ngrams, compute_repetitive_unigrams
-from .ttr import compute_ttr
+from .ttr import TTRAggregator, compute_ttr
 from .word_frequency_sophistication import compute_word_frequency_sophistication
 from .yule import compute_yule
 
 __all__ = [
     "compute_ttr",
+    "TTRAggregator",
     "compute_mtld",
     "compute_yule",
     "compute_hapax_ratios",
     "compute_hapax_with_lexicon_analysis",
+    "compute_bnc_frequency",
     "compute_function_words",
     "compute_vocd_d",
     "compute_mattr",
pystylometry/lexical/bnc_frequency.py
ADDED
@@ -0,0 +1,309 @@
+"""BNC (British National Corpus) frequency analysis for stylometric comparison.
+
+This module computes word frequency ratios by comparing observed word frequencies
+in a text against expected frequencies from the British National Corpus (BNC).
+Words can be categorized as:
+- Overused: appear more frequently than expected (ratio > 1)
+- Underused: appear less frequently than expected (ratio < 1)
+- Not in BNC: words that don't exist in the BNC corpus
+
+Related GitHub Issue:
+    #TBD - BNC frequency analysis CLI
+    https://github.com/craigtrim/pystylometry/issues/TBD
+
+References:
+    British National Corpus: http://www.natcorp.ox.ac.uk/
+    The BNC is a 100-million word collection of samples of written and spoken
+    language from a wide range of sources, designed to represent a wide
+    cross-section of British English from the late 20th century.
+"""
+
+from __future__ import annotations
+
+import re
+import unicodedata
+from collections import Counter
+from dataclasses import dataclass
+from typing import Literal
+
+from .._utils import check_optional_dependency
+
+# Unicode apostrophe variants to normalize to ASCII apostrophe (U+0027)
+# See: https://github.com/craigtrim/pystylometry/issues/45
+_APOSTROPHE_VARIANTS = (
+    "\u0060"  # GRAVE ACCENT
+    "\u00B4"  # ACUTE ACCENT
+    "\u2018"  # LEFT SINGLE QUOTATION MARK
+    "\u2019"  # RIGHT SINGLE QUOTATION MARK
+    "\u201B"  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
+    "\u2032"  # PRIME
+    "\u2035"  # REVERSED PRIME
+    "\u02B9"  # MODIFIER LETTER PRIME
+    "\u02BC"  # MODIFIER LETTER APOSTROPHE
+    "\u02C8"  # MODIFIER LETTER VERTICAL LINE
+    "\u0313"  # COMBINING COMMA ABOVE
+    "\u0315"  # COMBINING COMMA ABOVE RIGHT
+    "\u055A"  # ARMENIAN APOSTROPHE
+    "\u05F3"  # HEBREW PUNCTUATION GERESH
+    "\u07F4"  # NKO HIGH TONE APOSTROPHE
+    "\u07F5"  # NKO LOW TONE APOSTROPHE
+    "\uFF07"  # FULLWIDTH APOSTROPHE
+    "\u1FBF"  # GREEK PSILI
+    "\u1FBD"  # GREEK KORONIS
+    "\uA78C"  # LATIN SMALL LETTER SALTILLO
+)
+
+
+def _normalize_apostrophes(text: str) -> str:
+    """Normalize Unicode apostrophe variants to ASCII apostrophe.
+
+    Many texts (especially ebooks, PDFs, and word processor output) use
+    typographic "smart quotes" instead of ASCII apostrophes. This function
+    normalizes all variants to the standard ASCII apostrophe (U+0027) to
+    ensure consistent BNC lookups.
+
+    Args:
+        text: Input text potentially containing apostrophe variants
+
+    Returns:
+        Text with all apostrophe variants normalized to ASCII apostrophe
+
+    Example:
+        >>> _normalize_apostrophes("don’t")  # curly apostrophe
+        "don't"  # ASCII apostrophe
+    """
+    for char in _APOSTROPHE_VARIANTS:
+        text = text.replace(char, "'")
+    return text
+
+
+@dataclass
+class WordAnalysis:
+    """Analysis of a single word against BNC frequency.
+
+    Attributes:
+        word: The word being analyzed (lowercase)
+        observed: Number of times the word appears in the text
+        expected: Expected count based on BNC relative frequency
+        ratio: observed / expected (None if not in BNC)
+        in_wordnet: Whether the word exists in WordNet
+        char_type: Classification of character content
+    """
+
+    word: str
+    observed: int
+    expected: float | None
+    ratio: float | None
+    in_wordnet: bool | None
+    char_type: Literal["latin", "unicode", "numeric", "mixed", "punctuation"]
+
+
+@dataclass
+class BNCFrequencyResult:
+    """Result of BNC frequency analysis.
+
+    Attributes:
+        overused: Words appearing more frequently than expected (ratio > threshold)
+        underused: Words appearing less frequently than expected (ratio < threshold)
+        not_in_bnc: Words not found in the BNC corpus
+        total_tokens: Total word count in the text
+        unique_tokens: Number of unique words
+        overuse_threshold: Ratio above which words are considered overused
+        underuse_threshold: Ratio below which words are considered underused
+        metadata: Additional analysis metadata
+    """
+
+    overused: list[WordAnalysis]
+    underused: list[WordAnalysis]
+    not_in_bnc: list[WordAnalysis]
+    total_tokens: int
+    unique_tokens: int
+    overuse_threshold: float
+    underuse_threshold: float
+    metadata: dict
+
+
+def _classify_char_type(
+    word: str,
+) -> Literal["latin", "unicode", "numeric", "mixed", "punctuation"]:
+    """Classify the character content of a word.
+
+    Args:
+        word: Word to classify
+
+    Returns:
+        Character type classification:
+        - latin: Pure ASCII alphabetic characters (a-z, A-Z)
+        - unicode: Contains non-ASCII characters (accents, etc.)
+        - numeric: Contains only digits
+        - mixed: Contains letters and numbers or other combinations
+        - punctuation: Contains only punctuation
+    """
+    if not word:
+        return "punctuation"
+
+    has_ascii_alpha = bool(re.search(r"[a-zA-Z]", word))
+    has_unicode_alpha = any(unicodedata.category(c).startswith("L") and ord(c) > 127 for c in word)
+    has_digit = any(c.isdigit() for c in word)
+    has_punct = any(unicodedata.category(c).startswith("P") for c in word)
+
+    # Determine classification
+    if has_unicode_alpha:
+        return "unicode"
+    elif has_digit and not has_ascii_alpha:
+        return "numeric"
+    elif has_digit and has_ascii_alpha:
+        return "mixed"
+    elif has_ascii_alpha and not has_punct:
+        return "latin"
+    elif has_ascii_alpha and has_punct:
+        return "mixed"
+    elif not has_ascii_alpha and not has_digit:
+        return "punctuation"
+    else:
+        return "mixed"
+
+
+def compute_bnc_frequency(
+    text: str,
+    overuse_threshold: float = 1.3,
+    underuse_threshold: float = 0.8,
+    include_wordnet: bool = True,
+    min_mentions: int = 1,
+) -> BNCFrequencyResult:
+    """Compute BNC frequency analysis for a text.
+
+    Compares observed word frequencies against expected frequencies from the
+    British National Corpus. Words are categorized as overused, underused,
+    or not in BNC based on their frequency ratios.
+
+    Args:
+        text: Input text to analyze
+        overuse_threshold: Ratio above which words are considered overused (default: 1.3)
+        underuse_threshold: Ratio below which words are considered underused (default: 0.8)
+        include_wordnet: Whether to check WordNet for unknown words (default: True)
+        min_mentions: Minimum number of mentions to include word (default: 1)
+
+    Returns:
+        BNCFrequencyResult with categorized word lists
+
+    Raises:
+        ImportError: If bnc-lookup package is not installed
+
+    Example:
+        >>> result = compute_bnc_frequency("The captain ordered the larboard watch...")
+        >>> result.overused[:3]  # Top 3 overused words
+        [WordAnalysis(word='larboard', ratio=33153.5, ...), ...]
+        >>> result.not_in_bnc[:3]  # Words not in BNC
+        [WordAnalysis(word='xyzbot', ...), ...]
+    """
+    # Check dependency
+    check_optional_dependency("bnc_lookup", "lexical")
+    from bnc_lookup import relative_frequency  # type: ignore[import-untyped]
+
+    # Optional wordnet lookup
+    wordnet_checker = None
+    if include_wordnet:
+        try:
+            from wordnet_lookup import (
+                is_wordnet_term as _is_wordnet_term,  # type: ignore[import-untyped]
+            )
+
+            wordnet_checker = _is_wordnet_term
+        except ImportError:
+            # WordNet lookup is optional
+            pass
+
+    # Tokenize text (simple whitespace + punctuation stripping)
+    # First normalize apostrophes to ensure consistent BNC lookups (Issue #45)
+    normalized_text = _normalize_apostrophes(text)
+    raw_tokens = normalized_text.split()
+    tokens = []
+    for raw in raw_tokens:
+        # Strip leading/trailing punctuation, lowercase
+        cleaned = re.sub(r"^[^\w]+|[^\w]+$", "", raw).lower()
+        if cleaned:
+            tokens.append(cleaned)
+
+    total_tokens = len(tokens)
+
+    # Count observed frequency of each word
+    observed = Counter(tokens)
+    unique_words = list(observed.keys())
+
+    # Get BNC relative frequencies (one at a time - bnc_lookup doesn't have batch)
+    bnc_freqs = {word: relative_frequency(word) for word in unique_words}
+
+    # Analyze each word
+    overused: list[WordAnalysis] = []
+    underused: list[WordAnalysis] = []
+    not_in_bnc: list[WordAnalysis] = []
+
+    for word, obs_count in observed.items():
+        if obs_count < min_mentions:
+            continue
+
+        # Classify character type
+        char_type = _classify_char_type(word)
+
+        # Get BNC frequency
+        rel_freq = bnc_freqs.get(word)
+
+        # Check WordNet if requested
+        in_wordnet = None
+        if wordnet_checker is not None:
+            in_wordnet = wordnet_checker(word)
+
+        if rel_freq is None or rel_freq == 0:
+            # Word not in BNC
+            analysis = WordAnalysis(
+                word=word,
+                observed=obs_count,
+                expected=None,
+                ratio=None,
+                in_wordnet=in_wordnet,
+                char_type=char_type,
+            )
+            not_in_bnc.append(analysis)
+        else:
+            # Compute expected count and ratio
+            expected = rel_freq * total_tokens
+            ratio = obs_count / expected if expected > 0 else None
+
+            analysis = WordAnalysis(
+                word=word,
+                observed=obs_count,
+                expected=expected,
+                ratio=ratio,
+                in_wordnet=in_wordnet,
+                char_type=char_type,
+            )
+
+            if ratio is not None:
+                if ratio > overuse_threshold:
+                    overused.append(analysis)
+                elif ratio < underuse_threshold:
+                    underused.append(analysis)
+
+    # Sort by ratio (highest first for overused, lowest first for underused)
+    overused.sort(key=lambda x: x.ratio or 0, reverse=True)
+    underused.sort(key=lambda x: x.ratio or float("inf"))
+    # Sort not_in_bnc by observed count
+    not_in_bnc.sort(key=lambda x: x.observed, reverse=True)
+
+    return BNCFrequencyResult(
+        overused=overused,
+        underused=underused,
+        not_in_bnc=not_in_bnc,
+        total_tokens=total_tokens,
+        unique_tokens=len(unique_words),
+        overuse_threshold=overuse_threshold,
+        underuse_threshold=underuse_threshold,
+        metadata={
+            "include_wordnet": include_wordnet,
+            "min_mentions": min_mentions,
+            "overused_count": len(overused),
+            "underused_count": len(underused),
+            "not_in_bnc_count": len(not_in_bnc),
+        },
+    )
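The categorization in the new module reduces to one ratio test per word: expected = relative_frequency(word) * total_tokens, and ratio = observed / expected, with ratio > 1.3 marking overuse and ratio < 0.8 marking underuse by default. A usage sketch against this API (assuming the optional bnc-lookup dependency is installed; the sample text is illustrative, not actual output):

    from pystylometry.lexical import compute_bnc_frequency

    text = "The captain ordered the larboard watch to the larboard rail."
    result = compute_bnc_frequency(text, include_wordnet=False)

    # Worked example of the ratio arithmetic: if "larboard" appears 2 times
    # in 10 tokens and its BNC relative frequency were 1e-7, then
    # expected = 1e-7 * 10 = 1e-6 and ratio = 2 / 1e-6 = 2e6 -> overused.
    for wa in result.overused[:3]:
        print(wa.word, wa.observed, wa.expected, wa.ratio)

    # Words absent from the BNC entirely, most frequent first
    for wa in result.not_in_bnc[:3]:
        print(wa.word, wa.observed, wa.char_type)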