pystylometry 1.3.1__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,20 +2,23 @@
 
  # Local implementations
  from .advanced_diversity import compute_hdd, compute_mattr, compute_msttr, compute_vocd_d
+ from .bnc_frequency import compute_bnc_frequency
  from .function_words import compute_function_words
  from .hapax import compute_hapax_ratios, compute_hapax_with_lexicon_analysis
  from .mtld import compute_mtld
  from .repetition import compute_repetitive_ngrams, compute_repetitive_unigrams
- from .ttr import compute_ttr
+ from .ttr import TTRAggregator, compute_ttr
  from .word_frequency_sophistication import compute_word_frequency_sophistication
  from .yule import compute_yule
 
  __all__ = [
      "compute_ttr",
+     "TTRAggregator",
      "compute_mtld",
      "compute_yule",
      "compute_hapax_ratios",
      "compute_hapax_with_lexicon_analysis",
+     "compute_bnc_frequency",
      "compute_function_words",
      "compute_vocd_d",
      "compute_mattr",
@@ -0,0 +1,309 @@
+ """BNC (British National Corpus) frequency analysis for stylometric comparison.
+
+ This module computes word frequency ratios by comparing observed word frequencies
+ in a text against expected frequencies from the British National Corpus (BNC).
+ Words can be categorized as:
+ - Overused: appear more frequently than expected (ratio > 1)
+ - Underused: appear less frequently than expected (ratio < 1)
+ - Not in BNC: words that don't exist in the BNC corpus
+
+ Related GitHub Issue:
+     #TBD - BNC frequency analysis CLI
+     https://github.com/craigtrim/pystylometry/issues/TBD
+
+ References:
+     British National Corpus: http://www.natcorp.ox.ac.uk/
+     The BNC is a 100-million word collection of samples of written and spoken
+     language from a wide range of sources, designed to represent a wide
+     cross-section of British English from the late 20th century.
+ """
+
+ from __future__ import annotations
+
+ import re
+ import unicodedata
+ from collections import Counter
+ from dataclasses import dataclass
+ from typing import Literal
+
+ from .._utils import check_optional_dependency
+
+ # Unicode apostrophe variants to normalize to ASCII apostrophe (U+0027)
+ # See: https://github.com/craigtrim/pystylometry/issues/45
+ _APOSTROPHE_VARIANTS = (
+     "\u0060"  # GRAVE ACCENT
+     "\u00B4"  # ACUTE ACCENT
+     "\u2018"  # LEFT SINGLE QUOTATION MARK
+     "\u2019"  # RIGHT SINGLE QUOTATION MARK
+     "\u201B"  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
+     "\u2032"  # PRIME
+     "\u2035"  # REVERSED PRIME
+     "\u02B9"  # MODIFIER LETTER PRIME
+     "\u02BC"  # MODIFIER LETTER APOSTROPHE
+     "\u02C8"  # MODIFIER LETTER VERTICAL LINE
+     "\u0313"  # COMBINING COMMA ABOVE
+     "\u0315"  # COMBINING COMMA ABOVE RIGHT
+     "\u055A"  # ARMENIAN APOSTROPHE
+     "\u05F3"  # HEBREW PUNCTUATION GERESH
+     "\u07F4"  # NKO HIGH TONE APOSTROPHE
+     "\u07F5"  # NKO LOW TONE APOSTROPHE
+     "\uFF07"  # FULLWIDTH APOSTROPHE
+     "\u1FBF"  # GREEK PSILI
+     "\u1FBD"  # GREEK KORONIS
+     "\uA78C"  # LATIN SMALL LETTER SALTILLO
+ )
+
+
+ def _normalize_apostrophes(text: str) -> str:
+     """Normalize Unicode apostrophe variants to ASCII apostrophe.
+
+     Many texts (especially ebooks, PDFs, and word processor output) use
+     typographic "smart quotes" instead of ASCII apostrophes. This function
+     normalizes all variants to the standard ASCII apostrophe (U+0027) to
+     ensure consistent BNC lookups.
+
+     Args:
+         text: Input text potentially containing apostrophe variants
+
+     Returns:
+         Text with all apostrophe variants normalized to ASCII apostrophe
+
+     Example:
+         >>> _normalize_apostrophes("don’t")  # curly apostrophe
+         "don't"  # ASCII apostrophe
+     """
+     for char in _APOSTROPHE_VARIANTS:
+         text = text.replace(char, "'")
+     return text
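
A side note on the constant this loop iterates: the parenthesized literals in _APOSTROPHE_VARIANTS concatenate into a single string, so the loop replaces one character per iteration. A single-pass alternative using str.translate, shown purely as an illustration and not as the package's code, would look like this:

    # Illustrative one-pass equivalent of _normalize_apostrophes (not part of the package)
    _APOSTROPHE_TABLE = str.maketrans({c: "'" for c in _APOSTROPHE_VARIANTS})

    def normalize_apostrophes_fast(text: str) -> str:
        return text.translate(_APOSTROPHE_TABLE)
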
+
+
+ @dataclass
+ class WordAnalysis:
+     """Analysis of a single word against BNC frequency.
+
+     Attributes:
+         word: The word being analyzed (lowercase)
+         observed: Number of times the word appears in the text
+         expected: Expected count based on BNC relative frequency
+         ratio: observed / expected (None if not in BNC)
+         in_wordnet: Whether the word exists in WordNet
+         char_type: Classification of character content
+     """
+
+     word: str
+     observed: int
+     expected: float | None
+     ratio: float | None
+     in_wordnet: bool | None
+     char_type: Literal["latin", "unicode", "numeric", "mixed", "punctuation"]
+
+
+ @dataclass
+ class BNCFrequencyResult:
+     """Result of BNC frequency analysis.
+
+     Attributes:
+         overused: Words appearing more frequently than expected (ratio > threshold)
+         underused: Words appearing less frequently than expected (ratio < threshold)
+         not_in_bnc: Words not found in the BNC corpus
+         total_tokens: Total word count in the text
+         unique_tokens: Number of unique words
+         overuse_threshold: Ratio above which words are considered overused
+         underuse_threshold: Ratio below which words are considered underused
+         metadata: Additional analysis metadata
+     """
+
+     overused: list[WordAnalysis]
+     underused: list[WordAnalysis]
+     not_in_bnc: list[WordAnalysis]
+     total_tokens: int
+     unique_tokens: int
+     overuse_threshold: float
+     underuse_threshold: float
+     metadata: dict
+
+
+ def _classify_char_type(
+     word: str,
+ ) -> Literal["latin", "unicode", "numeric", "mixed", "punctuation"]:
+     """Classify the character content of a word.
+
+     Args:
+         word: Word to classify
+
+     Returns:
+         Character type classification:
+         - latin: Pure ASCII alphabetic characters (a-z, A-Z)
+         - unicode: Contains non-ASCII characters (accents, etc.)
+         - numeric: Contains only digits
+         - mixed: Contains letters and numbers or other combinations
+         - punctuation: Contains only punctuation
+     """
+     if not word:
+         return "punctuation"
+
+     has_ascii_alpha = bool(re.search(r"[a-zA-Z]", word))
+     has_unicode_alpha = any(unicodedata.category(c).startswith("L") and ord(c) > 127 for c in word)
+     has_digit = any(c.isdigit() for c in word)
+     has_punct = any(unicodedata.category(c).startswith("P") for c in word)
+
+     # Determine classification
+     if has_unicode_alpha:
+         return "unicode"
+     elif has_digit and not has_ascii_alpha:
+         return "numeric"
+     elif has_digit and has_ascii_alpha:
+         return "mixed"
+     elif has_ascii_alpha and not has_punct:
+         return "latin"
+     elif has_ascii_alpha and has_punct:
+         return "mixed"
+     elif not has_ascii_alpha and not has_digit:
+         return "punctuation"
+     else:
+         return "mixed"
+
+
+ def compute_bnc_frequency(
+     text: str,
+     overuse_threshold: float = 1.3,
+     underuse_threshold: float = 0.8,
+     include_wordnet: bool = True,
+     min_mentions: int = 1,
+ ) -> BNCFrequencyResult:
+     """Compute BNC frequency analysis for a text.
+
+     Compares observed word frequencies against expected frequencies from the
+     British National Corpus. Words are categorized as overused, underused,
+     or not in BNC based on their frequency ratios.
+
+     Args:
+         text: Input text to analyze
+         overuse_threshold: Ratio above which words are considered overused (default: 1.3)
+         underuse_threshold: Ratio below which words are considered underused (default: 0.8)
+         include_wordnet: Whether to check WordNet for unknown words (default: True)
+         min_mentions: Minimum number of mentions to include a word (default: 1)
+
+     Returns:
+         BNCFrequencyResult with categorized word lists
+
+     Raises:
+         ImportError: If the bnc-lookup package is not installed
+
+     Example:
+         >>> result = compute_bnc_frequency("The captain ordered the larboard watch...")
+         >>> result.overused[:3]  # Top 3 overused words
+         [WordAnalysis(word='larboard', ratio=33153.5, ...), ...]
+         >>> result.not_in_bnc[:3]  # Words not in BNC
+         [WordAnalysis(word='xyzbot', ...), ...]
+     """
+     # Check dependency
+     check_optional_dependency("bnc_lookup", "lexical")
+     from bnc_lookup import relative_frequency  # type: ignore[import-untyped]
+
+     # Optional WordNet lookup
+     wordnet_checker = None
+     if include_wordnet:
+         try:
+             from wordnet_lookup import (
+                 is_wordnet_term as _is_wordnet_term,  # type: ignore[import-untyped]
+             )
+
+             wordnet_checker = _is_wordnet_term
+         except ImportError:
+             # WordNet lookup is optional
+             pass
+
+     # Tokenize text (simple whitespace split + punctuation stripping)
+     # First normalize apostrophes to ensure consistent BNC lookups (Issue #45)
+     normalized_text = _normalize_apostrophes(text)
+     raw_tokens = normalized_text.split()
+     tokens = []
+     for raw in raw_tokens:
+         # Strip leading/trailing punctuation, lowercase
+         cleaned = re.sub(r"^[^\w]+|[^\w]+$", "", raw).lower()
+         if cleaned:
+             tokens.append(cleaned)
+
+     total_tokens = len(tokens)
+
+     # Count observed frequency of each word
+     observed = Counter(tokens)
+     unique_words = list(observed.keys())
+
+     # Get BNC relative frequencies (one at a time - bnc_lookup doesn't have batch lookups)
+     bnc_freqs = {word: relative_frequency(word) for word in unique_words}
+
+     # Analyze each word
+     overused: list[WordAnalysis] = []
+     underused: list[WordAnalysis] = []
+     not_in_bnc: list[WordAnalysis] = []
+
+     for word, obs_count in observed.items():
+         if obs_count < min_mentions:
+             continue
+
+         # Classify character type
+         char_type = _classify_char_type(word)
+
+         # Get BNC frequency
+         rel_freq = bnc_freqs.get(word)
+
+         # Check WordNet if requested
+         in_wordnet = None
+         if wordnet_checker is not None:
+             in_wordnet = wordnet_checker(word)
+
+         if rel_freq is None or rel_freq == 0:
+             # Word not in BNC
+             analysis = WordAnalysis(
+                 word=word,
+                 observed=obs_count,
+                 expected=None,
+                 ratio=None,
+                 in_wordnet=in_wordnet,
+                 char_type=char_type,
+             )
+             not_in_bnc.append(analysis)
+         else:
+             # Compute expected count and ratio
+             expected = rel_freq * total_tokens
+             ratio = obs_count / expected if expected > 0 else None
+
+             analysis = WordAnalysis(
+                 word=word,
+                 observed=obs_count,
+                 expected=expected,
+                 ratio=ratio,
+                 in_wordnet=in_wordnet,
+                 char_type=char_type,
+             )
+
+             if ratio is not None:
+                 if ratio > overuse_threshold:
+                     overused.append(analysis)
+                 elif ratio < underuse_threshold:
+                     underused.append(analysis)
+
+     # Sort by ratio (highest first for overused, lowest first for underused)
+     overused.sort(key=lambda x: x.ratio or 0, reverse=True)
+     underused.sort(key=lambda x: x.ratio or float("inf"))
+     # Sort not_in_bnc by observed count
+     not_in_bnc.sort(key=lambda x: x.observed, reverse=True)
+
+     return BNCFrequencyResult(
+         overused=overused,
+         underused=underused,
+         not_in_bnc=not_in_bnc,
+         total_tokens=total_tokens,
+         unique_tokens=len(unique_words),
+         overuse_threshold=overuse_threshold,
+         underuse_threshold=underuse_threshold,
+         metadata={
+             "include_wordnet": include_wordnet,
+             "min_mentions": min_mentions,
+             "overused_count": len(overused),
+             "underused_count": len(underused),
+             "not_in_bnc_count": len(not_in_bnc),
+         },
+     )
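
Finally, a hedged end-to-end sketch of how this new module might be driven. It assumes the optional bnc_lookup package (and wordnet_lookup, if WordNet checks are wanted) is installed, uses a hypothetical input file name, and guesses the absolute import path; nothing beyond the names defined above is taken from the package.

    # Illustrative driver for compute_bnc_frequency; file name and import path are assumptions.
    from pystylometry.lexical.bnc_frequency import compute_bnc_frequency

    with open("sample_text.txt", encoding="utf-8") as handle:
        result = compute_bnc_frequency(
            handle.read(),
            overuse_threshold=2.0,  # stricter than the 1.3 default
            min_mentions=3,         # ignore words seen fewer than three times
        )

    for wa in result.overused[:10]:
        print(f"{wa.word:>15}  observed={wa.observed}  expected={wa.expected:.2f}  ratio={wa.ratio:.1f}")
    print(result.metadata["not_in_bnc_count"], "words had no BNC entry")
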