pystylometry 1.1.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

@@ -0,0 +1,506 @@
1
+ """Repetitive word and n-gram detection for verbal tics / slop analysis.
2
+
3
+ This module detects abnormally repetitive words and phrases in text — a common
4
+ pattern in AI-generated content ("slop") where certain content words and phrases
5
+ appear far more frequently than expected.
6
+
7
+ Generative models exhibit "verbal tics": they repeatedly use certain words and
8
+ phrases throughout generated text. Examples include "shimmered", "flickered",
9
+ "obsidian", "a testament to", "an uncomfortable truth". These patterns differ
10
+ from natural human writing where content words appear when contextually relevant,
11
+ repetition clusters around specific scenes or topics, and unusual words don't
12
+ appear with suspiciously even distribution.
13
+
14
+ Two functions are provided:
15
+
16
+ compute_repetitive_unigrams:
17
+ Compares observed word frequencies against the British National Corpus
18
+ (BNC, ~100M tokens) baseline. Words that appear far more than their
19
+ BNC relative frequency predicts are flagged.
20
+
21
+ compute_repetitive_ngrams:
22
+ Detects content n-grams (bigrams, trigrams, etc.) that repeat more
23
+ than expected. No external corpus is required — content n-grams should
24
+ not repeat verbatim often in natural writing.
25
+
26
+ Both functions support chunked analysis to reveal distribution patterns:
27
+ - Even distribution across text = suspicious (model's consistent tic)
28
+ - Clustered distribution = likely intentional (human describing a scene)
29
+
30
+ Related GitHub Issue:
31
+ #28 - Verbal tics detection for slop analysis
32
+ https://github.com/craigtrim/pystylometry/issues/28
33
+
34
+ Dependencies:
35
+ - bnc-lookup >= 1.3.0 (optional, in lexical group)
36
+ Provides expected_count() and bucket() for BNC baseline comparison.
37
+
38
+ References:
39
+ British National Corpus Consortium. (2007). The British National Corpus,
40
+ version 3 (BNC XML Edition). http://www.natcorp.ox.ac.uk/
41
+ Kilgarriff, A. (2001). BNC database and word frequency lists.
42
+ https://www.kilgarriff.co.uk/bnc-readme.html
43
+ """
44
+
45
+ from __future__ import annotations
46
+
47
+ import math
48
+ import statistics
49
+ from collections import Counter
50
+
51
+ from .._types import (
52
+ Distribution,
53
+ RepetitiveNgram,
54
+ RepetitiveNgramsResult,
55
+ RepetitiveUnigramsResult,
56
+ RepetitiveWord,
57
+ chunk_text,
58
+ make_distribution,
59
+ )
60
+ from .._utils import check_optional_dependency, tokenize
61
+ from .function_words import (
62
+ AUXILIARIES,
63
+ CONJUNCTIONS,
64
+ DETERMINERS,
65
+ PARTICLES,
66
+ PREPOSITIONS,
67
+ PRONOUNS,
68
+ )
69
+
70
+ # Union of all function word sets — used to filter out non-content words
71
+ _FUNCTION_WORDS = DETERMINERS | PREPOSITIONS | CONJUNCTIONS | PRONOUNS | AUXILIARIES | PARTICLES
72
+
73
+
74
+ def _chunk_entropy(chunk_counts: list[int]) -> float:
75
+ """Compute Shannon entropy of a word's distribution across chunks.
76
+
77
+ Entropy measures how evenly a word is distributed across chunks.
78
+ High entropy means the word is spread evenly across chunks (suspicious for rare words).
79
+ Low entropy means the word is concentrated in specific chunks (natural).
80
+
81
+ Formula:
82
+ H = -sum(p_i * log2(p_i)) for each chunk i where p_i > 0
83
+ p_i = count_in_chunk_i / total_count
84
+
85
+ Args:
86
+ chunk_counts: Per-chunk occurrence counts.
87
+
88
+ Returns:
89
+ Shannon entropy in bits. 0.0 if the word appears in only one chunk.
90
+ Returns 0.0 for empty or all-zero counts.
91
+ """
92
+ total = sum(chunk_counts)
93
+ if total == 0:
94
+ return 0.0
95
+
96
+ entropy = 0.0
97
+ for count in chunk_counts:
98
+ if count > 0:
99
+ p = count / total
100
+ entropy -= p * math.log2(p)
101
+
102
+ return entropy
103
+
104
+
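Editorial note: a small standalone check (not part of the module) that makes the entropy heuristic concrete. It uses the same formula as `_chunk_entropy` above and contrasts an evenly spread word with a clustered one.

```python
import math

def shannon_entropy(chunk_counts):
    """Same formula as _chunk_entropy above: H = -sum(p_i * log2(p_i))."""
    total = sum(chunk_counts)
    if total == 0:
        return 0.0
    entropy = 0.0
    for count in chunk_counts:
        if count > 0:
            p = count / total
            entropy -= p * math.log2(p)
    return entropy

# A word spread evenly over 8 chunks: maximal entropy, log2(8) = 3.0 bits.
print(shannon_entropy([3, 3, 3, 3, 3, 3, 3, 3]))   # 3.0

# The same 24 occurrences packed into one chunk (e.g. a single scene): 0.0 bits.
print(shannon_entropy([24, 0, 0, 0, 0, 0, 0, 0]))  # 0.0
```

For a rare content word, the even-spread (high entropy) case is the one that looks like a model's tic rather than a scene-specific choice.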
105
+ def _tokenize_content_words(text: str) -> list[str]:
106
+ """Tokenize text and return only lowercase alphabetic content words.
107
+
108
+ Filters out:
109
+ - Non-alphabetic tokens (punctuation, numbers)
110
+ - Function words (determiners, prepositions, conjunctions,
111
+ pronouns, auxiliaries, particles)
112
+
113
+ Args:
114
+ text: Input text.
115
+
116
+ Returns:
117
+ List of lowercase content word tokens.
118
+ """
119
+ tokens = tokenize(text.lower())
120
+ return [t for t in tokens if t.isalpha() and t not in _FUNCTION_WORDS]
121
+
122
+
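Editorial note: a quick sketch of the filtering step, using a naive regex split and a tiny word set as stand-ins for the package's own `tokenize` helper and `_FUNCTION_WORDS` (neither is fully shown in this diff).

```python
import re

FUNCTION_WORDS = {"the", "a", "an", "of", "to", "was"}  # tiny stand-in for _FUNCTION_WORDS

def content_words(text: str) -> list[str]:
    # Lowercase, keep alphabetic tokens, drop function words — mirrors
    # _tokenize_content_words above, with a regex stand-in for tokenize().
    tokens = re.findall(r"[a-z]+", text.lower())
    return [t for t in tokens if t not in FUNCTION_WORDS]

print(content_words("The obsidian tower shimmered, a testament to the craft."))
# ['obsidian', 'tower', 'shimmered', 'testament', 'craft']
```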
123
+ def compute_repetitive_unigrams(
124
+ text: str,
125
+ threshold: float = 3.0,
126
+ chunk_size: int = 1000,
127
+ min_count: int = 3,
128
+ ) -> RepetitiveUnigramsResult:
129
+ """Detect content words that repeat far more than expected based on BNC frequencies.
130
+
131
+ For each content word in the text, computes:
132
+ expected_count = BNC_relative_frequency(word) * total_content_words
133
+ repetition_score = observed_count / expected_count
134
+
135
+ Words exceeding the threshold score and minimum count are flagged.
136
+
137
+ This function uses native chunked analysis to capture distribution patterns
138
+ across the text. Words that are evenly distributed (high entropy) are more
139
+ suspicious than words clustered in specific sections.
140
+
141
+ Related GitHub Issue:
142
+ #28 - Verbal tics detection for slop analysis
143
+ https://github.com/craigtrim/pystylometry/issues/28
144
+
145
+ References:
146
+ British National Corpus Consortium. (2007). The British National Corpus,
147
+ version 3 (BNC XML Edition). http://www.natcorp.ox.ac.uk/
148
+
149
+ Args:
150
+ text: Input text to analyze.
151
+ threshold: Minimum repetition_score (observed/expected) to flag a word.
152
+ Default 3.0 means the word must appear at least 3x more than expected.
153
+ chunk_size: Number of words per chunk for distribution analysis (default: 1000).
154
+ min_count: Minimum observed count to flag a word. Prevents flagging words
155
+ that appear only once or twice, which aren't meaningfully repetitive
156
+ regardless of their score. Default: 3.
157
+ Returns:
158
+ RepetitiveUnigramsResult with flagged words, aggregate scores, and metadata.
159
+
160
+ Example:
161
+ >>> result = compute_repetitive_unigrams(novel_text)
162
+ >>> for w in result.repetitive_words[:5]:
163
+ ... print(f"{w.word}: {w.count}x (expected {w.expected_count:.1f}, "
164
+ ... f"score {w.repetition_score:.1f})")
165
+ shimmered: 23x (expected 0.1, score 266.2)
166
+ obsidian: 18x (expected 0.0, score 450.0)
167
+ >>> print(f"Slop score: {result.slop_score:.1f}")
168
+ Slop score: 42.7
169
+ """
170
+ check_optional_dependency("bnc_lookup", "lexical")
171
+
172
+ from bnc_lookup import bucket as bnc_bucket # type: ignore[import-untyped]
173
+ from bnc_lookup import expected_count as bnc_expected_count # type: ignore[import-untyped]
174
+
175
+ # Chunk the text
176
+ chunks = chunk_text(text, chunk_size)
177
+
178
+ # Tokenize each chunk into content words
179
+ chunk_tokens: list[list[str]] = [_tokenize_content_words(chunk) for chunk in chunks]
180
+
181
+ # Count content words per chunk
182
+ chunk_counters: list[Counter[str]] = [Counter(tokens) for tokens in chunk_tokens]
183
+ content_words_per_chunk = [len(tokens) for tokens in chunk_tokens]
184
+
185
+ # Build global content word counts
186
+ global_counter: Counter[str] = Counter()
187
+ for counter in chunk_counters:
188
+ global_counter.update(counter)
189
+
190
+ total_content_words = sum(global_counter.values())
191
+
192
+ # Handle empty text
193
+ if total_content_words == 0:
194
+ empty_dist = Distribution(
195
+ values=[],
196
+ mean=float("nan"),
197
+ median=float("nan"),
198
+ std=0.0,
199
+ range=0.0,
200
+ iqr=0.0,
201
+ )
202
+ return RepetitiveUnigramsResult(
203
+ repetitive_words=[],
204
+ total_content_words=0,
205
+ flagged_count=0,
206
+ flagged_words_per_10k=0.0,
207
+ mean_repetition_score=0.0,
208
+ slop_score=0.0,
209
+ total_content_words_dist=empty_dist,
210
+ chunk_size=chunk_size,
211
+ chunk_count=len(chunks),
212
+ metadata={"threshold": threshold, "min_count": min_count},
213
+ )
214
+
215
+ # Evaluate each content word against BNC baseline
216
+ flagged: list[RepetitiveWord] = []
217
+
218
+ for word, observed in global_counter.items():
219
+ if observed < min_count:
220
+ continue
221
+
222
+ # Get BNC expected count for this word given our text length
223
+ expected = bnc_expected_count(word, total_content_words)
224
+ word_bucket = bnc_bucket(word)
225
+
226
+ if expected is None or expected == 0.0:
227
+ # Word not in BNC or has zero expected frequency
228
+ # Any repeated occurrence is notable
229
+ score = float("inf")
230
+ expected_val = 0.0
231
+ else:
232
+ expected_val = expected
233
+ score = observed / expected_val
234
+
235
+ if score >= threshold:
236
+ # Build per-chunk counts for this word
237
+ per_chunk = [counter.get(word, 0) for counter in chunk_counters]
238
+ entropy = _chunk_entropy(per_chunk)
239
+ variance = statistics.variance(per_chunk) if len(per_chunk) > 1 else 0.0
240
+
241
+ flagged.append(
242
+ RepetitiveWord(
243
+ word=word,
244
+ count=observed,
245
+ expected_count=expected_val,
246
+ repetition_score=score,
247
+ bnc_bucket=word_bucket,
248
+ chunk_counts=per_chunk,
249
+ distribution_entropy=entropy,
250
+ distribution_variance=variance,
251
+ )
252
+ )
253
+
254
+ # Sort by repetition_score descending (inf scores map to a huge sentinel key so they sort first; ties broken by count)
255
+ flagged.sort(
256
+ key=lambda w: (
257
+ -w.repetition_score if w.repetition_score != float("inf") else -1e18,
258
+ -w.count,
259
+ )
260
+ )
261
+
262
+ # Compute aggregate metrics
263
+ flagged_count = len(flagged)
264
+ flagged_words_per_10k = (
265
+ flagged_count / (total_content_words / 10_000) if total_content_words > 0 else 0.0
266
+ )
267
+
268
+ # Mean repetition score (exclude inf for meaningful average)
269
+ finite_scores = [w.repetition_score for w in flagged if w.repetition_score != float("inf")]
270
+ mean_rep_score = statistics.mean(finite_scores) if finite_scores else 0.0
271
+
272
+ slop_score = flagged_words_per_10k * mean_rep_score
273
+
274
+ # Content words distribution
275
+ content_dist = (
276
+ make_distribution([float(c) for c in content_words_per_chunk])
277
+ if content_words_per_chunk
278
+ else Distribution(
279
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
280
+ )
281
+ )
282
+
283
+ return RepetitiveUnigramsResult(
284
+ repetitive_words=flagged,
285
+ total_content_words=total_content_words,
286
+ flagged_count=flagged_count,
287
+ flagged_words_per_10k=flagged_words_per_10k,
288
+ mean_repetition_score=mean_rep_score,
289
+ slop_score=slop_score,
290
+ total_content_words_dist=content_dist,
291
+ chunk_size=chunk_size,
292
+ chunk_count=len(chunks),
293
+ metadata={
294
+ "threshold": threshold,
295
+ "min_count": min_count,
296
+ "total_unique_content_words": len(global_counter),
297
+ "inf_score_count": sum(1 for w in flagged if w.repetition_score == float("inf")),
298
+ },
299
+ )
300
+
301
+
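Editorial note: a numeric sketch of the scoring arithmetic used in `compute_repetitive_unigrams` above. All numbers are made up for illustration; the real `expected` value would come from `bnc_lookup.expected_count()`.

```python
# Made-up numbers purely to illustrate the arithmetic in the function above.
observed = 23            # times "shimmered" appears in the text
expected = 0.0864        # hypothetical BNC-based expected count for this text size
threshold = 3.0
min_count = 3

repetition_score = observed / expected                    # ~266.2
flagged = observed >= min_count and repetition_score >= threshold

# Aggregates, as computed at the end of compute_repetitive_unigrams:
total_content_words = 48_000
flagged_count = 12
flagged_words_per_10k = flagged_count / (total_content_words / 10_000)   # 2.5
mean_repetition_score = 17.1             # mean over finite scores only (inf excluded)
slop_score = flagged_words_per_10k * mean_repetition_score               # 42.75

print(round(repetition_score, 1), flagged, round(slop_score, 2))
# 266.2 True 42.75
```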
302
+ def _validate_n(n: int | tuple[int, ...]) -> tuple[int, ...]:
303
+ """Validate and normalize the n-gram order parameter.
304
+
305
+ Args:
306
+ n: Single integer or tuple of integers specifying n-gram orders.
307
+
308
+ Returns:
309
+ Sorted tuple of unique valid n-gram orders.
310
+
311
+ Raises:
312
+ ValueError: If any value is outside the range [2, 5] or input is empty.
313
+ """
314
+ values: tuple[int, ...]
315
+ if isinstance(n, int):
316
+ values = (n,)
317
+ else:
318
+ values = tuple(sorted(set(n)))
319
+
320
+ if not values:
321
+ raise ValueError("n must specify at least one n-gram order.")
322
+
323
+ for v in values:
324
+ if v < 2:
325
+ raise ValueError(
326
+ f"n-gram order {v} is too small. Minimum is 2 (bigrams). "
327
+ f"For single-word repetition, use compute_repetitive_unigrams() instead."
328
+ )
329
+ if v > 5:
330
+ raise ValueError(
331
+ f"n-gram order {v} is too large. Maximum is 5. "
332
+ f"N-grams of order 6+ are too sparse to produce meaningful repetition "
333
+ f"signals in typical texts (they rarely repeat even once)."
334
+ )
335
+
336
+ return values
337
+
338
+
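Editorial note: the normalization performed by `_validate_n`, shown in isolation so the duplicate/ordering behaviour is visible at a glance.

```python
# The normalization step used above, on its own: duplicates are dropped and
# orders are sorted, so (3, 2, 2) and (2, 3) request the same analysis.
n = (3, 2, 2)
values = tuple(sorted(set(n)))
print(values)          # (2, 3)

# _validate_n then rejects anything outside [2, 5]: below 2 points you to
# compute_repetitive_unigrams, above 5 is considered too sparse to be useful.
for v in values:
    assert 2 <= v <= 5
```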
339
+ def _is_content_ngram(ngram: tuple[str, ...]) -> bool:
340
+ """Check if an n-gram contains at least one content word.
341
+
342
+ An n-gram composed entirely of function words (e.g., "of the", "in a")
343
+ is expected to repeat and should not be flagged.
344
+
345
+ Args:
346
+ ngram: Tuple of words.
347
+
348
+ Returns:
349
+ True if at least one word is not a function word.
350
+ """
351
+ return any(word not in _FUNCTION_WORDS for word in ngram)
352
+
353
+
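Editorial note: the filtering rule of `_is_content_ngram`, demonstrated with a tiny stand-in function-word set (the real `_FUNCTION_WORDS` union is much larger).

```python
# An n-gram is kept only if at least one of its words is not a function word.
function_words = {"of", "the", "in", "a", "an", "to"}   # illustrative subset

def is_content_ngram(ngram):
    return any(word not in function_words for word in ngram)

print(is_content_ngram(("of", "the")))                 # False — pure function words
print(is_content_ngram(("uncomfortable", "truth")))    # True  — content phrase
print(is_content_ngram(("a", "testament", "to")))      # True  — mixed, kept
```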
354
+ def compute_repetitive_ngrams(
355
+ text: str,
356
+ n: int | tuple[int, ...] = (2, 3),
357
+ chunk_size: int = 1000,
358
+ min_count: int = 3,
359
+ ) -> RepetitiveNgramsResult:
360
+ """Detect content n-grams that repeat more than expected within the text.
361
+
362
+ Content n-grams (bigrams, trigrams, etc.) should rarely repeat verbatim in
363
+ natural writing. This function flags n-grams that exceed a length-scaled
364
+ threshold, filtering out n-grams composed entirely of function words.
365
+
366
+ No external corpus is required — the threshold is computed internally based
367
+ on text length. Any content n-gram appearing at least
368
+ max(min_count, total_ngrams // 10_000) times is flagged.
369
+
370
+ Related GitHub Issue:
371
+ #28 - Verbal tics detection for slop analysis
372
+ https://github.com/craigtrim/pystylometry/issues/28
373
+
374
+ Args:
375
+ text: Input text to analyze.
376
+ n: N-gram order(s) to analyze. Can be a single integer (e.g., 2 for
377
+ bigrams) or a tuple of integers (e.g., (2, 3) for bigrams and
378
+ trigrams). Valid range: 2 to 5. Default: (2, 3).
379
+ - Values below 2 are rejected (use compute_repetitive_unigrams
380
+ for single words).
381
+ - Values above 5 are rejected (n-grams of order 6+ are too sparse
382
+ to produce meaningful repetition signals).
383
+ chunk_size: Number of words per chunk for distribution analysis (default: 1000).
384
+ min_count: Minimum count to flag an n-gram. Default: 3.
385
+
386
+ Returns:
387
+ RepetitiveNgramsResult with flagged n-grams, counts, and metadata.
388
+
389
+ Example:
390
+ >>> result = compute_repetitive_ngrams(text, n=2)
391
+ >>> for ng in result.repetitive_ngrams[:5]:
392
+ ... print(f"{' '.join(ng.ngram)}: {ng.count}x")
393
+ uncomfortable truth: 8x
394
+ >>> result = compute_repetitive_ngrams(text, n=(2, 3, 4))
395
+ >>> print(f"Flagged: {result.flagged_count} n-grams")
396
+ """
397
+ # Validate n parameter
398
+ n_values = _validate_n(n)
399
+
400
+ # Chunk the text
401
+ chunks = chunk_text(text, chunk_size)
402
+
403
+ # Tokenize each chunk — lowercase alpha only (but keep function words
404
+ # so n-grams spanning content+function words are preserved; we filter
405
+ # all-function-word n-grams separately)
406
+ chunk_tokens: list[list[str]] = []
407
+ for chunk in chunks:
408
+ tokens = tokenize(chunk.lower())
409
+ chunk_tokens.append([t for t in tokens if t.isalpha()])
410
+
411
+ # Build n-grams per chunk for each requested order
412
+ # chunk_ngram_counters[chunk_idx] aggregates across all n values
413
+ chunk_ngram_counters: list[Counter[tuple[str, ...]]] = [Counter() for _ in chunks]
414
+ total_ngram_count = 0
415
+
416
+ for chunk_idx, tokens in enumerate(chunk_tokens):
417
+ for nv in n_values:
418
+ for i in range(len(tokens) - nv + 1):
419
+ ngram = tuple(tokens[i : i + nv])
420
+ if _is_content_ngram(ngram):
421
+ chunk_ngram_counters[chunk_idx][ngram] += 1
422
+ total_ngram_count += 1
423
+
424
+ # Build global counts
425
+ global_ngram_counter: Counter[tuple[str, ...]] = Counter()
426
+ for counter in chunk_ngram_counters:
427
+ global_ngram_counter.update(counter)
428
+
429
+ # Determine threshold: any content n-gram appearing at least this many times is flagged
430
+ length_threshold = max(min_count, total_ngram_count // 10_000)
431
+
432
+ # Handle empty text
433
+ if total_ngram_count == 0:
434
+ empty_dist = Distribution(
435
+ values=[],
436
+ mean=float("nan"),
437
+ median=float("nan"),
438
+ std=0.0,
439
+ range=0.0,
440
+ iqr=0.0,
441
+ )
442
+ return RepetitiveNgramsResult(
443
+ repetitive_ngrams=[],
444
+ n=n,
445
+ total_ngrams=0,
446
+ flagged_count=0,
447
+ flagged_per_10k=0.0,
448
+ total_ngrams_dist=empty_dist,
449
+ chunk_size=chunk_size,
450
+ chunk_count=len(chunks),
451
+ metadata={"min_count": min_count, "effective_threshold": length_threshold},
452
+ )
453
+
454
+ # Flag n-grams exceeding threshold
455
+ flagged: list[RepetitiveNgram] = []
456
+
457
+ for ngram, count in global_ngram_counter.items():
458
+ if count >= length_threshold:
459
+ per_chunk = [counter.get(ngram, 0) for counter in chunk_ngram_counters]
460
+ entropy = _chunk_entropy(per_chunk)
461
+ variance = statistics.variance(per_chunk) if len(per_chunk) > 1 else 0.0
462
+ freq_per_10k = count / (total_ngram_count / 10_000) if total_ngram_count > 0 else 0.0
463
+
464
+ flagged.append(
465
+ RepetitiveNgram(
466
+ ngram=ngram,
467
+ count=count,
468
+ frequency_per_10k=freq_per_10k,
469
+ chunk_counts=per_chunk,
470
+ distribution_entropy=entropy,
471
+ distribution_variance=variance,
472
+ )
473
+ )
474
+
475
+ # Sort by count descending
476
+ flagged.sort(key=lambda ng: -ng.count)
477
+
478
+ flagged_count = len(flagged)
479
+ flagged_per_10k = flagged_count / (total_ngram_count / 10_000) if total_ngram_count > 0 else 0.0
480
+
481
+ # N-grams per chunk distribution
482
+ ngrams_per_chunk = [sum(counter.values()) for counter in chunk_ngram_counters]
483
+ ngrams_dist = (
484
+ make_distribution([float(c) for c in ngrams_per_chunk])
485
+ if ngrams_per_chunk
486
+ else Distribution(
487
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
488
+ )
489
+ )
490
+
491
+ return RepetitiveNgramsResult(
492
+ repetitive_ngrams=flagged,
493
+ n=n,
494
+ total_ngrams=total_ngram_count,
495
+ flagged_count=flagged_count,
496
+ flagged_per_10k=flagged_per_10k,
497
+ total_ngrams_dist=ngrams_dist,
498
+ chunk_size=chunk_size,
499
+ chunk_count=len(chunks),
500
+ metadata={
501
+ "min_count": min_count,
502
+ "effective_threshold": length_threshold,
503
+ "n_values": list(n_values),
504
+ "total_unique_ngrams": len(global_ngram_counter),
505
+ },
506
+ )
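Editorial note: how the length-scaled flagging threshold in `compute_repetitive_ngrams` behaves at different text sizes. The counts are illustrative, not measured; the formula is copied from the code above.

```python
# length_threshold = max(min_count, total_ngram_count // 10_000), per the code above.
min_count = 3
for total_ngram_count in (2_000, 40_000, 250_000):
    length_threshold = max(min_count, total_ngram_count // 10_000)
    print(total_ngram_count, "->", length_threshold)
# 2000 -> 3      (short text: the min_count floor applies)
# 40000 -> 4     (longer text: a content n-gram must repeat at least 4 times)
# 250000 -> 25
```

In other words, for anything under ~40k content n-grams the `min_count` floor dominates, and the threshold only starts scaling with length on long texts.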
@@ -0,0 +1,18 @@
1
+ # ngrams
2
+
3
+ ![4 public functions](https://img.shields.io/badge/functions-4-blue)
4
+ ![No external deps](https://img.shields.io/badge/deps-none-brightgreen)
5
+
6
+ N-gram generation, entropy computation, and sequence analysis.
7
+
8
+ ## Catalogue
9
+
10
+ | File | Functions | What It Measures |
11
+ |------|-----------|-----------------|
12
+ | `entropy.py` | `compute_ngram_entropy`, `compute_character_bigram_entropy`, `compute_word_bigram_entropy` | Shannon entropy at character and word n-gram levels |
13
+ | `extended_ngrams.py` | `compute_extended_ngrams` | Word, character, and POS n-gram profiles with frequency distributions |
14
+
15
+ ## See Also
16
+
17
+ - [`syntactic/`](../syntactic/) provides POS tags consumed by `compute_extended_ngrams(text, pos=True)`
18
+ - [`character/`](../character/) for character-level features without n-gram structure
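
A hypothetical usage sketch follows; the import path and the single-argument signature of `compute_word_bigram_entropy` are assumptions not confirmed by this README (only the `compute_extended_ngrams(text, pos=True)` call is shown above).

```python
from pystylometry.ngrams import compute_extended_ngrams, compute_word_bigram_entropy  # assumed import path

text = open("sample.txt", encoding="utf-8").read()

# Word-bigram Shannon entropy (assumed to take just the text).
print(compute_word_bigram_entropy(text))

# Word/character/POS n-gram profiles; pos=True matches the "See Also" note above.
print(compute_extended_ngrams(text, pos=True))
```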