pystylometry-1.3.0-py3-none-any.whl → pystylometry-1.3.5-py3-none-any.whl
- pystylometry/__init__.py +42 -3
- pystylometry/_types.py +205 -3
- pystylometry/cli.py +321 -0
- pystylometry/lexical/__init__.py +5 -1
- pystylometry/lexical/repetition.py +506 -0
- pystylometry/lexical/ttr.py +288 -97
- pystylometry-1.3.5.dist-info/LICENSE +21 -0
- pystylometry-1.3.5.dist-info/METADATA +78 -0
- {pystylometry-1.3.0.dist-info → pystylometry-1.3.5.dist-info}/RECORD +11 -9
- {pystylometry-1.3.0.dist-info → pystylometry-1.3.5.dist-info}/WHEEL +1 -1
- {pystylometry-1.3.0.dist-info → pystylometry-1.3.5.dist-info}/entry_points.txt +1 -0
- pystylometry-1.3.0.dist-info/METADATA +0 -136
pystylometry/lexical/repetition.py (new file)
@@ -0,0 +1,506 @@
"""Repetitive word and n-gram detection for verbal tics / slop analysis.

This module detects abnormally repetitive words and phrases in text — a common
pattern in AI-generated content ("slop") where certain content words and phrases
appear far more frequently than expected.

Generative models exhibit "verbal tics": they repeatedly use certain words and
phrases throughout generated text. Examples include "shimmered", "flickered",
"obsidian", "a testament to", "an uncomfortable truth". These patterns differ
from natural human writing, where content words appear when contextually relevant,
repetition clusters around specific scenes or topics, and unusual words don't
appear with suspiciously even distribution.

Two functions are provided:

compute_repetitive_unigrams:
    Compares observed word frequencies against the British National Corpus
    (BNC, ~100M tokens) baseline. Words that appear far more than their
    BNC relative frequency predicts are flagged.

compute_repetitive_ngrams:
    Detects content n-grams (bigrams, trigrams, etc.) that repeat more
    than expected. No external corpus is required — content n-grams should
    not repeat verbatim often in natural writing.

Both functions support chunked analysis to reveal distribution patterns:
- Even distribution across text = suspicious (model's consistent tic)
- Clustered distribution = likely intentional (human describing a scene)

Related GitHub Issue:
    #28 - Verbal tics detection for slop analysis
    https://github.com/craigtrim/pystylometry/issues/28

Dependencies:
    - bnc-lookup >= 1.3.0 (optional, in lexical group)
      Provides expected_count() and bucket() for BNC baseline comparison.

References:
    British National Corpus Consortium. (2007). The British National Corpus,
    version 3 (BNC XML Edition). http://www.natcorp.ox.ac.uk/
    Kilgarriff, A. (2001). BNC database and word frequency lists.
    https://www.kilgarriff.co.uk/bnc-readme.html
"""

from __future__ import annotations

import math
import statistics
from collections import Counter

from .._types import (
    Distribution,
    RepetitiveNgram,
    RepetitiveNgramsResult,
    RepetitiveUnigramsResult,
    RepetitiveWord,
    chunk_text,
    make_distribution,
)
from .._utils import check_optional_dependency, tokenize
from .function_words import (
    AUXILIARIES,
    CONJUNCTIONS,
    DETERMINERS,
    PARTICLES,
    PREPOSITIONS,
    PRONOUNS,
)

# Union of all function word sets — used to filter out non-content words
_FUNCTION_WORDS = DETERMINERS | PREPOSITIONS | CONJUNCTIONS | PRONOUNS | AUXILIARIES | PARTICLES


def _chunk_entropy(chunk_counts: list[int]) -> float:
    """Compute Shannon entropy of a word's distribution across chunks.

    Entropy measures how evenly a word is distributed across chunks.
    High entropy means the word appears evenly across chunks (suspicious for rare words).
    Low entropy means the word is concentrated in specific chunks (natural).

    Formula:
        H = -sum(p_i * log2(p_i)) for each chunk i where p_i > 0
        p_i = count_in_chunk_i / total_count

    Args:
        chunk_counts: Per-chunk occurrence counts.

    Returns:
        Shannon entropy in bits. 0.0 if the word appears in only one chunk.
        Returns 0.0 for empty or all-zero counts.
    """
    total = sum(chunk_counts)
    if total == 0:
        return 0.0

    entropy = 0.0
    for count in chunk_counts:
        if count > 0:
            p = count / total
            entropy -= p * math.log2(p)

    return entropy
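# For intuition: a word spread evenly over four chunks, e.g. chunk_counts
# [5, 5, 5, 5], scores log2(4) = 2.0 bits, while one concentrated in a single
# chunk, e.g. [20, 0, 0, 0], scores 0.0 bits.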


def _tokenize_content_words(text: str) -> list[str]:
    """Tokenize text and return only lowercase alphabetic content words.

    Filters out:
    - Non-alphabetic tokens (punctuation, numbers)
    - Function words (determiners, prepositions, conjunctions,
      pronouns, auxiliaries, particles)

    Args:
        text: Input text.

    Returns:
        List of lowercase content word tokens.
    """
    tokens = tokenize(text.lower())
    return [t for t in tokens if t.isalpha() and t not in _FUNCTION_WORDS]


def compute_repetitive_unigrams(
    text: str,
    threshold: float = 3.0,
    chunk_size: int = 1000,
    min_count: int = 3,
) -> RepetitiveUnigramsResult:
    """Detect content words that repeat far more than expected based on BNC frequencies.

    For each content word in the text, computes:
        expected_count = BNC_relative_frequency(word) * text_length
        repetition_score = observed_count / expected_count

    Words exceeding the threshold score and minimum count are flagged.

    This function uses native chunked analysis to capture distribution patterns
    across the text. Words that are evenly distributed (high entropy) are more
    suspicious than words clustered in specific sections.

    Related GitHub Issue:
        #28 - Verbal tics detection for slop analysis
        https://github.com/craigtrim/pystylometry/issues/28

    References:
        British National Corpus Consortium. (2007). The British National Corpus,
        version 3 (BNC XML Edition). http://www.natcorp.ox.ac.uk/

    Args:
        text: Input text to analyze.
        threshold: Minimum repetition_score (observed/expected) to flag a word.
            Default 3.0 means the word must appear at least 3x more than expected.
        chunk_size: Number of words per chunk for distribution analysis (default: 1000).
        min_count: Minimum observed count to flag a word. Prevents flagging words
            that appear only once or twice, which aren't meaningfully repetitive
            regardless of their score. Default: 3.

    Returns:
        RepetitiveUnigramsResult with flagged words, aggregate scores, and metadata.

    Example:
        >>> result = compute_repetitive_unigrams(novel_text)
        >>> for w in result.repetitive_words[:5]:
        ...     print(f"{w.word}: {w.count}x (expected {w.expected_count:.1f}, "
        ...           f"score {w.repetition_score:.1f})")
        shimmered: 23x (expected 0.1, score 266.2)
        obsidian: 18x (expected 0.0, score 450.0)
        >>> print(f"Slop score: {result.slop_score:.1f}")
        Slop score: 42.7
    """
    check_optional_dependency("bnc_lookup", "lexical")

    from bnc_lookup import bucket as bnc_bucket  # type: ignore[import-untyped]
    from bnc_lookup import expected_count as bnc_expected_count  # type: ignore[import-untyped]

    # Chunk the text
    chunks = chunk_text(text, chunk_size)

    # Tokenize each chunk into content words
    chunk_tokens: list[list[str]] = [_tokenize_content_words(chunk) for chunk in chunks]

    # Count content words per chunk
    chunk_counters: list[Counter[str]] = [Counter(tokens) for tokens in chunk_tokens]
    content_words_per_chunk = [len(tokens) for tokens in chunk_tokens]

    # Build global content word counts
    global_counter: Counter[str] = Counter()
    for counter in chunk_counters:
        global_counter.update(counter)

    total_content_words = sum(global_counter.values())

    # Handle empty text
    if total_content_words == 0:
        empty_dist = Distribution(
            values=[],
            mean=float("nan"),
            median=float("nan"),
            std=0.0,
            range=0.0,
            iqr=0.0,
        )
        return RepetitiveUnigramsResult(
            repetitive_words=[],
            total_content_words=0,
            flagged_count=0,
            flagged_words_per_10k=0.0,
            mean_repetition_score=0.0,
            slop_score=0.0,
            total_content_words_dist=empty_dist,
            chunk_size=chunk_size,
            chunk_count=len(chunks),
            metadata={"threshold": threshold, "min_count": min_count},
        )

    # Evaluate each content word against BNC baseline
    flagged: list[RepetitiveWord] = []

    for word, observed in global_counter.items():
        if observed < min_count:
            continue

        # Get BNC expected count for this word given our text length
        expected = bnc_expected_count(word, total_content_words)
        word_bucket = bnc_bucket(word)

        if expected is None or expected == 0.0:
            # Word not in BNC or has zero expected frequency
            # Any repeated occurrence is notable
            score = float("inf")
            expected_val = 0.0
        else:
            expected_val = expected
            score = observed / expected_val

        if score >= threshold:
            # Build per-chunk counts for this word
            per_chunk = [counter.get(word, 0) for counter in chunk_counters]
            entropy = _chunk_entropy(per_chunk)
            variance = statistics.variance(per_chunk) if len(per_chunk) > 1 else 0.0

            flagged.append(
                RepetitiveWord(
                    word=word,
                    count=observed,
                    expected_count=expected_val,
                    repetition_score=score,
                    bnc_bucket=word_bucket,
                    chunk_counts=per_chunk,
                    distribution_entropy=entropy,
                    distribution_variance=variance,
                )
            )

    # Sort by repetition_score descending; infinite scores (words absent from
    # the BNC baseline) sort first, with count as the tiebreaker
    flagged.sort(
        key=lambda w: (
            -w.repetition_score if w.repetition_score != float("inf") else -1e18,
            -w.count,
        )
    )

    # Compute aggregate metrics
    flagged_count = len(flagged)
    flagged_words_per_10k = (
        flagged_count / (total_content_words / 10_000) if total_content_words > 0 else 0.0
    )

    # Mean repetition score (exclude inf for meaningful average)
    finite_scores = [w.repetition_score for w in flagged if w.repetition_score != float("inf")]
    mean_rep_score = statistics.mean(finite_scores) if finite_scores else 0.0

    slop_score = flagged_words_per_10k * mean_rep_score
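    # Note: slop_score scales breadth (flagged words per 10k content words) by
    # severity (mean finite repetition score), so a high value requires both
    # many flagged words and heavy overuse.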

    # Content words distribution
    content_dist = (
        make_distribution([float(c) for c in content_words_per_chunk])
        if content_words_per_chunk
        else Distribution(
            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
        )
    )

    return RepetitiveUnigramsResult(
        repetitive_words=flagged,
        total_content_words=total_content_words,
        flagged_count=flagged_count,
        flagged_words_per_10k=flagged_words_per_10k,
        mean_repetition_score=mean_rep_score,
        slop_score=slop_score,
        total_content_words_dist=content_dist,
        chunk_size=chunk_size,
        chunk_count=len(chunks),
        metadata={
            "threshold": threshold,
            "min_count": min_count,
            "total_unique_content_words": len(global_counter),
            "inf_score_count": sum(1 for w in flagged if w.repetition_score == float("inf")),
        },
    )


def _validate_n(n: int | tuple[int, ...]) -> tuple[int, ...]:
    """Validate and normalize the n-gram order parameter.

    Args:
        n: Single integer or tuple of integers specifying n-gram orders.

    Returns:
        Sorted tuple of unique valid n-gram orders.

    Raises:
        ValueError: If any value is outside the range [2, 5] or input is empty.
    """
    values: tuple[int, ...]
    if isinstance(n, int):
        values = (n,)
    else:
        values = tuple(sorted(set(n)))

    if not values:
        raise ValueError("n must specify at least one n-gram order.")

    for v in values:
        if v < 2:
            raise ValueError(
                f"n-gram order {v} is too small. Minimum is 2 (bigrams). "
                f"For single-word repetition, use compute_repetitive_unigrams() instead."
            )
        if v > 5:
            raise ValueError(
                f"n-gram order {v} is too large. Maximum is 5. "
                f"N-grams of order 6+ are too sparse to produce meaningful repetition "
                f"signals in typical texts (they rarely repeat even once)."
            )

    return values


def _is_content_ngram(ngram: tuple[str, ...]) -> bool:
    """Check if an n-gram contains at least one content word.

    An n-gram composed entirely of function words (e.g., "of the", "in a")
    is expected to repeat and should not be flagged.

    Args:
        ngram: Tuple of words.

    Returns:
        True if at least one word is not a function word.
    """
    return any(word not in _FUNCTION_WORDS for word in ngram)


def compute_repetitive_ngrams(
    text: str,
    n: int | tuple[int, ...] = (2, 3),
    chunk_size: int = 1000,
    min_count: int = 3,
) -> RepetitiveNgramsResult:
    """Detect content n-grams that repeat more than expected within the text.

    Content n-grams (bigrams, trigrams, etc.) should rarely repeat verbatim in
    natural writing. This function flags n-grams that exceed a length-scaled
    threshold, filtering out n-grams composed entirely of function words.

    No external corpus is required — the threshold is computed internally based
    on text length. Any content n-gram appearing at least
    max(min_count, total_ngrams / 10000) times is flagged.

    Related GitHub Issue:
        #28 - Verbal tics detection for slop analysis
        https://github.com/craigtrim/pystylometry/issues/28

    Args:
        text: Input text to analyze.
        n: N-gram order(s) to analyze. Can be a single integer (e.g., 2 for
            bigrams) or a tuple of integers (e.g., (2, 3) for bigrams and
            trigrams). Valid range: 2 to 5. Default: (2, 3).
            - Values below 2 are rejected (use compute_repetitive_unigrams
              for single words).
            - Values above 5 are rejected (n-grams of order 6+ are too sparse
              to produce meaningful repetition signals).
        chunk_size: Number of words per chunk for distribution analysis (default: 1000).
        min_count: Minimum count to flag an n-gram. Default: 3.

    Returns:
        RepetitiveNgramsResult with flagged n-grams, counts, and metadata.

    Example:
        >>> result = compute_repetitive_ngrams(text, n=2)
        >>> for ng in result.repetitive_ngrams[:5]:
        ...     print(f"{' '.join(ng.ngram)}: {ng.count}x")
        uncomfortable truth: 8x
        >>> result = compute_repetitive_ngrams(text, n=(2, 3, 4))
        >>> print(f"Flagged: {result.flagged_count} n-grams")
    """
    # Validate n parameter
    n_values = _validate_n(n)

    # Chunk the text
    chunks = chunk_text(text, chunk_size)

    # Tokenize each chunk — lowercase alpha only (but keep function words
    # so n-grams spanning content+function words are preserved; we filter
    # all-function-word n-grams separately)
    chunk_tokens: list[list[str]] = []
    for chunk in chunks:
        tokens = tokenize(chunk.lower())
        chunk_tokens.append([t for t in tokens if t.isalpha()])

    # Build n-grams per chunk for each requested order
    # chunk_ngram_counters[chunk_idx] aggregates across all n values
    chunk_ngram_counters: list[Counter[tuple[str, ...]]] = [Counter() for _ in chunks]
    total_ngram_count = 0

    for chunk_idx, tokens in enumerate(chunk_tokens):
        for nv in n_values:
            for i in range(len(tokens) - nv + 1):
                ngram = tuple(tokens[i : i + nv])
                if _is_content_ngram(ngram):
                    chunk_ngram_counters[chunk_idx][ngram] += 1
                    total_ngram_count += 1

    # Build global counts
    global_ngram_counter: Counter[tuple[str, ...]] = Counter()
    for counter in chunk_ngram_counters:
        global_ngram_counter.update(counter)

    # Determine threshold: any content n-gram appearing at least this many times is flagged
    length_threshold = max(min_count, total_ngram_count // 10_000)
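    # With the default min_count=3, the length-scaled term only takes over once
    # the text yields 40,000+ content n-gram instances (40_000 // 10_000 = 4).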

    # Handle empty text
    if total_ngram_count == 0:
        empty_dist = Distribution(
            values=[],
            mean=float("nan"),
            median=float("nan"),
            std=0.0,
            range=0.0,
            iqr=0.0,
        )
        return RepetitiveNgramsResult(
            repetitive_ngrams=[],
            n=n,
            total_ngrams=0,
            flagged_count=0,
            flagged_per_10k=0.0,
            total_ngrams_dist=empty_dist,
            chunk_size=chunk_size,
            chunk_count=len(chunks),
            metadata={"min_count": min_count, "effective_threshold": length_threshold},
        )

    # Flag n-grams exceeding threshold
    flagged: list[RepetitiveNgram] = []

    for ngram, count in global_ngram_counter.items():
        if count >= length_threshold:
            per_chunk = [counter.get(ngram, 0) for counter in chunk_ngram_counters]
            entropy = _chunk_entropy(per_chunk)
            variance = statistics.variance(per_chunk) if len(per_chunk) > 1 else 0.0
            freq_per_10k = count / (total_ngram_count / 10_000) if total_ngram_count > 0 else 0.0

            flagged.append(
                RepetitiveNgram(
                    ngram=ngram,
                    count=count,
                    frequency_per_10k=freq_per_10k,
                    chunk_counts=per_chunk,
                    distribution_entropy=entropy,
                    distribution_variance=variance,
                )
            )

    # Sort by count descending
    flagged.sort(key=lambda ng: -ng.count)

    flagged_count = len(flagged)
    flagged_per_10k = flagged_count / (total_ngram_count / 10_000) if total_ngram_count > 0 else 0.0

    # N-grams per chunk distribution
    ngrams_per_chunk = [sum(counter.values()) for counter in chunk_ngram_counters]
    ngrams_dist = (
        make_distribution([float(c) for c in ngrams_per_chunk])
        if ngrams_per_chunk
        else Distribution(
            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
        )
    )

    return RepetitiveNgramsResult(
        repetitive_ngrams=flagged,
        n=n,
        total_ngrams=total_ngram_count,
        flagged_count=flagged_count,
        flagged_per_10k=flagged_per_10k,
        total_ngrams_dist=ngrams_dist,
        chunk_size=chunk_size,
        chunk_count=len(chunks),
        metadata={
            "min_count": min_count,
            "effective_threshold": length_threshold,
            "n_values": list(n_values),
            "total_unique_ngrams": len(global_ngram_counter),
        },
    )
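For orientation, a minimal usage sketch of the two new public functions, based on the docstrings above. It assumes they are re-exported from pystylometry.lexical (the pystylometry/lexical/__init__.py change listed above suggests this) and that the optional bnc-lookup dependency from the lexical extra is installed; sample_text is a placeholder.

# Hypothetical usage sketch (not part of the packaged diff).
from pystylometry.lexical import compute_repetitive_ngrams, compute_repetitive_unigrams

sample_text = "..."  # placeholder: the text under analysis

# BNC-baselined unigram check (requires the optional bnc-lookup dependency).
uni = compute_repetitive_unigrams(sample_text, threshold=3.0, chunk_size=1000, min_count=3)
for w in uni.repetitive_words[:5]:
    print(f"{w.word}: {w.count}x (expected {w.expected_count:.1f}, score {w.repetition_score:.1f})")
print(f"Slop score: {uni.slop_score:.1f}")

# Corpus-free n-gram check; bigrams and trigrams by default.
ngrams = compute_repetitive_ngrams(sample_text, n=(2, 3), min_count=3)
for ng in ngrams.repetitive_ngrams[:5]:
    print(f"{' '.join(ng.ngram)}: {ng.count}x ({ng.frequency_per_10k:.1f} per 10k)")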