aifingerprint 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aifingerprint-1.0.0/LICENSE +21 -0
- aifingerprint-1.0.0/PKG-INFO +23 -0
- aifingerprint-1.0.0/README.md +91 -0
- aifingerprint-1.0.0/pyproject.toml +35 -0
- aifingerprint-1.0.0/setup.cfg +4 -0
- aifingerprint-1.0.0/src/aifingerprint/__init__.py +5 -0
- aifingerprint-1.0.0/src/aifingerprint/__main__.py +5 -0
- aifingerprint-1.0.0/src/aifingerprint/analyzer.py +34 -0
- aifingerprint-1.0.0/src/aifingerprint/checks/__init__.py +27 -0
- aifingerprint-1.0.0/src/aifingerprint/checks/burstiness.py +68 -0
- aifingerprint-1.0.0/src/aifingerprint/checks/compression.py +64 -0
- aifingerprint-1.0.0/src/aifingerprint/checks/connectives.py +41 -0
- aifingerprint-1.0.0/src/aifingerprint/checks/formatting.py +51 -0
- aifingerprint-1.0.0/src/aifingerprint/checks/phrases.py +31 -0
- aifingerprint-1.0.0/src/aifingerprint/checks/punctuation.py +39 -0
- aifingerprint-1.0.0/src/aifingerprint/checks/rhythm.py +40 -0
- aifingerprint-1.0.0/src/aifingerprint/checks/structure.py +202 -0
- aifingerprint-1.0.0/src/aifingerprint/checks/tone.py +207 -0
- aifingerprint-1.0.0/src/aifingerprint/checks/vocabulary.py +41 -0
- aifingerprint-1.0.0/src/aifingerprint/cli.py +74 -0
- aifingerprint-1.0.0/src/aifingerprint/html.py +358 -0
- aifingerprint-1.0.0/src/aifingerprint/patterns.py +266 -0
- aifingerprint-1.0.0/src/aifingerprint/report.py +153 -0
- aifingerprint-1.0.0/src/aifingerprint/text.py +35 -0
- aifingerprint-1.0.0/src/aifingerprint.egg-info/PKG-INFO +23 -0
- aifingerprint-1.0.0/src/aifingerprint.egg-info/SOURCES.txt +27 -0
- aifingerprint-1.0.0/src/aifingerprint.egg-info/dependency_links.txt +1 -0
- aifingerprint-1.0.0/src/aifingerprint.egg-info/entry_points.txt +2 -0
- aifingerprint-1.0.0/src/aifingerprint.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Sander Buruma
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aifingerprint
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Detect AI writing patterns — scores text 0-100 for AI fingerprints
|
|
5
|
+
Author: Sander Buruma
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/sanderburuma/aifingerprint
|
|
8
|
+
Project-URL: Repository, https://github.com/sanderburuma/aifingerprint
|
|
9
|
+
Project-URL: Issues, https://github.com/sanderburuma/aifingerprint/issues
|
|
10
|
+
Keywords: ai,detection,writing,nlp,heuristics,text-analysis
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
20
|
+
Classifier: Typing :: Typed
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# aifingerprint
|
|
2
|
+
|
|
3
|
+
Scores text 0–100 for AI writing fingerprints. Catches the stuff LLMs can't help doing — flat rhythm, hedge words, compression patterns, that weird punctuation sameness. No API keys, no model downloads, just stdlib Python.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install .
|
|
9
|
+
# or for development:
|
|
10
|
+
pip install -e .
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Analyze a file
|
|
17
|
+
aifingerprint input.txt
|
|
18
|
+
|
|
19
|
+
# Read from clipboard
|
|
20
|
+
aifingerprint --clipboard
|
|
21
|
+
|
|
22
|
+
# Read from stdin
|
|
23
|
+
echo "text to analyze" | aifingerprint
|
|
24
|
+
|
|
25
|
+
# Generate a markdown report
|
|
26
|
+
aifingerprint input.txt --report
|
|
27
|
+
aifingerprint input.txt --report output.md
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Score interpretation
|
|
31
|
+
|
|
32
|
+
| Score | Label | Meaning |
|
|
33
|
+
|-------|-------|---------|
|
|
34
|
+
| 0–20 | CLEAN | Looks human |
|
|
35
|
+
| 21–40 | MILD | A few AI-ish traits, probably human |
|
|
36
|
+
| 41–60 | NOTICEABLE | Smells like AI |
|
|
37
|
+
| 61–80 | OBVIOUS | Yeah, that's AI |
|
|
38
|
+
| 81–100 | BLATANT | Copy-pasted straight from ChatGPT |
|
|
39
|
+
|
|
40
|
+
## What it checks
|
|
41
|
+
|
|
42
|
+
Ten weighted checks, each scoring 0.0–1.0:
|
|
43
|
+
|
|
44
|
+
| Check | Weight | What it measures |
|
|
45
|
+
|-------|--------|-----------------|
|
|
46
|
+
| Compression | 20% | LZMA similarity to a known AI corpus |
|
|
47
|
+
| Sentence rhythm | 15% | Coefficient of variation in sentence lengths |
|
|
48
|
+
| Tone | 15% | Hedging, enthusiasm, formality, word length |
|
|
49
|
+
| Punctuation | 12% | Shannon entropy of punctuation distribution |
|
|
50
|
+
| Connectives | 10% | Density of discourse markers (however, moreover...) |
|
|
51
|
+
| Burstiness | 8% | Whether content words cluster or distribute evenly |
|
|
52
|
+
| Vocabulary | 8% | Known AI-favored words (delve, leverage, utilize...) |
|
|
53
|
+
| Structure | 7% | Paragraph uniformity, parallelism, five-paragraph essay |
|
|
54
|
+
| Phrases | 5% | Cliches, hedges, openers, closers |
|
|
55
|
+
| Formatting | 0% | Em dashes, bold bullets, header density (disabled) |
|
|
56
|
+
|
|
57
|
+
## HTML reports
|
|
58
|
+
|
|
59
|
+
Generate a markdown report, then convert to styled HTML:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
aifingerprint input.txt --report
|
|
63
|
+
python -m aifingerprint.html report.md
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## How it compares
|
|
67
|
+
|
|
68
|
+
We tested against the **RoBERTa OpenAI detector** (`openai-community/roberta-base-openai-detector` via HuggingFace Transformers) — the only other pip-installable thing that runs offline on prose. 27 samples, 8 AI-generated, 19 human-written:
|
|
69
|
+
|
|
70
|
+
| | aifingerprint | RoBERTa |
|
|
71
|
+
|---|---|---|
|
|
72
|
+
| AI samples (avg) | **58%** | 97% |
|
|
73
|
+
| Human samples (avg) | **18%** | 97% |
|
|
74
|
+
| Separation | **40pp gap** | ~0 — labels everything as AI |
|
|
75
|
+
|
|
76
|
+
RoBERTa was trained on GPT-2 output back in 2019. It thinks Paul Graham, Reddit posts, and Seth Godin are all 100% AI. Basically useless on anything written after 2022. aifingerprint uses heuristics instead of a model, so it doesn't go stale when the next GPT drops.
|
|
77
|
+
|
|
78
|
+
Other packages in this space:
|
|
79
|
+
|
|
80
|
+
| Package | Why it doesn't work |
|
|
81
|
+
|---------|-------------------|
|
|
82
|
+
| [gptzero](https://github.com/Haste171/gptzero/) | API wrapper — requires paid GPTZero API key |
|
|
83
|
+
| [openai-detector](https://github.com/promptslab/openai-detector) | Thin wrapper around the same broken RoBERTa model |
|
|
84
|
+
| [sloppylint](https://github.com/rsionnach/sloppylint) | Detects AI patterns in code, not prose |
|
|
85
|
+
| [finbert-ai-detector](https://huggingface.co/msperlin/finbert-ai-detector) | Fine-tuned for financial documents only |
|
|
86
|
+
| [ai-slop-detector](https://github.com/flamehaven/ai-slop-detector) | Browser-based, requires Gemma 270M model download |
|
|
87
|
+
| [textstat](https://github.com/textstat/textstat) | Readability metrics (Flesch, SMOG, etc.) — doesn't attempt detection |
|
|
88
|
+
|
|
89
|
+
## No dependencies
|
|
90
|
+
|
|
91
|
+
Runs on Python 3.10+ using only the standard library.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "aifingerprint"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Detect AI writing patterns — scores text 0-100 for AI fingerprints"
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
authors = [{ name = "Sander Buruma" }]
|
|
12
|
+
keywords = ["ai", "detection", "writing", "nlp", "heuristics", "text-analysis"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Topic :: Text Processing :: Linguistic",
|
|
23
|
+
"Typing :: Typed",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.urls]
|
|
27
|
+
Homepage = "https://github.com/sanderburuma/aifingerprint"
|
|
28
|
+
Repository = "https://github.com/sanderburuma/aifingerprint"
|
|
29
|
+
Issues = "https://github.com/sanderburuma/aifingerprint/issues"
|
|
30
|
+
|
|
31
|
+
[project.scripts]
|
|
32
|
+
aifingerprint = "aifingerprint.cli:main"
|
|
33
|
+
|
|
34
|
+
[tool.setuptools.packages.find]
|
|
35
|
+
where = ["src"]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Core analyzer — orchestrates all checks and produces a weighted score."""
|
|
2
|
+
|
|
3
|
+
from aifingerprint.checks import CHECKS
|
|
4
|
+
from aifingerprint.patterns import CATEGORY_WEIGHTS
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def analyze(text: str) -> tuple[int, dict]:
    """Run every registered check over *text*.

    Returns a pair ``(score, results)`` where ``score`` is an int in
    [0, 100] and ``results`` maps each category name to
    ``(hits: list[str], raw_score: float)``.
    """
    split_lines = text.splitlines()
    # Run each check once and keep its raw output for reporting as well
    # as for the weighted aggregate below.
    results = {name: fn(text, split_lines) for name, fn in CHECKS.items()}
    total = sum(raw * CATEGORY_WEIGHTS[name] for name, (_, raw) in results.items())
    clamped = min(100, max(0, int(round(total * 100))))
    return clamped, results
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def score_label(score: int) -> str:
    """Map a 0-100 score to its human-readable verdict label."""
    # Each entry is the inclusive upper bound of a band.
    bands = ((20, "CLEAN"), (40, "MILD"), (60, "NOTICEABLE"), (80, "OBVIOUS"))
    for ceiling, label in bands:
        if score <= ceiling:
            return label
    return "BLATANT"
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Check functions — each scores text on a specific AI writing dimension."""
|
|
2
|
+
|
|
3
|
+
from aifingerprint.checks.vocabulary import check as check_vocabulary
|
|
4
|
+
from aifingerprint.checks.phrases import check as check_phrases
|
|
5
|
+
from aifingerprint.checks.structure import check as check_structure
|
|
6
|
+
from aifingerprint.checks.formatting import check as check_formatting
|
|
7
|
+
from aifingerprint.checks.tone import check as check_tone
|
|
8
|
+
from aifingerprint.checks.rhythm import check as check_sentence_rhythm
|
|
9
|
+
from aifingerprint.checks.punctuation import check as check_punctuation
|
|
10
|
+
from aifingerprint.checks.connectives import check as check_connectives
|
|
11
|
+
from aifingerprint.checks.burstiness import check as check_burstiness
|
|
12
|
+
from aifingerprint.checks.compression import check as check_compression
|
|
13
|
+
|
|
14
|
+
from collections.abc import Callable

# A check returns (human-readable hit descriptions, raw score in [0.0, 1.0]).
CheckResult = tuple[list[str], float]

# Registry of every check, keyed by category name.  Keys must match the
# CATEGORY_WEIGHTS mapping in aifingerprint.patterns, which the analyzer
# uses to weight each raw score.
# Fixed annotation: was dict[str, callable] — `callable` is the builtin
# predicate, not a type; spell out the real check signature instead.
CHECKS: dict[str, Callable[[str, list[str]], CheckResult]] = {
    "vocabulary": check_vocabulary,
    "phrases": check_phrases,
    "structure": check_structure,
    "formatting": check_formatting,
    "tone": check_tone,
    "sentence_rhythm": check_sentence_rhythm,
    "punctuation": check_punctuation,
    "connectives": check_connectives,
    "burstiness": check_burstiness,
    "compression": check_compression,
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Word burstiness — measures how evenly content words are distributed."""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
# Common function words excluded from the burstiness measurement.
STOPWORDS = {
    "the", "and", "that", "this", "with", "from", "have", "been", "were",
    "they", "their", "there", "which", "would", "could", "should", "about",
    "into", "than", "then", "them", "these", "those", "other", "after",
    "before", "being", "between", "does", "doing", "during", "each",
    "every", "under", "over", "again", "further", "where", "when", "while",
    "also", "just", "more", "most", "some", "such", "only", "very",
    "will", "what", "your", "still",
}

MIN_WORD_LENGTH = 4   # Track only content words strictly longer than this
MIN_OCCURRENCES = 3   # Need 3+ occurrences of a word to measure its gaps
MIN_TOTAL_WORDS = 50  # Too little text to measure — score 0 instead

BURSTINESS_VERY_LOW = 0.5
BURSTINESS_LOW = 0.7
# Score mapping: burstiness 0.3-1.0 → score 1.0-0.0
SCORE_CEILING = 1.0
SCORE_RANGE = 0.7


def check(text: str, lines: list[str]) -> tuple[list[str], float]:
    """Score how evenly content words are spread through the text.

    Human writing clusters topic words into bursts; AI tends to spread
    them uniformly.  The metric is the coefficient of variation of the
    gaps between successive occurrences of each content word, averaged
    over every word with enough occurrences.
    """
    hits: list[str] = []
    tokens = re.findall(r"\b[a-z]+\b", text.lower())
    if len(tokens) < MIN_TOTAL_WORDS:
        return hits, 0.0

    # Token index of every occurrence of each tracked content word.
    occurrences: dict[str, list[int]] = {}
    for idx, token in enumerate(tokens):
        if len(token) > MIN_WORD_LENGTH and token not in STOPWORDS:
            occurrences.setdefault(token, []).append(idx)

    per_word_cv: list[float] = []
    for spots in occurrences.values():
        if len(spots) < MIN_OCCURRENCES:
            continue
        gaps = [later - earlier for earlier, later in zip(spots, spots[1:])]
        mean_gap = sum(gaps) / len(gaps)
        if mean_gap == 0:
            continue
        variance = sum((g - mean_gap) ** 2 for g in gaps) / len(gaps)
        per_word_cv.append(math.sqrt(variance) / mean_gap)

    if not per_word_cv:
        return hits, 0.0

    avg_burstiness = sum(per_word_cv) / len(per_word_cv)

    if avg_burstiness < BURSTINESS_VERY_LOW:
        hits.append(
            f" Word burstiness: {avg_burstiness:.2f} (very low) "
            f"— content words are distributed too evenly"
        )
    elif avg_burstiness < BURSTINESS_LOW:
        hits.append(
            f" Word burstiness: {avg_burstiness:.2f} (low) "
            f"— content words lack natural clustering"
        )

    raw = max(0.0, min(1.0, (SCORE_CEILING - avg_burstiness) / SCORE_RANGE))
    return hits, raw
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""LZMA compression similarity — compares text against known AI corpus."""
|
|
2
|
+
|
|
3
|
+
import lzma
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
# From corpus testing: AI leave-one-out mean ~0.47, human mean ~0.36
|
|
7
|
+
SIMILARITY_HIGH = 0.45
|
|
8
|
+
SIMILARITY_MODERATE = 0.38
|
|
9
|
+
# Score mapping: similarity 0.30-0.50 → score 0.0-1.0
|
|
10
|
+
SCORE_FLOOR = 0.30
|
|
11
|
+
SCORE_RANGE = 0.20
|
|
12
|
+
MIN_TEXT_BYTES = 100
|
|
13
|
+
|
|
14
|
+
_seed_bytes: bytes | None = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load_seed() -> bytes:
|
|
18
|
+
global _seed_bytes
|
|
19
|
+
if _seed_bytes is not None:
|
|
20
|
+
return _seed_bytes
|
|
21
|
+
seed_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "..", "ai_seed_corpus.txt")
|
|
22
|
+
seed_path = os.path.normpath(seed_path)
|
|
23
|
+
if os.path.exists(seed_path):
|
|
24
|
+
with open(seed_path) as f:
|
|
25
|
+
_seed_bytes = f.read().encode("utf-8")
|
|
26
|
+
else:
|
|
27
|
+
_seed_bytes = b""
|
|
28
|
+
return _seed_bytes
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def check(text: str, lines: list[str]) -> tuple[list[str], float]:
|
|
32
|
+
"""Compare how well text compresses when appended to a known AI corpus.
|
|
33
|
+
AI-like text shares patterns with the seed, producing a higher compression ratio."""
|
|
34
|
+
hits = []
|
|
35
|
+
seed = _load_seed()
|
|
36
|
+
if not seed:
|
|
37
|
+
return hits, 0.0
|
|
38
|
+
|
|
39
|
+
text_bytes = text.encode("utf-8")
|
|
40
|
+
if len(text_bytes) < MIN_TEXT_BYTES:
|
|
41
|
+
return hits, 0.0
|
|
42
|
+
|
|
43
|
+
seed_compressed = len(lzma.compress(seed))
|
|
44
|
+
combined_compressed = len(lzma.compress(seed + text_bytes))
|
|
45
|
+
text_alone = len(lzma.compress(text_bytes))
|
|
46
|
+
|
|
47
|
+
if text_alone == 0:
|
|
48
|
+
return hits, 0.0
|
|
49
|
+
overhead = combined_compressed - seed_compressed
|
|
50
|
+
similarity = 1.0 - (overhead / text_alone)
|
|
51
|
+
|
|
52
|
+
if similarity > SIMILARITY_HIGH:
|
|
53
|
+
hits.append(
|
|
54
|
+
f" Compression similarity: {similarity:.3f} (high) "
|
|
55
|
+
f"— text compresses well against AI corpus"
|
|
56
|
+
)
|
|
57
|
+
elif similarity > SIMILARITY_MODERATE:
|
|
58
|
+
hits.append(
|
|
59
|
+
f" Compression similarity: {similarity:.3f} (moderate) "
|
|
60
|
+
f"— some pattern overlap with AI corpus"
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
raw = max(0.0, min(1.0, (similarity - SCORE_FLOOR) / SCORE_RANGE))
|
|
64
|
+
return hits, raw
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Discourse connective density — however, moreover, furthermore, etc."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from aifingerprint.patterns import DISCOURSE_CONNECTIVES
|
|
6
|
+
from aifingerprint.text import split_sentences
|
|
7
|
+
|
|
8
|
+
DENSITY_HIGH = 0.5  # connectives per sentence
DENSITY_MODERATE = 0.25
SCORE_FLOOR = 0.1
SCORE_RANGE = 0.5


def check(text: str, lines: list[str]) -> tuple[list[str], float]:
    """Score the density of discourse connectives (however, moreover, ...).

    AI prose leans on these transition words far more heavily than
    typical human writing.  Returns (hit descriptions, raw score).
    """
    hits: list[str] = []
    sentences = split_sentences(text)
    if len(sentences) < 3:
        return hits, 0.0

    tokens = re.findall(r"\b[a-z]+\b", text.lower())
    matched = [t for t in tokens if t in DISCOURSE_CONNECTIVES]
    density = len(matched) / len(sentences)

    if density > DENSITY_MODERATE:
        severity = "high" if density > DENSITY_HIGH else "moderate"
        # Show up to five distinct connectives, in order of first use.
        sample = list(dict.fromkeys(matched))[:5]
        hits.append(
            f" Connective density: {density:.2f}/sentence ({severity}) "
            f"— {', '.join(sample)}"
        )

    raw = max(0.0, min(1.0, (density - SCORE_FLOOR) / SCORE_RANGE))
    return hits, raw
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Formatting pattern detection — em dashes, bold bullets, headers."""
|
|
2
|
+
|
|
3
|
+
from aifingerprint.patterns import FORMAT_PATTERNS
|
|
4
|
+
|
|
5
|
+
EM_DASH_PER_500_LIMIT = 2
BOLD_BULLET_MIN = 3
HEADER_PER_500_LIMIT = 4
SHORT_TEXT_HEADER_LIMIT = 3
SHORT_TEXT_WORD_LIMIT = 500


def check(text: str, lines: list[str]) -> tuple[list[str], float]:
    """Flag AI-typical formatting: em dashes, bold-led bullets, headers.

    Returns (hit descriptions, raw score 0.0-1.0) — the average of the
    sub-scores for whichever signals are present.
    """
    hits: list[str] = []
    word_count = len(text.split())

    def rate_per_500(count: int) -> float:
        # Normalise a raw count to "occurrences per 500 words".
        return count / (word_count / 500)

    # Em dashes
    em_count = len(FORMAT_PATTERNS["em_dash"].findall(text))
    if word_count > 0:
        em_rate = rate_per_500(em_count)
        if em_rate > EM_DASH_PER_500_LIMIT:
            hits.append(
                f" Em dash density: {em_rate:.1f} per 500 words "
                f"(limit: {EM_DASH_PER_500_LIMIT}) — {em_count} total"
            )

    # Bold-first bullets
    bold_bullets = FORMAT_PATTERNS["bold_first_bullet"].findall(text)
    if len(bold_bullets) >= BOLD_BULLET_MIN:
        hits.append(f" Bold-first bullets: {len(bold_bullets)} instances")

    # Header density (short texts get an absolute cap, longer ones a rate cap)
    headers = FORMAT_PATTERNS["header"].findall(text)
    if 0 < word_count < SHORT_TEXT_WORD_LIMIT and len(headers) > SHORT_TEXT_HEADER_LIMIT:
        hits.append(f" Header density: {len(headers)} headers in {word_count} words — excessive")
    elif word_count > 0:
        header_rate = rate_per_500(len(headers))
        if header_rate > HEADER_PER_500_LIMIT:
            hits.append(f" Header density: {header_rate:.1f} per 500 words — excessive")

    # Title case headers (reported only — not scored below)
    title_case = FORMAT_PATTERNS["title_case_header"].findall(text)
    if title_case:
        hits.append(f" Title Case headers: {len(title_case)} instances (prefer sentence case)")

    parts: list[float] = []
    if em_count > 0 and word_count > 0:
        parts.append(min(1.0, rate_per_500(em_count) / 6))
    if bold_bullets:
        parts.append(min(1.0, len(bold_bullets) / 8))
    if headers and word_count > 0:
        parts.append(min(1.0, rate_per_500(len(headers)) / 8))

    if not parts:
        return hits, 0.0
    return hits, sum(parts) / len(parts)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Banned phrase detection — cliches, hedges, openers, closers."""
|
|
2
|
+
|
|
3
|
+
from aifingerprint.patterns import BANNED_PHRASES, BANNED_SENTENCE_STARTERS
|
|
4
|
+
from aifingerprint.text import split_sentences, find_line
|
|
5
|
+
|
|
6
|
+
HITS_PER_100_MAX = 3.0  # 3+ hits per 100 words saturates the score


def check(text: str, lines: list[str]) -> tuple[list[str], float]:
    """Find banned phrases (cliches, hedges, openers, closers) and
    AI-typical sentence starters; score by hit density per 100 words."""
    hits: list[str] = []

    # Lowercase each line once instead of per phrase.
    lowered = [(i, line, line.lower()) for i, line in enumerate(lines, 1)]
    for category, phrases in BANNED_PHRASES.items():
        for phrase in phrases:
            needle = phrase.lower()
            for i, line, low in lowered:
                if needle in low:
                    start = low.index(needle)
                    snippet = line[start:start + len(phrase) + 20].rstrip()
                    hits.append(f" Line {i}: \"{snippet}...\" [{category}]")

    for sent in split_sentences(text):
        sent_low = sent.lower()
        for starter in BANNED_SENTENCE_STARTERS:
            if sent_low.startswith(starter):
                line_num = find_line(lines, sent[:30])
                hits.append(f" Line {line_num}: starts with \"{starter}\" [transition]")

    total_words = len(text.split())
    if not total_words:
        return hits, 0.0
    return hits, min(1.0, (len(hits) / (total_words / 100)) / HITS_PER_100_MAX)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Punctuation diversity — Shannon entropy of punctuation distribution."""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
from collections import Counter
|
|
5
|
+
|
|
6
|
+
# Punctuation characters considered for the entropy measurement (17 total).
_PUNCT_CHARS = set('.,;:!?\u2014\u2013-()"\u201c\u201d\'\u2018\u2019')

ENTROPY_VERY_LOW = 0.30
ENTROPY_LOW = 0.45
NORM_FLOOR = 0.20    # Below this → max score
NORM_CEILING = 0.60  # Above this → zero score


def check(text: str, lines: list[str]) -> tuple[list[str], float]:
    """Score punctuation variety via normalised Shannon entropy.

    AI text tends to stick to commas and periods; a low entropy of the
    punctuation-character distribution is therefore AI-like.
    """
    hits: list[str] = []
    marks = [c for c in text if c in _PUNCT_CHARS]
    total = len(marks)
    if total < 10:
        return hits, 0.0

    counts = Counter(marks).values()
    entropy = -sum((n / total) * math.log2(n / total) for n in counts)
    max_entropy = math.log2(len(_PUNCT_CHARS))
    normalized = entropy / max_entropy if max_entropy > 0 else 0

    if normalized < ENTROPY_VERY_LOW:
        hits.append(
            f" Punctuation diversity: entropy={entropy:.2f} (very low) "
            f"— almost only commas and periods"
        )
    elif normalized < ENTROPY_LOW:
        hits.append(
            f" Punctuation diversity: entropy={entropy:.2f} (low) "
            f"— limited punctuation variety"
        )

    raw = max(0.0, min(1.0, (NORM_CEILING - normalized) / (NORM_CEILING - NORM_FLOOR)))
    return hits, raw
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Sentence rhythm analysis — coefficient of variation in sentence lengths."""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
|
|
5
|
+
from aifingerprint.text import split_sentences
|
|
6
|
+
|
|
7
|
+
# From corpus testing: AI mean CV ~0.45, human ~0.72
CV_VERY_LOW = 0.35
CV_LOW = 0.50
CV_FLOOR = 0.30    # Below this → max score
CV_CEILING = 0.70  # Above this → zero score


def check(text: str, lines: list[str]) -> tuple[list[str], float]:
    """Score the uniformity of sentence lengths.

    AI writing tends toward evenly sized sentences; a low coefficient of
    variation (std/mean of per-sentence word counts) is AI-like.
    """
    hits: list[str] = []
    sentences = split_sentences(text)
    if len(sentences) < 5:
        return hits, 0.0

    word_counts = [len(s.split()) for s in sentences]
    mean = sum(word_counts) / len(word_counts)
    if mean == 0:
        return hits, 0.0
    variance = sum((n - mean) ** 2 for n in word_counts) / len(word_counts)
    cv = math.sqrt(variance) / mean

    if cv < CV_VERY_LOW:
        hits.append(
            f" Sentence rhythm: CV={cv:.2f} (very low) "
            f"— sentence lengths are unnaturally uniform"
        )
    elif cv < CV_LOW:
        hits.append(
            f" Sentence rhythm: CV={cv:.2f} (low) "
            f"— sentence lengths lack natural variation"
        )

    raw = max(0.0, min(1.0, (CV_CEILING - cv) / (CV_CEILING - CV_FLOOR)))
    return hits, raw
|