lmscan 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lmscan-0.1.0/.github/workflows/ci.yml +21 -0
- lmscan-0.1.0/.gitignore +14 -0
- lmscan-0.1.0/.pre-commit-hooks.yaml +8 -0
- lmscan-0.1.0/CHANGELOG.md +16 -0
- lmscan-0.1.0/LICENSE +17 -0
- lmscan-0.1.0/PKG-INFO +251 -0
- lmscan-0.1.0/README.md +221 -0
- lmscan-0.1.0/pyproject.toml +71 -0
- lmscan-0.1.0/src/lmscan/__init__.py +16 -0
- lmscan-0.1.0/src/lmscan/_types.py +61 -0
- lmscan-0.1.0/src/lmscan/cli.py +68 -0
- lmscan-0.1.0/src/lmscan/detector.py +224 -0
- lmscan-0.1.0/src/lmscan/features.py +429 -0
- lmscan-0.1.0/src/lmscan/fingerprint.py +219 -0
- lmscan-0.1.0/src/lmscan/py.typed +0 -0
- lmscan-0.1.0/src/lmscan/report.py +149 -0
- lmscan-0.1.0/src/lmscan/scanner.py +19 -0
- lmscan-0.1.0/tests/test_cli.py +102 -0
- lmscan-0.1.0/tests/test_detector.py +147 -0
- lmscan-0.1.0/tests/test_features.py +279 -0
- lmscan-0.1.0/tests/test_fingerprint.py +130 -0
- lmscan-0.1.0/tests/test_report.py +82 -0
- lmscan-0.1.0/tests/test_scanner.py +73 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: ${{ matrix.python-version }}
|
|
20
|
+
- run: pip install -e . pytest
|
|
21
|
+
- run: python -m pytest tests/ -v --tb=short
|
lmscan-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.0] - 2025-04-10
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- Statistical AI text detection using 12 linguistic features
|
|
7
|
+
- Model fingerprinting for GPT-4, Claude, Gemini, Llama, Mistral
|
|
8
|
+
- Per-sentence analysis with individual AI probability scores
|
|
9
|
+
- CLI with text/file/stdin input, JSON output, threshold gating
|
|
10
|
+
- Python API: `scan()`, `scan_file()`
|
|
11
|
+
- Burstiness, entropy, Zipf deviation, vocabulary richness analysis
|
|
12
|
+
- AI "slop word" detection (known LLM vocabulary markers)
|
|
13
|
+
- Transition word ratio, readability consistency, bigram/trigram repetition
|
|
14
|
+
- Terminal report with a Unicode box-drawing feature table and model attribution
|
|
15
|
+
- Zero external dependencies
|
|
16
|
+
- 96 tests
|
lmscan-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
Copyright 2025 Zacharie B
|
|
6
|
+
|
|
7
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
8
|
+
you may not use this file except in compliance with the License.
|
|
9
|
+
You may obtain a copy of the License at
|
|
10
|
+
|
|
11
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
|
|
13
|
+
Unless required by applicable law or agreed to in writing, software
|
|
14
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
15
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
16
|
+
See the License for the specific language governing permissions and
|
|
17
|
+
limitations under the License.
|
lmscan-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lmscan
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Detect AI-generated text and fingerprint which LLM wrote it. Open-source GPTZero alternative. Zero dependencies, works offline.
|
|
5
|
+
Project-URL: Homepage, https://github.com/stef41/lmscan
|
|
6
|
+
Project-URL: Repository, https://github.com/stef41/lmscan
|
|
7
|
+
Project-URL: Issues, https://github.com/stef41/lmscan/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/stef41/lmscan/blob/main/CHANGELOG.md
|
|
9
|
+
Author: Zacharie B
|
|
10
|
+
License: Apache-2.0
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: ai-detection,ai-text-detection,chatgpt,claude,content-detection,gemini,gptzero,llm,plagiarism,text-forensics
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Education
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
26
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
27
|
+
Classifier: Typing :: Typed
|
|
28
|
+
Requires-Python: >=3.9
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# 🔍 lmscan
|
|
32
|
+
|
|
33
|
+
**Detect AI-generated text. Fingerprint which LLM wrote it. Open-source GPTZero alternative.**
|
|
34
|
+
|
|
35
|
+
[PyPI](https://pypi.org/project/lmscan/)
|
|
36
|
+
[License: Apache-2.0](LICENSE)
|
|
37
|
+
[Python 3.9+](https://pypi.org/project/lmscan/)
|
|
38
|
+
[]()
|
|
39
|
+
|
|
40
|
+
> GPTZero charges $15/month. Originality.ai charges per scan. Turnitin locks you into institutional contracts.
|
|
41
|
+
>
|
|
42
|
+
> **lmscan is free, open-source, works offline, and tells you _which_ model wrote the text.**
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
$ lmscan "In today's rapidly evolving digital landscape, it's important
|
|
46
|
+
to note that artificial intelligence has become a pivotal force in
|
|
47
|
+
transforming how we navigate the complexities of modern life..."
|
|
48
|
+
|
|
49
|
+
🔍 lmscan v0.1.0 — AI Text Forensics
|
|
50
|
+
══════════════════════════════════════════════════
|
|
51
|
+
|
|
52
|
+
Verdict: 🤖 Likely AI (77% confidence)
|
|
53
|
+
Words: 184
|
|
54
|
+
Sentences: 10
|
|
55
|
+
Scanned in 0.01s
|
|
56
|
+
|
|
57
|
+
┌────────────────────────────┬──────────┬────────────────────┐
|
|
58
|
+
│ Feature │ Value │ Signal │
|
|
59
|
+
├────────────────────────────┼──────────┼────────────────────┤
|
|
60
|
+
│ Burstiness │ 0.07 │ 🔴 Very low (AI) │
|
|
61
|
+
│ Sentence length variance │ 0.27 │ 🟡 Below average │
|
|
62
|
+
│ Slop word density │ 20.7% │ 🔴 High (AI) │
|
|
63
|
+
│ Transition word ratio │ 2.2% │ 🟡 Elevated │
|
|
64
|
+
│ Readability consistency │ 0.00 │ 🔴 Very low (AI) │
|
|
65
|
+
│ ... │ │ │
|
|
66
|
+
└────────────────────────────┴──────────┴────────────────────┘
|
|
67
|
+
|
|
68
|
+
🔎 Model Attribution
|
|
69
|
+
1. GPT-4 / ChatGPT 62% — "delve", "tapestry", "beacon", "landscape" (×2), +19 more
|
|
70
|
+
2. Claude (Anthropic) 13% — "robust", "nuanced", "comprehensive"
|
|
71
|
+
3. Gemini (Google) 9% — "furthermore", "additionally"
|
|
72
|
+
|
|
73
|
+
⚠️ Flags
|
|
74
|
+
• Very low burstiness (0.07) — AI text is more uniform in complexity
|
|
75
|
+
• High slop word density (20.7%) — contains known AI vocabulary markers
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Install
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install lmscan
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**Zero dependencies.** Works with Python 3.9+. No API keys. No internet. No GPU.
|
|
85
|
+
|
|
86
|
+
## Usage
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# Scan text directly
|
|
90
|
+
lmscan "Your text here..."
|
|
91
|
+
|
|
92
|
+
# Scan a file
|
|
93
|
+
lmscan document.txt
|
|
94
|
+
|
|
95
|
+
# Pipe from stdin
|
|
96
|
+
cat essay.txt | lmscan -
|
|
97
|
+
|
|
98
|
+
# JSON output (for scripts and CI)
|
|
99
|
+
lmscan document.txt --format json
|
|
100
|
+
|
|
101
|
+
# Per-sentence breakdown
|
|
102
|
+
lmscan document.txt --sentences
|
|
103
|
+
|
|
104
|
+
# CI gate: fail if AI probability > 50%
|
|
105
|
+
lmscan submission.txt --threshold 0.5
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Python API
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from lmscan import scan
|
|
112
|
+
|
|
113
|
+
result = scan("Text to analyze...")
|
|
114
|
+
|
|
115
|
+
print(f"AI probability: {result.ai_probability:.0%}")
|
|
116
|
+
print(f"Verdict: {result.verdict}")
|
|
117
|
+
print(f"Confidence: {result.confidence}")
|
|
118
|
+
|
|
119
|
+
# Which model wrote it?
|
|
120
|
+
for model in result.model_attribution:
|
|
121
|
+
print(f" {model.model}: {model.confidence:.0%}")
|
|
122
|
+
for evidence in model.evidence[:3]:
|
|
123
|
+
print(f" → {evidence}")
|
|
124
|
+
|
|
125
|
+
# Per-sentence analysis
|
|
126
|
+
for sentence in result.sentence_scores:
|
|
127
|
+
if sentence.ai_probability > 0.7:
|
|
128
|
+
print(f" 🤖 {sentence.text[:60]}... ({sentence.ai_probability:.0%})")
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Scan entire directories
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from lmscan import scan_file
|
|
135
|
+
import glob
|
|
136
|
+
|
|
137
|
+
for path in glob.glob("submissions/*.txt"):
|
|
138
|
+
result = scan_file(path)
|
|
139
|
+
print(f"{path}: {result.verdict} ({result.ai_probability:.0%})")
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## How It Works
|
|
143
|
+
|
|
144
|
+
lmscan uses **12 statistical features** derived from computational linguistics research to distinguish AI-generated text from human writing. The main ones are summarized below:
|
|
145
|
+
|
|
146
|
+
| Feature | What it measures | AI signal |
|
|
147
|
+
|---------|-----------------|-----------|
|
|
148
|
+
| **Burstiness** | Variance in sentence complexity | AI text is unusually uniform |
|
|
149
|
+
| **Sentence length variance** | How much sentence lengths vary | AI produces uniform lengths |
|
|
150
|
+
| **Vocabulary richness** | Type-token ratio (Yule's K corrected) | AI reuses words more |
|
|
151
|
+
| **Hapax legomena ratio** | Fraction of words appearing once | AI has fewer words that occur only once |
|
|
152
|
+
| **Zipf deviation** | How closely word frequencies follow Zipf's law | AI deviates from natural distribution |
|
|
153
|
+
| **Readability consistency** | Flesch-Kincaid variance across paragraphs | AI maintains constant readability |
|
|
154
|
+
| **Bigram/trigram repetition** | Repeated word pairs and triples | AI repeats phrase structures |
|
|
155
|
+
| **Transition word ratio** | "however", "moreover", "furthermore"... | AI overuses transitions |
|
|
156
|
+
| **Slop word density** | Known AI vocabulary markers | "delve", "tapestry", "beacon"... |
|
|
157
|
+
| **Punctuation entropy** | Diversity of punctuation usage | AI is more predictable |
|
|
158
|
+
|
|
159
|
+
Each feature produces a signal via sigmoid transformation. The weighted combination produces the final AI probability.
|
|
160
|
+
|
|
161
|
+
### Model Fingerprinting
|
|
162
|
+
|
|
163
|
+
lmscan includes vocabulary fingerprints for 5 major LLM families:
|
|
164
|
+
|
|
165
|
+
| Model | Distinctive markers |
|
|
166
|
+
|-------|-------------------|
|
|
167
|
+
| **GPT-4 / ChatGPT** | "delve", "tapestry", "landscape", "leverage", "multifaceted", "it's important to note" |
|
|
168
|
+
| **Claude (Anthropic)** | "certainly", "I'd be happy to", "straightforward", "I should note" |
|
|
169
|
+
| **Gemini (Google)** | "crucial", "here's a breakdown", "keep in mind" |
|
|
170
|
+
| **Llama / Meta** | "awesome", "fantastic", "hope this helps" |
|
|
171
|
+
| **Mistral / Mixtral** | "indeed", "moreover", "hence", "noteworthy" |
|
|
172
|
+
|
|
173
|
+
Attribution uses weighted vocabulary matching, phrase detection, and hedging pattern analysis.
|
|
174
|
+
|
|
175
|
+
## Accuracy & Limitations
|
|
176
|
+
|
|
177
|
+
**What lmscan is good at:**
|
|
178
|
+
- Detecting text with strong AI stylistic patterns
|
|
179
|
+
- Identifying which model family generated text
|
|
180
|
+
- Scanning at scale (thousands of documents) at zero cost
|
|
181
|
+
- Providing explainable evidence (not a black box)
|
|
182
|
+
|
|
183
|
+
**What lmscan cannot do:**
|
|
184
|
+
- Detect AI text that has been manually edited or paraphrased
|
|
185
|
+
- Work reliably on very short text (<50 words)
|
|
186
|
+
- Detect AI text in non-English languages (English-only for now)
|
|
187
|
+
- Replace human judgment — use as a signal, not a verdict
|
|
188
|
+
|
|
189
|
+
**This is statistical analysis, not a neural classifier.** It detects stylistic patterns, not watermarks. It works best on unedited LLM output and degrades gracefully on edited text.
|
|
190
|
+
|
|
191
|
+
## CI Integration
|
|
192
|
+
|
|
193
|
+
### GitHub Actions
|
|
194
|
+
|
|
195
|
+
```yaml
|
|
196
|
+
- name: AI Content Check
|
|
197
|
+
run: |
|
|
198
|
+
pip install lmscan
|
|
199
|
+
lmscan submission.txt --threshold 0.7 --format json
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### Pre-commit
|
|
203
|
+
|
|
204
|
+
```yaml
|
|
205
|
+
repos:
|
|
206
|
+
- repo: https://github.com/stef41/lmscan
|
|
207
|
+
rev: v0.1.0
|
|
208
|
+
hooks:
|
|
209
|
+
- id: lmscan
|
|
210
|
+
args: ["--threshold", "0.7"]
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Research Background
|
|
214
|
+
|
|
215
|
+
lmscan's approach is informed by published research on AI text detection:
|
|
216
|
+
|
|
217
|
+
- **DetectGPT** (Mitchell et al., 2023) — perturbation-based detection using log probability curvature
|
|
218
|
+
- **GLTR** (Gehrmann et al., 2019) — statistical visualization of token predictions
|
|
219
|
+
- **Binoculars** (Hans et al., 2024) — cross-model perplexity comparison
|
|
220
|
+
- **Zipf's Law in NLP** — word frequency distributions differ between human and AI text
|
|
221
|
+
- **Stylometry** — decades of authorship attribution research applied to AI forensics
|
|
222
|
+
|
|
223
|
+
lmscan takes the statistical intuitions from these papers and implements them as lightweight, dependency-free heuristics that work without requiring a reference language model.
|
|
224
|
+
|
|
225
|
+
## FAQ
|
|
226
|
+
|
|
227
|
+
**Q: Is this as accurate as GPTZero?**
|
|
228
|
+
A: GPTZero uses neural classifiers trained on labeled data. lmscan uses statistical heuristics. GPTZero is more accurate on edge cases; lmscan is free, offline, and explainable. Use both if accuracy matters.
|
|
229
|
+
|
|
230
|
+
**Q: Can students use this to evade AI detection?**
|
|
231
|
+
A: lmscan shows which features trigger detection, which could help someone understand why text reads as AI-generated. This is by design — understanding AI writing patterns makes everyone a better writer. The same information is available in published research papers.
|
|
232
|
+
|
|
233
|
+
**Q: Does it work on non-English text?**
|
|
234
|
+
A: Currently English-only. The slop word lists and transition word lists are English-specific. Statistical features (entropy, burstiness) work across languages but haven't been calibrated.
|
|
235
|
+
|
|
236
|
+
**Q: Does it phone home?**
|
|
237
|
+
A: No. Zero network requests. No telemetry. No API keys. Everything runs locally.
|
|
238
|
+
|
|
239
|
+
**Q: How is model attribution possible without running the model?**
|
|
240
|
+
A: Each LLM family has characteristic vocabulary biases. GPT-4 loves "delve" and "tapestry". Claude says "I'd be happy to". These are statistical fingerprints — not guaranteed attribution, but strong signals.
|
|
241
|
+
|
|
242
|
+
## See Also
|
|
243
|
+
|
|
244
|
+
- [reverse-SynthID](https://github.com/aloshdenny/reverse-SynthID) — Reverse-engineering Google's image watermarking
|
|
245
|
+
- [vibesafe](https://github.com/stef41/vibesafe) — AI code safety scanner
|
|
246
|
+
- [injectionguard](https://github.com/stef41/injectionguard) — Prompt injection detection
|
|
247
|
+
- [vibescore](https://github.com/stef41/vibescore) — Grade your vibe-coded project
|
|
248
|
+
|
|
249
|
+
## License
|
|
250
|
+
|
|
251
|
+
Apache-2.0
|
lmscan-0.1.0/README.md
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# 🔍 lmscan
|
|
2
|
+
|
|
3
|
+
**Detect AI-generated text. Fingerprint which LLM wrote it. Open-source GPTZero alternative.**
|
|
4
|
+
|
|
5
|
+
[PyPI](https://pypi.org/project/lmscan/)
|
|
6
|
+
[License: Apache-2.0](LICENSE)
|
|
7
|
+
[Python 3.9+](https://pypi.org/project/lmscan/)
|
|
8
|
+
[]()
|
|
9
|
+
|
|
10
|
+
> GPTZero charges $15/month. Originality.ai charges per scan. Turnitin locks you into institutional contracts.
|
|
11
|
+
>
|
|
12
|
+
> **lmscan is free, open-source, works offline, and tells you _which_ model wrote the text.**
|
|
13
|
+
|
|
14
|
+
```
|
|
15
|
+
$ lmscan "In today's rapidly evolving digital landscape, it's important
|
|
16
|
+
to note that artificial intelligence has become a pivotal force in
|
|
17
|
+
transforming how we navigate the complexities of modern life..."
|
|
18
|
+
|
|
19
|
+
🔍 lmscan v0.1.0 — AI Text Forensics
|
|
20
|
+
══════════════════════════════════════════════════
|
|
21
|
+
|
|
22
|
+
Verdict: 🤖 Likely AI (77% confidence)
|
|
23
|
+
Words: 184
|
|
24
|
+
Sentences: 10
|
|
25
|
+
Scanned in 0.01s
|
|
26
|
+
|
|
27
|
+
┌────────────────────────────┬──────────┬────────────────────┐
|
|
28
|
+
│ Feature │ Value │ Signal │
|
|
29
|
+
├────────────────────────────┼──────────┼────────────────────┤
|
|
30
|
+
│ Burstiness │ 0.07 │ 🔴 Very low (AI) │
|
|
31
|
+
│ Sentence length variance │ 0.27 │ 🟡 Below average │
|
|
32
|
+
│ Slop word density │ 20.7% │ 🔴 High (AI) │
|
|
33
|
+
│ Transition word ratio │ 2.2% │ 🟡 Elevated │
|
|
34
|
+
│ Readability consistency │ 0.00 │ 🔴 Very low (AI) │
|
|
35
|
+
│ ... │ │ │
|
|
36
|
+
└────────────────────────────┴──────────┴────────────────────┘
|
|
37
|
+
|
|
38
|
+
🔎 Model Attribution
|
|
39
|
+
1. GPT-4 / ChatGPT 62% — "delve", "tapestry", "beacon", "landscape" (×2), +19 more
|
|
40
|
+
2. Claude (Anthropic) 13% — "robust", "nuanced", "comprehensive"
|
|
41
|
+
3. Gemini (Google) 9% — "furthermore", "additionally"
|
|
42
|
+
|
|
43
|
+
⚠️ Flags
|
|
44
|
+
• Very low burstiness (0.07) — AI text is more uniform in complexity
|
|
45
|
+
• High slop word density (20.7%) — contains known AI vocabulary markers
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install lmscan
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
**Zero dependencies.** Works with Python 3.9+. No API keys. No internet. No GPU.
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
# Scan text directly
|
|
60
|
+
lmscan "Your text here..."
|
|
61
|
+
|
|
62
|
+
# Scan a file
|
|
63
|
+
lmscan document.txt
|
|
64
|
+
|
|
65
|
+
# Pipe from stdin
|
|
66
|
+
cat essay.txt | lmscan -
|
|
67
|
+
|
|
68
|
+
# JSON output (for scripts and CI)
|
|
69
|
+
lmscan document.txt --format json
|
|
70
|
+
|
|
71
|
+
# Per-sentence breakdown
|
|
72
|
+
lmscan document.txt --sentences
|
|
73
|
+
|
|
74
|
+
# CI gate: fail if AI probability > 50%
|
|
75
|
+
lmscan submission.txt --threshold 0.5
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Python API
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from lmscan import scan
|
|
82
|
+
|
|
83
|
+
result = scan("Text to analyze...")
|
|
84
|
+
|
|
85
|
+
print(f"AI probability: {result.ai_probability:.0%}")
|
|
86
|
+
print(f"Verdict: {result.verdict}")
|
|
87
|
+
print(f"Confidence: {result.confidence}")
|
|
88
|
+
|
|
89
|
+
# Which model wrote it?
|
|
90
|
+
for model in result.model_attribution:
|
|
91
|
+
print(f" {model.model}: {model.confidence:.0%}")
|
|
92
|
+
for evidence in model.evidence[:3]:
|
|
93
|
+
print(f" → {evidence}")
|
|
94
|
+
|
|
95
|
+
# Per-sentence analysis
|
|
96
|
+
for sentence in result.sentence_scores:
|
|
97
|
+
if sentence.ai_probability > 0.7:
|
|
98
|
+
print(f" 🤖 {sentence.text[:60]}... ({sentence.ai_probability:.0%})")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Scan entire directories
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from lmscan import scan_file
|
|
105
|
+
import glob
|
|
106
|
+
|
|
107
|
+
for path in glob.glob("submissions/*.txt"):
|
|
108
|
+
result = scan_file(path)
|
|
109
|
+
print(f"{path}: {result.verdict} ({result.ai_probability:.0%})")
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## How It Works
|
|
113
|
+
|
|
114
|
+
lmscan uses **12 statistical features** derived from computational linguistics research to distinguish AI-generated text from human writing. The main ones are summarized below:
|
|
115
|
+
|
|
116
|
+
| Feature | What it measures | AI signal |
|
|
117
|
+
|---------|-----------------|-----------|
|
|
118
|
+
| **Burstiness** | Variance in sentence complexity | AI text is unusually uniform |
|
|
119
|
+
| **Sentence length variance** | How much sentence lengths vary | AI produces uniform lengths |
|
|
120
|
+
| **Vocabulary richness** | Type-token ratio (Yule's K corrected) | AI reuses words more |
|
|
121
|
+
| **Hapax legomena ratio** | Fraction of words appearing once | AI has fewer words that occur only once |
|
|
122
|
+
| **Zipf deviation** | How closely word frequencies follow Zipf's law | AI deviates from natural distribution |
|
|
123
|
+
| **Readability consistency** | Flesch-Kincaid variance across paragraphs | AI maintains constant readability |
|
|
124
|
+
| **Bigram/trigram repetition** | Repeated word pairs and triples | AI repeats phrase structures |
|
|
125
|
+
| **Transition word ratio** | "however", "moreover", "furthermore"... | AI overuses transitions |
|
|
126
|
+
| **Slop word density** | Known AI vocabulary markers | "delve", "tapestry", "beacon"... |
|
|
127
|
+
| **Punctuation entropy** | Diversity of punctuation usage | AI is more predictable |
|
|
128
|
+
|
|
129
|
+
Each feature produces a signal via sigmoid transformation. The weighted combination produces the final AI probability.
|
|
130
|
+
|
|
131
|
+
### Model Fingerprinting
|
|
132
|
+
|
|
133
|
+
lmscan includes vocabulary fingerprints for 5 major LLM families:
|
|
134
|
+
|
|
135
|
+
| Model | Distinctive markers |
|
|
136
|
+
|-------|-------------------|
|
|
137
|
+
| **GPT-4 / ChatGPT** | "delve", "tapestry", "landscape", "leverage", "multifaceted", "it's important to note" |
|
|
138
|
+
| **Claude (Anthropic)** | "certainly", "I'd be happy to", "straightforward", "I should note" |
|
|
139
|
+
| **Gemini (Google)** | "crucial", "here's a breakdown", "keep in mind" |
|
|
140
|
+
| **Llama / Meta** | "awesome", "fantastic", "hope this helps" |
|
|
141
|
+
| **Mistral / Mixtral** | "indeed", "moreover", "hence", "noteworthy" |
|
|
142
|
+
|
|
143
|
+
Attribution uses weighted vocabulary matching, phrase detection, and hedging pattern analysis.
|
|
144
|
+
|
|
145
|
+
## Accuracy & Limitations
|
|
146
|
+
|
|
147
|
+
**What lmscan is good at:**
|
|
148
|
+
- Detecting text with strong AI stylistic patterns
|
|
149
|
+
- Identifying which model family generated text
|
|
150
|
+
- Scanning at scale (thousands of documents) at zero cost
|
|
151
|
+
- Providing explainable evidence (not a black box)
|
|
152
|
+
|
|
153
|
+
**What lmscan cannot do:**
|
|
154
|
+
- Detect AI text that has been manually edited or paraphrased
|
|
155
|
+
- Work reliably on very short text (<50 words)
|
|
156
|
+
- Detect AI text in non-English languages (English-only for now)
|
|
157
|
+
- Replace human judgment — use as a signal, not a verdict
|
|
158
|
+
|
|
159
|
+
**This is statistical analysis, not a neural classifier.** It detects stylistic patterns, not watermarks. It works best on unedited LLM output and degrades gracefully on edited text.
|
|
160
|
+
|
|
161
|
+
## CI Integration
|
|
162
|
+
|
|
163
|
+
### GitHub Actions
|
|
164
|
+
|
|
165
|
+
```yaml
|
|
166
|
+
- name: AI Content Check
|
|
167
|
+
run: |
|
|
168
|
+
pip install lmscan
|
|
169
|
+
lmscan submission.txt --threshold 0.7 --format json
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### Pre-commit
|
|
173
|
+
|
|
174
|
+
```yaml
|
|
175
|
+
repos:
|
|
176
|
+
- repo: https://github.com/stef41/lmscan
|
|
177
|
+
rev: v0.1.0
|
|
178
|
+
hooks:
|
|
179
|
+
- id: lmscan
|
|
180
|
+
args: ["--threshold", "0.7"]
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Research Background
|
|
184
|
+
|
|
185
|
+
lmscan's approach is informed by published research on AI text detection:
|
|
186
|
+
|
|
187
|
+
- **DetectGPT** (Mitchell et al., 2023) — perturbation-based detection using log probability curvature
|
|
188
|
+
- **GLTR** (Gehrmann et al., 2019) — statistical visualization of token predictions
|
|
189
|
+
- **Binoculars** (Hans et al., 2024) — cross-model perplexity comparison
|
|
190
|
+
- **Zipf's Law in NLP** — word frequency distributions differ between human and AI text
|
|
191
|
+
- **Stylometry** — decades of authorship attribution research applied to AI forensics
|
|
192
|
+
|
|
193
|
+
lmscan takes the statistical intuitions from these papers and implements them as lightweight, dependency-free heuristics that work without requiring a reference language model.
|
|
194
|
+
|
|
195
|
+
## FAQ
|
|
196
|
+
|
|
197
|
+
**Q: Is this as accurate as GPTZero?**
|
|
198
|
+
A: GPTZero uses neural classifiers trained on labeled data. lmscan uses statistical heuristics. GPTZero is more accurate on edge cases; lmscan is free, offline, and explainable. Use both if accuracy matters.
|
|
199
|
+
|
|
200
|
+
**Q: Can students use this to evade AI detection?**
|
|
201
|
+
A: lmscan shows which features trigger detection, which could help someone understand why text reads as AI-generated. This is by design — understanding AI writing patterns makes everyone a better writer. The same information is available in published research papers.
|
|
202
|
+
|
|
203
|
+
**Q: Does it work on non-English text?**
|
|
204
|
+
A: Currently English-only. The slop word lists and transition word lists are English-specific. Statistical features (entropy, burstiness) work across languages but haven't been calibrated.
|
|
205
|
+
|
|
206
|
+
**Q: Does it phone home?**
|
|
207
|
+
A: No. Zero network requests. No telemetry. No API keys. Everything runs locally.
|
|
208
|
+
|
|
209
|
+
**Q: How is model attribution possible without running the model?**
|
|
210
|
+
A: Each LLM family has characteristic vocabulary biases. GPT-4 loves "delve" and "tapestry". Claude says "I'd be happy to". These are statistical fingerprints — not guaranteed attribution, but strong signals.
|
|
211
|
+
|
|
212
|
+
## See Also
|
|
213
|
+
|
|
214
|
+
- [reverse-SynthID](https://github.com/aloshdenny/reverse-SynthID) — Reverse-engineering Google's image watermarking
|
|
215
|
+
- [vibesafe](https://github.com/stef41/vibesafe) — AI code safety scanner
|
|
216
|
+
- [injectionguard](https://github.com/stef41/injectionguard) — Prompt injection detection
|
|
217
|
+
- [vibescore](https://github.com/stef41/vibescore) — Grade your vibe-coded project
|
|
218
|
+
|
|
219
|
+
## License
|
|
220
|
+
|
|
221
|
+
Apache-2.0
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "lmscan"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Detect AI-generated text and fingerprint which LLM wrote it. Open-source GPTZero alternative. Zero dependencies, works offline."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "Apache-2.0"}
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [{ name = "Zacharie B" }]
|
|
13
|
+
keywords = [
|
|
14
|
+
"ai-detection",
|
|
15
|
+
"ai-text-detection",
|
|
16
|
+
"gptzero",
|
|
17
|
+
"llm",
|
|
18
|
+
"chatgpt",
|
|
19
|
+
"claude",
|
|
20
|
+
"gemini",
|
|
21
|
+
"text-forensics",
|
|
22
|
+
"plagiarism",
|
|
23
|
+
"content-detection",
|
|
24
|
+
]
|
|
25
|
+
classifiers = [
|
|
26
|
+
"Development Status :: 4 - Beta",
|
|
27
|
+
"Environment :: Console",
|
|
28
|
+
"Intended Audience :: Developers",
|
|
29
|
+
"Intended Audience :: Education",
|
|
30
|
+
"Intended Audience :: Science/Research",
|
|
31
|
+
"License :: OSI Approved :: Apache Software License",
|
|
32
|
+
"Programming Language :: Python :: 3",
|
|
33
|
+
"Programming Language :: Python :: 3.9",
|
|
34
|
+
"Programming Language :: Python :: 3.10",
|
|
35
|
+
"Programming Language :: Python :: 3.11",
|
|
36
|
+
"Programming Language :: Python :: 3.12",
|
|
37
|
+
"Programming Language :: Python :: 3.13",
|
|
38
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
39
|
+
"Topic :: Text Processing :: Linguistic",
|
|
40
|
+
"Typing :: Typed",
|
|
41
|
+
]
|
|
42
|
+
dependencies = []
|
|
43
|
+
|
|
44
|
+
[project.scripts]
|
|
45
|
+
lmscan = "lmscan.cli:main"
|
|
46
|
+
|
|
47
|
+
[project.urls]
|
|
48
|
+
Homepage = "https://github.com/stef41/lmscan"
|
|
49
|
+
Repository = "https://github.com/stef41/lmscan"
|
|
50
|
+
Issues = "https://github.com/stef41/lmscan/issues"
|
|
51
|
+
Changelog = "https://github.com/stef41/lmscan/blob/main/CHANGELOG.md"
|
|
52
|
+
|
|
53
|
+
[tool.hatch.build.targets.wheel]
|
|
54
|
+
packages = ["src/lmscan"]
|
|
55
|
+
|
|
56
|
+
[tool.pytest.ini_options]
|
|
57
|
+
testpaths = ["tests"]
|
|
58
|
+
addopts = "-v --tb=short"
|
|
59
|
+
|
|
60
|
+
[tool.ruff]
|
|
61
|
+
target-version = "py39"
|
|
62
|
+
line-length = 99
|
|
63
|
+
|
|
64
|
+
[tool.ruff.lint]
|
|
65
|
+
select = ["E", "F", "W", "I", "N", "UP", "B", "SIM", "TCH"]
|
|
66
|
+
|
|
67
|
+
[tool.mypy]
|
|
68
|
+
python_version = "3.9"
|
|
69
|
+
strict = true
|
|
70
|
+
warn_return_any = true
|
|
71
|
+
warn_unused_configs = true
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
from .scanner import scan, scan_file
|
|
6
|
+
from ._types import ScanResult, TextFeatures, SentenceScore, ModelMatch
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"scan",
|
|
10
|
+
"scan_file",
|
|
11
|
+
"ScanResult",
|
|
12
|
+
"TextFeatures",
|
|
13
|
+
"SentenceScore",
|
|
14
|
+
"ModelMatch",
|
|
15
|
+
"__version__",
|
|
16
|
+
]
|