lmscan 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: ${{ matrix.python-version }}
20
+ - run: pip install -e . pytest
21
+ - run: python -m pytest tests/ -v --tb=short
@@ -0,0 +1,14 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ dist/
5
+ build/
6
+ *.egg-info/
7
+ .eggs/
8
+ *.egg
9
+ .pytest_cache/
10
+ .mypy_cache/
11
+ .ruff_cache/
12
+ .venv/
13
+ venv/
14
+ .env
@@ -0,0 +1,8 @@
1
+ - id: lmscan
2
+ name: lmscan
3
+ description: Detect AI-generated text
4
+ entry: lmscan
5
+ language: python
6
+ types: [text]
7
+ pass_filenames: true
8
+ args: ["--threshold", "0.7"]
@@ -0,0 +1,16 @@
1
+ # Changelog
2
+
3
+ ## [0.1.0] - 2025-04-10
4
+
5
+ ### Added
6
+ - Statistical AI text detection using 12 linguistic features
7
+ - Model fingerprinting for GPT-4, Claude, Gemini, Llama, Mistral
8
+ - Per-sentence analysis with individual AI probability scores
9
+ - CLI with text/file/stdin input, JSON output, threshold gating
10
+ - Python API: `scan()`, `scan_file()`
11
+ - Burstiness, entropy, Zipf deviation, vocabulary richness analysis
12
+ - AI "slop word" detection (known LLM vocabulary markers)
13
+ - Transition word ratio, readability consistency, bigram/trigram repetition
14
+ - Beautiful ASCII terminal report with feature table and model attribution
15
+ - Zero external dependencies
16
+ - 96 tests
lmscan-0.1.0/LICENSE ADDED
@@ -0,0 +1,17 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ Copyright 2025 Zacharie B
6
+
7
+ Licensed under the Apache License, Version 2.0 (the "License");
8
+ you may not use this file except in compliance with the License.
9
+ You may obtain a copy of the License at
10
+
11
+ http://www.apache.org/licenses/LICENSE-2.0
12
+
13
+ Unless required by applicable law or agreed to in writing, software
14
+ distributed under the License is distributed on an "AS IS" BASIS,
15
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ See the License for the specific language governing permissions and
17
+ limitations under the License.
lmscan-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,251 @@
1
+ Metadata-Version: 2.4
2
+ Name: lmscan
3
+ Version: 0.1.0
4
+ Summary: Detect AI-generated text and fingerprint which LLM wrote it. Open-source GPTZero alternative. Zero dependencies, works offline.
5
+ Project-URL: Homepage, https://github.com/stef41/lmscan
6
+ Project-URL: Repository, https://github.com/stef41/lmscan
7
+ Project-URL: Issues, https://github.com/stef41/lmscan/issues
8
+ Project-URL: Changelog, https://github.com/stef41/lmscan/blob/main/CHANGELOG.md
9
+ Author: Zacharie B
10
+ License: Apache-2.0
11
+ License-File: LICENSE
12
+ Keywords: ai-detection,ai-text-detection,chatgpt,claude,content-detection,gemini,gptzero,llm,plagiarism,text-forensics
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Intended Audience :: Education
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: License :: OSI Approved :: Apache Software License
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
26
+ Classifier: Topic :: Text Processing :: Linguistic
27
+ Classifier: Typing :: Typed
28
+ Requires-Python: >=3.9
29
+ Description-Content-Type: text/markdown
30
+
31
+ # 🔍 lmscan
32
+
33
+ **Detect AI-generated text. Fingerprint which LLM wrote it. Open-source GPTZero alternative.**
34
+
35
+ [![PyPI](https://img.shields.io/pypi/v/lmscan?color=blue)](https://pypi.org/project/lmscan/)
36
+ [![License](https://img.shields.io/badge/license-Apache--2.0-green)](LICENSE)
37
+ [![Python](https://img.shields.io/pypi/pyversions/lmscan)](https://pypi.org/project/lmscan/)
38
+ ![Tests](https://img.shields.io/badge/tests-96%20passed-brightgreen)
39
+
40
+ > GPTZero charges $15/month. Originality.ai charges per scan. Turnitin locks you into institutional contracts.
41
+ >
42
+ > **lmscan is free, open-source, works offline, and tells you _which_ model wrote the text.**
43
+
44
+ ```
45
+ $ lmscan "In today's rapidly evolving digital landscape, it's important
46
+ to note that artificial intelligence has become a pivotal force in
47
+ transforming how we navigate the complexities of modern life..."
48
+
49
+ 🔍 lmscan v0.1.0 — AI Text Forensics
50
+ ══════════════════════════════════════════════════
51
+
52
+ Verdict: 🤖 Likely AI (77% confidence)
53
+ Words: 184
54
+ Sentences: 10
55
+ Scanned in 0.01s
56
+
57
+ ┌────────────────────────────┬──────────┬────────────────────┐
58
+ │ Feature │ Value │ Signal │
59
+ ├────────────────────────────┼──────────┼────────────────────┤
60
+ │ Burstiness │ 0.07 │ 🔴 Very low (AI) │
61
+ │ Sentence length variance │ 0.27 │ 🟡 Below average │
62
+ │ Slop word density │ 20.7% │ 🔴 High (AI) │
63
+ │ Transition word ratio │ 2.2% │ 🟡 Elevated │
64
+ │ Readability consistency │ 0.00 │ 🔴 Very low (AI) │
65
+ │ ... │ │ │
66
+ └────────────────────────────┴──────────┴────────────────────┘
67
+
68
+ 🔎 Model Attribution
69
+ 1. GPT-4 / ChatGPT 62% — "delve", "tapestry", "beacon", "landscape" (×2), +19 more
70
+ 2. Claude (Anthropic) 13% — "robust", "nuanced", "comprehensive"
71
+ 3. Gemini (Google) 9% — "furthermore", "additionally"
72
+
73
+ ⚠️ Flags
74
+ • Very low burstiness (0.07) — AI text is more uniform in complexity
75
+ • High slop word density (20.7%) — contains known AI vocabulary markers
76
+ ```
77
+
78
+ ## Install
79
+
80
+ ```bash
81
+ pip install lmscan
82
+ ```
83
+
84
+ **Zero dependencies.** Works with Python 3.9+. No API keys. No internet. No GPU.
85
+
86
+ ## Usage
87
+
88
+ ```bash
89
+ # Scan text directly
90
+ lmscan "Your text here..."
91
+
92
+ # Scan a file
93
+ lmscan document.txt
94
+
95
+ # Pipe from stdin
96
+ cat essay.txt | lmscan -
97
+
98
+ # JSON output (for scripts and CI)
99
+ lmscan document.txt --format json
100
+
101
+ # Per-sentence breakdown
102
+ lmscan document.txt --sentences
103
+
104
+ # CI gate: fail if AI probability > 50%
105
+ lmscan submission.txt --threshold 0.5
106
+ ```
107
+
108
+ ### Python API
109
+
110
+ ```python
111
+ from lmscan import scan
112
+
113
+ result = scan("Text to analyze...")
114
+
115
+ print(f"AI probability: {result.ai_probability:.0%}")
116
+ print(f"Verdict: {result.verdict}")
117
+ print(f"Confidence: {result.confidence}")
118
+
119
+ # Which model wrote it?
120
+ for model in result.model_attribution:
121
+ print(f" {model.model}: {model.confidence:.0%}")
122
+ for evidence in model.evidence[:3]:
123
+ print(f" → {evidence}")
124
+
125
+ # Per-sentence analysis
126
+ for sentence in result.sentence_scores:
127
+ if sentence.ai_probability > 0.7:
128
+ print(f" 🤖 {sentence.text[:60]}... ({sentence.ai_probability:.0%})")
129
+ ```
130
+
131
+ ### Scan entire directories
132
+
133
+ ```python
134
+ from lmscan import scan_file
135
+ import glob
136
+
137
+ for path in glob.glob("submissions/*.txt"):
138
+ result = scan_file(path)
139
+ print(f"{path}: {result.verdict} ({result.ai_probability:.0%})")
140
+ ```
141
+
142
+ ## How It Works
143
+
144
+ lmscan uses **12 statistical features** derived from computational linguistics research to distinguish AI-generated text from human writing, including:
145
+
146
+ | Feature | What it measures | AI signal |
147
+ |---------|-----------------|-----------|
148
+ | **Burstiness** | Variance in sentence complexity | AI text is unusually uniform |
149
+ | **Sentence length variance** | How much sentence lengths vary | AI produces uniform lengths |
150
+ | **Vocabulary richness** | Type-token ratio (Yule's K corrected) | AI reuses words more |
151
+ | **Hapax legomena ratio** | Fraction of words appearing once | AI has fewer unique words |
152
+ | **Zipf deviation** | How word frequencies follow Zipf's law | AI deviates from natural distribution |
153
+ | **Readability consistency** | Flesch-Kincaid variance across paragraphs | AI maintains constant readability |
154
+ | **Bigram/trigram repetition** | Repeated word pairs and triples | AI repeats phrase structures |
155
+ | **Transition word ratio** | "however", "moreover", "furthermore"... | AI overuses transitions |
156
+ | **Slop word density** | Known AI vocabulary markers | "delve", "tapestry", "beacon"... |
157
+ | **Punctuation entropy** | Diversity of punctuation usage | AI is more predictable |
158
+
159
+ Each feature produces a signal via sigmoid transformation. The weighted combination produces the final AI probability.
160
+
161
+ ### Model Fingerprinting
162
+
163
+ lmscan includes vocabulary fingerprints for 5 major LLM families:
164
+
165
+ | Model | Distinctive markers |
166
+ |-------|-------------------|
167
+ | **GPT-4 / ChatGPT** | "delve", "tapestry", "landscape", "leverage", "multifaceted", "it's important to note" |
168
+ | **Claude (Anthropic)** | "certainly", "I'd be happy to", "straightforward", "I should note" |
169
+ | **Gemini (Google)** | "crucial", "here's a breakdown", "keep in mind" |
170
+ | **Llama / Meta** | "awesome", "fantastic", "hope this helps" |
171
+ | **Mistral / Mixtral** | "indeed", "moreover", "hence", "noteworthy" |
172
+
173
+ Attribution uses weighted vocabulary matching, phrase detection, and hedging pattern analysis.
174
+
175
+ ## Accuracy & Limitations
176
+
177
+ **What lmscan is good at:**
178
+ - Detecting text with strong AI stylistic patterns
179
+ - Identifying which model family generated text
180
+ - Scanning at scale (thousands of documents) with zero cost
181
+ - Providing explainable evidence (not a black box)
182
+
183
+ **What lmscan cannot do:**
184
+ - Detect AI text that has been manually edited or paraphrased
185
+ - Work reliably on very short text (<50 words)
186
+ - Detect AI text in non-English languages (English-only for now)
187
+ - Replace human judgment — use as a signal, not a verdict
188
+
189
+ **This is statistical analysis, not a neural classifier.** It detects stylistic patterns, not watermarks. It works best on unedited LLM output; accuracy drops as text is manually edited or paraphrased.
190
+
191
+ ## CI Integration
192
+
193
+ ### GitHub Actions
194
+
195
+ ```yaml
196
+ - name: AI Content Check
197
+ run: |
198
+ pip install lmscan
199
+ lmscan submission.txt --threshold 0.7 --format json
200
+ ```
201
+
202
+ ### Pre-commit
203
+
204
+ ```yaml
205
+ repos:
206
+ - repo: https://github.com/stef41/lmscan
207
+ rev: v0.1.0
208
+ hooks:
209
+ - id: lmscan
210
+ args: ["--threshold", "0.7"]
211
+ ```
212
+
213
+ ## Research Background
214
+
215
+ lmscan's approach is informed by published research on AI text detection:
216
+
217
+ - **DetectGPT** (Mitchell et al., 2023) — perturbation-based detection using log probability curvature
218
+ - **GLTR** (Gehrmann et al., 2019) — statistical visualization of token predictions
219
+ - **Binoculars** (Hans et al., 2024) — cross-model perplexity comparison
220
+ - **Zipf's Law in NLP** — word frequency distributions differ between human and AI text
221
+ - **Stylometry** — decades of authorship attribution research applied to AI forensics
222
+
223
+ lmscan takes the statistical intuitions from these papers and implements them as lightweight, dependency-free heuristics that work without requiring a reference language model.
224
+
225
+ ## FAQ
226
+
227
+ **Q: Is this as accurate as GPTZero?**
228
+ A: GPTZero uses neural classifiers trained on labeled data. lmscan uses statistical heuristics. GPTZero is more accurate on edge cases; lmscan is free, offline, and explainable. Use both if accuracy matters.
229
+
230
+ **Q: Can students use this to evade AI detection?**
231
+ A: lmscan shows which features trigger detection, which could help someone understand why text reads as AI-generated. This is by design — understanding AI writing patterns makes everyone a better writer. The same information is available in published research papers.
232
+
233
+ **Q: Does it work on non-English text?**
234
+ A: Currently English-only. The slop word lists and transition word lists are English-specific. Statistical features (entropy, burstiness) work across languages but haven't been calibrated.
235
+
236
+ **Q: Does it phone home?**
237
+ A: No. Zero network requests. No telemetry. No API keys. Everything runs locally.
238
+
239
+ **Q: How is model attribution possible without running the model?**
240
+ A: Each LLM family has characteristic vocabulary biases. GPT-4 loves "delve" and "tapestry". Claude says "I'd be happy to". These are statistical fingerprints — not guaranteed attribution, but strong signals.
241
+
242
+ ## See Also
243
+
244
+ - [reverse-SynthID](https://github.com/aloshdenny/reverse-SynthID) — Reverse-engineering Google's image watermarking
245
+ - [vibesafe](https://github.com/stef41/vibesafe) — AI code safety scanner
246
+ - [injectionguard](https://github.com/stef41/injectionguard) — Prompt injection detection
247
+ - [vibescore](https://github.com/stef41/vibescore) — Grade your vibe-coded project
248
+
249
+ ## License
250
+
251
+ Apache-2.0
lmscan-0.1.0/README.md ADDED
@@ -0,0 +1,221 @@
1
+ # 🔍 lmscan
2
+
3
+ **Detect AI-generated text. Fingerprint which LLM wrote it. Open-source GPTZero alternative.**
4
+
5
+ [![PyPI](https://img.shields.io/pypi/v/lmscan?color=blue)](https://pypi.org/project/lmscan/)
6
+ [![License](https://img.shields.io/badge/license-Apache--2.0-green)](LICENSE)
7
+ [![Python](https://img.shields.io/pypi/pyversions/lmscan)](https://pypi.org/project/lmscan/)
8
+ ![Tests](https://img.shields.io/badge/tests-96%20passed-brightgreen)
9
+
10
+ > GPTZero charges $15/month. Originality.ai charges per scan. Turnitin locks you into institutional contracts.
11
+ >
12
+ > **lmscan is free, open-source, works offline, and tells you _which_ model wrote the text.**
13
+
14
+ ```
15
+ $ lmscan "In today's rapidly evolving digital landscape, it's important
16
+ to note that artificial intelligence has become a pivotal force in
17
+ transforming how we navigate the complexities of modern life..."
18
+
19
+ 🔍 lmscan v0.1.0 — AI Text Forensics
20
+ ══════════════════════════════════════════════════
21
+
22
+ Verdict: 🤖 Likely AI (77% confidence)
23
+ Words: 184
24
+ Sentences: 10
25
+ Scanned in 0.01s
26
+
27
+ ┌────────────────────────────┬──────────┬────────────────────┐
28
+ │ Feature │ Value │ Signal │
29
+ ├────────────────────────────┼──────────┼────────────────────┤
30
+ │ Burstiness │ 0.07 │ 🔴 Very low (AI) │
31
+ │ Sentence length variance │ 0.27 │ 🟡 Below average │
32
+ │ Slop word density │ 20.7% │ 🔴 High (AI) │
33
+ │ Transition word ratio │ 2.2% │ 🟡 Elevated │
34
+ │ Readability consistency │ 0.00 │ 🔴 Very low (AI) │
35
+ │ ... │ │ │
36
+ └────────────────────────────┴──────────┴────────────────────┘
37
+
38
+ 🔎 Model Attribution
39
+ 1. GPT-4 / ChatGPT 62% — "delve", "tapestry", "beacon", "landscape" (×2), +19 more
40
+ 2. Claude (Anthropic) 13% — "robust", "nuanced", "comprehensive"
41
+ 3. Gemini (Google) 9% — "furthermore", "additionally"
42
+
43
+ ⚠️ Flags
44
+ • Very low burstiness (0.07) — AI text is more uniform in complexity
45
+ • High slop word density (20.7%) — contains known AI vocabulary markers
46
+ ```
47
+
48
+ ## Install
49
+
50
+ ```bash
51
+ pip install lmscan
52
+ ```
53
+
54
+ **Zero dependencies.** Works with Python 3.9+. No API keys. No internet. No GPU.
55
+
56
+ ## Usage
57
+
58
+ ```bash
59
+ # Scan text directly
60
+ lmscan "Your text here..."
61
+
62
+ # Scan a file
63
+ lmscan document.txt
64
+
65
+ # Pipe from stdin
66
+ cat essay.txt | lmscan -
67
+
68
+ # JSON output (for scripts and CI)
69
+ lmscan document.txt --format json
70
+
71
+ # Per-sentence breakdown
72
+ lmscan document.txt --sentences
73
+
74
+ # CI gate: fail if AI probability > 50%
75
+ lmscan submission.txt --threshold 0.5
76
+ ```
77
+
78
+ ### Python API
79
+
80
+ ```python
81
+ from lmscan import scan
82
+
83
+ result = scan("Text to analyze...")
84
+
85
+ print(f"AI probability: {result.ai_probability:.0%}")
86
+ print(f"Verdict: {result.verdict}")
87
+ print(f"Confidence: {result.confidence}")
88
+
89
+ # Which model wrote it?
90
+ for model in result.model_attribution:
91
+ print(f" {model.model}: {model.confidence:.0%}")
92
+ for evidence in model.evidence[:3]:
93
+ print(f" → {evidence}")
94
+
95
+ # Per-sentence analysis
96
+ for sentence in result.sentence_scores:
97
+ if sentence.ai_probability > 0.7:
98
+ print(f" 🤖 {sentence.text[:60]}... ({sentence.ai_probability:.0%})")
99
+ ```
100
+
101
+ ### Scan entire directories
102
+
103
+ ```python
104
+ from lmscan import scan_file
105
+ import glob
106
+
107
+ for path in glob.glob("submissions/*.txt"):
108
+ result = scan_file(path)
109
+ print(f"{path}: {result.verdict} ({result.ai_probability:.0%})")
110
+ ```
111
+
112
+ ## How It Works
113
+
114
+ lmscan uses **12 statistical features** derived from computational linguistics research to distinguish AI-generated text from human writing, including:
115
+
116
+ | Feature | What it measures | AI signal |
117
+ |---------|-----------------|-----------|
118
+ | **Burstiness** | Variance in sentence complexity | AI text is unusually uniform |
119
+ | **Sentence length variance** | How much sentence lengths vary | AI produces uniform lengths |
120
+ | **Vocabulary richness** | Type-token ratio (Yule's K corrected) | AI reuses words more |
121
+ | **Hapax legomena ratio** | Fraction of words appearing once | AI has fewer unique words |
122
+ | **Zipf deviation** | How word frequencies follow Zipf's law | AI deviates from natural distribution |
123
+ | **Readability consistency** | Flesch-Kincaid variance across paragraphs | AI maintains constant readability |
124
+ | **Bigram/trigram repetition** | Repeated word pairs and triples | AI repeats phrase structures |
125
+ | **Transition word ratio** | "however", "moreover", "furthermore"... | AI overuses transitions |
126
+ | **Slop word density** | Known AI vocabulary markers | "delve", "tapestry", "beacon"... |
127
+ | **Punctuation entropy** | Diversity of punctuation usage | AI is more predictable |
128
+
129
+ Each feature produces a signal via sigmoid transformation. The weighted combination produces the final AI probability.
130
+
131
+ ### Model Fingerprinting
132
+
133
+ lmscan includes vocabulary fingerprints for 5 major LLM families:
134
+
135
+ | Model | Distinctive markers |
136
+ |-------|-------------------|
137
+ | **GPT-4 / ChatGPT** | "delve", "tapestry", "landscape", "leverage", "multifaceted", "it's important to note" |
138
+ | **Claude (Anthropic)** | "certainly", "I'd be happy to", "straightforward", "I should note" |
139
+ | **Gemini (Google)** | "crucial", "here's a breakdown", "keep in mind" |
140
+ | **Llama / Meta** | "awesome", "fantastic", "hope this helps" |
141
+ | **Mistral / Mixtral** | "indeed", "moreover", "hence", "noteworthy" |
142
+
143
+ Attribution uses weighted vocabulary matching, phrase detection, and hedging pattern analysis.
144
+
145
+ ## Accuracy & Limitations
146
+
147
+ **What lmscan is good at:**
148
+ - Detecting text with strong AI stylistic patterns
149
+ - Identifying which model family generated text
150
+ - Scanning at scale (thousands of documents) with zero cost
151
+ - Providing explainable evidence (not a black box)
152
+
153
+ **What lmscan cannot do:**
154
+ - Detect AI text that has been manually edited or paraphrased
155
+ - Work reliably on very short text (<50 words)
156
+ - Detect AI text in non-English languages (English-only for now)
157
+ - Replace human judgment — use as a signal, not a verdict
158
+
159
+ **This is statistical analysis, not a neural classifier.** It detects stylistic patterns, not watermarks. It works best on unedited LLM output; accuracy drops as text is manually edited or paraphrased.
160
+
161
+ ## CI Integration
162
+
163
+ ### GitHub Actions
164
+
165
+ ```yaml
166
+ - name: AI Content Check
167
+ run: |
168
+ pip install lmscan
169
+ lmscan submission.txt --threshold 0.7 --format json
170
+ ```
171
+
172
+ ### Pre-commit
173
+
174
+ ```yaml
175
+ repos:
176
+ - repo: https://github.com/stef41/lmscan
177
+ rev: v0.1.0
178
+ hooks:
179
+ - id: lmscan
180
+ args: ["--threshold", "0.7"]
181
+ ```
182
+
183
+ ## Research Background
184
+
185
+ lmscan's approach is informed by published research on AI text detection:
186
+
187
+ - **DetectGPT** (Mitchell et al., 2023) — perturbation-based detection using log probability curvature
188
+ - **GLTR** (Gehrmann et al., 2019) — statistical visualization of token predictions
189
+ - **Binoculars** (Hans et al., 2024) — cross-model perplexity comparison
190
+ - **Zipf's Law in NLP** — word frequency distributions differ between human and AI text
191
+ - **Stylometry** — decades of authorship attribution research applied to AI forensics
192
+
193
+ lmscan takes the statistical intuitions from these papers and implements them as lightweight, dependency-free heuristics that work without requiring a reference language model.
194
+
195
+ ## FAQ
196
+
197
+ **Q: Is this as accurate as GPTZero?**
198
+ A: GPTZero uses neural classifiers trained on labeled data. lmscan uses statistical heuristics. GPTZero is more accurate on edge cases; lmscan is free, offline, and explainable. Use both if accuracy matters.
199
+
200
+ **Q: Can students use this to evade AI detection?**
201
+ A: lmscan shows which features trigger detection, which could help someone understand why text reads as AI-generated. This is by design — understanding AI writing patterns makes everyone a better writer. The same information is available in published research papers.
202
+
203
+ **Q: Does it work on non-English text?**
204
+ A: Currently English-only. The slop word lists and transition word lists are English-specific. Statistical features (entropy, burstiness) work across languages but haven't been calibrated.
205
+
206
+ **Q: Does it phone home?**
207
+ A: No. Zero network requests. No telemetry. No API keys. Everything runs locally.
208
+
209
+ **Q: How is model attribution possible without running the model?**
210
+ A: Each LLM family has characteristic vocabulary biases. GPT-4 loves "delve" and "tapestry". Claude says "I'd be happy to". These are statistical fingerprints — not guaranteed attribution, but strong signals.
211
+
212
+ ## See Also
213
+
214
+ - [reverse-SynthID](https://github.com/aloshdenny/reverse-SynthID) — Reverse-engineering Google's image watermarking
215
+ - [vibesafe](https://github.com/stef41/vibesafe) — AI code safety scanner
216
+ - [injectionguard](https://github.com/stef41/injectionguard) — Prompt injection detection
217
+ - [vibescore](https://github.com/stef41/vibescore) — Grade your vibe-coded project
218
+
219
+ ## License
220
+
221
+ Apache-2.0
@@ -0,0 +1,71 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "lmscan"
7
+ version = "0.1.0"
8
+ description = "Detect AI-generated text and fingerprint which LLM wrote it. Open-source GPTZero alternative. Zero dependencies, works offline."
9
+ readme = "README.md"
10
+ license = {text = "Apache-2.0"}
11
+ requires-python = ">=3.9"
12
+ authors = [{ name = "Zacharie B" }]
13
+ keywords = [
14
+ "ai-detection",
15
+ "ai-text-detection",
16
+ "gptzero",
17
+ "llm",
18
+ "chatgpt",
19
+ "claude",
20
+ "gemini",
21
+ "text-forensics",
22
+ "plagiarism",
23
+ "content-detection",
24
+ ]
25
+ classifiers = [
26
+ "Development Status :: 4 - Beta",
27
+ "Environment :: Console",
28
+ "Intended Audience :: Developers",
29
+ "Intended Audience :: Education",
30
+ "Intended Audience :: Science/Research",
31
+ "License :: OSI Approved :: Apache Software License",
32
+ "Programming Language :: Python :: 3",
33
+ "Programming Language :: Python :: 3.9",
34
+ "Programming Language :: Python :: 3.10",
35
+ "Programming Language :: Python :: 3.11",
36
+ "Programming Language :: Python :: 3.12",
37
+ "Programming Language :: Python :: 3.13",
38
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
39
+ "Topic :: Text Processing :: Linguistic",
40
+ "Typing :: Typed",
41
+ ]
42
+ dependencies = []
43
+
44
+ [project.scripts]
45
+ lmscan = "lmscan.cli:main"
46
+
47
+ [project.urls]
48
+ Homepage = "https://github.com/stef41/lmscan"
49
+ Repository = "https://github.com/stef41/lmscan"
50
+ Issues = "https://github.com/stef41/lmscan/issues"
51
+ Changelog = "https://github.com/stef41/lmscan/blob/main/CHANGELOG.md"
52
+
53
+ [tool.hatch.build.targets.wheel]
54
+ packages = ["src/lmscan"]
55
+
56
+ [tool.pytest.ini_options]
57
+ testpaths = ["tests"]
58
+ addopts = "-v --tb=short"
59
+
60
+ [tool.ruff]
61
+ target-version = "py39"
62
+ line-length = 99
63
+
64
+ [tool.ruff.lint]
65
+ select = ["E", "F", "W", "I", "N", "UP", "B", "SIM", "TCH"]
66
+
67
+ [tool.mypy]
68
+ python_version = "3.9"
69
+ strict = true
70
+ warn_return_any = true
71
+ warn_unused_configs = true
@@ -0,0 +1,16 @@
1
+ from __future__ import annotations
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from .scanner import scan, scan_file
6
+ from ._types import ScanResult, TextFeatures, SentenceScore, ModelMatch
7
+
8
+ __all__ = [
9
+ "scan",
10
+ "scan_file",
11
+ "ScanResult",
12
+ "TextFeatures",
13
+ "SentenceScore",
14
+ "ModelMatch",
15
+ "__version__",
16
+ ]