exposurecheck 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exposurecheck-0.1.0/LICENSE +21 -0
- exposurecheck-0.1.0/PKG-INFO +217 -0
- exposurecheck-0.1.0/README.md +194 -0
- exposurecheck-0.1.0/exposurecheck/__init__.py +11 -0
- exposurecheck-0.1.0/exposurecheck/__main__.py +4 -0
- exposurecheck-0.1.0/exposurecheck/audit.py +44 -0
- exposurecheck-0.1.0/exposurecheck/backends/__init__.py +57 -0
- exposurecheck-0.1.0/exposurecheck/backends/_mask.py +18 -0
- exposurecheck-0.1.0/exposurecheck/backends/base.py +52 -0
- exposurecheck-0.1.0/exposurecheck/backends/heuristic.py +102 -0
- exposurecheck-0.1.0/exposurecheck/backends/llm.py +205 -0
- exposurecheck-0.1.0/exposurecheck/backends/transports.py +114 -0
- exposurecheck-0.1.0/exposurecheck/cascade/__init__.py +6 -0
- exposurecheck-0.1.0/exposurecheck/cascade/deterministic.py +125 -0
- exposurecheck-0.1.0/exposurecheck/cascade/pipeline.py +74 -0
- exposurecheck-0.1.0/exposurecheck/cascade/prefilter.py +35 -0
- exposurecheck-0.1.0/exposurecheck/cascade/summarize.py +45 -0
- exposurecheck-0.1.0/exposurecheck/cli.py +197 -0
- exposurecheck-0.1.0/exposurecheck/metadata/__init__.py +4 -0
- exposurecheck-0.1.0/exposurecheck/metadata/exif.py +221 -0
- exposurecheck-0.1.0/exposurecheck/models.py +209 -0
- exposurecheck-0.1.0/exposurecheck/output/__init__.py +6 -0
- exposurecheck-0.1.0/exposurecheck/output/interactive.py +70 -0
- exposurecheck-0.1.0/exposurecheck/output/report.py +106 -0
- exposurecheck-0.1.0/exposurecheck/parsers/__init__.py +17 -0
- exposurecheck-0.1.0/exposurecheck/parsers/_source.py +86 -0
- exposurecheck-0.1.0/exposurecheck/parsers/_util.py +21 -0
- exposurecheck-0.1.0/exposurecheck/parsers/reddit.py +103 -0
- exposurecheck-0.1.0/exposurecheck/parsers/twitter.py +157 -0
- exposurecheck-0.1.0/exposurecheck/remediation/__init__.py +5 -0
- exposurecheck-0.1.0/exposurecheck/remediation/advise.py +24 -0
- exposurecheck-0.1.0/exposurecheck/risk/__init__.py +7 -0
- exposurecheck-0.1.0/exposurecheck/risk/card.py +39 -0
- exposurecheck-0.1.0/exposurecheck/risk/categories.py +118 -0
- exposurecheck-0.1.0/exposurecheck/risk/scoring.py +50 -0
- exposurecheck-0.1.0/exposurecheck/safety/__init__.py +6 -0
- exposurecheck-0.1.0/exposurecheck/safety/consent.py +38 -0
- exposurecheck-0.1.0/exposurecheck/safety/offline.py +60 -0
- exposurecheck-0.1.0/exposurecheck/safety/warnings.py +41 -0
- exposurecheck-0.1.0/exposurecheck.egg-info/PKG-INFO +217 -0
- exposurecheck-0.1.0/exposurecheck.egg-info/SOURCES.txt +53 -0
- exposurecheck-0.1.0/exposurecheck.egg-info/dependency_links.txt +1 -0
- exposurecheck-0.1.0/exposurecheck.egg-info/entry_points.txt +2 -0
- exposurecheck-0.1.0/exposurecheck.egg-info/requires.txt +3 -0
- exposurecheck-0.1.0/exposurecheck.egg-info/top_level.txt +1 -0
- exposurecheck-0.1.0/pyproject.toml +45 -0
- exposurecheck-0.1.0/setup.cfg +4 -0
- exposurecheck-0.1.0/tests/test_cascade.py +56 -0
- exposurecheck-0.1.0/tests/test_cli_errors.py +31 -0
- exposurecheck-0.1.0/tests/test_exif_hardening.py +44 -0
- exposurecheck-0.1.0/tests/test_llm_backend.py +76 -0
- exposurecheck-0.1.0/tests/test_no_dossier.py +62 -0
- exposurecheck-0.1.0/tests/test_offline.py +30 -0
- exposurecheck-0.1.0/tests/test_parsers.py +76 -0
- exposurecheck-0.1.0/tests/test_safety.py +37 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Cora Aegis
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: exposurecheck
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Audit your own social-media export for re-identification (mosaic) risk. Local-first, no-dossier, bring-your-own-LLM.
|
|
5
|
+
Author: Cora Aegis
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://cypherpunkguide.com
|
|
8
|
+
Project-URL: Repository, https://github.com/coraaegis/exposurecheck
|
|
9
|
+
Project-URL: Issues, https://github.com/coraaegis/exposurecheck/issues
|
|
10
|
+
Keywords: privacy,opsec,deanonymization,re-identification,osint-defense,mosaic,reddit,twitter,self-audit
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Topic :: Security
|
|
16
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# ExposureCheck
|
|
25
|
+
|
|
26
|
+
**Audit your own social-media history for re-identification risk — before someone
|
|
27
|
+
else does it to you.**
|
|
28
|
+
|
|
29
|
+
Modern language models can read a few hundred of your ordinary public posts and
|
|
30
|
+
infer where you live, where you work, your routine, your family, and link a
|
|
31
|
+
pseudonymous account back to your real name — not from one careless post, but
|
|
32
|
+
from the *mosaic* of many individually-innocuous ones. Researchers have shown
|
|
33
|
+
this works at scale and with unsettling accuracy. `exposurecheck` runs that same
|
|
34
|
+
adversarial reading **on your own export, on your own terms**, and shows you
|
|
35
|
+
which of *your* posts to generalise or edit.
|
|
36
|
+
|
|
37
|
+
It is **local-first**, produces **no dossier**, and never writes a profile of you
|
|
38
|
+
to disk.
|
|
39
|
+
|
|
40
|
+
📖 Plain-language explainer of the threat:
|
|
41
|
+
**https://cypherpunkguide.com/privacy/social-media-self-audit/** (the companion
|
|
42
|
+
article — read it first if "mosaic re-identification" is new to you).
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Demo
|
|
47
|
+
|
|
48
|
+

|
|
49
|
+
|
|
50
|
+
*A ~12-second run on the bundled sample export using the offline `--backend
|
|
51
|
+
heuristic` stub — reproducible: clone the repo and run the exact command shown,
|
|
52
|
+
and nothing leaves your machine (`--offline` hard-blocks egress). A real
|
|
53
|
+
`--backend local` or `cloud` model surfaces far more; the full run also scores
|
|
54
|
+
EMPLOYER, FINANCES, FAMILY and SCHEDULE. (Recorded as an
|
|
55
|
+
[asciicast](media/demo.cast).)*
|
|
56
|
+
|
|
57
|
+
## What it does
|
|
58
|
+
|
|
59
|
+
- Parses your **Reddit** GDPR export and your **X / Twitter** export (directory or `.zip`).
|
|
60
|
+
- Runs a **recall-preserving cascade**: a cheap pass ranks every post, an
|
|
61
|
+
expensive pass reads the high-priority ones, and weak signals are *kept* — the
|
|
62
|
+
mosaic is built from weak signals, so throwing them away would be false comfort.
|
|
63
|
+
- Extracts the **metadata layer** deterministically (this is where X leaks most):
|
|
64
|
+
the self-set location field, outbound links, image **EXIF/GPS**, device model,
|
|
65
|
+
and posting-time concentration that betrays your timezone.
|
|
66
|
+
- Reports **category risk cards** (Location, Employer, Family, Schedule,
|
|
67
|
+
Finances, Account-linkage, …) ranked by **risk contribution**, each with masked
|
|
68
|
+
examples and concrete, *generalise-first* remediation.
|
|
69
|
+
|
|
70
|
+
## What it deliberately does **not** do
|
|
71
|
+
|
|
72
|
+
- ❌ No dossier. It never prints "you live in X / work at Y / your name is Z".
|
|
73
|
+
Cards show **masked** snippets; the resolved value only ever appears when *you*
|
|
74
|
+
click through to *your own* original post, **in-session, never saved**.
|
|
75
|
+
- ❌ No export of findings. ❌ No scraping (export input only). ❌ No posting/
|
|
76
|
+
deletion on your behalf. ❌ No analysing anyone else's history.
|
|
77
|
+
- ❌ It does not make you anonymous. It **reduces** risk. "Low" is not "safe".
|
|
78
|
+
|
|
79
|
+
## Bring your own model — cloud **or** local
|
|
80
|
+
|
|
81
|
+
The inference runs on a backend **you** choose:
|
|
82
|
+
|
|
83
|
+
| backend | what it is | data leaves your machine? |
|
|
84
|
+
|---|---|---|
|
|
85
|
+
| `local` | a local Ollama (or llama.cpp/LM Studio) model | **no** |
|
|
86
|
+
| `cloud` | any OpenAI-compatible endpoint, your own key | **yes** |
|
|
87
|
+
| `heuristic` | offline regex stub, **near-zero recall** | no — *dev/CI only, not an audit* |
|
|
88
|
+
|
|
89
|
+
### ⚠️ The one cloud caveat that actually matters
|
|
90
|
+
|
|
91
|
+
If the account you are auditing is a **pseudonymous** one you keep separate from
|
|
92
|
+
your real identity, **and** your AI/cloud account is registered under your real
|
|
93
|
+
name or paid with a real-name method, then sending your history to the cloud lets
|
|
94
|
+
the provider link *real identity ↔ anonymous account* on their side (subpoena,
|
|
95
|
+
breach, insider). That is the exact deanonymization this tool exists to prevent.
|
|
96
|
+
|
|
97
|
+
So: **auditing a strictly-anonymous account → use `--backend local`** (or a cloud
|
|
98
|
+
account opened and paid for anonymously). Auditing your real-name / public
|
|
99
|
+
account → cloud is fine. The CLI states this and requires acknowledgement when it
|
|
100
|
+
applies. We never force local (that would shrink the audience to nobody); we make
|
|
101
|
+
the trade-off explicit.
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Install
|
|
106
|
+
|
|
107
|
+
Core (parsing, EXIF, cascade, **and** the cloud/local HTTP backends) is **Python
|
|
108
|
+
standard library only** — no third-party code touches your export.
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
# pipx — isolated, recommended (works today; a PyPI release is coming):
|
|
112
|
+
pipx install git+https://github.com/coraaegis/exposurecheck
|
|
113
|
+
|
|
114
|
+
# or from source:
|
|
115
|
+
git clone https://github.com/coraaegis/exposurecheck && cd exposurecheck && pip install -e .
|
|
116
|
+
|
|
117
|
+
# or run without installing:
|
|
118
|
+
python -m exposurecheck --help
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
ExposureCheck is a **command-line tool** today, aimed at people comfortable with a
|
|
122
|
+
terminal — which is also where its first reviewers live (GitHub, Hacker News,
|
|
123
|
+
privacy forums). A **one-click app for non-technical users** — a packaged build
|
|
124
|
+
with a local, in-browser UI and no Python to install — is the next milestone (see
|
|
125
|
+
[Status](#status)). The CLI stays for power users.
|
|
126
|
+
|
|
127
|
+
### Verifying a release
|
|
128
|
+
|
|
129
|
+
Releases are **not signed with an identity code-signing certificate** — the author
|
|
130
|
+
is pseudonymous, and such a certificate would tie the project to a legal identity,
|
|
131
|
+
the opposite of the point. Authenticity is cryptographic and verifiable instead:
|
|
132
|
+
|
|
133
|
+
- Each release is **PGP-signed** by Cora Aegis. Fetch the key via WKD
|
|
134
|
+
(`gpg --locate-keys cora@cypherpunkguide.com`), then
|
|
135
|
+
`gpg --verify exposurecheck-<version>.tar.gz.asc`.
|
|
136
|
+
- **SHA-256 checksums** are published with every release.
|
|
137
|
+
- Builds are **reproducible** — rebuild from the tagged source and confirm the
|
|
138
|
+
artifact matches.
|
|
139
|
+
- Prefer a **package manager** (pip / Scoop / Homebrew) over a downloaded `.exe`.
|
|
140
|
+
An unsigned Windows binary may show a SmartScreen "unknown publisher" prompt;
|
|
141
|
+
that is expected — verify the PGP signature or run from source.
|
|
142
|
+
|
|
143
|
+
## Usage
|
|
144
|
+
|
|
145
|
+
Get your data first:
|
|
146
|
+
- Reddit → *Settings → Privacy → Request a copy of your data* (the `.zip`).
|
|
147
|
+
- X → *Settings → Your account → Download an archive of your data*.
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
# Local model — nothing leaves your machine (recommended for anonymous accounts)
|
|
151
|
+
exposurecheck audit \
|
|
152
|
+
--reddit ./reddit_export.zip \
|
|
153
|
+
--twitter ./twitter_export \
|
|
154
|
+
--backend local --expensive-model llama3.1 \
|
|
155
|
+
--i-own-this-data
|
|
156
|
+
|
|
157
|
+
# Cloud (bring your own key; set it in the ENV, never on the command line)
|
|
158
|
+
export OPENAI_API_KEY=sk-...
|
|
159
|
+
exposurecheck audit --twitter ./twitter_export --backend cloud --i-own-this-data
|
|
160
|
+
|
|
161
|
+
# See your own posts behind a category (in-session, nothing is saved)
|
|
162
|
+
exposurecheck audit --reddit ./reddit_export --backend local -i --i-own-this-data
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
The API key is read from an environment variable on purpose — command-line args
|
|
166
|
+
leak into shell history and process listings.
|
|
167
|
+
|
|
168
|
+
## Cost (cloud)
|
|
169
|
+
|
|
170
|
+
Roughly **$0.59 per profile** for ~125 posts on a GPT-4-class model; a real
|
|
171
|
+
1–3k-post history lands around **$4–15**, trimmed by the recall-preserving
|
|
172
|
+
pre-filter. Local models are free (lower accuracy — the tool warns you).
|
|
173
|
+
|
|
174
|
+
## How it works
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
export ─▶ parse ─▶ prefilter (drop only TRUE-empty) ─┬─▶ deterministic: profile + EXIF + timing ─┐
|
|
178
|
+
└─▶ cascade: cheap route ─▶ expensive read ─┤
|
|
179
|
+
▼
|
|
180
|
+
risk-contribution scoring ─▶ category cards ─▶ no-dossier report
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
See [`docs/THREAT-MODEL.md`](docs/THREAT-MODEL.md) for what is and isn't in scope,
|
|
184
|
+
and [`docs/ABUSE-EVAL.md`](docs/ABUSE-EVAL.md) for the dual-use safeguards and the
|
|
185
|
+
pre-release abuse evaluation.
|
|
186
|
+
|
|
187
|
+
## Status
|
|
188
|
+
|
|
189
|
+
`v0.1` alpha — the **CLI** runs end-to-end (Reddit + X, EXIF/GPS, the
|
|
190
|
+
recall-preserving cascade, the no-dossier report), aimed at terminal-comfortable
|
|
191
|
+
users for now.
|
|
192
|
+
|
|
193
|
+
Roadmap:
|
|
194
|
+
- A **one-click app** (packaged binary + a hardened, local-only in-browser UI, no
|
|
195
|
+
Python required) so non-technical people can use it too — the CLI stays for
|
|
196
|
+
power users.
|
|
197
|
+
- **Image *content* analysis is deliberately not in v1.** Sending images to a
|
|
198
|
+
cloud model is a serious privacy regression, so v1 extracts EXIF/metadata only
|
|
199
|
+
and says so plainly; a local-first multimodal option may come later.
|
|
200
|
+
- Real-corpus recall / false-positive evaluation (SynthPAI); more platforms
|
|
201
|
+
(Mastodon); single-post / pre-post checks.
|
|
202
|
+
|
|
203
|
+
## Security & contact
|
|
204
|
+
|
|
205
|
+
Found a privacy or safety flaw? See [`SECURITY.md`](SECURITY.md). Reach the author
|
|
206
|
+
at `cora@cypherpunkguide.com` (PGP via WKD: `gpg --locate-keys cora@cypherpunkguide.com`).
|
|
207
|
+
|
|
208
|
+
## License & official source
|
|
209
|
+
|
|
210
|
+
**MIT** — see [`LICENSE`](LICENSE). Built by **Cora Aegis**
|
|
211
|
+
([cypherpunkguide.com](https://cypherpunkguide.com)).
|
|
212
|
+
|
|
213
|
+
This repository (and the name **ExposureCheck**) is the canonical, official
|
|
214
|
+
source. You are free to fork and reuse under the MIT terms — but please don't
|
|
215
|
+
present a fork as the official project. ExposureCheck is for auditing **your
|
|
216
|
+
own** data; see [`docs/ABUSE-EVAL.md`](docs/ABUSE-EVAL.md) for the dual-use
|
|
217
|
+
safeguards that carry that intent (the licence deliberately does not — it can't).
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# ExposureCheck
|
|
2
|
+
|
|
3
|
+
**Audit your own social-media history for re-identification risk — before someone
|
|
4
|
+
else does it to you.**
|
|
5
|
+
|
|
6
|
+
Modern language models can read a few hundred of your ordinary public posts and
|
|
7
|
+
infer where you live, where you work, your routine, your family, and link a
|
|
8
|
+
pseudonymous account back to your real name — not from one careless post, but
|
|
9
|
+
from the *mosaic* of many individually-innocuous ones. Researchers have shown
|
|
10
|
+
this works at scale and with unsettling accuracy. `exposurecheck` runs that same
|
|
11
|
+
adversarial reading **on your own export, on your own terms**, and shows you
|
|
12
|
+
which of *your* posts to generalise or edit.
|
|
13
|
+
|
|
14
|
+
It is **local-first**, produces **no dossier**, and never writes a profile of you
|
|
15
|
+
to disk.
|
|
16
|
+
|
|
17
|
+
📖 Plain-language explainer of the threat:
|
|
18
|
+
**https://cypherpunkguide.com/privacy/social-media-self-audit/** (the companion
|
|
19
|
+
article — read it first if "mosaic re-identification" is new to you).
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Demo
|
|
24
|
+
|
|
25
|
+

|
|
26
|
+
|
|
27
|
+
*A ~12-second run on the bundled sample export using the offline `--backend
|
|
28
|
+
heuristic` stub — reproducible: clone the repo and run the exact command shown,
|
|
29
|
+
and nothing leaves your machine (`--offline` hard-blocks egress). A real
|
|
30
|
+
`--backend local` or `cloud` model surfaces far more; the full run also scores
|
|
31
|
+
EMPLOYER, FINANCES, FAMILY and SCHEDULE. (Recorded as an
|
|
32
|
+
[asciicast](media/demo.cast).)*
|
|
33
|
+
|
|
34
|
+
## What it does
|
|
35
|
+
|
|
36
|
+
- Parses your **Reddit** GDPR export and your **X / Twitter** export (directory or `.zip`).
|
|
37
|
+
- Runs a **recall-preserving cascade**: a cheap pass ranks every post, an
|
|
38
|
+
expensive pass reads the high-priority ones, and weak signals are *kept* — the
|
|
39
|
+
mosaic is built from weak signals, so throwing them away would be false comfort.
|
|
40
|
+
- Extracts the **metadata layer** deterministically (this is where X leaks most):
|
|
41
|
+
the self-set location field, outbound links, image **EXIF/GPS**, device model,
|
|
42
|
+
and posting-time concentration that betrays your timezone.
|
|
43
|
+
- Reports **category risk cards** (Location, Employer, Family, Schedule,
|
|
44
|
+
Finances, Account-linkage, …) ranked by **risk contribution**, each with masked
|
|
45
|
+
examples and concrete, *generalise-first* remediation.
|
|
46
|
+
|
|
47
|
+
## What it deliberately does **not** do
|
|
48
|
+
|
|
49
|
+
- ❌ No dossier. It never prints "you live in X / work at Y / your name is Z".
|
|
50
|
+
Cards show **masked** snippets; the resolved value only ever appears when *you*
|
|
51
|
+
click through to *your own* original post, **in-session, never saved**.
|
|
52
|
+
- ❌ No export of findings. ❌ No scraping (export input only). ❌ No posting/
|
|
53
|
+
deletion on your behalf. ❌ No analysing anyone else's history.
|
|
54
|
+
- ❌ It does not make you anonymous. It **reduces** risk. "Low" is not "safe".
|
|
55
|
+
|
|
56
|
+
## Bring your own model — cloud **or** local
|
|
57
|
+
|
|
58
|
+
The inference runs on a backend **you** choose:
|
|
59
|
+
|
|
60
|
+
| backend | what it is | data leaves your machine? |
|
|
61
|
+
|---|---|---|
|
|
62
|
+
| `local` | a local Ollama (or llama.cpp/LM Studio) model | **no** |
|
|
63
|
+
| `cloud` | any OpenAI-compatible endpoint, your own key | **yes** |
|
|
64
|
+
| `heuristic` | offline regex stub, **near-zero recall** | no — *dev/CI only, not an audit* |
|
|
65
|
+
|
|
66
|
+
### ⚠️ The one cloud caveat that actually matters
|
|
67
|
+
|
|
68
|
+
If the account you are auditing is a **pseudonymous** one you keep separate from
|
|
69
|
+
your real identity, **and** your AI/cloud account is registered under your real
|
|
70
|
+
name or paid with a real-name method, then sending your history to the cloud lets
|
|
71
|
+
the provider link *real identity ↔ anonymous account* on their side (subpoena,
|
|
72
|
+
breach, insider). That is the exact deanonymization this tool exists to prevent.
|
|
73
|
+
|
|
74
|
+
So: **auditing a strictly-anonymous account → use `--backend local`** (or a cloud
|
|
75
|
+
account opened and paid for anonymously). Auditing your real-name / public
|
|
76
|
+
account → cloud is fine. The CLI states this and requires acknowledgement when it
|
|
77
|
+
applies. We never force local (that would shrink the audience to nobody); we make
|
|
78
|
+
the trade-off explicit.
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Install
|
|
83
|
+
|
|
84
|
+
Core (parsing, EXIF, cascade, **and** the cloud/local HTTP backends) is **Python
|
|
85
|
+
standard library only** — no third-party code touches your export.
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# pipx — isolated, recommended (works today; a PyPI release is coming):
|
|
89
|
+
pipx install git+https://github.com/coraaegis/exposurecheck
|
|
90
|
+
|
|
91
|
+
# or from source:
|
|
92
|
+
git clone https://github.com/coraaegis/exposurecheck && cd exposurecheck && pip install -e .
|
|
93
|
+
|
|
94
|
+
# or run without installing:
|
|
95
|
+
python -m exposurecheck --help
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
ExposureCheck is a **command-line tool** today, aimed at people comfortable with a
|
|
99
|
+
terminal — which is also where its first reviewers live (GitHub, Hacker News,
|
|
100
|
+
privacy forums). A **one-click app for non-technical users** — a packaged build
|
|
101
|
+
with a local, in-browser UI and no Python to install — is the next milestone (see
|
|
102
|
+
[Status](#status)). The CLI stays for power users.
|
|
103
|
+
|
|
104
|
+
### Verifying a release
|
|
105
|
+
|
|
106
|
+
Releases are **not signed with an identity code-signing certificate** — the author
|
|
107
|
+
is pseudonymous, and such a certificate would tie the project to a legal identity,
|
|
108
|
+
the opposite of the point. Authenticity is cryptographic and verifiable instead:
|
|
109
|
+
|
|
110
|
+
- Each release is **PGP-signed** by Cora Aegis. Fetch the key via WKD
|
|
111
|
+
(`gpg --locate-keys cora@cypherpunkguide.com`), then
|
|
112
|
+
`gpg --verify exposurecheck-<version>.tar.gz.asc`.
|
|
113
|
+
- **SHA-256 checksums** are published with every release.
|
|
114
|
+
- Builds are **reproducible** — rebuild from the tagged source and confirm the
|
|
115
|
+
artifact matches.
|
|
116
|
+
- Prefer a **package manager** (pip / Scoop / Homebrew) over a downloaded `.exe`.
|
|
117
|
+
An unsigned Windows binary may show a SmartScreen "unknown publisher" prompt;
|
|
118
|
+
that is expected — verify the PGP signature or run from source.
|
|
119
|
+
|
|
120
|
+
## Usage
|
|
121
|
+
|
|
122
|
+
Get your data first:
|
|
123
|
+
- Reddit → *Settings → Privacy → Request a copy of your data* (the `.zip`).
|
|
124
|
+
- X → *Settings → Your account → Download an archive of your data*.
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# Local model — nothing leaves your machine (recommended for anonymous accounts)
|
|
128
|
+
exposurecheck audit \
|
|
129
|
+
--reddit ./reddit_export.zip \
|
|
130
|
+
--twitter ./twitter_export \
|
|
131
|
+
--backend local --expensive-model llama3.1 \
|
|
132
|
+
--i-own-this-data
|
|
133
|
+
|
|
134
|
+
# Cloud (bring your own key; set it in the ENV, never on the command line)
|
|
135
|
+
export OPENAI_API_KEY=sk-...
|
|
136
|
+
exposurecheck audit --twitter ./twitter_export --backend cloud --i-own-this-data
|
|
137
|
+
|
|
138
|
+
# See your own posts behind a category (in-session, nothing is saved)
|
|
139
|
+
exposurecheck audit --reddit ./reddit_export --backend local -i --i-own-this-data
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
The API key is read from an environment variable on purpose — command-line args
|
|
143
|
+
leak into shell history and process listings.
|
|
144
|
+
|
|
145
|
+
## Cost (cloud)
|
|
146
|
+
|
|
147
|
+
Roughly **$0.59 per profile** for ~125 posts on a GPT-4-class model; a real
|
|
148
|
+
1–3k-post history lands around **$4–15**, trimmed by the recall-preserving
|
|
149
|
+
pre-filter. Local models are free (lower accuracy — the tool warns you).
|
|
150
|
+
|
|
151
|
+
## How it works
|
|
152
|
+
|
|
153
|
+
```
|
|
154
|
+
export ─▶ parse ─▶ prefilter (drop only TRUE-empty) ─┬─▶ deterministic: profile + EXIF + timing ─┐
|
|
155
|
+
└─▶ cascade: cheap route ─▶ expensive read ─┤
|
|
156
|
+
▼
|
|
157
|
+
risk-contribution scoring ─▶ category cards ─▶ no-dossier report
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
See [`docs/THREAT-MODEL.md`](docs/THREAT-MODEL.md) for what is and isn't in scope,
|
|
161
|
+
and [`docs/ABUSE-EVAL.md`](docs/ABUSE-EVAL.md) for the dual-use safeguards and the
|
|
162
|
+
pre-release abuse evaluation.
|
|
163
|
+
|
|
164
|
+
## Status
|
|
165
|
+
|
|
166
|
+
`v0.1` alpha — the **CLI** runs end-to-end (Reddit + X, EXIF/GPS, the
|
|
167
|
+
recall-preserving cascade, the no-dossier report), aimed at terminal-comfortable
|
|
168
|
+
users for now.
|
|
169
|
+
|
|
170
|
+
Roadmap:
|
|
171
|
+
- A **one-click app** (packaged binary + a hardened, local-only in-browser UI, no
|
|
172
|
+
Python required) so non-technical people can use it too — the CLI stays for
|
|
173
|
+
power users.
|
|
174
|
+
- **Image *content* analysis is deliberately not in v1.** Sending images to a
|
|
175
|
+
cloud model is a serious privacy regression, so v1 extracts EXIF/metadata only
|
|
176
|
+
and says so plainly; a local-first multimodal option may come later.
|
|
177
|
+
- Real-corpus recall / false-positive evaluation (SynthPAI); more platforms
|
|
178
|
+
(Mastodon); single-post / pre-post checks.
|
|
179
|
+
|
|
180
|
+
## Security & contact
|
|
181
|
+
|
|
182
|
+
Found a privacy or safety flaw? See [`SECURITY.md`](SECURITY.md). Reach the author
|
|
183
|
+
at `cora@cypherpunkguide.com` (PGP via WKD: `gpg --locate-keys cora@cypherpunkguide.com`).
|
|
184
|
+
|
|
185
|
+
## License & official source
|
|
186
|
+
|
|
187
|
+
**MIT** — see [`LICENSE`](LICENSE). Built by **Cora Aegis**
|
|
188
|
+
([cypherpunkguide.com](https://cypherpunkguide.com)).
|
|
189
|
+
|
|
190
|
+
This repository (and the name **ExposureCheck**) is the canonical, official
|
|
191
|
+
source. You are free to fork and reuse under the MIT terms — but please don't
|
|
192
|
+
present a fork as the official project. ExposureCheck is for auditing **your
|
|
193
|
+
own** data; see [`docs/ABUSE-EVAL.md`](docs/ABUSE-EVAL.md) for the dual-use
|
|
194
|
+
safeguards that carry that intent (the licence deliberately does not — it can't).
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""exposurecheck — audit your own social-media export for re-identification risk.
|
|
2
|
+
|
|
3
|
+
Local-first. No-dossier. Bring-your-own-LLM (cloud BYOK or local).
|
|
4
|
+
|
|
5
|
+
This package never phones home, never scrapes, and never writes a synthesized
|
|
6
|
+
profile of you to disk. It shows you, by category, what a *mosaic* re-identification
|
|
7
|
+
attack could reconstruct from the public history you already published — and points
|
|
8
|
+
you back at your own posts so you can edit or generalize them.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Top-level orchestration: parsed exports + a backend -> AuditResult."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Callable, Optional
|
|
6
|
+
|
|
7
|
+
from .backends.base import Backend
|
|
8
|
+
from .cascade import run_cascade
|
|
9
|
+
from .models import AuditResult, Export
|
|
10
|
+
from .risk import build_cards
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def run_audit(
    exports: list[Export],
    backend: Backend,
    *,
    candidate_fraction: float = 1.0,
    max_candidates: Optional[int] = None,
    batch_size: int = 10,
    progress: Optional[Callable[[int, int], None]] = None,
) -> AuditResult:
    """Run the cascade over the parsed exports and package the outcome.

    The tuning arguments (``candidate_fraction``, ``max_candidates``,
    ``batch_size``, ``progress``) are forwarded to ``run_cascade`` untouched;
    this function only wires the stages together.

    Args:
        exports: Parsed exports to audit.
        backend: Inference backend handed to the cascade; its ``name`` is
            recorded on the result.
        candidate_fraction: Passed through to ``run_cascade``.
        max_candidates: Passed through to ``run_cascade``.
        batch_size: Passed through to ``run_cascade``.
        progress: Optional callback passed through to ``run_cascade``.

    Returns:
        An ``AuditResult`` carrying the category cards built from the cascade
        findings, the findings themselves, and the cascade's counters.
    """
    cascade_kwargs = {
        "candidate_fraction": candidate_fraction,
        "max_candidates": max_candidates,
        "batch_size": batch_size,
        "progress": progress,
    }
    outcome = run_cascade(exports, backend, **cascade_kwargs)
    cards = build_cards(outcome.findings)

    # One platform entry per export, in input order (duplicates are kept).
    platforms = [ex.platform for ex in exports]

    # Bookkeeping counters copied from the cascade outcome, plus the total
    # number of media items across all exports.
    meta = {
        "dropped": outcome.dropped_count,
        "kept": outcome.kept_count,
        "not_analyzed": outcome.not_analyzed_count,
        "raw": outcome.raw_count,
        "media_count": sum(len(ex.media) for ex in exports),
    }

    return AuditResult(
        cards=cards,
        findings=outcome.findings,
        backend_name=backend.name,
        post_count=outcome.post_count,
        candidate_count=outcome.candidate_count,
        platforms=platforms,
        meta=meta,
    )
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Pluggable inference backends and a small factory.
|
|
2
|
+
|
|
3
|
+
heuristic - offline regex stub (no key, low recall — dev/CI/demo only)
|
|
4
|
+
cloud - OpenAI-compatible endpoint, bring-your-own-key (sends data offsite)
|
|
5
|
+
local - local Ollama server (no network egress)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
from .base import Backend, RawInference
|
|
13
|
+
from .heuristic import HeuristicBackend
|
|
14
|
+
from .llm import LLMBackend
|
|
15
|
+
from .transports import CloudTransport, LocalTransport, TransportError
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"Backend", "RawInference", "HeuristicBackend", "LLMBackend",
|
|
19
|
+
"CloudTransport", "LocalTransport", "TransportError", "build_backend",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def build_backend(
    kind: str,
    *,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    cheap_model: Optional[str] = None,
    expensive_model: Optional[str] = None,
    timeout: Optional[float] = None,
) -> Backend:
    """Construct the inference backend selected by ``kind``.

    Args:
        kind: ``"heuristic"``, ``"cloud"`` or ``"local"`` (case-insensitive).
            A falsy value selects ``"heuristic"``.
        api_key: Key for the cloud transport; an empty string is passed when
            it is not given. Unused by the other backends.
        base_url: Endpoint override. Defaults to
            ``https://api.openai.com/v1`` for cloud and
            ``http://localhost:11434`` for local.
        cheap_model: Model for the cheap tier. Cloud default is
            ``gpt-4o-mini``; local falls back to the expensive-tier model.
        expensive_model: Model for the expensive tier. Cloud default is
            ``gpt-4o``; local falls back to ``cheap_model``, then
            ``llama3.1``.
        timeout: Transport timeout. Any falsy value (``None`` or ``0``)
            selects the default: 60.0 for cloud, 120.0 for local.

    Returns:
        A ``HeuristicBackend``, or an ``LLMBackend`` wrapping the matching
        transport.

    Raises:
        ValueError: If ``kind`` is not one of the three known backends.
    """
    kind = (kind or "heuristic").lower()

    if kind == "heuristic":
        return HeuristicBackend()

    if kind == "cloud":
        endpoint = base_url or "https://api.openai.com/v1"
        wait = timeout or 60.0
        cloud = CloudTransport(api_key or "", base_url=endpoint, timeout=wait)
        light = cheap_model or "gpt-4o-mini"
        strong = expensive_model or "gpt-4o"
        return LLMBackend(cloud, cheap_model=light, expensive_model=strong)

    if kind == "local":
        endpoint = base_url or "http://localhost:11434"
        wait = timeout or 120.0
        local = LocalTransport(base_url=endpoint, timeout=wait)
        # A single local model serves both tiers unless the caller names a
        # separate cheap one.
        strong = expensive_model or cheap_model or "llama3.1"
        light = cheap_model or strong
        return LLMBackend(local, cheap_model=light, expensive_model=strong)

    raise ValueError(f"unknown backend: {kind!r} (use heuristic|cloud|local)")
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Mechanical masked-snippet generation.
|
|
2
|
+
|
|
3
|
+
The masked snippet shown in a risk card is ALWAYS generated here from post
|
|
4
|
+
metadata (evidence label + where + when) — never from model free-text. This
|
|
5
|
+
guarantees the no-dossier invariant holds regardless of what an LLM returns:
|
|
6
|
+
the resolved value can only ever appear when the user clicks through to their
|
|
7
|
+
OWN original post.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from ..models import Post
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def masked_reference(post: Post, evidence_type: str) -> str:
    """Build the masked card snippet for ``post`` from its metadata only.

    The result has the form ``[<evidence_type>] | <where> | <when>``:

    * ``where`` is ``r/<community>`` when the post has a community, otherwise
      the value of the post's platform.
    * ``when`` is the ISO date of ``post.created_at``, or ``"?"`` when the
      post has no timestamp.

    No post text is read here, so the snippet cannot echo post content.
    """
    if post.created_at:
        when = post.created_at.date().isoformat()
    else:
        when = "?"

    if post.community:
        where = f"r/{post.community}"
    else:
        where = post.platform.value

    return f"[{evidence_type}] | {where} | {when}"
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Backend contract for the leak-inference cascade.
|
|
2
|
+
|
|
3
|
+
A backend answers two questions about the user's OWN posts:
|
|
4
|
+
|
|
5
|
+
route(posts) -> a 0..1 priority per post (cheap tier). Lower priority never
|
|
6
|
+
means "dropped" — it only means "analyze later / sample less".
|
|
7
|
+
extract(batch) -> structured leak inferences for a small batch (expensive tier).
|
|
8
|
+
|
|
9
|
+
Backends never return resolved personal values: an inference carries a category,
|
|
10
|
+
a confidence, a MASKED snippet and a reference back to the user's own post.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from abc import ABC, abstractmethod
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
from ..models import Confidence, Platform, Post, RiskCategory, Source
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class RawInference:
    """A single leak signal before aggregation into category cards.

    Carries a category, a confidence, masked text and a pointer back to the
    originating post.
    """

    # Required fields.
    category: RiskCategory
    confidence: Confidence
    masked_snippet: str
    evidence_type: str

    # Optional fields; the signal source defaults to post text.
    source: Source = Source.TEXT
    post_id: Optional[str] = None
    permalink: Optional[str] = None
    # Namespaces post_id across mixed exports.
    platform: Optional[Platform] = None
    rationale: str = ""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Backend(ABC):
    """Abstract contract every inference backend implements.

    Subclasses provide the cheap routing pass (``route``) and the expensive
    extraction pass (``extract``), and override the class-level flags below
    to describe themselves.
    """

    name: str = "base"
    is_local: bool = False
    #: True if running this backend sends the user's posts off their machine.
    #: Drives the conditional cloud-deanonymization warning.
    sends_data_offsite: bool = False

    @abstractmethod
    def route(self, posts: list[Post]) -> list[float]:
        """Return a priority for each post in ``posts`` (cheap tier)."""

    @abstractmethod
    def extract(self, batch: list[Post]) -> list[RawInference]:
        """Return the leak inferences found in ``batch`` (expensive tier)."""

    def describe(self) -> str:
        """Return a label for this backend; by default its ``name``."""
        return self.name
|