exposurecheck 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. exposurecheck-0.1.0/LICENSE +21 -0
  2. exposurecheck-0.1.0/PKG-INFO +217 -0
  3. exposurecheck-0.1.0/README.md +194 -0
  4. exposurecheck-0.1.0/exposurecheck/__init__.py +11 -0
  5. exposurecheck-0.1.0/exposurecheck/__main__.py +4 -0
  6. exposurecheck-0.1.0/exposurecheck/audit.py +44 -0
  7. exposurecheck-0.1.0/exposurecheck/backends/__init__.py +57 -0
  8. exposurecheck-0.1.0/exposurecheck/backends/_mask.py +18 -0
  9. exposurecheck-0.1.0/exposurecheck/backends/base.py +52 -0
  10. exposurecheck-0.1.0/exposurecheck/backends/heuristic.py +102 -0
  11. exposurecheck-0.1.0/exposurecheck/backends/llm.py +205 -0
  12. exposurecheck-0.1.0/exposurecheck/backends/transports.py +114 -0
  13. exposurecheck-0.1.0/exposurecheck/cascade/__init__.py +6 -0
  14. exposurecheck-0.1.0/exposurecheck/cascade/deterministic.py +125 -0
  15. exposurecheck-0.1.0/exposurecheck/cascade/pipeline.py +74 -0
  16. exposurecheck-0.1.0/exposurecheck/cascade/prefilter.py +35 -0
  17. exposurecheck-0.1.0/exposurecheck/cascade/summarize.py +45 -0
  18. exposurecheck-0.1.0/exposurecheck/cli.py +197 -0
  19. exposurecheck-0.1.0/exposurecheck/metadata/__init__.py +4 -0
  20. exposurecheck-0.1.0/exposurecheck/metadata/exif.py +221 -0
  21. exposurecheck-0.1.0/exposurecheck/models.py +209 -0
  22. exposurecheck-0.1.0/exposurecheck/output/__init__.py +6 -0
  23. exposurecheck-0.1.0/exposurecheck/output/interactive.py +70 -0
  24. exposurecheck-0.1.0/exposurecheck/output/report.py +106 -0
  25. exposurecheck-0.1.0/exposurecheck/parsers/__init__.py +17 -0
  26. exposurecheck-0.1.0/exposurecheck/parsers/_source.py +86 -0
  27. exposurecheck-0.1.0/exposurecheck/parsers/_util.py +21 -0
  28. exposurecheck-0.1.0/exposurecheck/parsers/reddit.py +103 -0
  29. exposurecheck-0.1.0/exposurecheck/parsers/twitter.py +157 -0
  30. exposurecheck-0.1.0/exposurecheck/remediation/__init__.py +5 -0
  31. exposurecheck-0.1.0/exposurecheck/remediation/advise.py +24 -0
  32. exposurecheck-0.1.0/exposurecheck/risk/__init__.py +7 -0
  33. exposurecheck-0.1.0/exposurecheck/risk/card.py +39 -0
  34. exposurecheck-0.1.0/exposurecheck/risk/categories.py +118 -0
  35. exposurecheck-0.1.0/exposurecheck/risk/scoring.py +50 -0
  36. exposurecheck-0.1.0/exposurecheck/safety/__init__.py +6 -0
  37. exposurecheck-0.1.0/exposurecheck/safety/consent.py +38 -0
  38. exposurecheck-0.1.0/exposurecheck/safety/offline.py +60 -0
  39. exposurecheck-0.1.0/exposurecheck/safety/warnings.py +41 -0
  40. exposurecheck-0.1.0/exposurecheck.egg-info/PKG-INFO +217 -0
  41. exposurecheck-0.1.0/exposurecheck.egg-info/SOURCES.txt +53 -0
  42. exposurecheck-0.1.0/exposurecheck.egg-info/dependency_links.txt +1 -0
  43. exposurecheck-0.1.0/exposurecheck.egg-info/entry_points.txt +2 -0
  44. exposurecheck-0.1.0/exposurecheck.egg-info/requires.txt +3 -0
  45. exposurecheck-0.1.0/exposurecheck.egg-info/top_level.txt +1 -0
  46. exposurecheck-0.1.0/pyproject.toml +45 -0
  47. exposurecheck-0.1.0/setup.cfg +4 -0
  48. exposurecheck-0.1.0/tests/test_cascade.py +56 -0
  49. exposurecheck-0.1.0/tests/test_cli_errors.py +31 -0
  50. exposurecheck-0.1.0/tests/test_exif_hardening.py +44 -0
  51. exposurecheck-0.1.0/tests/test_llm_backend.py +76 -0
  52. exposurecheck-0.1.0/tests/test_no_dossier.py +62 -0
  53. exposurecheck-0.1.0/tests/test_offline.py +30 -0
  54. exposurecheck-0.1.0/tests/test_parsers.py +76 -0
  55. exposurecheck-0.1.0/tests/test_safety.py +37 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Cora Aegis
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,217 @@
1
+ Metadata-Version: 2.4
2
+ Name: exposurecheck
3
+ Version: 0.1.0
4
+ Summary: Audit your own social-media export for re-identification (mosaic) risk. Local-first, no-dossier, bring-your-own-LLM.
5
+ Author: Cora Aegis
6
+ License: MIT
7
+ Project-URL: Homepage, https://cypherpunkguide.com
8
+ Project-URL: Repository, https://github.com/coraaegis/exposurecheck
9
+ Project-URL: Issues, https://github.com/coraaegis/exposurecheck/issues
10
+ Keywords: privacy,opsec,deanonymization,re-identification,osint-defense,mosaic,reddit,twitter,self-audit
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: End Users/Desktop
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Topic :: Security
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest>=7; extra == "dev"
22
+ Dynamic: license-file
23
+
24
+ # ExposureCheck
25
+
26
+ **Audit your own social-media history for re-identification risk — before someone
27
+ else does it to you.**
28
+
29
+ Modern language models can read a few hundred of your ordinary public posts and
30
+ infer where you live, where you work, your routine, your family, and link a
31
+ pseudonymous account back to your real name — not from one careless post, but
32
+ from the *mosaic* of many individually-innocuous ones. Researchers have shown
33
+ this works at scale and with unsettling accuracy. `exposurecheck` runs that same
34
+ adversarial reading **on your own export, on your own terms**, and shows you
35
+ which of *your* posts to generalise or edit.
36
+
37
+ It is **local-first**, produces **no dossier**, and never writes a profile of you
38
+ to disk.
39
+
40
+ 📖 Plain-language explainer of the threat:
41
+ **https://cypherpunkguide.com/privacy/social-media-self-audit/** (the companion
42
+ article — read it first if "mosaic re-identification" is new to you).
43
+
44
+ ---
45
+
46
+ ## Demo
47
+
48
+ ![ExposureCheck running an offline audit on the bundled sample data](media/demo.gif)
49
+
50
+ *A ~12-second run on the bundled sample export using the offline `--backend
51
+ heuristic` stub — reproducible: clone the repo and run the exact command shown,
52
+ and nothing leaves your machine (`--offline` hard-blocks egress). A real
53
+ `--backend local` or `cloud` model surfaces far more; the full run also scores
54
+ EMPLOYER, FINANCES, FAMILY and SCHEDULE. (Recorded as an
55
+ [asciicast](media/demo.cast).)*
56
+
57
+ ## What it does
58
+
59
+ - Parses your **Reddit** GDPR export and your **X / Twitter** export (directory or `.zip`).
60
+ - Runs a **recall-preserving cascade**: a cheap pass ranks every post, an
61
+ expensive pass reads the high-priority ones, and weak signals are *kept* — the
62
+ mosaic is built from weak signals, so throwing them away would be false comfort.
63
+ - Extracts the **metadata layer** deterministically (this is where X leaks most):
64
+ the self-set location field, outbound links, image **EXIF/GPS**, device model,
65
+ and posting-time concentration that betrays your timezone.
66
+ - Reports **category risk cards** (Location, Employer, Family, Schedule,
67
+ Finances, Account-linkage, …) ranked by **risk contribution**, each with masked
68
+ examples and concrete, *generalise-first* remediation.
69
+
70
+ ## What it deliberately does **not** do
71
+
72
+ - ❌ No dossier. It never prints "you live in X / work at Y / your name is Z".
73
+ Cards show **masked** snippets; the resolved value only ever appears when *you*
74
+ click through to *your own* original post, **in-session, never saved**.
75
+ - ❌ No export of findings. ❌ No scraping (export input only). ❌ No posting/
76
+ deletion on your behalf. ❌ No analysing anyone else's history.
77
+ - ❌ It does not make you anonymous. It **reduces** risk. "Low" is not "safe".
78
+
79
+ ## Bring your own model — cloud **or** local
80
+
81
+ The inference runs on a backend **you** choose:
82
+
83
+ | backend | what it is | data leaves your machine? |
84
+ |---|---|---|
85
+ | `local` | a local Ollama (or llama.cpp/LM Studio) model | **no** |
86
+ | `cloud` | any OpenAI-compatible endpoint, your own key | **yes** |
87
+ | `heuristic` | offline regex stub, **near-zero recall** | no — *dev/CI only, not an audit* |
88
+
89
+ ### ⚠️ The one cloud caveat that actually matters
90
+
91
+ If the account you are auditing is a **pseudonymous** one you keep separate from
92
+ your real identity, **and** your AI/cloud account is registered under your real
93
+ name or paid with a real-name method, then sending your history to the cloud lets
94
+ the provider link *real identity ↔ anonymous account* on their side (subpoena,
95
+ breach, insider). That is the exact deanonymization this tool exists to prevent.
96
+
97
+ So: **auditing a strictly-anonymous account → use `--backend local`** (or a cloud
98
+ account opened and paid for anonymously). Auditing your real-name / public
99
+ account → cloud is fine. The CLI states this and requires acknowledgement when it
100
+ applies. We never force local (that would shrink the audience to nobody); we make
101
+ the trade-off explicit.
102
+
103
+ ---
104
+
105
+ ## Install
106
+
107
+ Core (parsing, EXIF, cascade, **and** the cloud/local HTTP backends) is **Python
108
+ standard library only** — no third-party code touches your export.
109
+
110
+ ```bash
111
+ # pipx — isolated, recommended (works today; a PyPI release is coming):
112
+ pipx install git+https://github.com/coraaegis/exposurecheck
113
+
114
+ # or from source:
115
+ git clone https://github.com/coraaegis/exposurecheck && cd exposurecheck && pip install -e .
116
+
117
+ # or run without installing:
118
+ python -m exposurecheck --help
119
+ ```
120
+
121
+ ExposureCheck is a **command-line tool** today, aimed at people comfortable with a
122
+ terminal — which is also where its first reviewers live (GitHub, Hacker News,
123
+ privacy forums). A **one-click app for non-technical users** — a packaged build
124
+ with a local, in-browser UI and no Python to install — is the next milestone (see
125
+ [Status](#status)). The CLI stays for power users.
126
+
127
+ ### Verifying a release
128
+
129
+ Releases are **not signed with an identity code-signing certificate** — the author
130
+ is pseudonymous, and such a certificate would tie the project to a legal identity,
131
+ the opposite of the point. Authenticity is cryptographic and verifiable instead:
132
+
133
+ - Each release is **PGP-signed** by Cora Aegis. Fetch the key via WKD
134
+ (`gpg --locate-keys cora@cypherpunkguide.com`), then
135
+ `gpg --verify exposurecheck-<version>.tar.gz.asc`.
136
+ - **SHA-256 checksums** are published with every release.
137
+ - Builds are **reproducible** — rebuild from the tagged source and confirm the
138
+ artifact matches.
139
+ - Prefer a **package manager** (pip / Scoop / Homebrew) over a downloaded `.exe`.
140
+ An unsigned Windows binary may show a SmartScreen "unknown publisher" prompt;
141
+ that is expected — verify the PGP signature or run from source.
142
+
143
+ ## Usage
144
+
145
+ Get your data first:
146
+ - Reddit → *Settings → Privacy → Request a copy of your data* (the `.zip`).
147
+ - X → *Settings → Your account → Download an archive of your data*.
148
+
149
+ ```bash
150
+ # Local model — nothing leaves your machine (recommended for anonymous accounts)
151
+ exposurecheck audit \
152
+ --reddit ./reddit_export.zip \
153
+ --twitter ./twitter_export \
154
+ --backend local --expensive-model llama3.1 \
155
+ --i-own-this-data
156
+
157
+ # Cloud (bring your own key; set it in the ENV, never on the command line)
158
+ export OPENAI_API_KEY=sk-...
159
+ exposurecheck audit --twitter ./twitter_export --backend cloud --i-own-this-data
160
+
161
+ # See your own posts behind a category (in-session, nothing is saved)
162
+ exposurecheck audit --reddit ./reddit_export --backend local -i --i-own-this-data
163
+ ```
164
+
165
+ The API key is read from an environment variable on purpose — command-line args
166
+ leak into shell history and process listings.
167
+
168
+ ## Cost (cloud)
169
+
170
+ Roughly **$0.59 per profile** for ~125 posts on a GPT-4-class model; a real
171
+ 1–3k-post history lands around **$4–15**, trimmed by the recall-preserving
172
+ pre-filter. Local models are free (lower accuracy — the tool warns you).
173
+
174
+ ## How it works
175
+
176
+ ```
177
+ export ─▶ parse ─▶ prefilter (drop only TRUE-empty) ─┬─▶ deterministic: profile + EXIF + timing ─┐
178
+ └─▶ cascade: cheap route ─▶ expensive read ─┤
179
+
180
+ risk-contribution scoring ─▶ category cards ─▶ no-dossier report
181
+ ```
182
+
183
+ See [`docs/THREAT-MODEL.md`](docs/THREAT-MODEL.md) for what is and isn't in scope,
184
+ and [`docs/ABUSE-EVAL.md`](docs/ABUSE-EVAL.md) for the dual-use safeguards and the
185
+ pre-release abuse evaluation.
186
+
187
+ ## Status
188
+
189
+ `v0.1` alpha — the **CLI** runs end-to-end (Reddit + X, EXIF/GPS, the
190
+ recall-preserving cascade, the no-dossier report), aimed at terminal-comfortable
191
+ users for now.
192
+
193
+ Roadmap:
194
+ - A **one-click app** (packaged binary + a hardened, local-only in-browser UI, no
195
+ Python required) so non-technical people can use it too — the CLI stays for
196
+ power users.
197
+ - **Image *content* analysis is deliberately not in v1.** Sending images to a
198
+ cloud model is a serious privacy regression, so v1 extracts EXIF/metadata only
199
+ and says so plainly; a local-first multimodal option may come later.
200
+ - Real-corpus recall / false-positive evaluation (SynthPAI); more platforms
201
+ (Mastodon); single-post / pre-post checks.
202
+
203
+ ## Security & contact
204
+
205
+ Found a privacy or safety flaw? See [`SECURITY.md`](SECURITY.md). Reach the author
206
+ at `cora@cypherpunkguide.com` (PGP via WKD: `gpg --locate-keys cora@cypherpunkguide.com`).
207
+
208
+ ## License & official source
209
+
210
+ **MIT** — see [`LICENSE`](LICENSE). Built by **Cora Aegis**
211
+ ([cypherpunkguide.com](https://cypherpunkguide.com)).
212
+
213
+ This repository (and the name **ExposureCheck**) is the canonical, official
214
+ source. You are free to fork and reuse under the MIT terms — but please don't
215
+ present a fork as the official project. ExposureCheck is for auditing **your
216
+ own** data; see [`docs/ABUSE-EVAL.md`](docs/ABUSE-EVAL.md) for the dual-use
217
+ safeguards that carry that intent (the licence deliberately does not — it can't).
@@ -0,0 +1,194 @@
1
+ # ExposureCheck
2
+
3
+ **Audit your own social-media history for re-identification risk — before someone
4
+ else does it to you.**
5
+
6
+ Modern language models can read a few hundred of your ordinary public posts and
7
+ infer where you live, where you work, your routine, your family, and link a
8
+ pseudonymous account back to your real name — not from one careless post, but
9
+ from the *mosaic* of many individually-innocuous ones. Researchers have shown
10
+ this works at scale and with unsettling accuracy. `exposurecheck` runs that same
11
+ adversarial reading **on your own export, on your own terms**, and shows you
12
+ which of *your* posts to generalise or edit.
13
+
14
+ It is **local-first**, produces **no dossier**, and never writes a profile of you
15
+ to disk.
16
+
17
+ 📖 Plain-language explainer of the threat:
18
+ **https://cypherpunkguide.com/privacy/social-media-self-audit/** (the companion
19
+ article — read it first if "mosaic re-identification" is new to you).
20
+
21
+ ---
22
+
23
+ ## Demo
24
+
25
+ ![ExposureCheck running an offline audit on the bundled sample data](media/demo.gif)
26
+
27
+ *A ~12-second run on the bundled sample export using the offline `--backend
28
+ heuristic` stub — reproducible: clone the repo and run the exact command shown,
29
+ and nothing leaves your machine (`--offline` hard-blocks egress). A real
30
+ `--backend local` or `cloud` model surfaces far more; the full run also scores
31
+ EMPLOYER, FINANCES, FAMILY and SCHEDULE. (Recorded as an
32
+ [asciicast](media/demo.cast).)*
33
+
34
+ ## What it does
35
+
36
+ - Parses your **Reddit** GDPR export and your **X / Twitter** export (directory or `.zip`).
37
+ - Runs a **recall-preserving cascade**: a cheap pass ranks every post, an
38
+ expensive pass reads the high-priority ones, and weak signals are *kept* — the
39
+ mosaic is built from weak signals, so throwing them away would be false comfort.
40
+ - Extracts the **metadata layer** deterministically (this is where X leaks most):
41
+ the self-set location field, outbound links, image **EXIF/GPS**, device model,
42
+ and posting-time concentration that betrays your timezone.
43
+ - Reports **category risk cards** (Location, Employer, Family, Schedule,
44
+ Finances, Account-linkage, …) ranked by **risk contribution**, each with masked
45
+ examples and concrete, *generalise-first* remediation.
46
+
47
+ ## What it deliberately does **not** do
48
+
49
+ - ❌ No dossier. It never prints "you live in X / work at Y / your name is Z".
50
+ Cards show **masked** snippets; the resolved value only ever appears when *you*
51
+ click through to *your own* original post, **in-session, never saved**.
52
+ - ❌ No export of findings. ❌ No scraping (export input only). ❌ No posting/
53
+ deletion on your behalf. ❌ No analysing anyone else's history.
54
+ - ❌ It does not make you anonymous. It **reduces** risk. "Low" is not "safe".
55
+
56
+ ## Bring your own model — cloud **or** local
57
+
58
+ The inference runs on a backend **you** choose:
59
+
60
+ | backend | what it is | data leaves your machine? |
61
+ |---|---|---|
62
+ | `local` | a local Ollama (or llama.cpp/LM Studio) model | **no** |
63
+ | `cloud` | any OpenAI-compatible endpoint, your own key | **yes** |
64
+ | `heuristic` | offline regex stub, **near-zero recall** | no — *dev/CI only, not an audit* |
65
+
66
+ ### ⚠️ The one cloud caveat that actually matters
67
+
68
+ If the account you are auditing is a **pseudonymous** one you keep separate from
69
+ your real identity, **and** your AI/cloud account is registered under your real
70
+ name or paid with a real-name method, then sending your history to the cloud lets
71
+ the provider link *real identity ↔ anonymous account* on their side (subpoena,
72
+ breach, insider). That is the exact deanonymization this tool exists to prevent.
73
+
74
+ So: **auditing a strictly-anonymous account → use `--backend local`** (or a cloud
75
+ account opened and paid for anonymously). Auditing your real-name / public
76
+ account → cloud is fine. The CLI states this and requires acknowledgement when it
77
+ applies. We never force local (that would shrink the audience to nobody); we make
78
+ the trade-off explicit.
79
+
80
+ ---
81
+
82
+ ## Install
83
+
84
+ Core (parsing, EXIF, cascade, **and** the cloud/local HTTP backends) is **Python
85
+ standard library only** — no third-party code touches your export.
86
+
87
+ ```bash
88
+ # pipx — isolated, recommended (works today; a PyPI release is coming):
89
+ pipx install git+https://github.com/coraaegis/exposurecheck
90
+
91
+ # or from source:
92
+ git clone https://github.com/coraaegis/exposurecheck && cd exposurecheck && pip install -e .
93
+
94
+ # or run without installing:
95
+ python -m exposurecheck --help
96
+ ```
97
+
98
+ ExposureCheck is a **command-line tool** today, aimed at people comfortable with a
99
+ terminal — which is also where its first reviewers live (GitHub, Hacker News,
100
+ privacy forums). A **one-click app for non-technical users** — a packaged build
101
+ with a local, in-browser UI and no Python to install — is the next milestone (see
102
+ [Status](#status)). The CLI stays for power users.
103
+
104
+ ### Verifying a release
105
+
106
+ Releases are **not signed with an identity code-signing certificate** — the author
107
+ is pseudonymous, and such a certificate would tie the project to a legal identity,
108
+ the opposite of the point. Authenticity is cryptographic and verifiable instead:
109
+
110
+ - Each release is **PGP-signed** by Cora Aegis. Fetch the key via WKD
111
+ (`gpg --locate-keys cora@cypherpunkguide.com`), then
112
+ `gpg --verify exposurecheck-<version>.tar.gz.asc`.
113
+ - **SHA-256 checksums** are published with every release.
114
+ - Builds are **reproducible** — rebuild from the tagged source and confirm the
115
+ artifact matches.
116
+ - Prefer a **package manager** (pip / Scoop / Homebrew) over a downloaded `.exe`.
117
+ An unsigned Windows binary may show a SmartScreen "unknown publisher" prompt;
118
+ that is expected — verify the PGP signature or run from source.
119
+
120
+ ## Usage
121
+
122
+ Get your data first:
123
+ - Reddit → *Settings → Privacy → Request a copy of your data* (the `.zip`).
124
+ - X → *Settings → Your account → Download an archive of your data*.
125
+
126
+ ```bash
127
+ # Local model — nothing leaves your machine (recommended for anonymous accounts)
128
+ exposurecheck audit \
129
+ --reddit ./reddit_export.zip \
130
+ --twitter ./twitter_export \
131
+ --backend local --expensive-model llama3.1 \
132
+ --i-own-this-data
133
+
134
+ # Cloud (bring your own key; set it in the ENV, never on the command line)
135
+ export OPENAI_API_KEY=sk-...
136
+ exposurecheck audit --twitter ./twitter_export --backend cloud --i-own-this-data
137
+
138
+ # See your own posts behind a category (in-session, nothing is saved)
139
+ exposurecheck audit --reddit ./reddit_export --backend local -i --i-own-this-data
140
+ ```
141
+
142
+ The API key is read from an environment variable on purpose — command-line args
143
+ leak into shell history and process listings.
144
+
145
+ ## Cost (cloud)
146
+
147
+ Roughly **$0.59 per profile** for ~125 posts on a GPT-4-class model; a real
148
+ 1–3k-post history lands around **$4–15**, trimmed by the recall-preserving
149
+ pre-filter. Local models are free (lower accuracy — the tool warns you).
150
+
151
+ ## How it works
152
+
153
+ ```
154
+ export ─▶ parse ─▶ prefilter (drop only TRUE-empty) ─┬─▶ deterministic: profile + EXIF + timing ─┐
155
+ └─▶ cascade: cheap route ─▶ expensive read ─┤
156
+
157
+ risk-contribution scoring ─▶ category cards ─▶ no-dossier report
158
+ ```
159
+
160
+ See [`docs/THREAT-MODEL.md`](docs/THREAT-MODEL.md) for what is and isn't in scope,
161
+ and [`docs/ABUSE-EVAL.md`](docs/ABUSE-EVAL.md) for the dual-use safeguards and the
162
+ pre-release abuse evaluation.
163
+
164
+ ## Status
165
+
166
+ `v0.1` alpha — the **CLI** runs end-to-end (Reddit + X, EXIF/GPS, the
167
+ recall-preserving cascade, the no-dossier report), aimed at terminal-comfortable
168
+ users for now.
169
+
170
+ Roadmap:
171
+ - A **one-click app** (packaged binary + a hardened, local-only in-browser UI, no
172
+ Python required) so non-technical people can use it too — the CLI stays for
173
+ power users.
174
+ - **Image *content* analysis is deliberately not in v1.** Sending images to a
175
+ cloud model is a serious privacy regression, so v1 extracts EXIF/metadata only
176
+ and says so plainly; a local-first multimodal option may come later.
177
+ - Real-corpus recall / false-positive evaluation (SynthPAI); more platforms
178
+ (Mastodon); single-post / pre-post checks.
179
+
180
+ ## Security & contact
181
+
182
+ Found a privacy or safety flaw? See [`SECURITY.md`](SECURITY.md). Reach the author
183
+ at `cora@cypherpunkguide.com` (PGP via WKD: `gpg --locate-keys cora@cypherpunkguide.com`).
184
+
185
+ ## License & official source
186
+
187
+ **MIT** — see [`LICENSE`](LICENSE). Built by **Cora Aegis**
188
+ ([cypherpunkguide.com](https://cypherpunkguide.com)).
189
+
190
+ This repository (and the name **ExposureCheck**) is the canonical, official
191
+ source. You are free to fork and reuse under the MIT terms — but please don't
192
+ present a fork as the official project. ExposureCheck is for auditing **your
193
+ own** data; see [`docs/ABUSE-EVAL.md`](docs/ABUSE-EVAL.md) for the dual-use
194
+ safeguards that carry that intent (the licence deliberately does not — it can't).
@@ -0,0 +1,11 @@
1
+ """exposurecheck — audit your own social-media export for re-identification risk.
2
+
3
+ Local-first. No-dossier. Bring-your-own-LLM (cloud BYOK or local).
4
+
5
+ This package never phones home, never scrapes, and never writes a synthesized
6
+ profile of you to disk. It shows you, by category, what a *mosaic* re-identification
7
+ attack could reconstruct from the public history you already published — and points
8
+ you back at your own posts so you can edit or generalize them.
9
+ """
10
+
11
+ __version__ = "0.1.0"
@@ -0,0 +1,4 @@
1
+ from .cli import main
2
+
3
+ if __name__ == "__main__":
4
+ raise SystemExit(main())
@@ -0,0 +1,44 @@
1
+ """Top-level orchestration: parsed exports + a backend -> AuditResult."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Callable, Optional
6
+
7
+ from .backends.base import Backend
8
+ from .cascade import run_cascade
9
+ from .models import AuditResult, Export
10
+ from .risk import build_cards
11
+
12
+
13
+ def run_audit(
14
+ exports: list[Export],
15
+ backend: Backend,
16
+ *,
17
+ candidate_fraction: float = 1.0,
18
+ max_candidates: Optional[int] = None,
19
+ batch_size: int = 10,
20
+ progress: Optional[Callable[[int, int], None]] = None,
21
+ ) -> AuditResult:
22
+ outcome = run_cascade(
23
+ exports, backend,
24
+ candidate_fraction=candidate_fraction,
25
+ max_candidates=max_candidates,
26
+ batch_size=batch_size,
27
+ progress=progress,
28
+ )
29
+ cards = build_cards(outcome.findings)
30
+ return AuditResult(
31
+ cards=cards,
32
+ findings=outcome.findings,
33
+ backend_name=backend.name,
34
+ post_count=outcome.post_count,
35
+ candidate_count=outcome.candidate_count,
36
+ platforms=[ex.platform for ex in exports],
37
+ meta={
38
+ "dropped": outcome.dropped_count,
39
+ "kept": outcome.kept_count,
40
+ "not_analyzed": outcome.not_analyzed_count,
41
+ "raw": outcome.raw_count,
42
+ "media_count": sum(len(ex.media) for ex in exports),
43
+ },
44
+ )
@@ -0,0 +1,57 @@
1
+ """Pluggable inference backends and a small factory.
2
+
3
+ heuristic - offline regex stub (no key, low recall — dev/CI/demo only)
4
+ cloud - OpenAI-compatible endpoint, bring-your-own-key (sends data offsite)
5
+ local - local Ollama server (no network egress)
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Optional
11
+
12
+ from .base import Backend, RawInference
13
+ from .heuristic import HeuristicBackend
14
+ from .llm import LLMBackend
15
+ from .transports import CloudTransport, LocalTransport, TransportError
16
+
17
+ __all__ = [
18
+ "Backend", "RawInference", "HeuristicBackend", "LLMBackend",
19
+ "CloudTransport", "LocalTransport", "TransportError", "build_backend",
20
+ ]
21
+
22
+
23
def build_backend(
    kind: str,
    *,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    cheap_model: Optional[str] = None,
    expensive_model: Optional[str] = None,
    timeout: Optional[float] = None,
) -> Backend:
    """Construct the inference backend selected by ``kind``.

    ``kind`` is matched case-insensitively; a falsy value selects
    ``"heuristic"``. Every optional argument falls back to a per-backend
    default when it is falsy:

    * ``heuristic`` - takes no configuration.
    * ``cloud`` - ``CloudTransport`` at ``https://api.openai.com/v1`` with a
      60 second timeout, models ``gpt-4o-mini`` (cheap) and ``gpt-4o``
      (expensive); a missing ``api_key`` is passed on as ``""``.
    * ``local`` - ``LocalTransport`` at ``http://localhost:11434`` with a
      120 second timeout; the expensive model defaults to ``cheap_model`` and
      then to ``llama3.1``, and the cheap model defaults to the expensive one.

    Raises:
        ValueError: if ``kind`` is not one of the three known backends.
    """
    selected = (kind or "heuristic").lower()

    if selected == "heuristic":
        return HeuristicBackend()

    if selected == "cloud":
        cloud_transport = CloudTransport(
            api_key or "",
            base_url=base_url or "https://api.openai.com/v1",
            timeout=timeout or 60.0,
        )
        cheap = cheap_model or "gpt-4o-mini"
        expensive = expensive_model or "gpt-4o"
        return LLMBackend(
            cloud_transport,
            cheap_model=cheap,
            expensive_model=expensive,
        )

    if selected == "local":
        local_transport = LocalTransport(
            base_url=base_url or "http://localhost:11434",
            timeout=timeout or 120.0,
        )
        # A single configured model serves both tiers unless both are given.
        expensive = expensive_model or cheap_model or "llama3.1"
        cheap = cheap_model or expensive
        return LLMBackend(
            local_transport,
            cheap_model=cheap,
            expensive_model=expensive,
        )

    raise ValueError(f"unknown backend: {selected!r} (use heuristic|cloud|local)")
@@ -0,0 +1,18 @@
1
+ """Mechanical masked-snippet generation.
2
+
3
+ The masked snippet shown in a risk card is ALWAYS generated here from post
4
+ metadata (evidence label + where + when) — never from model free-text. This
5
+ guarantees the no-dossier invariant holds regardless of what an LLM returns:
6
+ the resolved value can only ever appear when the user clicks through to their
7
+ OWN original post.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from ..models import Post
13
+
14
+
15
def masked_reference(post: Post, evidence_type: str) -> str:
    """Build the masked ``[evidence] | where | when`` label for a post.

    Only post metadata is used: the evidence label, the community (or the
    platform value when the post has no community) and the creation date.
    A post with no ``created_at`` is rendered with ``?`` as its date.
    """
    if post.created_at:
        when = post.created_at.date().isoformat()
    else:
        when = "?"

    if post.community:
        where = f"r/{post.community}"
    else:
        where = post.platform.value

    return f"[{evidence_type}] | {where} | {when}"
@@ -0,0 +1,52 @@
1
+ """Backend contract for the leak-inference cascade.
2
+
3
+ A backend answers two questions about the user's OWN posts:
4
+
5
+ route(posts) -> a 0..1 priority per post (cheap tier). Lower priority never
6
+ means "dropped" — it only means "analyze later / sample less".
7
+ extract(batch) -> structured leak inferences for a small batch (expensive tier).
8
+
9
+ Backends never return resolved personal values: an inference carries a category,
10
+ a confidence, a MASKED snippet and a reference back to the user's own post.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from abc import ABC, abstractmethod
16
+ from dataclasses import dataclass
17
+ from typing import Optional
18
+
19
+ from ..models import Confidence, Platform, Post, RiskCategory, Source
20
+
21
+
22
@dataclass
class RawInference:
    """A single leak signal before aggregation.

    Carries a masked snippet and a reference back to the originating post,
    not a resolved personal value.
    """

    # Required fields.
    category: RiskCategory
    confidence: Confidence
    masked_snippet: str
    evidence_type: str

    # Optional fields; the signal source defaults to Source.TEXT.
    source: Source = Source.TEXT
    post_id: Optional[str] = None
    permalink: Optional[str] = None
    # Namespaces post_id across mixed exports.
    platform: Optional[Platform] = None
    rationale: str = ""
34
+
35
+
36
class Backend(ABC):
    """Abstract interface that every inference backend implements.

    Concrete backends provide the two cascade tiers, ``route`` and
    ``extract``, and override the class attributes below to describe
    themselves.
    """

    # Identifier for the backend; ``describe`` returns it by default.
    name: str = "base"
    is_local: bool = False
    #: True if running this backend sends the user's posts off their machine.
    #: Drives the conditional cloud-deanonymization warning.
    sends_data_offsite: bool = False

    @abstractmethod
    def route(self, posts: list[Post]) -> list[float]:
        """Cheap tier: return a 0..1 priority for each post."""

    @abstractmethod
    def extract(self, batch: list[Post]) -> list[RawInference]:
        """Expensive tier: return structured leak inferences for a batch."""

    def describe(self) -> str:
        """Return a description of this backend (its ``name`` by default)."""
        return self.name