exposurecheck 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exposurecheck/__init__.py +11 -0
- exposurecheck/__main__.py +4 -0
- exposurecheck/audit.py +44 -0
- exposurecheck/backends/__init__.py +57 -0
- exposurecheck/backends/_mask.py +18 -0
- exposurecheck/backends/base.py +52 -0
- exposurecheck/backends/heuristic.py +102 -0
- exposurecheck/backends/llm.py +205 -0
- exposurecheck/backends/transports.py +114 -0
- exposurecheck/cascade/__init__.py +6 -0
- exposurecheck/cascade/deterministic.py +125 -0
- exposurecheck/cascade/pipeline.py +74 -0
- exposurecheck/cascade/prefilter.py +35 -0
- exposurecheck/cascade/summarize.py +45 -0
- exposurecheck/cli.py +197 -0
- exposurecheck/metadata/__init__.py +4 -0
- exposurecheck/metadata/exif.py +221 -0
- exposurecheck/models.py +209 -0
- exposurecheck/output/__init__.py +6 -0
- exposurecheck/output/interactive.py +70 -0
- exposurecheck/output/report.py +106 -0
- exposurecheck/parsers/__init__.py +17 -0
- exposurecheck/parsers/_source.py +86 -0
- exposurecheck/parsers/_util.py +21 -0
- exposurecheck/parsers/reddit.py +103 -0
- exposurecheck/parsers/twitter.py +157 -0
- exposurecheck/remediation/__init__.py +5 -0
- exposurecheck/remediation/advise.py +24 -0
- exposurecheck/risk/__init__.py +7 -0
- exposurecheck/risk/card.py +39 -0
- exposurecheck/risk/categories.py +118 -0
- exposurecheck/risk/scoring.py +50 -0
- exposurecheck/safety/__init__.py +6 -0
- exposurecheck/safety/consent.py +38 -0
- exposurecheck/safety/offline.py +60 -0
- exposurecheck/safety/warnings.py +41 -0
- exposurecheck-0.1.0.dist-info/METADATA +217 -0
- exposurecheck-0.1.0.dist-info/RECORD +42 -0
- exposurecheck-0.1.0.dist-info/WHEEL +5 -0
- exposurecheck-0.1.0.dist-info/entry_points.txt +2 -0
- exposurecheck-0.1.0.dist-info/licenses/LICENSE +21 -0
- exposurecheck-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""exposurecheck — audit your own social-media export for re-identification risk.
|
|
2
|
+
|
|
3
|
+
Local-first. No-dossier. Bring-your-own-LLM (cloud BYOK or local).
|
|
4
|
+
|
|
5
|
+
This package never phones home, never scrapes, and never writes a synthesized
|
|
6
|
+
profile of you to disk. It shows you, by category, what a *mosaic* re-identification
|
|
7
|
+
attack could reconstruct from the public history you already published — and points
|
|
8
|
+
you back at your own posts so you can edit or generalize them.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
exposurecheck/audit.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Top-level orchestration: parsed exports + a backend -> AuditResult."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Callable, Optional
|
|
6
|
+
|
|
7
|
+
from .backends.base import Backend
|
|
8
|
+
from .cascade import run_cascade
|
|
9
|
+
from .models import AuditResult, Export
|
|
10
|
+
from .risk import build_cards
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def run_audit(
    exports: list[Export],
    backend: Backend,
    *,
    candidate_fraction: float = 1.0,
    max_candidates: Optional[int] = None,
    batch_size: int = 10,
    progress: Optional[Callable[[int, int], None]] = None,
) -> AuditResult:
    """Run the cascade over *exports* with *backend* and wrap up the result.

    The tuning keywords (``candidate_fraction``, ``max_candidates``,
    ``batch_size``, ``progress``) are forwarded to ``run_cascade`` untouched.
    The cascade's findings are turned into cards with ``build_cards`` and
    returned inside an ``AuditResult`` together with the cascade counters.
    """
    cascade_options = {
        "candidate_fraction": candidate_fraction,
        "max_candidates": max_candidates,
        "batch_size": batch_size,
        "progress": progress,
    }
    outcome = run_cascade(exports, backend, **cascade_options)
    cards = build_cards(outcome.findings)

    # One platform entry per export, in the order the exports were given.
    platforms = [export.platform for export in exports]

    # Bookkeeping counters copied from the cascade outcome, plus the total
    # number of media items across all exports.
    meta = {
        "dropped": outcome.dropped_count,
        "kept": outcome.kept_count,
        "not_analyzed": outcome.not_analyzed_count,
        "raw": outcome.raw_count,
        "media_count": sum(len(export.media) for export in exports),
    }

    return AuditResult(
        cards=cards,
        findings=outcome.findings,
        backend_name=backend.name,
        post_count=outcome.post_count,
        candidate_count=outcome.candidate_count,
        platforms=platforms,
        meta=meta,
    )
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Pluggable inference backends and a small factory.
|
|
2
|
+
|
|
3
|
+
heuristic - offline regex stub (no key, low recall — dev/CI/demo only)
|
|
4
|
+
cloud - OpenAI-compatible endpoint, bring-your-own-key (sends data offsite)
|
|
5
|
+
local - local Ollama server (no network egress)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
from .base import Backend, RawInference
|
|
13
|
+
from .heuristic import HeuristicBackend
|
|
14
|
+
from .llm import LLMBackend
|
|
15
|
+
from .transports import CloudTransport, LocalTransport, TransportError
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"Backend", "RawInference", "HeuristicBackend", "LLMBackend",
|
|
19
|
+
"CloudTransport", "LocalTransport", "TransportError", "build_backend",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def build_backend(
    kind: str,
    *,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    cheap_model: Optional[str] = None,
    expensive_model: Optional[str] = None,
    timeout: Optional[float] = None,
) -> Backend:
    """Construct the backend named by *kind* (case-insensitive).

    ``heuristic`` (also used when *kind* is empty) returns a
    ``HeuristicBackend``. ``cloud`` and ``local`` wrap a ``CloudTransport`` /
    ``LocalTransport`` in an ``LLMBackend``; any falsy option falls back to
    the default spelled out below.

    Raises:
        ValueError: if *kind* is not one of heuristic, cloud or local.
    """
    selected = (kind or "heuristic").lower()

    if selected == "heuristic":
        return HeuristicBackend()

    if selected == "cloud":
        endpoint = base_url if base_url else "https://api.openai.com/v1"
        wait = timeout if timeout else 60.0
        cloud = CloudTransport(api_key or "", base_url=endpoint, timeout=wait)
        return LLMBackend(
            cloud,
            cheap_model=cheap_model or "gpt-4o-mini",
            expensive_model=expensive_model or "gpt-4o",
        )

    if selected == "local":
        endpoint = base_url if base_url else "http://localhost:11434"
        wait = timeout if timeout else 120.0
        local = LocalTransport(base_url=endpoint, timeout=wait)
        # The expensive model wins; the cheap tier reuses it unless a cheap
        # model was named explicitly.
        primary = expensive_model or cheap_model or "llama3.1"
        return LLMBackend(
            local,
            cheap_model=cheap_model or primary,
            expensive_model=primary,
        )

    raise ValueError(f"unknown backend: {selected!r} (use heuristic|cloud|local)")
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Mechanical masked-snippet generation.
|
|
2
|
+
|
|
3
|
+
The masked snippet shown in a risk card is ALWAYS generated here from post
|
|
4
|
+
metadata (evidence label + where + when) — never from model free-text. This
|
|
5
|
+
guarantees the no-dossier invariant holds regardless of what an LLM returns:
|
|
6
|
+
the resolved value can only ever appear when the user clicks through to their
|
|
7
|
+
OWN original post.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from ..models import Post
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def masked_reference(post: Post, evidence_type: str) -> str:
    """Build the ``[label] | where | when`` reference shown for a finding.

    Only the evidence label and the post's community/platform and creation
    date go into the string; the post text is never read.
    """
    if post.created_at:
        when = post.created_at.date().isoformat()
    else:
        when = "?"  # undated post

    if post.community:
        where = f"r/{post.community}"
    else:
        where = post.platform.value

    return "[{}] | {} | {}".format(evidence_type, where, when)
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Backend contract for the leak-inference cascade.
|
|
2
|
+
|
|
3
|
+
A backend answers two questions about the user's OWN posts:
|
|
4
|
+
|
|
5
|
+
route(posts) -> a 0..1 priority per post (cheap tier). Lower priority never
|
|
6
|
+
means "dropped" — it only means "analyze later / sample less".
|
|
7
|
+
extract(batch) -> structured leak inferences for a small batch (expensive tier).
|
|
8
|
+
|
|
9
|
+
Backends never return resolved personal values: an inference carries a category,
|
|
10
|
+
a confidence, a MASKED snippet and a reference back to the user's own post.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from abc import ABC, abstractmethod
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
from ..models import Confidence, Platform, Post, RiskCategory, Source
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class RawInference:
    """A single leak signal before aggregation.

    Stores the category and confidence of the signal, a masked snippet and
    evidence label, and a reference (id, permalink, platform) back to the
    post it came from.
    """

    # Required: what kind of leak this is and how it is displayed.
    category: RiskCategory
    confidence: Confidence
    masked_snippet: str
    evidence_type: str

    # Optional provenance; the signal source defaults to post text.
    source: Source = Source.TEXT
    post_id: Optional[str] = None
    permalink: Optional[str] = None
    platform: Optional[Platform] = None  # namespaces post_id across mixed exports
    rationale: str = ""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Backend(ABC):
    """Abstract contract shared by every inference backend.

    Subclasses implement ``route`` (one priority per post) and ``extract``
    (leak inferences for a batch of posts).
    """

    name: str = "base"
    is_local: bool = False
    #: True if running this backend sends the user's posts off their machine.
    #: Drives the conditional cloud-deanonymization warning.
    sends_data_offsite: bool = False

    @abstractmethod
    def route(self, posts: list[Post]) -> list[float]:
        """Return one priority score per post in *posts*."""

    @abstractmethod
    def extract(self, batch: list[Post]) -> list[RawInference]:
        """Return the leak inferences found in *batch*."""

    def describe(self) -> str:
        """Return a label for this backend (its ``name``)."""
        return self.name
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Deterministic, offline, dependency-free stub backend.
|
|
2
|
+
|
|
3
|
+
PURPOSE: let the whole pipeline run end-to-end with no API key and no model, for
|
|
4
|
+
development, tests and CI. It uses keyword/regex matching only.
|
|
5
|
+
|
|
6
|
+
IMPORTANT — this is NOT real protection. Regex/keyword matching has near-zero
|
|
7
|
+
recall against the actual mosaic threat: the danger is an LLM *reasoning* across
|
|
8
|
+
many weak, individually-innocuous posts, which keywords cannot reproduce. Treat
|
|
9
|
+
heuristic output as a smoke test of the plumbing, never as an audit. The CLI
|
|
10
|
+
prints a loud warning when this backend is selected.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
|
|
17
|
+
from ..models import Confidence, Post, RiskCategory, Source
|
|
18
|
+
from .base import Backend, RawInference
|
|
19
|
+
from ._mask import masked_reference
|
|
20
|
+
|
|
21
|
+
# (category, evidence_type label, pattern, confidence)
|
|
22
|
+
# Each entry pairs a risk category with the label shown for a hit, the
# case-insensitive pattern that triggers it, and the confidence assigned.
_LEXICON: list[tuple[RiskCategory, str, re.Pattern, Confidence]] = [
    (
        RiskCategory.LOCATION,
        "location clue",
        re.compile(
            r"\b(live|lives|living|moved|move|near|downtown|neighbou?rhood|commute|"
            r"ferry|station|light-rail|marina|reservoir|uptown|my (block|street))\b",
            re.I,
        ),
        Confidence.LOW,
    ),
    (
        RiskCategory.EMPLOYER,
        "employer mention",
        re.compile(
            r"\b(my (company|team|employer|office|boss)|at the startup|we hire|we're hiring|"
            r"i work (at|for)|as an? (sre|engineer|developer|manager|analyst))\b",
            re.I,
        ),
        Confidence.MEDIUM,
    ),
    (
        RiskCategory.SCHEDULE,
        "routine / timing clue",
        re.compile(
            r"\b(\d{1,2}(:\d{2})?\s?(am|pm)|every (weekday|morning|day)|on-call|"
            r"before work|after work|my (morning|commute) routine)\b",
            re.I,
        ),
        Confidence.LOW,
    ),
    (
        RiskCategory.FAMILY,
        "family reference",
        re.compile(
            r"\b(my (daughter|son|kid|kids|child|children|wife|husband|partner|mom|dad)|"
            r"kindergarten|elementary school|drop-off)\b",
            re.I,
        ),
        Confidence.MEDIUM,
    ),
    (
        RiskCategory.FINANCE,
        "financial disclosure",
        re.compile(
            # "i own \d+" (not "\d"): with a single digit the closing \b fails
            # inside a multi-digit number, so "i own 50 ..." never matched.
            r"\b(\d+(\.\d+)?\s?btc|bitcoin|stacking|cold storage|my (portfolio|salary|income)|"
            r"net worth|i hold|i own \d+)\b",
            re.I,
        ),
        Confidence.LOW,
    ),
    (
        RiskCategory.AGE_DOB,
        "age / DOB clue",
        re.compile(
            r"\b(i'?m \d{2}\b|born in (19|20)\d{2}|turned \d{2}|my birthday|\bm\d{2}\b|\bf\d{2}\b)",
            re.I,
        ),
        Confidence.MEDIUM,
    ),
    (
        RiskCategory.IDENTITY_LINK,
        "cross-account / identity clue",
        re.compile(
            r"\b(my real name|same (handle|username)|my (other|main) account|on my (blog|site)|"
            r"my personal (site|website))\b",
            re.I,
        ),
        Confidence.MEDIUM,
    ),
]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def scan_text(text: str) -> list[tuple[RiskCategory, str, Confidence]]:
    """Return ``(category, label, confidence)`` for every lexicon pattern found in *text*.

    ``None`` or empty text is searched as the empty string. Results follow
    lexicon order, at most one per entry.
    """
    haystack = text or ""
    return [
        (category, label, conf)
        for category, label, pattern, conf in _LEXICON
        if pattern.search(haystack)
    ]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class HeuristicBackend(Backend):
    """Keyword/regex backend that runs without a model or network access."""

    name = "heuristic"
    is_local = True
    sends_data_offsite = False

    def route(self, posts: list[Post]) -> list[float]:
        """Score each post from a 0.1 floor plus fixed bonuses, capped at 1.0.

        Bonuses: any lexicon hit in the text, presence of URLs, presence of
        mentions, and any media item whose EXIF reports a location.
        """
        priorities = []
        for post in posts:
            body = post.text or ""
            score = 0.1
            if any(entry[2].search(body) for entry in _LEXICON):
                score += 0.4
            if post.urls:
                score += 0.15
            if post.mentions:
                score += 0.1
            if any(item.exif and item.exif.has_location() for item in post.media):
                score += 0.5
            priorities.append(min(score, 1.0))
        return priorities

    def extract(self, batch: list[Post]) -> list[RawInference]:
        """Emit one inference per (post, lexicon entry) whose pattern matches the text."""
        found: list[RawInference] = []
        for post in batch:
            body = post.text or ""
            for category, label, pattern, conf in _LEXICON:
                if not pattern.search(body):
                    continue
                inference = RawInference(
                    category=category,
                    confidence=conf,
                    masked_snippet=masked_reference(post, label),
                    evidence_type=label,
                    source=Source.TEXT,
                    post_id=post.post_id,
                    permalink=post.permalink,
                    platform=post.platform,
                    rationale=f"{label} detected by keyword match (low-recall stub).",
                )
                found.append(inference)
        return found
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""LLM-driven backend: the real mosaic-inference engine.
|
|
2
|
+
|
|
3
|
+
Wraps a Transport (cloud or local). The model only ever decides *which* post
|
|
4
|
+
leaks *which* category, at what confidence — it is never asked for, and never
|
|
5
|
+
trusted to produce, the masked snippet or rationale (we generate those). That
|
|
6
|
+
keeps the no-dossier guarantee independent of model behaviour.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import re
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
from ..models import Confidence, Post, RiskCategory, Source
|
|
16
|
+
from .base import Backend, RawInference
|
|
17
|
+
from ._mask import masked_reference
|
|
18
|
+
from .transports import Transport
|
|
19
|
+
|
|
20
|
+
# Serialized category value -> enum member, and the same values as an ordered
# list (enum definition order) for embedding in the extraction prompt.
_CAT_BY_VALUE = {member.value: member for member in RiskCategory}
_CATS = list(_CAT_BY_VALUE)
|
|
22
|
+
|
|
23
|
+
# Generic, developer-controlled evidence labels. The model is NEVER trusted to
|
|
24
|
+
# supply display text: it only chooses category/confidence/post_id, and the label
|
|
25
|
+
# shown on the card and inside the masked snippet is looked up here. This keeps
|
|
26
|
+
# the no-dossier guarantee independent of model behaviour (a model that tries to
|
|
27
|
+
# smuggle a resolved value into `evidence_type` cannot reach the output surface).
|
|
28
|
+
# One fixed display label per category; the label is looked up by category and
# never taken from model output.
_LLM_EVIDENCE_LABEL: dict[RiskCategory, str] = dict((
    (RiskCategory.LOCATION, "location signal"),
    (RiskCategory.EMPLOYER, "employer signal"),
    (RiskCategory.EDUCATION, "education signal"),
    (RiskCategory.FAMILY, "family signal"),
    (RiskCategory.AGE_DOB, "age / DOB signal"),
    (RiskCategory.HEALTH, "health signal"),
    (RiskCategory.FINANCE, "financial signal"),
    (RiskCategory.REAL_NAME, "real-name signal"),
    (RiskCategory.SCHEDULE, "routine / timing signal"),
    (RiskCategory.RELATIONSHIPS, "relationship signal"),
    (RiskCategory.POLITICS_RELIGION, "affiliation signal"),
    (RiskCategory.IDENTITY_LINK, "account-linkage signal"),
))
|
|
42
|
+
# Loose single-word aliases a model may use instead of the exact category
# value. Insertion order is significant: lookups return the first alias that
# matches, so the groups below must stay in this order.
_CAT_ALIASES = {
    alias: category
    for category, aliases in (
        (RiskCategory.AGE_DOB, ("dob", "age")),
        (RiskCategory.EMPLOYER, ("job", "work", "company")),
        (RiskCategory.REAL_NAME, ("name", "identity")),
        (RiskCategory.IDENTITY_LINK, ("link", "account")),
        (RiskCategory.FINANCE, ("money", "financial")),
        (RiskCategory.POLITICS_RELIGION, ("politics", "religion")),
        (RiskCategory.LOCATION, ("home", "address", "city")),
        (RiskCategory.EDUCATION, ("school",)),
    )
    for alias in aliases
}
|
|
52
|
+
|
|
53
|
+
# System prompt for the cheap routing tier, assembled sentence by sentence.
_ROUTE_SYS = "".join((
    # Task framing.
    "You rank a user's OWN social-media posts by how much identifying personal "
    "information they could leak when AGGREGATED with the rest "
    "(location, employer, family, schedule, finances, identity links, etc.). ",
    # Output contract.
    'Output strict JSON only: an array of {"i": <index int>, "s": <float 0..1>}. ',
    "Higher s = more identifying signal. ",
    "Include every index given. ",
    # Scoring guidance.
    "Short or weak posts can still score moderate; "
    "only truly empty or boilerplate posts score near 0.",
))
|
|
61
|
+
|
|
62
|
+
# System prompt for the expensive extraction tier, one tuple element per
# output line; the allowed category values are embedded as a JSON array.
_EXTRACT_SYS = "\n".join((
    "You are a privacy auditor helping a user reduce their OWN re-identification risk. "
    "You see only the user's own posts. "
    "Acting as a careful adversary, decide which posts leak which categories of "
    "personal information when aggregated across the whole history, "
    "including weak individually-innocuous signals.",
    "Output STRICT JSON only: an array of objects "
    '{"post_id": str, "category": one of '
    + json.dumps(_CATS)
    + ', "confidence": "low"|"medium"|"high"}.',
    "HARD RULES:",
    "- Do NOT output any resolved value (no real city, neighbourhood, street, "
    "employer name, person name, handle, exact age, or coordinates). "
    "Return only the structured fields above; "
    "the tool generates all displayed text itself.",
    "- Only use post_id values that appear in the input.",
    "- One object per (post, category) you find. Omit posts that leak nothing.",
))
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class LLMBackend(Backend):
    """Two-tier LLM backend driven through a ``Transport``.

    ``route`` asks the cheap model for a priority per post; ``extract`` asks
    the expensive model which posts leak which category. Only the category,
    confidence and post id are read from model output; the evidence label and
    masked snippet are generated locally.

    A transport failure never aborts the run: routing keeps its default score
    for the affected chunk and extraction returns no inferences for the batch.
    Each distinct failure is reported once through ``logging`` so an outage or
    a bad key is not mistaken for a clean result.
    """

    def __init__(self, transport: Transport, *, cheap_model: str, expensive_model: str,
                 route_chunk: int = 25, name: Optional[str] = None):
        """Store the transport and model names; locality flags mirror the transport."""
        self.transport = transport
        self.cheap_model = cheap_model
        self.expensive_model = expensive_model
        self.route_chunk = route_chunk
        self.name = name or f"llm:{transport.name}"
        self.is_local = transport.is_local
        self.sends_data_offsite = transport.sends_data_offsite

    def _report_failure(self, stage: str, post_count: int, exc: Exception) -> None:
        """Log a transport failure once per (stage, exception type)."""
        import logging  # function-scope import: this block cannot edit the module imports

        reported = self.__dict__.setdefault("_reported_failures", set())
        key = (stage, type(exc).__name__)
        if key in reported:
            return
        reported.add(key)
        # Only the exception class is logged: some transport error messages
        # embed part of the server/model response, which must not reach logs.
        logging.getLogger(__name__).warning(
            "%s: %s request failed (%s); %d post(s) in this request were not "
            "analyzed by the model (further identical failures are not repeated)",
            self.name, stage, type(exc).__name__, post_count,
        )

    @staticmethod
    def _find_post(pid, by_id: dict, by_text_id: dict):
        """Resolve a model-supplied post id to a post, or ``None``."""
        try:
            post = by_id.get(pid)
        except TypeError:  # unhashable junk such as a list or dict
            return None
        if post is None and isinstance(pid, (str, int)) and not isinstance(pid, bool):
            # A numeric-looking id may come back as a number (or a number as a
            # string); compare on the string form before giving up.
            post = by_text_id.get(str(pid))
        return post

    # -- cheap tier -------------------------------------------------------- #
    def route(self, posts: list[Post]) -> list[float]:
        """Return a priority in [0, 1] per post; unscored posts keep 0.3."""
        scores = [0.3] * len(posts)  # recall-preserving default: nothing starts at 0
        for start in range(0, len(posts), self.route_chunk):
            chunk = posts[start:start + self.route_chunk]
            # Indices are chunk-relative; text is truncated to 280 characters.
            payload = [{"i": i, "t": (p.text or "")[:280]} for i, p in enumerate(chunk)]
            try:
                raw = self.transport.complete(
                    _ROUTE_SYS, json.dumps(payload), self.cheap_model,
                    max_tokens=400, temperature=0.0)
            except Exception as exc:
                # keep defaults for this chunk on transport failure
                self._report_failure("route", len(chunk), exc)
                continue
            for obj in _loads_array(raw if isinstance(raw, str) else ""):
                if not isinstance(obj, dict):
                    continue
                try:
                    i = int(obj["i"])
                    s = float(obj["s"])
                except (KeyError, TypeError, ValueError, OverflowError):
                    # OverflowError: json accepts Infinity, int(inf) raises it.
                    continue
                if 0 <= i < len(chunk):
                    scores[start + i] = max(0.0, min(1.0, s))
        return scores

    # -- expensive tier ---------------------------------------------------- #
    def extract(self, batch: list[Post]) -> list[RawInference]:
        """Return the inferences the expensive model reports for *batch*.

        Objects with an unknown category or a post id that is not in the
        batch are ignored. Returns ``[]`` when the transport call fails.
        """
        by_id = {p.post_id: p for p in batch}
        by_text_id = {str(key): post for key, post in by_id.items()}
        payload = [{
            "post_id": p.post_id,
            "community": p.community or "",
            "date": p.created_at.date().isoformat() if p.created_at else "",
            "text": (p.text or "")[:600],
        } for p in batch]
        try:
            raw = self.transport.complete(
                _EXTRACT_SYS, json.dumps(payload), self.expensive_model,
                max_tokens=900, temperature=0.0)
        except Exception as exc:
            self._report_failure("extract", len(batch), exc)
            return []

        out: list[RawInference] = []
        for obj in _loads_array(raw if isinstance(raw, str) else ""):
            if not isinstance(obj, dict):
                continue
            cat = _to_category(obj.get("category"))
            post = self._find_post(obj.get("post_id"), by_id, by_text_id)
            if cat is None or post is None:
                continue  # ignore hallucinated ids / unknown categories
            # label is mechanical (never the model's free text)
            label = _LLM_EVIDENCE_LABEL.get(cat, "personal signal")
            out.append(RawInference(
                category=cat,
                confidence=_to_conf(obj.get("confidence")),
                masked_snippet=masked_reference(post, label),  # we generate it
                evidence_type=label,
                source=Source.TEXT,
                post_id=post.post_id,
                permalink=post.permalink,
                platform=post.platform,
            ))
        return out
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# --------------------------------------------------------------------------- #
|
|
153
|
+
# parsing helpers
|
|
154
|
+
# --------------------------------------------------------------------------- #
|
|
155
|
+
|
|
156
|
+
def _strip_fences(s: str) -> str:
    """Return *s* trimmed, with a leading/trailing Markdown code fence removed.

    An optional alphanumeric language tag after the opening fence is dropped
    as well. Text that does not start with a fence is only trimmed.
    """
    body = s.strip()
    if not body.startswith("```"):
        return body
    without_open = re.sub(r"^```[a-zA-Z0-9]*\s*", "", body)
    without_close = re.sub(r"\s*```$", "", without_open)
    return without_close.strip()
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _loads_array(text: str) -> list:
    """Best-effort extraction of a JSON array from model output.

    Order of attempts after stripping code fences:
      1. the whole text parses as a list -> that list;
      2. it parses as an object holding a list under ``inferences``,
         ``results``, ``items`` or ``data`` -> that list;
      3. the span from the first ``[`` to the last ``]`` parses as a list.
    Anything else yields ``[]``.
    """
    cleaned = _strip_fences(text or "")

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(parsed, list):
            return parsed
        if isinstance(parsed, dict):
            for envelope_key in ("inferences", "results", "items", "data"):
                candidate = parsed.get(envelope_key)
                if isinstance(candidate, list):
                    return candidate

    # Fall back to the outermost bracketed span (e.g. prose around the array).
    first = cleaned.find("[")
    last = cleaned.rfind("]")
    if not (0 <= first < last):
        return []
    try:
        salvaged = json.loads(cleaned[first:last + 1])
    except json.JSONDecodeError:
        return []
    return salvaged if isinstance(salvaged, list) else []
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _to_category(s) -> Optional[RiskCategory]:
    """Map a model-supplied category string to a ``RiskCategory``, or ``None``.

    The value is lower-cased and spaces, slashes and hyphens become
    underscores. An exact category value wins; otherwise the first alias that
    equals one of the underscore-separated tokens is used.
    """
    if not s:
        return None

    separators = str.maketrans({" ": "_", "/": "_", "-": "_"})
    normalized = str(s).strip().lower().translate(separators)

    exact = _CAT_BY_VALUE.get(normalized)
    if exact is not None:
        return exact

    # whole-token match so "accountant" != "account", "worker" != "work"
    words = set(normalized.split("_"))
    return next(
        (category for alias, category in _CAT_ALIASES.items() if alias in words),
        None,
    )
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _to_conf(s) -> Confidence:
    """Map a model-supplied confidence string to ``Confidence``.

    Matching is case-insensitive and ignores surrounding whitespace; ``med``
    is accepted for medium. Anything unrecognized falls back to LOW.
    """
    label = str(s or "").strip().lower()
    if label == "high":
        return Confidence.HIGH
    if label in ("medium", "med"):
        return Confidence.MEDIUM
    return Confidence.LOW
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""HTTP transports for LLM backends, standard-library only (urllib, no requests).
|
|
2
|
+
|
|
3
|
+
Two shapes, both selected by the user:
|
|
4
|
+
CloudTransport - any OpenAI-compatible /chat/completions endpoint (OpenAI,
|
|
5
|
+
OpenRouter, or a self-hosted gateway). Bring your own key.
|
|
6
|
+
LocalTransport - a local Ollama server (/api/chat). No network egress.
|
|
7
|
+
|
|
8
|
+
A local OpenAI-compatible server (llama.cpp, LM Studio) can also be driven by
|
|
9
|
+
CloudTransport pointed at http://localhost:... with any placeholder key.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import urllib.error
|
|
16
|
+
import urllib.request
|
|
17
|
+
from abc import ABC, abstractmethod
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TransportError(RuntimeError):
    """Error raised by the LLM transports in this module."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Transport(ABC):
    """Abstract chat-completion transport used by the LLM backend."""

    name: str = "transport"
    is_local: bool = False
    sends_data_offsite: bool = False

    @abstractmethod
    def complete(self, system: str, user: str, model: str, *,
                 max_tokens: int = 800, temperature: float = 0.0) -> str:
        """Send a system and a user message to *model* and return the reply text."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _post_json(url: str, payload: dict, headers: dict, timeout: float) -> dict:
    """POST *payload* as JSON to *url* and return the decoded JSON response.

    Args:
        url: full endpoint URL.
        payload: JSON-serializable request body.
        headers: extra request headers (``Content-Type`` is always set).
        timeout: timeout in seconds passed to ``urlopen``.

    Raises:
        TransportError: for a malformed URL, an HTTP error status, an
            unreachable host, a connection that fails or times out while the
            response is read, or a response body that is not valid JSON.
    """
    from http.client import HTTPException  # function-scope: not in the module imports

    body = json.dumps(payload).encode("utf-8")
    try:
        # Request() itself rejects URLs without a usable scheme.
        req = urllib.request.Request(url, data=body, method="POST")
    except ValueError as e:
        raise TransportError(f"invalid URL {url!r}: {e}") from e
    req.add_header("Content-Type", "application/json")
    for k, v in headers.items():
        req.add_header(k, v)

    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as e:
        detail = e.read().decode("utf-8", "replace")[:500] if e.fp else ""
        raise TransportError(f"HTTP {e.code} from {url}: {detail}") from e
    except urllib.error.URLError as e:
        raise TransportError(f"cannot reach {url}: {e.reason}") from e
    except (OSError, HTTPException, ValueError) as e:
        # Timeouts/resets while reading and protocol-level failures are not
        # wrapped in URLError by urllib.
        raise TransportError(f"request to {url} failed: {type(e).__name__}: {e}") from e

    try:
        return json.loads(raw.decode("utf-8", "replace"))
    except ValueError as e:
        # The body is deliberately left out of the message.
        raise TransportError(f"non-JSON response from {url}") from e
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class CloudTransport(Transport):
    """Transport for an OpenAI-compatible ``/chat/completions`` endpoint.

    Always flagged as sending data offsite.
    """

    is_local = False
    sends_data_offsite = True

    def __init__(self, api_key: str, *, base_url: str = "https://api.openai.com/v1",
                 timeout: float = 60.0, name: Optional[str] = None):
        """Store the key and endpoint settings.

        Raises:
            TransportError: if *api_key* is empty.
        """
        if not api_key:
            raise TransportError("cloud backend needs an API key (set it via env, never on the CLI)")
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")  # avoid a double slash when joining paths
        self.timeout = timeout
        self.name = name or f"cloud({self.base_url})"

    def complete(self, system, user, model, *, max_tokens=800, temperature=0.0) -> str:
        """Send one system+user exchange and return the first choice's content.

        Raises:
            TransportError: if the response lacks ``choices[0].message.content``
                (in addition to anything ``_post_json`` raises).
        """
        endpoint = f"{self.base_url}/chat/completions"
        request_body = {
            "model": model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        auth_header = {"Authorization": f"Bearer {self.api_key}"}
        data = _post_json(endpoint, request_body, auth_header, self.timeout)
        try:
            first_choice = data["choices"][0]
            return first_choice["message"]["content"]
        except (KeyError, IndexError, TypeError) as e:
            raise TransportError(f"unexpected response shape: {str(data)[:300]}") from e
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class LocalTransport(Transport):
    """Transport for an Ollama server's ``/api/chat`` endpoint.

    The class-level flags describe the default loopback endpoint. When
    *base_url* names any other host, the instance is flagged as non-local and
    as sending data offsite, because the posts then leave this machine.
    """

    is_local = True
    sends_data_offsite = False

    def __init__(self, *, base_url: str = "http://localhost:11434",
                 timeout: float = 120.0, name: Optional[str] = None):
        """Store endpoint settings and derive the locality flags from the host."""
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.name = name or f"local({self.base_url})"
        if not self._is_loopback(self.base_url):
            # Not provably this machine: report it honestly so the offsite
            # warning is not skipped.
            self.is_local = False
            self.sends_data_offsite = True

    @staticmethod
    def _is_loopback(base_url: str) -> bool:
        """Return True only if the URL's host is ``localhost`` or a loopback IP."""
        # Function-scope imports: these names are not in the module imports.
        from ipaddress import ip_address
        from urllib.parse import urlsplit

        try:
            host = urlsplit(base_url).hostname
        except ValueError:
            return False
        if not host:
            return False
        if host == "localhost" or host.endswith(".localhost"):
            return True
        try:
            return ip_address(host).is_loopback
        except ValueError:  # a DNS name other than localhost
            return False

    def complete(self, system, user, model, *, max_tokens=800, temperature=0.0) -> str:
        """Send one non-streaming system+user exchange and return the reply content.

        Raises:
            TransportError: if the response lacks ``message.content``
                (in addition to anything ``_post_json`` raises).
        """
        request_body = {
            "model": model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "stream": False,
            "options": {"temperature": temperature, "num_predict": max_tokens},
        }
        data = _post_json(f"{self.base_url}/api/chat", request_body, {}, self.timeout)
        try:
            return data["message"]["content"]
        except (KeyError, TypeError) as e:
            raise TransportError(f"unexpected Ollama response: {str(data)[:300]}") from e
|