chatpanel-pii 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +168 -0
- package/README.md +58 -0
- package/index.js +14 -0
- package/package.json +45 -0
- package/pii-detect.js +156 -0
- package/pii-redact.js +193 -0
- package/pipeline.js +130 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# PolyForm Shield License 1.0.0
|
|
2
|
+
|
|
3
|
+
<https://polyformproject.org/licenses/shield/1.0.0>
|
|
4
|
+
|
|
5
|
+
Required Notice: Copyright © 2026 ChatPanel (https://chatpanel.net)
|
|
6
|
+
|
|
7
|
+
Licensor Line of Business: ChatPanel — an AI browser side-panel, its local
|
|
8
|
+
bridge, and related developer tools and services (https://chatpanel.net)
|
|
9
|
+
|
|
10
|
+
## Acceptance
|
|
11
|
+
|
|
12
|
+
In order to get any license under these terms, you must agree
|
|
13
|
+
to them as both strict obligations and conditions to all
|
|
14
|
+
your licenses.
|
|
15
|
+
|
|
16
|
+
## Copyright License
|
|
17
|
+
|
|
18
|
+
The licensor grants you a copyright license for the
|
|
19
|
+
software to do everything you might do with the software
|
|
20
|
+
that would otherwise infringe the licensor's copyright
|
|
21
|
+
in it for any permitted purpose. However, you may
|
|
22
|
+
only distribute the software according to [Distribution
|
|
23
|
+
License](#distribution-license) and make changes or new works
|
|
24
|
+
based on the software according to [Changes and New Works
|
|
25
|
+
License](#changes-and-new-works-license).
|
|
26
|
+
|
|
27
|
+
## Distribution License
|
|
28
|
+
|
|
29
|
+
The licensor grants you an additional copyright license to
|
|
30
|
+
distribute copies of the software. Your license to distribute
|
|
31
|
+
covers distributing the software with changes and new works
|
|
32
|
+
permitted by [Changes and New Works License](#changes-and-new-works-license).
|
|
33
|
+
|
|
34
|
+
## Notices
|
|
35
|
+
|
|
36
|
+
You must ensure that anyone who gets a copy of any part of
|
|
37
|
+
the software from you also gets a copy of these terms or the
|
|
38
|
+
URL for them above, as well as copies of any plain-text lines
|
|
39
|
+
beginning with `Required Notice:` that the licensor provided
|
|
40
|
+
with the software. For example:
|
|
41
|
+
|
|
42
|
+
> Required Notice: Copyright © 2026 ChatPanel (https://chatpanel.net)
|
|
43
|
+
|
|
44
|
+
## Changes and New Works License
|
|
45
|
+
|
|
46
|
+
The licensor grants you an additional copyright license to
|
|
47
|
+
make changes and new works based on the software for any
|
|
48
|
+
permitted purpose.
|
|
49
|
+
|
|
50
|
+
## Patent License
|
|
51
|
+
|
|
52
|
+
The licensor grants you a patent license for the software that
|
|
53
|
+
covers patent claims the licensor can license, or becomes able
|
|
54
|
+
to license, that you would infringe by using the software.
|
|
55
|
+
|
|
56
|
+
## Noncompete
|
|
57
|
+
|
|
58
|
+
Any purpose is a permitted purpose, except for providing any
|
|
59
|
+
product that competes with the software or any product the
|
|
60
|
+
licensor or any of its affiliates provides using the software.
|
|
61
|
+
|
|
62
|
+
## Competition
|
|
63
|
+
|
|
64
|
+
Goods and services compete even when they provide functionality
|
|
65
|
+
through different kinds of interfaces or for different technical
|
|
66
|
+
platforms. Applications can compete with services, libraries
|
|
67
|
+
with plugins, frameworks with development tools, and so on,
|
|
68
|
+
even if they're written in different programming languages
|
|
69
|
+
or for different computer architectures. Goods and services
|
|
70
|
+
compete even when provided free of charge. If you market a
|
|
71
|
+
product as a practical substitute for the software or another
|
|
72
|
+
product, it definitely competes.
|
|
73
|
+
|
|
74
|
+
## New Products
|
|
75
|
+
|
|
76
|
+
If you are using the software to provide a product that does
|
|
77
|
+
not compete, but the licensor or any of its affiliates brings
|
|
78
|
+
your product into competition by providing a new version of
|
|
79
|
+
the software or another product using the software, you may
|
|
80
|
+
continue using versions of the software available under these
|
|
81
|
+
terms beforehand to provide your competing product, but not
|
|
82
|
+
any later versions.
|
|
83
|
+
|
|
84
|
+
## Discontinued Products
|
|
85
|
+
|
|
86
|
+
You may begin using the software to compete with a product
|
|
87
|
+
or service that the licensor or any of its affiliates has
|
|
88
|
+
stopped providing, unless the licensor includes a plain-text
|
|
89
|
+
line beginning with `Licensor Line of Business:` with the
|
|
90
|
+
software that mentions that line of business. For example:
|
|
91
|
+
|
|
92
|
+
> Licensor Line of Business: ChatPanel — an AI browser side-panel, its local
|
|
93
|
+
> bridge, and related developer tools and services (https://chatpanel.net)
|
|
94
|
+
|
|
95
|
+
## Sales of Business
|
|
96
|
+
|
|
97
|
+
If the licensor or any of its affiliates sells a line of
|
|
98
|
+
business developing the software or using the software
|
|
99
|
+
to provide a product, the buyer can also enforce
|
|
100
|
+
Noncompete for that product.
|
|
101
|
+
|
|
102
|
+
## Fair Use
|
|
103
|
+
|
|
104
|
+
You may have "fair use" rights for the software under the
|
|
105
|
+
law. These terms do not limit them.
|
|
106
|
+
|
|
107
|
+
## No Other Rights
|
|
108
|
+
|
|
109
|
+
These terms do not allow you to sublicense or transfer any of
|
|
110
|
+
your licenses to anyone else, or prevent the licensor from
|
|
111
|
+
granting licenses to anyone else. These terms do not imply
|
|
112
|
+
any other licenses.
|
|
113
|
+
|
|
114
|
+
## Patent Defense
|
|
115
|
+
|
|
116
|
+
If you make any written claim that the software infringes or
|
|
117
|
+
contributes to infringement of any patent, your patent license
|
|
118
|
+
for the software granted under these terms ends immediately. If
|
|
119
|
+
your company makes such a claim, your patent license ends
|
|
120
|
+
immediately for work on behalf of your company.
|
|
121
|
+
|
|
122
|
+
## Violations
|
|
123
|
+
|
|
124
|
+
The first time you are notified in writing that you have
|
|
125
|
+
violated any of these terms, or done anything with the software
|
|
126
|
+
not covered by your licenses, your licenses can nonetheless
|
|
127
|
+
continue if you come into full compliance with these terms,
|
|
128
|
+
and take practical steps to correct past violations, within
|
|
129
|
+
32 days of receiving notice. Otherwise, all your licenses
|
|
130
|
+
end immediately.
|
|
131
|
+
|
|
132
|
+
## No Liability
|
|
133
|
+
|
|
134
|
+
***As far as the law allows, the software comes as is, without
|
|
135
|
+
any warranty or condition, and the licensor will not be liable
|
|
136
|
+
to you for any damages arising out of these terms or the use
|
|
137
|
+
or nature of the software, under any kind of legal claim.***
|
|
138
|
+
|
|
139
|
+
## Definitions
|
|
140
|
+
|
|
141
|
+
The **licensor** is the individual or entity offering these
|
|
142
|
+
terms, and the **software** is the software the licensor makes
|
|
143
|
+
available under these terms.
|
|
144
|
+
|
|
145
|
+
A **product** can be a good or service, or a combination
|
|
146
|
+
of them.
|
|
147
|
+
|
|
148
|
+
**You** refers to the individual or entity agreeing to these
|
|
149
|
+
terms.
|
|
150
|
+
|
|
151
|
+
**Your company** is any legal entity, sole proprietorship,
|
|
152
|
+
or other kind of organization that you work for, plus all
|
|
153
|
+
its affiliates.
|
|
154
|
+
|
|
155
|
+
**Affiliates** means the other organizations that an
|
|
156
|
+
organization has control over, is under the control of, or is
|
|
157
|
+
under common control with.
|
|
158
|
+
|
|
159
|
+
**Control** means ownership of substantially all the assets of
|
|
160
|
+
an entity, or the power to direct its management and policies
|
|
161
|
+
by vote, contract, or otherwise. Control can be direct or
|
|
162
|
+
indirect.
|
|
163
|
+
|
|
164
|
+
**Your licenses** are all the licenses granted to you for the
|
|
165
|
+
software under these terms.
|
|
166
|
+
|
|
167
|
+
**Use** means anything you do with the software requiring one
|
|
168
|
+
of your licenses.
|
package/README.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# chatpanel-pii
|
|
2
|
+
|
|
3
|
+
The **canonical ChatPanel privacy engine** — reversible PII redaction +
|
|
4
|
+
pseudonymization with local entity detection. Pure, dependency-free ESM. This is
|
|
5
|
+
the **single source of truth** shared by the ChatPanel
|
|
6
|
+
[extension](https://github.com/chatpanel/chatpanel-extension),
|
|
7
|
+
[gateway](https://github.com/chatpanel/chatpanel-gateway), and
|
|
8
|
+
[bridge](https://github.com/chatpanel/chatpanel-bridge): a privacy feature added
|
|
9
|
+
here is picked up by all of them.
|
|
10
|
+
|
|
11
|
+
```js
|
|
12
|
+
import { createVault, redactText, restoreText, detectEntities } from 'chatpanel-pii';
|
|
13
|
+
|
|
14
|
+
const vault = createVault();
|
|
15
|
+
const safe = redactText('email alex@example.com', vault, { tier: 'basic' });
|
|
16
|
+
// → 'email [[EMAIL_1]]'
|
|
17
|
+
restoreText(safe, vault);
|
|
18
|
+
// → 'email alex@example.com'
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
The model only ever sees opaque, stable placeholders like `[[PERSON_1]]` /
|
|
22
|
+
`[[EMAIL_2]]`, so it can still reason about *who said what* without seeing the
|
|
23
|
+
real values — and they're reconstructed locally on the way back.
|
|
24
|
+
|
|
25
|
+
## What's inside
|
|
26
|
+
|
|
27
|
+
| Module | Exports | Role |
|
|
28
|
+
|--------|---------|------|
|
|
29
|
+
| `pii-redact.js` | `createVault`, `redactText`, `restoreText`, `restoreWithAliases`, `vaultToJSON`/`vaultFromJSON`, `hasToken` | deterministic redact/restore + the per-conversation vault |
|
|
30
|
+
| `pii-detect.js` | `detectEntities`, `normalizeEntities`, `EXTRACT_SYS`, … | local entity detection (any HTTP NER endpoint, or a local OpenAI-compatible LLM) |
|
|
31
|
+
| `pipeline.js` | `redactOutbound`, `makeStreamRestorer`, `restore`, `restoreDeep`, `redactResult`, `effectiveTier`, `gatedDictionary`, `gatedScope` | pure turn orchestration + the free/Pro tier, scope, and dictionary gating |
|
|
32
|
+
|
|
33
|
+
Import the barrel (`chatpanel-pii`) or a submodule
|
|
34
|
+
(`chatpanel-pii/pii-redact.js`).
|
|
35
|
+
|
|
36
|
+
## Tiers
|
|
37
|
+
|
|
38
|
+
- **`basic`** — deterministic regex: emails, phones, IPs, cards (Luhn), SSNs, API
|
|
39
|
+
keys, plus a small user dictionary.
|
|
40
|
+
- **`full`** — basic + entity-aware: detected people / orgs / locations and an
|
|
41
|
+
unlimited custom dictionary. `effectiveTier(cfg, isPro)` downgrades `full`→`basic`
|
|
42
|
+
for non-Pro callers, so consumers enforce free/Pro identically.
|
|
43
|
+
|
|
44
|
+
A dictionary entry with an `alias` **pseudonymizes** (permanent substitution the
|
|
45
|
+
model and the user both see); without one it **redacts** to a reversible token.
|
|
46
|
+
|
|
47
|
+
## Design notes
|
|
48
|
+
|
|
49
|
+
- **Pure + dependency-free** so it unit-tests trivially and runs identically in a
|
|
50
|
+
browser extension, a Node proxy, and a CLI bridge.
|
|
51
|
+
- **Reversibility is best-effort**: if a model paraphrases a placeholder instead
|
|
52
|
+
of echoing it, that reference won't restore — but the privacy guarantee (the
|
|
53
|
+
real value never left the device) always holds.
|
|
54
|
+
|
|
55
|
+
## License
|
|
56
|
+
|
|
57
|
+
Source-available under the same license as the rest of ChatPanel — see
|
|
58
|
+
[LICENSE](LICENSE).
|
package/index.js
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
// chatpanel-pii — the canonical ChatPanel privacy engine. Single source of truth
|
|
2
|
+
// for reversible PII redaction + pseudonymization, shared by the extension, the
|
|
3
|
+
// gateway, and the bridge. Pure + dependency-free ESM.
|
|
4
|
+
//
|
|
5
|
+
// import { createVault, redactText, restoreText, detectEntities } from 'chatpanel-pii';
|
|
6
|
+
//
|
|
7
|
+
// Submodules are also importable directly:
|
|
8
|
+
// 'chatpanel-pii/pii-redact.js' deterministic redact/restore + vault
|
|
9
|
+
// 'chatpanel-pii/pii-detect.js' local NER / LLM entity detection
|
|
10
|
+
// 'chatpanel-pii/pipeline.js' pure turn orchestration + tier/scope gating
|
|
11
|
+
|
|
12
|
+
export * from './pii-redact.js';
|
|
13
|
+
export * from './pii-detect.js';
|
|
14
|
+
export * from './pipeline.js';
|
package/package.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "chatpanel-pii",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "The canonical ChatPanel privacy engine — reversible PII redaction + pseudonymization with local entity detection. Pure, dependency-free ESM shared by the ChatPanel extension, gateway, and bridge.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "index.js",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": "./index.js",
|
|
9
|
+
"./pii-redact.js": "./pii-redact.js",
|
|
10
|
+
"./pii-detect.js": "./pii-detect.js",
|
|
11
|
+
"./pipeline.js": "./pipeline.js"
|
|
12
|
+
},
|
|
13
|
+
"files": [
|
|
14
|
+
"index.js",
|
|
15
|
+
"pii-redact.js",
|
|
16
|
+
"pii-detect.js",
|
|
17
|
+
"pipeline.js",
|
|
18
|
+
"LICENSE",
|
|
19
|
+
"README.md"
|
|
20
|
+
],
|
|
21
|
+
"scripts": {
|
|
22
|
+
"test": "node --test"
|
|
23
|
+
},
|
|
24
|
+
"engines": {
|
|
25
|
+
"node": ">=18"
|
|
26
|
+
},
|
|
27
|
+
"homepage": "https://chatpanel.net",
|
|
28
|
+
"repository": {
|
|
29
|
+
"type": "git",
|
|
30
|
+
"url": "git+https://github.com/chatpanel/chatpanel-pii.git"
|
|
31
|
+
},
|
|
32
|
+
"keywords": [
|
|
33
|
+
"privacy",
|
|
34
|
+
"pii",
|
|
35
|
+
"redaction",
|
|
36
|
+
"pseudonymization",
|
|
37
|
+
"ner",
|
|
38
|
+
"anonymization",
|
|
39
|
+
"llm",
|
|
40
|
+
"chatpanel"
|
|
41
|
+
],
|
|
42
|
+
"author": "ChatPanel (https://chatpanel.net)",
|
|
43
|
+
"license": "SEE LICENSE IN LICENSE",
|
|
44
|
+
"sideEffects": false
|
|
45
|
+
}
|
package/pii-detect.js
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
// Phase 2: configurable, LOCAL entity detection.
|
|
2
|
+
//
|
|
3
|
+
// Produces [{value, type}] spans that feed the redaction engine, so names / orgs /
|
|
4
|
+
// IDs get redacted WITHOUT a hand-maintained dictionary. Detection runs on-device
|
|
5
|
+
// only — the detector is a local NER service (spaCy / Presidio / any HTTP service)
|
|
6
|
+
// or a local LLM (OpenAI-compatible, e.g. a gemma served by llama.cpp). Raw text
|
|
7
|
+
// reaches the detector but never the final agent; only the redacted text does.
|
|
8
|
+
//
|
|
9
|
+
// Performance / flexibility (the whole point):
|
|
10
|
+
// - backends are pluggable and user-configured (URL + model + timeout).
|
|
11
|
+
// - a content-hash cache avoids re-detecting unchanged text.
|
|
12
|
+
// - a per-call timeout + fail-open means a slow/broken detector NEVER blocks the
|
|
13
|
+
// chat — redaction silently falls back to the deterministic layer.
|
|
14
|
+
// - input is length-capped so a huge transcript can't stall detection.
|
|
15
|
+
|
|
16
|
+
// Module-wide memo of detector output: cache key -> [{value, type}].
const cache = new Map();
// Entry ceiling for the memo above.
const CACHE_MAX = 300;

// Forget every memoized detection result.
export function clearDetectCache() {
  cache.clear();
}
|
|
20
|
+
|
|
21
|
+
// Build the memo key for one detection request: a djb2-style 32-bit hash of the
// backend, URL, model, category toggles and text, prefixed with the hashed
// string's length.
//
// The category toggles (person/org/location/number; anything other than an
// explicit `false` counts as on) are part of the key because the memoized value
// is the entity list AFTER toggle filtering — without them, flipping a toggle
// would keep serving results filtered with the previous settings.
function cacheKey(text, det) {
  const toggles = ['person', 'org', 'location', 'number']
    .map((k) => (det?.types?.[k] !== false ? '1' : '0'))
    .join('');
  const s = `${det?.backend}|${det?.url}|${det?.model}|${toggles}|${text}`;
  let h = 5381;
  // `| 0` keeps the running hash in signed 32-bit range.
  for (let i = 0; i < s.length; i++) h = ((h << 5) + h + s.charCodeAt(i)) | 0;
  return `${s.length}:${h}`;
}
|
|
27
|
+
|
|
28
|
+
// Race `promise` against a timer and an optional AbortSignal.
//
//   promise — the work to wait for.
//   ms      — timeout in milliseconds; falsy means 1500, and anything below 200
//             is raised to 200.
//   signal  — optional AbortSignal (or any object with the same shape).
//
// Resolves/rejects with the promise's own outcome if it settles first; rejects
// with Error('detect timeout') when the timer wins and Error('aborted') when the
// signal fires. A signal that is ALREADY aborted rejects immediately — its
// 'abort' event has fired in the past and would never reach a new listener.
// The timer and the abort listener are both released as soon as any outcome is
// decided, so a long-lived signal does not accumulate listeners.
export function withTimeout(promise, ms, signal) {
  return new Promise((resolve, reject) => {
    let timer;
    const cleanup = () => {
      clearTimeout(timer);
      signal?.removeEventListener?.('abort', onAbort);
    };
    function onAbort() {
      cleanup();
      reject(new Error('aborted'));
    }

    if (signal?.aborted) {
      reject(new Error('aborted'));
    } else {
      timer = setTimeout(() => {
        cleanup();
        reject(new Error('detect timeout'));
      }, Math.max(200, ms || 1500));
      signal?.addEventListener?.('abort', onAbort, { once: true });
    }

    // Always attach handlers, even when already rejected above, so a late
    // rejection of `promise` can never surface as an unhandled rejection.
    promise.then(
      (v) => { cleanup(); resolve(v); },
      (e) => { cleanup(); reject(e); },
    );
  });
}
|
|
39
|
+
|
|
40
|
+
// Translate a detector's label into one of our placeholder types. The label is
// upper-cased and stripped of everything but A-Z / 0-9 first, so "B-PER",
// "email_address" and "PhoneNumber" all normalize before lookup. Labels with no
// alias pass through in their cleaned form; an empty/missing label is 'ENTITY'.
function normType(t) {
  const aliases = new Map([
    ['PER', 'PERSON'], ['PERSON', 'PERSON'], ['PERSONNAME', 'PERSON'],
    ['ORG', 'ORG'], ['ORGANIZATION', 'ORG'],
    ['GPE', 'LOCATION'], ['LOC', 'LOCATION'], ['LOCATION', 'LOCATION'],
    ['NORP', 'GROUP'],
    ['EMAIL', 'EMAIL'], ['EMAILADDRESS', 'EMAIL'],
    ['PHONE', 'PHONE'], ['PHONENUMBER', 'PHONE'],
  ]);
  const cleaned = String(t || 'ENTITY').toUpperCase().replace(/[^A-Z0-9]/g, '');
  const label = cleaned || 'ENTITY';
  return aliases.get(label) || label;
}
|
|
52
|
+
|
|
53
|
+
// Identifier types that are redacted no matter what the user toggled (they are
// also caught by the deterministic layer). Everything else is governed by the
// user-facing category toggles (person / org / location / number), so e.g.
// geography questions keep working when "location" is switched off.
const ALWAYS_KEEP = new Set(['EMAIL', 'PHONE', 'SSN', 'CREDITCARD', 'IBAN', 'ID']);
// Types that answer to the "location" toggle.
const LOCATION_TYPES = new Set(['LOCATION', 'FAC', 'ADDRESS', 'GROUP', 'NRP']);

// Decide whether a detected entity survives filtering.
//   value — the entity text; type — its normalized label;
//   types — optional toggle object; a category is on unless explicitly `false`.
// Any label not handled above (DATE, CARDINAL, ORDINAL… — noisy on small NER
// models, which tag "today" / "4") is kept only when the value carries at least
// seven digits (phone/account/ID-like) and the "number" toggle is on.
function keepEntity(value, type, types) {
  const enabled = (category) => !types || types[category] !== false;

  if (ALWAYS_KEEP.has(type)) return true;
  switch (type) {
    case 'PERSON':
      return enabled('person');
    case 'ORG':
      return enabled('org');
    default:
      break;
  }
  if (LOCATION_TYPES.has(type)) return enabled('location');

  const digitCount = String(value).replace(/\D/g, '').length;
  if (digitCount < 7) return false;
  return enabled('number');
}
|
|
70
|
+
|
|
71
|
+
// Flatten any supported detector response into a de-duplicated [{value, type}].
//
//   data  — a bare array, or an object carrying the list under `entities`,
//           `ents` (spaCy displacy) or `results` (Presidio); first match wins.
//   types — optional category toggles {person, org, location, number}, handed
//           to keepEntity for filtering.
//
// Per item, the text comes from value/text/entity/word and the label from
// type/label/entity_group/entity_type/tag (first non-nullish of each). Blank
// values, values over 200 characters and entities rejected by keepEntity are
// dropped. Duplicates are detected on type + lower-cased value; the first
// spelling seen is the one returned.
export function normalizeEntities(data, types) {
  const list = [data, data?.entities, data?.ents, data?.results].find(Array.isArray) ?? [];
  const unique = new Map();
  for (const item of list) {
    if (!item) continue;
    const value = String(item.value ?? item.text ?? item.entity ?? item.word ?? '').trim();
    const type = normType(item.type ?? item.label ?? item.entity_group ?? item.entity_type ?? item.tag);
    if (!value || value.length > 200) continue;
    if (!keepEntity(value, type, types)) continue;
    const dedupeKey = `${type}:${value.toLowerCase()}`;
    if (!unique.has(dedupeKey)) unique.set(dedupeKey, { value, type });
  }
  return [...unique.values()];
}
|
|
93
|
+
|
|
94
|
+
// Pull a JSON document out of free-form model output (leading chatter, code
// fences, trailing commentary).
//
// First tries the span from the first '{' to the last '}'. If that is missing
// or does not parse, falls back to the span from the first '[' to the last ']'
// so a model that answers with a bare entity array is still understood.
// Returns the parsed value, or null when `s` is empty or neither span parses.
export function parseJsonLoose(s) {
  if (!s) return null;
  const str = String(s);
  const parseSpan = (open, close) => {
    const a = str.indexOf(open);
    const b = str.lastIndexOf(close);
    if (a < 0 || b <= a) return undefined;
    try {
      return JSON.parse(str.slice(a, b + 1));
    } catch {
      return undefined; // not valid JSON — let the caller try the next shape
    }
  };
  return parseSpan('{', '}') ?? parseSpan('[', ']') ?? null;
}
|
|
101
|
+
|
|
102
|
+
// System prompt sent to the OpenAI-compatible backend: asks for a strict JSON
// entity list with values copied verbatim from the input.
export const EXTRACT_SYS = [
  'You extract sensitive entities from text for redaction.',
  'Return ONLY JSON: {"entities":[{"value":"<verbatim text>","type":"PERSON|ORG|LOCATION|ID|EMAIL|PHONE|OTHER"}]}.',
  'Copy each value exactly as it appears. Include people, organizations, locations, and account/ID numbers. No commentary, no code fences.',
].join(' ');
|
|
105
|
+
|
|
106
|
+
// Generic HTTP NER backend: POST {text} as JSON to det.url and normalize
// whatever entity list comes back. Sends a Bearer token when det.apiKey is set.
// Throws Error('detect HTTP <status>') on a non-ok response.
async function detectViaEndpoint(text, det, signal, fetchImpl) {
  const headers = { 'Content-Type': 'application/json' };
  if (det.apiKey) headers.Authorization = `Bearer ${det.apiKey}`;

  const response = await fetchImpl(det.url, {
    method: 'POST',
    headers,
    body: JSON.stringify({ text }),
    signal,
  });
  if (!response.ok) throw new Error(`detect HTTP ${response.status}`);

  const payload = await response.json();
  return normalizeEntities(payload, det.types);
}
|
|
116
|
+
|
|
117
|
+
// Resolve the chat-completions URL from a user-configured base:
//   …/chat/completions  → used as-is
//   …/v1 (or any /vN)   → append /chat/completions (avoids /v1/v1/…)
//   anything else       → append /v1/chat/completions
// All trailing slashes are ignored.
function openAiChatUrl(rawUrl) {
  const base = String(rawUrl || '').replace(/\/+$/, '');
  if (/\/chat\/completions$/.test(base)) return base;
  if (/\/v\d+$/.test(base)) return `${base}/chat/completions`;
  return `${base}/v1/chat/completions`;
}

// OpenAI-compatible LLM backend: ask the model (EXTRACT_SYS as system prompt,
// `text` as the user message, temperature 0) for a JSON entity list, then parse
// it loosely and normalize it. det.model defaults to 'local' and det.maxTokens
// to 256; a Bearer token is sent when det.apiKey is set.
// Throws Error('detect HTTP <status>') on a non-ok response.
async function detectViaOpenAI(text, det, signal, fetchImpl) {
  const url = openAiChatUrl(det.url);
  const res = await fetchImpl(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', ...(det.apiKey ? { Authorization: `Bearer ${det.apiKey}` } : {}) },
    body: JSON.stringify({
      model: det.model || 'local',
      temperature: 0,
      max_tokens: det.maxTokens || 256,
      messages: [{ role: 'system', content: EXTRACT_SYS }, { role: 'user', content: text }],
    }),
    signal,
  });
  if (!res.ok) throw new Error(`detect HTTP ${res.status}`);
  const json = await res.json();
  // Standard shape first; `json.content` covers servers that reply with a flat body.
  const content = json?.choices?.[0]?.message?.content ?? json?.content ?? '';
  return normalizeEntities(parseJsonLoose(content), det.types);
}
|
|
136
|
+
|
|
137
|
+
// Returns [{value, type}] spans for `text`, or [] (fail-open) on any error/timeout.
//
//   text — raw text; only the first det.maxChars (default 8000) characters are
//          sent, and inputs shorter than 8 non-blank characters are skipped.
//   cfg  — config object; cfg.detection holds { backend, url, model, timeoutMs,
//          maxChars, types, … }. backend 'endpoint' uses the generic HTTP NER
//          path, any other enabled value uses the OpenAI-compatible path.
//   opts — { signal, fetchImpl (defaults to global fetch), strict }.
//          strict=true bypasses the memo entirely and rethrows failures.
//
// Only SUCCESSFUL detections are memoized. A timeout or transport error still
// returns [] for this call, but is not remembered — otherwise one transient
// detector hiccup would silently disable entity detection for that text for as
// long as the entry lived in the cache.
export async function detectEntities(text, cfg, { signal, fetchImpl = globalThis.fetch, strict = false } = {}) {
  const det = cfg?.detection;
  if (!det || !det.backend || det.backend === 'off' || !det.url || typeof fetchImpl !== 'function') return [];
  const capped = String(text || '').slice(0, det.maxChars || 8000);
  if (capped.trim().length < 8) return [];
  const key = cacheKey(capped, det);
  if (!strict && cache.has(key)) return cache.get(key);
  const run = det.backend === 'endpoint' ? detectViaEndpoint : detectViaOpenAI;
  let ents;
  try {
    ents = await withTimeout(run(capped, det, signal, fetchImpl), det.timeoutMs || 1500, signal);
  } catch (e) {
    if (strict) throw e; // surface errors to the Test button
    return []; // otherwise fail open — deterministic redaction still applies
  }
  if (!strict) {
    // Crude eviction: wipe everything once the ceiling is hit.
    if (cache.size >= CACHE_MAX) cache.clear();
    cache.set(key, ents);
  }
  return ents;
}
|
package/pii-redact.js
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
// Reversible PII redaction.
|
|
2
|
+
//
|
|
3
|
+
// Strips sensitive values out of everything that leaves the device for a model
|
|
4
|
+
// (chat text, attached page/meeting context, and tool results we feed back), then
|
|
5
|
+
// reconstructs the originals when the reply is rendered to the user. The model
|
|
6
|
+
// only ever sees opaque, stable placeholders like [[EMAIL_1]] / [[PERSON_2]] — so
|
|
7
|
+
// it can still reason about "who said what" without seeing the real values.
|
|
8
|
+
//
|
|
9
|
+
// Pure + dependency-free so it is unit-testable and runs identically for API and
|
|
10
|
+
// CLI/bridge agents (both assemble their outbound payload through providers.js).
|
|
11
|
+
//
|
|
12
|
+
// Tiers (the licensing seam):
|
|
13
|
+
// 'basic' — deterministic regex: emails, phones, IPs, cards (Luhn), SSNs, keys.
|
|
14
|
+
// 'full' — basic + entity-aware: known people/orgs (meeting roster, contacts,
|
|
15
|
+
// the user's own identity) and a user-editable custom dictionary.
|
|
16
|
+
//
|
|
17
|
+
// Reversibility caveat: if the model paraphrases instead of echoing a token, that
|
|
18
|
+
// one reference won't restore (it shows the token) — but the privacy guarantee
|
|
19
|
+
// (the real value never left the device) always holds.
|
|
20
|
+
|
|
21
|
+
// Matches one redaction placeholder such as [[EMAIL_1]] / [[PERSON_2]]:
// capture 1 is the type label (a letter, then letters/digits), capture 2 the
// counter. Carries the `g` flag, so .test()/.exec() are stateful via lastIndex.
const TOKEN_RE = /\[\[([A-Z][A-Z0-9]*)_(\d+)\]\]/g;
|
|
22
|
+
|
|
23
|
+
// A vault is the per-conversation mapping between placeholders and originals.
// Keep one per conversation so PERSON_1 means the same entity across turns.
//
//   byToken — placeholder -> original value
//   byValue — original value -> placeholder
//   counts  — type label -> highest counter handed out so far
//   aliases — pseudonym (e.g. "Alex") -> real value (e.g. "Suresh"), so LOCAL
//             tool calls (history/meeting search) can run on real data. The
//             reply restorer ignores it — pseudonyms stay permanent in the
//             user's view.
export function createVault() {
  const vault = {
    byToken: new Map(),
    byValue: new Map(),
    counts: new Map(),
    aliases: new Map(),
  };
  return vault;
}
|
|
31
|
+
|
|
32
|
+
// Serialize a vault to plain JSON-safe data:
//   { entries: [{token, value}], aliases: [{alias, value}] }.
// Only byToken and aliases are written. A missing vault (or missing map)
// serializes as empty lists.
export function vaultToJSON(vault) {
  const entries = [];
  for (const [token, value] of vault?.byToken || new Map()) {
    entries.push({ token, value });
  }
  const aliases = [];
  for (const [alias, value] of vault?.aliases || new Map()) {
    aliases.push({ alias, value });
  }
  return { entries, aliases };
}
|
|
38
|
+
|
|
39
|
+
// Rebuild a vault from the shape produced by vaultToJSON. Both lookup
// directions are restored, and each type's counter is set to the highest index
// found among the well-formed tokens. Entries whose token does not look like
// [[TYPE_n]] are still stored but do not affect the counters. Missing/empty
// input yields an empty vault.
export function vaultFromJSON(data) {
  const vault = createVault();
  const tokenShape = /^\[\[([A-Z][A-Z0-9]*)_(\d+)\]\]$/;
  const savedEntries = data?.entries || [];
  const savedAliases = data?.aliases || [];

  for (const { token, value } of savedEntries) {
    const parsed = tokenShape.exec(token);
    vault.byToken.set(token, value);
    vault.byValue.set(value, token);
    if (!parsed) continue;
    const [, label, index] = parsed;
    const highest = vault.counts.get(label) || 0;
    vault.counts.set(label, Math.max(highest, Number(index)));
  }

  for (const { alias, value } of savedAliases) {
    vault.aliases.set(alias, value);
  }
  return vault;
}
|
|
50
|
+
|
|
51
|
+
// Return the placeholder for `value`, minting a new [[TYPE_n]] if the vault has
// not seen this value before.
//
//   vault — the conversation vault (mutated: byToken/byValue/counts).
//   type  — label hint; upper-cased and reduced to A-Z / 0-9, 'PII' when empty.
//   value — the original text the placeholder stands for.
//
// A value already in the vault keeps its first token regardless of `type`.
// The label is forced to start with a letter (prefixed with 'PII' otherwise):
// placeholders are only restored when they match [[<letter><alnum>*_<n>]], so a
// digit-led label such as "2FA" would produce a token that could never be
// swapped back.
function tokenFor(vault, type, value) {
  const existing = vault.byValue.get(value);
  if (existing) return existing;
  const cleaned = String(type || 'PII').toUpperCase().replace(/[^A-Z0-9]/g, '') || 'PII';
  const t = /^[A-Z]/.test(cleaned) ? cleaned : `PII${cleaned}`;
  const n = (vault.counts.get(t) || 0) + 1;
  vault.counts.set(t, n);
  const token = `[[${t}_${n}]]`;
  vault.byToken.set(token, value);
  vault.byValue.set(value, token);
  return token;
}
|
|
62
|
+
|
|
63
|
+
// Backslash-escape every regex metacharacter in `s` so the result can be
// embedded in a RegExp source and match the text literally.
function escapeRegex(s) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return String(s).replace(metacharacters, (ch) => `\\${ch}`);
}
|
|
66
|
+
|
|
67
|
+
// Luhn checksum over a string of ASCII digits: walking from the right, every
// second digit is doubled (minus 9 when that exceeds 9); the number passes when
// the total is a multiple of 10. An empty string sums to 0 and therefore passes.
function luhnValid(digits) {
  const last = digits.length - 1;
  let total = 0;
  for (let offset = 0; offset <= last; offset += 1) {
    const digit = digits.charCodeAt(last - offset) - 48;
    if (offset % 2 === 0) {
      total += digit;
    } else {
      const doubled = digit * 2;
      total += doubled > 9 ? doubled - 9 : doubled;
    }
  }
  return total % 10 === 0;
}
|
|
78
|
+
|
|
79
|
+
// Deterministic detectors. Each: { type, re, valid? }. Order = priority; more
// specific patterns run first so they win the bytes before greedier ones.
// `re` must be global; `valid` (optional) vets a raw match and returns false to
// leave it untouched.
const DETECTORS = [
  { type: 'EMAIL', re: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g },
  { type: 'SSN', re: /\b\d{3}-\d{2}-\d{4}\b/g },
  {
    type: 'KEY',
    re: /\b(?:sk-[A-Za-z0-9_-]{16,}|AKIA[0-9A-Z]{16}|gh[pousr]_[A-Za-z0-9]{20,}|xox[baprs]-[A-Za-z0-9-]{10,})\b/g,
  },
  {
    type: 'IP',
    re: /\b(?:(?:25[0-5]|2[0-4]\d|1?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|1?\d?\d)\b/g,
  },
  {
    // Card: 13–19 digits, optionally grouped by single spaces/hyphens, that pass
    // the Luhn check. Runs BEFORE the phone detector: the phone pattern is the
    // greedier of the two and would otherwise claim any formatted 13–15 digit
    // card (e.g. "3782 822463 10005") and label it PHONE. Separators are only
    // allowed BETWEEN digits so the match never swallows the space or hyphen
    // that follows the number.
    type: 'CARD',
    re: /\b\d(?:[ -]?\d){12,18}\b/g,
    valid: (m) => { const d = m.replace(/\D/g, ''); return d.length >= 13 && d.length <= 19 && luhnValid(d); },
  },
  {
    // Phone: 7–15 digits that either carry formatting (a separator or a leading
    // +) or are a bare 10-digit run (a typed phone like 9320434444). Other bare
    // digit runs — an 11-digit page id, an order number — are NOT redacted.
    type: 'PHONE',
    re: /(?<![\w.])\+?\d[\d ().-]{6,}\d(?![\w])/g,
    valid: (m) => {
      const digits = m.replace(/\D/g, '');
      return digits.length >= 7 && digits.length <= 15
        && (/[ ().-]/.test(m) || m.trimStart().startsWith('+') || digits.length === 10);
    },
  },
];
|
|
111
|
+
|
|
112
|
+
// Redact `text`, recording placeholders in `vault`.
//
//   text    — input; null/undefined/'' are returned unchanged, anything else is
//             stringified.
//   vault   — conversation vault (mutated). A throwaway one is created when
//             omitted, which makes the result non-restorable by the caller.
//   options —
//     tier        'basic' (default) or 'full' / 'entities' (enables `entities`).
//     entities    [{ value, type }] known names/orgs; type defaults to 'PERSON'.
//     dictionary  user entries: { value, type } (exact string, matched
//                 case-insensitively on word boundaries) or
//                 { pattern, flags, type } (regex). Adding `alias`
//                 pseudonymizes instead of tokenizing.
//
// Returns the redacted string. Passes run in order: dictionary, entities (full
// tier only), deterministic detectors.
export function redactText(text, vault, {
  tier = 'basic',
  entities = [],
  dictionary = [],
} = {}) {
  if (text == null || text === '') return text;
  let out = String(text);
  const v = vault || createVault();

  const entityTier = tier === 'full' || tier === 'entities';

  // 1) User dictionary first — highest authority, user explicitly chose these.
  //    An entry with `alias` PSEUDONYMIZES: permanent substitution (the model and
  //    the user's transcript both see the alias, never reversed). Otherwise it
  //    REDACTS to a reversible [[TYPE_n]] placeholder restored in the user's view.
  for (const d of dictionary || []) {
    if (!d) continue;
    try {
      const re = d.pattern
        ? new RegExp(d.pattern, d.flags && /g/.test(d.flags) ? d.flags : `${d.flags || ''}g`)
        : (d.value ? new RegExp(`(?<![\\w])${escapeRegex(d.value)}(?![\\w])`, 'gi') : null);
      if (!re) continue;
      // A user pattern such as `x*` also matches the empty string at every
      // position; substituting there would wedge a token/alias between every
      // character. Zero-length matches are therefore left alone.
      if (d.alias != null && d.alias !== '') {
        out = out.replace(re, (m) => (m === '' ? m : d.alias)); // pseudonymize: model + reply see the alias…
        // …but record alias→original so LOCAL tool args (history/meeting search) map
        // back to the real value. Local lookups must hit real data; only the model is blinded.
        if (d.value) v.aliases.set(d.alias, d.value);
      } else {
        out = out.replace(re, (m) => (
          m === '' ? m : tokenFor(v, d.type || (d.pattern ? 'PII' : 'TERM'), d.pattern ? m : d.value)
        ));
      }
    } catch {
      /* a bad user regex must never break redaction */
    }
  }

  // 2) Known entities (full tier) — longest value first so "Alex Rivera" wins
  //    before a bare "Alex". Restores to the canonical entity value.
  if (entityTier) {
    const ents = [...(entities || [])].filter((e) => e && e.value)
      .sort((a, b) => String(b.value).length - String(a.value).length);
    for (const e of ents) {
      const re = new RegExp(`(?<![\\w])${escapeRegex(e.value)}(?![\\w])`, 'gi');
      out = out.replace(re, () => tokenFor(v, e.type || 'PERSON', e.value));
    }
  }

  // 3) Deterministic detectors (all tiers).
  for (const det of DETECTORS) {
    out = out.replace(det.re, (m) => (!det.valid || det.valid(m) ? tokenFor(v, det.type, m) : m));
  }
  return out;
}
|
|
167
|
+
|
|
168
|
+
// Put the original values back wherever a known placeholder appears.
// Placeholders the vault has never issued are left exactly as written.
// Returns `text` untouched when it is null/undefined or when no vault is given;
// otherwise returns a string.
export function restoreText(text, vault) {
  if (text == null || !vault) return text;
  const { byToken } = vault;
  return String(text).replace(TOKEN_RE, (placeholder) => {
    if (!byToken.has(placeholder)) return placeholder;
    return byToken.get(placeholder);
  });
}
|
|
173
|
+
|
|
174
|
+
// Restore for LOCAL use only — e.g. tool-call args that hit on-device history /
// meeting search. Undoes reversible tokens AND pseudonyms, so local lookups run on
// the real values. NOT used for the user-facing reply (pseudonyms stay there).
//
// Returns null/undefined input unchanged (previously this threw once the vault
// held any alias). Aliases are applied longest first, so a multi-word pseudonym
// ("Alex Rivera") is mapped back before a shorter one it contains ("Alex") can
// rewrite part of it. Matching is case-sensitive and on word boundaries.
export function restoreWithAliases(text, vault) {
  const restored = restoreText(text, vault);
  if (restored == null || !vault?.aliases?.size) return restored;

  const pairs = [...vault.aliases]
    .filter(([alias]) => alias)
    .sort(([a], [b]) => String(b).length - String(a).length);

  let out = String(restored);
  for (const [alias, real] of pairs) {
    out = out.replace(new RegExp(`(?<![\\w])${escapeRegex(alias)}(?![\\w])`, 'g'), () => real);
  }
  return out;
}
|
|
187
|
+
|
|
188
|
+
/**
 * True if the text still contains any redaction placeholder (useful for
 * streaming restore — buffer a tail when a token may be split across chunks).
 *
 * TOKEN_RE is shared module state. If it carries the `g` (or `y`) flag,
 * `test()` both starts from and advances `lastIndex`, so the cursor is reset
 * before the check (to always scan from the start) and again afterwards (so
 * this probe leaves no state behind for later `test`/`exec` calls on the same
 * regex).
 *
 * @param {*} text - Value to inspect; falsy values are treated as ''.
 * @returns {boolean} Whether at least one placeholder is present.
 */
export function hasToken(text) {
  TOKEN_RE.lastIndex = 0;
  const found = TOKEN_RE.test(String(text || ''));
  TOKEN_RE.lastIndex = 0;
  return found;
}
|
package/pipeline.js
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
// Pure turn-level orchestration shared by every ChatPanel surface (extension,
|
|
2
|
+
// gateway, bridge). It composes the deterministic engine (pii-redact.js) into the
|
|
3
|
+
// message pipeline and applies the tier / scope / dictionary gating.
|
|
4
|
+
//
|
|
5
|
+
// What lives HERE (portable): redactOutbound, redactToolResult/redactResult,
|
|
6
|
+
// makeStreamRestorer, restore/restoreDeep, effectiveTier + gating.
|
|
7
|
+
//
|
|
8
|
+
// What stays in the EXTENSION (host glue, NOT here): reading
|
|
9
|
+
// settings.ui.piiRedaction, the module-level Pro entitlement, and chrome storage.
|
|
10
|
+
// Those wrap these pure functions with host-specific config.
|
|
11
|
+
|
|
12
|
+
import { redactText, restoreText, restoreWithAliases } from './pii-redact.js';
|
|
13
|
+
|
|
14
|
+
/**
 * Whether redaction is switched on for the given configuration.
 *
 * @param {{ mode?: string }} [cfg] - Redaction config.
 * @returns {boolean} `true` only when `cfg.mode` is truthy and not `'off'`.
 */
export function redactionEnabled(cfg) {
  if (!cfg) {
    return false;
  }
  const { mode } = cfg;
  if (!mode) {
    return false;
  }
  return mode !== 'off';
}
|
|
17
|
+
|
|
18
|
+
/**
 * Resolve the tier that will actually be applied.
 *
 * The entity (name/org) tier is Pro; a Free caller asking for it silently
 * falls back to the deterministic-regex tier so the feature still does
 * something useful.
 *
 * @param {{ tier?: string }} [cfg] - Redaction config.
 * @param {*} isPro - Truthy when the caller holds the Pro entitlement.
 * @returns {'full'|'basic'} `'full'` only when `cfg.tier === 'full'` and
 *   `isPro` is truthy; `'basic'` in every other case.
 */
export function effectiveTier(cfg, isPro) {
  const wantsFull = cfg?.tier === 'full';
  if (wantsFull && isPro) {
    return 'full';
  }
  return 'basic';
}
|
|
24
|
+
|
|
25
|
+
// Free ceiling: deterministic SECRET redaction on CHAT only, with a small
// dictionary. Names/orgs (full tier), wider scope, an unlimited dictionary, and
// the model layer are Pro. Enforced here as defense-in-depth.

/** Maximum number of dictionary entries honoured for non-Pro callers. */
export const FREE_DICT_LIMIT = 3;

/**
 * Dictionary entries the caller is entitled to use.
 *
 * @param {{ dictionary?: Array }} [cfg] - Redaction config.
 * @param {*} isPro - Truthy when the caller holds the Pro entitlement.
 * @returns {Array} `[]` when `cfg.dictionary` is not an array; the array
 *   itself for Pro; otherwise a copy of its first FREE_DICT_LIMIT entries.
 */
export function gatedDictionary(cfg, isPro) {
  const entries = cfg?.dictionary;
  if (!Array.isArray(entries)) {
    return [];
  }
  if (isPro) {
    return entries;
  }
  return entries.slice(0, FREE_DICT_LIMIT);
}
|
|
34
|
+
|
|
35
|
+
/**
 * Scope flags the caller is entitled to.
 *
 * @param {{ scope?: object }} [cfg] - Redaction config.
 * @param {*} isPro - Truthy when the caller holds the Pro entitlement.
 * @returns {object} For Pro: `cfg.scope` as configured (or `{}` when unset).
 *   Otherwise a fresh object in which only `chat` can be on (it is on unless
 *   explicitly `false`) and `context`, `history`, `toolResults` are `false`.
 */
export function gatedScope(cfg, isPro) {
  const requested = cfg?.scope || {};
  if (!isPro) {
    const chatOn = requested.chat !== false;
    return { chat: chatOn, context: false, history: false, toolResults: false };
  }
  return requested;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Assemble the options object handed to redactText, with tier and dictionary
 * gating already applied.
 *
 * @param {object} [cfg] - Redaction config.
 * @param {*} isPro - Truthy when the caller holds the Pro entitlement.
 * @param {Array} [entities] - Known entities; a falsy value becomes `[]`.
 * @returns {{ tier: string, entities: Array, dictionary: Array }}
 */
export function redactOpts(cfg, isPro, entities) {
  const tier = effectiveTier(cfg, isPro);
  const knownEntities = entities || [];
  const dictionary = gatedDictionary(cfg, isPro);
  return { tier, entities: knownEntities, dictionary };
}
|
|
48
|
+
|
|
49
|
+
/**
 * Redact an outbound turn. Returns redacted COPIES — the message objects and
 * attachment objects that get changed are shallow-cloned, so the stored
 * conversation is never mutated.
 *
 * Gating, as applied below:
 *  - redaction disabled or no vault → `{ messages, system }` returned as given;
 *  - message `content` is redacted unless `scope.chat === false`;
 *  - text attachments are redacted unless their scope flag is `false`
 *    (`scope.history` for `kind === 'history-rag'`, `scope.context` otherwise);
 *    image attachments and attachments without text pass through unchanged;
 *  - a truthy `system` prompt is always redacted.
 *
 * Messages are processed in order (content, then attachments) and the system
 * prompt last; redactText receives the same vault for every call.
 *
 * @param {object} args
 * @param {Array<object>} [args.messages] - Conversation messages.
 * @param {string} [args.system] - System prompt.
 * @param {object} [args.vault] - Token vault passed through to redactText.
 * @param {object} [args.cfg] - Redaction config.
 * @param {boolean} [args.isPro=false] - Pro entitlement.
 * @param {Array} [args.entities=[]] - Known entities.
 * @returns {{ messages: Array<object>, system: * }}
 */
export function redactOutbound({ messages, system, vault, cfg, isPro = false, entities = [] }) {
  const active = redactionEnabled(cfg) && vault;
  if (!active) {
    return { messages, system };
  }

  const opts = redactOpts(cfg, isPro, entities);
  const scope = gatedScope(cfg, isPro);
  const scrub = (value) => redactText(value, vault, opts);

  // One attachment: skip images / empty text / out-of-scope kinds.
  const redactAttachment = (att) => {
    if (att.kind === 'image' || !att.text) {
      return att;
    }
    const gate = att.kind === 'history-rag' ? scope.history : scope.context;
    if (gate === false) {
      return att;
    }
    return { ...att, text: scrub(att.text) };
  };

  // One message: shallow clone, then overwrite only the redacted fields.
  const redactMessage = (msg) => {
    const clone = { ...msg };
    if (scope.chat !== false && msg.content) {
      clone.content = scrub(msg.content);
    }
    if (Array.isArray(msg.attachments)) {
      clone.attachments = msg.attachments.map((att) => redactAttachment(att));
    }
    return clone;
  };

  const redactedMessages = (messages || []).map((msg) => redactMessage(msg));
  const redactedSystem = system ? scrub(system) : system;
  return { messages: redactedMessages, system: redactedSystem };
}
|
|
72
|
+
|
|
73
|
+
/**
 * Redact a tool result string before it is handed on.
 *
 * The input is returned unchanged when redaction is disabled, no vault is
 * supplied, the gated scope's `toolResults` flag is falsy, or `text` is not a
 * string.
 *
 * @param {*} text - Tool result.
 * @param {object} [ctx]
 * @param {object} [ctx.vault] - Token vault passed through to redactText.
 * @param {object} [ctx.cfg] - Redaction config.
 * @param {boolean} [ctx.isPro=false] - Pro entitlement.
 * @param {Array} [ctx.entities=[]] - Known entities.
 * @returns {*} Redacted string, or `text` as given.
 */
export function redactToolResult(text, { vault, cfg, isPro = false, entities = [] } = {}) {
  const inScope = redactionEnabled(cfg) && vault && gatedScope(cfg, isPro).toolResults;
  if (!inScope || typeof text !== 'string') {
    return text;
  }
  const opts = redactOpts(cfg, isPro, entities);
  return redactText(text, vault, opts);
}
|
|
78
|
+
|
|
79
|
+
/**
 * Streaming-safe restorer. `push()` returns the text that is safe to display
 * now; `flush()` returns whatever is still buffered.
 *
 * Placeholders are delimited by `[[` … `]]` (the delimiters this function
 * scans for), and a chunk boundary can fall anywhere inside one. Text is held
 * back in two cases so a split placeholder is restored once it is complete:
 *  - the last `[[` in the buffer has no `]]` after it yet, or
 *  - the buffer ends with a lone `[`, which may be the first half of `[[`.
 *
 * Without a vault the restorer is a pass-through.
 *
 * @param {object} [vault] - Token vault passed through to restoreText.
 * @returns {{ push(chunk?: string): string, flush(): string }}
 */
export function makeStreamRestorer(vault) {
  let buf = '';
  return {
    /**
     * Feed the next streamed chunk.
     * @param {string} [chunk] - Falsy chunks are treated as ''.
     * @returns {string} Restored text that can be shown immediately.
     */
    push(chunk) {
      if (!vault) return chunk || '';
      buf += chunk || '';

      // Index from which the buffer must be withheld (default: nothing held).
      let holdFrom = buf.length;
      const open = buf.lastIndexOf('[[');
      if (open !== -1 && !buf.includes(']]', open)) {
        // An opened placeholder is still waiting for its closing brackets.
        holdFrom = open;
      } else if (buf.endsWith('[')) {
        // A trailing single '[' may pair with a '[' at the start of the next
        // chunk; emitting it now would leave that placeholder unrestored.
        holdFrom = buf.length - 1;
      }

      const safe = buf.slice(0, holdFrom);
      buf = buf.slice(holdFrom);
      return restoreText(safe, vault);
    },
    /**
     * Emit and clear whatever is still buffered (end of stream).
     * @returns {string}
     */
    flush() {
      const out = vault ? restoreText(buf, vault) : buf;
      buf = '';
      return out;
    },
  };
}
|
|
104
|
+
|
|
105
|
+
/**
 * Restore placeholders in `text`, tolerating a missing vault.
 *
 * @param {*} text - Text that may contain placeholders.
 * @param {object} [vault] - Token vault; when falsy `text` is returned as-is.
 * @returns {*} Result of restoreText, or `text` itself without a vault.
 */
export function restore(text, vault) {
  if (!vault) {
    return text;
  }
  return restoreText(text, vault);
}
|
|
108
|
+
|
|
109
|
+
/**
 * Deep-restore a value (tool-call args contain tokens; local tools must run on
 * the REAL values). Strings go through restoreWithAliases, which undoes
 * pseudonyms too — local lookups hit real data; only the model stays blinded.
 *
 * Arrays and objects are rebuilt recursively (objects from their own
 * enumerable string keys); every other value is returned as-is. The input is
 * never mutated.
 *
 * @param {*} value - Arbitrary JSON-like value.
 * @param {object} [vault] - Token vault; when falsy `value` is returned as-is.
 * @returns {*} A restored copy of `value`.
 */
export function restoreDeep(value, vault) {
  if (!vault) return value;
  if (typeof value === 'string') return restoreWithAliases(value, vault);
  if (Array.isArray(value)) return value.map((item) => restoreDeep(item, vault));
  if (value && typeof value === 'object') {
    // Object.fromEntries defines own data properties. Plain `out[key] = …`
    // would run the `__proto__` setter for a key literally named "__proto__"
    // (JSON.parse produces such own keys), replacing the copy's prototype with
    // caller-controlled data and dropping the key itself.
    return Object.fromEntries(
      Object.entries(value).map(([key, inner]) => [key, restoreDeep(inner, vault)]),
    );
  }
  return value;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Redact a tool result that is either a plain string or an object carrying a
 * string `text` field. Any other shape is returned untouched.
 *
 * @param {*} result - Tool result.
 * @param {object} [ctx] - Context forwarded unchanged to redactToolResult.
 * @returns {*} Redacted string, a shallow copy of the object with `text`
 *   redacted, or `result` itself.
 */
export function redactResult(result, ctx) {
  const kind = typeof result;
  if (kind === 'string') {
    return redactToolResult(result, ctx);
  }

  const carriesText = Boolean(result) && kind === 'object' && typeof result.text === 'string';
  if (!carriesText) {
    return result;
  }
  return { ...result, text: redactToolResult(result.text, ctx) };
}
|