@structured-world/structured-public-domains 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,146 @@
1
+ # structured-public-domains
2
+
3
+ Compact Public Suffix List (PSL) for Rust and JavaScript/TypeScript.
4
+
5
+ [![CI](https://github.com/structured-world/structured-public-domains/actions/workflows/ci.yml/badge.svg)](https://github.com/structured-world/structured-public-domains/actions/workflows/ci.yml)
6
+ [![Crates.io](https://img.shields.io/crates/v/structured-public-domains.svg)](https://crates.io/crates/structured-public-domains)
7
+ [![npm](https://img.shields.io/npm/v/@structured-world/structured-public-domains.svg)](https://www.npmjs.com/package/@structured-world/structured-public-domains)
8
+ [![docs.rs](https://docs.rs/structured-public-domains/badge.svg)](https://docs.rs/structured-public-domains)
9
+ [![License: Apache-2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](LICENSE)
10
+
11
+ - **Zero** runtime dependencies
12
+ - **~108KB** embedded data (compact binary trie)
13
+ - **~2.4M lookups/sec** on a single core (~420 ns per lookup)
14
+ - **O(depth * log k)** trie traversal with per-node binary search (typically 2-3 steps)
15
+ - Wildcard (`*.jp`) and exception (`!metro.tokyo.jp`) rules
16
+ - Based on the official Public Suffix List (ICANN and private sections)
17
+ - Checked daily against [publicsuffix.org](https://publicsuffix.org/)
18
+
19
+ **Terminology:** A *public suffix* (e.g., `com`, `co.uk`) is the part of a domain under which users can register names. The *registrable domain* (eTLD+1) is one label above the suffix (e.g., `example.co.uk`).
20
+
21
+ ## Usage (Rust)
22
+
23
+ ```rust
24
+ use structured_public_domains::{lookup, registrable_domain, is_known_suffix};
25
+
26
+ let info = lookup("www.example.co.uk").unwrap();
27
+ assert_eq!(info.suffix(), "co.uk");
28
+ assert_eq!(info.registrable_domain(), Some("example.co.uk"));
29
+ assert!(info.is_known());
30
+
31
+ // Helpers
32
+ assert_eq!(registrable_domain("sub.example.com"), Some("example.com".to_string()));
33
+ assert!(is_known_suffix("example.com")); // true: its suffix, "com", is a PSL entry
34
+ ```
35
+
36
+ ## Usage (JavaScript / TypeScript)
37
+
38
+ The same PSL trie ships as a native TypeScript npm package — no WebAssembly, no
39
+ runtime dependencies. The ~108KB binary trie is embedded and decoded lazily on
40
+ first call, so every function is **synchronous** with no `init()`: it drops
41
+ straight into Node, browsers, bundlers, and downstream libraries. Ships both
42
+ ESM and CommonJS with type declarations for each.
43
+
44
+ ```sh
45
+ npm install @structured-world/structured-public-domains
46
+ ```
47
+
48
+ ```typescript
49
+ // ESM
50
+ import { lookup, registrableDomain, isKnownSuffix } from '@structured-world/structured-public-domains';
51
+
52
+ const info = lookup('www.example.co.uk');
53
+ // info.suffix → "co.uk"
54
+ // info.registrableDomain → "example.co.uk"
55
+ // info.known → true
56
+
57
+ registrableDomain('sub.example.com'); // "example.com"
58
+ isKnownSuffix('example.com'); // true (its suffix, "com", is a PSL entry)
59
+ ```
60
+
61
+ ```javascript
62
+ // CommonJS (e.g. default NestJS)
63
+ const { lookup, registrableDomain, isKnownSuffix } = require('@structured-world/structured-public-domains');
64
+ ```
65
+
66
+ ### Raw trie data
67
+
68
+ The embedded binary trie is exposed for consumers that want to walk it
69
+ themselves (the format matches the Rust crate's `src/psl.bin`):
70
+
71
+ ```typescript
72
+ import { pslData } from '@structured-world/structured-public-domains';
73
+
74
+ const bytes = pslData(); // Uint8Array — a defensive copy of the trie blob
75
+ ```
76
+
77
+ The JS lookup is verified byte-for-byte against the Rust implementation over the
78
+ entire PSL on every CI run, so both languages return identical results.
79
+
80
+ ### Tiny build (runtime-fetched, no embedded data)
81
+
82
+ For consumers who want always-fresh PSL **without bumping the package version**,
83
+ the `/tiny` entry ships *without* the embedded blob. It fetches the prebuilt
84
+ binary trie at runtime and caches it locally (Node: temp file with a TTL;
85
+ browser: CacheStorage). After the first `await load()`, the lookup API is
86
+ identical and synchronous.
87
+
88
+ ```typescript
89
+ import { load, registrableDomain } from '@structured-world/structured-public-domains/tiny';
90
+
91
+ await load(); // fetch + cache once (default: jsDelivr CDN)
92
+ registrableDomain('sub.example.co.uk'); // "example.co.uk"
93
+
94
+ // Options: custom source, TTL, cache dir, or force refresh.
95
+ await load({ url: 'https://psl.example.com/psl.bin', cacheTtlMs: 3_600_000, force: true });
96
+ ```
97
+
98
+ The default source is the same `psl.bin` served from this package's jsDelivr CDN,
99
+ pinned to the installed `major.minor` range — so it always tracks the latest
100
+ PSL-data patch release (same trie format) but never a future format-breaking
101
+ version the bundled parser can't read. Results are identical to the embedded
102
+ build. Use the full `.` entry when you want zero network and instant startup; use
103
+ `/tiny` when a smaller install and
104
+ always-current data matter more.
105
+
106
+ ## Performance
107
+
108
+ Benchmarks on Apple M-series (criterion, `cargo bench`):
109
+
110
+ | Benchmark | Time | Throughput |
111
+ |-----------|------|-----------|
112
+ | Simple (`example.com`) | ~420 ns | ~2.4M/s |
113
+ | Nested (`www.example.co.uk`) | ~425 ns | ~2.4M/s |
114
+ | Deep subdomain (`a.b.c.d.example.com`) | ~500 ns | ~2.0M/s |
115
+ | Bare TLD (`com`) | ~195 ns | ~5.1M/s |
116
+ | Private domain (`mysite.github.io`) | ~450 ns | ~2.2M/s |
117
+ | Long chain (`very.deep...co.uk`) | ~500 ns | ~2.0M/s |
118
+
119
+ **Runtime memory:** The PSL trie is parsed lazily on first call (`OnceLock`), then cached for the lifetime of the process. Runtime footprint is ~530 KB (sorted `Vec` children with binary search lookup). The ~108KB binary blob is embedded in the binary at compile time.
120
+
121
+ ## Why not `psl`?
122
+
123
+ | | `psl` | `structured-public-domains` |
124
+ |---|---|---|
125
+ | Embedded data | ~876KB (codegen match tree) | **~108KB** (compact binary trie) |
126
+ | Source size | 2.4MB codegen | 300 lines + 108KB blob |
127
+ | Runtime deps | None | **None** |
128
+ | Runtime memory | N/A (static) | **~530KB** |
129
+ | Lookup | O(depth) match tree | O(depth * log k) trie walk |
130
+ | Auto-update | New crate version | Daily GitHub Actions check |
131
+
132
+ Both crates have comparable lookup speed and zero runtime dependencies. `structured-public-domains` has ~8x smaller embedded data and auto-updates daily via GitHub Actions with domain-level changelogs.
133
+
134
+ ## Support the Project
135
+
136
+ <div align="center">
137
+
138
+ ![USDT TRC-20 Donation QR Code](assets/usdt-qr.svg)
139
+
140
+ USDT (TRC-20): `TFDsezHa1cBkoeZT5q2T49Wp66K8t2DmdA`
141
+
142
+ </div>
143
+
144
+ ## License
145
+
146
+ Apache License 2.0
@@ -0,0 +1,121 @@
1
+ // src/trie.ts
2
+ var utf8 = new TextDecoder("utf-8", { fatal: true }); // fatal: malformed UTF-8 in a label throws instead of decoding to U+FFFD
3
/**
 * Decode a complete binary PSL blob into its root trie node.
 *
 * @param data Byte array holding exactly one serialized root node.
 * @returns The root node produced by `parseNode`.
 * @throws {Error} If bytes remain after the root node has been consumed.
 */
function parseTrie(data) {
  const state = { pos: 0 };
  const rootNode = parseNode(data, state);
  // The blob must be consumed exactly; leftover bytes mean it is malformed.
  if (state.pos === data.length) {
    return rootNode;
  }
  throw new Error("PSL data: trailing bytes after root node");
}
11
/**
 * Read one trie node starting at `cursor.pos`, including all of its
 * descendants, and leave `cursor.pos` just past the last byte consumed.
 *
 * Layout as read here: one flag byte (bit 0 marks a suffix boundary, every
 * other bit must be zero), a 16-bit little-endian child count, then for each
 * child a one-byte label length, the label bytes and the child's own node.
 *
 * @param data Byte array containing the serialized trie.
 * @param cursor Mutable `{ pos }` read position shared across the recursion.
 * @returns `{ suffixBoundary, labels, children }` with `labels[i]` naming `children[i]`.
 * @throws {Error} On reserved flag bits, an impossible child count, an empty
 *   or truncated label, or labels that are not strictly ascending.
 */
function parseNode(data, cursor) {
  const flags = byteAt(data, cursor.pos++);
  if ((flags & ~1) !== 0) {
    throw new Error("PSL data: reserved flag bits set");
  }

  const countLow = byteAt(data, cursor.pos++);
  const countHigh = byteAt(data, cursor.pos++);
  const childCount = (countHigh << 8) | countLow;

  // Smallest possible child: 1 length byte + 1 label byte + 3 header bytes.
  const MIN_CHILD_ENCODED_LEN = 5;
  const bytesLeft = data.length - cursor.pos;
  if (bytesLeft < 0 || childCount > Math.floor(bytesLeft / MIN_CHILD_ENCODED_LEN)) {
    throw new Error("PSL data: num_children exceeds remaining bytes");
  }

  const labels = [];
  const children = [];
  let previous;
  for (let n = 0; n < childCount; n++) {
    const length = byteAt(data, cursor.pos++);
    if (length === 0) {
      throw new Error("PSL data: empty label");
    }
    const stop = cursor.pos + length;
    if (stop > data.length) {
      throw new Error("PSL data: label runs past end of data");
    }
    const name = decodeLabel(data, cursor.pos, stop);
    cursor.pos = stop;
    // Strict ordering is what lets lookups binary-search `labels`.
    if (previous !== undefined && name <= previous) {
      throw new Error("PSL data: children not strictly sorted");
    }
    previous = name;
    labels.push(name);
    children.push(parseNode(data, cursor));
  }

  return { suffixBoundary: (flags & 1) === 1, labels, children };
}
47
/**
 * Bounds-checked read of a single byte.
 *
 * @param data Byte array to read from.
 * @param i Index of the byte to read.
 * @returns The value stored at `data[i]`.
 * @throws {Error} If `i` is at or beyond `data.length`.
 */
function byteAt(data, i) {
  const pastEnd = i >= data.length;
  if (pastEnd) {
    throw new Error("PSL data: unexpected end of data");
  }
  return data[i];
}
53
/**
 * Turn the bytes `data[start..end)` into a label string.
 *
 * Pure-ASCII labels are assembled directly from their byte values; as soon as
 * a byte >= 128 is seen the whole range is handed to the module-level `utf8`
 * decoder instead.
 *
 * @param data Byte array containing the label.
 * @param start Index of the first label byte (inclusive).
 * @param end Index one past the last label byte (exclusive).
 * @returns The decoded label.
 */
function decodeLabel(data, start, end) {
  const pieces = [];
  for (let at = start; at < end; at++) {
    const byte = data[at];
    if (byte >= 128) {
      return utf8.decode(data.subarray(start, end));
    }
    pieces.push(String.fromCharCode(byte));
  }
  return pieces.join("");
}
61
/**
 * Binary-search a node's sorted `labels` array for an exact match.
 *
 * @param node Trie node whose `labels` array is sorted ascending.
 * @param label Label to find.
 * @returns The index of `label` in `node.labels`, or -1 when it is absent.
 */
function indexOfChild(node, label) {
  const names = node.labels;
  let first = 0;
  let last = names.length - 1;
  while (first <= last) {
    const middle = (first + last) >>> 1;
    const candidate = names[middle];
    if (candidate === label) {
      return middle;
    }
    if (candidate < label) {
      first = middle + 1;
    } else {
      last = middle - 1;
    }
  }
  return -1;
}
73
/**
 * Fetch the child node stored under `label`.
 *
 * @param node Parent trie node.
 * @param label Label of the wanted child.
 * @returns The matching entry of `node.children`, or `undefined` if there is none.
 */
function childOf(node, label) {
  const position = indexOfChild(node, label);
  if (position < 0) {
    return undefined;
  }
  return node.children[position];
}
77
/**
 * Report whether `node` has a child stored under `label`.
 *
 * @param node Parent trie node.
 * @param label Label to test.
 * @returns `true` when `indexOfChild` finds the label, otherwise `false`.
 */
function hasChild(node, label) {
  const position = indexOfChild(node, label);
  return position >= 0;
}
80
/**
 * Resolve a domain name against a parsed PSL trie.
 *
 * The input is trimmed and one trailing dot is removed. Labels are matched
 * from the rightmost label towards the left, lowercased. A `*` child makes the
 * current label part of the suffix unless a `!label` child exists, in which
 * case the suffix ends one label earlier. If nothing matches, the rightmost
 * label is used as the suffix and `known` is `false`.
 *
 * @param root Root node returned by `parseTrie`.
 * @param domain Domain name to look up.
 * @returns `{ suffix, registrableDomain, known }`, where `registrableDomain`
 *   is `undefined` when the name has no label beyond the suffix; or
 *   `undefined` for empty input or a name containing an empty label, a `*`
 *   label, or a label starting with `!`.
 */
function lookupTrie(root, domain) {
  let name = domain.trim();
  if (name.endsWith(".")) {
    name = name.slice(0, -1);
  }
  if (name === "") return undefined;

  // Rightmost label first, matching the trie's root-to-leaf direction.
  const parts = name.split(".").reverse();
  const rejected = parts.some((part) => part === "" || part === "*" || part.startsWith("!"));
  if (rejected) return undefined;

  const lowered = parts.map((part) => part.toLowerCase());
  let current = root;
  let suffixLength = 0;
  let known = false;

  for (let depth = 0; depth < lowered.length; depth++) {
    const part = lowered[depth];
    if (hasChild(current, "*")) {
      // An exception rule cancels the wildcard for this particular label.
      suffixLength = hasChild(current, "!" + part) ? depth : depth + 1;
      known = true;
    }
    const next = childOf(current, part);
    if (next === undefined) break;
    if (next.suffixBoundary) {
      suffixLength = depth + 1;
      known = true;
    }
    current = next;
  }

  if (suffixLength === 0) {
    // No rule applied: fall back to treating the last label as the suffix.
    suffixLength = 1;
    known = false;
  }

  const suffix = lowered.slice(0, suffixLength).reverse().join(".");
  const etldPlusOne = lowered.length > suffixLength ? lowered[suffixLength] + "." + suffix : undefined;
  return { suffix, registrableDomain: etldPlusOne, known };
}
120
+
121
+ export { lookupTrie, parseTrie };
package/dist/index.cjs ADDED
@@ -0,0 +1,161 @@
1
+ 'use strict';
2
+
3
+ var pslData_cjs = require('./psl-data.cjs');
4
+
5
+ // src/index.ts
6
+
7
+ // src/trie.ts
8
+ var utf8 = new TextDecoder("utf-8", { fatal: true }); // fatal: malformed UTF-8 in a label throws instead of decoding to U+FFFD
9
/**
 * Decode a complete binary PSL blob into its root trie node.
 *
 * @param data Byte array holding exactly one serialized root node.
 * @returns The root node produced by `parseNode`.
 * @throws {Error} If bytes remain after the root node has been consumed.
 */
function parseTrie(data) {
  const state = { pos: 0 };
  const rootNode = parseNode(data, state);
  // The blob must be consumed exactly; leftover bytes mean it is malformed.
  if (state.pos === data.length) {
    return rootNode;
  }
  throw new Error("PSL data: trailing bytes after root node");
}
17
/**
 * Read one trie node starting at `cursor.pos`, including all of its
 * descendants, and leave `cursor.pos` just past the last byte consumed.
 *
 * Layout as read here: one flag byte (bit 0 marks a suffix boundary, every
 * other bit must be zero), a 16-bit little-endian child count, then for each
 * child a one-byte label length, the label bytes and the child's own node.
 *
 * @param data Byte array containing the serialized trie.
 * @param cursor Mutable `{ pos }` read position shared across the recursion.
 * @returns `{ suffixBoundary, labels, children }` with `labels[i]` naming `children[i]`.
 * @throws {Error} On reserved flag bits, an impossible child count, an empty
 *   or truncated label, or labels that are not strictly ascending.
 */
function parseNode(data, cursor) {
  const flags = byteAt(data, cursor.pos++);
  if ((flags & ~1) !== 0) {
    throw new Error("PSL data: reserved flag bits set");
  }

  const countLow = byteAt(data, cursor.pos++);
  const countHigh = byteAt(data, cursor.pos++);
  const childCount = (countHigh << 8) | countLow;

  // Smallest possible child: 1 length byte + 1 label byte + 3 header bytes.
  const MIN_CHILD_ENCODED_LEN = 5;
  const bytesLeft = data.length - cursor.pos;
  if (bytesLeft < 0 || childCount > Math.floor(bytesLeft / MIN_CHILD_ENCODED_LEN)) {
    throw new Error("PSL data: num_children exceeds remaining bytes");
  }

  const labels = [];
  const children = [];
  let previous;
  for (let n = 0; n < childCount; n++) {
    const length = byteAt(data, cursor.pos++);
    if (length === 0) {
      throw new Error("PSL data: empty label");
    }
    const stop = cursor.pos + length;
    if (stop > data.length) {
      throw new Error("PSL data: label runs past end of data");
    }
    const name = decodeLabel(data, cursor.pos, stop);
    cursor.pos = stop;
    // Strict ordering is what lets lookups binary-search `labels`.
    if (previous !== undefined && name <= previous) {
      throw new Error("PSL data: children not strictly sorted");
    }
    previous = name;
    labels.push(name);
    children.push(parseNode(data, cursor));
  }

  return { suffixBoundary: (flags & 1) === 1, labels, children };
}
53
/**
 * Bounds-checked read of a single byte.
 *
 * @param data Byte array to read from.
 * @param i Index of the byte to read.
 * @returns The value stored at `data[i]`.
 * @throws {Error} If `i` is at or beyond `data.length`.
 */
function byteAt(data, i) {
  const pastEnd = i >= data.length;
  if (pastEnd) {
    throw new Error("PSL data: unexpected end of data");
  }
  return data[i];
}
59
/**
 * Turn the bytes `data[start..end)` into a label string.
 *
 * Pure-ASCII labels are assembled directly from their byte values; as soon as
 * a byte >= 128 is seen the whole range is handed to the module-level `utf8`
 * decoder instead.
 *
 * @param data Byte array containing the label.
 * @param start Index of the first label byte (inclusive).
 * @param end Index one past the last label byte (exclusive).
 * @returns The decoded label.
 */
function decodeLabel(data, start, end) {
  const pieces = [];
  for (let at = start; at < end; at++) {
    const byte = data[at];
    if (byte >= 128) {
      return utf8.decode(data.subarray(start, end));
    }
    pieces.push(String.fromCharCode(byte));
  }
  return pieces.join("");
}
67
/**
 * Binary-search a node's sorted `labels` array for an exact match.
 *
 * @param node Trie node whose `labels` array is sorted ascending.
 * @param label Label to find.
 * @returns The index of `label` in `node.labels`, or -1 when it is absent.
 */
function indexOfChild(node, label) {
  const names = node.labels;
  let first = 0;
  let last = names.length - 1;
  while (first <= last) {
    const middle = (first + last) >>> 1;
    const candidate = names[middle];
    if (candidate === label) {
      return middle;
    }
    if (candidate < label) {
      first = middle + 1;
    } else {
      last = middle - 1;
    }
  }
  return -1;
}
79
/**
 * Fetch the child node stored under `label`.
 *
 * @param node Parent trie node.
 * @param label Label of the wanted child.
 * @returns The matching entry of `node.children`, or `undefined` if there is none.
 */
function childOf(node, label) {
  const position = indexOfChild(node, label);
  if (position < 0) {
    return undefined;
  }
  return node.children[position];
}
83
/**
 * Report whether `node` has a child stored under `label`.
 *
 * @param node Parent trie node.
 * @param label Label to test.
 * @returns `true` when `indexOfChild` finds the label, otherwise `false`.
 */
function hasChild(node, label) {
  const position = indexOfChild(node, label);
  return position >= 0;
}
86
/**
 * Resolve a domain name against a parsed PSL trie.
 *
 * The input is trimmed and one trailing dot is removed. Labels are matched
 * from the rightmost label towards the left, lowercased. A `*` child makes the
 * current label part of the suffix unless a `!label` child exists, in which
 * case the suffix ends one label earlier. If nothing matches, the rightmost
 * label is used as the suffix and `known` is `false`.
 *
 * @param root Root node returned by `parseTrie`.
 * @param domain Domain name to look up.
 * @returns `{ suffix, registrableDomain, known }`, where `registrableDomain`
 *   is `undefined` when the name has no label beyond the suffix; or
 *   `undefined` for empty input or a name containing an empty label, a `*`
 *   label, or a label starting with `!`.
 */
function lookupTrie(root, domain) {
  let name = domain.trim();
  if (name.endsWith(".")) {
    name = name.slice(0, -1);
  }
  if (name === "") return undefined;

  // Rightmost label first, matching the trie's root-to-leaf direction.
  const parts = name.split(".").reverse();
  const rejected = parts.some((part) => part === "" || part === "*" || part.startsWith("!"));
  if (rejected) return undefined;

  const lowered = parts.map((part) => part.toLowerCase());
  let current = root;
  let suffixLength = 0;
  let known = false;

  for (let depth = 0; depth < lowered.length; depth++) {
    const part = lowered[depth];
    if (hasChild(current, "*")) {
      // An exception rule cancels the wildcard for this particular label.
      suffixLength = hasChild(current, "!" + part) ? depth : depth + 1;
      known = true;
    }
    const next = childOf(current, part);
    if (next === undefined) break;
    if (next.suffixBoundary) {
      suffixLength = depth + 1;
      known = true;
    }
    current = next;
  }

  if (suffixLength === 0) {
    // No rule applied: fall back to treating the last label as the suffix.
    suffixLength = 1;
    known = false;
  }

  const suffix = lowered.slice(0, suffixLength).reverse().join(".");
  const etldPlusOne = lowered.length > suffixLength ? lowered[suffixLength] + "." + suffix : undefined;
  return { suffix, registrableDomain: etldPlusOne, known };
}
126
+
127
+ // src/index.ts
128
+ var cachedBytes; // decoded PSL blob; assigned by bytes() on its first call
129
+ var cachedTrie; // parsed trie root; assigned by trie() on its first call
130
/**
 * Decode a base64 string into bytes.
 *
 * Uses `Buffer` when that global exists and falls back to `atob` otherwise.
 *
 * @param b64 Base64-encoded text.
 * @returns A `Uint8Array` holding the decoded bytes.
 */
function decodeBase64(b64) {
  if (typeof Buffer === "undefined") {
    // atob yields one character per byte, so each char code is a byte value.
    const raw = atob(b64);
    const decoded = new Uint8Array(raw.length);
    let at = 0;
    while (at < raw.length) {
      decoded[at] = raw.charCodeAt(at);
      at += 1;
    }
    return decoded;
  }
  return new Uint8Array(Buffer.from(b64, "base64"));
}
139
/**
 * Return the decoded PSL blob, decoding the embedded base64 on first use and
 * reusing the cached array afterwards.
 *
 * @returns The shared cached byte array (not a copy).
 */
function bytes() {
  if (cachedBytes == null) {
    cachedBytes = decodeBase64(pslData_cjs.PSL_BASE64);
  }
  return cachedBytes;
}
142
/**
 * Return the parsed trie root, parsing the blob on first use and reusing the
 * cached root afterwards.
 *
 * @returns The cached root node produced by `parseTrie`.
 */
function trie() {
  if (cachedTrie == null) {
    cachedTrie = parseTrie(bytes());
  }
  return cachedTrie;
}
145
/**
 * Look up a domain in the embedded Public Suffix List.
 *
 * @param domain Domain name to resolve.
 * @returns Whatever `lookupTrie` returns for the cached trie: a result object,
 *   or `undefined` for input it rejects.
 */
function lookup(domain) {
  const root = trie();
  return lookupTrie(root, domain);
}
148
/**
 * Extract the registrable domain (eTLD+1) of a domain name.
 *
 * @param domain Domain name to resolve.
 * @returns The `registrableDomain` field of the lookup result, or `undefined`
 *   when the lookup itself yields nothing.
 */
function registrableDomain(domain) {
  const info = lookup(domain);
  if (info == null) {
    return undefined;
  }
  return info.registrableDomain;
}
151
/**
 * Report whether the suffix found for a domain is an explicit PSL entry.
 *
 * @param domain Domain name to resolve.
 * @returns The `known` field of the lookup result; `false` when the lookup
 *   yields nothing or the field is missing.
 */
function isKnownSuffix(domain) {
  const info = lookup(domain);
  if (info == null) {
    return false;
  }
  return info.known ?? false;
}
154
/**
 * Expose the raw PSL trie bytes.
 *
 * @returns A fresh copy of the cached blob, so callers cannot modify the
 *   array that later lookups depend on.
 */
function pslData() {
  const shared = bytes();
  return shared.slice();
}
157
+
158
+ exports.isKnownSuffix = isKnownSuffix;
159
+ exports.lookup = lookup;
160
+ exports.pslData = pslData;
161
+ exports.registrableDomain = registrableDomain;
@@ -0,0 +1,36 @@
1
+ import { D as DomainInfo } from './trie-jGeN4GI6.cjs';
2
+
3
+ /**
4
+ * Look up a domain in the Public Suffix List.
5
+ *
6
+ * Returns `undefined` for empty or invalid input (empty labels, or the PSL
7
+ * sentinel labels `*` / `!prefix`). Surrounding whitespace and one trailing dot are ignored; returned names are lowercased.
8
+ *
9
+ * @example
10
+ * ```ts
11
+ * const info = lookup("www.example.co.uk");
12
+ * // info.suffix → "co.uk"
13
+ * // info.registrableDomain → "example.co.uk"
14
+ * // info.known → true
15
+ * ```
16
+ */
17
+ declare function lookup(domain: string): DomainInfo | undefined;
18
+ /**
19
+ * Extract the registrable domain (eTLD+1), lowercased.
20
+ *
21
+ * Returns `undefined` if the domain is itself a public suffix, or the input is
22
+ * invalid (the same inputs for which `lookup` returns `undefined`).
23
+ */
24
+ declare function registrableDomain(domain: string): string | undefined;
25
+ /** Check whether a domain's suffix is a known (explicit) entry in the PSL. Returns `false` for invalid input and when only the fallback last-label suffix applies. */
26
+ declare function isKnownSuffix(domain: string): boolean;
27
+ /**
28
+ * The raw compact binary PSL trie (DFS preorder) embedded in this package.
29
+ *
30
+ * Every call returns a fresh defensive copy so callers can hand the blob to their own parser
31
+ * without risking the cached singleton. The format matches the Rust crate's
32
+ * `src/psl.bin`; see `scripts/build-psl.py` for the layout.
33
+ */
34
+ declare function pslData(): Uint8Array;
35
+
36
+ export { DomainInfo, isKnownSuffix, lookup, pslData, registrableDomain };
@@ -0,0 +1,36 @@
1
+ import { D as DomainInfo } from './trie-jGeN4GI6.js';
2
+
3
+ /**
4
+ * Look up a domain in the Public Suffix List.
5
+ *
6
+ * Returns `undefined` for empty or invalid input (empty labels, or the PSL
7
+ * sentinel labels `*` / `!prefix`). Surrounding whitespace and one trailing dot are ignored; returned names are lowercased.
8
+ *
9
+ * @example
10
+ * ```ts
11
+ * const info = lookup("www.example.co.uk");
12
+ * // info.suffix → "co.uk"
13
+ * // info.registrableDomain → "example.co.uk"
14
+ * // info.known → true
15
+ * ```
16
+ */
17
+ declare function lookup(domain: string): DomainInfo | undefined;
18
+ /**
19
+ * Extract the registrable domain (eTLD+1), lowercased.
20
+ *
21
+ * Returns `undefined` if the domain is itself a public suffix, or the input is
22
+ * invalid (the same inputs for which `lookup` returns `undefined`).
23
+ */
24
+ declare function registrableDomain(domain: string): string | undefined;
25
+ /** Check whether a domain's suffix is a known (explicit) entry in the PSL. Returns `false` for invalid input and when only the fallback last-label suffix applies. */
26
+ declare function isKnownSuffix(domain: string): boolean;
27
+ /**
28
+ * The raw compact binary PSL trie (DFS preorder) embedded in this package.
29
+ *
30
+ * Every call returns a fresh defensive copy so callers can hand the blob to their own parser
31
+ * without risking the cached singleton. The format matches the Rust crate's
32
+ * `src/psl.bin`; see `scripts/build-psl.py` for the layout.
33
+ */
34
+ declare function pslData(): Uint8Array;
35
+
36
+ export { DomainInfo, isKnownSuffix, lookup, pslData, registrableDomain };
package/dist/index.js ADDED
@@ -0,0 +1,34 @@
1
+ import { lookupTrie, parseTrie } from './chunk-EON4VJGA.js';
2
+ import { PSL_BASE64 } from './psl-data.cjs';
3
+
4
+ var cachedBytes; // decoded PSL blob; assigned by bytes() on its first call
5
+ var cachedTrie; // parsed trie root; assigned by trie() on its first call
6
/**
 * Decode a base64 string into bytes.
 *
 * Uses `Buffer` when that global exists and falls back to `atob` otherwise.
 *
 * @param b64 Base64-encoded text.
 * @returns A `Uint8Array` holding the decoded bytes.
 */
function decodeBase64(b64) {
  if (typeof Buffer === "undefined") {
    // atob yields one character per byte, so each char code is a byte value.
    const raw = atob(b64);
    const decoded = new Uint8Array(raw.length);
    let at = 0;
    while (at < raw.length) {
      decoded[at] = raw.charCodeAt(at);
      at += 1;
    }
    return decoded;
  }
  return new Uint8Array(Buffer.from(b64, "base64"));
}
15
/**
 * Return the decoded PSL blob, decoding the embedded base64 on first use and
 * reusing the cached array afterwards.
 *
 * @returns The shared cached byte array (not a copy).
 */
function bytes() {
  if (cachedBytes == null) {
    cachedBytes = decodeBase64(PSL_BASE64);
  }
  return cachedBytes;
}
18
/**
 * Return the parsed trie root, parsing the blob on first use and reusing the
 * cached root afterwards.
 *
 * @returns The cached root node produced by `parseTrie`.
 */
function trie() {
  if (cachedTrie == null) {
    cachedTrie = parseTrie(bytes());
  }
  return cachedTrie;
}
21
/**
 * Look up a domain in the embedded Public Suffix List.
 *
 * @param domain Domain name to resolve.
 * @returns Whatever `lookupTrie` returns for the cached trie: a result object,
 *   or `undefined` for input it rejects.
 */
function lookup(domain) {
  const root = trie();
  return lookupTrie(root, domain);
}
24
/**
 * Extract the registrable domain (eTLD+1) of a domain name.
 *
 * @param domain Domain name to resolve.
 * @returns The `registrableDomain` field of the lookup result, or `undefined`
 *   when the lookup itself yields nothing.
 */
function registrableDomain(domain) {
  const info = lookup(domain);
  if (info == null) {
    return undefined;
  }
  return info.registrableDomain;
}
27
/**
 * Report whether the suffix found for a domain is an explicit PSL entry.
 *
 * @param domain Domain name to resolve.
 * @returns The `known` field of the lookup result; `false` when the lookup
 *   yields nothing or the field is missing.
 */
function isKnownSuffix(domain) {
  const info = lookup(domain);
  if (info == null) {
    return false;
  }
  return info.known ?? false;
}
30
/**
 * Expose the raw PSL trie bytes.
 *
 * @returns A fresh copy of the cached blob, so callers cannot modify the
 *   array that later lookups depend on.
 */
function pslData() {
  const shared = bytes();
  return shared.slice();
}
33
+
34
+ export { isKnownSuffix, lookup, pslData, registrableDomain };