@absolutejs/dataset-gleif 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +30 -0
- package/dist/snapshot.js +132 -0
- package/package.json +5 -2
package/README.md
CHANGED
|
@@ -30,5 +30,28 @@ B.V."`. Treat a hit as *"a legal entity by this name"*, not a certainty. It also
|
|
|
30
30
|
has no industry, headcount, or contacts — pair it with `@absolutejs/discover`
|
|
31
31
|
(people) and `@absolutejs/enrich` (emails). Defaults to ACTIVE entities only.
|
|
32
32
|
|
|
33
|
+
## Offline snapshot — no rate limits, no network
|
|
34
|
+
|
|
35
|
+
For high-volume or offline use, build a local SQLite index from GLEIF's full
|
|
36
|
+
**Golden Copy** dump (~2.7M entities) and point the adapter at it:
|
|
37
|
+
|
|
38
|
+
1. **Download** the Golden Copy (LEI2, CSV or `.zip`) from
|
|
39
|
+
<https://goldencopy.gleif.org> (it's CC0). Their bulk host WAF-blocks many
|
|
40
|
+
data-center IPs, so fetch it from a browser / allowed network — this tool does
|
|
41
|
+
not download it for you.
|
|
42
|
+
2. **Build** the snapshot:
|
|
43
|
+
```sh
|
|
44
|
+
bunx gleif-snapshot golden-copy.zip gleif-snapshot.sqlite
|
|
45
|
+
# or: bun run node_modules/@absolutejs/dataset-gleif/dist/snapshot.js <file> [out]
|
|
46
|
+
```
|
|
47
|
+
3. **Use** it:
|
|
48
|
+
```ts
|
|
49
|
+
gleifSource({ snapshotPath: "gleif-snapshot.sqlite" });
|
|
50
|
+
```
|
|
51
|
+
`findCompany` resolves from the snapshot — instant, offline, zero rate limits —
|
|
52
|
+
falling back to the live API only on a miss. Re-run the generator to refresh
|
|
53
|
+
(GLEIF republishes 3×/day). Snapshot reads use `bun:sqlite` (loaded lazily;
|
|
54
|
+
live-only use needs nothing).
|
|
55
|
+
|
|
33
56
|
Apache-2.0. Pure importer of public CC0 data — the self-collected, self-healing
|
|
34
57
|
dataset is deliberately not part of any adapter.
|
package/dist/index.d.ts
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
import type { DatasetSource } from "@absolutejs/discover";
|
|
2
|
+
export declare const normalizeCompanyName: (value: string) => string;
|
|
2
3
|
export type GleifOptions = {
|
|
3
4
|
/** Inject a fetch (proxy, test stub). Defaults to global fetch. */
|
|
4
5
|
fetch?: typeof fetch;
|
|
5
6
|
timeoutMs?: number;
|
|
6
7
|
/** Only consider ACTIVE legal entities (default true). */
|
|
7
8
|
activeOnly?: boolean;
|
|
9
|
+
/** Path to a SQLite snapshot built by the snapshot generator (see scripts/).
|
|
10
|
+
* When set, findCompany resolves from it first — instant, offline, no rate
|
|
11
|
+
* limit — and falls back to the live API only on a miss. */
|
|
12
|
+
snapshotPath?: string;
|
|
8
13
|
};
|
|
9
14
|
export declare const gleifSource: (options?: GleifOptions) => DatasetSource;
|
package/dist/index.js
CHANGED
|
@@ -1,8 +1,23 @@
|
|
|
1
1
|
// @bun
|
|
2
|
+
var __require = import.meta.require;
|
|
3
|
+
|
|
2
4
|
// src/index.ts
|
|
3
5
|
var GLEIF_API = "https://api.gleif.org/api/v1";
|
|
4
6
|
var DEFAULT_TIMEOUT_MS = 1e4;
|
|
5
7
|
var DEFAULT_PAGE_SIZE = 10;
|
|
8
|
+
/**
 * Reduce a company name to the key used for exact-match comparison.
 *
 * Steps, in order: lowercase, NFKD-normalize, turn every character that is not
 * a Unicode letter, a Unicode digit, or a plain space into a space, drop common
 * legal-form words (and "the"), collapse whitespace runs, and trim.
 *
 * NOTE(review): NFKD splits accented letters into base letter + combining mark,
 * and the mark is then replaced by a space, so an accented letter breaks its
 * word into two tokens. This is preserved here because stored keys depend on it.
 * NOTE(review): the legal-form pattern has no `u` flag, so `\b` uses ASCII word
 * characters only.
 *
 * @param value Raw company name.
 * @returns The normalized key; an empty string when nothing is left.
 */
var normalizeCompanyName = (value) => {
  const decomposed = value.toLowerCase().normalize("NFKD");
  const alphanumericOnly = decomposed.replace(/[^\p{L}\p{N} ]/gu, " ");
  const withoutLegalForms = alphanumericOnly.replace(/\b(inc|incorporated|corp|corporation|llc|llp|lp|ltd|limited|co|company|gmbh|ag|sa|bv|nv|plc|holding|holdings|group|the)\b/g, " ");
  return withoutLegalForms.replace(/\s+/g, " ").trim();
};
|
|
9
|
+
// Per-path cache of lookup promises. Storing the promise (rather than the
// resolved function) lets concurrent first calls share a single database open.
var snapshotLookups = new Map;
/**
 * Get (and memoize) a lookup function backed by the SQLite snapshot at `path`.
 *
 * `bun:sqlite` is imported lazily, so it is only loaded when a snapshot is
 * actually used. The database is opened read-only and one prepared statement
 * is reused for every lookup; it returns at most one row for a given
 * `norm_name`, ordering rows with status 'ACTIVE' first.
 *
 * The in-flight promise is cached before the first `await`. Previously the
 * cache was filled only after the import resolved, so several calls issued
 * before that point each opened their own database handle. A failed open is
 * evicted from the cache so a later call can retry.
 *
 * @param path Filesystem path of the snapshot database.
 * @returns Promise of `(norm) => row`, where the row has `lei`, `name`,
 *   `country` and `status`, or is falsy when nothing matches. The promise
 *   rejects if the module import or the database open fails.
 */
var getSnapshotLookup = (path) => {
  const cached = snapshotLookups.get(path);
  if (cached)
    return cached;
  const pending = (async () => {
    const { Database } = await import("bun:sqlite");
    const db = new Database(path, { readonly: true });
    const statement = db.query("SELECT lei, name, country, status FROM company WHERE norm_name = ?1 ORDER BY CASE status WHEN 'ACTIVE' THEN 0 ELSE 1 END LIMIT 1");
    return (norm) => statement.get(norm);
  })();
  snapshotLookups.set(path, pending);
  // Do not keep a rejected promise around: drop it so the next call retries.
  pending.catch(() => {
    if (snapshotLookups.get(path) === pending)
      snapshotLookups.delete(path);
  });
  return pending;
};
|
|
6
21
|
var toCompany = (record) => {
|
|
7
22
|
const entity = record.attributes?.entity;
|
|
8
23
|
const name = entity?.legalName?.name?.trim();
|
|
@@ -25,6 +40,20 @@ var gleifSource = (options = {}) => {
|
|
|
25
40
|
const query = name?.trim();
|
|
26
41
|
if (!query)
|
|
27
42
|
return null;
|
|
43
|
+
if (options.snapshotPath) {
|
|
44
|
+
try {
|
|
45
|
+
const lookup = await getSnapshotLookup(options.snapshotPath);
|
|
46
|
+
const row = lookup(normalizeCompanyName(query));
|
|
47
|
+
if (row) {
|
|
48
|
+
return {
|
|
49
|
+
country: row.country ?? undefined,
|
|
50
|
+
name: row.name,
|
|
51
|
+
registryId: row.lei,
|
|
52
|
+
source: "gleif"
|
|
53
|
+
};
|
|
54
|
+
}
|
|
55
|
+
} catch {}
|
|
56
|
+
}
|
|
28
57
|
const params = new URLSearchParams;
|
|
29
58
|
params.set("filter[fulltext]", query);
|
|
30
59
|
params.set("page[size]", String(DEFAULT_PAGE_SIZE));
|
|
@@ -50,5 +79,6 @@ var gleifSource = (options = {}) => {
|
|
|
50
79
|
return { findCompany, name: "gleif" };
|
|
51
80
|
};
|
|
52
81
|
export {
|
|
82
|
+
normalizeCompanyName,
|
|
53
83
|
gleifSource
|
|
54
84
|
};
|
package/dist/snapshot.js
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
// @bun
|
|
3
|
+
|
|
4
|
+
// scripts/snapshot.ts
|
|
5
|
+
import { Database } from "bun:sqlite";
|
|
6
|
+
|
|
7
|
+
// src/index.ts
|
|
8
|
+
/**
 * Reduce a company name to the key stored in the snapshot's `norm_name` column.
 *
 * Steps, in order: lowercase, NFKD-normalize, turn every character that is not
 * a Unicode letter, a Unicode digit, or a plain space into a space, drop common
 * legal-form words (and "the"), collapse whitespace runs, and trim.
 *
 * NOTE(review): NFKD splits accented letters into base letter + combining mark,
 * and the mark is then replaced by a space, so an accented letter breaks its
 * word into two tokens. This is preserved here because stored keys depend on it.
 * NOTE(review): the legal-form pattern has no `u` flag, so `\b` uses ASCII word
 * characters only.
 *
 * @param value Raw company name.
 * @returns The normalized key; an empty string when nothing is left.
 */
var normalizeCompanyName = (value) => {
  const decomposed = value.toLowerCase().normalize("NFKD");
  const alphanumericOnly = decomposed.replace(/[^\p{L}\p{N} ]/gu, " ");
  const withoutLegalForms = alphanumericOnly.replace(/\b(inc|incorporated|corp|corporation|llc|llp|lp|ltd|limited|co|company|gmbh|ag|sa|bv|nv|plc|holding|holdings|group|the)\b/g, " ");
  return withoutLegalForms.replace(/\s+/g, " ").trim();
};
|
|
9
|
+
var snapshotLookups = new Map;
|
|
10
|
+
|
|
11
|
+
// scripts/snapshot.ts
|
|
12
|
+
var COLUMNS = {
|
|
13
|
+
country: "Entity.LegalAddress.Country",
|
|
14
|
+
lei: "LEI",
|
|
15
|
+
name: "Entity.LegalName",
|
|
16
|
+
status: "Entity.EntityStatus"
|
|
17
|
+
};
|
|
18
|
+
var BATCH_SIZE = 1e4;
|
|
19
|
+
var PROGRESS_EVERY = 1e5;
|
|
20
|
+
/**
 * Incrementally parse CSV from an async iterable of byte chunks.
 *
 * Handles double-quoted fields, `""` as an escaped quote inside a quoted
 * field, and commas or newlines inside quotes. Carriage returns outside quotes
 * are dropped; a line feed outside quotes ends the row. A final row without a
 * trailing newline is still emitted.
 *
 * Unlike the previous version, the streaming decoder is flushed once the input
 * ends, so bytes it was still buffering (an incomplete multi-byte sequence at
 * end of input) are decoded instead of being silently discarded.
 *
 * @param stream Async iterable yielding byte chunks accepted by `TextDecoder.decode`.
 * @yields One array of field strings per row, in input order.
 */
async function* csvRows(stream) {
  const decoder = new TextDecoder;
  let field = "";
  let row = [];
  let inQuotes = false;
  // Set after a `"` seen inside a quoted field: the next character decides
  // whether it was an escaped quote (`""`) or the closing quote.
  let quotePending = false;
  // Runs the parser over one piece of decoded text. Parser state lives in the
  // enclosing scope so it carries across chunks and across the final flush.
  function* consume(text) {
    for (const char of text) {
      if (quotePending) {
        quotePending = false;
        if (char === '"') {
          field += '"';
          inQuotes = true;
          continue;
        }
        // Closing quote: fall through and treat `char` as unquoted input.
        inQuotes = false;
      }
      if (inQuotes) {
        if (char === '"')
          quotePending = true;
        else
          field += char;
      } else if (char === '"') {
        inQuotes = true;
      } else if (char === ",") {
        row.push(field);
        field = "";
      } else if (char === "\n") {
        row.push(field);
        yield row;
        field = "";
        row = [];
      } else if (char !== "\r") {
        field += char;
      }
    }
  }
  for await (const chunk of stream) {
    yield* consume(decoder.decode(chunk, { stream: true }));
  }
  // Flush whatever the decoder is still holding back.
  yield* consume(decoder.decode());
  if (field.length > 0 || row.length > 0) {
    row.push(field);
    yield row;
  }
}
|
|
64
|
+
// ---- CLI entry point -------------------------------------------------------
// Reads a Golden Copy CSV (or a .zip streamed through `unzip -p`), and writes a
// `company` table plus a `norm_name` index into the output SQLite file.
//
// Compared with the previous version, a failed `unzip` or an input with no
// header row now ends with an error and exit code 1 instead of reporting
// "Done" for an empty or partial snapshot.
var inputPath = process.argv[2];
var outputPath = process.argv[3] ?? "gleif-snapshot.sqlite";
if (!inputPath) {
  console.error("usage: bun run scripts/snapshot.ts <golden-copy.csv|.zip> [out.sqlite]");
  process.exit(1);
}
// Keep the subprocess handle, not just its stdout, so its exit status can be
// inspected once the stream has been fully read.
var unzipProcess = inputPath.endsWith(".zip") ? Bun.spawn(["unzip", "-p", inputPath]) : null;
var source = unzipProcess ? unzipProcess.stdout : Bun.file(inputPath).stream();
var db = new Database(outputPath, { create: true });
db.run("PRAGMA journal_mode = WAL");
// Rebuild from scratch on every run.
db.run("DROP TABLE IF EXISTS company");
db.run("CREATE TABLE company (lei TEXT PRIMARY KEY, name TEXT NOT NULL, norm_name TEXT NOT NULL, country TEXT, status TEXT)");
var insert = db.query("INSERT OR REPLACE INTO company (lei, name, norm_name, country, status) VALUES (?1, ?2, ?3, ?4, ?5)");
// One transaction per batch instead of one per row.
var insertBatch = db.transaction((rows) => {
  for (const values of rows)
    insert.run(...values);
});
// Column indexes resolved from the header row; null until the header is seen.
var cols = null;
var batch = [];
var total = 0;
var flush = () => {
  if (batch.length > 0) {
    insertBatch(batch);
    batch = [];
  }
};
for await (const row of csvRows(source)) {
  if (!cols) {
    // First row is the header: map column name -> index, stripping a BOM.
    const map = {};
    row.forEach((column, index) => {
      map[column.replace(/^\uFEFF/, "").trim()] = index;
    });
    const lei2 = map[COLUMNS.lei];
    const name2 = map[COLUMNS.name];
    if (lei2 === undefined || name2 === undefined) {
      console.error(`Missing required columns. Found headers: ${row.slice(0, 8).join(", ")}\u2026`);
      process.exit(1);
    }
    cols = {
      // -1 marks an optional column that is absent from the header.
      country: map[COLUMNS.country] ?? -1,
      lei: lei2,
      name: name2,
      status: map[COLUMNS.status] ?? -1
    };
    continue;
  }
  const lei = row[cols.lei]?.trim();
  const name = row[cols.name]?.trim();
  // Skip rows without an LEI or a legal name (including blank lines).
  if (!lei || !name)
    continue;
  batch.push([
    lei,
    name,
    normalizeCompanyName(name),
    cols.country >= 0 ? row[cols.country]?.trim() ?? "" : "",
    cols.status >= 0 ? row[cols.status]?.trim() ?? "" : ""
  ]);
  total += 1;
  if (batch.length >= BATCH_SIZE)
    flush();
  if (total % PROGRESS_EVERY === 0) {
    console.log(`  ${total.toLocaleString()} rows\u2026`);
  }
}
flush();
if (unzipProcess) {
  // stdout has been drained, so the process has finished or is about to.
  const exitCode = await unzipProcess.exited;
  if (exitCode !== 0) {
    console.error(`unzip exited with code ${exitCode}; the snapshot would be empty or incomplete. Extract the CSV manually and pass it instead.`);
    db.close();
    process.exit(1);
  }
}
if (!cols) {
  console.error("No header row found: the input is empty or could not be read.");
  db.close();
  process.exit(1);
}
console.log("Indexing\u2026");
db.run("CREATE INDEX idx_company_norm_name ON company(norm_name)");
db.run("PRAGMA wal_checkpoint(TRUNCATE)");
db.close();
console.log(`Done \u2014 ${total.toLocaleString()} entities \u2192 ${outputPath}`);
|
package/package.json
CHANGED
|
@@ -20,6 +20,9 @@
|
|
|
20
20
|
"lei",
|
|
21
21
|
"company-data"
|
|
22
22
|
],
|
|
23
|
+
"bin": {
|
|
24
|
+
"gleif-snapshot": "./dist/snapshot.js"
|
|
25
|
+
},
|
|
23
26
|
"license": "Apache-2.0",
|
|
24
27
|
"main": "./dist/index.js",
|
|
25
28
|
"name": "@absolutejs/dataset-gleif",
|
|
@@ -35,11 +38,11 @@
|
|
|
35
38
|
"url": "https://github.com/absolutejs/dataset-adapters.git"
|
|
36
39
|
},
|
|
37
40
|
"scripts": {
|
|
38
|
-
"build": "rm -rf dist && bun build src/index.ts --outdir dist --target=bun --external @absolutejs/discover && tsc --emitDeclarationOnly --project tsconfig.json",
|
|
41
|
+
"build": "rm -rf dist && bun build src/index.ts --outdir dist --target=bun --external @absolutejs/discover && bun build scripts/snapshot.ts --outdir dist --target=bun --external @absolutejs/discover && tsc --emitDeclarationOnly --project tsconfig.json",
|
|
39
42
|
"release": "bun run build && bun publish",
|
|
40
43
|
"typecheck": "tsc --noEmit"
|
|
41
44
|
},
|
|
42
45
|
"type": "module",
|
|
43
46
|
"types": "./dist/index.d.ts",
|
|
44
|
-
"version": "0.0.2"
|
|
47
|
+
"version": "0.0.3"
|
|
45
48
|
}
|