latinfo 0.12.2 → 0.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +441 -19
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -47,7 +47,7 @@ const local_search_1 = require("./local-search");
|
|
|
47
47
|
const client_search_1 = require("./client-search");
|
|
48
48
|
const odis_search_1 = require("./odis-search");
|
|
49
49
|
const mphf_search_1 = require("./mphf-search");
|
|
50
|
-
const VERSION = '0.12.2';
|
|
50
|
+
const VERSION = '0.13.1';
|
|
51
51
|
const API_URL = process.env.LATINFO_API_URL || 'https://api.latinfo.dev';
|
|
52
52
|
const GITHUB_CLIENT_ID = process.env.GITHUB_CLIENT_ID || 'Ov23li5fcQaiCsVtaMKK';
|
|
53
53
|
const CONFIG_DIR = path_1.default.join(os_1.default.homedir(), '.latinfo');
|
|
@@ -1786,12 +1786,16 @@ async function pipeTest(args) {
|
|
|
1786
1786
|
}
|
|
1787
1787
|
}
|
|
1788
1788
|
// Run import with --limit 100
|
|
1789
|
-
|
|
1789
|
+
// Check YAML for custom import_script path
|
|
1790
|
+
const importScriptMatch = yamlContent.match(/import_script:\s*(.+)/);
|
|
1791
|
+
const customScript = importScriptMatch ? path_1.default.join(repo, importScriptMatch[1].trim()) : null;
|
|
1792
|
+
const defaultScript = path_1.default.join(repo, 'src', 'imports', `${sourceName}.ts`);
|
|
1793
|
+
const scriptPath = customScript && fs_1.default.existsSync(customScript) ? customScript
|
|
1794
|
+
: fs_1.default.existsSync(defaultScript) ? defaultScript : null;
|
|
1790
1795
|
const easypipePath = path_1.default.join(repo, 'src', 'imports', 'easypipe.ts');
|
|
1791
|
-
const
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
: `npx tsx ${scriptPath} --limit 100 --local`;
|
|
1796
|
+
const cmd = scriptPath
|
|
1797
|
+
? `npx tsx ${scriptPath} --limit 100 --local`
|
|
1798
|
+
: `npx tsx ${easypipePath} ${yamlPath} --limit 100 --local`;
|
|
1795
1799
|
console.log(`[pipe] Gate 1: TEST (100 records)\n`);
|
|
1796
1800
|
console.log(`Running: ${cmd}\n`);
|
|
1797
1801
|
let output = '';
|
|
@@ -1852,12 +1856,17 @@ async function pipeValidate(args) {
|
|
|
1852
1856
|
requireGate(status, 'test', 'validate');
|
|
1853
1857
|
const repo = getRepoPath();
|
|
1854
1858
|
const { execSync: run } = await Promise.resolve().then(() => __importStar(require('child_process')));
|
|
1855
|
-
const
|
|
1859
|
+
const yamlPath = path_1.default.join(repo, 'sources', `${sourceName}.yaml`);
|
|
1860
|
+
const yamlContent = fs_1.default.existsSync(yamlPath) ? fs_1.default.readFileSync(yamlPath, 'utf-8') : '';
|
|
1861
|
+
const importScriptMatch = yamlContent.match(/import_script:\s*(.+)/);
|
|
1862
|
+
const customScript = importScriptMatch ? path_1.default.join(repo, importScriptMatch[1].trim()) : null;
|
|
1863
|
+
const defaultScript = path_1.default.join(repo, 'src', 'imports', `${sourceName}.ts`);
|
|
1864
|
+
const scriptPath = customScript && fs_1.default.existsSync(customScript) ? customScript
|
|
1865
|
+
: fs_1.default.existsSync(defaultScript) ? defaultScript : null;
|
|
1856
1866
|
const easypipePath = path_1.default.join(repo, 'src', 'imports', 'easypipe.ts');
|
|
1857
|
-
const
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
: `npx tsx ${scriptPath} --local`;
|
|
1867
|
+
const cmd = scriptPath
|
|
1868
|
+
? `npx tsx ${scriptPath} --local`
|
|
1869
|
+
: `npx tsx ${easypipePath} ${yamlPath} --local`;
|
|
1861
1870
|
console.log(`[pipe] Gate 2: VALIDATE (full import, local only — no R2 upload)\n`);
|
|
1862
1871
|
console.log(`Running: ${cmd}\n`);
|
|
1863
1872
|
try {
|
|
@@ -2130,17 +2139,31 @@ async function pipePublish(args) {
|
|
|
2130
2139
|
const { execSync: run } = await Promise.resolve().then(() => __importStar(require('child_process')));
|
|
2131
2140
|
const RUNNER = 'f3mt0@100.109.82.87';
|
|
2132
2141
|
console.log(`[pipe] Gate 4: PUBLISH\n`);
|
|
2133
|
-
// 1. Git add + commit + push
|
|
2142
|
+
// 1. Auto-generate sources.ts from YAMLs
|
|
2143
|
+
console.log(`[pipe] Generating sources.ts...`);
|
|
2144
|
+
try {
|
|
2145
|
+
run(`npx tsx src/imports/generate-sources.ts`, { cwd: repo, stdio: 'inherit' });
|
|
2146
|
+
}
|
|
2147
|
+
catch {
|
|
2148
|
+
console.error(`[pipe] Failed to generate sources.ts`);
|
|
2149
|
+
process.exit(1);
|
|
2150
|
+
}
|
|
2151
|
+
// 2. Git add + commit + push
|
|
2134
2152
|
console.log(`[pipe] Committing to repo...`);
|
|
2135
|
-
const files = [`sources/${sourceName}.yaml`];
|
|
2153
|
+
const files = [`sources/${sourceName}.yaml`, 'src/sources.ts'];
|
|
2136
2154
|
const scriptPath = path_1.default.join(repo, 'src', 'imports', `${sourceName}.ts`);
|
|
2137
|
-
|
|
2155
|
+
const yamlContent = fs_1.default.readFileSync(path_1.default.join(repo, 'sources', `${sourceName}.yaml`), 'utf-8');
|
|
2156
|
+
const importScriptMatch = yamlContent.match(/import_script:\s*(.+)/);
|
|
2157
|
+
const customScript = importScriptMatch ? importScriptMatch[1].trim() : null;
|
|
2158
|
+
if (customScript && fs_1.default.existsSync(path_1.default.join(repo, customScript)))
|
|
2159
|
+
files.push(customScript);
|
|
2160
|
+
else if (fs_1.default.existsSync(scriptPath))
|
|
2138
2161
|
files.push(`src/imports/${sourceName}.ts`);
|
|
2162
|
+
const docsFile = `docs/sources/${sourceName}.md`;
|
|
2163
|
+
if (fs_1.default.existsSync(path_1.default.join(repo, docsFile)))
|
|
2164
|
+
files.push(docsFile);
|
|
2139
2165
|
try {
|
|
2140
|
-
|
|
2141
|
-
if (fs_1.default.existsSync(path_1.default.join(repo, docsFile)))
|
|
2142
|
-
files.push(docsFile);
|
|
2143
|
-
run(`git add ${files.join(' ')} src/sources.ts .github/workflows/import.yml`, { cwd: repo, stdio: 'pipe' });
|
|
2166
|
+
run(`git add ${files.join(' ')}`, { cwd: repo, stdio: 'pipe' });
|
|
2144
2167
|
run(`git commit -m "Add data source: ${sourceName}"`, { cwd: repo, stdio: 'pipe' });
|
|
2145
2168
|
run(`git push`, { cwd: repo, stdio: 'pipe' });
|
|
2146
2169
|
console.log(`[pipe] Pushed to remote.`);
|
|
@@ -2170,7 +2193,7 @@ async function pipePublish(args) {
|
|
|
2170
2193
|
// 4. Restart search server
|
|
2171
2194
|
console.log(`[pipe] Restarting search server on Linux Mint...`);
|
|
2172
2195
|
try {
|
|
2173
|
-
run(`ssh ${RUNNER} "sudo systemctl restart search
|
|
2196
|
+
run(`ssh ${RUNNER} "sudo systemctl restart latinfo-search 2>/dev/null || echo 'No service yet'"`, { stdio: 'inherit' });
|
|
2174
2197
|
}
|
|
2175
2198
|
catch { }
|
|
2176
2199
|
console.log(`\n[pipe] Gate 4 PASSED ✓`);
|
|
@@ -2321,6 +2344,402 @@ ENVIRONMENT
|
|
|
2321
2344
|
LATINFO_REPO_PATH Auto-detected from cwd`);
|
|
2322
2345
|
}
|
|
2323
2346
|
}
|
|
2347
|
+
// --- Docs ---
|
|
2348
|
+
const DOCS = {
|
|
2349
|
+
index: `latinfo docs — complete documentation
|
|
2350
|
+
|
|
2351
|
+
TOPICS
|
|
2352
|
+
latinfo docs pipe How to create a data pipeline (full guide)
|
|
2353
|
+
latinfo docs fields searchFieldIndex, statusFieldIndex explained
|
|
2354
|
+
latinfo docs v2 V2 search index + MPHF (mandatory)
|
|
2355
|
+
latinfo docs encoding Encoding issues (latin1, UTF-8, replacement chars)
|
|
2356
|
+
latinfo docs script Import script template + requirements
|
|
2357
|
+
latinfo docs troubleshooting Common errors and fixes
|
|
2358
|
+
latinfo docs architecture How latinfo works internally
|
|
2359
|
+
latinfo docs api API endpoints and response format`,
|
|
2360
|
+
pipe: `HOW TO CREATE A DATA PIPELINE
|
|
2361
|
+
|
|
2362
|
+
latinfo pipe handles storage, indexing, search, and API serving automatically.
|
|
2363
|
+
Your only job is to write a script that produces a TSV file.
|
|
2364
|
+
|
|
2365
|
+
STEP BY STEP
|
|
2366
|
+
|
|
2367
|
+
1. Create source definition:
|
|
2368
|
+
latinfo pipe create <country> <institution> <dataset> [flags]
|
|
2369
|
+
|
|
2370
|
+
Example:
|
|
2371
|
+
latinfo pipe create pe redam registry --url https://redam.pj.gob.pe/ --id-name dni --id-length 8
|
|
2372
|
+
|
|
2373
|
+
This generates a YAML config. Edit it to match your source.
|
|
2374
|
+
|
|
2375
|
+
2. Write your import script (any method: fetch, Playwright, curl, Python):
|
|
2376
|
+
Your script must:
|
|
2377
|
+
a) Download the source data
|
|
2378
|
+
b) Parse to TSV: ID\\tfield1\\tfield2\\t...
|
|
2379
|
+
c) Sort by ID: LC_ALL=C sort -t'\\t' -k1,1
|
|
2380
|
+
d) Call buildBinaryFiles() → .bin + .idx
|
|
2381
|
+
e) Call buildSearchIndex() with statusFieldIndex → V2 search (MANDATORY)
|
|
2382
|
+
f) Call buildMphfFromIdx() → .mphf (MANDATORY)
|
|
2383
|
+
g) Call uploadToR2() for each file
|
|
2384
|
+
h) Call saveImportMeta()
|
|
2385
|
+
|
|
2386
|
+
See: latinfo docs script (for full template)
|
|
2387
|
+
See: latinfo docs fields (for searchFieldIndex and statusFieldIndex)
|
|
2388
|
+
|
|
2389
|
+
3. Upload script:
|
|
2390
|
+
latinfo pipe script <source> ./my-script.ts
|
|
2391
|
+
|
|
2392
|
+
4. Add dependencies (if any):
|
|
2393
|
+
latinfo pipe deps <source> playwright ddddocr
|
|
2394
|
+
|
|
2395
|
+
5. Test (100 records, LOCAL — no R2 upload):
|
|
2396
|
+
latinfo pipe test <source>
|
|
2397
|
+
|
|
2398
|
+
6. Validate (all records, LOCAL — no R2 upload):
|
|
2399
|
+
latinfo pipe validate <source>
|
|
2400
|
+
|
|
2401
|
+
7. Stage (Linux Mint — import + 500 concurrent bench):
|
|
2402
|
+
latinfo pipe stage <source>
|
|
2403
|
+
|
|
2404
|
+
8. Document:
|
|
2405
|
+
latinfo pipe docs <source>
|
|
2406
|
+
|
|
2407
|
+
9. Publish to production:
|
|
2408
|
+
latinfo pipe publish <source>
|
|
2409
|
+
|
|
2410
|
+
GATES
|
|
2411
|
+
Each gate must pass before the next unlocks:
|
|
2412
|
+
test → validate → stage → docs → publish
|
|
2413
|
+
|
|
2414
|
+
test: 100 records, validates V2 + MPHF, no R2 upload
|
|
2415
|
+
validate: full import, all records, no R2 upload
|
|
2416
|
+
stage: runs on Linux Mint, uploads to R2, 500 concurrent bench (99.9% required)
|
|
2417
|
+
docs: documentation with required sections
|
|
2418
|
+
publish: deploy to Cloudflare, restart search server, smoke test, rollback on failure
|
|
2419
|
+
|
|
2420
|
+
NAMING
|
|
2421
|
+
Source name: {country}-{institution}-{dataset}
|
|
2422
|
+
Country: ISO 3166-1 alpha-2 lowercase (pe, co, br, mx, ec, ar, cl)
|
|
2423
|
+
Institution: government agency abbreviation, lowercase
|
|
2424
|
+
Dataset: what the data contains, english, lowercase
|
|
2425
|
+
Examples: pe-sunat-padron, pe-osce-sanctioned, co-rues-registry, pe-redam-registry`,
|
|
2426
|
+
fields: `FIELD INDEXES: searchFieldIndex and statusFieldIndex
|
|
2427
|
+
|
|
2428
|
+
Both go in buildSearchIndex() inside your import script.
|
|
2429
|
+
NOT in src/sources.ts. NOT in the YAML config.
|
|
2430
|
+
|
|
2431
|
+
searchFieldIndex
|
|
2432
|
+
Which TSV column (after ID) is the main searchable text.
|
|
2433
|
+
Almost always 0 (the first field after the primary ID).
|
|
2434
|
+
This field gets tokenized and indexed for full-text search.
|
|
2435
|
+
|
|
2436
|
+
statusFieldIndex
|
|
2437
|
+
Which TSV column (after ID) appears as a short label in search results.
|
|
2438
|
+
Choose the field that helps identify records at a glance.
|
|
2439
|
+
|
|
2440
|
+
| Source type | statusFieldIndex | Field | Example value |
|
|
2441
|
+
|---------------|-----------------|-----------------|------------------------|
|
|
2442
|
+
| Companies | 1 | estado | ACTIVO, CANCELADA |
|
|
2443
|
+
| Sanctions | 1 | date_start | 20230103 |
|
|
2444
|
+
| People (REDAM)| 4 | fecha_registro | 2022-10-27 |
|
|
2445
|
+
| Fines | 1 | date_start | 20230103 |
|
|
2446
|
+
|
|
2447
|
+
HOW TO COUNT
|
|
2448
|
+
|
|
2449
|
+
Your TSV looks like: ID\\tfield0\\tfield1\\tfield2\\tfield3\\tfield4
|
|
2450
|
+
The index is 0-based, counting from the first field AFTER the ID.
|
|
2451
|
+
|
|
2452
|
+
Example REDAM TSV:
|
|
2453
|
+
12345678\\tROSALES\\tORTIZ\\tJUAN\\tDNI\\t2022-10-27
|
|
2454
|
+
ID 0(ape_p) 1(ape_m) 2(nombres) 3(tipo) 4(fecha)
|
|
2455
|
+
|
|
2456
|
+
searchFieldIndex: 0 → search by ape_paterno (last name)
|
|
2457
|
+
statusFieldIndex: 4 → show fecha_registro in search results
|
|
2458
|
+
|
|
2459
|
+
CODE EXAMPLE
|
|
2460
|
+
|
|
2461
|
+
// In your import script (NOT in sources.ts):
|
|
2462
|
+
const search = await buildSearchIndex(
|
|
2463
|
+
sortedPath, TEMP_DIR, SOURCE,
|
|
2464
|
+
{ searchFieldIndex: 0, idRegex: /^\\d{8}$/, statusFieldIndex: 4 },
|
|
2465
|
+
recordCount,
|
|
2466
|
+
);
|
|
2467
|
+
|
|
2468
|
+
IMPORTANT
|
|
2469
|
+
- statusFieldIndex is MANDATORY (V2). Without it, you get V1 which is deprecated.
|
|
2470
|
+
- If pipe test says "V1" in the output, you forgot statusFieldIndex.
|
|
2471
|
+
- These indexes do NOT go in src/sources.ts — that file is for the API router.`,
|
|
2472
|
+
v2: `V2 SEARCH INDEX + MPHF
|
|
2473
|
+
|
|
2474
|
+
V2 is MANDATORY for all sources. V1 is deprecated.
|
|
2475
|
+
|
|
2476
|
+
WHAT IS V2?
|
|
2477
|
+
V2 stores name + status inline in the search posting list (110 bytes per entry).
|
|
2478
|
+
Search results come directly from the index — no secondary R2 lookup needed.
|
|
2479
|
+
V1 only stored the ID (8 bytes) and required a second read to get the name.
|
|
2480
|
+
|
|
2481
|
+
HOW TO ENABLE V2
|
|
2482
|
+
Pass statusFieldIndex to buildSearchIndex():
|
|
2483
|
+
|
|
2484
|
+
buildSearchIndex(sortedPath, TEMP_DIR, SOURCE,
|
|
2485
|
+
{ searchFieldIndex: 0, idRegex: /^\\d{11}$/, statusFieldIndex: 1 },
|
|
2486
|
+
recordCount,
|
|
2487
|
+
);
|
|
2488
|
+
|
|
2489
|
+
See: latinfo docs fields (for how to choose statusFieldIndex)
|
|
2490
|
+
|
|
2491
|
+
WHAT IS MPHF?
|
|
2492
|
+
Minimal Perfect Hash Function. A compact dictionary (~500KB) that enables
|
|
2493
|
+
client-side offline search. The client downloads it once and searches locally
|
|
2494
|
+
without hitting the server.
|
|
2495
|
+
|
|
2496
|
+
HOW TO GENERATE MPHF
|
|
2497
|
+
Always call buildMphfFromIdx() after buildSearchIndex():
|
|
2498
|
+
|
|
2499
|
+
const mphfPath = buildMphfFromIdx(search.idxPath);
|
|
2500
|
+
uploadToR2(mphfPath, \`\${SOURCE}-search.mphf\`);
|
|
2501
|
+
|
|
2502
|
+
DETECTION
|
|
2503
|
+
pipe test checks script output for "V1" or "V2" markers.
|
|
2504
|
+
If it sees V1, the gate fails with:
|
|
2505
|
+
"Search index is V1 — MUST use V2. Add statusFieldIndex to buildSearchIndex()"`,
|
|
2506
|
+
encoding: `ENCODING ISSUES
|
|
2507
|
+
|
|
2508
|
+
COMMON PROBLEM
|
|
2509
|
+
Source file is ISO-8859-1 (latin1) but read as UTF-8.
|
|
2510
|
+
Characters like ó, ñ, é appear as replacement char (U+FFFD).
|
|
2511
|
+
|
|
2512
|
+
HOW TO DETECT
|
|
2513
|
+
Run: file downloaded-file.csv
|
|
2514
|
+
If it says "ISO-8859 text" → it's latin1.
|
|
2515
|
+
|
|
2516
|
+
HOW TO FIX
|
|
2517
|
+
Read with latin1, write with UTF-8:
|
|
2518
|
+
|
|
2519
|
+
const input = createReadStream(csvPath, { encoding: 'latin1' });
|
|
2520
|
+
const output = createWriteStream(tsvPath, { encoding: 'utf-8' });
|
|
2521
|
+
|
|
2522
|
+
Node's latin1 decoder converts 0xF3 (ó) to the correct JS string,
|
|
2523
|
+
then UTF-8 writer encodes it as 0xC3B3.
|
|
2524
|
+
|
|
2525
|
+
TRUNCATION BUG (FIXED)
|
|
2526
|
+
build-binary.ts truncates fields to 255 bytes. If the cut lands in the middle
|
|
2527
|
+
of a multi-byte UTF-8 character, it produces replacement chars.
|
|
2528
|
+
This is fixed — build-binary.ts now backs up to the last valid UTF-8 boundary.
|
|
2529
|
+
|
|
2530
|
+
YAML CONFIG
|
|
2531
|
+
Set encoding in your YAML:
|
|
2532
|
+
encoding: iso-8859-1
|
|
2533
|
+
|
|
2534
|
+
Or for UTF-8 (default):
|
|
2535
|
+
encoding: utf-8`,
|
|
2536
|
+
script: `IMPORT SCRIPT TEMPLATE
|
|
2537
|
+
|
|
2538
|
+
Your script downloads data and produces a sorted TSV. That's it.
|
|
2539
|
+
latinfo handles everything else (binary format, search index, MPHF, R2 upload).
|
|
2540
|
+
|
|
2541
|
+
FULL TEMPLATE
|
|
2542
|
+
|
|
2543
|
+
import { createReadStream, createWriteStream } from 'fs';
|
|
2544
|
+
import * as readline from 'readline';
|
|
2545
|
+
import * as https from 'https';
|
|
2546
|
+
import * as fs from 'fs';
|
|
2547
|
+
import * as path from 'path';
|
|
2548
|
+
import { execSync } from 'child_process';
|
|
2549
|
+
import { buildBinaryFiles } from './build-binary';
|
|
2550
|
+
import { buildSearchIndex } from './build-search-index';
|
|
2551
|
+
import { uploadToR2, saveImportMeta, buildMphfFromIdx } from './shared';
|
|
2552
|
+
|
|
2553
|
+
const SOURCE = 'pe-example-dataset'; // must match YAML name
|
|
2554
|
+
const TEMP_DIR = \`/tmp/\${SOURCE}-import\`;
|
|
2555
|
+
|
|
2556
|
+
export async function importExample(options?: { limit?: number }) {
|
|
2557
|
+
console.log(\`=== \${SOURCE.toUpperCase()} IMPORT ===\\n\`);
|
|
2558
|
+
try {
|
|
2559
|
+
fs.mkdirSync(TEMP_DIR, { recursive: true });
|
|
2560
|
+
|
|
2561
|
+
// 1. Download
|
|
2562
|
+
// ... your download logic (fetch, Playwright, curl, etc.)
|
|
2563
|
+
|
|
2564
|
+
// 2. Parse to TSV
|
|
2565
|
+
const tsvPath = path.join(TEMP_DIR, 'parsed.tsv');
|
|
2566
|
+
const output = createWriteStream(tsvPath, { encoding: 'utf-8' });
|
|
2567
|
+
// ... parse source → output.write(\`\${id}\\t\${field1}\\t\${field2}\\n\`)
|
|
2568
|
+
await new Promise<void>(r => output.end(r));
|
|
2569
|
+
|
|
2570
|
+
// 3. Sort
|
|
2571
|
+
const sortedPath = path.join(TEMP_DIR, 'sorted.tsv');
|
|
2572
|
+
execSync(\`LC_ALL=C sort -t'\\t' -k1,1 "\${tsvPath}" -o "\${sortedPath}"\`);
|
|
2573
|
+
fs.unlinkSync(tsvPath);
|
|
2574
|
+
|
|
2575
|
+
// 4. Build binary
|
|
2576
|
+
const config = { idLength: 11, idRegex: /^\\d{11}$/, prefixLength: 5, fieldCount: 6 };
|
|
2577
|
+
const { shardPaths, idxPath, recordCount } = await buildBinaryFiles(
|
|
2578
|
+
sortedPath, TEMP_DIR, SOURCE, config);
|
|
2579
|
+
|
|
2580
|
+
// 5. Build V2 search (MANDATORY: pass statusFieldIndex)
|
|
2581
|
+
const search = await buildSearchIndex(sortedPath, TEMP_DIR, SOURCE,
|
|
2582
|
+
{ searchFieldIndex: 0, idRegex: /^\\d{11}$/, statusFieldIndex: 1 },
|
|
2583
|
+
recordCount);
|
|
2584
|
+
|
|
2585
|
+
// 6. Delete sorted TSV
|
|
2586
|
+
fs.unlinkSync(sortedPath);
|
|
2587
|
+
|
|
2588
|
+
// 7. Upload to R2
|
|
2589
|
+
for (let i = 0; i < shardPaths.length; i++)
|
|
2590
|
+
uploadToR2(shardPaths[i], \`\${SOURCE}-\${i}.bin\`);
|
|
2591
|
+
uploadToR2(idxPath, \`\${SOURCE}.idx\`);
|
|
2592
|
+
|
|
2593
|
+
// 8. Build + upload MPHF (MANDATORY)
|
|
2594
|
+
const mphfPath = buildMphfFromIdx(search.idxPath);
|
|
2595
|
+
uploadToR2(search.idxPath, \`\${SOURCE}-search.idx\`);
|
|
2596
|
+
uploadToR2(mphfPath, \`\${SOURCE}-search.mphf\`);
|
|
2597
|
+
for (let i = 0; i < search.shardPaths.length; i++)
|
|
2598
|
+
uploadToR2(search.shardPaths[i], \`\${SOURCE}-search-\${i}.dat\`);
|
|
2599
|
+
|
|
2600
|
+
// 9. Save metadata
|
|
2601
|
+
saveImportMeta(SOURCE, new Date().toISOString(), recordCount);
|
|
2602
|
+
|
|
2603
|
+
// 10. Cleanup
|
|
2604
|
+
fs.rmSync(TEMP_DIR, { recursive: true, force: true });
|
|
2605
|
+
console.log(\`\\n[\${SOURCE}] Success: \${recordCount.toLocaleString()} records\`);
|
|
2606
|
+
return true;
|
|
2607
|
+
} catch (error) {
|
|
2608
|
+
console.error(\`\\n[\${SOURCE}] Error:\`, error);
|
|
2609
|
+
return false;
|
|
2610
|
+
}
|
|
2611
|
+
}
|
|
2612
|
+
|
|
2613
|
+
if (require.main === module) {
|
|
2614
|
+
const args = process.argv.slice(2);
|
|
2615
|
+
const limitIdx = args.indexOf('--limit');
|
|
2616
|
+
const limit = limitIdx !== -1 ? parseInt(args[limitIdx + 1]) : undefined;
|
|
2617
|
+
importExample({ limit }).then(ok => process.exit(ok ? 0 : 1));
|
|
2618
|
+
}
|
|
2619
|
+
|
|
2620
|
+
CLEAN EXAMPLE
|
|
2621
|
+
See src/imports/pe-osce-sanctioned.ts (CSV, latin1, 9K records)
|
|
2622
|
+
|
|
2623
|
+
IMPORTANT
|
|
2624
|
+
- SOURCE const must match the YAML filename (without .yaml)
|
|
2625
|
+
- Always use V2 (statusFieldIndex) — see: latinfo docs fields
|
|
2626
|
+
- Always generate MPHF — see: latinfo docs v2
|
|
2627
|
+
- Use --local flag support: the shared.ts uploadToR2 skips uploads when --local is passed`,
|
|
2628
|
+
troubleshooting: `COMMON ERRORS AND FIXES
|
|
2629
|
+
|
|
2630
|
+
ENCODING
|
|
2631
|
+
Problem: ó appears as replacement char in output
|
|
2632
|
+
Fix: Read source as latin1, write TSV as UTF-8
|
|
2633
|
+
See: latinfo docs encoding
|
|
2634
|
+
|
|
2635
|
+
V1 INSTEAD OF V2
|
|
2636
|
+
Problem: pipe test says "Search index is V1"
|
|
2637
|
+
Fix: Add statusFieldIndex to buildSearchIndex()
|
|
2638
|
+
See: latinfo docs fields
|
|
2639
|
+
|
|
2640
|
+
NO MPHF
|
|
2641
|
+
Problem: pipe test says "No MPHF generated"
|
|
2642
|
+
Fix: Call buildMphfFromIdx() after buildSearchIndex()
|
|
2643
|
+
See: latinfo docs v2
|
|
2644
|
+
|
|
2645
|
+
HTTP 418 / 403
|
|
2646
|
+
Problem: Source server blocks download
|
|
2647
|
+
Fix: Add User-Agent header to your request:
|
|
2648
|
+
headers: { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' }
|
|
2649
|
+
|
|
2650
|
+
0 RECORDS PARSED
|
|
2651
|
+
Problem: Import succeeds but no records
|
|
2652
|
+
Fix: Check delimiter and column indexes in YAML match the source file
|
|
2653
|
+
|
|
2654
|
+
UPLOAD FAILS (CLOUDFLARE_API_TOKEN)
|
|
2655
|
+
Problem: uploadToR2 fails with "set CLOUDFLARE_API_TOKEN"
|
|
2656
|
+
Fix: Ensure .env has CLOUDFLARE_API_TOKEN. For local testing use --local flag.
|
|
2657
|
+
pipe test and pipe validate already use --local (no R2 upload).
|
|
2658
|
+
|
|
2659
|
+
GATE LOCKED
|
|
2660
|
+
Problem: "Gate X has not passed. Run: latinfo pipe X"
|
|
2661
|
+
Fix: Gates are sequential. Run them in order: test → validate → stage → docs → publish
|
|
2662
|
+
|
|
2663
|
+
SCRIPT CHANGED, GATES RESET
|
|
2664
|
+
Problem: After running pipe script, all gates show "not run"
|
|
2665
|
+
This is expected. Uploading a new script resets all gates — re-run from test.`,
|
|
2666
|
+
architecture: `HOW LATINFO WORKS
|
|
2667
|
+
|
|
2668
|
+
DATA FLOW
|
|
2669
|
+
Source (CSV/API/crawler) → TSV → sort → binary (.bin + .idx) → R2
|
|
2670
|
+
→ search index (.idx + .dat) → R2 / Linux Mint
|
|
2671
|
+
→ MPHF (.mphf) → R2
|
|
2672
|
+
|
|
2673
|
+
SERVING
|
|
2674
|
+
Lookups (by ID): User → Cloudflare Worker → R2 range read → response
|
|
2675
|
+
Search (by name): User → Cloudflare Worker → Linux Mint (search server, RAM) → response
|
|
2676
|
+
Offline search: Client downloads .mphf once → searches locally, zero server
|
|
2677
|
+
|
|
2678
|
+
STORAGE
|
|
2679
|
+
R2 (Cloudflare): .bin shards (~200MB each), .idx prefix index (~300KB), search shards
|
|
2680
|
+
Linux Mint RAM: search index + posting lists (~2GB per source) for fast search
|
|
2681
|
+
|
|
2682
|
+
BINARY FORMAT
|
|
2683
|
+
.bin: records sorted by ID, each record = uint16 length + ID + fields (uint8 len + data)
|
|
2684
|
+
.idx: prefix index, 16-byte entries (prefix, shard, offset, length)
|
|
2685
|
+
Fields truncated to 255 bytes with UTF-8 boundary safety
|
|
2686
|
+
|
|
2687
|
+
SEARCH
|
|
2688
|
+
V2 search index: inverted index with inline name + status (110 bytes per posting)
|
|
2689
|
+
Tokenized, IDF-scored, prefix matching, stop words filtered
|
|
2690
|
+
Search server: Node cluster on Linux Mint, 2 workers, ~670 queries/sec
|
|
2691
|
+
|
|
2692
|
+
INFRASTRUCTURE
|
|
2693
|
+
Cloudflare Workers: API routing, auth, rate limiting, lookups
|
|
2694
|
+
Cloudflare D1: auth database (api_keys, usage)
|
|
2695
|
+
Cloudflare R2: data storage
|
|
2696
|
+
Linux Mint: search server (RAM), GitHub Actions runner (imports)
|
|
2697
|
+
Cloudflare Tunnel: search.latinfo.dev → Linux Mint port 3001`,
|
|
2698
|
+
api: `API ENDPOINTS
|
|
2699
|
+
|
|
2700
|
+
All endpoints require Authorization: Bearer <api_key>
|
|
2701
|
+
|
|
2702
|
+
ROUTE PATTERN
|
|
2703
|
+
/{country}/{institution}/{dataset}/{action}
|
|
2704
|
+
|
|
2705
|
+
LOOKUP BY ID
|
|
2706
|
+
GET /pe/sunat/padron/ruc/20100047218
|
|
2707
|
+
GET /pe/sunat/padron/dni/09346247
|
|
2708
|
+
GET /pe/sunat/coactiva/ruc/20348858182
|
|
2709
|
+
GET /pe/osce/sanctioned/ruc/20100994128
|
|
2710
|
+
GET /pe/osce/fines/ruc/10001108307
|
|
2711
|
+
GET /pe/redam/registry/dni/12345678
|
|
2712
|
+
GET /co/rues/registry/nit/0900073223
|
|
2713
|
+
|
|
2714
|
+
SEARCH
|
|
2715
|
+
GET /pe/sunat/padron/search?q=banco+credito
|
|
2716
|
+
GET /pe/osce/sanctioned/search?q=constructora
|
|
2717
|
+
GET /co/rues/registry/search?q=ecopetrol
|
|
2718
|
+
|
|
2719
|
+
LICITACIONES
|
|
2720
|
+
GET /pe/oece/tenders?q=servicio&limit=5
|
|
2721
|
+
GET /pe/oece/tenders/info
|
|
2722
|
+
|
|
2723
|
+
RESPONSE FORMAT
|
|
2724
|
+
Lookup: { "ruc": "20100047218", "razon_social": "BANCO DE CREDITO", ... }
|
|
2725
|
+
Search: [{ "ruc": "20100047218", "razon_social": "BANCO DE CREDITO", "estado": "ACTIVO" }, ...]
|
|
2726
|
+
|
|
2727
|
+
CLI
|
|
2728
|
+
latinfo pe sunat padron 20100047218
|
|
2729
|
+
latinfo pe sunat padron --search "banco credito"
|
|
2730
|
+
latinfo pe sunat padron --dni 09346247
|
|
2731
|
+
latinfo co rues registry --search "ecopetrol"`,
|
|
2732
|
+
};
|
|
2733
|
+
function docs(args) {
|
|
2734
|
+
const topic = args[0] || 'index';
|
|
2735
|
+
const content = DOCS[topic];
|
|
2736
|
+
if (!content) {
|
|
2737
|
+
console.error(`Unknown topic: ${topic}\n\nAvailable topics:`);
|
|
2738
|
+
console.log(DOCS.index);
|
|
2739
|
+
process.exit(1);
|
|
2740
|
+
}
|
|
2741
|
+
console.log(content);
|
|
2742
|
+
}
|
|
2324
2743
|
// --- Main ---
|
|
2325
2744
|
const [command, ...args] = rawArgs;
|
|
2326
2745
|
const COUNTRIES = ['pe', 'co', 'br', 'mx', 'ar', 'cl', 'ec'];
|
|
@@ -2389,6 +2808,9 @@ else {
|
|
|
2389
2808
|
case 'completion':
|
|
2390
2809
|
completion();
|
|
2391
2810
|
break;
|
|
2811
|
+
case 'docs':
|
|
2812
|
+
docs(args);
|
|
2813
|
+
break;
|
|
2392
2814
|
case 'help':
|
|
2393
2815
|
help();
|
|
2394
2816
|
break;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "latinfo",
|
|
3
|
-
"version": "0.12.2",
|
|
3
|
+
"version": "0.13.1",
|
|
4
4
|
"description": "Tax registry & procurement API for Latin America. Query RUC, DNI, NIT, licitaciones from Peru & Colombia. Offline MPHF search, full OCDS data, updated daily.",
|
|
5
5
|
"homepage": "https://latinfo.dev",
|
|
6
6
|
"repository": {
|