latinfo 0.12.2 → 0.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +441 -19
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -47,7 +47,7 @@ const local_search_1 = require("./local-search");
47
47
  const client_search_1 = require("./client-search");
48
48
  const odis_search_1 = require("./odis-search");
49
49
  const mphf_search_1 = require("./mphf-search");
50
- const VERSION = '0.12.2';
50
+ const VERSION = '0.13.1';
51
51
  const API_URL = process.env.LATINFO_API_URL || 'https://api.latinfo.dev';
52
52
  const GITHUB_CLIENT_ID = process.env.GITHUB_CLIENT_ID || 'Ov23li5fcQaiCsVtaMKK';
53
53
  const CONFIG_DIR = path_1.default.join(os_1.default.homedir(), '.latinfo');
@@ -1786,12 +1786,16 @@ async function pipeTest(args) {
1786
1786
  }
1787
1787
  }
1788
1788
  // Run import with --limit 100
1789
- const scriptPath = path_1.default.join(repo, 'src', 'imports', `${sourceName}.ts`);
1789
+ // Check YAML for custom import_script path
1790
+ const importScriptMatch = yamlContent.match(/import_script:\s*(.+)/);
1791
+ const customScript = importScriptMatch ? path_1.default.join(repo, importScriptMatch[1].trim()) : null;
1792
+ const defaultScript = path_1.default.join(repo, 'src', 'imports', `${sourceName}.ts`);
1793
+ const scriptPath = customScript && fs_1.default.existsSync(customScript) ? customScript
1794
+ : fs_1.default.existsSync(defaultScript) ? defaultScript : null;
1790
1795
  const easypipePath = path_1.default.join(repo, 'src', 'imports', 'easypipe.ts');
1791
- const useEasypipe = !fs_1.default.existsSync(scriptPath);
1792
- const cmd = useEasypipe
1793
- ? `npx tsx ${easypipePath} ${yamlPath} --limit 100 --local`
1794
- : `npx tsx ${scriptPath} --limit 100 --local`;
1796
+ const cmd = scriptPath
1797
+ ? `npx tsx ${scriptPath} --limit 100 --local`
1798
+ : `npx tsx ${easypipePath} ${yamlPath} --limit 100 --local`;
1795
1799
  console.log(`[pipe] Gate 1: TEST (100 records)\n`);
1796
1800
  console.log(`Running: ${cmd}\n`);
1797
1801
  let output = '';
@@ -1852,12 +1856,17 @@ async function pipeValidate(args) {
1852
1856
  requireGate(status, 'test', 'validate');
1853
1857
  const repo = getRepoPath();
1854
1858
  const { execSync: run } = await Promise.resolve().then(() => __importStar(require('child_process')));
1855
- const scriptPath = path_1.default.join(repo, 'src', 'imports', `${sourceName}.ts`);
1859
+ const yamlPath = path_1.default.join(repo, 'sources', `${sourceName}.yaml`);
1860
+ const yamlContent = fs_1.default.existsSync(yamlPath) ? fs_1.default.readFileSync(yamlPath, 'utf-8') : '';
1861
+ const importScriptMatch = yamlContent.match(/import_script:\s*(.+)/);
1862
+ const customScript = importScriptMatch ? path_1.default.join(repo, importScriptMatch[1].trim()) : null;
1863
+ const defaultScript = path_1.default.join(repo, 'src', 'imports', `${sourceName}.ts`);
1864
+ const scriptPath = customScript && fs_1.default.existsSync(customScript) ? customScript
1865
+ : fs_1.default.existsSync(defaultScript) ? defaultScript : null;
1856
1866
  const easypipePath = path_1.default.join(repo, 'src', 'imports', 'easypipe.ts');
1857
- const useEasypipe = !fs_1.default.existsSync(scriptPath);
1858
- const cmd = useEasypipe
1859
- ? `npx tsx ${easypipePath} ${path_1.default.join(repo, 'sources', `${sourceName}.yaml`)} --local`
1860
- : `npx tsx ${scriptPath} --local`;
1867
+ const cmd = scriptPath
1868
+ ? `npx tsx ${scriptPath} --local`
1869
+ : `npx tsx ${easypipePath} ${yamlPath} --local`;
1861
1870
  console.log(`[pipe] Gate 2: VALIDATE (full import, local only — no R2 upload)\n`);
1862
1871
  console.log(`Running: ${cmd}\n`);
1863
1872
  try {
@@ -2130,17 +2139,31 @@ async function pipePublish(args) {
2130
2139
  const { execSync: run } = await Promise.resolve().then(() => __importStar(require('child_process')));
2131
2140
  const RUNNER = 'f3mt0@100.109.82.87';
2132
2141
  console.log(`[pipe] Gate 4: PUBLISH\n`);
2133
- // 1. Git add + commit + push
2142
+ // 1. Auto-generate sources.ts from YAMLs
2143
+ console.log(`[pipe] Generating sources.ts...`);
2144
+ try {
2145
+ run(`npx tsx src/imports/generate-sources.ts`, { cwd: repo, stdio: 'inherit' });
2146
+ }
2147
+ catch {
2148
+ console.error(`[pipe] Failed to generate sources.ts`);
2149
+ process.exit(1);
2150
+ }
2151
+ // 2. Git add + commit + push
2134
2152
  console.log(`[pipe] Committing to repo...`);
2135
- const files = [`sources/${sourceName}.yaml`];
2153
+ const files = [`sources/${sourceName}.yaml`, 'src/sources.ts'];
2136
2154
  const scriptPath = path_1.default.join(repo, 'src', 'imports', `${sourceName}.ts`);
2137
- if (fs_1.default.existsSync(scriptPath))
2155
+ const yamlContent = fs_1.default.readFileSync(path_1.default.join(repo, 'sources', `${sourceName}.yaml`), 'utf-8');
2156
+ const importScriptMatch = yamlContent.match(/import_script:\s*(.+)/);
2157
+ const customScript = importScriptMatch ? importScriptMatch[1].trim() : null;
2158
+ if (customScript && fs_1.default.existsSync(path_1.default.join(repo, customScript)))
2159
+ files.push(customScript);
2160
+ else if (fs_1.default.existsSync(scriptPath))
2138
2161
  files.push(`src/imports/${sourceName}.ts`);
2162
+ const docsFile = `docs/sources/${sourceName}.md`;
2163
+ if (fs_1.default.existsSync(path_1.default.join(repo, docsFile)))
2164
+ files.push(docsFile);
2139
2165
  try {
2140
- const docsFile = `docs/sources/${sourceName}.md`;
2141
- if (fs_1.default.existsSync(path_1.default.join(repo, docsFile)))
2142
- files.push(docsFile);
2143
- run(`git add ${files.join(' ')} src/sources.ts .github/workflows/import.yml`, { cwd: repo, stdio: 'pipe' });
2166
+ run(`git add ${files.join(' ')}`, { cwd: repo, stdio: 'pipe' });
2144
2167
  run(`git commit -m "Add data source: ${sourceName}"`, { cwd: repo, stdio: 'pipe' });
2145
2168
  run(`git push`, { cwd: repo, stdio: 'pipe' });
2146
2169
  console.log(`[pipe] Pushed to remote.`);
@@ -2170,7 +2193,7 @@ async function pipePublish(args) {
2170
2193
  // 4. Restart search server
2171
2194
  console.log(`[pipe] Restarting search server on Linux Mint...`);
2172
2195
  try {
2173
- run(`ssh ${RUNNER} "sudo systemctl restart search-server 2>/dev/null || echo 'No service yet'"`, { stdio: 'inherit' });
2196
+ run(`ssh ${RUNNER} "sudo systemctl restart latinfo-search 2>/dev/null || echo 'No service yet'"`, { stdio: 'inherit' });
2174
2197
  }
2175
2198
  catch { }
2176
2199
  console.log(`\n[pipe] Gate 4 PASSED ✓`);
@@ -2321,6 +2344,402 @@ ENVIRONMENT
2321
2344
  LATINFO_REPO_PATH Auto-detected from cwd`);
2322
2345
  }
2323
2346
  }
2347
+ // --- Docs ---
2348
+ const DOCS = {
2349
+ index: `latinfo docs — complete documentation
2350
+
2351
+ TOPICS
2352
+ latinfo docs pipe How to create a data pipeline (full guide)
2353
+ latinfo docs fields searchFieldIndex, statusFieldIndex explained
2354
+ latinfo docs v2 V2 search index + MPHF (mandatory)
2355
+ latinfo docs encoding Encoding issues (latin1, UTF-8, replacement chars)
2356
+ latinfo docs script Import script template + requirements
2357
+ latinfo docs troubleshooting Common errors and fixes
2358
+ latinfo docs architecture How latinfo works internally
2359
+ latinfo docs api API endpoints and response format`,
2360
+ pipe: `HOW TO CREATE A DATA PIPELINE
2361
+
2362
+ latinfo pipe handles storage, indexing, search, and API serving automatically.
2363
+ Your only job is to write a script that produces a TSV file.
2364
+
2365
+ STEP BY STEP
2366
+
2367
+ 1. Create source definition:
2368
+ latinfo pipe create <country> <institution> <dataset> [flags]
2369
+
2370
+ Example:
2371
+ latinfo pipe create pe redam registry --url https://redam.pj.gob.pe/ --id-name dni --id-length 8
2372
+
2373
+ This generates a YAML config. Edit it to match your source.
2374
+
2375
+ 2. Write your import script (any method: fetch, Playwright, curl, Python):
2376
+ Your script must:
2377
+ a) Download the source data
2378
+ b) Parse to TSV: ID\\tfield1\\tfield2\\t...
2379
+ c) Sort by ID: LC_ALL=C sort -t'\\t' -k1,1
2380
+ d) Call buildBinaryFiles() → .bin + .idx
2381
+ e) Call buildSearchIndex() with statusFieldIndex → V2 search (MANDATORY)
2382
+ f) Call buildMphfFromIdx() → .mphf (MANDATORY)
2383
+ g) Call uploadToR2() for each file
2384
+ h) Call saveImportMeta()
2385
+
2386
+ See: latinfo docs script (for full template)
2387
+ See: latinfo docs fields (for searchFieldIndex and statusFieldIndex)
2388
+
2389
+ 3. Upload script:
2390
+ latinfo pipe script <source> ./my-script.ts
2391
+
2392
+ 4. Add dependencies (if any):
2393
+ latinfo pipe deps <source> playwright ddddocr
2394
+
2395
+ 5. Test (100 records, LOCAL — no R2 upload):
2396
+ latinfo pipe test <source>
2397
+
2398
+ 6. Validate (all records, LOCAL — no R2 upload):
2399
+ latinfo pipe validate <source>
2400
+
2401
+ 7. Stage (Linux Mint — import + 500 concurrent bench):
2402
+ latinfo pipe stage <source>
2403
+
2404
+ 8. Document:
2405
+ latinfo pipe docs <source>
2406
+
2407
+ 9. Publish to production:
2408
+ latinfo pipe publish <source>
2409
+
2410
+ GATES
2411
+ Each gate must pass before the next unlocks:
2412
+ test → validate → stage → docs → publish
2413
+
2414
+ test: 100 records, validates V2 + MPHF, no R2 upload
2415
+ validate: full import, all records, no R2 upload
2416
+ stage: runs on Linux Mint, uploads to R2, 500 concurrent bench (99.9% required)
2417
+ docs: documentation with required sections
2418
+ publish: deploy to Cloudflare, restart search server, smoke test, rollback on failure
2419
+
2420
+ NAMING
2421
+ Source name: {country}-{institution}-{dataset}
2422
+ Country: ISO 3166-1 alpha-2 lowercase (pe, co, br, mx, ec, ar, cl)
2423
+ Institution: government agency abbreviation, lowercase
2424
+ Dataset: what the data contains, english, lowercase
2425
+ Examples: pe-sunat-padron, pe-osce-sanctioned, co-rues-registry, pe-redam-registry`,
2426
+ fields: `FIELD INDEXES: searchFieldIndex and statusFieldIndex
2427
+
2428
+ Both go in buildSearchIndex() inside your import script.
2429
+ NOT in src/sources.ts. NOT in the YAML config.
2430
+
2431
+ searchFieldIndex
2432
+ Which TSV column (after ID) is the main searchable text.
2433
+ Almost always 0 (the first field after the primary ID).
2434
+ This field gets tokenized and indexed for full-text search.
2435
+
2436
+ statusFieldIndex
2437
+ Which TSV column (after ID) appears as a short label in search results.
2438
+ Choose the field that helps identify records at a glance.
2439
+
2440
+ | Source type | statusFieldIndex | Field | Example value |
2441
+ |---------------|-----------------|-----------------|------------------------|
2442
+ | Companies | 1 | estado | ACTIVO, CANCELADA |
2443
+ | Sanctions | 1 | date_start | 20230103 |
2444
+ | People (REDAM)| 4 | fecha_registro | 2022-10-27 |
2445
+ | Fines | 1 | date_start | 20230103 |
2446
+
2447
+ HOW TO COUNT
2448
+
2449
+ Your TSV looks like: ID\\tfield0\\tfield1\\tfield2\\tfield3\\tfield4
2450
+ The index is 0-based, counting from the first field AFTER the ID.
2451
+
2452
+ Example REDAM TSV:
2453
+ 12345678\\tROSALES\\tORTIZ\\tJUAN\\tDNI\\t2022-10-27
2454
+ ID 0(ape_p) 1(ape_m) 2(nombres) 3(tipo) 4(fecha)
2455
+
2456
+ searchFieldIndex: 0 → search by ape_paterno (last name)
2457
+ statusFieldIndex: 4 → show fecha_registro in search results
2458
+
2459
+ CODE EXAMPLE
2460
+
2461
+ // In your import script (NOT in sources.ts):
2462
+ const search = await buildSearchIndex(
2463
+ sortedPath, TEMP_DIR, SOURCE,
2464
+ { searchFieldIndex: 0, idRegex: /^\\d{8}$/, statusFieldIndex: 4 },
2465
+ recordCount,
2466
+ );
2467
+
2468
+ IMPORTANT
2469
+ - statusFieldIndex is MANDATORY (V2). Without it, you get V1 which is deprecated.
2470
+ - If pipe test says "V1" in the output, you forgot statusFieldIndex.
2471
+ - These indexes do NOT go in src/sources.ts — that file is for the API router.`,
2472
+ v2: `V2 SEARCH INDEX + MPHF
2473
+
2474
+ V2 is MANDATORY for all sources. V1 is deprecated.
2475
+
2476
+ WHAT IS V2?
2477
+ V2 stores name + status inline in the search posting list (110 bytes per entry).
2478
+ Search results come directly from the index — no secondary R2 lookup needed.
2479
+ V1 only stored the ID (8 bytes) and required a second read to get the name.
2480
+
2481
+ HOW TO ENABLE V2
2482
+ Pass statusFieldIndex to buildSearchIndex():
2483
+
2484
+ buildSearchIndex(sortedPath, TEMP_DIR, SOURCE,
2485
+ { searchFieldIndex: 0, idRegex: /^\\d{11}$/, statusFieldIndex: 1 },
2486
+ recordCount,
2487
+ );
2488
+
2489
+ See: latinfo docs fields (for how to choose statusFieldIndex)
2490
+
2491
+ WHAT IS MPHF?
2492
+ Minimal Perfect Hash Function. A compact dictionary (~500KB) that enables
2493
+ client-side offline search. The client downloads it once and searches locally
2494
+ without hitting the server.
2495
+
2496
+ HOW TO GENERATE MPHF
2497
+ Always call buildMphfFromIdx() after buildSearchIndex():
2498
+
2499
+ const mphfPath = buildMphfFromIdx(search.idxPath);
2500
+ uploadToR2(mphfPath, \`\${SOURCE}-search.mphf\`);
2501
+
2502
+ DETECTION
2503
+ pipe test checks script output for "V1" or "V2" markers.
2504
+ If it sees V1, the gate fails with:
2505
+ "Search index is V1 — MUST use V2. Add statusFieldIndex to buildSearchIndex()"`,
2506
+ encoding: `ENCODING ISSUES
2507
+
2508
+ COMMON PROBLEM
2509
+ Source file is ISO-8859-1 (latin1) but read as UTF-8.
2510
+ Characters like ó, ñ, é appear as replacement char (U+FFFD).
2511
+
2512
+ HOW TO DETECT
2513
+ Run: file downloaded-file.csv
2514
+ If it says "ISO-8859 text" → it's latin1.
2515
+
2516
+ HOW TO FIX
2517
+ Read with latin1, write with UTF-8:
2518
+
2519
+ const input = createReadStream(csvPath, { encoding: 'latin1' });
2520
+ const output = createWriteStream(tsvPath, { encoding: 'utf-8' });
2521
+
2522
+ Node's latin1 decoder converts 0xF3 (ó) to the correct JS string,
2523
+ then UTF-8 writer encodes it as 0xC3B3.
2524
+
2525
+ TRUNCATION BUG (FIXED)
2526
+ build-binary.ts truncates fields to 255 bytes. If the cut lands in the middle
2527
+ of a multi-byte UTF-8 character, it produces replacement chars.
2528
+ This is fixed — build-binary.ts now backs up to the last valid UTF-8 boundary.
2529
+
2530
+ YAML CONFIG
2531
+ Set encoding in your YAML:
2532
+ encoding: iso-8859-1
2533
+
2534
+ Or for UTF-8 (default):
2535
+ encoding: utf-8`,
2536
+ script: `IMPORT SCRIPT TEMPLATE
2537
+
2538
+ Your script downloads data and produces a sorted TSV. That's it.
2539
+ latinfo handles everything else (binary format, search index, MPHF, R2 upload).
2540
+
2541
+ FULL TEMPLATE
2542
+
2543
+ import { createReadStream, createWriteStream } from 'fs';
2544
+ import * as readline from 'readline';
2545
+ import * as https from 'https';
2546
+ import * as fs from 'fs';
2547
+ import * as path from 'path';
2548
+ import { execSync } from 'child_process';
2549
+ import { buildBinaryFiles } from './build-binary';
2550
+ import { buildSearchIndex } from './build-search-index';
2551
+ import { uploadToR2, saveImportMeta, buildMphfFromIdx } from './shared';
2552
+
2553
+ const SOURCE = 'pe-example-dataset'; // must match YAML name
2554
+ const TEMP_DIR = \`/tmp/\${SOURCE}-import\`;
2555
+
2556
+ export async function importExample(options?: { limit?: number }) {
2557
+ console.log(\`=== \${SOURCE.toUpperCase()} IMPORT ===\\n\`);
2558
+ try {
2559
+ fs.mkdirSync(TEMP_DIR, { recursive: true });
2560
+
2561
+ // 1. Download
2562
+ // ... your download logic (fetch, Playwright, curl, etc.)
2563
+
2564
+ // 2. Parse to TSV
2565
+ const tsvPath = path.join(TEMP_DIR, 'parsed.tsv');
2566
+ const output = createWriteStream(tsvPath, { encoding: 'utf-8' });
2567
+ // ... parse source → output.write(\`\${id}\\t\${field1}\\t\${field2}\\n\`)
2568
+ await new Promise<void>(r => output.end(r));
2569
+
2570
+ // 3. Sort
2571
+ const sortedPath = path.join(TEMP_DIR, 'sorted.tsv');
2572
+ execSync(\`LC_ALL=C sort -t'\\t' -k1,1 "\${tsvPath}" -o "\${sortedPath}"\`);
2573
+ fs.unlinkSync(tsvPath);
2574
+
2575
+ // 4. Build binary
2576
+ const config = { idLength: 11, idRegex: /^\\d{11}$/, prefixLength: 5, fieldCount: 6 };
2577
+ const { shardPaths, idxPath, recordCount } = await buildBinaryFiles(
2578
+ sortedPath, TEMP_DIR, SOURCE, config);
2579
+
2580
+ // 5. Build V2 search (MANDATORY: pass statusFieldIndex)
2581
+ const search = await buildSearchIndex(sortedPath, TEMP_DIR, SOURCE,
2582
+ { searchFieldIndex: 0, idRegex: /^\\d{11}$/, statusFieldIndex: 1 },
2583
+ recordCount);
2584
+
2585
+ // 6. Delete sorted TSV
2586
+ fs.unlinkSync(sortedPath);
2587
+
2588
+ // 7. Upload to R2
2589
+ for (let i = 0; i < shardPaths.length; i++)
2590
+ uploadToR2(shardPaths[i], \`\${SOURCE}-\${i}.bin\`);
2591
+ uploadToR2(idxPath, \`\${SOURCE}.idx\`);
2592
+
2593
+ // 8. Build + upload MPHF (MANDATORY)
2594
+ const mphfPath = buildMphfFromIdx(search.idxPath);
2595
+ uploadToR2(search.idxPath, \`\${SOURCE}-search.idx\`);
2596
+ uploadToR2(mphfPath, \`\${SOURCE}-search.mphf\`);
2597
+ for (let i = 0; i < search.shardPaths.length; i++)
2598
+ uploadToR2(search.shardPaths[i], \`\${SOURCE}-search-\${i}.dat\`);
2599
+
2600
+ // 9. Save metadata
2601
+ saveImportMeta(SOURCE, new Date().toISOString(), recordCount);
2602
+
2603
+ // 10. Cleanup
2604
+ fs.rmSync(TEMP_DIR, { recursive: true, force: true });
2605
+ console.log(\`\\n[\${SOURCE}] Success: \${recordCount.toLocaleString()} records\`);
2606
+ return true;
2607
+ } catch (error) {
2608
+ console.error(\`\\n[\${SOURCE}] Error:\`, error);
2609
+ return false;
2610
+ }
2611
+ }
2612
+
2613
+ if (require.main === module) {
2614
+ const args = process.argv.slice(2);
2615
+ const limitIdx = args.indexOf('--limit');
2616
+ const limit = limitIdx !== -1 ? parseInt(args[limitIdx + 1]) : undefined;
2617
+ importExample({ limit }).then(ok => process.exit(ok ? 0 : 1));
2618
+ }
2619
+
2620
+ CLEAN EXAMPLE
2621
+ See src/imports/pe-osce-sanctioned.ts (CSV, latin1, 9K records)
2622
+
2623
+ IMPORTANT
2624
+ - SOURCE const must match the YAML filename (without .yaml)
2625
+ - Always use V2 (statusFieldIndex) — see: latinfo docs fields
2626
+ - Always generate MPHF — see: latinfo docs v2
2627
+ - Use --local flag support: the shared.ts uploadToR2 skips uploads when --local is passed`,
2628
+ troubleshooting: `COMMON ERRORS AND FIXES
2629
+
2630
+ ENCODING
2631
+ Problem: ó appears as replacement char in output
2632
+ Fix: Read source as latin1, write TSV as UTF-8
2633
+ See: latinfo docs encoding
2634
+
2635
+ V1 INSTEAD OF V2
2636
+ Problem: pipe test says "Search index is V1"
2637
+ Fix: Add statusFieldIndex to buildSearchIndex()
2638
+ See: latinfo docs fields
2639
+
2640
+ NO MPHF
2641
+ Problem: pipe test says "No MPHF generated"
2642
+ Fix: Call buildMphfFromIdx() after buildSearchIndex()
2643
+ See: latinfo docs v2
2644
+
2645
+ HTTP 418 / 403
2646
+ Problem: Source server blocks download
2647
+ Fix: Add User-Agent header to your request:
2648
+ headers: { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' }
2649
+
2650
+ 0 RECORDS PARSED
2651
+ Problem: Import succeeds but no records
2652
+ Fix: Check delimiter and column indexes in YAML match the source file
2653
+
2654
+ UPLOAD FAILS (CLOUDFLARE_API_TOKEN)
2655
+ Problem: uploadToR2 fails with "set CLOUDFLARE_API_TOKEN"
2656
+ Fix: Ensure .env has CLOUDFLARE_API_TOKEN. For local testing use --local flag.
2657
+ pipe test and pipe validate already use --local (no R2 upload).
2658
+
2659
+ GATE LOCKED
2660
+ Problem: "Gate X has not passed. Run: latinfo pipe X"
2661
+ Fix: Gates are sequential. Run them in order: test → validate → stage → docs → publish
2662
+
2663
+ SCRIPT CHANGED, GATES RESET
2664
+ Problem: After running pipe script, all gates show "not run"
2665
+ This is expected. Uploading a new script resets all gates — re-run from test.`,
2666
+ architecture: `HOW LATINFO WORKS
2667
+
2668
+ DATA FLOW
2669
+ Source (CSV/API/crawler) → TSV → sort → binary (.bin + .idx) → R2
2670
+ → search index (.idx + .dat) → R2 / Linux Mint
2671
+ → MPHF (.mphf) → R2
2672
+
2673
+ SERVING
2674
+ Lookups (by ID): User → Cloudflare Worker → R2 range read → response
2675
+ Search (by name): User → Cloudflare Worker → Linux Mint (search server, RAM) → response
2676
+ Offline search: Client downloads .mphf once → searches locally, zero server
2677
+
2678
+ STORAGE
2679
+ R2 (Cloudflare): .bin shards (~200MB each), .idx prefix index (~300KB), search shards
2680
+ Linux Mint RAM: search index + posting lists (~2GB per source) for fast search
2681
+
2682
+ BINARY FORMAT
2683
+ .bin: records sorted by ID, each record = uint16 length + ID + fields (uint8 len + data)
2684
+ .idx: prefix index, 16-byte entries (prefix, shard, offset, length)
2685
+ Fields truncated to 255 bytes with UTF-8 boundary safety
2686
+
2687
+ SEARCH
2688
+ V2 search index: inverted index with inline name + status (110 bytes per posting)
2689
+ Tokenized, IDF-scored, prefix matching, stop words filtered
2690
+ Search server: Node cluster on Linux Mint, 2 workers, ~670 queries/sec
2691
+
2692
+ INFRASTRUCTURE
2693
+ Cloudflare Workers: API routing, auth, rate limiting, lookups
2694
+ Cloudflare D1: auth database (api_keys, usage)
2695
+ Cloudflare R2: data storage
2696
+ Linux Mint: search server (RAM), GitHub Actions runner (imports)
2697
+ Cloudflare Tunnel: search.latinfo.dev → Linux Mint port 3001`,
2698
+ api: `API ENDPOINTS
2699
+
2700
+ All endpoints require Authorization: Bearer <api_key>
2701
+
2702
+ ROUTE PATTERN
2703
+ /{country}/{institution}/{dataset}/{action}
2704
+
2705
+ LOOKUP BY ID
2706
+ GET /pe/sunat/padron/ruc/20100047218
2707
+ GET /pe/sunat/padron/dni/09346247
2708
+ GET /pe/sunat/coactiva/ruc/20348858182
2709
+ GET /pe/osce/sanctioned/ruc/20100994128
2710
+ GET /pe/osce/fines/ruc/10001108307
2711
+ GET /pe/redam/registry/dni/12345678
2712
+ GET /co/rues/registry/nit/0900073223
2713
+
2714
+ SEARCH
2715
+ GET /pe/sunat/padron/search?q=banco+credito
2716
+ GET /pe/osce/sanctioned/search?q=constructora
2717
+ GET /co/rues/registry/search?q=ecopetrol
2718
+
2719
+ LICITACIONES
2720
+ GET /pe/oece/tenders?q=servicio&limit=5
2721
+ GET /pe/oece/tenders/info
2722
+
2723
+ RESPONSE FORMAT
2724
+ Lookup: { "ruc": "20100047218", "razon_social": "BANCO DE CREDITO", ... }
2725
+ Search: [{ "ruc": "20100047218", "razon_social": "BANCO DE CREDITO", "estado": "ACTIVO" }, ...]
2726
+
2727
+ CLI
2728
+ latinfo pe sunat padron 20100047218
2729
+ latinfo pe sunat padron --search "banco credito"
2730
+ latinfo pe sunat padron --dni 09346247
2731
+ latinfo co rues registry --search "ecopetrol"`,
2732
+ };
2733
+ function docs(args) {
2734
+ const topic = args[0] || 'index';
2735
+ const content = DOCS[topic];
2736
+ if (!content) {
2737
+ console.error(`Unknown topic: ${topic}\n\nAvailable topics:`);
2738
+ console.log(DOCS.index);
2739
+ process.exit(1);
2740
+ }
2741
+ console.log(content);
2742
+ }
2324
2743
  // --- Main ---
2325
2744
  const [command, ...args] = rawArgs;
2326
2745
  const COUNTRIES = ['pe', 'co', 'br', 'mx', 'ar', 'cl', 'ec'];
@@ -2389,6 +2808,9 @@ else {
2389
2808
  case 'completion':
2390
2809
  completion();
2391
2810
  break;
2811
+ case 'docs':
2812
+ docs(args);
2813
+ break;
2392
2814
  case 'help':
2393
2815
  help();
2394
2816
  break;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "latinfo",
3
- "version": "0.12.2",
3
+ "version": "0.13.1",
4
4
  "description": "Tax registry & procurement API for Latin America. Query RUC, DNI, NIT, licitaciones from Peru & Colombia. Offline MPHF search, full OCDS data, updated daily.",
5
5
  "homepage": "https://latinfo.dev",
6
6
  "repository": {