latinfo 0.12.2 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +400 -1
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -47,7 +47,7 @@ const local_search_1 = require("./local-search");
47
47
  const client_search_1 = require("./client-search");
48
48
  const odis_search_1 = require("./odis-search");
49
49
  const mphf_search_1 = require("./mphf-search");
50
- const VERSION = '0.12.2';
50
+ const VERSION = '0.13.0';
51
51
  const API_URL = process.env.LATINFO_API_URL || 'https://api.latinfo.dev';
52
52
  const GITHUB_CLIENT_ID = process.env.GITHUB_CLIENT_ID || 'Ov23li5fcQaiCsVtaMKK';
53
53
  const CONFIG_DIR = path_1.default.join(os_1.default.homedir(), '.latinfo');
@@ -2321,6 +2321,402 @@ ENVIRONMENT
2321
2321
  LATINFO_REPO_PATH Auto-detected from cwd`);
2322
2322
  }
2323
2323
  }
2324
+ // --- Docs ---
2325
+ const DOCS = {
2326
+ index: `latinfo docs — complete documentation
2327
+
2328
+ TOPICS
2329
+ latinfo docs pipe How to create a data pipeline (full guide)
2330
+ latinfo docs fields searchFieldIndex, statusFieldIndex explained
2331
+ latinfo docs v2 V2 search index + MPHF (mandatory)
2332
+ latinfo docs encoding Encoding issues (latin1, UTF-8, replacement chars)
2333
+ latinfo docs script Import script template + requirements
2334
+ latinfo docs troubleshooting Common errors and fixes
2335
+ latinfo docs architecture How latinfo works internally
2336
+ latinfo docs api API endpoints and response format`,
2337
+ pipe: `HOW TO CREATE A DATA PIPELINE
2338
+
2339
+ latinfo pipe handles storage, indexing, search, and API serving automatically.
2340
+ Your only job is to write a script that produces a TSV file.
2341
+
2342
+ STEP BY STEP
2343
+
2344
+ 1. Create source definition:
2345
+ latinfo pipe create <country> <institution> <dataset> [flags]
2346
+
2347
+ Example:
2348
+ latinfo pipe create pe redam registry --url https://redam.pj.gob.pe/ --id-name dni --id-length 8
2349
+
2350
+ This generates a YAML config. Edit it to match your source.
2351
+
2352
+ 2. Write your import script (any method: fetch, Playwright, curl, Python):
2353
+ Your script must:
2354
+ a) Download the source data
2355
+ b) Parse to TSV: ID\\tfield1\\tfield2\\t...
2356
+ c) Sort by ID: LC_ALL=C sort -t'\\t' -k1,1
2357
+ d) Call buildBinaryFiles() → .bin + .idx
2358
+ e) Call buildSearchIndex() with statusFieldIndex → V2 search (MANDATORY)
2359
+ f) Call buildMphfFromIdx() → .mphf (MANDATORY)
2360
+ g) Call uploadToR2() for each file
2361
+ h) Call saveImportMeta()
2362
+
2363
+ See: latinfo docs script (for full template)
2364
+ See: latinfo docs fields (for searchFieldIndex and statusFieldIndex)
2365
+
2366
+ 3. Upload script:
2367
+ latinfo pipe script <source> ./my-script.ts
2368
+
2369
+ 4. Add dependencies (if any):
2370
+ latinfo pipe deps <source> playwright ddddocr
2371
+
2372
+ 5. Test (100 records, LOCAL — no R2 upload):
2373
+ latinfo pipe test <source>
2374
+
2375
+ 6. Validate (all records, LOCAL — no R2 upload):
2376
+ latinfo pipe validate <source>
2377
+
2378
+ 7. Stage (Linux Mint — import + 500 concurrent bench):
2379
+ latinfo pipe stage <source>
2380
+
2381
+ 8. Document:
2382
+ latinfo pipe docs <source>
2383
+
2384
+ 9. Publish to production:
2385
+ latinfo pipe publish <source>
2386
+
2387
+ GATES
2388
+ Each gate must pass before the next unlocks:
2389
+ test → validate → stage → docs → publish
2390
+
2391
+ test: 100 records, validates V2 + MPHF, no R2 upload
2392
+ validate: full import, all records, no R2 upload
2393
+ stage: runs on Linux Mint, uploads to R2, 500 concurrent bench (99.9% required)
2394
+ docs: documentation with required sections
2395
+ publish: deploy to Cloudflare, restart search server, smoke test, rollback on failure
2396
+
2397
+ NAMING
2398
+ Source name: {country}-{institution}-{dataset}
2399
+ Country: ISO 3166-1 alpha-2 lowercase (pe, co, br, mx, ec, ar, cl)
2400
+ Institution: government agency abbreviation, lowercase
2401
+ Dataset: what the data contains, english, lowercase
2402
+ Examples: pe-sunat-padron, pe-osce-sanctioned, co-rues-registry, pe-redam-registry`,
2403
+ fields: `FIELD INDEXES: searchFieldIndex and statusFieldIndex
2404
+
2405
+ Both go in buildSearchIndex() inside your import script.
2406
+ NOT in src/sources.ts. NOT in the YAML config.
2407
+
2408
+ searchFieldIndex
2409
+ Which TSV column (after ID) is the main searchable text.
2410
+ Almost always 0 (the first field after the primary ID).
2411
+ This field gets tokenized and indexed for full-text search.
2412
+
2413
+ statusFieldIndex
2414
+ Which TSV column (after ID) appears as a short label in search results.
2415
+ Choose the field that helps identify records at a glance.
2416
+
2417
+ | Source type | statusFieldIndex | Field | Example value |
2418
+ |---------------|-----------------|-----------------|------------------------|
2419
+ | Companies | 1 | estado | ACTIVO, CANCELADA |
2420
+ | Sanctions | 1 | date_start | 20230103 |
2421
+ | People (REDAM)| 4 | fecha_registro | 2022-10-27 |
2422
+ | Fines | 1 | date_start | 20230103 |
2423
+
2424
+ HOW TO COUNT
2425
+
2426
+ Your TSV looks like: ID\\tfield0\\tfield1\\tfield2\\tfield3\\tfield4
2427
+ The index is 0-based, counting from the first field AFTER the ID.
2428
+
2429
+ Example REDAM TSV:
2430
+ 12345678\\tROSALES\\tORTIZ\\tJUAN\\tDNI\\t2022-10-27
2431
+ ID 0(ape_p) 1(ape_m) 2(nombres) 3(tipo) 4(fecha)
2432
+
2433
+ searchFieldIndex: 0 → search by ape_paterno (last name)
2434
+ statusFieldIndex: 4 → show fecha_registro in search results
2435
+
2436
+ CODE EXAMPLE
2437
+
2438
+ // In your import script (NOT in sources.ts):
2439
+ const search = await buildSearchIndex(
2440
+ sortedPath, TEMP_DIR, SOURCE,
2441
+ { searchFieldIndex: 0, idRegex: /^\\d{8}$/, statusFieldIndex: 4 },
2442
+ recordCount,
2443
+ );
2444
+
2445
+ IMPORTANT
2446
+ - statusFieldIndex is MANDATORY (V2). Without it, you get V1 which is deprecated.
2447
+ - If pipe test says "V1" in the output, you forgot statusFieldIndex.
2448
+ - These indexes do NOT go in src/sources.ts — that file is for the API router.`,
2449
+ v2: `V2 SEARCH INDEX + MPHF
2450
+
2451
+ V2 is MANDATORY for all sources. V1 is deprecated.
2452
+
2453
+ WHAT IS V2?
2454
+ V2 stores name + status inline in the search posting list (110 bytes per entry).
2455
+ Search results come directly from the index — no secondary R2 lookup needed.
2456
+ V1 only stored the ID (8 bytes) and required a second read to get the name.
2457
+
2458
+ HOW TO ENABLE V2
2459
+ Pass statusFieldIndex to buildSearchIndex():
2460
+
2461
+ buildSearchIndex(sortedPath, TEMP_DIR, SOURCE,
2462
+ { searchFieldIndex: 0, idRegex: /^\\d{11}$/, statusFieldIndex: 1 },
2463
+ recordCount,
2464
+ );
2465
+
2466
+ See: latinfo docs fields (for how to choose statusFieldIndex)
2467
+
2468
+ WHAT IS MPHF?
2469
+ Minimal Perfect Hash Function. A compact dictionary (~500KB) that enables
2470
+ client-side offline search. The client downloads it once and searches locally
2471
+ without hitting the server.
2472
+
2473
+ HOW TO GENERATE MPHF
2474
+ Always call buildMphfFromIdx() after buildSearchIndex():
2475
+
2476
+ const mphfPath = buildMphfFromIdx(search.idxPath);
2477
+ uploadToR2(mphfPath, \`\${SOURCE}-search.mphf\`);
2478
+
2479
+ DETECTION
2480
+ pipe test checks script output for "V1" or "V2" markers.
2481
+ If it sees V1, the gate fails with:
2482
+ "Search index is V1 — MUST use V2. Add statusFieldIndex to buildSearchIndex()"`,
2483
+ encoding: `ENCODING ISSUES
2484
+
2485
+ COMMON PROBLEM
2486
+ Source file is ISO-8859-1 (latin1) but read as UTF-8.
2487
+ Characters like ó, ñ, é appear as replacement char (U+FFFD).
2488
+
2489
+ HOW TO DETECT
2490
+ Run: file downloaded-file.csv
2491
+ If it says "ISO-8859 text" → it's latin1.
2492
+
2493
+ HOW TO FIX
2494
+ Read with latin1, write with UTF-8:
2495
+
2496
+ const input = createReadStream(csvPath, { encoding: 'latin1' });
2497
+ const output = createWriteStream(tsvPath, { encoding: 'utf-8' });
2498
+
2499
+ Node's latin1 decoder converts 0xF3 (ó) to the correct JS string,
2500
+ then UTF-8 writer encodes it as 0xC3B3.
2501
+
2502
+ TRUNCATION BUG (FIXED)
2503
+ build-binary.ts truncates fields to 255 bytes. If the cut lands in the middle
2504
+ of a multi-byte UTF-8 character, it produces replacement chars.
2505
+ This is fixed — build-binary.ts now backs up to the last valid UTF-8 boundary.
2506
+
2507
+ YAML CONFIG
2508
+ Set encoding in your YAML:
2509
+ encoding: iso-8859-1
2510
+
2511
+ Or for UTF-8 (default):
2512
+ encoding: utf-8`,
2513
+ script: `IMPORT SCRIPT TEMPLATE
2514
+
2515
+ Your script downloads data and produces a sorted TSV. That's it.
2516
+ latinfo handles everything else (binary format, search index, MPHF, R2 upload).
2517
+
2518
+ FULL TEMPLATE
2519
+
2520
+ import { createReadStream, createWriteStream } from 'fs';
2521
+ import * as readline from 'readline';
2522
+ import * as https from 'https';
2523
+ import * as fs from 'fs';
2524
+ import * as path from 'path';
2525
+ import { execSync } from 'child_process';
2526
+ import { buildBinaryFiles } from './build-binary';
2527
+ import { buildSearchIndex } from './build-search-index';
2528
+ import { uploadToR2, saveImportMeta, buildMphfFromIdx } from './shared';
2529
+
2530
+ const SOURCE = 'pe-example-dataset'; // must match YAML name
2531
+ const TEMP_DIR = \`/tmp/\${SOURCE}-import\`;
2532
+
2533
+ export async function importExample(options?: { limit?: number }) {
2534
+ console.log(\`=== \${SOURCE.toUpperCase()} IMPORT ===\\n\`);
2535
+ try {
2536
+ fs.mkdirSync(TEMP_DIR, { recursive: true });
2537
+
2538
+ // 1. Download
2539
+ // ... your download logic (fetch, Playwright, curl, etc.)
2540
+
2541
+ // 2. Parse to TSV
2542
+ const tsvPath = path.join(TEMP_DIR, 'parsed.tsv');
2543
+ const output = createWriteStream(tsvPath, { encoding: 'utf-8' });
2544
+ // ... parse source → output.write(\`\${id}\\t\${field1}\\t\${field2}\\n\`)
2545
+ await new Promise<void>(r => output.end(r));
2546
+
2547
+ // 3. Sort
2548
+ const sortedPath = path.join(TEMP_DIR, 'sorted.tsv');
2549
+ execSync(\`LC_ALL=C sort -t'\\t' -k1,1 "\${tsvPath}" -o "\${sortedPath}"\`);
2550
+ fs.unlinkSync(tsvPath);
2551
+
2552
+ // 4. Build binary
2553
+ const config = { idLength: 11, idRegex: /^\\d{11}$/, prefixLength: 5, fieldCount: 6 };
2554
+ const { shardPaths, idxPath, recordCount } = await buildBinaryFiles(
2555
+ sortedPath, TEMP_DIR, SOURCE, config);
2556
+
2557
+ // 5. Build V2 search (MANDATORY: pass statusFieldIndex)
2558
+ const search = await buildSearchIndex(sortedPath, TEMP_DIR, SOURCE,
2559
+ { searchFieldIndex: 0, idRegex: /^\\d{11}$/, statusFieldIndex: 1 },
2560
+ recordCount);
2561
+
2562
+ // 6. Delete sorted TSV
2563
+ fs.unlinkSync(sortedPath);
2564
+
2565
+ // 7. Upload to R2
2566
+ for (let i = 0; i < shardPaths.length; i++)
2567
+ uploadToR2(shardPaths[i], \`\${SOURCE}-\${i}.bin\`);
2568
+ uploadToR2(idxPath, \`\${SOURCE}.idx\`);
2569
+
2570
+ // 8. Build + upload MPHF (MANDATORY)
2571
+ const mphfPath = buildMphfFromIdx(search.idxPath);
2572
+ uploadToR2(search.idxPath, \`\${SOURCE}-search.idx\`);
2573
+ uploadToR2(mphfPath, \`\${SOURCE}-search.mphf\`);
2574
+ for (let i = 0; i < search.shardPaths.length; i++)
2575
+ uploadToR2(search.shardPaths[i], \`\${SOURCE}-search-\${i}.dat\`);
2576
+
2577
+ // 9. Save metadata
2578
+ saveImportMeta(SOURCE, new Date().toISOString(), recordCount);
2579
+
2580
+ // 10. Cleanup
2581
+ fs.rmSync(TEMP_DIR, { recursive: true, force: true });
2582
+ console.log(\`\\n[\${SOURCE}] Success: \${recordCount.toLocaleString()} records\`);
2583
+ return true;
2584
+ } catch (error) {
2585
+ console.error(\`\\n[\${SOURCE}] Error:\`, error);
2586
+ return false;
2587
+ }
2588
+ }
2589
+
2590
+ if (require.main === module) {
2591
+ const args = process.argv.slice(2);
2592
+ const limitIdx = args.indexOf('--limit');
2593
+ const limit = limitIdx !== -1 ? parseInt(args[limitIdx + 1]) : undefined;
2594
+ importExample({ limit }).then(ok => process.exit(ok ? 0 : 1));
2595
+ }
2596
+
2597
+ CLEAN EXAMPLE
2598
+ See src/imports/pe-osce-sanctioned.ts (CSV, latin1, 9K records)
2599
+
2600
+ IMPORTANT
2601
+ - SOURCE const must match the YAML filename (without .yaml)
2602
+ - Always use V2 (statusFieldIndex) — see: latinfo docs fields
2603
+ - Always generate MPHF — see: latinfo docs v2
2604
+ - Use --local flag support: the shared.ts uploadToR2 skips uploads when --local is passed`,
2605
+ troubleshooting: `COMMON ERRORS AND FIXES
2606
+
2607
+ ENCODING
2608
+ Problem: ó appears as replacement char in output
2609
+ Fix: Read source as latin1, write TSV as UTF-8
2610
+ See: latinfo docs encoding
2611
+
2612
+ V1 INSTEAD OF V2
2613
+ Problem: pipe test says "Search index is V1"
2614
+ Fix: Add statusFieldIndex to buildSearchIndex()
2615
+ See: latinfo docs fields
2616
+
2617
+ NO MPHF
2618
+ Problem: pipe test says "No MPHF generated"
2619
+ Fix: Call buildMphfFromIdx() after buildSearchIndex()
2620
+ See: latinfo docs v2
2621
+
2622
+ HTTP 418 / 403
2623
+ Problem: Source server blocks download
2624
+ Fix: Add User-Agent header to your request:
2625
+ headers: { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' }
2626
+
2627
+ 0 RECORDS PARSED
2628
+ Problem: Import succeeds but no records
2629
+ Fix: Check delimiter and column indexes in YAML match the source file
2630
+
2631
+ UPLOAD FAILS (CLOUDFLARE_API_TOKEN)
2632
+ Problem: uploadToR2 fails with "set CLOUDFLARE_API_TOKEN"
2633
+ Fix: Ensure .env has CLOUDFLARE_API_TOKEN. For local testing use --local flag.
2634
+ pipe test and pipe validate already use --local (no R2 upload).
2635
+
2636
+ GATE LOCKED
2637
+ Problem: "Gate X has not passed. Run: latinfo pipe X"
2638
+ Fix: Gates are sequential. Run them in order: test → validate → stage → docs → publish
2639
+
2640
+ SCRIPT CHANGED, GATES RESET
2641
+ Problem: After running pipe script, all gates show "not run"
2642
+ This is expected. Uploading a new script resets all gates — re-run from test.`,
2643
+ architecture: `HOW LATINFO WORKS
2644
+
2645
+ DATA FLOW
2646
+ Source (CSV/API/crawler) → TSV → sort → binary (.bin + .idx) → R2
2647
+ → search index (.idx + .dat) → R2 / Linux Mint
2648
+ → MPHF (.mphf) → R2
2649
+
2650
+ SERVING
2651
+ Lookups (by ID): User → Cloudflare Worker → R2 range read → response
2652
+ Search (by name): User → Cloudflare Worker → Linux Mint (search server, RAM) → response
2653
+ Offline search: Client downloads .mphf once → searches locally, zero server
2654
+
2655
+ STORAGE
2656
+ R2 (Cloudflare): .bin shards (~200MB each), .idx prefix index (~300KB), search shards
2657
+ Linux Mint RAM: search index + posting lists (~2GB per source) for fast search
2658
+
2659
+ BINARY FORMAT
2660
+ .bin: records sorted by ID, each record = uint16 length + ID + fields (uint8 len + data)
2661
+ .idx: prefix index, 16-byte entries (prefix, shard, offset, length)
2662
+ Fields truncated to 255 bytes with UTF-8 boundary safety
2663
+
2664
+ SEARCH
2665
+ V2 search index: inverted index with inline name + status (110 bytes per posting)
2666
+ Tokenized, IDF-scored, prefix matching, stop words filtered
2667
+ Search server: Node cluster on Linux Mint, 2 workers, ~670 queries/sec
2668
+
2669
+ INFRASTRUCTURE
2670
+ Cloudflare Workers: API routing, auth, rate limiting, lookups
2671
+ Cloudflare D1: auth database (api_keys, usage)
2672
+ Cloudflare R2: data storage
2673
+ Linux Mint: search server (RAM), GitHub Actions runner (imports)
2674
+ Cloudflare Tunnel: search.latinfo.dev → Linux Mint port 3001`,
2675
+ api: `API ENDPOINTS
2676
+
2677
+ All endpoints require Authorization: Bearer <api_key>
2678
+
2679
+ ROUTE PATTERN
2680
+ /{country}/{institution}/{dataset}/{action}
2681
+
2682
+ LOOKUP BY ID
2683
+ GET /pe/sunat/padron/ruc/20100047218
2684
+ GET /pe/sunat/padron/dni/09346247
2685
+ GET /pe/sunat/coactiva/ruc/20348858182
2686
+ GET /pe/osce/sanctioned/ruc/20100994128
2687
+ GET /pe/osce/fines/ruc/10001108307
2688
+ GET /pe/redam/registry/dni/12345678
2689
+ GET /co/rues/registry/nit/0900073223
2690
+
2691
+ SEARCH
2692
+ GET /pe/sunat/padron/search?q=banco+credito
2693
+ GET /pe/osce/sanctioned/search?q=constructora
2694
+ GET /co/rues/registry/search?q=ecopetrol
2695
+
2696
+ LICITACIONES
2697
+ GET /pe/oece/tenders?q=servicio&limit=5
2698
+ GET /pe/oece/tenders/info
2699
+
2700
+ RESPONSE FORMAT
2701
+ Lookup: { "ruc": "20100047218", "razon_social": "BANCO DE CREDITO", ... }
2702
+ Search: [{ "ruc": "20100047218", "razon_social": "BANCO DE CREDITO", "estado": "ACTIVO" }, ...]
2703
+
2704
+ CLI
2705
+ latinfo pe sunat padron 20100047218
2706
+ latinfo pe sunat padron --search "banco credito"
2707
+ latinfo pe sunat padron --dni 09346247
2708
+ latinfo co rues registry --search "ecopetrol"`,
2709
+ };
2710
+ function docs(args) {
2711
+ const topic = args[0] || 'index';
2712
+ const content = DOCS[topic];
2713
+ if (!content) {
2714
+ console.error(`Unknown topic: ${topic}\n\nAvailable topics:`);
2715
+ console.log(DOCS.index);
2716
+ process.exit(1);
2717
+ }
2718
+ console.log(content);
2719
+ }
2324
2720
  // --- Main ---
2325
2721
  const [command, ...args] = rawArgs;
2326
2722
  const COUNTRIES = ['pe', 'co', 'br', 'mx', 'ar', 'cl', 'ec'];
@@ -2389,6 +2785,9 @@ else {
2389
2785
  case 'completion':
2390
2786
  completion();
2391
2787
  break;
2788
+ case 'docs':
2789
+ docs(args);
2790
+ break;
2392
2791
  case 'help':
2393
2792
  help();
2394
2793
  break;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "latinfo",
3
- "version": "0.12.2",
3
+ "version": "0.13.0",
4
4
  "description": "Tax registry & procurement API for Latin America. Query RUC, DNI, NIT, licitaciones from Peru & Colombia. Offline MPHF search, full OCDS data, updated daily.",
5
5
  "homepage": "https://latinfo.dev",
6
6
  "repository": {