latinfo 0.12.2 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +400 -1
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -47,7 +47,7 @@ const local_search_1 = require("./local-search");
47
47
  const client_search_1 = require("./client-search");
48
48
  const odis_search_1 = require("./odis-search");
49
49
  const mphf_search_1 = require("./mphf-search");
50
- const VERSION = '0.12.2';
50
+ const VERSION = '0.13.0';
51
51
  const API_URL = process.env.LATINFO_API_URL || 'https://api.latinfo.dev';
52
52
  const GITHUB_CLIENT_ID = process.env.GITHUB_CLIENT_ID || 'Ov23li5fcQaiCsVtaMKK';
53
53
  const CONFIG_DIR = path_1.default.join(os_1.default.homedir(), '.latinfo');
@@ -2321,6 +2321,402 @@ ENVIRONMENT
2321
2321
  LATINFO_REPO_PATH Auto-detected from cwd`);
2322
2322
  }
2323
2323
  }
2324
+ // --- Docs ---
2325
+ const DOCS = {
2326
+ index: `latinfo docs — complete documentation
2327
+
2328
+ TOPICS
2329
+ latinfo docs pipe How to create a data pipeline (full guide)
2330
+ latinfo docs fields searchFieldIndex, statusFieldIndex explained
2331
+ latinfo docs v2 V2 search index + MPHF (mandatory)
2332
+ latinfo docs encoding Encoding issues (latin1, UTF-8, replacement chars)
2333
+ latinfo docs script Import script template + requirements
2334
+ latinfo docs troubleshooting Common errors and fixes
2335
+ latinfo docs architecture How latinfo works internally
2336
+ latinfo docs api API endpoints and response format`,
2337
+ pipe: `HOW TO CREATE A DATA PIPELINE
2338
+
2339
+ latinfo pipe handles storage, indexing, search, and API serving automatically.
2340
+ Your only job is to write a script that produces a TSV file.
2341
+
2342
+ STEP BY STEP
2343
+
2344
+ 1. Create source definition:
2345
+ latinfo pipe create <country> <institution> <dataset> [flags]
2346
+
2347
+ Example:
2348
+ latinfo pipe create pe redam registry --url https://redam.pj.gob.pe/ --id-name dni --id-length 8
2349
+
2350
+ This generates a YAML config. Edit it to match your source.
2351
+
2352
+ 2. Write your import script (any method: fetch, Playwright, curl, Python):
2353
+ Your script must:
2354
+ a) Download the source data
2355
+ b) Parse to TSV: ID\\tfield1\\tfield2\\t...
2356
+ c) Sort by ID: LC_ALL=C sort -t'\\t' -k1,1
2357
+ d) Call buildBinaryFiles() → .bin + .idx
2358
+ e) Call buildSearchIndex() with statusFieldIndex → V2 search (MANDATORY)
2359
+ f) Call buildMphfFromIdx() → .mphf (MANDATORY)
2360
+ g) Call uploadToR2() for each file
2361
+ h) Call saveImportMeta()
2362
+
2363
+ See: latinfo docs script (for full template)
2364
+ See: latinfo docs fields (for searchFieldIndex and statusFieldIndex)
2365
+
2366
+ 3. Upload script:
2367
+ latinfo pipe script <source> ./my-script.ts
2368
+
2369
+ 4. Add dependencies (if any):
2370
+ latinfo pipe deps <source> playwright ddddocr
2371
+
2372
+ 5. Test (100 records, LOCAL — no R2 upload):
2373
+ latinfo pipe test <source>
2374
+
2375
+ 6. Validate (all records, LOCAL — no R2 upload):
2376
+ latinfo pipe validate <source>
2377
+
2378
+ 7. Stage (Linux Mint — import + 500 concurrent bench):
2379
+ latinfo pipe stage <source>
2380
+
2381
+ 8. Document:
2382
+ latinfo pipe docs <source>
2383
+
2384
+ 9. Publish to production:
2385
+ latinfo pipe publish <source>
2386
+
2387
+ GATES
2388
+ Each gate must pass before the next unlocks:
2389
+ test → validate → stage → docs → publish
2390
+
2391
+ test: 100 records, validates V2 + MPHF, no R2 upload
2392
+ validate: full import, all records, no R2 upload
2393
+ stage: runs on Linux Mint, uploads to R2, 500 concurrent bench (99.9% required)
2394
+ docs: documentation with required sections
2395
+ publish: deploy to Cloudflare, restart search server, smoke test, rollback on failure
2396
+
2397
+ NAMING
2398
+ Source name: {country}-{institution}-{dataset}
2399
+ Country: ISO 3166-1 alpha-2 lowercase (pe, co, br, mx, ec, ar, cl)
2400
+ Institution: government agency abbreviation, lowercase
2401
+ Dataset: what the data contains, english, lowercase
2402
+ Examples: pe-sunat-padron, pe-osce-sanctioned, co-rues-registry, pe-redam-registry`,
2403
+ fields: `FIELD INDEXES: searchFieldIndex and statusFieldIndex
2404
+
2405
+ Both go in buildSearchIndex() inside your import script.
2406
+ NOT in src/sources.ts. NOT in the YAML config.
2407
+
2408
+ searchFieldIndex
2409
+ Which TSV column (after ID) is the main searchable text.
2410
+ Almost always 0 (the first field after the primary ID).
2411
+ This field gets tokenized and indexed for full-text search.
2412
+
2413
+ statusFieldIndex
2414
+ Which TSV column (after ID) appears as a short label in search results.
2415
+ Choose the field that helps identify records at a glance.
2416
+
2417
+ | Source type | statusFieldIndex | Field | Example value |
2418
+ |---------------|-----------------|-----------------|------------------------|
2419
+ | Companies | 1 | estado | ACTIVO, CANCELADA |
2420
+ | Sanctions | 1 | date_start | 20230103 |
2421
+ | People (REDAM)| 4 | fecha_registro | 2022-10-27 |
2422
+ | Fines | 1 | date_start | 20230103 |
2423
+
2424
+ HOW TO COUNT
2425
+
2426
+ Your TSV looks like: ID\\tfield0\\tfield1\\tfield2\\tfield3\\tfield4
2427
+ The index is 0-based, counting from the first field AFTER the ID.
2428
+
2429
+ Example REDAM TSV:
2430
+ 12345678\\tROSALES\\tORTIZ\\tJUAN\\tDNI\\t2022-10-27
2431
+ ID 0(ape_p) 1(ape_m) 2(nombres) 3(tipo) 4(fecha)
2432
+
2433
+ searchFieldIndex: 0 → search by ape_paterno (last name)
2434
+ statusFieldIndex: 4 → show fecha_registro in search results
2435
+
2436
+ CODE EXAMPLE
2437
+
2438
+ // In your import script (NOT in sources.ts):
2439
+ const search = await buildSearchIndex(
2440
+ sortedPath, TEMP_DIR, SOURCE,
2441
+ { searchFieldIndex: 0, idRegex: /^\\d{8}$/, statusFieldIndex: 4 },
2442
+ recordCount,
2443
+ );
2444
+
2445
+ IMPORTANT
2446
+ - statusFieldIndex is MANDATORY (V2). Without it, you get V1 which is deprecated.
2447
+ - If pipe test says "V1" in the output, you forgot statusFieldIndex.
2448
+ - These indexes do NOT go in src/sources.ts — that file is for the API router.`,
2449
+ v2: `V2 SEARCH INDEX + MPHF
2450
+
2451
+ V2 is MANDATORY for all sources. V1 is deprecated.
2452
+
2453
+ WHAT IS V2?
2454
+ V2 stores name + status inline in the search posting list (110 bytes per entry).
2455
+ Search results come directly from the index — no secondary R2 lookup needed.
2456
+ V1 only stored the ID (8 bytes) and required a second read to get the name.
2457
+
2458
+ HOW TO ENABLE V2
2459
+ Pass statusFieldIndex to buildSearchIndex():
2460
+
2461
+ buildSearchIndex(sortedPath, TEMP_DIR, SOURCE,
2462
+ { searchFieldIndex: 0, idRegex: /^\\d{11}$/, statusFieldIndex: 1 },
2463
+ recordCount,
2464
+ );
2465
+
2466
+ See: latinfo docs fields (for how to choose statusFieldIndex)
2467
+
2468
+ WHAT IS MPHF?
2469
+ Minimal Perfect Hash Function. A compact dictionary (~500KB) that enables
2470
+ client-side offline search. The client downloads it once and searches locally
2471
+ without hitting the server.
2472
+
2473
+ HOW TO GENERATE MPHF
2474
+ Always call buildMphfFromIdx() after buildSearchIndex():
2475
+
2476
+ const mphfPath = buildMphfFromIdx(search.idxPath);
2477
+ uploadToR2(mphfPath, \`\${SOURCE}-search.mphf\`);
2478
+
2479
+ DETECTION
2480
+ pipe test checks script output for "V1" or "V2" markers.
2481
+ If it sees V1, the gate fails with:
2482
+ "Search index is V1 — MUST use V2. Add statusFieldIndex to buildSearchIndex()"`,
2483
+ encoding: `ENCODING ISSUES
2484
+
2485
+ COMMON PROBLEM
2486
+ Source file is ISO-8859-1 (latin1) but read as UTF-8.
2487
+ Characters like ó, ñ, é appear as replacement char (U+FFFD).
2488
+
2489
+ HOW TO DETECT
2490
+ Run: file downloaded-file.csv
2491
+ If it says "ISO-8859 text" → it's latin1.
2492
+
2493
+ HOW TO FIX
2494
+ Read with latin1, write with UTF-8:
2495
+
2496
+ const input = createReadStream(csvPath, { encoding: 'latin1' });
2497
+ const output = createWriteStream(tsvPath, { encoding: 'utf-8' });
2498
+
2499
+ Node's latin1 decoder converts 0xF3 (ó) to the correct JS string,
2500
+ then UTF-8 writer encodes it as 0xC3B3.
2501
+
2502
+ TRUNCATION BUG (FIXED)
2503
+ build-binary.ts truncates fields to 255 bytes. If the cut lands in the middle
2504
+ of a multi-byte UTF-8 character, it produces replacement chars.
2505
+ This is fixed — build-binary.ts now backs up to the last valid UTF-8 boundary.
2506
+
2507
+ YAML CONFIG
2508
+ Set encoding in your YAML:
2509
+ encoding: iso-8859-1
2510
+
2511
+ Or for UTF-8 (default):
2512
+ encoding: utf-8`,
2513
+ script: `IMPORT SCRIPT TEMPLATE
2514
+
2515
+ Your script downloads data and produces a sorted TSV. That's it.
2516
+ latinfo handles everything else (binary format, search index, MPHF, R2 upload).
2517
+
2518
+ FULL TEMPLATE
2519
+
2520
+ import { createReadStream, createWriteStream } from 'fs';
2521
+ import * as readline from 'readline';
2522
+ import * as https from 'https';
2523
+ import * as fs from 'fs';
2524
+ import * as path from 'path';
2525
+ import { execSync } from 'child_process';
2526
+ import { buildBinaryFiles } from './build-binary';
2527
+ import { buildSearchIndex } from './build-search-index';
2528
+ import { uploadToR2, saveImportMeta, buildMphfFromIdx } from './shared';
2529
+
2530
+ const SOURCE = 'pe-example-dataset'; // must match YAML name
2531
+ const TEMP_DIR = \`/tmp/\${SOURCE}-import\`;
2532
+
2533
+ export async function importExample(options?: { limit?: number }) {
2534
+ console.log(\`=== \${SOURCE.toUpperCase()} IMPORT ===\\n\`);
2535
+ try {
2536
+ fs.mkdirSync(TEMP_DIR, { recursive: true });
2537
+
2538
+ // 1. Download
2539
+ // ... your download logic (fetch, Playwright, curl, etc.)
2540
+
2541
+ // 2. Parse to TSV
2542
+ const tsvPath = path.join(TEMP_DIR, 'parsed.tsv');
2543
+ const output = createWriteStream(tsvPath, { encoding: 'utf-8' });
2544
+ // ... parse source → output.write(\`\${id}\\t\${field1}\\t\${field2}\\n\`)
2545
+ await new Promise<void>(r => output.end(r));
2546
+
2547
+ // 3. Sort
2548
+ const sortedPath = path.join(TEMP_DIR, 'sorted.tsv');
2549
+ execSync(\`LC_ALL=C sort -t'\\t' -k1,1 "\${tsvPath}" -o "\${sortedPath}"\`);
2550
+ fs.unlinkSync(tsvPath);
2551
+
2552
+ // 4. Build binary
2553
+ const config = { idLength: 11, idRegex: /^\\d{11}$/, prefixLength: 5, fieldCount: 6 };
2554
+ const { shardPaths, idxPath, recordCount } = await buildBinaryFiles(
2555
+ sortedPath, TEMP_DIR, SOURCE, config);
2556
+
2557
+ // 5. Build V2 search (MANDATORY: pass statusFieldIndex)
2558
+ const search = await buildSearchIndex(sortedPath, TEMP_DIR, SOURCE,
2559
+ { searchFieldIndex: 0, idRegex: /^\\d{11}$/, statusFieldIndex: 1 },
2560
+ recordCount);
2561
+
2562
+ // 6. Delete sorted TSV
2563
+ fs.unlinkSync(sortedPath);
2564
+
2565
+ // 7. Upload to R2
2566
+ for (let i = 0; i < shardPaths.length; i++)
2567
+ uploadToR2(shardPaths[i], \`\${SOURCE}-\${i}.bin\`);
2568
+ uploadToR2(idxPath, \`\${SOURCE}.idx\`);
2569
+
2570
+ // 8. Build + upload MPHF (MANDATORY)
2571
+ const mphfPath = buildMphfFromIdx(search.idxPath);
2572
+ uploadToR2(search.idxPath, \`\${SOURCE}-search.idx\`);
2573
+ uploadToR2(mphfPath, \`\${SOURCE}-search.mphf\`);
2574
+ for (let i = 0; i < search.shardPaths.length; i++)
2575
+ uploadToR2(search.shardPaths[i], \`\${SOURCE}-search-\${i}.dat\`);
2576
+
2577
+ // 9. Save metadata
2578
+ saveImportMeta(SOURCE, new Date().toISOString(), recordCount);
2579
+
2580
+ // 10. Cleanup
2581
+ fs.rmSync(TEMP_DIR, { recursive: true, force: true });
2582
+ console.log(\`\\n[\${SOURCE}] Success: \${recordCount.toLocaleString()} records\`);
2583
+ return true;
2584
+ } catch (error) {
2585
+ console.error(\`\\n[\${SOURCE}] Error:\`, error);
2586
+ return false;
2587
+ }
2588
+ }
2589
+
2590
+ if (require.main === module) {
2591
+ const args = process.argv.slice(2);
2592
+ const limitIdx = args.indexOf('--limit');
2593
+ const limit = limitIdx !== -1 ? parseInt(args[limitIdx + 1]) : undefined;
2594
+ importExample({ limit }).then(ok => process.exit(ok ? 0 : 1));
2595
+ }
2596
+
2597
+ CLEAN EXAMPLE
2598
+ See src/imports/pe-osce-sanctioned.ts (CSV, latin1, 9K records)
2599
+
2600
+ IMPORTANT
2601
+ - SOURCE const must match the YAML filename (without .yaml)
2602
+ - Always use V2 (statusFieldIndex) — see: latinfo docs fields
2603
+ - Always generate MPHF — see: latinfo docs v2
2604
+ - Use --local flag support: the shared.ts uploadToR2 skips uploads when --local is passed`,
2605
+ troubleshooting: `COMMON ERRORS AND FIXES
2606
+
2607
+ ENCODING
2608
+ Problem: ó appears as replacement char in output
2609
+ Fix: Read source as latin1, write TSV as UTF-8
2610
+ See: latinfo docs encoding
2611
+
2612
+ V1 INSTEAD OF V2
2613
+ Problem: pipe test says "Search index is V1"
2614
+ Fix: Add statusFieldIndex to buildSearchIndex()
2615
+ See: latinfo docs fields
2616
+
2617
+ NO MPHF
2618
+ Problem: pipe test says "No MPHF generated"
2619
+ Fix: Call buildMphfFromIdx() after buildSearchIndex()
2620
+ See: latinfo docs v2
2621
+
2622
+ HTTP 418 / 403
2623
+ Problem: Source server blocks download
2624
+ Fix: Add User-Agent header to your request:
2625
+ headers: { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' }
2626
+
2627
+ 0 RECORDS PARSED
2628
+ Problem: Import succeeds but no records
2629
+ Fix: Check delimiter and column indexes in YAML match the source file
2630
+
2631
+ UPLOAD FAILS (CLOUDFLARE_API_TOKEN)
2632
+ Problem: uploadToR2 fails with "set CLOUDFLARE_API_TOKEN"
2633
+ Fix: Ensure .env has CLOUDFLARE_API_TOKEN. For local testing use --local flag.
2634
+ pipe test and pipe validate already use --local (no R2 upload).
2635
+
2636
+ GATE LOCKED
2637
+ Problem: "Gate X has not passed. Run: latinfo pipe X"
2638
+ Fix: Gates are sequential. Run them in order: test → validate → stage → docs → publish
2639
+
2640
+ SCRIPT CHANGED, GATES RESET
2641
+ Problem: After running pipe script, all gates show "not run"
2642
+ This is expected. Uploading a new script resets all gates — re-run from test.`,
2643
+ architecture: `HOW LATINFO WORKS
2644
+
2645
+ DATA FLOW
2646
+ Source (CSV/API/crawler) → TSV → sort → binary (.bin + .idx) → R2
2647
+ → search index (.idx + .dat) → R2 / Linux Mint
2648
+ → MPHF (.mphf) → R2
2649
+
2650
+ SERVING
2651
+ Lookups (by ID): User → Cloudflare Worker → R2 range read → response
2652
+ Search (by name): User → Cloudflare Worker → Linux Mint (search server, RAM) → response
2653
+ Offline search: Client downloads .mphf once → searches locally, zero server
2654
+
2655
+ STORAGE
2656
+ R2 (Cloudflare): .bin shards (~200MB each), .idx prefix index (~300KB), search shards
2657
+ Linux Mint RAM: search index + posting lists (~2GB per source) for fast search
2658
+
2659
+ BINARY FORMAT
2660
+ .bin: records sorted by ID, each record = uint16 length + ID + fields (uint8 len + data)
2661
+ .idx: prefix index, 16-byte entries (prefix, shard, offset, length)
2662
+ Fields truncated to 255 bytes with UTF-8 boundary safety
2663
+
2664
+ SEARCH
2665
+ V2 search index: inverted index with inline name + status (110 bytes per posting)
2666
+ Tokenized, IDF-scored, prefix matching, stop words filtered
2667
+ Search server: Node cluster on Linux Mint, 2 workers, ~670 queries/sec
2668
+
2669
+ INFRASTRUCTURE
2670
+ Cloudflare Workers: API routing, auth, rate limiting, lookups
2671
+ Cloudflare D1: auth database (api_keys, usage)
2672
+ Cloudflare R2: data storage
2673
+ Linux Mint: search server (RAM), GitHub Actions runner (imports)
2674
+ Cloudflare Tunnel: search.latinfo.dev → Linux Mint port 3001`,
2675
+ api: `API ENDPOINTS
2676
+
2677
+ All endpoints require Authorization: Bearer <api_key>
2678
+
2679
+ ROUTE PATTERN
2680
+ /{country}/{institution}/{dataset}/{action}
2681
+
2682
+ LOOKUP BY ID
2683
+ GET /pe/sunat/padron/ruc/20100047218
2684
+ GET /pe/sunat/padron/dni/09346247
2685
+ GET /pe/sunat/coactiva/ruc/20348858182
2686
+ GET /pe/osce/sanctioned/ruc/20100994128
2687
+ GET /pe/osce/fines/ruc/10001108307
2688
+ GET /pe/redam/registry/dni/12345678
2689
+ GET /co/rues/registry/nit/0900073223
2690
+
2691
+ SEARCH
2692
+ GET /pe/sunat/padron/search?q=banco+credito
2693
+ GET /pe/osce/sanctioned/search?q=constructora
2694
+ GET /co/rues/registry/search?q=ecopetrol
2695
+
2696
+ LICITACIONES
2697
+ GET /pe/oece/tenders?q=servicio&limit=5
2698
+ GET /pe/oece/tenders/info
2699
+
2700
+ RESPONSE FORMAT
2701
+ Lookup: { "ruc": "20100047218", "razon_social": "BANCO DE CREDITO", ... }
2702
+ Search: [{ "ruc": "20100047218", "razon_social": "BANCO DE CREDITO", "estado": "ACTIVO" }, ...]
2703
+
2704
+ CLI
2705
+ latinfo pe sunat padron 20100047218
2706
+ latinfo pe sunat padron --search "banco credito"
2707
+ latinfo pe sunat padron --dni 09346247
2708
+ latinfo co rues registry --search "ecopetrol"`,
2709
+ };
2710
+ function docs(args) {
2711
+ const topic = args[0] || 'index';
2712
+ const content = DOCS[topic];
2713
+ if (!content) {
2714
+ console.error(`Unknown topic: ${topic}\n\nAvailable topics:`);
2715
+ console.log(DOCS.index);
2716
+ process.exit(1);
2717
+ }
2718
+ console.log(content);
2719
+ }
2324
2720
  // --- Main ---
2325
2721
  const [command, ...args] = rawArgs;
2326
2722
  const COUNTRIES = ['pe', 'co', 'br', 'mx', 'ar', 'cl', 'ec'];
@@ -2389,6 +2785,9 @@ else {
2389
2785
  case 'completion':
2390
2786
  completion();
2391
2787
  break;
2788
+ case 'docs':
2789
+ docs(args);
2790
+ break;
2392
2791
  case 'help':
2393
2792
  help();
2394
2793
  break;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "latinfo",
3
- "version": "0.12.2",
3
+ "version": "0.13.0",
4
4
  "description": "Tax registry & procurement API for Latin America. Query RUC, DNI, NIT, licitaciones from Peru & Colombia. Offline MPHF search, full OCDS data, updated daily.",
5
5
  "homepage": "https://latinfo.dev",
6
6
  "repository": {