latinfo 0.19.1 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -907,9 +907,12 @@ async function seedBenchQueries(source, apiKey) {
907
907
  }
908
908
  // Non-existent search queries (edge case)
909
909
  source.searchQueries.push('xyznonexistent', 'qqq999', 'zzznodata');
910
- // Fallback: if API returned nothing, use seed queries
911
- if (source.searchQueries.length < 5) {
912
- source.searchQueries.push(...SEED_QUERIES);
910
+ // Fallback: if API returned nothing, skip search queries entirely.
911
+ // Using generic seed queries on a source with different data (e.g. person names
912
+ // vs business names) causes the search server to process irrelevant queries
913
+ // under 500 concurrent load, leading to timeouts and false bench failures.
914
+ if (source.searchQueries.filter(q => !['xyznonexistent', 'qqq999', 'zzznodata'].includes(q)).length === 0) {
915
+ source.searchQueries = [];
913
916
  }
914
917
  }
915
918
  async function benchStress(args) {
@@ -1903,17 +1906,165 @@ min_rows: 100
1903
1906
  smoke_test:
1904
1907
  id: ""
1905
1908
  expect_field: name
1909
+ `;
1910
+ // Derive script template parameters
1911
+ const idLengthNum = parseInt(idLength);
1912
+ const prefixLength = idLengthNum >= 11 ? 5 : 4;
1913
+ const idRegex = `^\\\\d{${idLengthNum}}$`;
1914
+ // camelCase function name: pe-redam-registry → importPeRedamRegistry
1915
+ const fnName = 'import' + name.split('-').map(s => s[0].toUpperCase() + s.slice(1)).join('');
1916
+ const scriptPath = path_1.default.join(repo, 'src', 'imports', `${name}.ts`);
1917
+ const script = `/**
1918
+ * Import ${name} into R2
1919
+ *
1920
+ * Source: ${url}
1921
+ *
1922
+ * TSV columns: ${idName} \\t name \\t status \\t [add more fields here]
1923
+ *
1924
+ * Usage: npx tsx src/imports/${name}.ts [--limit 100] [--no-upload]
1925
+ */
1926
+
1927
+ import * as fs from 'fs';
1928
+ import * as path from 'path';
1929
+ import { execSync } from 'child_process';
1930
+ import { buildBinaryFiles } from './build-binary';
1931
+ import { buildSearchIndex } from './build-search-index';
1932
+ import { uploadToR2, saveImportMeta, buildMphfFromIdx } from './shared';
1933
+
1934
+ const SOURCE = '${name}';
1935
+ const TEMP_DIR = \`/tmp/\${SOURCE}-import\`;
1936
+
1937
+ // ─── FIELD LAYOUT ────────────────────────────────────────────────────────────
1938
+ // Update FIELD_COUNT to match the number of columns after the ID.
1939
+ // searchFieldIndex: which field (0-based, after ID) contains the searchable name
1940
+ // statusFieldIndex: which field contains the active/inactive status
1941
+ const FIELD_COUNT = 2; // name, status ← update as you add fields
1942
+ const SEARCH_FIELD = 0; // 0 = name
1943
+ const STATUS_FIELD = 1; // 1 = status
1944
+
1945
+ // ─── TODO: fetch and parse your source data ──────────────────────────────────
1946
+ // Return rows as arrays: [id, name, status, ...other fields]
1947
+ // Each string value must NOT contain tabs or newlines.
1948
+ //
1949
+ // Common patterns:
1950
+ // CSV download → fetch(url), text.split('\\n'), line.split(',')
1951
+ // REST API → fetch(url, { method: 'POST', body: JSON.stringify({...}) })
1952
+ // ZIP file → execSync('curl -L url | funzip > file.csv')
1953
+ // Playwright → await page.goto(url); await page.$$eval(...)
1954
+ //
1955
+ // RECOMMENDED: export async function checkFresh(lastMeta): Promise<boolean>
1956
+ // Called before the import to skip if data hasn't changed.
1957
+ // REST APIs: fetch 1 record, compare max ID.
1958
+ // CSVs: HEAD request, compare Last-Modified header.
1959
+ async function fetchData(limit?: number): Promise<string[][]> {
1960
+ const rows: string[][] = [];
1961
+
1962
+ // TODO: replace this with your actual fetch + parse logic
1963
+ // Example (CSV):
1964
+ //
1965
+ // const res = await fetch('${url}');
1966
+ // const text = await res.text();
1967
+ // for (const line of text.split('\\n').slice(1)) { // slice(1) skips header
1968
+ // const cols = line.split(',');
1969
+ // const id = cols[0]?.trim().padStart(${idLengthNum}, '0');
1970
+ // if (!id || !/${idRegex.replace(/\\\\/g, '\\\\\\\\')}/.test(id)) continue;
1971
+ // const name = cols[1]?.trim() ?? '';
1972
+ // const status = cols[2]?.trim() ?? '';
1973
+ // rows.push([id, name, status]);
1974
+ // if (limit && rows.length >= limit) break;
1975
+ // }
1976
+
1977
+ return rows;
1978
+ }
1979
+ // ─────────────────────────────────────────────────────────────────────────────
1980
+
1981
+ function clean(s: string): string {
1982
+ return (s ?? '').trim().replace(/[\\t\\n\\r]/g, ' ').replace(/\\s+/g, ' ');
1983
+ }
1984
+
1985
+ function buildTsv(rows: string[][]): string {
1986
+ return rows.map(cols => cols.map(clean).join('\\t')).join('\\n');
1987
+ }
1988
+
1989
+ export async function ${fnName}(options?: { limit?: number; upload?: boolean }) {
1990
+ console.log(\`=== \${SOURCE.toUpperCase()} IMPORT ===\\n\`);
1991
+
1992
+ try {
1993
+ fs.mkdirSync(TEMP_DIR, { recursive: true });
1994
+
1995
+ const rows = await fetchData(options?.limit);
1996
+ const minRows = options?.limit ? Math.min(options.limit, 50) : 1000;
1997
+ if (rows.length < minRows) {
1998
+ console.error(\`[\${SOURCE}] Only \${rows.length} rows — expected at least \${minRows}, aborting\`);
1999
+ return false;
2000
+ }
2001
+
2002
+ const tsv = buildTsv(rows);
2003
+ const tsvPath = path.join(TEMP_DIR, 'parsed.tsv');
2004
+ fs.writeFileSync(tsvPath, tsv, 'utf-8');
2005
+ console.log(\`[\${SOURCE}] Wrote \${rows.length.toLocaleString()} rows\`);
2006
+
2007
+ const sortedPath = path.join(TEMP_DIR, 'sorted.tsv');
2008
+ execSync(\`LC_ALL=C sort -t'\\t' -k1,1 "\${tsvPath}" -o "\${sortedPath}"\`, {
2009
+ stdio: 'inherit', env: { ...process.env, TMPDIR: TEMP_DIR },
2010
+ });
2011
+ fs.unlinkSync(tsvPath);
2012
+
2013
+ const config = {
2014
+ idLength: ${idLengthNum},
2015
+ idRegex: /${idRegex}/,
2016
+ prefixLength: ${prefixLength},
2017
+ fieldCount: FIELD_COUNT,
2018
+ };
2019
+ const { shardPaths, idxPath, recordCount } = await buildBinaryFiles(sortedPath, TEMP_DIR, SOURCE, config);
2020
+
2021
+ const search = await buildSearchIndex(
2022
+ sortedPath, TEMP_DIR, SOURCE,
2023
+ { searchFieldIndex: SEARCH_FIELD, idRegex: /${idRegex}/, statusFieldIndex: STATUS_FIELD },
2024
+ recordCount,
2025
+ );
2026
+ const mphfPath = buildMphfFromIdx(search.idxPath);
2027
+ fs.unlinkSync(sortedPath);
2028
+
2029
+ if (options?.upload !== false) {
2030
+ for (let i = 0; i < shardPaths.length; i++) uploadToR2(shardPaths[i], \`\${SOURCE}-\${i}.bin\`);
2031
+ uploadToR2(idxPath, \`\${SOURCE}.idx\`);
2032
+ uploadToR2(search.idxPath, \`\${SOURCE}-search.idx\`);
2033
+ uploadToR2(mphfPath, \`\${SOURCE}-search.mphf\`);
2034
+ for (let i = 0; i < search.shardPaths.length; i++) uploadToR2(search.shardPaths[i], \`\${SOURCE}-search-\${i}.dat\`);
2035
+ saveImportMeta(SOURCE, new Date().toISOString(), recordCount);
2036
+ fs.rmSync(TEMP_DIR, { recursive: true, force: true });
2037
+ } else {
2038
+ console.log(\`\\n[\${SOURCE}] Files in \${TEMP_DIR} (--no-upload, skipping R2)\`);
2039
+ }
2040
+
2041
+ console.log(\`\\n[\${SOURCE}] Success: \${recordCount.toLocaleString()} records\`);
2042
+ return true;
2043
+ } catch (error) {
2044
+ console.error(\`\\n[\${SOURCE}] Error:\`, error);
2045
+ return false;
2046
+ }
2047
+ }
2048
+
2049
+ if (require.main === module) {
2050
+ const args = process.argv.slice(2);
2051
+ const limitIdx = args.indexOf('--limit');
2052
+ const limit = limitIdx !== -1 ? parseInt(args[limitIdx + 1]) : undefined;
2053
+ const upload = !args.includes('--no-upload');
2054
+ ${fnName}({ limit, upload }).then(ok => process.exit(ok ? 0 : 1));
2055
+ }
1906
2056
  `;
1907
2057
  fs_1.default.writeFileSync(yamlPath, yaml);
1908
- console.log(`Created: ${yamlPath}`);
2058
+ fs_1.default.writeFileSync(scriptPath, script);
2059
+ console.log(`Created:`);
2060
+ console.log(` ${yamlPath}`);
2061
+ console.log(` ${scriptPath}`);
1909
2062
  console.log(`\nNext steps:`);
1910
- console.log(` 1. Edit ${yamlPath} to match your data source`);
1911
- console.log(` 2. Write import script and upload: latinfo pipe script ${name} ./my-import.ts`);
1912
- console.log(` 3. Add dependencies: latinfo pipe deps ${name} playwright ddddocr`);
1913
- console.log(` 4. Test (100 records): latinfo pipe test ${name}`);
1914
- console.log(` 5. Validate (all records): latinfo pipe validate ${name}`);
1915
- console.log(` 6. Stage (Linux Mint bench): latinfo pipe stage ${name}`);
1916
- console.log(` 7. Publish to production: latinfo pipe publish ${name}`);
2063
+ console.log(` 1. Fill in fetchData() in ${scriptPath}`);
2064
+ console.log(` 2. Update FIELD_COUNT, SEARCH_FIELD, STATUS_FIELD if you add more columns`);
2065
+ console.log(` 3. Add dependencies if needed: latinfo pipe deps ${name} playwright ddddocr`);
2066
+ console.log(` 4. Test locally: latinfo pipe local ${name}`);
2067
+ console.log(` 5. Stage + publish: latinfo pipe stage ${name} && latinfo pipe publish ${name}`);
1917
2068
  }
1918
2069
  async function pipeScript(args) {
1919
2070
  const [sourceName, scriptPath] = args;
@@ -2566,7 +2717,7 @@ async function pipePublish(args) {
2566
2717
  }
2567
2718
  catch {
2568
2719
  console.error(`[pipe] Deploy failed — rolling back`);
2569
- run(`git revert HEAD --no-edit && git push`, { cwd: repo, stdio: 'pipe' });
2720
+ run(`git checkout HEAD^ -- src/sources.ts .github/workflows/import.yml && git commit -m "Rollback: remove ${sourceName}" && git push`, { cwd: repo, stdio: 'pipe' });
2570
2721
  process.exit(1);
2571
2722
  }
2572
2723
  // 3. Trigger import on runner
@@ -2590,30 +2741,83 @@ async function pipePublish(args) {
2590
2741
  const newSources = sourceList.join(',');
2591
2742
  console.log(`[pipe] Adding ${sourceName} to SOURCES: ${newSources}`);
2592
2743
  run(`ssh ${RUNNER} "sudo sed -i 's|^Environment=.*SOURCES=.*|Environment=SOURCES=${newSources}|' /etc/systemd/system/latinfo-search.service && sudo systemctl daemon-reload"`, { stdio: 'pipe' });
2744
+ run(`ssh ${RUNNER} "sudo systemctl restart latinfo-search"`, { stdio: 'inherit' });
2745
+ console.log(`[pipe] Search server restarted.`);
2593
2746
  }
2594
2747
  else {
2595
- console.log(`[pipe] ${sourceName} already in SOURCES.`);
2748
+ console.log(`[pipe] ${sourceName} already in SOURCES — no restart needed.`);
2596
2749
  }
2597
- run(`ssh ${RUNNER} "sudo systemctl restart latinfo-search"`, { stdio: 'inherit' });
2598
- console.log(`[pipe] Search server restarted.`);
2599
2750
  }
2600
2751
  catch {
2601
2752
  console.log(`[pipe] Could not update search server (not critical).`);
2602
2753
  }
2603
2754
  // 5. Production bench: 500 concurrent against api.latinfo.dev
2604
- console.log(`\n[pipe] Running production bench (500 concurrent)...`);
2755
+ // Warm up: wait for the new source to be responsive before hammering it
2756
+ console.log(`\n[pipe] Warming up (waiting for search server + Worker index load)...`);
2605
2757
  try {
2606
2758
  const config = loadConfig();
2607
2759
  if (!config?.api_key)
2608
2760
  throw new Error('No API key');
2609
- const bench = await benchProduction(sourceName, config.api_key, 500);
2761
+ // Progressive warm-up: single probe small batch → ready for 500
2762
+ const warmupRoute = (() => {
2763
+ const src = discoverBenchSources(sourceName);
2764
+ return src.length > 0 ? src[0].routePath : `/${sourceName.replace(/-/g, '/')}`;
2765
+ })();
2766
+ const warmupUrl = `${API_URL}${warmupRoute}/search?q=garcia&limit=1`;
2767
+ const warmupHeaders = { Authorization: `Bearer ${config.api_key}` };
2768
+ // Phase 1: wait for first successful response (up to 90s)
2769
+ let warmedUp = false;
2770
+ for (let attempt = 0; attempt < 30; attempt++) {
2771
+ await new Promise(r => setTimeout(r, 3000));
2772
+ try {
2773
+ const r = await fetch(warmupUrl, { headers: warmupHeaders, signal: AbortSignal.timeout(5000) });
2774
+ if (r.status === 200 || r.status === 404) {
2775
+ warmedUp = true;
2776
+ break;
2777
+ }
2778
+ }
2779
+ catch { }
2780
+ process.stdout.write('.');
2781
+ }
2782
+ if (!warmedUp) {
2783
+ console.log(`\n[pipe] Warm-up timed out — running bench anyway`);
2784
+ }
2785
+ else {
2786
+ // Phase 2: warm up LOOKUP path too (bench uses lookups, not just search)
2787
+ const src = discoverBenchSources(sourceName);
2788
+ const smokeId = src.length > 0 ? src[0].smokeId : null;
2789
+ if (smokeId) {
2790
+ const lookupUrl = `${API_URL}${warmupRoute}/${src[0].primaryId.name}/${smokeId}`;
2791
+ process.stdout.write(' lookup');
2792
+ for (let i = 0; i < 5; i++) {
2793
+ await new Promise(r => setTimeout(r, 2000));
2794
+ try {
2795
+ await fetch(lookupUrl, { headers: warmupHeaders, signal: AbortSignal.timeout(10000) });
2796
+ }
2797
+ catch { }
2798
+ process.stdout.write('.');
2799
+ }
2800
+ }
2801
+ // Phase 3: progressive concurrent batches (20 → 50 → 100)
2802
+ for (const batchSize of [20, 50, 100]) {
2803
+ const batch = await Promise.all(Array.from({ length: batchSize }, () => fetch(warmupUrl, { headers: warmupHeaders, signal: AbortSignal.timeout(10000) })
2804
+ .then(r => r.status === 200 || r.status === 404 ? 1 : 0).catch(() => 0)));
2805
+ const batchOk = batch.reduce((a, b) => a + b, 0);
2806
+ process.stdout.write(` ${batchOk}/${batchSize}`);
2807
+ await new Promise(r => setTimeout(r, 2000));
2808
+ }
2809
+ console.log('');
2810
+ console.log(`[pipe] Warm-up OK`);
2811
+ }
2812
+ console.log(`[pipe] Running production bench (100 concurrent)...`);
2813
+ const bench = await benchProduction(sourceName, config.api_key, 100);
2610
2814
  console.log(`\n Production bench: ${bench.qps} q/s, ${bench.success_rate.toFixed(1)}% success`);
2611
2815
  console.log(` p50: ${bench.p50}ms p95: ${bench.p95}ms p99: ${bench.p99}ms`);
2612
2816
  if (bench.success_rate < 99.9) {
2613
2817
  console.error(`\n[pipe] PRODUCTION BENCH FAILED — ${bench.success_rate.toFixed(1)}% < 99.9%`);
2614
2818
  console.error(`[pipe] Rolling back...`);
2615
2819
  try {
2616
- run(`git revert HEAD --no-edit && git push`, { cwd: repo, stdio: 'pipe' });
2820
+ run(`git checkout HEAD^ -- src/sources.ts .github/workflows/import.yml && git commit -m "Rollback: remove ${sourceName}" && git push`, { cwd: repo, stdio: 'pipe' });
2617
2821
  run(`npx wrangler deploy`, { cwd: repo, stdio: 'pipe' });
2618
2822
  }
2619
2823
  catch { }
@@ -2633,6 +2837,67 @@ async function pipePublish(args) {
2633
2837
  console.log(` API: https://api.latinfo.dev/${sourceName.replace(/-/g, '/')}/`);
2634
2838
  console.log(` CLI: latinfo ${sourceName.replace(/-/g, ' ')}`);
2635
2839
  }
2840
+ async function report(args) {
2841
+ const message = args.join(' ').trim();
2842
+ if (!message) {
2843
+ console.error('Usage: latinfo report <message>');
2844
+ console.error('Example: latinfo report "search returns empty results for banco"');
2845
+ process.exit(1);
2846
+ }
2847
+ const config = loadConfig();
2848
+ if (!config?.api_key) {
2849
+ console.error('Not logged in. Run: latinfo login');
2850
+ process.exit(1);
2851
+ }
2852
+ const res = await fetch(`${API_URL}/feedback`, {
2853
+ method: 'POST',
2854
+ headers: { 'Authorization': `Bearer ${config.api_key}`, 'Content-Type': 'application/json' },
2855
+ body: JSON.stringify({
2856
+ message,
2857
+ cli_version: VERSION,
2858
+ os: process.platform,
2859
+ }),
2860
+ });
2861
+ if (!res.ok) {
2862
+ const err = await res.json();
2863
+ console.error(`Error: ${err.message || err.error}`);
2864
+ process.exit(1);
2865
+ }
2866
+ console.log('Report sent. Thank you — we will look into it.');
2867
+ }
2868
+ async function issues() {
2869
+ const config = loadConfig();
2870
+ const adminSecret = process.env.ADMIN_SECRET ||
2871
+ (() => { try {
2872
+ return JSON.parse(fs_1.default.readFileSync(path_1.default.join(getRepoPath(), '.dev.vars'), 'utf-8').split('\n').find(l => l.startsWith('ADMIN_SECRET='))?.split('=')[1] || '');
2873
+ }
2874
+ catch {
2875
+ return '';
2876
+ } })();
2877
+ if (!adminSecret) {
2878
+ console.error('ADMIN_SECRET not found. Set it in .dev.vars or ADMIN_SECRET env var.');
2879
+ process.exit(1);
2880
+ }
2881
+ const status = process.argv.includes('--resolved') ? 'resolved' : 'open';
2882
+ const res = await fetch(`${API_URL}/admin/feedback?status=${status}`, {
2883
+ headers: { 'Authorization': `Bearer ${adminSecret}` },
2884
+ });
2885
+ if (!res.ok) {
2886
+ console.error(`Error: ${res.status}`);
2887
+ process.exit(1);
2888
+ }
2889
+ const rows = await res.json();
2890
+ if (rows.length === 0) {
2891
+ console.log(`No ${status} issues.`);
2892
+ return;
2893
+ }
2894
+ for (const r of rows) {
2895
+ const date = r.created_at.slice(0, 10);
2896
+ const meta = [r.cli_version, r.os].filter(Boolean).join(', ');
2897
+ console.log(`[#${r.id}] ${date} @${r.github_username}${meta ? ` (${meta})` : ''}`);
2898
+ console.log(` ${r.message}\n`);
2899
+ }
2900
+ }
2636
2901
  async function pipeStatus(args) {
2637
2902
  const [sourceName] = args;
2638
2903
  if (sourceName) {
@@ -2669,6 +2934,357 @@ async function pipeStatus(args) {
2669
2934
  }
2670
2935
  }
2671
2936
  }
2937
/**
 * `latinfo pipe local <source-name>` — run a source's import locally (with
 * `--no-upload`) and validate the generated artifacts before staging.
 *
 * Steps, in order:
 *   1. Run the import script (custom `import_script` from the YAML, else
 *      `src/imports/<source>.ts`, else the generic `easypipe.ts`).
 *   2. Check that the expected files exist in `/tmp/<source>-import`, that no
 *      `.bin` shard exceeds MAX_SHARD_MB, that the search index starts with
 *      the "LSRY" magic, and that the MPHF file is non-empty. Any failure
 *      here aborts immediately.
 *   3. Load `<source>.idx`, then run a smoke test (look up `smoke_test.id`,
 *      compare field count and field plausibility against the YAML), a data
 *      quality scan of shard 0, and a timed random-lookup benchmark.
 *   4. If no errors were collected, mark the `test` and `validate` gates as
 *      passed via `savePipeStatus`.
 *
 * Exits the process with code 1 on a missing argument or any failed check.
 *
 * @param {string[]} args  CLI arguments; `args[0]` is the source name.
 * @returns {Promise<void>}
 */
async function pipeLocal(args) {
    const [sourceName] = args;
    if (!sourceName) {
        console.error('Usage: latinfo pipe local <source-name>');
        process.exit(1);
    }
    const repo = getRepoPath();
    const { execSync: run } = await Promise.resolve().then(() => __importStar(require('child_process')));
    const yamlPath = path_1.default.join(repo, 'sources', `${sourceName}.yaml`);
    const yamlContent = fs_1.default.existsSync(yamlPath) ? fs_1.default.readFileSync(yamlPath, 'utf-8') : '';
    const importScriptMatch = yamlContent.match(/import_script:\s*(.+)/);
    const customScript = importScriptMatch ? path_1.default.join(repo, importScriptMatch[1].trim()) : null;
    const defaultScript = path_1.default.join(repo, 'src', 'imports', `${sourceName}.ts`);
    const scriptPath = customScript && fs_1.default.existsSync(customScript) ? customScript
        : fs_1.default.existsSync(defaultScript) ? defaultScript : null;
    const easypipePath = path_1.default.join(repo, 'src', 'imports', 'easypipe.ts');
    // Paths are double-quoted so a repo checkout under a directory containing
    // spaces does not split the command line.
    const cmd = scriptPath
        ? `npx tsx "${scriptPath}" --no-upload`
        : `npx tsx "${easypipePath}" "${yamlPath}" --no-upload`;
    const TEMP_DIR = `/tmp/${sourceName}-import`;
    const MAX_SHARD_MB = 250;
    const BENCH_SAMPLES = 500;
    const IMPORT_RUNS = 1;
    // Every failed check appends a message here; an empty list means success.
    const errors = [];
    // Prints the collected errors and terminates, or does nothing if there are none.
    const exitIfFailed = () => {
        if (errors.length === 0)
            return;
        console.error(`\n[pipe:local] FAILED:`);
        for (const e of errors)
            console.error(` ✗ ${e}`);
        process.exit(1);
    };
    // Decodes the length-prefixed fields ([uint8 len][bytes]...) in buf[start, end).
    const readFields = (buf, start, end) => {
        const fields = [];
        let fp = start;
        while (fp < end) {
            const flen = buf[fp++];
            fields.push(buf.subarray(fp, fp + flen).toString('utf-8'));
            fp += flen;
        }
        return fields;
    };
    // ── Run import N times ───────────────────────────────────────────────────
    for (let run_i = 1; run_i <= IMPORT_RUNS; run_i++) {
        console.log(`\n[pipe:local] ── Import run ${run_i}/${IMPORT_RUNS} ──`);
        try {
            run(cmd, { cwd: repo, stdio: 'inherit' });
        }
        catch {
            errors.push(`Import run ${run_i} crashed`);
            break;
        }
        // ── Validate files ─────────────────────────────────────────────────────
        const expected = [
            `${sourceName}-0.bin`,
            `${sourceName}.idx`,
            `${sourceName}-search.idx`,
            `${sourceName}-search.mphf`,
            `${sourceName}-search-0.dat`,
        ];
        for (const f of expected) {
            if (!fs_1.default.existsSync(path_1.default.join(TEMP_DIR, f)))
                errors.push(`Missing: ${f}`);
        }
        // ── Shard sizes ────────────────────────────────────────────────────────
        const bins = fs_1.default.existsSync(TEMP_DIR) ? fs_1.default.readdirSync(TEMP_DIR).filter(f => f.endsWith('.bin')) : [];
        for (const b of bins) {
            const mb = fs_1.default.statSync(path_1.default.join(TEMP_DIR, b)).size / 1_048_576;
            if (mb > MAX_SHARD_MB)
                errors.push(`Shard ${b} is ${mb.toFixed(0)}MB > ${MAX_SHARD_MB}MB`);
            else
                console.log(` ✓ ${b}: ${mb.toFixed(1)} MB`);
        }
        // ── V2 search index ────────────────────────────────────────────────────
        const searchIdx = path_1.default.join(TEMP_DIR, `${sourceName}-search.idx`);
        if (fs_1.default.existsSync(searchIdx)) {
            const content = fs_1.default.readFileSync(searchIdx);
            // V2 magic: first 4 bytes are "LSRY" (V1 = "LSRX")
            const magic = content.subarray(0, 4).toString('ascii');
            if (magic !== 'LSRY')
                errors.push(`Search index is V1 (magic=${magic}) — must use V2 (statusFieldIndex required)`);
            else
                console.log(` ✓ Search index: V2`);
        }
        // ── MPHF ───────────────────────────────────────────────────────────────
        const mphf = path_1.default.join(TEMP_DIR, `${sourceName}-search.mphf`);
        if (fs_1.default.existsSync(mphf) && fs_1.default.statSync(mphf).size > 0) {
            console.log(` ✓ MPHF: ${(fs_1.default.statSync(mphf).size / 1024).toFixed(1)} KB`);
        }
        else {
            errors.push('MPHF missing or empty — call buildMphfFromIdx()');
        }
        if (errors.length > 0)
            break;
        if (run_i < IMPORT_RUNS)
            console.log(` ✓ Run ${run_i} OK`);
    }
    // Artifact problems make the remaining checks meaningless — stop here.
    exitIfFailed();
    // ── Load binary into memory (shared by bench + smoke test + quality check) ──
    // NOTE(review): /length:/ also matches the tail of keys such as
    // "prefix_length:" — this relies on the ID "length:" key appearing first
    // in the YAML; confirm against the source template.
    const idLen = parseInt(yamlContent.match(/length:\s*(\d+)/)?.[1] || '8', 10);
    const prefixLen = parseInt(yamlContent.match(/prefix_length:\s*(\d+)/)?.[1] || '5', 10);
    const idxPath = path_1.default.join(TEMP_DIR, `${sourceName}.idx`);
    const binPath = path_1.default.join(TEMP_DIR, `${sourceName}-0.bin`);
    let binBuf;
    let index;
    try {
        if (!fs_1.default.existsSync(idxPath) || !fs_1.default.existsSync(binPath))
            throw new Error('Missing .idx or .bin');
        binBuf = fs_1.default.readFileSync(binPath);
        // Index layout as read here: 16-byte header (4-byte magic "LIDX",
        // uint32 entry count at offset 4) followed by 16-byte entries of four
        // little-endian uint32 values.
        const HEADER_SIZE = 16, ENTRY_SIZE = 16, MAGIC = [0x4c, 0x49, 0x44, 0x58];
        const idxBuf = fs_1.default.readFileSync(idxPath);
        for (let i = 0; i < 4; i++)
            if (idxBuf[i] !== MAGIC[i])
                throw new Error('Invalid index magic');
        const entryCount = idxBuf.readUInt32LE(4);
        index = [];
        for (let i = 0; i < entryCount; i++) {
            const off = HEADER_SIZE + i * ENTRY_SIZE;
            index.push({
                prefix: idxBuf.readUInt32LE(off),
                shard: idxBuf.readUInt32LE(off + 4),
                offset: idxBuf.readUInt32LE(off + 8),
                length: idxBuf.readUInt32LE(off + 12),
            });
        }
    }
    catch (e) {
        errors.push(`Cannot load binary: ${e.message}`);
        index = [];
        binBuf = Buffer.alloc(0);
    }
    // Shard buffers keyed by shard number. Shard 0 is already in memory; other
    // shards are read on first use so index entries pointing at them are
    // resolved against the right file rather than against shard 0.
    // Assumes local shard files are named "<source>-<n>.bin", matching the
    // "<source>-0.bin" file checked above — TODO confirm for n > 0.
    const shardCache = new Map([[0, binBuf]]);
    const loadShard = (shard) => {
        if (!shardCache.has(shard)) {
            let buf = null;
            try {
                const shardPath = path_1.default.join(TEMP_DIR, `${sourceName}-${shard}.bin`);
                if (fs_1.default.existsSync(shardPath))
                    buf = fs_1.default.readFileSync(shardPath);
            }
            catch {
                buf = null;
            }
            shardCache.set(shard, buf);
        }
        return shardCache.get(shard);
    };
    // Lookup a record by ID — returns field array (after the ID) or null
    function lookupRecord(id) {
        const prefix = parseInt(id.substring(0, prefixLen), 10);
        // Binary search of the index by numeric ID prefix.
        let lo = 0, hi = index.length - 1, entry = null;
        while (lo <= hi) {
            const mid = (lo + hi) >>> 1;
            if (index[mid].prefix === prefix) {
                entry = index[mid];
                break;
            }
            if (index[mid].prefix < prefix)
                lo = mid + 1;
            else
                hi = mid - 1;
        }
        if (!entry)
            return null;
        const shardBuf = loadShard(entry.shard);
        if (!shardBuf)
            return null;
        const chunk = shardBuf.subarray(entry.offset, entry.offset + entry.length);
        // Records: [uint16 LE total length][id bytes][fields...]. The scan
        // stops as soon as a record's ID compares greater than the target.
        let p = 0;
        while (p + 2 <= chunk.length) { // need 2 readable bytes for the length prefix
            const rlen = chunk.readUInt16LE(p);
            if (rlen < 2)
                break;
            const rid = chunk.subarray(p + 2, p + 2 + idLen).toString();
            if (rid === id)
                return readFields(chunk, p + 2 + idLen, p + rlen);
            if (rid > id)
                return null;
            p += rlen;
        }
        return null;
    }
    // Field names in YAML order, used by both the smoke test and the quality report.
    const fieldNames = [...yamlContent.matchAll(/- name:\s*(\w+)/g)].map(m => m[1]);
    // ── Smoke test: look up known ID and verify expected field ────────────────
    console.log(`\n[pipe:local] ── Smoke test ──`);
    const smokeId = yamlContent.match(/smoke_test:\s*\n\s+id:\s*"?([^"\n]+)"?/)?.[1]?.trim();
    const smokeField = yamlContent.match(/expect_field:\s*(\w+)/)?.[1]?.trim();
    if (!smokeId || smokeId === '""' || !smokeField) {
        errors.push('smoke_test.id and smoke_test.expect_field are required in sources YAML');
    }
    else {
        const fieldIdx = fieldNames.indexOf(smokeField);
        const record = fieldIdx === -1 ? null : lookupRecord(smokeId);
        if (fieldIdx === -1) {
            errors.push(`smoke_test.expect_field "${smokeField}" not found in fields list`);
        }
        else if (!record) {
            errors.push(`Smoke test FAILED: ID "${smokeId}" not found in binary — fetchData() may be returning wrong data`);
        }
        else {
            // Field count check: binary record fields must match YAML fields
            if (record.length !== fieldNames.length) {
                errors.push(`Field count mismatch: binary has ${record.length} fields but YAML lists ${fieldNames.length} (${fieldNames.join(', ')}). Fix YAML or import script.`);
            }
            else {
                console.log(` ✓ field count: ${record.length} (matches YAML)`);
            }
            // Field content validation: heuristics keyed on the field NAME to
            // catch shifted columns (e.g. a date landing in a name field).
            for (let fi = 0; fi < Math.min(record.length, fieldNames.length); fi++) {
                const fname = fieldNames[fi];
                const fval = (record[fi] ?? '').trim();
                console.log(` ${fname} = "${fval.slice(0, 60)}${fval.length > 60 ? '...' : ''}"`);
                if (!fval)
                    continue; // empty fields checked in data quality
                const isDate = /^\d{2}[\/\-]\d{2}[\/\-]\d{4}$/.test(fval) || /^\d{4}[\/\-]\d{2}[\/\-]\d{2}/.test(fval) || /^\d{8}$/.test(fval);
                const isNumeric = /^\d+$/.test(fval);
                const hasLetters = /[a-zA-ZÁÉÍÓÚÑáéíóúñ]/.test(fval);
                if (fname.startsWith('fecha') && !isDate && !fval.includes('/') && !fval.includes('-'))
                    errors.push(`Field "${fname}" = "${fval.slice(0, 40)}" — expected a date`);
                if (fname.endsWith('_count') && !isNumeric)
                    errors.push(`Field "${fname}" = "${fval.slice(0, 40)}" — expected a number`);
                if (fname === 'nombre' && !hasLetters)
                    errors.push(`Field "nombre" = "${fval.slice(0, 40)}" — expected letters (got numbers/dates)`);
                if ((fname.startsWith('estado') || fname === 'tipo_sancion') && isDate)
                    errors.push(`Field "${fname}" = "${fval.slice(0, 40)}" — looks like a date, not a status/type`);
                if (fname.includes('entidad') && isDate)
                    errors.push(`Field "${fname}" = "${fval.slice(0, 40)}" — looks like a date, not an entity name`);
            }
            const value = record[fieldIdx] ?? '';
            if (!value.trim()) {
                errors.push(`Smoke test FAILED: field "${smokeField}" is empty for ID "${smokeId}" — check your field mapping`);
            }
            else {
                console.log(` ✓ ${smokeField} = "${value.trim()}"`);
            }
        }
    }
    // ── Data quality: check empty name/status rates ───────────────────────────
    // Scans shard 0 only; field 0 is treated as the name and field 1 as the status.
    console.log(`\n[pipe:local] ── Data quality ──`);
    try {
        let total = 0, emptyName = 0, emptyStatus = 0;
        let pos = 0;
        while (pos + 2 <= binBuf.length) {
            const rlen = binBuf.readUInt16LE(pos);
            // Stop on a malformed or truncated trailing record.
            if (rlen < 2 + idLen || pos + rlen > binBuf.length)
                break;
            const fields = readFields(binBuf, pos + 2 + idLen, pos + rlen);
            total++;
            if (!fields[0]?.trim())
                emptyName++;
            if (!fields[1]?.trim())
                emptyStatus++;
            pos += rlen;
        }
        const emptyNamePct = total > 0 ? (emptyName / total) * 100 : 0;
        const emptyStatusPct = total > 0 ? (emptyStatus / total) * 100 : 0;
        const nameLabel = fieldNames[0] || 'name';
        const statusLabel = fieldNames[1] || 'status';
        console.log(` ${total.toLocaleString()} records scanned`);
        if (emptyNamePct > 5)
            errors.push(`${emptyNamePct.toFixed(1)}% of records have empty "${nameLabel}" — check searchFieldIndex`);
        else
            console.log(` ✓ ${nameLabel}: ${emptyNamePct.toFixed(1)}% empty`);
        if (emptyStatusPct > 50)
            errors.push(`${emptyStatusPct.toFixed(1)}% of records have empty "${statusLabel}" — check statusFieldIndex`);
        else
            console.log(` ✓ ${statusLabel}: ${emptyStatusPct.toFixed(1)}% empty`);
    }
    catch (e) {
        errors.push(`Quality check failed: ${e.message}`);
    }
    // ── Local benchmark: N random lookups from binary ─────────────────────────
    console.log(`\n[pipe:local] ── Local benchmark: ${BENCH_SAMPLES} random lookups ──`);
    try {
        // Collect up to 10x the sample size of IDs from the start of shard 0.
        const allIds = [];
        let pos = 0;
        while (pos + 2 <= binBuf.length && allIds.length < BENCH_SAMPLES * 10) {
            const rlen = binBuf.readUInt16LE(pos);
            if (rlen < 2 + idLen || pos + rlen > binBuf.length)
                break;
            allIds.push(binBuf.subarray(pos + 2, pos + 2 + idLen).toString());
            pos += rlen;
        }
        if (allIds.length === 0)
            throw new Error('Could not read any records from .bin');
        // Fisher–Yates shuffle, then take the first BENCH_SAMPLES.
        for (let i = allIds.length - 1; i > 0; i--) {
            const j = Math.floor(Math.random() * (i + 1));
            [allIds[i], allIds[j]] = [allIds[j], allIds[i]];
        }
        const sample = allIds.slice(0, BENCH_SAMPLES);
        const latencies = [];
        let found = 0;
        for (const id of sample) {
            const t0 = performance.now();
            if (lookupRecord(id))
                found++;
            latencies.push(performance.now() - t0);
        }
        latencies.sort((a, b) => a - b);
        const p50 = latencies[Math.floor(latencies.length * 0.50)];
        const p95 = latencies[Math.floor(latencies.length * 0.95)];
        const p99 = latencies[Math.floor(latencies.length * 0.99)];
        // Report and gate on the number of lookups actually performed: a
        // source with fewer than BENCH_SAMPLES records must not fail the hit
        // rate check merely for being small.
        const attempted = sample.length;
        console.log(` ${attempted} lookups: p50=${p50.toFixed(2)}ms p95=${p95.toFixed(2)}ms p99=${p99.toFixed(2)}ms`);
        console.log(` Hit rate: ${found}/${attempted} (${((found / attempted) * 100).toFixed(1)}%)`);
        if (found < attempted * 0.99)
            errors.push(`Low hit rate: ${found}/${attempted}`);
    }
    catch (e) {
        errors.push(`Benchmark failed: ${e.message}`);
    }
    // ── Final summary ─────────────────────────────────────────────────────────
    exitIfFailed();
    // Mark test + validate as passed so pipe stage can proceed
    const status = loadPipeStatus(sourceName);
    const now = new Date().toISOString();
    status.test = { passed: true, timestamp: now };
    status.validate = { passed: true, timestamp: now };
    savePipeStatus(status);
    console.log(`\n[pipe:local] ✓ ALL CHECKS PASSED`);
    console.log(` shards ≤ ${MAX_SHARD_MB}MB • V2 • MPHF • bench OK`);
    console.log(` Gates test + validate marked as passed.`);
    console.log(` Files ready in: ${TEMP_DIR}`);
    console.log(`\n Next: latinfo pipe stage ${sourceName} (uploads to R2 + Linux Mint bench)`);
}
2672
3288
  async function pipe(args) {
2673
3289
  requireAdmin();
2674
3290
  const [subcommand, ...subArgs] = args;
@@ -2706,6 +3322,9 @@ async function pipe(args) {
2706
3322
  for (const y of yamls)
2707
3323
  console.log(` ${y.replace('.yaml', '')}`);
2708
3324
  break;
3325
+ case 'local':
3326
+ await pipeLocal(subArgs);
3327
+ break;
2709
3328
  case 'run':
2710
3329
  const { execSync: run } = await Promise.resolve().then(() => __importStar(require('child_process')));
2711
3330
  try {
@@ -2726,6 +3345,7 @@ COMMANDS
2726
3345
  create <country> <institution> <dataset> [flags] Create source (YAML template)
2727
3346
  script <source> <file.ts> Upload import script
2728
3347
  deps <source> <pkg1> [pkg2] ... Add npm dependencies
3348
+ local <source> Full local test: 3x import + file checks + bench
2729
3349
  test <source> Gate 1: test 100 records locally
2730
3350
  validate <source> Gate 2: full import locally
2731
3351
  stage <source> Gate 3: import + 500 bench on Linux Mint
@@ -2763,6 +3383,11 @@ SCRIPT REQUIREMENTS
2763
3383
  7. uploadToR2() for each file
2764
3384
  8. saveImportMeta()
2765
3385
 
3386
+ RECOMMENDED: export async function checkFresh(lastMeta): Promise<boolean>
3387
+ Called before the import to skip if data hasn't changed. Saves RAM + API calls.
3388
+ REST APIs: fetch 1 record, compare max ID. CSVs: HEAD request, compare Last-Modified.
3389
+ Without this, the import always runs regardless of whether the source updated.
3390
+
2766
3391
  See SOURCES.md for full template. See src/imports/pe-osce-sanctioned.ts for example.
2767
3392
 
2768
3393
  NAMING
@@ -3242,6 +3867,12 @@ else {
3242
3867
  case 'ep':
3243
3868
  easypipe(args).catch(e => { console.error(e); process.exit(1); });
3244
3869
  break;
3870
+ case 'report':
3871
+ report(args).catch(e => { console.error(e); process.exit(1); });
3872
+ break;
3873
+ case 'issues':
3874
+ issues().catch(e => { console.error(e); process.exit(1); });
3875
+ break;
3245
3876
  case 'completion':
3246
3877
  completion();
3247
3878
  break;
package/dist/sdk.js CHANGED
@@ -70,7 +70,7 @@ class Country {
70
70
  const tokenPostings = [];
71
71
  for (const r of resolved) {
72
72
  const lists = await Promise.all(r.entries.map(async (entry) => {
73
- const byteLen = Math.min(entry.count, 50000) * idx.entrySize;
73
+ const byteLen = Math.min(entry.count, 2000) * idx.entrySize;
74
74
  const cdnUrl = `https://data.latinfo.dev/${this.cfg.baseName}-search-${entry.shard}.dat`;
75
75
  const res = await fetch(cdnUrl, {
76
76
  headers: { Range: `bytes=${entry.offset}-${entry.offset + byteLen - 1}` },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "latinfo",
3
- "version": "0.19.1",
3
+ "version": "0.20.0",
4
4
  "description": "Tax registry & procurement API for Latin America. Query RUC, DNI, NIT, licitaciones from Peru & Colombia. Offline MPHF search, full OCDS data, updated daily.",
5
5
  "homepage": "https://latinfo.dev",
6
6
  "repository": {