@tryformation/querylight-cli 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -208,8 +208,8 @@ The default workspace is `.kb/`.
208
208
  raw/
209
209
  normalized/
210
210
  indexes/
211
- latest.json
212
- latest.meta.json
211
+ latest.json.gz
212
+ latest.meta.json.gz
213
213
  runs/
214
214
  logs/
215
215
  ```
package/dist/cli/main.js CHANGED
@@ -16,7 +16,7 @@ import path from "path";
16
16
  import YAML from "yaml";
17
17
 
18
18
  // src/core/constants.ts
19
- var PACKAGE_VERSION = "0.2.0";
19
+ var PACKAGE_VERSION = "0.2.1";
20
20
  var DEFAULT_WORKSPACE = ".kb";
21
21
  var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
22
22
  var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
@@ -387,6 +387,7 @@ import os from "os";
387
387
  import path6 from "path";
388
388
  import { fileURLToPath } from "url";
389
389
  import { execFile, execFileSync } from "child_process";
390
+ import { mkdtemp, rm, writeFile as writeFile3 } from "fs/promises";
390
391
 
391
392
  // src/core/files.ts
392
393
  import { stat as stat2 } from "fs/promises";
@@ -400,6 +401,7 @@ async function fileExists(filePath) {
400
401
  }
401
402
 
402
403
  // src/vector/runtime.ts
404
+ var sparseExecFileSync = execFileSync;
403
405
  function resolveQliHomeDir() {
404
406
  return path6.resolve(process.env.QLI_HOME ?? path6.join(os.homedir(), ".qli"));
405
407
  }
@@ -455,29 +457,36 @@ async function runSparsePython({
455
457
  }) {
456
458
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
457
459
  const scriptPath = await sparseScriptPath(importMetaUrl);
458
- return execFileSync(
459
- "uv",
460
- [
461
- "run",
462
- "--with",
463
- "torch",
464
- "--with",
465
- "transformers",
466
- "--with",
467
- "huggingface_hub",
468
- "python",
469
- scriptPath
470
- ],
471
- {
472
- encoding: "utf8",
473
- maxBuffer: 1024 * 1024 * 1024,
474
- input: JSON.stringify(payload),
475
- env: {
476
- ...process.env,
477
- HF_HOME: cacheDir
460
+ const payloadDir = await mkdtemp(path6.join(os.tmpdir(), "qli-sparse-"));
461
+ const payloadPath = path6.join(payloadDir, "payload.json");
462
+ await writeFile3(payloadPath, JSON.stringify(payload), "utf8");
463
+ try {
464
+ return sparseExecFileSync(
465
+ "uv",
466
+ [
467
+ "run",
468
+ "--with",
469
+ "torch",
470
+ "--with",
471
+ "transformers",
472
+ "--with",
473
+ "huggingface_hub",
474
+ "python",
475
+ scriptPath,
476
+ payloadPath
477
+ ],
478
+ {
479
+ encoding: "utf8",
480
+ maxBuffer: 1024 * 1024 * 1024,
481
+ env: {
482
+ ...process.env,
483
+ HF_HOME: cacheDir
484
+ }
478
485
  }
479
- }
480
- );
486
+ );
487
+ } finally {
488
+ await rm(payloadDir, { recursive: true, force: true });
489
+ }
481
490
  }
482
491
  async function getDenseTransformersRuntime(cacheDir) {
483
492
  const transformers = await import("@huggingface/transformers");
@@ -490,8 +499,40 @@ async function getDenseTransformersRuntime(cacheDir) {
490
499
  }
491
500
 
492
501
  // src/vector/store.ts
493
- import { mkdir as mkdir3, readFile as readFile4, writeFile as writeFile3 } from "fs/promises";
502
+ import { mkdir as mkdir3, rm as rm2, writeFile as writeFile5 } from "fs/promises";
494
503
  import path7 from "path";
504
+
505
+ // src/core/gzip-json.ts
506
+ import { readFile as readFile4, writeFile as writeFile4 } from "fs/promises";
507
+ import { promisify } from "util";
508
+ import { gunzip, gzip } from "zlib";
509
+ var gzipAsync = promisify(gzip);
510
+ var gunzipAsync = promisify(gunzip);
511
+ async function writeGzipJson(filePath, value) {
512
+ const payload = JSON.stringify(value, null, 2);
513
+ await writeFile4(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
514
+ }
515
+ async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
516
+ if (await fileExists(gzipPath)) {
517
+ const payload = await readFile4(gzipPath);
518
+ return JSON.parse((await gunzipAsync(payload)).toString("utf8"));
519
+ }
520
+ if (legacyPath && await fileExists(legacyPath)) {
521
+ return JSON.parse(await readFile4(legacyPath, "utf8"));
522
+ }
523
+ return JSON.parse(await readFile4(gzipPath, "utf8"));
524
+ }
525
+ async function resolveExistingGzipOrFilePath(gzipPath, legacyPath) {
526
+ if (await fileExists(gzipPath)) {
527
+ return gzipPath;
528
+ }
529
+ if (legacyPath && await fileExists(legacyPath)) {
530
+ return legacyPath;
531
+ }
532
+ return gzipPath;
533
+ }
534
+
535
+ // src/vector/store.ts
495
536
  function vectorsDir(workspacePath) {
496
537
  return path7.join(workspacePath, "vectors");
497
538
  }
@@ -499,15 +540,27 @@ function sharedModelStateDir() {
499
540
  return path7.join(resolveQliHomeDir(), "models", "status");
500
541
  }
501
542
  function denseVectorPath(workspacePath) {
502
- return path7.join(vectorsDir(workspacePath), "dense.latest.json");
543
+ return path7.join(vectorsDir(workspacePath), "dense.latest.json.gz");
503
544
  }
504
545
  function denseMetaPath(workspacePath) {
505
- return path7.join(vectorsDir(workspacePath), "dense.latest.meta.json");
546
+ return path7.join(vectorsDir(workspacePath), "dense.latest.meta.json.gz");
506
547
  }
507
548
  function sparseVectorPath(workspacePath) {
508
- return path7.join(vectorsDir(workspacePath), "sparse.latest.json");
549
+ return path7.join(vectorsDir(workspacePath), "sparse.latest.json.gz");
509
550
  }
510
551
  function sparseMetaPath(workspacePath) {
552
+ return path7.join(vectorsDir(workspacePath), "sparse.latest.meta.json.gz");
553
+ }
554
+ function legacyDenseVectorPath(workspacePath) {
555
+ return path7.join(vectorsDir(workspacePath), "dense.latest.json");
556
+ }
557
+ function legacyDenseMetaPath(workspacePath) {
558
+ return path7.join(vectorsDir(workspacePath), "dense.latest.meta.json");
559
+ }
560
+ function legacySparseVectorPath(workspacePath) {
561
+ return path7.join(vectorsDir(workspacePath), "sparse.latest.json");
562
+ }
563
+ function legacySparseMetaPath(workspacePath) {
511
564
  return path7.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
512
565
  }
513
566
  function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
@@ -523,29 +576,37 @@ function sparsePullMarker(workspacePath, modelId, cacheDir) {
523
576
  }
524
577
  async function writeDensePayload(workspacePath, payload) {
525
578
  await mkdir3(vectorsDir(workspacePath), { recursive: true });
526
- await writeFile3(denseVectorPath(workspacePath), JSON.stringify(payload, null, 2), "utf8");
527
- await writeFile3(denseMetaPath(workspacePath), JSON.stringify(payload.metadata, null, 2), "utf8");
579
+ await writeGzipJson(denseVectorPath(workspacePath), payload);
580
+ await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
581
+ await Promise.all([
582
+ rm2(legacyDenseVectorPath(workspacePath), { force: true }),
583
+ rm2(legacyDenseMetaPath(workspacePath), { force: true })
584
+ ]);
528
585
  }
529
586
  async function readDensePayload(workspacePath) {
530
- return JSON.parse(await readFile4(denseVectorPath(workspacePath), "utf8"));
587
+ return readJsonFromGzipOrFile(denseVectorPath(workspacePath), legacyDenseVectorPath(workspacePath));
531
588
  }
532
589
  async function writeSparsePayload(workspacePath, payload) {
533
590
  await mkdir3(vectorsDir(workspacePath), { recursive: true });
534
- await writeFile3(sparseVectorPath(workspacePath), JSON.stringify(payload, null, 2), "utf8");
535
- await writeFile3(sparseMetaPath(workspacePath), JSON.stringify(payload.metadata, null, 2), "utf8");
591
+ await writeGzipJson(sparseVectorPath(workspacePath), payload);
592
+ await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
593
+ await Promise.all([
594
+ rm2(legacySparseVectorPath(workspacePath), { force: true }),
595
+ rm2(legacySparseMetaPath(workspacePath), { force: true })
596
+ ]);
536
597
  }
537
598
  async function readSparsePayload(workspacePath) {
538
- return JSON.parse(await readFile4(sparseVectorPath(workspacePath), "utf8"));
599
+ return readJsonFromGzipOrFile(sparseVectorPath(workspacePath), legacySparseVectorPath(workspacePath));
539
600
  }
540
601
  async function writeDensePullMarker(workspacePath, model, value) {
541
602
  const markerPath = densePullMarker(workspacePath, model.modelId, model.cacheDir);
542
603
  await mkdir3(path7.dirname(markerPath), { recursive: true });
543
- await writeFile3(markerPath, JSON.stringify(value, null, 2), "utf8");
604
+ await writeFile5(markerPath, JSON.stringify(value, null, 2), "utf8");
544
605
  }
545
606
  async function writeSparsePullMarker(workspacePath, model, value) {
546
607
  const markerPath = sparsePullMarker(workspacePath, model.modelId, model.cacheDir);
547
608
  await mkdir3(path7.dirname(markerPath), { recursive: true });
548
- await writeFile3(markerPath, JSON.stringify(value, null, 2), "utf8");
609
+ await writeFile5(markerPath, JSON.stringify(value, null, 2), "utf8");
549
610
  }
550
611
  async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
551
612
  const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
@@ -556,7 +617,7 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
556
617
  modelId: dense.modelId,
557
618
  cacheDir: denseCacheDir,
558
619
  available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
559
- artifactExists: await fileExists(denseVectorPath(workspacePath))
620
+ artifactExists: await fileExists(denseVectorPath(workspacePath)) || await fileExists(legacyDenseVectorPath(workspacePath))
560
621
  },
561
622
  sparse: {
562
623
  configured: sparse.enabled,
@@ -564,7 +625,7 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
564
625
  cacheDir: sparseCacheDir,
565
626
  uvAvailable,
566
627
  available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
567
- artifactExists: await fileExists(sparseVectorPath(workspacePath))
628
+ artifactExists: await fileExists(sparseVectorPath(workspacePath)) || await fileExists(legacySparseVectorPath(workspacePath))
568
629
  }
569
630
  };
570
631
  }
@@ -963,31 +1024,63 @@ async function getModelStatus(workspacePath, config) {
963
1024
  }
964
1025
 
965
1026
  // src/index/index-store.ts
966
- import { readFile as readFile5, writeFile as writeFile4 } from "fs/promises";
1027
+ import { mkdir as mkdir6, rm as rm3 } from "fs/promises";
967
1028
  import path10 from "path";
1029
+ function versionedIndexPath(workspacePath, stamp) {
1030
+ return path10.join(workspacePath, "indexes", `${stamp}.json.gz`);
1031
+ }
1032
+ function versionedLegacyIndexPath(workspacePath, stamp) {
1033
+ return path10.join(workspacePath, "indexes", `${stamp}.json`);
1034
+ }
1035
+ function versionedMetaPath(workspacePath, stamp) {
1036
+ return path10.join(workspacePath, "indexes", `${stamp}.meta.json.gz`);
1037
+ }
1038
+ function versionedLegacyMetaPath(workspacePath, stamp) {
1039
+ return path10.join(workspacePath, "indexes", `${stamp}.meta.json`);
1040
+ }
1041
+ function latestIndexPath(workspacePath) {
1042
+ return path10.join(workspacePath, "indexes", "latest.json.gz");
1043
+ }
1044
+ function legacyLatestIndexPath(workspacePath) {
1045
+ return path10.join(workspacePath, "indexes", "latest.json");
1046
+ }
1047
+ function latestMetaPath(workspacePath) {
1048
+ return path10.join(workspacePath, "indexes", "latest.meta.json.gz");
1049
+ }
1050
+ function legacyLatestMetaPath(workspacePath) {
1051
+ return path10.join(workspacePath, "indexes", "latest.meta.json");
1052
+ }
968
1053
  async function writeIndexArtifacts({
969
1054
  workspacePath,
970
1055
  indexState,
971
1056
  metadata
972
1057
  }) {
973
1058
  const stamp = metadata.createdAt.replace(/[:.]/g, "-");
974
- const indexPath = path10.join(workspacePath, "indexes", `${stamp}.json`);
975
- const metaPath = path10.join(workspacePath, "indexes", `${stamp}.meta.json`);
976
- const latestIndexPath = path10.join(workspacePath, "indexes", "latest.json");
977
- const latestMetaPath = path10.join(workspacePath, "indexes", "latest.meta.json");
978
- const indexPayload = JSON.stringify(indexState, null, 2);
979
- const metaPayload = JSON.stringify(metadata, null, 2);
980
- await writeFile4(indexPath, indexPayload, "utf8");
981
- await writeFile4(metaPath, metaPayload, "utf8");
982
- await writeFile4(latestIndexPath, indexPayload, "utf8");
983
- await writeFile4(latestMetaPath, metaPayload, "utf8");
984
- return { indexPath: latestIndexPath, metadataPath: latestMetaPath };
1059
+ const indexPath = versionedIndexPath(workspacePath, stamp);
1060
+ const metaPath = versionedMetaPath(workspacePath, stamp);
1061
+ const latestIndexArtifactPath = latestIndexPath(workspacePath);
1062
+ const latestMetadataArtifactPath = latestMetaPath(workspacePath);
1063
+ await mkdir6(path10.join(workspacePath, "indexes"), { recursive: true });
1064
+ await writeGzipJson(indexPath, indexState);
1065
+ await writeGzipJson(metaPath, metadata);
1066
+ await writeGzipJson(latestIndexArtifactPath, indexState);
1067
+ await writeGzipJson(latestMetadataArtifactPath, metadata);
1068
+ await Promise.all([
1069
+ rm3(legacyLatestIndexPath(workspacePath), { force: true }),
1070
+ rm3(legacyLatestMetaPath(workspacePath), { force: true }),
1071
+ rm3(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
1072
+ rm3(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
1073
+ ]);
1074
+ return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
985
1075
  }
986
1076
  async function readLatestIndexState(workspacePath) {
987
- return JSON.parse(await readFile5(path10.join(workspacePath, "indexes", "latest.json"), "utf8"));
1077
+ return readJsonFromGzipOrFile(latestIndexPath(workspacePath), legacyLatestIndexPath(workspacePath));
988
1078
  }
989
1079
  async function readLatestIndexMetadata(workspacePath) {
990
- return JSON.parse(await readFile5(path10.join(workspacePath, "indexes", "latest.meta.json"), "utf8"));
1080
+ return readJsonFromGzipOrFile(latestMetaPath(workspacePath), legacyLatestMetaPath(workspacePath));
1081
+ }
1082
+ async function resolveLatestIndexArtifactPath(workspacePath) {
1083
+ return resolveExistingGzipOrFilePath(latestIndexPath(workspacePath), legacyLatestIndexPath(workspacePath));
991
1084
  }
992
1085
 
993
1086
  // src/index/querylight-indexer.ts
@@ -1178,7 +1271,7 @@ async function removeSource(workspacePath, sourceId) {
1178
1271
  }
1179
1272
 
1180
1273
  // src/ingest/document-utils.ts
1181
- import { mkdir as mkdir6, rm, writeFile as writeFile5 } from "fs/promises";
1274
+ import { mkdir as mkdir7, rm as rm4, writeFile as writeFile6 } from "fs/promises";
1182
1275
  import path14 from "path";
1183
1276
 
1184
1277
  // src/normalize/normalize-markdown.ts
@@ -1231,8 +1324,8 @@ async function writeNormalizedDocument({
1231
1324
  normalizedPath,
1232
1325
  markdown
1233
1326
  }) {
1234
- await mkdir6(path14.dirname(normalizedPath), { recursive: true });
1235
- await writeFile5(
1327
+ await mkdir7(path14.dirname(normalizedPath), { recursive: true });
1328
+ await writeFile6(
1236
1329
  normalizedPath,
1237
1330
  withFrontmatter(
1238
1331
  {
@@ -1254,8 +1347,8 @@ async function writeNormalizedDocument({
1254
1347
  }
1255
1348
  async function deleteDocumentArtifacts(document) {
1256
1349
  await Promise.all([
1257
- document.rawPath ? rm(document.rawPath, { force: true }) : Promise.resolve(),
1258
- rm(document.normalizedPath, { force: true })
1350
+ document.rawPath ? rm4(document.rawPath, { force: true }) : Promise.resolve(),
1351
+ rm4(document.normalizedPath, { force: true })
1259
1352
  ]);
1260
1353
  }
1261
1354
 
@@ -1279,7 +1372,7 @@ async function listDirectoryFiles(source) {
1279
1372
 
1280
1373
  // src/ingest/adapters/file-adapter.ts
1281
1374
  import { basename, extname, resolve } from "path";
1282
- import { mkdir as mkdir7, readFile as readFile9, stat as stat3, writeFile as writeFile6 } from "fs/promises";
1375
+ import { mkdir as mkdir8, readFile as readFile8, stat as stat3, writeFile as writeFile7 } from "fs/promises";
1283
1376
 
1284
1377
  // src/ingest/extractors/docx-extractor.ts
1285
1378
  import mammoth from "mammoth";
@@ -1453,16 +1546,16 @@ function extractPublicationDateFromHtml(html) {
1453
1546
  }
1454
1547
 
1455
1548
  // src/ingest/extractors/markdown-extractor.ts
1456
- import { readFile as readFile6 } from "fs/promises";
1549
+ import { readFile as readFile5 } from "fs/promises";
1457
1550
  async function extractMarkdown(filePath) {
1458
- return readFile6(filePath, "utf8");
1551
+ return readFile5(filePath, "utf8");
1459
1552
  }
1460
1553
 
1461
1554
  // src/ingest/extractors/pdf-extractor.ts
1462
- import { readFile as readFile7 } from "fs/promises";
1555
+ import { readFile as readFile6 } from "fs/promises";
1463
1556
  import { PDFParse } from "pdf-parse";
1464
1557
  async function extractPdf(filePath) {
1465
- const buffer = await readFile7(filePath);
1558
+ const buffer = await readFile6(filePath);
1466
1559
  const parser = new PDFParse({ data: buffer });
1467
1560
  try {
1468
1561
  const parsed = await parser.getText();
@@ -1473,9 +1566,9 @@ async function extractPdf(filePath) {
1473
1566
  }
1474
1567
 
1475
1568
  // src/ingest/extractors/text-extractor.ts
1476
- import { readFile as readFile8 } from "fs/promises";
1569
+ import { readFile as readFile7 } from "fs/promises";
1477
1570
  async function extractText(filePath) {
1478
- return readFile8(filePath, "utf8");
1571
+ return readFile7(filePath, "utf8");
1479
1572
  }
1480
1573
 
1481
1574
  // src/ingest/adapters/file-adapter.ts
@@ -1510,7 +1603,7 @@ async function extractFileContent(filePath, mimeType) {
1510
1603
  ${text}`, raw: text };
1511
1604
  }
1512
1605
  if (mimeType === "text/html") {
1513
- const raw = await readFile9(filePath, "utf8");
1606
+ const raw = await readFile8(filePath, "utf8");
1514
1607
  const extracted = extractHtmlToMarkdown(raw);
1515
1608
  return { title: extracted.title, markdown: `# ${extracted.title}
1516
1609
 
@@ -1566,10 +1659,10 @@ async function ingestFile({
1566
1659
  const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
1567
1660
  const indexedAt = now;
1568
1661
  const crawledAt = now;
1569
- await mkdir7(resolve(workspacePath, "normalized"), { recursive: true });
1570
- await mkdir7(resolve(workspacePath, "raw", source.id), { recursive: true });
1662
+ await mkdir8(resolve(workspacePath, "normalized"), { recursive: true });
1663
+ await mkdir8(resolve(workspacePath, "raw", source.id), { recursive: true });
1571
1664
  if (extracted.raw) {
1572
- await writeFile6(rawPath, extracted.raw, "utf8");
1665
+ await writeFile7(rawPath, extracted.raw, "utf8");
1573
1666
  }
1574
1667
  await writeNormalizedDocument({
1575
1668
  documentId,
@@ -1632,7 +1725,7 @@ ${content}`;
1632
1725
  const now = (/* @__PURE__ */ new Date()).toISOString();
1633
1726
  const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
1634
1727
  const indexedAt = now;
1635
- await mkdir7(resolve(workspacePath, "normalized"), { recursive: true });
1728
+ await mkdir8(resolve(workspacePath, "normalized"), { recursive: true });
1636
1729
  await writeNormalizedDocument({
1637
1730
  documentId,
1638
1731
  sourceId: source.id,
@@ -1676,7 +1769,7 @@ async function reprocessStoredDocument(document, source) {
1676
1769
  if (!document.rawPath) {
1677
1770
  return null;
1678
1771
  }
1679
- const raw = await readFile9(document.rawPath, "utf8");
1772
+ const raw = await readFile8(document.rawPath, "utf8");
1680
1773
  const fallbackTitle = document.title || basename(document.uri);
1681
1774
  const extracted = await extractRawContent(raw, document.mimeType, fallbackTitle);
1682
1775
  const contentHash = sha256(extracted.markdown);
@@ -1793,7 +1886,7 @@ async function parseRssFeedDocument(xml, source) {
1793
1886
  }
1794
1887
 
1795
1888
  // src/ingest/adapters/url-adapter.ts
1796
- import { mkdir as mkdir8, readFile as readFile10, writeFile as writeFile7 } from "fs/promises";
1889
+ import { mkdir as mkdir9, readFile as readFile9, writeFile as writeFile8 } from "fs/promises";
1797
1890
  import path16 from "path";
1798
1891
 
1799
1892
  // src/core/urls.ts
@@ -1845,8 +1938,8 @@ ${extracted.markdown}`;
1845
1938
  const indexedAt = now;
1846
1939
  const crawledAt = now;
1847
1940
  const resolvedPublicationDate = choosePublicationDate(publicationDate, extractPublicationDateFromHtml(body), previous?.publicationDate);
1848
- await mkdir8(path16.resolve(workspacePath, "raw", source.id), { recursive: true });
1849
- await writeFile7(rawPath, body, "utf8");
1941
+ await mkdir9(path16.resolve(workspacePath, "raw", source.id), { recursive: true });
1942
+ await writeFile8(rawPath, body, "utf8");
1850
1943
  await writeNormalizedDocument({
1851
1944
  documentId,
1852
1945
  sourceId: source.id,
@@ -1966,7 +2059,7 @@ async function reprocessRemoteDocument(document, source) {
1966
2059
  if (!document.rawPath || !await fileExists(document.rawPath)) {
1967
2060
  return null;
1968
2061
  }
1969
- const raw = await readFile10(document.rawPath, "utf8");
2062
+ const raw = await readFile9(document.rawPath, "utf8");
1970
2063
  const extracted = extractHtmlToMarkdown(raw);
1971
2064
  const markdown = `# ${extracted.title}
1972
2065
 
@@ -2684,7 +2777,7 @@ async function discoverWebsiteFeed(websiteUrl, userAgent) {
2684
2777
  }
2685
2778
 
2686
2779
  // src/query/search-service.ts
2687
- import { readFile as readFile11 } from "fs/promises";
2780
+ import { readFile as readFile10 } from "fs/promises";
2688
2781
  import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2689
2782
  import path18 from "path";
2690
2783
  async function loadHydratedIndex(workspacePath) {
@@ -2914,7 +3007,7 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
2914
3007
  if (!await fileExists(document.normalizedPath)) {
2915
3008
  return buildSnippet(chunk.text, query);
2916
3009
  }
2917
- const raw = await readFile11(document.normalizedPath, "utf8");
3010
+ const raw = await readFile10(document.normalizedPath, "utf8");
2918
3011
  orderedChunks = buildChunksForDocument(document, raw, config);
2919
3012
  orderedChunkCache.set(document.id, orderedChunks);
2920
3013
  }
@@ -4210,7 +4303,7 @@ Examples:
4210
4303
  try {
4211
4304
  const meta = await readLatestIndexMetadata(workspace);
4212
4305
  latestIndex = meta.createdAt;
4213
- indexSize = (await stat4(`${workspace}/indexes/latest.json`)).size;
4306
+ indexSize = (await stat4(await resolveLatestIndexArtifactPath(workspace))).size;
4214
4307
  } catch {
4215
4308
  latestIndex = void 0;
4216
4309
  }
@@ -1,5 +1,5 @@
1
1
  export declare const PACKAGE_NAME = "@tryformation/querylight-cli";
2
- export declare const PACKAGE_VERSION = "0.2.0";
2
+ export declare const PACKAGE_VERSION = "0.2.1";
3
3
  export declare const DEFAULT_WORKSPACE = ".kb";
4
4
  export declare const DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
5
5
  export declare const LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
@@ -0,0 +1,3 @@
1
+ export declare function writeGzipJson(filePath: string, value: unknown): Promise<void>;
2
+ export declare function readJsonFromGzipOrFile<T>(gzipPath: string, legacyPath?: string): Promise<T>;
3
+ export declare function resolveExistingGzipOrFilePath(gzipPath: string, legacyPath?: string): Promise<string>;
@@ -1,4 +1,6 @@
1
1
  import type { IndexMetadata } from "../types/models.js";
2
+ export declare function latestIndexPath(workspacePath: string): string;
3
+ export declare function latestMetaPath(workspacePath: string): string;
2
4
  export declare function writeIndexArtifacts({ workspacePath, indexState, metadata }: {
3
5
  workspacePath: string;
4
6
  indexState: object;
@@ -9,3 +11,4 @@ export declare function writeIndexArtifacts({ workspacePath, indexState, metadat
9
11
  }>;
10
12
  export declare function readLatestIndexState(workspacePath: string): Promise<object>;
11
13
  export declare function readLatestIndexMetadata(workspacePath: string): Promise<IndexMetadata>;
14
+ export declare function resolveLatestIndexArtifactPath(workspacePath: string): Promise<string>;
package/dist/index.js CHANGED
@@ -1795,6 +1795,8 @@ import os from "os";
1795
1795
  import path12 from "path";
1796
1796
  import { fileURLToPath } from "url";
1797
1797
  import { execFile, execFileSync } from "child_process";
1798
+ import { mkdtemp, rm as rm2, writeFile as writeFile6 } from "fs/promises";
1799
+ var sparseExecFileSync = execFileSync;
1798
1800
  function resolveQliHomeDir() {
1799
1801
  return path12.resolve(process.env.QLI_HOME ?? path12.join(os.homedir(), ".qli"));
1800
1802
  }
@@ -1850,29 +1852,36 @@ async function runSparsePython({
1850
1852
  }) {
1851
1853
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
1852
1854
  const scriptPath = await sparseScriptPath(importMetaUrl);
1853
- return execFileSync(
1854
- "uv",
1855
- [
1856
- "run",
1857
- "--with",
1858
- "torch",
1859
- "--with",
1860
- "transformers",
1861
- "--with",
1862
- "huggingface_hub",
1863
- "python",
1864
- scriptPath
1865
- ],
1866
- {
1867
- encoding: "utf8",
1868
- maxBuffer: 1024 * 1024 * 1024,
1869
- input: JSON.stringify(payload),
1870
- env: {
1871
- ...process.env,
1872
- HF_HOME: cacheDir
1855
+ const payloadDir = await mkdtemp(path12.join(os.tmpdir(), "qli-sparse-"));
1856
+ const payloadPath = path12.join(payloadDir, "payload.json");
1857
+ await writeFile6(payloadPath, JSON.stringify(payload), "utf8");
1858
+ try {
1859
+ return sparseExecFileSync(
1860
+ "uv",
1861
+ [
1862
+ "run",
1863
+ "--with",
1864
+ "torch",
1865
+ "--with",
1866
+ "transformers",
1867
+ "--with",
1868
+ "huggingface_hub",
1869
+ "python",
1870
+ scriptPath,
1871
+ payloadPath
1872
+ ],
1873
+ {
1874
+ encoding: "utf8",
1875
+ maxBuffer: 1024 * 1024 * 1024,
1876
+ env: {
1877
+ ...process.env,
1878
+ HF_HOME: cacheDir
1879
+ }
1873
1880
  }
1874
- }
1875
- );
1881
+ );
1882
+ } finally {
1883
+ await rm2(payloadDir, { recursive: true, force: true });
1884
+ }
1876
1885
  }
1877
1886
  async function getDenseTransformersRuntime(cacheDir) {
1878
1887
  const transformers = await import("@huggingface/transformers");
@@ -1885,8 +1894,31 @@ async function getDenseTransformersRuntime(cacheDir) {
1885
1894
  }
1886
1895
 
1887
1896
  // src/vector/store.ts
1888
- import { mkdir as mkdir6, readFile as readFile9, writeFile as writeFile6 } from "fs/promises";
1897
+ import { mkdir as mkdir6, rm as rm3, writeFile as writeFile8 } from "fs/promises";
1889
1898
  import path13 from "path";
1899
+
1900
+ // src/core/gzip-json.ts
1901
+ import { readFile as readFile9, writeFile as writeFile7 } from "fs/promises";
1902
+ import { promisify } from "util";
1903
+ import { gunzip, gzip } from "zlib";
1904
+ var gzipAsync = promisify(gzip);
1905
+ var gunzipAsync = promisify(gunzip);
1906
+ async function writeGzipJson(filePath, value) {
1907
+ const payload = JSON.stringify(value, null, 2);
1908
+ await writeFile7(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
1909
+ }
1910
+ async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
1911
+ if (await fileExists(gzipPath)) {
1912
+ const payload = await readFile9(gzipPath);
1913
+ return JSON.parse((await gunzipAsync(payload)).toString("utf8"));
1914
+ }
1915
+ if (legacyPath && await fileExists(legacyPath)) {
1916
+ return JSON.parse(await readFile9(legacyPath, "utf8"));
1917
+ }
1918
+ return JSON.parse(await readFile9(gzipPath, "utf8"));
1919
+ }
1920
+
1921
+ // src/vector/store.ts
1890
1922
  function vectorsDir(workspacePath) {
1891
1923
  return path13.join(workspacePath, "vectors");
1892
1924
  }
@@ -1894,15 +1926,27 @@ function sharedModelStateDir() {
1894
1926
  return path13.join(resolveQliHomeDir(), "models", "status");
1895
1927
  }
1896
1928
  function denseVectorPath(workspacePath) {
1897
- return path13.join(vectorsDir(workspacePath), "dense.latest.json");
1929
+ return path13.join(vectorsDir(workspacePath), "dense.latest.json.gz");
1898
1930
  }
1899
1931
  function denseMetaPath(workspacePath) {
1900
- return path13.join(vectorsDir(workspacePath), "dense.latest.meta.json");
1932
+ return path13.join(vectorsDir(workspacePath), "dense.latest.meta.json.gz");
1901
1933
  }
1902
1934
  function sparseVectorPath(workspacePath) {
1903
- return path13.join(vectorsDir(workspacePath), "sparse.latest.json");
1935
+ return path13.join(vectorsDir(workspacePath), "sparse.latest.json.gz");
1904
1936
  }
1905
1937
  function sparseMetaPath(workspacePath) {
1938
+ return path13.join(vectorsDir(workspacePath), "sparse.latest.meta.json.gz");
1939
+ }
1940
+ function legacyDenseVectorPath(workspacePath) {
1941
+ return path13.join(vectorsDir(workspacePath), "dense.latest.json");
1942
+ }
1943
+ function legacyDenseMetaPath(workspacePath) {
1944
+ return path13.join(vectorsDir(workspacePath), "dense.latest.meta.json");
1945
+ }
1946
+ function legacySparseVectorPath(workspacePath) {
1947
+ return path13.join(vectorsDir(workspacePath), "sparse.latest.json");
1948
+ }
1949
+ function legacySparseMetaPath(workspacePath) {
1906
1950
  return path13.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
1907
1951
  }
1908
1952
  function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
@@ -1918,19 +1962,27 @@ function sparsePullMarker(workspacePath, modelId, cacheDir) {
1918
1962
  }
1919
1963
  async function writeDensePayload(workspacePath, payload) {
1920
1964
  await mkdir6(vectorsDir(workspacePath), { recursive: true });
1921
- await writeFile6(denseVectorPath(workspacePath), JSON.stringify(payload, null, 2), "utf8");
1922
- await writeFile6(denseMetaPath(workspacePath), JSON.stringify(payload.metadata, null, 2), "utf8");
1965
+ await writeGzipJson(denseVectorPath(workspacePath), payload);
1966
+ await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
1967
+ await Promise.all([
1968
+ rm3(legacyDenseVectorPath(workspacePath), { force: true }),
1969
+ rm3(legacyDenseMetaPath(workspacePath), { force: true })
1970
+ ]);
1923
1971
  }
1924
1972
  async function readDensePayload(workspacePath) {
1925
- return JSON.parse(await readFile9(denseVectorPath(workspacePath), "utf8"));
1973
+ return readJsonFromGzipOrFile(denseVectorPath(workspacePath), legacyDenseVectorPath(workspacePath));
1926
1974
  }
1927
1975
  async function writeSparsePayload(workspacePath, payload) {
1928
1976
  await mkdir6(vectorsDir(workspacePath), { recursive: true });
1929
- await writeFile6(sparseVectorPath(workspacePath), JSON.stringify(payload, null, 2), "utf8");
1930
- await writeFile6(sparseMetaPath(workspacePath), JSON.stringify(payload.metadata, null, 2), "utf8");
1977
+ await writeGzipJson(sparseVectorPath(workspacePath), payload);
1978
+ await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
1979
+ await Promise.all([
1980
+ rm3(legacySparseVectorPath(workspacePath), { force: true }),
1981
+ rm3(legacySparseMetaPath(workspacePath), { force: true })
1982
+ ]);
1931
1983
  }
1932
1984
  async function readSparsePayload(workspacePath) {
1933
- return JSON.parse(await readFile9(sparseVectorPath(workspacePath), "utf8"));
1985
+ return readJsonFromGzipOrFile(sparseVectorPath(workspacePath), legacySparseVectorPath(workspacePath));
1934
1986
  }
1935
1987
  async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
1936
1988
  const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
@@ -1941,7 +1993,7 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
1941
1993
  modelId: dense.modelId,
1942
1994
  cacheDir: denseCacheDir,
1943
1995
  available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
1944
- artifactExists: await fileExists(denseVectorPath(workspacePath))
1996
+ artifactExists: await fileExists(denseVectorPath(workspacePath)) || await fileExists(legacyDenseVectorPath(workspacePath))
1945
1997
  },
1946
1998
  sparse: {
1947
1999
  configured: sparse.enabled,
@@ -1949,7 +2001,7 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
1949
2001
  cacheDir: sparseCacheDir,
1950
2002
  uvAvailable,
1951
2003
  available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
1952
- artifactExists: await fileExists(sparseVectorPath(workspacePath))
2004
+ artifactExists: await fileExists(sparseVectorPath(workspacePath)) || await fileExists(legacySparseVectorPath(workspacePath))
1953
2005
  }
1954
2006
  };
1955
2007
  }
@@ -2266,28 +2318,57 @@ async function buildVectorArtifacts({
2266
2318
  }
2267
2319
 
2268
2320
  // src/index/index-store.ts
2269
- import { readFile as readFile10, writeFile as writeFile7 } from "fs/promises";
2321
+ import { mkdir as mkdir9, rm as rm4 } from "fs/promises";
2270
2322
  import path16 from "path";
2323
+ function versionedIndexPath(workspacePath, stamp) {
2324
+ return path16.join(workspacePath, "indexes", `${stamp}.json.gz`);
2325
+ }
2326
+ function versionedLegacyIndexPath(workspacePath, stamp) {
2327
+ return path16.join(workspacePath, "indexes", `${stamp}.json`);
2328
+ }
2329
+ function versionedMetaPath(workspacePath, stamp) {
2330
+ return path16.join(workspacePath, "indexes", `${stamp}.meta.json.gz`);
2331
+ }
2332
+ function versionedLegacyMetaPath(workspacePath, stamp) {
2333
+ return path16.join(workspacePath, "indexes", `${stamp}.meta.json`);
2334
+ }
2335
+ function latestIndexPath(workspacePath) {
2336
+ return path16.join(workspacePath, "indexes", "latest.json.gz");
2337
+ }
2338
+ function legacyLatestIndexPath(workspacePath) {
2339
+ return path16.join(workspacePath, "indexes", "latest.json");
2340
+ }
2341
+ function latestMetaPath(workspacePath) {
2342
+ return path16.join(workspacePath, "indexes", "latest.meta.json.gz");
2343
+ }
2344
+ function legacyLatestMetaPath(workspacePath) {
2345
+ return path16.join(workspacePath, "indexes", "latest.meta.json");
2346
+ }
2271
2347
  async function writeIndexArtifacts({
2272
2348
  workspacePath,
2273
2349
  indexState,
2274
2350
  metadata
2275
2351
  }) {
2276
2352
  const stamp = metadata.createdAt.replace(/[:.]/g, "-");
2277
- const indexPath = path16.join(workspacePath, "indexes", `${stamp}.json`);
2278
- const metaPath = path16.join(workspacePath, "indexes", `${stamp}.meta.json`);
2279
- const latestIndexPath = path16.join(workspacePath, "indexes", "latest.json");
2280
- const latestMetaPath = path16.join(workspacePath, "indexes", "latest.meta.json");
2281
- const indexPayload = JSON.stringify(indexState, null, 2);
2282
- const metaPayload = JSON.stringify(metadata, null, 2);
2283
- await writeFile7(indexPath, indexPayload, "utf8");
2284
- await writeFile7(metaPath, metaPayload, "utf8");
2285
- await writeFile7(latestIndexPath, indexPayload, "utf8");
2286
- await writeFile7(latestMetaPath, metaPayload, "utf8");
2287
- return { indexPath: latestIndexPath, metadataPath: latestMetaPath };
2353
+ const indexPath = versionedIndexPath(workspacePath, stamp);
2354
+ const metaPath = versionedMetaPath(workspacePath, stamp);
2355
+ const latestIndexArtifactPath = latestIndexPath(workspacePath);
2356
+ const latestMetadataArtifactPath = latestMetaPath(workspacePath);
2357
+ await mkdir9(path16.join(workspacePath, "indexes"), { recursive: true });
2358
+ await writeGzipJson(indexPath, indexState);
2359
+ await writeGzipJson(metaPath, metadata);
2360
+ await writeGzipJson(latestIndexArtifactPath, indexState);
2361
+ await writeGzipJson(latestMetadataArtifactPath, metadata);
2362
+ await Promise.all([
2363
+ rm4(legacyLatestIndexPath(workspacePath), { force: true }),
2364
+ rm4(legacyLatestMetaPath(workspacePath), { force: true }),
2365
+ rm4(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
2366
+ rm4(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
2367
+ ]);
2368
+ return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
2288
2369
  }
2289
2370
  async function readLatestIndexState(workspacePath) {
2290
- return JSON.parse(await readFile10(path16.join(workspacePath, "indexes", "latest.json"), "utf8"));
2371
+ return readJsonFromGzipOrFile(latestIndexPath(workspacePath), legacyLatestIndexPath(workspacePath));
2291
2372
  }
2292
2373
 
2293
2374
  // src/index/querylight-indexer.ts
@@ -2387,7 +2468,7 @@ async function buildIndex({
2387
2468
  }
2388
2469
 
2389
2470
  // src/query/search-service.ts
2390
- import { readFile as readFile11 } from "fs/promises";
2471
+ import { readFile as readFile10 } from "fs/promises";
2391
2472
  import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2392
2473
  import path18 from "path";
2393
2474
  async function loadHydratedIndex(workspacePath) {
@@ -2617,7 +2698,7 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
2617
2698
  if (!await fileExists(document.normalizedPath)) {
2618
2699
  return buildSnippet(chunk.text, query);
2619
2700
  }
2620
- const raw = await readFile11(document.normalizedPath, "utf8");
2701
+ const raw = await readFile10(document.normalizedPath, "utf8");
2621
2702
  orderedChunks = buildChunksForDocument(document, raw, config);
2622
2703
  orderedChunkCache.set(document.id, orderedChunks);
2623
2704
  }
@@ -1,4 +1,11 @@
1
1
  import type { SparseVectorModelConfig } from "../types/models.js";
2
+ type SparseExecOptions = {
3
+ encoding: BufferEncoding;
4
+ maxBuffer: number;
5
+ env: NodeJS.ProcessEnv;
6
+ };
7
+ type SparseExecFileSync = (file: string, args: string[], options: SparseExecOptions) => string;
8
+ export declare function setSparseExecFileSyncForTests(fn: SparseExecFileSync | null): void;
2
9
  export declare function resolveQliHomeDir(): string;
3
10
  export declare function resolveCacheDir(workspacePath: string, configuredPath: string): string;
4
11
  export declare function packageRootFromImportMeta(importMetaUrl: string): string;
@@ -18,3 +25,4 @@ export declare function getDenseTransformersRuntime(cacheDir: string): Promise<{
18
25
  };
19
26
  pipeline: typeof import("@huggingface/transformers").pipeline;
20
27
  }>;
28
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tryformation/querylight-cli",
3
- "version": "0.2.0",
3
+ "version": "0.2.2",
4
4
  "description": "Querylight CLI for building and querying local knowledge bases.",
5
5
  "license": "MIT",
6
6
  "homepage": "https://github.com/formation-res/querylight-cli#readme",
@@ -88,7 +88,11 @@ def encode_documents(model_id: str, top_tokens: int, documents):
88
88
 
89
89
 
90
90
  def main():
91
- payload = json.load(sys.stdin)
91
+ if len(sys.argv) > 1:
92
+ with open(sys.argv[1], encoding="utf-8") as handle:
93
+ payload = json.load(handle)
94
+ else:
95
+ payload = json.load(sys.stdin)
92
96
  action = payload["action"]
93
97
  model_id = payload["model_id"]
94
98
  if action == "download_only":