@tryformation/querylight-cli 0.2.4 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +5 -0
- package/README.md +50 -2
- package/dist/cli/main.js +333 -167
- package/dist/core/archive.d.ts +18 -0
- package/dist/core/constants.d.ts +2 -2
- package/dist/index.js +81 -19
- package/dist/types/models.d.ts +3 -0
- package/dist/vector/runtime.d.ts +1 -4
- package/package.json +12 -8
- package/scripts/assert-release-version.mjs +48 -0
package/dist/cli/main.js
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
// src/cli/run-cli.ts
|
|
4
4
|
import { Command, Option } from "commander";
|
|
5
|
-
import { readFile as
|
|
6
|
-
import
|
|
5
|
+
import { readFile as readFile12, stat as stat6 } from "fs/promises";
|
|
6
|
+
import path23 from "path";
|
|
7
7
|
|
|
8
8
|
// src/chunk/chunker.ts
|
|
9
9
|
import { readFile as readFile3 } from "fs/promises";
|
|
@@ -16,7 +16,11 @@ import path from "path";
|
|
|
16
16
|
import YAML from "yaml";
|
|
17
17
|
|
|
18
18
|
// src/core/constants.ts
|
|
19
|
-
|
|
19
|
+
import { createRequire } from "module";
|
|
20
|
+
var require2 = createRequire(import.meta.url);
|
|
21
|
+
var packageJson = require2("../../package.json");
|
|
22
|
+
var PACKAGE_NAME = packageJson.name;
|
|
23
|
+
var PACKAGE_VERSION = packageJson.version;
|
|
20
24
|
var DEFAULT_WORKSPACE = ".kb";
|
|
21
25
|
var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
22
26
|
var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
|
@@ -49,6 +53,9 @@ var defaultConfig = () => ({
|
|
|
49
53
|
maxContextChars: 12e3,
|
|
50
54
|
citationStyle: "markdown"
|
|
51
55
|
},
|
|
56
|
+
search: {
|
|
57
|
+
defaultTopK: 50
|
|
58
|
+
},
|
|
52
59
|
retrieval: {
|
|
53
60
|
defaultMode: "lexical",
|
|
54
61
|
dense: {
|
|
@@ -70,12 +77,12 @@ var defaultConfig = () => ({
|
|
|
70
77
|
}
|
|
71
78
|
},
|
|
72
79
|
crawler: {
|
|
73
|
-
defaultUserAgent: "querylight-cli
|
|
80
|
+
defaultUserAgent: "querylight-cli",
|
|
74
81
|
obeyRobotsTxt: true,
|
|
75
82
|
rateLimitMs: 1e3,
|
|
76
83
|
maxConcurrentRequests: 5,
|
|
77
84
|
renderJs: false,
|
|
78
|
-
retentionDays:
|
|
85
|
+
retentionDays: 30,
|
|
79
86
|
fetchArticles: true
|
|
80
87
|
},
|
|
81
88
|
limits: {
|
|
@@ -119,6 +126,10 @@ async function loadConfig(workspacePath, configPath) {
|
|
|
119
126
|
...defaults.rag,
|
|
120
127
|
...parsed.rag ?? {}
|
|
121
128
|
},
|
|
129
|
+
search: {
|
|
130
|
+
...defaults.search,
|
|
131
|
+
...parsed.search ?? {}
|
|
132
|
+
},
|
|
122
133
|
retrieval: {
|
|
123
134
|
...defaults.retrieval,
|
|
124
135
|
...parsed.retrieval ?? {},
|
|
@@ -373,27 +384,138 @@ async function assertWorkspaceExists(workspacePath) {
|
|
|
373
384
|
}
|
|
374
385
|
}
|
|
375
386
|
|
|
387
|
+
// src/core/archive.ts
|
|
388
|
+
import { mkdir as mkdir3, readdir, readFile as readFile4, rm, stat as stat2, writeFile as writeFile3 } from "fs/promises";
|
|
389
|
+
import os from "os";
|
|
390
|
+
import path6 from "path";
|
|
391
|
+
import { unzipSync, zipSync } from "fflate";
|
|
392
|
+
/**
 * Tells whether a workspace path designates a packaged archive.
 * The decision is based only on a case-insensitive ".zip" suffix; the file
 * system is not consulted.
 *
 * @param {string} workspacePath - Path supplied for the workspace.
 * @returns {boolean} True when the path ends in ".zip" (any letter case).
 */
function isWorkspaceArchivePath(workspacePath) {
  const lowered = workspacePath.toLowerCase();
  return lowered.endsWith(".zip");
}
|
|
395
|
+
/**
 * Recursively reads every regular file below `root` into memory.
 *
 * Keys of the returned object are paths relative to `root`, always written
 * with "/" separators; values are the file contents as `Uint8Array`.
 * Any entry whose resolved path equals `outputPath` is skipped, and entries
 * that are neither directories nor regular files are ignored.
 *
 * @param {string} root - Directory to walk.
 * @param {string} outputPath - Path to leave out of the result.
 * @returns {Promise<Record<string, Uint8Array>>} Relative path -> contents.
 */
async function collectFiles(root, outputPath) {
  const skippedPath = path6.resolve(outputPath);
  const collected = {};
  // Depth-first walk: a directory is descended into as soon as it is met, so
  // keys are inserted in the same order as the directory listing yields them.
  const walk = async (directory) => {
    const listing = await readdir(directory, { withFileTypes: true });
    for (const dirent of listing) {
      const fullPath = path6.join(directory, dirent.name);
      if (path6.resolve(fullPath) === skippedPath) {
        continue;
      }
      if (dirent.isDirectory()) {
        await walk(fullPath);
      } else if (dirent.isFile()) {
        const archiveName = path6.relative(root, fullPath).split(path6.sep).join("/");
        collected[archiveName] = new Uint8Array(await readFile4(fullPath));
      }
    }
  };
  await walk(root);
  return collected;
}
|
|
419
|
+
/**
 * Packs a directory workspace into a zip archive at `outputPath`.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace directory; validated with
 *   `assertWorkspaceExists`, whose return value is used as the walk root.
 * @param {string} options.outputPath - Destination archive path (resolved to
 *   an absolute path before use).
 * @param {boolean} [options.force=false] - Allow writing over an existing
 *   path at the destination.
 * @returns {Promise<{workspacePath: string, archivePath: string, fileCount: number, sizeBytes: number}>}
 *   Summary of the archive that was written.
 * @throws {CliError} INVALID_ARGUMENT when the destination already exists and
 *   `force` is false. Errors from probing the destination other than ENOENT
 *   are rethrown unchanged.
 */
async function packageWorkspaceArchive({ workspacePath, outputPath, force = false }) {
  const workspace = await assertWorkspaceExists(workspacePath);
  const archivePath = path6.resolve(outputPath);

  // Probe the destination; only "does not exist" is an acceptable failure.
  let destinationExists = true;
  try {
    await stat2(archivePath);
  } catch (error) {
    if (error.code !== "ENOENT") {
      throw error;
    }
    destinationExists = false;
  }
  if (destinationExists && !force) {
    throw new CliError(`archive already exists: ${archivePath}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
  }

  // The archive path is passed along so an archive living inside the
  // workspace is not packed into itself.
  const files = await collectFiles(workspace, archivePath);
  const archive = zipSync(files, { level: 6 });
  await mkdir3(path6.dirname(archivePath), { recursive: true });
  await writeFile3(archivePath, archive);
  const { size: sizeBytes } = await stat2(archivePath);
  return {
    workspacePath: workspace,
    archivePath,
    fileCount: Object.keys(files).length,
    sizeBytes
  };
}
|
|
451
|
+
/**
 * Rejects archive entry names that could resolve outside the extraction root.
 *
 * Backslashes are treated as separators for the purpose of the check, because
 * the extracted path is later assembled with the platform `path.join`, which
 * honours "\" on Windows. An entry is refused when it:
 *   - is empty or normalizes to "." (names nothing below the root),
 *   - is absolute (leading "/" or "\") or starts with a drive letter ("C:"),
 *   - normalizes to ".." or to something beginning with "../",
 *   - contains a NUL character.
 *
 * @param {string} name - Entry name exactly as stored in the archive.
 * @returns {void}
 * @throws {CliError} WORKSPACE_ERROR when the name is unsafe.
 */
function assertSafeArchiveEntry(name) {
  const unified = name.replace(/\\/g, "/");
  const normalized = path6.posix.normalize(unified);
  const isAbsolute = unified.startsWith("/") || /^[a-zA-Z]:/.test(unified);
  // After normalization any remaining ".." segments sit at the front, so a
  // bare ".." or a "../" prefix means the entry climbs out of the root.
  const climbsOut = normalized === ".." || normalized.startsWith("../");
  const namesNothing = normalized === ".";
  if (isAbsolute || climbsOut || namesNothing || name.includes("\0")) {
    throw new CliError(`unsafe archive entry: ${name}`, "WORKSPACE_ERROR", 3 /* WorkspaceError */);
  }
}
|
|
457
|
+
/**
 * Computes the directory used to cache the extracted form of an archive.
 *
 * The directory name is the first 24 characters of `sha256` applied to the
 * archive's resolved path, size and modification time, so a changed archive
 * maps to a different directory.
 *
 * NOTE(review): the location is a predictable path under the shared OS temp
 * directory; confirm that is acceptable on multi-user hosts.
 *
 * @param {string} archivePath - Path of the archive file (must be stat-able).
 * @returns {Promise<string>} Cache directory under `os.tmpdir()`.
 */
async function archiveCachePath(archivePath) {
  const { size, mtimeMs } = await stat2(archivePath);
  const absoluteArchive = path6.resolve(archivePath);
  const digest = sha256(`${absoluteArchive}:${size}:${mtimeMs}`);
  return path6.join(os.tmpdir(), "qli-workspace-archives", digest.slice(0, 24));
}
|
|
462
|
+
/**
 * Resolves a workspace argument to a directory that can be read from.
 *
 * A plain directory is validated and returned as-is. A ".zip" archive is
 * extracted into a cache directory derived from `archiveCachePath`, and the
 * extracted "workspace" subdirectory is returned together with the archive
 * path. An existing extraction that passes `assertWorkspaceExists` is reused.
 *
 * Differences from the previous implementation:
 *   - the archive is read only on a cache miss, not before the cache check;
 *   - every entry name is validated before anything is written, so an unsafe
 *     archive never touches the cache directory;
 *   - if extraction fails part-way, the cache directory is removed so a later
 *     run cannot pick up a half-written tree (whether it would have been
 *     accepted depends on what `assertWorkspaceExists` checks).
 *
 * @param {string} workspacePath - Directory workspace or ".zip" archive.
 * @returns {Promise<{workspacePath: string, archivePath?: string}>}
 * @throws {CliError} When an archive entry name is unsafe; other errors come
 *   from `assertWorkspaceExists` or the file system.
 */
async function resolveReadableWorkspace(workspacePath) {
  const resolved = path6.resolve(workspacePath);
  if (!isWorkspaceArchivePath(resolved)) {
    return { workspacePath: await assertWorkspaceExists(resolved) };
  }

  const extractRoot = await archiveCachePath(resolved);
  const workspaceRoot = path6.join(extractRoot, "workspace");
  try {
    await assertWorkspaceExists(workspaceRoot);
    return { workspacePath: workspaceRoot, archivePath: resolved };
  } catch {
    // Cache miss (or unusable cache): fall through and extract afresh.
  }

  const archive = await readFile4(resolved);
  const entries = Object.entries(unzipSync(new Uint8Array(archive)));
  // Validate all names up front so nothing is written for a hostile archive.
  for (const [entryName] of entries) {
    assertSafeArchiveEntry(entryName);
  }

  await rm(extractRoot, { recursive: true, force: true });
  await mkdir3(workspaceRoot, { recursive: true });
  // allSettled (rather than all) guarantees no write is still in flight when
  // the cleanup below runs.
  const outcomes = await Promise.allSettled(entries.map(async ([entryName, data]) => {
    const target = path6.join(workspaceRoot, ...entryName.split("/"));
    if (entryName.endsWith("/")) {
      await mkdir3(target, { recursive: true });
      return;
    }
    await mkdir3(path6.dirname(target), { recursive: true });
    await writeFile3(target, Buffer.from(data));
  }));
  const failure = outcomes.find((outcome) => outcome.status === "rejected");
  if (failure) {
    await rm(extractRoot, { recursive: true, force: true });
    throw failure.reason;
  }
  return { workspacePath: await assertWorkspaceExists(workspaceRoot), archivePath: resolved };
}
|
|
490
|
+
/**
 * Resolves a workspace path that is about to be written to, refusing zip
 * archives because they are treated as read-only.
 *
 * @param {string} workspacePath - Workspace path supplied by the caller.
 * @returns {Promise<string>} The absolute form of `workspacePath`.
 * @throws {CliError} WORKSPACE_ERROR when the path designates a ".zip" archive.
 */
async function assertWritableWorkspacePath(workspacePath) {
  const absolutePath = path6.resolve(workspacePath);
  if (!isWorkspaceArchivePath(absolutePath)) {
    return absolutePath;
  }
  throw new CliError("zip workspaces are read-only; package a rebuilt directory workspace instead", "WORKSPACE_ERROR", 3 /* WorkspaceError */);
}
|
|
497
|
+
|
|
376
498
|
// src/index/querylight-indexer.ts
|
|
377
499
|
import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
378
|
-
import
|
|
500
|
+
import path12 from "path";
|
|
379
501
|
|
|
380
502
|
// src/vector/dense.ts
|
|
381
503
|
import { VectorFieldIndex, cosineSimilarity, createSeededRandom } from "@tryformation/querylight-ts";
|
|
382
|
-
import { mkdir as
|
|
383
|
-
import
|
|
504
|
+
import { mkdir as mkdir5 } from "fs/promises";
|
|
505
|
+
import path9 from "path";
|
|
384
506
|
|
|
385
507
|
// src/vector/runtime.ts
|
|
386
|
-
import
|
|
387
|
-
import
|
|
508
|
+
import os2 from "os";
|
|
509
|
+
import path7 from "path";
|
|
388
510
|
import { fileURLToPath } from "url";
|
|
389
511
|
import { execFile, execFileSync } from "child_process";
|
|
390
|
-
import { mkdtemp, rm, writeFile as
|
|
512
|
+
import { mkdtemp, rm as rm2, writeFile as writeFile4 } from "fs/promises";
|
|
391
513
|
|
|
392
514
|
// src/core/files.ts
|
|
393
|
-
import { stat as
|
|
515
|
+
import { stat as stat3 } from "fs/promises";
|
|
394
516
|
async function fileExists(filePath) {
|
|
395
517
|
try {
|
|
396
|
-
await
|
|
518
|
+
await stat3(filePath);
|
|
397
519
|
return true;
|
|
398
520
|
} catch {
|
|
399
521
|
return false;
|
|
@@ -403,35 +525,35 @@ async function fileExists(filePath) {
|
|
|
403
525
|
// src/vector/runtime.ts
|
|
404
526
|
var sparseExecFileSync = execFileSync;
|
|
405
527
|
function resolveQliHomeDir() {
|
|
406
|
-
return
|
|
528
|
+
return path7.resolve(process.env.QLI_HOME ?? path7.join(os2.homedir(), ".qli"));
|
|
407
529
|
}
|
|
408
530
|
function resolveCacheDir(workspacePath, configuredPath) {
|
|
409
531
|
if (configuredPath === "~/.qli") {
|
|
410
532
|
return resolveQliHomeDir();
|
|
411
533
|
}
|
|
412
534
|
if (configuredPath.startsWith("~/.qli/")) {
|
|
413
|
-
return
|
|
535
|
+
return path7.join(resolveQliHomeDir(), configuredPath.slice("~/.qli/".length));
|
|
414
536
|
}
|
|
415
537
|
if (configuredPath === "~") {
|
|
416
|
-
return
|
|
538
|
+
return os2.homedir();
|
|
417
539
|
}
|
|
418
540
|
if (configuredPath.startsWith("~/")) {
|
|
419
|
-
return
|
|
541
|
+
return path7.join(os2.homedir(), configuredPath.slice(2));
|
|
420
542
|
}
|
|
421
|
-
return
|
|
543
|
+
return path7.isAbsolute(configuredPath) ? configuredPath : path7.resolve(workspacePath, configuredPath.replace(/^\.kb\//, ""));
|
|
422
544
|
}
|
|
423
545
|
function packageRootFromImportMeta(importMetaUrl) {
|
|
424
|
-
return
|
|
546
|
+
return path7.resolve(path7.dirname(fileURLToPath(importMetaUrl)), "..");
|
|
425
547
|
}
|
|
426
548
|
async function sparseScriptPath(importMetaUrl) {
|
|
427
549
|
const base = packageRootFromImportMeta(importMetaUrl);
|
|
428
550
|
const candidates = [
|
|
429
|
-
|
|
430
|
-
|
|
551
|
+
path7.join(base, "scripts", "sparse-encode.py"),
|
|
552
|
+
path7.join(base, "..", "scripts", "sparse-encode.py")
|
|
431
553
|
];
|
|
432
554
|
for (const candidate of candidates) {
|
|
433
555
|
if (await fileExists(candidate)) {
|
|
434
|
-
return
|
|
556
|
+
return path7.resolve(candidate);
|
|
435
557
|
}
|
|
436
558
|
}
|
|
437
559
|
throw new Error(`sparse helper script not found; checked ${candidates.join(", ")}`);
|
|
@@ -457,9 +579,9 @@ async function runSparsePython({
|
|
|
457
579
|
}) {
|
|
458
580
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
459
581
|
const scriptPath = await sparseScriptPath(importMetaUrl);
|
|
460
|
-
const payloadDir = await mkdtemp(
|
|
461
|
-
const payloadPath =
|
|
462
|
-
await
|
|
582
|
+
const payloadDir = await mkdtemp(path7.join(os2.tmpdir(), "qli-sparse-"));
|
|
583
|
+
const payloadPath = path7.join(payloadDir, "payload.json");
|
|
584
|
+
await writeFile4(payloadPath, JSON.stringify(payload), "utf8");
|
|
463
585
|
try {
|
|
464
586
|
return sparseExecFileSync(
|
|
465
587
|
"uv",
|
|
@@ -485,7 +607,7 @@ async function runSparsePython({
|
|
|
485
607
|
}
|
|
486
608
|
);
|
|
487
609
|
} finally {
|
|
488
|
-
await
|
|
610
|
+
await rm2(payloadDir, { recursive: true, force: true });
|
|
489
611
|
}
|
|
490
612
|
}
|
|
491
613
|
async function getDenseTransformersRuntime(cacheDir) {
|
|
@@ -499,28 +621,28 @@ async function getDenseTransformersRuntime(cacheDir) {
|
|
|
499
621
|
}
|
|
500
622
|
|
|
501
623
|
// src/vector/store.ts
|
|
502
|
-
import { mkdir as
|
|
503
|
-
import
|
|
624
|
+
import { mkdir as mkdir4, rm as rm3, writeFile as writeFile6 } from "fs/promises";
|
|
625
|
+
import path8 from "path";
|
|
504
626
|
|
|
505
627
|
// src/core/gzip-json.ts
|
|
506
|
-
import { readFile as
|
|
628
|
+
import { readFile as readFile5, writeFile as writeFile5 } from "fs/promises";
|
|
507
629
|
import { promisify } from "util";
|
|
508
630
|
import { gunzip, gzip } from "zlib";
|
|
509
631
|
var gzipAsync = promisify(gzip);
|
|
510
632
|
var gunzipAsync = promisify(gunzip);
|
|
511
633
|
async function writeGzipJson(filePath, value) {
|
|
512
634
|
const payload = JSON.stringify(value, null, 2);
|
|
513
|
-
await
|
|
635
|
+
await writeFile5(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
|
|
514
636
|
}
|
|
515
637
|
async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
|
|
516
638
|
if (await fileExists(gzipPath)) {
|
|
517
|
-
const payload = await
|
|
639
|
+
const payload = await readFile5(gzipPath);
|
|
518
640
|
return JSON.parse((await gunzipAsync(payload)).toString("utf8"));
|
|
519
641
|
}
|
|
520
642
|
if (legacyPath && await fileExists(legacyPath)) {
|
|
521
|
-
return JSON.parse(await
|
|
643
|
+
return JSON.parse(await readFile5(legacyPath, "utf8"));
|
|
522
644
|
}
|
|
523
|
-
return JSON.parse(await
|
|
645
|
+
return JSON.parse(await readFile5(gzipPath, "utf8"));
|
|
524
646
|
}
|
|
525
647
|
async function resolveExistingGzipOrFilePath(gzipPath, legacyPath) {
|
|
526
648
|
if (await fileExists(gzipPath)) {
|
|
@@ -534,39 +656,39 @@ async function resolveExistingGzipOrFilePath(gzipPath, legacyPath) {
|
|
|
534
656
|
|
|
535
657
|
// src/vector/store.ts
|
|
536
658
|
function vectorsDir(workspacePath) {
|
|
537
|
-
return
|
|
659
|
+
return path8.join(workspacePath, "vectors");
|
|
538
660
|
}
|
|
539
661
|
function sharedModelStateDir() {
|
|
540
|
-
return
|
|
662
|
+
return path8.join(resolveQliHomeDir(), "models", "status");
|
|
541
663
|
}
|
|
542
664
|
function denseVectorPath(workspacePath) {
|
|
543
|
-
return
|
|
665
|
+
return path8.join(vectorsDir(workspacePath), "dense.latest.json.gz");
|
|
544
666
|
}
|
|
545
667
|
function denseMetaPath(workspacePath) {
|
|
546
|
-
return
|
|
668
|
+
return path8.join(vectorsDir(workspacePath), "dense.latest.meta.json.gz");
|
|
547
669
|
}
|
|
548
670
|
function sparseVectorPath(workspacePath) {
|
|
549
|
-
return
|
|
671
|
+
return path8.join(vectorsDir(workspacePath), "sparse.latest.json.gz");
|
|
550
672
|
}
|
|
551
673
|
function sparseMetaPath(workspacePath) {
|
|
552
|
-
return
|
|
674
|
+
return path8.join(vectorsDir(workspacePath), "sparse.latest.meta.json.gz");
|
|
553
675
|
}
|
|
554
676
|
function legacyDenseVectorPath(workspacePath) {
|
|
555
|
-
return
|
|
677
|
+
return path8.join(vectorsDir(workspacePath), "dense.latest.json");
|
|
556
678
|
}
|
|
557
679
|
function legacyDenseMetaPath(workspacePath) {
|
|
558
|
-
return
|
|
680
|
+
return path8.join(vectorsDir(workspacePath), "dense.latest.meta.json");
|
|
559
681
|
}
|
|
560
682
|
function legacySparseVectorPath(workspacePath) {
|
|
561
|
-
return
|
|
683
|
+
return path8.join(vectorsDir(workspacePath), "sparse.latest.json");
|
|
562
684
|
}
|
|
563
685
|
function legacySparseMetaPath(workspacePath) {
|
|
564
|
-
return
|
|
686
|
+
return path8.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
|
|
565
687
|
}
|
|
566
688
|
function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
|
|
567
689
|
const resolvedCacheDir = resolveCacheDir(workspacePath, cacheDir);
|
|
568
690
|
const cacheKey = sha256(resolvedCacheDir).slice(0, 16);
|
|
569
|
-
return
|
|
691
|
+
return path8.join(sharedModelStateDir(), type, `${encodeURIComponent(modelId)}.${cacheKey}.json`);
|
|
570
692
|
}
|
|
571
693
|
function densePullMarker(workspacePath, modelId, cacheDir) {
|
|
572
694
|
return pullMarkerPath("dense", workspacePath, modelId, cacheDir);
|
|
@@ -575,24 +697,24 @@ function sparsePullMarker(workspacePath, modelId, cacheDir) {
|
|
|
575
697
|
return pullMarkerPath("sparse", workspacePath, modelId, cacheDir);
|
|
576
698
|
}
|
|
577
699
|
async function writeDensePayload(workspacePath, payload) {
|
|
578
|
-
await
|
|
700
|
+
await mkdir4(vectorsDir(workspacePath), { recursive: true });
|
|
579
701
|
await writeGzipJson(denseVectorPath(workspacePath), payload);
|
|
580
702
|
await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
|
|
581
703
|
await Promise.all([
|
|
582
|
-
|
|
583
|
-
|
|
704
|
+
rm3(legacyDenseVectorPath(workspacePath), { force: true }),
|
|
705
|
+
rm3(legacyDenseMetaPath(workspacePath), { force: true })
|
|
584
706
|
]);
|
|
585
707
|
}
|
|
586
708
|
async function readDensePayload(workspacePath) {
|
|
587
709
|
return readJsonFromGzipOrFile(denseVectorPath(workspacePath), legacyDenseVectorPath(workspacePath));
|
|
588
710
|
}
|
|
589
711
|
async function writeSparsePayload(workspacePath, payload) {
|
|
590
|
-
await
|
|
712
|
+
await mkdir4(vectorsDir(workspacePath), { recursive: true });
|
|
591
713
|
await writeGzipJson(sparseVectorPath(workspacePath), payload);
|
|
592
714
|
await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
|
|
593
715
|
await Promise.all([
|
|
594
|
-
|
|
595
|
-
|
|
716
|
+
rm3(legacySparseVectorPath(workspacePath), { force: true }),
|
|
717
|
+
rm3(legacySparseMetaPath(workspacePath), { force: true })
|
|
596
718
|
]);
|
|
597
719
|
}
|
|
598
720
|
async function readSparsePayload(workspacePath) {
|
|
@@ -600,13 +722,13 @@ async function readSparsePayload(workspacePath) {
|
|
|
600
722
|
}
|
|
601
723
|
async function writeDensePullMarker(workspacePath, model, value) {
|
|
602
724
|
const markerPath = densePullMarker(workspacePath, model.modelId, model.cacheDir);
|
|
603
|
-
await
|
|
604
|
-
await
|
|
725
|
+
await mkdir4(path8.dirname(markerPath), { recursive: true });
|
|
726
|
+
await writeFile6(markerPath, JSON.stringify(value, null, 2), "utf8");
|
|
605
727
|
}
|
|
606
728
|
async function writeSparsePullMarker(workspacePath, model, value) {
|
|
607
729
|
const markerPath = sparsePullMarker(workspacePath, model.modelId, model.cacheDir);
|
|
608
|
-
await
|
|
609
|
-
await
|
|
730
|
+
await mkdir4(path8.dirname(markerPath), { recursive: true });
|
|
731
|
+
await writeFile6(markerPath, JSON.stringify(value, null, 2), "utf8");
|
|
610
732
|
}
|
|
611
733
|
async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
612
734
|
const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
|
|
@@ -709,7 +831,7 @@ function exactDenseQuery(payload, vector, topK) {
|
|
|
709
831
|
}
|
|
710
832
|
async function pullDenseModel(workspacePath, config) {
|
|
711
833
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
712
|
-
await
|
|
834
|
+
await mkdir5(cacheDir, { recursive: true });
|
|
713
835
|
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
714
836
|
try {
|
|
715
837
|
await embedder.embed("warm dense model cache");
|
|
@@ -722,9 +844,9 @@ async function buildDenseVectors({
|
|
|
722
844
|
config,
|
|
723
845
|
progress
|
|
724
846
|
}) {
|
|
725
|
-
const chunks = await readJsonl(
|
|
847
|
+
const chunks = await readJsonl(path9.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
726
848
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
727
|
-
await
|
|
849
|
+
await mkdir5(cacheDir, { recursive: true });
|
|
728
850
|
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
729
851
|
try {
|
|
730
852
|
const records = [];
|
|
@@ -808,8 +930,8 @@ async function denseQuery({
|
|
|
808
930
|
|
|
809
931
|
// src/vector/sparse.ts
|
|
810
932
|
import { SparseVectorFieldIndex } from "@tryformation/querylight-ts";
|
|
811
|
-
import { mkdir as
|
|
812
|
-
import
|
|
933
|
+
import { mkdir as mkdir6 } from "fs/promises";
|
|
934
|
+
import path10 from "path";
|
|
813
935
|
var sparseQueryEncoderFactory = null;
|
|
814
936
|
var sparseDocumentBuilderFactory = null;
|
|
815
937
|
function buildSparseQueryVector(tokenIds, tokenWeights) {
|
|
@@ -854,7 +976,6 @@ async function createSparseQueryEncoder(cacheDir, modelId, queryTokenWeights) {
|
|
|
854
976
|
return async (text) => {
|
|
855
977
|
const features = await tokenizer([text], {
|
|
856
978
|
truncation: true,
|
|
857
|
-
return_attention_mask: false,
|
|
858
979
|
return_token_type_ids: false
|
|
859
980
|
});
|
|
860
981
|
return buildSparseQueryVector(normalizeTokenIds(features.input_ids), queryTokenWeights);
|
|
@@ -863,7 +984,7 @@ async function createSparseQueryEncoder(cacheDir, modelId, queryTokenWeights) {
|
|
|
863
984
|
async function pullSparseModel(workspacePath, config) {
|
|
864
985
|
await ensureUvAvailable();
|
|
865
986
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
866
|
-
await
|
|
987
|
+
await mkdir6(cacheDir, { recursive: true });
|
|
867
988
|
await runSparsePython({
|
|
868
989
|
workspacePath,
|
|
869
990
|
config,
|
|
@@ -914,7 +1035,7 @@ async function buildSparseVectors({
|
|
|
914
1035
|
config,
|
|
915
1036
|
progress
|
|
916
1037
|
}) {
|
|
917
|
-
const chunks = await readJsonl(
|
|
1038
|
+
const chunks = await readJsonl(path10.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
918
1039
|
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for sparse retrieval`);
|
|
919
1040
|
const built = await buildSparseDocuments(workspacePath, config, chunks);
|
|
920
1041
|
reportProgress(progress, "Building sparse vector index");
|
|
@@ -1047,31 +1168,31 @@ async function getModelStatus(workspacePath, config) {
|
|
|
1047
1168
|
}
|
|
1048
1169
|
|
|
1049
1170
|
// src/index/index-store.ts
|
|
1050
|
-
import { mkdir as
|
|
1051
|
-
import
|
|
1171
|
+
import { mkdir as mkdir7, rm as rm4 } from "fs/promises";
|
|
1172
|
+
import path11 from "path";
|
|
1052
1173
|
function versionedIndexPath(workspacePath, stamp) {
|
|
1053
|
-
return
|
|
1174
|
+
return path11.join(workspacePath, "indexes", `${stamp}.json.gz`);
|
|
1054
1175
|
}
|
|
1055
1176
|
function versionedLegacyIndexPath(workspacePath, stamp) {
|
|
1056
|
-
return
|
|
1177
|
+
return path11.join(workspacePath, "indexes", `${stamp}.json`);
|
|
1057
1178
|
}
|
|
1058
1179
|
function versionedMetaPath(workspacePath, stamp) {
|
|
1059
|
-
return
|
|
1180
|
+
return path11.join(workspacePath, "indexes", `${stamp}.meta.json.gz`);
|
|
1060
1181
|
}
|
|
1061
1182
|
function versionedLegacyMetaPath(workspacePath, stamp) {
|
|
1062
|
-
return
|
|
1183
|
+
return path11.join(workspacePath, "indexes", `${stamp}.meta.json`);
|
|
1063
1184
|
}
|
|
1064
1185
|
function latestIndexPath(workspacePath) {
|
|
1065
|
-
return
|
|
1186
|
+
return path11.join(workspacePath, "indexes", "latest.json.gz");
|
|
1066
1187
|
}
|
|
1067
1188
|
function legacyLatestIndexPath(workspacePath) {
|
|
1068
|
-
return
|
|
1189
|
+
return path11.join(workspacePath, "indexes", "latest.json");
|
|
1069
1190
|
}
|
|
1070
1191
|
function latestMetaPath(workspacePath) {
|
|
1071
|
-
return
|
|
1192
|
+
return path11.join(workspacePath, "indexes", "latest.meta.json.gz");
|
|
1072
1193
|
}
|
|
1073
1194
|
function legacyLatestMetaPath(workspacePath) {
|
|
1074
|
-
return
|
|
1195
|
+
return path11.join(workspacePath, "indexes", "latest.meta.json");
|
|
1075
1196
|
}
|
|
1076
1197
|
async function writeIndexArtifacts({
|
|
1077
1198
|
workspacePath,
|
|
@@ -1083,16 +1204,16 @@ async function writeIndexArtifacts({
|
|
|
1083
1204
|
const metaPath = versionedMetaPath(workspacePath, stamp);
|
|
1084
1205
|
const latestIndexArtifactPath = latestIndexPath(workspacePath);
|
|
1085
1206
|
const latestMetadataArtifactPath = latestMetaPath(workspacePath);
|
|
1086
|
-
await
|
|
1207
|
+
await mkdir7(path11.join(workspacePath, "indexes"), { recursive: true });
|
|
1087
1208
|
await writeGzipJson(indexPath, indexState);
|
|
1088
1209
|
await writeGzipJson(metaPath, metadata);
|
|
1089
1210
|
await writeGzipJson(latestIndexArtifactPath, indexState);
|
|
1090
1211
|
await writeGzipJson(latestMetadataArtifactPath, metadata);
|
|
1091
1212
|
await Promise.all([
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1213
|
+
rm4(legacyLatestIndexPath(workspacePath), { force: true }),
|
|
1214
|
+
rm4(legacyLatestMetaPath(workspacePath), { force: true }),
|
|
1215
|
+
rm4(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
|
|
1216
|
+
rm4(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
|
|
1096
1217
|
]);
|
|
1097
1218
|
return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
|
|
1098
1219
|
}
|
|
@@ -1157,9 +1278,9 @@ async function buildIndex({
|
|
|
1157
1278
|
}) {
|
|
1158
1279
|
const config = await loadConfig(workspacePath);
|
|
1159
1280
|
reportProgress(progress, "Loading documents, chunks, and sources");
|
|
1160
|
-
const chunks = await readJsonl(
|
|
1161
|
-
const documents = await readJsonl(
|
|
1162
|
-
const sources = await readJsonl(
|
|
1281
|
+
const chunks = await readJsonl(path12.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
1282
|
+
const documents = await readJsonl(path12.join(workspacePath, "documents", "documents.jsonl"));
|
|
1283
|
+
const sources = await readJsonl(path12.join(workspacePath, "sources", "sources.jsonl"));
|
|
1163
1284
|
const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
|
|
1164
1285
|
const index = new DocumentIndex(createIndexMapping(metadataFields));
|
|
1165
1286
|
const documentsById = new Map(documents.map((document) => [document.id, document]));
|
|
@@ -1238,7 +1359,7 @@ async function buildIndex({
|
|
|
1238
1359
|
}
|
|
1239
1360
|
|
|
1240
1361
|
// src/ingest/ingest-service.ts
|
|
1241
|
-
import
|
|
1362
|
+
import path18 from "path";
|
|
1242
1363
|
|
|
1243
1364
|
// src/core/concurrency.ts
|
|
1244
1365
|
async function mapWithConcurrency(items, limit, worker) {
|
|
@@ -1262,17 +1383,17 @@ async function mapWithConcurrency(items, limit, worker) {
|
|
|
1262
1383
|
}
|
|
1263
1384
|
|
|
1264
1385
|
// src/core/runs.ts
|
|
1265
|
-
import
|
|
1386
|
+
import path13 from "path";
|
|
1266
1387
|
async function writeRun(workspacePath, run) {
|
|
1267
|
-
await writeJsonl(
|
|
1388
|
+
await writeJsonl(path13.join(workspacePath, "runs", `${run.id}.json`), [run]);
|
|
1268
1389
|
}
|
|
1269
1390
|
async function listRuns(workspacePath) {
|
|
1270
1391
|
const fs = await import("fs/promises");
|
|
1271
|
-
const dir =
|
|
1392
|
+
const dir = path13.join(workspacePath, "runs");
|
|
1272
1393
|
try {
|
|
1273
1394
|
const entries = await fs.readdir(dir);
|
|
1274
1395
|
const records = await Promise.all(entries.filter((name) => name.endsWith(".json")).map(async (name) => {
|
|
1275
|
-
const runs = await readJsonl(
|
|
1396
|
+
const runs = await readJsonl(path13.join(dir, name));
|
|
1276
1397
|
return runs[0];
|
|
1277
1398
|
}));
|
|
1278
1399
|
return records.filter((record) => record != null).sort((a, b) => a.createdAt.localeCompare(b.createdAt));
|
|
@@ -1282,8 +1403,8 @@ async function listRuns(workspacePath) {
|
|
|
1282
1403
|
}
|
|
1283
1404
|
|
|
1284
1405
|
// src/sources/source-store.ts
|
|
1285
|
-
import
|
|
1286
|
-
var sourcesFile = (workspacePath) =>
|
|
1406
|
+
import path14 from "path";
|
|
1407
|
+
var sourcesFile = (workspacePath) => path14.join(workspacePath, "sources", "sources.jsonl");
|
|
1287
1408
|
async function listSources(workspacePath) {
|
|
1288
1409
|
return readJsonl(sourcesFile(workspacePath));
|
|
1289
1410
|
}
|
|
@@ -1329,8 +1450,8 @@ async function removeSource(workspacePath, sourceId) {
|
|
|
1329
1450
|
}
|
|
1330
1451
|
|
|
1331
1452
|
// src/ingest/document-utils.ts
|
|
1332
|
-
import { mkdir as
|
|
1333
|
-
import
|
|
1453
|
+
import { mkdir as mkdir8, rm as rm5, writeFile as writeFile7 } from "fs/promises";
|
|
1454
|
+
import path15 from "path";
|
|
1334
1455
|
|
|
1335
1456
|
// src/normalize/normalize-markdown.ts
|
|
1336
1457
|
import matter2 from "gray-matter";
|
|
@@ -1382,8 +1503,8 @@ async function writeNormalizedDocument({
|
|
|
1382
1503
|
normalizedPath,
|
|
1383
1504
|
markdown
|
|
1384
1505
|
}) {
|
|
1385
|
-
await
|
|
1386
|
-
await
|
|
1506
|
+
await mkdir8(path15.dirname(normalizedPath), { recursive: true });
|
|
1507
|
+
await writeFile7(
|
|
1387
1508
|
normalizedPath,
|
|
1388
1509
|
withFrontmatter(
|
|
1389
1510
|
{
|
|
@@ -1405,14 +1526,14 @@ async function writeNormalizedDocument({
|
|
|
1405
1526
|
}
|
|
1406
1527
|
async function deleteDocumentArtifacts(document) {
|
|
1407
1528
|
await Promise.all([
|
|
1408
|
-
document.rawPath ?
|
|
1409
|
-
|
|
1529
|
+
document.rawPath ? rm5(document.rawPath, { force: true }) : Promise.resolve(),
|
|
1530
|
+
rm5(document.normalizedPath, { force: true })
|
|
1410
1531
|
]);
|
|
1411
1532
|
}
|
|
1412
1533
|
|
|
1413
1534
|
// src/ingest/adapters/directory-adapter.ts
|
|
1414
1535
|
import fg from "fast-glob";
|
|
1415
|
-
import
|
|
1536
|
+
import path16 from "path";
|
|
1416
1537
|
async function listDirectoryFiles(source) {
|
|
1417
1538
|
const include = source.crawl?.includePatterns?.length ? source.crawl.includePatterns : ["**/*.md", "**/*.txt", "**/*.html", "**/*.htm", "**/*.pdf", "**/*.docx"];
|
|
1418
1539
|
const exclude = source.crawl?.excludePatterns ?? [];
|
|
@@ -1425,12 +1546,12 @@ async function listDirectoryFiles(source) {
|
|
|
1425
1546
|
ignore: exclude,
|
|
1426
1547
|
followSymbolicLinks: false
|
|
1427
1548
|
});
|
|
1428
|
-
return matches.map((match) =>
|
|
1549
|
+
return matches.map((match) => path16.resolve(match)).sort();
|
|
1429
1550
|
}
|
|
1430
1551
|
|
|
1431
1552
|
// src/ingest/adapters/file-adapter.ts
|
|
1432
1553
|
import { basename, extname, resolve } from "path";
|
|
1433
|
-
import { mkdir as
|
|
1554
|
+
import { mkdir as mkdir9, readFile as readFile9, stat as stat4, writeFile as writeFile8 } from "fs/promises";
|
|
1434
1555
|
|
|
1435
1556
|
// src/ingest/extractors/docx-extractor.ts
|
|
1436
1557
|
import mammoth from "mammoth";
|
|
@@ -1604,16 +1725,16 @@ function extractPublicationDateFromHtml(html) {
|
|
|
1604
1725
|
}
|
|
1605
1726
|
|
|
1606
1727
|
// src/ingest/extractors/markdown-extractor.ts
|
|
1607
|
-
import { readFile as
|
|
1728
|
+
import { readFile as readFile6 } from "fs/promises";
|
|
1608
1729
|
async function extractMarkdown(filePath) {
|
|
1609
|
-
return
|
|
1730
|
+
return readFile6(filePath, "utf8");
|
|
1610
1731
|
}
|
|
1611
1732
|
|
|
1612
1733
|
// src/ingest/extractors/pdf-extractor.ts
|
|
1613
|
-
import { readFile as
|
|
1734
|
+
import { readFile as readFile7 } from "fs/promises";
|
|
1614
1735
|
import { PDFParse } from "pdf-parse";
|
|
1615
1736
|
async function extractPdf(filePath) {
|
|
1616
|
-
const buffer = await
|
|
1737
|
+
const buffer = await readFile7(filePath);
|
|
1617
1738
|
const parser = new PDFParse({ data: buffer });
|
|
1618
1739
|
try {
|
|
1619
1740
|
const parsed = await parser.getText();
|
|
@@ -1624,9 +1745,9 @@ async function extractPdf(filePath) {
|
|
|
1624
1745
|
}
|
|
1625
1746
|
|
|
1626
1747
|
// src/ingest/extractors/text-extractor.ts
|
|
1627
|
-
import { readFile as
|
|
1748
|
+
import { readFile as readFile8 } from "fs/promises";
|
|
1628
1749
|
async function extractText(filePath) {
|
|
1629
|
-
return
|
|
1750
|
+
return readFile8(filePath, "utf8");
|
|
1630
1751
|
}
|
|
1631
1752
|
|
|
1632
1753
|
// src/ingest/adapters/file-adapter.ts
|
|
@@ -1661,7 +1782,7 @@ async function extractFileContent(filePath, mimeType) {
|
|
|
1661
1782
|
${text}`, raw: text };
|
|
1662
1783
|
}
|
|
1663
1784
|
if (mimeType === "text/html") {
|
|
1664
|
-
const raw = await
|
|
1785
|
+
const raw = await readFile9(filePath, "utf8");
|
|
1665
1786
|
const extracted = extractHtmlToMarkdown(raw);
|
|
1666
1787
|
return { title: extracted.title, markdown: `# ${extracted.title}
|
|
1667
1788
|
|
|
@@ -1706,7 +1827,7 @@ async function ingestFile({
|
|
|
1706
1827
|
previous
|
|
1707
1828
|
}) {
|
|
1708
1829
|
const resolved = resolve(filePath);
|
|
1709
|
-
const fileStat = await
|
|
1830
|
+
const fileStat = await stat4(resolved);
|
|
1710
1831
|
const mimeType = mimeTypeFor(resolved);
|
|
1711
1832
|
const extracted = await extractFileContent(resolved, mimeType);
|
|
1712
1833
|
const documentId = stableId("doc", source.id, resolved);
|
|
@@ -1717,10 +1838,10 @@ async function ingestFile({
|
|
|
1717
1838
|
const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
|
|
1718
1839
|
const indexedAt = now;
|
|
1719
1840
|
const crawledAt = now;
|
|
1720
|
-
await
|
|
1721
|
-
await
|
|
1841
|
+
await mkdir9(resolve(workspacePath, "normalized"), { recursive: true });
|
|
1842
|
+
await mkdir9(resolve(workspacePath, "raw", source.id), { recursive: true });
|
|
1722
1843
|
if (extracted.raw) {
|
|
1723
|
-
await
|
|
1844
|
+
await writeFile8(rawPath, extracted.raw, "utf8");
|
|
1724
1845
|
}
|
|
1725
1846
|
await writeNormalizedDocument({
|
|
1726
1847
|
documentId,
|
|
@@ -1783,7 +1904,7 @@ ${content}`;
|
|
|
1783
1904
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
1784
1905
|
const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
|
|
1785
1906
|
const indexedAt = now;
|
|
1786
|
-
await
|
|
1907
|
+
await mkdir9(resolve(workspacePath, "normalized"), { recursive: true });
|
|
1787
1908
|
await writeNormalizedDocument({
|
|
1788
1909
|
documentId,
|
|
1789
1910
|
sourceId: source.id,
|
|
@@ -1827,7 +1948,7 @@ async function reprocessStoredDocument(document, source) {
|
|
|
1827
1948
|
if (!document.rawPath) {
|
|
1828
1949
|
return null;
|
|
1829
1950
|
}
|
|
1830
|
-
const raw = await
|
|
1951
|
+
const raw = await readFile9(document.rawPath, "utf8");
|
|
1831
1952
|
const fallbackTitle = document.title || basename(document.uri);
|
|
1832
1953
|
const extracted = await extractRawContent(raw, document.mimeType, fallbackTitle);
|
|
1833
1954
|
const contentHash = sha256(extracted.markdown);
|
|
@@ -1944,8 +2065,8 @@ async function parseRssFeedDocument(xml, source) {
|
|
|
1944
2065
|
}
|
|
1945
2066
|
|
|
1946
2067
|
// src/ingest/adapters/url-adapter.ts
|
|
1947
|
-
import { mkdir as
|
|
1948
|
-
import
|
|
2068
|
+
import { mkdir as mkdir10, readFile as readFile10, writeFile as writeFile9 } from "fs/promises";
|
|
2069
|
+
import path17 from "path";
|
|
1949
2070
|
|
|
1950
2071
|
// src/core/urls.ts
|
|
1951
2072
|
function normalizeRemoteUrl(uri) {
|
|
@@ -1988,16 +2109,16 @@ async function normalizeRemoteDocument({
|
|
|
1988
2109
|
|
|
1989
2110
|
${extracted.markdown}`;
|
|
1990
2111
|
const documentId = stableId("doc", source.id, canonicalUri);
|
|
1991
|
-
const normalizedPath =
|
|
1992
|
-
const rawPath =
|
|
2112
|
+
const normalizedPath = path17.resolve(workspacePath, "normalized", `${documentId}.md`);
|
|
2113
|
+
const rawPath = path17.resolve(workspacePath, "raw", source.id, `${sha256(canonicalUri).slice(0, 12)}.html`);
|
|
1993
2114
|
const contentHash = sha256(markdown);
|
|
1994
2115
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
1995
2116
|
const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
|
|
1996
2117
|
const indexedAt = now;
|
|
1997
2118
|
const crawledAt = now;
|
|
1998
2119
|
const resolvedPublicationDate = choosePublicationDate(publicationDate, extractPublicationDateFromHtml(body), previous?.publicationDate);
|
|
1999
|
-
await
|
|
2000
|
-
await
|
|
2120
|
+
await mkdir10(path17.resolve(workspacePath, "raw", source.id), { recursive: true });
|
|
2121
|
+
await writeFile9(rawPath, body, "utf8");
|
|
2001
2122
|
await writeNormalizedDocument({
|
|
2002
2123
|
documentId,
|
|
2003
2124
|
sourceId: source.id,
|
|
@@ -2052,7 +2173,7 @@ async function fetchUrlDocument({
|
|
|
2052
2173
|
publicationDate
|
|
2053
2174
|
}) {
|
|
2054
2175
|
const headers = {
|
|
2055
|
-
"user-agent": source.crawl?.userAgent ?? "querylight-cli
|
|
2176
|
+
"user-agent": source.crawl?.userAgent ?? "querylight-cli"
|
|
2056
2177
|
};
|
|
2057
2178
|
if (previous?.httpCache?.etag) {
|
|
2058
2179
|
headers["if-none-match"] = previous.httpCache.etag;
|
|
@@ -2117,7 +2238,7 @@ async function reprocessRemoteDocument(document, source) {
|
|
|
2117
2238
|
if (!document.rawPath || !await fileExists(document.rawPath)) {
|
|
2118
2239
|
return null;
|
|
2119
2240
|
}
|
|
2120
|
-
const raw = await
|
|
2241
|
+
const raw = await readFile10(document.rawPath, "utf8");
|
|
2121
2242
|
const extracted = extractHtmlToMarkdown(raw);
|
|
2122
2243
|
const markdown = `# ${extracted.title}
|
|
2123
2244
|
|
|
@@ -2296,7 +2417,7 @@ async function crawlWebsite(source, defaults, progress) {
|
|
|
2296
2417
|
|
|
2297
2418
|
// src/ingest/ingest-service.ts
|
|
2298
2419
|
function documentsFile(workspacePath) {
|
|
2299
|
-
return
|
|
2420
|
+
return path18.join(workspacePath, "documents", "documents.jsonl");
|
|
2300
2421
|
}
|
|
2301
2422
|
async function loadDocuments(workspacePath) {
|
|
2302
2423
|
return readJsonl(documentsFile(workspacePath));
|
|
@@ -2351,7 +2472,7 @@ async function purgeDocuments(workspacePath, documentIds, documents) {
|
|
|
2351
2472
|
async function fetchFeedText(source) {
|
|
2352
2473
|
const response2 = await fetch(source.uri, {
|
|
2353
2474
|
headers: {
|
|
2354
|
-
"user-agent": source.crawl?.userAgent ?? "querylight-cli
|
|
2475
|
+
"user-agent": source.crawl?.userAgent ?? "querylight-cli"
|
|
2355
2476
|
}
|
|
2356
2477
|
});
|
|
2357
2478
|
if (!response2.ok) {
|
|
@@ -2839,9 +2960,9 @@ async function discoverWebsiteFeed(websiteUrl, userAgent) {
|
|
|
2839
2960
|
}
|
|
2840
2961
|
|
|
2841
2962
|
// src/query/search-service.ts
|
|
2842
|
-
import { readFile as
|
|
2963
|
+
import { readFile as readFile11 } from "fs/promises";
|
|
2843
2964
|
import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
|
|
2844
|
-
import
|
|
2965
|
+
import path19 from "path";
|
|
2845
2966
|
async function loadHydratedIndex(workspacePath) {
|
|
2846
2967
|
let state;
|
|
2847
2968
|
try {
|
|
@@ -3051,7 +3172,7 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
3051
3172
|
if (!await fileExists(document.normalizedPath)) {
|
|
3052
3173
|
return buildSnippet(chunk.text, query);
|
|
3053
3174
|
}
|
|
3054
|
-
const raw = await
|
|
3175
|
+
const raw = await readFile11(document.normalizedPath, "utf8");
|
|
3055
3176
|
orderedChunks = buildChunksForDocument(document, raw, config);
|
|
3056
3177
|
orderedChunkCache.set(document.id, orderedChunks);
|
|
3057
3178
|
}
|
|
@@ -3386,9 +3507,9 @@ async function searchIndex({
|
|
|
3386
3507
|
const config = await loadConfig(workspacePath);
|
|
3387
3508
|
const mode = retrievalMode ?? config.retrieval.defaultMode;
|
|
3388
3509
|
const candidateLimit = Math.max(topK * 5, 50);
|
|
3389
|
-
const chunks = new Map((await readJsonl(
|
|
3390
|
-
const documents = new Map((await readJsonl(
|
|
3391
|
-
const sources = new Map((await readJsonl(
|
|
3510
|
+
const chunks = new Map((await readJsonl(path19.join(workspacePath, "chunks", "chunks.jsonl"))).map((chunk) => [chunk.id, chunk]));
|
|
3511
|
+
const documents = new Map((await readJsonl(path19.join(workspacePath, "documents", "documents.jsonl"))).map((document) => [document.id, document]));
|
|
3512
|
+
const sources = new Map((await readJsonl(path19.join(workspacePath, "sources", "sources.jsonl"))).map((source) => [source.id, source]));
|
|
3392
3513
|
const orderedChunkCache = /* @__PURE__ */ new Map();
|
|
3393
3514
|
const normalizedQuery = query.trim();
|
|
3394
3515
|
const filterIds = [...chunks.values()].filter((chunk) => filterChunk(chunk, documents.get(chunk.documentId), sources.get(chunk.sourceId), { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata, dateRanges })).map((chunk) => chunk.id);
|
|
@@ -3561,18 +3682,18 @@ async function searchIndex({
|
|
|
3561
3682
|
|
|
3562
3683
|
// src/server/search-api.ts
|
|
3563
3684
|
import { createServer } from "http";
|
|
3564
|
-
import { readdir, stat as
|
|
3565
|
-
import
|
|
3685
|
+
import { readdir as readdir2, stat as stat5 } from "fs/promises";
|
|
3686
|
+
import path20 from "path";
|
|
3566
3687
|
async function pathIsDirectory(candidatePath) {
|
|
3567
3688
|
try {
|
|
3568
|
-
return (await
|
|
3689
|
+
return (await stat5(candidatePath)).isDirectory();
|
|
3569
3690
|
} catch {
|
|
3570
3691
|
return false;
|
|
3571
3692
|
}
|
|
3572
3693
|
}
|
|
3573
3694
|
async function discoverKnowledgeBases(workspacePath) {
|
|
3574
3695
|
try {
|
|
3575
|
-
const singleWorkspace = await
|
|
3696
|
+
const singleWorkspace = (await resolveReadableWorkspace(workspacePath)).workspacePath;
|
|
3576
3697
|
const config = await loadConfig(singleWorkspace);
|
|
3577
3698
|
const index = await loadHydratedIndex(singleWorkspace);
|
|
3578
3699
|
return {
|
|
@@ -3589,19 +3710,20 @@ async function discoverKnowledgeBases(workspacePath) {
|
|
|
3589
3710
|
throw error;
|
|
3590
3711
|
}
|
|
3591
3712
|
}
|
|
3592
|
-
const resolvedRoot =
|
|
3713
|
+
const resolvedRoot = path20.resolve(workspacePath);
|
|
3593
3714
|
if (!await pathIsDirectory(resolvedRoot)) {
|
|
3594
3715
|
throw new CliError(`workspace path does not exist: ${resolvedRoot}`, "WORKSPACE_ERROR", 3 /* WorkspaceError */);
|
|
3595
3716
|
}
|
|
3596
|
-
const entries = await
|
|
3597
|
-
const knowledgeBases = (await Promise.all(entries.filter((entry) => entry.isDirectory()).map(async (entry) => {
|
|
3598
|
-
const candidateWorkspace =
|
|
3717
|
+
const entries = await readdir2(resolvedRoot, { withFileTypes: true });
|
|
3718
|
+
const knowledgeBases = (await Promise.all(entries.filter((entry) => entry.isDirectory() || entry.isFile() && isWorkspaceArchivePath(entry.name)).map(async (entry) => {
|
|
3719
|
+
const candidateWorkspace = entry.isDirectory() ? path20.join(resolvedRoot, entry.name, ".kb") : path20.join(resolvedRoot, entry.name);
|
|
3720
|
+
const knowledgeBaseName = entry.isDirectory() ? entry.name : entry.name.replace(/\.zip$/i, "");
|
|
3599
3721
|
try {
|
|
3600
|
-
const workspace = await assertWorkspaceExists(candidateWorkspace);
|
|
3722
|
+
const workspace = entry.isDirectory() ? await assertWorkspaceExists(candidateWorkspace) : (await resolveReadableWorkspace(candidateWorkspace)).workspacePath;
|
|
3601
3723
|
const config = await loadConfig(workspace);
|
|
3602
3724
|
const index = await loadHydratedIndex(workspace);
|
|
3603
3725
|
return {
|
|
3604
|
-
name:
|
|
3726
|
+
name: knowledgeBaseName,
|
|
3605
3727
|
workspacePath: workspace,
|
|
3606
3728
|
configuredIndexName: config.index.name,
|
|
3607
3729
|
index
|
|
@@ -3615,7 +3737,7 @@ async function discoverKnowledgeBases(workspacePath) {
|
|
|
3615
3737
|
}))).filter((knowledgeBase) => knowledgeBase != null);
|
|
3616
3738
|
if (knowledgeBases.length === 0) {
|
|
3617
3739
|
throw new CliError(
|
|
3618
|
-
`no knowledge bases found at ${resolvedRoot}; use a .kb workspace or a directory of named subdirectories that each contain .kb`,
|
|
3740
|
+
`no knowledge bases found at ${resolvedRoot}; use a .kb workspace, a .zip workspace, or a directory of .zip files or named subdirectories that each contain .kb`,
|
|
3619
3741
|
"WORKSPACE_ERROR",
|
|
3620
3742
|
3 /* WorkspaceError */
|
|
3621
3743
|
);
|
|
@@ -3749,7 +3871,7 @@ async function startSearchApiServer({
|
|
|
3749
3871
|
}
|
|
3750
3872
|
|
|
3751
3873
|
// src/query/related-service.ts
|
|
3752
|
-
import
|
|
3874
|
+
import path21 from "path";
|
|
3753
3875
|
function cosineSimilarity2(left, right) {
|
|
3754
3876
|
let dot = 0;
|
|
3755
3877
|
let leftNorm = 0;
|
|
@@ -3825,7 +3947,7 @@ async function findRelatedDocuments({
|
|
|
3825
3947
|
if (!await fileExists(denseVectorPath(workspacePath))) {
|
|
3826
3948
|
throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
|
|
3827
3949
|
}
|
|
3828
|
-
const documents = await readJsonl(
|
|
3950
|
+
const documents = await readJsonl(path21.join(workspacePath, "documents", "documents.jsonl"));
|
|
3829
3951
|
const selected = resolveDocumentSelector(documents, document);
|
|
3830
3952
|
const densePayload = await readDensePayload(workspacePath);
|
|
3831
3953
|
const vectors = buildDocumentVectors(documents, densePayload.chunks, densePayload.metadata.dimensions);
|
|
@@ -3898,7 +4020,7 @@ async function createContext({
|
|
|
3898
4020
|
}
|
|
3899
4021
|
|
|
3900
4022
|
// src/report/diff-service.ts
|
|
3901
|
-
import
|
|
4023
|
+
import path22 from "path";
|
|
3902
4024
|
function chooseBaselineRun(runs, since) {
|
|
3903
4025
|
if (since === "last-run") {
|
|
3904
4026
|
return runs.at(-1);
|
|
@@ -3914,7 +4036,7 @@ async function diffWorkspace({
|
|
|
3914
4036
|
documentId,
|
|
3915
4037
|
since
|
|
3916
4038
|
}) {
|
|
3917
|
-
const current = await readJsonl(
|
|
4039
|
+
const current = await readJsonl(path22.join(workspacePath, "documents", "documents.jsonl"));
|
|
3918
4040
|
const baseline = chooseBaselineRun(await listRuns(workspacePath), since);
|
|
3919
4041
|
const previous = new Map((baseline?.documentsSnapshot ?? []).map((document) => [document.id, document]));
|
|
3920
4042
|
const changedDocuments = current.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId)).filter((document) => {
|
|
@@ -4273,7 +4395,7 @@ function parseDateValue(input, optionName) {
|
|
|
4273
4395
|
return parsed.toISOString();
|
|
4274
4396
|
}
|
|
4275
4397
|
async function parseJsonArgument(input) {
|
|
4276
|
-
const raw = input.startsWith("@") ? await
|
|
4398
|
+
const raw = input.startsWith("@") ? await readFile12(path23.resolve(input.slice(1)), "utf8") : input;
|
|
4277
4399
|
try {
|
|
4278
4400
|
const parsed = JSON.parse(raw);
|
|
4279
4401
|
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
|
|
@@ -4316,20 +4438,35 @@ function searchDateRanges(options) {
|
|
|
4316
4438
|
}
|
|
4317
4439
|
return entries;
|
|
4318
4440
|
}
|
|
4319
|
-
|
|
4320
|
-
|
|
4441
|
+
function resolveSearchTopK(optionsTopK, sourceTypes, dateRanges, defaultTopK) {
|
|
4442
|
+
const explicitTopK = parseOptionalPositiveInteger(optionsTopK, "--top-k");
|
|
4443
|
+
if (explicitTopK !== void 0) {
|
|
4444
|
+
return explicitTopK;
|
|
4445
|
+
}
|
|
4446
|
+
const includesRss = (sourceTypes ?? []).includes("rss");
|
|
4447
|
+
if (includesRss && dateRanges.length > 0) {
|
|
4448
|
+
return 500;
|
|
4449
|
+
}
|
|
4450
|
+
return defaultTopK;
|
|
4451
|
+
}
|
|
4452
|
+
async function resolveWorkspace(options, mode = {}) {
|
|
4453
|
+
const workspace = options.workspace ?? DEFAULT_WORKSPACE;
|
|
4454
|
+
if (mode.writable) {
|
|
4455
|
+
return assertWritableWorkspacePath(workspace);
|
|
4456
|
+
}
|
|
4457
|
+
return (await resolveReadableWorkspace(workspace)).workspacePath;
|
|
4321
4458
|
}
|
|
4322
4459
|
function workspaceFromArgv(argv) {
|
|
4323
4460
|
const index = argv.findIndex((arg) => arg === "--workspace");
|
|
4324
4461
|
if (index >= 0 && argv[index + 1]) {
|
|
4325
|
-
return
|
|
4462
|
+
return path23.resolve(argv[index + 1]);
|
|
4326
4463
|
}
|
|
4327
|
-
return
|
|
4464
|
+
return path23.resolve(DEFAULT_WORKSPACE);
|
|
4328
4465
|
}
|
|
4329
4466
|
async function runCli(argv, io = {}) {
|
|
4330
4467
|
const capture = { stdout: [], stderr: [], ...io };
|
|
4331
4468
|
const program = new Command();
|
|
4332
|
-
program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--silent", "Suppress progress logging for long-running commands.").option("--verbose", "Print more operational detail when a command supports it.").addOption(new Option("--quiet", "Deprecated alias for --silent.").hideHelp());
|
|
4469
|
+
program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory, or a packaged .zip workspace for read-only commands. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--silent", "Suppress progress logging for long-running commands.").option("--verbose", "Print more operational detail when a command supports it.").addOption(new Option("--quiet", "Deprecated alias for --silent.").hideHelp());
|
|
4333
4470
|
program.addHelpText("after", `
|
|
4334
4471
|
Workflow:
|
|
4335
4472
|
1. Initialize a workspace with qli init
|
|
@@ -4341,12 +4478,15 @@ Examples:
|
|
|
4341
4478
|
qli init
|
|
4342
4479
|
qli source add directory ./docs --name "Product Docs" --tag docs
|
|
4343
4480
|
qli ingest
|
|
4481
|
+
qli package ./docs-kb.zip
|
|
4344
4482
|
qli rebuild --silent
|
|
4345
4483
|
qli search "api authentication" --top-k 8
|
|
4484
|
+
qli search --workspace ./docs-kb.zip "api authentication"
|
|
4346
4485
|
qli context "How do API keys work?" --top-k 8 --max-chars 8000
|
|
4347
4486
|
|
|
4348
4487
|
Long-running commands print progress to stderr by default. Use --silent to suppress it.
|
|
4349
4488
|
Use --json when another tool needs stable structured output.
|
|
4489
|
+
Read-only commands can use --workspace with a packaged .zip workspace.
|
|
4350
4490
|
|
|
4351
4491
|
Use qli <command> --help for command-specific options and examples.`);
|
|
4352
4492
|
program.command("init").description("Create a new workspace with the default directory layout and config, then pull missing retrieval models.").option("--force").addHelpText("after", `
|
|
@@ -4360,7 +4500,7 @@ Notes:
|
|
|
4360
4500
|
init pulls missing model assets for enabled retrieval modes.
|
|
4361
4501
|
Sparse model downloads require uv. If uv is not available, init skips the sparse pull.`).action(async function command(options) {
|
|
4362
4502
|
const global = this.optsWithGlobals();
|
|
4363
|
-
const workspace = await resolveWorkspace({ workspace: this.optsWithGlobals().workspace });
|
|
4503
|
+
const workspace = await resolveWorkspace({ workspace: this.optsWithGlobals().workspace }, { writable: true });
|
|
4364
4504
|
const result = await ensureWorkspace({ workspacePath: workspace, force: Boolean(options.force) });
|
|
4365
4505
|
const config = await loadConfig(workspace, global.config);
|
|
4366
4506
|
const status = await getModelStatus(workspace, config);
|
|
@@ -4401,7 +4541,7 @@ Notes:
|
|
|
4401
4541
|
}
|
|
4402
4542
|
validateSourceAddOptions(type, options);
|
|
4403
4543
|
const global = this.optsWithGlobals();
|
|
4404
|
-
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4544
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4405
4545
|
const config = await loadConfig(workspace, global.config);
|
|
4406
4546
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
4407
4547
|
const initialCrawl = createSourceCrawlConfig(type, options, { retentionDays: config.crawler.retentionDays });
|
|
@@ -4418,7 +4558,7 @@ Notes:
|
|
|
4418
4558
|
}
|
|
4419
4559
|
const stored = await addSource(workspace, {
|
|
4420
4560
|
type,
|
|
4421
|
-
uri: ["file", "directory"].includes(type) ?
|
|
4561
|
+
uri: ["file", "directory"].includes(type) ? path23.resolve(uri) : uri,
|
|
4422
4562
|
name: options.name,
|
|
4423
4563
|
enabled: true,
|
|
4424
4564
|
tags: options.tag ?? [],
|
|
@@ -4479,7 +4619,7 @@ Notes:
|
|
|
4479
4619
|
qli only exposes settings that the current source type uses at runtime.
|
|
4480
4620
|
URI, source type, and source id do not change here.`).action(async function command(sourceId, options) {
|
|
4481
4621
|
const global = this.optsWithGlobals();
|
|
4482
|
-
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4622
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4483
4623
|
const sources = await listSources(workspace);
|
|
4484
4624
|
const current = sources.find((source2) => source2.id === sourceId);
|
|
4485
4625
|
if (!current) {
|
|
@@ -4507,7 +4647,7 @@ Examples:
|
|
|
4507
4647
|
qli source remove src_123
|
|
4508
4648
|
qli source list --json`).action(async function command(sourceId) {
|
|
4509
4649
|
const global = this.optsWithGlobals();
|
|
4510
|
-
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4650
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4511
4651
|
await removeSource(workspace, sourceId);
|
|
4512
4652
|
emit(global.json, capture, response("source remove", workspace, { sourceId }), `Removed source ${sourceId}`);
|
|
4513
4653
|
});
|
|
@@ -4516,7 +4656,7 @@ Examples:
|
|
|
4516
4656
|
qli source disable src_123
|
|
4517
4657
|
qli source enable src_123`).action(async function command(sourceId) {
|
|
4518
4658
|
const global = this.optsWithGlobals();
|
|
4519
|
-
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4659
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4520
4660
|
const updated = await updateSource(workspace, sourceId, { enabled: false, updatedAt: (/* @__PURE__ */ new Date()).toISOString() });
|
|
4521
4661
|
emit(global.json, capture, response("source disable", workspace, updated), `Disabled source ${sourceId}`);
|
|
4522
4662
|
});
|
|
@@ -4525,7 +4665,7 @@ Examples:
|
|
|
4525
4665
|
qli source enable src_123
|
|
4526
4666
|
qli source list`).action(async function command(sourceId) {
|
|
4527
4667
|
const global = this.optsWithGlobals();
|
|
4528
|
-
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4668
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4529
4669
|
const updated = await updateSource(workspace, sourceId, { enabled: true, updatedAt: (/* @__PURE__ */ new Date()).toISOString() });
|
|
4530
4670
|
emit(global.json, capture, response("source enable", workspace, updated), `Enabled source ${sourceId}`);
|
|
4531
4671
|
});
|
|
@@ -4537,7 +4677,7 @@ Examples:
|
|
|
4537
4677
|
qli ingest --dense --sparse
|
|
4538
4678
|
qli ingest --silent`).action(async function command(options) {
|
|
4539
4679
|
const global = this.optsWithGlobals();
|
|
4540
|
-
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4680
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4541
4681
|
const result = await runIngestCommand({
|
|
4542
4682
|
workspace,
|
|
4543
4683
|
sourceId: options.source,
|
|
@@ -4555,7 +4695,7 @@ Examples:
|
|
|
4555
4695
|
qli chunk --document doc_123
|
|
4556
4696
|
qli chunk --silent`).action(async function command(options) {
|
|
4557
4697
|
const global = this.optsWithGlobals();
|
|
4558
|
-
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4698
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4559
4699
|
const result = await chunkDocuments({
|
|
4560
4700
|
workspacePath: workspace,
|
|
4561
4701
|
sourceId: options.source,
|
|
@@ -4571,7 +4711,7 @@ Examples:
|
|
|
4571
4711
|
qli reprocess --document doc_123
|
|
4572
4712
|
qli reprocess --silent`).action(async function command(options) {
|
|
4573
4713
|
const global = this.optsWithGlobals();
|
|
4574
|
-
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4714
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4575
4715
|
const result = await reprocessDocuments({
|
|
4576
4716
|
workspacePath: workspace,
|
|
4577
4717
|
sourceId: options.source,
|
|
@@ -4589,7 +4729,7 @@ Examples:
|
|
|
4589
4729
|
qli index build --dense --sparse
|
|
4590
4730
|
qli index build --silent`).action(async function command(options) {
|
|
4591
4731
|
const global = this.optsWithGlobals();
|
|
4592
|
-
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4732
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4593
4733
|
const result = await buildIndex({
|
|
4594
4734
|
workspacePath: workspace,
|
|
4595
4735
|
denseOverride: options.dense ? true : void 0,
|
|
@@ -4606,7 +4746,7 @@ Examples:
|
|
|
4606
4746
|
qli rebuild --dense --sparse
|
|
4607
4747
|
qli rebuild --silent`).action(async function command(options) {
|
|
4608
4748
|
const global = this.optsWithGlobals();
|
|
4609
|
-
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4749
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4610
4750
|
const progress = createProgressHandler(capture, global);
|
|
4611
4751
|
progress?.("info", "Rebuild step 1/3: ingest");
|
|
4612
4752
|
const ingest = await ingestSources({
|
|
@@ -4629,7 +4769,26 @@ Examples:
|
|
|
4629
4769
|
progress?.("info", "Rebuild complete");
|
|
4630
4770
|
emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
|
|
4631
4771
|
});
|
|
4632
|
-
program.command("
|
|
4772
|
+
program.command("package").description("Write the current workspace to a zip archive that read-only commands can use directly.").argument("<archive>", "Output .zip file.").option("--force", "Replace the output archive if it already exists.").addHelpText("after", `
|
|
4773
|
+
Examples:
|
|
4774
|
+
qli package ./docs-kb.zip
|
|
4775
|
+
qli package ./deploy/docs-kb.zip --workspace ./docs/.kb
|
|
4776
|
+
qli package ./docs-kb.zip --force --json
|
|
4777
|
+
|
|
4778
|
+
Notes:
|
|
4779
|
+
The archive stores the workspace contents at the zip root.
|
|
4780
|
+
Use the zip with read-only commands such as search, search-json, related, context, status, doctor, and serve.
|
|
4781
|
+
Rebuild the directory workspace and package it again when source content changes.`).action(async function command(archive, options) {
|
|
4782
|
+
const global = this.optsWithGlobals();
|
|
4783
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4784
|
+
const result = await packageWorkspaceArchive({
|
|
4785
|
+
workspacePath: workspace,
|
|
4786
|
+
outputPath: archive,
|
|
4787
|
+
force: Boolean(options.force)
|
|
4788
|
+
});
|
|
4789
|
+
emit(global.json, capture, response("package", workspace, result), `Packaged ${result.fileCount} files to ${result.archivePath}`);
|
|
4790
|
+
});
|
|
4791
|
+
program.command("search").description("Search the built index and return ranked matching documents or chunks. Use search-json for raw JSON DSL queries.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return. Defaults to search.defaultTopK in config.yaml. RSS searches with a time window use 500 when omitted.").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include 
documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
|
|
4633
4792
|
Examples:
|
|
4634
4793
|
qli search "pricing api limits"
|
|
4635
4794
|
qli search "authentication" --top-k 20 --tag docs
|
|
@@ -4637,27 +4796,33 @@ Examples:
|
|
|
4637
4796
|
qli search --source-name "Release Feed,Company Blog" --uri-prefix https://example.com/news,https://example.com/blog
|
|
4638
4797
|
qli search "billing" --metadata team=support
|
|
4639
4798
|
qli search "embedding model" --retrieval hybrid --show-chunks
|
|
4799
|
+
qli search --workspace ./docs-kb.zip "authentication"
|
|
4640
4800
|
qli search --source-type rss,page --top-k 25 --json
|
|
4641
4801
|
|
|
4642
4802
|
Notes:
|
|
4643
4803
|
lexical works without vector models.
|
|
4644
4804
|
dense, sparse, and hybrid require the relevant index artifacts to exist.
|
|
4805
|
+
When you omit --top-k, qli uses search.defaultTopK from config.yaml. The default workspace value is 50.
|
|
4806
|
+
RSS searches with a time window default to 500 results when you omit --top-k.
|
|
4645
4807
|
Use search-json when you want the raw Querylight 0.11 JSON DSL and hit format.
|
|
4646
4808
|
When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
|
|
4647
4809
|
const global = this.optsWithGlobals();
|
|
4648
4810
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4811
|
+
const config = await loadConfig(workspace, global.config);
|
|
4812
|
+
const sourceTypes = parseSourceTypes(options.sourceType);
|
|
4813
|
+
const dateRanges = searchDateRanges(options);
|
|
4649
4814
|
const result = await searchIndex({
|
|
4650
4815
|
workspacePath: workspace,
|
|
4651
4816
|
query: query ?? "",
|
|
4652
|
-
topK:
|
|
4817
|
+
topK: resolveSearchTopK(options.topK, sourceTypes, dateRanges, config.search.defaultTopK),
|
|
4653
4818
|
sourceIds: parseCommaSeparatedList(options.source),
|
|
4654
4819
|
sourceNames: parseCommaSeparatedList(options.sourceName),
|
|
4655
|
-
sourceTypes
|
|
4820
|
+
sourceTypes,
|
|
4656
4821
|
uriPrefixes: parseCommaSeparatedList(options.uriPrefix),
|
|
4657
4822
|
hasPublicationDate: Boolean(options.hasPublicationDate),
|
|
4658
4823
|
tags: parseCommaSeparatedList(options.tag),
|
|
4659
4824
|
metadata: (options.metadata ?? []).map(parseKeyValue).map(([key, value]) => ({ key, value })),
|
|
4660
|
-
dateRanges
|
|
4825
|
+
dateRanges,
|
|
4661
4826
|
retrievalMode: parseRetrievalMode(options.retrieval),
|
|
4662
4827
|
showChunks: Boolean(options.showChunks)
|
|
4663
4828
|
});
|
|
@@ -4686,6 +4851,7 @@ Notes:
|
|
|
4686
4851
|
Examples:
|
|
4687
4852
|
qli serve
|
|
4688
4853
|
qli serve --workspace ./docs/.kb --port 4000
|
|
4854
|
+
qli serve --workspace ./docs-kb.zip --port 4000
|
|
4689
4855
|
qli serve --workspace ./kbs --host 0.0.0.0 --port 4000
|
|
4690
4856
|
|
|
4691
4857
|
Routes:
|
|
@@ -4696,10 +4862,10 @@ Routes:
|
|
|
4696
4862
|
Notes:
|
|
4697
4863
|
The request body must be a Querylight JSON DSL object.
|
|
4698
4864
|
serve only exposes lexical _search for now.
|
|
4699
|
-
When --workspace points to a directory of knowledge bases,
|
|
4865
|
+
When --workspace points to a directory of knowledge bases, qli serves child .zip files and child directories that contain .kb.
|
|
4700
4866
|
Index files are loaded once at startup and reused across requests.`).action(async function command(options) {
|
|
4701
4867
|
const global = this.optsWithGlobals();
|
|
4702
|
-
const workspace =
|
|
4868
|
+
const workspace = path23.resolve(global.workspace ?? DEFAULT_WORKSPACE);
|
|
4703
4869
|
const port = Number(options.port);
|
|
4704
4870
|
if (!Number.isInteger(port) || port < 0 || port > 65535) {
|
|
4705
4871
|
throw new CliError(`invalid port: ${options.port}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
@@ -4776,7 +4942,7 @@ Use --json when another tool needs structured access to the raw passages and met
|
|
|
4776
4942
|
});
|
|
4777
4943
|
const models = program.command("models");
|
|
4778
4944
|
models.description("Inspect and download retrieval model assets.");
|
|
4779
|
-
models.command("pull").description("Download dense
|
|
4945
|
+
models.command("pull").description("Download dense or sparse retrieval assets required by vector search.").option("--dense", "Only pull dense retrieval assets.").option("--sparse", "Only pull sparse retrieval assets.").addHelpText("after", `
|
|
4780
4946
|
Examples:
|
|
4781
4947
|
qli models pull
|
|
4782
4948
|
qli models pull --dense
|
|
@@ -4786,7 +4952,7 @@ Examples:
|
|
|
4786
4952
|
Pulled model assets are shared under ~/.qli by default.
|
|
4787
4953
|
If you plan to use related, dense search, or hybrid retrieval, pull the models and rebuild the index first.`).action(async function command(options) {
|
|
4788
4954
|
const global = this.optsWithGlobals();
|
|
4789
|
-
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4955
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace }, { writable: true });
|
|
4790
4956
|
const config = await loadConfig(workspace, global.config);
|
|
4791
4957
|
const status = await getModelStatus(workspace, config);
|
|
4792
4958
|
const { pullDense, pullSparse } = resolveModelPullPlan({
|
|
@@ -4862,7 +5028,7 @@ Examples:
|
|
|
4862
5028
|
try {
|
|
4863
5029
|
const meta = await readLatestIndexMetadata(workspace);
|
|
4864
5030
|
latestIndex = meta.createdAt;
|
|
4865
|
-
indexSize = (await
|
|
5031
|
+
indexSize = (await stat6(await resolveLatestIndexArtifactPath(workspace))).size;
|
|
4866
5032
|
} catch {
|
|
4867
5033
|
latestIndex = void 0;
|
|
4868
5034
|
}
|