@tryformation/querylight-cli 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/main.js +53 -44
- package/dist/index.js +43 -34
- package/dist/vector/runtime.d.ts +8 -0
- package/package.json +1 -1
- package/scripts/sparse-encode.py +5 -1
package/dist/cli/main.js
CHANGED
|
@@ -387,6 +387,7 @@ import os from "os";
|
|
|
387
387
|
import path6 from "path";
|
|
388
388
|
import { fileURLToPath } from "url";
|
|
389
389
|
import { execFile, execFileSync } from "child_process";
|
|
390
|
+
import { mkdtemp, rm, writeFile as writeFile3 } from "fs/promises";
|
|
390
391
|
|
|
391
392
|
// src/core/files.ts
|
|
392
393
|
import { stat as stat2 } from "fs/promises";
|
|
@@ -400,6 +401,7 @@ async function fileExists(filePath) {
|
|
|
400
401
|
}
|
|
401
402
|
|
|
402
403
|
// src/vector/runtime.ts
|
|
404
|
+
var sparseExecFileSync = execFileSync;
|
|
403
405
|
function resolveQliHomeDir() {
|
|
404
406
|
return path6.resolve(process.env.QLI_HOME ?? path6.join(os.homedir(), ".qli"));
|
|
405
407
|
}
|
|
@@ -455,29 +457,36 @@ async function runSparsePython({
|
|
|
455
457
|
}) {
|
|
456
458
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
457
459
|
const scriptPath = await sparseScriptPath(importMetaUrl);
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
"
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
460
|
+
const payloadDir = await mkdtemp(path6.join(os.tmpdir(), "qli-sparse-"));
|
|
461
|
+
const payloadPath = path6.join(payloadDir, "payload.json");
|
|
462
|
+
await writeFile3(payloadPath, JSON.stringify(payload), "utf8");
|
|
463
|
+
try {
|
|
464
|
+
return sparseExecFileSync(
|
|
465
|
+
"uv",
|
|
466
|
+
[
|
|
467
|
+
"run",
|
|
468
|
+
"--with",
|
|
469
|
+
"torch",
|
|
470
|
+
"--with",
|
|
471
|
+
"transformers",
|
|
472
|
+
"--with",
|
|
473
|
+
"huggingface_hub",
|
|
474
|
+
"python",
|
|
475
|
+
scriptPath,
|
|
476
|
+
payloadPath
|
|
477
|
+
],
|
|
478
|
+
{
|
|
479
|
+
encoding: "utf8",
|
|
480
|
+
maxBuffer: 1024 * 1024 * 1024,
|
|
481
|
+
env: {
|
|
482
|
+
...process.env,
|
|
483
|
+
HF_HOME: cacheDir
|
|
484
|
+
}
|
|
478
485
|
}
|
|
479
|
-
|
|
480
|
-
|
|
486
|
+
);
|
|
487
|
+
} finally {
|
|
488
|
+
await rm(payloadDir, { recursive: true, force: true });
|
|
489
|
+
}
|
|
481
490
|
}
|
|
482
491
|
async function getDenseTransformersRuntime(cacheDir) {
|
|
483
492
|
const transformers = await import("@huggingface/transformers");
|
|
@@ -490,18 +499,18 @@ async function getDenseTransformersRuntime(cacheDir) {
|
|
|
490
499
|
}
|
|
491
500
|
|
|
492
501
|
// src/vector/store.ts
|
|
493
|
-
import { mkdir as mkdir3, rm, writeFile as
|
|
502
|
+
import { mkdir as mkdir3, rm as rm2, writeFile as writeFile5 } from "fs/promises";
|
|
494
503
|
import path7 from "path";
|
|
495
504
|
|
|
496
505
|
// src/core/gzip-json.ts
|
|
497
|
-
import { readFile as readFile4, writeFile as
|
|
506
|
+
import { readFile as readFile4, writeFile as writeFile4 } from "fs/promises";
|
|
498
507
|
import { promisify } from "util";
|
|
499
508
|
import { gunzip, gzip } from "zlib";
|
|
500
509
|
var gzipAsync = promisify(gzip);
|
|
501
510
|
var gunzipAsync = promisify(gunzip);
|
|
502
511
|
async function writeGzipJson(filePath, value) {
|
|
503
512
|
const payload = JSON.stringify(value, null, 2);
|
|
504
|
-
await
|
|
513
|
+
await writeFile4(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
|
|
505
514
|
}
|
|
506
515
|
async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
|
|
507
516
|
if (await fileExists(gzipPath)) {
|
|
@@ -570,8 +579,8 @@ async function writeDensePayload(workspacePath, payload) {
|
|
|
570
579
|
await writeGzipJson(denseVectorPath(workspacePath), payload);
|
|
571
580
|
await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
|
|
572
581
|
await Promise.all([
|
|
573
|
-
|
|
574
|
-
|
|
582
|
+
rm2(legacyDenseVectorPath(workspacePath), { force: true }),
|
|
583
|
+
rm2(legacyDenseMetaPath(workspacePath), { force: true })
|
|
575
584
|
]);
|
|
576
585
|
}
|
|
577
586
|
async function readDensePayload(workspacePath) {
|
|
@@ -582,8 +591,8 @@ async function writeSparsePayload(workspacePath, payload) {
|
|
|
582
591
|
await writeGzipJson(sparseVectorPath(workspacePath), payload);
|
|
583
592
|
await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
|
|
584
593
|
await Promise.all([
|
|
585
|
-
|
|
586
|
-
|
|
594
|
+
rm2(legacySparseVectorPath(workspacePath), { force: true }),
|
|
595
|
+
rm2(legacySparseMetaPath(workspacePath), { force: true })
|
|
587
596
|
]);
|
|
588
597
|
}
|
|
589
598
|
async function readSparsePayload(workspacePath) {
|
|
@@ -592,12 +601,12 @@ async function readSparsePayload(workspacePath) {
|
|
|
592
601
|
async function writeDensePullMarker(workspacePath, model, value) {
|
|
593
602
|
const markerPath = densePullMarker(workspacePath, model.modelId, model.cacheDir);
|
|
594
603
|
await mkdir3(path7.dirname(markerPath), { recursive: true });
|
|
595
|
-
await
|
|
604
|
+
await writeFile5(markerPath, JSON.stringify(value, null, 2), "utf8");
|
|
596
605
|
}
|
|
597
606
|
async function writeSparsePullMarker(workspacePath, model, value) {
|
|
598
607
|
const markerPath = sparsePullMarker(workspacePath, model.modelId, model.cacheDir);
|
|
599
608
|
await mkdir3(path7.dirname(markerPath), { recursive: true });
|
|
600
|
-
await
|
|
609
|
+
await writeFile5(markerPath, JSON.stringify(value, null, 2), "utf8");
|
|
601
610
|
}
|
|
602
611
|
async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
603
612
|
const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
|
|
@@ -1015,7 +1024,7 @@ async function getModelStatus(workspacePath, config) {
|
|
|
1015
1024
|
}
|
|
1016
1025
|
|
|
1017
1026
|
// src/index/index-store.ts
|
|
1018
|
-
import { mkdir as mkdir6, rm as
|
|
1027
|
+
import { mkdir as mkdir6, rm as rm3 } from "fs/promises";
|
|
1019
1028
|
import path10 from "path";
|
|
1020
1029
|
function versionedIndexPath(workspacePath, stamp) {
|
|
1021
1030
|
return path10.join(workspacePath, "indexes", `${stamp}.json.gz`);
|
|
@@ -1057,10 +1066,10 @@ async function writeIndexArtifacts({
|
|
|
1057
1066
|
await writeGzipJson(latestIndexArtifactPath, indexState);
|
|
1058
1067
|
await writeGzipJson(latestMetadataArtifactPath, metadata);
|
|
1059
1068
|
await Promise.all([
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1069
|
+
rm3(legacyLatestIndexPath(workspacePath), { force: true }),
|
|
1070
|
+
rm3(legacyLatestMetaPath(workspacePath), { force: true }),
|
|
1071
|
+
rm3(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
|
|
1072
|
+
rm3(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
|
|
1064
1073
|
]);
|
|
1065
1074
|
return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
|
|
1066
1075
|
}
|
|
@@ -1262,7 +1271,7 @@ async function removeSource(workspacePath, sourceId) {
|
|
|
1262
1271
|
}
|
|
1263
1272
|
|
|
1264
1273
|
// src/ingest/document-utils.ts
|
|
1265
|
-
import { mkdir as mkdir7, rm as
|
|
1274
|
+
import { mkdir as mkdir7, rm as rm4, writeFile as writeFile6 } from "fs/promises";
|
|
1266
1275
|
import path14 from "path";
|
|
1267
1276
|
|
|
1268
1277
|
// src/normalize/normalize-markdown.ts
|
|
@@ -1316,7 +1325,7 @@ async function writeNormalizedDocument({
|
|
|
1316
1325
|
markdown
|
|
1317
1326
|
}) {
|
|
1318
1327
|
await mkdir7(path14.dirname(normalizedPath), { recursive: true });
|
|
1319
|
-
await
|
|
1328
|
+
await writeFile6(
|
|
1320
1329
|
normalizedPath,
|
|
1321
1330
|
withFrontmatter(
|
|
1322
1331
|
{
|
|
@@ -1338,8 +1347,8 @@ async function writeNormalizedDocument({
|
|
|
1338
1347
|
}
|
|
1339
1348
|
async function deleteDocumentArtifacts(document) {
|
|
1340
1349
|
await Promise.all([
|
|
1341
|
-
document.rawPath ?
|
|
1342
|
-
|
|
1350
|
+
document.rawPath ? rm4(document.rawPath, { force: true }) : Promise.resolve(),
|
|
1351
|
+
rm4(document.normalizedPath, { force: true })
|
|
1343
1352
|
]);
|
|
1344
1353
|
}
|
|
1345
1354
|
|
|
@@ -1363,7 +1372,7 @@ async function listDirectoryFiles(source) {
|
|
|
1363
1372
|
|
|
1364
1373
|
// src/ingest/adapters/file-adapter.ts
|
|
1365
1374
|
import { basename, extname, resolve } from "path";
|
|
1366
|
-
import { mkdir as mkdir8, readFile as readFile8, stat as stat3, writeFile as
|
|
1375
|
+
import { mkdir as mkdir8, readFile as readFile8, stat as stat3, writeFile as writeFile7 } from "fs/promises";
|
|
1367
1376
|
|
|
1368
1377
|
// src/ingest/extractors/docx-extractor.ts
|
|
1369
1378
|
import mammoth from "mammoth";
|
|
@@ -1653,7 +1662,7 @@ async function ingestFile({
|
|
|
1653
1662
|
await mkdir8(resolve(workspacePath, "normalized"), { recursive: true });
|
|
1654
1663
|
await mkdir8(resolve(workspacePath, "raw", source.id), { recursive: true });
|
|
1655
1664
|
if (extracted.raw) {
|
|
1656
|
-
await
|
|
1665
|
+
await writeFile7(rawPath, extracted.raw, "utf8");
|
|
1657
1666
|
}
|
|
1658
1667
|
await writeNormalizedDocument({
|
|
1659
1668
|
documentId,
|
|
@@ -1877,7 +1886,7 @@ async function parseRssFeedDocument(xml, source) {
|
|
|
1877
1886
|
}
|
|
1878
1887
|
|
|
1879
1888
|
// src/ingest/adapters/url-adapter.ts
|
|
1880
|
-
import { mkdir as mkdir9, readFile as readFile9, writeFile as
|
|
1889
|
+
import { mkdir as mkdir9, readFile as readFile9, writeFile as writeFile8 } from "fs/promises";
|
|
1881
1890
|
import path16 from "path";
|
|
1882
1891
|
|
|
1883
1892
|
// src/core/urls.ts
|
|
@@ -1930,7 +1939,7 @@ ${extracted.markdown}`;
|
|
|
1930
1939
|
const crawledAt = now;
|
|
1931
1940
|
const resolvedPublicationDate = choosePublicationDate(publicationDate, extractPublicationDateFromHtml(body), previous?.publicationDate);
|
|
1932
1941
|
await mkdir9(path16.resolve(workspacePath, "raw", source.id), { recursive: true });
|
|
1933
|
-
await
|
|
1942
|
+
await writeFile8(rawPath, body, "utf8");
|
|
1934
1943
|
await writeNormalizedDocument({
|
|
1935
1944
|
documentId,
|
|
1936
1945
|
sourceId: source.id,
|
package/dist/index.js
CHANGED
|
@@ -1795,6 +1795,8 @@ import os from "os";
|
|
|
1795
1795
|
import path12 from "path";
|
|
1796
1796
|
import { fileURLToPath } from "url";
|
|
1797
1797
|
import { execFile, execFileSync } from "child_process";
|
|
1798
|
+
import { mkdtemp, rm as rm2, writeFile as writeFile6 } from "fs/promises";
|
|
1799
|
+
var sparseExecFileSync = execFileSync;
|
|
1798
1800
|
function resolveQliHomeDir() {
|
|
1799
1801
|
return path12.resolve(process.env.QLI_HOME ?? path12.join(os.homedir(), ".qli"));
|
|
1800
1802
|
}
|
|
@@ -1850,29 +1852,36 @@ async function runSparsePython({
|
|
|
1850
1852
|
}) {
|
|
1851
1853
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
1852
1854
|
const scriptPath = await sparseScriptPath(importMetaUrl);
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
"
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
1855
|
+
const payloadDir = await mkdtemp(path12.join(os.tmpdir(), "qli-sparse-"));
|
|
1856
|
+
const payloadPath = path12.join(payloadDir, "payload.json");
|
|
1857
|
+
await writeFile6(payloadPath, JSON.stringify(payload), "utf8");
|
|
1858
|
+
try {
|
|
1859
|
+
return sparseExecFileSync(
|
|
1860
|
+
"uv",
|
|
1861
|
+
[
|
|
1862
|
+
"run",
|
|
1863
|
+
"--with",
|
|
1864
|
+
"torch",
|
|
1865
|
+
"--with",
|
|
1866
|
+
"transformers",
|
|
1867
|
+
"--with",
|
|
1868
|
+
"huggingface_hub",
|
|
1869
|
+
"python",
|
|
1870
|
+
scriptPath,
|
|
1871
|
+
payloadPath
|
|
1872
|
+
],
|
|
1873
|
+
{
|
|
1874
|
+
encoding: "utf8",
|
|
1875
|
+
maxBuffer: 1024 * 1024 * 1024,
|
|
1876
|
+
env: {
|
|
1877
|
+
...process.env,
|
|
1878
|
+
HF_HOME: cacheDir
|
|
1879
|
+
}
|
|
1873
1880
|
}
|
|
1874
|
-
|
|
1875
|
-
|
|
1881
|
+
);
|
|
1882
|
+
} finally {
|
|
1883
|
+
await rm2(payloadDir, { recursive: true, force: true });
|
|
1884
|
+
}
|
|
1876
1885
|
}
|
|
1877
1886
|
async function getDenseTransformersRuntime(cacheDir) {
|
|
1878
1887
|
const transformers = await import("@huggingface/transformers");
|
|
@@ -1885,18 +1894,18 @@ async function getDenseTransformersRuntime(cacheDir) {
|
|
|
1885
1894
|
}
|
|
1886
1895
|
|
|
1887
1896
|
// src/vector/store.ts
|
|
1888
|
-
import { mkdir as mkdir6, rm as
|
|
1897
|
+
import { mkdir as mkdir6, rm as rm3, writeFile as writeFile8 } from "fs/promises";
|
|
1889
1898
|
import path13 from "path";
|
|
1890
1899
|
|
|
1891
1900
|
// src/core/gzip-json.ts
|
|
1892
|
-
import { readFile as readFile9, writeFile as
|
|
1901
|
+
import { readFile as readFile9, writeFile as writeFile7 } from "fs/promises";
|
|
1893
1902
|
import { promisify } from "util";
|
|
1894
1903
|
import { gunzip, gzip } from "zlib";
|
|
1895
1904
|
var gzipAsync = promisify(gzip);
|
|
1896
1905
|
var gunzipAsync = promisify(gunzip);
|
|
1897
1906
|
async function writeGzipJson(filePath, value) {
|
|
1898
1907
|
const payload = JSON.stringify(value, null, 2);
|
|
1899
|
-
await
|
|
1908
|
+
await writeFile7(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
|
|
1900
1909
|
}
|
|
1901
1910
|
async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
|
|
1902
1911
|
if (await fileExists(gzipPath)) {
|
|
@@ -1956,8 +1965,8 @@ async function writeDensePayload(workspacePath, payload) {
|
|
|
1956
1965
|
await writeGzipJson(denseVectorPath(workspacePath), payload);
|
|
1957
1966
|
await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
|
|
1958
1967
|
await Promise.all([
|
|
1959
|
-
|
|
1960
|
-
|
|
1968
|
+
rm3(legacyDenseVectorPath(workspacePath), { force: true }),
|
|
1969
|
+
rm3(legacyDenseMetaPath(workspacePath), { force: true })
|
|
1961
1970
|
]);
|
|
1962
1971
|
}
|
|
1963
1972
|
async function readDensePayload(workspacePath) {
|
|
@@ -1968,8 +1977,8 @@ async function writeSparsePayload(workspacePath, payload) {
|
|
|
1968
1977
|
await writeGzipJson(sparseVectorPath(workspacePath), payload);
|
|
1969
1978
|
await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
|
|
1970
1979
|
await Promise.all([
|
|
1971
|
-
|
|
1972
|
-
|
|
1980
|
+
rm3(legacySparseVectorPath(workspacePath), { force: true }),
|
|
1981
|
+
rm3(legacySparseMetaPath(workspacePath), { force: true })
|
|
1973
1982
|
]);
|
|
1974
1983
|
}
|
|
1975
1984
|
async function readSparsePayload(workspacePath) {
|
|
@@ -2309,7 +2318,7 @@ async function buildVectorArtifacts({
|
|
|
2309
2318
|
}
|
|
2310
2319
|
|
|
2311
2320
|
// src/index/index-store.ts
|
|
2312
|
-
import { mkdir as mkdir9, rm as
|
|
2321
|
+
import { mkdir as mkdir9, rm as rm4 } from "fs/promises";
|
|
2313
2322
|
import path16 from "path";
|
|
2314
2323
|
function versionedIndexPath(workspacePath, stamp) {
|
|
2315
2324
|
return path16.join(workspacePath, "indexes", `${stamp}.json.gz`);
|
|
@@ -2351,10 +2360,10 @@ async function writeIndexArtifacts({
|
|
|
2351
2360
|
await writeGzipJson(latestIndexArtifactPath, indexState);
|
|
2352
2361
|
await writeGzipJson(latestMetadataArtifactPath, metadata);
|
|
2353
2362
|
await Promise.all([
|
|
2354
|
-
|
|
2355
|
-
|
|
2356
|
-
|
|
2357
|
-
|
|
2363
|
+
rm4(legacyLatestIndexPath(workspacePath), { force: true }),
|
|
2364
|
+
rm4(legacyLatestMetaPath(workspacePath), { force: true }),
|
|
2365
|
+
rm4(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
|
|
2366
|
+
rm4(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
|
|
2358
2367
|
]);
|
|
2359
2368
|
return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
|
|
2360
2369
|
}
|
package/dist/vector/runtime.d.ts
CHANGED
|
@@ -1,4 +1,11 @@
|
|
|
1
1
|
import type { SparseVectorModelConfig } from "../types/models.js";
|
|
2
|
+
type SparseExecOptions = {
|
|
3
|
+
encoding: BufferEncoding;
|
|
4
|
+
maxBuffer: number;
|
|
5
|
+
env: NodeJS.ProcessEnv;
|
|
6
|
+
};
|
|
7
|
+
type SparseExecFileSync = (file: string, args: string[], options: SparseExecOptions) => string;
|
|
8
|
+
export declare function setSparseExecFileSyncForTests(fn: SparseExecFileSync | null): void;
|
|
2
9
|
export declare function resolveQliHomeDir(): string;
|
|
3
10
|
export declare function resolveCacheDir(workspacePath: string, configuredPath: string): string;
|
|
4
11
|
export declare function packageRootFromImportMeta(importMetaUrl: string): string;
|
|
@@ -18,3 +25,4 @@ export declare function getDenseTransformersRuntime(cacheDir: string): Promise<{
|
|
|
18
25
|
};
|
|
19
26
|
pipeline: typeof import("@huggingface/transformers").pipeline;
|
|
20
27
|
}>;
|
|
28
|
+
export {};
|
package/package.json
CHANGED
package/scripts/sparse-encode.py
CHANGED
|
@@ -88,7 +88,11 @@ def encode_documents(model_id: str, top_tokens: int, documents):
|
|
|
88
88
|
|
|
89
89
|
|
|
90
90
|
def main():
|
|
91
|
-
|
|
91
|
+
if len(sys.argv) > 1:
|
|
92
|
+
with open(sys.argv[1], encoding="utf-8") as handle:
|
|
93
|
+
payload = json.load(handle)
|
|
94
|
+
else:
|
|
95
|
+
payload = json.load(sys.stdin)
|
|
92
96
|
action = payload["action"]
|
|
93
97
|
model_id = payload["model_id"]
|
|
94
98
|
if action == "download_only":
|