@ghcrawl/api-core 0.5.0 → 0.7.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cluster/edge-worker.d.ts +2 -0
- package/dist/cluster/edge-worker.d.ts.map +1 -0
- package/dist/cluster/edge-worker.js +48 -0
- package/dist/cluster/edge-worker.js.map +1 -0
- package/dist/cluster/exact-edges.d.ts +20 -0
- package/dist/cluster/exact-edges.d.ts.map +1 -0
- package/dist/cluster/exact-edges.js +80 -0
- package/dist/cluster/exact-edges.js.map +1 -0
- package/dist/cluster/perf.integration.d.ts +2 -0
- package/dist/cluster/perf.integration.d.ts.map +1 -0
- package/dist/cluster/perf.integration.js +287 -0
- package/dist/cluster/perf.integration.js.map +1 -0
- package/dist/search/exact.d.ts +13 -0
- package/dist/search/exact.d.ts.map +1 -1
- package/dist/search/exact.js +58 -6
- package/dist/search/exact.js.map +1 -1
- package/dist/service.d.ts +8 -1
- package/dist/service.d.ts.map +1 -1
- package/dist/service.js +198 -61
- package/dist/service.js.map +1 -1
- package/package.json +5 -4
package/dist/service.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"service.d.ts","sourceRoot":"","sources":["../src/service.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"service.d.ts","sourceRoot":"","sources":["../src/service.ts"],"names":[],"mappings":"AAQA,OAAO,EAgBL,KAAK,aAAa,EAClB,KAAK,cAAc,EACnB,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAClB,KAAK,qBAAqB,EAE1B,KAAK,gBAAgB,EACrB,KAAK,wBAAwB,EAC7B,KAAK,gBAAgB,EACrB,KAAK,cAAc,EACnB,KAAK,cAAc,EACnB,KAAK,iBAAiB,EACtB,KAAK,eAAe,EACpB,KAAK,oBAAoB,EACzB,KAAK,aAAa,EAClB,KAAK,YAAY,EACjB,KAAK,UAAU,EACf,KAAK,cAAc,EACnB,KAAK,aAAa,EAClB,KAAK,SAAS,EACd,KAAK,eAAe,EACrB,MAAM,uBAAuB,CAAC;AAI/B,OAAO,EAOL,KAAK,iBAAiB,EACtB,KAAK,cAAc,EACpB,MAAM,aAAa,CAAC;AAErB,OAAO,EAAU,KAAK,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAE7D,OAAO,EAAoB,KAAK,YAAY,EAAE,MAAM,oBAAoB,CAAC;AACzE,OAAO,EAAkB,KAAK,UAAU,EAAE,MAAM,sBAAsB,CAAC;AAgGvE,MAAM,MAAM,kBAAkB,GAAG,QAAQ,GAAG,MAAM,CAAC;AAEnD,MAAM,MAAM,YAAY,GAAG;IACzB,cAAc,EAAE,MAAM,CAAC;IACvB,oBAAoB,EAAE,MAAM,CAAC;IAC7B,0BAA0B,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1C,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,qBAAqB,EAAE,MAAM,CAAC;IAC9B,qBAAqB,EAAE,MAAM,CAAC;IAC9B,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,0BAA0B,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3C,CAAC;AAEF,MAAM,MAAM,iBAAiB,GAAG;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,OAAO,CAAC;IAClB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,gBAAgB,EAAE,MAAM,GAAG,IAAI,CAAC;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,sBAAsB,EAAE,MAAM,GAAG,IAAI,CAAC;IACtC,oBAAoB,EAAE,MAAM,GAAG,IAAI,CAAC;IACpC,kBAAkB,EAAE,OAAO,GAAG,cAAc,GAAG,IAAI,CAAC;IACpD,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,OAAO,GAAG,cAAc,CAAC;IAC/B,QAAQ,EAAE,OAAO,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;CAC7B,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG;IAC7B,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,OAAO,CAAC;IAClB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,gBAAgB,EAAE,MAAM,GAAG,IAAI,CAAC;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB
,UAAU,EAAE,MAAM,CAAC;IACnB,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,sBAAsB,EAAE,MAAM,GAAG,IAAI,CAAC;IACtC,oBAAoB,EAAE,MAAM,GAAG,IAAI,CAAC;IACpC,kBAAkB,EAAE,OAAO,GAAG,cAAc,GAAG,IAAI,CAAC;IACpD,OAAO,EAAE,gBAAgB,EAAE,CAAC;CAC7B,CAAC;AAEF,MAAM,MAAM,eAAe,GAAG;IAC5B,MAAM,EAAE,SAAS,CAAC;IAClB,SAAS,EAAE,OAAO,CAAC,MAAM,CAAC,iBAAiB,GAAG,kBAAkB,GAAG,2BAA2B,GAAG,gBAAgB,EAAE,MAAM,CAAC,CAAC,CAAC;IAC5H,SAAS,EAAE,YAAY,CAAC,WAAW,CAAC,CAAC;CACtC,CAAC;AAEF,MAAM,MAAM,WAAW,GAAG;IACxB,UAAU,EAAE,aAAa,CAAC;IAC1B,KAAK,EAAE,YAAY,CAAC;IACpB,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,QAAQ,EAAE,iBAAiB,EAAE,CAAC;CAC/B,CAAC;AAEF,MAAM,MAAM,YAAY,GAAG;IACzB,MAAM,EAAE,cAAc,CAAC;IACvB,MAAM,EAAE;QACN,UAAU,EAAE,OAAO,CAAC;QACpB,MAAM,EAAE,iBAAiB,CAAC;QAC1B,QAAQ,EAAE,OAAO,CAAC;QAClB,MAAM,EAAE,OAAO,CAAC;QAChB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;KACtB,CAAC;IACF,MAAM,EAAE;QACN,UAAU,EAAE,OAAO,CAAC;QACpB,MAAM,EAAE,iBAAiB,CAAC;QAC1B,QAAQ,EAAE,OAAO,CAAC;QAClB,MAAM,EAAE,OAAO,CAAC;QAChB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;KACtB,CAAC;CACH,CAAC;AAEF,KAAK,WAAW,GAAG;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;IACvC,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB,CAAC;AAEF,KAAK,oBAAoB,GAAG,cAAc,CAAC;AAC3C,KAAK,uBAAuB,GAAG,iBAAiB,CAAC;AA4KjD,qBAAa,cAAc;IACzB,QAAQ,CAAC,MAAM,EAAE,cAAc,CAAC;IAChC,QAAQ,CAAC,EAAE,EAAE,cAAc,CAAC;IAC5B,QAAQ,CAAC,MAAM,CAAC,EAAE,YAAY,CAAC;IAC/B,QAAQ,CAAC,EAAE,CAAC,EAAE,UAAU,CAAC;IACzB,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAiD;gBAE1E,OAAO,GAAE;QACnB,MAAM,CAAC,EAAE,cAAc,CAAC;QACxB,EAAE,CAAC,EAAE,cAAc,CAAC;QACpB,MAAM,CAAC,EAAE,YAAY,CAAC;QACtB,EAAE,CAAC,EAAE,UAAU,CAAC;KACZ;IASN,KAAK,IAAI,IAAI;IAKb,IAAI,IAAI,cAAc;IAehB,MAAM,IAAI,OAAO,CAAC,YAAY,CAAC;IAmDrC,gBAAgB,IAAI,oBAAoB;IAKxC,WAAW,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,IAAI,CAAC,EAAE,OAAO,GAAG,cAAc,CAAC;QAAC,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;QAAC,aAAa,CAAC,EAAE,
OAAO,CAAA;KAAE,GAAG,eAAe;IAmDnJ,iBAAiB,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,aAAa,CAAC,EAAE,OAAO,CAAA;KAAE,GAAG,qBAAqB;IA4GzH,kBAAkB,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,GAAG,aAAa;IAmChG,mBAAmB,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE,GAAG,aAAa;IAiCxF,cAAc,CAClB,MAAM,EAAE,WAAW,GAClB,OAAO,CAAC,aAAa,CAAC;IA2InB,mBAAmB,CAAC,MAAM,EAAE;QAChC,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,eAAe,CAAC,EAAE,OAAO,CAAC;QAC1B,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;KACxC,GAAG,OAAO,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAC;QAAC,WAAW,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAC;QAAC,WAAW,EAAE,MAAM,CAAA;KAAE,CAAC;IAsFlH,aAAa,CAAC,MAAM,EAAE;QACpB,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;KACxC,GAAG;QAAE,cAAc,EAAE,MAAM,CAAC;QAAC,gBAAgB,EAAE,MAAM,CAAA;KAAE;IAoClD,eAAe,CAAC,MAAM,EAAE;QAC5B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;KACxC,GAAG,OAAO,CAAC,cAAc,CAAC;IAoDrB,iBAAiB,CAAC,MAAM,EAAE;QAC9B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,CAAC,CAAC,EAAE,MAAM,CAAC;QACX,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;KACxC,GAAG,OAAO,CAAC,gBAAgB,CAAC;IA0CvB,gBAAgB,CAAC,MAAM,EAAE;QAC7B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,CAAC,EAAE,UAAU,CAAC;QAClB,KAAK,CAAC,EAAE,MAAM,CAAC;KAChB,GAAG,OAAO,CAAC,oBAAoB,CAAC;IAmHjC,aAAa,CAAC,MAAM,EAAE;QACpB,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,YAAY,EAAE,MAAM,CAAC;QACrB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,GAAG,uBAAuB;IAiD3B,YAAY,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,aAAa,CAAC,EAAE,OAAO,CAAA;KAAE,GAAG,gBAAgB;IAyE1F,iBAAiB
,CAAC,MAAM,EAAE;QAC9B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,IAAI,CAAC,EAAE,OAAO,CAAC;QACf,KAAK,CAAC,EAAE,OAAO,CAAC;QAChB,OAAO,CAAC,EAAE,OAAO,CAAC;QAClB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;KACxC,GAAG,OAAO,CAAC,eAAe,CAAC;IAkD5B,oBAAoB,CAAC,MAAM,EAAE;QAC3B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,IAAI,CAAC,EAAE,kBAAkB,CAAC;QAC1B,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,aAAa,CAAC,EAAE,OAAO,CAAC;KACzB,GAAG,wBAAwB;IA8B5B,oBAAoB,CAAC,MAAM,EAAE;QAC3B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;QAClB,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,aAAa,CAAC,EAAE,OAAO,CAAC;KACzB,GAAG,qBAAqB;IAwDzB,cAAc,CAAC,MAAM,EAAE;QACrB,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,IAAI,CAAC,EAAE,kBAAkB,CAAC;QAC1B,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,qBAAqB,CAAC,EAAE,OAAO,CAAC;KACjC,GAAG,WAAW;IA2Bf,mBAAmB,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAC;QAAC,YAAY,CAAC,EAAE,MAAM,CAAA;KAAE,GAAG,gBAAgB;IAiExH,kBAAkB,CAAC,MAAM,EAAE;QACzB,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,gBAAgB,CAAC,EAAE,OAAO,CAAC;KAC5B,GAAG,eAAe;IA0Eb,WAAW,CAAC,OAAO,EAAE,aAAa,GAAG,OAAO,CAAC,cAAc,CAAC;IAgClE,OAAO,CAAC,kBAAkB;IAmE1B,OAAO,CAAC,oBAAoB;IA4B5B,OAAO,CAAC,eAAe;IA8BvB,OAAO,CAAC,mBAAmB;IAQ3B,OAAO,CAAC,+BAA+B;IAkBvC,OAAO,CAAC,0BAA0B;IAoElC,OAAO,CAAC,kBAAkB;IAiE1B,OAAO,CAAC,uBAAuB;IAuE/B,OAAO,CAAC,wBAAwB;YASlB,mBAAmB;IA4DjC,OAAO,CAAC,SAAS;IAOjB,OAAO,CAAC,aAAa;IAOrB,OAAO,CAAC,iBAAiB;IASzB,OAAO,CAAC,gBAAgB;IAgBxB,OAAO,CAAC,YAAY;YAoEN,uBAAuB;YAmFvB,2BAA2B;IAuGzC,OAAO,CAAC,eAAe;IA0BvB,OAAO,CAAC,eAAe;IAoCvB,OAAO,CAAC,kBAAkB;IAmD1B,OAAO,CAAC,mBAAmB;IAkD3B,OAAO,CAAC,oBAAoB;IAoB5B,OAAO,CAAC,uBAAuB;IAI/B,OAAO,CAAC,0BAA0B;IAkBlC,OAAO,CAAC,uBAAuB;YAIjB,sBAAsB;YAgCtB,2BAA2B;IAkCzC,OAAO,CAAC,mBAAmB;IA4B3B,OAAO,CAAC,2BAA2B;IAoBnC,OAAO,CAAC,mBAAmB;IAwB3B,OAAO,CAAC,oB
AAoB;IAc5B,OAAO,CAAC,0BAA0B;IAoBlC,OAAO,CAAC,qCAAqC;IAwB7C,OAAO,CAAC,yBAAyB;IA4BjC,OAAO,CAAC,0BAA0B;IAwDlC,OAAO,CAAC,mBAAmB;IA+C3B,OAAO,CAAC,0BAA0B;IAyClC,OAAO,CAAC,OAAO;YAMD,wBAAwB;IAoGtC,OAAO,CAAC,oBAAoB;IAsB5B,OAAO,CAAC,4BAA4B;IAepC,OAAO,CAAC,wBAAwB;IAShC,OAAO,CAAC,iBAAiB;IAsDzB,OAAO,CAAC,mBAAmB;IAI3B,OAAO,CAAC,aAAa;IAarB,OAAO,CAAC,eAAe;IA2BvB,OAAO,CAAC,QAAQ;IAOhB,OAAO,CAAC,SAAS;CAkBlB;AAED,wBAAgB,eAAe,CAAC,GAAG,EAAE,GAAG,GAAG;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAOzE"}
|
package/dist/service.js
CHANGED
|
@@ -1,23 +1,32 @@
|
|
|
1
1
|
import http from 'node:http';
|
|
2
2
|
import crypto from 'node:crypto';
|
|
3
|
+
import { existsSync } from 'node:fs';
|
|
4
|
+
import os from 'node:os';
|
|
5
|
+
import { fileURLToPath } from 'node:url';
|
|
6
|
+
import { Worker } from 'node:worker_threads';
|
|
3
7
|
import { IterableMapper } from '@shutterstock/p-map-iterable';
|
|
4
8
|
import { actionResponseSchema, authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
|
|
5
9
|
import { buildClusters } from './cluster/build.js';
|
|
10
|
+
import { buildSourceKindEdges } from './cluster/exact-edges.js';
|
|
6
11
|
import { ensureRuntimeDirs, isLikelyGitHubToken, isLikelyOpenAiApiKey, loadConfig, requireGithubToken, requireOpenAiKey, } from './config.js';
|
|
7
12
|
import { migrate } from './db/migrate.js';
|
|
8
13
|
import { openDb } from './db/sqlite.js';
|
|
9
14
|
import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js';
|
|
10
15
|
import { makeGitHubClient } from './github/client.js';
|
|
11
16
|
import { OpenAiProvider } from './openai/provider.js';
|
|
12
|
-
import { cosineSimilarity, rankNearestNeighbors } from './search/exact.js';
|
|
17
|
+
import { cosineSimilarity, normalizeEmbedding, rankNearestNeighbors } from './search/exact.js';
|
|
13
18
|
const SYNC_BATCH_SIZE = 100;
|
|
14
19
|
const SYNC_BATCH_DELAY_MS = 5000;
|
|
15
20
|
const STALE_CLOSED_SWEEP_LIMIT = 1000;
|
|
16
21
|
const CLUSTER_PROGRESS_INTERVAL_MS = 5000;
|
|
22
|
+
const CLUSTER_PARALLEL_MIN_EMBEDDINGS = 5000;
|
|
17
23
|
const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3;
|
|
18
24
|
const EMBED_MAX_ITEM_TOKENS = 7000;
|
|
19
25
|
const EMBED_MAX_BATCH_TOKENS = 250000;
|
|
20
26
|
const EMBED_TRUNCATION_MARKER = '\n\n[truncated for embedding]';
|
|
27
|
+
const EMBED_CONTEXT_RETRY_ATTEMPTS = 5;
|
|
28
|
+
const EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO = 0.9;
|
|
29
|
+
const EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO = 0.95;
|
|
21
30
|
/**
 * Returns the current time as an ISO-8601 string (UTC, millisecond precision),
 * as produced by `Date.prototype.toISOString`.
 *
 * @returns {string} Timestamp such as `2024-01-31T12:34:56.789Z`.
 */
function nowIso() {
    const current = new Date();
    return current.toISOString();
}
|
|
@@ -697,24 +706,15 @@ export class GHCrawlService {
|
|
|
697
706
|
throw error;
|
|
698
707
|
}
|
|
699
708
|
}
|
|
700
|
-
clusterRepository(params) {
|
|
709
|
+
async clusterRepository(params) {
|
|
701
710
|
const repository = this.requireRepository(params.owner, params.repo);
|
|
702
711
|
const runId = this.startRun('cluster_runs', repository.id, repository.fullName);
|
|
703
712
|
const minScore = params.minScore ?? 0.82;
|
|
704
713
|
const k = params.k ?? 6;
|
|
705
714
|
try {
|
|
706
|
-
const
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
threadMeta.set(row.id, { number: row.number, title: row.title });
|
|
710
|
-
}
|
|
711
|
-
const items = Array.from(threadMeta.entries()).map(([id, meta]) => ({
|
|
712
|
-
id,
|
|
713
|
-
number: meta.number,
|
|
714
|
-
title: meta.title,
|
|
715
|
-
}));
|
|
716
|
-
params.onProgress?.(`[cluster] loaded ${items.length} embedded thread(s) across ${new Set(rows.map((row) => row.source_kind)).size} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
|
|
717
|
-
const aggregatedEdges = this.aggregateRepositoryEdges(rows, {
|
|
715
|
+
const { items, sourceKinds } = this.loadClusterableThreadMeta(repository.id);
|
|
716
|
+
params.onProgress?.(`[cluster] loaded ${items.length} embedded thread(s) across ${sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
|
|
717
|
+
const aggregatedEdges = await this.aggregateRepositoryEdges(repository.id, sourceKinds, {
|
|
718
718
|
limit: k,
|
|
719
719
|
minScore,
|
|
720
720
|
onProgress: params.onProgress,
|
|
@@ -954,7 +954,7 @@ export class GHCrawlService {
|
|
|
954
954
|
});
|
|
955
955
|
}
|
|
956
956
|
if (selected.cluster) {
|
|
957
|
-
cluster = this.clusterRepository({
|
|
957
|
+
cluster = await this.clusterRepository({
|
|
958
958
|
owner: params.owner,
|
|
959
959
|
repo: params.repo,
|
|
960
960
|
onProgress: params.onProgress,
|
|
@@ -1207,7 +1207,7 @@ export class GHCrawlService {
|
|
|
1207
1207
|
});
|
|
1208
1208
|
}
|
|
1209
1209
|
case 'cluster': {
|
|
1210
|
-
const result = this.clusterRepository(request);
|
|
1210
|
+
const result = await this.clusterRepository(request);
|
|
1211
1211
|
return actionResponseSchema.parse({
|
|
1212
1212
|
ok: true,
|
|
1213
1213
|
action: request.action,
|
|
@@ -1845,9 +1845,23 @@ export class GHCrawlService {
|
|
|
1845
1845
|
estimateEmbeddingTokens(text) {
|
|
1846
1846
|
return Math.max(1, Math.ceil(text.length / EMBED_ESTIMATED_CHARS_PER_TOKEN));
|
|
1847
1847
|
}
|
|
1848
|
-
|
|
1848
|
+
/**
 * Extracts token counts from an embedding error message.
 *
 * Looks for "requested N tokens" and for either "maximum context length is
 * N tokens" or "maximum input length is N tokens" (case-insensitive). The
 * context-length wording takes precedence over the input-length wording.
 *
 * @param {unknown} error - Thrown value; non-Error values are stringified.
 * @returns {{ limitTokens: number | null, requestedTokens: number | null } | null}
 *   Parsed counts, or `null` when neither a limit nor a requested count
 *   could be read from the message.
 */
parseEmbeddingContextError(error) {
    const message = error instanceof Error ? error.message : String(error);
    // Returns the first capture group of `pattern`, or undefined on no match.
    const capture = (pattern) => message.match(pattern)?.[1];
    // Converts a captured digit string to a finite number, else null.
    const toCount = (raw) => {
        const value = Number(raw ?? NaN);
        return Number.isFinite(value) ? value : null;
    };
    const requestedRaw = capture(/requested\s+(\d+)\s+tokens/i);
    const contextLimitRaw = capture(/maximum context length is\s+(\d+)\s+tokens/i);
    const inputLimitRaw = capture(/maximum input length is\s+(\d+)\s+tokens/i);
    const limitTokens = toCount(contextLimitRaw ?? inputLimitRaw);
    const requestedTokens = toCount(requestedRaw);
    if (limitTokens === null && requestedTokens === null) {
        return null;
    }
    return { limitTokens, requestedTokens };
}
|
|
1863
|
+
/**
 * Reports whether `error` carries token-limit information that
 * `parseEmbeddingContextError` can read.
 *
 * @param {unknown} error - Thrown value to inspect.
 * @returns {boolean} `true` when the parser returns a non-null result.
 */
isEmbeddingContextError(error) {
    const parsed = this.parseEmbeddingContextError(error);
    return parsed !== null;
}
|
|
1852
1866
|
async embedBatchWithRecovery(ai, batch, onProgress) {
|
|
1853
1867
|
try {
|
|
@@ -1875,7 +1889,7 @@ export class GHCrawlService {
|
|
|
1875
1889
|
}
|
|
1876
1890
|
async embedSingleTaskWithRecovery(ai, task, onProgress) {
|
|
1877
1891
|
let current = task;
|
|
1878
|
-
for (let attempt = 0; attempt <
|
|
1892
|
+
for (let attempt = 0; attempt < EMBED_CONTEXT_RETRY_ATTEMPTS; attempt += 1) {
|
|
1879
1893
|
try {
|
|
1880
1894
|
const [embedding] = await ai.embedTexts({
|
|
1881
1895
|
model: this.config.embedModel,
|
|
@@ -1884,10 +1898,11 @@ export class GHCrawlService {
|
|
|
1884
1898
|
return { task: current, embedding };
|
|
1885
1899
|
}
|
|
1886
1900
|
catch (error) {
|
|
1887
|
-
|
|
1901
|
+
const context = this.parseEmbeddingContextError(error);
|
|
1902
|
+
if (!context) {
|
|
1888
1903
|
throw error;
|
|
1889
1904
|
}
|
|
1890
|
-
const next = this.shrinkEmbeddingTask(current);
|
|
1905
|
+
const next = this.shrinkEmbeddingTask(current, context);
|
|
1891
1906
|
if (!next || next.text === current.text) {
|
|
1892
1907
|
throw error;
|
|
1893
1908
|
}
|
|
@@ -1897,14 +1912,17 @@ export class GHCrawlService {
|
|
|
1897
1912
|
}
|
|
1898
1913
|
throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.sourceKind} below model limits`);
|
|
1899
1914
|
}
|
|
1900
|
-
shrinkEmbeddingTask(task) {
|
|
1915
|
+
shrinkEmbeddingTask(task, context) {
|
|
1901
1916
|
const withoutMarker = task.text.endsWith(EMBED_TRUNCATION_MARKER)
|
|
1902
1917
|
? task.text.slice(0, -EMBED_TRUNCATION_MARKER.length)
|
|
1903
1918
|
: task.text;
|
|
1904
1919
|
if (withoutMarker.length < 256) {
|
|
1905
1920
|
return null;
|
|
1906
1921
|
}
|
|
1907
|
-
const nextLength = Math.max(256,
|
|
1922
|
+
const nextLength = Math.max(256, this.projectEmbeddingRetryLength(withoutMarker.length, task.estimatedTokens, context));
|
|
1923
|
+
if (nextLength >= withoutMarker.length) {
|
|
1924
|
+
return null;
|
|
1925
|
+
}
|
|
1908
1926
|
const nextText = `${withoutMarker.slice(0, Math.max(0, nextLength - EMBED_TRUNCATION_MARKER.length)).trimEnd()}${EMBED_TRUNCATION_MARKER}`;
|
|
1909
1927
|
return {
|
|
1910
1928
|
...task,
|
|
@@ -1914,6 +1932,19 @@ export class GHCrawlService {
|
|
|
1914
1932
|
wasTruncated: true,
|
|
1915
1933
|
};
|
|
1916
1934
|
}
|
|
1935
|
+
projectEmbeddingRetryLength(textLength, estimatedTokens, context) {
|
|
1936
|
+
const limitTokens = context?.limitTokens ?? null;
|
|
1937
|
+
const requestedTokens = context?.requestedTokens ?? null;
|
|
1938
|
+
if (limitTokens && requestedTokens && requestedTokens > limitTokens) {
|
|
1939
|
+
const targetRatio = (limitTokens * EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO) / requestedTokens;
|
|
1940
|
+
return Math.floor(textLength * Math.max(0.1, Math.min(targetRatio, EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO)));
|
|
1941
|
+
}
|
|
1942
|
+
if (limitTokens && estimatedTokens > limitTokens) {
|
|
1943
|
+
const targetRatio = (limitTokens * EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO) / estimatedTokens;
|
|
1944
|
+
return Math.floor(textLength * Math.max(0.1, Math.min(targetRatio, EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO)));
|
|
1945
|
+
}
|
|
1946
|
+
return Math.floor(textLength * EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO);
|
|
1947
|
+
}
|
|
1917
1948
|
chunkEmbeddingTasks(items, maxItems, maxEstimatedTokens) {
|
|
1918
1949
|
const chunks = [];
|
|
1919
1950
|
let current = [];
|
|
@@ -1950,13 +1981,56 @@ export class GHCrawlService {
|
|
|
1950
1981
|
if (cached) {
|
|
1951
1982
|
return cached;
|
|
1952
1983
|
}
|
|
1953
|
-
const parsed = this.loadStoredEmbeddings(repoId).map((row) =>
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1984
|
+
const parsed = this.loadStoredEmbeddings(repoId).map((row) => {
|
|
1985
|
+
const embedding = JSON.parse(row.embedding_json);
|
|
1986
|
+
const normalized = normalizeEmbedding(embedding);
|
|
1987
|
+
return {
|
|
1988
|
+
...row,
|
|
1989
|
+
embedding,
|
|
1990
|
+
normalizedEmbedding: normalized.normalized,
|
|
1991
|
+
embeddingNorm: normalized.norm,
|
|
1992
|
+
};
|
|
1993
|
+
});
|
|
1957
1994
|
this.parsedEmbeddingCache.set(repoId, parsed);
|
|
1958
1995
|
return parsed;
|
|
1959
1996
|
}
|
|
1997
|
+
loadNormalizedEmbeddingsForSourceKind(repoId, sourceKind) {
|
|
1998
|
+
const rows = this.db
|
|
1999
|
+
.prepare(`select t.id, e.embedding_json
|
|
2000
|
+
from threads t
|
|
2001
|
+
join document_embeddings e on e.thread_id = t.id
|
|
2002
|
+
where t.repo_id = ?
|
|
2003
|
+
and t.state = 'open'
|
|
2004
|
+
and t.closed_at_local is null
|
|
2005
|
+
and e.model = ?
|
|
2006
|
+
and e.source_kind = ?
|
|
2007
|
+
order by t.number asc`)
|
|
2008
|
+
.all(repoId, this.config.embedModel, sourceKind);
|
|
2009
|
+
return rows.map((row) => ({
|
|
2010
|
+
id: row.id,
|
|
2011
|
+
normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
|
|
2012
|
+
}));
|
|
2013
|
+
}
|
|
2014
|
+
loadClusterableThreadMeta(repoId) {
|
|
2015
|
+
const rows = this.db
|
|
2016
|
+
.prepare(`select t.id, t.number, t.title, e.source_kind
|
|
2017
|
+
from threads t
|
|
2018
|
+
join document_embeddings e on e.thread_id = t.id
|
|
2019
|
+
where t.repo_id = ?
|
|
2020
|
+
and t.state = 'open'
|
|
2021
|
+
and t.closed_at_local is null`)
|
|
2022
|
+
.all(repoId);
|
|
2023
|
+
const itemsById = new Map();
|
|
2024
|
+
const sourceKinds = new Set();
|
|
2025
|
+
for (const row of rows) {
|
|
2026
|
+
itemsById.set(row.id, { id: row.id, number: row.number, title: row.title });
|
|
2027
|
+
sourceKinds.add(row.source_kind);
|
|
2028
|
+
}
|
|
2029
|
+
return {
|
|
2030
|
+
items: Array.from(itemsById.values()),
|
|
2031
|
+
sourceKinds: Array.from(sourceKinds.values()),
|
|
2032
|
+
};
|
|
2033
|
+
}
|
|
1960
2034
|
listStoredClusterNeighbors(repoId, threadId, limit) {
|
|
1961
2035
|
const latestRun = this.getLatestClusterRun(repoId);
|
|
1962
2036
|
if (!latestRun) {
|
|
@@ -2072,50 +2146,113 @@ export class GHCrawlService {
|
|
|
2072
2146
|
const right = Math.max(leftThreadId, rightThreadId);
|
|
2073
2147
|
return `${left}:${right}`;
|
|
2074
2148
|
}
|
|
2075
|
-
aggregateRepositoryEdges(
|
|
2076
|
-
const bySource = new Map();
|
|
2077
|
-
for (const row of rows) {
|
|
2078
|
-
const list = bySource.get(row.source_kind) ?? [];
|
|
2079
|
-
list.push({ id: row.id, embedding: row.embedding });
|
|
2080
|
-
bySource.set(row.source_kind, list);
|
|
2081
|
-
}
|
|
2149
|
+
async aggregateRepositoryEdges(repoId, sourceKinds, params) {
|
|
2082
2150
|
const aggregated = new Map();
|
|
2083
|
-
const totalItems =
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
|
|
2151
|
+
const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + this.countEmbeddingsForSourceKind(repoId, sourceKind), 0);
|
|
2152
|
+
if (sourceKinds.length === 0 || totalItems === 0) {
|
|
2153
|
+
return aggregated;
|
|
2154
|
+
}
|
|
2155
|
+
const workerRuntime = this.resolveEdgeWorkerRuntime();
|
|
2156
|
+
const shouldParallelize = workerRuntime !== null && sourceKinds.length > 1 && totalItems >= CLUSTER_PARALLEL_MIN_EMBEDDINGS && os.availableParallelism() > 1;
|
|
2157
|
+
if (!shouldParallelize) {
|
|
2158
|
+
let processedItems = 0;
|
|
2159
|
+
for (const sourceKind of sourceKinds) {
|
|
2160
|
+
const items = this.loadNormalizedEmbeddingsForSourceKind(repoId, sourceKind);
|
|
2161
|
+
const edges = buildSourceKindEdges(items, {
|
|
2090
2162
|
limit: params.limit,
|
|
2091
2163
|
minScore: params.minScore,
|
|
2092
|
-
|
|
2164
|
+
progressIntervalMs: CLUSTER_PROGRESS_INTERVAL_MS,
|
|
2165
|
+
onProgress: (progress) => {
|
|
2166
|
+
if (!params.onProgress)
|
|
2167
|
+
return;
|
|
2168
|
+
params.onProgress(`[cluster] identifying similarity edges ${processedItems + progress.processedItems}/${totalItems} source embeddings processed current_edges~=${aggregated.size + progress.currentEdgeEstimate}`);
|
|
2169
|
+
},
|
|
2093
2170
|
});
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2171
|
+
processedItems += items.length;
|
|
2172
|
+
this.mergeSourceKindEdges(aggregated, edges, sourceKind);
|
|
2173
|
+
}
|
|
2174
|
+
return aggregated;
|
|
2175
|
+
}
|
|
2176
|
+
const progressBySource = new Map();
|
|
2177
|
+
const edgeSets = await Promise.all(sourceKinds.map((sourceKind) => new Promise((resolve, reject) => {
|
|
2178
|
+
const worker = new Worker(workerRuntime.url, {
|
|
2179
|
+
workerData: {
|
|
2180
|
+
dbPath: this.config.dbPath,
|
|
2181
|
+
repoId,
|
|
2182
|
+
sourceKind,
|
|
2183
|
+
limit: params.limit,
|
|
2184
|
+
minScore: params.minScore,
|
|
2185
|
+
},
|
|
2186
|
+
});
|
|
2187
|
+
worker.on('message', (message) => {
|
|
2188
|
+
if (!message || typeof message !== 'object') {
|
|
2189
|
+
return;
|
|
2190
|
+
}
|
|
2191
|
+
const typed = message;
|
|
2192
|
+
if (typed.type === 'progress') {
|
|
2193
|
+
progressBySource.set(typed.sourceKind, {
|
|
2194
|
+
processedItems: typed.processedItems,
|
|
2195
|
+
totalItems: typed.totalItems,
|
|
2196
|
+
currentEdgeEstimate: typed.currentEdgeEstimate,
|
|
2107
2197
|
});
|
|
2198
|
+
if (params.onProgress) {
|
|
2199
|
+
const processedItems = Array.from(progressBySource.values()).reduce((sum, value) => sum + value.processedItems, 0);
|
|
2200
|
+
const currentEdgeEstimate = Array.from(progressBySource.values()).reduce((sum, value) => sum + value.currentEdgeEstimate, 0);
|
|
2201
|
+
params.onProgress(`[cluster] identifying similarity edges ${processedItems}/${totalItems} source embeddings processed current_edges~=${aggregated.size + currentEdgeEstimate}`);
|
|
2202
|
+
}
|
|
2203
|
+
return;
|
|
2108
2204
|
}
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
|
|
2113
|
-
|
|
2205
|
+
resolve(typed.edges);
|
|
2206
|
+
});
|
|
2207
|
+
worker.on('error', reject);
|
|
2208
|
+
worker.on('exit', (code) => {
|
|
2209
|
+
if (code !== 0) {
|
|
2210
|
+
reject(new Error(`edge worker for ${sourceKind} exited with code ${code}`));
|
|
2114
2211
|
}
|
|
2115
|
-
}
|
|
2212
|
+
});
|
|
2213
|
+
})));
|
|
2214
|
+
for (const [index, edges] of edgeSets.entries()) {
|
|
2215
|
+
this.mergeSourceKindEdges(aggregated, edges, sourceKinds[index]);
|
|
2116
2216
|
}
|
|
2117
2217
|
return aggregated;
|
|
2118
2218
|
}
|
|
2219
|
+
mergeSourceKindEdges(aggregated, edges, sourceKind) {
|
|
2220
|
+
for (const edge of edges) {
|
|
2221
|
+
const key = this.edgeKey(edge.leftThreadId, edge.rightThreadId);
|
|
2222
|
+
const existing = aggregated.get(key);
|
|
2223
|
+
if (existing) {
|
|
2224
|
+
existing.score = Math.max(existing.score, edge.score);
|
|
2225
|
+
existing.sourceKinds.add(sourceKind);
|
|
2226
|
+
continue;
|
|
2227
|
+
}
|
|
2228
|
+
aggregated.set(key, {
|
|
2229
|
+
leftThreadId: edge.leftThreadId,
|
|
2230
|
+
rightThreadId: edge.rightThreadId,
|
|
2231
|
+
score: edge.score,
|
|
2232
|
+
sourceKinds: new Set([sourceKind]),
|
|
2233
|
+
});
|
|
2234
|
+
}
|
|
2235
|
+
}
|
|
2236
|
+
countEmbeddingsForSourceKind(repoId, sourceKind) {
|
|
2237
|
+
const row = this.db
|
|
2238
|
+
.prepare(`select count(*) as count
|
|
2239
|
+
from document_embeddings e
|
|
2240
|
+
join threads t on t.id = e.thread_id
|
|
2241
|
+
where t.repo_id = ?
|
|
2242
|
+
and t.state = 'open'
|
|
2243
|
+
and t.closed_at_local is null
|
|
2244
|
+
and e.source_kind = ?`)
|
|
2245
|
+
.get(repoId, sourceKind);
|
|
2246
|
+
return row.count;
|
|
2247
|
+
}
|
|
2248
|
+
resolveEdgeWorkerRuntime() {
|
|
2249
|
+
const jsUrl = new URL('./cluster/edge-worker.js', import.meta.url);
|
|
2250
|
+
if (existsSync(fileURLToPath(jsUrl))) {
|
|
2251
|
+
return { url: jsUrl };
|
|
2252
|
+
}
|
|
2253
|
+
// Source-mode runs do not have a compiled worker entrypoint, so keep clustering in-process.
|
|
2254
|
+
return null;
|
|
2255
|
+
}
|
|
2119
2256
|
persistClusterRun(repoId, runId, aggregatedEdges, clusters) {
|
|
2120
2257
|
const insertEdge = this.db.prepare(`insert into similarity_edges (repo_id, cluster_run_id, left_thread_id, right_thread_id, method, score, explanation_json, created_at)
|
|
2121
2258
|
values (?, ?, ?, ?, ?, ?, ?, ?)`);
|