@ghcrawl/api-core 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cluster/edge-worker.d.ts +2 -0
- package/dist/cluster/edge-worker.d.ts.map +1 -0
- package/dist/cluster/edge-worker.js +48 -0
- package/dist/cluster/edge-worker.js.map +1 -0
- package/dist/cluster/exact-edges.d.ts +20 -0
- package/dist/cluster/exact-edges.d.ts.map +1 -0
- package/dist/cluster/exact-edges.js +80 -0
- package/dist/cluster/exact-edges.js.map +1 -0
- package/dist/cluster/perf.integration.d.ts +2 -0
- package/dist/cluster/perf.integration.d.ts.map +1 -0
- package/dist/cluster/perf.integration.js +287 -0
- package/dist/cluster/perf.integration.js.map +1 -0
- package/dist/search/exact.d.ts +13 -0
- package/dist/search/exact.d.ts.map +1 -1
- package/dist/search/exact.js +58 -6
- package/dist/search/exact.js.map +1 -1
- package/dist/service.d.ts +5 -1
- package/dist/service.d.ts.map +1 -1
- package/dist/service.js +146 -54
- package/dist/service.js.map +1 -1
- package/package.json +3 -2
package/dist/service.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"service.d.ts","sourceRoot":"","sources":["../src/service.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"service.d.ts","sourceRoot":"","sources":["../src/service.ts"],"names":[],"mappings":"AAQA,OAAO,EAgBL,KAAK,aAAa,EAClB,KAAK,cAAc,EACnB,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAClB,KAAK,qBAAqB,EAE1B,KAAK,gBAAgB,EACrB,KAAK,wBAAwB,EAC7B,KAAK,gBAAgB,EACrB,KAAK,cAAc,EACnB,KAAK,cAAc,EACnB,KAAK,iBAAiB,EACtB,KAAK,eAAe,EACpB,KAAK,oBAAoB,EACzB,KAAK,aAAa,EAClB,KAAK,YAAY,EACjB,KAAK,UAAU,EACf,KAAK,cAAc,EACnB,KAAK,aAAa,EAClB,KAAK,SAAS,EACd,KAAK,eAAe,EACrB,MAAM,uBAAuB,CAAC;AAI/B,OAAO,EAOL,KAAK,iBAAiB,EACtB,KAAK,cAAc,EACpB,MAAM,aAAa,CAAC;AAErB,OAAO,EAAU,KAAK,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAE7D,OAAO,EAAoB,KAAK,YAAY,EAAE,MAAM,oBAAoB,CAAC;AACzE,OAAO,EAAkB,KAAK,UAAU,EAAE,MAAM,sBAAsB,CAAC;AAgGvE,MAAM,MAAM,kBAAkB,GAAG,QAAQ,GAAG,MAAM,CAAC;AAEnD,MAAM,MAAM,YAAY,GAAG;IACzB,cAAc,EAAE,MAAM,CAAC;IACvB,oBAAoB,EAAE,MAAM,CAAC;IAC7B,0BAA0B,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1C,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,qBAAqB,EAAE,MAAM,CAAC;IAC9B,qBAAqB,EAAE,MAAM,CAAC;IAC9B,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,0BAA0B,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3C,CAAC;AAEF,MAAM,MAAM,iBAAiB,GAAG;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,OAAO,CAAC;IAClB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,gBAAgB,EAAE,MAAM,GAAG,IAAI,CAAC;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,sBAAsB,EAAE,MAAM,GAAG,IAAI,CAAC;IACtC,oBAAoB,EAAE,MAAM,GAAG,IAAI,CAAC;IACpC,kBAAkB,EAAE,OAAO,GAAG,cAAc,GAAG,IAAI,CAAC;IACpD,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,OAAO,GAAG,cAAc,CAAC;IAC/B,QAAQ,EAAE,OAAO,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;CAC7B,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG;IAC7B,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,OAAO,CAAC;IAClB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,gBAAgB,EAAE,MAAM,GAAG,IAAI,CAAC;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,sBAAsB,EAAE,MAAM,GAAG,IAAI,CAAC;IACtC,oBAAoB,EAAE,MAAM,GAAG,IAAI,CAAC;IACpC,kBAAkB,EAAE,OAAO,GAAG,cAAc,GAAG,IAAI,CAAC;IACpD,OAAO,EAAE,gBAAgB,EAAE,CAAC;CAC7B,CAAC;AAEF,MAAM,MAAM,eAAe,GAAG;IAC5B,MAAM,EAAE,SAAS,CAAC;IAClB,SAAS,EAAE,OAAO,CAAC,MAAM,CAAC,iBAAiB,GAAG,kBAAkB,GAAG,2BAA2B,GAAG,gBAAgB,EAAE,MAAM,CAAC,CAAC,CAAC;IAC5H,SAAS,EAAE,YAAY,CAAC,WAAW,CAAC,CAAC;CACtC,CAAC;AAEF,MAAM,MAAM,WAAW,GAAG;IACxB,UAAU,EAAE,aAAa,CAAC;IAC1B,KAAK,EAAE,YAAY,CAAC;IACpB,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,QAAQ,EAAE,iBAAiB,EAAE,CAAC;CAC/B,CAAC;AAEF,MAAM,MAAM,YAAY,GAAG;IACzB,MAAM,EAAE,cAAc,CAAC;IACvB,MAAM,EAAE;QACN,UAAU,EAAE,OAAO,CAAC;QACpB,MAAM,EAAE,iBAAiB,CAAC;QAC1B,QAAQ,EAAE,OAAO,CAAC;QAClB,MAAM,EAAE,OAAO,CAAC;QAChB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;KACtB,CAAC;IACF,MAAM,EAAE;QACN,UAAU,EAAE,OAAO,CAAC;QACpB,MAAM,EAAE,iBAAiB,CAAC;QAC1B,QAAQ,EAAE,OAAO,CAAC;QAClB,MAAM,EAAE,OAAO,CAAC;QAChB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;KACtB,CAAC;CACH,CAAC;AAEF,KAAK,WAAW,GAAG;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;IACvC,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB,CAAC;AAEF,KAAK,oBAAoB,GAAG,cAAc,CAAC;AAC3C,KAAK,uBAAuB,GAAG,iBAAiB,CAAC;AAyKjD,qBAAa,cAAc;IACzB,QAAQ,CAAC,MAAM,EAAE,cAAc,CAAC;IAChC,QAAQ,CAAC,EAAE,EAAE,cAAc,CAAC;IAC5B,QAAQ,CAAC,MAAM,CAAC,EAAE,YAAY,CAAC;IAC/B,QAAQ,CAAC,EAAE,CAAC,EAAE,UAAU,CAAC;IACzB,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAiD;gBAE1E,OAAO,GAAE;QACnB,MAAM,CAAC,EAAE,cAAc,CAAC;QACxB,EAAE,CAAC,EAAE,cAAc,CAAC;QACpB,MAAM,CAAC,EAAE,YAAY,CAAC;QACtB,EAAE,CAAC,EAAE,UAAU,CAAC;KACZ;IASN,KAAK,IAAI,IAAI;IAKb,IAAI,IAAI,cAAc;IAehB,MAAM,IAAI,OAAO,CAAC,YAAY,CAAC;IAmDrC,gBAAgB,IAAI,oBAAoB;IAKxC,WAAW,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,IAAI,CAAC,EAAE,OAAO,GAAG,cAAc,CAAC;QAAC,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;QAAC,aAAa,CAAC,EAAE,OAAO,CAAA;KAAE,GAAG,eAAe;IAmDnJ,iBAAiB,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,aAAa,CAAC,EAAE,OAAO,CAAA;KAAE,GAAG,qBAAqB;IA4GzH,kBAAkB,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,GAAG,aAAa;IAmChG,mBAAmB,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE,GAAG,aAAa;IAiCxF,cAAc,CAClB,MAAM,EAAE,WAAW,GAClB,OAAO,CAAC,aAAa,CAAC;IA2InB,mBAAmB,CAAC,MAAM,EAAE;QAChC,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,eAAe,CAAC,EAAE,OAAO,CAAC;QAC1B,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;KACxC,GAAG,OAAO,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAC;QAAC,WAAW,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAC;QAAC,WAAW,EAAE,MAAM,CAAA;KAAE,CAAC;IAsFlH,aAAa,CAAC,MAAM,EAAE;QACpB,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;KACxC,GAAG;QAAE,cAAc,EAAE,MAAM,CAAC;QAAC,gBAAgB,EAAE,MAAM,CAAA;KAAE;IAoClD,eAAe,CAAC,MAAM,EAAE;QAC5B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;KACxC,GAAG,OAAO,CAAC,cAAc,CAAC;IAoDrB,iBAAiB,CAAC,MAAM,EAAE;QAC9B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,CAAC,CAAC,EAAE,MAAM,CAAC;QACX,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;KACxC,GAAG,OAAO,CAAC,gBAAgB,CAAC;IA0CvB,gBAAgB,CAAC,MAAM,EAAE;QAC7B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,CAAC,EAAE,UAAU,CAAC;QAClB,KAAK,CAAC,EAAE,MAAM,CAAC;KAChB,GAAG,OAAO,CAAC,oBAAoB,CAAC;IAmHjC,aAAa,CAAC,MAAM,EAAE;QACpB,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,YAAY,EAAE,MAAM,CAAC;QACrB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,GAAG,uBAAuB;IAiD3B,YAAY,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,aAAa,CAAC,EAAE,OAAO,CAAA;KAAE,GAAG,gBAAgB;IAyE1F,iBAAiB,CAAC,MAAM,EAAE;QAC9B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,IAAI,CAAC,EAAE,OAAO,CAAC;QACf,KAAK,CAAC,EAAE,OAAO,CAAC;QAChB,OAAO,CAAC,EAAE,OAAO,CAAC;QAClB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;KACxC,GAAG,OAAO,CAAC,eAAe,CAAC;IAkD5B,oBAAoB,CAAC,MAAM,EAAE;QAC3B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,IAAI,CAAC,EAAE,kBAAkB,CAAC;QAC1B,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,aAAa,CAAC,EAAE,OAAO,CAAC;KACzB,GAAG,wBAAwB;IA8B5B,oBAAoB,CAAC,MAAM,EAAE;QAC3B,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;QAClB,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,aAAa,CAAC,EAAE,OAAO,CAAC;KACzB,GAAG,qBAAqB;IAwDzB,cAAc,CAAC,MAAM,EAAE;QACrB,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,IAAI,CAAC,EAAE,kBAAkB,CAAC;QAC1B,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,qBAAqB,CAAC,EAAE,OAAO,CAAC;KACjC,GAAG,WAAW;IA2Bf,mBAAmB,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAC;QAAC,YAAY,CAAC,EAAE,MAAM,CAAA;KAAE,GAAG,gBAAgB;IAiExH,kBAAkB,CAAC,MAAM,EAAE;QACzB,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,gBAAgB,CAAC,EAAE,OAAO,CAAC;KAC5B,GAAG,eAAe;IA0Eb,WAAW,CAAC,OAAO,EAAE,aAAa,GAAG,OAAO,CAAC,cAAc,CAAC;IAgClE,OAAO,CAAC,kBAAkB;IAmE1B,OAAO,CAAC,oBAAoB;IA4B5B,OAAO,CAAC,eAAe;IA8BvB,OAAO,CAAC,mBAAmB;IAQ3B,OAAO,CAAC,+BAA+B;IAkBvC,OAAO,CAAC,0BAA0B;IAoElC,OAAO,CAAC,kBAAkB;IAiE1B,OAAO,CAAC,uBAAuB;IAuE/B,OAAO,CAAC,wBAAwB;YASlB,mBAAmB;IA4DjC,OAAO,CAAC,SAAS;IAOjB,OAAO,CAAC,aAAa;IAOrB,OAAO,CAAC,iBAAiB;IASzB,OAAO,CAAC,gBAAgB;IAgBxB,OAAO,CAAC,YAAY;YAoEN,uBAAuB;YAmFvB,2BAA2B;IAuGzC,OAAO,CAAC,eAAe;IA0BvB,OAAO,CAAC,eAAe;IAoCvB,OAAO,CAAC,kBAAkB;IAmD1B,OAAO,CAAC,mBAAmB;IAkD3B,OAAO,CAAC,oBAAoB;IAoB5B,OAAO,CAAC,uBAAuB;IAI/B,OAAO,CAAC,uBAAuB;YAKjB,sBAAsB;YAgCtB,2BAA2B;IAiCzC,OAAO,CAAC,mBAAmB;IAmB3B,OAAO,CAAC,mBAAmB;IAwB3B,OAAO,CAAC,oBAAoB;IAc5B,OAAO,CAAC,0BAA0B;IAoBlC,OAAO,CAAC,yBAAyB;IA4BjC,OAAO,CAAC,0BAA0B;IAwDlC,OAAO,CAAC,mBAAmB;IA+C3B,OAAO,CAAC,0BAA0B;IAyClC,OAAO,CAAC,OAAO;YAMD,wBAAwB;IA4GtC,OAAO,CAAC,oBAAoB;IAsB5B,OAAO,CAAC,4BAA4B;IAepC,OAAO,CAAC,oBAAoB;IAQ5B,OAAO,CAAC,iBAAiB;IAsDzB,OAAO,CAAC,mBAAmB;IAI3B,OAAO,CAAC,aAAa;IAarB,OAAO,CAAC,eAAe;IA2BvB,OAAO,CAAC,QAAQ;IAOhB,OAAO,CAAC,SAAS;CAkBlB;AAED,wBAAgB,eAAe,CAAC,GAAG,EAAE,GAAG,GAAG;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAOzE"}
|
package/dist/service.js
CHANGED
|
@@ -1,19 +1,25 @@
|
|
|
1
1
|
import http from 'node:http';
|
|
2
2
|
import crypto from 'node:crypto';
|
|
3
|
+
import { existsSync } from 'node:fs';
|
|
4
|
+
import os from 'node:os';
|
|
5
|
+
import { fileURLToPath } from 'node:url';
|
|
6
|
+
import { Worker } from 'node:worker_threads';
|
|
3
7
|
import { IterableMapper } from '@shutterstock/p-map-iterable';
|
|
4
8
|
import { actionResponseSchema, authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
|
|
5
9
|
import { buildClusters } from './cluster/build.js';
|
|
10
|
+
import { buildSourceKindEdges } from './cluster/exact-edges.js';
|
|
6
11
|
import { ensureRuntimeDirs, isLikelyGitHubToken, isLikelyOpenAiApiKey, loadConfig, requireGithubToken, requireOpenAiKey, } from './config.js';
|
|
7
12
|
import { migrate } from './db/migrate.js';
|
|
8
13
|
import { openDb } from './db/sqlite.js';
|
|
9
14
|
import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js';
|
|
10
15
|
import { makeGitHubClient } from './github/client.js';
|
|
11
16
|
import { OpenAiProvider } from './openai/provider.js';
|
|
12
|
-
import { cosineSimilarity, rankNearestNeighbors } from './search/exact.js';
|
|
17
|
+
import { cosineSimilarity, normalizeEmbedding, rankNearestNeighbors } from './search/exact.js';
|
|
13
18
|
const SYNC_BATCH_SIZE = 100;
|
|
14
19
|
const SYNC_BATCH_DELAY_MS = 5000;
|
|
15
20
|
const STALE_CLOSED_SWEEP_LIMIT = 1000;
|
|
16
21
|
const CLUSTER_PROGRESS_INTERVAL_MS = 5000;
|
|
22
|
+
const CLUSTER_PARALLEL_MIN_EMBEDDINGS = 5000;
|
|
17
23
|
const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3;
|
|
18
24
|
const EMBED_MAX_ITEM_TOKENS = 7000;
|
|
19
25
|
const EMBED_MAX_BATCH_TOKENS = 250000;
|
|
@@ -697,24 +703,15 @@ export class GHCrawlService {
|
|
|
697
703
|
throw error;
|
|
698
704
|
}
|
|
699
705
|
}
|
|
700
|
-
clusterRepository(params) {
|
|
706
|
+
async clusterRepository(params) {
|
|
701
707
|
const repository = this.requireRepository(params.owner, params.repo);
|
|
702
708
|
const runId = this.startRun('cluster_runs', repository.id, repository.fullName);
|
|
703
709
|
const minScore = params.minScore ?? 0.82;
|
|
704
710
|
const k = params.k ?? 6;
|
|
705
711
|
try {
|
|
706
|
-
const
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
threadMeta.set(row.id, { number: row.number, title: row.title });
|
|
710
|
-
}
|
|
711
|
-
const items = Array.from(threadMeta.entries()).map(([id, meta]) => ({
|
|
712
|
-
id,
|
|
713
|
-
number: meta.number,
|
|
714
|
-
title: meta.title,
|
|
715
|
-
}));
|
|
716
|
-
params.onProgress?.(`[cluster] loaded ${items.length} embedded thread(s) across ${new Set(rows.map((row) => row.source_kind)).size} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
|
|
717
|
-
const aggregatedEdges = this.aggregateRepositoryEdges(rows, {
|
|
712
|
+
const { items, sourceKinds } = this.loadClusterableThreadMeta(repository.id);
|
|
713
|
+
params.onProgress?.(`[cluster] loaded ${items.length} embedded thread(s) across ${sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
|
|
714
|
+
const aggregatedEdges = await this.aggregateRepositoryEdges(repository.id, sourceKinds, {
|
|
718
715
|
limit: k,
|
|
719
716
|
minScore,
|
|
720
717
|
onProgress: params.onProgress,
|
|
@@ -954,7 +951,7 @@ export class GHCrawlService {
|
|
|
954
951
|
});
|
|
955
952
|
}
|
|
956
953
|
if (selected.cluster) {
|
|
957
|
-
cluster = this.clusterRepository({
|
|
954
|
+
cluster = await this.clusterRepository({
|
|
958
955
|
owner: params.owner,
|
|
959
956
|
repo: params.repo,
|
|
960
957
|
onProgress: params.onProgress,
|
|
@@ -1207,7 +1204,7 @@ export class GHCrawlService {
|
|
|
1207
1204
|
});
|
|
1208
1205
|
}
|
|
1209
1206
|
case 'cluster': {
|
|
1210
|
-
const result = this.clusterRepository(request);
|
|
1207
|
+
const result = await this.clusterRepository(request);
|
|
1211
1208
|
return actionResponseSchema.parse({
|
|
1212
1209
|
ok: true,
|
|
1213
1210
|
action: request.action,
|
|
@@ -1950,13 +1947,39 @@ export class GHCrawlService {
|
|
|
1950
1947
|
if (cached) {
|
|
1951
1948
|
return cached;
|
|
1952
1949
|
}
|
|
1953
|
-
const parsed = this.loadStoredEmbeddings(repoId).map((row) =>
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1950
|
+
const parsed = this.loadStoredEmbeddings(repoId).map((row) => {
|
|
1951
|
+
const embedding = JSON.parse(row.embedding_json);
|
|
1952
|
+
const normalized = normalizeEmbedding(embedding);
|
|
1953
|
+
return {
|
|
1954
|
+
...row,
|
|
1955
|
+
embedding,
|
|
1956
|
+
normalizedEmbedding: normalized.normalized,
|
|
1957
|
+
embeddingNorm: normalized.norm,
|
|
1958
|
+
};
|
|
1959
|
+
});
|
|
1957
1960
|
this.parsedEmbeddingCache.set(repoId, parsed);
|
|
1958
1961
|
return parsed;
|
|
1959
1962
|
}
|
|
1963
|
+
loadClusterableThreadMeta(repoId) {
|
|
1964
|
+
const rows = this.db
|
|
1965
|
+
.prepare(`select t.id, t.number, t.title, e.source_kind
|
|
1966
|
+
from threads t
|
|
1967
|
+
join document_embeddings e on e.thread_id = t.id
|
|
1968
|
+
where t.repo_id = ?
|
|
1969
|
+
and t.state = 'open'
|
|
1970
|
+
and t.closed_at_local is null`)
|
|
1971
|
+
.all(repoId);
|
|
1972
|
+
const itemsById = new Map();
|
|
1973
|
+
const sourceKinds = new Set();
|
|
1974
|
+
for (const row of rows) {
|
|
1975
|
+
itemsById.set(row.id, { id: row.id, number: row.number, title: row.title });
|
|
1976
|
+
sourceKinds.add(row.source_kind);
|
|
1977
|
+
}
|
|
1978
|
+
return {
|
|
1979
|
+
items: Array.from(itemsById.values()),
|
|
1980
|
+
sourceKinds: Array.from(sourceKinds.values()),
|
|
1981
|
+
};
|
|
1982
|
+
}
|
|
1960
1983
|
listStoredClusterNeighbors(repoId, threadId, limit) {
|
|
1961
1984
|
const latestRun = this.getLatestClusterRun(repoId);
|
|
1962
1985
|
if (!latestRun) {
|
|
@@ -2072,50 +2095,119 @@ export class GHCrawlService {
|
|
|
2072
2095
|
const right = Math.max(leftThreadId, rightThreadId);
|
|
2073
2096
|
return `${left}:${right}`;
|
|
2074
2097
|
}
|
|
2075
|
-
aggregateRepositoryEdges(
|
|
2076
|
-
const bySource = new Map();
|
|
2077
|
-
for (const row of rows) {
|
|
2078
|
-
const list = bySource.get(row.source_kind) ?? [];
|
|
2079
|
-
list.push({ id: row.id, embedding: row.embedding });
|
|
2080
|
-
bySource.set(row.source_kind, list);
|
|
2081
|
-
}
|
|
2098
|
+
async aggregateRepositoryEdges(repoId, sourceKinds, params) {
|
|
2082
2099
|
const aggregated = new Map();
|
|
2083
|
-
const totalItems =
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
|
|
2100
|
+
const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + this.countEmbeddingsForSourceKind(repoId, sourceKind), 0);
|
|
2101
|
+
if (sourceKinds.length === 0 || totalItems === 0) {
|
|
2102
|
+
return aggregated;
|
|
2103
|
+
}
|
|
2104
|
+
const shouldParallelize = sourceKinds.length > 1 && totalItems >= CLUSTER_PARALLEL_MIN_EMBEDDINGS && os.availableParallelism() > 1;
|
|
2105
|
+
if (!shouldParallelize) {
|
|
2106
|
+
const rows = this.loadParsedStoredEmbeddings(repoId);
|
|
2107
|
+
const bySource = new Map();
|
|
2108
|
+
for (const row of rows) {
|
|
2109
|
+
const list = bySource.get(row.source_kind) ?? [];
|
|
2110
|
+
list.push({ id: row.id, normalizedEmbedding: row.normalizedEmbedding });
|
|
2111
|
+
bySource.set(row.source_kind, list);
|
|
2112
|
+
}
|
|
2113
|
+
let processedItems = 0;
|
|
2114
|
+
for (const sourceKind of sourceKinds) {
|
|
2115
|
+
const items = bySource.get(sourceKind) ?? [];
|
|
2116
|
+
const edges = buildSourceKindEdges(items, {
|
|
2090
2117
|
limit: params.limit,
|
|
2091
2118
|
minScore: params.minScore,
|
|
2092
|
-
|
|
2119
|
+
progressIntervalMs: CLUSTER_PROGRESS_INTERVAL_MS,
|
|
2120
|
+
onProgress: (progress) => {
|
|
2121
|
+
if (!params.onProgress)
|
|
2122
|
+
return;
|
|
2123
|
+
params.onProgress(`[cluster] identifying similarity edges ${processedItems + progress.processedItems}/${totalItems} source embeddings processed current_edges~=${aggregated.size + progress.currentEdgeEstimate}`);
|
|
2124
|
+
},
|
|
2093
2125
|
});
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2126
|
+
processedItems += items.length;
|
|
2127
|
+
this.mergeSourceKindEdges(aggregated, edges, sourceKind);
|
|
2128
|
+
}
|
|
2129
|
+
return aggregated;
|
|
2130
|
+
}
|
|
2131
|
+
const workerUrl = this.resolveEdgeWorkerUrl();
|
|
2132
|
+
const progressBySource = new Map();
|
|
2133
|
+
const edgeSets = await Promise.all(sourceKinds.map((sourceKind) => new Promise((resolve, reject) => {
|
|
2134
|
+
const worker = new Worker(workerUrl, {
|
|
2135
|
+
workerData: {
|
|
2136
|
+
dbPath: this.config.dbPath,
|
|
2137
|
+
repoId,
|
|
2138
|
+
sourceKind,
|
|
2139
|
+
limit: params.limit,
|
|
2140
|
+
minScore: params.minScore,
|
|
2141
|
+
},
|
|
2142
|
+
});
|
|
2143
|
+
worker.on('message', (message) => {
|
|
2144
|
+
if (!message || typeof message !== 'object') {
|
|
2145
|
+
return;
|
|
2146
|
+
}
|
|
2147
|
+
const typed = message;
|
|
2148
|
+
if (typed.type === 'progress') {
|
|
2149
|
+
progressBySource.set(typed.sourceKind, {
|
|
2150
|
+
processedItems: typed.processedItems,
|
|
2151
|
+
totalItems: typed.totalItems,
|
|
2152
|
+
currentEdgeEstimate: typed.currentEdgeEstimate,
|
|
2107
2153
|
});
|
|
2154
|
+
if (params.onProgress) {
|
|
2155
|
+
const processedItems = Array.from(progressBySource.values()).reduce((sum, value) => sum + value.processedItems, 0);
|
|
2156
|
+
const currentEdgeEstimate = Array.from(progressBySource.values()).reduce((sum, value) => sum + value.currentEdgeEstimate, 0);
|
|
2157
|
+
params.onProgress(`[cluster] identifying similarity edges ${processedItems}/${totalItems} source embeddings processed current_edges~=${aggregated.size + currentEdgeEstimate}`);
|
|
2158
|
+
}
|
|
2159
|
+
return;
|
|
2108
2160
|
}
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
|
|
2113
|
-
|
|
2161
|
+
resolve(typed.edges);
|
|
2162
|
+
});
|
|
2163
|
+
worker.on('error', reject);
|
|
2164
|
+
worker.on('exit', (code) => {
|
|
2165
|
+
if (code !== 0) {
|
|
2166
|
+
reject(new Error(`edge worker for ${sourceKind} exited with code ${code}`));
|
|
2114
2167
|
}
|
|
2115
|
-
}
|
|
2168
|
+
});
|
|
2169
|
+
})));
|
|
2170
|
+
for (const [index, edges] of edgeSets.entries()) {
|
|
2171
|
+
this.mergeSourceKindEdges(aggregated, edges, sourceKinds[index]);
|
|
2116
2172
|
}
|
|
2117
2173
|
return aggregated;
|
|
2118
2174
|
}
|
|
2175
|
+
mergeSourceKindEdges(aggregated, edges, sourceKind) {
|
|
2176
|
+
for (const edge of edges) {
|
|
2177
|
+
const key = this.edgeKey(edge.leftThreadId, edge.rightThreadId);
|
|
2178
|
+
const existing = aggregated.get(key);
|
|
2179
|
+
if (existing) {
|
|
2180
|
+
existing.score = Math.max(existing.score, edge.score);
|
|
2181
|
+
existing.sourceKinds.add(sourceKind);
|
|
2182
|
+
continue;
|
|
2183
|
+
}
|
|
2184
|
+
aggregated.set(key, {
|
|
2185
|
+
leftThreadId: edge.leftThreadId,
|
|
2186
|
+
rightThreadId: edge.rightThreadId,
|
|
2187
|
+
score: edge.score,
|
|
2188
|
+
sourceKinds: new Set([sourceKind]),
|
|
2189
|
+
});
|
|
2190
|
+
}
|
|
2191
|
+
}
|
|
2192
|
+
countEmbeddingsForSourceKind(repoId, sourceKind) {
|
|
2193
|
+
const row = this.db
|
|
2194
|
+
.prepare(`select count(*) as count
|
|
2195
|
+
from document_embeddings e
|
|
2196
|
+
join threads t on t.id = e.thread_id
|
|
2197
|
+
where t.repo_id = ?
|
|
2198
|
+
and t.state = 'open'
|
|
2199
|
+
and t.closed_at_local is null
|
|
2200
|
+
and e.source_kind = ?`)
|
|
2201
|
+
.get(repoId, sourceKind);
|
|
2202
|
+
return row.count;
|
|
2203
|
+
}
|
|
2204
|
+
resolveEdgeWorkerUrl() {
|
|
2205
|
+
const jsUrl = new URL('./cluster/edge-worker.js', import.meta.url);
|
|
2206
|
+
if (existsSync(fileURLToPath(jsUrl))) {
|
|
2207
|
+
return jsUrl;
|
|
2208
|
+
}
|
|
2209
|
+
return new URL('./cluster/edge-worker.ts', import.meta.url);
|
|
2210
|
+
}
|
|
2119
2211
|
persistClusterRun(repoId, runId, aggregatedEdges, clusters) {
|
|
2120
2212
|
const insertEdge = this.db.prepare(`insert into similarity_edges (repo_id, cluster_run_id, left_thread_id, right_thread_id, method, score, explanation_json, created_at)
|
|
2121
2213
|
values (?, ?, ?, ?, ?, ?, ?, ?)`);
|