gitnexus 1.6.8-rc.7 → 1.6.8-rc.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,6 +21,7 @@ import { isHttpMode, getHttpDimensions, httpEmbed } from './http-client.js';
21
21
  import { resolveEmbeddingConfig } from './config.js';
22
22
  import { applyHfEnvOverrides, isHfDownloadFailure, withHfDownloadRetry } from './hf-env.js';
23
23
  import { getLocalEmbeddingRuntimeBlocker } from './runtime-support.js';
24
+ import { ensureOnnxRuntimeCommonResolvable } from './onnxruntime-common-resolver.js';
24
25
  import { logger } from '../logger.js';
25
26
  /**
26
27
  * Check whether the onnxruntime-node package that @huggingface/transformers
@@ -147,6 +148,9 @@ export const initEmbedder = async (onProgress, config = {}, forceDevice) => {
147
148
  try {
148
149
  // Lazy-load transformers.js only after the runtime guard has passed, so
149
150
  // unsupported platforms never reach the native ONNX import (#1515).
151
+ // Under pnpm-strict / `pnpm dlx`, transformers' phantom `onnxruntime-common`
152
+ // import is unresolvable; register the fallback resolver first (#307).
153
+ ensureOnnxRuntimeCommonResolvable();
150
154
  const { pipeline, env } = await import('@huggingface/transformers');
151
155
  // Configure transformers.js environment
152
156
  env.allowLocalModels = false;
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Idempotently install the onnxruntime-common resolution fallback. Call once
3
+ * immediately before the dynamic `import('@huggingface/transformers')` on the
4
+ * local-embedding path.
5
+ */
6
+ export declare const ensureOnnxRuntimeCommonResolvable: () => void;
@@ -0,0 +1,130 @@
1
+ /**
2
+ * Make `@huggingface/transformers`' phantom `onnxruntime-common` import
3
+ * resolvable under strict package-manager layouts (#307, #2069).
4
+ *
5
+ * ## Why
6
+ * transformers' shipped `dist/transformers.node.mjs` does a bare
7
+ * `import 'onnxruntime-common'`, but transformers' `package.json` never declares
8
+ * onnxruntime-common (it lists onnxruntime-node / onnxruntime-web / sharp). With
9
+ * npm's flat `node_modules` — or pnpm with hoisting — the package is hoisted to
10
+ * a directory on transformers' resolution path and the import resolves by
11
+ * accident. Under pnpm's isolated store (and therefore `pnpm dlx` / `pnpx`), a
12
+ * package only sees its *declared* deps, so the import dies with
13
+ * `ERR_MODULE_NOT_FOUND` before `analyze --embeddings` can run.
14
+ *
15
+ * Declaring onnxruntime-common in gitnexus' own dependencies (#2074) does NOT
16
+ * fix this under pnpm: Node resolves the bare specifier from *transformers'*
17
+ * module scope, not ours, and overrides/resolutions can only re-version an
18
+ * existing edge, never add the missing one.
19
+ *
20
+ * ## What this does
21
+ * Install a synchronous, in-thread ESM resolution hook (`module.registerHooks`,
22
+ * Node >= 22.15) that redirects `onnxruntime-common` to a copy gitnexus can
23
+ * resolve — but only when the default resolver fails. The redirect target is
24
+ * preferentially the `onnxruntime-common` that `onnxruntime-node` (the native
25
+ * binding transformers actually loads) itself depends on, so the redirected copy
26
+ * is version-matched to that binding even under `pnpm dlx` — where gitnexus'
27
+ * npm-style `overrides` block does NOT apply, because it is honoured only from a
28
+ * root manifest and gitnexus is a transitive dependency there. It falls back to
29
+ * gitnexus' own direct `onnxruntime-common` dependency when that chain can't be
30
+ * walked. onnxruntime-common is a stable, pure-JS package whose `Tensor` surface
31
+ * is unchanged across 1.24–1.26, so either target is API-compatible. On working
32
+ * layouts the default resolver succeeds first and the hook never fires, so
33
+ * behaviour is unchanged.
34
+ *
35
+ * `registerHooks` (synchronous, in-thread) is preferred over the older
36
+ * `module.register` (async, off-thread, now deprecated — DEP0205, removed in
37
+ * Node 26): the redirect is a one-line conditional that needs no worker thread,
38
+ * no separate hook module, and no `data` marshalling.
39
+ *
40
+ * ## Safety
41
+ * Best-effort and idempotent. The hook is installed lazily, only on the
42
+ * local-embedding code path (after parsing), so it is never registered during
43
+ * analysis, in the parse workers, or in HTTP embedding mode. Once installed it
44
+ * is process-global: its resolve closure runs for every subsequent module
45
+ * resolution, but it passes all of them through untouched and only substitutes a
46
+ * result for the exact `onnxruntime-common` specifier when that specifier is
47
+ * unresolvable (missing, or present but not exported) — so it cannot mask an unrelated resolution error, and the
48
+ * per-resolution cost is a single string comparison.
49
+ *
50
+ * `module.registerHooks` is marked `@experimental` and requires Node >= 22.15
51
+ * (the gitnexus engines floor is >= 22.0.0). On older runtimes it is absent and
52
+ * this is a graceful no-op: embeddings then resolve onnxruntime-common exactly
53
+ * as before — fine on hoisted layouts. Any failure during installation is
54
+ * swallowed.
55
+ */
56
// `module.registerHooks` only exists on Node >= 22.15. It must NOT be a static
// named import: on an older runtime a named import of a missing builtin export
// fails at link time ("does not provide an export named 'registerHooks'"),
// which would abort loading of every module that imports this file before the
// `typeof registerHooks` guard below could ever run. Reading it off the default
// export instead yields `undefined` on those runtimes, so the guard works.
import nodeModule, { createRequire } from 'node:module';
import { pathToFileURL } from 'node:url';
import { logger } from '../logger.js';
/** `module.registerHooks` when the runtime provides it, otherwise `undefined`. */
const { registerHooks } = nodeModule;
/** One-shot latch: flipped by the first install attempt so later calls return immediately. */
let attempted = false;
60
/**
 * Work out the `file:` URL that the resolution hook hands back for
 * `onnxruntime-common`.
 *
 * First choice is the copy reachable by walking the dependency chain
 * gitnexus -> `@huggingface/transformers` (main entry) -> `onnxruntime-node`
 * (`package.json`) -> `onnxruntime-common`. If any hop of that walk throws,
 * the copy resolvable directly from this module is used instead.
 *
 * @returns The `href` of a `file:` URL pointing at the resolved entry point.
 * @throws Whatever `require.resolve` throws when the fallback lookup fails too.
 */
const resolveOnnxRuntimeCommonUrl = () => {
    const localRequire = createRequire(import.meta.url);
    try {
        // Each hop builds a `require` rooted at the previously resolved file, so
        // the lookup happens from that package's own resolution scope.
        const fromTransformers = createRequire(localRequire.resolve('@huggingface/transformers'));
        const fromOrtNode = createRequire(fromTransformers.resolve('onnxruntime-node/package.json'));
        const chainedEntry = fromOrtNode.resolve('onnxruntime-common');
        return pathToFileURL(chainedEntry).href;
    }
    catch {
        // Chain could not be walked: fall back to the copy visible from here.
        const ownEntry = localRequire.resolve('onnxruntime-common');
        return pathToFileURL(ownEntry).href;
    }
};
84
/**
 * Install the `onnxruntime-common` resolution fallback at most once per
 * process. Intended to be called right before the dynamic
 * `import('@huggingface/transformers')` on the local-embedding path.
 *
 * Never throws: when `registerHooks` is unavailable it returns without doing
 * anything, and any error raised while installing is logged at debug level
 * and swallowed.
 */
export const ensureOnnxRuntimeCommonResolvable = () => {
    // Latch first, work second: a failed attempt must not be retried on every
    // call, and a successful hook stays registered for the whole process.
    if (attempted)
        return;
    attempted = true;
    const targetSpecifier = 'onnxruntime-common';
    // Failure codes that mean "the default resolver could not provide the
    // package" (missing, or present but not exported). Anything else is rethrown.
    const redirectableCodes = ['ERR_MODULE_NOT_FOUND', 'ERR_PACKAGE_PATH_NOT_EXPORTED'];
    try {
        if (typeof registerHooks === 'function') {
            const redirectUrl = resolveOnnxRuntimeCommonUrl();
            const resolve = (specifier, context, nextResolve) => {
                // Every other specifier goes straight to the default chain.
                if (specifier !== targetSpecifier)
                    return nextResolve(specifier, context);
                // Give the default resolver the first shot so a copy provided by
                // the package manager always wins over the redirect.
                try {
                    return nextResolve(specifier, context);
                }
                catch (err) {
                    if (!redirectableCodes.includes(err?.code))
                        throw err;
                    return { url: redirectUrl, shortCircuit: true };
                }
            };
            registerHooks({ resolve });
            logger.debug({ redirectUrl }, 'Installed onnxruntime-common resolution fallback (#307)');
        }
        // else: no synchronous hooks API on this runtime — leave resolution as is.
    }
    catch (err) {
        // Best-effort only: embeddings must never be blocked by the fallback.
        const reason = err instanceof Error ? err.message : String(err);
        logger.debug({ err: reason }, 'onnxruntime-common resolution fallback not installed');
    }
};
@@ -147,6 +147,12 @@ interface LanguageProviderConfig {
147
147
  * `undefined` when no constraints exist / the node isn't a templated
148
148
  * function. Languages without SFINAE / concept semantics leave this
149
149
  * undefined and the disambiguation is a pass-through.
150
+ *
151
+ * Cloneability contract: the returned payload crosses the worker boundary
152
+ * via structured clone, so it MUST be structured-clone-safe (no functions,
153
+ * symbols, or tree-sitter `SyntaxNode`s — only plain data). Wrap the return
154
+ * with `assertCloneable` from `workers/clone-safety.ts` so a future leak is a
155
+ * compile error at the source instead of a runtime DataCloneError (#2143).
150
156
  */
151
157
  readonly extractTemplateConstraints?: (definitionNode: SyntaxNode) => unknown;
152
158
  /** Override the default node label for definition.function captures.
@@ -268,8 +274,12 @@ interface LanguageProviderConfig {
268
274
  * disk store WITHOUT a main-thread re-parse. The main thread restores them
269
275
  * via the matching `ScopeResolver.applyCaptureSideChannel` hook.
270
276
  *
271
- * MUST return plain data (objects / arrays / primitives) so it round-trips
272
- * through `JSON.stringify` + the parsedfile-store interning reviver.
277
+ * Cloneability contract: MUST return plain data (objects / arrays /
278
+ * primitives — no functions, symbols, or tree-sitter `SyntaxNode`s) so it
279
+ * survives BOTH the worker→main structured clone AND `JSON.stringify` + the
280
+ * parsedfile-store interning reviver. Wrap the return with `assertCloneable`
281
+ * from `workers/clone-safety.ts` so a future non-serializable leak is a
282
+ * compile error at the source instead of a runtime DataCloneError (#2143).
273
283
  *
274
284
  * Default: undefined (provider has no capture-time module-level side effects).
275
285
  */
@@ -38,7 +38,8 @@ import { cCallConfig, cppCallConfig } from '../call-extractors/configs/c-cpp.js'
38
38
  import { stripUeMacros } from '../cpp-ue-preprocessor.js';
39
39
  import { emitCScopeCaptures, interpretCImport, interpretCTypeBinding, cArityCompatibility, cBindingScopeFor, cImportOwningScope, cReceiverBinding, collectCStaticLinkageSideChannel, } from './c/index.js';
40
40
  import { emitCppScopeCaptures, interpretCppImport, interpretCppTypeBinding, cppArityCompatibility, cppBindingScopeFor, cppImportOwningScope, cppReceiverBinding, collectCppCaptureSideChannel, } from './cpp/index.js';
41
- import { extractCppTemplateConstraints } from './cpp/constraint-extractor.js';
41
+ import { extractCppTemplateConstraints, } from './cpp/constraint-extractor.js';
42
+ import { assertCloneable } from '../workers/clone-safety.js';
42
43
  const C_BUILT_INS = new Set([
43
44
  'printf',
44
45
  'fprintf',
@@ -358,7 +359,10 @@ export const cProvider = defineLanguage({
358
359
  // `static` functions look non-file-local on the main thread and leak into
359
360
  // cross-file global free-call resolution / wildcard imports. See
360
361
  // `c/capture-side-channel.ts`.
361
- collectCaptureSideChannel: collectCStaticLinkageSideChannel,
362
+ // `assertCloneable` is a runtime identity; it makes a future non-serializable
363
+ // value in the side-channel payload a compile error here, at the source, rather
364
+ // than a DataCloneError at the worker boundary (#2143).
365
+ collectCaptureSideChannel: (filePath) => assertCloneable(collectCStaticLinkageSideChannel(filePath)),
362
366
  interpretImport: interpretCImport,
363
367
  interpretTypeBinding: interpretCTypeBinding,
364
368
  bindingScopeFor: cBindingScopeFor,
@@ -431,7 +435,7 @@ export const cppProvider = defineLanguage({
431
435
  // just populated for this file into plain data on `ParsedFile.captureSideChannel`,
432
436
  // so the main thread can restore them via `applyCaptureSideChannel` WITHOUT a
433
437
  // re-parse (#1983). See `cpp/capture-side-channel.ts`.
434
- collectCaptureSideChannel: collectCppCaptureSideChannel,
438
+ collectCaptureSideChannel: (filePath) => assertCloneable(collectCppCaptureSideChannel(filePath)),
435
439
  interpretImport: interpretCppImport,
436
440
  interpretTypeBinding: interpretCppTypeBinding,
437
441
  bindingScopeFor: cppBindingScopeFor,
@@ -482,5 +486,8 @@ function extractCppTemplateConstraintsForProvider(definitionNode) {
482
486
  }
483
487
  break;
484
488
  }
485
- return extractCppTemplateConstraints(templateDecl, declarator);
489
+ // Guard the boundary at the source: a future non-cloneable member of the
490
+ // constraint payload becomes a compile error here, not a runtime
491
+ // DataCloneError at the worker post (#2143).
492
+ return assertCloneable(extractCppTemplateConstraints(templateDecl, declarator));
486
493
  }
@@ -10,6 +10,7 @@ import { SupportedLanguages } from '../../../_shared/index.js';
10
10
  import { createClassExtractor } from '../class-extractors/generic.js';
11
11
  import { kotlinClassConfig } from '../class-extractors/configs/jvm.js';
12
12
  import { defineLanguage } from '../language-provider.js';
13
+ import { assertCloneable } from '../workers/clone-safety.js';
13
14
  import { kotlinTypeConfig } from '../type-extractors/jvm.js';
14
15
  import { kotlinExportChecker } from '../export-detection.js';
15
16
  import { createImportResolver } from '../import-resolvers/resolver-factory.js';
@@ -166,7 +167,10 @@ export const kotlinProvider = defineLanguage({
166
167
  // so the main thread can restore them via `applyCaptureSideChannel` WITHOUT a
167
168
  // re-parse (#1983). Without this, companion/static dispatch emits no CALLS
168
169
  // edges on the worker path. See `kotlin/capture-side-channel.ts`.
169
- collectCaptureSideChannel: collectKotlinCaptureSideChannel,
170
+ // `assertCloneable` is a runtime identity; it makes a future non-serializable
171
+ // value in the side-channel payload a compile error here, at the source, rather
172
+ // than a DataCloneError at the worker boundary (#2143).
173
+ collectCaptureSideChannel: (filePath) => assertCloneable(collectKotlinCaptureSideChannel(filePath)),
170
174
  interpretImport: interpretKotlinImport,
171
175
  interpretTypeBinding: interpretKotlinTypeBinding,
172
176
  bindingScopeFor: kotlinBindingScopeFor,
@@ -144,6 +144,27 @@ chunkHash) => {
144
144
  .join(', ');
145
145
  logger.warn(` Skipped unsupported languages: ${summary}`);
146
146
  }
147
+ // Clone-safety telemetry (#2112): files whose parse output carried a value
148
+ // the structured-clone algorithm couldn't serialize across the worker
149
+ // boundary. The worker sanitized/dropped the offending value so the run
150
+ // could complete; surface the (rare) data loss so it's visible and the
151
+ // offending extractor can be fixed at source.
152
+ const skippedPaths = [];
153
+ for (const result of chunkResults) {
154
+ for (const entry of result.skippedPaths ?? [])
155
+ skippedPaths.push(entry);
156
+ }
157
+ if (skippedPaths.length > 0) {
158
+ // Keep the per-file reason ("stripped N value(s) from nodes" /
159
+ // "dropped non-serializable parsedFiles entry") — it distinguishes a
160
+ // recoverable strip from a whole-record drop, which a path-only line loses.
161
+ const shown = skippedPaths
162
+ .slice(0, 10)
163
+ .map((e) => `${e.path} (${e.reason})`)
164
+ .join(', ');
165
+ const more = skippedPaths.length > 10 ? ` …and ${skippedPaths.length - 10} more` : '';
166
+ logger.warn(` Sanitized ${skippedPaths.length} file(s) with non-serializable parse output: ${shown}${more}`);
167
+ }
147
168
  onFileProgress?.(total, total, 'done');
148
169
  return chunkResults;
149
170
  };
@@ -0,0 +1,109 @@
1
+ /**
2
+ * Structured-clone safety for the worker result boundary (#2112).
3
+ *
4
+ * A parse worker delivers its accumulated result to the main thread via
5
+ * `parentPort.postMessage(...)`. Node serializes that payload with the
6
+ * structured-clone algorithm SYNCHRONOUSLY on the worker thread, and it
7
+ * THROWS a `DataCloneError` the instant it meets a value it can't serialize —
8
+ * a function, a symbol, a Promise, a WeakMap, etc. The reporter of #2112 hit
9
+ * exactly this: a node record whose `properties` carried an own-enumerable
10
+ * value pointing at a native function (`function toString() { [native code] }
11
+ * could not be cloned`). One such value aborted the entire parse phase,
12
+ * because the worker re-posts the throw as `{type:'error'}` which the pool
13
+ * counts as a worker death — and under `GITNEXUS_WORKER_POOL_SIZE=1` the same
14
+ * graph re-throws on every respawn until the slot's budget is exhausted.
15
+ *
16
+ * This module is the safety net. It runs ONLY after a real clone failure on
17
+ * the fast-path post (zero overhead on healthy runs), and rewrites the
18
+ * boundary-crossing arrays so the result becomes cloneable: a non-cloneable
19
+ * value inside a plain extraction record is dropped (the record is otherwise
20
+ * kept — strictly-missing data, never wrong), and a `ParsedFile` that can't be
21
+ * made cloneable is dropped whole so scope-resolution re-derives it on the
22
+ * main thread (where there is no clone boundary) with intact edge data.
23
+ *
24
+ * Language-neutral by construction: it keys on value shape and field name
25
+ * only, never on a language (AGENTS.md shared-pipeline rule). The strip
26
+ * semantics mirror what the store path's `JSON.stringify` already silently
27
+ * drops, so store / no-store / cold / warm runs converge on the same graph.
28
+ */
29
+ /** A file whose parse result was sanitized or dropped at the clone boundary. */
30
+ export interface SkippedPath {
31
+ /** Best-effort source path of the offending record (or `(unknown)`). */
32
+ path: string;
33
+ /** Human-readable reason, e.g. "stripped 1 value(s) from nodes" or "dropped non-serializable parsedFiles entry". */
34
+ reason: string;
35
+ }
36
+ /**
37
+ * True iff `value` survives Node's structured-clone algorithm (the same
38
+ * algorithm `postMessage` uses). This is the authoritative probe — it matches
39
+ * the real failure exactly, including Map/Set/Date/RegExp/TypedArray support,
40
+ * so it never false-positives on the `Scope` Maps that clone fine.
41
+ */
42
+ export declare function isStructuredCloneable(value: unknown): boolean;
43
/** The leaf values the structured-clone algorithm copies verbatim. */
type CloneablePrimitive = undefined | null | boolean | number | bigint | string;
/** True iff `T` is `any` (the canonical `IsAny` probe: only `any` satisfies `0 extends 1 & T`). */
type IsAny<T> = 0 extends 1 & T ? true : false;
/**
 * Maps `T` to itself when every value reachable from it is structured-clone
 * safe, and to a type containing `never` at the first offending property
 * otherwise. A function or symbol — the values `postMessage` rejects — becomes
 * `never`, so a struct carrying one is no longer assignable to its own
 * `Cloneable<T>` and `assertCloneable` rejects it, naming the bad key. `any`
 * is rejected outright (it maps to `never`), since nothing can be proven
 * about it.
 *
 * Implemented as a homomorphic mapped type (`{ [K in keyof T]: … }`) so it
 * preserves `interface` shapes and `readonly` modifiers and works WITHOUT
 * requiring the payload types to carry an index signature — sidestepping the
 * "closed interface is not assignable to a recursive index-signature type" wall
 * that blocked the value-typed-`Cloneable` approach (#2143). `Map`/`Set`/array
 * containers recurse into their element types; `Date`/`RegExp` are clone-safe
 * leaves.
 */
export type Cloneable<T> = IsAny<T> extends true
    ? never
    : T extends CloneablePrimitive | Date | RegExp
        ? T
        : T extends (...args: never[]) => unknown
            ? never
            : T extends symbol
                ? never
                : T extends ReadonlyMap<infer K, infer V>
                    ? ReadonlyMap<Cloneable<K>, Cloneable<V>>
                    : T extends ReadonlySet<infer U>
                        ? ReadonlySet<Cloneable<U>>
                        : T extends readonly (infer U)[]
                            // Keep mutable arrays mutable and readonly arrays readonly.
                            ? T extends unknown[]
                                ? Cloneable<U>[]
                                : readonly Cloneable<U>[]
                            : T extends object
                                ? { [K in keyof T]: Cloneable<T[K]> }
                                : never;
65
+ /**
66
+ * Identity at runtime (zero cost — returns its argument unchanged); a
67
+ * compile-time assertion that `value` is structured-clone safe. Wrap a
68
+ * producer that feeds an `unknown` worker-result sink:
69
+ *
70
+ * collectCaptureSideChannel: (filePath) => assertCloneable(collectFoo(filePath))
71
+ *
72
+ * If `collectFoo`'s return type ever gains a non-cloneable member (a function, a
73
+ * `SyntaxNode`, …) the call fails to compile, pointing at the offending key.
74
+ *
75
+ * The parameter is a conditional type rather than an `extends Cloneable<T>`
76
+ * constraint because a self-referential constraint (`T extends Cloneable<T>`)
77
+ * is a "circular constraint" error in TypeScript. For a clone-safe `T` the
78
+ * parameter resolves to `T` (call type-checks as a plain identity); for an
79
+ * unsafe `T` it resolves to `Cloneable<T>` (which has `never` at the bad key),
80
+ * so the argument is rejected.
81
+ */
82
+ export declare function assertCloneable<T>(value: T extends Cloneable<T> ? T : Cloneable<T>): T;
83
+ export interface MakeCloneSafeOptions {
84
+ /**
85
+ * Array field names whose offending elements are DROPPED whole rather than
86
+ * stripped in place (e.g. `parsedFiles` — its `captureSideChannel` drives
87
+ * edge resolution, so a stripped-and-delivered file would ship WRONG edges;
88
+ * dropping it lets scope-resolution re-derive it on the main thread).
89
+ */
90
+ dropWholeElement: ReadonlySet<string>;
91
+ /** Field names to skip entirely (e.g. the `skippedPaths` field itself). */
92
+ skipFields?: ReadonlySet<string>;
93
+ /** Keys to probe for a file path when attributing a skip. */
94
+ pathKeys?: readonly string[];
95
+ }
96
+ /**
97
+ * Make a worker result's boundary-crossing array fields structured-cloneable,
98
+ * mutating `result` in place. Only arrays that actually contain a
99
+ * non-cloneable value are rewritten; everything else keeps referential
100
+ * identity. Returns the list of affected file paths for reporting.
101
+ *
102
+ * Call this after ANY failure of the fast-path post — a `DataCloneError`, OR a
103
+ * throwing getter's own error surfaced by structuredClone (the caller in
104
+ * `post-result.ts` recovers on any throw, not only `DataCloneError`).
105
+ */
106
+ export declare function makeWorkerResultCloneSafe(result: Record<string, unknown>, options: MakeCloneSafeOptions): {
107
+ skipped: SkippedPath[];
108
+ };
109
+ export {};