@opencodehub/ingestion 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +69 -0
- package/dist/extract/index.d.ts +8 -0
- package/dist/extract/index.d.ts.map +1 -0
- package/dist/extract/index.js +6 -0
- package/dist/extract/index.js.map +1 -0
- package/dist/extract/orm-detector.d.ts +19 -0
- package/dist/extract/orm-detector.d.ts.map +1 -0
- package/dist/extract/orm-detector.js +209 -0
- package/dist/extract/orm-detector.js.map +1 -0
- package/dist/extract/property-access.d.ts +76 -0
- package/dist/extract/property-access.d.ts.map +1 -0
- package/dist/extract/property-access.js +260 -0
- package/dist/extract/property-access.js.map +1 -0
- package/dist/extract/receiver-resolver.d.ts +86 -0
- package/dist/extract/receiver-resolver.d.ts.map +1 -0
- package/dist/extract/receiver-resolver.js +77 -0
- package/dist/extract/receiver-resolver.js.map +1 -0
- package/dist/extract/route-detector-java.d.ts +29 -0
- package/dist/extract/route-detector-java.d.ts.map +1 -0
- package/dist/extract/route-detector-java.js +190 -0
- package/dist/extract/route-detector-java.js.map +1 -0
- package/dist/extract/route-detector-nestjs.d.ts +30 -0
- package/dist/extract/route-detector-nestjs.d.ts.map +1 -0
- package/dist/extract/route-detector-nestjs.js +134 -0
- package/dist/extract/route-detector-nestjs.js.map +1 -0
- package/dist/extract/route-detector-python.d.ts +28 -0
- package/dist/extract/route-detector-python.d.ts.map +1 -0
- package/dist/extract/route-detector-python.js +100 -0
- package/dist/extract/route-detector-python.js.map +1 -0
- package/dist/extract/route-detector-rails.d.ts +28 -0
- package/dist/extract/route-detector-rails.d.ts.map +1 -0
- package/dist/extract/route-detector-rails.js +162 -0
- package/dist/extract/route-detector-rails.js.map +1 -0
- package/dist/extract/route-detector.d.ts +45 -0
- package/dist/extract/route-detector.d.ts.map +1 -0
- package/dist/extract/route-detector.js +467 -0
- package/dist/extract/route-detector.js.map +1 -0
- package/dist/extract/tool-detector.d.ts +26 -0
- package/dist/extract/tool-detector.d.ts.map +1 -0
- package/dist/extract/tool-detector.js +364 -0
- package/dist/extract/tool-detector.js.map +1 -0
- package/dist/extract/types.d.ts +89 -0
- package/dist/extract/types.d.ts.map +1 -0
- package/dist/extract/types.js +11 -0
- package/dist/extract/types.js.map +1 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +10 -0
- package/dist/index.js.map +1 -0
- package/dist/parse/cobol-regex.d.ts +85 -0
- package/dist/parse/cobol-regex.d.ts.map +1 -0
- package/dist/parse/cobol-regex.js +355 -0
- package/dist/parse/cobol-regex.js.map +1 -0
- package/dist/parse/grammar-registry.d.ts +115 -0
- package/dist/parse/grammar-registry.d.ts.map +1 -0
- package/dist/parse/grammar-registry.js +278 -0
- package/dist/parse/grammar-registry.js.map +1 -0
- package/dist/parse/index.d.ts +14 -0
- package/dist/parse/index.d.ts.map +1 -0
- package/dist/parse/index.js +10 -0
- package/dist/parse/index.js.map +1 -0
- package/dist/parse/language-detector.d.ts +17 -0
- package/dist/parse/language-detector.d.ts.map +1 -0
- package/dist/parse/language-detector.js +104 -0
- package/dist/parse/language-detector.js.map +1 -0
- package/dist/parse/parse-worker.d.ts +24 -0
- package/dist/parse/parse-worker.d.ts.map +1 -0
- package/dist/parse/parse-worker.js +230 -0
- package/dist/parse/parse-worker.js.map +1 -0
- package/dist/parse/types.d.ts +49 -0
- package/dist/parse/types.d.ts.map +1 -0
- package/dist/parse/types.js +11 -0
- package/dist/parse/types.js.map +1 -0
- package/dist/parse/unified-queries.d.ts +37 -0
- package/dist/parse/unified-queries.d.ts.map +1 -0
- package/dist/parse/unified-queries.js +623 -0
- package/dist/parse/unified-queries.js.map +1 -0
- package/dist/parse/wasm-fallback.d.ts +88 -0
- package/dist/parse/wasm-fallback.d.ts.map +1 -0
- package/dist/parse/wasm-fallback.js +258 -0
- package/dist/parse/wasm-fallback.js.map +1 -0
- package/dist/parse/worker-pool.d.ts +48 -0
- package/dist/parse/worker-pool.d.ts.map +1 -0
- package/dist/parse/worker-pool.js +97 -0
- package/dist/parse/worker-pool.js.map +1 -0
- package/dist/pipeline/dep-parsers/go.d.ts +25 -0
- package/dist/pipeline/dep-parsers/go.d.ts.map +1 -0
- package/dist/pipeline/dep-parsers/go.js +146 -0
- package/dist/pipeline/dep-parsers/go.js.map +1 -0
- package/dist/pipeline/dep-parsers/index.d.ts +17 -0
- package/dist/pipeline/dep-parsers/index.d.ts.map +1 -0
- package/dist/pipeline/dep-parsers/index.js +16 -0
- package/dist/pipeline/dep-parsers/index.js.map +1 -0
- package/dist/pipeline/dep-parsers/maven.d.ts +24 -0
- package/dist/pipeline/dep-parsers/maven.d.ts.map +1 -0
- package/dist/pipeline/dep-parsers/maven.js +131 -0
- package/dist/pipeline/dep-parsers/maven.js.map +1 -0
- package/dist/pipeline/dep-parsers/npm.d.ts +30 -0
- package/dist/pipeline/dep-parsers/npm.d.ts.map +1 -0
- package/dist/pipeline/dep-parsers/npm.js +309 -0
- package/dist/pipeline/dep-parsers/npm.js.map +1 -0
- package/dist/pipeline/dep-parsers/nuget.d.ts +24 -0
- package/dist/pipeline/dep-parsers/nuget.d.ts.map +1 -0
- package/dist/pipeline/dep-parsers/nuget.js +178 -0
- package/dist/pipeline/dep-parsers/nuget.js.map +1 -0
- package/dist/pipeline/dep-parsers/python.d.ts +21 -0
- package/dist/pipeline/dep-parsers/python.d.ts.map +1 -0
- package/dist/pipeline/dep-parsers/python.js +369 -0
- package/dist/pipeline/dep-parsers/python.js.map +1 -0
- package/dist/pipeline/dep-parsers/rust.d.ts +18 -0
- package/dist/pipeline/dep-parsers/rust.d.ts.map +1 -0
- package/dist/pipeline/dep-parsers/rust.js +134 -0
- package/dist/pipeline/dep-parsers/rust.js.map +1 -0
- package/dist/pipeline/dep-parsers/spdx-normalize.d.ts +15 -0
- package/dist/pipeline/dep-parsers/spdx-normalize.d.ts.map +1 -0
- package/dist/pipeline/dep-parsers/spdx-normalize.js +31 -0
- package/dist/pipeline/dep-parsers/spdx-normalize.js.map +1 -0
- package/dist/pipeline/dep-parsers/types.d.ts +63 -0
- package/dist/pipeline/dep-parsers/types.d.ts.map +1 -0
- package/dist/pipeline/dep-parsers/types.js +56 -0
- package/dist/pipeline/dep-parsers/types.js.map +1 -0
- package/dist/pipeline/gitignore-stack.d.ts +44 -0
- package/dist/pipeline/gitignore-stack.d.ts.map +1 -0
- package/dist/pipeline/gitignore-stack.js +69 -0
- package/dist/pipeline/gitignore-stack.js.map +1 -0
- package/dist/pipeline/gitignore.d.ts +67 -0
- package/dist/pipeline/gitignore.d.ts.map +1 -0
- package/dist/pipeline/gitignore.js +210 -0
- package/dist/pipeline/gitignore.js.map +1 -0
- package/dist/pipeline/index.d.ts +53 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +29 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/orchestrator.d.ts +105 -0
- package/dist/pipeline/orchestrator.d.ts.map +1 -0
- package/dist/pipeline/orchestrator.js +175 -0
- package/dist/pipeline/orchestrator.js.map +1 -0
- package/dist/pipeline/ownership-helpers/drift.d.ts +41 -0
- package/dist/pipeline/ownership-helpers/drift.d.ts.map +1 -0
- package/dist/pipeline/ownership-helpers/drift.js +122 -0
- package/dist/pipeline/ownership-helpers/drift.js.map +1 -0
- package/dist/pipeline/ownership-helpers/gini-community.d.ts +24 -0
- package/dist/pipeline/ownership-helpers/gini-community.d.ts.map +1 -0
- package/dist/pipeline/ownership-helpers/gini-community.js +32 -0
- package/dist/pipeline/ownership-helpers/gini-community.js.map +1 -0
- package/dist/pipeline/ownership-helpers/git-blame-batcher.d.ts +71 -0
- package/dist/pipeline/ownership-helpers/git-blame-batcher.d.ts.map +1 -0
- package/dist/pipeline/ownership-helpers/git-blame-batcher.js +178 -0
- package/dist/pipeline/ownership-helpers/git-blame-batcher.js.map +1 -0
- package/dist/pipeline/ownership-helpers/line-overlap.d.ts +35 -0
- package/dist/pipeline/ownership-helpers/line-overlap.d.ts.map +1 -0
- package/dist/pipeline/ownership-helpers/line-overlap.js +62 -0
- package/dist/pipeline/ownership-helpers/line-overlap.js.map +1 -0
- package/dist/pipeline/ownership-helpers/orphan.d.ts +73 -0
- package/dist/pipeline/ownership-helpers/orphan.d.ts.map +1 -0
- package/dist/pipeline/ownership-helpers/orphan.js +117 -0
- package/dist/pipeline/ownership-helpers/orphan.js.map +1 -0
- package/dist/pipeline/phases/accesses.d.ts +44 -0
- package/dist/pipeline/phases/accesses.d.ts.map +1 -0
- package/dist/pipeline/phases/accesses.js +194 -0
- package/dist/pipeline/phases/accesses.js.map +1 -0
- package/dist/pipeline/phases/annotate.d.ts +28 -0
- package/dist/pipeline/phases/annotate.d.ts.map +1 -0
- package/dist/pipeline/phases/annotate.js +60 -0
- package/dist/pipeline/phases/annotate.js.map +1 -0
- package/dist/pipeline/phases/cochange.d.ts +42 -0
- package/dist/pipeline/phases/cochange.d.ts.map +1 -0
- package/dist/pipeline/phases/cochange.js +0 -0
- package/dist/pipeline/phases/cochange.js.map +1 -0
- package/dist/pipeline/phases/communities.d.ts +34 -0
- package/dist/pipeline/phases/communities.d.ts.map +1 -0
- package/dist/pipeline/phases/communities.js +412 -0
- package/dist/pipeline/phases/communities.js.map +1 -0
- package/dist/pipeline/phases/complexity.d.ts +50 -0
- package/dist/pipeline/phases/complexity.d.ts.map +1 -0
- package/dist/pipeline/phases/complexity.js +794 -0
- package/dist/pipeline/phases/complexity.js.map +1 -0
- package/dist/pipeline/phases/confidence-demote.d.ts +23 -0
- package/dist/pipeline/phases/confidence-demote.d.ts.map +1 -0
- package/dist/pipeline/phases/confidence-demote.js +113 -0
- package/dist/pipeline/phases/confidence-demote.js.map +1 -0
- package/dist/pipeline/phases/content-cache.d.ts +166 -0
- package/dist/pipeline/phases/content-cache.d.ts.map +1 -0
- package/dist/pipeline/phases/content-cache.js +323 -0
- package/dist/pipeline/phases/content-cache.js.map +1 -0
- package/dist/pipeline/phases/coverage-parsers/cobertura.d.ts +25 -0
- package/dist/pipeline/phases/coverage-parsers/cobertura.d.ts.map +1 -0
- package/dist/pipeline/phases/coverage-parsers/cobertura.js +139 -0
- package/dist/pipeline/phases/coverage-parsers/cobertura.js.map +1 -0
- package/dist/pipeline/phases/coverage-parsers/coverage-py.d.ts +25 -0
- package/dist/pipeline/phases/coverage-parsers/coverage-py.d.ts.map +1 -0
- package/dist/pipeline/phases/coverage-parsers/coverage-py.js +51 -0
- package/dist/pipeline/phases/coverage-parsers/coverage-py.js.map +1 -0
- package/dist/pipeline/phases/coverage-parsers/jacoco.d.ts +32 -0
- package/dist/pipeline/phases/coverage-parsers/jacoco.d.ts.map +1 -0
- package/dist/pipeline/phases/coverage-parsers/jacoco.js +98 -0
- package/dist/pipeline/phases/coverage-parsers/jacoco.js.map +1 -0
- package/dist/pipeline/phases/coverage-parsers/lcov.d.ts +21 -0
- package/dist/pipeline/phases/coverage-parsers/lcov.d.ts.map +1 -0
- package/dist/pipeline/phases/coverage-parsers/lcov.js +104 -0
- package/dist/pipeline/phases/coverage-parsers/lcov.js.map +1 -0
- package/dist/pipeline/phases/coverage-parsers/types.d.ts +27 -0
- package/dist/pipeline/phases/coverage-parsers/types.d.ts.map +1 -0
- package/dist/pipeline/phases/coverage-parsers/types.js +39 -0
- package/dist/pipeline/phases/coverage-parsers/types.js.map +1 -0
- package/dist/pipeline/phases/coverage.d.ts +39 -0
- package/dist/pipeline/phases/coverage.d.ts.map +1 -0
- package/dist/pipeline/phases/coverage.js +154 -0
- package/dist/pipeline/phases/coverage.js.map +1 -0
- package/dist/pipeline/phases/cross-file.d.ts +40 -0
- package/dist/pipeline/phases/cross-file.d.ts.map +1 -0
- package/dist/pipeline/phases/cross-file.js +411 -0
- package/dist/pipeline/phases/cross-file.js.map +1 -0
- package/dist/pipeline/phases/dead-code.d.ts +28 -0
- package/dist/pipeline/phases/dead-code.d.ts.map +1 -0
- package/dist/pipeline/phases/dead-code.js +157 -0
- package/dist/pipeline/phases/dead-code.js.map +1 -0
- package/dist/pipeline/phases/default-set.d.ts +24 -0
- package/dist/pipeline/phases/default-set.d.ts.map +1 -0
- package/dist/pipeline/phases/default-set.js +133 -0
- package/dist/pipeline/phases/default-set.js.map +1 -0
- package/dist/pipeline/phases/dependencies.d.ts +59 -0
- package/dist/pipeline/phases/dependencies.d.ts.map +1 -0
- package/dist/pipeline/phases/dependencies.js +281 -0
- package/dist/pipeline/phases/dependencies.js.map +1 -0
- package/dist/pipeline/phases/embedder-pool.d.ts +31 -0
- package/dist/pipeline/phases/embedder-pool.d.ts.map +1 -0
- package/dist/pipeline/phases/embedder-pool.js +79 -0
- package/dist/pipeline/phases/embedder-pool.js.map +1 -0
- package/dist/pipeline/phases/embedder-worker.d.ts +28 -0
- package/dist/pipeline/phases/embedder-worker.d.ts.map +1 -0
- package/dist/pipeline/phases/embedder-worker.js +43 -0
- package/dist/pipeline/phases/embedder-worker.js.map +1 -0
- package/dist/pipeline/phases/embeddings.d.ts +117 -0
- package/dist/pipeline/phases/embeddings.d.ts.map +1 -0
- package/dist/pipeline/phases/embeddings.js +697 -0
- package/dist/pipeline/phases/embeddings.js.map +1 -0
- package/dist/pipeline/phases/fetches.d.ts +47 -0
- package/dist/pipeline/phases/fetches.d.ts.map +1 -0
- package/dist/pipeline/phases/fetches.js +207 -0
- package/dist/pipeline/phases/fetches.js.map +1 -0
- package/dist/pipeline/phases/incremental-helper.d.ts +96 -0
- package/dist/pipeline/phases/incremental-helper.d.ts.map +1 -0
- package/dist/pipeline/phases/incremental-helper.js +125 -0
- package/dist/pipeline/phases/incremental-helper.js.map +1 -0
- package/dist/pipeline/phases/incremental-scope.d.ts +67 -0
- package/dist/pipeline/phases/incremental-scope.d.ts.map +1 -0
- package/dist/pipeline/phases/incremental-scope.js +225 -0
- package/dist/pipeline/phases/incremental-scope.js.map +1 -0
- package/dist/pipeline/phases/markdown.d.ts +29 -0
- package/dist/pipeline/phases/markdown.d.ts.map +1 -0
- package/dist/pipeline/phases/markdown.js +298 -0
- package/dist/pipeline/phases/markdown.js.map +1 -0
- package/dist/pipeline/phases/mro.d.ts +24 -0
- package/dist/pipeline/phases/mro.d.ts.map +1 -0
- package/dist/pipeline/phases/mro.js +303 -0
- package/dist/pipeline/phases/mro.js.map +1 -0
- package/dist/pipeline/phases/openapi.d.ts +52 -0
- package/dist/pipeline/phases/openapi.d.ts.map +1 -0
- package/dist/pipeline/phases/openapi.js +285 -0
- package/dist/pipeline/phases/openapi.js.map +1 -0
- package/dist/pipeline/phases/orm.d.ts +26 -0
- package/dist/pipeline/phases/orm.d.ts.map +1 -0
- package/dist/pipeline/phases/orm.js +183 -0
- package/dist/pipeline/phases/orm.js.map +1 -0
- package/dist/pipeline/phases/ownership.d.ts +88 -0
- package/dist/pipeline/phases/ownership.d.ts.map +1 -0
- package/dist/pipeline/phases/ownership.js +479 -0
- package/dist/pipeline/phases/ownership.js.map +1 -0
- package/dist/pipeline/phases/parse.d.ts +63 -0
- package/dist/pipeline/phases/parse.d.ts.map +1 -0
- package/dist/pipeline/phases/parse.js +994 -0
- package/dist/pipeline/phases/parse.js.map +1 -0
- package/dist/pipeline/phases/processes.d.ts +47 -0
- package/dist/pipeline/phases/processes.d.ts.map +1 -0
- package/dist/pipeline/phases/processes.js +620 -0
- package/dist/pipeline/phases/processes.js.map +1 -0
- package/dist/pipeline/phases/profile.d.ts +33 -0
- package/dist/pipeline/phases/profile.d.ts.map +1 -0
- package/dist/pipeline/phases/profile.js +91 -0
- package/dist/pipeline/phases/profile.js.map +1 -0
- package/dist/pipeline/phases/repo-node.d.ts +112 -0
- package/dist/pipeline/phases/repo-node.d.ts.map +1 -0
- package/dist/pipeline/phases/repo-node.js +272 -0
- package/dist/pipeline/phases/repo-node.js.map +1 -0
- package/dist/pipeline/phases/risk-snapshot.d.ts +34 -0
- package/dist/pipeline/phases/risk-snapshot.d.ts.map +1 -0
- package/dist/pipeline/phases/risk-snapshot.js +63 -0
- package/dist/pipeline/phases/risk-snapshot.js.map +1 -0
- package/dist/pipeline/phases/routes.d.ts +31 -0
- package/dist/pipeline/phases/routes.d.ts.map +1 -0
- package/dist/pipeline/phases/routes.js +262 -0
- package/dist/pipeline/phases/routes.js.map +1 -0
- package/dist/pipeline/phases/sbom.d.ts +45 -0
- package/dist/pipeline/phases/sbom.d.ts.map +1 -0
- package/dist/pipeline/phases/sbom.js +289 -0
- package/dist/pipeline/phases/sbom.js.map +1 -0
- package/dist/pipeline/phases/scan.d.ts +54 -0
- package/dist/pipeline/phases/scan.d.ts.map +1 -0
- package/dist/pipeline/phases/scan.js +340 -0
- package/dist/pipeline/phases/scan.js.map +1 -0
- package/dist/pipeline/phases/scip-index.d.ts +54 -0
- package/dist/pipeline/phases/scip-index.d.ts.map +1 -0
- package/dist/pipeline/phases/scip-index.js +469 -0
- package/dist/pipeline/phases/scip-index.js.map +1 -0
- package/dist/pipeline/phases/structure.d.ts +21 -0
- package/dist/pipeline/phases/structure.d.ts.map +1 -0
- package/dist/pipeline/phases/structure.js +115 -0
- package/dist/pipeline/phases/structure.js.map +1 -0
- package/dist/pipeline/phases/summarize.d.ts +126 -0
- package/dist/pipeline/phases/summarize.d.ts.map +1 -0
- package/dist/pipeline/phases/summarize.js +401 -0
- package/dist/pipeline/phases/summarize.js.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/branch-divergence.d.ts +42 -0
- package/dist/pipeline/phases/temporal-helpers/branch-divergence.d.ts.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/branch-divergence.js +96 -0
- package/dist/pipeline/phases/temporal-helpers/branch-divergence.js.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/churn-decay.d.ts +22 -0
- package/dist/pipeline/phases/temporal-helpers/churn-decay.d.ts.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/churn-decay.js +32 -0
- package/dist/pipeline/phases/temporal-helpers/churn-decay.js.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/conventional-commits.d.ts +21 -0
- package/dist/pipeline/phases/temporal-helpers/conventional-commits.d.ts.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/conventional-commits.js +37 -0
- package/dist/pipeline/phases/temporal-helpers/conventional-commits.js.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/gini.d.ts +32 -0
- package/dist/pipeline/phases/temporal-helpers/gini.d.ts.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/gini.js +78 -0
- package/dist/pipeline/phases/temporal-helpers/gini.js.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/revert-detect.d.ts +14 -0
- package/dist/pipeline/phases/temporal-helpers/revert-detect.d.ts.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/revert-detect.js +25 -0
- package/dist/pipeline/phases/temporal-helpers/revert-detect.js.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/test-pair.d.ts +18 -0
- package/dist/pipeline/phases/temporal-helpers/test-pair.d.ts.map +1 -0
- package/dist/pipeline/phases/temporal-helpers/test-pair.js +119 -0
- package/dist/pipeline/phases/temporal-helpers/test-pair.js.map +1 -0
- package/dist/pipeline/phases/temporal.d.ts +65 -0
- package/dist/pipeline/phases/temporal.d.ts.map +1 -0
- package/dist/pipeline/phases/temporal.js +621 -0
- package/dist/pipeline/phases/temporal.js.map +1 -0
- package/dist/pipeline/phases/tools.d.ts +21 -0
- package/dist/pipeline/phases/tools.d.ts.map +1 -0
- package/dist/pipeline/phases/tools.js +118 -0
- package/dist/pipeline/phases/tools.js.map +1 -0
- package/dist/pipeline/profile-detectors/api-contracts.d.ts +18 -0
- package/dist/pipeline/profile-detectors/api-contracts.d.ts.map +1 -0
- package/dist/pipeline/profile-detectors/api-contracts.js +78 -0
- package/dist/pipeline/profile-detectors/api-contracts.js.map +1 -0
- package/dist/pipeline/profile-detectors/framework-detector.d.ts +11 -0
- package/dist/pipeline/profile-detectors/framework-detector.d.ts.map +1 -0
- package/dist/pipeline/profile-detectors/framework-detector.js +11 -0
- package/dist/pipeline/profile-detectors/framework-detector.js.map +1 -0
- package/dist/pipeline/profile-detectors/frameworks-catalog.d.ts +7 -0
- package/dist/pipeline/profile-detectors/frameworks-catalog.d.ts.map +1 -0
- package/dist/pipeline/profile-detectors/frameworks-catalog.js +7 -0
- package/dist/pipeline/profile-detectors/frameworks-catalog.js.map +1 -0
- package/dist/pipeline/profile-detectors/frameworks.d.ts +7 -0
- package/dist/pipeline/profile-detectors/frameworks.d.ts.map +1 -0
- package/dist/pipeline/profile-detectors/frameworks.js +7 -0
- package/dist/pipeline/profile-detectors/frameworks.js.map +1 -0
- package/dist/pipeline/profile-detectors/iac.d.ts +22 -0
- package/dist/pipeline/profile-detectors/iac.d.ts.map +1 -0
- package/dist/pipeline/profile-detectors/iac.js +97 -0
- package/dist/pipeline/profile-detectors/iac.js.map +1 -0
- package/dist/pipeline/profile-detectors/languages.d.ts +18 -0
- package/dist/pipeline/profile-detectors/languages.d.ts.map +1 -0
- package/dist/pipeline/profile-detectors/languages.js +60 -0
- package/dist/pipeline/profile-detectors/languages.js.map +1 -0
- package/dist/pipeline/profile-detectors/manifests.d.ts +7 -0
- package/dist/pipeline/profile-detectors/manifests.d.ts.map +1 -0
- package/dist/pipeline/profile-detectors/manifests.js +7 -0
- package/dist/pipeline/profile-detectors/manifests.js.map +1 -0
- package/dist/pipeline/profile-detectors/src-dirs.d.ts +17 -0
- package/dist/pipeline/profile-detectors/src-dirs.d.ts.map +1 -0
- package/dist/pipeline/profile-detectors/src-dirs.js +89 -0
- package/dist/pipeline/profile-detectors/src-dirs.js.map +1 -0
- package/dist/pipeline/profile-detectors/variant-detectors.d.ts +7 -0
- package/dist/pipeline/profile-detectors/variant-detectors.d.ts.map +1 -0
- package/dist/pipeline/profile-detectors/variant-detectors.js +7 -0
- package/dist/pipeline/profile-detectors/variant-detectors.js.map +1 -0
- package/dist/pipeline/runner.d.ts +54 -0
- package/dist/pipeline/runner.d.ts.map +1 -0
- package/dist/pipeline/runner.js +247 -0
- package/dist/pipeline/runner.js.map +1 -0
- package/dist/pipeline/types.d.ts +235 -0
- package/dist/pipeline/types.d.ts.map +1 -0
- package/dist/pipeline/types.js +15 -0
- package/dist/pipeline/types.js.map +1 -0
- package/dist/providers/c.d.ts +3 -0
- package/dist/providers/c.d.ts.map +1 -0
- package/dist/providers/c.js +162 -0
- package/dist/providers/c.js.map +1 -0
- package/dist/providers/cobol.d.ts +19 -0
- package/dist/providers/cobol.d.ts.map +1 -0
- package/dist/providers/cobol.js +44 -0
- package/dist/providers/cobol.js.map +1 -0
- package/dist/providers/cpp.d.ts +3 -0
- package/dist/providers/cpp.d.ts.map +1 -0
- package/dist/providers/cpp.js +200 -0
- package/dist/providers/cpp.js.map +1 -0
- package/dist/providers/csharp.d.ts +3 -0
- package/dist/providers/csharp.d.ts.map +1 -0
- package/dist/providers/csharp.js +292 -0
- package/dist/providers/csharp.js.map +1 -0
- package/dist/providers/dart.d.ts +3 -0
- package/dist/providers/dart.d.ts.map +1 -0
- package/dist/providers/dart.js +214 -0
- package/dist/providers/dart.js.map +1 -0
- package/dist/providers/definition-ids.d.ts +18 -0
- package/dist/providers/definition-ids.d.ts.map +1 -0
- package/dist/providers/definition-ids.js +23 -0
- package/dist/providers/definition-ids.js.map +1 -0
- package/dist/providers/extract-helpers.d.ts +60 -0
- package/dist/providers/extract-helpers.d.ts.map +1 -0
- package/dist/providers/extract-helpers.js +296 -0
- package/dist/providers/extract-helpers.js.map +1 -0
- package/dist/providers/extraction-types.d.ts +85 -0
- package/dist/providers/extraction-types.d.ts.map +1 -0
- package/dist/providers/extraction-types.js +13 -0
- package/dist/providers/extraction-types.js.map +1 -0
- package/dist/providers/go.d.ts +3 -0
- package/dist/providers/go.d.ts.map +1 -0
- package/dist/providers/go.js +359 -0
- package/dist/providers/go.js.map +1 -0
- package/dist/providers/http-detect.d.ts +44 -0
- package/dist/providers/http-detect.d.ts.map +1 -0
- package/dist/providers/http-detect.js +307 -0
- package/dist/providers/http-detect.js.map +1 -0
- package/dist/providers/index.d.ts +38 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +33 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/providers/java.d.ts +3 -0
- package/dist/providers/java.d.ts.map +1 -0
- package/dist/providers/java.js +259 -0
- package/dist/providers/java.js.map +1 -0
- package/dist/providers/javascript.d.ts +3 -0
- package/dist/providers/javascript.d.ts.map +1 -0
- package/dist/providers/javascript.js +139 -0
- package/dist/providers/javascript.js.map +1 -0
- package/dist/providers/kotlin.d.ts +3 -0
- package/dist/providers/kotlin.d.ts.map +1 -0
- package/dist/providers/kotlin.js +175 -0
- package/dist/providers/kotlin.js.map +1 -0
- package/dist/providers/php.d.ts +3 -0
- package/dist/providers/php.d.ts.map +1 -0
- package/dist/providers/php.js +218 -0
- package/dist/providers/php.js.map +1 -0
- package/dist/providers/python-accesses.d.ts +9 -0
- package/dist/providers/python-accesses.d.ts.map +1 -0
- package/dist/providers/python-accesses.js +22 -0
- package/dist/providers/python-accesses.js.map +1 -0
- package/dist/providers/python.d.ts +3 -0
- package/dist/providers/python.d.ts.map +1 -0
- package/dist/providers/python.js +323 -0
- package/dist/providers/python.js.map +1 -0
- package/dist/providers/registry.d.ts +4 -0
- package/dist/providers/registry.d.ts.map +1 -0
- package/dist/providers/registry.js +46 -0
- package/dist/providers/registry.js.map +1 -0
- package/dist/providers/resolution/c3.d.ts +6 -0
- package/dist/providers/resolution/c3.d.ts.map +1 -0
- package/dist/providers/resolution/c3.js +76 -0
- package/dist/providers/resolution/c3.js.map +1 -0
- package/dist/providers/resolution/context.d.ts +38 -0
- package/dist/providers/resolution/context.d.ts.map +1 -0
- package/dist/providers/resolution/context.js +45 -0
- package/dist/providers/resolution/context.js.map +1 -0
- package/dist/providers/resolution/first-wins.d.ts +3 -0
- package/dist/providers/resolution/first-wins.d.ts.map +1 -0
- package/dist/providers/resolution/first-wins.js +27 -0
- package/dist/providers/resolution/first-wins.js.map +1 -0
- package/dist/providers/resolution/mro.d.ts +16 -0
- package/dist/providers/resolution/mro.d.ts.map +1 -0
- package/dist/providers/resolution/mro.js +14 -0
- package/dist/providers/resolution/mro.js.map +1 -0
- package/dist/providers/resolution/none.d.ts +3 -0
- package/dist/providers/resolution/none.d.ts.map +1 -0
- package/dist/providers/resolution/none.js +11 -0
- package/dist/providers/resolution/none.js.map +1 -0
- package/dist/providers/resolution/python-all-filter.d.ts +25 -0
- package/dist/providers/resolution/python-all-filter.d.ts.map +1 -0
- package/dist/providers/resolution/python-all-filter.js +64 -0
- package/dist/providers/resolution/python-all-filter.js.map +1 -0
- package/dist/providers/resolution/resolver-strategy.d.ts +42 -0
- package/dist/providers/resolution/resolver-strategy.d.ts.map +1 -0
- package/dist/providers/resolution/resolver-strategy.js +50 -0
- package/dist/providers/resolution/resolver-strategy.js.map +1 -0
- package/dist/providers/resolution/single-inheritance.d.ts +3 -0
- package/dist/providers/resolution/single-inheritance.d.ts.map +1 -0
- package/dist/providers/resolution/single-inheritance.js +21 -0
- package/dist/providers/resolution/single-inheritance.js.map +1 -0
- package/dist/providers/resolution/stack-graphs/__fixtures__/mock-tree.d.ts +16 -0
- package/dist/providers/resolution/stack-graphs/__fixtures__/mock-tree.d.ts.map +1 -0
- package/dist/providers/resolution/stack-graphs/__fixtures__/mock-tree.js +50 -0
- package/dist/providers/resolution/stack-graphs/__fixtures__/mock-tree.js.map +1 -0
- package/dist/providers/resolution/stack-graphs/glue.d.ts +15 -0
- package/dist/providers/resolution/stack-graphs/glue.d.ts.map +1 -0
- package/dist/providers/resolution/stack-graphs/glue.js +44 -0
- package/dist/providers/resolution/stack-graphs/glue.js.map +1 -0
- package/dist/providers/resolution/stack-graphs/node-edge-builder.d.ts +30 -0
- package/dist/providers/resolution/stack-graphs/node-edge-builder.d.ts.map +1 -0
- package/dist/providers/resolution/stack-graphs/node-edge-builder.js +366 -0
- package/dist/providers/resolution/stack-graphs/node-edge-builder.js.map +1 -0
- package/dist/providers/resolution/stack-graphs/partial-path-engine.d.ts +9 -0
- package/dist/providers/resolution/stack-graphs/partial-path-engine.d.ts.map +1 -0
- package/dist/providers/resolution/stack-graphs/partial-path-engine.js +152 -0
- package/dist/providers/resolution/stack-graphs/partial-path-engine.js.map +1 -0
- package/dist/providers/resolution/stack-graphs/rule-parser.d.ts +11 -0
- package/dist/providers/resolution/stack-graphs/rule-parser.d.ts.map +1 -0
- package/dist/providers/resolution/stack-graphs/rule-parser.js +247 -0
- package/dist/providers/resolution/stack-graphs/rule-parser.js.map +1 -0
- package/dist/providers/resolution/stack-graphs/types.d.ts +93 -0
- package/dist/providers/resolution/stack-graphs/types.d.ts.map +1 -0
- package/dist/providers/resolution/stack-graphs/types.js +11 -0
- package/dist/providers/resolution/stack-graphs/types.js.map +1 -0
- package/dist/providers/resolution/stack-graphs-python.d.ts +27 -0
- package/dist/providers/resolution/stack-graphs-python.d.ts.map +1 -0
- package/dist/providers/resolution/stack-graphs-python.js +104 -0
- package/dist/providers/resolution/stack-graphs-python.js.map +1 -0
- package/dist/providers/resolution/stack-graphs-ts.d.ts +134 -0
- package/dist/providers/resolution/stack-graphs-ts.d.ts.map +1 -0
- package/dist/providers/resolution/stack-graphs-ts.js +372 -0
- package/dist/providers/resolution/stack-graphs-ts.js.map +1 -0
- package/dist/providers/ruby.d.ts +3 -0
- package/dist/providers/ruby.d.ts.map +1 -0
- package/dist/providers/ruby.js +259 -0
- package/dist/providers/ruby.js.map +1 -0
- package/dist/providers/rust.d.ts +3 -0
- package/dist/providers/rust.d.ts.map +1 -0
- package/dist/providers/rust.js +318 -0
- package/dist/providers/rust.js.map +1 -0
- package/dist/providers/swift.d.ts +3 -0
- package/dist/providers/swift.d.ts.map +1 -0
- package/dist/providers/swift.js +177 -0
- package/dist/providers/swift.js.map +1 -0
- package/dist/providers/test-helpers.d.ts +24 -0
- package/dist/providers/test-helpers.d.ts.map +1 -0
- package/dist/providers/test-helpers.js +33 -0
- package/dist/providers/test-helpers.js.map +1 -0
- package/dist/providers/ts-shared.d.ts +30 -0
- package/dist/providers/ts-shared.d.ts.map +1 -0
- package/dist/providers/ts-shared.js +328 -0
- package/dist/providers/ts-shared.js.map +1 -0
- package/dist/providers/tsx.d.ts +7 -0
- package/dist/providers/tsx.d.ts.map +1 -0
- package/dist/providers/tsx.js +79 -0
- package/dist/providers/tsx.js.map +1 -0
- package/dist/providers/types.d.ts +166 -0
- package/dist/providers/types.d.ts.map +1 -0
- package/dist/providers/types.js +7 -0
- package/dist/providers/types.js.map +1 -0
- package/dist/providers/typescript-family-accesses.d.ts +14 -0
- package/dist/providers/typescript-family-accesses.d.ts.map +1 -0
- package/dist/providers/typescript-family-accesses.js +27 -0
- package/dist/providers/typescript-family-accesses.js.map +1 -0
- package/dist/providers/typescript.d.ts +9 -0
- package/dist/providers/typescript.d.ts.map +1 -0
- package/dist/providers/typescript.js +84 -0
- package/dist/providers/typescript.js.map +1 -0
- package/package.json +108 -0
|
@@ -0,0 +1,697 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embeddings phase — generates 768-dim vectors across one or more
|
|
3
|
+
* hierarchical tiers and materialises them into the phase output as an
|
|
4
|
+
* array of `EmbeddingRow`s the CLI upserts into DuckDB.
|
|
5
|
+
*
|
|
6
|
+
* Granularity tiers (P03):
|
|
7
|
+
* - `"symbol"` — one vector per callable/declaration symbol. When a
|
|
8
|
+
* `SymbolSummaryRow` exists for the node the text is fused
|
|
9
|
+
* `signature\nsummary\nbody`; otherwise we fall back to the raw
|
|
10
|
+
* signature/description pair.
|
|
11
|
+
* - `"file"` — one vector per scanned file. Coarse tier used by the
|
|
12
|
+
* `--zoom` retrieval path. Files larger than ~8192 tokens are
|
|
13
|
+
* truncated to the first `N` chars so a single outlier never blows
|
|
14
|
+
* up batch latency.
|
|
15
|
+
* - `"community"` — one vector per Community node. Architectural tier
|
|
16
|
+
* used to answer "which subsystem handles X?" queries. Text is
|
|
17
|
+
* `inferredLabel\nkeywords…\ntop_symbols…`.
|
|
18
|
+
*
|
|
19
|
+
* Contract:
|
|
20
|
+
* - `options.embeddings !== true` → phase is a silent no-op.
|
|
21
|
+
* - Weights missing (EMBEDDER_NOT_SETUP) → emit a warning via the
|
|
22
|
+
* progress callback and return zeroes. NEVER aborts the pipeline.
|
|
23
|
+
* - Default `granularity = ["symbol"]` preserves v1.0 behaviour; callers
|
|
24
|
+
* opt in to hierarchical tiers explicitly.
|
|
25
|
+
*
|
|
26
|
+
* Determinism:
|
|
27
|
+
* - Rows are sorted by (granularity, node_id, chunk_index).
|
|
28
|
+
* `embeddingsHash` hashes the canonical representation so downstream
|
|
29
|
+
* callers can assert byte-level stability across runs. The hash is
|
|
30
|
+
* returned in the phase output but is intentionally not folded into
|
|
31
|
+
* graphHash.
|
|
32
|
+
*/
|
|
33
|
+
import { createHash } from "node:crypto";
|
|
34
|
+
import { readFileSync } from "node:fs";
|
|
35
|
+
import path from "node:path";
|
|
36
|
+
import { EmbedderNotSetupError, openOnnxEmbedder, tryOpenHttpEmbedder, } from "@opencodehub/embedder";
|
|
37
|
+
import { ANNOTATE_PHASE_NAME } from "./annotate.js";
|
|
38
|
+
import { COMMUNITIES_PHASE_NAME } from "./communities.js";
|
|
39
|
+
import { openOnnxEmbedderPool } from "./embedder-pool.js";
|
|
40
|
+
import { SCAN_PHASE_NAME } from "./scan.js";
|
|
41
|
+
import { SUMMARIZE_PHASE_NAME } from "./summarize.js";
|
|
42
|
+
/**
 * Number of texts handed to a single `embedBatch()` call when the caller
 * does not provide `options.embeddingsBatchSize`. The value is a trade-off:
 * 32 inputs of roughly 500 tokens each keep the tensor feed small while
 * still amortising per-call overhead across the batch.
 */
const DEFAULT_EMBEDDING_BATCH_SIZE = 32;
/** Name this phase reports in progress events and its phase descriptor. */
export const EMBEDDER_PHASE_NAME = "embeddings";
/**
 * Key on `ctx.options` under which an embedding hash-cache adapter may be
 * attached. Exported as a constant so the writer and the probe site share
 * one spelling.
 */
export const EMBEDDING_HASH_CACHE_OPTIONS_KEY = "__embeddingHashCache";
|
|
59
|
+
/**
 * Fetch the optional hash-cache adapter from `ctx.options`.
 *
 * @param ctx Phase context; only `ctx.options` is read.
 * @returns The value stored under `EMBEDDING_HASH_CACHE_OPTIONS_KEY` when it
 *   is a non-null object exposing a `list` function, otherwise `undefined`.
 */
function resolveEmbeddingHashCacheAdapter(ctx) {
    const candidate = ctx.options[EMBEDDING_HASH_CACHE_OPTIONS_KEY];
    // `typeof null` is "object", so null needs its own check.
    const isObject = typeof candidate === "object" && candidate !== null;
    if (!isObject) {
        return undefined;
    }
    return typeof candidate.list === "function" ? candidate : undefined;
}
|
|
69
|
+
/**
 * Build the composite lookup key for the prior-hash map.
 *
 * The three parts are joined with NUL (`\0`) rather than `:` because `:`
 * can appear inside node ids.
 *
 * @param granularity Tier name (for example `"symbol"`).
 * @param nodeId Graph node id.
 * @param chunkIndex Zero-based chunk position within the node.
 * @returns `<granularity>\0<nodeId>\0<chunkIndex>`.
 */
function priorHashKey(granularity, nodeId, chunkIndex) {
    return "".concat(granularity, "\0", nodeId, "\0", chunkIndex);
}
|
|
77
|
+
/** Graph node kinds that receive a symbol-tier embedding. */
const EMBEDDABLE_KINDS = new Set(["Function", "Method", "Constructor", "Route", "Tool", "Class", "Interface"]);
|
|
87
|
+
/**
 * Upper bound, in characters, on the source body fused into a
 * summary-backed symbol text. The chunker still splits anything that
 * overflows, so this cap is only a first line of defence.
 */
const SYMBOL_BODY_CHAR_CAP = 1200;
/**
 * Upper bound, in characters, on file contents considered by the file tier:
 * 8192 tokens at an assumed ~4 characters per token. Longer files are cut
 * to this prefix before chunking.
 */
const FILE_CHAR_CAP = 4 * 8192;
|
|
102
|
+
/**
 * Lower-cased file extensions (with leading dot) that are eligible for a
 * file-tier embedding. Anything not listed is skipped by the file tier.
 */
const EMBEDDABLE_FILE_EXTS = new Set([
    // JavaScript / TypeScript family
    ".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs",
    // Other source languages
    ".py", ".go", ".rs", ".java", ".kt", ".rb", ".php", ".cs", ".swift",
    // Documentation
    ".md", ".mdx",
]);
|
|
128
|
+
/**
 * Build the phase output used when nothing was embedded: all counters are
 * zero, `rows` is empty, `ranEmbedder` is false, and `embeddingsHash` is
 * the hash of an empty row list.
 */
function emptyOutput() {
    const zeroCounters = { embeddingsInserted: 0, symbolsSkipped: 0, chunksTotal: 0 };
    return {
        ...zeroCounters,
        embeddingsModelId: "",
        embeddingsHash: hashRows([]),
        rows: [],
        ranEmbedder: false,
        byGranularity: { symbol: 0, file: 0, community: 0 },
        summaryFused: false,
        chunksSkipped: 0,
    };
}
|
|
142
|
+
/**
 * Assemble the text that gets embedded for one symbol.
 *
 * - Without a summary: `<signature or name>` followed, when present, by
 *   `\n<description>`.
 * - With a summary: `<signatureSummary or signature or name>\n<summaryText>`
 *   and, when a non-empty body was supplied, `\n<body>` with the body cut
 *   to `SYMBOL_BODY_CHAR_CAP` characters.
 *
 * @param node Symbol node; reads `signature`, `name` and `description`.
 * @param summary Optional summary entry (`summaryText`, `signatureSummary`).
 * @param body Optional source text of the symbol.
 * @returns The fused text.
 */
function symbolText(node, summary, body) {
    const hasSignature = node.signature !== undefined && node.signature.length > 0;
    const head = hasSignature ? node.signature : node.name;
    if (summary === undefined) {
        const description = node.description ?? "";
        if (description.length > 0) {
            return `${head}\n${description}`;
        }
        return head;
    }
    let firstLine = head;
    if (summary.signatureSummary !== undefined && summary.signatureSummary.length > 0) {
        firstLine = summary.signatureSummary;
    }
    const pieces = [firstLine, summary.summaryText];
    if (body !== undefined && body.length > 0) {
        pieces.push(body.length > SYMBOL_BODY_CHAR_CAP ? body.slice(0, SYMBOL_BODY_CHAR_CAP) : body);
    }
    return pieces.join("\n");
}
|
|
168
|
+
/**
 * Greedy text splitter used when one input exceeds the embedder's token
 * budget.
 *
 * The budget is approximated as `maxChars = max(tokens * 4, 64)`. Text that
 * already fits is returned as a single chunk. Otherwise lines are packed
 * into chunks of at most `maxChars` characters (joined with `\n`), and any
 * single line longer than `maxChars` is cut into fixed-width slices.
 *
 * Empty chunks are never emitted on the splitting path: a pending buffer is
 * flushed only when it holds text. (Previously a line of exactly `maxChars`
 * characters arriving on an empty buffer pushed `""` as a chunk.)
 *
 * @param text Text to split.
 * @param tokens Approximate token budget per chunk.
 * @returns Chunks in source order; may be empty when the oversized text
 *   consists only of blank lines.
 */
function splitIntoChunks(text, tokens) {
    const maxChars = Math.max(tokens * 4, 64);
    if (text.length <= maxChars) {
        return [text];
    }
    const chunks = [];
    let buf = "";
    // Emit the pending buffer, but only if it actually holds text.
    const flush = () => {
        if (buf.length > 0) {
            chunks.push(buf);
            buf = "";
        }
    };
    for (const line of text.split("\n")) {
        if (line.length > maxChars) {
            // Oversized line: flush what we have, then slice at fixed width.
            flush();
            for (let i = 0; i < line.length; i += maxChars) {
                chunks.push(line.slice(i, i + maxChars));
            }
            continue;
        }
        // +1 accounts for the "\n" that would join buf and line.
        if (buf.length + line.length + 1 > maxChars) {
            flush();
            buf = line;
        }
        else {
            buf = buf.length > 0 ? `${buf}\n${line}` : line;
        }
    }
    flush();
    return chunks;
}
|
|
210
|
+
/**
 * Compute a sha256 digest over a canonical ordering of embedding rows.
 *
 * Rows are ordered by (granularity, nodeId, chunkIndex), with a missing
 * granularity treated as `"symbol"`. Each row contributes, in order:
 * granularity, nodeId, the decimal chunkIndex, the raw bytes of `vector`,
 * and contentHash, each followed by a NUL separator. The input array is
 * not mutated.
 *
 * NOTE(review): the vector bytes are hashed in platform byte order; the
 * original comment states every shipped platform is little-endian.
 *
 * @param rows Embedding rows (`nodeId`, `chunkIndex`, `vector`,
 *   `contentHash`, optional `granularity`).
 * @returns Hex-encoded sha256 digest.
 */
function hashRows(rows) {
    const tierOf = (row) => row.granularity ?? "symbol";
    const ordered = Array.from(rows).sort((left, right) => {
        const leftTier = tierOf(left);
        const rightTier = tierOf(right);
        if (leftTier !== rightTier) {
            return leftTier < rightTier ? -1 : 1;
        }
        if (left.nodeId !== right.nodeId) {
            return left.nodeId < right.nodeId ? -1 : 1;
        }
        return left.chunkIndex - right.chunkIndex;
    });
    const NUL = "\0";
    const digest = createHash("sha256");
    for (const row of ordered) {
        const { vector } = row;
        // View exactly the bytes this typed array covers, honouring byteOffset.
        const vectorBytes = new Uint8Array(vector.buffer, vector.byteOffset, vector.byteLength);
        digest
            .update(tierOf(row), "utf8").update(NUL)
            .update(row.nodeId, "utf8").update(NUL)
            .update(String(row.chunkIndex)).update(NUL)
            .update(vectorBytes).update(NUL)
            .update(row.contentHash, "utf8").update(NUL);
    }
    return digest.digest("hex");
}
|
|
246
|
+
/**
 * Content hash for one chunk: sha256 over `<granularity>\0<text>`.
 *
 * Including the tier name means identical text embedded at two tiers
 * yields two distinct hashes.
 *
 * @param granularity Tier name.
 * @param text Chunk text.
 * @returns Hex-encoded sha256 digest.
 */
function hashText(granularity, text) {
    return createHash("sha256")
        .update(granularity, "utf8")
        .update("\0")
        .update(text, "utf8")
        .digest("hex");
}
|
|
260
|
+
/**
 * Type guard for symbol-tier candidates.
 *
 * @param node Any graph node value.
 * @returns `true` when `node` is a non-null object whose `id`, `name`,
 *   `kind` and `filePath` are strings and whose `kind` is listed in
 *   `EMBEDDABLE_KINDS`.
 */
function isEmbeddableSymbol(node) {
    if (node === null || typeof node !== "object") {
        return false;
    }
    const requiredStringFields = ["id", "name", "kind", "filePath"];
    const hasAllStrings = requiredStringFields.every((field) => typeof node[field] === "string");
    return hasAllStrings && EMBEDDABLE_KINDS.has(node["kind"]);
}
|
|
270
|
+
/**
 * Type guard for File nodes.
 *
 * @param node Any graph node value.
 * @returns `true` when `node` is a non-null object with string `id`,
 *   `filePath` and `name`, and `kind === "File"`.
 */
function isFileNode(node) {
    if (node === null || typeof node !== "object") {
        return false;
    }
    if (typeof node["id"] !== "string" || node["kind"] !== "File") {
        return false;
    }
    return typeof node["filePath"] === "string" && typeof node["name"] === "string";
}
|
|
279
|
+
/**
 * Type guard for Community nodes.
 *
 * @param node Any graph node value.
 * @returns `true` when `node` is a non-null object with string `id` and
 *   `name`, and `kind === "Community"`.
 */
function isCommunityNode(node) {
    if (node === null || typeof node !== "object") {
        return false;
    }
    if (typeof node["id"] !== "string") {
        return false;
    }
    if (node["kind"] !== "Community") {
        return false;
    }
    return typeof node["name"] === "string";
}
|
|
285
|
+
/**
 * Normalise the requested tier list.
 *
 * An absent or empty request defaults to `["symbol"]`. Otherwise duplicates
 * are removed while keeping first-seen order; a new array is returned and
 * the input is not modified. This function does not reorder tiers.
 *
 * @param requested Tier names supplied by the caller, or `undefined`.
 * @returns De-duplicated tier names.
 */
function normalizeGranularities(requested) {
    const nothingRequested = requested === undefined || requested.length === 0;
    if (nothingRequested) {
        return ["symbol"];
    }
    // A Set iterates in insertion order, so first-seen order is kept.
    return [...new Set(requested)];
}
|
|
303
|
+
/**
 * Read lines `startLine`..`endLine` (1-based, inclusive) of a source file.
 *
 * `filePath` is used as-is when absolute, otherwise resolved against
 * `repoPath`. The file is split on `\n` or `\r\n`, the range is clamped to
 * the file's bounds, and the selected lines are re-joined with `\n`.
 *
 * @returns The selected text, or `undefined` when the clamped range is
 *   empty or anything throws (missing file, permissions, ...). Errors are
 *   swallowed on purpose so a bad file never aborts the phase.
 */
function readSourceSpan(repoPath, filePath, startLine, endLine) {
    try {
        const target = path.isAbsolute(filePath) ? filePath : path.join(repoPath, filePath);
        const fileLines = readFileSync(target, "utf-8").split(/\r?\n/);
        // Convert the 1-based inclusive range into a half-open slice range.
        const firstIdx = Math.max(0, startLine - 1);
        const endIdx = Math.min(fileLines.length, endLine);
        return endIdx <= firstIdx ? undefined : fileLines.slice(firstIdx, endIdx).join("\n");
    }
    catch {
        return undefined;
    }
}
|
|
324
|
+
/**
 * Read an entire file as UTF-8 text.
 *
 * `relPath` is used as-is when absolute, otherwise resolved against
 * `repoPath`.
 *
 * @returns The file contents, or `undefined` if anything throws. Errors are
 *   swallowed on purpose so an unreadable file never aborts the phase.
 */
function readFileWhole(repoPath, relPath) {
    try {
        let target = relPath;
        if (!path.isAbsolute(relPath)) {
            target = path.join(repoPath, relPath);
        }
        return readFileSync(target, "utf-8");
    }
    catch {
        return undefined;
    }
}
|
|
333
|
+
/**
 * Coerce a numeric option to an integer >= 1.
 *
 * `Math.max(1, NaN)` is `NaN`, so a NaN option used to leak into the
 * dispatch loop bounds and silently embed nothing. NaN now falls back to
 * `fallback`; every other input keeps the previous
 * `Math.max(1, Math.floor(value ?? fallback))` result.
 */
function positiveIntOption(value, fallback) {
    const floored = Math.floor(value ?? fallback);
    return Number.isNaN(floored) ? fallback : Math.max(1, floored);
}
/**
 * Run the embeddings phase.
 *
 * Steps:
 *  1. Return `emptyOutput()` unless `ctx.options.embeddings === true`.
 *  2. Open an embedder: the HTTP embedder when `tryOpenHttpEmbedder` yields
 *     one, otherwise the local ONNX embedder (a worker pool when
 *     `embeddingsWorkers > 1`). `EmbedderNotSetupError` is reported as a
 *     `warn` progress event and yields `emptyOutput()`; any other open
 *     error is rethrown.
 *  3. Collect embedding jobs for each requested tier (symbol, file,
 *     community), skipping chunks whose content hash matches the prior-hash
 *     map.
 *  4. Embed the jobs in waves of `workers` batches of `batchSize` texts and
 *     turn each returned vector into a row.
 *  5. Always close the embedder.
 *
 * @param ctx Phase context. Reads `options`, `graph`, `phaseOutputs`,
 *   `repoPath` and the optional `onProgress` callback.
 * @returns The phase output: rows plus counters, model id and rows hash.
 */
async function runEmbeddings(ctx) {
    // 1. Flag gate. Silent no-op when disabled.
    if (ctx.options.embeddings !== true) {
        return emptyOutput();
    }
    const tiers = normalizeGranularities(ctx.options.embeddingsGranularity);
    // 2. Open embedder. `workers <= 1` keeps the in-process embedder;
    //    values >= 2 use the ONNX worker pool. The HTTP embedder is opened
    //    without the worker count.
    const workers = positiveIntOption(ctx.options.embeddingsWorkers, 1);
    const batchSize = positiveIntOption(ctx.options.embeddingsBatchSize, DEFAULT_EMBEDDING_BATCH_SIZE);
    let embedder;
    try {
        // The offline flag is forwarded so the HTTP path can refuse to open;
        // anything it throws is handled by the catch below.
        const httpEmbedder = await tryOpenHttpEmbedder({ offline: ctx.options.offline === true });
        if (httpEmbedder !== null) {
            embedder = httpEmbedder;
        }
        else {
            const variant = ctx.options.embeddingsVariant ?? "fp32";
            const cfg = { variant };
            if (ctx.options.embeddingsModelDir !== undefined) {
                cfg.modelDir = ctx.options.embeddingsModelDir;
            }
            if (workers > 1) {
                // Weight canary: open (and immediately close) a main-thread
                // embedder so a missing-weights error is raised here, with
                // its class identity intact for the `instanceof` check below.
                const canary = await openOnnxEmbedder(cfg);
                await canary.close();
                embedder = openOnnxEmbedderPool({ workers, ...cfg });
            }
            else {
                embedder = await openOnnxEmbedder(cfg);
            }
        }
    }
    catch (err) {
        if (err instanceof EmbedderNotSetupError) {
            // Missing weights degrade gracefully: warn and return zeroes.
            ctx.onProgress?.({
                phase: EMBEDDER_PHASE_NAME,
                kind: "warn",
                message: "embeddings phase skipped: weights not installed. " +
                    "Run `codehub setup --embeddings` while online, or set " +
                    "CODEHUB_EMBEDDING_URL to use a remote OpenAI-compatible endpoint.",
            });
            return emptyOutput();
        }
        throw err;
    }
    try {
        const rows = [];
        let skipped = 0;
        let chunksTotal = 0;
        let chunksSkipped = 0;
        let summaryFused = false;
        const byGranularity = {
            symbol: 0,
            file: 0,
            community: 0,
        };
        // Prior-hash map, keyed by `priorHashKey(...)`. Empty under
        // `force: true` or when no adapter is attached, in which case every
        // chunk is embedded.
        const forceFlag = ctx.options.force === true;
        const hashCache = resolveEmbeddingHashCacheAdapter(ctx);
        const priorHashes = forceFlag || hashCache === undefined ? new Map() : await hashCache.list();
        // Chunker budget, kept slightly below the embedder's input cap.
        const maxUserTokens = 500;
        // Summaries by nodeId, taken from the summarize phase output when it
        // exists. A later row for the same node overwrites an earlier one.
        const summarizeOut = ctx.phaseOutputs.get(SUMMARIZE_PHASE_NAME);
        const summaryByNode = new Map();
        if (summarizeOut !== undefined && summarizeOut.rows.length > 0) {
            for (const s of summarizeOut.rows) {
                const entry = {
                    summaryText: s.summaryText,
                };
                if (s.signatureSummary !== undefined) {
                    entry.signatureSummary = s.signatureSummary;
                }
                summaryByNode.set(s.nodeId, entry);
            }
        }
        const jobs = [];
        // ---- Symbol tier ---------------------------------------------------
        if (tiers.includes("symbol")) {
            const eligible = [];
            for (const n of ctx.graph.nodes()) {
                if (isEmbeddableSymbol(n)) {
                    eligible.push(n);
                }
            }
            // Sort by id so job order does not depend on graph iteration order.
            eligible.sort((a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));
            for (const node of eligible) {
                const summary = summaryByNode.get(node.id);
                let body;
                // The source body is only read when a summary exists to fuse with.
                if (summary !== undefined &&
                    node.startLine !== undefined &&
                    node.endLine !== undefined &&
                    node.filePath.length > 0) {
                    body = readSourceSpan(ctx.repoPath, node.filePath, node.startLine, node.endLine);
                }
                const text = symbolText(node, summary, body);
                if (text.length === 0) {
                    skipped += 1;
                    continue;
                }
                if (summary !== undefined) {
                    summaryFused = true;
                }
                const chunks = splitIntoChunks(text, maxUserTokens);
                if (chunks.length === 0) {
                    skipped += 1;
                    continue;
                }
                chunksTotal += chunks.length;
                // Content-hash skip. A symbol may span several chunks; the
                // node is skipped only when every fresh chunk hash matches
                // its prior row AND no prior chunk exists beyond the fresh
                // ones. Otherwise the whole node is re-embedded.
                const freshHashes = chunks.map((ch) => hashText("symbol", ch));
                // A prior row at index `chunks.length` means the symbol used
                // to produce more chunks than it does now; skipping would
                // treat that stale tail as current.
                // NOTE(review): removing the stale row itself is up to the
                // storage layer, which is not visible here.
                const hasStaleTail = priorHashes.get(priorHashKey("symbol", node.id, chunks.length)) !== undefined;
                const allMatch = priorHashes.size > 0 &&
                    !hasStaleTail &&
                    chunks.every((_chunk, i) => {
                        const fresh = freshHashes[i];
                        if (fresh === undefined) {
                            return false;
                        }
                        return priorHashes.get(priorHashKey("symbol", node.id, i)) === fresh;
                    });
                if (allMatch) {
                    chunksSkipped += chunks.length;
                    continue;
                }
                for (let i = 0; i < chunks.length; i++) {
                    const chunkText = chunks[i] ?? "";
                    const contentHash = freshHashes[i] ?? hashText("symbol", chunkText);
                    const chunkIndex = i;
                    jobs.push({
                        granularity: "symbol",
                        text: chunkText,
                        emitRow: (vector) => ({
                            nodeId: node.id,
                            granularity: "symbol",
                            chunkIndex,
                            ...(node.startLine !== undefined ? { startLine: node.startLine } : {}),
                            ...(node.endLine !== undefined ? { endLine: node.endLine } : {}),
                            vector,
                            contentHash,
                        }),
                    });
                }
            }
        }
        // ---- File tier -----------------------------------------------------
        if (tiers.includes("file")) {
            const scan = ctx.phaseOutputs.get(SCAN_PHASE_NAME);
            const fileNodeByPath = new Map();
            for (const n of ctx.graph.nodes()) {
                if (isFileNode(n)) {
                    fileNodeByPath.set(n.filePath, n);
                }
            }
            const scanFiles = scan ? [...scan.files] : [];
            scanFiles.sort((a, b) => (a.relPath < b.relPath ? -1 : a.relPath > b.relPath ? 1 : 0));
            for (const f of scanFiles) {
                const ext = path.extname(f.relPath).toLowerCase();
                if (!EMBEDDABLE_FILE_EXTS.has(ext)) {
                    continue;
                }
                const fileNode = fileNodeByPath.get(f.relPath);
                if (fileNode === undefined) {
                    continue;
                }
                const raw = readFileWhole(ctx.repoPath, f.relPath);
                if (raw === undefined || raw.length === 0) {
                    skipped += 1;
                    continue;
                }
                const truncated = raw.length > FILE_CHAR_CAP ? raw.slice(0, FILE_CHAR_CAP) : raw;
                const chunks = splitIntoChunks(truncated, maxUserTokens);
                // Single-chunk tier: only the first chunk is embedded.
                const firstChunk = chunks[0];
                if (firstChunk === undefined) {
                    skipped += 1;
                    continue;
                }
                chunksTotal += 1;
                // Content-hash skip: bail before queuing work when the prior
                // row's hash equals the fresh hash.
                const contentHash = hashText("file", firstChunk);
                if (priorHashes.size > 0 &&
                    priorHashes.get(priorHashKey("file", fileNode.id, 0)) === contentHash) {
                    chunksSkipped += 1;
                    continue;
                }
                jobs.push({
                    granularity: "file",
                    text: firstChunk,
                    emitRow: (vector) => ({
                        nodeId: fileNode.id,
                        granularity: "file",
                        chunkIndex: 0,
                        vector,
                        contentHash,
                    }),
                });
            }
        }
        // ---- Community tier -----------------------------------------------
        if (tiers.includes("community")) {
            const membersByCommunity = new Map();
            const nameById = new Map();
            for (const n of ctx.graph.nodes()) {
                const nn = n;
                if (typeof nn.id === "string" && typeof nn.name === "string") {
                    nameById.set(nn.id, nn.name);
                }
            }
            // Group member ids under the community their MEMBER_OF edge points to.
            for (const e of ctx.graph.edges()) {
                if (e.type !== "MEMBER_OF") {
                    continue;
                }
                const to = e.to;
                const arr = membersByCommunity.get(to);
                if (arr !== undefined) {
                    arr.push(e.from);
                }
                else {
                    membersByCommunity.set(to, [e.from]);
                }
            }
            const communities = [];
            for (const n of ctx.graph.nodes()) {
                if (isCommunityNode(n)) {
                    communities.push(n);
                }
            }
            communities.sort((a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));
            for (const c of communities) {
                const members = membersByCommunity.get(c.id) ?? [];
                const memberNames = members
                    .map((m) => nameById.get(m))
                    .filter((x) => x !== undefined)
                    .sort();
                // Text layout: label, then up to 5 keywords, then up to 10
                // member names (alphabetical), one group per line.
                const topNames = memberNames.slice(0, 10);
                const label = c.inferredLabel ?? c.name;
                const keywords = (c.keywords ?? []).slice(0, 5).join(" ");
                const parts = [label];
                if (keywords.length > 0) {
                    parts.push(keywords);
                }
                if (topNames.length > 0) {
                    parts.push(topNames.join(" "));
                }
                const text = parts.join("\n");
                if (text.length === 0) {
                    skipped += 1;
                    continue;
                }
                const chunks = splitIntoChunks(text, maxUserTokens);
                const firstChunk = chunks[0];
                if (firstChunk === undefined) {
                    skipped += 1;
                    continue;
                }
                chunksTotal += 1;
                // Content-hash skip. Community tier is also single-chunk.
                const contentHash = hashText("community", firstChunk);
                if (priorHashes.size > 0 &&
                    priorHashes.get(priorHashKey("community", c.id, 0)) === contentHash) {
                    chunksSkipped += 1;
                    continue;
                }
                jobs.push({
                    granularity: "community",
                    text: firstChunk,
                    emitRow: (vector) => ({
                        nodeId: c.id,
                        granularity: "community",
                        chunkIndex: 0,
                        vector,
                        contentHash,
                    }),
                });
            }
        }
        // ---- Dispatch ------------------------------------------------------
        // Jobs are embedded in waves. Each wave holds up to `workers` batches
        // of up to `batchSize` texts; the batches of one wave are started
        // together and awaited as a group before the next wave begins.
        const waveSize = batchSize * workers;
        for (let i = 0; i < jobs.length; i += waveSize) {
            const waveEnd = Math.min(jobs.length, i + waveSize);
            const waveBatches = [];
            const waveJobSlices = [];
            for (let b = i; b < waveEnd; b += batchSize) {
                const batchEnd = Math.min(waveEnd, b + batchSize);
                const slice = jobs.slice(b, batchEnd);
                waveJobSlices.push(slice);
                waveBatches.push(embedder.embedBatch(slice.map((j) => j.text)));
            }
            const waveResults = await Promise.all(waveBatches);
            for (let w = 0; w < waveResults.length; w++) {
                const vectors = waveResults[w] ?? [];
                const slice = waveJobSlices[w] ?? [];
                for (let k = 0; k < slice.length; k++) {
                    const job = slice[k];
                    const vec = vectors[k];
                    // A job without a matching vector produces no row.
                    if (job === undefined || vec === undefined) {
                        continue;
                    }
                    rows.push(job.emitRow(vec));
                    byGranularity[job.granularity] = (byGranularity[job.granularity] ?? 0) + 1;
                }
            }
        }
        return {
            embeddingsInserted: rows.length,
            symbolsSkipped: skipped,
            chunksTotal,
            embeddingsModelId: embedder.modelId,
            embeddingsHash: hashRows(rows),
            rows,
            ranEmbedder: true,
            byGranularity,
            summaryFused,
            chunksSkipped,
        };
    }
    finally {
        // Release the embedder on success and on failure alike.
        await embedder.close();
    }
}
|
|
686
|
+
/**
 * Phase descriptor for the embeddings phase.
 *
 * Declared dependencies: `annotate`, `summarize` and `communities`. The
 * summarize output supplies summary text for fused symbol embeddings; the
 * communities phase supplies the Community nodes and MEMBER_OF edges read
 * by the community tier. The scan output used by the file tier is expected
 * to be reachable through `annotate`'s own dependency chain.
 */
export const embeddingsPhase = {
    name: EMBEDDER_PHASE_NAME,
    deps: [ANNOTATE_PHASE_NAME, SUMMARIZE_PHASE_NAME, COMMUNITIES_PHASE_NAME],
    run: async (ctx) => runEmbeddings(ctx),
};
|
|
697
|
+
//# sourceMappingURL=embeddings.js.map
|