@rbalchii/anchor-engine 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +609 -0
- package/README.md +317 -0
- package/anchor.bat +5 -0
- package/docs/API.md +314 -0
- package/docs/DEPLOYMENT.md +448 -0
- package/docs/INDEX.md +226 -0
- package/docs/STAR_Whitepaper_Executive.md +216 -0
- package/docs/TROUBLESHOOTING.md +535 -0
- package/docs/archive/GIT_BACKUP_VERIFICATION.md +297 -0
- package/docs/archive/adoption-guide.md +264 -0
- package/docs/archive/adoption-preparation.md +179 -0
- package/docs/archive/agent-harness-integration.md +227 -0
- package/docs/archive/api-reference.md +106 -0
- package/docs/archive/api_flows_diagram.md +118 -0
- package/docs/archive/architecture.md +410 -0
- package/docs/archive/architecture_diagram.md +174 -0
- package/docs/archive/broader-adoption-preparation.md +175 -0
- package/docs/archive/browser-paradigm-architecture.md +163 -0
- package/docs/archive/chat-integration.md +124 -0
- package/docs/archive/community-adoption-materials.md +103 -0
- package/docs/archive/community-adoption.md +147 -0
- package/docs/archive/comparison-with-siloed-solutions.md +192 -0
- package/docs/archive/comprehensive-docs.md +156 -0
- package/docs/archive/data_flow_diagram.md +251 -0
- package/docs/archive/enhancement-implementation-summary.md +146 -0
- package/docs/archive/evolution-summary.md +141 -0
- package/docs/archive/ingestion_pipeline_diagram.md +198 -0
- package/docs/archive/native-module-profiling-results.md +135 -0
- package/docs/archive/positioning-document.md +158 -0
- package/docs/archive/positioning.md +175 -0
- package/docs/archive/query-builder-documentation.md +218 -0
- package/docs/archive/quick-reference.md +40 -0
- package/docs/archive/quickstart.md +63 -0
- package/docs/archive/relationship-narrative-discovery.md +141 -0
- package/docs/archive/search-logic-improvement-plan.md +336 -0
- package/docs/archive/search_architecture_diagram.md +212 -0
- package/docs/archive/semantic-architecture-guide.md +97 -0
- package/docs/archive/sequence-diagrams.md +128 -0
- package/docs/archive/system_components_diagram.md +296 -0
- package/docs/archive/test-framework-integration.md +109 -0
- package/docs/archive/testing-framework-documentation.md +397 -0
- package/docs/archive/testing-framework-summary.md +121 -0
- package/docs/archive/testing-framework.md +377 -0
- package/docs/archive/ui-architecture.md +75 -0
- package/docs/arxiv/BIBLIOGRAPHY.bib +145 -0
- package/docs/arxiv/RELATED_WORK.tex +39 -0
- package/docs/arxiv/compile.bat +48 -0
- package/docs/arxiv/joss_response.md +33 -0
- package/docs/arxiv/prepare-submission.bat +46 -0
- package/docs/arxiv/review.md +128 -0
- package/docs/arxiv/star-whitepaper.tex +657 -0
- package/docs/code-patterns.md +289 -0
- package/docs/whitepaper.md +445 -0
- package/engine/dist/agent/runtime.d.ts +41 -0
- package/engine/dist/agent/runtime.d.ts.map +1 -0
- package/engine/dist/agent/runtime.js +73 -0
- package/engine/dist/agent/runtime.js.map +1 -0
- package/engine/dist/commands/audit-tags.d.ts +14 -0
- package/engine/dist/commands/audit-tags.d.ts.map +1 -0
- package/engine/dist/commands/audit-tags.js +180 -0
- package/engine/dist/commands/audit-tags.js.map +1 -0
- package/engine/dist/commands/distill.d.ts +19 -0
- package/engine/dist/commands/distill.d.ts.map +1 -0
- package/engine/dist/commands/distill.js +114 -0
- package/engine/dist/commands/distill.js.map +1 -0
- package/engine/dist/commands/generate-synonyms.d.ts +14 -0
- package/engine/dist/commands/generate-synonyms.d.ts.map +1 -0
- package/engine/dist/commands/generate-synonyms.js +91 -0
- package/engine/dist/commands/generate-synonyms.js.map +1 -0
- package/engine/dist/config/index.d.ts +115 -0
- package/engine/dist/config/index.d.ts.map +1 -0
- package/engine/dist/config/index.js +326 -0
- package/engine/dist/config/index.js.map +1 -0
- package/engine/dist/config/max-recall-config.d.ts +102 -0
- package/engine/dist/config/max-recall-config.d.ts.map +1 -0
- package/engine/dist/config/max-recall-config.js +102 -0
- package/engine/dist/config/max-recall-config.js.map +1 -0
- package/engine/dist/config/paths.d.ts +40 -0
- package/engine/dist/config/paths.d.ts.map +1 -0
- package/engine/dist/config/paths.js +49 -0
- package/engine/dist/config/paths.js.map +1 -0
- package/engine/dist/core/batch.d.ts +19 -0
- package/engine/dist/core/batch.d.ts.map +1 -0
- package/engine/dist/core/batch.js +37 -0
- package/engine/dist/core/batch.js.map +1 -0
- package/engine/dist/core/db.d.ts +58 -0
- package/engine/dist/core/db.d.ts.map +1 -0
- package/engine/dist/core/db.js +563 -0
- package/engine/dist/core/db.js.map +1 -0
- package/engine/dist/core/inference/ChatWorker.d.ts +2 -0
- package/engine/dist/core/inference/ChatWorker.d.ts.map +1 -0
- package/engine/dist/core/inference/ChatWorker.js +28 -0
- package/engine/dist/core/inference/ChatWorker.js.map +1 -0
- package/engine/dist/core/inference/context_manager.d.ts +49 -0
- package/engine/dist/core/inference/context_manager.d.ts.map +1 -0
- package/engine/dist/core/inference/context_manager.js +199 -0
- package/engine/dist/core/inference/context_manager.js.map +1 -0
- package/engine/dist/core/inference/llamaLoaderWorker.d.ts +2 -0
- package/engine/dist/core/inference/llamaLoaderWorker.d.ts.map +1 -0
- package/engine/dist/core/inference/llamaLoaderWorker.js +23 -0
- package/engine/dist/core/inference/llamaLoaderWorker.js.map +1 -0
- package/engine/dist/core/vector.d.ts +40 -0
- package/engine/dist/core/vector.d.ts.map +1 -0
- package/engine/dist/core/vector.js +167 -0
- package/engine/dist/core/vector.js.map +1 -0
- package/engine/dist/index.d.ts +4 -0
- package/engine/dist/index.d.ts.map +1 -0
- package/engine/dist/index.js +400 -0
- package/engine/dist/index.js.map +1 -0
- package/engine/dist/middleware/auth.d.ts +14 -0
- package/engine/dist/middleware/auth.d.ts.map +1 -0
- package/engine/dist/middleware/auth.js +44 -0
- package/engine/dist/middleware/auth.js.map +1 -0
- package/engine/dist/middleware/request-tracing.d.ts +29 -0
- package/engine/dist/middleware/request-tracing.d.ts.map +1 -0
- package/engine/dist/middleware/request-tracing.js +115 -0
- package/engine/dist/middleware/request-tracing.js.map +1 -0
- package/engine/dist/middleware/validate.d.ts +30 -0
- package/engine/dist/middleware/validate.d.ts.map +1 -0
- package/engine/dist/middleware/validate.js +117 -0
- package/engine/dist/middleware/validate.js.map +1 -0
- package/engine/dist/native/index.d.ts +106 -0
- package/engine/dist/native/index.d.ts.map +1 -0
- package/engine/dist/native/index.js +230 -0
- package/engine/dist/native/index.js.map +1 -0
- package/engine/dist/native/types.d.ts +45 -0
- package/engine/dist/native/types.d.ts.map +1 -0
- package/engine/dist/native/types.js +6 -0
- package/engine/dist/native/types.js.map +1 -0
- package/engine/dist/profiling/atomization-profiling.d.ts +8 -0
- package/engine/dist/profiling/atomization-profiling.d.ts.map +1 -0
- package/engine/dist/profiling/atomization-profiling.js +108 -0
- package/engine/dist/profiling/atomization-profiling.js.map +1 -0
- package/engine/dist/profiling/bottleneck-identification.d.ts +8 -0
- package/engine/dist/profiling/bottleneck-identification.d.ts.map +1 -0
- package/engine/dist/profiling/bottleneck-identification.js +249 -0
- package/engine/dist/profiling/bottleneck-identification.js.map +1 -0
- package/engine/dist/profiling/content-sanitization-profiling.d.ts +12 -0
- package/engine/dist/profiling/content-sanitization-profiling.d.ts.map +1 -0
- package/engine/dist/profiling/content-sanitization-profiling.js +266 -0
- package/engine/dist/profiling/content-sanitization-profiling.js.map +1 -0
- package/engine/dist/profiling/simhash-profiling.d.ts +11 -0
- package/engine/dist/profiling/simhash-profiling.d.ts.map +1 -0
- package/engine/dist/profiling/simhash-profiling.js +168 -0
- package/engine/dist/profiling/simhash-profiling.js.map +1 -0
- package/engine/dist/routes/api.d.ts +9 -0
- package/engine/dist/routes/api.d.ts.map +1 -0
- package/engine/dist/routes/api.js +37 -0
- package/engine/dist/routes/api.js.map +1 -0
- package/engine/dist/routes/enhanced-api.d.ts +9 -0
- package/engine/dist/routes/enhanced-api.d.ts.map +1 -0
- package/engine/dist/routes/enhanced-api.js +139 -0
- package/engine/dist/routes/enhanced-api.js.map +1 -0
- package/engine/dist/routes/health.d.ts +8 -0
- package/engine/dist/routes/health.d.ts.map +1 -0
- package/engine/dist/routes/health.js +89 -0
- package/engine/dist/routes/health.js.map +1 -0
- package/engine/dist/routes/monitoring.d.ts +8 -0
- package/engine/dist/routes/monitoring.d.ts.map +1 -0
- package/engine/dist/routes/monitoring.js +509 -0
- package/engine/dist/routes/monitoring.js.map +1 -0
- package/engine/dist/routes/v1/admin.d.ts +3 -0
- package/engine/dist/routes/v1/admin.d.ts.map +1 -0
- package/engine/dist/routes/v1/admin.js +261 -0
- package/engine/dist/routes/v1/admin.js.map +1 -0
- package/engine/dist/routes/v1/atoms.d.ts +3 -0
- package/engine/dist/routes/v1/atoms.d.ts.map +1 -0
- package/engine/dist/routes/v1/atoms.js +172 -0
- package/engine/dist/routes/v1/atoms.js.map +1 -0
- package/engine/dist/routes/v1/backup.d.ts +3 -0
- package/engine/dist/routes/v1/backup.d.ts.map +1 -0
- package/engine/dist/routes/v1/backup.js +100 -0
- package/engine/dist/routes/v1/backup.js.map +1 -0
- package/engine/dist/routes/v1/git.d.ts +3 -0
- package/engine/dist/routes/v1/git.d.ts.map +1 -0
- package/engine/dist/routes/v1/git.js +316 -0
- package/engine/dist/routes/v1/git.js.map +1 -0
- package/engine/dist/routes/v1/ingest.d.ts +3 -0
- package/engine/dist/routes/v1/ingest.d.ts.map +1 -0
- package/engine/dist/routes/v1/ingest.js +66 -0
- package/engine/dist/routes/v1/ingest.js.map +1 -0
- package/engine/dist/routes/v1/memory.d.ts +14 -0
- package/engine/dist/routes/v1/memory.d.ts.map +1 -0
- package/engine/dist/routes/v1/memory.js +87 -0
- package/engine/dist/routes/v1/memory.js.map +1 -0
- package/engine/dist/routes/v1/research.d.ts +3 -0
- package/engine/dist/routes/v1/research.d.ts.map +1 -0
- package/engine/dist/routes/v1/research.js +109 -0
- package/engine/dist/routes/v1/research.js.map +1 -0
- package/engine/dist/routes/v1/search.d.ts +3 -0
- package/engine/dist/routes/v1/search.d.ts.map +1 -0
- package/engine/dist/routes/v1/search.js +180 -0
- package/engine/dist/routes/v1/search.js.map +1 -0
- package/engine/dist/routes/v1/settings.d.ts +8 -0
- package/engine/dist/routes/v1/settings.d.ts.map +1 -0
- package/engine/dist/routes/v1/settings.js +211 -0
- package/engine/dist/routes/v1/settings.js.map +1 -0
- package/engine/dist/routes/v1/system.d.ts +3 -0
- package/engine/dist/routes/v1/system.d.ts.map +1 -0
- package/engine/dist/routes/v1/system.js +326 -0
- package/engine/dist/routes/v1/system.js.map +1 -0
- package/engine/dist/routes/v1/tags.d.ts +3 -0
- package/engine/dist/routes/v1/tags.d.ts.map +1 -0
- package/engine/dist/routes/v1/tags.js +102 -0
- package/engine/dist/routes/v1/tags.js.map +1 -0
- package/engine/dist/server-8080.d.ts +2 -0
- package/engine/dist/server-8080.d.ts.map +1 -0
- package/engine/dist/server-8080.js +74 -0
- package/engine/dist/server-8080.js.map +1 -0
- package/engine/dist/services/backup/backup-restore.d.ts +37 -0
- package/engine/dist/services/backup/backup-restore.d.ts.map +1 -0
- package/engine/dist/services/backup/backup-restore.js +385 -0
- package/engine/dist/services/backup/backup-restore.js.map +1 -0
- package/engine/dist/services/backup/backup.d.ts +14 -0
- package/engine/dist/services/backup/backup.d.ts.map +1 -0
- package/engine/dist/services/backup/backup.js +442 -0
- package/engine/dist/services/backup/backup.js.map +1 -0
- package/engine/dist/services/distillation/radial-distiller-v2.d.ts +127 -0
- package/engine/dist/services/distillation/radial-distiller-v2.d.ts.map +1 -0
- package/engine/dist/services/distillation/radial-distiller-v2.js +503 -0
- package/engine/dist/services/distillation/radial-distiller-v2.js.map +1 -0
- package/engine/dist/services/distillation/radial-distiller.d.ts +63 -0
- package/engine/dist/services/distillation/radial-distiller.d.ts.map +1 -0
- package/engine/dist/services/distillation/radial-distiller.js +394 -0
- package/engine/dist/services/distillation/radial-distiller.js.map +1 -0
- package/engine/dist/services/health-check-enhanced.d.ts +89 -0
- package/engine/dist/services/health-check-enhanced.d.ts.map +1 -0
- package/engine/dist/services/health-check-enhanced.js +417 -0
- package/engine/dist/services/health-check-enhanced.js.map +1 -0
- package/engine/dist/services/idle-manager.d.ts +56 -0
- package/engine/dist/services/idle-manager.d.ts.map +1 -0
- package/engine/dist/services/idle-manager.js +210 -0
- package/engine/dist/services/idle-manager.js.map +1 -0
- package/engine/dist/services/inference/inference-service.d.ts +27 -0
- package/engine/dist/services/inference/inference-service.d.ts.map +1 -0
- package/engine/dist/services/inference/inference-service.js +89 -0
- package/engine/dist/services/inference/inference-service.js.map +1 -0
- package/engine/dist/services/inference/inference.d.ts +59 -0
- package/engine/dist/services/inference/inference.d.ts.map +1 -0
- package/engine/dist/services/inference/inference.js +131 -0
- package/engine/dist/services/inference/inference.js.map +1 -0
- package/engine/dist/services/ingest/atomizer-service.d.ts +74 -0
- package/engine/dist/services/ingest/atomizer-service.d.ts.map +1 -0
- package/engine/dist/services/ingest/atomizer-service.js +982 -0
- package/engine/dist/services/ingest/atomizer-service.js.map +1 -0
- package/engine/dist/services/ingest/content-cleaner.d.ts +43 -0
- package/engine/dist/services/ingest/content-cleaner.d.ts.map +1 -0
- package/engine/dist/services/ingest/content-cleaner.js +166 -0
- package/engine/dist/services/ingest/content-cleaner.js.map +1 -0
- package/engine/dist/services/ingest/github-ingest-service.d.ts +103 -0
- package/engine/dist/services/ingest/github-ingest-service.d.ts.map +1 -0
- package/engine/dist/services/ingest/github-ingest-service.js +537 -0
- package/engine/dist/services/ingest/github-ingest-service.js.map +1 -0
- package/engine/dist/services/ingest/ingest-atomic.d.ts +16 -0
- package/engine/dist/services/ingest/ingest-atomic.d.ts.map +1 -0
- package/engine/dist/services/ingest/ingest-atomic.js +437 -0
- package/engine/dist/services/ingest/ingest-atomic.js.map +1 -0
- package/engine/dist/services/ingest/ingest.d.ts +50 -0
- package/engine/dist/services/ingest/ingest.d.ts.map +1 -0
- package/engine/dist/services/ingest/ingest.js +230 -0
- package/engine/dist/services/ingest/ingest.js.map +1 -0
- package/engine/dist/services/ingest/watchdog.d.ts +31 -0
- package/engine/dist/services/ingest/watchdog.d.ts.map +1 -0
- package/engine/dist/services/ingest/watchdog.js +400 -0
- package/engine/dist/services/ingest/watchdog.js.map +1 -0
- package/engine/dist/services/llm/context.d.ts +6 -0
- package/engine/dist/services/llm/context.d.ts.map +1 -0
- package/engine/dist/services/llm/context.js +80 -0
- package/engine/dist/services/llm/context.js.map +1 -0
- package/engine/dist/services/llm/provider.d.ts +23 -0
- package/engine/dist/services/llm/provider.d.ts.map +1 -0
- package/engine/dist/services/llm/provider.js +338 -0
- package/engine/dist/services/llm/provider.js.map +1 -0
- package/engine/dist/services/llm/reader.d.ts +12 -0
- package/engine/dist/services/llm/reader.d.ts.map +1 -0
- package/engine/dist/services/llm/reader.js +40 -0
- package/engine/dist/services/llm/reader.js.map +1 -0
- package/engine/dist/services/mirror/mirror.d.ts +28 -0
- package/engine/dist/services/mirror/mirror.d.ts.map +1 -0
- package/engine/dist/services/mirror/mirror.js +208 -0
- package/engine/dist/services/mirror/mirror.js.map +1 -0
- package/engine/dist/services/nlp/nlp-service.d.ts +70 -0
- package/engine/dist/services/nlp/nlp-service.d.ts.map +1 -0
- package/engine/dist/services/nlp/nlp-service.js +151 -0
- package/engine/dist/services/nlp/nlp-service.js.map +1 -0
- package/engine/dist/services/nlp/query-parser.d.ts +9 -0
- package/engine/dist/services/nlp/query-parser.d.ts.map +1 -0
- package/engine/dist/services/nlp/query-parser.js +29 -0
- package/engine/dist/services/nlp/query-parser.js.map +1 -0
- package/engine/dist/services/query-builder/DataFrame.d.ts +95 -0
- package/engine/dist/services/query-builder/DataFrame.d.ts.map +1 -0
- package/engine/dist/services/query-builder/DataFrame.js +263 -0
- package/engine/dist/services/query-builder/DataFrame.js.map +1 -0
- package/engine/dist/services/query-builder/QueryBuilder.d.ts +106 -0
- package/engine/dist/services/query-builder/QueryBuilder.d.ts.map +1 -0
- package/engine/dist/services/query-builder/QueryBuilder.js +235 -0
- package/engine/dist/services/query-builder/QueryBuilder.js.map +1 -0
- package/engine/dist/services/query-builder/utils/export.d.ts +11 -0
- package/engine/dist/services/query-builder/utils/export.d.ts.map +1 -0
- package/engine/dist/services/query-builder/utils/export.js +130 -0
- package/engine/dist/services/query-builder/utils/export.js.map +1 -0
- package/engine/dist/services/research/researcher.d.ts +15 -0
- package/engine/dist/services/research/researcher.d.ts.map +1 -0
- package/engine/dist/services/research/researcher.js +123 -0
- package/engine/dist/services/research/researcher.js.map +1 -0
- package/engine/dist/services/scribe/scribe.d.ts +43 -0
- package/engine/dist/services/scribe/scribe.d.ts.map +1 -0
- package/engine/dist/services/scribe/scribe.js +135 -0
- package/engine/dist/services/scribe/scribe.js.map +1 -0
- package/engine/dist/services/search/bright-nodes.d.ts +41 -0
- package/engine/dist/services/search/bright-nodes.d.ts.map +1 -0
- package/engine/dist/services/search/bright-nodes.js +117 -0
- package/engine/dist/services/search/bright-nodes.js.map +1 -0
- package/engine/dist/services/search/context-inflator.d.ts +63 -0
- package/engine/dist/services/search/context-inflator.d.ts.map +1 -0
- package/engine/dist/services/search/context-inflator.js +649 -0
- package/engine/dist/services/search/context-inflator.js.map +1 -0
- package/engine/dist/services/search/context-manager.d.ts +34 -0
- package/engine/dist/services/search/context-manager.d.ts.map +1 -0
- package/engine/dist/services/search/context-manager.js +124 -0
- package/engine/dist/services/search/context-manager.js.map +1 -0
- package/engine/dist/services/search/distributed-query.d.ts +38 -0
- package/engine/dist/services/search/distributed-query.d.ts.map +1 -0
- package/engine/dist/services/search/distributed-query.js +105 -0
- package/engine/dist/services/search/distributed-query.js.map +1 -0
- package/engine/dist/services/search/explore.d.ts +73 -0
- package/engine/dist/services/search/explore.d.ts.map +1 -0
- package/engine/dist/services/search/explore.js +388 -0
- package/engine/dist/services/search/explore.js.map +1 -0
- package/engine/dist/services/search/graph-context-serializer.d.ts +76 -0
- package/engine/dist/services/search/graph-context-serializer.d.ts.map +1 -0
- package/engine/dist/services/search/graph-context-serializer.js +435 -0
- package/engine/dist/services/search/graph-context-serializer.js.map +1 -0
- package/engine/dist/services/search/llm-context-formatter.d.ts +122 -0
- package/engine/dist/services/search/llm-context-formatter.d.ts.map +1 -0
- package/engine/dist/services/search/llm-context-formatter.js +394 -0
- package/engine/dist/services/search/llm-context-formatter.js.map +1 -0
- package/engine/dist/services/search/physics-tag-walker.d.ts +115 -0
- package/engine/dist/services/search/physics-tag-walker.d.ts.map +1 -0
- package/engine/dist/services/search/physics-tag-walker.js +611 -0
- package/engine/dist/services/search/physics-tag-walker.js.map +1 -0
- package/engine/dist/services/search/query-parser.d.ts +66 -0
- package/engine/dist/services/search/query-parser.d.ts.map +1 -0
- package/engine/dist/services/search/query-parser.js +346 -0
- package/engine/dist/services/search/query-parser.js.map +1 -0
- package/engine/dist/services/search/search-utils.d.ts +100 -0
- package/engine/dist/services/search/search-utils.d.ts.map +1 -0
- package/engine/dist/services/search/search-utils.js +473 -0
- package/engine/dist/services/search/search-utils.js.map +1 -0
- package/engine/dist/services/search/search.d.ts +116 -0
- package/engine/dist/services/search/search.d.ts.map +1 -0
- package/engine/dist/services/search/search.js +1286 -0
- package/engine/dist/services/search/search.js.map +1 -0
- package/engine/dist/services/search/sovereign-system-prompt.d.ts +48 -0
- package/engine/dist/services/search/sovereign-system-prompt.d.ts.map +1 -0
- package/engine/dist/services/search/sovereign-system-prompt.js +101 -0
- package/engine/dist/services/search/sovereign-system-prompt.js.map +1 -0
- package/engine/dist/services/search/streaming-search.d.ts +51 -0
- package/engine/dist/services/search/streaming-search.d.ts.map +1 -0
- package/engine/dist/services/search/streaming-search.js +94 -0
- package/engine/dist/services/search/streaming-search.js.map +1 -0
- package/engine/dist/services/semantic/semantic-ingestion-service.d.ts +53 -0
- package/engine/dist/services/semantic/semantic-ingestion-service.d.ts.map +1 -0
- package/engine/dist/services/semantic/semantic-ingestion-service.js +625 -0
- package/engine/dist/services/semantic/semantic-ingestion-service.js.map +1 -0
- package/engine/dist/services/semantic/semantic-molecule-processor.d.ts +68 -0
- package/engine/dist/services/semantic/semantic-molecule-processor.d.ts.map +1 -0
- package/engine/dist/services/semantic/semantic-molecule-processor.js +176 -0
- package/engine/dist/services/semantic/semantic-molecule-processor.js.map +1 -0
- package/engine/dist/services/semantic/semantic-search.d.ts +52 -0
- package/engine/dist/services/semantic/semantic-search.d.ts.map +1 -0
- package/engine/dist/services/semantic/semantic-search.js +649 -0
- package/engine/dist/services/semantic/semantic-search.js.map +1 -0
- package/engine/dist/services/semantic/semantic-tag-deriver.d.ts +64 -0
- package/engine/dist/services/semantic/semantic-tag-deriver.d.ts.map +1 -0
- package/engine/dist/services/semantic/semantic-tag-deriver.js +191 -0
- package/engine/dist/services/semantic/semantic-tag-deriver.js.map +1 -0
- package/engine/dist/services/semantic/types/semantic.d.ts +26 -0
- package/engine/dist/services/semantic/types/semantic.d.ts.map +1 -0
- package/engine/dist/services/semantic/types/semantic.js +7 -0
- package/engine/dist/services/semantic/types/semantic.js.map +1 -0
- package/engine/dist/services/synonyms/auto-synonym-generator.d.ts +79 -0
- package/engine/dist/services/synonyms/auto-synonym-generator.d.ts.map +1 -0
- package/engine/dist/services/synonyms/auto-synonym-generator.js +415 -0
- package/engine/dist/services/synonyms/auto-synonym-generator.js.map +1 -0
- package/engine/dist/services/system-status.d.ts +68 -0
- package/engine/dist/services/system-status.d.ts.map +1 -0
- package/engine/dist/services/system-status.js +107 -0
- package/engine/dist/services/system-status.js.map +1 -0
- package/engine/dist/services/tags/discovery.d.ts +16 -0
- package/engine/dist/services/tags/discovery.d.ts.map +1 -0
- package/engine/dist/services/tags/discovery.js +206 -0
- package/engine/dist/services/tags/discovery.js.map +1 -0
- package/engine/dist/services/tags/gliner.d.ts +18 -0
- package/engine/dist/services/tags/gliner.d.ts.map +1 -0
- package/engine/dist/services/tags/gliner.js +119 -0
- package/engine/dist/services/tags/gliner.js.map +1 -0
- package/engine/dist/services/tags/infector.d.ts +21 -0
- package/engine/dist/services/tags/infector.d.ts.map +1 -0
- package/engine/dist/services/tags/infector.js +168 -0
- package/engine/dist/services/tags/infector.js.map +1 -0
- package/engine/dist/services/tags/tag-auditor.d.ts +77 -0
- package/engine/dist/services/tags/tag-auditor.d.ts.map +1 -0
- package/engine/dist/services/tags/tag-auditor.js +283 -0
- package/engine/dist/services/tags/tag-auditor.js.map +1 -0
- package/engine/dist/services/taxonomy/taxonomy-manager.d.ts +50 -0
- package/engine/dist/services/taxonomy/taxonomy-manager.d.ts.map +1 -0
- package/engine/dist/services/taxonomy/taxonomy-manager.js +291 -0
- package/engine/dist/services/taxonomy/taxonomy-manager.js.map +1 -0
- package/engine/dist/services/vision/vision_service.d.ts +4 -0
- package/engine/dist/services/vision/vision_service.d.ts.map +1 -0
- package/engine/dist/services/vision/vision_service.js +197 -0
- package/engine/dist/services/vision/vision_service.js.map +1 -0
- package/engine/dist/test-framework/core.d.ts +133 -0
- package/engine/dist/test-framework/core.d.ts.map +1 -0
- package/engine/dist/test-framework/core.js +313 -0
- package/engine/dist/test-framework/core.js.map +1 -0
- package/engine/dist/test-framework/dataset-runner.d.ts +78 -0
- package/engine/dist/test-framework/dataset-runner.d.ts.map +1 -0
- package/engine/dist/test-framework/dataset-runner.js +223 -0
- package/engine/dist/test-framework/dataset-runner.js.map +1 -0
- package/engine/dist/test-framework/diagnostic-tests.d.ts +38 -0
- package/engine/dist/test-framework/diagnostic-tests.d.ts.map +1 -0
- package/engine/dist/test-framework/diagnostic-tests.js +283 -0
- package/engine/dist/test-framework/diagnostic-tests.js.map +1 -0
- package/engine/dist/test-framework/performance-regression-tests.d.ts +30 -0
- package/engine/dist/test-framework/performance-regression-tests.d.ts.map +1 -0
- package/engine/dist/test-framework/performance-regression-tests.js +331 -0
- package/engine/dist/test-framework/performance-regression-tests.js.map +1 -0
- package/engine/dist/types/api.d.ts +53 -0
- package/engine/dist/types/api.d.ts.map +1 -0
- package/engine/dist/types/api.js +2 -0
- package/engine/dist/types/api.js.map +1 -0
- package/engine/dist/types/atomic.d.ts +42 -0
- package/engine/dist/types/atomic.d.ts.map +1 -0
- package/engine/dist/types/atomic.js +10 -0
- package/engine/dist/types/atomic.js.map +1 -0
- package/engine/dist/types/context-protocol.d.ts +137 -0
- package/engine/dist/types/context-protocol.d.ts.map +1 -0
- package/engine/dist/types/context-protocol.js +28 -0
- package/engine/dist/types/context-protocol.js.map +1 -0
- package/engine/dist/types/context.d.ts +2 -0
- package/engine/dist/types/context.d.ts.map +1 -0
- package/engine/dist/types/context.js +2 -0
- package/engine/dist/types/context.js.map +1 -0
- package/engine/dist/types/index.d.ts +20 -0
- package/engine/dist/types/index.d.ts.map +1 -0
- package/engine/dist/types/index.js +18 -0
- package/engine/dist/types/index.js.map +1 -0
- package/engine/dist/types/search.d.ts +31 -0
- package/engine/dist/types/search.d.ts.map +1 -0
- package/engine/dist/types/search.js +2 -0
- package/engine/dist/types/search.js.map +1 -0
- package/engine/dist/types/taxonomy.d.ts +137 -0
- package/engine/dist/types/taxonomy.d.ts.map +1 -0
- package/engine/dist/types/taxonomy.js +138 -0
- package/engine/dist/types/taxonomy.js.map +1 -0
- package/engine/dist/types/taxonomy.simple.d.ts +131 -0
- package/engine/dist/types/taxonomy.simple.d.ts.map +1 -0
- package/engine/dist/types/taxonomy.simple.js +132 -0
- package/engine/dist/types/taxonomy.simple.js.map +1 -0
- package/engine/dist/types/tool-call.d.ts +16 -0
- package/engine/dist/types/tool-call.d.ts.map +1 -0
- package/engine/dist/types/tool-call.js +6 -0
- package/engine/dist/types/tool-call.js.map +1 -0
- package/engine/dist/types/trace.d.ts +25 -0
- package/engine/dist/types/trace.d.ts.map +1 -0
- package/engine/dist/types/trace.js +5 -0
- package/engine/dist/types/trace.js.map +1 -0
- package/engine/dist/utils/adaptive-concurrency.d.ts +81 -0
- package/engine/dist/utils/adaptive-concurrency.d.ts.map +1 -0
- package/engine/dist/utils/adaptive-concurrency.js +266 -0
- package/engine/dist/utils/adaptive-concurrency.js.map +1 -0
- package/engine/dist/utils/date_extractor.d.ts +2 -0
- package/engine/dist/utils/date_extractor.d.ts.map +1 -0
- package/engine/dist/utils/date_extractor.js +32 -0
- package/engine/dist/utils/date_extractor.js.map +1 -0
- package/engine/dist/utils/native-module-manager.d.ts +48 -0
- package/engine/dist/utils/native-module-manager.d.ts.map +1 -0
- package/engine/dist/utils/native-module-manager.js +265 -0
- package/engine/dist/utils/native-module-manager.js.map +1 -0
- package/engine/dist/utils/native-module-profiler.d.ts +66 -0
- package/engine/dist/utils/native-module-profiler.d.ts.map +1 -0
- package/engine/dist/utils/native-module-profiler.js +182 -0
- package/engine/dist/utils/native-module-profiler.js.map +1 -0
- package/engine/dist/utils/path-manager.d.ts +59 -0
- package/engine/dist/utils/path-manager.d.ts.map +1 -0
- package/engine/dist/utils/path-manager.js +154 -0
- package/engine/dist/utils/path-manager.js.map +1 -0
- package/engine/dist/utils/performance-monitor.d.ts +92 -0
- package/engine/dist/utils/performance-monitor.d.ts.map +1 -0
- package/engine/dist/utils/performance-monitor.js +221 -0
- package/engine/dist/utils/performance-monitor.js.map +1 -0
- package/engine/dist/utils/process-manager.d.ts +18 -0
- package/engine/dist/utils/process-manager.d.ts.map +1 -0
- package/engine/dist/utils/process-manager.js +100 -0
- package/engine/dist/utils/process-manager.js.map +1 -0
- package/engine/dist/utils/request-tracer.d.ts +131 -0
- package/engine/dist/utils/request-tracer.d.ts.map +1 -0
- package/engine/dist/utils/request-tracer.js +414 -0
- package/engine/dist/utils/request-tracer.js.map +1 -0
- package/engine/dist/utils/resource-manager.d.ts +108 -0
- package/engine/dist/utils/resource-manager.d.ts.map +1 -0
- package/engine/dist/utils/resource-manager.js +235 -0
- package/engine/dist/utils/resource-manager.js.map +1 -0
- package/engine/dist/utils/safe-dns.d.ts +14 -0
- package/engine/dist/utils/safe-dns.d.ts.map +1 -0
- package/engine/dist/utils/safe-dns.js +105 -0
- package/engine/dist/utils/safe-dns.js.map +1 -0
- package/engine/dist/utils/structured-logger.d.ts +124 -0
- package/engine/dist/utils/structured-logger.d.ts.map +1 -0
- package/engine/dist/utils/structured-logger.js +332 -0
- package/engine/dist/utils/structured-logger.js.map +1 -0
- package/engine/dist/utils/tag-cleanup.d.ts +11 -0
- package/engine/dist/utils/tag-cleanup.d.ts.map +1 -0
- package/engine/dist/utils/tag-cleanup.js +111 -0
- package/engine/dist/utils/tag-cleanup.js.map +1 -0
- package/engine/dist/utils/tag-filter.d.ts +19 -0
- package/engine/dist/utils/tag-filter.d.ts.map +1 -0
- package/engine/dist/utils/tag-filter.js +147 -0
- package/engine/dist/utils/tag-filter.js.map +1 -0
- package/engine/dist/utils/tag-modulation.d.ts +80 -0
- package/engine/dist/utils/tag-modulation.d.ts.map +1 -0
- package/engine/dist/utils/tag-modulation.js +284 -0
- package/engine/dist/utils/tag-modulation.js.map +1 -0
- package/engine/dist/utils/timer.d.ts +40 -0
- package/engine/dist/utils/timer.d.ts.map +1 -0
- package/engine/dist/utils/timer.js +76 -0
- package/engine/dist/utils/timer.js.map +1 -0
- package/engine/dist/utils/token-utils.d.ts +19 -0
- package/engine/dist/utils/token-utils.d.ts.map +1 -0
- package/engine/dist/utils/token-utils.js +71 -0
- package/engine/dist/utils/token-utils.js.map +1 -0
- package/engine/dist/utils/wasm-module-loader.d.ts +50 -0
- package/engine/dist/utils/wasm-module-loader.d.ts.map +1 -0
- package/engine/dist/utils/wasm-module-loader.js +136 -0
- package/engine/dist/utils/wasm-module-loader.js.map +1 -0
- package/engine/package.json +105 -0
- package/package.json +106 -0
|
@@ -0,0 +1,982 @@
|
|
|
1
|
+
import * as crypto from 'crypto';
|
|
2
|
+
import * as fs from 'fs';
|
|
3
|
+
import * as path from 'path';
|
|
4
|
+
import { fileURLToPath } from 'url';
|
|
5
|
+
import { shouldUseStrictAtomSelection, modulateTags } from '../../utils/tag-modulation.js';
|
|
6
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
7
|
+
const __dirname = path.dirname(__filename);
|
|
8
|
+
// Optional native accelerators published under the @rbalchii scope.
// Each binding starts as null and is only filled in when its package can be
// imported; an import failure is ignored on purpose so the pure-JS code paths
// are used instead.
let nativeFingerprint = null;
let nativeCleanse = null;
try {
    ({ fingerprint: nativeFingerprint } = await import('@rbalchii/native-fingerprint'));
}
catch { /* use JS fallback */ }
try {
    ({ cleanse: nativeCleanse } = await import('@rbalchii/native-keyassassin'));
}
catch { /* use JS fallback */ }
|
|
21
|
+
export class AtomizerService {
|
|
22
|
+
/**
|
|
23
|
+
* Tag blacklist patterns - prevents low-value tags from being stored
|
|
24
|
+
* These patterns filter out noise at ingestion time
|
|
25
|
+
*/
|
|
26
|
+
static TAG_BLACKLIST_PATTERNS = [
|
|
27
|
+
// Color codes (hex)
|
|
28
|
+
/^#[0-9a-fA-F]{3,8}$/,
|
|
29
|
+
// Pure numbers or too short
|
|
30
|
+
/^#\d{1,3}$/,
|
|
31
|
+
/^#_\w*$/,
|
|
32
|
+
/^#__[\w\d_]+$/,
|
|
33
|
+
// HTML/DOM artifacts
|
|
34
|
+
/^#btn\b/, /^#class\b/, /^#div\b/, /^#id\b/,
|
|
35
|
+
/^#span\b/, /^#href\b/, /^#src\b/,
|
|
36
|
+
// Code artifacts
|
|
37
|
+
/^#fn\b/, /^#elif\b/, /^#else\b/, /^#endif\b/,
|
|
38
|
+
/^#ifdef\b/, /^#ifndef\b/, /^#include\b/,
|
|
39
|
+
/^#define\b/, /^#pragma\b/,
|
|
40
|
+
// Scraping artifacts
|
|
41
|
+
/^#cite_note/, /^#cite_ref/, /^#amp_tf/,
|
|
42
|
+
/^#details_of_atom/, /^#entry_lin/, /^#entry_links/,
|
|
43
|
+
/^#opensearch_extension/, /^#extension_elements/,
|
|
44
|
+
/^#simple_examples/, /^#query_interface/,
|
|
45
|
+
/^#api_response/, /^#response_example/,
|
|
46
|
+
/^#examples?$/, /^#overview$/, /^#preface$/,
|
|
47
|
+
/^#appendix/, /^#appendices$/, /^#bib\b/, /^#ref\b/,
|
|
48
|
+
// Error/artifact tags
|
|
49
|
+
/^#incorrect_/, /^#error_/, /^#null\b/,
|
|
50
|
+
/^#undefined\b/, /^#nan\b/,
|
|
51
|
+
// Too generic
|
|
52
|
+
/^#slow_pickup$/, /^#late_night$/, /^#early_morning$/,
|
|
53
|
+
/^#monday\b/, /^#tuesday\b/, /^#wednesday\b/,
|
|
54
|
+
/^#thursday\b/, /^#friday\b/, /^#saturday\b/,
|
|
55
|
+
/^#sunday\b/, /^#manual\b/, /^#manually_/,
|
|
56
|
+
/^#test_/, /^#tmp\b/, /^#temp\b/, /^#untagged$/,
|
|
57
|
+
// Deprecated project names
|
|
58
|
+
/^#agentgpt$/, /^#babyagi$/, /^#autogen$/, /^#chimaera$/,
|
|
59
|
+
// System tags
|
|
60
|
+
/^#manually_quarantined$/, /^#quarantined$/,
|
|
61
|
+
/^#system$/, /^#internal$/, /^#external$/,
|
|
62
|
+
// Test fixture tags (from unit test mock data bleeding into production)
|
|
63
|
+
/^#tag\d*$/i, // #Tag, #Tag1, #tag2
|
|
64
|
+
/^#shared[a-z]$/i, // #sharedA, #sharedB
|
|
65
|
+
/^#word\d*$/i, // #Word, #Word1
|
|
66
|
+
/^#fixture/i, // #fixture...
|
|
67
|
+
/^#mock/i, // #mock...
|
|
68
|
+
/^#dummy/i, // #dummy...
|
|
69
|
+
/^#sample[a-z0-9]*$/i, // #sample, #sampleA
|
|
70
|
+
];
|
|
71
|
+
static TAG_BLACKLIST_EXACT = new Set([
|
|
72
|
+
'#_', '#0', '#1', '#2', '#3', '#4', '#5', '#6', '#7', '#8', '#9',
|
|
73
|
+
'#00', '#000', '#0000', '#00000', '#000000',
|
|
74
|
+
]);
|
|
75
|
+
/**
|
|
76
|
+
* Check if a tag should be filtered out
|
|
77
|
+
*/
|
|
78
|
+
isBlacklistedTag(tag) {
|
|
79
|
+
if (!tag || typeof tag !== 'string')
|
|
80
|
+
return true;
|
|
81
|
+
const normalizedTag = tag.trim();
|
|
82
|
+
if (AtomizerService.TAG_BLACKLIST_EXACT.has(normalizedTag)) {
|
|
83
|
+
return true;
|
|
84
|
+
}
|
|
85
|
+
for (const pattern of AtomizerService.TAG_BLACKLIST_PATTERNS) {
|
|
86
|
+
if (pattern.test(normalizedTag)) {
|
|
87
|
+
return true;
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
return false;
|
|
91
|
+
}
|
|
92
|
+
/**
|
|
93
|
+
* Apply tag modulation to atom labels
|
|
94
|
+
* Filters based on modulation level and blacklist strictness from user_settings.json
|
|
95
|
+
*/
|
|
96
|
+
applyTagModulation(atomLabels) {
|
|
97
|
+
if (!atomLabels || atomLabels.length === 0)
|
|
98
|
+
return [];
|
|
99
|
+
// Convert atom labels to tag format
|
|
100
|
+
const rawTags = atomLabels.map(label => label.startsWith('#') ? label : `#${label}`);
|
|
101
|
+
// Apply modulation filtering
|
|
102
|
+
return modulateTags(rawTags);
|
|
103
|
+
}
|
|
104
|
+
/**
|
|
105
|
+
* Transient data patterns to exclude from ingestion
|
|
106
|
+
* These patterns identify temporary/noisy content that clutters context
|
|
107
|
+
*/
|
|
108
|
+
static TRANSIENT_PATTERNS = [
|
|
109
|
+
// Terminal error logs
|
|
110
|
+
/Traceback \(most recent call last\)/i,
|
|
111
|
+
/KeyError:/i,
|
|
112
|
+
/TypeError:/i,
|
|
113
|
+
/ValueError:/i,
|
|
114
|
+
/Error:.*at line \d+/i,
|
|
115
|
+
/Exception in thread/i,
|
|
116
|
+
/Fatal error:/i,
|
|
117
|
+
// Package installation logs
|
|
118
|
+
/npm install/i,
|
|
119
|
+
/pip install/i,
|
|
120
|
+
/yarn add/i,
|
|
121
|
+
/pnpm add/i,
|
|
122
|
+
/Collecting [a-zA-Z0-9_-]+/i, // pip "Collecting package"
|
|
123
|
+
/Downloading [a-zA-Z0-9_-]+/i, // pip "Downloading package"
|
|
124
|
+
/added \d+ package/i, // npm "added X packages"
|
|
125
|
+
/Successfully installed/i,
|
|
126
|
+
// Build artifacts
|
|
127
|
+
/Build succeeded/i,
|
|
128
|
+
/Build failed/i,
|
|
129
|
+
/Compiling\.\.\./i,
|
|
130
|
+
/Linking\.\.\./i,
|
|
131
|
+
/Generating\.\.\./i,
|
|
132
|
+
// Repetitive log noise
|
|
133
|
+
/^\[\d{4}-\d{2}-\d{2}.*\]$/m, // Standalone timestamp lines
|
|
134
|
+
/^={50,}$/m, // Separator lines (====...)
|
|
135
|
+
/^-{50,}$/m, // Separator lines (----...)
|
|
136
|
+
];
|
|
137
|
+
/**
|
|
138
|
+
* Check if content is transient/temporary data that should be excluded
|
|
139
|
+
*/
|
|
140
|
+
isTransientData(content) {
|
|
141
|
+
// Check if more than 50% of content matches transient patterns
|
|
142
|
+
const lines = content.split('\n');
|
|
143
|
+
if (lines.length < 5)
|
|
144
|
+
return false; // Too short to be log output
|
|
145
|
+
let transientLines = 0;
|
|
146
|
+
for (const pattern of AtomizerService.TRANSIENT_PATTERNS) {
|
|
147
|
+
for (const line of lines) {
|
|
148
|
+
if (pattern.test(line)) {
|
|
149
|
+
transientLines++;
|
|
150
|
+
if (transientLines > lines.length * 0.5) {
|
|
151
|
+
return true; // More than 50% is transient
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
return false;
|
|
157
|
+
}
|
|
158
|
+
/**
|
|
159
|
+
* Deconstructs raw content into Atomic Topology.
|
|
160
|
+
* Returns the Compound (Main Body) and its Constituent Particles (Atoms/Molecules).
|
|
161
|
+
*/
|
|
162
|
+
async atomize(content, sourcePath, provenance, fileTimestamp) {
|
|
163
|
+
const filename = sourcePath.split(/[/\\]/).pop() || sourcePath;
|
|
164
|
+
const contentSizeMB = (content.length / (1024 * 1024)).toFixed(2);
|
|
165
|
+
const startTime = Date.now();
|
|
166
|
+
// Check for transient data before processing
|
|
167
|
+
if (this.isTransientData(content)) {
|
|
168
|
+
console.log(`[Atomizer] ⚠️ SKIP: ${filename} - Transient data detected (error logs, install output, etc.)`);
|
|
169
|
+
return null; // Skip ingestion entirely
|
|
170
|
+
}
|
|
171
|
+
// Note: System output (Anchor search results) is NOT skipped - it's cleaned during sanitization
|
|
172
|
+
// The sanitization step removes score markers, system IDs, YAML formatting, etc.
|
|
173
|
+
// Deduplication handles any remaining duplicates
|
|
174
|
+
console.log(`[Atomizer] ⏱️ START: ${filename} (${contentSizeMB}MB)`);
|
|
175
|
+
try {
|
|
176
|
+
// 1. Sanitize (Iron Lung) - Chunked Strategy for Large Files
|
|
177
|
+
// Optimized port of Refiner's Key Assassin
|
|
178
|
+
// For very large files, we sanitize in chunks to avoid string length limits/OOM
|
|
179
|
+
const sanitizeStart = Date.now();
|
|
180
|
+
const CHUNK_SIZE = 1024 * 1024; // 1MB chunks
|
|
181
|
+
let cleanContent = '';
|
|
182
|
+
if (content.length > CHUNK_SIZE * 2) {
|
|
183
|
+
// Generator approach for memory efficiency
|
|
184
|
+
let chunkCount = 0;
|
|
185
|
+
for (const chunk of this.chunkedSanitize(content, sourcePath, CHUNK_SIZE)) {
|
|
186
|
+
cleanContent += chunk;
|
|
187
|
+
chunkCount++;
|
|
188
|
+
if (chunkCount % 10 === 0) {
|
|
189
|
+
console.log(`[Atomizer] ⏱️ Sanitize chunk ${chunkCount}... (${((Date.now() - sanitizeStart) / 1000).toFixed(1)}s)`);
|
|
190
|
+
}
|
|
191
|
+
// Yield to event loop to keep server responsive
|
|
192
|
+
await new Promise(resolve => setImmediate(resolve));
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
else {
|
|
196
|
+
cleanContent = this.sanitize(content, sourcePath);
|
|
197
|
+
}
|
|
198
|
+
console.log(`[Atomizer] ⏱️ Sanitize complete: ${((Date.now() - sanitizeStart) / 1000).toFixed(2)}s`);
|
|
199
|
+
// 2. Identification (Hash)
|
|
200
|
+
const hashStart = Date.now();
|
|
201
|
+
const compoundId = crypto.createHash('md5').update(cleanContent + sourcePath).digest('hex');
|
|
202
|
+
const timestamp = fileTimestamp || Date.now();
|
|
203
|
+
console.log(`[Atomizer] ⏱️ Hash complete: ${Date.now() - hashStart}ms`);
|
|
204
|
+
// 3. System Atoms (Project/File Level)
|
|
205
|
+
const systemAtoms = this.extractSystemAtoms(sourcePath);
|
|
206
|
+
// 4. Construct Compound ID
|
|
207
|
+
const fullCompoundId = `mem_${compoundId}`;
|
|
208
|
+
// 5. Molecular Fission (Semantic Splitting)
|
|
209
|
+
// Determine Type & Extract Data
|
|
210
|
+
const splitStart = Date.now();
|
|
211
|
+
const type = this.detectMoleculeType(cleanContent, sourcePath); // Determine main type
|
|
212
|
+
// Pass type to optimize splitting strategy
|
|
213
|
+
const moleculeParts = this.splitIntoMolecules(cleanContent, type);
|
|
214
|
+
console.log(`[Atomizer] ⏱️ Split into ${moleculeParts.length} molecules: ${((Date.now() - splitStart) / 1000).toFixed(2)}s`);
|
|
215
|
+
// 5. Molecular Enrichment (Granular Tagging & Typing)
|
|
216
|
+
const enrichStart = Date.now();
|
|
217
|
+
const molecules = [];
|
|
218
|
+
const allAtomsMap = new Map();
|
|
219
|
+
// Add System Atoms to global map
|
|
220
|
+
systemAtoms.forEach(a => allAtomsMap.set(a.id, a));
|
|
221
|
+
// Define maximum content length for individual molecules
|
|
222
|
+
const MAX_MOLECULE_CONTENT_LENGTH = 500 * 1024; // 500KB limit
|
|
223
|
+
// Timestamp Context: Start with file timestamp (modification time)
|
|
224
|
+
// As we scan molecules, if we find a date in the content (e.g. log timestamp),
|
|
225
|
+
// we update this context so subsequent atoms inherit it.
|
|
226
|
+
let currentTimestamp = timestamp;
|
|
227
|
+
const totalMolecules = moleculeParts.length;
|
|
228
|
+
const progressInterval = Math.max(100, Math.floor(totalMolecules / 10)); // Log every 10% or every 100
|
|
229
|
+
// Process molecules in batches to yield to event loop
|
|
230
|
+
for (let i = 0; i < moleculeParts.length; i++) {
|
|
231
|
+
const part = moleculeParts[i];
|
|
232
|
+
const { content: text, start, end, timestamp: partTimestamp } = part;
|
|
233
|
+
// Progress logging and yield every 100 molecules
|
|
234
|
+
if (i % progressInterval === 0 && i > 0) {
|
|
235
|
+
const pct = ((i / totalMolecules) * 100).toFixed(0);
|
|
236
|
+
console.log(`[Atomizer] ⏱️ Enriching: ${pct}% (${i}/${totalMolecules}) - ${((Date.now() - enrichStart) / 1000).toFixed(1)}s`);
|
|
237
|
+
}
|
|
238
|
+
if (i % 100 === 0) {
|
|
239
|
+
await new Promise(resolve => setImmediate(resolve));
|
|
240
|
+
}
|
|
241
|
+
// Update time context if this part has a specific timestamp
|
|
242
|
+
// Extract earliest timestamp from content for temporal ordering
|
|
243
|
+
const extractedTs = this.extractEarliestTimestamp(text, currentTimestamp);
|
|
244
|
+
if (extractedTs) {
|
|
245
|
+
currentTimestamp = extractedTs;
|
|
246
|
+
}
|
|
247
|
+
// Check content length and truncate if necessary
|
|
248
|
+
let processedText = text;
|
|
249
|
+
if (processedText.length > MAX_MOLECULE_CONTENT_LENGTH) {
|
|
250
|
+
console.warn(`[Atomizer] Molecule content exceeds maximum length (${processedText.length} chars), truncating...`);
|
|
251
|
+
processedText = processedText.substring(0, MAX_MOLECULE_CONTENT_LENGTH) + '... [TRUNCATED]';
|
|
252
|
+
}
|
|
253
|
+
// Scan for concepts in this specific molecule
|
|
254
|
+
// PERFORMANCE: Skip for pure data rows (CSV lines) that have no prose
|
|
255
|
+
// But keep scanning for conversational YAML which has semantic content
|
|
256
|
+
const conceptAtoms = this.scanAtoms(processedText);
|
|
257
|
+
const moleculeAtoms = [...systemAtoms, ...conceptAtoms];
|
|
258
|
+
// Add concepts to global map
|
|
259
|
+
conceptAtoms.forEach(a => allAtomsMap.set(a.id, a));
|
|
260
|
+
const molId = `mol_${crypto.createHash('md5').update(compoundId + i + processedText).digest('hex').substring(0, 12)}`;
|
|
261
|
+
// Re-Determine Type locally (e.g. code block in markdown)
|
|
262
|
+
// Use the passed type as default, but refined per chunk if needed
|
|
263
|
+
const molType = (type === 'prose' && (processedText.includes('```') || processedText.includes('function') || processedText.includes('const '))) ? 'code' : type;
|
|
264
|
+
let numericVal = undefined;
|
|
265
|
+
let numericUnit = undefined;
|
|
266
|
+
if (molType === 'data') {
|
|
267
|
+
const data = this.extractNumericData(processedText);
|
|
268
|
+
if (data) {
|
|
269
|
+
numericVal = data.value;
|
|
270
|
+
numericUnit = data.unit;
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
molecules.push({
|
|
274
|
+
id: molId,
|
|
275
|
+
content: processedText,
|
|
276
|
+
atoms: moleculeAtoms.map(a => a.id),
|
|
277
|
+
sequence: i,
|
|
278
|
+
compoundId: fullCompoundId,
|
|
279
|
+
// Universal Coordinates
|
|
280
|
+
start_byte: start,
|
|
281
|
+
end_byte: end,
|
|
282
|
+
// Metadata
|
|
283
|
+
type: molType,
|
|
284
|
+
numeric_value: numericVal,
|
|
285
|
+
numeric_unit: numericUnit,
|
|
286
|
+
molecular_signature: this.generateSimHash(processedText),
|
|
287
|
+
timestamp: currentTimestamp,
|
|
288
|
+
// Apply tag modulation: filter blacklisted tags and apply modulation level
|
|
289
|
+
tags: this.applyTagModulation(moleculeAtoms.map(a => a.label)),
|
|
290
|
+
entities: {
|
|
291
|
+
people: moleculeAtoms.filter(a => ['#coda', '#rob', '#oliver'].includes(a.label.toLowerCase())).map(a => a.label),
|
|
292
|
+
concepts: moleculeAtoms.filter(a => a.type === 'concept').map(a => a.label),
|
|
293
|
+
projects: moleculeAtoms.filter(a => ['#project', '#engine', '#agent'].some(kw => a.label.toLowerCase().includes(kw))).map(a => a.label)
|
|
294
|
+
}
|
|
295
|
+
});
|
|
296
|
+
}
|
|
297
|
+
console.log(`[Atomizer] ⏱️ Enrichment complete: ${((Date.now() - enrichStart) / 1000).toFixed(2)}s`);
|
|
298
|
+
const allAtoms = Array.from(allAtomsMap.values());
|
|
299
|
+
const compound = {
|
|
300
|
+
id: fullCompoundId,
|
|
301
|
+
compound_body: cleanContent,
|
|
302
|
+
molecules: molecules.map(m => m.id),
|
|
303
|
+
atoms: allAtoms.map(a => a.id),
|
|
304
|
+
path: sourcePath,
|
|
305
|
+
timestamp: fileTimestamp || timestamp, // Compound keeps file timestamp if provided
|
|
306
|
+
provenance: provenance,
|
|
307
|
+
molecular_signature: this.generateSimHash(cleanContent)
|
|
308
|
+
};
|
|
309
|
+
const totalTime = ((Date.now() - startTime) / 1000).toFixed(2);
|
|
310
|
+
console.log(`[Atomizer] ✅ COMPLETE: ${filename} (${contentSizeMB}MB) → ${molecules.length} molecules, ${allAtoms.length} atoms in ${totalTime}s`);
|
|
311
|
+
return {
|
|
312
|
+
compound,
|
|
313
|
+
molecules,
|
|
314
|
+
atoms: allAtoms
|
|
315
|
+
};
|
|
316
|
+
}
|
|
317
|
+
catch (error) {
|
|
318
|
+
console.error(`[Atomizer] FATAL ERROR processing ${sourcePath}:`, error.message);
|
|
319
|
+
throw error;
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
*chunkedSanitize(text, filePath, chunkSize) {
|
|
323
|
+
let offset = 0;
|
|
324
|
+
while (offset < text.length) {
|
|
325
|
+
let end = Math.min(offset + chunkSize, text.length);
|
|
326
|
+
// Align to newline if not at the end
|
|
327
|
+
if (end < text.length) {
|
|
328
|
+
const nextNewline = text.indexOf('\n', end);
|
|
329
|
+
if (nextNewline !== -1 && nextNewline < end + 1000) { // Don't drift too far
|
|
330
|
+
end = nextNewline + 1;
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
const chunk = text.substring(offset, end);
|
|
334
|
+
yield this.sanitize(chunk, filePath);
|
|
335
|
+
offset = end;
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
// --- PORTED LOGIC FROM REFINER.TS ---
|
|
339
|
+
/**
|
|
340
|
+
* Enhanced Content Sanitization (The Key Assassin)
|
|
341
|
+
* Surgically removes JSON wrappers, log spam, and PII.
|
|
342
|
+
*/
|
|
343
|
+
sanitize(text, filePath = '') {
|
|
344
|
+
let clean = text;
|
|
345
|
+
// 1. Fundamental Normalization
|
|
346
|
+
clean = clean.replace(/^\uFEFF/, '').replace(/[\u0000\uFFFD]/g, '');
|
|
347
|
+
// Aggressive Newline Normalization: convert all \r\n and literal "\r\n" strings to real newlines
|
|
348
|
+
clean = clean.replace(/\\r\\n/g, '\n').replace(/\r\n/g, '\n');
|
|
349
|
+
// 2. Enhanced Surgeon: Log Spam Removal
|
|
350
|
+
clean = clean.replace(/(?:^|\s|\.{3}\s*)Processing '[^']+'\.{3}/g, '\n');
|
|
351
|
+
clean = clean.replace(/(?:^|\s|\.{3}\s*)Loading '[^']+'\.{3}/g, '\n');
|
|
352
|
+
clean = clean.replace(/(?:^|\s|\.{3}\s*)Indexing '[^']+'\.{3}/g, '\n');
|
|
353
|
+
clean = clean.replace(/(?:^|\s|\.{3}\s*)Analyzing '[^']+'\.{3}/g, '\n');
|
|
354
|
+
// [NEW] Robust Processing Log Filter (for " - [TIMESTAMP] ... Processing ...")
|
|
355
|
+
clean = clean.replace(/(?:^|\n)\s*-\s*\[\d{4}-\d{2}-\d{2}.*?\].*?Processing.*?(?:\n|$)/gi, '\n');
|
|
356
|
+
// Strip Log Timestamps (at start of lines)
|
|
357
|
+
clean = clean.replace(/^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:\.\d{3})?\s*(?:AM|PM)?\s*[-:>]/gm, '');
|
|
358
|
+
// Strip bracketed metadata like [2026-01-25...]
|
|
359
|
+
clean = clean.replace(/\[\d{4}-\d{2}-\d{2}.*?\]/g, '');
|
|
360
|
+
clean = clean.replace(/\[[#=]{0,10}\s{0,10}\]\s*\d{1,3}%/g, ''); // [===] 100%
|
|
361
|
+
// 2.5 PII Masking
|
|
362
|
+
clean = clean.replace(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g, '[EMAIL_REDACTED]');
|
|
363
|
+
clean = clean.replace(/\b(?:\d{1,3}\.){3}\d{1,3}\b/g, '[IP_REDACTED]');
|
|
364
|
+
clean = clean.replace(/sk-[a-zA-Z0-9]{32,}/g, 'sk-[REDACTED]');
|
|
365
|
+
// --- DENSITY-AWARE SCRUBBER (Standard 073) ---
|
|
366
|
+
// 1. Strip "Dirty Read" Source Headers & Recursive Metadata
|
|
367
|
+
// Matches: [Source: ...] or status: [Source: ...]
|
|
368
|
+
clean = clean.replace(/(?:status:\s*)?\[Source: .*?\](?:\s*\(Timestamp: .*?\))?/g, '');
|
|
369
|
+
// 2. Strip Logging/YAML/JSON Wrappers (Aggressive Pattern)
|
|
370
|
+
// This targets the keys and the quotes around them, but leaves the content.
|
|
371
|
+
const metaKeys = ['response_content', 'thinking_content', 'content', 'message', 'text', 'body', 'type', 'timestamp', 'source_path'];
|
|
372
|
+
metaKeys.forEach(key => {
|
|
373
|
+
// Match "key": " or key: |- or "key": |- etc.
|
|
374
|
+
const regex = new RegExp(`["']?${key}["']?\\s*:\\s*(?:\\|-?|")?`, 'g');
|
|
375
|
+
clean = clean.replace(regex, '');
|
|
376
|
+
});
|
|
377
|
+
// Strip trailing quotes and braces from JSON-like fragments
|
|
378
|
+
clean = clean.replace(/"\s*,\s*"/g, '\n');
|
|
379
|
+
clean = clean.replace(/"\s*}/g, '');
|
|
380
|
+
clean = clean.replace(/{\s*"/g, '');
|
|
381
|
+
// 3. Strip LLM Role Markers
|
|
382
|
+
clean = clean.replace(/<\|user\|>/g, '');
|
|
383
|
+
clean = clean.replace(/<\|assistant\|>/g, '');
|
|
384
|
+
clean = clean.replace(/<\|system\|>/g, '');
|
|
385
|
+
// 4. Strip Anchor System Output (prevent self-contamination)
|
|
386
|
+
// Remove score markers from search results
|
|
387
|
+
clean = clean.replace(/score:\s*\d+(?:\.\d+)?/g, '');
|
|
388
|
+
// Remove virtual molecule IDs
|
|
389
|
+
clean = clean.replace(/virtual_mem_[a-f0-9_]+/g, '');
|
|
390
|
+
// Remove system memory IDs
|
|
391
|
+
clean = clean.replace(/\bid:\s*["']?mem_[a-f0-9_]+["']?\s*,?/g, '');
|
|
392
|
+
// Remove source path markers
|
|
393
|
+
clean = clean.replace(/source:\s*["']?inbox\/[^"'\n]+["']?\s*,?/g, '');
|
|
394
|
+
// Remove provenance markers
|
|
395
|
+
clean = clean.replace(/provenance:\s*["']?(internal|external|quarantine)["']?\s*,?/g, '');
|
|
396
|
+
// Remove bucket arrays
|
|
397
|
+
clean = clean.replace(/buckets:\s*\[[\s\w,"']*\]\s*,?/g, '');
|
|
398
|
+
// Remove epoch data
|
|
399
|
+
clean = clean.replace(/epochs?:\s*['"]?[^,\n"']+['"]?\s*,?/g, '');
|
|
400
|
+
// Remove timestamp fields from system output
|
|
401
|
+
clean = clean.replace(/timestamp:\s*["']?[^"'\n]+["']?\s*,?/g, '');
|
|
402
|
+
// Remove compound_id and byte range markers
|
|
403
|
+
clean = clean.replace(/compound_id:\s*["']?[a-f0-9_]+["']?\s*,?/g, '');
|
|
404
|
+
clean = clean.replace(/start_byte:\s*\d+\s*,?/g, '');
|
|
405
|
+
clean = clean.replace(/end_byte:\s*\d+\s*,?/g, '');
|
|
406
|
+
clean = clean.replace(/molecular_signature:\s*["']?[a-f0-9]+["']?\s*,?/g, '');
|
|
407
|
+
clean = clean.replace(/is_inflated:\s*(true|false)\s*,?/g, '');
|
|
408
|
+
// 5. Strip MCP/Agent Output Formatting
|
|
409
|
+
// Remove YAML list markers from search results
|
|
410
|
+
clean = clean.replace(/^\s*-\s*(id|source|score|content|tags|buckets|provenance):\s*/gm, '');
|
|
411
|
+
// Remove YAML block markers
|
|
412
|
+
clean = clean.replace(/^\s*\|\s*$/gm, '');
|
|
413
|
+
// Remove code block wrappers (keep content)
|
|
414
|
+
clean = clean.replace(/```yaml\s*/g, '');
|
|
415
|
+
clean = clean.replace(/```\s*$/gm, '');
|
|
416
|
+
// Remove emoji markers from system output
|
|
417
|
+
clean = clean.replace(/🔍\s*|🤖\s*|⚙️\s*|✅\s*|❌\s*/g, '');
|
|
418
|
+
// 6. Final Polish
|
|
419
|
+
clean = clean.replace(/\n{3,}/g, '\n\n');
|
|
420
|
+
return clean.trim();
|
|
421
|
+
}
|
|
422
|
+
/**
|
|
423
|
+
* Helper: The Key Assassin
|
|
424
|
+
* Recursively un-escapes and removes JSON wrappers.
|
|
425
|
+
*/
|
|
426
|
+
cleanseJsonArtifacts(text) {
|
|
427
|
+
let clean = text;
|
|
428
|
+
// 1. Recursive Un-escape
|
|
429
|
+
// DISABLED NATIVE CLEANSE due to stack overflow on deep nesting
|
|
430
|
+
// if (native && native.cleanse) {
|
|
431
|
+
// clean = native.cleanse(clean);
|
|
432
|
+
// } else {
|
|
433
|
+
let pass = 0;
|
|
434
|
+
while (clean.includes('\\') && pass < 3) {
|
|
435
|
+
pass++;
|
|
436
|
+
clean = clean.replace(/\\"/g, '"').replace(/\\n/g, '\n').replace(/\\t/g, '\t');
|
|
437
|
+
}
|
|
438
|
+
// }
|
|
439
|
+
// 2. Code Block Protection
|
|
440
|
+
const codeBlocks = [];
|
|
441
|
+
const PLACEHOLDER = '___CODE_BLOCK_PLACEHOLDER___';
|
|
442
|
+
clean = clean.replace(/```[\s\S]*?```/g, (match) => {
|
|
443
|
+
codeBlocks.push(match);
|
|
444
|
+
return `${PLACEHOLDER}${codeBlocks.length - 1}___`;
|
|
445
|
+
});
|
|
446
|
+
// 3. Remove Metadata & Wrappers
|
|
447
|
+
const purge = (ptrn) => { clean = clean.replace(ptrn, ''); };
|
|
448
|
+
purge(/"type"\s*:\s*"[^"]*",?/g);
|
|
449
|
+
purge(/"timestamp"\s*:\s*"[^"]*",?/g);
|
|
450
|
+
purge(/"source"\s*:\s*"[^"]*",?/g);
|
|
451
|
+
purge(/"response_content"\s*:\s*/g);
|
|
452
|
+
purge(/"thinking_content"\s*:\s*/g);
|
|
453
|
+
purge(/"content"\s*:\s*/g);
|
|
454
|
+
// 4. Structural Cleanup
|
|
455
|
+
clean = clean.replace(/\}\s*,\s*\{/g, '\n\n');
|
|
456
|
+
clean = clean.trim();
|
|
457
|
+
if (clean.startsWith('[') && clean.endsWith(']'))
|
|
458
|
+
clean = clean.substring(1, clean.length - 1);
|
|
459
|
+
// 5. Restore Code Blocks
|
|
460
|
+
clean = clean.replace(/___CODE_BLOCK_PLACEHOLDER___(\d+)___/g, (_, idx) => codeBlocks[parseInt(idx)] || _);
|
|
461
|
+
// 6. Slash Compressor
|
|
462
|
+
clean = clean.replace(/\\{2,}/g, '/');
|
|
463
|
+
return clean;
|
|
464
|
+
}
|
|
465
|
+
extractSystemAtoms(filePath) {
|
|
466
|
+
const atoms = [];
|
|
467
|
+
const normalized = filePath.replace(/\\/g, '/');
|
|
468
|
+
const lowerPath = normalized.toLowerCase();
|
|
469
|
+
const parts = normalized.split('/');
|
|
470
|
+
// --- TIME-LADDER LOGIC ---
|
|
471
|
+
// History/Archive gets down-weighted #Archive tag
|
|
472
|
+
if (lowerPath.includes('/history/') || lowerPath.includes('/archive/')) {
|
|
473
|
+
atoms.push(this.createAtom('#Archive', 'system', 0.5));
|
|
474
|
+
}
|
|
475
|
+
// Everything else is implicitly Current/Truth (Weight 1.0) unless specified otherwise
|
|
476
|
+
// 1. Project Root & Structure (Auto-Tagging)
|
|
477
|
+
const projectIndicators = ['codebase', 'projects', 'repos', 'src', 'packages', 'apps', 'personal', 'work', 'client'];
|
|
478
|
+
for (let i = 0; i < parts.length; i++) {
|
|
479
|
+
if (projectIndicators.includes(parts[i].toLowerCase()) && parts[i + 1]) {
|
|
480
|
+
atoms.push(this.createAtom(`#project:${parts[i + 1]}`, 'system'));
|
|
481
|
+
break;
|
|
482
|
+
}
|
|
483
|
+
}
|
|
484
|
+
// Structure Tags
|
|
485
|
+
if (normalized.includes('/src/') || normalized.startsWith('src/'))
|
|
486
|
+
atoms.push(this.createAtom('#src', 'system'));
|
|
487
|
+
if (normalized.includes('/docs/') || normalized.startsWith('docs/'))
|
|
488
|
+
atoms.push(this.createAtom('#docs', 'system'));
|
|
489
|
+
if (normalized.includes('/tests/') || normalized.startsWith('tests/'))
|
|
490
|
+
atoms.push(this.createAtom('#test', 'system'));
|
|
491
|
+
// File Type Tags
|
|
492
|
+
const ext = normalized.split('.').pop()?.toLowerCase() || '';
|
|
493
|
+
if (['ts', 'js', 'py', 'rs', 'go', 'java', 'cpp', 'c', 'h'].includes(ext))
|
|
494
|
+
atoms.push(this.createAtom('#code', 'system'));
|
|
495
|
+
if (['md', 'txt', 'rst'].includes(ext))
|
|
496
|
+
atoms.push(this.createAtom('#doc', 'system'));
|
|
497
|
+
if (['json', 'yaml', 'yml', 'xml'].includes(ext))
|
|
498
|
+
atoms.push(this.createAtom('#config', 'system'));
|
|
499
|
+
return atoms;
|
|
500
|
+
}
|
|
501
|
+
scanAtoms(content) {
|
|
502
|
+
const atoms = [];
|
|
503
|
+
const strictMode = shouldUseStrictAtomSelection();
|
|
504
|
+
// 1. Sovereign Keywords - OPTIMIZED with compiled regex
|
|
505
|
+
const keywordRegex = this.getKeywordRegex();
|
|
506
|
+
if (keywordRegex) {
|
|
507
|
+
const lowerContent = content.toLowerCase();
|
|
508
|
+
const matches = lowerContent.match(keywordRegex);
|
|
509
|
+
if (matches) {
|
|
510
|
+
// Use cached lowercase->original mapping
|
|
511
|
+
const keywordMap = this.getKeywordMap();
|
|
512
|
+
const seen = new Set();
|
|
513
|
+
for (const match of matches) {
|
|
514
|
+
const original = keywordMap.get(match);
|
|
515
|
+
if (original && !seen.has(original)) {
|
|
516
|
+
seen.add(original);
|
|
517
|
+
atoms.push(this.createAtom(`#${original}`, 'concept'));
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
}
|
|
521
|
+
}
|
|
522
|
+
// 2. Explicit Content Tags (#tag)
|
|
523
|
+
const tagMatches = content.match(/#(\w+)/g);
|
|
524
|
+
if (tagMatches) {
|
|
525
|
+
const seen = new Set();
|
|
526
|
+
tagMatches.forEach(m => {
|
|
527
|
+
const tag = m.toLowerCase();
|
|
528
|
+
// In strict mode, filter out common words and low-value tags
|
|
529
|
+
if (strictMode) {
|
|
530
|
+
const cleanTag = tag.replace(/^#/, '');
|
|
531
|
+
// Skip if too short, common word, or looks like noise
|
|
532
|
+
if (cleanTag.length < 3 ||
|
|
533
|
+
this.isCommonWord(cleanTag) ||
|
|
534
|
+
this.isBlacklistedTag(tag)) {
|
|
535
|
+
return;
|
|
536
|
+
}
|
|
537
|
+
}
|
|
538
|
+
if (!seen.has(tag)) {
|
|
539
|
+
seen.add(tag);
|
|
540
|
+
atoms.push(this.createAtom(m, 'concept'));
|
|
541
|
+
}
|
|
542
|
+
});
|
|
543
|
+
}
|
|
544
|
+
// Deduplicate locally
|
|
545
|
+
const unique = new Map();
|
|
546
|
+
atoms.forEach(a => unique.set(a.id, a));
|
|
547
|
+
return Array.from(unique.values());
|
|
548
|
+
}
|
|
549
|
+
/**
|
|
550
|
+
* Check if a word is a common word that should be filtered in strict mode
|
|
551
|
+
*/
|
|
552
|
+
isCommonWord(word) {
|
|
553
|
+
const commonWords = new Set([
|
|
554
|
+
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
|
|
555
|
+
'of', 'with', 'by', 'from', 'up', 'about', 'into', 'over', 'after',
|
|
556
|
+
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
|
|
557
|
+
'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
|
|
558
|
+
'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they',
|
|
559
|
+
'what', 'which', 'who', 'whom', 'whose', 'when', 'where', 'why', 'how',
|
|
560
|
+
'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such',
|
|
561
|
+
'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
|
|
562
|
+
'can', 'just', 'now', 'then', 'here', 'there', 'if', 'as', 'but', 'or'
|
|
563
|
+
]);
|
|
564
|
+
return commonWords.has(word.toLowerCase());
|
|
565
|
+
}
|
|
566
|
+
// Cache for keywords and compiled regex
|
|
567
|
+
cachedKeywords = null;
|
|
568
|
+
cachedKeywordRegex = null;
|
|
569
|
+
cachedKeywordMap = null;
|
|
570
|
+
getKeywordRegex() {
|
|
571
|
+
if (this.cachedKeywordRegex !== null)
|
|
572
|
+
return this.cachedKeywordRegex;
|
|
573
|
+
const keywords = this.loadSovereignKeywords();
|
|
574
|
+
if (keywords.length === 0) {
|
|
575
|
+
return null;
|
|
576
|
+
}
|
|
577
|
+
// Escape regex special chars and join with | for single-pass matching
|
|
578
|
+
const escaped = keywords.map(kw => kw.toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
|
|
579
|
+
this.cachedKeywordRegex = new RegExp(`\\b(${escaped.join('|')})\\b`, 'gi');
|
|
580
|
+
return this.cachedKeywordRegex;
|
|
581
|
+
}
|
|
582
|
+
getKeywordMap() {
|
|
583
|
+
if (this.cachedKeywordMap)
|
|
584
|
+
return this.cachedKeywordMap;
|
|
585
|
+
const keywords = this.loadSovereignKeywords();
|
|
586
|
+
this.cachedKeywordMap = new Map();
|
|
587
|
+
for (const kw of keywords) {
|
|
588
|
+
this.cachedKeywordMap.set(kw.toLowerCase(), kw);
|
|
589
|
+
}
|
|
590
|
+
return this.cachedKeywordMap;
|
|
591
|
+
}
|
|
592
|
+
loadSovereignKeywords() {
|
|
593
|
+
if (this.cachedKeywords)
|
|
594
|
+
return this.cachedKeywords;
|
|
595
|
+
try {
|
|
596
|
+
// Check likely locations for internal_tags.json
|
|
597
|
+
const possiblePaths = [
|
|
598
|
+
path.join(process.cwd(), 'engine', 'context', 'internal_tags.json'),
|
|
599
|
+
path.join(process.cwd(), '..', 'engine', 'context', 'internal_tags.json'),
|
|
600
|
+
// engine/src/services/ingest -> ../../../../engine/context
|
|
601
|
+
path.join(__dirname, '../../../../engine/context/internal_tags.json'),
|
|
602
|
+
// Fallback to old location
|
|
603
|
+
path.join(process.cwd(), 'context', 'internal_tags.json')
|
|
604
|
+
];
|
|
605
|
+
for (const p of possiblePaths) {
|
|
606
|
+
if (fs.existsSync(p)) {
|
|
607
|
+
const content = fs.readFileSync(p, 'utf-8');
|
|
608
|
+
const json = JSON.parse(content);
|
|
609
|
+
if (Array.isArray(json.keywords)) {
|
|
610
|
+
this.cachedKeywords = json.keywords;
|
|
611
|
+
return json.keywords;
|
|
612
|
+
}
|
|
613
|
+
}
|
|
614
|
+
}
|
|
615
|
+
this.cachedKeywords = [];
|
|
616
|
+
return [];
|
|
617
|
+
}
|
|
618
|
+
catch (e) {
|
|
619
|
+
console.error('[Atomizer] Failed to load internal_tags.json', e);
|
|
620
|
+
return [];
|
|
621
|
+
}
|
|
622
|
+
}
|
|
623
|
+
createAtom(label, type, weight = 1.0) {
|
|
624
|
+
return {
|
|
625
|
+
id: `atom_${crypto.createHash('sha256').update(label).digest('hex').substring(0, 12)}`,
|
|
626
|
+
label,
|
|
627
|
+
type,
|
|
628
|
+
weight
|
|
629
|
+
};
|
|
630
|
+
}
|
|
631
|
+
/**
|
|
632
|
+
* Splits content into molecules with byte offsets and extracted timestamps.
|
|
633
|
+
* Enhanced with Type awareness (Prose vs Code vs Data).
|
|
634
|
+
*/
|
|
635
|
+
splitIntoMolecules(text, type = 'prose', maxSize = 1024) {
|
|
636
|
+
const results = [];
|
|
637
|
+
// Helper to get UTF-8 byte length of a string
|
|
638
|
+
const getByteLength = (str) => {
|
|
639
|
+
return Buffer.byteLength(str, 'utf8');
|
|
640
|
+
};
|
|
641
|
+
// Helper to convert string index to byte offset
|
|
642
|
+
const stringIndexToByteOffset = (str, stringIndex) => {
|
|
643
|
+
if (stringIndex <= 0)
|
|
644
|
+
return 0;
|
|
645
|
+
if (stringIndex >= str.length)
|
|
646
|
+
return getByteLength(str);
|
|
647
|
+
return getByteLength(str.substring(0, stringIndex));
|
|
648
|
+
};
|
|
649
|
+
// Helper to extract FIRST timestamp from a chunk (legacy - used for molecule splitting)
|
|
650
|
+
const extractTimestamp = (chunk) => {
|
|
651
|
+
// Match ISO timestamps: 2026-01-25T03:43:54.405Z or 2026-01-25 03:43:54
|
|
652
|
+
const isoRegex = /\b(\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}(?:\.\d{3})?Z?)\b/g;
|
|
653
|
+
let match = isoRegex.exec(chunk);
|
|
654
|
+
if (match) {
|
|
655
|
+
const ts = Date.parse(match[1]);
|
|
656
|
+
if (!isNaN(ts))
|
|
657
|
+
return ts;
|
|
658
|
+
}
|
|
659
|
+
// Match YYYY-MM-DD format (without time)
|
|
660
|
+
const dateRegex = /\b(20[2-9]\d-\d{2}-\d{2})\b/;
|
|
661
|
+
let match2 = chunk.match(dateRegex);
|
|
662
|
+
if (match2) {
|
|
663
|
+
const ts = Date.parse(match2[1]);
|
|
664
|
+
if (!isNaN(ts))
|
|
665
|
+
return ts;
|
|
666
|
+
}
|
|
667
|
+
// Match MM/DD/YYYY or DD/MM/YYYY format
|
|
668
|
+
const usDateRegex = /\b(\d{1,2}\/\d{1,2}\/\d{4})\b/;
|
|
669
|
+
let match3 = chunk.match(usDateRegex);
|
|
670
|
+
if (match3) {
|
|
671
|
+
const ts = Date.parse(match3[1]);
|
|
672
|
+
if (!isNaN(ts))
|
|
673
|
+
return ts;
|
|
674
|
+
}
|
|
675
|
+
// Match Month DD, YYYY format
|
|
676
|
+
const monthDayYearRegex = /\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),\s+(\d{4})\b/;
|
|
677
|
+
let match4 = chunk.match(monthDayYearRegex);
|
|
678
|
+
if (match4) {
|
|
679
|
+
const [, month, day, year] = match4;
|
|
680
|
+
const monthIndex = ['January', 'February', 'March', 'April', 'May', 'June',
|
|
681
|
+
'July', 'August', 'September', 'October', 'November', 'December']
|
|
682
|
+
.indexOf(month);
|
|
683
|
+
const date = new Date(parseInt(year), monthIndex, parseInt(day));
|
|
684
|
+
if (!isNaN(date.getTime()))
|
|
685
|
+
return date.getTime();
|
|
686
|
+
}
|
|
687
|
+
// Match DD Month YYYY format
|
|
688
|
+
const dayMonthYearRegex = /\b(\d{1,2})\s+(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{4})\b/;
|
|
689
|
+
let match5 = chunk.match(dayMonthYearRegex);
|
|
690
|
+
if (match5) {
|
|
691
|
+
const [, day, month, year] = match5;
|
|
692
|
+
const monthIndex = ['January', 'February', 'March', 'April', 'May', 'June',
|
|
693
|
+
'July', 'August', 'September', 'October', 'November', 'December']
|
|
694
|
+
.indexOf(month);
|
|
695
|
+
const date = new Date(parseInt(year), monthIndex, parseInt(day));
|
|
696
|
+
if (!isNaN(date.getTime()))
|
|
697
|
+
return date.getTime();
|
|
698
|
+
}
|
|
699
|
+
return undefined;
|
|
700
|
+
};
|
|
701
|
+
// --- STRATEGY: CODE (AST BLOCKS) ---
|
|
702
|
+
if (type === 'code') {
|
|
703
|
+
// "Heuristic AST": Split by top-level blocks (functions, classes) or chunks of logic.
|
|
704
|
+
// Using regex to detect block starts and tracking braces.
|
|
705
|
+
const lines = text.split('\n');
|
|
706
|
+
let currentBlock = '';
|
|
707
|
+
let blockStart = 0;
|
|
708
|
+
let currentCursor = 0;
|
|
709
|
+
let braceDepth = 0;
|
|
710
|
+
for (const line of lines) {
|
|
711
|
+
const lineWithNewline = line + '\n';
|
|
712
|
+
const lineByteLen = getByteLength(lineWithNewline);
|
|
713
|
+
const openBraces = (line.match(/\{/g) || []).length;
|
|
714
|
+
const closeBraces = (line.match(/\}/g) || []).length;
|
|
715
|
+
const prevDepth = braceDepth;
|
|
716
|
+
braceDepth += (openBraces - closeBraces);
|
|
717
|
+
currentBlock += lineWithNewline;
|
|
718
|
+
// End of a top-level block?
|
|
719
|
+
if (braceDepth === 0 && prevDepth > 0) {
|
|
720
|
+
// Just closed a root block (function/class)
|
|
721
|
+
results.push({ content: currentBlock, start: blockStart, end: currentCursor + lineByteLen, timestamp: extractTimestamp(currentBlock) });
|
|
722
|
+
currentBlock = '';
|
|
723
|
+
blockStart = currentCursor + lineByteLen;
|
|
724
|
+
}
|
|
725
|
+
// Double newline in root scope -> likely separate statements?
|
|
726
|
+
else if (braceDepth === 0 && line.trim() === '' && currentBlock.trim().length > 0) {
|
|
727
|
+
results.push({ content: currentBlock, start: blockStart, end: currentCursor + lineByteLen, timestamp: extractTimestamp(currentBlock) });
|
|
728
|
+
currentBlock = '';
|
|
729
|
+
blockStart = currentCursor + lineByteLen;
|
|
730
|
+
}
|
|
731
|
+
currentCursor += lineByteLen;
|
|
732
|
+
}
|
|
733
|
+
if (currentBlock.trim().length > 0) {
|
|
734
|
+
results.push({ content: currentBlock, start: blockStart, end: currentCursor, timestamp: extractTimestamp(currentBlock) });
|
|
735
|
+
}
|
|
736
|
+
}
|
|
737
|
+
else if (type === 'data') {
|
|
738
|
+
// --- STRATEGY: DATA (ROWS) ---
|
|
739
|
+
// Split by line
|
|
740
|
+
let cursor = 0;
|
|
741
|
+
const lines = text.split('\n');
|
|
742
|
+
for (const line of lines) {
|
|
743
|
+
const lineWithNewline = line + '\n';
|
|
744
|
+
const byteLen = getByteLength(lineWithNewline);
|
|
745
|
+
if (line.trim().length > 0) {
|
|
746
|
+
// Store without the newline in content, but account for it in byte offsets
|
|
747
|
+
const lineByteLen = getByteLength(line);
|
|
748
|
+
results.push({ content: line, start: cursor, end: cursor + lineByteLen, timestamp: extractTimestamp(line) });
|
|
749
|
+
}
|
|
750
|
+
cursor += byteLen;
|
|
751
|
+
}
|
|
752
|
+
}
|
|
753
|
+
else {
|
|
754
|
+
// --- STRATEGY: PROSE (SENTENCES with MARKDOWN FISSION) ---
|
|
755
|
+
// MARKDOWN FISSION: Split on code fences first to separate code from prose
|
|
756
|
+
const codeFenceRegex = /```[\s\S]*?```/g;
|
|
757
|
+
const codeFences = [];
|
|
758
|
+
let fenceMatch;
|
|
759
|
+
while ((fenceMatch = codeFenceRegex.exec(text)) !== null) {
|
|
760
|
+
const startByte = stringIndexToByteOffset(text, fenceMatch.index);
|
|
761
|
+
const endByte = stringIndexToByteOffset(text, fenceMatch.index + fenceMatch[0].length);
|
|
762
|
+
codeFences.push({
|
|
763
|
+
match: fenceMatch[0],
|
|
764
|
+
stringIndex: fenceMatch.index,
|
|
765
|
+
startByte: startByte,
|
|
766
|
+
endByte: endByte
|
|
767
|
+
});
|
|
768
|
+
}
|
|
769
|
+
// If we have code fences, split around them
|
|
770
|
+
if (codeFences.length > 0) {
|
|
771
|
+
let stringCursor = 0; // Track position in string indices
|
|
772
|
+
let byteCursor = 0; // Track position in byte offsets
|
|
773
|
+
for (const fence of codeFences) {
|
|
774
|
+
// Pre-fence prose
|
|
775
|
+
const fenceStringStart = fence.stringIndex;
|
|
776
|
+
if (fenceStringStart > stringCursor) {
|
|
777
|
+
const preProse = text.substring(stringCursor, fenceStringStart);
|
|
778
|
+
if (preProse.trim().length > 0) {
|
|
779
|
+
// Recursively split the prose portion into sentences
|
|
780
|
+
const proseParts = preProse.split(/(?<=[.!?])\s+(?=[A-Z])/);
|
|
781
|
+
let proseStringCursor = 0;
|
|
782
|
+
for (const part of proseParts) {
|
|
783
|
+
if (part.trim().length === 0)
|
|
784
|
+
continue;
|
|
785
|
+
const partStringStart = preProse.indexOf(part, proseStringCursor);
|
|
786
|
+
if (partStringStart !== -1) {
|
|
787
|
+
const partByteStart = byteCursor + stringIndexToByteOffset(preProse, partStringStart);
|
|
788
|
+
const partByteEnd = partByteStart + getByteLength(part);
|
|
789
|
+
results.push({ content: part, start: partByteStart, end: partByteEnd, timestamp: extractTimestamp(part) });
|
|
790
|
+
proseStringCursor = partStringStart + part.length;
|
|
791
|
+
}
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
}
|
|
795
|
+
// The code fence itself (will be typed as 'code' in molecule enrichment)
|
|
796
|
+
results.push({ content: fence.match, start: fence.startByte, end: fence.endByte, timestamp: extractTimestamp(fence.match) });
|
|
797
|
+
stringCursor = fenceStringStart + fence.match.length;
|
|
798
|
+
byteCursor = fence.endByte;
|
|
799
|
+
}
|
|
800
|
+
// Post-fence prose (after last fence)
|
|
801
|
+
if (stringCursor < text.length) {
|
|
802
|
+
const postProse = text.substring(stringCursor);
|
|
803
|
+
if (postProse.trim().length > 0) {
|
|
804
|
+
const proseParts = postProse.split(/(?<=[.!?])\s+(?=[A-Z])/);
|
|
805
|
+
let proseStringCursor = 0;
|
|
806
|
+
for (const part of proseParts) {
|
|
807
|
+
if (part.trim().length === 0)
|
|
808
|
+
continue;
|
|
809
|
+
const partStringStart = postProse.indexOf(part, proseStringCursor);
|
|
810
|
+
if (partStringStart !== -1) {
|
|
811
|
+
const partByteStart = byteCursor + stringIndexToByteOffset(postProse, partStringStart);
|
|
812
|
+
const partByteEnd = partByteStart + getByteLength(part);
|
|
813
|
+
results.push({ content: part, start: partByteStart, end: partByteEnd, timestamp: extractTimestamp(part) });
|
|
814
|
+
proseStringCursor = partStringStart + part.length;
|
|
815
|
+
}
|
|
816
|
+
}
|
|
817
|
+
}
|
|
818
|
+
}
|
|
819
|
+
}
|
|
820
|
+
else {
|
|
821
|
+
// No code fences - standard sentence splitting
|
|
822
|
+
const parts = text.split(/(?<=[.!?])\s+(?=[A-Z])/);
|
|
823
|
+
let searchStringCursor = 0;
|
|
824
|
+
for (const part of parts) {
|
|
825
|
+
if (part.trim().length === 0)
|
|
826
|
+
continue;
|
|
827
|
+
const realStringStart = text.indexOf(part, searchStringCursor); // Find next occurrence
|
|
828
|
+
if (realStringStart !== -1) {
|
|
829
|
+
const realByteStart = stringIndexToByteOffset(text, realStringStart);
|
|
830
|
+
const realByteEnd = realByteStart + getByteLength(part);
|
|
831
|
+
results.push({ content: part, start: realByteStart, end: realByteEnd, timestamp: extractTimestamp(part) });
|
|
832
|
+
searchStringCursor = realStringStart + part.length;
|
|
833
|
+
}
|
|
834
|
+
}
|
|
835
|
+
}
|
|
836
|
+
}
|
|
837
|
+
// --- ENFORCE SIZE LIMIT (POST-PROCESS) ---
|
|
838
|
+
const finalResults = [];
|
|
839
|
+
for (const item of results) {
|
|
840
|
+
const itemByteLen = getByteLength(item.content);
|
|
841
|
+
if (itemByteLen <= maxSize) {
|
|
842
|
+
finalResults.push(item);
|
|
843
|
+
}
|
|
844
|
+
else {
|
|
845
|
+
// Force split large molecules by byte size
|
|
846
|
+
let currentStart = item.start;
|
|
847
|
+
let remaining = item.content;
|
|
848
|
+
while (remaining.length > 0) {
|
|
849
|
+
// Find a safe split point that doesn't exceed maxSize bytes
|
|
850
|
+
let splitPoint = remaining.length;
|
|
851
|
+
let chunkByteLen = getByteLength(remaining);
|
|
852
|
+
// Binary search for the right split point if we're over the limit
|
|
853
|
+
if (chunkByteLen > maxSize) {
|
|
854
|
+
let low = 0;
|
|
855
|
+
let high = remaining.length;
|
|
856
|
+
while (low < high) {
|
|
857
|
+
const mid = Math.floor((low + high + 1) / 2);
|
|
858
|
+
const testChunk = remaining.substring(0, mid);
|
|
859
|
+
const testByteLen = getByteLength(testChunk);
|
|
860
|
+
if (testByteLen <= maxSize) {
|
|
861
|
+
low = mid;
|
|
862
|
+
}
|
|
863
|
+
else {
|
|
864
|
+
high = mid - 1;
|
|
865
|
+
}
|
|
866
|
+
}
|
|
867
|
+
// Walk back to nearest newline to avoid splitting mid-line
|
|
868
|
+
let newlinePos = remaining.lastIndexOf('\n', low);
|
|
869
|
+
splitPoint = newlinePos > 0 ? newlinePos + 1 : low;
|
|
870
|
+
}
|
|
871
|
+
const chunk = remaining.substring(0, splitPoint);
|
|
872
|
+
const chunkBytes = getByteLength(chunk);
|
|
873
|
+
// Inherit timestamp for all chunks if the original item had one
|
|
874
|
+
finalResults.push({
|
|
875
|
+
content: chunk,
|
|
876
|
+
start: currentStart,
|
|
877
|
+
end: currentStart + chunkBytes,
|
|
878
|
+
timestamp: item.timestamp
|
|
879
|
+
});
|
|
880
|
+
remaining = remaining.substring(splitPoint);
|
|
881
|
+
currentStart += chunkBytes;
|
|
882
|
+
}
|
|
883
|
+
}
|
|
884
|
+
}
|
|
885
|
+
return finalResults;
|
|
886
|
+
}
|
|
887
|
+
detectMoleculeType(text, filePath) {
|
|
888
|
+
// 1. File Extension hints
|
|
889
|
+
if (filePath.endsWith('.csv') || filePath.endsWith('.json') || filePath.endsWith('.yaml') || filePath.endsWith('.yml'))
|
|
890
|
+
return 'data';
|
|
891
|
+
if (filePath.match(/\.(ts|js|py|rs|go|cpp|h|c)$/))
|
|
892
|
+
return 'code';
|
|
893
|
+
// 2. Large file safety: treat files > 5MB as data to avoid regex timeout
|
|
894
|
+
if (text.length > 5 * 1024 * 1024) {
|
|
895
|
+
console.log(`[Atomizer] Large file (${(text.length / (1024 * 1024)).toFixed(1)}MB) - using data strategy for performance`);
|
|
896
|
+
return 'data';
|
|
897
|
+
}
|
|
898
|
+
// 3. Content Heuristics
|
|
899
|
+
if (text.trim().startsWith('|') && text.includes('|'))
|
|
900
|
+
return 'data'; // Markdown Table row
|
|
901
|
+
if (text.includes('```') || text.includes('function ') || text.includes('const ') || text.includes('import '))
|
|
902
|
+
return 'code';
|
|
903
|
+
return 'prose';
|
|
904
|
+
}
|
|
905
|
+
/**
|
|
906
|
+
* Extract earliest timestamp from content for temporal ordering
|
|
907
|
+
* Scans for multiple timestamp formats and returns the earliest found
|
|
908
|
+
*/
|
|
909
|
+
extractEarliestTimestamp(chunk, fallbackTimestamp) {
|
|
910
|
+
const timestamps = [];
|
|
911
|
+
// ISO timestamps: 2026-01-25T03:43:54.405Z or 2026-01-25 03:43:54
|
|
912
|
+
const isoRegex = /\b(\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}(?:\.\d{3})?Z?)\b/g;
|
|
913
|
+
let isoMatch;
|
|
914
|
+
while ((isoMatch = isoRegex.exec(chunk)) !== null) {
|
|
915
|
+
const ts = Date.parse(isoMatch[1]);
|
|
916
|
+
if (!isNaN(ts))
|
|
917
|
+
timestamps.push(ts);
|
|
918
|
+
}
|
|
919
|
+
// YYYY-MM-DD
|
|
920
|
+
const dateRegex = /\b(20[2-9]\d-\d{2}-\d{2})\b/g;
|
|
921
|
+
let dateMatch;
|
|
922
|
+
while ((dateMatch = dateRegex.exec(chunk)) !== null) {
|
|
923
|
+
const ts = Date.parse(dateMatch[1]);
|
|
924
|
+
if (!isNaN(ts))
|
|
925
|
+
timestamps.push(ts);
|
|
926
|
+
}
|
|
927
|
+
// MM/DD/YYYY or DD/MM/YYYY
|
|
928
|
+
const usDateRegex = /\b(\d{1,2}\/\d{1,2}\/\d{4})\b/g;
|
|
929
|
+
let usMatch;
|
|
930
|
+
while ((usMatch = usDateRegex.exec(chunk)) !== null) {
|
|
931
|
+
const ts = Date.parse(usMatch[1]);
|
|
932
|
+
if (!isNaN(ts))
|
|
933
|
+
timestamps.push(ts);
|
|
934
|
+
}
|
|
935
|
+
// Return earliest timestamp found, or fallback
|
|
936
|
+
if (timestamps.length > 0) {
|
|
937
|
+
return Math.min(...timestamps);
|
|
938
|
+
}
|
|
939
|
+
return fallbackTimestamp || Date.now();
|
|
940
|
+
}
|
|
941
|
+
extractNumericData(text) {
|
|
942
|
+
// Examples: "1500 PSI", "15%", "$10.50"
|
|
943
|
+
const matches = text.match(/([\d,]+\.?\d*)\s?([A-Za-z%]+)?/g);
|
|
944
|
+
if (!matches)
|
|
945
|
+
return null;
|
|
946
|
+
let bestCandidate = null;
|
|
947
|
+
for (const m of matches) {
|
|
948
|
+
const valStr = m.match(/[\d,]+\.?\d*/)?.[0]?.replace(/,/g, '');
|
|
949
|
+
const unit = m.match(/[A-Za-z%]+/)?.[0];
|
|
950
|
+
if (valStr) {
|
|
951
|
+
const val = parseFloat(valStr);
|
|
952
|
+
// Filter out likely years (1900-2100) if no unit, to avoid false positives in history
|
|
953
|
+
if ((val >= 1900 && val <= 2100) && Number.isInteger(val) && !unit)
|
|
954
|
+
continue;
|
|
955
|
+
if (unit || !bestCandidate) {
|
|
956
|
+
bestCandidate = { value: val, unit: unit };
|
|
957
|
+
}
|
|
958
|
+
}
|
|
959
|
+
}
|
|
960
|
+
return bestCandidate;
|
|
961
|
+
}
|
|
962
|
+
generateSimHash(text) {
|
|
963
|
+
// Use @rbalchii/native-fingerprint if available
|
|
964
|
+
if (nativeFingerprint) {
|
|
965
|
+
try {
|
|
966
|
+
return nativeFingerprint(text);
|
|
967
|
+
}
|
|
968
|
+
catch { /* fall through to JS fallback */ }
|
|
969
|
+
}
|
|
970
|
+
// JS Fallback: Simple Jenkins Hash
|
|
971
|
+
let hash = 0;
|
|
972
|
+
if (text.length === 0)
|
|
973
|
+
return "0";
|
|
974
|
+
for (let i = 0; i < text.length; i++) {
|
|
975
|
+
const char = text.charCodeAt(i);
|
|
976
|
+
hash = ((hash << 5) - hash) + char;
|
|
977
|
+
hash = hash & hash; // Convert to 32bit integer
|
|
978
|
+
}
|
|
979
|
+
return Math.abs(hash).toString(16);
|
|
980
|
+
}
|
|
981
|
+
}
|
|
982
|
+
//# sourceMappingURL=atomizer-service.js.map
|