@rbalchii/anchor-engine 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (539) hide show
  1. package/LICENSE +609 -0
  2. package/README.md +317 -0
  3. package/anchor.bat +5 -0
  4. package/docs/API.md +314 -0
  5. package/docs/DEPLOYMENT.md +448 -0
  6. package/docs/INDEX.md +226 -0
  7. package/docs/STAR_Whitepaper_Executive.md +216 -0
  8. package/docs/TROUBLESHOOTING.md +535 -0
  9. package/docs/archive/GIT_BACKUP_VERIFICATION.md +297 -0
  10. package/docs/archive/adoption-guide.md +264 -0
  11. package/docs/archive/adoption-preparation.md +179 -0
  12. package/docs/archive/agent-harness-integration.md +227 -0
  13. package/docs/archive/api-reference.md +106 -0
  14. package/docs/archive/api_flows_diagram.md +118 -0
  15. package/docs/archive/architecture.md +410 -0
  16. package/docs/archive/architecture_diagram.md +174 -0
  17. package/docs/archive/broader-adoption-preparation.md +175 -0
  18. package/docs/archive/browser-paradigm-architecture.md +163 -0
  19. package/docs/archive/chat-integration.md +124 -0
  20. package/docs/archive/community-adoption-materials.md +103 -0
  21. package/docs/archive/community-adoption.md +147 -0
  22. package/docs/archive/comparison-with-siloed-solutions.md +192 -0
  23. package/docs/archive/comprehensive-docs.md +156 -0
  24. package/docs/archive/data_flow_diagram.md +251 -0
  25. package/docs/archive/enhancement-implementation-summary.md +146 -0
  26. package/docs/archive/evolution-summary.md +141 -0
  27. package/docs/archive/ingestion_pipeline_diagram.md +198 -0
  28. package/docs/archive/native-module-profiling-results.md +135 -0
  29. package/docs/archive/positioning-document.md +158 -0
  30. package/docs/archive/positioning.md +175 -0
  31. package/docs/archive/query-builder-documentation.md +218 -0
  32. package/docs/archive/quick-reference.md +40 -0
  33. package/docs/archive/quickstart.md +63 -0
  34. package/docs/archive/relationship-narrative-discovery.md +141 -0
  35. package/docs/archive/search-logic-improvement-plan.md +336 -0
  36. package/docs/archive/search_architecture_diagram.md +212 -0
  37. package/docs/archive/semantic-architecture-guide.md +97 -0
  38. package/docs/archive/sequence-diagrams.md +128 -0
  39. package/docs/archive/system_components_diagram.md +296 -0
  40. package/docs/archive/test-framework-integration.md +109 -0
  41. package/docs/archive/testing-framework-documentation.md +397 -0
  42. package/docs/archive/testing-framework-summary.md +121 -0
  43. package/docs/archive/testing-framework.md +377 -0
  44. package/docs/archive/ui-architecture.md +75 -0
  45. package/docs/arxiv/BIBLIOGRAPHY.bib +145 -0
  46. package/docs/arxiv/RELATED_WORK.tex +39 -0
  47. package/docs/arxiv/compile.bat +48 -0
  48. package/docs/arxiv/joss_response.md +33 -0
  49. package/docs/arxiv/prepare-submission.bat +46 -0
  50. package/docs/arxiv/review.md +128 -0
  51. package/docs/arxiv/star-whitepaper.tex +657 -0
  52. package/docs/code-patterns.md +289 -0
  53. package/docs/whitepaper.md +445 -0
  54. package/engine/dist/agent/runtime.d.ts +41 -0
  55. package/engine/dist/agent/runtime.d.ts.map +1 -0
  56. package/engine/dist/agent/runtime.js +73 -0
  57. package/engine/dist/agent/runtime.js.map +1 -0
  58. package/engine/dist/commands/audit-tags.d.ts +14 -0
  59. package/engine/dist/commands/audit-tags.d.ts.map +1 -0
  60. package/engine/dist/commands/audit-tags.js +180 -0
  61. package/engine/dist/commands/audit-tags.js.map +1 -0
  62. package/engine/dist/commands/distill.d.ts +19 -0
  63. package/engine/dist/commands/distill.d.ts.map +1 -0
  64. package/engine/dist/commands/distill.js +114 -0
  65. package/engine/dist/commands/distill.js.map +1 -0
  66. package/engine/dist/commands/generate-synonyms.d.ts +14 -0
  67. package/engine/dist/commands/generate-synonyms.d.ts.map +1 -0
  68. package/engine/dist/commands/generate-synonyms.js +91 -0
  69. package/engine/dist/commands/generate-synonyms.js.map +1 -0
  70. package/engine/dist/config/index.d.ts +115 -0
  71. package/engine/dist/config/index.d.ts.map +1 -0
  72. package/engine/dist/config/index.js +326 -0
  73. package/engine/dist/config/index.js.map +1 -0
  74. package/engine/dist/config/max-recall-config.d.ts +102 -0
  75. package/engine/dist/config/max-recall-config.d.ts.map +1 -0
  76. package/engine/dist/config/max-recall-config.js +102 -0
  77. package/engine/dist/config/max-recall-config.js.map +1 -0
  78. package/engine/dist/config/paths.d.ts +40 -0
  79. package/engine/dist/config/paths.d.ts.map +1 -0
  80. package/engine/dist/config/paths.js +49 -0
  81. package/engine/dist/config/paths.js.map +1 -0
  82. package/engine/dist/core/batch.d.ts +19 -0
  83. package/engine/dist/core/batch.d.ts.map +1 -0
  84. package/engine/dist/core/batch.js +37 -0
  85. package/engine/dist/core/batch.js.map +1 -0
  86. package/engine/dist/core/db.d.ts +58 -0
  87. package/engine/dist/core/db.d.ts.map +1 -0
  88. package/engine/dist/core/db.js +563 -0
  89. package/engine/dist/core/db.js.map +1 -0
  90. package/engine/dist/core/inference/ChatWorker.d.ts +2 -0
  91. package/engine/dist/core/inference/ChatWorker.d.ts.map +1 -0
  92. package/engine/dist/core/inference/ChatWorker.js +28 -0
  93. package/engine/dist/core/inference/ChatWorker.js.map +1 -0
  94. package/engine/dist/core/inference/context_manager.d.ts +49 -0
  95. package/engine/dist/core/inference/context_manager.d.ts.map +1 -0
  96. package/engine/dist/core/inference/context_manager.js +199 -0
  97. package/engine/dist/core/inference/context_manager.js.map +1 -0
  98. package/engine/dist/core/inference/llamaLoaderWorker.d.ts +2 -0
  99. package/engine/dist/core/inference/llamaLoaderWorker.d.ts.map +1 -0
  100. package/engine/dist/core/inference/llamaLoaderWorker.js +23 -0
  101. package/engine/dist/core/inference/llamaLoaderWorker.js.map +1 -0
  102. package/engine/dist/core/vector.d.ts +40 -0
  103. package/engine/dist/core/vector.d.ts.map +1 -0
  104. package/engine/dist/core/vector.js +167 -0
  105. package/engine/dist/core/vector.js.map +1 -0
  106. package/engine/dist/index.d.ts +4 -0
  107. package/engine/dist/index.d.ts.map +1 -0
  108. package/engine/dist/index.js +400 -0
  109. package/engine/dist/index.js.map +1 -0
  110. package/engine/dist/middleware/auth.d.ts +14 -0
  111. package/engine/dist/middleware/auth.d.ts.map +1 -0
  112. package/engine/dist/middleware/auth.js +44 -0
  113. package/engine/dist/middleware/auth.js.map +1 -0
  114. package/engine/dist/middleware/request-tracing.d.ts +29 -0
  115. package/engine/dist/middleware/request-tracing.d.ts.map +1 -0
  116. package/engine/dist/middleware/request-tracing.js +115 -0
  117. package/engine/dist/middleware/request-tracing.js.map +1 -0
  118. package/engine/dist/middleware/validate.d.ts +30 -0
  119. package/engine/dist/middleware/validate.d.ts.map +1 -0
  120. package/engine/dist/middleware/validate.js +117 -0
  121. package/engine/dist/middleware/validate.js.map +1 -0
  122. package/engine/dist/native/index.d.ts +106 -0
  123. package/engine/dist/native/index.d.ts.map +1 -0
  124. package/engine/dist/native/index.js +230 -0
  125. package/engine/dist/native/index.js.map +1 -0
  126. package/engine/dist/native/types.d.ts +45 -0
  127. package/engine/dist/native/types.d.ts.map +1 -0
  128. package/engine/dist/native/types.js +6 -0
  129. package/engine/dist/native/types.js.map +1 -0
  130. package/engine/dist/profiling/atomization-profiling.d.ts +8 -0
  131. package/engine/dist/profiling/atomization-profiling.d.ts.map +1 -0
  132. package/engine/dist/profiling/atomization-profiling.js +108 -0
  133. package/engine/dist/profiling/atomization-profiling.js.map +1 -0
  134. package/engine/dist/profiling/bottleneck-identification.d.ts +8 -0
  135. package/engine/dist/profiling/bottleneck-identification.d.ts.map +1 -0
  136. package/engine/dist/profiling/bottleneck-identification.js +249 -0
  137. package/engine/dist/profiling/bottleneck-identification.js.map +1 -0
  138. package/engine/dist/profiling/content-sanitization-profiling.d.ts +12 -0
  139. package/engine/dist/profiling/content-sanitization-profiling.d.ts.map +1 -0
  140. package/engine/dist/profiling/content-sanitization-profiling.js +266 -0
  141. package/engine/dist/profiling/content-sanitization-profiling.js.map +1 -0
  142. package/engine/dist/profiling/simhash-profiling.d.ts +11 -0
  143. package/engine/dist/profiling/simhash-profiling.d.ts.map +1 -0
  144. package/engine/dist/profiling/simhash-profiling.js +168 -0
  145. package/engine/dist/profiling/simhash-profiling.js.map +1 -0
  146. package/engine/dist/routes/api.d.ts +9 -0
  147. package/engine/dist/routes/api.d.ts.map +1 -0
  148. package/engine/dist/routes/api.js +37 -0
  149. package/engine/dist/routes/api.js.map +1 -0
  150. package/engine/dist/routes/enhanced-api.d.ts +9 -0
  151. package/engine/dist/routes/enhanced-api.d.ts.map +1 -0
  152. package/engine/dist/routes/enhanced-api.js +139 -0
  153. package/engine/dist/routes/enhanced-api.js.map +1 -0
  154. package/engine/dist/routes/health.d.ts +8 -0
  155. package/engine/dist/routes/health.d.ts.map +1 -0
  156. package/engine/dist/routes/health.js +89 -0
  157. package/engine/dist/routes/health.js.map +1 -0
  158. package/engine/dist/routes/monitoring.d.ts +8 -0
  159. package/engine/dist/routes/monitoring.d.ts.map +1 -0
  160. package/engine/dist/routes/monitoring.js +509 -0
  161. package/engine/dist/routes/monitoring.js.map +1 -0
  162. package/engine/dist/routes/v1/admin.d.ts +3 -0
  163. package/engine/dist/routes/v1/admin.d.ts.map +1 -0
  164. package/engine/dist/routes/v1/admin.js +261 -0
  165. package/engine/dist/routes/v1/admin.js.map +1 -0
  166. package/engine/dist/routes/v1/atoms.d.ts +3 -0
  167. package/engine/dist/routes/v1/atoms.d.ts.map +1 -0
  168. package/engine/dist/routes/v1/atoms.js +172 -0
  169. package/engine/dist/routes/v1/atoms.js.map +1 -0
  170. package/engine/dist/routes/v1/backup.d.ts +3 -0
  171. package/engine/dist/routes/v1/backup.d.ts.map +1 -0
  172. package/engine/dist/routes/v1/backup.js +100 -0
  173. package/engine/dist/routes/v1/backup.js.map +1 -0
  174. package/engine/dist/routes/v1/git.d.ts +3 -0
  175. package/engine/dist/routes/v1/git.d.ts.map +1 -0
  176. package/engine/dist/routes/v1/git.js +316 -0
  177. package/engine/dist/routes/v1/git.js.map +1 -0
  178. package/engine/dist/routes/v1/ingest.d.ts +3 -0
  179. package/engine/dist/routes/v1/ingest.d.ts.map +1 -0
  180. package/engine/dist/routes/v1/ingest.js +66 -0
  181. package/engine/dist/routes/v1/ingest.js.map +1 -0
  182. package/engine/dist/routes/v1/memory.d.ts +14 -0
  183. package/engine/dist/routes/v1/memory.d.ts.map +1 -0
  184. package/engine/dist/routes/v1/memory.js +87 -0
  185. package/engine/dist/routes/v1/memory.js.map +1 -0
  186. package/engine/dist/routes/v1/research.d.ts +3 -0
  187. package/engine/dist/routes/v1/research.d.ts.map +1 -0
  188. package/engine/dist/routes/v1/research.js +109 -0
  189. package/engine/dist/routes/v1/research.js.map +1 -0
  190. package/engine/dist/routes/v1/search.d.ts +3 -0
  191. package/engine/dist/routes/v1/search.d.ts.map +1 -0
  192. package/engine/dist/routes/v1/search.js +180 -0
  193. package/engine/dist/routes/v1/search.js.map +1 -0
  194. package/engine/dist/routes/v1/settings.d.ts +8 -0
  195. package/engine/dist/routes/v1/settings.d.ts.map +1 -0
  196. package/engine/dist/routes/v1/settings.js +211 -0
  197. package/engine/dist/routes/v1/settings.js.map +1 -0
  198. package/engine/dist/routes/v1/system.d.ts +3 -0
  199. package/engine/dist/routes/v1/system.d.ts.map +1 -0
  200. package/engine/dist/routes/v1/system.js +326 -0
  201. package/engine/dist/routes/v1/system.js.map +1 -0
  202. package/engine/dist/routes/v1/tags.d.ts +3 -0
  203. package/engine/dist/routes/v1/tags.d.ts.map +1 -0
  204. package/engine/dist/routes/v1/tags.js +102 -0
  205. package/engine/dist/routes/v1/tags.js.map +1 -0
  206. package/engine/dist/server-8080.d.ts +2 -0
  207. package/engine/dist/server-8080.d.ts.map +1 -0
  208. package/engine/dist/server-8080.js +74 -0
  209. package/engine/dist/server-8080.js.map +1 -0
  210. package/engine/dist/services/backup/backup-restore.d.ts +37 -0
  211. package/engine/dist/services/backup/backup-restore.d.ts.map +1 -0
  212. package/engine/dist/services/backup/backup-restore.js +385 -0
  213. package/engine/dist/services/backup/backup-restore.js.map +1 -0
  214. package/engine/dist/services/backup/backup.d.ts +14 -0
  215. package/engine/dist/services/backup/backup.d.ts.map +1 -0
  216. package/engine/dist/services/backup/backup.js +442 -0
  217. package/engine/dist/services/backup/backup.js.map +1 -0
  218. package/engine/dist/services/distillation/radial-distiller-v2.d.ts +127 -0
  219. package/engine/dist/services/distillation/radial-distiller-v2.d.ts.map +1 -0
  220. package/engine/dist/services/distillation/radial-distiller-v2.js +503 -0
  221. package/engine/dist/services/distillation/radial-distiller-v2.js.map +1 -0
  222. package/engine/dist/services/distillation/radial-distiller.d.ts +63 -0
  223. package/engine/dist/services/distillation/radial-distiller.d.ts.map +1 -0
  224. package/engine/dist/services/distillation/radial-distiller.js +394 -0
  225. package/engine/dist/services/distillation/radial-distiller.js.map +1 -0
  226. package/engine/dist/services/health-check-enhanced.d.ts +89 -0
  227. package/engine/dist/services/health-check-enhanced.d.ts.map +1 -0
  228. package/engine/dist/services/health-check-enhanced.js +417 -0
  229. package/engine/dist/services/health-check-enhanced.js.map +1 -0
  230. package/engine/dist/services/idle-manager.d.ts +56 -0
  231. package/engine/dist/services/idle-manager.d.ts.map +1 -0
  232. package/engine/dist/services/idle-manager.js +210 -0
  233. package/engine/dist/services/idle-manager.js.map +1 -0
  234. package/engine/dist/services/inference/inference-service.d.ts +27 -0
  235. package/engine/dist/services/inference/inference-service.d.ts.map +1 -0
  236. package/engine/dist/services/inference/inference-service.js +89 -0
  237. package/engine/dist/services/inference/inference-service.js.map +1 -0
  238. package/engine/dist/services/inference/inference.d.ts +59 -0
  239. package/engine/dist/services/inference/inference.d.ts.map +1 -0
  240. package/engine/dist/services/inference/inference.js +131 -0
  241. package/engine/dist/services/inference/inference.js.map +1 -0
  242. package/engine/dist/services/ingest/atomizer-service.d.ts +74 -0
  243. package/engine/dist/services/ingest/atomizer-service.d.ts.map +1 -0
  244. package/engine/dist/services/ingest/atomizer-service.js +982 -0
  245. package/engine/dist/services/ingest/atomizer-service.js.map +1 -0
  246. package/engine/dist/services/ingest/content-cleaner.d.ts +43 -0
  247. package/engine/dist/services/ingest/content-cleaner.d.ts.map +1 -0
  248. package/engine/dist/services/ingest/content-cleaner.js +166 -0
  249. package/engine/dist/services/ingest/content-cleaner.js.map +1 -0
  250. package/engine/dist/services/ingest/github-ingest-service.d.ts +103 -0
  251. package/engine/dist/services/ingest/github-ingest-service.d.ts.map +1 -0
  252. package/engine/dist/services/ingest/github-ingest-service.js +537 -0
  253. package/engine/dist/services/ingest/github-ingest-service.js.map +1 -0
  254. package/engine/dist/services/ingest/ingest-atomic.d.ts +16 -0
  255. package/engine/dist/services/ingest/ingest-atomic.d.ts.map +1 -0
  256. package/engine/dist/services/ingest/ingest-atomic.js +437 -0
  257. package/engine/dist/services/ingest/ingest-atomic.js.map +1 -0
  258. package/engine/dist/services/ingest/ingest.d.ts +50 -0
  259. package/engine/dist/services/ingest/ingest.d.ts.map +1 -0
  260. package/engine/dist/services/ingest/ingest.js +230 -0
  261. package/engine/dist/services/ingest/ingest.js.map +1 -0
  262. package/engine/dist/services/ingest/watchdog.d.ts +31 -0
  263. package/engine/dist/services/ingest/watchdog.d.ts.map +1 -0
  264. package/engine/dist/services/ingest/watchdog.js +400 -0
  265. package/engine/dist/services/ingest/watchdog.js.map +1 -0
  266. package/engine/dist/services/llm/context.d.ts +6 -0
  267. package/engine/dist/services/llm/context.d.ts.map +1 -0
  268. package/engine/dist/services/llm/context.js +80 -0
  269. package/engine/dist/services/llm/context.js.map +1 -0
  270. package/engine/dist/services/llm/provider.d.ts +23 -0
  271. package/engine/dist/services/llm/provider.d.ts.map +1 -0
  272. package/engine/dist/services/llm/provider.js +338 -0
  273. package/engine/dist/services/llm/provider.js.map +1 -0
  274. package/engine/dist/services/llm/reader.d.ts +12 -0
  275. package/engine/dist/services/llm/reader.d.ts.map +1 -0
  276. package/engine/dist/services/llm/reader.js +40 -0
  277. package/engine/dist/services/llm/reader.js.map +1 -0
  278. package/engine/dist/services/mirror/mirror.d.ts +28 -0
  279. package/engine/dist/services/mirror/mirror.d.ts.map +1 -0
  280. package/engine/dist/services/mirror/mirror.js +208 -0
  281. package/engine/dist/services/mirror/mirror.js.map +1 -0
  282. package/engine/dist/services/nlp/nlp-service.d.ts +70 -0
  283. package/engine/dist/services/nlp/nlp-service.d.ts.map +1 -0
  284. package/engine/dist/services/nlp/nlp-service.js +151 -0
  285. package/engine/dist/services/nlp/nlp-service.js.map +1 -0
  286. package/engine/dist/services/nlp/query-parser.d.ts +9 -0
  287. package/engine/dist/services/nlp/query-parser.d.ts.map +1 -0
  288. package/engine/dist/services/nlp/query-parser.js +29 -0
  289. package/engine/dist/services/nlp/query-parser.js.map +1 -0
  290. package/engine/dist/services/query-builder/DataFrame.d.ts +95 -0
  291. package/engine/dist/services/query-builder/DataFrame.d.ts.map +1 -0
  292. package/engine/dist/services/query-builder/DataFrame.js +263 -0
  293. package/engine/dist/services/query-builder/DataFrame.js.map +1 -0
  294. package/engine/dist/services/query-builder/QueryBuilder.d.ts +106 -0
  295. package/engine/dist/services/query-builder/QueryBuilder.d.ts.map +1 -0
  296. package/engine/dist/services/query-builder/QueryBuilder.js +235 -0
  297. package/engine/dist/services/query-builder/QueryBuilder.js.map +1 -0
  298. package/engine/dist/services/query-builder/utils/export.d.ts +11 -0
  299. package/engine/dist/services/query-builder/utils/export.d.ts.map +1 -0
  300. package/engine/dist/services/query-builder/utils/export.js +130 -0
  301. package/engine/dist/services/query-builder/utils/export.js.map +1 -0
  302. package/engine/dist/services/research/researcher.d.ts +15 -0
  303. package/engine/dist/services/research/researcher.d.ts.map +1 -0
  304. package/engine/dist/services/research/researcher.js +123 -0
  305. package/engine/dist/services/research/researcher.js.map +1 -0
  306. package/engine/dist/services/scribe/scribe.d.ts +43 -0
  307. package/engine/dist/services/scribe/scribe.d.ts.map +1 -0
  308. package/engine/dist/services/scribe/scribe.js +135 -0
  309. package/engine/dist/services/scribe/scribe.js.map +1 -0
  310. package/engine/dist/services/search/bright-nodes.d.ts +41 -0
  311. package/engine/dist/services/search/bright-nodes.d.ts.map +1 -0
  312. package/engine/dist/services/search/bright-nodes.js +117 -0
  313. package/engine/dist/services/search/bright-nodes.js.map +1 -0
  314. package/engine/dist/services/search/context-inflator.d.ts +63 -0
  315. package/engine/dist/services/search/context-inflator.d.ts.map +1 -0
  316. package/engine/dist/services/search/context-inflator.js +649 -0
  317. package/engine/dist/services/search/context-inflator.js.map +1 -0
  318. package/engine/dist/services/search/context-manager.d.ts +34 -0
  319. package/engine/dist/services/search/context-manager.d.ts.map +1 -0
  320. package/engine/dist/services/search/context-manager.js +124 -0
  321. package/engine/dist/services/search/context-manager.js.map +1 -0
  322. package/engine/dist/services/search/distributed-query.d.ts +38 -0
  323. package/engine/dist/services/search/distributed-query.d.ts.map +1 -0
  324. package/engine/dist/services/search/distributed-query.js +105 -0
  325. package/engine/dist/services/search/distributed-query.js.map +1 -0
  326. package/engine/dist/services/search/explore.d.ts +73 -0
  327. package/engine/dist/services/search/explore.d.ts.map +1 -0
  328. package/engine/dist/services/search/explore.js +388 -0
  329. package/engine/dist/services/search/explore.js.map +1 -0
  330. package/engine/dist/services/search/graph-context-serializer.d.ts +76 -0
  331. package/engine/dist/services/search/graph-context-serializer.d.ts.map +1 -0
  332. package/engine/dist/services/search/graph-context-serializer.js +435 -0
  333. package/engine/dist/services/search/graph-context-serializer.js.map +1 -0
  334. package/engine/dist/services/search/llm-context-formatter.d.ts +122 -0
  335. package/engine/dist/services/search/llm-context-formatter.d.ts.map +1 -0
  336. package/engine/dist/services/search/llm-context-formatter.js +394 -0
  337. package/engine/dist/services/search/llm-context-formatter.js.map +1 -0
  338. package/engine/dist/services/search/physics-tag-walker.d.ts +115 -0
  339. package/engine/dist/services/search/physics-tag-walker.d.ts.map +1 -0
  340. package/engine/dist/services/search/physics-tag-walker.js +611 -0
  341. package/engine/dist/services/search/physics-tag-walker.js.map +1 -0
  342. package/engine/dist/services/search/query-parser.d.ts +66 -0
  343. package/engine/dist/services/search/query-parser.d.ts.map +1 -0
  344. package/engine/dist/services/search/query-parser.js +346 -0
  345. package/engine/dist/services/search/query-parser.js.map +1 -0
  346. package/engine/dist/services/search/search-utils.d.ts +100 -0
  347. package/engine/dist/services/search/search-utils.d.ts.map +1 -0
  348. package/engine/dist/services/search/search-utils.js +473 -0
  349. package/engine/dist/services/search/search-utils.js.map +1 -0
  350. package/engine/dist/services/search/search.d.ts +116 -0
  351. package/engine/dist/services/search/search.d.ts.map +1 -0
  352. package/engine/dist/services/search/search.js +1286 -0
  353. package/engine/dist/services/search/search.js.map +1 -0
  354. package/engine/dist/services/search/sovereign-system-prompt.d.ts +48 -0
  355. package/engine/dist/services/search/sovereign-system-prompt.d.ts.map +1 -0
  356. package/engine/dist/services/search/sovereign-system-prompt.js +101 -0
  357. package/engine/dist/services/search/sovereign-system-prompt.js.map +1 -0
  358. package/engine/dist/services/search/streaming-search.d.ts +51 -0
  359. package/engine/dist/services/search/streaming-search.d.ts.map +1 -0
  360. package/engine/dist/services/search/streaming-search.js +94 -0
  361. package/engine/dist/services/search/streaming-search.js.map +1 -0
  362. package/engine/dist/services/semantic/semantic-ingestion-service.d.ts +53 -0
  363. package/engine/dist/services/semantic/semantic-ingestion-service.d.ts.map +1 -0
  364. package/engine/dist/services/semantic/semantic-ingestion-service.js +625 -0
  365. package/engine/dist/services/semantic/semantic-ingestion-service.js.map +1 -0
  366. package/engine/dist/services/semantic/semantic-molecule-processor.d.ts +68 -0
  367. package/engine/dist/services/semantic/semantic-molecule-processor.d.ts.map +1 -0
  368. package/engine/dist/services/semantic/semantic-molecule-processor.js +176 -0
  369. package/engine/dist/services/semantic/semantic-molecule-processor.js.map +1 -0
  370. package/engine/dist/services/semantic/semantic-search.d.ts +52 -0
  371. package/engine/dist/services/semantic/semantic-search.d.ts.map +1 -0
  372. package/engine/dist/services/semantic/semantic-search.js +649 -0
  373. package/engine/dist/services/semantic/semantic-search.js.map +1 -0
  374. package/engine/dist/services/semantic/semantic-tag-deriver.d.ts +64 -0
  375. package/engine/dist/services/semantic/semantic-tag-deriver.d.ts.map +1 -0
  376. package/engine/dist/services/semantic/semantic-tag-deriver.js +191 -0
  377. package/engine/dist/services/semantic/semantic-tag-deriver.js.map +1 -0
  378. package/engine/dist/services/semantic/types/semantic.d.ts +26 -0
  379. package/engine/dist/services/semantic/types/semantic.d.ts.map +1 -0
  380. package/engine/dist/services/semantic/types/semantic.js +7 -0
  381. package/engine/dist/services/semantic/types/semantic.js.map +1 -0
  382. package/engine/dist/services/synonyms/auto-synonym-generator.d.ts +79 -0
  383. package/engine/dist/services/synonyms/auto-synonym-generator.d.ts.map +1 -0
  384. package/engine/dist/services/synonyms/auto-synonym-generator.js +415 -0
  385. package/engine/dist/services/synonyms/auto-synonym-generator.js.map +1 -0
  386. package/engine/dist/services/system-status.d.ts +68 -0
  387. package/engine/dist/services/system-status.d.ts.map +1 -0
  388. package/engine/dist/services/system-status.js +107 -0
  389. package/engine/dist/services/system-status.js.map +1 -0
  390. package/engine/dist/services/tags/discovery.d.ts +16 -0
  391. package/engine/dist/services/tags/discovery.d.ts.map +1 -0
  392. package/engine/dist/services/tags/discovery.js +206 -0
  393. package/engine/dist/services/tags/discovery.js.map +1 -0
  394. package/engine/dist/services/tags/gliner.d.ts +18 -0
  395. package/engine/dist/services/tags/gliner.d.ts.map +1 -0
  396. package/engine/dist/services/tags/gliner.js +119 -0
  397. package/engine/dist/services/tags/gliner.js.map +1 -0
  398. package/engine/dist/services/tags/infector.d.ts +21 -0
  399. package/engine/dist/services/tags/infector.d.ts.map +1 -0
  400. package/engine/dist/services/tags/infector.js +168 -0
  401. package/engine/dist/services/tags/infector.js.map +1 -0
  402. package/engine/dist/services/tags/tag-auditor.d.ts +77 -0
  403. package/engine/dist/services/tags/tag-auditor.d.ts.map +1 -0
  404. package/engine/dist/services/tags/tag-auditor.js +283 -0
  405. package/engine/dist/services/tags/tag-auditor.js.map +1 -0
  406. package/engine/dist/services/taxonomy/taxonomy-manager.d.ts +50 -0
  407. package/engine/dist/services/taxonomy/taxonomy-manager.d.ts.map +1 -0
  408. package/engine/dist/services/taxonomy/taxonomy-manager.js +291 -0
  409. package/engine/dist/services/taxonomy/taxonomy-manager.js.map +1 -0
  410. package/engine/dist/services/vision/vision_service.d.ts +4 -0
  411. package/engine/dist/services/vision/vision_service.d.ts.map +1 -0
  412. package/engine/dist/services/vision/vision_service.js +197 -0
  413. package/engine/dist/services/vision/vision_service.js.map +1 -0
  414. package/engine/dist/test-framework/core.d.ts +133 -0
  415. package/engine/dist/test-framework/core.d.ts.map +1 -0
  416. package/engine/dist/test-framework/core.js +313 -0
  417. package/engine/dist/test-framework/core.js.map +1 -0
  418. package/engine/dist/test-framework/dataset-runner.d.ts +78 -0
  419. package/engine/dist/test-framework/dataset-runner.d.ts.map +1 -0
  420. package/engine/dist/test-framework/dataset-runner.js +223 -0
  421. package/engine/dist/test-framework/dataset-runner.js.map +1 -0
  422. package/engine/dist/test-framework/diagnostic-tests.d.ts +38 -0
  423. package/engine/dist/test-framework/diagnostic-tests.d.ts.map +1 -0
  424. package/engine/dist/test-framework/diagnostic-tests.js +283 -0
  425. package/engine/dist/test-framework/diagnostic-tests.js.map +1 -0
  426. package/engine/dist/test-framework/performance-regression-tests.d.ts +30 -0
  427. package/engine/dist/test-framework/performance-regression-tests.d.ts.map +1 -0
  428. package/engine/dist/test-framework/performance-regression-tests.js +331 -0
  429. package/engine/dist/test-framework/performance-regression-tests.js.map +1 -0
  430. package/engine/dist/types/api.d.ts +53 -0
  431. package/engine/dist/types/api.d.ts.map +1 -0
  432. package/engine/dist/types/api.js +2 -0
  433. package/engine/dist/types/api.js.map +1 -0
  434. package/engine/dist/types/atomic.d.ts +42 -0
  435. package/engine/dist/types/atomic.d.ts.map +1 -0
  436. package/engine/dist/types/atomic.js +10 -0
  437. package/engine/dist/types/atomic.js.map +1 -0
  438. package/engine/dist/types/context-protocol.d.ts +137 -0
  439. package/engine/dist/types/context-protocol.d.ts.map +1 -0
  440. package/engine/dist/types/context-protocol.js +28 -0
  441. package/engine/dist/types/context-protocol.js.map +1 -0
  442. package/engine/dist/types/context.d.ts +2 -0
  443. package/engine/dist/types/context.d.ts.map +1 -0
  444. package/engine/dist/types/context.js +2 -0
  445. package/engine/dist/types/context.js.map +1 -0
  446. package/engine/dist/types/index.d.ts +20 -0
  447. package/engine/dist/types/index.d.ts.map +1 -0
  448. package/engine/dist/types/index.js +18 -0
  449. package/engine/dist/types/index.js.map +1 -0
  450. package/engine/dist/types/search.d.ts +31 -0
  451. package/engine/dist/types/search.d.ts.map +1 -0
  452. package/engine/dist/types/search.js +2 -0
  453. package/engine/dist/types/search.js.map +1 -0
  454. package/engine/dist/types/taxonomy.d.ts +137 -0
  455. package/engine/dist/types/taxonomy.d.ts.map +1 -0
  456. package/engine/dist/types/taxonomy.js +138 -0
  457. package/engine/dist/types/taxonomy.js.map +1 -0
  458. package/engine/dist/types/taxonomy.simple.d.ts +131 -0
  459. package/engine/dist/types/taxonomy.simple.d.ts.map +1 -0
  460. package/engine/dist/types/taxonomy.simple.js +132 -0
  461. package/engine/dist/types/taxonomy.simple.js.map +1 -0
  462. package/engine/dist/types/tool-call.d.ts +16 -0
  463. package/engine/dist/types/tool-call.d.ts.map +1 -0
  464. package/engine/dist/types/tool-call.js +6 -0
  465. package/engine/dist/types/tool-call.js.map +1 -0
  466. package/engine/dist/types/trace.d.ts +25 -0
  467. package/engine/dist/types/trace.d.ts.map +1 -0
  468. package/engine/dist/types/trace.js +5 -0
  469. package/engine/dist/types/trace.js.map +1 -0
  470. package/engine/dist/utils/adaptive-concurrency.d.ts +81 -0
  471. package/engine/dist/utils/adaptive-concurrency.d.ts.map +1 -0
  472. package/engine/dist/utils/adaptive-concurrency.js +266 -0
  473. package/engine/dist/utils/adaptive-concurrency.js.map +1 -0
  474. package/engine/dist/utils/date_extractor.d.ts +2 -0
  475. package/engine/dist/utils/date_extractor.d.ts.map +1 -0
  476. package/engine/dist/utils/date_extractor.js +32 -0
  477. package/engine/dist/utils/date_extractor.js.map +1 -0
  478. package/engine/dist/utils/native-module-manager.d.ts +48 -0
  479. package/engine/dist/utils/native-module-manager.d.ts.map +1 -0
  480. package/engine/dist/utils/native-module-manager.js +265 -0
  481. package/engine/dist/utils/native-module-manager.js.map +1 -0
  482. package/engine/dist/utils/native-module-profiler.d.ts +66 -0
  483. package/engine/dist/utils/native-module-profiler.d.ts.map +1 -0
  484. package/engine/dist/utils/native-module-profiler.js +182 -0
  485. package/engine/dist/utils/native-module-profiler.js.map +1 -0
  486. package/engine/dist/utils/path-manager.d.ts +59 -0
  487. package/engine/dist/utils/path-manager.d.ts.map +1 -0
  488. package/engine/dist/utils/path-manager.js +154 -0
  489. package/engine/dist/utils/path-manager.js.map +1 -0
  490. package/engine/dist/utils/performance-monitor.d.ts +92 -0
  491. package/engine/dist/utils/performance-monitor.d.ts.map +1 -0
  492. package/engine/dist/utils/performance-monitor.js +221 -0
  493. package/engine/dist/utils/performance-monitor.js.map +1 -0
  494. package/engine/dist/utils/process-manager.d.ts +18 -0
  495. package/engine/dist/utils/process-manager.d.ts.map +1 -0
  496. package/engine/dist/utils/process-manager.js +100 -0
  497. package/engine/dist/utils/process-manager.js.map +1 -0
  498. package/engine/dist/utils/request-tracer.d.ts +131 -0
  499. package/engine/dist/utils/request-tracer.d.ts.map +1 -0
  500. package/engine/dist/utils/request-tracer.js +414 -0
  501. package/engine/dist/utils/request-tracer.js.map +1 -0
  502. package/engine/dist/utils/resource-manager.d.ts +108 -0
  503. package/engine/dist/utils/resource-manager.d.ts.map +1 -0
  504. package/engine/dist/utils/resource-manager.js +235 -0
  505. package/engine/dist/utils/resource-manager.js.map +1 -0
  506. package/engine/dist/utils/safe-dns.d.ts +14 -0
  507. package/engine/dist/utils/safe-dns.d.ts.map +1 -0
  508. package/engine/dist/utils/safe-dns.js +105 -0
  509. package/engine/dist/utils/safe-dns.js.map +1 -0
  510. package/engine/dist/utils/structured-logger.d.ts +124 -0
  511. package/engine/dist/utils/structured-logger.d.ts.map +1 -0
  512. package/engine/dist/utils/structured-logger.js +332 -0
  513. package/engine/dist/utils/structured-logger.js.map +1 -0
  514. package/engine/dist/utils/tag-cleanup.d.ts +11 -0
  515. package/engine/dist/utils/tag-cleanup.d.ts.map +1 -0
  516. package/engine/dist/utils/tag-cleanup.js +111 -0
  517. package/engine/dist/utils/tag-cleanup.js.map +1 -0
  518. package/engine/dist/utils/tag-filter.d.ts +19 -0
  519. package/engine/dist/utils/tag-filter.d.ts.map +1 -0
  520. package/engine/dist/utils/tag-filter.js +147 -0
  521. package/engine/dist/utils/tag-filter.js.map +1 -0
  522. package/engine/dist/utils/tag-modulation.d.ts +80 -0
  523. package/engine/dist/utils/tag-modulation.d.ts.map +1 -0
  524. package/engine/dist/utils/tag-modulation.js +284 -0
  525. package/engine/dist/utils/tag-modulation.js.map +1 -0
  526. package/engine/dist/utils/timer.d.ts +40 -0
  527. package/engine/dist/utils/timer.d.ts.map +1 -0
  528. package/engine/dist/utils/timer.js +76 -0
  529. package/engine/dist/utils/timer.js.map +1 -0
  530. package/engine/dist/utils/token-utils.d.ts +19 -0
  531. package/engine/dist/utils/token-utils.d.ts.map +1 -0
  532. package/engine/dist/utils/token-utils.js +71 -0
  533. package/engine/dist/utils/token-utils.js.map +1 -0
  534. package/engine/dist/utils/wasm-module-loader.d.ts +50 -0
  535. package/engine/dist/utils/wasm-module-loader.d.ts.map +1 -0
  536. package/engine/dist/utils/wasm-module-loader.js +136 -0
  537. package/engine/dist/utils/wasm-module-loader.js.map +1 -0
  538. package/engine/package.json +105 -0
  539. package/package.json +106 -0
@@ -0,0 +1,982 @@
1
+ import * as crypto from 'crypto';
2
+ import * as fs from 'fs';
3
+ import * as path from 'path';
4
+ import { fileURLToPath } from 'url';
5
+ import { shouldUseStrictAtomSelection, modulateTags } from '../../utils/tag-modulation.js';
6
+ const __filename = fileURLToPath(import.meta.url);
7
+ const __dirname = path.dirname(__filename);
8
// Native modules from @rbalchii packages (with fallbacks).
// Each accelerator is optional: when the package cannot be imported (or the
// export cannot be read) the binding stays null and the JS fallback is used.
let nativeFingerprint = await import('@rbalchii/native-fingerprint')
    .then((mod) => mod.fingerprint)
    .catch(() => null);
let nativeCleanse = await import('@rbalchii/native-keyassassin')
    .then((mod) => mod.cleanse)
    .catch(() => null);
21
+ export class AtomizerService {
22
+ /**
23
+ * Tag blacklist patterns - prevents low-value tags from being stored
24
+ * These patterns filter out noise at ingestion time
25
+ */
26
+ static TAG_BLACKLIST_PATTERNS = [
27
+ // Color codes (hex)
28
+ /^#[0-9a-fA-F]{3,8}$/,
29
+ // Pure numbers or too short
30
+ /^#\d{1,3}$/,
31
+ /^#_\w*$/,
32
+ /^#__[\w\d_]+$/,
33
+ // HTML/DOM artifacts
34
+ /^#btn\b/, /^#class\b/, /^#div\b/, /^#id\b/,
35
+ /^#span\b/, /^#href\b/, /^#src\b/,
36
+ // Code artifacts
37
+ /^#fn\b/, /^#elif\b/, /^#else\b/, /^#endif\b/,
38
+ /^#ifdef\b/, /^#ifndef\b/, /^#include\b/,
39
+ /^#define\b/, /^#pragma\b/,
40
+ // Scraping artifacts
41
+ /^#cite_note/, /^#cite_ref/, /^#amp_tf/,
42
+ /^#details_of_atom/, /^#entry_lin/, /^#entry_links/,
43
+ /^#opensearch_extension/, /^#extension_elements/,
44
+ /^#simple_examples/, /^#query_interface/,
45
+ /^#api_response/, /^#response_example/,
46
+ /^#examples?$/, /^#overview$/, /^#preface$/,
47
+ /^#appendix/, /^#appendices$/, /^#bib\b/, /^#ref\b/,
48
+ // Error/artifact tags
49
+ /^#incorrect_/, /^#error_/, /^#null\b/,
50
+ /^#undefined\b/, /^#nan\b/,
51
+ // Too generic
52
+ /^#slow_pickup$/, /^#late_night$/, /^#early_morning$/,
53
+ /^#monday\b/, /^#tuesday\b/, /^#wednesday\b/,
54
+ /^#thursday\b/, /^#friday\b/, /^#saturday\b/,
55
+ /^#sunday\b/, /^#manual\b/, /^#manually_/,
56
+ /^#test_/, /^#tmp\b/, /^#temp\b/, /^#untagged$/,
57
+ // Deprecated project names
58
+ /^#agentgpt$/, /^#babyagi$/, /^#autogen$/, /^#chimaera$/,
59
+ // System tags
60
+ /^#manually_quarantined$/, /^#quarantined$/,
61
+ /^#system$/, /^#internal$/, /^#external$/,
62
+ // Test fixture tags (from unit test mock data bleeding into production)
63
+ /^#tag\d*$/i, // #Tag, #Tag1, #tag2
64
+ /^#shared[a-z]$/i, // #sharedA, #sharedB
65
+ /^#word\d*$/i, // #Word, #Word1
66
+ /^#fixture/i, // #fixture...
67
+ /^#mock/i, // #mock...
68
+ /^#dummy/i, // #dummy...
69
+ /^#sample[a-z0-9]*$/i, // #sample, #sampleA
70
+ ];
71
+ static TAG_BLACKLIST_EXACT = new Set([
72
+ '#_', '#0', '#1', '#2', '#3', '#4', '#5', '#6', '#7', '#8', '#9',
73
+ '#00', '#000', '#0000', '#00000', '#000000',
74
+ ]);
75
+ /**
76
+ * Check if a tag should be filtered out
77
+ */
78
+ isBlacklistedTag(tag) {
79
+ if (!tag || typeof tag !== 'string')
80
+ return true;
81
+ const normalizedTag = tag.trim();
82
+ if (AtomizerService.TAG_BLACKLIST_EXACT.has(normalizedTag)) {
83
+ return true;
84
+ }
85
+ for (const pattern of AtomizerService.TAG_BLACKLIST_PATTERNS) {
86
+ if (pattern.test(normalizedTag)) {
87
+ return true;
88
+ }
89
+ }
90
+ return false;
91
+ }
92
+ /**
93
+ * Apply tag modulation to atom labels
94
+ * Filters based on modulation level and blacklist strictness from user_settings.json
95
+ */
96
+ applyTagModulation(atomLabels) {
97
+ if (!atomLabels || atomLabels.length === 0)
98
+ return [];
99
+ // Convert atom labels to tag format
100
+ const rawTags = atomLabels.map(label => label.startsWith('#') ? label : `#${label}`);
101
+ // Apply modulation filtering
102
+ return modulateTags(rawTags);
103
+ }
104
+ /**
105
+ * Transient data patterns to exclude from ingestion
106
+ * These patterns identify temporary/noisy content that clutters context
107
+ */
108
+ static TRANSIENT_PATTERNS = [
109
+ // Terminal error logs
110
+ /Traceback \(most recent call last\)/i,
111
+ /KeyError:/i,
112
+ /TypeError:/i,
113
+ /ValueError:/i,
114
+ /Error:.*at line \d+/i,
115
+ /Exception in thread/i,
116
+ /Fatal error:/i,
117
+ // Package installation logs
118
+ /npm install/i,
119
+ /pip install/i,
120
+ /yarn add/i,
121
+ /pnpm add/i,
122
+ /Collecting [a-zA-Z0-9_-]+/i, // pip "Collecting package"
123
+ /Downloading [a-zA-Z0-9_-]+/i, // pip "Downloading package"
124
+ /added \d+ package/i, // npm "added X packages"
125
+ /Successfully installed/i,
126
+ // Build artifacts
127
+ /Build succeeded/i,
128
+ /Build failed/i,
129
+ /Compiling\.\.\./i,
130
+ /Linking\.\.\./i,
131
+ /Generating\.\.\./i,
132
+ // Repetitive log noise
133
+ /^\[\d{4}-\d{2}-\d{2}.*\]$/m, // Standalone timestamp lines
134
+ /^={50,}$/m, // Separator lines (====...)
135
+ /^-{50,}$/m, // Separator lines (----...)
136
+ ];
137
+ /**
138
+ * Check if content is transient/temporary data that should be excluded
139
+ */
140
+ isTransientData(content) {
141
+ // Check if more than 50% of content matches transient patterns
142
+ const lines = content.split('\n');
143
+ if (lines.length < 5)
144
+ return false; // Too short to be log output
145
+ let transientLines = 0;
146
+ for (const pattern of AtomizerService.TRANSIENT_PATTERNS) {
147
+ for (const line of lines) {
148
+ if (pattern.test(line)) {
149
+ transientLines++;
150
+ if (transientLines > lines.length * 0.5) {
151
+ return true; // More than 50% is transient
152
+ }
153
+ }
154
+ }
155
+ }
156
+ return false;
157
+ }
158
+ /**
159
+ * Deconstructs raw content into Atomic Topology.
160
+ * Returns the Compound (Main Body) and its Constituent Particles (Atoms/Molecules).
161
+ */
162
+ async atomize(content, sourcePath, provenance, fileTimestamp) {
163
+ const filename = sourcePath.split(/[/\\]/).pop() || sourcePath;
164
+ const contentSizeMB = (content.length / (1024 * 1024)).toFixed(2);
165
+ const startTime = Date.now();
166
+ // Check for transient data before processing
167
+ if (this.isTransientData(content)) {
168
+ console.log(`[Atomizer] ⚠️ SKIP: ${filename} - Transient data detected (error logs, install output, etc.)`);
169
+ return null; // Skip ingestion entirely
170
+ }
171
+ // Note: System output (Anchor search results) is NOT skipped - it's cleaned during sanitization
172
+ // The sanitization step removes score markers, system IDs, YAML formatting, etc.
173
+ // Deduplication handles any remaining duplicates
174
+ console.log(`[Atomizer] ⏱️ START: ${filename} (${contentSizeMB}MB)`);
175
+ try {
176
+ // 1. Sanitize (Iron Lung) - Chunked Strategy for Large Files
177
+ // Optimized port of Refiner's Key Assassin
178
+ // For very large files, we sanitize in chunks to avoid string length limits/OOM
179
+ const sanitizeStart = Date.now();
180
+ const CHUNK_SIZE = 1024 * 1024; // 1MB chunks
181
+ let cleanContent = '';
182
+ if (content.length > CHUNK_SIZE * 2) {
183
+ // Generator approach for memory efficiency
184
+ let chunkCount = 0;
185
+ for (const chunk of this.chunkedSanitize(content, sourcePath, CHUNK_SIZE)) {
186
+ cleanContent += chunk;
187
+ chunkCount++;
188
+ if (chunkCount % 10 === 0) {
189
+ console.log(`[Atomizer] ⏱️ Sanitize chunk ${chunkCount}... (${((Date.now() - sanitizeStart) / 1000).toFixed(1)}s)`);
190
+ }
191
+ // Yield to event loop to keep server responsive
192
+ await new Promise(resolve => setImmediate(resolve));
193
+ }
194
+ }
195
+ else {
196
+ cleanContent = this.sanitize(content, sourcePath);
197
+ }
198
+ console.log(`[Atomizer] ⏱️ Sanitize complete: ${((Date.now() - sanitizeStart) / 1000).toFixed(2)}s`);
199
+ // 2. Identification (Hash)
200
+ const hashStart = Date.now();
201
+ const compoundId = crypto.createHash('md5').update(cleanContent + sourcePath).digest('hex');
202
+ const timestamp = fileTimestamp || Date.now();
203
+ console.log(`[Atomizer] ⏱️ Hash complete: ${Date.now() - hashStart}ms`);
204
+ // 3. System Atoms (Project/File Level)
205
+ const systemAtoms = this.extractSystemAtoms(sourcePath);
206
+ // 4. Construct Compound ID
207
+ const fullCompoundId = `mem_${compoundId}`;
208
+ // 5. Molecular Fission (Semantic Splitting)
209
+ // Determine Type & Extract Data
210
+ const splitStart = Date.now();
211
+ const type = this.detectMoleculeType(cleanContent, sourcePath); // Determine main type
212
+ // Pass type to optimize splitting strategy
213
+ const moleculeParts = this.splitIntoMolecules(cleanContent, type);
214
+ console.log(`[Atomizer] ⏱️ Split into ${moleculeParts.length} molecules: ${((Date.now() - splitStart) / 1000).toFixed(2)}s`);
215
+ // 5. Molecular Enrichment (Granular Tagging & Typing)
216
+ const enrichStart = Date.now();
217
+ const molecules = [];
218
+ const allAtomsMap = new Map();
219
+ // Add System Atoms to global map
220
+ systemAtoms.forEach(a => allAtomsMap.set(a.id, a));
221
+ // Define maximum content length for individual molecules
222
+ const MAX_MOLECULE_CONTENT_LENGTH = 500 * 1024; // 500KB limit
223
+ // Timestamp Context: Start with file timestamp (modification time)
224
+ // As we scan molecules, if we find a date in the content (e.g. log timestamp),
225
+ // we update this context so subsequent atoms inherit it.
226
+ let currentTimestamp = timestamp;
227
+ const totalMolecules = moleculeParts.length;
228
+ const progressInterval = Math.max(100, Math.floor(totalMolecules / 10)); // Log every 10% or every 100
229
+ // Process molecules in batches to yield to event loop
230
+ for (let i = 0; i < moleculeParts.length; i++) {
231
+ const part = moleculeParts[i];
232
+ const { content: text, start, end, timestamp: partTimestamp } = part;
233
+ // Progress logging and yield every 100 molecules
234
+ if (i % progressInterval === 0 && i > 0) {
235
+ const pct = ((i / totalMolecules) * 100).toFixed(0);
236
+ console.log(`[Atomizer] ⏱️ Enriching: ${pct}% (${i}/${totalMolecules}) - ${((Date.now() - enrichStart) / 1000).toFixed(1)}s`);
237
+ }
238
+ if (i % 100 === 0) {
239
+ await new Promise(resolve => setImmediate(resolve));
240
+ }
241
+ // Update time context if this part has a specific timestamp
242
+ // Extract earliest timestamp from content for temporal ordering
243
+ const extractedTs = this.extractEarliestTimestamp(text, currentTimestamp);
244
+ if (extractedTs) {
245
+ currentTimestamp = extractedTs;
246
+ }
247
+ // Check content length and truncate if necessary
248
+ let processedText = text;
249
+ if (processedText.length > MAX_MOLECULE_CONTENT_LENGTH) {
250
+ console.warn(`[Atomizer] Molecule content exceeds maximum length (${processedText.length} chars), truncating...`);
251
+ processedText = processedText.substring(0, MAX_MOLECULE_CONTENT_LENGTH) + '... [TRUNCATED]';
252
+ }
253
+ // Scan for concepts in this specific molecule
254
+ // PERFORMANCE: Skip for pure data rows (CSV lines) that have no prose
255
+ // But keep scanning for conversational YAML which has semantic content
256
+ const conceptAtoms = this.scanAtoms(processedText);
257
+ const moleculeAtoms = [...systemAtoms, ...conceptAtoms];
258
+ // Add concepts to global map
259
+ conceptAtoms.forEach(a => allAtomsMap.set(a.id, a));
260
+ const molId = `mol_${crypto.createHash('md5').update(compoundId + i + processedText).digest('hex').substring(0, 12)}`;
261
+ // Re-Determine Type locally (e.g. code block in markdown)
262
+ // Use the passed type as default, but refined per chunk if needed
263
+ const molType = (type === 'prose' && (processedText.includes('```') || processedText.includes('function') || processedText.includes('const '))) ? 'code' : type;
264
+ let numericVal = undefined;
265
+ let numericUnit = undefined;
266
+ if (molType === 'data') {
267
+ const data = this.extractNumericData(processedText);
268
+ if (data) {
269
+ numericVal = data.value;
270
+ numericUnit = data.unit;
271
+ }
272
+ }
273
+ molecules.push({
274
+ id: molId,
275
+ content: processedText,
276
+ atoms: moleculeAtoms.map(a => a.id),
277
+ sequence: i,
278
+ compoundId: fullCompoundId,
279
+ // Universal Coordinates
280
+ start_byte: start,
281
+ end_byte: end,
282
+ // Metadata
283
+ type: molType,
284
+ numeric_value: numericVal,
285
+ numeric_unit: numericUnit,
286
+ molecular_signature: this.generateSimHash(processedText),
287
+ timestamp: currentTimestamp,
288
+ // Apply tag modulation: filter blacklisted tags and apply modulation level
289
+ tags: this.applyTagModulation(moleculeAtoms.map(a => a.label)),
290
+ entities: {
291
+ people: moleculeAtoms.filter(a => ['#coda', '#rob', '#oliver'].includes(a.label.toLowerCase())).map(a => a.label),
292
+ concepts: moleculeAtoms.filter(a => a.type === 'concept').map(a => a.label),
293
+ projects: moleculeAtoms.filter(a => ['#project', '#engine', '#agent'].some(kw => a.label.toLowerCase().includes(kw))).map(a => a.label)
294
+ }
295
+ });
296
+ }
297
+ console.log(`[Atomizer] ⏱️ Enrichment complete: ${((Date.now() - enrichStart) / 1000).toFixed(2)}s`);
298
+ const allAtoms = Array.from(allAtomsMap.values());
299
+ const compound = {
300
+ id: fullCompoundId,
301
+ compound_body: cleanContent,
302
+ molecules: molecules.map(m => m.id),
303
+ atoms: allAtoms.map(a => a.id),
304
+ path: sourcePath,
305
+ timestamp: fileTimestamp || timestamp, // Compound keeps file timestamp if provided
306
+ provenance: provenance,
307
+ molecular_signature: this.generateSimHash(cleanContent)
308
+ };
309
+ const totalTime = ((Date.now() - startTime) / 1000).toFixed(2);
310
+ console.log(`[Atomizer] ✅ COMPLETE: ${filename} (${contentSizeMB}MB) → ${molecules.length} molecules, ${allAtoms.length} atoms in ${totalTime}s`);
311
+ return {
312
+ compound,
313
+ molecules,
314
+ atoms: allAtoms
315
+ };
316
+ }
317
+ catch (error) {
318
+ console.error(`[Atomizer] FATAL ERROR processing ${sourcePath}:`, error.message);
319
+ throw error;
320
+ }
321
+ }
322
+ *chunkedSanitize(text, filePath, chunkSize) {
323
+ let offset = 0;
324
+ while (offset < text.length) {
325
+ let end = Math.min(offset + chunkSize, text.length);
326
+ // Align to newline if not at the end
327
+ if (end < text.length) {
328
+ const nextNewline = text.indexOf('\n', end);
329
+ if (nextNewline !== -1 && nextNewline < end + 1000) { // Don't drift too far
330
+ end = nextNewline + 1;
331
+ }
332
+ }
333
+ const chunk = text.substring(offset, end);
334
+ yield this.sanitize(chunk, filePath);
335
+ offset = end;
336
+ }
337
+ }
338
+ // --- PORTED LOGIC FROM REFINER.TS ---
339
+ /**
340
+ * Enhanced Content Sanitization (The Key Assassin)
341
+ * Surgically removes JSON wrappers, log spam, and PII.
342
+ */
343
+ sanitize(text, filePath = '') {
344
+ let clean = text;
345
+ // 1. Fundamental Normalization
346
+ clean = clean.replace(/^\uFEFF/, '').replace(/[\u0000\uFFFD]/g, '');
347
+ // Aggressive Newline Normalization: convert all \r\n and literal "\r\n" strings to real newlines
348
+ clean = clean.replace(/\\r\\n/g, '\n').replace(/\r\n/g, '\n');
349
+ // 2. Enhanced Surgeon: Log Spam Removal
350
+ clean = clean.replace(/(?:^|\s|\.{3}\s*)Processing '[^']+'\.{3}/g, '\n');
351
+ clean = clean.replace(/(?:^|\s|\.{3}\s*)Loading '[^']+'\.{3}/g, '\n');
352
+ clean = clean.replace(/(?:^|\s|\.{3}\s*)Indexing '[^']+'\.{3}/g, '\n');
353
+ clean = clean.replace(/(?:^|\s|\.{3}\s*)Analyzing '[^']+'\.{3}/g, '\n');
354
+ // [NEW] Robust Processing Log Filter (for " - [TIMESTAMP] ... Processing ...")
355
+ clean = clean.replace(/(?:^|\n)\s*-\s*\[\d{4}-\d{2}-\d{2}.*?\].*?Processing.*?(?:\n|$)/gi, '\n');
356
+ // Strip Log Timestamps (at start of lines)
357
+ clean = clean.replace(/^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:\.\d{3})?\s*(?:AM|PM)?\s*[-:>]/gm, '');
358
+ // Strip bracketed metadata like [2026-01-25...]
359
+ clean = clean.replace(/\[\d{4}-\d{2}-\d{2}.*?\]/g, '');
360
+ clean = clean.replace(/\[[#=]{0,10}\s{0,10}\]\s*\d{1,3}%/g, ''); // [===] 100%
361
+ // 2.5 PII Masking
362
+ clean = clean.replace(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g, '[EMAIL_REDACTED]');
363
+ clean = clean.replace(/\b(?:\d{1,3}\.){3}\d{1,3}\b/g, '[IP_REDACTED]');
364
+ clean = clean.replace(/sk-[a-zA-Z0-9]{32,}/g, 'sk-[REDACTED]');
365
+ // --- DENSITY-AWARE SCRUBBER (Standard 073) ---
366
+ // 1. Strip "Dirty Read" Source Headers & Recursive Metadata
367
+ // Matches: [Source: ...] or status: [Source: ...]
368
+ clean = clean.replace(/(?:status:\s*)?\[Source: .*?\](?:\s*\(Timestamp: .*?\))?/g, '');
369
+ // 2. Strip Logging/YAML/JSON Wrappers (Aggressive Pattern)
370
+ // This targets the keys and the quotes around them, but leaves the content.
371
+ const metaKeys = ['response_content', 'thinking_content', 'content', 'message', 'text', 'body', 'type', 'timestamp', 'source_path'];
372
+ metaKeys.forEach(key => {
373
+ // Match "key": " or key: |- or "key": |- etc.
374
+ const regex = new RegExp(`["']?${key}["']?\\s*:\\s*(?:\\|-?|")?`, 'g');
375
+ clean = clean.replace(regex, '');
376
+ });
377
+ // Strip trailing quotes and braces from JSON-like fragments
378
+ clean = clean.replace(/"\s*,\s*"/g, '\n');
379
+ clean = clean.replace(/"\s*}/g, '');
380
+ clean = clean.replace(/{\s*"/g, '');
381
+ // 3. Strip LLM Role Markers
382
+ clean = clean.replace(/<\|user\|>/g, '');
383
+ clean = clean.replace(/<\|assistant\|>/g, '');
384
+ clean = clean.replace(/<\|system\|>/g, '');
385
+ // 4. Strip Anchor System Output (prevent self-contamination)
386
+ // Remove score markers from search results
387
+ clean = clean.replace(/score:\s*\d+(?:\.\d+)?/g, '');
388
+ // Remove virtual molecule IDs
389
+ clean = clean.replace(/virtual_mem_[a-f0-9_]+/g, '');
390
+ // Remove system memory IDs
391
+ clean = clean.replace(/\bid:\s*["']?mem_[a-f0-9_]+["']?\s*,?/g, '');
392
+ // Remove source path markers
393
+ clean = clean.replace(/source:\s*["']?inbox\/[^"'\n]+["']?\s*,?/g, '');
394
+ // Remove provenance markers
395
+ clean = clean.replace(/provenance:\s*["']?(internal|external|quarantine)["']?\s*,?/g, '');
396
+ // Remove bucket arrays
397
+ clean = clean.replace(/buckets:\s*\[[\s\w,"']*\]\s*,?/g, '');
398
+ // Remove epoch data
399
+ clean = clean.replace(/epochs?:\s*['"]?[^,\n"']+['"]?\s*,?/g, '');
400
+ // Remove timestamp fields from system output
401
+ clean = clean.replace(/timestamp:\s*["']?[^"'\n]+["']?\s*,?/g, '');
402
+ // Remove compound_id and byte range markers
403
+ clean = clean.replace(/compound_id:\s*["']?[a-f0-9_]+["']?\s*,?/g, '');
404
+ clean = clean.replace(/start_byte:\s*\d+\s*,?/g, '');
405
+ clean = clean.replace(/end_byte:\s*\d+\s*,?/g, '');
406
+ clean = clean.replace(/molecular_signature:\s*["']?[a-f0-9]+["']?\s*,?/g, '');
407
+ clean = clean.replace(/is_inflated:\s*(true|false)\s*,?/g, '');
408
+ // 5. Strip MCP/Agent Output Formatting
409
+ // Remove YAML list markers from search results
410
+ clean = clean.replace(/^\s*-\s*(id|source|score|content|tags|buckets|provenance):\s*/gm, '');
411
+ // Remove YAML block markers
412
+ clean = clean.replace(/^\s*\|\s*$/gm, '');
413
+ // Remove code block wrappers (keep content)
414
+ clean = clean.replace(/```yaml\s*/g, '');
415
+ clean = clean.replace(/```\s*$/gm, '');
416
+ // Remove emoji markers from system output
417
+ clean = clean.replace(/🔍\s*|🤖\s*|⚙️\s*|✅\s*|❌\s*/g, '');
418
+ // 6. Final Polish
419
+ clean = clean.replace(/\n{3,}/g, '\n\n');
420
+ return clean.trim();
421
+ }
422
+ /**
423
+ * Helper: The Key Assassin
424
+ * Recursively un-escapes and removes JSON wrappers.
425
+ */
426
+ cleanseJsonArtifacts(text) {
427
+ let clean = text;
428
+ // 1. Recursive Un-escape
429
+ // DISABLED NATIVE CLEANSE due to stack overflow on deep nesting
430
+ // if (native && native.cleanse) {
431
+ // clean = native.cleanse(clean);
432
+ // } else {
433
+ let pass = 0;
434
+ while (clean.includes('\\') && pass < 3) {
435
+ pass++;
436
+ clean = clean.replace(/\\"/g, '"').replace(/\\n/g, '\n').replace(/\\t/g, '\t');
437
+ }
438
+ // }
439
+ // 2. Code Block Protection
440
+ const codeBlocks = [];
441
+ const PLACEHOLDER = '___CODE_BLOCK_PLACEHOLDER___';
442
+ clean = clean.replace(/```[\s\S]*?```/g, (match) => {
443
+ codeBlocks.push(match);
444
+ return `${PLACEHOLDER}${codeBlocks.length - 1}___`;
445
+ });
446
+ // 3. Remove Metadata & Wrappers
447
+ const purge = (ptrn) => { clean = clean.replace(ptrn, ''); };
448
+ purge(/"type"\s*:\s*"[^"]*",?/g);
449
+ purge(/"timestamp"\s*:\s*"[^"]*",?/g);
450
+ purge(/"source"\s*:\s*"[^"]*",?/g);
451
+ purge(/"response_content"\s*:\s*/g);
452
+ purge(/"thinking_content"\s*:\s*/g);
453
+ purge(/"content"\s*:\s*/g);
454
+ // 4. Structural Cleanup
455
+ clean = clean.replace(/\}\s*,\s*\{/g, '\n\n');
456
+ clean = clean.trim();
457
+ if (clean.startsWith('[') && clean.endsWith(']'))
458
+ clean = clean.substring(1, clean.length - 1);
459
+ // 5. Restore Code Blocks
460
+ clean = clean.replace(/___CODE_BLOCK_PLACEHOLDER___(\d+)___/g, (_, idx) => codeBlocks[parseInt(idx)] || _);
461
+ // 6. Slash Compressor
462
+ clean = clean.replace(/\\{2,}/g, '/');
463
+ return clean;
464
+ }
465
+ extractSystemAtoms(filePath) {
466
+ const atoms = [];
467
+ const normalized = filePath.replace(/\\/g, '/');
468
+ const lowerPath = normalized.toLowerCase();
469
+ const parts = normalized.split('/');
470
+ // --- TIME-LADDER LOGIC ---
471
+ // History/Archive gets down-weighted #Archive tag
472
+ if (lowerPath.includes('/history/') || lowerPath.includes('/archive/')) {
473
+ atoms.push(this.createAtom('#Archive', 'system', 0.5));
474
+ }
475
+ // Everything else is implicitly Current/Truth (Weight 1.0) unless specified otherwise
476
+ // 1. Project Root & Structure (Auto-Tagging)
477
+ const projectIndicators = ['codebase', 'projects', 'repos', 'src', 'packages', 'apps', 'personal', 'work', 'client'];
478
+ for (let i = 0; i < parts.length; i++) {
479
+ if (projectIndicators.includes(parts[i].toLowerCase()) && parts[i + 1]) {
480
+ atoms.push(this.createAtom(`#project:${parts[i + 1]}`, 'system'));
481
+ break;
482
+ }
483
+ }
484
+ // Structure Tags
485
+ if (normalized.includes('/src/') || normalized.startsWith('src/'))
486
+ atoms.push(this.createAtom('#src', 'system'));
487
+ if (normalized.includes('/docs/') || normalized.startsWith('docs/'))
488
+ atoms.push(this.createAtom('#docs', 'system'));
489
+ if (normalized.includes('/tests/') || normalized.startsWith('tests/'))
490
+ atoms.push(this.createAtom('#test', 'system'));
491
+ // File Type Tags
492
+ const ext = normalized.split('.').pop()?.toLowerCase() || '';
493
+ if (['ts', 'js', 'py', 'rs', 'go', 'java', 'cpp', 'c', 'h'].includes(ext))
494
+ atoms.push(this.createAtom('#code', 'system'));
495
+ if (['md', 'txt', 'rst'].includes(ext))
496
+ atoms.push(this.createAtom('#doc', 'system'));
497
+ if (['json', 'yaml', 'yml', 'xml'].includes(ext))
498
+ atoms.push(this.createAtom('#config', 'system'));
499
+ return atoms;
500
+ }
501
+ scanAtoms(content) {
502
+ const atoms = [];
503
+ const strictMode = shouldUseStrictAtomSelection();
504
+ // 1. Sovereign Keywords - OPTIMIZED with compiled regex
505
+ const keywordRegex = this.getKeywordRegex();
506
+ if (keywordRegex) {
507
+ const lowerContent = content.toLowerCase();
508
+ const matches = lowerContent.match(keywordRegex);
509
+ if (matches) {
510
+ // Use cached lowercase->original mapping
511
+ const keywordMap = this.getKeywordMap();
512
+ const seen = new Set();
513
+ for (const match of matches) {
514
+ const original = keywordMap.get(match);
515
+ if (original && !seen.has(original)) {
516
+ seen.add(original);
517
+ atoms.push(this.createAtom(`#${original}`, 'concept'));
518
+ }
519
+ }
520
+ }
521
+ }
522
+ // 2. Explicit Content Tags (#tag)
523
+ const tagMatches = content.match(/#(\w+)/g);
524
+ if (tagMatches) {
525
+ const seen = new Set();
526
+ tagMatches.forEach(m => {
527
+ const tag = m.toLowerCase();
528
+ // In strict mode, filter out common words and low-value tags
529
+ if (strictMode) {
530
+ const cleanTag = tag.replace(/^#/, '');
531
+ // Skip if too short, common word, or looks like noise
532
+ if (cleanTag.length < 3 ||
533
+ this.isCommonWord(cleanTag) ||
534
+ this.isBlacklistedTag(tag)) {
535
+ return;
536
+ }
537
+ }
538
+ if (!seen.has(tag)) {
539
+ seen.add(tag);
540
+ atoms.push(this.createAtom(m, 'concept'));
541
+ }
542
+ });
543
+ }
544
+ // Deduplicate locally
545
+ const unique = new Map();
546
+ atoms.forEach(a => unique.set(a.id, a));
547
+ return Array.from(unique.values());
548
+ }
549
+ /**
550
+ * Check if a word is a common word that should be filtered in strict mode
551
+ */
552
+ isCommonWord(word) {
553
+ const commonWords = new Set([
554
+ 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
555
+ 'of', 'with', 'by', 'from', 'up', 'about', 'into', 'over', 'after',
556
+ 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
557
+ 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
558
+ 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they',
559
+ 'what', 'which', 'who', 'whom', 'whose', 'when', 'where', 'why', 'how',
560
+ 'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such',
561
+ 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
562
+ 'can', 'just', 'now', 'then', 'here', 'there', 'if', 'as', 'but', 'or'
563
+ ]);
564
+ return commonWords.has(word.toLowerCase());
565
+ }
566
+ // Cache for keywords and compiled regex
567
+ cachedKeywords = null;
568
+ cachedKeywordRegex = null;
569
+ cachedKeywordMap = null;
570
+ getKeywordRegex() {
571
+ if (this.cachedKeywordRegex !== null)
572
+ return this.cachedKeywordRegex;
573
+ const keywords = this.loadSovereignKeywords();
574
+ if (keywords.length === 0) {
575
+ return null;
576
+ }
577
+ // Escape regex special chars and join with | for single-pass matching
578
+ const escaped = keywords.map(kw => kw.toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
579
+ this.cachedKeywordRegex = new RegExp(`\\b(${escaped.join('|')})\\b`, 'gi');
580
+ return this.cachedKeywordRegex;
581
+ }
582
+ getKeywordMap() {
583
+ if (this.cachedKeywordMap)
584
+ return this.cachedKeywordMap;
585
+ const keywords = this.loadSovereignKeywords();
586
+ this.cachedKeywordMap = new Map();
587
+ for (const kw of keywords) {
588
+ this.cachedKeywordMap.set(kw.toLowerCase(), kw);
589
+ }
590
+ return this.cachedKeywordMap;
591
+ }
592
+ loadSovereignKeywords() {
593
+ if (this.cachedKeywords)
594
+ return this.cachedKeywords;
595
+ try {
596
+ // Check likely locations for internal_tags.json
597
+ const possiblePaths = [
598
+ path.join(process.cwd(), 'engine', 'context', 'internal_tags.json'),
599
+ path.join(process.cwd(), '..', 'engine', 'context', 'internal_tags.json'),
600
+ // engine/src/services/ingest -> ../../../../engine/context
601
+ path.join(__dirname, '../../../../engine/context/internal_tags.json'),
602
+ // Fallback to old location
603
+ path.join(process.cwd(), 'context', 'internal_tags.json')
604
+ ];
605
+ for (const p of possiblePaths) {
606
+ if (fs.existsSync(p)) {
607
+ const content = fs.readFileSync(p, 'utf-8');
608
+ const json = JSON.parse(content);
609
+ if (Array.isArray(json.keywords)) {
610
+ this.cachedKeywords = json.keywords;
611
+ return json.keywords;
612
+ }
613
+ }
614
+ }
615
+ this.cachedKeywords = [];
616
+ return [];
617
+ }
618
+ catch (e) {
619
+ console.error('[Atomizer] Failed to load internal_tags.json', e);
620
+ return [];
621
+ }
622
+ }
623
+ createAtom(label, type, weight = 1.0) {
624
+ return {
625
+ id: `atom_${crypto.createHash('sha256').update(label).digest('hex').substring(0, 12)}`,
626
+ label,
627
+ type,
628
+ weight
629
+ };
630
+ }
631
+ /**
632
+ * Splits content into molecules with byte offsets and extracted timestamps.
633
+ * Enhanced with Type awareness (Prose vs Code vs Data).
634
+ */
635
+ splitIntoMolecules(text, type = 'prose', maxSize = 1024) {
636
+ const results = [];
637
+ // Helper to get UTF-8 byte length of a string
638
+ const getByteLength = (str) => {
639
+ return Buffer.byteLength(str, 'utf8');
640
+ };
641
+ // Helper to convert string index to byte offset
642
+ const stringIndexToByteOffset = (str, stringIndex) => {
643
+ if (stringIndex <= 0)
644
+ return 0;
645
+ if (stringIndex >= str.length)
646
+ return getByteLength(str);
647
+ return getByteLength(str.substring(0, stringIndex));
648
+ };
649
+ // Helper to extract FIRST timestamp from a chunk (legacy - used for molecule splitting)
650
+ const extractTimestamp = (chunk) => {
651
+ // Match ISO timestamps: 2026-01-25T03:43:54.405Z or 2026-01-25 03:43:54
652
+ const isoRegex = /\b(\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}(?:\.\d{3})?Z?)\b/g;
653
+ let match = isoRegex.exec(chunk);
654
+ if (match) {
655
+ const ts = Date.parse(match[1]);
656
+ if (!isNaN(ts))
657
+ return ts;
658
+ }
659
+ // Match YYYY-MM-DD format (without time)
660
+ const dateRegex = /\b(20[2-9]\d-\d{2}-\d{2})\b/;
661
+ let match2 = chunk.match(dateRegex);
662
+ if (match2) {
663
+ const ts = Date.parse(match2[1]);
664
+ if (!isNaN(ts))
665
+ return ts;
666
+ }
667
+ // Match MM/DD/YYYY or DD/MM/YYYY format
668
+ const usDateRegex = /\b(\d{1,2}\/\d{1,2}\/\d{4})\b/;
669
+ let match3 = chunk.match(usDateRegex);
670
+ if (match3) {
671
+ const ts = Date.parse(match3[1]);
672
+ if (!isNaN(ts))
673
+ return ts;
674
+ }
675
+ // Match Month DD, YYYY format
676
+ const monthDayYearRegex = /\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),\s+(\d{4})\b/;
677
+ let match4 = chunk.match(monthDayYearRegex);
678
+ if (match4) {
679
+ const [, month, day, year] = match4;
680
+ const monthIndex = ['January', 'February', 'March', 'April', 'May', 'June',
681
+ 'July', 'August', 'September', 'October', 'November', 'December']
682
+ .indexOf(month);
683
+ const date = new Date(parseInt(year), monthIndex, parseInt(day));
684
+ if (!isNaN(date.getTime()))
685
+ return date.getTime();
686
+ }
687
+ // Match DD Month YYYY format
688
+ const dayMonthYearRegex = /\b(\d{1,2})\s+(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{4})\b/;
689
+ let match5 = chunk.match(dayMonthYearRegex);
690
+ if (match5) {
691
+ const [, day, month, year] = match5;
692
+ const monthIndex = ['January', 'February', 'March', 'April', 'May', 'June',
693
+ 'July', 'August', 'September', 'October', 'November', 'December']
694
+ .indexOf(month);
695
+ const date = new Date(parseInt(year), monthIndex, parseInt(day));
696
+ if (!isNaN(date.getTime()))
697
+ return date.getTime();
698
+ }
699
+ return undefined;
700
+ };
701
+ // --- STRATEGY: CODE (AST BLOCKS) ---
702
+ if (type === 'code') {
703
+ // "Heuristic AST": Split by top-level blocks (functions, classes) or chunks of logic.
704
+ // Using regex to detect block starts and tracking braces.
705
+ const lines = text.split('\n');
706
+ let currentBlock = '';
707
+ let blockStart = 0;
708
+ let currentCursor = 0;
709
+ let braceDepth = 0;
710
+ for (const line of lines) {
711
+ const lineWithNewline = line + '\n';
712
+ const lineByteLen = getByteLength(lineWithNewline);
713
+ const openBraces = (line.match(/\{/g) || []).length;
714
+ const closeBraces = (line.match(/\}/g) || []).length;
715
+ const prevDepth = braceDepth;
716
+ braceDepth += (openBraces - closeBraces);
717
+ currentBlock += lineWithNewline;
718
+ // End of a top-level block?
719
+ if (braceDepth === 0 && prevDepth > 0) {
720
+ // Just closed a root block (function/class)
721
+ results.push({ content: currentBlock, start: blockStart, end: currentCursor + lineByteLen, timestamp: extractTimestamp(currentBlock) });
722
+ currentBlock = '';
723
+ blockStart = currentCursor + lineByteLen;
724
+ }
725
+ // Double newline in root scope -> likely separate statements?
726
+ else if (braceDepth === 0 && line.trim() === '' && currentBlock.trim().length > 0) {
727
+ results.push({ content: currentBlock, start: blockStart, end: currentCursor + lineByteLen, timestamp: extractTimestamp(currentBlock) });
728
+ currentBlock = '';
729
+ blockStart = currentCursor + lineByteLen;
730
+ }
731
+ currentCursor += lineByteLen;
732
+ }
733
+ if (currentBlock.trim().length > 0) {
734
+ results.push({ content: currentBlock, start: blockStart, end: currentCursor, timestamp: extractTimestamp(currentBlock) });
735
+ }
736
+ }
737
+ else if (type === 'data') {
738
+ // --- STRATEGY: DATA (ROWS) ---
739
+ // Split by line
740
+ let cursor = 0;
741
+ const lines = text.split('\n');
742
+ for (const line of lines) {
743
+ const lineWithNewline = line + '\n';
744
+ const byteLen = getByteLength(lineWithNewline);
745
+ if (line.trim().length > 0) {
746
+ // Store without the newline in content, but account for it in byte offsets
747
+ const lineByteLen = getByteLength(line);
748
+ results.push({ content: line, start: cursor, end: cursor + lineByteLen, timestamp: extractTimestamp(line) });
749
+ }
750
+ cursor += byteLen;
751
+ }
752
+ }
753
+ else {
754
+ // --- STRATEGY: PROSE (SENTENCES with MARKDOWN FISSION) ---
755
+ // MARKDOWN FISSION: Split on code fences first to separate code from prose
756
+ const codeFenceRegex = /```[\s\S]*?```/g;
757
+ const codeFences = [];
758
+ let fenceMatch;
759
+ while ((fenceMatch = codeFenceRegex.exec(text)) !== null) {
760
+ const startByte = stringIndexToByteOffset(text, fenceMatch.index);
761
+ const endByte = stringIndexToByteOffset(text, fenceMatch.index + fenceMatch[0].length);
762
+ codeFences.push({
763
+ match: fenceMatch[0],
764
+ stringIndex: fenceMatch.index,
765
+ startByte: startByte,
766
+ endByte: endByte
767
+ });
768
+ }
769
+ // If we have code fences, split around them
770
+ if (codeFences.length > 0) {
771
+ let stringCursor = 0; // Track position in string indices
772
+ let byteCursor = 0; // Track position in byte offsets
773
+ for (const fence of codeFences) {
774
+ // Pre-fence prose
775
+ const fenceStringStart = fence.stringIndex;
776
+ if (fenceStringStart > stringCursor) {
777
+ const preProse = text.substring(stringCursor, fenceStringStart);
778
+ if (preProse.trim().length > 0) {
779
+ // Recursively split the prose portion into sentences
780
+ const proseParts = preProse.split(/(?<=[.!?])\s+(?=[A-Z])/);
781
+ let proseStringCursor = 0;
782
+ for (const part of proseParts) {
783
+ if (part.trim().length === 0)
784
+ continue;
785
+ const partStringStart = preProse.indexOf(part, proseStringCursor);
786
+ if (partStringStart !== -1) {
787
+ const partByteStart = byteCursor + stringIndexToByteOffset(preProse, partStringStart);
788
+ const partByteEnd = partByteStart + getByteLength(part);
789
+ results.push({ content: part, start: partByteStart, end: partByteEnd, timestamp: extractTimestamp(part) });
790
+ proseStringCursor = partStringStart + part.length;
791
+ }
792
+ }
793
+ }
794
+ }
795
+ // The code fence itself (will be typed as 'code' in molecule enrichment)
796
+ results.push({ content: fence.match, start: fence.startByte, end: fence.endByte, timestamp: extractTimestamp(fence.match) });
797
+ stringCursor = fenceStringStart + fence.match.length;
798
+ byteCursor = fence.endByte;
799
+ }
800
+ // Post-fence prose (after last fence)
801
+ if (stringCursor < text.length) {
802
+ const postProse = text.substring(stringCursor);
803
+ if (postProse.trim().length > 0) {
804
+ const proseParts = postProse.split(/(?<=[.!?])\s+(?=[A-Z])/);
805
+ let proseStringCursor = 0;
806
+ for (const part of proseParts) {
807
+ if (part.trim().length === 0)
808
+ continue;
809
+ const partStringStart = postProse.indexOf(part, proseStringCursor);
810
+ if (partStringStart !== -1) {
811
+ const partByteStart = byteCursor + stringIndexToByteOffset(postProse, partStringStart);
812
+ const partByteEnd = partByteStart + getByteLength(part);
813
+ results.push({ content: part, start: partByteStart, end: partByteEnd, timestamp: extractTimestamp(part) });
814
+ proseStringCursor = partStringStart + part.length;
815
+ }
816
+ }
817
+ }
818
+ }
819
+ }
820
+ else {
821
+ // No code fences - standard sentence splitting
822
+ const parts = text.split(/(?<=[.!?])\s+(?=[A-Z])/);
823
+ let searchStringCursor = 0;
824
+ for (const part of parts) {
825
+ if (part.trim().length === 0)
826
+ continue;
827
+ const realStringStart = text.indexOf(part, searchStringCursor); // Find next occurrence
828
+ if (realStringStart !== -1) {
829
+ const realByteStart = stringIndexToByteOffset(text, realStringStart);
830
+ const realByteEnd = realByteStart + getByteLength(part);
831
+ results.push({ content: part, start: realByteStart, end: realByteEnd, timestamp: extractTimestamp(part) });
832
+ searchStringCursor = realStringStart + part.length;
833
+ }
834
+ }
835
+ }
836
+ }
837
+ // --- ENFORCE SIZE LIMIT (POST-PROCESS) ---
838
+ const finalResults = [];
839
+ for (const item of results) {
840
+ const itemByteLen = getByteLength(item.content);
841
+ if (itemByteLen <= maxSize) {
842
+ finalResults.push(item);
843
+ }
844
+ else {
845
+ // Force split large molecules by byte size
846
+ let currentStart = item.start;
847
+ let remaining = item.content;
848
+ while (remaining.length > 0) {
849
+ // Find a safe split point that doesn't exceed maxSize bytes
850
+ let splitPoint = remaining.length;
851
+ let chunkByteLen = getByteLength(remaining);
852
+ // Binary search for the right split point if we're over the limit
853
+ if (chunkByteLen > maxSize) {
854
+ let low = 0;
855
+ let high = remaining.length;
856
+ while (low < high) {
857
+ const mid = Math.floor((low + high + 1) / 2);
858
+ const testChunk = remaining.substring(0, mid);
859
+ const testByteLen = getByteLength(testChunk);
860
+ if (testByteLen <= maxSize) {
861
+ low = mid;
862
+ }
863
+ else {
864
+ high = mid - 1;
865
+ }
866
+ }
867
+ // Walk back to nearest newline to avoid splitting mid-line
868
+ let newlinePos = remaining.lastIndexOf('\n', low);
869
+ splitPoint = newlinePos > 0 ? newlinePos + 1 : low;
870
+ }
871
+ const chunk = remaining.substring(0, splitPoint);
872
+ const chunkBytes = getByteLength(chunk);
873
+ // Inherit timestamp for all chunks if the original item had one
874
+ finalResults.push({
875
+ content: chunk,
876
+ start: currentStart,
877
+ end: currentStart + chunkBytes,
878
+ timestamp: item.timestamp
879
+ });
880
+ remaining = remaining.substring(splitPoint);
881
+ currentStart += chunkBytes;
882
+ }
883
+ }
884
+ }
885
+ return finalResults;
886
+ }
887
+ detectMoleculeType(text, filePath) {
888
+ // 1. File Extension hints
889
+ if (filePath.endsWith('.csv') || filePath.endsWith('.json') || filePath.endsWith('.yaml') || filePath.endsWith('.yml'))
890
+ return 'data';
891
+ if (filePath.match(/\.(ts|js|py|rs|go|cpp|h|c)$/))
892
+ return 'code';
893
+ // 2. Large file safety: treat files > 5MB as data to avoid regex timeout
894
+ if (text.length > 5 * 1024 * 1024) {
895
+ console.log(`[Atomizer] Large file (${(text.length / (1024 * 1024)).toFixed(1)}MB) - using data strategy for performance`);
896
+ return 'data';
897
+ }
898
+ // 3. Content Heuristics
899
+ if (text.trim().startsWith('|') && text.includes('|'))
900
+ return 'data'; // Markdown Table row
901
+ if (text.includes('```') || text.includes('function ') || text.includes('const ') || text.includes('import '))
902
+ return 'code';
903
+ return 'prose';
904
+ }
905
+ /**
906
+ * Extract earliest timestamp from content for temporal ordering
907
+ * Scans for multiple timestamp formats and returns the earliest found
908
+ */
909
+ extractEarliestTimestamp(chunk, fallbackTimestamp) {
910
+ const timestamps = [];
911
+ // ISO timestamps: 2026-01-25T03:43:54.405Z or 2026-01-25 03:43:54
912
+ const isoRegex = /\b(\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}(?:\.\d{3})?Z?)\b/g;
913
+ let isoMatch;
914
+ while ((isoMatch = isoRegex.exec(chunk)) !== null) {
915
+ const ts = Date.parse(isoMatch[1]);
916
+ if (!isNaN(ts))
917
+ timestamps.push(ts);
918
+ }
919
+ // YYYY-MM-DD
920
+ const dateRegex = /\b(20[2-9]\d-\d{2}-\d{2})\b/g;
921
+ let dateMatch;
922
+ while ((dateMatch = dateRegex.exec(chunk)) !== null) {
923
+ const ts = Date.parse(dateMatch[1]);
924
+ if (!isNaN(ts))
925
+ timestamps.push(ts);
926
+ }
927
+ // MM/DD/YYYY or DD/MM/YYYY
928
+ const usDateRegex = /\b(\d{1,2}\/\d{1,2}\/\d{4})\b/g;
929
+ let usMatch;
930
+ while ((usMatch = usDateRegex.exec(chunk)) !== null) {
931
+ const ts = Date.parse(usMatch[1]);
932
+ if (!isNaN(ts))
933
+ timestamps.push(ts);
934
+ }
935
+ // Return earliest timestamp found, or fallback
936
+ if (timestamps.length > 0) {
937
+ return Math.min(...timestamps);
938
+ }
939
+ return fallbackTimestamp || Date.now();
940
+ }
941
+ extractNumericData(text) {
942
+ // Examples: "1500 PSI", "15%", "$10.50"
943
+ const matches = text.match(/([\d,]+\.?\d*)\s?([A-Za-z%]+)?/g);
944
+ if (!matches)
945
+ return null;
946
+ let bestCandidate = null;
947
+ for (const m of matches) {
948
+ const valStr = m.match(/[\d,]+\.?\d*/)?.[0]?.replace(/,/g, '');
949
+ const unit = m.match(/[A-Za-z%]+/)?.[0];
950
+ if (valStr) {
951
+ const val = parseFloat(valStr);
952
+ // Filter out likely years (1900-2100) if no unit, to avoid false positives in history
953
+ if ((val >= 1900 && val <= 2100) && Number.isInteger(val) && !unit)
954
+ continue;
955
+ if (unit || !bestCandidate) {
956
+ bestCandidate = { value: val, unit: unit };
957
+ }
958
+ }
959
+ }
960
+ return bestCandidate;
961
+ }
962
+ generateSimHash(text) {
963
+ // Use @rbalchii/native-fingerprint if available
964
+ if (nativeFingerprint) {
965
+ try {
966
+ return nativeFingerprint(text);
967
+ }
968
+ catch { /* fall through to JS fallback */ }
969
+ }
970
+ // JS Fallback: Simple Jenkins Hash
971
+ let hash = 0;
972
+ if (text.length === 0)
973
+ return "0";
974
+ for (let i = 0; i < text.length; i++) {
975
+ const char = text.charCodeAt(i);
976
+ hash = ((hash << 5) - hash) + char;
977
+ hash = hash & hash; // Convert to 32bit integer
978
+ }
979
+ return Math.abs(hash).toString(16);
980
+ }
981
+ }
982
+ //# sourceMappingURL=atomizer-service.js.map