claude-brain 0.30.2 → 0.30.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (236)
  1. package/README.md +241 -191
  2. package/VERSION +1 -1
  3. package/assets/CLAUDE-unified.md +11 -11
  4. package/assets/CLAUDE.md +29 -29
  5. package/package.json +7 -3
  6. package/packs/backend/node.json +173 -173
  7. package/packs/core/javascript.json +176 -176
  8. package/packs/core/typescript.json +222 -222
  9. package/packs/frontend/react.json +254 -254
  10. package/packs/meta/testing.json +172 -172
  11. package/scripts/postinstall.mjs +531 -531
  12. package/src/automation/decision-detector.ts +452 -452
  13. package/src/automation/phase12-manager.ts +456 -456
  14. package/src/automation/proactive-recall.ts +373 -373
  15. package/src/automation/project-detector.ts +310 -310
  16. package/src/automation/repo-scanner.ts +210 -205
  17. package/src/cli/auto-setup.ts +75 -75
  18. package/src/cli/auto-start.ts +266 -266
  19. package/src/cli/bin.ts +264 -264
  20. package/src/cli/commands/autostart.ts +90 -90
  21. package/src/cli/commands/chroma.ts +578 -577
  22. package/src/cli/commands/export-training.ts +70 -70
  23. package/src/cli/commands/export.ts +130 -130
  24. package/src/cli/commands/git-hook.ts +183 -183
  25. package/src/cli/commands/hooks.ts +217 -217
  26. package/src/cli/commands/init.ts +123 -123
  27. package/src/cli/commands/install-mcp.ts +122 -111
  28. package/src/cli/commands/models.ts +979 -979
  29. package/src/cli/commands/pack.ts +200 -200
  30. package/src/cli/commands/refresh.ts +344 -339
  31. package/src/cli/commands/reindex.ts +120 -120
  32. package/src/cli/commands/serve.ts +466 -463
  33. package/src/cli/commands/start.ts +44 -44
  34. package/src/cli/commands/status.ts +220 -203
  35. package/src/cli/commands/uninstall-mcp.ts +45 -41
  36. package/src/cli/commands/update.ts +130 -124
  37. package/src/cli/migrate-chroma.ts +106 -106
  38. package/src/cli/ui/animations.ts +80 -80
  39. package/src/cli/ui/components.ts +82 -82
  40. package/src/cli/ui/index.ts +4 -4
  41. package/src/cli/ui/logo.ts +36 -36
  42. package/src/cli/ui/theme.ts +55 -55
  43. package/src/code-intelligence/indexer.ts +352 -352
  44. package/src/code-intelligence/linker.ts +178 -178
  45. package/src/code-intelligence/parser.ts +484 -484
  46. package/src/code-intelligence/query.ts +291 -291
  47. package/src/code-intelligence/schema.ts +83 -83
  48. package/src/code-intelligence/types.ts +95 -95
  49. package/src/config/defaults.ts +52 -52
  50. package/src/config/home.ts +56 -56
  51. package/src/config/index.ts +5 -5
  52. package/src/config/loader.ts +192 -192
  53. package/src/config/schema.ts +446 -415
  54. package/src/config/validator.ts +182 -182
  55. package/src/context/assembler.ts +407 -400
  56. package/src/context/index.ts +79 -79
  57. package/src/context/progress-tracker.ts +174 -174
  58. package/src/context/standards-manager.ts +287 -287
  59. package/src/context/validator.ts +58 -58
  60. package/src/diagnostics/index.ts +122 -121
  61. package/src/health/index.ts +233 -232
  62. package/src/hooks/brain-hook.ts +134 -131
  63. package/src/hooks/capture.ts +168 -168
  64. package/src/hooks/claude-code-mastery.md +112 -112
  65. package/src/hooks/context-hook.ts +260 -245
  66. package/src/hooks/deduplicator.ts +72 -72
  67. package/src/hooks/git-capture.ts +109 -109
  68. package/src/hooks/git-hook-installer.ts +211 -207
  69. package/src/hooks/index.ts +20 -20
  70. package/src/hooks/installer.ts +306 -288
  71. package/src/hooks/interceptor-hook.ts +204 -201
  72. package/src/hooks/passive-classifier.ts +397 -397
  73. package/src/hooks/queue.ts +160 -129
  74. package/src/hooks/session-tracker.ts +312 -312
  75. package/src/hooks/types.ts +52 -52
  76. package/src/index.ts +7 -7
  77. package/src/intelligence/cross-project/generalizer.ts +283 -283
  78. package/src/intelligence/cross-project/index.ts +7 -7
  79. package/src/intelligence/hf-downloader.ts +222 -222
  80. package/src/intelligence/hf-manifest.json +78 -78
  81. package/src/intelligence/index.ts +24 -24
  82. package/src/intelligence/inference-router.ts +762 -762
  83. package/src/intelligence/model-manager.ts +263 -245
  84. package/src/intelligence/optimization/index.ts +10 -10
  85. package/src/intelligence/optimization/precompute.ts +202 -202
  86. package/src/intelligence/optimization/semantic-cache.ts +213 -207
  87. package/src/intelligence/prediction/index.ts +7 -7
  88. package/src/intelligence/prediction/recommender.ts +276 -268
  89. package/src/intelligence/reasoning/chain-retrieval.ts +243 -247
  90. package/src/intelligence/reasoning/index.ts +7 -7
  91. package/src/intelligence/temporal/evolution.ts +193 -197
  92. package/src/intelligence/temporal/index.ts +16 -16
  93. package/src/intelligence/temporal/query-processor.ts +190 -190
  94. package/src/intelligence/temporal/timeline.ts +272 -259
  95. package/src/intelligence/temporal/trends.ts +263 -263
  96. package/src/intelligence/tokenizer.ts +118 -118
  97. package/src/knowledge/entity-extractor.ts +447 -443
  98. package/src/knowledge/graph/builder.ts +185 -185
  99. package/src/knowledge/graph/linker.ts +201 -201
  100. package/src/knowledge/graph/memory-graph.ts +359 -359
  101. package/src/knowledge/graph/schema.ts +99 -99
  102. package/src/knowledge/graph/search.ts +166 -166
  103. package/src/knowledge/relationship-extractor.ts +108 -108
  104. package/src/memory/chroma/client.ts +211 -192
  105. package/src/memory/chroma/collection-manager.ts +92 -92
  106. package/src/memory/chroma/config.ts +57 -57
  107. package/src/memory/chroma/embeddings.ts +177 -175
  108. package/src/memory/chroma/index.ts +82 -82
  109. package/src/memory/chroma/migration.ts +270 -270
  110. package/src/memory/chroma/schemas.ts +69 -69
  111. package/src/memory/chroma/search.ts +319 -315
  112. package/src/memory/chroma/store.ts +755 -747
  113. package/src/memory/compression.ts +121 -121
  114. package/src/memory/consolidation/archiver.ts +162 -165
  115. package/src/memory/consolidation/merger.ts +182 -186
  116. package/src/memory/consolidation/scorer.ts +136 -136
  117. package/src/memory/database.ts +9 -0
  118. package/src/memory/dual-write.ts +145 -0
  119. package/src/memory/embeddings.ts +226 -226
  120. package/src/memory/episodic/detector.ts +108 -108
  121. package/src/memory/episodic/manager.ts +347 -351
  122. package/src/memory/episodic/summarizer.ts +179 -179
  123. package/src/memory/episodic/types.ts +52 -52
  124. package/src/memory/fts5-search.ts +692 -633
  125. package/src/memory/index.ts +943 -1060
  126. package/src/memory/migrations/add-fts5.ts +118 -108
  127. package/src/memory/patterns.ts +438 -438
  128. package/src/memory/pruning.ts +60 -60
  129. package/src/memory/schema.ts +88 -88
  130. package/src/memory/store.ts +911 -787
  131. package/src/orchestrator/handlers/decision-handler.ts +204 -204
  132. package/src/packs/index.ts +9 -9
  133. package/src/packs/loader.ts +134 -134
  134. package/src/packs/manager.ts +204 -204
  135. package/src/packs/ranker.ts +78 -78
  136. package/src/packs/types.ts +81 -81
  137. package/src/phase12/index.ts +5 -5
  138. package/src/retrieval/bm25/index.ts +300 -297
  139. package/src/retrieval/bm25/tokenizer.ts +184 -184
  140. package/src/retrieval/feedback/adaptive.ts +221 -221
  141. package/src/retrieval/feedback/index.ts +16 -16
  142. package/src/retrieval/feedback/metrics.ts +221 -221
  143. package/src/retrieval/feedback/store.ts +283 -283
  144. package/src/retrieval/fusion/index.ts +194 -194
  145. package/src/retrieval/fusion/rrf.ts +165 -165
  146. package/src/retrieval/index.ts +12 -12
  147. package/src/retrieval/pipeline.ts +375 -375
  148. package/src/retrieval/query/expander.ts +203 -203
  149. package/src/retrieval/query/index.ts +27 -27
  150. package/src/retrieval/query/intent-classifier.ts +252 -252
  151. package/src/retrieval/query/temporal-parser.ts +295 -295
  152. package/src/retrieval/reranker/index.ts +189 -188
  153. package/src/retrieval/reranker/model.ts +99 -95
  154. package/src/retrieval/service.ts +125 -125
  155. package/src/retrieval/types.ts +162 -162
  156. package/src/routing/entity-extractor.ts +454 -454
  157. package/src/routing/handlers/exploration-handler.ts +369 -0
  158. package/src/routing/handlers/index.ts +19 -0
  159. package/src/routing/handlers/memory-handler.ts +273 -0
  160. package/src/routing/handlers/mutation-handler.ts +241 -0
  161. package/src/routing/handlers/recall-handler.ts +642 -0
  162. package/src/routing/handlers/shared.ts +515 -0
  163. package/src/routing/handlers/types.ts +48 -0
  164. package/src/routing/intent-classifier.ts +552 -552
  165. package/src/routing/response-filter.ts +399 -391
  166. package/src/routing/router.ts +245 -2193
  167. package/src/routing/search-engine.ts +521 -514
  168. package/src/routing/types.ts +104 -94
  169. package/src/scripts/health-check.ts +118 -118
  170. package/src/scripts/setup.ts +122 -122
  171. package/src/server/auto-updater.ts +283 -276
  172. package/src/server/handlers/call-tool.ts +159 -159
  173. package/src/server/handlers/list-tools.ts +35 -35
  174. package/src/server/handlers/tools/auto-remember.ts +165 -165
  175. package/src/server/handlers/tools/brain.ts +86 -86
  176. package/src/server/handlers/tools/create-project.ts +135 -135
  177. package/src/server/handlers/tools/get-code-standards.ts +123 -123
  178. package/src/server/handlers/tools/get-corrections.ts +152 -152
  179. package/src/server/handlers/tools/get-patterns.ts +156 -156
  180. package/src/server/handlers/tools/get-project-context.ts +75 -75
  181. package/src/server/handlers/tools/index.ts +30 -30
  182. package/src/server/handlers/tools/init-project.ts +756 -756
  183. package/src/server/handlers/tools/list-projects.ts +126 -126
  184. package/src/server/handlers/tools/recall-similar.ts +87 -87
  185. package/src/server/handlers/tools/recognize-pattern.ts +132 -132
  186. package/src/server/handlers/tools/record-correction.ts +131 -131
  187. package/src/server/handlers/tools/remember-decision.ts +168 -168
  188. package/src/server/handlers/tools/schemas.ts +179 -179
  189. package/src/server/handlers/tools/search-code.ts +122 -122
  190. package/src/server/handlers/tools/smart-context.ts +146 -146
  191. package/src/server/handlers/tools/update-progress.ts +131 -131
  192. package/src/server/http-api.ts +215 -1229
  193. package/src/server/mcp-proxy.ts +85 -84
  194. package/src/server/mcp-server.ts +285 -284
  195. package/src/server/middleware/auth.ts +39 -0
  196. package/src/server/middleware/error-handler.ts +37 -0
  197. package/src/server/middleware/rate-limit.ts +53 -0
  198. package/src/server/middleware/validate.ts +42 -0
  199. package/src/server/pid-manager.ts +137 -136
  200. package/src/server/providers/resources.ts +581 -581
  201. package/src/server/routes/code.ts +228 -0
  202. package/src/server/routes/context.ts +26 -0
  203. package/src/server/routes/health.ts +19 -0
  204. package/src/server/routes/helpers.ts +100 -0
  205. package/src/server/routes/hooks.ts +197 -0
  206. package/src/server/routes/mcp.ts +47 -0
  207. package/src/server/routes/memory.ts +397 -0
  208. package/src/server/routes/models.ts +96 -0
  209. package/src/server/routes/projects.ts +89 -0
  210. package/src/server/routes/types.ts +21 -0
  211. package/src/server/schemas/api-schemas.ts +202 -0
  212. package/src/server/services.ts +720 -720
  213. package/src/server/utils/memory-indicator.ts +84 -84
  214. package/src/server/utils/response-formatter.ts +129 -129
  215. package/src/server/web-viewer.ts +1145 -1115
  216. package/src/setup/index.ts +38 -38
  217. package/src/tools/registry.ts +115 -115
  218. package/src/tools/schemas.ts +666 -666
  219. package/src/tools/types.ts +412 -412
  220. package/src/training/data-store.ts +320 -298
  221. package/src/training/retrain-pipeline.ts +399 -394
  222. package/src/utils/error-handler.ts +136 -136
  223. package/src/utils/index.ts +58 -58
  224. package/src/utils/kill-port.ts +55 -53
  225. package/src/utils/phase12-helper.ts +56 -56
  226. package/src/utils/safe-path.ts +43 -0
  227. package/src/utils/timing.ts +47 -47
  228. package/src/utils/transaction.ts +63 -63
  229. package/src/vault/index.ts +4 -3
  230. package/src/vault/paths.ts +106 -106
  231. package/src/vault/query.ts +4 -1
  232. package/src/vault/reader.ts +44 -1
  233. package/src/vault/watcher.ts +24 -1
  234. package/src/vault/writer.ts +487 -413
  235. package/skills/persistent-memory/SKILL.md +0 -148
  236. package/skills/persistent-memory/references/tool-reference.md +0 -90
@@ -1,443 +1,447 @@
1
- /**
2
- * Entity Extractor
3
- * Rule-based NER using compromise + technology dictionary
4
- */
5
-
6
- import type { EntityType } from './graph/schema'
7
-
8
- export interface ExtractedEntity {
9
- name: string
10
- normalizedName: string
11
- type: EntityType
12
- confidence: number
13
- source: 'dictionary' | 'nlp' | 'rule' | 'model'
14
- positions: number[]
15
- }
16
-
17
- const TECH_DICTIONARY: Record<string, { aliases: string[]; type: EntityType }> = {
18
- // Languages
19
- typescript: { aliases: ['ts', 'typescript', 'type-script'], type: 'technology' },
20
- javascript: { aliases: ['js', 'javascript', 'ecmascript', 'es6', 'es2015', 'es2020', 'es2021', 'es2022'], type: 'technology' },
21
- python: { aliases: ['py', 'python3', 'python2'], type: 'technology' },
22
- rust: { aliases: ['rust-lang', 'rustlang'], type: 'technology' },
23
- go: { aliases: ['golang', 'go-lang'], type: 'technology' },
24
- java: { aliases: [], type: 'technology' },
25
- csharp: { aliases: ['c#', 'c-sharp', 'dotnet', '.net'], type: 'technology' },
26
- ruby: { aliases: ['rb'], type: 'technology' },
27
- php: { aliases: [], type: 'technology' },
28
- swift: { aliases: [], type: 'technology' },
29
- kotlin: { aliases: ['kt'], type: 'technology' },
30
- scala: { aliases: [], type: 'technology' },
31
- elixir: { aliases: [], type: 'technology' },
32
- clojure: { aliases: ['clj'], type: 'technology' },
33
- haskell: { aliases: ['hs'], type: 'technology' },
34
- lua: { aliases: [], type: 'technology' },
35
- perl: { aliases: ['pl'], type: 'technology' },
36
- r: { aliases: ['r-lang', 'rlang'], type: 'technology' },
37
- dart: { aliases: [], type: 'technology' },
38
- sql: { aliases: ['mysql', 'postgresql', 'postgres', 'sqlite', 'mssql', 'mariadb'], type: 'technology' },
39
- html: { aliases: ['html5'], type: 'technology' },
40
- css: { aliases: ['css3', 'scss', 'sass', 'less', 'stylus'], type: 'technology' },
41
- graphql: { aliases: ['gql'], type: 'technology' },
42
- yaml: { aliases: ['yml'], type: 'technology' },
43
- json: { aliases: [], type: 'technology' },
44
- markdown: { aliases: ['md'], type: 'technology' },
45
- bash: { aliases: ['sh', 'shell', 'zsh', 'fish'], type: 'technology' },
46
- c: { aliases: ['c-lang', 'ansi-c'], type: 'technology' },
47
- cpp: { aliases: ['c++', 'cplusplus'], type: 'technology' },
48
-
49
- // Frontend Frameworks
50
- react: { aliases: ['reactjs', 'react.js', 'react-dom'], type: 'technology' },
51
- vue: { aliases: ['vuejs', 'vue.js', 'vue3', 'vue2'], type: 'technology' },
52
- angular: { aliases: ['angularjs', 'angular.js', 'ng'], type: 'technology' },
53
- svelte: { aliases: ['sveltejs', 'sveltekit'], type: 'technology' },
54
- nextjs: { aliases: ['next.js', 'next', 'nextjs'], type: 'technology' },
55
- nuxt: { aliases: ['nuxtjs', 'nuxt.js'], type: 'technology' },
56
- remix: { aliases: ['remix-run'], type: 'technology' },
57
- astro: { aliases: ['astrojs'], type: 'technology' },
58
- gatsby: { aliases: ['gatsbyjs'], type: 'technology' },
59
- solid: { aliases: ['solidjs', 'solid-js'], type: 'technology' },
60
- preact: { aliases: ['preactjs'], type: 'technology' },
61
- htmx: { aliases: [], type: 'technology' },
62
- jquery: { aliases: [], type: 'technology' },
63
-
64
- // Backend Frameworks
65
- express: { aliases: ['expressjs', 'express.js'], type: 'technology' },
66
- fastify: { aliases: [], type: 'technology' },
67
- hono: { aliases: [], type: 'technology' },
68
- koa: { aliases: ['koajs'], type: 'technology' },
69
- nestjs: { aliases: ['nest.js', 'nest'], type: 'technology' },
70
- django: { aliases: [], type: 'technology' },
71
- flask: { aliases: [], type: 'technology' },
72
- fastapi: { aliases: ['fast-api'], type: 'technology' },
73
- rails: { aliases: ['ruby-on-rails', 'ror'], type: 'technology' },
74
- spring: { aliases: ['spring-boot', 'springboot'], type: 'technology' },
75
- laravel: { aliases: [], type: 'technology' },
76
- phoenix: { aliases: [], type: 'technology' },
77
- gin: { aliases: [], type: 'technology' },
78
- actix: { aliases: ['actix-web'], type: 'technology' },
79
-
80
- // Databases
81
- mongodb: { aliases: ['mongo'], type: 'technology' },
82
- redis: { aliases: [], type: 'technology' },
83
- elasticsearch: { aliases: ['elastic', 'es'], type: 'technology' },
84
- dynamodb: { aliases: ['dynamo'], type: 'technology' },
85
- cassandra: { aliases: [], type: 'technology' },
86
- neo4j: { aliases: [], type: 'technology' },
87
- couchdb: { aliases: ['couch'], type: 'technology' },
88
- firebase: { aliases: ['firestore'], type: 'technology' },
89
- supabase: { aliases: [], type: 'technology' },
90
- prisma: { aliases: [], type: 'technology' },
91
- drizzle: { aliases: ['drizzle-orm'], type: 'technology' },
92
- sequelize: { aliases: [], type: 'technology' },
93
- typeorm: { aliases: [], type: 'technology' },
94
- chromadb: { aliases: ['chroma'], type: 'technology' },
95
- pinecone: { aliases: [], type: 'technology' },
96
- weaviate: { aliases: [], type: 'technology' },
97
- qdrant: { aliases: [], type: 'technology' },
98
-
99
- // Cloud & DevOps
100
- aws: { aliases: ['amazon-web-services', 'amazon'], type: 'technology' },
101
- gcp: { aliases: ['google-cloud', 'google-cloud-platform'], type: 'technology' },
102
- azure: { aliases: ['microsoft-azure'], type: 'technology' },
103
- docker: { aliases: ['dockerfile', 'docker-compose'], type: 'technology' },
104
- kubernetes: { aliases: ['k8s', 'kube'], type: 'technology' },
105
- terraform: { aliases: ['tf'], type: 'technology' },
106
- ansible: { aliases: [], type: 'technology' },
107
- jenkins: { aliases: [], type: 'technology' },
108
- github: { aliases: ['gh'], type: 'technology' },
109
- gitlab: { aliases: [], type: 'technology' },
110
- vercel: { aliases: [], type: 'technology' },
111
- netlify: { aliases: [], type: 'technology' },
112
- cloudflare: { aliases: ['cf'], type: 'technology' },
113
- nginx: { aliases: [], type: 'technology' },
114
- caddy: { aliases: [], type: 'technology' },
115
-
116
- // Tools & Libraries
117
- webpack: { aliases: [], type: 'technology' },
118
- vite: { aliases: [], type: 'technology' },
119
- esbuild: { aliases: [], type: 'technology' },
120
- rollup: { aliases: ['rollupjs'], type: 'technology' },
121
- parcel: { aliases: [], type: 'technology' },
122
- turbopack: { aliases: [], type: 'technology' },
123
- bun: { aliases: ['bunjs'], type: 'technology' },
124
- deno: { aliases: [], type: 'technology' },
125
- node: { aliases: ['nodejs', 'node.js'], type: 'technology' },
126
- npm: { aliases: [], type: 'technology' },
127
- yarn: { aliases: [], type: 'technology' },
128
- pnpm: { aliases: [], type: 'technology' },
129
- git: { aliases: [], type: 'technology' },
130
- jest: { aliases: [], type: 'technology' },
131
- vitest: { aliases: [], type: 'technology' },
132
- mocha: { aliases: [], type: 'technology' },
133
- cypress: { aliases: [], type: 'technology' },
134
- playwright: { aliases: [], type: 'technology' },
135
- eslint: { aliases: [], type: 'technology' },
136
- prettier: { aliases: [], type: 'technology' },
137
- biome: { aliases: ['biomejs'], type: 'technology' },
138
- tailwind: { aliases: ['tailwindcss', 'tailwind-css'], type: 'technology' },
139
- bootstrap: { aliases: [], type: 'technology' },
140
- zod: { aliases: [], type: 'technology' },
141
- trpc: { aliases: ['t-rpc'], type: 'technology' },
142
- grpc: { aliases: ['g-rpc'], type: 'technology' },
143
- websocket: { aliases: ['ws', 'websockets'], type: 'technology' },
144
- oauth: { aliases: ['oauth2', 'oauth2.0'], type: 'technology' },
145
- jwt: { aliases: ['json-web-token'], type: 'technology' },
146
- openai: { aliases: ['gpt', 'chatgpt', 'gpt-4'], type: 'technology' },
147
- anthropic: { aliases: ['claude', 'claude-ai'], type: 'technology' },
148
- langchain: { aliases: [], type: 'technology' },
149
- llamaindex: { aliases: ['llama-index'], type: 'technology' },
150
- huggingface: { aliases: ['hf', 'hugging-face'], type: 'technology' },
151
- tensorflow: { aliases: ['tf'], type: 'technology' },
152
- pytorch: { aliases: ['torch'], type: 'technology' },
153
- pino: { aliases: [], type: 'technology' },
154
- winston: { aliases: [], type: 'technology' },
155
- storybook: { aliases: [], type: 'technology' },
156
- nx: { aliases: [], type: 'technology' },
157
- turborepo: { aliases: [], type: 'technology' },
158
- lerna: { aliases: [], type: 'technology' },
159
- compromise: { aliases: ['compromise-nlp'], type: 'technology' },
160
- minisearch: { aliases: [], type: 'technology' },
161
-
162
- // Concepts
163
- microservices: { aliases: ['micro-services', 'microservice'], type: 'concept' },
164
- monolith: { aliases: ['monolithic'], type: 'concept' },
165
- serverless: { aliases: ['faas', 'lambda'], type: 'concept' },
166
- rest: { aliases: ['restful', 'rest-api'], type: 'concept' },
167
- api: { aliases: ['apis'], type: 'concept' },
168
- ci: { aliases: ['continuous-integration'], type: 'concept' },
169
- cd: { aliases: ['continuous-deployment', 'continuous-delivery'], type: 'concept' },
170
- tdd: { aliases: ['test-driven-development'], type: 'concept' },
171
- bdd: { aliases: ['behavior-driven-development'], type: 'concept' },
172
- ddd: { aliases: ['domain-driven-design'], type: 'concept' },
173
- cqrs: { aliases: ['command-query-responsibility-segregation'], type: 'concept' },
174
- mvc: { aliases: ['model-view-controller'], type: 'concept' },
175
- mvvm: { aliases: ['model-view-viewmodel'], type: 'concept' },
176
- oop: { aliases: ['object-oriented-programming', 'object-oriented'], type: 'concept' },
177
- fp: { aliases: ['functional-programming', 'functional'], type: 'concept' },
178
- ssr: { aliases: ['server-side-rendering'], type: 'concept' },
179
- ssg: { aliases: ['static-site-generation'], type: 'concept' },
180
- spa: { aliases: ['single-page-application', 'single-page-app'], type: 'concept' },
181
- pwa: { aliases: ['progressive-web-app'], type: 'concept' },
182
- mcp: { aliases: ['model-context-protocol'], type: 'concept' },
183
- rag: { aliases: ['retrieval-augmented-generation'], type: 'concept' },
184
- embedding: { aliases: ['embeddings', 'vector-embedding'], type: 'concept' },
185
- 'vector-search': { aliases: ['semantic-search', 'vector-similarity'], type: 'concept' },
186
- }
187
-
188
- // Build reverse lookup: alias → normalized name
189
- const ALIAS_MAP = new Map<string, string>()
190
- for (const [normalized, entry] of Object.entries(TECH_DICTIONARY)) {
191
- ALIAS_MAP.set(normalized.toLowerCase(), normalized)
192
- for (const alias of entry.aliases) {
193
- ALIAS_MAP.set(alias.toLowerCase(), normalized)
194
- }
195
- }
196
-
197
- // File path regex - handles ./path, ~/path, /path, and relative paths
198
- const FILE_PATH_REGEX = /(?:^|[\s(,])((?:\.\/|~\/|\/)?(?:[\w-]+\/)+[\w.-]+\.\w+)/g
199
- // URL regex
200
- const URL_REGEX = /https?:\/\/[^\s),]+/g
201
- // Version number regex
202
- let nlpModule: any = null
203
-
204
- async function loadNlp(): Promise<any> {
205
- if (!nlpModule) {
206
- try {
207
- nlpModule = (await import('compromise')).default
208
- } catch {
209
- nlpModule = null
210
- }
211
- }
212
- return nlpModule
213
- }
214
-
215
- export class EntityExtractor {
216
- private nlpLoaded = false
217
- private nlp: any = null
218
-
219
- async initialize(): Promise<void> {
220
- this.nlp = await loadNlp()
221
- this.nlpLoaded = this.nlp !== null
222
- }
223
-
224
- extract(text: string): ExtractedEntity[] {
225
- const startTime = Date.now()
226
- const entities: Map<string, ExtractedEntity> = new Map()
227
-
228
- this.extractFromDictionary(text, entities)
229
- this.extractFilePaths(text, entities)
230
- this.extractUrls(text, entities)
231
- this.extractDates(text, entities)
232
-
233
- if (this.nlpLoaded && this.nlp) {
234
- this.extractWithNlp(text, entities)
235
- }
236
-
237
- const result = Array.from(entities.values())
238
- .sort((a, b) => b.confidence - a.confidence)
239
-
240
- // SLM Phase 1A: Log extraction for training data collection
241
- this._logTraining(text, result, startTime)
242
-
243
- return result
244
- }
245
-
246
- /**
247
- * SLM Phase 1A: Log entity extraction result for training data.
248
- */
249
- private _logTraining(text: string, entities: ExtractedEntity[], startTime: number): void {
250
- try {
251
- const { logTrainingData } = require('@/training/data-store')
252
- logTrainingData({
253
- task: 'entity' as const,
254
- input: text,
255
- output: JSON.stringify(entities.map(e => ({
256
- text: e.name,
257
- type: e.type,
258
- normalized: e.normalizedName,
259
- confidence: e.confidence,
260
- source: e.source,
261
- positions: e.positions,
262
- }))),
263
- metadata: JSON.stringify({ count: entities.length, elapsed_ms: Date.now() - startTime }),
264
- })
265
- } catch {
266
- // Non-critical
267
- }
268
- }
269
-
270
- extractBatch(texts: string[]): ExtractedEntity[][] {
271
- return texts.map(text => this.extract(text))
272
- }
273
-
274
- private extractFromDictionary(text: string, entities: Map<string, ExtractedEntity>): void {
275
- const lowerText = text.toLowerCase()
276
- const words = lowerText.split(/[\s,;:()[\]{}"'`|/\\]+/)
277
-
278
- for (const word of words) {
279
- const cleaned = word.replace(/^[^a-z0-9]+|[^a-z0-9]+$/g, '')
280
- if (cleaned.length < 2) continue
281
-
282
- const normalized = ALIAS_MAP.get(cleaned)
283
- if (normalized) {
284
- const dictEntry = TECH_DICTIONARY[normalized]
285
- if (!dictEntry) continue
286
-
287
- const existing = entities.get(normalized)
288
- if (existing) {
289
- existing.positions.push(lowerText.indexOf(cleaned))
290
- existing.confidence = Math.min(1.0, existing.confidence + 0.05)
291
- } else {
292
- entities.set(normalized, {
293
- name: cleaned,
294
- normalizedName: normalized,
295
- type: dictEntry.type,
296
- confidence: 0.95,
297
- source: 'dictionary',
298
- positions: [lowerText.indexOf(cleaned)]
299
- })
300
- }
301
- }
302
- }
303
-
304
- // Also check multi-word aliases
305
- for (const [alias, normalized] of ALIAS_MAP) {
306
- if (alias.includes('-') || alias.includes('.') || alias.includes(' ')) {
307
- if (lowerText.includes(alias) && !entities.has(normalized)) {
308
- const dictEntry = TECH_DICTIONARY[normalized]
309
- if (!dictEntry) continue
310
-
311
- entities.set(normalized, {
312
- name: alias,
313
- normalizedName: normalized,
314
- type: dictEntry.type,
315
- confidence: 0.95,
316
- source: 'dictionary',
317
- positions: [lowerText.indexOf(alias)]
318
- })
319
- }
320
- }
321
- }
322
- }
323
-
324
- private extractFilePaths(text: string, entities: Map<string, ExtractedEntity>): void {
325
- let match: RegExpExecArray | null
326
- const regex = new RegExp(FILE_PATH_REGEX.source, FILE_PATH_REGEX.flags)
327
-
328
- while ((match = regex.exec(text)) !== null) {
329
- const filePath = match[1]!.trim()
330
- if (filePath.length < 4) continue
331
-
332
- const key = `file:${filePath}`
333
- if (!entities.has(key)) {
334
- entities.set(key, {
335
- name: filePath,
336
- normalizedName: filePath,
337
- type: 'file',
338
- confidence: 0.85,
339
- source: 'rule',
340
- positions: [match.index]
341
- })
342
- }
343
- }
344
- }
345
-
346
- private extractUrls(text: string, entities: Map<string, ExtractedEntity>): void {
347
- let match: RegExpExecArray | null
348
- const regex = new RegExp(URL_REGEX.source, URL_REGEX.flags)
349
-
350
- while ((match = regex.exec(text)) !== null) {
351
- const url = match[0]
352
- const key = `url:${url}`
353
- if (!entities.has(key)) {
354
- entities.set(key, {
355
- name: url,
356
- normalizedName: url,
357
- type: 'file',
358
- confidence: 0.9,
359
- source: 'rule',
360
- positions: [match.index]
361
- })
362
- }
363
- }
364
- }
365
-
366
- private extractDates(text: string, entities: Map<string, ExtractedEntity>): void {
367
- // ISO date pattern
368
- const isoDateRegex = /\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?)?\b/g
369
- let match: RegExpExecArray | null
370
-
371
- while ((match = isoDateRegex.exec(text)) !== null) {
372
- const dateStr = match[0]
373
- const key = `date:${dateStr}`
374
- if (!entities.has(key)) {
375
- entities.set(key, {
376
- name: dateStr,
377
- normalizedName: dateStr,
378
- type: 'date',
379
- confidence: 0.9,
380
- source: 'rule',
381
- positions: [match.index]
382
- })
383
- }
384
- }
385
- }
386
-
387
- private extractWithNlp(text: string, entities: Map<string, ExtractedEntity>): void {
388
- try {
389
- const doc = this.nlp(text)
390
-
391
- // Extract people
392
- const people = doc.people()
393
- if (people && people.length > 0) {
394
- for (const person of people.out('array') as string[]) {
395
- const cleaned = person.trim()
396
- if (cleaned.length < 2) continue
397
- const key = `person:${cleaned.toLowerCase()}`
398
- if (!entities.has(key)) {
399
- entities.set(key, {
400
- name: cleaned,
401
- normalizedName: cleaned.toLowerCase(),
402
- type: 'person',
403
- confidence: 0.7,
404
- source: 'nlp',
405
- positions: [text.indexOf(cleaned)]
406
- })
407
- }
408
- }
409
- }
410
-
411
- // Extract organizations
412
- const orgs = doc.organizations()
413
- if (orgs && orgs.length > 0) {
414
- for (const org of orgs.out('array') as string[]) {
415
- const cleaned = org.trim()
416
- if (cleaned.length < 2) continue
417
- // Check if it's already a known technology
418
- const normalizedCheck = ALIAS_MAP.get(cleaned.toLowerCase())
419
- if (normalizedCheck) continue
420
-
421
- const key = `org:${cleaned.toLowerCase()}`
422
- if (!entities.has(key)) {
423
- entities.set(key, {
424
- name: cleaned,
425
- normalizedName: cleaned.toLowerCase(),
426
- type: 'concept',
427
- confidence: 0.65,
428
- source: 'nlp',
429
- positions: [text.indexOf(cleaned)]
430
- })
431
- }
432
- }
433
- }
434
- } catch {
435
- // NLP extraction failure is non-critical
436
- }
437
- }
438
-
439
- static normalizeEntityName(name: string): string {
440
- const lower = name.toLowerCase().trim()
441
- return ALIAS_MAP.get(lower) || lower
442
- }
443
- }
1
+ /**
2
+ * Entity Extractor
3
+ * Rule-based NER using compromise + technology dictionary
4
+ */
5
+
6
+ import type { EntityType } from './graph/schema'
7
+
8
+ export interface ExtractedEntity {
9
+ name: string
10
+ normalizedName: string
11
+ type: EntityType
12
+ confidence: number
13
+ source: 'dictionary' | 'nlp' | 'rule' | 'model'
14
+ positions: number[]
15
+ }
16
+
17
/**
 * Known technologies and concepts, keyed by canonical (normalized) name.
 *
 * Each entry lists the alternative spellings that should resolve to the
 * canonical name, plus the entity type reported for a match.
 *
 * Invariants kept by this table (the reverse lookup below can only store one
 * target per spelling):
 *  - an alias appears under exactly one entry;
 *  - an entry does not list its own key as an alias (the key is always
 *    registered automatically).
 *
 * `tf` is ambiguous between Terraform and TensorFlow; it is listed only under
 * `tensorflow`, which is what the lookup has always resolved it to.
 */
const TECH_DICTIONARY: Record<string, { aliases: string[]; type: EntityType }> = {
  // Languages
  typescript: { aliases: ['ts', 'type-script'], type: 'technology' },
  javascript: { aliases: ['js', 'javascript', 'ecmascript', 'es6', 'es2015', 'es2020', 'es2021', 'es2022'], type: 'technology' },
  python: { aliases: ['py', 'python3', 'python2'], type: 'technology' },
  rust: { aliases: ['rust-lang', 'rustlang'], type: 'technology' },
  go: { aliases: ['golang', 'go-lang'], type: 'technology' },
  java: { aliases: [], type: 'technology' },
  csharp: { aliases: ['c#', 'c-sharp', 'dotnet', '.net'], type: 'technology' },
  ruby: { aliases: ['rb'], type: 'technology' },
  php: { aliases: [], type: 'technology' },
  swift: { aliases: [], type: 'technology' },
  kotlin: { aliases: ['kt'], type: 'technology' },
  scala: { aliases: [], type: 'technology' },
  elixir: { aliases: [], type: 'technology' },
  clojure: { aliases: ['clj'], type: 'technology' },
  haskell: { aliases: ['hs'], type: 'technology' },
  lua: { aliases: [], type: 'technology' },
  perl: { aliases: ['pl'], type: 'technology' },
  r: { aliases: ['r-lang', 'rlang'], type: 'technology' },
  dart: { aliases: [], type: 'technology' },
  sql: { aliases: ['mysql', 'postgresql', 'postgres', 'sqlite', 'mssql', 'mariadb'], type: 'technology' },
  html: { aliases: ['html5'], type: 'technology' },
  css: { aliases: ['css3', 'scss', 'sass', 'less', 'stylus'], type: 'technology' },
  graphql: { aliases: ['gql'], type: 'technology' },
  yaml: { aliases: ['yml'], type: 'technology' },
  json: { aliases: [], type: 'technology' },
  markdown: { aliases: ['md'], type: 'technology' },
  bash: { aliases: ['sh', 'shell', 'zsh', 'fish'], type: 'technology' },
  c: { aliases: ['c-lang', 'ansi-c'], type: 'technology' },
  cpp: { aliases: ['c++', 'cplusplus'], type: 'technology' },

  // Frontend Frameworks
  react: { aliases: ['reactjs', 'react.js', 'react-dom'], type: 'technology' },
  vue: { aliases: ['vuejs', 'vue.js', 'vue3', 'vue2'], type: 'technology' },
  angular: { aliases: ['angularjs', 'angular.js', 'ng'], type: 'technology' },
  svelte: { aliases: ['sveltejs', 'sveltekit'], type: 'technology' },
  nextjs: { aliases: ['next.js', 'next'], type: 'technology' },
  nuxt: { aliases: ['nuxtjs', 'nuxt.js'], type: 'technology' },
  remix: { aliases: ['remix-run'], type: 'technology' },
  astro: { aliases: ['astrojs'], type: 'technology' },
  gatsby: { aliases: ['gatsbyjs'], type: 'technology' },
  solid: { aliases: ['solidjs', 'solid-js'], type: 'technology' },
  preact: { aliases: ['preactjs'], type: 'technology' },
  htmx: { aliases: [], type: 'technology' },
  jquery: { aliases: [], type: 'technology' },

  // Backend Frameworks
  express: { aliases: ['expressjs', 'express.js'], type: 'technology' },
  fastify: { aliases: [], type: 'technology' },
  hono: { aliases: [], type: 'technology' },
  koa: { aliases: ['koajs'], type: 'technology' },
  nestjs: { aliases: ['nest.js', 'nest'], type: 'technology' },
  django: { aliases: [], type: 'technology' },
  flask: { aliases: [], type: 'technology' },
  fastapi: { aliases: ['fast-api'], type: 'technology' },
  rails: { aliases: ['ruby-on-rails', 'ror'], type: 'technology' },
  spring: { aliases: ['spring-boot', 'springboot'], type: 'technology' },
  laravel: { aliases: [], type: 'technology' },
  phoenix: { aliases: [], type: 'technology' },
  gin: { aliases: [], type: 'technology' },
  actix: { aliases: ['actix-web'], type: 'technology' },

  // Databases
  mongodb: { aliases: ['mongo'], type: 'technology' },
  redis: { aliases: [], type: 'technology' },
  elasticsearch: { aliases: ['elastic', 'es'], type: 'technology' },
  dynamodb: { aliases: ['dynamo'], type: 'technology' },
  cassandra: { aliases: [], type: 'technology' },
  neo4j: { aliases: [], type: 'technology' },
  couchdb: { aliases: ['couch'], type: 'technology' },
  firebase: { aliases: ['firestore'], type: 'technology' },
  supabase: { aliases: [], type: 'technology' },
  prisma: { aliases: [], type: 'technology' },
  drizzle: { aliases: ['drizzle-orm'], type: 'technology' },
  sequelize: { aliases: [], type: 'technology' },
  typeorm: { aliases: [], type: 'technology' },
  chromadb: { aliases: ['chroma'], type: 'technology' },
  pinecone: { aliases: [], type: 'technology' },
  weaviate: { aliases: [], type: 'technology' },
  qdrant: { aliases: [], type: 'technology' },

  // Cloud & DevOps
  aws: { aliases: ['amazon-web-services', 'amazon'], type: 'technology' },
  gcp: { aliases: ['google-cloud', 'google-cloud-platform'], type: 'technology' },
  azure: { aliases: ['microsoft-azure'], type: 'technology' },
  docker: { aliases: ['dockerfile', 'docker-compose'], type: 'technology' },
  kubernetes: { aliases: ['k8s', 'kube'], type: 'technology' },
  // 'tf' intentionally not listed here: it resolves to tensorflow (see header note).
  terraform: { aliases: [], type: 'technology' },
  ansible: { aliases: [], type: 'technology' },
  jenkins: { aliases: [], type: 'technology' },
  github: { aliases: ['gh'], type: 'technology' },
  gitlab: { aliases: [], type: 'technology' },
  vercel: { aliases: [], type: 'technology' },
  netlify: { aliases: [], type: 'technology' },
  cloudflare: { aliases: ['cf'], type: 'technology' },
  nginx: { aliases: [], type: 'technology' },
  caddy: { aliases: [], type: 'technology' },

  // Tools & Libraries
  webpack: { aliases: [], type: 'technology' },
  vite: { aliases: [], type: 'technology' },
  esbuild: { aliases: [], type: 'technology' },
  rollup: { aliases: ['rollupjs'], type: 'technology' },
  parcel: { aliases: [], type: 'technology' },
  turbopack: { aliases: [], type: 'technology' },
  bun: { aliases: ['bunjs'], type: 'technology' },
  deno: { aliases: [], type: 'technology' },
  node: { aliases: ['nodejs', 'node.js'], type: 'technology' },
  npm: { aliases: [], type: 'technology' },
  yarn: { aliases: [], type: 'technology' },
  pnpm: { aliases: [], type: 'technology' },
  git: { aliases: [], type: 'technology' },
  jest: { aliases: [], type: 'technology' },
  vitest: { aliases: [], type: 'technology' },
  mocha: { aliases: [], type: 'technology' },
  cypress: { aliases: [], type: 'technology' },
  playwright: { aliases: [], type: 'technology' },
  eslint: { aliases: [], type: 'technology' },
  prettier: { aliases: [], type: 'technology' },
  biome: { aliases: ['biomejs'], type: 'technology' },
  tailwind: { aliases: ['tailwindcss', 'tailwind-css'], type: 'technology' },
  bootstrap: { aliases: [], type: 'technology' },
  zod: { aliases: [], type: 'technology' },
  trpc: { aliases: ['t-rpc'], type: 'technology' },
  grpc: { aliases: ['g-rpc'], type: 'technology' },
  websocket: { aliases: ['ws', 'websockets'], type: 'technology' },
  oauth: { aliases: ['oauth2', 'oauth2.0'], type: 'technology' },
  jwt: { aliases: ['json-web-token'], type: 'technology' },
  openai: { aliases: ['gpt', 'chatgpt', 'gpt-4'], type: 'technology' },
  anthropic: { aliases: ['claude', 'claude-ai'], type: 'technology' },
  langchain: { aliases: [], type: 'technology' },
  llamaindex: { aliases: ['llama-index'], type: 'technology' },
  huggingface: { aliases: ['hf', 'hugging-face'], type: 'technology' },
  tensorflow: { aliases: ['tf'], type: 'technology' },
  pytorch: { aliases: ['torch'], type: 'technology' },
  pino: { aliases: [], type: 'technology' },
  winston: { aliases: [], type: 'technology' },
  storybook: { aliases: [], type: 'technology' },
  nx: { aliases: [], type: 'technology' },
  turborepo: { aliases: [], type: 'technology' },
  lerna: { aliases: [], type: 'technology' },
  compromise: { aliases: ['compromise-nlp'], type: 'technology' },
  minisearch: { aliases: [], type: 'technology' },

  // Concepts
  microservices: { aliases: ['micro-services', 'microservice'], type: 'concept' },
  monolith: { aliases: ['monolithic'], type: 'concept' },
  serverless: { aliases: ['faas', 'lambda'], type: 'concept' },
  rest: { aliases: ['restful', 'rest-api'], type: 'concept' },
  api: { aliases: ['apis'], type: 'concept' },
  ci: { aliases: ['continuous-integration'], type: 'concept' },
  cd: { aliases: ['continuous-deployment', 'continuous-delivery'], type: 'concept' },
  tdd: { aliases: ['test-driven-development'], type: 'concept' },
  bdd: { aliases: ['behavior-driven-development'], type: 'concept' },
  ddd: { aliases: ['domain-driven-design'], type: 'concept' },
  cqrs: { aliases: ['command-query-responsibility-segregation'], type: 'concept' },
  mvc: { aliases: ['model-view-controller'], type: 'concept' },
  mvvm: { aliases: ['model-view-viewmodel'], type: 'concept' },
  oop: { aliases: ['object-oriented-programming', 'object-oriented'], type: 'concept' },
  fp: { aliases: ['functional-programming', 'functional'], type: 'concept' },
  ssr: { aliases: ['server-side-rendering'], type: 'concept' },
  ssg: { aliases: ['static-site-generation'], type: 'concept' },
  spa: { aliases: ['single-page-application', 'single-page-app'], type: 'concept' },
  pwa: { aliases: ['progressive-web-app'], type: 'concept' },
  mcp: { aliases: ['model-context-protocol'], type: 'concept' },
  rag: { aliases: ['retrieval-augmented-generation'], type: 'concept' },
  embedding: { aliases: ['embeddings', 'vector-embedding'], type: 'concept' },
  'vector-search': { aliases: ['semantic-search', 'vector-similarity'], type: 'concept' },
}

/**
 * Reverse lookup: lowercased spelling (canonical name or alias) → canonical name.
 *
 * Entries are inserted in dictionary order (each canonical name followed by
 * its aliases); code that iterates this map relies on that order.
 */
const ALIAS_MAP = new Map<string, string>()
for (const [normalized, entry] of Object.entries(TECH_DICTIONARY)) {
  ALIAS_MAP.set(normalized.toLowerCase(), normalized)
  for (const alias of entry.aliases) {
    const spelling = alias.toLowerCase()
    // A canonical name always resolves to its own entry: an alias that
    // happens to equal some entry's key must not re-point it.
    if (Object.prototype.hasOwnProperty.call(TECH_DICTIONARY, spelling)) continue
    ALIAS_MAP.set(spelling, normalized)
  }
}
196
+
197
/**
 * File path pattern. Capture group 1 is the path itself; the match as a whole
 * may start with the single delimiter character that precedes it.
 *
 * A path must sit at the start of the text or directly after whitespace, `(`,
 * `,`, `[`, a backtick or a quote, may start with `./`, `~/` or `/`, needs at
 * least one directory segment, and must end in a file extension.
 */
const FILE_PATH_REGEX = /(?:^|[\s(,[`'"])((?:\.\/|~\/|\/)?(?:[\w-]+\/)+[\w.-]+\.\w+)/g
/**
 * URL pattern for http/https links. The URL stops at whitespace, `)` or `,`,
 * and its last character may not be sentence punctuation or a closing
 * quote/bracket, so "see https://example.com/docs." yields the URL without
 * the trailing full stop.
 */
const URL_REGEX = /https?:\/\/[^\s),]*[^\s),.;:!?'"`\]>]/g
201
// Cached entry point of the optional `compromise` NLP library.
// Stays null until an import has succeeded.
let nlpModule: unknown = null

/**
 * Lazily imports `compromise` and returns its default export.
 *
 * The import is attempted on each call until one succeeds; after that the
 * cached value is returned. The dependency is optional: if it cannot be
 * imported, or the module exposes no default export, the result is `null`
 * (never `undefined`), so callers can test against `null`.
 *
 * @returns the library's default export, or `null` when unavailable
 */
async function loadNlp(): Promise<unknown> {
  if (nlpModule) return nlpModule

  try {
    const loaded: { default?: unknown } = await import('compromise')
    nlpModule = loaded.default ?? null
  } catch {
    // Missing or broken optional dependency: report "not available".
    nlpModule = null
  }
  return nlpModule
}
214
+
215
/**
 * Rule-based entity extractor.
 *
 * Runs, in order: a dictionary lookup against `ALIAS_MAP` / `TECH_DICTIONARY`,
 * regex rules for file paths, URLs and ISO dates, and - only after
 * `initialize()` has loaded it - an NLP pass for people and organizations.
 * Entities are de-duplicated by key and returned sorted by confidence.
 */
export class EntityExtractor {
  private nlpLoaded = false
  private nlp: unknown = null

  /**
   * Loads the optional NLP backend. Extraction works without calling this;
   * the NLP pass is simply skipped.
   */
  async initialize(): Promise<void> {
    this.nlp = await loadNlp()
    // The backend is invoked as a function, so only a callable counts as loaded.
    this.nlpLoaded = typeof this.nlp === 'function'
  }

  /**
   * Extracts all entities from `text`.
   *
   * @param text - input to scan
   * @returns the entities found, highest confidence first
   */
  extract(text: string): ExtractedEntity[] {
    const startTime = Date.now()
    const entities: Map<string, ExtractedEntity> = new Map()

    this.extractFromDictionary(text, entities)
    this.extractFilePaths(text, entities)
    this.extractUrls(text, entities)
    this.extractDates(text, entities)

    if (this.nlpLoaded && this.nlp) {
      this.extractWithNlp(text, entities)
    }

    const result = Array.from(entities.values())
      .sort((a, b) => b.confidence - a.confidence)

    // SLM Phase 1A: Log extraction for training data collection
    this._logTraining(text, result, startTime)

    return result
  }

  /**
   * SLM Phase 1A: Log entity extraction result for training data.
   * Best effort: any failure to load or call the data store is ignored.
   */
  private _logTraining(text: string, entities: ExtractedEntity[], startTime: number): void {
    try {
      const { logTrainingData } = require('@/training/data-store')
      logTrainingData({
        task: 'entity' as const,
        input: text,
        output: JSON.stringify(entities.map(e => ({
          text: e.name,
          type: e.type,
          normalized: e.normalizedName,
          confidence: e.confidence,
          source: e.source,
          positions: e.positions,
        }))),
        metadata: JSON.stringify({ count: entities.length, elapsed_ms: Date.now() - startTime }),
      })
    } catch {
      // Non-critical
    }
  }

  /** Runs {@link extract} on every text, preserving input order. */
  extractBatch(texts: string[]): ExtractedEntity[][] {
    return texts.map(text => this.extract(text))
  }

  /**
   * Dictionary pass. Every token that resolves through `ALIAS_MAP` becomes (or
   * reinforces) an entity keyed by its canonical name; each further mention
   * records its own offset and raises confidence by 0.05, capped at 1.0.
   * Offsets refer to the lowercased text.
   */
  private extractFromDictionary(text: string, entities: Map<string, ExtractedEntity>): void {
    const lowerText = text.toLowerCase()
    // Tokens are the runs between whitespace and , ; : ( ) [ ] { } " ' ` | / \
    const tokenPattern = /[^\s,;:()[\]{}"'`|\/\\]+/g

    for (const tokenMatch of lowerText.matchAll(tokenPattern)) {
      const hit = this.resolveToken(tokenMatch[0], tokenMatch.index ?? 0)
      if (!hit) continue

      const dictEntry = TECH_DICTIONARY[hit.normalized]
      if (!dictEntry) continue

      const existing = entities.get(hit.normalized)
      if (existing) {
        existing.positions.push(hit.offset)
        existing.confidence = Math.min(1.0, existing.confidence + 0.05)
      } else {
        entities.set(hit.normalized, {
          name: hit.surface,
          normalizedName: hit.normalized,
          type: dictEntry.type,
          confidence: 0.95,
          source: 'dictionary',
          positions: [hit.offset]
        })
      }
    }

    // Aliases containing '-', '.' or ' ' may be embedded in a larger token
    // (e.g. "next.js-based"), so they are also searched for in the raw text.
    for (const [alias, normalized] of ALIAS_MAP) {
      if (!/[-. ]/.test(alias)) continue
      if (entities.has(normalized)) continue

      const dictEntry = TECH_DICTIONARY[normalized]
      if (!dictEntry) continue

      const at = EntityExtractor.findStandalone(lowerText, alias)
      if (at < 0) continue

      entities.set(normalized, {
        name: alias,
        normalizedName: normalized,
        type: dictEntry.type,
        confidence: 0.95,
        source: 'dictionary',
        positions: [at]
      })
    }
  }

  /**
   * Resolves one lowercased token against `ALIAS_MAP`.
   *
   * Two spellings are tried: the token minus trailing sentence punctuation
   * (keeps symbol-bearing names such as "c#", "c++", ".net" intact), then the
   * token with all leading/trailing non-alphanumerics stripped. Spellings
   * shorter than two characters are ignored.
   *
   * @returns the matched spelling, its canonical name and its offset in the
   *          text, or null when the token is not in the dictionary
   */
  private resolveToken(
    token: string,
    tokenStart: number
  ): { surface: string; normalized: string; offset: number } | null {
    const leadingJunk = token.search(/[a-z0-9]/)
    const candidates = [
      { surface: token.replace(/[.!?]+$/, ''), offset: tokenStart },
      {
        surface: token.replace(/^[^a-z0-9]+|[^a-z0-9]+$/g, ''),
        offset: tokenStart + Math.max(leadingJunk, 0)
      },
    ]

    for (const candidate of candidates) {
      if (candidate.surface.length < 2) continue
      const normalized = ALIAS_MAP.get(candidate.surface)
      if (normalized) return { ...candidate, normalized }
    }
    return null
  }

  /**
   * Finds the first occurrence of `needle` in `haystack` that is not glued to
   * a longer word: where the needle starts/ends with a letter or digit, the
   * neighbouring character must not be one. This keeps "ansi-c" from matching
   * inside "ansi-colors" while still finding ".net" in "asp.net".
   * Both arguments are expected to be lowercase.
   *
   * @returns the offset of the occurrence, or -1
   */
  private static findStandalone(haystack: string, needle: string): number {
    const isWordChar = (ch: string | undefined): boolean =>
      ch !== undefined && /[a-z0-9]/.test(ch)
    const guardStart = isWordChar(needle[0])
    const guardEnd = isWordChar(needle[needle.length - 1])

    let at = haystack.indexOf(needle)
    while (at !== -1) {
      const before = at > 0 ? haystack[at - 1] : undefined
      const after = haystack[at + needle.length]
      const gluedBefore = guardStart && isWordChar(before)
      const gluedAfter = guardEnd && isWordChar(after)
      if (!gluedBefore && !gluedAfter) return at
      at = haystack.indexOf(needle, at + 1)
    }
    return -1
  }

  /** Rule pass: file paths matched by `FILE_PATH_REGEX`, keyed `file:<path>`. */
  private extractFilePaths(text: string, entities: Map<string, ExtractedEntity>): void {
    const regex = new RegExp(FILE_PATH_REGEX.source, FILE_PATH_REGEX.flags)

    for (const match of text.matchAll(regex)) {
      const captured = match[1]
      if (captured === undefined) continue

      const filePath = captured.trim()
      if (filePath.length < 4) continue

      const key = `file:${filePath}`
      if (entities.has(key)) continue

      // The overall match may begin with the delimiter preceding the path;
      // the captured path is its tail, so measure the offset from the end.
      const start = (match.index ?? 0) + match[0].length - captured.length
      entities.set(key, {
        name: filePath,
        normalizedName: filePath,
        type: 'file',
        confidence: 0.85,
        source: 'rule',
        positions: [start]
      })
    }
  }

  /**
   * Rule pass: URLs matched by `URL_REGEX`, keyed `url:<url>`.
   * URL entities are recorded with type 'file'.
   */
  private extractUrls(text: string, entities: Map<string, ExtractedEntity>): void {
    const regex = new RegExp(URL_REGEX.source, URL_REGEX.flags)

    for (const match of text.matchAll(regex)) {
      const url = match[0]
      const key = `url:${url}`
      if (entities.has(key)) continue

      entities.set(key, {
        name: url,
        normalizedName: url,
        type: 'file',
        confidence: 0.9,
        source: 'rule',
        positions: [match.index ?? 0]
      })
    }
  }

  /** Rule pass: ISO-8601 dates, optionally with time and zone, keyed `date:<text>`. */
  private extractDates(text: string, entities: Map<string, ExtractedEntity>): void {
    // ISO date pattern
    const isoDateRegex = /\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?)?\b/g

    for (const match of text.matchAll(isoDateRegex)) {
      const dateStr = match[0]
      const key = `date:${dateStr}`
      if (entities.has(key)) continue

      entities.set(key, {
        name: dateStr,
        normalizedName: dateStr,
        type: 'date',
        confidence: 0.9,
        source: 'rule',
        positions: [match.index ?? 0]
      })
    }
  }

  /**
   * NLP pass: people (type 'person', confidence 0.7) and organizations
   * (type 'concept', confidence 0.65). Organizations that are already known
   * dictionary spellings are skipped. Any failure inside the NLP backend is
   * swallowed, because this pass is optional.
   */
  private extractWithNlp(text: string, entities: Map<string, ExtractedEntity>): void {
    try {
      const nlpFn = this.nlp as (text: string) => {
        people(): { length: number; out(format: string): unknown }
        organizations(): { length: number; out(format: string): unknown }
      }
      const doc = nlpFn(text)

      // Extract people
      const people = doc.people()
      if (people && people.length > 0) {
        for (const person of people.out('array') as string[]) {
          this.addNlpEntity(text, entities, person, 'person', 'person', 0.7)
        }
      }

      // Extract organizations
      const orgs = doc.organizations()
      if (orgs && orgs.length > 0) {
        for (const org of orgs.out('array') as string[]) {
          // Check if it's already a known technology
          if (ALIAS_MAP.get(org.trim().toLowerCase())) continue
          this.addNlpEntity(text, entities, org, 'org', 'concept', 0.65)
        }
      }
    } catch {
      // NLP extraction failure is non-critical
    }
  }

  /**
   * Registers one NLP-detected mention under `<keyPrefix>:<lowercased name>`
   * unless that key already exists. Mentions shorter than two characters are
   * ignored. The position is looked up in `text` (exact match first, then
   * case-insensitively); if the mention cannot be located, no position is
   * recorded rather than a bogus -1.
   */
  private addNlpEntity(
    text: string,
    entities: Map<string, ExtractedEntity>,
    mention: string,
    keyPrefix: string,
    type: ExtractedEntity['type'],
    confidence: number
  ): void {
    const cleaned = mention.trim()
    if (cleaned.length < 2) return

    const lowered = cleaned.toLowerCase()
    const key = `${keyPrefix}:${lowered}`
    if (entities.has(key)) return

    let at = text.indexOf(cleaned)
    if (at < 0) at = text.toLowerCase().indexOf(lowered)

    entities.set(key, {
      name: cleaned,
      normalizedName: lowered,
      type,
      confidence,
      source: 'nlp',
      positions: at >= 0 ? [at] : []
    })
  }

  /**
   * Maps a name to its canonical dictionary form.
   *
   * @returns the canonical name when the lowercased, trimmed input is a known
   *          spelling; otherwise the lowercased, trimmed input itself
   */
  static normalizeEntityName(name: string): string {
    const lower = name.toLowerCase().trim()
    return ALIAS_MAP.get(lower) || lower
  }
}