claude-brain 0.15.2 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. package/README.md +191 -191
  2. package/VERSION +1 -1
  3. package/assets/CLAUDE-unified.md +11 -11
  4. package/assets/CLAUDE.md +29 -11
  5. package/bunfig.toml +8 -8
  6. package/package.json +82 -82
  7. package/packs/backend/node.json +173 -173
  8. package/packs/core/javascript.json +176 -176
  9. package/packs/core/typescript.json +222 -222
  10. package/packs/frontend/react.json +254 -254
  11. package/packs/meta/testing.json +172 -172
  12. package/scripts/postinstall.mjs +341 -341
  13. package/src/automation/auto-context.ts +240 -240
  14. package/src/automation/decision-detector.ts +452 -452
  15. package/src/automation/index.ts +11 -11
  16. package/src/automation/phase12-manager.ts +456 -456
  17. package/src/automation/proactive-recall.ts +373 -373
  18. package/src/automation/project-detector.ts +310 -310
  19. package/src/automation/repo-scanner.ts +205 -205
  20. package/src/cli/auto-setup.ts +82 -82
  21. package/src/cli/bin.ts +209 -202
  22. package/src/cli/commands/chroma.ts +573 -573
  23. package/src/cli/commands/git-hook.ts +189 -189
  24. package/src/cli/commands/hooks.ts +213 -213
  25. package/src/cli/commands/init.ts +122 -122
  26. package/src/cli/commands/install-mcp.ts +92 -92
  27. package/src/cli/commands/pack.ts +197 -197
  28. package/src/cli/commands/refresh.ts +323 -0
  29. package/src/cli/commands/serve.ts +167 -173
  30. package/src/cli/commands/start.ts +42 -42
  31. package/src/cli/commands/uninstall-mcp.ts +41 -41
  32. package/src/cli/commands/update.ts +124 -121
  33. package/src/cli/diagnose.ts +4 -4
  34. package/src/cli/health-check.ts +4 -4
  35. package/src/cli/migrate-chroma.ts +106 -106
  36. package/src/cli/setup.ts +4 -4
  37. package/src/cli/ui/animations.ts +80 -80
  38. package/src/cli/ui/components.ts +82 -82
  39. package/src/cli/ui/index.ts +4 -4
  40. package/src/cli/ui/logo.ts +36 -36
  41. package/src/cli/ui/theme.ts +55 -55
  42. package/src/config/defaults.ts +50 -50
  43. package/src/config/home.ts +55 -55
  44. package/src/config/index.ts +7 -7
  45. package/src/config/loader.ts +166 -166
  46. package/src/config/migration.ts +76 -76
  47. package/src/config/schema.ts +360 -360
  48. package/src/config/validator.ts +184 -184
  49. package/src/config/watcher.ts +86 -86
  50. package/src/context/assembler.ts +398 -398
  51. package/src/context/cache-manager.ts +101 -101
  52. package/src/context/formatter.ts +84 -84
  53. package/src/context/hierarchy.ts +85 -85
  54. package/src/context/index.ts +83 -83
  55. package/src/context/progress-tracker.ts +174 -174
  56. package/src/context/standards-manager.ts +287 -287
  57. package/src/context/types.ts +252 -252
  58. package/src/context/validator.ts +58 -58
  59. package/src/diagnostics/index.ts +123 -123
  60. package/src/health/index.ts +229 -229
  61. package/src/hooks/brain-hook.ts +128 -112
  62. package/src/hooks/capture.ts +168 -205
  63. package/src/hooks/context-hook.ts +137 -0
  64. package/src/hooks/deduplicator.ts +72 -72
  65. package/src/hooks/git-capture.ts +109 -109
  66. package/src/hooks/git-hook-installer.ts +207 -207
  67. package/src/hooks/index.ts +20 -20
  68. package/src/hooks/installer.ts +244 -194
  69. package/src/hooks/passive-classifier.ts +404 -723
  70. package/src/hooks/queue.ts +129 -129
  71. package/src/hooks/session-tracker.ts +312 -275
  72. package/src/hooks/types.ts +52 -47
  73. package/src/index.ts +7 -7
  74. package/src/intelligence/cross-project/affinity.ts +162 -162
  75. package/src/intelligence/cross-project/generalizer.ts +283 -283
  76. package/src/intelligence/cross-project/index.ts +13 -13
  77. package/src/intelligence/cross-project/transfer.ts +201 -201
  78. package/src/intelligence/index.ts +24 -24
  79. package/src/intelligence/optimization/index.ts +10 -10
  80. package/src/intelligence/optimization/precompute.ts +202 -202
  81. package/src/intelligence/optimization/semantic-cache.ts +207 -207
  82. package/src/intelligence/prediction/context-anticipator.ts +198 -198
  83. package/src/intelligence/prediction/decision-predictor.ts +184 -184
  84. package/src/intelligence/prediction/index.ts +13 -13
  85. package/src/intelligence/prediction/recommender.ts +268 -268
  86. package/src/intelligence/reasoning/chain-retrieval.ts +247 -247
  87. package/src/intelligence/reasoning/counterfactual.ts +248 -248
  88. package/src/intelligence/reasoning/index.ts +13 -13
  89. package/src/intelligence/reasoning/synthesizer.ts +169 -169
  90. package/src/intelligence/temporal/evolution.ts +197 -197
  91. package/src/intelligence/temporal/index.ts +16 -16
  92. package/src/intelligence/temporal/query-processor.ts +190 -190
  93. package/src/intelligence/temporal/timeline.ts +259 -259
  94. package/src/intelligence/temporal/trends.ts +263 -263
  95. package/src/knowledge/entity-extractor.ts +416 -416
  96. package/src/knowledge/graph/builder.ts +185 -185
  97. package/src/knowledge/graph/linker.ts +201 -201
  98. package/src/knowledge/graph/memory-graph.ts +359 -359
  99. package/src/knowledge/graph/schema.ts +99 -99
  100. package/src/knowledge/graph/search.ts +168 -168
  101. package/src/knowledge/relationship-extractor.ts +108 -108
  102. package/src/memory/chroma/client.ts +174 -174
  103. package/src/memory/chroma/collection-manager.ts +94 -94
  104. package/src/memory/chroma/config.ts +57 -57
  105. package/src/memory/chroma/embeddings.ts +155 -155
  106. package/src/memory/chroma/index.ts +82 -82
  107. package/src/memory/chroma/migration.ts +270 -270
  108. package/src/memory/chroma/schemas.ts +69 -69
  109. package/src/memory/chroma/search.ts +315 -315
  110. package/src/memory/chroma/store.ts +741 -741
  111. package/src/memory/consolidation/archiver.ts +164 -164
  112. package/src/memory/consolidation/merger.ts +186 -186
  113. package/src/memory/consolidation/scorer.ts +138 -138
  114. package/src/memory/context-builder.ts +236 -236
  115. package/src/memory/database.ts +169 -169
  116. package/src/memory/embedding-utils.ts +156 -156
  117. package/src/memory/embeddings.ts +226 -226
  118. package/src/memory/episodic/detector.ts +108 -108
  119. package/src/memory/episodic/manager.ts +351 -351
  120. package/src/memory/episodic/summarizer.ts +179 -179
  121. package/src/memory/episodic/types.ts +52 -52
  122. package/src/memory/index.ts +582 -582
  123. package/src/memory/knowledge-extractor.ts +455 -455
  124. package/src/memory/learning.ts +378 -378
  125. package/src/memory/patterns.ts +396 -396
  126. package/src/memory/schema.ts +88 -88
  127. package/src/memory/search.ts +309 -309
  128. package/src/memory/store.ts +787 -787
  129. package/src/memory/types.ts +121 -121
  130. package/src/orchestrator/coordinator.ts +272 -272
  131. package/src/orchestrator/decision-logger.ts +228 -228
  132. package/src/orchestrator/event-emitter.ts +198 -198
  133. package/src/orchestrator/event-queue.ts +184 -184
  134. package/src/orchestrator/handlers/base-handler.ts +70 -70
  135. package/src/orchestrator/handlers/context-handler.ts +73 -73
  136. package/src/orchestrator/handlers/decision-handler.ts +204 -204
  137. package/src/orchestrator/handlers/index.ts +10 -10
  138. package/src/orchestrator/handlers/status-handler.ts +131 -131
  139. package/src/orchestrator/handlers/task-handler.ts +171 -171
  140. package/src/orchestrator/index.ts +275 -275
  141. package/src/orchestrator/task-parser.ts +284 -284
  142. package/src/orchestrator/types.ts +98 -98
  143. package/src/packs/index.ts +9 -9
  144. package/src/packs/loader.ts +134 -134
  145. package/src/packs/manager.ts +204 -204
  146. package/src/packs/ranker.ts +78 -78
  147. package/src/packs/types.ts +81 -81
  148. package/src/phase12/index.ts +5 -5
  149. package/src/retrieval/bm25/index.ts +300 -300
  150. package/src/retrieval/bm25/tokenizer.ts +184 -184
  151. package/src/retrieval/feedback/adaptive.ts +223 -223
  152. package/src/retrieval/feedback/index.ts +16 -16
  153. package/src/retrieval/feedback/metrics.ts +223 -223
  154. package/src/retrieval/feedback/store.ts +283 -283
  155. package/src/retrieval/fusion/index.ts +194 -194
  156. package/src/retrieval/fusion/rrf.ts +163 -163
  157. package/src/retrieval/index.ts +12 -12
  158. package/src/retrieval/pipeline.ts +375 -375
  159. package/src/retrieval/query/expander.ts +198 -198
  160. package/src/retrieval/query/index.ts +27 -27
  161. package/src/retrieval/query/intent-classifier.ts +236 -236
  162. package/src/retrieval/query/temporal-parser.ts +295 -295
  163. package/src/retrieval/reranker/index.ts +188 -188
  164. package/src/retrieval/reranker/model.ts +95 -95
  165. package/src/retrieval/service.ts +125 -125
  166. package/src/retrieval/types.ts +162 -162
  167. package/src/routing/entity-extractor.ts +428 -428
  168. package/src/routing/intent-classifier.ts +450 -436
  169. package/src/routing/response-filter.ts +261 -258
  170. package/src/routing/router.ts +1441 -1322
  171. package/src/routing/search-engine.ts +515 -475
  172. package/src/routing/types.ts +94 -94
  173. package/src/scripts/health-check.ts +118 -118
  174. package/src/scripts/setup.ts +122 -122
  175. package/src/server/handlers/call-tool.ts +156 -156
  176. package/src/server/handlers/index.ts +9 -9
  177. package/src/server/handlers/list-tools.ts +35 -35
  178. package/src/server/handlers/tools/analyze-decision-evolution.ts +151 -151
  179. package/src/server/handlers/tools/auto-remember.ts +200 -200
  180. package/src/server/handlers/tools/brain.ts +85 -85
  181. package/src/server/handlers/tools/create-project.ts +135 -135
  182. package/src/server/handlers/tools/detect-trends.ts +144 -144
  183. package/src/server/handlers/tools/find-cross-project-patterns.ts +168 -168
  184. package/src/server/handlers/tools/get-activity-log.ts +194 -194
  185. package/src/server/handlers/tools/get-code-standards.ts +124 -124
  186. package/src/server/handlers/tools/get-corrections.ts +154 -154
  187. package/src/server/handlers/tools/get-decision-timeline.ts +172 -172
  188. package/src/server/handlers/tools/get-episode.ts +103 -103
  189. package/src/server/handlers/tools/get-patterns.ts +158 -158
  190. package/src/server/handlers/tools/get-phase12-status.ts +63 -63
  191. package/src/server/handlers/tools/get-project-context.ts +75 -75
  192. package/src/server/handlers/tools/get-recommendations.ts +145 -145
  193. package/src/server/handlers/tools/index.ts +31 -31
  194. package/src/server/handlers/tools/init-project.ts +757 -757
  195. package/src/server/handlers/tools/list-episodes.ts +90 -90
  196. package/src/server/handlers/tools/list-projects.ts +125 -125
  197. package/src/server/handlers/tools/rate-memory.ts +101 -101
  198. package/src/server/handlers/tools/recall-similar.ts +87 -87
  199. package/src/server/handlers/tools/recognize-pattern.ts +126 -126
  200. package/src/server/handlers/tools/record-correction.ts +125 -125
  201. package/src/server/handlers/tools/remember-decision.ts +153 -153
  202. package/src/server/handlers/tools/schemas.ts +253 -253
  203. package/src/server/handlers/tools/search-knowledge-graph.ts +102 -102
  204. package/src/server/handlers/tools/smart-context.ts +146 -146
  205. package/src/server/handlers/tools/update-progress.ts +131 -131
  206. package/src/server/handlers/tools/what-if-analysis.ts +135 -135
  207. package/src/server/http-api.ts +761 -693
  208. package/src/server/index.ts +40 -40
  209. package/src/server/mcp-server.ts +283 -283
  210. package/src/server/providers/index.ts +7 -7
  211. package/src/server/providers/prompts.ts +327 -327
  212. package/src/server/providers/resources.ts +622 -622
  213. package/src/server/services.ts +468 -468
  214. package/src/server/types.ts +39 -39
  215. package/src/server/utils/error-handler.ts +155 -155
  216. package/src/server/utils/index.ts +13 -13
  217. package/src/server/utils/memory-indicator.ts +83 -83
  218. package/src/server/utils/request-context.ts +122 -122
  219. package/src/server/utils/response-formatter.ts +129 -129
  220. package/src/server/utils/validators.ts +210 -210
  221. package/src/setup/index.ts +48 -48
  222. package/src/setup/wizard.ts +461 -461
  223. package/src/tools/index.ts +24 -24
  224. package/src/tools/registry.ts +115 -115
  225. package/src/tools/schemas.test.ts +30 -30
  226. package/src/tools/schemas.ts +617 -617
  227. package/src/tools/types.ts +412 -412
  228. package/src/utils/circuit-breaker.ts +130 -130
  229. package/src/utils/cleanup.ts +34 -34
  230. package/src/utils/error-handler.ts +132 -132
  231. package/src/utils/error-messages.ts +60 -60
  232. package/src/utils/fallback.ts +45 -45
  233. package/src/utils/index.ts +54 -54
  234. package/src/utils/logger-utils.ts +80 -80
  235. package/src/utils/logger.ts +88 -88
  236. package/src/utils/phase12-helper.ts +56 -56
  237. package/src/utils/retry.ts +94 -94
  238. package/src/utils/timing.ts +47 -47
  239. package/src/utils/transaction.ts +63 -63
  240. package/src/vault/frontmatter.ts +264 -264
  241. package/src/vault/index.ts +318 -318
  242. package/src/vault/paths.ts +106 -106
  243. package/src/vault/query.ts +422 -422
  244. package/src/vault/reader.ts +264 -264
  245. package/src/vault/templates.ts +186 -186
  246. package/src/vault/types.ts +73 -73
  247. package/src/vault/watcher.ts +277 -277
  248. package/src/vault/writer.ts +413 -413
  249. package/tsconfig.json +30 -30
  250. package/src/cli/auto-update.ts +0 -157
@@ -1,416 +1,416 @@
1
- /**
2
- * Entity Extractor
3
- * Rule-based NER using compromise + technology dictionary
4
- */
5
-
6
- import type { EntityType } from './graph/schema'
7
-
8
- export interface ExtractedEntity {
9
- name: string
10
- normalizedName: string
11
- type: EntityType
12
- confidence: number
13
- source: 'dictionary' | 'nlp' | 'rule'
14
- positions: number[]
15
- }
16
-
17
- const TECH_DICTIONARY: Record<string, { aliases: string[]; type: EntityType }> = {
18
- // Languages
19
- typescript: { aliases: ['ts', 'typescript', 'type-script'], type: 'technology' },
20
- javascript: { aliases: ['js', 'javascript', 'ecmascript', 'es6', 'es2015', 'es2020', 'es2021', 'es2022'], type: 'technology' },
21
- python: { aliases: ['py', 'python3', 'python2'], type: 'technology' },
22
- rust: { aliases: ['rust-lang', 'rustlang'], type: 'technology' },
23
- go: { aliases: ['golang', 'go-lang'], type: 'technology' },
24
- java: { aliases: [], type: 'technology' },
25
- csharp: { aliases: ['c#', 'c-sharp', 'dotnet', '.net'], type: 'technology' },
26
- ruby: { aliases: ['rb'], type: 'technology' },
27
- php: { aliases: [], type: 'technology' },
28
- swift: { aliases: [], type: 'technology' },
29
- kotlin: { aliases: ['kt'], type: 'technology' },
30
- scala: { aliases: [], type: 'technology' },
31
- elixir: { aliases: [], type: 'technology' },
32
- clojure: { aliases: ['clj'], type: 'technology' },
33
- haskell: { aliases: ['hs'], type: 'technology' },
34
- lua: { aliases: [], type: 'technology' },
35
- perl: { aliases: ['pl'], type: 'technology' },
36
- r: { aliases: ['r-lang', 'rlang'], type: 'technology' },
37
- dart: { aliases: [], type: 'technology' },
38
- sql: { aliases: ['mysql', 'postgresql', 'postgres', 'sqlite', 'mssql', 'mariadb'], type: 'technology' },
39
- html: { aliases: ['html5'], type: 'technology' },
40
- css: { aliases: ['css3', 'scss', 'sass', 'less', 'stylus'], type: 'technology' },
41
- graphql: { aliases: ['gql'], type: 'technology' },
42
- yaml: { aliases: ['yml'], type: 'technology' },
43
- json: { aliases: [], type: 'technology' },
44
- markdown: { aliases: ['md'], type: 'technology' },
45
- bash: { aliases: ['sh', 'shell', 'zsh', 'fish'], type: 'technology' },
46
- c: { aliases: ['c-lang', 'ansi-c'], type: 'technology' },
47
- cpp: { aliases: ['c++', 'cplusplus'], type: 'technology' },
48
-
49
- // Frontend Frameworks
50
- react: { aliases: ['reactjs', 'react.js', 'react-dom'], type: 'technology' },
51
- vue: { aliases: ['vuejs', 'vue.js', 'vue3', 'vue2'], type: 'technology' },
52
- angular: { aliases: ['angularjs', 'angular.js', 'ng'], type: 'technology' },
53
- svelte: { aliases: ['sveltejs', 'sveltekit'], type: 'technology' },
54
- nextjs: { aliases: ['next.js', 'next', 'nextjs'], type: 'technology' },
55
- nuxt: { aliases: ['nuxtjs', 'nuxt.js'], type: 'technology' },
56
- remix: { aliases: ['remix-run'], type: 'technology' },
57
- astro: { aliases: ['astrojs'], type: 'technology' },
58
- gatsby: { aliases: ['gatsbyjs'], type: 'technology' },
59
- solid: { aliases: ['solidjs', 'solid-js'], type: 'technology' },
60
- preact: { aliases: ['preactjs'], type: 'technology' },
61
- htmx: { aliases: [], type: 'technology' },
62
- jquery: { aliases: [], type: 'technology' },
63
-
64
- // Backend Frameworks
65
- express: { aliases: ['expressjs', 'express.js'], type: 'technology' },
66
- fastify: { aliases: [], type: 'technology' },
67
- hono: { aliases: [], type: 'technology' },
68
- koa: { aliases: ['koajs'], type: 'technology' },
69
- nestjs: { aliases: ['nest.js', 'nest'], type: 'technology' },
70
- django: { aliases: [], type: 'technology' },
71
- flask: { aliases: [], type: 'technology' },
72
- fastapi: { aliases: ['fast-api'], type: 'technology' },
73
- rails: { aliases: ['ruby-on-rails', 'ror'], type: 'technology' },
74
- spring: { aliases: ['spring-boot', 'springboot'], type: 'technology' },
75
- laravel: { aliases: [], type: 'technology' },
76
- phoenix: { aliases: [], type: 'technology' },
77
- gin: { aliases: [], type: 'technology' },
78
- actix: { aliases: ['actix-web'], type: 'technology' },
79
-
80
- // Databases
81
- mongodb: { aliases: ['mongo'], type: 'technology' },
82
- redis: { aliases: [], type: 'technology' },
83
- elasticsearch: { aliases: ['elastic', 'es'], type: 'technology' },
84
- dynamodb: { aliases: ['dynamo'], type: 'technology' },
85
- cassandra: { aliases: [], type: 'technology' },
86
- neo4j: { aliases: [], type: 'technology' },
87
- couchdb: { aliases: ['couch'], type: 'technology' },
88
- firebase: { aliases: ['firestore'], type: 'technology' },
89
- supabase: { aliases: [], type: 'technology' },
90
- prisma: { aliases: [], type: 'technology' },
91
- drizzle: { aliases: ['drizzle-orm'], type: 'technology' },
92
- sequelize: { aliases: [], type: 'technology' },
93
- typeorm: { aliases: [], type: 'technology' },
94
- chromadb: { aliases: ['chroma'], type: 'technology' },
95
- pinecone: { aliases: [], type: 'technology' },
96
- weaviate: { aliases: [], type: 'technology' },
97
- qdrant: { aliases: [], type: 'technology' },
98
-
99
- // Cloud & DevOps
100
- aws: { aliases: ['amazon-web-services', 'amazon'], type: 'technology' },
101
- gcp: { aliases: ['google-cloud', 'google-cloud-platform'], type: 'technology' },
102
- azure: { aliases: ['microsoft-azure'], type: 'technology' },
103
- docker: { aliases: ['dockerfile', 'docker-compose'], type: 'technology' },
104
- kubernetes: { aliases: ['k8s', 'kube'], type: 'technology' },
105
- terraform: { aliases: ['tf'], type: 'technology' },
106
- ansible: { aliases: [], type: 'technology' },
107
- jenkins: { aliases: [], type: 'technology' },
108
- github: { aliases: ['gh'], type: 'technology' },
109
- gitlab: { aliases: [], type: 'technology' },
110
- vercel: { aliases: [], type: 'technology' },
111
- netlify: { aliases: [], type: 'technology' },
112
- cloudflare: { aliases: ['cf'], type: 'technology' },
113
- nginx: { aliases: [], type: 'technology' },
114
- caddy: { aliases: [], type: 'technology' },
115
-
116
- // Tools & Libraries
117
- webpack: { aliases: [], type: 'technology' },
118
- vite: { aliases: [], type: 'technology' },
119
- esbuild: { aliases: [], type: 'technology' },
120
- rollup: { aliases: ['rollupjs'], type: 'technology' },
121
- parcel: { aliases: [], type: 'technology' },
122
- turbopack: { aliases: [], type: 'technology' },
123
- bun: { aliases: ['bunjs'], type: 'technology' },
124
- deno: { aliases: [], type: 'technology' },
125
- node: { aliases: ['nodejs', 'node.js'], type: 'technology' },
126
- npm: { aliases: [], type: 'technology' },
127
- yarn: { aliases: [], type: 'technology' },
128
- pnpm: { aliases: [], type: 'technology' },
129
- git: { aliases: [], type: 'technology' },
130
- jest: { aliases: [], type: 'technology' },
131
- vitest: { aliases: [], type: 'technology' },
132
- mocha: { aliases: [], type: 'technology' },
133
- cypress: { aliases: [], type: 'technology' },
134
- playwright: { aliases: [], type: 'technology' },
135
- eslint: { aliases: [], type: 'technology' },
136
- prettier: { aliases: [], type: 'technology' },
137
- biome: { aliases: ['biomejs'], type: 'technology' },
138
- tailwind: { aliases: ['tailwindcss', 'tailwind-css'], type: 'technology' },
139
- bootstrap: { aliases: [], type: 'technology' },
140
- zod: { aliases: [], type: 'technology' },
141
- trpc: { aliases: ['t-rpc'], type: 'technology' },
142
- graphql: { aliases: ['gql'], type: 'technology' },
143
- grpc: { aliases: ['g-rpc'], type: 'technology' },
144
- websocket: { aliases: ['ws', 'websockets'], type: 'technology' },
145
- oauth: { aliases: ['oauth2', 'oauth2.0'], type: 'technology' },
146
- jwt: { aliases: ['json-web-token'], type: 'technology' },
147
- openai: { aliases: ['gpt', 'chatgpt', 'gpt-4'], type: 'technology' },
148
- anthropic: { aliases: ['claude', 'claude-ai'], type: 'technology' },
149
- langchain: { aliases: [], type: 'technology' },
150
- llamaindex: { aliases: ['llama-index'], type: 'technology' },
151
- huggingface: { aliases: ['hf', 'hugging-face'], type: 'technology' },
152
- tensorflow: { aliases: ['tf'], type: 'technology' },
153
- pytorch: { aliases: ['torch'], type: 'technology' },
154
- pino: { aliases: [], type: 'technology' },
155
- winston: { aliases: [], type: 'technology' },
156
- storybook: { aliases: [], type: 'technology' },
157
- nx: { aliases: [], type: 'technology' },
158
- turborepo: { aliases: [], type: 'technology' },
159
- lerna: { aliases: [], type: 'technology' },
160
- compromise: { aliases: ['compromise-nlp'], type: 'technology' },
161
- minisearch: { aliases: [], type: 'technology' },
162
-
163
- // Concepts
164
- microservices: { aliases: ['micro-services', 'microservice'], type: 'concept' },
165
- monolith: { aliases: ['monolithic'], type: 'concept' },
166
- serverless: { aliases: ['faas', 'lambda'], type: 'concept' },
167
- rest: { aliases: ['restful', 'rest-api'], type: 'concept' },
168
- api: { aliases: ['apis'], type: 'concept' },
169
- ci: { aliases: ['continuous-integration'], type: 'concept' },
170
- cd: { aliases: ['continuous-deployment', 'continuous-delivery'], type: 'concept' },
171
- tdd: { aliases: ['test-driven-development'], type: 'concept' },
172
- bdd: { aliases: ['behavior-driven-development'], type: 'concept' },
173
- ddd: { aliases: ['domain-driven-design'], type: 'concept' },
174
- cqrs: { aliases: ['command-query-responsibility-segregation'], type: 'concept' },
175
- mvc: { aliases: ['model-view-controller'], type: 'concept' },
176
- mvvm: { aliases: ['model-view-viewmodel'], type: 'concept' },
177
- oop: { aliases: ['object-oriented-programming', 'object-oriented'], type: 'concept' },
178
- fp: { aliases: ['functional-programming', 'functional'], type: 'concept' },
179
- ssr: { aliases: ['server-side-rendering'], type: 'concept' },
180
- ssg: { aliases: ['static-site-generation'], type: 'concept' },
181
- spa: { aliases: ['single-page-application', 'single-page-app'], type: 'concept' },
182
- pwa: { aliases: ['progressive-web-app'], type: 'concept' },
183
- mcp: { aliases: ['model-context-protocol'], type: 'concept' },
184
- rag: { aliases: ['retrieval-augmented-generation'], type: 'concept' },
185
- embedding: { aliases: ['embeddings', 'vector-embedding'], type: 'concept' },
186
- 'vector-search': { aliases: ['semantic-search', 'vector-similarity'], type: 'concept' },
187
- }
188
-
189
- // Build reverse lookup: alias → normalized name
190
- const ALIAS_MAP = new Map<string, string>()
191
- for (const [normalized, entry] of Object.entries(TECH_DICTIONARY)) {
192
- ALIAS_MAP.set(normalized.toLowerCase(), normalized)
193
- for (const alias of entry.aliases) {
194
- ALIAS_MAP.set(alias.toLowerCase(), normalized)
195
- }
196
- }
197
-
198
- // File path regex - handles ./path, ~/path, /path, and relative paths
199
- const FILE_PATH_REGEX = /(?:^|[\s(,])((?:\.\/|~\/|\/)?(?:[\w-]+\/)+[\w.-]+\.\w+)/g
200
- // URL regex
201
- const URL_REGEX = /https?:\/\/[^\s),]+/g
202
- // Version number regex
203
- const VERSION_REGEX = /\b[vV]?\d+\.\d+(?:\.\d+)?(?:-[\w.]+)?\b/g
204
-
205
- let nlpModule: any = null
206
-
207
- async function loadNlp(): Promise<any> {
208
- if (!nlpModule) {
209
- try {
210
- nlpModule = (await import('compromise')).default
211
- } catch {
212
- nlpModule = null
213
- }
214
- }
215
- return nlpModule
216
- }
217
-
218
- export class EntityExtractor {
219
- private nlpLoaded = false
220
- private nlp: any = null
221
-
222
- async initialize(): Promise<void> {
223
- this.nlp = await loadNlp()
224
- this.nlpLoaded = this.nlp !== null
225
- }
226
-
227
- extract(text: string): ExtractedEntity[] {
228
- const entities: Map<string, ExtractedEntity> = new Map()
229
-
230
- this.extractFromDictionary(text, entities)
231
- this.extractFilePaths(text, entities)
232
- this.extractUrls(text, entities)
233
- this.extractDates(text, entities)
234
-
235
- if (this.nlpLoaded && this.nlp) {
236
- this.extractWithNlp(text, entities)
237
- }
238
-
239
- return Array.from(entities.values())
240
- .sort((a, b) => b.confidence - a.confidence)
241
- }
242
-
243
- extractBatch(texts: string[]): ExtractedEntity[][] {
244
- return texts.map(text => this.extract(text))
245
- }
246
-
247
- private extractFromDictionary(text: string, entities: Map<string, ExtractedEntity>): void {
248
- const lowerText = text.toLowerCase()
249
- const words = lowerText.split(/[\s,;:()[\]{}"'`|/\\]+/)
250
-
251
- for (const word of words) {
252
- const cleaned = word.replace(/^[^a-z0-9]+|[^a-z0-9]+$/g, '')
253
- if (cleaned.length < 2) continue
254
-
255
- const normalized = ALIAS_MAP.get(cleaned)
256
- if (normalized) {
257
- const dictEntry = TECH_DICTIONARY[normalized]
258
- if (!dictEntry) continue
259
-
260
- const existing = entities.get(normalized)
261
- if (existing) {
262
- existing.positions.push(lowerText.indexOf(cleaned))
263
- existing.confidence = Math.min(1.0, existing.confidence + 0.05)
264
- } else {
265
- entities.set(normalized, {
266
- name: cleaned,
267
- normalizedName: normalized,
268
- type: dictEntry.type,
269
- confidence: 0.95,
270
- source: 'dictionary',
271
- positions: [lowerText.indexOf(cleaned)]
272
- })
273
- }
274
- }
275
- }
276
-
277
- // Also check multi-word aliases
278
- for (const [alias, normalized] of ALIAS_MAP) {
279
- if (alias.includes('-') || alias.includes('.') || alias.includes(' ')) {
280
- if (lowerText.includes(alias) && !entities.has(normalized)) {
281
- const dictEntry = TECH_DICTIONARY[normalized]
282
- if (!dictEntry) continue
283
-
284
- entities.set(normalized, {
285
- name: alias,
286
- normalizedName: normalized,
287
- type: dictEntry.type,
288
- confidence: 0.95,
289
- source: 'dictionary',
290
- positions: [lowerText.indexOf(alias)]
291
- })
292
- }
293
- }
294
- }
295
- }
296
-
297
- private extractFilePaths(text: string, entities: Map<string, ExtractedEntity>): void {
298
- let match: RegExpExecArray | null
299
- const regex = new RegExp(FILE_PATH_REGEX.source, FILE_PATH_REGEX.flags)
300
-
301
- while ((match = regex.exec(text)) !== null) {
302
- const filePath = match[1].trim()
303
- if (filePath.length < 4) continue
304
-
305
- const key = `file:${filePath}`
306
- if (!entities.has(key)) {
307
- entities.set(key, {
308
- name: filePath,
309
- normalizedName: filePath,
310
- type: 'file',
311
- confidence: 0.85,
312
- source: 'rule',
313
- positions: [match.index]
314
- })
315
- }
316
- }
317
- }
318
-
319
- private extractUrls(text: string, entities: Map<string, ExtractedEntity>): void {
320
- let match: RegExpExecArray | null
321
- const regex = new RegExp(URL_REGEX.source, URL_REGEX.flags)
322
-
323
- while ((match = regex.exec(text)) !== null) {
324
- const url = match[0]
325
- const key = `url:${url}`
326
- if (!entities.has(key)) {
327
- entities.set(key, {
328
- name: url,
329
- normalizedName: url,
330
- type: 'file',
331
- confidence: 0.9,
332
- source: 'rule',
333
- positions: [match.index]
334
- })
335
- }
336
- }
337
- }
338
-
339
- private extractDates(text: string, entities: Map<string, ExtractedEntity>): void {
340
- // ISO date pattern
341
- const isoDateRegex = /\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?)?\b/g
342
- let match: RegExpExecArray | null
343
-
344
- while ((match = isoDateRegex.exec(text)) !== null) {
345
- const dateStr = match[0]
346
- const key = `date:${dateStr}`
347
- if (!entities.has(key)) {
348
- entities.set(key, {
349
- name: dateStr,
350
- normalizedName: dateStr,
351
- type: 'date',
352
- confidence: 0.9,
353
- source: 'rule',
354
- positions: [match.index]
355
- })
356
- }
357
- }
358
- }
359
-
360
- private extractWithNlp(text: string, entities: Map<string, ExtractedEntity>): void {
361
- try {
362
- const doc = this.nlp(text)
363
-
364
- // Extract people
365
- const people = doc.people()
366
- if (people && people.length > 0) {
367
- for (const person of people.out('array') as string[]) {
368
- const cleaned = person.trim()
369
- if (cleaned.length < 2) continue
370
- const key = `person:${cleaned.toLowerCase()}`
371
- if (!entities.has(key)) {
372
- entities.set(key, {
373
- name: cleaned,
374
- normalizedName: cleaned.toLowerCase(),
375
- type: 'person',
376
- confidence: 0.7,
377
- source: 'nlp',
378
- positions: [text.indexOf(cleaned)]
379
- })
380
- }
381
- }
382
- }
383
-
384
- // Extract organizations
385
- const orgs = doc.organizations()
386
- if (orgs && orgs.length > 0) {
387
- for (const org of orgs.out('array') as string[]) {
388
- const cleaned = org.trim()
389
- if (cleaned.length < 2) continue
390
- // Check if it's already a known technology
391
- const normalizedCheck = ALIAS_MAP.get(cleaned.toLowerCase())
392
- if (normalizedCheck) continue
393
-
394
- const key = `org:${cleaned.toLowerCase()}`
395
- if (!entities.has(key)) {
396
- entities.set(key, {
397
- name: cleaned,
398
- normalizedName: cleaned.toLowerCase(),
399
- type: 'concept',
400
- confidence: 0.65,
401
- source: 'nlp',
402
- positions: [text.indexOf(cleaned)]
403
- })
404
- }
405
- }
406
- }
407
- } catch {
408
- // NLP extraction failure is non-critical
409
- }
410
- }
411
-
412
- static normalizeEntityName(name: string): string {
413
- const lower = name.toLowerCase().trim()
414
- return ALIAS_MAP.get(lower) || lower
415
- }
416
- }
1
+ /**
2
+ * Entity Extractor
3
+ * Rule-based NER using compromise + technology dictionary
4
+ */
5
+
6
+ import type { EntityType } from './graph/schema'
7
+
8
+ export interface ExtractedEntity {
9
+ name: string
10
+ normalizedName: string
11
+ type: EntityType
12
+ confidence: number
13
+ source: 'dictionary' | 'nlp' | 'rule'
14
+ positions: number[]
15
+ }
16
+
17
/**
 * Known technologies and concepts, keyed by canonical (lower-case) name.
 *
 * Each entry lists alternative spellings that should resolve to the canonical
 * name, plus the entity type to report. The canonical name itself never needs
 * to be repeated in `aliases`; the reverse lookup registers it automatically.
 *
 * Every key must be unique: a repeated key is a TypeScript error and, at
 * runtime, the later value silently replaces the earlier one.
 */
const TECH_DICTIONARY: Record<string, { aliases: string[]; type: EntityType }> = {
  // Languages
  typescript: { aliases: ['ts', 'type-script'], type: 'technology' },
  javascript: { aliases: ['js', 'ecmascript', 'es6', 'es2015', 'es2020', 'es2021', 'es2022'], type: 'technology' },
  python: { aliases: ['py', 'python3', 'python2'], type: 'technology' },
  rust: { aliases: ['rust-lang', 'rustlang'], type: 'technology' },
  go: { aliases: ['golang', 'go-lang'], type: 'technology' },
  java: { aliases: [], type: 'technology' },
  csharp: { aliases: ['c#', 'c-sharp', 'dotnet', '.net'], type: 'technology' },
  ruby: { aliases: ['rb'], type: 'technology' },
  php: { aliases: [], type: 'technology' },
  swift: { aliases: [], type: 'technology' },
  kotlin: { aliases: ['kt'], type: 'technology' },
  scala: { aliases: [], type: 'technology' },
  elixir: { aliases: [], type: 'technology' },
  clojure: { aliases: ['clj'], type: 'technology' },
  haskell: { aliases: ['hs'], type: 'technology' },
  lua: { aliases: [], type: 'technology' },
  perl: { aliases: ['pl'], type: 'technology' },
  r: { aliases: ['r-lang', 'rlang'], type: 'technology' },
  dart: { aliases: [], type: 'technology' },
  sql: { aliases: ['mysql', 'postgresql', 'postgres', 'sqlite', 'mssql', 'mariadb'], type: 'technology' },
  html: { aliases: ['html5'], type: 'technology' },
  css: { aliases: ['css3', 'scss', 'sass', 'less', 'stylus'], type: 'technology' },
  graphql: { aliases: ['gql'], type: 'technology' },
  yaml: { aliases: ['yml'], type: 'technology' },
  json: { aliases: [], type: 'technology' },
  markdown: { aliases: ['md'], type: 'technology' },
  bash: { aliases: ['sh', 'shell', 'zsh', 'fish'], type: 'technology' },
  c: { aliases: ['c-lang', 'ansi-c'], type: 'technology' },
  cpp: { aliases: ['c++', 'cplusplus'], type: 'technology' },

  // Frontend Frameworks
  react: { aliases: ['reactjs', 'react.js', 'react-dom'], type: 'technology' },
  vue: { aliases: ['vuejs', 'vue.js', 'vue3', 'vue2'], type: 'technology' },
  angular: { aliases: ['angularjs', 'angular.js', 'ng'], type: 'technology' },
  svelte: { aliases: ['sveltejs', 'sveltekit'], type: 'technology' },
  nextjs: { aliases: ['next.js', 'next'], type: 'technology' },
  nuxt: { aliases: ['nuxtjs', 'nuxt.js'], type: 'technology' },
  remix: { aliases: ['remix-run'], type: 'technology' },
  astro: { aliases: ['astrojs'], type: 'technology' },
  gatsby: { aliases: ['gatsbyjs'], type: 'technology' },
  solid: { aliases: ['solidjs', 'solid-js'], type: 'technology' },
  preact: { aliases: ['preactjs'], type: 'technology' },
  htmx: { aliases: [], type: 'technology' },
  jquery: { aliases: [], type: 'technology' },

  // Backend Frameworks
  express: { aliases: ['expressjs', 'express.js'], type: 'technology' },
  fastify: { aliases: [], type: 'technology' },
  hono: { aliases: [], type: 'technology' },
  koa: { aliases: ['koajs'], type: 'technology' },
  nestjs: { aliases: ['nest.js', 'nest'], type: 'technology' },
  django: { aliases: [], type: 'technology' },
  flask: { aliases: [], type: 'technology' },
  fastapi: { aliases: ['fast-api'], type: 'technology' },
  rails: { aliases: ['ruby-on-rails', 'ror'], type: 'technology' },
  spring: { aliases: ['spring-boot', 'springboot'], type: 'technology' },
  laravel: { aliases: [], type: 'technology' },
  phoenix: { aliases: [], type: 'technology' },
  gin: { aliases: [], type: 'technology' },
  actix: { aliases: ['actix-web'], type: 'technology' },

  // Databases
  mongodb: { aliases: ['mongo'], type: 'technology' },
  redis: { aliases: [], type: 'technology' },
  elasticsearch: { aliases: ['elastic', 'es'], type: 'technology' },
  dynamodb: { aliases: ['dynamo'], type: 'technology' },
  cassandra: { aliases: [], type: 'technology' },
  neo4j: { aliases: [], type: 'technology' },
  couchdb: { aliases: ['couch'], type: 'technology' },
  firebase: { aliases: ['firestore'], type: 'technology' },
  supabase: { aliases: [], type: 'technology' },
  prisma: { aliases: [], type: 'technology' },
  drizzle: { aliases: ['drizzle-orm'], type: 'technology' },
  sequelize: { aliases: [], type: 'technology' },
  typeorm: { aliases: [], type: 'technology' },
  chromadb: { aliases: ['chroma'], type: 'technology' },
  pinecone: { aliases: [], type: 'technology' },
  weaviate: { aliases: [], type: 'technology' },
  qdrant: { aliases: [], type: 'technology' },

  // Cloud & DevOps
  aws: { aliases: ['amazon-web-services', 'amazon'], type: 'technology' },
  gcp: { aliases: ['google-cloud', 'google-cloud-platform'], type: 'technology' },
  azure: { aliases: ['microsoft-azure'], type: 'technology' },
  docker: { aliases: ['dockerfile', 'docker-compose'], type: 'technology' },
  kubernetes: { aliases: ['k8s', 'kube'], type: 'technology' },
  // NOTE(review): 'tf' is also an alias of tensorflow further down. The reverse
  // lookup keeps the last registration, so 'tf' currently resolves to tensorflow.
  terraform: { aliases: ['tf'], type: 'technology' },
  ansible: { aliases: [], type: 'technology' },
  jenkins: { aliases: [], type: 'technology' },
  github: { aliases: ['gh'], type: 'technology' },
  gitlab: { aliases: [], type: 'technology' },
  vercel: { aliases: [], type: 'technology' },
  netlify: { aliases: [], type: 'technology' },
  cloudflare: { aliases: ['cf'], type: 'technology' },
  nginx: { aliases: [], type: 'technology' },
  caddy: { aliases: [], type: 'technology' },

  // Tools & Libraries
  webpack: { aliases: [], type: 'technology' },
  vite: { aliases: [], type: 'technology' },
  esbuild: { aliases: [], type: 'technology' },
  rollup: { aliases: ['rollupjs'], type: 'technology' },
  parcel: { aliases: [], type: 'technology' },
  turbopack: { aliases: [], type: 'technology' },
  bun: { aliases: ['bunjs'], type: 'technology' },
  deno: { aliases: [], type: 'technology' },
  node: { aliases: ['nodejs', 'node.js'], type: 'technology' },
  npm: { aliases: [], type: 'technology' },
  yarn: { aliases: [], type: 'technology' },
  pnpm: { aliases: [], type: 'technology' },
  git: { aliases: [], type: 'technology' },
  jest: { aliases: [], type: 'technology' },
  vitest: { aliases: [], type: 'technology' },
  mocha: { aliases: [], type: 'technology' },
  cypress: { aliases: [], type: 'technology' },
  playwright: { aliases: [], type: 'technology' },
  eslint: { aliases: [], type: 'technology' },
  prettier: { aliases: [], type: 'technology' },
  biome: { aliases: ['biomejs'], type: 'technology' },
  tailwind: { aliases: ['tailwindcss', 'tailwind-css'], type: 'technology' },
  bootstrap: { aliases: [], type: 'technology' },
  zod: { aliases: [], type: 'technology' },
  trpc: { aliases: ['t-rpc'], type: 'technology' },
  grpc: { aliases: ['g-rpc'], type: 'technology' },
  websocket: { aliases: ['ws', 'websockets'], type: 'technology' },
  oauth: { aliases: ['oauth2', 'oauth2.0'], type: 'technology' },
  jwt: { aliases: ['json-web-token'], type: 'technology' },
  openai: { aliases: ['gpt', 'chatgpt', 'gpt-4'], type: 'technology' },
  anthropic: { aliases: ['claude', 'claude-ai'], type: 'technology' },
  langchain: { aliases: [], type: 'technology' },
  llamaindex: { aliases: ['llama-index'], type: 'technology' },
  huggingface: { aliases: ['hf', 'hugging-face'], type: 'technology' },
  tensorflow: { aliases: ['tf'], type: 'technology' },
  pytorch: { aliases: ['torch'], type: 'technology' },
  pino: { aliases: [], type: 'technology' },
  winston: { aliases: [], type: 'technology' },
  storybook: { aliases: [], type: 'technology' },
  nx: { aliases: [], type: 'technology' },
  turborepo: { aliases: [], type: 'technology' },
  lerna: { aliases: [], type: 'technology' },
  compromise: { aliases: ['compromise-nlp'], type: 'technology' },
  minisearch: { aliases: [], type: 'technology' },

  // Concepts
  microservices: { aliases: ['micro-services', 'microservice'], type: 'concept' },
  monolith: { aliases: ['monolithic'], type: 'concept' },
  serverless: { aliases: ['faas', 'lambda'], type: 'concept' },
  rest: { aliases: ['restful', 'rest-api'], type: 'concept' },
  api: { aliases: ['apis'], type: 'concept' },
  ci: { aliases: ['continuous-integration'], type: 'concept' },
  cd: { aliases: ['continuous-deployment', 'continuous-delivery'], type: 'concept' },
  tdd: { aliases: ['test-driven-development'], type: 'concept' },
  bdd: { aliases: ['behavior-driven-development'], type: 'concept' },
  ddd: { aliases: ['domain-driven-design'], type: 'concept' },
  cqrs: { aliases: ['command-query-responsibility-segregation'], type: 'concept' },
  mvc: { aliases: ['model-view-controller'], type: 'concept' },
  mvvm: { aliases: ['model-view-viewmodel'], type: 'concept' },
  oop: { aliases: ['object-oriented-programming', 'object-oriented'], type: 'concept' },
  fp: { aliases: ['functional-programming', 'functional'], type: 'concept' },
  ssr: { aliases: ['server-side-rendering'], type: 'concept' },
  ssg: { aliases: ['static-site-generation'], type: 'concept' },
  spa: { aliases: ['single-page-application', 'single-page-app'], type: 'concept' },
  pwa: { aliases: ['progressive-web-app'], type: 'concept' },
  mcp: { aliases: ['model-context-protocol'], type: 'concept' },
  rag: { aliases: ['retrieval-augmented-generation'], type: 'concept' },
  embedding: { aliases: ['embeddings', 'vector-embedding'], type: 'concept' },
  'vector-search': { aliases: ['semantic-search', 'vector-similarity'], type: 'concept' },
}
188
+
189
// Reverse lookup from any lower-cased spelling (canonical name or alias) to
// the canonical dictionary key. Pairs are inserted in dictionary order, so
// when two entries claim the same spelling the later entry wins.
const ALIAS_MAP = new Map<string, string>(
  Object.entries(TECH_DICTIONARY).flatMap(([canonical, { aliases }]) => {
    const pairs: [string, string][] = [[canonical.toLowerCase(), canonical]]
    for (const spelling of aliases) {
      pairs.push([spelling.toLowerCase(), canonical])
    }
    return pairs
  })
)
197
+
198
// Path-like tokens: an optional "./", "~/" or "/" prefix, one or more "dir/"
// segments, then a file name with an extension. The path is capture group 1;
// the overall match may start with one delimiter (whitespace, "(" or ",").
const FILE_PATH_REGEX = /(?:^|[\s(,])((?:\.\/|~\/|\/)?(?:[\w-]+\/)+[\w.-]+\.\w+)/g
// http:// and https:// URLs, terminated by whitespace, ")" or ","
const URL_REGEX = /https?:\/\/[^\s),]+/g
// Dotted version numbers such as 1.2, v1.2.3 or 1.2.3-beta.1
const VERSION_REGEX = /\b[vV]?\d+\.\d+(?:\.\d+)?(?:-[\w.]+)?\b/g
204
+
205
// Cached default export of the `compromise` package; null until a load succeeds.
let nlpModule: any = null

/**
 * Lazily imports `compromise` and returns its default export.
 *
 * A successful load is cached for later calls. If the import fails the cache
 * stays null, null is returned, and the next call tries the import again.
 */
async function loadNlp(): Promise<any> {
  if (nlpModule) {
    return nlpModule
  }

  try {
    const loaded = await import('compromise')
    nlpModule = loaded.default
  } catch {
    nlpModule = null
  }

  return nlpModule
}
217
+
218
/**
 * Extracts entity mentions from free text.
 *
 * Four rule/dictionary passes always run (known technologies and concepts,
 * file paths, URLs, ISO dates). A fifth pass using the `compromise` NLP
 * library (people and organizations) runs only after `initialize()` has
 * loaded that library successfully.
 */
export class EntityExtractor {
  private nlpLoaded = false
  private nlp: any = null

  /**
   * Loads the optional NLP library. Safe to skip: without it `extract`
   * simply omits the people/organization pass.
   */
  async initialize(): Promise<void> {
    this.nlp = await loadNlp()
    this.nlpLoaded = this.nlp != null
  }

  /**
   * Runs every available extraction pass over `text`.
   *
   * @param text - Text to analyse.
   * @returns De-duplicated entities, highest confidence first.
   */
  extract(text: string): ExtractedEntity[] {
    const entities = new Map<string, ExtractedEntity>()

    this.extractFromDictionary(text, entities)
    this.extractFilePaths(text, entities)
    this.extractUrls(text, entities)
    this.extractDates(text, entities)

    if (this.nlpLoaded && this.nlp) {
      this.extractWithNlp(text, entities)
    }

    return Array.from(entities.values()).sort((a, b) => b.confidence - a.confidence)
  }

  /**
   * Applies `extract` to each text independently.
   *
   * @param texts - Texts to analyse.
   * @returns One result array per input, in input order.
   */
  extractBatch(texts: string[]): ExtractedEntity[][] {
    return texts.map(text => this.extract(text))
  }

  /** Stores `entity` under `key` unless that key is already present. */
  private addIfAbsent(entities: Map<string, ExtractedEntity>, key: string, entity: ExtractedEntity): void {
    if (!entities.has(key)) {
      entities.set(key, entity)
    }
  }

  /**
   * Finds dictionary terms. Entities are keyed by canonical name; positions
   * are offsets into the lower-cased text.
   */
  private extractFromDictionary(text: string, entities: Map<string, ExtractedEntity>): void {
    const lowerText = text.toLowerCase()

    // A token is a maximal run of characters that are not separators
    // (whitespace , ; : ( ) [ ] { } " ' ` | / \). Matching tokens instead of
    // splitting keeps each token's real offset, so repeated mentions get
    // distinct positions and a short term is never located inside another word.
    const tokenRegex = /[^\s,;:()[\]{}"'`|\/\\]+/g
    const leadingNoise = /^[^a-z0-9]*/
    const edgeNoise = /^[^a-z0-9]+|[^a-z0-9]+$/g

    for (const tokenMatch of lowerText.matchAll(tokenRegex)) {
      const token = tokenMatch[0]
      const cleaned = token.replace(edgeNoise, '')
      // Single characters are ignored to avoid matching stray letters.
      if (cleaned.length < 2) continue

      const normalized = ALIAS_MAP.get(cleaned)
      if (!normalized) continue
      const dictEntry = TECH_DICTIONARY[normalized]
      if (!dictEntry) continue

      const skipped = leadingNoise.exec(token)?.[0].length ?? 0
      const position = (tokenMatch.index ?? 0) + skipped

      const existing = entities.get(normalized)
      if (existing) {
        // Each further mention adds its offset and nudges confidence up, capped at 1.
        existing.positions.push(position)
        existing.confidence = Math.min(1.0, existing.confidence + 0.05)
      } else {
        entities.set(normalized, {
          name: cleaned,
          normalizedName: normalized,
          type: dictEntry.type,
          confidence: 0.95,
          source: 'dictionary',
          positions: [position]
        })
      }
    }

    // Aliases containing "-", "." or " " may not survive tokenization intact
    // (e.g. a leading "." is trimmed), so they are also searched as plain
    // substrings. Only the first occurrence is recorded, and only when the
    // term was not already found above.
    for (const [alias, normalized] of ALIAS_MAP) {
      const isCompound = alias.includes('-') || alias.includes('.') || alias.includes(' ')
      if (!isCompound || entities.has(normalized)) continue

      const position = lowerText.indexOf(alias)
      if (position === -1) continue

      const dictEntry = TECH_DICTIONARY[normalized]
      if (!dictEntry) continue

      entities.set(normalized, {
        name: alias,
        normalizedName: normalized,
        type: dictEntry.type,
        confidence: 0.95,
        source: 'dictionary',
        positions: [position]
      })
    }
  }

  /** Finds file paths; entities are keyed `file:<path>`. */
  private extractFilePaths(text: string, entities: Map<string, ExtractedEntity>): void {
    // matchAll works on a private copy of the shared global regex, so its
    // lastIndex is never disturbed.
    for (const match of text.matchAll(FILE_PATH_REGEX)) {
      const filePath = match[1]
      if (!filePath || filePath.length < 4) continue

      // The match can begin with a delimiter; the path (group 1) always ends
      // the match, so its start is the match end minus the path length.
      const position = (match.index ?? 0) + match[0].length - filePath.length

      this.addIfAbsent(entities, `file:${filePath}`, {
        name: filePath,
        normalizedName: filePath,
        type: 'file',
        confidence: 0.85,
        source: 'rule',
        positions: [position]
      })
    }
  }

  /** Finds http(s) URLs; entities are keyed `url:<url>` and reported with type 'file'. */
  private extractUrls(text: string, entities: Map<string, ExtractedEntity>): void {
    for (const match of text.matchAll(URL_REGEX)) {
      const url = match[0]
      this.addIfAbsent(entities, `url:${url}`, {
        name: url,
        normalizedName: url,
        type: 'file',
        confidence: 0.9,
        source: 'rule',
        positions: [match.index ?? 0]
      })
    }
  }

  /** Finds ISO-8601 dates, optionally with a time and zone; keyed `date:<text>`. */
  private extractDates(text: string, entities: Map<string, ExtractedEntity>): void {
    const isoDateRegex = /\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?)?\b/g

    for (const match of text.matchAll(isoDateRegex)) {
      const dateStr = match[0]
      this.addIfAbsent(entities, `date:${dateStr}`, {
        name: dateStr,
        normalizedName: dateStr,
        type: 'date',
        confidence: 0.9,
        source: 'rule',
        positions: [match.index ?? 0]
      })
    }
  }

  /**
   * Finds people and organizations with the NLP library. Any failure inside
   * the library is ignored: this pass is best-effort.
   */
  private extractWithNlp(text: string, entities: Map<string, ExtractedEntity>): void {
    try {
      const doc = this.nlp(text)

      this.addNlpMatches(text, entities, doc.people(), {
        keyPrefix: 'person',
        type: 'person',
        confidence: 0.7,
        skipKnownTerms: false
      })

      // Organizations are reported as concepts; names the dictionary already
      // knows are left to the dictionary pass.
      this.addNlpMatches(text, entities, doc.organizations(), {
        keyPrefix: 'org',
        type: 'concept',
        confidence: 0.65,
        skipKnownTerms: true
      })
    } catch {
      // NLP extraction failure is non-critical
    }
  }

  /**
   * Adds one entity per distinct name in an NLP selection, keyed
   * `<keyPrefix>:<lower-cased name>`. The recorded position is the first
   * occurrence of the name in `text`, or -1 if the library returned a form
   * that does not appear verbatim.
   */
  private addNlpMatches(
    text: string,
    entities: Map<string, ExtractedEntity>,
    selection: any,
    options: { keyPrefix: string; type: ExtractedEntity['type']; confidence: number; skipKnownTerms: boolean }
  ): void {
    if (!selection || !(selection.length > 0)) return

    for (const raw of selection.out('array') as string[]) {
      const cleaned = raw.trim()
      if (cleaned.length < 2) continue

      const lowered = cleaned.toLowerCase()
      if (options.skipKnownTerms && ALIAS_MAP.has(lowered)) continue

      this.addIfAbsent(entities, `${options.keyPrefix}:${lowered}`, {
        name: cleaned,
        normalizedName: lowered,
        type: options.type,
        confidence: options.confidence,
        source: 'nlp',
        positions: [text.indexOf(cleaned)]
      })
    }
  }

  /**
   * Maps a name to its canonical dictionary key.
   *
   * @param name - Any spelling of an entity name.
   * @returns The canonical key when the trimmed, lower-cased name is a known
   *   term or alias; otherwise the trimmed, lower-cased name itself.
   */
  static normalizeEntityName(name: string): string {
    const lower = name.toLowerCase().trim()
    return ALIAS_MAP.get(lower) ?? lower
  }
}