claude-brain 0.15.2 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249)
  1. package/README.md +191 -191
  2. package/VERSION +1 -1
  3. package/assets/CLAUDE-unified.md +11 -11
  4. package/assets/CLAUDE.md +29 -11
  5. package/bunfig.toml +8 -8
  6. package/package.json +82 -82
  7. package/packs/backend/node.json +173 -173
  8. package/packs/core/javascript.json +176 -176
  9. package/packs/core/typescript.json +222 -222
  10. package/packs/frontend/react.json +254 -254
  11. package/packs/meta/testing.json +172 -172
  12. package/scripts/postinstall.mjs +341 -341
  13. package/src/automation/auto-context.ts +240 -240
  14. package/src/automation/decision-detector.ts +452 -452
  15. package/src/automation/index.ts +11 -11
  16. package/src/automation/phase12-manager.ts +456 -456
  17. package/src/automation/proactive-recall.ts +373 -373
  18. package/src/automation/project-detector.ts +310 -310
  19. package/src/automation/repo-scanner.ts +205 -205
  20. package/src/cli/auto-setup.ts +82 -82
  21. package/src/cli/bin.ts +209 -202
  22. package/src/cli/commands/chroma.ts +573 -573
  23. package/src/cli/commands/git-hook.ts +189 -189
  24. package/src/cli/commands/hooks.ts +213 -213
  25. package/src/cli/commands/init.ts +122 -122
  26. package/src/cli/commands/install-mcp.ts +92 -92
  27. package/src/cli/commands/pack.ts +197 -197
  28. package/src/cli/commands/refresh.ts +323 -0
  29. package/src/cli/commands/serve.ts +167 -173
  30. package/src/cli/commands/start.ts +42 -42
  31. package/src/cli/commands/uninstall-mcp.ts +41 -41
  32. package/src/cli/commands/update.ts +124 -121
  33. package/src/cli/diagnose.ts +4 -4
  34. package/src/cli/health-check.ts +4 -4
  35. package/src/cli/migrate-chroma.ts +106 -106
  36. package/src/cli/setup.ts +4 -4
  37. package/src/cli/ui/animations.ts +80 -80
  38. package/src/cli/ui/components.ts +82 -82
  39. package/src/cli/ui/index.ts +4 -4
  40. package/src/cli/ui/logo.ts +36 -36
  41. package/src/cli/ui/theme.ts +55 -55
  42. package/src/config/defaults.ts +50 -50
  43. package/src/config/home.ts +55 -55
  44. package/src/config/index.ts +7 -7
  45. package/src/config/loader.ts +166 -166
  46. package/src/config/migration.ts +76 -76
  47. package/src/config/schema.ts +360 -360
  48. package/src/config/validator.ts +184 -184
  49. package/src/config/watcher.ts +86 -86
  50. package/src/context/assembler.ts +398 -398
  51. package/src/context/cache-manager.ts +101 -101
  52. package/src/context/formatter.ts +84 -84
  53. package/src/context/hierarchy.ts +85 -85
  54. package/src/context/index.ts +83 -83
  55. package/src/context/progress-tracker.ts +174 -174
  56. package/src/context/standards-manager.ts +287 -287
  57. package/src/context/types.ts +252 -252
  58. package/src/context/validator.ts +58 -58
  59. package/src/diagnostics/index.ts +123 -123
  60. package/src/health/index.ts +229 -229
  61. package/src/hooks/brain-hook.ts +128 -112
  62. package/src/hooks/capture.ts +168 -205
  63. package/src/hooks/deduplicator.ts +72 -72
  64. package/src/hooks/git-capture.ts +109 -109
  65. package/src/hooks/git-hook-installer.ts +207 -207
  66. package/src/hooks/index.ts +20 -20
  67. package/src/hooks/installer.ts +194 -194
  68. package/src/hooks/passive-classifier.ts +404 -723
  69. package/src/hooks/queue.ts +129 -129
  70. package/src/hooks/session-tracker.ts +312 -275
  71. package/src/hooks/types.ts +47 -47
  72. package/src/index.ts +7 -7
  73. package/src/intelligence/cross-project/affinity.ts +162 -162
  74. package/src/intelligence/cross-project/generalizer.ts +283 -283
  75. package/src/intelligence/cross-project/index.ts +13 -13
  76. package/src/intelligence/cross-project/transfer.ts +201 -201
  77. package/src/intelligence/index.ts +24 -24
  78. package/src/intelligence/optimization/index.ts +10 -10
  79. package/src/intelligence/optimization/precompute.ts +202 -202
  80. package/src/intelligence/optimization/semantic-cache.ts +207 -207
  81. package/src/intelligence/prediction/context-anticipator.ts +198 -198
  82. package/src/intelligence/prediction/decision-predictor.ts +184 -184
  83. package/src/intelligence/prediction/index.ts +13 -13
  84. package/src/intelligence/prediction/recommender.ts +268 -268
  85. package/src/intelligence/reasoning/chain-retrieval.ts +247 -247
  86. package/src/intelligence/reasoning/counterfactual.ts +248 -248
  87. package/src/intelligence/reasoning/index.ts +13 -13
  88. package/src/intelligence/reasoning/synthesizer.ts +169 -169
  89. package/src/intelligence/temporal/evolution.ts +197 -197
  90. package/src/intelligence/temporal/index.ts +16 -16
  91. package/src/intelligence/temporal/query-processor.ts +190 -190
  92. package/src/intelligence/temporal/timeline.ts +259 -259
  93. package/src/intelligence/temporal/trends.ts +263 -263
  94. package/src/knowledge/entity-extractor.ts +416 -416
  95. package/src/knowledge/graph/builder.ts +185 -185
  96. package/src/knowledge/graph/linker.ts +201 -201
  97. package/src/knowledge/graph/memory-graph.ts +359 -359
  98. package/src/knowledge/graph/schema.ts +99 -99
  99. package/src/knowledge/graph/search.ts +168 -168
  100. package/src/knowledge/relationship-extractor.ts +108 -108
  101. package/src/memory/chroma/client.ts +174 -174
  102. package/src/memory/chroma/collection-manager.ts +94 -94
  103. package/src/memory/chroma/config.ts +57 -57
  104. package/src/memory/chroma/embeddings.ts +155 -155
  105. package/src/memory/chroma/index.ts +82 -82
  106. package/src/memory/chroma/migration.ts +270 -270
  107. package/src/memory/chroma/schemas.ts +69 -69
  108. package/src/memory/chroma/search.ts +315 -315
  109. package/src/memory/chroma/store.ts +741 -741
  110. package/src/memory/consolidation/archiver.ts +164 -164
  111. package/src/memory/consolidation/merger.ts +186 -186
  112. package/src/memory/consolidation/scorer.ts +138 -138
  113. package/src/memory/context-builder.ts +236 -236
  114. package/src/memory/database.ts +169 -169
  115. package/src/memory/embedding-utils.ts +156 -156
  116. package/src/memory/embeddings.ts +226 -226
  117. package/src/memory/episodic/detector.ts +108 -108
  118. package/src/memory/episodic/manager.ts +351 -351
  119. package/src/memory/episodic/summarizer.ts +179 -179
  120. package/src/memory/episodic/types.ts +52 -52
  121. package/src/memory/index.ts +582 -582
  122. package/src/memory/knowledge-extractor.ts +455 -455
  123. package/src/memory/learning.ts +378 -378
  124. package/src/memory/patterns.ts +396 -396
  125. package/src/memory/schema.ts +88 -88
  126. package/src/memory/search.ts +309 -309
  127. package/src/memory/store.ts +787 -787
  128. package/src/memory/types.ts +121 -121
  129. package/src/orchestrator/coordinator.ts +272 -272
  130. package/src/orchestrator/decision-logger.ts +228 -228
  131. package/src/orchestrator/event-emitter.ts +198 -198
  132. package/src/orchestrator/event-queue.ts +184 -184
  133. package/src/orchestrator/handlers/base-handler.ts +70 -70
  134. package/src/orchestrator/handlers/context-handler.ts +73 -73
  135. package/src/orchestrator/handlers/decision-handler.ts +204 -204
  136. package/src/orchestrator/handlers/index.ts +10 -10
  137. package/src/orchestrator/handlers/status-handler.ts +131 -131
  138. package/src/orchestrator/handlers/task-handler.ts +171 -171
  139. package/src/orchestrator/index.ts +275 -275
  140. package/src/orchestrator/task-parser.ts +284 -284
  141. package/src/orchestrator/types.ts +98 -98
  142. package/src/packs/index.ts +9 -9
  143. package/src/packs/loader.ts +134 -134
  144. package/src/packs/manager.ts +204 -204
  145. package/src/packs/ranker.ts +78 -78
  146. package/src/packs/types.ts +81 -81
  147. package/src/phase12/index.ts +5 -5
  148. package/src/retrieval/bm25/index.ts +300 -300
  149. package/src/retrieval/bm25/tokenizer.ts +184 -184
  150. package/src/retrieval/feedback/adaptive.ts +223 -223
  151. package/src/retrieval/feedback/index.ts +16 -16
  152. package/src/retrieval/feedback/metrics.ts +223 -223
  153. package/src/retrieval/feedback/store.ts +283 -283
  154. package/src/retrieval/fusion/index.ts +194 -194
  155. package/src/retrieval/fusion/rrf.ts +163 -163
  156. package/src/retrieval/index.ts +12 -12
  157. package/src/retrieval/pipeline.ts +375 -375
  158. package/src/retrieval/query/expander.ts +198 -198
  159. package/src/retrieval/query/index.ts +27 -27
  160. package/src/retrieval/query/intent-classifier.ts +236 -236
  161. package/src/retrieval/query/temporal-parser.ts +295 -295
  162. package/src/retrieval/reranker/index.ts +188 -188
  163. package/src/retrieval/reranker/model.ts +95 -95
  164. package/src/retrieval/service.ts +125 -125
  165. package/src/retrieval/types.ts +162 -162
  166. package/src/routing/entity-extractor.ts +428 -428
  167. package/src/routing/intent-classifier.ts +450 -436
  168. package/src/routing/response-filter.ts +261 -258
  169. package/src/routing/router.ts +1441 -1322
  170. package/src/routing/search-engine.ts +515 -475
  171. package/src/routing/types.ts +94 -94
  172. package/src/scripts/health-check.ts +118 -118
  173. package/src/scripts/setup.ts +122 -122
  174. package/src/server/handlers/call-tool.ts +156 -156
  175. package/src/server/handlers/index.ts +9 -9
  176. package/src/server/handlers/list-tools.ts +35 -35
  177. package/src/server/handlers/tools/analyze-decision-evolution.ts +151 -151
  178. package/src/server/handlers/tools/auto-remember.ts +200 -200
  179. package/src/server/handlers/tools/brain.ts +85 -85
  180. package/src/server/handlers/tools/create-project.ts +135 -135
  181. package/src/server/handlers/tools/detect-trends.ts +144 -144
  182. package/src/server/handlers/tools/find-cross-project-patterns.ts +168 -168
  183. package/src/server/handlers/tools/get-activity-log.ts +194 -194
  184. package/src/server/handlers/tools/get-code-standards.ts +124 -124
  185. package/src/server/handlers/tools/get-corrections.ts +154 -154
  186. package/src/server/handlers/tools/get-decision-timeline.ts +172 -172
  187. package/src/server/handlers/tools/get-episode.ts +103 -103
  188. package/src/server/handlers/tools/get-patterns.ts +158 -158
  189. package/src/server/handlers/tools/get-phase12-status.ts +63 -63
  190. package/src/server/handlers/tools/get-project-context.ts +75 -75
  191. package/src/server/handlers/tools/get-recommendations.ts +145 -145
  192. package/src/server/handlers/tools/index.ts +31 -31
  193. package/src/server/handlers/tools/init-project.ts +757 -757
  194. package/src/server/handlers/tools/list-episodes.ts +90 -90
  195. package/src/server/handlers/tools/list-projects.ts +125 -125
  196. package/src/server/handlers/tools/rate-memory.ts +101 -101
  197. package/src/server/handlers/tools/recall-similar.ts +87 -87
  198. package/src/server/handlers/tools/recognize-pattern.ts +126 -126
  199. package/src/server/handlers/tools/record-correction.ts +125 -125
  200. package/src/server/handlers/tools/remember-decision.ts +153 -153
  201. package/src/server/handlers/tools/schemas.ts +253 -253
  202. package/src/server/handlers/tools/search-knowledge-graph.ts +102 -102
  203. package/src/server/handlers/tools/smart-context.ts +146 -146
  204. package/src/server/handlers/tools/update-progress.ts +131 -131
  205. package/src/server/handlers/tools/what-if-analysis.ts +135 -135
  206. package/src/server/http-api.ts +693 -693
  207. package/src/server/index.ts +40 -40
  208. package/src/server/mcp-server.ts +283 -283
  209. package/src/server/providers/index.ts +7 -7
  210. package/src/server/providers/prompts.ts +327 -327
  211. package/src/server/providers/resources.ts +622 -622
  212. package/src/server/services.ts +468 -468
  213. package/src/server/types.ts +39 -39
  214. package/src/server/utils/error-handler.ts +155 -155
  215. package/src/server/utils/index.ts +13 -13
  216. package/src/server/utils/memory-indicator.ts +83 -83
  217. package/src/server/utils/request-context.ts +122 -122
  218. package/src/server/utils/response-formatter.ts +129 -129
  219. package/src/server/utils/validators.ts +210 -210
  220. package/src/setup/index.ts +48 -48
  221. package/src/setup/wizard.ts +461 -461
  222. package/src/tools/index.ts +24 -24
  223. package/src/tools/registry.ts +115 -115
  224. package/src/tools/schemas.test.ts +30 -30
  225. package/src/tools/schemas.ts +617 -617
  226. package/src/tools/types.ts +412 -412
  227. package/src/utils/circuit-breaker.ts +130 -130
  228. package/src/utils/cleanup.ts +34 -34
  229. package/src/utils/error-handler.ts +132 -132
  230. package/src/utils/error-messages.ts +60 -60
  231. package/src/utils/fallback.ts +45 -45
  232. package/src/utils/index.ts +54 -54
  233. package/src/utils/logger-utils.ts +80 -80
  234. package/src/utils/logger.ts +88 -88
  235. package/src/utils/phase12-helper.ts +56 -56
  236. package/src/utils/retry.ts +94 -94
  237. package/src/utils/timing.ts +47 -47
  238. package/src/utils/transaction.ts +63 -63
  239. package/src/vault/frontmatter.ts +264 -264
  240. package/src/vault/index.ts +318 -318
  241. package/src/vault/paths.ts +106 -106
  242. package/src/vault/query.ts +422 -422
  243. package/src/vault/reader.ts +264 -264
  244. package/src/vault/templates.ts +186 -186
  245. package/src/vault/types.ts +73 -73
  246. package/src/vault/watcher.ts +277 -277
  247. package/src/vault/writer.ts +413 -413
  248. package/tsconfig.json +30 -30
  249. package/src/cli/auto-update.ts +0 -157
@@ -1,416 +1,416 @@
1
- /**
2
- * Entity Extractor
3
- * Rule-based NER using compromise + technology dictionary
4
- */
5
-
6
- import type { EntityType } from './graph/schema'
7
-
8
- export interface ExtractedEntity {
9
- name: string
10
- normalizedName: string
11
- type: EntityType
12
- confidence: number
13
- source: 'dictionary' | 'nlp' | 'rule'
14
- positions: number[]
15
- }
16
-
17
- const TECH_DICTIONARY: Record<string, { aliases: string[]; type: EntityType }> = {
18
- // Languages
19
- typescript: { aliases: ['ts', 'typescript', 'type-script'], type: 'technology' },
20
- javascript: { aliases: ['js', 'javascript', 'ecmascript', 'es6', 'es2015', 'es2020', 'es2021', 'es2022'], type: 'technology' },
21
- python: { aliases: ['py', 'python3', 'python2'], type: 'technology' },
22
- rust: { aliases: ['rust-lang', 'rustlang'], type: 'technology' },
23
- go: { aliases: ['golang', 'go-lang'], type: 'technology' },
24
- java: { aliases: [], type: 'technology' },
25
- csharp: { aliases: ['c#', 'c-sharp', 'dotnet', '.net'], type: 'technology' },
26
- ruby: { aliases: ['rb'], type: 'technology' },
27
- php: { aliases: [], type: 'technology' },
28
- swift: { aliases: [], type: 'technology' },
29
- kotlin: { aliases: ['kt'], type: 'technology' },
30
- scala: { aliases: [], type: 'technology' },
31
- elixir: { aliases: [], type: 'technology' },
32
- clojure: { aliases: ['clj'], type: 'technology' },
33
- haskell: { aliases: ['hs'], type: 'technology' },
34
- lua: { aliases: [], type: 'technology' },
35
- perl: { aliases: ['pl'], type: 'technology' },
36
- r: { aliases: ['r-lang', 'rlang'], type: 'technology' },
37
- dart: { aliases: [], type: 'technology' },
38
- sql: { aliases: ['mysql', 'postgresql', 'postgres', 'sqlite', 'mssql', 'mariadb'], type: 'technology' },
39
- html: { aliases: ['html5'], type: 'technology' },
40
- css: { aliases: ['css3', 'scss', 'sass', 'less', 'stylus'], type: 'technology' },
41
- graphql: { aliases: ['gql'], type: 'technology' },
42
- yaml: { aliases: ['yml'], type: 'technology' },
43
- json: { aliases: [], type: 'technology' },
44
- markdown: { aliases: ['md'], type: 'technology' },
45
- bash: { aliases: ['sh', 'shell', 'zsh', 'fish'], type: 'technology' },
46
- c: { aliases: ['c-lang', 'ansi-c'], type: 'technology' },
47
- cpp: { aliases: ['c++', 'cplusplus'], type: 'technology' },
48
-
49
- // Frontend Frameworks
50
- react: { aliases: ['reactjs', 'react.js', 'react-dom'], type: 'technology' },
51
- vue: { aliases: ['vuejs', 'vue.js', 'vue3', 'vue2'], type: 'technology' },
52
- angular: { aliases: ['angularjs', 'angular.js', 'ng'], type: 'technology' },
53
- svelte: { aliases: ['sveltejs', 'sveltekit'], type: 'technology' },
54
- nextjs: { aliases: ['next.js', 'next', 'nextjs'], type: 'technology' },
55
- nuxt: { aliases: ['nuxtjs', 'nuxt.js'], type: 'technology' },
56
- remix: { aliases: ['remix-run'], type: 'technology' },
57
- astro: { aliases: ['astrojs'], type: 'technology' },
58
- gatsby: { aliases: ['gatsbyjs'], type: 'technology' },
59
- solid: { aliases: ['solidjs', 'solid-js'], type: 'technology' },
60
- preact: { aliases: ['preactjs'], type: 'technology' },
61
- htmx: { aliases: [], type: 'technology' },
62
- jquery: { aliases: [], type: 'technology' },
63
-
64
- // Backend Frameworks
65
- express: { aliases: ['expressjs', 'express.js'], type: 'technology' },
66
- fastify: { aliases: [], type: 'technology' },
67
- hono: { aliases: [], type: 'technology' },
68
- koa: { aliases: ['koajs'], type: 'technology' },
69
- nestjs: { aliases: ['nest.js', 'nest'], type: 'technology' },
70
- django: { aliases: [], type: 'technology' },
71
- flask: { aliases: [], type: 'technology' },
72
- fastapi: { aliases: ['fast-api'], type: 'technology' },
73
- rails: { aliases: ['ruby-on-rails', 'ror'], type: 'technology' },
74
- spring: { aliases: ['spring-boot', 'springboot'], type: 'technology' },
75
- laravel: { aliases: [], type: 'technology' },
76
- phoenix: { aliases: [], type: 'technology' },
77
- gin: { aliases: [], type: 'technology' },
78
- actix: { aliases: ['actix-web'], type: 'technology' },
79
-
80
- // Databases
81
- mongodb: { aliases: ['mongo'], type: 'technology' },
82
- redis: { aliases: [], type: 'technology' },
83
- elasticsearch: { aliases: ['elastic', 'es'], type: 'technology' },
84
- dynamodb: { aliases: ['dynamo'], type: 'technology' },
85
- cassandra: { aliases: [], type: 'technology' },
86
- neo4j: { aliases: [], type: 'technology' },
87
- couchdb: { aliases: ['couch'], type: 'technology' },
88
- firebase: { aliases: ['firestore'], type: 'technology' },
89
- supabase: { aliases: [], type: 'technology' },
90
- prisma: { aliases: [], type: 'technology' },
91
- drizzle: { aliases: ['drizzle-orm'], type: 'technology' },
92
- sequelize: { aliases: [], type: 'technology' },
93
- typeorm: { aliases: [], type: 'technology' },
94
- chromadb: { aliases: ['chroma'], type: 'technology' },
95
- pinecone: { aliases: [], type: 'technology' },
96
- weaviate: { aliases: [], type: 'technology' },
97
- qdrant: { aliases: [], type: 'technology' },
98
-
99
- // Cloud & DevOps
100
- aws: { aliases: ['amazon-web-services', 'amazon'], type: 'technology' },
101
- gcp: { aliases: ['google-cloud', 'google-cloud-platform'], type: 'technology' },
102
- azure: { aliases: ['microsoft-azure'], type: 'technology' },
103
- docker: { aliases: ['dockerfile', 'docker-compose'], type: 'technology' },
104
- kubernetes: { aliases: ['k8s', 'kube'], type: 'technology' },
105
- terraform: { aliases: ['tf'], type: 'technology' },
106
- ansible: { aliases: [], type: 'technology' },
107
- jenkins: { aliases: [], type: 'technology' },
108
- github: { aliases: ['gh'], type: 'technology' },
109
- gitlab: { aliases: [], type: 'technology' },
110
- vercel: { aliases: [], type: 'technology' },
111
- netlify: { aliases: [], type: 'technology' },
112
- cloudflare: { aliases: ['cf'], type: 'technology' },
113
- nginx: { aliases: [], type: 'technology' },
114
- caddy: { aliases: [], type: 'technology' },
115
-
116
- // Tools & Libraries
117
- webpack: { aliases: [], type: 'technology' },
118
- vite: { aliases: [], type: 'technology' },
119
- esbuild: { aliases: [], type: 'technology' },
120
- rollup: { aliases: ['rollupjs'], type: 'technology' },
121
- parcel: { aliases: [], type: 'technology' },
122
- turbopack: { aliases: [], type: 'technology' },
123
- bun: { aliases: ['bunjs'], type: 'technology' },
124
- deno: { aliases: [], type: 'technology' },
125
- node: { aliases: ['nodejs', 'node.js'], type: 'technology' },
126
- npm: { aliases: [], type: 'technology' },
127
- yarn: { aliases: [], type: 'technology' },
128
- pnpm: { aliases: [], type: 'technology' },
129
- git: { aliases: [], type: 'technology' },
130
- jest: { aliases: [], type: 'technology' },
131
- vitest: { aliases: [], type: 'technology' },
132
- mocha: { aliases: [], type: 'technology' },
133
- cypress: { aliases: [], type: 'technology' },
134
- playwright: { aliases: [], type: 'technology' },
135
- eslint: { aliases: [], type: 'technology' },
136
- prettier: { aliases: [], type: 'technology' },
137
- biome: { aliases: ['biomejs'], type: 'technology' },
138
- tailwind: { aliases: ['tailwindcss', 'tailwind-css'], type: 'technology' },
139
- bootstrap: { aliases: [], type: 'technology' },
140
- zod: { aliases: [], type: 'technology' },
141
- trpc: { aliases: ['t-rpc'], type: 'technology' },
142
- graphql: { aliases: ['gql'], type: 'technology' },
143
- grpc: { aliases: ['g-rpc'], type: 'technology' },
144
- websocket: { aliases: ['ws', 'websockets'], type: 'technology' },
145
- oauth: { aliases: ['oauth2', 'oauth2.0'], type: 'technology' },
146
- jwt: { aliases: ['json-web-token'], type: 'technology' },
147
- openai: { aliases: ['gpt', 'chatgpt', 'gpt-4'], type: 'technology' },
148
- anthropic: { aliases: ['claude', 'claude-ai'], type: 'technology' },
149
- langchain: { aliases: [], type: 'technology' },
150
- llamaindex: { aliases: ['llama-index'], type: 'technology' },
151
- huggingface: { aliases: ['hf', 'hugging-face'], type: 'technology' },
152
- tensorflow: { aliases: ['tf'], type: 'technology' },
153
- pytorch: { aliases: ['torch'], type: 'technology' },
154
- pino: { aliases: [], type: 'technology' },
155
- winston: { aliases: [], type: 'technology' },
156
- storybook: { aliases: [], type: 'technology' },
157
- nx: { aliases: [], type: 'technology' },
158
- turborepo: { aliases: [], type: 'technology' },
159
- lerna: { aliases: [], type: 'technology' },
160
- compromise: { aliases: ['compromise-nlp'], type: 'technology' },
161
- minisearch: { aliases: [], type: 'technology' },
162
-
163
- // Concepts
164
- microservices: { aliases: ['micro-services', 'microservice'], type: 'concept' },
165
- monolith: { aliases: ['monolithic'], type: 'concept' },
166
- serverless: { aliases: ['faas', 'lambda'], type: 'concept' },
167
- rest: { aliases: ['restful', 'rest-api'], type: 'concept' },
168
- api: { aliases: ['apis'], type: 'concept' },
169
- ci: { aliases: ['continuous-integration'], type: 'concept' },
170
- cd: { aliases: ['continuous-deployment', 'continuous-delivery'], type: 'concept' },
171
- tdd: { aliases: ['test-driven-development'], type: 'concept' },
172
- bdd: { aliases: ['behavior-driven-development'], type: 'concept' },
173
- ddd: { aliases: ['domain-driven-design'], type: 'concept' },
174
- cqrs: { aliases: ['command-query-responsibility-segregation'], type: 'concept' },
175
- mvc: { aliases: ['model-view-controller'], type: 'concept' },
176
- mvvm: { aliases: ['model-view-viewmodel'], type: 'concept' },
177
- oop: { aliases: ['object-oriented-programming', 'object-oriented'], type: 'concept' },
178
- fp: { aliases: ['functional-programming', 'functional'], type: 'concept' },
179
- ssr: { aliases: ['server-side-rendering'], type: 'concept' },
180
- ssg: { aliases: ['static-site-generation'], type: 'concept' },
181
- spa: { aliases: ['single-page-application', 'single-page-app'], type: 'concept' },
182
- pwa: { aliases: ['progressive-web-app'], type: 'concept' },
183
- mcp: { aliases: ['model-context-protocol'], type: 'concept' },
184
- rag: { aliases: ['retrieval-augmented-generation'], type: 'concept' },
185
- embedding: { aliases: ['embeddings', 'vector-embedding'], type: 'concept' },
186
- 'vector-search': { aliases: ['semantic-search', 'vector-similarity'], type: 'concept' },
187
- }
188
-
189
- // Build reverse lookup: alias → normalized name
190
- const ALIAS_MAP = new Map<string, string>()
191
- for (const [normalized, entry] of Object.entries(TECH_DICTIONARY)) {
192
- ALIAS_MAP.set(normalized.toLowerCase(), normalized)
193
- for (const alias of entry.aliases) {
194
- ALIAS_MAP.set(alias.toLowerCase(), normalized)
195
- }
196
- }
197
-
198
- // File path regex - handles ./path, ~/path, /path, and relative paths
199
- const FILE_PATH_REGEX = /(?:^|[\s(,])((?:\.\/|~\/|\/)?(?:[\w-]+\/)+[\w.-]+\.\w+)/g
200
- // URL regex
201
- const URL_REGEX = /https?:\/\/[^\s),]+/g
202
- // Version number regex
203
- const VERSION_REGEX = /\b[vV]?\d+\.\d+(?:\.\d+)?(?:-[\w.]+)?\b/g
204
-
205
- let nlpModule: any = null
206
-
207
- async function loadNlp(): Promise<any> {
208
- if (!nlpModule) {
209
- try {
210
- nlpModule = (await import('compromise')).default
211
- } catch {
212
- nlpModule = null
213
- }
214
- }
215
- return nlpModule
216
- }
217
-
218
- export class EntityExtractor {
219
- private nlpLoaded = false
220
- private nlp: any = null
221
-
222
- async initialize(): Promise<void> {
223
- this.nlp = await loadNlp()
224
- this.nlpLoaded = this.nlp !== null
225
- }
226
-
227
- extract(text: string): ExtractedEntity[] {
228
- const entities: Map<string, ExtractedEntity> = new Map()
229
-
230
- this.extractFromDictionary(text, entities)
231
- this.extractFilePaths(text, entities)
232
- this.extractUrls(text, entities)
233
- this.extractDates(text, entities)
234
-
235
- if (this.nlpLoaded && this.nlp) {
236
- this.extractWithNlp(text, entities)
237
- }
238
-
239
- return Array.from(entities.values())
240
- .sort((a, b) => b.confidence - a.confidence)
241
- }
242
-
243
- extractBatch(texts: string[]): ExtractedEntity[][] {
244
- return texts.map(text => this.extract(text))
245
- }
246
-
247
- private extractFromDictionary(text: string, entities: Map<string, ExtractedEntity>): void {
248
- const lowerText = text.toLowerCase()
249
- const words = lowerText.split(/[\s,;:()[\]{}"'`|/\\]+/)
250
-
251
- for (const word of words) {
252
- const cleaned = word.replace(/^[^a-z0-9]+|[^a-z0-9]+$/g, '')
253
- if (cleaned.length < 2) continue
254
-
255
- const normalized = ALIAS_MAP.get(cleaned)
256
- if (normalized) {
257
- const dictEntry = TECH_DICTIONARY[normalized]
258
- if (!dictEntry) continue
259
-
260
- const existing = entities.get(normalized)
261
- if (existing) {
262
- existing.positions.push(lowerText.indexOf(cleaned))
263
- existing.confidence = Math.min(1.0, existing.confidence + 0.05)
264
- } else {
265
- entities.set(normalized, {
266
- name: cleaned,
267
- normalizedName: normalized,
268
- type: dictEntry.type,
269
- confidence: 0.95,
270
- source: 'dictionary',
271
- positions: [lowerText.indexOf(cleaned)]
272
- })
273
- }
274
- }
275
- }
276
-
277
- // Also check multi-word aliases
278
- for (const [alias, normalized] of ALIAS_MAP) {
279
- if (alias.includes('-') || alias.includes('.') || alias.includes(' ')) {
280
- if (lowerText.includes(alias) && !entities.has(normalized)) {
281
- const dictEntry = TECH_DICTIONARY[normalized]
282
- if (!dictEntry) continue
283
-
284
- entities.set(normalized, {
285
- name: alias,
286
- normalizedName: normalized,
287
- type: dictEntry.type,
288
- confidence: 0.95,
289
- source: 'dictionary',
290
- positions: [lowerText.indexOf(alias)]
291
- })
292
- }
293
- }
294
- }
295
- }
296
-
297
- private extractFilePaths(text: string, entities: Map<string, ExtractedEntity>): void {
298
- let match: RegExpExecArray | null
299
- const regex = new RegExp(FILE_PATH_REGEX.source, FILE_PATH_REGEX.flags)
300
-
301
- while ((match = regex.exec(text)) !== null) {
302
- const filePath = match[1].trim()
303
- if (filePath.length < 4) continue
304
-
305
- const key = `file:${filePath}`
306
- if (!entities.has(key)) {
307
- entities.set(key, {
308
- name: filePath,
309
- normalizedName: filePath,
310
- type: 'file',
311
- confidence: 0.85,
312
- source: 'rule',
313
- positions: [match.index]
314
- })
315
- }
316
- }
317
- }
318
-
319
- private extractUrls(text: string, entities: Map<string, ExtractedEntity>): void {
320
- let match: RegExpExecArray | null
321
- const regex = new RegExp(URL_REGEX.source, URL_REGEX.flags)
322
-
323
- while ((match = regex.exec(text)) !== null) {
324
- const url = match[0]
325
- const key = `url:${url}`
326
- if (!entities.has(key)) {
327
- entities.set(key, {
328
- name: url,
329
- normalizedName: url,
330
- type: 'file',
331
- confidence: 0.9,
332
- source: 'rule',
333
- positions: [match.index]
334
- })
335
- }
336
- }
337
- }
338
-
339
- private extractDates(text: string, entities: Map<string, ExtractedEntity>): void {
340
- // ISO date pattern
341
- const isoDateRegex = /\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?)?\b/g
342
- let match: RegExpExecArray | null
343
-
344
- while ((match = isoDateRegex.exec(text)) !== null) {
345
- const dateStr = match[0]
346
- const key = `date:${dateStr}`
347
- if (!entities.has(key)) {
348
- entities.set(key, {
349
- name: dateStr,
350
- normalizedName: dateStr,
351
- type: 'date',
352
- confidence: 0.9,
353
- source: 'rule',
354
- positions: [match.index]
355
- })
356
- }
357
- }
358
- }
359
-
360
- private extractWithNlp(text: string, entities: Map<string, ExtractedEntity>): void {
361
- try {
362
- const doc = this.nlp(text)
363
-
364
- // Extract people
365
- const people = doc.people()
366
- if (people && people.length > 0) {
367
- for (const person of people.out('array') as string[]) {
368
- const cleaned = person.trim()
369
- if (cleaned.length < 2) continue
370
- const key = `person:${cleaned.toLowerCase()}`
371
- if (!entities.has(key)) {
372
- entities.set(key, {
373
- name: cleaned,
374
- normalizedName: cleaned.toLowerCase(),
375
- type: 'person',
376
- confidence: 0.7,
377
- source: 'nlp',
378
- positions: [text.indexOf(cleaned)]
379
- })
380
- }
381
- }
382
- }
383
-
384
- // Extract organizations
385
- const orgs = doc.organizations()
386
- if (orgs && orgs.length > 0) {
387
- for (const org of orgs.out('array') as string[]) {
388
- const cleaned = org.trim()
389
- if (cleaned.length < 2) continue
390
- // Check if it's already a known technology
391
- const normalizedCheck = ALIAS_MAP.get(cleaned.toLowerCase())
392
- if (normalizedCheck) continue
393
-
394
- const key = `org:${cleaned.toLowerCase()}`
395
- if (!entities.has(key)) {
396
- entities.set(key, {
397
- name: cleaned,
398
- normalizedName: cleaned.toLowerCase(),
399
- type: 'concept',
400
- confidence: 0.65,
401
- source: 'nlp',
402
- positions: [text.indexOf(cleaned)]
403
- })
404
- }
405
- }
406
- }
407
- } catch {
408
- // NLP extraction failure is non-critical
409
- }
410
- }
411
-
412
- static normalizeEntityName(name: string): string {
413
- const lower = name.toLowerCase().trim()
414
- return ALIAS_MAP.get(lower) || lower
415
- }
416
- }
1
+ /**
2
+ * Entity Extractor
3
+ * Rule-based NER using compromise + technology dictionary
4
+ */
5
+
6
+ import type { EntityType } from './graph/schema'
7
+
8
+ export interface ExtractedEntity {
9
+ name: string
10
+ normalizedName: string
11
+ type: EntityType
12
+ confidence: number
13
+ source: 'dictionary' | 'nlp' | 'rule'
14
+ positions: number[]
15
+ }
16
+
17
/**
 * Known technologies and concepts, keyed by canonical name.
 *
 * Each entry lists the alternative spellings that should resolve to the canonical
 * name. Keys and aliases must be unique across the whole table: the reverse lookup
 * built from it is last-write-wins, so a spelling claimed twice silently loses one
 * of its owners.
 */
const TECH_DICTIONARY: Record<string, { aliases: string[]; type: EntityType }> = {
  // Languages
  typescript: { aliases: ['ts', 'typescript', 'type-script'], type: 'technology' },
  javascript: { aliases: ['js', 'javascript', 'ecmascript', 'es6', 'es2015', 'es2020', 'es2021', 'es2022'], type: 'technology' },
  python: { aliases: ['py', 'python3', 'python2'], type: 'technology' },
  rust: { aliases: ['rust-lang', 'rustlang'], type: 'technology' },
  go: { aliases: ['golang', 'go-lang'], type: 'technology' },
  java: { aliases: [], type: 'technology' },
  csharp: { aliases: ['c#', 'c-sharp', 'dotnet', '.net'], type: 'technology' },
  ruby: { aliases: ['rb'], type: 'technology' },
  php: { aliases: [], type: 'technology' },
  swift: { aliases: [], type: 'technology' },
  kotlin: { aliases: ['kt'], type: 'technology' },
  scala: { aliases: [], type: 'technology' },
  elixir: { aliases: [], type: 'technology' },
  clojure: { aliases: ['clj'], type: 'technology' },
  haskell: { aliases: ['hs'], type: 'technology' },
  lua: { aliases: [], type: 'technology' },
  perl: { aliases: ['pl'], type: 'technology' },
  r: { aliases: ['r-lang', 'rlang'], type: 'technology' },
  dart: { aliases: [], type: 'technology' },
  sql: { aliases: ['mysql', 'postgresql', 'postgres', 'sqlite', 'mssql', 'mariadb'], type: 'technology' },
  html: { aliases: ['html5'], type: 'technology' },
  css: { aliases: ['css3', 'scss', 'sass', 'less', 'stylus'], type: 'technology' },
  // graphql is declared once, here; a second identical entry under
  // "Tools & Libraries" was a duplicate object key and has been dropped.
  graphql: { aliases: ['gql'], type: 'technology' },
  yaml: { aliases: ['yml'], type: 'technology' },
  json: { aliases: [], type: 'technology' },
  markdown: { aliases: ['md'], type: 'technology' },
  bash: { aliases: ['sh', 'shell', 'zsh', 'fish'], type: 'technology' },
  c: { aliases: ['c-lang', 'ansi-c'], type: 'technology' },
  cpp: { aliases: ['c++', 'cplusplus'], type: 'technology' },

  // Frontend Frameworks
  react: { aliases: ['reactjs', 'react.js', 'react-dom'], type: 'technology' },
  vue: { aliases: ['vuejs', 'vue.js', 'vue3', 'vue2'], type: 'technology' },
  angular: { aliases: ['angularjs', 'angular.js', 'ng'], type: 'technology' },
  svelte: { aliases: ['sveltejs', 'sveltekit'], type: 'technology' },
  nextjs: { aliases: ['next.js', 'next', 'nextjs'], type: 'technology' },
  nuxt: { aliases: ['nuxtjs', 'nuxt.js'], type: 'technology' },
  remix: { aliases: ['remix-run'], type: 'technology' },
  astro: { aliases: ['astrojs'], type: 'technology' },
  gatsby: { aliases: ['gatsbyjs'], type: 'technology' },
  solid: { aliases: ['solidjs', 'solid-js'], type: 'technology' },
  preact: { aliases: ['preactjs'], type: 'technology' },
  htmx: { aliases: [], type: 'technology' },
  jquery: { aliases: [], type: 'technology' },

  // Backend Frameworks
  express: { aliases: ['expressjs', 'express.js'], type: 'technology' },
  fastify: { aliases: [], type: 'technology' },
  hono: { aliases: [], type: 'technology' },
  koa: { aliases: ['koajs'], type: 'technology' },
  nestjs: { aliases: ['nest.js', 'nest'], type: 'technology' },
  django: { aliases: [], type: 'technology' },
  flask: { aliases: [], type: 'technology' },
  fastapi: { aliases: ['fast-api'], type: 'technology' },
  rails: { aliases: ['ruby-on-rails', 'ror'], type: 'technology' },
  spring: { aliases: ['spring-boot', 'springboot'], type: 'technology' },
  laravel: { aliases: [], type: 'technology' },
  phoenix: { aliases: [], type: 'technology' },
  gin: { aliases: [], type: 'technology' },
  actix: { aliases: ['actix-web'], type: 'technology' },

  // Databases
  mongodb: { aliases: ['mongo'], type: 'technology' },
  redis: { aliases: [], type: 'technology' },
  elasticsearch: { aliases: ['elastic', 'es'], type: 'technology' },
  dynamodb: { aliases: ['dynamo'], type: 'technology' },
  cassandra: { aliases: [], type: 'technology' },
  neo4j: { aliases: [], type: 'technology' },
  couchdb: { aliases: ['couch'], type: 'technology' },
  firebase: { aliases: ['firestore'], type: 'technology' },
  supabase: { aliases: [], type: 'technology' },
  prisma: { aliases: [], type: 'technology' },
  drizzle: { aliases: ['drizzle-orm'], type: 'technology' },
  sequelize: { aliases: [], type: 'technology' },
  typeorm: { aliases: [], type: 'technology' },
  chromadb: { aliases: ['chroma'], type: 'technology' },
  pinecone: { aliases: [], type: 'technology' },
  weaviate: { aliases: [], type: 'technology' },
  qdrant: { aliases: [], type: 'technology' },

  // Cloud & DevOps
  aws: { aliases: ['amazon-web-services', 'amazon'], type: 'technology' },
  gcp: { aliases: ['google-cloud', 'google-cloud-platform'], type: 'technology' },
  azure: { aliases: ['microsoft-azure'], type: 'technology' },
  docker: { aliases: ['dockerfile', 'docker-compose'], type: 'technology' },
  kubernetes: { aliases: ['k8s', 'kube'], type: 'technology' },
  // 'tf' used to be listed here as well as under tensorflow. The later entry always
  // won in the reverse lookup, so the alias was dead for terraform; it now lives
  // only on tensorflow, which keeps the effective mapping unchanged.
  terraform: { aliases: [], type: 'technology' },
  ansible: { aliases: [], type: 'technology' },
  jenkins: { aliases: [], type: 'technology' },
  github: { aliases: ['gh'], type: 'technology' },
  gitlab: { aliases: [], type: 'technology' },
  vercel: { aliases: [], type: 'technology' },
  netlify: { aliases: [], type: 'technology' },
  cloudflare: { aliases: ['cf'], type: 'technology' },
  nginx: { aliases: [], type: 'technology' },
  caddy: { aliases: [], type: 'technology' },

  // Tools & Libraries
  webpack: { aliases: [], type: 'technology' },
  vite: { aliases: [], type: 'technology' },
  esbuild: { aliases: [], type: 'technology' },
  rollup: { aliases: ['rollupjs'], type: 'technology' },
  parcel: { aliases: [], type: 'technology' },
  turbopack: { aliases: [], type: 'technology' },
  bun: { aliases: ['bunjs'], type: 'technology' },
  deno: { aliases: [], type: 'technology' },
  node: { aliases: ['nodejs', 'node.js'], type: 'technology' },
  npm: { aliases: [], type: 'technology' },
  yarn: { aliases: [], type: 'technology' },
  pnpm: { aliases: [], type: 'technology' },
  git: { aliases: [], type: 'technology' },
  jest: { aliases: [], type: 'technology' },
  vitest: { aliases: [], type: 'technology' },
  mocha: { aliases: [], type: 'technology' },
  cypress: { aliases: [], type: 'technology' },
  playwright: { aliases: [], type: 'technology' },
  eslint: { aliases: [], type: 'technology' },
  prettier: { aliases: [], type: 'technology' },
  biome: { aliases: ['biomejs'], type: 'technology' },
  tailwind: { aliases: ['tailwindcss', 'tailwind-css'], type: 'technology' },
  bootstrap: { aliases: [], type: 'technology' },
  zod: { aliases: [], type: 'technology' },
  trpc: { aliases: ['t-rpc'], type: 'technology' },
  grpc: { aliases: ['g-rpc'], type: 'technology' },
  websocket: { aliases: ['ws', 'websockets'], type: 'technology' },
  oauth: { aliases: ['oauth2', 'oauth2.0'], type: 'technology' },
  jwt: { aliases: ['json-web-token'], type: 'technology' },
  openai: { aliases: ['gpt', 'chatgpt', 'gpt-4'], type: 'technology' },
  anthropic: { aliases: ['claude', 'claude-ai'], type: 'technology' },
  langchain: { aliases: [], type: 'technology' },
  llamaindex: { aliases: ['llama-index'], type: 'technology' },
  huggingface: { aliases: ['hf', 'hugging-face'], type: 'technology' },
  tensorflow: { aliases: ['tf'], type: 'technology' },
  pytorch: { aliases: ['torch'], type: 'technology' },
  pino: { aliases: [], type: 'technology' },
  winston: { aliases: [], type: 'technology' },
  storybook: { aliases: [], type: 'technology' },
  nx: { aliases: [], type: 'technology' },
  turborepo: { aliases: [], type: 'technology' },
  lerna: { aliases: [], type: 'technology' },
  compromise: { aliases: ['compromise-nlp'], type: 'technology' },
  minisearch: { aliases: [], type: 'technology' },

  // Concepts
  microservices: { aliases: ['micro-services', 'microservice'], type: 'concept' },
  monolith: { aliases: ['monolithic'], type: 'concept' },
  serverless: { aliases: ['faas', 'lambda'], type: 'concept' },
  rest: { aliases: ['restful', 'rest-api'], type: 'concept' },
  api: { aliases: ['apis'], type: 'concept' },
  ci: { aliases: ['continuous-integration'], type: 'concept' },
  cd: { aliases: ['continuous-deployment', 'continuous-delivery'], type: 'concept' },
  tdd: { aliases: ['test-driven-development'], type: 'concept' },
  bdd: { aliases: ['behavior-driven-development'], type: 'concept' },
  ddd: { aliases: ['domain-driven-design'], type: 'concept' },
  cqrs: { aliases: ['command-query-responsibility-segregation'], type: 'concept' },
  mvc: { aliases: ['model-view-controller'], type: 'concept' },
  mvvm: { aliases: ['model-view-viewmodel'], type: 'concept' },
  oop: { aliases: ['object-oriented-programming', 'object-oriented'], type: 'concept' },
  fp: { aliases: ['functional-programming', 'functional'], type: 'concept' },
  ssr: { aliases: ['server-side-rendering'], type: 'concept' },
  ssg: { aliases: ['static-site-generation'], type: 'concept' },
  spa: { aliases: ['single-page-application', 'single-page-app'], type: 'concept' },
  pwa: { aliases: ['progressive-web-app'], type: 'concept' },
  mcp: { aliases: ['model-context-protocol'], type: 'concept' },
  rag: { aliases: ['retrieval-augmented-generation'], type: 'concept' },
  embedding: { aliases: ['embeddings', 'vector-embedding'], type: 'concept' },
  'vector-search': { aliases: ['semantic-search', 'vector-similarity'], type: 'concept' },
}
188
+
189
// Reverse lookup table: lower-cased canonical names and aliases → canonical name.
// Pairs are inserted in dictionary order, so when a spelling appears more than once
// the later entry is the one that sticks.
const ALIAS_MAP = new Map<string, string>(
  Object.entries(TECH_DICTIONARY).flatMap(([normalized, entry]) => {
    const pairs: Array<[string, string]> = [[normalized.toLowerCase(), normalized]]
    for (const alias of entry.aliases) {
      pairs.push([alias.toLowerCase(), normalized])
    }
    return pairs
  })
)

// Matches file paths with at least one directory segment and an extension,
// optionally prefixed by ./, ~/ or /; the path itself is capture group 1.
const FILE_PATH_REGEX = /(?:^|[\s(,])((?:\.\/|~\/|\/)?(?:[\w-]+\/)+[\w.-]+\.\w+)/g
// Matches http(s) URLs up to the next whitespace, closing parenthesis or comma.
const URL_REGEX = /https?:\/\/[^\s),]+/g
// Matches version numbers such as 1.2, v1.2.3 or 1.2.3-beta.1.
const VERSION_REGEX = /\b[vV]?\d+\.\d+(?:\.\d+)?(?:-[\w.]+)?\b/g
204
+
205
// Memoized default export of `compromise`; stays null until an import succeeds.
let nlpModule: any = null

/**
 * Lazily imports `compromise` and caches its default export.
 *
 * @returns the cached module export, or null when the import failed
 *          (in which case the next call attempts the import again).
 */
async function loadNlp(): Promise<any> {
  if (nlpModule) {
    return nlpModule
  }

  try {
    const imported = await import('compromise')
    nlpModule = imported.default
  } catch {
    nlpModule = null
  }

  return nlpModule
}
217
+
218
/**
 * Extracts entities from free text using a technology/concept dictionary, a few
 * regular-expression rules (file paths, URLs, ISO dates) and, once `initialize()`
 * has successfully loaded it, the `compromise` NLP library for people and
 * organizations.
 */
export class EntityExtractor {
  private nlpLoaded = false
  // `compromise` is imported dynamically and arrives untyped from loadNlp().
  private nlp: any = null

  /** Loads the optional NLP backend; extraction still works when loading fails. */
  async initialize(): Promise<void> {
    this.nlp = await loadNlp()
    this.nlpLoaded = this.nlp !== null
  }

  /**
   * Runs every extraction pass over `text`.
   *
   * @param text the text to analyse
   * @returns the de-duplicated entities, sorted by descending confidence
   */
  extract(text: string): ExtractedEntity[] {
    const entities = new Map<string, ExtractedEntity>()

    this.extractFromDictionary(text, entities)
    this.extractFilePaths(text, entities)
    this.extractUrls(text, entities)
    this.extractDates(text, entities)

    if (this.nlpLoaded && this.nlp) {
      this.extractWithNlp(text, entities)
    }

    return Array.from(entities.values()).sort((a, b) => b.confidence - a.confidence)
  }

  /** Applies `extract` to each text independently, preserving input order. */
  extractBatch(texts: string[]): ExtractedEntity[][] {
    return texts.map(text => this.extract(text))
  }

  /**
   * Dictionary pass: looks every token up in ALIAS_MAP, then scans for aliases
   * containing '-', '.' or ' ' that may be embedded inside a larger token.
   *
   * Each occurrence records its own offset; repeated occurrences raise the
   * confidence by 0.05 up to a maximum of 1.0.
   */
  private extractFromDictionary(text: string, entities: Map<string, ExtractedEntity>): void {
    const lowerText = text.toLowerCase()
    // Tokens are maximal runs of characters that are not separators.
    const tokenRegex = /[^\s,;:()[\]{}"'`|/\\]+/g

    for (const tokenMatch of lowerText.matchAll(tokenRegex)) {
      const token = tokenMatch[0]
      const withoutLeading = token.replace(/^[^a-z0-9]+/, '')
      const start = (tokenMatch.index ?? 0) + (token.length - withoutLeading.length)

      // Try the token with trailing '#'/'+' kept first so aliases such as "c#" and
      // "c++" are reachable, then fall back to the fully trimmed token.
      const candidates = [
        withoutLeading.replace(/[^a-z0-9#+]+$/, ''),
        withoutLeading.replace(/[^a-z0-9]+$/, '')
      ]

      for (const candidate of candidates) {
        if (candidate.length < 2) continue

        const normalized = ALIAS_MAP.get(candidate)
        if (!normalized) continue

        const dictEntry = TECH_DICTIONARY[normalized]
        if (!dictEntry) break

        const existing = entities.get(normalized)
        if (existing) {
          existing.positions.push(start)
          existing.confidence = Math.min(1.0, existing.confidence + 0.05)
        } else {
          entities.set(normalized, {
            name: candidate,
            normalizedName: normalized,
            type: dictEntry.type,
            confidence: 0.95,
            source: 'dictionary',
            positions: [start]
          })
        }
        break
      }
    }

    // Also check multi-word aliases (plain substring match, first occurrence only)
    for (const [alias, normalized] of ALIAS_MAP) {
      const isCompound = alias.includes('-') || alias.includes('.') || alias.includes(' ')
      if (!isCompound || entities.has(normalized)) continue

      const at = lowerText.indexOf(alias)
      if (at === -1) continue

      const dictEntry = TECH_DICTIONARY[normalized]
      if (!dictEntry) continue

      entities.set(normalized, {
        name: alias,
        normalizedName: normalized,
        type: dictEntry.type,
        confidence: 0.95,
        source: 'dictionary',
        positions: [at]
      })
    }
  }

  /** Rule pass: file paths matched by FILE_PATH_REGEX, keyed as `file:<path>`. */
  private extractFilePaths(text: string, entities: Map<string, ExtractedEntity>): void {
    // Work on a private copy so the shared regex's lastIndex is never touched.
    const regex = new RegExp(FILE_PATH_REGEX.source, FILE_PATH_REGEX.flags)

    for (const match of text.matchAll(regex)) {
      const filePath = match[1]?.trim()
      if (!filePath || filePath.length < 4) continue

      const key = `file:${filePath}`
      if (entities.has(key)) continue

      // The overall match may begin with a delimiter (space, '(' or ','); report
      // where the path itself starts.
      const start = (match.index ?? 0) + match[0].indexOf(filePath)
      entities.set(key, {
        name: filePath,
        normalizedName: filePath,
        type: 'file',
        confidence: 0.85,
        source: 'rule',
        positions: [start]
      })
    }
  }

  /**
   * Rule pass: http(s) URLs, keyed as `url:<url>` and tagged with type 'file'.
   * Trailing sentence punctuation is dropped so "https://x.dev/docs." and
   * "https://x.dev/docs" resolve to the same entity.
   */
  private extractUrls(text: string, entities: Map<string, ExtractedEntity>): void {
    const regex = new RegExp(URL_REGEX.source, URL_REGEX.flags)

    for (const match of text.matchAll(regex)) {
      const url = match[0].replace(/[.,;:!?'"]+$/, '')
      const key = `url:${url}`
      if (entities.has(key)) continue

      entities.set(key, {
        name: url,
        normalizedName: url,
        type: 'file',
        confidence: 0.9,
        source: 'rule',
        positions: [match.index ?? 0]
      })
    }
  }

  /** Rule pass: ISO-8601 dates with an optional time part, keyed as `date:<value>`. */
  private extractDates(text: string, entities: Map<string, ExtractedEntity>): void {
    // ISO date pattern
    const isoDateRegex = /\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?)?\b/g

    for (const match of text.matchAll(isoDateRegex)) {
      const dateStr = match[0]
      const key = `date:${dateStr}`
      if (entities.has(key)) continue

      entities.set(key, {
        name: dateStr,
        normalizedName: dateStr,
        type: 'date',
        confidence: 0.9,
        source: 'rule',
        positions: [match.index ?? 0]
      })
    }
  }

  /**
   * NLP pass: people (type 'person', confidence 0.7) and organizations
   * (type 'concept', confidence 0.65). Organizations that are already known
   * dictionary spellings are skipped.
   */
  private extractWithNlp(text: string, entities: Map<string, ExtractedEntity>): void {
    try {
      const doc = this.nlp(text)

      this.addNlpEntities(text, doc.people(), entities, {
        keyPrefix: 'person',
        type: 'person',
        confidence: 0.7,
        skipKnownAliases: false
      })
      this.addNlpEntities(text, doc.organizations(), entities, {
        keyPrefix: 'org',
        type: 'concept',
        confidence: 0.65,
        skipKnownAliases: true
      })
    } catch {
      // NLP extraction failure is non-critical
    }
  }

  /** Adds one entity per not-yet-seen name in an NLP result set. */
  private addNlpEntities(
    text: string,
    matches: any,
    entities: Map<string, ExtractedEntity>,
    options: { keyPrefix: string; type: EntityType; confidence: number; skipKnownAliases: boolean }
  ): void {
    if (!matches || !(matches.length > 0)) return

    for (const raw of matches.out('array') as string[]) {
      const cleaned = raw.trim()
      if (cleaned.length < 2) continue

      const lower = cleaned.toLowerCase()
      if (options.skipKnownAliases && ALIAS_MAP.has(lower)) continue

      const key = `${options.keyPrefix}:${lower}`
      if (entities.has(key)) continue

      entities.set(key, {
        name: cleaned,
        normalizedName: lower,
        type: options.type,
        confidence: options.confidence,
        source: 'nlp',
        positions: [text.indexOf(cleaned)]
      })
    }
  }

  /**
   * Maps a free-form name onto its canonical dictionary spelling.
   *
   * @returns the canonical name when the lower-cased, trimmed input is a known
   *          spelling; otherwise the lower-cased, trimmed input itself
   */
  static normalizeEntityName(name: string): string {
    const lower = name.toLowerCase().trim()
    return ALIAS_MAP.get(lower) ?? lower
  }
}