@memberjunction/query-gen 0.0.1 → 2.126.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/.turbo/turbo-build.log +4 -0
  2. package/CHANGELOG.md +34 -0
  3. package/COORDINATOR.md +768 -0
  4. package/IMPLEMENTATION_PLAN.md +1753 -0
  5. package/LLM_ENTITY_GROUPING_PLAN.md +977 -0
  6. package/README.md +675 -29
  7. package/dist/cli/commands/export.d.ts +15 -0
  8. package/dist/cli/commands/export.d.ts.map +1 -0
  9. package/dist/cli/commands/export.js +178 -0
  10. package/dist/cli/commands/export.js.map +1 -0
  11. package/dist/cli/commands/generate.d.ts +19 -0
  12. package/dist/cli/commands/generate.d.ts.map +1 -0
  13. package/dist/cli/commands/generate.js +282 -0
  14. package/dist/cli/commands/generate.js.map +1 -0
  15. package/dist/cli/commands/validate.d.ts +17 -0
  16. package/dist/cli/commands/validate.d.ts.map +1 -0
  17. package/dist/cli/commands/validate.js +193 -0
  18. package/dist/cli/commands/validate.js.map +1 -0
  19. package/dist/cli/config.d.ts +51 -0
  20. package/dist/cli/config.d.ts.map +1 -0
  21. package/dist/cli/config.js +142 -0
  22. package/dist/cli/config.js.map +1 -0
  23. package/dist/cli/index.d.ts +13 -0
  24. package/dist/cli/index.d.ts.map +1 -0
  25. package/dist/cli/index.js +57 -0
  26. package/dist/cli/index.js.map +1 -0
  27. package/dist/core/EntityGrouper.d.ts +74 -0
  28. package/dist/core/EntityGrouper.d.ts.map +1 -0
  29. package/dist/core/EntityGrouper.js +246 -0
  30. package/dist/core/EntityGrouper.js.map +1 -0
  31. package/dist/core/MetadataExporter.d.ts +59 -0
  32. package/dist/core/MetadataExporter.d.ts.map +1 -0
  33. package/dist/core/MetadataExporter.js +151 -0
  34. package/dist/core/MetadataExporter.js.map +1 -0
  35. package/dist/core/QueryDatabaseWriter.d.ts +50 -0
  36. package/dist/core/QueryDatabaseWriter.d.ts.map +1 -0
  37. package/dist/core/QueryDatabaseWriter.js +152 -0
  38. package/dist/core/QueryDatabaseWriter.js.map +1 -0
  39. package/dist/core/QueryFixer.d.ts +48 -0
  40. package/dist/core/QueryFixer.d.ts.map +1 -0
  41. package/dist/core/QueryFixer.js +115 -0
  42. package/dist/core/QueryFixer.js.map +1 -0
  43. package/dist/core/QueryRefiner.d.ts +94 -0
  44. package/dist/core/QueryRefiner.d.ts.map +1 -0
  45. package/dist/core/QueryRefiner.js +267 -0
  46. package/dist/core/QueryRefiner.js.map +1 -0
  47. package/dist/core/QueryTester.d.ts +70 -0
  48. package/dist/core/QueryTester.d.ts.map +1 -0
  49. package/dist/core/QueryTester.js +243 -0
  50. package/dist/core/QueryTester.js.map +1 -0
  51. package/dist/core/QueryWriter.d.ts +57 -0
  52. package/dist/core/QueryWriter.d.ts.map +1 -0
  53. package/dist/core/QueryWriter.js +184 -0
  54. package/dist/core/QueryWriter.js.map +1 -0
  55. package/dist/core/QuestionGenerator.d.ts +58 -0
  56. package/dist/core/QuestionGenerator.d.ts.map +1 -0
  57. package/dist/core/QuestionGenerator.js +145 -0
  58. package/dist/core/QuestionGenerator.js.map +1 -0
  59. package/dist/data/schema.d.ts +230 -0
  60. package/dist/data/schema.d.ts.map +1 -0
  61. package/dist/data/schema.js +6 -0
  62. package/dist/data/schema.js.map +1 -0
  63. package/dist/index.d.ts +28 -0
  64. package/dist/index.d.ts.map +1 -0
  65. package/dist/index.js +77 -0
  66. package/dist/index.js.map +1 -0
  67. package/dist/prompts/PromptNames.d.ts +32 -0
  68. package/dist/prompts/PromptNames.d.ts.map +1 -0
  69. package/dist/prompts/PromptNames.js +35 -0
  70. package/dist/prompts/PromptNames.js.map +1 -0
  71. package/dist/utils/category-builder.d.ts +28 -0
  72. package/dist/utils/category-builder.d.ts.map +1 -0
  73. package/dist/utils/category-builder.js +90 -0
  74. package/dist/utils/category-builder.js.map +1 -0
  75. package/dist/utils/entity-helpers.d.ts +49 -0
  76. package/dist/utils/entity-helpers.d.ts.map +1 -0
  77. package/dist/utils/entity-helpers.js +189 -0
  78. package/dist/utils/entity-helpers.js.map +1 -0
  79. package/dist/utils/error-handlers.d.ts +19 -0
  80. package/dist/utils/error-handlers.d.ts.map +1 -0
  81. package/dist/utils/error-handlers.js +41 -0
  82. package/dist/utils/error-handlers.js.map +1 -0
  83. package/dist/utils/graph-helpers.d.ts +51 -0
  84. package/dist/utils/graph-helpers.d.ts.map +1 -0
  85. package/dist/utils/graph-helpers.js +82 -0
  86. package/dist/utils/graph-helpers.js.map +1 -0
  87. package/dist/utils/prompt-helpers.d.ts +25 -0
  88. package/dist/utils/prompt-helpers.d.ts.map +1 -0
  89. package/dist/utils/prompt-helpers.js +66 -0
  90. package/dist/utils/prompt-helpers.js.map +1 -0
  91. package/dist/utils/query-helpers.d.ts +23 -0
  92. package/dist/utils/query-helpers.d.ts.map +1 -0
  93. package/dist/utils/query-helpers.js +34 -0
  94. package/dist/utils/query-helpers.js.map +1 -0
  95. package/dist/utils/user-helpers.d.ts +15 -0
  96. package/dist/utils/user-helpers.d.ts.map +1 -0
  97. package/dist/utils/user-helpers.js +32 -0
  98. package/dist/utils/user-helpers.js.map +1 -0
  99. package/dist/vectors/EmbeddingService.d.ts +58 -0
  100. package/dist/vectors/EmbeddingService.d.ts.map +1 -0
  101. package/dist/vectors/EmbeddingService.js +90 -0
  102. package/dist/vectors/EmbeddingService.js.map +1 -0
  103. package/dist/vectors/SimilaritySearch.d.ts +51 -0
  104. package/dist/vectors/SimilaritySearch.d.ts.map +1 -0
  105. package/dist/vectors/SimilaritySearch.js +85 -0
  106. package/dist/vectors/SimilaritySearch.js.map +1 -0
  107. package/docs/API.md +1040 -0
  108. package/docs/ARCHITECTURE.md +1120 -0
  109. package/examples/advanced-usage.ts +401 -0
  110. package/examples/basic-usage.ts +285 -0
  111. package/package.json +48 -6
  112. package/src/cli/commands/export.ts +173 -0
  113. package/src/cli/commands/generate.ts +330 -0
  114. package/src/cli/commands/validate.ts +185 -0
  115. package/src/cli/config.ts +203 -0
  116. package/src/cli/index.ts +63 -0
  117. package/src/core/EntityGrouper.ts +318 -0
  118. package/src/core/MetadataExporter.ts +148 -0
  119. package/src/core/QueryDatabaseWriter.ts +187 -0
  120. package/src/core/QueryFixer.ts +153 -0
  121. package/src/core/QueryRefiner.ts +382 -0
  122. package/src/core/QueryTester.ts +264 -0
  123. package/src/core/QueryWriter.ts +239 -0
  124. package/src/core/QuestionGenerator.ts +199 -0
  125. package/src/data/golden-queries.json +1371 -0
  126. package/src/data/schema.ts +252 -0
  127. package/src/index.ts +49 -0
  128. package/src/prompts/PromptNames.ts +36 -0
  129. package/src/utils/category-builder.ts +97 -0
  130. package/src/utils/entity-helpers.ts +203 -0
  131. package/src/utils/error-handlers.ts +41 -0
  132. package/src/utils/graph-helpers.ts +99 -0
  133. package/src/utils/prompt-helpers.ts +79 -0
  134. package/src/utils/query-helpers.ts +32 -0
  135. package/src/utils/user-helpers.ts +39 -0
  136. package/src/vectors/EmbeddingService.ts +109 -0
  137. package/src/vectors/SimilaritySearch.ts +108 -0
  138. package/tsconfig.json +39 -0
@@ -0,0 +1,977 @@
1
+ # LLM-Based Entity Grouping Implementation Plan
2
+
3
+ ## Overview
4
+
5
+ Replace the deterministic hub-and-spoke entity grouping algorithm with an LLM-based semantic approach that understands business context and generates meaningful entity combinations for query generation.
6
+
7
+ ## Key Design Decision: Single vs Multiple LLM Calls
8
+
9
+ ### Option A: Single LLM Call (RECOMMENDED)
10
+ **Generate all entity groups in one LLM invocation**
11
+
12
+ **Advantages:**
13
+ - **Cost Efficient**: One LLM call instead of N calls (for 57 entities: $0.01 vs potentially $0.50+)
14
+ - **Global Context**: LLM can see full schema and avoid duplicate/overlapping groups
15
+ - **Better Diversity**: Can balance groups across different business domains in single pass
16
+ - **Consistent Quality**: All groups generated with same context and reasoning
17
+ - **Simpler Implementation**: One prompt, one validation pass
18
+
19
+ **Disadvantages:**
20
+ - **Output Length**: Could hit output token limits for very large schemas (100+ entities)
21
+ - **All-or-Nothing**: If generation fails, must retry entire operation
22
+
23
+ **Recommendation**: Use single call with structured JSON output. For schemas with 50-100 entities, this is clearly optimal. Only switch to batching if you exceed ~150 entities or hit output token limits.
24
+
25
+ ### Option B: Multiple LLM Calls
26
+ **Generate groups iteratively or per-hub-entity**
27
+
28
+ **Advantages:**
29
+ - **Scales to Large Schemas**: Can handle 500+ entity schemas
30
+ - **Fault Tolerance**: Partial results if some calls fail
31
+
32
+ **Disadvantages:**
33
+ - **High Cost**: 50-100x more expensive for typical schemas
34
+ - **Context Loss**: Each call doesn't know what previous calls generated
35
+ - **Duplicate Groups**: More complex deduplication needed
36
+ - **Quality Issues**: Inconsistent reasoning across calls
37
+
38
+ ---
39
+
40
+ ## Phase 1: Core LLM Entity Grouping
41
+
42
+ ### 1.1 Update EntityGroup Interface
43
+
44
+ **File**: `/packages/QueryGen/src/types/schema.ts`
45
+
46
+ ```typescript
47
+ export interface EntityGroup {
48
+ entities: EntityInfo[];
49
+ relationships: EntityRelationshipInfo[];
50
+ primaryEntity: EntityInfo;
51
+ relationshipType: 'single' | 'parent-child' | 'many-to-many';
52
+
53
+ // NEW: LLM-generated semantic metadata
54
+ businessDomain: string; // "Sales Pipeline", "Inventory Management"
55
+ businessRationale: string; // Why this grouping makes business sense
56
+ expectedQuestionTypes: string[]; // ["trend_analysis", "aggregation", "comparison"]
57
+ }
58
+ ```
59
+
60
+ **Changes:**
61
+ - Add semantic fields to existing interface (additive for consumers that only read groups; any code that constructs an `EntityGroup` must populate the new fields)
62
+ - For single-entity groups, use sensible defaults (domain = entity name, etc.)
63
+
64
+ ### 1.2 Create LLM Entity Grouping Prompt
65
+
66
+ **File**: `/metadata/prompts/entity-grouping/entity-group-generator.prompt.md`
67
+
68
+ ```markdown
69
+ # Entity Group Generator
70
+
71
+ You are a database analyst helping to identify meaningful entity groupings for business intelligence query generation.
72
+
73
+ ## Task
74
+
75
+ Given a database schema, identify logical entity groups that make sense for business questions. Each group should represent a coherent business concept or process that users would naturally ask questions about.
76
+
77
+ ## Input Schema
78
+
79
+ **Schema Name**: {{ schemaName }}
80
+
81
+ **Entities** ({{ entities.length }} total):
82
+
83
+ {% for entity in entities %}
84
+ ### {{ entity.Name }}
85
+ - **Description**: {{ entity.Description || 'No description' }}
86
+ - **Schema**: {{ entity.SchemaName }}
87
+ - **Fields**: {{ entity.FieldCount }} fields
88
+ - **Related Entities**:
89
+ {% for rel in entity.RelatedEntities %}
90
+ - {{ rel.name }} ({{ rel.type }})
91
+ {% endfor %}
92
+ {% endfor %}
93
+
94
+ ## Relationship Graph
95
+
96
+ ```
97
+ {{ relationshipGraph }}
98
+ ```
99
+
100
+ ## Guidelines
101
+
102
+ 1. **Business Relevance**: Focus on entity combinations that support real business questions
103
+ - ✅ GOOD: "Customers + Orders + OrderDetails" (sales analysis)
104
+ - ❌ BAD: "SystemLogs + UserPreferences + EmailTemplates" (unrelated)
105
+
106
+ 2. **Relationship Types**:
107
+ - **Single Entity**: Standalone entities with rich data (all entities should get a single-entity group)
108
+ - **Parent-Child**: Natural hierarchies (Customer → Orders, Order → OrderDetails)
109
+ - **Many-to-Many**: Bridge tables connecting related concepts (Products ↔ Categories via ProductCategories)
110
+ - **Transactional Flow**: Process chains (Lead → Opportunity → Quote → Order); report these as "parent-child" in `relationshipType`
111
+
112
+ 3. **Size Constraints**:
113
+ - Minimum: {{ minGroupSize }} entities
114
+ - Maximum: {{ maxGroupSize }} entities
115
+ - Target total groups: {{ targetGroupCount }}
116
+
117
+ 4. **Connectivity**: All entities in a group must be connected by foreign key relationships
118
+
119
+ 5. **Coverage**: Prioritize covering all important entities at least once
120
+
121
+ 6. **Business Domains**: Common domains include:
122
+ - Sales & Revenue (customers, orders, payments)
123
+ - Inventory & Products (products, categories, suppliers, stock)
124
+ - Marketing (campaigns, leads, conversions)
125
+ - Operations (shipments, fulfillment, logistics)
126
+ - Finance (invoices, payments, accounts)
127
+ - Human Resources (employees, departments, roles)
128
+ - Customer Service (tickets, cases, support)
129
+
130
+ ## Output Format
131
+
132
+ Return a JSON object with a `groups` array of entity groups. Each group MUST include:
133
+
134
+ ```json
135
+ {
136
+ "groups": [
137
+ {
138
+ "entities": ["EntityName1", "EntityName2", "EntityName3"],
139
+ "primaryEntity": "EntityName1",
140
+ "businessDomain": "Sales Pipeline Analysis",
141
+ "businessRationale": "Tracks customer journey from lead to closed sale, essential for sales forecasting and conversion analysis",
142
+ "relationshipType": "parent-child",
143
+ "expectedQuestionTypes": ["trend_analysis", "funnel_analysis", "conversion_rates"]
144
+ }
145
+ ]
146
+ }
147
+ ```
148
+
149
+ **Important**:
150
+ - `entities`: Array of exact entity names from the schema (must match exactly)
151
+ - `primaryEntity`: The "hub" or most important entity in the group
152
+ - `businessDomain`: Clear business domain label (2-5 words)
153
+ - `businessRationale`: One sentence explaining why this grouping matters
154
+ - `relationshipType`: One of: "single", "parent-child", "many-to-many"
155
+ - `expectedQuestionTypes`: Array of question types this group supports
156
+
157
+ ## Example Output
158
+
159
+ ```json
160
+ {
161
+ "groups": [
162
+ {
163
+ "entities": ["Customers"],
164
+ "primaryEntity": "Customers",
165
+ "businessDomain": "Customer Master Data",
166
+ "businessRationale": "Core customer information and demographics for segmentation and analysis",
167
+ "relationshipType": "single",
168
+ "expectedQuestionTypes": ["segmentation", "demographics", "customer_profiling"]
169
+ },
170
+ {
171
+ "entities": ["Customers", "Orders"],
172
+ "primaryEntity": "Customers",
173
+ "businessDomain": "Customer Purchasing Behavior",
174
+ "businessRationale": "Links customers to their purchase history for lifetime value and repeat purchase analysis",
175
+ "relationshipType": "parent-child",
176
+ "expectedQuestionTypes": ["lifetime_value", "repeat_purchase", "customer_retention"]
177
+ },
178
+ {
179
+ "entities": ["Products", "Categories", "Suppliers"],
180
+ "primaryEntity": "Products",
181
+ "businessDomain": "Product Catalog Management",
182
+ "businessRationale": "Complete product information including categorization and sourcing for inventory and procurement decisions",
183
+ "relationshipType": "parent-child",
184
+ "expectedQuestionTypes": ["product_mix", "supplier_analysis", "category_performance"]
185
+ }
186
+ ]
187
+ }
188
+ ```
189
+
190
+ Generate entity groups now.
191
+ ```
192
+
193
+ **File**: `/metadata/prompts/entity-grouping/.entity-grouping-prompts.json`
194
+
195
+ ```json
196
+ {
197
+ "prompts": [
198
+ {
199
+ "type": "AIPromptEntity",
200
+ "fields": {
201
+ "Name": "Entity Group Generator",
202
+ "Description": "Analyzes database schema to generate semantically meaningful entity groupings for business query generation",
203
+ "Prompt": "@file:entity-group-generator.prompt.md",
204
+ "PromptRole": "User",
205
+ "CategoryID": "@lookup:AI Prompt Categories.Name=Query Generation",
206
+ "IsActive": true,
207
+ "JSONSchema": "@file:entity-group-generator.schema.json"
208
+ },
209
+ "children": [
210
+ {
211
+ "type": "AIPromptModelEntity",
212
+ "relation": "PromptID",
213
+ "records": [
214
+ { "fields": { "PromptID": "@parent:ID", "ModelID": "@lookup:AI Models.Name=Claude 4.5 Sonnet", "VendorID": "@lookup:MJ: AI Vendors.Name=Anthropic", "Priority": 6 } },
215
+ { "fields": { "PromptID": "@parent:ID", "ModelID": "@lookup:AI Models.Name=Kimi K2", "VendorID": "@lookup:MJ: AI Vendors.Name=Groq", "Priority": 5 } },
216
+ { "fields": { "PromptID": "@parent:ID", "ModelID": "@lookup:AI Models.Name=GPT-4.5o", "VendorID": "@lookup:MJ: AI Vendors.Name=OpenAI", "Priority": 4 } },
217
+ { "fields": { "PromptID": "@parent:ID", "ModelID": "@lookup:AI Models.Name=Gemini 2.0 Flash", "VendorID": "@lookup:MJ: AI Vendors.Name=Google", "Priority": 3 } },
218
+ { "fields": { "PromptID": "@parent:ID", "ModelID": "@lookup:AI Models.Name=Llama 3.3 70B Versatile", "VendorID": "@lookup:MJ: AI Vendors.Name=Groq", "Priority": 2 } },
219
+ { "fields": { "PromptID": "@parent:ID", "ModelID": "@lookup:AI Models.Name=Gemini 2.0 Flash Thinking", "VendorID": "@lookup:MJ: AI Vendors.Name=Google", "Priority": 1 } }
220
+ ]
221
+ }
222
+ ]
223
+ }
224
+ ]
225
+ }
226
+ ```
227
+
228
+ **File**: `/metadata/prompts/entity-grouping/entity-group-generator.schema.json`
229
+
230
+ ```json
231
+ {
232
+ "$schema": "http://json-schema.org/draft-07/schema#",
233
+ "type": "object",
234
+ "required": ["groups"],
235
+ "properties": {
236
+ "groups": {
237
+ "type": "array",
238
+ "items": {
239
+ "type": "object",
240
+ "required": ["entities", "primaryEntity", "businessDomain", "businessRationale", "relationshipType", "expectedQuestionTypes"],
241
+ "properties": {
242
+ "entities": {
243
+ "type": "array",
244
+ "items": { "type": "string" },
245
+ "minItems": 1,
246
+ "maxItems": 10,
247
+ "description": "Array of entity names in this group (must match schema exactly)"
248
+ },
249
+ "primaryEntity": {
250
+ "type": "string",
251
+ "description": "The hub or most important entity in the group"
252
+ },
253
+ "businessDomain": {
254
+ "type": "string",
255
+ "minLength": 3,
256
+ "maxLength": 100,
257
+ "description": "Clear business domain label"
258
+ },
259
+ "businessRationale": {
260
+ "type": "string",
261
+ "minLength": 10,
262
+ "maxLength": 500,
263
+ "description": "One sentence explaining business value"
264
+ },
265
+ "relationshipType": {
266
+ "type": "string",
267
+ "enum": ["single", "parent-child", "many-to-many"],
268
+ "description": "Type of relationships between entities"
269
+ },
270
+ "expectedQuestionTypes": {
271
+ "type": "array",
272
+ "items": { "type": "string" },
273
+ "minItems": 1,
274
+ "maxItems": 5,
275
+ "description": "Types of questions this group can answer"
276
+ }
277
+ }
278
+ }
279
+ }
280
+ }
281
+ }
282
+ ```
283
+
284
+ ### 1.3 Create Graph Visualization Helper
285
+
286
+ **File**: `/packages/QueryGen/src/utils/graph-helpers.ts`
287
+
288
+ ```typescript
289
+ import { EntityInfo } from '@memberjunction/core';
290
+
291
+ /**
292
+ * Generates a simple text-based relationship graph for LLM prompts
293
+ */
294
+ export function generateRelationshipGraph(entities: EntityInfo[]): string {
295
+ const lines: string[] = [];
296
+
297
+ for (const entity of entities) {
298
+ if (entity.RelatedEntities.length === 0) continue;
299
+
300
+ const relations = entity.RelatedEntities
301
+ .map(rel => `→ ${rel.RelatedEntity}`)
302
+ .join(', ');
303
+
304
+ lines.push(`${entity.Name}: ${relations}`);
305
+ }
306
+
307
+ return lines.join('\n');
308
+ }
309
+
310
+ /**
311
+ * Generates a Mermaid diagram (if needed for richer visualization)
312
+ */
313
+ export function generateMermaidDiagram(entities: EntityInfo[]): string {
314
+ const lines = ['graph LR'];
315
+ const processedPairs = new Set<string>();
316
+
317
+ for (const entity of entities) {
318
+ const safeEntityName = entity.Name.replace(/\s/g, '_');
319
+
320
+ for (const rel of entity.RelatedEntities) {
321
+ const safeRelatedName = rel.RelatedEntity.replace(/\s/g, '_');
322
+ const pairKey = [safeEntityName, safeRelatedName].sort().join('|');
323
+
324
+ if (!processedPairs.has(pairKey)) {
325
+ lines.push(` ${safeEntityName}[${entity.Name}] --> ${safeRelatedName}[${rel.RelatedEntity}]`);
326
+ processedPairs.add(pairKey);
327
+ }
328
+ }
329
+ }
330
+
331
+ return lines.join('\n');
332
+ }
333
+
334
+ /**
335
+ * Formats entity metadata for LLM prompt (concise version)
336
+ */
337
+ export interface EntityMetadataForPrompt {
338
+ Name: string;
339
+ Description: string;
340
+ SchemaName: string;
341
+ FieldCount: number;
342
+ RelatedEntities: Array<{ name: string; type: string }>;
343
+ }
344
+
345
+ export function formatEntitiesForPrompt(entities: EntityInfo[]): EntityMetadataForPrompt[] {
346
+ return entities.map(entity => ({
347
+ Name: entity.Name,
348
+ Description: entity.Description || 'No description available',
349
+ SchemaName: entity.SchemaName || 'dbo',
350
+ FieldCount: entity.Fields.length,
351
+ RelatedEntities: entity.RelatedEntities.map(rel => ({
352
+ name: rel.RelatedEntity,
353
+ type: rel.Type
354
+ }))
355
+ }));
356
+ }
357
+ ```
358
+
359
+ ### 1.4 Implement LLM-Based EntityGrouper Class
360
+
361
+ **File**: `/packages/QueryGen/src/core/EntityGrouper.ts` (REPLACE EXISTING)
362
+
363
+ ```typescript
364
+ import { EntityInfo, EntityRelationshipInfo, Metadata } from '@memberjunction/core';
365
+ import { AIEngine } from '@memberjunction/aiengine';
366
+ import { AIPromptRunner, AIPromptParams } from '@memberjunction/ai-prompts';
367
+ import { EntityGroup } from '../types/schema';
368
+ import { generateRelationshipGraph, formatEntitiesForPrompt } from '../utils/graph-helpers';
369
+ import { extractErrorMessage } from '../utils/error-helpers';
370
+
371
+ /**
372
+ * LLM response format from Entity Group Generator prompt
373
+ */
374
+ interface LLMEntityGroupResponse {
375
+ groups: Array<{
376
+ entities: string[];
377
+ primaryEntity: string;
378
+ businessDomain: string;
379
+ businessRationale: string;
380
+ relationshipType: 'single' | 'parent-child' | 'many-to-many';
381
+ expectedQuestionTypes: string[];
382
+ }>;
383
+ }
384
+
385
+ /**
386
+ * Generates entity groups using LLM-based semantic analysis
387
+ *
388
+ * This replaces the deterministic hub-and-spoke algorithm with an
389
+ * intelligent approach that understands business context and generates
390
+ * meaningful entity combinations.
391
+ */
392
+ export class EntityGrouper {
393
+ private readonly promptName = 'Entity Group Generator';
394
+
395
+ /**
396
+ * Generate semantically meaningful entity groups using LLM analysis
397
+ *
398
+ * @param entities - All entities to analyze
399
+ * @param minSize - Minimum entities per group (typically 1)
400
+ * @param maxSize - Maximum entities per group (typically 3-5)
401
+ * @param targetGroupCount - Desired number of groups (approximate)
402
+ * @returns Array of validated entity groups with business context
403
+ */
404
+ async generateEntityGroups(
405
+ entities: EntityInfo[],
406
+ minSize: number,
407
+ maxSize: number,
408
+ targetGroupCount: number = 75
409
+ ): Promise<EntityGroup[]> {
410
+ try {
411
+ // 1. Prepare schema data for LLM
412
+ const schemaData = this.prepareSchemaData(entities, minSize, maxSize, targetGroupCount);
413
+
414
+ // 2. Call LLM to generate groups
415
+ const llmResponse = await this.callLLMForGrouping(schemaData);
416
+
417
+ // 3. Validate and convert to EntityGroup objects
418
+ const validatedGroups = this.validateAndConvertGroups(llmResponse, entities);
419
+
420
+ // 4. Deduplicate groups that contain the same set of entities
421
+ const deduplicatedGroups = this.deduplicateGroups(validatedGroups);
422
+
423
+ return deduplicatedGroups;
424
+ } catch (error: unknown) {
425
+ throw new Error(extractErrorMessage(error, 'EntityGrouper.generateEntityGroups'));
426
+ }
427
+ }
428
+
429
+ /**
430
+ * Prepare schema data for LLM prompt
431
+ */
432
+ private prepareSchemaData(
433
+ entities: EntityInfo[],
434
+ minSize: number,
435
+ maxSize: number,
436
+ targetGroupCount: number
437
+ ): Record<string, unknown> {
438
+ const formattedEntities = formatEntitiesForPrompt(entities);
439
+ const relationshipGraph = generateRelationshipGraph(entities);
440
+
441
+ // Get schema name from first entity (assume single schema)
442
+ const schemaName = entities[0]?.SchemaName || 'Unknown';
443
+
444
+ return {
445
+ schemaName,
446
+ entities: formattedEntities,
447
+ relationshipGraph,
448
+ minGroupSize: minSize,
449
+ maxGroupSize: maxSize,
450
+ targetGroupCount
451
+ };
452
+ }
453
+
454
+ /**
455
+ * Call LLM via AIPromptRunner to generate entity groups
456
+ */
457
+ private async callLLMForGrouping(
458
+ schemaData: Record<string, unknown>
459
+ ): Promise<LLMEntityGroupResponse> {
460
+ const promptParams = new AIPromptParams();
461
+ promptParams.prompt = this.promptName;
462
+ promptParams.data = schemaData;
463
+ promptParams.requireValidJSON = true;
464
+
465
+ const runner = new AIPromptRunner();
466
+ const result = await runner.ExecutePrompt(promptParams);
467
+
468
+ if (!result.Success) {
469
+ throw new Error(`LLM grouping failed: ${result.ErrorMessage}`);
470
+ }
471
+
472
+ if (!result.OutputJSON) {
473
+ throw new Error('LLM did not return JSON output');
474
+ }
475
+
476
+ return result.OutputJSON as LLMEntityGroupResponse;
477
+ }
478
+
479
+ /**
480
+ * Validate LLM output and convert to EntityGroup objects
481
+ */
482
+ private validateAndConvertGroups(
483
+ llmResponse: LLMEntityGroupResponse,
484
+ entities: EntityInfo[]
485
+ ): EntityGroup[] {
486
+ const entityMap = new Map(entities.map(e => [e.Name, e]));
487
+ const validGroups: EntityGroup[] = [];
488
+
489
+ for (const llmGroup of llmResponse.groups) {
490
+ try {
491
+ // Validate all entity names exist
492
+ const groupEntities = llmGroup.entities
493
+ .map(name => entityMap.get(name))
494
+ .filter((e): e is EntityInfo => e !== undefined);
495
+
496
+ if (groupEntities.length !== llmGroup.entities.length) {
497
+ console.warn(`Skipping group "${llmGroup.businessDomain}": contains unknown entities`);
498
+ continue;
499
+ }
500
+
501
+ // Validate primary entity exists
502
+ const primaryEntity = entityMap.get(llmGroup.primaryEntity);
503
+ if (!primaryEntity) {
504
+ console.warn(`Skipping group "${llmGroup.businessDomain}": primary entity "${llmGroup.primaryEntity}" not found`);
505
+ continue;
506
+ }
507
+
508
+ // Build relationships array (collect all relationships between entities in the group)
509
+ const relationships = this.extractRelationships(groupEntities);
510
+
511
+ // Validate connectivity (every entity must be reachable from every other via in-group relationships)
512
+ if (groupEntities.length > 1 && !this.isConnected(groupEntities, relationships)) {
513
+ console.warn(`Skipping group "${llmGroup.businessDomain}": entities are not connected`);
514
+ continue;
515
+ }
516
+
517
+ // Create EntityGroup with LLM metadata
518
+ validGroups.push({
519
+ entities: groupEntities,
520
+ relationships,
521
+ primaryEntity,
522
+ relationshipType: llmGroup.relationshipType,
523
+ businessDomain: llmGroup.businessDomain,
524
+ businessRationale: llmGroup.businessRationale,
525
+ expectedQuestionTypes: llmGroup.expectedQuestionTypes
526
+ });
527
+ } catch (error: unknown) {
528
+ console.warn(`Skipping invalid group: ${extractErrorMessage(error, 'validateGroup')}`);
529
+ }
530
+ }
531
+
532
+ if (validGroups.length === 0) {
533
+ throw new Error('No valid entity groups generated by LLM');
534
+ }
535
+
536
+ return validGroups;
537
+ }
538
+
539
+ /**
540
+ * Extract relationships between entities in a group
541
+ */
542
+ private extractRelationships(entities: EntityInfo[]): EntityRelationshipInfo[] {
543
+ const entityNames = new Set(entities.map(e => e.Name));
544
+ const relationships: EntityRelationshipInfo[] = [];
545
+
546
+ for (const entity of entities) {
547
+ for (const rel of entity.RelatedEntities) {
548
+ if (entityNames.has(rel.RelatedEntity)) {
549
+ relationships.push(rel);
550
+ }
551
+ }
552
+ }
553
+
554
+ return relationships;
555
+ }
556
+
557
+ /**
558
+ * Check if all entities in a group are connected by relationships
559
+ */
560
+ private isConnected(entities: EntityInfo[], relationships: EntityRelationshipInfo[]): boolean {
561
+ if (entities.length <= 1) return true;
562
+
563
+ // Build adjacency map
564
+ const adjacency = new Map<string, Set<string>>();
565
+ for (const entity of entities) {
566
+ adjacency.set(entity.Name, new Set());
567
+ }
568
+
569
+ for (const rel of relationships) {
570
+ const entityName = entities.find(e =>
571
+ e.RelatedEntities.includes(rel)
572
+ )?.Name;
573
+
574
+ if (entityName) {
575
+ adjacency.get(entityName)?.add(rel.RelatedEntity);
576
+ adjacency.get(rel.RelatedEntity)?.add(entityName); // Bidirectional
577
+ }
578
+ }
579
+
580
+ // BFS from first entity
581
+ const visited = new Set<string>();
582
+ const queue = [entities[0].Name];
583
+ visited.add(entities[0].Name);
584
+
585
+ while (queue.length > 0) {
586
+ const current = queue.shift()!;
587
+ const neighbors = adjacency.get(current) || new Set();
588
+
589
+ for (const neighbor of neighbors) {
590
+ if (!visited.has(neighbor)) {
591
+ visited.add(neighbor);
592
+ queue.push(neighbor);
593
+ }
594
+ }
595
+ }
596
+
597
+ // All entities should be visited
598
+ return visited.size === entities.length;
599
+ }
600
+
601
+ /**
602
+ * Remove duplicate or highly similar groups
603
+ */
604
+ private deduplicateGroups(groups: EntityGroup[]): EntityGroup[] {
605
+ const seen = new Set<string>();
606
+ const unique: EntityGroup[] = [];
607
+
608
+ for (const group of groups) {
609
+ // Create normalized key (sorted entity names)
610
+ const key = group.entities.map(e => e.Name).sort().join('|');
611
+
612
+ if (!seen.has(key)) {
613
+ seen.add(key);
614
+ unique.push(group);
615
+ }
616
+ }
617
+
618
+ return unique;
619
+ }
620
+ }
621
+ ```
622
+
623
+ ### 1.5 Update Generate Command
624
+
625
+ **File**: `/packages/QueryGen/src/cli/commands/generate.ts`
626
+
627
+ Minimal changes needed - the EntityGrouper interface stays backward-compatible (one new optional parameter):
628
+
629
+ ```typescript
630
+ // Line ~68 - Add targetGroupCount parameter
631
+ const entityGroups = await grouper.generateEntityGroups(
632
+ allEntities,
633
+ config.minEntitiesPerGroup,
634
+ config.maxEntitiesPerGroup,
635
+ config.targetGroupCount || 75 // NEW: Add target count
636
+ );
637
+ ```
638
+
639
+ **File**: `/packages/QueryGen/src/cli/config.ts`
640
+
641
+ Add new configuration option:
642
+
643
+ ```typescript
644
+ export interface QueryGenConfig {
645
+ // ... existing fields ...
646
+ targetGroupCount: number; // NEW: Desired number of entity groups
647
+ }
648
+
649
+ export function loadConfig(options: Record<string, unknown>): QueryGenConfig {
650
+ return {
651
+ // ... existing config ...
652
+ targetGroupCount: (options.targetGroupCount as number) || 75
653
+ };
654
+ }
655
+ ```
656
+
657
+ ### 1.6 Update CLI Command Flags
658
+
659
+ **File**: `/packages/MJCLI/src/commands/querygen/generate.ts`
660
+
661
+ ```typescript
662
+ static flags = {
663
+ // ... existing flags ...
664
+ 'target-groups': Flags.integer({
665
+ char: 't',
666
+ description: 'Target number of entity groups to generate',
667
+ default: 75
668
+ }),
669
+ };
670
+
671
+ // In run():
672
+ const options: Record<string, unknown> = {
673
+ // ... existing options ...
674
+ targetGroupCount: flags['target-groups'],
675
+ };
676
+ ```
677
+
678
+ ---
679
+
680
+ ## Phase 2: Testing & Validation
681
+
682
+ ### 2.1 Test on AssociationDemo Schema
683
+
684
+ ```bash
685
+ # Test with verbose output to see group details
686
+ QUERYGEN_COUNT_ONLY=true mj querygen generate \
687
+ --max-entities 3 \
688
+ --target-groups 75 \
689
+ --verbose
690
+
691
+ # Expected output:
692
+ # Found 75 entity groups
693
+ # Single entities: 58
694
+ # Pairs (1-hop): ~10-15
695
+ # Triples (bridge/connected): ~2-5
696
+ # Sample groups:
697
+ # Single: Customers
698
+ # Pair: Customers + Orders
699
+ # Triple: Organizations + OrganizationContacts + Contacts
700
+ ```
701
+
702
+ ### 2.2 Validation Checklist
703
+
704
+ - [ ] All entity names in LLM output match schema exactly
705
+ - [ ] Primary entities exist in entity list
706
+ - [ ] All multi-entity groups are connected
707
+ - [ ] Business domains are meaningful and diverse
708
+ - [ ] No duplicate groups (same entity set)
709
+ - [ ] Group count is within reasonable range (50-150)
710
+ - [ ] Single-entity groups generated for all important entities
711
+ - [ ] LLM respects maxGroupSize constraint
712
+
713
+ ### 2.3 Error Handling
714
+
715
+ Key error scenarios to handle:
716
+ 1. **LLM returns invalid JSON**: Retry with explicit JSON mode
717
+ 2. **Entity names don't match**: Fuzzy matching with warning
718
+ 3. **Disconnected groups**: Skip with warning
719
+ 4. **Too few groups**: Lower threshold or ask LLM to generate more
720
+ 5. **Too many groups**: Take top N by business relevance score
721
+
722
+ ---
723
+
724
+ ## Phase 3: Integration with Question Generation
725
+
726
+ ### 3.1 Update Business Question Prompt
727
+
728
+ **File**: `/metadata/prompts/business-question-generator.template.md`
729
+
730
+ Add new fields to template data:
731
+
732
+ ```markdown
733
+ ## Entity Group Context
734
+
735
+ **Business Domain**: {{ businessDomain }}
736
+
737
+ **Business Rationale**: {{ businessRationale }}
738
+
739
+ **Expected Question Types**: {{ expectedQuestionTypes | join(", ") }}
740
+
741
+ ## Entities in Group
742
+ ...
743
+ ```
744
+
745
+ ### 3.2 Pass Metadata to Question Generator
746
+
747
+ **File**: `/packages/QueryGen/src/core/QuestionGenerator.ts`
748
+
749
+ Update `formatEntityGroupForPrompt()` to include new fields:
750
+
751
+ ```typescript
752
+ private formatEntityGroupForPrompt(group: EntityGroup): Record<string, unknown> {
753
+ return {
754
+ businessDomain: group.businessDomain,
755
+ businessRationale: group.businessRationale,
756
+ expectedQuestionTypes: group.expectedQuestionTypes,
757
+ entities: group.entities.map(e => ({
758
+ name: e.Name,
759
+ // ... rest of entity metadata
760
+ }))
761
+ };
762
+ }
763
+ ```
764
+
765
+ This provides additional context to help the question generator create more relevant, targeted questions.
766
+
767
+ ---
768
+
769
+ ## Phase 4: Monitoring & Refinement
770
+
771
+ ### 4.1 Add Telemetry
772
+
773
+ Track key metrics:
774
+ - LLM token usage for grouping call
775
+ - Number of groups generated vs target
776
+ - Number of groups filtered out (invalid)
777
+ - Distribution of group sizes (1, 2, 3+ entities)
778
+ - Business domain diversity
779
+
780
+ ### 4.2 Quality Metrics
781
+
782
+ Define success criteria:
783
+ - **Coverage**: % of entities included in at least one group
784
+ - **Relevance**: Human review of sample groups (1-10 rating)
785
+ - **Diversity**: Number of unique business domains
786
+ - **Connectivity**: % of multi-entity groups that are properly connected
787
+ - **Downstream Success**: % of groups that generate valid SQL queries
788
+
789
+ ### 4.3 Prompt Refinement
790
+
791
+ Iterate on prompt based on:
792
+ - Quality of business domains (too generic? too specific?)
793
+ - Relevance of entity combinations
794
+ - Balance of group sizes
795
+ - Coverage of important entities
796
+
797
+ ---
798
+
799
+ ## Future Enhancements
800
+
801
+ ### Enhancement 1: Graph Theory Pre-Clustering
802
+
803
+ **Goal**: Use deterministic graph algorithms to suggest entity communities, then let the LLM refine and label them.
804
+
805
+ **Algorithms to Explore:**
806
+
807
+ 1. **Community Detection** (Louvain Algorithm)
808
+ - Detects densely connected clusters of entities
809
+ - Use as "suggested groupings" input to LLM
810
+ - LLM validates and adds business context
811
+
812
+ 2. **Centrality Measures** (PageRank, Betweenness)
813
+ - Identify "hub" entities (high degree centrality)
814
+ - Prioritize these entities in grouping
815
+ - Filter out low-value peripheral entities
816
+
817
+ 3. **Minimum Spanning Tree**
818
+ - Find core skeleton of schema relationships
819
+ - Focus LLM attention on primary relationships
820
+ - Avoid redundant/weak connections
821
+
822
+ **Implementation Approach:**
823
+
824
+ ```typescript
825
+ class HybridEntityGrouper {
826
+ async generateEntityGroups(entities: EntityInfo[]): Promise<EntityGroup[]> {
827
+ // 1. Run graph algorithms
828
+ const communities = this.detectCommunities(entities);
829
+ const centralEntities = this.computeCentrality(entities);
830
+
831
+ // 2. Generate "suggested groups" from graph analysis
832
+ const suggestedGroups = this.createSuggestedGroups(communities, centralEntities);
833
+
834
+ // 3. Pass suggestions to LLM for validation + labeling
835
+ const promptData = {
836
+ entities: formatEntitiesForPrompt(entities),
837
+ suggestedGroups: suggestedGroups,
838
+ instructions: "Review these algorithmically-generated groups. Validate, merge, split, or add business context as needed."
839
+ };
840
+
841
+ // 4. LLM refines and adds semantic labels
842
+ const refinedGroups = await this.callLLMWithSuggestions(promptData);
843
+
844
+ return refinedGroups;
845
+ }
846
+ }
847
+ ```
848
+
849
+ **Benefits:**
850
+ - Reduces LLM workload (validates vs generates from scratch)
851
+ - Leverages mathematical rigor for connectivity
852
+ - LLM focuses on semantic labeling and edge cases
853
+ - Potentially better quality for very large schemas (200+ entities)
854
+
855
+ **Dependencies:**
856
+ - Graph library (e.g., `graphology`, `jsnx`)
857
+ - Community detection implementation (Louvain)
858
+ - Performance testing on large schemas
859
+
860
+ ### Enhancement 2: Semantic Similarity Clustering
861
+
862
+ **Goal**: Embed entity descriptions and cluster by semantic similarity BEFORE considering relationships.
863
+
864
+ **Approach:**
865
+ ```typescript
866
+ // 1. Embed all entity descriptions
867
+ const embeddings = await Promise.all(
868
+ entities.map(e => AIEngine.Instance.EmbedTextLocal(e.Description))
869
+ );
870
+
871
+ // 2. Cluster by cosine similarity (k-means or hierarchical)
872
+ const semanticClusters = kMeansClustering(embeddings, 10); // k = 10 clusters
873
+
874
+ // 3. Within each cluster, apply relationship constraints
875
+ const groups = semanticClusters.flatMap(cluster =>
876
+ this.generateGroupsWithinCluster(cluster, relationships)
877
+ );
878
+ ```
879
+
880
+ **Example**: "Customer", "Lead", "Contact" would cluster together semantically even if not directly related via FK, enabling cross-entity analysis.
881
+
882
+ ### Enhancement 3: User Feedback Loop
883
+
884
+ **Goal**: Learn from query success/failure to improve grouping over time.
885
+
886
+ **Approach:**
887
+ - Track which entity groups produce valid SQL queries
888
+ - Track which groups generate queries users actually run
889
+ - Use this feedback to adjust group generation (upweight successful patterns)
890
+ - Could fine-tune a small LLM on "good group examples" from production
891
+
892
+ ### Enhancement 4: Schema-Specific Presets
893
+
894
+ **Goal**: Maintain curated entity grouping templates for common schema patterns.
895
+
896
+ **Examples:**
897
+ - **E-commerce**: Customers, Orders, Products, Categories, Reviews, Payments
898
+ - **CRM**: Accounts, Contacts, Opportunities, Activities, Cases
899
+ - **ERP**: Inventory, Suppliers, PurchaseOrders, Warehouses, Shipments
900
+
901
+ **Implementation:**
902
+ - Detect schema type via entity name matching
903
+ - Provide preset as "example groups" to LLM
904
+ - LLM adapts preset to actual schema
905
+
906
+ ---
907
+
908
+ ## Success Criteria
909
+
910
+ **Phase 1 (Core LLM Grouping) Complete When:**
911
+ - [ ] LLM generates 50-150 groups for typical schema (50-100 entities)
912
+ - [ ] All generated groups pass connectivity validation
913
+ - [ ] Business domains are diverse and meaningful (manual review)
914
+ - [ ] Group generation completes in <30 seconds for typical schema
915
+ - [ ] Token cost is <$0.05 per schema analysis
916
+ - [ ] Integration with question generation works seamlessly
917
+
918
+ **Future Enhancements Complete When:**
919
+ - [ ] Hybrid approach reduces LLM token usage by 50%
920
+ - [ ] Graph algorithms improve coverage of important entities
921
+ - [ ] Semantic clustering identifies cross-domain query opportunities
922
+ - [ ] Telemetry shows improved query success rates
923
+
924
+ ---
925
+
926
+ ## Rollout Plan
927
+
928
+ 1. **Week 1**: Implement Phase 1 (Core LLM Grouping)
929
+ - Create prompt and metadata
930
+ - Implement EntityGrouper with LLM
931
+ - Add validation logic
932
+
933
+ 2. **Week 2**: Testing & Refinement
934
+ - Test on AssociationDemo schema
935
+ - Test on production schemas (Customers, Orders, etc.)
936
+ - Refine prompt based on quality metrics
937
+
938
+ 3. **Week 3**: Integration
939
+ - Update question generator to use new metadata
940
+ - End-to-end testing (groups → questions → SQL)
941
+ - Performance optimization
942
+
943
+ 4. **Week 4+**: Future Enhancements
944
+ - Implement graph theory pre-clustering (if needed)
945
+ - Add telemetry and monitoring
946
+ - Iterate based on production usage
947
+
948
+ ---
949
+
950
+ ## Open Questions
951
+
952
+ 1. **Prompt Engineering**: Should we include example business questions in the entity grouping prompt to guide the LLM?
953
+ - **Leaning YES**: Helps LLM understand what "meaningful for business questions" means
954
+
955
+ 2. **Retry Strategy**: If the LLM generates poor groups, should we retry with an adjusted prompt or fall back to deterministic grouping?
956
+ - **Recommend**: Retry once with explicit "generate more diverse groups" instruction, then fall back
957
+
958
+ 3. **Group Size Distribution**: Should we guide LLM on distribution (e.g., "50% single, 30% pairs, 20% triples")?
959
+ - **Recommend**: Let LLM decide naturally based on schema, but monitor distribution
960
+
961
+ 4. **Cost Controls**: Should we add a max token budget and truncate schema if exceeded?
962
+ - **Recommend**: Yes, add warning if schema exceeds 100 entities, offer to focus on subset
963
+
964
+ 5. **Human Review**: Should we add a CLI flag to output groups for manual review before proceeding?
965
+ - **Recommend**: Yes, add `--review-groups` flag that pauses after grouping for inspection
966
+
967
+ ---
968
+
969
+ ## Conclusion
970
+
971
+ The LLM-based approach solves the combinatorial explosion problem while adding semantic understanding that deterministic algorithms cannot provide. The single-call design is optimal for typical schemas (50-100 entities) and can be extended with graph theory for larger schemas if needed.
972
+
973
+ **Estimated Implementation Time**: 1-2 weeks for Phase 1, 1-2 weeks for refinement and testing.
974
+
975
+ **Estimated Cost Per Run**: $0.01-0.03 per schema analysis (one-time cost during setup).
976
+
977
+ **Expected Quality Improvement**: Significant - groups will be business-relevant rather than structurally arbitrary.