@deepagents/text2sql 0.25.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +1 -1
  2. package/dist/index.d.ts +0 -1
  3. package/dist/index.d.ts.map +1 -1
  4. package/dist/index.js +212 -285
  5. package/dist/index.js.map +4 -4
  6. package/dist/lib/adapters/adapter.d.ts +6 -0
  7. package/dist/lib/adapters/adapter.d.ts.map +1 -1
  8. package/dist/lib/adapters/bigquery/index.js +18 -1
  9. package/dist/lib/adapters/bigquery/index.js.map +2 -2
  10. package/dist/lib/adapters/groundings/abstract.grounding.d.ts +2 -1
  11. package/dist/lib/adapters/groundings/abstract.grounding.d.ts.map +1 -1
  12. package/dist/lib/adapters/groundings/index.js.map +2 -2
  13. package/dist/lib/adapters/mysql/index.js +18 -1
  14. package/dist/lib/adapters/mysql/index.js.map +2 -2
  15. package/dist/lib/adapters/postgres/index.js +18 -1
  16. package/dist/lib/adapters/postgres/index.js.map +2 -2
  17. package/dist/lib/adapters/runtime-scope.d.ts +14 -0
  18. package/dist/lib/adapters/runtime-scope.d.ts.map +1 -0
  19. package/dist/lib/adapters/spreadsheet/index.js +18 -1
  20. package/dist/lib/adapters/spreadsheet/index.js.map +2 -2
  21. package/dist/lib/adapters/sqlite/index.js +18 -1
  22. package/dist/lib/adapters/sqlite/index.js.map +2 -2
  23. package/dist/lib/adapters/sqlserver/index.js +18 -1
  24. package/dist/lib/adapters/sqlserver/index.js.map +2 -2
  25. package/dist/lib/agents/exceptions.d.ts +22 -0
  26. package/dist/lib/agents/exceptions.d.ts.map +1 -1
  27. package/dist/lib/agents/result-tools.d.ts.map +1 -1
  28. package/dist/lib/fragments/schema.d.ts +2 -1
  29. package/dist/lib/fragments/schema.d.ts.map +1 -1
  30. package/dist/lib/instructions.d.ts +1 -9
  31. package/dist/lib/instructions.d.ts.map +1 -1
  32. package/dist/lib/sql.d.ts +0 -3
  33. package/dist/lib/sql.d.ts.map +1 -1
  34. package/dist/lib/synthesis/index.js +428 -621
  35. package/dist/lib/synthesis/index.js.map +4 -4
  36. package/dist/lib/synthesis/synthesizers/index.d.ts +1 -2
  37. package/dist/lib/synthesis/synthesizers/index.d.ts.map +1 -1
  38. package/package.json +7 -6
  39. package/dist/lib/agents/developer.agent.d.ts +0 -41
  40. package/dist/lib/agents/developer.agent.d.ts.map +0 -1
  41. package/dist/lib/agents/teachables.agent.d.ts +0 -10
  42. package/dist/lib/agents/teachables.agent.d.ts.map +0 -1
  43. package/dist/lib/synthesis/synthesizers/teachings-generator.d.ts +0 -20
  44. package/dist/lib/synthesis/synthesizers/teachings-generator.d.ts.map +0 -1
@@ -711,38 +711,10 @@ var LastQueryExtractor = class extends BaseContextualExtractor {
711
711
  }
712
712
  };
713
713
 
714
- // packages/text2sql/src/lib/synthesis/synthesizers/schema-synthesizer.ts
715
- import pLimit from "p-limit";
716
-
717
- // packages/text2sql/src/lib/agents/exceptions.ts
718
- var sqlValidationMarker = Symbol("SQLValidationError");
719
- var unanswerableSqlMarker = Symbol("UnanswerableSQLError");
720
- var SQLValidationError = class _SQLValidationError extends Error {
721
- [sqlValidationMarker];
722
- constructor(message) {
723
- super(message);
724
- this.name = "SQLValidationError";
725
- this[sqlValidationMarker] = true;
726
- }
727
- static isInstance(error) {
728
- return error instanceof _SQLValidationError && error[sqlValidationMarker] === true;
729
- }
730
- };
731
- var UnanswerableSQLError = class _UnanswerableSQLError extends Error {
732
- [unanswerableSqlMarker];
733
- constructor(message) {
734
- super(message);
735
- this.name = "UnanswerableSQLError";
736
- this[unanswerableSqlMarker] = true;
737
- }
738
- static isInstance(error) {
739
- return error instanceof _UnanswerableSQLError && error[unanswerableSqlMarker] === true;
740
- }
741
- };
742
-
743
- // packages/text2sql/src/lib/agents/question.agent.ts
714
+ // packages/text2sql/src/lib/synthesis/synthesizers/breadth-evolver.ts
744
715
  import { groq as groq4 } from "@ai-sdk/groq";
745
716
  import dedent4 from "dedent";
717
+ import pLimit from "p-limit";
746
718
  import z4 from "zod";
747
719
  import "@deepagents/agent";
748
720
  import {
@@ -750,107 +722,223 @@ import {
750
722
  InMemoryContextStore as InMemoryContextStore4,
751
723
  fragment as fragment4,
752
724
  guardrail,
753
- persona as persona4,
725
+ persona as personaFragment,
754
726
  structuredOutput as structuredOutput4,
755
727
  user as user4
756
728
  } from "@deepagents/context";
757
- var complexityInstructions = {
758
- simple: dedent4`
759
- Generate simple questions that require:
760
- - Basic SELECT with single table
761
- - Simple WHERE clauses with one condition
762
- - COUNT(*) or basic aggregations
763
- - No joins required
764
- Examples: "How many customers do we have?", "List all products", "What is the total revenue?"
765
- `,
766
- moderate: dedent4`
767
- Generate moderate questions that require:
768
- - JOINs between 2-3 tables
769
- - Multiple WHERE conditions (AND/OR)
770
- - GROUP BY with HAVING clauses
771
- - ORDER BY with LIMIT
772
- - Basic subqueries
773
- Examples: "What are the top 5 customers by total orders?", "Which products have never been ordered?"
774
- `,
775
- complex: dedent4`
776
- Generate complex questions that require:
777
- - Multiple JOINs (3+ tables)
778
- - Nested subqueries or CTEs
779
- - Complex aggregations with multiple GROUP BY columns
780
- - CASE expressions
781
- - Date/time calculations
782
- Examples: "What is the month-over-month growth rate?", "Which customers have increased spending compared to last year?"
783
- `,
784
- "high complex": dedent4`
785
- Generate highly complex questions that require advanced SQL features:
786
- - Window functions (ROW_NUMBER, RANK, DENSE_RANK)
787
- - LAG, LEAD for comparisons
788
- - Running totals (SUM OVER)
789
- - Moving averages
790
- - PARTITION BY clauses
791
- - Complex CTEs with multiple levels
792
- Examples: "What is the running total of sales per month?", "Rank customers by their purchase frequency within each region"
793
- `
729
+
730
+ // packages/text2sql/src/lib/synthesis/synthesizers/styles.ts
731
+ var ALL_STYLES = [
732
+ "formal",
733
+ // Professional business language
734
+ "colloquial",
735
+ // Casual everyday speech
736
+ "imperative",
737
+ // Commands: "Show me...", "Get..."
738
+ "interrogative",
739
+ // Questions: "What is...", "How many..."
740
+ "descriptive",
741
+ // Verbose, detailed
742
+ "concise",
743
+ // Brief, minimal
744
+ "vague",
745
+ // Ambiguous, hedging
746
+ "metaphorical",
747
+ // Figurative language
748
+ "conversational"
749
+ // Chat-like
750
+ ];
751
+ var styleInstructions = {
752
+ formal: "Use professional business language, complete sentences, no slang",
753
+ colloquial: "Use casual everyday speech, contractions, informal tone",
754
+ imperative: 'Phrase as commands: "Show me...", "Get...", "List..."',
755
+ interrogative: 'Phrase as questions: "What is...", "How many...", "Which..."',
756
+ descriptive: "Use detailed, verbose phrasing with extra context",
757
+ concise: "Use minimal words, telegram-style brevity",
758
+ vague: "Be intentionally ambiguous, use hedging language",
759
+ metaphorical: "Use figurative language, analogies, creative phrasing",
760
+ conversational: "Chat-like tone, as if talking to a colleague"
794
761
  };
795
- var outputSchema2 = z4.object({
796
- questions: z4.array(z4.string().describe("A natural language question about the data")).min(1).describe("List of natural language questions a user might ask")
762
+
763
+ // packages/text2sql/src/lib/synthesis/synthesizers/breadth-evolver.ts
764
+ var paraphraserOutputSchema = z4.object({
765
+ paraphrases: z4.array(
766
+ z4.string().describe("A paraphrased version of the original question")
767
+ ).min(1).describe("List of paraphrased questions that would produce the same SQL")
797
768
  });
798
- async function generateQuestions(params) {
799
- const { introspection, complexity, count, prompt, model } = params;
769
+ async function paraphraseQuestion(params) {
800
770
  const context = new ContextEngine4({
801
771
  store: new InMemoryContextStore4(),
802
- chatId: `question-gen-${crypto.randomUUID()}`,
772
+ chatId: `paraphraser-${crypto.randomUUID()}`,
803
773
  userId: "system"
804
774
  });
775
+ const personaInstruction = params.persona ? dedent4`
776
+ <persona role="${params.persona.role}">
777
+ ${params.persona.perspective}
778
+
779
+ Paraphrase the question as this persona would naturally ask it.
780
+ Use their vocabulary, priorities, and framing style.
781
+ </persona>
782
+ ` : "";
783
+ const styleInstruction = params.persona?.styles && params.persona.styles.length > 0 ? dedent4`
784
+ <communication_styles>
785
+ Generate paraphrases using these communication styles: ${params.persona.styles.join(", ")}
786
+
787
+ Style definitions:
788
+ ${params.persona.styles.map((s) => `- ${s}: ${styleInstructions[s]}`).join("\n")}
789
+
790
+ Distribute paraphrases across these styles for variety.
791
+ </communication_styles>
792
+ ` : "";
805
793
  context.set(
806
- persona4({
807
- name: "question_generator",
808
- role: "You are a synthetic data generator specializing in creating realistic natural language questions that users might ask about a database.",
809
- objective: "Generate diverse, realistic natural language questions that match the specified complexity level"
794
+ personaFragment({
795
+ name: "question_paraphraser",
796
+ role: "You are a linguistic expert specializing in paraphrasing database questions. Your task is to generate alternative phrasings of questions that preserve the exact same semantic meaning - they must all produce the identical SQL query.",
797
+ objective: "Generate paraphrased versions of questions that preserve exact semantic meaning and produce identical SQL"
810
798
  }),
811
- fragment4("database_schema", introspection || ""),
799
+ fragment4("original_question", params.question),
812
800
  fragment4(
813
- "complexity",
814
- { level: complexity },
815
- complexityInstructions[complexity]
801
+ "reference_sql",
802
+ params.sql,
803
+ "This SQL shows what the question is really asking - all paraphrases must ask for exactly this"
816
804
  ),
805
+ ...personaInstruction ? [fragment4("persona", personaInstruction)] : [],
806
+ ...styleInstruction ? [fragment4("communication_styles", styleInstruction)] : [],
817
807
  fragment4(
818
808
  "task",
819
809
  dedent4`
820
- Generate exactly ${count} natural language questions at the "${complexity}" complexity level.
821
- The questions should:
822
- 1. Match the complexity requirements above
823
- 2. Use natural business language, not technical SQL terms
824
- 3. Be realistic questions a non-technical user would actually ask
825
- 4. Cover different tables and relationships when possible
810
+ Generate exactly ${params.count} paraphrased versions of the original question.
811
+
812
+ Requirements:
813
+ 1. Each paraphrase must be semantically equivalent - it should produce the EXACT same SQL
814
+ 2. Vary the sentence structure, word choice, and phrasing style
815
+ 3. Use natural language without SQL keywords (SELECT, WHERE, JOIN, etc.)
816
+ 4. Keep paraphrases realistic - how actual users would ask
817
+ 5. Do not add or remove any conditions, filters, or requirements from the original
818
+ ${params.persona?.styles?.length ? "6. Apply the specified communication styles to create diverse phrasings" : ""}
826
819
  `
827
820
  ),
821
+ guardrail({ rule: "NEVER change what data is being requested" }),
828
822
  guardrail({
829
- rule: "Questions MUST ONLY reference tables and columns that exist in the schema above"
830
- }),
831
- guardrail({
832
- rule: "Before generating each question, verify that ALL entities (tables, columns, relationships) you reference are explicitly listed in the schema"
833
- }),
834
- guardrail({
835
- rule: "DO NOT invent or assume tables/columns that are not explicitly shown in the schema"
823
+ rule: "NEVER add filters, aggregations, or conditions not in the original"
836
824
  }),
837
825
  guardrail({
838
- rule: "Use natural language without SQL keywords like SELECT, WHERE, etc."
826
+ rule: "NEVER remove any specificity from the original question"
839
827
  }),
840
828
  guardrail({
841
- rule: "All questions must match the specified complexity level"
829
+ rule: "All paraphrases must be answerable by the exact same SQL query"
842
830
  }),
843
831
  user4(
844
- prompt ?? `Generate ${count} questions at ${complexity} complexity given db schema.`
832
+ `Paraphrase this question ${params.count} times: "${params.question}"`
845
833
  )
846
834
  );
847
- const questionOutput = structuredOutput4({
848
- model: model ?? groq4("openai/gpt-oss-20b"),
835
+ const paraphraserOutput = structuredOutput4({
836
+ model: params.model ?? groq4("openai/gpt-oss-20b"),
849
837
  context,
850
- schema: outputSchema2
838
+ schema: paraphraserOutputSchema
851
839
  });
852
- return questionOutput.generate();
840
+ return paraphraserOutput.generate();
853
841
  }
842
+ var BreadthEvolver = class extends PairProducer {
843
+ /**
844
+ * @param source - Source pairs or producer to evolve
845
+ * @param options - Evolution options including count, persona, and concurrency
846
+ */
847
+ constructor(source, options) {
848
+ super();
849
+ this.source = source;
850
+ this.options = options;
851
+ this.#limit = pLimit(this.options.concurrency ?? 4);
852
+ }
853
+ #limit;
854
+ /**
855
+ * Batch pairs within each chunk for concurrent processing.
856
+ * Uses pLimit for concurrency control, yields results per pair after chunk completes.
857
+ */
858
+ async *produce() {
859
+ for await (const chunk of this.from(this.source)) {
860
+ const tasks = chunk.map(
861
+ (pair) => this.#limit(async () => {
862
+ const result = await paraphraseQuestion({
863
+ question: pair.question,
864
+ sql: pair.sql,
865
+ count: this.options.count,
866
+ persona: this.options.persona,
867
+ model: this.options.model
868
+ });
869
+ return result.paraphrases.map((paraphrase) => ({
870
+ question: paraphrase,
871
+ sql: pair.sql,
872
+ context: pair.context,
873
+ success: pair.success
874
+ }));
875
+ })
876
+ );
877
+ const results = await Promise.all(tasks);
878
+ yield results.flat();
879
+ }
880
+ }
881
+ };
882
+
883
+ // packages/text2sql/src/lib/synthesis/synthesizers/depth-evolver.ts
884
+ import { groq as groq6 } from "@ai-sdk/groq";
885
+ import { NoObjectGeneratedError as NoObjectGeneratedError2, NoOutputGeneratedError as NoOutputGeneratedError2 } from "ai";
886
+ import dedent6 from "dedent";
887
+ import pLimit2 from "p-limit";
888
+ import pRetry2 from "p-retry";
889
+ import z6 from "zod";
890
+ import "@deepagents/agent";
891
+ import {
892
+ ContextEngine as ContextEngine6,
893
+ InMemoryContextStore as InMemoryContextStore6,
894
+ fragment as fragment6,
895
+ guardrail as guardrail3,
896
+ persona as persona5,
897
+ structuredOutput as structuredOutput6,
898
+ user as user6
899
+ } from "@deepagents/context";
900
+
901
+ // packages/text2sql/src/lib/agents/exceptions.ts
902
+ var sqlValidationMarker = Symbol("SQLValidationError");
903
+ var unanswerableSqlMarker = Symbol("UnanswerableSQLError");
904
+ var sqlScopeMarker = Symbol("SQLScopeError");
905
+ var SQLValidationError = class _SQLValidationError extends Error {
906
+ [sqlValidationMarker];
907
+ constructor(message) {
908
+ super(message);
909
+ this.name = "SQLValidationError";
910
+ this[sqlValidationMarker] = true;
911
+ }
912
+ static isInstance(error) {
913
+ return error instanceof _SQLValidationError && error[sqlValidationMarker] === true;
914
+ }
915
+ };
916
+ var UnanswerableSQLError = class _UnanswerableSQLError extends Error {
917
+ [unanswerableSqlMarker];
918
+ constructor(message) {
919
+ super(message);
920
+ this.name = "UnanswerableSQLError";
921
+ this[unanswerableSqlMarker] = true;
922
+ }
923
+ static isInstance(error) {
924
+ return error instanceof _UnanswerableSQLError && error[unanswerableSqlMarker] === true;
925
+ }
926
+ };
927
+ var SQLScopeError = class _SQLScopeError extends Error {
928
+ [sqlScopeMarker];
929
+ payload;
930
+ errorType;
931
+ constructor(payload) {
932
+ super(JSON.stringify(payload));
933
+ this.name = "SQLScopeError";
934
+ this.payload = payload;
935
+ this.errorType = payload.error_type;
936
+ this[sqlScopeMarker] = true;
937
+ }
938
+ static isInstance(error) {
939
+ return error instanceof _SQLScopeError && error[sqlScopeMarker] === true;
940
+ }
941
+ };
854
942
 
855
943
  // packages/text2sql/src/lib/agents/sql.agent.ts
856
944
  import { groq as groq5 } from "@ai-sdk/groq";
@@ -875,7 +963,7 @@ import {
875
963
  fragment as fragment5,
876
964
  guardrail as guardrail2,
877
965
  hint,
878
- persona as persona5,
966
+ persona as persona4,
879
967
  policy,
880
968
  structuredOutput as structuredOutput5,
881
969
  user as user5,
@@ -1099,7 +1187,7 @@ async function toSql(options) {
1099
1187
  userId: "system"
1100
1188
  });
1101
1189
  context.set(
1102
- persona5({
1190
+ persona4({
1103
1191
  name: "Freya",
1104
1192
  role: SQL_AGENT_ROLE,
1105
1193
  objective: SQL_AGENT_OBJECTIVE
@@ -1265,287 +1353,9 @@ async function withRetry(computation, options = { retries: 3 }) {
1265
1353
  );
1266
1354
  }
1267
1355
 
1268
- // packages/text2sql/src/lib/synthesis/synthesizers/schema-synthesizer.ts
1269
- var SchemaSynthesizer = class extends PairProducer {
1270
- /**
1271
- * @param adapter - Database adapter for schema introspection and SQL validation
1272
- * @param options - Synthesis configuration including count, complexity, and concurrency
1273
- */
1274
- constructor(adapter, options) {
1275
- super();
1276
- this.adapter = adapter;
1277
- this.options = options;
1278
- this.#complexities = Array.isArray(this.options.complexity) ? this.options.complexity : [this.options.complexity ?? "moderate"];
1279
- this.#personas = this.options.personas ?? [void 0];
1280
- this.#limit = pLimit(this.options.concurrency ?? 5);
1281
- }
1282
- #complexities = [];
1283
- #personas = [];
1284
- #limit;
1285
- /**
1286
- * Generates question-SQL pairs by iterating through all persona × complexity combinations.
1287
- * Uses parallel processing bounded by the configured concurrency limit.
1288
- * Yields results as each combination completes (streaming pattern).
1289
- * @returns Generated pairs from all combinations
1290
- */
1291
- async *produce() {
1292
- const introspection = "";
1293
- const combinations = this.#personas.flatMap(
1294
- (persona8) => this.#complexities.map((complexity) => ({ persona: persona8, complexity }))
1295
- );
1296
- for (const { persona: persona8, complexity } of combinations) {
1297
- const pairs = await this.#processCombination(
1298
- introspection,
1299
- persona8,
1300
- complexity
1301
- );
1302
- if (pairs.length) {
1303
- yield pairs;
1304
- }
1305
- }
1306
- }
1307
- /**
1308
- * Processes a single persona × complexity combination by generating questions
1309
- * and converting each to SQL in parallel.
1310
- */
1311
- async #processCombination(introspection, persona8, complexity) {
1312
- const personaContext = persona8 ? `As ${persona8.role}, ${persona8.perspective}
1313
-
1314
- Generate questions this persona would ask.` : void 0;
1315
- const prompt = personaContext ? `${personaContext}
1316
-
1317
- Generate ${this.options.count} questions at ${complexity} complexity.` : void 0;
1318
- const { questions } = await this.#limit(
1319
- () => generateQuestions({
1320
- introspection,
1321
- complexity,
1322
- count: this.options.count,
1323
- prompt,
1324
- model: this.options.model
1325
- })
1326
- );
1327
- const pairs = await Promise.all(
1328
- questions.map(async (question) => {
1329
- const result = await this.#limit(async () => {
1330
- try {
1331
- return await toSql({
1332
- input: question,
1333
- adapter: this.adapter,
1334
- fragments: this.options.teachings ?? [],
1335
- model: this.options.model
1336
- });
1337
- } catch (error) {
1338
- if (UnanswerableSQLError.isInstance(error)) {
1339
- return {
1340
- attempts: 0,
1341
- sql: "",
1342
- errors: [
1343
- `Cannot answer the question ${question} because ${error.message}`
1344
- ]
1345
- };
1346
- }
1347
- throw error;
1348
- }
1349
- });
1350
- return {
1351
- question,
1352
- sql: result.sql,
1353
- success: !result.errors || result.errors.length === 0
1354
- };
1355
- })
1356
- );
1357
- return pairs;
1358
- }
1359
- };
1360
-
1361
- // packages/text2sql/src/lib/synthesis/synthesizers/breadth-evolver.ts
1362
- import { groq as groq6 } from "@ai-sdk/groq";
1363
- import dedent6 from "dedent";
1364
- import pLimit2 from "p-limit";
1365
- import z6 from "zod";
1366
- import "@deepagents/agent";
1367
- import {
1368
- ContextEngine as ContextEngine6,
1369
- InMemoryContextStore as InMemoryContextStore6,
1370
- fragment as fragment6,
1371
- guardrail as guardrail3,
1372
- persona as personaFragment,
1373
- structuredOutput as structuredOutput6,
1374
- user as user6
1375
- } from "@deepagents/context";
1376
-
1377
- // packages/text2sql/src/lib/synthesis/synthesizers/styles.ts
1378
- var ALL_STYLES = [
1379
- "formal",
1380
- // Professional business language
1381
- "colloquial",
1382
- // Casual everyday speech
1383
- "imperative",
1384
- // Commands: "Show me...", "Get..."
1385
- "interrogative",
1386
- // Questions: "What is...", "How many..."
1387
- "descriptive",
1388
- // Verbose, detailed
1389
- "concise",
1390
- // Brief, minimal
1391
- "vague",
1392
- // Ambiguous, hedging
1393
- "metaphorical",
1394
- // Figurative language
1395
- "conversational"
1396
- // Chat-like
1397
- ];
1398
- var styleInstructions = {
1399
- formal: "Use professional business language, complete sentences, no slang",
1400
- colloquial: "Use casual everyday speech, contractions, informal tone",
1401
- imperative: 'Phrase as commands: "Show me...", "Get...", "List..."',
1402
- interrogative: 'Phrase as questions: "What is...", "How many...", "Which..."',
1403
- descriptive: "Use detailed, verbose phrasing with extra context",
1404
- concise: "Use minimal words, telegram-style brevity",
1405
- vague: "Be intentionally ambiguous, use hedging language",
1406
- metaphorical: "Use figurative language, analogies, creative phrasing",
1407
- conversational: "Chat-like tone, as if talking to a colleague"
1408
- };
1409
-
1410
- // packages/text2sql/src/lib/synthesis/synthesizers/breadth-evolver.ts
1411
- var paraphraserOutputSchema = z6.object({
1412
- paraphrases: z6.array(
1413
- z6.string().describe("A paraphrased version of the original question")
1414
- ).min(1).describe("List of paraphrased questions that would produce the same SQL")
1415
- });
1416
- async function paraphraseQuestion(params) {
1417
- const context = new ContextEngine6({
1418
- store: new InMemoryContextStore6(),
1419
- chatId: `paraphraser-${crypto.randomUUID()}`,
1420
- userId: "system"
1421
- });
1422
- const personaInstruction = params.persona ? dedent6`
1423
- <persona role="${params.persona.role}">
1424
- ${params.persona.perspective}
1425
-
1426
- Paraphrase the question as this persona would naturally ask it.
1427
- Use their vocabulary, priorities, and framing style.
1428
- </persona>
1429
- ` : "";
1430
- const styleInstruction = params.persona?.styles && params.persona.styles.length > 0 ? dedent6`
1431
- <communication_styles>
1432
- Generate paraphrases using these communication styles: ${params.persona.styles.join(", ")}
1433
-
1434
- Style definitions:
1435
- ${params.persona.styles.map((s) => `- ${s}: ${styleInstructions[s]}`).join("\n")}
1436
-
1437
- Distribute paraphrases across these styles for variety.
1438
- </communication_styles>
1439
- ` : "";
1440
- context.set(
1441
- personaFragment({
1442
- name: "question_paraphraser",
1443
- role: "You are a linguistic expert specializing in paraphrasing database questions. Your task is to generate alternative phrasings of questions that preserve the exact same semantic meaning - they must all produce the identical SQL query.",
1444
- objective: "Generate paraphrased versions of questions that preserve exact semantic meaning and produce identical SQL"
1445
- }),
1446
- fragment6("original_question", params.question),
1447
- fragment6(
1448
- "reference_sql",
1449
- params.sql,
1450
- "This SQL shows what the question is really asking - all paraphrases must ask for exactly this"
1451
- ),
1452
- ...personaInstruction ? [fragment6("persona", personaInstruction)] : [],
1453
- ...styleInstruction ? [fragment6("communication_styles", styleInstruction)] : [],
1454
- fragment6(
1455
- "task",
1456
- dedent6`
1457
- Generate exactly ${params.count} paraphrased versions of the original question.
1458
-
1459
- Requirements:
1460
- 1. Each paraphrase must be semantically equivalent - it should produce the EXACT same SQL
1461
- 2. Vary the sentence structure, word choice, and phrasing style
1462
- 3. Use natural language without SQL keywords (SELECT, WHERE, JOIN, etc.)
1463
- 4. Keep paraphrases realistic - how actual users would ask
1464
- 5. Do not add or remove any conditions, filters, or requirements from the original
1465
- ${params.persona?.styles?.length ? "6. Apply the specified communication styles to create diverse phrasings" : ""}
1466
- `
1467
- ),
1468
- guardrail3({ rule: "NEVER change what data is being requested" }),
1469
- guardrail3({
1470
- rule: "NEVER add filters, aggregations, or conditions not in the original"
1471
- }),
1472
- guardrail3({
1473
- rule: "NEVER remove any specificity from the original question"
1474
- }),
1475
- guardrail3({
1476
- rule: "All paraphrases must be answerable by the exact same SQL query"
1477
- }),
1478
- user6(
1479
- `Paraphrase this question ${params.count} times: "${params.question}"`
1480
- )
1481
- );
1482
- const paraphraserOutput = structuredOutput6({
1483
- model: params.model ?? groq6("openai/gpt-oss-20b"),
1484
- context,
1485
- schema: paraphraserOutputSchema
1486
- });
1487
- return paraphraserOutput.generate();
1488
- }
1489
- var BreadthEvolver = class extends PairProducer {
1490
- /**
1491
- * @param source - Source pairs or producer to evolve
1492
- * @param options - Evolution options including count, persona, and concurrency
1493
- */
1494
- constructor(source, options) {
1495
- super();
1496
- this.source = source;
1497
- this.options = options;
1498
- this.#limit = pLimit2(this.options.concurrency ?? 4);
1499
- }
1500
- #limit;
1501
- /**
1502
- * Batch pairs within each chunk for concurrent processing.
1503
- * Uses pLimit for concurrency control, yields results per pair after chunk completes.
1504
- */
1505
- async *produce() {
1506
- for await (const chunk of this.from(this.source)) {
1507
- const tasks = chunk.map(
1508
- (pair) => this.#limit(async () => {
1509
- const result = await paraphraseQuestion({
1510
- question: pair.question,
1511
- sql: pair.sql,
1512
- count: this.options.count,
1513
- persona: this.options.persona,
1514
- model: this.options.model
1515
- });
1516
- return result.paraphrases.map((paraphrase) => ({
1517
- question: paraphrase,
1518
- sql: pair.sql,
1519
- context: pair.context,
1520
- success: pair.success
1521
- }));
1522
- })
1523
- );
1524
- const results = await Promise.all(tasks);
1525
- yield results.flat();
1526
- }
1527
- }
1528
- };
1529
-
1530
1356
  // packages/text2sql/src/lib/synthesis/synthesizers/depth-evolver.ts
1531
- import { groq as groq7 } from "@ai-sdk/groq";
1532
- import { NoObjectGeneratedError as NoObjectGeneratedError2, NoOutputGeneratedError as NoOutputGeneratedError2 } from "ai";
1533
- import dedent7 from "dedent";
1534
- import pLimit3 from "p-limit";
1535
- import pRetry2 from "p-retry";
1536
- import z7 from "zod";
1537
- import "@deepagents/agent";
1538
- import {
1539
- ContextEngine as ContextEngine7,
1540
- InMemoryContextStore as InMemoryContextStore7,
1541
- fragment as fragment7,
1542
- guardrail as guardrail4,
1543
- persona as persona6,
1544
- structuredOutput as structuredOutput7,
1545
- user as user7
1546
- } from "@deepagents/context";
1547
1357
  var techniqueInstructions = {
1548
- "add-aggregation": dedent7`
1358
+ "add-aggregation": dedent6`
1549
1359
  Add aggregation requirements to the question.
1550
1360
  Transform it to require GROUP BY, COUNT, SUM, AVG, MIN, MAX, or similar operations.
1551
1361
  Examples:
@@ -1553,7 +1363,7 @@ var techniqueInstructions = {
1553
1363
  - "List products" → "What is the average price per category?"
1554
1364
  - "Get employees" → "How many employees are in each department?"
1555
1365
  `,
1556
- "add-filter": dedent7`
1366
+ "add-filter": dedent6`
1557
1367
  Add filtering conditions to the question.
1558
1368
  Transform it to require WHERE clauses with specific conditions.
1559
1369
  Examples:
@@ -1561,7 +1371,7 @@ var techniqueInstructions = {
1561
1371
  - "List customers" → "List customers who have made more than 5 purchases"
1562
1372
  - "Get products" → "Get products with price above $100"
1563
1373
  `,
1564
- "add-join": dedent7`
1374
+ "add-join": dedent6`
1565
1375
  Add requirements that need data from related tables.
1566
1376
  Transform it to require JOIN operations between multiple tables.
1567
1377
  Examples:
@@ -1569,7 +1379,7 @@ var techniqueInstructions = {
1569
1379
  - "List products" → "List products with their supplier information"
1570
1380
  - "Get employees" → "Get employees with their department and manager names"
1571
1381
  `,
1572
- "add-reasoning": dedent7`
1382
+ "add-reasoning": dedent6`
1573
1383
  Add multi-step reasoning requirements.
1574
1384
  Transform it to require logical deduction, comparisons, or derived calculations.
1575
1385
  Examples:
@@ -1577,7 +1387,7 @@ var techniqueInstructions = {
1577
1387
  - "List products" → "Which products are underperforming compared to their category average?"
1578
1388
  - "Get revenue" → "Which month had the highest growth compared to the previous month?"
1579
1389
  `,
1580
- hypothetical: dedent7`
1390
+ hypothetical: dedent6`
1581
1391
  Add a hypothetical or speculative scenario.
1582
1392
  Transform it to require applying calculations or projections.
1583
1393
  Examples:
@@ -1586,36 +1396,36 @@ var techniqueInstructions = {
1586
1396
  - "Get costs" → "What would be the impact of a 10% discount on profit margins?"
1587
1397
  `
1588
1398
  };
1589
- var evolverOutputSchema = z7.object({
1590
- evolvedQuestion: z7.string().describe("The evolved, more complex version of the original question")
1399
+ var evolverOutputSchema = z6.object({
1400
+ evolvedQuestion: z6.string().describe("The evolved, more complex version of the original question")
1591
1401
  });
1592
1402
  async function evolveQuestion(params) {
1593
- const context = new ContextEngine7({
1594
- store: new InMemoryContextStore7(),
1403
+ const context = new ContextEngine6({
1404
+ store: new InMemoryContextStore6(),
1595
1405
  chatId: `evolver-${crypto.randomUUID()}`,
1596
1406
  userId: "system"
1597
1407
  });
1598
1408
  context.set(
1599
- persona6({
1409
+ persona5({
1600
1410
  name: "question_evolver",
1601
1411
  role: "You are an expert at evolving simple database questions into more complex ones. Your task is to take a basic question and transform it into a more sophisticated version that requires advanced SQL techniques to answer.",
1602
1412
  objective: "Transform simple questions into complex versions requiring advanced SQL techniques"
1603
1413
  }),
1604
- fragment7("original_question", params.question),
1605
- fragment7(
1414
+ fragment6("original_question", params.question),
1415
+ fragment6(
1606
1416
  "original_sql",
1607
1417
  params.sql,
1608
1418
  "(This shows what the original question required)"
1609
1419
  ),
1610
- fragment7("database_schema", params.schema),
1611
- fragment7(
1420
+ fragment6("database_schema", params.schema),
1421
+ fragment6(
1612
1422
  "technique",
1613
1423
  { name: params.technique },
1614
1424
  params.techniqueInstruction
1615
1425
  ),
1616
- fragment7(
1426
+ fragment6(
1617
1427
  "task",
1618
- dedent7`
1428
+ dedent6`
1619
1429
  Evolve the original question using the "${params.technique}" technique.
1620
1430
 
1621
1431
  Requirements:
@@ -1627,22 +1437,22 @@ async function evolveQuestion(params) {
1627
1437
  6. The evolved question should build upon the original topic/domain
1628
1438
  `
1629
1439
  ),
1630
- guardrail4({
1440
+ guardrail3({
1631
1441
  rule: "The evolved question MUST require more complex SQL than the original"
1632
1442
  }),
1633
- guardrail4({
1443
+ guardrail3({
1634
1444
  rule: "Do not ask for data that does not exist in the schema"
1635
1445
  }),
1636
- guardrail4({
1446
+ guardrail3({
1637
1447
  rule: "Keep the question grounded in the same domain as the original"
1638
1448
  }),
1639
- guardrail4({ rule: "Make sure the question is clear and unambiguous" }),
1640
- user7(
1449
+ guardrail3({ rule: "Make sure the question is clear and unambiguous" }),
1450
+ user6(
1641
1451
  `Evolve this question using "${params.technique}": "${params.question}"`
1642
1452
  )
1643
1453
  );
1644
- const evolverOutput = structuredOutput7({
1645
- model: params.model ?? groq7("openai/gpt-oss-20b"),
1454
+ const evolverOutput = structuredOutput6({
1455
+ model: params.model ?? groq6("openai/gpt-oss-20b"),
1646
1456
  context,
1647
1457
  schema: evolverOutputSchema
1648
1458
  });
@@ -1666,7 +1476,7 @@ var DepthEvolver = class extends PairProducer {
1666
1476
  this.source = source;
1667
1477
  this.adapter = adapter;
1668
1478
  this.options = options;
1669
- this.#limit = pLimit3(this.options?.concurrency ?? 4);
1479
+ this.#limit = pLimit2(this.options?.concurrency ?? 4);
1670
1480
  }
1671
1481
  #limit;
1672
1482
  /**
@@ -1757,28 +1567,28 @@ async function withRetry2(computation) {
1757
1567
  }
1758
1568
 
1759
1569
  // packages/text2sql/src/lib/synthesis/synthesizers/persona-generator.ts
1760
- import { groq as groq8 } from "@ai-sdk/groq";
1761
- import dedent8 from "dedent";
1762
- import z8 from "zod";
1570
+ import { groq as groq7 } from "@ai-sdk/groq";
1571
+ import dedent7 from "dedent";
1572
+ import z7 from "zod";
1763
1573
  import "@deepagents/agent";
1764
1574
  import {
1765
- ContextEngine as ContextEngine8,
1766
- InMemoryContextStore as InMemoryContextStore8,
1575
+ ContextEngine as ContextEngine7,
1576
+ InMemoryContextStore as InMemoryContextStore7,
1767
1577
  XmlRenderer,
1768
- fragment as fragment8,
1769
- guardrail as guardrail5,
1578
+ fragment as fragment7,
1579
+ guardrail as guardrail4,
1770
1580
  persona as personaFragment2,
1771
- structuredOutput as structuredOutput8,
1772
- user as user8
1581
+ structuredOutput as structuredOutput7,
1582
+ user as user7
1773
1583
  } from "@deepagents/context";
1774
- var outputSchema3 = z8.object({
1775
- personas: z8.array(
1776
- z8.object({
1777
- role: z8.string().describe("The job title or role of this persona"),
1778
- perspective: z8.string().describe(
1584
+ var outputSchema2 = z7.object({
1585
+ personas: z7.array(
1586
+ z7.object({
1587
+ role: z7.string().describe("The job title or role of this persona"),
1588
+ perspective: z7.string().describe(
1779
1589
  "Rich description of what this persona cares about when querying the database"
1780
1590
  ),
1781
- styles: z8.array(z8.enum(ALL_STYLES)).min(1).max(3).describe(
1591
+ styles: z7.array(z7.enum(ALL_STYLES)).min(1).max(3).describe(
1782
1592
  "Typical communication styles for this persona (1-3 styles)"
1783
1593
  )
1784
1594
  })
@@ -1787,8 +1597,8 @@ var outputSchema3 = z8.object({
1787
1597
  async function generatePersonas(schemaFragments, options) {
1788
1598
  const schema = new XmlRenderer().render(schemaFragments);
1789
1599
  const count = options?.count ?? 5;
1790
- const context = new ContextEngine8({
1791
- store: new InMemoryContextStore8(),
1600
+ const context = new ContextEngine7({
1601
+ store: new InMemoryContextStore7(),
1792
1602
  chatId: `persona-gen-${crypto.randomUUID()}`,
1793
1603
  userId: "system"
1794
1604
  });
@@ -1798,10 +1608,10 @@ async function generatePersonas(schemaFragments, options) {
1798
1608
  role: "You are an expert at understanding database schemas and inferring who would use them.",
1799
1609
  objective: "Generate realistic personas representing users who would query this database"
1800
1610
  }),
1801
- fragment8("database_schema", schema),
1802
- fragment8(
1611
+ fragment7("database_schema", schema),
1612
+ fragment7(
1803
1613
  "task",
1804
- dedent8`
1614
+ dedent7`
1805
1615
  Analyze the database schema and generate realistic personas representing
1806
1616
  the different types of users who would query this database.
1807
1617
 
@@ -1832,9 +1642,9 @@ async function generatePersonas(schemaFragments, options) {
1832
1642
  - Styles should match how this persona would naturally communicate
1833
1643
  `
1834
1644
  ),
1835
- fragment8(
1645
+ fragment7(
1836
1646
  "example",
1837
- dedent8`
1647
+ dedent7`
1838
1648
  For an e-commerce schema with orders, customers, products tables:
1839
1649
 
1840
1650
  {
@@ -1850,237 +1660,235 @@ async function generatePersonas(schemaFragments, options) {
1850
1660
  }
1851
1661
  `
1852
1662
  ),
1853
- guardrail5({
1663
+ guardrail4({
1854
1664
  rule: "Only generate personas relevant to the actual schema provided"
1855
1665
  }),
1856
- guardrail5({
1666
+ guardrail4({
1857
1667
  rule: "Do not invent tables or data that do not exist in the schema"
1858
1668
  }),
1859
- guardrail5({
1669
+ guardrail4({
1860
1670
  rule: "Ensure perspectives are specific to the domain, not generic"
1861
1671
  }),
1862
- user8(
1672
+ user7(
1863
1673
  `Generate exactly ${count} distinct personas who would query this database.`
1864
1674
  )
1865
1675
  );
1866
- const personaOutput = structuredOutput8({
1867
- model: options?.model ?? groq8("openai/gpt-oss-20b"),
1676
+ const personaOutput = structuredOutput7({
1677
+ model: options?.model ?? groq7("openai/gpt-oss-20b"),
1868
1678
  context,
1869
- schema: outputSchema3
1679
+ schema: outputSchema2
1870
1680
  });
1871
1681
  const output = await personaOutput.generate();
1872
1682
  return output.personas;
1873
1683
  }
1874
1684
 
1875
- // packages/text2sql/src/lib/synthesis/synthesizers/teachings-generator.ts
1876
- import { XmlRenderer as XmlRenderer2 } from "@deepagents/context";
1685
+ // packages/text2sql/src/lib/synthesis/synthesizers/schema-synthesizer.ts
1686
+ import pLimit3 from "p-limit";
1877
1687
 
1878
- // packages/text2sql/src/lib/agents/teachables.agent.ts
1879
- import { groq as groq9 } from "@ai-sdk/groq";
1880
- import dedent9 from "dedent";
1881
- import z9 from "zod";
1688
+ // packages/text2sql/src/lib/agents/question.agent.ts
1689
+ import { groq as groq8 } from "@ai-sdk/groq";
1690
+ import dedent8 from "dedent";
1691
+ import z8 from "zod";
1882
1692
  import "@deepagents/agent";
1883
1693
  import {
1884
- ContextEngine as ContextEngine9,
1885
- InMemoryContextStore as InMemoryContextStore9,
1886
- analogy,
1887
- clarification,
1888
- example as example2,
1889
- explain,
1890
- fragment as fragment9,
1891
- guardrail as guardrail6,
1892
- hint as hint2,
1893
- persona as persona7,
1894
- quirk,
1895
- structuredOutput as structuredOutput9,
1896
- styleGuide,
1897
- term,
1898
- user as user9,
1899
- workflow as workflow2
1694
+ ContextEngine as ContextEngine8,
1695
+ InMemoryContextStore as InMemoryContextStore8,
1696
+ fragment as fragment8,
1697
+ guardrail as guardrail5,
1698
+ persona as persona6,
1699
+ structuredOutput as structuredOutput8,
1700
+ user as user8
1900
1701
  } from "@deepagents/context";
1901
- var outputSchema4 = z9.object({
1902
- terms: z9.array(z9.object({ name: z9.string(), definition: z9.string() })).optional().describe("Domain terminology definitions"),
1903
- hints: z9.array(z9.object({ text: z9.string() })).optional().describe("Helpful hints for SQL generation"),
1904
- guardrails: z9.array(
1905
- z9.object({
1906
- rule: z9.string(),
1907
- reason: z9.string().optional(),
1908
- action: z9.string().optional()
1909
- })
1910
- ).optional().describe("Safety rules and constraints"),
1911
- explains: z9.array(
1912
- z9.object({
1913
- concept: z9.string(),
1914
- explanation: z9.string(),
1915
- therefore: z9.string().optional()
1916
- })
1917
- ).optional().describe("Concept explanations"),
1918
- examples: z9.array(
1919
- z9.object({
1920
- question: z9.string(),
1921
- answer: z9.string(),
1922
- note: z9.string().optional()
1923
- })
1924
- ).optional().describe("Example question-answer pairs"),
1925
- clarifications: z9.array(z9.object({ when: z9.string(), ask: z9.string(), reason: z9.string() })).optional().describe("When to ask for clarification"),
1926
- workflows: z9.array(
1927
- z9.object({
1928
- task: z9.string(),
1929
- steps: z9.array(z9.string()).min(1),
1930
- triggers: z9.array(z9.string()).optional(),
1931
- notes: z9.string().optional()
1932
- })
1933
- ).optional().describe("Multi-step workflows"),
1934
- quirks: z9.array(z9.object({ issue: z9.string(), workaround: z9.string() })).optional().describe("Known issues and workarounds"),
1935
- styleGuides: z9.array(
1936
- z9.object({
1937
- prefer: z9.string(),
1938
- never: z9.string().optional(),
1939
- always: z9.string().optional()
1940
- })
1941
- ).optional().describe("SQL style preferences"),
1942
- analogies: z9.array(
1943
- z9.object({
1944
- concepts: z9.array(z9.string()).min(2),
1945
- relationship: z9.string(),
1946
- insight: z9.string().optional(),
1947
- therefore: z9.string().optional(),
1948
- pitfall: z9.string().optional()
1949
- })
1950
- ).optional().describe("Concept analogies")
1702
+ var complexityInstructions = {
1703
+ simple: dedent8`
1704
+ Generate simple questions that require:
1705
+ - Basic SELECT with single table
1706
+ - Simple WHERE clauses with one condition
1707
+ - COUNT(*) or basic aggregations
1708
+ - No joins required
1709
+ Examples: "How many customers do we have?", "List all products", "What is the total revenue?"
1710
+ `,
1711
+ moderate: dedent8`
1712
+ Generate moderate questions that require:
1713
+ - JOINs between 2-3 tables
1714
+ - Multiple WHERE conditions (AND/OR)
1715
+ - GROUP BY with HAVING clauses
1716
+ - ORDER BY with LIMIT
1717
+ - Basic subqueries
1718
+ Examples: "What are the top 5 customers by total orders?", "Which products have never been ordered?"
1719
+ `,
1720
+ complex: dedent8`
1721
+ Generate complex questions that require:
1722
+ - Multiple JOINs (3+ tables)
1723
+ - Nested subqueries or CTEs
1724
+ - Complex aggregations with multiple GROUP BY columns
1725
+ - CASE expressions
1726
+ - Date/time calculations
1727
+ Examples: "What is the month-over-month growth rate?", "Which customers have increased spending compared to last year?"
1728
+ `,
1729
+ "high complex": dedent8`
1730
+ Generate highly complex questions that require advanced SQL features:
1731
+ - Window functions (ROW_NUMBER, RANK, DENSE_RANK)
1732
+ - LAG, LEAD for comparisons
1733
+ - Running totals (SUM OVER)
1734
+ - Moving averages
1735
+ - PARTITION BY clauses
1736
+ - Complex CTEs with multiple levels
1737
+ Examples: "What is the running total of sales per month?", "Rank customers by their purchase frequency within each region"
1738
+ `
1739
+ };
1740
+ var outputSchema3 = z8.object({
1741
+ questions: z8.array(z8.string().describe("A natural language question about the data")).min(1).describe("List of natural language questions a user might ask")
1951
1742
  });
1952
- async function toTeachings(input, options) {
1953
- const context = new ContextEngine9({
1954
- store: new InMemoryContextStore9(),
1955
- chatId: `teachables-gen-${crypto.randomUUID()}`,
1743
+ async function generateQuestions(params) {
1744
+ const { introspection, complexity, count, prompt, model } = params;
1745
+ const context = new ContextEngine8({
1746
+ store: new InMemoryContextStore8(),
1747
+ chatId: `question-gen-${crypto.randomUUID()}`,
1956
1748
  userId: "system"
1957
1749
  });
1958
1750
  context.set(
1959
- persona7({
1960
- name: "teachables-author",
1961
- role: 'You design "fragments" for a Text2SQL system. Fragments become structured XML instructions.',
1962
- objective: "Choose only high-impact items that improve accuracy, safety, or clarity for this database"
1751
+ persona6({
1752
+ name: "question_generator",
1753
+ role: "You are a synthetic data generator specializing in creating realistic natural language questions that users might ask about a database.",
1754
+ objective: "Generate diverse, realistic natural language questions that match the specified complexity level"
1963
1755
  }),
1964
- fragment9("database_schema", input.schema),
1965
- ...input.context ? [fragment9("additional_context", input.context)] : [],
1966
- fragment9(
1967
- "output_structure",
1968
- dedent9`
1969
- Output a JSON object with these optional arrays (include only relevant ones):
1970
- - terms: [{ name: string, definition: string }] - Domain terminology
1971
- - hints: [{ text: string }] - Helpful SQL generation hints
1972
- - guardrails: [{ rule: string, reason?: string, action?: string }] - Safety constraints
1973
- - explains: [{ concept: string, explanation: string, therefore?: string }] - Concept explanations
1974
- - examples: [{ question: string, answer: string, note?: string }] - Q&A examples
1975
- - clarifications: [{ when: string, ask: string, reason: string }] - Clarification triggers
1976
- - workflows: [{ task: string, steps: string[], triggers?: string[], notes?: string }] - Multi-step tasks
1977
- - quirks: [{ issue: string, workaround: string }] - Known issues
1978
- - styleGuides: [{ prefer: string, never?: string, always?: string }] - SQL style rules
1979
- - analogies: [{ concepts: string[], relationship: string, insight?: string, therefore?: string, pitfall?: string }]
1980
- `
1756
+ fragment8("database_schema", introspection || ""),
1757
+ fragment8(
1758
+ "complexity",
1759
+ { level: complexity },
1760
+ complexityInstructions[complexity]
1981
1761
  ),
1982
- fragment9(
1762
+ fragment8(
1983
1763
  "task",
1984
- dedent9`
1985
- 1. Analyze the schema to infer domain, relationships, and sensitive columns.
1986
- 2. Generate 3-10 fragments total across all categories, prioritizing:
1987
- - guardrails for PII columns (email, ssn, phone, etc)
1988
- - hints for status/enum columns
1989
- - clarifications for ambiguous terms
1990
- 3. Ground everything in the schema - do not invent tables/columns.
1991
- 4. Only include categories that are relevant to this schema.
1764
+ dedent8`
1765
+ Generate exactly ${count} natural language questions at the "${complexity}" complexity level.
1766
+ The questions should:
1767
+ 1. Match the complexity requirements above
1768
+ 2. Use natural business language, not technical SQL terms
1769
+ 3. Be realistic questions a non-technical user would actually ask
1770
+ 4. Cover different tables and relationships when possible
1992
1771
  `
1993
1772
  ),
1994
- user9(
1995
- `Analyze this database schema and generate fragments that will help an AI generate accurate SQL queries.`
1773
+ guardrail5({
1774
+ rule: "Questions MUST ONLY reference tables and columns that exist in the schema above"
1775
+ }),
1776
+ guardrail5({
1777
+ rule: "Before generating each question, verify that ALL entities (tables, columns, relationships) you reference are explicitly listed in the schema"
1778
+ }),
1779
+ guardrail5({
1780
+ rule: "DO NOT invent or assume tables/columns that are not explicitly shown in the schema"
1781
+ }),
1782
+ guardrail5({
1783
+ rule: "Use natural language without SQL keywords like SELECT, WHERE, etc."
1784
+ }),
1785
+ guardrail5({
1786
+ rule: "All questions must match the specified complexity level"
1787
+ }),
1788
+ user8(
1789
+ prompt ?? `Generate ${count} questions at ${complexity} complexity given db schema.`
1996
1790
  )
1997
1791
  );
1998
- const teachablesOutput = structuredOutput9({
1999
- model: options?.model ?? groq9("openai/gpt-oss-20b"),
1792
+ const questionOutput = structuredOutput8({
1793
+ model: model ?? groq8("openai/gpt-oss-20b"),
2000
1794
  context,
2001
- schema: outputSchema4
1795
+ schema: outputSchema3
2002
1796
  });
2003
- const result = await teachablesOutput.generate();
2004
- const fragments = [];
2005
- result.terms?.forEach((t) => fragments.push(term(t.name, t.definition)));
2006
- result.hints?.forEach((h) => fragments.push(hint2(h.text)));
2007
- result.guardrails?.forEach(
2008
- (g) => fragments.push(
2009
- guardrail6({ rule: g.rule, reason: g.reason, action: g.action })
2010
- )
2011
- );
2012
- result.explains?.forEach(
2013
- (e) => fragments.push(
2014
- explain({
2015
- concept: e.concept,
2016
- explanation: e.explanation,
2017
- therefore: e.therefore
2018
- })
2019
- )
2020
- );
2021
- result.examples?.forEach(
2022
- (e) => fragments.push(
2023
- example2({ question: e.question, answer: e.answer, note: e.note })
2024
- )
2025
- );
2026
- result.clarifications?.forEach(
2027
- (c) => fragments.push(
2028
- clarification({ when: c.when, ask: c.ask, reason: c.reason })
2029
- )
2030
- );
2031
- result.workflows?.forEach(
2032
- (w) => fragments.push(
2033
- workflow2({
2034
- task: w.task,
2035
- steps: w.steps,
2036
- triggers: w.triggers,
2037
- notes: w.notes
2038
- })
2039
- )
2040
- );
2041
- result.quirks?.forEach(
2042
- (q) => fragments.push(quirk({ issue: q.issue, workaround: q.workaround }))
2043
- );
2044
- result.styleGuides?.forEach(
2045
- (s) => fragments.push(
2046
- styleGuide({ prefer: s.prefer, never: s.never, always: s.always })
2047
- )
2048
- );
2049
- result.analogies?.forEach(
2050
- (a) => fragments.push(
2051
- analogy({
2052
- concepts: a.concepts,
2053
- relationship: a.relationship,
2054
- insight: a.insight,
2055
- therefore: a.therefore,
2056
- pitfall: a.pitfall
2057
- })
2058
- )
2059
- );
2060
- return fragments;
1797
+ return questionOutput.generate();
2061
1798
  }
2062
1799
 
2063
- // packages/text2sql/src/lib/synthesis/synthesizers/teachings-generator.ts
2064
- async function generateTeachings(schemaFragments, options) {
2065
- const schema = new XmlRenderer2().render(schemaFragments);
2066
- const maxRetries = options?.maxRetries ?? 3;
2067
- let lastError;
2068
- for (let attempt = 0; attempt < maxRetries; attempt++) {
2069
- try {
2070
- return await toTeachings(
2071
- { schema, context: options?.context },
2072
- { model: options?.model }
1800
+ // packages/text2sql/src/lib/synthesis/synthesizers/schema-synthesizer.ts
1801
+ var SchemaSynthesizer = class extends PairProducer {
1802
+ /**
1803
+ * @param adapter - Database adapter for schema introspection and SQL validation
1804
+ * @param options - Synthesis configuration including count, complexity, and concurrency
1805
+ */
1806
+ constructor(adapter, options) {
1807
+ super();
1808
+ this.adapter = adapter;
1809
+ this.options = options;
1810
+ this.#complexities = Array.isArray(this.options.complexity) ? this.options.complexity : [this.options.complexity ?? "moderate"];
1811
+ this.#personas = this.options.personas ?? [void 0];
1812
+ this.#limit = pLimit3(this.options.concurrency ?? 5);
1813
+ }
1814
+ #complexities = [];
1815
+ #personas = [];
1816
+ #limit;
1817
+ /**
1818
+ * Generates question-SQL pairs by iterating through all persona × complexity combinations.
1819
+ * Uses parallel processing bounded by the configured concurrency limit.
1820
+ * Yields results as each combination completes (streaming pattern).
1821
+ * @returns Generated pairs from all combinations
1822
+ */
1823
+ async *produce() {
1824
+ const introspection = "";
1825
+ const combinations = this.#personas.flatMap(
1826
+ (persona7) => this.#complexities.map((complexity) => ({ persona: persona7, complexity }))
1827
+ );
1828
+ for (const { persona: persona7, complexity } of combinations) {
1829
+ const pairs = await this.#processCombination(
1830
+ introspection,
1831
+ persona7,
1832
+ complexity
2073
1833
  );
2074
- } catch (error) {
2075
- lastError = error;
2076
- const isRetryable = lastError.message.includes("parse") || lastError.message.includes("schema") || lastError.message.includes("No object generated") || lastError.name.includes("AI_");
2077
- if (!isRetryable) {
2078
- throw lastError;
1834
+ if (pairs.length) {
1835
+ yield pairs;
2079
1836
  }
2080
1837
  }
2081
1838
  }
2082
- throw lastError;
2083
- }
1839
+ /**
1840
+ * Processes a single persona × complexity combination by generating questions
1841
+ * and converting each to SQL in parallel.
1842
+ */
1843
+ async #processCombination(introspection, persona7, complexity) {
1844
+ const personaContext = persona7 ? `As ${persona7.role}, ${persona7.perspective}
1845
+
1846
+ Generate questions this persona would ask.` : void 0;
1847
+ const prompt = personaContext ? `${personaContext}
1848
+
1849
+ Generate ${this.options.count} questions at ${complexity} complexity.` : void 0;
1850
+ const { questions } = await this.#limit(
1851
+ () => generateQuestions({
1852
+ introspection,
1853
+ complexity,
1854
+ count: this.options.count,
1855
+ prompt,
1856
+ model: this.options.model
1857
+ })
1858
+ );
1859
+ const pairs = await Promise.all(
1860
+ questions.map(async (question) => {
1861
+ const result = await this.#limit(async () => {
1862
+ try {
1863
+ return await toSql({
1864
+ input: question,
1865
+ adapter: this.adapter,
1866
+ fragments: this.options.teachings ?? [],
1867
+ model: this.options.model
1868
+ });
1869
+ } catch (error) {
1870
+ if (UnanswerableSQLError.isInstance(error)) {
1871
+ return {
1872
+ attempts: 0,
1873
+ sql: "",
1874
+ errors: [
1875
+ `Cannot answer the question ${question} because ${error.message}`
1876
+ ]
1877
+ };
1878
+ }
1879
+ throw error;
1880
+ }
1881
+ });
1882
+ return {
1883
+ question,
1884
+ sql: result.sql,
1885
+ success: !result.errors || result.errors.length === 0
1886
+ };
1887
+ })
1888
+ );
1889
+ return pairs;
1890
+ }
1891
+ };
2084
1892
  export {
2085
1893
  ALL_STYLES,
2086
1894
  BaseContextualExtractor,
@@ -2099,7 +1907,6 @@ export {
2099
1907
  WindowedContextExtractor,
2100
1908
  formatConversation,
2101
1909
  generatePersonas,
2102
- generateTeachings,
2103
1910
  getMessageText,
2104
1911
  resolveContext,
2105
1912
  styleInstructions,