agentlang 0.10.2 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. package/README.md +7 -14
  2. package/out/api/http.d.ts +4 -0
  3. package/out/api/http.d.ts.map +1 -1
  4. package/out/api/http.js +171 -26
  5. package/out/api/http.js.map +1 -1
  6. package/out/cli/main.d.ts.map +1 -1
  7. package/out/cli/main.js +3 -0
  8. package/out/cli/main.js.map +1 -1
  9. package/out/extension/main.cjs +250 -250
  10. package/out/extension/main.cjs.map +2 -2
  11. package/out/language/agentlang-validator.d.ts.map +1 -1
  12. package/out/language/agentlang-validator.js +4 -0
  13. package/out/language/agentlang-validator.js.map +1 -1
  14. package/out/language/error-reporter.d.ts +53 -0
  15. package/out/language/error-reporter.d.ts.map +1 -0
  16. package/out/language/error-reporter.js +879 -0
  17. package/out/language/error-reporter.js.map +1 -0
  18. package/out/language/generated/ast.d.ts +51 -1
  19. package/out/language/generated/ast.d.ts.map +1 -1
  20. package/out/language/generated/ast.js +40 -0
  21. package/out/language/generated/ast.js.map +1 -1
  22. package/out/language/generated/grammar.d.ts.map +1 -1
  23. package/out/language/generated/grammar.js +286 -190
  24. package/out/language/generated/grammar.js.map +1 -1
  25. package/out/language/main.cjs +828 -694
  26. package/out/language/main.cjs.map +3 -3
  27. package/out/language/parser.d.ts +4 -2
  28. package/out/language/parser.d.ts.map +1 -1
  29. package/out/language/parser.js +30 -97
  30. package/out/language/parser.js.map +1 -1
  31. package/out/language/syntax.d.ts +2 -0
  32. package/out/language/syntax.d.ts.map +1 -1
  33. package/out/language/syntax.js +6 -0
  34. package/out/language/syntax.js.map +1 -1
  35. package/out/runtime/api.d.ts.map +1 -1
  36. package/out/runtime/api.js +22 -0
  37. package/out/runtime/api.js.map +1 -1
  38. package/out/runtime/defs.d.ts +1 -0
  39. package/out/runtime/defs.d.ts.map +1 -1
  40. package/out/runtime/defs.js +2 -1
  41. package/out/runtime/defs.js.map +1 -1
  42. package/out/runtime/document-retriever.d.ts +24 -0
  43. package/out/runtime/document-retriever.d.ts.map +1 -0
  44. package/out/runtime/document-retriever.js +258 -0
  45. package/out/runtime/document-retriever.js.map +1 -0
  46. package/out/runtime/embeddings/chunker.d.ts +18 -0
  47. package/out/runtime/embeddings/chunker.d.ts.map +1 -1
  48. package/out/runtime/embeddings/chunker.js +47 -15
  49. package/out/runtime/embeddings/chunker.js.map +1 -1
  50. package/out/runtime/embeddings/openai.d.ts.map +1 -1
  51. package/out/runtime/embeddings/openai.js +22 -9
  52. package/out/runtime/embeddings/openai.js.map +1 -1
  53. package/out/runtime/embeddings/provider.d.ts +1 -0
  54. package/out/runtime/embeddings/provider.d.ts.map +1 -1
  55. package/out/runtime/embeddings/provider.js +20 -1
  56. package/out/runtime/embeddings/provider.js.map +1 -1
  57. package/out/runtime/integration-client.d.ts +21 -0
  58. package/out/runtime/integration-client.d.ts.map +1 -0
  59. package/out/runtime/integration-client.js +112 -0
  60. package/out/runtime/integration-client.js.map +1 -0
  61. package/out/runtime/integrations.d.ts.map +1 -1
  62. package/out/runtime/integrations.js +20 -9
  63. package/out/runtime/integrations.js.map +1 -1
  64. package/out/runtime/interpreter.d.ts +1 -0
  65. package/out/runtime/interpreter.d.ts.map +1 -1
  66. package/out/runtime/interpreter.js +152 -17
  67. package/out/runtime/interpreter.js.map +1 -1
  68. package/out/runtime/loader.d.ts.map +1 -1
  69. package/out/runtime/loader.js +70 -7
  70. package/out/runtime/loader.js.map +1 -1
  71. package/out/runtime/logger.d.ts.map +1 -1
  72. package/out/runtime/logger.js +8 -1
  73. package/out/runtime/logger.js.map +1 -1
  74. package/out/runtime/module.d.ts +10 -0
  75. package/out/runtime/module.d.ts.map +1 -1
  76. package/out/runtime/module.js +68 -3
  77. package/out/runtime/module.js.map +1 -1
  78. package/out/runtime/modules/ai.d.ts +9 -2
  79. package/out/runtime/modules/ai.d.ts.map +1 -1
  80. package/out/runtime/modules/ai.js +219 -67
  81. package/out/runtime/modules/ai.js.map +1 -1
  82. package/out/runtime/resolvers/interface.d.ts +4 -0
  83. package/out/runtime/resolvers/interface.d.ts.map +1 -1
  84. package/out/runtime/resolvers/interface.js +14 -1
  85. package/out/runtime/resolvers/interface.js.map +1 -1
  86. package/out/runtime/resolvers/sqldb/database.d.ts +2 -0
  87. package/out/runtime/resolvers/sqldb/database.d.ts.map +1 -1
  88. package/out/runtime/resolvers/sqldb/database.js +142 -126
  89. package/out/runtime/resolvers/sqldb/database.js.map +1 -1
  90. package/out/runtime/resolvers/sqldb/dbutil.d.ts.map +1 -1
  91. package/out/runtime/resolvers/sqldb/dbutil.js +8 -0
  92. package/out/runtime/resolvers/sqldb/dbutil.js.map +1 -1
  93. package/out/runtime/resolvers/sqldb/impl.d.ts +1 -0
  94. package/out/runtime/resolvers/sqldb/impl.d.ts.map +1 -1
  95. package/out/runtime/resolvers/sqldb/impl.js +7 -0
  96. package/out/runtime/resolvers/sqldb/impl.js.map +1 -1
  97. package/out/runtime/resolvers/vector/lancedb-store.d.ts +16 -0
  98. package/out/runtime/resolvers/vector/lancedb-store.d.ts.map +1 -0
  99. package/out/runtime/resolvers/vector/lancedb-store.js +159 -0
  100. package/out/runtime/resolvers/vector/lancedb-store.js.map +1 -0
  101. package/out/runtime/resolvers/vector/types.d.ts +32 -0
  102. package/out/runtime/resolvers/vector/types.d.ts.map +1 -0
  103. package/out/runtime/resolvers/vector/types.js +2 -0
  104. package/out/runtime/resolvers/vector/types.js.map +1 -0
  105. package/out/runtime/services/documentFetcher.d.ts.map +1 -1
  106. package/out/runtime/services/documentFetcher.js +21 -6
  107. package/out/runtime/services/documentFetcher.js.map +1 -1
  108. package/out/runtime/state.d.ts +19 -1
  109. package/out/runtime/state.d.ts.map +1 -1
  110. package/out/runtime/state.js +36 -1
  111. package/out/runtime/state.js.map +1 -1
  112. package/out/syntaxes/agentlang.monarch.js +1 -1
  113. package/out/syntaxes/agentlang.monarch.js.map +1 -1
  114. package/package.json +19 -19
  115. package/src/api/http.ts +197 -37
  116. package/src/cli/main.ts +3 -0
  117. package/src/language/agentlang-validator.ts +3 -0
  118. package/src/language/agentlang.langium +3 -1
  119. package/src/language/error-reporter.ts +1028 -0
  120. package/src/language/generated/ast.ts +62 -0
  121. package/src/language/generated/grammar.ts +286 -190
  122. package/src/language/parser.ts +31 -100
  123. package/src/language/syntax.ts +8 -0
  124. package/src/runtime/api.ts +31 -0
  125. package/src/runtime/defs.ts +2 -1
  126. package/src/runtime/document-retriever.ts +311 -0
  127. package/src/runtime/embeddings/chunker.ts +52 -14
  128. package/src/runtime/embeddings/openai.ts +27 -9
  129. package/src/runtime/embeddings/provider.ts +22 -1
  130. package/src/runtime/integration-client.ts +158 -0
  131. package/src/runtime/integrations.ts +20 -11
  132. package/src/runtime/interpreter.ts +142 -12
  133. package/src/runtime/loader.ts +83 -5
  134. package/src/runtime/logger.ts +12 -1
  135. package/src/runtime/module.ts +78 -3
  136. package/src/runtime/modules/ai.ts +263 -76
  137. package/src/runtime/resolvers/interface.ts +19 -1
  138. package/src/runtime/resolvers/sqldb/database.ts +158 -130
  139. package/src/runtime/resolvers/sqldb/dbutil.ts +8 -0
  140. package/src/runtime/resolvers/sqldb/impl.ts +8 -0
  141. package/src/runtime/resolvers/vector/lancedb-store.ts +187 -0
  142. package/src/runtime/resolvers/vector/types.ts +39 -0
  143. package/src/runtime/services/documentFetcher.ts +21 -6
  144. package/src/runtime/state.ts +40 -1
  145. package/src/syntaxes/agentlang.monarch.ts +1 -1
@@ -1,5 +1,6 @@
1
1
  import { createAgentlangServices } from '../language/agentlang-module.js';
2
2
  import { AstNode, EmptyFileSystem, LangiumCoreServices, LangiumDocument, URI } from 'langium';
3
+ import { getFormattedErrors, collectErrors, formatErrors } from './error-reporter.js';
3
4
  import {
4
5
  CrudMap,
5
6
  Delete,
@@ -19,12 +20,14 @@ import {
19
20
  isPrimExpr,
20
21
  isWorkflowDefinition,
21
22
  JoinSpec,
23
+ LimitClause,
22
24
  Literal,
23
25
  MapEntry,
24
26
  MapLiteral,
25
27
  ModuleDefinition,
26
28
  NegExpr,
27
29
  NotExpr,
30
+ OffsetClause,
28
31
  OrderByClause,
29
32
  Pattern,
30
33
  PrimExpr,
@@ -124,113 +127,25 @@ export async function parseWorkflow(workflowDef: string): Promise<WorkflowDefini
124
127
  }
125
128
  }
126
129
 
127
- const ErrorIndicator = '<-- ERROR';
128
-
129
130
  export function maybeGetValidationErrors(
130
131
  document: LangiumDocument,
131
- lines?: string[]
132
- ): string[] | undefined {
133
- if (lines === undefined) {
134
- lines = document.textDocument.getText().split('\n');
135
- }
132
+ _lines?: string[]
133
+ ): string | undefined {
136
134
  const validationErrors = (document.diagnostics ?? []).filter(e => e.severity === 1);
135
+ if (validationErrors.length === 0) return undefined;
137
136
 
138
- const sls = new Set<number>();
139
- const scs = new Set<number>();
140
- if (validationErrors.length > 0) {
141
- for (const validationError of validationErrors) {
142
- if (
143
- !sls.has(validationError.range.start.line) &&
144
- !scs.has(validationError.range.start.character)
145
- ) {
146
- const t = document.textDocument.getText(validationError.range);
147
- const s = `(${validationError.range.start.line + 1}:${validationError.range.start.character + 1}) unexpected token(s) '${t}'`;
148
- const ln = lines[validationError.range.start.line];
149
- if (ln.indexOf(ErrorIndicator) > 0) {
150
- lines[validationError.range.start.line] = `${ln}, ${s}`;
151
- } else {
152
- lines[validationError.range.start.line] = `${ln} ${ErrorIndicator} ${s}`;
153
- }
154
- sls.add(validationError.range.start.line);
155
- scs.add(validationError.range.start.character);
156
- }
157
- }
158
- return trimErrorLines(lines);
159
- } else {
160
- return undefined;
161
- }
162
- }
163
-
164
- function trimErrorLines(lines: string[]): string[] {
165
- let startidx = 0;
166
- for (let i = 0; i < lines.length; ++i) {
167
- if (lines[i].indexOf(ErrorIndicator) > 0) {
168
- startidx = i;
169
- break;
170
- }
171
- }
172
- let endidx = startidx;
173
- for (let i = startidx + 1; i < lines.length; ++i) {
174
- if (lines[i].indexOf(ErrorIndicator) > 0) {
175
- endidx = i;
176
- break;
177
- }
178
- }
179
- if (startidx > 0) {
180
- --startidx;
181
- }
182
- if (endidx != lines.length) {
183
- ++endidx;
184
- }
185
- return lines.slice(startidx, endidx);
186
- }
137
+ // Collect only validation errors (not lexer/parser)
138
+ const errors = collectErrors(document).filter(e => e.category === 'VALIDATION ERROR');
139
+ if (errors.length === 0) return undefined;
187
140
 
188
- function trimErrorMessage(s: string): string {
189
- const start = s.indexOf('Expecting:');
190
- if (start >= 0) {
191
- const end = s.indexOf('but found:');
192
- if (end > 0) {
193
- return `Expecting a valid token sequence, ${s.substring(end)}`;
194
- }
195
- }
196
- return s;
141
+ const source = document.textDocument.getText();
142
+ return formatErrors(errors, source);
197
143
  }
198
144
 
199
145
  export function maybeRaiseParserErrors(document: LangiumDocument) {
200
- const code = document.textDocument.getText();
201
- const lines = code.split('\n');
202
- let hasErrors = false;
203
- const errLines = new Set<number>();
204
- if (document.parseResult.lexerErrors.length > 0) {
205
- document.parseResult.lexerErrors.forEach((err: any) => {
206
- if (!errLines.has(err.line)) {
207
- const errMsg = trimErrorMessage(err.message);
208
- const s = `${ErrorIndicator} (${err.line}:${err.column}) ${errMsg}`;
209
- lines[err.line - 1] = `${lines[err.line - 1]} ${s}`;
210
- errLines.add(err.line);
211
- }
212
- });
213
- hasErrors = true;
214
- }
215
- if (document.parseResult.parserErrors.length > 0) {
216
- document.parseResult.parserErrors.forEach((err: any) => {
217
- const errMsg = trimErrorMessage(err.message);
218
- if (err.token.startLine && err.token.endLine) {
219
- if (!errLines.has(err.token.startLine)) {
220
- const s = `${ErrorIndicator} (${err.token.startLine}:${err.token.startColumn}) ${errMsg}`;
221
- lines[err.token.endLine - 1] = `${lines[err.token.endLine - 1]} ${s}`;
222
- lines.join('\n');
223
- errLines.add(err.token.startLine);
224
- }
225
- } else {
226
- lines.push(`ERROR: ${errMsg}`);
227
- }
228
- });
229
- hasErrors = true;
230
- }
231
- const errs = maybeGetValidationErrors(document, lines);
232
- if (hasErrors || errs !== undefined) {
233
- throw new Error(lines.join('\n'));
146
+ const formatted = getFormattedErrors(document);
147
+ if (formatted) {
148
+ throw new Error('\n' + formatted);
234
149
  }
235
150
  }
236
151
 
@@ -411,6 +326,12 @@ function introspectQueryPattern(crudMap: CrudMap): CrudPattern {
411
326
  if (opts.orderByClause) {
412
327
  cp.orderBy = opts.orderByClause.colNames;
413
328
  }
329
+ if (opts.limitClause) {
330
+ cp.limit = opts.limitClause.value;
331
+ }
332
+ if (opts.offsetClause) {
333
+ cp.offset = opts.offsetClause.value;
334
+ }
414
335
  cp.isCreate = false;
415
336
  cp.isQueryUpdate = false;
416
337
  cp.isQuery = true;
@@ -425,6 +346,8 @@ export type ExtractedQueryOptions = {
425
346
  where: WhereSpec | undefined;
426
347
  groupByClause: GroupByClause | undefined;
427
348
  orderByClause: OrderByClause | undefined;
349
+ limitClause: LimitClause | undefined;
350
+ offsetClause: OffsetClause | undefined;
428
351
  upsert: '@upsert' | undefined;
429
352
  distinct: '@distinct' | undefined;
430
353
  };
@@ -436,6 +359,8 @@ export function extractQueryOptions(crudMap: CrudMap): ExtractedQueryOptions {
436
359
  where: undefined,
437
360
  groupByClause: undefined,
438
361
  orderByClause: undefined,
362
+ limitClause: undefined,
363
+ offsetClause: undefined,
439
364
  upsert: undefined,
440
365
  distinct: undefined,
441
366
  };
@@ -453,6 +378,10 @@ export function extractQueryOptions(crudMap: CrudMap): ExtractedQueryOptions {
453
378
  r.groupByClause = qo.groupByClause;
454
379
  } else if (qo.orderByClause) {
455
380
  r.orderByClause = qo.orderByClause;
381
+ } else if (qo.limitClause) {
382
+ r.limitClause = qo.limitClause;
383
+ } else if (qo.offsetClause) {
384
+ r.offsetClause = qo.offsetClause;
456
385
  } else if (qo.upsert) {
457
386
  r.upsert = qo.upsert;
458
387
  } else if (qo.distinct) {
@@ -596,7 +525,7 @@ export async function introspectCase(caseStr: string): Promise<CasePattern> {
596
525
 
597
526
  export function canParse(s: string): boolean {
598
527
  const ts = s.trim();
599
- if (ts) {
528
+ if (ts && ts.includes('{')) {
600
529
  const contents = ts.substring(1, ts.length - 1).trim();
601
530
  return contents.length > 0;
602
531
  }
@@ -614,6 +543,8 @@ export function objectToQueryPattern(obj: any): string {
614
543
  });
615
544
  } else if (k === '@groupBy' || k === '@orderBy') {
616
545
  strs.push(`${k} ( ${xs.join(', ')} )`);
546
+ } else if (k === '@limit' || k === '@offset') {
547
+ strs.push(`${k}(${xs})`);
617
548
  } else {
618
549
  strs.push(`${k} ${objectToQuerySpecPattern(xs, true)}`);
619
550
  }
@@ -413,6 +413,8 @@ export class CrudPattern extends BasePattern {
413
413
  where: WhereSpecClausePattern[] | undefined;
414
414
  groupBy: string[] | undefined;
415
415
  orderBy: string[] | undefined;
416
+ limit: number | undefined;
417
+ offset: number | undefined;
416
418
  isQuery: boolean = false;
417
419
  isQueryUpdate: boolean = false;
418
420
  isCreate: boolean = false;
@@ -592,6 +594,12 @@ export class CrudPattern extends BasePattern {
592
594
  if (this.orderBy) {
593
595
  s = s.concat(`,\n${this.orderByAsString()}`);
594
596
  }
597
+ if (this.limit !== undefined) {
598
+ s = s.concat(`,\n@limit(${this.limit})`);
599
+ }
600
+ if (this.offset !== undefined) {
601
+ s = s.concat(`,\n@offset(${this.offset})`);
602
+ }
595
603
 
596
604
  return s.concat('}', this.hintsAsString());
597
605
  }
@@ -1,4 +1,5 @@
1
1
  import { fetchConfig as al_fetchConfig } from './interpreter.js';
2
+ import { defaultDataSource } from './resolvers/sqldb/database.js';
2
3
  import {
3
4
  makeInstance as al_makeInstance,
4
5
  isInstanceOfType as al_isInstanceOfType,
@@ -6,6 +7,13 @@ import {
6
7
  import { getLocalEnv as al_getLocalEnv, setLocalEnv as al_setLocalEnv } from './auth/defs.js';
7
8
  import { now } from './util.js';
8
9
  import { initDateFns } from './datefns.js';
10
+ import {
11
+ integrationAuthFetch as al_authFetch,
12
+ getIntegrationAuthHeaders as al_getAuthHeaders,
13
+ getOAuthAuthorizeUrl as al_getOAuthAuthorizeUrl,
14
+ exchangeOAuthCode as al_exchangeOAuthCode,
15
+ getIntegrationAccessToken as al_getAccessToken,
16
+ } from './integration-client.js';
9
17
 
10
18
  declare global {
11
19
  var agentlang: any | undefined;
@@ -37,6 +45,29 @@ export function initGlobalApi() {
37
45
  // Expose date-fns functions globally as dateFns.*
38
46
  globalThis.dateFns = initDateFns();
39
47
 
48
+ // Expose credential auth helpers globally
49
+ globalThis.agentlang.authFetch = al_authFetch;
50
+ globalThis.agentlang.getAuthHeaders = al_getAuthHeaders;
51
+
52
+ // Expose OAuth consent flow helpers globally
53
+ globalThis.agentlang.getOAuthAuthorizeUrl = al_getOAuthAuthorizeUrl;
54
+ globalThis.agentlang.exchangeOAuthCode = al_exchangeOAuthCode;
55
+ globalThis.agentlang.getAccessToken = al_getAccessToken;
56
+
57
+ // Expose raw SQL query for resolvers that need direct database access (e.g. pgvector)
58
+ globalThis.agentlang.rawQuery = async (sql: string, params?: any[]) => {
59
+ if (!defaultDataSource || !defaultDataSource.isInitialized) {
60
+ throw new Error('Database not initialized');
61
+ }
62
+ return defaultDataSource.query(sql, params);
63
+ };
64
+
65
+ // Expose database type detection for resolvers
66
+ globalThis.agentlang.getDbType = () => {
67
+ if (!defaultDataSource || !defaultDataSource.isInitialized) return 'unknown';
68
+ return defaultDataSource.options.type;
69
+ };
70
+
40
71
  ApiInited = true;
41
72
  }
42
73
  }
@@ -4,7 +4,8 @@ export const PathAttributeName: string = '__path__';
4
4
  export const PathAttributeNameQuery: string = '__path__?';
5
5
  export const ParentAttributeName: string = '__parent__';
6
6
  export const DeletedFlagAttributeName: string = '__is_deleted__';
7
- export const TenantAttributeName: string = '__tenant__';
7
+ export const AgentIdAttributeName: string = 'agentId';
8
+ export const TenantAttributeName: string = 'agentId';
8
9
 
9
10
  export function isPathAttribute(n: string): boolean {
10
11
  return n.startsWith(PathAttributeName);
@@ -0,0 +1,311 @@
1
+ import { logger } from './logger.js';
2
+ import { AppConfig } from './state.js';
3
+ import { TextChunker } from './embeddings/chunker.js';
4
+ import { OpenAIEmbeddingProvider } from './embeddings/openai.js';
5
+ import { LanceDBVectorStore } from './resolvers/vector/lancedb-store.js';
6
+ import type { VectorStore } from './resolvers/vector/types.js';
7
+ import crypto from 'crypto';
8
+ import { readFileSync } from 'fs';
9
+ import { resolve as pathResolve } from 'path';
10
+
11
+ const VECTOR_DIMENSION = 1536;
12
+
13
+ interface LocalChunk {
14
+ id: string;
15
+ content: string;
16
+ documentTitle: string;
17
+ chunkIndex: number;
18
+ }
19
+
20
+ function usePgvector(): boolean {
21
+ if (AppConfig?.vectorStore?.type === 'pgvector') return true;
22
+ if (AppConfig?.vectorStore?.type === 'lancedb') return false;
23
+ return AppConfig?.store?.type === 'postgres';
24
+ }
25
+
26
+ /**
27
+ * Local document retriever — embeds documents into pgvector or LanceDB
28
+ * and retrieves relevant chunks via vector similarity search.
29
+ */
30
+ class DocumentRetriever {
31
+ private vectorStore: VectorStore | null = null;
32
+ private embeddingProvider: OpenAIEmbeddingProvider | null = null;
33
+ private chunker: TextChunker | null = null;
34
+ private localChunks: Map<string, LocalChunk> = new Map();
35
+ private processedDocuments: Set<string> = new Set();
36
+ private initialized = false;
37
+
38
+ private async ensureInit(): Promise<void> {
39
+ if (this.initialized) return;
40
+
41
+ this.chunker = new TextChunker(1000, 200);
42
+ this.embeddingProvider = new OpenAIEmbeddingProvider({
43
+ model: 'text-embedding-3-small',
44
+ });
45
+
46
+ if (!usePgvector()) {
47
+ const dbPath =
48
+ AppConfig?.vectorStore?.type === 'lancedb'
49
+ ? (AppConfig.vectorStore as any).dbname || './data/document-vectors.lance'
50
+ : './data/document-vectors.lance';
51
+
52
+ this.vectorStore = new LanceDBVectorStore({
53
+ moduleName: 'documents',
54
+ vectorDimension: VECTOR_DIMENSION,
55
+ dbname: dbPath,
56
+ });
57
+ await this.vectorStore.init();
58
+ logger.info(`[DOCUMENT-RETRIEVER] LanceDB vector store initialized at ${dbPath}`);
59
+ } else {
60
+ try {
61
+ const ag = (globalThis as any).agentlang;
62
+ if (ag?.rawQuery) {
63
+ await ag.rawQuery(`
64
+ CREATE TABLE IF NOT EXISTS document_local_chunks (
65
+ id TEXT PRIMARY KEY,
66
+ content TEXT NOT NULL,
67
+ document_title TEXT NOT NULL,
68
+ chunk_index INTEGER NOT NULL,
69
+ embedding vector(${VECTOR_DIMENSION})
70
+ )
71
+ `);
72
+ try {
73
+ await ag.rawQuery(`
74
+ CREATE INDEX IF NOT EXISTS idx_document_local_chunks_embedding
75
+ ON document_local_chunks USING hnsw (embedding vector_cosine_ops)
76
+ `);
77
+ } catch {
78
+ // Index may already exist or pgvector extension not loaded
79
+ }
80
+ logger.info('[DOCUMENT-RETRIEVER] pgvector local chunks table initialized');
81
+ }
82
+ } catch (err) {
83
+ logger.warn(`[DOCUMENT-RETRIEVER] Failed to initialize pgvector table: ${err}`);
84
+ }
85
+ }
86
+
87
+ this.initialized = true;
88
+ }
89
+
90
+ async processDocument(title: string, url: string): Promise<void> {
91
+ if (this.processedDocuments.has(title)) return;
92
+
93
+ await this.ensureInit();
94
+
95
+ try {
96
+ let content: string;
97
+ if (url.startsWith('http://') || url.startsWith('https://')) {
98
+ const resp = await fetch(url);
99
+ if (!resp.ok) {
100
+ logger.warn(
101
+ `[DOCUMENT-RETRIEVER] Failed to fetch "${title}" from ${url}: ${resp.status}`
102
+ );
103
+ return;
104
+ }
105
+ content = await resp.text();
106
+ } else {
107
+ const filePath = pathResolve(url);
108
+ try {
109
+ content = readFileSync(filePath, 'utf-8');
110
+ } catch (err) {
111
+ logger.warn(`[DOCUMENT-RETRIEVER] Failed to read "${title}" from ${filePath}: ${err}`);
112
+ return;
113
+ }
114
+ }
115
+
116
+ if (!content || content.trim().length === 0) {
117
+ logger.debug(`[DOCUMENT-RETRIEVER] Document "${title}" is empty, skipping`);
118
+ this.processedDocuments.add(title);
119
+ return;
120
+ }
121
+
122
+ const chunks = this.chunker!.splitText(content);
123
+ logger.debug(`[DOCUMENT-RETRIEVER] Document "${title}": ${chunks.length} chunks`);
124
+
125
+ if (chunks.length === 0) {
126
+ this.processedDocuments.add(title);
127
+ return;
128
+ }
129
+
130
+ const embeddings = await this.embeddingProvider!.embedTexts(chunks);
131
+
132
+ if (usePgvector()) {
133
+ await this.storePgvectorChunks(title, chunks, embeddings);
134
+ } else {
135
+ await this.storeLanceDBChunks(title, chunks, embeddings);
136
+ }
137
+
138
+ this.processedDocuments.add(title);
139
+ logger.info(
140
+ `[DOCUMENT-RETRIEVER] Processed "${title}": ${chunks.length} chunks embedded and stored`
141
+ );
142
+ } catch (err) {
143
+ logger.warn(`[DOCUMENT-RETRIEVER] Error processing "${title}": ${err}`);
144
+ }
145
+ }
146
+
147
+ private async storePgvectorChunks(
148
+ title: string,
149
+ chunks: string[],
150
+ embeddings: number[][]
151
+ ): Promise<void> {
152
+ const ag = (globalThis as any).agentlang;
153
+ if (!ag?.rawQuery) return;
154
+
155
+ for (let i = 0; i < chunks.length; i++) {
156
+ const id = crypto.randomUUID();
157
+ const embeddingStr = `[${embeddings[i].join(',')}]`;
158
+ await ag.rawQuery(
159
+ `INSERT INTO document_local_chunks (id, content, document_title, chunk_index, embedding)
160
+ VALUES ($1, $2, $3, $4, $5::vector)
161
+ ON CONFLICT (id) DO NOTHING`,
162
+ [id, chunks[i], title, i, embeddingStr]
163
+ );
164
+ this.localChunks.set(id, {
165
+ id,
166
+ content: chunks[i],
167
+ documentTitle: title,
168
+ chunkIndex: i,
169
+ });
170
+ }
171
+ }
172
+
173
+ private async storeLanceDBChunks(
174
+ title: string,
175
+ chunks: string[],
176
+ embeddings: number[][]
177
+ ): Promise<void> {
178
+ if (!this.vectorStore) return;
179
+
180
+ for (let i = 0; i < chunks.length; i++) {
181
+ const id = crypto.randomUUID();
182
+ await this.vectorStore.addEmbedding({
183
+ id,
184
+ embedding: embeddings[i],
185
+ documentId: title,
186
+ });
187
+ this.localChunks.set(id, {
188
+ id,
189
+ content: chunks[i],
190
+ documentTitle: title,
191
+ chunkIndex: i,
192
+ });
193
+ }
194
+ }
195
+
196
+ async query(queryText: string, documentTitles?: string[], limit: number = 10): Promise<string> {
197
+ await this.ensureInit();
198
+
199
+ try {
200
+ const results = usePgvector()
201
+ ? await this.queryPgvector(queryText, documentTitles, limit)
202
+ : await this.queryLanceDB(queryText, documentTitles, limit);
203
+
204
+ if (results.length === 0) return '';
205
+
206
+ return results.map(r => r.content).join('\n\n---\n\n');
207
+ } catch (err) {
208
+ logger.debug(`[DOCUMENT-RETRIEVER] Query failed: ${err}`);
209
+ return '';
210
+ }
211
+ }
212
+
213
+ private async queryPgvector(
214
+ queryText: string,
215
+ documentTitles?: string[],
216
+ limit: number = 10
217
+ ): Promise<Array<{ id: string; content: string; similarity: number }>> {
218
+ const ag = (globalThis as any).agentlang;
219
+ if (!ag?.rawQuery) return [];
220
+
221
+ const queryEmbedding = await this.embeddingProvider!.embedText(queryText);
222
+ const embeddingStr = `[${queryEmbedding.join(',')}]`;
223
+
224
+ let sql: string;
225
+ let params: any[];
226
+
227
+ if (documentTitles && documentTitles.length > 0) {
228
+ const placeholders = documentTitles.map((_, i) => `$${i + 2}`).join(', ');
229
+ sql = `SELECT id, content, document_title, 1 - (embedding <=> $1::vector) AS similarity
230
+ FROM document_local_chunks
231
+ WHERE document_title IN (${placeholders})
232
+ ORDER BY embedding <=> $1::vector
233
+ LIMIT ${limit}`;
234
+ params = [embeddingStr, ...documentTitles];
235
+ } else {
236
+ sql = `SELECT id, content, document_title, 1 - (embedding <=> $1::vector) AS similarity
237
+ FROM document_local_chunks
238
+ ORDER BY embedding <=> $1::vector
239
+ LIMIT ${limit}`;
240
+ params = [embeddingStr];
241
+ }
242
+
243
+ const rows: any[] = await ag.rawQuery(sql, params);
244
+ return (rows || []).map((r: any) => ({
245
+ id: r.id,
246
+ content: r.content,
247
+ similarity: parseFloat(r.similarity) || 0,
248
+ }));
249
+ }
250
+
251
+ private async queryLanceDB(
252
+ queryText: string,
253
+ documentTitles?: string[],
254
+ limit: number = 10
255
+ ): Promise<Array<{ id: string; content: string; similarity: number }>> {
256
+ if (!this.vectorStore) return [];
257
+
258
+ const queryEmbedding = await this.embeddingProvider!.embedText(queryText);
259
+ const searchResults = await this.vectorStore.search(
260
+ queryEmbedding,
261
+ undefined,
262
+ undefined,
263
+ limit
264
+ );
265
+
266
+ const results: Array<{ id: string; content: string; similarity: number }> = [];
267
+
268
+ for (const sr of searchResults) {
269
+ const chunk = this.localChunks.get(sr.id);
270
+ if (!chunk) continue;
271
+
272
+ if (documentTitles && documentTitles.length > 0) {
273
+ if (!documentTitles.includes(chunk.documentTitle)) continue;
274
+ }
275
+
276
+ results.push({
277
+ id: sr.id,
278
+ content: chunk.content,
279
+ similarity: 1 - (sr.distance || 0),
280
+ });
281
+ }
282
+
283
+ return results.slice(0, limit);
284
+ }
285
+
286
+ async close(): Promise<void> {
287
+ if (this.vectorStore) {
288
+ await this.vectorStore.close();
289
+ this.vectorStore = null;
290
+ }
291
+ this.localChunks.clear();
292
+ this.processedDocuments.clear();
293
+ this.initialized = false;
294
+ }
295
+ }
296
+
297
+ let retrieverInstance: DocumentRetriever | null = null;
298
+
299
+ export function getDocumentRetriever(): DocumentRetriever {
300
+ if (!retrieverInstance) {
301
+ retrieverInstance = new DocumentRetriever();
302
+ }
303
+ return retrieverInstance;
304
+ }
305
+
306
+ export function resetDocumentRetriever(): void {
307
+ if (retrieverInstance) {
308
+ retrieverInstance.close().catch(() => {});
309
+ retrieverInstance = null;
310
+ }
311
+ }
@@ -1,41 +1,79 @@
1
+ /**
2
+ * Streaming text chunker - yields chunks one at a time without storing all in memory.
3
+ */
1
4
  export class TextChunker {
2
5
  private chunkSize: number;
3
6
  private chunkOverlap: number;
4
7
  private separators: string[] = ['\n\n', '\n', '. ', ' ', ''];
5
8
 
6
9
  constructor(chunkSize: number = 1000, chunkOverlap: number = 200) {
7
- this.chunkSize = chunkSize;
8
- this.chunkOverlap = chunkOverlap;
10
+ // Ensure valid values - overlap must be less than chunk size to avoid infinite loop
11
+ this.chunkSize = Math.max(100, chunkSize || 1000);
12
+ // Cap overlap to at most 20% of chunk size to ensure progress
13
+ this.chunkOverlap = Math.max(
14
+ 0,
15
+ Math.min(chunkOverlap || 200, Math.floor(this.chunkSize * 0.2))
16
+ );
9
17
  }
10
18
 
11
- splitText(text: string): string[] {
19
+ /**
20
+ * Calculate total chunks without creating them all in memory.
21
+ * Used for logging/progress tracking.
22
+ */
23
+ estimateChunks(text: string): number {
24
+ if (text.length <= this.chunkSize) {
25
+ return 1;
26
+ }
27
+ // Rough estimate: (text length / effective chunk size) + 1
28
+ const effectiveChunkSize = this.chunkSize - this.chunkOverlap;
29
+ return Math.ceil(text.length / effectiveChunkSize);
30
+ }
31
+
32
+ /**
33
+ * Streaming generator that yields chunks one at a time.
34
+ * Memory-efficient: doesn't store all chunks in an array.
35
+ */
36
+ *streamChunks(text: string): Generator<string, void, unknown> {
12
37
  if (text.length <= this.chunkSize) {
13
- return [text];
38
+ yield text;
39
+ return;
14
40
  }
15
41
 
16
- const chunks: string[] = [];
17
42
  let start = 0;
43
+ const minAdvance = Math.max(50, this.chunkSize - this.chunkOverlap); // Ensure we always advance
18
44
 
19
45
  while (start < text.length) {
20
46
  let end = Math.min(start + this.chunkSize, text.length);
21
47
 
22
48
  if (end < text.length) {
23
- end = this.findBestSplitPoint(text, start, end);
49
+ // Try to find a good split point, but ensure we advance by at least minAdvance
50
+ const splitPoint = this.findBestSplitPoint(text, start, end);
51
+ // Only use split point if it gives us reasonable progress
52
+ if (splitPoint - start >= minAdvance * 0.5) {
53
+ end = splitPoint;
54
+ }
55
+ // Otherwise use the hard end to ensure progress
24
56
  }
25
57
 
26
- chunks.push(text.substring(start, end));
27
- start = end - this.chunkOverlap;
58
+ yield text.substring(start, end);
28
59
 
29
- if (start < 0) start = 0;
30
- if (start >= text.length - this.chunkOverlap) {
31
- if (start < text.length) {
32
- chunks.push(text.substring(start));
33
- }
60
+ // Advance by at least minAdvance characters to avoid infinite loops
61
+ const nextStart = end - this.chunkOverlap;
62
+ start = Math.max(nextStart, start + minAdvance * 0.5);
63
+
64
+ if (start >= text.length) {
34
65
  break;
35
66
  }
36
67
  }
68
+ }
37
69
 
38
- return chunks;
70
+ /**
71
+ * Legacy method for backwards compatibility.
72
+ * ⚠️ WARNING: This creates all chunks in memory and can cause OOM on large documents.
73
+ * Prefer streamChunks() for large documents.
74
+ */
75
+ splitText(text: string): string[] {
76
+ return Array.from(this.streamChunks(text));
39
77
  }
40
78
 
41
79
  private findBestSplitPoint(text: string, start: number, end: number): number {