@hubblecommerce/overmind-core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,162 @@
1
+ import { parseConfluenceHtml } from './confluence-html-parser.js';
2
/**
 * Split markdown content by H2 headings (`## Section`).
 *
 * Each H2 section becomes one chunk. Fixes over the naive split:
 * - Content appearing before the first H2 heading is kept as its own
 *   untitled preamble chunk instead of being silently dropped.
 * - Deeper headings are no longer misparsed as H2 (the old
 *   /^## (.+)$/ pattern matched "### x" and produced title "# x").
 *
 * @param {string} markdown - Markdown document to split.
 * @returns {{sectionTitle: (string|undefined), content: string, sectionIndex: number}[]}
 *   Ordered chunks; sectionTitle is undefined for the preamble / no-heading case.
 */
export function splitMarkdownBySections(markdown) {
    // Exactly two #'s: the lookahead rejects "###", "####", ...
    const h2Pattern = /^##(?!#) (.+)$/gm;
    const matches = [...markdown.matchAll(h2Pattern)];
    if (matches.length === 0) {
        // No H2 headings - treat entire content as single chunk
        return [{
            sectionTitle: undefined,
            content: markdown.trim(),
            sectionIndex: 0,
        }];
    }
    const chunks = [];
    // Preserve any content that precedes the first H2 heading.
    const firstIndex = matches[0].index ?? 0;
    const preamble = markdown.substring(0, firstIndex).trim();
    if (preamble.length > 0) {
        chunks.push({
            sectionTitle: undefined,
            content: preamble,
            sectionIndex: 0,
        });
    }
    // Each section spans from its heading up to the next H2 heading (or EOF).
    for (let i = 0; i < matches.length; i++) {
        const match = matches[i];
        const startIndex = match.index ?? 0;
        const endIndex = i < matches.length - 1 ? (matches[i + 1].index ?? markdown.length) : markdown.length;
        chunks.push({
            sectionTitle: match[1].trim(),
            content: markdown.substring(startIndex, endIndex).trim(),
            sectionIndex: chunks.length,
        });
    }
    return chunks;
}
34
/**
 * Derive lightweight quality-indicator metadata from a chunk of markdown.
 *
 * @param {string} content - Markdown text to inspect.
 * @returns {{word_count: number, has_code_blocks: boolean, has_tables: boolean, has_links: boolean}}
 */
function analyzeContent(content) {
    // Count whitespace-separated tokens, ignoring empty fragments.
    const words = content.split(/\s+/).filter((token) => token.length > 0);
    return {
        word_count: words.length,
        has_code_blocks: content.includes('```'),
        has_tables: /\|.*\|/.test(content),
        has_links: /\[.+\]\(.+\)/.test(content),
    };
}
49
/**
 * Extract structured metadata from Confluence labels.
 * Recognized labels are mapped to dedicated fields (first match wins);
 * everything not claimed by a dedicated field is collected as keywords.
 *
 * @param {string[]} labels - Raw label names from the page.
 * @returns {{content_type: (string|undefined), customer: (string|undefined), category: (string|undefined), keywords: string[]}}
 */
function extractMetadataFromLabels(labels) {
    const CONTENT_TYPES = new Set(['project-info', 'faq', 'guide', 'decision', 'template']);
    const CATEGORIES = new Set(['active', 'maintenance', 'legacy', 'archived']);
    const contentType = labels.find((label) => CONTENT_TYPES.has(label));
    const customer = labels.find((label) => label.startsWith('customer-'));
    const category = labels.find((label) => CATEGORIES.has(label));
    // Keywords: all labels that weren't categorized above.
    const claimed = new Set([contentType, customer, category].filter(Boolean));
    return {
        content_type: contentType,
        customer,
        category,
        keywords: labels.filter((label) => !claimed.has(label)),
    };
}
73
/**
 * Process a Confluence page into section-based chunks (one chunk per
 * H2 section), attaching rich metadata to each chunk.
 *
 * @param {object} page - Confluence page (assumes v2 API shape with body.storage,
 *   version, _links.webui — TODO confirm against fetcher).
 * @param {string} spaceKey - Key of the space the page belongs to.
 * @param {string} baseUrl - Confluence base URL (trailing slash tolerated).
 * @param {string[]} [labels] - Labels attached to the page.
 * @returns {{pageContent: string, metadata: object}[]} Chunks ready for indexing;
 *   empty array when the page has no usable content.
 */
export function processConfluencePage(page, spaceKey, baseUrl, labels = []) {
    const htmlContent = page.body?.storage?.value;
    if (!htmlContent) {
        console.warn(`Page ${page.id} (${page.title}) has no content, skipping`);
        return [];
    }
    // Convert the Confluence storage-format HTML into markdown.
    const markdown = parseConfluenceHtml(htmlContent);
    if (!markdown || markdown.trim().length === 0) {
        console.warn(`Page ${page.id} (${page.title}) resulted in empty markdown after parsing, skipping`);
        return [];
    }
    const sections = splitMarkdownBySections(markdown);
    // Values shared by every chunk of this page.
    const pageUrl = `${baseUrl.replace(/\/$/, '')}/wiki${page._links.webui}`;
    const documentIdBase = `confluence:${spaceKey}:${page.id}`;
    const labelMetadata = extractMetadataFromLabels(labels);
    const indexedAt = new Date().toISOString();
    return sections.map((section) => ({
        pageContent: section.content,
        metadata: {
            // === Source Info ===
            source: 'confluence',
            page_id: page.id,
            page_title: page.title,
            page_url: pageUrl,
            space_key: spaceKey,
            // === Content Structure ===
            section_title: section.sectionTitle,
            section_index: section.sectionIndex,
            chunk_type: section.sectionTitle ? 'section' : 'full_page',
            // === Labels ===
            labels,
            // === Extracted Metadata ===
            ...labelMetadata,
            // === Timestamps ===
            last_updated: page.version.createdAt,
            page_version: page.version.number,
            indexed_at: indexedAt,
            // === Author Info ===
            last_updated_by: page.version.authorId,
            page_creator: page.authorId,
            // === Quality Indicators ===
            ...analyzeContent(section.content),
            // === Parent Context ===
            parent_page_id: page.parentId,
            // === Standard fields ===
            document_id: `${documentIdBase}_section_${section.sectionIndex}`,
            document_type: 'confluence_page',
        },
    }));
}
145
/**
 * Process multiple Confluence pages in batch.
 * A failure on one page is logged and does not abort the remaining pages.
 *
 * @param {object[]} pages - Confluence pages to process.
 * @param {string} spaceKey - Space key shared by the pages.
 * @param {string} baseUrl - Confluence base URL.
 * @param {Map<string, string[]>} [labelsMap] - Lookup from page id to labels.
 * @returns {{pageContent: string, metadata: object}[]} All chunks from all pages.
 */
export function processConfluencePages(pages, spaceKey, baseUrl, labelsMap = new Map()) {
    return pages.flatMap((page) => {
        try {
            const labels = labelsMap.get(page.id) || [];
            return processConfluencePage(page, spaceKey, baseUrl, labels);
        }
        catch (error) {
            // Continue with other pages
            console.error(`Error processing page ${page.id} (${page.title}):`, error);
            return [];
        }
    });
}
@@ -0,0 +1,220 @@
1
+ import TurndownService from 'turndown';
2
/**
 * Parse Confluence storage format HTML to clean Markdown.
 *
 * Configures Turndown for ATX headings and fenced code blocks, installs
 * the custom rules for Confluence-specific <ac:*> elements, then
 * normalizes whitespace in the converted output.
 *
 * @param {string} html - Confluence storage-format HTML.
 * @returns {string} Cleaned markdown.
 */
export function parseConfluenceHtml(html) {
    const turndownService = new TurndownService({
        headingStyle: 'atx', // Use # for headings
        hr: '---',
        bulletListMarker: '-',
        codeBlockStyle: 'fenced',
        emDelimiter: '*',
        strongDelimiter: '**',
    });
    // Register handlers for Confluence macros and containers.
    addConfluenceRules(turndownService);
    const markdown = turndownService.turndown(html);
    // Collapse runs of blank lines and strip trailing whitespace.
    const collapsed = markdown.replace(/\n{3,}/g, '\n\n');
    const stripped = collapsed.replace(/[ \t]+$/gm, '');
    return stripped.trim();
}
25
/**
 * Add custom Turndown rules for Confluence storage format elements.
 *
 * Decomposed around shared helpers: every Confluence macro is an
 * <ac:structured-macro ac:name="..."> whose payload lives in
 * <ac:parameter>, <ac:rich-text-body> or <ac:plain-text-body> children,
 * so the per-macro rules only express their own formatting.
 *
 * @param {object} turndownService - Turndown service to register rules on.
 */
function addConfluenceRules(turndownService) {
    // --- shared helpers -------------------------------------------------
    // True when node is <ac:structured-macro ac:name="name">.
    const isMacro = (node, name) => node.nodeName === 'AC:STRUCTURED-MACRO' &&
        node.getAttribute('ac:name') === name;
    // Text of <ac:parameter ac:name="name">, or undefined when absent.
    const paramText = (node, name) => node.querySelector(`ac\\:parameter[ac\\:name="${name}"]`)?.textContent;
    // Text of the macro's body containers, or undefined when absent.
    const richText = (node) => node.querySelector('ac\\:rich-text-body')?.textContent;
    const plainText = (node) => node.querySelector('ac\\:plain-text-body')?.textContent;

    // Confluence Code Macro -> fenced code block with language hint.
    turndownService.addRule('confluenceCodeMacro', {
        filter: (node) => isMacro(node, 'code'),
        replacement: (content, node) => {
            const language = paramText(node, 'language') || '';
            const code = plainText(node) || '';
            return '\n```' + language + '\n' + code + '\n```\n';
        },
    });

    // Confluence Info/Warning/Note/Tip Panels -> labeled blockquote.
    turndownService.addRule('confluenceInfoPanels', {
        filter: (node) => {
            if (node.nodeName !== 'AC:STRUCTURED-MACRO')
                return false;
            const macroName = node.getAttribute('ac:name');
            return ['info', 'note', 'tip', 'warning', 'panel'].includes(macroName || '');
        },
        replacement: (content, node) => {
            const macroName = node.getAttribute('ac:name') || 'info';
            const bodyContent = richText(node) || plainText(node) || content;
            return '\n> **' + macroName.toUpperCase() + '**: ' + bodyContent.trim() + '\n\n';
        },
    });

    // Confluence TOC Macro -> static placeholder.
    turndownService.addRule('confluenceTocMacro', {
        filter: (node) => isMacro(node, 'toc'),
        replacement: () => '\n**Table of Contents**\n\n',
    });

    // Confluence Excerpt Macro -> its body text.
    turndownService.addRule('confluenceExcerptMacro', {
        filter: (node) => isMacro(node, 'excerpt'),
        replacement: (content, node) => richText(node) || content,
    });

    // Confluence Expand Macro (collapsible sections) -> bold title + body.
    turndownService.addRule('confluenceExpandMacro', {
        filter: (node) => isMacro(node, 'expand'),
        replacement: (content, node) => {
            const title = paramText(node, 'title') || 'Details';
            const bodyContent = richText(node) || content;
            return '\n**' + title + '**\n\n' + bodyContent + '\n\n';
        },
    });

    // Confluence Quote Macro -> markdown blockquote ("> " on every line).
    turndownService.addRule('confluenceQuoteMacro', {
        filter: (node) => isMacro(node, 'quote'),
        replacement: (content, node) => {
            const bodyContent = richText(node) || content;
            return '\n> ' + bodyContent.trim().split('\n').join('\n> ') + '\n\n';
        },
    });

    // Nested content containers pass their content through unchanged.
    turndownService.addRule('confluenceRichTextBody', {
        filter: (node) => node.nodeName === 'AC:RICH-TEXT-BODY',
        replacement: (content) => content,
    });
    turndownService.addRule('confluencePlainTextBody', {
        filter: (node) => node.nodeName === 'AC:PLAIN-TEXT-BODY',
        replacement: (content) => content,
    });

    // Confluence Links (internal page links) -> markdown links where a
    // target is recoverable; bare text otherwise.
    turndownService.addRule('confluenceLinks', {
        filter: (node) => node.nodeName === 'AC:LINK',
        replacement: (content, node) => {
            const linkBody = node.querySelector('ac\\:link-body, ac\\:plain-text-link-body');
            const linkText = linkBody?.textContent || content || 'link';
            const href = node.querySelector('ri\\:page')?.getAttribute('ri:content-title') ||
                node.querySelector('ri\\:url')?.getAttribute('ri:value') ||
                node.getAttribute('ac:anchor') || '';
            return href ? '[' + linkText + '](' + href + ')' : linkText;
        },
    });

    // Confluence Status Macro -> "**[Title]** (colour)".
    turndownService.addRule('confluenceStatusMacro', {
        filter: (node) => isMacro(node, 'status'),
        replacement: (content, node) => {
            const title = paramText(node, 'title') || 'Status';
            const color = paramText(node, 'colour') || '';
            return '**[' + title + ']**' + (color ? ' (' + color + ')' : '');
        },
    });

    // Confluence Anchor -> HTML anchor so intra-page links keep working.
    turndownService.addRule('confluenceAnchor', {
        filter: (node) => isMacro(node, 'anchor'),
        replacement: (content, node) => {
            // The anchor name lives in the first (unnamed) <ac:parameter>.
            const anchorName = node.querySelector('ac\\:parameter')?.textContent || '';
            return anchorName ? '<a name="' + anchorName + '"></a>' : '';
        },
    });

    // Confluence JIRA Macro -> "**JIRA Issue:** KEY (server)".
    turndownService.addRule('confluenceJiraMacro', {
        filter: (node) => isMacro(node, 'jira'),
        replacement: (content, node) => {
            const jiraKey = paramText(node, 'key') || '';
            const serverName = paramText(node, 'server') || '';
            return jiraKey
                ? `\n**JIRA Issue:** ${jiraKey}${serverName ? ` (${serverName})` : ''}\n\n`
                : '';
        },
    });

    // Generic fallback for macros not handled by a specific rule above:
    // warn and keep whatever text content can be salvaged.
    const KNOWN_MACROS = new Set([
        'code', 'info', 'note', 'tip', 'warning', 'panel',
        'toc', 'excerpt', 'expand', 'quote', 'status', 'anchor', 'jira',
    ]);
    turndownService.addRule('confluenceUnknownMacros', {
        filter: (node) => {
            if (node.nodeName !== 'AC:STRUCTURED-MACRO')
                return false;
            return !KNOWN_MACROS.has(node.getAttribute('ac:name') || '');
        },
        replacement: (content, node) => {
            const macroName = node.getAttribute('ac:name') || 'unknown';
            console.warn(`Unknown Confluence macro: ${macroName}`);
            return richText(node) || plainText(node) || content || '';
        },
    });
}
@@ -0,0 +1,152 @@
1
+ import { tool } from '@langchain/core/tools';
2
+ import { z } from 'zod';
3
/**
 * Create Confluence agent tool for searching the knowledge base.
 * Runs a semantic vector search, optionally narrowed by metadata filters,
 * and returns a JSON string (createReactAgent compatibility).
 *
 * @param {object} vectorStore - Store exposing similaritySearchWithScore(query, k, filter).
 * @returns {object} LangChain structured tool named "confluence_agent".
 */
export function createConfluenceAgentTool(vectorStore) {
    const schema = z.object({
        query: z.string().describe("Semantic search query for Confluence knowledge base"),
        filters: z.object({
            keywords: z.array(z.string()).optional()
                .describe("Filter by Confluence labels (e.g., ['shopware', 'deployment'])"),
            content_type: z.enum(['project-info', 'faq', 'guide', 'decision', 'template']).optional()
                .describe("Type of content to search"),
            customer: z.string().optional()
                .describe("Filter by customer name (e.g., 'customer-xyz')"),
            category: z.enum(['active', 'maintenance', 'legacy', 'archived']).optional()
                .describe("Project/content lifecycle category"),
            space_key: z.string().optional()
                .describe("Confluence space key to search within"),
            chunk_type: z.enum(['section', 'full_page']).optional()
                .describe("Type of content chunk (section or full_page)"),
            has_code_blocks: z.boolean().optional()
                .describe("Filter for content with code examples"),
            has_tables: z.boolean().optional()
                .describe("Filter for content with tables"),
            has_links: z.boolean().optional()
                .describe("Filter for content with links"),
            min_word_count: z.number().optional()
                .describe("Minimum word count for content"),
            last_updated_by: z.string().optional()
                .describe("Filter by author user ID"),
            page_creator: z.string().optional()
                .describe("Filter by page creator user ID"),
            parent_page_id: z.string().optional()
                .describe("Filter by parent page ID (child pages only)")
        }).optional().describe("Optional metadata filters to narrow search results")
    });
    /**
     * Translate the tool's filter argument into a pgvector metadata filter.
     * Always scoped to source=confluence; templates and guides are excluded
     * unless a content_type is explicitly requested.
     */
    const buildMetadataFilter = (filters) => {
        const metadataFilter = {
            source: 'confluence'
        };
        if (filters?.keywords && filters.keywords.length > 0) {
            metadataFilter.labels = { arrayContains: filters.keywords };
        }
        metadataFilter.content_type = filters?.content_type
            ? filters.content_type
            : { notIn: ['template', 'guide'] };
        // String filters pass straight through when present.
        const passthrough = ['customer', 'category', 'space_key', 'chunk_type',
            'last_updated_by', 'page_creator', 'parent_page_id'];
        for (const key of passthrough) {
            if (filters?.[key]) {
                metadataFilter[key] = filters[key];
            }
        }
        // Boolean flags are meaningful even when false.
        for (const key of ['has_code_blocks', 'has_tables', 'has_links']) {
            if (filters?.[key] !== undefined) {
                metadataFilter[key] = filters[key];
            }
        }
        if (filters?.min_word_count !== undefined) {
            metadataFilter.word_count = { gte: filters.min_word_count };
        }
        return metadataFilter;
    };
    return tool(async ({ query, filters }) => {
        try {
            console.log(`\nšŸ” Confluence Agent searching: "${query}"`);
            if (filters) {
                console.log(` Filters:`, filters);
            }
            const metadataFilter = buildMetadataFilter(filters);
            // Perform vector search with scores.
            const resultsWithScores = await vectorStore.similaritySearchWithScore(query, 5, Object.keys(metadataFilter).length > 0 ? metadataFilter : undefined);
            console.log(` Found ${resultsWithScores.length} results`);
            const scores = resultsWithScores.map((r) => r.score);
            const count = scores.length;
            // Mean similarity over the returned results (0 when empty).
            const avgSimilarity = count > 0
                ? scores.reduce((sum, score) => sum + score, 0) / count
                : 0;
            // Population variance of the scores (consistency indicator).
            const scoreVariance = count > 1
                ? scores.reduce((sum, score) => sum + (score - avgSimilarity) ** 2, 0) / count
                : 0;
            console.log(` Avg similarity: ${avgSimilarity.toFixed(3)}, Variance: ${scoreVariance.toFixed(3)}`);
            // Shape results for the LLM.
            const documents = resultsWithScores.map(({ document }) => ({
                content: document.pageContent,
                metadata: document.metadata
            }));
            // Return as string for createReactAgent compatibility.
            return JSON.stringify({
                documents,
                searchQuery: query,
                filters: filters || {},
                resultCount: resultsWithScores.length,
                avgSimilarity,
                scoreVariance
            }, null, 2);
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            console.error(`āŒ Confluence Agent error:`, errorMessage);
            return JSON.stringify({
                error: true,
                message: `Tool execution failed. ${errorMessage}`,
                query,
                filters: filters || {},
                suggestion: 'Please check your input parameters and try again with a simpler query or different filters.'
            }, null, 2);
        }
    }, {
        name: "confluence_agent",
        description: `Search Confluence knowledge base for implicit organizational knowledge including:
- Deployment rules and procedures
- Team contacts and expertise areas
- Project mappings (GitLab ↔ Jira ↔ Customer)
- Historical technical decisions
- Approved FAQs and documentation
- Business rules and constraints

Supports semantic search with lots of metadata filters which need to be used for precise targeting.`,
        schema
    });
}
@@ -0,0 +1,18 @@
1
+ import { tool } from '@langchain/core/tools';
2
+ import { z } from 'zod';
3
/**
 * Create a tool that reports today's date as YYYY-MM-DD.
 * Uses local-time components (getFullYear/getMonth/getDate), not UTC.
 *
 * @returns {object} LangChain structured tool named "get_current_date".
 */
export function createGetCurrentDateTool() {
    const getCurrentDateSchema = z.object({});
    return tool(() => {
        const now = new Date();
        const pad = (value) => value.toString().padStart(2, '0');
        const formattedDate = `${now.getFullYear()}-${pad(now.getMonth() + 1)}-${pad(now.getDate())}`;
        console.log(`\nšŸ“… Returning current date: ${formattedDate}`);
        return formattedDate;
    }, {
        name: 'get_current_date',
        description: 'Returns the current date in YYYY-MM-DD format. Use this tool when the user asks a question that requires knowledge of the current date, such as "in the last week" or "since yesterday".',
        schema: getCurrentDateSchema,
    });
}
@@ -0,0 +1,95 @@
1
+ import { PGVectorStore } from '@langchain/community/vectorstores/pgvector';
2
+ import { Pool } from 'pg';
3
/**
 * Test PostgreSQL database connection by issuing a trivial query.
 * The temporary pool is always closed, even when the query fails.
 *
 * @param {{host: string, port: number, user: string, password: string, database: string}} config
 * @throws Error if connection fails
 */
export async function testPostgresConnection(config) {
    const { host, port, user, password, database } = config;
    const pool = new Pool({ host, port, user, password, database });
    try {
        await pool.query('SELECT 1');
    }
    finally {
        // Release connections regardless of the query outcome.
        await pool.end();
    }
}
22
/**
 * Create a pgvector-backed vector store, replacing previously indexed
 * chunks that share a document_id with the incoming documents, and wrap
 * it in the VectorStoreProvider interface used by the tools.
 *
 * Fix over the original wrapper: predicate filters are detected with
 * `typeof filter === 'function'` instead of `'call' in filter`, which
 * silently ignored metadata filter objects containing a `call` key
 * (they fell through to an unfiltered search with no client-side filtering).
 *
 * @param {object} embeddings - LangChain embeddings implementation.
 * @param {object[]} documents - Documents to (re-)index; may be empty.
 * @param {{host: string, port: number, user: string, password: string, database: string, tableName: string}} config
 * @returns {Promise<object>} Wrapper exposing addDocuments / similaritySearch /
 *   similaritySearchWithScore / cleanup.
 */
export async function createPostgresVectorStore(embeddings, documents, config) {
    console.log('Creating PostgreSQL vector store...');
    const poolConfig = {
        host: config.host,
        port: config.port,
        user: config.user,
        password: config.password,
        database: config.database,
    };
    const pgVectorStore = await PGVectorStore.initialize(embeddings, {
        postgresConnectionOptions: poolConfig,
        tableName: config.tableName,
        columns: {
            idColumnName: 'id',
            vectorColumnName: 'embedding',
            contentColumnName: 'content',
            metadataColumnName: 'metadata',
        },
    });
    if (documents.length > 0) {
        // Delete existing chunks with the same document_id before inserting,
        // so re-indexing replaces old chunks instead of duplicating them.
        const documentIds = new Set();
        for (const doc of documents) {
            if (doc.metadata?.document_id) {
                documentIds.add(String(doc.metadata.document_id));
            }
        }
        if (documentIds.size > 0) {
            const pool = new Pool(poolConfig);
            try {
                const documentIdsArray = Array.from(documentIds);
                // NOTE(review): config.tableName is interpolated into the SQL
                // (identifiers cannot be bound as query parameters) — it must
                // come from trusted configuration only, never user input.
                const result = await pool.query(`DELETE FROM ${config.tableName} WHERE metadata->>'document_id' = ANY($1)`, [documentIdsArray]);
                console.log(`Deleted ${result.rowCount} existing chunks for ${documentIds.size} document(s): ${documentIdsArray.join(', ')}`);
            }
            finally {
                await pool.end();
            }
        }
        await pgVectorStore.addDocuments(documents);
    }
    console.log('PostgreSQL vector store created');
    // Wrap PGVectorStore to match VectorStoreProvider interface.
    return {
        async addDocuments(docs) {
            await pgVectorStore.addDocuments(docs);
        },
        async similaritySearch(query, k, filter) {
            // Metadata objects go to pgvector's native filtering; predicate
            // functions are applied client-side after an unfiltered search.
            if (filter && typeof filter === 'object') {
                return await pgVectorStore.similaritySearch(query, k, filter);
            }
            const results = await pgVectorStore.similaritySearch(query, k);
            return typeof filter === 'function' ? results.filter(filter) : results;
        },
        async similaritySearchWithScore(query, k, filter) {
            if (filter && typeof filter === 'object') {
                const results = await pgVectorStore.similaritySearchWithScore(query, k, filter);
                return results.map(([document, score]) => ({ document, score }));
            }
            const results = await pgVectorStore.similaritySearchWithScore(query, k);
            const mapped = results.map(([document, score]) => ({ document, score }));
            // Predicate filters receive the document, not the {document, score} pair.
            return typeof filter === 'function'
                ? mapped.filter((item) => filter(item.document))
                : mapped;
        },
        async cleanup() {
            await pgVectorStore.end();
        },
    };
}
@@ -0,0 +1 @@
1
+ export {};