opc-agent 4.0.0 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +404 -80
  2. package/README.zh-CN.md +82 -0
  3. package/dist/cli/chat.d.ts +2 -0
  4. package/dist/cli/chat.js +134 -0
  5. package/dist/cli/setup.d.ts +4 -0
  6. package/dist/cli/setup.js +303 -0
  7. package/dist/cli.js +106 -6
  8. package/dist/hub/brain-seed.d.ts +14 -0
  9. package/dist/hub/brain-seed.js +77 -0
  10. package/dist/hub/client.d.ts +25 -0
  11. package/dist/hub/client.js +44 -0
  12. package/dist/index.d.ts +4 -2
  13. package/dist/index.js +12 -3
  14. package/dist/providers/index.d.ts +1 -1
  15. package/dist/providers/index.js +54 -1
  16. package/dist/scheduler/cron-engine.d.ts +41 -0
  17. package/dist/scheduler/cron-engine.js +200 -0
  18. package/dist/scheduler/index.d.ts +3 -0
  19. package/dist/scheduler/index.js +7 -0
  20. package/dist/skills/builtin/index.d.ts +6 -0
  21. package/dist/skills/builtin/index.js +402 -0
  22. package/dist/skills/marketplace.d.ts +30 -0
  23. package/dist/skills/marketplace.js +142 -0
  24. package/dist/skills/types.d.ts +34 -0
  25. package/dist/skills/types.js +16 -0
  26. package/dist/studio/server.d.ts +25 -0
  27. package/dist/studio/server.js +780 -0
  28. package/dist/studio/templates-data.d.ts +21 -0
  29. package/dist/studio/templates-data.js +148 -0
  30. package/dist/studio-ui/index.html +2502 -1073
  31. package/dist/tools/builtin/index.d.ts +1 -0
  32. package/dist/tools/builtin/index.js +7 -2
  33. package/dist/tools/builtin/web-search.d.ts +9 -0
  34. package/dist/tools/builtin/web-search.js +150 -0
  35. package/dist/tools/document-processor.d.ts +39 -0
  36. package/dist/tools/document-processor.js +188 -0
  37. package/dist/tools/image-generator.d.ts +42 -0
  38. package/dist/tools/image-generator.js +136 -0
  39. package/dist/tools/web-scraper.d.ts +20 -0
  40. package/dist/tools/web-scraper.js +148 -0
  41. package/dist/tools/web-search.d.ts +51 -0
  42. package/dist/tools/web-search.js +152 -0
  43. package/install.ps1 +154 -0
  44. package/install.sh +164 -0
  45. package/package.json +63 -52
  46. package/src/cli/chat.ts +99 -0
  47. package/src/cli/setup.ts +314 -0
  48. package/src/cli.ts +108 -6
  49. package/src/hub/brain-seed.ts +54 -0
  50. package/src/hub/client.ts +60 -0
  51. package/src/index.ts +4 -2
  52. package/src/providers/index.ts +64 -1
  53. package/src/scheduler/cron-engine.ts +191 -0
  54. package/src/scheduler/index.ts +2 -0
  55. package/src/skills/builtin/index.ts +408 -0
  56. package/src/skills/marketplace.ts +113 -0
  57. package/src/skills/types.ts +42 -0
  58. package/src/studio/server.ts +1591 -791
  59. package/src/studio/templates-data.ts +178 -0
  60. package/src/studio-ui/index.html +2502 -1073
  61. package/src/tools/builtin/index.ts +37 -35
  62. package/src/tools/builtin/web-search.ts +126 -0
  63. package/src/tools/document-processor.ts +213 -0
  64. package/src/tools/image-generator.ts +150 -0
  65. package/src/tools/web-scraper.ts +179 -0
  66. package/src/tools/web-search.ts +180 -0
  67. package/tests/cron-engine.test.ts +101 -0
  68. package/tests/document-processor.test.ts +69 -0
  69. package/tests/e2e-nocode.test.ts +442 -0
  70. package/tests/image-generator.test.ts +84 -0
  71. package/tests/settings-api.test.ts +148 -0
  72. package/tests/setup.test.ts +73 -0
  73. package/tests/studio.test.ts +402 -229
  74. package/tests/voice-interaction.test.ts +38 -0
  75. package/tests/web-search.test.ts +155 -0
@@ -0,0 +1,179 @@
1
/**
 * Web Scraper - v0.10.0
 * Fetch URL content and extract readable text in markdown format.
 * Uses a simple readability-style extraction (no external dependencies).
 */

/** Result of scraping a single URL. */
export interface ScrapedContent {
  // Page <title> text, or the requested URL when no title is found.
  title: string;
  content: string; // markdown-ish extracted text, possibly truncated
  url: string; // the URL that was requested
  wordCount: number; // whitespace-separated token count of `content`
}

// Default cap (in characters) applied to extracted content before returning.
const MAX_CONTENT_LENGTH = 5000;
15
+
16
+ /**
17
+ * Fetch a URL and extract readable content as markdown.
18
+ */
19
+ export async function scrapeUrl(url: string, maxLength = MAX_CONTENT_LENGTH): Promise<ScrapedContent> {
20
+ const response = await fetch(url, {
21
+ headers: {
22
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
23
+ Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
24
+ },
25
+ signal: AbortSignal.timeout(15000),
26
+ redirect: 'follow',
27
+ });
28
+
29
+ const contentType = response.headers.get('content-type') || '';
30
+ const text = await response.text();
31
+
32
+ // If not HTML, return raw text
33
+ if (!contentType.includes('html')) {
34
+ const truncated = text.slice(0, maxLength);
35
+ return {
36
+ title: url,
37
+ content: truncated,
38
+ url,
39
+ wordCount: truncated.split(/\s+/).length,
40
+ };
41
+ }
42
+
43
+ return extractReadableContent(text, url, maxLength);
44
+ }
45
+
46
+ /**
47
+ * Extract readable content from HTML using simple heuristics.
48
+ */
49
+ export function extractReadableContent(html: string, url: string, maxLength = MAX_CONTENT_LENGTH): ScrapedContent {
50
+ // Extract title
51
+ const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
52
+ const title = titleMatch ? decodeEntities(titleMatch[1]).trim() : url;
53
+
54
+ // Remove non-content elements
55
+ let content = html;
56
+
57
+ // Remove script, style, nav, header, footer, aside, iframe
58
+ const removePatterns = [
59
+ /<script[\s\S]*?<\/script>/gi,
60
+ /<style[\s\S]*?<\/style>/gi,
61
+ /<nav[\s\S]*?<\/nav>/gi,
62
+ /<footer[\s\S]*?<\/footer>/gi,
63
+ /<aside[\s\S]*?<\/aside>/gi,
64
+ /<iframe[\s\S]*?<\/iframe>/gi,
65
+ /<noscript[\s\S]*?<\/noscript>/gi,
66
+ /<!--[\s\S]*?-->/g,
67
+ ];
68
+
69
+ for (const pattern of removePatterns) {
70
+ content = content.replace(pattern, '');
71
+ }
72
+
73
+ // Try to find main content area
74
+ const mainContent = findMainContent(content);
75
+ content = mainContent || content;
76
+
77
+ // Convert to markdown-ish text
78
+ content = htmlToMarkdown(content);
79
+
80
+ // Clean up whitespace
81
+ content = content
82
+ .replace(/\n{3,}/g, '\n\n')
83
+ .replace(/[ \t]+/g, ' ')
84
+ .trim();
85
+
86
+ // Truncate
87
+ if (content.length > maxLength) {
88
+ content = content.slice(0, maxLength) + '\n\n...[truncated]';
89
+ }
90
+
91
+ return {
92
+ title,
93
+ content,
94
+ url,
95
+ wordCount: content.split(/\s+/).filter(Boolean).length,
96
+ };
97
+ }
98
+
99
+ /**
100
+ * Try to find the main content area of the page.
101
+ */
102
+ function findMainContent(html: string): string | null {
103
+ // Try common content selectors
104
+ const patterns = [
105
+ /<article[^>]*>([\s\S]*?)<\/article>/i,
106
+ /<main[^>]*>([\s\S]*?)<\/main>/i,
107
+ /<div[^>]*class="[^"]*(?:content|article|post|entry|main)[^"]*"[^>]*>([\s\S]*?)<\/div>/i,
108
+ /<div[^>]*id="[^"]*(?:content|article|post|entry|main)[^"]*"[^>]*>([\s\S]*?)<\/div>/i,
109
+ ];
110
+
111
+ for (const pattern of patterns) {
112
+ const match = html.match(pattern);
113
+ if (match && match[1] && match[1].length > 200) {
114
+ return match[1];
115
+ }
116
+ }
117
+
118
+ // Fallback: find body content
119
+ const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
120
+ return bodyMatch ? bodyMatch[1] : null;
121
+ }
122
+
123
+ /**
124
+ * Simple HTML to Markdown conversion.
125
+ */
126
+ function htmlToMarkdown(html: string): string {
127
+ let md = html;
128
+
129
+ // Headers
130
+ md = md.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, '\n# $1\n');
131
+ md = md.replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, '\n## $1\n');
132
+ md = md.replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, '\n### $1\n');
133
+ md = md.replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, '\n#### $1\n');
134
+ md = md.replace(/<h5[^>]*>([\s\S]*?)<\/h5>/gi, '\n##### $1\n');
135
+ md = md.replace(/<h6[^>]*>([\s\S]*?)<\/h6>/gi, '\n###### $1\n');
136
+
137
+ // Paragraphs and line breaks
138
+ md = md.replace(/<p[^>]*>/gi, '\n');
139
+ md = md.replace(/<\/p>/gi, '\n');
140
+ md = md.replace(/<br\s*\/?>/gi, '\n');
141
+
142
+ // Links
143
+ md = md.replace(/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)');
144
+
145
+ // Bold and italic
146
+ md = md.replace(/<(?:strong|b)[^>]*>([\s\S]*?)<\/(?:strong|b)>/gi, '**$1**');
147
+ md = md.replace(/<(?:em|i)[^>]*>([\s\S]*?)<\/(?:em|i)>/gi, '*$1*');
148
+
149
+ // Code
150
+ md = md.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, '`$1`');
151
+ md = md.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, '\n```\n$1\n```\n');
152
+
153
+ // Lists
154
+ md = md.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n');
155
+
156
+ // Blockquote
157
+ md = md.replace(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, '\n> $1\n');
158
+
159
+ // Remove remaining HTML tags
160
+ md = md.replace(/<[^>]+>/g, '');
161
+
162
+ // Decode entities
163
+ md = decodeEntities(md);
164
+
165
+ return md;
166
+ }
167
+
168
+ function decodeEntities(text: string): string {
169
+ return text
170
+ .replace(/&amp;/g, '&')
171
+ .replace(/&lt;/g, '<')
172
+ .replace(/&gt;/g, '>')
173
+ .replace(/&quot;/g, '"')
174
+ .replace(/&#x27;/g, "'")
175
+ .replace(/&#39;/g, "'")
176
+ .replace(/&nbsp;/g, ' ')
177
+ .replace(/&#(\d+);/g, (_, n) => String.fromCharCode(parseInt(n)))
178
+ .replace(/&#x([0-9a-fA-F]+);/g, (_, n) => String.fromCharCode(parseInt(n, 16)));
179
+ }
@@ -0,0 +1,180 @@
1
/**
 * Web Search Engine Manager - v0.10.0
 * Supports multiple search backends with automatic fallback.
 * Default: DuckDuckGo (free, no API key required).
 */

/** A single search hit. */
export interface SearchResult {
  title: string;
  url: string;
  snippet: string; // short result description; may be empty
}

/** Per-call search options. */
export interface SearchOptions {
  maxResults?: number; // defaults to 5 in webSearch()
  engine?: SearchEngine; // overrides WebSearchConfig.defaultEngine
}

/** Supported search backends. */
export type SearchEngine = 'duckduckgo' | 'brave' | 'searxng' | 'google';

/** Configuration for one search backend. */
export interface SearchEngineConfig {
  enabled: boolean;
  apiKey?: string; // Brave key; for Google this is the combined "key:cx" string
  baseUrl?: string; // For SearXNG self-hosted
}

/** Top-level web-search configuration. */
export interface WebSearchConfig {
  defaultEngine: SearchEngine;
  enabled: boolean; // master switch: when false, webSearch() returns []
  engines: Partial<Record<SearchEngine, SearchEngineConfig>>;
}

// Out-of-the-box configuration: DuckDuckGo only (no API key required).
export const DEFAULT_SEARCH_CONFIG: WebSearchConfig = {
  defaultEngine: 'duckduckgo',
  enabled: true,
  engines: {
    duckduckgo: { enabled: true },
  },
};
39
+
40
+ /**
41
+ * Parse DuckDuckGo HTML search results.
42
+ */
43
+ export function parseDuckDuckGoHTML(html: string): SearchResult[] {
44
+ const results: SearchResult[] = [];
45
+ // Match result blocks: <a class="result__a" href="...">title</a> ... <a class="result__snippet">snippet</a>
46
+ const resultBlocks = html.split(/class="result__body"/);
47
+
48
+ for (let i = 1; i < resultBlocks.length && results.length < 10; i++) {
49
+ const block = resultBlocks[i];
50
+
51
+ // Extract URL and title from result__a
52
+ const linkMatch = block.match(/class="result__a"[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/);
53
+ if (!linkMatch) continue;
54
+
55
+ let url = linkMatch[1];
56
+ const title = stripHTML(linkMatch[2]).trim();
57
+
58
+ // DuckDuckGo wraps URLs in redirect, extract actual URL
59
+ const uddgMatch = url.match(/[?&]uddg=([^&]+)/);
60
+ if (uddgMatch) {
61
+ url = decodeURIComponent(uddgMatch[1]);
62
+ }
63
+
64
+ // Extract snippet
65
+ const snippetMatch = block.match(/class="result__snippet"[^>]*>([\s\S]*?)<\/a>/);
66
+ const snippet = snippetMatch ? stripHTML(snippetMatch[1]).trim() : '';
67
+
68
+ if (title && url) {
69
+ results.push({ title, url, snippet });
70
+ }
71
+ }
72
+
73
+ return results;
74
+ }
75
+
76
+ /**
77
+ * Search using DuckDuckGo HTML interface (no API key needed).
78
+ */
79
+ export async function searchDuckDuckGo(query: string, maxResults = 5): Promise<SearchResult[]> {
80
+ const url = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`;
81
+ const response = await fetch(url, {
82
+ headers: {
83
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
84
+ },
85
+ signal: AbortSignal.timeout(15000),
86
+ });
87
+ const html = await response.text();
88
+ return parseDuckDuckGoHTML(html).slice(0, maxResults);
89
+ }
90
+
91
+ /**
92
+ * Search using Brave Search API.
93
+ */
94
+ export async function searchBrave(query: string, apiKey: string, maxResults = 5): Promise<SearchResult[]> {
95
+ const url = `https://api.search.brave.com/res/v1/web/search?q=${encodeURIComponent(query)}&count=${maxResults}`;
96
+ const response = await fetch(url, {
97
+ headers: { 'X-Subscription-Token': apiKey, Accept: 'application/json' },
98
+ signal: AbortSignal.timeout(15000),
99
+ });
100
+ const data = await response.json() as any;
101
+ return (data.web?.results || []).slice(0, maxResults).map((r: any) => ({
102
+ title: r.title || '',
103
+ url: r.url || '',
104
+ snippet: r.description || '',
105
+ }));
106
+ }
107
+
108
+ /**
109
+ * Search using SearXNG instance.
110
+ */
111
+ export async function searchSearXNG(query: string, baseUrl: string, maxResults = 5): Promise<SearchResult[]> {
112
+ const url = `${baseUrl.replace(/\/$/, '')}/search?q=${encodeURIComponent(query)}&format=json`;
113
+ const response = await fetch(url, { signal: AbortSignal.timeout(15000) });
114
+ const data = await response.json() as any;
115
+ return (data.results || []).slice(0, maxResults).map((r: any) => ({
116
+ title: r.title || '',
117
+ url: r.url || '',
118
+ snippet: r.content || '',
119
+ }));
120
+ }
121
+
122
+ /**
123
+ * Search using Google Custom Search API.
124
+ */
125
+ export async function searchGoogle(query: string, apiKey: string, maxResults = 5): Promise<SearchResult[]> {
126
+ // apiKey format: "key:cx" (API key and Custom Search Engine ID)
127
+ const [key, cx] = apiKey.split(':');
128
+ const url = `https://www.googleapis.com/customsearch/v1?q=${encodeURIComponent(query)}&key=${key}&cx=${cx}&num=${maxResults}`;
129
+ const response = await fetch(url, { signal: AbortSignal.timeout(15000) });
130
+ const data = await response.json() as any;
131
+ return (data.items || []).slice(0, maxResults).map((r: any) => ({
132
+ title: r.title || '',
133
+ url: r.link || '',
134
+ snippet: r.snippet || '',
135
+ }));
136
+ }
137
+
138
+ /**
139
+ * Unified search function with fallback.
140
+ */
141
+ export async function webSearch(query: string, config?: WebSearchConfig, options?: SearchOptions): Promise<SearchResult[]> {
142
+ const cfg = config || DEFAULT_SEARCH_CONFIG;
143
+ if (!cfg.enabled) return [];
144
+
145
+ const maxResults = options?.maxResults || 5;
146
+ const engine = options?.engine || cfg.defaultEngine;
147
+
148
+ // Try requested engine first, then fallback chain
149
+ const fallbackOrder: SearchEngine[] = [engine, 'duckduckgo', 'brave', 'searxng', 'google']
150
+ .filter((e, i, arr) => arr.indexOf(e) === i) as SearchEngine[];
151
+
152
+ for (const eng of fallbackOrder) {
153
+ const engCfg = cfg.engines[eng];
154
+ if (engCfg && !engCfg.enabled) continue;
155
+
156
+ try {
157
+ switch (eng) {
158
+ case 'duckduckgo':
159
+ return await searchDuckDuckGo(query, maxResults);
160
+ case 'brave':
161
+ if (engCfg?.apiKey) return await searchBrave(query, engCfg.apiKey, maxResults);
162
+ continue;
163
+ case 'searxng':
164
+ if (engCfg?.baseUrl) return await searchSearXNG(query, engCfg.baseUrl, maxResults);
165
+ continue;
166
+ case 'google':
167
+ if (engCfg?.apiKey) return await searchGoogle(query, engCfg.apiKey, maxResults);
168
+ continue;
169
+ }
170
+ } catch {
171
+ continue; // Fallback to next engine
172
+ }
173
+ }
174
+
175
+ return [];
176
+ }
177
+
178
+ function stripHTML(html: string): string {
179
+ return html.replace(/<[^>]+>/g, '').replace(/&amp;/g, '&').replace(/&lt;/g, '<').replace(/&gt;/g, '>').replace(/&quot;/g, '"').replace(/&#x27;/g, "'").replace(/&nbsp;/g, ' ');
180
+ }
@@ -0,0 +1,101 @@
1
+ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
2
+ import { CronEngine, frequencyToCron } from '../src/scheduler/cron-engine';
3
+ import type { ScheduleTask } from '../src/scheduler/cron-engine';
4
+ import { existsSync, unlinkSync } from 'fs';
5
+ import { join } from 'path';
6
+ import * as os from 'os';
7
+
8
// Unit tests for CronEngine (task CRUD, immediate execution) and the
// frequencyToCron helper (frequency + "HH:MM" → 5-field cron expression).
describe('CronEngine', () => {
  let engine: CronEngine;
  // NOTE(review): declared but never used in any test below — presumably
  // intended for cleanup of persisted schedules (the fs imports are likewise
  // unused); either wire it into afterEach or remove it.
  const schedulesPath = join(os.homedir(), '.opc', 'schedules.json');

  beforeEach(() => {
    // Fresh engine per test; the task handler is a no-op.
    engine = new CronEngine(async () => {});
  });

  afterEach(() => {
    // Stop the engine so no timers leak between tests.
    engine.stop();
  });

  describe('frequencyToCron', () => {
    it('converts daily + time to cron', () => {
      expect(frequencyToCron('daily', '08:30')).toBe('30 8 * * *');
    });

    it('converts weekly to Monday cron', () => {
      expect(frequencyToCron('weekly', '09:00')).toBe('0 9 * * 1');
    });

    it('converts monthly to 1st of month', () => {
      expect(frequencyToCron('monthly', '10:15')).toBe('15 10 1 * *');
    });

    it('defaults to 9:00 when no time given', () => {
      expect(frequencyToCron('daily')).toBe('0 9 * * *');
    });
  });

  describe('CRUD operations', () => {
    it('creates a task', () => {
      const task = engine.createTask({
        name: 'Test Task',
        schedule: '0 9 * * *',
        description: 'Test description',
        frequency: 'daily',
        time: '09:00',
        outputChannel: 'web',
        enabled: true,
      });
      // id and createdAt are generated by the engine, not supplied above.
      expect(task.id).toBeTruthy();
      expect(task.name).toBe('Test Task');
      expect(task.createdAt).toBeTruthy();
    });

    it('lists tasks', () => {
      engine.createTask({ name: 'A', schedule: '0 9 * * *', description: '', frequency: 'daily', outputChannel: 'web', enabled: true });
      engine.createTask({ name: 'B', schedule: '0 10 * * *', description: '', frequency: 'daily', outputChannel: 'telegram', enabled: false });
      const tasks = engine.listTasks();
      // NOTE(review): >= 2 rather than === 2 suggests tasks persist across
      // engine instances (e.g. via schedules.json) — confirm, and add
      // cleanup between tests if so.
      expect(tasks.length).toBeGreaterThanOrEqual(2);
    });

    it('updates a task', () => {
      const task = engine.createTask({ name: 'Original', schedule: '0 9 * * *', description: '', frequency: 'daily', outputChannel: 'web', enabled: true });
      const updated = engine.updateTask(task.id, { name: 'Updated', enabled: false });
      expect(updated).not.toBeNull();
      expect(updated!.name).toBe('Updated');
      expect(updated!.enabled).toBe(false);
    });

    it('deletes a task', () => {
      const task = engine.createTask({ name: 'ToDelete', schedule: '0 9 * * *', description: '', frequency: 'daily', outputChannel: 'web', enabled: true });
      expect(engine.deleteTask(task.id)).toBe(true);
      expect(engine.getTask(task.id)).toBeUndefined();
    });

    it('returns false for deleting non-existent task', () => {
      expect(engine.deleteTask('nonexistent')).toBe(false);
    });

    it('returns null for updating non-existent task', () => {
      expect(engine.updateTask('nonexistent', { name: 'x' })).toBeNull();
    });
  });

  describe('run task', () => {
    it('runs a task immediately', async () => {
      // Separate engine with a spy handler so the invocation can be observed.
      const handler = vi.fn();
      const eng = new CronEngine(handler);
      const task = eng.createTask({ name: 'RunMe', schedule: '0 9 * * *', description: 'test', frequency: 'daily', outputChannel: 'web', enabled: true });
      eng.start();
      const result = await eng.runTask(task.id);
      expect(result).toBe(true);
      expect(handler).toHaveBeenCalled();
      eng.stop();
    });

    it('returns false for running non-existent task', async () => {
      const result = await engine.runTask('nonexistent');
      expect(result).toBe(false);
    });
  });
});
@@ -0,0 +1,69 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { DocumentProcessor } from '../src/tools/document-processor';
3
+
4
// Unit tests for DocumentProcessor: format detection (txt/md/csv/json),
// chunking behavior, the 50MB size limit and empty-input handling.
describe('DocumentProcessor', () => {
  const processor = new DocumentProcessor();

  it('should process plain text', async () => {
    const text = 'Hello world. This is a test document.\n\nSecond paragraph here.';
    const buffer = Buffer.from(text, 'utf-8');
    const doc = await processor.process(buffer, 'test.txt');

    // Format is derived from the filename extension.
    expect(doc.filename).toBe('test.txt');
    expect(doc.format).toBe('txt');
    expect(doc.chunks.length).toBeGreaterThan(0);
    expect(doc.chunks[0].content).toContain('Hello world');
    expect(doc.chunks[0].metadata.source).toBe('test.txt');
  });

  it('should process markdown with headings', async () => {
    const md = `# Title\n\nFirst section content.\n\n## Section Two\n\nSecond section content.\n\n## Section Three\n\nThird section.`;
    const buffer = Buffer.from(md, 'utf-8');
    const doc = await processor.process(buffer, 'test.md');

    expect(doc.format).toBe('md');
    expect(doc.chunks.length).toBeGreaterThan(0);
  });

  it('should process CSV', async () => {
    const csv = 'Name,Age,City\nAlice,30,Beijing\nBob,25,Shanghai';
    const buffer = Buffer.from(csv, 'utf-8');
    const doc = await processor.process(buffer, 'data.csv');

    expect(doc.format).toBe('csv');
    expect(doc.chunks.length).toBeGreaterThan(0);
    expect(doc.chunks[0].content).toContain('Alice');
  });

  it('should process JSON array', async () => {
    const json = JSON.stringify([{ name: 'Alice', role: 'engineer' }, { name: 'Bob', role: 'designer' }]);
    const buffer = Buffer.from(json, 'utf-8');
    const doc = await processor.process(buffer, 'data.json');

    expect(doc.format).toBe('json');
    expect(doc.chunks[0].content).toContain('Alice');
  });

  it('should chunk large text properly', async () => {
    // ~200 paragraphs — large enough to force the processor to split.
    const bigText = Array(200).fill('Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.').join('\n\n');
    const buffer = Buffer.from(bigText, 'utf-8');
    const doc = await processor.process(buffer, 'big.txt');

    expect(doc.chunks.length).toBeGreaterThan(1);
    for (const chunk of doc.chunks) {
      expect(chunk.content.length).toBeLessThanOrEqual(5000); // some tolerance
      expect(chunk.metadata.totalChunks).toBe(doc.chunks.length);
    }
  });

  it('should reject files over 50MB', async () => {
    // 51MB of zeros — just past the documented limit.
    const bigBuffer = Buffer.alloc(51 * 1024 * 1024);
    await expect(processor.process(bigBuffer, 'huge.txt')).rejects.toThrow('too large');
  });

  it('should handle empty content', async () => {
    // Empty input should succeed with zero chunks, not throw.
    const buffer = Buffer.from('', 'utf-8');
    const doc = await processor.process(buffer, 'empty.txt');
    expect(doc.chunks.length).toBe(0);
  });
});