@everworker/oneringai 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -27,7 +27,8 @@
27
27
  - [13. Streaming](#13-streaming)
28
28
  - [14. OAuth for External APIs](#14-oauth-for-external-apis)
29
29
  - [15. Developer Tools](#15-developer-tools)
30
- - [16. External API Integration](#16-external-api-integration) — Scoped Registry, Vendor Templates, Tool Discovery
30
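+ - [16. Desktop Automation Tools](#16-desktop-automation-tools-new) — Screenshot, mouse, keyboard, and window control
+ - [17. Document Reader](#17-document-reader-new) — PDF, DOCX, XLSX, PPTX, CSV, HTML, images
31
+ - [18. External API Integration](#18-external-api-integration) — Scoped Registry, Vendor Templates, Tool Discovery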
31
32
  - [MCP Integration](#mcp-model-context-protocol-integration)
32
33
  - [Documentation](#documentation)
33
34
  - [Examples](#examples)
@@ -77,6 +78,8 @@
77
78
  - 📝 **Persistent Instructions** - NEW: Agent-level custom instructions that persist across sessions on disk
78
79
  - 🛠️ **Agentic Workflows** - Built-in tool calling and multi-turn conversations
79
80
  - 🔧 **Developer Tools** - NEW: Filesystem and shell tools for coding assistants (read, write, edit, grep, glob, bash)
81
+ - 🖥️ **Desktop Automation** - NEW: OS-level computer use — screenshot, mouse, keyboard, and window control for vision-driven agent loops
82
+ - 📄 **Document Reader** - NEW: Universal file-to-text converter — PDF, DOCX, XLSX, PPTX, CSV, HTML, images auto-converted to markdown
80
83
  - 🔌 **MCP Integration** - NEW: Model Context Protocol client for seamless tool discovery from local and remote servers
81
84
  - 👁️ **Vision Support** - Analyze images with AI across all providers
82
85
  - 📋 **Clipboard Integration** - Paste screenshots directly (like Claude Code!)
@@ -254,6 +257,55 @@ const veoJob = await googleVideo.generate({
254
257
  });
255
258
  ```
256
259
 
260
+ ### Document Reader (NEW)
261
+
262
+ Read any document format — agents automatically get markdown text from PDFs, Word docs, spreadsheets, and more:
263
+
264
+ ```typescript
265
+ import { Agent, developerTools } from '@everworker/oneringai';
266
+
267
+ const agent = Agent.create({
268
+ connector: 'openai',
269
+ model: 'gpt-4',
270
+ tools: developerTools,
271
+ });
272
+
273
+ // read_file auto-converts binary documents to markdown
274
+ await agent.run('Read /path/to/report.pdf and summarize the key findings');
275
+ await agent.run('Read /path/to/data.xlsx and describe the trends');
276
+ await agent.run('Read /path/to/presentation.pptx and list all slides');
277
+ ```
278
+
279
+ **Programmatic usage:**
280
+
281
+ ```typescript
282
+ import { DocumentReader, readDocumentAsContent } from '@everworker/oneringai';
283
+
284
+ // Read any file to markdown pieces
285
+ const reader = DocumentReader.create();
286
+ const result = await reader.read('/path/to/report.pdf');
287
+ console.log(result.pieces); // DocumentPiece[] (text + images)
288
+
289
+ // One-call conversion to LLM Content[] (for multimodal input)
290
+ const content = await readDocumentAsContent('/path/to/slides.pptx', {
291
+ imageFilter: { minWidth: 100, minHeight: 100 },
292
+ imageDetail: 'auto',
293
+ });
294
+
295
+ const response = await agent.run([
296
+ { type: 'input_text', text: 'Analyze this document:' },
297
+ ...content,
298
+ ]);
299
+ ```
300
+
301
+ **Supported Formats:**
302
+ - **Office**: DOCX, PPTX, ODT, ODP, ODS, RTF (via `officeparser`)
303
+ - **Spreadsheets**: XLSX, CSV (via `exceljs`)
304
+ - **PDF** (via `unpdf`)
305
+ - **HTML** (via Readability + Turndown)
306
+ - **Text**: TXT, MD, JSON, XML, YAML
307
+ - **Images**: PNG, JPG, GIF, WEBP, SVG (pass-through as base64)
308
+
257
309
  ### Web Search
258
310
 
259
311
  Connector-based web search with multiple providers:
@@ -963,7 +1015,108 @@ await agent.run('Run npm test and report any failures');
963
1015
  - Timeout protection (default 2 min)
964
1016
  - Output truncation for large outputs
965
1017
 
966
- ### 16. External API Integration
1018
+ ### 16. Desktop Automation Tools (NEW)
1019
+
1020
+ OS-level desktop automation for building "computer use" agents — screenshot the screen, send to a vision model, receive tool calls (click, type, etc.), execute them, repeat:
1021
+
1022
+ ```typescript
1023
+ import { Agent, desktopTools } from '@everworker/oneringai';
1024
+
1025
+ const agent = Agent.create({
1026
+ connector: 'openai',
1027
+ model: 'gpt-4',
1028
+ tools: desktopTools, // All 11 desktop tools
1029
+ });
1030
+
1031
+ // Agent can now see and interact with the desktop:
1032
+ await agent.run('Take a screenshot and describe what you see');
1033
+ await agent.run('Open Safari and search for "weather forecast"');
1034
+ ```
1035
+
1036
+ **Available Tools:**
1037
+ - **desktop_screenshot** - Capture full screen or region (returns image to vision model)
1038
+ - **desktop_mouse_move** - Move cursor to position
1039
+ - **desktop_mouse_click** - Click (left/right/middle, single/double/triple)
1040
+ - **desktop_mouse_drag** - Drag from one position to another
1041
+ - **desktop_mouse_scroll** - Scroll wheel (vertical and horizontal)
1042
+ - **desktop_get_cursor** - Get current cursor position
1043
+ - **desktop_keyboard_type** - Type text
1044
+ - **desktop_keyboard_key** - Press shortcuts (e.g., `ctrl+c`, `cmd+shift+s`, `enter`)
1045
+ - **desktop_get_screen_size** - Get screen dimensions and scale factor
1046
+ - **desktop_window_list** - List visible windows
1047
+ - **desktop_window_focus** - Bring a window to the foreground
1048
+
1049
+ **Key Design:**
1050
+ - All coordinates are in **physical pixel space** (same as screenshot pixels) — no manual Retina scaling needed
1051
+ - Screenshots use the `__images` convention for automatic multimodal handling across all providers (Anthropic, OpenAI, Google)
1052
+ - Requires `@nut-tree-fork/nut-js` as an optional peer dependency: `npm install @nut-tree-fork/nut-js`
1053
+
1054
+ ### 17. Document Reader (NEW)
1055
+
1056
+ Universal file-to-LLM-content converter. Reads arbitrary document formats and produces clean markdown text with optional image extraction:
1057
+
1058
+ ```typescript
1059
+ import { DocumentReader, mergeTextPieces } from '@everworker/oneringai';
1060
+
1061
+ const reader = DocumentReader.create({
1062
+ defaults: {
1063
+ maxTokens: 50_000,
1064
+ extractImages: true,
1065
+ imageFilter: { minWidth: 100, minHeight: 100 },
1066
+ },
1067
+ });
1068
+
1069
+ // Read from file path, URL, Buffer, or Blob
1070
+ const result = await reader.read('/path/to/report.pdf');
1071
+ const urlResult = await reader.read('https://example.com/doc.xlsx');
1072
+ const bufferResult = await reader.read({ type: 'buffer', buffer: myBuffer, filename: 'doc.docx' });
1073
+
1074
+ // Get merged markdown text
1075
+ const markdown = mergeTextPieces(result.pieces);
1076
+
1077
+ // Metadata
1078
+ console.log(result.metadata.format); // 'pdf'
1079
+ console.log(result.metadata.estimatedTokens); // 12500
1080
+ console.log(result.metadata.processingTimeMs); // 234
1081
+ ```
1082
+
1083
+ **Automatic Integration — No Code Changes Needed:**
1084
+ - **`read_file` tool** — Agents calling `read_file` on a PDF, DOCX, or XLSX get markdown text automatically
1085
+ - **`web_fetch` tool** — Documents downloaded from URLs are auto-converted to markdown
1086
+
1087
+ **Content Bridge for Multimodal Input:**
1088
+
1089
+ ```typescript
1090
+ import { readDocumentAsContent } from '@everworker/oneringai';
1091
+
1092
+ // Convert document directly to Content[] for LLM input
1093
+ const content = await readDocumentAsContent('/path/to/slides.pptx', {
1094
+ extractImages: true,
1095
+ imageDetail: 'auto',
1096
+ maxImages: 20,
1097
+ });
1098
+
1099
+ // Use in agent.run() with text + images
1100
+ await agent.run([
1101
+ { type: 'input_text', text: 'Analyze this presentation:' },
1102
+ ...content,
1103
+ ]);
1104
+ ```
1105
+
1106
+ **Pluggable Architecture:**
1107
+ - 6 built-in format handlers (Office, Excel, PDF, HTML, Text, Image)
1108
+ - 3 default transformers (header, table formatting, truncation)
1109
+ - Custom handlers and transformers via `DocumentReader.create({ handlers, ... })`
1110
+ - All heavy dependencies lazy-loaded (officeparser, exceljs, unpdf)
1111
+
1112
+ **Image Filtering:**
1113
+ - Configurable min dimensions, min size, max count, pattern exclusions
1114
+ - Automatically removes junk images (logos, icons, tiny backgrounds)
1115
+ - Applied both at extraction time and at content conversion time
1116
+
1117
+ See the [User Guide](./USER_GUIDE.md#document-reader) for complete API reference and configuration options.
1118
+
1119
+ ### 18. External API Integration
967
1120
 
968
1121
  Connect your AI agents to 35+ external services with enterprise-grade resilience:
969
1122
 
@@ -1315,4 +1468,4 @@ MIT License - See [LICENSE](./LICENSE) file.
1315
1468
 
1316
1469
  ---
1317
1470
 
1318
- **Version:** 0.2.0 | **Last Updated:** 2026-02-09 | **[User Guide](./USER_GUIDE.md)** | **[API Reference](./API_REFERENCE.md)** | **[Changelog](./CHANGELOG.md)**
1471
+ **Version:** 0.2.1 | **Last Updated:** 2026-02-11 | **[User Guide](./USER_GUIDE.md)** | **[API Reference](./API_REFERENCE.md)** | **[Changelog](./CHANGELOG.md)**
@@ -1,4 +1,4 @@
1
- export { aD as AfterToolContext, av as AgentEventName, A as AgentEvents, ay as AgenticLoopEventName, ax as AgenticLoopEvents, aG as ApprovalResult, aE as ApproveToolContext, m as AuditEntry, aC as BeforeToolContext, aI as ExecutionCompleteEvent, aw as ExecutionConfig, E as ExecutionContext, l as ExecutionMetrics, aH as ExecutionStartEvent, j as HistoryMode, aA as Hook, H as HookConfig, au as HookManager, az as HookName, aL as LLMRequestEvent, aM as LLMResponseEvent, aB as ModifyingHook, aK as ToolCompleteEvent, aF as ToolModification, aJ as ToolStartEvent } from '../../index-MJ14lkui.cjs';
1
+ export { aD as AfterToolContext, av as AgentEventName, A as AgentEvents, ay as AgenticLoopEventName, ax as AgenticLoopEvents, aG as ApprovalResult, aE as ApproveToolContext, m as AuditEntry, aC as BeforeToolContext, aI as ExecutionCompleteEvent, aw as ExecutionConfig, E as ExecutionContext, l as ExecutionMetrics, aH as ExecutionStartEvent, j as HistoryMode, aA as Hook, H as HookConfig, au as HookManager, az as HookName, aL as LLMRequestEvent, aM as LLMResponseEvent, aB as ModifyingHook, aK as ToolCompleteEvent, aF as ToolModification, aJ as ToolStartEvent } from '../../index-D62LXWdW.cjs';
2
2
  import '../../IProvider-c4QCbPjn.cjs';
3
3
  import '../../Vendor-DYh_bzwo.cjs';
4
4
  import 'eventemitter3';
@@ -1,4 +1,4 @@
1
- export { aD as AfterToolContext, av as AgentEventName, A as AgentEvents, ay as AgenticLoopEventName, ax as AgenticLoopEvents, aG as ApprovalResult, aE as ApproveToolContext, m as AuditEntry, aC as BeforeToolContext, aI as ExecutionCompleteEvent, aw as ExecutionConfig, E as ExecutionContext, l as ExecutionMetrics, aH as ExecutionStartEvent, j as HistoryMode, aA as Hook, H as HookConfig, au as HookManager, az as HookName, aL as LLMRequestEvent, aM as LLMResponseEvent, aB as ModifyingHook, aK as ToolCompleteEvent, aF as ToolModification, aJ as ToolStartEvent } from '../../index-B5UaeEvK.js';
1
+ export { aD as AfterToolContext, av as AgentEventName, A as AgentEvents, ay as AgenticLoopEventName, ax as AgenticLoopEvents, aG as ApprovalResult, aE as ApproveToolContext, m as AuditEntry, aC as BeforeToolContext, aI as ExecutionCompleteEvent, aw as ExecutionConfig, E as ExecutionContext, l as ExecutionMetrics, aH as ExecutionStartEvent, j as HistoryMode, aA as Hook, H as HookConfig, au as HookManager, az as HookName, aL as LLMRequestEvent, aM as LLMResponseEvent, aB as ModifyingHook, aK as ToolCompleteEvent, aF as ToolModification, aJ as ToolStartEvent } from '../../index-DVb6vfA3.js';
2
2
  import '../../IProvider-DcYJ3YE-.js';
3
3
  import '../../Vendor-DYh_bzwo.js';
4
4
  import 'eventemitter3';