npm - @everworker/oneringai - Versions diffs - 0.2.0 → 0.2.1 - Mend

@everworker/oneringai 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md +156 -3
package/dist/capabilities/agents/index.d.cts +1 -1
package/dist/capabilities/agents/index.d.ts +1 -1
package/dist/capabilities/images/index.cjs.map +1 -1
package/dist/capabilities/images/index.js.map +1 -1
package/dist/{index-MJ14lkui.d.cts → index-D62LXWdW.d.cts} +9 -0
package/dist/{index-B5UaeEvK.d.ts → index-DVb6vfA3.d.ts} +9 -0
package/dist/index.cjs +2829 -585
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +836 -9
package/dist/index.d.ts +836 -9
package/dist/index.js +2795 -586
package/dist/index.js.map +1 -1
package/package.json +14 -1

package/README.md CHANGED Viewed

@@ -27,7 +27,8 @@
   - [13. Streaming](#13-streaming)
   - [14. OAuth for External APIs](#14-oauth-for-external-apis)
   - [15. Developer Tools](#15-developer-tools)
-  - [16. External API Integration](#16-external-api-integration) — Scoped Registry, Vendor Templates, Tool Discovery
+  - [17. Document Reader](#17-document-reader-new) — PDF, DOCX, XLSX, PPTX, CSV, HTML, images
+  - [18. External API Integration](#18-external-api-integration) — Scoped Registry, Vendor Templates, Tool Discovery
 - [MCP Integration](#mcp-model-context-protocol-integration)
 - [Documentation](#documentation)
 - [Examples](#examples)
@@ -77,6 +78,8 @@
 - 📝 **Persistent Instructions** - NEW: Agent-level custom instructions that persist across sessions on disk
 - 🛠️ **Agentic Workflows** - Built-in tool calling and multi-turn conversations
 - 🔧 **Developer Tools** - NEW: Filesystem and shell tools for coding assistants (read, write, edit, grep, glob, bash)
+- 🖥️ **Desktop Automation** - NEW: OS-level computer use — screenshot, mouse, keyboard, and window control for vision-driven agent loops
+- 📄 **Document Reader** - NEW: Universal file-to-text converter — PDF, DOCX, XLSX, PPTX, CSV, HTML, images auto-converted to markdown
 - 🔌 **MCP Integration** - NEW: Model Context Protocol client for seamless tool discovery from local and remote servers
 - 👁️ **Vision Support** - Analyze images with AI across all providers
 - 📋 **Clipboard Integration** - Paste screenshots directly (like Claude Code!)
@@ -254,6 +257,55 @@ const veoJob = await googleVideo.generate({
 });
 ```
+### Document Reader (NEW)
+Read any document format — agents automatically get markdown text from PDFs, Word docs, spreadsheets, and more:
+```typescript
+import { Agent, developerTools } from '@everworker/oneringai';
+const agent = Agent.create({
+  connector: 'openai',
+  model: 'gpt-4',
+  tools: developerTools,
+});
+// read_file auto-converts binary documents to markdown
+await agent.run('Read /path/to/report.pdf and summarize the key findings');
+await agent.run('Read /path/to/data.xlsx and describe the trends');
+await agent.run('Read /path/to/presentation.pptx and list all slides');
+```
+**Programmatic usage:**
+```typescript
+import { DocumentReader, readDocumentAsContent } from '@everworker/oneringai';
+// Read any file to markdown pieces
+const reader = DocumentReader.create();
+const result = await reader.read('/path/to/report.pdf');
+console.log(result.pieces); // DocumentPiece[] (text + images)
+// One-call conversion to LLM Content[] (for multimodal input)
+const content = await readDocumentAsContent('/path/to/slides.pptx', {
+  imageFilter: { minWidth: 100, minHeight: 100 },
+  imageDetail: 'auto',
+});
+const response = await agent.run([
+  { type: 'input_text', text: 'Analyze this document:' },
+  ...content,
+]);
+```
+**Supported Formats:**
+- **Office**: DOCX, PPTX, ODT, ODP, ODS, RTF (via `officeparser`)
+- **Spreadsheets**: XLSX, CSV (via `exceljs`)
+- **PDF** (via `unpdf`)
+- **HTML** (via Readability + Turndown)
+- **Text**: TXT, MD, JSON, XML, YAML
+- **Images**: PNG, JPG, GIF, WEBP, SVG (pass-through as base64)
 ### Web Search
 Connector-based web search with multiple providers:
@@ -963,7 +1015,108 @@ await agent.run('Run npm test and report any failures');
 - Timeout protection (default 2 min)
 - Output truncation for large outputs
-### 16. External API Integration
+### 16. Desktop Automation Tools (NEW)
+OS-level desktop automation for building "computer use" agents — screenshot the screen, send to a vision model, receive tool calls (click, type, etc.), execute them, repeat:
+```typescript
+import { Agent, desktopTools } from '@everworker/oneringai';
+const agent = Agent.create({
+  connector: 'openai',
+  model: 'gpt-4',
+  tools: desktopTools, // All 11 desktop tools
+});
+// Agent can now see and interact with the desktop:
+await agent.run('Take a screenshot and describe what you see');
+await agent.run('Open Safari and search for "weather forecast"');
+```
+**Available Tools:**
+- **desktop_screenshot** - Capture full screen or region (returns image to vision model)
+- **desktop_mouse_move** - Move cursor to position
+- **desktop_mouse_click** - Click (left/right/middle, single/double/triple)
+- **desktop_mouse_drag** - Drag from one position to another
+- **desktop_mouse_scroll** - Scroll wheel (vertical and horizontal)
+- **desktop_get_cursor** - Get current cursor position
+- **desktop_keyboard_type** - Type text
+- **desktop_keyboard_key** - Press shortcuts (e.g., `ctrl+c`, `cmd+shift+s`, `enter`)
+- **desktop_get_screen_size** - Get screen dimensions and scale factor
+- **desktop_window_list** - List visible windows
+- **desktop_window_focus** - Bring a window to the foreground
+**Key Design:**
+- All coordinates are in **physical pixel space** (same as screenshot pixels) — no manual Retina scaling needed
+- Screenshots use the `__images` convention for automatic multimodal handling across all providers (Anthropic, OpenAI, Google)
+- Requires `@nut-tree-fork/nut-js` as an optional peer dependency: `npm install @nut-tree-fork/nut-js`
+### 17. Document Reader (NEW)
+Universal file-to-LLM-content converter. Reads arbitrary document formats and produces clean markdown text with optional image extraction:
+```typescript
+import { DocumentReader, mergeTextPieces } from '@everworker/oneringai';
+const reader = DocumentReader.create({
+  defaults: {
+    maxTokens: 50_000,
+    extractImages: true,
+    imageFilter: { minWidth: 100, minHeight: 100 },
+  },
+});
+// Read from file path, URL, Buffer, or Blob
+const result = await reader.read('/path/to/report.pdf');
+const urlResult = await reader.read('https://example.com/doc.xlsx');
+const bufferResult = await reader.read({ type: 'buffer', buffer: myBuffer, filename: 'doc.docx' });
+// Get merged markdown text
+const markdown = mergeTextPieces(result.pieces);
+// Metadata
+console.log(result.metadata.format);          // 'pdf'
+console.log(result.metadata.estimatedTokens); // 12500
+console.log(result.metadata.processingTimeMs); // 234
+```
+**Automatic Integration — No Code Changes Needed:**
+- **`read_file` tool** — Agents calling `read_file` on a PDF, DOCX, or XLSX get markdown text automatically
+- **`web_fetch` tool** — Documents downloaded from URLs are auto-converted to markdown
+**Content Bridge for Multimodal Input:**
+```typescript
+import { readDocumentAsContent } from '@everworker/oneringai';
+// Convert document directly to Content[] for LLM input
+const content = await readDocumentAsContent('/path/to/slides.pptx', {
+  extractImages: true,
+  imageDetail: 'auto',
+  maxImages: 20,
+});
+// Use in agent.run() with text + images
+await agent.run([
+  { type: 'input_text', text: 'Analyze this presentation:' },
+  ...content,
+]);
+```
+**Pluggable Architecture:**
+- 6 built-in format handlers (Office, Excel, PDF, HTML, Text, Image)
+- 3 default transformers (header, table formatting, truncation)
+- Custom handlers and transformers via `DocumentReader.create({ handlers, ... })`
+- All heavy dependencies lazy-loaded (officeparser, exceljs, unpdf)
+**Image Filtering:**
+- Configurable min dimensions, min size, max count, pattern exclusions
+- Automatically removes junk images (logos, icons, tiny backgrounds)
+- Applied both at extraction time and at content conversion time
+See the [User Guide](./USER_GUIDE.md#document-reader) for complete API reference and configuration options.
+### 18. External API Integration
 Connect your AI agents to 35+ external services with enterprise-grade resilience:
@@ -1315,4 +1468,4 @@ MIT License - See [LICENSE](./LICENSE) file.
 ---
-**Version:** 0.2.0 | **Last Updated:** 2026-02-09 | **[User Guide](./USER_GUIDE.md)** | **[API Reference](./API_REFERENCE.md)** | **[Changelog](./CHANGELOG.md)**
+**Version:** 0.2.1 | **Last Updated:** 2026-02-11 | **[User Guide](./USER_GUIDE.md)** | **[API Reference](./API_REFERENCE.md)** | **[Changelog](./CHANGELOG.md)**

package/dist/capabilities/agents/index.d.cts CHANGED Viewed

@@ -1,4 +1,4 @@
-export { aD as AfterToolContext, av as AgentEventName, A as AgentEvents, ay as AgenticLoopEventName, ax as AgenticLoopEvents, aG as ApprovalResult, aE as ApproveToolContext, m as AuditEntry, aC as BeforeToolContext, aI as ExecutionCompleteEvent, aw as ExecutionConfig, E as ExecutionContext, l as ExecutionMetrics, aH as ExecutionStartEvent, j as HistoryMode, aA as Hook, H as HookConfig, au as HookManager, az as HookName, aL as LLMRequestEvent, aM as LLMResponseEvent, aB as ModifyingHook, aK as ToolCompleteEvent, aF as ToolModification, aJ as ToolStartEvent } from '../../index-MJ14lkui.cjs';
+export { aD as AfterToolContext, av as AgentEventName, A as AgentEvents, ay as AgenticLoopEventName, ax as AgenticLoopEvents, aG as ApprovalResult, aE as ApproveToolContext, m as AuditEntry, aC as BeforeToolContext, aI as ExecutionCompleteEvent, aw as ExecutionConfig, E as ExecutionContext, l as ExecutionMetrics, aH as ExecutionStartEvent, j as HistoryMode, aA as Hook, H as HookConfig, au as HookManager, az as HookName, aL as LLMRequestEvent, aM as LLMResponseEvent, aB as ModifyingHook, aK as ToolCompleteEvent, aF as ToolModification, aJ as ToolStartEvent } from '../../index-D62LXWdW.cjs';
 import '../../IProvider-c4QCbPjn.cjs';
 import '../../Vendor-DYh_bzwo.cjs';
 import 'eventemitter3';

package/dist/capabilities/agents/index.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-export { aD as AfterToolContext, av as AgentEventName, A as AgentEvents, ay as AgenticLoopEventName, ax as AgenticLoopEvents, aG as ApprovalResult, aE as ApproveToolContext, m as AuditEntry, aC as BeforeToolContext, aI as ExecutionCompleteEvent, aw as ExecutionConfig, E as ExecutionContext, l as ExecutionMetrics, aH as ExecutionStartEvent, j as HistoryMode, aA as Hook, H as HookConfig, au as HookManager, az as HookName, aL as LLMRequestEvent, aM as LLMResponseEvent, aB as ModifyingHook, aK as ToolCompleteEvent, aF as ToolModification, aJ as ToolStartEvent } from '../../index-B5UaeEvK.js';
+export { aD as AfterToolContext, av as AgentEventName, A as AgentEvents, ay as AgenticLoopEventName, ax as AgenticLoopEvents, aG as ApprovalResult, aE as ApproveToolContext, m as AuditEntry, aC as BeforeToolContext, aI as ExecutionCompleteEvent, aw as ExecutionConfig, E as ExecutionContext, l as ExecutionMetrics, aH as ExecutionStartEvent, j as HistoryMode, aA as Hook, H as HookConfig, au as HookManager, az as HookName, aL as LLMRequestEvent, aM as LLMResponseEvent, aB as ModifyingHook, aK as ToolCompleteEvent, aF as ToolModification, aJ as ToolStartEvent } from '../../index-DVb6vfA3.js';
 import '../../IProvider-DcYJ3YE-.js';
 import '../../Vendor-DYh_bzwo.js';
 import 'eventemitter3';