@xberg-io/opencode-xberg 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.opencode/plugins/xberg.js +132 -0
- package/README.md +129 -0
- package/assets/icon.svg +8 -0
- package/assets/logo.png +0 -0
- package/package.json +42 -0
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import { spawn } from "node:child_process";
|
|
2
|
+
import { tool } from "@opencode-ai/plugin";
|
|
3
|
+
|
|
4
|
+
// Schema builder exposed by the plugin SDK's `tool` helper.
const { schema } = tool;

// Output format forwarded to the CLI's `--format` flag; falls back to "json".
const wireFormatChoices = schema.enum(["text", "json", "toon"]);
const wireFormat = wireFormatChoices.default("json").describe("CLI output format.");

// Optional rendering format forwarded to the CLI's `--content-format` flag.
const contentFormatChoices = schema.enum(["plain", "markdown", "djot", "html", "json"]);
const contentFormat = contentFormatChoices
  .optional()
  .describe("Document content rendering format.");
|
|
15
|
+
|
|
16
|
+
/**
 * Tells whether an optional argument was actually supplied.
 *
 * @param {*} value - Candidate argument value.
 * @returns {boolean} `false` for `undefined`, `null`, and the empty string;
 *   `true` for everything else (including `0` and `false`).
 */
function hasValue(value) {
  if (value === undefined || value === null) {
    return false;
  }
  return value !== "";
}
|
|
19
|
+
|
|
20
|
+
/**
 * Appends a `name value` option pair to a CLI argument list, but only when
 * the value was supplied (see `hasValue`).
 *
 * @param {string[]} args - Argument list, mutated in place.
 * @param {string} name - Option flag, e.g. `--mime-type`.
 * @param {*} value - Option value; stringified before being appended.
 * @returns {void}
 */
function pushOption(args, name, value) {
  if (!hasValue(value)) {
    return;
  }

  const rendered = String(value);
  args.push(name, rendered);
}
|
|
25
|
+
|
|
26
|
+
/**
 * Verifies that an optional argument, when supplied, parses as JSON.
 *
 * Missing values (`undefined`, `null`, or the empty string) are accepted
 * without parsing. The parsed result is discarded; only validity is checked.
 *
 * @param {*} value - Candidate JSON text.
 * @param {string} name - Argument name used as the prefix of the error message.
 * @returns {void}
 * @throws {Error} When `value` is present but is not valid JSON. The original
 *   parse error is attached as `cause` so its stack is not lost.
 */
function validateJson(value, name) {
  if (!hasValue(value)) {
    return;
  }

  try {
    JSON.parse(value);
  } catch (error) {
    throw new Error(`${name} must be valid JSON: ${error.message}`, { cause: error });
  }
}
|
|
37
|
+
|
|
38
|
+
/**
 * Runs the `xberg` CLI and reports the outcome as a tool result.
 *
 * @param {string[]} args - CLI arguments; `args[0]` is reported as the
 *   subcommand in the result title and metadata.
 * @param {object} [context] - Tool execution context. `directory` (then
 *   `worktree`, then `process.cwd()`) is used as the working directory and
 *   `abort` is handed to `spawn` as its `signal`.
 * @returns {Promise<{title: string, output: string, metadata: object}>}
 *   Resolves once the process closes, whatever its exit code, and also when
 *   the binary cannot be found. Rejects on any other `error` event from the
 *   child process.
 */
function runCli(args, context) {
  const directory = context?.directory ?? context?.worktree ?? process.cwd();

  return new Promise((resolve, reject) => {
    const child = spawn("xberg", args, {
      cwd: directory,
      env: process.env,
      signal: context?.abort,
      stdio: ["ignore", "pipe", "pipe"],
    });

    const stdout = [];
    const stderr = [];

    child.stdout.on("data", (chunk) => stdout.push(chunk));
    child.stderr.on("data", (chunk) => stderr.push(chunk));
    child.on("error", (error) => {
      if (error.code === "ENOENT") {
        // A missing binary is reported as a normal result carrying install
        // instructions rather than as a rejection. The package names match the
        // README: the CLI ships as `@xberg-io/xberg-cli` (npm) and `xberg-cli`
        // (PyPI); the plain `xberg` Python package is an SDK, not the CLI.
        // Exit code 127 presumably mirrors the shell's "command not found".
        resolve({
          title: "xberg CLI not found",
          output:
            "Install the xberg CLI with `brew install xberg-io/tap/xberg`, or run it via `npx @xberg-io/xberg-cli` / `uvx --from xberg-cli xberg`.",
          metadata: { exitCode: 127, command: "xberg", subcommand: args[0] },
        });
        return;
      }
      reject(error);
    });
    child.on("close", (exitCode, signal) => {
      const stdoutText = Buffer.concat(stdout).toString("utf8").trim();
      const stderrText = Buffer.concat(stderr).toString("utf8").trim();
      // Empty streams are dropped so the output never starts with a blank
      // section or carries a bare "stderr:" label.
      const output = [stdoutText, stderrText && `stderr:\n${stderrText}`]
        .filter(Boolean)
        .join("\n\n");

      resolve({
        title: exitCode === 0 ? `xberg ${args[0]}` : `xberg ${args[0]} failed`,
        output: output || "(no output)",
        metadata: {
          exitCode,
          signal,
          command: "xberg",
          subcommand: args[0],
        },
      });
    });
  });
}
|
|
86
|
+
|
|
87
|
+
// Wire format used when `args.format` is absent, i.e. when the host hands
// `execute` the raw arguments without applying the schema default.
const DEFAULT_WIRE_FORMAT = "json";

/**
 * Returns a path operand that cannot be mistaken for a CLI option.
 *
 * A path beginning with `-` is prefixed with `./`, which names the same file
 * relative to the working directory. Anything else is returned unchanged.
 *
 * @param {*} path - Caller-supplied path argument.
 * @returns {*} The path, prefixed with `./` when it starts with `-`.
 */
function toPathOperand(path) {
  if (typeof path === "string" && path.startsWith("-")) {
    return `./${path}`;
  }
  return path;
}

/**
 * OpenCode plugin factory exposing three tools backed by the local `xberg`
 * CLI: `xberg_extract`, `xberg_detect`, and `xberg_formats`. Each tool builds
 * an argument list and delegates to `runCli`.
 *
 * @returns {Promise<{tool: object}>} The plugin's tool registry.
 */
export const XbergPlugin = async () => ({
  tool: {
    xberg_extract: tool({
      description:
        "Extract text, tables, metadata, and images from a local document with the xberg CLI.",
      args: {
        path: schema.string().min(1).describe("Path to the local document."),
        format: wireFormat,
        content_format: contentFormat,
        mime_type: schema.string().min(1).optional().describe("Optional MIME type hint."),
        config_json: schema.string().min(2).optional().describe("Optional ExtractionConfig JSON."),
      },
      async execute(args, context) {
        // Reject malformed JSON here, before a process is spawned.
        validateJson(args.config_json, "config_json");

        const cliArgs = [
          "extract",
          toPathOperand(args.path),
          "--format",
          args.format ?? DEFAULT_WIRE_FORMAT,
        ];
        pushOption(cliArgs, "--content-format", args.content_format);
        pushOption(cliArgs, "--mime-type", args.mime_type);
        pushOption(cliArgs, "--config-json", args.config_json);

        return runCli(cliArgs, context);
      },
    }),
    xberg_detect: tool({
      description: "Detect the MIME type for a local file with the xberg CLI.",
      args: {
        path: schema.string().min(1).describe("Path to the local file."),
        format: wireFormat,
      },
      async execute(args, context) {
        return runCli(
          ["detect", toPathOperand(args.path), "--format", args.format ?? DEFAULT_WIRE_FORMAT],
          context,
        );
      },
    }),
    xberg_formats: tool({
      description: "List document formats supported by the xberg CLI.",
      args: {
        format: wireFormat,
      },
      async execute(args, context) {
        return runCli(["formats", "--format", args.format ?? DEFAULT_WIRE_FORMAT], context);
      },
    }),
  },
});

export default XbergPlugin;
|
package/README.md
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# xberg
|
|
2
|
+
|
|
3
|
+
Extract text, tables, metadata, and images from 91+ document formats — PDF, Office, images with OCR, HTML, email, archives, and academic formats — using the local `xberg` CLI in your agent.
|
|
4
|
+
|
|
5
|
+
<!-- TODO: screenshot -->
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
### From the marketplace (recommended)
|
|
10
|
+
|
|
11
|
+
Pending review for the official Claude marketplace.
|
|
12
|
+
|
|
13
|
+
Self-host:
|
|
14
|
+
|
|
15
|
+
```text
|
|
16
|
+
/plugin marketplace add xberg-io/plugins
|
|
17
|
+
/plugin install xberg@xberg
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
### Binary requirement
|
|
21
|
+
|
|
22
|
+
Install the `xberg` CLI:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
brew install xberg-io/tap/xberg
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Or run without a persistent install:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
npx @xberg-io/xberg-cli --help
|
|
32
|
+
uvx --from xberg-cli xberg --help
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Or download a prebuilt binary from the [latest GitHub release](https://github.com/xberg-io/xberg/releases/latest), or install the CLI from crates.io:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
cargo install xberg-cli --features mcp
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
The Python (`xberg`) and Node (`@xberg-io/xberg`) packages are language SDKs/bindings, not the CLI. The prebuilt CLI binaries (brew, GitHub release) include the MCP server; a source build enables it with `--features mcp` (`mcp` is not in the CLI's default feature set).
|
|
42
|
+
|
|
43
|
+
OCR ships with Tesseract by default. Install language packs for non-English documents:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
brew install tesseract-lang # macOS
|
|
47
|
+
sudo apt install tesseract-ocr-* # Debian/Ubuntu
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Skills shipped
|
|
51
|
+
|
|
52
|
+
| Skill | Trigger |
|
|
53
|
+
|-------|---------|
|
|
54
|
+
| **xberg** | Extract text, tables, metadata, and images from 91+ document formats (PDF, Office, images, HTML, email, archives, academic) using Xberg. Use when writing code that calls Xberg APIs in Python, Node.js/TypeScript, Rust, or CLI. Covers installation, extraction (sync/async), configuration (OCR, chunking, output format), batch processing, error handling, and plugins. |
|
|
55
|
+
| **extracting-with-ocr** | Use when extracting text from scanned PDFs, photographed pages, or images that have no embedded text layer. Covers OCR backends, language packs, force-OCR, and performance tuning. |
|
|
56
|
+
| **extracting-tables** | Use when extracting tabular data from PDFs, spreadsheets, or images. Covers layout-aware table detection, table model selection, output formats (markdown / JSON cells), and known limits. |
|
|
57
|
+
| **chunking** | Use when splitting extracted text into chunks for LLM context windows or RAG ingestion. Covers chunk size, overlap, markdown/yaml/semantic chunkers, tokenizer-based sizing, and the standalone `chunk` command. |
|
|
58
|
+
| **extracting-keywords** | Use when extracting keywords (YAKE/RAKE), detecting document language, or generating embeddings for RAG and search. Covers the keyword config, `--detect-language`, and the standalone `embed` command. |
|
|
59
|
+
| **batch-extraction** | Use when extracting from many files at once with shared config, bounded parallelism, per-file overrides, and error recovery. Covers the `batch` command, `--file-configs`, `--max-concurrent`, and output layout. |
|
|
60
|
+
| **picking-a-format** | Use when choosing an output format for extracted documents — text, markdown, djot, html, or JSON. Maps consumer (LLM, parser, archive) to the right `--format` / `--content-format` pair. |
|
|
61
|
+
|
|
62
|
+
**Reference materials** (linked from the `xberg` skill):
|
|
63
|
+
|
|
64
|
+
| Reference | Content |
|
|
65
|
+
|-----------|---------|
|
|
66
|
+
| **CLI Reference** | All commands, flags, config precedence, exit codes |
|
|
67
|
+
| **Configuration Reference** | TOML/YAML/JSON formats, auto-discovery, env vars, full schema |
|
|
68
|
+
| **Supported Formats** | All 91+ formats with file extensions and MIME types |
|
|
69
|
+
| **Python API Reference** | All functions, config classes, plugin protocols, exact signatures |
|
|
70
|
+
| **Node.js API Reference** | All functions, TypeScript interfaces, worker pool APIs |
|
|
71
|
+
| **Rust API Reference** | All functions with feature gates, structs, Cargo.toml examples |
|
|
72
|
+
| **Advanced Features** | Plugins, embeddings, MCP server, API server, security limits |
|
|
73
|
+
| **Other Language Bindings** | Go, Ruby, Java, C#, PHP, Elixir, WASM, Docker |
|
|
74
|
+
|
|
75
|
+
## MCP tools
|
|
76
|
+
|
|
77
|
+
Run `xberg mcp` to start the MCP server over stdio. The server exposes 9 tools:
|
|
78
|
+
|
|
79
|
+
**Extraction:** `extract`, `extract_batch`, `detect_mime_type`
|
|
80
|
+
|
|
81
|
+
**Cache:** `cache_stats`, `cache_clear`, `cache_manifest`, `cache_warm`
|
|
82
|
+
|
|
83
|
+
**Metadata:** `list_formats`, `get_version`
|
|
84
|
+
|
|
85
|
+
The `extract` and `extract_batch` tools accept an optional `config` object to override defaults and a `response_format` of `json` (default) or `toon`. There is no separate `extract_bytes` tool — `extract`/`extract_batch` take an `input` object whose `kind` is `bytes` or `uri`. The MCP server is gated behind the CLI's `mcp` build feature.
|
|
86
|
+
|
|
87
|
+
## Configuration
|
|
88
|
+
|
|
89
|
+
Xberg auto-discovers `xberg.toml` from the current directory upward. Set config via:
|
|
90
|
+
|
|
91
|
+
1. **Environment variable**: `XBERG_CONFIG_JSON='{"output_format":"markdown"}'`
|
|
92
|
+
2. **Config file** (TOML): `xberg.toml` in cwd or a parent directory.
|
|
93
|
+
3. **CLI flag**: `xberg extract doc.pdf --content-format markdown`
|
|
94
|
+
|
|
95
|
+
See `skills/xberg/references/configuration.md` for the full schema and precedence rules.
|
|
96
|
+
|
|
97
|
+
## Examples
|
|
98
|
+
|
|
99
|
+
Extract a PDF to plain text and print it:
|
|
100
|
+
|
|
101
|
+
```text
|
|
102
|
+
xberg extract document.pdf
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Extract with markdown formatting for LLM context:
|
|
106
|
+
|
|
107
|
+
```text
|
|
108
|
+
xberg extract report.pdf --content-format markdown
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Extract tables from a spreadsheet as JSON:
|
|
112
|
+
|
|
113
|
+
```text
|
|
114
|
+
xberg extract data.xlsx --format json
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Versioning
|
|
118
|
+
|
|
119
|
+
The plugin version tracks the marketplace `VERSION` file. See [CHANGELOG.md](../../CHANGELOG.md) for release notes.
|
|
120
|
+
|
|
121
|
+
## License
|
|
122
|
+
|
|
123
|
+
MIT. The skill content uses Elastic-2.0 references to the upstream [xberg](https://github.com/xberg-io/xberg) repository.
|
|
124
|
+
|
|
125
|
+
## See also
|
|
126
|
+
|
|
127
|
+
- **Marketplace**: [xberg-io/plugins](https://github.com/xberg-io/plugins)
|
|
128
|
+
- **Upstream**: [xberg-io/xberg](https://github.com/xberg-io/xberg)
|
|
129
|
+
- **Sibling plugins**: [crawlberg](../crawlberg/README.md)
|
package/assets/icon.svg
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256" width="256" height="256">
|
|
2
|
+
<rect width="256" height="256" rx="48" fill="#1F6FEB"/>
|
|
3
|
+
<path d="M64 64 L152 64 L192 104 L192 200 C192 204.4 188.4 208 184 208 L64 208 C59.6 208 56 204.4 56 200 L56 72 C56 67.6 59.6 64 64 64 Z" fill="white"/>
|
|
4
|
+
<path d="M152 64 L152 96 C152 100.4 155.6 104 160 104 L192 104 Z" fill="#1F6FEB"/>
|
|
5
|
+
<rect x="80" y="128" width="88" height="12" rx="6" fill="#1F6FEB"/>
|
|
6
|
+
<rect x="80" y="152" width="96" height="12" rx="6" fill="#1F6FEB"/>
|
|
7
|
+
<rect x="80" y="176" width="64" height="12" rx="6" fill="#1F6FEB"/>
|
|
8
|
+
</svg>
|
package/assets/logo.png
ADDED
|
Binary file
|
package/package.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@xberg-io/opencode-xberg",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "OpenCode tools for local document extraction with the xberg CLI.",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"document-intelligence",
|
|
7
|
+
"extraction",
|
|
8
|
+
"ocr",
|
|
9
|
+
"opencode",
|
|
10
|
+
"pdf"
|
|
11
|
+
],
|
|
12
|
+
"homepage": "https://github.com/xberg-io/plugins/tree/main/plugins/xberg",
|
|
13
|
+
"bugs": {
|
|
14
|
+
"url": "https://github.com/xberg-io/plugins/issues"
|
|
15
|
+
},
|
|
16
|
+
"license": "MIT",
|
|
17
|
+
"repository": {
|
|
18
|
+
"type": "git",
|
|
19
|
+
"url": "git+https://github.com/xberg-io/plugins.git",
|
|
20
|
+
"directory": "plugins/xberg"
|
|
21
|
+
},
|
|
22
|
+
"files": [
|
|
23
|
+
".opencode/",
|
|
24
|
+
"assets/",
|
|
25
|
+
"README.md"
|
|
26
|
+
],
|
|
27
|
+
"type": "module",
|
|
28
|
+
"main": ".opencode/plugins/xberg.js",
|
|
29
|
+
"exports": {
|
|
30
|
+
".": "./.opencode/plugins/xberg.js"
|
|
31
|
+
},
|
|
32
|
+
"publishConfig": {
|
|
33
|
+
"access": "public",
|
|
34
|
+
"provenance": true
|
|
35
|
+
},
|
|
36
|
+
"dependencies": {
|
|
37
|
+
"@opencode-ai/plugin": "^1.17.8"
|
|
38
|
+
},
|
|
39
|
+
"engines": {
|
|
40
|
+
"node": ">=22.14.0"
|
|
41
|
+
}
|
|
42
|
+
}
|