@xberg-io/opencode-html-to-markdown 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,202 @@
1
+ import { spawn } from "node:child_process";
2
+ import { tool } from "@opencode-ai/plugin";
3
+
4
// Schema builder exposed by the OpenCode plugin helper.
const schema = tool.schema;

/**
 * Builds an optional enum argument schema with a human-readable description.
 *
 * @param {string[]} values - Accepted literal values.
 * @param {string} description - Text attached to the schema via `.describe()`.
 * @returns The described, optional enum schema.
 */
function optionalEnum(values, description) {
  return schema.enum(values).optional().describe(description);
}

// Shared argument schemas reused by several tool definitions.
const headingStyle = optionalEnum(
  ["atx", "underlined", "atx-closed"],
  "Markdown heading style. Default: atx.",
);

const codeBlockStyle = optionalEnum(
  ["backticks", "indented", "tildes"],
  "Code block fence style. Default: backticks.",
);

const outputFormat = optionalEnum(
  ["markdown", "djot"],
  "Output markup format. Default: markdown.",
);

const preset = optionalEnum(
  ["minimal", "standard", "aggressive"],
  "Preprocessing aggressiveness. Requires `preprocess`. Default: standard.",
);
25
+
26
/**
 * Reports whether an argument value carries usable content.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `false` for `undefined`, `null`, and the empty string;
 *   `true` for everything else (including `0` and `false`).
 */
function hasValue(value) {
  const emptyMarkers = [undefined, null, ""];
  return !emptyMarkers.includes(value);
}
29
+
30
/**
 * Appends a `name value` option pair to `args` when a value was supplied.
 *
 * @param {string[]} args - Argument list to extend in place.
 * @param {string} name - Option name, e.g. `--preset`.
 * @param {unknown} value - Option value; skipped when `hasValue` rejects it,
 *   otherwise stringified with `String()`.
 * @returns {void}
 */
function pushOption(args, name, value) {
  if (!hasValue(value)) {
    return;
  }
  const rendered = String(value);
  args.push(name, rendered);
}
35
+
36
/**
 * Appends a bare flag to `args` only when `value` is exactly `true`.
 *
 * @param {string[]} args - Argument list to extend in place.
 * @param {string} name - Flag name, e.g. `--preprocess`.
 * @param {unknown} value - Toggle; anything other than the boolean `true`
 *   (including truthy strings or numbers) leaves `args` untouched.
 * @returns {void}
 */
function pushFlag(args, name, value) {
  if (value !== true) {
    return;
  }
  args.push(name);
}
41
+
42
/**
 * Runs the `html-to-markdown` CLI and collects its output into a tool result.
 *
 * @param {string[]} args - Command-line arguments passed to the CLI.
 * @param {object} [context] - Tool context. `directory` (falling back to
 *   `worktree`, then `process.cwd()`) is used as the working directory and
 *   `abort`, when present, is handed to `spawn` as its `signal`.
 * @param {string} [stdin] - Text piped to the CLI's standard input. When
 *   `undefined`, the child's stdin is not connected at all.
 * @returns {Promise<{title: string, output: string, metadata: object}>}
 *   Resolves once the process closes (whatever its exit code) or when the
 *   binary cannot be found; stdout and stderr are trimmed and joined into
 *   `output`.
 * @throws Rejects with the spawn error for any failure other than `ENOENT`.
 */
function runCli(args, context, stdin) {
  const directory = context?.directory ?? context?.worktree ?? process.cwd();
  const pipeStdin = stdin !== undefined;

  return new Promise((resolve, reject) => {
    const child = spawn("html-to-markdown", args, {
      cwd: directory,
      env: process.env,
      signal: context?.abort,
      stdio: [pipeStdin ? "pipe" : "ignore", "pipe", "pipe"],
    });

    const stdout = [];
    const stderr = [];

    child.stdout.on("data", (chunk) => stdout.push(chunk));
    child.stderr.on("data", (chunk) => stderr.push(chunk));
    child.on("error", (error) => {
      if (error.code === "ENOENT") {
        // A missing binary is an expected condition: report it as a normal
        // tool result carrying the conventional "command not found" code.
        resolve({
          title: "html-to-markdown CLI not found",
          output:
            "Install the html-to-markdown CLI with `brew install xberg-io/tap/html-to-markdown`, or run it via `npx @xberg-io/html-to-markdown-cli` / `uvx --from html-to-markdown-cli html-to-markdown`.",
          metadata: { exitCode: 127, command: "html-to-markdown" },
        });
        return;
      }
      reject(error);
    });
    child.on("close", (exitCode, signal) => {
      const stdoutText = Buffer.concat(stdout).toString("utf8").trim();
      const stderrText = Buffer.concat(stderr).toString("utf8").trim();
      const output = [stdoutText, stderrText && `stderr:\n${stderrText}`]
        .filter(Boolean)
        .join("\n\n");

      resolve({
        title: exitCode === 0 ? "html-to-markdown" : "html-to-markdown failed",
        output: output || "(no output)",
        metadata: { exitCode, signal, command: "html-to-markdown" },
      });
    });

    if (pipeStdin) {
      // Writing to a pipe the child never opened or already closed (spawn
      // failure, early exit) emits 'error' on the stdin stream. Without a
      // listener that becomes an uncaught exception in the host process.
      // The outcome is already reported through the child's 'error' and
      // 'close' events, so the stream error itself is deliberately dropped.
      child.stdin.on("error", () => {});
      child.stdin.end(stdin);
    }
  });
}
90
+
91
/**
 * Translates the shared formatting parameters of a tool call into CLI
 * arguments, appending them to `args` in a fixed order.
 *
 * @param {string[]} args - Argument list to extend in place.
 * @param {object} params - Tool arguments; reads `heading_style`,
 *   `code_block_style`, `output_format`, `preprocess`, and `preset`.
 * @returns {void}
 */
function styleArgs(args, params) {
  // Value-carrying options, emitted first and in this order.
  const valueOptions = [
    ["--heading-style", "heading_style"],
    ["--code-block-style", "code_block_style"],
    ["--output-format", "output_format"],
  ];
  for (const [flag, key] of valueOptions) {
    pushOption(args, flag, params[key]);
  }

  pushFlag(args, "--preprocess", params.preprocess);
  pushOption(args, "--preset", params.preset);
}
98
+
99
/**
 * Makes a caller-supplied path safe to pass as the CLI's positional argument.
 *
 * A value beginning with `-` would be parsed by the CLI as a flag rather than
 * a file name, so it is anchored with `./`. This includes a bare `-`: the
 * path-based calls below never pipe anything to stdin.
 *
 * @param {string} path - Path as provided in the tool arguments.
 * @returns {string} The path, prefixed with `./` when it starts with `-`.
 */
function positionalPath(path) {
  const text = String(path);
  return text.startsWith("-") ? `./${text}` : text;
}

/**
 * OpenCode plugin factory. Resolves to an object whose `tool` map registers
 * three tools that shell out to the `html-to-markdown` CLI via `runCli`:
 * `html_to_markdown_convert`, `html_to_markdown_fetch_url`, and
 * `html_to_markdown_extract`.
 */
export const HtmlToMarkdownPlugin = async () => ({
  tool: {
    html_to_markdown_convert: tool({
      description:
        "Convert an HTML file or HTML string to Markdown (or Djot) with the html-to-markdown CLI. Provide either `path` or `html`.",
      args: {
        path: schema.string().min(1).optional().describe("Path to a local HTML file."),
        html: schema
          .string()
          .min(1)
          .optional()
          .describe("Inline HTML to convert (used when `path` is omitted)."),
        heading_style: headingStyle,
        code_block_style: codeBlockStyle,
        output_format: outputFormat,
        preprocess: schema
          .boolean()
          .optional()
          .describe("Strip navigation, ads, and forms before converting."),
        preset,
      },
      // `path` takes precedence over `html`; inline HTML is sent on stdin.
      async execute(args, context) {
        const cliArgs = [];
        styleArgs(cliArgs, args);

        if (hasValue(args.path)) {
          cliArgs.push(positionalPath(args.path));
          return runCli(cliArgs, context);
        }
        if (hasValue(args.html)) {
          return runCli(cliArgs, context, args.html);
        }
        throw new Error("Provide either `path` or `html`.");
      },
    }),
    html_to_markdown_fetch_url: tool({
      description:
        "Fetch a URL and convert its HTML to Markdown (or Djot) with the html-to-markdown CLI.",
      args: {
        url: schema.string().min(1).describe("URL to fetch and convert."),
        heading_style: headingStyle,
        code_block_style: codeBlockStyle,
        output_format: outputFormat,
        preprocess: schema
          .boolean()
          .optional()
          .describe("Strip navigation, ads, and forms before converting."),
        preset,
        user_agent: schema
          .string()
          .min(1)
          .optional()
          .describe("Custom User-Agent header for the fetch."),
      },
      // The URL and user agent are passed as option values, then the shared
      // formatting options are appended.
      async execute(args, context) {
        const cliArgs = ["--url", args.url];
        pushOption(cliArgs, "--user-agent", args.user_agent);
        styleArgs(cliArgs, args);
        return runCli(cliArgs, context);
      },
    }),
    html_to_markdown_extract: tool({
      description:
        "Extract structured metadata, tables, and (optionally) document structure from HTML as JSON. Returns the full ConversionResult. Provide `path`, `html`, or `url`.",
      args: {
        path: schema.string().min(1).optional().describe("Path to a local HTML file."),
        html: schema
          .string()
          .min(1)
          .optional()
          .describe("Inline HTML to analyze (used when `path` and `url` are omitted)."),
        url: schema.string().min(1).optional().describe("URL to fetch and analyze."),
        include_structure: schema
          .boolean()
          .optional()
          .describe("Include the document structure tree in the JSON output."),
        no_content: schema
          .boolean()
          .optional()
          .describe("Suppress the Markdown content field — return metadata/tables/images only."),
      },
      // Input precedence: `url`, then `path`, then inline `html` on stdin.
      async execute(args, context) {
        const cliArgs = ["--json"];
        pushFlag(cliArgs, "--include-structure", args.include_structure);
        pushFlag(cliArgs, "--no-content", args.no_content);

        if (hasValue(args.url)) {
          cliArgs.push("--url", args.url);
          return runCli(cliArgs, context);
        }
        if (hasValue(args.path)) {
          cliArgs.push(positionalPath(args.path));
          return runCli(cliArgs, context);
        }
        if (hasValue(args.html)) {
          return runCli(cliArgs, context, args.html);
        }
        throw new Error("Provide one of `path`, `html`, or `url`.");
      },
    }),
  },
});
201
+
202
+ export default HtmlToMarkdownPlugin;
package/README.md ADDED
@@ -0,0 +1,175 @@
1
+ # html-to-markdown
2
+
3
+ Fast, lossless HTML→Markdown conversion with structured metadata, tables, and document-structure extraction — using the local `html-to-markdown` CLI in your agent.
4
+
5
+ <!-- TODO: screenshot -->
6
+
7
+ ## Install
8
+
9
+ ### From the marketplace (recommended)
10
+
11
+ Listing in the official Claude marketplace is pending review.
12
+
13
+ Self-host:
14
+
15
+ ```text
16
+ /plugin marketplace add xberg-io/plugins
17
+ /plugin install html-to-markdown@xberg
18
+ ```
19
+
20
+ ### Binary requirement
21
+
22
+ The bundled MCP launcher (`scripts/mcp-launch.sh`) resolves an
23
+ `html-to-markdown` binary automatically on first run: it reuses one already on
24
+ `PATH`, then tries `npx`/`uvx`, then Homebrew, then a prebuilt download from the
25
+ tool's latest GitHub release. No manual install is required to use the MCP
26
+ server.
27
+
28
+ To install the CLI yourself:
29
+
30
+ ```bash
31
+ # (Homebrew 6.0+ requires explicit trust for third-party taps)
32
+ brew trust xberg-io/tap
33
+ brew install xberg-io/tap/html-to-markdown
34
+ # or run it without a persistent install (the CLI proxy package self-installs the binary):
35
+ npx @xberg-io/html-to-markdown-cli --help
36
+ uvx --from html-to-markdown-cli html-to-markdown --help
37
+ # or download a prebuilt binary from the latest GitHub release:
38
+ # https://github.com/xberg-io/html-to-markdown/releases/latest
39
+ # or build from source:
40
+ cargo install html-to-markdown-cli
41
+ ```
42
+
43
+ The skills also cover the language SDKs. Install the one you need:
44
+
45
+ ```bash
46
+ pip install html-to-markdown # Python
47
+ npm install @xberg-io/html-to-markdown # TypeScript / Node.js
48
+ cargo add html-to-markdown-rs # Rust
49
+ gem install html-to-markdown # Ruby
50
+ ```
51
+
52
+ ## Skills shipped
53
+
54
+ | Skill | Trigger |
55
+ |-------|---------|
56
+ | **html-to-markdown** | Convert HTML to Markdown, Djot, or plain text with structured extraction. Use when writing code that calls html-to-markdown APIs in Rust, Python, TypeScript, Go, Ruby, PHP, Java, C#, Elixir, R, C, or WASM. Covers installation, conversion, configuration, metadata extraction, tables, document structure, and CLI usage. |
57
+ | **converting-html** | Use when converting HTML to Markdown, Djot, or plain text. Covers output formats, heading and code-block styles, escaping, wrapping, and HTML preprocessing. |
58
+ | **extracting-metadata** | Use when extracting metadata from HTML — title, description, language, Open Graph, JSON-LD / Microdata / RDFa, headers, links, and images. Covers the `--json` output shape and the `--extract-metadata` flag. |
59
+ | **extracting-tables** | Use when extracting tabular data from HTML. Covers GFM Markdown tables, the structured `tables` array (grid cells + pre-rendered markdown), and `<br>` handling in cells. |
60
+ | **fetching-and-converting-urls** | Use when fetching a live URL and converting it to Markdown. Covers `--url`, custom user agents, preprocessing for noisy pages, and the `--json` ConversionResult shape. |
61
+ | **using-the-mcp-server** | Use when converting HTML and extracting metadata through the MCP server's `convert_html`/`extract_metadata` tools instead of the CLI. Covers the tool surface and the auto-installing launcher. |
62
+
63
+ **Reference materials** (linked from the `html-to-markdown` skill):
64
+
65
+ | Reference | Content |
66
+ |-----------|---------|
67
+ | **CLI Reference** | All flags, output formats, JSON shape, exit codes |
68
+ | **Configuration Reference** | All 30+ ConversionOptions fields with defaults |
69
+ | **Rust API Reference** | Functions, builder options, feature flags |
70
+ | **Python API Reference** | Functions, dataclasses, type hints |
71
+ | **TypeScript API Reference** | Functions, interfaces, Buffer support |
72
+ | **Other Language Bindings** | Go, Ruby, PHP, Java, C#, Elixir, R, WASM, C FFI |
73
+
74
+ ## MCP server
75
+
76
+ The plugin auto-registers an MCP server named `html-to-markdown`, launched via
77
+ `scripts/mcp-launch.sh` (which execs `html-to-markdown mcp`). It exposes two
78
+ tools — `convert_html` (HTML string → Markdown/Djot/plain text, or the full
79
+ `ConversionResult` JSON with `json: true`) and `extract_metadata` (structured
80
+ metadata from an HTML string) — so the agent can convert HTML directly without
81
+ shelling out to the CLI. The launcher auto-installs a
82
+ binary on first run (override with
83
+ `HTML_TO_MARKDOWN_LAUNCHER=auto|npx|uvx|brew|download`). The `mcp` subcommand
84
+ ships in a recent release of the tool; an older binary on `PATH` may need an
85
+ upgrade to expose it. See the **using-the-mcp-server** skill for details.
86
+
87
+ ## CLI / SDK usage
88
+
89
+ Conversion is driven entirely by **flags** plus an optional positional `FILE` — there is no conversion subcommand. Omit `FILE` (or pass `-`) to read HTML from stdin. (The only subcommand is `mcp`, which starts the MCP server.)
90
+
91
+ ```bash
92
+ html-to-markdown input.html # convert file to stdout
93
+ html-to-markdown input.html -o output.md # convert to a file
94
+ cat page.html | html-to-markdown # read from stdin
95
+ html-to-markdown --url https://example.com # fetch and convert a URL
96
+ html-to-markdown --json input.html # full ConversionResult as JSON
97
+ ```
98
+
99
+ The same single entry point exists across the SDKs — `convert()` returns a structured `ConversionResult` (`content`, `metadata`, `tables`, `images`, `warnings`):
100
+
101
+ ```python
102
+ from html_to_markdown import convert
103
+
104
+ result = convert("<h1>Hello</h1><p>World</p>")
105
+ print(result.content) # # Hello\n\nWorld
106
+ print(result.metadata) # title, links, headers, …
107
+ ```
108
+
109
+ ```typescript
110
+ import { convert } from "@xberg-io/html-to-markdown";
111
+
112
+ // Node's convert() returns a ConversionResult object directly.
113
+ const result = convert("<h1>Hello</h1><p>World</p>");
114
+ console.log(result.content); // # Hello\n\nWorld
115
+ ```
116
+
117
+ Prefer the CLI for one-shot conversions and shell pipelines; prefer the SDKs when embedding conversion in application code.
118
+
119
+ ## Configuration
120
+
121
+ All conversion behavior is controlled by CLI flags (or the matching `ConversionOptions` fields in the SDKs):
122
+
123
+ | Flag | Values | Default | Purpose |
124
+ |------|--------|---------|---------|
125
+ | `--output-format` / `-f` | `markdown`, `djot`, `plain` | `markdown` | Output markup format. |
126
+ | `--heading-style` | `atx`, `underlined`, `atx-closed` | `atx` | Heading rendering. |
127
+ | `--code-block-style` | `backticks`, `indented`, `tildes` | `backticks` | Code fence style. |
128
+ | `--preprocess` / `-p` | flag | off | Strip nav, ads, forms before converting. |
129
+ | `--preset` | `minimal`, `standard`, `aggressive` | `standard` | Preprocessing aggressiveness (needs `--preprocess`). |
130
+ | `--wrap` / `-w`, `--wrap-width` | flag, 20–500 | off, `80` | Text wrapping. |
131
+ | `--json` | flag | off | Emit the full `ConversionResult` JSON. |
132
+ | `--include-structure` | flag | off | Add the document-structure tree (needs `--json`). |
133
+ | `--no-content` | flag | off | Extraction-only — skip the Markdown text. |
134
+
135
+ See `skills/html-to-markdown/references/configuration.md` for the full 30+ option table and `skills/html-to-markdown/references/cli-reference.md` for every flag.
136
+
137
+ ## Examples
138
+
139
+ Convert an HTML file to Markdown:
140
+
141
+ ```text
142
+ html-to-markdown article.html -o article.md
143
+ ```
144
+
145
+ Scrape a page with aggressive preprocessing:
146
+
147
+ ```text
148
+ html-to-markdown --url https://example.com/blog --preprocess --preset aggressive
149
+ ```
150
+
151
+ Extract metadata and tables only (no Markdown body):
152
+
153
+ ```text
154
+ html-to-markdown --json --no-content page.html | jq '{title: .metadata.document.title, tables: (.tables | length)}'
155
+ ```
156
+
157
+ Convert with Djot output and underlined headings:
158
+
159
+ ```text
160
+ html-to-markdown input.html --output-format djot --heading-style underlined
161
+ ```
162
+
163
+ ## Versioning
164
+
165
+ The plugin version tracks the marketplace `VERSION` file. See [CHANGELOG.md](../../CHANGELOG.md) for release notes.
166
+
167
+ ## License
168
+
169
+ MIT. The skill content references the upstream [html-to-markdown](https://github.com/xberg-io/html-to-markdown) repository.
170
+
171
+ ## See also
172
+
173
+ - **Marketplace**: [xberg-io/plugins](https://github.com/xberg-io/plugins)
174
+ - **Upstream**: [xberg-io/html-to-markdown](https://github.com/xberg-io/html-to-markdown)
175
+ - **Sibling plugins**: [xberg](../xberg/README.md), [crawlberg](../crawlberg/README.md)
@@ -0,0 +1,7 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256" width="256" height="256">
2
+ <rect width="256" height="256" rx="48" fill="#2E2D36"/>
3
+ <path d="M64 56 L152 56 L196 100 L196 196 C196 200.4 192.4 204 188 204 L64 204 C59.6 204 56 200.4 56 196 L56 64 C56 59.6 59.6 56 64 56 Z" fill="#58FBDA"/>
4
+ <path d="M152 56 L152 92 C152 96.4 155.6 100 160 100 L196 100 Z" fill="#2E2D36"/>
5
+ <path d="M84 168 L84 124 L100 144 L116 124 L116 168" fill="none" stroke="#2E2D36" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
6
+ <path d="M150 124 L150 168 M150 168 L134 150 M150 168 L166 150" fill="none" stroke="#2E2D36" stroke-width="12" stroke-linecap="round" stroke-linejoin="round"/>
7
+ </svg>
@@ -0,0 +1,26 @@
1
+ <svg width="800" height="800" viewBox="0 0 800 800" fill="none" xmlns="http://www.w3.org/2000/svg">
2
+ <circle cx="400" cy="400" r="313.839" fill="#58FBDA" stroke="#2E2D36" stroke-width="7.67819"/>
3
+ <path d="M469.155 481.996L497.619 509.815L517.95 513.534L529.157 523.7" stroke="#2E2D36" stroke-width="7.67819" stroke-linecap="round"/>
4
+ <path d="M428.146 495.931V565.751L454.576 592.876V615.587" stroke="#2E2D36" stroke-width="7.67819" stroke-linecap="round"/>
5
+ <path d="M333.878 482.343L310.522 509.468L289.15 512.543L271.199 525.733" stroke="#2E2D36" stroke-width="7.67819" stroke-linecap="round"/>
6
+ <path d="M404.521 155.839C498.389 155.839 574.411 231.075 574.411 323.792C574.411 416.509 498.389 491.744 404.521 491.744C310.653 491.744 234.631 416.509 234.631 323.792C234.631 231.075 310.653 155.839 404.521 155.839Z" fill="#79FFE6" stroke="#2E2D36" stroke-width="7.67819"/>
7
+ <path d="M401.021 218.481C450.893 218.481 491.275 258.538 491.275 307.892C491.275 357.246 450.893 397.303 401.021 397.303C351.149 397.303 310.767 357.246 310.767 307.892C310.767 258.538 351.149 218.481 401.021 218.481Z" fill="white" stroke="#2E2D36" stroke-width="6.14255"/>
8
+ <circle cx="231.379" cy="476.492" r="14.8269" fill="white" stroke="#2E2D36" stroke-width="4.60691"/>
9
+ <circle cx="256.471" cy="535.8" r="14.8269" fill="white" stroke="#2E2D36" stroke-width="4.60691"/>
10
+ <circle cx="300.109" cy="582.413" r="14.8269" fill="white" stroke="#2E2D36" stroke-width="4.60691"/>
11
+ <circle cx="401.071" cy="643.307" r="14.8269" fill="white" stroke="#2E2D36" stroke-width="4.60691"/>
12
+ <circle cx="457.998" cy="631.009" r="14.8269" fill="white" stroke="#2E2D36" stroke-width="4.60691"/>
13
+ <path d="M457.304 487.798V535.601L489.139 569.123" stroke="#2E2D36" stroke-width="7.67819" stroke-linecap="round"/>
14
+ <circle cx="499.354" cy="582.413" r="14.8269" fill="white" stroke="#2E2D36" stroke-width="4.60691"/>
15
+ <circle cx="542.695" cy="535.006" r="14.8269" fill="white" stroke="#2E2D36" stroke-width="4.60691"/>
16
+ <circle cx="562.332" cy="476.393" r="14.8269" fill="white" stroke="#2E2D36" stroke-width="4.60691"/>
17
+ <path d="M303.381 465.483L289.993 476.988H248.834" stroke="#2E2D36" stroke-width="7.67819" stroke-linecap="round"/>
18
+ <path d="M351.532 489.484V531.188L312.555 569.817" stroke="#2E2D36" stroke-width="7.67819" stroke-linecap="round"/>
19
+ <path d="M371.863 494.889V569.123L345.78 596.942V615.24" stroke="#2E2D36" stroke-width="7.67819" stroke-linecap="round"/>
20
+ <path d="M401.368 498.311V625.753" stroke="#2E2D36" stroke-width="7.67819" stroke-linecap="round"/>
21
+ <path d="M496.925 465.731L505.057 473.566H545.422" stroke="#2E2D36" stroke-width="7.67819" stroke-linecap="round"/>
22
+ <ellipse cx="401.021" cy="307.892" rx="51.671" ry="53.0099" fill="#2E2D36"/>
23
+ <ellipse cx="411.286" cy="294.106" rx="18.5956" ry="19.0915" fill="white"/>
24
+ <ellipse cx="428.493" cy="322.471" rx="8.43002" ry="8.67796" fill="white"/>
25
+ <circle cx="344.143" cy="632.299" r="14.8269" fill="white" stroke="#2E2D36" stroke-width="4.60691"/>
26
+ </svg>
package/package.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "name": "@xberg-io/opencode-html-to-markdown",
3
+ "version": "0.1.0",
4
+ "description": "OpenCode tools for HTML→Markdown conversion with the html-to-markdown CLI.",
5
+ "keywords": [
6
+ "conversion",
7
+ "html",
8
+ "html-to-markdown",
9
+ "markdown",
10
+ "opencode"
11
+ ],
12
+ "homepage": "https://github.com/xberg-io/plugins/tree/main/plugins/html-to-markdown",
13
+ "bugs": {
14
+ "url": "https://github.com/xberg-io/plugins/issues"
15
+ },
16
+ "license": "MIT",
17
+ "repository": {
18
+ "type": "git",
19
+ "url": "git+https://github.com/xberg-io/plugins.git",
20
+ "directory": "plugins/html-to-markdown"
21
+ },
22
+ "files": [
23
+ ".opencode/",
24
+ "assets/",
25
+ "README.md"
26
+ ],
27
+ "type": "module",
28
+ "main": ".opencode/plugins/html-to-markdown.js",
29
+ "exports": {
30
+ ".": "./.opencode/plugins/html-to-markdown.js"
31
+ },
32
+ "publishConfig": {
33
+ "access": "public",
34
+ "provenance": true
35
+ },
36
+ "dependencies": {
37
+ "@opencode-ai/plugin": "^1.17.8"
38
+ },
39
+ "engines": {
40
+ "node": ">=22.14.0"
41
+ }
42
+ }