mdream 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -99,16 +99,16 @@ pnpm add mdream
99
99
  ### Usage
100
100
 
101
101
  Mdream provides two utils for working with HTML; both process content as a stream.
102
- - `syncHtmlToMarkdown`: Useful if you already have the entire HTML payload you want to convert.
102
+ - `htmlToMarkdown`: Useful if you already have the entire HTML payload you want to convert.
103
103
  - `streamHtmlToMarkdown`: Best practice if you are fetching the HTML or reading it from a local file.
104
104
 
105
105
  **Convert existing HTML**
106
106
 
107
107
  ```ts
108
- import { syncHtmlToMarkdown } from 'mdream'
108
+ import { htmlToMarkdown } from 'mdream'
109
109
 
110
110
  // Simple conversion
111
- const markdown = syncHtmlToMarkdown('<h1>Hello World</h1>')
111
+ const markdown = htmlToMarkdown('<h1>Hello World</h1>')
112
112
  console.log(markdown) // # Hello World
113
113
  ````
114
114
 
@@ -138,7 +138,7 @@ for await (const chunk of markdownGenerator) {
138
138
  Mdream now features a powerful plugin system that allows you to customize and extend the HTML-to-Markdown conversion process.
139
139
 
140
140
  ```ts
141
- import { createPlugin, filterUnsupportedTags, syncHtmlToMarkdown, withTailwind } from 'mdream'
141
+ import { createPlugin, filterUnsupportedTags, htmlToMarkdown, withTailwind } from 'mdream'
142
142
 
143
143
  // Create a custom plugin
144
144
  const myPlugin = createPlugin({
@@ -153,7 +153,7 @@ const myPlugin = createPlugin({
153
153
 
154
154
  // Use multiple plugins together
155
155
  const html = '<div role="alert" class="font-bold">Important message</div>'
156
- const markdown = syncHtmlToMarkdown(html, {
156
+ const markdown = htmlToMarkdown(html, {
157
157
  plugins: [
158
158
  withTailwind(), // Apply Tailwind class processing
159
159
  filterUnsupportedTags(), // Filter out unsupported tags
@@ -164,6 +164,37 @@ const markdown = syncHtmlToMarkdown(html, {
164
164
  console.log(markdown) // "⚠️ **Important message** ⚠️"
165
165
  ```
166
166
 
167
+ #### Extraction Plugin
168
+
169
+ Extract specific elements and their content during HTML processing for data analysis or content discovery:
170
+
171
+ ```ts
172
+ import { extractionPlugin, htmlToMarkdown } from 'mdream'
173
+
174
+ const html = `
175
+ <article>
176
+ <h2>Getting Started</h2>
177
+ <p>This is a tutorial about web scraping.</p>
178
+ <img src="/hero.jpg" alt="Hero image" />
179
+ </article>
180
+ `
181
+
182
+ // Extract elements using CSS selectors
183
+ const plugin = extractionPlugin({
184
+ 'h2': (element) => {
185
+ console.log('Heading:', element.textContent) // "Getting Started"
186
+ },
187
+ 'img[alt]': (element) => {
188
+ console.log('Image:', element.attributes.src, element.attributes.alt)
189
+ // "Image: /hero.jpg Hero image"
190
+ }
191
+ })
192
+
193
+ htmlToMarkdown(html, { plugins: [plugin] })
194
+ ```
195
+
196
+ The extraction plugin provides memory-efficient element extraction with full text content and attributes, perfect for SEO analysis, content discovery, and data mining.
197
+
167
198
  For more details, see the [plugin documentation](./docs/plugins.md).
168
199
 
169
200
  ## Credits
package/dist/index.d.mts CHANGED
@@ -10,6 +10,6 @@ import { ReadableStream } from 'node:stream/web';
10
10
  */
11
11
  declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options?: HTMLToMarkdownOptions): AsyncIterable<string>;
12
12
 
13
- declare function syncHtmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
13
+ declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
14
14
 
15
- export { HTMLToMarkdownOptions, streamHtmlToMarkdown, syncHtmlToMarkdown };
15
+ export { HTMLToMarkdownOptions, htmlToMarkdown, streamHtmlToMarkdown };
package/dist/index.d.ts CHANGED
@@ -10,6 +10,6 @@ import { ReadableStream } from 'node:stream/web';
10
10
  */
11
11
  declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options?: HTMLToMarkdownOptions): AsyncIterable<string>;
12
12
 
13
- declare function syncHtmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
13
+ declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
14
14
 
15
- export { HTMLToMarkdownOptions, streamHtmlToMarkdown, syncHtmlToMarkdown };
15
+ export { HTMLToMarkdownOptions, htmlToMarkdown, streamHtmlToMarkdown };
package/dist/index.mjs CHANGED
@@ -2,7 +2,7 @@ import { p as processPartialHTMLToMarkdown } from './shared/mdream.DUeWbUFG.mjs'
2
2
  export { s as streamHtmlToMarkdown } from './shared/mdream.DUeWbUFG.mjs';
3
3
  import './shared/mdream.-hdaPj9a.mjs';
4
4
 
5
- function syncHtmlToMarkdown(html, options = {}) {
5
+ function htmlToMarkdown(html, options = {}) {
6
6
  const state = {
7
7
  options
8
8
  };
@@ -10,4 +10,4 @@ function syncHtmlToMarkdown(html, options = {}) {
10
10
  return result.trimEnd();
11
11
  }
12
12
 
13
- export { syncHtmlToMarkdown };
13
+ export { htmlToMarkdown };
package/dist/plugins.mjs CHANGED
@@ -1,4 +1,4 @@
1
1
  export { c as createPlugin, f as frontmatterPlugin } from './shared/mdream.cpEmpxyh.mjs';
2
- export { f as filterPlugin, i as isolateMainPlugin, t as tailwindPlugin } from './shared/mdream.5zaIXVJz.mjs';
2
+ export { f as filterPlugin, i as isolateMainPlugin, t as tailwindPlugin } from './shared/mdream.DEM9pag4.mjs';
3
3
  export { r as readabilityPlugin } from './shared/mdream.C8ruysN5.mjs';
4
4
  import './shared/mdream.-hdaPj9a.mjs';
@@ -1,5 +1,5 @@
1
1
  import { y as TAG_FORM, t as TAG_FIELDSET, q as TAG_OBJECT, r as TAG_EMBED, a0 as TAG_FIGURE, B as TAG_FOOTER, z as TAG_ASIDE, s as TAG_IFRAME, w as TAG_INPUT, v as TAG_TEXTAREA, u as TAG_SELECT, x as TAG_BUTTON, A as TAG_NAV } from '../shared/mdream.-hdaPj9a.mjs';
2
- import { i as isolateMainPlugin, t as tailwindPlugin, f as filterPlugin } from '../shared/mdream.5zaIXVJz.mjs';
2
+ import { i as isolateMainPlugin, t as tailwindPlugin, f as filterPlugin } from '../shared/mdream.DEM9pag4.mjs';
3
3
  import { f as frontmatterPlugin } from '../shared/mdream.cpEmpxyh.mjs';
4
4
 
5
5
  function withMinimalPreset(options = {}) {
@@ -103,7 +103,6 @@ function parseSelector(selector) {
103
103
  }
104
104
  const selectorParts = [];
105
105
  let current = "";
106
- let inAttribute = false;
107
106
  for (let i = 0; i < selector.length; i++) {
108
107
  const char = selector[i];
109
108
  if ((char === "." || char === "#" || char === "[") && current) {
@@ -120,13 +119,6 @@ function parseSelector(selector) {
120
119
  } else {
121
120
  current += char;
122
121
  }
123
- if (char === "[")
124
- inAttribute = true;
125
- if (char === "]")
126
- inAttribute = false;
127
- if (inAttribute && char !== "[") {
128
- continue;
129
- }
130
122
  }
131
123
  if (current) {
132
124
  if (current[0] === ".") {
@@ -144,6 +136,7 @@ function parseSelector(selector) {
144
136
  }
145
137
  return new CompoundSelector(selectorParts);
146
138
  }
139
+
147
140
  function filterPlugin(options = {}) {
148
141
  const includeSelectors = options.include?.map((selector) => {
149
142
  if (typeof selector === "string") {
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.1.1",
4
+ "version": "0.2.1",
5
5
  "description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",