@agent-infra/browser-context 0.1.6 → 0.2.0-alpha.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -32,7 +32,7 @@ await page.goto('https://example.com/article');
|
|
|
32
32
|
|
|
33
33
|
// Extract content as Markdown
|
|
34
34
|
const result = await extractContent(page);
|
|
35
|
-
console.log(result.title);
|
|
35
|
+
console.log(result.title); // Article title
|
|
36
36
|
console.log(result.content); // Clean Markdown content
|
|
37
37
|
|
|
38
38
|
await browser.close();
|
|
@@ -43,16 +43,19 @@ await browser.close();
|
|
|
43
43
|
Extract content from HTML strings:
|
|
44
44
|
|
|
45
45
|
```typescript
|
|
46
|
-
import {
|
|
46
|
+
import {
|
|
47
|
+
extractWithDefuddle,
|
|
48
|
+
extractWithReadability,
|
|
49
|
+
} from '@agent-infra/browser-context';
|
|
47
50
|
|
|
48
51
|
// Using Defuddle (primary method)
|
|
49
52
|
const result1 = await extractWithDefuddle(htmlString, url, {
|
|
50
|
-
markdown: true
|
|
53
|
+
markdown: true,
|
|
51
54
|
});
|
|
52
55
|
|
|
53
56
|
// Using Readability (fallback method)
|
|
54
57
|
const result2 = await extractWithReadability(page, {
|
|
55
|
-
markdown: true
|
|
58
|
+
markdown: true,
|
|
56
59
|
});
|
|
57
60
|
```
|
|
58
61
|
|
|
@@ -65,21 +68,24 @@ import { toMarkdown } from '@agent-infra/browser-context';
|
|
|
65
68
|
|
|
66
69
|
const html = '<h1>Title</h1><p>Content with <strong>bold</strong> text</p>';
|
|
67
70
|
const markdown = toMarkdown(html, {
|
|
68
|
-
gfmExtension: true,
|
|
69
|
-
codeBlockStyle: 'fenced',
|
|
70
|
-
headingStyle: 'atx'
|
|
71
|
+
gfmExtension: true, // Enable GitHub Flavored Markdown
|
|
72
|
+
codeBlockStyle: 'fenced', // Use fenced code blocks
|
|
73
|
+
headingStyle: 'atx', // Use # style headings
|
|
71
74
|
});
|
|
72
75
|
|
|
73
76
|
console.log(markdown);
|
|
74
77
|
// # Title
|
|
75
|
-
//
|
|
78
|
+
//
|
|
76
79
|
// Content with **bold** text
|
|
77
80
|
```
|
|
78
81
|
|
|
79
82
|
### Advanced HTML to Markdown Options
|
|
80
83
|
|
|
81
84
|
```typescript
|
|
82
|
-
import {
|
|
85
|
+
import {
|
|
86
|
+
toMarkdown,
|
|
87
|
+
DEFAULT_TAGS_TO_REMOVE,
|
|
88
|
+
} from '@agent-infra/browser-context';
|
|
83
89
|
|
|
84
90
|
const options = {
|
|
85
91
|
gfmExtension: true,
|
|
@@ -87,7 +93,7 @@ const options = {
|
|
|
87
93
|
headingStyle: 'atx' as const,
|
|
88
94
|
emDelimiter: '*',
|
|
89
95
|
strongDelimiter: '**',
|
|
90
|
-
removeTags: [...DEFAULT_TAGS_TO_REMOVE, 'footer', 'nav'] // Remove additional tags
|
|
96
|
+
removeTags: [...DEFAULT_TAGS_TO_REMOVE, 'footer', 'nav'], // Remove additional tags
|
|
91
97
|
};
|
|
92
98
|
|
|
93
99
|
const markdown = toMarkdown(htmlContent, options);
|
|
@@ -100,9 +106,11 @@ const markdown = toMarkdown(htmlContent, options);
|
|
|
100
106
|
Main extraction function that automatically tries Defuddle first, then falls back to Readability.
|
|
101
107
|
|
|
102
108
|
**Parameters:**
|
|
109
|
+
|
|
103
110
|
- `page`: Puppeteer page instance
|
|
104
111
|
|
|
105
112
|
**Returns:**
|
|
113
|
+
|
|
106
114
|
- `Promise<{title: string, content: string}>`: Extracted title and Markdown content
|
|
107
115
|
|
|
108
116
|
### `extractWithDefuddle(html: string, url: string, options: DefuddleOptions)`
|
|
@@ -110,6 +118,7 @@ Main extraction function that automatically tries Defuddle first, then falls bac
|
|
|
110
118
|
Extract content using the Defuddle library.
|
|
111
119
|
|
|
112
120
|
**Parameters:**
|
|
121
|
+
|
|
113
122
|
- `html`: HTML content string
|
|
114
123
|
- `url`: Page URL
|
|
115
124
|
- `options`: Defuddle configuration options
|
|
@@ -119,6 +128,7 @@ Extract content using the Defuddle library.
|
|
|
119
128
|
Extract content using Mozilla's Readability algorithm.
|
|
120
129
|
|
|
121
130
|
**Parameters:**
|
|
131
|
+
|
|
122
132
|
- `page`: Puppeteer page instance
|
|
123
133
|
- `options.markdown`: Whether to convert to Markdown (default: false)
|
|
124
134
|
|
|
@@ -127,10 +137,12 @@ Extract content using Mozilla's Readability algorithm.
|
|
|
127
137
|
Convert HTML to Markdown format.
|
|
128
138
|
|
|
129
139
|
**Parameters:**
|
|
140
|
+
|
|
130
141
|
- `html`: HTML content string
|
|
131
142
|
- `options`: Conversion options
|
|
132
143
|
|
|
133
144
|
**ToMarkdownOptions:**
|
|
145
|
+
|
|
134
146
|
- `gfmExtension`: Enable GitHub Flavored Markdown (default: true)
|
|
135
147
|
- `removeTags`: Array of HTML tags to remove
|
|
136
148
|
- `codeBlockStyle`: 'indented' | 'fenced'
|
|
@@ -161,14 +173,17 @@ You can customize this list using the `removeTags` option.
|
|
|
161
173
|
## Browser Compatibility
|
|
162
174
|
|
|
163
175
|
This library is designed to work with:
|
|
176
|
+
|
|
164
177
|
- Puppeteer
|
|
165
178
|
- Playwright
|
|
166
179
|
- Any browser automation tool that provides a Page-like interface
|
|
167
180
|
|
|
168
181
|
## License
|
|
169
182
|
|
|
170
|
-
Apache
|
|
183
|
+
Apache License 2.0.
|
|
184
|
+
|
|
185
|
+
## Credits
|
|
171
186
|
|
|
172
|
-
|
|
187
|
+
Special thanks to the open source projects that inspired this toolkit:
|
|
173
188
|
|
|
174
|
-
|
|
189
|
+
- [readability](https://github.com/mozilla/readability/) - A standalone version of the readability lib
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/content/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/content/index.ts"],"names":[],"mappings":"AAIA,cAAc,sBAAsB,CAAC;AACrC,cAAc,kBAAkB,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"to-markdown.d.ts","sourceRoot":"","sources":["../../src/content/to-markdown.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"to-markdown.d.ts","sourceRoot":"","sources":["../../src/content/to-markdown.ts"],"names":[],"mappings":"AAIA,OAAO,QAAQ,EAAE,EAAE,OAAO,EAAE,MAAM,UAAU,CAAC;AAG7C,eAAO,MAAM,sBAAsB,EAAE,OAAO,EAc3C,CAAC;AAEF,MAAM,WAAW,iBAAkB,SAAQ,QAAQ,CAAC,OAAO;IACzD,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,UAAU,CAAC,EAAE,OAAO,EAAE,CAAC;CACxB;AAED;;;;;GAKG;AACH,wBAAgB,UAAU,CACxB,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,iBAAsB,GAC9B,MAAM,CAgCR"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content/to-markdown.js","sources":["webpack://@agent-infra/browser-context/webpack/runtime/compat_get_default_export","webpack://@agent-infra/browser-context/webpack/runtime/define_property_getters","webpack://@agent-infra/browser-context/webpack/runtime/has_own_property","webpack://@agent-infra/browser-context/webpack/runtime/make_namespace_object","webpack://@agent-infra/browser-context/./src/content/to-markdown.ts"],"sourcesContent":["// getDefaultExport function for compatibility with non-ESM modules\n__webpack_require__.n = (module) => {\n\tvar getter = module && module.__esModule ?\n\t\t() => (module['default']) :\n\t\t() => (module);\n\t__webpack_require__.d(getter, { a: getter });\n\treturn getter;\n};\n","__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","
|
|
1
|
+
{"version":3,"file":"content/to-markdown.js","sources":["webpack://@agent-infra/browser-context/webpack/runtime/compat_get_default_export","webpack://@agent-infra/browser-context/webpack/runtime/define_property_getters","webpack://@agent-infra/browser-context/webpack/runtime/has_own_property","webpack://@agent-infra/browser-context/webpack/runtime/make_namespace_object","webpack://@agent-infra/browser-context/./src/content/to-markdown.ts"],"sourcesContent":["// getDefaultExport function for compatibility with non-ESM modules\n__webpack_require__.n = (module) => {\n\tvar getter = module && module.__esModule ?\n\t\t() => (module['default']) :\n\t\t() => (module);\n\t__webpack_require__.d(getter, { a: getter });\n\treturn getter;\n};\n","__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","/*\n * Copyright (c) 2025 Bytedance, Inc. and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\nimport Turndown, { TagName } from 'turndown';\nimport { gfm } from 'turndown-plugin-gfm';\n\nexport const DEFAULT_TAGS_TO_REMOVE: TagName[] = [\n 'script',\n 'style',\n 'link',\n 'head',\n 'iframe',\n 'video',\n 'audio',\n 'canvas',\n 'object',\n 'embed',\n 'noscript',\n 'aside',\n 'dialog',\n];\n\nexport interface ToMarkdownOptions extends Turndown.Options {\n gfmExtension?: boolean;\n removeTags?: TagName[];\n}\n\n/**\n * Convert HTML content to Markdown format\n * @param html HTML string\n * @param options Conversion options\n * @returns Markdown string\n */\nexport function toMarkdown(\n html: string,\n options: ToMarkdownOptions = {},\n): string {\n if (!html) return '';\n\n try {\n const {\n codeBlockStyle = 'fenced',\n headingStyle = 'atx',\n emDelimiter = '*',\n strongDelimiter = '**',\n gfmExtension = true,\n removeTags = DEFAULT_TAGS_TO_REMOVE,\n } = options;\n\n const turndown = new Turndown({\n codeBlockStyle,\n headingStyle,\n emDelimiter,\n strongDelimiter,\n });\n\n // issue: https://github.com/mixmark-io/turndown/issues/210#issuecomment-353666857\n turndown.remove(removeTags);\n\n if (gfmExtension) {\n turndown.use(gfm);\n }\n\n return turndown.turndown(html);\n } catch (error) {\n console.error('HTML to Markdown conversion failed:', error);\n return html;\n }\n}\n"],"names":["__webpack_require__","module","getter","definition","key","Object","obj","prop","Symbol","DEFAULT_TAGS_TO_REMOVE","toMarkdown","html","options","codeBlockStyle","headingStyle","emDelimiter","strongDelimiter","gfmExtension","removeTags","turndown","Turndown","gfm","error","console"],"mappings":";;;;;;;IACAA,oBAAoB,CAAC,GAAG,CAACC;QACxB,IAAIC,SAASD,UAAUA,OAAO,UAAU,GACvC,IAAOA,MAAM,CAAC,UAAU,GACxB,IAAOA;QACRD,oBAAoB,CAAC,CAACE,QAAQ;YAAE,GAAGA;QAAO;QAC1C,OAAOA;IACR;;;ICPAF,oBAAoB,CAAC,GAAG,CAAC,UAASG;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGH,oBAAoB,CAAC,CAACG,YAAYC,QAAQ,CAACJ,oBAAoB,CAAC,CAAC,UAASI,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAJ,oBAAoB,CAAC,GAAG,CAACM,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFP,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOQ,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;ACCO,MAAMI,yBAAoC;IAC/C;IACA;IACA;IACA;IACA;IACA;IACA;IACA;IACA;IACA;IACA;IACA;IACA;CACD;AAaM,SAASC,WACdC,IAAY,EACZC,UAA6B,CAAC,CAAC;IAE/B,IAAI,CAACD,MAAM,OAAO;IAElB,IAAI;QACF,MAAM,EACJE,iBAAiB,QAAQ,EACzBC,eAAe,KAAK,EACpBC,cAAc,GAAG,EACjBC,kBAAkB,IAAI,EACtBC,eAAe,IAAI,EACnBC,aAAaT,sBAAsB,EACpC,GAAGG;QAEJ,MAAMO,WAAW,IAAIC,CAAAA,2BAAAA,EAAS;YAC5BP;YACAC;YACAC;YACAC;QACF;QAGAG,SAAS,MAAM,CAACD;QAEhB,IAAID,cACFE,SAAS,GAAG,CAACE,6CAAAA,GAAGA;QAGlB,OAAOF,SAAS,QAAQ,CAACR;IAC3B,EAAE,OAAOW,OAAO;QACdC,QAAQ,KAAK,CAAC,uCAAuCD;QACrD,OAAOX;IACT;AACF"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content/to-markdown.mjs","sources":["webpack://@agent-infra/browser-context/./src/content/to-markdown.ts"],"sourcesContent":["
|
|
1
|
+
{"version":3,"file":"content/to-markdown.mjs","sources":["webpack://@agent-infra/browser-context/./src/content/to-markdown.ts"],"sourcesContent":["/*\n * Copyright (c) 2025 Bytedance, Inc. and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\nimport Turndown, { TagName } from 'turndown';\nimport { gfm } from 'turndown-plugin-gfm';\n\nexport const DEFAULT_TAGS_TO_REMOVE: TagName[] = [\n 'script',\n 'style',\n 'link',\n 'head',\n 'iframe',\n 'video',\n 'audio',\n 'canvas',\n 'object',\n 'embed',\n 'noscript',\n 'aside',\n 'dialog',\n];\n\nexport interface ToMarkdownOptions extends Turndown.Options {\n gfmExtension?: boolean;\n removeTags?: TagName[];\n}\n\n/**\n * Convert HTML content to Markdown format\n * @param html HTML string\n * @param options Conversion options\n * @returns Markdown string\n */\nexport function toMarkdown(\n html: string,\n options: ToMarkdownOptions = {},\n): string {\n if (!html) return '';\n\n try {\n const {\n codeBlockStyle = 'fenced',\n headingStyle = 'atx',\n emDelimiter = '*',\n strongDelimiter = '**',\n gfmExtension = true,\n removeTags = DEFAULT_TAGS_TO_REMOVE,\n } = options;\n\n const turndown = new Turndown({\n codeBlockStyle,\n headingStyle,\n emDelimiter,\n strongDelimiter,\n });\n\n // issue: https://github.com/mixmark-io/turndown/issues/210#issuecomment-353666857\n turndown.remove(removeTags);\n\n if (gfmExtension) {\n turndown.use(gfm);\n }\n\n return turndown.turndown(html);\n } catch (error) {\n console.error('HTML to Markdown conversion failed:', error);\n return html;\n }\n}\n"],"names":["DEFAULT_TAGS_TO_REMOVE","toMarkdown","html","options","codeBlockStyle","headingStyle","emDelimiter","strongDelimiter","gfmExtension","removeTags","turndown","Turndown","gfm","error","console"],"mappings":";;;;;;AAOO,MAAMA,yBAAoC;IAC/C;IACA;IACA;IACA;IACA;IACA;IACA;IACA;IACA;IACA;IACA;IACA;IACA;CACD;AAaM,SAASC,WACdC,IAAY,EACZC,UAA6B,CAAC,CAAC;IAE/B,IAAI,CAACD,MAAM,OAAO;IAElB,IAAI;QACF,MAAM,EACJE,iBAAiB,QAAQ,EACzBC,eAAe,KAAK,EACpBC,cAAc,GAAG,EACjBC,kBAAkB,IAAI,EACtBC,eAAe,IAAI,EACnBC,aAAaT,sBAAsB,EACpC,GAAGG;QAEJ,MAAMO,WAAW,IAAIC,WAAS;YAC5BP;YACAC;YACAC;YACAC;QACF;QAGAG,SAAS,MAAM,CAACD;QAEhB,IAAID,cACFE,SAAS,GAAG,CAACE;QAGf,OAAOF,SAAS,QAAQ,CAACR;IAC3B,EAAE,OAAOW,OAAO;QACdC,QAAQ,KAAK,CAAC,uCAAuCD;QACrD,OAAOX;IACT;AACF"}
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAIA,cAAc,oBAAoB,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAIA,cAAc,oBAAoB,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@agent-infra/browser-context",
|
|
3
3
|
"description": "get browser context for AI Agent",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.2.0-alpha.3",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.mjs",
|
|
7
7
|
"types": "dist/index.d.ts",
|
|
@@ -23,18 +23,18 @@
|
|
|
23
23
|
"access": "public"
|
|
24
24
|
},
|
|
25
25
|
"dependencies": {
|
|
26
|
-
"defuddle": "0.6.
|
|
27
|
-
"puppeteer-core": "24.
|
|
28
|
-
"turndown": "7.2.
|
|
26
|
+
"defuddle": "0.6.6",
|
|
27
|
+
"puppeteer-core": "24.23.0",
|
|
28
|
+
"turndown": "7.2.1",
|
|
29
29
|
"turndown-plugin-gfm": "1.0.2"
|
|
30
30
|
},
|
|
31
31
|
"devDependencies": {
|
|
32
32
|
"@types/turndown": "5.0.5",
|
|
33
|
-
"@types/node": "24.1
|
|
34
|
-
"typescript": "5.
|
|
33
|
+
"@types/node": "24.7.1",
|
|
34
|
+
"typescript": "5.9.3",
|
|
35
35
|
"vitest": "3.2.4",
|
|
36
36
|
"@vitest/coverage-v8": "3.2.4",
|
|
37
|
-
"@rslib/core": "0.
|
|
37
|
+
"@rslib/core": "0.15.0"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"dev": "rslib build --watch",
|