@agent-infra/browser-context 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,23 @@
1
1
  import type { Page } from 'puppeteer-core';
2
+ import { DefuddleOptions } from 'defuddle/node';
3
+ export declare const extractWithReadability: (page: Page, options?: {
4
+ markdown?: boolean;
5
+ }) => Promise<{
6
+ title: string;
7
+ content: string;
8
+ }>;
9
+ export declare const extractWithDefuddle: (html: string, url: string, options: DefuddleOptions) => Promise<{
10
+ title: string;
11
+ content: string;
12
+ }>;
13
+ /**
14
+ * Extract content from a page using Defuddle or Readability
15
+ * page html -> markdown
16
+ * @param page - The page to extract content from
17
+ * @returns The title and content of the page
18
+ */
2
19
  export declare const extractContent: (page: Page) => Promise<{
3
- content: any;
4
- title: any;
5
- fullContent: any;
20
+ title: string;
21
+ content: string;
6
22
  }>;
7
23
  //# sourceMappingURL=extract-content.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"extract-content.d.ts","sourceRoot":"","sources":["../../src/content/extract-content.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,gBAAgB,CAAC;AAG3C,eAAO,MAAM,cAAc,GAAU,MAAM,IAAI;;;;EAgC9C,CAAC"}
1
+ {"version":3,"file":"extract-content.d.ts","sourceRoot":"","sources":["../../src/content/extract-content.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAY,eAAe,EAAE,MAAM,eAAe,CAAC;AAI1D,eAAO,MAAM,sBAAsB,GACjC,MAAM,IAAI,EACV,UAAS;IACP,QAAQ,CAAC,EAAE,OAAO,CAAC;CACf,KACL,OAAO,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,OAAO,EAAE,MAAM,CAAA;CAAE,CAoC5C,CAAC;AAEF,eAAO,MAAM,mBAAmB,GAC9B,MAAM,MAAM,EACZ,KAAK,MAAM,EACX,SAAS,eAAe,KACvB,OAAO,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,OAAO,EAAE,MAAM,CAAA;CAAE,CAO5C,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,cAAc,GACzB,MAAM,IAAI,KACT,OAAO,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,OAAO,EAAE,MAAM,CAAA;CAAE,CAY5C,CAAC"}
@@ -28,10 +28,14 @@ var __webpack_require__ = {};
28
28
  var __webpack_exports__ = {};
29
29
  __webpack_require__.r(__webpack_exports__);
30
30
  __webpack_require__.d(__webpack_exports__, {
31
- extractContent: ()=>extractContent
31
+ extractWithReadability: ()=>extractWithReadability,
32
+ extractContent: ()=>extractContent,
33
+ extractWithDefuddle: ()=>extractWithDefuddle
32
34
  });
35
+ const node_namespaceObject = require("defuddle/node");
33
36
  const external_readability_script_js_namespaceObject = require("./readability-script.js");
34
- const extractContent = async (page)=>{
37
+ const external_to_markdown_js_namespaceObject = require("./to-markdown.js");
38
+ const extractWithReadability = async (page, options = {})=>{
35
39
  const extractionResult = await page.evaluate((readabilityScript)=>{
36
40
  const Readability = new Function('module', `${readabilityScript}\nreturn module.exports`)({});
37
41
  const documentClone = document.cloneNode(true);
@@ -41,15 +45,40 @@ const extractContent = async (page)=>{
41
45
  const title = document.title;
42
46
  return {
43
47
  content,
44
- title: (null == article ? void 0 : article.title) || title,
45
- fullContent: content
48
+ title: (null == article ? void 0 : article.title) || title
46
49
  };
47
50
  }, external_readability_script_js_namespaceObject.READABILITY_SCRIPT);
48
- return extractionResult;
51
+ return (null == options ? void 0 : options.markdown) ? {
52
+ title: extractionResult.title,
53
+ content: (0, external_to_markdown_js_namespaceObject.toMarkdown)(extractionResult.content)
54
+ } : extractionResult;
55
+ };
56
+ const extractWithDefuddle = async (html, url, options)=>{
57
+ const { title, content } = await (0, node_namespaceObject.Defuddle)(html, url, options);
58
+ return {
59
+ title,
60
+ content
61
+ };
62
+ };
63
+ const extractContent = async (page)=>{
64
+ const pageSourceHTML = await page.content();
65
+ try {
66
+ return await extractWithDefuddle(pageSourceHTML, page.url(), {
67
+ markdown: true
68
+ });
69
+ } catch (e) {
70
+ return await extractWithReadability(page, {
71
+ markdown: true
72
+ });
73
+ }
49
74
  };
50
75
  exports.extractContent = __webpack_exports__.extractContent;
76
+ exports.extractWithDefuddle = __webpack_exports__.extractWithDefuddle;
77
+ exports.extractWithReadability = __webpack_exports__.extractWithReadability;
51
78
  for(var __webpack_i__ in __webpack_exports__)if (-1 === [
52
- "extractContent"
79
+ "extractContent",
80
+ "extractWithDefuddle",
81
+ "extractWithReadability"
53
82
  ].indexOf(__webpack_i__)) exports[__webpack_i__] = __webpack_exports__[__webpack_i__];
54
83
  Object.defineProperty(exports, '__esModule', {
55
84
  value: true
@@ -1 +1 @@
1
- {"version":3,"file":"content/extract-content.js","sources":["webpack://@agent-infra/browser-context/webpack/runtime/define_property_getters","webpack://@agent-infra/browser-context/webpack/runtime/has_own_property","webpack://@agent-infra/browser-context/webpack/runtime/make_namespace_object","webpack://@agent-infra/browser-context/./src/content/extract-content.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","/*\n * Copyright (c) 2025 Bytedance, Inc. and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\nimport type { Page } from 'puppeteer-core';\nimport { READABILITY_SCRIPT } from './readability-script.js';\n\nexport const extractContent = async (page: Page) => {\n // Extract content using Readability algorithm on a document clone to prevent DOM flickering\n const extractionResult = await page.evaluate((readabilityScript) => {\n // Initialize Readability from script\n const Readability = new Function(\n 'module',\n `${readabilityScript}\\nreturn module.exports`,\n )({});\n\n // Create a deep clone of the document to avoid modifying the visible DOM\n const documentClone = document.cloneNode(true) as Document;\n\n // Clean up the cloned document\n documentClone\n .querySelectorAll(\n 'script,noscript,style,link,svg,img,video,iframe,canvas,.reflist',\n )\n .forEach((el) => el.remove());\n\n // Parse content from the clone\n const article = new Readability(documentClone).parse();\n const content = article?.content || '';\n const title = document.title;\n\n return {\n content,\n title: article?.title || title,\n fullContent: content,\n };\n }, READABILITY_SCRIPT);\n\n return extractionResult;\n};\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","extractContent","page","extractionResult","readabilityScript","Readability","Function","documentClone","document","el","article","content","title","READABILITY_SCRIPT"],"mappings":";;;;;;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;ACCO,MAAMI,iBAAiB,OAAOC;IAEnC,MAAMC,mBAAmB,MAAMD,KAAK,QAAQ,CAAC,CAACE;QAE5C,MAAMC,cAAc,IAAIC,SACtB,UACA,GAAGF,kBAAkB,uBAAuB,CAAC,EAC7C,CAAC;QAGH,MAAMG,gBAAgBC,SAAS,SAAS,CAAC;QAGzCD,cACG,gBAAgB,CACf,mEAED,OAAO,CAAC,CAACE,KAAOA,GAAG,MAAM;QAG5B,MAAMC,UAAU,IAAIL,YAAYE,eAAe,KAAK;QACpD,MAAMI,UAAUD,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,OAAO,AAAD,KAAK;QACpC,MAAME,QAAQJ,SAAS,KAAK;QAE5B,OAAO;YACLG;YACA,OAAOD,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,KAAK,AAAD,KAAKE;YACzB,aAAaD;QACf;IACF,GAAGE,+CAAAA,kBAAkBA;IAErB,OAAOV;AACT"}
1
+ {"version":3,"file":"content/extract-content.js","sources":["webpack://@agent-infra/browser-context/webpack/runtime/define_property_getters","webpack://@agent-infra/browser-context/webpack/runtime/has_own_property","webpack://@agent-infra/browser-context/webpack/runtime/make_namespace_object","webpack://@agent-infra/browser-context/./src/content/extract-content.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","/*\n * Copyright (c) 2025 Bytedance, Inc. and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\nimport type { Page } from 'puppeteer-core';\nimport { Defuddle, DefuddleOptions } from 'defuddle/node';\nimport { READABILITY_SCRIPT } from './readability-script.js';\nimport { toMarkdown } from './to-markdown.js';\n\nexport const extractWithReadability = async (\n page: Page,\n options: {\n markdown?: boolean;\n } = {},\n): Promise<{ title: string; content: string }> => {\n // Extract content using Readability algorithm on a document clone to prevent DOM flickering\n const extractionResult = await page.evaluate((readabilityScript) => {\n // Initialize Readability from script\n const Readability = new Function(\n 'module',\n `${readabilityScript}\\nreturn module.exports`,\n )({});\n\n // Create a deep clone of the document to avoid modifying the visible DOM\n const documentClone = document.cloneNode(true) as Document;\n\n // Clean up the cloned document\n documentClone\n .querySelectorAll(\n 'script,noscript,style,link,svg,img,video,iframe,canvas,.reflist',\n )\n .forEach((el) => el.remove());\n\n // Parse content from the clone\n const article = new Readability(documentClone).parse();\n const content = article?.content || '';\n const title = document.title;\n\n return {\n content,\n title: article?.title || title,\n };\n }, READABILITY_SCRIPT);\n\n return options?.markdown\n ? {\n title: extractionResult.title,\n content: toMarkdown(extractionResult.content),\n }\n : extractionResult;\n};\n\nexport const extractWithDefuddle = async (\n html: string,\n url: string,\n options: DefuddleOptions,\n): Promise<{ title: string; content: string }> => {\n const { title, content } = await Defuddle(html, url, options);\n\n return {\n title,\n content,\n };\n};\n\n/**\n * Extract content from a page using Defuddle or Readability\n * page html -> markdown\n * @param page - The page to extract content from\n * @returns The title and content of the page\n */\nexport const extractContent = async (\n page: Page,\n): Promise<{ title: string; content: string }> => {\n const pageSourceHTML = await page.content();\n\n try {\n return await extractWithDefuddle(pageSourceHTML, page.url(), {\n markdown: true,\n });\n } catch (e) {\n return await extractWithReadability(page as any, {\n markdown: true,\n });\n }\n};\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","extractWithReadability","page","options","extractionResult","readabilityScript","Readability","Function","documentClone","document","el","article","content","title","READABILITY_SCRIPT","toMarkdown","extractWithDefuddle","html","url","Defuddle","extractContent","pageSourceHTML","e"],"mappings":";;;;;;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;ACGO,MAAMI,yBAAyB,OACpCC,MACAC,UAEI,CAAC,CAAC;IAGN,MAAMC,mBAAmB,MAAMF,KAAK,QAAQ,CAAC,CAACG;QAE5C,MAAMC,cAAc,IAAIC,SACtB,UACA,GAAGF,kBAAkB,uBAAuB,CAAC,EAC7C,CAAC;QAGH,MAAMG,gBAAgBC,SAAS,SAAS,CAAC;QAGzCD,cACG,gBAAgB,CACf,mEAED,OAAO,CAAC,CAACE,KAAOA,GAAG,MAAM;QAG5B,MAAMC,UAAU,IAAIL,YAAYE,eAAe,KAAK;QACpD,MAAMI,UAAUD,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,OAAO,AAAD,KAAK;QACpC,MAAME,QAAQJ,SAAS,KAAK;QAE5B,OAAO;YACLG;YACA,OAAOD,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,KAAK,AAAD,KAAKE;QAC3B;IACF,GAAGC,+CAAAA,kBAAkBA;IAErB,OAAOX,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,QAAQ,AAAD,IACnB;QACE,OAAOC,iBAAiB,KAAK;QAC7B,SAASW,AAAAA,IAAAA,wCAAAA,UAAAA,AAAAA,EAAWX,iBAAiB,OAAO;IAC9C,IACAA;AACN;AAEO,MAAMY,sBAAsB,OACjCC,MACAC,KACAf;IAEA,MAAM,EAAEU,KAAK,EAAED,OAAO,EAAE,GAAG,MAAMO,AAAAA,IAAAA,qBAAAA,QAAAA,AAAAA,EAASF,MAAMC,KAAKf;IAErD,OAAO;QACLU;QACAD;IACF;AACF;AAQO,MAAMQ,iBAAiB,OAC5BlB;IAEA,MAAMmB,iBAAiB,MAAMnB,KAAK,OAAO;IAEzC,IAAI;QACF,OAAO,MAAMc,oBAAoBK,gBAAgBnB,KAAK,GAAG,IAAI;YAC3D,UAAU;QACZ;IACF,EAAE,OAAOoB,GAAG;QACV,OAAO,MAAMrB,uBAAuBC,MAAa;YAC/C,UAAU;QACZ;IACF;AACF"}
@@ -2,8 +2,10 @@
2
2
  * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3
3
  * SPDX-License-Identifier: Apache-2.0
4
4
  */
5
+ import { Defuddle } from "defuddle/node";
5
6
  import { READABILITY_SCRIPT } from "./readability-script.mjs";
6
- const extractContent = async (page)=>{
7
+ import { toMarkdown } from "./to-markdown.mjs";
8
+ const extractWithReadability = async (page, options = {})=>{
7
9
  const extractionResult = await page.evaluate((readabilityScript)=>{
8
10
  const Readability = new Function('module', `${readabilityScript}\nreturn module.exports`)({});
9
11
  const documentClone = document.cloneNode(true);
@@ -13,12 +15,33 @@ const extractContent = async (page)=>{
13
15
  const title = document.title;
14
16
  return {
15
17
  content,
16
- title: (null == article ? void 0 : article.title) || title,
17
- fullContent: content
18
+ title: (null == article ? void 0 : article.title) || title
18
19
  };
19
20
  }, READABILITY_SCRIPT);
20
- return extractionResult;
21
+ return (null == options ? void 0 : options.markdown) ? {
22
+ title: extractionResult.title,
23
+ content: toMarkdown(extractionResult.content)
24
+ } : extractionResult;
25
+ };
26
+ const extractWithDefuddle = async (html, url, options)=>{
27
+ const { title, content } = await Defuddle(html, url, options);
28
+ return {
29
+ title,
30
+ content
31
+ };
32
+ };
33
+ const extractContent = async (page)=>{
34
+ const pageSourceHTML = await page.content();
35
+ try {
36
+ return await extractWithDefuddle(pageSourceHTML, page.url(), {
37
+ markdown: true
38
+ });
39
+ } catch (e) {
40
+ return await extractWithReadability(page, {
41
+ markdown: true
42
+ });
43
+ }
21
44
  };
22
- export { extractContent };
45
+ export { extractContent, extractWithDefuddle, extractWithReadability };
23
46
 
24
47
  //# sourceMappingURL=extract-content.mjs.map
@@ -1 +1 @@
1
- {"version":3,"file":"content/extract-content.mjs","sources":["webpack://@agent-infra/browser-context/./src/content/extract-content.ts"],"sourcesContent":["/*\n * Copyright (c) 2025 Bytedance, Inc. and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\nimport type { Page } from 'puppeteer-core';\nimport { READABILITY_SCRIPT } from './readability-script.js';\n\nexport const extractContent = async (page: Page) => {\n // Extract content using Readability algorithm on a document clone to prevent DOM flickering\n const extractionResult = await page.evaluate((readabilityScript) => {\n // Initialize Readability from script\n const Readability = new Function(\n 'module',\n `${readabilityScript}\\nreturn module.exports`,\n )({});\n\n // Create a deep clone of the document to avoid modifying the visible DOM\n const documentClone = document.cloneNode(true) as Document;\n\n // Clean up the cloned document\n documentClone\n .querySelectorAll(\n 'script,noscript,style,link,svg,img,video,iframe,canvas,.reflist',\n )\n .forEach((el) => el.remove());\n\n // Parse content from the clone\n const article = new Readability(documentClone).parse();\n const content = article?.content || '';\n const title = document.title;\n\n return {\n content,\n title: article?.title || title,\n fullContent: content,\n };\n }, READABILITY_SCRIPT);\n\n return extractionResult;\n};\n"],"names":["extractContent","page","extractionResult","readabilityScript","Readability","Function","documentClone","document","el","article","content","title","READABILITY_SCRIPT"],"mappings":";;;;;AAOO,MAAMA,iBAAiB,OAAOC;IAEnC,MAAMC,mBAAmB,MAAMD,KAAK,QAAQ,CAAC,CAACE;QAE5C,MAAMC,cAAc,IAAIC,SACtB,UACA,GAAGF,kBAAkB,uBAAuB,CAAC,EAC7C,CAAC;QAGH,MAAMG,gBAAgBC,SAAS,SAAS,CAAC;QAGzCD,cACG,gBAAgB,CACf,mEAED,OAAO,CAAC,CAACE,KAAOA,GAAG,MAAM;QAG5B,MAAMC,UAAU,IAAIL,YAAYE,eAAe,KAAK;QACpD,MAAMI,UAAUD,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,OAAO,AAAD,KAAK;QACpC,MAAME,QAAQJ,SAAS,KAAK;QAE5B,OAAO;YACLG;YACA,OAAOD,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,KAAK,AAAD,KAAKE;YACzB,aAAaD;QACf;IACF,GAAGE;IAEH,OAAOV;AACT"}
1
+ {"version":3,"file":"content/extract-content.mjs","sources":["webpack://@agent-infra/browser-context/./src/content/extract-content.ts"],"sourcesContent":["/*\n * Copyright (c) 2025 Bytedance, Inc. and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\nimport type { Page } from 'puppeteer-core';\nimport { Defuddle, DefuddleOptions } from 'defuddle/node';\nimport { READABILITY_SCRIPT } from './readability-script.js';\nimport { toMarkdown } from './to-markdown.js';\n\nexport const extractWithReadability = async (\n page: Page,\n options: {\n markdown?: boolean;\n } = {},\n): Promise<{ title: string; content: string }> => {\n // Extract content using Readability algorithm on a document clone to prevent DOM flickering\n const extractionResult = await page.evaluate((readabilityScript) => {\n // Initialize Readability from script\n const Readability = new Function(\n 'module',\n `${readabilityScript}\\nreturn module.exports`,\n )({});\n\n // Create a deep clone of the document to avoid modifying the visible DOM\n const documentClone = document.cloneNode(true) as Document;\n\n // Clean up the cloned document\n documentClone\n .querySelectorAll(\n 'script,noscript,style,link,svg,img,video,iframe,canvas,.reflist',\n )\n .forEach((el) => el.remove());\n\n // Parse content from the clone\n const article = new Readability(documentClone).parse();\n const content = article?.content || '';\n const title = document.title;\n\n return {\n content,\n title: article?.title || title,\n };\n }, READABILITY_SCRIPT);\n\n return options?.markdown\n ? {\n title: extractionResult.title,\n content: toMarkdown(extractionResult.content),\n }\n : extractionResult;\n};\n\nexport const extractWithDefuddle = async (\n html: string,\n url: string,\n options: DefuddleOptions,\n): Promise<{ title: string; content: string }> => {\n const { title, content } = await Defuddle(html, url, options);\n\n return {\n title,\n content,\n };\n};\n\n/**\n * Extract content from a page using Defuddle or Readability\n * page html -> markdown\n * @param page - The page to extract content from\n * @returns The title and content of the page\n */\nexport const extractContent = async (\n page: Page,\n): Promise<{ title: string; content: string }> => {\n const pageSourceHTML = await page.content();\n\n try {\n return await extractWithDefuddle(pageSourceHTML, page.url(), {\n markdown: true,\n });\n } catch (e) {\n return await extractWithReadability(page as any, {\n markdown: true,\n });\n }\n};\n"],"names":["extractWithReadability","page","options","extractionResult","readabilityScript","Readability","Function","documentClone","document","el","article","content","title","READABILITY_SCRIPT","toMarkdown","extractWithDefuddle","html","url","Defuddle","extractContent","pageSourceHTML","e"],"mappings":";;;;;;;AASO,MAAMA,yBAAyB,OACpCC,MACAC,UAEI,CAAC,CAAC;IAGN,MAAMC,mBAAmB,MAAMF,KAAK,QAAQ,CAAC,CAACG;QAE5C,MAAMC,cAAc,IAAIC,SACtB,UACA,GAAGF,kBAAkB,uBAAuB,CAAC,EAC7C,CAAC;QAGH,MAAMG,gBAAgBC,SAAS,SAAS,CAAC;QAGzCD,cACG,gBAAgB,CACf,mEAED,OAAO,CAAC,CAACE,KAAOA,GAAG,MAAM;QAG5B,MAAMC,UAAU,IAAIL,YAAYE,eAAe,KAAK;QACpD,MAAMI,UAAUD,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,OAAO,AAAD,KAAK;QACpC,MAAME,QAAQJ,SAAS,KAAK;QAE5B,OAAO;YACLG;YACA,OAAOD,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,KAAK,AAAD,KAAKE;QAC3B;IACF,GAAGC;IAEH,OAAOX,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,QAAQ,AAAD,IACnB;QACE,OAAOC,iBAAiB,KAAK;QAC7B,SAASW,WAAWX,iBAAiB,OAAO;IAC9C,IACAA;AACN;AAEO,MAAMY,sBAAsB,OACjCC,MACAC,KACAf;IAEA,MAAM,EAAEU,KAAK,EAAED,OAAO,EAAE,GAAG,MAAMO,SAASF,MAAMC,KAAKf;IAErD,OAAO;QACLU;QACAD;IACF;AACF;AAQO,MAAMQ,iBAAiB,OAC5BlB;IAEA,MAAMmB,iBAAiB,MAAMnB,KAAK,OAAO;IAEzC,IAAI;QACF,OAAO,MAAMc,oBAAoBK,gBAAgBnB,KAAK,GAAG,IAAI;YAC3D,UAAU;QACZ;IACF,EAAE,OAAOoB,GAAG;QACV,OAAO,MAAMrB,uBAAuBC,MAAa;YAC/C,UAAU;QACZ;IACF;AACF"}
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@agent-infra/browser-context",
3
3
  "description": "get browser context for AI Agent",
4
- "version": "0.1.1",
4
+ "version": "0.1.3",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",
7
7
  "types": "dist/index.d.ts",
@@ -15,32 +15,34 @@
15
15
  "files": [
16
16
  "dist"
17
17
  ],
18
+ "repository": {
19
+ "type": "git",
20
+ "url": "git@github.com:agent-infra/browser.git"
21
+ },
18
22
  "publishConfig": {
19
23
  "access": "public"
20
24
  },
21
- "scripts": {
22
- "dev": "rslib build --watch",
23
- "build": "rslib build",
24
- "prepare": "npm run build",
25
- "prepublishOnly": "pnpm run build",
26
- "test": "vitest run",
27
- "test:watch": "vitest",
28
- "test:e2e": "vitest --config vitest.e2e.config.ts",
29
- "coverage": "vitest run --coverage",
30
- "test:e2e:local": "vitest --config vitest.e2e.config.ts local-browser.e2e.test.ts"
31
- },
32
25
  "dependencies": {
33
- "puppeteer-core": "24.14.0",
34
- "turndown": "^7.2.0",
35
- "turndown-plugin-gfm": "^1.0.2"
26
+ "defuddle": "0.6.4",
27
+ "puppeteer-core": "24.15.0",
28
+ "turndown": "7.2.0",
29
+ "turndown-plugin-gfm": "1.0.2"
36
30
  },
37
31
  "devDependencies": {
38
- "@types/turndown": "^5.0.5",
39
- "@types/which": "3.0.4",
32
+ "@types/turndown": "5.0.5",
40
33
  "@types/node": "24.1.0",
41
34
  "typescript": "5.8.3",
42
35
  "vitest": "3.2.4",
43
36
  "@vitest/coverage-v8": "3.2.4",
44
37
  "@rslib/core": "0.11.0"
38
+ },
39
+ "scripts": {
40
+ "dev": "rslib build --watch",
41
+ "build": "rslib build",
42
+ "test": "vitest run",
43
+ "test:watch": "vitest",
44
+ "test:e2e": "vitest --config vitest.e2e.config.ts",
45
+ "coverage": "vitest run --coverage",
46
+ "test:e2e:local": "vitest --config vitest.e2e.config.ts local-browser.e2e.test.ts"
45
47
  }
46
- }
48
+ }