@purepageio/fetch-engines 0.9.1 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
1
// ---------------------------------------------------------------------------
// Structural signals used to judge whether raw HTML needs a browser render
// ---------------------------------------------------------------------------

/** A `<div>` with id `root`/`app` whose body holds only whitespace and/or HTML comments. */
const ROOT_CONTAINER_REGEX = /<div[^>]+id=["']?(?:root|app)\b["']?[^>]*>\s*(?:<!--[\s\S]*?-->\s*)*<\/div>/i;

/** Opening tag of a `<div>`, `<main>` or `<section>` with id `root`/`app`, whatever it contains. */
const HAS_ROOT_CONTAINER_REGEX = /<(?:div|main|section)[^>]+id=["']?(?:root|app)\b["']?[^>]*>/i;

/** Captures the raw inner text of the first `<title>` element (group 1). */
const TITLE_REGEX = /<title[^>]*>([\s\S]*?)<\/title>/i;

/** Opening tags of `<h1>`-`<h3>`. Global: intended for counting via `String.prototype.match`. */
const HEADING_REGEX = /<h[1-3][^>]*>/gi;

/** Opening tag of a `<main>` or `<article>` element. */
const MAIN_LIKE_REGEX = /<(?:main|article)[^>]*>/i;

/**
 * A `<noscript>` element whose own content asks the visitor to enable JavaScript.
 *
 * The tempered `(?:(?!<\/noscript>)[\s\S])*?` keeps the scan inside the element.
 * An unbounded `[\s\S]*?` would run past `</noscript>` and match the phrase
 * anywhere later in the document (e.g. a tracking-pixel noscript followed by a
 * help article mentioning "enable JavaScript").
 */
const NOSCRIPT_ENABLE_JS_REGEX =
  /<noscript\b[^>]*>(?:(?!<\/noscript>)[\s\S])*?(enable javascript|requires javascript|javascript to run)/i;

/** Opening `<script` tags. Global: intended for counting via `String.prototype.match`. */
const SCRIPT_TAG_REGEX = /<script\b/gi;

// ---------------------------------------------------------------------------
// Soft-block / challenge page detection
// ---------------------------------------------------------------------------

/** Phrases matched against a page title to spot challenge / interstitial pages. */
const SOFT_BLOCK_TITLE_REGEX = /just a moment|attention required|access denied|please wait|one more step|checking your browser|security check|you have been blocked|blocked by|are you a robot/i;

/** Phrases and widget identifiers matched against the full HTML to spot challenge pages. */
const SOFT_BLOCK_BODY_REGEX = /checking your browser|verify you.{0,10}(?:are |'re )?(?:not a )?(?:ro)?bot|verify you.{0,10}human|please complete the security check|cf-challenge|captcha-container|hcaptcha|recaptcha|cf-turnstile|enable (?:javascript|cookies) to (?:continue|access|view)|automated (?:access|request)|bot detect|suspicious activity|unusual traffic|too many requests|rate limit exceeded|we need to verify/i;
11
/**
 * Squeeze every run of whitespace into a single space and drop any
 * whitespace at the start or end of the string.
 *
 * @param {string} value - Text to normalise.
 * @returns {string} The single-spaced, trimmed text.
 */
function collapseWhitespace(value) {
  const singleSpaced = value.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
14
/**
 * Decode the small set of HTML entities this module cares about:
 * `&nbsp;`, `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&#39;` (named entities are
 * matched case-insensitively).
 *
 * Decoding happens in a single pass so that text produced by one replacement is
 * never decoded again. Running sequential passes with `&amp;` first would turn
 * the escaped literal `&amp;lt;` into `<` instead of the correct `&lt;`.
 *
 * @param {string} value - Text that may contain HTML entities.
 * @returns {string} The text with the supported entities replaced.
 */
function htmlEntityDecode(value) {
  const replacements = {
    "&nbsp;": " ",
    "&amp;": "&",
    "&lt;": "<",
    "&gt;": ">",
    "&quot;": '"',
    "&#39;": "'",
  };
  // The pattern only matches keys present in `replacements`, so the lookup always hits.
  return value.replace(/&(?:nbsp|amp|lt|gt|quot|#39);/gi, (entity) => replacements[entity.toLowerCase()]);
}
23
/**
 * Reduce an HTML document to an approximation of its visible text.
 *
 * Comments and whole `<script>`, `<style>`, `<svg>` and `<noscript>` elements
 * are blanked first, then every remaining tag. The leftover text is
 * entity-decoded and whitespace-collapsed.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} The remaining text, single-spaced and trimmed.
 */
function stripHtmlToVisibleText(html) {
  // Order matters: element-level patterns must run before the generic tag pattern.
  const invisiblePatterns = [
    /<!--[\s\S]*?-->/g,
    /<script[\s\S]*?<\/script>/gi,
    /<style[\s\S]*?<\/style>/gi,
    /<svg[\s\S]*?<\/svg>/gi,
    /<noscript[\s\S]*?<\/noscript>/gi,
    /<[^>]+>/g,
  ];
  let remaining = html;
  for (const pattern of invisiblePatterns) {
    remaining = remaining.replace(pattern, " ");
  }
  return collapseWhitespace(htmlEntityDecode(remaining));
}
32
/**
 * Reduce Markdown to an approximation of its visible text.
 *
 * Fenced and inline code and images are blanked, links keep only their label,
 * leading ATX heading markers are removed, and the characters `* _ ~ > -` are
 * replaced by spaces before whitespace is collapsed.
 *
 * @param {string} markdown - Markdown source.
 * @returns {string} The remaining text, single-spaced and trimmed.
 */
function stripMarkdownToVisibleText(markdown) {
  const withoutFencedCode = markdown.replace(/```[\s\S]*?```/g, " ");
  const withoutInlineCode = withoutFencedCode.replace(/`[^`]*`/g, " ");
  // Images must go before links: an image is a link preceded by `!`.
  const withoutImages = withoutInlineCode.replace(/!\[[^\]]*]\([^)]*\)/g, " ");
  const withLinkLabels = withoutImages.replace(/\[([^\]]*)\]\([^)]*\)/g, "$1");
  const withoutHeadingMarks = withLinkLabels.replace(/^#{1,6}\s+/gm, "");
  const withoutEmphasisMarks = withoutHeadingMarks.replace(/[*_~>-]/g, " ");
  return collapseWhitespace(withoutEmphasisMarks);
}
41
/**
 * Turn basic content signals into an additive quality score.
 *
 * - text:     1 point per full 120 characters, capped at 6
 * - title:    1 point per full 12 characters, capped at 2
 * - main:     2 points when a main-like element is present
 * - headings: 1 point when at least one heading is present
 *
 * @param {number} textLength - Length of the visible text.
 * @param {number} titleLength - Length of the title.
 * @param {boolean} hasMainLike - Whether a main-like element was found.
 * @param {number} headingCount - Number of headings found.
 * @returns {number} The summed score.
 */
function scoreTextSignals(textLength, titleLength, hasMainLike, headingCount) {
  const textPoints = Math.min(6, Math.floor(textLength / 120));
  const titlePoints = Math.min(2, Math.floor(titleLength / 12));
  const mainPoints = hasMainLike ? 2 : 0;
  const headingPoints = headingCount > 0 ? 1 : 0;
  return textPoints + titlePoints + mainPoints + headingPoints;
}
51
/**
 * Inspect raw HTML and estimate whether it needs a JavaScript render to yield
 * real content.
 *
 * Two scores are produced from the same measurements:
 * - `renderLikelyNeededScore`: a weighted sum of "this looks like an empty
 *   shell" signals; `renderLikelyNeeded` is true once it reaches 4.
 * - `qualityScore`: the `scoreTextSignals` result minus penalties for an empty
 *   root container, a missing title and very short visible text.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {object} The raw measurements plus both scores and the boolean verdict.
 */
export function assessHtmlRenderNeed(html) {
  const countMatches = (pattern) => (html.match(pattern) || []).length;

  // Raw measurements.
  const htmlLength = html.length;
  const visibleTextLength = stripHtmlToVisibleText(html).length;
  const rawTitle = html.match(TITLE_REGEX)?.[1] || "";
  const titleLength = collapseWhitespace(htmlEntityDecode(rawTitle)).length;
  const scriptCount = countMatches(SCRIPT_TAG_REGEX);
  const headingCount = countMatches(HEADING_REGEX);
  const hasMainLike = MAIN_LIKE_REGEX.test(html);
  const hasRootContainer = HAS_ROOT_CONTAINER_REGEX.test(html);
  const hasEmptyRootContainer = ROOT_CONTAINER_REGEX.test(html);
  const hasNoscriptEnableJs = NOSCRIPT_ENABLE_JS_REGEX.test(html);

  const isTitleMissing = titleLength === 0;
  const isTextNearlyEmpty = visibleTextLength < 80;

  // Each entry pairs a signal with the weight it adds when it fires.
  const renderSignals = [
    [isTitleMissing, 3],
    [isTextNearlyEmpty, 3],
    [hasEmptyRootContainer, 3],
    [hasNoscriptEnableJs, 2],
    [htmlLength < 2000, 1],
    [scriptCount >= 3 && visibleTextLength < 200, 1],
    [hasRootContainer && visibleTextLength < 160, 1],
    [!hasMainLike && headingCount === 0 && visibleTextLength < 120, 1],
  ];
  const renderLikelyNeededScore = renderSignals.reduce(
    (total, [fired, weight]) => (fired ? total + weight : total),
    0,
  );

  const qualityPenalty =
    (hasEmptyRootContainer ? 3 : 0) + (isTitleMissing ? 2 : 0) + (isTextNearlyEmpty ? 2 : 0);
  const qualityScore =
    scoreTextSignals(visibleTextLength, titleLength, hasMainLike, headingCount) - qualityPenalty;

  return {
    htmlLength,
    visibleTextLength,
    titleLength,
    scriptCount,
    headingCount,
    hasMainLike,
    hasRootContainer,
    hasEmptyRootContainer,
    hasNoscriptEnableJs,
    qualityScore,
    renderLikelyNeededScore,
    renderLikelyNeeded: renderLikelyNeededScore >= 4,
  };
}
102
/**
 * Decide whether an HTML response is a soft-block page (Cloudflare challenge,
 * CAPTCHA, "verify you're human", ...), i.e. a well-formed document that
 * carries no actual page content.
 *
 * @param {string} html - Raw HTML markup of the response.
 * @returns {boolean} True when the title or the markup matches a known
 *   soft-block pattern and the visible text is at most 1500 characters.
 */
export function isSoftBlockPage(html) {
  // Genuine content pages produce substantial text; soft blocks rarely exceed ~1500 visible chars.
  if (stripHtmlToVisibleText(html).length > 1500) {
    return false;
  }
  const title = html.match(TITLE_REGEX)?.[1] || "";
  // The title is checked first; the body pattern runs against the full markup, not the visible text.
  return SOFT_BLOCK_TITLE_REGEX.test(title) || SOFT_BLOCK_BODY_REGEX.test(html);
}
118
/**
 * Summarise serialized page content as text length, title length and quality
 * score so that two fetch results can be compared.
 *
 * When `contentType` is `"html"` the figures come from `assessHtmlRenderNeed`.
 * Any other value is treated as Markdown: the title is the first level-1
 * heading (`# ...`) and the score comes from `scoreTextSignals`.
 *
 * @param {string} content - Serialized page content.
 * @param {string} contentType - `"html"` selects the HTML path; anything else is handled as Markdown.
 * @returns {{textLength: number, titleLength: number, qualityScore: number}} The summary.
 */
export function assessSerializedContent(content, contentType) {
  if (contentType === "html") {
    const { visibleTextLength, titleLength, qualityScore } = assessHtmlRenderNeed(content);
    return {
      textLength: visibleTextLength,
      titleLength,
      qualityScore,
    };
  }

  const textLength = stripMarkdownToVisibleText(content).length;
  const h1Match = content.match(/^#\s+(.+)$/m);
  const titleLength = collapseWhitespace(h1Match?.[1] || "").length;
  // Only the presence of a level-1 heading is known here, so the count is 0 or 1.
  const headingCount = h1Match ? 1 : 0;
  return {
    textLength,
    titleLength,
    qualityScore: scoreTextSignals(textLength, titleLength, false, headingCount),
  };
}
136
/**
 * Decide whether a candidate result is a clear improvement over a baseline.
 *
 * Both arguments are summaries with `textLength`, `titleLength` and
 * `qualityScore` properties. The candidate wins when any one of these holds:
 * 1. its quality score is at least 2 points higher;
 * 2. its text is at least 200 characters and at least twice the baseline's;
 * 3. it has a title where the baseline had none, without losing text.
 *
 * @param {{textLength: number, titleLength: number, qualityScore: number}} baseline - Summary of the original result.
 * @param {{textLength: number, titleLength: number, qualityScore: number}} candidate - Summary of the alternative result.
 * @returns {boolean} True when the candidate is meaningfully better.
 */
export function isRenderedContentMeaningfullyBetter(baseline, candidate) {
  return (
    candidate.qualityScore >= baseline.qualityScore + 2 ||
    candidate.textLength >= Math.max(200, baseline.textLength * 2) ||
    (candidate.titleLength > 0 &&
      baseline.titleLength === 0 &&
      candidate.textLength >= baseline.textLength)
  );
}
145
+ //# sourceMappingURL=render-detection.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"render-detection.js","sourceRoot":"","sources":["../../src/utils/render-detection.ts"],"names":[],"mappings":"AAqBA,MAAM,oBAAoB,GAAG,8EAA8E,CAAC;AAC5G,MAAM,wBAAwB,GAAG,8DAA8D,CAAC;AAChG,MAAM,WAAW,GAAG,kCAAkC,CAAC;AACvD,MAAM,aAAa,GAAG,iBAAiB,CAAC;AACxC,MAAM,eAAe,GAAG,0BAA0B,CAAC;AACnD,MAAM,wBAAwB,GAAG,6EAA6E,CAAC;AAC/G,MAAM,gBAAgB,GAAG,aAAa,CAAC;AAEvC,wCAAwC;AACxC,MAAM,sBAAsB,GAC1B,iKAAiK,CAAC;AAEpK,MAAM,qBAAqB,GACzB,oYAAoY,CAAC;AAEvY,SAAS,kBAAkB,CAAC,KAAa;IACvC,OAAO,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AAC3C,CAAC;AAED,SAAS,gBAAgB,CAAC,KAAa;IACrC,OAAO,KAAK;SACT,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;SACxB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;SACxB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC;AAC7B,CAAC;AAED,SAAS,sBAAsB,CAAC,IAAY;IAC1C,OAAO,kBAAkB,CACvB,gBAAgB,CACd,IAAI;SACD,OAAO,CAAC,kBAAkB,EAAE,GAAG,CAAC;SAChC,OAAO,CAAC,6BAA6B,EAAE,GAAG,CAAC;SAC3C,OAAO,CAAC,2BAA2B,EAAE,GAAG,CAAC;SACzC,OAAO,CAAC,uBAAuB,EAAE,GAAG,CAAC;SACrC,OAAO,CAAC,iCAAiC,EAAE,GAAG,CAAC;SAC/C,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAC5B,CACF,CAAC;AACJ,CAAC;AAED,SAAS,0BAA0B,CAAC,QAAgB;IAClD,OAAO,kBAAkB,CACvB,QAAQ;SACL,OAAO,CAAC,iBAAiB,EAAE,GAAG,CAAC;SAC/B,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;SACxB,OAAO,CAAC,sBAAsB,EAAE,GAAG,CAAC;SACpC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC;SACvC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC;SAC3B,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAC5B,CAAC;AACJ,CAAC;AAED,SAAS,gBAAgB,CAAC,UAAkB,EAAE,WAAmB,EAAE,WAAoB,EAAE,YAAoB;IAC3G,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,UAAU,GAAG,GAAG,CAAC,CAAC,CAAC;IACnD,KAAK,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,WAAW,GAAG,EAAE,CAAC,CAAC,CAAC;IACnD,IAAI,WAAW;QAAE,KAAK,IAAI,CAAC,CAAC;IAC5B,IAAI,YAAY,GAAG,CAAC;QAAE,KAAK,IAAI,CAAC,CAAC;IACjC,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,UAAU,oBAAoB,CAAC,IAAY;IAC/C,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC;IAC/B,MAAM,WAAW,GAAG,sBAAsB,CAAC,IAAI,CAAC,CAAC;IACjD,MAAM,iBAAi
B,GAAG,WAAW,CAAC,MAAM,CAAC;IAC7C,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC;IAC3C,MAAM,WAAW,GAAG,kBAAkB,CAAC,gBAAgB,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IACvF,MAAM,WAAW,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;IAChE,MAAM,YAAY,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;IAC9D,MAAM,WAAW,GAAG,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC/C,MAAM,gBAAgB,GAAG,wBAAwB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7D,MAAM,qBAAqB,GAAG,oBAAoB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC9D,MAAM,mBAAmB,GAAG,wBAAwB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEhE,IAAI,uBAAuB,GAAG,CAAC,CAAC;IAChC,IAAI,WAAW,KAAK,CAAC;QAAE,uBAAuB,IAAI,CAAC,CAAC;IACpD,IAAI,iBAAiB,GAAG,EAAE;QAAE,uBAAuB,IAAI,CAAC,CAAC;IACzD,IAAI,qBAAqB;QAAE,uBAAuB,IAAI,CAAC,CAAC;IACxD,IAAI,mBAAmB;QAAE,uBAAuB,IAAI,CAAC,CAAC;IACtD,IAAI,UAAU,GAAG,IAAI;QAAE,uBAAuB,IAAI,CAAC,CAAC;IACpD,IAAI,WAAW,IAAI,CAAC,IAAI,iBAAiB,GAAG,GAAG;QAAE,uBAAuB,IAAI,CAAC,CAAC;IAC9E,IAAI,gBAAgB,IAAI,iBAAiB,GAAG,GAAG;QAAE,uBAAuB,IAAI,CAAC,CAAC;IAC9E,IAAI,CAAC,WAAW,IAAI,YAAY,KAAK,CAAC,IAAI,iBAAiB,GAAG,GAAG;QAAE,uBAAuB,IAAI,CAAC,CAAC;IAEhG,IAAI,YAAY,GAAG,gBAAgB,CAAC,iBAAiB,EAAE,WAAW,EAAE,WAAW,EAAE,YAAY,CAAC,CAAC;IAC/F,IAAI,qBAAqB;QAAE,YAAY,IAAI,CAAC,CAAC;IAC7C,IAAI,WAAW,KAAK,CAAC;QAAE,YAAY,IAAI,CAAC,CAAC;IACzC,IAAI,iBAAiB,GAAG,EAAE;QAAE,YAAY,IAAI,CAAC,CAAC;IAE9C,OAAO;QACL,UAAU;QACV,iBAAiB;QACjB,WAAW;QACX,WAAW;QACX,YAAY;QACZ,WAAW;QACX,gBAAgB;QAChB,qBAAqB;QACrB,mBAAmB;QACnB,YAAY;QACZ,uBAAuB;QACvB,kBAAkB,EAAE,uBAAuB,IAAI,CAAC;KACjD,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,eAAe,CAAC,IAAY;IAC1C,MAAM,WAAW,GAAG,sBAAsB,CAAC,IAAI,CAAC,CAAC;IACjD,iGAAiG;IACjG,IAAI,WAAW,CAAC,MAAM,GAAG,IAAI;QAAE,OAAO,KAAK,CAAC;IAE5C,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC;IAC3C,MAAM,KAAK,GAAG,UAAU,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACpC,IAAI,sBAAsB,CAAC,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEpD,OAAO,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1C,CAAC;AAED,MAAM,UAAU,uBAAuB,CACrC,OAAe,EACf,WAAgC;IAEhC,IAAI,WAAW,KAAK,MAAM
,EAAE,CAAC;QAC3B,MAAM,UAAU,GAAG,oBAAoB,CAAC,OAAO,CAAC,CAAC;QACjD,OAAO;YACL,UAAU,EAAE,UAAU,CAAC,iBAAiB;YACxC,WAAW,EAAE,UAAU,CAAC,WAAW;YACnC,YAAY,EAAE,UAAU,CAAC,YAAY;SACtC,CAAC;IACJ,CAAC;IAED,MAAM,WAAW,GAAG,0BAA0B,CAAC,OAAO,CAAC,CAAC;IACxD,MAAM,iBAAiB,GAAG,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IACvD,MAAM,WAAW,GAAG,kBAAkB,CAAC,iBAAiB,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;IAE5E,OAAO;QACL,UAAU,EAAE,WAAW,CAAC,MAAM;QAC9B,WAAW;QACX,YAAY,EAAE,gBAAgB,CAAC,WAAW,CAAC,MAAM,EAAE,WAAW,EAAE,KAAK,EAAE,iBAAiB,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;KAClG,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,mCAAmC,CACjD,QAAqC,EACrC,SAAsC;IAEtC,IAAI,SAAS,CAAC,YAAY,IAAI,QAAQ,CAAC,YAAY,GAAG,CAAC;QAAE,OAAO,IAAI,CAAC;IACrE,IAAI,SAAS,CAAC,UAAU,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,QAAQ,CAAC,UAAU,GAAG,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC;IAChF,IAAI,SAAS,CAAC,WAAW,GAAG,CAAC,IAAI,QAAQ,CAAC,WAAW,KAAK,CAAC,IAAI,SAAS,CAAC,UAAU,IAAI,QAAQ,CAAC,UAAU;QACxG,OAAO,IAAI,CAAC;IACd,OAAO,KAAK,CAAC;AACf,CAAC"}
package/package.json CHANGED
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "name": "@purepageio/fetch-engines",
3
- "version": "0.9.1",
3
+ "version": "0.10.1",
4
4
  "type": "module",
5
- "description": "Fetch web pages as clean Markdown or structured data. HTTP-first with automatic Playwright fallback, built for RAG pipelines and content extraction.",
5
+ "description": "Production-grade web extraction: clean Markdown, Playwright fallback, soft-block detection, and structured data extraction.",
6
6
  "main": "dist/index.js",
7
7
  "types": "dist/index.d.ts",
8
8
  "files": [
@@ -75,12 +75,15 @@
75
75
  },
76
76
  "scripts": {
77
77
  "build": "tsc",
78
+ "typecheck": "tsc --noEmit",
78
79
  "lint": "eslint \"src/**/*.ts\" \"examples/**/*.ts\"",
79
80
  "format": "prettier --write \"src/**/*.ts\" \"examples/**/*.ts\" \"*.{js,cjs,json,md}\"",
80
81
  "test": "vitest run",
81
82
  "test:unit": "vitest run",
82
83
  "test:live": "LIVE_NETWORK=1 vitest run test/live/*.test.ts",
84
+ "test:live:auto-render": "LIVE_NETWORK=1 vitest run test/live/AutoRenderHypothesis.test.ts",
83
85
  "examples:hybrid-md": "node scripts/hybrid-md-dump.mjs",
86
+ "eval:auto-render": "pnpm build && node scripts/eval-auto-render.mjs",
84
87
  "simple-scraping": "tsx examples/simple-scraping.ts",
85
88
  "smart-scraping": "tsx examples/smart-scraping.ts",
86
89
  "ai-extraction": "tsx examples/ai-extraction.ts"