webpeel 0.21.13 → 0.21.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -778,6 +778,10 @@ async function githubExtractor(_html, url) {
|
|
|
778
778
|
if (pathParts.length === 0)
|
|
779
779
|
return null;
|
|
780
780
|
const ghHeaders = { Accept: 'application/vnd.github.v3+json' };
|
|
781
|
+
// Use GITHUB_TOKEN if available for higher rate limits (5000/hr vs 60/hr)
|
|
782
|
+
const ghToken = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
|
|
783
|
+
if (ghToken)
|
|
784
|
+
ghHeaders.Authorization = `token ${ghToken}`;
|
|
781
785
|
// User profile: /username (single segment)
|
|
782
786
|
if (pathParts.length === 1) {
|
|
783
787
|
const username = pathParts[0];
|
|
@@ -986,6 +990,27 @@ async function hackerNewsExtractor(_html, url) {
|
|
|
986
990
|
const storyData = await fetchJson(`https://hacker-news.firebaseio.com/v0/item/${itemId}.json`);
|
|
987
991
|
if (!storyData)
|
|
988
992
|
return null;
|
|
993
|
+
// Comment items — fetch parent story for context
|
|
994
|
+
if (storyData.type === 'comment') {
|
|
995
|
+
const parentId = storyData.parent;
|
|
996
|
+
let parentTitle = '';
|
|
997
|
+
if (parentId) {
|
|
998
|
+
try {
|
|
999
|
+
const parentData = await fetchJson(`https://hacker-news.firebaseio.com/v0/item/${parentId}.json`);
|
|
1000
|
+
parentTitle = parentData?.title || '';
|
|
1001
|
+
// Walk up to root story if parent is also a comment
|
|
1002
|
+
if (!parentTitle && parentData?.parent) {
|
|
1003
|
+
const rootData = await fetchJson(`https://hacker-news.firebaseio.com/v0/item/${parentData.parent}.json`);
|
|
1004
|
+
parentTitle = rootData?.title || '';
|
|
1005
|
+
}
|
|
1006
|
+
}
|
|
1007
|
+
catch { /* non-fatal */ }
|
|
1008
|
+
}
|
|
1009
|
+
const text = storyData.text ? stripHtml(storyData.text) : '';
|
|
1010
|
+
const titleStr = parentTitle ? `Comment on: ${parentTitle}` : 'HN Comment';
|
|
1011
|
+
const cleanContent = `## 🟠 ${titleStr}\n\n**Author:** ${storyData.by || '[deleted]'} | **Posted:** ${unixToIso(storyData.time)}\n\n${text}`;
|
|
1012
|
+
return { domain: 'news.ycombinator.com', type: 'comment', structured: { title: titleStr, author: storyData.by, text }, cleanContent };
|
|
1013
|
+
}
|
|
989
1014
|
const type = storyData.type === 'story' ? 'story' :
|
|
990
1015
|
storyData.type === 'ask' ? 'ask_hn' :
|
|
991
1016
|
storyData.type === 'show' ? 'show_hn' :
|
package/dist/core/pipeline.js
CHANGED
|
@@ -278,15 +278,9 @@ export async function fetchContent(ctx) {
|
|
|
278
278
|
ctx.rawHtmlSize = ddResult.rawHtmlSize;
|
|
279
279
|
}
|
|
280
280
|
else {
|
|
281
|
-
// For API-first extractors (HN, Reddit, GitHub),
|
|
282
|
-
//
|
|
283
|
-
try {
|
|
284
|
-
const headResp = await fetch(ctx.url, { method: 'HEAD', signal: AbortSignal.timeout(2000) });
|
|
285
|
-
const cl = headResp.headers.get('content-length');
|
|
286
|
-
if (cl)
|
|
287
|
-
ctx.rawHtmlSize = parseInt(cl, 10);
|
|
288
|
-
}
|
|
289
|
-
catch { /* non-fatal — token estimate will be undefined */ }
|
|
281
|
+
// For API-first extractors (HN, Reddit, GitHub), the raw HTML page is typically
|
|
282
|
+
// 6-10x larger than the extracted content. Estimate conservatively at 7x.
|
|
283
|
+
ctx.rawHtmlSize = ddResult.cleanContent.length * 7;
|
|
290
284
|
}
|
|
291
285
|
// Create minimal fetchResult so downstream stages don't crash
|
|
292
286
|
ctx.fetchResult = {
|
|
@@ -360,6 +354,10 @@ export async function fetchContent(ctx) {
|
|
|
360
354
|
if (ddResult.rawHtmlSize && ddResult.rawHtmlSize > 0) {
|
|
361
355
|
ctx.rawHtmlSize = ddResult.rawHtmlSize;
|
|
362
356
|
}
|
|
357
|
+
else {
|
|
358
|
+
// Estimate raw HTML size for API-first extractors (7x compression factor)
|
|
359
|
+
ctx.rawHtmlSize = ddResult.cleanContent.length * 7;
|
|
360
|
+
}
|
|
363
361
|
ctx.fetchResult = {
|
|
364
362
|
html: ddResult.cleanContent,
|
|
365
363
|
url: ctx.url,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.13",
|
|
3
|
+
"version": "0.21.15",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|