webpeel 0.21.13 → 0.21.15

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -778,6 +778,10 @@ async function githubExtractor(_html, url) {
778
778
  if (pathParts.length === 0)
779
779
  return null;
780
780
  const ghHeaders = { Accept: 'application/vnd.github.v3+json' };
781
+ // Use GITHUB_TOKEN if available for higher rate limits (5000/hr vs 60/hr)
782
+ const ghToken = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
783
+ if (ghToken)
784
+ ghHeaders.Authorization = `token ${ghToken}`;
781
785
  // User profile: /username (single segment)
782
786
  if (pathParts.length === 1) {
783
787
  const username = pathParts[0];
@@ -986,6 +990,27 @@ async function hackerNewsExtractor(_html, url) {
986
990
  const storyData = await fetchJson(`https://hacker-news.firebaseio.com/v0/item/${itemId}.json`);
987
991
  if (!storyData)
988
992
  return null;
993
+ // Comment items — fetch parent story for context
994
+ if (storyData.type === 'comment') {
995
+ const parentId = storyData.parent;
996
+ let parentTitle = '';
997
+ if (parentId) {
998
+ try {
999
+ const parentData = await fetchJson(`https://hacker-news.firebaseio.com/v0/item/${parentId}.json`);
1000
+ parentTitle = parentData?.title || '';
1001
+ // Walk up to root story if parent is also a comment
1002
+ if (!parentTitle && parentData?.parent) {
1003
+ const rootData = await fetchJson(`https://hacker-news.firebaseio.com/v0/item/${parentData.parent}.json`);
1004
+ parentTitle = rootData?.title || '';
1005
+ }
1006
+ }
1007
+ catch { /* non-fatal */ }
1008
+ }
1009
+ const text = storyData.text ? stripHtml(storyData.text) : '';
1010
+ const titleStr = parentTitle ? `Comment on: ${parentTitle}` : 'HN Comment';
1011
+ const cleanContent = `## 🟠 ${titleStr}\n\n**Author:** ${storyData.by || '[deleted]'} | **Posted:** ${unixToIso(storyData.time)}\n\n${text}`;
1012
+ return { domain: 'news.ycombinator.com', type: 'comment', structured: { title: titleStr, author: storyData.by, text }, cleanContent };
1013
+ }
989
1014
  const type = storyData.type === 'story' ? 'story' :
990
1015
  storyData.type === 'ask' ? 'ask_hn' :
991
1016
  storyData.type === 'show' ? 'show_hn' :
@@ -278,15 +278,9 @@ export async function fetchContent(ctx) {
278
278
  ctx.rawHtmlSize = ddResult.rawHtmlSize;
279
279
  }
280
280
  else {
281
- // For API-first extractors (HN, Reddit, GitHub), estimate what the raw page
282
- // would cost by doing a quick HEAD request for Content-Length
283
- try {
284
- const headResp = await fetch(ctx.url, { method: 'HEAD', signal: AbortSignal.timeout(2000) });
285
- const cl = headResp.headers.get('content-length');
286
- if (cl)
287
- ctx.rawHtmlSize = parseInt(cl, 10);
288
- }
289
- catch { /* non-fatal — token estimate will be undefined */ }
281
+ // For API-first extractors (HN, Reddit, GitHub), the raw HTML page is typically
282
+ // 6-10x larger than the extracted content. Estimate conservatively at 7x.
283
+ ctx.rawHtmlSize = ddResult.cleanContent.length * 7;
290
284
  }
291
285
  // Create minimal fetchResult so downstream stages don't crash
292
286
  ctx.fetchResult = {
@@ -360,6 +354,10 @@ export async function fetchContent(ctx) {
360
354
  if (ddResult.rawHtmlSize && ddResult.rawHtmlSize > 0) {
361
355
  ctx.rawHtmlSize = ddResult.rawHtmlSize;
362
356
  }
357
+ else {
358
+ // Estimate raw HTML size for API-first extractors (7x compression factor)
359
+ ctx.rawHtmlSize = ddResult.cleanContent.length * 7;
360
+ }
363
361
  ctx.fetchResult = {
364
362
  html: ddResult.cleanContent,
365
363
  url: ctx.url,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.13",
3
+ "version": "0.21.15",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",