webpeel 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/README.md +140 -500
  2. package/dist/cli-auth.d.ts +2 -0
  3. package/dist/cli-auth.d.ts.map +1 -1
  4. package/dist/cli-auth.js +16 -3
  5. package/dist/cli-auth.js.map +1 -1
  6. package/dist/cli.js +475 -77
  7. package/dist/cli.js.map +1 -1
  8. package/dist/core/actions.d.ts +19 -10
  9. package/dist/core/actions.d.ts.map +1 -1
  10. package/dist/core/actions.js +214 -43
  11. package/dist/core/actions.js.map +1 -1
  12. package/dist/core/agent.d.ts +60 -3
  13. package/dist/core/agent.d.ts.map +1 -1
  14. package/dist/core/agent.js +375 -86
  15. package/dist/core/agent.js.map +1 -1
  16. package/dist/core/answer.d.ts +43 -0
  17. package/dist/core/answer.d.ts.map +1 -0
  18. package/dist/core/answer.js +378 -0
  19. package/dist/core/answer.js.map +1 -0
  20. package/dist/core/cache.d.ts +14 -0
  21. package/dist/core/cache.d.ts.map +1 -0
  22. package/dist/core/cache.js +122 -0
  23. package/dist/core/cache.js.map +1 -0
  24. package/dist/core/dns-cache.d.ts +21 -0
  25. package/dist/core/dns-cache.d.ts.map +1 -0
  26. package/dist/core/dns-cache.js +184 -0
  27. package/dist/core/dns-cache.js.map +1 -0
  28. package/dist/core/documents.d.ts +24 -0
  29. package/dist/core/documents.d.ts.map +1 -0
  30. package/dist/core/documents.js +124 -0
  31. package/dist/core/documents.js.map +1 -0
  32. package/dist/core/extract-inline.d.ts +39 -0
  33. package/dist/core/extract-inline.d.ts.map +1 -0
  34. package/dist/core/extract-inline.js +214 -0
  35. package/dist/core/extract-inline.js.map +1 -0
  36. package/dist/core/fetcher.d.ts +33 -7
  37. package/dist/core/fetcher.d.ts.map +1 -1
  38. package/dist/core/fetcher.js +608 -41
  39. package/dist/core/fetcher.js.map +1 -1
  40. package/dist/core/jobs.d.ts +66 -0
  41. package/dist/core/jobs.d.ts.map +1 -0
  42. package/dist/core/jobs.js +513 -0
  43. package/dist/core/jobs.js.map +1 -0
  44. package/dist/core/markdown.d.ts.map +1 -1
  45. package/dist/core/markdown.js +141 -31
  46. package/dist/core/markdown.js.map +1 -1
  47. package/dist/core/pdf.d.ts.map +1 -1
  48. package/dist/core/pdf.js +3 -1
  49. package/dist/core/pdf.js.map +1 -1
  50. package/dist/core/screenshot.d.ts +33 -0
  51. package/dist/core/screenshot.d.ts.map +1 -0
  52. package/dist/core/screenshot.js +30 -0
  53. package/dist/core/screenshot.js.map +1 -0
  54. package/dist/core/search-provider.d.ts +46 -0
  55. package/dist/core/search-provider.d.ts.map +1 -0
  56. package/dist/core/search-provider.js +281 -0
  57. package/dist/core/search-provider.js.map +1 -0
  58. package/dist/core/strategies.d.ts +7 -10
  59. package/dist/core/strategies.d.ts.map +1 -1
  60. package/dist/core/strategies.js +370 -63
  61. package/dist/core/strategies.js.map +1 -1
  62. package/dist/index.d.ts +9 -3
  63. package/dist/index.d.ts.map +1 -1
  64. package/dist/index.js +61 -32
  65. package/dist/index.js.map +1 -1
  66. package/dist/mcp/server.js +335 -70
  67. package/dist/mcp/server.js.map +1 -1
  68. package/dist/types.d.ts +43 -1
  69. package/dist/types.d.ts.map +1 -1
  70. package/dist/types.js.map +1 -1
  71. package/llms.txt +85 -47
  72. package/package.json +11 -5
@@ -0,0 +1,21 @@
1
+ /**
2
+ * DNS Pre-Resolution Cache
3
+ *
4
+ * Warms a local Map<hostname, ip[]> on startup for the top ~50 popular domains
5
+ * and exposes a custom lookup function compatible with undici's Agent `connect.lookup`.
6
+ */
7
+ import dns from 'node:dns';
8
+ export declare function getCachedDns(hostname: string): string[] | null;
9
+ export declare function resolveAndCache(hostname: string): Promise<string[]>;
10
+ /**
11
+ * Custom lookup function compatible with undici's Agent `connect.lookup`.
12
+ *
13
+ * undici passes `{ hints: 1024, all: true }` — so when `all` is true the
14
+ * callback must receive `(err, entries: { address, family }[])`.
15
+ * When `all` is false (or absent), the callback is `(err, address, family)`.
16
+ */
17
+ export declare function cachedLookup(hostname: string, options: dns.LookupOptions, callback: (...args: any[]) => void): void;
18
+ export declare function warmupDnsCache(domains?: string[]): Promise<void>;
19
+ export declare function startDnsWarmup(): void;
20
+ export declare function clearDnsCache(): void;
21
+ //# sourceMappingURL=dns-cache.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dns-cache.d.ts","sourceRoot":"","sources":["../../src/core/dns-cache.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,GAAG,MAAM,UAAU,CAAC;AA+E3B,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,IAAI,CAM9D;AAWD,wBAAsB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAYzE;AASD;;;;;;GAMG;AACH,wBAAgB,YAAY,CAC1B,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,GAAG,CAAC,aAAa,EAC1B,QAAQ,EAAE,CAAC,GAAG,IAAI,EAAE,GAAG,EAAE,KAAK,IAAI,GACjC,IAAI,CA8CN;AAED,wBAAsB,cAAc,CAClC,OAAO,GAAE,MAAM,EAAuB,GACrC,OAAO,CAAC,IAAI,CAAC,CAEf;AAED,wBAAgB,cAAc,IAAI,IAAI,CAMrC;AAED,wBAAgB,aAAa,IAAI,IAAI,CAEpC"}
@@ -0,0 +1,184 @@
1
+ /**
2
+ * DNS Pre-Resolution Cache
3
+ *
4
+ * Warms a local Map<hostname, ip[]> on startup for the top ~50 popular domains
5
+ * and exposes a custom lookup function compatible with undici's Agent `connect.lookup`.
6
+ */
7
+ import dns from 'node:dns';
8
+ import net from 'node:net';
9
+ const DNS_CACHE_TTL_MS = 30 * 60 * 1000; // 30 minutes
10
+ const DNS_WARMUP_DOMAINS = [
11
+ 'github.com',
12
+ 'www.github.com',
13
+ 'raw.githubusercontent.com',
14
+ 'api.github.com',
15
+ 'wikipedia.org',
16
+ 'en.wikipedia.org',
17
+ 'news.ycombinator.com',
18
+ 'stackoverflow.com',
19
+ 'www.stackoverflow.com',
20
+ 'developer.mozilla.org',
21
+ 'react.dev',
22
+ 'nextjs.org',
23
+ 'vercel.com',
24
+ 'tailwindcss.com',
25
+ 'supabase.com',
26
+ 'npmjs.com',
27
+ 'www.npmjs.com',
28
+ 'reddit.com',
29
+ 'www.reddit.com',
30
+ 'www.cloudflare.com',
31
+ 'medium.com',
32
+ 'linkedin.com',
33
+ 'www.linkedin.com',
34
+ 'www.bloomberg.com',
35
+ 'www.glassdoor.com',
36
+ 'arxiv.org',
37
+ 'www.sec.gov',
38
+ 'w3.org',
39
+ 'www.w3.org',
40
+ 'tools.ietf.org',
41
+ 'unicode.org',
42
+ 'www.bbc.com',
43
+ 'news.google.com',
44
+ 'www.youtube.com',
45
+ 'example.com',
46
+ 'httpbin.org',
47
+ 'docs.python.org',
48
+ 'nodejs.org',
49
+ 'openai.com',
50
+ 'anthropic.com',
51
+ 'x.com',
52
+ 'twitter.com',
53
+ 'www.nytimes.com',
54
+ 'www.wsj.com',
55
+ 'www.reuters.com',
56
+ 'www.theverge.com',
57
+ 'www.cnn.com',
58
+ 'www.amazon.com',
59
+ 'www.apple.com',
60
+ 'www.microsoft.com',
61
+ ];
62
+ const dnsCache = new Map();
63
+ let warmupStarted = false;
64
+ let roundRobinCursor = 0;
65
/**
 * Produces the canonical cache key for a hostname: surrounding whitespace is
 * stripped and the remainder is lower-cased.
 *
 * @param hostname - Hostname as supplied by the caller.
 * @returns The trimmed, lower-cased hostname.
 */
function normalizeHostname(hostname) {
    const trimmed = hostname.trim();
    return trimmed.toLowerCase();
}
68
/**
 * Evicts the cache entry stored under `hostname` once its `expiresAt`
 * timestamp is no longer in the future. Does nothing when no entry exists.
 *
 * @param hostname - Cache key to inspect (used as given, not re-normalised).
 */
function pruneIfExpired(hostname) {
    const cached = dnsCache.get(hostname);
    if (cached && cached.expiresAt <= Date.now()) {
        dnsCache.delete(hostname);
    }
}
76
/**
 * Looks up the cached addresses for a hostname.
 *
 * An expired entry is evicted first, so it is never returned.
 *
 * @param hostname - Hostname to look up; normalised before use.
 * @returns A fresh copy of the cached address list, or `null` when there is
 *          no live entry (or the entry holds no addresses).
 */
export function getCachedDns(hostname) {
    const key = normalizeHostname(hostname);
    pruneIfExpired(key);
    const hit = dnsCache.get(key);
    if (hit && hit.ips.length !== 0) {
        // Copy so callers cannot mutate the array held by the cache.
        return Array.from(hit.ips);
    }
    return null;
}
84
/**
 * Stores the addresses for a hostname, de-duplicated, with an expiry of
 * `DNS_CACHE_TTL_MS` from now. An empty address list is ignored so that a
 * failed resolution never overwrites an existing entry.
 *
 * @param hostname - Hostname the addresses belong to; normalised before use.
 * @param ips - Resolved addresses.
 */
function setCachedDns(hostname, ips) {
    if (ips.length === 0) {
        return;
    }
    const key = normalizeHostname(hostname);
    const uniqueIps = Array.from(new Set(ips));
    const expiresAt = Date.now() + DNS_CACHE_TTL_MS;
    dnsCache.set(key, { ips: uniqueIps, expiresAt });
}
93
/**
 * Returns the IPv4 addresses for a hostname, answering from the cache when a
 * live entry exists and otherwise querying `dns.promises.resolve4` and
 * caching any non-empty answer.
 *
 * Resolution failures are deliberately swallowed: the promise resolves to an
 * empty array instead of rejecting, which callers use as the "fall back"
 * signal.
 *
 * @param hostname - Hostname to resolve; normalised before use.
 * @returns The resolved (or cached) addresses, or `[]` on failure.
 */
export async function resolveAndCache(hostname) {
    const key = normalizeHostname(hostname);
    const hit = getCachedDns(key);
    if (hit) {
        return hit;
    }
    let resolved;
    try {
        resolved = await dns.promises.resolve4(key);
        if (resolved.length > 0) {
            setCachedDns(key, resolved);
        }
    }
    catch {
        return [];
    }
    return resolved;
}
108
/**
 * Picks one address from a list, rotating through the candidates using the
 * module-wide `roundRobinCursor`. A single-element list short-circuits and
 * leaves the cursor untouched.
 *
 * @param ips - Candidate addresses.
 * @returns The chosen address.
 */
function selectCachedIp(ips) {
    if (ips.length === 1) {
        return ips[0];
    }
    const index = roundRobinCursor % ips.length;
    // Wrap well before the counter could lose integer precision.
    roundRobinCursor = (roundRobinCursor + 1) % Number.MAX_SAFE_INTEGER;
    return ips[index];
}
115
/**
 * Custom lookup function compatible with undici's Agent `connect.lookup`.
 *
 * undici passes `{ hints: 1024, all: true }` — so when `all` is true the
 * callback must receive `(err, entries: { address, family }[])`.
 * When `all` is false (or absent), the callback is `(err, address, family)`.
 *
 * Resolution order:
 *  1. An IP literal is echoed straight back.
 *  2. An IPv6 request (`family` of `6` or `'IPv6'`) goes to native `dns.lookup`.
 *  3. A live cache entry is answered immediately.
 *  4. Otherwise `resolveAndCache` is tried; an empty result falls back to
 *     native `dns.lookup`.
 *
 * @param hostname - Hostname (or IP literal) to resolve.
 * @param options - `dns.lookup`-style options; only `all` and `family` are read here.
 * @param callback - Node-style lookup callback.
 */
export function cachedLookup(hostname, options, callback) {
    // Reports IPv4 results in whichever callback shape the caller asked for.
    const deliverIpv4 = (ips) => {
        if (options?.all) {
            callback(null, ips.map((ip) => ({ address: ip, family: 4 })));
        }
        else {
            callback(null, selectCachedIp(ips), 4);
        }
    };
    // If hostname is already an IP, return immediately
    const ipFamily = net.isIP(hostname);
    if (ipFamily === 4 || ipFamily === 6) {
        if (options?.all) {
            callback(null, [{ address: hostname, family: ipFamily }]);
        }
        else {
            callback(null, hostname, ipFamily);
        }
        return;
    }
    // The cache only holds A records. dns.lookup accepts the family either as
    // a number or as the strings 'IPv4'/'IPv6', so both spellings of an IPv6
    // request must bypass the cache.
    const family = options?.family;
    if (family === 6 || family === 'IPv6') {
        dns.lookup(hostname, options, callback);
        return;
    }
    const cachedIps = getCachedDns(hostname);
    if (cachedIps && cachedIps.length > 0) {
        deliverIpv4(cachedIps);
        return;
    }
    // Async resolve, fall back to native lookup when nothing was resolved.
    // resolveAndCache reports failure as an empty array; the .catch is purely
    // defensive and maps any rejection to that same "nothing resolved" signal.
    void resolveAndCache(hostname)
        .catch(() => [])
        .then((resolvedIps) => {
            if (resolvedIps.length > 0) {
                // Invoke the callback outside the promise chain: an exception
                // thrown by the callback must not be mistaken for a resolution
                // failure, which would otherwise trigger a second invocation
                // of the same callback through the fallback path.
                process.nextTick(deliverIpv4, resolvedIps);
            }
            else {
                dns.lookup(hostname, options, callback);
            }
        });
}
170
/**
 * Resolves every given domain concurrently so the results land in the cache.
 * Completes once all lookups have settled; individual failures are ignored.
 *
 * @param domains - Hostnames to pre-resolve (defaults to `DNS_WARMUP_DOMAINS`).
 */
export async function warmupDnsCache(domains = DNS_WARMUP_DOMAINS) {
    const pending = domains.map((domain) => {
        return resolveAndCache(domain);
    });
    await Promise.allSettled(pending);
}
173
/**
 * Kicks off the background cache warm-up at most once per process.
 * The warm-up is fire-and-forget: its outcome is never awaited or reported.
 */
export function startDnsWarmup() {
    if (!warmupStarted) {
        warmupStarted = true;
        void warmupDnsCache().catch(() => {
            // Best-effort only.
        });
    }
}
181
/**
 * Drops every cached DNS entry. The warm-up flag and the round-robin cursor
 * are left as they are.
 */
export function clearDnsCache() {
    dnsCache.clear();
}
184
+ //# sourceMappingURL=dns-cache.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dns-cache.js","sourceRoot":"","sources":["../../src/core/dns-cache.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,GAAG,MAAM,UAAU,CAAC;AAC3B,OAAO,GAAG,MAAM,UAAU,CAAC;AAO3B,MAAM,gBAAgB,GAAG,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC,aAAa;AAEtD,MAAM,kBAAkB,GAAa;IACnC,YAAY;IACZ,gBAAgB;IAChB,2BAA2B;IAC3B,gBAAgB;IAChB,eAAe;IACf,kBAAkB;IAClB,sBAAsB;IACtB,mBAAmB;IACnB,uBAAuB;IACvB,uBAAuB;IACvB,WAAW;IACX,YAAY;IACZ,YAAY;IACZ,iBAAiB;IACjB,cAAc;IACd,WAAW;IACX,eAAe;IACf,YAAY;IACZ,gBAAgB;IAChB,oBAAoB;IACpB,YAAY;IACZ,cAAc;IACd,kBAAkB;IAClB,mBAAmB;IACnB,mBAAmB;IACnB,WAAW;IACX,aAAa;IACb,QAAQ;IACR,YAAY;IACZ,gBAAgB;IAChB,aAAa;IACb,aAAa;IACb,iBAAiB;IACjB,iBAAiB;IACjB,aAAa;IACb,aAAa;IACb,iBAAiB;IACjB,YAAY;IACZ,YAAY;IACZ,eAAe;IACf,OAAO;IACP,aAAa;IACb,iBAAiB;IACjB,aAAa;IACb,iBAAiB;IACjB,kBAAkB;IAClB,aAAa;IACb,gBAAgB;IAChB,eAAe;IACf,mBAAmB;CACpB,CAAC;AAEF,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAyB,CAAC;AAClD,IAAI,aAAa,GAAG,KAAK,CAAC;AAC1B,IAAI,gBAAgB,GAAG,CAAC,CAAC;AAEzB,SAAS,iBAAiB,CAAC,QAAgB;IACzC,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;AACvC,CAAC;AAED,SAAS,cAAc,CAAC,QAAgB;IACtC,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACrC,IAAI,CAAC,KAAK;QAAE,OAAO;IACnB,IAAI,KAAK,CAAC,SAAS,IAAI,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC;QAClC,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IAC5B,CAAC;AACH,CAAC;AAED,MAAM,UAAU,YAAY,CAAC,QAAgB;IAC3C,MAAM,UAAU,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAC;IAC/C,cAAc,CAAC,UAAU,CAAC,CAAC;IAC3B,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IACvC,IAAI,CAAC,KAAK,IAAI,KAAK,CAAC,GAAG,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAClD,OAAO,CAAC,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC;AACxB,CAAC;AAED,SAAS,YAAY,CAAC,QAAgB,EAAE,GAAa;IACnD,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO;IAC7B,MAAM,UAAU,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAC;IAC/C,QAAQ,CAAC,GAAG,CAAC,UAAU,EAAE;QACvB,GAAG,EAAE,CAAC,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QACtB,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,gBAAgB;KACzC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,QAAgB;IACpD,MAAM,UAAU,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAC;IAC/C,MAAM,MAAM,GAAG
,YAAY,CAAC,UAAU,CAAC,CAAC;IACxC,IAAI,MAAM;QAAE,OAAO,MAAM,CAAC;IAE1B,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC;QACpD,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC;YAAE,YAAY,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC;QAClD,OAAO,GAAG,CAAC;IACb,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,SAAS,cAAc,CAAC,GAAa;IACnC,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,GAAG,CAAC,CAAC,CAAE,CAAC;IACrC,MAAM,QAAQ,GAAG,GAAG,CAAC,gBAAgB,GAAG,GAAG,CAAC,MAAM,CAAE,CAAC;IACrD,gBAAgB,GAAG,CAAC,gBAAgB,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,gBAAgB,CAAC;IACpE,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,YAAY,CAC1B,QAAgB,EAChB,OAA0B,EAC1B,QAAkC;IAElC,mDAAmD;IACnD,MAAM,QAAQ,GAAG,GAAG,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACpC,IAAI,QAAQ,KAAK,CAAC,IAAI,QAAQ,KAAK,CAAC,EAAE,CAAC;QACrC,IAAI,OAAO,EAAE,GAAG,EAAE,CAAC;YACjB,QAAQ,CAAC,IAAI,EAAE,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC;QAC5D,CAAC;aAAM,CAAC;YACN,QAAQ,CAAC,IAAI,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAC;QACrC,CAAC;QACD,OAAO;IACT,CAAC;IAED,kDAAkD;IAClD,MAAM,eAAe,GAAG,OAAO,OAAO,EAAE,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IACjF,IAAI,eAAe,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;QACzC,IAAI,SAAS,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtC,IAAI,OAAO,EAAE,GAAG,EAAE,CAAC;gBACjB,QAAQ,CAAC,IAAI,EAAE,SAAS,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;YACpE,CAAC;iBAAM,CAAC;gBACN,QAAQ,CAAC,IAAI,EAAE,cAAc,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC;YAC/C,CAAC;YACD,OAAO;QACT,CAAC;QAED,uDAAuD;QACvD,KAAK,eAAe,CAAC,QAAQ,CAAC;aAC3B,IAAI,CAAC,CAAC,WAAW,EAAE,EAAE;YACpB,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC3B,IAAI,OAAO,EAAE,GAAG,EAAE,CAAC;oBACjB,QAAQ,CAAC,IAAI,EAAE,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;gBACtE,CAAC;qBAAM,CAAC;oBACN,QAAQ,CAAC,IAAI,EAAE,cAAc,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,CAAC;gBACjD,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,GAAG,CAAC,MAAM,CAAC,QA
AQ,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC,CAAC;aACD,KAAK,CAAC,GAAG,EAAE;YACV,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC;QAC1C,CAAC,CAAC,CAAC;QACL,OAAO;IACT,CAAC;IAED,iDAAiD;IACjD,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC;AAC1C,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,UAAoB,kBAAkB;IAEtC,MAAM,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AACnE,CAAC;AAED,MAAM,UAAU,cAAc;IAC5B,IAAI,aAAa;QAAE,OAAO;IAC1B,aAAa,GAAG,IAAI,CAAC;IACrB,KAAK,cAAc,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE;QAC/B,oBAAoB;IACtB,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,UAAU,aAAa;IAC3B,QAAQ,CAAC,KAAK,EAAE,CAAC;AACnB,CAAC"}
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Document (PDF/DOCX) parsing utilities.
3
+ *
4
+ * Keeps binary/document parsing separate from the HTML scraping pipeline.
5
+ */
6
+ export type DocumentFormat = 'markdown' | 'text' | 'html';
7
+ export interface DocumentExtractionResult {
8
+ content: string;
9
+ metadata: {
10
+ title: string;
11
+ contentType: string;
12
+ wordCount: number;
13
+ [key: string]: any;
14
+ };
15
+ }
16
+ export declare function normalizeContentType(contentTypeHeader: string | undefined | null): string;
17
+ export declare function isPdfContentType(contentTypeHeader: string | undefined | null): boolean;
18
+ export declare function isDocxContentType(contentTypeHeader: string | undefined | null): boolean;
19
+ export declare function extractDocumentToFormat(buffer: Buffer, options?: {
20
+ url?: string;
21
+ contentType?: string;
22
+ format?: DocumentFormat;
23
+ }): Promise<DocumentExtractionResult>;
24
+ //# sourceMappingURL=documents.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"documents.d.ts","sourceRoot":"","sources":["../../src/core/documents.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,MAAM,MAAM,cAAc,GAAG,UAAU,GAAG,MAAM,GAAG,MAAM,CAAC;AAE1D,MAAM,WAAW,wBAAwB;IACvC,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE;QACR,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,SAAS,EAAE,MAAM,CAAC;QAClB,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;KACpB,CAAC;CACH;AAED,wBAAgB,oBAAoB,CAAC,iBAAiB,EAAE,MAAM,GAAG,SAAS,GAAG,IAAI,GAAG,MAAM,CAGzF;AAED,wBAAgB,gBAAgB,CAAC,iBAAiB,EAAE,MAAM,GAAG,SAAS,GAAG,IAAI,GAAG,OAAO,CAGtF;AAED,wBAAgB,iBAAiB,CAAC,iBAAiB,EAAE,MAAM,GAAG,SAAS,GAAG,IAAI,GAAG,OAAO,CAGvF;AA4CD,wBAAsB,uBAAuB,CAC3C,MAAM,EAAE,MAAM,EACd,OAAO,GAAE;IACP,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,cAAc,CAAC;CACpB,GACL,OAAO,CAAC,wBAAwB,CAAC,CA0EnC"}
@@ -0,0 +1,124 @@
1
+ /**
2
+ * Document (PDF/DOCX) parsing utilities.
3
+ *
4
+ * Keeps binary/document parsing separate from the HTML scraping pipeline.
5
+ */
6
+ import { htmlToMarkdown, htmlToText } from './markdown.js';
7
+ import { extractPdf } from './pdf.js';
8
/**
 * Reduces a Content-Type header to its bare media type: everything from the
 * first `;` onwards (charset, boundary, ...) is dropped, and the rest is
 * trimmed and lower-cased.
 *
 * @param contentTypeHeader - Raw header value; may be missing.
 * @returns The normalised media type, or `''` when none is available.
 */
export function normalizeContentType(contentTypeHeader) {
    if (!contentTypeHeader) {
        return '';
    }
    const [mediaType] = contentTypeHeader.split(';');
    const cleaned = mediaType?.trim().toLowerCase();
    return cleaned || '';
}
13
/**
 * Tells whether a Content-Type header denotes a PDF: either exactly
 * `application/pdf` or any media type carrying a `+pdf` suffix.
 *
 * @param contentTypeHeader - Raw header value; may be missing.
 */
export function isPdfContentType(contentTypeHeader) {
    const mediaType = normalizeContentType(contentTypeHeader);
    if (mediaType === 'application/pdf') {
        return true;
    }
    return mediaType.endsWith('+pdf');
}
17
/**
 * Tells whether a Content-Type header denotes a Word (DOCX) document.
 *
 * @param contentTypeHeader - Raw header value; may be missing.
 */
export function isDocxContentType(contentTypeHeader) {
    const docxMediaType = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
    return normalizeContentType(contentTypeHeader) === docxMediaType;
}
21
/**
 * Extracts the last non-empty path segment of a URL, percent-decoded.
 *
 * @param url - Absolute URL; may be missing.
 * @returns The decoded segment, or `''` when the URL is absent, cannot be
 *          parsed, has no path segments, or contains a malformed escape.
 */
function basenameFromUrl(url) {
    if (!url) {
        return '';
    }
    try {
        const segments = new URL(url).pathname
            .split('/')
            .filter((segment) => Boolean(segment));
        const finalSegment = segments.length > 0 ? segments[segments.length - 1] : '';
        return decodeURIComponent(finalSegment);
    }
    catch {
        // Either `new URL` or `decodeURIComponent` rejected the input.
        return '';
    }
}
33
/**
 * Removes one trailing `.pdf` or `.docx` extension (case-insensitive) from a
 * file name. Names without such an extension are returned unchanged.
 *
 * @param name - File name to trim.
 */
function stripExtension(name) {
    const found = /\.(pdf|docx)$/i.exec(name);
    if (found === null) {
        return name;
    }
    return name.slice(0, found.index);
}
36
/**
 * Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
 * the text can be embedded safely inside markup.
 *
 * @param text - Plain text to escape.
 * @returns The escaped text.
 */
function escapeHtml(text) {
    // `&` must be handled first so the entities produced below are not re-escaped.
    const substitutions = [
        { pattern: /&/g, entity: '&amp;' },
        { pattern: /</g, entity: '&lt;' },
        { pattern: />/g, entity: '&gt;' },
        { pattern: /"/g, entity: '&quot;' },
        { pattern: /'/g, entity: '&#39;' },
    ];
    let escaped = text;
    for (const { pattern, entity } of substitutions) {
        escaped = escaped.replace(pattern, entity);
    }
    return escaped;
}
44
/**
 * Counts whitespace-separated words in a piece of text.
 *
 * @param text - Text to measure.
 * @returns Number of non-empty tokens; `0` for empty or all-whitespace input.
 */
function countWords(text) {
    const collapsed = text.replace(/\s+/g, ' ').trim();
    let total = 0;
    for (const token of collapsed.split(' ')) {
        if (token) {
            total += 1;
        }
    }
    return total;
}
52
/**
 * Tidies raw extracted text while preserving paragraph breaks.
 *
 * - Line endings are unified to `\n` (both `\r\n` and a bare `\r`).
 * - Runs of spaces/tabs collapse to a single space.
 * - Spaces left dangling before a line break are removed, so that lines
 *   containing only whitespace count as blank lines.
 * - Three or more consecutive line breaks collapse to exactly two.
 * - Leading and trailing whitespace is trimmed.
 *
 * @param text - Raw text.
 * @returns The normalised text.
 */
function normalizePlainText(text) {
    // pdf-parse returns lots of line breaks; keep paragraphs but reduce noise.
    return text
        .replace(/\r\n?/g, '\n')
        .replace(/[ \t]+/g, ' ')
        // Must run before the blank-line squeeze: otherwise "\n \n \n" is not
        // seen as a run of newlines and survives untouched.
        .replace(/ \n/g, '\n')
        .replace(/\n{3,}/g, '\n\n')
        .trim();
}
60
/**
 * Tells whether the path component of a URL ends with the given lower-case
 * extension, ignoring any query string or fragment.
 */
function urlPathEndsWith(url, extension) {
    if (!url) {
        return false;
    }
    let path;
    try {
        path = new URL(url).pathname;
    }
    catch {
        // Not an absolute URL: strip the query/fragment by hand.
        path = url.split(/[?#]/)[0];
    }
    return path.toLowerCase().endsWith(extension);
}
/**
 * Converts a PDF or DOCX buffer into the requested textual format.
 *
 * The document kind is taken from the content type when it is recognised,
 * otherwise from a `.pdf` / `.docx` extension on the URL (either at the very
 * end of the URL or at the end of its path, so `file.pdf?download=1` is
 * recognised too). PDF detection takes precedence over DOCX.
 *
 * - PDF: the extracted text is normalised; `html` wraps it in an escaped
 *   `<pre>`, while `markdown` and `text` both return the plain text.
 * - DOCX: mammoth's HTML is returned as-is for `html`, converted with
 *   `htmlToText` for `text`, and with `htmlToMarkdown` otherwise.
 *
 * @param buffer - Raw document bytes.
 * @param options - Optional `url`, `contentType` and `format` (default `'markdown'`).
 * @returns The converted content plus metadata (`title`, `contentType`,
 *          `wordCount` and format-specific extras).
 * @throws Error when the input is neither recognised as PDF nor as DOCX.
 */
export async function extractDocumentToFormat(buffer, options = {}) {
    const { url, contentType, format = 'markdown' } = options;
    const normalized = normalizeContentType(contentType);
    const urlLower = (url || '').toLowerCase();
    const isPdf = isPdfContentType(normalized) || urlLower.endsWith('.pdf') || urlPathEndsWith(url, '.pdf');
    const isDocx = isDocxContentType(normalized) || urlLower.endsWith('.docx') || urlPathEndsWith(url, '.docx');
    if (isPdf) {
        const pdf = await extractPdf(buffer);
        const text = normalizePlainText(pdf.text || '');
        const fallbackTitle = stripExtension(basenameFromUrl(url)) || 'PDF Document';
        const title = pdf.metadata?.title || fallbackTitle;
        const wordCount = countWords(text);
        let content;
        if (format === 'html') {
            content = `<pre>${escapeHtml(text)}</pre>`;
        }
        else {
            // markdown + text: return readable plain text.
            content = text;
        }
        return {
            content,
            metadata: {
                pages: pdf.pages,
                ...pdf.metadata,
                // Listed after the spread on purpose: embedded PDF metadata
                // (e.g. an empty `title`) must not overwrite the computed values.
                title,
                contentType: normalized || 'application/pdf',
                wordCount,
            },
        };
    }
    if (isDocx) {
        // Mammoth returns clean semantic HTML.
        const mammothMod = await import('mammoth');
        // Works whether the module exposes its API as the default export or directly.
        const mammoth = mammothMod.default || mammothMod;
        const result = await mammoth.convertToHtml({ buffer });
        const html = (result?.value || '').trim();
        const fallbackTitle = stripExtension(basenameFromUrl(url)) || 'Word Document';
        const title = fallbackTitle;
        // Word count should be based on plain text, not markdown formatting.
        const plainText = htmlToText(html);
        const wordCount = countWords(plainText);
        let content;
        if (format === 'html') {
            content = html;
        }
        else if (format === 'text') {
            content = plainText;
        }
        else {
            content = htmlToMarkdown(html);
        }
        return {
            content,
            metadata: {
                title,
                contentType: normalized || 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                wordCount,
                messages: result?.messages || [],
            },
        };
    }
    throw new Error(`Unsupported document type: ${normalized || contentType || 'unknown'}`);
}
124
+ //# sourceMappingURL=documents.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"documents.js","sourceRoot":"","sources":["../../src/core/documents.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAC3D,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AActC,MAAM,UAAU,oBAAoB,CAAC,iBAA4C;IAC/E,IAAI,CAAC,iBAAiB;QAAE,OAAO,EAAE,CAAC;IAClC,OAAO,iBAAiB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;AACrE,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,iBAA4C;IAC3E,MAAM,EAAE,GAAG,oBAAoB,CAAC,iBAAiB,CAAC,CAAC;IACnD,OAAO,EAAE,KAAK,iBAAiB,IAAI,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;AACzD,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,iBAA4C;IAC5E,MAAM,EAAE,GAAG,oBAAoB,CAAC,iBAAiB,CAAC,CAAC;IACnD,OAAO,EAAE,KAAK,yEAAyE,CAAC;AAC1F,CAAC;AAED,SAAS,eAAe,CAAC,GAAuB;IAC9C,IAAI,CAAC,GAAG;QAAE,OAAO,EAAE,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QACvB,MAAM,IAAI,GAAG,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC;QAC/D,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC;IAClC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AAC5C,CAAC;AAED,SAAS,UAAU,CAAC,IAAY;IAC9B,OAAO,IAAI;SACR,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC;SACtB,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;SACrB,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;SACrB,OAAO,CAAC,IAAI,EAAE,QAAQ,CAAC;SACvB,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;AAC5B,CAAC;AAED,SAAS,UAAU,CAAC,IAAY;IAC9B,MAAM,KAAK,GAAG,IAAI;SACf,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE;SACN,KAAK,CAAC,GAAG,CAAC;SACV,MAAM,CAAC,OAAO,CAAC,CAAC;IACnB,OAAO,KAAK,CAAC,MAAM,CAAC;AACtB,CAAC;AAED,SAAS,kBAAkB,CAAC,IAAY;IACtC,2EAA2E;IAC3E,OAAO,IAAI;SACR,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC;SACtB,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAC3C,MAAc,EACd,UAII,EAAE;IAEN,MAAM,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,GAAG,UAAU,EAAE,GAAG,OAAO,CAAC;IAE1D,MAAM,UAAU,GAAG,oBAAoB,CAAC,WAAW,CAAC,CAAC;IACrD,MAAM,QAAQ,GAAG,CAAC,GAAG,IAAI,EAAE,CA
AC,CAAC,WAAW,EAAE,CAAC;IAE3C,MAAM,KAAK,GAAG,gBAAgB,CAAC,UAAU,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;IACxE,MAAM,MAAM,GAAG,iBAAiB,CAAC,UAAU,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;IAE3E,IAAI,KAAK,EAAE,CAAC;QACV,MAAM,GAAG,GAAG,MAAM,UAAU,CAAC,MAAM,CAAC,CAAC;QACrC,MAAM,IAAI,GAAG,kBAAkB,CAAC,GAAG,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC;QAEhD,MAAM,aAAa,GAAG,cAAc,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,IAAI,cAAc,CAAC;QAC7E,MAAM,KAAK,GAAI,GAAG,CAAC,QAAQ,EAAE,KAAgB,IAAI,aAAa,CAAC;QAE/D,MAAM,SAAS,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;QAEnC,IAAI,OAAe,CAAC;QACpB,IAAI,MAAM,KAAK,MAAM,EAAE,CAAC;YACtB,OAAO,GAAG,QAAQ,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC;QAC7C,CAAC;aAAM,CAAC;YACN,+CAA+C;YAC/C,OAAO,GAAG,IAAI,CAAC;QACjB,CAAC;QAED,OAAO;YACL,OAAO;YACP,QAAQ,EAAE;gBACR,KAAK;gBACL,WAAW,EAAE,UAAU,IAAI,iBAAiB;gBAC5C,SAAS;gBACT,KAAK,EAAE,GAAG,CAAC,KAAK;gBAChB,GAAG,GAAG,CAAC,QAAQ;aAChB;SACF,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,EAAE,CAAC;QACX,uCAAuC;QACvC,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;QAC3C,MAAM,OAAO,GAAS,UAAkB,CAAC,OAAO,IAAI,UAAU,CAAC;QAE/D,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACvD,MAAM,IAAI,GAAG,CAAC,MAAM,EAAE,KAAK,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QAE1C,MAAM,aAAa,GAAG,cAAc,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,IAAI,eAAe,CAAC;QAC9E,MAAM,KAAK,GAAG,aAAa,CAAC;QAE5B,qEAAqE;QACrE,MAAM,SAAS,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,SAAS,GAAG,UAAU,CAAC,SAAS,CAAC,CAAC;QAExC,IAAI,OAAe,CAAC;QACpB,IAAI,MAAM,KAAK,MAAM,EAAE,CAAC;YACtB,OAAO,GAAG,IAAI,CAAC;QACjB,CAAC;aAAM,IAAI,MAAM,KAAK,MAAM,EAAE,CAAC;YAC7B,OAAO,GAAG,SAAS,CAAC;QACtB,CAAC;aAAM,CAAC;YACN,OAAO,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;QACjC,CAAC;QAED,OAAO;YACL,OAAO;YACP,QAAQ,EAAE;gBACR,KAAK;gBACL,WAAW,EAAE,UAAU,IAAI,yEAAyE;gBACpG,SAAS;gBACT,QAAQ,EAAE,MAAM,EAAE,QAAQ,IAAI,EAAE;aACjC;SACF,CAAC;IACJ,CAAC;IAED,MAAM,IAAI,KAAK,CAAC,8BAA8B,UAAU,IAAI,WAAW,IAAI,SAAS,EAAE,CAAC,CAAC;AAC1F,CAAC"}
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Inline structured extraction using BYOK LLM
3
+ *
4
+ * After fetching page content, pass it + a JSON schema + optional prompt
5
+ * to an LLM and get back structured JSON matching the schema.
6
+ *
7
+ * Supports OpenAI, Anthropic, and Google (same BYOK pattern as /v1/answer).
8
+ */
9
+ export type LLMProvider = 'openai' | 'anthropic' | 'google';
10
+ export interface InlineExtractOptions {
11
+ /** JSON Schema describing the desired output structure */
12
+ schema?: Record<string, any>;
13
+ /** Natural language prompt describing what to extract */
14
+ prompt?: string;
15
+ /** LLM provider (required) */
16
+ llmProvider: LLMProvider;
17
+ /** LLM API key — BYOK (required) */
18
+ llmApiKey: string;
19
+ /** LLM model name (optional — uses provider default) */
20
+ llmModel?: string;
21
+ }
22
+ export interface InlineExtractResult {
23
+ /** Extracted structured data */
24
+ data: Record<string, any>;
25
+ /** Tokens consumed */
26
+ tokensUsed: {
27
+ input: number;
28
+ output: number;
29
+ };
30
+ }
31
+ /**
32
+ * Extract structured JSON from page content using an LLM (BYOK).
33
+ *
34
+ * @param content - Page content (markdown or text)
35
+ * @param options - Extraction options including schema, prompt, and LLM credentials
36
+ * @returns Extracted structured data + token usage
37
+ */
38
+ export declare function extractInlineJson(content: string, options: InlineExtractOptions): Promise<InlineExtractResult>;
39
+ //# sourceMappingURL=extract-inline.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extract-inline.d.ts","sourceRoot":"","sources":["../../src/core/extract-inline.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,MAAM,MAAM,WAAW,GAAG,QAAQ,GAAG,WAAW,GAAG,QAAQ,CAAC;AAE5D,MAAM,WAAW,oBAAoB;IACnC,0DAA0D;IAC1D,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC7B,yDAAyD;IACzD,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,8BAA8B;IAC9B,WAAW,EAAE,WAAW,CAAC;IACzB,oCAAoC;IACpC,SAAS,EAAE,MAAM,CAAC;IAClB,wDAAwD;IACxD,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,mBAAmB;IAClC,gCAAgC;IAChC,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC1B,sBAAsB;IACtB,UAAU,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;CAC/C;AAyMD;;;;;;GAMG;AACH,wBAAsB,iBAAiB,CACrC,OAAO,EAAE,MAAM,EACf,OAAO,EAAE,oBAAoB,GAC5B,OAAO,CAAC,mBAAmB,CAAC,CAyC9B"}
@@ -0,0 +1,214 @@
1
+ /**
2
+ * Inline structured extraction using BYOK LLM
3
+ *
4
+ * After fetching page content, pass it + a JSON schema + optional prompt
5
+ * to an LLM and get back structured JSON matching the schema.
6
+ *
7
+ * Supports OpenAI, Anthropic, and Google (same BYOK pattern as /v1/answer).
8
+ */
9
/**
 * Returns the model used when the caller does not name one.
 *
 * @param provider - `'openai'`, `'anthropic'` or `'google'`.
 * @returns The provider's default model name, or `undefined` for any other value.
 */
function defaultModel(provider) {
    const defaults = new Map([
        ['openai', 'gpt-4o-mini'],
        ['anthropic', 'claude-3-5-sonnet-latest'],
        ['google', 'gemini-1.5-flash'],
    ]);
    return defaults.get(provider);
}
19
/**
 * Assembles the system prompt for an extraction request.
 *
 * The prompt always opens with the role and output-format rules and always
 * closes with a reminder to return only JSON. In between it includes the
 * caller's instruction and/or the pretty-printed JSON Schema, when given.
 *
 * @param schema - Optional JSON Schema the output must conform to.
 * @param prompt - Optional natural-language instruction.
 * @returns The prompt, with sections separated by newlines.
 */
function buildSystemPrompt(schema, prompt) {
    const instructionSection = prompt ? [`\nInstruction: ${prompt}`] : [];
    const schemaSection = schema
        ? [`\nReturn a JSON object that conforms to this JSON Schema:\n${JSON.stringify(schema, null, 2)}`]
        : [];
    const sections = [
        'You are a structured data extraction assistant.',
        'Extract data from the provided web page content and return ONLY valid JSON — no markdown fences, no explanation, no extra text.',
        ...instructionSection,
        ...schemaSection,
        '\nReturn ONLY the JSON object.',
    ];
    return sections.join('\n');
}
33
/**
 * Caps page content at `maxChars` characters. Content over the cap is cut
 * and suffixed with a `[Content truncated]` marker; anything else is
 * returned untouched.
 *
 * @param content - Page content to send to the model.
 * @param maxChars - Maximum number of characters kept (default 24,000).
 */
function truncateContent(content, maxChars = 24_000) {
    const fits = content.length <= maxChars;
    return fits ? content : `${content.slice(0, maxChars)}\n\n[Content truncated]`;
}
38
/**
 * Parses an LLM reply as JSON, tolerating common wrapping.
 *
 * Attempts, in order:
 *  1. the reply exactly as received;
 *  2. the body of the first markdown code fence, if any;
 *  3. the span from the first `{` to the last `}`.
 *
 * @param text - Raw model output.
 * @returns The first value that parses successfully.
 * @throws Error quoting the first 300 characters when every attempt fails.
 */
function parseJsonResponse(text) {
    const attempt = (candidate) => {
        try {
            return { ok: true, value: JSON.parse(candidate) };
        }
        catch {
            return { ok: false, value: undefined };
        }
    };
    // Try direct parse first
    const direct = attempt(text);
    if (direct.ok) {
        return direct.value;
    }
    // Strip markdown code fences if present
    const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/);
    if (fenced) {
        const fromFence = attempt(fenced[1].trim());
        if (fromFence.ok) {
            return fromFence.value;
        }
    }
    // Try to find the first { ... } block
    const open = text.indexOf('{');
    const close = text.lastIndexOf('}');
    if (open !== -1 && close > open) {
        const fromBraces = attempt(text.slice(open, close + 1));
        if (fromBraces.ok) {
            return fromBraces.value;
        }
    }
    throw new Error(`LLM returned invalid JSON: ${text.slice(0, 300)}`);
}
68
+ // ---------------------------------------------------------------------------
69
+ // Provider-specific calls (mirrors core/answer.ts patterns)
70
+ // ---------------------------------------------------------------------------
71
/**
 * Sends one extraction request to the OpenAI Chat Completions endpoint with
 * JSON-object response mode and temperature 0.
 *
 * @param apiKey - Caller-supplied OpenAI API key (sent as a Bearer token).
 * @param model - Model name.
 * @param systemPrompt - Extraction instructions (system message).
 * @param userContent - Page content (user message).
 * @returns `{ text, usage: { input, output } }`: the trimmed content of the
 *          first choice and the reported token counts (0 when absent).
 * @throws Error with the HTTP status (and response body, if readable) on a
 *         non-2xx response.
 */
async function callOpenAI(apiKey, model, systemPrompt, userContent) {
    const payload = {
        model,
        messages: [
            { role: 'system', content: systemPrompt },
            { role: 'user', content: userContent },
        ],
        temperature: 0,
        response_format: { type: 'json_object' },
    };
    const response = await fetch('https://api.openai.com/v1/chat/completions', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${apiKey}`,
        },
        body: JSON.stringify(payload),
    });
    if (!response.ok) {
        // The error body is optional context; failing to read it must not mask the status.
        const detail = await response.text().catch(() => '');
        const suffix = detail ? ` - ${detail}` : '';
        throw new Error(`OpenAI API error: HTTP ${response.status}${suffix}`);
    }
    const data = (await response.json());
    const completion = data?.choices?.[0]?.message?.content || '';
    const usage = {
        input: Number(data?.usage?.prompt_tokens || 0),
        output: Number(data?.usage?.completion_tokens || 0),
    };
    return { text: String(completion).trim(), usage };
}
101
/**
 * Sends one extraction request to the Anthropic Messages endpoint
 * (temperature 0, up to 4096 output tokens).
 *
 * @param apiKey - Caller-supplied Anthropic API key (sent as `x-api-key`).
 * @param model - Model name.
 * @param systemPrompt - Extraction instructions (sent as `system`).
 * @param userContent - Page content (sent as the single user message).
 * @returns `{ text, usage: { input, output } }`: the concatenated, trimmed
 *          text of all content blocks that carry a string `text`, and the
 *          reported token counts (0 when absent).
 * @throws Error with the HTTP status (and response body, if readable) on a
 *         non-2xx response.
 */
async function callAnthropic(apiKey, model, systemPrompt, userContent) {
    const payload = {
        model,
        system: systemPrompt,
        messages: [{ role: 'user', content: userContent }],
        max_tokens: 4096,
        temperature: 0,
    };
    const response = await fetch('https://api.anthropic.com/v1/messages', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            'x-api-key': apiKey,
            'anthropic-version': '2023-06-01',
        },
        body: JSON.stringify(payload),
    });
    if (!response.ok) {
        // The error body is optional context; failing to read it must not mask the status.
        const detail = await response.text().catch(() => '');
        const suffix = detail ? ` - ${detail}` : '';
        throw new Error(`Anthropic API error: HTTP ${response.status}${suffix}`);
    }
    const data = (await response.json());
    // Blocks without a string `text` contribute nothing.
    let combined = '';
    if (Array.isArray(data?.content)) {
        for (const block of data.content) {
            if (typeof block?.text === 'string') {
                combined += block.text;
            }
        }
    }
    return {
        text: combined.trim(),
        usage: {
            input: Number(data?.usage?.input_tokens || 0),
            output: Number(data?.usage?.output_tokens || 0),
        },
    };
}
135
/**
 * Sends one extraction request to the Google Generative Language
 * `generateContent` endpoint (temperature 0, JSON response MIME type).
 *
 * The system prompt and the page content are sent together as a single user
 * part. The API key travels in the `x-goog-api-key` header rather than in
 * the query string, so it does not end up in URLs that get logged or echoed
 * in error messages.
 *
 * @param apiKey - Caller-supplied Google API key.
 * @param model - Model name (URL-encoded into the path).
 * @param systemPrompt - Extraction instructions.
 * @param userContent - Page content.
 * @returns `{ text, usage: { input, output } }`: the concatenated, trimmed
 *          text parts of the first candidate and the reported token counts
 *          (0 when absent).
 * @throws Error with the HTTP status (and response body, if readable) on a
 *         non-2xx response.
 */
async function callGoogle(apiKey, model, systemPrompt, userContent) {
    const url = `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(model)}:generateContent`;
    const resp = await fetch(url, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            'x-goog-api-key': apiKey,
        },
        body: JSON.stringify({
            contents: [
                {
                    role: 'user',
                    parts: [{ text: `${systemPrompt}\n\n${userContent}` }],
                },
            ],
            generationConfig: {
                temperature: 0,
                responseMimeType: 'application/json',
            },
        }),
    });
    if (!resp.ok) {
        // The error body is optional context; failing to read it must not mask the status.
        const errText = await resp.text().catch(() => '');
        throw new Error(`Google API error: HTTP ${resp.status}${errText ? ` - ${errText}` : ''}`);
    }
    const json = (await resp.json());
    const parts = json?.candidates?.[0]?.content?.parts;
    // Parts without a string `text` contribute nothing.
    const text = Array.isArray(parts)
        ? parts.map((p) => (typeof p?.text === 'string' ? p.text : '')).join('')
        : '';
    return {
        text: String(text || '').trim(),
        usage: {
            input: Number(json?.usageMetadata?.promptTokenCount || 0),
            output: Number(json?.usageMetadata?.candidatesTokenCount || 0),
        },
    };
}
170
+ // ---------------------------------------------------------------------------
171
+ // Public API
172
+ // ---------------------------------------------------------------------------
173
/**
 * Turns page content into structured JSON by asking a caller-funded (BYOK) LLM.
 *
 * The content is capped by `truncateContent`, sent to the chosen provider
 * together with a system prompt built from the schema and/or instruction,
 * and the reply is parsed leniently as JSON.
 *
 * @param content - Page content (markdown or text)
 * @param options - Extraction options: `llmProvider` and `llmApiKey` are
 *                  required, plus at least one of `schema` / `prompt`;
 *                  `llmModel` overrides the provider default.
 * @returns Extracted structured data + token usage
 * @throws Error when a required option is missing, the provider is not
 *         supported, the provider call fails, or the reply is not valid JSON.
 */
export async function extractInlineJson(content, options) {
    const { schema, prompt, llmProvider, llmApiKey, llmModel } = options;
    if (!llmApiKey) {
        throw new Error('Inline extraction requires "llmApiKey" (BYOK)');
    }
    if (!llmProvider) {
        throw new Error('Inline extraction requires "llmProvider" (openai, anthropic, or google)');
    }
    if (!schema && !prompt) {
        throw new Error('Inline extraction requires "schema" or "prompt" (or both)');
    }
    // A blank or whitespace-only model name falls back to the provider default.
    const model = (llmModel || '').trim() || defaultModel(llmProvider);
    const systemPrompt = buildSystemPrompt(schema, prompt);
    const userContent = truncateContent(content);
    const providerCalls = new Map([
        ['openai', callOpenAI],
        ['anthropic', callAnthropic],
        ['google', callGoogle],
    ]);
    const invoke = providerCalls.get(llmProvider);
    if (!invoke) {
        throw new Error(`Unsupported llmProvider: ${llmProvider}`);
    }
    const { text, usage } = await invoke(llmApiKey, model, systemPrompt, userContent);
    return {
        data: parseJsonResponse(text),
        tokensUsed: usage,
    };
}
214
+ //# sourceMappingURL=extract-inline.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extract-inline.js","sourceRoot":"","sources":["../../src/core/extract-inline.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAwBH,SAAS,YAAY,CAAC,QAAqB;IACzC,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,QAAQ;YACX,OAAO,aAAa,CAAC;QACvB,KAAK,WAAW;YACd,OAAO,0BAA0B,CAAC;QACpC,KAAK,QAAQ;YACX,OAAO,kBAAkB,CAAC;IAC9B,CAAC;AACH,CAAC;AAED,SAAS,iBAAiB,CAAC,MAA4B,EAAE,MAAe;IACtE,MAAM,KAAK,GAAa;QACtB,iDAAiD;QACjD,iIAAiI;KAClI,CAAC;IAEF,IAAI,MAAM,EAAE,CAAC;QACX,KAAK,CAAC,IAAI,CAAC,kBAAkB,MAAM,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,IAAI,MAAM,EAAE,CAAC;QACX,KAAK,CAAC,IAAI,CAAC,8DAA8D,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC;IAC9G,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,gCAAgC,CAAC,CAAC;IAE7C,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,eAAe,CAAC,OAAe,EAAE,QAAQ,GAAG,MAAM;IACzD,IAAI,OAAO,CAAC,MAAM,IAAI,QAAQ;QAAE,OAAO,OAAO,CAAC;IAC/C,OAAO,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,GAAG,yBAAyB,CAAC;AAChE,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY;IACrC,yBAAyB;IACzB,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IAAC,MAAM,CAAC;QACP,wCAAwC;QACxC,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;QAC9D,IAAI,UAAU,EAAE,CAAC;YACf,IAAI,CAAC;gBACH,OAAO,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YAC1C,CAAC;YAAC,MAAM,CAAC;gBACP,eAAe;YACjB,CAAC;QACH,CAAC;QAED,sCAAsC;QACtC,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QACvC,IAAI,UAAU,KAAK,CAAC,CAAC,IAAI,QAAQ,GAAG,UAAU,EAAE,CAAC;YAC/C,IAAI,CAAC;gBACH,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,UAAU,EAAE,QAAQ,GAAG,CAAC,CAAC,CAAC,CAAC;YAC1D,CAAC;YAAC,MAAM,CAAC;gBACP,eAAe;YACjB,CAAC;QACH,CAAC;QAED,MAAM,IAAI,KAAK,CAAC,8BAA8B,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;IACtE,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,4DAA4D;AAC5D,8EAA8E;AAE9E,KAAK,UAAU,UAAU,CACvB,MAAc,EACd,KAAa,EACb,YAAoB,EACpB,WAAmB;IAEnB,MAAM,IAAI,GAAG,MAAM,KAAK,CAAC,4CAA4C,EAAE;QACrE,MAAM,EAAE,MAAM;QACd,OAAO,EAAE;YACP,cAAc,EAAE,kBAAkB;YAClC,aAAa,EAAE,UAAU,MAAM,EAAE;SACl
C;QACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,KAAK;YACL,QAAQ,EAAE;gBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE;gBACzC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE;aACvC;YACD,WAAW,EAAE,CAAC;YACd,eAAe,EAAE,EAAE,IAAI,EAAE,aAAa,EAAE;SACzC,CAAC;KACH,CAAC,CAAC;IAEH,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC;QACb,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC;QAClD,MAAM,IAAI,KAAK,CAAC,0BAA0B,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,MAAM,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC5F,CAAC;IAED,MAAM,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,EAAE,CAAQ,CAAC;IACxC,OAAO;QACL,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QAC/D,KAAK,EAAE;YACL,KAAK,EAAE,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,aAAa,IAAI,CAAC,CAAC;YAC9C,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,iBAAiB,IAAI,CAAC,CAAC;SACpD;KACF,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,aAAa,CAC1B,MAAc,EACd,KAAa,EACb,YAAoB,EACpB,WAAmB;IAEnB,MAAM,IAAI,GAAG,MAAM,KAAK,CAAC,uCAAuC,EAAE;QAChE,MAAM,EAAE,MAAM;QACd,OAAO,EAAE;YACP,cAAc,EAAE,kBAAkB;YAClC,WAAW,EAAE,MAAM;YACnB,mBAAmB,EAAE,YAAY;SAClC;QACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,KAAK;YACL,MAAM,EAAE,YAAY;YACpB,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,CAAC;YAClD,UAAU,EAAE,IAAI;YAChB,WAAW,EAAE,CAAC;SACf,CAAC;KACH,CAAC,CAAC;IAEH,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC;QACb,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC;QAClD,MAAM,IAAI,KAAK,CAAC,6BAA6B,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,MAAM,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC/F,CAAC;IAED,MAAM,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,EAAE,CAAQ,CAAC;IACxC,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC;IAChE,MAAM,IAAI,GAAG,MAAM;SAChB,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,EAAE,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;SAC5D,IAAI,CAAC,EAAE,CAAC;SACR,IAAI,EAAE,CAAC;IAEV,OAAO;QACL,IAAI;QACJ,KAAK,EAAE;YACL,KAAK,EAAE,
MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,YAAY,IAAI,CAAC,CAAC;YAC7C,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,aAAa,IAAI,CAAC,CAAC;SAChD;KACF,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,UAAU,CACvB,MAAc,EACd,KAAa,EACb,YAAoB,EACpB,WAAmB;IAEnB,MAAM,GAAG,GAAG,2DAA2D,kBAAkB,CAAC,KAAK,CAAC,wBAAwB,kBAAkB,CAAC,MAAM,CAAC,EAAE,CAAC;IAErJ,MAAM,IAAI,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;QAC5B,MAAM,EAAE,MAAM;QACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;QAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,QAAQ,EAAE;gBACR;oBACE,IAAI,EAAE,MAAM;oBACZ,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,GAAG,YAAY,OAAO,WAAW,EAAE,EAAE,CAAC;iBACvD;aACF;YACD,gBAAgB,EAAE;gBAChB,WAAW,EAAE,CAAC;gBACd,gBAAgB,EAAE,kBAAkB;aACrC;SACF,CAAC;KACH,CAAC,CAAC;IAEH,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC;QACb,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC;QAClD,MAAM,IAAI,KAAK,CAAC,0BAA0B,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,MAAM,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC5F,CAAC;IAED,MAAM,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,EAAE,CAAQ,CAAC;IACxC,MAAM,KAAK,GAAG,IAAI,EAAE,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,KAAK,CAAC;IACpD,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC;QAC/B,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,EAAE,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;QAC7E,CAAC,CAAC,EAAE,CAAC;IAEP,OAAO;QACL,IAAI,EAAE,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QAC/B,KAAK,EAAE;YACL,KAAK,EAAE,MAAM,CAAC,IAAI,EAAE,aAAa,EAAE,gBAAgB,IAAI,CAAC,CAAC;YACzD,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE,aAAa,EAAE,oBAAoB,IAAI,CAAC,CAAC;SAC/D;KACF,CAAC;AACJ,CAAC;AAED,8EAA8E;AAC9E,aAAa;AACb,8EAA8E;AAE9E;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,OAAe,EACf,OAA6B;IAE7B,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,SAAS,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;IAErE,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,IAAI,KAAK,CAAC,+CAA+C,CAAC,CAAC;IACnE,CAAC;IAED,IAAI,CAAC,WAAW,EAAE,CAAC;QACjB,MAAM,IAAI,KAAK,CAAC,yEAAyE,CAAC,CAAC;IAC7F,CAAC;IAED,IAAI,CAAC,MAAM,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,MAAM,IAAI,KAAK,CAAC,2DAA
2D,CAAC,CAAC;IAC/E,CAAC;IAED,MAAM,KAAK,GAAG,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,IAAI,YAAY,CAAC,WAAW,CAAC,CAAC;IACnE,MAAM,YAAY,GAAG,iBAAiB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACvD,MAAM,WAAW,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAE7C,IAAI,MAAkE,CAAC;IAEvE,QAAQ,WAAW,EAAE,CAAC;QACpB,KAAK,QAAQ;YACX,MAAM,GAAG,MAAM,UAAU,CAAC,SAAS,EAAE,KAAK,EAAE,YAAY,EAAE,WAAW,CAAC,CAAC;YACvE,MAAM;QACR,KAAK,WAAW;YACd,MAAM,GAAG,MAAM,aAAa,CAAC,SAAS,EAAE,KAAK,EAAE,YAAY,EAAE,WAAW,CAAC,CAAC;YAC1E,MAAM;QACR,KAAK,QAAQ;YACX,MAAM,GAAG,MAAM,UAAU,CAAC,SAAS,EAAE,KAAK,EAAE,YAAY,EAAE,WAAW,CAAC,CAAC;YACvE,MAAM;QACR;YACE,MAAM,IAAI,KAAK,CAAC,4BAA4B,WAAW,EAAE,CAAC,CAAC;IAC/D,CAAC;IAED,MAAM,IAAI,GAAG,iBAAiB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE5C,OAAO;QACL,IAAI;QACJ,UAAU,EAAE,MAAM,CAAC,KAAK;KACzB,CAAC;AACJ,CAAC"}