webpeel 0.13.4 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/README.md +120 -162
  2. package/dist/cli-auth.js +7 -7
  3. package/dist/cli-auth.js.map +1 -1
  4. package/dist/cli.js +197 -26
  5. package/dist/cli.js.map +1 -1
  6. package/dist/core/auto-extract.d.ts +83 -0
  7. package/dist/core/auto-extract.d.ts.map +1 -0
  8. package/dist/core/auto-extract.js +565 -0
  9. package/dist/core/auto-extract.js.map +1 -0
  10. package/dist/core/deep-fetch.d.ts +75 -0
  11. package/dist/core/deep-fetch.d.ts.map +1 -0
  12. package/dist/core/deep-fetch.js +406 -0
  13. package/dist/core/deep-fetch.js.map +1 -0
  14. package/dist/core/domain-extractors.d.ts +34 -0
  15. package/dist/core/domain-extractors.d.ts.map +1 -0
  16. package/dist/core/domain-extractors.js +654 -0
  17. package/dist/core/domain-extractors.js.map +1 -0
  18. package/dist/core/markdown.d.ts +8 -0
  19. package/dist/core/markdown.d.ts.map +1 -1
  20. package/dist/core/markdown.js +25 -0
  21. package/dist/core/markdown.js.map +1 -1
  22. package/dist/core/quick-answer.d.ts +28 -0
  23. package/dist/core/quick-answer.d.ts.map +1 -0
  24. package/dist/core/quick-answer.js +288 -0
  25. package/dist/core/quick-answer.js.map +1 -0
  26. package/dist/core/readability.d.ts +58 -0
  27. package/dist/core/readability.d.ts.map +1 -0
  28. package/dist/core/readability.js +496 -0
  29. package/dist/core/readability.js.map +1 -0
  30. package/dist/core/search-provider.d.ts.map +1 -1
  31. package/dist/core/search-provider.js +3 -6
  32. package/dist/core/search-provider.js.map +1 -1
  33. package/dist/core/strategies.d.ts.map +1 -1
  34. package/dist/core/strategies.js +70 -5
  35. package/dist/core/strategies.js.map +1 -1
  36. package/dist/core/watch-manager.d.ts +140 -0
  37. package/dist/core/watch-manager.d.ts.map +1 -0
  38. package/dist/core/watch-manager.js +348 -0
  39. package/dist/core/watch-manager.js.map +1 -0
  40. package/dist/core/youtube.d.ts +91 -0
  41. package/dist/core/youtube.d.ts.map +1 -0
  42. package/dist/core/youtube.js +380 -0
  43. package/dist/core/youtube.js.map +1 -0
  44. package/dist/index.d.ts +4 -0
  45. package/dist/index.d.ts.map +1 -1
  46. package/dist/index.js +103 -0
  47. package/dist/index.js.map +1 -1
  48. package/dist/mcp/server.js +58 -16
  49. package/dist/mcp/server.js.map +1 -1
  50. package/dist/server/app.d.ts.map +1 -1
  51. package/dist/server/app.js +19 -1
  52. package/dist/server/app.js.map +1 -1
  53. package/dist/server/routes/deep-fetch.d.ts +9 -0
  54. package/dist/server/routes/deep-fetch.d.ts.map +1 -0
  55. package/dist/server/routes/deep-fetch.js +38 -0
  56. package/dist/server/routes/deep-fetch.js.map +1 -0
  57. package/dist/server/routes/extract.d.ts.map +1 -1
  58. package/dist/server/routes/extract.js +11 -0
  59. package/dist/server/routes/extract.js.map +1 -1
  60. package/dist/server/routes/fetch.d.ts.map +1 -1
  61. package/dist/server/routes/fetch.js +45 -19
  62. package/dist/server/routes/fetch.js.map +1 -1
  63. package/dist/server/routes/mcp.d.ts +2 -1
  64. package/dist/server/routes/mcp.d.ts.map +1 -1
  65. package/dist/server/routes/mcp.js +307 -38
  66. package/dist/server/routes/mcp.js.map +1 -1
  67. package/dist/server/routes/quick-answer.d.ts +9 -0
  68. package/dist/server/routes/quick-answer.d.ts.map +1 -0
  69. package/dist/server/routes/quick-answer.js +84 -0
  70. package/dist/server/routes/quick-answer.js.map +1 -0
  71. package/dist/server/routes/watch.d.ts +16 -0
  72. package/dist/server/routes/watch.d.ts.map +1 -0
  73. package/dist/server/routes/watch.js +219 -0
  74. package/dist/server/routes/watch.js.map +1 -0
  75. package/dist/server/routes/youtube.d.ts +7 -0
  76. package/dist/server/routes/youtube.d.ts.map +1 -0
  77. package/dist/server/routes/youtube.js +87 -0
  78. package/dist/server/routes/youtube.js.map +1 -0
  79. package/dist/types.d.ts +18 -0
  80. package/dist/types.d.ts.map +1 -1
  81. package/dist/types.js.map +1 -1
  82. package/llms.txt +14 -5
  83. package/package.json +1 -1
@@ -0,0 +1,83 @@
1
+ /**
2
+ * Auto-extraction module — heuristic + CSS selector based structured data extraction.
3
+ * No LLM API key required.
4
+ *
5
+ * Supports:
6
+ * - pricing : pricing tables / plan cards
7
+ * - products : product grids / listings
8
+ * - contact : emails, phones, addresses, social links
9
+ * - article : blog posts / news articles
10
+ * - api_docs : REST API endpoint documentation
11
+ * - unknown : fallback when no type is detected
12
+ */
13
/** One plan / tier found on a pricing page. */
export interface PricingPlan {
    /** Plan title taken from the card's heading-like element. */
    name: string;
    /** Price text as it appears on the page (or a "free" marker). */
    price: string;
    /** Billing period suffix such as "/mo", when one was found. */
    period?: string;
    /** Feature bullet texts listed for the plan. */
    features: string[];
    /** Label of the call-to-action link or button, when one was found. */
    cta?: string;
}

/** Result returned for pages classified as pricing pages. */
export interface PricingResult {
    type: 'pricing';
    plans: PricingPlan[];
}

/** One product card found on a listing page. */
export interface ProductItem {
    /** Product title. */
    name: string;
    /** Price text, when present. */
    price?: string;
    /** Product image URL, when present. */
    image?: string;
    /** Link to the product page, when present. */
    url?: string;
    /** Rating expressed as "<value>/5", when present. */
    rating?: string;
}

/** Result returned for pages classified as product listings. */
export interface ProductsResult {
    type: 'products';
    items: ProductItem[];
}

/** Result returned for pages classified as contact pages. */
export interface ContactResult {
    type: 'contact';
    emails: string[];
    phones: string[];
    addresses: string[];
    /** Social profile links keyed by network name (e.g. "twitter", "github"). */
    social: Record<string, string>;
}

/** A heading together with the text that follows it in an article. */
export interface ArticleSection {
    heading: string;
    content: string;
}

/** Result returned for pages classified as articles. */
export interface ArticleResult {
    type: 'article';
    title?: string;
    author?: string;
    date?: string;
    /** Reading time text such as "3 min". */
    readingTime?: string;
    summary?: string;
    sections: ArticleSection[];
}

/** One REST endpoint found in API documentation. */
export interface ApiEndpoint {
    /** HTTP verb, e.g. "GET". */
    method: string;
    /** Request path, e.g. "/v1/users". */
    path: string;
    description?: string;
    params?: string[];
}

/** Result returned for pages classified as API documentation. */
export interface ApiDocsResult {
    type: 'api_docs';
    baseUrl?: string;
    endpoints: ApiEndpoint[];
}

/** Result returned when no page type could be detected. */
export interface UnknownResult {
    type: 'unknown';
}

/** Discriminated union (on `type`) of every auto-extraction result. */
export type AutoExtractResult =
    | PricingResult
    | ProductsResult
    | ContactResult
    | ArticleResult
    | ApiDocsResult
    | UnknownResult;
70
/**
 * Detect the page type from HTML + URL.
 *
 * The return type is the `type` discriminant of {@link AutoExtractResult}
 * ('pricing' | 'products' | 'contact' | 'article' | 'api_docs' | 'unknown'),
 * which remains assignable to `string` for existing callers.
 *
 * @param html Raw HTML of the page.
 * @param url  URL the HTML was fetched from.
 */
export declare function detectPageType(html: string, url: string): AutoExtractResult['type'];
/**
 * `detectPageType` is also available as the module's default export.
 */
export { detectPageType as default };
/**
 * Auto-extract structured data from a web page without an LLM API key.
 *
 * @param html Raw HTML of the page.
 * @param url  URL the HTML was fetched from.
 * @returns A result object whose `type` field identifies which shape it has.
 */
export declare function autoExtract(html: string, url: string): AutoExtractResult;
83
+ //# sourceMappingURL=auto-extract.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"auto-extract.d.ts","sourceRoot":"","sources":["../../src/core/auto-extract.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAQH,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,SAAS,CAAC;IAChB,KAAK,EAAE,WAAW,EAAE,CAAC;CACtB;AAED,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,UAAU,CAAC;IACjB,KAAK,EAAE,WAAW,EAAE,CAAC;CACtB;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,SAAS,CAAC;IAChB,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,SAAS,EAAE,MAAM,EAAE,CAAC;IACpB,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAChC;AAED,MAAM,WAAW,cAAc;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,SAAS,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,cAAc,EAAE,CAAC;CAC5B;AAED,MAAM,WAAW,WAAW;IAC1B,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC;CACnB;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,UAAU,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,WAAW,EAAE,CAAC;CAC1B;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,SAAS,CAAC;CACjB;AAED,MAAM,MAAM,iBAAiB,GACzB,aAAa,GACb,cAAc,GACd,aAAa,GACb,aAAa,GACb,aAAa,GACb,aAAa,CAAC;AA8BlB;;;GAGG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAiFhE;AAycD;;GAEG;AACH,OAAO,EAAE,cAAc,IAAI,OAAO,EAAE,CAAC;AAErC;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,iBAAiB,CAyCxE"}
@@ -0,0 +1,565 @@
1
+ /**
2
+ * Auto-extraction module — heuristic + CSS selector based structured data extraction.
3
+ * No LLM API key required.
4
+ *
5
+ * Supports:
6
+ * - pricing : pricing tables / plan cards
7
+ * - products : product grids / listings
8
+ * - contact : emails, phones, addresses, social links
9
+ * - article : blog posts / news articles
10
+ * - api_docs : REST API endpoint documentation
11
+ * - unknown : fallback when no type is detected
12
+ */
13
+ import { load } from 'cheerio';
14
+ // ---------------------------------------------------------------------------
15
+ // Page type detection
16
+ // ---------------------------------------------------------------------------
17
/** Currency symbol followed by digits, e.g. "$10" or "€ 25". */
const PRICE_INLINE = /(\$|€|£)\s*\d+/;
/** The standalone word "free", case-insensitive. */
const FREE_PLAN = /\bfree\b/i;
/** An upper-case HTTP verb as a whole word. */
const HTTP_METHOD_PATTERN = /\b(GET|POST|PUT|PATCH|DELETE|HEAD|OPTIONS)\b/;
/**
 * A lower-case URL path such as "/v1/users/{id}/posts".
 *
 * Every repeated segment must start with "/" so the repetition is unambiguous.
 * The earlier form `(\/{[^}]+}|\/?[a-z_-]*)*` accepted the same strings but let
 * the engine split one run of characters in exponentially many ways, which
 * froze on inputs like "/--------------------------------" (no word boundary
 * to finish on). The number of capture groups (2) is unchanged.
 */
const URL_PATH_PATTERN = /\/(v\d+\/)?[a-z_-]+(\/\{[^}]+\}[a-z_-]*|\/[a-z_-]*)*\b/;
/** E-mail addresses (global: intended for String.prototype.match). */
const EMAIL_PATTERN = /[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/g;
/** North-American style or "+<country code>" phone numbers (global). */
const PHONE_PATTERN = /(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}|\+\d{1,3}[-.\s]?\d{2,4}[-.\s]?\d{4,}/g;
23
/**
 * Flatten the document's <body> markup into a single line of text.
 *
 * Tags and character entities are each replaced by a space (rather than being
 * removed) so that tokens from neighbouring elements never fuse together,
 * then runs of whitespace are collapsed.
 *
 * @param $ Loaded cheerio document.
 * @returns {string} Whitespace-normalised text; '' when there is no body markup.
 */
function getBodyText($) {
    const markup = $('body').html() || '';
    const withoutTags = markup.replace(/<[^>]+>/g, ' ');
    const withoutEntities = withoutTags.replace(/&[a-z#\d]+;/gi, ' ');
    return withoutEntities.replace(/\s+/g, ' ').trim();
}
28
/**
 * Report whether a URL mentions any of the given keywords.
 *
 * Only the lower-cased pathname is searched when `url` parses as a URL; if it
 * does not parse, the whole lower-cased string is searched instead.
 *
 * @param {string} url Address to inspect.
 * @param {...string} keywords Lower-case fragments to look for.
 * @returns {boolean} True when at least one keyword is contained.
 */
function urlHas(url, ...keywords) {
    let haystack;
    try {
        haystack = new URL(url).pathname.toLowerCase();
    }
    catch {
        haystack = url.toLowerCase();
    }
    return keywords.some((keyword) => haystack.includes(keyword));
}
38
/**
 * Count how many times `pattern` occurs in `text`.
 *
 * A global copy of the pattern is used on purpose: String.prototype.match with
 * a non-global regex returns `[match, ...captureGroups]`, so its length reflects
 * the number of capture groups rather than the number of occurrences.
 */
function countPatternMatches(text, pattern) {
    const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    return (text.match(new RegExp(pattern.source, flags)) || []).length;
}
/**
 * Detect the page type from HTML + URL.
 *
 * Checks run in a fixed order and the first hit wins:
 * pricing → contact → article → api_docs → products → unknown.
 *
 * @param {string} html Raw HTML of the page.
 * @param {string} url  URL the page was fetched from (used for path keywords).
 * @returns {string} One of: 'pricing' | 'products' | 'contact' | 'article' | 'api_docs' | 'unknown'
 */
export function detectPageType(html, url) {
    const $ = load(html);
    // --- Pricing ---
    if (urlHas(url, '/pricing', '/plans', '/packages', '/tiers', '/billing')) {
        return 'pricing';
    }
    const bodyText = getBodyText($);
    const priceMatches = bodyText.match(/(\$|€|£)\s*\d+/g) || [];
    const perPeriodMatches = bodyText.match(/\/(mo|month|year|yr|annual|week)/gi) || [];
    if (priceMatches.length >= 2 && perPeriodMatches.length >= 1) {
        return 'pricing';
    }
    // --- Contact ---
    // Every contact rule needs at least one e-mail address in the body text.
    const emails = bodyText.match(EMAIL_PATTERN) || [];
    if (emails.length > 0) {
        if (urlHas(url, '/contact', '/about', '/reach', '/connect', '/support')) {
            return 'contact';
        }
        const phones = bodyText.match(PHONE_PATTERN) || [];
        const socialLinks = $('a[href*="twitter.com"], a[href*="linkedin.com"], a[href*="github.com"]').length;
        if (phones.length > 0 || socialLinks > 0) {
            return 'contact';
        }
    }
    // --- Article ---
    const hasArticleTag = $('article').length > 0;
    const hasTimeTag = $('time[datetime], time[pubdate]').length > 0;
    const hasAuthorMeta = $('meta[name="author"]').length > 0 ||
        $('[class*="author"], [itemprop="author"]').length > 0;
    if (hasArticleTag || (hasTimeTag && hasAuthorMeta)) {
        return 'article';
    }
    // Single <h1> + multiple paragraphs and a date-ish element
    const h1Count = $('h1').length;
    const paraCount = $('p').length;
    if (h1Count === 1 && paraCount >= 3 && hasTimeTag) {
        return 'article';
    }
    // --- API docs ---
    // Count real occurrences inside code blocks (see countPatternMatches).
    const codeText = $('code, pre').text();
    const httpMethodHits = countPatternMatches(codeText, HTTP_METHOD_PATTERN);
    const urlPathHits = countPatternMatches(codeText, URL_PATH_PATTERN);
    if (httpMethodHits >= 2 && urlPathHits >= 2) {
        return 'api_docs';
    }
    // Also check for common API doc patterns in normal text
    const headingText = $('h1, h2, h3').text();
    if (/endpoint|api reference|rest api|http method/i.test(headingText) &&
        httpMethodHits >= 1) {
        return 'api_docs';
    }
    // --- Products ---
    // Look for repeating card-like structures with prices + images
    const potentialProductContainers = [
        '.product', '.item', '.card', '[class*="product"]', '[class*="item"]', '[class*="card"]',
    ];
    for (const sel of potentialProductContainers) {
        const cards = $(sel);
        if (cards.length >= 3) {
            let withPrice = 0;
            cards.each((_, el) => {
                const text = $(el).text();
                if (PRICE_INLINE.test(text) || FREE_PLAN.test(text)) {
                    withPrice++;
                }
            });
            if (withPrice >= 2) {
                return 'products';
            }
        }
    }
    // Fallback: many <img> elements with adjacent prices
    const imgs = $('img').length;
    if (imgs >= 4 && priceMatches.length >= 3) {
        return 'products';
    }
    return 'unknown';
}
118
+ // ---------------------------------------------------------------------------
119
+ // Pricing extractor
120
+ // ---------------------------------------------------------------------------
121
/**
 * Extract pricing plans from plan/tier cards.
 *
 * The first selector (most specific first) that yields at least two elements
 * mentioning a price or the word "free" supplies the plan containers. When no
 * selector qualifies, the whole page text is scanned via parsePricingFromText.
 *
 * @param $ Loaded cheerio document.
 * @returns Plans with duplicates (same name and price) removed.
 */
function extractPricingPlans($) {
    // Common pricing card selectors (ordered from specific to broad)
    const candidateSelectors = [
        '[class*="pricing-card"]',
        '[class*="price-card"]',
        '[class*="plan-card"]',
        '[class*="tier-card"]',
        '[class*="pricing__plan"]',
        '[class*="plan"]',
        '[class*="pricing-tier"]',
        '[class*="pricing-table"] td',
        '[class*="pricing-table"] th',
        '.card',
        '[class*="col-"]',
    ];
    const mentionsPrice = (_, el) => {
        const cardText = $(el).text();
        return PRICE_INLINE.test(cardText) || FREE_PLAN.test(cardText);
    };
    // Build one plan from a container, or null when it carries no price at all.
    const readPlan = ($card) => {
        const cardText = $card.text().trim();
        const heading = $card.find('h1, h2, h3, h4, h5, h6, [class*="name"], [class*="title"]').first();
        const name = heading.text().trim() || 'Plan';
        const priceMatch = cardText.match(/(\$|€|£|free)\s*[\d,]+(\.\d+)?/i);
        if (!priceMatch && !FREE_PLAN.test(cardText)) {
            return null; // Skip non-price containers
        }
        // Past the guard, a missing numeric price implies the card said "free".
        const price = priceMatch ? priceMatch[0] : 'Free';
        const period = cardText.match(/\/(mo(nth)?|yr|year|week|day|annual)/i)?.[0];
        const features = [];
        $card.find('li').each((_, li) => {
            const featureText = $(li).text().trim();
            if (featureText && featureText.length < 200) {
                features.push(featureText);
            }
        });
        const ctaLabel = $card
            .find('a, button')
            .filter((_, btn) => /get started|sign up|buy|subscribe|choose|select|try|start|upgrade/i.test($(btn).text()))
            .first()
            .text()
            .trim();
        const cta = ctaLabel || undefined;
        return name || price ? { name, price, period, features, cta } : null;
    };
    let planContainers = null;
    for (const selector of candidateSelectors) {
        const priced = $(selector).filter(mentionsPrice);
        if (priced.length >= 2) {
            planContainers = priced;
            break;
        }
    }
    if (!planContainers || planContainers.length === 0) {
        // Last resort: parse entire page for price-like text blocks
        return parsePricingFromText($);
    }
    const plans = [];
    planContainers.each((_, el) => {
        try {
            const plan = readPlan($(el));
            if (plan) {
                plans.push(plan);
            }
        }
        catch {
            // Silently skip malformed containers
        }
    });
    return deduplicatePlans(plans);
}
191
/**
 * Fallback pricing extractor: scan the page text for price-like tokens and
 * report each distinct one as an anonymous plan.
 *
 * Matches are trimmed before de-duplication because the pattern may swallow
 * the whitespace that follows a price ("$10 per user" would otherwise yield
 * "$10 ", distinct from "$10"). Thousands separators are accepted so that
 * "$1,200" is not cut down to "$1".
 *
 * @param $ Loaded cheerio document.
 * @returns One `{ name: 'Plan', price, features: [] }` entry per unique price,
 *          in order of first appearance.
 */
function parsePricingFromText($) {
    const bodyText = getBodyText($);
    const priceRegex = /(\$|€|£)\s*(\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?\s*(?:\/(mo(?:nth)?|yr|year|week|annual))?/gi;
    // Simple heuristic: each unique price = 1 plan
    const uniquePrices = new Set();
    for (const match of bodyText.matchAll(priceRegex)) {
        uniquePrices.add(match[0].trim());
    }
    return [...uniquePrices].map((price) => ({ name: 'Plan', price, features: [] }));
}
208
/**
 * Drop plans that repeat an earlier plan's name and price.
 *
 * @param {Array<{name: string, price: string}>} plans Plans in page order.
 * @returns A new array holding the first plan seen for each name/price pair,
 *          in original order.
 */
function deduplicatePlans(plans) {
    // A Map keeps insertion order, so the first occurrence of each key wins.
    const firstByKey = new Map();
    for (const plan of plans) {
        const key = `${plan.name}|${plan.price}`;
        if (!firstByKey.has(key)) {
            firstByKey.set(key, plan);
        }
    }
    return [...firstByKey.values()];
}
218
+ // ---------------------------------------------------------------------------
219
+ // Products extractor
220
+ // ---------------------------------------------------------------------------
221
/**
 * Turn an image/link reference found in the page into an absolute URL.
 *
 * Absolute http(s) references are returned untouched. Everything else is
 * resolved against the page URL with the URL constructor, so protocol-relative
 * ("//cdn..."), root-relative ("/x"), page-relative ("img/x", "../x") and
 * "data:" references all come out right. If the page URL itself is unusable
 * the reference is returned as a root-relative path.
 */
function resolveProductUrl(ref, pageUrl) {
    if (!ref) {
        return undefined;
    }
    if (/^https?:\/\//i.test(ref)) {
        return ref;
    }
    try {
        return new URL(ref, pageUrl).href;
    }
    catch {
        return ref.startsWith('/') ? ref : `/${ref}`;
    }
}
/**
 * Extract product cards (name, price, image, link, rating) from a listing page.
 *
 * The first selector that yields at least two elements containing both a
 * price (or the word "free") and an <img> supplies the product containers.
 * Containers without a heading-like name are skipped.
 *
 * @param $ Loaded cheerio document.
 * @param {string} baseUrl URL of the page; relative references resolve against it.
 * @returns Up to 100 product items; an empty array when no containers qualify.
 */
function extractProducts($, baseUrl) {
    const items = [];
    const containerSelectors = [
        '[class*="product"]',
        '[class*="item"]',
        '[class*="card"]',
        'li',
        'article',
    ];
    let containers = null;
    for (const sel of containerSelectors) {
        const found = $(sel).filter((_, el) => {
            const text = $(el).text();
            return (PRICE_INLINE.test(text) || FREE_PLAN.test(text)) && $(el).find('img').length > 0;
        });
        if (found.length >= 2) {
            containers = found;
            break;
        }
    }
    if (!containers || containers.length === 0) {
        return items;
    }
    containers.each((_, el) => {
        try {
            const $el = $(el);
            // Name
            const name = $el.find('h1,h2,h3,h4,h5,h6,[class*="name"],[class*="title"]').first().text().trim();
            if (!name) {
                return;
            }
            const cardText = $el.text();
            // Price
            const priceMatch = cardText.match(/(\$|€|£)\s*[\d,]+(\.\d+)?/);
            const price = priceMatch ? priceMatch[0].trim() : undefined;
            // Image (lazy-loading attributes are consulted when src is absent)
            const imgEl = $el.find('img').first();
            const imgSrc = imgEl.attr('src') || imgEl.attr('data-src') || imgEl.attr('data-lazy');
            const image = resolveProductUrl(imgSrc, baseUrl);
            // URL
            const url = resolveProductUrl($el.find('a').first().attr('href'), baseUrl);
            // Rating
            const ratingMatch = cardText.match(/(\d(\.\d)?)\s*(\/\s*5|stars?|★)/i);
            const rating = ratingMatch ? `${ratingMatch[1]}/5` : undefined;
            items.push({ name, price, image, url, rating });
        }
        catch {
            // Skip malformed
        }
    });
    return items.slice(0, 100); // cap at 100
}
289
+ // ---------------------------------------------------------------------------
290
+ // Contact extractor
291
+ // ---------------------------------------------------------------------------
292
+ const SOCIAL_DOMAINS = {
293
+ 'twitter.com': 'twitter',
294
+ 'x.com': 'twitter',
295
+ 'linkedin.com': 'linkedin',
296
+ 'github.com': 'github',
297
+ 'facebook.com': 'facebook',
298
+ 'instagram.com': 'instagram',
299
+ 'youtube.com': 'youtube',
300
+ 'tiktok.com': 'tiktok',
301
+ 'discord.gg': 'discord',
302
+ 'discord.com': 'discord',
303
+ };
304
+ const ADDRESS_PATTERN = /\d{1,5}\s+[A-Za-z0-9\s,\.]+(?:street|st|avenue|ave|road|rd|blvd|boulevard|lane|ln|drive|dr|court|ct|way|wy|place|pl)\b[^<\n]{0,80}/i;
305
+ function extractContact($) {
306
+ const bodyText = getBodyText($);
307
+ // Emails
308
+ const emailMatches = bodyText.match(EMAIL_PATTERN) || [];
309
+ const emails = [
310
+ ...new Set(emailMatches.map((e) => e.toLowerCase())),
311
+ ];
312
+ // Phones
313
+ const phoneMatches = bodyText.match(PHONE_PATTERN) || [];
314
+ const phones = [...new Set(phoneMatches.map((p) => p.trim()))];
315
+ // Addresses
316
+ const addresses = [];
317
+ $('[class*="address"], [itemprop="address"], address').each((_, el) => {
318
+ const addr = $(el).text().replace(/\s+/g, ' ').trim();
319
+ if (addr.length > 10)
320
+ addresses.push(addr);
321
+ });
322
+ // Also regex-based
323
+ const addrMatch = bodyText.match(ADDRESS_PATTERN);
324
+ if (addrMatch) {
325
+ const addr = addrMatch[0].trim();
326
+ if (!addresses.some((a) => a.includes(addr.substring(0, 10)))) {
327
+ addresses.push(addr);
328
+ }
329
+ }
330
+ // Social links
331
+ const social = {};
332
+ $('a[href]').each((_, el) => {
333
+ const href = $(el).attr('href') || '';
334
+ for (const [domain, key] of Object.entries(SOCIAL_DOMAINS)) {
335
+ if (href.includes(domain) && !social[key]) {
336
+ social[key] = href;
337
+ }
338
+ }
339
+ });
340
+ return { type: 'contact', emails, phones, addresses, social };
341
+ }
342
+ // ---------------------------------------------------------------------------
343
+ // Article extractor
344
+ // ---------------------------------------------------------------------------
345
/**
 * Extract article metadata (title, author, date, reading time, summary) and
 * the article body split into heading-delimited sections.
 *
 * Each field tries several sources in order and takes the first non-empty
 * one. Sections are built from the h2/h3 headings inside the first <article>
 * (or, failing that, the first <main>): a section's content is the text of the
 * siblings that follow its heading up to the next h2/h3.
 *
 * @param $ Loaded cheerio document.
 * @returns A `{ type: 'article', ... }` result object.
 */
function extractArticle($) {
    // Trimmed text of the first element matching a selector ('' when none).
    const firstText = (selector) => $(selector).first().text().trim();
    const title = firstText('h1') ||
        $('meta[property="og:title"]').attr('content') ||
        $('title').text().trim() ||
        undefined;
    const author = $('meta[name="author"]').attr('content') ||
        firstText('[itemprop="author"]') ||
        firstText('[class*="author"]') ||
        firstText('[rel="author"]') ||
        undefined;
    const date = $('time[datetime]').first().attr('datetime') ||
        $('time[pubdate]').first().attr('datetime') ||
        $('meta[name="date"]').attr('content') ||
        $('meta[property="article:published_time"]').attr('content') ||
        firstText('time') ||
        undefined;
    // Prefer a reading time printed on the page; otherwise estimate one.
    const printedReadingTime = $('[class*="reading-time"], [class*="read-time"], [class*="readtime"]').first();
    const readingTime = printedReadingTime.length
        ? printedReadingTime.text().trim()
        : estimateReadingTime($);
    // Content root: first <article>, else first <main>.
    const articleRoot = $('article').first();
    const contentRoot = articleRoot.length ? articleRoot : $('main').first();
    // Summary: first two sentences of the opening paragraph (or a meta description).
    const leadText = contentRoot.find('p').first().text().trim() ||
        $('meta[name="description"]').attr('content') ||
        $('meta[property="og:description"]').attr('content') ||
        '';
    const summary = leadText ? extractFirstSentences(leadText, 2) : undefined;
    // Sections: h2/h3 + following content
    const sections = [];
    contentRoot.find('h2, h3').each((_, headingEl) => {
        const heading = $(headingEl).text().trim();
        if (!heading) {
            return;
        }
        const parts = [];
        for (let node = $(headingEl).next(); node.length && !node.is('h2, h3'); node = node.next()) {
            const nodeText = node.text().trim();
            if (nodeText) {
                parts.push(nodeText);
            }
        }
        if (parts.length > 0) {
            sections.push({ heading, content: parts.join(' ') });
        }
    });
    return { type: 'article', title, author, date, readingTime, summary, sections };
}
397
/**
 * Return the first `count` sentences of `text`.
 *
 * A sentence ends at a run of '.', '!' or '?' followed by whitespace or by the
 * end of the text. Accepting end-of-text matters: without it the last sentence
 * of a paragraph is never counted, so "First. Second." with count = 2 would be
 * cut down to "First.".
 *
 * @param {string} text  Source text.
 * @param {number} count Number of sentences wanted.
 * @returns {string} The leading sentences, trimmed; when no sentence terminator
 *                   is found at all, the first 300 characters, trimmed.
 */
function extractFirstSentences(text, count) {
    const sentenceEnd = /[.!?]+(?:\s+|$)/g;
    let cutoff = 0;
    let sentenceCount = 0;
    for (const match of text.matchAll(sentenceEnd)) {
        cutoff = match.index + match[0].length;
        sentenceCount++;
        if (sentenceCount >= count) {
            break;
        }
    }
    return sentenceCount > 0 ? text.slice(0, cutoff).trim() : text.slice(0, 300).trim();
}
410
/**
 * Estimate the reading time of the page's main content at 200 words/minute.
 *
 * Content regions are probed one selector at a time, most specific first.
 * A single combined selector ('article, main, ..., body') followed by
 * `.first()` would not express that preference: matches come back in document
 * order, and <body> precedes everything it contains, so the whole page would
 * always be measured.
 *
 * @param $ Loaded cheerio document.
 * @returns {string} A label such as "3 min" (never less than "1 min").
 */
function estimateReadingTime($) {
    const wordsPerMinute = 200;
    const regionSelectors = ['article', 'main', '[class*="content"]', 'body'];
    let text = '';
    for (const selector of regionSelectors) {
        const region = $(selector).first();
        if (region.length) {
            text = region.text();
            break;
        }
    }
    const wordCount = text.split(/\s+/).filter(Boolean).length;
    const minutes = Math.max(1, Math.ceil(wordCount / wordsPerMinute));
    return `${minutes} min`;
}
417
+ // ---------------------------------------------------------------------------
418
+ // API docs extractor
419
+ // ---------------------------------------------------------------------------
420
+ const HTTP_METHODS = ['GET', 'POST', 'PUT', 'PATCH', 'DELETE', 'HEAD', 'OPTIONS'];
421
+ function extractApiDocs($, url) {
422
+ const endpoints = [];
423
+ // Try to detect base URL from page or URL
424
+ let baseUrl;
425
+ const pageText = getBodyText($);
426
+ const baseUrlMatch = pageText.match(/https?:\/\/api\.[a-zA-Z0-9.-]+/);
427
+ if (baseUrlMatch) {
428
+ baseUrl = baseUrlMatch[0];
429
+ }
430
+ else {
431
+ try {
432
+ const parsed = new URL(url);
433
+ baseUrl = `${parsed.protocol}//api.${parsed.hostname}`;
434
+ }
435
+ catch {
436
+ baseUrl = undefined;
437
+ }
438
+ }
439
+ // Strategy 1: Parse code blocks for HTTP method + path patterns
440
+ $('code, pre').each((_, el) => {
441
+ const text = $(el).text().trim();
442
+ const lines = text.split(/\n/);
443
+ for (const line of lines) {
444
+ const trimmed = line.trim();
445
+ for (const method of HTTP_METHODS) {
446
+ if (trimmed.startsWith(method + ' ') || trimmed.startsWith(method + '\t')) {
447
+ const rest = trimmed.slice(method.length).trim();
448
+ // Extract path (first URL-like token)
449
+ const pathMatch = rest.match(/^(https?:\/\/[^\s]+|\/[^\s]*)/);
450
+ if (pathMatch) {
451
+ let path = pathMatch[0];
452
+ // Normalize: strip base URL prefix if present
453
+ if (baseUrl && path.startsWith(baseUrl)) {
454
+ path = path.slice(baseUrl.length);
455
+ }
456
+ // Strip query string
457
+ path = path.split('?')[0];
458
+ // Try to find a description — look at nearest heading above this code block
459
+ const description = findNearestHeading($(el)) || undefined;
460
+ endpoints.push({ method, path, description });
461
+ }
462
+ }
463
+ }
464
+ }
465
+ });
466
+ // Strategy 2: Scan for method badges + inline paths in regular text
467
+ $('[class*="method"], [class*="http-method"], .badge, .label').each((_, el) => {
468
+ const methodText = $(el).text().trim().toUpperCase();
469
+ if (!HTTP_METHODS.includes(methodText))
470
+ return;
471
+ // Look for adjacent path element
472
+ const siblings = [
473
+ $(el).next('[class*="path"], [class*="endpoint"], [class*="route"], code'),
474
+ $(el).parent().find('code').first(),
475
+ ];
476
+ for (const sibling of siblings) {
477
+ if (sibling.length) {
478
+ const path = sibling.text().trim();
479
+ if (URL_PATH_PATTERN.test(path)) {
480
+ endpoints.push({ method: methodText, path });
481
+ break;
482
+ }
483
+ }
484
+ }
485
+ });
486
+ // Deduplicate by method+path
487
+ const seen = new Set();
488
+ const unique = endpoints.filter((ep) => {
489
+ const key = `${ep.method}:${ep.path}`;
490
+ if (seen.has(key))
491
+ return false;
492
+ seen.add(key);
493
+ return true;
494
+ });
495
+ return { type: 'api_docs', baseUrl, endpoints: unique };
496
+ }
497
/**
 * Find the closest heading that precedes an element.
 *
 * The search walks backwards through previous siblings; when a level runs out
 * of siblings it climbs to the parent of the level it was scanning and goes on
 * from that parent's previous sibling. The climb is tracked in `anchor` so
 * each exhausted level moves one step further up; restarting from the original
 * element's parent every time would re-check the same position and never
 * reach a heading that sits more than one level up.
 *
 * @param $el Cheerio selection to start from.
 * @returns {string|null} Trimmed heading text, or null when no heading is found
 *                        within 5 steps or the top of the tree is reached.
 */
function findNearestHeading($el) {
    const maxSteps = 5;
    let anchor = $el;
    let current = $el.prev();
    for (let step = 0; step < maxSteps; step++) {
        if (current.length === 0) {
            anchor = anchor.parent();
            if (!anchor.length) {
                break;
            }
            current = anchor.prev();
        }
        else if (current.is('h1,h2,h3,h4,h5,h6')) {
            return current.text().trim();
        }
        else {
            current = current.prev();
        }
    }
    return null;
}
518
+ // ---------------------------------------------------------------------------
519
+ // Main entry points
520
+ // ---------------------------------------------------------------------------
521
+ /**
522
+ * Detect the type of a web page based on HTML content and URL.
523
+ */
524
+ export { detectPageType as default };
525
+ /**
526
+ * Auto-extract structured data from a web page without an LLM API key.
527
+ */
528
+ export function autoExtract(html, url) {
529
+ const type = detectPageType(html, url);
530
+ const $ = load(html);
531
+ try {
532
+ switch (type) {
533
+ case 'pricing':
534
+ return { type: 'pricing', plans: extractPricingPlans($) };
535
+ case 'products':
536
+ return { type: 'products', items: extractProducts($, url) };
537
+ case 'contact':
538
+ return extractContact($);
539
+ case 'article':
540
+ return extractArticle($);
541
+ case 'api_docs':
542
+ return extractApiDocs($, url);
543
+ default:
544
+ return { type: 'unknown' };
545
+ }
546
+ }
547
+ catch {
548
+ // Return partial/empty result rather than crashing
549
+ switch (type) {
550
+ case 'pricing':
551
+ return { type: 'pricing', plans: [] };
552
+ case 'products':
553
+ return { type: 'products', items: [] };
554
+ case 'contact':
555
+ return { type: 'contact', emails: [], phones: [], addresses: [], social: {} };
556
+ case 'article':
557
+ return { type: 'article', sections: [] };
558
+ case 'api_docs':
559
+ return { type: 'api_docs', endpoints: [] };
560
+ default:
561
+ return { type: 'unknown' };
562
+ }
563
+ }
564
+ }
565
+ //# sourceMappingURL=auto-extract.js.map