@pinkpixel/sugarstitch 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -0
- package/LICENSE +21 -0
- package/OVERVIEW.md +306 -0
- package/README.md +462 -0
- package/assets/banner_dark.png +0 -0
- package/assets/banner_light.png +0 -0
- package/assets/logo.png +0 -0
- package/assets/screenshot_cli.png +0 -0
- package/assets/screenshot_completed.png +0 -0
- package/assets/screenshot_homepage.png +0 -0
- package/assets/screenshot_scraping.png +0 -0
- package/dist/index.js +216 -0
- package/dist/scraper.js +719 -0
- package/dist/server.js +1272 -0
- package/package.json +26 -0
- package/public/favicon.png +0 -0
- package/scripts/add-shebang.js +11 -0
- package/src/index.ts +217 -0
- package/src/scraper.ts +903 -0
- package/src/server.ts +1319 -0
- package/tsconfig.json +12 -0
- package/website/astro.config.mjs +5 -0
- package/website/package-lock.json +6358 -0
- package/website/package.json +18 -0
- package/website/public/banner_dark.png +0 -0
- package/website/public/banner_light.png +0 -0
- package/website/public/favicon.png +0 -0
- package/website/public/screenshot_cli.png +0 -0
- package/website/public/screenshot_completed.png +0 -0
- package/website/public/screenshot_homepage.png +0 -0
- package/website/public/screenshot_scraping.png +0 -0
- package/website/src/layouts/DocsLayout.astro +142 -0
- package/website/src/pages/docs/install.astro +96 -0
- package/website/src/pages/docs/use-the-app.astro +131 -0
- package/website/src/pages/index.astro +94 -0
- package/website/src/styles/site.css +611 -0
- package/website/tsconfig.json +3 -0
- package/website/wrangler.toml +6 -0
package/dist/scraper.js
ADDED
|
@@ -0,0 +1,719 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
36
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
37
|
+
};
|
|
38
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
|
+
exports.DEFAULT_PROFILES_FILE = void 0;
|
|
40
|
+
exports.sanitizeFilename = sanitizeFilename;
|
|
41
|
+
exports.normalizeUrl = normalizeUrl;
|
|
42
|
+
exports.dedupeStrings = dedupeStrings;
|
|
43
|
+
exports.getSelectorPresets = getSelectorPresets;
|
|
44
|
+
exports.isSelectorPresetId = isSelectorPresetId;
|
|
45
|
+
exports.getSelectorPreset = getSelectorPreset;
|
|
46
|
+
exports.loadSiteProfiles = loadSiteProfiles;
|
|
47
|
+
exports.sanitizeSelectorOverrides = sanitizeSelectorOverrides;
|
|
48
|
+
exports.loadExistingData = loadExistingData;
|
|
49
|
+
exports.previewPattern = previewPattern;
|
|
50
|
+
exports.scrapeUrls = scrapeUrls;
|
|
51
|
+
const axios_1 = __importDefault(require("axios"));
|
|
52
|
+
const cheerio = __importStar(require("cheerio"));
|
|
53
|
+
const fs = __importStar(require("fs/promises"));
|
|
54
|
+
const fs_1 = require("fs");
|
|
55
|
+
const path = __importStar(require("path"));
|
|
56
|
+
const promises_1 = require("stream/promises");
|
|
57
|
+
/** Headers sent with this module's axios requests; presents a desktop Chrome User-Agent. */
const REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
};
/** Timeout handed to axios for each request, in milliseconds. */
const REQUEST_TIMEOUT_MS = 15 * 1000;
/** Image extensions kept as-is when naming downloads; anything else falls back to `.jpg`. */
const IMAGE_EXTENSIONS = new Set([
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.webp',
    '.svg',
    '.bmp',
    '.avif'
]);
/** Pause between two consecutive page scrapes, in milliseconds. */
const DEFAULT_DELAY_MS = 2 * 1000;
/** Crawl defaults used when the caller's crawl options omit the corresponding field. */
const DEFAULT_CRAWL_MAX_DEPTH = 2;
const DEFAULT_CRAWL_MAX_URLS = 100;
const DEFAULT_CRAWL_MAX_PAGINATION_PAGES = 20;
/** File name looked up in the current working directory when no profiles path is given. */
exports.DEFAULT_PROFILES_FILE = 'sugarstitch.profiles.json';
/** Containers tried, in order, when extracting the readable text of a page. */
const ARTICLE_TEXT_SELECTORS = [
    '.entry-content',
    '.wp-block-post-content',
    '.post-content',
    '.article-body',
    'article',
    'main'
];
|
|
68
|
+
/**
 * Built-in selector presets, keyed by preset id.
 * Every preset lists candidate CSS selectors per field; consumers try them in array order.
 */
const SELECTOR_PRESETS = {
    // Broad fallback for arbitrary article-like pages.
    generic: {
        id: 'generic',
        label: 'Generic / Custom',
        description: 'A broad default for custom sites and general article-style pattern pages.',
        titleSelectors: [
            'h1'
        ],
        descriptionSelectors: [
            '.entry-content p',
            '.post-content p',
            '.article-body p',
            'main p'
        ],
        materialsSelectors: [
            '.materials-list li',
            '.pattern-materials li',
            '.supply-list li'
        ],
        instructionsSelectors: [
            '.instructions-step',
            '.instruction-step',
            '.pattern-steps li',
            'ol li'
        ],
        imageSelectors: [
            '.entry-content img',
            '.post-content img',
            '.article-body img',
            'main img'
        ]
    },
    // WordPress blog posts.
    wordpress: {
        id: 'wordpress',
        label: 'WordPress Article',
        description: 'Best for blog-style posts using common WordPress article wrappers.',
        titleSelectors: [
            'h1.entry-title',
            'article h1',
            'h1'
        ],
        descriptionSelectors: [
            '.entry-content p',
            '.wp-block-post-content p',
            '.post-content p'
        ],
        materialsSelectors: [
            '.entry-content ul li',
            '.wp-block-post-content ul li',
            '.materials-list li'
        ],
        instructionsSelectors: [
            '.entry-content ol li',
            '.wp-block-post-content ol li',
            '.instructions-step'
        ],
        imageSelectors: [
            '.entry-content img',
            '.wp-block-post-content img',
            '.post-content img'
        ]
    },
    // WooCommerce product pages.
    woocommerce: {
        id: 'woocommerce',
        label: 'WooCommerce Product',
        description: 'Best for WooCommerce product pages with product galleries and description areas.',
        titleSelectors: [
            '.product_title',
            '.entry-summary h1',
            'h1.product_title',
            'h1'
        ],
        descriptionSelectors: [
            '.woocommerce-product-details__short-description p',
            '.woocommerce-Tabs-panel--description p',
            '.entry-summary p'
        ],
        materialsSelectors: [
            '.woocommerce-Tabs-panel--description ul li',
            '.woocommerce-Tabs-panel ul li',
            '.product_meta + div ul li'
        ],
        instructionsSelectors: [
            '.woocommerce-Tabs-panel--description ol li',
            '.woocommerce-Tabs-panel ol li',
            '.instruction-step'
        ],
        imageSelectors: [
            '.woocommerce-product-gallery img',
            '.images img',
            '.woocommerce-product-gallery__image img'
        ]
    }
};
|
|
112
|
+
/**
 * Fallback logger used when the caller supplies none: prints the message to stdout.
 * @param {string} message - Text to print.
 * @returns {void}
 */
function defaultLogger(message) {
    const sink = console;
    sink.log(message);
}
|
|
115
|
+
/**
 * Turns an arbitrary title into a lowercase, underscore-separated name.
 * Runs of characters outside `a-z`/`0-9` become a single underscore and
 * leading/trailing separators are dropped.
 * @param {string} name - Raw title.
 * @returns {string} The slug, or `'untitled_pattern'` when nothing usable remains.
 */
function sanitizeFilename(name) {
    const slug = name
        .split(/[^a-z0-9]+/i)
        .filter((piece) => piece.length > 0)
        .join('_')
        .toLowerCase();
    return slug === '' ? 'untitled_pattern' : slug;
}
|
|
123
|
+
/**
 * Parses user input as an absolute URL and returns its canonical `href`.
 * @param {string} input - Candidate URL; surrounding whitespace is ignored.
 * @returns {string|null} The normalized URL, or `null` when the input cannot be
 *   parsed or is not an http(s) URL.
 */
function normalizeUrl(input) {
    let parsed;
    try {
        parsed = new URL(input.trim());
    }
    catch {
        return null;
    }
    const isWebUrl = parsed.protocol === 'http:' || parsed.protocol === 'https:';
    return isWebUrl ? parsed.href : null;
}
|
|
132
|
+
/**
 * Removes duplicate entries while keeping first-seen order.
 * @param {Iterable<string>} items - Values to deduplicate.
 * @returns {string[]} A new array with each distinct value once.
 */
function dedupeStrings(items) {
    const unique = new Set(items);
    return Array.from(unique);
}
|
|
135
|
+
/**
 * Lists every built-in selector preset in declaration order.
 * @returns {object[]} The preset objects stored in `SELECTOR_PRESETS`.
 */
function getSelectorPresets() {
    return Object.keys(SELECTOR_PRESETS).map((presetId) => SELECTOR_PRESETS[presetId]);
}
|
|
138
|
+
/**
 * Tells whether `value` is the id of a built-in selector preset.
 *
 * Only own keys of `SELECTOR_PRESETS` count. The `in` operator would also
 * accept inherited names such as `"toString"` or `"constructor"`, letting
 * bogus preset ids pass validation.
 * @param {unknown} value - Candidate preset id.
 * @returns {boolean} `true` only for a string that names a declared preset.
 */
function isSelectorPresetId(value) {
    return typeof value === 'string'
        && Object.prototype.hasOwnProperty.call(SELECTOR_PRESETS, value);
}
|
|
141
|
+
/**
 * Looks up a built-in selector preset by id.
 *
 * The lookup is restricted to own keys so that ids like `"toString"` do not
 * return members of `Object.prototype`.
 * @param {string} [presetId='generic'] - Preset id to fetch.
 * @returns {object|undefined} The preset, or `undefined` for an unknown id.
 */
function getSelectorPreset(presetId = 'generic') {
    return Object.prototype.hasOwnProperty.call(SELECTOR_PRESETS, presetId)
        ? SELECTOR_PRESETS[presetId]
        : undefined;
}
|
|
144
|
+
/**
 * Reads and validates the site profiles file.
 *
 * @param {string} [profilesPath] - Path of the JSON file; defaults to
 *   `DEFAULT_PROFILES_FILE` inside the current working directory.
 * @returns {Promise<object[]>} Normalized profiles (`id`, `label`, `description`,
 *   `preset`, `selectorOverrides`); an empty array when the file does not exist.
 * @throws {Error} When the file is not valid JSON, has no `profiles` array, a
 *   profile lacks `id`/`label`, or a profile names an unknown preset. Other
 *   read errors are rethrown unchanged.
 */
async function loadSiteProfiles(profilesPath = path.resolve(process.cwd(), exports.DEFAULT_PROFILES_FILE)) {
    try {
        const fileContent = await fs.readFile(profilesPath, 'utf-8');
        const parsed = JSON.parse(fileContent);
        // `null` is valid JSON; optional chaining routes it to the validation
        // error below instead of a raw TypeError on property access.
        if (!Array.isArray(parsed?.profiles)) {
            throw new Error('Profiles file must contain a "profiles" array.');
        }
        return parsed.profiles.map(profile => {
            if (!profile?.id || !profile?.label) {
                throw new Error('Each profile must include both "id" and "label".');
            }
            if (profile.preset && !isSelectorPresetId(profile.preset)) {
                throw new Error(`Profile "${profile.id}" references unknown preset "${profile.preset}".`);
            }
            return {
                id: profile.id,
                label: profile.label,
                description: profile.description,
                preset: profile.preset ?? 'generic',
                selectorOverrides: sanitizeSelectorOverrides(profile.selectorOverrides)
            };
        });
    }
    catch (error) {
        // A missing profiles file simply means "no profiles configured".
        if (error?.code === 'ENOENT') {
            return [];
        }
        if (error instanceof SyntaxError) {
            throw new Error(`Profiles file contains invalid JSON: ${error.message}`);
        }
        throw error;
    }
}
|
|
177
|
+
/**
 * Trims selector overrides and drops the ones that are empty.
 *
 * Values that are not strings (for example numbers or arrays coming from a
 * hand-edited JSON profile) are ignored rather than raising a
 * "trim is not a function" TypeError.
 * @param {Record<string, unknown>|null|undefined} overrides - Raw overrides.
 * @returns {Record<string, string>|undefined} The cleaned overrides, or
 *   `undefined` when nothing usable remains.
 */
function sanitizeSelectorOverrides(overrides) {
    if (!overrides)
        return undefined;
    const cleaned = {};
    for (const [key, value] of Object.entries(overrides)) {
        if (typeof value !== 'string')
            continue;
        const trimmed = value.trim();
        if (trimmed) {
            cleaned[key] = trimmed;
        }
    }
    return Object.keys(cleaned).length > 0 ? cleaned : undefined;
}
|
|
189
|
+
/**
 * Merges user overrides into a preset's selector lists.
 * For each field, a non-empty override is tried first and the preset's
 * selectors remain as fallbacks; without an override the preset's own array
 * is returned untouched.
 * @param {object} preset - Preset providing the five `*Selectors` arrays.
 * @param {object} [overrides] - Optional single-selector overrides.
 * @returns {object} Selector lists for title, description, materials,
 *   instructions and images.
 */
function resolveSelectors(preset, overrides) {
    const active = sanitizeSelectorOverrides(overrides);
    const prioritize = (override, defaults) => {
        if (!override) {
            return defaults;
        }
        return [override, ...defaults];
    };
    return {
        titleSelectors: prioritize(active?.titleSelector, preset.titleSelectors),
        descriptionSelectors: prioritize(active?.descriptionSelector, preset.descriptionSelectors),
        materialsSelectors: prioritize(active?.materialsSelector, preset.materialsSelectors),
        instructionsSelectors: prioritize(active?.instructionsSelector, preset.instructionsSelectors),
        imageSelectors: prioritize(active?.imageSelector, preset.imageSelectors)
    };
}
|
|
199
|
+
/**
 * Works out which preset and selector overrides a run should use.
 *
 * With `options.profileId`, the profile is loaded from the profiles file and
 * its overrides are merged with `options.selectorOverrides` (the latter win).
 * Otherwise `options.preset` (default `'generic'`) is used directly.
 *
 * The profiles file is read only when a profile is requested, so an unrelated
 * or malformed profiles file cannot abort runs that do not use profiles.
 *
 * @param {object} options - `preset`, `selectorOverrides`, `profileId`, `profilesPath`.
 * @returns {Promise<object>} `{ presetId, preset, selectorOverrides }` plus
 *   `profileId`/`profileLabel` when a profile was applied.
 * @throws {Error} When the profile id or the preset id is unknown, or when
 *   loading the profiles file fails.
 */
async function resolveStrategy(options) {
    if (options.profileId) {
        const profilesPath = options.profilesPath ?? path.resolve(process.cwd(), exports.DEFAULT_PROFILES_FILE);
        const availableProfiles = await loadSiteProfiles(profilesPath);
        const profile = availableProfiles.find(item => item.id === options.profileId);
        if (!profile) {
            throw new Error(`Unknown profile "${options.profileId}" in ${profilesPath}.`);
        }
        const profilePresetId = profile.preset ?? 'generic';
        return {
            presetId: profilePresetId,
            preset: getSelectorPreset(profilePresetId),
            selectorOverrides: sanitizeSelectorOverrides({
                ...profile.selectorOverrides,
                ...options.selectorOverrides
            }),
            profileId: profile.id,
            profileLabel: profile.label
        };
    }
    const presetId = options.preset ?? 'generic';
    // Fail here with a clear message; otherwise `preset` would be undefined and
    // the first read of `strategy.preset.label` would throw an opaque TypeError.
    if (!isSelectorPresetId(presetId)) {
        throw new Error(`Unknown selector preset "${presetId}".`);
    }
    return {
        presetId,
        preset: getSelectorPreset(presetId),
        selectorOverrides: sanitizeSelectorOverrides(options.selectorOverrides)
    };
}
|
|
226
|
+
/**
 * Loads previously scraped records from the JSON output file.
 * @param {string} outputPath - Path of the output file.
 * @returns {Promise<Array>} The stored array, or `[]` when the file does not exist.
 * @throws {Error} When the file holds invalid JSON or something other than an
 *   array; other read errors are rethrown unchanged.
 */
async function loadExistingData(outputPath) {
    let raw;
    try {
        raw = await fs.readFile(outputPath, 'utf-8');
    }
    catch (error) {
        if (error?.code === 'ENOENT') {
            return [];
        }
        throw error;
    }
    let records;
    try {
        records = JSON.parse(raw);
    }
    catch (error) {
        if (error instanceof SyntaxError) {
            throw new Error(`Output file contains invalid JSON: ${error.message}`);
        }
        throw error;
    }
    if (Array.isArray(records)) {
        return records;
    }
    throw new Error('Output file must contain a JSON array.');
}
|
|
245
|
+
/**
 * Picks the file extension to use when saving an image.
 * @param {string} fileUrl - Absolute image URL.
 * @returns {string} The URL path's lowercase extension when it is listed in
 *   `IMAGE_EXTENSIONS`; otherwise (or when the URL cannot be parsed) `'.jpg'`.
 */
function getImageExtension(fileUrl) {
    const fallback = '.jpg';
    let extension;
    try {
        const { pathname } = new URL(fileUrl);
        extension = path.extname(pathname).toLowerCase();
    }
    catch {
        return fallback;
    }
    if (IMAGE_EXTENSIONS.has(extension)) {
        return extension;
    }
    return fallback;
}
|
|
255
|
+
/**
 * Streams a remote file to disk.
 *
 * Failures are logged and reported through the return value, never thrown.
 * If the transfer fails after the destination stream was opened, the
 * truncated file is removed so that no partial download is left behind.
 *
 * @param {string} url - Absolute URL to download.
 * @param {string} dest - Destination file path.
 * @param {(message: string) => void} logger - Receives the failure message.
 * @returns {Promise<boolean>} `true` on success, `false` on any error.
 */
async function downloadFile(url, dest, logger) {
    // Set once `dest` has been opened for writing (and therefore truncated).
    let destinationOpened = false;
    try {
        const response = await axios_1.default.get(url, {
            responseType: 'stream',
            headers: REQUEST_HEADERS,
            timeout: REQUEST_TIMEOUT_MS
        });
        const output = (0, fs_1.createWriteStream)(dest);
        destinationOpened = true;
        await (0, promises_1.pipeline)(response.data, output);
        return true;
    }
    catch (error) {
        if (destinationOpened) {
            // Best-effort cleanup; a failure to delete must not mask the download error.
            await fs.unlink(dest).catch(() => { });
        }
        logger(`\nâ Shit, couldn't download file ${url}: ${error.message}`);
        return false;
    }
}
|
|
270
|
+
/**
 * Fetches a page with the shared request headers and timeout.
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<*>} The response body as delivered by axios.
 */
async function fetchHtml(url) {
    const requestConfig = {
        headers: REQUEST_HEADERS,
        timeout: REQUEST_TIMEOUT_MS
    };
    const response = await axios_1.default.get(url, requestConfig);
    return response.data;
}
|
|
277
|
+
/**
 * Decides whether a raw `href` can never lead to a crawlable page.
 * @param {string} href - Attribute value as found in the markup.
 * @returns {boolean} `true` for empty values, in-page anchors and
 *   `mailto:`/`tel:`/`javascript:` links.
 */
function shouldSkipHref(href) {
    const candidate = href.trim().toLowerCase();
    if (candidate === '') {
        return true;
    }
    const ignoredPrefixes = ['#', 'mailto:', 'tel:', 'javascript:'];
    return ignoredPrefixes.some((prefix) => candidate.startsWith(prefix));
}
|
|
285
|
+
/**
 * Canonicalizes a language filter value.
 * @param {string|null|undefined} value - Raw language value.
 * @returns {string|undefined} Trimmed, lowercased value, or `undefined` when
 *   it is missing or blank.
 */
function normalizeLanguageValue(value) {
    if (value == null) {
        return undefined;
    }
    const cleaned = value.trim().toLowerCase();
    return cleaned === '' ? undefined : cleaned;
}
|
|
289
|
+
/**
 * Guesses a page's language from its URL.
 * A non-blank `lang` query parameter wins; otherwise the path is searched for
 * the words `french`, `portuguese` and `english`, in that order.
 * @param {URL} url - Parsed URL to inspect.
 * @returns {string|undefined} Lowercase language value, or `undefined` when
 *   the URL gives no hint.
 */
function detectLanguageFromUrl(url) {
    const fromQuery = url.searchParams.get('lang')?.trim().toLowerCase();
    if (fromQuery) {
        return fromQuery;
    }
    const lowerPath = url.pathname.toLowerCase();
    const knownLanguages = ['french', 'portuguese', 'english'];
    return knownLanguages.find((name) => lowerPath.includes(name));
}
|
|
302
|
+
/**
 * Checks a URL against the requested language filter.
 * URLs pass when no filter is set or when no language can be detected from
 * the URL; otherwise the detected language must equal the filter.
 * @param {URL} url - Parsed URL to test.
 * @param {string|undefined} language - Requested language, if any.
 * @returns {boolean} Whether the URL should be kept.
 */
function matchesLanguage(url, language) {
    const wanted = normalizeLanguageValue(language);
    if (!wanted) {
        return true;
    }
    const detected = detectLanguageFromUrl(url);
    return !detected || detected === wanted;
}
|
|
311
|
+
/**
 * Heuristic for whether a URL points at an HTML page rather than a file.
 * @param {URL} url - Parsed URL to test.
 * @returns {boolean} `true` when the path has no extension or one of
 *   `.html`, `.htm`, `.php`, `.asp`, `.aspx`; `false` otherwise (PDFs included).
 */
function looksLikeHtmlPage(url) {
    const lowerPath = url.pathname.toLowerCase();
    if (lowerPath.endsWith('.pdf')) {
        return false;
    }
    switch (path.extname(lowerPath)) {
        case '':
        case '.html':
        case '.htm':
        case '.php':
        case '.asp':
        case '.aspx':
            return true;
        default:
            return false;
    }
}
|
|
318
|
+
/**
 * Estimates how many listing pages a paginated page advertises.
 *
 * A `data-pages` attribute on the first `.jet-listing-grid__items` element is
 * preferred when it parses to a number greater than 1. Otherwise the highest
 * page number found in links containing `/page/<n>` is used.
 *
 * The page number may be followed by `/`, `?`, `#` or the end of the href, so
 * links written without a trailing slash (`/blog/page/3`, `/blog/page/3?x=1`)
 * are counted too.
 *
 * @param {Function} $ - Loaded cheerio instance for the page.
 * @returns {number|undefined} The highest page number (> 1), or `undefined`
 *   when no pagination is detected.
 */
function detectMaxPaginationPages($) {
    const dataPages = $('.jet-listing-grid__items[data-pages]').first().attr('data-pages');
    const declaredPages = dataPages ? Number.parseInt(dataPages, 10) : Number.NaN;
    if (Number.isFinite(declaredPages) && declaredPages > 1) {
        return declaredPages;
    }
    let maxPage = 1;
    $('a[href*="/page/"]').each((_, el) => {
        const href = $(el).attr('href');
        if (!href)
            return;
        const match = href.match(/\/page\/(\d+)(?=[/?#]|$)/i);
        if (!match)
            return;
        const page = Number.parseInt(match[1], 10);
        if (Number.isFinite(page)) {
            maxPage = Math.max(maxPage, page);
        }
    });
    return maxPage > 1 ? maxPage : undefined;
}
|
|
339
|
+
/**
 * Builds the URL of listing page `pageNumber` for a seed URL.
 * A trailing `/page/<n>` in the seed path is replaced; otherwise
 * `/page/<n>/` is appended. Query string and fragment are kept.
 * @param {string} seedUrl - Absolute listing URL.
 * @param {number} pageNumber - Page to point at.
 * @returns {string} The paginated URL, always ending the path with `/`.
 */
function buildPaginatedUrl(seedUrl, pageNumber) {
    const target = new URL(seedUrl);
    const parts = target.pathname.split('/').filter((part) => part !== '');
    const pageLabel = String(pageNumber);
    const alreadyPaginated = parts.length >= 2 && parts[parts.length - 2] === 'page';
    const nextParts = alreadyPaginated
        ? [...parts.slice(0, -1), pageLabel]
        : [...parts, 'page', pageLabel];
    target.pathname = `/${nextParts.join('/')}/`;
    return target.href;
}
|
|
351
|
+
/**
 * Adds the follow-up listing pages of each paginated seed URL.
 * Does nothing (returns `startUrls` itself) unless crawling and pagination are
 * both enabled. Each seed is fetched once; seeds that cannot be fetched are
 * logged and skipped.
 * @param {string[]} startUrls - Seed URLs.
 * @param {(message: string) => void} logger - Progress logger.
 * @param {object} [crawl] - Crawl options (`enabled`, `paginate`, `maxPaginationPages`).
 * @returns {Promise<string[]>} Seeds followed by the generated page URLs, without duplicates.
 */
async function expandPaginatedSeedUrls(startUrls, logger, crawl) {
    const paginationRequested = Boolean(crawl?.enabled) && Boolean(crawl.paginate);
    if (!paginationRequested) {
        return startUrls;
    }
    const configuredCap = crawl.maxPaginationPages ?? DEFAULT_CRAWL_MAX_PAGINATION_PAGES;
    const pageCap = Math.max(1, configuredCap);
    const allSeeds = new Set(startUrls);
    for (const startUrl of startUrls) {
        let markup;
        try {
            markup = await fetchHtml(startUrl);
        }
        catch (error) {
            logger(`â ď¸ Couldn't inspect pagination for ${startUrl}: ${error.message}`);
            continue;
        }
        const lastPage = detectMaxPaginationPages(cheerio.load(markup));
        if (!lastPage || lastPage <= 1) {
            continue;
        }
        const pageLimit = Math.min(lastPage, pageCap);
        logger(`đ Pagination detected. Adding listing pages 2 through ${pageLimit} for ${startUrl}`);
        let page = 2;
        while (page <= pageLimit) {
            allSeeds.add(buildPaginatedUrl(startUrl, page));
            page += 1;
        }
    }
    return Array.from(allSeeds);
}
|
|
379
|
+
/**
 * Breadth-first link discovery starting from the given seed URLs.
 *
 * Returns `startUrls` unchanged when crawling is disabled. Otherwise seeds are
 * first expanded with paginated listing pages, then each queued page is
 * fetched and its `<a>` links are filtered by protocol, host, language, file
 * type and the optional `linkPattern` (tested against the URL and the link text).
 *
 * URL fragments are removed before de-duplication: `#section` never reaches
 * the server, so `/a#x` and `/a#y` are the same document and must not be
 * discovered (and later scraped) twice.
 *
 * @param {string[]} startUrls - Normalized seed URLs.
 * @param {(message: string) => void} logger - Progress logger.
 * @param {object} [crawl] - Crawl options (`enabled`, `maxDepth`, `sameDomainOnly`,
 *   `maxDiscoveredUrls`, `language`, `linkPattern`, pagination fields).
 * @returns {Promise<string[]>} Seeds plus discovered page URLs, in discovery order.
 * @throws {SyntaxError} When `crawl.linkPattern` is not a valid regular expression.
 */
async function discoverUrls(startUrls, logger, crawl) {
    if (!crawl?.enabled) {
        return startUrls;
    }
    const maxDepth = Math.max(0, crawl.maxDepth ?? DEFAULT_CRAWL_MAX_DEPTH);
    const sameDomainOnly = crawl.sameDomainOnly ?? true;
    const maxDiscoveredUrls = Math.max(1, crawl.maxDiscoveredUrls ?? DEFAULT_CRAWL_MAX_URLS);
    const language = normalizeLanguageValue(crawl.language);
    const linkPattern = crawl.linkPattern?.trim();
    const patternRegex = linkPattern ? new RegExp(linkPattern, 'i') : null;
    const queue = [];
    const visited = new Set();
    const discovered = new Set();
    const seedUrls = await expandPaginatedSeedUrls(startUrls, logger, crawl);
    for (const startUrl of seedUrls) {
        const parsed = new URL(startUrl);
        if (!matchesLanguage(parsed, language))
            continue;
        // rootHost travels with every queued entry so the same-domain check
        // always compares against the seed the link chain started from.
        queue.push({ url: startUrl, depth: 0, rootHost: parsed.host });
        discovered.add(startUrl);
    }
    logger(`\nđ¸ď¸ Crawl mode is on. Exploring up to ${maxDepth} link level(s) deep...`);
    while (queue.length > 0 && discovered.size < maxDiscoveredUrls) {
        const current = queue.shift();
        if (visited.has(current.url))
            continue;
        visited.add(current.url);
        logger(`đ Discovering links at depth ${current.depth}: ${current.url}`);
        let html;
        try {
            html = await fetchHtml(current.url);
        }
        catch (error) {
            logger(`â ď¸ Couldn't crawl ${current.url}: ${error.message}`);
            continue;
        }
        const $ = cheerio.load(html);
        $('a').each((_, el) => {
            if (discovered.size >= maxDiscoveredUrls)
                return;
            const href = $(el).attr('href');
            if (!href || shouldSkipHref(href))
                return;
            try {
                const absolute = new URL(href, current.url);
                if (!['http:', 'https:'].includes(absolute.protocol))
                    return;
                if (sameDomainOnly && absolute.host !== current.rootHost)
                    return;
                if (!matchesLanguage(absolute, language))
                    return;
                if (!looksLikeHtmlPage(absolute))
                    return;
                // Drop the fragment so anchors into one page collapse to one URL.
                absolute.hash = '';
                const normalized = absolute.href;
                const matchesPattern = !patternRegex || patternRegex.test(normalized) || patternRegex.test($(el).text().trim());
                if (!matchesPattern)
                    return;
                if (discovered.has(normalized))
                    return;
                discovered.add(normalized);
                if (current.depth < maxDepth) {
                    queue.push({ url: normalized, depth: current.depth + 1, rootHost: current.rootHost });
                }
            }
            catch {
                // Unparseable hrefs are expected in real-world markup; skip them.
            }
        });
        // Short pause between page fetches while more pages are queued.
        if (queue.length > 0) {
            await sleep(500);
        }
    }
    logger(`⨠Discovery finished. Found ${discovered.size} page URL(s) to scrape.`);
    return [...discovered];
}
|
|
452
|
+
/**
 * Returns the text of the first selector that yields non-blank content.
 * @param {Function} $ - Loaded cheerio instance.
 * @param {string[]} selectors - Candidate selectors, tried in order.
 * @param {string} fallback - Value returned when nothing matches.
 * @returns {string} Trimmed text of the first match, or `fallback`.
 */
function firstTextMatch($, selectors, fallback) {
    for (let index = 0; index < selectors.length; index += 1) {
        const candidate = $(selectors[index]).first().text().trim();
        if (candidate !== '') {
            return candidate;
        }
    }
    return fallback;
}
|
|
460
|
+
/**
 * Collects the texts matched by the first selector that yields any.
 * Selectors are tried in order; blank texts are ignored and duplicates removed.
 * @param {Function} $ - Loaded cheerio instance.
 * @param {string[]} selectors - Candidate selectors.
 * @returns {string[]} Distinct trimmed texts, or `[]` when no selector matches.
 */
function collectTextMatches($, selectors) {
    for (const selector of selectors) {
        const found = [];
        $(selector).each((_index, element) => {
            const text = $(element).text().trim();
            if (text !== '') {
                found.push(text);
            }
        });
        if (found.length === 0) {
            continue;
        }
        return dedupeStrings(found);
    }
    return [];
}
|
|
474
|
+
/**
 * Collects absolute `src` URLs from the first selector that yields any.
 * URLs whose text contains `logo` or `icon` are left out, as are `src` values
 * that cannot be resolved against the page URL.
 * @param {Function} $ - Loaded cheerio instance.
 * @param {string[]} selectors - Candidate selectors, tried in order.
 * @param {string} pageUrl - Base URL for resolving relative sources.
 * @returns {string[]} Distinct absolute URLs, or `[]` when nothing is found.
 */
function collectAssetUrls($, selectors, pageUrl) {
    const looksLikeBranding = (candidate) => candidate.includes('logo') || candidate.includes('icon');
    for (const selector of selectors) {
        const collected = [];
        $(selector).each((_index, element) => {
            const source = $(element).attr('src');
            if (!source) {
                return;
            }
            let resolved;
            try {
                resolved = new URL(source, pageUrl).href;
            }
            catch {
                return;
            }
            if (!looksLikeBranding(resolved)) {
                collected.push(resolved);
            }
        });
        if (collected.length > 0) {
            return dedupeStrings(collected);
        }
    }
    return [];
}
|
|
495
|
+
/**
 * Collapses every run of whitespace into one space and trims the ends.
 * @param {string} value - Raw text.
 * @returns {string} Single-line, single-spaced text.
 */
function normalizeTextBlock(value) {
    return value
        .split(/\s+/)
        .filter((word) => word !== '')
        .join(' ');
}
|
|
498
|
+
/**
 * Extracts the readable text of a page.
 * The first container from `ARTICLE_TEXT_SELECTORS` that yields any text wins;
 * its headings, paragraphs and list items are normalized, de-duplicated and
 * joined with blank lines. Without such a container the whole `<body>` text is used.
 * @param {Function} $ - Loaded cheerio instance.
 * @returns {string} The extracted text (possibly empty).
 */
function extractPageText($) {
    for (const selector of ARTICLE_TEXT_SELECTORS) {
        const root = $(selector).first();
        if (!root.length) {
            continue;
        }
        const pieces = [];
        root.find('h2, h3, h4, p, li').each((_index, node) => {
            const cleaned = normalizeTextBlock($(node).text());
            if (cleaned) {
                pieces.push(cleaned);
            }
        });
        const distinct = dedupeStrings(pieces);
        if (distinct.length > 0) {
            return distinct.join('\n\n');
        }
    }
    return normalizeTextBlock($('body').text());
}
|
|
518
|
+
/**
 * Writes a plain-text summary of a scraped page to
 * `<workingDirectory>/texts/<safeTitle>/pattern.txt`.
 * Sections without content are omitted; sections are separated by a blank line.
 * @param {object} preview - Extracted page data.
 * @param {string} workingDirectory - Base output directory.
 * @param {string} safeTitle - Sanitized folder name for this page.
 * @returns {Promise<string>} Relative path of the written file.
 */
async function writeTextArtifact(preview, workingDirectory, safeTitle) {
    const targetDir = path.resolve(workingDirectory, 'texts', safeTitle);
    await fs.mkdir(targetDir, { recursive: true });
    const sections = [
        `Title: ${preview.title}`,
        `Source URL: ${preview.sourceUrl}`,
        `Preset: ${preview.presetLabel}`
    ];
    if (preview.profileLabel) {
        sections.push(`Profile: ${preview.profileLabel}`);
    }
    if (preview.description) {
        sections.push(`Description: ${preview.description}`);
    }
    if (preview.materials.length > 0) {
        sections.push(`Materials:\n- ${preview.materials.join('\n- ')}`);
    }
    if (preview.instructions.length > 0) {
        sections.push(`Instructions:\n- ${preview.instructions.join('\n- ')}`);
    }
    if (preview.pageText) {
        sections.push(`Page Text:\n${preview.pageText}`);
    }
    await fs.writeFile(path.resolve(targetDir, 'pattern.txt'), sections.join('\n\n'), 'utf-8');
    return `texts/${safeTitle}/pattern.txt`;
}
|
|
535
|
+
/**
 * Fetches one page and extracts its fields without writing anything to disk.
 *
 * The page is downloaded through `fetchHtml`, so every HTML request in this
 * module shares one code path for headers and timeout.
 *
 * @param {string} url - Normalized page URL.
 * @param {(message: string) => void} logger - Progress logger.
 * @param {object} strategy - Resolved strategy (`presetId`, `preset`,
 *   `selectorOverrides`, optional `profileId`/`profileLabel`).
 * @returns {Promise<object>} Title, description, materials, instructions, page
 *   text, image URLs, PDF URLs and the strategy metadata used.
 * @throws Propagates request errors from `fetchHtml`.
 */
async function extractPatternPreview(url, logger, strategy) {
    logger(`\nđ Purring along... Fetching data from: ${url}`);
    logger(`𪥠Using selector preset: ${strategy.preset.label}`);
    if (strategy.profileLabel) {
        logger(`đ§ Loaded site profile: ${strategy.profileLabel}`);
    }
    if (strategy.selectorOverrides) {
        logger('𧡠Advanced selector overrides are active for this run.');
    }
    const html = await fetchHtml(url);
    const $ = cheerio.load(html);
    const selectors = resolveSelectors(strategy.preset, strategy.selectorOverrides);
    const title = firstTextMatch($, selectors.titleSelectors, 'Untitled Pattern');
    const description = firstTextMatch($, selectors.descriptionSelectors, 'No description found.');
    const materials = collectTextMatches($, selectors.materialsSelectors);
    const instructions = collectTextMatches($, selectors.instructionsSelectors);
    const pageText = extractPageText($);
    const imageUrls = collectAssetUrls($, selectors.imageSelectors, url);
    const pdfUrls = [];
    // Every link whose href mentions ".pdf" is treated as a downloadable PDF.
    $('a').each((_, el) => {
        const href = $(el).attr('href');
        if (!href || !href.toLowerCase().includes('.pdf'))
            return;
        try {
            pdfUrls.push(new URL(href, url).href);
        }
        catch {
            // Unresolvable hrefs are skipped; one bad link must not fail the page.
        }
    });
    return {
        title,
        description,
        materials,
        instructions,
        pageText,
        sourceUrl: url,
        imageUrls,
        pdfUrls: dedupeStrings(pdfUrls),
        preset: strategy.presetId,
        presetLabel: strategy.preset.label,
        selectorOverrides: strategy.selectorOverrides,
        profileId: strategy.profileId,
        profileLabel: strategy.profileLabel
    };
}
|
|
583
|
+
/**
 * Scrapes one page and stores its artifacts under `workingDirectory`:
 * a text summary in `texts/`, images in `images/` and PDFs in `pdfs/`, each
 * inside a folder named after the sanitized page title.
 *
 * PDF file names are made unique within the page's folder: when two links
 * share a basename (e.g. `/en/pattern.pdf` and `/fr/pattern.pdf`) the later
 * one gets a numeric suffix instead of overwriting the earlier download.
 *
 * @param {string} url - Normalized page URL.
 * @param {string} workingDirectory - Base output directory.
 * @param {(message: string) => void} logger - Progress logger.
 * @param {object} strategy - Resolved selector strategy.
 * @returns {Promise<object|null>} The stored record, or `null` when anything
 *   failed (the error is logged, not thrown).
 */
async function scrapePattern(url, workingDirectory, logger, strategy) {
    try {
        const preview = await extractPatternPreview(url, logger, strategy);
        const uniqueImageUrls = dedupeStrings(preview.imageUrls);
        const uniquePdfUrls = dedupeStrings(preview.pdfUrls);
        const safeTitle = sanitizeFilename(preview.title);
        const localImages = [];
        const localPdfs = [];
        const localTextFile = await writeTextArtifact(preview, workingDirectory, safeTitle);
        logger(`đ Saved page text to ./${localTextFile}`);
        if (uniqueImageUrls.length > 0) {
            const imageDir = path.resolve(workingDirectory, 'images', safeTitle);
            await fs.mkdir(imageDir, { recursive: true });
            logger(`⨠Found ${uniqueImageUrls.length} images! Downloading to ./images/${safeTitle}/...`);
            for (let i = 0; i < uniqueImageUrls.length; i++) {
                const imgUrl = uniqueImageUrls[i];
                const ext = getImageExtension(imgUrl);
                const filename = `image_${i + 1}${ext}`;
                const destPath = path.resolve(imageDir, filename);
                if (await downloadFile(imgUrl, destPath, logger)) {
                    localImages.push(`images/${safeTitle}/${filename}`);
                }
            }
        }
        if (uniquePdfUrls.length > 0) {
            const pdfDir = path.resolve(workingDirectory, 'pdfs', safeTitle);
            await fs.mkdir(pdfDir, { recursive: true });
            logger(`đ Holy shit, found ${uniquePdfUrls.length} PDFs! Downloading to ./pdfs/${safeTitle}/...`);
            // Lowercased names already claimed in this folder; lowercase so the
            // check also holds on case-insensitive file systems.
            const usedPdfNames = new Set();
            for (let i = 0; i < uniquePdfUrls.length; i++) {
                const pdfUrl = uniquePdfUrls[i];
                let filename = path.basename(new URL(pdfUrl).pathname);
                if (!filename.toLowerCase().endsWith('.pdf'))
                    filename = `pattern_${i + 1}.pdf`;
                if (usedPdfNames.has(filename.toLowerCase())) {
                    const stem = filename.slice(0, -4);
                    const ext = filename.slice(-4);
                    let suffix = 2;
                    let candidate = `${stem}_${suffix}${ext}`;
                    while (usedPdfNames.has(candidate.toLowerCase())) {
                        suffix += 1;
                        candidate = `${stem}_${suffix}${ext}`;
                    }
                    filename = candidate;
                }
                usedPdfNames.add(filename.toLowerCase());
                const destPath = path.resolve(pdfDir, filename);
                if (await downloadFile(pdfUrl, destPath, logger)) {
                    localPdfs.push(`pdfs/${safeTitle}/${filename}`);
                }
            }
        }
        return {
            title: preview.title,
            description: preview.description,
            materials: preview.materials,
            instructions: preview.instructions,
            sourceUrl: url,
            localImages,
            localPdfs,
            localTextFile
        };
    }
    catch (error) {
        logger(`\nâ Ah shit, something broke while fetching the URL: ${error.message}`);
        return null;
    }
}
|
|
638
|
+
/**
 * Promise-based delay.
 * @param {number} ms - Milliseconds to wait.
 * @returns {Promise<void>} Resolves after the timer fires.
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
641
|
+
/**
 * Extracts a single page's data without saving anything.
 * @param {object} options - `url` plus optional `preset`, `selectorOverrides`,
 *   `profileId` and `profilesPath`.
 * @param {(message: string) => void} [logger=defaultLogger] - Progress logger.
 * @returns {Promise<object>} The preview produced by `extractPatternPreview`.
 * @throws {Error} When `options.url` is not a valid http(s) URL, or when
 *   strategy resolution or the page fetch fails.
 */
async function previewPattern(options, logger = defaultLogger) {
    const targetUrl = normalizeUrl(options.url);
    if (targetUrl === null || !targetUrl) {
        throw new Error(`That doesn't look like a valid URL: ${options.url}`);
    }
    const { preset, selectorOverrides, profileId, profilesPath } = options;
    const strategy = await resolveStrategy({ preset, selectorOverrides, profileId, profilesPath });
    return extractPatternPreview(targetUrl, logger, strategy);
}
|
|
654
|
+
/**
 * Scrapes a list of URLs (optionally crawling for more) and appends new
 * records to the JSON output file.
 *
 * URLs already present in the output file are skipped before any download.
 * The output file is rewritten after every new record, so an interrupted run
 * keeps what it has collected so far.
 *
 * Robustness notes:
 * - the output file's directory is created on demand, so the first write
 *   cannot fail with ENOENT after the page was already downloaded;
 * - `null` entries in an existing output file are tolerated when building the
 *   set of known URLs.
 *
 * @param {object} options - `urls`, `outputPath`, and optional `logger`,
 *   `workingDirectory`, `preset`, `selectorOverrides`, `profileId`,
 *   `profilesPath`, `crawl`.
 * @returns {Promise<object>} Run summary: counts, output locations, the new
 *   records and the strategy that was used.
 * @throws {Error} When no valid http(s) URL is supplied, or when strategy
 *   resolution, loading existing data or writing the output fails.
 */
async function scrapeUrls(options) {
    const logger = options.logger ?? defaultLogger;
    const workingDirectory = options.workingDirectory ?? process.cwd();
    const strategy = await resolveStrategy({
        preset: options.preset,
        selectorOverrides: options.selectorOverrides,
        profileId: options.profileId,
        profilesPath: options.profilesPath
    });
    const startingUrls = dedupeStrings(options.urls
        .map(normalizeUrl)
        .filter((url) => Boolean(url)));
    if (startingUrls.length === 0) {
        throw new Error('No valid http(s) URLs were provided.');
    }
    const normalizedUrls = await discoverUrls(startingUrls, logger, options.crawl);
    const existingData = await loadExistingData(options.outputPath);
    const knownSourceUrls = new Set(existingData.map(item => item?.sourceUrl));
    const newPatterns = [];
    let skippedCount = 0;
    let failedCount = 0;
    for (let i = 0; i < normalizedUrls.length; i++) {
        const currentUrl = normalizedUrls[i];
        logger(`\n======================================================`);
        logger(`đ§ľ Scraping pattern ${i + 1} of ${normalizedUrls.length}`);
        logger(`======================================================`);
        if (knownSourceUrls.has(currentUrl)) {
            skippedCount += 1;
            logger(`â ď¸ Already scraped ${currentUrl}. Skipping before download to save time.`);
            continue;
        }
        const data = await scrapePattern(currentUrl, workingDirectory, logger, strategy);
        if (!data) {
            failedCount += 1;
        }
        else if (!knownSourceUrls.has(data.sourceUrl)) {
            existingData.push(data);
            newPatterns.push(data);
            knownSourceUrls.add(data.sourceUrl);
            // Idempotent; guarantees the target directory exists before writing.
            await fs.mkdir(path.dirname(options.outputPath), { recursive: true });
            await fs.writeFile(options.outputPath, JSON.stringify(existingData, null, 2), 'utf-8');
            logger(`⨠Badass! Appended data to ${path.basename(options.outputPath)} đ§`);
        }
        else {
            skippedCount += 1;
            logger(`â ď¸ Looks like we already have the data for "${data.title}". Skipping JSON append to keep it clean.`);
        }
        // Pause between pages, but not after the last one.
        if (i < normalizedUrls.length - 1) {
            logger(`\nâł Taking a quick 2-second breather so we don't get blocked...`);
            await sleep(DEFAULT_DELAY_MS);
        }
    }
    logger(`\nđ All done! Your data is ready to roll. You crushed it.`);
    return {
        scrapedCount: newPatterns.length,
        skippedCount,
        failedCount,
        outputPath: options.outputPath,
        outputDirectory: workingDirectory,
        patterns: newPatterns,
        preset: strategy.presetId,
        selectorOverrides: strategy.selectorOverrides,
        profileId: strategy.profileId,
        profileLabel: strategy.profileLabel,
        discoveredUrlCount: normalizedUrls.length
    };
}
|