soustack 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +301 -244
- package/dist/cli/index.js +1697 -1357
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.mts +48 -138
- package/dist/index.d.ts +48 -138
- package/dist/index.js +1093 -1466
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1092 -1453
- package/dist/index.mjs.map +1 -1
- package/dist/scrape.d.mts +308 -0
- package/dist/scrape.d.ts +308 -0
- package/dist/scrape.js +819 -0
- package/dist/scrape.js.map +1 -0
- package/dist/scrape.mjs +814 -0
- package/dist/scrape.mjs.map +1 -0
- package/package.json +86 -75
- package/src/profiles/.gitkeep +0 -0
- package/src/profiles/base.schema.json +9 -0
- package/src/profiles/cookable.schema.json +18 -0
- package/src/profiles/illustrated.schema.json +48 -0
- package/src/profiles/quantified.schema.json +43 -0
- package/src/profiles/scalable.schema.json +75 -0
- package/src/profiles/schedulable.schema.json +43 -0
- package/src/schema.json +43 -22
- package/src/soustack.schema.json +344 -0
package/dist/scrape.mjs
ADDED
|
@@ -0,0 +1,814 @@
|
|
|
1
|
+
import { load } from 'cheerio';
|
|
2
|
+
|
|
3
|
+
// src/converters/yield.ts
|
|
4
|
+
/**
 * Normalises a schema.org `recipeYield` value into a yield record.
 *
 * Accepted shapes:
 *  - number: used as the amount, with unit "servings";
 *  - array: only the first entry is considered;
 *  - object with a numeric `amount`: copied, `unit` defaulting to "servings";
 *  - string: the first number found becomes the amount and the text that
 *    follows it becomes the unit (default "servings").
 *
 * @param {unknown} value Raw yield value.
 * @returns {{amount: number, unit: string, description?: string} | undefined}
 *   The parsed yield, or `undefined` when nothing usable is found.
 */
function parseYield(value) {
  if (value === void 0 || value === null) {
    return void 0;
  }
  if (typeof value === "number") {
    return { amount: value, unit: "servings" };
  }
  if (Array.isArray(value)) {
    return parseYield(value[0]);
  }
  if (typeof value === "object") {
    if (typeof value.amount !== "number") {
      return void 0;
    }
    return {
      amount: value.amount,
      unit: typeof value.unit === "string" ? value.unit : "servings",
      description: typeof value.description === "string" ? value.description : void 0
    };
  }
  if (typeof value !== "string") {
    return void 0;
  }
  const trimmed = value.trim();
  const match = /(\d+(?:\.\d+)?)/.exec(trimmed);
  if (!match) {
    return void 0;
  }
  // Drop the upper bound of a range ("4-6", "4 to 6") so that it does not
  // leak into the unit; the lower bound stays the amount.
  const unit = trimmed
    .slice(match.index + match[1].length)
    .replace(/^\s*(?:[-\u2013\u2014]|to)\s*\d+(?:\.\d+)?/i, "")
    .trim();
  return {
    amount: parseFloat(match[1]),
    unit: unit || "servings",
    description: trimmed
  };
}
|
|
42
|
+
|
|
43
|
+
// src/parsers/duration.ts
|
|
44
|
+
// Matches ISO-8601 durations limited to days/hours/minutes/seconds, e.g. "P1DT2H30M".
const ISO_DURATION_REGEX = /^P(?:(\d+(?:\.\d+)?)D)?(?:T(?:(\d+(?:\.\d+)?)H)?(?:(\d+(?:\.\d+)?)M)?(?:(\d+(?:\.\d+)?)S)?)?$/i;
// Minutes returned for the literal text "overnight".
const HUMAN_OVERNIGHT = 8 * 60;

/**
 * Converts an ISO-8601 duration string into whole minutes.
 *
 * A finite number is returned unchanged. Seconds are rounded up to the next
 * minute before being added, and the grand total is rounded to an integer.
 *
 * @param {unknown} iso Duration string such as "PT1H30M", or a number.
 * @returns {number | null} Minutes, or `null` when the input is not a
 *   recognised duration (including "P"/"PT" with no components).
 */
function parseDuration(iso) {
  if (typeof iso === "number" && Number.isFinite(iso)) {
    return iso;
  }
  if (typeof iso !== "string") {
    return null;
  }
  const candidate = iso.trim();
  if (candidate === "") {
    return null;
  }
  const parts = ISO_DURATION_REGEX.exec(candidate);
  if (parts === null) {
    return null;
  }
  const days = parts[1];
  const hours = parts[2];
  const minutes = parts[3];
  const seconds = parts[4];
  if (!(days || hours || minutes || seconds)) {
    return null;
  }
  let sum = 0;
  if (days) {
    sum += parseFloat(days) * 24 * 60;
  }
  if (hours) {
    sum += parseFloat(hours) * 60;
  }
  if (minutes) {
    sum += parseFloat(minutes);
  }
  if (seconds) {
    sum += Math.ceil(parseFloat(seconds) / 60);
  }
  return Math.round(sum);
}
|
|
66
|
+
/**
 * Parses free-form duration text ("1 hour 30 minutes", "45 min", "overnight")
 * into whole minutes.
 *
 * Every hour quantity and every minute quantity found in the lower-cased text
 * is added up; the exact text "overnight" maps to HUMAN_OVERNIGHT.
 *
 * @param {unknown} text Human-readable duration.
 * @returns {number | null} Rounded minutes, or `null` when the input is not a
 *   non-empty string or no positive duration is found.
 */
function parseHumanDuration(text) {
  if (typeof text !== "string") {
    return null;
  }
  const phrase = text.toLowerCase().trim();
  if (phrase === "") {
    return null;
  }
  if (phrase === "overnight") {
    return HUMAN_OVERNIGHT;
  }
  let minutes = 0;
  for (const [, quantity] of phrase.matchAll(/(\d+(?:\.\d+)?)\s*(?:hours?|hrs?|hr|h)\b/g)) {
    minutes += parseFloat(quantity) * 60;
  }
  for (const [, quantity] of phrase.matchAll(/(\d+(?:\.\d+)?)\s*(?:minutes?|mins?|min|m)\b/g)) {
    minutes += parseFloat(quantity);
  }
  return minutes > 0 ? Math.round(minutes) : null;
}
|
|
89
|
+
/**
 * Parses a duration by trying the ISO-8601 parser first and falling back to
 * the human-text parser when that yields `null`.
 *
 * @param {unknown} input ISO-8601 string, number, or free-form text.
 * @returns {number | null} Minutes, or `null` when neither parser succeeds.
 */
function smartParseDuration(input) {
  return parseDuration(input) ?? parseHumanDuration(input);
}
|
|
96
|
+
|
|
97
|
+
// src/utils/image.ts
|
|
98
|
+
/**
 * Reduces a schema.org `image` value to a URL string or list of URL strings.
 *
 * Strings are trimmed; array entries that are not strings and any non-array
 * object are resolved through `extractUrl`.
 *
 * @param {unknown} image String, object, or array of either.
 * @returns {string | string[] | undefined} A single URL when exactly one is
 *   found, an array when several are found, otherwise `undefined`.
 */
function normalizeImage(image) {
  if (!image) {
    return void 0;
  }
  if (typeof image === "string") {
    return image.trim() || void 0;
  }
  if (!Array.isArray(image)) {
    return extractUrl(image);
  }
  const collected = [];
  for (const entry of image) {
    const url = typeof entry === "string" ? entry.trim() : extractUrl(entry);
    if (typeof url === "string" && url) {
      collected.push(url);
    }
  }
  switch (collected.length) {
    case 0:
      return void 0;
    case 1:
      return collected[0];
    default:
      return collected;
  }
}
|
|
118
|
+
/**
 * Reads a URL from an object: `url` when it is a string, otherwise
 * `contentUrl` when that is a string.
 *
 * @param {unknown} value Candidate object.
 * @returns {string | undefined} The trimmed URL, or `undefined` when the
 *   value is not an object or the chosen field is empty/blank.
 */
function extractUrl(value) {
  if (value === null || typeof value !== "object") {
    return void 0;
  }
  let raw;
  if (typeof value.url === "string") {
    raw = value.url;
  } else if (typeof value.contentUrl === "string") {
    raw = value.contentUrl;
  }
  if (!raw) {
    return void 0;
  }
  const cleaned = raw.trim();
  return cleaned === "" ? void 0 : cleaned;
}
|
|
130
|
+
|
|
131
|
+
// src/fromSchemaOrg.ts
|
|
132
|
+
/**
 * Converts schema.org Recipe data (a node, an array of nodes, or a document
 * carrying an `@graph`) into the package's recipe shape.
 *
 * @param {unknown} input Parsed JSON-LD-style data.
 * @returns {object | null} The converted recipe, or `null` when no node with
 *   a Recipe type and a non-blank name is found.
 */
function fromSchemaOrg(input) {
  const recipeNode = extractRecipeNode(input);
  if (!recipeNode) {
    return null;
  }
  const ingredients = convertIngredients(recipeNode.recipeIngredient);
  const instructions = convertInstructions(recipeNode.recipeInstructions);
  const time = convertTime(recipeNode);
  const recipeYield = parseYield(recipeNode.recipeYield);
  const tags = collectTags(recipeNode.recipeCuisine, recipeNode.keywords);
  const category = extractFirst(recipeNode.recipeCategory);
  const source = convertSource(recipeNode);
  const nutrition = recipeNode.nutrition && typeof recipeNode.nutrition === "object" ? recipeNode.nutrition : void 0;
  // The description comes from scraped data and may be an array or object;
  // only strings can be trimmed, anything else is treated as absent.
  const description = typeof recipeNode.description === "string" ? recipeNode.description.trim() || void 0 : void 0;
  return {
    // extractRecipeNode only returns nodes whose name is a non-blank string.
    name: recipeNode.name.trim(),
    description,
    image: normalizeImage(recipeNode.image),
    category,
    tags: tags.length ? tags : void 0,
    source,
    dateAdded: recipeNode.datePublished || void 0,
    dateModified: recipeNode.dateModified || void 0,
    yield: recipeYield,
    time,
    ingredients,
    instructions,
    nutrition
  };
}
|
|
161
|
+
/**
 * Searches the input for the first node that has a Recipe type and a valid
 * name.
 *
 * Arrays are searched in order; for an object, its `@graph` is searched
 * before the object itself is considered.
 *
 * @param {unknown} input Node, array of nodes, or document with `@graph`.
 * @returns {object | null} The matching node, or `null`.
 */
function extractRecipeNode(input) {
  if (!input) {
    return null;
  }
  if (Array.isArray(input)) {
    for (const candidate of input) {
      const hit = extractRecipeNode(candidate);
      if (hit) {
        return hit;
      }
    }
    return null;
  }
  if (typeof input !== "object") {
    return null;
  }
  const graph = input["@graph"];
  const nested = graph ? extractRecipeNode(graph) : null;
  if (nested) {
    return nested;
  }
  return hasRecipeType(input["@type"]) && isValidName(input.name) ? input : null;
}
|
|
190
|
+
/**
 * Tells whether an `@type` value names the schema.org Recipe type.
 *
 * Matching is case-insensitive and accepts both the bare term ("Recipe") and
 * the full IRIs ("http://schema.org/Recipe", "https://schema.org/Recipe"),
 * the same spellings the scraper's own node check recognises, so that a node
 * the scraper extracts is not rejected here.
 *
 * @param {unknown} value A type string or an array of type strings.
 * @returns {boolean} `true` when any entry is a Recipe type.
 */
function hasRecipeType(value) {
  if (!value) {
    return false;
  }
  const recipeTypePattern = /^(?:https?:\/\/schema\.org\/)?recipe$/i;
  const types = Array.isArray(value) ? value : [value];
  return types.some((entry) => typeof entry === "string" && recipeTypePattern.test(entry));
}
|
|
197
|
+
/**
 * Checks that a recipe name is a string with non-whitespace content.
 *
 * @param {unknown} name Candidate name.
 * @returns {boolean} `true` for a non-blank string.
 */
function isValidName(name) {
  if (typeof name !== "string") {
    return false;
  }
  return name.trim().length > 0;
}
|
|
200
|
+
/**
 * Turns a `recipeIngredient` value into a list of trimmed, non-empty strings.
 * A single value is treated as a one-element list; non-string entries are
 * dropped.
 *
 * @param {unknown} value String or array of strings.
 * @returns {string[]} Cleaned ingredient lines (possibly empty).
 */
function convertIngredients(value) {
  if (!value) {
    return [];
  }
  const lines = [];
  for (const item of Array.isArray(value) ? value : [value]) {
    if (typeof item !== "string") {
      continue;
    }
    const line = item.trim();
    if (line) {
      lines.push(line);
    }
  }
  return lines;
}
|
|
205
|
+
/**
 * Converts a `recipeInstructions` value into a list of instruction entries.
 *
 * Strings are trimmed and kept when non-empty; HowToSection entries become
 * `{ subsection, items }` when they contain at least one item; HowToStep
 * entries are converted through `convertHowToStep`. Anything else is ignored.
 *
 * @param {unknown} value Single entry or array of entries.
 * @returns {Array<string | object>} Converted instructions (possibly empty).
 */
function convertInstructions(value) {
  if (!value) {
    return [];
  }
  const steps = [];
  for (const entry of Array.isArray(value) ? value : [value]) {
    if (!entry) {
      continue;
    }
    if (typeof entry === "string") {
      const line = entry.trim();
      if (line) {
        steps.push(line);
      }
    } else if (isHowToSection(entry)) {
      const items = extractSectionItems(entry.itemListElement);
      if (items.length) {
        steps.push({ subsection: entry.name?.trim() || "Section", items });
      }
    } else if (isHowToStep(entry)) {
      const step = convertHowToStep(entry);
      if (step) {
        steps.push(step);
      }
    }
  }
  return steps;
}
|
|
237
|
+
/**
 * Flattens the `itemListElement` of a HowToSection into instruction entries.
 * Strings are trimmed, HowToStep items are converted, and nested
 * HowToSection items are expanded in place.
 *
 * @param {Iterable<unknown>} [items=[]] Section items.
 * @returns {Array<string | object>} Flattened entries.
 */
function extractSectionItems(items = []) {
  const collected = [];
  for (const item of items) {
    if (typeof item === "string") {
      const line = item.trim();
      if (line) {
        collected.push(line);
      }
      continue;
    }
    if (!item) {
      continue;
    }
    if (isHowToStep(item)) {
      const step = convertHowToStep(item);
      if (step) {
        collected.push(step);
      }
    } else if (isHowToSection(item)) {
      collected.push(...extractSectionItems(item.itemListElement));
    }
  }
  return collected;
}
|
|
261
|
+
/**
 * Picks the instruction text of a step: `text` when it is a string,
 * otherwise `name`.
 *
 * @param {object} value Step-like object.
 * @returns {string | undefined} Trimmed text, or `undefined` when the chosen
 *   field is not a string or is blank.
 */
function extractInstructionText(value) {
  const raw = typeof value.text === "string" ? value.text : value.name;
  if (typeof raw !== "string") {
    return void 0;
  }
  const cleaned = raw.trim();
  return cleaned === "" ? void 0 : cleaned;
}
|
|
265
|
+
/**
 * Converts a HowToStep into an instruction.
 *
 * @param {object} step HowToStep node.
 * @returns {string | object | undefined} The bare text when the step has no
 *   image, id or timing; otherwise an object with `text` plus whichever of
 *   `id`, `image` (first image only) and `timing` are present; `undefined`
 *   when the step has no text.
 */
function convertHowToStep(step) {
  const text = extractInstructionText(step);
  if (!text) {
    return void 0;
  }
  const images = normalizeImage(step.image);
  const image = Array.isArray(images) ? images[0] : images;
  const id = extractInstructionId(step);
  const timing = extractInstructionTiming(step);
  if (!(image || id || timing)) {
    return text;
  }
  // Spreading a falsy value adds no keys, so only present fields appear.
  return {
    text,
    ...(id && { id }),
    ...(image && { image }),
    ...(timing && { timing })
  };
}
|
|
283
|
+
/**
 * Builds the timing record of a step from the first truthy value among
 * `totalTime`, `performTime`, `prepTime` and `duration`.
 *
 * @param {object} step HowToStep node.
 * @returns {{duration: number | string, type: "active"} | undefined} Parsed
 *   minutes (or the raw string when it cannot be parsed), or `undefined`
 *   when the chosen value is missing or not a string.
 */
function extractInstructionTiming(step) {
  const raw = step.totalTime || step.performTime || step.prepTime || step.duration;
  if (typeof raw !== "string" || raw === "") {
    return void 0;
  }
  return { duration: smartParseDuration(raw) ?? raw, type: "active" };
}
|
|
291
|
+
/**
 * Reads a step identifier from the first truthy value among `@id`, `id` and
 * `url`.
 *
 * @param {object} step HowToStep node.
 * @returns {string | undefined} Trimmed identifier, or `undefined` when the
 *   chosen value is not a string or is blank.
 */
function extractInstructionId(step) {
  const candidate = step["@id"] || step.id || step.url;
  if (typeof candidate !== "string") {
    return void 0;
  }
  const cleaned = candidate.trim();
  return cleaned === "" ? void 0 : cleaned;
}
|
|
299
|
+
/**
 * Tells whether a value is an object whose `@type` is exactly "HowToStep".
 *
 * @param {unknown} value Candidate node.
 * @returns {boolean}
 */
function isHowToStep(value) {
  if (!value || typeof value !== "object") {
    return false;
  }
  return value["@type"] === "HowToStep";
}
|
|
302
|
+
/**
 * Tells whether a value is an object whose `@type` is exactly "HowToSection"
 * and whose `itemListElement` is an array.
 *
 * @param {unknown} value Candidate node.
 * @returns {boolean}
 */
function isHowToSection(value) {
  if (!value || typeof value !== "object") {
    return false;
  }
  if (value["@type"] !== "HowToSection") {
    return false;
  }
  return Array.isArray(value.itemListElement);
}
|
|
305
|
+
/**
 * Builds the structured time record of a recipe from `prepTime`, `cookTime`
 * and `totalTime`, stored as `prep`, `active` and `total` respectively.
 *
 * @param {object} recipe Recipe node.
 * @returns {{prep?: number, active?: number, total?: number} | undefined}
 *   The parsed durations, or `undefined` when none could be parsed.
 */
function convertTime(recipe) {
  const fields = [
    ["prep", recipe.prepTime],
    ["active", recipe.cookTime],
    ["total", recipe.totalTime]
  ];
  const structured = {};
  for (const [key, raw] of fields) {
    const minutes = smartParseDuration(raw ?? "");
    if (minutes !== null && minutes !== void 0) {
      structured[key] = minutes;
    }
  }
  return Object.keys(structured).length ? structured : void 0;
}
|
|
315
|
+
/**
 * Merges cuisine and keyword values into a de-duplicated tag list, cuisines
 * first. A keyword string is split on "," and "|"; any other keyword value
 * is flattened like the cuisine value.
 *
 * @param {unknown} cuisine `recipeCuisine` value.
 * @param {unknown} keywords `keywords` value.
 * @returns {string[]} Unique tags in first-seen order.
 */
function collectTags(cuisine, keywords) {
  const fromCuisine = flattenStrings(cuisine);
  const fromKeywords = typeof keywords === "string" ? splitKeywords(keywords) : flattenStrings(keywords);
  return [...new Set([...fromCuisine, ...fromKeywords])];
}
|
|
325
|
+
/**
 * Splits a keyword string on commas and pipes into trimmed, non-empty parts.
 *
 * @param {string} value Delimited keywords.
 * @returns {string[]} The individual keywords.
 */
function splitKeywords(value) {
  const keywords = [];
  for (const piece of value.split(/[,|]/)) {
    const keyword = piece.trim();
    if (keyword) {
      keywords.push(keyword);
    }
  }
  return keywords;
}
|
|
328
|
+
/**
 * Normalises a string or array of strings into a list of trimmed, non-empty
 * strings. Non-string array entries and any other kind of value are dropped.
 *
 * @param {unknown} value String, array, or anything else.
 * @returns {string[]} Cleaned strings (possibly empty).
 */
function flattenStrings(value) {
  if (!value) {
    return [];
  }
  let entries = [];
  if (typeof value === "string") {
    entries = [value];
  } else if (Array.isArray(value)) {
    entries = value;
  }
  const cleaned = [];
  for (const entry of entries) {
    if (typeof entry !== "string") {
      continue;
    }
    const text = entry.trim();
    if (text) {
      cleaned.push(text);
    }
  }
  return cleaned;
}
|
|
336
|
+
/**
 * Returns the first cleaned string found in a string-or-array value.
 *
 * @param {unknown} value String or array of strings.
 * @returns {string | undefined} The first non-empty string, if any.
 */
function extractFirst(value) {
  const [first] = flattenStrings(value);
  return first;
}
|
|
340
|
+
/**
 * Builds the source record (`author`, `name`, `url`) of a recipe.
 *
 * The URL is taken from `url`, falling back to `mainEntityOfPage`. Both may
 * be a string or an object carrying the address in `@id` or `url`; any other
 * shape is ignored instead of raising a TypeError on a non-string value.
 *
 * @param {object} recipe Recipe node.
 * @returns {{author?: string, name?: string, url?: string} | undefined}
 *   The source record, or `undefined` when no field could be filled.
 */
function convertSource(recipe) {
  // Resolves one candidate to a trimmed, non-empty URL string or undefined.
  const toUrl = (value) => {
    if (typeof value === "string") {
      return value.trim() || void 0;
    }
    if (value && typeof value === "object" && !Array.isArray(value)) {
      const ref = value["@id"] ?? value.url;
      return typeof ref === "string" ? ref.trim() || void 0 : void 0;
    }
    return void 0;
  };
  const author = extractEntityName(recipe.author);
  const publisher = extractEntityName(recipe.publisher);
  const url = toUrl(recipe.url) ?? toUrl(recipe.mainEntityOfPage);
  const source = {};
  if (author) source.author = author;
  if (publisher) source.name = publisher;
  if (url) source.url = url;
  return Object.keys(source).length ? source : void 0;
}
|
|
350
|
+
/**
 * Resolves the display name of an author/publisher value.
 *
 * Strings are trimmed, arrays yield the first entry that resolves to a name,
 * and objects contribute their string `name`.
 *
 * @param {unknown} value String, object with `name`, or array of either.
 * @returns {string | undefined} The name, or `undefined` when none is found.
 */
function extractEntityName(value) {
  if (!value) {
    return void 0;
  }
  if (Array.isArray(value)) {
    for (const candidate of value) {
      const resolved = extractEntityName(candidate);
      if (resolved) {
        return resolved;
      }
    }
    return void 0;
  }
  let raw;
  if (typeof value === "string") {
    raw = value;
  } else if (typeof value === "object" && typeof value.name === "string") {
    raw = value.name;
  } else {
    return void 0;
  }
  return raw.trim() || void 0;
}
|
|
371
|
+
|
|
372
|
+
// src/scraper/fetch.ts
|
|
373
|
+
// Desktop browser User-Agent strings used when the caller supplies none.
const DEFAULT_USER_AGENTS = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"
];

/**
 * Picks the User-Agent header value for a request.
 *
 * @param {string} [provided] Caller-supplied User-Agent.
 * @returns {string} `provided` when truthy, otherwise a randomly selected
 *   entry of DEFAULT_USER_AGENTS.
 */
function chooseUserAgent(provided) {
  return provided || DEFAULT_USER_AGENTS[Math.floor(Math.random() * DEFAULT_USER_AGENTS.length)];
}
|
|
383
|
+
/**
 * Selects the fetch implementation to use.
 *
 * @param {Function} [fetchFn] Caller-supplied fetch implementation.
 * @returns {Function} `fetchFn` when given, otherwise `globalThis.fetch`.
 * @throws {Error} When neither is available.
 */
function resolveFetch(fetchFn) {
  const chosen = fetchFn || globalThis.fetch;
  if (!chosen) {
    throw new Error(
      "A global fetch implementation is not available. Provide window.fetch in browsers or upgrade to Node 18+."
    );
  }
  return chosen;
}
|
|
395
|
+
/**
 * Reports whether a global `document` is defined.
 *
 * @returns {boolean} `true` when `globalThis.document` is not undefined.
 */
function isBrowserEnvironment() {
  return globalThis.document !== void 0;
}
|
|
398
|
+
/**
 * Tells whether an error represents an HTTP 4xx response.
 *
 * A numeric `status` decides when present; otherwise the message is checked
 * for the text "HTTP 4".
 *
 * @param {{status?: number, message: string}} error Error to inspect.
 * @returns {boolean}
 */
function isClientError(error) {
  const { status } = error;
  return typeof status === "number"
    ? status >= 400 && status < 500
    : error.message.includes("HTTP 4");
}
|
|
404
|
+
/**
 * Pauses for the given number of milliseconds.
 *
 * @param {number} ms Delay passed to `setTimeout`.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
async function wait(ms) {
  await new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
407
|
+
/**
 * Downloads a page and returns its body as text, retrying on failure.
 *
 * Each attempt is aborted if the response has not arrived within `timeout`
 * milliseconds. Errors classified as HTTP 4xx are rethrown immediately;
 * other failures are retried up to `maxRetries` times with a growing delay
 * (1s, 2s, ...). Outside browsers a User-Agent header is added.
 *
 * The debug instrumentation that POSTed the requested URL, response status
 * and HTML characteristics to http://127.0.0.1:7243 has been removed: it was
 * development residue that leaked request data to any local listener and
 * issued extra network calls on every fetch.
 *
 * @param {string} url Address of the page.
 * @param {object} [options]
 * @param {number} [options.timeout=10000] Per-attempt timeout in ms.
 * @param {string} [options.userAgent] User-Agent override.
 * @param {number} [options.maxRetries=2] Retries after the first attempt.
 * @param {Function} [options.fetchFn] Fetch implementation to use.
 * @returns {Promise<string>} The response body.
 * @throws {Error} The last failure (with `status` set for HTTP errors), or
 *   "Failed to fetch page" when no attempt was made.
 */
async function fetchPage(url, options = {}) {
  const {
    timeout = 1e4,
    userAgent,
    maxRetries = 2,
    fetchFn
  } = options;
  let lastError = null;
  const resolvedFetch = resolveFetch(fetchFn);
  const isBrowser2 = isBrowserEnvironment();
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    try {
      const headers = {
        Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5"
      };
      if (!isBrowser2) {
        headers["User-Agent"] = chooseUserAgent(userAgent);
      }
      const requestInit = {
        headers,
        signal: controller.signal,
        redirect: "follow"
      };
      const response = await resolvedFetch(url, requestInit);
      clearTimeout(timeoutId);
      if (!response.ok) {
        const error = new Error(
          `HTTP ${response.status}: ${response.statusText}`
        );
        // Carried on the error so isClientError can decide about retrying.
        error.status = response.status;
        throw error;
      }
      // Awaited inside the try so a failed body read is retried as well.
      return await response.text();
    } catch (err) {
      clearTimeout(timeoutId);
      lastError = err instanceof Error ? err : new Error(String(err));
      if (isClientError(lastError)) {
        throw lastError;
      }
      if (attempt < maxRetries) {
        await wait(1e3 * (attempt + 1));
      }
    }
  }
  throw lastError ?? new Error("Failed to fetch page");
}
|
|
478
|
+
|
|
479
|
+
// src/scraper/extractors/utils.ts
|
|
480
|
+
// Lower-cased `@type` spellings that identify a schema.org Recipe.
const RECIPE_TYPES = /* @__PURE__ */ new Set([
  "recipe",
  "https://schema.org/recipe",
  "http://schema.org/recipe"
]);

/**
 * Tells whether a value is an object whose `@type` (a string or an array of
 * strings) names the Recipe type, compared case-insensitively against
 * RECIPE_TYPES.
 *
 * The debug `fetch` POST to http://127.0.0.1:7243 that used to run on every
 * call has been removed: it issued a network request per inspected node and
 * threw a ReferenceError in runtimes without a global `fetch`.
 *
 * @param {unknown} value Candidate JSON-LD node.
 * @returns {boolean}
 */
function isRecipeNode(value) {
  if (!value || typeof value !== "object") {
    return false;
  }
  const type = value["@type"];
  if (typeof type === "string") {
    return RECIPE_TYPES.has(type.toLowerCase());
  }
  if (Array.isArray(type)) {
    return type.some(
      (entry) => typeof entry === "string" && RECIPE_TYPES.has(entry.toLowerCase())
    );
  }
  return false;
}
|
|
502
|
+
/**
 * Parses JSON without throwing.
 *
 * @param {string} content JSON text.
 * @returns {unknown} The parsed value, or `null` when parsing fails.
 */
function safeJsonParse(content) {
  let parsed = null;
  try {
    parsed = JSON.parse(content);
  } catch {
    // Malformed JSON is reported to the caller as null.
    parsed = null;
  }
  return parsed;
}
|
|
509
|
+
/**
 * Collapses every run of whitespace into a single space and trims the ends.
 *
 * @param {string | null | undefined} value Raw text.
 * @returns {string | undefined} The cleaned text, or `undefined` when the
 *   input is falsy or contains only whitespace.
 */
function normalizeText(value) {
  if (!value) {
    return void 0;
  }
  const collapsed = value.replace(/\s+/g, " ").trim();
  return collapsed === "" ? void 0 : collapsed;
}
|
|
514
|
+
|
|
515
|
+
// src/scraper/extractors/jsonld.ts
|
|
516
|
+
/**
 * Finds the first Recipe node declared in the page's
 * `<script type="application/ld+json">` blocks.
 *
 * Scripts that are empty or fail to parse are skipped. The three debug
 * `fetch` POSTs to http://127.0.0.1:7243 that were left in this function
 * have been removed: they were unguarded, so they threw where no global
 * `fetch` exists, and they sent extraction details to a local port.
 *
 * @param {string} html Page markup.
 * @returns {object | null} The first Recipe candidate, or `null`.
 */
function extractJsonLd(html) {
  const $ = load(html);
  const candidates = [];
  $('script[type="application/ld+json"]').each((_, element) => {
    const content = $(element).html();
    if (!content) return;
    const parsed = safeJsonParse(content);
    if (!parsed) return;
    collectCandidates(parsed, candidates);
  });
  return candidates[0] ?? null;
}
|
|
535
|
+
/**
 * Walks parsed JSON-LD and appends every Recipe node found to `bucket`.
 * Arrays are walked entry by entry; a non-Recipe object is searched through
 * its `@graph` array only.
 *
 * @param {unknown} payload Parsed JSON-LD value.
 * @param {object[]} bucket Receives the Recipe nodes, in discovery order.
 * @returns {void}
 */
function collectCandidates(payload, bucket) {
  if (!payload || typeof payload !== "object") {
    return;
  }
  if (Array.isArray(payload)) {
    for (const entry of payload) {
      collectCandidates(entry, bucket);
    }
    return;
  }
  if (isRecipeNode(payload)) {
    bucket.push(payload);
    return;
  }
  const graph = payload["@graph"];
  if (Array.isArray(graph)) {
    for (const entry of graph) {
      collectCandidates(entry, bucket);
    }
  }
}
|
|
553
|
+
// Microdata properties copied as single values onto the recipe.
const SIMPLE_PROPS = [
  "name",
  "description",
  "image",
  "recipeYield",
  "prepTime",
  "cookTime",
  "totalTime"
];

/**
 * Builds a schema.org-style Recipe object from the first microdata element
 * whose `itemtype` contains "schema.org/Recipe".
 *
 * @param {string} html Page markup.
 * @returns {object | null} A recipe carrying `@type`, any simple properties
 *   found and the `recipeIngredient` / `recipeInstructions` lists, or `null`
 *   when there is no recipe element or neither a name nor an ingredient was
 *   found.
 */
function extractMicrodata(html) {
  const $ = load(html);
  const root = $('[itemscope][itemtype*="schema.org/Recipe"]').first();
  if (!root.length) {
    return null;
  }
  const recipe = { "@type": "Recipe" };
  for (const prop of SIMPLE_PROPS) {
    const found = findPropertyValue($, root, prop);
    if (found) {
      recipe[prop] = found;
    }
  }

  const ingredients = [];
  root.find('[itemprop="recipeIngredient"]').each((_, el) => {
    const node = $(el);
    const line = normalizeText(node.attr("content") || node.text());
    if (line) {
      ingredients.push(line);
    }
  });
  if (ingredients.length) {
    recipe.recipeIngredient = ingredients;
  }

  const instructions = [];
  root.find('[itemprop="recipeInstructions"]').each((_, el) => {
    const node = $(el);
    // Preference order: content attribute, nested itemprop="text", own text.
    const line =
      normalizeText(node.attr("content")) ||
      normalizeText(node.find('[itemprop="text"]').first().text()) ||
      normalizeText(node.text());
    if (line) {
      instructions.push(line);
    }
  });
  if (instructions.length) {
    recipe.recipeInstructions = instructions;
  }

  return recipe.name || ingredients.length ? recipe : null;
}
|
|
598
|
+
/**
 * Reads the value of the first descendant carrying the given `itemprop`,
 * preferring its `content`, then `href`, then `src` attribute, then its text.
 *
 * @param {unknown} $ Unused by this function.
 * @param {object} context Selection to search within.
 * @param {string} prop The `itemprop` name.
 * @returns {string | undefined} Normalised value, or `undefined`.
 */
function findPropertyValue($, context, prop) {
  const node = context.find(`[itemprop="${prop}"]`).first();
  if (!node.length) {
    return void 0;
  }
  for (const attribute of ["content", "href", "src"]) {
    const fromAttribute = normalizeText(node.attr(attribute));
    if (fromAttribute) {
      return fromAttribute;
    }
  }
  return normalizeText(node.text());
}
|
|
603
|
+
|
|
604
|
+
// src/scraper/extractors/browser.ts
|
|
605
|
+
// Microdata properties copied as single values by the DOMParser-based extractor.
const SIMPLE_PROPS2 = ["name", "description", "image", "recipeYield", "prepTime", "cookTime", "totalTime"];

/**
 * Extracts a recipe using the DOMParser-based extractors, trying JSON-LD
 * first and microdata second.
 *
 * @param {string} html Page markup.
 * @returns {{recipe: object | null, source: "jsonld" | "microdata" | null}}
 *   The recipe and the format it came from; both `null` when nothing is found.
 */
function extractRecipeBrowser(html) {
  const fromJsonLd = extractJsonLdBrowser(html);
  if (fromJsonLd) {
    return { recipe: fromJsonLd, source: "jsonld" };
  }
  const fromMicrodata = extractMicrodataBrowser(html);
  return fromMicrodata
    ? { recipe: fromMicrodata, source: "microdata" }
    : { recipe: null, source: null };
}
|
|
617
|
+
/**
 * DOMParser-based JSON-LD extraction: returns the first Recipe node declared
 * in the page's `<script type="application/ld+json">` blocks.
 *
 * @param {string} html Page markup.
 * @returns {object | null} The first Recipe candidate, or `null` when none
 *   is found or `globalThis.DOMParser` is unavailable.
 */
function extractJsonLdBrowser(html) {
  const Parser = globalThis.DOMParser;
  if (typeof Parser === "undefined") {
    return null;
  }
  const doc = new Parser().parseFromString(html, "text/html");
  const found = [];
  doc.querySelectorAll('script[type="application/ld+json"]').forEach((script) => {
    const raw = script.textContent;
    const data = raw ? safeJsonParse(raw) : null;
    if (data) {
      collectCandidates2(data, found);
    }
  });
  return found.length > 0 ? found[0] ?? null : null;
}
|
|
634
|
+
/**
 * DOMParser-based microdata extraction: builds a schema.org-style Recipe
 * object from the first element whose `itemtype` contains
 * "schema.org/Recipe".
 *
 * @param {string} html Page markup.
 * @returns {object | null} A recipe carrying `@type`, any simple properties
 *   found and the `recipeIngredient` / `recipeInstructions` lists, or `null`
 *   when DOMParser is unavailable, there is no recipe element, or neither a
 *   name nor an ingredient was found.
 */
function extractMicrodataBrowser(html) {
  const Parser = globalThis.DOMParser;
  if (typeof Parser === "undefined") {
    return null;
  }
  const doc = new Parser().parseFromString(html, "text/html");
  const root = doc.querySelector('[itemscope][itemtype*="schema.org/Recipe"]');
  if (!root) {
    return null;
  }
  const recipe = { "@type": "Recipe" };
  for (const prop of SIMPLE_PROPS2) {
    const found = findPropertyValue2(root, prop);
    if (found) {
      recipe[prop] = found;
    }
  }

  const ingredients = [];
  root.querySelectorAll('[itemprop="recipeIngredient"]').forEach((el) => {
    const line = normalizeText(el.getAttribute("content") || el.textContent || void 0);
    if (line) {
      ingredients.push(line);
    }
  });
  if (ingredients.length) {
    recipe.recipeIngredient = ingredients;
  }

  const instructions = [];
  root.querySelectorAll('[itemprop="recipeInstructions"]').forEach((el) => {
    // Preference order: content attribute, nested itemprop="text", own text.
    const line =
      normalizeText(el.getAttribute("content")) ||
      normalizeText(el.querySelector('[itemprop="text"]')?.textContent || void 0) ||
      normalizeText(el.textContent || void 0);
    if (line) {
      instructions.push(line);
    }
  });
  if (instructions.length) {
    recipe.recipeInstructions = instructions;
  }

  return recipe.name || ingredients.length ? recipe : null;
}
|
|
676
|
+
/**
 * Reads the value of the first descendant element carrying the given
 * `itemprop`, preferring its `content`, then `href`, then `src` attribute,
 * then its text content.
 *
 * @param {object} context Element to search within.
 * @param {string} prop The `itemprop` name.
 * @returns {string | undefined} Normalised value, or `undefined`.
 */
function findPropertyValue2(context, prop) {
  const node = context.querySelector(`[itemprop="${prop}"]`);
  if (!node) {
    return void 0;
  }
  for (const attribute of ["content", "href", "src"]) {
    const fromAttribute = normalizeText(node.getAttribute(attribute));
    if (fromAttribute) {
      return fromAttribute;
    }
  }
  return normalizeText(node.textContent || void 0);
}
|
|
681
|
+
/**
 * Walks parsed JSON-LD and appends every Recipe node found to `bucket`
 * (variant used by the DOMParser-based extractor). Arrays are walked entry
 * by entry; a non-Recipe object is searched through its `@graph` array only.
 *
 * @param {unknown} payload Parsed JSON-LD value.
 * @param {object[]} bucket Receives the Recipe nodes, in discovery order.
 * @returns {void}
 */
function collectCandidates2(payload, bucket) {
  if (!payload) {
    return;
  }
  const walk = (entries) => {
    for (const entry of entries) {
      collectCandidates2(entry, bucket);
    }
  };
  if (Array.isArray(payload)) {
    walk(payload);
  } else if (typeof payload === "object") {
    if (isRecipeNode(payload)) {
      bucket.push(payload);
    } else if (Array.isArray(payload["@graph"])) {
      walk(payload["@graph"]);
    }
  }
}
|
|
699
|
+
|
|
700
|
+
// src/scraper/extractors/index.ts
|
|
701
|
+
/**
 * Reports whether a global `DOMParser` is available.
 *
 * @returns {boolean} `true` when `globalThis.DOMParser` is defined; `false`
 *   otherwise, or when reading it throws.
 */
function isBrowser() {
  let available = false;
  try {
    available = typeof globalThis.DOMParser !== "undefined";
  } catch {
    available = false;
  }
  return available;
}
|
|
708
|
+
/**
 * Extracts recipe data from page markup.
 *
 * When a global DOMParser is available the DOMParser-based extractor is
 * used; otherwise JSON-LD is tried first and microdata second.
 *
 * The debug instrumentation that POSTed extraction results to
 * http://127.0.0.1:7243 after each step has been removed: it was development
 * residue that issued network calls from library code on every extraction.
 *
 * @param {string} html Page markup.
 * @returns {{recipe: object | null, source: "jsonld" | "microdata" | null}}
 *   The recipe and the format it came from; both `null` when nothing is found.
 */
function extractRecipe(html) {
  if (isBrowser()) {
    return extractRecipeBrowser(html);
  }
  const jsonLdRecipe = extractJsonLd(html);
  if (jsonLdRecipe) {
    return { recipe: jsonLdRecipe, source: "jsonld" };
  }
  const microdataRecipe = extractMicrodata(html);
  if (microdataRecipe) {
    return { recipe: microdataRecipe, source: "microdata" };
  }
  return { recipe: null, source: null };
}
|
|
742
|
+
|
|
743
|
+
// src/scraper/index.ts
|
|
744
|
+
/**
 * Fetches a page, extracts its Schema.org recipe data and converts it with
 * `fromSchemaOrg`.
 *
 * The only network request made is the page fetch performed by `fetchPage`.
 * Earlier builds shipped leftover debug instrumentation that POSTed the
 * scraped URL, a preview of the fetched HTML and recipe names to a
 * hard-coded localhost endpoint on every call; it has been removed because
 * it leaked caller data and added five fire-and-forget requests per scrape.
 *
 * @param url - Address of the page to scrape; forwarded to `fetchPage`.
 * @param options - Options forwarded to `fetchPage`. Defaults to `{}`.
 * @returns A promise resolving to the converted recipe.
 * @throws {Error} "No Schema.org recipe data found in page" when extraction
 *   yields no recipe.
 * @throws {Error} "Schema.org data did not include a valid recipe" when
 *   `fromSchemaOrg` returns a falsy value.
 */
async function scrapeRecipe(url, options = {}) {
  const html = await fetchPage(url, options);
  const { recipe } = extractRecipe(html);
  if (!recipe) {
    throw new Error("No Schema.org recipe data found in page");
  }
  const soustackRecipe = fromSchemaOrg(recipe);
  if (!soustackRecipe) {
    throw new Error("Schema.org data did not include a valid recipe");
  }
  return soustackRecipe;
}
|
|
796
|
+
/**
 * Extracts Schema.org recipe data from already-fetched HTML and converts it
 * with `fromSchemaOrg`.
 *
 * @param html - HTML markup to inspect; forwarded to `extractRecipe`.
 * @returns The converted recipe.
 * @throws {Error} "No Schema.org recipe data found in HTML" when extraction
 *   yields no recipe.
 * @throws {Error} "Schema.org data did not include a valid recipe" when
 *   `fromSchemaOrg` returns a falsy value.
 */
function extractRecipeFromHTML(html) {
  const schemaOrgRecipe = extractRecipe(html).recipe;
  if (!schemaOrgRecipe) {
    throw new Error("No Schema.org recipe data found in HTML");
  }
  const converted = fromSchemaOrg(schemaOrgRecipe);
  if (converted) {
    return converted;
  }
  throw new Error("Schema.org data did not include a valid recipe");
}
|
|
807
|
+
/**
 * Returns the raw recipe found by `extractRecipe` in the given HTML, without
 * converting it.
 *
 * @param html - HTML markup to inspect; forwarded to `extractRecipe`.
 * @returns The `recipe` field of the extraction result (`null` when the
 *   non-browser extractors find nothing).
 */
function extractSchemaOrgRecipeFromHTML(html) {
  return extractRecipe(html).recipe;
}
|
|
811
|
+
|
|
812
|
+
export { extractRecipeFromHTML, extractSchemaOrgRecipeFromHTML, fetchPage, scrapeRecipe };
|
|
813
|
+
//# sourceMappingURL=scrape.mjs.map
|
|
814
|
+
//# sourceMappingURL=scrape.mjs.map
|