soustack 0.2.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +128 -18
- package/dist/cli/index.js +1706 -665
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.mts +172 -28
- package/dist/index.d.ts +172 -28
- package/dist/index.js +2028 -662
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +2022 -662
- package/dist/index.mjs.map +1 -1
- package/dist/{scrape.d.mts → scrape/index.d.mts} +38 -10
- package/dist/{scrape.d.ts → scrape/index.d.ts} +38 -10
- package/dist/{scrape.js → scrape/index.js} +268 -62
- package/dist/scrape/index.js.map +1 -0
- package/dist/{scrape.mjs → scrape/index.mjs} +268 -62
- package/dist/scrape/index.mjs.map +1 -0
- package/package.json +15 -9
- package/src/profiles/base.schema.json +2 -2
- package/src/profiles/cookable.schema.json +4 -4
- package/src/profiles/illustrated.schema.json +4 -4
- package/src/profiles/quantified.schema.json +4 -4
- package/src/profiles/scalable.schema.json +6 -6
- package/src/profiles/schedulable.schema.json +4 -4
- package/src/schema.json +15 -3
- package/src/soustack.schema.json +15 -3
- package/dist/scrape.js.map +0 -1
- package/dist/scrape.mjs.map +0 -1
|
@@ -128,6 +128,92 @@ function extractUrl(value) {
|
|
|
128
128
|
return trimmed || void 0;
|
|
129
129
|
}
|
|
130
130
|
|
|
131
|
+
// src/normalize.ts
|
|
132
|
+
/**
 * Returns a normalized deep copy of a recipe together with any
 * normalization warnings. The caller's object is never mutated.
 *
 * @param {object} input - Recipe document (must be a non-array object).
 * @returns {{ recipe: object, warnings: string[] }} The normalized copy and
 *   the warnings collected while normalizing it.
 * @throws {Error} If `input` is not a plain recipe object, or if it still
 *   carries the unsupported legacy field.
 */
function normalizeRecipe(input) {
  // Arrays are `typeof "object"` as well, but can never be a recipe document.
  if (!input || typeof input !== "object" || Array.isArray(input)) {
    throw new Error("Recipe input must be an object");
  }

  // JSON round-trip: work on a deep copy so the input stays untouched.
  const recipe = JSON.parse(JSON.stringify(input));
  const warnings = [];

  const legacyField = ["mod", "ules"].join("");
  if (legacyField in recipe) {
    throw new Error("The legacy field is no longer supported. Use `stacks` instead.");
  }

  // normalizeStacks always leaves `recipe.stacks` set to an object map.
  normalizeStacks(recipe, warnings);

  // Map the deprecated `version` onto `recipeVersion` unless one is already set.
  if (typeof recipe.version === "string" && !recipe.recipeVersion) {
    recipe.recipeVersion = recipe.version;
    warnings.push("'version' is deprecated; mapped to 'recipeVersion'.");
  }

  normalizeTime(recipe);
  return { recipe, warnings };
}
|
|
156
|
+
/**
 * Rewrites `recipe.stacks` in place as a `{ name: version }` map.
 *
 * Two input shapes are understood:
 *  - an object map: entries whose value is a positive integer are kept, every
 *    other entry produces a warning;
 *  - an array: string items are parsed with `parseStackIdentifier`
 *    (`name@version`), non-string items are ignored, unparseable strings
 *    produce a warning, and the highest version wins for a repeated name.
 * Any other value results in an empty map.
 *
 * @param {object} recipe - Recipe whose `stacks` property is replaced.
 * @param {string[]} warnings - Collector that receives warning messages.
 * @returns {void}
 */
function normalizeStacks(recipe, warnings) {
  const stacks = {};
  const declared = recipe.stacks;

  if (Array.isArray(declared)) {
    for (const identifier of declared) {
      if (typeof identifier !== "string") {
        continue;
      }
      const parsed = parseStackIdentifier(identifier);
      if (!parsed) {
        warnings.push(`Invalid stack identifier '${identifier}': expected format 'name@version' (e.g., 'scaling@1')`);
        continue;
      }
      const { name, version } = parsed;
      // Own-property check: a plain `stacks[name]` lookup would also see
      // inherited members such as `constructor` or `toString` and wrongly
      // treat those stack names as already registered.
      const alreadySet = Object.prototype.hasOwnProperty.call(stacks, name);
      if (!alreadySet || stacks[name] < version) {
        stacks[name] = version;
      }
    }
  } else if (declared && typeof declared === "object") {
    for (const [key, value] of Object.entries(declared)) {
      if (typeof value === "number" && Number.isInteger(value) && value >= 1) {
        stacks[key] = value;
      } else {
        warnings.push(`Invalid stack version for '${key}': expected positive integer, got ${value}`);
      }
    }
  }

  recipe.stacks = stacks;
}
|
|
183
|
+
/**
 * Parses a stack identifier of the form `name@version` (e.g. `scaling@1`).
 *
 * Surrounding whitespace is ignored. The name may contain letters, digits,
 * `_` and `-`; the version must be a positive decimal integer that is exactly
 * representable as a number.
 *
 * @param {unknown} identifier - Candidate identifier.
 * @returns {{ name: string, version: number } | null} The parsed parts, or
 *   `null` when the identifier is not a valid `name@version` string.
 */
function parseStackIdentifier(identifier) {
  if (typeof identifier !== "string") {
    return null;
  }

  const match = /^([a-z0-9_-]+)@(\d+)$/i.exec(identifier.trim());
  if (!match) {
    return null;
  }

  const [, name, digits] = match;
  const version = Number.parseInt(digits, 10);
  // Very long digit runs overflow to an imprecise value or Infinity; reject
  // them instead of reporting a version number that was never written.
  if (!Number.isSafeInteger(version) || version < 1) {
    return null;
  }

  return { name, version };
}
|
|
198
|
+
/**
 * Converts the structured duration fields of `recipe.time` in place.
 *
 * For each of `prep`, `active`, `passive` and `total`, a value that is not
 * already a number is handed to `parseDuration`; the field is overwritten
 * only when that call returns something other than `null`. Nothing happens
 * when `recipe.time` is missing, not an object, or an array.
 *
 * @param {object} recipe - Recipe whose `time` object is updated.
 * @returns {void}
 */
function normalizeTime(recipe) {
  const time = recipe?.time;
  const isTimeObject = Boolean(time) && typeof time === "object" && !Array.isArray(time);
  if (!isTimeObject) {
    return;
  }

  for (const field of ["prep", "active", "passive", "total"]) {
    const current = time[field];
    if (typeof current === "number") {
      continue;
    }
    const minutes = parseDuration(current);
    if (minutes !== null) {
      time[field] = minutes;
    }
  }
}
|
|
216
|
+
|
|
131
217
|
// src/fromSchemaOrg.ts
|
|
132
218
|
function fromSchemaOrg(input) {
|
|
133
219
|
const recipeNode = extractRecipeNode(input);
|
|
@@ -141,8 +227,22 @@ function fromSchemaOrg(input) {
|
|
|
141
227
|
const tags = collectTags(recipeNode.recipeCuisine, recipeNode.keywords);
|
|
142
228
|
const category = extractFirst(recipeNode.recipeCategory);
|
|
143
229
|
const source = convertSource(recipeNode);
|
|
144
|
-
const
|
|
145
|
-
|
|
230
|
+
const dateModified = recipeNode.dateModified || void 0;
|
|
231
|
+
const nutrition = convertNutrition(recipeNode.nutrition);
|
|
232
|
+
const attribution = convertAttribution(recipeNode);
|
|
233
|
+
const taxonomy = convertTaxonomy(tags, category, extractFirst(recipeNode.recipeCuisine));
|
|
234
|
+
const media = convertMedia(recipeNode.image, recipeNode.video);
|
|
235
|
+
const times = convertTimes(time);
|
|
236
|
+
const stacks = {};
|
|
237
|
+
if (attribution) stacks.attribution = 1;
|
|
238
|
+
if (taxonomy) stacks.taxonomy = 1;
|
|
239
|
+
if (media) stacks.media = 1;
|
|
240
|
+
if (nutrition) stacks.nutrition = 1;
|
|
241
|
+
if (times) stacks.times = 1;
|
|
242
|
+
const rawRecipe = {
|
|
243
|
+
"@type": "Recipe",
|
|
244
|
+
profile: "minimal",
|
|
245
|
+
stacks,
|
|
146
246
|
name: recipeNode.name.trim(),
|
|
147
247
|
description: recipeNode.description?.trim() || void 0,
|
|
148
248
|
image: normalizeImage(recipeNode.image),
|
|
@@ -150,13 +250,19 @@ function fromSchemaOrg(input) {
|
|
|
150
250
|
tags: tags.length ? tags : void 0,
|
|
151
251
|
source,
|
|
152
252
|
dateAdded: recipeNode.datePublished || void 0,
|
|
153
|
-
dateModified: recipeNode.dateModified || void 0,
|
|
154
253
|
yield: recipeYield,
|
|
155
254
|
time,
|
|
156
255
|
ingredients,
|
|
157
256
|
instructions,
|
|
158
|
-
|
|
257
|
+
...dateModified ? { dateModified } : {},
|
|
258
|
+
...nutrition ? { nutrition } : {},
|
|
259
|
+
...attribution ? { attribution } : {},
|
|
260
|
+
...taxonomy ? { taxonomy } : {},
|
|
261
|
+
...media ? { media } : {},
|
|
262
|
+
...times ? { times } : {}
|
|
159
263
|
};
|
|
264
|
+
const { recipe } = normalizeRecipe(rawRecipe);
|
|
265
|
+
return recipe;
|
|
160
266
|
}
|
|
161
267
|
function extractRecipeNode(input) {
|
|
162
268
|
if (!input) return null;
|
|
@@ -368,6 +474,90 @@ function extractEntityName(value) {
|
|
|
368
474
|
}
|
|
369
475
|
return void 0;
|
|
370
476
|
}
|
|
477
|
+
/**
 * Builds the attribution record (`url`, `author`, `datePublished`) for a
 * schema.org recipe node.
 *
 * The URL is taken from `recipe.url`, falling back to `mainEntityOfPage`.
 * `mainEntityOfPage` may be a plain URL string or an object; for an object
 * its `@id` (then `url`) string is used. Non-string values are ignored
 * rather than throwing.
 *
 * @param {object} recipe - schema.org recipe node.
 * @returns {{ url?: string, author?: string, datePublished?: string } | undefined}
 *   The attribution record, or `undefined` when no field could be filled.
 */
function convertAttribution(recipe) {
  // Trim strings; anything else yields "" so callers can chain with `||`.
  const asText = (value) => (typeof value === "string" ? value.trim() : "");

  const page = recipe.mainEntityOfPage;
  const pageUrl = page && typeof page === "object"
    ? asText(page["@id"]) || asText(page.url)
    : asText(page);

  const url = asText(recipe.url) || pageUrl;
  const author = extractEntityName(recipe.author);
  const datePublished = asText(recipe.datePublished);

  const attribution = {};
  if (url) {
    attribution.url = url;
  }
  if (author) {
    attribution.author = author;
  }
  if (datePublished) {
    attribution.datePublished = datePublished;
  }
  return Object.keys(attribution).length > 0 ? attribution : void 0;
}
|
|
487
|
+
/**
 * Assembles the taxonomy record from keywords, category and cuisine.
 * Only non-empty / truthy parts are included.
 *
 * @param {Array} keywords - Keyword list; included when it has entries.
 * @param {*} category - Category; included when truthy.
 * @param {*} cuisine - Cuisine; included when truthy.
 * @returns {object | undefined} The taxonomy record, or `undefined` when
 *   every part is empty.
 */
function convertTaxonomy(keywords, category, cuisine) {
  const taxonomy = {
    ...(keywords.length ? { keywords } : {}),
    ...(category ? { category } : {}),
    ...(cuisine ? { cuisine } : {}),
  };
  const isEmpty = Object.keys(taxonomy).length === 0;
  return isEmpty ? void 0 : taxonomy;
}
|
|
494
|
+
/**
 * Normalizes a media value into a list of non-empty URL strings.
 *
 * Accepts a string, an array mixing strings and media objects, or a single
 * media object; strings are trimmed and objects are resolved through
 * `extractMediaUrl`. Empty results are dropped.
 *
 * @param {*} value - Raw media value.
 * @returns {string[]} The collected URLs (possibly empty).
 */
function normalizeMediaList(value) {
  if (!value) {
    return [];
  }

  if (typeof value === "string") {
    const trimmed = value.trim();
    return trimmed ? [trimmed] : [];
  }

  if (!Array.isArray(value)) {
    const single = extractMediaUrl(value);
    return single ? [single] : [];
  }

  const urls = [];
  for (const item of value) {
    const candidate = typeof item === "string" ? item.trim() : extractMediaUrl(item);
    if (candidate?.length) {
      urls.push(candidate);
    }
  }
  return urls;
}
|
|
503
|
+
/**
 * Reads the trimmed `url` string from a media object.
 *
 * @param {*} value - Candidate media object.
 * @returns {string | undefined} The trimmed URL, or `undefined` when `value`
 *   is not an object, has no string `url`, or the URL is blank.
 */
function extractMediaUrl(value) {
  if (value === null || typeof value !== "object") {
    return void 0;
  }
  if (typeof value.url !== "string") {
    return void 0;
  }
  const cleaned = value.url.trim();
  return cleaned === "" ? void 0 : cleaned;
}
|
|
510
|
+
/**
 * Builds the media record (`images`, `videos`) from raw image and video
 * values, using `normalizeImage` and `normalizeMediaList` respectively.
 *
 * @param {*} image - Raw image value.
 * @param {*} video - Raw video value.
 * @returns {{ images?: Array, videos?: string[] } | undefined} The media
 *   record, or `undefined` when neither list has entries.
 */
function convertMedia(image, video) {
  const primary = normalizeImage(image);

  // normalizeImage's result may be a single value or a list; always work on a list.
  let images = [];
  if (primary) {
    images = Array.isArray(primary) ? primary : [primary];
  }
  const videos = normalizeMediaList(video);

  const media = {
    ...(images.length ? { images } : {}),
    ...(videos.length ? { videos } : {}),
  };
  return Object.keys(media).length > 0 ? media : void 0;
}
|
|
519
|
+
/**
 * Maps numeric fields of a time object onto the times record:
 * `prep` → `prepMinutes`, `active` → `cookMinutes`, `total` → `totalMinutes`.
 * Fields that are not numbers are skipped.
 *
 * @param {object | undefined} time - Source time object.
 * @returns {{ prepMinutes?: number, cookMinutes?: number, totalMinutes?: number } | undefined}
 *   The times record, or `undefined` when `time` is falsy or has no numeric field.
 */
function convertTimes(time) {
  if (!time) {
    return void 0;
  }

  const fieldMap = [
    ["prep", "prepMinutes"],
    ["active", "cookMinutes"],
    ["total", "totalMinutes"],
  ];
  const entries = fieldMap
    .filter(([source]) => typeof time[source] === "number")
    .map(([source, target]) => [target, time[source]]);

  return entries.length > 0 ? Object.fromEntries(entries) : void 0;
}
|
|
527
|
+
/**
 * Extracts `calories` and `protein_g` from a nutrition object.
 *
 * Each amount may be a number or a string such as `"250 kcal"`; strings are
 * stripped of everything except digits, `.` and `-` before parsing. Protein
 * is read from `proteinContent`, falling back to `protein_g` when the former
 * is absent or unusable. A numeric `0` is a valid amount and is kept.
 *
 * @param {*} nutrition - Raw nutrition value.
 * @returns {{ calories?: number, protein_g?: number } | undefined} The parsed
 *   amounts, or `undefined` when `nutrition` is not an object or yields no data.
 */
function convertNutrition(nutrition) {
  if (!nutrition || typeof nutrition !== "object") {
    return void 0;
  }

  // Number or numeric-looking string → finite number; anything else → undefined.
  const toAmount = (raw) => {
    if (typeof raw === "number") {
      return Number.isFinite(raw) ? raw : void 0;
    }
    if (typeof raw === "string") {
      const parsed = Number.parseFloat(raw.replace(/[^\d.-]/g, ""));
      return Number.isNaN(parsed) ? void 0 : parsed;
    }
    return void 0;
  };

  const result = {};

  const calories = toAmount(nutrition.calories);
  if (calories !== void 0) {
    result.calories = calories;
  }

  // `??` rather than `||`: zero grams of protein must not trigger the fallback.
  const protein = toAmount(nutrition.proteinContent) ?? toAmount(nutrition.protein_g);
  if (protein !== void 0) {
    result.protein_g = protein;
  }

  return Object.keys(result).length > 0 ? result : void 0;
}
|
|
371
561
|
|
|
372
562
|
// src/scraper/fetch.ts
|
|
373
563
|
var DEFAULT_USER_AGENTS = [
|
|
@@ -433,13 +623,16 @@ async function fetchPage(url, options = {}) {
|
|
|
433
623
|
const response = await resolvedFetch(url, requestInit);
|
|
434
624
|
clearTimeout(timeoutId);
|
|
435
625
|
if (response && typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
globalFetch
|
|
440
|
-
|
|
626
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
627
|
+
if (ingestUrl) {
|
|
628
|
+
try {
|
|
629
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
630
|
+
if (globalFetch) {
|
|
631
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:63", message: "fetch response", data: { url, status: response.status, statusText: response.statusText, ok: response.ok, isNYTimes: url.includes("nytimes.com") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
|
|
632
|
+
});
|
|
633
|
+
}
|
|
634
|
+
} catch {
|
|
441
635
|
}
|
|
442
|
-
} catch {
|
|
443
636
|
}
|
|
444
637
|
}
|
|
445
638
|
if (!response.ok) {
|
|
@@ -451,13 +644,16 @@ async function fetchPage(url, options = {}) {
|
|
|
451
644
|
}
|
|
452
645
|
const html = await response.text();
|
|
453
646
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
globalFetch
|
|
458
|
-
|
|
647
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
648
|
+
if (ingestUrl) {
|
|
649
|
+
try {
|
|
650
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
651
|
+
if (globalFetch) {
|
|
652
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:75", message: "HTML received", data: { htmlLength: html.length, hasLoginPage: html.toLowerCase().includes("login") || html.toLowerCase().includes("sign in"), hasRecipeData: html.includes("application/ld+json") || html.includes("schema.org/Recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B,D" }) }).catch(() => {
|
|
653
|
+
});
|
|
654
|
+
}
|
|
655
|
+
} catch {
|
|
459
656
|
}
|
|
460
|
-
} catch {
|
|
461
657
|
}
|
|
462
658
|
}
|
|
463
659
|
return html;
|
|
@@ -487,8 +683,6 @@ function isRecipeNode(value) {
|
|
|
487
683
|
return false;
|
|
488
684
|
}
|
|
489
685
|
const type = value["@type"];
|
|
490
|
-
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/utils.ts:14", message: "isRecipeNode check", data: { type, typeLower: typeof type === "string" ? type.toLowerCase() : Array.isArray(type) ? type.map((t) => typeof t === "string" ? t.toLowerCase() : t) : void 0, isMatch: typeof type === "string" ? RECIPE_TYPES.has(type.toLowerCase()) : Array.isArray(type) ? type.some((e) => typeof e === "string" && RECIPE_TYPES.has(e.toLowerCase())) : false }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
491
|
-
});
|
|
492
686
|
if (typeof type === "string") {
|
|
493
687
|
return RECIPE_TYPES.has(type.toLowerCase());
|
|
494
688
|
}
|
|
@@ -516,20 +710,14 @@ function normalizeText(value) {
|
|
|
516
710
|
function extractJsonLd(html) {
|
|
517
711
|
const $ = load(html);
|
|
518
712
|
const scripts = $('script[type="application/ld+json"]');
|
|
519
|
-
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:8", message: "JSON-LD scripts found", data: { scriptCount: scripts.length }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
|
|
520
|
-
});
|
|
521
713
|
const candidates = [];
|
|
522
714
|
scripts.each((_, element) => {
|
|
523
715
|
const content = $(element).html();
|
|
524
716
|
if (!content) return;
|
|
525
717
|
const parsed = safeJsonParse(content);
|
|
526
718
|
if (!parsed) return;
|
|
527
|
-
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:18", message: "JSON-LD parsed", data: { hasGraph: !!(parsed && typeof parsed === "object" && "@graph" in parsed), type: parsed && typeof parsed === "object" && "@type" in parsed ? parsed["@type"] : void 0 }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C" }) }).catch(() => {
|
|
528
|
-
});
|
|
529
719
|
collectCandidates(parsed, candidates);
|
|
530
720
|
});
|
|
531
|
-
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:22", message: "JSON-LD candidates", data: { candidateCount: candidates.length, candidateTypes: candidates.map((c) => c["@type"]) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C" }) }).catch(() => {
|
|
532
|
-
});
|
|
533
721
|
return candidates[0] ?? null;
|
|
534
722
|
}
|
|
535
723
|
function collectCandidates(payload, bucket) {
|
|
@@ -711,13 +899,16 @@ function extractRecipe(html) {
|
|
|
711
899
|
}
|
|
712
900
|
const jsonLdRecipe = extractJsonLd(html);
|
|
713
901
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
globalFetch
|
|
718
|
-
|
|
902
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
903
|
+
if (ingestUrl) {
|
|
904
|
+
try {
|
|
905
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
906
|
+
if (globalFetch) {
|
|
907
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:6", message: "JSON-LD extraction result", data: { hasJsonLd: !!jsonLdRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
|
|
908
|
+
});
|
|
909
|
+
}
|
|
910
|
+
} catch {
|
|
719
911
|
}
|
|
720
|
-
} catch {
|
|
721
912
|
}
|
|
722
913
|
}
|
|
723
914
|
if (jsonLdRecipe) {
|
|
@@ -725,13 +916,16 @@ function extractRecipe(html) {
|
|
|
725
916
|
}
|
|
726
917
|
const microdataRecipe = extractMicrodata(html);
|
|
727
918
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
globalFetch
|
|
732
|
-
|
|
919
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
920
|
+
if (ingestUrl) {
|
|
921
|
+
try {
|
|
922
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
923
|
+
if (globalFetch) {
|
|
924
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:12", message: "Microdata extraction result", data: { hasMicrodata: !!microdataRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "D" }) }).catch(() => {
|
|
925
|
+
});
|
|
926
|
+
}
|
|
927
|
+
} catch {
|
|
733
928
|
}
|
|
734
|
-
} catch {
|
|
735
929
|
}
|
|
736
930
|
}
|
|
737
931
|
if (microdataRecipe) {
|
|
@@ -743,35 +937,44 @@ function extractRecipe(html) {
|
|
|
743
937
|
// src/scraper/index.ts
|
|
744
938
|
async function scrapeRecipe(url, options = {}) {
|
|
745
939
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
globalFetch
|
|
750
|
-
|
|
940
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
941
|
+
if (ingestUrl) {
|
|
942
|
+
try {
|
|
943
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
944
|
+
if (globalFetch) {
|
|
945
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:7", message: "scrapeRecipe entry", data: { url, hasOptions: !!options }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,B,C,D,E" }) }).catch(() => {
|
|
946
|
+
});
|
|
947
|
+
}
|
|
948
|
+
} catch {
|
|
751
949
|
}
|
|
752
|
-
} catch {
|
|
753
950
|
}
|
|
754
951
|
}
|
|
755
952
|
const html = await fetchPage(url, options);
|
|
756
953
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
globalFetch
|
|
761
|
-
|
|
954
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
955
|
+
if (ingestUrl) {
|
|
956
|
+
try {
|
|
957
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
958
|
+
if (globalFetch) {
|
|
959
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:9", message: "HTML fetched", data: { htmlLength: html?.length, htmlPreview: html?.substring(0, 200) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
|
|
960
|
+
});
|
|
961
|
+
}
|
|
962
|
+
} catch {
|
|
762
963
|
}
|
|
763
|
-
} catch {
|
|
764
964
|
}
|
|
765
965
|
}
|
|
766
966
|
const { recipe } = extractRecipe(html);
|
|
767
967
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
globalFetch
|
|
772
|
-
|
|
968
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
969
|
+
if (ingestUrl) {
|
|
970
|
+
try {
|
|
971
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
972
|
+
if (globalFetch) {
|
|
973
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:11", message: "extractRecipe result", data: { hasRecipe: !!recipe, recipeType: recipe?.["@type"], recipeName: recipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C,D" }) }).catch(() => {
|
|
974
|
+
});
|
|
975
|
+
}
|
|
976
|
+
} catch {
|
|
773
977
|
}
|
|
774
|
-
} catch {
|
|
775
978
|
}
|
|
776
979
|
}
|
|
777
980
|
if (!recipe) {
|
|
@@ -779,13 +982,16 @@ async function scrapeRecipe(url, options = {}) {
|
|
|
779
982
|
}
|
|
780
983
|
const soustackRecipe = fromSchemaOrg(recipe);
|
|
781
984
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
globalFetch
|
|
786
|
-
|
|
985
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
986
|
+
if (ingestUrl) {
|
|
987
|
+
try {
|
|
988
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
989
|
+
if (globalFetch) {
|
|
990
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:17", message: "fromSchemaOrg result", data: { hasSoustackRecipe: !!soustackRecipe, soustackRecipeName: soustackRecipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
991
|
+
});
|
|
992
|
+
}
|
|
993
|
+
} catch {
|
|
787
994
|
}
|
|
788
|
-
} catch {
|
|
789
995
|
}
|
|
790
996
|
}
|
|
791
997
|
if (!soustackRecipe) {
|
|
@@ -810,5 +1016,5 @@ function extractSchemaOrgRecipeFromHTML(html) {
|
|
|
810
1016
|
}
|
|
811
1017
|
|
|
812
1018
|
export { extractRecipeFromHTML, extractSchemaOrgRecipeFromHTML, fetchPage, scrapeRecipe };
|
|
813
|
-
//# sourceMappingURL=
|
|
814
|
-
//# sourceMappingURL=
|
|
1019
|
+
//# sourceMappingURL=index.mjs.map
|
|
1020
|
+
//# sourceMappingURL=index.mjs.map
|