soustack 0.2.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +128 -18
- package/dist/cli/index.js +1706 -665
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.mts +172 -28
- package/dist/index.d.ts +172 -28
- package/dist/index.js +2028 -662
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +2022 -662
- package/dist/index.mjs.map +1 -1
- package/dist/{scrape.d.mts → scrape/index.d.mts} +38 -10
- package/dist/{scrape.d.ts → scrape/index.d.ts} +38 -10
- package/dist/{scrape.js → scrape/index.js} +268 -62
- package/dist/scrape/index.js.map +1 -0
- package/dist/{scrape.mjs → scrape/index.mjs} +268 -62
- package/dist/scrape/index.mjs.map +1 -0
- package/package.json +15 -9
- package/src/profiles/base.schema.json +2 -2
- package/src/profiles/cookable.schema.json +4 -4
- package/src/profiles/illustrated.schema.json +4 -4
- package/src/profiles/quantified.schema.json +4 -4
- package/src/profiles/scalable.schema.json +6 -6
- package/src/profiles/schedulable.schema.json +4 -4
- package/src/schema.json +15 -3
- package/src/soustack.schema.json +15 -3
- package/dist/scrape.js.map +0 -1
- package/dist/scrape.mjs.map +0 -1
|
@@ -1,10 +1,26 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Soustack Recipe Schema v0.
|
|
2
|
+
* Soustack Recipe Schema v0.3.0
|
|
3
3
|
* A portable, scalable, interoperable recipe format.
|
|
4
4
|
*/
|
|
5
5
|
interface SoustackRecipe {
|
|
6
|
+
/** Document marker for Soustack recipes */
|
|
7
|
+
'@type'?: 'Recipe';
|
|
6
8
|
/** Optional $schema pointer for profile-aware validation */
|
|
7
9
|
$schema?: string;
|
|
10
|
+
/** Optional declared validation profile */
|
|
11
|
+
profile?: string;
|
|
12
|
+
/** Recipe level: "lite" or "base" */
|
|
13
|
+
level?: "lite" | "base";
|
|
14
|
+
/** Stack declarations as a map: Record<stackName, versionNumber> */
|
|
15
|
+
stacks?: Record<string, number>;
|
|
16
|
+
/** Attribution stack payload */
|
|
17
|
+
attribution?: AttributionModule;
|
|
18
|
+
/** Taxonomy stack payload */
|
|
19
|
+
taxonomy?: TaxonomyModule;
|
|
20
|
+
/** Media stack payload */
|
|
21
|
+
media?: MediaModule;
|
|
22
|
+
/** Times stack payload */
|
|
23
|
+
times?: TimesModule;
|
|
8
24
|
/** Unique identifier (slug or UUID) */
|
|
9
25
|
id?: string;
|
|
10
26
|
/** Optional display title */
|
|
@@ -190,15 +206,27 @@ interface Alternative {
|
|
|
190
206
|
dietary?: string[];
|
|
191
207
|
}
|
|
192
208
|
/**
 * Nutrition values attached to a recipe.
 *
 * Both fields are optional; an empty object is a valid value of this type.
 */
interface NutritionFacts {
    /** Calorie count as a plain number (unit and serving basis are not stated by this declaration). */
    calories?: number;
    /** Protein amount as a plain number — presumably grams, going by the `_g` suffix. */
    protein_g?: number;
}
|
|
212
|
+
/**
 * Payload of a recipe's `attribution` stack.
 *
 * Every field is optional and typed as a plain string; this declaration
 * does not constrain URL or date formats.
 */
interface AttributionModule {
    /** Address associated with the attribution — presumably the source page. */
    url?: string;
    /** Author as plain text. */
    author?: string;
    /** Publication date as a string (format not enforced here). */
    datePublished?: string;
}
|
|
217
|
+
/**
 * Payload of a recipe's `taxonomy` stack: free-form classification data.
 *
 * All fields are optional.
 */
interface TaxonomyModule {
    /** Keyword strings attached to the recipe. */
    keywords?: string[];
    /** A single category label. */
    category?: string;
    /** A single cuisine label. */
    cuisine?: string;
}
|
|
222
|
+
/**
 * Payload of a recipe's `media` stack.
 *
 * Both lists are optional and hold plain strings — presumably URLs; the
 * declaration itself does not say.
 */
interface MediaModule {
    /** Image references. */
    images?: string[];
    /** Video references. */
    videos?: string[];
}
|
|
226
|
+
/**
 * Payload of a recipe's `times` stack.
 *
 * All fields are optional numbers; going by the `Minutes` suffix they are
 * presumably durations expressed in minutes.
 */
interface TimesModule {
    /** Preparation time. */
    prepMinutes?: number;
    /** Cooking time. */
    cookMinutes?: number;
    /** Total time. */
    totalMinutes?: number;
}
|
|
203
231
|
|
|
204
232
|
interface HowToStep {
|
|
@@ -1,10 +1,26 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Soustack Recipe Schema v0.
|
|
2
|
+
* Soustack Recipe Schema v0.3.0
|
|
3
3
|
* A portable, scalable, interoperable recipe format.
|
|
4
4
|
*/
|
|
5
5
|
interface SoustackRecipe {
|
|
6
|
+
/** Document marker for Soustack recipes */
|
|
7
|
+
'@type'?: 'Recipe';
|
|
6
8
|
/** Optional $schema pointer for profile-aware validation */
|
|
7
9
|
$schema?: string;
|
|
10
|
+
/** Optional declared validation profile */
|
|
11
|
+
profile?: string;
|
|
12
|
+
/** Recipe level: "lite" or "base" */
|
|
13
|
+
level?: "lite" | "base";
|
|
14
|
+
/** Stack declarations as a map: Record<stackName, versionNumber> */
|
|
15
|
+
stacks?: Record<string, number>;
|
|
16
|
+
/** Attribution stack payload */
|
|
17
|
+
attribution?: AttributionModule;
|
|
18
|
+
/** Taxonomy stack payload */
|
|
19
|
+
taxonomy?: TaxonomyModule;
|
|
20
|
+
/** Media stack payload */
|
|
21
|
+
media?: MediaModule;
|
|
22
|
+
/** Times stack payload */
|
|
23
|
+
times?: TimesModule;
|
|
8
24
|
/** Unique identifier (slug or UUID) */
|
|
9
25
|
id?: string;
|
|
10
26
|
/** Optional display title */
|
|
@@ -190,15 +206,27 @@ interface Alternative {
|
|
|
190
206
|
dietary?: string[];
|
|
191
207
|
}
|
|
192
208
|
/**
 * Nutrition values attached to a recipe.
 *
 * Both fields are optional; an empty object is a valid value of this type.
 */
interface NutritionFacts {
    /** Calorie count as a plain number (unit and serving basis are not stated by this declaration). */
    calories?: number;
    /** Protein amount as a plain number — presumably grams, going by the `_g` suffix. */
    protein_g?: number;
}
|
|
212
|
+
/**
 * Payload of a recipe's `attribution` stack.
 *
 * Every field is optional and typed as a plain string; this declaration
 * does not constrain URL or date formats.
 */
interface AttributionModule {
    /** Address associated with the attribution — presumably the source page. */
    url?: string;
    /** Author as plain text. */
    author?: string;
    /** Publication date as a string (format not enforced here). */
    datePublished?: string;
}
|
|
217
|
+
/**
 * Payload of a recipe's `taxonomy` stack: free-form classification data.
 *
 * All fields are optional.
 */
interface TaxonomyModule {
    /** Keyword strings attached to the recipe. */
    keywords?: string[];
    /** A single category label. */
    category?: string;
    /** A single cuisine label. */
    cuisine?: string;
}
|
|
222
|
+
/**
 * Payload of a recipe's `media` stack.
 *
 * Both lists are optional and hold plain strings — presumably URLs; the
 * declaration itself does not say.
 */
interface MediaModule {
    /** Image references. */
    images?: string[];
    /** Video references. */
    videos?: string[];
}
|
|
226
|
+
/**
 * Payload of a recipe's `times` stack.
 *
 * All fields are optional numbers; going by the `Minutes` suffix they are
 * presumably durations expressed in minutes.
 */
interface TimesModule {
    /** Preparation time. */
    prepMinutes?: number;
    /** Cooking time. */
    cookMinutes?: number;
    /** Total time. */
    totalMinutes?: number;
}
|
|
203
231
|
|
|
204
232
|
interface HowToStep {
|
|
@@ -130,6 +130,92 @@ function extractUrl(value) {
|
|
|
130
130
|
return trimmed || void 0;
|
|
131
131
|
}
|
|
132
132
|
|
|
133
|
+
// src/normalize.ts
|
|
134
|
+
/**
 * Build a normalized deep copy of a recipe document and collect any
 * warnings raised along the way.
 *
 * The copy is made with a JSON round-trip, so the caller's object is never
 * mutated and anything JSON cannot represent is dropped or converted.
 * After normalization the copy always has a `stacks` object, and a string
 * `version` is copied to `recipeVersion` when the latter is not set
 * (`version` itself is left in place).
 *
 * @param input Recipe document. Must be a non-array object whose JSON form
 *   is itself a non-array object.
 * @returns `{ recipe, warnings }` — the normalized copy and a list of
 *   human-readable warning strings.
 * @throws {Error} When `input` is not a plain object in the sense above, or
 *   when the document still carries the legacy field.
 */
function normalizeRecipe(input) {
  const notAnObject = "Recipe input must be an object";
  if (!input || typeof input !== "object" || Array.isArray(input)) {
    throw new Error(notAnObject);
  }
  const serialized = JSON.stringify(input);
  // A `toJSON` hook may serialize to `undefined`, which JSON.parse cannot read.
  if (typeof serialized !== "string") {
    throw new Error(notAnObject);
  }
  const recipe = JSON.parse(serialized);
  // A `toJSON` hook (e.g. on a Date) can also turn the clone into a primitive
  // or an array; the `in` check below would then fail with an unrelated error.
  if (!recipe || typeof recipe !== "object" || Array.isArray(recipe)) {
    throw new Error(notAnObject);
  }
  const warnings = [];
  // Evaluates to "modules"; kept assembled from two fragments as in the original.
  const legacyField = ["mod", "ules"].join("");
  if (legacyField in recipe) {
    throw new Error("The legacy field is no longer supported. Use `stacks` instead.");
  }
  normalizeStacks(recipe, warnings);
  // Defensive: guarantee the field even if normalizeStacks left it unset.
  if (!recipe.stacks) {
    recipe.stacks = {};
  }
  if (typeof recipe.version === "string" && !recipe.recipeVersion) {
    recipe.recipeVersion = recipe.version;
    warnings.push("'version' is deprecated; mapped to 'recipeVersion'.");
  }
  normalizeTime(recipe);
  return { recipe, warnings };
}
|
|
158
|
+
/**
 * Replace `recipe.stacks` with a clean `{ name: version }` map.
 *
 * Two input shapes are understood:
 *  - an object map, where only positive-integer versions are kept;
 *  - an array of `'name@version'` identifiers, where the highest version
 *    seen for each name wins.
 * Anything that cannot be used is reported through `warnings` instead of
 * being dropped silently. A missing `stacks` field yields an empty map.
 *
 * @param recipe Recipe object; its `stacks` property is overwritten in place.
 * @param warnings Array that receives one message per rejected entry.
 */
function normalizeStacks(recipe, warnings) {
  const declared = recipe.stacks;
  const stacks = {};
  // Template literals render objects as "[object Object]"; show their JSON instead.
  const describe = (value) => typeof value === "object" && value !== null ? JSON.stringify(value) : String(value);
  if (Array.isArray(declared)) {
    for (const identifier of declared) {
      if (typeof identifier !== "string") {
        warnings.push(`Invalid stack identifier ${describe(identifier)}: expected a string in the format 'name@version' (e.g., 'scaling@1')`);
        continue;
      }
      const parsed = parseStackIdentifier(identifier);
      if (!parsed) {
        warnings.push(`Invalid stack identifier '${identifier}': expected format 'name@version' (e.g., 'scaling@1')`);
        continue;
      }
      const { name, version } = parsed;
      // Look at own properties only, so a name such as "constructor" is not
      // mistaken for an already-declared stack via the prototype chain.
      const current = Object.prototype.hasOwnProperty.call(stacks, name) ? stacks[name] : 0;
      if (current < version) {
        stacks[name] = version;
      }
    }
  } else if (declared && typeof declared === "object") {
    for (const [key, value] of Object.entries(declared)) {
      if (typeof value === "number" && Number.isInteger(value) && value >= 1) {
        stacks[key] = value;
      } else {
        warnings.push(`Invalid stack version for '${key}': expected positive integer, got ${describe(value)}`);
      }
    }
  } else if (declared !== void 0 && declared !== null) {
    warnings.push(`Invalid 'stacks' value: expected an object map or an array of 'name@version' identifiers, got ${describe(declared)}`);
  }
  recipe.stacks = stacks;
}
|
|
185
|
+
/**
 * Parse a `'name@version'` stack identifier.
 *
 * Surrounding whitespace is ignored. The name may contain ASCII letters,
 * digits, `_` and `-`; the version must be a decimal integer of at least 1
 * that fits in the safe-integer range.
 *
 * @param identifier Candidate identifier; non-strings are rejected.
 * @returns `{ name, version }` on success, otherwise `null`.
 */
function parseStackIdentifier(identifier) {
  if (typeof identifier !== "string") {
    return null;
  }
  const match = /^([a-z0-9_-]+)@(\d+)$/i.exec(identifier.trim());
  if (!match) {
    return null;
  }
  const [, name, digits] = match;
  const version = Number(digits);
  // `digits` is all decimal digits, so NaN is impossible; what can still go
  // wrong is zero or a value too large to be represented exactly.
  if (!Number.isSafeInteger(version) || version < 1) {
    return null;
  }
  return { name, version };
}
|
|
200
|
+
/**
 * Convert the structured fields of `recipe.time` to parsed values, in place.
 *
 * Only `prep`, `active`, `passive` and `total` are considered. A field that
 * is already a number is left alone; any other value is handed to
 * `parseDuration` (defined elsewhere in this bundle) and overwritten only
 * when that call returns something other than `null`.
 *
 * Does nothing when `recipe.time` is missing, not an object, or an array.
 *
 * @param recipe Recipe object whose `time` property may be mutated.
 */
function normalizeTime(recipe) {
  const time = recipe?.time;
  if (!time || typeof time !== "object" || Array.isArray(time)) {
    return;
  }
  for (const key of ["prep", "active", "passive", "total"]) {
    const current = time[key];
    if (typeof current === "number") {
      continue;
    }
    const parsed = parseDuration(current);
    // `null` is treated as "could not parse": keep the original value.
    if (parsed !== null) {
      time[key] = parsed;
    }
  }
}
|
|
218
|
+
|
|
133
219
|
// src/fromSchemaOrg.ts
|
|
134
220
|
function fromSchemaOrg(input) {
|
|
135
221
|
const recipeNode = extractRecipeNode(input);
|
|
@@ -143,8 +229,22 @@ function fromSchemaOrg(input) {
|
|
|
143
229
|
const tags = collectTags(recipeNode.recipeCuisine, recipeNode.keywords);
|
|
144
230
|
const category = extractFirst(recipeNode.recipeCategory);
|
|
145
231
|
const source = convertSource(recipeNode);
|
|
146
|
-
const
|
|
147
|
-
|
|
232
|
+
const dateModified = recipeNode.dateModified || void 0;
|
|
233
|
+
const nutrition = convertNutrition(recipeNode.nutrition);
|
|
234
|
+
const attribution = convertAttribution(recipeNode);
|
|
235
|
+
const taxonomy = convertTaxonomy(tags, category, extractFirst(recipeNode.recipeCuisine));
|
|
236
|
+
const media = convertMedia(recipeNode.image, recipeNode.video);
|
|
237
|
+
const times = convertTimes(time);
|
|
238
|
+
const stacks = {};
|
|
239
|
+
if (attribution) stacks.attribution = 1;
|
|
240
|
+
if (taxonomy) stacks.taxonomy = 1;
|
|
241
|
+
if (media) stacks.media = 1;
|
|
242
|
+
if (nutrition) stacks.nutrition = 1;
|
|
243
|
+
if (times) stacks.times = 1;
|
|
244
|
+
const rawRecipe = {
|
|
245
|
+
"@type": "Recipe",
|
|
246
|
+
profile: "minimal",
|
|
247
|
+
stacks,
|
|
148
248
|
name: recipeNode.name.trim(),
|
|
149
249
|
description: recipeNode.description?.trim() || void 0,
|
|
150
250
|
image: normalizeImage(recipeNode.image),
|
|
@@ -152,13 +252,19 @@ function fromSchemaOrg(input) {
|
|
|
152
252
|
tags: tags.length ? tags : void 0,
|
|
153
253
|
source,
|
|
154
254
|
dateAdded: recipeNode.datePublished || void 0,
|
|
155
|
-
dateModified: recipeNode.dateModified || void 0,
|
|
156
255
|
yield: recipeYield,
|
|
157
256
|
time,
|
|
158
257
|
ingredients,
|
|
159
258
|
instructions,
|
|
160
|
-
|
|
259
|
+
...dateModified ? { dateModified } : {},
|
|
260
|
+
...nutrition ? { nutrition } : {},
|
|
261
|
+
...attribution ? { attribution } : {},
|
|
262
|
+
...taxonomy ? { taxonomy } : {},
|
|
263
|
+
...media ? { media } : {},
|
|
264
|
+
...times ? { times } : {}
|
|
161
265
|
};
|
|
266
|
+
const { recipe } = normalizeRecipe(rawRecipe);
|
|
267
|
+
return recipe;
|
|
162
268
|
}
|
|
163
269
|
function extractRecipeNode(input) {
|
|
164
270
|
if (!input) return null;
|
|
@@ -370,6 +476,90 @@ function extractEntityName(value) {
|
|
|
370
476
|
}
|
|
371
477
|
return void 0;
|
|
372
478
|
}
|
|
479
|
+
/**
 * Build the attribution payload from a schema.org-style recipe node.
 *
 * - `url` comes from `recipe.url`, falling back to `recipe.mainEntityOfPage`.
 *   The fallback may be a string or an object carrying the address in
 *   `@id` or `url`.
 * - `author` is whatever `extractEntityName(recipe.author)` returns.
 * - `datePublished` is the trimmed `recipe.datePublished` string.
 *
 * Non-string values are ignored rather than trimmed, so unexpected shapes
 * no longer raise a `TypeError`.
 *
 * @param recipe Recipe node to read from.
 * @returns Object holding the non-empty fields, or `undefined` when there are none.
 */
function convertAttribution(recipe) {
  const text = (value) => typeof value === "string" ? value.trim() : "";
  const page = recipe.mainEntityOfPage;
  const pageUrl = page && typeof page === "object" ? text(page["@id"]) || text(page.url) : text(page);
  const url = text(recipe.url) || pageUrl;
  const author = extractEntityName(recipe.author);
  const datePublished = text(recipe.datePublished);
  const entries = [];
  if (url) entries.push(["url", url]);
  if (author) entries.push(["author", author]);
  if (datePublished) entries.push(["datePublished", datePublished]);
  return entries.length ? Object.fromEntries(entries) : void 0;
}
|
|
489
|
+
/**
 * Build the taxonomy payload from already-extracted values.
 *
 * @param keywords Keyword list; ignored unless it is a non-empty array. The
 *   array is copied so the result does not alias the caller's list.
 * @param category Category label; ignored when falsy.
 * @param cuisine Cuisine label; ignored when falsy.
 * @returns Object holding the supplied fields, or `undefined` when all
 *   three are empty.
 */
function convertTaxonomy(keywords, category, cuisine) {
  const entries = [];
  if (Array.isArray(keywords) && keywords.length) entries.push(["keywords", [...keywords]]);
  if (category) entries.push(["category", category]);
  if (cuisine) entries.push(["cuisine", cuisine]);
  return entries.length ? Object.fromEntries(entries) : void 0;
}
|
|
496
|
+
/**
 * Normalize a media value into a list of non-empty URL strings.
 *
 * Accepts a single string, a single object, or an array mixing both.
 * Strings are trimmed; every other item is passed to `extractMediaUrl`.
 * Items that produce no URL are dropped.
 *
 * @param value Raw media value; falsy input yields an empty list.
 * @returns Array of URL strings (possibly empty).
 */
function normalizeMediaList(value) {
  if (!value) return [];
  const items = Array.isArray(value) ? value : [value];
  const urls = [];
  for (const item of items) {
    const url = typeof item === "string" ? item.trim() : extractMediaUrl(item);
    if (url) urls.push(url);
  }
  return urls;
}
|
|
505
|
+
/**
 * Pull a URL string out of a media object.
 *
 * Looks at `url` first, then at `contentUrl` and `embedUrl` (the
 * properties schema.org media objects use for the file and the player),
 * and returns the first one that is a non-blank string.
 *
 * @param value Candidate media object; anything that is not an object
 *   yields `undefined`.
 * @returns The trimmed URL, or `undefined` when none is present.
 */
function extractMediaUrl(value) {
  if (!value || typeof value !== "object") {
    return void 0;
  }
  for (const key of ["url", "contentUrl", "embedUrl"]) {
    const candidate = value[key];
    if (typeof candidate === "string") {
      const trimmed = candidate.trim();
      if (trimmed) return trimmed;
    }
  }
  return void 0;
}
|
|
512
|
+
/**
 * Build the media payload from raw image and video values.
 *
 * Images go through `normalizeImage` (defined elsewhere in this bundle);
 * its result is used as-is when it is an array and wrapped in a
 * one-element array otherwise. Videos go through `normalizeMediaList`.
 *
 * @param image Raw image value from the recipe node.
 * @param video Raw video value from the recipe node.
 * @returns Object with non-empty `images` and/or `videos`, or `undefined`
 *   when both lists are empty.
 */
function convertMedia(image, video) {
  const normalizedImage = normalizeImage(image);
  let images = [];
  if (normalizedImage) {
    images = Array.isArray(normalizedImage) ? normalizedImage : [normalizedImage];
  }
  const videos = normalizeMediaList(video);
  const entries = [];
  if (images.length) entries.push(["images", images]);
  if (videos.length) entries.push(["videos", videos]);
  return entries.length ? Object.fromEntries(entries) : void 0;
}
|
|
521
|
+
/**
 * Build the times payload from a recipe's `time` object.
 *
 * Maps `prep` → `prepMinutes`, `active` → `cookMinutes` and
 * `total` → `totalMinutes`. Only finite numbers are copied: `NaN` and
 * `Infinity` would otherwise turn into `null` once the recipe is
 * JSON-cloned. Other fields of `time` (such as `passive`) are not read.
 *
 * @param time Source time object; may be missing.
 * @returns Object with the numeric fields found, or `undefined` when there
 *   are none.
 */
function convertTimes(time) {
  if (!time || typeof time !== "object") {
    return void 0;
  }
  const mapping = [
    ["prep", "prepMinutes"],
    ["active", "cookMinutes"],
    ["total", "totalMinutes"]
  ];
  const entries = [];
  for (const [source, target] of mapping) {
    const amount = time[source];
    if (typeof amount === "number" && Number.isFinite(amount)) {
      entries.push([target, amount]);
    }
  }
  return entries.length ? Object.fromEntries(entries) : void 0;
}
|
|
529
|
+
/**
 * Build a nutrition payload (`calories`, `protein_g`) from a raw
 * nutrition object.
 *
 * Each value may be a finite number or a string such as `"240 calories"`
 * or `"1,200 kcal"`. For strings, the first number in the text is used, so
 * `"270 kcal (1130 kJ)"` reads as 270 instead of gluing the digits together.
 * Protein is read from `proteinContent`, falling back to `protein_g` when
 * the former yields no number; a legitimate `0` is kept.
 *
 * @param nutrition Raw nutrition object; anything else yields `undefined`.
 * @returns Object with the fields that could be read, or `undefined` when
 *   neither could.
 */
function convertNutrition(nutrition) {
  if (!nutrition || typeof nutrition !== "object") {
    return void 0;
  }
  // Returns a finite number, or undefined when `raw` holds none.
  const toNumber = (raw) => {
    if (typeof raw === "number") {
      return Number.isFinite(raw) ? raw : void 0;
    }
    if (typeof raw !== "string") {
      return void 0;
    }
    // First numeric token: thousands-grouped ("1,200"), plain ("12.5") or bare fraction (".5").
    const token = raw.match(/-?(?:\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?|\.\d+)/);
    if (!token) {
      return void 0;
    }
    const parsed = Number(token[0].replace(/,/g, ""));
    return Number.isFinite(parsed) ? parsed : void 0;
  };
  const calories = toNumber(nutrition.calories);
  const protein = toNumber(nutrition.proteinContent) ?? toNumber(nutrition.protein_g);
  const entries = [];
  if (calories !== void 0) entries.push(["calories", calories]);
  if (protein !== void 0) entries.push(["protein_g", protein]);
  return entries.length ? Object.fromEntries(entries) : void 0;
}
|
|
373
563
|
|
|
374
564
|
// src/scraper/fetch.ts
|
|
375
565
|
var DEFAULT_USER_AGENTS = [
|
|
@@ -435,13 +625,16 @@ async function fetchPage(url, options = {}) {
|
|
|
435
625
|
const response = await resolvedFetch(url, requestInit);
|
|
436
626
|
clearTimeout(timeoutId);
|
|
437
627
|
if (response && typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
globalFetch
|
|
442
|
-
|
|
628
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
629
|
+
if (ingestUrl) {
|
|
630
|
+
try {
|
|
631
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
632
|
+
if (globalFetch) {
|
|
633
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:63", message: "fetch response", data: { url, status: response.status, statusText: response.statusText, ok: response.ok, isNYTimes: url.includes("nytimes.com") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
|
|
634
|
+
});
|
|
635
|
+
}
|
|
636
|
+
} catch {
|
|
443
637
|
}
|
|
444
|
-
} catch {
|
|
445
638
|
}
|
|
446
639
|
}
|
|
447
640
|
if (!response.ok) {
|
|
@@ -453,13 +646,16 @@ async function fetchPage(url, options = {}) {
|
|
|
453
646
|
}
|
|
454
647
|
const html = await response.text();
|
|
455
648
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
globalFetch
|
|
460
|
-
|
|
649
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
650
|
+
if (ingestUrl) {
|
|
651
|
+
try {
|
|
652
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
653
|
+
if (globalFetch) {
|
|
654
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:75", message: "HTML received", data: { htmlLength: html.length, hasLoginPage: html.toLowerCase().includes("login") || html.toLowerCase().includes("sign in"), hasRecipeData: html.includes("application/ld+json") || html.includes("schema.org/Recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B,D" }) }).catch(() => {
|
|
655
|
+
});
|
|
656
|
+
}
|
|
657
|
+
} catch {
|
|
461
658
|
}
|
|
462
|
-
} catch {
|
|
463
659
|
}
|
|
464
660
|
}
|
|
465
661
|
return html;
|
|
@@ -489,8 +685,6 @@ function isRecipeNode(value) {
|
|
|
489
685
|
return false;
|
|
490
686
|
}
|
|
491
687
|
const type = value["@type"];
|
|
492
|
-
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/utils.ts:14", message: "isRecipeNode check", data: { type, typeLower: typeof type === "string" ? type.toLowerCase() : Array.isArray(type) ? type.map((t) => typeof t === "string" ? t.toLowerCase() : t) : void 0, isMatch: typeof type === "string" ? RECIPE_TYPES.has(type.toLowerCase()) : Array.isArray(type) ? type.some((e) => typeof e === "string" && RECIPE_TYPES.has(e.toLowerCase())) : false }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
493
|
-
});
|
|
494
688
|
if (typeof type === "string") {
|
|
495
689
|
return RECIPE_TYPES.has(type.toLowerCase());
|
|
496
690
|
}
|
|
@@ -518,20 +712,14 @@ function normalizeText(value) {
|
|
|
518
712
|
/**
 * Find the first recipe candidate embedded as JSON-LD in an HTML page.
 *
 * Every `<script type="application/ld+json">` element is read, parsed with
 * `safeJsonParse`, and handed to `collectCandidates` together with a shared
 * candidate list (both helpers are defined elsewhere in this bundle).
 * Scripts that are empty or fail to parse are skipped.
 *
 * @param html Page markup to scan.
 * @returns The first collected candidate, or `null` when there is none.
 */
function extractJsonLd(html) {
  const $ = cheerio.load(html);
  const candidates = [];
  $('script[type="application/ld+json"]').each((_, element) => {
    const content = $(element).html();
    if (!content) return;
    const parsed = safeJsonParse(content);
    if (!parsed) return;
    collectCandidates(parsed, candidates);
  });
  return candidates[0] ?? null;
}
|
|
537
725
|
function collectCandidates(payload, bucket) {
|
|
@@ -713,13 +901,16 @@ function extractRecipe(html) {
|
|
|
713
901
|
}
|
|
714
902
|
const jsonLdRecipe = extractJsonLd(html);
|
|
715
903
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
globalFetch
|
|
720
|
-
|
|
904
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
905
|
+
if (ingestUrl) {
|
|
906
|
+
try {
|
|
907
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
908
|
+
if (globalFetch) {
|
|
909
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:6", message: "JSON-LD extraction result", data: { hasJsonLd: !!jsonLdRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
|
|
910
|
+
});
|
|
911
|
+
}
|
|
912
|
+
} catch {
|
|
721
913
|
}
|
|
722
|
-
} catch {
|
|
723
914
|
}
|
|
724
915
|
}
|
|
725
916
|
if (jsonLdRecipe) {
|
|
@@ -727,13 +918,16 @@ function extractRecipe(html) {
|
|
|
727
918
|
}
|
|
728
919
|
const microdataRecipe = extractMicrodata(html);
|
|
729
920
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
globalFetch
|
|
734
|
-
|
|
921
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
922
|
+
if (ingestUrl) {
|
|
923
|
+
try {
|
|
924
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
925
|
+
if (globalFetch) {
|
|
926
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:12", message: "Microdata extraction result", data: { hasMicrodata: !!microdataRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "D" }) }).catch(() => {
|
|
927
|
+
});
|
|
928
|
+
}
|
|
929
|
+
} catch {
|
|
735
930
|
}
|
|
736
|
-
} catch {
|
|
737
931
|
}
|
|
738
932
|
}
|
|
739
933
|
if (microdataRecipe) {
|
|
@@ -745,35 +939,44 @@ function extractRecipe(html) {
|
|
|
745
939
|
// src/scraper/index.ts
|
|
746
940
|
async function scrapeRecipe(url, options = {}) {
|
|
747
941
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
globalFetch
|
|
752
|
-
|
|
942
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
943
|
+
if (ingestUrl) {
|
|
944
|
+
try {
|
|
945
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
946
|
+
if (globalFetch) {
|
|
947
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:7", message: "scrapeRecipe entry", data: { url, hasOptions: !!options }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,B,C,D,E" }) }).catch(() => {
|
|
948
|
+
});
|
|
949
|
+
}
|
|
950
|
+
} catch {
|
|
753
951
|
}
|
|
754
|
-
} catch {
|
|
755
952
|
}
|
|
756
953
|
}
|
|
757
954
|
const html = await fetchPage(url, options);
|
|
758
955
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
globalFetch
|
|
763
|
-
|
|
956
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
957
|
+
if (ingestUrl) {
|
|
958
|
+
try {
|
|
959
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
960
|
+
if (globalFetch) {
|
|
961
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:9", message: "HTML fetched", data: { htmlLength: html?.length, htmlPreview: html?.substring(0, 200) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
|
|
962
|
+
});
|
|
963
|
+
}
|
|
964
|
+
} catch {
|
|
764
965
|
}
|
|
765
|
-
} catch {
|
|
766
966
|
}
|
|
767
967
|
}
|
|
768
968
|
const { recipe } = extractRecipe(html);
|
|
769
969
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
globalFetch
|
|
774
|
-
|
|
970
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
971
|
+
if (ingestUrl) {
|
|
972
|
+
try {
|
|
973
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
974
|
+
if (globalFetch) {
|
|
975
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:11", message: "extractRecipe result", data: { hasRecipe: !!recipe, recipeType: recipe?.["@type"], recipeName: recipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C,D" }) }).catch(() => {
|
|
976
|
+
});
|
|
977
|
+
}
|
|
978
|
+
} catch {
|
|
775
979
|
}
|
|
776
|
-
} catch {
|
|
777
980
|
}
|
|
778
981
|
}
|
|
779
982
|
if (!recipe) {
|
|
@@ -781,13 +984,16 @@ async function scrapeRecipe(url, options = {}) {
|
|
|
781
984
|
}
|
|
782
985
|
const soustackRecipe = fromSchemaOrg(recipe);
|
|
783
986
|
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
globalFetch
|
|
788
|
-
|
|
987
|
+
const ingestUrl = process.env.SOUSTACK_DEBUG_INGEST_URL;
|
|
988
|
+
if (ingestUrl) {
|
|
989
|
+
try {
|
|
990
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
991
|
+
if (globalFetch) {
|
|
992
|
+
globalFetch(ingestUrl, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:17", message: "fromSchemaOrg result", data: { hasSoustackRecipe: !!soustackRecipe, soustackRecipeName: soustackRecipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
993
|
+
});
|
|
994
|
+
}
|
|
995
|
+
} catch {
|
|
789
996
|
}
|
|
790
|
-
} catch {
|
|
791
997
|
}
|
|
792
998
|
}
|
|
793
999
|
if (!soustackRecipe) {
|
|
@@ -815,5 +1021,5 @@ exports.extractRecipeFromHTML = extractRecipeFromHTML;
|
|
|
815
1021
|
exports.extractSchemaOrgRecipeFromHTML = extractSchemaOrgRecipeFromHTML;
|
|
816
1022
|
exports.fetchPage = fetchPage;
|
|
817
1023
|
exports.scrapeRecipe = scrapeRecipe;
|
|
818
|
-
//# sourceMappingURL=
|
|
819
|
-
//# sourceMappingURL=
|
|
1024
|
+
//# sourceMappingURL=index.js.map
|
|
1025
|
+
//# sourceMappingURL=index.js.map
|