soustack 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +86 -10
- package/dist/cli/index.js +159 -53
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.mts +75 -11
- package/dist/index.d.ts +75 -11
- package/dist/index.js +165 -53
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +164 -54
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -2
- package/src/schema.json +22 -4
package/dist/index.js
CHANGED
|
@@ -140,8 +140,8 @@ function flattenInstructions(items) {
|
|
|
140
140
|
// src/schema.json
|
|
141
141
|
var schema_default = {
|
|
142
142
|
$schema: "http://json-schema.org/draft-07/schema#",
|
|
143
|
-
$id: "http://soustack.org/schema/v0.
|
|
144
|
-
title: "Soustack Recipe Schema v0.
|
|
143
|
+
$id: "http://soustack.org/schema/v0.2",
|
|
144
|
+
title: "Soustack Recipe Schema v0.2",
|
|
145
145
|
description: "A portable, scalable, interoperable recipe format.",
|
|
146
146
|
type: "object",
|
|
147
147
|
required: ["name", "ingredients", "instructions"],
|
|
@@ -171,8 +171,21 @@ var schema_default = {
|
|
|
171
171
|
items: { type: "string" }
|
|
172
172
|
},
|
|
173
173
|
image: {
|
|
174
|
-
|
|
175
|
-
|
|
174
|
+
description: "Recipe-level hero image(s)",
|
|
175
|
+
anyOf: [
|
|
176
|
+
{
|
|
177
|
+
type: "string",
|
|
178
|
+
format: "uri"
|
|
179
|
+
},
|
|
180
|
+
{
|
|
181
|
+
type: "array",
|
|
182
|
+
minItems: 1,
|
|
183
|
+
items: {
|
|
184
|
+
type: "string",
|
|
185
|
+
format: "uri"
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
]
|
|
176
189
|
},
|
|
177
190
|
dateAdded: {
|
|
178
191
|
type: "string",
|
|
@@ -337,6 +350,11 @@ var schema_default = {
|
|
|
337
350
|
properties: {
|
|
338
351
|
id: { type: "string" },
|
|
339
352
|
text: { type: "string" },
|
|
353
|
+
image: {
|
|
354
|
+
type: "string",
|
|
355
|
+
format: "uri",
|
|
356
|
+
description: "Optional image that illustrates this instruction"
|
|
357
|
+
},
|
|
340
358
|
destination: { type: "string" },
|
|
341
359
|
dependsOn: {
|
|
342
360
|
type: "array",
|
|
@@ -1234,6 +1252,40 @@ function smartParseDuration(input) {
|
|
|
1234
1252
|
return parseHumanDuration(input);
|
|
1235
1253
|
}
|
|
1236
1254
|
|
|
1255
|
+
// src/utils/image.ts
|
|
1256
|
+
function normalizeImage(image) {
|
|
1257
|
+
if (!image) {
|
|
1258
|
+
return void 0;
|
|
1259
|
+
}
|
|
1260
|
+
if (typeof image === "string") {
|
|
1261
|
+
const trimmed = image.trim();
|
|
1262
|
+
return trimmed || void 0;
|
|
1263
|
+
}
|
|
1264
|
+
if (Array.isArray(image)) {
|
|
1265
|
+
const urls = image.map((entry) => typeof entry === "string" ? entry.trim() : extractUrl(entry)).filter((url) => typeof url === "string" && Boolean(url));
|
|
1266
|
+
if (urls.length === 0) {
|
|
1267
|
+
return void 0;
|
|
1268
|
+
}
|
|
1269
|
+
if (urls.length === 1) {
|
|
1270
|
+
return urls[0];
|
|
1271
|
+
}
|
|
1272
|
+
return urls;
|
|
1273
|
+
}
|
|
1274
|
+
return extractUrl(image);
|
|
1275
|
+
}
|
|
1276
|
+
function extractUrl(value) {
|
|
1277
|
+
if (!value || typeof value !== "object") {
|
|
1278
|
+
return void 0;
|
|
1279
|
+
}
|
|
1280
|
+
const record = value;
|
|
1281
|
+
const candidate = typeof record.url === "string" ? record.url : typeof record.contentUrl === "string" ? record.contentUrl : void 0;
|
|
1282
|
+
if (!candidate) {
|
|
1283
|
+
return void 0;
|
|
1284
|
+
}
|
|
1285
|
+
const trimmed = candidate.trim();
|
|
1286
|
+
return trimmed || void 0;
|
|
1287
|
+
}
|
|
1288
|
+
|
|
1237
1289
|
// src/fromSchemaOrg.ts
|
|
1238
1290
|
function fromSchemaOrg(input) {
|
|
1239
1291
|
const recipeNode = extractRecipeNode(input);
|
|
@@ -1246,13 +1298,12 @@ function fromSchemaOrg(input) {
|
|
|
1246
1298
|
const recipeYield = parseYield(recipeNode.recipeYield);
|
|
1247
1299
|
const tags = collectTags(recipeNode.recipeCuisine, recipeNode.keywords);
|
|
1248
1300
|
const category = extractFirst(recipeNode.recipeCategory);
|
|
1249
|
-
const image = convertImage(recipeNode.image);
|
|
1250
1301
|
const source = convertSource(recipeNode);
|
|
1251
1302
|
const nutrition = recipeNode.nutrition && typeof recipeNode.nutrition === "object" ? recipeNode.nutrition : void 0;
|
|
1252
1303
|
return {
|
|
1253
1304
|
name: recipeNode.name.trim(),
|
|
1254
1305
|
description: recipeNode.description?.trim() || void 0,
|
|
1255
|
-
image,
|
|
1306
|
+
image: normalizeImage(recipeNode.image),
|
|
1256
1307
|
category,
|
|
1257
1308
|
tags: tags.length ? tags : void 0,
|
|
1258
1309
|
source,
|
|
@@ -1335,9 +1386,9 @@ function convertInstructions(value) {
|
|
|
1335
1386
|
continue;
|
|
1336
1387
|
}
|
|
1337
1388
|
if (isHowToStep(entry)) {
|
|
1338
|
-
const
|
|
1339
|
-
if (
|
|
1340
|
-
result.push(
|
|
1389
|
+
const parsed = convertHowToStep(entry);
|
|
1390
|
+
if (parsed) {
|
|
1391
|
+
result.push(parsed);
|
|
1341
1392
|
}
|
|
1342
1393
|
}
|
|
1343
1394
|
}
|
|
@@ -1355,9 +1406,9 @@ function extractSectionItems(items = []) {
|
|
|
1355
1406
|
continue;
|
|
1356
1407
|
}
|
|
1357
1408
|
if (isHowToStep(item)) {
|
|
1358
|
-
const
|
|
1359
|
-
if (
|
|
1360
|
-
result.push(
|
|
1409
|
+
const parsed = convertHowToStep(item);
|
|
1410
|
+
if (parsed) {
|
|
1411
|
+
result.push(parsed);
|
|
1361
1412
|
}
|
|
1362
1413
|
continue;
|
|
1363
1414
|
}
|
|
@@ -1371,6 +1422,17 @@ function extractInstructionText(value) {
|
|
|
1371
1422
|
const text = typeof value.text === "string" ? value.text : value.name;
|
|
1372
1423
|
return typeof text === "string" ? text.trim() || void 0 : void 0;
|
|
1373
1424
|
}
|
|
1425
|
+
function convertHowToStep(step) {
|
|
1426
|
+
const text = extractInstructionText(step);
|
|
1427
|
+
if (!text) {
|
|
1428
|
+
return void 0;
|
|
1429
|
+
}
|
|
1430
|
+
const normalizedImage = normalizeImage(step.image);
|
|
1431
|
+
if (typeof normalizedImage === "string") {
|
|
1432
|
+
return { text, image: normalizedImage };
|
|
1433
|
+
}
|
|
1434
|
+
return text;
|
|
1435
|
+
}
|
|
1374
1436
|
function isHowToStep(value) {
|
|
1375
1437
|
return Boolean(value) && typeof value === "object" && value["@type"] === "HowToStep";
|
|
1376
1438
|
}
|
|
@@ -1412,26 +1474,6 @@ function extractFirst(value) {
|
|
|
1412
1474
|
const arr = flattenStrings(value);
|
|
1413
1475
|
return arr.length ? arr[0] : void 0;
|
|
1414
1476
|
}
|
|
1415
|
-
function convertImage(value) {
|
|
1416
|
-
if (!value) return void 0;
|
|
1417
|
-
if (typeof value === "string") {
|
|
1418
|
-
return value;
|
|
1419
|
-
}
|
|
1420
|
-
if (Array.isArray(value)) {
|
|
1421
|
-
for (const item of value) {
|
|
1422
|
-
const url = typeof item === "string" ? item : extractImageUrl(item);
|
|
1423
|
-
if (url) return url;
|
|
1424
|
-
}
|
|
1425
|
-
return void 0;
|
|
1426
|
-
}
|
|
1427
|
-
return extractImageUrl(value);
|
|
1428
|
-
}
|
|
1429
|
-
function extractImageUrl(value) {
|
|
1430
|
-
if (!value || typeof value !== "object") return void 0;
|
|
1431
|
-
const record = value;
|
|
1432
|
-
const candidate = typeof record.url === "string" ? record.url : typeof record.contentUrl === "string" ? record.contentUrl : void 0;
|
|
1433
|
-
return candidate?.trim() || void 0;
|
|
1434
|
-
}
|
|
1435
1477
|
function convertSource(recipe) {
|
|
1436
1478
|
const author = extractEntityName(recipe.author);
|
|
1437
1479
|
const publisher = extractEntityName(recipe.publisher);
|
|
@@ -1527,7 +1569,7 @@ function convertInstruction(entry) {
|
|
|
1527
1569
|
return createHowToStep(entry);
|
|
1528
1570
|
}
|
|
1529
1571
|
if ("subsection" in entry) {
|
|
1530
|
-
const steps = entry.items.map((item) =>
|
|
1572
|
+
const steps = entry.items.map((item) => createHowToStep(item)).filter((step) => Boolean(step));
|
|
1531
1573
|
if (!steps.length) {
|
|
1532
1574
|
return null;
|
|
1533
1575
|
}
|
|
@@ -1538,18 +1580,34 @@ function convertInstruction(entry) {
|
|
|
1538
1580
|
};
|
|
1539
1581
|
}
|
|
1540
1582
|
if ("text" in entry) {
|
|
1541
|
-
return createHowToStep(entry
|
|
1583
|
+
return createHowToStep(entry);
|
|
1542
1584
|
}
|
|
1543
1585
|
return createHowToStep(String(entry));
|
|
1544
1586
|
}
|
|
1545
|
-
function createHowToStep(
|
|
1546
|
-
if (!
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
|
|
1587
|
+
function createHowToStep(entry) {
|
|
1588
|
+
if (!entry) return null;
|
|
1589
|
+
if (typeof entry === "string") {
|
|
1590
|
+
const trimmed2 = entry.trim();
|
|
1591
|
+
if (!trimmed2) {
|
|
1592
|
+
return null;
|
|
1593
|
+
}
|
|
1594
|
+
return {
|
|
1595
|
+
"@type": "HowToStep",
|
|
1596
|
+
text: trimmed2
|
|
1597
|
+
};
|
|
1598
|
+
}
|
|
1599
|
+
const trimmed = entry.text?.trim();
|
|
1600
|
+
if (!trimmed) {
|
|
1601
|
+
return null;
|
|
1602
|
+
}
|
|
1603
|
+
const step = {
|
|
1550
1604
|
"@type": "HowToStep",
|
|
1551
1605
|
text: trimmed
|
|
1552
1606
|
};
|
|
1607
|
+
if (entry.image) {
|
|
1608
|
+
step.image = entry.image;
|
|
1609
|
+
}
|
|
1610
|
+
return step;
|
|
1553
1611
|
}
|
|
1554
1612
|
function convertTime2(time) {
|
|
1555
1613
|
if (!time) {
|
|
@@ -1712,7 +1770,7 @@ async function fetchPage(url, options = {}) {
|
|
|
1712
1770
|
};
|
|
1713
1771
|
const response = await resolvedFetch(url, requestInit);
|
|
1714
1772
|
clearTimeout(timeoutId);
|
|
1715
|
-
if (response &&
|
|
1773
|
+
if (response && typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
1716
1774
|
try {
|
|
1717
1775
|
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
1718
1776
|
if (globalFetch) {
|
|
@@ -1730,7 +1788,7 @@ async function fetchPage(url, options = {}) {
|
|
|
1730
1788
|
throw error;
|
|
1731
1789
|
}
|
|
1732
1790
|
const html = await response.text();
|
|
1733
|
-
if (typeof process
|
|
1791
|
+
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
1734
1792
|
try {
|
|
1735
1793
|
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
1736
1794
|
if (globalFetch) {
|
|
@@ -1990,14 +2048,30 @@ function extractRecipe(html) {
|
|
|
1990
2048
|
return extractRecipeBrowser(html);
|
|
1991
2049
|
}
|
|
1992
2050
|
const jsonLdRecipe = extractJsonLd(html);
|
|
1993
|
-
|
|
1994
|
-
|
|
2051
|
+
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
2052
|
+
try {
|
|
2053
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2054
|
+
if (globalFetch) {
|
|
2055
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:6", message: "JSON-LD extraction result", data: { hasJsonLd: !!jsonLdRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
|
|
2056
|
+
});
|
|
2057
|
+
}
|
|
2058
|
+
} catch {
|
|
2059
|
+
}
|
|
2060
|
+
}
|
|
1995
2061
|
if (jsonLdRecipe) {
|
|
1996
2062
|
return { recipe: jsonLdRecipe, source: "jsonld" };
|
|
1997
2063
|
}
|
|
1998
2064
|
const microdataRecipe = extractMicrodata(html);
|
|
1999
|
-
|
|
2000
|
-
|
|
2065
|
+
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
2066
|
+
try {
|
|
2067
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2068
|
+
if (globalFetch) {
|
|
2069
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:12", message: "Microdata extraction result", data: { hasMicrodata: !!microdataRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "D" }) }).catch(() => {
|
|
2070
|
+
});
|
|
2071
|
+
}
|
|
2072
|
+
} catch {
|
|
2073
|
+
}
|
|
2074
|
+
}
|
|
2001
2075
|
if (microdataRecipe) {
|
|
2002
2076
|
return { recipe: microdataRecipe, source: "microdata" };
|
|
2003
2077
|
}
|
|
@@ -2006,20 +2080,52 @@ function extractRecipe(html) {
|
|
|
2006
2080
|
|
|
2007
2081
|
// src/scraper/index.ts
|
|
2008
2082
|
async function scrapeRecipe(url, options = {}) {
|
|
2009
|
-
|
|
2010
|
-
|
|
2083
|
+
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
2084
|
+
try {
|
|
2085
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2086
|
+
if (globalFetch) {
|
|
2087
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:7", message: "scrapeRecipe entry", data: { url, hasOptions: !!options }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,B,C,D,E" }) }).catch(() => {
|
|
2088
|
+
});
|
|
2089
|
+
}
|
|
2090
|
+
} catch {
|
|
2091
|
+
}
|
|
2092
|
+
}
|
|
2011
2093
|
const html = await fetchPage(url, options);
|
|
2012
|
-
|
|
2013
|
-
|
|
2094
|
+
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
2095
|
+
try {
|
|
2096
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2097
|
+
if (globalFetch) {
|
|
2098
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:9", message: "HTML fetched", data: { htmlLength: html?.length, htmlPreview: html?.substring(0, 200) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
|
|
2099
|
+
});
|
|
2100
|
+
}
|
|
2101
|
+
} catch {
|
|
2102
|
+
}
|
|
2103
|
+
}
|
|
2014
2104
|
const { recipe } = extractRecipe(html);
|
|
2015
|
-
|
|
2016
|
-
|
|
2105
|
+
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
2106
|
+
try {
|
|
2107
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2108
|
+
if (globalFetch) {
|
|
2109
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:11", message: "extractRecipe result", data: { hasRecipe: !!recipe, recipeType: recipe?.["@type"], recipeName: recipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C,D" }) }).catch(() => {
|
|
2110
|
+
});
|
|
2111
|
+
}
|
|
2112
|
+
} catch {
|
|
2113
|
+
}
|
|
2114
|
+
}
|
|
2017
2115
|
if (!recipe) {
|
|
2018
2116
|
throw new Error("No Schema.org recipe data found in page");
|
|
2019
2117
|
}
|
|
2020
2118
|
const soustackRecipe = fromSchemaOrg(recipe);
|
|
2021
|
-
|
|
2022
|
-
|
|
2119
|
+
if (typeof process !== "undefined" && process.env.NODE_ENV !== "test") {
|
|
2120
|
+
try {
|
|
2121
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2122
|
+
if (globalFetch) {
|
|
2123
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:17", message: "fromSchemaOrg result", data: { hasSoustackRecipe: !!soustackRecipe, soustackRecipeName: soustackRecipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
2124
|
+
});
|
|
2125
|
+
}
|
|
2126
|
+
} catch {
|
|
2127
|
+
}
|
|
2128
|
+
}
|
|
2023
2129
|
if (!soustackRecipe) {
|
|
2024
2130
|
throw new Error("Schema.org data did not include a valid recipe");
|
|
2025
2131
|
}
|
|
@@ -2036,6 +2142,10 @@ function extractRecipeFromHTML(html) {
|
|
|
2036
2142
|
}
|
|
2037
2143
|
return soustackRecipe;
|
|
2038
2144
|
}
|
|
2145
|
+
function extractSchemaOrgRecipeFromHTML(html) {
|
|
2146
|
+
const { recipe } = extractRecipe(html);
|
|
2147
|
+
return recipe;
|
|
2148
|
+
}
|
|
2039
2149
|
|
|
2040
2150
|
// src/parsers/yield.ts
|
|
2041
2151
|
var RANGE_PATTERN = /^(\d+)(?:\s*(?:[-–—]|to)\s*)(\d+)\s+(.+)$/i;
|
|
@@ -2280,9 +2390,11 @@ function wordToNumber(word) {
|
|
|
2280
2390
|
}
|
|
2281
2391
|
|
|
2282
2392
|
exports.extractRecipeFromHTML = extractRecipeFromHTML;
|
|
2393
|
+
exports.extractSchemaOrgRecipeFromHTML = extractSchemaOrgRecipeFromHTML;
|
|
2283
2394
|
exports.formatDuration = formatDuration;
|
|
2284
2395
|
exports.formatYield = formatYield2;
|
|
2285
2396
|
exports.fromSchemaOrg = fromSchemaOrg;
|
|
2397
|
+
exports.normalizeImage = normalizeImage;
|
|
2286
2398
|
exports.normalizeIngredientInput = normalizeIngredientInput;
|
|
2287
2399
|
exports.normalizeYield = normalizeYield;
|
|
2288
2400
|
exports.parseDuration = parseDuration;
|