recipe-scrapers-js 0.1.0-alpha.5 → 0.1.0-alpha.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -4
- package/dist/index.js +146 -23
- package/package.json +5 -4
package/README.md
CHANGED
|
@@ -19,14 +19,16 @@ A TypeScript/JavaScript library for scraping recipe data from various cooking we
|
|
|
19
19
|
|
|
20
20
|
## Installation
|
|
21
21
|
|
|
22
|
+
Add the `recipe-scrapers-js` package and its peer dependencies.
|
|
23
|
+
|
|
22
24
|
```bash
|
|
23
|
-
npm install recipe-scrapers-js
|
|
25
|
+
npm install recipe-scrapers-js cheerio zod
|
|
24
26
|
# or
|
|
25
|
-
yarn add recipe-scrapers-js
|
|
27
|
+
yarn add recipe-scrapers-js cheerio zod
|
|
26
28
|
# or
|
|
27
|
-
pnpm add recipe-scrapers-js
|
|
29
|
+
pnpm add recipe-scrapers-js cheerio zod
|
|
28
30
|
# or
|
|
29
|
-
bun add recipe-scrapers-js
|
|
31
|
+
bun add recipe-scrapers-js cheerio zod
|
|
30
32
|
```
|
|
31
33
|
|
|
32
34
|
## Usage
|
|
@@ -195,6 +197,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
|
|
|
195
197
|
- Original [recipe-scrapers](https://github.com/hhursev/recipe-scrapers) Python library by [hhursev](https://github.com/hhursev)
|
|
196
198
|
- [Schema.org Recipe specification](https://schema.org/Recipe)
|
|
197
199
|
- [Cheerio](https://cheerio.js.org/) for HTML parsing
|
|
200
|
+
- [Zod](https://zod.dev/) for schema validation
|
|
198
201
|
|
|
199
202
|
## Copyright and Usage
|
|
200
203
|
|
package/dist/index.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import * as cheerio from "cheerio";
|
|
2
2
|
import { parse, toSeconds } from "iso8601-duration";
|
|
3
|
+
import z from "zod/v4";
|
|
3
4
|
|
|
4
5
|
//#region src/utils/index.ts
|
|
5
6
|
function isDefined(value) {
|
|
@@ -145,7 +146,7 @@ function parseMinutes(value) {
|
|
|
145
146
|
|
|
146
147
|
//#endregion
|
|
147
148
|
//#region src/utils/ingredients.ts
|
|
148
|
-
const
|
|
149
|
+
const DEFAULT_INGREDIENTS_GROUP_NAME = "Ingredients";
|
|
149
150
|
const DEFAULT_GROUPING_SELECTORS = {
|
|
150
151
|
wprm: {
|
|
151
152
|
headingSelectors: [".wprm-recipe-ingredient-group h4", ".wprm-recipe-group-name"],
|
|
@@ -227,13 +228,13 @@ function groupIngredients($, ingredientsList, headingSelector, itemSelector) {
|
|
|
227
228
|
const $el = $(el);
|
|
228
229
|
if ($el.is(groupNameSelector)) {
|
|
229
230
|
const headingText = normalizeString($el.text());
|
|
230
|
-
currentHeading = headingText ||
|
|
231
|
+
currentHeading = headingText || DEFAULT_INGREDIENTS_GROUP_NAME;
|
|
231
232
|
if (!groupings.has(currentHeading)) groupings.set(currentHeading, /* @__PURE__ */ new Set());
|
|
232
233
|
} else if ($el.is(ingredientSelector)) {
|
|
233
234
|
const text = normalizeString($el.text());
|
|
234
235
|
if (!text) continue;
|
|
235
236
|
const matched = bestMatch(text, ingredients);
|
|
236
|
-
const heading = currentHeading ||
|
|
237
|
+
const heading = currentHeading || DEFAULT_INGREDIENTS_GROUP_NAME;
|
|
237
238
|
if (!groupings.has(heading)) groupings.set(heading, /* @__PURE__ */ new Set());
|
|
238
239
|
groupings.get(heading)?.add(matched);
|
|
239
240
|
}
|
|
@@ -1076,6 +1077,137 @@ var AllRecipes = class extends AbstractScraper {
|
|
|
1076
1077
|
extractors = {};
|
|
1077
1078
|
};
|
|
1078
1079
|
|
|
1080
|
+
//#endregion
|
|
1081
|
+
//#region src/scrapers/americastestkitchen.ts
|
|
1082
|
+
const recipeIngredientItemSchema = z.object({ fields: z.object({
|
|
1083
|
+
qty: z.string(),
|
|
1084
|
+
preText: z.string(),
|
|
1085
|
+
postText: z.string(),
|
|
1086
|
+
measurement: z.string().nullable(),
|
|
1087
|
+
pluralIngredient: z.boolean(),
|
|
1088
|
+
ingredient: z.object({
|
|
1089
|
+
contentType: z.string(),
|
|
1090
|
+
fields: z.object({
|
|
1091
|
+
title: z.string(),
|
|
1092
|
+
pluralTitle: z.string(),
|
|
1093
|
+
kind: z.string()
|
|
1094
|
+
})
|
|
1095
|
+
})
|
|
1096
|
+
}) });
|
|
1097
|
+
const recipeIngredientGroupSchema = z.object({ fields: z.object({
|
|
1098
|
+
title: z.string(),
|
|
1099
|
+
recipeIngredientItems: z.array(recipeIngredientItemSchema)
|
|
1100
|
+
}) });
|
|
1101
|
+
const recipeInstructionSchema = z.object({ fields: z.object({ content: z.string() }) });
|
|
1102
|
+
const recipeDataSchema = z.object({
|
|
1103
|
+
totalCookTime: z.number(),
|
|
1104
|
+
recipeTimeNote: z.string().optional(),
|
|
1105
|
+
ingredientGroups: z.array(recipeIngredientGroupSchema),
|
|
1106
|
+
headnote: z.string().optional(),
|
|
1107
|
+
instructions: z.array(recipeInstructionSchema),
|
|
1108
|
+
metaData: z.object({ fields: z.object({ photo: z.object({ url: z.url() }) }) })
|
|
1109
|
+
});
|
|
1110
|
+
const pagePropsDataSchema = z.object({ props: z.object({ pageProps: z.object({ data: recipeDataSchema }) }) });
|
|
1111
|
+
var AmericasTestKitchen = class extends AbstractScraper {
|
|
1112
|
+
data = null;
|
|
1113
|
+
static host() {
|
|
1114
|
+
return "americastestkitchen.com";
|
|
1115
|
+
}
|
|
1116
|
+
extractors = {
|
|
1117
|
+
image: this.image.bind(this),
|
|
1118
|
+
ingredients: this.ingredients.bind(this),
|
|
1119
|
+
instructions: this.instructions.bind(this),
|
|
1120
|
+
siteName: this.siteName.bind(this)
|
|
1121
|
+
};
|
|
1122
|
+
siteName() {
|
|
1123
|
+
return "America's Test Kitchen";
|
|
1124
|
+
}
|
|
1125
|
+
image(prevValue) {
|
|
1126
|
+
const data = this.getRecipeData();
|
|
1127
|
+
if (!data) {
|
|
1128
|
+
if (prevValue) return prevValue;
|
|
1129
|
+
throw new Error("Failed to extract image");
|
|
1130
|
+
}
|
|
1131
|
+
return data.metaData.fields.photo.url;
|
|
1132
|
+
}
|
|
1133
|
+
ingredients(prevValue) {
|
|
1134
|
+
let ingredients = this.parseIngredients();
|
|
1135
|
+
if (!ingredients) ingredients = this.parseHtmlIngredients(prevValue);
|
|
1136
|
+
if (!ingredients) throw new Error("Failed to extract ingredients");
|
|
1137
|
+
return ingredients;
|
|
1138
|
+
}
|
|
1139
|
+
instructions(prevValue) {
|
|
1140
|
+
const data = this.getRecipeData();
|
|
1141
|
+
if (!data) {
|
|
1142
|
+
if (prevValue) return prevValue;
|
|
1143
|
+
throw new Error("Failed to extract instructions");
|
|
1144
|
+
}
|
|
1145
|
+
const { headnote } = data;
|
|
1146
|
+
let headnoteText = "";
|
|
1147
|
+
if (headnote) headnoteText = `Note: ${normalizeString(headnote)}`;
|
|
1148
|
+
const instructionTexts = [];
|
|
1149
|
+
for (const instruction of data.instructions) instructionTexts.push(normalizeString(instruction.fields.content));
|
|
1150
|
+
return new Set([headnoteText, ...instructionTexts]);
|
|
1151
|
+
}
|
|
1152
|
+
parseHtmlIngredients(prevValue) {
|
|
1153
|
+
const headingSelector = "[class*=\"RecipeIngredientGroups_group\"] > span";
|
|
1154
|
+
const ingredientSelector = "[class*=\"RecipeIngredient\"] label";
|
|
1155
|
+
if (isList(prevValue) && prevValue.size > 0) {
|
|
1156
|
+
const result = groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1157
|
+
return result;
|
|
1158
|
+
}
|
|
1159
|
+
return null;
|
|
1160
|
+
}
|
|
1161
|
+
getRecipeData() {
|
|
1162
|
+
if (this.data === null) {
|
|
1163
|
+
const jsonElement = this.$("script[type=\"application/json\"]");
|
|
1164
|
+
const jsonString = jsonElement.html();
|
|
1165
|
+
if (!jsonString) {
|
|
1166
|
+
this.logger.warn("Could not find JSON data script tag");
|
|
1167
|
+
return null;
|
|
1168
|
+
}
|
|
1169
|
+
try {
|
|
1170
|
+
const parsed = pagePropsDataSchema.parse(JSON.parse(jsonString));
|
|
1171
|
+
this.data = parsed.props.pageProps.data;
|
|
1172
|
+
} catch (error) {
|
|
1173
|
+
this.logger.error("Failed to parse JSON data:", error);
|
|
1174
|
+
return null;
|
|
1175
|
+
}
|
|
1176
|
+
}
|
|
1177
|
+
return this.data;
|
|
1178
|
+
}
|
|
1179
|
+
parseIngredientItem(ingredientItem) {
|
|
1180
|
+
const { fields } = ingredientItem;
|
|
1181
|
+
const fragments = [
|
|
1182
|
+
fields.qty || "",
|
|
1183
|
+
fields.measurement || "",
|
|
1184
|
+
fields.ingredient.fields.title || "",
|
|
1185
|
+
fields.postText || ""
|
|
1186
|
+
];
|
|
1187
|
+
const filteredFragments = [];
|
|
1188
|
+
for (const fragment of fragments) if (fragment) filteredFragments.push(fragment.trimEnd());
|
|
1189
|
+
return filteredFragments.join(" ").trimEnd().replace(" ,", ",");
|
|
1190
|
+
}
|
|
1191
|
+
parseIngredients() {
|
|
1192
|
+
const data = this.getRecipeData();
|
|
1193
|
+
if (!data) return null;
|
|
1194
|
+
const { ingredientGroups } = data;
|
|
1195
|
+
if (ingredientGroups.length === 1) {
|
|
1196
|
+
const ingredientSet = /* @__PURE__ */ new Set();
|
|
1197
|
+
for (const item of ingredientGroups[0].fields.recipeIngredientItems) ingredientSet.add(this.parseIngredientItem(item));
|
|
1198
|
+
return ingredientSet;
|
|
1199
|
+
}
|
|
1200
|
+
const ingredientMap = /* @__PURE__ */ new Map();
|
|
1201
|
+
for (const group of ingredientGroups) {
|
|
1202
|
+
const groupTitle = group.fields.title || DEFAULT_INGREDIENTS_GROUP_NAME;
|
|
1203
|
+
const ingredientSet = /* @__PURE__ */ new Set();
|
|
1204
|
+
for (const item of group.fields.recipeIngredientItems) ingredientSet.add(this.parseIngredientItem(item));
|
|
1205
|
+
ingredientMap.set(groupTitle, ingredientSet);
|
|
1206
|
+
}
|
|
1207
|
+
return ingredientMap;
|
|
1208
|
+
}
|
|
1209
|
+
};
|
|
1210
|
+
|
|
1079
1211
|
//#endregion
|
|
1080
1212
|
//#region src/scrapers/bbcgoodfood.ts
|
|
1081
1213
|
var BBCGoodFood = class extends AbstractScraper {
|
|
@@ -1083,16 +1215,6 @@ var BBCGoodFood = class extends AbstractScraper {
|
|
|
1083
1215
|
return "bbcgoodfood.com";
|
|
1084
1216
|
}
|
|
1085
1217
|
extractors = { ingredients: this.ingredients.bind(this) };
|
|
1086
|
-
/**
|
|
1087
|
-
* The NYTimes website appears to auto generate it's CSS class names,
|
|
1088
|
-
* which results in them ending with a string a random characters.
|
|
1089
|
-
* Matching the exact class name is likely to break fairly quickly
|
|
1090
|
-
* so instead we are going to match on a partial class name.
|
|
1091
|
-
* For example, h3[class*='ingredientgroup_name'] matches an h3 element
|
|
1092
|
-
* with a class that contains the value 'ingredient_groupname' at least once
|
|
1093
|
-
* anywhere in the element class attribute.
|
|
1094
|
-
* @link https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors
|
|
1095
|
-
*/
|
|
1096
1218
|
ingredients(prevValue) {
|
|
1097
1219
|
const headingSelector = ".recipe__ingredients h3";
|
|
1098
1220
|
const ingredientSelector = ".recipe__ingredients li";
|
|
@@ -1104,6 +1226,15 @@ var BBCGoodFood = class extends AbstractScraper {
|
|
|
1104
1226
|
}
|
|
1105
1227
|
};
|
|
1106
1228
|
|
|
1229
|
+
//#endregion
|
|
1230
|
+
//#region src/scrapers/eatingwell.ts
|
|
1231
|
+
var EatingWell = class extends AbstractScraper {
|
|
1232
|
+
static host() {
|
|
1233
|
+
return "eatingwell.com";
|
|
1234
|
+
}
|
|
1235
|
+
extractors = {};
|
|
1236
|
+
};
|
|
1237
|
+
|
|
1107
1238
|
//#endregion
|
|
1108
1239
|
//#region src/scrapers/epicurious.ts
|
|
1109
1240
|
var Epicurious = class extends AbstractScraper {
|
|
@@ -1124,16 +1255,6 @@ var NYTimes = class extends AbstractScraper {
|
|
|
1124
1255
|
return "cooking.nytimes.com";
|
|
1125
1256
|
}
|
|
1126
1257
|
extractors = { ingredients: this.ingredients.bind(this) };
|
|
1127
|
-
/**
|
|
1128
|
-
* The NYTimes website appears to auto generate it's CSS class names,
|
|
1129
|
-
* which results in them ending with a string a random characters.
|
|
1130
|
-
* Matching the exact class name is likely to break fairly quickly
|
|
1131
|
-
* so instead we are going to match on a partial class name.
|
|
1132
|
-
* For example, h3[class*='ingredientgroup_name'] matches an h3 element
|
|
1133
|
-
* with a class that contains the value 'ingredient_groupname' at least once
|
|
1134
|
-
* anywhere in the element class attribute.
|
|
1135
|
-
* @link https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors
|
|
1136
|
-
*/
|
|
1137
1258
|
ingredients(prevValue) {
|
|
1138
1259
|
const headingSelector = "h3[class*=\"ingredientgroup_name\"]";
|
|
1139
1260
|
const ingredientSelector = "li[class*=\"ingredient\"]";
|
|
@@ -1184,7 +1305,9 @@ var SimplyRecipes = class extends AbstractScraper {
|
|
|
1184
1305
|
*/
|
|
1185
1306
|
const scrapers = {
|
|
1186
1307
|
[AllRecipes.host()]: AllRecipes,
|
|
1308
|
+
[AmericasTestKitchen.host()]: AmericasTestKitchen,
|
|
1187
1309
|
[BBCGoodFood.host()]: BBCGoodFood,
|
|
1310
|
+
[EatingWell.host()]: EatingWell,
|
|
1188
1311
|
[Epicurious.host()]: Epicurious,
|
|
1189
1312
|
[SeriousEats.host()]: SeriousEats,
|
|
1190
1313
|
[SimplyRecipes.host()]: SimplyRecipes,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recipe-scrapers-js",
|
|
3
|
-
"version": "0.1.0-alpha.
|
|
3
|
+
"version": "0.1.0-alpha.7",
|
|
4
4
|
"license": "MIT",
|
|
5
5
|
"description": "A recipe scrapers library",
|
|
6
6
|
"author": {
|
|
@@ -39,15 +39,16 @@
|
|
|
39
39
|
"prepublishOnly": "bun run lint && bun run build"
|
|
40
40
|
},
|
|
41
41
|
"peerDependencies": {
|
|
42
|
-
"cheerio": "^1.1.0"
|
|
42
|
+
"cheerio": "^1.1.0",
|
|
43
|
+
"zod": "^3.25.76"
|
|
43
44
|
},
|
|
44
45
|
"dependencies": {
|
|
45
46
|
"iso8601-duration": "^2.1.2",
|
|
46
47
|
"schema-dts": "^1.1.5"
|
|
47
48
|
},
|
|
48
49
|
"devDependencies": {
|
|
49
|
-
"@biomejs/biome": "^2.
|
|
50
|
-
"@types/bun": "^1.2.
|
|
50
|
+
"@biomejs/biome": "^2.1.1",
|
|
51
|
+
"@types/bun": "^1.2.18",
|
|
51
52
|
"cheerio": "^1.1.0",
|
|
52
53
|
"tsdown": "^0.12.9",
|
|
53
54
|
"typescript": "^5.8.3"
|