recipe-scrapers-js 0.1.0-alpha.6 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +31 -50
- package/package.json +9 -9
package/dist/index.js
CHANGED
|
@@ -25,8 +25,7 @@ function isString(value) {
|
|
|
25
25
|
*/
|
|
26
26
|
function getHostName(value) {
|
|
27
27
|
try {
|
|
28
|
-
|
|
29
|
-
return url.host;
|
|
28
|
+
return new URL(value.replace("www.", "")).host;
|
|
30
29
|
} catch {
|
|
31
30
|
throw new Error(`Invalid URL: ${value}`);
|
|
32
31
|
}
|
|
@@ -139,8 +138,7 @@ function splitToList(value, separator) {
|
|
|
139
138
|
* @TODO Implement [Temporal.Duration](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Temporal/Duration) once it lands.
|
|
140
139
|
*/
|
|
141
140
|
function parseMinutes(value) {
|
|
142
|
-
const duration = parse(value);
|
|
143
|
-
const totalSeconds = toSeconds(duration);
|
|
141
|
+
const totalSeconds = toSeconds(parse(value));
|
|
144
142
|
return Math.round(totalSeconds / 60);
|
|
145
143
|
}
|
|
146
144
|
|
|
@@ -181,8 +179,7 @@ function scoreSentenceSimilarity(first, second) {
|
|
|
181
179
|
const bigrams = (s) => new Set(Array.from({ length: s.length - 1 }, (_, i) => s.slice(i, i + 2)));
|
|
182
180
|
const firstBigrams = bigrams(first);
|
|
183
181
|
const secondBigrams = bigrams(second);
|
|
184
|
-
|
|
185
|
-
return 2 * intersectionSize / (firstBigrams.size + secondBigrams.size);
|
|
182
|
+
return 2 * [...firstBigrams].filter((b) => secondBigrams.has(b)).length / (firstBigrams.size + secondBigrams.size);
|
|
186
183
|
}
|
|
187
184
|
function bestMatch(testString, targetStrings) {
|
|
188
185
|
if (targetStrings.length === 0) throw new Error("targetStrings cannot be empty");
|
|
@@ -227,8 +224,7 @@ function groupIngredients($, ingredientsList, headingSelector, itemSelector) {
|
|
|
227
224
|
for (const el of elements) {
|
|
228
225
|
const $el = $(el);
|
|
229
226
|
if ($el.is(groupNameSelector)) {
|
|
230
|
-
|
|
231
|
-
currentHeading = headingText || DEFAULT_INGREDIENTS_GROUP_NAME;
|
|
227
|
+
currentHeading = normalizeString($el.text()) || DEFAULT_INGREDIENTS_GROUP_NAME;
|
|
232
228
|
if (!groupings.has(currentHeading)) groupings.set(currentHeading, /* @__PURE__ */ new Set());
|
|
233
229
|
} else if ($el.is(ingredientSelector)) {
|
|
234
230
|
const text = normalizeString($el.text());
|
|
@@ -383,8 +379,7 @@ const extractValueFromElement = (element) => {
|
|
|
383
379
|
return element.text().trim();
|
|
384
380
|
};
|
|
385
381
|
const extractSchemaType = (itemType) => {
|
|
386
|
-
|
|
387
|
-
return typeMatch?.[1];
|
|
382
|
+
return itemType.match(/schema\.org\/(\w+)/)?.[1];
|
|
388
383
|
};
|
|
389
384
|
/**
|
|
390
385
|
* Extracts microdata from HTML elements using itemtype and itemprop attributes
|
|
@@ -395,8 +390,7 @@ const extractSchemaType = (itemType) => {
|
|
|
395
390
|
*/
|
|
396
391
|
function extractMicrodata($, selector) {
|
|
397
392
|
const results = [];
|
|
398
|
-
|
|
399
|
-
elements.each((_, el) => {
|
|
393
|
+
$(selector).each((_, el) => {
|
|
400
394
|
const $element = $(el);
|
|
401
395
|
const itemType = $element.attr("itemtype");
|
|
402
396
|
const rootObject = {};
|
|
@@ -406,13 +400,11 @@ function extractMicrodata($, selector) {
|
|
|
406
400
|
}
|
|
407
401
|
const allProps = $element.find("[itemprop]").addBack("[itemprop]");
|
|
408
402
|
const nestedItemTypes = $element.find("[itemtype]");
|
|
409
|
-
|
|
403
|
+
allProps.filter((_$1, propEl) => {
|
|
410
404
|
const $prop = $(propEl);
|
|
411
405
|
if ($prop.attr("itemtype")) return true;
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
});
|
|
415
|
-
rootLevelProps.each((_$1, propEl) => {
|
|
406
|
+
return !nestedItemTypes.toArray().some((nestedEl) => $(nestedEl).find($prop).length > 0);
|
|
407
|
+
}).each((_$1, propEl) => {
|
|
416
408
|
const $prop = $(propEl);
|
|
417
409
|
const propName = $prop.attr("itemprop");
|
|
418
410
|
if (!propName) return;
|
|
@@ -497,8 +489,7 @@ function parseYields(element) {
|
|
|
497
489
|
const splitMatch = serveText.match(SERVE_REGEX_TO);
|
|
498
490
|
if (splitMatch && splitMatch.index !== void 0) serveText = serveText.slice(splitMatch.index + splitMatch[0].length).trim();
|
|
499
491
|
}
|
|
500
|
-
const match = serveText.match(SERVE_REGEX_NUMBER);
|
|
501
|
-
const matched = match?.groups?.items || "0";
|
|
492
|
+
const matched = serveText.match(SERVE_REGEX_NUMBER)?.groups?.items || "0";
|
|
502
493
|
const serveTextLower = serveText.toLowerCase();
|
|
503
494
|
let bestMatch$1 = null;
|
|
504
495
|
let bestMatchLength = 0;
|
|
@@ -535,8 +526,7 @@ function isSchemaOrgData(obj) {
|
|
|
535
526
|
}
|
|
536
527
|
function isThingType(obj, type) {
|
|
537
528
|
if (!isBaseType(obj)) return false;
|
|
538
|
-
|
|
539
|
-
return thingType === type;
|
|
529
|
+
return (Array.isArray(obj["@type"]) ? obj["@type"][0] : obj["@type"]) === type;
|
|
540
530
|
}
|
|
541
531
|
function isAggregateRating(obj) {
|
|
542
532
|
return isThingType(obj, "AggregateRating");
|
|
@@ -645,7 +635,6 @@ var SchemaOrgPlugin = class SchemaOrgPlugin extends ExtractorPlugin {
|
|
|
645
635
|
pickFromObject(obj, props) {
|
|
646
636
|
if (!isPlainObject(obj)) return void 0;
|
|
647
637
|
for (const prop of props) if (isString(obj[prop])) return obj[prop];
|
|
648
|
-
return void 0;
|
|
649
638
|
}
|
|
650
639
|
getSchemaTextValue(value, props = [
|
|
651
640
|
"textValue",
|
|
@@ -714,10 +703,7 @@ var SchemaOrgPlugin = class SchemaOrgPlugin extends ExtractorPlugin {
|
|
|
714
703
|
return value;
|
|
715
704
|
}
|
|
716
705
|
if (isString(value)) return parseMinutes(value);
|
|
717
|
-
if (isBaseType(value) && "maxValue" in value) {
|
|
718
|
-
const maxValue = this.getSchemaTextValue(value.maxValue);
|
|
719
|
-
return parseMinutes(maxValue);
|
|
720
|
-
}
|
|
706
|
+
if (isBaseType(value) && "maxValue" in value) return parseMinutes(this.getSchemaTextValue(value.maxValue));
|
|
721
707
|
return null;
|
|
722
708
|
}
|
|
723
709
|
parseInstructions(value) {
|
|
@@ -921,8 +907,7 @@ var RecipeExtractor = class RecipeExtractor {
|
|
|
921
907
|
this.logger.debug(`Extracting field: ${field}`);
|
|
922
908
|
for (const plugin of this.plugins) {
|
|
923
909
|
const pluginLogger = new Logger(this.getContext(plugin.name), this.options.logLevel);
|
|
924
|
-
|
|
925
|
-
if (isSupported && !isDefined(result)) try {
|
|
910
|
+
if (plugin.supports(field) && !isDefined(result)) try {
|
|
926
911
|
result = await plugin.extract(field);
|
|
927
912
|
} catch (err) {
|
|
928
913
|
if (err instanceof ExtractionFailedException) pluginLogger.verbose(err.message);
|
|
@@ -964,9 +949,7 @@ var AbstractScraper = class {
|
|
|
964
949
|
const { extraExtractors = [], extraPostProcessors = [], logLevel = LogLevel.WARN } = options;
|
|
965
950
|
this.logger = new Logger(this.constructor.name, logLevel);
|
|
966
951
|
this.$ = cheerio.load(html);
|
|
967
|
-
|
|
968
|
-
const basePostProcessors = [new HtmlStripperPlugin()];
|
|
969
|
-
this.pluginManager = new PluginManager(baseExtractors, basePostProcessors, extraExtractors, extraPostProcessors);
|
|
952
|
+
this.pluginManager = new PluginManager([new OpenGraphPlugin(this.$), new SchemaOrgPlugin(this.$, logLevel)], [new HtmlStripperPlugin()], extraExtractors, extraPostProcessors);
|
|
970
953
|
this.recipeExtractor = new RecipeExtractor(this.pluginManager.getExtractors(), this.constructor.name, { logLevel });
|
|
971
954
|
}
|
|
972
955
|
/**
|
|
@@ -1104,7 +1087,8 @@ const recipeDataSchema = z.object({
|
|
|
1104
1087
|
recipeTimeNote: z.string().optional(),
|
|
1105
1088
|
ingredientGroups: z.array(recipeIngredientGroupSchema),
|
|
1106
1089
|
headnote: z.string().optional(),
|
|
1107
|
-
instructions: z.array(recipeInstructionSchema)
|
|
1090
|
+
instructions: z.array(recipeInstructionSchema),
|
|
1091
|
+
metaData: z.object({ fields: z.object({ photo: z.object({ url: z.url() }) }) })
|
|
1108
1092
|
});
|
|
1109
1093
|
const pagePropsDataSchema = z.object({ props: z.object({ pageProps: z.object({ data: recipeDataSchema }) }) });
|
|
1110
1094
|
var AmericasTestKitchen = class extends AbstractScraper {
|
|
@@ -1113,6 +1097,7 @@ var AmericasTestKitchen = class extends AbstractScraper {
|
|
|
1113
1097
|
return "americastestkitchen.com";
|
|
1114
1098
|
}
|
|
1115
1099
|
extractors = {
|
|
1100
|
+
image: this.image.bind(this),
|
|
1116
1101
|
ingredients: this.ingredients.bind(this),
|
|
1117
1102
|
instructions: this.instructions.bind(this),
|
|
1118
1103
|
siteName: this.siteName.bind(this)
|
|
@@ -1120,6 +1105,14 @@ var AmericasTestKitchen = class extends AbstractScraper {
|
|
|
1120
1105
|
siteName() {
|
|
1121
1106
|
return "America's Test Kitchen";
|
|
1122
1107
|
}
|
|
1108
|
+
image(prevValue) {
|
|
1109
|
+
const data = this.getRecipeData();
|
|
1110
|
+
if (!data) {
|
|
1111
|
+
if (prevValue) return prevValue;
|
|
1112
|
+
throw new Error("Failed to extract image");
|
|
1113
|
+
}
|
|
1114
|
+
return data.metaData.fields.photo.url;
|
|
1115
|
+
}
|
|
1123
1116
|
ingredients(prevValue) {
|
|
1124
1117
|
let ingredients = this.parseIngredients();
|
|
1125
1118
|
if (!ingredients) ingredients = this.parseHtmlIngredients(prevValue);
|
|
@@ -1142,23 +1135,18 @@ var AmericasTestKitchen = class extends AbstractScraper {
|
|
|
1142
1135
|
parseHtmlIngredients(prevValue) {
|
|
1143
1136
|
const headingSelector = "[class*=\"RecipeIngredientGroups_group\"] > span";
|
|
1144
1137
|
const ingredientSelector = "[class*=\"RecipeIngredient\"] label";
|
|
1145
|
-
if (isList(prevValue) && prevValue.size > 0) {
|
|
1146
|
-
const result = groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1147
|
-
return result;
|
|
1148
|
-
}
|
|
1138
|
+
if (isList(prevValue) && prevValue.size > 0) return groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1149
1139
|
return null;
|
|
1150
1140
|
}
|
|
1151
1141
|
getRecipeData() {
|
|
1152
1142
|
if (this.data === null) {
|
|
1153
|
-
const jsonElement = this.$("script[type=\"application/json\"]");
|
|
1154
|
-
const jsonString = jsonElement.html();
|
|
1143
|
+
const jsonString = this.$("script[type=\"application/json\"]").html();
|
|
1155
1144
|
if (!jsonString) {
|
|
1156
1145
|
this.logger.warn("Could not find JSON data script tag");
|
|
1157
1146
|
return null;
|
|
1158
1147
|
}
|
|
1159
1148
|
try {
|
|
1160
|
-
|
|
1161
|
-
this.data = parsed.props.pageProps.data;
|
|
1149
|
+
this.data = pagePropsDataSchema.parse(JSON.parse(jsonString)).props.pageProps.data;
|
|
1162
1150
|
} catch (error) {
|
|
1163
1151
|
this.logger.error("Failed to parse JSON data:", error);
|
|
1164
1152
|
return null;
|
|
@@ -1208,10 +1196,7 @@ var BBCGoodFood = class extends AbstractScraper {
|
|
|
1208
1196
|
ingredients(prevValue) {
|
|
1209
1197
|
const headingSelector = ".recipe__ingredients h3";
|
|
1210
1198
|
const ingredientSelector = ".recipe__ingredients li";
|
|
1211
|
-
if (isList(prevValue) && prevValue.size > 0) {
|
|
1212
|
-
const result = groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1213
|
-
return result;
|
|
1214
|
-
}
|
|
1199
|
+
if (isList(prevValue) && prevValue.size > 0) return groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1215
1200
|
throw new Error("No ingredients found to group");
|
|
1216
1201
|
}
|
|
1217
1202
|
};
|
|
@@ -1233,8 +1218,7 @@ var Epicurious = class extends AbstractScraper {
|
|
|
1233
1218
|
}
|
|
1234
1219
|
extractors = { author: this.author.bind(this) };
|
|
1235
1220
|
author() {
|
|
1236
|
-
|
|
1237
|
-
return author;
|
|
1221
|
+
return this.$("a[itemprop=\"author\"]").text().trim();
|
|
1238
1222
|
}
|
|
1239
1223
|
};
|
|
1240
1224
|
|
|
@@ -1248,10 +1232,7 @@ var NYTimes = class extends AbstractScraper {
|
|
|
1248
1232
|
ingredients(prevValue) {
|
|
1249
1233
|
const headingSelector = "h3[class*=\"ingredientgroup_name\"]";
|
|
1250
1234
|
const ingredientSelector = "li[class*=\"ingredient\"]";
|
|
1251
|
-
if (isList(prevValue) && prevValue.size > 0) {
|
|
1252
|
-
const result = groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1253
|
-
return result;
|
|
1254
|
-
}
|
|
1235
|
+
if (isList(prevValue) && prevValue.size > 0) return groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1255
1236
|
throw new Error("No ingredients found to group");
|
|
1256
1237
|
}
|
|
1257
1238
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recipe-scrapers-js",
|
|
3
|
-
"version": "0.1.0-alpha.6",
|
|
3
|
+
"version": "0.1.0",
|
|
4
4
|
"license": "MIT",
|
|
5
5
|
"description": "A recipe scrapers library",
|
|
6
6
|
"author": {
|
|
@@ -39,18 +39,18 @@
|
|
|
39
39
|
"prepublishOnly": "bun run lint && bun run build"
|
|
40
40
|
},
|
|
41
41
|
"peerDependencies": {
|
|
42
|
-
"cheerio": "^1.1.
|
|
43
|
-
"zod": "^
|
|
42
|
+
"cheerio": "^1.1.2",
|
|
43
|
+
"zod": "^4.1.12"
|
|
44
44
|
},
|
|
45
45
|
"dependencies": {
|
|
46
|
-
"iso8601-duration": "^2.1.
|
|
46
|
+
"iso8601-duration": "^2.1.3",
|
|
47
47
|
"schema-dts": "^1.1.5"
|
|
48
48
|
},
|
|
49
49
|
"devDependencies": {
|
|
50
|
-
"@biomejs/biome": "^2.
|
|
51
|
-
"@types/bun": "^1.
|
|
52
|
-
"cheerio": "^1.1.
|
|
53
|
-
"tsdown": "^0.
|
|
54
|
-
"typescript": "^5.
|
|
50
|
+
"@biomejs/biome": "^2.2.6",
|
|
51
|
+
"@types/bun": "^1.3.0",
|
|
52
|
+
"cheerio": "^1.1.2",
|
|
53
|
+
"tsdown": "^0.15.7",
|
|
54
|
+
"typescript": "^5.9.3"
|
|
55
55
|
}
|
|
56
56
|
}
|