recipe-scrapers-js 0.1.0-alpha.7 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +20 -49
- package/package.json +9 -9
package/dist/index.js
CHANGED
|
@@ -25,8 +25,7 @@ function isString(value) {
|
|
|
25
25
|
*/
|
|
26
26
|
function getHostName(value) {
|
|
27
27
|
try {
|
|
28
|
-
|
|
29
|
-
return url.host;
|
|
28
|
+
return new URL(value.replace("www.", "")).host;
|
|
30
29
|
} catch {
|
|
31
30
|
throw new Error(`Invalid URL: ${value}`);
|
|
32
31
|
}
|
|
@@ -139,8 +138,7 @@ function splitToList(value, separator) {
|
|
|
139
138
|
* @TODO Implement [Temporal.Duration](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Temporal/Duration) once it lands.
|
|
140
139
|
*/
|
|
141
140
|
function parseMinutes(value) {
|
|
142
|
-
const duration = parse(value);
|
|
143
|
-
const totalSeconds = toSeconds(duration);
|
|
141
|
+
const totalSeconds = toSeconds(parse(value));
|
|
144
142
|
return Math.round(totalSeconds / 60);
|
|
145
143
|
}
|
|
146
144
|
|
|
@@ -181,8 +179,7 @@ function scoreSentenceSimilarity(first, second) {
|
|
|
181
179
|
const bigrams = (s) => new Set(Array.from({ length: s.length - 1 }, (_, i) => s.slice(i, i + 2)));
|
|
182
180
|
const firstBigrams = bigrams(first);
|
|
183
181
|
const secondBigrams = bigrams(second);
|
|
184
|
-
|
|
185
|
-
return 2 * intersectionSize / (firstBigrams.size + secondBigrams.size);
|
|
182
|
+
return 2 * [...firstBigrams].filter((b) => secondBigrams.has(b)).length / (firstBigrams.size + secondBigrams.size);
|
|
186
183
|
}
|
|
187
184
|
function bestMatch(testString, targetStrings) {
|
|
188
185
|
if (targetStrings.length === 0) throw new Error("targetStrings cannot be empty");
|
|
@@ -227,8 +224,7 @@ function groupIngredients($, ingredientsList, headingSelector, itemSelector) {
|
|
|
227
224
|
for (const el of elements) {
|
|
228
225
|
const $el = $(el);
|
|
229
226
|
if ($el.is(groupNameSelector)) {
|
|
230
|
-
|
|
231
|
-
currentHeading = headingText || DEFAULT_INGREDIENTS_GROUP_NAME;
|
|
227
|
+
currentHeading = normalizeString($el.text()) || DEFAULT_INGREDIENTS_GROUP_NAME;
|
|
232
228
|
if (!groupings.has(currentHeading)) groupings.set(currentHeading, /* @__PURE__ */ new Set());
|
|
233
229
|
} else if ($el.is(ingredientSelector)) {
|
|
234
230
|
const text = normalizeString($el.text());
|
|
@@ -383,8 +379,7 @@ const extractValueFromElement = (element) => {
|
|
|
383
379
|
return element.text().trim();
|
|
384
380
|
};
|
|
385
381
|
const extractSchemaType = (itemType) => {
|
|
386
|
-
|
|
387
|
-
return typeMatch?.[1];
|
|
382
|
+
return itemType.match(/schema\.org\/(\w+)/)?.[1];
|
|
388
383
|
};
|
|
389
384
|
/**
|
|
390
385
|
* Extracts microdata from HTML elements using itemtype and itemprop attributes
|
|
@@ -395,8 +390,7 @@ const extractSchemaType = (itemType) => {
|
|
|
395
390
|
*/
|
|
396
391
|
function extractMicrodata($, selector) {
|
|
397
392
|
const results = [];
|
|
398
|
-
|
|
399
|
-
elements.each((_, el) => {
|
|
393
|
+
$(selector).each((_, el) => {
|
|
400
394
|
const $element = $(el);
|
|
401
395
|
const itemType = $element.attr("itemtype");
|
|
402
396
|
const rootObject = {};
|
|
@@ -406,13 +400,11 @@ function extractMicrodata($, selector) {
|
|
|
406
400
|
}
|
|
407
401
|
const allProps = $element.find("[itemprop]").addBack("[itemprop]");
|
|
408
402
|
const nestedItemTypes = $element.find("[itemtype]");
|
|
409
|
-
|
|
403
|
+
allProps.filter((_$1, propEl) => {
|
|
410
404
|
const $prop = $(propEl);
|
|
411
405
|
if ($prop.attr("itemtype")) return true;
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
});
|
|
415
|
-
rootLevelProps.each((_$1, propEl) => {
|
|
406
|
+
return !nestedItemTypes.toArray().some((nestedEl) => $(nestedEl).find($prop).length > 0);
|
|
407
|
+
}).each((_$1, propEl) => {
|
|
416
408
|
const $prop = $(propEl);
|
|
417
409
|
const propName = $prop.attr("itemprop");
|
|
418
410
|
if (!propName) return;
|
|
@@ -497,8 +489,7 @@ function parseYields(element) {
|
|
|
497
489
|
const splitMatch = serveText.match(SERVE_REGEX_TO);
|
|
498
490
|
if (splitMatch && splitMatch.index !== void 0) serveText = serveText.slice(splitMatch.index + splitMatch[0].length).trim();
|
|
499
491
|
}
|
|
500
|
-
const match = serveText.match(SERVE_REGEX_NUMBER);
|
|
501
|
-
const matched = match?.groups?.items || "0";
|
|
492
|
+
const matched = serveText.match(SERVE_REGEX_NUMBER)?.groups?.items || "0";
|
|
502
493
|
const serveTextLower = serveText.toLowerCase();
|
|
503
494
|
let bestMatch$1 = null;
|
|
504
495
|
let bestMatchLength = 0;
|
|
@@ -535,8 +526,7 @@ function isSchemaOrgData(obj) {
|
|
|
535
526
|
}
|
|
536
527
|
function isThingType(obj, type) {
|
|
537
528
|
if (!isBaseType(obj)) return false;
|
|
538
|
-
|
|
539
|
-
return thingType === type;
|
|
529
|
+
return (Array.isArray(obj["@type"]) ? obj["@type"][0] : obj["@type"]) === type;
|
|
540
530
|
}
|
|
541
531
|
function isAggregateRating(obj) {
|
|
542
532
|
return isThingType(obj, "AggregateRating");
|
|
@@ -645,7 +635,6 @@ var SchemaOrgPlugin = class SchemaOrgPlugin extends ExtractorPlugin {
|
|
|
645
635
|
pickFromObject(obj, props) {
|
|
646
636
|
if (!isPlainObject(obj)) return void 0;
|
|
647
637
|
for (const prop of props) if (isString(obj[prop])) return obj[prop];
|
|
648
|
-
return void 0;
|
|
649
638
|
}
|
|
650
639
|
getSchemaTextValue(value, props = [
|
|
651
640
|
"textValue",
|
|
@@ -714,10 +703,7 @@ var SchemaOrgPlugin = class SchemaOrgPlugin extends ExtractorPlugin {
|
|
|
714
703
|
return value;
|
|
715
704
|
}
|
|
716
705
|
if (isString(value)) return parseMinutes(value);
|
|
717
|
-
if (isBaseType(value) && "maxValue" in value) {
|
|
718
|
-
const maxValue = this.getSchemaTextValue(value.maxValue);
|
|
719
|
-
return parseMinutes(maxValue);
|
|
720
|
-
}
|
|
706
|
+
if (isBaseType(value) && "maxValue" in value) return parseMinutes(this.getSchemaTextValue(value.maxValue));
|
|
721
707
|
return null;
|
|
722
708
|
}
|
|
723
709
|
parseInstructions(value) {
|
|
@@ -921,8 +907,7 @@ var RecipeExtractor = class RecipeExtractor {
|
|
|
921
907
|
this.logger.debug(`Extracting field: ${field}`);
|
|
922
908
|
for (const plugin of this.plugins) {
|
|
923
909
|
const pluginLogger = new Logger(this.getContext(plugin.name), this.options.logLevel);
|
|
924
|
-
|
|
925
|
-
if (isSupported && !isDefined(result)) try {
|
|
910
|
+
if (plugin.supports(field) && !isDefined(result)) try {
|
|
926
911
|
result = await plugin.extract(field);
|
|
927
912
|
} catch (err) {
|
|
928
913
|
if (err instanceof ExtractionFailedException) pluginLogger.verbose(err.message);
|
|
@@ -964,9 +949,7 @@ var AbstractScraper = class {
|
|
|
964
949
|
const { extraExtractors = [], extraPostProcessors = [], logLevel = LogLevel.WARN } = options;
|
|
965
950
|
this.logger = new Logger(this.constructor.name, logLevel);
|
|
966
951
|
this.$ = cheerio.load(html);
|
|
967
|
-
|
|
968
|
-
const basePostProcessors = [new HtmlStripperPlugin()];
|
|
969
|
-
this.pluginManager = new PluginManager(baseExtractors, basePostProcessors, extraExtractors, extraPostProcessors);
|
|
952
|
+
this.pluginManager = new PluginManager([new OpenGraphPlugin(this.$), new SchemaOrgPlugin(this.$, logLevel)], [new HtmlStripperPlugin()], extraExtractors, extraPostProcessors);
|
|
970
953
|
this.recipeExtractor = new RecipeExtractor(this.pluginManager.getExtractors(), this.constructor.name, { logLevel });
|
|
971
954
|
}
|
|
972
955
|
/**
|
|
@@ -1152,23 +1135,18 @@ var AmericasTestKitchen = class extends AbstractScraper {
|
|
|
1152
1135
|
parseHtmlIngredients(prevValue) {
|
|
1153
1136
|
const headingSelector = "[class*=\"RecipeIngredientGroups_group\"] > span";
|
|
1154
1137
|
const ingredientSelector = "[class*=\"RecipeIngredient\"] label";
|
|
1155
|
-
if (isList(prevValue) && prevValue.size > 0) {
|
|
1156
|
-
const result = groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1157
|
-
return result;
|
|
1158
|
-
}
|
|
1138
|
+
if (isList(prevValue) && prevValue.size > 0) return groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1159
1139
|
return null;
|
|
1160
1140
|
}
|
|
1161
1141
|
getRecipeData() {
|
|
1162
1142
|
if (this.data === null) {
|
|
1163
|
-
const jsonElement = this.$("script[type=\"application/json\"]");
|
|
1164
|
-
const jsonString = jsonElement.html();
|
|
1143
|
+
const jsonString = this.$("script[type=\"application/json\"]").html();
|
|
1165
1144
|
if (!jsonString) {
|
|
1166
1145
|
this.logger.warn("Could not find JSON data script tag");
|
|
1167
1146
|
return null;
|
|
1168
1147
|
}
|
|
1169
1148
|
try {
|
|
1170
|
-
|
|
1171
|
-
this.data = parsed.props.pageProps.data;
|
|
1149
|
+
this.data = pagePropsDataSchema.parse(JSON.parse(jsonString)).props.pageProps.data;
|
|
1172
1150
|
} catch (error) {
|
|
1173
1151
|
this.logger.error("Failed to parse JSON data:", error);
|
|
1174
1152
|
return null;
|
|
@@ -1218,10 +1196,7 @@ var BBCGoodFood = class extends AbstractScraper {
|
|
|
1218
1196
|
ingredients(prevValue) {
|
|
1219
1197
|
const headingSelector = ".recipe__ingredients h3";
|
|
1220
1198
|
const ingredientSelector = ".recipe__ingredients li";
|
|
1221
|
-
if (isList(prevValue) && prevValue.size > 0) {
|
|
1222
|
-
const result = groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1223
|
-
return result;
|
|
1224
|
-
}
|
|
1199
|
+
if (isList(prevValue) && prevValue.size > 0) return groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1225
1200
|
throw new Error("No ingredients found to group");
|
|
1226
1201
|
}
|
|
1227
1202
|
};
|
|
@@ -1243,8 +1218,7 @@ var Epicurious = class extends AbstractScraper {
|
|
|
1243
1218
|
}
|
|
1244
1219
|
extractors = { author: this.author.bind(this) };
|
|
1245
1220
|
author() {
|
|
1246
|
-
|
|
1247
|
-
return author;
|
|
1221
|
+
return this.$("a[itemprop=\"author\"]").text().trim();
|
|
1248
1222
|
}
|
|
1249
1223
|
};
|
|
1250
1224
|
|
|
@@ -1258,10 +1232,7 @@ var NYTimes = class extends AbstractScraper {
|
|
|
1258
1232
|
ingredients(prevValue) {
|
|
1259
1233
|
const headingSelector = "h3[class*=\"ingredientgroup_name\"]";
|
|
1260
1234
|
const ingredientSelector = "li[class*=\"ingredient\"]";
|
|
1261
|
-
if (isList(prevValue) && prevValue.size > 0) {
|
|
1262
|
-
const result = groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1263
|
-
return result;
|
|
1264
|
-
}
|
|
1235
|
+
if (isList(prevValue) && prevValue.size > 0) return groupIngredients(this.$, prevValue, headingSelector, ingredientSelector);
|
|
1265
1236
|
throw new Error("No ingredients found to group");
|
|
1266
1237
|
}
|
|
1267
1238
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recipe-scrapers-js",
|
|
3
|
-
"version": "0.1.0-alpha.7",
|
|
3
|
+
"version": "0.1.0",
|
|
4
4
|
"license": "MIT",
|
|
5
5
|
"description": "A recipe scrapers library",
|
|
6
6
|
"author": {
|
|
@@ -39,18 +39,18 @@
|
|
|
39
39
|
"prepublishOnly": "bun run lint && bun run build"
|
|
40
40
|
},
|
|
41
41
|
"peerDependencies": {
|
|
42
|
-
"cheerio": "^1.1.
|
|
43
|
-
"zod": "^
|
|
42
|
+
"cheerio": "^1.1.2",
|
|
43
|
+
"zod": "^4.1.12"
|
|
44
44
|
},
|
|
45
45
|
"dependencies": {
|
|
46
|
-
"iso8601-duration": "^2.1.
|
|
46
|
+
"iso8601-duration": "^2.1.3",
|
|
47
47
|
"schema-dts": "^1.1.5"
|
|
48
48
|
},
|
|
49
49
|
"devDependencies": {
|
|
50
|
-
"@biomejs/biome": "^2.
|
|
51
|
-
"@types/bun": "^1.
|
|
52
|
-
"cheerio": "^1.1.
|
|
53
|
-
"tsdown": "^0.
|
|
54
|
-
"typescript": "^5.
|
|
50
|
+
"@biomejs/biome": "^2.2.6",
|
|
51
|
+
"@types/bun": "^1.3.0",
|
|
52
|
+
"cheerio": "^1.1.2",
|
|
53
|
+
"tsdown": "^0.15.7",
|
|
54
|
+
"typescript": "^5.9.3"
|
|
55
55
|
}
|
|
56
56
|
}
|