recipe-scrapers-js 0.1.0-alpha.3 → 0.1.0-alpha.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js
CHANGED
|
@@ -18,11 +18,13 @@ function isString(value) {
|
|
|
18
18
|
return typeof value === "string";
|
|
19
19
|
}
|
|
20
20
|
/**
|
|
21
|
-
* Extracts the host name from a URL string
|
|
21
|
+
* Extracts the host name from a URL string
|
|
22
|
+
* and removes 'www.' prefix if present.
|
|
23
|
+
* Throws an error if the input is not a valid URL.
|
|
22
24
|
*/
|
|
23
25
|
function getHostName(value) {
|
|
24
26
|
try {
|
|
25
|
-
const url = new URL(value);
|
|
27
|
+
const url = new URL(value.replace("www.", ""));
|
|
26
28
|
return url.host;
|
|
27
29
|
} catch {
|
|
28
30
|
throw new Error(`Invalid URL: ${value}`);
|
|
@@ -280,7 +282,7 @@ var HtmlStripperPlugin = class extends PostProcessorPlugin {
|
|
|
280
282
|
return ingredients;
|
|
281
283
|
}
|
|
282
284
|
stripHtml(html) {
|
|
283
|
-
return html.replace(/<[^>]*>/g, "").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, "\"").replace(/"/g, "\"").replace(/'/g, "'").trim();
|
|
285
|
+
return html.replace(/<[^>]*>/g, "").replace(/&amp;/g, "&").replace(/&nbsp;/g, " ").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, "\"").replace(/"/g, "\"").replace(/'/g, "'").trim();
|
|
284
286
|
}
|
|
285
287
|
};
|
|
286
288
|
|
|
@@ -1017,31 +1019,31 @@ var AbstractScraper = class {
|
|
|
1017
1019
|
const instance = this.constructor;
|
|
1018
1020
|
if (this.recipeData) return this.recipeData;
|
|
1019
1021
|
this.recipeData = {
|
|
1020
|
-
host: instance.host(),
|
|
1021
|
-
siteName: await this.extract("siteName"),
|
|
1022
1022
|
author: await this.extract("author"),
|
|
1023
|
-
title: await this.extract("title"),
|
|
1024
|
-
image: await this.extract("image"),
|
|
1025
1023
|
canonicalUrl: this.canonicalUrl(),
|
|
1026
|
-
|
|
1027
|
-
|
|
1024
|
+
category: await this.extract("category"),
|
|
1025
|
+
cookTime: await this.extract("cookTime"),
|
|
1026
|
+
cookingMethod: await this.extract("cookingMethod"),
|
|
1027
|
+
cuisine: await this.extract("cuisine"),
|
|
1028
1028
|
description: await this.extract("description"),
|
|
1029
|
+
dietaryRestrictions: await this.extract("dietaryRestrictions"),
|
|
1030
|
+
equipment: await this.extract("equipment"),
|
|
1031
|
+
host: instance.host(),
|
|
1032
|
+
image: await this.extract("image"),
|
|
1029
1033
|
ingredients: await this.extract("ingredients"),
|
|
1030
1034
|
instructions: await this.extract("instructions"),
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
+
keywords: await this.extract("keywords"),
|
|
1036
|
+
language: this.language(),
|
|
1037
|
+
links: this.links(),
|
|
1038
|
+
nutrients: await this.extract("nutrients"),
|
|
1035
1039
|
prepTime: await this.extract("prepTime"),
|
|
1036
|
-
cuisine: await this.extract("cuisine"),
|
|
1037
|
-
cookingMethod: await this.extract("cookingMethod"),
|
|
1038
1040
|
ratings: await this.extract("ratings"),
|
|
1039
1041
|
ratingsCount: await this.extract("ratingsCount"),
|
|
1040
|
-
equipment: await this.extract("equipment"),
|
|
1041
1042
|
reviews: await this.extract("reviews"),
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1043
|
+
siteName: await this.extract("siteName"),
|
|
1044
|
+
title: await this.extract("title"),
|
|
1045
|
+
totalTime: await this.extract("totalTime"),
|
|
1046
|
+
yields: await this.extract("yields")
|
|
1045
1047
|
};
|
|
1046
1048
|
return this.recipeData;
|
|
1047
1049
|
}
|
package/package.json
CHANGED
package/src/abstract-scraper.ts
CHANGED
|
@@ -159,31 +159,31 @@ export abstract class AbstractScraper {
|
|
|
159
159
|
}
|
|
160
160
|
|
|
161
161
|
this.recipeData = {
|
|
162
|
-
host: instance.host(),
|
|
163
|
-
siteName: await this.extract('siteName'),
|
|
164
162
|
author: await this.extract('author'),
|
|
165
|
-
title: await this.extract('title'),
|
|
166
|
-
image: await this.extract('image'),
|
|
167
163
|
canonicalUrl: this.canonicalUrl(),
|
|
168
|
-
|
|
169
|
-
|
|
164
|
+
category: await this.extract('category'),
|
|
165
|
+
cookTime: await this.extract('cookTime'),
|
|
166
|
+
cookingMethod: await this.extract('cookingMethod'),
|
|
167
|
+
cuisine: await this.extract('cuisine'),
|
|
170
168
|
description: await this.extract('description'),
|
|
169
|
+
dietaryRestrictions: await this.extract('dietaryRestrictions'),
|
|
170
|
+
equipment: await this.extract('equipment'),
|
|
171
|
+
host: instance.host(),
|
|
172
|
+
image: await this.extract('image'),
|
|
171
173
|
ingredients: await this.extract('ingredients'),
|
|
172
174
|
instructions: await this.extract('instructions'),
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
175
|
+
keywords: await this.extract('keywords'),
|
|
176
|
+
language: this.language(),
|
|
177
|
+
links: this.links(),
|
|
178
|
+
nutrients: await this.extract('nutrients'),
|
|
177
179
|
prepTime: await this.extract('prepTime'),
|
|
178
|
-
cuisine: await this.extract('cuisine'),
|
|
179
|
-
cookingMethod: await this.extract('cookingMethod'),
|
|
180
180
|
ratings: await this.extract('ratings'),
|
|
181
181
|
ratingsCount: await this.extract('ratingsCount'),
|
|
182
|
-
equipment: await this.extract('equipment'),
|
|
183
182
|
reviews: await this.extract('reviews'),
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
183
|
+
siteName: await this.extract('siteName'),
|
|
184
|
+
title: await this.extract('title'),
|
|
185
|
+
totalTime: await this.extract('totalTime'),
|
|
186
|
+
yields: await this.extract('yields'),
|
|
187
187
|
}
|
|
188
188
|
|
|
189
189
|
return this.recipeData
|
|
@@ -20,6 +20,9 @@ describe('HtmlStripperPlugin', () => {
|
|
|
20
20
|
expect(plugin.process('title', '<span>Test &lt;tag&gt;</span>')).toBe(
|
|
21
21
|
'Test <tag>',
|
|
22
22
|
)
|
|
23
|
+
expect(plugin.process('description', '<span>Hello&nbsp;World</span>')).toBe(
|
|
24
|
+
'Hello World',
|
|
25
|
+
)
|
|
23
26
|
})
|
|
24
27
|
|
|
25
28
|
it('strips HTML from instructions Set<string>', () => {
|
|
@@ -69,6 +69,7 @@ export class HtmlStripperPlugin extends PostProcessorPlugin {
|
|
|
69
69
|
return html
|
|
70
70
|
.replace(/<[^>]*>/g, '') // Remove HTML tags
|
|
71
71
|
.replace(/&amp;/g, '&') // Decode common entities
|
|
72
|
+
.replace(/&nbsp;/g, ' ')
|
|
72
73
|
.replace(/&lt;/g, '<')
|
|
73
74
|
.replace(/&gt;/g, '>')
|
|
74
75
|
.replace(/&quot;/g, '"')
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { describe, expect, it } from 'bun:test'
|
|
2
2
|
import {
|
|
3
|
+
getHostName,
|
|
3
4
|
isDefined,
|
|
4
5
|
isFunction,
|
|
5
6
|
isNumber,
|
|
@@ -92,3 +93,36 @@ describe('isString', () => {
|
|
|
92
93
|
expect(isString({})).toBe(false)
|
|
93
94
|
})
|
|
94
95
|
})
|
|
96
|
+
describe('getHostName', () => {
|
|
97
|
+
it('should return the host for a standard URL', () => {
|
|
98
|
+
expect(getHostName('https://www.example.com/path')).toBe('example.com')
|
|
99
|
+
})
|
|
100
|
+
|
|
101
|
+
it('should return the host for a URL with a subdomain', () => {
|
|
102
|
+
expect(getHostName('http://sub.domain.co.uk/page?q=1')).toBe(
|
|
103
|
+
'sub.domain.co.uk',
|
|
104
|
+
)
|
|
105
|
+
})
|
|
106
|
+
|
|
107
|
+
it('should return the host for a URL without a path', () => {
|
|
108
|
+
expect(getHostName('https://anothersite.org')).toBe('anothersite.org')
|
|
109
|
+
})
|
|
110
|
+
|
|
111
|
+
it('should throw an error for an invalid URL string', () => {
|
|
112
|
+
const invalidUrl = 'not a url'
|
|
113
|
+
expect(() => getHostName(invalidUrl)).toThrow(
|
|
114
|
+
new Error(`Invalid URL: ${invalidUrl}`),
|
|
115
|
+
)
|
|
116
|
+
})
|
|
117
|
+
|
|
118
|
+
it('should throw an error for an empty string', () => {
|
|
119
|
+
expect(() => getHostName('')).toThrow(new Error('Invalid URL: '))
|
|
120
|
+
})
|
|
121
|
+
|
|
122
|
+
it('should throw an error for a string that looks like a host but lacks a protocol', () => {
|
|
123
|
+
const urlWithoutProtocol = 'example.com'
|
|
124
|
+
expect(() => getHostName(urlWithoutProtocol)).toThrow(
|
|
125
|
+
new Error(`Invalid URL: ${urlWithoutProtocol}`),
|
|
126
|
+
)
|
|
127
|
+
})
|
|
128
|
+
})
|
package/src/utils/index.ts
CHANGED
|
@@ -26,11 +26,13 @@ export function isString(value: unknown): value is string {
|
|
|
26
26
|
}
|
|
27
27
|
|
|
28
28
|
/**
|
|
29
|
-
* Extracts the host name from a URL string
|
|
29
|
+
* Extracts the host name from a URL string
|
|
30
|
+
* and removes 'www.' prefix if present.
|
|
31
|
+
* Throws an error if the input is not a valid URL.
|
|
30
32
|
*/
|
|
31
33
|
export function getHostName(value: string) {
|
|
32
34
|
try {
|
|
33
|
-
const url = new URL(value)
|
|
35
|
+
const url = new URL(value.replace('www.', ''))
|
|
34
36
|
return url.host
|
|
35
37
|
} catch {
|
|
36
38
|
throw new Error(`Invalid URL: ${value}`)
|