recipe-scrapers-js 0.1.0-alpha.3 → 0.1.0-alpha.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +22 -20
- package/package.json +19 -9
- package/src/__tests__/abstract-extractor-plugin.test.ts +0 -234
- package/src/__tests__/abstract-scraper.test.ts +0 -201
- package/src/__tests__/logger.test.ts +0 -318
- package/src/__tests__/plugin-manager.test.ts +0 -64
- package/src/__tests__/recipe-extractor.test.ts +0 -103
- package/src/__tests__/scraper-diagnostics.test.ts +0 -102
- package/src/__tests__/setup.ts +0 -1
- package/src/abstract-extractor-plugin.ts +0 -16
- package/src/abstract-plugin.ts +0 -11
- package/src/abstract-postprocessor-plugin.ts +0 -13
- package/src/abstract-scraper.ts +0 -222
- package/src/constants.ts +0 -19
- package/src/exceptions/__tests__/index.test.ts +0 -44
- package/src/exceptions/index.ts +0 -33
- package/src/index.ts +0 -24
- package/src/logger.ts +0 -45
- package/src/plugin-manager.ts +0 -33
- package/src/plugins/__tests__/html-stripper.processor.test.ts +0 -60
- package/src/plugins/__tests__/opengraph.extractor.test.ts +0 -106
- package/src/plugins/html-stripper.processor.ts +0 -79
- package/src/plugins/opengraph.extractor.ts +0 -61
- package/src/plugins/schema-org.extractor/__tests__/index.test.ts +0 -136
- package/src/plugins/schema-org.extractor/__tests__/type-predicates.test.ts +0 -116
- package/src/plugins/schema-org.extractor/index.ts +0 -622
- package/src/plugins/schema-org.extractor/schema-org.interface.ts +0 -25
- package/src/plugins/schema-org.extractor/type-predicates.ts +0 -79
- package/src/recipe-extractor.ts +0 -93
- package/src/scraper-diagnostics.ts +0 -87
- package/src/scrapers/__tests__/scrapers.test.ts +0 -94
- package/src/scrapers/_index.ts +0 -19
- package/src/scrapers/allrecipes.ts +0 -9
- package/src/scrapers/bbcgoodfood.ts +0 -43
- package/src/scrapers/epicurious.ts +0 -17
- package/src/scrapers/nytimes.ts +0 -43
- package/src/scrapers/seriouseats.ts +0 -9
- package/src/scrapers/simplyrecipes.ts +0 -37
- package/src/types/recipe.interface.ts +0 -247
- package/src/types/scraper.interface.ts +0 -34
- package/src/utils/__tests__/index.test.ts +0 -94
- package/src/utils/__tests__/ingredients.test.ts +0 -439
- package/src/utils/__tests__/instructions.test.ts +0 -44
- package/src/utils/__tests__/microdata.test.ts +0 -93
- package/src/utils/__tests__/parse-yields.test.ts +0 -30
- package/src/utils/__tests__/parsing.test.ts +0 -69
- package/src/utils/fractions.ts +0 -60
- package/src/utils/index.ts +0 -38
- package/src/utils/ingredients.ts +0 -212
- package/src/utils/instructions.ts +0 -45
- package/src/utils/microdata.ts +0 -162
- package/src/utils/parse-yields.ts +0 -103
- package/src/utils/parsing.ts +0 -43
package/dist/index.js
CHANGED
|
@@ -18,11 +18,13 @@ function isString(value) {
|
|
|
18
18
|
return typeof value === "string";
|
|
19
19
|
}
|
|
20
20
|
/**
|
|
21
|
-
* Extracts the host name from a URL string
|
|
21
|
+
* Extracts the host name from a URL string
|
|
22
|
+
* and removes 'www.' prefix if present.
|
|
23
|
+
* Throws an error if the input is not a valid URL.
|
|
22
24
|
*/
|
|
23
25
|
function getHostName(value) {
|
|
24
26
|
try {
|
|
25
|
-
const url = new URL(value);
|
|
27
|
+
const url = new URL(value.replace("www.", ""));
|
|
26
28
|
return url.host;
|
|
27
29
|
} catch {
|
|
28
30
|
throw new Error(`Invalid URL: ${value}`);
|
|
@@ -280,7 +282,7 @@ var HtmlStripperPlugin = class extends PostProcessorPlugin {
|
|
|
280
282
|
return ingredients;
|
|
281
283
|
}
|
|
282
284
|
stripHtml(html) {
|
|
283
|
-
return html.replace(/<[^>]*>/g, "").replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, "\"").replace(/"/g, "\"").replace(/'/g, "'").trim();
|
|
285
|
+
return html.replace(/<[^>]*>/g, "").replace(/&/g, "&").replace(/ /g, " ").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, "\"").replace(/"/g, "\"").replace(/'/g, "'").trim();
|
|
284
286
|
}
|
|
285
287
|
};
|
|
286
288
|
|
|
@@ -650,7 +652,7 @@ var SchemaOrgPlugin = class SchemaOrgPlugin extends ExtractorPlugin {
|
|
|
650
652
|
"title",
|
|
651
653
|
"@id"
|
|
652
654
|
]) {
|
|
653
|
-
let text
|
|
655
|
+
let text;
|
|
654
656
|
if (isString(value)) text = value;
|
|
655
657
|
else if (isNumber(value)) text = value.toString();
|
|
656
658
|
else if (Array.isArray(value)) text = this.getSchemaTextValue(value[0], props);
|
|
@@ -1017,31 +1019,31 @@ var AbstractScraper = class {
|
|
|
1017
1019
|
const instance = this.constructor;
|
|
1018
1020
|
if (this.recipeData) return this.recipeData;
|
|
1019
1021
|
this.recipeData = {
|
|
1020
|
-
host: instance.host(),
|
|
1021
|
-
siteName: await this.extract("siteName"),
|
|
1022
1022
|
author: await this.extract("author"),
|
|
1023
|
-
title: await this.extract("title"),
|
|
1024
|
-
image: await this.extract("image"),
|
|
1025
1023
|
canonicalUrl: this.canonicalUrl(),
|
|
1026
|
-
|
|
1027
|
-
|
|
1024
|
+
category: await this.extract("category"),
|
|
1025
|
+
cookTime: await this.extract("cookTime"),
|
|
1026
|
+
cookingMethod: await this.extract("cookingMethod"),
|
|
1027
|
+
cuisine: await this.extract("cuisine"),
|
|
1028
1028
|
description: await this.extract("description"),
|
|
1029
|
+
dietaryRestrictions: await this.extract("dietaryRestrictions"),
|
|
1030
|
+
equipment: await this.extract("equipment"),
|
|
1031
|
+
host: instance.host(),
|
|
1032
|
+
image: await this.extract("image"),
|
|
1029
1033
|
ingredients: await this.extract("ingredients"),
|
|
1030
1034
|
instructions: await this.extract("instructions"),
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
+
keywords: await this.extract("keywords"),
|
|
1036
|
+
language: this.language(),
|
|
1037
|
+
links: this.links(),
|
|
1038
|
+
nutrients: await this.extract("nutrients"),
|
|
1035
1039
|
prepTime: await this.extract("prepTime"),
|
|
1036
|
-
cuisine: await this.extract("cuisine"),
|
|
1037
|
-
cookingMethod: await this.extract("cookingMethod"),
|
|
1038
1040
|
ratings: await this.extract("ratings"),
|
|
1039
1041
|
ratingsCount: await this.extract("ratingsCount"),
|
|
1040
|
-
equipment: await this.extract("equipment"),
|
|
1041
1042
|
reviews: await this.extract("reviews"),
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1043
|
+
siteName: await this.extract("siteName"),
|
|
1044
|
+
title: await this.extract("title"),
|
|
1045
|
+
totalTime: await this.extract("totalTime"),
|
|
1046
|
+
yields: await this.extract("yields")
|
|
1045
1047
|
};
|
|
1046
1048
|
return this.recipeData;
|
|
1047
1049
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recipe-scrapers-js",
|
|
3
|
-
"version": "0.1.0-alpha.3",
|
|
3
|
+
"version": "0.1.0-alpha.5",
|
|
4
4
|
"license": "MIT",
|
|
5
5
|
"description": "A recipe scrapers library",
|
|
6
6
|
"author": {
|
|
@@ -12,11 +12,21 @@
|
|
|
12
12
|
"url": "git+https://github.com/nerdstep/recipe-scrapers-js.git"
|
|
13
13
|
},
|
|
14
14
|
"type": "module",
|
|
15
|
-
"module": "
|
|
15
|
+
"module": "dist/index.js",
|
|
16
16
|
"main": "dist/index.js",
|
|
17
17
|
"types": "dist/index.d.ts",
|
|
18
|
-
"files": [
|
|
19
|
-
|
|
18
|
+
"files": [
|
|
19
|
+
"dist",
|
|
20
|
+
"README.md",
|
|
21
|
+
"LICENSE"
|
|
22
|
+
],
|
|
23
|
+
"keywords": [
|
|
24
|
+
"recipe",
|
|
25
|
+
"scraper",
|
|
26
|
+
"parser",
|
|
27
|
+
"food",
|
|
28
|
+
"cooking"
|
|
29
|
+
],
|
|
20
30
|
"scripts": {
|
|
21
31
|
"build": "tsdown src/index.ts --outdir dist",
|
|
22
32
|
"test": "bun test",
|
|
@@ -29,17 +39,17 @@
|
|
|
29
39
|
"prepublishOnly": "bun run lint && bun run build"
|
|
30
40
|
},
|
|
31
41
|
"peerDependencies": {
|
|
32
|
-
"cheerio": "^1.
|
|
42
|
+
"cheerio": "^1.1.0"
|
|
33
43
|
},
|
|
34
44
|
"dependencies": {
|
|
35
45
|
"iso8601-duration": "^2.1.2",
|
|
36
46
|
"schema-dts": "^1.1.5"
|
|
37
47
|
},
|
|
38
48
|
"devDependencies": {
|
|
39
|
-
"@biomejs/biome": "^
|
|
40
|
-
"@types/bun": "^1.2.
|
|
41
|
-
"cheerio": "^1.
|
|
42
|
-
"tsdown": "^0.12.
|
|
49
|
+
"@biomejs/biome": "^2.0.6",
|
|
50
|
+
"@types/bun": "^1.2.17",
|
|
51
|
+
"cheerio": "^1.1.0",
|
|
52
|
+
"tsdown": "^0.12.9",
|
|
43
53
|
"typescript": "^5.8.3"
|
|
44
54
|
}
|
|
45
55
|
}
|
|
@@ -1,234 +0,0 @@
|
|
|
1
|
-
import { beforeEach, describe, expect, it } from 'bun:test'
|
|
2
|
-
import {
|
|
3
|
-
NotImplementedException,
|
|
4
|
-
UnsupportedFieldException,
|
|
5
|
-
} from '@/exceptions'
|
|
6
|
-
import { load } from 'cheerio'
|
|
7
|
-
import { ExtractorPlugin } from '../abstract-extractor-plugin'
|
|
8
|
-
import type { RecipeFields } from '../types/recipe.interface'
|
|
9
|
-
|
|
10
|
-
class MockExtractorPlugin extends ExtractorPlugin {
|
|
11
|
-
name = 'MockExtractorPlugin'
|
|
12
|
-
priority = 100
|
|
13
|
-
|
|
14
|
-
private supportedFields: Set<keyof RecipeFields>
|
|
15
|
-
|
|
16
|
-
constructor(supportedFields: (keyof RecipeFields)[] = []) {
|
|
17
|
-
const $ = load('<html><body></body></html>')
|
|
18
|
-
super($)
|
|
19
|
-
this.supportedFields = new Set(supportedFields)
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
supports(field: keyof RecipeFields): boolean {
|
|
23
|
-
return this.supportedFields.has(field)
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
extract<Key extends keyof RecipeFields>(field: Key): RecipeFields[Key] {
|
|
27
|
-
if (!this.supports(field)) {
|
|
28
|
-
throw new UnsupportedFieldException(field)
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
// Mock extraction logic
|
|
32
|
-
switch (field) {
|
|
33
|
-
case 'title':
|
|
34
|
-
return 'Mock Recipe Title' as RecipeFields[Key]
|
|
35
|
-
case 'description':
|
|
36
|
-
return 'Mock Recipe Description' as RecipeFields[Key]
|
|
37
|
-
case 'ingredients':
|
|
38
|
-
return new Set(['ingredient 1', 'ingredient 2']) as RecipeFields[Key]
|
|
39
|
-
case 'instructions':
|
|
40
|
-
return new Set(['step 1', 'step 2']) as RecipeFields[Key]
|
|
41
|
-
case 'prepTime':
|
|
42
|
-
return 15 as RecipeFields[Key]
|
|
43
|
-
case 'cookTime':
|
|
44
|
-
return 30 as RecipeFields[Key]
|
|
45
|
-
case 'totalTime':
|
|
46
|
-
return 45 as RecipeFields[Key]
|
|
47
|
-
case 'yields':
|
|
48
|
-
return '4 servings' as RecipeFields[Key]
|
|
49
|
-
default:
|
|
50
|
-
throw new NotImplementedException(field)
|
|
51
|
-
}
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
class AsyncMockExtractorPlugin extends ExtractorPlugin {
|
|
56
|
-
name = 'AsyncMockExtractorPlugin'
|
|
57
|
-
priority = 100
|
|
58
|
-
|
|
59
|
-
constructor() {
|
|
60
|
-
const $ = load('<html><body></body></html>')
|
|
61
|
-
super($)
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
supports(field: keyof RecipeFields): boolean {
|
|
65
|
-
return ['title', 'description'].includes(field)
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
async extract<Key extends keyof RecipeFields>(
|
|
69
|
-
field: Key,
|
|
70
|
-
): Promise<RecipeFields[Key]> {
|
|
71
|
-
await new Promise((resolve) => setTimeout(resolve, 10))
|
|
72
|
-
|
|
73
|
-
if (!this.supports(field)) {
|
|
74
|
-
throw new UnsupportedFieldException(field)
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
switch (field) {
|
|
78
|
-
case 'title':
|
|
79
|
-
return 'Async Recipe Title' as RecipeFields[Key]
|
|
80
|
-
case 'description':
|
|
81
|
-
return 'Async Recipe Description' as RecipeFields[Key]
|
|
82
|
-
default:
|
|
83
|
-
throw new NotImplementedException(field)
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
class ThrowingExtractorPlugin extends ExtractorPlugin {
|
|
89
|
-
name = 'ThrowingExtractorPlugin'
|
|
90
|
-
priority = 100
|
|
91
|
-
|
|
92
|
-
constructor() {
|
|
93
|
-
const $ = load('<html><body></body></html>')
|
|
94
|
-
super($)
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
supports(field: keyof RecipeFields): boolean {
|
|
98
|
-
return true
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
extract<Key extends keyof RecipeFields>(field: Key): RecipeFields[Key] {
|
|
102
|
-
throw new Error(`Extraction failed for field: ${String(field)}`)
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
describe('ExtractorPlugin', () => {
|
|
107
|
-
let plugin: MockExtractorPlugin
|
|
108
|
-
|
|
109
|
-
beforeEach(() => {
|
|
110
|
-
plugin = new MockExtractorPlugin([
|
|
111
|
-
'title',
|
|
112
|
-
'description',
|
|
113
|
-
'ingredients',
|
|
114
|
-
'prepTime',
|
|
115
|
-
])
|
|
116
|
-
})
|
|
117
|
-
|
|
118
|
-
describe('inheritance', () => {
|
|
119
|
-
it('should extend AbstractPlugin', () => {
|
|
120
|
-
expect(plugin).toBeInstanceOf(ExtractorPlugin)
|
|
121
|
-
})
|
|
122
|
-
|
|
123
|
-
it('should have access to cheerio instance from parent', () => {
|
|
124
|
-
expect(plugin.$).toBeDefined()
|
|
125
|
-
expect(typeof plugin.$).toBe('function')
|
|
126
|
-
})
|
|
127
|
-
})
|
|
128
|
-
|
|
129
|
-
describe('supports method', () => {
|
|
130
|
-
it('should return true for supported fields', () => {
|
|
131
|
-
expect(plugin.supports('title')).toBe(true)
|
|
132
|
-
expect(plugin.supports('description')).toBe(true)
|
|
133
|
-
expect(plugin.supports('ingredients')).toBe(true)
|
|
134
|
-
expect(plugin.supports('prepTime')).toBe(true)
|
|
135
|
-
})
|
|
136
|
-
|
|
137
|
-
it('should return false for unsupported fields', () => {
|
|
138
|
-
expect(plugin.supports('cookTime')).toBe(false)
|
|
139
|
-
expect(plugin.supports('totalTime')).toBe(false)
|
|
140
|
-
expect(plugin.supports('yields')).toBe(false)
|
|
141
|
-
expect(plugin.supports('author')).toBe(false)
|
|
142
|
-
})
|
|
143
|
-
|
|
144
|
-
it('should handle empty supported fields', () => {
|
|
145
|
-
const emptyPlugin = new MockExtractorPlugin([])
|
|
146
|
-
expect(emptyPlugin.supports('title')).toBe(false)
|
|
147
|
-
expect(emptyPlugin.supports('description')).toBe(false)
|
|
148
|
-
})
|
|
149
|
-
|
|
150
|
-
it('should handle all fields as supported', () => {
|
|
151
|
-
const allFieldsPlugin = new MockExtractorPlugin([
|
|
152
|
-
'title',
|
|
153
|
-
'description',
|
|
154
|
-
'ingredients',
|
|
155
|
-
'instructions',
|
|
156
|
-
'prepTime',
|
|
157
|
-
'cookTime',
|
|
158
|
-
'totalTime',
|
|
159
|
-
'yields',
|
|
160
|
-
])
|
|
161
|
-
|
|
162
|
-
expect(allFieldsPlugin.supports('title')).toBe(true)
|
|
163
|
-
expect(allFieldsPlugin.supports('cookTime')).toBe(true)
|
|
164
|
-
expect(allFieldsPlugin.supports('yields')).toBe(true)
|
|
165
|
-
})
|
|
166
|
-
})
|
|
167
|
-
|
|
168
|
-
describe('extract method', () => {
|
|
169
|
-
it('should extract supported fields', () => {
|
|
170
|
-
expect(plugin.extract('title')).toBe('Mock Recipe Title')
|
|
171
|
-
expect(plugin.extract('description')).toBe('Mock Recipe Description')
|
|
172
|
-
expect(plugin.extract('prepTime')).toBe(15)
|
|
173
|
-
expect(plugin.extract('ingredients')).toEqual(
|
|
174
|
-
new Set(['ingredient 1', 'ingredient 2']),
|
|
175
|
-
)
|
|
176
|
-
})
|
|
177
|
-
|
|
178
|
-
it('should throw error for unsupported fields', () => {
|
|
179
|
-
expect(() => plugin.extract('cookTime')).toThrow(
|
|
180
|
-
'Extraction not supported for field: cookTime',
|
|
181
|
-
)
|
|
182
|
-
expect(() => plugin.extract('totalTime')).toThrow(
|
|
183
|
-
'Extraction not supported for field: totalTime',
|
|
184
|
-
)
|
|
185
|
-
})
|
|
186
|
-
})
|
|
187
|
-
|
|
188
|
-
describe('async extraction', () => {
|
|
189
|
-
let asyncPlugin: AsyncMockExtractorPlugin
|
|
190
|
-
|
|
191
|
-
beforeEach(() => {
|
|
192
|
-
asyncPlugin = new AsyncMockExtractorPlugin()
|
|
193
|
-
})
|
|
194
|
-
|
|
195
|
-
it('should handle async extraction', async () => {
|
|
196
|
-
const title = await asyncPlugin.extract('title')
|
|
197
|
-
expect(title).toBe('Async Recipe Title')
|
|
198
|
-
const description = await asyncPlugin.extract('description')
|
|
199
|
-
expect(description).toBe('Async Recipe Description')
|
|
200
|
-
})
|
|
201
|
-
|
|
202
|
-
it('should throw error for unsupported fields in async mode', async () => {
|
|
203
|
-
await expect(asyncPlugin.extract('cookTime')).rejects.toThrow(
|
|
204
|
-
'Extraction not supported for field: cookTime',
|
|
205
|
-
)
|
|
206
|
-
})
|
|
207
|
-
})
|
|
208
|
-
|
|
209
|
-
describe('error handling', () => {
|
|
210
|
-
let throwingPlugin: ThrowingExtractorPlugin
|
|
211
|
-
|
|
212
|
-
beforeEach(() => {
|
|
213
|
-
throwingPlugin = new ThrowingExtractorPlugin()
|
|
214
|
-
})
|
|
215
|
-
|
|
216
|
-
it('should propagate extraction errors', () => {
|
|
217
|
-
expect(() => throwingPlugin.extract('title')).toThrow(
|
|
218
|
-
'Extraction failed for field: title',
|
|
219
|
-
)
|
|
220
|
-
expect(() => throwingPlugin.extract('description')).toThrow(
|
|
221
|
-
'Extraction failed for field: description',
|
|
222
|
-
)
|
|
223
|
-
})
|
|
224
|
-
})
|
|
225
|
-
|
|
226
|
-
describe('edge cases', () => {
|
|
227
|
-
it('should throw on undefined extractor', () => {
|
|
228
|
-
const plugin = new MockExtractorPlugin(['author'])
|
|
229
|
-
expect(() => plugin.extract('author')).toThrow(
|
|
230
|
-
'Method should be implemented: author',
|
|
231
|
-
)
|
|
232
|
-
})
|
|
233
|
-
})
|
|
234
|
-
})
|
|
@@ -1,201 +0,0 @@
|
|
|
1
|
-
import { afterEach, beforeEach, describe, expect, it, spyOn } from 'bun:test'
|
|
2
|
-
import { AbstractScraper } from '@/abstract-scraper'
|
|
3
|
-
import { NotImplementedException } from '@/exceptions'
|
|
4
|
-
import { Logger } from '@/logger'
|
|
5
|
-
import type { RecipeFields, RecipeObject } from '@/types/recipe.interface'
|
|
6
|
-
|
|
7
|
-
class DummyScraper extends AbstractScraper {
|
|
8
|
-
// implement required static host
|
|
9
|
-
static host(): string {
|
|
10
|
-
return 'dummy.com'
|
|
11
|
-
}
|
|
12
|
-
// no site-specific extractors
|
|
13
|
-
extractors = {}
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
describe('AbstractScraper utility methods', () => {
|
|
17
|
-
let scraper: DummyScraper
|
|
18
|
-
|
|
19
|
-
describe('static host()', () => {
|
|
20
|
-
it('throws by default on base class', () => {
|
|
21
|
-
expect(() => AbstractScraper.host()).toThrow(NotImplementedException)
|
|
22
|
-
})
|
|
23
|
-
|
|
24
|
-
it('returns host for subclass', () => {
|
|
25
|
-
expect(DummyScraper.host()).toBe('dummy.com')
|
|
26
|
-
})
|
|
27
|
-
})
|
|
28
|
-
|
|
29
|
-
describe('canonicalUrl()', () => {
|
|
30
|
-
it('returns absolute canonical URL when provided', () => {
|
|
31
|
-
const html = '<link rel="canonical" href="/foo/bar"/>'
|
|
32
|
-
scraper = new DummyScraper(html, 'http://example.com/page', {})
|
|
33
|
-
expect(scraper.canonicalUrl()).toBe('http://example.com/foo/bar')
|
|
34
|
-
})
|
|
35
|
-
|
|
36
|
-
it('returns base URL when no canonical link', () => {
|
|
37
|
-
const html = '<html></html>'
|
|
38
|
-
scraper = new DummyScraper(html, 'https://site.org/path?x=1', {})
|
|
39
|
-
expect(scraper.canonicalUrl()).toBe('https://site.org/path?x=1')
|
|
40
|
-
})
|
|
41
|
-
|
|
42
|
-
it('prefixes URL with https when missing protocol', () => {
|
|
43
|
-
const html = ''
|
|
44
|
-
scraper = new DummyScraper(html, 'site.org/abc', {})
|
|
45
|
-
expect(scraper.canonicalUrl()).toBe('https://site.org/abc')
|
|
46
|
-
})
|
|
47
|
-
})
|
|
48
|
-
|
|
49
|
-
describe('language()', () => {
|
|
50
|
-
let warnSpy: ReturnType<typeof spyOn>
|
|
51
|
-
|
|
52
|
-
beforeEach(() => {
|
|
53
|
-
warnSpy = spyOn(Logger.prototype, 'warn').mockImplementation(() => {})
|
|
54
|
-
})
|
|
55
|
-
afterEach(() => {
|
|
56
|
-
warnSpy.mockRestore()
|
|
57
|
-
})
|
|
58
|
-
|
|
59
|
-
it('reads html lang attribute', () => {
|
|
60
|
-
const html = '<html lang="fr"><body></body></html>'
|
|
61
|
-
scraper = new DummyScraper(html, 'url', {})
|
|
62
|
-
expect(scraper.language()).toBe('fr')
|
|
63
|
-
expect(warnSpy).not.toHaveBeenCalled()
|
|
64
|
-
})
|
|
65
|
-
|
|
66
|
-
it('falls back to meta http-equiv content-language', () => {
|
|
67
|
-
const html =
|
|
68
|
-
'<html><head>' +
|
|
69
|
-
'<meta http-equiv="content-language" content="de, en"/>' +
|
|
70
|
-
'</head></html>'
|
|
71
|
-
scraper = new DummyScraper(html, 'url', {})
|
|
72
|
-
expect(scraper.language()).toBe('de')
|
|
73
|
-
expect(warnSpy).not.toHaveBeenCalled()
|
|
74
|
-
})
|
|
75
|
-
|
|
76
|
-
it('defaults to "en" and logs warning when none found', () => {
|
|
77
|
-
scraper = new DummyScraper('<html></html>', 'url', {})
|
|
78
|
-
expect(scraper.language()).toBe('en')
|
|
79
|
-
expect(warnSpy).toHaveBeenCalledWith('Could not determine language')
|
|
80
|
-
})
|
|
81
|
-
})
|
|
82
|
-
|
|
83
|
-
describe('links()', () => {
|
|
84
|
-
const html = `
|
|
85
|
-
<a href="http://foo.com/page">Foo</a>
|
|
86
|
-
<a href="/local">Local</a>
|
|
87
|
-
<a>No href</a>
|
|
88
|
-
`
|
|
89
|
-
it('returns empty list when linksEnabled is false', () => {
|
|
90
|
-
scraper = new DummyScraper(html, 'url', { linksEnabled: false })
|
|
91
|
-
expect(scraper.links()).toEqual([])
|
|
92
|
-
})
|
|
93
|
-
|
|
94
|
-
it('returns only absolute links when linksEnabled is true', () => {
|
|
95
|
-
scraper = new DummyScraper(html, 'url', { linksEnabled: true })
|
|
96
|
-
const links = scraper.links()
|
|
97
|
-
expect(links).toEqual([{ href: 'http://foo.com/page', text: 'Foo' }])
|
|
98
|
-
})
|
|
99
|
-
})
|
|
100
|
-
})
|
|
101
|
-
|
|
102
|
-
// Test subclass overriding extract, canonicalUrl, language, links, and host
|
|
103
|
-
class TestScraper extends AbstractScraper {
|
|
104
|
-
static host(): string {
|
|
105
|
-
return 'hostVal'
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
// Provide no real HTML parsing
|
|
109
|
-
extractors = {}
|
|
110
|
-
private data: Partial<Record<keyof RecipeFields, unknown>>
|
|
111
|
-
constructor(data: Partial<Record<keyof RecipeFields, unknown>>) {
|
|
112
|
-
// html, url and options are unused because we override methods
|
|
113
|
-
super('', '', { linksEnabled: true })
|
|
114
|
-
this.data = data
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
// Return mocked values for every field
|
|
118
|
-
async extract<Key extends keyof RecipeFields>(
|
|
119
|
-
field: Key,
|
|
120
|
-
): Promise<RecipeFields[Key]> {
|
|
121
|
-
return this.data[field] as RecipeFields[Key]
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
override canonicalUrl(): string {
|
|
125
|
-
return this.data.canonicalUrl as string
|
|
126
|
-
}
|
|
127
|
-
override language(): string {
|
|
128
|
-
return this.data.language as string
|
|
129
|
-
}
|
|
130
|
-
override links(): RecipeFields['links'] {
|
|
131
|
-
return this.data.links as RecipeFields['links']
|
|
132
|
-
}
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
describe('AbstractScraper.toObject', () => {
|
|
136
|
-
it('returns a fully serialized RecipeObject', async () => {
|
|
137
|
-
// Prepare mock values
|
|
138
|
-
const mockValues: Partial<Record<keyof RecipeFields, unknown>> = {
|
|
139
|
-
siteName: 'site',
|
|
140
|
-
author: 'auth',
|
|
141
|
-
title: 'ttl',
|
|
142
|
-
image: 'img',
|
|
143
|
-
description: 'desc',
|
|
144
|
-
yields: '4 servings',
|
|
145
|
-
totalTime: 30,
|
|
146
|
-
cookTime: 10,
|
|
147
|
-
prepTime: 20,
|
|
148
|
-
cookingMethod: 'bake',
|
|
149
|
-
ratings: 4.2,
|
|
150
|
-
ratingsCount: 100,
|
|
151
|
-
category: new Set(['cat1', 'cat2']),
|
|
152
|
-
cuisine: new Set(['cui']),
|
|
153
|
-
dietaryRestrictions: new Set(['veg']),
|
|
154
|
-
equipment: new Set(['pan']),
|
|
155
|
-
ingredients: new Set(['ing1', 'ing2']),
|
|
156
|
-
instructions: new Set(['step1', 'step2']),
|
|
157
|
-
keywords: new Set(['kw1']),
|
|
158
|
-
nutrients: new Map([['cal', '200kcal']]),
|
|
159
|
-
reviews: new Map([['rev1', 'Good']]),
|
|
160
|
-
canonicalUrl: 'http://can.url',
|
|
161
|
-
language: 'en-US',
|
|
162
|
-
links: [{ href: 'http://link', text: 'LinkText' }],
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
const scraper = new TestScraper(mockValues)
|
|
166
|
-
const result = await scraper.toObject()
|
|
167
|
-
|
|
168
|
-
// Basic scalar fields
|
|
169
|
-
const expectedRest = {
|
|
170
|
-
host: 'hostVal',
|
|
171
|
-
siteName: 'site',
|
|
172
|
-
author: 'auth',
|
|
173
|
-
title: 'ttl',
|
|
174
|
-
image: 'img',
|
|
175
|
-
canonicalUrl: 'http://can.url',
|
|
176
|
-
language: 'en-US',
|
|
177
|
-
links: [{ href: 'http://link', text: 'LinkText' }],
|
|
178
|
-
description: 'desc',
|
|
179
|
-
yields: '4 servings',
|
|
180
|
-
totalTime: 30,
|
|
181
|
-
cookTime: 10,
|
|
182
|
-
prepTime: 20,
|
|
183
|
-
cookingMethod: 'bake',
|
|
184
|
-
ratings: 4.2,
|
|
185
|
-
ratingsCount: 100,
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
expect(result).toEqual({
|
|
189
|
-
...expectedRest,
|
|
190
|
-
category: ['cat1', 'cat2'],
|
|
191
|
-
cuisine: ['cui'],
|
|
192
|
-
dietaryRestrictions: ['veg'],
|
|
193
|
-
equipment: ['pan'],
|
|
194
|
-
ingredients: ['ing1', 'ing2'],
|
|
195
|
-
instructions: ['step1', 'step2'],
|
|
196
|
-
keywords: ['kw1'],
|
|
197
|
-
nutrients: { cal: '200kcal' },
|
|
198
|
-
reviews: { rev1: 'Good' },
|
|
199
|
-
} as RecipeObject)
|
|
200
|
-
})
|
|
201
|
-
})
|