recipe-scrapers-js 0.1.0-alpha.3 → 0.1.0-alpha.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -18,11 +18,13 @@ function isString(value) {
18
18
  return typeof value === "string";
19
19
  }
20
20
  /**
21
- * Extracts the host name from a URL string.
21
+ * Extracts the host name from a URL string
22
+ * and removes 'www.' prefix if present.
23
+ * Throws an error if the input is not a valid URL.
22
24
  */
23
25
  function getHostName(value) {
24
26
  try {
25
- const url = new URL(value);
27
+ const url = new URL(value.replace("www.", ""));
26
28
  return url.host;
27
29
  } catch {
28
30
  throw new Error(`Invalid URL: ${value}`);
@@ -280,7 +282,7 @@ var HtmlStripperPlugin = class extends PostProcessorPlugin {
280
282
  return ingredients;
281
283
  }
282
284
  stripHtml(html) {
283
- return html.replace(/<[^>]*>/g, "").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, "\"").replace(/&#34;/g, "\"").replace(/&#39;/g, "'").trim();
285
+ return html.replace(/<[^>]*>/g, "").replace(/&amp;/g, "&").replace(/&nbsp;/g, " ").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, "\"").replace(/&#34;/g, "\"").replace(/&#39;/g, "'").trim();
284
286
  }
285
287
  };
286
288
 
@@ -1017,31 +1019,31 @@ var AbstractScraper = class {
1017
1019
  const instance = this.constructor;
1018
1020
  if (this.recipeData) return this.recipeData;
1019
1021
  this.recipeData = {
1020
- host: instance.host(),
1021
- siteName: await this.extract("siteName"),
1022
1022
  author: await this.extract("author"),
1023
- title: await this.extract("title"),
1024
- image: await this.extract("image"),
1025
1023
  canonicalUrl: this.canonicalUrl(),
1026
- language: this.language(),
1027
- links: this.links(),
1024
+ category: await this.extract("category"),
1025
+ cookTime: await this.extract("cookTime"),
1026
+ cookingMethod: await this.extract("cookingMethod"),
1027
+ cuisine: await this.extract("cuisine"),
1028
1028
  description: await this.extract("description"),
1029
+ dietaryRestrictions: await this.extract("dietaryRestrictions"),
1030
+ equipment: await this.extract("equipment"),
1031
+ host: instance.host(),
1032
+ image: await this.extract("image"),
1029
1033
  ingredients: await this.extract("ingredients"),
1030
1034
  instructions: await this.extract("instructions"),
1031
- category: await this.extract("category"),
1032
- yields: await this.extract("yields"),
1033
- totalTime: await this.extract("totalTime"),
1034
- cookTime: await this.extract("cookTime"),
1035
+ keywords: await this.extract("keywords"),
1036
+ language: this.language(),
1037
+ links: this.links(),
1038
+ nutrients: await this.extract("nutrients"),
1035
1039
  prepTime: await this.extract("prepTime"),
1036
- cuisine: await this.extract("cuisine"),
1037
- cookingMethod: await this.extract("cookingMethod"),
1038
1040
  ratings: await this.extract("ratings"),
1039
1041
  ratingsCount: await this.extract("ratingsCount"),
1040
- equipment: await this.extract("equipment"),
1041
1042
  reviews: await this.extract("reviews"),
1042
- nutrients: await this.extract("nutrients"),
1043
- dietaryRestrictions: await this.extract("dietaryRestrictions"),
1044
- keywords: await this.extract("keywords")
1043
+ siteName: await this.extract("siteName"),
1044
+ title: await this.extract("title"),
1045
+ totalTime: await this.extract("totalTime"),
1046
+ yields: await this.extract("yields")
1045
1047
  };
1046
1048
  return this.recipeData;
1047
1049
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "recipe-scrapers-js",
3
- "version": "0.1.0-alpha.3",
3
+ "version": "0.1.0-alpha.4",
4
4
  "license": "MIT",
5
5
  "description": "A recipe scrapers library",
6
6
  "author": {
@@ -159,31 +159,31 @@ export abstract class AbstractScraper {
159
159
  }
160
160
 
161
161
  this.recipeData = {
162
- host: instance.host(),
163
- siteName: await this.extract('siteName'),
164
162
  author: await this.extract('author'),
165
- title: await this.extract('title'),
166
- image: await this.extract('image'),
167
163
  canonicalUrl: this.canonicalUrl(),
168
- language: this.language(),
169
- links: this.links(),
164
+ category: await this.extract('category'),
165
+ cookTime: await this.extract('cookTime'),
166
+ cookingMethod: await this.extract('cookingMethod'),
167
+ cuisine: await this.extract('cuisine'),
170
168
  description: await this.extract('description'),
169
+ dietaryRestrictions: await this.extract('dietaryRestrictions'),
170
+ equipment: await this.extract('equipment'),
171
+ host: instance.host(),
172
+ image: await this.extract('image'),
171
173
  ingredients: await this.extract('ingredients'),
172
174
  instructions: await this.extract('instructions'),
173
- category: await this.extract('category'),
174
- yields: await this.extract('yields'),
175
- totalTime: await this.extract('totalTime'),
176
- cookTime: await this.extract('cookTime'),
175
+ keywords: await this.extract('keywords'),
176
+ language: this.language(),
177
+ links: this.links(),
178
+ nutrients: await this.extract('nutrients'),
177
179
  prepTime: await this.extract('prepTime'),
178
- cuisine: await this.extract('cuisine'),
179
- cookingMethod: await this.extract('cookingMethod'),
180
180
  ratings: await this.extract('ratings'),
181
181
  ratingsCount: await this.extract('ratingsCount'),
182
- equipment: await this.extract('equipment'),
183
182
  reviews: await this.extract('reviews'),
184
- nutrients: await this.extract('nutrients'),
185
- dietaryRestrictions: await this.extract('dietaryRestrictions'),
186
- keywords: await this.extract('keywords'),
183
+ siteName: await this.extract('siteName'),
184
+ title: await this.extract('title'),
185
+ totalTime: await this.extract('totalTime'),
186
+ yields: await this.extract('yields'),
187
187
  }
188
188
 
189
189
  return this.recipeData
@@ -20,6 +20,9 @@ describe('HtmlStripperPlugin', () => {
20
20
  expect(plugin.process('title', '<span>Test &lt;tag&gt;</span>')).toBe(
21
21
  'Test <tag>',
22
22
  )
23
+ expect(plugin.process('description', '<span>Hello&nbsp;World</span>')).toBe(
24
+ 'Hello World',
25
+ )
23
26
  })
24
27
 
25
28
  it('strips HTML from instructions Set<string>', () => {
@@ -69,6 +69,7 @@ export class HtmlStripperPlugin extends PostProcessorPlugin {
69
69
  return html
70
70
  .replace(/<[^>]*>/g, '') // Remove HTML tags
71
71
  .replace(/&amp;/g, '&') // Decode common entities
72
+ .replace(/&nbsp;/g, ' ')
72
73
  .replace(/&lt;/g, '<')
73
74
  .replace(/&gt;/g, '>')
74
75
  .replace(/&quot;/g, '"')
@@ -1,5 +1,6 @@
1
1
  import { describe, expect, it } from 'bun:test'
2
2
  import {
3
+ getHostName,
3
4
  isDefined,
4
5
  isFunction,
5
6
  isNumber,
@@ -92,3 +93,36 @@ describe('isString', () => {
92
93
  expect(isString({})).toBe(false)
93
94
  })
94
95
  })
96
+ describe('getHostName', () => {
97
+ it('should return the host for a standard URL', () => {
98
+ expect(getHostName('https://www.example.com/path')).toBe('example.com')
99
+ })
100
+
101
+ it('should return the host for a URL with a subdomain', () => {
102
+ expect(getHostName('http://sub.domain.co.uk/page?q=1')).toBe(
103
+ 'sub.domain.co.uk',
104
+ )
105
+ })
106
+
107
+ it('should return the host for a URL without a path', () => {
108
+ expect(getHostName('https://anothersite.org')).toBe('anothersite.org')
109
+ })
110
+
111
+ it('should throw an error for an invalid URL string', () => {
112
+ const invalidUrl = 'not a url'
113
+ expect(() => getHostName(invalidUrl)).toThrow(
114
+ new Error(`Invalid URL: ${invalidUrl}`),
115
+ )
116
+ })
117
+
118
+ it('should throw an error for an empty string', () => {
119
+ expect(() => getHostName('')).toThrow(new Error('Invalid URL: '))
120
+ })
121
+
122
+ it('should throw an error for a string that looks like a host but lacks a protocol', () => {
123
+ const urlWithoutProtocol = 'example.com'
124
+ expect(() => getHostName(urlWithoutProtocol)).toThrow(
125
+ new Error(`Invalid URL: ${urlWithoutProtocol}`),
126
+ )
127
+ })
128
+ })
@@ -26,11 +26,13 @@ export function isString(value: unknown): value is string {
26
26
  }
27
27
 
28
28
  /**
29
- * Extracts the host name from a URL string.
29
+ * Extracts the host name from a URL string
30
+ * and removes 'www.' prefix if present.
31
+ * Throws an error if the input is not a valid URL.
30
32
  */
31
33
  export function getHostName(value: string) {
32
34
  try {
33
- const url = new URL(value)
35
+ const url = new URL(value.replace('www.', ''))
34
36
  return url.host
35
37
  } catch {
36
38
  throw new Error(`Invalid URL: ${value}`)