soustack 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -245,7 +245,7 @@ declare function validateRecipe(data: any): data is Recipe;
245
245
 
246
246
  declare function fromSchemaOrg(input: unknown): Recipe | null;
247
247
 
248
- interface SchemaOrgRecipe {
248
+ interface SchemaOrgRecipe$1 {
249
249
  '@context'?: string;
250
250
  '@type'?: string | string[];
251
251
  name: string;
@@ -270,7 +270,7 @@ interface SchemaOrgRecipe {
270
270
  '@graph'?: unknown;
271
271
  }
272
272
  type SchemaOrgIngredientList = string | string[];
273
- type SchemaOrgInstructionList = string | HowToStep | HowToSection | Array<string | HowToStep | HowToSection>;
273
+ type SchemaOrgInstructionList = string | HowToStep$1 | HowToSection | Array<string | HowToStep$1 | HowToSection>;
274
274
  interface SchemaOrgImageObject {
275
275
  '@type'?: string;
276
276
  url?: string;
@@ -282,7 +282,7 @@ interface SchemaOrgYield {
282
282
  unit?: string;
283
283
  description?: string;
284
284
  }
285
- interface HowToStep {
285
+ interface HowToStep$1 {
286
286
  '@type': 'HowToStep';
287
287
  text?: string;
288
288
  name?: string;
@@ -292,7 +292,7 @@ interface HowToStep {
292
292
  interface HowToSection {
293
293
  '@type': 'HowToSection';
294
294
  name: string;
295
- itemListElement: Array<string | HowToStep | HowToSection>;
295
+ itemListElement: Array<string | HowToStep$1 | HowToSection>;
296
296
  }
297
297
  interface SchemaOrgPersonOrOrganization {
298
298
  '@type'?: 'Person' | 'Organization';
@@ -303,17 +303,109 @@ interface NutritionInformation {
303
303
  [key: string]: string | number | null | undefined;
304
304
  }
305
305
 
306
- declare function toSchemaOrg(recipe: Recipe): SchemaOrgRecipe;
306
+ declare function toSchemaOrg(recipe: Recipe): SchemaOrgRecipe$1;
307
307
 
308
+ interface HowToStep {
309
+ '@type'?: 'HowToStep' | 'HowToSection' | string;
310
+ name?: string;
311
+ text?: string;
312
+ itemListElement?: Array<string | HowToStep>;
313
+ }
314
+ interface SchemaOrgRecipe {
315
+ '@type': string | string[];
316
+ name?: string;
317
+ description?: string;
318
+ image?: string | string[];
319
+ recipeIngredient?: string[];
320
+ recipeInstructions?: Array<string | HowToStep>;
321
+ recipeYield?: string | number;
322
+ prepTime?: string;
323
+ cookTime?: string;
324
+ totalTime?: string;
325
+ author?: unknown;
326
+ datePublished?: string;
327
+ aggregateRating?: unknown;
328
+ [key: string]: unknown;
329
+ }
330
+ interface FetchRequestInit {
331
+ headers?: Record<string, string>;
332
+ signal?: AbortSignal;
333
+ redirect?: 'follow' | 'error' | 'manual';
334
+ }
335
+ interface FetchResponse {
336
+ ok: boolean;
337
+ status: number;
338
+ statusText: string;
339
+ text(): Promise<string>;
340
+ }
341
+ type FetchImplementation = (url: string, init?: FetchRequestInit) => Promise<FetchResponse>;
308
342
  interface FetchOptions {
309
343
  timeout?: number;
310
344
  userAgent?: string;
311
345
  maxRetries?: number;
346
+ fetchFn?: FetchImplementation;
312
347
  }
313
348
  interface ScrapeRecipeOptions extends FetchOptions {
314
349
  }
315
350
 
351
+ /**
352
+ * Scrapes a recipe from a URL (Node.js only).
353
+ *
354
+ * ⚠️ Not available in browser environments due to CORS restrictions.
355
+ * For browser usage, fetch the HTML yourself and use extractRecipeFromHTML().
356
+ *
357
+ * @param url - The URL of the recipe page to scrape
358
+ * @param options - Fetch options (timeout, userAgent, maxRetries)
359
+ * @returns A Soustack recipe object
360
+ * @throws Error if no recipe is found
361
+ */
316
362
  declare function scrapeRecipe(url: string, options?: ScrapeRecipeOptions): Promise<Recipe>;
363
+ /**
364
+ * Extracts a recipe from HTML string (browser and Node.js compatible).
365
+ *
366
+ * This function works in both environments and doesn't require network access.
367
+ * Perfect for browser usage where you fetch HTML yourself (with cookies/session).
368
+ *
369
+ * @example
370
+ * ```ts
371
+ * // In browser:
372
+ * const response = await fetch('https://example.com/recipe');
373
+ * const html = await response.text();
374
+ * const recipe = extractRecipeFromHTML(html);
375
+ * ```
376
+ *
377
+ * @param html - The HTML string containing Schema.org recipe data
378
+ * @returns A Soustack recipe object
379
+ * @throws Error if no recipe is found
380
+ */
381
+ declare function extractRecipeFromHTML(html: string): Recipe;
382
+ /**
383
+ * Extract Schema.org recipe data from HTML string (browser-compatible).
384
+ *
385
+ * Returns the raw Schema.org recipe object, which can then be converted
386
+ * to Soustack format using fromSchemaOrg(). This gives you access to the
387
+ * original Schema.org data for inspection, debugging, or custom transformations.
388
+ *
389
+ * @param html - HTML string containing Schema.org recipe data
390
+ * @returns Schema.org recipe object, or null if not found
391
+ *
392
+ * @example
393
+ * ```ts
394
+ * // In browser:
395
+ * const response = await fetch('https://example.com/recipe');
396
+ * const html = await response.text();
397
+ * const schemaOrgRecipe = extractSchemaOrgRecipeFromHTML(html);
398
+ *
399
+ * if (schemaOrgRecipe) {
400
+ * // Inspect or modify Schema.org data before converting
401
+ * console.log('Found recipe:', schemaOrgRecipe.name);
402
+ *
403
+ * // Convert to Soustack format
404
+ * const soustackRecipe = fromSchemaOrg(schemaOrgRecipe);
405
+ * }
406
+ * ```
407
+ */
408
+ declare function extractSchemaOrgRecipeFromHTML(html: string): SchemaOrgRecipe | null;
317
409
 
318
410
  declare function normalizeIngredientInput(input: string): string;
319
411
  declare function parseIngredient(text: string): ParsedIngredient;
@@ -333,4 +425,4 @@ declare function normalizeYield(text: string): string;
333
425
  declare function parseYield(text: string): ParsedYield | null;
334
426
  declare function formatYield(value: ParsedYield): string;
335
427
 
336
- export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
428
+ export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SchemaOrgRecipe, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, extractRecipeFromHTML, extractSchemaOrgRecipeFromHTML, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
package/dist/index.d.ts CHANGED
@@ -245,7 +245,7 @@ declare function validateRecipe(data: any): data is Recipe;
245
245
 
246
246
  declare function fromSchemaOrg(input: unknown): Recipe | null;
247
247
 
248
- interface SchemaOrgRecipe {
248
+ interface SchemaOrgRecipe$1 {
249
249
  '@context'?: string;
250
250
  '@type'?: string | string[];
251
251
  name: string;
@@ -270,7 +270,7 @@ interface SchemaOrgRecipe {
270
270
  '@graph'?: unknown;
271
271
  }
272
272
  type SchemaOrgIngredientList = string | string[];
273
- type SchemaOrgInstructionList = string | HowToStep | HowToSection | Array<string | HowToStep | HowToSection>;
273
+ type SchemaOrgInstructionList = string | HowToStep$1 | HowToSection | Array<string | HowToStep$1 | HowToSection>;
274
274
  interface SchemaOrgImageObject {
275
275
  '@type'?: string;
276
276
  url?: string;
@@ -282,7 +282,7 @@ interface SchemaOrgYield {
282
282
  unit?: string;
283
283
  description?: string;
284
284
  }
285
- interface HowToStep {
285
+ interface HowToStep$1 {
286
286
  '@type': 'HowToStep';
287
287
  text?: string;
288
288
  name?: string;
@@ -292,7 +292,7 @@ interface HowToStep {
292
292
  interface HowToSection {
293
293
  '@type': 'HowToSection';
294
294
  name: string;
295
- itemListElement: Array<string | HowToStep | HowToSection>;
295
+ itemListElement: Array<string | HowToStep$1 | HowToSection>;
296
296
  }
297
297
  interface SchemaOrgPersonOrOrganization {
298
298
  '@type'?: 'Person' | 'Organization';
@@ -303,17 +303,109 @@ interface NutritionInformation {
303
303
  [key: string]: string | number | null | undefined;
304
304
  }
305
305
 
306
- declare function toSchemaOrg(recipe: Recipe): SchemaOrgRecipe;
306
+ declare function toSchemaOrg(recipe: Recipe): SchemaOrgRecipe$1;
307
307
 
308
+ interface HowToStep {
309
+ '@type'?: 'HowToStep' | 'HowToSection' | string;
310
+ name?: string;
311
+ text?: string;
312
+ itemListElement?: Array<string | HowToStep>;
313
+ }
314
+ interface SchemaOrgRecipe {
315
+ '@type': string | string[];
316
+ name?: string;
317
+ description?: string;
318
+ image?: string | string[];
319
+ recipeIngredient?: string[];
320
+ recipeInstructions?: Array<string | HowToStep>;
321
+ recipeYield?: string | number;
322
+ prepTime?: string;
323
+ cookTime?: string;
324
+ totalTime?: string;
325
+ author?: unknown;
326
+ datePublished?: string;
327
+ aggregateRating?: unknown;
328
+ [key: string]: unknown;
329
+ }
330
+ interface FetchRequestInit {
331
+ headers?: Record<string, string>;
332
+ signal?: AbortSignal;
333
+ redirect?: 'follow' | 'error' | 'manual';
334
+ }
335
+ interface FetchResponse {
336
+ ok: boolean;
337
+ status: number;
338
+ statusText: string;
339
+ text(): Promise<string>;
340
+ }
341
+ type FetchImplementation = (url: string, init?: FetchRequestInit) => Promise<FetchResponse>;
308
342
  interface FetchOptions {
309
343
  timeout?: number;
310
344
  userAgent?: string;
311
345
  maxRetries?: number;
346
+ fetchFn?: FetchImplementation;
312
347
  }
313
348
  interface ScrapeRecipeOptions extends FetchOptions {
314
349
  }
315
350
 
351
+ /**
352
+ * Scrapes a recipe from a URL (Node.js only).
353
+ *
354
+ * ⚠️ Not available in browser environments due to CORS restrictions.
355
+ * For browser usage, fetch the HTML yourself and use extractRecipeFromHTML().
356
+ *
357
+ * @param url - The URL of the recipe page to scrape
358
+ * @param options - Fetch options (timeout, userAgent, maxRetries)
359
+ * @returns A Soustack recipe object
360
+ * @throws Error if no recipe is found
361
+ */
316
362
  declare function scrapeRecipe(url: string, options?: ScrapeRecipeOptions): Promise<Recipe>;
363
+ /**
364
+ * Extracts a recipe from HTML string (browser and Node.js compatible).
365
+ *
366
+ * This function works in both environments and doesn't require network access.
367
+ * Perfect for browser usage where you fetch HTML yourself (with cookies/session).
368
+ *
369
+ * @example
370
+ * ```ts
371
+ * // In browser:
372
+ * const response = await fetch('https://example.com/recipe');
373
+ * const html = await response.text();
374
+ * const recipe = extractRecipeFromHTML(html);
375
+ * ```
376
+ *
377
+ * @param html - The HTML string containing Schema.org recipe data
378
+ * @returns A Soustack recipe object
379
+ * @throws Error if no recipe is found
380
+ */
381
+ declare function extractRecipeFromHTML(html: string): Recipe;
382
+ /**
383
+ * Extract Schema.org recipe data from HTML string (browser-compatible).
384
+ *
385
+ * Returns the raw Schema.org recipe object, which can then be converted
386
+ * to Soustack format using fromSchemaOrg(). This gives you access to the
387
+ * original Schema.org data for inspection, debugging, or custom transformations.
388
+ *
389
+ * @param html - HTML string containing Schema.org recipe data
390
+ * @returns Schema.org recipe object, or null if not found
391
+ *
392
+ * @example
393
+ * ```ts
394
+ * // In browser:
395
+ * const response = await fetch('https://example.com/recipe');
396
+ * const html = await response.text();
397
+ * const schemaOrgRecipe = extractSchemaOrgRecipeFromHTML(html);
398
+ *
399
+ * if (schemaOrgRecipe) {
400
+ * // Inspect or modify Schema.org data before converting
401
+ * console.log('Found recipe:', schemaOrgRecipe.name);
402
+ *
403
+ * // Convert to Soustack format
404
+ * const soustackRecipe = fromSchemaOrg(schemaOrgRecipe);
405
+ * }
406
+ * ```
407
+ */
408
+ declare function extractSchemaOrgRecipeFromHTML(html: string): SchemaOrgRecipe | null;
317
409
 
318
410
  declare function normalizeIngredientInput(input: string): string;
319
411
  declare function parseIngredient(text: string): ParsedIngredient;
@@ -333,4 +425,4 @@ declare function normalizeYield(text: string): string;
333
425
  declare function parseYield(text: string): ParsedYield | null;
334
426
  declare function formatYield(value: ParsedYield): string;
335
427
 
336
- export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
428
+ export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SchemaOrgRecipe, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, extractRecipeFromHTML, extractSchemaOrgRecipeFromHTML, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
package/dist/index.js CHANGED
@@ -1297,6 +1297,8 @@ function extractRecipeNode(input) {
1297
1297
  function hasRecipeType(value) {
1298
1298
  if (!value) return false;
1299
1299
  const types = Array.isArray(value) ? value : [value];
1300
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "fromSchemaOrg.ts:95", message: "hasRecipeType check", data: { types, typesLower: types.map((t) => typeof t === "string" ? t.toLowerCase() : t), isMatch: types.some((e) => typeof e === "string" && e.toLowerCase() === "recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
1301
+ });
1300
1302
  return types.some(
1301
1303
  (entry) => typeof entry === "string" && entry.toLowerCase() === "recipe"
1302
1304
  );
@@ -1653,18 +1655,26 @@ var DEFAULT_USER_AGENTS = [
1653
1655
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
1654
1656
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"
1655
1657
  ];
1656
- var fetchImpl = null;
1657
- async function ensureFetch() {
1658
- if (!fetchImpl) {
1659
- fetchImpl = import('node-fetch').then((mod) => mod.default);
1660
- }
1661
- return fetchImpl;
1662
- }
1663
1658
  function chooseUserAgent(provided) {
1664
1659
  if (provided) return provided;
1665
1660
  const index = Math.floor(Math.random() * DEFAULT_USER_AGENTS.length);
1666
1661
  return DEFAULT_USER_AGENTS[index];
1667
1662
  }
1663
+ function resolveFetch(fetchFn) {
1664
+ if (fetchFn) {
1665
+ return fetchFn;
1666
+ }
1667
+ const globalFetch = globalThis.fetch;
1668
+ if (!globalFetch) {
1669
+ throw new Error(
1670
+ "A global fetch implementation is not available. Provide window.fetch in browsers or upgrade to Node 18+."
1671
+ );
1672
+ }
1673
+ return globalFetch;
1674
+ }
1675
+ function isBrowserEnvironment() {
1676
+ return typeof globalThis.document !== "undefined";
1677
+ }
1668
1678
  function isClientError(error) {
1669
1679
  if (typeof error.status === "number") {
1670
1680
  return error.status >= 400 && error.status < 500;
@@ -1678,25 +1688,40 @@ async function fetchPage(url, options = {}) {
1678
1688
  const {
1679
1689
  timeout = 1e4,
1680
1690
  userAgent,
1681
- maxRetries = 2
1691
+ maxRetries = 2,
1692
+ fetchFn
1682
1693
  } = options;
1683
1694
  let lastError = null;
1695
+ const resolvedFetch = resolveFetch(fetchFn);
1696
+ const isBrowser2 = isBrowserEnvironment();
1684
1697
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
1685
1698
  const controller = new AbortController();
1686
1699
  const timeoutId = setTimeout(() => controller.abort(), timeout);
1687
1700
  try {
1688
- const fetch = await ensureFetch();
1689
1701
  const headers = {
1690
- "User-Agent": chooseUserAgent(userAgent),
1691
1702
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
1692
1703
  "Accept-Language": "en-US,en;q=0.5"
1693
1704
  };
1694
- const response = await fetch(url, {
1705
+ if (!isBrowser2) {
1706
+ headers["User-Agent"] = chooseUserAgent(userAgent);
1707
+ }
1708
+ const requestInit = {
1695
1709
  headers,
1696
1710
  signal: controller.signal,
1697
1711
  redirect: "follow"
1698
- });
1712
+ };
1713
+ const response = await resolvedFetch(url, requestInit);
1699
1714
  clearTimeout(timeoutId);
1715
+ if (response && (typeof process === "undefined" || process.env.NODE_ENV !== "test")) {
1716
+ try {
1717
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
1718
+ if (globalFetch) {
1719
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:63", message: "fetch response", data: { url, status: response.status, statusText: response.statusText, ok: response.ok, isNYTimes: url.includes("nytimes.com") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
1720
+ });
1721
+ }
1722
+ } catch {
1723
+ }
1724
+ }
1700
1725
  if (!response.ok) {
1701
1726
  const error = new Error(
1702
1727
  `HTTP ${response.status}: ${response.statusText}`
@@ -1704,7 +1729,18 @@ async function fetchPage(url, options = {}) {
1704
1729
  error.status = response.status;
1705
1730
  throw error;
1706
1731
  }
1707
- return await response.text();
1732
+ const html = await response.text();
1733
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
1734
+ try {
1735
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
1736
+ if (globalFetch) {
1737
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:75", message: "HTML received", data: { htmlLength: html.length, hasLoginPage: html.toLowerCase().includes("login") || html.toLowerCase().includes("sign in"), hasRecipeData: html.includes("application/ld+json") || html.includes("schema.org/Recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B,D" }) }).catch(() => {
1738
+ });
1739
+ }
1740
+ } catch {
1741
+ }
1742
+ }
1743
+ return html;
1708
1744
  } catch (err) {
1709
1745
  clearTimeout(timeoutId);
1710
1746
  lastError = err instanceof Error ? err : new Error(String(err));
@@ -1731,6 +1767,8 @@ function isRecipeNode(value) {
1731
1767
  return false;
1732
1768
  }
1733
1769
  const type = value["@type"];
1770
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/utils.ts:14", message: "isRecipeNode check", data: { type, typeLower: typeof type === "string" ? type.toLowerCase() : Array.isArray(type) ? type.map((t) => typeof t === "string" ? t.toLowerCase() : t) : void 0, isMatch: typeof type === "string" ? RECIPE_TYPES.has(type.toLowerCase()) : Array.isArray(type) ? type.some((e) => typeof e === "string" && RECIPE_TYPES.has(e.toLowerCase())) : false }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
1771
+ });
1734
1772
  if (typeof type === "string") {
1735
1773
  return RECIPE_TYPES.has(type.toLowerCase());
1736
1774
  }
@@ -1758,14 +1796,20 @@ function normalizeText(value) {
1758
1796
  function extractJsonLd(html) {
1759
1797
  const $ = cheerio.load(html);
1760
1798
  const scripts = $('script[type="application/ld+json"]');
1799
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:8", message: "JSON-LD scripts found", data: { scriptCount: scripts.length }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
1800
+ });
1761
1801
  const candidates = [];
1762
1802
  scripts.each((_, element) => {
1763
1803
  const content = $(element).html();
1764
1804
  if (!content) return;
1765
1805
  const parsed = safeJsonParse(content);
1766
1806
  if (!parsed) return;
1807
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:18", message: "JSON-LD parsed", data: { hasGraph: !!(parsed && typeof parsed === "object" && "@graph" in parsed), type: parsed && typeof parsed === "object" && "@type" in parsed ? parsed["@type"] : void 0 }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C" }) }).catch(() => {
1808
+ });
1767
1809
  collectCandidates(parsed, candidates);
1768
1810
  });
1811
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:22", message: "JSON-LD candidates", data: { candidateCount: candidates.length, candidateTypes: candidates.map((c) => c["@type"]) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C" }) }).catch(() => {
1812
+ });
1769
1813
  return candidates[0] ?? null;
1770
1814
  }
1771
1815
  function collectCandidates(payload, bucket) {
@@ -1837,13 +1881,139 @@ function findPropertyValue($, context, prop) {
1837
1881
  return normalizeText(node.attr("content")) || normalizeText(node.attr("href")) || normalizeText(node.attr("src")) || normalizeText(node.text());
1838
1882
  }
1839
1883
 
1884
+ // src/scraper/extractors/browser.ts
1885
+ var SIMPLE_PROPS2 = ["name", "description", "image", "recipeYield", "prepTime", "cookTime", "totalTime"];
1886
+ function extractRecipeBrowser(html) {
1887
+ const jsonLdRecipe = extractJsonLdBrowser(html);
1888
+ if (jsonLdRecipe) {
1889
+ return { recipe: jsonLdRecipe, source: "jsonld" };
1890
+ }
1891
+ const microdataRecipe = extractMicrodataBrowser(html);
1892
+ if (microdataRecipe) {
1893
+ return { recipe: microdataRecipe, source: "microdata" };
1894
+ }
1895
+ return { recipe: null, source: null };
1896
+ }
1897
+ function extractJsonLdBrowser(html) {
1898
+ if (typeof globalThis.DOMParser === "undefined") {
1899
+ return null;
1900
+ }
1901
+ const parser = new globalThis.DOMParser();
1902
+ const doc = parser.parseFromString(html, "text/html");
1903
+ const scripts = doc.querySelectorAll('script[type="application/ld+json"]');
1904
+ const candidates = [];
1905
+ scripts.forEach((script) => {
1906
+ const content = script.textContent;
1907
+ if (!content) return;
1908
+ const parsed = safeJsonParse(content);
1909
+ if (!parsed) return;
1910
+ collectCandidates2(parsed, candidates);
1911
+ });
1912
+ return candidates[0] ?? null;
1913
+ }
1914
+ function extractMicrodataBrowser(html) {
1915
+ if (typeof globalThis.DOMParser === "undefined") {
1916
+ return null;
1917
+ }
1918
+ const parser = new globalThis.DOMParser();
1919
+ const doc = parser.parseFromString(html, "text/html");
1920
+ const recipeEl = doc.querySelector('[itemscope][itemtype*="schema.org/Recipe"]');
1921
+ if (!recipeEl) {
1922
+ return null;
1923
+ }
1924
+ const recipe = {
1925
+ "@type": "Recipe"
1926
+ };
1927
+ SIMPLE_PROPS2.forEach((prop) => {
1928
+ const value = findPropertyValue2(recipeEl, prop);
1929
+ if (value) {
1930
+ recipe[prop] = value;
1931
+ }
1932
+ });
1933
+ const ingredients = [];
1934
+ recipeEl.querySelectorAll('[itemprop="recipeIngredient"]').forEach((el) => {
1935
+ const text = normalizeText(
1936
+ el.getAttribute("content") || el.textContent || void 0
1937
+ );
1938
+ if (text) ingredients.push(text);
1939
+ });
1940
+ if (ingredients.length) {
1941
+ recipe.recipeIngredient = ingredients;
1942
+ }
1943
+ const instructions = [];
1944
+ recipeEl.querySelectorAll('[itemprop="recipeInstructions"]').forEach((el) => {
1945
+ const text = normalizeText(el.getAttribute("content")) || normalizeText(el.querySelector('[itemprop="text"]')?.textContent || void 0) || normalizeText(el.textContent || void 0);
1946
+ if (text) instructions.push(text);
1947
+ });
1948
+ if (instructions.length) {
1949
+ recipe.recipeInstructions = instructions;
1950
+ }
1951
+ if (recipe.name || ingredients.length) {
1952
+ return recipe;
1953
+ }
1954
+ return null;
1955
+ }
1956
+ function findPropertyValue2(context, prop) {
1957
+ const node = context.querySelector(`[itemprop="${prop}"]`);
1958
+ if (!node) return void 0;
1959
+ return normalizeText(node.getAttribute("content")) || normalizeText(node.getAttribute("href")) || normalizeText(node.getAttribute("src")) || normalizeText(node.textContent || void 0);
1960
+ }
1961
+ function collectCandidates2(payload, bucket) {
1962
+ if (!payload) return;
1963
+ if (Array.isArray(payload)) {
1964
+ payload.forEach((entry) => collectCandidates2(entry, bucket));
1965
+ return;
1966
+ }
1967
+ if (typeof payload !== "object") {
1968
+ return;
1969
+ }
1970
+ if (isRecipeNode(payload)) {
1971
+ bucket.push(payload);
1972
+ return;
1973
+ }
1974
+ const graph = payload["@graph"];
1975
+ if (Array.isArray(graph)) {
1976
+ graph.forEach((entry) => collectCandidates2(entry, bucket));
1977
+ }
1978
+ }
1979
+
1840
1980
  // src/scraper/extractors/index.ts
1981
+ function isBrowser() {
1982
+ try {
1983
+ return typeof globalThis.DOMParser !== "undefined";
1984
+ } catch {
1985
+ return false;
1986
+ }
1987
+ }
1841
1988
  function extractRecipe(html) {
1989
+ if (isBrowser()) {
1990
+ return extractRecipeBrowser(html);
1991
+ }
1842
1992
  const jsonLdRecipe = extractJsonLd(html);
1993
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
1994
+ try {
1995
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
1996
+ if (globalFetch) {
1997
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:6", message: "JSON-LD extraction result", data: { hasJsonLd: !!jsonLdRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
1998
+ });
1999
+ }
2000
+ } catch {
2001
+ }
2002
+ }
1843
2003
  if (jsonLdRecipe) {
1844
2004
  return { recipe: jsonLdRecipe, source: "jsonld" };
1845
2005
  }
1846
2006
  const microdataRecipe = extractMicrodata(html);
2007
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
2008
+ try {
2009
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
2010
+ if (globalFetch) {
2011
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:12", message: "Microdata extraction result", data: { hasMicrodata: !!microdataRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "D" }) }).catch(() => {
2012
+ });
2013
+ }
2014
+ } catch {
2015
+ }
2016
+ }
1847
2017
  if (microdataRecipe) {
1848
2018
  return { recipe: microdataRecipe, source: "microdata" };
1849
2019
  }
@@ -1852,17 +2022,72 @@ function extractRecipe(html) {
1852
2022
 
1853
2023
  // src/scraper/index.ts
1854
2024
  async function scrapeRecipe(url, options = {}) {
2025
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
2026
+ try {
2027
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
2028
+ if (globalFetch) {
2029
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:7", message: "scrapeRecipe entry", data: { url, hasOptions: !!options }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,B,C,D,E" }) }).catch(() => {
2030
+ });
2031
+ }
2032
+ } catch {
2033
+ }
2034
+ }
1855
2035
  const html = await fetchPage(url, options);
2036
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
2037
+ try {
2038
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
2039
+ if (globalFetch) {
2040
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:9", message: "HTML fetched", data: { htmlLength: html?.length, htmlPreview: html?.substring(0, 200) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
2041
+ });
2042
+ }
2043
+ } catch {
2044
+ }
2045
+ }
1856
2046
  const { recipe } = extractRecipe(html);
2047
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
2048
+ try {
2049
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
2050
+ if (globalFetch) {
2051
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:11", message: "extractRecipe result", data: { hasRecipe: !!recipe, recipeType: recipe?.["@type"], recipeName: recipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C,D" }) }).catch(() => {
2052
+ });
2053
+ }
2054
+ } catch {
2055
+ }
2056
+ }
1857
2057
  if (!recipe) {
1858
2058
  throw new Error("No Schema.org recipe data found in page");
1859
2059
  }
1860
2060
  const soustackRecipe = fromSchemaOrg(recipe);
2061
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
2062
+ try {
2063
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
2064
+ if (globalFetch) {
2065
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:17", message: "fromSchemaOrg result", data: { hasSoustackRecipe: !!soustackRecipe, soustackRecipeName: soustackRecipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
2066
+ });
2067
+ }
2068
+ } catch {
2069
+ }
2070
+ }
2071
+ if (!soustackRecipe) {
2072
+ throw new Error("Schema.org data did not include a valid recipe");
2073
+ }
2074
+ return soustackRecipe;
2075
+ }
2076
+ function extractRecipeFromHTML(html) {
2077
+ const { recipe } = extractRecipe(html);
2078
+ if (!recipe) {
2079
+ throw new Error("No Schema.org recipe data found in HTML");
2080
+ }
2081
+ const soustackRecipe = fromSchemaOrg(recipe);
1861
2082
  if (!soustackRecipe) {
1862
2083
  throw new Error("Schema.org data did not include a valid recipe");
1863
2084
  }
1864
2085
  return soustackRecipe;
1865
2086
  }
2087
+ function extractSchemaOrgRecipeFromHTML(html) {
2088
+ const { recipe } = extractRecipe(html);
2089
+ return recipe;
2090
+ }
1866
2091
 
1867
2092
  // src/parsers/yield.ts
1868
2093
  var RANGE_PATTERN = /^(\d+)(?:\s*(?:[-–—]|to)\s*)(\d+)\s+(.+)$/i;
@@ -2106,6 +2331,8 @@ function wordToNumber(word) {
2106
2331
  return null;
2107
2332
  }
2108
2333
 
2334
+ exports.extractRecipeFromHTML = extractRecipeFromHTML;
2335
+ exports.extractSchemaOrgRecipeFromHTML = extractSchemaOrgRecipeFromHTML;
2109
2336
  exports.formatDuration = formatDuration;
2110
2337
  exports.formatYield = formatYield2;
2111
2338
  exports.fromSchemaOrg = fromSchemaOrg;