soustack 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -305,15 +305,58 @@ interface NutritionInformation {
305
305
 
306
306
  declare function toSchemaOrg(recipe: Recipe): SchemaOrgRecipe;
307
307
 
308
+ interface FetchRequestInit {
309
+ headers?: Record<string, string>;
310
+ signal?: AbortSignal;
311
+ redirect?: 'follow' | 'error' | 'manual';
312
+ }
313
+ interface FetchResponse {
314
+ ok: boolean;
315
+ status: number;
316
+ statusText: string;
317
+ text(): Promise<string>;
318
+ }
319
+ type FetchImplementation = (url: string, init?: FetchRequestInit) => Promise<FetchResponse>;
308
320
  interface FetchOptions {
309
321
  timeout?: number;
310
322
  userAgent?: string;
311
323
  maxRetries?: number;
324
+ fetchFn?: FetchImplementation;
312
325
  }
313
326
  interface ScrapeRecipeOptions extends FetchOptions {
314
327
  }
315
328
 
329
+ /**
330
+ * Scrapes a recipe from a URL (Node.js only).
331
+ *
332
+ * ⚠️ Not available in browser environments due to CORS restrictions.
333
+ * For browser usage, fetch the HTML yourself and use extractRecipeFromHTML().
334
+ *
335
+ * @param url - The URL of the recipe page to scrape
336
+ * @param options - Fetch options (timeout, userAgent, maxRetries)
337
+ * @returns A Soustack recipe object
338
+ * @throws Error if no recipe is found
339
+ */
316
340
  declare function scrapeRecipe(url: string, options?: ScrapeRecipeOptions): Promise<Recipe>;
341
+ /**
342
+ * Extracts a recipe from HTML string (browser and Node.js compatible).
343
+ *
344
+ * This function works in both environments and doesn't require network access.
345
+ * Perfect for browser usage where you fetch HTML yourself (with cookies/session).
346
+ *
347
+ * @example
348
+ * ```ts
349
+ * // In browser:
350
+ * const response = await fetch('https://example.com/recipe');
351
+ * const html = await response.text();
352
+ * const recipe = extractRecipeFromHTML(html);
353
+ * ```
354
+ *
355
+ * @param html - The HTML string containing Schema.org recipe data
356
+ * @returns A Soustack recipe object
357
+ * @throws Error if no recipe is found
358
+ */
359
+ declare function extractRecipeFromHTML(html: string): Recipe;
317
360
 
318
361
  declare function normalizeIngredientInput(input: string): string;
319
362
  declare function parseIngredient(text: string): ParsedIngredient;
@@ -333,4 +376,4 @@ declare function normalizeYield(text: string): string;
333
376
  declare function parseYield(text: string): ParsedYield | null;
334
377
  declare function formatYield(value: ParsedYield): string;
335
378
 
336
- export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
379
+ export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, extractRecipeFromHTML, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
package/dist/index.d.ts CHANGED
@@ -305,15 +305,58 @@ interface NutritionInformation {
305
305
 
306
306
  declare function toSchemaOrg(recipe: Recipe): SchemaOrgRecipe;
307
307
 
308
+ interface FetchRequestInit {
309
+ headers?: Record<string, string>;
310
+ signal?: AbortSignal;
311
+ redirect?: 'follow' | 'error' | 'manual';
312
+ }
313
+ interface FetchResponse {
314
+ ok: boolean;
315
+ status: number;
316
+ statusText: string;
317
+ text(): Promise<string>;
318
+ }
319
+ type FetchImplementation = (url: string, init?: FetchRequestInit) => Promise<FetchResponse>;
308
320
  interface FetchOptions {
309
321
  timeout?: number;
310
322
  userAgent?: string;
311
323
  maxRetries?: number;
324
+ fetchFn?: FetchImplementation;
312
325
  }
313
326
  interface ScrapeRecipeOptions extends FetchOptions {
314
327
  }
315
328
 
329
+ /**
330
+ * Scrapes a recipe from a URL (Node.js only).
331
+ *
332
+ * ⚠️ Not available in browser environments due to CORS restrictions.
333
+ * For browser usage, fetch the HTML yourself and use extractRecipeFromHTML().
334
+ *
335
+ * @param url - The URL of the recipe page to scrape
336
+ * @param options - Fetch options (timeout, userAgent, maxRetries)
337
+ * @returns A Soustack recipe object
338
+ * @throws Error if no recipe is found
339
+ */
316
340
  declare function scrapeRecipe(url: string, options?: ScrapeRecipeOptions): Promise<Recipe>;
341
+ /**
342
+ * Extracts a recipe from HTML string (browser and Node.js compatible).
343
+ *
344
+ * This function works in both environments and doesn't require network access.
345
+ * Perfect for browser usage where you fetch HTML yourself (with cookies/session).
346
+ *
347
+ * @example
348
+ * ```ts
349
+ * // In browser:
350
+ * const response = await fetch('https://example.com/recipe');
351
+ * const html = await response.text();
352
+ * const recipe = extractRecipeFromHTML(html);
353
+ * ```
354
+ *
355
+ * @param html - The HTML string containing Schema.org recipe data
356
+ * @returns A Soustack recipe object
357
+ * @throws Error if no recipe is found
358
+ */
359
+ declare function extractRecipeFromHTML(html: string): Recipe;
317
360
 
318
361
  declare function normalizeIngredientInput(input: string): string;
319
362
  declare function parseIngredient(text: string): ParsedIngredient;
@@ -333,4 +376,4 @@ declare function normalizeYield(text: string): string;
333
376
  declare function parseYield(text: string): ParsedYield | null;
334
377
  declare function formatYield(value: ParsedYield): string;
335
378
 
336
- export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
379
+ export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, extractRecipeFromHTML, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
package/dist/index.js CHANGED
@@ -1297,6 +1297,8 @@ function extractRecipeNode(input) {
1297
1297
  function hasRecipeType(value) {
1298
1298
  if (!value) return false;
1299
1299
  const types = Array.isArray(value) ? value : [value];
1300
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "fromSchemaOrg.ts:95", message: "hasRecipeType check", data: { types, typesLower: types.map((t) => typeof t === "string" ? t.toLowerCase() : t), isMatch: types.some((e) => typeof e === "string" && e.toLowerCase() === "recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
1301
+ });
1300
1302
  return types.some(
1301
1303
  (entry) => typeof entry === "string" && entry.toLowerCase() === "recipe"
1302
1304
  );
@@ -1653,18 +1655,26 @@ var DEFAULT_USER_AGENTS = [
1653
1655
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
1654
1656
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"
1655
1657
  ];
1656
- var fetchImpl = null;
1657
- async function ensureFetch() {
1658
- if (!fetchImpl) {
1659
- fetchImpl = import('node-fetch').then((mod) => mod.default);
1660
- }
1661
- return fetchImpl;
1662
- }
1663
1658
  function chooseUserAgent(provided) {
1664
1659
  if (provided) return provided;
1665
1660
  const index = Math.floor(Math.random() * DEFAULT_USER_AGENTS.length);
1666
1661
  return DEFAULT_USER_AGENTS[index];
1667
1662
  }
1663
+ function resolveFetch(fetchFn) {
1664
+ if (fetchFn) {
1665
+ return fetchFn;
1666
+ }
1667
+ const globalFetch = globalThis.fetch;
1668
+ if (!globalFetch) {
1669
+ throw new Error(
1670
+ "A global fetch implementation is not available. Provide window.fetch in browsers or upgrade to Node 18+."
1671
+ );
1672
+ }
1673
+ return globalFetch;
1674
+ }
1675
+ function isBrowserEnvironment() {
1676
+ return typeof globalThis.document !== "undefined";
1677
+ }
1668
1678
  function isClientError(error) {
1669
1679
  if (typeof error.status === "number") {
1670
1680
  return error.status >= 400 && error.status < 500;
@@ -1678,25 +1688,40 @@ async function fetchPage(url, options = {}) {
1678
1688
  const {
1679
1689
  timeout = 1e4,
1680
1690
  userAgent,
1681
- maxRetries = 2
1691
+ maxRetries = 2,
1692
+ fetchFn
1682
1693
  } = options;
1683
1694
  let lastError = null;
1695
+ const resolvedFetch = resolveFetch(fetchFn);
1696
+ const isBrowser2 = isBrowserEnvironment();
1684
1697
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
1685
1698
  const controller = new AbortController();
1686
1699
  const timeoutId = setTimeout(() => controller.abort(), timeout);
1687
1700
  try {
1688
- const fetch = await ensureFetch();
1689
1701
  const headers = {
1690
- "User-Agent": chooseUserAgent(userAgent),
1691
1702
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
1692
1703
  "Accept-Language": "en-US,en;q=0.5"
1693
1704
  };
1694
- const response = await fetch(url, {
1705
+ if (!isBrowser2) {
1706
+ headers["User-Agent"] = chooseUserAgent(userAgent);
1707
+ }
1708
+ const requestInit = {
1695
1709
  headers,
1696
1710
  signal: controller.signal,
1697
1711
  redirect: "follow"
1698
- });
1712
+ };
1713
+ const response = await resolvedFetch(url, requestInit);
1699
1714
  clearTimeout(timeoutId);
1715
+ if (response && (typeof process === "undefined" || process.env.NODE_ENV !== "test")) {
1716
+ try {
1717
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
1718
+ if (globalFetch) {
1719
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:63", message: "fetch response", data: { url, status: response.status, statusText: response.statusText, ok: response.ok, isNYTimes: url.includes("nytimes.com") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
1720
+ });
1721
+ }
1722
+ } catch {
1723
+ }
1724
+ }
1700
1725
  if (!response.ok) {
1701
1726
  const error = new Error(
1702
1727
  `HTTP ${response.status}: ${response.statusText}`
@@ -1704,7 +1729,18 @@ async function fetchPage(url, options = {}) {
1704
1729
  error.status = response.status;
1705
1730
  throw error;
1706
1731
  }
1707
- return await response.text();
1732
+ const html = await response.text();
1733
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
1734
+ try {
1735
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
1736
+ if (globalFetch) {
1737
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:75", message: "HTML received", data: { htmlLength: html.length, hasLoginPage: html.toLowerCase().includes("login") || html.toLowerCase().includes("sign in"), hasRecipeData: html.includes("application/ld+json") || html.includes("schema.org/Recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B,D" }) }).catch(() => {
1738
+ });
1739
+ }
1740
+ } catch {
1741
+ }
1742
+ }
1743
+ return html;
1708
1744
  } catch (err) {
1709
1745
  clearTimeout(timeoutId);
1710
1746
  lastError = err instanceof Error ? err : new Error(String(err));
@@ -1731,6 +1767,8 @@ function isRecipeNode(value) {
1731
1767
  return false;
1732
1768
  }
1733
1769
  const type = value["@type"];
1770
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/utils.ts:14", message: "isRecipeNode check", data: { type, typeLower: typeof type === "string" ? type.toLowerCase() : Array.isArray(type) ? type.map((t) => typeof t === "string" ? t.toLowerCase() : t) : void 0, isMatch: typeof type === "string" ? RECIPE_TYPES.has(type.toLowerCase()) : Array.isArray(type) ? type.some((e) => typeof e === "string" && RECIPE_TYPES.has(e.toLowerCase())) : false }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
1771
+ });
1734
1772
  if (typeof type === "string") {
1735
1773
  return RECIPE_TYPES.has(type.toLowerCase());
1736
1774
  }
@@ -1758,14 +1796,20 @@ function normalizeText(value) {
1758
1796
  function extractJsonLd(html) {
1759
1797
  const $ = cheerio.load(html);
1760
1798
  const scripts = $('script[type="application/ld+json"]');
1799
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:8", message: "JSON-LD scripts found", data: { scriptCount: scripts.length }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
1800
+ });
1761
1801
  const candidates = [];
1762
1802
  scripts.each((_, element) => {
1763
1803
  const content = $(element).html();
1764
1804
  if (!content) return;
1765
1805
  const parsed = safeJsonParse(content);
1766
1806
  if (!parsed) return;
1807
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:18", message: "JSON-LD parsed", data: { hasGraph: !!(parsed && typeof parsed === "object" && "@graph" in parsed), type: parsed && typeof parsed === "object" && "@type" in parsed ? parsed["@type"] : void 0 }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C" }) }).catch(() => {
1808
+ });
1767
1809
  collectCandidates(parsed, candidates);
1768
1810
  });
1811
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:22", message: "JSON-LD candidates", data: { candidateCount: candidates.length, candidateTypes: candidates.map((c) => c["@type"]) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C" }) }).catch(() => {
1812
+ });
1769
1813
  return candidates[0] ?? null;
1770
1814
  }
1771
1815
  function collectCandidates(payload, bucket) {
@@ -1837,13 +1881,123 @@ function findPropertyValue($, context, prop) {
1837
1881
  return normalizeText(node.attr("content")) || normalizeText(node.attr("href")) || normalizeText(node.attr("src")) || normalizeText(node.text());
1838
1882
  }
1839
1883
 
1884
+ // src/scraper/extractors/browser.ts
1885
+ var SIMPLE_PROPS2 = ["name", "description", "image", "recipeYield", "prepTime", "cookTime", "totalTime"];
1886
+ function extractRecipeBrowser(html) {
1887
+ const jsonLdRecipe = extractJsonLdBrowser(html);
1888
+ if (jsonLdRecipe) {
1889
+ return { recipe: jsonLdRecipe, source: "jsonld" };
1890
+ }
1891
+ const microdataRecipe = extractMicrodataBrowser(html);
1892
+ if (microdataRecipe) {
1893
+ return { recipe: microdataRecipe, source: "microdata" };
1894
+ }
1895
+ return { recipe: null, source: null };
1896
+ }
1897
+ function extractJsonLdBrowser(html) {
1898
+ if (typeof globalThis.DOMParser === "undefined") {
1899
+ return null;
1900
+ }
1901
+ const parser = new globalThis.DOMParser();
1902
+ const doc = parser.parseFromString(html, "text/html");
1903
+ const scripts = doc.querySelectorAll('script[type="application/ld+json"]');
1904
+ const candidates = [];
1905
+ scripts.forEach((script) => {
1906
+ const content = script.textContent;
1907
+ if (!content) return;
1908
+ const parsed = safeJsonParse(content);
1909
+ if (!parsed) return;
1910
+ collectCandidates2(parsed, candidates);
1911
+ });
1912
+ return candidates[0] ?? null;
1913
+ }
1914
+ function extractMicrodataBrowser(html) {
1915
+ if (typeof globalThis.DOMParser === "undefined") {
1916
+ return null;
1917
+ }
1918
+ const parser = new globalThis.DOMParser();
1919
+ const doc = parser.parseFromString(html, "text/html");
1920
+ const recipeEl = doc.querySelector('[itemscope][itemtype*="schema.org/Recipe"]');
1921
+ if (!recipeEl) {
1922
+ return null;
1923
+ }
1924
+ const recipe = {
1925
+ "@type": "Recipe"
1926
+ };
1927
+ SIMPLE_PROPS2.forEach((prop) => {
1928
+ const value = findPropertyValue2(recipeEl, prop);
1929
+ if (value) {
1930
+ recipe[prop] = value;
1931
+ }
1932
+ });
1933
+ const ingredients = [];
1934
+ recipeEl.querySelectorAll('[itemprop="recipeIngredient"]').forEach((el) => {
1935
+ const text = normalizeText(
1936
+ el.getAttribute("content") || el.textContent || void 0
1937
+ );
1938
+ if (text) ingredients.push(text);
1939
+ });
1940
+ if (ingredients.length) {
1941
+ recipe.recipeIngredient = ingredients;
1942
+ }
1943
+ const instructions = [];
1944
+ recipeEl.querySelectorAll('[itemprop="recipeInstructions"]').forEach((el) => {
1945
+ const text = normalizeText(el.getAttribute("content")) || normalizeText(el.querySelector('[itemprop="text"]')?.textContent || void 0) || normalizeText(el.textContent || void 0);
1946
+ if (text) instructions.push(text);
1947
+ });
1948
+ if (instructions.length) {
1949
+ recipe.recipeInstructions = instructions;
1950
+ }
1951
+ if (recipe.name || ingredients.length) {
1952
+ return recipe;
1953
+ }
1954
+ return null;
1955
+ }
1956
+ function findPropertyValue2(context, prop) {
1957
+ const node = context.querySelector(`[itemprop="${prop}"]`);
1958
+ if (!node) return void 0;
1959
+ return normalizeText(node.getAttribute("content")) || normalizeText(node.getAttribute("href")) || normalizeText(node.getAttribute("src")) || normalizeText(node.textContent || void 0);
1960
+ }
1961
+ function collectCandidates2(payload, bucket) {
1962
+ if (!payload) return;
1963
+ if (Array.isArray(payload)) {
1964
+ payload.forEach((entry) => collectCandidates2(entry, bucket));
1965
+ return;
1966
+ }
1967
+ if (typeof payload !== "object") {
1968
+ return;
1969
+ }
1970
+ if (isRecipeNode(payload)) {
1971
+ bucket.push(payload);
1972
+ return;
1973
+ }
1974
+ const graph = payload["@graph"];
1975
+ if (Array.isArray(graph)) {
1976
+ graph.forEach((entry) => collectCandidates2(entry, bucket));
1977
+ }
1978
+ }
1979
+
1840
1980
  // src/scraper/extractors/index.ts
1981
+ function isBrowser() {
1982
+ try {
1983
+ return typeof globalThis.DOMParser !== "undefined";
1984
+ } catch {
1985
+ return false;
1986
+ }
1987
+ }
1841
1988
  function extractRecipe(html) {
1989
+ if (isBrowser()) {
1990
+ return extractRecipeBrowser(html);
1991
+ }
1842
1992
  const jsonLdRecipe = extractJsonLd(html);
1993
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:6", message: "JSON-LD extraction result", data: { hasJsonLd: !!jsonLdRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
1994
+ });
1843
1995
  if (jsonLdRecipe) {
1844
1996
  return { recipe: jsonLdRecipe, source: "jsonld" };
1845
1997
  }
1846
1998
  const microdataRecipe = extractMicrodata(html);
1999
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:12", message: "Microdata extraction result", data: { hasMicrodata: !!microdataRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "D" }) }).catch(() => {
2000
+ });
1847
2001
  if (microdataRecipe) {
1848
2002
  return { recipe: microdataRecipe, source: "microdata" };
1849
2003
  }
@@ -1852,12 +2006,31 @@ function extractRecipe(html) {
1852
2006
 
1853
2007
  // src/scraper/index.ts
1854
2008
  async function scrapeRecipe(url, options = {}) {
2009
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:7", message: "scrapeRecipe entry", data: { url, hasOptions: !!options }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,B,C,D,E" }) }).catch(() => {
2010
+ });
1855
2011
  const html = await fetchPage(url, options);
2012
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:9", message: "HTML fetched", data: { htmlLength: html?.length, htmlPreview: html?.substring(0, 200) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
2013
+ });
1856
2014
  const { recipe } = extractRecipe(html);
2015
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:11", message: "extractRecipe result", data: { hasRecipe: !!recipe, recipeType: recipe?.["@type"], recipeName: recipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C,D" }) }).catch(() => {
2016
+ });
1857
2017
  if (!recipe) {
1858
2018
  throw new Error("No Schema.org recipe data found in page");
1859
2019
  }
1860
2020
  const soustackRecipe = fromSchemaOrg(recipe);
2021
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:17", message: "fromSchemaOrg result", data: { hasSoustackRecipe: !!soustackRecipe, soustackRecipeName: soustackRecipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
2022
+ });
2023
+ if (!soustackRecipe) {
2024
+ throw new Error("Schema.org data did not include a valid recipe");
2025
+ }
2026
+ return soustackRecipe;
2027
+ }
2028
+ function extractRecipeFromHTML(html) {
2029
+ const { recipe } = extractRecipe(html);
2030
+ if (!recipe) {
2031
+ throw new Error("No Schema.org recipe data found in HTML");
2032
+ }
2033
+ const soustackRecipe = fromSchemaOrg(recipe);
1861
2034
  if (!soustackRecipe) {
1862
2035
  throw new Error("Schema.org data did not include a valid recipe");
1863
2036
  }
@@ -2106,6 +2279,7 @@ function wordToNumber(word) {
2106
2279
  return null;
2107
2280
  }
2108
2281
 
2282
+ exports.extractRecipeFromHTML = extractRecipeFromHTML;
2109
2283
  exports.formatDuration = formatDuration;
2110
2284
  exports.formatYield = formatYield2;
2111
2285
  exports.fromSchemaOrg = fromSchemaOrg;