soustack 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1290,6 +1290,8 @@ function extractRecipeNode(input) {
1290
1290
  function hasRecipeType(value) {
1291
1291
  if (!value) return false;
1292
1292
  const types = Array.isArray(value) ? value : [value];
1293
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "fromSchemaOrg.ts:95", message: "hasRecipeType check", data: { types, typesLower: types.map((t) => typeof t === "string" ? t.toLowerCase() : t), isMatch: types.some((e) => typeof e === "string" && e.toLowerCase() === "recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
1294
+ });
1293
1295
  return types.some(
1294
1296
  (entry) => typeof entry === "string" && entry.toLowerCase() === "recipe"
1295
1297
  );
@@ -1646,18 +1648,26 @@ var DEFAULT_USER_AGENTS = [
1646
1648
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
1647
1649
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"
1648
1650
  ];
1649
- var fetchImpl = null;
1650
- async function ensureFetch() {
1651
- if (!fetchImpl) {
1652
- fetchImpl = import('node-fetch').then((mod) => mod.default);
1653
- }
1654
- return fetchImpl;
1655
- }
1656
1651
  function chooseUserAgent(provided) {
1657
1652
  if (provided) return provided;
1658
1653
  const index = Math.floor(Math.random() * DEFAULT_USER_AGENTS.length);
1659
1654
  return DEFAULT_USER_AGENTS[index];
1660
1655
  }
1656
+ function resolveFetch(fetchFn) {
1657
+ if (fetchFn) {
1658
+ return fetchFn;
1659
+ }
1660
+ const globalFetch = globalThis.fetch;
1661
+ if (!globalFetch) {
1662
+ throw new Error(
1663
+ "A global fetch implementation is not available. Provide window.fetch in browsers or upgrade to Node 18+."
1664
+ );
1665
+ }
1666
+ return globalFetch;
1667
+ }
1668
+ function isBrowserEnvironment() {
1669
+ return typeof globalThis.document !== "undefined";
1670
+ }
1661
1671
  function isClientError(error) {
1662
1672
  if (typeof error.status === "number") {
1663
1673
  return error.status >= 400 && error.status < 500;
@@ -1671,25 +1681,40 @@ async function fetchPage(url, options = {}) {
1671
1681
  const {
1672
1682
  timeout = 1e4,
1673
1683
  userAgent,
1674
- maxRetries = 2
1684
+ maxRetries = 2,
1685
+ fetchFn
1675
1686
  } = options;
1676
1687
  let lastError = null;
1688
+ const resolvedFetch = resolveFetch(fetchFn);
1689
+ const isBrowser2 = isBrowserEnvironment();
1677
1690
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
1678
1691
  const controller = new AbortController();
1679
1692
  const timeoutId = setTimeout(() => controller.abort(), timeout);
1680
1693
  try {
1681
- const fetch = await ensureFetch();
1682
1694
  const headers = {
1683
- "User-Agent": chooseUserAgent(userAgent),
1684
1695
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
1685
1696
  "Accept-Language": "en-US,en;q=0.5"
1686
1697
  };
1687
- const response = await fetch(url, {
1698
+ if (!isBrowser2) {
1699
+ headers["User-Agent"] = chooseUserAgent(userAgent);
1700
+ }
1701
+ const requestInit = {
1688
1702
  headers,
1689
1703
  signal: controller.signal,
1690
1704
  redirect: "follow"
1691
- });
1705
+ };
1706
+ const response = await resolvedFetch(url, requestInit);
1692
1707
  clearTimeout(timeoutId);
1708
+ if (response && (typeof process === "undefined" || process.env.NODE_ENV !== "test")) {
1709
+ try {
1710
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
1711
+ if (globalFetch) {
1712
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:63", message: "fetch response", data: { url, status: response.status, statusText: response.statusText, ok: response.ok, isNYTimes: url.includes("nytimes.com") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
1713
+ });
1714
+ }
1715
+ } catch {
1716
+ }
1717
+ }
1693
1718
  if (!response.ok) {
1694
1719
  const error = new Error(
1695
1720
  `HTTP ${response.status}: ${response.statusText}`
@@ -1697,7 +1722,18 @@ async function fetchPage(url, options = {}) {
1697
1722
  error.status = response.status;
1698
1723
  throw error;
1699
1724
  }
1700
- return await response.text();
1725
+ const html = await response.text();
1726
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
1727
+ try {
1728
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
1729
+ if (globalFetch) {
1730
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:75", message: "HTML received", data: { htmlLength: html.length, hasLoginPage: html.toLowerCase().includes("login") || html.toLowerCase().includes("sign in"), hasRecipeData: html.includes("application/ld+json") || html.includes("schema.org/Recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B,D" }) }).catch(() => {
1731
+ });
1732
+ }
1733
+ } catch {
1734
+ }
1735
+ }
1736
+ return html;
1701
1737
  } catch (err) {
1702
1738
  clearTimeout(timeoutId);
1703
1739
  lastError = err instanceof Error ? err : new Error(String(err));
@@ -1724,6 +1760,8 @@ function isRecipeNode(value) {
1724
1760
  return false;
1725
1761
  }
1726
1762
  const type = value["@type"];
1763
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/utils.ts:14", message: "isRecipeNode check", data: { type, typeLower: typeof type === "string" ? type.toLowerCase() : Array.isArray(type) ? type.map((t) => typeof t === "string" ? t.toLowerCase() : t) : void 0, isMatch: typeof type === "string" ? RECIPE_TYPES.has(type.toLowerCase()) : Array.isArray(type) ? type.some((e) => typeof e === "string" && RECIPE_TYPES.has(e.toLowerCase())) : false }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
1764
+ });
1727
1765
  if (typeof type === "string") {
1728
1766
  return RECIPE_TYPES.has(type.toLowerCase());
1729
1767
  }
@@ -1751,14 +1789,20 @@ function normalizeText(value) {
1751
1789
  function extractJsonLd(html) {
1752
1790
  const $ = load(html);
1753
1791
  const scripts = $('script[type="application/ld+json"]');
1792
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:8", message: "JSON-LD scripts found", data: { scriptCount: scripts.length }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
1793
+ });
1754
1794
  const candidates = [];
1755
1795
  scripts.each((_, element) => {
1756
1796
  const content = $(element).html();
1757
1797
  if (!content) return;
1758
1798
  const parsed = safeJsonParse(content);
1759
1799
  if (!parsed) return;
1800
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:18", message: "JSON-LD parsed", data: { hasGraph: !!(parsed && typeof parsed === "object" && "@graph" in parsed), type: parsed && typeof parsed === "object" && "@type" in parsed ? parsed["@type"] : void 0 }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C" }) }).catch(() => {
1801
+ });
1760
1802
  collectCandidates(parsed, candidates);
1761
1803
  });
1804
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:22", message: "JSON-LD candidates", data: { candidateCount: candidates.length, candidateTypes: candidates.map((c) => c["@type"]) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C" }) }).catch(() => {
1805
+ });
1762
1806
  return candidates[0] ?? null;
1763
1807
  }
1764
1808
  function collectCandidates(payload, bucket) {
@@ -1830,13 +1874,123 @@ function findPropertyValue($, context, prop) {
1830
1874
  return normalizeText(node.attr("content")) || normalizeText(node.attr("href")) || normalizeText(node.attr("src")) || normalizeText(node.text());
1831
1875
  }
1832
1876
 
1877
+ // src/scraper/extractors/browser.ts
1878
+ var SIMPLE_PROPS2 = ["name", "description", "image", "recipeYield", "prepTime", "cookTime", "totalTime"];
1879
+ function extractRecipeBrowser(html) {
1880
+ const jsonLdRecipe = extractJsonLdBrowser(html);
1881
+ if (jsonLdRecipe) {
1882
+ return { recipe: jsonLdRecipe, source: "jsonld" };
1883
+ }
1884
+ const microdataRecipe = extractMicrodataBrowser(html);
1885
+ if (microdataRecipe) {
1886
+ return { recipe: microdataRecipe, source: "microdata" };
1887
+ }
1888
+ return { recipe: null, source: null };
1889
+ }
1890
+ function extractJsonLdBrowser(html) {
1891
+ if (typeof globalThis.DOMParser === "undefined") {
1892
+ return null;
1893
+ }
1894
+ const parser = new globalThis.DOMParser();
1895
+ const doc = parser.parseFromString(html, "text/html");
1896
+ const scripts = doc.querySelectorAll('script[type="application/ld+json"]');
1897
+ const candidates = [];
1898
+ scripts.forEach((script) => {
1899
+ const content = script.textContent;
1900
+ if (!content) return;
1901
+ const parsed = safeJsonParse(content);
1902
+ if (!parsed) return;
1903
+ collectCandidates2(parsed, candidates);
1904
+ });
1905
+ return candidates[0] ?? null;
1906
+ }
1907
+ function extractMicrodataBrowser(html) {
1908
+ if (typeof globalThis.DOMParser === "undefined") {
1909
+ return null;
1910
+ }
1911
+ const parser = new globalThis.DOMParser();
1912
+ const doc = parser.parseFromString(html, "text/html");
1913
+ const recipeEl = doc.querySelector('[itemscope][itemtype*="schema.org/Recipe"]');
1914
+ if (!recipeEl) {
1915
+ return null;
1916
+ }
1917
+ const recipe = {
1918
+ "@type": "Recipe"
1919
+ };
1920
+ SIMPLE_PROPS2.forEach((prop) => {
1921
+ const value = findPropertyValue2(recipeEl, prop);
1922
+ if (value) {
1923
+ recipe[prop] = value;
1924
+ }
1925
+ });
1926
+ const ingredients = [];
1927
+ recipeEl.querySelectorAll('[itemprop="recipeIngredient"]').forEach((el) => {
1928
+ const text = normalizeText(
1929
+ el.getAttribute("content") || el.textContent || void 0
1930
+ );
1931
+ if (text) ingredients.push(text);
1932
+ });
1933
+ if (ingredients.length) {
1934
+ recipe.recipeIngredient = ingredients;
1935
+ }
1936
+ const instructions = [];
1937
+ recipeEl.querySelectorAll('[itemprop="recipeInstructions"]').forEach((el) => {
1938
+ const text = normalizeText(el.getAttribute("content")) || normalizeText(el.querySelector('[itemprop="text"]')?.textContent || void 0) || normalizeText(el.textContent || void 0);
1939
+ if (text) instructions.push(text);
1940
+ });
1941
+ if (instructions.length) {
1942
+ recipe.recipeInstructions = instructions;
1943
+ }
1944
+ if (recipe.name || ingredients.length) {
1945
+ return recipe;
1946
+ }
1947
+ return null;
1948
+ }
1949
+ function findPropertyValue2(context, prop) {
1950
+ const node = context.querySelector(`[itemprop="${prop}"]`);
1951
+ if (!node) return void 0;
1952
+ return normalizeText(node.getAttribute("content")) || normalizeText(node.getAttribute("href")) || normalizeText(node.getAttribute("src")) || normalizeText(node.textContent || void 0);
1953
+ }
1954
+ function collectCandidates2(payload, bucket) {
1955
+ if (!payload) return;
1956
+ if (Array.isArray(payload)) {
1957
+ payload.forEach((entry) => collectCandidates2(entry, bucket));
1958
+ return;
1959
+ }
1960
+ if (typeof payload !== "object") {
1961
+ return;
1962
+ }
1963
+ if (isRecipeNode(payload)) {
1964
+ bucket.push(payload);
1965
+ return;
1966
+ }
1967
+ const graph = payload["@graph"];
1968
+ if (Array.isArray(graph)) {
1969
+ graph.forEach((entry) => collectCandidates2(entry, bucket));
1970
+ }
1971
+ }
1972
+
1833
1973
  // src/scraper/extractors/index.ts
1974
+ function isBrowser() {
1975
+ try {
1976
+ return typeof globalThis.DOMParser !== "undefined";
1977
+ } catch {
1978
+ return false;
1979
+ }
1980
+ }
1834
1981
  function extractRecipe(html) {
1982
+ if (isBrowser()) {
1983
+ return extractRecipeBrowser(html);
1984
+ }
1835
1985
  const jsonLdRecipe = extractJsonLd(html);
1986
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:6", message: "JSON-LD extraction result", data: { hasJsonLd: !!jsonLdRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
1987
+ });
1836
1988
  if (jsonLdRecipe) {
1837
1989
  return { recipe: jsonLdRecipe, source: "jsonld" };
1838
1990
  }
1839
1991
  const microdataRecipe = extractMicrodata(html);
1992
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:12", message: "Microdata extraction result", data: { hasMicrodata: !!microdataRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "D" }) }).catch(() => {
1993
+ });
1840
1994
  if (microdataRecipe) {
1841
1995
  return { recipe: microdataRecipe, source: "microdata" };
1842
1996
  }
@@ -1845,12 +1999,31 @@ function extractRecipe(html) {
1845
1999
 
1846
2000
  // src/scraper/index.ts
1847
2001
  async function scrapeRecipe(url, options = {}) {
2002
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:7", message: "scrapeRecipe entry", data: { url, hasOptions: !!options }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,B,C,D,E" }) }).catch(() => {
2003
+ });
1848
2004
  const html = await fetchPage(url, options);
2005
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:9", message: "HTML fetched", data: { htmlLength: html?.length, htmlPreview: html?.substring(0, 200) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
2006
+ });
1849
2007
  const { recipe } = extractRecipe(html);
2008
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:11", message: "extractRecipe result", data: { hasRecipe: !!recipe, recipeType: recipe?.["@type"], recipeName: recipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C,D" }) }).catch(() => {
2009
+ });
1850
2010
  if (!recipe) {
1851
2011
  throw new Error("No Schema.org recipe data found in page");
1852
2012
  }
1853
2013
  const soustackRecipe = fromSchemaOrg(recipe);
2014
+ fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:17", message: "fromSchemaOrg result", data: { hasSoustackRecipe: !!soustackRecipe, soustackRecipeName: soustackRecipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
2015
+ });
2016
+ if (!soustackRecipe) {
2017
+ throw new Error("Schema.org data did not include a valid recipe");
2018
+ }
2019
+ return soustackRecipe;
2020
+ }
2021
+ function extractRecipeFromHTML(html) {
2022
+ const { recipe } = extractRecipe(html);
2023
+ if (!recipe) {
2024
+ throw new Error("No Schema.org recipe data found in HTML");
2025
+ }
2026
+ const soustackRecipe = fromSchemaOrg(recipe);
1854
2027
  if (!soustackRecipe) {
1855
2028
  throw new Error("Schema.org data did not include a valid recipe");
1856
2029
  }
@@ -2099,6 +2272,6 @@ function wordToNumber(word) {
2099
2272
  return null;
2100
2273
  }
2101
2274
 
2102
- export { formatDuration, formatYield2 as formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield2 as parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
2275
+ export { extractRecipeFromHTML, formatDuration, formatYield2 as formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield2 as parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
2103
2276
  //# sourceMappingURL=index.mjs.map
2104
2277
  //# sourceMappingURL=index.mjs.map