soustack 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -8
- package/dist/cli/index.js +175 -13
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.mts +44 -1
- package/dist/index.d.ts +44 -1
- package/dist/index.js +187 -13
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +187 -14
- package/dist/index.mjs.map +1 -1
- package/package.json +5 -4
package/README.md
CHANGED
|
@@ -46,19 +46,21 @@ npm install soustack
|
|
|
46
46
|
- **Schema.org Conversion**:
|
|
47
47
|
- `fromSchemaOrg()` (Schema.org JSON-LD → Soustack)
|
|
48
48
|
- `toSchemaOrg()` (Soustack → Schema.org JSON-LD)
|
|
49
|
-
- **Web Scraping**:
|
|
50
|
-
-
|
|
51
|
-
-
|
|
49
|
+
- **Web Scraping**:
|
|
50
|
+
- `scrapeRecipe()` fetches a recipe page and extracts Schema.org recipe data (Node.js only)
|
|
51
|
+
- `extractRecipeFromHTML()` extracts recipe data from HTML string (browser & Node.js compatible)
|
|
52
|
+
- Supports JSON-LD (`<script type="application/ld+json">`) and Microdata (`itemscope/itemtype`)
|
|
52
53
|
|
|
53
54
|
## Programmatic Usage
|
|
54
55
|
|
|
55
56
|
```ts
|
|
56
57
|
import {
|
|
57
58
|
scrapeRecipe,
|
|
59
|
+
extractRecipeFromHTML,
|
|
58
60
|
fromSchemaOrg,
|
|
59
61
|
toSchemaOrg,
|
|
60
62
|
validateRecipe,
|
|
61
|
-
scaleRecipe
|
|
63
|
+
scaleRecipe,
|
|
62
64
|
} from 'soustack';
|
|
63
65
|
|
|
64
66
|
// Validate a Soustack recipe JSON object
|
|
@@ -67,9 +69,13 @@ validateRecipe(recipe);
|
|
|
67
69
|
// Scale a recipe to a target yield amount (returns a "computed recipe")
|
|
68
70
|
const computed = scaleRecipe(recipe, 2);
|
|
69
71
|
|
|
70
|
-
// Scrape a URL into a Soustack recipe (throws if no recipe is found)
|
|
72
|
+
// Scrape a URL into a Soustack recipe (Node.js only, throws if no recipe is found)
|
|
71
73
|
const scraped = await scrapeRecipe('https://example.com/recipe');
|
|
72
74
|
|
|
75
|
+
// Extract recipe from HTML string (browser & Node.js compatible)
|
|
76
|
+
const html = await fetch('https://example.com/recipe').then((r) => r.text());
|
|
77
|
+
const recipe = extractRecipeFromHTML(html);
|
|
78
|
+
|
|
73
79
|
// Convert Schema.org → Soustack
|
|
74
80
|
const soustack = fromSchemaOrg(schemaOrgJsonLd);
|
|
75
81
|
|
|
@@ -88,9 +94,13 @@ const soustackRecipe = fromSchemaOrg(schemaOrgJsonLd);
|
|
|
88
94
|
const schemaOrgRecipe = toSchemaOrg(soustackRecipe);
|
|
89
95
|
```
|
|
90
96
|
|
|
91
|
-
## 🧰 Scraping
|
|
97
|
+
## 🧰 Web Scraping
|
|
98
|
+
|
|
99
|
+
### Node.js: `scrapeRecipe()`
|
|
100
|
+
|
|
101
|
+
`scrapeRecipe(url, options)` fetches a recipe page and extracts Schema.org data. **Node.js only** due to CORS restrictions.
|
|
92
102
|
|
|
93
|
-
|
|
103
|
+
Options:
|
|
94
104
|
|
|
95
105
|
- `timeout` (ms, default `10000`)
|
|
96
106
|
- `userAgent` (string, optional)
|
|
@@ -101,10 +111,30 @@ import { scrapeRecipe } from 'soustack';
|
|
|
101
111
|
|
|
102
112
|
const recipe = await scrapeRecipe('https://example.com/recipe', {
|
|
103
113
|
timeout: 15000,
|
|
104
|
-
maxRetries: 3
|
|
114
|
+
maxRetries: 3,
|
|
105
115
|
});
|
|
106
116
|
```
|
|
107
117
|
|
|
118
|
+
### Browser: `extractRecipeFromHTML()`
|
|
119
|
+
|
|
120
|
+
`extractRecipeFromHTML(html)` extracts recipe data from an HTML string. **Works in both browser and Node.js**. Perfect for browser usage where you fetch HTML yourself (with cookies/session for authenticated content).
|
|
121
|
+
|
|
122
|
+
```ts
|
|
123
|
+
import { extractRecipeFromHTML } from 'soustack';
|
|
124
|
+
|
|
125
|
+
// In browser: fetch the HTML yourself (same-origin requests send your cookies/session; cross-origin requests are still subject to CORS)
|
|
126
|
+
const response = await fetch('https://example.com/recipe');
|
|
127
|
+
const html = await response.text();
|
|
128
|
+
const recipe = extractRecipeFromHTML(html);
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
**Why use `extractRecipeFromHTML()` in browsers?**
|
|
132
|
+
|
|
133
|
+
- ✅ No built-in network layer — you fetch the HTML yourself (cross-origin requests still follow the browser's CORS rules)
|
|
134
|
+
- ✅ Works with authenticated/paywalled content — uses browser cookies
|
|
135
|
+
- ✅ Smaller bundle — no Node.js dependencies
|
|
136
|
+
- ✅ Universal — works in both browser and Node.js environments
|
|
137
|
+
|
|
108
138
|
### CLI
|
|
109
139
|
|
|
110
140
|
```bash
|
package/dist/cli/index.js
CHANGED
|
@@ -1313,6 +1313,8 @@ function extractRecipeNode(input) {
|
|
|
1313
1313
|
function hasRecipeType(value) {
|
|
1314
1314
|
if (!value) return false;
|
|
1315
1315
|
const types = Array.isArray(value) ? value : [value];
|
|
1316
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "fromSchemaOrg.ts:95", message: "hasRecipeType check", data: { types, typesLower: types.map((t) => typeof t === "string" ? t.toLowerCase() : t), isMatch: types.some((e) => typeof e === "string" && e.toLowerCase() === "recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
1317
|
+
});
|
|
1316
1318
|
return types.some(
|
|
1317
1319
|
(entry) => typeof entry === "string" && entry.toLowerCase() === "recipe"
|
|
1318
1320
|
);
|
|
@@ -1669,18 +1671,26 @@ var DEFAULT_USER_AGENTS = [
|
|
|
1669
1671
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
1670
1672
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"
|
|
1671
1673
|
];
|
|
1672
|
-
var fetchImpl = null;
|
|
1673
|
-
async function ensureFetch() {
|
|
1674
|
-
if (!fetchImpl) {
|
|
1675
|
-
fetchImpl = import('node-fetch').then((mod) => mod.default);
|
|
1676
|
-
}
|
|
1677
|
-
return fetchImpl;
|
|
1678
|
-
}
|
|
1679
1674
|
/**
 * Select the User-Agent string to send with a request.
 *
 * @param {string} [provided] - Caller-supplied User-Agent; used verbatim when truthy.
 * @returns {string} `provided` when truthy, otherwise a randomly chosen entry
 *   of the module-level DEFAULT_USER_AGENTS list.
 */
function chooseUserAgent(provided) {
  if (!provided) {
    // No explicit value: pick one of the built-in defaults at random.
    const pick = Math.floor(Math.random() * DEFAULT_USER_AGENTS.length);
    return DEFAULT_USER_AGENTS[pick];
  }
  return provided;
}
|
|
1679
|
+
/**
 * Resolve the fetch implementation to use for a request.
 *
 * @param {Function} [fetchFn] - Optional caller-supplied fetch implementation.
 *   When given it takes precedence over the global one.
 * @returns {Function} `fetchFn` when provided, otherwise `globalThis.fetch`.
 * @throws {TypeError} If `fetchFn` is truthy but not callable.
 * @throws {Error} If no `fetchFn` is given and no callable global fetch exists.
 */
function resolveFetch(fetchFn) {
  if (fetchFn) {
    // Fail fast with a clear message instead of a late
    // "resolvedFetch is not a function" at the call site.
    if (typeof fetchFn !== "function") {
      throw new TypeError("fetchFn must be a function");
    }
    return fetchFn;
  }
  const globalFetch = globalThis.fetch;
  // A non-callable global (e.g. a broken polyfill) is as unusable as a missing one.
  if (typeof globalFetch !== "function") {
    throw new Error(
      "A global fetch implementation is not available. Provide window.fetch in browsers or upgrade to Node 18+."
    );
  }
  return globalFetch;
}
|
|
1691
|
+
/**
 * Report whether a global `document` object is defined.
 *
 * @returns {boolean} true when `globalThis.document` is not undefined.
 */
function isBrowserEnvironment() {
  const documentType = typeof globalThis.document;
  return documentType !== "undefined";
}
|
|
1684
1694
|
function isClientError(error) {
|
|
1685
1695
|
if (typeof error.status === "number") {
|
|
1686
1696
|
return error.status >= 400 && error.status < 500;
|
|
@@ -1694,25 +1704,40 @@ async function fetchPage(url, options = {}) {
|
|
|
1694
1704
|
const {
|
|
1695
1705
|
timeout = 1e4,
|
|
1696
1706
|
userAgent,
|
|
1697
|
-
maxRetries = 2
|
|
1707
|
+
maxRetries = 2,
|
|
1708
|
+
fetchFn
|
|
1698
1709
|
} = options;
|
|
1699
1710
|
let lastError = null;
|
|
1711
|
+
const resolvedFetch = resolveFetch(fetchFn);
|
|
1712
|
+
const isBrowser2 = isBrowserEnvironment();
|
|
1700
1713
|
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
1701
1714
|
const controller = new AbortController();
|
|
1702
1715
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
1703
1716
|
try {
|
|
1704
|
-
const fetch = await ensureFetch();
|
|
1705
1717
|
const headers = {
|
|
1706
|
-
"User-Agent": chooseUserAgent(userAgent),
|
|
1707
1718
|
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
1708
1719
|
"Accept-Language": "en-US,en;q=0.5"
|
|
1709
1720
|
};
|
|
1710
|
-
|
|
1721
|
+
if (!isBrowser2) {
|
|
1722
|
+
headers["User-Agent"] = chooseUserAgent(userAgent);
|
|
1723
|
+
}
|
|
1724
|
+
const requestInit = {
|
|
1711
1725
|
headers,
|
|
1712
1726
|
signal: controller.signal,
|
|
1713
1727
|
redirect: "follow"
|
|
1714
|
-
}
|
|
1728
|
+
};
|
|
1729
|
+
const response = await resolvedFetch(url, requestInit);
|
|
1715
1730
|
clearTimeout(timeoutId);
|
|
1731
|
+
if (response && (typeof process === "undefined" || process.env.NODE_ENV !== "test")) {
|
|
1732
|
+
try {
|
|
1733
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
1734
|
+
if (globalFetch) {
|
|
1735
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:63", message: "fetch response", data: { url, status: response.status, statusText: response.statusText, ok: response.ok, isNYTimes: url.includes("nytimes.com") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
|
|
1736
|
+
});
|
|
1737
|
+
}
|
|
1738
|
+
} catch {
|
|
1739
|
+
}
|
|
1740
|
+
}
|
|
1716
1741
|
if (!response.ok) {
|
|
1717
1742
|
const error = new Error(
|
|
1718
1743
|
`HTTP ${response.status}: ${response.statusText}`
|
|
@@ -1720,7 +1745,18 @@ async function fetchPage(url, options = {}) {
|
|
|
1720
1745
|
error.status = response.status;
|
|
1721
1746
|
throw error;
|
|
1722
1747
|
}
|
|
1723
|
-
|
|
1748
|
+
const html = await response.text();
|
|
1749
|
+
if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
|
|
1750
|
+
try {
|
|
1751
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
1752
|
+
if (globalFetch) {
|
|
1753
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:75", message: "HTML received", data: { htmlLength: html.length, hasLoginPage: html.toLowerCase().includes("login") || html.toLowerCase().includes("sign in"), hasRecipeData: html.includes("application/ld+json") || html.includes("schema.org/Recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B,D" }) }).catch(() => {
|
|
1754
|
+
});
|
|
1755
|
+
}
|
|
1756
|
+
} catch {
|
|
1757
|
+
}
|
|
1758
|
+
}
|
|
1759
|
+
return html;
|
|
1724
1760
|
} catch (err) {
|
|
1725
1761
|
clearTimeout(timeoutId);
|
|
1726
1762
|
lastError = err instanceof Error ? err : new Error(String(err));
|
|
@@ -1747,6 +1783,8 @@ function isRecipeNode(value) {
|
|
|
1747
1783
|
return false;
|
|
1748
1784
|
}
|
|
1749
1785
|
const type = value["@type"];
|
|
1786
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/utils.ts:14", message: "isRecipeNode check", data: { type, typeLower: typeof type === "string" ? type.toLowerCase() : Array.isArray(type) ? type.map((t) => typeof t === "string" ? t.toLowerCase() : t) : void 0, isMatch: typeof type === "string" ? RECIPE_TYPES.has(type.toLowerCase()) : Array.isArray(type) ? type.some((e) => typeof e === "string" && RECIPE_TYPES.has(e.toLowerCase())) : false }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
1787
|
+
});
|
|
1750
1788
|
if (typeof type === "string") {
|
|
1751
1789
|
return RECIPE_TYPES.has(type.toLowerCase());
|
|
1752
1790
|
}
|
|
@@ -1774,14 +1812,20 @@ function normalizeText(value) {
|
|
|
1774
1812
|
/**
 * Extract the first recipe node found in the JSON-LD scripts of an HTML page.
 *
 * Loads the markup with cheerio, visits every
 * `<script type="application/ld+json">` element in document order, parses its
 * contents and hands the parsed payload to `collectCandidates`, which appends
 * matching nodes to the candidate list.
 *
 * Debug instrumentation that POSTed parse details to a local ingest endpoint
 * has been removed: it relied on a bare global `fetch` (a ReferenceError on
 * runtimes without one) and issued network requests on every parse.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {object|null} The first collected candidate, or null when none was found.
 */
function extractJsonLd(html) {
  const $ = cheerio.load(html);
  const scripts = $('script[type="application/ld+json"]');
  const candidates = [];
  scripts.each((_, element) => {
    const content = $(element).html();
    if (!content) return;
    // safeJsonParse is defined elsewhere in this file; a falsy result is
    // skipped (presumably it signals invalid JSON — confirm against the helper).
    const parsed = safeJsonParse(content);
    if (!parsed) return;
    collectCandidates(parsed, candidates);
  });
  return candidates[0] ?? null;
}
|
|
1787
1831
|
function collectCandidates(payload, bucket) {
|
|
@@ -1853,13 +1897,123 @@ function findPropertyValue($, context, prop) {
|
|
|
1853
1897
|
return normalizeText(node.attr("content")) || normalizeText(node.attr("href")) || normalizeText(node.attr("src")) || normalizeText(node.text());
|
|
1854
1898
|
}
|
|
1855
1899
|
|
|
1900
|
+
// src/scraper/extractors/browser.ts
|
|
1901
|
+
var SIMPLE_PROPS2 = ["name", "description", "image", "recipeYield", "prepTime", "cookTime", "totalTime"];
|
|
1902
|
+
/**
 * Extract recipe data from HTML using the DOM-based extractors.
 *
 * JSON-LD is tried first; microdata is only attempted when JSON-LD yields
 * nothing.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {{recipe: object|null, source: "jsonld"|"microdata"|null}} The
 *   extracted recipe with the strategy that produced it, or nulls when
 *   neither strategy found a recipe.
 */
function extractRecipeBrowser(html) {
  // Ordered by priority; later extractors run only if earlier ones return nothing.
  const strategies = [
    ["jsonld", extractJsonLdBrowser],
    ["microdata", extractMicrodataBrowser]
  ];
  for (const [source, extract] of strategies) {
    const recipe = extract(html);
    if (recipe) {
      return { recipe, source };
    }
  }
  return { recipe: null, source: null };
}
|
|
1913
|
+
/**
 * Extract the first recipe node from JSON-LD scripts using the global DOMParser.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {object|null} The first candidate gathered by `collectCandidates2`,
 *   or null when `globalThis.DOMParser` is undefined or no candidate was found.
 */
function extractJsonLdBrowser(html) {
  const DomParserCtor = globalThis.DOMParser;
  if (typeof DomParserCtor === "undefined") {
    return null;
  }
  const doc = new DomParserCtor().parseFromString(html, "text/html");
  const found = [];
  const scriptNodes = doc.querySelectorAll('script[type="application/ld+json"]');
  for (const scriptNode of Array.from(scriptNodes)) {
    const raw = scriptNode.textContent;
    if (!raw) continue;
    // Scripts whose parse result is falsy are skipped.
    const payload = safeJsonParse(raw);
    if (!payload) continue;
    collectCandidates2(payload, found);
  }
  return found[0] ?? null;
}
|
|
1930
|
+
/**
 * Build a Schema.org-style recipe object from microdata markup using the
 * global DOMParser.
 *
 * The first element matching `[itemscope][itemtype*="schema.org/Recipe"]` is
 * used as the recipe root. Simple properties listed in SIMPLE_PROPS2, all
 * `recipeIngredient` items and all `recipeInstructions` items are read from it.
 *
 * Text values pass through `normalizeText`, which is defined elsewhere in this
 * file (presumably it trims whitespace and returns a falsy value for empty
 * input — confirm against the helper).
 *
 * @param {string} html - Raw HTML markup.
 * @returns {object|null} An object with `"@type": "Recipe"` plus whatever
 *   properties were found, or null when DOMParser is unavailable, no recipe
 *   root exists, or neither a name nor any ingredient was found.
 */
function extractMicrodataBrowser(html) {
  const DomParserCtor = globalThis.DOMParser;
  if (typeof DomParserCtor === "undefined") {
    return null;
  }
  const doc = new DomParserCtor().parseFromString(html, "text/html");
  const root = doc.querySelector('[itemscope][itemtype*="schema.org/Recipe"]');
  if (!root) {
    return null;
  }

  const result = { "@type": "Recipe" };

  // Scalar properties: keep only those that resolve to a truthy value.
  for (const prop of SIMPLE_PROPS2) {
    const found = findPropertyValue2(root, prop);
    if (found) {
      result[prop] = found;
    }
  }

  // Ingredients: prefer the `content` attribute, fall back to the element text.
  const ingredientList = [];
  for (const node of Array.from(root.querySelectorAll('[itemprop="recipeIngredient"]'))) {
    const ingredient = normalizeText(
      node.getAttribute("content") || node.textContent || void 0
    );
    if (ingredient) {
      ingredientList.push(ingredient);
    }
  }
  if (ingredientList.length) {
    result.recipeIngredient = ingredientList;
  }

  // Instructions: `content` attribute, then a nested itemprop="text" element,
  // then the element's own text.
  const stepList = [];
  for (const node of Array.from(root.querySelectorAll('[itemprop="recipeInstructions"]'))) {
    let step = normalizeText(node.getAttribute("content"));
    if (!step) {
      step = normalizeText(node.querySelector('[itemprop="text"]')?.textContent || void 0);
    }
    if (!step) {
      step = normalizeText(node.textContent || void 0);
    }
    if (step) {
      stepList.push(step);
    }
  }
  if (stepList.length) {
    result.recipeInstructions = stepList;
  }

  // A recipe with neither a name nor ingredients is treated as not found.
  const isUsable = result.name || ingredientList.length;
  return isUsable ? result : null;
}
|
|
1972
|
+
/**
 * Read the value of a microdata property beneath a DOM element.
 *
 * Looks up the first descendant matching `[itemprop="<prop>"]` and returns the
 * first truthy normalized value among its `content`, `href` and `src`
 * attributes, falling back to its normalized text content.
 *
 * @param {Element} context - Element to search within.
 * @param {string} prop - Microdata property name (interpolated into the selector).
 * @returns {*} The result of `normalizeText` for the chosen source, or
 *   undefined when no matching element exists.
 */
function findPropertyValue2(context, prop) {
  const target = context.querySelector(`[itemprop="${prop}"]`);
  if (!target) {
    return void 0;
  }
  // Attributes are consulted in priority order before the text content.
  for (const attribute of ["content", "href", "src"]) {
    const fromAttribute = normalizeText(target.getAttribute(attribute));
    if (fromAttribute) {
      return fromAttribute;
    }
  }
  return normalizeText(target.textContent || void 0);
}
|
|
1977
|
+
/**
 * Recursively gather recipe nodes from a parsed JSON-LD payload.
 *
 * Arrays are walked element by element. An object accepted by `isRecipeNode`
 * is appended to `bucket` and not descended into. Any other object is searched
 * through its `@graph` array, if it has one.
 *
 * @param {*} payload - Parsed JSON-LD value (object, array or primitive).
 * @param {object[]} bucket - Output array; matching nodes are pushed onto it.
 * @returns {void}
 */
function collectCandidates2(payload, bucket) {
  // Falsy values and primitives cannot contain recipe nodes.
  if (!payload || typeof payload !== "object") {
    return;
  }
  if (Array.isArray(payload)) {
    for (const item of payload) {
      collectCandidates2(item, bucket);
    }
    return;
  }
  if (isRecipeNode(payload)) {
    bucket.push(payload);
    return;
  }
  const graphNodes = payload["@graph"];
  if (!Array.isArray(graphNodes)) {
    return;
  }
  for (const node of graphNodes) {
    collectCandidates2(node, bucket);
  }
}
|
|
1995
|
+
|
|
1856
1996
|
// src/scraper/extractors/index.ts
|
|
1997
|
+
/**
 * Report whether a global `DOMParser` is defined.
 *
 * @returns {boolean} true when `globalThis.DOMParser` is not undefined;
 *   false otherwise, including when the lookup itself throws.
 */
function isBrowser() {
  let domParserAvailable;
  try {
    domParserAvailable = typeof globalThis.DOMParser !== "undefined";
  } catch {
    // Any failure while probing the global is treated as "not available".
    domParserAvailable = false;
  }
  return domParserAvailable;
}
|
|
1857
2004
|
function extractRecipe(html) {
|
|
2005
|
+
if (isBrowser()) {
|
|
2006
|
+
return extractRecipeBrowser(html);
|
|
2007
|
+
}
|
|
1858
2008
|
const jsonLdRecipe = extractJsonLd(html);
|
|
2009
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:6", message: "JSON-LD extraction result", data: { hasJsonLd: !!jsonLdRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
|
|
2010
|
+
});
|
|
1859
2011
|
if (jsonLdRecipe) {
|
|
1860
2012
|
return { recipe: jsonLdRecipe, source: "jsonld" };
|
|
1861
2013
|
}
|
|
1862
2014
|
const microdataRecipe = extractMicrodata(html);
|
|
2015
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:12", message: "Microdata extraction result", data: { hasMicrodata: !!microdataRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "D" }) }).catch(() => {
|
|
2016
|
+
});
|
|
1863
2017
|
if (microdataRecipe) {
|
|
1864
2018
|
return { recipe: microdataRecipe, source: "microdata" };
|
|
1865
2019
|
}
|
|
@@ -1868,12 +2022,20 @@ function extractRecipe(html) {
|
|
|
1868
2022
|
|
|
1869
2023
|
// src/scraper/index.ts
|
|
1870
2024
|
async function scrapeRecipe(url, options = {}) {
|
|
2025
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:7", message: "scrapeRecipe entry", data: { url, hasOptions: !!options }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,B,C,D,E" }) }).catch(() => {
|
|
2026
|
+
});
|
|
1871
2027
|
const html = await fetchPage(url, options);
|
|
2028
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:9", message: "HTML fetched", data: { htmlLength: html?.length, htmlPreview: html?.substring(0, 200) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
|
|
2029
|
+
});
|
|
1872
2030
|
const { recipe } = extractRecipe(html);
|
|
2031
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:11", message: "extractRecipe result", data: { hasRecipe: !!recipe, recipeType: recipe?.["@type"], recipeName: recipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C,D" }) }).catch(() => {
|
|
2032
|
+
});
|
|
1873
2033
|
if (!recipe) {
|
|
1874
2034
|
throw new Error("No Schema.org recipe data found in page");
|
|
1875
2035
|
}
|
|
1876
2036
|
const soustackRecipe = fromSchemaOrg(recipe);
|
|
2037
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:17", message: "fromSchemaOrg result", data: { hasSoustackRecipe: !!soustackRecipe, soustackRecipeName: soustackRecipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
2038
|
+
});
|
|
1877
2039
|
if (!soustackRecipe) {
|
|
1878
2040
|
throw new Error("Schema.org data did not include a valid recipe");
|
|
1879
2041
|
}
|