soustack 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +168 -138
- package/dist/cli/index.js +175 -13
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.mts +44 -1
- package/dist/index.d.ts +44 -1
- package/dist/index.js +187 -13
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +187 -14
- package/dist/index.mjs.map +1 -1
- package/package.json +75 -74
package/dist/index.d.mts
CHANGED
|
@@ -305,15 +305,58 @@ interface NutritionInformation {
|
|
|
305
305
|
|
|
306
306
|
declare function toSchemaOrg(recipe: Recipe): SchemaOrgRecipe;
|
|
307
307
|
|
|
308
|
+
interface FetchRequestInit {
|
|
309
|
+
headers?: Record<string, string>;
|
|
310
|
+
signal?: AbortSignal;
|
|
311
|
+
redirect?: 'follow' | 'error' | 'manual';
|
|
312
|
+
}
|
|
313
|
+
interface FetchResponse {
|
|
314
|
+
ok: boolean;
|
|
315
|
+
status: number;
|
|
316
|
+
statusText: string;
|
|
317
|
+
text(): Promise<string>;
|
|
318
|
+
}
|
|
319
|
+
type FetchImplementation = (url: string, init?: FetchRequestInit) => Promise<FetchResponse>;
|
|
308
320
|
interface FetchOptions {
|
|
309
321
|
timeout?: number;
|
|
310
322
|
userAgent?: string;
|
|
311
323
|
maxRetries?: number;
|
|
324
|
+
fetchFn?: FetchImplementation;
|
|
312
325
|
}
|
|
313
326
|
interface ScrapeRecipeOptions extends FetchOptions {
|
|
314
327
|
}
|
|
315
328
|
|
|
329
|
+
/**
|
|
330
|
+
* Scrapes a recipe from a URL (Node.js only).
|
|
331
|
+
*
|
|
332
|
+
* ⚠️ Not available in browser environments due to CORS restrictions.
|
|
333
|
+
* For browser usage, fetch the HTML yourself and use extractRecipeFromHTML().
|
|
334
|
+
*
|
|
335
|
+
* @param url - The URL of the recipe page to scrape
|
|
336
|
+
* @param options - Fetch options (timeout, userAgent, maxRetries)
|
|
337
|
+
* @returns A Soustack recipe object
|
|
338
|
+
* @throws Error if no recipe is found
|
|
339
|
+
*/
|
|
316
340
|
declare function scrapeRecipe(url: string, options?: ScrapeRecipeOptions): Promise<Recipe>;
|
|
341
|
+
/**
|
|
342
|
+
* Extracts a recipe from HTML string (browser and Node.js compatible).
|
|
343
|
+
*
|
|
344
|
+
* This function works in both environments and doesn't require network access.
|
|
345
|
+
* Perfect for browser usage where you fetch HTML yourself (with cookies/session).
|
|
346
|
+
*
|
|
347
|
+
* @example
|
|
348
|
+
* ```ts
|
|
349
|
+
* // In browser:
|
|
350
|
+
* const response = await fetch('https://example.com/recipe');
|
|
351
|
+
* const html = await response.text();
|
|
352
|
+
* const recipe = extractRecipeFromHTML(html);
|
|
353
|
+
* ```
|
|
354
|
+
*
|
|
355
|
+
* @param html - The HTML string containing Schema.org recipe data
|
|
356
|
+
* @returns A Soustack recipe object
|
|
357
|
+
* @throws Error if no recipe is found
|
|
358
|
+
*/
|
|
359
|
+
declare function extractRecipeFromHTML(html: string): Recipe;
|
|
317
360
|
|
|
318
361
|
declare function normalizeIngredientInput(input: string): string;
|
|
319
362
|
declare function parseIngredient(text: string): ParsedIngredient;
|
|
@@ -333,4 +376,4 @@ declare function normalizeYield(text: string): string;
|
|
|
333
376
|
declare function parseYield(text: string): ParsedYield | null;
|
|
334
377
|
declare function formatYield(value: ParsedYield): string;
|
|
335
378
|
|
|
336
|
-
export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
|
|
379
|
+
export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, extractRecipeFromHTML, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
|
package/dist/index.d.ts
CHANGED
|
@@ -305,15 +305,58 @@ interface NutritionInformation {
|
|
|
305
305
|
|
|
306
306
|
declare function toSchemaOrg(recipe: Recipe): SchemaOrgRecipe;
|
|
307
307
|
|
|
308
|
+
interface FetchRequestInit {
|
|
309
|
+
headers?: Record<string, string>;
|
|
310
|
+
signal?: AbortSignal;
|
|
311
|
+
redirect?: 'follow' | 'error' | 'manual';
|
|
312
|
+
}
|
|
313
|
+
interface FetchResponse {
|
|
314
|
+
ok: boolean;
|
|
315
|
+
status: number;
|
|
316
|
+
statusText: string;
|
|
317
|
+
text(): Promise<string>;
|
|
318
|
+
}
|
|
319
|
+
type FetchImplementation = (url: string, init?: FetchRequestInit) => Promise<FetchResponse>;
|
|
308
320
|
interface FetchOptions {
|
|
309
321
|
timeout?: number;
|
|
310
322
|
userAgent?: string;
|
|
311
323
|
maxRetries?: number;
|
|
324
|
+
fetchFn?: FetchImplementation;
|
|
312
325
|
}
|
|
313
326
|
interface ScrapeRecipeOptions extends FetchOptions {
|
|
314
327
|
}
|
|
315
328
|
|
|
329
|
+
/**
|
|
330
|
+
* Scrapes a recipe from a URL (Node.js only).
|
|
331
|
+
*
|
|
332
|
+
* ⚠️ Not available in browser environments due to CORS restrictions.
|
|
333
|
+
* For browser usage, fetch the HTML yourself and use extractRecipeFromHTML().
|
|
334
|
+
*
|
|
335
|
+
* @param url - The URL of the recipe page to scrape
|
|
336
|
+
* @param options - Fetch options (timeout, userAgent, maxRetries)
|
|
337
|
+
* @returns A Soustack recipe object
|
|
338
|
+
* @throws Error if no recipe is found
|
|
339
|
+
*/
|
|
316
340
|
declare function scrapeRecipe(url: string, options?: ScrapeRecipeOptions): Promise<Recipe>;
|
|
341
|
+
/**
|
|
342
|
+
* Extracts a recipe from HTML string (browser and Node.js compatible).
|
|
343
|
+
*
|
|
344
|
+
* This function works in both environments and doesn't require network access.
|
|
345
|
+
* Perfect for browser usage where you fetch HTML yourself (with cookies/session).
|
|
346
|
+
*
|
|
347
|
+
* @example
|
|
348
|
+
* ```ts
|
|
349
|
+
* // In browser:
|
|
350
|
+
* const response = await fetch('https://example.com/recipe');
|
|
351
|
+
* const html = await response.text();
|
|
352
|
+
* const recipe = extractRecipeFromHTML(html);
|
|
353
|
+
* ```
|
|
354
|
+
*
|
|
355
|
+
* @param html - The HTML string containing Schema.org recipe data
|
|
356
|
+
* @returns A Soustack recipe object
|
|
357
|
+
* @throws Error if no recipe is found
|
|
358
|
+
*/
|
|
359
|
+
declare function extractRecipeFromHTML(html: string): Recipe;
|
|
317
360
|
|
|
318
361
|
declare function normalizeIngredientInput(input: string): string;
|
|
319
362
|
declare function parseIngredient(text: string): ParsedIngredient;
|
|
@@ -333,4 +376,4 @@ declare function normalizeYield(text: string): string;
|
|
|
333
376
|
declare function parseYield(text: string): ParsedYield | null;
|
|
334
377
|
declare function formatYield(value: ParsedYield): string;
|
|
335
378
|
|
|
336
|
-
export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
|
|
379
|
+
export { type Alternative, type ComputedIngredient, type ComputedInstruction, type ComputedRecipe, type Equipment, type FrozenStorageMethod, type Ingredient, type IngredientItem, type IngredientSubsection, type Instruction, type InstructionItem, type InstructionSubsection, type MakeAheadComponent, type NutritionFacts, type ParsedIngredient, type ParsedYield, type Quantity, type Recipe, type Scaling, type ScalingBakersPercentage, type ScalingBase, type ScalingDiscrete, type ScalingFixed, type ScalingLinear, type ScalingProportional, type SimpleTime, type Source, type StepTiming, type Storage, type StorageMethod, type StructuredTime, type Substitution, type Time, type Yield, extractRecipeFromHTML, formatDuration, formatYield, fromSchemaOrg, normalizeIngredientInput, normalizeYield, parseDuration, parseHumanDuration, parseIngredient, parseIngredientLine, parseIngredients, parseYield, scaleRecipe, scrapeRecipe, smartParseDuration, toSchemaOrg, validateRecipe };
|
package/dist/index.js
CHANGED
|
@@ -1297,6 +1297,8 @@ function extractRecipeNode(input) {
|
|
|
1297
1297
|
function hasRecipeType(value) {
|
|
1298
1298
|
if (!value) return false;
|
|
1299
1299
|
const types = Array.isArray(value) ? value : [value];
|
|
1300
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "fromSchemaOrg.ts:95", message: "hasRecipeType check", data: { types, typesLower: types.map((t) => typeof t === "string" ? t.toLowerCase() : t), isMatch: types.some((e) => typeof e === "string" && e.toLowerCase() === "recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
1301
|
+
});
|
|
1300
1302
|
return types.some(
|
|
1301
1303
|
(entry) => typeof entry === "string" && entry.toLowerCase() === "recipe"
|
|
1302
1304
|
);
|
|
@@ -1653,18 +1655,26 @@ var DEFAULT_USER_AGENTS = [
|
|
|
1653
1655
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
1654
1656
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"
|
|
1655
1657
|
];
|
|
1656
|
-
var fetchImpl = null;
|
|
1657
|
-
async function ensureFetch() {
|
|
1658
|
-
if (!fetchImpl) {
|
|
1659
|
-
fetchImpl = import('node-fetch').then((mod) => mod.default);
|
|
1660
|
-
}
|
|
1661
|
-
return fetchImpl;
|
|
1662
|
-
}
|
|
1663
1658
|
function chooseUserAgent(provided) {
|
|
1664
1659
|
if (provided) return provided;
|
|
1665
1660
|
const index = Math.floor(Math.random() * DEFAULT_USER_AGENTS.length);
|
|
1666
1661
|
return DEFAULT_USER_AGENTS[index];
|
|
1667
1662
|
}
|
|
1663
|
+
function resolveFetch(fetchFn) {
|
|
1664
|
+
if (fetchFn) {
|
|
1665
|
+
return fetchFn;
|
|
1666
|
+
}
|
|
1667
|
+
const globalFetch = globalThis.fetch;
|
|
1668
|
+
if (!globalFetch) {
|
|
1669
|
+
throw new Error(
|
|
1670
|
+
"A global fetch implementation is not available. Provide window.fetch in browsers or upgrade to Node 18+."
|
|
1671
|
+
);
|
|
1672
|
+
}
|
|
1673
|
+
return globalFetch;
|
|
1674
|
+
}
|
|
1675
|
+
function isBrowserEnvironment() {
|
|
1676
|
+
return typeof globalThis.document !== "undefined";
|
|
1677
|
+
}
|
|
1668
1678
|
function isClientError(error) {
|
|
1669
1679
|
if (typeof error.status === "number") {
|
|
1670
1680
|
return error.status >= 400 && error.status < 500;
|
|
@@ -1678,25 +1688,40 @@ async function fetchPage(url, options = {}) {
|
|
|
1678
1688
|
const {
|
|
1679
1689
|
timeout = 1e4,
|
|
1680
1690
|
userAgent,
|
|
1681
|
-
maxRetries = 2
|
|
1691
|
+
maxRetries = 2,
|
|
1692
|
+
fetchFn
|
|
1682
1693
|
} = options;
|
|
1683
1694
|
let lastError = null;
|
|
1695
|
+
const resolvedFetch = resolveFetch(fetchFn);
|
|
1696
|
+
const isBrowser2 = isBrowserEnvironment();
|
|
1684
1697
|
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
1685
1698
|
const controller = new AbortController();
|
|
1686
1699
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
1687
1700
|
try {
|
|
1688
|
-
const fetch = await ensureFetch();
|
|
1689
1701
|
const headers = {
|
|
1690
|
-
"User-Agent": chooseUserAgent(userAgent),
|
|
1691
1702
|
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
1692
1703
|
"Accept-Language": "en-US,en;q=0.5"
|
|
1693
1704
|
};
|
|
1694
|
-
|
|
1705
|
+
if (!isBrowser2) {
|
|
1706
|
+
headers["User-Agent"] = chooseUserAgent(userAgent);
|
|
1707
|
+
}
|
|
1708
|
+
const requestInit = {
|
|
1695
1709
|
headers,
|
|
1696
1710
|
signal: controller.signal,
|
|
1697
1711
|
redirect: "follow"
|
|
1698
|
-
}
|
|
1712
|
+
};
|
|
1713
|
+
const response = await resolvedFetch(url, requestInit);
|
|
1699
1714
|
clearTimeout(timeoutId);
|
|
1715
|
+
if (response && (typeof process === "undefined" || process.env.NODE_ENV !== "test")) {
|
|
1716
|
+
try {
|
|
1717
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
1718
|
+
if (globalFetch) {
|
|
1719
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:63", message: "fetch response", data: { url, status: response.status, statusText: response.statusText, ok: response.ok, isNYTimes: url.includes("nytimes.com") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
|
|
1720
|
+
});
|
|
1721
|
+
}
|
|
1722
|
+
} catch {
|
|
1723
|
+
}
|
|
1724
|
+
}
|
|
1700
1725
|
if (!response.ok) {
|
|
1701
1726
|
const error = new Error(
|
|
1702
1727
|
`HTTP ${response.status}: ${response.statusText}`
|
|
@@ -1704,7 +1729,18 @@ async function fetchPage(url, options = {}) {
|
|
|
1704
1729
|
error.status = response.status;
|
|
1705
1730
|
throw error;
|
|
1706
1731
|
}
|
|
1707
|
-
|
|
1732
|
+
const html = await response.text();
|
|
1733
|
+
if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
|
|
1734
|
+
try {
|
|
1735
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
1736
|
+
if (globalFetch) {
|
|
1737
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/fetch.ts:75", message: "HTML received", data: { htmlLength: html.length, hasLoginPage: html.toLowerCase().includes("login") || html.toLowerCase().includes("sign in"), hasRecipeData: html.includes("application/ld+json") || html.includes("schema.org/Recipe") }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B,D" }) }).catch(() => {
|
|
1738
|
+
});
|
|
1739
|
+
}
|
|
1740
|
+
} catch {
|
|
1741
|
+
}
|
|
1742
|
+
}
|
|
1743
|
+
return html;
|
|
1708
1744
|
} catch (err) {
|
|
1709
1745
|
clearTimeout(timeoutId);
|
|
1710
1746
|
lastError = err instanceof Error ? err : new Error(String(err));
|
|
@@ -1731,6 +1767,8 @@ function isRecipeNode(value) {
|
|
|
1731
1767
|
return false;
|
|
1732
1768
|
}
|
|
1733
1769
|
const type = value["@type"];
|
|
1770
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/utils.ts:14", message: "isRecipeNode check", data: { type, typeLower: typeof type === "string" ? type.toLowerCase() : Array.isArray(type) ? type.map((t) => typeof t === "string" ? t.toLowerCase() : t) : void 0, isMatch: typeof type === "string" ? RECIPE_TYPES.has(type.toLowerCase()) : Array.isArray(type) ? type.some((e) => typeof e === "string" && RECIPE_TYPES.has(e.toLowerCase())) : false }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
1771
|
+
});
|
|
1734
1772
|
if (typeof type === "string") {
|
|
1735
1773
|
return RECIPE_TYPES.has(type.toLowerCase());
|
|
1736
1774
|
}
|
|
@@ -1758,14 +1796,20 @@ function normalizeText(value) {
|
|
|
1758
1796
|
function extractJsonLd(html) {
|
|
1759
1797
|
const $ = cheerio.load(html);
|
|
1760
1798
|
const scripts = $('script[type="application/ld+json"]');
|
|
1799
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:8", message: "JSON-LD scripts found", data: { scriptCount: scripts.length }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
|
|
1800
|
+
});
|
|
1761
1801
|
const candidates = [];
|
|
1762
1802
|
scripts.each((_, element) => {
|
|
1763
1803
|
const content = $(element).html();
|
|
1764
1804
|
if (!content) return;
|
|
1765
1805
|
const parsed = safeJsonParse(content);
|
|
1766
1806
|
if (!parsed) return;
|
|
1807
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:18", message: "JSON-LD parsed", data: { hasGraph: !!(parsed && typeof parsed === "object" && "@graph" in parsed), type: parsed && typeof parsed === "object" && "@type" in parsed ? parsed["@type"] : void 0 }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C" }) }).catch(() => {
|
|
1808
|
+
});
|
|
1767
1809
|
collectCandidates(parsed, candidates);
|
|
1768
1810
|
});
|
|
1811
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/jsonld.ts:22", message: "JSON-LD candidates", data: { candidateCount: candidates.length, candidateTypes: candidates.map((c) => c["@type"]) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C" }) }).catch(() => {
|
|
1812
|
+
});
|
|
1769
1813
|
return candidates[0] ?? null;
|
|
1770
1814
|
}
|
|
1771
1815
|
function collectCandidates(payload, bucket) {
|
|
@@ -1837,13 +1881,123 @@ function findPropertyValue($, context, prop) {
|
|
|
1837
1881
|
return normalizeText(node.attr("content")) || normalizeText(node.attr("href")) || normalizeText(node.attr("src")) || normalizeText(node.text());
|
|
1838
1882
|
}
|
|
1839
1883
|
|
|
1884
|
+
// src/scraper/extractors/browser.ts
|
|
1885
|
+
var SIMPLE_PROPS2 = ["name", "description", "image", "recipeYield", "prepTime", "cookTime", "totalTime"];
|
|
1886
|
+
function extractRecipeBrowser(html) {
|
|
1887
|
+
const jsonLdRecipe = extractJsonLdBrowser(html);
|
|
1888
|
+
if (jsonLdRecipe) {
|
|
1889
|
+
return { recipe: jsonLdRecipe, source: "jsonld" };
|
|
1890
|
+
}
|
|
1891
|
+
const microdataRecipe = extractMicrodataBrowser(html);
|
|
1892
|
+
if (microdataRecipe) {
|
|
1893
|
+
return { recipe: microdataRecipe, source: "microdata" };
|
|
1894
|
+
}
|
|
1895
|
+
return { recipe: null, source: null };
|
|
1896
|
+
}
|
|
1897
|
+
function extractJsonLdBrowser(html) {
|
|
1898
|
+
if (typeof globalThis.DOMParser === "undefined") {
|
|
1899
|
+
return null;
|
|
1900
|
+
}
|
|
1901
|
+
const parser = new globalThis.DOMParser();
|
|
1902
|
+
const doc = parser.parseFromString(html, "text/html");
|
|
1903
|
+
const scripts = doc.querySelectorAll('script[type="application/ld+json"]');
|
|
1904
|
+
const candidates = [];
|
|
1905
|
+
scripts.forEach((script) => {
|
|
1906
|
+
const content = script.textContent;
|
|
1907
|
+
if (!content) return;
|
|
1908
|
+
const parsed = safeJsonParse(content);
|
|
1909
|
+
if (!parsed) return;
|
|
1910
|
+
collectCandidates2(parsed, candidates);
|
|
1911
|
+
});
|
|
1912
|
+
return candidates[0] ?? null;
|
|
1913
|
+
}
|
|
1914
|
+
function extractMicrodataBrowser(html) {
|
|
1915
|
+
if (typeof globalThis.DOMParser === "undefined") {
|
|
1916
|
+
return null;
|
|
1917
|
+
}
|
|
1918
|
+
const parser = new globalThis.DOMParser();
|
|
1919
|
+
const doc = parser.parseFromString(html, "text/html");
|
|
1920
|
+
const recipeEl = doc.querySelector('[itemscope][itemtype*="schema.org/Recipe"]');
|
|
1921
|
+
if (!recipeEl) {
|
|
1922
|
+
return null;
|
|
1923
|
+
}
|
|
1924
|
+
const recipe = {
|
|
1925
|
+
"@type": "Recipe"
|
|
1926
|
+
};
|
|
1927
|
+
SIMPLE_PROPS2.forEach((prop) => {
|
|
1928
|
+
const value = findPropertyValue2(recipeEl, prop);
|
|
1929
|
+
if (value) {
|
|
1930
|
+
recipe[prop] = value;
|
|
1931
|
+
}
|
|
1932
|
+
});
|
|
1933
|
+
const ingredients = [];
|
|
1934
|
+
recipeEl.querySelectorAll('[itemprop="recipeIngredient"]').forEach((el) => {
|
|
1935
|
+
const text = normalizeText(
|
|
1936
|
+
el.getAttribute("content") || el.textContent || void 0
|
|
1937
|
+
);
|
|
1938
|
+
if (text) ingredients.push(text);
|
|
1939
|
+
});
|
|
1940
|
+
if (ingredients.length) {
|
|
1941
|
+
recipe.recipeIngredient = ingredients;
|
|
1942
|
+
}
|
|
1943
|
+
const instructions = [];
|
|
1944
|
+
recipeEl.querySelectorAll('[itemprop="recipeInstructions"]').forEach((el) => {
|
|
1945
|
+
const text = normalizeText(el.getAttribute("content")) || normalizeText(el.querySelector('[itemprop="text"]')?.textContent || void 0) || normalizeText(el.textContent || void 0);
|
|
1946
|
+
if (text) instructions.push(text);
|
|
1947
|
+
});
|
|
1948
|
+
if (instructions.length) {
|
|
1949
|
+
recipe.recipeInstructions = instructions;
|
|
1950
|
+
}
|
|
1951
|
+
if (recipe.name || ingredients.length) {
|
|
1952
|
+
return recipe;
|
|
1953
|
+
}
|
|
1954
|
+
return null;
|
|
1955
|
+
}
|
|
1956
|
+
function findPropertyValue2(context, prop) {
|
|
1957
|
+
const node = context.querySelector(`[itemprop="${prop}"]`);
|
|
1958
|
+
if (!node) return void 0;
|
|
1959
|
+
return normalizeText(node.getAttribute("content")) || normalizeText(node.getAttribute("href")) || normalizeText(node.getAttribute("src")) || normalizeText(node.textContent || void 0);
|
|
1960
|
+
}
|
|
1961
|
+
function collectCandidates2(payload, bucket) {
|
|
1962
|
+
if (!payload) return;
|
|
1963
|
+
if (Array.isArray(payload)) {
|
|
1964
|
+
payload.forEach((entry) => collectCandidates2(entry, bucket));
|
|
1965
|
+
return;
|
|
1966
|
+
}
|
|
1967
|
+
if (typeof payload !== "object") {
|
|
1968
|
+
return;
|
|
1969
|
+
}
|
|
1970
|
+
if (isRecipeNode(payload)) {
|
|
1971
|
+
bucket.push(payload);
|
|
1972
|
+
return;
|
|
1973
|
+
}
|
|
1974
|
+
const graph = payload["@graph"];
|
|
1975
|
+
if (Array.isArray(graph)) {
|
|
1976
|
+
graph.forEach((entry) => collectCandidates2(entry, bucket));
|
|
1977
|
+
}
|
|
1978
|
+
}
|
|
1979
|
+
|
|
1840
1980
|
// src/scraper/extractors/index.ts
|
|
1981
|
+
function isBrowser() {
|
|
1982
|
+
try {
|
|
1983
|
+
return typeof globalThis.DOMParser !== "undefined";
|
|
1984
|
+
} catch {
|
|
1985
|
+
return false;
|
|
1986
|
+
}
|
|
1987
|
+
}
|
|
1841
1988
|
function extractRecipe(html) {
|
|
1989
|
+
if (isBrowser()) {
|
|
1990
|
+
return extractRecipeBrowser(html);
|
|
1991
|
+
}
|
|
1842
1992
|
const jsonLdRecipe = extractJsonLd(html);
|
|
1993
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:6", message: "JSON-LD extraction result", data: { hasJsonLd: !!jsonLdRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
|
|
1994
|
+
});
|
|
1843
1995
|
if (jsonLdRecipe) {
|
|
1844
1996
|
return { recipe: jsonLdRecipe, source: "jsonld" };
|
|
1845
1997
|
}
|
|
1846
1998
|
const microdataRecipe = extractMicrodata(html);
|
|
1999
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:12", message: "Microdata extraction result", data: { hasMicrodata: !!microdataRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "D" }) }).catch(() => {
|
|
2000
|
+
});
|
|
1847
2001
|
if (microdataRecipe) {
|
|
1848
2002
|
return { recipe: microdataRecipe, source: "microdata" };
|
|
1849
2003
|
}
|
|
@@ -1852,12 +2006,31 @@ function extractRecipe(html) {
|
|
|
1852
2006
|
|
|
1853
2007
|
// src/scraper/index.ts
|
|
1854
2008
|
async function scrapeRecipe(url, options = {}) {
|
|
2009
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:7", message: "scrapeRecipe entry", data: { url, hasOptions: !!options }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,B,C,D,E" }) }).catch(() => {
|
|
2010
|
+
});
|
|
1855
2011
|
const html = await fetchPage(url, options);
|
|
2012
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:9", message: "HTML fetched", data: { htmlLength: html?.length, htmlPreview: html?.substring(0, 200) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
|
|
2013
|
+
});
|
|
1856
2014
|
const { recipe } = extractRecipe(html);
|
|
2015
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:11", message: "extractRecipe result", data: { hasRecipe: !!recipe, recipeType: recipe?.["@type"], recipeName: recipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C,D" }) }).catch(() => {
|
|
2016
|
+
});
|
|
1857
2017
|
if (!recipe) {
|
|
1858
2018
|
throw new Error("No Schema.org recipe data found in page");
|
|
1859
2019
|
}
|
|
1860
2020
|
const soustackRecipe = fromSchemaOrg(recipe);
|
|
2021
|
+
fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:17", message: "fromSchemaOrg result", data: { hasSoustackRecipe: !!soustackRecipe, soustackRecipeName: soustackRecipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
2022
|
+
});
|
|
2023
|
+
if (!soustackRecipe) {
|
|
2024
|
+
throw new Error("Schema.org data did not include a valid recipe");
|
|
2025
|
+
}
|
|
2026
|
+
return soustackRecipe;
|
|
2027
|
+
}
|
|
2028
|
+
function extractRecipeFromHTML(html) {
|
|
2029
|
+
const { recipe } = extractRecipe(html);
|
|
2030
|
+
if (!recipe) {
|
|
2031
|
+
throw new Error("No Schema.org recipe data found in HTML");
|
|
2032
|
+
}
|
|
2033
|
+
const soustackRecipe = fromSchemaOrg(recipe);
|
|
1861
2034
|
if (!soustackRecipe) {
|
|
1862
2035
|
throw new Error("Schema.org data did not include a valid recipe");
|
|
1863
2036
|
}
|
|
@@ -2106,6 +2279,7 @@ function wordToNumber(word) {
|
|
|
2106
2279
|
return null;
|
|
2107
2280
|
}
|
|
2108
2281
|
|
|
2282
|
+
exports.extractRecipeFromHTML = extractRecipeFromHTML;
|
|
2109
2283
|
exports.formatDuration = formatDuration;
|
|
2110
2284
|
exports.formatYield = formatYield2;
|
|
2111
2285
|
exports.fromSchemaOrg = fromSchemaOrg;
|