soustack 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -46,9 +46,10 @@ npm install soustack
46
46
  - **Schema.org Conversion**:
47
47
  - `fromSchemaOrg()` (Schema.org JSON-LD → Soustack)
48
48
  - `toSchemaOrg()` (Soustack → Schema.org JSON-LD)
49
- - **Web Scraping**:
49
+ - **Web Scraping**:
50
50
  - `scrapeRecipe()` fetches a recipe page and extracts Schema.org recipe data (Node.js only)
51
- - `extractRecipeFromHTML()` extracts recipe data from HTML string (browser & Node.js compatible)
51
+ - `extractRecipeFromHTML()` extracts recipe data from HTML string, returns Soustack format (browser & Node.js compatible)
52
+ - `extractSchemaOrgRecipeFromHTML()` extracts raw Schema.org recipe data from HTML string (browser & Node.js compatible)
52
53
  - Supports JSON-LD (`<script type="application/ld+json">`) and Microdata (`itemscope/itemtype`)
53
54
 
54
55
  ## Programmatic Usage
@@ -57,6 +58,7 @@ npm install soustack
57
58
  import {
58
59
  scrapeRecipe,
59
60
  extractRecipeFromHTML,
61
+ extractSchemaOrgRecipeFromHTML,
60
62
  fromSchemaOrg,
61
63
  toSchemaOrg,
62
64
  validateRecipe,
@@ -73,9 +75,16 @@ const computed = scaleRecipe(recipe, 2);
73
75
  const scraped = await scrapeRecipe('https://example.com/recipe');
74
76
 
75
77
  // Extract recipe from HTML string (browser & Node.js compatible)
78
+ // Option 1: Get Soustack format directly
76
79
  const html = await fetch('https://example.com/recipe').then((r) => r.text());
77
80
  const recipe = extractRecipeFromHTML(html);
78
81
 
82
+ // Option 2: Get Schema.org format first (for inspection/modification)
83
+ const schemaOrgRecipe = extractSchemaOrgRecipeFromHTML(html);
84
+ if (schemaOrgRecipe) {
85
+ const soustackRecipe = fromSchemaOrg(schemaOrgRecipe);
86
+ }
87
+
79
88
  // Convert Schema.org → Soustack
80
89
  const soustack = fromSchemaOrg(schemaOrgJsonLd);
81
90
 
@@ -115,9 +124,11 @@ const recipe = await scrapeRecipe('https://example.com/recipe', {
115
124
  });
116
125
  ```
117
126
 
118
- ### Browser: `extractRecipeFromHTML()`
127
+ ### Browser: `extractRecipeFromHTML()` and `extractSchemaOrgRecipeFromHTML()`
128
+
129
+ #### `extractRecipeFromHTML()` - Returns Soustack Format
119
130
 
120
- `extractRecipeFromHTML(html)` extracts recipe data from an HTML string. **Works in both browser and Node.js**. Perfect for browser usage where you fetch HTML yourself (with cookies/session for authenticated content).
131
+ `extractRecipeFromHTML(html)` extracts recipe data from an HTML string and returns it in Soustack format. **Works in both browser and Node.js**. Perfect for browser usage where you fetch HTML yourself (with cookies/session for authenticated content).
121
132
 
122
133
  ```ts
123
134
  import { extractRecipeFromHTML } from 'soustack';
@@ -125,15 +136,39 @@ import { extractRecipeFromHTML } from 'soustack';
125
136
  // In browser: fetch HTML yourself (bypasses CORS, uses your cookies/session)
126
137
  const response = await fetch('https://example.com/recipe');
127
138
  const html = await response.text();
128
- const recipe = extractRecipeFromHTML(html);
139
+ const recipe = extractRecipeFromHTML(html); // Already in Soustack format
140
+ ```
141
+
142
+ #### `extractSchemaOrgRecipeFromHTML()` - Returns Schema.org Format
143
+
144
+ `extractSchemaOrgRecipeFromHTML(html)` extracts the raw Schema.org recipe data from HTML. Returns `null` if no recipe is found. Use this when you need to inspect, debug, or modify the Schema.org data before converting to Soustack format.
145
+
146
+ ```ts
147
+ import { extractSchemaOrgRecipeFromHTML, fromSchemaOrg } from 'soustack';
148
+
149
+ // In browser: fetch HTML yourself
150
+ const response = await fetch('https://example.com/recipe');
151
+ const html = await response.text();
152
+
153
+ // Extract Schema.org format (for inspection/modification)
154
+ const schemaOrgRecipe = extractSchemaOrgRecipeFromHTML(html);
155
+
156
+ if (schemaOrgRecipe) {
157
+ // Inspect or modify Schema.org data before converting
158
+ console.log('Found recipe:', schemaOrgRecipe.name);
159
+
160
+ // Convert to Soustack format when ready
161
+ const soustackRecipe = fromSchemaOrg(schemaOrgRecipe);
162
+ }
129
163
  ```
130
164
 
131
- **Why use `extractRecipeFromHTML()` in browsers?**
165
+ **Why use these functions in browsers?**
132
166
 
133
167
  - ✅ No CORS issues — you fetch HTML yourself
134
168
  - ✅ Works with authenticated/paywalled content — uses browser cookies
135
169
  - ✅ Smaller bundle — no Node.js dependencies
136
170
  - ✅ Universal — works in both browser and Node.js environments
171
+ - ✅ Flexible — choose Schema.org format for inspection/modification, or Soustack format for direct use
137
172
 
138
173
  ### CLI
139
174
 
package/dist/cli/index.js CHANGED
@@ -2006,14 +2006,30 @@ function extractRecipe(html) {
2006
2006
  return extractRecipeBrowser(html);
2007
2007
  }
2008
2008
  const jsonLdRecipe = extractJsonLd(html);
2009
- fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:6", message: "JSON-LD extraction result", data: { hasJsonLd: !!jsonLdRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
2010
- });
2009
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
2010
+ try {
2011
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
2012
+ if (globalFetch) {
2013
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:6", message: "JSON-LD extraction result", data: { hasJsonLd: !!jsonLdRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
2014
+ });
2015
+ }
2016
+ } catch {
2017
+ }
2018
+ }
2011
2019
  if (jsonLdRecipe) {
2012
2020
  return { recipe: jsonLdRecipe, source: "jsonld" };
2013
2021
  }
2014
2022
  const microdataRecipe = extractMicrodata(html);
2015
- fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:12", message: "Microdata extraction result", data: { hasMicrodata: !!microdataRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "D" }) }).catch(() => {
2016
- });
2023
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
2024
+ try {
2025
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
2026
+ if (globalFetch) {
2027
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:12", message: "Microdata extraction result", data: { hasMicrodata: !!microdataRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "D" }) }).catch(() => {
2028
+ });
2029
+ }
2030
+ } catch {
2031
+ }
2032
+ }
2017
2033
  if (microdataRecipe) {
2018
2034
  return { recipe: microdataRecipe, source: "microdata" };
2019
2035
  }
@@ -2022,20 +2038,52 @@ function extractRecipe(html) {
2022
2038
 
2023
2039
  // src/scraper/index.ts
2024
2040
  async function scrapeRecipe(url, options = {}) {
2025
- fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:7", message: "scrapeRecipe entry", data: { url, hasOptions: !!options }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,B,C,D,E" }) }).catch(() => {
2026
- });
2041
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
2042
+ try {
2043
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
2044
+ if (globalFetch) {
2045
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:7", message: "scrapeRecipe entry", data: { url, hasOptions: !!options }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,B,C,D,E" }) }).catch(() => {
2046
+ });
2047
+ }
2048
+ } catch {
2049
+ }
2050
+ }
2027
2051
  const html = await fetchPage(url, options);
2028
- fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:9", message: "HTML fetched", data: { htmlLength: html?.length, htmlPreview: html?.substring(0, 200) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
2029
- });
2052
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
2053
+ try {
2054
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
2055
+ if (globalFetch) {
2056
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:9", message: "HTML fetched", data: { htmlLength: html?.length, htmlPreview: html?.substring(0, 200) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
2057
+ });
2058
+ }
2059
+ } catch {
2060
+ }
2061
+ }
2030
2062
  const { recipe } = extractRecipe(html);
2031
- fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:11", message: "extractRecipe result", data: { hasRecipe: !!recipe, recipeType: recipe?.["@type"], recipeName: recipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C,D" }) }).catch(() => {
2032
- });
2063
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
2064
+ try {
2065
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
2066
+ if (globalFetch) {
2067
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:11", message: "extractRecipe result", data: { hasRecipe: !!recipe, recipeType: recipe?.["@type"], recipeName: recipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C,D" }) }).catch(() => {
2068
+ });
2069
+ }
2070
+ } catch {
2071
+ }
2072
+ }
2033
2073
  if (!recipe) {
2034
2074
  throw new Error("No Schema.org recipe data found in page");
2035
2075
  }
2036
2076
  const soustackRecipe = fromSchemaOrg(recipe);
2037
- fetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:17", message: "fromSchemaOrg result", data: { hasSoustackRecipe: !!soustackRecipe, soustackRecipeName: soustackRecipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
2038
- });
2077
+ if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
2078
+ try {
2079
+ const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
2080
+ if (globalFetch) {
2081
+ globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:17", message: "fromSchemaOrg result", data: { hasSoustackRecipe: !!soustackRecipe, soustackRecipeName: soustackRecipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
2082
+ });
2083
+ }
2084
+ } catch {
2085
+ }
2086
+ }
2039
2087
  if (!soustackRecipe) {
2040
2088
  throw new Error("Schema.org data did not include a valid recipe");
2041
2089
  }