soustack 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -6
- package/dist/cli/index.js +60 -12
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.mts +55 -6
- package/dist/index.d.ts +55 -6
- package/dist/index.js +65 -12
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +65 -13
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -46,9 +46,10 @@ npm install soustack
|
|
|
46
46
|
- **Schema.org Conversion**:
|
|
47
47
|
- `fromSchemaOrg()` (Schema.org JSON-LD → Soustack)
|
|
48
48
|
- `toSchemaOrg()` (Soustack → Schema.org JSON-LD)
|
|
49
|
-
- **Web Scraping**:
|
|
49
|
+
- **Web Scraping**:
|
|
50
50
|
- `scrapeRecipe()` fetches a recipe page and extracts Schema.org recipe data (Node.js only)
|
|
51
|
-
- `extractRecipeFromHTML()` extracts recipe data from HTML string (browser & Node.js compatible)
|
|
51
|
+
- `extractRecipeFromHTML()` extracts recipe data from HTML string, returns Soustack format (browser & Node.js compatible)
|
|
52
|
+
- `extractSchemaOrgRecipeFromHTML()` extracts raw Schema.org recipe data from HTML string (browser & Node.js compatible)
|
|
52
53
|
- Supports JSON-LD (`<script type="application/ld+json">`) and Microdata (`itemscope/itemtype`)
|
|
53
54
|
|
|
54
55
|
## Programmatic Usage
|
|
@@ -57,6 +58,7 @@ npm install soustack
|
|
|
57
58
|
import {
|
|
58
59
|
scrapeRecipe,
|
|
59
60
|
extractRecipeFromHTML,
|
|
61
|
+
extractSchemaOrgRecipeFromHTML,
|
|
60
62
|
fromSchemaOrg,
|
|
61
63
|
toSchemaOrg,
|
|
62
64
|
validateRecipe,
|
|
@@ -73,9 +75,16 @@ const computed = scaleRecipe(recipe, 2);
|
|
|
73
75
|
const scraped = await scrapeRecipe('https://example.com/recipe');
|
|
74
76
|
|
|
75
77
|
// Extract recipe from HTML string (browser & Node.js compatible)
|
|
78
|
+
// Option 1: Get Soustack format directly
|
|
76
79
|
const html = await fetch('https://example.com/recipe').then((r) => r.text());
|
|
77
80
|
const recipe = extractRecipeFromHTML(html);
|
|
78
81
|
|
|
82
|
+
// Option 2: Get Schema.org format first (for inspection/modification)
|
|
83
|
+
const schemaOrgRecipe = extractSchemaOrgRecipeFromHTML(html);
|
|
84
|
+
if (schemaOrgRecipe) {
|
|
85
|
+
const soustackRecipe = fromSchemaOrg(schemaOrgRecipe);
|
|
86
|
+
}
|
|
87
|
+
|
|
79
88
|
// Convert Schema.org → Soustack
|
|
80
89
|
const soustack = fromSchemaOrg(schemaOrgJsonLd);
|
|
81
90
|
|
|
@@ -115,9 +124,11 @@ const recipe = await scrapeRecipe('https://example.com/recipe', {
|
|
|
115
124
|
});
|
|
116
125
|
```
|
|
117
126
|
|
|
118
|
-
### Browser: `extractRecipeFromHTML()`
|
|
127
|
+
### Browser: `extractRecipeFromHTML()` and `extractSchemaOrgRecipeFromHTML()`
|
|
128
|
+
|
|
129
|
+
#### `extractRecipeFromHTML()` - Returns Soustack Format
|
|
119
130
|
|
|
120
|
-
`extractRecipeFromHTML(html)` extracts recipe data from an HTML string. **Works in both browser and Node.js**. Perfect for browser usage where you fetch HTML yourself (with cookies/session for authenticated content).
|
|
131
|
+
`extractRecipeFromHTML(html)` extracts recipe data from an HTML string and returns it in Soustack format. **Works in both browser and Node.js**. Perfect for browser usage where you fetch HTML yourself (with cookies/session for authenticated content).
|
|
121
132
|
|
|
122
133
|
```ts
|
|
123
134
|
import { extractRecipeFromHTML } from 'soustack';
|
|
@@ -125,15 +136,39 @@ import { extractRecipeFromHTML } from 'soustack';
|
|
|
125
136
|
// In browser: fetch HTML yourself (bypasses CORS, uses your cookies/session)
|
|
126
137
|
const response = await fetch('https://example.com/recipe');
|
|
127
138
|
const html = await response.text();
|
|
128
|
-
const recipe = extractRecipeFromHTML(html);
|
|
139
|
+
const recipe = extractRecipeFromHTML(html); // Already in Soustack format
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
#### `extractSchemaOrgRecipeFromHTML()` - Returns Schema.org Format
|
|
143
|
+
|
|
144
|
+
`extractSchemaOrgRecipeFromHTML(html)` extracts the raw Schema.org recipe data from HTML. Returns `null` if no recipe is found. Use this when you need to inspect, debug, or modify the Schema.org data before converting to Soustack format.
|
|
145
|
+
|
|
146
|
+
```ts
|
|
147
|
+
import { extractSchemaOrgRecipeFromHTML, fromSchemaOrg } from 'soustack';
|
|
148
|
+
|
|
149
|
+
// In browser: fetch HTML yourself
|
|
150
|
+
const response = await fetch('https://example.com/recipe');
|
|
151
|
+
const html = await response.text();
|
|
152
|
+
|
|
153
|
+
// Extract Schema.org format (for inspection/modification)
|
|
154
|
+
const schemaOrgRecipe = extractSchemaOrgRecipeFromHTML(html);
|
|
155
|
+
|
|
156
|
+
if (schemaOrgRecipe) {
|
|
157
|
+
// Inspect or modify Schema.org data before converting
|
|
158
|
+
console.log('Found recipe:', schemaOrgRecipe.name);
|
|
159
|
+
|
|
160
|
+
// Convert to Soustack format when ready
|
|
161
|
+
const soustackRecipe = fromSchemaOrg(schemaOrgRecipe);
|
|
162
|
+
}
|
|
129
163
|
```
|
|
130
164
|
|
|
131
|
-
**Why use `extractRecipeFromHTML()` in browsers?**
|
|
165
|
+
**Why use these functions in browsers?**
|
|
132
166
|
|
|
133
167
|
- ✅ No CORS issues — you fetch HTML yourself
|
|
134
168
|
- ✅ Works with authenticated/paywalled content — uses browser cookies
|
|
135
169
|
- ✅ Smaller bundle — no Node.js dependencies
|
|
136
170
|
- ✅ Universal — works in both browser and Node.js environments
|
|
171
|
+
- ✅ Flexible — choose Schema.org format for inspection/modification, or Soustack format for direct use
|
|
137
172
|
|
|
138
173
|
### CLI
|
|
139
174
|
|
package/dist/cli/index.js
CHANGED
|
@@ -2006,14 +2006,30 @@ function extractRecipe(html) {
|
|
|
2006
2006
|
return extractRecipeBrowser(html);
|
|
2007
2007
|
}
|
|
2008
2008
|
const jsonLdRecipe = extractJsonLd(html);
|
|
2009
|
-
|
|
2010
|
-
|
|
2009
|
+
if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
|
|
2010
|
+
try {
|
|
2011
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2012
|
+
if (globalFetch) {
|
|
2013
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:6", message: "JSON-LD extraction result", data: { hasJsonLd: !!jsonLdRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "C,D" }) }).catch(() => {
|
|
2014
|
+
});
|
|
2015
|
+
}
|
|
2016
|
+
} catch {
|
|
2017
|
+
}
|
|
2018
|
+
}
|
|
2011
2019
|
if (jsonLdRecipe) {
|
|
2012
2020
|
return { recipe: jsonLdRecipe, source: "jsonld" };
|
|
2013
2021
|
}
|
|
2014
2022
|
const microdataRecipe = extractMicrodata(html);
|
|
2015
|
-
|
|
2016
|
-
|
|
2023
|
+
if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
|
|
2024
|
+
try {
|
|
2025
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2026
|
+
if (globalFetch) {
|
|
2027
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/extractors/index.ts:12", message: "Microdata extraction result", data: { hasMicrodata: !!microdataRecipe }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "D" }) }).catch(() => {
|
|
2028
|
+
});
|
|
2029
|
+
}
|
|
2030
|
+
} catch {
|
|
2031
|
+
}
|
|
2032
|
+
}
|
|
2017
2033
|
if (microdataRecipe) {
|
|
2018
2034
|
return { recipe: microdataRecipe, source: "microdata" };
|
|
2019
2035
|
}
|
|
@@ -2022,20 +2038,52 @@ function extractRecipe(html) {
|
|
|
2022
2038
|
|
|
2023
2039
|
// src/scraper/index.ts
|
|
2024
2040
|
async function scrapeRecipe(url, options = {}) {
|
|
2025
|
-
|
|
2026
|
-
|
|
2041
|
+
if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
|
|
2042
|
+
try {
|
|
2043
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2044
|
+
if (globalFetch) {
|
|
2045
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:7", message: "scrapeRecipe entry", data: { url, hasOptions: !!options }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,B,C,D,E" }) }).catch(() => {
|
|
2046
|
+
});
|
|
2047
|
+
}
|
|
2048
|
+
} catch {
|
|
2049
|
+
}
|
|
2050
|
+
}
|
|
2027
2051
|
const html = await fetchPage(url, options);
|
|
2028
|
-
|
|
2029
|
-
|
|
2052
|
+
if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
|
|
2053
|
+
try {
|
|
2054
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2055
|
+
if (globalFetch) {
|
|
2056
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:9", message: "HTML fetched", data: { htmlLength: html?.length, htmlPreview: html?.substring(0, 200) }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "B" }) }).catch(() => {
|
|
2057
|
+
});
|
|
2058
|
+
}
|
|
2059
|
+
} catch {
|
|
2060
|
+
}
|
|
2061
|
+
}
|
|
2030
2062
|
const { recipe } = extractRecipe(html);
|
|
2031
|
-
|
|
2032
|
-
|
|
2063
|
+
if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
|
|
2064
|
+
try {
|
|
2065
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2066
|
+
if (globalFetch) {
|
|
2067
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:11", message: "extractRecipe result", data: { hasRecipe: !!recipe, recipeType: recipe?.["@type"], recipeName: recipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A,C,D" }) }).catch(() => {
|
|
2068
|
+
});
|
|
2069
|
+
}
|
|
2070
|
+
} catch {
|
|
2071
|
+
}
|
|
2072
|
+
}
|
|
2033
2073
|
if (!recipe) {
|
|
2034
2074
|
throw new Error("No Schema.org recipe data found in page");
|
|
2035
2075
|
}
|
|
2036
2076
|
const soustackRecipe = fromSchemaOrg(recipe);
|
|
2037
|
-
|
|
2038
|
-
|
|
2077
|
+
if (typeof process === "undefined" || process.env.NODE_ENV !== "test") {
|
|
2078
|
+
try {
|
|
2079
|
+
const globalFetch = typeof globalThis !== "undefined" && typeof globalThis.fetch !== "undefined" ? globalThis.fetch : null;
|
|
2080
|
+
if (globalFetch) {
|
|
2081
|
+
globalFetch("http://127.0.0.1:7243/ingest/7225c3b5-9ac2-4c94-b561-807ca9003b66", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ location: "scraper/index.ts:17", message: "fromSchemaOrg result", data: { hasSoustackRecipe: !!soustackRecipe, soustackRecipeName: soustackRecipe?.name }, timestamp: Date.now(), sessionId: "debug-session", runId: "run1", hypothesisId: "A" }) }).catch(() => {
|
|
2082
|
+
});
|
|
2083
|
+
}
|
|
2084
|
+
} catch {
|
|
2085
|
+
}
|
|
2086
|
+
}
|
|
2039
2087
|
if (!soustackRecipe) {
|
|
2040
2088
|
throw new Error("Schema.org data did not include a valid recipe");
|
|
2041
2089
|
}
|