@purepageio/fetch-engines 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -25,7 +25,7 @@ Fetch websites with confidence. `@purepageio/fetch-engines` gives teams an HTTP-
|
|
|
25
25
|
|
|
26
26
|
- **One API for multiple strategies** – Call `fetchHTML` for rendered pages or `fetchContent` for raw responses. The library handles HTTP shortcuts and Playwright fallbacks automatically.
|
|
27
27
|
- **Production-minded defaults** – Retries, caching, and consistent telemetry are ready out of the box.
|
|
28
|
-
- **Drop-in AI enrichment** – Provide a Zod schema and let OpenAI convert full pages into structured data.
|
|
28
|
+
- **Drop-in AI enrichment** – Provide a Zod schema and let OpenAI (or any OpenAI-compatible API) convert full pages into structured data.
|
|
29
29
|
- **Typed and tested** – Built in TypeScript with examples that mirror real-world scraping pipelines.
|
|
30
30
|
|
|
31
31
|
## Installation
|
|
@@ -80,7 +80,21 @@ const result = await fetchStructuredContent("https://example.com/article", schem
|
|
|
80
80
|
console.log(result.data.summary);
|
|
81
81
|
```
|
|
82
82
|
|
|
83
|
-
Set `OPENAI_API_KEY` before running structured helpers.
|
|
83
|
+
Set `OPENAI_API_KEY` before running structured helpers, or use `apiConfig` to connect to OpenAI-compatible APIs like OpenRouter:
|
|
84
|
+
|
|
85
|
+
```typescript
|
|
86
|
+
const result = await fetchStructuredContent("https://example.com/article", schema, {
|
|
87
|
+
model: "anthropic/claude-3.5-sonnet",
|
|
88
|
+
apiConfig: {
|
|
89
|
+
apiKey: process.env.OPENROUTER_API_KEY,
|
|
90
|
+
baseURL: "https://openrouter.ai/api/v1",
|
|
91
|
+
headers: {
|
|
92
|
+
"HTTP-Referer": "https://your-app.com",
|
|
93
|
+
"X-Title": "Your App Name",
|
|
94
|
+
},
|
|
95
|
+
},
|
|
96
|
+
});
|
|
97
|
+
```
|
|
84
98
|
|
|
85
99
|
## Configuration
|
|
86
100
|
|
|
@@ -1,15 +1,28 @@
|
|
|
1
1
|
import type { z } from "zod";
|
|
2
2
|
import type { PlaywrightEngineConfig } from "./types.js";
|
|
3
|
+
/**
|
|
4
|
+
* Configuration for OpenAI-compatible API providers
|
|
5
|
+
*/
|
|
6
|
+
export interface ApiConfig {
|
|
7
|
+
/** API key for the provider. Defaults to OPENAI_API_KEY environment variable */
|
|
8
|
+
apiKey?: string;
|
|
9
|
+
/** Base URL for the API. Use this for OpenAI-compatible APIs like OpenRouter */
|
|
10
|
+
baseURL?: string;
|
|
11
|
+
/** Custom headers to include in API requests */
|
|
12
|
+
headers?: Record<string, string>;
|
|
13
|
+
}
|
|
3
14
|
/**
|
|
4
15
|
* Configuration options for structured content fetching
|
|
5
16
|
*/
|
|
6
17
|
export interface StructuredContentOptions {
|
|
7
|
-
/**
|
|
8
|
-
model?:
|
|
18
|
+
/** Model to use. Can be any model name supported by your API provider (e.g., 'gpt-4.1-mini', 'gpt-4.1', 'gpt-5', 'gpt-5-mini', or OpenRouter model names) */
|
|
19
|
+
model?: string;
|
|
9
20
|
/** Custom prompt to provide additional context to the LLM */
|
|
10
21
|
customPrompt?: string;
|
|
11
22
|
/** HybridEngine configuration for content fetching */
|
|
12
23
|
engineConfig?: PlaywrightEngineConfig;
|
|
24
|
+
/** API configuration for OpenAI-compatible providers (OpenRouter, etc.) */
|
|
25
|
+
apiConfig?: ApiConfig;
|
|
13
26
|
}
|
|
14
27
|
/**
|
|
15
28
|
* Result of structured content extraction
|
|
@@ -43,7 +56,7 @@ export declare class StructuredContentEngine {
|
|
|
43
56
|
* @param schema Zod schema defining the structure of data to extract
|
|
44
57
|
* @param options Additional options for the extraction process
|
|
45
58
|
* @returns Promise resolving to structured data and metadata
|
|
46
|
-
* @throws Error if OPENAI_API_KEY is not set or if extraction fails
|
|
59
|
+
* @throws Error if API key is not set or if extraction fails
|
|
47
60
|
*/
|
|
48
61
|
fetchStructuredContent<T>(url: string, schema: z.ZodSchema<T>, options?: StructuredContentOptions): Promise<StructuredContentResult<T>>;
|
|
49
62
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"StructuredContentEngine.d.ts","sourceRoot":"","sources":["../src/StructuredContentEngine.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAE7B,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,YAAY,CAAC;AAEzD;;GAEG;AACH,MAAM,WAAW,
|
|
1
|
+
{"version":3,"file":"StructuredContentEngine.d.ts","sourceRoot":"","sources":["../src/StructuredContentEngine.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAE7B,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,YAAY,CAAC;AAEzD;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,gFAAgF;IAChF,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,gFAAgF;IAChF,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,gDAAgD;IAChD,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAClC;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,6JAA6J;IAC7J,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,6DAA6D;IAC7D,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,sDAAsD;IACtD,YAAY,CAAC,EAAE,sBAAsB,CAAC;IACtC,2EAA2E;IAC3E,SAAS,CAAC,EAAE,SAAS,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,uBAAuB,CAAC,CAAC;IACxC,qDAAqD;IACrD,IAAI,EAAE,CAAC,CAAC;IACR,uDAAuD;IACvD,QAAQ,EAAE,MAAM,CAAC;IACjB,iCAAiC;IACjC,GAAG,EAAE,MAAM,CAAC;IACZ,yCAAyC;IACzC,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,8BAA8B;IAC9B,KAAK,EAAE;QACL,YAAY,EAAE,MAAM,CAAC;QACrB,gBAAgB,EAAE,MAAM,CAAC;QACzB,WAAW,EAAE,MAAM,CAAC;KACrB,CAAC;CACH;AAED;;GAEG;AACH,qBAAa,uBAAuB;IAClC,OAAO,CAAC,YAAY,CAAe;gBAEvB,MAAM,GAAE,sBAA2B;IAQ/C;;;;;;;;OAQG;IACG,sBAAsB,CAAC,CAAC,EAC5B,GAAG,EAAE,MAAM,EACX,MAAM,EAAE,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,EACtB,OAAO,GAAE,wBAA6B,GACrC,OAAO,CAAC,uBAAuB,CAAC,CAAC,CAAC,CAAC;IA0DtC;;OAEG;IACH,OAAO,CAAC,cAAc;IAiBtB;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAG/B;AAED;;;;;;;GAOG;AACH,wBAAsB,sBAAsB,CAAC,CAAC,EAC5C,GAAG,EAAE,MAAM,EACX,MAAM,EAAE,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,EACtB,OAAO,GAAE,wBAA6B,GACrC,OAAO,CAAC,uBAAuB,CAAC,CAAC,CAAC,CAAC,CAOrC"}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { generateObject } from "ai";
|
|
2
|
-
import { openai } from "@ai-sdk/openai";
|
|
2
|
+
import { createOpenAI } from "@ai-sdk/openai";
|
|
3
3
|
import { HybridEngine } from "./HybridEngine.js";
|
|
4
4
|
/**
|
|
5
5
|
* Engine for fetching web content and extracting structured data using AI
|
|
@@ -20,15 +20,14 @@ export class StructuredContentEngine {
|
|
|
20
20
|
* @param schema Zod schema defining the structure of data to extract
|
|
21
21
|
* @param options Additional options for the extraction process
|
|
22
22
|
* @returns Promise resolving to structured data and metadata
|
|
23
|
-
* @throws Error if OPENAI_API_KEY is not set or if extraction fails
|
|
23
|
+
* @throws Error if API key is not set or if extraction fails
|
|
24
24
|
*/
|
|
25
25
|
async fetchStructuredContent(url, schema, options = {}) {
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
26
|
+
const { model = "gpt-5-mini", customPrompt = "", engineConfig = {}, apiConfig = {} } = options;
|
|
27
|
+
const apiKey = apiConfig.apiKey ?? process.env.OPENAI_API_KEY;
|
|
28
|
+
if (!apiKey) {
|
|
29
|
+
throw new Error("API key is required for structured content extraction. Provide it via apiConfig.apiKey or set OPENAI_API_KEY environment variable");
|
|
29
30
|
}
|
|
30
|
-
const { model = "gpt-5-mini", customPrompt = "", engineConfig = {} } = options;
|
|
31
|
-
// Fetch content using HybridEngine with markdown enabled
|
|
32
31
|
const result = await this.hybridEngine.fetchHTML(url, {
|
|
33
32
|
markdown: true,
|
|
34
33
|
...engineConfig,
|
|
@@ -36,17 +35,19 @@ export class StructuredContentEngine {
|
|
|
36
35
|
if (result.contentType !== "markdown") {
|
|
37
36
|
throw new Error("Failed to convert content to markdown");
|
|
38
37
|
}
|
|
39
|
-
// Prepare the prompt for the LLM
|
|
40
38
|
const systemPrompt = `You are an expert at extracting structured data from web content.
|
|
41
39
|
Extract the requested information from the provided markdown content accurately and completely.
|
|
42
40
|
${customPrompt ? `\nAdditional context: ${customPrompt}` : ""}
|
|
43
41
|
|
|
44
42
|
Content to analyze:
|
|
45
43
|
${result.content}`;
|
|
46
|
-
// Configure model-specific options
|
|
47
44
|
const modelConfig = this.getModelConfig(model);
|
|
45
|
+
const openai = createOpenAI({
|
|
46
|
+
apiKey,
|
|
47
|
+
...(apiConfig.baseURL && { baseURL: apiConfig.baseURL }),
|
|
48
|
+
...(apiConfig.headers && { headers: apiConfig.headers }),
|
|
49
|
+
});
|
|
48
50
|
try {
|
|
49
|
-
// Generate structured object using AI SDK
|
|
50
51
|
const aiResult = await generateObject({
|
|
51
52
|
model: openai(model),
|
|
52
53
|
schema,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"StructuredContentEngine.js","sourceRoot":"","sources":["../src/StructuredContentEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,IAAI,CAAC;AACpC,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"StructuredContentEngine.js","sourceRoot":"","sources":["../src/StructuredContentEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,IAAI,CAAC;AACpC,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAE9C,OAAO,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAiDjD;;GAEG;AACH,MAAM,OAAO,uBAAuB;IAC1B,YAAY,CAAe;IAEnC,YAAY,SAAiC,EAAE;QAC7C,2DAA2D;QAC3D,IAAI,CAAC,YAAY,GAAG,IAAI,YAAY,CAAC;YACnC,GAAG,MAAM;YACT,QAAQ,EAAE,IAAI;SACf,CAAC,CAAC;IACL,CAAC;IAED;;;;;;;;OAQG;IACH,KAAK,CAAC,sBAAsB,CAC1B,GAAW,EACX,MAAsB,EACtB,UAAoC,EAAE;QAEtC,MAAM,EAAE,KAAK,GAAG,YAAY,EAAE,YAAY,GAAG,EAAE,EAAE,YAAY,GAAG,EAAE,EAAE,SAAS,GAAG,EAAE,EAAE,GAAG,OAAO,CAAC;QAE/F,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;QAC9D,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CACb,mIAAmI,CACpI,CAAC;QACJ,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,GAAG,EAAE;YACpD,QAAQ,EAAE,IAAI;YACd,GAAG,YAAY;SAChB,CAAC,CAAC;QAEH,IAAI,MAAM,CAAC,WAAW,KAAK,UAAU,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,YAAY,GAAG;;EAEvB,YAAY,CAAC,CAAC,CAAC,yBAAyB,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE;;;EAG3D,MAAM,CAAC,OAAO,EAAE,CAAC;QAEf,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;QAE/C,MAAM,MAAM,GAAG,YAAY,CAAC;YAC1B,MAAM;YACN,GAAG,CAAC,SAAS,CAAC,OAAO,IAAI,EAAE,OAAO,EAAE,SAAS,CAAC,OAAO,EAAE,CAAC;YACxD,GAAG,CAAC,SAAS,CAAC,OAAO,IAAI,EAAE,OAAO,EAAE,SAAS,CAAC,OAAO,EAAE,CAAC;SACzD,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC;gBACpC,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC;gBACpB,MAAM;gBACN,MAAM,EAAE,YAAY;gBACpB,GAAG,WAAW;aACf,CAAC,CAAC;YAEH,OAAO;gBACL,IAAI,EAAE,QAAQ,CAAC,MAAM;gBACrB,QAAQ,EAAE,MAAM,CAAC,OAAO;gBACxB,GAAG,EAAE,MAAM,CAAC,GAAG;gBACf,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,KAAK,EAAE;oBACL,YAAY,EAAG,QAAQ,CAAC,KAAa,EAAE,YAAY,IAAI,CAAC;oBACxD,gBAAgB,EAAG,QAAQ,CAAC,KAAa,EAAE,gBAAgB,IAAI,CAAC;oBAChE,WAAW,EAAG,QAAQ,CAAC,KAAa,EAAE,WAAW,IAAI,CAAC;iBACvD;aACF,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,sCAAsC,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,M
AAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAClH,CAAC;IACH,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,KAAa;QAClC,IAAI,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;YAC9B,OAAO;gBACL,eAAe,EAAE;oBACf,MAAM,EAAE;wBACN,gBAAgB,EAAE,KAAK;qBACxB;iBACF;aACF,CAAC;QACJ,CAAC;aAAM,IAAI,KAAK,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;YACvC,OAAO;gBACL,WAAW,EAAE,CAAC;aACf,CAAC;QACJ,CAAC;QACD,OAAO,EAAE,CAAC;IACZ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,MAAM,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,CAAC;IACpC,CAAC;CACF;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,sBAAsB,CAC1C,GAAW,EACX,MAAsB,EACtB,UAAoC,EAAE;IAEtC,MAAM,MAAM,GAAG,IAAI,uBAAuB,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;IACjE,IAAI,CAAC;QACH,OAAO,MAAM,MAAM,CAAC,sBAAsB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;IACnE,CAAC;YAAS,CAAC;QACT,MAAM,MAAM,CAAC,OAAO,EAAE,CAAC;IACzB,CAAC;AACH,CAAC"}
|
package/package.json
CHANGED