@graphext/cuery 0.9.5 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/browser.d.ts +1 -1
- package/esm/browser.js +1 -1
- package/esm/mod.d.ts +3 -3
- package/esm/mod.d.ts.map +1 -1
- package/esm/mod.js +3 -3
- package/esm/src/apis/brightdata/contentScraper/index.d.ts.map +1 -0
- package/{script/src/apis/brightdata → esm/src/apis/brightdata/contentScraper}/scrape.d.ts +1 -1
- package/esm/src/apis/brightdata/contentScraper/scrape.d.ts.map +1 -0
- package/esm/src/apis/brightdata/{scrape.js → contentScraper/scrape.js} +2 -2
- package/esm/src/apis/brightdata/llmScraper/brightdata.d.ts +20 -0
- package/esm/src/apis/brightdata/llmScraper/brightdata.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/brightdata.js +182 -0
- package/esm/src/apis/brightdata/llmScraper/index.d.ts +14 -0
- package/esm/src/apis/brightdata/llmScraper/index.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/index.js +97 -0
- package/esm/src/apis/brightdata/llmScraper/oxy.d.ts +16 -0
- package/esm/src/apis/brightdata/llmScraper/oxy.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/oxy.js +171 -0
- package/{script/src/apis/chatgptScraper/scraper.d.ts → esm/src/apis/brightdata/llmScraper/scrape.d.ts} +12 -15
- package/esm/src/apis/brightdata/llmScraper/scrape.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/scrape.js +184 -0
- package/esm/src/apis/hasdata/helpers.d.ts.map +1 -1
- package/esm/src/apis/hasdata/helpers.js +56 -18
- package/esm/src/schemas/search.schema.d.ts +2 -2
- package/esm/src/schemas/search.schema.d.ts.map +1 -1
- package/esm/src/schemas/sources.schema.d.ts +1 -4
- package/esm/src/schemas/sources.schema.d.ts.map +1 -1
- package/package.json +1 -1
- package/script/browser.d.ts +1 -1
- package/script/browser.js +1 -1
- package/script/mod.d.ts +3 -3
- package/script/mod.d.ts.map +1 -1
- package/script/mod.js +6 -6
- package/script/src/apis/brightdata/contentScraper/index.d.ts.map +1 -0
- package/{esm/src/apis/brightdata → script/src/apis/brightdata/contentScraper}/scrape.d.ts +1 -1
- package/script/src/apis/brightdata/contentScraper/scrape.d.ts.map +1 -0
- package/script/src/apis/brightdata/{scrape.js → contentScraper/scrape.js} +2 -2
- package/script/src/apis/brightdata/llmScraper/brightdata.d.ts +20 -0
- package/script/src/apis/brightdata/llmScraper/brightdata.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/brightdata.js +219 -0
- package/script/src/apis/brightdata/llmScraper/index.d.ts +14 -0
- package/script/src/apis/brightdata/llmScraper/index.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/index.js +140 -0
- package/script/src/apis/brightdata/llmScraper/oxy.d.ts +16 -0
- package/script/src/apis/brightdata/llmScraper/oxy.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/oxy.js +208 -0
- package/{esm/src/apis/chatgptScraper/scraper.d.ts → script/src/apis/brightdata/llmScraper/scrape.d.ts} +12 -15
- package/script/src/apis/brightdata/llmScraper/scrape.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/scrape.js +224 -0
- package/script/src/apis/hasdata/helpers.d.ts.map +1 -1
- package/script/src/apis/hasdata/helpers.js +56 -18
- package/script/src/schemas/search.schema.d.ts +2 -2
- package/script/src/schemas/search.schema.d.ts.map +1 -1
- package/script/src/schemas/sources.schema.d.ts +1 -4
- package/script/src/schemas/sources.schema.d.ts.map +1 -1
- package/esm/src/apis/brightdata/index.d.ts.map +0 -1
- package/esm/src/apis/brightdata/scrape.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/brightdata.d.ts +0 -3
- package/esm/src/apis/chatgptScraper/brightdata.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/brightdata.js +0 -172
- package/esm/src/apis/chatgptScraper/index.d.ts +0 -10
- package/esm/src/apis/chatgptScraper/index.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/index.js +0 -41
- package/esm/src/apis/chatgptScraper/oxy.d.ts +0 -3
- package/esm/src/apis/chatgptScraper/oxy.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/oxy.js +0 -156
- package/esm/src/apis/chatgptScraper/scraper.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/scraper.js +0 -98
- package/script/src/apis/brightdata/index.d.ts.map +0 -1
- package/script/src/apis/brightdata/scrape.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/brightdata.d.ts +0 -3
- package/script/src/apis/chatgptScraper/brightdata.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/brightdata.js +0 -208
- package/script/src/apis/chatgptScraper/index.d.ts +0 -10
- package/script/src/apis/chatgptScraper/index.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/index.js +0 -81
- package/script/src/apis/chatgptScraper/oxy.d.ts +0 -3
- package/script/src/apis/chatgptScraper/oxy.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/oxy.js +0 -192
- package/script/src/apis/chatgptScraper/scraper.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/scraper.js +0 -139
- /package/esm/src/apis/brightdata/{index.d.ts → contentScraper/index.d.ts} +0 -0
- /package/esm/src/apis/brightdata/{index.js → contentScraper/index.js} +0 -0
- /package/script/src/apis/brightdata/{index.d.ts → contentScraper/index.d.ts} +0 -0
- /package/script/src/apis/brightdata/{index.js → contentScraper/index.js} +0 -0
package/esm/browser.d.ts
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Browser-safe exports for @graphext/cuery
|
|
3
3
|
*
|
|
4
4
|
* This module only exports types and pure functions that can safely run in the browser.
|
|
5
|
-
* It excludes server-only modules like
|
|
5
|
+
* It excludes server-only modules like llmScraper, googleAds, and API functions
|
|
6
6
|
* that depend on Node.js or Deno-specific APIs.
|
|
7
7
|
*
|
|
8
8
|
* @module
|
package/esm/browser.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Browser-safe exports for @graphext/cuery
|
|
3
3
|
*
|
|
4
4
|
* This module only exports types and pure functions that can safely run in the browser.
|
|
5
|
-
* It excludes server-only modules like
|
|
5
|
+
* It excludes server-only modules like llmScraper, googleAds, and API functions
|
|
6
6
|
* that depend on Node.js or Deno-specific APIs.
|
|
7
7
|
*
|
|
8
8
|
* @module
|
package/esm/mod.d.ts
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
*/
|
|
6
6
|
export * from './src/llm.js';
|
|
7
7
|
export { BatchResponse } from './src/response.js';
|
|
8
|
-
export {
|
|
8
|
+
export { calculateCost, getModelInfo, getModelPricing, getProvider, getProviderForModel, type ModelInfo, type ModelPricing, } from './src/providers/index.js';
|
|
9
9
|
export * from './src/tools/keywords.js';
|
|
10
10
|
export * from './src/tools/classifier.js';
|
|
11
11
|
export * from './src/tools/funnel.js';
|
|
@@ -26,8 +26,8 @@ export * from './src/tools/scorer.js';
|
|
|
26
26
|
export * from './src/helpers/seedKeywords.js';
|
|
27
27
|
export * from './src/tools/generic.js';
|
|
28
28
|
export * from './src/apis/hasdata/index.js';
|
|
29
|
-
export * from './src/apis/brightdata/index.js';
|
|
30
|
-
export * from './src/apis/
|
|
29
|
+
export * from './src/apis/brightdata/contentScraper/index.js';
|
|
30
|
+
export * from './src/apis/brightdata/llmScraper/index.js';
|
|
31
31
|
export * from './src/apis/googleAds/keywordPlanner.js';
|
|
32
32
|
export * from './src/schemas/index.js';
|
|
33
33
|
//# sourceMappingURL=mod.d.ts.map
|
package/esm/mod.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"mod.d.ts","sourceRoot":"","sources":["../src/mod.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,cAAc,cAAc,CAAC;AAC7B,OAAO,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAC;AAClD,OAAO,EACN,
|
|
1
|
+
{"version":3,"file":"mod.d.ts","sourceRoot":"","sources":["../src/mod.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,cAAc,cAAc,CAAC;AAC7B,OAAO,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAC;AAClD,OAAO,EACN,aAAa,EACb,YAAY,EACZ,eAAe,EACf,WAAW,EACX,mBAAmB,EACnB,KAAK,SAAS,EACd,KAAK,YAAY,GACjB,MAAM,0BAA0B,CAAC;AAGlC,cAAc,yBAAyB,CAAC;AACxC,cAAc,2BAA2B,CAAC;AAC1C,cAAc,uBAAuB,CAAC;AACtC,cAAc,yBAAyB,CAAC;AACxC,cAAc,uBAAuB,CAAC;AACtC,cAAc,uBAAuB,CAAC;AACtC,cAAc,wBAAwB,CAAC;AACvC,cAAc,wBAAwB,CAAC;AACvC,cAAc,uBAAuB,CAAC;AACtC,cAAc,uBAAuB,CAAC;AACtC,cAAc,0BAA0B,CAAC;AACzC,cAAc,0BAA0B,CAAC;AACzC,cAAc,0BAA0B,CAAC;AACzC,cAAc,wBAAwB,CAAC;AACvC,cAAc,yBAAyB,CAAC;AACxC,cAAc,wBAAwB,CAAC;AACvC,cAAc,uBAAuB,CAAC;AACtC,cAAc,+BAA+B,CAAC;AAC9C,cAAc,wBAAwB,CAAC;AACvC,cAAc,6BAA6B,CAAC;AAC5C,cAAc,+CAA+C,CAAC;AAC9D,cAAc,2CAA2C,CAAC;AAC1D,cAAc,wCAAwC,CAAC;AACvD,cAAc,wBAAwB,CAAC"}
|
package/esm/mod.js
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
// Core LLM interface and types
|
|
7
7
|
export * from './src/llm.js';
|
|
8
8
|
export { BatchResponse } from './src/response.js';
|
|
9
|
-
export {
|
|
9
|
+
export { calculateCost, getModelInfo, getModelPricing, getProvider, getProviderForModel, } from './src/providers/index.js';
|
|
10
10
|
// Tools
|
|
11
11
|
export * from './src/tools/keywords.js';
|
|
12
12
|
export * from './src/tools/classifier.js';
|
|
@@ -28,7 +28,7 @@ export * from './src/tools/scorer.js';
|
|
|
28
28
|
export * from './src/helpers/seedKeywords.js';
|
|
29
29
|
export * from './src/tools/generic.js';
|
|
30
30
|
export * from './src/apis/hasdata/index.js';
|
|
31
|
-
export * from './src/apis/brightdata/index.js';
|
|
32
|
-
export * from './src/apis/
|
|
31
|
+
export * from './src/apis/brightdata/contentScraper/index.js';
|
|
32
|
+
export * from './src/apis/brightdata/llmScraper/index.js';
|
|
33
33
|
export * from './src/apis/googleAds/keywordPlanner.js';
|
|
34
34
|
export * from './src/schemas/index.js';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/src/apis/brightdata/contentScraper/index.ts"],"names":[],"mappings":"AAAA,cAAc,aAAa,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrape.d.ts","sourceRoot":"","sources":["../../../../../src/src/apis/brightdata/contentScraper/scrape.ts"],"names":[],"mappings":"AACA,OAAO,EAA4B,KAAK,WAAW,EAAE,MAAM,2BAA2B,CAAC;AAavF,MAAM,WAAW,uBAAuB;IACpC,wDAAwD;IACxD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,6DAA6D;IAC7D,MAAM,CAAC,EAAE,KAAK,GAAG,MAAM,CAAC;IACxB,8DAA8D;IAC9D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,wDAAwD;IACxD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,oDAAoD;IACpD,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,kCAAkC;IAClC,WAAW,CAAC,EAAE,WAAW,CAAC;CAC7B;AAED,MAAM,WAAW,wBAAwB;IACrC,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,CAAC,EAAE,MAAM,CAAC;CACjB;AA4ED;;;GAGG;AACH,wBAAsB,gBAAgB,CAClC,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,uBAA4B,GACtC,OAAO,CAAC,wBAAwB,CAAC,CAYnC;AAED;;;GAGG;AACH,wBAAsB,qBAAqB,CACvC,IAAI,EAAE,KAAK,CAAC,MAAM,CAAC,EACnB,OAAO,GAAE,uBAA4B,EACrC,cAAc,GAAE,MAA+B,GAChD,OAAO,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC,CAQ1C"}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import * as dntShim from "
|
|
2
|
-
import { mapParallel, withRetries } from '
|
|
1
|
+
import * as dntShim from "../../../../_dnt.shims.js";
|
|
2
|
+
import { mapParallel, withRetries } from '../../../helpers/async.js';
|
|
3
3
|
const BRIGHTDATA_CONCURRENCY = 10;
|
|
4
4
|
const BRIGHTDATA_RETRY_CONFIG = {
|
|
5
5
|
maxRetries: 3,
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { type ProviderFunctions } from './scrape.js';
|
|
2
|
+
interface BrightdataProviderConfig {
|
|
3
|
+
apiBase: string;
|
|
4
|
+
datasetId: string;
|
|
5
|
+
outputFields: Array<string>;
|
|
6
|
+
extraFields?: Array<string>;
|
|
7
|
+
targetUrl: string;
|
|
8
|
+
extraInputs?: (params: {
|
|
9
|
+
prompt: string;
|
|
10
|
+
useSearch: boolean;
|
|
11
|
+
countryISOCode: string | null;
|
|
12
|
+
}) => Record<string, unknown>;
|
|
13
|
+
providerName: string;
|
|
14
|
+
maxConcurrency: number;
|
|
15
|
+
maxPromptsPerRequest: number;
|
|
16
|
+
}
|
|
17
|
+
export declare function createBrightdataProvider(overrides?: Partial<BrightdataProviderConfig>): ProviderFunctions;
|
|
18
|
+
export declare const brightdataProvider: ProviderFunctions;
|
|
19
|
+
export {};
|
|
20
|
+
//# sourceMappingURL=brightdata.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"brightdata.d.ts","sourceRoot":"","sources":["../../../../../src/src/apis/brightdata/llmScraper/brightdata.ts"],"names":[],"mappings":"AAcA,OAAO,EAA6C,KAAK,iBAAiB,EAAE,MAAM,aAAa,CAAC;AA4BhG,UAAU,wBAAwB;IACjC,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAC5B,WAAW,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAC5B,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,CACb,MAAM,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,OAAO,CAAC;QAAC,cAAc,EAAE,MAAM,GAAG,IAAI,CAAA;KAAE,KACzE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC7B,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,CAAC;IACvB,oBAAoB,EAAE,MAAM,CAAC;CAC7B;AA2DD,wBAAgB,wBAAwB,CACvC,SAAS,GAAE,OAAO,CAAC,wBAAwB,CAAM,GAC/C,iBAAiB,CAgJnB;AAMD,eAAO,MAAM,kBAAkB,EAAE,iBAA8C,CAAC"}
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
/* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
|
|
2
|
+
/**
|
|
3
|
+
* Brightdata LLM Scraper Provider.
|
|
4
|
+
*
|
|
5
|
+
* API Flow:
|
|
6
|
+
* 1. Trigger: POST to /datasets/v3/trigger → returns snapshot_id
|
|
7
|
+
* 2. Monitor: GET /datasets/v3/progress/{snapshot_id} until ready
|
|
8
|
+
* 3. Download: GET /datasets/v3/snapshot/{snapshot_id}
|
|
9
|
+
*/
|
|
10
|
+
import * as dntShim from "../../../../_dnt.shims.js";
|
|
11
|
+
import { sleep, withRetries } from '../../../helpers/async.js';
|
|
12
|
+
import { buildSources, cleanAnswer, getAbortSignal } from './scrape.js';
|
|
13
|
+
const DEFAULT_BRIGHTDATA_PROVIDER_CONFIG = {
|
|
14
|
+
apiBase: 'https://api.brightdata.com',
|
|
15
|
+
datasetId: 'gd_m7aof0k82r803d5bjm',
|
|
16
|
+
outputFields: [
|
|
17
|
+
'url',
|
|
18
|
+
'prompt',
|
|
19
|
+
'answer_text',
|
|
20
|
+
'answer_text_markdown',
|
|
21
|
+
'citations',
|
|
22
|
+
'links_attached',
|
|
23
|
+
'country',
|
|
24
|
+
'index',
|
|
25
|
+
],
|
|
26
|
+
targetUrl: 'http://chatgpt.com/',
|
|
27
|
+
providerName: 'Brightdata',
|
|
28
|
+
maxConcurrency: 50,
|
|
29
|
+
maxPromptsPerRequest: 1,
|
|
30
|
+
};
|
|
31
|
+
const TRIGGER_RETRY = {
|
|
32
|
+
maxRetries: 3,
|
|
33
|
+
initialDelay: 0,
|
|
34
|
+
statusCodes: [429, 500, 502, 503, 504],
|
|
35
|
+
};
|
|
36
|
+
const DOWNLOAD_RETRY = {
|
|
37
|
+
maxRetries: 5,
|
|
38
|
+
initialDelay: 2000,
|
|
39
|
+
statusCodes: [202, 500, 502, 503, 504],
|
|
40
|
+
};
|
|
41
|
+
const MONITOR_RETRY = {
|
|
42
|
+
maxRetries: 4,
|
|
43
|
+
initialDelay: 1000,
|
|
44
|
+
statusCodes: [408, 425, 429, 500, 502, 503, 504],
|
|
45
|
+
};
|
|
46
|
+
const MONITOR_RETRIABLE = new Set(MONITOR_RETRY.statusCodes ?? []);
|
|
47
|
+
const MAX_WAIT_MS = 600_000; // 10 minutes
|
|
48
|
+
const POLL_INTERVAL_MS = 5_000;
|
|
49
|
+
// ============================================================================
|
|
50
|
+
// API Key
|
|
51
|
+
// ============================================================================
|
|
52
|
+
function getApiKey() {
|
|
53
|
+
const apiKey = dntShim.Deno.env.get('BRIGHTDATA_API_KEY');
|
|
54
|
+
if (!apiKey) {
|
|
55
|
+
throw new Error('BRIGHTDATA_API_KEY environment variable is required');
|
|
56
|
+
}
|
|
57
|
+
return apiKey;
|
|
58
|
+
}
|
|
59
|
+
// ============================================================================
|
|
60
|
+
// Provider Functions
|
|
61
|
+
// ============================================================================
|
|
62
|
+
export function createBrightdataProvider(overrides = {}) {
|
|
63
|
+
const config = { ...DEFAULT_BRIGHTDATA_PROVIDER_CONFIG, ...overrides };
|
|
64
|
+
const customOutputFields = [...new Set([...(config.outputFields ?? []), ...(config.extraFields ?? [])])].join('|');
|
|
65
|
+
async function triggerJob(prompt, useSearch, countryISOCode) {
|
|
66
|
+
const apiKey = getApiKey();
|
|
67
|
+
const url = `${config.apiBase}/datasets/v3/trigger?dataset_id=${config.datasetId}&include_errors=true`;
|
|
68
|
+
const input = {
|
|
69
|
+
url: config.targetUrl,
|
|
70
|
+
prompt,
|
|
71
|
+
country: countryISOCode || '',
|
|
72
|
+
index: 0,
|
|
73
|
+
};
|
|
74
|
+
Object.assign(input, config.extraInputs?.({ prompt, useSearch, countryISOCode }) ?? {});
|
|
75
|
+
const body = {
|
|
76
|
+
custom_output_fields: customOutputFields,
|
|
77
|
+
input: [input],
|
|
78
|
+
};
|
|
79
|
+
try {
|
|
80
|
+
const response = await withRetries(() => fetch(url, {
|
|
81
|
+
method: 'POST',
|
|
82
|
+
headers: {
|
|
83
|
+
'Authorization': `Bearer ${apiKey}`,
|
|
84
|
+
'Content-Type': 'application/json',
|
|
85
|
+
},
|
|
86
|
+
body: JSON.stringify(body),
|
|
87
|
+
signal: getAbortSignal(),
|
|
88
|
+
}), TRIGGER_RETRY);
|
|
89
|
+
if (!response.ok) {
|
|
90
|
+
console.error(`[${config.providerName}] Trigger error: ${response.status}`);
|
|
91
|
+
return null;
|
|
92
|
+
}
|
|
93
|
+
const data = await response.json();
|
|
94
|
+
return data?.snapshot_id || null;
|
|
95
|
+
}
|
|
96
|
+
catch (error) {
|
|
97
|
+
console.error(`[${config.providerName}] Trigger failed:`, error);
|
|
98
|
+
return null;
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
async function monitorJob(snapshotId) {
|
|
102
|
+
const apiKey = getApiKey();
|
|
103
|
+
const url = `${config.apiBase}/datasets/v3/progress/${snapshotId}`;
|
|
104
|
+
const startTime = Date.now();
|
|
105
|
+
const abortSignal = getAbortSignal();
|
|
106
|
+
while (Date.now() - startTime < MAX_WAIT_MS) {
|
|
107
|
+
if (abortSignal?.aborted)
|
|
108
|
+
return false;
|
|
109
|
+
try {
|
|
110
|
+
const response = await withRetries(() => fetch(url, {
|
|
111
|
+
headers: { 'Authorization': `Bearer ${apiKey}` },
|
|
112
|
+
signal: abortSignal,
|
|
113
|
+
}), MONITOR_RETRY);
|
|
114
|
+
if (!response.ok) {
|
|
115
|
+
if (!MONITOR_RETRIABLE.has(response.status))
|
|
116
|
+
return false;
|
|
117
|
+
}
|
|
118
|
+
else {
|
|
119
|
+
const status = await response.json();
|
|
120
|
+
if (status.status === 'ready' || status.status === 'complete')
|
|
121
|
+
return true;
|
|
122
|
+
if (status.status === 'failed' || status.status === 'error')
|
|
123
|
+
return false;
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
catch (error) {
|
|
127
|
+
console.error(`[${config.providerName}] Monitor error:`, error);
|
|
128
|
+
}
|
|
129
|
+
await sleep(POLL_INTERVAL_MS, abortSignal);
|
|
130
|
+
}
|
|
131
|
+
console.error(`[${config.providerName}] Monitor timeout after ${MAX_WAIT_MS / 1000}s`);
|
|
132
|
+
return false;
|
|
133
|
+
}
|
|
134
|
+
async function downloadJob(snapshotId) {
|
|
135
|
+
const apiKey = getApiKey();
|
|
136
|
+
const url = `${config.apiBase}/datasets/v3/snapshot/${snapshotId}?format=json`;
|
|
137
|
+
try {
|
|
138
|
+
const response = await withRetries(() => fetch(url, {
|
|
139
|
+
headers: { 'Authorization': `Bearer ${apiKey}` },
|
|
140
|
+
signal: getAbortSignal(),
|
|
141
|
+
}), DOWNLOAD_RETRY);
|
|
142
|
+
if (!response.ok) {
|
|
143
|
+
console.error(`[${config.providerName}] Download error: ${response.status}`);
|
|
144
|
+
return null;
|
|
145
|
+
}
|
|
146
|
+
const data = await response.json();
|
|
147
|
+
return Array.isArray(data) ? data : null;
|
|
148
|
+
}
|
|
149
|
+
catch (error) {
|
|
150
|
+
console.error(`[${config.providerName}] Download failed:`, error);
|
|
151
|
+
return null;
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
function transformResponse(raw) {
|
|
155
|
+
const responses = raw;
|
|
156
|
+
if (!responses || responses.length === 0)
|
|
157
|
+
return null;
|
|
158
|
+
const response = responses[0];
|
|
159
|
+
const answerText = cleanAnswer(response.answer_text || '');
|
|
160
|
+
const answerTextMarkdown = cleanAnswer(response.answer_text_markdown || '');
|
|
161
|
+
return {
|
|
162
|
+
prompt: response.prompt,
|
|
163
|
+
answer: answerText,
|
|
164
|
+
answer_text_markdown: answerTextMarkdown,
|
|
165
|
+
sources: buildSources(response.citations ?? [], response.links_attached ?? []),
|
|
166
|
+
searchQueries: response.web_search_query || [],
|
|
167
|
+
};
|
|
168
|
+
}
|
|
169
|
+
return {
|
|
170
|
+
name: config.providerName,
|
|
171
|
+
maxConcurrency: config.maxConcurrency,
|
|
172
|
+
maxPromptsPerRequest: config.maxPromptsPerRequest,
|
|
173
|
+
triggerJob,
|
|
174
|
+
monitorJob,
|
|
175
|
+
downloadJob,
|
|
176
|
+
transformResponse,
|
|
177
|
+
};
|
|
178
|
+
}
|
|
179
|
+
// ============================================================================
|
|
180
|
+
// Export
|
|
181
|
+
// ============================================================================
|
|
182
|
+
export const brightdataProvider = createBrightdataProvider();
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { ModelResult } from '../../../schemas/models.schema.js';
|
|
2
|
+
import { type BatchOptions } from './scrape.js';
|
|
3
|
+
export type { BatchOptions };
|
|
4
|
+
export type JobId = string | null;
|
|
5
|
+
export type ScraperTarget = 'chatgpt' | 'aim';
|
|
6
|
+
export declare function getMaxConcurrency(target?: ScraperTarget): number;
|
|
7
|
+
export declare function getMaxPromptsPerRequest(target?: ScraperTarget): number;
|
|
8
|
+
export declare function scrapeGPTBatch(options: BatchOptions): Promise<Array<ModelResult>>;
|
|
9
|
+
export declare function triggerGPTBatch(options: BatchOptions): Promise<Array<string | null>>;
|
|
10
|
+
export declare function downloadGPTSnapshots(jobIds: Array<string | null>): Promise<Array<ModelResult>>;
|
|
11
|
+
export declare function scrapeAIMBatch(options: BatchOptions): Promise<Array<ModelResult>>;
|
|
12
|
+
export declare function triggerAIMBatch(options: BatchOptions): Promise<Array<string | null>>;
|
|
13
|
+
export declare function downloadAIMSnapshots(jobIds: Array<string | null>): Promise<Array<ModelResult>>;
|
|
14
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/src/apis/brightdata/llmScraper/index.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mCAAmC,CAAC;AACrE,OAAO,EAAE,KAAK,YAAY,EAAqC,MAAM,aAAa,CAAC;AAKnF,YAAY,EAAE,YAAY,EAAE,CAAC;AAC7B,MAAM,MAAM,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC;AAClC,MAAM,MAAM,aAAa,GAAG,SAAS,GAAG,KAAK,CAAC;AAuE9C,wBAAgB,iBAAiB,CAAC,MAAM,GAAE,aAAyB,GAAG,MAAM,CAE3E;AAED,wBAAgB,uBAAuB,CAAC,MAAM,GAAE,aAAyB,GAAG,MAAM,CAEjF;AAGD,wBAAsB,cAAc,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEvF;AAED,wBAAsB,eAAe,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAE1F;AAED,wBAAsB,oBAAoB,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEpG;AAGD,wBAAsB,cAAc,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEvF;AAED,wBAAsB,eAAe,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAE1F;AAED,wBAAsB,oBAAoB,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEpG"}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
|
|
2
|
+
/**
|
|
3
|
+
* LLM Scraper - Public API
|
|
4
|
+
*
|
|
5
|
+
* Selects between Brightdata and Oxylabs based on LLM_SCRAPER_PROVIDER env var.
|
|
6
|
+
* Falls back to CHATGPT_SCRAPER_PROVIDER for backward compatibility.
|
|
7
|
+
* Exposes ChatGPT and AIM variants while preserving the legacy GPT method names.
|
|
8
|
+
* Default: oxylabs
|
|
9
|
+
*/
|
|
10
|
+
import * as dntShim from "../../../../_dnt.shims.js";
|
|
11
|
+
import { createLLMScraper } from './scrape.js';
|
|
12
|
+
import { createBrightdataProvider } from './brightdata.js';
|
|
13
|
+
import { createOxylabsProvider } from './oxy.js';
|
|
14
|
+
// ============================================================================
|
|
15
|
+
// Scraper Instance (lazy singleton)
|
|
16
|
+
// ============================================================================
|
|
17
|
+
const scrapers = new Map();
|
|
18
|
+
const CHATGPT_BRIGHTDATA_OPTIONS = {
|
|
19
|
+
datasetId: 'gd_m7aof0k82r803d5bjm',
|
|
20
|
+
extraFields: ['web_search_triggered', 'web_search_query'],
|
|
21
|
+
extraInputs: ({ useSearch }) => ({
|
|
22
|
+
web_search: useSearch,
|
|
23
|
+
}),
|
|
24
|
+
targetUrl: 'http://chatgpt.com/',
|
|
25
|
+
};
|
|
26
|
+
const CHATGPT_OXYLABS_OPTIONS = {
|
|
27
|
+
source: 'chatgpt',
|
|
28
|
+
};
|
|
29
|
+
const AIM_BRIGHTDATA_OPTIONS = {
|
|
30
|
+
datasetId: 'gd_mcswdt6z2elth3zqr2',
|
|
31
|
+
targetUrl: 'https://google.com/aimode',
|
|
32
|
+
};
|
|
33
|
+
const AIM_OXYLABS_OPTIONS = {
|
|
34
|
+
source: 'google_ai_mode',
|
|
35
|
+
inputKey: 'query',
|
|
36
|
+
render: 'html',
|
|
37
|
+
search: undefined,
|
|
38
|
+
};
|
|
39
|
+
function getProviderName() {
|
|
40
|
+
return dntShim.Deno.env.get('LLM_SCRAPER_PROVIDER')?.toLowerCase() ??
|
|
41
|
+
dntShim.Deno.env.get('CHATGPT_SCRAPER_PROVIDER')?.toLowerCase();
|
|
42
|
+
}
|
|
43
|
+
function getTargetOptions(target) {
|
|
44
|
+
return target === 'aim'
|
|
45
|
+
? {
|
|
46
|
+
brightdata: AIM_BRIGHTDATA_OPTIONS,
|
|
47
|
+
oxylabs: AIM_OXYLABS_OPTIONS,
|
|
48
|
+
}
|
|
49
|
+
: {
|
|
50
|
+
brightdata: CHATGPT_BRIGHTDATA_OPTIONS,
|
|
51
|
+
oxylabs: CHATGPT_OXYLABS_OPTIONS,
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
function getLLMScraper(target = 'chatgpt') {
|
|
55
|
+
const existingScraper = scrapers.get(target);
|
|
56
|
+
if (existingScraper) {
|
|
57
|
+
return existingScraper;
|
|
58
|
+
}
|
|
59
|
+
const providerName = getProviderName();
|
|
60
|
+
const targetOptions = getTargetOptions(target);
|
|
61
|
+
const provider = providerName === 'brightdata'
|
|
62
|
+
? createBrightdataProvider(targetOptions.brightdata)
|
|
63
|
+
: createOxylabsProvider(targetOptions.oxylabs);
|
|
64
|
+
const scraper = createLLMScraper(provider);
|
|
65
|
+
scrapers.set(target, scraper);
|
|
66
|
+
return scraper;
|
|
67
|
+
}
|
|
68
|
+
// ============================================================================
|
|
69
|
+
// Public API
|
|
70
|
+
// ============================================================================
|
|
71
|
+
// Shared scraper metadata
|
|
72
|
+
export function getMaxConcurrency(target = 'chatgpt') {
|
|
73
|
+
return getLLMScraper(target).maxConcurrency;
|
|
74
|
+
}
|
|
75
|
+
export function getMaxPromptsPerRequest(target = 'chatgpt') {
|
|
76
|
+
return getLLMScraper(target).maxPromptsPerRequest;
|
|
77
|
+
}
|
|
78
|
+
// ChatGPT scraper methods
|
|
79
|
+
export async function scrapeGPTBatch(options) {
|
|
80
|
+
return getLLMScraper('chatgpt').scrapeLLMBatch(options);
|
|
81
|
+
}
|
|
82
|
+
export async function triggerGPTBatch(options) {
|
|
83
|
+
return getLLMScraper('chatgpt').triggerLLMBatch(options);
|
|
84
|
+
}
|
|
85
|
+
export async function downloadGPTSnapshots(jobIds) {
|
|
86
|
+
return getLLMScraper('chatgpt').downloadLLMSnapshots(jobIds);
|
|
87
|
+
}
|
|
88
|
+
// AIM scraper methods
|
|
89
|
+
export async function scrapeAIMBatch(options) {
|
|
90
|
+
return getLLMScraper('aim').scrapeLLMBatch(options);
|
|
91
|
+
}
|
|
92
|
+
export async function triggerAIMBatch(options) {
|
|
93
|
+
return getLLMScraper('aim').triggerLLMBatch(options);
|
|
94
|
+
}
|
|
95
|
+
export async function downloadAIMSnapshots(jobIds) {
|
|
96
|
+
return getLLMScraper('aim').downloadLLMSnapshots(jobIds);
|
|
97
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { type ProviderFunctions } from './scrape.js';
|
|
2
|
+
interface OxylabsProviderConfig {
|
|
3
|
+
apiBase: string;
|
|
4
|
+
source: string;
|
|
5
|
+
inputKey: 'prompt' | 'query';
|
|
6
|
+
parse: boolean;
|
|
7
|
+
search?: boolean;
|
|
8
|
+
render?: 'html';
|
|
9
|
+
providerName: string;
|
|
10
|
+
maxConcurrency: number;
|
|
11
|
+
maxPromptsPerRequest: number;
|
|
12
|
+
}
|
|
13
|
+
export declare function createOxylabsProvider(overrides?: Partial<OxylabsProviderConfig>): ProviderFunctions;
|
|
14
|
+
export declare const oxylabsProvider: ProviderFunctions;
|
|
15
|
+
export {};
|
|
16
|
+
//# sourceMappingURL=oxy.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"oxy.d.ts","sourceRoot":"","sources":["../../../../../src/src/apis/brightdata/llmScraper/oxy.ts"],"names":[],"mappings":"AAeA,OAAO,EAA6C,KAAK,iBAAiB,EAAE,MAAM,aAAa,CAAC;AA2BhG,UAAU,qBAAqB;IAC9B,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAAC;IAC7B,KAAK,EAAE,OAAO,CAAC;IACf,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,CAAC;IACvB,oBAAoB,EAAE,MAAM,CAAC;CAC7B;AAyCD,wBAAgB,qBAAqB,CAAC,SAAS,GAAE,OAAO,CAAC,qBAAqB,CAAM,GAAG,iBAAiB,CAyJvG;AAMD,eAAO,MAAM,eAAe,EAAE,iBAA2C,CAAC"}
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
/* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
|
|
2
|
+
/**
|
|
3
|
+
* Oxylabs LLM Scraper Provider.
|
|
4
|
+
*
|
|
5
|
+
* API Flow (Async Push-Pull):
|
|
6
|
+
* 1. Trigger: POST to /v1/queries → returns job id
|
|
7
|
+
* 2. Monitor: GET /v1/queries/{id} until status is 'done'
|
|
8
|
+
* 3. Download: GET /v1/queries/{id}/results
|
|
9
|
+
*/
|
|
10
|
+
import * as dntShim from "../../../../_dnt.shims.js";
|
|
11
|
+
import { sleep, withRetries } from '../../../helpers/async.js';
|
|
12
|
+
import { buildSources, cleanAnswer, getAbortSignal } from './scrape.js';
|
|
13
|
+
const DEFAULT_OXYLABS_PROVIDER_CONFIG = {
|
|
14
|
+
apiBase: 'https://data.oxylabs.io/v1',
|
|
15
|
+
source: 'chatgpt',
|
|
16
|
+
inputKey: 'prompt',
|
|
17
|
+
parse: true,
|
|
18
|
+
search: true,
|
|
19
|
+
providerName: 'Oxylabs',
|
|
20
|
+
maxConcurrency: 10,
|
|
21
|
+
maxPromptsPerRequest: 1,
|
|
22
|
+
};
|
|
23
|
+
const RETRY_CONFIG = {
|
|
24
|
+
maxRetries: 3,
|
|
25
|
+
initialDelay: 1000,
|
|
26
|
+
statusCodes: [429, 500, 502, 503, 504, 524, 612, 613],
|
|
27
|
+
};
|
|
28
|
+
const MAX_WAIT_MS = 600_000; // 10 minutes
|
|
29
|
+
const POLL_INTERVAL_MS = 5_000;
|
|
30
|
+
// ============================================================================
|
|
31
|
+
// Auth
|
|
32
|
+
// ============================================================================
|
|
33
|
+
function getAuthHeader() {
|
|
34
|
+
const username = dntShim.Deno.env.get('OXYLABS_USERNAME');
|
|
35
|
+
const password = dntShim.Deno.env.get('OXYLABS_PASSWORD');
|
|
36
|
+
if (!username || !password) {
|
|
37
|
+
throw new Error('OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are required');
|
|
38
|
+
}
|
|
39
|
+
return `Basic ${btoa(`${username}:${password}`)}`;
|
|
40
|
+
}
|
|
41
|
+
// ============================================================================
|
|
42
|
+
// Provider Functions
|
|
43
|
+
// ============================================================================
|
|
44
|
+
/**
 * Builds an Oxylabs-backed LLM scraper provider.
 *
 * Merges `overrides` over DEFAULT_OXYLABS_PROVIDER_CONFIG and returns the
 * provider hook set (triggerJob / monitorJob / downloadJob / transformResponse)
 * plus its name and concurrency limits, as consumed by the generic batch
 * scraper driver.
 *
 * @param {object} [overrides] - Partial provider config merged over the defaults.
 * @returns Provider object: { name, maxConcurrency, maxPromptsPerRequest, triggerJob, monitorJob, downloadJob, transformResponse }.
 */
export function createOxylabsProvider(overrides = {}) {
    const cfg = { ...DEFAULT_OXYLABS_PROVIDER_CONFIG, ...overrides };

    /**
     * Submits one prompt as an Oxylabs query job.
     * Resolves to the job id, or null on any HTTP/network failure
     * (errors are logged, never thrown).
     */
    async function triggerJob(prompt, _useSearch, countryISOCode) {
        const auth = getAuthHeader();
        const payload = {
            source: cfg.source,
            parse: cfg.parse,
            [cfg.inputKey]: prompt,
            // Optional knobs are only included when configured / provided.
            ...(cfg.search != null ? { search: cfg.search } : {}),
            ...(cfg.render != null ? { render: cfg.render } : {}),
            ...(countryISOCode ? { geo_location: countryISOCode } : {}),
        };
        try {
            const res = await withRetries(() => fetch(`${cfg.apiBase}/queries`, {
                method: 'POST',
                headers: {
                    'Authorization': auth,
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify(payload),
                signal: getAbortSignal(),
            }), RETRY_CONFIG);
            if (!res.ok) {
                console.error(`[${cfg.providerName}] Trigger error: ${res.status}`);
                return null;
            }
            const parsed = await res.json();
            return parsed?.id || null;
        }
        catch (error) {
            console.error(`[${cfg.providerName}] Trigger failed:`, error);
            return null;
        }
    }

    /**
     * Polls the job's status endpoint until it reports 'done' (true),
     * 'faulted'/'failed' (false), the abort signal fires (false), or
     * MAX_WAIT_MS elapses (false). Transient fetch errors are logged and
     * retried on the next poll tick.
     */
    async function monitorJob(jobId) {
        const auth = getAuthHeader();
        const statusUrl = `${cfg.apiBase}/queries/${jobId}`;
        const deadline = Date.now() + MAX_WAIT_MS;
        const signal = getAbortSignal();
        while (Date.now() < deadline) {
            if (signal?.aborted) {
                return false;
            }
            try {
                const res = await fetch(statusUrl, {
                    headers: { 'Authorization': auth },
                    signal,
                });
                // 204 = job not completed yet, continue polling
                if (res.status === 204) {
                    await sleep(POLL_INTERVAL_MS, signal);
                    continue;
                }
                if (res.ok) {
                    const info = await res.json();
                    if (info.status === 'done') {
                        return true;
                    }
                    if (info.status === 'faulted' || info.status === 'failed') {
                        return false;
                    }
                }
            }
            catch (error) {
                console.error(`[${cfg.providerName}] Monitor error:`, error);
            }
            await sleep(POLL_INTERVAL_MS, signal);
        }
        console.error(`[${cfg.providerName}] Monitor timeout after ${MAX_WAIT_MS / 1000}s`);
        return false;
    }

    /**
     * Fetches the completed job's results as parsed JSON.
     * Resolves to null on any HTTP/network failure (logged, never thrown).
     */
    async function downloadJob(jobId) {
        const auth = getAuthHeader();
        const resultsUrl = `${cfg.apiBase}/queries/${jobId}/results`;
        try {
            const res = await withRetries(() => fetch(resultsUrl, {
                headers: { 'Authorization': auth },
                signal: getAbortSignal(),
            }), RETRY_CONFIG);
            if (!res.ok) {
                console.error(`[${cfg.providerName}] Download error: ${res.status}`);
                return null;
            }
            return await res.json();
        }
        catch (error) {
            console.error(`[${cfg.providerName}] Download failed:`, error);
            return null;
        }
    }

    /**
     * Maps a raw Oxylabs results payload (`results[0].content`) into the
     * common model-result shape; returns null when no content is present.
     */
    function transformResponse(raw) {
        const content = raw?.results?.[0]?.content;
        if (!content) {
            return null;
        }
        // Map section='citations' to cited=true (like Brightdata's cited field)
        const citations = (content.citations ?? []).map((c) => ({
            ...c,
            cited: c.section === 'citations',
        }));
        return {
            prompt: content.prompt || '',
            answer: cleanAnswer(content.response_text || ''),
            answer_text_markdown: cleanAnswer(content.markdown_text || ''),
            sources: buildSources(citations),
            searchQueries: [],
        };
    }

    return {
        name: cfg.providerName,
        maxConcurrency: cfg.maxConcurrency,
        maxPromptsPerRequest: cfg.maxPromptsPerRequest,
        triggerJob,
        monitorJob,
        downloadJob,
        transformResponse,
    };
}
|
|
168
|
+
// ============================================================================
|
|
169
|
+
// Export
|
|
170
|
+
// ============================================================================
|
|
171
|
+
// Ready-to-use provider instance built with the default Oxylabs config (no overrides).
export const oxylabsProvider = createOxylabsProvider();
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import type { ModelResult } from '
|
|
2
|
-
import type { Source
|
|
1
|
+
import type { ModelResult } from '../../../schemas/models.schema.js';
|
|
2
|
+
import type { Source } from '../../../schemas/sources.schema.js';
|
|
3
3
|
export interface BatchOptions {
|
|
4
4
|
prompts: Array<string>;
|
|
5
5
|
useSearch?: boolean;
|
|
@@ -14,12 +14,12 @@ export interface ProviderFunctions {
|
|
|
14
14
|
downloadJob: (jobId: string) => Promise<unknown>;
|
|
15
15
|
transformResponse: (raw: unknown) => ModelResult | null;
|
|
16
16
|
}
|
|
17
|
-
export interface
|
|
17
|
+
export interface LLMScraper {
|
|
18
18
|
maxConcurrency: number;
|
|
19
19
|
maxPromptsPerRequest: number;
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
20
|
+
scrapeLLMBatch: (options: BatchOptions) => Promise<Array<ModelResult>>;
|
|
21
|
+
triggerLLMBatch: (options: BatchOptions) => Promise<Array<string | null>>;
|
|
22
|
+
downloadLLMSnapshots: (jobIds: Array<string | null>) => Promise<Array<ModelResult>>;
|
|
23
23
|
}
|
|
24
24
|
export declare function getAbortSignal(): AbortSignal | undefined;
|
|
25
25
|
export declare function cleanAnswer(answer: string): string;
|
|
@@ -29,18 +29,15 @@ export declare function buildSources(citations: Array<{
|
|
|
29
29
|
description?: string;
|
|
30
30
|
text?: string;
|
|
31
31
|
cited?: boolean;
|
|
32
|
-
}>,
|
|
33
|
-
export declare function buildSearchSources(sources: Array<{
|
|
32
|
+
}>, linksAttached?: Array<{
|
|
34
33
|
url?: string;
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
date_published?: string;
|
|
39
|
-
}>): Array<SearchSource>;
|
|
34
|
+
text?: string;
|
|
35
|
+
position?: number;
|
|
36
|
+
}>): Array<Source>;
|
|
40
37
|
/**
|
|
41
38
|
* Creates an empty model result for failed jobs.
|
|
42
39
|
* This ensures we always return the same number of rows as input.
|
|
43
40
|
*/
|
|
44
41
|
export declare function emptyModelResult(providerName: string, errorMessage?: string, context?: unknown): ModelResult;
|
|
45
|
-
export declare function
|
|
46
|
-
//# sourceMappingURL=
|
|
42
|
+
export declare function createLLMScraper(provider: ProviderFunctions): LLMScraper;
|
|
43
|
+
//# sourceMappingURL=scrape.d.ts.map
|