@graphext/cuery 0.9.5 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/browser.d.ts +1 -1
- package/esm/browser.js +1 -1
- package/esm/mod.d.ts +3 -3
- package/esm/mod.d.ts.map +1 -1
- package/esm/mod.js +3 -3
- package/esm/src/apis/brightdata/contentScraper/index.d.ts.map +1 -0
- package/{script/src/apis/brightdata → esm/src/apis/brightdata/contentScraper}/scrape.d.ts +1 -1
- package/esm/src/apis/brightdata/contentScraper/scrape.d.ts.map +1 -0
- package/esm/src/apis/brightdata/{scrape.js → contentScraper/scrape.js} +2 -2
- package/esm/src/apis/brightdata/llmScraper/brightdata.d.ts +20 -0
- package/esm/src/apis/brightdata/llmScraper/brightdata.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/brightdata.js +182 -0
- package/esm/src/apis/brightdata/llmScraper/index.d.ts +14 -0
- package/esm/src/apis/brightdata/llmScraper/index.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/index.js +97 -0
- package/esm/src/apis/brightdata/llmScraper/oxy.d.ts +16 -0
- package/esm/src/apis/brightdata/llmScraper/oxy.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/oxy.js +171 -0
- package/{script/src/apis/chatgptScraper/scraper.d.ts → esm/src/apis/brightdata/llmScraper/scrape.d.ts} +12 -15
- package/esm/src/apis/brightdata/llmScraper/scrape.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/scrape.js +184 -0
- package/esm/src/schemas/search.schema.d.ts +2 -2
- package/esm/src/schemas/search.schema.d.ts.map +1 -1
- package/esm/src/schemas/sources.schema.d.ts +1 -4
- package/esm/src/schemas/sources.schema.d.ts.map +1 -1
- package/package.json +1 -1
- package/script/browser.d.ts +1 -1
- package/script/browser.js +1 -1
- package/script/mod.d.ts +3 -3
- package/script/mod.d.ts.map +1 -1
- package/script/mod.js +6 -6
- package/script/src/apis/brightdata/contentScraper/index.d.ts.map +1 -0
- package/{esm/src/apis/brightdata → script/src/apis/brightdata/contentScraper}/scrape.d.ts +1 -1
- package/script/src/apis/brightdata/contentScraper/scrape.d.ts.map +1 -0
- package/script/src/apis/brightdata/{scrape.js → contentScraper/scrape.js} +2 -2
- package/script/src/apis/brightdata/llmScraper/brightdata.d.ts +20 -0
- package/script/src/apis/brightdata/llmScraper/brightdata.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/brightdata.js +219 -0
- package/script/src/apis/brightdata/llmScraper/index.d.ts +14 -0
- package/script/src/apis/brightdata/llmScraper/index.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/index.js +140 -0
- package/script/src/apis/brightdata/llmScraper/oxy.d.ts +16 -0
- package/script/src/apis/brightdata/llmScraper/oxy.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/oxy.js +208 -0
- package/{esm/src/apis/chatgptScraper/scraper.d.ts → script/src/apis/brightdata/llmScraper/scrape.d.ts} +12 -15
- package/script/src/apis/brightdata/llmScraper/scrape.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/scrape.js +224 -0
- package/script/src/schemas/search.schema.d.ts +2 -2
- package/script/src/schemas/search.schema.d.ts.map +1 -1
- package/script/src/schemas/sources.schema.d.ts +1 -4
- package/script/src/schemas/sources.schema.d.ts.map +1 -1
- package/esm/src/apis/brightdata/index.d.ts.map +0 -1
- package/esm/src/apis/brightdata/scrape.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/brightdata.d.ts +0 -3
- package/esm/src/apis/chatgptScraper/brightdata.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/brightdata.js +0 -172
- package/esm/src/apis/chatgptScraper/index.d.ts +0 -10
- package/esm/src/apis/chatgptScraper/index.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/index.js +0 -41
- package/esm/src/apis/chatgptScraper/oxy.d.ts +0 -3
- package/esm/src/apis/chatgptScraper/oxy.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/oxy.js +0 -156
- package/esm/src/apis/chatgptScraper/scraper.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/scraper.js +0 -98
- package/script/src/apis/brightdata/index.d.ts.map +0 -1
- package/script/src/apis/brightdata/scrape.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/brightdata.d.ts +0 -3
- package/script/src/apis/chatgptScraper/brightdata.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/brightdata.js +0 -208
- package/script/src/apis/chatgptScraper/index.d.ts +0 -10
- package/script/src/apis/chatgptScraper/index.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/index.js +0 -81
- package/script/src/apis/chatgptScraper/oxy.d.ts +0 -3
- package/script/src/apis/chatgptScraper/oxy.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/oxy.js +0 -192
- package/script/src/apis/chatgptScraper/scraper.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/scraper.js +0 -139
- /package/esm/src/apis/brightdata/{index.d.ts → contentScraper/index.d.ts} +0 -0
- /package/esm/src/apis/brightdata/{index.js → contentScraper/index.js} +0 -0
- /package/script/src/apis/brightdata/{index.d.ts → contentScraper/index.d.ts} +0 -0
- /package/script/src/apis/brightdata/{index.js → contentScraper/index.js} +0 -0
|
@@ -3,6 +3,7 @@ export interface Source {
|
|
|
3
3
|
url: string;
|
|
4
4
|
domain: string;
|
|
5
5
|
cited?: boolean;
|
|
6
|
+
snippet?: string;
|
|
6
7
|
positions?: Array<number>;
|
|
7
8
|
}
|
|
8
9
|
/**
|
|
@@ -22,8 +23,4 @@ export interface CategorizedSource extends EnrichedSource {
|
|
|
22
23
|
category: string | null;
|
|
23
24
|
subcategory: string | null;
|
|
24
25
|
}
|
|
25
|
-
export interface SearchSource extends Source {
|
|
26
|
-
rank: number;
|
|
27
|
-
datePublished: string | null;
|
|
28
|
-
}
|
|
29
26
|
//# sourceMappingURL=sources.schema.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sources.schema.d.ts","sourceRoot":"","sources":["../../../src/src/schemas/sources.schema.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,MAAM;IACtB,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,SAAS,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC1B;AAED;;;GAGG;AACH,MAAM,WAAW,cAAe,SAAQ,MAAM;IAC7C,eAAe,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAC/B,oBAAoB,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACpC,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,gBAAgB,EAAE,MAAM,GAAG,IAAI,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,iBAAkB,SAAQ,cAAc;IACxD,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B
|
|
1
|
+
{"version":3,"file":"sources.schema.d.ts","sourceRoot":"","sources":["../../../src/src/schemas/sources.schema.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,MAAM;IACtB,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC1B;AAED;;;GAGG;AACH,MAAM,WAAW,cAAe,SAAQ,MAAM;IAC7C,eAAe,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAC/B,oBAAoB,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACpC,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,gBAAgB,EAAE,MAAM,GAAG,IAAI,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,iBAAkB,SAAQ,cAAc;IACxD,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/brightdata/index.ts"],"names":[],"mappings":"AAAA,cAAc,aAAa,CAAC"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"scrape.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/brightdata/scrape.ts"],"names":[],"mappings":"AACA,OAAO,EAA4B,KAAK,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAapF,MAAM,WAAW,uBAAuB;IACpC,wDAAwD;IACxD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,6DAA6D;IAC7D,MAAM,CAAC,EAAE,KAAK,GAAG,MAAM,CAAC;IACxB,8DAA8D;IAC9D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,wDAAwD;IACxD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,oDAAoD;IACpD,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,kCAAkC;IAClC,WAAW,CAAC,EAAE,WAAW,CAAC;CAC7B;AAED,MAAM,WAAW,wBAAwB;IACrC,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,CAAC,EAAE,MAAM,CAAC;CACjB;AA4ED;;;GAGG;AACH,wBAAsB,gBAAgB,CAClC,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,uBAA4B,GACtC,OAAO,CAAC,wBAAwB,CAAC,CAYnC;AAED;;;GAGG;AACH,wBAAsB,qBAAqB,CACvC,IAAI,EAAE,KAAK,CAAC,MAAM,CAAC,EACnB,OAAO,GAAE,uBAA4B,EACrC,cAAc,GAAE,MAA+B,GAChD,OAAO,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC,CAQ1C"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"brightdata.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/chatgptScraper/brightdata.ts"],"names":[],"mappings":"AAeA,OAAO,EACN,KAAK,iBAAiB,EAKtB,MAAM,cAAc,CAAC;AAmNtB,eAAO,MAAM,kBAAkB,EAAE,iBAQhC,CAAC"}
|
|
@@ -1,172 +0,0 @@
|
|
|
1
|
-
/* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
|
|
2
|
-
/**
|
|
3
|
-
* Brightdata GPT Scraper Provider.
|
|
4
|
-
*
|
|
5
|
-
* API Flow:
|
|
6
|
-
* 1. Trigger: POST to /datasets/v3/trigger → returns snapshot_id
|
|
7
|
-
* 2. Monitor: GET /datasets/v3/progress/{snapshot_id} until ready
|
|
8
|
-
* 3. Download: GET /datasets/v3/snapshot/{snapshot_id}
|
|
9
|
-
*/
|
|
10
|
-
import * as dntShim from "../../../_dnt.shims.js";
|
|
11
|
-
import { withRetries, sleep } from '../../helpers/async.js';
|
|
12
|
-
import { getAbortSignal, cleanAnswer, buildSources, buildSearchSources } from './scraper.js';
|
|
13
|
-
// ============================================================================
|
|
14
|
-
// Constants
|
|
15
|
-
// ============================================================================
|
|
16
|
-
const API_BASE = 'https://api.brightdata.com';
|
|
17
|
-
const DATASET_ID = 'gd_m7aof0k82r803d5bjm';
|
|
18
|
-
const OUTPUT_FIELDS = 'url|prompt|answer_text|answer_text_markdown|citations|links_attached|search_sources|country|model|web_search_triggered|web_search_query|index';
|
|
19
|
-
const TRIGGER_RETRY = {
|
|
20
|
-
maxRetries: 3,
|
|
21
|
-
initialDelay: 0,
|
|
22
|
-
statusCodes: [429, 500, 502, 503, 504]
|
|
23
|
-
};
|
|
24
|
-
const DOWNLOAD_RETRY = {
|
|
25
|
-
maxRetries: 5,
|
|
26
|
-
initialDelay: 2000,
|
|
27
|
-
statusCodes: [202, 500, 502, 503, 504]
|
|
28
|
-
};
|
|
29
|
-
const MONITOR_RETRY = {
|
|
30
|
-
maxRetries: 4,
|
|
31
|
-
initialDelay: 1000,
|
|
32
|
-
statusCodes: [408, 425, 429, 500, 502, 503, 504]
|
|
33
|
-
};
|
|
34
|
-
const MONITOR_RETRIABLE = new Set(MONITOR_RETRY.statusCodes ?? []);
|
|
35
|
-
const MAX_WAIT_MS = 600_000; // 10 minutes
|
|
36
|
-
const POLL_INTERVAL_MS = 5_000;
|
|
37
|
-
// ============================================================================
|
|
38
|
-
// API Key
|
|
39
|
-
// ============================================================================
|
|
40
|
-
function getApiKey() {
|
|
41
|
-
const apiKey = dntShim.Deno.env.get('BRIGHTDATA_API_KEY');
|
|
42
|
-
if (!apiKey) {
|
|
43
|
-
throw new Error('BRIGHTDATA_API_KEY environment variable is required');
|
|
44
|
-
}
|
|
45
|
-
return apiKey;
|
|
46
|
-
}
|
|
47
|
-
// ============================================================================
|
|
48
|
-
// Provider Functions
|
|
49
|
-
// ============================================================================
|
|
50
|
-
async function triggerJob(prompt, useSearch, countryISOCode) {
|
|
51
|
-
const apiKey = getApiKey();
|
|
52
|
-
const url = `${API_BASE}/datasets/v3/trigger?dataset_id=${DATASET_ID}&include_errors=true`;
|
|
53
|
-
const body = {
|
|
54
|
-
custom_output_fields: OUTPUT_FIELDS,
|
|
55
|
-
input: [{
|
|
56
|
-
url: 'http://chatgpt.com/',
|
|
57
|
-
prompt,
|
|
58
|
-
web_search: useSearch,
|
|
59
|
-
country: countryISOCode || '',
|
|
60
|
-
index: 0
|
|
61
|
-
}]
|
|
62
|
-
};
|
|
63
|
-
try {
|
|
64
|
-
const response = await withRetries(() => fetch(url, {
|
|
65
|
-
method: 'POST',
|
|
66
|
-
headers: {
|
|
67
|
-
'Authorization': `Bearer ${apiKey}`,
|
|
68
|
-
'Content-Type': 'application/json'
|
|
69
|
-
},
|
|
70
|
-
body: JSON.stringify(body),
|
|
71
|
-
signal: getAbortSignal()
|
|
72
|
-
}), TRIGGER_RETRY);
|
|
73
|
-
if (!response.ok) {
|
|
74
|
-
console.error(`[Brightdata] Trigger error: ${response.status}`);
|
|
75
|
-
return null;
|
|
76
|
-
}
|
|
77
|
-
const data = await response.json();
|
|
78
|
-
return data?.snapshot_id || null;
|
|
79
|
-
}
|
|
80
|
-
catch (error) {
|
|
81
|
-
console.error('[Brightdata] Trigger failed:', error);
|
|
82
|
-
return null;
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
async function monitorJob(snapshotId) {
|
|
86
|
-
const apiKey = getApiKey();
|
|
87
|
-
const url = `${API_BASE}/datasets/v3/progress/${snapshotId}`;
|
|
88
|
-
const startTime = Date.now();
|
|
89
|
-
const abortSignal = getAbortSignal();
|
|
90
|
-
while (Date.now() - startTime < MAX_WAIT_MS) {
|
|
91
|
-
if (abortSignal?.aborted)
|
|
92
|
-
return false;
|
|
93
|
-
try {
|
|
94
|
-
const response = await withRetries(() => fetch(url, {
|
|
95
|
-
headers: { 'Authorization': `Bearer ${apiKey}` },
|
|
96
|
-
signal: abortSignal
|
|
97
|
-
}), MONITOR_RETRY);
|
|
98
|
-
if (!response.ok) {
|
|
99
|
-
if (!MONITOR_RETRIABLE.has(response.status))
|
|
100
|
-
return false;
|
|
101
|
-
}
|
|
102
|
-
else {
|
|
103
|
-
const status = await response.json();
|
|
104
|
-
if (status.status === 'ready' || status.status === 'complete')
|
|
105
|
-
return true;
|
|
106
|
-
if (status.status === 'failed' || status.status === 'error')
|
|
107
|
-
return false;
|
|
108
|
-
}
|
|
109
|
-
}
|
|
110
|
-
catch (error) {
|
|
111
|
-
console.error('[Brightdata] Monitor error:', error);
|
|
112
|
-
}
|
|
113
|
-
await sleep(POLL_INTERVAL_MS, abortSignal);
|
|
114
|
-
}
|
|
115
|
-
console.error(`[Brightdata] Monitor timeout after ${MAX_WAIT_MS / 1000}s`);
|
|
116
|
-
return false;
|
|
117
|
-
}
|
|
118
|
-
async function downloadJob(snapshotId) {
|
|
119
|
-
const apiKey = getApiKey();
|
|
120
|
-
const url = `${API_BASE}/datasets/v3/snapshot/${snapshotId}?format=json`;
|
|
121
|
-
try {
|
|
122
|
-
const response = await withRetries(() => fetch(url, {
|
|
123
|
-
headers: { 'Authorization': `Bearer ${apiKey}` },
|
|
124
|
-
signal: getAbortSignal()
|
|
125
|
-
}), DOWNLOAD_RETRY);
|
|
126
|
-
if (!response.ok) {
|
|
127
|
-
console.error(`[Brightdata] Download error: ${response.status}`);
|
|
128
|
-
return null;
|
|
129
|
-
}
|
|
130
|
-
const data = await response.json();
|
|
131
|
-
return Array.isArray(data) ? data : null;
|
|
132
|
-
}
|
|
133
|
-
catch (error) {
|
|
134
|
-
console.error('[Brightdata] Download failed:', error);
|
|
135
|
-
return null;
|
|
136
|
-
}
|
|
137
|
-
}
|
|
138
|
-
function transformResponse(raw) {
|
|
139
|
-
const responses = raw;
|
|
140
|
-
if (!responses || responses.length === 0)
|
|
141
|
-
return null;
|
|
142
|
-
const response = responses[0];
|
|
143
|
-
let answer = response.answer_text_markdown || response.answer_text || '';
|
|
144
|
-
answer = cleanAnswer(answer);
|
|
145
|
-
// Build link positions map
|
|
146
|
-
const linkPositions = {};
|
|
147
|
-
for (const link of response.links_attached ?? []) {
|
|
148
|
-
if (link.url && link.position != null) {
|
|
149
|
-
linkPositions[link.url] ??= [];
|
|
150
|
-
linkPositions[link.url].push(link.position);
|
|
151
|
-
}
|
|
152
|
-
}
|
|
153
|
-
return {
|
|
154
|
-
prompt: response.prompt,
|
|
155
|
-
answer,
|
|
156
|
-
sources: buildSources(response.citations ?? [], linkPositions),
|
|
157
|
-
searchQueries: response.web_search_query || [],
|
|
158
|
-
searchSources: buildSearchSources(response.search_sources ?? [])
|
|
159
|
-
};
|
|
160
|
-
}
|
|
161
|
-
// ============================================================================
|
|
162
|
-
// Export
|
|
163
|
-
// ============================================================================
|
|
164
|
-
export const brightdataProvider = {
|
|
165
|
-
name: 'Brightdata',
|
|
166
|
-
maxConcurrency: 50,
|
|
167
|
-
maxPromptsPerRequest: 1,
|
|
168
|
-
triggerJob,
|
|
169
|
-
monitorJob,
|
|
170
|
-
downloadJob,
|
|
171
|
-
transformResponse
|
|
172
|
-
};
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
import type { ModelResult } from '../../schemas/models.schema.js';
|
|
2
|
-
import { type BatchOptions } from './scraper.js';
|
|
3
|
-
export type { BatchOptions };
|
|
4
|
-
export type JobId = string | null;
|
|
5
|
-
export declare function getMaxConcurrency(): number;
|
|
6
|
-
export declare function getMaxPromptsPerRequest(): number;
|
|
7
|
-
export declare function scrapeGPTBatch(options: BatchOptions): Promise<Array<ModelResult>>;
|
|
8
|
-
export declare function triggerGPTBatch(options: BatchOptions): Promise<Array<string | null>>;
|
|
9
|
-
export declare function downloadGPTSnapshots(jobIds: Array<string | null>): Promise<Array<ModelResult>>;
|
|
10
|
-
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/chatgptScraper/index.ts"],"names":[],"mappings":"AAUA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,KAAK,YAAY,EAAkC,MAAM,cAAc,CAAC;AAKjF,YAAY,EAAE,YAAY,EAAE,CAAC;AAC7B,MAAM,MAAM,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC;AAqBlC,wBAAgB,iBAAiB,IAAI,MAAM,CAE1C;AAED,wBAAgB,uBAAuB,IAAI,MAAM,CAEhD;AAED,wBAAsB,cAAc,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEvF;AAED,wBAAsB,eAAe,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAE1F;AAED,wBAAsB,oBAAoB,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEpG"}
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
/* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
|
|
2
|
-
/**
|
|
3
|
-
* GPT Scraper - Public API
|
|
4
|
-
*
|
|
5
|
-
* Selects between Brightdata and Oxylabs based on CHATGPT_SCRAPER_PROVIDER env var.
|
|
6
|
-
* Default: oxylabs
|
|
7
|
-
*/
|
|
8
|
-
import * as dntShim from "../../../_dnt.shims.js";
|
|
9
|
-
import { createScraper } from './scraper.js';
|
|
10
|
-
import { brightdataProvider } from './brightdata.js';
|
|
11
|
-
import { oxylabsProvider } from './oxy.js';
|
|
12
|
-
// ============================================================================
|
|
13
|
-
// Scraper Instance (lazy singleton)
|
|
14
|
-
// ============================================================================
|
|
15
|
-
let scraper = null;
|
|
16
|
-
function getScraper() {
|
|
17
|
-
if (!scraper) {
|
|
18
|
-
const providerName = dntShim.Deno.env.get('CHATGPT_SCRAPER_PROVIDER')?.toLowerCase();
|
|
19
|
-
const provider = providerName === 'brightdata' ? brightdataProvider : oxylabsProvider;
|
|
20
|
-
scraper = createScraper(provider);
|
|
21
|
-
}
|
|
22
|
-
return scraper;
|
|
23
|
-
}
|
|
24
|
-
// ============================================================================
|
|
25
|
-
// Public API
|
|
26
|
-
// ============================================================================
|
|
27
|
-
export function getMaxConcurrency() {
|
|
28
|
-
return getScraper().maxConcurrency;
|
|
29
|
-
}
|
|
30
|
-
export function getMaxPromptsPerRequest() {
|
|
31
|
-
return getScraper().maxPromptsPerRequest;
|
|
32
|
-
}
|
|
33
|
-
export async function scrapeGPTBatch(options) {
|
|
34
|
-
return getScraper().scrapeGPTBatch(options);
|
|
35
|
-
}
|
|
36
|
-
export async function triggerGPTBatch(options) {
|
|
37
|
-
return getScraper().triggerGPTBatch(options);
|
|
38
|
-
}
|
|
39
|
-
export async function downloadGPTSnapshots(jobIds) {
|
|
40
|
-
return getScraper().downloadGPTSnapshots(jobIds);
|
|
41
|
-
}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"oxy.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/chatgptScraper/oxy.ts"],"names":[],"mappings":"AAeA,OAAO,EACN,KAAK,iBAAiB,EAItB,MAAM,cAAc,CAAC;AA6LtB,eAAO,MAAM,eAAe,EAAE,iBAQ7B,CAAC"}
|
|
@@ -1,156 +0,0 @@
|
|
|
1
|
-
/* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
|
|
2
|
-
/**
|
|
3
|
-
* Oxylabs GPT Scraper Provider.
|
|
4
|
-
*
|
|
5
|
-
* API Flow (Async Push-Pull):
|
|
6
|
-
* 1. Trigger: POST to /v1/queries → returns job id
|
|
7
|
-
* 2. Monitor: GET /v1/queries/{id} until status is 'done'
|
|
8
|
-
* 3. Download: GET /v1/queries/{id}/results
|
|
9
|
-
*/
|
|
10
|
-
import * as dntShim from "../../../_dnt.shims.js";
|
|
11
|
-
import { withRetries, sleep } from '../../helpers/async.js';
|
|
12
|
-
import { getAbortSignal, cleanAnswer, buildSources } from './scraper.js';
|
|
13
|
-
// ============================================================================
|
|
14
|
-
// Constants
|
|
15
|
-
// ============================================================================
|
|
16
|
-
const API_BASE = 'https://data.oxylabs.io/v1';
|
|
17
|
-
const RETRY_CONFIG = {
|
|
18
|
-
maxRetries: 3,
|
|
19
|
-
initialDelay: 1000,
|
|
20
|
-
statusCodes: [429, 500, 502, 503, 504, 524, 612, 613]
|
|
21
|
-
};
|
|
22
|
-
const MAX_WAIT_MS = 600_000; // 10 minutes
|
|
23
|
-
const POLL_INTERVAL_MS = 5_000;
|
|
24
|
-
// ============================================================================
|
|
25
|
-
// Auth
|
|
26
|
-
// ============================================================================
|
|
27
|
-
function getAuthHeader() {
|
|
28
|
-
const username = dntShim.Deno.env.get('OXYLABS_USERNAME');
|
|
29
|
-
const password = dntShim.Deno.env.get('OXYLABS_PASSWORD');
|
|
30
|
-
if (!username || !password) {
|
|
31
|
-
throw new Error('OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are required');
|
|
32
|
-
}
|
|
33
|
-
return `Basic ${btoa(`${username}:${password}`)}`;
|
|
34
|
-
}
|
|
35
|
-
// ============================================================================
|
|
36
|
-
// Provider Functions
|
|
37
|
-
// ============================================================================
|
|
38
|
-
async function triggerJob(prompt, useSearch, countryISOCode) {
|
|
39
|
-
const authHeader = getAuthHeader();
|
|
40
|
-
const url = `${API_BASE}/queries`;
|
|
41
|
-
const body = {
|
|
42
|
-
source: 'chatgpt',
|
|
43
|
-
prompt,
|
|
44
|
-
parse: true,
|
|
45
|
-
search: true // Oxylabs requires search: true (cannot be false or blank)
|
|
46
|
-
};
|
|
47
|
-
if (countryISOCode) {
|
|
48
|
-
body.geo_location = countryISOCode;
|
|
49
|
-
}
|
|
50
|
-
try {
|
|
51
|
-
const response = await withRetries(() => fetch(url, {
|
|
52
|
-
method: 'POST',
|
|
53
|
-
headers: {
|
|
54
|
-
'Authorization': authHeader,
|
|
55
|
-
'Content-Type': 'application/json'
|
|
56
|
-
},
|
|
57
|
-
body: JSON.stringify(body),
|
|
58
|
-
signal: getAbortSignal()
|
|
59
|
-
}), RETRY_CONFIG);
|
|
60
|
-
if (!response.ok) {
|
|
61
|
-
console.error(`[Oxylabs] Trigger error: ${response.status}`);
|
|
62
|
-
return null;
|
|
63
|
-
}
|
|
64
|
-
const data = await response.json();
|
|
65
|
-
return data?.id || null;
|
|
66
|
-
}
|
|
67
|
-
catch (error) {
|
|
68
|
-
console.error('[Oxylabs] Trigger failed:', error);
|
|
69
|
-
return null;
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
async function monitorJob(jobId) {
|
|
73
|
-
const authHeader = getAuthHeader();
|
|
74
|
-
const url = `${API_BASE}/queries/${jobId}`;
|
|
75
|
-
const startTime = Date.now();
|
|
76
|
-
const abortSignal = getAbortSignal();
|
|
77
|
-
while (Date.now() - startTime < MAX_WAIT_MS) {
|
|
78
|
-
if (abortSignal?.aborted)
|
|
79
|
-
return false;
|
|
80
|
-
try {
|
|
81
|
-
const response = await fetch(url, {
|
|
82
|
-
headers: { 'Authorization': authHeader },
|
|
83
|
-
signal: abortSignal
|
|
84
|
-
});
|
|
85
|
-
// 204 = job not completed yet, continue polling
|
|
86
|
-
if (response.status === 204) {
|
|
87
|
-
await sleep(POLL_INTERVAL_MS, abortSignal);
|
|
88
|
-
continue;
|
|
89
|
-
}
|
|
90
|
-
if (response.ok) {
|
|
91
|
-
const status = await response.json();
|
|
92
|
-
if (status.status === 'done')
|
|
93
|
-
return true;
|
|
94
|
-
if (status.status === 'faulted' || status.status === 'failed')
|
|
95
|
-
return false;
|
|
96
|
-
}
|
|
97
|
-
}
|
|
98
|
-
catch (error) {
|
|
99
|
-
console.error('[Oxylabs] Monitor error:', error);
|
|
100
|
-
}
|
|
101
|
-
await sleep(POLL_INTERVAL_MS, abortSignal);
|
|
102
|
-
}
|
|
103
|
-
console.error(`[Oxylabs] Monitor timeout after ${MAX_WAIT_MS / 1000}s`);
|
|
104
|
-
return false;
|
|
105
|
-
}
|
|
106
|
-
async function downloadJob(jobId) {
|
|
107
|
-
const authHeader = getAuthHeader();
|
|
108
|
-
const url = `${API_BASE}/queries/${jobId}/results`;
|
|
109
|
-
try {
|
|
110
|
-
const response = await withRetries(() => fetch(url, {
|
|
111
|
-
headers: { 'Authorization': authHeader },
|
|
112
|
-
signal: getAbortSignal()
|
|
113
|
-
}), RETRY_CONFIG);
|
|
114
|
-
if (!response.ok) {
|
|
115
|
-
console.error(`[Oxylabs] Download error: ${response.status}`);
|
|
116
|
-
return null;
|
|
117
|
-
}
|
|
118
|
-
return await response.json();
|
|
119
|
-
}
|
|
120
|
-
catch (error) {
|
|
121
|
-
console.error('[Oxylabs] Download failed:', error);
|
|
122
|
-
return null;
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
function transformResponse(raw) {
|
|
126
|
-
const response = raw;
|
|
127
|
-
const content = response?.results?.[0]?.content;
|
|
128
|
-
if (!content)
|
|
129
|
-
return null;
|
|
130
|
-
let answer = content.markdown_text || content.response_text || '';
|
|
131
|
-
answer = cleanAnswer(answer);
|
|
132
|
-
// Map section='citations' to cited=true (like Brightdata's cited field)
|
|
133
|
-
const citations = (content.citations ?? []).map(c => ({
|
|
134
|
-
...c,
|
|
135
|
-
cited: c.section === 'citations'
|
|
136
|
-
}));
|
|
137
|
-
return {
|
|
138
|
-
prompt: content.prompt || '',
|
|
139
|
-
answer,
|
|
140
|
-
sources: buildSources(citations),
|
|
141
|
-
searchQueries: [],
|
|
142
|
-
searchSources: []
|
|
143
|
-
};
|
|
144
|
-
}
|
|
145
|
-
// ============================================================================
|
|
146
|
-
// Export
|
|
147
|
-
// ============================================================================
|
|
148
|
-
export const oxylabsProvider = {
|
|
149
|
-
name: 'Oxylabs',
|
|
150
|
-
maxConcurrency: 10,
|
|
151
|
-
maxPromptsPerRequest: 1,
|
|
152
|
-
triggerJob,
|
|
153
|
-
monitorJob,
|
|
154
|
-
downloadJob,
|
|
155
|
-
transformResponse
|
|
156
|
-
};
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"scraper.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/chatgptScraper/scraper.ts"],"names":[],"mappings":"AAWA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,KAAK,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,iCAAiC,CAAC;AAO5E,MAAM,WAAW,YAAY;IAC5B,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACvB,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC/B;AAED,MAAM,WAAW,iBAAiB;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,cAAc,EAAE,MAAM,CAAC;IACvB,oBAAoB,EAAE,MAAM,CAAC;IAC7B,UAAU,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,cAAc,EAAE,MAAM,GAAG,IAAI,KAAK,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC;IAC1G,UAAU,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;IAChD,WAAW,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;IACjD,iBAAiB,EAAE,CAAC,GAAG,EAAE,OAAO,KAAK,WAAW,GAAG,IAAI,CAAC;CACxD;AAED,MAAM,WAAW,UAAU;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,oBAAoB,EAAE,MAAM,CAAC;IAC7B,cAAc,EAAE,CAAC,OAAO,EAAE,YAAY,KAAK,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC;IACvE,eAAe,EAAE,CAAC,OAAO,EAAE,YAAY,KAAK,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC;IAC1E,oBAAoB,EAAE,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,KAAK,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC;CACpF;AAMD,wBAAgB,cAAc,IAAI,WAAW,GAAG,SAAS,CAExD;AAED,wBAAgB,WAAW,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAMlD;AAED,wBAAgB,YAAY,CAC3B,SAAS,EAAE,KAAK,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAC;IAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,OAAO,CAAA;CAAE,CAAC,EACvG,aAAa,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,GAC3C,KAAK,CAAC,MAAM,CAAC,CAQf;AAED,wBAAgB,kBAAkB,CACjC,OAAO,EAAE,KAAK,CAAC;IAAE,GAAG,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,OAAO,CAAC,EAAE,MAAM,CAAC;IAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IAAC,cAAc,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,GACxG,KAAK,CAAC,YAAY,CAAC,CAQrB;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,YAAY,EAAE,MAAM,EAAE,YAAY,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,GAAG,WAAW,CAS5G;AAMD,wBAAgB,aAAa,CAAC,QAAQ,EAAE,iBAAiB,GAAG,UAAU,CAkErE"}
|
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
/* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
|
|
2
|
-
/**
|
|
3
|
-
* GPT Scraper - Core types and orchestration logic.
|
|
4
|
-
*
|
|
5
|
-
* Uses composition: providers supply functions, this module orchestrates them.
|
|
6
|
-
*/
|
|
7
|
-
import * as dntShim from "../../../_dnt.shims.js";
|
|
8
|
-
import { mapParallel } from '../../helpers/async.js';
|
|
9
|
-
import { extractDomain } from '../../helpers/urls.js';
|
|
10
|
-
// ============================================================================
|
|
11
|
-
// Shared Utilities
|
|
12
|
-
// ============================================================================
|
|
13
|
-
export function getAbortSignal() {
|
|
14
|
-
return dntShim.dntGlobalThis.abortSignal;
|
|
15
|
-
}
|
|
16
|
-
export function cleanAnswer(answer) {
|
|
17
|
-
return answer
|
|
18
|
-
.replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
|
|
19
|
-
.replace(/\n\s*Image\s*\n/g, '\n')
|
|
20
|
-
.replace(/\n{3,}/g, '\n\n')
|
|
21
|
-
.trim();
|
|
22
|
-
}
|
|
23
|
-
export function buildSources(citations, linkPositions) {
|
|
24
|
-
return citations.map(c => ({
|
|
25
|
-
title: c.title || c.description || c.text || '',
|
|
26
|
-
url: c.url,
|
|
27
|
-
domain: extractDomain(c.url),
|
|
28
|
-
cited: c.cited,
|
|
29
|
-
positions: linkPositions?.[c.url]
|
|
30
|
-
}));
|
|
31
|
-
}
|
|
32
|
-
export function buildSearchSources(sources) {
|
|
33
|
-
return sources.map(s => ({
|
|
34
|
-
title: s.title || s.snippet || '',
|
|
35
|
-
url: s.url || '',
|
|
36
|
-
domain: s.url ? extractDomain(s.url) : '',
|
|
37
|
-
rank: s.rank || 0,
|
|
38
|
-
datePublished: s.date_published || null
|
|
39
|
-
}));
|
|
40
|
-
}
|
|
41
|
-
/**
|
|
42
|
-
* Creates an empty model result for failed jobs.
|
|
43
|
-
* This ensures we always return the same number of rows as input.
|
|
44
|
-
*/
|
|
45
|
-
export function emptyModelResult(providerName, errorMessage, context) {
|
|
46
|
-
if (errorMessage) {
|
|
47
|
-
console.error(`[${providerName}] ${errorMessage}`, context ?? '');
|
|
48
|
-
}
|
|
49
|
-
return {
|
|
50
|
-
prompt: '',
|
|
51
|
-
answer: '',
|
|
52
|
-
sources: []
|
|
53
|
-
};
|
|
54
|
-
}
|
|
55
|
-
// ============================================================================
|
|
56
|
-
// Scraper Factory
|
|
57
|
-
// ============================================================================
|
|
58
|
-
export function createScraper(provider) {
|
|
59
|
-
const { name, maxConcurrency, maxPromptsPerRequest, triggerJob, monitorJob, downloadJob, transformResponse } = provider;
|
|
60
|
-
async function triggerGPTBatch({ prompts, useSearch = false, countryISOCode = null }) {
|
|
61
|
-
const jobIds = await mapParallel(prompts, maxConcurrency, (prompt) => triggerJob(prompt, useSearch, countryISOCode));
|
|
62
|
-
console.log(`[${name}] Triggered ${jobIds.length} jobs for ${prompts.length} prompts`);
|
|
63
|
-
return jobIds;
|
|
64
|
-
}
|
|
65
|
-
async function downloadGPTSnapshots(jobIds) {
|
|
66
|
-
const results = [];
|
|
67
|
-
for (const jobId of jobIds) {
|
|
68
|
-
if (!jobId) {
|
|
69
|
-
results.push(emptyModelResult(name, 'No job ID provided'));
|
|
70
|
-
continue;
|
|
71
|
-
}
|
|
72
|
-
const isReady = await monitorJob(jobId);
|
|
73
|
-
if (!isReady) {
|
|
74
|
-
results.push(emptyModelResult(name, 'Job not ready or failed', jobId));
|
|
75
|
-
continue;
|
|
76
|
-
}
|
|
77
|
-
const raw = await downloadJob(jobId);
|
|
78
|
-
if (!raw) {
|
|
79
|
-
results.push(emptyModelResult(name, 'Failed to download job', jobId));
|
|
80
|
-
continue;
|
|
81
|
-
}
|
|
82
|
-
const result = transformResponse(raw);
|
|
83
|
-
results.push(result ?? emptyModelResult(name, 'Failed to transform response', jobId));
|
|
84
|
-
}
|
|
85
|
-
return results;
|
|
86
|
-
}
|
|
87
|
-
async function scrapeGPTBatch(options) {
|
|
88
|
-
const jobIds = await triggerGPTBatch(options);
|
|
89
|
-
return downloadGPTSnapshots(jobIds);
|
|
90
|
-
}
|
|
91
|
-
return {
|
|
92
|
-
maxConcurrency,
|
|
93
|
-
maxPromptsPerRequest,
|
|
94
|
-
scrapeGPTBatch,
|
|
95
|
-
triggerGPTBatch,
|
|
96
|
-
downloadGPTSnapshots
|
|
97
|
-
};
|
|
98
|
-
}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/brightdata/index.ts"],"names":[],"mappings":"AAAA,cAAc,aAAa,CAAC"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"scrape.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/brightdata/scrape.ts"],"names":[],"mappings":"AACA,OAAO,EAA4B,KAAK,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAapF,MAAM,WAAW,uBAAuB;IACpC,wDAAwD;IACxD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,6DAA6D;IAC7D,MAAM,CAAC,EAAE,KAAK,GAAG,MAAM,CAAC;IACxB,8DAA8D;IAC9D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,wDAAwD;IACxD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,oDAAoD;IACpD,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,kCAAkC;IAClC,WAAW,CAAC,EAAE,WAAW,CAAC;CAC7B;AAED,MAAM,WAAW,wBAAwB;IACrC,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,CAAC,EAAE,MAAM,CAAC;CACjB;AA4ED;;;GAGG;AACH,wBAAsB,gBAAgB,CAClC,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,uBAA4B,GACtC,OAAO,CAAC,wBAAwB,CAAC,CAYnC;AAED;;;GAGG;AACH,wBAAsB,qBAAqB,CACvC,IAAI,EAAE,KAAK,CAAC,MAAM,CAAC,EACnB,OAAO,GAAE,uBAA4B,EACrC,cAAc,GAAE,MAA+B,GAChD,OAAO,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC,CAQ1C"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"brightdata.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/chatgptScraper/brightdata.ts"],"names":[],"mappings":"AAeA,OAAO,EACN,KAAK,iBAAiB,EAKtB,MAAM,cAAc,CAAC;AAmNtB,eAAO,MAAM,kBAAkB,EAAE,iBAQhC,CAAC"}
|