@graphext/cuery 0.9.5 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/browser.d.ts +1 -1
- package/esm/browser.js +1 -1
- package/esm/mod.d.ts +3 -3
- package/esm/mod.d.ts.map +1 -1
- package/esm/mod.js +3 -3
- package/esm/src/apis/brightdata/contentScraper/index.d.ts.map +1 -0
- package/{script/src/apis/brightdata → esm/src/apis/brightdata/contentScraper}/scrape.d.ts +1 -1
- package/esm/src/apis/brightdata/contentScraper/scrape.d.ts.map +1 -0
- package/esm/src/apis/brightdata/{scrape.js → contentScraper/scrape.js} +2 -2
- package/esm/src/apis/brightdata/llmScraper/brightdata.d.ts +20 -0
- package/esm/src/apis/brightdata/llmScraper/brightdata.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/brightdata.js +182 -0
- package/esm/src/apis/brightdata/llmScraper/index.d.ts +14 -0
- package/esm/src/apis/brightdata/llmScraper/index.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/index.js +97 -0
- package/esm/src/apis/brightdata/llmScraper/oxy.d.ts +16 -0
- package/esm/src/apis/brightdata/llmScraper/oxy.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/oxy.js +171 -0
- package/{script/src/apis/chatgptScraper/scraper.d.ts → esm/src/apis/brightdata/llmScraper/scrape.d.ts} +12 -15
- package/esm/src/apis/brightdata/llmScraper/scrape.d.ts.map +1 -0
- package/esm/src/apis/brightdata/llmScraper/scrape.js +184 -0
- package/esm/src/apis/hasdata/helpers.d.ts.map +1 -1
- package/esm/src/apis/hasdata/helpers.js +56 -18
- package/esm/src/schemas/search.schema.d.ts +2 -2
- package/esm/src/schemas/search.schema.d.ts.map +1 -1
- package/esm/src/schemas/sources.schema.d.ts +1 -4
- package/esm/src/schemas/sources.schema.d.ts.map +1 -1
- package/package.json +1 -1
- package/script/browser.d.ts +1 -1
- package/script/browser.js +1 -1
- package/script/mod.d.ts +3 -3
- package/script/mod.d.ts.map +1 -1
- package/script/mod.js +6 -6
- package/script/src/apis/brightdata/contentScraper/index.d.ts.map +1 -0
- package/{esm/src/apis/brightdata → script/src/apis/brightdata/contentScraper}/scrape.d.ts +1 -1
- package/script/src/apis/brightdata/contentScraper/scrape.d.ts.map +1 -0
- package/script/src/apis/brightdata/{scrape.js → contentScraper/scrape.js} +2 -2
- package/script/src/apis/brightdata/llmScraper/brightdata.d.ts +20 -0
- package/script/src/apis/brightdata/llmScraper/brightdata.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/brightdata.js +219 -0
- package/script/src/apis/brightdata/llmScraper/index.d.ts +14 -0
- package/script/src/apis/brightdata/llmScraper/index.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/index.js +140 -0
- package/script/src/apis/brightdata/llmScraper/oxy.d.ts +16 -0
- package/script/src/apis/brightdata/llmScraper/oxy.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/oxy.js +208 -0
- package/{esm/src/apis/chatgptScraper/scraper.d.ts → script/src/apis/brightdata/llmScraper/scrape.d.ts} +12 -15
- package/script/src/apis/brightdata/llmScraper/scrape.d.ts.map +1 -0
- package/script/src/apis/brightdata/llmScraper/scrape.js +224 -0
- package/script/src/apis/hasdata/helpers.d.ts.map +1 -1
- package/script/src/apis/hasdata/helpers.js +56 -18
- package/script/src/schemas/search.schema.d.ts +2 -2
- package/script/src/schemas/search.schema.d.ts.map +1 -1
- package/script/src/schemas/sources.schema.d.ts +1 -4
- package/script/src/schemas/sources.schema.d.ts.map +1 -1
- package/esm/src/apis/brightdata/index.d.ts.map +0 -1
- package/esm/src/apis/brightdata/scrape.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/brightdata.d.ts +0 -3
- package/esm/src/apis/chatgptScraper/brightdata.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/brightdata.js +0 -172
- package/esm/src/apis/chatgptScraper/index.d.ts +0 -10
- package/esm/src/apis/chatgptScraper/index.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/index.js +0 -41
- package/esm/src/apis/chatgptScraper/oxy.d.ts +0 -3
- package/esm/src/apis/chatgptScraper/oxy.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/oxy.js +0 -156
- package/esm/src/apis/chatgptScraper/scraper.d.ts.map +0 -1
- package/esm/src/apis/chatgptScraper/scraper.js +0 -98
- package/script/src/apis/brightdata/index.d.ts.map +0 -1
- package/script/src/apis/brightdata/scrape.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/brightdata.d.ts +0 -3
- package/script/src/apis/chatgptScraper/brightdata.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/brightdata.js +0 -208
- package/script/src/apis/chatgptScraper/index.d.ts +0 -10
- package/script/src/apis/chatgptScraper/index.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/index.js +0 -81
- package/script/src/apis/chatgptScraper/oxy.d.ts +0 -3
- package/script/src/apis/chatgptScraper/oxy.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/oxy.js +0 -192
- package/script/src/apis/chatgptScraper/scraper.d.ts.map +0 -1
- package/script/src/apis/chatgptScraper/scraper.js +0 -139
- /package/esm/src/apis/brightdata/{index.d.ts → contentScraper/index.d.ts} +0 -0
- /package/esm/src/apis/brightdata/{index.js → contentScraper/index.js} +0 -0
- /package/script/src/apis/brightdata/{index.d.ts → contentScraper/index.d.ts} +0 -0
- /package/script/src/apis/brightdata/{index.js → contentScraper/index.js} +0 -0
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.getAbortSignal = getAbortSignal;
|
|
37
|
+
exports.cleanAnswer = cleanAnswer;
|
|
38
|
+
exports.buildSources = buildSources;
|
|
39
|
+
exports.emptyModelResult = emptyModelResult;
|
|
40
|
+
exports.createLLMScraper = createLLMScraper;
|
|
41
|
+
/* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
|
|
42
|
+
/**
|
|
43
|
+
* LLM Scraper - Core types and orchestration logic.
|
|
44
|
+
*
|
|
45
|
+
* Uses composition: providers supply functions, this module orchestrates them.
|
|
46
|
+
*/
|
|
47
|
+
const dntShim = __importStar(require("../../../../_dnt.shims.js"));
|
|
48
|
+
const async_js_1 = require("../../../helpers/async.js");
|
|
49
|
+
const urls_js_1 = require("../../../helpers/urls.js");
|
|
50
|
+
// ============================================================================
|
|
51
|
+
// Shared Utilities
|
|
52
|
+
// ============================================================================
|
|
53
|
+
/**
 * Look up the abort signal published on the dnt global object.
 *
 * @returns {*} Whatever is stored under `abortSignal` on
 *   `dntShim.dntGlobalThis`; `undefined` when nothing has been set there.
 *   (Presumably an `AbortSignal` — callers pass it as `fetch`'s `signal`.)
 */
function getAbortSignal() {
    const { abortSignal } = dntShim.dntGlobalThis;
    return abortSignal;
}
|
|
56
|
+
/**
 * Strip image artefacts from a scraped markdown answer.
 *
 * Steps, in order:
 *  1. remove markdown image tags (`![alt](url)`);
 *  2. remove lines that contain only the word `Image` (surrounded by optional
 *     whitespace) when they sit between two newlines;
 *  3. collapse runs of three or more newlines into a single blank line;
 *  4. trim leading/trailing whitespace.
 *
 * @param {string} answer - Raw answer text.
 * @returns {string} The cleaned text; `''` when `answer` is not a string.
 */
function cleanAnswer(answer) {
    if (typeof answer !== 'string') {
        return '';
    }
    return answer
        .replace(/!\[[^\]]*\]\([^)]+\)/g, '')
        // The closing newline is only looked ahead at, not consumed, so it can
        // also serve as the opening newline of the next placeholder line.
        // Consuming it left every second line of a consecutive run in place.
        .replace(/\n\s*Image\s*(?=\n)/g, '')
        .replace(/\n{3,}/g, '\n\n')
        .trim();
}
|
|
63
|
+
/**
 * Compute the key under which two URLs are treated as the same source.
 *
 * The key is the URL's origin followed by its pathname, so the query string
 * and fragment are ignored. Input that `new URL()` rejects is returned
 * unchanged and therefore only merges with an identical raw string.
 *
 * @param {string} url - URL to derive the key from.
 * @returns {string} `origin + pathname`, or `url` itself when it cannot be parsed.
 */
function urlMergeKey(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return url;
    }
    return `${parsed.origin}${parsed.pathname}`;
}
|
|
76
|
+
/**
 * Decide whether `candidate` adds a fragment or a query string that
 * `current` lacks.
 *
 * Only presence is compared: a candidate with a *different* hash or query
 * than an already-present one does not count as extra information.
 *
 * NOTE(review): a candidate that adds a hash but has no query string still
 * returns true when `current` has a query string — confirm callers are fine
 * with swapping to a URL that drops those params.
 *
 * @param {string} current - URL currently stored.
 * @param {string} candidate - URL being considered as a replacement.
 * @returns {boolean} `true` when the candidate introduces a hash or query
 *   string; `false` otherwise, including when either URL cannot be parsed.
 */
function hasExtraUrlInfo(current, candidate) {
    let currentUrl;
    let candidateUrl;
    try {
        currentUrl = new URL(current);
        candidateUrl = new URL(candidate);
    }
    catch {
        return false;
    }
    const addsHash = currentUrl.hash === '' && candidateUrl.hash !== '';
    if (addsHash) {
        return true;
    }
    return currentUrl.search === '' && candidateUrl.search !== '';
}
|
|
92
|
+
/**
 * Merge attached links and citations into one de-duplicated list of sources.
 *
 * Entries are merged by `urlMergeKey(url)`. Attached links are handled first,
 * in ascending `position` order (links without a position sort last), and are
 * always recorded as cited. Citations follow in input order: a citation that
 * matches an existing source overrides its title when the citation has one,
 * and ORs its `cited` flag in; otherwise it becomes a new source.
 * Whenever a later URL for the same key adds a hash or query string
 * (`hasExtraUrlInfo`), it replaces the stored URL.
 *
 * `null`/`undefined` lists and nullish entries inside them are skipped rather
 * than throwing.
 *
 * @param {Array<object>|null|undefined} citations - Citation records; the
 *   fields read are `url`, `title`, `description`, `text` and `cited`.
 * @param {Array<object>|null|undefined} [linksAttached=[]] - Inline links; the
 *   fields read are `url`, `text` and `position`.
 * @returns {Array<object>} Sources in first-seen order, each shaped
 *   `{ title, url, domain, cited, positions? }`; `positions` is sorted
 *   ascending and only present when at least one link carried a position.
 */
function buildSources(citations, linksAttached = []) {
    const sources = [];
    const sourcesByKey = new Map();
    // Insert a new source or fold the given data into the existing one.
    const upsertSource = (url, initialTitle, cited) => {
        const key = urlMergeKey(url);
        const existing = sourcesByKey.get(key);
        if (existing) {
            if (!existing.title && initialTitle) {
                existing.title = initialTitle;
            }
            existing.cited = existing.cited || cited;
            // Keep the most informative URL (with fragment/params)
            if (hasExtraUrlInfo(existing.url, url)) {
                existing.url = url;
            }
            return existing;
        }
        const source = {
            title: initialTitle,
            url,
            domain: (0, urls_js_1.extractDomain)(url),
            cited,
        };
        sources.push(source);
        sourcesByKey.set(key, source);
        return source;
    };
    // Sort a copy so the caller's array keeps its order.
    const sortedLinks = [...(linksAttached ?? [])].sort((a, b) => {
        const aPos = a?.position ?? Number.MAX_SAFE_INTEGER;
        const bPos = b?.position ?? Number.MAX_SAFE_INTEGER;
        return aPos - bPos;
    });
    for (const link of sortedLinks) {
        if (!link?.url) {
            continue;
        }
        const source = upsertSource(link.url, link.text ?? '', true);
        if (link.position != null) {
            source.positions ??= [];
            if (!source.positions.includes(link.position)) {
                source.positions.push(link.position);
            }
        }
    }
    for (const citation of citations ?? []) {
        if (!citation?.url) {
            continue;
        }
        const key = urlMergeKey(citation.url);
        const existing = sourcesByKey.get(key);
        const title = citation.title || citation.description || citation.text || '';
        if (existing) {
            // Unlike links, a citation title wins over whatever is already stored.
            if (title) {
                existing.title = title;
            }
            existing.cited = existing.cited || citation.cited;
            // Append extra fragment/params from citation
            if (hasExtraUrlInfo(existing.url, citation.url)) {
                existing.url = citation.url;
            }
            continue;
        }
        const source = {
            title,
            url: citation.url,
            domain: (0, urls_js_1.extractDomain)(citation.url),
            cited: citation.cited,
        };
        sources.push(source);
        sourcesByKey.set(key, source);
    }
    for (const source of sources) {
        source.positions?.sort((a, b) => a - b);
    }
    return sources;
}
|
|
166
|
+
/**
 * Build the placeholder result used for a job that produced no data, so the
 * output keeps one row per input.
 *
 * When `errorMessage` is truthy it is written to `console.error`, prefixed
 * with the provider name and followed by `context` (or `''` when `context`
 * is null/undefined).
 *
 * @param {string} providerName - Label used as the log prefix.
 * @param {string} [errorMessage] - Reason for the empty result; nothing is
 *   logged when omitted or empty.
 * @param {*} [context] - Extra value appended to the log call (e.g. a job id).
 * @returns {{prompt: string, answer: string, answer_text_markdown: string, sources: Array}}
 *   A fresh object with empty strings and an empty `sources` array.
 */
function emptyModelResult(providerName, errorMessage, context) {
    if (errorMessage) {
        const detail = context ?? '';
        console.error(`[${providerName}] ${errorMessage}`, detail);
    }
    const blank = {
        prompt: '',
        answer: '',
        answer_text_markdown: '',
        sources: [],
    };
    return blank;
}
|
|
181
|
+
// ============================================================================
|
|
182
|
+
// Scraper Factory
|
|
183
|
+
// ============================================================================
|
|
184
|
+
/**
 * Build a scraper facade around a provider's job functions.
 *
 * The provider supplies the primitives (`triggerJob`, `monitorJob`,
 * `downloadJob`, `transformResponse`); this factory wires them into batch
 * operations.
 *
 * @param {object} provider - Provider definition. Fields read: `name`,
 *   `maxConcurrency`, `maxPromptsPerRequest`, `triggerJob(prompt, useSearch,
 *   countryISOCode)`, `monitorJob(jobId)`, `downloadJob(jobId)`,
 *   `transformResponse(raw)`.
 * @returns {object} `{ maxConcurrency, maxPromptsPerRequest, scrapeLLMBatch,
 *   triggerLLMBatch, downloadLLMSnapshots }`.
 */
function createLLMScraper(provider) {
    const { name, maxConcurrency, maxPromptsPerRequest, triggerJob, monitorJob, downloadJob, transformResponse, } = provider;
    /**
     * Trigger one job per prompt through `mapParallel`, capped at
     * `maxConcurrency`. Returns whatever `triggerJob` produced per prompt;
     * entries are falsy for prompts whose trigger did not yield a job id.
     */
    async function triggerLLMBatch({ prompts, useSearch = false, countryISOCode = null, }) {
        const jobIds = await (0, async_js_1.mapParallel)(prompts, maxConcurrency, (prompt) => triggerJob(prompt, useSearch, countryISOCode));
        // Count only ids that were actually returned, so failed triggers
        // are not reported as triggered jobs.
        const triggeredCount = jobIds.filter(Boolean).length;
        console.log(`[${name}] Triggered ${triggeredCount} jobs for ${prompts.length} prompts`);
        return jobIds;
    }
    /**
     * Resolve a single job id to a model result, substituting an empty result
     * (and logging why) at the first step that fails.
     */
    async function resolveSnapshot(jobId) {
        if (!jobId) {
            return emptyModelResult(name, 'No job ID provided');
        }
        const isReady = await monitorJob(jobId);
        if (!isReady) {
            return emptyModelResult(name, 'Job not ready or failed', jobId);
        }
        const raw = await downloadJob(jobId);
        if (!raw) {
            return emptyModelResult(name, 'Failed to download job', jobId);
        }
        const result = transformResponse(raw);
        return result ?? emptyModelResult(name, 'Failed to transform response', jobId);
    }
    /**
     * Wait for and download each job, one after another, in input order.
     * Always yields exactly one result per entry in `jobIds`.
     */
    async function downloadLLMSnapshots(jobIds) {
        const results = [];
        for (const jobId of jobIds) {
            results.push(await resolveSnapshot(jobId));
        }
        return results;
    }
    /** Trigger a batch, then wait for and download its results. */
    async function scrapeLLMBatch(options) {
        const jobIds = await triggerLLMBatch(options);
        return downloadLLMSnapshots(jobIds);
    }
    return {
        maxConcurrency,
        maxPromptsPerRequest,
        scrapeLLMBatch,
        triggerLLMBatch,
        downloadLLMSnapshots,
    };
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"helpers.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/hasdata/helpers.ts"],"names":[],"mappings":"AACA,OAAO,EAAe,KAAK,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAEvE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,iCAAiC,CAAC;AAG9D,eAAO,MAAM,mBAAmB,KAAK,CAAC;AAEtC,eAAO,MAAM,oBAAoB,EAAE,WAMlC,CAAC;AAEF,wBAAgB,gBAAgB,IAAI,MAAM,CAMzC;AAED,wBAAsB,qBAAqB,CAC1C,GAAG,EAAE,MAAM,EACX,WAAW,GAAE,WAAkC,GAC7C,OAAO,CAAC,QAAQ,CAAC,CAgCnB;AAED,UAAU,QAAQ;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;CACvB;AAED,UAAU,SAAS;IAClB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uBAAuB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACxC,gBAAgB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACjC,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IACvB,IAAI,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IAC5B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,UAAU,SAAS;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,UAAU;IAC1B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,UAAU,CAAC;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,UAAU,eAAe;IACxB,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;CACb;AAED,MAAM,WAAW,MAAM;IACtB,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;CAC9B;AAED,MAAM,WAAW,SAAS;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CACvB;
|
|
1
|
+
{"version":3,"file":"helpers.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/hasdata/helpers.ts"],"names":[],"mappings":"AACA,OAAO,EAAe,KAAK,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAEvE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,iCAAiC,CAAC;AAG9D,eAAO,MAAM,mBAAmB,KAAK,CAAC;AAEtC,eAAO,MAAM,oBAAoB,EAAE,WAMlC,CAAC;AAEF,wBAAgB,gBAAgB,IAAI,MAAM,CAMzC;AAED,wBAAsB,qBAAqB,CAC1C,GAAG,EAAE,MAAM,EACX,WAAW,GAAE,WAAkC,GAC7C,OAAO,CAAC,QAAQ,CAAC,CAgCnB;AAED,UAAU,QAAQ;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;CACvB;AAED,UAAU,SAAS;IAClB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uBAAuB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACxC,gBAAgB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACjC,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IACvB,IAAI,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IAC5B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,UAAU,SAAS;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,UAAU;IAC1B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,UAAU,CAAC;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,UAAU,eAAe;IACxB,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;CACb;AAED,MAAM,WAAW,MAAM;IACtB,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;CAC9B;AAED,MAAM,WAAW,SAAS;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CACvB;AAqMD,wBAAgB,QAAQ,CAAC,GAAG,EAAE,UAAU,GAAG,SAAS,CAInD;AAED,wBAAgB,QAAQ,CAAC,GAAG,EAAE,MAAM,GAAG,SAAS,CAI/C"}
|
|
@@ -156,8 +156,38 @@ function formatCode(block) {
|
|
|
156
156
|
const header = `[Code${lang ? ': ' + lang : ''}]`;
|
|
157
157
|
return `${header}\n${snippet.trim()}`;
|
|
158
158
|
}
|
|
159
|
+
/**
 * Render citation markers such as ` [1][3]` for appending to a text block.
 *
 * @param {Array<number>} refIndexes - Zero-based indexes; each is shown
 *   one-based inside square brackets.
 * @returns {string} A leading space followed by the concatenated markers, or
 *   `''` when `refIndexes` is empty.
 */
function formatCitationMarkers(refIndexes) {
    return refIndexes.length === 0
        ? ''
        : ` ${refIndexes.map((zeroBased) => `[${zeroBased + 1}]`).join('')}`;
}
|
|
159
165
|
function parseAIResult(data, { allowNestedOverview = true } = {}) {
|
|
160
166
|
const textBlocks = data.textBlocks || (allowNestedOverview ? data.aiOverview?.textBlocks : []) || [];
|
|
167
|
+
// Build reference index → source index mapping and track cited refs
|
|
168
|
+
const refs = data.references || (allowNestedOverview ? data.aiOverview?.references : []) || [];
|
|
169
|
+
const sources = [];
|
|
170
|
+
const refIndexToSourceIndex = new Map();
|
|
171
|
+
for (const r of refs) {
|
|
172
|
+
const link = r.link || r.url;
|
|
173
|
+
const title = [r.title, r.source, r.snippet].filter(Boolean).join(' - ');
|
|
174
|
+
if (link && r.index != null) {
|
|
175
|
+
// Deduplicate by URL
|
|
176
|
+
const existingIdx = sources.findIndex(s => s.url === link);
|
|
177
|
+
if (existingIdx >= 0) {
|
|
178
|
+
refIndexToSourceIndex.set(r.index, existingIdx);
|
|
179
|
+
}
|
|
180
|
+
else {
|
|
181
|
+
refIndexToSourceIndex.set(r.index, sources.length);
|
|
182
|
+
sources.push({
|
|
183
|
+
title,
|
|
184
|
+
url: link,
|
|
185
|
+
domain: (0, urls_js_1.extractDomain)(link)
|
|
186
|
+
});
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
const citedSourceIndexes = new Set();
|
|
161
191
|
const parts = [];
|
|
162
192
|
const handlers = {
|
|
163
193
|
paragraph: (b) => cleanText(b.snippet || ''),
|
|
@@ -171,19 +201,40 @@ function parseAIResult(data, { allowNestedOverview = true } = {}) {
|
|
|
171
201
|
continue;
|
|
172
202
|
}
|
|
173
203
|
const handler = handlers[btype];
|
|
204
|
+
let rendered = '';
|
|
174
205
|
if (handler) {
|
|
175
|
-
const rendered = handler(block);
|
176
|
-
if (rendered) {
|
|
177
|
-
parts.push(rendered);
|
|
178
|
-
}
|
|
206
|
+
rendered = handler(block);
|
|
179
207
|
}
|
|
180
208
|
else {
|
|
181
209
|
const snippet = block.snippet || '';
|
|
182
210
|
if (snippet) {
|
|
183
|
-
parts.push(cleanText(snippet));
|
211
|
+
rendered = cleanText(snippet);
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
if (rendered) {
|
|
215
|
+
// Append citation markers and track positions
|
|
216
|
+
const refIndexes = block.referenceIndexes || [];
|
|
217
|
+
if (refIndexes.length > 0) {
|
|
218
|
+
// Map ref indexes to 1-based source indexes for display
|
|
219
|
+
const sourceIndexes = refIndexes
|
|
220
|
+
.map(ri => refIndexToSourceIndex.get(ri))
|
|
221
|
+
.filter((si) => si != null);
|
|
222
|
+
for (const si of sourceIndexes) {
|
|
223
|
+
citedSourceIndexes.add(si);
|
|
224
|
+
sources[si].positions ??= [];
|
|
225
|
+
if (!sources[si].positions.includes(parts.length)) {
|
|
226
|
+
sources[si].positions.push(parts.length);
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
rendered += formatCitationMarkers(sourceIndexes.filter((v, i, a) => a.indexOf(v) === i));
|
|
184
230
|
}
|
|
231
|
+
parts.push(rendered);
|
|
185
232
|
}
|
|
186
233
|
}
|
|
234
|
+
// Mark cited sources
|
|
235
|
+
for (const si of citedSourceIndexes) {
|
|
236
|
+
sources[si].cited = true;
|
|
237
|
+
}
|
|
187
238
|
const deduped = [];
|
|
188
239
|
for (const p of parts) {
|
|
189
240
|
if (deduped.length === 0 || deduped[deduped.length - 1] !== p) {
|
|
@@ -195,19 +246,6 @@ function parseAIResult(data, { allowNestedOverview = true } = {}) {
|
|
|
195
246
|
console.warn('Warning: AI answer truncated to 16000 characters');
|
|
196
247
|
answer = answer.slice(0, 16000);
|
|
197
248
|
}
|
|
198
|
-
const refs = data.references || (allowNestedOverview ? data.aiOverview?.references : []) || [];
|
|
199
|
-
const sources = [];
|
|
200
|
-
for (const r of refs) {
|
|
201
|
-
const link = r.link || r.url;
|
|
202
|
-
const title = [r.title, r.source, r.snippet].filter(Boolean).join(' - ');
|
|
203
|
-
if (link) {
|
|
204
|
-
sources.push({
|
|
205
|
-
title,
|
|
206
|
-
url: link,
|
|
207
|
-
domain: (0, urls_js_1.extractDomain)(link)
|
|
208
|
-
});
|
|
209
|
-
}
|
|
210
|
-
}
|
|
211
249
|
return { answer, sources };
|
|
212
250
|
}
|
|
213
251
|
function parseAIO(aio) {
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import type { z } from '../../deps/jsr.io/@zod/zod/4.3.6/src/index.js';
|
|
2
|
-
import type { Source, SearchSource } from './sources.schema.js';
|
|
2
|
+
import type { Source } from './sources.schema.js';
|
|
3
3
|
export type ContextSize = 'low' | 'medium' | 'high';
|
|
4
4
|
export type ReasoningEffort = 'low' | 'medium' | 'high';
|
|
5
5
|
export interface SearchResult {
|
|
6
6
|
answer: string;
|
|
7
|
+
answer_text_markdown?: string;
|
|
7
8
|
sources: Array<Source>;
|
|
8
9
|
searchQueries?: Array<string>;
|
|
9
|
-
searchSources?: Array<SearchSource>;
|
|
10
10
|
}
|
|
11
11
|
export type SearchOptions = {
|
|
12
12
|
prompt: string;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"search.schema.d.ts","sourceRoot":"","sources":["../../../src/src/schemas/search.schema.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,+CAA+C,CAAC;AAEvE,OAAO,KAAK,EAAE,MAAM,EAAE,
|
|
1
|
+
{"version":3,"file":"search.schema.d.ts","sourceRoot":"","sources":["../../../src/src/schemas/search.schema.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,+CAA+C,CAAC;AAEvE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAC;AAClD,MAAM,MAAM,WAAW,GAAG,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;AACpD,MAAM,MAAM,eAAe,GAAG,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;AAExD,MAAM,WAAW,YAAY;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACvB,aAAa,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC9B;AAED,MAAM,MAAM,aAAa,GAAG;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB;4FACwF;IACxF,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC,UAAU,CAAC,EAAE,YAAY,GAAG,oBAAoB,CAAC;CACjD,CAAC;AAEF,MAAM,MAAM,sBAAsB,CAAC,CAAC,IAAI,aAAa,GAAG;IACvD,cAAc,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;CAC7B,CAAC;AAEF,MAAM,MAAM,kBAAkB,GAAG,IAAI,CAAC,aAAa,EAAE,QAAQ,CAAC,GAAG;IAChE,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,CAAC;CACxB,CAAC"}
|
|
@@ -3,6 +3,7 @@ export interface Source {
|
|
|
3
3
|
url: string;
|
|
4
4
|
domain: string;
|
|
5
5
|
cited?: boolean;
|
|
6
|
+
snippet?: string;
|
|
6
7
|
positions?: Array<number>;
|
|
7
8
|
}
|
|
8
9
|
/**
|
|
@@ -22,8 +23,4 @@ export interface CategorizedSource extends EnrichedSource {
|
|
|
22
23
|
category: string | null;
|
|
23
24
|
subcategory: string | null;
|
|
24
25
|
}
|
|
25
|
-
export interface SearchSource extends Source {
|
|
26
|
-
rank: number;
|
|
27
|
-
datePublished: string | null;
|
|
28
|
-
}
|
|
29
26
|
//# sourceMappingURL=sources.schema.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sources.schema.d.ts","sourceRoot":"","sources":["../../../src/src/schemas/sources.schema.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,MAAM;IACtB,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,SAAS,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC1B;AAED;;;GAGG;AACH,MAAM,WAAW,cAAe,SAAQ,MAAM;IAC7C,eAAe,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAC/B,oBAAoB,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACpC,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,gBAAgB,EAAE,MAAM,GAAG,IAAI,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,iBAAkB,SAAQ,cAAc;IACxD,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B
|
|
1
|
+
{"version":3,"file":"sources.schema.d.ts","sourceRoot":"","sources":["../../../src/src/schemas/sources.schema.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,MAAM;IACtB,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC1B;AAED;;;GAGG;AACH,MAAM,WAAW,cAAe,SAAQ,MAAM;IAC7C,eAAe,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAC/B,oBAAoB,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACpC,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,gBAAgB,EAAE,MAAM,GAAG,IAAI,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,iBAAkB,SAAQ,cAAc;IACxD,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/brightdata/index.ts"],"names":[],"mappings":"AAAA,cAAc,aAAa,CAAC"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"scrape.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/brightdata/scrape.ts"],"names":[],"mappings":"AACA,OAAO,EAA4B,KAAK,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAapF,MAAM,WAAW,uBAAuB;IACpC,wDAAwD;IACxD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,6DAA6D;IAC7D,MAAM,CAAC,EAAE,KAAK,GAAG,MAAM,CAAC;IACxB,8DAA8D;IAC9D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,wDAAwD;IACxD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,oDAAoD;IACpD,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,kCAAkC;IAClC,WAAW,CAAC,EAAE,WAAW,CAAC;CAC7B;AAED,MAAM,WAAW,wBAAwB;IACrC,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,CAAC,EAAE,MAAM,CAAC;CACjB;AA4ED;;;GAGG;AACH,wBAAsB,gBAAgB,CAClC,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,uBAA4B,GACtC,OAAO,CAAC,wBAAwB,CAAC,CAYnC;AAED;;;GAGG;AACH,wBAAsB,qBAAqB,CACvC,IAAI,EAAE,KAAK,CAAC,MAAM,CAAC,EACnB,OAAO,GAAE,uBAA4B,EACrC,cAAc,GAAE,MAA+B,GAChD,OAAO,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC,CAQ1C"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"brightdata.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/chatgptScraper/brightdata.ts"],"names":[],"mappings":"AAeA,OAAO,EACN,KAAK,iBAAiB,EAKtB,MAAM,cAAc,CAAC;AAmNtB,eAAO,MAAM,kBAAkB,EAAE,iBAQhC,CAAC"}
|
|
@@ -1,172 +0,0 @@
|
|
|
1
|
-
/* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
|
|
2
|
-
/**
|
|
3
|
-
* Brightdata GPT Scraper Provider.
|
|
4
|
-
*
|
|
5
|
-
* API Flow:
|
|
6
|
-
* 1. Trigger: POST to /datasets/v3/trigger → returns snapshot_id
|
|
7
|
-
* 2. Monitor: GET /datasets/v3/progress/{snapshot_id} until ready
|
|
8
|
-
* 3. Download: GET /datasets/v3/snapshot/{snapshot_id}
|
|
9
|
-
*/
|
|
10
|
-
import * as dntShim from "../../../_dnt.shims.js";
|
|
11
|
-
import { withRetries, sleep } from '../../helpers/async.js';
|
|
12
|
-
import { getAbortSignal, cleanAnswer, buildSources, buildSearchSources } from './scraper.js';
|
|
13
|
-
// ============================================================================
|
|
14
|
-
// Constants
|
|
15
|
-
// ============================================================================
|
|
16
|
-
const API_BASE = 'https://api.brightdata.com';
|
|
17
|
-
const DATASET_ID = 'gd_m7aof0k82r803d5bjm';
|
|
18
|
-
const OUTPUT_FIELDS = 'url|prompt|answer_text|answer_text_markdown|citations|links_attached|search_sources|country|model|web_search_triggered|web_search_query|index';
|
|
19
|
-
const TRIGGER_RETRY = {
|
|
20
|
-
maxRetries: 3,
|
|
21
|
-
initialDelay: 0,
|
|
22
|
-
statusCodes: [429, 500, 502, 503, 504]
|
|
23
|
-
};
|
|
24
|
-
const DOWNLOAD_RETRY = {
|
|
25
|
-
maxRetries: 5,
|
|
26
|
-
initialDelay: 2000,
|
|
27
|
-
statusCodes: [202, 500, 502, 503, 504]
|
|
28
|
-
};
|
|
29
|
-
const MONITOR_RETRY = {
|
|
30
|
-
maxRetries: 4,
|
|
31
|
-
initialDelay: 1000,
|
|
32
|
-
statusCodes: [408, 425, 429, 500, 502, 503, 504]
|
|
33
|
-
};
|
|
34
|
-
const MONITOR_RETRIABLE = new Set(MONITOR_RETRY.statusCodes ?? []);
|
|
35
|
-
const MAX_WAIT_MS = 600_000; // 10 minutes
|
|
36
|
-
const POLL_INTERVAL_MS = 5_000;
|
|
37
|
-
// ============================================================================
|
|
38
|
-
// API Key
|
|
39
|
-
// ============================================================================
|
|
40
|
-
/**
 * Read the Brightdata API key from the `BRIGHTDATA_API_KEY` environment
 * variable (via `dntShim.Deno.env`).
 *
 * @returns {string} The configured key.
 * @throws {Error} When the variable is unset or empty.
 */
function getApiKey() {
    const key = dntShim.Deno.env.get('BRIGHTDATA_API_KEY');
    if (key) {
        return key;
    }
    throw new Error('BRIGHTDATA_API_KEY environment variable is required');
}
|
|
47
|
-
// ============================================================================
|
|
48
|
-
// Provider Functions
|
|
49
|
-
// ============================================================================
|
|
50
|
-
/**
 * Start a Brightdata dataset job for a single ChatGPT prompt.
 *
 * POSTs to `/datasets/v3/trigger` (through `withRetries` with
 * `TRIGGER_RETRY`) and returns the `snapshot_id` from the JSON response.
 * HTTP errors and thrown errors are logged and reported as `null`; a missing
 * API key still throws, because the key is read before the guarded section.
 *
 * @param {string} prompt - Prompt text to submit.
 * @param {boolean} useSearch - Sent as the job's `web_search` flag.
 * @param {string|null} countryISOCode - Sent as `country`; falsy values become `''`.
 * @returns {Promise<string|null>} The snapshot id, or `null` on failure or
 *   when the response carries no `snapshot_id`.
 */
async function triggerJob(prompt, useSearch, countryISOCode) {
    const apiKey = getApiKey();
    const endpoint = `${API_BASE}/datasets/v3/trigger?dataset_id=${DATASET_ID}&include_errors=true`;
    const jobInput = {
        url: 'http://chatgpt.com/',
        prompt,
        web_search: useSearch,
        country: countryISOCode || '',
        index: 0,
    };
    const payload = { custom_output_fields: OUTPUT_FIELDS, input: [jobInput] };
    // Built as a thunk so every retry attempt issues a fresh request and
    // re-reads the current abort signal.
    const postTrigger = () => fetch(endpoint, {
        method: 'POST',
        headers: {
            'Authorization': `Bearer ${apiKey}`,
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(payload),
        signal: getAbortSignal(),
    });
    try {
        const response = await withRetries(postTrigger, TRIGGER_RETRY);
        if (response.ok) {
            const data = await response.json();
            return data?.snapshot_id || null;
        }
        console.error(`[Brightdata] Trigger error: ${response.status}`);
        return null;
    }
    catch (error) {
        console.error('[Brightdata] Trigger failed:', error);
        return null;
    }
}
|
|
85
|
-
/**
 * Poll a Brightdata snapshot's progress endpoint until it settles.
 *
 * Each round GETs `/datasets/v3/progress/{snapshotId}` (through `withRetries`
 * with `MONITOR_RETRY`), then sleeps `POLL_INTERVAL_MS` before the next round.
 * Polling stops after `MAX_WAIT_MS` has elapsed, which is logged as a timeout.
 * Errors thrown while polling are logged and the loop carries on.
 *
 * @param {string} snapshotId - Snapshot id returned when the job was triggered.
 * @returns {Promise<boolean>} `true` once the reported status is `ready` or
 *   `complete`; `false` when the status is `failed`/`error`, the HTTP status
 *   is not in `MONITOR_RETRIABLE`, the abort signal is already aborted at the
 *   start of a round, or the wait times out.
 */
async function monitorJob(snapshotId) {
    const apiKey = getApiKey();
    const progressUrl = `${API_BASE}/datasets/v3/progress/${snapshotId}`;
    const startedAt = Date.now();
    const abortSignal = getAbortSignal();
    const pollOnce = () => fetch(progressUrl, {
        headers: { 'Authorization': `Bearer ${apiKey}` },
        signal: abortSignal,
    });
    while (Date.now() - startedAt < MAX_WAIT_MS) {
        if (abortSignal?.aborted) {
            return false;
        }
        try {
            const response = await withRetries(pollOnce, MONITOR_RETRY);
            if (response.ok) {
                const progress = await response.json();
                switch (progress.status) {
                    case 'ready':
                    case 'complete':
                        return true;
                    case 'failed':
                    case 'error':
                        return false;
                    default:
                        // Any other status: keep polling.
                        break;
                }
            }
            else if (!MONITOR_RETRIABLE.has(response.status)) {
                return false;
            }
        }
        catch (error) {
            console.error('[Brightdata] Monitor error:', error);
        }
        await sleep(POLL_INTERVAL_MS, abortSignal);
    }
    console.error(`[Brightdata] Monitor timeout after ${MAX_WAIT_MS / 1000}s`);
    return false;
}
|
|
118
|
-
/**
 * Downloads the JSON rows of a finished snapshot.
 *
 * All failure paths log their reason and return `null` instead of throwing,
 * including a successful HTTP response whose payload is not an array.
 * `getApiKey()` is called outside the try block, so an error it throws
 * propagates to the caller.
 *
 * @param {string} snapshotId - Snapshot id to download.
 * @returns {Promise<Array|null>} The parsed array payload, or `null` on failure.
 */
async function downloadJob(snapshotId) {
    const apiKey = getApiKey();
    const url = `${API_BASE}/datasets/v3/snapshot/${snapshotId}?format=json`;
    try {
        const response = await withRetries(() => fetch(url, {
            headers: { 'Authorization': `Bearer ${apiKey}` },
            signal: getAbortSignal()
        }), DOWNLOAD_RETRY);
        if (!response.ok) {
            // Read the error body (best-effort) so the provider's message is
            // logged and the response stream is consumed.
            const detail = (await response.text().catch(() => '')).slice(0, 500);
            console.error(`[Brightdata] Download error: ${response.status}${detail ? ` - ${detail}` : ''}`);
            return null;
        }
        const data = await response.json();
        if (!Array.isArray(data)) {
            // A 2xx response can still carry a non-array payload
            // (NOTE(review): presumably a status object while the snapshot is
            // not ready yet - confirm against the provider docs). Log it so
            // the null result is not silent.
            console.error(`[Brightdata] Download returned a non-array payload (HTTP ${response.status})`);
            return null;
        }
        return data;
    }
    catch (error) {
        console.error('[Brightdata] Download failed:', error);
        return null;
    }
}
|
|
138
|
-
/**
 * Converts the raw snapshot payload into the normalized result object.
 *
 * Only the first row of the payload is used. The answer prefers the markdown
 * field and is passed through `cleanAnswer`. Link positions are grouped by URL
 * and handed to `buildSources` together with the citations.
 *
 * @param {Array<object>|null|undefined} raw - Payload returned by the download step.
 * @returns {object|null} `{ prompt, answer, sources, searchQueries, searchSources }`,
 *   or `null` when the payload is empty or its first row is missing.
 */
function transformResponse(raw) {
    const responses = raw;
    if (!responses || responses.length === 0)
        return null;
    const response = responses[0];
    // A null/undefined first row (or a payload without index 0) has nothing to transform.
    if (response == null)
        return null;
    const answer = cleanAnswer(response.answer_text_markdown || response.answer_text || '');
    // Group positions per URL in a Map: URLs come from scraped data, and on a
    // plain object a key such as "__proto__" or "constructor" would resolve to
    // an inherited member and break the `.push`.
    const positionsByUrl = new Map();
    for (const link of response.links_attached ?? []) {
        if (link.url && link.position != null) {
            // String() mirrors the key coercion a plain object would apply.
            const key = String(link.url);
            const positions = positionsByUrl.get(key);
            if (positions) {
                positions.push(link.position);
            }
            else {
                positionsByUrl.set(key, [link.position]);
            }
        }
    }
    // Object.fromEntries defines own data properties, so every URL (even
    // "__proto__") becomes a regular key on the plain object passed downstream.
    const linkPositions = Object.fromEntries(positionsByUrl);
    return {
        prompt: response.prompt,
        answer,
        sources: buildSources(response.citations ?? [], linkPositions),
        searchQueries: response.web_search_query || [],
        searchSources: buildSearchSources(response.search_sources ?? [])
    };
}
|
|
161
|
-
// ============================================================================
|
|
162
|
-
// Export
|
|
163
|
-
// ============================================================================
|
|
164
|
-
/**
 * Provider descriptor for the Bright Data backend.
 *
 * Bundles the provider's display name, its concurrency and batching limits,
 * and the four job-lifecycle functions defined in this module.
 */
export const brightdataProvider = {
    name: 'Brightdata',
    // Limits read by the generic scraper.
    maxConcurrency: 50,
    maxPromptsPerRequest: 1,
    // Job lifecycle: trigger -> monitor -> download -> transform.
    triggerJob: triggerJob,
    monitorJob: monitorJob,
    downloadJob: downloadJob,
    transformResponse: transformResponse
};
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
import type { ModelResult } from '../../schemas/models.schema.js';
|
|
2
|
-
import { type BatchOptions } from './scraper.js';
|
|
3
|
-
export type { BatchOptions };
|
|
4
|
-
export type JobId = string | null;
|
|
5
|
-
export declare function getMaxConcurrency(): number;
|
|
6
|
-
export declare function getMaxPromptsPerRequest(): number;
|
|
7
|
-
export declare function scrapeGPTBatch(options: BatchOptions): Promise<Array<ModelResult>>;
|
|
8
|
-
export declare function triggerGPTBatch(options: BatchOptions): Promise<Array<string | null>>;
|
|
9
|
-
export declare function downloadGPTSnapshots(jobIds: Array<string | null>): Promise<Array<ModelResult>>;
|
|
10
|
-
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/chatgptScraper/index.ts"],"names":[],"mappings":"AAUA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,KAAK,YAAY,EAAkC,MAAM,cAAc,CAAC;AAKjF,YAAY,EAAE,YAAY,EAAE,CAAC;AAC7B,MAAM,MAAM,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC;AAqBlC,wBAAgB,iBAAiB,IAAI,MAAM,CAE1C;AAED,wBAAgB,uBAAuB,IAAI,MAAM,CAEhD;AAED,wBAAsB,cAAc,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEvF;AAED,wBAAsB,eAAe,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAE1F;AAED,wBAAsB,oBAAoB,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEpG"}
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
/* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
|
|
2
|
-
/**
|
|
3
|
-
* GPT Scraper - Public API
|
|
4
|
-
*
|
|
5
|
-
* Selects between Brightdata and Oxylabs based on CHATGPT_SCRAPER_PROVIDER env var.
|
|
6
|
-
* Default: oxylabs
|
|
7
|
-
*/
|
|
8
|
-
import * as dntShim from "../../../_dnt.shims.js";
|
|
9
|
-
import { createScraper } from './scraper.js';
|
|
10
|
-
import { brightdataProvider } from './brightdata.js';
|
|
11
|
-
import { oxylabsProvider } from './oxy.js';
|
|
12
|
-
// ============================================================================
|
|
13
|
-
// Scraper Instance (lazy singleton)
|
|
14
|
-
// ============================================================================
|
|
15
|
-
// Cached scraper instance; created on first use by getScraper().
let scraper = null;
/**
 * Returns the shared scraper, creating it on the first call.
 *
 * The provider is chosen from the CHATGPT_SCRAPER_PROVIDER environment
 * variable (case-insensitive): 'brightdata' selects Bright Data, and any other
 * value, or an unset variable, selects Oxylabs.
 */
function getScraper() {
    if (scraper) {
        return scraper;
    }
    const requestedProvider = dntShim.Deno.env.get('CHATGPT_SCRAPER_PROVIDER')?.toLowerCase();
    const wantsBrightdata = requestedProvider === 'brightdata';
    scraper = createScraper(wantsBrightdata ? brightdataProvider : oxylabsProvider);
    return scraper;
}
|
|
24
|
-
// ============================================================================
|
|
25
|
-
// Public API
|
|
26
|
-
// ============================================================================
|
|
27
|
-
/**
 * Reports the `maxConcurrency` value of the active scraper.
 *
 * @returns {number} The active scraper's `maxConcurrency`.
 */
export function getMaxConcurrency() {
    const { maxConcurrency } = getScraper();
    return maxConcurrency;
}
|
|
30
|
-
/**
 * Reports the `maxPromptsPerRequest` value of the active scraper.
 *
 * @returns {number} The active scraper's `maxPromptsPerRequest`.
 */
export function getMaxPromptsPerRequest() {
    const { maxPromptsPerRequest } = getScraper();
    return maxPromptsPerRequest;
}
|
|
33
|
-
/**
 * Delegates a batch scrape to the active scraper.
 *
 * @param {object} options - Batch options, forwarded unchanged.
 * @returns {Promise<Array>} Whatever the active scraper's `scrapeGPTBatch` resolves to.
 */
export async function scrapeGPTBatch(options) {
    const activeScraper = getScraper();
    return activeScraper.scrapeGPTBatch(options);
}
|
|
36
|
-
/**
 * Delegates triggering a batch to the active scraper.
 *
 * @param {object} options - Batch options, forwarded unchanged.
 * @returns {Promise<Array<string|null>>} Whatever the active scraper's `triggerGPTBatch` resolves to.
 */
export async function triggerGPTBatch(options) {
    const activeScraper = getScraper();
    return activeScraper.triggerGPTBatch(options);
}
|
|
39
|
-
/**
 * Delegates downloading previously triggered jobs to the active scraper.
 *
 * @param {Array<string|null>} jobIds - Job ids, forwarded unchanged.
 * @returns {Promise<Array>} Whatever the active scraper's `downloadGPTSnapshots` resolves to.
 */
export async function downloadGPTSnapshots(jobIds) {
    const activeScraper = getScraper();
    return activeScraper.downloadGPTSnapshots(jobIds);
}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"oxy.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/chatgptScraper/oxy.ts"],"names":[],"mappings":"AAeA,OAAO,EACN,KAAK,iBAAiB,EAItB,MAAM,cAAc,CAAC;AA6LtB,eAAO,MAAM,eAAe,EAAE,iBAQ7B,CAAC"}
|