@graphext/cuery 0.9.4 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/esm/browser.d.ts +1 -1
  2. package/esm/browser.js +1 -1
  3. package/esm/mod.d.ts +3 -3
  4. package/esm/mod.d.ts.map +1 -1
  5. package/esm/mod.js +3 -3
  6. package/esm/src/apis/brightdata/contentScraper/index.d.ts.map +1 -0
  7. package/{script/src/apis/brightdata → esm/src/apis/brightdata/contentScraper}/scrape.d.ts +1 -1
  8. package/esm/src/apis/brightdata/contentScraper/scrape.d.ts.map +1 -0
  9. package/esm/src/apis/brightdata/{scrape.js → contentScraper/scrape.js} +2 -2
  10. package/esm/src/apis/brightdata/llmScraper/brightdata.d.ts +20 -0
  11. package/esm/src/apis/brightdata/llmScraper/brightdata.d.ts.map +1 -0
  12. package/esm/src/apis/brightdata/llmScraper/brightdata.js +182 -0
  13. package/esm/src/apis/brightdata/llmScraper/index.d.ts +14 -0
  14. package/esm/src/apis/brightdata/llmScraper/index.d.ts.map +1 -0
  15. package/esm/src/apis/brightdata/llmScraper/index.js +97 -0
  16. package/esm/src/apis/brightdata/llmScraper/oxy.d.ts +16 -0
  17. package/esm/src/apis/brightdata/llmScraper/oxy.d.ts.map +1 -0
  18. package/esm/src/apis/brightdata/llmScraper/oxy.js +171 -0
  19. package/{script/src/apis/chatgptScraper/scraper.d.ts → esm/src/apis/brightdata/llmScraper/scrape.d.ts} +12 -15
  20. package/esm/src/apis/brightdata/llmScraper/scrape.d.ts.map +1 -0
  21. package/esm/src/apis/brightdata/llmScraper/scrape.js +184 -0
  22. package/esm/src/schemas/search.schema.d.ts +2 -2
  23. package/esm/src/schemas/search.schema.d.ts.map +1 -1
  24. package/esm/src/schemas/sources.schema.d.ts +1 -4
  25. package/esm/src/schemas/sources.schema.d.ts.map +1 -1
  26. package/esm/src/tools/sentiment.d.ts.map +1 -1
  27. package/esm/src/tools/sentiment.js +3 -2
  28. package/package.json +1 -1
  29. package/script/browser.d.ts +1 -1
  30. package/script/browser.js +1 -1
  31. package/script/mod.d.ts +3 -3
  32. package/script/mod.d.ts.map +1 -1
  33. package/script/mod.js +6 -6
  34. package/script/src/apis/brightdata/contentScraper/index.d.ts.map +1 -0
  35. package/{esm/src/apis/brightdata → script/src/apis/brightdata/contentScraper}/scrape.d.ts +1 -1
  36. package/script/src/apis/brightdata/contentScraper/scrape.d.ts.map +1 -0
  37. package/script/src/apis/brightdata/{scrape.js → contentScraper/scrape.js} +2 -2
  38. package/script/src/apis/brightdata/llmScraper/brightdata.d.ts +20 -0
  39. package/script/src/apis/brightdata/llmScraper/brightdata.d.ts.map +1 -0
  40. package/script/src/apis/brightdata/llmScraper/brightdata.js +219 -0
  41. package/script/src/apis/brightdata/llmScraper/index.d.ts +14 -0
  42. package/script/src/apis/brightdata/llmScraper/index.d.ts.map +1 -0
  43. package/script/src/apis/brightdata/llmScraper/index.js +140 -0
  44. package/script/src/apis/brightdata/llmScraper/oxy.d.ts +16 -0
  45. package/script/src/apis/brightdata/llmScraper/oxy.d.ts.map +1 -0
  46. package/script/src/apis/brightdata/llmScraper/oxy.js +208 -0
  47. package/{esm/src/apis/chatgptScraper/scraper.d.ts → script/src/apis/brightdata/llmScraper/scrape.d.ts} +12 -15
  48. package/script/src/apis/brightdata/llmScraper/scrape.d.ts.map +1 -0
  49. package/script/src/apis/brightdata/llmScraper/scrape.js +224 -0
  50. package/script/src/schemas/search.schema.d.ts +2 -2
  51. package/script/src/schemas/search.schema.d.ts.map +1 -1
  52. package/script/src/schemas/sources.schema.d.ts +1 -4
  53. package/script/src/schemas/sources.schema.d.ts.map +1 -1
  54. package/script/src/tools/sentiment.d.ts.map +1 -1
  55. package/script/src/tools/sentiment.js +3 -2
  56. package/esm/src/apis/brightdata/index.d.ts.map +0 -1
  57. package/esm/src/apis/brightdata/scrape.d.ts.map +0 -1
  58. package/esm/src/apis/chatgptScraper/brightdata.d.ts +0 -3
  59. package/esm/src/apis/chatgptScraper/brightdata.d.ts.map +0 -1
  60. package/esm/src/apis/chatgptScraper/brightdata.js +0 -172
  61. package/esm/src/apis/chatgptScraper/index.d.ts +0 -10
  62. package/esm/src/apis/chatgptScraper/index.d.ts.map +0 -1
  63. package/esm/src/apis/chatgptScraper/index.js +0 -41
  64. package/esm/src/apis/chatgptScraper/oxy.d.ts +0 -3
  65. package/esm/src/apis/chatgptScraper/oxy.d.ts.map +0 -1
  66. package/esm/src/apis/chatgptScraper/oxy.js +0 -156
  67. package/esm/src/apis/chatgptScraper/scraper.d.ts.map +0 -1
  68. package/esm/src/apis/chatgptScraper/scraper.js +0 -98
  69. package/script/src/apis/brightdata/index.d.ts.map +0 -1
  70. package/script/src/apis/brightdata/scrape.d.ts.map +0 -1
  71. package/script/src/apis/chatgptScraper/brightdata.d.ts +0 -3
  72. package/script/src/apis/chatgptScraper/brightdata.d.ts.map +0 -1
  73. package/script/src/apis/chatgptScraper/brightdata.js +0 -208
  74. package/script/src/apis/chatgptScraper/index.d.ts +0 -10
  75. package/script/src/apis/chatgptScraper/index.d.ts.map +0 -1
  76. package/script/src/apis/chatgptScraper/index.js +0 -81
  77. package/script/src/apis/chatgptScraper/oxy.d.ts +0 -3
  78. package/script/src/apis/chatgptScraper/oxy.d.ts.map +0 -1
  79. package/script/src/apis/chatgptScraper/oxy.js +0 -192
  80. package/script/src/apis/chatgptScraper/scraper.d.ts.map +0 -1
  81. package/script/src/apis/chatgptScraper/scraper.js +0 -139
  82. /package/esm/src/apis/brightdata/{index.d.ts → contentScraper/index.d.ts} +0 -0
  83. /package/esm/src/apis/brightdata/{index.js → contentScraper/index.js} +0 -0
  84. /package/script/src/apis/brightdata/{index.d.ts → contentScraper/index.d.ts} +0 -0
  85. /package/script/src/apis/brightdata/{index.js → contentScraper/index.js} +0 -0
@@ -0,0 +1,140 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.getMaxConcurrency = getMaxConcurrency;
37
+ exports.getMaxPromptsPerRequest = getMaxPromptsPerRequest;
38
+ exports.scrapeGPTBatch = scrapeGPTBatch;
39
+ exports.triggerGPTBatch = triggerGPTBatch;
40
+ exports.downloadGPTSnapshots = downloadGPTSnapshots;
41
+ exports.scrapeAIMBatch = scrapeAIMBatch;
42
+ exports.triggerAIMBatch = triggerAIMBatch;
43
+ exports.downloadAIMSnapshots = downloadAIMSnapshots;
44
+ /* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
45
+ /**
46
+ * LLM Scraper - Public API
47
+ *
48
+ * Selects between Brightdata and Oxylabs based on LLM_SCRAPER_PROVIDER env var.
49
+ * Falls back to CHATGPT_SCRAPER_PROVIDER for backward compatibility.
50
+ * Exposes ChatGPT and AIM variants while preserving the legacy GPT method names.
51
+ * Default: oxylabs
52
+ */
53
+ const dntShim = __importStar(require("../../../../_dnt.shims.js"));
54
+ const scrape_js_1 = require("./scrape.js");
55
+ const brightdata_js_1 = require("./brightdata.js");
56
+ const oxy_js_1 = require("./oxy.js");
57
+ // ============================================================================
58
+ // Scraper Instance (lazy singleton)
59
+ // ============================================================================
60
+ const scrapers = new Map();
61
+ const CHATGPT_BRIGHTDATA_OPTIONS = {
62
+ datasetId: 'gd_m7aof0k82r803d5bjm',
63
+ extraFields: ['web_search_triggered', 'web_search_query'],
64
+ extraInputs: ({ useSearch }) => ({
65
+ web_search: useSearch,
66
+ }),
67
+ targetUrl: 'http://chatgpt.com/',
68
+ };
69
+ const CHATGPT_OXYLABS_OPTIONS = {
70
+ source: 'chatgpt',
71
+ };
72
+ const AIM_BRIGHTDATA_OPTIONS = {
73
+ datasetId: 'gd_mcswdt6z2elth3zqr2',
74
+ targetUrl: 'https://google.com/aimode',
75
+ };
76
+ const AIM_OXYLABS_OPTIONS = {
77
+ source: 'google_ai_mode',
78
+ inputKey: 'query',
79
+ render: 'html',
80
+ search: undefined,
81
+ };
/**
 * Resolves the configured scraper provider name.
 *
 * Reads LLM_SCRAPER_PROVIDER first and falls back to the legacy
 * CHATGPT_SCRAPER_PROVIDER variable. Values are trimmed and lower-cased.
 *
 * @returns {string | undefined} The normalized provider name, or `undefined`
 *   when neither variable holds a non-blank value.
 */
function getProviderName() {
    const readProvider = (variable) => dntShim.Deno.env.get(variable)?.trim().toLowerCase();
    // `||` rather than `??`: a variable that is set but empty/blank must not
    // mask the legacy variable.
    return readProvider('LLM_SCRAPER_PROVIDER') ||
        readProvider('CHATGPT_SCRAPER_PROVIDER') ||
        undefined;
}
/**
 * Picks the per-provider option objects for a scrape target.
 *
 * @param {string} target 'aim' selects the AIM option sets; any other value
 *   selects the ChatGPT option sets.
 * @returns {{ brightdata: object, oxylabs: object }} Options keyed by provider.
 */
function getTargetOptions(target) {
    if (target === 'aim') {
        return {
            brightdata: AIM_BRIGHTDATA_OPTIONS,
            oxylabs: AIM_OXYLABS_OPTIONS,
        };
    }
    return {
        brightdata: CHATGPT_BRIGHTDATA_OPTIONS,
        oxylabs: CHATGPT_OXYLABS_OPTIONS,
    };
}
/**
 * Returns the scraper for `target`, creating and caching it on first use.
 *
 * The provider is Brightdata only when the resolved provider name is exactly
 * 'brightdata'; every other value (including none) selects Oxylabs.
 *
 * @param {string} [target='chatgpt'] Cache key and option selector.
 * @returns {object} The scraper produced by createLLMScraper.
 */
function getLLMScraper(target = 'chatgpt') {
    const cached = scrapers.get(target);
    if (cached) {
        return cached;
    }
    const useBrightdata = getProviderName() === 'brightdata';
    const { brightdata, oxylabs } = getTargetOptions(target);
    let provider;
    if (useBrightdata) {
        provider = (0, brightdata_js_1.createBrightdataProvider)(brightdata);
    }
    else {
        provider = (0, oxy_js_1.createOxylabsProvider)(oxylabs);
    }
    const created = (0, scrape_js_1.createLLMScraper)(provider);
    scrapers.set(target, created);
    return created;
}
111
+ // ============================================================================
112
+ // Public API
113
+ // ============================================================================
114
+ // Shared scraper metadata
/**
 * Reads `maxConcurrency` from the scraper for `target`.
 * @param {string} [target='chatgpt']
 */
function getMaxConcurrency(target = 'chatgpt') {
    const { maxConcurrency } = getLLMScraper(target);
    return maxConcurrency;
}
/**
 * Reads `maxPromptsPerRequest` from the scraper for `target`.
 * @param {string} [target='chatgpt']
 */
function getMaxPromptsPerRequest(target = 'chatgpt') {
    const { maxPromptsPerRequest } = getLLMScraper(target);
    return maxPromptsPerRequest;
}
// ChatGPT scraper methods (legacy GPT names, delegating to the 'chatgpt' scraper)
/** Triggers the prompts in `options` and returns the scraper's results. */
async function scrapeGPTBatch(options) {
    const scraper = getLLMScraper('chatgpt');
    return scraper.scrapeLLMBatch(options);
}
/** Triggers the prompts in `options` and returns the scraper's job IDs. */
async function triggerGPTBatch(options) {
    const scraper = getLLMScraper('chatgpt');
    return scraper.triggerLLMBatch(options);
}
/** Returns the scraper's results for previously triggered `jobIds`. */
async function downloadGPTSnapshots(jobIds) {
    const scraper = getLLMScraper('chatgpt');
    return scraper.downloadLLMSnapshots(jobIds);
}
// AIM scraper methods (delegating to the 'aim' scraper)
/** Triggers the prompts in `options` and returns the scraper's results. */
async function scrapeAIMBatch(options) {
    const scraper = getLLMScraper('aim');
    return scraper.scrapeLLMBatch(options);
}
/** Triggers the prompts in `options` and returns the scraper's job IDs. */
async function triggerAIMBatch(options) {
    const scraper = getLLMScraper('aim');
    return scraper.triggerLLMBatch(options);
}
/** Returns the scraper's results for previously triggered `jobIds`. */
async function downloadAIMSnapshots(jobIds) {
    const scraper = getLLMScraper('aim');
    return scraper.downloadLLMSnapshots(jobIds);
}
@@ -0,0 +1,16 @@
1
+ import { type ProviderFunctions } from './scrape.js';
2
+ interface OxylabsProviderConfig {
3
+ apiBase: string;
4
+ source: string;
5
+ inputKey: 'prompt' | 'query';
6
+ parse: boolean;
7
+ search?: boolean;
8
+ render?: 'html';
9
+ providerName: string;
10
+ maxConcurrency: number;
11
+ maxPromptsPerRequest: number;
12
+ }
13
+ export declare function createOxylabsProvider(overrides?: Partial<OxylabsProviderConfig>): ProviderFunctions;
14
+ export declare const oxylabsProvider: ProviderFunctions;
15
+ export {};
16
+ //# sourceMappingURL=oxy.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"oxy.d.ts","sourceRoot":"","sources":["../../../../../src/src/apis/brightdata/llmScraper/oxy.ts"],"names":[],"mappings":"AAeA,OAAO,EAA6C,KAAK,iBAAiB,EAAE,MAAM,aAAa,CAAC;AA2BhG,UAAU,qBAAqB;IAC9B,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAAC;IAC7B,KAAK,EAAE,OAAO,CAAC;IACf,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,CAAC;IACvB,oBAAoB,EAAE,MAAM,CAAC;CAC7B;AAyCD,wBAAgB,qBAAqB,CAAC,SAAS,GAAE,OAAO,CAAC,qBAAqB,CAAM,GAAG,iBAAiB,CAyJvG;AAMD,eAAO,MAAM,eAAe,EAAE,iBAA2C,CAAC"}
@@ -0,0 +1,208 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.oxylabsProvider = void 0;
37
+ exports.createOxylabsProvider = createOxylabsProvider;
38
+ /* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
39
+ /**
40
+ * Oxylabs LLM Scraper Provider.
41
+ *
42
+ * API Flow (Async Push-Pull):
43
+ * 1. Trigger: POST to /v1/queries → returns job id
44
+ * 2. Monitor: GET /v1/queries/{id} until status is 'done'
45
+ * 3. Download: GET /v1/queries/{id}/results
46
+ */
47
+ const dntShim = __importStar(require("../../../../_dnt.shims.js"));
48
+ const async_js_1 = require("../../../helpers/async.js");
49
+ const scrape_js_1 = require("./scrape.js");
50
+ const DEFAULT_OXYLABS_PROVIDER_CONFIG = {
51
+ apiBase: 'https://data.oxylabs.io/v1',
52
+ source: 'chatgpt',
53
+ inputKey: 'prompt',
54
+ parse: true,
55
+ search: true,
56
+ providerName: 'Oxylabs',
57
+ maxConcurrency: 10,
58
+ maxPromptsPerRequest: 1,
59
+ };
60
+ const RETRY_CONFIG = {
61
+ maxRetries: 3,
62
+ initialDelay: 1000,
63
+ statusCodes: [429, 500, 502, 503, 504, 524, 612, 613],
64
+ };
65
+ const MAX_WAIT_MS = 600_000; // 10 minutes
66
+ const POLL_INTERVAL_MS = 5_000;
67
+ // ============================================================================
68
+ // Auth
69
+ // ============================================================================
/**
 * Builds the HTTP Basic `Authorization` header value from the
 * OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables.
 *
 * @returns {string} `Basic <base64(username:password)>`.
 * @throws {Error} When either environment variable is missing or empty.
 */
function getAuthHeader() {
    const username = dntShim.Deno.env.get('OXYLABS_USERNAME');
    const password = dntShim.Deno.env.get('OXYLABS_PASSWORD');
    if (!username || !password) {
        throw new Error('OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are required');
    }
    const credentials = `${username}:${password}`;
    let encoded;
    try {
        // Unchanged path: credentials that encoded before produce the same header.
        encoded = btoa(credentials);
    }
    catch (error) {
        // btoa rejects code points above U+00FF; only then encode the UTF-8 bytes.
        if (error?.name !== 'InvalidCharacterError') {
            throw error;
        }
        const utf8Bytes = new TextEncoder().encode(credentials);
        encoded = btoa(String.fromCharCode(...utf8Bytes));
    }
    return `Basic ${encoded}`;
}
78
+ // ============================================================================
79
+ // Provider Functions
80
+ // ============================================================================
/**
 * Creates an Oxylabs-backed provider for the LLM scraper.
 *
 * DEFAULT_OXYLABS_PROVIDER_CONFIG is shallow-merged with `overrides`, so an
 * override that is explicitly `undefined` (e.g. `search: undefined`) replaces
 * the default and the corresponding field is left out of the request body.
 *
 * @param {object} [overrides] Partial provider configuration.
 * @returns {object} `name`, `maxConcurrency`, `maxPromptsPerRequest` and the
 *   `triggerJob` / `monitorJob` / `downloadJob` / `transformResponse` functions.
 */
function createOxylabsProvider(overrides = {}) {
    const config = { ...DEFAULT_OXYLABS_PROVIDER_CONFIG, ...overrides };
    // Statuses for which polling again with the same request cannot succeed.
    const unrecoverableStatuses = [400, 401, 403];
    /**
     * Submits one prompt to `${apiBase}/queries`.
     * `_useSearch` is not consulted; `search` comes from the provider config.
     *
     * @returns {Promise<string | null>} The job id, or null on a non-OK
     *   response, a missing id, or a failed request.
     * @throws Whatever getAuthHeader throws (it runs outside the try block).
     */
    async function triggerJob(prompt, _useSearch, countryISOCode) {
        const authHeader = getAuthHeader();
        const url = `${config.apiBase}/queries`;
        const body = {
            source: config.source,
            parse: config.parse,
            [config.inputKey]: prompt,
        };
        if (config.search != null) {
            body.search = config.search;
        }
        if (config.render != null) {
            body.render = config.render;
        }
        if (countryISOCode) {
            body.geo_location = countryISOCode;
        }
        try {
            const response = await (0, async_js_1.withRetries)(() => fetch(url, {
                method: 'POST',
                headers: {
                    'Authorization': authHeader,
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify(body),
                signal: (0, scrape_js_1.getAbortSignal)(),
            }), RETRY_CONFIG);
            if (!response.ok) {
                console.error(`[${config.providerName}] Trigger error: ${response.status}`);
                return null;
            }
            const data = await response.json();
            return data?.id || null;
        }
        catch (error) {
            console.error(`[${config.providerName}] Trigger failed:`, error);
            return null;
        }
    }
    /**
     * Polls `${apiBase}/queries/{jobId}` every POLL_INTERVAL_MS until the job
     * reports 'done', reports 'faulted'/'failed', the abort signal fires, an
     * unrecoverable HTTP status is returned, or MAX_WAIT_MS elapses.
     *
     * @returns {Promise<boolean>} true only when the job reported 'done'.
     */
    async function monitorJob(jobId) {
        const authHeader = getAuthHeader();
        const url = `${config.apiBase}/queries/${jobId}`;
        const startTime = Date.now();
        const abortSignal = (0, scrape_js_1.getAbortSignal)();
        while (Date.now() - startTime < MAX_WAIT_MS) {
            if (abortSignal?.aborted) {
                return false;
            }
            try {
                const response = await fetch(url, {
                    headers: { 'Authorization': authHeader },
                    signal: abortSignal,
                });
                // 204 = job not completed yet, continue polling
                if (response.status === 204) {
                    await (0, async_js_1.sleep)(POLL_INTERVAL_MS, abortSignal);
                    continue;
                }
                if (response.ok) {
                    const status = await response.json();
                    if (status.status === 'done') {
                        return true;
                    }
                    if (status.status === 'faulted' || status.status === 'failed') {
                        return false;
                    }
                }
                else if (unrecoverableStatuses.includes(response.status)) {
                    // Bad request / rejected credentials: give up now instead of
                    // polling silently until MAX_WAIT_MS.
                    console.error(`[${config.providerName}] Monitor error: ${response.status}`);
                    return false;
                }
            }
            catch (error) {
                console.error(`[${config.providerName}] Monitor error:`, error);
            }
            await (0, async_js_1.sleep)(POLL_INTERVAL_MS, abortSignal);
        }
        console.error(`[${config.providerName}] Monitor timeout after ${MAX_WAIT_MS / 1000}s`);
        return false;
    }
    /**
     * Fetches `${apiBase}/queries/{jobId}/results`.
     *
     * @returns {Promise<unknown | null>} The parsed JSON body, or null on a
     *   non-OK response or a failed request/parse.
     */
    async function downloadJob(jobId) {
        const authHeader = getAuthHeader();
        const url = `${config.apiBase}/queries/${jobId}/results`;
        try {
            const response = await (0, async_js_1.withRetries)(() => fetch(url, {
                headers: { 'Authorization': authHeader },
                signal: (0, scrape_js_1.getAbortSignal)(),
            }), RETRY_CONFIG);
            if (!response.ok) {
                console.error(`[${config.providerName}] Download error: ${response.status}`);
                return null;
            }
            return await response.json();
        }
        catch (error) {
            console.error(`[${config.providerName}] Download failed:`, error);
            return null;
        }
    }
    /**
     * Maps a downloaded payload to a model result.
     *
     * @returns {object | null} null when `results[0].content` is missing.
     */
    function transformResponse(raw) {
        const response = raw;
        const content = response?.results?.[0]?.content;
        if (!content) {
            return null;
        }
        const answerText = (0, scrape_js_1.cleanAnswer)(content.response_text || '');
        const answerTextMarkdown = (0, scrape_js_1.cleanAnswer)(content.markdown_text || '');
        // A malformed `citations` value must not throw and sink the whole batch.
        const rawCitations = Array.isArray(content.citations) ? content.citations : [];
        // Map section='citations' to cited=true (like Brightdata's cited field)
        const citations = rawCitations.map((c) => ({
            ...c,
            cited: c?.section === 'citations',
        }));
        return {
            prompt: content.prompt || '',
            answer: answerText,
            answer_text_markdown: answerTextMarkdown,
            sources: (0, scrape_js_1.buildSources)(citations),
            searchQueries: [],
        };
    }
    return {
        name: config.providerName,
        maxConcurrency: config.maxConcurrency,
        maxPromptsPerRequest: config.maxPromptsPerRequest,
        triggerJob,
        monitorJob,
        downloadJob,
        transformResponse,
    };
}
205
+ // ============================================================================
206
+ // Export
207
+ // ============================================================================
208
+ exports.oxylabsProvider = createOxylabsProvider();
@@ -1,5 +1,5 @@
1
- import type { ModelResult } from '../../schemas/models.schema.js';
2
- import type { Source, SearchSource } from '../../schemas/sources.schema.js';
1
+ import type { ModelResult } from '../../../schemas/models.schema.js';
2
+ import type { Source } from '../../../schemas/sources.schema.js';
3
3
  export interface BatchOptions {
4
4
  prompts: Array<string>;
5
5
  useSearch?: boolean;
@@ -14,12 +14,12 @@ export interface ProviderFunctions {
14
14
  downloadJob: (jobId: string) => Promise<unknown>;
15
15
  transformResponse: (raw: unknown) => ModelResult | null;
16
16
  }
17
- export interface GPTScraper {
17
+ export interface LLMScraper {
18
18
  maxConcurrency: number;
19
19
  maxPromptsPerRequest: number;
20
- scrapeGPTBatch: (options: BatchOptions) => Promise<Array<ModelResult>>;
21
- triggerGPTBatch: (options: BatchOptions) => Promise<Array<string | null>>;
22
- downloadGPTSnapshots: (jobIds: Array<string | null>) => Promise<Array<ModelResult>>;
20
+ scrapeLLMBatch: (options: BatchOptions) => Promise<Array<ModelResult>>;
21
+ triggerLLMBatch: (options: BatchOptions) => Promise<Array<string | null>>;
22
+ downloadLLMSnapshots: (jobIds: Array<string | null>) => Promise<Array<ModelResult>>;
23
23
  }
24
24
  export declare function getAbortSignal(): AbortSignal | undefined;
25
25
  export declare function cleanAnswer(answer: string): string;
@@ -29,18 +29,15 @@ export declare function buildSources(citations: Array<{
29
29
  description?: string;
30
30
  text?: string;
31
31
  cited?: boolean;
32
- }>, linkPositions?: Record<string, Array<number>>): Array<Source>;
33
- export declare function buildSearchSources(sources: Array<{
32
+ }>, linksAttached?: Array<{
34
33
  url?: string;
35
- title?: string;
36
- snippet?: string;
37
- rank?: number;
38
- date_published?: string;
39
- }>): Array<SearchSource>;
34
+ text?: string;
35
+ position?: number;
36
+ }>): Array<Source>;
40
37
  /**
41
38
  * Creates an empty model result for failed jobs.
42
39
  * This ensures we always return the same number of rows as input.
43
40
  */
44
41
  export declare function emptyModelResult(providerName: string, errorMessage?: string, context?: unknown): ModelResult;
45
- export declare function createScraper(provider: ProviderFunctions): GPTScraper;
46
- //# sourceMappingURL=scraper.d.ts.map
42
+ export declare function createLLMScraper(provider: ProviderFunctions): LLMScraper;
43
+ //# sourceMappingURL=scrape.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scrape.d.ts","sourceRoot":"","sources":["../../../../../src/src/apis/brightdata/llmScraper/scrape.ts"],"names":[],"mappings":"AAWA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mCAAmC,CAAC;AACrE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oCAAoC,CAAC;AAOjE,MAAM,WAAW,YAAY;IAC5B,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACvB,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC/B;AAED,MAAM,WAAW,iBAAiB;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,cAAc,EAAE,MAAM,CAAC;IACvB,oBAAoB,EAAE,MAAM,CAAC;IAC7B,UAAU,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,cAAc,EAAE,MAAM,GAAG,IAAI,KAAK,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC;IAC1G,UAAU,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;IAChD,WAAW,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;IACjD,iBAAiB,EAAE,CAAC,GAAG,EAAE,OAAO,KAAK,WAAW,GAAG,IAAI,CAAC;CACxD;AAED,MAAM,WAAW,UAAU;IAC1B,cAAc,EAAE,MAAM,CAAC;IACvB,oBAAoB,EAAE,MAAM,CAAC;IAC7B,cAAc,EAAE,CAAC,OAAO,EAAE,YAAY,KAAK,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC;IACvE,eAAe,EAAE,CAAC,OAAO,EAAE,YAAY,KAAK,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC;IAC1E,oBAAoB,EAAE,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,KAAK,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC;CACpF;AAMD,wBAAgB,cAAc,IAAI,WAAW,GAAG,SAAS,CAExD;AAED,wBAAgB,WAAW,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAMlD;AA+BD,wBAAgB,YAAY,CAC3B,SAAS,EAAE,KAAK,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAC;IAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,OAAO,CAAA;CAAE,CAAC,EACvG,aAAa,GAAE,KAAK,CAAC;IAAE,GAAG,CAAC,EAAE,MAAM,CAAC;IAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IAAC,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAM,GAC3E,KAAK,CAAC,MAAM,CAAC,CAoFf;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,YAAY,EAAE,MAAM,EAAE,YAAY,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,GAAG,WAAW,CAU5G;AAMD,wBAAgB,gBAAgB,CAAC,QAAQ,EAAE,iBAAiB,GAAG,UAAU,CAkExE"}
@@ -0,0 +1,224 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.getAbortSignal = getAbortSignal;
37
+ exports.cleanAnswer = cleanAnswer;
38
+ exports.buildSources = buildSources;
39
+ exports.emptyModelResult = emptyModelResult;
40
+ exports.createLLMScraper = createLLMScraper;
41
+ /* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
42
+ /**
43
+ * LLM Scraper - Core types and orchestration logic.
44
+ *
45
+ * Uses composition: providers supply functions, this module orchestrates them.
46
+ */
47
+ const dntShim = __importStar(require("../../../../_dnt.shims.js"));
48
+ const async_js_1 = require("../../../helpers/async.js");
49
+ const urls_js_1 = require("../../../helpers/urls.js");
50
+ // ============================================================================
51
+ // Shared Utilities
52
+ // ============================================================================
/**
 * Reads the `abortSignal` property from the shimmed global object.
 * @returns {AbortSignal | undefined}
 */
function getAbortSignal() {
    const { abortSignal } = dntShim.dntGlobalThis;
    return abortSignal;
}
/**
 * Normalizes answer text: removes markdown image syntax, removes "Image"
 * placeholders that sit between newlines, collapses runs of three or more
 * newlines to two, and trims the result.
 *
 * @param {string} answer
 * @returns {string}
 */
function cleanAnswer(answer) {
    const withoutImages = answer.replace(/!\[([^\]]*)\]\([^)]+\)/g, '');
    const withoutImageLabels = withoutImages.replace(/\n\s*Image\s*\n/g, '\n');
    const collapsed = withoutImageLabels.replace(/\n{3,}/g, '\n\n');
    return collapsed.trim();
}
/**
 * Computes the key under which a URL is merged: its origin followed by its
 * pathname (query string and fragment are dropped).
 * A string that cannot be parsed as a URL is returned unchanged.
 */
function urlMergeKey(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return url;
    }
    return `${parsed.origin}${parsed.pathname}`;
}
/**
 * Tells whether `candidate` has a fragment or a query string where
 * `current` has none. Returns false if either URL cannot be parsed.
 */
function hasExtraUrlInfo(current, candidate) {
    let currentUrl;
    let candidateUrl;
    try {
        currentUrl = new URL(current);
        candidateUrl = new URL(candidate);
    }
    catch {
        return false;
    }
    const gainsHash = currentUrl.hash === '' && candidateUrl.hash !== '';
    const gainsQuery = currentUrl.search === '' && candidateUrl.search !== '';
    return gainsHash || gainsQuery;
}
/**
 * Merges inline links and citations into one de-duplicated source list.
 *
 * Entries sharing a merge key (see urlMergeKey) collapse into one source.
 * Links are processed first, ordered by `position` (missing positions last)
 * and always count as cited; citations then add new sources or enrich the
 * existing ones. Entries without a `url` are ignored.
 *
 * @param {Array<object>} citations Items with `url` and optional
 *   `title` / `description` / `text` / `cited`.
 * @param {Array<object>} [linksAttached=[]] Items with optional
 *   `url` / `text` / `position`.
 * @returns {Array<object>} Sources in first-seen order.
 */
function buildSources(citations, linksAttached = []) {
    const ordered = [];
    const byKey = new Map();
    // Stores a brand-new source in both the output list and the key index.
    const register = (key, source) => {
        ordered.push(source);
        byKey.set(key, source);
        return source;
    };
    const positionOf = (link) => link.position ?? Number.MAX_SAFE_INTEGER;
    const linksByPosition = linksAttached
        .slice()
        .sort((left, right) => positionOf(left) - positionOf(right));
    for (const link of linksByPosition) {
        if (!link.url) {
            continue;
        }
        const label = link.text ?? '';
        const key = urlMergeKey(link.url);
        let source = byKey.get(key);
        if (source) {
            if (!source.title && label) {
                source.title = label;
            }
            source.cited = source.cited || true;
            // Prefer the URL variant that carries a fragment/query string.
            if (hasExtraUrlInfo(source.url, link.url)) {
                source.url = link.url;
            }
        }
        else {
            source = register(key, {
                title: label,
                url: link.url,
                domain: (0, urls_js_1.extractDomain)(link.url),
                cited: true,
            });
        }
        if (link.position != null) {
            source.positions ??= [];
            if (!source.positions.includes(link.position)) {
                source.positions.push(link.position);
            }
        }
    }
    for (const citation of citations) {
        if (!citation.url) {
            continue;
        }
        const key = urlMergeKey(citation.url);
        const known = byKey.get(key);
        const title = citation.title || citation.description || citation.text || '';
        if (!known) {
            register(key, {
                title,
                url: citation.url,
                domain: (0, urls_js_1.extractDomain)(citation.url),
                cited: citation.cited,
            });
            continue;
        }
        // A non-empty citation title replaces whatever the link text provided.
        if (title) {
            known.title = title;
        }
        known.cited = known.cited || citation.cited;
        if (hasExtraUrlInfo(known.url, citation.url)) {
            known.url = citation.url;
        }
    }
    ordered.forEach((source) => {
        source.positions?.sort((a, b) => a - b);
    });
    return ordered;
}
/**
 * Builds the placeholder result used for a failed job, so a batch can keep
 * one output row per input. When `errorMessage` is given it is logged via
 * console.error together with `context` (or '' when context is nullish).
 */
function emptyModelResult(providerName, errorMessage, context) {
    const blank = {
        prompt: '',
        answer: '',
        answer_text_markdown: '',
        sources: [],
    };
    if (!errorMessage) {
        return blank;
    }
    console.error(`[${providerName}] ${errorMessage}`, context ?? '');
    return blank;
}
181
+ // ============================================================================
182
+ // Scraper Factory
183
+ // ============================================================================
/**
 * Builds a scraper around the functions supplied by `provider`.
 *
 * @param {object} provider `name`, `maxConcurrency`, `maxPromptsPerRequest`
 *   plus `triggerJob`, `monitorJob`, `downloadJob` and `transformResponse`.
 * @returns {object} `maxConcurrency`, `maxPromptsPerRequest` and the
 *   `scrapeLLMBatch` / `triggerLLMBatch` / `downloadLLMSnapshots` functions.
 */
function createLLMScraper(provider) {
    const { name, maxConcurrency, maxPromptsPerRequest, triggerJob, monitorJob, downloadJob, transformResponse, } = provider;
    /**
     * Triggers one job per prompt through mapParallel with `maxConcurrency`.
     * @returns {Promise<Array<string | null>>} The value triggerJob returned
     *   for each prompt.
     */
    async function triggerLLMBatch({ prompts, useSearch = false, countryISOCode = null, }) {
        const jobIds = await (0, async_js_1.mapParallel)(prompts, maxConcurrency, (prompt) => triggerJob(prompt, useSearch, countryISOCode));
        // Count only jobs that actually got an id; failed triggers are falsy.
        const triggeredCount = jobIds.filter(Boolean).length;
        console.log(`[${name}] Triggered ${triggeredCount} jobs for ${prompts.length} prompts`);
        return jobIds;
    }
    // Runs transformResponse so that a malformed payload costs one row, not the batch.
    function transformSafely(raw, jobId) {
        try {
            return transformResponse(raw);
        }
        catch (error) {
            console.error(`[${name}] Transform threw for job ${jobId}:`, error);
            return null;
        }
    }
    /**
     * Waits for and downloads each job in turn.
     * @returns {Promise<Array<object>>} One result per entry of `jobIds`, in
     *   the same order; failures become empty results.
     */
    async function downloadLLMSnapshots(jobIds) {
        const results = [];
        for (const jobId of jobIds) {
            if (!jobId) {
                results.push(emptyModelResult(name, 'No job ID provided'));
                continue;
            }
            const isReady = await monitorJob(jobId);
            if (!isReady) {
                results.push(emptyModelResult(name, 'Job not ready or failed', jobId));
                continue;
            }
            const raw = await downloadJob(jobId);
            if (!raw) {
                results.push(emptyModelResult(name, 'Failed to download job', jobId));
                continue;
            }
            const result = transformSafely(raw, jobId);
            results.push(result ?? emptyModelResult(name, 'Failed to transform response', jobId));
        }
        return results;
    }
    /** Triggers every prompt, then downloads the resulting jobs. */
    async function scrapeLLMBatch(options) {
        const jobIds = await triggerLLMBatch(options);
        return downloadLLMSnapshots(jobIds);
    }
    return {
        maxConcurrency,
        maxPromptsPerRequest,
        scrapeLLMBatch,
        triggerLLMBatch,
        downloadLLMSnapshots,
    };
}
@@ -1,12 +1,12 @@
1
1
  import type { z } from '../../deps/jsr.io/@zod/zod/4.3.6/src/index.js';
2
- import type { Source, SearchSource } from './sources.schema.js';
2
+ import type { Source } from './sources.schema.js';
3
3
  export type ContextSize = 'low' | 'medium' | 'high';
4
4
  export type ReasoningEffort = 'low' | 'medium' | 'high';
5
5
  export interface SearchResult {
6
6
  answer: string;
7
+ answer_text_markdown?: string;
7
8
  sources: Array<Source>;
8
9
  searchQueries?: Array<string>;
9
- searchSources?: Array<SearchSource>;
10
10
  }
11
11
  export type SearchOptions = {
12
12
  prompt: string;
@@ -1 +1 @@
1
- {"version":3,"file":"search.schema.d.ts","sourceRoot":"","sources":["../../../src/src/schemas/search.schema.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,+CAA+C,CAAC;AAEvE,OAAO,KAAK,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AAChE,MAAM,MAAM,WAAW,GAAG,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;AACpD,MAAM,MAAM,eAAe,GAAG,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;AAGxD,MAAM,WAAW,YAAY;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACvB,aAAa,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAC9B,aAAa,CAAC,EAAE,KAAK,CAAC,YAAY,CAAC,CAAC;CACpC;AAED,MAAM,MAAM,aAAa,GAAG;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB;4FACwF;IACxF,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC,UAAU,CAAC,EAAE,YAAY,GAAG,oBAAoB,CAAA;CAChD,CAAC;AAEF,MAAM,MAAM,sBAAsB,CAAC,CAAC,IAAI,aAAa,GAAG;IACvD,cAAc,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;CAC7B,CAAC;AAEF,MAAM,MAAM,kBAAkB,GAAG,IAAI,CAAC,aAAa,EAAE,QAAQ,CAAC,GAAG;IAChE,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,CAAA;CACvB,CAAA"}
1
+ {"version":3,"file":"search.schema.d.ts","sourceRoot":"","sources":["../../../src/src/schemas/search.schema.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,+CAA+C,CAAC;AAEvE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAC;AAClD,MAAM,MAAM,WAAW,GAAG,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;AACpD,MAAM,MAAM,eAAe,GAAG,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;AAExD,MAAM,WAAW,YAAY;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACvB,aAAa,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC9B;AAED,MAAM,MAAM,aAAa,GAAG;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB;4FACwF;IACxF,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC,UAAU,CAAC,EAAE,YAAY,GAAG,oBAAoB,CAAC;CACjD,CAAC;AAEF,MAAM,MAAM,sBAAsB,CAAC,CAAC,IAAI,aAAa,GAAG;IACvD,cAAc,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;CAC7B,CAAC;AAEF,MAAM,MAAM,kBAAkB,GAAG,IAAI,CAAC,aAAa,EAAE,QAAQ,CAAC,GAAG;IAChE,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,CAAC;CACxB,CAAC"}