@graphext/cuery 0.10.0 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/src/apis/brightdata/llmScraper/index.d.ts +2 -1
- package/esm/src/apis/brightdata/llmScraper/index.d.ts.map +1 -1
- package/esm/src/apis/brightdata/llmScraper/index.js +19 -6
- package/esm/src/apis/hasdata/helpers.d.ts.map +1 -1
- package/esm/src/apis/hasdata/helpers.js +56 -18
- package/package.json +1 -1
- package/script/src/apis/brightdata/llmScraper/index.d.ts +2 -1
- package/script/src/apis/brightdata/llmScraper/index.d.ts.map +1 -1
- package/script/src/apis/brightdata/llmScraper/index.js +20 -6
- package/script/src/apis/hasdata/helpers.d.ts.map +1 -1
- package/script/src/apis/hasdata/helpers.js +56 -18
|
@@ -2,7 +2,7 @@ import type { ModelResult } from '../../../schemas/models.schema.js';
|
|
|
2
2
|
import { type BatchOptions } from './scrape.js';
|
|
3
3
|
export type { BatchOptions };
|
|
4
4
|
export type JobId = string | null;
|
|
5
|
-
export type ScraperTarget = 'chatgpt' | 'aim';
|
|
5
|
+
export type ScraperTarget = 'chatgpt' | 'aim' | 'generic';
|
|
6
6
|
export declare function getMaxConcurrency(target?: ScraperTarget): number;
|
|
7
7
|
export declare function getMaxPromptsPerRequest(target?: ScraperTarget): number;
|
|
8
8
|
export declare function scrapeGPTBatch(options: BatchOptions): Promise<Array<ModelResult>>;
|
|
@@ -11,4 +11,5 @@ export declare function downloadGPTSnapshots(jobIds: Array<string | null>): Prom
|
|
|
11
11
|
export declare function scrapeAIMBatch(options: BatchOptions): Promise<Array<ModelResult>>;
|
|
12
12
|
export declare function triggerAIMBatch(options: BatchOptions): Promise<Array<string | null>>;
|
|
13
13
|
export declare function downloadAIMSnapshots(jobIds: Array<string | null>): Promise<Array<ModelResult>>;
|
|
14
|
+
export declare function downloadSnapshots(jobIds: Array<string | null>): Promise<Array<ModelResult>>;
|
|
14
15
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/src/apis/brightdata/llmScraper/index.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mCAAmC,CAAC;AACrE,OAAO,EAAE,KAAK,YAAY,EAAqC,MAAM,aAAa,CAAC;AAKnF,YAAY,EAAE,YAAY,EAAE,CAAC;AAC7B,MAAM,MAAM,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC;AAClC,MAAM,MAAM,aAAa,GAAG,SAAS,GAAG,KAAK,CAAC;
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/src/apis/brightdata/llmScraper/index.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mCAAmC,CAAC;AACrE,OAAO,EAAE,KAAK,YAAY,EAAqC,MAAM,aAAa,CAAC;AAKnF,YAAY,EAAE,YAAY,EAAE,CAAC;AAC7B,MAAM,MAAM,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC;AAClC,MAAM,MAAM,aAAa,GAAG,SAAS,GAAG,KAAK,GAAG,SAAS,CAAC;AAiF1D,wBAAgB,iBAAiB,CAAC,MAAM,GAAE,aAAyB,GAAG,MAAM,CAE3E;AAED,wBAAgB,uBAAuB,CAAC,MAAM,GAAE,aAAyB,GAAG,MAAM,CAEjF;AAGD,wBAAsB,cAAc,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEvF;AAED,wBAAsB,eAAe,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAE1F;AAED,wBAAsB,oBAAoB,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEpG;AAGD,wBAAsB,cAAc,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEvF;AAED,wBAAsB,eAAe,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAE1F;AAED,wBAAsB,oBAAoB,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEpG;AAGD,wBAAsB,iBAAiB,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEjG"}
|
|
@@ -57,10 +57,19 @@ function getLLMScraper(target = 'chatgpt') {
|
|
|
57
57
|
return existingScraper;
|
|
58
58
|
}
|
|
59
59
|
const providerName = getProviderName();
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
60
|
+
let provider;
|
|
61
|
+
if (target === 'generic') {
|
|
62
|
+
// Generic instance: only used for download/monitor, no target-specific config needed
|
|
63
|
+
provider = providerName === 'brightdata'
|
|
64
|
+
? createBrightdataProvider()
|
|
65
|
+
: createOxylabsProvider();
|
|
66
|
+
}
|
|
67
|
+
else {
|
|
68
|
+
const targetOptions = getTargetOptions(target);
|
|
69
|
+
provider = providerName === 'brightdata'
|
|
70
|
+
? createBrightdataProvider(targetOptions.brightdata)
|
|
71
|
+
: createOxylabsProvider(targetOptions.oxylabs);
|
|
72
|
+
}
|
|
64
73
|
const scraper = createLLMScraper(provider);
|
|
65
74
|
scrapers.set(target, scraper);
|
|
66
75
|
return scraper;
|
|
@@ -83,7 +92,7 @@ export async function triggerGPTBatch(options) {
|
|
|
83
92
|
return getLLMScraper('chatgpt').triggerLLMBatch(options);
|
|
84
93
|
}
|
|
85
94
|
export async function downloadGPTSnapshots(jobIds) {
|
|
86
|
-
return
|
|
95
|
+
return downloadSnapshots(jobIds);
|
|
87
96
|
}
|
|
88
97
|
// AIM scraper methods
|
|
89
98
|
export async function scrapeAIMBatch(options) {
|
|
@@ -93,5 +102,9 @@ export async function triggerAIMBatch(options) {
|
|
|
93
102
|
return getLLMScraper('aim').triggerLLMBatch(options);
|
|
94
103
|
}
|
|
95
104
|
export async function downloadAIMSnapshots(jobIds) {
|
|
96
|
-
return
|
|
105
|
+
return downloadSnapshots(jobIds);
|
|
106
|
+
}
|
|
107
|
+
// Generic download — target-agnostic, works with any job IDs
|
|
108
|
+
export async function downloadSnapshots(jobIds) {
|
|
109
|
+
return getLLMScraper('generic').downloadLLMSnapshots(jobIds);
|
|
97
110
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"helpers.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/hasdata/helpers.ts"],"names":[],"mappings":"AACA,OAAO,EAAe,KAAK,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAEvE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,iCAAiC,CAAC;AAG9D,eAAO,MAAM,mBAAmB,KAAK,CAAC;AAEtC,eAAO,MAAM,oBAAoB,EAAE,WAMlC,CAAC;AAEF,wBAAgB,gBAAgB,IAAI,MAAM,CAMzC;AAED,wBAAsB,qBAAqB,CAC1C,GAAG,EAAE,MAAM,EACX,WAAW,GAAE,WAAkC,GAC7C,OAAO,CAAC,QAAQ,CAAC,CAgCnB;AAED,UAAU,QAAQ;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;CACvB;AAED,UAAU,SAAS;IAClB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uBAAuB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACxC,gBAAgB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACjC,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IACvB,IAAI,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IAC5B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,UAAU,SAAS;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,UAAU;IAC1B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,UAAU,CAAC;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,UAAU,eAAe;IACxB,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;CACb;AAED,MAAM,WAAW,MAAM;IACtB,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;CAC9B;AAED,MAAM,WAAW,SAAS;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CACvB;
|
|
1
|
+
{"version":3,"file":"helpers.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/hasdata/helpers.ts"],"names":[],"mappings":"AACA,OAAO,EAAe,KAAK,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAEvE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,iCAAiC,CAAC;AAG9D,eAAO,MAAM,mBAAmB,KAAK,CAAC;AAEtC,eAAO,MAAM,oBAAoB,EAAE,WAMlC,CAAC;AAEF,wBAAgB,gBAAgB,IAAI,MAAM,CAMzC;AAED,wBAAsB,qBAAqB,CAC1C,GAAG,EAAE,MAAM,EACX,WAAW,GAAE,WAAkC,GAC7C,OAAO,CAAC,QAAQ,CAAC,CAgCnB;AAED,UAAU,QAAQ;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;CACvB;AAED,UAAU,SAAS;IAClB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uBAAuB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACxC,gBAAgB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACjC,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IACvB,IAAI,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IAC5B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,UAAU,SAAS;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,UAAU;IAC1B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,UAAU,CAAC;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,UAAU,eAAe;IACxB,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;CACb;AAED,MAAM,WAAW,MAAM;IACtB,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;CAC9B;AAED,MAAM,WAAW,SAAS;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CACvB;AAqMD,wBAAgB,QAAQ,CAAC,GAAG,EAAE,UAAU,GAAG,SAAS,CAInD;AAED,wBAAgB,QAAQ,CAAC,GAAG,EAAE,MAAM,GAAG,SAAS,CAI/C"}
|
|
@@ -116,8 +116,38 @@ function formatCode(block) {
|
|
|
116
116
|
const header = `[Code${lang ? ': ' + lang : ''}]`;
|
|
117
117
|
return `${header}\n${snippet.trim()}`;
|
|
118
118
|
}
|
|
119
|
+
function formatCitationMarkers(refIndexes) {
|
|
120
|
+
if (refIndexes.length === 0) {
|
|
121
|
+
return '';
|
|
122
|
+
}
|
|
123
|
+
return ' ' + refIndexes.map(i => `[${i + 1}]`).join('');
|
|
124
|
+
}
|
|
119
125
|
function parseAIResult(data, { allowNestedOverview = true } = {}) {
|
|
120
126
|
const textBlocks = data.textBlocks || (allowNestedOverview ? data.aiOverview?.textBlocks : []) || [];
|
|
127
|
+
// Build reference index → source index mapping and track cited refs
|
|
128
|
+
const refs = data.references || (allowNestedOverview ? data.aiOverview?.references : []) || [];
|
|
129
|
+
const sources = [];
|
|
130
|
+
const refIndexToSourceIndex = new Map();
|
|
131
|
+
for (const r of refs) {
|
|
132
|
+
const link = r.link || r.url;
|
|
133
|
+
const title = [r.title, r.source, r.snippet].filter(Boolean).join(' - ');
|
|
134
|
+
if (link && r.index != null) {
|
|
135
|
+
// Deduplicate by URL
|
|
136
|
+
const existingIdx = sources.findIndex(s => s.url === link);
|
|
137
|
+
if (existingIdx >= 0) {
|
|
138
|
+
refIndexToSourceIndex.set(r.index, existingIdx);
|
|
139
|
+
}
|
|
140
|
+
else {
|
|
141
|
+
refIndexToSourceIndex.set(r.index, sources.length);
|
|
142
|
+
sources.push({
|
|
143
|
+
title,
|
|
144
|
+
url: link,
|
|
145
|
+
domain: extractDomain(link)
|
|
146
|
+
});
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
const citedSourceIndexes = new Set();
|
|
121
151
|
const parts = [];
|
|
122
152
|
const handlers = {
|
|
123
153
|
paragraph: (b) => cleanText(b.snippet || ''),
|
|
@@ -131,19 +161,40 @@ function parseAIResult(data, { allowNestedOverview = true } = {}) {
|
|
|
131
161
|
continue;
|
|
132
162
|
}
|
|
133
163
|
const handler = handlers[btype];
|
|
164
|
+
let rendered = '';
|
|
134
165
|
if (handler) {
|
|
135
|
-
|
|
136
|
-
if (rendered) {
|
|
137
|
-
parts.push(rendered);
|
|
138
|
-
}
|
|
166
|
+
rendered = handler(block);
|
|
139
167
|
}
|
|
140
168
|
else {
|
|
141
169
|
const snippet = block.snippet || '';
|
|
142
170
|
if (snippet) {
|
|
143
|
-
|
|
171
|
+
rendered = cleanText(snippet);
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
if (rendered) {
|
|
175
|
+
// Append citation markers and track positions
|
|
176
|
+
const refIndexes = block.referenceIndexes || [];
|
|
177
|
+
if (refIndexes.length > 0) {
|
|
178
|
+
// Map ref indexes to 1-based source indexes for display
|
|
179
|
+
const sourceIndexes = refIndexes
|
|
180
|
+
.map(ri => refIndexToSourceIndex.get(ri))
|
|
181
|
+
.filter((si) => si != null);
|
|
182
|
+
for (const si of sourceIndexes) {
|
|
183
|
+
citedSourceIndexes.add(si);
|
|
184
|
+
sources[si].positions ??= [];
|
|
185
|
+
if (!sources[si].positions.includes(parts.length)) {
|
|
186
|
+
sources[si].positions.push(parts.length);
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
rendered += formatCitationMarkers(sourceIndexes.filter((v, i, a) => a.indexOf(v) === i));
|
|
144
190
|
}
|
|
191
|
+
parts.push(rendered);
|
|
145
192
|
}
|
|
146
193
|
}
|
|
194
|
+
// Mark cited sources
|
|
195
|
+
for (const si of citedSourceIndexes) {
|
|
196
|
+
sources[si].cited = true;
|
|
197
|
+
}
|
|
147
198
|
const deduped = [];
|
|
148
199
|
for (const p of parts) {
|
|
149
200
|
if (deduped.length === 0 || deduped[deduped.length - 1] !== p) {
|
|
@@ -155,19 +206,6 @@ function parseAIResult(data, { allowNestedOverview = true } = {}) {
|
|
|
155
206
|
console.warn('Warning: AI answer truncated to 16000 characters');
|
|
156
207
|
answer = answer.slice(0, 16000);
|
|
157
208
|
}
|
|
158
|
-
const refs = data.references || (allowNestedOverview ? data.aiOverview?.references : []) || [];
|
|
159
|
-
const sources = [];
|
|
160
|
-
for (const r of refs) {
|
|
161
|
-
const link = r.link || r.url;
|
|
162
|
-
const title = [r.title, r.source, r.snippet].filter(Boolean).join(' - ');
|
|
163
|
-
if (link) {
|
|
164
|
-
sources.push({
|
|
165
|
-
title,
|
|
166
|
-
url: link,
|
|
167
|
-
domain: extractDomain(link)
|
|
168
|
-
});
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
209
|
return { answer, sources };
|
|
172
210
|
}
|
|
173
211
|
export function parseAIO(aio) {
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@ import type { ModelResult } from '../../../schemas/models.schema.js';
|
|
|
2
2
|
import { type BatchOptions } from './scrape.js';
|
|
3
3
|
export type { BatchOptions };
|
|
4
4
|
export type JobId = string | null;
|
|
5
|
-
export type ScraperTarget = 'chatgpt' | 'aim';
|
|
5
|
+
export type ScraperTarget = 'chatgpt' | 'aim' | 'generic';
|
|
6
6
|
export declare function getMaxConcurrency(target?: ScraperTarget): number;
|
|
7
7
|
export declare function getMaxPromptsPerRequest(target?: ScraperTarget): number;
|
|
8
8
|
export declare function scrapeGPTBatch(options: BatchOptions): Promise<Array<ModelResult>>;
|
|
@@ -11,4 +11,5 @@ export declare function downloadGPTSnapshots(jobIds: Array<string | null>): Prom
|
|
|
11
11
|
export declare function scrapeAIMBatch(options: BatchOptions): Promise<Array<ModelResult>>;
|
|
12
12
|
export declare function triggerAIMBatch(options: BatchOptions): Promise<Array<string | null>>;
|
|
13
13
|
export declare function downloadAIMSnapshots(jobIds: Array<string | null>): Promise<Array<ModelResult>>;
|
|
14
|
+
export declare function downloadSnapshots(jobIds: Array<string | null>): Promise<Array<ModelResult>>;
|
|
14
15
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/src/apis/brightdata/llmScraper/index.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mCAAmC,CAAC;AACrE,OAAO,EAAE,KAAK,YAAY,EAAqC,MAAM,aAAa,CAAC;AAKnF,YAAY,EAAE,YAAY,EAAE,CAAC;AAC7B,MAAM,MAAM,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC;AAClC,MAAM,MAAM,aAAa,GAAG,SAAS,GAAG,KAAK,CAAC;
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/src/apis/brightdata/llmScraper/index.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mCAAmC,CAAC;AACrE,OAAO,EAAE,KAAK,YAAY,EAAqC,MAAM,aAAa,CAAC;AAKnF,YAAY,EAAE,YAAY,EAAE,CAAC;AAC7B,MAAM,MAAM,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC;AAClC,MAAM,MAAM,aAAa,GAAG,SAAS,GAAG,KAAK,GAAG,SAAS,CAAC;AAiF1D,wBAAgB,iBAAiB,CAAC,MAAM,GAAE,aAAyB,GAAG,MAAM,CAE3E;AAED,wBAAgB,uBAAuB,CAAC,MAAM,GAAE,aAAyB,GAAG,MAAM,CAEjF;AAGD,wBAAsB,cAAc,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEvF;AAED,wBAAsB,eAAe,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAE1F;AAED,wBAAsB,oBAAoB,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEpG;AAGD,wBAAsB,cAAc,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEvF;AAED,wBAAsB,eAAe,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAE1F;AAED,wBAAsB,oBAAoB,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEpG;AAGD,wBAAsB,iBAAiB,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAEjG"}
|
|
@@ -41,6 +41,7 @@ exports.downloadGPTSnapshots = downloadGPTSnapshots;
|
|
|
41
41
|
exports.scrapeAIMBatch = scrapeAIMBatch;
|
|
42
42
|
exports.triggerAIMBatch = triggerAIMBatch;
|
|
43
43
|
exports.downloadAIMSnapshots = downloadAIMSnapshots;
|
|
44
|
+
exports.downloadSnapshots = downloadSnapshots;
|
|
44
45
|
/* eslint no-console: ["warn", { allow: ["log", "warn", "error"] }] */
|
|
45
46
|
/**
|
|
46
47
|
* LLM Scraper - Public API
|
|
@@ -100,10 +101,19 @@ function getLLMScraper(target = 'chatgpt') {
|
|
|
100
101
|
return existingScraper;
|
|
101
102
|
}
|
|
102
103
|
const providerName = getProviderName();
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
104
|
+
let provider;
|
|
105
|
+
if (target === 'generic') {
|
|
106
|
+
// Generic instance: only used for download/monitor, no target-specific config needed
|
|
107
|
+
provider = providerName === 'brightdata'
|
|
108
|
+
? (0, brightdata_js_1.createBrightdataProvider)()
|
|
109
|
+
: (0, oxy_js_1.createOxylabsProvider)();
|
|
110
|
+
}
|
|
111
|
+
else {
|
|
112
|
+
const targetOptions = getTargetOptions(target);
|
|
113
|
+
provider = providerName === 'brightdata'
|
|
114
|
+
? (0, brightdata_js_1.createBrightdataProvider)(targetOptions.brightdata)
|
|
115
|
+
: (0, oxy_js_1.createOxylabsProvider)(targetOptions.oxylabs);
|
|
116
|
+
}
|
|
107
117
|
const scraper = (0, scrape_js_1.createLLMScraper)(provider);
|
|
108
118
|
scrapers.set(target, scraper);
|
|
109
119
|
return scraper;
|
|
@@ -126,7 +136,7 @@ async function triggerGPTBatch(options) {
|
|
|
126
136
|
return getLLMScraper('chatgpt').triggerLLMBatch(options);
|
|
127
137
|
}
|
|
128
138
|
async function downloadGPTSnapshots(jobIds) {
|
|
129
|
-
return
|
|
139
|
+
return downloadSnapshots(jobIds);
|
|
130
140
|
}
|
|
131
141
|
// AIM scraper methods
|
|
132
142
|
async function scrapeAIMBatch(options) {
|
|
@@ -136,5 +146,9 @@ async function triggerAIMBatch(options) {
|
|
|
136
146
|
return getLLMScraper('aim').triggerLLMBatch(options);
|
|
137
147
|
}
|
|
138
148
|
async function downloadAIMSnapshots(jobIds) {
|
|
139
|
-
return
|
|
149
|
+
return downloadSnapshots(jobIds);
|
|
150
|
+
}
|
|
151
|
+
// Generic download — target-agnostic, works with any job IDs
|
|
152
|
+
async function downloadSnapshots(jobIds) {
|
|
153
|
+
return getLLMScraper('generic').downloadLLMSnapshots(jobIds);
|
|
140
154
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"helpers.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/hasdata/helpers.ts"],"names":[],"mappings":"AACA,OAAO,EAAe,KAAK,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAEvE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,iCAAiC,CAAC;AAG9D,eAAO,MAAM,mBAAmB,KAAK,CAAC;AAEtC,eAAO,MAAM,oBAAoB,EAAE,WAMlC,CAAC;AAEF,wBAAgB,gBAAgB,IAAI,MAAM,CAMzC;AAED,wBAAsB,qBAAqB,CAC1C,GAAG,EAAE,MAAM,EACX,WAAW,GAAE,WAAkC,GAC7C,OAAO,CAAC,QAAQ,CAAC,CAgCnB;AAED,UAAU,QAAQ;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;CACvB;AAED,UAAU,SAAS;IAClB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uBAAuB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACxC,gBAAgB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACjC,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IACvB,IAAI,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IAC5B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,UAAU,SAAS;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,UAAU;IAC1B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,UAAU,CAAC;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,UAAU,eAAe;IACxB,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;CACb;AAED,MAAM,WAAW,MAAM;IACtB,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;CAC9B;AAED,MAAM,WAAW,SAAS;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CACvB;
|
|
1
|
+
{"version":3,"file":"helpers.d.ts","sourceRoot":"","sources":["../../../../src/src/apis/hasdata/helpers.ts"],"names":[],"mappings":"AACA,OAAO,EAAe,KAAK,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAEvE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,iCAAiC,CAAC;AAG9D,eAAO,MAAM,mBAAmB,KAAK,CAAC;AAEtC,eAAO,MAAM,oBAAoB,EAAE,WAMlC,CAAC;AAEF,wBAAgB,gBAAgB,IAAI,MAAM,CAMzC;AAED,wBAAsB,qBAAqB,CAC1C,GAAG,EAAE,MAAM,EACX,WAAW,GAAE,WAAkC,GAC7C,OAAO,CAAC,QAAQ,CAAC,CAgCnB;AAED,UAAU,QAAQ;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;CACvB;AAED,UAAU,SAAS;IAClB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uBAAuB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACxC,gBAAgB,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACjC,IAAI,CAAC,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IACvB,IAAI,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IAC5B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,UAAU,SAAS;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,UAAU;IAC1B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,UAAU,CAAC;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,UAAU,eAAe;IACxB,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;CACb;AAED,MAAM,WAAW,MAAM;IACtB,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC9B,UAAU,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;CAC9B;AAED,MAAM,WAAW,SAAS;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CACvB;AAqMD,wBAAgB,QAAQ,CAAC,GAAG,EAAE,UAAU,GAAG,SAAS,CAInD;AAED,wBAAgB,QAAQ,CAAC,GAAG,EAAE,MAAM,GAAG,SAAS,CAI/C"}
|
|
@@ -156,8 +156,38 @@ function formatCode(block) {
|
|
|
156
156
|
const header = `[Code${lang ? ': ' + lang : ''}]`;
|
|
157
157
|
return `${header}\n${snippet.trim()}`;
|
|
158
158
|
}
|
|
159
|
+
function formatCitationMarkers(refIndexes) {
|
|
160
|
+
if (refIndexes.length === 0) {
|
|
161
|
+
return '';
|
|
162
|
+
}
|
|
163
|
+
return ' ' + refIndexes.map(i => `[${i + 1}]`).join('');
|
|
164
|
+
}
|
|
159
165
|
function parseAIResult(data, { allowNestedOverview = true } = {}) {
|
|
160
166
|
const textBlocks = data.textBlocks || (allowNestedOverview ? data.aiOverview?.textBlocks : []) || [];
|
|
167
|
+
// Build reference index → source index mapping and track cited refs
|
|
168
|
+
const refs = data.references || (allowNestedOverview ? data.aiOverview?.references : []) || [];
|
|
169
|
+
const sources = [];
|
|
170
|
+
const refIndexToSourceIndex = new Map();
|
|
171
|
+
for (const r of refs) {
|
|
172
|
+
const link = r.link || r.url;
|
|
173
|
+
const title = [r.title, r.source, r.snippet].filter(Boolean).join(' - ');
|
|
174
|
+
if (link && r.index != null) {
|
|
175
|
+
// Deduplicate by URL
|
|
176
|
+
const existingIdx = sources.findIndex(s => s.url === link);
|
|
177
|
+
if (existingIdx >= 0) {
|
|
178
|
+
refIndexToSourceIndex.set(r.index, existingIdx);
|
|
179
|
+
}
|
|
180
|
+
else {
|
|
181
|
+
refIndexToSourceIndex.set(r.index, sources.length);
|
|
182
|
+
sources.push({
|
|
183
|
+
title,
|
|
184
|
+
url: link,
|
|
185
|
+
domain: (0, urls_js_1.extractDomain)(link)
|
|
186
|
+
});
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
const citedSourceIndexes = new Set();
|
|
161
191
|
const parts = [];
|
|
162
192
|
const handlers = {
|
|
163
193
|
paragraph: (b) => cleanText(b.snippet || ''),
|
|
@@ -171,19 +201,40 @@ function parseAIResult(data, { allowNestedOverview = true } = {}) {
|
|
|
171
201
|
continue;
|
|
172
202
|
}
|
|
173
203
|
const handler = handlers[btype];
|
|
204
|
+
let rendered = '';
|
|
174
205
|
if (handler) {
|
|
175
|
-
|
|
176
|
-
if (rendered) {
|
|
177
|
-
parts.push(rendered);
|
|
178
|
-
}
|
|
206
|
+
rendered = handler(block);
|
|
179
207
|
}
|
|
180
208
|
else {
|
|
181
209
|
const snippet = block.snippet || '';
|
|
182
210
|
if (snippet) {
|
|
183
|
-
|
|
211
|
+
rendered = cleanText(snippet);
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
if (rendered) {
|
|
215
|
+
// Append citation markers and track positions
|
|
216
|
+
const refIndexes = block.referenceIndexes || [];
|
|
217
|
+
if (refIndexes.length > 0) {
|
|
218
|
+
// Map ref indexes to 1-based source indexes for display
|
|
219
|
+
const sourceIndexes = refIndexes
|
|
220
|
+
.map(ri => refIndexToSourceIndex.get(ri))
|
|
221
|
+
.filter((si) => si != null);
|
|
222
|
+
for (const si of sourceIndexes) {
|
|
223
|
+
citedSourceIndexes.add(si);
|
|
224
|
+
sources[si].positions ??= [];
|
|
225
|
+
if (!sources[si].positions.includes(parts.length)) {
|
|
226
|
+
sources[si].positions.push(parts.length);
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
rendered += formatCitationMarkers(sourceIndexes.filter((v, i, a) => a.indexOf(v) === i));
|
|
184
230
|
}
|
|
231
|
+
parts.push(rendered);
|
|
185
232
|
}
|
|
186
233
|
}
|
|
234
|
+
// Mark cited sources
|
|
235
|
+
for (const si of citedSourceIndexes) {
|
|
236
|
+
sources[si].cited = true;
|
|
237
|
+
}
|
|
187
238
|
const deduped = [];
|
|
188
239
|
for (const p of parts) {
|
|
189
240
|
if (deduped.length === 0 || deduped[deduped.length - 1] !== p) {
|
|
@@ -195,19 +246,6 @@ function parseAIResult(data, { allowNestedOverview = true } = {}) {
|
|
|
195
246
|
console.warn('Warning: AI answer truncated to 16000 characters');
|
|
196
247
|
answer = answer.slice(0, 16000);
|
|
197
248
|
}
|
|
198
|
-
const refs = data.references || (allowNestedOverview ? data.aiOverview?.references : []) || [];
|
|
199
|
-
const sources = [];
|
|
200
|
-
for (const r of refs) {
|
|
201
|
-
const link = r.link || r.url;
|
|
202
|
-
const title = [r.title, r.source, r.snippet].filter(Boolean).join(' - ');
|
|
203
|
-
if (link) {
|
|
204
|
-
sources.push({
|
|
205
|
-
title,
|
|
206
|
-
url: link,
|
|
207
|
-
domain: (0, urls_js_1.extractDomain)(link)
|
|
208
|
-
});
|
|
209
|
-
}
|
|
210
|
-
}
|
|
211
249
|
return { answer, sources };
|
|
212
250
|
}
|
|
213
251
|
function parseAIO(aio) {
|