firecrawl 4.25.2 → 4.25.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-XCQC2QCZ.js → chunk-5D4KXCYO.js} +2 -2
- package/dist/index.cjs +150 -2
- package/dist/index.d.cts +189 -1
- package/dist/index.d.ts +189 -1
- package/dist/index.js +149 -2
- package/dist/{package-D6422PQU.js → package-HESILIET.js} +1 -1
- package/package.json +2 -2
- package/pnpm-workspace.yaml +3 -0
- package/src/__tests__/e2e/v1/index.test.ts +15 -15
- package/src/__tests__/unit/v2/research.test.ts +168 -0
- package/src/index.ts +2 -0
- package/src/v2/client.ts +12 -0
- package/src/v2/methods/research.ts +195 -0
- package/src/v2/types.ts +158 -0
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
SearchPapersOptions,
|
|
3
|
+
SearchPapersResponse,
|
|
4
|
+
GetPaperOptions,
|
|
5
|
+
PaperMetadataResponse,
|
|
6
|
+
ReadPaperResponse,
|
|
7
|
+
SimilarPapersOptions,
|
|
8
|
+
SimilarPapersResponse,
|
|
9
|
+
SearchGithubOptions,
|
|
10
|
+
GitHubSearchResponse,
|
|
11
|
+
} from "../types";
|
|
12
|
+
import { SdkError } from "../types";
|
|
13
|
+
import { HttpClient } from "../utils/httpClient";
|
|
14
|
+
import { throwForBadResponse } from "../utils/errorHandler";
|
|
15
|
+
|
|
16
|
+
// Path prefix shared by every research endpoint URL built in this module.
const BASE = "/v2/research";
|
|
17
|
+
|
|
18
|
+
/**
 * Add `key=value` to `params`, repeating the key once per element when
 * `value` is an array.
 *
 * - `null` / `undefined` values add nothing.
 * - Array elements that are nullish or stringify to `""` are dropped.
 * - A scalar is always appended after `String()` conversion, including `""`.
 *
 * @param params Query-string accumulator; mutated in place.
 * @param key Parameter name.
 * @param value Scalar, list of strings, or nothing.
 */
function appendParam(
  params: URLSearchParams,
  key: string,
  value: string | number | boolean | string[] | undefined,
): void {
  if (value == null) return;

  if (!Array.isArray(value)) {
    params.append(key, String(value));
    return;
  }

  value
    .filter((item) => item != null && String(item).length > 0)
    .forEach((item) => params.append(key, String(item)));
}
|
|
33
|
+
|
|
34
|
+
/**
 * Join a path with a serialized query string.
 *
 * @param path Request path without a query component.
 * @param params Parameters to serialize with `URLSearchParams#toString`.
 * @returns `path?query` when there is at least one parameter, otherwise
 *   `path` unchanged (no trailing `?`).
 */
function withQuery(path: string, params: URLSearchParams): string {
  const serialized = params.toString();
  if (!serialized) return path;
  return `${path}?${serialized}`;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Convert a failed research request into an `SdkError` and throw it.
 *
 * - Errors not flagged `isAxiosError` are rethrown untouched.
 * - When the response body carries an RFC 7807 Problem (`detail` or `title`
 *   present), that text becomes the message and `body.type` the error code.
 * - Otherwise the axios message is used (or a generic one naming `action`),
 *   with the axios `code` as the error code.
 *
 * @param err Value caught from the request.
 * @param action Short description of the attempted operation, used in the
 *   fallback message.
 * @throws Always; this function never returns.
 */
function normalizeResearchError(err: any, action: string): never {
  // Anything that did not come from axios is passed through as-is.
  if (!err?.isAxiosError) throw err;

  const response = err.response;
  const status: number | undefined = response?.status;
  const problem: any = response?.data;

  // Prefer the Problem's `detail`, then its `title`.
  const problemMessage = problem ? problem.detail || problem.title : undefined;
  if (problemMessage) {
    throw new SdkError(problemMessage, status, problem.type, problem);
  }

  const fallbackMessage =
    err.message || `Request failed while trying to ${action}`;
  throw new SdkError(fallbackMessage, status, err.code, problem);
}
|
|
60
|
+
|
|
61
|
+
/**
 * Client for the v2 research endpoints (arXiv papers + GitHub history/readmes).
 * Accessed via `firecrawl.research`.
 *
 * Each public method validates its arguments locally (throwing a plain
 * `Error` on invalid input) and then issues a single GET request below
 * `/v2/research`.
 */
export class ResearchClient {
  constructor(private readonly http: HttpClient) {}

  /**
   * Validate an optional result count before it is put on the query string.
   *
   * Non-finite values are rejected as well: a bare `k <= 0` comparison is
   * false for `NaN`, which would otherwise be sent as `k=NaN`.
   *
   * @param k Requested count; when nullish the parameter is simply omitted.
   * @throws Error when `k` is supplied but is not a positive, finite number.
   */
  private static assertPositiveK(k: number | undefined): void {
    if (k == null) return;
    if (!Number.isFinite(k) || k <= 0) throw new Error("k must be positive");
  }

  /**
   * Shared GET pipeline used by every endpoint of this client.
   *
   * @param path Endpoint path (without query string).
   * @param params Query parameters to append to `path`.
   * @param action Short description of the operation, used in error messages.
   * @returns The response body when the status is exactly 200.
   * @throws Whatever `throwForBadResponse` / `normalizeResearchError` throw.
   */
  private async getJson<T>(
    path: string,
    params: URLSearchParams,
    action: string,
  ): Promise<T> {
    try {
      const res = await this.http.get<T>(withQuery(path, params));
      if (res.status !== 200) throwForBadResponse(res, action);
      return res.data;
    } catch (err) {
      // Errors raised by throwForBadResponse are caught here too;
      // normalizeResearchError rethrows non-axios errors unchanged.
      return normalizeResearchError(err, action);
    }
  }

  /**
   * Search papers by abstract relevance.
   * @param query Natural-language search query.
   * @param options Optional filters (k, authors, categories, from, to).
   * @throws Error when `query` is blank or `k` is not positive.
   */
  async searchPapers(
    query: string,
    options: SearchPapersOptions = {},
  ): Promise<SearchPapersResponse> {
    if (!query || !query.trim()) throw new Error("query cannot be empty");
    ResearchClient.assertPositiveK(options.k);

    const params = new URLSearchParams();
    appendParam(params, "query", query);
    appendParam(params, "k", options.k);
    appendParam(params, "authors", options.authors);
    appendParam(params, "categories", options.categories);
    appendParam(params, "from", options.from);
    appendParam(params, "to", options.to);

    return this.getJson<SearchPapersResponse>(
      `${BASE}/papers`,
      params,
      "search papers",
    );
  }

  /**
   * Get paper metadata (detail mode), or read in-body passages (when `query` is
   * supplied). `k` is only valid together with `query`.
   * @param id Paper reference: a canonical `paper_id`, an `arxiv:<id>` key, or a
   *   bare arXiv id / URL.
   * @param options Optional `query` (switches to read mode) and `k`.
   * @throws Error when `id` is blank, `k` is given without `query`, or `k` is
   *   not positive.
   */
  async getPaper(
    id: string,
    options?: { query?: undefined; k?: undefined },
  ): Promise<PaperMetadataResponse>;
  async getPaper(
    id: string,
    options: { query: string; k?: number },
  ): Promise<ReadPaperResponse>;
  async getPaper(
    id: string,
    options: GetPaperOptions = {},
  ): Promise<PaperMetadataResponse | ReadPaperResponse> {
    if (!id || !id.trim()) throw new Error("id cannot be empty");
    if (options.k != null && options.query == null)
      throw new Error("k is only valid together with query");
    ResearchClient.assertPositiveK(options.k);

    const params = new URLSearchParams();
    appendParam(params, "query", options.query);
    appendParam(params, "k", options.k);

    return this.getJson<PaperMetadataResponse | ReadPaperResponse>(
      `${BASE}/papers/${encodeURIComponent(id)}`,
      params,
      "get paper",
    );
  }

  /**
   * Find related papers via the citation graph.
   * @param id Primary seed paper reference.
   * @param options Required `intent` plus optional mode, k, rerank, anchor.
   * @throws Error when `id` or `intent` is blank, or `k` is not positive.
   */
  async similarPapers(
    id: string,
    options: SimilarPapersOptions,
  ): Promise<SimilarPapersResponse> {
    if (!id || !id.trim()) throw new Error("id cannot be empty");
    if (!options?.intent || !options.intent.trim())
      throw new Error("intent cannot be empty");
    ResearchClient.assertPositiveK(options.k);

    const params = new URLSearchParams();
    appendParam(params, "intent", options.intent);
    appendParam(params, "mode", options.mode);
    appendParam(params, "k", options.k);
    // appendParam already skips nullish values, so no extra guard is needed.
    appendParam(params, "rerank", options.rerank);
    appendParam(params, "anchor", options.anchor);

    return this.getJson<SimilarPapersResponse>(
      `${BASE}/papers/${encodeURIComponent(id)}/similar`,
      params,
      "find similar papers",
    );
  }

  /**
   * Search GitHub issue/PR history and repository readmes.
   * @param query Search query.
   * @param options Optional `k`.
   * @throws Error when `query` is blank or `k` is not positive.
   */
  async searchGithub(
    query: string,
    options: SearchGithubOptions = {},
  ): Promise<GitHubSearchResponse> {
    if (!query || !query.trim()) throw new Error("query cannot be empty");
    ResearchClient.assertPositiveK(options.k);

    const params = new URLSearchParams();
    appendParam(params, "query", query);
    appendParam(params, "k", options.k);

    return this.getJson<GitHubSearchResponse>(
      `${BASE}/github`,
      params,
      "search github",
    );
  }
}
|
package/src/v2/types.ts
CHANGED
|
@@ -1155,3 +1155,161 @@ export interface BrowserListResponse {
|
|
|
1155
1155
|
sessions?: BrowserSession[];
|
|
1156
1156
|
error?: string;
|
|
1157
1157
|
}
|
|
1158
|
+
|
|
1159
|
+
// ---------- Research (v2) ----------
|
|
1160
|
+
|
|
1161
|
+
/**
 * Maps an identifier namespace to the list of ids a paper has in that
 * namespace. At present `arxiv` is the only namespace that is populated.
 */
export type IdMap = Record<string, string[]>;
|
|
1166
|
+
|
|
1167
|
+
/** Ranking signals reported for each candidate of a similarity result. */
export interface PaperSignals {
  /** Raw structural strength: co-citation / coupling counts, or seed overlap. */
  structural: number;

  /** Semantic score from the intent abstract search; 0 when absent. */
  semantic: number;

  /** PageRank of the candidate in the citation graph. */
  pagerank: number;

  /** Count of distinct seeds connected to this candidate. */
  seed_overlap: number;
}
|
|
1178
|
+
|
|
1179
|
+
/**
 * One ranked paper in a result list. `paper_id` is the canonical identifier;
 * arXiv identifiers are carried in `ids`.
 */
export interface PaperResult {
  /** Canonical paper id: the Milvus INT64 primary key as a decimal string. */
  paper_id: string;

  ids?: IdMap;

  title: string;

  abstract: string;

  /** Final ranking score (after reranking when enabled). Not normalized. */
  score: number;

  /** Present on similarity results. */
  signals?: PaperSignals;
}
|
|
1191
|
+
|
|
1192
|
+
/** Descriptive metadata of a single paper. */
export interface PaperMetadata {
  paper_id: string;

  ids?: IdMap;

  title: string;

  abstract: string;

  /** Author names joined with commas. Omitted if unknown. */
  authors?: string;

  /** arXiv categories. Omitted if unknown. */
  categories?: string[];

  /** Original creation date as a string; the format varies. Omitted if unknown. */
  created_date?: string;

  /** Last-updated date as a string. Omitted if unknown. */
  update_date?: string;
}
|
|
1206
|
+
|
|
1207
|
+
/** A scored passage taken from the body of a paper. */
export interface Passage {
  /** Passage text; may be markdown, including tables. */
  text: string;

  /** Dense similarity score of the passage. */
  score: number;
}
|
|
1213
|
+
|
|
1214
|
+
/** Response body of `research.searchPapers`. */
export interface SearchPapersResponse {
  results: PaperResult[];
}
|
|
1217
|
+
|
|
1218
|
+
/** Response body of `research.getPaper` when no `query` is supplied. */
export interface PaperMetadataResponse {
  paper: PaperMetadata;
}
|
|
1221
|
+
|
|
1222
|
+
/** Response body of `research.getPaper` when a `query` is supplied. */
export interface ReadPaperResponse {
  paper: PaperMetadata;

  /** Resolved canonical paper id (empty string if not found via id-key). */
  paper_id: string;

  /** The read query, echoed back. */
  query: string;

  /** Top matching in-body passages. */
  passages: Passage[];
}
|
|
1231
|
+
|
|
1232
|
+
/** Response body of `research.similarPapers`. */
export interface SimilarPapersResponse {
  /** Ranked related papers; each one carries `signals`. */
  results: PaperResult[];

  /** Number of resolved candidates considered before truncation to `k`. */
  pool_size: number;

  /** True when more resolved candidates existed than were returned. */
  truncated: boolean;

  /** Human-readable note when no results are produced. */
  note?: string | null;
}
|
|
1242
|
+
|
|
1243
|
+
/**
 * Component scores of a GitHub search hit. A field is present only when the
 * corresponding signal contributed to the result.
 */
export interface GitHubScoreBreakdown {
  rrf?: number;

  semantic?: number;

  lexical?: number;

  fusion?: number;
}
|
|
1250
|
+
|
|
1251
|
+
/** One hit of a GitHub search: either a history page or a repository readme. */
export interface GitHubSearchItem {
  resultType: "github_history" | "repo_readme";

  /** Repository in `owner/name` form. */
  repo: string;

  url: string;

  /** History page type (e.g. `issue`, `pull`). Omitted for readmes. */
  pageType?: string;

  /** Issue/PR number. Omitted for readmes. */
  number?: number;

  /** Number of matched segments/chunks. Omitted when not applicable. */
  segmentCount?: number;

  /** Readme URL, for readme results. Omitted otherwise. */
  readmeUrl?: string;

  /** Short matched excerpt. */
  snippet: string;

  /** Full matched content in markdown. Omitted unless available. */
  contentMd?: string;

  scores: GitHubScoreBreakdown;
}
|
|
1270
|
+
|
|
1271
|
+
/** Response body of `research.searchGithub`. */
export interface GitHubSearchResponse {
  results: GitHubSearchItem[];
}
|
|
1274
|
+
|
|
1275
|
+
/** Optional filters accepted by `research.searchPapers`. */
export interface SearchPapersOptions {
  /** How many results to return (1–500, default 40). */
  k?: number;

  /** Author substring filter(s), case-insensitive; ALL must match. */
  authors?: string[];

  /** arXiv category filter(s), e.g. `cs.LG`; ALL must match. */
  categories?: string[];

  /** Inclusive lower bound on created/updated date (ISO `YYYY-MM-DD`). */
  from?: string;

  /** Inclusive upper bound on created/updated date (lexicographic). */
  to?: string;
}
|
|
1288
|
+
|
|
1289
|
+
/** Options accepted by `research.getPaper`. */
export interface GetPaperOptions {
  /** When present, switches to read mode and returns in-body passages. */
  query?: string;

  /** Passage count, read mode only (1–50, default 4). Requires `query`. */
  k?: number;
}
|
|
1296
|
+
|
|
1297
|
+
/** Options accepted by `research.similarPapers`. */
export interface SimilarPapersOptions {
  /** Natural-language intent used to semantically rerank candidates. Required. */
  intent: string;

  /** Traversal mode (default `similar`). */
  mode?:
    | "similar"
    | "citers"
    | "references";

  /** How many related papers to return (1–500, default 40). */
  k?: number;

  /** Apply an additional ZeroEntropy rerank over the fused candidates. */
  rerank?: boolean;

  /** Additional seed paper reference(s), in the same format as `id`. */
  anchor?: string[];
}
|
|
1310
|
+
|
|
1311
|
+
/** Options accepted by `research.searchGithub`. */
export interface SearchGithubOptions {
  /** How many results to return (1–100, default 20). */
  k?: number;
}
|