firecrawl 4.25.1 → 4.25.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/{chunk-7KWNHI4H.js → chunk-5D4KXCYO.js} +4 -4
- package/dist/index.cjs +152 -4
- package/dist/index.d.cts +262 -2
- package/dist/index.d.ts +262 -2
- package/dist/index.js +149 -2
- package/dist/{package-KBCQFPRT.js → package-HESILIET.js} +1 -1
- package/package.json +3 -3
- package/pnpm-workspace.yaml +3 -0
- package/src/__tests__/e2e/v1/index.test.ts +15 -15
- package/src/__tests__/unit/v2/research.test.ts +168 -0
- package/src/index.ts +2 -0
- package/src/v2/client.ts +12 -0
- package/src/v2/methods/research.ts +195 -0
- package/src/v2/types.ts +252 -1
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import { describe, test, expect } from "@jest/globals";
|
|
2
|
+
import { ResearchClient } from "../../../v2/methods/research";
|
|
3
|
+
import { SdkError } from "../../../v2/types";
|
|
4
|
+
import type { HttpClient } from "../../../v2/utils/httpClient";
|
|
5
|
+
|
|
6
|
+
/**
 * Create a ResearchClient on top of a stub HTTP layer.
 *
 * Each URL handed to `http.get` is pushed onto `calls` before `responder`
 * produces the reply, so tests can inspect exactly what was requested.
 * Without a responder, every request answers `{ status: 200, data: {} }`.
 */
function makeClient(
  responder: (url: string) => { status: number; data: any } = () => ({
    status: 200,
    data: {},
  }),
) {
  const calls: string[] = [];

  async function get(url: string) {
    calls.push(url);
    return responder(url);
  }

  // Only `get` is stubbed; the double cast lets the partial object stand in
  // for a full HttpClient.
  const http = { get } as unknown as HttpClient;
  const client = new ResearchClient(http);
  return { client, calls };
}
|
|
22
|
+
|
|
23
|
+
/**
 * Build a plain object shaped like an axios error whose response carries the
 * given status and an RFC 7807 Problem body.
 */
function problemError(status: number, body: any) {
  const response = { status, data: body };
  return {
    isAxiosError: true,
    response,
    message: "req failed",
  };
}
|
|
27
|
+
|
|
28
|
+
// searchPapers: query-string construction, client-side validation, and
// pass-through of the response body.
describe("research.searchPapers", () => {
  /** Responder that answers every request with an empty result list. */
  const emptyResults = () => ({ status: 200, data: { results: [] } });

  test("builds query string with explode arrays", async () => {
    const harness = makeClient(emptyResults);
    await harness.client.searchPapers("diffusion models", {
      k: 10,
      authors: ["Ho", "Abbeel"],
      categories: ["cs.LG", "stat.ML"],
      from: "2020-01-01",
      to: "2024-12-31",
    });

    const requested = harness.calls[0];
    expect(requested.startsWith("/v2/research/papers?")).toBe(true);

    // Array options must be sent as repeated keys, one per element.
    const sent = new URLSearchParams(requested.split("?")[1]);
    expect(sent.get("query")).toBe("diffusion models");
    expect(sent.get("k")).toBe("10");
    expect(sent.getAll("authors")).toEqual(["Ho", "Abbeel"]);
    expect(sent.getAll("categories")).toEqual(["cs.LG", "stat.ML"]);
    expect(sent.get("from")).toBe("2020-01-01");
    expect(sent.get("to")).toBe("2024-12-31");
  });

  test("omits absent options", async () => {
    const harness = makeClient(emptyResults);
    await harness.client.searchPapers("q");
    const sent = new URLSearchParams(harness.calls[0].split("?")[1]);
    expect(Array.from(sent.keys())).toEqual(["query"]);
  });

  test("rejects empty query", async () => {
    const harness = makeClient();
    const pending = harness.client.searchPapers(" ");
    await expect(pending).rejects.toThrow(/query cannot be empty/i);
  });

  test("rejects non-positive k", async () => {
    const harness = makeClient();
    const pending = harness.client.searchPapers("q", { k: 0 });
    await expect(pending).rejects.toThrow(/k must be positive/i);
  });

  test("returns the response body verbatim", async () => {
    const payload = {
      results: [{ paper_id: "1", title: "t", abstract: "a", score: 0.1 }],
    };
    const harness = makeClient(() => ({ status: 200, data: payload }));
    await expect(harness.client.searchPapers("q")).resolves.toEqual(payload);
  });
});
|
|
75
|
+
|
|
76
|
+
// getPaper: detail mode (no query string) versus read mode (query + k).
describe("research.getPaper", () => {
  test("detail mode encodes the id and sends no query params", async () => {
    const harness = makeClient(() => ({ status: 200, data: { paper: {} } }));
    await harness.client.getPaper("arxiv:2105.05233");
    // The colon in the id must be percent-encoded in the path segment.
    expect(harness.calls[0]).toBe("/v2/research/papers/arxiv%3A2105.05233");
  });

  test("read mode adds query and k", async () => {
    const readReply = {
      status: 200,
      data: { paper: {}, paper_id: "1", query: "q", passages: [] },
    };
    const harness = makeClient(() => readReply);
    await harness.client.getPaper("123", { query: "noise schedule", k: 4 });

    const pieces = harness.calls[0].split("?");
    const path = pieces[0];
    const query = pieces[1];
    expect(path).toBe("/v2/research/papers/123");

    const sent = new URLSearchParams(query);
    expect(sent.get("query")).toBe("noise schedule");
    expect(sent.get("k")).toBe("4");
  });

  test("rejects k without query", async () => {
    const harness = makeClient();
    // `as any` bypasses the overloads, which forbid `k` without `query`.
    const pending = harness.client.getPaper("123", { k: 4 } as any);
    await expect(pending).rejects.toThrow(
      /k is only valid together with query/i,
    );
  });
});
|
|
103
|
+
|
|
104
|
+
// similarPapers: mandatory intent plus path/query construction.
describe("research.similarPapers", () => {
  test("requires intent", async () => {
    const harness = makeClient();
    const pending = harness.client.similarPapers("123", { intent: "" });
    await expect(pending).rejects.toThrow(/intent cannot be empty/i);
  });

  test("builds path and query with repeated anchors and rerank", async () => {
    const harness = makeClient(() => ({
      status: 200,
      data: { results: [], pool_size: 0, truncated: false },
    }));
    await harness.client.similarPapers("2105.05233", {
      intent: "diffusion image synthesis",
      mode: "citers",
      k: 20,
      rerank: false,
      anchor: ["arxiv:2006.11239", "1503.03585"],
    });

    const pieces = harness.calls[0].split("?");
    const path = pieces[0];
    const query = pieces[1];
    expect(path).toBe("/v2/research/papers/2105.05233/similar");

    const sent = new URLSearchParams(query);
    expect(sent.get("intent")).toBe("diffusion image synthesis");
    expect(sent.get("mode")).toBe("citers");
    expect(sent.get("k")).toBe("20");
    // `rerank: false` must still be transmitted rather than dropped.
    expect(sent.get("rerank")).toBe("false");
    expect(sent.getAll("anchor")).toEqual(["arxiv:2006.11239", "1503.03585"]);
  });
});
|
|
134
|
+
|
|
135
|
+
// searchGithub: endpoint path and query-string construction.
describe("research.searchGithub", () => {
  test("builds query string", async () => {
    const harness = makeClient(() => ({ status: 200, data: { results: [] } }));
    await harness.client.searchGithub("milvus hybrid search", { k: 10 });

    const requested = harness.calls[0];
    const sent = new URLSearchParams(requested.split("?")[1]);
    expect(requested.startsWith("/v2/research/github?")).toBe(true);
    expect(sent.get("query")).toBe("milvus hybrid search");
    expect(sent.get("k")).toBe("10");
  });
});
|
|
145
|
+
|
|
146
|
+
// Error mapping: axios-like failures carrying an RFC 7807 Problem body must
// surface with the Problem's detail (or, failing that, its title).
describe("research error mapping", () => {
  test("maps RFC 7807 Problem detail to SdkError", async () => {
    const problem = {
      type: "urn:search-pipeline:invalid_request",
      title: "Bad Request",
      status: 400,
      detail: "query is required",
    };
    const harness = makeClient(() => {
      throw problemError(400, problem);
    });

    const pending = harness.client.searchPapers("q");
    await expect(pending).rejects.toMatchObject({
      message: "query is required",
      status: 400,
    } as Partial<SdkError>);
  });

  test("falls back to title when detail is absent", async () => {
    const harness = makeClient(() => {
      throw problemError(404, { title: "Not Found", status: 404 });
    });
    const pending = harness.client.getPaper("999");
    await expect(pending).rejects.toThrow(/Not Found/);
  });
});
|
package/src/index.ts
CHANGED
|
@@ -11,6 +11,8 @@ export { FirecrawlClient } from "./v2/client";
|
|
|
11
11
|
export * from "./v2/types";
|
|
12
12
|
/** Watcher class and options for crawl/batch job monitoring. */
|
|
13
13
|
export { Watcher, type WatcherOptions } from "./v2/watcher";
|
|
14
|
+
/** Research sub-client (accessed via `firecrawl.research`). */
|
|
15
|
+
export { ResearchClient } from "./v2/methods/research";
|
|
14
16
|
/** Legacy v1 client (feature‑frozen). */
|
|
15
17
|
export { default as FirecrawlAppV1 } from "./v1";
|
|
16
18
|
|
package/src/v2/client.ts
CHANGED
|
@@ -32,6 +32,7 @@ import {
|
|
|
32
32
|
listBrowsers,
|
|
33
33
|
} from "./methods/browser";
|
|
34
34
|
import { getConcurrency, getCreditUsage, getQueueStatus, getTokenUsage, getCreditUsageHistorical, getTokenUsageHistorical } from "./methods/usage";
|
|
35
|
+
import { ResearchClient } from "./methods/research";
|
|
35
36
|
import {
|
|
36
37
|
createMonitor as createMonitorMethod,
|
|
37
38
|
deleteMonitor as deleteMonitorMethod,
|
|
@@ -119,6 +120,7 @@ export type FirecrawlClientInput = FirecrawlClientOptions | string;
|
|
|
119
120
|
|
|
120
121
|
export class FirecrawlClient {
|
|
121
122
|
private readonly http: HttpClient;
|
|
123
|
+
private _research?: ResearchClient;
|
|
122
124
|
|
|
123
125
|
private isCloudService(url: string): boolean {
|
|
124
126
|
return url.includes('api.firecrawl.dev');
|
|
@@ -234,6 +236,16 @@ export class FirecrawlClient {
|
|
|
234
236
|
return search(this.http, { query, ...req });
|
|
235
237
|
}
|
|
236
238
|
|
|
239
|
+
// Research
|
|
240
|
+
/**
 * Research sub-client for the v2 research endpoints (arXiv papers + GitHub
 * history/readmes). The instance is created on first access and reused on
 * every later access.
 * Example: `firecrawl.research.searchPapers("diffusion models")`.
 */
get research(): ResearchClient {
  const cached = this._research;
  if (cached) return cached;

  const created = new ResearchClient(this.http);
  this._research = created;
  return created;
}
|
|
248
|
+
|
|
237
249
|
// Map
|
|
238
250
|
/**
|
|
239
251
|
* Map a site to discover URLs (sitemap-aware).
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
SearchPapersOptions,
|
|
3
|
+
SearchPapersResponse,
|
|
4
|
+
GetPaperOptions,
|
|
5
|
+
PaperMetadataResponse,
|
|
6
|
+
ReadPaperResponse,
|
|
7
|
+
SimilarPapersOptions,
|
|
8
|
+
SimilarPapersResponse,
|
|
9
|
+
SearchGithubOptions,
|
|
10
|
+
GitHubSearchResponse,
|
|
11
|
+
} from "../types";
|
|
12
|
+
import { SdkError } from "../types";
|
|
13
|
+
import { HttpClient } from "../utils/httpClient";
|
|
14
|
+
import { throwForBadResponse } from "../utils/errorHandler";
|
|
15
|
+
|
|
16
|
+
/** Path prefix shared by every research endpoint URL built in this module. */
const BASE = "/v2/research";
|
|
17
|
+
|
|
18
|
+
/**
 * Add `value` to `params` under `key`.
 *
 * - `null`/`undefined` adds nothing.
 * - An array adds one entry per element, skipping nullish and empty-string
 *   elements.
 * - Any other value is stringified and added as-is (an empty string included).
 */
function appendParam(
  params: URLSearchParams,
  key: string,
  value: string | number | boolean | string[] | undefined,
): void {
  if (value == null) return;

  if (!Array.isArray(value)) {
    params.append(key, String(value));
    return;
  }

  value
    .filter((item) => item != null && String(item).length > 0)
    .forEach((item) => params.append(key, String(item)));
}
|
|
33
|
+
|
|
34
|
+
/**
 * Join `path` and the serialized `params` with `?`. When `params` serializes
 * to an empty string, `path` is returned unchanged (no trailing `?`).
 */
function withQuery(path: string, params: URLSearchParams): string {
  const serialized = params.toString();
  if (serialized.length === 0) return path;
  return [path, serialized].join("?");
}
|
|
38
|
+
|
|
39
|
+
/**
 * Re-throw a failed research request in normalized form. Never returns.
 *
 * - Non-axios errors are re-thrown untouched.
 * - Axios errors whose response body has a `detail` or `title` (RFC 7807
 *   Problem) become an SdkError with that text as the message (`detail`
 *   preferred), the HTTP status, the Problem `type`, and the body.
 * - Any other axios error becomes an SdkError built from `err.message` (or a
 *   generic message naming `action`), the status, `err.code`, and the body.
 */
function normalizeResearchError(err: any, action: string): never {
  if (!err?.isAxiosError) throw err;

  const response = err.response;
  const status: number | undefined = response?.status;
  const body: any = response?.data;

  const problemMessage = body ? (body.detail || body.title) : undefined;
  if (problemMessage) {
    throw new SdkError(problemMessage, status, body.type, body);
  }

  const fallbackMessage =
    err.message || `Request failed while trying to ${action}`;
  throw new SdkError(fallbackMessage, status, err.code, body);
}
|
|
60
|
+
|
|
61
|
+
/** Throw `"<label> cannot be empty"` when `value` is missing or whitespace-only. */
function requireNonBlank(value: string | undefined, label: string): void {
  if (!value || !value.trim()) throw new Error(`${label} cannot be empty`);
}

/**
 * Validate an optional result count before it is put on the wire.
 *
 * A bare `k <= 0` comparison is false for `NaN` and lets `Infinity` and
 * fractional values through, so they would be sent as `k=NaN` etc. The
 * positivity check runs first so inputs that were already rejected keep the
 * same "k must be positive" message.
 */
function requireValidCount(k: number | undefined): void {
  if (k == null) return;
  if (k <= 0) throw new Error("k must be positive");
  if (!Number.isInteger(k)) throw new Error("k must be an integer");
}

/**
 * Client for the v2 research endpoints (arXiv papers + GitHub history/readmes).
 * Accessed via `firecrawl.research`.
 *
 * Every method validates its arguments locally (throwing a plain `Error`)
 * before issuing a GET request. Axios failures are re-thrown as `SdkError`
 * through `normalizeResearchError`; non-200 responses are handed to
 * `throwForBadResponse`.
 */
export class ResearchClient {
  constructor(private readonly http: HttpClient) {}

  /**
   * Search papers by abstract relevance.
   * @param query Natural-language search query.
   * @param options Optional filters (k, authors, categories, from, to).
   * @returns The response body as returned by the service.
   * @throws Error if `query` is blank, or `k` is not a positive integer.
   * @throws SdkError if the request fails with an axios error.
   */
  async searchPapers(
    query: string,
    options: SearchPapersOptions = {},
  ): Promise<SearchPapersResponse> {
    // The default parameter only covers `undefined`; tolerate an explicit null.
    const opts = options ?? {};
    requireNonBlank(query, "query");
    requireValidCount(opts.k);

    const params = new URLSearchParams();
    appendParam(params, "query", query);
    appendParam(params, "k", opts.k);
    appendParam(params, "authors", opts.authors);
    appendParam(params, "categories", opts.categories);
    appendParam(params, "from", opts.from);
    appendParam(params, "to", opts.to);

    try {
      const res = await this.http.get<SearchPapersResponse>(
        withQuery(`${BASE}/papers`, params),
      );
      if (res.status !== 200) throwForBadResponse(res, "search papers");
      return res.data;
    } catch (err) {
      return normalizeResearchError(err, "search papers");
    }
  }

  /**
   * Get paper metadata (detail mode), or read in-body passages (when `query` is
   * supplied). `k` is only valid together with `query`.
   * @param id Paper reference: a canonical `paper_id`, an `arxiv:<id>` key, or a
   *   bare arXiv id / URL. It is URI-encoded into the request path.
   * @param options Optional `query` (switches to read mode) and `k`.
   * @throws Error if `id` is blank, `k` is given without `query`, or `k` is not
   *   a positive integer.
   * @throws SdkError if the request fails with an axios error.
   */
  async getPaper(
    id: string,
    options?: { query?: undefined; k?: undefined },
  ): Promise<PaperMetadataResponse>;
  async getPaper(
    id: string,
    options: { query: string; k?: number },
  ): Promise<ReadPaperResponse>;
  async getPaper(
    id: string,
    options: GetPaperOptions = {},
  ): Promise<PaperMetadataResponse | ReadPaperResponse> {
    const opts = options ?? {};
    requireNonBlank(id, "id");
    if (opts.k != null && opts.query == null)
      throw new Error("k is only valid together with query");
    requireValidCount(opts.k);

    const params = new URLSearchParams();
    appendParam(params, "query", opts.query);
    appendParam(params, "k", opts.k);

    try {
      const res = await this.http.get<PaperMetadataResponse | ReadPaperResponse>(
        withQuery(`${BASE}/papers/${encodeURIComponent(id)}`, params),
      );
      if (res.status !== 200) throwForBadResponse(res, "get paper");
      return res.data;
    } catch (err) {
      return normalizeResearchError(err, "get paper");
    }
  }

  /**
   * Find related papers via the citation graph.
   * @param id Primary seed paper reference (URI-encoded into the request path).
   * @param options Required `intent` plus optional mode, k, rerank, anchor.
   * @throws Error if `id` or `options.intent` is blank, or `k` is not a
   *   positive integer.
   * @throws SdkError if the request fails with an axios error.
   */
  async similarPapers(
    id: string,
    options: SimilarPapersOptions,
  ): Promise<SimilarPapersResponse> {
    requireNonBlank(id, "id");
    // `options?.` keeps a missing options object on the "intent" error path.
    requireNonBlank(options?.intent, "intent");
    requireValidCount(options.k);

    const params = new URLSearchParams();
    appendParam(params, "intent", options.intent);
    appendParam(params, "mode", options.mode);
    appendParam(params, "k", options.k);
    // appendParam skips nullish values, so an explicit `false` is still sent.
    appendParam(params, "rerank", options.rerank);
    appendParam(params, "anchor", options.anchor);

    try {
      const res = await this.http.get<SimilarPapersResponse>(
        withQuery(
          `${BASE}/papers/${encodeURIComponent(id)}/similar`,
          params,
        ),
      );
      if (res.status !== 200) throwForBadResponse(res, "find similar papers");
      return res.data;
    } catch (err) {
      return normalizeResearchError(err, "find similar papers");
    }
  }

  /**
   * Search GitHub issue/PR history and repository readmes.
   * @param query Search query.
   * @param options Optional `k`.
   * @throws Error if `query` is blank, or `k` is not a positive integer.
   * @throws SdkError if the request fails with an axios error.
   */
  async searchGithub(
    query: string,
    options: SearchGithubOptions = {},
  ): Promise<GitHubSearchResponse> {
    const opts = options ?? {};
    requireNonBlank(query, "query");
    requireValidCount(opts.k);

    const params = new URLSearchParams();
    appendParam(params, "query", query);
    appendParam(params, "k", opts.k);

    try {
      const res = await this.http.get<GitHubSearchResponse>(
        withQuery(`${BASE}/github`, params),
      );
      if (res.status !== 200) throwForBadResponse(res, "search github");
      return res.data;
    } catch (err) {
      return normalizeResearchError(err, "search github");
    }
  }
}
|
package/src/v2/types.ts
CHANGED
|
@@ -14,7 +14,8 @@ export type FormatString =
|
|
|
14
14
|
| "attributes"
|
|
15
15
|
| "branding"
|
|
16
16
|
| "audio"
|
|
17
|
-
| "video"
|
|
17
|
+
| "video"
|
|
18
|
+
| "pii";
|
|
18
19
|
|
|
19
20
|
export interface Viewport {
|
|
20
21
|
width: number;
|
|
@@ -205,6 +206,7 @@ export interface ScrapeOptions {
|
|
|
205
206
|
minAge?: number;
|
|
206
207
|
storeInCache?: boolean;
|
|
207
208
|
lockdown?: boolean;
|
|
209
|
+
redactPII?: boolean | RedactPIIOptions;
|
|
208
210
|
profile?: {
|
|
209
211
|
name: string;
|
|
210
212
|
saveChanges?: boolean;
|
|
@@ -213,6 +215,70 @@ export interface ScrapeOptions {
|
|
|
213
215
|
origin?: string;
|
|
214
216
|
}
|
|
215
217
|
|
|
218
|
+
/**
 * Entity buckets for PII redaction. Used to restrict redaction
 * (`RedactPIIOptions.entities`), to label detected spans (`PIISpan.entity`),
 * and as the keys of `PIIBlock.counts`.
 */
export type RedactPIIEntity =
  | "PERSON"
  | "EMAIL"
  | "PHONE"
  | "LOCATION"
  | "FINANCIAL"
  | "SECRET";
|
|
225
|
+
|
|
226
|
+
/**
 * Object form of `ScrapeOptions.redactPII` (which also accepts a plain
 * boolean). Every field is optional.
 */
export interface RedactPIIOptions {
  /**
   * accurate (default): model-only redaction. Best precision, cleanest output.
   * aggressive: model + Presidio + spaCy. Higher recall at the cost of precision.
   * fast: Presidio only, no model call. Lower F1, ~2x throughput.
   */
  mode?: "accurate" | "aggressive" | "fast";
  /** Restrict redaction to these entity buckets. Unset means all entities. */
  entities?: RedactPIIEntity[];
  /**
   * tag (default): replace spans with `<KIND>` placeholders.
   * mask: replace spans with `*` of equal length.
   * remove: drop span characters entirely.
   */
  replaceStyle?: "tag" | "mask" | "remove";
}
|
|
242
|
+
|
|
243
|
+
/**
 * Value type of `PIISpan.source`.
 * NOTE(review): presumably identifies which detector produced the span —
 * confirm against the API documentation.
 */
export type PIISource = "model" | "heuristics" | "unknown";
|
|
244
|
+
|
|
245
|
+
/** A single detected span, reported in `PIIBlock.spans`. */
export interface PIISpan {
  // NOTE(review): `start`/`end` are presumably offsets into the scanned text;
  // the unit and whether `end` is exclusive are not visible here — confirm.
  start: number;
  end: number;
  /** Unified entity bucket. Omitted when `kind` doesn't map onto one. */
  entity?: RedactPIIEntity;
  /** Granular recognizer label from fire-privacy. */
  kind: string;
  /** Origin label for the span; see `PIISource`. */
  source: PIISource;
  /** Confidence in [0, 1] when supplied. */
  score?: number;
}
|
|
256
|
+
|
|
257
|
+
/**
 * Outcome of a redaction attempt, reported as `PIIBlock.status`:
 * - ok: redaction completed; redactedMarkdown is the result.
 * - skipped: redaction was not performed; see `reason`.
 * - failed: redaction was attempted but did not produce a usable result.
 */
export type PIIStatus = "ok" | "skipped" | "failed";
|
|
263
|
+
|
|
264
|
+
/**
 * Explanation reported as `PIIBlock.reason`.
 * Always set when status !== "ok".
 */
export type PIIReason =
  | "empty_input"
  | "too_large"
  | "upstream_skipped"
  | "service_unavailable"
  | "timeout"
  | "error";
|
|
272
|
+
|
|
273
|
+
/** PII redaction result, exposed on scraped documents as `Document.pii`. */
export interface PIIBlock {
  /** Outcome of the redaction attempt; see `PIIStatus`. */
  status: PIIStatus;
  /** Explanation for a non-"ok" status; see `PIIReason`. */
  reason?: PIIReason;
  /**
   * The redaction result when `status` is "ok" (per `PIIStatus`).
   * NOTE(review): presumably `null` for other statuses — confirm.
   */
  redactedMarkdown: string | null;
  /** Detected spans; see `PIISpan`. */
  spans: PIISpan[];
  /** Span count per entity bucket. Only non-zero entries are present. */
  counts: Partial<Record<RedactPIIEntity, number>>;
}
|
|
281
|
+
|
|
216
282
|
export type ParseFileData =
|
|
217
283
|
| Blob
|
|
218
284
|
| File
|
|
@@ -483,6 +549,7 @@ export interface Document {
|
|
|
483
549
|
warning?: string;
|
|
484
550
|
changeTracking?: Record<string, unknown>;
|
|
485
551
|
branding?: BrandingProfile;
|
|
552
|
+
pii?: PIIBlock;
|
|
486
553
|
}
|
|
487
554
|
|
|
488
555
|
// Pagination configuration for auto-fetching pages from v2 endpoints that return a `next` URL
|
|
@@ -660,6 +727,25 @@ export interface MonitorEmailNotification {
|
|
|
660
727
|
includeDiffs?: boolean;
|
|
661
728
|
}
|
|
662
729
|
|
|
730
|
+
/**
 * Per-recipient opt-in state for monitor email notifications.
 *
 * External recipients (not members of the team that owns the monitor) must
 * confirm their subscription via a one-time email before they receive any
 * monitor notifications. Team members are auto-confirmed.
 *
 * - `pending` → confirmation email sent, no notifications yet
 * - `confirmed` → notifications enabled
 * - `unsubscribed` → recipient opted out and cannot be re-added without a new
 *   confirmation flow
 *
 * Returned on monitors as `Monitor.emailRecipientSubscriptions`.
 */
export interface MonitorEmailRecipientSubscription {
  /** Recipient address this entry describes. */
  email: string;
  /** Current opt-in state; the values are described above. */
  status: "pending" | "confirmed" | "unsubscribed";
  // NOTE(review): the meaning of each `source` value is not spelled out here;
  // `team` presumably marks auto-confirmed team members — confirm.
  source: "team" | "opt_in" | "legacy";
  // NOTE(review): presumably reports whether a confirmation email was sent for
  // this recipient — confirm when it is populated.
  confirmationEmailSent?: boolean;
}
|
|
748
|
+
|
|
663
749
|
export interface MonitorNotification {
|
|
664
750
|
email?: MonitorEmailNotification;
|
|
665
751
|
}
|
|
@@ -731,6 +817,13 @@ export interface Monitor {
|
|
|
731
817
|
targets: MonitorTarget[];
|
|
732
818
|
webhook?: MonitorWebhookConfig | null;
|
|
733
819
|
notification?: MonitorNotification | null;
|
|
820
|
+
/**
|
|
821
|
+
* Present on create/update/get responses. Reflects the opt-in state of every
|
|
822
|
+
* email recipient currently configured on the monitor. Absent when the API
|
|
823
|
+
* has not reconciled recipients (e.g. team-default delivery with no
|
|
824
|
+
* explicit recipients).
|
|
825
|
+
*/
|
|
826
|
+
emailRecipientSubscriptions?: MonitorEmailRecipientSubscription[];
|
|
734
827
|
retentionDays: number;
|
|
735
828
|
estimatedCreditsPerMonth?: number | null;
|
|
736
829
|
lastCheckSummary?: MonitorSummary | null;
|
|
@@ -1062,3 +1155,161 @@ export interface BrowserListResponse {
|
|
|
1062
1155
|
sessions?: BrowserSession[];
|
|
1063
1156
|
error?: string;
|
|
1064
1157
|
}
|
|
1158
|
+
|
|
1159
|
+
// ---------- Research (v2) ----------
|
|
1160
|
+
|
|
1161
|
+
/**
 * Source identifiers grouped by namespace. Currently only `arxiv` is
 * populated; each value is an array of ids in that namespace.
 * Used by `PaperResult.ids` and `PaperMetadata.ids`.
 */
export type IdMap = Record<string, string[]>;
|
|
1166
|
+
|
|
1167
|
+
/**
 * Per-candidate ranking signals (present on similarity results).
 * Exposed as `PaperResult.signals`.
 */
export interface PaperSignals {
  /** Raw structural strength (co-citation / coupling counts, or seed overlap). */
  structural: number;
  /** Semantic score from the intent abstract search (0 if absent). */
  semantic: number;
  /** Citation-graph PageRank of the candidate. */
  pagerank: number;
  /** Number of distinct seeds connected to this candidate. */
  seed_overlap: number;
}
|
|
1178
|
+
|
|
1179
|
+
/**
 * A ranked paper. `paper_id` is canonical; arXiv lives in `ids`.
 * Element type of `SearchPapersResponse.results` and
 * `SimilarPapersResponse.results`.
 */
export interface PaperResult {
  /** Canonical paper id — the Milvus INT64 primary key as a decimal string. */
  paper_id: string;
  /** Source identifiers grouped by namespace; see `IdMap`. */
  ids?: IdMap;
  /** Paper title. */
  title: string;
  /** Paper abstract. */
  abstract: string;
  /** Final ranking score (post-rerank when enabled). Not normalized. */
  score: number;
  /** Present on similarity results. */
  signals?: PaperSignals;
}
|
|
1191
|
+
|
|
1192
|
+
/**
 * Metadata for a single paper, returned under `paper` in
 * `PaperMetadataResponse` and `ReadPaperResponse`.
 */
export interface PaperMetadata {
  /** Paper id as a string. */
  paper_id: string;
  /** Source identifiers grouped by namespace; see `IdMap`. */
  ids?: IdMap;
  /** Paper title. */
  title: string;
  /** Paper abstract. */
  abstract: string;
  /** Comma-joined author names. Omitted if unknown. */
  authors?: string;
  /** arXiv categories. Omitted if unknown. */
  categories?: string[];
  /** Original creation date string (format varies). Omitted if unknown. */
  created_date?: string;
  /** Last-updated date string. Omitted if unknown. */
  update_date?: string;
}
|
|
1206
|
+
|
|
1207
|
+
/** One matching in-body passage; element type of `ReadPaperResponse.passages`. */
export interface Passage {
  /** In-body passage text (may be markdown, including tables). */
  text: string;
  /** Dense similarity score for the passage. */
  score: number;
}
|
|
1213
|
+
|
|
1214
|
+
/** Response body of `research.searchPapers`. */
export interface SearchPapersResponse {
  /** Matching papers; see `PaperResult`. */
  results: PaperResult[];
}
|
|
1217
|
+
|
|
1218
|
+
/** Response body of `research.getPaper` in detail mode (no `query` supplied). */
export interface PaperMetadataResponse {
  /** Metadata of the requested paper. */
  paper: PaperMetadata;
}
|
|
1221
|
+
|
|
1222
|
+
/** Response body of `research.getPaper` in read mode (a `query` was supplied). */
export interface ReadPaperResponse {
  /** Metadata of the requested paper. */
  paper: PaperMetadata;
  /** Resolved canonical paper id (empty string if not found via id-key). */
  paper_id: string;
  /** Echo of the read query. */
  query: string;
  /** Top matching in-body passages. */
  passages: Passage[];
}
|
|
1231
|
+
|
|
1232
|
+
/** Response body of `research.similarPapers`. */
export interface SimilarPapersResponse {
  /** Ranked related papers; each carries `signals`. */
  results: PaperResult[];
  /** Number of resolved candidates considered before truncation to `k`. */
  pool_size: number;
  /** True if more resolved candidates existed than were returned. */
  truncated: boolean;
  /** Human-readable note when no results are produced. */
  note?: string | null;
}
|
|
1242
|
+
|
|
1243
|
+
/**
 * Component scores; each field is present only when that signal contributed.
 * Exposed as `GitHubSearchItem.scores`.
 */
export interface GitHubScoreBreakdown {
  // NOTE(review): the exact meaning and scale of each component are not
  // documented here — confirm against the API reference before relying on them.
  rrf?: number;
  semantic?: number;
  lexical?: number;
  fusion?: number;
}
|
|
1250
|
+
|
|
1251
|
+
/** A single hit; element type of `GitHubSearchResponse.results`. */
export interface GitHubSearchItem {
  /** Kind of hit: GitHub history (`github_history`) or a repository readme (`repo_readme`). */
  resultType: "github_history" | "repo_readme";
  /** `owner/name`. */
  repo: string;
  // NOTE(review): presumably the URL of the matched page — confirm.
  url: string;
  /** History page type (e.g. `issue`, `pull`). Omitted for readmes. */
  pageType?: string;
  /** Issue/PR number. Omitted for readmes. */
  number?: number;
  /** Number of matched segments/chunks. Omitted when not applicable. */
  segmentCount?: number;
  /** Readme URL (readme results). Omitted otherwise. */
  readmeUrl?: string;
  /** Short matched excerpt. */
  snippet: string;
  /** Full matched content in markdown. Omitted unless available. */
  contentMd?: string;
  /** Component scores for this hit; see `GitHubScoreBreakdown`. */
  scores: GitHubScoreBreakdown;
}
|
|
1270
|
+
|
|
1271
|
+
/** Response body of `research.searchGithub`. */
export interface GitHubSearchResponse {
  /** Matching hits; see `GitHubSearchItem`. */
  results: GitHubSearchItem[];
}
|
|
1274
|
+
|
|
1275
|
+
/**
 * Options for `research.searchPapers`. Array options are sent as repeated
 * query parameters (one per element); empty-string elements are skipped.
 */
export interface SearchPapersOptions {
  /**
   * Number of results to return (1–500, default 40). Non-positive values are
   * rejected client-side before any request is made.
   */
  k?: number;
  /** Author substring filter(s); ALL must match (case-insensitive). */
  authors?: string[];
  /** arXiv category filter(s) (e.g. `cs.LG`); ALL must match. */
  categories?: string[];
  /** Inclusive lower bound on created/updated date (ISO `YYYY-MM-DD`). */
  from?: string;
  /** Inclusive upper bound on created/updated date (lexicographic). */
  to?: string;
}
|
|
1288
|
+
|
|
1289
|
+
/**
 * Options for `research.getPaper`. With no `query` the call runs in detail
 * mode; supplying `query` switches it to read mode.
 */
export interface GetPaperOptions {
  /** When present, switches to read mode and returns in-body passages. */
  query?: string;
  /**
   * Passage count (read mode only; 1–50, default 4). Requires `query`:
   * passing `k` alone is rejected client-side, as are non-positive values.
   */
  k?: number;
}
|
|
1296
|
+
|
|
1297
|
+
/** Options for `research.similarPapers`. */
export interface SimilarPapersOptions {
  /**
   * Natural-language intent used to semantically rerank candidates. Required.
   * An empty or whitespace-only value is rejected client-side.
   */
  intent: string;
  /** Traversal mode (default `similar`). */
  mode?: "similar" | "citers" | "references";
  /**
   * Number of related papers to return (1–500, default 40). Non-positive
   * values are rejected client-side.
   */
  k?: number;
  /** Apply an additional ZeroEntropy rerank over the fused candidates. */
  rerank?: boolean;
  /**
   * Additional seed paper reference(s), same format as `id`. Sent as repeated
   * `anchor` query parameters, one per element.
   */
  anchor?: string[];
}
|
|
1310
|
+
|
|
1311
|
+
/** Options for `research.searchGithub`. */
export interface SearchGithubOptions {
  /**
   * Number of results to return (1–100, default 20). Non-positive values are
   * rejected client-side before any request is made.
   */
  k?: number;
}
|