firecrawl 4.25.1 → 4.25.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/{chunk-7KWNHI4H.js → chunk-5D4KXCYO.js} +4 -4
- package/dist/index.cjs +152 -4
- package/dist/index.d.cts +262 -2
- package/dist/index.d.ts +262 -2
- package/dist/index.js +149 -2
- package/dist/{package-KBCQFPRT.js → package-HESILIET.js} +1 -1
- package/package.json +3 -3
- package/pnpm-workspace.yaml +3 -0
- package/src/__tests__/e2e/v1/index.test.ts +15 -15
- package/src/__tests__/unit/v2/research.test.ts +168 -0
- package/src/index.ts +2 -0
- package/src/v2/client.ts +12 -0
- package/src/v2/methods/research.ts +195 -0
- package/src/v2/types.ts +252 -1
package/dist/index.d.ts
CHANGED
|
@@ -4,7 +4,7 @@ import { AxiosResponse, AxiosRequestHeaders } from 'axios';
|
|
|
4
4
|
import { EventEmitter } from 'events';
|
|
5
5
|
import { TypedEventTarget } from 'typescript-event-target';
|
|
6
6
|
|
|
7
|
-
type FormatString = "markdown" | "html" | "rawHtml" | "links" | "images" | "screenshot" | "summary" | "changeTracking" | "json" | "attributes" | "branding" | "audio" | "video";
|
|
7
|
+
type FormatString = "markdown" | "html" | "rawHtml" | "links" | "images" | "screenshot" | "summary" | "changeTracking" | "json" | "attributes" | "branding" | "audio" | "video" | "pii";
|
|
8
8
|
interface Viewport {
|
|
9
9
|
width: number;
|
|
10
10
|
height: number;
|
|
@@ -140,6 +140,7 @@ interface ScrapeOptions {
|
|
|
140
140
|
minAge?: number;
|
|
141
141
|
storeInCache?: boolean;
|
|
142
142
|
lockdown?: boolean;
|
|
143
|
+
redactPII?: boolean | RedactPIIOptions;
|
|
143
144
|
profile?: {
|
|
144
145
|
name: string;
|
|
145
146
|
saveChanges?: boolean;
|
|
@@ -147,6 +148,51 @@ interface ScrapeOptions {
|
|
|
147
148
|
integration?: string;
|
|
148
149
|
origin?: string;
|
|
149
150
|
}
|
|
151
|
+
type RedactPIIEntity = "PERSON" | "EMAIL" | "PHONE" | "LOCATION" | "FINANCIAL" | "SECRET";
|
|
152
|
+
interface RedactPIIOptions {
|
|
153
|
+
/**
|
|
154
|
+
* accurate (default): model-only redaction. Best precision, cleanest output.
|
|
155
|
+
* aggressive: model + Presidio + spaCy. Higher recall at the cost of precision.
|
|
156
|
+
* fast: Presidio only, no model call. Lower F1, ~2x throughput.
|
|
157
|
+
*/
|
|
158
|
+
mode?: "accurate" | "aggressive" | "fast";
|
|
159
|
+
/** Restrict redaction to these entity buckets. Unset means all entities. */
|
|
160
|
+
entities?: RedactPIIEntity[];
|
|
161
|
+
/**
|
|
162
|
+
* tag (default): replace spans with `<KIND>` placeholders.
|
|
163
|
+
* mask: replace spans with `*` of equal length.
|
|
164
|
+
* remove: drop span characters entirely.
|
|
165
|
+
*/
|
|
166
|
+
replaceStyle?: "tag" | "mask" | "remove";
|
|
167
|
+
}
|
|
168
|
+
type PIISource = "model" | "heuristics" | "unknown";
|
|
169
|
+
interface PIISpan {
|
|
170
|
+
start: number;
|
|
171
|
+
end: number;
|
|
172
|
+
/** Unified entity bucket. Omitted when `kind` doesn't map onto one. */
|
|
173
|
+
entity?: RedactPIIEntity;
|
|
174
|
+
/** Granular recognizer label from fire-privacy. */
|
|
175
|
+
kind: string;
|
|
176
|
+
source: PIISource;
|
|
177
|
+
/** Confidence in [0, 1] when supplied. */
|
|
178
|
+
score?: number;
|
|
179
|
+
}
|
|
180
|
+
/**
|
|
181
|
+
* - ok: redaction completed; redactedMarkdown is the result.
|
|
182
|
+
* - skipped: redaction was not performed; see `reason`.
|
|
183
|
+
* - failed: redaction was attempted but did not produce a usable result.
|
|
184
|
+
*/
|
|
185
|
+
type PIIStatus = "ok" | "skipped" | "failed";
|
|
186
|
+
/** Always set when status !== "ok". */
|
|
187
|
+
type PIIReason = "empty_input" | "too_large" | "upstream_skipped" | "service_unavailable" | "timeout" | "error";
|
|
188
|
+
interface PIIBlock {
|
|
189
|
+
status: PIIStatus;
|
|
190
|
+
reason?: PIIReason;
|
|
191
|
+
redactedMarkdown: string | null;
|
|
192
|
+
spans: PIISpan[];
|
|
193
|
+
/** Span count per entity bucket. Only non-zero entries are present. */
|
|
194
|
+
counts: Partial<Record<RedactPIIEntity, number>>;
|
|
195
|
+
}
|
|
150
196
|
type ParseFileData = Blob | File | Buffer | Uint8Array | ArrayBuffer | string;
|
|
151
197
|
interface ParseFile {
|
|
152
198
|
data: ParseFileData;
|
|
@@ -364,6 +410,7 @@ interface Document {
|
|
|
364
410
|
warning?: string;
|
|
365
411
|
changeTracking?: Record<string, unknown>;
|
|
366
412
|
branding?: BrandingProfile;
|
|
413
|
+
pii?: PIIBlock;
|
|
367
414
|
}
|
|
368
415
|
interface PaginationConfig {
|
|
369
416
|
/** When true (default), automatically follow `next` links and aggregate all documents. */
|
|
@@ -522,6 +569,24 @@ interface MonitorEmailNotification {
|
|
|
522
569
|
recipients?: string[];
|
|
523
570
|
includeDiffs?: boolean;
|
|
524
571
|
}
|
|
572
|
+
/**
|
|
573
|
+
* Per-recipient opt-in state for monitor email notifications.
|
|
574
|
+
*
|
|
575
|
+
* External recipients (not members of the team that owns the monitor) must
|
|
576
|
+
* confirm their subscription via a one-time email before they receive any
|
|
577
|
+
* monitor notifications. Team members are auto-confirmed.
|
|
578
|
+
*
|
|
579
|
+
* - `pending` → confirmation email sent, no notifications yet
|
|
580
|
+
* - `confirmed` → notifications enabled
|
|
581
|
+
* - `unsubscribed` → recipient opted out and cannot be re-added without a new
|
|
582
|
+
* confirmation flow
|
|
583
|
+
*/
|
|
584
|
+
interface MonitorEmailRecipientSubscription {
|
|
585
|
+
email: string;
|
|
586
|
+
status: "pending" | "confirmed" | "unsubscribed";
|
|
587
|
+
source: "team" | "opt_in" | "legacy";
|
|
588
|
+
confirmationEmailSent?: boolean;
|
|
589
|
+
}
|
|
525
590
|
interface MonitorNotification {
|
|
526
591
|
email?: MonitorEmailNotification;
|
|
527
592
|
}
|
|
@@ -585,6 +650,13 @@ interface Monitor {
|
|
|
585
650
|
targets: MonitorTarget[];
|
|
586
651
|
webhook?: MonitorWebhookConfig | null;
|
|
587
652
|
notification?: MonitorNotification | null;
|
|
653
|
+
/**
|
|
654
|
+
* Present on create/update/get responses. Reflects the opt-in state of every
|
|
655
|
+
* email recipient currently configured on the monitor. Absent when the API
|
|
656
|
+
* has not reconciled recipients (e.g. team-default delivery with no
|
|
657
|
+
* explicit recipients).
|
|
658
|
+
*/
|
|
659
|
+
emailRecipientSubscriptions?: MonitorEmailRecipientSubscription[];
|
|
588
660
|
retentionDays: number;
|
|
589
661
|
estimatedCreditsPerMonth?: number | null;
|
|
590
662
|
lastCheckSummary?: MonitorSummary | null;
|
|
@@ -846,6 +918,146 @@ interface BrowserListResponse {
|
|
|
846
918
|
sessions?: BrowserSession[];
|
|
847
919
|
error?: string;
|
|
848
920
|
}
|
|
921
|
+
/**
|
|
922
|
+
* Source identifiers grouped by namespace. Currently only `arxiv` is
|
|
923
|
+
* populated; each value is an array of ids in that namespace.
|
|
924
|
+
*/
|
|
925
|
+
type IdMap = Record<string, string[]>;
|
|
926
|
+
/** Per-candidate ranking signals (present on similarity results). */
|
|
927
|
+
interface PaperSignals {
|
|
928
|
+
/** Raw structural strength (co-citation / coupling counts, or seed overlap). */
|
|
929
|
+
structural: number;
|
|
930
|
+
/** Semantic score from the intent abstract search (0 if absent). */
|
|
931
|
+
semantic: number;
|
|
932
|
+
/** Citation-graph PageRank of the candidate. */
|
|
933
|
+
pagerank: number;
|
|
934
|
+
/** Number of distinct seeds connected to this candidate. */
|
|
935
|
+
seed_overlap: number;
|
|
936
|
+
}
|
|
937
|
+
/** A ranked paper. `paper_id` is canonical; arXiv lives in `ids`. */
|
|
938
|
+
interface PaperResult {
|
|
939
|
+
/** Canonical paper id — the Milvus INT64 primary key as a decimal string. */
|
|
940
|
+
paper_id: string;
|
|
941
|
+
ids?: IdMap;
|
|
942
|
+
title: string;
|
|
943
|
+
abstract: string;
|
|
944
|
+
/** Final ranking score (post-rerank when enabled). Not normalized. */
|
|
945
|
+
score: number;
|
|
946
|
+
/** Present on similarity results. */
|
|
947
|
+
signals?: PaperSignals;
|
|
948
|
+
}
|
|
949
|
+
interface PaperMetadata {
|
|
950
|
+
paper_id: string;
|
|
951
|
+
ids?: IdMap;
|
|
952
|
+
title: string;
|
|
953
|
+
abstract: string;
|
|
954
|
+
/** Comma-joined author names. Omitted if unknown. */
|
|
955
|
+
authors?: string;
|
|
956
|
+
/** arXiv categories. Omitted if unknown. */
|
|
957
|
+
categories?: string[];
|
|
958
|
+
/** Original creation date string (format varies). Omitted if unknown. */
|
|
959
|
+
created_date?: string;
|
|
960
|
+
/** Last-updated date string. Omitted if unknown. */
|
|
961
|
+
update_date?: string;
|
|
962
|
+
}
|
|
963
|
+
interface Passage {
|
|
964
|
+
/** In-body passage text (may be markdown, including tables). */
|
|
965
|
+
text: string;
|
|
966
|
+
/** Dense similarity score for the passage. */
|
|
967
|
+
score: number;
|
|
968
|
+
}
|
|
969
|
+
interface SearchPapersResponse {
|
|
970
|
+
results: PaperResult[];
|
|
971
|
+
}
|
|
972
|
+
interface PaperMetadataResponse {
|
|
973
|
+
paper: PaperMetadata;
|
|
974
|
+
}
|
|
975
|
+
interface ReadPaperResponse {
|
|
976
|
+
paper: PaperMetadata;
|
|
977
|
+
/** Resolved canonical paper id (empty string if not found via id-key). */
|
|
978
|
+
paper_id: string;
|
|
979
|
+
/** Echo of the read query. */
|
|
980
|
+
query: string;
|
|
981
|
+
/** Top matching in-body passages. */
|
|
982
|
+
passages: Passage[];
|
|
983
|
+
}
|
|
984
|
+
interface SimilarPapersResponse {
|
|
985
|
+
/** Ranked related papers; each carries `signals`. */
|
|
986
|
+
results: PaperResult[];
|
|
987
|
+
/** Number of resolved candidates considered before truncation to `k`. */
|
|
988
|
+
pool_size: number;
|
|
989
|
+
/** True if more resolved candidates existed than were returned. */
|
|
990
|
+
truncated: boolean;
|
|
991
|
+
/** Human-readable note when no results are produced. */
|
|
992
|
+
note?: string | null;
|
|
993
|
+
}
|
|
994
|
+
/** Component scores; each field is present only when that signal contributed. */
|
|
995
|
+
interface GitHubScoreBreakdown {
|
|
996
|
+
rrf?: number;
|
|
997
|
+
semantic?: number;
|
|
998
|
+
lexical?: number;
|
|
999
|
+
fusion?: number;
|
|
1000
|
+
}
|
|
1001
|
+
interface GitHubSearchItem {
|
|
1002
|
+
resultType: "github_history" | "repo_readme";
|
|
1003
|
+
/** `owner/name`. */
|
|
1004
|
+
repo: string;
|
|
1005
|
+
url: string;
|
|
1006
|
+
/** History page type (e.g. `issue`, `pull`). Omitted for readmes. */
|
|
1007
|
+
pageType?: string;
|
|
1008
|
+
/** Issue/PR number. Omitted for readmes. */
|
|
1009
|
+
number?: number;
|
|
1010
|
+
/** Number of matched segments/chunks. Omitted when not applicable. */
|
|
1011
|
+
segmentCount?: number;
|
|
1012
|
+
/** Readme URL (readme results). Omitted otherwise. */
|
|
1013
|
+
readmeUrl?: string;
|
|
1014
|
+
/** Short matched excerpt. */
|
|
1015
|
+
snippet: string;
|
|
1016
|
+
/** Full matched content in markdown. Omitted unless available. */
|
|
1017
|
+
contentMd?: string;
|
|
1018
|
+
scores: GitHubScoreBreakdown;
|
|
1019
|
+
}
|
|
1020
|
+
interface GitHubSearchResponse {
|
|
1021
|
+
results: GitHubSearchItem[];
|
|
1022
|
+
}
|
|
1023
|
+
/** Options for `research.searchPapers`. */
|
|
1024
|
+
interface SearchPapersOptions {
|
|
1025
|
+
/** Number of results to return (1–500, default 40). */
|
|
1026
|
+
k?: number;
|
|
1027
|
+
/** Author substring filter(s); ALL must match (case-insensitive). */
|
|
1028
|
+
authors?: string[];
|
|
1029
|
+
/** arXiv category filter(s) (e.g. `cs.LG`); ALL must match. */
|
|
1030
|
+
categories?: string[];
|
|
1031
|
+
/** Inclusive lower bound on created/updated date (ISO `YYYY-MM-DD`). */
|
|
1032
|
+
from?: string;
|
|
1033
|
+
/** Inclusive upper bound on created/updated date (lexicographic). */
|
|
1034
|
+
to?: string;
|
|
1035
|
+
}
|
|
1036
|
+
/** Options for `research.getPaper`. */
|
|
1037
|
+
interface GetPaperOptions {
|
|
1038
|
+
/** When present, switches to read mode and returns in-body passages. */
|
|
1039
|
+
query?: string;
|
|
1040
|
+
/** Passage count (read mode only; 1–50, default 4). Requires `query`. */
|
|
1041
|
+
k?: number;
|
|
1042
|
+
}
|
|
1043
|
+
/** Options for `research.similarPapers`. */
|
|
1044
|
+
interface SimilarPapersOptions {
|
|
1045
|
+
/** Natural-language intent used to semantically rerank candidates. Required. */
|
|
1046
|
+
intent: string;
|
|
1047
|
+
/** Traversal mode (default `similar`). */
|
|
1048
|
+
mode?: "similar" | "citers" | "references";
|
|
1049
|
+
/** Number of related papers to return (1–500, default 40). */
|
|
1050
|
+
k?: number;
|
|
1051
|
+
/** Apply an additional ZeroEntropy rerank over the fused candidates. */
|
|
1052
|
+
rerank?: boolean;
|
|
1053
|
+
/** Additional seed paper reference(s), same format as `id`. */
|
|
1054
|
+
anchor?: string[];
|
|
1055
|
+
}
|
|
1056
|
+
/** Options for `research.searchGithub`. */
|
|
1057
|
+
interface SearchGithubOptions {
|
|
1058
|
+
/** Number of results to return (1–100, default 20). */
|
|
1059
|
+
k?: number;
|
|
1060
|
+
}
|
|
849
1061
|
|
|
850
1062
|
interface HttpClientOptions {
|
|
851
1063
|
apiKey: string;
|
|
@@ -930,6 +1142,48 @@ declare function listBrowsers(http: HttpClient, args?: {
|
|
|
930
1142
|
status?: "active" | "destroyed";
|
|
931
1143
|
}): Promise<BrowserListResponse>;
|
|
932
1144
|
|
|
1145
|
+
/**
|
|
1146
|
+
* Client for the v2 research endpoints (arXiv papers + GitHub history/readmes).
|
|
1147
|
+
* Accessed via `firecrawl.research`.
|
|
1148
|
+
*/
|
|
1149
|
+
declare class ResearchClient {
|
|
1150
|
+
private readonly http;
|
|
1151
|
+
constructor(http: HttpClient);
|
|
1152
|
+
/**
|
|
1153
|
+
* Search papers by abstract relevance.
|
|
1154
|
+
* @param query Natural-language search query.
|
|
1155
|
+
* @param options Optional filters (k, authors, categories, from, to).
|
|
1156
|
+
*/
|
|
1157
|
+
searchPapers(query: string, options?: SearchPapersOptions): Promise<SearchPapersResponse>;
|
|
1158
|
+
/**
|
|
1159
|
+
* Get paper metadata (detail mode), or read in-body passages (when `query` is
|
|
1160
|
+
* supplied). `k` is only valid together with `query`.
|
|
1161
|
+
* @param id Paper reference: a canonical `paper_id`, an `arxiv:<id>` key, or a
|
|
1162
|
+
* bare arXiv id / URL.
|
|
1163
|
+
* @param options Optional `query` (switches to read mode) and `k`.
|
|
1164
|
+
*/
|
|
1165
|
+
getPaper(id: string, options?: {
|
|
1166
|
+
query?: undefined;
|
|
1167
|
+
k?: undefined;
|
|
1168
|
+
}): Promise<PaperMetadataResponse>;
|
|
1169
|
+
getPaper(id: string, options: {
|
|
1170
|
+
query: string;
|
|
1171
|
+
k?: number;
|
|
1172
|
+
}): Promise<ReadPaperResponse>;
|
|
1173
|
+
/**
|
|
1174
|
+
* Find related papers via the citation graph.
|
|
1175
|
+
* @param id Primary seed paper reference.
|
|
1176
|
+
* @param options Required `intent` plus optional mode, k, rerank, anchor.
|
|
1177
|
+
*/
|
|
1178
|
+
similarPapers(id: string, options: SimilarPapersOptions): Promise<SimilarPapersResponse>;
|
|
1179
|
+
/**
|
|
1180
|
+
* Search GitHub issue/PR history and repository readmes.
|
|
1181
|
+
* @param query Search query.
|
|
1182
|
+
* @param options Optional `k`.
|
|
1183
|
+
*/
|
|
1184
|
+
searchGithub(query: string, options?: SearchGithubOptions): Promise<GitHubSearchResponse>;
|
|
1185
|
+
}
|
|
1186
|
+
|
|
933
1187
|
type JobKind = "crawl" | "batch";
|
|
934
1188
|
interface WatcherOptions {
|
|
935
1189
|
kind?: JobKind;
|
|
@@ -985,6 +1239,7 @@ type FirecrawlClientInput = FirecrawlClientOptions | string;
|
|
|
985
1239
|
*/
|
|
986
1240
|
declare class FirecrawlClient {
|
|
987
1241
|
private readonly http;
|
|
1242
|
+
private _research?;
|
|
988
1243
|
private isCloudService;
|
|
989
1244
|
/**
|
|
990
1245
|
* Create a v2 client.
|
|
@@ -1045,6 +1300,11 @@ declare class FirecrawlClient {
|
|
|
1045
1300
|
* @returns Structured search results.
|
|
1046
1301
|
*/
|
|
1047
1302
|
search(query: string, req?: Omit<SearchRequest, "query">): Promise<SearchData>;
|
|
1303
|
+
/**
|
|
1304
|
+
* Access the v2 research endpoints (arXiv papers + GitHub history/readmes).
|
|
1305
|
+
* Example: `firecrawl.research.searchPapers("diffusion models")`.
|
|
1306
|
+
*/
|
|
1307
|
+
get research(): ResearchClient;
|
|
1048
1308
|
/**
|
|
1049
1309
|
* Map a site to discover URLs (sitemap-aware).
|
|
1050
1310
|
* @param url Root URL to map.
|
|
@@ -2208,4 +2468,4 @@ declare class Firecrawl extends FirecrawlClient {
|
|
|
2208
2468
|
get v1(): FirecrawlApp;
|
|
2209
2469
|
}
|
|
2210
2470
|
|
|
2211
|
-
export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AgentOptions$1 as AgentOptions, type AgentResponse, type AgentStatusResponse, type AgentWebhookConfig, type AgentWebhookEvent, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type BrandingProfile, type BrowserCreateResponse, type BrowserDeleteResponse, type BrowserExecuteResponse, type BrowserListResponse, type BrowserSession, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreateMonitorRequest, type CreditUsage, type CreditUsageHistoricalPeriod, type CreditUsageHistoricalResponse, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientInput, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type GetMonitorCheckOptions, type HighlightsFormat, JobTimeoutError, type JsonFormat, type ListMonitorChecksOptions, type ListMonitorsOptions, type LocationConfig$1 as LocationConfig, type MapData, type MapOptions, type Monitor, type MonitorCheck, type MonitorCheckDetail, type MonitorCheckPage, type MonitorCrawlTarget, type MonitorEmailNotification, type MonitorJsonFieldDiff, type MonitorNotification, type MonitorPageDiff, type MonitorPageJudgment, type MonitorPageSnapshot, type MonitorSchedule, type MonitorScrapeTarget, type MonitorSummary, type MonitorTarget, type MonitorWebhookConfig, type PDFAction, type PaginationConfig, type ParseFile, type ParseFileData, type ParseFormat, type ParseFormatOption, type ParseFormatString, type ParseOptions, type PressAction, type QueryFormat, type QuestionFormat, type QueueStatusResponse$1 as QueueStatusResponse, type ScrapeAction, type 
ScrapeBrowserDeleteResponse, type ScrapeExecuteRequest, type ScrapeExecuteResponse, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type TokenUsageHistoricalPeriod, type TokenUsageHistoricalResponse, type UpdateMonitorRequest, type Viewport, type WaitAction, Watcher, type WatcherOptions, type WebhookConfig, type WriteAction, Firecrawl as default };
|
|
2471
|
+
export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AgentOptions$1 as AgentOptions, type AgentResponse, type AgentStatusResponse, type AgentWebhookConfig, type AgentWebhookEvent, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type BrandingProfile, type BrowserCreateResponse, type BrowserDeleteResponse, type BrowserExecuteResponse, type BrowserListResponse, type BrowserSession, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreateMonitorRequest, type CreditUsage, type CreditUsageHistoricalPeriod, type CreditUsageHistoricalResponse, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientInput, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type GetMonitorCheckOptions, type GetPaperOptions, type GitHubScoreBreakdown, type GitHubSearchItem, type GitHubSearchResponse, type HighlightsFormat, type IdMap, JobTimeoutError, type JsonFormat, type ListMonitorChecksOptions, type ListMonitorsOptions, type LocationConfig$1 as LocationConfig, type MapData, type MapOptions, type Monitor, type MonitorCheck, type MonitorCheckDetail, type MonitorCheckPage, type MonitorCrawlTarget, type MonitorEmailNotification, type MonitorEmailRecipientSubscription, type MonitorJsonFieldDiff, type MonitorNotification, type MonitorPageDiff, type MonitorPageJudgment, type MonitorPageSnapshot, type MonitorSchedule, type MonitorScrapeTarget, type MonitorSummary, type MonitorTarget, type MonitorWebhookConfig, type PDFAction, type PIIBlock, type PIIReason, type PIISource, type PIISpan, type PIIStatus, type PaginationConfig, type PaperMetadata, type 
PaperMetadataResponse, type PaperResult, type PaperSignals, type ParseFile, type ParseFileData, type ParseFormat, type ParseFormatOption, type ParseFormatString, type ParseOptions, type Passage, type PressAction, type QueryFormat, type QuestionFormat, type QueueStatusResponse$1 as QueueStatusResponse, type ReadPaperResponse, type RedactPIIEntity, type RedactPIIOptions, ResearchClient, type ScrapeAction, type ScrapeBrowserDeleteResponse, type ScrapeExecuteRequest, type ScrapeExecuteResponse, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchGithubOptions, type SearchPapersOptions, type SearchPapersResponse, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type SimilarPapersOptions, type SimilarPapersResponse, type TokenUsage, type TokenUsageHistoricalPeriod, type TokenUsageHistoricalResponse, type UpdateMonitorRequest, type Viewport, type WaitAction, Watcher, type WatcherOptions, type WebhookConfig, type WriteAction, Firecrawl as default };
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
require_package
|
|
3
|
-
} from "./chunk-7KWNHI4H.js";
|
|
3
|
+
} from "./chunk-5D4KXCYO.js";
|
|
4
4
|
|
|
5
5
|
// src/v2/utils/httpClient.ts
|
|
6
6
|
import axios from "axios";
|
|
@@ -1301,6 +1301,142 @@ async function getTokenUsageHistorical(http, byApiKey) {
|
|
|
1301
1301
|
}
|
|
1302
1302
|
}
|
|
1303
1303
|
|
|
1304
|
+
// src/v2/methods/research.ts
|
|
1305
|
+
var BASE = "/v2/research";
|
|
1306
|
+
function appendParam(params, key, value) {
|
|
1307
|
+
if (value == null) return;
|
|
1308
|
+
if (Array.isArray(value)) {
|
|
1309
|
+
for (const v of value) {
|
|
1310
|
+
if (v != null && String(v).length > 0) params.append(key, String(v));
|
|
1311
|
+
}
|
|
1312
|
+
} else {
|
|
1313
|
+
params.append(key, String(value));
|
|
1314
|
+
}
|
|
1315
|
+
}
|
|
1316
|
+
function withQuery(path, params) {
|
|
1317
|
+
const qs = params.toString();
|
|
1318
|
+
return qs ? `${path}?${qs}` : path;
|
|
1319
|
+
}
|
|
1320
|
+
function normalizeResearchError(err, action) {
|
|
1321
|
+
if (err?.isAxiosError) {
|
|
1322
|
+
const status = err.response?.status;
|
|
1323
|
+
const body = err.response?.data;
|
|
1324
|
+
if (body && (body.detail || body.title)) {
|
|
1325
|
+
const message = body.detail || body.title;
|
|
1326
|
+
throw new SdkError(message, status, body.type, body);
|
|
1327
|
+
}
|
|
1328
|
+
throw new SdkError(
|
|
1329
|
+
err.message || `Request failed while trying to ${action}`,
|
|
1330
|
+
status,
|
|
1331
|
+
err.code,
|
|
1332
|
+
body
|
|
1333
|
+
);
|
|
1334
|
+
}
|
|
1335
|
+
throw err;
|
|
1336
|
+
}
|
|
1337
|
+
var ResearchClient = class {
|
|
1338
|
+
constructor(http) {
|
|
1339
|
+
this.http = http;
|
|
1340
|
+
}
|
|
1341
|
+
/**
|
|
1342
|
+
* Search papers by abstract relevance.
|
|
1343
|
+
* @param query Natural-language search query.
|
|
1344
|
+
* @param options Optional filters (k, authors, categories, from, to).
|
|
1345
|
+
*/
|
|
1346
|
+
async searchPapers(query, options = {}) {
|
|
1347
|
+
if (!query || !query.trim()) throw new Error("query cannot be empty");
|
|
1348
|
+
if (options.k != null && options.k <= 0)
|
|
1349
|
+
throw new Error("k must be positive");
|
|
1350
|
+
const params = new URLSearchParams();
|
|
1351
|
+
appendParam(params, "query", query);
|
|
1352
|
+
appendParam(params, "k", options.k);
|
|
1353
|
+
appendParam(params, "authors", options.authors);
|
|
1354
|
+
appendParam(params, "categories", options.categories);
|
|
1355
|
+
appendParam(params, "from", options.from);
|
|
1356
|
+
appendParam(params, "to", options.to);
|
|
1357
|
+
try {
|
|
1358
|
+
const res = await this.http.get(
|
|
1359
|
+
withQuery(`${BASE}/papers`, params)
|
|
1360
|
+
);
|
|
1361
|
+
if (res.status !== 200) throwForBadResponse(res, "search papers");
|
|
1362
|
+
return res.data;
|
|
1363
|
+
} catch (err) {
|
|
1364
|
+
return normalizeResearchError(err, "search papers");
|
|
1365
|
+
}
|
|
1366
|
+
}
|
|
1367
|
+
async getPaper(id, options = {}) {
|
|
1368
|
+
if (!id || !id.trim()) throw new Error("id cannot be empty");
|
|
1369
|
+
if (options.k != null && options.query == null)
|
|
1370
|
+
throw new Error("k is only valid together with query");
|
|
1371
|
+
if (options.k != null && options.k <= 0)
|
|
1372
|
+
throw new Error("k must be positive");
|
|
1373
|
+
const params = new URLSearchParams();
|
|
1374
|
+
appendParam(params, "query", options.query);
|
|
1375
|
+
appendParam(params, "k", options.k);
|
|
1376
|
+
try {
|
|
1377
|
+
const res = await this.http.get(
|
|
1378
|
+
withQuery(`${BASE}/papers/${encodeURIComponent(id)}`, params)
|
|
1379
|
+
);
|
|
1380
|
+
if (res.status !== 200) throwForBadResponse(res, "get paper");
|
|
1381
|
+
return res.data;
|
|
1382
|
+
} catch (err) {
|
|
1383
|
+
return normalizeResearchError(err, "get paper");
|
|
1384
|
+
}
|
|
1385
|
+
}
|
|
1386
|
+
/**
|
|
1387
|
+
* Find related papers via the citation graph.
|
|
1388
|
+
* @param id Primary seed paper reference.
|
|
1389
|
+
* @param options Required `intent` plus optional mode, k, rerank, anchor.
|
|
1390
|
+
*/
|
|
1391
|
+
async similarPapers(id, options) {
|
|
1392
|
+
if (!id || !id.trim()) throw new Error("id cannot be empty");
|
|
1393
|
+
if (!options?.intent || !options.intent.trim())
|
|
1394
|
+
throw new Error("intent cannot be empty");
|
|
1395
|
+
if (options.k != null && options.k <= 0)
|
|
1396
|
+
throw new Error("k must be positive");
|
|
1397
|
+
const params = new URLSearchParams();
|
|
1398
|
+
appendParam(params, "intent", options.intent);
|
|
1399
|
+
appendParam(params, "mode", options.mode);
|
|
1400
|
+
appendParam(params, "k", options.k);
|
|
1401
|
+
if (options.rerank != null) appendParam(params, "rerank", options.rerank);
|
|
1402
|
+
appendParam(params, "anchor", options.anchor);
|
|
1403
|
+
try {
|
|
1404
|
+
const res = await this.http.get(
|
|
1405
|
+
withQuery(
|
|
1406
|
+
`${BASE}/papers/${encodeURIComponent(id)}/similar`,
|
|
1407
|
+
params
|
|
1408
|
+
)
|
|
1409
|
+
);
|
|
1410
|
+
if (res.status !== 200) throwForBadResponse(res, "find similar papers");
|
|
1411
|
+
return res.data;
|
|
1412
|
+
} catch (err) {
|
|
1413
|
+
return normalizeResearchError(err, "find similar papers");
|
|
1414
|
+
}
|
|
1415
|
+
}
|
|
1416
|
+
/**
|
|
1417
|
+
* Search GitHub issue/PR history and repository readmes.
|
|
1418
|
+
* @param query Search query.
|
|
1419
|
+
* @param options Optional `k`.
|
|
1420
|
+
*/
|
|
1421
|
+
async searchGithub(query, options = {}) {
|
|
1422
|
+
if (!query || !query.trim()) throw new Error("query cannot be empty");
|
|
1423
|
+
if (options.k != null && options.k <= 0)
|
|
1424
|
+
throw new Error("k must be positive");
|
|
1425
|
+
const params = new URLSearchParams();
|
|
1426
|
+
appendParam(params, "query", query);
|
|
1427
|
+
appendParam(params, "k", options.k);
|
|
1428
|
+
try {
|
|
1429
|
+
const res = await this.http.get(
|
|
1430
|
+
withQuery(`${BASE}/github`, params)
|
|
1431
|
+
);
|
|
1432
|
+
if (res.status !== 200) throwForBadResponse(res, "search github");
|
|
1433
|
+
return res.data;
|
|
1434
|
+
} catch (err) {
|
|
1435
|
+
return normalizeResearchError(err, "search github");
|
|
1436
|
+
}
|
|
1437
|
+
}
|
|
1438
|
+
};
|
|
1439
|
+
|
|
1304
1440
|
// src/v2/methods/monitor.ts
|
|
1305
1441
|
function queryString(params) {
|
|
1306
1442
|
if (!params) return "";
|
|
@@ -1666,6 +1802,7 @@ var Watcher = class extends EventEmitter {
|
|
|
1666
1802
|
import "zod";
|
|
1667
1803
|
var FirecrawlClient = class {
|
|
1668
1804
|
http;
|
|
1805
|
+
_research;
|
|
1669
1806
|
isCloudService(url) {
|
|
1670
1807
|
return url.includes("api.firecrawl.dev");
|
|
1671
1808
|
}
|
|
@@ -1738,6 +1875,15 @@ var FirecrawlClient = class {
|
|
|
1738
1875
|
async search(query, req = {}) {
|
|
1739
1876
|
return search(this.http, { query, ...req });
|
|
1740
1877
|
}
|
|
1878
|
+
// Research
|
|
1879
|
+
/**
|
|
1880
|
+
* Access the v2 research endpoints (arXiv papers + GitHub history/readmes).
|
|
1881
|
+
* Example: `firecrawl.research.searchPapers("diffusion models")`.
|
|
1882
|
+
*/
|
|
1883
|
+
get research() {
|
|
1884
|
+
if (!this._research) this._research = new ResearchClient(this.http);
|
|
1885
|
+
return this._research;
|
|
1886
|
+
}
|
|
1741
1887
|
// Map
|
|
1742
1888
|
/**
|
|
1743
1889
|
* Map a site to discover URLs (sitemap-aware).
|
|
@@ -2097,7 +2243,7 @@ var FirecrawlApp = class {
|
|
|
2097
2243
|
if (typeof process !== "undefined" && process.env && process.env.npm_package_version) {
|
|
2098
2244
|
return process.env.npm_package_version;
|
|
2099
2245
|
}
|
|
2100
|
-
const packageJson = await import("./package-KBCQFPRT.js");
|
|
2246
|
+
const packageJson = await import("./package-HESILIET.js");
|
|
2101
2247
|
return packageJson.default.version;
|
|
2102
2248
|
} catch (error) {
|
|
2103
2249
|
const isTest = typeof process !== "undefined" && (process.env.JEST_WORKER_ID != null || false);
|
|
@@ -3494,6 +3640,7 @@ export {
|
|
|
3494
3640
|
FirecrawlApp as FirecrawlAppV1,
|
|
3495
3641
|
FirecrawlClient,
|
|
3496
3642
|
JobTimeoutError,
|
|
3643
|
+
ResearchClient,
|
|
3497
3644
|
SdkError,
|
|
3498
3645
|
Watcher,
|
|
3499
3646
|
index_default as default
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "firecrawl",
|
|
3
|
-
"version": "4.25.1",
|
|
3
|
+
"version": "4.25.3",
|
|
4
4
|
"description": "JavaScript SDK for Firecrawl API",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
"author": "Mendable.ai",
|
|
20
20
|
"license": "MIT",
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"axios": "1.
|
|
22
|
+
"axios": "1.16.1",
|
|
23
23
|
"typescript-event-target": "^1.1.1",
|
|
24
24
|
"zod": "^3.23.8",
|
|
25
25
|
"zod-to-json-schema": "^3.23.0"
|
|
@@ -40,7 +40,7 @@
|
|
|
40
40
|
"ts-jest": "^29.4.5",
|
|
41
41
|
"tsup": "^8.5.0",
|
|
42
42
|
"typescript": "^5.4.5",
|
|
43
|
-
"uuid": "^
|
|
43
|
+
"uuid": "^14.0.0"
|
|
44
44
|
},
|
|
45
45
|
"keywords": [
|
|
46
46
|
"firecrawl",
|
|
@@ -26,10 +26,10 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
26
26
|
test.concurrent('should throw error for invalid API key on scrape', async () => {
|
|
27
27
|
if (API_URL.includes('api.firecrawl.dev')) {
|
|
28
28
|
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
|
29
|
-
await expect(invalidApp.scrapeUrl('https://
|
|
29
|
+
await expect(invalidApp.scrapeUrl('https://firecrawl-test-site.vercel.app')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 401");
|
|
30
30
|
} else {
|
|
31
31
|
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
|
32
|
-
await expect(invalidApp.scrapeUrl('https://
|
|
32
|
+
await expect(invalidApp.scrapeUrl('https://firecrawl-test-site.vercel.app')).resolves.not.toThrow();
|
|
33
33
|
}
|
|
34
34
|
});
|
|
35
35
|
|
|
@@ -42,7 +42,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
42
42
|
test.concurrent('should return successful response for valid scrape', async () => {
|
|
43
43
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
44
44
|
|
|
45
|
-
const response = await app.scrapeUrl('https://
|
|
45
|
+
const response = await app.scrapeUrl('https://firecrawl-test-site.vercel.app');
|
|
46
46
|
if (!response.success) {
|
|
47
47
|
throw new Error(response.error);
|
|
48
48
|
}
|
|
@@ -51,7 +51,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
51
51
|
test.concurrent('should return successful response with valid API key and options', async () => {
|
|
52
52
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
53
53
|
const response = await app.scrapeUrl(
|
|
54
|
-
'https://
|
|
54
|
+
'https://firecrawl-test-site.vercel.app', {
|
|
55
55
|
formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
|
|
56
56
|
headers: { "x-key": "test" },
|
|
57
57
|
includeTags: ['h1'],
|
|
@@ -69,7 +69,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
69
69
|
test.concurrent('should return successful response with valid API key and screenshot fullPage', async () => {
|
|
70
70
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
71
71
|
const response = await app.scrapeUrl(
|
|
72
|
-
'https://
|
|
72
|
+
'https://firecrawl-test-site.vercel.app', {
|
|
73
73
|
formats: ['screenshot@fullPage'],
|
|
74
74
|
});
|
|
75
75
|
if (!response.success) {
|
|
@@ -132,16 +132,16 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
132
132
|
test.concurrent('should throw error for invalid API key on crawl', async () => {
|
|
133
133
|
if (API_URL.includes('api.firecrawl.dev')) {
|
|
134
134
|
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
|
135
|
-
await expect(invalidApp.crawlUrl('https://
|
|
135
|
+
await expect(invalidApp.crawlUrl('https://firecrawl-test-site.vercel.app')).rejects.toThrow("Request failed with status code 401");
|
|
136
136
|
} else {
|
|
137
137
|
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
|
138
|
-
await expect(invalidApp.crawlUrl('https://
|
|
138
|
+
await expect(invalidApp.crawlUrl('https://firecrawl-test-site.vercel.app')).resolves.not.toThrow();
|
|
139
139
|
}
|
|
140
140
|
});
|
|
141
141
|
|
|
142
142
|
test.concurrent('should return successful response for crawl and wait for completion', async () => {
|
|
143
143
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
144
|
-
const response = await app.crawlUrl('https://
|
|
144
|
+
const response = await app.crawlUrl('https://firecrawl-test-site.vercel.app', {}, 30) as CrawlStatusResponse;
|
|
145
145
|
expect(response).not.toHaveProperty("next"); // wait until done
|
|
146
146
|
expect(response.data.length).toBeGreaterThan(0);
|
|
147
147
|
if (response.data[0]) {
|
|
@@ -151,7 +151,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
151
151
|
|
|
152
152
|
test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
|
|
153
153
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
154
|
-
const response = await app.crawlUrl('https://
|
|
154
|
+
const response = await app.crawlUrl('https://firecrawl-test-site.vercel.app', {
|
|
155
155
|
excludePaths: ['blog/*'],
|
|
156
156
|
includePaths: ['/'],
|
|
157
157
|
maxDepth: 2,
|
|
@@ -183,11 +183,11 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
183
183
|
test.concurrent('should handle idempotency key for crawl', async () => {
|
|
184
184
|
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
|
185
185
|
const uniqueIdempotencyKey = uuidv4();
|
|
186
|
-
const response = await app.asyncCrawlUrl('https://
|
|
186
|
+
const response = await app.asyncCrawlUrl('https://firecrawl-test-site.vercel.app', {}, uniqueIdempotencyKey) as CrawlResponse;
|
|
187
187
|
expect(response).not.toBeNull();
|
|
188
188
|
expect(response.id).toBeDefined();
|
|
189
189
|
|
|
190
|
-
await expect(app.crawlUrl('https://
|
|
190
|
+
await expect(app.crawlUrl('https://firecrawl-test-site.vercel.app', {}, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
|
|
191
191
|
});
|
|
192
192
|
|
|
193
193
|
test.concurrent('should check crawl status', async () => {
|
|
@@ -236,10 +236,10 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
236
236
|
test.concurrent('should throw error for invalid API key on map', async () => {
|
|
237
237
|
if (API_URL.includes('api.firecrawl.dev')) {
|
|
238
238
|
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
|
239
|
-
await expect(invalidApp.mapUrl('https://
|
|
239
|
+
await expect(invalidApp.mapUrl('https://firecrawl-test-site.vercel.app')).rejects.toThrow("Request failed with status code 401");
|
|
240
240
|
} else {
|
|
241
241
|
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
|
242
|
-
await expect(invalidApp.mapUrl('https://
|
|
242
|
+
await expect(invalidApp.mapUrl('https://firecrawl-test-site.vercel.app')).resolves.not.toThrow();
|
|
243
243
|
}
|
|
244
244
|
});
|
|
245
245
|
|
|
@@ -250,12 +250,12 @@ describe('FirecrawlApp E2E Tests', () => {
|
|
|
250
250
|
});
|
|
251
251
|
|
|
252
252
|
test.concurrent('should return successful response for valid map', async () => {
|
|
253
|
-
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.mapUrl('https://
|
|
253
|
+
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.mapUrl('https://firecrawl-test-site.vercel.app') as MapResponse;
|
|
254
254
|
expect(response).not.toBeNull();
|
|
255
255
|
|
|
256
256
|
expect(response.links?.length).toBeGreaterThan(0);
|
|
257
257
|
expect(response.links?.[0]).toContain("https://");
|
|
258
|
-
const filteredLinks = response.links?.filter((link: string) => link.includes("
|
|
258
|
+
const filteredLinks = response.links?.filter((link: string) => link.includes("firecrawl-test-site.vercel.app"));
|
|
259
259
|
expect(filteredLinks?.length).toBeGreaterThan(0);
|
|
260
260
|
}, 30000); // 30 seconds timeout
|
|
261
261
|
|