illuma-agents 1.0.8 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -5
- package/dist/cjs/common/enum.cjs +1 -2
- package/dist/cjs/common/enum.cjs.map +1 -1
- package/dist/cjs/events.cjs +11 -0
- package/dist/cjs/events.cjs.map +1 -1
- package/dist/cjs/graphs/Graph.cjs +2 -1
- package/dist/cjs/graphs/Graph.cjs.map +1 -1
- package/dist/cjs/instrumentation.cjs +3 -1
- package/dist/cjs/instrumentation.cjs.map +1 -1
- package/dist/cjs/llm/anthropic/types.cjs.map +1 -1
- package/dist/cjs/llm/anthropic/utils/message_inputs.cjs +79 -2
- package/dist/cjs/llm/anthropic/utils/message_inputs.cjs.map +1 -1
- package/dist/cjs/llm/anthropic/utils/tools.cjs.map +1 -1
- package/dist/cjs/llm/bedrock/index.cjs +99 -0
- package/dist/cjs/llm/bedrock/index.cjs.map +1 -0
- package/dist/cjs/llm/fake.cjs.map +1 -1
- package/dist/cjs/llm/openai/index.cjs +102 -0
- package/dist/cjs/llm/openai/index.cjs.map +1 -1
- package/dist/cjs/llm/openai/utils/index.cjs +87 -1
- package/dist/cjs/llm/openai/utils/index.cjs.map +1 -1
- package/dist/cjs/llm/openrouter/index.cjs +175 -1
- package/dist/cjs/llm/openrouter/index.cjs.map +1 -1
- package/dist/cjs/llm/providers.cjs +13 -16
- package/dist/cjs/llm/providers.cjs.map +1 -1
- package/dist/cjs/llm/text.cjs.map +1 -1
- package/dist/cjs/messages/core.cjs +14 -14
- package/dist/cjs/messages/core.cjs.map +1 -1
- package/dist/cjs/messages/ids.cjs.map +1 -1
- package/dist/cjs/messages/prune.cjs.map +1 -1
- package/dist/cjs/run.cjs +18 -1
- package/dist/cjs/run.cjs.map +1 -1
- package/dist/cjs/splitStream.cjs.map +1 -1
- package/dist/cjs/stream.cjs +24 -1
- package/dist/cjs/stream.cjs.map +1 -1
- package/dist/cjs/tools/ToolNode.cjs +20 -1
- package/dist/cjs/tools/ToolNode.cjs.map +1 -1
- package/dist/cjs/tools/handlers.cjs +29 -25
- package/dist/cjs/tools/handlers.cjs.map +1 -1
- package/dist/cjs/tools/search/anthropic.cjs.map +1 -1
- package/dist/cjs/tools/search/content.cjs.map +1 -1
- package/dist/cjs/tools/search/firecrawl.cjs.map +1 -1
- package/dist/cjs/tools/search/format.cjs.map +1 -1
- package/dist/cjs/tools/search/highlights.cjs.map +1 -1
- package/dist/cjs/tools/search/rerankers.cjs.map +1 -1
- package/dist/cjs/tools/search/schema.cjs +27 -25
- package/dist/cjs/tools/search/schema.cjs.map +1 -1
- package/dist/cjs/tools/search/search.cjs +6 -1
- package/dist/cjs/tools/search/search.cjs.map +1 -1
- package/dist/cjs/tools/search/serper-scraper.cjs.map +1 -1
- package/dist/cjs/tools/search/tool.cjs +182 -35
- package/dist/cjs/tools/search/tool.cjs.map +1 -1
- package/dist/cjs/tools/search/utils.cjs.map +1 -1
- package/dist/cjs/utils/graph.cjs.map +1 -1
- package/dist/cjs/utils/llm.cjs +0 -1
- package/dist/cjs/utils/llm.cjs.map +1 -1
- package/dist/cjs/utils/misc.cjs.map +1 -1
- package/dist/cjs/utils/run.cjs.map +1 -1
- package/dist/cjs/utils/title.cjs +7 -7
- package/dist/cjs/utils/title.cjs.map +1 -1
- package/dist/esm/common/enum.mjs +1 -2
- package/dist/esm/common/enum.mjs.map +1 -1
- package/dist/esm/events.mjs +11 -0
- package/dist/esm/events.mjs.map +1 -1
- package/dist/esm/graphs/Graph.mjs +2 -1
- package/dist/esm/graphs/Graph.mjs.map +1 -1
- package/dist/esm/instrumentation.mjs +3 -1
- package/dist/esm/instrumentation.mjs.map +1 -1
- package/dist/esm/llm/anthropic/types.mjs.map +1 -1
- package/dist/esm/llm/anthropic/utils/message_inputs.mjs +79 -2
- package/dist/esm/llm/anthropic/utils/message_inputs.mjs.map +1 -1
- package/dist/esm/llm/anthropic/utils/tools.mjs.map +1 -1
- package/dist/esm/llm/bedrock/index.mjs +97 -0
- package/dist/esm/llm/bedrock/index.mjs.map +1 -0
- package/dist/esm/llm/fake.mjs.map +1 -1
- package/dist/esm/llm/openai/index.mjs +103 -1
- package/dist/esm/llm/openai/index.mjs.map +1 -1
- package/dist/esm/llm/openai/utils/index.mjs +88 -2
- package/dist/esm/llm/openai/utils/index.mjs.map +1 -1
- package/dist/esm/llm/openrouter/index.mjs +175 -1
- package/dist/esm/llm/openrouter/index.mjs.map +1 -1
- package/dist/esm/llm/providers.mjs +2 -5
- package/dist/esm/llm/providers.mjs.map +1 -1
- package/dist/esm/llm/text.mjs.map +1 -1
- package/dist/esm/messages/core.mjs +14 -14
- package/dist/esm/messages/core.mjs.map +1 -1
- package/dist/esm/messages/ids.mjs.map +1 -1
- package/dist/esm/messages/prune.mjs.map +1 -1
- package/dist/esm/run.mjs +18 -1
- package/dist/esm/run.mjs.map +1 -1
- package/dist/esm/splitStream.mjs.map +1 -1
- package/dist/esm/stream.mjs +24 -1
- package/dist/esm/stream.mjs.map +1 -1
- package/dist/esm/tools/ToolNode.mjs +20 -1
- package/dist/esm/tools/ToolNode.mjs.map +1 -1
- package/dist/esm/tools/handlers.mjs +30 -26
- package/dist/esm/tools/handlers.mjs.map +1 -1
- package/dist/esm/tools/search/anthropic.mjs.map +1 -1
- package/dist/esm/tools/search/content.mjs.map +1 -1
- package/dist/esm/tools/search/firecrawl.mjs.map +1 -1
- package/dist/esm/tools/search/format.mjs.map +1 -1
- package/dist/esm/tools/search/highlights.mjs.map +1 -1
- package/dist/esm/tools/search/rerankers.mjs.map +1 -1
- package/dist/esm/tools/search/schema.mjs +27 -25
- package/dist/esm/tools/search/schema.mjs.map +1 -1
- package/dist/esm/tools/search/search.mjs +6 -1
- package/dist/esm/tools/search/search.mjs.map +1 -1
- package/dist/esm/tools/search/serper-scraper.mjs.map +1 -1
- package/dist/esm/tools/search/tool.mjs +182 -35
- package/dist/esm/tools/search/tool.mjs.map +1 -1
- package/dist/esm/tools/search/utils.mjs.map +1 -1
- package/dist/esm/utils/graph.mjs.map +1 -1
- package/dist/esm/utils/llm.mjs +0 -1
- package/dist/esm/utils/llm.mjs.map +1 -1
- package/dist/esm/utils/misc.mjs.map +1 -1
- package/dist/esm/utils/run.mjs.map +1 -1
- package/dist/esm/utils/title.mjs +7 -7
- package/dist/esm/utils/title.mjs.map +1 -1
- package/dist/types/common/enum.d.ts +1 -2
- package/dist/types/llm/bedrock/index.d.ts +36 -0
- package/dist/types/llm/openai/index.d.ts +1 -0
- package/dist/types/llm/openai/utils/index.d.ts +10 -1
- package/dist/types/llm/openrouter/index.d.ts +4 -1
- package/dist/types/tools/search/types.d.ts +2 -0
- package/dist/types/types/llm.d.ts +3 -8
- package/package.json +16 -12
- package/src/common/enum.ts +1 -2
- package/src/common/index.ts +1 -1
- package/src/events.ts +11 -0
- package/src/graphs/Graph.ts +2 -1
- package/src/instrumentation.ts +25 -22
- package/src/llm/anthropic/llm.spec.ts +1442 -1442
- package/src/llm/anthropic/types.ts +140 -140
- package/src/llm/anthropic/utils/message_inputs.ts +757 -660
- package/src/llm/anthropic/utils/output_parsers.ts +133 -133
- package/src/llm/anthropic/utils/tools.ts +29 -29
- package/src/llm/bedrock/index.ts +128 -0
- package/src/llm/fake.ts +133 -133
- package/src/llm/google/llm.spec.ts +3 -1
- package/src/llm/google/utils/tools.ts +160 -160
- package/src/llm/openai/index.ts +126 -0
- package/src/llm/openai/types.ts +24 -24
- package/src/llm/openai/utils/index.ts +116 -1
- package/src/llm/openai/utils/isReasoningModel.test.ts +90 -90
- package/src/llm/openrouter/index.ts +222 -1
- package/src/llm/providers.ts +2 -7
- package/src/llm/text.ts +94 -94
- package/src/messages/core.ts +463 -463
- package/src/messages/formatAgentMessages.tools.test.ts +400 -400
- package/src/messages/formatMessage.test.ts +693 -693
- package/src/messages/ids.ts +26 -26
- package/src/messages/prune.ts +567 -567
- package/src/messages/shiftIndexTokenCountMap.test.ts +81 -81
- package/src/mockStream.ts +98 -98
- package/src/prompts/collab.ts +5 -5
- package/src/prompts/index.ts +1 -1
- package/src/prompts/taskmanager.ts +61 -61
- package/src/run.ts +22 -4
- package/src/scripts/ant_web_search_edge_case.ts +162 -0
- package/src/scripts/ant_web_search_error_edge_case.ts +148 -0
- package/src/scripts/args.ts +48 -48
- package/src/scripts/caching.ts +123 -123
- package/src/scripts/code_exec_files.ts +193 -193
- package/src/scripts/empty_input.ts +137 -137
- package/src/scripts/memory.ts +97 -97
- package/src/scripts/test-tools-before-handoff.ts +1 -5
- package/src/scripts/thinking.ts +149 -149
- package/src/scripts/tools.ts +1 -4
- package/src/specs/anthropic.simple.test.ts +67 -0
- package/src/specs/spec.utils.ts +3 -3
- package/src/specs/token-distribution-edge-case.test.ts +316 -316
- package/src/specs/tool-error.test.ts +193 -193
- package/src/splitStream.test.ts +691 -691
- package/src/splitStream.ts +234 -234
- package/src/stream.test.ts +94 -94
- package/src/stream.ts +30 -1
- package/src/tools/ToolNode.ts +24 -1
- package/src/tools/handlers.ts +32 -28
- package/src/tools/search/anthropic.ts +51 -51
- package/src/tools/search/content.test.ts +173 -173
- package/src/tools/search/content.ts +147 -147
- package/src/tools/search/direct-url.test.ts +530 -0
- package/src/tools/search/firecrawl.ts +210 -210
- package/src/tools/search/format.ts +250 -250
- package/src/tools/search/highlights.ts +320 -320
- package/src/tools/search/index.ts +2 -2
- package/src/tools/search/jina-reranker.test.ts +126 -126
- package/src/tools/search/output.md +2775 -2775
- package/src/tools/search/rerankers.ts +242 -242
- package/src/tools/search/schema.ts +65 -63
- package/src/tools/search/search.ts +766 -759
- package/src/tools/search/serper-scraper.ts +155 -155
- package/src/tools/search/test.html +883 -883
- package/src/tools/search/test.md +642 -642
- package/src/tools/search/test.ts +159 -159
- package/src/tools/search/tool.ts +641 -471
- package/src/tools/search/types.ts +689 -687
- package/src/tools/search/utils.ts +79 -79
- package/src/types/index.ts +6 -6
- package/src/types/llm.ts +2 -8
- package/src/utils/graph.ts +10 -10
- package/src/utils/llm.ts +26 -27
- package/src/utils/llmConfig.ts +13 -5
- package/src/utils/logging.ts +48 -48
- package/src/utils/misc.ts +57 -57
- package/src/utils/run.ts +100 -100
- package/src/utils/title.ts +165 -165
- package/dist/cjs/llm/ollama/index.cjs +0 -70
- package/dist/cjs/llm/ollama/index.cjs.map +0 -1
- package/dist/cjs/llm/ollama/utils.cjs +0 -158
- package/dist/cjs/llm/ollama/utils.cjs.map +0 -1
- package/dist/esm/llm/ollama/index.mjs +0 -68
- package/dist/esm/llm/ollama/index.mjs.map +0 -1
- package/dist/esm/llm/ollama/utils.mjs +0 -155
- package/dist/esm/llm/ollama/utils.mjs.map +0 -1
- package/dist/types/llm/ollama/index.d.ts +0 -8
- package/dist/types/llm/ollama/utils.d.ts +0 -7
- package/src/llm/ollama/index.ts +0 -92
- package/src/llm/ollama/utils.ts +0 -193
- package/src/proto/CollabGraph.ts +0 -269
- package/src/proto/TaskManager.ts +0 -243
- package/src/proto/collab.ts +0 -200
- package/src/proto/collab_design.ts +0 -184
- package/src/proto/collab_design_v2.ts +0 -224
- package/src/proto/collab_design_v3.ts +0 -255
- package/src/proto/collab_design_v4.ts +0 -220
- package/src/proto/collab_design_v5.ts +0 -251
- package/src/proto/collab_graph.ts +0 -181
- package/src/proto/collab_original.ts +0 -123
- package/src/proto/example.ts +0 -93
- package/src/proto/example_new.ts +0 -68
- package/src/proto/example_old.ts +0 -201
- package/src/proto/example_test.ts +0 -152
- package/src/proto/example_test_anthropic.ts +0 -100
- package/src/proto/log_stream.ts +0 -202
- package/src/proto/main_collab_community_event.ts +0 -133
- package/src/proto/main_collab_design_v2.ts +0 -96
- package/src/proto/main_collab_design_v4.ts +0 -100
- package/src/proto/main_collab_design_v5.ts +0 -135
- package/src/proto/main_collab_global_analysis.ts +0 -122
- package/src/proto/main_collab_hackathon_event.ts +0 -153
- package/src/proto/main_collab_space_mission.ts +0 -153
- package/src/proto/main_philosophy.ts +0 -210
- package/src/proto/original_script.ts +0 -126
- package/src/proto/standard.ts +0 -100
- package/src/proto/stream.ts +0 -56
- package/src/proto/tasks.ts +0 -118
- package/src/proto/tools/global_analysis_tools.ts +0 -86
- package/src/proto/tools/space_mission_tools.ts +0 -60
- package/src/proto/vertexai.ts +0 -54
- package/src/scripts/image.ts +0 -178
|
@@ -1,210 +1,210 @@
|
|
|
1
|
-
import axios from 'axios';
|
|
2
|
-
import { processContent } from './content';
|
|
3
|
-
import type * as t from './types';
|
|
4
|
-
import { createDefaultLogger } from './utils';
|
|
5
|
-
|
|
6
|
-
/**
|
|
7
|
-
* Firecrawl scraper implementation
|
|
8
|
-
* Uses the Firecrawl API to scrape web pages
|
|
9
|
-
*/
|
|
10
|
-
export class FirecrawlScraper implements t.BaseScraper {
|
|
11
|
-
private apiKey: string;
|
|
12
|
-
private apiUrl: string;
|
|
13
|
-
private version: string;
|
|
14
|
-
private defaultFormats: string[];
|
|
15
|
-
private timeout: number;
|
|
16
|
-
private logger: t.Logger;
|
|
17
|
-
private includeTags?: string[];
|
|
18
|
-
private excludeTags?: string[];
|
|
19
|
-
private waitFor?: number;
|
|
20
|
-
private maxAge?: number;
|
|
21
|
-
private mobile?: boolean;
|
|
22
|
-
private skipTlsVerification?: boolean;
|
|
23
|
-
private blockAds?: boolean;
|
|
24
|
-
private removeBase64Images?: boolean;
|
|
25
|
-
private parsePDF?: boolean;
|
|
26
|
-
private storeInCache?: boolean;
|
|
27
|
-
private zeroDataRetention?: boolean;
|
|
28
|
-
private headers?: Record<string, string>;
|
|
29
|
-
private location?: { country?: string; languages?: string[] };
|
|
30
|
-
private onlyMainContent?: boolean;
|
|
31
|
-
private changeTrackingOptions?: object;
|
|
32
|
-
|
|
33
|
-
constructor(config: t.FirecrawlScraperConfig = {}) {
|
|
34
|
-
this.apiKey = config.apiKey ?? process.env.FIRECRAWL_API_KEY ?? '';
|
|
35
|
-
|
|
36
|
-
this.version = config.version ?? 'v2';
|
|
37
|
-
|
|
38
|
-
const baseUrl =
|
|
39
|
-
config.apiUrl ??
|
|
40
|
-
process.env.FIRECRAWL_BASE_URL ??
|
|
41
|
-
'https://api.firecrawl.dev';
|
|
42
|
-
this.apiUrl = `${baseUrl.replace(/\/+$/, '')}/${this.version}/scrape`;
|
|
43
|
-
|
|
44
|
-
this.defaultFormats = config.formats ?? ['markdown', 'rawHtml'];
|
|
45
|
-
this.timeout = config.timeout ?? 7500;
|
|
46
|
-
|
|
47
|
-
this.logger = config.logger || createDefaultLogger();
|
|
48
|
-
|
|
49
|
-
this.includeTags = config.includeTags;
|
|
50
|
-
this.excludeTags = config.excludeTags;
|
|
51
|
-
this.waitFor = config.waitFor;
|
|
52
|
-
this.maxAge = config.maxAge;
|
|
53
|
-
this.mobile = config.mobile;
|
|
54
|
-
this.skipTlsVerification = config.skipTlsVerification;
|
|
55
|
-
this.blockAds = config.blockAds;
|
|
56
|
-
this.removeBase64Images = config.removeBase64Images;
|
|
57
|
-
this.parsePDF = config.parsePDF;
|
|
58
|
-
this.storeInCache = config.storeInCache;
|
|
59
|
-
this.zeroDataRetention = config.zeroDataRetention;
|
|
60
|
-
this.headers = config.headers;
|
|
61
|
-
this.location = config.location;
|
|
62
|
-
this.onlyMainContent = config.onlyMainContent;
|
|
63
|
-
this.changeTrackingOptions = config.changeTrackingOptions;
|
|
64
|
-
|
|
65
|
-
if (!this.apiKey) {
|
|
66
|
-
this.logger.warn('FIRECRAWL_API_KEY is not set. Scraping will not work.');
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
this.logger.debug(
|
|
70
|
-
`Firecrawl scraper initialized with API URL: ${this.apiUrl}`
|
|
71
|
-
);
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
/**
|
|
75
|
-
* Scrape a single URL
|
|
76
|
-
* @param url URL to scrape
|
|
77
|
-
* @param options Scrape options
|
|
78
|
-
* @returns Scrape response
|
|
79
|
-
*/
|
|
80
|
-
async scrapeUrl(
|
|
81
|
-
url: string,
|
|
82
|
-
options: t.FirecrawlScrapeOptions = {}
|
|
83
|
-
): Promise<[string, t.FirecrawlScrapeResponse]> {
|
|
84
|
-
if (!this.apiKey) {
|
|
85
|
-
return [
|
|
86
|
-
url,
|
|
87
|
-
{
|
|
88
|
-
success: false,
|
|
89
|
-
error: 'FIRECRAWL_API_KEY is not set',
|
|
90
|
-
},
|
|
91
|
-
];
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
try {
|
|
95
|
-
const payload = omitUndefined({
|
|
96
|
-
url,
|
|
97
|
-
formats: options.formats ?? this.defaultFormats,
|
|
98
|
-
includeTags: options.includeTags ?? this.includeTags,
|
|
99
|
-
excludeTags: options.excludeTags ?? this.excludeTags,
|
|
100
|
-
headers: options.headers ?? this.headers,
|
|
101
|
-
waitFor: options.waitFor ?? this.waitFor,
|
|
102
|
-
timeout: options.timeout ?? this.timeout,
|
|
103
|
-
onlyMainContent: options.onlyMainContent ?? this.onlyMainContent,
|
|
104
|
-
maxAge: options.maxAge ?? this.maxAge,
|
|
105
|
-
mobile: options.mobile ?? this.mobile,
|
|
106
|
-
skipTlsVerification:
|
|
107
|
-
options.skipTlsVerification ?? this.skipTlsVerification,
|
|
108
|
-
parsePDF: options.parsePDF ?? this.parsePDF,
|
|
109
|
-
location: options.location ?? this.location,
|
|
110
|
-
removeBase64Images:
|
|
111
|
-
options.removeBase64Images ?? this.removeBase64Images,
|
|
112
|
-
blockAds: options.blockAds ?? this.blockAds,
|
|
113
|
-
storeInCache: options.storeInCache ?? this.storeInCache,
|
|
114
|
-
zeroDataRetention: options.zeroDataRetention ?? this.zeroDataRetention,
|
|
115
|
-
changeTrackingOptions:
|
|
116
|
-
options.changeTrackingOptions ?? this.changeTrackingOptions,
|
|
117
|
-
});
|
|
118
|
-
const response = await axios.post(this.apiUrl, payload, {
|
|
119
|
-
headers: {
|
|
120
|
-
'Content-Type': 'application/json',
|
|
121
|
-
Authorization: `Bearer ${this.apiKey}`,
|
|
122
|
-
},
|
|
123
|
-
timeout: this.timeout,
|
|
124
|
-
});
|
|
125
|
-
|
|
126
|
-
return [url, response.data];
|
|
127
|
-
} catch (error) {
|
|
128
|
-
const errorMessage =
|
|
129
|
-
error instanceof Error ? error.message : String(error);
|
|
130
|
-
return [
|
|
131
|
-
url,
|
|
132
|
-
{
|
|
133
|
-
success: false,
|
|
134
|
-
error: `Firecrawl API request failed: ${errorMessage}`,
|
|
135
|
-
},
|
|
136
|
-
];
|
|
137
|
-
}
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
/**
|
|
141
|
-
* Extract content from scrape response
|
|
142
|
-
* @param response Scrape response
|
|
143
|
-
* @returns Extracted content or empty string if not available
|
|
144
|
-
*/
|
|
145
|
-
extractContent(
|
|
146
|
-
response: t.FirecrawlScrapeResponse
|
|
147
|
-
): [string, undefined | t.References] {
|
|
148
|
-
if (!response.success || !response.data) {
|
|
149
|
-
return ['', undefined];
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
if (response.data.markdown != null && response.data.html != null) {
|
|
153
|
-
try {
|
|
154
|
-
const { markdown, ...rest } = processContent(
|
|
155
|
-
response.data.html,
|
|
156
|
-
response.data.markdown
|
|
157
|
-
);
|
|
158
|
-
return [markdown, rest];
|
|
159
|
-
} catch (error) {
|
|
160
|
-
this.logger.error('Error processing content:', error);
|
|
161
|
-
return [response.data.markdown, undefined];
|
|
162
|
-
}
|
|
163
|
-
} else if (response.data.markdown != null) {
|
|
164
|
-
return [response.data.markdown, undefined];
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
// Fall back to HTML content
|
|
168
|
-
if (response.data.html != null) {
|
|
169
|
-
return [response.data.html, undefined];
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
// Fall back to raw HTML content
|
|
173
|
-
if (response.data.rawHtml != null) {
|
|
174
|
-
return [response.data.rawHtml, undefined];
|
|
175
|
-
}
|
|
176
|
-
|
|
177
|
-
return ['', undefined];
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
/**
|
|
181
|
-
* Extract metadata from scrape response
|
|
182
|
-
* @param response Scrape response
|
|
183
|
-
* @returns Metadata object
|
|
184
|
-
*/
|
|
185
|
-
extractMetadata(response: t.FirecrawlScrapeResponse): t.ScrapeMetadata {
|
|
186
|
-
if (!response.success || !response.data || !response.data.metadata) {
|
|
187
|
-
return {};
|
|
188
|
-
}
|
|
189
|
-
|
|
190
|
-
return response.data.metadata;
|
|
191
|
-
}
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
/**
|
|
195
|
-
* Create a Firecrawl scraper instance
|
|
196
|
-
* @param config Scraper configuration
|
|
197
|
-
* @returns Firecrawl scraper instance
|
|
198
|
-
*/
|
|
199
|
-
export const createFirecrawlScraper = (
|
|
200
|
-
config: t.FirecrawlScraperConfig = {}
|
|
201
|
-
): FirecrawlScraper => {
|
|
202
|
-
return new FirecrawlScraper(config);
|
|
203
|
-
};
|
|
204
|
-
|
|
205
|
-
// Helper function to clean up payload for firecrawl
|
|
206
|
-
function omitUndefined<T extends object>(obj: T): Partial<T> {
|
|
207
|
-
return Object.fromEntries(
|
|
208
|
-
Object.entries(obj).filter(([, v]) => v !== undefined)
|
|
209
|
-
) as Partial<T>;
|
|
210
|
-
}
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
import { processContent } from './content';
|
|
3
|
+
import type * as t from './types';
|
|
4
|
+
import { createDefaultLogger } from './utils';
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Firecrawl scraper implementation
|
|
8
|
+
* Uses the Firecrawl API to scrape web pages
|
|
9
|
+
*/
|
|
10
|
+
export class FirecrawlScraper implements t.BaseScraper {
|
|
11
|
+
private apiKey: string;
|
|
12
|
+
private apiUrl: string;
|
|
13
|
+
private version: string;
|
|
14
|
+
private defaultFormats: string[];
|
|
15
|
+
private timeout: number;
|
|
16
|
+
private logger: t.Logger;
|
|
17
|
+
private includeTags?: string[];
|
|
18
|
+
private excludeTags?: string[];
|
|
19
|
+
private waitFor?: number;
|
|
20
|
+
private maxAge?: number;
|
|
21
|
+
private mobile?: boolean;
|
|
22
|
+
private skipTlsVerification?: boolean;
|
|
23
|
+
private blockAds?: boolean;
|
|
24
|
+
private removeBase64Images?: boolean;
|
|
25
|
+
private parsePDF?: boolean;
|
|
26
|
+
private storeInCache?: boolean;
|
|
27
|
+
private zeroDataRetention?: boolean;
|
|
28
|
+
private headers?: Record<string, string>;
|
|
29
|
+
private location?: { country?: string; languages?: string[] };
|
|
30
|
+
private onlyMainContent?: boolean;
|
|
31
|
+
private changeTrackingOptions?: object;
|
|
32
|
+
|
|
33
|
+
constructor(config: t.FirecrawlScraperConfig = {}) {
|
|
34
|
+
this.apiKey = config.apiKey ?? process.env.FIRECRAWL_API_KEY ?? '';
|
|
35
|
+
|
|
36
|
+
this.version = config.version ?? 'v2';
|
|
37
|
+
|
|
38
|
+
const baseUrl =
|
|
39
|
+
config.apiUrl ??
|
|
40
|
+
process.env.FIRECRAWL_BASE_URL ??
|
|
41
|
+
'https://api.firecrawl.dev';
|
|
42
|
+
this.apiUrl = `${baseUrl.replace(/\/+$/, '')}/${this.version}/scrape`;
|
|
43
|
+
|
|
44
|
+
this.defaultFormats = config.formats ?? ['markdown', 'rawHtml'];
|
|
45
|
+
this.timeout = config.timeout ?? 7500;
|
|
46
|
+
|
|
47
|
+
this.logger = config.logger || createDefaultLogger();
|
|
48
|
+
|
|
49
|
+
this.includeTags = config.includeTags;
|
|
50
|
+
this.excludeTags = config.excludeTags;
|
|
51
|
+
this.waitFor = config.waitFor;
|
|
52
|
+
this.maxAge = config.maxAge;
|
|
53
|
+
this.mobile = config.mobile;
|
|
54
|
+
this.skipTlsVerification = config.skipTlsVerification;
|
|
55
|
+
this.blockAds = config.blockAds;
|
|
56
|
+
this.removeBase64Images = config.removeBase64Images;
|
|
57
|
+
this.parsePDF = config.parsePDF;
|
|
58
|
+
this.storeInCache = config.storeInCache;
|
|
59
|
+
this.zeroDataRetention = config.zeroDataRetention;
|
|
60
|
+
this.headers = config.headers;
|
|
61
|
+
this.location = config.location;
|
|
62
|
+
this.onlyMainContent = config.onlyMainContent;
|
|
63
|
+
this.changeTrackingOptions = config.changeTrackingOptions;
|
|
64
|
+
|
|
65
|
+
if (!this.apiKey) {
|
|
66
|
+
this.logger.warn('FIRECRAWL_API_KEY is not set. Scraping will not work.');
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
this.logger.debug(
|
|
70
|
+
`Firecrawl scraper initialized with API URL: ${this.apiUrl}`
|
|
71
|
+
);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
/**
|
|
75
|
+
* Scrape a single URL
|
|
76
|
+
* @param url URL to scrape
|
|
77
|
+
* @param options Scrape options
|
|
78
|
+
* @returns Scrape response
|
|
79
|
+
*/
|
|
80
|
+
async scrapeUrl(
|
|
81
|
+
url: string,
|
|
82
|
+
options: t.FirecrawlScrapeOptions = {}
|
|
83
|
+
): Promise<[string, t.FirecrawlScrapeResponse]> {
|
|
84
|
+
if (!this.apiKey) {
|
|
85
|
+
return [
|
|
86
|
+
url,
|
|
87
|
+
{
|
|
88
|
+
success: false,
|
|
89
|
+
error: 'FIRECRAWL_API_KEY is not set',
|
|
90
|
+
},
|
|
91
|
+
];
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
try {
|
|
95
|
+
const payload = omitUndefined({
|
|
96
|
+
url,
|
|
97
|
+
formats: options.formats ?? this.defaultFormats,
|
|
98
|
+
includeTags: options.includeTags ?? this.includeTags,
|
|
99
|
+
excludeTags: options.excludeTags ?? this.excludeTags,
|
|
100
|
+
headers: options.headers ?? this.headers,
|
|
101
|
+
waitFor: options.waitFor ?? this.waitFor,
|
|
102
|
+
timeout: options.timeout ?? this.timeout,
|
|
103
|
+
onlyMainContent: options.onlyMainContent ?? this.onlyMainContent,
|
|
104
|
+
maxAge: options.maxAge ?? this.maxAge,
|
|
105
|
+
mobile: options.mobile ?? this.mobile,
|
|
106
|
+
skipTlsVerification:
|
|
107
|
+
options.skipTlsVerification ?? this.skipTlsVerification,
|
|
108
|
+
parsePDF: options.parsePDF ?? this.parsePDF,
|
|
109
|
+
location: options.location ?? this.location,
|
|
110
|
+
removeBase64Images:
|
|
111
|
+
options.removeBase64Images ?? this.removeBase64Images,
|
|
112
|
+
blockAds: options.blockAds ?? this.blockAds,
|
|
113
|
+
storeInCache: options.storeInCache ?? this.storeInCache,
|
|
114
|
+
zeroDataRetention: options.zeroDataRetention ?? this.zeroDataRetention,
|
|
115
|
+
changeTrackingOptions:
|
|
116
|
+
options.changeTrackingOptions ?? this.changeTrackingOptions,
|
|
117
|
+
});
|
|
118
|
+
const response = await axios.post(this.apiUrl, payload, {
|
|
119
|
+
headers: {
|
|
120
|
+
'Content-Type': 'application/json',
|
|
121
|
+
Authorization: `Bearer ${this.apiKey}`,
|
|
122
|
+
},
|
|
123
|
+
timeout: this.timeout,
|
|
124
|
+
});
|
|
125
|
+
|
|
126
|
+
return [url, response.data];
|
|
127
|
+
} catch (error) {
|
|
128
|
+
const errorMessage =
|
|
129
|
+
error instanceof Error ? error.message : String(error);
|
|
130
|
+
return [
|
|
131
|
+
url,
|
|
132
|
+
{
|
|
133
|
+
success: false,
|
|
134
|
+
error: `Firecrawl API request failed: ${errorMessage}`,
|
|
135
|
+
},
|
|
136
|
+
];
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
/**
|
|
141
|
+
* Extract content from scrape response
|
|
142
|
+
* @param response Scrape response
|
|
143
|
+
* @returns Extracted content or empty string if not available
|
|
144
|
+
*/
|
|
145
|
+
extractContent(
|
|
146
|
+
response: t.FirecrawlScrapeResponse
|
|
147
|
+
): [string, undefined | t.References] {
|
|
148
|
+
if (!response.success || !response.data) {
|
|
149
|
+
return ['', undefined];
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
if (response.data.markdown != null && response.data.html != null) {
|
|
153
|
+
try {
|
|
154
|
+
const { markdown, ...rest } = processContent(
|
|
155
|
+
response.data.html,
|
|
156
|
+
response.data.markdown
|
|
157
|
+
);
|
|
158
|
+
return [markdown, rest];
|
|
159
|
+
} catch (error) {
|
|
160
|
+
this.logger.error('Error processing content:', error);
|
|
161
|
+
return [response.data.markdown, undefined];
|
|
162
|
+
}
|
|
163
|
+
} else if (response.data.markdown != null) {
|
|
164
|
+
return [response.data.markdown, undefined];
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
// Fall back to HTML content
|
|
168
|
+
if (response.data.html != null) {
|
|
169
|
+
return [response.data.html, undefined];
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
// Fall back to raw HTML content
|
|
173
|
+
if (response.data.rawHtml != null) {
|
|
174
|
+
return [response.data.rawHtml, undefined];
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
return ['', undefined];
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
/**
|
|
181
|
+
* Extract metadata from scrape response
|
|
182
|
+
* @param response Scrape response
|
|
183
|
+
* @returns Metadata object
|
|
184
|
+
*/
|
|
185
|
+
extractMetadata(response: t.FirecrawlScrapeResponse): t.ScrapeMetadata {
|
|
186
|
+
if (!response.success || !response.data || !response.data.metadata) {
|
|
187
|
+
return {};
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
return response.data.metadata;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
/**
|
|
195
|
+
* Create a Firecrawl scraper instance
|
|
196
|
+
* @param config Scraper configuration
|
|
197
|
+
* @returns Firecrawl scraper instance
|
|
198
|
+
*/
|
|
199
|
+
export const createFirecrawlScraper = (
|
|
200
|
+
config: t.FirecrawlScraperConfig = {}
|
|
201
|
+
): FirecrawlScraper => {
|
|
202
|
+
return new FirecrawlScraper(config);
|
|
203
|
+
};
|
|
204
|
+
|
|
205
|
+
// Helper function to clean up payload for firecrawl
|
|
206
|
+
function omitUndefined<T extends object>(obj: T): Partial<T> {
|
|
207
|
+
return Object.fromEntries(
|
|
208
|
+
Object.entries(obj).filter(([, v]) => v !== undefined)
|
|
209
|
+
) as Partial<T>;
|
|
210
|
+
}
|