@librechat/agents 2.4.22 → 2.4.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/common/enum.cjs +1 -0
- package/dist/cjs/common/enum.cjs.map +1 -1
- package/dist/cjs/llm/anthropic/index.cjs +1 -1
- package/dist/cjs/llm/anthropic/index.cjs.map +1 -1
- package/dist/cjs/llm/anthropic/types.cjs +50 -0
- package/dist/cjs/llm/anthropic/types.cjs.map +1 -0
- package/dist/cjs/llm/anthropic/utils/message_inputs.cjs +227 -21
- package/dist/cjs/llm/anthropic/utils/message_inputs.cjs.map +1 -1
- package/dist/cjs/llm/anthropic/utils/message_outputs.cjs +1 -0
- package/dist/cjs/llm/anthropic/utils/message_outputs.cjs.map +1 -1
- package/dist/cjs/llm/openai/index.cjs.map +1 -1
- package/dist/cjs/main.cjs +2 -0
- package/dist/cjs/main.cjs.map +1 -1
- package/dist/cjs/run.cjs.map +1 -1
- package/dist/cjs/tools/search/firecrawl.cjs +149 -0
- package/dist/cjs/tools/search/firecrawl.cjs.map +1 -0
- package/dist/cjs/tools/search/format.cjs +116 -0
- package/dist/cjs/tools/search/format.cjs.map +1 -0
- package/dist/cjs/tools/search/highlights.cjs +193 -0
- package/dist/cjs/tools/search/highlights.cjs.map +1 -0
- package/dist/cjs/tools/search/rerankers.cjs +187 -0
- package/dist/cjs/tools/search/rerankers.cjs.map +1 -0
- package/dist/cjs/tools/search/search.cjs +410 -0
- package/dist/cjs/tools/search/search.cjs.map +1 -0
- package/dist/cjs/tools/search/tool.cjs +103 -0
- package/dist/cjs/tools/search/tool.cjs.map +1 -0
- package/dist/esm/common/enum.mjs +1 -0
- package/dist/esm/common/enum.mjs.map +1 -1
- package/dist/esm/llm/anthropic/index.mjs +1 -1
- package/dist/esm/llm/anthropic/index.mjs.map +1 -1
- package/dist/esm/llm/anthropic/types.mjs +48 -0
- package/dist/esm/llm/anthropic/types.mjs.map +1 -0
- package/dist/esm/llm/anthropic/utils/message_inputs.mjs +228 -22
- package/dist/esm/llm/anthropic/utils/message_inputs.mjs.map +1 -1
- package/dist/esm/llm/anthropic/utils/message_outputs.mjs +1 -0
- package/dist/esm/llm/anthropic/utils/message_outputs.mjs.map +1 -1
- package/dist/esm/llm/openai/index.mjs.map +1 -1
- package/dist/esm/main.mjs +1 -0
- package/dist/esm/main.mjs.map +1 -1
- package/dist/esm/run.mjs.map +1 -1
- package/dist/esm/tools/search/firecrawl.mjs +145 -0
- package/dist/esm/tools/search/firecrawl.mjs.map +1 -0
- package/dist/esm/tools/search/format.mjs +114 -0
- package/dist/esm/tools/search/format.mjs.map +1 -0
- package/dist/esm/tools/search/highlights.mjs +191 -0
- package/dist/esm/tools/search/highlights.mjs.map +1 -0
- package/dist/esm/tools/search/rerankers.mjs +181 -0
- package/dist/esm/tools/search/rerankers.mjs.map +1 -0
- package/dist/esm/tools/search/search.mjs +407 -0
- package/dist/esm/tools/search/search.mjs.map +1 -0
- package/dist/esm/tools/search/tool.mjs +101 -0
- package/dist/esm/tools/search/tool.mjs.map +1 -0
- package/dist/types/common/enum.d.ts +1 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/llm/anthropic/index.d.ts +3 -4
- package/dist/types/llm/anthropic/types.d.ts +4 -35
- package/dist/types/llm/anthropic/utils/message_inputs.d.ts +2 -2
- package/dist/types/llm/anthropic/utils/message_outputs.d.ts +1 -3
- package/dist/types/llm/anthropic/utils/output_parsers.d.ts +22 -0
- package/dist/types/llm/openai/index.d.ts +3 -2
- package/dist/types/scripts/search.d.ts +1 -0
- package/dist/types/tools/example.d.ts +21 -3
- package/dist/types/tools/search/firecrawl.d.ts +117 -0
- package/dist/types/tools/search/format.d.ts +2 -0
- package/dist/types/tools/search/highlights.d.ts +13 -0
- package/dist/types/tools/search/index.d.ts +2 -0
- package/dist/types/tools/search/rerankers.d.ts +32 -0
- package/dist/types/tools/search/search.d.ts +9 -0
- package/dist/types/tools/search/tool.d.ts +12 -0
- package/dist/types/tools/search/types.d.ts +150 -0
- package/package.json +10 -9
- package/src/common/enum.ts +1 -0
- package/src/index.ts +1 -0
- package/src/llm/anthropic/index.ts +6 -5
- package/src/llm/anthropic/llm.spec.ts +176 -179
- package/src/llm/anthropic/types.ts +64 -39
- package/src/llm/anthropic/utils/message_inputs.ts +275 -37
- package/src/llm/anthropic/utils/message_outputs.ts +4 -21
- package/src/llm/anthropic/utils/output_parsers.ts +114 -0
- package/src/llm/openai/index.ts +7 -6
- package/src/run.ts +1 -1
- package/src/scripts/search.ts +141 -0
- package/src/tools/search/firecrawl.ts +270 -0
- package/src/tools/search/format.ts +121 -0
- package/src/tools/search/highlights.ts +237 -0
- package/src/tools/search/index.ts +2 -0
- package/src/tools/search/rerankers.ts +248 -0
- package/src/tools/search/search.ts +567 -0
- package/src/tools/search/tool.ts +151 -0
- package/src/tools/search/types.ts +179 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/* eslint-disable @typescript-eslint/explicit-function-return-type */
|
|
2
|
+
/* eslint-disable @typescript-eslint/no-empty-object-type */
|
|
3
|
+
import { z } from 'zod';
|
|
4
|
+
import {
|
|
5
|
+
BaseLLMOutputParser,
|
|
6
|
+
OutputParserException,
|
|
7
|
+
} from '@langchain/core/output_parsers';
|
|
8
|
+
import { JsonOutputKeyToolsParserParams } from '@langchain/core/output_parsers/openai_tools';
|
|
9
|
+
import { ChatGeneration } from '@langchain/core/outputs';
|
|
10
|
+
import { ToolCall } from '@langchain/core/messages/tool';
|
|
11
|
+
|
|
12
|
+
/**
 * Constructor parameters accepted by AnthropicToolsOutputParser.
 * Adds nothing of its own; it is the JsonOutputKeyToolsParserParams shape under a local name.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
interface AnthropicToolsOutputParserParams<T extends Record<string, any>>
  extends JsonOutputKeyToolsParserParams<T> {}
|
|
15
|
+
|
|
16
|
+
/**
 * Output parser that takes the first `tool_use` block found in the supplied
 * chat generations and returns its arguments, optionally validated against a
 * Zod schema.
 */
export class AnthropicToolsOutputParser<
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  T extends Record<string, any> = Record<string, any>,
> extends BaseLLMOutputParser<T> {
  static lc_name() {
    return 'AnthropicToolsOutputParser';
  }

  lc_namespace = ['langchain', 'anthropic', 'output_parsers'];

  returnId = false;

  /** The type of tool calls to return. */
  keyName: string;

  /** Whether to return only the first tool call. */
  returnSingle = false;

  /** Schema the tool arguments are validated against; validation is skipped when unset. */
  zodSchema?: z.ZodType<T>;

  constructor(params: AnthropicToolsOutputParserParams<T>) {
    super(params);
    this.keyName = params.keyName;
    this.returnSingle = params.returnSingle ?? this.returnSingle;
    this.zodSchema = params.zodSchema;
  }

  /**
   * Normalizes and validates tool arguments.
   *
   * @param result Tool arguments; a string is parsed as JSON first, anything
   *   else is used as-is.
   * @returns The arguments, schema-validated when `zodSchema` is set.
   * @throws OutputParserException when JSON parsing or schema validation fails.
   */
  protected async _validateResult(result: unknown): Promise<T> {
    let parsedResult = result;
    if (typeof result === 'string') {
      try {
        parsedResult = JSON.parse(result);
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
      } catch (e: any) {
        throw new OutputParserException(
          `Failed to parse. Text: "${JSON.stringify(
            result,
            null,
            2
          )}". Error: ${JSON.stringify(e.message)}`,
          result
        );
      }
    }
    if (this.zodSchema === undefined) {
      return parsedResult as T;
    }
    const zodParsedResult = await this.zodSchema.safeParseAsync(parsedResult);
    if (zodParsedResult.success) {
      return zodParsedResult.data;
    }
    throw new OutputParserException(
      `Failed to parse. Text: "${JSON.stringify(
        result,
        null,
        2
      )}". Error: ${JSON.stringify(zodParsedResult.error.errors)}`,
      JSON.stringify(parsedResult, null, 2)
    );
  }

  /**
   * Returns the validated arguments of the first tool call found across
   * `generations`.
   *
   * @param generations Chat generations whose message content is inspected.
   * @returns The validated arguments of the first tool call.
   * @throws Error when no generation contains a `tool_use` block.
   */
  async parseResult(generations: ChatGeneration[]): Promise<T> {
    const tools = generations.flatMap((generation) => {
      const { message } = generation;
      if (!Array.isArray(message.content)) {
        return [];
      }
      // Take at most one call per generation. slice() yields [] when there is
      // none, so a tool-less generation cannot put `undefined` ahead of a
      // valid call from a later generation.
      return extractToolCalls(message.content).slice(0, 1);
    });
    const [tool] = tools;
    if (tool === undefined) {
      throw new Error(
        'No parseable tool calls provided to AnthropicToolsOutputParser.'
      );
    }
    return this._validateResult(tool.args);
  }
}
|
|
99
|
+
|
|
100
|
+
/**
 * Converts the `tool_use` blocks of a message's content array into tool calls.
 *
 * @param content Content blocks of a message; blocks whose `type` is not
 *   `'tool_use'` are ignored.
 * @returns One tool call per `tool_use` block, in the original order.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export function extractToolCalls(content: Record<string, any>[]) {
  return content
    .filter((part) => part.type === 'tool_use')
    .map(
      (part): ToolCall => ({
        name: part.name,
        args: part.input,
        id: part.id,
        type: 'tool_call',
      })
    );
}
|
package/src/llm/openai/index.ts
CHANGED
|
@@ -7,6 +7,7 @@ import {
|
|
|
7
7
|
ChatOpenAI as OriginalChatOpenAI,
|
|
8
8
|
AzureChatOpenAI as OriginalAzureChatOpenAI,
|
|
9
9
|
} from '@langchain/openai';
|
|
10
|
+
import type { OpenAICoreRequestOptions } from 'node_modules/@langchain/deepseek/node_modules/@langchain/openai';
|
|
10
11
|
import type * as t from '@langchain/openai';
|
|
11
12
|
|
|
12
13
|
function createAbortHandler(controller: AbortController): () => void {
|
|
@@ -191,8 +192,8 @@ export class ChatDeepSeek extends OriginalChatDeepSeek {
|
|
|
191
192
|
return this.client;
|
|
192
193
|
}
|
|
193
194
|
protected _getClientOptions(
|
|
194
|
-
options?:
|
|
195
|
-
):
|
|
195
|
+
options?: OpenAICoreRequestOptions
|
|
196
|
+
): OpenAICoreRequestOptions {
|
|
196
197
|
if (!(this.client as OpenAIClient | undefined)) {
|
|
197
198
|
const openAIEndpointConfig: t.OpenAIEndpointConfig = {
|
|
198
199
|
baseURL: this.clientConfig.baseURL,
|
|
@@ -214,7 +215,7 @@ export class ChatDeepSeek extends OriginalChatDeepSeek {
|
|
|
214
215
|
const requestOptions = {
|
|
215
216
|
...this.clientConfig,
|
|
216
217
|
...options,
|
|
217
|
-
} as
|
|
218
|
+
} as OpenAICoreRequestOptions;
|
|
218
219
|
return requestOptions;
|
|
219
220
|
}
|
|
220
221
|
}
|
|
@@ -224,8 +225,8 @@ export class ChatXAI extends OriginalChatXAI {
|
|
|
224
225
|
return this.client;
|
|
225
226
|
}
|
|
226
227
|
protected _getClientOptions(
|
|
227
|
-
options?:
|
|
228
|
-
):
|
|
228
|
+
options?: OpenAICoreRequestOptions
|
|
229
|
+
): OpenAICoreRequestOptions {
|
|
229
230
|
if (!(this.client as OpenAIClient | undefined)) {
|
|
230
231
|
const openAIEndpointConfig: t.OpenAIEndpointConfig = {
|
|
231
232
|
baseURL: this.clientConfig.baseURL,
|
|
@@ -247,7 +248,7 @@ export class ChatXAI extends OriginalChatXAI {
|
|
|
247
248
|
const requestOptions = {
|
|
248
249
|
...this.clientConfig,
|
|
249
250
|
...options,
|
|
250
|
-
} as
|
|
251
|
+
} as OpenAICoreRequestOptions;
|
|
251
252
|
return requestOptions;
|
|
252
253
|
}
|
|
253
254
|
}
|
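package/src/run.ts
CHANGED
(The run.ts hunk is not shown here; the 141-line new-file hunk that follows appears to be package/src/scripts/search.ts.)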
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/* eslint-disable no-console */
|
|
2
|
+
// src/scripts/search.ts
|
|
3
|
+
import { config } from 'dotenv';
|
|
4
|
+
config();
|
|
5
|
+
import { HumanMessage, BaseMessage } from '@langchain/core/messages';
|
|
6
|
+
import type * as t from '@/types';
|
|
7
|
+
import { ChatModelStreamHandler, createContentAggregator } from '@/stream';
|
|
8
|
+
import { ToolEndHandler, ModelEndHandler } from '@/events';
|
|
9
|
+
import { createSearchTool } from '@/tools/search';
|
|
10
|
+
|
|
11
|
+
import { getArgs } from '@/scripts/args';
|
|
12
|
+
import { Run } from '@/run';
|
|
13
|
+
import { GraphEvents, Callback } from '@/common';
|
|
14
|
+
import { getLLMConfig } from '@/utils/llmConfig';
|
|
15
|
+
|
|
16
|
+
const conversationHistory: BaseMessage[] = [];
|
|
17
|
+
/**
 * Runs one streamed, search-enabled conversation turn and logs what happens.
 *
 * Reads the user name, location and provider from getArgs(), builds a
 * standard-graph Run with the search tool attached, sends a single human
 * message, and appends the messages the run produced to the module-level
 * conversationHistory.
 */
async function testStandardStreaming(): Promise<void> {
  const { userName, location, provider, currentDate } = await getArgs();
  const { contentParts, aggregateContent } = createContentAggregator();

  // Event callbacks: each prints a banner and, except for TOOL_START, feeds
  // the event into the content aggregator.
  const onRunStepCompleted = (
    event: GraphEvents.ON_RUN_STEP_COMPLETED,
    data: t.StreamEventData
  ): void => {
    console.log('====== ON_RUN_STEP_COMPLETED ======');
    // console.dir(data, { depth: null });
    aggregateContent({
      event,
      data: data as unknown as { result: t.ToolEndEvent },
    });
  };

  const onRunStep = (
    event: GraphEvents.ON_RUN_STEP,
    data: t.StreamEventData
  ): void => {
    console.log('====== ON_RUN_STEP ======');
    console.dir(data, { depth: null });
    aggregateContent({ event, data: data as t.RunStep });
  };

  const onRunStepDelta = (
    event: GraphEvents.ON_RUN_STEP_DELTA,
    data: t.StreamEventData
  ): void => {
    console.log('====== ON_RUN_STEP_DELTA ======');
    console.dir(data, { depth: null });
    aggregateContent({ event, data: data as t.RunStepDeltaEvent });
  };

  const onMessageDelta = (
    event: GraphEvents.ON_MESSAGE_DELTA,
    data: t.StreamEventData
  ): void => {
    console.log('====== ON_MESSAGE_DELTA ======');
    console.dir(data, { depth: null });
    aggregateContent({ event, data: data as t.MessageDeltaEvent });
  };

  const onToolStart = (
    _event: string,
    data: t.StreamEventData,
    metadata?: Record<string, unknown>
  ): void => {
    console.log('====== TOOL_START ======');
    // console.dir(data, { depth: null });
  };

  const handlers = {
    [GraphEvents.TOOL_END]: new ToolEndHandler(),
    [GraphEvents.CHAT_MODEL_END]: new ModelEndHandler(),
    [GraphEvents.CHAT_MODEL_STREAM]: new ChatModelStreamHandler(),
    [GraphEvents.ON_RUN_STEP_COMPLETED]: { handle: onRunStepCompleted },
    [GraphEvents.ON_RUN_STEP]: { handle: onRunStep },
    [GraphEvents.ON_RUN_STEP_DELTA]: { handle: onRunStepDelta },
    [GraphEvents.ON_MESSAGE_DELTA]: { handle: onMessageDelta },
    [GraphEvents.TOOL_START]: { handle: onToolStart },
  };

  const searchRun = await Run.create<t.IState>({
    runId: 'test-run-id',
    graphConfig: {
      type: 'standard',
      llmConfig: getLLMConfig(provider),
      tools: [createSearchTool()],
      instructions:
        'You are a friendly AI assistant. Always address the user by their name.',
      additional_instructions: `The user's name is ${userName} and they are located in ${location}.`,
    },
    returnContent: true,
    customHandlers: handlers,
  });

  // Named streamConfig so it does not shadow the dotenv `config` import.
  const streamConfig = {
    configurable: {
      provider,
      thread_id: 'conversation-num-1',
    },
    streamMode: 'values',
    version: 'v2' as const,
  };

  console.log('Test 1: Weather query (content parts test)');

  // const userMessage = `
  // Make a search for the weather in ${location} today, which is ${currentDate}.
  // Before making the search, please let me know what you're about to do, then immediately start searching without hesitation.
  // Make sure to always refer to me by name, which is ${userName}.
  // After giving me a thorough summary, tell me a joke about the weather forecast we went over.
  // `;
  const userMessage = 'Are massage guns good?';

  conversationHistory.push(new HumanMessage(userMessage));

  const finalContentParts = await searchRun.processStream(
    { messages: conversationHistory },
    streamConfig
  );
  const producedMessages = searchRun.getRunMessages();
  if (producedMessages) {
    conversationHistory.push(...producedMessages);
    console.dir(conversationHistory, { depth: null });
  }
  // console.dir(finalContentParts, { depth: null });
  console.log('\n\n====================\n\n');
  // console.dir(contentParts, { depth: null });
}
|
|
129
|
+
|
|
130
|
+
// Last-resort handler for rejections nobody awaited: report the rejection,
// dump the conversation collected so far, and exit with a failure code.
process.on('unhandledRejection', (reason, promise) => {
  console.error('Unhandled Rejection at:', promise, 'reason:', reason);
  console.log('Conversation history:');
  // Dump the history announced above, matching the .catch() path below.
  console.dir(conversationHistory, { depth: null });
  process.exit(1);
});

// Script entry point: run the streaming test; on failure print the error and
// the conversation collected so far, then exit with a failure code.
testStandardStreaming().catch((err) => {
  console.error(err);
  console.log('Conversation history:');
  console.dir(conversationHistory, { depth: null });
  process.exit(1);
});
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
/* eslint-disable no-console */
|
|
2
|
+
import axios from 'axios';
|
|
3
|
+
|
|
4
|
+
/**
 * Per-request options for FirecrawlScraper.scrapeUrl.
 * Every field is copied into the body of the scrape request.
 */
export interface FirecrawlScrapeOptions {
  /** Output formats to request; the scraper's default formats are used when omitted. */
  formats?: string[];
  /** Sent as `includeTags` in the request body. */
  includeTags?: string[];
  /** Sent as `excludeTags` in the request body. */
  excludeTags?: string[];
  /** Sent as `headers` in the request body (not the headers of the API call itself). */
  headers?: Record<string, string>;
  /** Sent as `waitFor` in the request body. */
  waitFor?: number;
  /** Timeout for this request; the scraper's configured timeout is used when omitted. */
  timeout?: number;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Page metadata attached to a scrape response.
 * All known keys are optional, and arbitrary additional keys are permitted by
 * the index signature at the end.
 */
interface ScrapeMetadata {
  // --- Source information ---
  sourceURL?: string;
  url?: string;
  scrapeId?: string;
  statusCode?: number;

  // --- Basic page metadata ---
  title?: string;
  description?: string;
  language?: string;
  favicon?: string;
  viewport?: string;
  robots?: string;
  'theme-color'?: string;

  // --- Open Graph (both raw `og:*` keys and camelCase variants) ---
  'og:url'?: string;
  'og:title'?: string;
  'og:description'?: string;
  'og:type'?: string;
  'og:image'?: string;
  'og:image:width'?: string;
  'og:image:height'?: string;
  'og:site_name'?: string;
  ogUrl?: string;
  ogTitle?: string;
  ogDescription?: string;
  ogImage?: string;
  ogSiteName?: string;

  // --- Article ---
  'article:author'?: string;
  'article:published_time'?: string;
  'article:modified_time'?: string;
  'article:section'?: string;
  'article:tag'?: string;
  'article:publisher'?: string;
  publishedTime?: string;
  modifiedTime?: string;

  // --- Twitter ---
  'twitter:site'?: string;
  'twitter:creator'?: string;
  'twitter:card'?: string;
  'twitter:image'?: string;
  'twitter:dnt'?: string;
  'twitter:app:name:iphone'?: string;
  'twitter:app:id:iphone'?: string;
  'twitter:app:url:iphone'?: string;
  'twitter:app:name:ipad'?: string;
  'twitter:app:id:ipad'?: string;
  'twitter:app:url:ipad'?: string;
  'twitter:app:name:googleplay'?: string;
  'twitter:app:id:googleplay'?: string;
  'twitter:app:url:googleplay'?: string;

  // --- Facebook ---
  'fb:app_id'?: string;

  // --- App links ---
  'al:ios:url'?: string;
  'al:ios:app_name'?: string;
  'al:ios:app_store_id'?: string;

  // Any other key that may be present on the metadata object.
  [key: string]: string | number | boolean | null | undefined;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Result of a scrape, as returned by FirecrawlScraper.scrapeUrl.
 * The scraper itself produces `{ success: false, error }` when the API key is
 * missing or the HTTP request fails.
 */
export interface FirecrawlScrapeResponse {
  /** Whether the scrape succeeded. */
  success: boolean;
  /** Scraped payload; the content extractor prefers `markdown`, then `html`, then `rawHtml`. */
  data?: {
    markdown?: string;
    html?: string;
    rawHtml?: string;
    screenshot?: string;
    links?: string[];
    metadata?: ScrapeMetadata;
  };
  /** Failure description when the scrape did not succeed. */
  error?: string;
}
|
|
87
|
+
|
|
88
|
+
/** Construction options for FirecrawlScraper; every field has a fallback. */
export interface FirecrawlScraperConfig {
  /** API key; falls back to the FIRECRAWL_API_KEY environment variable. */
  apiKey?: string;
  /**
   * Base URL of the API; falls back to FIRECRAWL_BASE_URL, then to
   * `https://api.firecrawl.dev`. The scraper appends `/v1/scrape` itself.
   */
  apiUrl?: string;
  /** Default formats to request; `['markdown', 'html']` when omitted. */
  formats?: string[];
  /** Default request timeout; 30000 when omitted (presumably milliseconds — confirm against the API). */
  timeout?: number;
}
|
|
94
|
+
/**
 * Derives a display domain (hostname without a leading `www.`) for a result.
 *
 * The URL is taken from the first non-blank value among `metadata.sourceURL`,
 * `metadata.url` and `link`.
 *
 * @param link Fallback URL of the result.
 * @param metadata Optional scrape metadata that may carry the canonical URL.
 * @returns The domain, or `undefined` when no candidate is available or the
 *   chosen candidate cannot be parsed as a URL.
 */
const getDomainName = (
  link: string,
  metadata?: ScrapeMetadata
): string | undefined => {
  // Blank strings count as absent so an empty sourceURL/url cannot shadow a
  // usable link (plain `??` only skips null/undefined).
  const url = [metadata?.sourceURL, metadata?.url, link].find(
    (candidate) => typeof candidate === 'string' && candidate.trim() !== ''
  );
  if (url === undefined) {
    // Nothing to parse: a missing URL is an expected absence, not an error.
    return undefined;
  }

  try {
    const domain = new URL(url).hostname.replace(/^www\./, '');
    return domain || undefined;
  } catch (e) {
    // URL parsing failed
    console.error('Error parsing URL:', e);
    return undefined;
  }
};
|
|
111
|
+
|
|
112
|
+
/**
 * Picks a human-readable source name for a result.
 *
 * Tries, in order: `ogSiteName`, `og:site_name`, the last `|`-separated
 * segment of `title`, and `twitter:site` without its leading `@`. The first
 * one that is a non-blank string wins; otherwise the domain name is used.
 *
 * @param link URL of the result, used for the domain fallback.
 * @param metadata Optional scrape metadata.
 * @returns The attribution, or whatever the domain fallback yields.
 */
export function getAttribution(
  link: string,
  metadata?: ScrapeMetadata
): string | undefined {
  if (metadata) {
    const candidates = [
      metadata.ogSiteName,
      metadata['og:site_name'],
      metadata.title?.split('|').pop()?.trim(),
      metadata['twitter:site']?.replace(/^@/, ''),
    ];

    for (const candidate of candidates) {
      if (typeof candidate === 'string' && candidate.trim() !== '') {
        return candidate;
      }
    }
  }

  return getDomainName(link, metadata);
}
|
|
134
|
+
|
|
135
|
+
/**
 * Firecrawl scraper implementation
 * Uses the Firecrawl API to scrape web pages
 */
export class FirecrawlScraper {
  private apiKey: string;
  private apiUrl: string;
  private defaultFormats: string[];
  private timeout: number;

  /**
   * @param config Optional settings. The API key falls back to
   *   FIRECRAWL_API_KEY, the base URL to FIRECRAWL_BASE_URL and then
   *   `https://api.firecrawl.dev`, formats to `['markdown', 'html']`, and
   *   timeout to 30000.
   */
  constructor(config: FirecrawlScraperConfig = {}) {
    this.apiKey = config.apiKey ?? process.env.FIRECRAWL_API_KEY ?? '';

    const baseUrl =
      config.apiUrl ??
      process.env.FIRECRAWL_BASE_URL ??
      'https://api.firecrawl.dev';
    // Strip trailing slashes so the endpoint never contains `//v1/scrape`.
    this.apiUrl = `${baseUrl.replace(/\/+$/, '')}/v1/scrape`;

    this.defaultFormats = config.formats ?? ['markdown', 'html'];
    this.timeout = config.timeout ?? 30000;

    if (!this.apiKey) {
      console.warn('FIRECRAWL_API_KEY is not set. Scraping will not work.');
    }

    console.log(`Firecrawl scraper initialized with API URL: ${this.apiUrl}`);
  }

  /**
   * Scrape a single URL
   *
   * Never rejects: a missing API key or a failed request is reported as
   * `{ success: false, error }` in the returned tuple.
   *
   * @param url URL to scrape
   * @param options Scrape options
   * @returns Tuple of the requested URL and the scrape response
   */
  async scrapeUrl(
    url: string,
    options: FirecrawlScrapeOptions = {}
  ): Promise<[string, FirecrawlScrapeResponse]> {
    if (!this.apiKey) {
      return [
        url,
        {
          success: false,
          error: 'FIRECRAWL_API_KEY is not set',
        },
      ];
    }

    // One effective timeout for both the API-side scrape and the HTTP client,
    // so a longer per-request timeout is not cut short by the client default.
    const timeout = options.timeout ?? this.timeout;

    try {
      const response = await axios.post(
        this.apiUrl,
        {
          url,
          formats: options.formats ?? this.defaultFormats,
          includeTags: options.includeTags,
          excludeTags: options.excludeTags,
          headers: options.headers,
          waitFor: options.waitFor,
          timeout,
        },
        {
          headers: {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${this.apiKey}`,
          },
          timeout,
        }
      );

      return [url, response.data];
    } catch (error) {
      const errorMessage =
        error instanceof Error ? error.message : String(error);
      return [
        url,
        {
          success: false,
          error: `Firecrawl API request failed: ${errorMessage}`,
        },
      ];
    }
  }

  /**
   * Extract content from scrape response
   *
   * Preference order is markdown, then HTML, then raw HTML; the first field
   * that is not null/undefined is returned (even if it is an empty string).
   *
   * @param response Scrape response
   * @returns Extracted content or empty string if not available
   */
  extractContent(response: FirecrawlScrapeResponse): string {
    if (!response.success || !response.data) {
      return '';
    }

    const { markdown, html, rawHtml } = response.data;
    return markdown ?? html ?? rawHtml ?? '';
  }

  /**
   * Extract metadata from scrape response
   * @param response Scrape response
   * @returns Metadata object, or an empty object when the response failed or
   *   carries no metadata
   */
  extractMetadata(response: FirecrawlScrapeResponse): ScrapeMetadata {
    if (!response.success || !response.data || !response.data.metadata) {
      return {};
    }

    return response.data.metadata;
  }
}
|
|
260
|
+
|
|
261
|
+
/**
 * Factory for FirecrawlScraper.
 * @param config Scraper configuration, passed to the constructor unchanged
 * @returns A new FirecrawlScraper instance
 */
export const createFirecrawlScraper = (
  config: FirecrawlScraperConfig = {}
): FirecrawlScraper => new FirecrawlScraper(config);
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import type * as t from './types';
|
|
2
|
+
|
|
3
|
+
/**
 * Renders search results as plain text for an LLM prompt.
 *
 * Emits, in order and only when present: web results (with citation anchors
 * and content highlights), the knowledge graph, the answer box, and
 * "people also ask" entries. Each section starts with a `=== Title ===` line.
 *
 * @param results Search result data to render.
 * @returns The formatted text with leading and trailing whitespace removed.
 */
export function formatResultsForLLM(results: t.SearchResultData): string {
  let output = '';

  const addSection = (title: string): void => {
    output += `\n=== ${title} ===\n`;
  };

  // Joins lines with '\n', dropping only absent (undefined) optional lines.
  // Empty strings are kept on purpose: they are the blank separators and the
  // trailing newline that stop consecutive entries from running together.
  const joinLines = (lines: Array<string | undefined>): string =>
    lines.filter((line): line is string => line !== undefined).join('\n');

  // Organic (web) results
  const organic = results.organic ?? [];
  if (organic.length) {
    addSection('Web Results');
    organic.forEach((r, i) => {
      const highlights = (r.highlights ?? [])
        .filter((h) => h.text.trim().length > 0)
        .map((h) => `[Relevance: ${h.score.toFixed(2)}]\n${h.text.trim()}`);

      output += joinLines([
        `Source ${i + 1}: ${r.title ?? '(no title)'}`,
        `Citation Anchor: \\ue202turn0search${i + 1}`,
        `URL: ${r.link}`,
        r.snippet != null ? `Summary: ${r.snippet}` : undefined,
        r.date != null ? `Date: ${r.date}` : undefined,
        r.attribution != null ? `Source: ${r.attribution}` : undefined,
        '',
        '--- Content Highlights ---',
        ...highlights,
        '',
      ]);
    });
  }

  // Ignoring these sections for now
  // // Top stories (news)
  // const topStores = results.topStories ?? [];
  // if (topStores.length) {
  //   addSection('News Results');
  //   topStores.forEach((r, i) => {
  //     output += [
  //       `Anchor: \ue202turn0news${i + 1}`,
  //       `Title: ${r.title ?? '(no title)'}`,
  //       `URL: ${r.link}`,
  //       r.snippet != null ? `Snippet: ${r.snippet}` : '',
  //       r.date != null ? `Date: ${r.date}` : '',
  //       r.attribution != null ? `Source: ${r.attribution}` : '',
  //       ''
  //     ].filter(Boolean).join('\n');
  //   });
  // }

  // // Images
  // const images = results.images ?? [];
  // if (images.length) {
  //   addSection('Image Results');
  //   images.forEach((img, i) => {
  //     output += [
  //       `Anchor: \ue202turn0image${i + 1}`,
  //       `Title: ${img.title ?? '(no title)'}`,
  //       `Image URL: ${img.imageUrl}`,
  //       ''
  //     ].join('\n');
  //   });
  // }

  // Knowledge Graph
  const knowledgeGraph = results.knowledgeGraph;
  if (knowledgeGraph != null) {
    addSection('Knowledge Graph');
    output += joinLines([
      `Title: ${knowledgeGraph.title ?? '(no title)'}`,
      knowledgeGraph.description != null
        ? `Description: ${knowledgeGraph.description}`
        : undefined,
      knowledgeGraph.type != null ? `Type: ${knowledgeGraph.type}` : undefined,
      knowledgeGraph.imageUrl != null
        ? `Image URL: ${knowledgeGraph.imageUrl}`
        : undefined,
      knowledgeGraph.attributes != null
        ? `Attributes: ${JSON.stringify(knowledgeGraph.attributes, null, 2)}`
        : undefined,
      '',
    ]);
  }

  // Answer Box
  const answerBox = results.answerBox;
  if (answerBox != null) {
    addSection('Answer Box');
    output += joinLines([
      answerBox.title != null ? `Title: ${answerBox.title}` : undefined,
      answerBox.answer != null ? `Answer: ${answerBox.answer}` : undefined,
      answerBox.snippet != null ? `Snippet: ${answerBox.snippet}` : undefined,
      answerBox.date != null ? `Date: ${answerBox.date}` : undefined,
      '',
    ]);
  }

  // People also ask
  const peopleAlsoAsk = results.peopleAlsoAsk ?? [];
  if (peopleAlsoAsk.length) {
    addSection('People Also Ask');
    peopleAlsoAsk.forEach((p) => {
      output += joinLines([`Q: ${p.question}`, `A: ${p.answer}`, '']);
    });
  }

  return output.trim();
}
|