@darbotlabs/darbot-browser-mcp 0.1.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +249 -158
- package/cli.js +1 -1
- package/config.d.ts +77 -1
- package/index.d.ts +1 -1
- package/index.js +1 -1
- package/lib/ai/context.js +150 -0
- package/lib/ai/guardrails.js +382 -0
- package/lib/ai/integration.js +397 -0
- package/lib/ai/intent.js +237 -0
- package/lib/ai/manualPromise.js +111 -0
- package/lib/ai/memory.js +273 -0
- package/lib/ai/ml-scorer.js +265 -0
- package/lib/ai/orchestrator-tools.js +292 -0
- package/lib/ai/orchestrator.js +473 -0
- package/lib/ai/planner.js +300 -0
- package/lib/ai/reporter.js +493 -0
- package/lib/ai/workflow.js +407 -0
- package/lib/auth/apiKeyAuth.js +46 -0
- package/lib/auth/entraAuth.js +110 -0
- package/lib/auth/entraJwtVerifier.js +117 -0
- package/lib/auth/index.js +210 -0
- package/lib/auth/managedIdentityAuth.js +175 -0
- package/lib/auth/mcpOAuthProvider.js +186 -0
- package/lib/auth/tunnelAuth.js +120 -0
- package/lib/browserContextFactory.js +1 -1
- package/lib/browserServer.js +1 -1
- package/lib/cdpRelay.js +2 -2
- package/lib/common.js +68 -0
- package/lib/config.js +62 -3
- package/lib/connection.js +1 -1
- package/lib/context.js +1 -1
- package/lib/fileUtils.js +1 -1
- package/lib/guardrails.js +382 -0
- package/lib/health.js +178 -0
- package/lib/httpServer.js +1 -1
- package/lib/index.js +1 -1
- package/lib/javascript.js +1 -1
- package/lib/manualPromise.js +1 -1
- package/lib/memory.js +273 -0
- package/lib/openapi.js +373 -0
- package/lib/orchestrator.js +473 -0
- package/lib/package.js +1 -1
- package/lib/pageSnapshot.js +17 -2
- package/lib/planner.js +302 -0
- package/lib/program.js +17 -5
- package/lib/reporter.js +493 -0
- package/lib/resources/resource.js +1 -1
- package/lib/server.js +5 -3
- package/lib/tab.js +1 -1
- package/lib/tools/ai-native.js +298 -0
- package/lib/tools/autonomous.js +147 -0
- package/lib/tools/clock.js +183 -0
- package/lib/tools/common.js +1 -1
- package/lib/tools/console.js +1 -1
- package/lib/tools/diagnostics.js +132 -0
- package/lib/tools/dialogs.js +1 -1
- package/lib/tools/emulation.js +155 -0
- package/lib/tools/files.js +1 -1
- package/lib/tools/install.js +1 -1
- package/lib/tools/keyboard.js +1 -1
- package/lib/tools/navigate.js +1 -1
- package/lib/tools/network.js +1 -1
- package/lib/tools/pageSnapshot.js +58 -0
- package/lib/tools/pdf.js +1 -1
- package/lib/tools/profiles.js +76 -25
- package/lib/tools/screenshot.js +1 -1
- package/lib/tools/scroll.js +93 -0
- package/lib/tools/snapshot.js +1 -1
- package/lib/tools/storage.js +328 -0
- package/lib/tools/tab.js +16 -0
- package/lib/tools/tabs.js +1 -1
- package/lib/tools/testing.js +1 -1
- package/lib/tools/tool.js +1 -1
- package/lib/tools/utils.js +1 -1
- package/lib/tools/vision.js +1 -1
- package/lib/tools/wait.js +1 -1
- package/lib/tools.js +22 -1
- package/lib/transport.js +251 -31
- package/package.json +54 -21
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) DarbotLabs.
|
|
3
|
+
*
|
|
4
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
* you may not use this file except in compliance with the License.
|
|
6
|
+
* You may obtain a copy of the License at
|
|
7
|
+
*
|
|
8
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
*
|
|
10
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
* See the License for the specific language governing permissions and
|
|
14
|
+
* limitations under the License.
|
|
15
|
+
*/
|
|
16
|
+
/**
 * A promise that is settled from the outside, through its own `resolve()` and
 * `reject()` methods, rather than from inside an executor callback.
 *
 * Promises derived from it (`then`, `catch`, `finally`) are plain `Promise`
 * instances because `Symbol.species` is overridden below.
 */
export class ManualPromise extends Promise {
  _resolve;
  _reject;
  _isDone;

  constructor() {
    // `this` cannot be touched before super() returns, so the settle
    // callbacks handed to the executor are parked in a local holder first.
    const settle = {};
    super((fulfill, fail) => {
      settle.fulfill = fulfill;
      settle.fail = fail;
    });
    this._resolve = settle.fulfill;
    this._reject = settle.fail;
    this._isDone = false;
  }

  /**
   * @returns {boolean} true once `resolve()` or `reject()` has been called.
   */
  isDone() {
    return this._isDone;
  }

  /**
   * Fulfil the promise with `t` and mark it as done.
   */
  resolve(t) {
    this._isDone = true;
    this._resolve(t);
  }

  /**
   * Reject the promise with `e` and mark it as done.
   */
  reject(e) {
    this._isDone = true;
    this._reject(e);
  }

  // Derived promises are created with the built-in constructor; the
  // ManualPromise constructor takes no executor and could not build them.
  static get [Symbol.species]() {
    return Promise;
  }

  get [Symbol.toStringTag]() {
    return 'ManualPromise';
  }
}
|
|
49
|
+
/**
 * Lets callers race their promises against the termination of a scope.
 *
 * While a race is in flight a ManualPromise is registered in
 * `_terminatePromises`, keyed to the stack frames captured when the race
 * started. `reject()` and `close()` settle all registered promises, which
 * ends the corresponding races.
 */
export class LongStandingScope {
  _terminateError;
  _closeError;
  _terminatePromises = new Map();
  _isClosed = false;

  /**
   * Mark the scope closed and end every in-flight race with `error` itself.
   */
  reject(error) {
    this._isClosed = true;
    this._terminateError = error;
    this._terminatePromises.forEach((_frames, pending) => pending.resolve(error));
  }

  /**
   * Mark the scope closed and end every in-flight race with a clone of
   * `error` whose stack is built from that race's captured frames.
   */
  close(error) {
    this._isClosed = true;
    this._closeError = error;
    this._terminatePromises.forEach((frames, pending) => pending.resolve(cloneError(error, frames)));
  }

  /**
   * @returns {boolean} true once `reject()` or `close()` has been called.
   */
  isClosed() {
    return this._isClosed;
  }

  /**
   * Race `promise` against each of the given scopes; the first scope race to
   * settle decides the outcome.
   */
  static async raceMultiple(scopes, promise) {
    const contenders = scopes.map(scope => scope.race(promise));
    return Promise.race(contenders);
  }

  /**
   * Race one promise, or an array of promises, against scope termination.
   * Rejects with the termination error if the scope ends first.
   */
  async race(promise) {
    const candidates = Array.isArray(promise) ? promise : [promise];
    return this._race(candidates, false);
  }

  /**
   * Like `race()` for a single promise, but yields `defaultValue` instead of
   * rejecting when the scope ends first.
   */
  async safeRace(promise, defaultValue) {
    return this._race([promise], true, defaultValue);
  }

  async _race(promises, safe, defaultValue) {
    const terminatePromise = new ManualPromise();
    const frames = captureRawStack();
    // A scope that already ended settles the race immediately.
    if (this._terminateError)
      terminatePromise.resolve(this._terminateError);
    if (this._closeError)
      terminatePromise.resolve(cloneError(this._closeError, frames));
    this._terminatePromises.set(terminatePromise, frames);
    // The terminate promise is always *resolved* with the error; turning it
    // into a rejection (or the safe default) happens here.
    const onTerminated = e => {
      if (safe)
        return defaultValue;
      return Promise.reject(e);
    };
    try {
      return await Promise.race([terminatePromise.then(onTerminated), ...promises]);
    }
    finally {
      this._terminatePromises.delete(terminatePromise);
    }
  }
}
|
|
97
|
+
/**
 * Build a fresh Error carrying the name and message of `error`, with a stack
 * made of a `name:message` header line followed by the given frame lines.
 *
 * @param {Error} error - error whose name and message are copied
 * @param {string[]} frames - lines placed after the header in the stack
 * @returns {Error} the new error
 */
function cloneError(error, frames) {
  const { name, message } = error;
  const stackLines = Array.from(frames);
  stackLines.unshift(name + ':' + message);

  const clone = new Error();
  clone.name = name;
  clone.message = message;
  clone.stack = stackLines.join('\n');
  return clone;
}
|
|
104
|
+
/**
 * Capture the current call stack as an array of lines.
 *
 * `Error.stackTraceLimit` is raised to 50 while the stack is captured and is
 * always put back afterwards, even if reading the stack throws (for example
 * from a user-installed `Error.prepareStackTrace` hook).
 *
 * @returns {string[]} the lines of `new Error().stack`; `['']` when no string
 *   stack is available (missing stack, or a hook that returns a non-string).
 */
function captureRawStack() {
  const previousLimit = Error.stackTraceLimit;
  Error.stackTraceLimit = 50;
  try {
    const { stack } = new Error();
    // A custom prepareStackTrace may return something that is not a string;
    // treat that like a missing stack instead of crashing on `.split`.
    const text = typeof stack === 'string' ? stack : '';
    return text.split('\n');
  }
  finally {
    Error.stackTraceLimit = previousLimit;
  }
}
|
package/lib/ai/memory.js
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) DarbotLabs.
|
|
3
|
+
*
|
|
4
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
* you may not use this file except in compliance with the License.
|
|
6
|
+
* You may obtain a copy of the License at
|
|
7
|
+
*
|
|
8
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
*
|
|
10
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
* See the License for the specific language governing permissions and
|
|
14
|
+
* limitations under the License.
|
|
15
|
+
*/
|
|
16
|
+
import crypto from 'node:crypto';
|
|
17
|
+
import fs from 'node:fs';
|
|
18
|
+
import path from 'node:path';
|
|
19
|
+
import debug from 'debug';
|
|
20
|
+
const log = debug('darbot:memory');
|
|
21
|
+
/**
 * Local file-based memory storage implementation.
 *
 * Every crawl state is persisted as `<stateHash>.json` inside `storagePath`.
 * Only `*.json` entries of that directory are treated as state files.
 */
export class LocalMemoryStorage {
  storagePath;
  maxStates;

  /**
   * @param {{ storagePath?: string, maxStates?: number }} [config]
   *   `storagePath` defaults to `<cwd>/.darbot/memory`, `maxStates` to 1000.
   *   The storage directory is created synchronously if it does not exist.
   */
  constructor(config = {}) {
    this.storagePath = config.storagePath || path.join(process.cwd(), '.darbot', 'memory');
    this.maxStates = config.maxStates || 1000;
    this.ensureStorageDirectory();
  }

  /** Create the storage directory (and missing parents) when absent. */
  ensureStorageDirectory() {
    if (!fs.existsSync(this.storagePath))
      fs.mkdirSync(this.storagePath, { recursive: true });
  }

  /**
   * @param {string} stateHash
   * @returns {string} path of the JSON file holding that state
   */
  getStatePath(stateHash) {
    return path.join(this.storagePath, `${stateHash}.json`);
  }

  /**
   * Write `state` to disk under its `stateHash`, then trim the store down to
   * `maxStates` entries. Errors are logged and rethrown.
   */
  async storeState(state) {
    try {
      const statePath = this.getStatePath(state.stateHash);
      await fs.promises.writeFile(statePath, JSON.stringify(state, null, 2));
      log('Stored state:', state.stateHash, state.url);
      // Clean up old states if we exceed the limit
      await this.cleanupOldStates();
    }
    catch (error) {
      log('Error storing state:', error);
      throw error;
    }
  }

  /**
   * @param {string} stateHash
   * @returns {Promise<object|null>} the parsed state, or null when the file
   *   is missing or cannot be read/parsed (the error is logged).
   */
  async getState(stateHash) {
    try {
      const statePath = this.getStatePath(stateHash);
      if (!fs.existsSync(statePath))
        return null;
      const data = await fs.promises.readFile(statePath, 'utf-8');
      return JSON.parse(data);
    }
    catch (error) {
      log('Error reading state:', error);
      return null;
    }
  }

  /**
   * @param {string} stateHash
   * @returns {Promise<boolean>} whether a state file exists for the hash
   */
  async hasState(stateHash) {
    const statePath = this.getStatePath(stateHash);
    return fs.existsSync(statePath);
  }

  /**
   * Load every `*.json` state file, oldest `timestamp` first. Files that
   * cannot be read or parsed are logged and skipped; a failure to list the
   * directory is logged and yields an empty array.
   */
  async getAllStates() {
    try {
      const files = await fs.promises.readdir(this.storagePath);
      const states = [];
      for (const file of files) {
        if (!file.endsWith('.json'))
          continue;
        const filePath = path.join(this.storagePath, file);
        try {
          const data = await fs.promises.readFile(filePath, 'utf-8');
          states.push(JSON.parse(data));
        }
        catch (error) {
          log('Error reading state file:', file, error);
        }
      }
      return states.sort((a, b) => a.timestamp - b.timestamp);
    }
    catch (error) {
      log('Error reading states:', error);
      return [];
    }
  }

  /**
   * Collect, without duplicates, every link recorded on any stored state
   * whose URL is not the URL of a state marked `visited`.
   *
   * @returns {Promise<string[]>}
   */
  async getUnvisitedLinks() {
    const states = await this.getAllStates();
    const visited = new Set(states.filter(s => s.visited).map(s => s.url));
    const unvisited = new Set();
    for (const state of states) {
      // A state file written without a `links` array contributes nothing,
      // instead of aborting the whole query with a TypeError.
      if (!Array.isArray(state.links))
        continue;
      for (const link of state.links) {
        if (!visited.has(link))
          unvisited.add(link);
      }
    }
    return Array.from(unvisited);
  }

  /**
   * Delete every state file. Only `*.json` entries are removed, matching
   * what getAllStates() considers a state, so sub-directories or unrelated
   * files in the directory neither fail the call nor get deleted.
   * Errors are logged and rethrown.
   */
  async clear() {
    try {
      const files = await fs.promises.readdir(this.storagePath);
      const stateFiles = files.filter(file => file.endsWith('.json'));
      await Promise.all(stateFiles.map(file => fs.promises.unlink(path.join(this.storagePath, file))));
      log('Cleared memory storage');
    }
    catch (error) {
      log('Error clearing storage:', error);
      throw error;
    }
  }

  /**
   * Remove the oldest states so that at most `maxStates` remain. Failures to
   * delete an individual file are deliberately ignored (best effort).
   */
  async cleanupOldStates() {
    const states = await this.getAllStates();
    if (states.length <= this.maxStates)
      return;
    // Remove oldest states
    const toRemove = states.slice(0, states.length - this.maxStates);
    await Promise.all(toRemove.map(state => fs.promises.unlink(this.getStatePath(state.stateHash)).catch(() => { })));
    log(`Cleaned up ${toRemove.length} old states`);
  }
}
|
|
125
|
+
/**
 * Darbot Memory MCP connector (placeholder for future implementation).
 *
 * Constructing it only emits a debug message; every storage operation
 * rejects with a "not yet implemented" error.
 */
export class DarbotMemoryStorage {
  // Single place that builds the error shared by all placeholder operations.
  static #notImplemented() {
    return new Error('Darbot Memory MCP connector not yet implemented');
  }

  constructor(config = {}) {
    // TODO: Implement darbot-memory-mcp integration
    log('Darbot Memory MCP connector not yet implemented, falling back to local storage');
  }

  async storeState(state) {
    // TODO: Send to darbot-memory-mcp server
    throw DarbotMemoryStorage.#notImplemented();
  }

  async getState(stateHash) {
    // TODO: Query darbot-memory-mcp server
    throw DarbotMemoryStorage.#notImplemented();
  }

  async hasState(stateHash) {
    // TODO: Check darbot-memory-mcp server
    throw DarbotMemoryStorage.#notImplemented();
  }

  async getAllStates() {
    // TODO: Fetch from darbot-memory-mcp server
    throw DarbotMemoryStorage.#notImplemented();
  }

  async getUnvisitedLinks() {
    // TODO: Query darbot-memory-mcp server
    throw DarbotMemoryStorage.#notImplemented();
  }

  async clear() {
    // TODO: Clear darbot-memory-mcp storage
    throw DarbotMemoryStorage.#notImplemented();
  }
}
|
|
158
|
+
/**
 * Memory manager with optional darbot-memory-mcp integration.
 *
 * All operations are no-ops returning a neutral value while
 * `config.enabled` is falsy. Otherwise they are delegated to a storage
 * backend, which is created on demand.
 */
export class MemoryManager {
  storage;
  config;

  /**
   * @param {{ enabled: boolean, connector?: string, storagePath?: string, maxStates?: number }} [config]
   *   Defaults to `{ enabled: true }` when omitted.
   */
  constructor(config = { enabled: true }) {
    this.config = config;
    // A disabled manager must not touch the filesystem: building a local
    // storage creates its directory, so none is built until it is needed.
    this.storage = config.enabled ? this.#createStorage() : null;
  }

  /**
   * Build the storage backend selected by `config.connector`.
   *
   * The 'darbot-memory-mcp' connector is still a placeholder whose
   * operations all reject, and its constructor never throws, so wrapping it
   * in try/catch could not trigger a fallback. Local storage is therefore
   * used directly for that connector until a real implementation exists.
   */
  #createStorage() {
    const { connector, storagePath, maxStates } = this.config;
    if (connector === 'darbot-memory-mcp')
      log('darbot-memory-mcp connector is not yet implemented, falling back to local storage');
    return new LocalMemoryStorage({ storagePath, maxStates });
  }

  /** Return the storage backend, creating it if the manager started disabled. */
  #activeStorage() {
    this.storage ??= this.#createStorage();
    return this.storage;
  }

  /**
   * Generate a hash for the current page state: the first 16 hex characters
   * of the SHA-256 digest of `domSnapshot`.
   */
  static stateHash(domSnapshot) {
    return crypto.createHash('sha256').update(domSnapshot).digest('hex').substring(0, 16);
  }

  /**
   * Store a crawl state with screenshot.
   *
   * When `screenshot` is provided it is written to
   * `<cwd>/.darbot/screenshots/<stateHash>.png` and that path is recorded on
   * the state.
   *
   * @returns {Promise<string>} the state hash, or '' when memory is disabled
   */
  async storeState(url, title, domSnapshot, screenshot, links = []) {
    if (!this.config.enabled)
      return '';
    const stateHash = MemoryManager.stateHash(domSnapshot);
    let screenshotPath;
    // Save screenshot if provided
    if (screenshot) {
      const screenshotDir = path.join(process.cwd(), '.darbot', 'screenshots');
      await fs.promises.mkdir(screenshotDir, { recursive: true });
      screenshotPath = path.join(screenshotDir, `${stateHash}.png`);
      await fs.promises.writeFile(screenshotPath, screenshot);
    }
    const state = {
      url,
      title,
      stateHash,
      timestamp: Date.now(),
      screenshot: screenshotPath,
      links,
      visited: true
    };
    await this.#activeStorage().storeState(state);
    return stateHash;
  }

  /**
   * Check if we've seen this state before (always false when disabled).
   */
  async hasState(domSnapshot) {
    if (!this.config.enabled)
      return false;
    const stateHash = MemoryManager.stateHash(domSnapshot);
    return await this.#activeStorage().hasState(stateHash);
  }

  /**
   * Get a stored state by hash (always null when disabled).
   */
  async getState(stateHash) {
    if (!this.config.enabled)
      return null;
    return await this.#activeStorage().getState(stateHash);
  }

  /**
   * Get all stored states (always empty when disabled).
   */
  async getAllStates() {
    if (!this.config.enabled)
      return [];
    return await this.#activeStorage().getAllStates();
  }

  /**
   * Get unvisited links for BFS crawling (always empty when disabled).
   */
  async getUnvisitedLinks() {
    if (!this.config.enabled)
      return [];
    return await this.#activeStorage().getUnvisitedLinks();
  }

  /**
   * Clear all stored states (no-op when disabled).
   */
  async clear() {
    if (!this.config.enabled)
      return;
    await this.#activeStorage().clear();
  }

  /**
   * Check if memory is enabled
   */
  get enabled() {
    return this.config.enabled;
  }
}
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) DarbotLabs.
|
|
3
|
+
*
|
|
4
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
* you may not use this file except in compliance with the License.
|
|
6
|
+
* You may obtain a copy of the License at
|
|
7
|
+
*
|
|
8
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
*
|
|
10
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
* See the License for the specific language governing permissions and
|
|
14
|
+
* limitations under the License.
|
|
15
|
+
*/
|
|
16
|
+
/**
|
|
17
|
+
* ML-based scoring system for intelligent URL and element prioritization
|
|
18
|
+
*
|
|
19
|
+
* This replaces basic heuristics with feature-based scoring that learns from
|
|
20
|
+
* crawl patterns and goal descriptions to make better decisions.
|
|
21
|
+
*/
|
|
22
|
+
import debug from 'debug';
|
|
23
|
+
const log = debug('darbot:ml-scorer');
|
|
24
|
+
/**
 * ML-inspired scoring system using feature engineering and weighted scoring.
 *
 * Features are extracted from a URL or an element, multiplied by fixed
 * weights, summed, and squashed through a sigmoid into the range (0, 1).
 */
export class MLBasedScorer {
  static #STOP_WORDS = new Set([
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during'
  ]);

  static #CONTENT_PATTERNS = [
    /article/i, /post/i, /blog/i, /news/i, /story/i,
    /product/i, /item/i, /detail/i, /content/i, /page/i,
    /documentation/i, /docs/i, /guide/i, /tutorial/i
  ];

  static #NAV_PATTERNS = [
    /category/i, /section/i, /menu/i, /nav/i,
    /index/i, /list/i, /archive/i, /browse/i
  ];

  static #UTILITY_PATTERNS = [
    /login/i, /signin/i, /register/i, /signup/i,
    /logout/i, /signout/i, /profile/i, /account/i,
    /terms/i, /privacy/i, /legal/i, /cookie/i,
    /contact/i, /about/i, /help/i, /faq/i
  ];

  weights;
  goalKeywords;
  learnedPatterns = new Map();

  /**
   * Extract the keywords of a goal description: lower-cased words longer
   * than three characters that are not stop words.
   *
   * Words are split on non-word characters, the same way candidate text is
   * split when matching, so "pricing," in a goal still matches "pricing".
   *
   * @param {string} [goal]
   * @returns {Set<string>}
   */
  static extractGoalKeywords(goal) {
    const keywords = new Set();
    if (!goal)
      return keywords;
    for (const word of goal.toLowerCase().split(/\W+/)) {
      if (word.length > 3 && !MLBasedScorer.#STOP_WORDS.has(word))
        keywords.add(word);
    }
    return keywords;
  }

  /**
   * @param {string} [goal] - free-text goal used to derive relevance keywords
   */
  constructor(goal) {
    // Initialize feature weights (these could be learned from data)
    this.weights = new Map([
      // URL structure weights
      ['urlDepth', -0.3], // Prefer shallower URLs
      ['urlLength', -0.1], // Prefer shorter URLs
      ['pathSegments', 0.2], // More segments = more specific
      ['queryParams', -0.2], // Too many params = dynamic/session pages
      // Content weights
      ['textLength', 0.3], // More text = more content
      ['hasKeywords', 2.0], // Strong signal for goal relevance
      ['semanticRelevance', 1.5], // Goal-based relevance
      // Context weights
      ['parentScore', 0.4], // Good parents suggest good children
      ['visitedSiblings', -0.2], // Avoid repetitive sibling pages
      ['domDepth', -0.1], // Prefer more accessible elements
      // Pattern weights
      ['contentPattern', 1.0], // Content pages
      ['navPattern', 0.5], // Navigation pages
      ['utilityPattern', -0.5], // Utility pages (login, etc.)
    ]);
    // Extract keywords from goal
    this.goalKeywords = MLBasedScorer.extractGoalKeywords(goal);
    log('Initialized ML scorer with goal keywords:', Array.from(this.goalKeywords));
  }

  /**
   * Score a URL for crawling priority
   *
   * @param {string} url
   * @param {{ currentDepth?: number, parentUrl?: string, visitedUrls?: Iterable<string> }} [context]
   * @returns {number} score in (0, 1)
   */
  scoreUrl(url, context) {
    const features = this.extractUrlFeatures(url, context);
    return this.calculateScore(features);
  }

  /**
   * Score an element for interaction priority
   *
   * @param {{ text?: string, selector?: string, tag?: string }} element
   * @param {object} [context] - accepted for symmetry with scoreUrl; not read
   * @returns {number} score in (0, 1)
   */
  scoreElement(element, context) {
    const features = this.extractElementFeatures(element, context);
    return this.calculateScore(features);
  }

  /**
   * Update learned patterns based on successful crawl: the pattern of `url`
   * gains 0.1 on success and loses 0.05 on failure.
   *
   * `features` is accepted but not used by the current implementation.
   */
  learn(url, success, features) {
    // Extract pattern from URL
    const pattern = this.extractPattern(url);
    // Update pattern score based on success
    const currentScore = this.learnedPatterns.get(pattern) || 0;
    const delta = success ? 0.1 : -0.05;
    this.learnedPatterns.set(pattern, currentScore + delta);
    log('Updated pattern score:', pattern, '->', this.learnedPatterns.get(pattern));
  }

  /**
   * Extract features from URL.
   *
   * A URL that cannot be parsed is logged and yields the features gathered
   * so far (normally none). Missing context fields fall back to depth 0, no
   * parent and no visited URLs, so they cannot turn the score into NaN.
   *
   * @returns {Map<string, number>}
   */
  extractUrlFeatures(url, context) {
    const features = new Map();
    const { currentDepth = 0, parentUrl, visitedUrls = [] } = context ?? {};
    try {
      const urlObj = new URL(url);
      const pathname = urlObj.pathname;
      // URL structure features
      features.set('urlDepth', currentDepth);
      features.set('urlLength', url.length / 100); // Normalize
      features.set('hasNumbers', /\d/.test(pathname) ? 1 : 0);
      features.set('hasHyphens', /-/.test(pathname) ? 1 : 0);
      features.set('pathSegments', pathname.split('/').filter(Boolean).length);
      // Count entries explicitly: URLSearchParams.size is missing on older
      // Node releases, where it would make the whole score NaN.
      features.set('queryParams', Array.from(urlObj.searchParams.keys()).length);
      // Pattern matching
      features.set('contentPattern', this.matchesContentPattern(url) ? 1 : 0);
      features.set('navPattern', this.matchesNavPattern(url) ? 1 : 0);
      features.set('utilityPattern', this.matchesUtilityPattern(url) ? 1 : 0);
      // Goal relevance
      const relevance = this.calculateSemanticRelevance(url);
      features.set('semanticRelevance', relevance);
      features.set('hasKeywords', relevance > 0.5 ? 1 : 0);
      // Context features
      const parentPattern = parentUrl ? this.extractPattern(parentUrl) : '';
      features.set('parentScore', this.learnedPatterns.get(parentPattern) || 0);
      // Visited sibling counting
      const urlPattern = this.extractPattern(url);
      let visitedSiblings = 0;
      for (const visitedUrl of visitedUrls) {
        if (this.extractPattern(visitedUrl) === urlPattern)
          visitedSiblings++;
      }
      features.set('visitedSiblings', visitedSiblings);
    }
    catch (error) {
      log('Error extracting URL features:', error);
    }
    return features;
  }

  /**
   * Extract features from element. A missing `text` or `selector` is
   * treated as an empty string.
   *
   * @returns {Map<string, number>}
   */
  extractElementFeatures(element, context) {
    const features = new Map();
    const rawText = element.text ?? '';
    const selector = element.selector ?? '';
    const text = rawText.toLowerCase();
    // Text features
    features.set('textLength', Math.min(rawText.length / 50, 1));
    features.set('hasKeywords', this.containsKeywords(text) ? 1 : 0);
    // Semantic relevance
    features.set('semanticRelevance', this.calculateSemanticRelevance(text));
    // DOM features
    features.set('domDepth', selector.split('>').length);
    // Element type features
    const isActionButton = ['button', 'submit'].includes(element.tag);
    features.set('isActionButton', isActionButton ? 1 : 0);
    // Pattern matching
    features.set('contentPattern', this.matchesContentPattern(text) ? 1 : 0);
    features.set('navPattern', this.matchesNavPattern(text) ? 1 : 0);
    features.set('utilityPattern', this.matchesUtilityPattern(text) ? 1 : 0);
    return features;
  }

  /**
   * Calculate final score from features: weighted sum (features without a
   * weight contribute nothing) passed through a sigmoid.
   */
  calculateScore(features) {
    let score = 0;
    for (const [feature, value] of features.entries()) {
      const weight = this.weights.get(feature) || 0;
      score += weight * value;
    }
    // Apply sigmoid to bound score between 0 and 1
    return 1 / (1 + Math.exp(-score));
  }

  /**
   * Calculate semantic relevance to goal: number of words in `text` that
   * are goal keywords, divided by the keyword count and capped at 1.
   * Returns the neutral value 0.5 when there are no goal keywords.
   */
  calculateSemanticRelevance(text) {
    if (this.goalKeywords.size === 0)
      return 0.5; // Neutral if no goal specified
    const words = text.toLowerCase().split(/\W+/);
    let matches = 0;
    for (const word of words) {
      if (this.goalKeywords.has(word))
        matches++;
    }
    // Normalize by goal keyword count
    return Math.min(matches / this.goalKeywords.size, 1.0);
  }

  /**
   * Check if text contains goal keywords
   */
  containsKeywords(text) {
    if (this.goalKeywords.size === 0)
      return false;
    const words = text.toLowerCase().split(/\W+/);
    return words.some(word => this.goalKeywords.has(word));
  }

  /**
   * Extract pattern from URL: hostname plus path with UUID-like segments
   * and digit runs replaced by `*`. Input that is not a valid URL is
   * returned unchanged.
   */
  extractPattern(url) {
    try {
      const urlObj = new URL(url);
      // Pattern = domain + path structure (without specific IDs/numbers).
      // UUID-like segments must be collapsed before digit runs: replacing
      // digits first rewrites the segment so it no longer matches.
      const pathPattern = urlObj.pathname
        .replace(/\/[a-f0-9-]{36}/gi, '/*')
        .replace(/\d+/g, '*');
      return `${urlObj.hostname}${pathPattern}`;
    }
    catch {
      return url;
    }
  }

  /**
   * Check if URL/text matches content patterns
   */
  matchesContentPattern(text) {
    return MLBasedScorer.#CONTENT_PATTERNS.some(p => p.test(text));
  }

  /**
   * Check if URL/text matches navigation patterns
   */
  matchesNavPattern(text) {
    return MLBasedScorer.#NAV_PATTERNS.some(p => p.test(text));
  }

  /**
   * Check if URL/text matches utility patterns (usually low priority)
   */
  matchesUtilityPattern(text) {
    return MLBasedScorer.#UTILITY_PATTERNS.some(p => p.test(text));
  }

  /**
   * Check if word is a stop word
   */
  isStopWord(word) {
    return MLBasedScorer.#STOP_WORDS.has(word.toLowerCase());
  }

  /**
   * Get learned patterns for debugging/export (returns a copy).
   */
  getLearnedPatterns() {
    return new Map(this.learnedPatterns);
  }

  /**
   * Export scoring statistics: goal keywords, number of learned patterns
   * and their average score (0 when nothing has been learned).
   */
  getStatistics() {
    const scores = Array.from(this.learnedPatterns.values());
    const avgScore = scores.length > 0
      ? scores.reduce((a, b) => a + b, 0) / scores.length
      : 0;
    return {
      goalKeywords: Array.from(this.goalKeywords),
      learnedPatterns: this.learnedPatterns.size,
      averagePatternScore: avgScore
    };
  }
}
|