explorbot 0.1.17 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/explorbot-cli.ts +2 -0
- package/boat/doc-collector/bin/doc-collector-cli.ts +5 -0
- package/boat/doc-collector/package.json +24 -0
- package/boat/doc-collector/src/ai/documentarian.ts +184 -0
- package/boat/doc-collector/src/cli.ts +119 -0
- package/boat/doc-collector/src/config.ts +162 -0
- package/boat/doc-collector/src/docbot.ts +391 -0
- package/boat/doc-collector/src/docs-renderer.ts +187 -0
- package/boat/doc-collector/src/path-filter.ts +46 -0
- package/boat/doc-collector/src/research-navigation.ts +90 -0
- package/dist/bin/explorbot-cli.js +2 -0
- package/dist/boat/doc-collector/bin/doc-collector-cli.js +4 -0
- package/dist/boat/doc-collector/src/ai/documentarian.js +157 -0
- package/dist/boat/doc-collector/src/cli.js +104 -0
- package/dist/boat/doc-collector/src/config.js +129 -0
- package/dist/boat/doc-collector/src/docbot.js +326 -0
- package/dist/boat/doc-collector/src/docs-renderer.js +141 -0
- package/dist/boat/doc-collector/src/path-filter.js +35 -0
- package/dist/boat/doc-collector/src/research-navigation.js +71 -0
- package/dist/package.json +4 -1
- package/dist/src/ai/researcher/coordinates.js +1 -1
- package/dist/src/ai/researcher/parser.js +3 -0
- package/dist/src/ai/researcher.js +2 -1
- package/dist/src/config.js +10 -3
- package/dist/src/explorer.js +14 -1
- package/dist/src/state-manager.js +3 -0
- package/dist/src/utils/url-matcher.js +5 -3
- package/dist/src/utils/web-element.js +3 -2
- package/package.json +4 -1
- package/src/ai/researcher/coordinates.ts +1 -1
- package/src/ai/researcher/parser.ts +3 -0
- package/src/ai/researcher.ts +2 -1
- package/src/config.ts +13 -3
- package/src/explorbot.ts +1 -0
- package/src/explorer.ts +12 -1
- package/src/state-manager.ts +4 -0
- package/src/utils/url-matcher.ts +5 -2
- package/src/utils/web-element.ts +3 -2
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, writeFileSync } from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { ExplorBot } from "../../../src/explorbot.js";
|
|
4
|
+
import { normalizeUrl } from "../../../src/state-manager.js";
|
|
5
|
+
import { sanitizeFilename } from "../../../src/utils/strings.js";
|
|
6
|
+
import { tag } from "../../../src/utils/logger.js";
|
|
7
|
+
import { Documentarian } from "./ai/documentarian.js";
|
|
8
|
+
import { DocbotConfigParser } from "./config.js";
|
|
9
|
+
import { renderPageDocumentation, renderSpecIndex } from "./docs-renderer.js";
|
|
10
|
+
import { getDocPageKey, shouldCrawlDocPath } from "./path-filter.js";
|
|
11
|
+
import { extractResearchNavigationTargets } from "./research-navigation.js";
|
|
12
|
+
/**
 * Crawls a web application through ExplorBot and writes Markdown documentation:
 * one file per documented page under `<outputDir>/pages` plus a `spec.md` index.
 *
 * Typical lifecycle: `new DocBot(options)` -> `start()` -> `collect(startPath)` -> `stop()`.
 */
class DocBot {
    explorBot;
    configParser;
    config = {};
    documentarian;
    options;
    // Path prefix used by the `subtree` and `section` scopes; recomputed by every collect() call.
    scopeRoot = '/';

    /**
     * @param {object} [options] Run options. An absolute `startUrl` supplies the base origin;
     *   `verbose`, `config`, `path`, `show`, `headless`, `incognito` and `session` are passed
     *   through to ExplorBot; `docsConfig` and `path` are used by start() to load the docs config.
     */
    constructor(options = {}) {
        this.options = options;
        const baseUrl = this.extractAbsoluteBaseUrl(options.startUrl || '/');
        this.explorBot = new ExplorBot({
            baseUrl,
            verbose: options.verbose,
            config: options.config,
            path: options.path,
            show: options.show,
            headless: options.headless,
            incognito: options.incognito,
            session: options.session,
        });
        this.configParser = DocbotConfigParser.getInstance();
    }

    /**
     * Starts ExplorBot, loads the docs configuration, creates the Documentarian and
     * makes sure the output and pages directories exist.
     */
    async start() {
        await this.explorBot.start();
        this.config = await this.configParser.loadConfig({
            config: this.options.docsConfig,
            path: this.options.path,
        });
        this.documentarian = new Documentarian(this.explorBot.getProvider(), this.config);
        this.ensureDirectory(this.configParser.getOutputDir());
        this.ensureDirectory(this.getPagesDir());
    }

    /** Stops the underlying ExplorBot instance. */
    async stop() {
        await this.explorBot.stop();
    }

    /**
     * Crawls from `startPath`, documenting pages until the queue is empty or the page limit is hit.
     *
     * Targets are processed from a first-in, first-out queue. Pages judged low-signal are
     * recorded in `skipped` and their links are not followed. Any error while handling a
     * target is logged and recorded in `skipped`; the crawl then continues with the next target.
     *
     * @param {string} startPath Path or absolute URL to start from.
     * @param {{ maxPages?: number }} [opts] Optional page-limit override.
     * @returns {Promise<{ pages: object[], skipped: object[], indexPath: string, outputDir: string }>}
     */
    async collect(startPath, opts = {}) {
        const effectiveStartPath = this.normalizeStartPath(startPath);
        this.scopeRoot = this.getScopeRoot(effectiveStartPath);
        const effectiveMaxPages = this.getMaxPages(opts.maxPages);
        const queue = [];
        const queued = new Set();
        const documented = new Set();
        const pages = [];
        const skipped = [];
        const baseUrl = this.explorBot.getConfig().playwright.url;
        this.enqueuePath(effectiveStartPath, queue, queued);
        while (queue.length > 0 && pages.length < effectiveMaxPages) {
            const target = queue.shift();
            if (!target) {
                continue;
            }
            const targetKey = this.getPageKey(target);
            if (documented.has(targetKey)) {
                continue;
            }
            const stateManager = this.explorBot.getExplorer().getStateManager();
            if (stateManager.hasVisitedState(target)) {
                continue;
            }
            try {
                tag('info').log(`Collecting docs for ${this.toDisplayUrl(target, baseUrl)}`);
                await this.explorBot.visit(target);
                if (stateManager.isInDeadLoop()) {
                    tag('warning').log('Dead loop detected during docs crawl, stopping collection');
                    skipped.push({
                        url: target,
                        reason: 'dead loop detected during crawl',
                    });
                    break;
                }
                const state = this.explorBot.getCurrentState();
                if (!state) {
                    skipped.push({
                        url: target,
                        reason: 'page state was not captured after navigation',
                    });
                    continue;
                }
                // The key is recomputed from the captured state URL, which may differ from the requested target.
                const pageKey = this.getPageKey(state.url || target);
                if (documented.has(pageKey)) {
                    continue;
                }
                const research = await this.explorBot.agentResearcher().research(state, {
                    screenshot: this.shouldUseScreenshots(),
                    force: true,
                });
                const documentation = await this.documentarian.document(state, research);
                const lowSignalReason = this.getLowSignalReason(documentation, research);
                if (lowSignalReason) {
                    skipped.push({
                        url: state.url,
                        reason: lowSignalReason,
                    });
                    // Marked as documented so the same page key is not processed again.
                    documented.add(pageKey);
                    continue;
                }
                const filePath = this.savePageDocumentation(state, documentation);
                pages.push({
                    url: state.url,
                    title: state.title || '',
                    summary: documentation.summary,
                    canCount: documentation.can.length,
                    mightCount: documentation.might.length,
                    canActions: documentation.can.map((item) => item.action),
                    mightActions: documentation.might.map((item) => item.action),
                    filePath,
                });
                documented.add(pageKey);
                const nextPaths = this.extractNextPaths(state, baseUrl, research);
                for (const nextPath of nextPaths) {
                    if (documented.has(this.getPageKey(nextPath))) {
                        continue;
                    }
                    if (stateManager.hasVisitedState(nextPath)) {
                        continue;
                    }
                    this.enqueuePath(nextPath, queue, queued);
                }
            }
            catch (error) {
                const reason = error instanceof Error ? error.message : String(error);
                tag('warning').log(`Skipping ${target}: ${reason}`);
                skipped.push({
                    url: target,
                    reason,
                });
            }
        }
        const indexPath = this.saveIndex(effectiveStartPath, pages, skipped, effectiveMaxPages);
        return {
            pages,
            skipped,
            indexPath,
            outputDir: this.configParser.getOutputDir(),
        };
    }

    /**
     * Resolves the page limit: a positive `override` wins, then a positive
     * `config.docs.maxPages`, otherwise 100.
     */
    getMaxPages(override) {
        if (override && override > 0) {
            return override;
        }
        const configured = this.config.docs?.maxPages;
        if (configured && configured > 0) {
            return configured;
        }
        return 100;
    }

    /** Screenshots are used unless `config.docs.screenshot` is exactly `false`. */
    shouldUseScreenshots() {
        const screenshot = this.config.docs?.screenshot;
        if (screenshot === false) {
            return false;
        }
        return true;
    }

    /**
     * Collects crawl candidates for a page: first the same-origin links from `state.links`,
     * then the targets derived from the research text. Candidates are kept only when they
     * pass the path filter and the scope check, and duplicates are dropped.
     *
     * @returns {string[]} Candidate paths in discovery order.
     */
    extractNextPaths(state, baseUrl, research) {
        const paths = [];
        const seen = new Set();
        for (const link of state.links || []) {
            const nextPath = this.resolveLink(link, baseUrl);
            if (!nextPath) {
                continue;
            }
            if (!shouldCrawlDocPath(nextPath, this.config)) {
                continue;
            }
            if (!this.isInScope(nextPath)) {
                continue;
            }
            if (seen.has(nextPath)) {
                continue;
            }
            seen.add(nextPath);
            paths.push(nextPath);
        }
        for (const target of extractResearchNavigationTargets(state, research)) {
            if (!shouldCrawlDocPath(target, this.config)) {
                continue;
            }
            if (!this.isInScope(target)) {
                continue;
            }
            if (seen.has(target)) {
                continue;
            }
            seen.add(target);
            paths.push(target);
        }
        return paths;
    }

    /**
     * Resolves `link.url` against `baseUrl` and returns its path, query and hash.
     *
     * @returns {string|null} `null` when the link cannot be parsed or points to another origin.
     */
    resolveLink(link, baseUrl) {
        let resolved;
        try {
            resolved = new URL(link.url, baseUrl);
        }
        catch {
            return null;
        }
        const base = new URL(baseUrl);
        if (resolved.origin !== base.origin) {
            return null;
        }
        const pathName = resolved.pathname || '/';
        return `${pathName}${resolved.search}${resolved.hash}`;
    }

    /** Returns the absolute form of `target` for log output, or `target` itself when it cannot be resolved. */
    toDisplayUrl(target, baseUrl) {
        try {
            return new URL(target, baseUrl).toString();
        }
        catch {
            return target;
        }
    }

    /**
     * Adds a path to the crawl queue unless its page key was queued before.
     * Inputs without a leading slash are queued as `/` plus their normalized form.
     */
    enqueuePath(inputPath, queue, queued) {
        const normalized = normalizeUrl(inputPath);
        const pageKey = this.getPageKey(inputPath);
        if (queued.has(pageKey)) {
            return;
        }
        queued.add(pageKey);
        if (!inputPath.startsWith('/')) {
            queue.push(`/${normalized}`);
            return;
        }
        queue.push(inputPath);
    }

    /** Deduplication key for a page URL, as computed by `getDocPageKey` with the current config. */
    getPageKey(pageUrl) {
        return getDocPageKey(pageUrl, this.config);
    }

    /**
     * Reduces an absolute URL to its path, query and hash. Anything that does not parse
     * as an absolute URL (for example a plain path) is returned unchanged.
     */
    normalizeStartPath(startPath) {
        try {
            const parsed = new URL(startPath);
            return `${parsed.pathname || '/'}${parsed.search}${parsed.hash}`;
        }
        catch {
            return startPath;
        }
    }

    /** Returns the origin of an absolute URL, or `undefined` when the input is not one. */
    extractAbsoluteBaseUrl(startPath) {
        try {
            const parsed = new URL(startPath);
            return parsed.origin;
        }
        catch {
            return undefined;
        }
    }

    /**
     * Decides whether `target` lies inside the configured crawl scope (`config.docs.scope`).
     *
     * - `site` (default) and unknown values: everything is in scope.
     * - `subtree`: the scope root itself or anything below `<scopeRoot>/`.
     * - `section`: as `subtree`, plus siblings whose path starts with `<scopeRoot>-`.
     */
    isInScope(target) {
        const normalized = this.normalizeStartPath(target);
        const scope = this.config.docs?.scope || 'site';
        if (scope !== 'subtree' && scope !== 'section') {
            return true;
        }
        // A root scope contains every path. Without this guard the prefix checks below
        // would look for "//" and reject the whole site except "/" itself.
        if (this.scopeRoot === '/') {
            return true;
        }
        if (normalized === this.scopeRoot || normalized.startsWith(`${this.scopeRoot}/`)) {
            return true;
        }
        return scope === 'section' && normalized.startsWith(`${this.scopeRoot}-`);
    }

    /**
     * Derives the scope root from the start path: `/` for an empty path, otherwise the
     * path limited to its first four slash-separated parts.
     */
    getScopeRoot(startPath) {
        const normalized = this.normalizeStartPath(startPath);
        const parts = normalized.split('/').filter(Boolean);
        if (parts.length === 0) {
            return '/';
        }
        if (parts.length >= 4) {
            return `/${parts.slice(0, 4).join('/')}`;
        }
        return `/${parts.join('/')}`;
    }

    /**
     * Returns a skip reason when a page has fewer proven actions than
     * `config.docs.minCanActions` (default 1) and fewer interactive elements than
     * `config.docs.minInteractiveElements` (default 3); otherwise `null`.
     */
    getLowSignalReason(documentation, research) {
        const minCanActions = this.config.docs?.minCanActions ?? 1;
        const minInteractiveElements = this.config.docs?.minInteractiveElements ?? 3;
        if (documentation.can.length >= minCanActions) {
            return null;
        }
        const interactiveCount = this.countInteractiveElements(research);
        if (interactiveCount >= minInteractiveElements) {
            return null;
        }
        return `low-signal page: only ${documentation.can.length} proven actions and ${interactiveCount} interactive elements`;
    }

    /** Sums every "(N element)" / "(N elements)" count found in the research text. */
    countInteractiveElements(research) {
        const matches = [...research.matchAll(/\((\d+) elements?\)/g)];
        return matches.reduce((sum, match) => sum + Number.parseInt(match[1], 10), 0);
    }

    /** Writes the rendered page documentation to disk and returns the file path. */
    savePageDocumentation(state, documentation) {
        const pagePath = this.getPageFilePath(state.url);
        writeFileSync(pagePath, renderPageDocumentation(state, documentation), 'utf8');
        return pagePath;
    }

    /** Writes `spec.md` into the output directory and returns its path. */
    saveIndex(startPath, pages, skipped, maxPages) {
        const indexPath = path.join(this.configParser.getOutputDir(), 'spec.md');
        writeFileSync(indexPath, renderSpecIndex(this.configParser.getOutputDir(), startPath, pages, skipped, maxPages), 'utf8');
        return indexPath;
    }

    /** Directory that holds the per-page Markdown files. */
    getPagesDir() {
        return path.join(this.configParser.getOutputDir(), 'pages');
    }

    /** Maps a page URL to its Markdown file path, falling back to `root.md` when no name can be derived. */
    getPageFilePath(pageUrl) {
        const normalized = normalizeUrl(pageUrl || '/');
        const baseName = sanitizeFilename(normalized || 'root');
        if (baseName) {
            return path.join(this.getPagesDir(), `${baseName}.md`);
        }
        return path.join(this.getPagesDir(), 'root.md');
    }

    /** Creates `dirPath` (including missing parents) when it does not exist yet. */
    ensureDirectory(dirPath) {
        if (existsSync(dirPath)) {
            return;
        }
        mkdirSync(dirPath, { recursive: true });
    }
}
|
|
326
|
+
export { DocBot };
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
/**
 * Renders the Markdown document for a single page.
 *
 * Layout: a heading with the page URL, an optional title line, then the
 * "Purpose", "User Can" and "User Might" sections. An empty action list is
 * replaced by a placeholder bullet.
 *
 * @param {object} state Page state; `url` and `title` are read.
 * @param {object} documentation Object with `summary`, `can` and `might`.
 * @returns {string} Markdown text ending with exactly one newline.
 */
function renderPageDocumentation(state, documentation) {
    const { summary, can, might } = documentation;
    const out = [`# ${state.url}`, ''];

    if (state.title) {
        out.push(`Title: ${normalizeInlineText(state.title)}`, '');
    }

    out.push('## Purpose', '', ensureSentence(summary), '', '## User Can', '');
    if (can.length === 0) {
        out.push('- No proven actions were identified from the collected research.');
    }
    else {
        for (const { action, scope, evidence } of can) {
            out.push(`- ${normalizeAction(action)} -> ${scope}`);
            out.push(` Proof: ${ensureSentence(evidence)}`);
        }
    }
    out.push('');

    out.push('## User Might', '');
    if (might.length === 0) {
        out.push('- No assumption-based actions were identified.');
    }
    else {
        for (const { action, scope, evidence } of might) {
            out.push(`- ${normalizeAction(action, 'might')} -> ${scope}`);
            out.push(` Signal: ${ensureSentence(evidence)}`);
        }
    }
    out.push('');

    // Trailing blank lines are trimmed so the file ends with a single newline.
    return `${out.join('\n').trimEnd()}\n`;
}
|
|
42
|
+
/**
 * Renders the `spec.md` index: crawl overview, one entry per documented page
 * (linked relative to `outputDir`) and, when present, the list of skipped pages.
 *
 * @param {string} outputDir Directory the index is written to; page links are made relative to it.
 * @param {string} startPath Path the crawl started from.
 * @param {object[]} pages Documented page summaries.
 * @param {object[]} skipped Entries with `url` and `reason`.
 * @param {number} maxPages Page limit that was in effect.
 * @returns {string} Markdown text ending with exactly one newline.
 */
function renderSpecIndex(outputDir, startPath, pages, skipped, maxPages) {
    const out = [
        '# Website Spec',
        '',
        '## Overview',
        '',
        `Start page: ${startPath}`,
        `Pages documented: ${pages.length}`,
        `Pages skipped: ${skipped.length}`,
        `Max pages: ${maxPages}`,
        '',
        '## Pages',
        '',
    ];

    if (pages.length === 0) {
        out.push('- No pages were documented.', '');
    }

    for (const page of pages) {
        // Backslashes from Windows paths are converted so the Markdown link uses forward slashes.
        const link = path.relative(outputDir, page.filePath).replaceAll('\\', '/');
        out.push(`### [${page.url}](${link})`, '');
        out.push(`Purpose: ${ensureSentence(page.summary)}`);
        out.push(`Proven actions: ${page.canCount}`);
        out.push(`Possible actions: ${page.mightCount}`);
        if (page.title) {
            out.push(`Title: ${normalizeInlineText(page.title)}`);
        }
        out.push('');

        if (page.canActions.length > 0) {
            out.push('User Can:', ...page.canActions.map((action) => `- ${normalizeAction(action, 'can')}`), '');
        }
        if (page.mightActions.length > 0) {
            out.push('User Might:', ...page.mightActions.map((action) => `- ${normalizeAction(action, 'might')}`), '');
        }
    }

    if (skipped.length > 0) {
        out.push('## Skipped', '');
        out.push(...skipped.map((entry) => `- ${entry.url}. Reason: ${ensureSentence(entry.reason)}`));
        out.push('');
    }

    return `${out.join('\n').trimEnd()}\n`;
}
|
|
95
|
+
/**
 * Phrases an action as "user can ..." or "user might ..." without trailing punctuation.
 *
 * An existing "user can", "user might", "can" or "might" prefix (case-insensitive) is
 * replaced by the prefix matching `kind`. Text that already carries the matching
 * "user <verb> " prefix is returned as written.
 *
 * @param {string} action Raw action text.
 * @param {string} [kind='can'] `'can'` for proven actions; any other value is treated as "might".
 * @returns {string} The normalized phrase, or a generic "interact with this page" phrase
 *   (matching `kind`) when the action contains no usable text.
 */
function normalizeAction(action, kind = 'can') {
    const verb = kind === 'can' ? 'can' : 'might';
    const fallback = `user ${verb} interact with this page`;
    const trimmed = normalizeInlineText(action);
    if (!trimmed) {
        return fallback;
    }
    // ensureSentence guarantees exactly one terminal mark at the end; dropping the last
    // character therefore leaves the bare phrase.
    const normalized = ensureSentence(trimmed).slice(0, -1);
    if (!normalized) {
        // The action consisted of punctuation only.
        return fallback;
    }
    const lower = normalized.toLowerCase();
    if (lower.startsWith(`user ${verb} `)) {
        return normalized;
    }
    for (const prefix of ['user can ', 'user might ', 'can ', 'might ']) {
        if (lower.startsWith(prefix)) {
            return `user ${verb} ${normalized.slice(prefix.length)}`;
        }
    }
    return `user ${verb} ${normalized}`;
}

/**
 * Normalizes text and makes sure it ends with `.`, `!` or `?`.
 *
 * @param {string} text Raw text; `null`/`undefined` are treated as empty.
 * @returns {string} The sentence, or `''` when there is no text.
 */
function ensureSentence(text) {
    const trimmed = normalizeInlineText(text);
    if (!trimmed) {
        return '';
    }
    if (/[.!?]$/.test(trimmed)) {
        return trimmed;
    }
    return `${trimmed}.`;
}

/**
 * Applies Unicode NFKC normalization, collapses every whitespace run to a single
 * space and trims the ends.
 *
 * @param {string} text Raw text; `null`/`undefined` yield `''`, other non-strings are stringified.
 * @returns {string} Single-line text.
 */
function normalizeInlineText(text) {
    if (text == null) {
        return '';
    }
    return String(text).normalize('NFKC').replace(/\s+/g, ' ').trim();
}
|
|
141
|
+
export { renderPageDocumentation, renderSpecIndex, ensureSentence, normalizeAction };
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { normalizeUrl } from "../../../src/state-manager.js";
|
|
2
|
+
import { matchesUrl, generalizeUrl } from "../../../src/utils/url-matcher.js";
|
|
3
|
+
// Path segments that are denied by default when `docs.deniedPathSegments` is not configured.
const DEFAULT_DENIED_PATH_SEGMENTS = ['callback', 'callbacks', 'logout', 'signout', 'sign_out', 'destroy', 'delete', 'remove'];

/**
 * Decides whether a discovered path may be crawled for documentation.
 *
 * Rules, in order:
 * 1. A path that cannot be parsed as a URL is rejected.
 * 2. When `docs.includePaths` is non-empty, it alone decides: the path must match one pattern.
 * 3. A path matching any `docs.excludePaths` pattern is rejected.
 * 4. A path containing a denied segment (`docs.deniedPathSegments`, or the defaults) is
 *    rejected; segments are compared trimmed and lower-cased.
 *
 * Only the pathname is inspected; query string and hash are ignored.
 *
 * @param {string} nextPath Path or URL to check.
 * @param {object} [config] Docs configuration (reads `config.docs`).
 * @returns {boolean} `true` when the path may be crawled.
 */
export function shouldCrawlDocPath(nextPath, config = {}) {
    let parsed;
    try {
        parsed = new URL(nextPath, 'http://localhost');
    }
    catch {
        // Inputs such as "//" have no valid host and make the URL constructor throw;
        // an unparseable target is simply not crawlable.
        return false;
    }
    const docs = config?.docs;
    const normalizedPath = parsed.pathname || '/';

    const includePaths = docs?.includePaths || [];
    if (includePaths.length > 0) {
        return includePaths.some((pattern) => matchesUrl(pattern, normalizedPath));
    }

    const excludePaths = docs?.excludePaths || [];
    if (excludePaths.some((pattern) => matchesUrl(pattern, normalizedPath))) {
        return false;
    }

    const segments = parsed.pathname
        .split('/')
        .map((segment) => segment.trim().toLowerCase())
        .filter(Boolean);
    if (segments.length === 0) {
        return true;
    }

    const deniedSegments = new Set((docs?.deniedPathSegments || DEFAULT_DENIED_PATH_SEGMENTS)
        .map((segment) => segment.trim().toLowerCase())
        .filter(Boolean));
    return !segments.some((segment) => deniedSegments.has(segment));
}
|
|
28
|
+
/**
 * Builds the deduplication key for a page URL.
 *
 * The URL is normalized and given a leading slash. Unless
 * `config.docs.collapseDynamicPages` is exactly `false`, it is also passed through
 * `generalizeUrl` before the final normalization.
 *
 * @param {string} pageUrl Page URL or path; a falsy value is treated as `/`.
 * @param {object} [config] Docs configuration.
 * @returns {string} The page key.
 */
export function getDocPageKey(pageUrl, config = {}) {
    const cleaned = normalizeUrl(pageUrl || '/');
    const rooted = cleaned.startsWith('/') ? cleaned : `/${cleaned}`;
    const keepDynamicPages = config.docs?.collapseDynamicPages === false;
    return normalizeUrl(keepDynamicPages ? rooted : generalizeUrl(rooted));
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import { parseResearchSections } from "../../../src/ai/researcher/parser.js";
|
|
2
|
+
const OPEN_API_TAG_SELECTOR_PATTERN = /api-\d+\/tag\/([a-z0-9-]+)(?:["'#/\]\s]|$)/i;
|
|
3
|
+
const OPEN_API_NAVIGATION_SECTION_KEYWORDS = ['navigation', 'menu'];
|
|
4
|
+
/**
 * Derives same-page hash targets from the research text of a page.
 *
 * Only sections whose name contains one of the navigation keywords are inspected.
 * Each element of such a section is converted to a target where possible, and each
 * target is returned once, in the order of first discovery.
 *
 * @param {object} state Page state; `url` is read (defaults to `/`).
 * @param {string} research Research text to parse.
 * @returns {string[]} Unique navigation targets.
 */
export function extractResearchNavigationTargets(state, research) {
    const pageUrl = state.url || '/';
    // A Set keeps insertion order and drops repeated targets.
    const found = new Set();
    for (const section of parseResearchSections(research)) {
        const loweredName = section.name.toLowerCase();
        const isNavigation = OPEN_API_NAVIGATION_SECTION_KEYWORDS.some((keyword) => loweredName.includes(keyword));
        if (!isNavigation) {
            continue;
        }
        for (const element of section.elements) {
            const target = extractNavigationTarget(pageUrl, element);
            if (target) {
                found.add(target);
            }
        }
    }
    return [...found];
}
|
|
25
|
+
/**
 * Turns one research element into a same-page hash target.
 *
 * A tag found in the element's CSS selector takes priority. Otherwise, and only when
 * the current URL already contains `#tag/`, a tag slug is inferred from the element's label.
 *
 * @param {string} currentUrl URL of the page being researched.
 * @param {object} element Research element; `css` and `name` are read.
 * @returns {string|null} The target URL, or `null` when no tag can be derived.
 */
function extractNavigationTarget(currentUrl, element) {
    const hashFromCss = extractOpenApiTagHashFromCss(element.css);
    if (hashFromCss) {
        return buildSamePageHashTarget(currentUrl, hashFromCss);
    }
    const onTagPage = currentUrl.includes('#tag/');
    const inferredSlug = onTagPage ? inferOpenApiTagSlugFromLabel(element.name) : null;
    return inferredSlug ? buildSamePageHashTarget(currentUrl, `tag/${inferredSlug}`) : null;
}
|
|
39
|
+
/**
 * Extracts a `tag/<slug>` hash path from a CSS selector.
 *
 * Escaped slashes (`\/`) in the selector are unescaped before it is matched against
 * the tag selector pattern; the captured slug is lower-cased.
 *
 * @param {string} css CSS selector; may be empty or missing.
 * @returns {string|null} `tag/<slug>`, or `null` when the selector holds no tag.
 */
function extractOpenApiTagHashFromCss(css) {
    const slug = css
        ? css.replaceAll('\\/', '/').match(OPEN_API_TAG_SELECTOR_PATTERN)?.[1]
        : undefined;
    return slug ? `tag/${slug.toLowerCase()}` : null;
}
|
|
50
|
+
/**
 * Infers a tag slug from a navigation label such as `'Users / Roles' (expanded)`.
 *
 * Surrounding single quotes and UI state words ("(expanded)", "(collapsed)",
 * "open group", "close group", "show more") are removed. Only labels that still
 * contain a `/` are considered; their parts are lower-cased and joined with `-`,
 * and every other run of characters outside `a-z`, `0-9` and `-` becomes a single `-`.
 *
 * @param {string} name Element label.
 * @returns {string|null} The slug, or `null` when the label is missing, is not a
 *   string, contains no `/`, or leaves nothing usable.
 */
function inferOpenApiTagSlugFromLabel(name) {
    // Elements without a textual label cannot yield a slug; returning null keeps
    // the caller's "no target" path instead of throwing.
    if (typeof name !== 'string') {
        return null;
    }
    const cleanedLabel = name
        .replace(/^'+|'+$/g, '')
        .replace(/\(expanded\)|\(collapsed\)|open group|close group|show more/gi, '')
        .trim();
    if (!cleanedLabel.includes('/')) {
        return null;
    }
    const slug = cleanedLabel
        .split('/')
        .map((part) => part.trim().toLowerCase())
        .filter(Boolean)
        .join('-')
        .replace(/[^a-z0-9-]+/g, '-')
        .replace(/-+/g, '-')
        .replace(/^-|-$/g, '');
    return slug || null;
}
|
|
68
|
+
/**
 * Replaces the hash of `currentUrl` with `hashPath`, keeping path and query.
 *
 * @param {string} currentUrl URL whose part before the first `#` is kept.
 * @param {string} hashPath New hash content, without the leading `#`.
 * @returns {string} The URL pointing at the new hash.
 */
function buildSamePageHashTarget(currentUrl, hashPath) {
    const hashAt = currentUrl.indexOf('#');
    const pageWithQuery = hashAt === -1 ? currentUrl : currentUrl.slice(0, hashAt);
    return `${pageWithQuery}#${hashPath}`;
}
|
package/dist/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "explorbot",
|
|
3
|
-
"version": "0.1.17",
|
|
3
|
+
"version": "0.1.18",
|
|
4
4
|
"description": "CLI app built with React Ink, CodeceptJS, and Playwright",
|
|
5
5
|
"license": "Elastic-2.0",
|
|
6
6
|
"type": "module",
|
|
@@ -20,6 +20,9 @@
|
|
|
20
20
|
"src/**/*.tsx",
|
|
21
21
|
"bin/**/*.ts",
|
|
22
22
|
"boat/api-tester/src/**/*.ts",
|
|
23
|
+
"boat/doc-collector/src/**/*.ts",
|
|
24
|
+
"boat/doc-collector/bin/**/*.ts",
|
|
25
|
+
"boat/doc-collector/package.json",
|
|
23
26
|
"rules/",
|
|
24
27
|
"assets/sample-files/"
|
|
25
28
|
],
|
|
@@ -182,7 +182,7 @@ export function WithCoordinates(Base) {
|
|
|
182
182
|
const eidxWithoutCoords = [];
|
|
183
183
|
for (const section of sections) {
|
|
184
184
|
for (const el of section.elements) {
|
|
185
|
-
if (el.eidx && !el.coordinates)
|
|
185
|
+
if (el.eidx && /^e\d+$/i.test(el.eidx) && !el.coordinates)
|
|
186
186
|
eidxWithoutCoords.push(el.eidx);
|
|
187
187
|
}
|
|
188
188
|
}
|
|
@@ -40,6 +40,9 @@ export function mapRowToElement(row) {
|
|
|
40
40
|
let eidxRaw = (colMap.eidx || '').trim();
|
|
41
41
|
if (eidxRaw && /^\d+$/.test(eidxRaw))
|
|
42
42
|
eidxRaw = `e${eidxRaw}`;
|
|
43
|
+
if (eidxRaw && !/^e\d+$/i.test(eidxRaw)) {
|
|
44
|
+
eidxRaw = '';
|
|
45
|
+
}
|
|
43
46
|
const aria = parseAriaLocator(colMap.aria || '-');
|
|
44
47
|
return {
|
|
45
48
|
name,
|
|
@@ -90,7 +90,8 @@ export class Researcher extends ResearcherBase {
|
|
|
90
90
|
Stats.researches++;
|
|
91
91
|
const sessionName = `researcher: ${state.url}`;
|
|
92
92
|
return Observability.run(sessionName, { tags: ['researcher'], sessionId: stateHash }, async () => {
|
|
93
|
-
|
|
93
|
+
const displayUrl = state.fullUrl || state.url;
|
|
94
|
+
tag('info').log(`Researching ${displayUrl} to understand the context...`);
|
|
94
95
|
setActivity(`${this.emoji} Researching...`, 'action');
|
|
95
96
|
await this.ensureNavigated(state.url, screenshot && this.provider.hasVision());
|
|
96
97
|
await this.hooksRunner.runBeforeHook('researcher', state.url);
|
package/dist/src/config.js
CHANGED
|
@@ -24,6 +24,7 @@ export class ConfigParser {
|
|
|
24
24
|
static instance;
|
|
25
25
|
config = null;
|
|
26
26
|
configPath = null;
|
|
27
|
+
runtimeBaseUrlOverride = null;
|
|
27
28
|
constructor() { }
|
|
28
29
|
static loadEnv(filePath) {
|
|
29
30
|
const resolved = resolve(filePath);
|
|
@@ -38,7 +39,7 @@ export class ConfigParser {
|
|
|
38
39
|
return ConfigParser.instance;
|
|
39
40
|
}
|
|
40
41
|
async loadConfig(options) {
|
|
41
|
-
if (this.config && !options?.config && !options?.path) {
|
|
42
|
+
if (this.config && !options?.config && !options?.path && this.runtimeBaseUrlOverride === (options?.baseUrl || null)) {
|
|
42
43
|
return this.config;
|
|
43
44
|
}
|
|
44
45
|
// Store the initial working directory for reference
|
|
@@ -61,7 +62,8 @@ export class ConfigParser {
|
|
|
61
62
|
if (!loadedConfig) {
|
|
62
63
|
throw new Error('Configuration file is empty or invalid');
|
|
63
64
|
}
|
|
64
|
-
this.config = this.resolveConfig(loadedConfig);
|
|
65
|
+
this.config = this.resolveConfig(loadedConfig, options);
|
|
66
|
+
this.runtimeBaseUrlOverride = options?.baseUrl || null;
|
|
65
67
|
this.configPath = resolvedPath;
|
|
66
68
|
log(`Configuration loaded from: ${resolvedPath}`);
|
|
67
69
|
// Restore original directory after successful config load
|
|
@@ -108,6 +110,7 @@ export class ConfigParser {
|
|
|
108
110
|
if (ConfigParser.instance) {
|
|
109
111
|
ConfigParser.instance.config = null;
|
|
110
112
|
ConfigParser.instance.configPath = null;
|
|
113
|
+
ConfigParser.instance.runtimeBaseUrlOverride = null;
|
|
111
114
|
}
|
|
112
115
|
}
|
|
113
116
|
// For testing purposes only - sets up minimal default config
|
|
@@ -185,11 +188,15 @@ export class ConfigParser {
|
|
|
185
188
|
return JSON.parse(content);
|
|
186
189
|
}
|
|
187
190
|
}
|
|
188
|
-
resolveConfig(config) {
|
|
191
|
+
resolveConfig(config, options) {
|
|
189
192
|
if (config.web?.url && !config.playwright?.url) {
|
|
190
193
|
config.playwright = config.playwright || { browser: 'chromium', url: '' };
|
|
191
194
|
config.playwright.url = config.web.url;
|
|
192
195
|
}
|
|
196
|
+
if (options?.baseUrl) {
|
|
197
|
+
config.playwright = config.playwright || { browser: 'chromium', url: '' };
|
|
198
|
+
config.playwright.url = options.baseUrl;
|
|
199
|
+
}
|
|
193
200
|
return config;
|
|
194
201
|
}
|
|
195
202
|
validateConfig(config) {
|