codemini-cli 0.4.1 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/core/tools.js CHANGED
@@ -15,7 +15,7 @@ import {
15
15
  import { evaluateCommandPolicy } from './command-policy.js';
16
16
  import { findEnclosingSymbol, queryAst, readAstNode, resolveAstTarget } from './ast.js';
17
17
  import { initializeProjectIndex, queryProjectIndex, refreshIndexedFile } from './project-index.js';
18
- import { checkReadDedup } from './agent-loop.js';
18
+ import { checkReadDedup } from './tool-result-store.js';
19
19
  import { TOOL_SKIP_DIRS as SKIP_DIRS, TEXT_EXTENSIONS, CODE_WRITE_GUARD_EXTENSIONS, LANGUAGE_FILE_TYPES } from './constants.js';
20
20
  import { sha256Prefixed as sha256, sha256 as sha256Hash } from './crypto-utils.js';
21
21
  import { forgetMemory, listMemories, rememberMemory, searchMemories, captureToInbox } from './memory-store.js';
@@ -172,6 +172,55 @@ function collectPageLinks($, pageUrl, maxLinks = 20) {
172
172
  return links;
173
173
  }
174
174
 
175
/**
 * Parse an HTML document and gather its readable content into one result object.
 *
 * @param {object} cheerio - Object exposing `load(html)` (the cheerio module).
 * @param {string} html - Raw markup to parse.
 * @param {string} pageUrl - Reported as `final_url` and handed to `collectPageLinks`.
 * @param {object} [options]
 * @param {number} [options.maxLinks] - Forwarded to `collectPageLinks`.
 * @param {number|null} [options.status=null] - Stored as `metadata.status`.
 * @param {string} [options.contentType=''] - Stored as `metadata.content_type`.
 * @param {string} [options.fetchMode='static'] - Stored as `metadata.fetch_mode`.
 * @returns {object} `{ final_url, title, description, text, links, metadata }`.
 */
function extractPageContent(cheerio, html, pageUrl, { maxLinks, status = null, contentType = '', fetchMode = 'static' } = {}) {
  const doc = cheerio.load(html);
  // Remove non-content elements before any text is read from the document.
  doc('script, style, noscript').remove();

  // Prefer the <body> text; fall back to the whole document when it is empty.
  const rawText = doc('body').text() || doc.root().text();
  const collapsedText = String(rawText || '')
    .replace(/[^\S\n]+/g, ' ') // collapse runs of non-newline whitespace
    .replace(/\n{3,}/g, '\n\n') // cap consecutive newlines at two
    .trim();

  const pageTitle = trimPreview(doc('title').first().text(), 240);
  const metaDescription = extractHtmlMeta(doc, 'description') || extractHtmlMeta(doc, 'og:description');
  const pageLinks = collectPageLinks(doc, pageUrl, maxLinks);

  const metadata = {
    status,
    fetched_at: new Date().toISOString(),
    content_type: contentType,
    fetch_mode: fetchMode,
    lang: String(doc('html').attr('lang') || '').trim()
  };

  return {
    final_url: pageUrl,
    title: pageTitle,
    description: metaDescription,
    text: collapsedText,
    links: pageLinks,
    metadata
  };
}
199
+
200
/**
 * Decide whether a statically fetched page should be retried in a browser.
 *
 * Returns false once the extracted text has at least 120 characters after
 * trimming. Otherwise returns true only if the markup contains one of the
 * client-side rendering markers below.
 *
 * @param {string} html - Raw markup of the fetched page.
 * @param {string} text - Text already extracted from that markup.
 * @returns {boolean}
 */
function shouldTryBrowserRender(html, text) {
  const extractedLength = String(text || '').trim().length;
  if (extractedLength >= 120) return false;

  const clientRenderMarkers = [
    /<script\b/i,
    /id=["']__(?:next|nuxt)["']/i,
    /data-reactroot|ng-version|window\.__/i
  ];
  return clientRenderMarkers.some((marker) => marker.test(html));
}
206
+
207
/**
 * Message telling the user how to install Playwright for browser rendering.
 *
 * @returns {string} The hint text, without trailing punctuation.
 */
function playwrightInstallHint() {
  const installCommand = 'npm install -g playwright && playwright install chromium';
  return `For JavaScript-rendered pages, install Playwright for richer web_fetch results: ${installCommand}`;
}
210
+
211
/**
 * Dynamically import the optional `playwright` package.
 *
 * @returns {Promise<object|null>} The imported module namespace, or `null`
 *   when the import fails because the package cannot be found.
 * @throws Rethrows any import failure that is not a "module not found" error.
 */
async function loadOptionalPlaywright() {
  let playwrightModule = null;
  try {
    playwrightModule = await import('playwright');
  } catch (error) {
    const missingByCode = String(error?.code || '') === 'ERR_MODULE_NOT_FOUND';
    const missingByMessage = /Cannot find package 'playwright'|Cannot find module 'playwright'/i.test(
      String(error?.message || '')
    );
    // Only a missing package counts as "not available"; any other failure propagates.
    if (!missingByCode && !missingByMessage) throw error;
  }
  return playwrightModule;
}
223
+
175
224
  async function buildPlaywrightLaunchEnv() {
176
225
  const localLibDir = path.join(
177
226
  process.env.HOME || '',
@@ -204,44 +253,85 @@ async function webFetchPage(args = {}) {
204
253
  ? String(normalizedArgs.wait_until).trim()
205
254
  : 'domcontentloaded';
206
255
 
207
- const [{ chromium }, cheerio] = await Promise.all([import('playwright'), import('cheerio')]);
256
+ const cheerio = await import('cheerio');
257
+ let staticResult = null;
258
+ let staticHtml = '';
259
+ let staticError = null;
260
+ try {
261
+ const controller = new AbortController();
262
+ const timeout = setTimeout(() => controller.abort(), timeoutMs);
263
+ let response;
264
+ try {
265
+ response = await fetch(url, {
266
+ redirect: 'follow',
267
+ signal: controller.signal,
268
+ headers: {
269
+ 'user-agent': 'CodeMiniCLI/0.4 web_fetch'
270
+ }
271
+ });
272
+ } finally {
273
+ clearTimeout(timeout);
274
+ }
275
+ staticHtml = await response.text();
276
+ staticResult = {
277
+ url,
278
+ ...extractPageContent(cheerio, staticHtml, response.url || url, {
279
+ maxLinks,
280
+ status: response.status,
281
+ contentType: response.headers.get('content-type') || '',
282
+ fetchMode: 'static'
283
+ })
284
+ };
285
+ if (!shouldTryBrowserRender(staticHtml, staticResult.text)) {
286
+ return staticResult;
287
+ }
288
+ } catch (error) {
289
+ staticError = error;
290
+ }
208
291
 
209
- // Crawlee is intentionally disabled for now so single-page reads stay lightweight.
210
- // If we later need multi-URL crawl orchestration, retries, or request queues, we can re-enable it here.
292
+ const playwright = await loadOptionalPlaywright();
293
+ if (!playwright) {
294
+ if (staticResult) {
295
+ return {
296
+ ...staticResult,
297
+ warnings: [playwrightInstallHint()]
298
+ };
299
+ }
300
+ throw new Error(`web_fetch failed and browser rendering is unavailable. ${playwrightInstallHint()}. Static fetch error: ${staticError?.message || staticError}`);
301
+ }
211
302
 
212
- const browser = await chromium.launch({
213
- headless: true,
214
- env: await buildPlaywrightLaunchEnv()
215
- });
303
+ let browser;
216
304
  try {
305
+ browser = await playwright.chromium.launch({
306
+ headless: true,
307
+ env: await buildPlaywrightLaunchEnv()
308
+ });
217
309
  const page = await browser.newPage();
218
310
  const response = await page.goto(url, { waitUntil, timeout: timeoutMs });
219
311
  const finalUrl = page.url();
220
312
  const html = await page.content();
221
- const $ = cheerio.load(html);
222
- const bodyText = $('body').text() || $.root().text();
223
- const text = String(bodyText || '').replace(/[^\S\n]+/g, ' ').replace(/\n{3,}/g, '\n\n').trim();
224
- const title = trimPreview($('title').first().text() || (await page.title()), 240);
225
- const description = extractHtmlMeta($, 'description') || extractHtmlMeta($, 'og:description');
226
- const links = collectPageLinks($, finalUrl, maxLinks);
227
-
228
- return {
313
+ const rendered = {
229
314
  url,
230
- final_url: finalUrl,
231
- title,
232
- description,
233
- text,
234
- links,
235
- metadata: {
315
+ ...extractPageContent(cheerio, html, finalUrl, {
316
+ maxLinks,
236
317
  status: response?.status?.() ?? null,
237
- fetched_at: new Date().toISOString(),
238
- content_type: response?.headers?.()['content-type'] || '',
239
- wait_until: waitUntil,
240
- lang: String($('html').attr('lang') || '').trim()
241
- }
318
+ contentType: response?.headers?.()['content-type'] || '',
319
+ fetchMode: 'browser'
320
+ })
242
321
  };
322
+ rendered.metadata.wait_until = waitUntil;
323
+ rendered.title = rendered.title || trimPreview(await page.title(), 240);
324
+ return rendered;
325
+ } catch (error) {
326
+ if (staticResult) {
327
+ return {
328
+ ...staticResult,
329
+ warnings: [`Browser rendering fallback failed: ${error?.message || error}`]
330
+ };
331
+ }
332
+ throw error;
243
333
  } finally {
244
- await browser.close();
334
+ if (browser) await browser.close();
245
335
  }
246
336
  }
247
337
 
@@ -255,30 +345,107 @@ async function webSearchQuery(config, args = {}) {
255
345
  if (!query) throw new Error('web_search requires query');
256
346
 
257
347
  const maxResults = clampNumber(normalizedArgs.max_results, 1, 20, 8);
258
- const [{ search, SafeSearchType }] = await Promise.all([import('duck-duck-scrape')]);
259
- const response = await search(query, {
260
- safeSearch: SafeSearchType.MODERATE,
261
- locale: String(normalizedArgs.locale || 'en-us').trim() || 'en-us',
262
- region: String(normalizedArgs.region || 'wt-wt').trim() || 'wt-wt'
348
+ const locale = String(normalizedArgs.locale || config?.web?.search_locale || 'en-US').trim() || 'en-US';
349
+ const region = String(normalizedArgs.region || normalizedArgs.cc || config?.web?.search_region || (locale.toLowerCase().endsWith('-cn') ? 'CN' : 'US')).trim() || 'US';
350
+ const searchUrl = buildBingRssSearchUrl({
351
+ baseUrl: config?.web?.search_base_url,
352
+ query,
353
+ locale,
354
+ region
263
355
  });
356
+ const timeoutMs = clampNumber(normalizedArgs.timeout_ms || config?.web?.search_timeout_ms, 1_000, 60_000, 15_000);
357
+ const controller = new AbortController();
358
+ const timeout = setTimeout(() => controller.abort(), timeoutMs);
359
+ let response;
360
+ try {
361
+ response = await fetch(searchUrl, {
362
+ redirect: 'follow',
363
+ signal: controller.signal,
364
+ headers: {
365
+ 'user-agent': 'CodeMiniCLI/0.4 web_search',
366
+ accept: 'application/rss+xml, application/xml;q=0.9, text/xml;q=0.8, */*;q=0.5',
367
+ 'accept-language': `${locale},en;q=0.8`
368
+ }
369
+ });
370
+ } finally {
371
+ clearTimeout(timeout);
372
+ }
373
+ if (!response.ok) {
374
+ throw new Error(`web_search Bing RSS request failed: HTTP ${response.status}`);
375
+ }
376
+
377
+ const xml = await response.text();
378
+ const cheerio = await import('cheerio');
379
+ const parsed = parseBingRssResults(cheerio, xml, maxResults);
264
380
 
265
381
  return {
266
382
  query,
267
- no_results: response?.noResults === true,
268
- results: Array.isArray(response?.results)
269
- ? response.results.slice(0, maxResults).map((item) => ({
270
- title: String(item?.title || '').trim(),
271
- url: String(item?.url || '').trim(),
272
- description: normalizeWhitespace(item?.description || item?.rawDescription || ''),
273
- hostname: String(item?.hostname || '').trim()
274
- }))
275
- : [],
276
- related: Array.isArray(response?.related)
277
- ? response.related.slice(0, 8).map((item) => String(item?.text || item?.raw || '').trim()).filter(Boolean)
278
- : []
383
+ engine: 'bing_rss',
384
+ source_url: response.url || searchUrl,
385
+ no_results: parsed.results.length === 0,
386
+ results: parsed.results,
387
+ related: []
279
388
  };
280
389
  }
281
390
 
391
/**
 * Build the Bing search URL that returns results as an RSS feed.
 *
 * @param {object} params
 * @param {string} [params.baseUrl] - Search endpoint override. A missing,
 *   falsy or whitespace-only value falls back to `https://cn.bing.com/search`.
 * @param {string} params.query - Search terms, sent as `q`.
 * @param {string} params.locale - Sent as both `mkt` and `setlang`.
 * @param {string} params.region - Sent as `cc`.
 * @returns {string} The absolute search URL.
 * @throws {TypeError} When a non-blank `baseUrl` is not a valid absolute URL.
 */
function buildBingRssSearchUrl({ baseUrl, query, locale, region }) {
  // Trim before applying the fallback: a blank override is truthy, so without
  // this it would reach `new URL` and throw "Invalid URL".
  const configuredBase = String(baseUrl || '').trim();
  const url = new URL(configuredBase || 'https://cn.bing.com/search');
  // `set` replaces any same-named parameter already present on the base URL.
  url.searchParams.set('q', query);
  url.searchParams.set('mkt', locale);
  url.searchParams.set('setlang', locale);
  url.searchParams.set('cc', region);
  url.searchParams.set('format', 'rss');
  return url.toString();
}
400
+
401
/**
 * Extract search results from an RSS document.
 *
 * Walks the `<item>` elements in document order. An item is skipped when its
 * title is empty, its link does not normalize to a URL, or its URL was already
 * seen. Stops once `maxResults` results are collected.
 *
 * @param {object} cheerio - Object exposing `load` (the cheerio module).
 * @param {string} xml - RSS markup, parsed with `xmlMode: true`.
 * @param {number} maxResults - Upper bound on returned results.
 * @returns {{results: Array<{title: string, url: string, description: string, hostname: string, published_at: string}>}}
 */
function parseBingRssResults(cheerio, xml, maxResults) {
  const feed = cheerio.load(xml, { xmlMode: true });
  const collected = [];
  const knownUrls = new Set();
  // Text of the first matching child element of an item.
  const firstText = (item, tagName) => item.find(tagName).first().text();

  for (const node of feed('item').toArray()) {
    if (collected.length >= maxResults) break;

    const item = feed(node);
    const title = normalizeWhitespace(firstText(item, 'title'));
    const url = normalizeSearchResultUrl(firstText(item, 'link'));
    if (!title || !url || knownUrls.has(url)) continue;

    knownUrls.add(url);
    collected.push({
      title,
      url,
      description: normalizeRssDescription(cheerio, firstText(item, 'description')),
      hostname: hostnameFromUrl(url),
      published_at: normalizeWhitespace(firstText(item, 'pubDate'))
    });
  }

  return { results: collected };
}
422
+
423
/**
 * Normalize a search-result link to an absolute http(s) URL string.
 *
 * @param {*} value - Candidate link; falsy values are treated as empty.
 * @returns {string} The serialized URL, or `''` when the value is blank,
 *   cannot be parsed, or uses a protocol other than `http:` / `https:`.
 */
function normalizeSearchResultUrl(value) {
  const candidate = String(value || '').trim();
  if (candidate === '') return '';

  let parsed;
  try {
    parsed = new URL(candidate);
  } catch {
    // Unparseable links are dropped, not reported.
    return '';
  }

  const isWebProtocol = parsed.protocol === 'http:' || parsed.protocol === 'https:';
  return isWebProtocol ? parsed.toString() : '';
}
434
+
435
/**
 * Turn an RSS `<description>` value into plain text.
 *
 * The value is loaded as HTML and reduced to its text content. If that text
 * is empty, the trimmed input is used instead. The result is passed through
 * `normalizeWhitespace`.
 *
 * @param {object} cheerio - Object exposing `load` (the cheerio module).
 * @param {*} value - Raw description; falsy values are treated as empty.
 * @returns {string} Plain-text description, or `''` for blank input.
 */
function normalizeRssDescription(cheerio, value) {
  const raw = String(value || '').trim();
  if (raw === '') return '';

  const textOnly = cheerio.load(raw).text();
  return normalizeWhitespace(textOnly || raw);
}
440
+
441
/**
 * Read the hostname out of a URL string.
 *
 * @param {*} value - Value handed to the `URL` constructor.
 * @returns {string} The parsed hostname, or `''` when `value` is not a valid URL.
 */
function hostnameFromUrl(value) {
  let host = '';
  try {
    ({ hostname: host } = new URL(value));
  } catch {
    // Invalid input yields an empty hostname, not an error.
    host = '';
  }
  return host;
}
448
+
282
449
  function findUniqueLineBlock(lines, blockContent) {
283
450
  const probeLines = splitLines(blockContent);
284
451
  if (probeLines.length === 0 || (probeLines.length === 1 && probeLines[0] === '')) return null;
@@ -877,7 +1044,7 @@ async function runCommand(root, config, args) {
877
1044
  command,
878
1045
  cwd: root,
879
1046
  shell: config.shell.default,
880
- timeoutMs: config.shell.timeout_ms
1047
+ timeoutMs: Number(args?.timeout || args?.timeout_ms || args?.timeoutMs || config.shell.timeout_ms)
881
1048
  });
882
1049
  return { ...result, command };
883
1050
  }
@@ -1752,24 +1919,6 @@ export function getBuiltinTools({ workspaceRoot = process.cwd(), config, onSyste
1752
1919
  }
1753
1920
  }
1754
1921
  },
1755
- {
1756
- type: 'function',
1757
- function: {
1758
- name: 'glob',
1759
- description:
1760
- 'Find files by glob pattern. Use this when you already know a filename pattern such as src/**/*.ts.',
1761
- parameters: {
1762
- type: 'object',
1763
- properties: {
1764
- pattern: { type: 'string', description: 'Glob pattern' },
1765
- path: { type: 'string', description: 'Directory to search' },
1766
- include_hidden: { type: 'boolean', description: 'Include dotfiles' },
1767
- max_results: { type: 'number', description: 'Max results' }
1768
- },
1769
- required: ['pattern']
1770
- }
1771
- }
1772
- },
1773
1922
  {
1774
1923
  type: 'function',
1775
1924
  function: {
@@ -1995,6 +2144,24 @@ export function getBuiltinTools({ workspaceRoot = process.cwd(), config, onSyste
1995
2144
  ];
1996
2145
 
1997
2146
  const deferredDefinitions = {
2147
+ glob: {
2148
+ type: 'function',
2149
+ function: {
2150
+ name: 'glob',
2151
+ description:
2152
+ 'Find files by glob pattern. Use this when you already know a filename pattern such as src/**/*.ts.',
2153
+ parameters: {
2154
+ type: 'object',
2155
+ properties: {
2156
+ pattern: { type: 'string', description: 'Glob pattern' },
2157
+ path: { type: 'string', description: 'Directory to search' },
2158
+ include_hidden: { type: 'boolean', description: 'Include dotfiles' },
2159
+ max_results: { type: 'number', description: 'Max results' }
2160
+ },
2161
+ required: ['pattern']
2162
+ }
2163
+ }
2164
+ },
1998
2165
  ast_query: {
1999
2166
  type: 'function',
2000
2167
  function: {
@@ -2036,7 +2203,7 @@ export function getBuiltinTools({ workspaceRoot = process.cwd(), config, onSyste
2036
2203
  function: {
2037
2204
  name: 'web_fetch',
2038
2205
  description:
2039
- 'Fetch and read a live web page. Uses Playwright to render the page, Cheerio to extract structured content, and Crawlee request handling to normalize the fetch flow. Use this for direct URL reads, not for keyword search.',
2206
+ 'Fetch and read a live web page. Uses a lightweight fetch + Cheerio reader by default, then falls back to optional Playwright browser rendering for JavaScript-heavy pages when Playwright is installed. Use this for direct URL reads, not for keyword search.',
2040
2207
  parameters: {
2041
2208
  type: 'object',
2042
2209
  properties: {
@@ -2055,15 +2222,15 @@ export function getBuiltinTools({ workspaceRoot = process.cwd(), config, onSyste
2055
2222
  function: {
2056
2223
  name: 'web_search',
2057
2224
  description:
2058
- 'Run a live web search through DuckDuckGo. Use this for keyword-based internet search. This tool respects config.web.search_enabled and will fail when network search is disabled.',
2225
+ 'Run a live web search by fetching Bing RSS results. Use this for keyword-based internet search. This tool respects config.web.search_enabled and will fail when network search is disabled.',
2059
2226
  parameters: {
2060
2227
  type: 'object',
2061
2228
  properties: {
2062
2229
  query: { type: 'string', description: 'Search query' },
2063
2230
  q: { type: 'string', description: 'Alias for query' },
2064
2231
  max_results: { type: 'number', description: 'Max results to return' },
2065
- locale: { type: 'string', description: 'DuckDuckGo locale such as en-us' },
2066
- region: { type: 'string', description: 'DuckDuckGo region such as wt-wt' }
2232
+ locale: { type: 'string', description: 'Bing market and language such as en-US or zh-CN' },
2233
+ region: { type: 'string', description: 'Bing country code such as US or CN' }
2067
2234
  },
2068
2235
  required: ['query']
2069
2236
  }
@@ -2140,7 +2307,7 @@ export function getBuiltinTools({ workspaceRoot = process.cwd(), config, onSyste
2140
2307
  function: {
2141
2308
  name: 'dream_consolidate',
2142
2309
  description:
2143
- 'Run a dream loop consolidation pass over inbox entries. Reads recent inbox items, deduplicates, evaluates lifecycle progression (observed → candidate → operational/longterm), promotes stable patterns into persistent memory, archives expired items, and writes an audit report. Use during off-hours or explicit maintenance.',
2310
+ 'Run a dream loop pass over inbox entries and existing memory buckets. Reads recent inbox items, deduplicates, evaluates lifecycle progression (observed → candidate → operational/longterm), promotes stable patterns into persistent memory, then uses LLM maintenance to merge/summarize/clean stale user/global/project memories when their bucket changed since the last maintenance marker. Writes an audit report. Use during off-hours or explicit maintenance.',
2144
2311
  parameters: {
2145
2312
  type: 'object',
2146
2313
  properties: {
@@ -2742,6 +2909,12 @@ export function getBuiltinTools({ workspaceRoot = process.cwd(), config, onSyste
2742
2909
  if (result.title) lines.push(`title: ${result.title}`);
2743
2910
  if (result.description) lines.push(`description: ${trimPreview(result.description, 200)}`);
2744
2911
  if (result.metadata?.status) lines.push(`status: ${result.metadata.status}`);
2912
+ if (result.metadata?.fetch_mode) lines.push(`mode: ${result.metadata.fetch_mode}`);
2913
+ if (Array.isArray(result.warnings)) {
2914
+ for (const warning of result.warnings.slice(0, 3)) {
2915
+ if (warning) lines.push(`warning: ${warning}`);
2916
+ }
2917
+ }
2745
2918
  if (Array.isArray(result.links) && result.links.length > 0) {
2746
2919
  lines.push(`links: ${result.links.slice(0, 5).map((item) => item.href).join(', ')}`);
2747
2920
  }