@antodevs/groundtruth 0.2.5 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +1 -0
- package/package.json +4 -6
- package/src/cache.js +2 -0
- package/src/circuit-breaker.js +2 -0
- package/src/config.js +1 -1
- package/src/env.js +2 -3
- package/src/http-agent.js +13 -14
- package/src/inject.js +30 -16
- package/src/packages.js +17 -5
- package/src/proxy.js +31 -3
- package/src/registry.js +17 -12
- package/src/sanitize.js +50 -1
- package/src/search.js +4 -10
- package/src/state.js +22 -9
- package/src/utils/atomic-write.js +1 -0
- package/src/watcher.js +14 -22
- package/specification.yaml +0 -143
package/index.js
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
* @module index
|
|
4
4
|
* @description Entry point runtime groundtruth delegazione CLI o proxy flow logic.
|
|
5
5
|
*/
|
|
6
|
+
import './src/http-agent.js';
|
|
6
7
|
import { chalk, label } from './src/logger.js';
|
|
7
8
|
import { usePackageJson, antigravityMode, claudeCodeMode, uninstallMode, port, intervalMinutes, batchSize, version } from './src/cli.js';
|
|
8
9
|
import { createServer } from './src/proxy.js';
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@antodevs/groundtruth",
|
|
3
|
-
"version": "0.2.5",
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "Lightweight Node.js proxy to intercept API requests from coding agents and inject fresh web context",
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "public"
|
|
@@ -30,7 +30,6 @@
|
|
|
30
30
|
"index.js",
|
|
31
31
|
"src/",
|
|
32
32
|
"assets/",
|
|
33
|
-
"specification.yaml",
|
|
34
33
|
"README.md",
|
|
35
34
|
"LICENSE"
|
|
36
35
|
],
|
|
@@ -44,10 +43,9 @@
|
|
|
44
43
|
"release:minor": "git add . && git commit -m \"chore: auto-commit before release\" || true && npm version minor && git push origin main --tags && npm publish"
|
|
45
44
|
},
|
|
46
45
|
"dependencies": {
|
|
47
|
-
"@mozilla/readability": "^0.
|
|
46
|
+
"@mozilla/readability": "^0.6.0",
|
|
48
47
|
"chalk": "^5.3.0",
|
|
49
|
-
"cheerio": "^1.
|
|
50
|
-
"linkedom": "^0.18.12",
|
|
51
|
-
"node-fetch": "^3.3.2"
|
|
48
|
+
"cheerio": "^1.2.0",
|
|
49
|
+
"linkedom": "^0.18.12"
|
|
52
50
|
}
|
|
53
51
|
}
|
package/src/cache.js
CHANGED
package/src/circuit-breaker.js
CHANGED
|
@@ -13,6 +13,7 @@ export class CircuitBreaker {
|
|
|
13
13
|
this.state = 'CLOSED'; // CLOSED, OPEN, HALF_OPEN
|
|
14
14
|
this.failures = 0;
|
|
15
15
|
this.lastFailureTime = null;
|
|
16
|
+
this.halfOpenSuccesses = 0;
|
|
16
17
|
}
|
|
17
18
|
|
|
18
19
|
/**
|
|
@@ -57,6 +58,7 @@ export class CircuitBreaker {
|
|
|
57
58
|
// 429 rate limit apre il circuito immediatamente
|
|
58
59
|
if (err?.message?.includes('429')) {
|
|
59
60
|
this.failures = this.failureThreshold;
|
|
61
|
+
this.state = 'OPEN';
|
|
60
62
|
} else {
|
|
61
63
|
this.failures++;
|
|
62
64
|
}
|
package/src/config.js
CHANGED
|
@@ -45,7 +45,7 @@ export async function loadConfig() {
|
|
|
45
45
|
const parsed = JSON.parse(raw);
|
|
46
46
|
|
|
47
47
|
return {
|
|
48
|
-
maxTokens: clamp(parsed.maxTokens
|
|
48
|
+
maxTokens: clamp(parsed.maxTokens || DEFAULTS.maxTokens, 500, 8000),
|
|
49
49
|
quality: ['low', 'medium', 'high'].includes(parsed.quality) ? parsed.quality : DEFAULTS.quality,
|
|
50
50
|
verbose: typeof parsed.verbose === 'boolean' ? parsed.verbose : DEFAULTS.verbose,
|
|
51
51
|
sources: Array.isArray(parsed.sources) ? parsed.sources.filter(s => s && s.url) : DEFAULTS.sources,
|
package/src/env.js
CHANGED
|
@@ -23,13 +23,12 @@ export async function autoSetEnv(p) {
|
|
|
23
23
|
if (process.env.ANTHROPIC_BASE_URL === targetUrl) return;
|
|
24
24
|
|
|
25
25
|
const homeDir = os.homedir();
|
|
26
|
-
|
|
27
|
-
const isFish = process.env.SHELL?.includes('fish') || existsSync(
|
|
26
|
+
const fishConfigFile = path.join(homeDir, '.config', 'fish', 'config.fish');
|
|
27
|
+
const isFish = process.env.SHELL?.includes('fish') || existsSync(fishConfigFile);
|
|
28
28
|
let foundAny = false;
|
|
29
29
|
const modifiedFiles = [];
|
|
30
30
|
|
|
31
31
|
if (isFish) {
|
|
32
|
-
const fishConfigFile = path.join(homeDir, '.config', 'fish', 'config.fish');
|
|
33
32
|
await fs.mkdir(path.dirname(fishConfigFile), { recursive: true });
|
|
34
33
|
foundAny = true;
|
|
35
34
|
try {
|
package/src/http-agent.js
CHANGED
|
@@ -1,21 +1,20 @@
|
|
|
1
|
-
|
|
2
|
-
* @module http-agent
|
|
3
|
-
* @description Pool manager per connessioni API http e requests HTTPS in proxy context.
|
|
4
|
-
*/
|
|
1
|
+
import { Agent, setGlobalDispatcher } from 'undici';
|
|
5
2
|
import { Agent as HttpsAgent } from 'https';
|
|
6
3
|
import { Agent as HttpAgent } from 'http';
|
|
7
4
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
timeout: 5000,
|
|
5
|
+
const globalAgent = new Agent({
|
|
6
|
+
keepAliveTimeout: 20 * 1000,
|
|
7
|
+
keepAliveMaxTimeout: 60 * 1000,
|
|
8
|
+
connections: 10,
|
|
9
|
+
pipelining: 1,
|
|
14
10
|
});
|
|
11
|
+
setGlobalDispatcher(globalAgent);
|
|
12
|
+
|
|
13
|
+
export { globalAgent };
|
|
15
14
|
|
|
15
|
+
export const httpsAgent = new HttpsAgent({
|
|
16
|
+
keepAlive: true, maxSockets: 10, maxFreeSockets: 5, timeout: 5000,
|
|
17
|
+
});
|
|
16
18
|
export const httpAgent = new HttpAgent({
|
|
17
|
-
keepAlive: true,
|
|
18
|
-
maxSockets: 10,
|
|
19
|
-
maxFreeSockets: 5,
|
|
20
|
-
timeout: 5000,
|
|
19
|
+
keepAlive: true, maxSockets: 10, maxFreeSockets: 5, timeout: 5000,
|
|
21
20
|
});
|
package/src/inject.js
CHANGED
|
@@ -9,6 +9,14 @@ import os from 'os';
|
|
|
9
9
|
import { chalk, log, LOG_WARN, LOG_REFRESH } from './logger.js';
|
|
10
10
|
import { atomicWrite } from './utils/atomic-write.js';
|
|
11
11
|
|
|
12
|
+
const fileLocks = new Map();
|
|
13
|
+
async function withFileLock(filePath, fn) {
|
|
14
|
+
const previous = fileLocks.get(filePath) || Promise.resolve();
|
|
15
|
+
const next = previous.then(fn, fn);
|
|
16
|
+
fileLocks.set(filePath, next.catch(() => { }));
|
|
17
|
+
return next;
|
|
18
|
+
}
|
|
19
|
+
|
|
12
20
|
// ─── Document injection rules ────────────────────────
|
|
13
21
|
|
|
14
22
|
/**
|
|
@@ -19,24 +27,30 @@ import { atomicWrite } from './utils/atomic-write.js';
|
|
|
19
27
|
* @returns {Promise<void>}
|
|
20
28
|
*/
|
|
21
29
|
export async function injectBlock(filePath, content, blockId) {
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
30
|
+
return withFileLock(filePath, async () => {
|
|
31
|
+
let fileContent = '';
|
|
32
|
+
if (existsSync(filePath)) {
|
|
33
|
+
fileContent = await fs.readFile(filePath, 'utf8');
|
|
34
|
+
}
|
|
35
|
+
const startTag = `<!-- groundtruth:block-${blockId}:start -->`;
|
|
36
|
+
const endTag = `<!-- groundtruth:block-${blockId}:end -->`;
|
|
37
|
+
const block = `${startTag}\n${content.trim()}\n${endTag}`;
|
|
29
38
|
|
|
30
|
-
|
|
31
|
-
|
|
39
|
+
const startIndex = fileContent.indexOf(startTag);
|
|
40
|
+
const endIndex = fileContent.indexOf(endTag);
|
|
32
41
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
42
|
+
if (startIndex !== -1 && endIndex !== -1 && endIndex > startIndex) {
|
|
43
|
+
// Sostituisce il blocco esistente mantenendo il resto del file intatto
|
|
44
|
+
const before = fileContent.slice(0, startIndex);
|
|
45
|
+
const after = fileContent.slice(endIndex + endTag.length);
|
|
46
|
+
fileContent = before + block + after;
|
|
47
|
+
} else {
|
|
48
|
+
// Aggiunge in fondo se non esiste
|
|
49
|
+
fileContent = fileContent.trimEnd() + '\n\n' + block + '\n';
|
|
50
|
+
}
|
|
38
51
|
|
|
39
|
-
|
|
52
|
+
await atomicWrite(filePath, fileContent);
|
|
53
|
+
});
|
|
40
54
|
}
|
|
41
55
|
|
|
42
56
|
/**
|
|
@@ -48,7 +62,7 @@ export async function injectBlock(filePath, content, blockId) {
|
|
|
48
62
|
export async function removeStaleBlocks(filePath, activeBlockIds) {
|
|
49
63
|
if (!existsSync(filePath)) return;
|
|
50
64
|
let fileContent = await fs.readFile(filePath, 'utf8');
|
|
51
|
-
const regex = /<!-- groundtruth:block-(\w+):start -->[\s\S]*?<!-- groundtruth:block-\
|
|
65
|
+
const regex = /<!-- groundtruth:block-(\w+):start -->[\s\S]*?<!-- groundtruth:block-\1:end -->/g;
|
|
52
66
|
|
|
53
67
|
let modified = false;
|
|
54
68
|
fileContent = fileContent.replace(regex, (match, blockId) => {
|
package/src/packages.js
CHANGED
|
@@ -23,24 +23,36 @@ export async function readPackageDeps() {
|
|
|
23
23
|
}
|
|
24
24
|
const pkg = JSON.parse(await fs.readFile(pkgPath, 'utf8'));
|
|
25
25
|
|
|
26
|
-
const
|
|
26
|
+
const EXACT_EXCLUDE = new Set(['eslint', 'prettier', 'vite', 'rollup', 'webpack', 'babel', 'turbo', 'esbuild']);
|
|
27
|
+
const SUBSTR_EXCLUDE = ['plugin', 'adapter', '-check', 'lint-staged'];
|
|
27
28
|
|
|
28
29
|
const filterAndFormat = (depsObj) => {
|
|
29
30
|
if (!depsObj) return [];
|
|
30
31
|
return Object.entries(depsObj)
|
|
31
|
-
.filter(([n]) =>
|
|
32
|
+
.filter(([n]) => {
|
|
33
|
+
const lower = n.toLowerCase();
|
|
34
|
+
const base = lower.startsWith('@') ? lower.split('/')[1] : lower;
|
|
35
|
+
if (EXACT_EXCLUDE.has(base)) return false;
|
|
36
|
+
if (SUBSTR_EXCLUDE.some(ex => lower.includes(ex))) return false;
|
|
37
|
+
return true;
|
|
38
|
+
})
|
|
32
39
|
.map(([n, v]) => {
|
|
33
40
|
let cleanName = n;
|
|
34
41
|
if (n === '@sveltejs/kit') cleanName = 'sveltekit';
|
|
35
|
-
else if (n.startsWith('@')) cleanName = n.split('/')[1];
|
|
36
42
|
let cleanVersion = String(v).replace(/[\^~>=<]/g, '').split('.').slice(0, 2).join('.');
|
|
37
43
|
return `${cleanName} ${cleanVersion}`;
|
|
38
44
|
});
|
|
39
45
|
};
|
|
40
46
|
|
|
41
|
-
|
|
42
|
-
|
|
47
|
+
const depMap = new Map();
|
|
48
|
+
for (const [n, v] of Object.entries(pkg.dependencies || {})) {
|
|
49
|
+
depMap.set(n, v);
|
|
50
|
+
}
|
|
51
|
+
for (const [n, v] of Object.entries(pkg.devDependencies || {})) {
|
|
52
|
+
if (!depMap.has(n)) depMap.set(n, v);
|
|
53
|
+
}
|
|
43
54
|
|
|
55
|
+
const selected = filterAndFormat(Object.fromEntries(depMap));
|
|
44
56
|
return selected.length > 0 ? selected : null;
|
|
45
57
|
} catch (err) {
|
|
46
58
|
log(LOG_WARN, chalk.yellow, chalk.white('package.json parse error') + ` → ${chalk.yellow(err.message)}`);
|
package/src/proxy.js
CHANGED
|
@@ -9,6 +9,8 @@ import { readPackageDeps, buildQuery } from './packages.js';
|
|
|
9
9
|
import { chalk, log, LOG_WARN, LOG_BOLT } from './logger.js';
|
|
10
10
|
import { httpsAgent } from './http-agent.js';
|
|
11
11
|
import { sanitizeWebContent } from './sanitize.js';
|
|
12
|
+
import { watch } from 'fs';
|
|
13
|
+
import path from 'path';
|
|
12
14
|
import { maxTokens, qualitySettings, verbose } from './cli.js';
|
|
13
15
|
|
|
14
16
|
// ─── HTTP Node server daemon ─────────────────────────
|
|
@@ -20,12 +22,32 @@ import { maxTokens, qualitySettings, verbose } from './cli.js';
|
|
|
20
22
|
*/
|
|
21
23
|
export async function createServer(usePackageJson) {
|
|
22
24
|
let packageQueryCache = null;
|
|
25
|
+
let cacheStale = true;
|
|
26
|
+
|
|
23
27
|
if (usePackageJson) {
|
|
24
28
|
const depEntries = await readPackageDeps();
|
|
25
|
-
if (depEntries)
|
|
29
|
+
if (depEntries) {
|
|
30
|
+
packageQueryCache = buildQuery(depEntries);
|
|
31
|
+
cacheStale = false;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
const pkgPath = path.resolve(process.cwd(), 'package.json');
|
|
35
|
+
try {
|
|
36
|
+
watch(pkgPath, { persistent: false }, () => {
|
|
37
|
+
cacheStale = true;
|
|
38
|
+
log(LOG_REFRESH, chalk.cyan, chalk.white('package.json changed — cache invalidated'));
|
|
39
|
+
});
|
|
40
|
+
} catch (_) { }
|
|
26
41
|
}
|
|
27
42
|
|
|
28
43
|
const server = http.createServer(async (req, res) => {
|
|
44
|
+
if (usePackageJson && cacheStale) {
|
|
45
|
+
const depEntries = await readPackageDeps();
|
|
46
|
+
if (depEntries) {
|
|
47
|
+
packageQueryCache = buildQuery(depEntries);
|
|
48
|
+
cacheStale = false;
|
|
49
|
+
}
|
|
50
|
+
}
|
|
29
51
|
if (req.method !== 'POST') { res.writeHead(404); res.end(); return; }
|
|
30
52
|
|
|
31
53
|
let protocol = null;
|
|
@@ -95,7 +117,7 @@ export async function createServer(usePackageJson) {
|
|
|
95
117
|
try {
|
|
96
118
|
if (!query || query.trim() === String(new Date().getFullYear())) throw new Error('Empty query');
|
|
97
119
|
// parallel load in proxy app process to boost response load
|
|
98
|
-
const { results, pageText } = await webSearch(query,
|
|
120
|
+
const { results, pageText } = await webSearch(query, false, {
|
|
99
121
|
ddgResults: qualitySettings.ddgResults,
|
|
100
122
|
maxLen: qualitySettings.charsPerPage,
|
|
101
123
|
jinaTimeout: qualitySettings.jinaTimeout,
|
|
@@ -155,7 +177,13 @@ export async function createServer(usePackageJson) {
|
|
|
155
177
|
headers['content-length'] = Buffer.byteLength(reqBodyStr);
|
|
156
178
|
|
|
157
179
|
const proxyReq = https.request(targetUrl, { method: req.method, headers, agent: httpsAgent }, (proxyRes) => {
|
|
158
|
-
|
|
180
|
+
const responseHeaders = { ...proxyRes.headers };
|
|
181
|
+
delete responseHeaders['content-security-policy'];
|
|
182
|
+
delete responseHeaders['x-content-type-options'];
|
|
183
|
+
delete responseHeaders['content-encoding'];
|
|
184
|
+
delete responseHeaders['content-length'];
|
|
185
|
+
|
|
186
|
+
res.writeHead(proxyRes.statusCode, responseHeaders);
|
|
159
187
|
proxyRes.pipe(res);
|
|
160
188
|
});
|
|
161
189
|
proxyReq.on('error', () => { if (!res.headersSent) { res.writeHead(502); res.end('Bad Gateway'); } });
|
package/src/registry.js
CHANGED
|
@@ -2,12 +2,13 @@
|
|
|
2
2
|
* @module registry
|
|
3
3
|
* @description Interroga il Cloudflare Worker (Remote Registry) per risolvere URL docs ufficiali.
|
|
4
4
|
*/
|
|
5
|
-
|
|
5
|
+
|
|
6
|
+
import { LRUCache } from './cache.js';
|
|
6
7
|
|
|
7
8
|
const REGISTRY_API_URL = 'https://groundtruth-registry.antony-flex01.workers.dev/lookup';
|
|
8
9
|
|
|
9
|
-
// Cache in memoria per evitare query multiple allo stesso endpoint
|
|
10
|
-
const
|
|
10
|
+
// Cache in memoria con LRU per evitare query multiple allo stesso endpoint
|
|
11
|
+
const registryCache = new LRUCache({ max: 1000, ttl: 60 * 60 * 1000 });
|
|
11
12
|
|
|
12
13
|
/**
|
|
13
14
|
* @description Interroga asincronamente l'API cloudflare per cercare URL docs nel registry remoto
|
|
@@ -18,38 +19,42 @@ export async function lookupRegistryUrl(depName) {
|
|
|
18
19
|
if (!depName) return null;
|
|
19
20
|
|
|
20
21
|
// Normalizzazione preventiva
|
|
21
|
-
|
|
22
|
+
let name = depName.split(' ')[0].toLowerCase().trim();
|
|
23
|
+
|
|
24
|
+
// Alias mapping per framework comuni con scope npm
|
|
25
|
+
if (name === '@sveltejs/kit') name = 'sveltekit';
|
|
26
|
+
|
|
22
27
|
|
|
23
28
|
// Check hit in memoria (ritorna subito)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
}
|
|
29
|
+
const cached = registryCache.get(name);
|
|
30
|
+
if (cached !== undefined) return cached;
|
|
27
31
|
|
|
28
32
|
try {
|
|
29
33
|
// Fetch asincrono con timeout stretto per evitare latenze di fallback
|
|
30
34
|
const res = await fetch(`${REGISTRY_API_URL}?pkg=${encodeURIComponent(name)}`, {
|
|
31
|
-
signal: AbortSignal.timeout(1500),
|
|
35
|
+
signal: AbortSignal.timeout(1500),
|
|
32
36
|
headers: {
|
|
33
|
-
'Accept': 'application/json'
|
|
37
|
+
'Accept': 'application/json',
|
|
38
|
+
'X-GroundTruth-Key': process.env.GROUNDTRUTH_REGISTRY_KEY || ''
|
|
34
39
|
}
|
|
35
40
|
});
|
|
36
41
|
|
|
37
42
|
if (res.ok) {
|
|
38
43
|
const data = await res.json();
|
|
39
44
|
if (data && data.found && data.url) {
|
|
40
|
-
|
|
45
|
+
registryCache.set(name, data.url); // Cache hit success
|
|
41
46
|
return data.url;
|
|
42
47
|
}
|
|
43
48
|
}
|
|
44
49
|
|
|
45
50
|
// Se l'API restituisce 404/not found
|
|
46
|
-
|
|
51
|
+
registryCache.set(name, null); // Cache negative (così non rifacciamo network)
|
|
47
52
|
return null;
|
|
48
53
|
|
|
49
54
|
} catch (err) {
|
|
50
55
|
// Failover silente! (timeout o worker rotto). Se Cloudflare fallisce,
|
|
51
56
|
// noi non diamo errore all'utente ma facciamo DDG search fallback locale naturale.
|
|
52
|
-
|
|
57
|
+
registryCache.set(name, null);
|
|
53
58
|
return null;
|
|
54
59
|
}
|
|
55
60
|
}
|
package/src/sanitize.js
CHANGED
|
@@ -19,17 +19,66 @@ const DANGEROUS_PATTERNS = [
|
|
|
19
19
|
/HUMAN:\s/gi,
|
|
20
20
|
];
|
|
21
21
|
|
|
22
|
+
const NOISE_PATTERNS = [
|
|
23
|
+
/Skip to content/gi,
|
|
24
|
+
/Navigation Menu/gi,
|
|
25
|
+
/Toggle navigation/gi,
|
|
26
|
+
/Appearance settings/gi,
|
|
27
|
+
/AI CODE CREATION/gi,
|
|
28
|
+
/GitHub Copilot Write better code with AI/gi,
|
|
29
|
+
/Sign in/gi,
|
|
30
|
+
/Sign up/gi,
|
|
31
|
+
/Notifications/gi,
|
|
32
|
+
/Fork\s+\d+/gi,
|
|
33
|
+
/Star\s+[\d.]+[kK]?/gi,
|
|
34
|
+
/Code/gi,
|
|
35
|
+
/Issues/gi,
|
|
36
|
+
/Pull requests/gi,
|
|
37
|
+
/Actions/gi,
|
|
38
|
+
/Projects/gi,
|
|
39
|
+
/Security/gi,
|
|
40
|
+
/Insights/gi,
|
|
41
|
+
/Why GitHub/gi,
|
|
42
|
+
/Solutions/gi,
|
|
43
|
+
/Resources/gi,
|
|
44
|
+
/Open Source/gi,
|
|
45
|
+
/Enterprises/gi,
|
|
46
|
+
/Startups/gi,
|
|
47
|
+
/Customer stories|Ebooks & reports|Events & webinars/gi,
|
|
48
|
+
/GitHub (Sponsors|Skills|Accelerator|Archive Program|Spark|Models)/gi,
|
|
49
|
+
/Weekly Downloads|Unpacked Size|Total Files|Collaborators/gi,
|
|
50
|
+
/Analyze with Socket|Check bundle size|View package health|Explore dependencies/gi,
|
|
51
|
+
/Skip to content|Skip to main content|skip to:\[content\]|package search/gi,
|
|
52
|
+
/\[Signing in\]\(https:\/\/github\.com\/login\)/gi,
|
|
53
|
+
/Performing verification|This website uses a service to protect against malicious bots/gi,
|
|
54
|
+
/Radix Primitives|Visually or semantically separates content/gi,
|
|
55
|
+
/View docs here|Check bundle size|View package health/gi,
|
|
56
|
+
];
|
|
57
|
+
|
|
58
|
+
|
|
22
59
|
/**
|
|
23
|
-
* @description Filtra pattern pericolosi di
|
|
60
|
+
* @description Filtra pattern pericolosi e rumore di navigazione dal testo web scrappato.
|
|
24
61
|
* @param {string} text - Testo raw proveniente da web scraping
|
|
25
62
|
* @param {number} maxLen - Lunghezza massima output (default 8000)
|
|
26
63
|
* @returns {string} Testo sanitizzato
|
|
27
64
|
*/
|
|
28
65
|
export function sanitizeWebContent(text, maxLen = 8000) {
|
|
29
66
|
if (!text || typeof text !== 'string') return '';
|
|
67
|
+
|
|
30
68
|
let cleaned = text;
|
|
69
|
+
|
|
70
|
+
// 1. Rimuoviamo il rumore di navigazione
|
|
71
|
+
for (const pattern of NOISE_PATTERNS) {
|
|
72
|
+
cleaned = cleaned.replace(pattern, '');
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// 2. Rimuoviamo pattern pericolosi
|
|
31
76
|
for (const p of DANGEROUS_PATTERNS) {
|
|
32
77
|
cleaned = cleaned.replace(p, '[FILTERED]');
|
|
33
78
|
}
|
|
79
|
+
|
|
80
|
+
// 3. Normalizzazione spazi bianchi per risparmiare token
|
|
81
|
+
cleaned = cleaned.replace(/\s+/g, ' ').trim();
|
|
82
|
+
|
|
34
83
|
return cleaned.slice(0, maxLen);
|
|
35
84
|
}
|
package/src/search.js
CHANGED
|
@@ -2,13 +2,11 @@
|
|
|
2
2
|
* @module search
|
|
3
3
|
* @description Logica di scraping web: Jina Reader → fallback Readability, registry bypass, DDG search.
|
|
4
4
|
*/
|
|
5
|
-
import fetch from 'node-fetch';
|
|
6
5
|
import * as cheerio from 'cheerio';
|
|
7
6
|
import { Readability } from '@mozilla/readability';
|
|
8
7
|
import { DOMParser } from 'linkedom';
|
|
9
8
|
import { searchCache } from './cache.js';
|
|
10
9
|
import { CircuitBreaker } from './circuit-breaker.js';
|
|
11
|
-
import { httpAgent, httpsAgent } from './http-agent.js';
|
|
12
10
|
import { sanitizeWebContent } from './sanitize.js';
|
|
13
11
|
import { lookupRegistryUrl } from './registry.js';
|
|
14
12
|
|
|
@@ -48,7 +46,7 @@ export async function fetchPageContent(url, userAgent, opts = {}) {
|
|
|
48
46
|
const text = await jinaRes.text();
|
|
49
47
|
if (text && text.length > 200) {
|
|
50
48
|
if (verbose) console.log(` [jina] ✓ ${url} → ${text.length} chars`);
|
|
51
|
-
return sanitizeWebContent(text
|
|
49
|
+
return sanitizeWebContent(text, maxLen);
|
|
52
50
|
}
|
|
53
51
|
}
|
|
54
52
|
} catch (_) {
|
|
@@ -59,8 +57,7 @@ export async function fetchPageContent(url, userAgent, opts = {}) {
|
|
|
59
57
|
try {
|
|
60
58
|
const pageRes = await fetch(url, {
|
|
61
59
|
signal: AbortSignal.timeout(5000),
|
|
62
|
-
headers: { 'User-Agent': userAgent },
|
|
63
|
-
agent: url.startsWith('https:') ? httpsAgent : httpAgent
|
|
60
|
+
headers: { 'User-Agent': userAgent }
|
|
64
61
|
});
|
|
65
62
|
if (pageRes.ok) {
|
|
66
63
|
const document = new DOMParser().parseFromString(await pageRes.text(), 'text/html');
|
|
@@ -71,10 +68,7 @@ export async function fetchPageContent(url, userAgent, opts = {}) {
|
|
|
71
68
|
} catch (_) {
|
|
72
69
|
text = document.body?.textContent || '';
|
|
73
70
|
}
|
|
74
|
-
|
|
75
|
-
if (verbose) console.log(` [readability] ✓ ${url} → ${text.length} chars`);
|
|
76
|
-
return sanitizeWebContent(text.replace(/\s+/g, ' '), maxLen);
|
|
77
|
-
}
|
|
71
|
+
return sanitizeWebContent(text, maxLen);
|
|
78
72
|
}
|
|
79
73
|
} catch (_) { }
|
|
80
74
|
|
|
@@ -142,7 +136,7 @@ async function doSearch(query, resultsLimit = 3) {
|
|
|
142
136
|
const userAgent = getRandomUA();
|
|
143
137
|
const searchRes = await fetch(
|
|
144
138
|
`https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`,
|
|
145
|
-
{ signal: AbortSignal.timeout(5000), headers: { 'User-Agent': userAgent }
|
|
139
|
+
{ signal: AbortSignal.timeout(5000), headers: { 'User-Agent': userAgent } }
|
|
146
140
|
);
|
|
147
141
|
if (!searchRes.ok) throw new Error(`DDG ${searchRes.status}`);
|
|
148
142
|
|
package/src/state.js
CHANGED
|
@@ -13,26 +13,39 @@ const STATE_FILE = path.join(STATE_DIR, 'watcher-state.json');
|
|
|
13
13
|
|
|
14
14
|
/**
|
|
15
15
|
* @description Carica gli hash validati e memorizzati dallo schedule storage locale.
|
|
16
|
-
* @
|
|
16
|
+
* @param {string} currentVersion - Versione attuale dell'applicazione per validare la cache.
|
|
17
|
+
* @returns {Promise<Map>} Restituisce le hash map entries persistite o una mappa vuota se la versione differisce.
|
|
17
18
|
*/
|
|
18
|
-
export async function loadBatchState() {
|
|
19
|
+
export async function loadBatchState(currentVersion) {
|
|
19
20
|
try {
|
|
20
21
|
if (!existsSync(STATE_FILE)) return new Map();
|
|
21
22
|
const data = await readFile(STATE_FILE, 'utf8');
|
|
22
|
-
const
|
|
23
|
-
|
|
23
|
+
const state = JSON.parse(data);
|
|
24
|
+
|
|
25
|
+
// Invalida la cache se la versione è differente (forza refresh dopo update)
|
|
26
|
+
if (state.version !== currentVersion) {
|
|
27
|
+
return new Map();
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
return new Map(Object.entries(state.hashes || {}));
|
|
24
31
|
} catch {
|
|
25
32
|
return new Map();
|
|
26
33
|
}
|
|
27
34
|
}
|
|
28
35
|
|
|
29
36
|
/**
|
|
30
|
-
* @description Sincronizza hash batches per fault tolerance cross process
|
|
31
|
-
* @param {Map} map - Oggetto dei blocchi hashati validi
|
|
37
|
+
* @description Sincronizza hash batches e versione per fault tolerance cross process.
|
|
38
|
+
* @param {Map} map - Oggetto dei blocchi hashati validi.
|
|
39
|
+
* @param {string} version - Versione attuale dell'applicazione.
|
|
32
40
|
* @returns {Promise<void>}
|
|
33
41
|
*/
|
|
34
|
-
export async function saveBatchState(map) {
|
|
42
|
+
export async function saveBatchState(map, version) {
|
|
35
43
|
await mkdir(STATE_DIR, { recursive: true });
|
|
36
|
-
const
|
|
37
|
-
|
|
44
|
+
const state = {
|
|
45
|
+
version: version,
|
|
46
|
+
updatedAt: new Date().toISOString(),
|
|
47
|
+
hashes: Object.fromEntries(map)
|
|
48
|
+
};
|
|
49
|
+
await atomicWrite(STATE_FILE, JSON.stringify(state, null, 2), { backup: false });
|
|
38
50
|
}
|
|
51
|
+
|
package/src/watcher.js
CHANGED
|
@@ -11,7 +11,6 @@ import { updateGeminiFiles, removeStaleBlocks } from './inject.js';
|
|
|
11
11
|
import { chalk, label, log, LOG_WARN, LOG_REFRESH } from './logger.js';
|
|
12
12
|
import { version, maxTokens, quality, qualitySettings, verbose, customSources } from './cli.js';
|
|
13
13
|
import { loadBatchState, saveBatchState } from './state.js';
|
|
14
|
-
import { httpsAgent } from './http-agent.js';
|
|
15
14
|
|
|
16
15
|
// ─── Scheduler Watcher Instance ──────────────────────
|
|
17
16
|
|
|
@@ -52,7 +51,7 @@ export function startWatcher({ intervalMinutes, usePackageJson, batchSize }) {
|
|
|
52
51
|
|
|
53
52
|
async function updateSkill() {
|
|
54
53
|
if (previousBatchHashes.size === 0) {
|
|
55
|
-
previousBatchHashes = await loadBatchState();
|
|
54
|
+
previousBatchHashes = await loadBatchState(version);
|
|
56
55
|
}
|
|
57
56
|
const deps = await readPackageDeps();
|
|
58
57
|
if (!deps || deps.length === 0) {
|
|
@@ -70,7 +69,7 @@ export function startWatcher({ intervalMinutes, usePackageJson, batchSize }) {
|
|
|
70
69
|
|
|
71
70
|
for (const batch of batches) {
|
|
72
71
|
const promise = (async () => {
|
|
73
|
-
const blockId = batchHash(batch);
|
|
72
|
+
const blockId = batchHash(batch.map(d => d.split(' ')[0]));
|
|
74
73
|
activeBlockIds.add(blockId);
|
|
75
74
|
|
|
76
75
|
const currentHash = batchHash(batch);
|
|
@@ -160,13 +159,16 @@ export function startWatcher({ intervalMinutes, usePackageJson, batchSize }) {
|
|
|
160
159
|
|
|
161
160
|
// ── Custom sources from .groundtruth.json ──
|
|
162
161
|
if (customSources.length > 0) {
|
|
163
|
-
|
|
162
|
+
const CUSTOM_SOURCE_TTL_MS = 60 * 60 * 1000;
|
|
163
|
+
const customWork = customSources.map(async (src) => {
|
|
164
164
|
const blockId = 'src_' + Buffer.from(src.url).toString('base64url').slice(0, 8);
|
|
165
|
+
const tsKey = 'src_ts_' + blockId;
|
|
165
166
|
activeBlockIds.add(blockId);
|
|
166
167
|
|
|
167
|
-
|
|
168
|
+
const lastFetchTime = previousBatchHashes.get(tsKey) || 0;
|
|
169
|
+
if ((Date.now() - lastFetchTime) < CUSTOM_SOURCE_TTL_MS) {
|
|
168
170
|
skippedCount++;
|
|
169
|
-
|
|
171
|
+
return;
|
|
170
172
|
}
|
|
171
173
|
|
|
172
174
|
try {
|
|
@@ -180,37 +182,27 @@ export function startWatcher({ intervalMinutes, usePackageJson, batchSize }) {
|
|
|
180
182
|
globalContent: `## ${srcLabel}\n${sanitizeWebContent(text, 500)}\n`,
|
|
181
183
|
workspaceContent: md
|
|
182
184
|
}]);
|
|
183
|
-
previousBatchHashes.set(
|
|
185
|
+
previousBatchHashes.set(tsKey, Date.now());
|
|
184
186
|
updatedCount++;
|
|
185
187
|
log(LOG_REFRESH, chalk.cyan, `custom source updated → ${srcLabel}`);
|
|
186
188
|
}
|
|
187
189
|
} catch (_) {
|
|
188
190
|
failedCount++;
|
|
189
191
|
}
|
|
190
|
-
}
|
|
192
|
+
});
|
|
193
|
+
await Promise.all(customWork);
|
|
191
194
|
}
|
|
192
195
|
|
|
193
196
|
await removeStaleBlocks(globalPath, activeBlockIds);
|
|
194
197
|
await removeStaleBlocks(workspacePath, activeBlockIds);
|
|
195
198
|
|
|
196
|
-
await saveBatchState(previousBatchHashes);
|
|
199
|
+
await saveBatchState(previousBatchHashes, version);
|
|
197
200
|
|
|
198
201
|
log(LOG_REFRESH, chalk.gray, `cycle done → ${activeBlockIds.size} blocks active, ${updatedCount} updated, ${skippedCount} skipped, ${failedCount} errors`);
|
|
199
202
|
}
|
|
200
203
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
process.on('SIGINT', async () => {
|
|
204
|
-
await saveBatchState(previousBatchHashes);
|
|
205
|
-
process.exit(0);
|
|
206
|
-
});
|
|
207
|
-
|
|
208
|
-
updateSkill();
|
|
204
|
+
updateSkill().catch(err => log(LOG_WARN, chalk.yellow, 'updateSkill error: ' + err.message));
|
|
209
205
|
setInterval(() => {
|
|
210
|
-
|
|
211
|
-
if (cycleCount % 10 === 0) {
|
|
212
|
-
httpsAgent.destroy();
|
|
213
|
-
}
|
|
214
|
-
updateSkill();
|
|
206
|
+
updateSkill().catch(err => log(LOG_WARN, chalk.yellow, 'updateSkill error: ' + err.message));
|
|
215
207
|
}, intervalMinutes * 60 * 1000);
|
|
216
208
|
}
|
package/specification.yaml
DELETED
|
@@ -1,143 +0,0 @@
|
|
|
1
|
-
name: GroundTruth
|
|
2
|
-
version: 0.1.4
|
|
3
|
-
description: |
|
|
4
|
-
GroundTruth is a zero-configuration, transparent middleware context injection layer.
|
|
5
|
-
It is designed to bridge the deterministic knowledge cutoff gap of LLM-based coding agents
|
|
6
|
-
(such as Claude Code or Antigravity) by dynamically fetching and injecting live,
|
|
7
|
-
dependency-specific documentation right before inference or via out-of-band rule files.
|
|
8
|
-
|
|
9
|
-
architecture:
|
|
10
|
-
modes:
|
|
11
|
-
- name: Proxy Intercept Mode
|
|
12
|
-
alias: claude-code
|
|
13
|
-
description: |
|
|
14
|
-
Operates as a local HTTP reverse-proxy interceptor that captures outgoing API payloads
|
|
15
|
-
targeting Anthropic or Google Gemini endpoints, mutating them in transit.
|
|
16
|
-
target_endpoints:
|
|
17
|
-
- "https://api.anthropic.com/v1/messages"
|
|
18
|
-
- "https://generativelanguage.googleapis.com/v1beta/models/*"
|
|
19
|
-
flow:
|
|
20
|
-
1: "Listen on localhost port (default: 8080) and capture POST requests."
|
|
21
|
-
2: "Extract the last user message from the JSON body (supports `messages` array for Anthropic and `contents` array for Gemini)."
|
|
22
|
-
3: "Determine search query: use `--use-package-json` AST parsing, or fallback to the user message text."
|
|
23
|
-
4: "Scrape DuckDuckGo concurrently to retrieve live context (title, snippet, Readability-parsed text up to 4000 chars)."
|
|
24
|
-
5: "Mutate the `system` instruction prompt in the JSON payload by appending the live context block."
|
|
25
|
-
6: "Forward the modified request to the actual LLM provider, streaming the response back to the client."
|
|
26
|
-
components:
|
|
27
|
-
- src/proxy.js
|
|
28
|
-
|
|
29
|
-
- name: File Watcher Mode
|
|
30
|
-
alias: antigravity
|
|
31
|
-
description: |
|
|
32
|
-
Runs as a persistent background daemon. It polls the local project's dependencies,
|
|
33
|
-
fetches up-to-date documentation, and generates synchronized knowledge base dotfiles
|
|
34
|
-
(`GEMINI.md`) that the agent natively reads on invocation.
|
|
35
|
-
flow:
|
|
36
|
-
1: "Parses `package.json` dependencies and filters out build/tooling packages (e.g., eslint, vite, adapter)."
|
|
37
|
-
2: "Groups the filtered dependencies into chunks (batching) of configurable size (default: 3, max: 5) using `groupIntoBatches`."
|
|
38
|
-
3: "Hashes each dependency chunk (`batchHash` md5 sliced to 8 chars) to uniquely identify rule blocks and track state."
|
|
39
|
-
4: "Checks previous state to avoid redundant network fetches if the batch hasn't changed (`previousDepsKey` mapping)."
|
|
40
|
-
5: "Fetches live DuckDuckGo results per batch asynchronously, filtering out low-quality pages (403, captcha, < 200 chars)."
|
|
41
|
-
6: "Injects distinct dependency rule blocks bounded by `<!-- groundtruth:block-{hash}:start/end -->` directly inside `~/.gemini/GEMINI.md` (global) and `./.gemini/GEMINI.md` (workspace)."
|
|
42
|
-
7: "Garbage-collects stale blocks (`removeStaleBlocks`) belonging to evicted or resolved dependencies by regex matching active block IDs."
|
|
43
|
-
components:
|
|
44
|
-
- src/watcher.js
|
|
45
|
-
- src/inject.js
|
|
46
|
-
|
|
47
|
-
core_modules:
|
|
48
|
-
- name: cli.js
|
|
49
|
-
responsibilities:
|
|
50
|
-
- "Process `process.argv` argument parsing."
|
|
51
|
-
- "Validation and defaulting of arguments (`--port`, `--interval`, `--batch-size`, `--claude-code`, `--antigravity`)."
|
|
52
|
-
- "Help/Docs Screen rendering and early exit conditions with aesthetic formatting inspired by Claude Code."
|
|
53
|
-
|
|
54
|
-
- name: search.js
|
|
55
|
-
responsibilities:
|
|
56
|
-
- "DuckDuckGo HTML scraping using `cheerio`."
|
|
57
|
-
- "URL resolution from DuckDuckGo's `uddg` tracking links."
|
|
58
|
-
- "User-Agent rotation to mitigate scraping blocks."
|
|
59
|
-
- "Integration with `CircuitBreaker` pattern for rate-limit protection."
|
|
60
|
-
- "Integration with bounded custom O(1) `LRUCache` from `cache.js`."
|
|
61
|
-
- "Page content extraction using `linkedom` and Mozilla's `Readability`."
|
|
62
|
-
- "Integration with persistent connection pooling components from `http-agent.js`."
|
|
63
|
-
|
|
64
|
-
- name: packages.js
|
|
65
|
-
responsibilities:
|
|
66
|
-
- "Read local Node modules context (`package.json`)."
|
|
67
|
-
- "Clean semantic versions (e.g., `^1.2.3` -> `1.2`)."
|
|
68
|
-
- "Filter out non-informative tooling (`vite`, `prettier`, `eslint`, `plugin`, `adapter`, `check`)."
|
|
69
|
-
- "Group dependencies into manageable batches (default: 3) prioritizing core dependencies over devDependencies."
|
|
70
|
-
- "Generate deterministic MD5 identifiers per batch for block management (`batchHash`)."
|
|
71
|
-
- "Construct search queries based on dependency batches plus temporal identifiers (`latest 2026`)."
|
|
72
|
-
|
|
73
|
-
- name: logger.js
|
|
74
|
-
responsibilities:
|
|
75
|
-
- "Chalk-driven aesthetic terminal formatting."
|
|
76
|
-
- "Centralized status symbolizing constants (✓, ⚠, ⚡, ↻, ◆, ✻)."
|
|
77
|
-
- "Timestamp generation mapped to `it-IT` locale."
|
|
78
|
-
|
|
79
|
-
- name: env.js
|
|
80
|
-
responsibilities:
|
|
81
|
-
- "Shell configuration auto-instrumentation (`.zshrc`, `.bashrc`, `.bash_profile`, `config.fish`)."
|
|
82
|
-
- "Exporting `ANTHROPIC_BASE_URL` to route CLI tools (like Claude Code) through the proxy."
|
|
83
|
-
- "Cross-Platform Environment Override (Bypassing Windows systems safely)."
|
|
84
|
-
|
|
85
|
-
- name: inject.js
|
|
86
|
-
responsibilities:
|
|
87
|
-
- "File I/O operations for `GEMINI.md` in both `$HOME` and `$CWD`."
|
|
88
|
-
- "Regex-based block injection using exact start/end bounds matching."
|
|
89
|
-
- "Stale block eviction via `removeStaleBlocks`."
|
|
90
|
-
- "Uses `atomicWrite` for zero-corruption file replacements."
|
|
91
|
-
|
|
92
|
-
- name: cache.js
|
|
93
|
-
responsibilities:
|
|
94
|
-
- "Implements zero-dependency O(1) bounded LRU caching logic."
|
|
95
|
-
- "Provides getter/setter mechanisms tied to temporal eviction limits."
|
|
96
|
-
|
|
97
|
-
- name: circuit-breaker.js
|
|
98
|
-
responsibilities:
|
|
99
|
-
- "Manages DuckDuckGo fetch attempts via threshold-based error state wrapping (OPEN/HALF_OPEN/CLOSED)."
|
|
100
|
-
|
|
101
|
-
- name: state.js
|
|
102
|
-
responsibilities:
|
|
103
|
-
- "Persistent recovery system for dependency batch hashes mapping across system crash/restarts."
|
|
104
|
-
- "Reads and writes `.gemini/watcher-state.json`."
|
|
105
|
-
|
|
106
|
-
- name: http-agent.js
|
|
107
|
-
responsibilities:
|
|
108
|
-
- "Creates reusable Keep-Alive HTTP and HTTPS configuration agents to mitigate handshake overheads."
|
|
109
|
-
|
|
110
|
-
- name: utils/atomic-write.js
|
|
111
|
-
responsibilities:
|
|
112
|
-
- "Creates temporary file blocks inside the target's directory (to prevent EXDEV cross-device link errors) performing `fs.rename` (POSIX) or safe-copies (Windows)."
|
|
113
|
-
|
|
114
|
-
dependencies:
|
|
115
|
-
runtime: "Node.js >= 18.0.0 (uses ES Modules)"
|
|
116
|
-
built_ins:
|
|
117
|
-
- fs
|
|
118
|
-
- path
|
|
119
|
-
- os
|
|
120
|
-
- http
|
|
121
|
-
- https
|
|
122
|
-
- crypto
|
|
123
|
-
third_party:
|
|
124
|
-
- chalk: "^5.3.0" # Terminal styling
|
|
125
|
-
- cheerio: "^1.0.0" # Fast HTML parsing for DDG results
|
|
126
|
-
- linkedom: "^0.18.5" # Lightweight DOM emulation for Mozilla Readability
|
|
127
|
-
- node-fetch: "^3.3.2" # WHATWG Fetch API polyfill for Node
|
|
128
|
-
- "@mozilla/readability": "^0.5.0" # Main content extraction
|
|
129
|
-
|
|
130
|
-
mechanics:
|
|
131
|
-
caching_and_eviction:
|
|
132
|
-
- "Search level caching: Runtime searches are cached for 5 minutes (`CACHE_TTL`), matching identical queries to avoid redundant network transit."
|
|
133
|
-
- "Watcher level caching: the daemon uses a Map tracking `blockId` -> `JSON.stringify(batch)`. If the hash signature matches across cycles, the network layer is skipped."
|
|
134
|
-
quality_assurance:
|
|
135
|
-
- "Content verification: Extracted text is sanitized and evaluated. If a page returns < 200 characters, or contains indicators of bot protection (e.g., '403', 'captcha', 'access denied'), the result is flagged."
|
|
136
|
-
- "Fallback mechanism: If a result is flagged as low-quality, the watcher rolls back and retains the successfully injected markdown block from the previous cycle."
|
|
137
|
-
network_resilience:
|
|
138
|
-
- "Timeouts: All outbound `node-fetch` requests strictly adhere to a 5-second `AbortSignal.timeout(5000)`."
|
|
139
|
-
- "Retries & Bans: `search.js` relies on a `CircuitBreaker` class mitigating recursive DuckDuckGo IP bans."
|
|
140
|
-
- "Resource Connection: Avoids TCP handshakes through persistent keep-alive Agent dispatching."
|
|
141
|
-
shell_integration:
|
|
142
|
-
- "Darwin/Linux-first: Windows OS (`win32`) skips autoconfig cleanly."
|
|
143
|
-
- "Fish Shell paths uniquely utilize `set -gx` constructs unlike standard Bash/Zsh `export` syntax, appending recursively or mutating existing assignments."
|