npm - selftune - Versions diffs - 0.2.0 → 0.2.2 - Mend

selftune 0.2.0 → 0.2.2

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (122)

package/.claude/agents/diagnosis-analyst.md +20 -10
package/.claude/agents/evolution-reviewer.md +14 -1
package/.claude/agents/integration-guide.md +18 -6
package/.claude/agents/pattern-analyst.md +18 -5
package/CHANGELOG.md +12 -4
package/README.md +43 -35
package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
package/apps/local-dashboard/dist/favicon.png +0 -0
package/apps/local-dashboard/dist/index.html +17 -0
package/apps/local-dashboard/dist/logo.png +0 -0
package/apps/local-dashboard/dist/logo.svg +9 -0
package/cli/selftune/badge/badge-data.ts +1 -1
package/cli/selftune/badge/badge.ts +4 -8
package/cli/selftune/canonical-export.ts +183 -0
package/cli/selftune/constants.ts +28 -0
package/cli/selftune/contribute/contribute.ts +1 -1
package/cli/selftune/cron/setup.ts +17 -17
package/cli/selftune/dashboard-contract.ts +202 -0
package/cli/selftune/dashboard-server.ts +653 -186
package/cli/selftune/dashboard.ts +41 -176
package/cli/selftune/eval/baseline.ts +5 -4
package/cli/selftune/eval/composability-v2.ts +273 -0
package/cli/selftune/eval/hooks-to-evals.ts +34 -15
package/cli/selftune/eval/unit-test-cli.ts +1 -1
package/cli/selftune/evolution/evidence.ts +26 -0
package/cli/selftune/evolution/evolve-body.ts +105 -11
package/cli/selftune/evolution/evolve.ts +371 -25
package/cli/selftune/evolution/extract-patterns.ts +87 -29
package/cli/selftune/evolution/rollback.ts +2 -2
package/cli/selftune/grading/auto-grade.ts +200 -0
package/cli/selftune/grading/grade-session.ts +448 -97
package/cli/selftune/grading/results.ts +42 -0
package/cli/selftune/hooks/prompt-log.ts +172 -2
package/cli/selftune/hooks/session-stop.ts +123 -3
package/cli/selftune/hooks/skill-eval.ts +119 -3
package/cli/selftune/index.ts +395 -116
package/cli/selftune/ingestors/claude-replay.ts +140 -114
package/cli/selftune/ingestors/codex-rollout.ts +345 -46
package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
package/cli/selftune/ingestors/openclaw-ingest.ts +141 -8
package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
package/cli/selftune/init.ts +227 -14
package/cli/selftune/last.ts +14 -5
package/cli/selftune/localdb/db.ts +63 -0
package/cli/selftune/localdb/materialize.ts +428 -0
package/cli/selftune/localdb/queries.ts +376 -0
package/cli/selftune/localdb/schema.ts +204 -0
package/cli/selftune/monitoring/watch.ts +66 -15
package/cli/selftune/normalization.ts +682 -0
package/cli/selftune/observability.ts +19 -44
package/cli/selftune/orchestrate.ts +1073 -0
package/cli/selftune/quickstart.ts +203 -0
package/cli/selftune/repair/skill-usage.ts +576 -0
package/cli/selftune/schedule.ts +561 -0
package/cli/selftune/status.ts +48 -26
package/cli/selftune/sync.ts +627 -0
package/cli/selftune/types.ts +148 -0
package/cli/selftune/utils/canonical-log.ts +45 -0
package/cli/selftune/utils/hooks.ts +41 -0
package/cli/selftune/utils/html.ts +27 -0
package/cli/selftune/utils/llm-call.ts +78 -20
package/cli/selftune/utils/math.ts +10 -0
package/cli/selftune/utils/query-filter.ts +139 -0
package/cli/selftune/utils/skill-discovery.ts +340 -0
package/cli/selftune/utils/skill-log.ts +68 -0
package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
package/cli/selftune/utils/transcript.ts +272 -26
package/cli/selftune/workflows/discover.ts +254 -0
package/cli/selftune/workflows/skill-md-writer.ts +288 -0
package/cli/selftune/workflows/workflows.ts +188 -0
package/package.json +21 -8
package/packages/telemetry-contract/README.md +11 -0
package/packages/telemetry-contract/fixtures/golden.json +87 -0
package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
package/packages/telemetry-contract/index.ts +1 -0
package/packages/telemetry-contract/package.json +19 -0
package/packages/telemetry-contract/src/index.ts +2 -0
package/packages/telemetry-contract/src/types.ts +163 -0
package/packages/telemetry-contract/src/validators.ts +109 -0
package/skill/SKILL.md +84 -53
package/skill/Workflows/AutoActivation.md +17 -16
package/skill/Workflows/Badge.md +6 -0
package/skill/Workflows/Baseline.md +46 -23
package/skill/Workflows/Composability.md +12 -5
package/skill/Workflows/Contribute.md +17 -14
package/skill/Workflows/Cron.md +56 -79
package/skill/Workflows/Dashboard.md +45 -34
package/skill/Workflows/Doctor.md +30 -17
package/skill/Workflows/Evals.md +64 -40
package/skill/Workflows/EvolutionMemory.md +2 -0
package/skill/Workflows/Evolve.md +102 -47
package/skill/Workflows/EvolveBody.md +6 -6
package/skill/Workflows/Grade.md +36 -31
package/skill/Workflows/ImportSkillsBench.md +11 -5
package/skill/Workflows/Ingest.md +43 -36
package/skill/Workflows/Initialize.md +44 -30
package/skill/Workflows/Orchestrate.md +139 -0
package/skill/Workflows/Replay.md +39 -18
package/skill/Workflows/Rollback.md +3 -3
package/skill/Workflows/Schedule.md +61 -0
package/skill/Workflows/Sync.md +88 -0
package/skill/Workflows/UnitTest.md +34 -22
package/skill/Workflows/Watch.md +14 -4
package/skill/Workflows/Workflows.md +129 -0
package/skill/assets/activation-rules-default.json +26 -0
package/skill/assets/multi-skill-settings.json +63 -0
package/skill/assets/single-skill-settings.json +57 -0
package/skill/references/invocation-taxonomy.md +2 -2
package/skill/references/logs.md +164 -2
package/skill/references/setup-patterns.md +65 -0
package/skill/references/version-history.md +40 -0
package/skill/settings_snippet.json +1 -1
package/templates/multi-skill-settings.json +7 -7
package/templates/single-skill-settings.json +6 -6
package/dashboard/index.html +0 -1680

package/cli/selftune/monitoring/watch.ts CHANGED Viewed

@@ -12,6 +12,7 @@ import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
 import { classifyInvocation } from "../eval/hooks-to-evals.js";
 import { getLastDeployedProposal } from "../evolution/audit.js";
 import { updateContextAfterWatch } from "../memory/writer.js";
+import type { SyncResult } from "../sync.js";
 import type {
   InvocationType,
   MonitoringSnapshot,
@@ -20,6 +21,11 @@ import type {
   SkillUsageRecord,
 } from "../types.js";
 import { readJsonl } from "../utils/jsonl.js";
+import {
+  filterActionableQueryRecords,
+  filterActionableSkillUsageRecords,
+} from "../utils/query-filter.js";
+import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
 // ---------------------------------------------------------------------------
 // Public interfaces
@@ -42,6 +48,10 @@ export interface WatchOptions {
     skillPath: string;
     proposalId?: string;
   }) => Promise<{ rolledBack: boolean; restoredDescription: string; reason: string }>;
+  /** Source-truth refresh before reading logs. */
+  syncFirst?: boolean;
+  syncForce?: boolean;
+  _syncFn?: typeof import("../sync.js").syncSources;
 }
 export interface WatchResult {
@@ -49,6 +59,7 @@ export interface WatchResult {
   alert: string | null;
   rolledBack: boolean;
   recommendation: string;
+  sync_result?: SyncResult;
 }
 // ---------------------------------------------------------------------------
@@ -57,6 +68,7 @@ export interface WatchResult {
 const DEFAULT_BASELINE_PASS_RATE = 0.5;
 const DEFAULT_REGRESSION_THRESHOLD = 0.1;
+export const MIN_MONITORING_SKILL_CHECKS = 3;
 // ---------------------------------------------------------------------------
 // computeMonitoringSnapshot - pure function
@@ -66,9 +78,9 @@ const DEFAULT_REGRESSION_THRESHOLD = 0.1;
  * Compute a monitoring snapshot from raw log records.
  *
  * The function windows telemetry to the last `windowSessions` entries, then
- * scopes skill and query records to those sessions. If telemetry is empty or
- * no records match the windowed session IDs, all provided skill/query records
- * are used directly (unfiltered by session).
+ * scopes skill and actionable query records to those sessions. If telemetry is
+ * empty or no records match the windowed session IDs, all provided skill/query
+ * records are used directly (unfiltered by session).
  *
  * @param skillName        - The skill to monitor
  * @param telemetry        - All session telemetry records
@@ -88,33 +100,33 @@ export function computeMonitoringSnapshot(
   regressionThreshold: number = DEFAULT_REGRESSION_THRESHOLD,
 ): MonitoringSnapshot {
   // 1. Window the telemetry to the last N sessions (by array order, assumed chronological)
+  const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
+  const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
   const windowedTelemetry = telemetry.slice(-windowSessions);
   const windowedSessionIds = new Set(windowedTelemetry.map((t) => t.session_id));
   // 2. Filter skill records by skill name first
-  const skillNameFiltered = skillRecords.filter((r) => r.skill_name === skillName);
+  const skillNameFiltered = actionableSkillRecords.filter((r) => r.skill_name === skillName);
   // 3. Apply session ID windowing only if telemetry is present and overlaps
   const hasSessionOverlap =
     windowedSessionIds.size > 0 &&
     (skillNameFiltered.some((r) => windowedSessionIds.has(r.session_id)) ||
-      queryRecords.some((r) => windowedSessionIds.has(r.session_id)));
+      actionableQueryRecords.some((r) => windowedSessionIds.has(r.session_id)));
   const filteredSkillRecords = hasSessionOverlap
     ? skillNameFiltered.filter((r) => windowedSessionIds.has(r.session_id))
     : skillNameFiltered;
   const filteredQueryRecords = hasSessionOverlap
-    ? queryRecords.filter((r) => windowedSessionIds.has(r.session_id))
-    : queryRecords;
+    ? actionableQueryRecords.filter((r) => windowedSessionIds.has(r.session_id))
+    : actionableQueryRecords;
-  // 4. Compute pass rate: triggered_count / total_query_count
+  // 4. Compute pass rate from explicit skill checks, not from all queries.
   const triggeredCount = filteredSkillRecords.filter((r) => r.triggered).length;
-  const totalQueries = filteredQueryRecords.length;
-  const passRate = totalQueries === 0 ? 1.0 : triggeredCount / totalQueries;
+  const totalSkillChecks = filteredSkillRecords.length;
+  const passRate = totalSkillChecks === 0 ? 0 : triggeredCount / totalSkillChecks;
   // 5. Compute false negative rate from skill usage records
-  const totalSkillChecks = filteredSkillRecords.length;
   const falseNegatives = filteredSkillRecords.filter((r) => !r.triggered).length;
   const falseNegativeRate = totalSkillChecks === 0 ? 0 : falseNegatives / totalSkillChecks;
@@ -126,7 +138,10 @@ export function computeMonitoringSnapshot(
     negative: { passed: 0, total: 0 },
   };
   for (const record of filteredSkillRecords) {
-    const invType = classifyInvocation(record.query, skillName);
+    const invType = classifyInvocation(
+      typeof record.query === "string" ? record.query : "",
+      skillName,
+    );
     byInvocationType[invType].total++;
     if (record.triggered) {
       byInvocationType[invType].passed++;
@@ -139,12 +154,16 @@ export function computeMonitoringSnapshot(
   const adjustedThreshold =
     Math.round((baselinePassRate - regressionThreshold) * precision) / precision;
   const roundedPassRate = Math.round(passRate * precision) / precision;
-  const regressionDetected = roundedPassRate < adjustedThreshold;
+  const hasEnoughSignalForRegression =
+    totalSkillChecks >= MIN_MONITORING_SKILL_CHECKS ||
+    (totalSkillChecks === 0 && filteredQueryRecords.length >= MIN_MONITORING_SKILL_CHECKS);
+  const regressionDetected = hasEnoughSignalForRegression && roundedPassRate < adjustedThreshold;
   return {
     timestamp: new Date().toISOString(),
     skill_name: skillName,
     window_sessions: windowSessions,
+    skill_checks: totalSkillChecks,
     pass_rate: passRate,
     false_negative_rate: falseNegativeRate,
     by_invocation_type: byInvocationType,
@@ -172,11 +191,28 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
     _queryLogPath = QUERY_LOG,
     _auditLogPath,
     _rollbackFn,
+    syncFirst = false,
+    syncForce = false,
+    _syncFn,
   } = options;
+  let syncResult: SyncResult | undefined;
+  if (syncFirst) {
+    const { createDefaultSyncOptions, syncSources: realSyncSources } = await import("../sync.js");
+    const syncRunner = _syncFn ?? realSyncSources;
+    syncResult = syncRunner(
+      createDefaultSyncOptions({
+        force: syncForce,
+      }),
+    );
+  }
   // 1. Read log files
   const telemetry = readJsonl<SessionTelemetryRecord>(_telemetryLogPath);
-  const skillRecords = readJsonl<SkillUsageRecord>(_skillLogPath);
+  const skillRecords =
+    _skillLogPath === SKILL_LOG
+      ? readEffectiveSkillUsageRecords()
+      : readJsonl<SkillUsageRecord>(_skillLogPath);
   const queryRecords = readJsonl<QueryLogRecord>(_queryLogPath);
   // 2. Determine baseline pass rate from last deployed audit entry
@@ -217,6 +253,10 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
     recommendation = rolledBack
       ? `Rolled back "${skillName}" to previous version. Monitor to confirm recovery.`
       : `Consider running: selftune rollback --skill "${skillName}" --skill-path "${skillPath}"`;
+  } else if (snapshot.skill_checks < MIN_MONITORING_SKILL_CHECKS) {
+    recommendation =
+      `Skill "${skillName}" has only ${snapshot.skill_checks} actionable check(s) in the current window. ` +
+      `Need at least ${MIN_MONITORING_SKILL_CHECKS} before calling it stable.`;
   } else {
     recommendation = `Skill "${skillName}" is stable. Pass rate ${snapshot.pass_rate.toFixed(2)} is within acceptable range of baseline ${baselinePassRate.toFixed(2)}.`;
   }
@@ -240,6 +280,7 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
     alert,
     rolledBack,
     recommendation,
+    ...(syncResult ? { sync_result: syncResult } : {}),
   };
 }
@@ -283,6 +324,8 @@ export async function cliMain(): Promise<void> {
       window: { type: "string", default: "20" },
       threshold: { type: "string", default: "0.1" },
       "auto-rollback": { type: "boolean", default: false },
+      "sync-first": { type: "boolean", default: false },
+      "sync-force": { type: "boolean", default: false },
       help: { type: "boolean", default: false },
     },
     strict: true,
@@ -300,6 +343,8 @@ Options:
   --window            Number of recent sessions to consider (default: 20)
   --threshold         Regression threshold below baseline (default: 0.1)
   --auto-rollback     Automatically rollback on regression detection
+  --sync-first        Refresh source-truth telemetry before reading watch inputs
+  --sync-force        Force a full rescan during --sync-first
   --help              Show this help message`);
     process.exit(0);
   }
@@ -308,6 +353,10 @@ Options:
     console.error("[ERROR] --skill and --skill-path are required");
     process.exit(1);
   }
+  if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
+    console.error("[ERROR] --sync-force requires --sync-first");
+    process.exit(1);
+  }
   const rawWindow = values.window ?? "20";
   if (!/^\d+$/.test(rawWindow)) {
@@ -337,6 +386,8 @@ Options:
     windowSessions,
     regressionThreshold,
     autoRollback: values["auto-rollback"] ?? false,
+    syncFirst: values["sync-first"] ?? false,
+    syncForce: values["sync-force"] ?? false,
   });
   console.log(JSON.stringify(result, null, 2));