npm - @poncho-ai/harness - Versions diffs - 0.47.0 → 0.48.0 - Mend

@poncho-ai/harness 0.47.0 → 0.48.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/.turbo/turbo-build.log +5 -5
package/CHANGELOG.md +62 -0
package/dist/index.d.ts +43 -5
package/dist/index.js +73 -11
package/package.json +1 -1
package/src/harness.ts +38 -6
package/src/skill-context.ts +11 -1
package/src/storage/postgres-engine.ts +55 -3

package/.turbo/turbo-build.log CHANGED Viewed

@@ -1,5 +1,5 @@
-> @poncho-ai/harness@0.47.0 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
+> @poncho-ai/harness@0.48.0 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
 > node scripts/embed-docs.js && tsup src/index.ts --format esm --dts
 [embed-docs] Generated poncho-docs.ts with 4 topics
@@ -8,9 +8,9 @@
 [34mCLI[39m tsup v8.5.1
 [34mCLI[39m Target: es2022
 [34mESM[39m Build start
+[32mESM[39m [1mdist/index.js            [22m[32m528.41 KB[39m
 [32mESM[39m [1mdist/isolate-VY35DGLM.js [22m[32m49.43 KB[39m
-[32mESM[39m [1mdist/index.js            [22m[32m525.35 KB[39m
-[32mESM[39m ⚡️ Build success in 249ms
+[32mESM[39m ⚡️ Build success in 216ms
 [34mDTS[39m Build start
-[32mDTS[39m ⚡️ Build success in 7482ms
-[32mDTS[39m [1mdist/index.d.ts [22m[32m85.30 KB[39m
+[32mDTS[39m ⚡️ Build success in 7309ms
+[32mDTS[39m [1mdist/index.d.ts [22m[32m87.43 KB[39m

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,67 @@
 # @poncho-ai/harness
+## 0.48.0
+### Minor Changes
+- [#125](https://github.com/cesr/poncho-ai/pull/125) [`ff66aae`](https://github.com/cesr/poncho-ai/commit/ff66aaeebe6017ca9e1ee4b31ffe0d89bdf5ef28) Thanks [@cesr](https://github.com/cesr)! - harness: add `systemSkillPaths` for platform-shipped system skills
+  New optional `HarnessOptions.systemSkillPaths` (absolute directories,
+  each scanned for `<name>/SKILL.md` at init). System skills are surfaced
+  in `<available_skills>` like any other skill, with their bodies read
+  from local disk on activation — letting a platform ship default skills
+  with the deploy instead of writing them into every tenant's VFS.
+  Precedence is purely additive: per tenant the skill set resolves as
+  repo skills > the tenant's own VFS skills > system skills. So a tenant's
+  `/skills/<same-name>/` overrides a same-named system skill (mirroring
+  the VFS override behavior platforms already rely on for system jobs),
+  and the existing repo-vs-VFS precedence is unchanged. Empty by default —
+  no behavior change for existing consumers.
+  Also exports `loadSkillMetadataFromDirs(dirs)` (extracted from
+  `loadSkillMetadata`) for scanning an explicit list of absolute skill
+  directories.
+## 0.47.1
+### Patch Changes
+- [#122](https://github.com/cesr/poncho-ai/pull/122) [`661536b`](https://github.com/cesr/poncho-ai/commit/661536b8d24691d91dc01e345b828ef6c9884beb) Thanks [@cesr](https://github.com/cesr)! - harness: postgres connection-pool resilience for managed-postgres hosts
+  Managed Postgres providers (Railway, Neon, Heroku, etc.) drop idle
+  TCP connections server-side after a few minutes. The previous
+  postgres-engine config left `idle_timeout` at the porsager/postgres
+  default (0 = never close client-side), so the pool accumulated stale
+  sockets; the first query on one rejected with
+  `write CONNECTION_ENDED <host>:5432` at `durMs=0` and bubbled up as a hard failure to the
+  caller — including user-facing chat turns and the orchestrator's
+  subagent callback rerun.
+  Two complementary settings, plus one belt-and-suspenders retry:
+  - `idle_timeout: 20` — close idle client-side connections before
+    any reasonable provider-side timer fires. Fresh connection on
+    next checkout, no stale-socket race.
+  - `max_lifetime: 60 * 10` (10 min) — recycle long-lived
+    connections defensively, sidestepping provider-side
+    "max connection age" limits.
+  - `private query()` now retries once on `CONNECTION_ENDED` /
+    `CONNECTION_CLOSED` / `CONNECTION_DESTROYED`. Covers the
+    narrow race where a query lands on a connection at the exact
+    instant the provider drops it.
+  Defaults unchanged: `max: 10`, `connect_timeout: 30`. Migration DDL
+  (`sql.unsafe(sql)` inside `executeRaw`) and transactions
+  (`sql.begin(...)`) deliberately don't go through the retry — DDL
+  is `IF NOT EXISTS` idempotent and transactions need atomic scoping.
+  Observed in production: the PonchOS api running on Railway hit this
+  during a subagent test, the orchestrator's auto-callback rerun
+  threw the connection-ended error, a concurrent unhandled async
+  rejection killed the node process, and Railway restarted the
+  replica (~50s). User-facing chat turns started seeing the same
+  error after that. Patch eliminates the source.
 ## 0.47.0
 ### Minor Changes

package/dist/index.d.ts CHANGED Viewed

@@ -1211,6 +1211,17 @@ interface HarnessOptions {
      * Empty by default — no system mounts in the CLI / dev workflow.
      */
     virtualMounts?: VirtualMount[];
+    /**
+     * Absolute directories of platform-shipped "system" skills. Each is
+     * scanned for `<name>/SKILL.md` at init; the bodies live on local disk
+     * and ship with the deploy. System skills are surfaced in
+     * `<available_skills>` like any other skill, but sit at the LOWEST
+     * precedence: a tenant's own `/skills/<same-name>/` (and a repo skill)
+     * overrides a system skill of the same name. Pair with a read-only
+     * `virtualMounts` entry (e.g. "/system/skills/") if the same files
+     * should also be browsable in the VFS. Empty by default.
+     */
+    systemSkillPaths?: string[];
 }
 interface HarnessRunOutput {
     runId: string;
@@ -1243,6 +1254,8 @@ declare class AgentHarness {
     private loadedConfig?;
     private readonly injectedConfig?;
     private loadedSkills;
+    private systemSkills;
+    private readonly systemSkillPaths;
     private skillFingerprint;
     private lastSkillRefreshAt;
     private readonly activeSkillNames;
@@ -1298,10 +1311,15 @@ declare class AgentHarness {
     private getMemoryStore;
     private listActiveSkills;
     /**
-     * Resolve the skill set visible to a given tenant: repo skills plus that
-     * tenant's VFS skills, with repo winning on name collision. Cached per
-     * tenant; cache invalidates on VFS writes under /skills/ via
-     * invalidateSkillsForTenant.
+     * Resolve the skill set visible to a given tenant. Three tiers, by
+     * precedence: repo skills > the tenant's own VFS skills > platform
+     * system skills. So a repo skill wins over a same-named VFS skill
+     * (unchanged), and a tenant's `/skills/<name>/` overrides a same-named
+     * system skill (the deploy-shipped default). Cached per tenant; cache
+     * invalidates on VFS writes under /skills/ via invalidateSkillsForTenant.
+     * System skills are static within a process, so they don't participate
+     * in the fingerprint — but a VFS override does (it changes a /skills
+     * path), which recomputes the cache and lets the override take effect.
      */
     private getSkillsForTenant;
     invalidateSkillsForTenant(tenantId: string): void;
@@ -1451,6 +1469,7 @@ declare const parseSkillFrontmatter: (content: string) => {
     };
 } | undefined;
 declare const loadSkillMetadata: (workingDir: string, extraSkillPaths?: string[]) => Promise<SkillMetadata[]>;
+declare const loadSkillMetadataFromDirs: (skillDirs: string[]) => Promise<SkillMetadata[]>;
 declare const buildSkillContextWindow: (skills: SkillMetadata[]) => string;
 declare const loadVfsSkillMetadata: (engine: StorageEngine, tenantId: string) => Promise<SkillMetadata[]>;
 declare const mergeSkills: (repoSkills: SkillMetadata[], vfsSkills: SkillMetadata[], onCollision?: (vfsSkill: SkillMetadata) => void) => SkillMetadata[];
@@ -1714,6 +1733,25 @@ declare class PostgresEngine extends SqlStorageEngine {
     refreshPathCache(tenantId: string): Promise<void>;
     private patchVfs;
     private query;
+    /**
+     * Single retry on a transient connection-layer failure. The
+     * `idle_timeout` / `max_lifetime` config above prevents *most*
+     * stale-connection cases, but a query can still race a
+     * provider-initiated drop in flight — the postgres.js client
+     * rejects with `code: "CONNECTION_ENDED"` and the next attempt
+     * checks out a fresh connection from the pool. One retry is
+     * enough; if it fails again the host-side network is genuinely
+     * broken and the caller should see the error.
+     *
+     * Only retries reads + the standard exec/run paths in `query`;
+     * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
+     * `sql.begin(...)` transactions are deliberately not wrapped — those are
+     * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
+     * atomically scoped (transactions roll back cleanly), and adding
+     * a retry around them would complicate the transaction
+     * semantics.
+     */
+    private runWithRetry;
     private addToPathCache;
     private removeFromPathCache;
 }
@@ -2061,4 +2099,4 @@ interface RunConversationTurnResult {
 }
 declare const runConversationTurn: (opts: RunConversationTurnOpts) => Promise<RunConversationTurnResult>;
-export { type ActiveConversationRun, type ActiveSubagentRun, type AgentFrontmatter, AgentHarness, type AgentIdentity, type AgentLimitsConfig, type AgentModelConfig, AgentOrchestrator, type ApprovalEventItem, type ArchivedToolResult$1 as ArchivedToolResult, type BashConfig, BashEnvironmentManager, type BashExecutionLimits, type BuiltInToolToggles, CALLBACK_LOCK_STALE_MS, type CompactMessagesOptions, type CompactResult, type CompactionConfig, type ContinuationHooks, type Conversation, type ConversationCreateInit, type ConversationState, type ConversationStatusSnapshot, type ConversationStore, type ConversationSummary, type CreateSkillToolsOptions, type CronJobConfig, DEFAULT_AGENT_DESCRIPTION, DEFAULT_AGENT_NAME, DEFAULT_MAX_STEPS, DEFAULT_MODEL_NAME, DEFAULT_MODEL_PROVIDER, DEFAULT_TEMPERATURE, DEFAULT_TIMEOUT, type DefaultAgentDefinitionOptions, type EventSink, type ExecuteTurnResult, type HarnessOptions, type HarnessRunOutput, type HistorySource, InMemoryConversationStore, InMemoryEngine, InMemoryStateStore, type IsolateBinding, type IsolateConfig, LocalMcpBridge, LocalUploadStore, MAX_CONCURRENT_SUBAGENTS, MAX_CONTINUATION_COUNT, MAX_SUBAGENT_CALLBACK_COUNT, MAX_SUBAGENT_NESTING, type MainMemory, type McpConfig, type MemoryConfig, type MemoryStore, type MessagingChannelConfig, type ModelProviderFactory, type NetworkConfig, OPENAI_CODEX_CLIENT_ID, type OpenAICodexAuthConfig, type OpenAICodexDeviceAuthRequest, type OpenAICodexSession, type OrchestratorHooks, type OrchestratorOptions, type OtlpConfig, type OtlpOption, PONCHO_UPLOAD_SCHEME, type ParsedAgent, type PendingSubagentApproval, type PendingSubagentResult, type PendingToolCall, type PonchoConfig, PonchoFsAdapter, PostgresEngine, type ProviderConfig, type Recurrence, type RecurrenceType, type Reminder, type ReminderCreateInput, type ReminderStatus, type ReminderStore, type RemoteMcpServerConfig, type RunConversationTurnOpts, type RunConversationTurnResult, type RunOutcome, type RunRequest, type 
RuntimeRenderContext, S3UploadStore, STALE_SUBAGENT_THRESHOLD_MS, STORAGE_SCHEMA_VERSION, type SecretsStore, type SkillContextEntry, type SkillMetadata, type SkillSource, SqliteEngine, type StateConfig, type StateProviderName, type StateStore, type StorageConfig, type StorageEngine, type StorageFactoryOptions, type StorageProvider, type StoredApproval, type SubagentManager, type SubagentResult, type SubagentSpawnResult, type SubagentSummary, type SubagentTranscript, type SubagentTranscriptMode, TOOL_RESULT_ARCHIVE_PARAM, type TelemetryConfig, TelemetryEmitter, type TenantTokenPayload, type ToolAccess, type ToolCall, ToolDispatcher, type ToolExecutionResult, type TurnDraftState, type TurnResultMetadata, type TurnSection, type UploadStore, type UploadsConfig, VFS_SCHEME, VercelBlobUploadStore, type VfsDirEntry, type VfsStat, type VirtualMount, applyTurnMetadata, buildAgentDirectoryName, buildApprovalCheckpoints, buildAssistantMetadata, buildSkillContextWindow, buildToolCompletedText, cloneSections, compactMessages, completeOpenAICodexDeviceAuth, computeNextOccurrence, createBashTool, createConversationStore, createConversationStoreFromEngine, createDefaultTools, createDeleteDirectoryTool, createDeleteTool, createEditTool, createMemoryStore, createMemoryStoreFromEngine, createMemoryTools, createModelProvider, createReminderStore, createReminderStoreFromEngine, createReminderTools, createSearchTools, createSecretsStore, createSkillTools, createStateStore, createStorageEngine, createSubagentTools, createTodoStoreFromEngine, createTurnDraftState, createUploadStore, createWriteTool, decodeFileInputData, defaultAgentDefinition, deleteOpenAICodexSession, deriveUploadKey, ensureAgentIdentity, estimateTokens, estimateTotalTokens, executeConversationTurn, findSafeSplitPoint, flushTurnDraft, generateAgentId, getAgentStoreDirectory, getModelContextWindow, getOpenAICodexAccessToken, getOpenAICodexAuthFilePath, getOpenAICodexRequiredScopes, getPonchoStoreRoot, isMessageArray, 
jsonSchemaToZod, loadCanonicalHistory, loadPonchoConfig, loadRunHistory, loadSkillContext, loadSkillInstructions, loadSkillMetadata, loadVfsSkillMetadata, mergeSkills, normalizeApprovalCheckpoint, normalizeOtlp, normalizeScriptPolicyPath, normalizeToolAccess, parseAgentFile, parseAgentMarkdown, parseSkillFrontmatter, ponchoDocsTool, readOpenAICodexSession, readSkillResource, recordStandardTurnEvent, renderAgentPrompt, resolveAgentIdentity, resolveCompactionConfig, resolveEnv, resolveMemoryConfig, resolveRunRequest, resolveSkillDirs, resolveStateConfig, runConversationTurn, slugifyStorageComponent, startOpenAICodexDeviceAuth, verifyTenantToken, withToolResultArchiveParam, writeOpenAICodexSession };
+export { type ActiveConversationRun, type ActiveSubagentRun, type AgentFrontmatter, AgentHarness, type AgentIdentity, type AgentLimitsConfig, type AgentModelConfig, AgentOrchestrator, type ApprovalEventItem, type ArchivedToolResult$1 as ArchivedToolResult, type BashConfig, BashEnvironmentManager, type BashExecutionLimits, type BuiltInToolToggles, CALLBACK_LOCK_STALE_MS, type CompactMessagesOptions, type CompactResult, type CompactionConfig, type ContinuationHooks, type Conversation, type ConversationCreateInit, type ConversationState, type ConversationStatusSnapshot, type ConversationStore, type ConversationSummary, type CreateSkillToolsOptions, type CronJobConfig, DEFAULT_AGENT_DESCRIPTION, DEFAULT_AGENT_NAME, DEFAULT_MAX_STEPS, DEFAULT_MODEL_NAME, DEFAULT_MODEL_PROVIDER, DEFAULT_TEMPERATURE, DEFAULT_TIMEOUT, type DefaultAgentDefinitionOptions, type EventSink, type ExecuteTurnResult, type HarnessOptions, type HarnessRunOutput, type HistorySource, InMemoryConversationStore, InMemoryEngine, InMemoryStateStore, type IsolateBinding, type IsolateConfig, LocalMcpBridge, LocalUploadStore, MAX_CONCURRENT_SUBAGENTS, MAX_CONTINUATION_COUNT, MAX_SUBAGENT_CALLBACK_COUNT, MAX_SUBAGENT_NESTING, type MainMemory, type McpConfig, type MemoryConfig, type MemoryStore, type MessagingChannelConfig, type ModelProviderFactory, type NetworkConfig, OPENAI_CODEX_CLIENT_ID, type OpenAICodexAuthConfig, type OpenAICodexDeviceAuthRequest, type OpenAICodexSession, type OrchestratorHooks, type OrchestratorOptions, type OtlpConfig, type OtlpOption, PONCHO_UPLOAD_SCHEME, type ParsedAgent, type PendingSubagentApproval, type PendingSubagentResult, type PendingToolCall, type PonchoConfig, PonchoFsAdapter, PostgresEngine, type ProviderConfig, type Recurrence, type RecurrenceType, type Reminder, type ReminderCreateInput, type ReminderStatus, type ReminderStore, type RemoteMcpServerConfig, type RunConversationTurnOpts, type RunConversationTurnResult, type RunOutcome, type RunRequest, type 
RuntimeRenderContext, S3UploadStore, STALE_SUBAGENT_THRESHOLD_MS, STORAGE_SCHEMA_VERSION, type SecretsStore, type SkillContextEntry, type SkillMetadata, type SkillSource, SqliteEngine, type StateConfig, type StateProviderName, type StateStore, type StorageConfig, type StorageEngine, type StorageFactoryOptions, type StorageProvider, type StoredApproval, type SubagentManager, type SubagentResult, type SubagentSpawnResult, type SubagentSummary, type SubagentTranscript, type SubagentTranscriptMode, TOOL_RESULT_ARCHIVE_PARAM, type TelemetryConfig, TelemetryEmitter, type TenantTokenPayload, type ToolAccess, type ToolCall, ToolDispatcher, type ToolExecutionResult, type TurnDraftState, type TurnResultMetadata, type TurnSection, type UploadStore, type UploadsConfig, VFS_SCHEME, VercelBlobUploadStore, type VfsDirEntry, type VfsStat, type VirtualMount, applyTurnMetadata, buildAgentDirectoryName, buildApprovalCheckpoints, buildAssistantMetadata, buildSkillContextWindow, buildToolCompletedText, cloneSections, compactMessages, completeOpenAICodexDeviceAuth, computeNextOccurrence, createBashTool, createConversationStore, createConversationStoreFromEngine, createDefaultTools, createDeleteDirectoryTool, createDeleteTool, createEditTool, createMemoryStore, createMemoryStoreFromEngine, createMemoryTools, createModelProvider, createReminderStore, createReminderStoreFromEngine, createReminderTools, createSearchTools, createSecretsStore, createSkillTools, createStateStore, createStorageEngine, createSubagentTools, createTodoStoreFromEngine, createTurnDraftState, createUploadStore, createWriteTool, decodeFileInputData, defaultAgentDefinition, deleteOpenAICodexSession, deriveUploadKey, ensureAgentIdentity, estimateTokens, estimateTotalTokens, executeConversationTurn, findSafeSplitPoint, flushTurnDraft, generateAgentId, getAgentStoreDirectory, getModelContextWindow, getOpenAICodexAccessToken, getOpenAICodexAuthFilePath, getOpenAICodexRequiredScopes, getPonchoStoreRoot, isMessageArray, 
jsonSchemaToZod, loadCanonicalHistory, loadPonchoConfig, loadRunHistory, loadSkillContext, loadSkillInstructions, loadSkillMetadata, loadSkillMetadataFromDirs, loadVfsSkillMetadata, mergeSkills, normalizeApprovalCheckpoint, normalizeOtlp, normalizeScriptPolicyPath, normalizeToolAccess, parseAgentFile, parseAgentMarkdown, parseSkillFrontmatter, ponchoDocsTool, readOpenAICodexSession, readSkillResource, recordStandardTurnEvent, renderAgentPrompt, resolveAgentIdentity, resolveCompactionConfig, resolveEnv, resolveMemoryConfig, resolveRunRequest, resolveSkillDirs, resolveStateConfig, runConversationTurn, slugifyStorageComponent, startOpenAICodexDeviceAuth, verifyTenantToken, withToolResultArchiveParam, writeOpenAICodexSession };

package/dist/index.js CHANGED Viewed

@@ -4433,7 +4433,28 @@ var PostgresEngine = class extends SqlStorageEngine {
     this.sql = postgres(url, {
       onnotice: () => {
       },
-      prepare: false
+      prepare: false,
+      // Connection-pool resilience. Managed Postgres providers
+      // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
+      // connections server-side after a few minutes. Without these
+      // knobs, porsager/postgres keeps stale sockets in the pool;
+      // the next query on one rejects with
+      // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
+      // as a hard failure to the caller. Two complementary settings:
+      //
+      //   - `idle_timeout: 20` closes idle connections client-side
+      //     after 20s, before any reasonable provider-side timer
+      //     fires. Fresh connection on next checkout = no stale
+      //     socket race.
+      //   - `max_lifetime: 600` (10 min) recycles long-lived
+      //     connections defensively even if they've stayed busy,
+      //     which sidesteps a separate class of provider-side
+      //     "max connection age" limits.
+      //
+      // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
+      // pool size + initial connect behavior unchanged.
+      idle_timeout: 20,
+      max_lifetime: 60 * 10
     });
   }
   async initialize() {
@@ -4477,10 +4498,38 @@ var PostgresEngine = class extends SqlStorageEngine {
     };
   }
   async query(sql, params) {
-    if (!params || params.length === 0) {
-      return this.sql.unsafe(sql);
+    return this.runWithRetry(
+      () => !params || params.length === 0 ? this.sql.unsafe(sql) : this.sql.unsafe(sql, params)
+    );
+  }
+  /**
+   * Single retry on a transient connection-layer failure. The
+   * `idle_timeout` / `max_lifetime` config above prevents *most*
+   * stale-connection cases, but a query can still race a
+   * provider-initiated drop in flight — the postgres.js client
+   * rejects with `code: "CONNECTION_ENDED"` and the next attempt
+   * checks out a fresh connection from the pool. One retry is
+   * enough; if it fails again the host-side network is genuinely
+   * broken and the caller should see the error.
+   *
+   * Only retries reads + the standard exec/run paths in `query`;
+   * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
+   * `sql.begin(...)` transactions are deliberately not wrapped — those are
+   * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
+   * atomically scoped (transactions roll back cleanly), and adding
+   * a retry around them would complicate the transaction
+   * semantics.
+   */
+  async runWithRetry(fn) {
+    try {
+      return await fn();
+    } catch (err) {
+      const code = err?.code;
+      if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
+        return await fn();
+      }
+      throw err;
     }
-    return this.sql.unsafe(sql, params);
   }
   addToPathCache(tenantId, path) {
     const paths = this.pathCache.get(tenantId);
@@ -7314,7 +7363,9 @@ var collectSkillManifests = async (directory) => {
   return files;
 };
 var loadSkillMetadata = async (workingDir, extraSkillPaths) => {
-  const skillDirs = resolveSkillDirs(workingDir, extraSkillPaths);
+  return loadSkillMetadataFromDirs(resolveSkillDirs(workingDir, extraSkillPaths));
+};
+var loadSkillMetadataFromDirs = async (skillDirs) => {
   const allManifests = [];
   for (const dir of skillDirs) {
     try {
@@ -9071,6 +9122,8 @@ var AgentHarness = class _AgentHarness {
   loadedConfig;
   injectedConfig;
   loadedSkills = [];
+  systemSkills = [];
+  systemSkillPaths = [];
   skillFingerprint = "";
   lastSkillRefreshAt = 0;
   activeSkillNames = /* @__PURE__ */ new Set();
@@ -9276,6 +9329,7 @@ var AgentHarness = class _AgentHarness {
       this.injectedStorageEngine = true;
     }
     this.virtualMounts = options.virtualMounts ?? [];
+    this.systemSkillPaths = options.systemSkillPaths ?? [];
     if (options.toolDefinitions?.length) {
       this.dispatcher.registerMany(options.toolDefinitions);
     }
@@ -9437,14 +9491,19 @@ var AgentHarness = class _AgentHarness {
     return [...this.activeSkillNames].sort();
   }
   /**
-   * Resolve the skill set visible to a given tenant: repo skills plus that
-   * tenant's VFS skills, with repo winning on name collision. Cached per
-   * tenant; cache invalidates on VFS writes under /skills/ via
-   * invalidateSkillsForTenant.
+   * Resolve the skill set visible to a given tenant. Three tiers, by
+   * precedence: repo skills > the tenant's own VFS skills > platform
+   * system skills. So a repo skill wins over a same-named VFS skill
+   * (unchanged), and a tenant's `/skills/<name>/` overrides a same-named
+   * system skill (the deploy-shipped default). Cached per tenant; cache
+   * invalidates on VFS writes under /skills/ via invalidateSkillsForTenant.
+   * System skills are static within a process, so they don't participate
+   * in the fingerprint — but a VFS override does (it changes a /skills
+   * path), which recomputes the cache and lets the override take effect.
    */
   async getSkillsForTenant(tenantId) {
     if (!this.storageEngine) {
-      return this.loadedSkills;
+      return mergeSkills(this.loadedSkills, this.systemSkills);
     }
     const effectiveTenant = tenantId || "__default__";
     const engineWithRefresh = this.storageEngine;
@@ -9457,7 +9516,7 @@ var AgentHarness = class _AgentHarness {
       return cached.skills;
     }
     const vfsSkills = await loadVfsSkillMetadata(this.storageEngine, effectiveTenant);
-    const merged = mergeSkills(this.loadedSkills, vfsSkills, (skipped) => {
+    const repoAndVfs = mergeSkills(this.loadedSkills, vfsSkills, (skipped) => {
       const key = `${effectiveTenant}:${skipped.name}`;
       if (this.vfsSkillCollisionWarnings.has(key)) return;
       this.vfsSkillCollisionWarnings.add(key);
@@ -9465,6 +9524,7 @@ var AgentHarness = class _AgentHarness {
         `VFS skill "${skipped.name}" for tenant ${effectiveTenant} ignored: a repo skill with the same name takes precedence.`
       );
     });
+    const merged = mergeSkills(repoAndVfs, this.systemSkills);
     this.skillCache.set(effectiveTenant, { skills: merged, fingerprint });
     return merged;
   }
@@ -9795,6 +9855,7 @@ var AgentHarness = class _AgentHarness {
     const extraSkillPaths = config?.skillPaths;
     const skillMetadata = await loadSkillMetadata(this.workingDir, extraSkillPaths);
     this.loadedSkills = skillMetadata;
+    this.systemSkills = this.systemSkillPaths.length ? await loadSkillMetadataFromDirs(this.systemSkillPaths) : [];
     this.skillFingerprint = this.buildSkillFingerprint(skillMetadata);
     this.registerSkillTools();
     const agentId = this.parsedAgent.frontmatter.id ?? this.parsedAgent.frontmatter.name;
@@ -13931,6 +13992,7 @@ export {
   loadSkillContext,
   loadSkillInstructions,
   loadSkillMetadata,
+  loadSkillMetadataFromDirs,
   loadVfsSkillMetadata,
   mergeSkills,
   normalizeApprovalCheckpoint,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@poncho-ai/harness",
-  "version": "0.47.0",
+  "version": "0.48.0",
   "description": "Agent execution runtime - conversation loop, tool dispatch, streaming",
   "repository": {
     "type": "git",

package/src/harness.ts CHANGED Viewed

@@ -55,6 +55,7 @@ import { createModelProvider, getModelContextWindow, type ModelProviderFactory,
 import {
   buildSkillContextWindow,
   loadSkillMetadata,
+  loadSkillMetadataFromDirs,
   loadVfsSkillMetadata,
   mergeSkills,
 } from "./skill-context.js";
@@ -134,6 +135,17 @@ export interface HarnessOptions {
    * Empty by default — no system mounts in the CLI / dev workflow.
    */
   virtualMounts?: VirtualMount[];
+  /**
+   * Absolute directories of platform-shipped "system" skills. Each is
+   * scanned for `<name>/SKILL.md` at init; the bodies live on local disk
+   * and ship with the deploy. System skills are surfaced in
+   * `<available_skills>` like any other skill, but sit at the LOWEST
+   * precedence: a tenant's own `/skills/<same-name>/` (and a repo skill)
+   * overrides a system skill of the same name. Pair with a read-only
+   * `virtualMounts` entry (e.g. "/system/skills/") if the same files
+   * should also be browsable in the VFS. Empty by default.
+   */
+  systemSkillPaths?: string[];
 }
 export interface HarnessRunOutput {
@@ -839,6 +851,8 @@ export class AgentHarness {
   private loadedConfig?: PonchoConfig;
   private readonly injectedConfig?: PonchoConfig;
   private loadedSkills: SkillMetadata[] = [];
+  private systemSkills: SkillMetadata[] = [];
+  private readonly systemSkillPaths: string[] = [];
   private skillFingerprint = "";
   private lastSkillRefreshAt = 0;
   private readonly activeSkillNames = new Set<string>();
@@ -1077,6 +1091,7 @@ export class AgentHarness {
       this.injectedStorageEngine = true;
     }
     this.virtualMounts = options.virtualMounts ?? [];
+    this.systemSkillPaths = options.systemSkillPaths ?? [];
     if (options.toolDefinitions?.length) {
       this.dispatcher.registerMany(options.toolDefinitions);
@@ -1271,14 +1286,19 @@ export class AgentHarness {
   }
   /**
-   * Resolve the skill set visible to a given tenant: repo skills plus that
-   * tenant's VFS skills, with repo winning on name collision. Cached per
-   * tenant; cache invalidates on VFS writes under /skills/ via
-   * invalidateSkillsForTenant.
+   * Resolve the skill set visible to a given tenant. Three tiers, by
+   * precedence: repo skills > the tenant's own VFS skills > platform
+   * system skills. So a repo skill wins over a same-named VFS skill
+   * (unchanged), and a tenant's `/skills/<name>/` overrides a same-named
+   * system skill (the deploy-shipped default). Cached per tenant; cache
+   * invalidates on VFS writes under /skills/ via invalidateSkillsForTenant.
+   * System skills are static within a process, so they don't participate
+   * in the fingerprint — but a VFS override does (it changes a /skills
+   * path), which recomputes the cache and lets the override take effect.
    */
   private async getSkillsForTenant(tenantId: string | undefined | null): Promise<SkillMetadata[]> {
     if (!this.storageEngine) {
-      return this.loadedSkills;
+      return mergeSkills(this.loadedSkills, this.systemSkills);
     }
     // Mirror the rest of the harness: undefined tenantId falls back to
     // "__default__" so dev-mode (no auth) conversations see the same VFS
@@ -1305,7 +1325,7 @@ export class AgentHarness {
       return cached.skills;
     }
     const vfsSkills = await loadVfsSkillMetadata(this.storageEngine, effectiveTenant);
-    const merged = mergeSkills(this.loadedSkills, vfsSkills, (skipped) => {
+    const repoAndVfs = mergeSkills(this.loadedSkills, vfsSkills, (skipped) => {
       const key = `${effectiveTenant}:${skipped.name}`;
       if (this.vfsSkillCollisionWarnings.has(key)) return;
       this.vfsSkillCollisionWarnings.add(key);
@@ -1313,6 +1333,11 @@ export class AgentHarness {
         `VFS skill "${skipped.name}" for tenant ${effectiveTenant} ignored: a repo skill with the same name takes precedence.`,
       );
     });
+    // System skills sit at the bottom: a repo or VFS skill of the same
+    // name overrides them. Overriding a system default is the intended
+    // user workflow (mirrors /jobs system-default overrides), so the
+    // collision is silent — not a warning.
+    const merged = mergeSkills(repoAndVfs, this.systemSkills);
     this.skillCache.set(effectiveTenant, { skills: merged, fingerprint });
     return merged;
   }
@@ -1706,6 +1731,13 @@ export class AgentHarness {
     const extraSkillPaths = config?.skillPaths;
     const skillMetadata = await loadSkillMetadata(this.workingDir, extraSkillPaths);
     this.loadedSkills = skillMetadata;
+    // Platform-shipped system skills, scanned from absolute dirs on disk.
+    // Loaded once at init (they ship with the deploy and don't change
+    // within a process). Merged at LOWEST precedence per tenant — see
+    // getSkillsForTenant.
+    this.systemSkills = this.systemSkillPaths.length
+      ? await loadSkillMetadataFromDirs(this.systemSkillPaths)
+      : [];
     this.skillFingerprint = this.buildSkillFingerprint(skillMetadata);
     this.registerSkillTools();
     const agentId = this.parsedAgent.frontmatter.id ?? this.parsedAgent.frontmatter.name;

package/src/skill-context.ts CHANGED Viewed

@@ -209,7 +209,17 @@ export const loadSkillMetadata = async (
   workingDir: string,
   extraSkillPaths?: string[],
 ): Promise<SkillMetadata[]> => {
-  const skillDirs = resolveSkillDirs(workingDir, extraSkillPaths);
+  return loadSkillMetadataFromDirs(resolveSkillDirs(workingDir, extraSkillPaths));
+};
+// Scan an explicit list of absolute directories for `<name>/SKILL.md`
+// manifests and return their metadata as `source: "repo"` skills (body
+// read from disk on activation). Used both by `loadSkillMetadata` (after
+// resolving repo skill dirs against the working dir) and directly for
+// platform-shipped "system" skills whose source dirs are already absolute.
+export const loadSkillMetadataFromDirs = async (
+  skillDirs: string[],
+): Promise<SkillMetadata[]> => {
   const allManifests: string[] = [];
   for (const dir of skillDirs) {

package/src/storage/postgres-engine.ts CHANGED Viewed

@@ -57,6 +57,27 @@ export class PostgresEngine extends SqlStorageEngine {
     this.sql = postgres(url, {
       onnotice: () => {},
       prepare: false,
+      // Connection-pool resilience. Managed Postgres providers
+      // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
+      // connections server-side after a few minutes. Without these
+      // knobs, porsager/postgres keeps stale sockets in the pool;
+      // the next query on one rejects with
+      // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
+      // as a hard failure to the caller. Two complementary settings:
+      //
+      //   - `idle_timeout: 20` closes idle connections client-side
+      //     after 20s, before any reasonable provider-side timer
+      //     fires. Fresh connection on next checkout = no stale
+      //     socket race.
+      //   - `max_lifetime: 600` (10 min) recycles long-lived
+      //     connections defensively even if they've stayed busy,
+      //     which sidesteps a separate class of provider-side
+      //     "max connection age" limits.
+      //
+      // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
+      // pool size + initial connect behavior unchanged.
+      idle_timeout: 20,
+      max_lifetime: 60 * 10,
     });
   }
@@ -118,10 +139,41 @@ export class PostgresEngine extends SqlStorageEngine {
   }
   private async query(sql: string, params?: unknown[]): Promise<any[]> {
-    if (!params || params.length === 0) {
-      return this.sql.unsafe(sql);
+    return this.runWithRetry(() =>
+      !params || params.length === 0
+        ? this.sql.unsafe(sql)
+        : this.sql.unsafe(sql, params),
+    );
+  }
+  /**
+   * Single retry on a transient connection-layer failure. The
+   * `idle_timeout` / `max_lifetime` config above prevents *most*
+   * stale-connection cases, but a query can still race a
+   * provider-initiated drop in flight — the postgres.js client
+   * rejects with `code: "CONNECTION_ENDED"` and the next attempt
+   * checks out a fresh connection from the pool. One retry is
+   * enough; if it fails again the host-side network is genuinely
+   * broken and the caller should see the error.
+   *
+   * Only retries reads + the standard exec/run paths in `query`;
+   * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
+   * `sql.begin(...)` transactions are deliberately not wrapped — those are
+   * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
+   * atomically scoped (transactions roll back cleanly), and adding
+   * a retry around them would complicate the transaction
+   * semantics.
+   */
+  private async runWithRetry<T>(fn: () => Promise<T>): Promise<T> {
+    try {
+      return await fn();
+    } catch (err) {
+      const code = (err as { code?: string } | null | undefined)?.code;
+      if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
+        return await fn();
+      }
+      throw err;
     }
-    return this.sql.unsafe(sql, params);
   }
   private addToPathCache(tenantId: string, path: string): void {