@poncho-ai/harness 0.47.0 → 0.48.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- > @poncho-ai/harness@0.47.0 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
2
+ > @poncho-ai/harness@0.48.0 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
3
3
  > node scripts/embed-docs.js && tsup src/index.ts --format esm --dts
4
4
 
5
5
  [embed-docs] Generated poncho-docs.ts with 4 topics
@@ -8,9 +8,9 @@
8
8
  CLI tsup v8.5.1
9
9
  CLI Target: es2022
10
10
  ESM Build start
11
+ ESM dist/index.js 528.41 KB
11
12
  ESM dist/isolate-VY35DGLM.js 49.43 KB
12
- ESM dist/index.js 525.35 KB
13
- ESM ⚡️ Build success in 249ms
13
+ ESM ⚡️ Build success in 216ms
14
14
  DTS Build start
15
- DTS ⚡️ Build success in 7482ms
16
- DTS dist/index.d.ts 85.30 KB
15
+ DTS ⚡️ Build success in 7309ms
16
+ DTS dist/index.d.ts 87.43 KB
package/CHANGELOG.md CHANGED
@@ -1,5 +1,67 @@
1
1
  # @poncho-ai/harness
2
2
 
3
+ ## 0.48.0
4
+
5
+ ### Minor Changes
6
+
7
+ - [#125](https://github.com/cesr/poncho-ai/pull/125) [`ff66aae`](https://github.com/cesr/poncho-ai/commit/ff66aaeebe6017ca9e1ee4b31ffe0d89bdf5ef28) Thanks [@cesr](https://github.com/cesr)! - harness: add `systemSkillPaths` for platform-shipped system skills
8
+
9
+ New optional `HarnessOptions.systemSkillPaths` (absolute directories,
10
+ each scanned for `<name>/SKILL.md` at init). System skills are surfaced
11
+ in `<available_skills>` like any other skill, with their bodies read
12
+ from local disk on activation — letting a platform ship default skills
13
+ with the deploy instead of writing them into every tenant's VFS.
14
+
15
+ The new tier is purely additive: per tenant the skill set resolves as
16
+ repo skills > the tenant's own VFS skills > system skills. So a tenant's
17
+ `/skills/<same-name>/` overrides a same-named system skill (mirroring
18
+ the VFS override behavior platforms already rely on for system jobs),
19
+ and the existing repo-vs-VFS precedence is unchanged. Empty by default —
20
+ no behavior change for existing consumers.
21
+
22
+ Also exports `loadSkillMetadataFromDirs(dirs)` (extracted from
23
+ `loadSkillMetadata`) for scanning an explicit list of absolute skill
24
+ directories.
25
+
26
+ ## 0.47.1
27
+
28
+ ### Patch Changes
29
+
30
+ - [#122](https://github.com/cesr/poncho-ai/pull/122) [`661536b`](https://github.com/cesr/poncho-ai/commit/661536b8d24691d91dc01e345b828ef6c9884beb) Thanks [@cesr](https://github.com/cesr)! - harness: postgres connection-pool resilience for managed-postgres hosts
31
+
32
+ Managed Postgres providers (Railway, Neon, Heroku, etc.) drop idle
33
+ TCP connections server-side after a few minutes. The previous
34
+ postgres-engine config left `idle_timeout` at the porsager/postgres
35
+ default (0 = never close client-side), so the pool accumulated stale
36
+ sockets; the first query on one rejected with `write CONNECTION_ENDED
37
+ <host>:5432` at `durMs=0` and bubbled up as a hard failure to the
38
+ caller — including user-facing chat turns and the orchestrator's
39
+ subagent callback rerun.
40
+
41
+ Two complementary settings, plus one belt-and-suspenders retry:
42
+ - `idle_timeout: 20` — close idle client-side connections before
43
+ any reasonable provider-side timer fires. Fresh connection on
44
+ next checkout, no stale-socket race.
45
+ - `max_lifetime: 60 * 10` (10 min) — recycle long-lived
46
+ connections defensively, sidestepping provider-side
47
+ "max connection age" limits.
48
+ - `private query()` now retries once on `CONNECTION_ENDED` /
49
+ `CONNECTION_CLOSED` / `CONNECTION_DESTROYED`. Covers the
50
+ narrow race where a query lands on a connection at the exact
51
+ instant the provider drops it.
52
+
53
+ Defaults unchanged: `max: 10`, `connect_timeout: 30`. Migration DDL
54
+ (`sql.unsafe(sql)` inside `executeRaw`) and transactions
55
+ (`sql.begin(...)`) deliberately don't go through the retry — DDL
56
+ is `IF NOT EXISTS` idempotent and transactions need atomic scoping.
57
+
58
+ Observed in production: the PonchOS api running on Railway hit this
59
+ during a subagent test, the orchestrator's auto-callback rerun
60
+ threw the connection-ended error, a concurrent unhandled async
61
+ rejection killed the node process, and Railway restarted the
62
+ replica (~50s). User-facing chat turns started seeing the same
63
+ error after that. Patch eliminates the source.
64
+
3
65
  ## 0.47.0
4
66
 
5
67
  ### Minor Changes
package/dist/index.d.ts CHANGED
@@ -1211,6 +1211,17 @@ interface HarnessOptions {
1211
1211
  * Empty by default — no system mounts in the CLI / dev workflow.
1212
1212
  */
1213
1213
  virtualMounts?: VirtualMount[];
1214
+ /**
1215
+ * Absolute directories of platform-shipped "system" skills. Each is
1216
+ * scanned for `<name>/SKILL.md` at init; the bodies live on local disk
1217
+ * and ship with the deploy. System skills are surfaced in
1218
+ * `<available_skills>` like any other skill, but sit at the LOWEST
1219
+ * precedence: a tenant's own `/skills/<same-name>/` (and a repo skill)
1220
+ * overrides a system skill of the same name. Pair with a read-only
1221
+ * `virtualMounts` entry (e.g. "/system/skills/") if the same files
1222
+ * should also be browsable in the VFS. Empty by default.
1223
+ */
1224
+ systemSkillPaths?: string[];
1214
1225
  }
1215
1226
  interface HarnessRunOutput {
1216
1227
  runId: string;
@@ -1243,6 +1254,8 @@ declare class AgentHarness {
1243
1254
  private loadedConfig?;
1244
1255
  private readonly injectedConfig?;
1245
1256
  private loadedSkills;
1257
+ private systemSkills;
1258
+ private readonly systemSkillPaths;
1246
1259
  private skillFingerprint;
1247
1260
  private lastSkillRefreshAt;
1248
1261
  private readonly activeSkillNames;
@@ -1298,10 +1311,15 @@ declare class AgentHarness {
1298
1311
  private getMemoryStore;
1299
1312
  private listActiveSkills;
1300
1313
  /**
1301
- * Resolve the skill set visible to a given tenant: repo skills plus that
1302
- * tenant's VFS skills, with repo winning on name collision. Cached per
1303
- * tenant; cache invalidates on VFS writes under /skills/ via
1304
- * invalidateSkillsForTenant.
1314
+ * Resolve the skill set visible to a given tenant. Three tiers, by
1315
+ * precedence: repo skills > the tenant's own VFS skills > platform
1316
+ * system skills. So a repo skill wins over a same-named VFS skill
1317
+ * (unchanged), and a tenant's `/skills/<name>/` overrides a same-named
1318
+ * system skill (the deploy-shipped default). Cached per tenant; cache
1319
+ * invalidates on VFS writes under /skills/ via invalidateSkillsForTenant.
1320
+ * System skills are static within a process, so they don't participate
1321
+ * in the fingerprint — but a VFS override does (it changes a /skills
1322
+ * path), which recomputes the cache and lets the override take effect.
1305
1323
  */
1306
1324
  private getSkillsForTenant;
1307
1325
  invalidateSkillsForTenant(tenantId: string): void;
@@ -1451,6 +1469,7 @@ declare const parseSkillFrontmatter: (content: string) => {
1451
1469
  };
1452
1470
  } | undefined;
1453
1471
  declare const loadSkillMetadata: (workingDir: string, extraSkillPaths?: string[]) => Promise<SkillMetadata[]>;
1472
+ declare const loadSkillMetadataFromDirs: (skillDirs: string[]) => Promise<SkillMetadata[]>;
1454
1473
  declare const buildSkillContextWindow: (skills: SkillMetadata[]) => string;
1455
1474
  declare const loadVfsSkillMetadata: (engine: StorageEngine, tenantId: string) => Promise<SkillMetadata[]>;
1456
1475
  declare const mergeSkills: (repoSkills: SkillMetadata[], vfsSkills: SkillMetadata[], onCollision?: (vfsSkill: SkillMetadata) => void) => SkillMetadata[];
@@ -1714,6 +1733,25 @@ declare class PostgresEngine extends SqlStorageEngine {
1714
1733
  refreshPathCache(tenantId: string): Promise<void>;
1715
1734
  private patchVfs;
1716
1735
  private query;
1736
+ /**
1737
+ * Single retry on a transient connection-layer failure. The
1738
+ * `idle_timeout` / `max_lifetime` config above prevents *most*
1739
+ * stale-connection cases, but a query can still race a
1740
+ * provider-initiated drop in flight — the postgres.js client
1741
+ * rejects with `code: "CONNECTION_ENDED"` and the next attempt
1742
+ * checks out a fresh connection from the pool. One retry is
1743
+ * enough; if it fails again the host-side network is genuinely
1744
+ * broken and the caller should see the error.
1745
+ *
1746
+ * Only retries reads + the standard exec/run paths in `query`;
1747
+ * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
1748
+ * `sql.begin(...)` transactions are NOT wrapped in the retry — those are
1749
+ * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
1750
+ * atomically scoped (transactions roll back cleanly), and adding
1751
+ * a retry around them would complicate the transaction
1752
+ * semantics.
1753
+ */
1754
+ private runWithRetry;
1717
1755
  private addToPathCache;
1718
1756
  private removeFromPathCache;
1719
1757
  }
@@ -2061,4 +2099,4 @@ interface RunConversationTurnResult {
2061
2099
  }
2062
2100
  declare const runConversationTurn: (opts: RunConversationTurnOpts) => Promise<RunConversationTurnResult>;
2063
2101
 
2064
- export { type ActiveConversationRun, type ActiveSubagentRun, type AgentFrontmatter, AgentHarness, type AgentIdentity, type AgentLimitsConfig, type AgentModelConfig, AgentOrchestrator, type ApprovalEventItem, type ArchivedToolResult$1 as ArchivedToolResult, type BashConfig, BashEnvironmentManager, type BashExecutionLimits, type BuiltInToolToggles, CALLBACK_LOCK_STALE_MS, type CompactMessagesOptions, type CompactResult, type CompactionConfig, type ContinuationHooks, type Conversation, type ConversationCreateInit, type ConversationState, type ConversationStatusSnapshot, type ConversationStore, type ConversationSummary, type CreateSkillToolsOptions, type CronJobConfig, DEFAULT_AGENT_DESCRIPTION, DEFAULT_AGENT_NAME, DEFAULT_MAX_STEPS, DEFAULT_MODEL_NAME, DEFAULT_MODEL_PROVIDER, DEFAULT_TEMPERATURE, DEFAULT_TIMEOUT, type DefaultAgentDefinitionOptions, type EventSink, type ExecuteTurnResult, type HarnessOptions, type HarnessRunOutput, type HistorySource, InMemoryConversationStore, InMemoryEngine, InMemoryStateStore, type IsolateBinding, type IsolateConfig, LocalMcpBridge, LocalUploadStore, MAX_CONCURRENT_SUBAGENTS, MAX_CONTINUATION_COUNT, MAX_SUBAGENT_CALLBACK_COUNT, MAX_SUBAGENT_NESTING, type MainMemory, type McpConfig, type MemoryConfig, type MemoryStore, type MessagingChannelConfig, type ModelProviderFactory, type NetworkConfig, OPENAI_CODEX_CLIENT_ID, type OpenAICodexAuthConfig, type OpenAICodexDeviceAuthRequest, type OpenAICodexSession, type OrchestratorHooks, type OrchestratorOptions, type OtlpConfig, type OtlpOption, PONCHO_UPLOAD_SCHEME, type ParsedAgent, type PendingSubagentApproval, type PendingSubagentResult, type PendingToolCall, type PonchoConfig, PonchoFsAdapter, PostgresEngine, type ProviderConfig, type Recurrence, type RecurrenceType, type Reminder, type ReminderCreateInput, type ReminderStatus, type ReminderStore, type RemoteMcpServerConfig, type RunConversationTurnOpts, type RunConversationTurnResult, type RunOutcome, type RunRequest, type 
RuntimeRenderContext, S3UploadStore, STALE_SUBAGENT_THRESHOLD_MS, STORAGE_SCHEMA_VERSION, type SecretsStore, type SkillContextEntry, type SkillMetadata, type SkillSource, SqliteEngine, type StateConfig, type StateProviderName, type StateStore, type StorageConfig, type StorageEngine, type StorageFactoryOptions, type StorageProvider, type StoredApproval, type SubagentManager, type SubagentResult, type SubagentSpawnResult, type SubagentSummary, type SubagentTranscript, type SubagentTranscriptMode, TOOL_RESULT_ARCHIVE_PARAM, type TelemetryConfig, TelemetryEmitter, type TenantTokenPayload, type ToolAccess, type ToolCall, ToolDispatcher, type ToolExecutionResult, type TurnDraftState, type TurnResultMetadata, type TurnSection, type UploadStore, type UploadsConfig, VFS_SCHEME, VercelBlobUploadStore, type VfsDirEntry, type VfsStat, type VirtualMount, applyTurnMetadata, buildAgentDirectoryName, buildApprovalCheckpoints, buildAssistantMetadata, buildSkillContextWindow, buildToolCompletedText, cloneSections, compactMessages, completeOpenAICodexDeviceAuth, computeNextOccurrence, createBashTool, createConversationStore, createConversationStoreFromEngine, createDefaultTools, createDeleteDirectoryTool, createDeleteTool, createEditTool, createMemoryStore, createMemoryStoreFromEngine, createMemoryTools, createModelProvider, createReminderStore, createReminderStoreFromEngine, createReminderTools, createSearchTools, createSecretsStore, createSkillTools, createStateStore, createStorageEngine, createSubagentTools, createTodoStoreFromEngine, createTurnDraftState, createUploadStore, createWriteTool, decodeFileInputData, defaultAgentDefinition, deleteOpenAICodexSession, deriveUploadKey, ensureAgentIdentity, estimateTokens, estimateTotalTokens, executeConversationTurn, findSafeSplitPoint, flushTurnDraft, generateAgentId, getAgentStoreDirectory, getModelContextWindow, getOpenAICodexAccessToken, getOpenAICodexAuthFilePath, getOpenAICodexRequiredScopes, getPonchoStoreRoot, isMessageArray, 
jsonSchemaToZod, loadCanonicalHistory, loadPonchoConfig, loadRunHistory, loadSkillContext, loadSkillInstructions, loadSkillMetadata, loadVfsSkillMetadata, mergeSkills, normalizeApprovalCheckpoint, normalizeOtlp, normalizeScriptPolicyPath, normalizeToolAccess, parseAgentFile, parseAgentMarkdown, parseSkillFrontmatter, ponchoDocsTool, readOpenAICodexSession, readSkillResource, recordStandardTurnEvent, renderAgentPrompt, resolveAgentIdentity, resolveCompactionConfig, resolveEnv, resolveMemoryConfig, resolveRunRequest, resolveSkillDirs, resolveStateConfig, runConversationTurn, slugifyStorageComponent, startOpenAICodexDeviceAuth, verifyTenantToken, withToolResultArchiveParam, writeOpenAICodexSession };
2102
+ export { type ActiveConversationRun, type ActiveSubagentRun, type AgentFrontmatter, AgentHarness, type AgentIdentity, type AgentLimitsConfig, type AgentModelConfig, AgentOrchestrator, type ApprovalEventItem, type ArchivedToolResult$1 as ArchivedToolResult, type BashConfig, BashEnvironmentManager, type BashExecutionLimits, type BuiltInToolToggles, CALLBACK_LOCK_STALE_MS, type CompactMessagesOptions, type CompactResult, type CompactionConfig, type ContinuationHooks, type Conversation, type ConversationCreateInit, type ConversationState, type ConversationStatusSnapshot, type ConversationStore, type ConversationSummary, type CreateSkillToolsOptions, type CronJobConfig, DEFAULT_AGENT_DESCRIPTION, DEFAULT_AGENT_NAME, DEFAULT_MAX_STEPS, DEFAULT_MODEL_NAME, DEFAULT_MODEL_PROVIDER, DEFAULT_TEMPERATURE, DEFAULT_TIMEOUT, type DefaultAgentDefinitionOptions, type EventSink, type ExecuteTurnResult, type HarnessOptions, type HarnessRunOutput, type HistorySource, InMemoryConversationStore, InMemoryEngine, InMemoryStateStore, type IsolateBinding, type IsolateConfig, LocalMcpBridge, LocalUploadStore, MAX_CONCURRENT_SUBAGENTS, MAX_CONTINUATION_COUNT, MAX_SUBAGENT_CALLBACK_COUNT, MAX_SUBAGENT_NESTING, type MainMemory, type McpConfig, type MemoryConfig, type MemoryStore, type MessagingChannelConfig, type ModelProviderFactory, type NetworkConfig, OPENAI_CODEX_CLIENT_ID, type OpenAICodexAuthConfig, type OpenAICodexDeviceAuthRequest, type OpenAICodexSession, type OrchestratorHooks, type OrchestratorOptions, type OtlpConfig, type OtlpOption, PONCHO_UPLOAD_SCHEME, type ParsedAgent, type PendingSubagentApproval, type PendingSubagentResult, type PendingToolCall, type PonchoConfig, PonchoFsAdapter, PostgresEngine, type ProviderConfig, type Recurrence, type RecurrenceType, type Reminder, type ReminderCreateInput, type ReminderStatus, type ReminderStore, type RemoteMcpServerConfig, type RunConversationTurnOpts, type RunConversationTurnResult, type RunOutcome, type RunRequest, type 
RuntimeRenderContext, S3UploadStore, STALE_SUBAGENT_THRESHOLD_MS, STORAGE_SCHEMA_VERSION, type SecretsStore, type SkillContextEntry, type SkillMetadata, type SkillSource, SqliteEngine, type StateConfig, type StateProviderName, type StateStore, type StorageConfig, type StorageEngine, type StorageFactoryOptions, type StorageProvider, type StoredApproval, type SubagentManager, type SubagentResult, type SubagentSpawnResult, type SubagentSummary, type SubagentTranscript, type SubagentTranscriptMode, TOOL_RESULT_ARCHIVE_PARAM, type TelemetryConfig, TelemetryEmitter, type TenantTokenPayload, type ToolAccess, type ToolCall, ToolDispatcher, type ToolExecutionResult, type TurnDraftState, type TurnResultMetadata, type TurnSection, type UploadStore, type UploadsConfig, VFS_SCHEME, VercelBlobUploadStore, type VfsDirEntry, type VfsStat, type VirtualMount, applyTurnMetadata, buildAgentDirectoryName, buildApprovalCheckpoints, buildAssistantMetadata, buildSkillContextWindow, buildToolCompletedText, cloneSections, compactMessages, completeOpenAICodexDeviceAuth, computeNextOccurrence, createBashTool, createConversationStore, createConversationStoreFromEngine, createDefaultTools, createDeleteDirectoryTool, createDeleteTool, createEditTool, createMemoryStore, createMemoryStoreFromEngine, createMemoryTools, createModelProvider, createReminderStore, createReminderStoreFromEngine, createReminderTools, createSearchTools, createSecretsStore, createSkillTools, createStateStore, createStorageEngine, createSubagentTools, createTodoStoreFromEngine, createTurnDraftState, createUploadStore, createWriteTool, decodeFileInputData, defaultAgentDefinition, deleteOpenAICodexSession, deriveUploadKey, ensureAgentIdentity, estimateTokens, estimateTotalTokens, executeConversationTurn, findSafeSplitPoint, flushTurnDraft, generateAgentId, getAgentStoreDirectory, getModelContextWindow, getOpenAICodexAccessToken, getOpenAICodexAuthFilePath, getOpenAICodexRequiredScopes, getPonchoStoreRoot, isMessageArray, 
jsonSchemaToZod, loadCanonicalHistory, loadPonchoConfig, loadRunHistory, loadSkillContext, loadSkillInstructions, loadSkillMetadata, loadSkillMetadataFromDirs, loadVfsSkillMetadata, mergeSkills, normalizeApprovalCheckpoint, normalizeOtlp, normalizeScriptPolicyPath, normalizeToolAccess, parseAgentFile, parseAgentMarkdown, parseSkillFrontmatter, ponchoDocsTool, readOpenAICodexSession, readSkillResource, recordStandardTurnEvent, renderAgentPrompt, resolveAgentIdentity, resolveCompactionConfig, resolveEnv, resolveMemoryConfig, resolveRunRequest, resolveSkillDirs, resolveStateConfig, runConversationTurn, slugifyStorageComponent, startOpenAICodexDeviceAuth, verifyTenantToken, withToolResultArchiveParam, writeOpenAICodexSession };
package/dist/index.js CHANGED
@@ -4433,7 +4433,28 @@ var PostgresEngine = class extends SqlStorageEngine {
4433
4433
  this.sql = postgres(url, {
4434
4434
  onnotice: () => {
4435
4435
  },
4436
- prepare: false
4436
+ prepare: false,
4437
+ // Connection-pool resilience. Managed Postgres providers
4438
+ // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
4439
+ // connections server-side after a few minutes. Without these
4440
+ // knobs, porsager/postgres keeps stale sockets in the pool;
4441
+ // the next query on one rejects with
4442
+ // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
4443
+ // as a hard failure to the caller. Two complementary settings:
4444
+ //
4445
+ // - `idle_timeout: 20` closes idle connections client-side
4446
+ // after 20s, before any reasonable provider-side timer
4447
+ // fires. Fresh connection on next checkout = no stale
4448
+ // socket race.
4449
+ // - `max_lifetime: 600` (10 min) recycles long-lived
4450
+ // connections defensively even if they've stayed busy,
4451
+ // which sidesteps a separate class of provider-side
4452
+ // "max connection age" limits.
4453
+ //
4454
+ // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
4455
+ // pool size + initial connect behavior unchanged.
4456
+ idle_timeout: 20,
4457
+ max_lifetime: 60 * 10
4437
4458
  });
4438
4459
  }
4439
4460
  async initialize() {
@@ -4477,10 +4498,38 @@ var PostgresEngine = class extends SqlStorageEngine {
4477
4498
  };
4478
4499
  }
4479
4500
  async query(sql, params) {
4480
- if (!params || params.length === 0) {
4481
- return this.sql.unsafe(sql);
4501
+ return this.runWithRetry(
4502
+ () => !params || params.length === 0 ? this.sql.unsafe(sql) : this.sql.unsafe(sql, params)
4503
+ );
4504
+ }
4505
+ /**
4506
+ * Single retry on a transient connection-layer failure. The
4507
+ * `idle_timeout` / `max_lifetime` config above prevents *most*
4508
+ * stale-connection cases, but a query can still race a
4509
+ * provider-initiated drop in flight — the postgres.js client
4510
+ * rejects with `code: "CONNECTION_ENDED"` and the next attempt
4511
+ * checks out a fresh connection from the pool. One retry is
4512
+ * enough; if it fails again the host-side network is genuinely
4513
+ * broken and the caller should see the error.
4514
+ *
4515
+ * Only retries reads + the standard exec/run paths in `query`;
4516
+ * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
4517
+ * `sql.begin(...)` transactions are NOT wrapped in the retry — those are
4518
+ * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
4519
+ * atomically scoped (transactions roll back cleanly), and adding
4520
+ * a retry around them would complicate the transaction
4521
+ * semantics.
4522
+ */
4523
+ async runWithRetry(fn) {
4524
+ try {
4525
+ return await fn();
4526
+ } catch (err) {
4527
+ const code = err?.code;
4528
+ if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
4529
+ return await fn();
4530
+ }
4531
+ throw err;
4482
4532
  }
4483
- return this.sql.unsafe(sql, params);
4484
4533
  }
4485
4534
  addToPathCache(tenantId, path) {
4486
4535
  const paths = this.pathCache.get(tenantId);
@@ -7314,7 +7363,9 @@ var collectSkillManifests = async (directory) => {
7314
7363
  return files;
7315
7364
  };
7316
7365
  var loadSkillMetadata = async (workingDir, extraSkillPaths) => {
7317
- const skillDirs = resolveSkillDirs(workingDir, extraSkillPaths);
7366
+ return loadSkillMetadataFromDirs(resolveSkillDirs(workingDir, extraSkillPaths));
7367
+ };
7368
+ var loadSkillMetadataFromDirs = async (skillDirs) => {
7318
7369
  const allManifests = [];
7319
7370
  for (const dir of skillDirs) {
7320
7371
  try {
@@ -9071,6 +9122,8 @@ var AgentHarness = class _AgentHarness {
9071
9122
  loadedConfig;
9072
9123
  injectedConfig;
9073
9124
  loadedSkills = [];
9125
+ systemSkills = [];
9126
+ systemSkillPaths = [];
9074
9127
  skillFingerprint = "";
9075
9128
  lastSkillRefreshAt = 0;
9076
9129
  activeSkillNames = /* @__PURE__ */ new Set();
@@ -9276,6 +9329,7 @@ var AgentHarness = class _AgentHarness {
9276
9329
  this.injectedStorageEngine = true;
9277
9330
  }
9278
9331
  this.virtualMounts = options.virtualMounts ?? [];
9332
+ this.systemSkillPaths = options.systemSkillPaths ?? [];
9279
9333
  if (options.toolDefinitions?.length) {
9280
9334
  this.dispatcher.registerMany(options.toolDefinitions);
9281
9335
  }
@@ -9437,14 +9491,19 @@ var AgentHarness = class _AgentHarness {
9437
9491
  return [...this.activeSkillNames].sort();
9438
9492
  }
9439
9493
  /**
9440
- * Resolve the skill set visible to a given tenant: repo skills plus that
9441
- * tenant's VFS skills, with repo winning on name collision. Cached per
9442
- * tenant; cache invalidates on VFS writes under /skills/ via
9443
- * invalidateSkillsForTenant.
9494
+ * Resolve the skill set visible to a given tenant. Three tiers, by
9495
+ * precedence: repo skills > the tenant's own VFS skills > platform
9496
+ * system skills. So a repo skill wins over a same-named VFS skill
9497
+ * (unchanged), and a tenant's `/skills/<name>/` overrides a same-named
9498
+ * system skill (the deploy-shipped default). Cached per tenant; cache
9499
+ * invalidates on VFS writes under /skills/ via invalidateSkillsForTenant.
9500
+ * System skills are static within a process, so they don't participate
9501
+ * in the fingerprint — but a VFS override does (it changes a /skills
9502
+ * path), which recomputes the cache and lets the override take effect.
9444
9503
  */
9445
9504
  async getSkillsForTenant(tenantId) {
9446
9505
  if (!this.storageEngine) {
9447
- return this.loadedSkills;
9506
+ return mergeSkills(this.loadedSkills, this.systemSkills);
9448
9507
  }
9449
9508
  const effectiveTenant = tenantId || "__default__";
9450
9509
  const engineWithRefresh = this.storageEngine;
@@ -9457,7 +9516,7 @@ var AgentHarness = class _AgentHarness {
9457
9516
  return cached.skills;
9458
9517
  }
9459
9518
  const vfsSkills = await loadVfsSkillMetadata(this.storageEngine, effectiveTenant);
9460
- const merged = mergeSkills(this.loadedSkills, vfsSkills, (skipped) => {
9519
+ const repoAndVfs = mergeSkills(this.loadedSkills, vfsSkills, (skipped) => {
9461
9520
  const key = `${effectiveTenant}:${skipped.name}`;
9462
9521
  if (this.vfsSkillCollisionWarnings.has(key)) return;
9463
9522
  this.vfsSkillCollisionWarnings.add(key);
@@ -9465,6 +9524,7 @@ var AgentHarness = class _AgentHarness {
9465
9524
  `VFS skill "${skipped.name}" for tenant ${effectiveTenant} ignored: a repo skill with the same name takes precedence.`
9466
9525
  );
9467
9526
  });
9527
+ const merged = mergeSkills(repoAndVfs, this.systemSkills);
9468
9528
  this.skillCache.set(effectiveTenant, { skills: merged, fingerprint });
9469
9529
  return merged;
9470
9530
  }
@@ -9795,6 +9855,7 @@ var AgentHarness = class _AgentHarness {
9795
9855
  const extraSkillPaths = config?.skillPaths;
9796
9856
  const skillMetadata = await loadSkillMetadata(this.workingDir, extraSkillPaths);
9797
9857
  this.loadedSkills = skillMetadata;
9858
+ this.systemSkills = this.systemSkillPaths.length ? await loadSkillMetadataFromDirs(this.systemSkillPaths) : [];
9798
9859
  this.skillFingerprint = this.buildSkillFingerprint(skillMetadata);
9799
9860
  this.registerSkillTools();
9800
9861
  const agentId = this.parsedAgent.frontmatter.id ?? this.parsedAgent.frontmatter.name;
@@ -13931,6 +13992,7 @@ export {
13931
13992
  loadSkillContext,
13932
13993
  loadSkillInstructions,
13933
13994
  loadSkillMetadata,
13995
+ loadSkillMetadataFromDirs,
13934
13996
  loadVfsSkillMetadata,
13935
13997
  mergeSkills,
13936
13998
  normalizeApprovalCheckpoint,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@poncho-ai/harness",
3
- "version": "0.47.0",
3
+ "version": "0.48.0",
4
4
  "description": "Agent execution runtime - conversation loop, tool dispatch, streaming",
5
5
  "repository": {
6
6
  "type": "git",
package/src/harness.ts CHANGED
@@ -55,6 +55,7 @@ import { createModelProvider, getModelContextWindow, type ModelProviderFactory,
55
55
  import {
56
56
  buildSkillContextWindow,
57
57
  loadSkillMetadata,
58
+ loadSkillMetadataFromDirs,
58
59
  loadVfsSkillMetadata,
59
60
  mergeSkills,
60
61
  } from "./skill-context.js";
@@ -134,6 +135,17 @@ export interface HarnessOptions {
134
135
  * Empty by default — no system mounts in the CLI / dev workflow.
135
136
  */
136
137
  virtualMounts?: VirtualMount[];
138
+ /**
139
+ * Absolute directories of platform-shipped "system" skills. Each is
140
+ * scanned for `<name>/SKILL.md` at init; the bodies live on local disk
141
+ * and ship with the deploy. System skills are surfaced in
142
+ * `<available_skills>` like any other skill, but sit at the LOWEST
143
+ * precedence: a tenant's own `/skills/<same-name>/` (and a repo skill)
144
+ * overrides a system skill of the same name. Pair with a read-only
145
+ * `virtualMounts` entry (e.g. "/system/skills/") if the same files
146
+ * should also be browsable in the VFS. Empty by default.
147
+ */
148
+ systemSkillPaths?: string[];
137
149
  }
138
150
 
139
151
  export interface HarnessRunOutput {
@@ -839,6 +851,8 @@ export class AgentHarness {
839
851
  private loadedConfig?: PonchoConfig;
840
852
  private readonly injectedConfig?: PonchoConfig;
841
853
  private loadedSkills: SkillMetadata[] = [];
854
+ private systemSkills: SkillMetadata[] = [];
855
+ private readonly systemSkillPaths: string[] = [];
842
856
  private skillFingerprint = "";
843
857
  private lastSkillRefreshAt = 0;
844
858
  private readonly activeSkillNames = new Set<string>();
@@ -1077,6 +1091,7 @@ export class AgentHarness {
1077
1091
  this.injectedStorageEngine = true;
1078
1092
  }
1079
1093
  this.virtualMounts = options.virtualMounts ?? [];
1094
+ this.systemSkillPaths = options.systemSkillPaths ?? [];
1080
1095
 
1081
1096
  if (options.toolDefinitions?.length) {
1082
1097
  this.dispatcher.registerMany(options.toolDefinitions);
@@ -1271,14 +1286,19 @@ export class AgentHarness {
1271
1286
  }
1272
1287
 
1273
1288
  /**
1274
- * Resolve the skill set visible to a given tenant: repo skills plus that
1275
- * tenant's VFS skills, with repo winning on name collision. Cached per
1276
- * tenant; cache invalidates on VFS writes under /skills/ via
1277
- * invalidateSkillsForTenant.
1289
+ * Resolve the skill set visible to a given tenant. Three tiers, by
1290
+ * precedence: repo skills > the tenant's own VFS skills > platform
1291
+ * system skills. So a repo skill wins over a same-named VFS skill
1292
+ * (unchanged), and a tenant's `/skills/<name>/` overrides a same-named
1293
+ * system skill (the deploy-shipped default). Cached per tenant; cache
1294
+ * invalidates on VFS writes under /skills/ via invalidateSkillsForTenant.
1295
+ * System skills are static within a process, so they don't participate
1296
+ * in the fingerprint — but a VFS override does (it changes a /skills
1297
+ * path), which recomputes the cache and lets the override take effect.
1278
1298
  */
1279
1299
  private async getSkillsForTenant(tenantId: string | undefined | null): Promise<SkillMetadata[]> {
1280
1300
  if (!this.storageEngine) {
1281
- return this.loadedSkills;
1301
+ return mergeSkills(this.loadedSkills, this.systemSkills);
1282
1302
  }
1283
1303
  // Mirror the rest of the harness: undefined tenantId falls back to
1284
1304
  // "__default__" so dev-mode (no auth) conversations see the same VFS
@@ -1305,7 +1325,7 @@ export class AgentHarness {
1305
1325
  return cached.skills;
1306
1326
  }
1307
1327
  const vfsSkills = await loadVfsSkillMetadata(this.storageEngine, effectiveTenant);
1308
- const merged = mergeSkills(this.loadedSkills, vfsSkills, (skipped) => {
1328
+ const repoAndVfs = mergeSkills(this.loadedSkills, vfsSkills, (skipped) => {
1309
1329
  const key = `${effectiveTenant}:${skipped.name}`;
1310
1330
  if (this.vfsSkillCollisionWarnings.has(key)) return;
1311
1331
  this.vfsSkillCollisionWarnings.add(key);
@@ -1313,6 +1333,11 @@ export class AgentHarness {
1313
1333
  `VFS skill "${skipped.name}" for tenant ${effectiveTenant} ignored: a repo skill with the same name takes precedence.`,
1314
1334
  );
1315
1335
  });
1336
+ // System skills sit at the bottom: a repo or VFS skill of the same
1337
+ // name overrides them. Overriding a system default is the intended
1338
+ // user workflow (mirrors /jobs system-default overrides), so the
1339
+ // collision is silent — not a warning.
1340
+ const merged = mergeSkills(repoAndVfs, this.systemSkills);
1316
1341
  this.skillCache.set(effectiveTenant, { skills: merged, fingerprint });
1317
1342
  return merged;
1318
1343
  }
@@ -1706,6 +1731,13 @@ export class AgentHarness {
1706
1731
  const extraSkillPaths = config?.skillPaths;
1707
1732
  const skillMetadata = await loadSkillMetadata(this.workingDir, extraSkillPaths);
1708
1733
  this.loadedSkills = skillMetadata;
1734
+ // Platform-shipped system skills, scanned from absolute dirs on disk.
1735
+ // Loaded once at init (they ship with the deploy and don't change
1736
+ // within a process). Merged at LOWEST precedence per tenant — see
1737
+ // getSkillsForTenant.
1738
+ this.systemSkills = this.systemSkillPaths.length
1739
+ ? await loadSkillMetadataFromDirs(this.systemSkillPaths)
1740
+ : [];
1709
1741
  this.skillFingerprint = this.buildSkillFingerprint(skillMetadata);
1710
1742
  this.registerSkillTools();
1711
1743
  const agentId = this.parsedAgent.frontmatter.id ?? this.parsedAgent.frontmatter.name;
@@ -209,7 +209,17 @@ export const loadSkillMetadata = async (
209
209
  workingDir: string,
210
210
  extraSkillPaths?: string[],
211
211
  ): Promise<SkillMetadata[]> => {
212
- const skillDirs = resolveSkillDirs(workingDir, extraSkillPaths);
212
+ return loadSkillMetadataFromDirs(resolveSkillDirs(workingDir, extraSkillPaths));
213
+ };
214
+
215
+ // Scan an explicit list of absolute directories for `<name>/SKILL.md`
216
+ // manifests and return their metadata as `source: "repo"` skills (body
217
+ // read from disk on activation). Used both by `loadSkillMetadata` (after
218
+ // resolving repo skill dirs against the working dir) and directly for
219
+ // platform-shipped "system" skills whose source dirs are already absolute.
220
+ export const loadSkillMetadataFromDirs = async (
221
+ skillDirs: string[],
222
+ ): Promise<SkillMetadata[]> => {
213
223
  const allManifests: string[] = [];
214
224
 
215
225
  for (const dir of skillDirs) {
@@ -57,6 +57,27 @@ export class PostgresEngine extends SqlStorageEngine {
57
57
  this.sql = postgres(url, {
58
58
  onnotice: () => {},
59
59
  prepare: false,
60
+ // Connection-pool resilience. Managed Postgres providers
61
+ // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
62
+ // connections server-side after a few minutes. Without these
63
+ // knobs, porsager/postgres keeps stale sockets in the pool;
64
+ // the next query on one rejects with
65
+ // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
66
+ // as a hard failure to the caller. Two complementary settings:
67
+ //
68
+ // - `idle_timeout: 20` closes idle connections client-side
69
+ // after 20s, before any reasonable provider-side timer
70
+ // fires. Fresh connection on next checkout = no stale
71
+ // socket race.
72
+ // - `max_lifetime: 600` (10 min) recycles long-lived
73
+ // connections defensively even if they've stayed busy,
74
+ // which sidesteps a separate class of provider-side
75
+ // "max connection age" limits.
76
+ //
77
+ // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
78
+ // pool size + initial connect behavior unchanged.
79
+ idle_timeout: 20,
80
+ max_lifetime: 60 * 10,
60
81
  });
61
82
  }
62
83
 
@@ -118,10 +139,41 @@ export class PostgresEngine extends SqlStorageEngine {
118
139
  }
119
140
 
120
141
  private async query(sql: string, params?: unknown[]): Promise<any[]> {
121
- if (!params || params.length === 0) {
122
- return this.sql.unsafe(sql);
142
+ return this.runWithRetry(() =>
143
+ !params || params.length === 0
144
+ ? this.sql.unsafe(sql)
145
+ : this.sql.unsafe(sql, params),
146
+ );
147
+ }
148
+
149
+ /**
150
+ * Single retry on a transient connection-layer failure. The
151
+ * `idle_timeout` / `max_lifetime` config above prevents *most*
152
+ * stale-connection cases, but a query can still race a
153
+ * provider-initiated drop in flight — the postgres.js client
154
+ * rejects with `code: "CONNECTION_ENDED"` and the next attempt
155
+ * checks out a fresh connection from the pool. One retry is
156
+ * enough; if it fails again the host-side network is genuinely
157
+ * broken and the caller should see the error.
158
+ *
159
+ * Only retries reads + the standard exec/run paths in `query`;
160
+ * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
161
+ * `sql.begin(...)` transactions are unwrapped — those are
162
+ * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
163
+ * atomically scoped (transactions roll back cleanly), and adding
164
+ * a retry around them would complicate the transaction
165
+ * semantics.
166
+ */
167
+ private async runWithRetry<T>(fn: () => Promise<T>): Promise<T> {
168
+ try {
169
+ return await fn();
170
+ } catch (err) {
171
+ const code = (err as { code?: string } | null | undefined)?.code;
172
+ if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
173
+ return await fn();
174
+ }
175
+ throw err;
123
176
  }
124
- return this.sql.unsafe(sql, params);
125
177
  }
126
178
 
127
179
  private addToPathCache(tenantId: string, path: string): void {