robot_lab 0.0.9 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +53 -0
- data/README.md +210 -1
- data/Rakefile +2 -1
- data/docs/api/core/result.md +123 -0
- data/docs/api/core/robot.md +182 -0
- data/docs/api/errors.md +185 -0
- data/docs/guides/building-robots.md +125 -0
- data/docs/guides/creating-networks.md +21 -0
- data/docs/guides/index.md +10 -0
- data/docs/guides/knowledge.md +182 -0
- data/docs/guides/mcp-integration.md +106 -0
- data/docs/guides/memory.md +2 -0
- data/docs/guides/observability.md +486 -0
- data/docs/guides/ractor-parallelism.md +364 -0
- data/docs/superpowers/plans/2026-04-14-ractor-integration.md +1538 -0
- data/docs/superpowers/specs/2026-04-14-ractor-integration-design.md +258 -0
- data/examples/19_token_tracking.rb +128 -0
- data/examples/20_circuit_breaker.rb +153 -0
- data/examples/21_learning_loop.rb +164 -0
- data/examples/22_context_compression.rb +179 -0
- data/examples/23_convergence.rb +137 -0
- data/examples/24_structured_delegation.rb +150 -0
- data/examples/25_history_search/conversation.jsonl +30 -0
- data/examples/25_history_search.rb +136 -0
- data/examples/26_document_store/api_versioning_adr.md +52 -0
- data/examples/26_document_store/incident_postmortem.md +46 -0
- data/examples/26_document_store/postgres_runbook.md +49 -0
- data/examples/26_document_store/redis_caching_guide.md +48 -0
- data/examples/26_document_store/sidekiq_guide.md +51 -0
- data/examples/26_document_store.rb +147 -0
- data/examples/27_incident_response/incident_response.rb +244 -0
- data/examples/28_mcp_discovery.rb +112 -0
- data/examples/29_ractor_tools.rb +243 -0
- data/examples/30_ractor_network.rb +256 -0
- data/examples/README.md +136 -0
- data/examples/prompts/skill_with_mcp_test.md +9 -0
- data/examples/prompts/skill_with_robot_name_test.md +5 -0
- data/examples/prompts/skill_with_tools_test.md +6 -0
- data/lib/robot_lab/bus_poller.rb +149 -0
- data/lib/robot_lab/convergence.rb +69 -0
- data/lib/robot_lab/delegation_future.rb +93 -0
- data/lib/robot_lab/document_store.rb +155 -0
- data/lib/robot_lab/error.rb +25 -0
- data/lib/robot_lab/history_compressor.rb +205 -0
- data/lib/robot_lab/mcp/client.rb +17 -5
- data/lib/robot_lab/mcp/connection_poller.rb +187 -0
- data/lib/robot_lab/mcp/server.rb +7 -2
- data/lib/robot_lab/mcp/server_discovery.rb +110 -0
- data/lib/robot_lab/mcp/transports/stdio.rb +6 -0
- data/lib/robot_lab/memory.rb +103 -6
- data/lib/robot_lab/network.rb +44 -9
- data/lib/robot_lab/ractor_boundary.rb +42 -0
- data/lib/robot_lab/ractor_job.rb +37 -0
- data/lib/robot_lab/ractor_memory_proxy.rb +85 -0
- data/lib/robot_lab/ractor_network_scheduler.rb +154 -0
- data/lib/robot_lab/ractor_worker_pool.rb +117 -0
- data/lib/robot_lab/robot/bus_messaging.rb +43 -65
- data/lib/robot_lab/robot/history_search.rb +69 -0
- data/lib/robot_lab/robot.rb +228 -11
- data/lib/robot_lab/robot_result.rb +24 -5
- data/lib/robot_lab/run_config.rb +1 -1
- data/lib/robot_lab/text_analysis.rb +103 -0
- data/lib/robot_lab/tool.rb +42 -3
- data/lib/robot_lab/tool_config.rb +1 -1
- data/lib/robot_lab/version.rb +1 -1
- data/lib/robot_lab/waiter.rb +49 -29
- data/lib/robot_lab.rb +25 -0
- data/mkdocs.yml +1 -0
- metadata +72 -2
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Example 25: Chat History Search
|
|
5
|
+
#
|
|
6
|
+
# Demonstrates robot.search_history(query, limit:) — semantic search over a
|
|
7
|
+
# robot's accumulated conversation turns using stemmed term-frequency cosine
|
|
8
|
+
# similarity (classifier gem).
|
|
9
|
+
#
|
|
10
|
+
# The conversation fixture (30 turns across 5 topics) lives in:
|
|
11
|
+
# examples/25_history_search/conversation.jsonl
|
|
12
|
+
#
|
|
13
|
+
# Usage:
|
|
14
|
+
# ruby examples/25_history_search.rb
|
|
15
|
+
|
|
16
|
+
ENV["ROBOT_LAB_TEMPLATE_PATH"] ||= File.join(__dir__, "prompts")
|
|
17
|
+
|
|
18
|
+
require "json"
|
|
19
|
+
require_relative "../lib/robot_lab"
|
|
20
|
+
|
|
21
|
+
# Conversation fixture: one JSON object per line (JSONL), parsed with symbol
# keys. Blank or whitespace-only lines are skipped so that a spacer line in
# the fixture does not abort the example with JSON::ParserError.
CONVERSATION_TURNS = File.readlines(
  File.join(__dir__, "25_history_search", "conversation.jsonl"), chomp: true
).reject { |line| line.strip.empty? }
 .map { |line| JSON.parse(line, symbolize_names: true) }
 .freeze
|
|
24
|
+
|
|
25
|
+
puts "=" * 60
|
|
26
|
+
puts "Example 25: Chat History Search"
|
|
27
|
+
puts "=" * 60
|
|
28
|
+
puts
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Minimal message stub — populates history without LLM calls
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
FakeMsg = Struct.new(:role, :content, :tool_calls)
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# Build a robot and inject the conversation fixture
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
robot = RobotLab.build(name: "tech_lead", system_prompt: "You are a senior engineering advisor.")
|
|
39
|
+
|
|
40
|
+
messages = CONVERSATION_TURNS.map { |t| FakeMsg.new(t[:role], t[:content], nil) }
|
|
41
|
+
robot.instance_variable_get(:@chat).instance_variable_set(:@messages, messages)
|
|
42
|
+
|
|
43
|
+
total_words = messages.sum { |m| m.content.to_s.split.size }
|
|
44
|
+
puts "Conversation loaded: #{messages.size} messages, ~#{total_words} words"
|
|
45
|
+
puts "Topics: database migration, API performance, deployment pipeline, background jobs, onboarding"
|
|
46
|
+
puts
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Helper: print search results
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
# Prints one two-line entry per search hit: a header with the role, the score
# (three decimals) and the message index, followed by a preview of the text.
#
# Previews are capped at 100 characters *including* the trailing "...".
# Text of 100 characters or fewer is printed unchanged. A nil text is
# printed as an empty preview instead of raising.
#
# @param results [Enumerable] hits responding to #role, #text, #score, #index
# @return [Enumerable] the collection that was passed in
def show_results(results)
  results.each do |r|
    text = r.text.to_s
    # 97 kept characters + 3 for the ellipsis = 100 total.
    preview = text.length > 100 ? "#{text[0, 97]}..." : text
    puts " [#{r.role}] score=#{format("%.3f", r.score)} idx=#{r.index}"
    puts " #{preview}"
  end
end
|
|
58
|
+
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
# Searches across four distinct topics
|
|
61
|
+
#
|
|
62
|
+
# Note on TF cosine artifacts (expected behavior, not bugs):
|
|
63
|
+
#
|
|
64
|
+
# 'deploy rollback production incident': the rollback playbook (idx=17) ranks
|
|
65
|
+
# 3rd rather than 1st. The short question "Should we do blue-green deploys?"
|
|
66
|
+
# (idx=18) beats it because "deploy" appears there and TF vectors over-weight
|
|
67
|
+
# single high-frequency query terms in short messages.
|
|
68
|
+
#
|
|
69
|
+
# 'caching Redis invalidation TTL': the Docker GHA cache step (idx=15) is a
|
|
70
|
+
# false positive at rank 3 — "cache" appears in that message. The Redis hits
|
|
71
|
+
# at ranks 1 and 2 are correct.
|
|
72
|
+
#
|
|
73
|
+
# These are genuine limitations of keyword-based cosine similarity. The results
|
|
74
|
+
# shown here are authentic, not cherry-picked.
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
{
|
|
77
|
+
"database migration schema change postgres" => 3,
|
|
78
|
+
"slow API endpoint N+1 query performance" => 3,
|
|
79
|
+
"deploy rollback production incident" => 3,
|
|
80
|
+
"Sidekiq retry Stripe failed jobs dead queue" => 3,
|
|
81
|
+
"caching Redis invalidation TTL" => 3,
|
|
82
|
+
}.each do |query, limit|
|
|
83
|
+
puts "── Search: '#{query}'"
|
|
84
|
+
show_results robot.search_history(query, limit: limit)
|
|
85
|
+
puts
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
# RAG pattern — retrieve the most relevant turns, then inject as context
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# RAG demo: retrieve the turns most relevant to a question and report how much
# smaller the retrieved context is than the full history. No LLM call is made;
# the final lines only print what the call would look like.
puts "── RAG pattern: retrieve context, then call LLM ────────────────"
puts "(showing retrieved context — no actual LLM call)"
puts

rag_query = "Sidekiq jobs exhausting retries during Stripe outage"
rag_hits = robot.search_history(rag_query, limit: 3)
rag_ctx = rag_hits.map(&:text).join("\n\n")
rag_words = rag_ctx.split.size # computed once, reported twice below

puts "Query: \"#{rag_query}\""
puts "Retrieved #{rag_hits.size} turn(s) — #{rag_words} words"
puts "Scores: #{rag_hits.map { |h| format("%.3f", h.score) }.join(" ")}"
puts "Token savings vs. full history: ~#{total_words} words → #{rag_words} words"
puts

# Guard against an empty result set: calling #text on nil would abort the
# example with NoMethodError before the usage notes are printed.
top_hit = rag_hits.first
if top_hit
  puts "Top retrieved turn:"
  puts " \"#{top_hit.text[0..110]}...\""
else
  puts "Top retrieved turn: (none — search returned no hits)"
end
puts
puts "LLM call would be:"
# Single-quoted on purpose: the interpolation markers are shown literally.
puts ' robot.run("Prior context:\n#{context}\n\nQuestion: #{rag_query}")'
puts
|
|
110
|
+
|
|
111
|
+
# ---------------------------------------------------------------------------
|
|
112
|
+
# When to use search_history
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
puts "=" * 60
|
|
115
|
+
puts "When to use search_history"
|
|
116
|
+
puts "=" * 60
|
|
117
|
+
puts <<~'TEXT'
|
|
118
|
+
|
|
119
|
+
Without search_history:
|
|
120
|
+
robot.run(question)
|
|
121
|
+
— full accumulated history sent to the LLM on every call
|
|
122
|
+
— costs grow linearly with conversation length
|
|
123
|
+
|
|
124
|
+
With search_history:
|
|
125
|
+
hits = robot.search_history(question, limit: 3)
|
|
126
|
+
context = hits.map(&:text).join("\n\n")
|
|
127
|
+
robot.run("Prior context:\n#{context}\n\nQuestion: #{question}")
|
|
128
|
+
— only the N most relevant turns are injected
|
|
129
|
+
— token cost stays flat regardless of history length
|
|
130
|
+
— pairs well with compress_history
|
|
131
|
+
|
|
132
|
+
Optional dependency: gem "classifier", "~> 2.3"
|
|
133
|
+
|
|
134
|
+
TEXT
|
|
135
|
+
|
|
136
|
+
puts "Done."
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Architecture Decision Record #047 — API Versioning Strategy
|
|
2
|
+
|
|
3
|
+
**Status:** Accepted (2024-11-12)
|
|
4
|
+
**Deciders:** Platform team, Mobile team, Partner integrations team
|
|
5
|
+
|
|
6
|
+
## Context
|
|
7
|
+
|
|
8
|
+
The v1 API has accumulated 23 breaking changes held back by an informal freeze
|
|
9
|
+
while three external partners built integrations. The mobile apps ship on a
|
|
10
|
+
4-week release cycle and cannot deploy hotfixes to force users to upgrade. We
|
|
11
|
+
need a versioning strategy that allows the backend to evolve without coordinated
|
|
12
|
+
lockstep releases across all consumers.
|
|
13
|
+
|
|
14
|
+
## Decision
|
|
15
|
+
|
|
16
|
+
We adopt URI-based versioning (/api/v2/, /api/v3/) rather than header-based
|
|
17
|
+
(Accept: application/vnd.company.v2+json) for the following reasons:
|
|
18
|
+
|
|
19
|
+
- URI versioning is visible in logs, dashboards, and browser dev tools.
|
|
20
|
+
- Proxy and CDN rules can target specific version prefixes.
|
|
21
|
+
- Internal clients are all first-party and can be updated in lockstep.
|
|
22
|
+
|
|
23
|
+
Header-based versioning is reserved for minor non-breaking variants (e.g.,
|
|
24
|
+
adding optional fields) using the Prefer header.
|
|
25
|
+
|
|
26
|
+
## Support Lifecycle
|
|
27
|
+
|
|
28
|
+
Each major version is supported for 18 months from GA. Deprecation notices are
|
|
29
|
+
added to response headers (Sunset: date) 6 months before EOL. The deprecation
|
|
30
|
+
dashboard tracks call volume per version per consumer; we do not retire a
|
|
31
|
+
version with > 100 calls/day without direct partner outreach.
|
|
32
|
+
|
|
33
|
+
## Backwards Compatibility Rules
|
|
34
|
+
|
|
35
|
+
Within a version, we **may**:
|
|
36
|
+
- Add new fields to responses.
|
|
37
|
+
- Add new optional request parameters.
|
|
38
|
+
- Add new endpoints.
|
|
39
|
+
- Add new enum values (consumers must ignore unknown values).
|
|
40
|
+
|
|
41
|
+
We **must not**:
|
|
42
|
+
- Remove or rename fields.
|
|
43
|
+
- Change field types.
|
|
44
|
+
- Change HTTP status codes for existing success cases.
|
|
45
|
+
- Remove endpoints.
|
|
46
|
+
|
|
47
|
+
## Migration Tooling
|
|
48
|
+
|
|
49
|
+
A version compatibility shim layer translates v1 requests to v2 internal
|
|
50
|
+
representations and back-translates responses. This allows v1 to remain
|
|
51
|
+
operational without duplicating business logic. The shim is tested with a
|
|
52
|
+
contract test suite against recorded v1 response fixtures.
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Incident Postmortem — INC-2024-089
|
|
2
|
+
|
|
3
|
+
**Date:** 2024-10-03
|
|
4
|
+
**Duration:** 47 minutes
|
|
5
|
+
**Severity:** P1
|
|
6
|
+
**Affected:** API gateway, order processing, checkout flows
|
|
7
|
+
|
|
8
|
+
## Timeline
|
|
9
|
+
|
|
10
|
+
| Time | Event |
|
|
11
|
+
|-------|-------|
|
|
12
|
+
| 14:23 | Automated alert fires: p99 API latency exceeds 5 seconds |
|
|
13
|
+
| 14:25 | On-call engineer pages in; confirms checkout error rate at 34% |
|
|
14
|
+
| 14:31 | Identified spike in slow queries on orders table in Datadog APM |
|
|
15
|
+
| 14:38 | Root cause confirmed: migration added non-concurrent index at peak traffic |
|
|
16
|
+
| 14:44 | DBA kills the migration process; index creation aborted |
|
|
17
|
+
| 14:48 | Query latency returns to baseline; error rate drops to 0.2% |
|
|
18
|
+
| 15:10 | Full recovery confirmed; incident closed |
|
|
19
|
+
|
|
20
|
+
## Root Cause
|
|
21
|
+
|
|
22
|
+
An engineer ran a schema migration that created an index on orders.status
|
|
23
|
+
without the CONCURRENTLY keyword. Postgres acquired an AccessExclusiveLock on
|
|
24
|
+
the orders table for the duration of the index build (11 minutes). All queries
|
|
25
|
+
touching the orders table queued behind the lock, exhausting the PgBouncer
|
|
26
|
+
connection pool within 3 minutes.
|
|
27
|
+
|
|
28
|
+
## Contributing Factors
|
|
29
|
+
|
|
30
|
+
1. Migration review checklist did not include "concurrent index" verification.
|
|
31
|
+
2. The migration was run manually during business hours, not via the deploy pipeline.
|
|
32
|
+
3. No automated linting (strong_migrations) was enforced in CI.
|
|
33
|
+
|
|
34
|
+
## Remediation (Completed)
|
|
35
|
+
|
|
36
|
+
- `strong_migrations` gem added to Gemfile; CI fails on unsafe migration patterns.
|
|
37
|
+
- Runbook updated: all migrations that touch tables > 1M rows require DBA review.
|
|
38
|
+
- Index creation added to the concurrent-operations checklist.
|
|
39
|
+
- PgBouncer max_client_conn increased from 150 to 300 as a buffer.
|
|
40
|
+
|
|
41
|
+
## Lessons Learned
|
|
42
|
+
|
|
43
|
+
Lock acquisition during index creation is silent in application logs — the first
|
|
44
|
+
visible symptom is connection pool exhaustion, not a database error.
|
|
45
|
+
Instrumenting pg_locks with an alert on long-held AccessExclusiveLocks would
|
|
46
|
+
have cut detection time from 8 minutes to under 1 minute.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# PostgreSQL Operations Runbook — v3.1
|
|
2
|
+
|
|
3
|
+
## Slow Query Investigation
|
|
4
|
+
|
|
5
|
+
When a query exceeds 1 second, start with pg_stat_statements:
|
|
6
|
+
|
|
7
|
+
SELECT query, mean_exec_time, calls, total_exec_time
|
|
8
|
+
FROM pg_stat_statements ORDER BY mean_exec_time DESC LIMIT 20;
|
|
9
|
+
|
|
10
|
+
Use EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT) on the top offenders.
|
|
11
|
+
Look for Sequential Scans on large tables (> 50k rows) and Hash Joins on
|
|
12
|
+
unindexed foreign keys. Missing index candidates appear as "rows removed by
|
|
13
|
+
filter" values that are an order of magnitude larger than the rows returned.
|
|
14
|
+
|
|
15
|
+
## Connection Pool Exhaustion
|
|
16
|
+
|
|
17
|
+
PgBouncer pools connections at the transaction level. When all connections are
|
|
18
|
+
in use, new queries queue until max_client_conn is reached, at which point clients
|
|
19
|
+
receive "too many clients" errors. Mitigate by:
|
|
20
|
+
1. Reducing max_connections per Rails process via database.yml pool setting.
|
|
21
|
+
2. Increasing default_pool_size in pgbouncer.ini incrementally.
|
|
22
|
+
3. Identifying and killing idle-in-transaction connections:
|
|
23
|
+
|
|
24
|
+
SELECT pid, state, query, now() - query_start AS duration
|
|
25
|
+
FROM pg_stat_activity WHERE state = 'idle in transaction'
|
|
26
|
+
AND query_start < now() - interval '30 seconds';
|
|
27
|
+
|
|
28
|
+
## Table Bloat and Vacuum
|
|
29
|
+
|
|
30
|
+
High update/delete workloads generate table bloat. Check with:
|
|
31
|
+
|
|
32
|
+
SELECT relname, n_dead_tup, n_live_tup,
|
|
33
|
+
round(n_dead_tup::numeric / nullif(n_live_tup, 0) * 100, 1) AS dead_pct
|
|
34
|
+
FROM pg_stat_user_tables ORDER BY dead_pct DESC;
|
|
35
|
+
|
|
36
|
+
If dead_pct exceeds 20% on a hot table, trigger VACUUM ANALYZE manually. For
|
|
37
|
+
severe bloat, schedule an off-hours VACUUM FULL (acquires exclusive lock).
|
|
38
|
+
Autovacuum scale factor defaults to 0.2; reduce to 0.05 on high-churn tables.
|
|
39
|
+
|
|
40
|
+
## Replication Lag
|
|
41
|
+
|
|
42
|
+
Monitor standby lag with:
|
|
43
|
+
|
|
44
|
+
SELECT client_addr, write_lag, flush_lag, replay_lag
|
|
45
|
+
FROM pg_stat_replication;
|
|
46
|
+
|
|
47
|
+
Lag above 30 seconds indicates the replica is falling behind writes. Common
|
|
48
|
+
causes: long-running VACUUM on primary holding WAL files, network saturation
|
|
49
|
+
between primary and replica, or index builds on the replica.
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Redis Caching Patterns — Implementation Guide
|
|
2
|
+
|
|
3
|
+
## Cache Key Design
|
|
4
|
+
|
|
5
|
+
Keys must encode every dimension that affects the cached value. For a
|
|
6
|
+
user-scoped collection: `orders:user_USER_ID:page_PAGE:v2`. Always include a
|
|
7
|
+
version suffix (v2) so a code deploy can invalidate globally by bumping the
|
|
8
|
+
version, without a manual cache flush. Avoid encoding mutable data (e.g.,
|
|
9
|
+
user.plan) directly in the key; use separate keys and join at read time,
|
|
10
|
+
or accept stale reads.
|
|
11
|
+
|
|
12
|
+
## TTL Strategy
|
|
13
|
+
|
|
14
|
+
Set TTLs based on acceptable staleness, not on intuition:
|
|
15
|
+
|
|
16
|
+
- User session data: 24h (refreshed on activity)
|
|
17
|
+
- API response cache (authenticated): 5 minutes
|
|
18
|
+
- API response cache (public, CDN-backed): 60 seconds
|
|
19
|
+
- Computed aggregates (dashboards): 15 minutes with background refresh
|
|
20
|
+
- Feature flags: 30 seconds (fast propagation of flag changes)
|
|
21
|
+
|
|
22
|
+
Always set a TTL. Unbounded keys are a production outage waiting to happen
|
|
23
|
+
when a runaway process fills the Redis instance.
|
|
24
|
+
|
|
25
|
+
## Cache Invalidation
|
|
26
|
+
|
|
27
|
+
Explicit invalidation is more reliable than TTL-only for write-heavy data. Use
|
|
28
|
+
after_commit callbacks to delete or update cache entries when records change.
|
|
29
|
+
For collections, track the latest updated_at timestamp as the cache key
|
|
30
|
+
component (Russian doll caching). When multiple cache entries must be
|
|
31
|
+
invalidated atomically, use a Redis pipeline or Lua script.
|
|
32
|
+
|
|
33
|
+
## Redis Memory Pressure
|
|
34
|
+
|
|
35
|
+
When Redis hits maxmemory, it evicts keys according to the eviction policy. Use
|
|
36
|
+
`allkeys-lru` for pure cache workloads. Monitor `evicted_keys` in Redis INFO; a
|
|
37
|
+
non-zero and growing value means your cache is too small for the working set.
|
|
38
|
+
Separate cache and session data into different Redis instances (or databases)
|
|
39
|
+
so session eviction cannot be triggered by cache pressure.
|
|
40
|
+
|
|
41
|
+
## Stampede Protection
|
|
42
|
+
|
|
43
|
+
Under high read concurrency, a cache miss causes multiple processes to
|
|
44
|
+
simultaneously recompute the same expensive value — the cache stampede.
|
|
45
|
+
Mitigate with probabilistic early expiration: recompute when TTL drops below a
|
|
46
|
+
random fraction of the original TTL. Alternatively, use a distributed lock
|
|
47
|
+
(Redlock or a simple SET NX PX lock key) to allow only one process to recompute
|
|
48
|
+
while others wait briefly on the stale value.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Background Job Processing with Sidekiq — Engineering Guide
|
|
2
|
+
|
|
3
|
+
## Job Design Principles
|
|
4
|
+
|
|
5
|
+
Every Sidekiq job must be idempotent: running it twice with the same arguments
|
|
6
|
+
must produce the same outcome. This is non-negotiable because Sidekiq retries
|
|
7
|
+
failed jobs and at-least-once delivery is guaranteed, not exactly-once. Achieve
|
|
8
|
+
idempotency by checking preconditions (has this invoice already been generated?),
|
|
9
|
+
using database unique constraints on job output records, and passing Stripe
|
|
10
|
+
idempotency keys.
|
|
11
|
+
|
|
12
|
+
## Retry Configuration
|
|
13
|
+
|
|
14
|
+
The default retry count is 25, which provides backoff up to ~21 days. For
|
|
15
|
+
time-sensitive jobs (send_welcome_email) reduce to 3. For financial jobs
|
|
16
|
+
(charge_subscription) keep at least 15 to survive multi-hour outages.
|
|
17
|
+
|
|
18
|
+
Configure per-job: `sidekiq_options retry: 10`
|
|
19
|
+
|
|
20
|
+
Customize backoff with sidekiq_retry_in:
|
|
21
|
+
|
|
22
|
+
sidekiq_retry_in { |count| (count ** 4) + 15 + rand(30) * count }
|
|
23
|
+
|
|
24
|
+
This gives approximately: 15s, 30s, 1m, 2m, 5m for the first 5 retries.
|
|
25
|
+
|
|
26
|
+
## Circuit Breaker Pattern
|
|
27
|
+
|
|
28
|
+
When a downstream service (Stripe, SendGrid) is degraded, jobs fail rapidly and
|
|
29
|
+
fill the retry queue, creating a thundering-herd effect when the service
|
|
30
|
+
recovers. Use a circuit breaker backed by Redis:
|
|
31
|
+
|
|
32
|
+
- Set `stripe:circuit_open` in Redis when 3 consecutive failures occur.
|
|
33
|
+
- In a job middleware, check the flag; if open, re-enqueue with 5-minute delay.
|
|
34
|
+
- Auto-clear the flag after 10 minutes using Redis TTL.
|
|
35
|
+
|
|
36
|
+
This converts retry churn into scheduled bursts.
|
|
37
|
+
|
|
38
|
+
## Dead Queue Management
|
|
39
|
+
|
|
40
|
+
Jobs reach the dead queue after exhausting all retries. Never bulk-retry
|
|
41
|
+
blindly. Group dead jobs by error class, inspect a sample for root cause,
|
|
42
|
+
fix the underlying issue, then use a Rake task to re-enqueue in batches of 50
|
|
43
|
+
with a 1-second inter-batch sleep to avoid overwhelming the recovered service.
|
|
44
|
+
Log each re-enqueue with original args and failure reason.
|
|
45
|
+
|
|
46
|
+
## Queue Priority and Latency Budgets
|
|
47
|
+
|
|
48
|
+
Define at least three queues: critical (< 1s SLA: auth, payments), default
|
|
49
|
+
(< 30s: email, webhooks), and bulk (< 1h: exports, reports). Run dedicated
|
|
50
|
+
Sidekiq processes per queue tier. Never mix critical and bulk work in the same
|
|
51
|
+
process — a spike of bulk jobs will starve critical work if they share a process.
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Example 26: Embedding-Based Document Store
|
|
5
|
+
#
|
|
6
|
+
# Demonstrates Memory#store_document and Memory#search_documents — a
|
|
7
|
+
# lightweight RAG store backed by fastembed (BAAI/bge-small-en-v1.5).
|
|
8
|
+
#
|
|
9
|
+
# Documents are multi-paragraph engineering guides stored as Markdown files in:
|
|
10
|
+
# examples/26_document_store/
|
|
11
|
+
#
|
|
12
|
+
# Usage:
|
|
13
|
+
# ruby examples/26_document_store.rb
|
|
14
|
+
# (Downloads the ~23 MB ONNX model on first run; cached afterwards.)
|
|
15
|
+
|
|
16
|
+
ENV["ROBOT_LAB_TEMPLATE_PATH"] ||= File.join(__dir__, "prompts")
|
|
17
|
+
|
|
18
|
+
require_relative "../lib/robot_lab"
|
|
19
|
+
|
|
20
|
+
puts "=" * 60
|
|
21
|
+
puts "Example 26: Embedding-Based Document Store"
|
|
22
|
+
puts "=" * 60
|
|
23
|
+
puts
|
|
24
|
+
puts "Note: First run downloads the fastembed model (~23 MB, cached)."
|
|
25
|
+
puts
|
|
26
|
+
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# Load documents from the companion directory
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
DOC_DIR = File.join(__dir__, "26_document_store")
|
|
31
|
+
|
|
32
|
+
DOCUMENTS = Dir[File.join(DOC_DIR, "*.md")].sort.each_with_object({}) do |path, h|
|
|
33
|
+
key = File.basename(path, ".md").to_sym
|
|
34
|
+
h[key] = File.read(path)
|
|
35
|
+
end.freeze
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
# Store into a standalone DocumentStore
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
store = RobotLab::DocumentStore.new
|
|
41
|
+
|
|
42
|
+
print "Storing #{DOCUMENTS.size} documents... "
|
|
43
|
+
DOCUMENTS.each { |key, text| store.store(key, text) }
|
|
44
|
+
puts "done"
|
|
45
|
+
puts
|
|
46
|
+
DOCUMENTS.each { |key, text| puts " #{key.to_s.ljust(24)} #{text.split.size} words" }
|
|
47
|
+
puts
|
|
48
|
+
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
# Queries — each phrased differently from the document content
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
QUERIES = [
|
|
53
|
+
{
|
|
54
|
+
label: "Database query performance",
|
|
55
|
+
query: "Why is my Postgres query slow and how do I investigate it?",
|
|
56
|
+
want: :postgres_runbook
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
label: "Background job failures during outage",
|
|
60
|
+
query: "Jobs keep failing when Stripe is down. How do I stop them piling up?",
|
|
61
|
+
want: :sidekiq_guide
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
label: "API breaking changes policy",
|
|
65
|
+
query: "Can I rename a response field in the API without breaking clients?",
|
|
66
|
+
want: :api_versioning_adr
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
label: "Cache expiry and memory pressure",
|
|
70
|
+
query: "Redis is evicting keys unexpectedly and the cache hit rate has dropped.",
|
|
71
|
+
want: :redis_caching_guide
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
label: "Production outage from table lock",
|
|
75
|
+
query: "We had an outage caused by a database lock during a migration. What happened?",
|
|
76
|
+
want: :incident_postmortem
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
label: "Semantic gap — no shared keywords",
|
|
80
|
+
query: "Connection pool is full and new requests are being rejected.",
|
|
81
|
+
want: :postgres_runbook
|
|
82
|
+
},
|
|
83
|
+
].freeze
|
|
84
|
+
|
|
85
|
+
# Runs every demo query against the store and reports whether the top-ranked
# document is the one the query was written to find (q[:want]), followed by
# the full ranking with scores.
QUERIES.each do |q|
  results = store.search(q[:query], limit: 3)
  top = results.first

  puts "── #{q[:label]}"
  puts " Query: \"#{q[:query]}\""

  if top
    verdict = top[:key] == q[:want] ? "✓ correct" : "✗ expected #{q[:want]}"
    ranking = results.map { |r| "#{r[:key]} #{format("%.3f", r[:score])}" }.join(" | ")
    puts " Top result: #{top[:key]} (#{format("%.3f", top[:score])}) — #{verdict}"
    puts " Ranking: #{ranking}"
  else
    # An empty result set would otherwise raise NoMethodError on nil and
    # abort the remaining queries.
    puts " Top result: (none) — ✗ expected #{q[:want]}"
  end
  puts
end
|
|
96
|
+
|
|
97
|
+
# ---------------------------------------------------------------------------
|
|
98
|
+
# Delete and verify
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
# Removes one document, then repeats a query that previously matched it to
# show that the deleted key no longer appears in the store or the results.
puts "── Delete :redis_caching_guide, re-run cache query"
store.delete(:redis_caching_guide)
results = store.search("Redis evicting keys unexpectedly", limit: 2)
top_after_delete = results.first
puts " Remaining keys: #{store.keys.inspect}"
# Guard the nil case so an empty result set prints a placeholder instead of
# raising NoMethodError.
puts " Top result after deletion: #{top_after_delete ? top_after_delete[:key] : "(no results)"}"
puts
|
|
106
|
+
|
|
107
|
+
# ---------------------------------------------------------------------------
|
|
108
|
+
# Memory integration
|
|
109
|
+
# ---------------------------------------------------------------------------
|
|
110
|
+
puts "── Memory integration"
|
|
111
|
+
memory = RobotLab::Memory.new(enable_cache: false)
|
|
112
|
+
|
|
113
|
+
DOCUMENTS.each { |key, text| memory.store_document(key, text) }
|
|
114
|
+
puts " Stored #{memory.document_keys.size} documents via memory.store_document"
|
|
115
|
+
|
|
116
|
+
hits = memory.search_documents("slow query bloat vacuum autovacuum", limit: 2)
|
|
117
|
+
puts " Search 'slow query bloat vacuum autovacuum':"
|
|
118
|
+
hits.each { |h| puts " #{h[:key]} (#{format("%.3f", h[:score])})" }
|
|
119
|
+
|
|
120
|
+
memory.delete_document(:postgres_runbook)
|
|
121
|
+
puts " After delete, keys: #{memory.document_keys.inspect}"
|
|
122
|
+
puts
|
|
123
|
+
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
# RAG pattern
|
|
126
|
+
# ---------------------------------------------------------------------------
|
|
127
|
+
puts "=" * 60
|
|
128
|
+
puts "RAG Pattern: retrieve relevant docs, then generate with LLM"
|
|
129
|
+
puts "=" * 60
|
|
130
|
+
puts
|
|
131
|
+
|
|
132
|
+
rag_query = "Our Sidekiq jobs exhaust retries and land in the dead queue after a Stripe outage."
|
|
133
|
+
|
|
134
|
+
hits = store.search(rag_query, limit: 2)
|
|
135
|
+
context = hits.map { |h| h[:text] }.join("\n\n---\n\n")
|
|
136
|
+
|
|
137
|
+
puts "User question:"
|
|
138
|
+
puts " \"#{rag_query}\""
|
|
139
|
+
puts
|
|
140
|
+
puts "Retrieved #{hits.size} document(s) — #{context.split.size} words of context:"
|
|
141
|
+
hits.each { |h| puts " #{h[:key]} (score #{format("%.3f", h[:score])})" }
|
|
142
|
+
puts
|
|
143
|
+
puts "LLM call would be:"
|
|
144
|
+
puts ' robot.run("Use the following docs:\n#{context}\n\nQuestion: #{rag_query}")'
|
|
145
|
+
puts
|
|
146
|
+
|
|
147
|
+
puts "Done."
|