thinkwork-cli 0.8.2 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/LICENSE +202 -0
  2. package/README.md +18 -2
  3. package/dist/cli.js +3004 -215
  4. package/dist/terraform/examples/greenfield/main.tf +325 -19
  5. package/dist/terraform/examples/greenfield/terraform.tfvars.example +14 -0
  6. package/dist/terraform/modules/app/agentcore-code-interpreter/Dockerfile.sandbox-base +61 -0
  7. package/dist/terraform/modules/app/agentcore-code-interpreter/README.md +54 -0
  8. package/dist/terraform/modules/app/agentcore-code-interpreter/main.tf +197 -0
  9. package/dist/terraform/modules/app/agentcore-code-interpreter/scripts/build_and_push_sandbox_base.sh +70 -0
  10. package/dist/terraform/modules/app/agentcore-flue/README.md +58 -0
  11. package/dist/terraform/modules/app/agentcore-flue/main.tf +322 -0
  12. package/dist/terraform/modules/app/agentcore-flue/outputs.tf +23 -0
  13. package/dist/terraform/modules/app/agentcore-flue/variables.tf +91 -0
  14. package/dist/terraform/modules/app/agentcore-memory/scripts/create_or_find_memory.sh +0 -0
  15. package/dist/terraform/modules/app/agentcore-runtime/main.tf +204 -4
  16. package/dist/terraform/modules/app/appsync-subscriptions/main.tf +4 -0
  17. package/dist/terraform/modules/app/appsync-subscriptions/outputs.tf +5 -0
  18. package/dist/terraform/modules/app/computer-runtime/README.md +15 -0
  19. package/dist/terraform/modules/app/computer-runtime/main.tf +406 -0
  20. package/dist/terraform/modules/app/computer-runtime/outputs.tf +75 -0
  21. package/dist/terraform/modules/app/computer-runtime/variables.tf +66 -0
  22. package/dist/terraform/modules/app/hindsight-memory/main.tf +6 -0
  23. package/dist/terraform/modules/app/lambda-api/eval-fanout.tf +128 -0
  24. package/dist/terraform/modules/app/lambda-api/handlers.tf +1557 -42
  25. package/dist/terraform/modules/app/lambda-api/main.tf +299 -15
  26. package/dist/terraform/modules/app/lambda-api/mcp-oauth.tf +118 -0
  27. package/dist/terraform/modules/app/lambda-api/oauth-secrets.tf +49 -0
  28. package/dist/terraform/modules/app/lambda-api/outputs.tf +38 -0
  29. package/dist/terraform/modules/app/lambda-api/slack-app-secrets.tf +43 -0
  30. package/dist/terraform/modules/app/lambda-api/stripe-secrets.tf +53 -0
  31. package/dist/terraform/modules/app/lambda-api/variables.tf +349 -2
  32. package/dist/terraform/modules/app/lambda-api/workspace-events.tf +125 -0
  33. package/dist/terraform/modules/app/routines-stepfunctions/main.tf +453 -0
  34. package/dist/terraform/modules/app/sandbox-log-scrubber/README.md +66 -0
  35. package/dist/terraform/modules/app/sandbox-log-scrubber/main.tf +200 -0
  36. package/dist/terraform/modules/app/static-site/main.tf +146 -5
  37. package/dist/terraform/modules/app/www-dns/main.tf +118 -15
  38. package/dist/terraform/modules/app/www-dns/outputs.tf +10 -0
  39. package/dist/terraform/modules/app/www-dns/variables.tf +42 -0
  40. package/dist/terraform/modules/data/aurora-postgres/main.tf +164 -3
  41. package/dist/terraform/modules/data/aurora-postgres/outputs.tf +34 -0
  42. package/dist/terraform/modules/data/aurora-postgres/variables.tf +16 -0
  43. package/dist/terraform/modules/data/compliance-audit-bucket/README.md +145 -0
  44. package/dist/terraform/modules/data/compliance-audit-bucket/main.tf +573 -0
  45. package/dist/terraform/modules/data/compliance-audit-bucket/outputs.tf +43 -0
  46. package/dist/terraform/modules/data/compliance-audit-bucket/variables.tf +93 -0
  47. package/dist/terraform/modules/data/compliance-exports-bucket/main.tf +269 -0
  48. package/dist/terraform/modules/data/compliance-exports-bucket/outputs.tf +23 -0
  49. package/dist/terraform/modules/data/compliance-exports-bucket/variables.tf +50 -0
  50. package/dist/terraform/modules/data/s3-backups-bucket/main.tf +123 -0
  51. package/dist/terraform/modules/data/s3-buckets/main.tf +13 -0
  52. package/dist/terraform/modules/foundation/cognito/variables.tf +5 -2
  53. package/dist/terraform/modules/thinkwork/main.tf +439 -21
  54. package/dist/terraform/modules/thinkwork/outputs.tf +121 -0
  55. package/dist/terraform/modules/thinkwork/variables.tf +165 -6
  56. package/dist/terraform/schema.graphql +45 -0
  57. package/package.json +15 -14
@@ -7,34 +7,56 @@
7
7
  ################################################################################
8
8
 
9
9
  locals {
10
- use_local_zips = var.lambda_zips_dir != ""
11
- runtime = "nodejs20.x"
10
+ use_local_zips = var.lambda_zips_dir != ""
11
+ eval_fanout_queue_url = local.use_local_zips ? aws_sqs_queue.eval_fanout[0].url : ""
12
+ runtime = "nodejs20.x"
12
13
 
13
14
  # Common environment variables shared by all API handlers
14
15
  common_env = {
15
- STAGE = var.stage
16
- DATABASE_URL = "postgresql://${var.db_username}:${urlencode(var.db_password)}@${var.db_cluster_endpoint}:5432/${var.database_name}?sslmode=no-verify"
17
- DATABASE_SECRET_ARN = var.graphql_db_secret_arn
18
- DATABASE_HOST = var.db_cluster_endpoint
19
- DATABASE_NAME = var.database_name
20
- BUCKET_NAME = var.bucket_name
21
- USER_POOL_ID = var.user_pool_id
22
- COGNITO_USER_POOL_ID = var.user_pool_id
23
- ADMIN_CLIENT_ID = var.admin_client_id
24
- MOBILE_CLIENT_ID = var.mobile_client_id
25
- COGNITO_APP_CLIENT_IDS = "${var.admin_client_id},${var.mobile_client_id}"
26
- APPSYNC_ENDPOINT = var.appsync_api_url
27
- APPSYNC_API_KEY = var.appsync_api_key
28
- GRAPHQL_API_KEY = var.appsync_api_key
29
- API_AUTH_SECRET = var.api_auth_secret
30
- THINKWORK_API_SECRET = var.api_auth_secret
31
- EMAIL_HMAC_SECRET = var.api_auth_secret
32
- THINKWORK_API_URL = "https://${aws_apigatewayv2_api.main.id}.execute-api.${var.region}.amazonaws.com"
33
- AGENTCORE_FUNCTION_NAME = var.agentcore_function_name
34
- WORKSPACE_BUCKET = var.bucket_name
35
- HINDSIGHT_ENDPOINT = var.hindsight_endpoint
36
- AGENTCORE_MEMORY_ID = var.agentcore_memory_id
37
- MEMORY_ENGINE = var.memory_engine
16
+ STAGE = var.stage
17
+ DATABASE_URL = "postgresql://${var.db_username}:${urlencode(var.db_password)}@${var.db_cluster_endpoint}:5432/${var.database_name}?sslmode=no-verify"
18
+ DATABASE_SECRET_ARN = var.graphql_db_secret_arn
19
+ DATABASE_HOST = var.db_cluster_endpoint
20
+ DATABASE_NAME = var.database_name
21
+ BUCKET_NAME = var.bucket_name
22
+ USER_POOL_ID = var.user_pool_id
23
+ COGNITO_USER_POOL_ID = var.user_pool_id
24
+ ADMIN_CLIENT_ID = var.admin_client_id
25
+ MOBILE_CLIENT_ID = var.mobile_client_id
26
+ COGNITO_MCP_CLIENT_ID = aws_cognito_user_pool_client.mcp_oauth.id
27
+ COGNITO_AUTH_BASE_URL = local.mcp_oauth_cognito_base_url
28
+ MCP_OAUTH_CALLBACK_URL = "${local.mcp_oauth_api_base_url}/mcp/oauth/callback"
29
+ MCP_OAUTH_REVOCATIONS_TABLE = aws_dynamodb_table.mcp_oauth_revocations.name
30
+ COGNITO_APP_CLIENT_IDS = "${var.admin_client_id},${var.mobile_client_id}"
31
+ APPSYNC_ENDPOINT = var.appsync_api_url
32
+ APPSYNC_API_KEY = var.appsync_api_key
33
+ GRAPHQL_API_KEY = var.appsync_api_key
34
+ API_AUTH_SECRET = var.api_auth_secret
35
+ THINKWORK_API_SECRET = var.api_auth_secret
36
+ EMAIL_HMAC_SECRET = var.api_auth_secret
37
+ THINKWORK_API_URL = "https://${aws_apigatewayv2_api.main.id}.execute-api.${var.region}.amazonaws.com"
38
+ # Comma-separated allowlist of caller emails permitted to invoke
39
+ # operator-gated mutations (updateTenantPolicy, sandbox fixture
40
+ # setup, etc.). Resolved against ctx.auth.email, which is pulled
41
+ # from the Cognito JWT for user callers and from the
42
+ # `x-principal-email` header for service-auth callers (see
43
+ # packages/api/src/lib/cognito-auth.ts). Empty ⇒ the gate
44
+ # rejects every call, which is the safe default pre-rollout.
45
+ THINKWORK_PLATFORM_OPERATOR_EMAILS = var.platform_operator_emails
46
+ AGENTCORE_FUNCTION_NAME = var.agentcore_function_name
47
+ AGENTCORE_FLUE_FUNCTION_NAME = var.agentcore_flue_function_name
48
+ # SSM parameter names for the Bedrock AgentCore Runtime IDs (one per
49
+ # runtime type). deploy.yml's "Update AgentCore Runtimes" job writes
50
+ # these in `update-agentcore-runtime-image.sh`. eval-runner reads them
51
+ # via `loadRuntimeId(runtimeType)` to start a Bedrock-control-plane
52
+ # invocation against the right runtime — pre-U3 the flue path was
53
+ # dead because the env var was never wired here.
54
+ AGENTCORE_RUNTIME_SSM_STRANDS = "/thinkwork/${var.stage}/agentcore/runtime-id-strands"
55
+ AGENTCORE_RUNTIME_SSM_FLUE = "/thinkwork/${var.stage}/agentcore/runtime-id-flue"
56
+ WORKSPACE_BUCKET = var.bucket_name
57
+ HINDSIGHT_ENDPOINT = var.hindsight_endpoint
58
+ AGENTCORE_MEMORY_ID = var.agentcore_memory_id
59
+ MEMORY_ENGINE = var.memory_engine
38
60
  # Skip the SSM indirection for cross-function ARN lookup. Terraform
39
61
  # already knows this ARN at apply time and the Lambda role's SSM
40
62
  # permission has been a recurring source of silent failures where
@@ -47,21 +69,202 @@ locals {
47
69
  ECR_REPOSITORY_URL = var.ecr_repository_url
48
70
  AWS_ACCOUNT_ID = var.account_id
49
71
  NODE_OPTIONS = "--enable-source-maps"
50
- # LastMile Tasks REST API base URL feature-flags the outbound sync
51
- # path. When unset, syncExternalTaskOnCreate writes sync_status='local'
52
- # and the workflow picker proxy returns 503. Set to the LMI develop /
53
- # staging / prod base URL per stage to enable real cross-system sync.
54
- LASTMILE_TASKS_API_URL = var.lastmile_tasks_api_url
72
+ # Per-user OAuth wiring (Google Workspace today; Microsoft 365 follow-up).
73
+ # Secret ARNs are the indirection; the actual client_id/client_secret
74
+ # values live in Secrets Manager and are fetched by
75
+ # packages/api/src/lib/oauth-client-credentials.ts at cold-start.
76
+ # OAUTH_CALLBACK_URL is the URL registered with Google/Azure OAuth apps.
77
+ # REDIRECT_SUCCESS_URL is the fallback post-OAuth redirect when the
78
+ # caller doesn't pass a per-request returnUrl (mobile passes thinkwork://).
79
+ GOOGLE_PRODUCTIVITY_OAUTH_SECRET_ARN = aws_secretsmanager_secret.oauth_google_productivity.arn
80
+ OAUTH_CALLBACK_URL = "https://${aws_apigatewayv2_api.main.id}.execute-api.${var.region}.amazonaws.com/api/oauth/callback"
81
+ REDIRECT_SUCCESS_URL = var.redirect_success_url
82
+ COMPANY_BRAIN_SOURCE_AGENT_MODEL_ID = var.company_brain_source_agent_model_id
83
+ # Stripe billing — see stripe-secrets.tf. The ARN is the indirection;
84
+ # the actual keys live in Secrets Manager and are fetched by
85
+ # packages/api/src/lib/stripe-credentials.ts at cold-start. Price IDs
86
+ # are non-secret per-stage config carried as a plain JSON env var so
87
+ # staging/prod can use different products without a secret rotation.
88
+ STRIPE_CREDENTIALS_SECRET_ARN = aws_secretsmanager_secret.stripe_api_credentials.arn
89
+ STRIPE_PRICE_IDS_JSON = var.stripe_price_ids_json
90
+ STRIPE_CHECKOUT_SUCCESS_URL = "${var.admin_url}/onboarding/welcome?session_id={CHECKOUT_SESSION_ID}"
91
+ STRIPE_CHECKOUT_CANCEL_URL = "${var.www_url}/cloud"
92
+ WWW_URL = var.www_url
93
+ # Override the welcome email's From: address. Defaults to
94
+ # hello@agents.thinkwork.ai (the already-verified SES inbound domain);
95
+ # set to hello@thinkwork.ai once the bare-apex identity is verified in SES.
96
+ STRIPE_WELCOME_FROM_EMAIL = var.stripe_welcome_from_email
97
+ }
98
+
99
+ # Computer runtime control handlers only need database access, service-auth,
100
+ # the API callback URL, and ECS/EFS runtime wiring. Using the full common_env
101
+ # pushes computer-manager over Lambda's 4KB environment-variable limit in dev.
102
+ computer_runtime_control_base_env = {
103
+ STAGE = var.stage
104
+ DATABASE_URL = "postgresql://${var.db_username}:${urlencode(var.db_password)}@${var.db_cluster_endpoint}:5432/${var.database_name}?sslmode=no-verify"
105
+ API_AUTH_SECRET = var.api_auth_secret
106
+ THINKWORK_API_URL = "https://${aws_apigatewayv2_api.main.id}.execute-api.${var.region}.amazonaws.com"
107
+ NODE_OPTIONS = "--enable-source-maps"
108
+ }
109
+
110
+ computer_runtime_control_env = {
111
+ COMPUTER_RUNTIME_CLUSTER_NAME = var.computer_runtime_cluster_name
112
+ COMPUTER_RUNTIME_EFS_FILE_SYSTEM_ID = var.computer_runtime_efs_file_system_id
113
+ COMPUTER_RUNTIME_SUBNET_IDS = join(",", var.computer_runtime_subnet_ids)
114
+ COMPUTER_RUNTIME_ASSIGN_PUBLIC_IP = var.computer_runtime_assign_public_ip
115
+ COMPUTER_RUNTIME_TASK_SG_ID = var.computer_runtime_task_sg_id
116
+ COMPUTER_RUNTIME_EXECUTION_ROLE_ARN = var.computer_runtime_execution_role_arn
117
+ COMPUTER_RUNTIME_TASK_ROLE_ARN = var.computer_runtime_task_role_arn
118
+ COMPUTER_RUNTIME_LOG_GROUP_NAME = var.computer_runtime_log_group_name
119
+ COMPUTER_RUNTIME_REPOSITORY_URL = var.computer_runtime_repository_url
120
+ COMPUTER_RUNTIME_DEFAULT_CPU = tostring(var.computer_runtime_default_cpu)
121
+ COMPUTER_RUNTIME_DEFAULT_MEMORY = tostring(var.computer_runtime_default_memory)
55
122
  }
56
123
 
57
124
  # Per-handler env-var overrides. ARNs are constructed from the naming
58
125
  # pattern (same trick as lambda_api_cross_invoke in main.tf) so we don't
59
126
  # introduce a self-referential dependency inside the handler for_each.
127
+ slack_handler_env = {
128
+ SLACK_APP_CREDENTIALS_SECRET_ARN = aws_secretsmanager_secret.slack_app_credentials.arn
129
+ }
130
+
60
131
  handler_extra_env = {
132
+ "extension-proxy" = {
133
+ EXTENSION_PROXY_BACKENDS_JSON = var.extension_proxy_backends_json
134
+ EXTENSION_PROXY_SIGNING_SECRET = var.extension_proxy_signing_secret
135
+ }
61
136
  "job-schedule-manager" = {
62
137
  JOB_TRIGGER_ARN = "arn:aws:lambda:${var.region}:${var.account_id}:function:thinkwork-${var.stage}-api-job-trigger"
63
138
  JOB_TRIGGER_ROLE_ARN = var.job_scheduler_role_arn
64
139
  }
140
+ # Compounding Memory compile Lambda. Any Converse-compatible Bedrock
141
+ # model works; the planner + section-writer cap themselves at ~500
142
+ # records / 25 new pages per invocation so a 480 s timeout covers
143
+ # the worst case comfortably. Env vars come from variables so
144
+ # unrelated deploys don't wipe them back to defaults (the aggregation
145
+ # flag got reset on every terraform apply before this was pinned).
146
+ "wiki-compile" = {
147
+ BEDROCK_MODEL_ID = var.wiki_compile_model_id
148
+ WIKI_AGGREGATION_PASS_ENABLED = var.wiki_aggregation_pass_enabled
149
+ WIKI_DETERMINISTIC_LINKING_ENABLED = var.wiki_deterministic_linking_enabled
150
+ # Name (not value) of the SecureString SSM parameter that holds the
151
+ # Google Places API key. wiki-compile fetches + caches on cold start.
152
+ # The parameter may contain a placeholder value at apply time — the
153
+ # Lambda logs and degrades gracefully if decryption returns empty.
154
+ GOOGLE_PLACES_SSM_PARAM_NAME = "/thinkwork/${var.stage}/google-places/api-key"
155
+ }
156
+ "wiki-export" = {
157
+ WIKI_EXPORT_BUCKET = aws_s3_bucket.wiki_exports.bucket
158
+ }
159
+ # workspace-files invokes the workspace-files-efs sidecar synchronously
160
+ # (RequestResponse) for Computer-target list/get to bypass the computer_tasks
161
+ # queue. ARN constructed from the naming pattern to avoid a self-
162
+ # referential dependency on the standalone Lambda resource defined at
163
+ # the bottom of this file.
164
+ "workspace-files" = {
165
+ WORKSPACE_FILES_EFS_FN_ARN = "arn:aws:lambda:${var.region}:${var.account_id}:function:thinkwork-${var.stage}-api-workspace-files-efs"
166
+ }
167
+ "oauth-authorize" = local.slack_handler_env
168
+ "oauth-callback" = local.slack_handler_env
169
+ "slack-events" = local.slack_handler_env
170
+ "slack-slash-command" = local.slack_handler_env
171
+ "slack-interactivity" = local.slack_handler_env
172
+ "slack-oauth-install" = local.slack_handler_env
173
+ "slack-dispatch" = local.slack_handler_env
174
+ # computer-terminal-start needs the cluster name to scope its
175
+ # ECS ListTasks / DescribeTasks / ExecuteCommand calls.
176
+ "computer-terminal-start" = {
177
+ COMPUTER_RUNTIME_CLUSTER_NAME = var.computer_runtime_cluster_name
178
+ }
179
+ # computer-manager and computer-runtime-reconciler consume ECS/EFS task
180
+ # config from packages/api/src/lib/computers/runtime-control.ts.
181
+ # Scoping the COMPUTER_RUNTIME_* variables here (instead of in
182
+ # local.common_env) keeps the per-Lambda env-var payload under
183
+ # the AWS 4KB hard limit — they were previously dumped into every
184
+ # handler and pushed ~70 Lambdas over quota.
185
+ "computer-manager" = local.computer_runtime_control_env
186
+ "computer-runtime-reconciler" = local.computer_runtime_control_env
187
+ "mcp-context-engine" = {
188
+ CONTEXT_ENGINE_MEMORY_QUERY_MODE = "reflect"
189
+ CONTEXT_ENGINE_MEMORY_TIMEOUT_MS = "20000"
190
+ }
191
+ # routine-task-python (Phase B U6) needs the AgentCore code-interpreter
192
+ # id + the per-stage S3 routine-output bucket. The interpreter id is
193
+ # provisioned by the agentcore-code-interpreter module and exposed via
194
+ # the agentcore_code_interpreter_id input variable; the bucket name
195
+ # follows the per-stage naming convention from the routines-stepfunctions
196
+ # module (Phase A U1).
197
+ "routine-task-python" = {
198
+ SANDBOX_INTERPRETER_ID = var.agentcore_code_interpreter_id
199
+ ROUTINE_OUTPUT_BUCKET = "thinkwork-${var.stage}-routine-output"
200
+ ROUTINE_PYTHON_ENV_ALLOWLIST = "TENANT_ID,ROUTINE_ID,EXECUTION_ID"
201
+ }
202
+ # graphql-http hosts the createRoutine / publishRoutineVersion / etc.
203
+ # resolvers (Phase B U7) AND the routine-approval-bridge (Phase B
204
+ # U8) which invokes routine-resume via the AWS SDK.
205
+ "graphql-http" = {
206
+ ROUTINES_EXECUTION_ROLE_ARN = var.routines_execution_role_arn
207
+ ROUTINES_LOG_GROUP_ARN = var.routines_log_group_arn
208
+ AWS_ACCOUNT_ID = var.account_id
209
+ # routine-approval-bridge (Phase B U8) calls this function name
210
+ # via the AWS SDK Lambda Invoke after a HITL decideInboxItem.
211
+ # The bridge throws if unset — terraform wiring is mandatory.
212
+ ROUTINE_RESUME_FUNCTION_NAME = "thinkwork-${var.stage}-api-routine-resume"
213
+ # triggerRoutineRun seeds this into the SFN execution input so the
214
+ # inbox_approval recipe Task can find the callback Lambda via
215
+ # $$.Execution.Input.inboxApprovalFunctionName.
216
+ ROUTINE_APPROVAL_CALLBACK_FUNCTION_NAME = "thinkwork-${var.stage}-api-routine-approval-callback"
217
+ EMAIL_SEND_FUNCTION_NAME = "thinkwork-${var.stage}-api-email-send"
218
+ ROUTINE_TASK_PYTHON_FUNCTION_NAME = "thinkwork-${var.stage}-api-routine-task-python"
219
+ ADMIN_OPS_MCP_FUNCTION_NAME = "thinkwork-${var.stage}-api-admin-ops-mcp"
220
+ SLACK_SEND_FUNCTION_NAME = "thinkwork-${var.stage}-api-slack-send"
221
+ # Phase 3 U10 — compliance read resolvers (complianceEvents,
222
+ # complianceEvent, complianceEventByHash) connect to Aurora as
223
+ # the compliance_reader role. The existing lambda_secrets policy
224
+ # in main.tf grants secretsmanager:GetSecretValue on the
225
+ # thinkwork/* wildcard, so no new IAM resource is needed.
226
+ COMPLIANCE_READER_SECRET_ARN = var.compliance_reader_secret_arn
227
+ # Phase 3 U11.U2 — createComplianceExport mutation dispatches a
228
+ # jobId to a known-name SQS queue. We do NOT pass the queue URL
229
+ # as an env var here: graphql-http's env block is already at the
230
+ # AWS 4 KB ceiling, and adding another URL pushed the deploy over
231
+ # the limit. The mutation derives the URL from STAGE + AWS_REGION
232
+ # + AWS_ACCOUNT_ID, which the Lambda already has. The runner
233
+ # Lambda (separate function below) keeps an explicit
234
+ # COMPLIANCE_EXPORTS_QUEUE_URL because its env is small.
235
+ }
236
+ # U2 eval fan-out substrate. eval-runner does not dispatch to this
237
+ # queue until U3; eval-worker is a throwing inert stub that redrives
238
+ # accidental traffic to the DLQ.
239
+ "eval-runner" = {
240
+ EVAL_FANOUT_QUEUE_URL = local.eval_fanout_queue_url
241
+ EVAL_DIRECT_AGENTCORE_MESSAGE_SHARDS = "20"
242
+ }
243
+ "eval-worker" = {
244
+ EVAL_FANOUT_QUEUE_URL = local.eval_fanout_queue_url
245
+ EVAL_AGENTCORE_EVALUATORS = "disabled"
246
+ }
247
+ # job-trigger fires scheduled routine runs via SFN.StartExecution
248
+ # (Phase B U7) — the alias ARN comes from the row, but the Lambda
249
+ # also reads AWS_ACCOUNT_ID for diagnostic logging. It also passes
250
+ # the routine-approval-callback function name in the SFN execution
251
+ # input so the inbox_approval recipe can fanout to it on .waitForTaskToken.
252
+ "job-trigger" = {
253
+ AWS_ACCOUNT_ID = var.account_id
254
+ ROUTINE_APPROVAL_CALLBACK_FUNCTION_NAME = "thinkwork-${var.stage}-api-routine-approval-callback"
255
+ EMAIL_SEND_FUNCTION_NAME = "thinkwork-${var.stage}-api-email-send"
256
+ ROUTINE_TASK_PYTHON_FUNCTION_NAME = "thinkwork-${var.stage}-api-routine-task-python"
257
+ ADMIN_OPS_MCP_FUNCTION_NAME = "thinkwork-${var.stage}-api-admin-ops-mcp"
258
+ SLACK_SEND_FUNCTION_NAME = "thinkwork-${var.stage}-api-slack-send"
259
+ }
260
+ # Phase 3 U4 Compliance outbox drainer.
261
+ # Connects to Aurora as the compliance_drainer role (provisioned in
262
+ # U2). The DATABASE_SECRET_ARN-style indirection is via
263
+ # COMPLIANCE_DRAINER_SECRET_ARN so the drainer's connection cache is
264
+ # isolated from the master `getDb()` cache used by other handlers.
265
+ "compliance-outbox-drainer" = {
266
+ COMPLIANCE_DRAINER_SECRET_ARN = var.compliance_drainer_secret_arn
267
+ }
65
268
  }
66
269
  }
67
270
 
@@ -74,18 +277,28 @@ resource "aws_lambda_function" "handler" {
74
277
  "graphql-http",
75
278
  "chat-agent-invoke",
76
279
  "wakeup-processor",
280
+ "workspace-event-dispatcher",
77
281
  "agents",
78
282
  "agent-actions",
79
283
  "messages",
80
284
  "connections",
81
285
  "oauth-authorize",
82
286
  "oauth-callback",
287
+ "stripe-checkout",
288
+ "stripe-webhook",
289
+ "stripe-portal",
290
+ "stripe-subscription",
291
+ "auth-me",
292
+ "extension-proxy",
83
293
  "teams",
84
294
  "team-members",
85
295
  "tenants",
86
296
  "users",
87
297
  "invites",
88
298
  "skills",
299
+ "mcp-oauth",
300
+ "mcp-user-memory",
301
+ "mcp-context-engine",
89
302
  "activity",
90
303
  "routines",
91
304
  "budgets",
@@ -93,39 +306,209 @@ resource "aws_lambda_function" "handler" {
93
306
  "scheduled-jobs",
94
307
  "job-schedule-manager",
95
308
  "job-trigger",
309
+ "routine-task-weather-email",
96
310
  "webhooks",
97
311
  "webhooks-admin",
98
312
  "webhook-deliveries-cleanup",
99
- "task-connectors",
313
+ "skill-runs-reconciler",
314
+ "cron-stall-monitor",
315
+ "webhook-crm-opportunity",
316
+ "webhook-task-event",
100
317
  "workspace-files",
101
318
  "knowledge-base-manager",
102
319
  "knowledge-base-files",
103
320
  "email-send",
104
321
  "email-inbound",
322
+ "slack-events",
323
+ "slack-slash-command",
324
+ "slack-interactivity",
325
+ "slack-oauth-install",
326
+ "slack-dispatch",
105
327
  "github-app",
106
328
  "github-repos",
107
329
  "memory",
108
330
  "memory-retain",
331
+ "wiki-compile",
332
+ "wiki-lint",
333
+ "wiki-export",
334
+ "wiki-bootstrap-import",
109
335
  "artifact-deliver",
110
336
  "recipe-refresh",
111
337
  "agent-skills-list",
112
338
  "bootstrap-workspaces",
339
+ "migrate-agents-to-computers",
340
+ "computer-runtime",
341
+ "computer-manager",
342
+ "computer-runtime-reconciler",
343
+ # Admin Terminal tab — POST /api/computers/{computerId}/terminal/start.
344
+ # Returns the SSM Session Manager session envelope (sessionId,
345
+ # streamUrl, tokenValue) so the browser can open a direct WebSocket
346
+ # to ssmmessages. Plan:
347
+ # docs/plans/2026-05-13-004-feat-computer-terminal-ecs-exec-plan.md.
348
+ "computer-terminal-start",
113
349
  "code-factory",
350
+ "eval-runner",
351
+ "eval-worker",
352
+ "eval-runs-reconciler",
353
+ # AgentCore Code Sandbox narrow REST endpoints (plan Unit 10 + Unit 11).
354
+ # Both are service-endpoint shape: the Strands container POSTs with
355
+ # Bearer API_AUTH_SECRET. No GraphQL resolver involvement, no extra IAM.
356
+ "sandbox-quota-check",
357
+ "sandbox-invocation-log",
358
+ # Routines Step Functions ASL validator (plan
359
+ # docs/plans/2026-05-01-004-feat-routines-phase-a-substrate-plan.md §U5).
360
+ # Bearer API_AUTH_SECRET; chat builder + publish flow call this before
361
+ # accepting LLM-emitted ASL. Needs states:ValidateStateMachineDefinition
362
+ # IAM grant — see main.tf.
363
+ "routine-asl-validator",
364
+ # Routines Step Functions Task wrappers (plan
365
+ # docs/plans/2026-05-01-005-feat-routines-phase-b-runtime-plan.md §U6).
366
+ # routine-task-python: SFN-invoked Lambda that runs `python` recipe
367
+ # states in the AgentCore code interpreter, offloading stdout/stderr
368
+ # to the per-stage routine-output bucket. Needs bedrock-agentcore
369
+ # (Start/Invoke/Stop CodeInterpreterSession) + S3 PutObject IAM —
370
+ # see main.tf.
371
+ "routine-task-python",
372
+ # routine-resume: SDK-invoked by routine-approval-bridge (Phase B
373
+ # U8) after a HITL decision. Calls SendTaskSuccess/SendTaskFailure;
374
+ # idempotent on already-consumed tokens. Needs states:SendTaskSuccess
375
+ # + states:SendTaskFailure IAM (already granted in U1's substrate).
376
+ "routine-resume",
377
+ # routine-approval-callback: SFN's inbox_approval Task invokes this
378
+ # via .waitForTaskToken (plan 2026-05-01-005 §U8). Creates the
379
+ # inbox_items row + persists the task token in routine_approval_tokens.
380
+ # No additional IAM beyond the lambda execution role's DB access —
381
+ # the trust boundary is the routines-stepfunctions execution role's
382
+ # lambda:InvokeFunction grant scoped to this Lambda's ARN.
383
+ "routine-approval-callback",
384
+ # routine-step-callback + routine-execution-callback (Phase B U9).
385
+ # Bearer API_AUTH_SECRET ingest endpoints — Task wrappers and the
386
+ # EventBridge SFN-state-change rule POST here. routine-step-callback
387
+ # writes routine_step_events; routine-execution-callback updates
388
+ # routine_executions lifecycle status. Idempotent on the dedup index
389
+ # for steps + on the conditional UPDATE for executions.
390
+ "routine-step-callback",
391
+ "routine-execution-callback",
392
+ # Skill-run dispatcher runtime-config fetch (plan
393
+ # docs/plans/2026-04-24-008-feat-skill-run-dispatcher-plan.md §U1). The
394
+ # Strands container's `kind=run_skill` handler calls this with Bearer
395
+ # API_AUTH_SECRET to pull the agent's template + skills + MCP + KBs
396
+ # before building the headless agent turn.
397
+ "agents-runtime-config",
398
+ # Admin-Ops MCP — JSON-RPC endpoint at POST /mcp/admin, exposes the
399
+ # @thinkwork/admin-ops package as MCP tools for Strands agents.
400
+ "admin-ops-mcp",
401
+ # MCP admin key management — per-tenant Bearer tokens for admin-ops.
402
+ # Admin-ops-mcp authenticates incoming tokens by sha256-hash lookup
403
+ # against tenant_mcp_admin_keys, populated by this handler's routes.
404
+ "mcp-admin-keys",
405
+ # One-shot tenant provisioning: mints a tkm_ key + stores in Secrets
406
+ # Manager at thinkwork/<stage>/mcp/<tenantId>/admin-ops + upserts
407
+ # tenant_mcp_servers. SM IAM is already granted on thinkwork/* by
408
+ # aws_iam_role_policy.lambda_secrets in main.tf (Create/Update/Get).
409
+ "mcp-admin-provision",
410
+ # Plugin-installed MCP server admin approval (plan §U11, SI-5). Cognito
411
+ # JWT admin caller → approve/reject. Approve computes url_hash =
412
+ # sha256(canonical(url, auth_config)) and pins it; any subsequent
413
+ # mutation to those fields reverts the row to 'pending'.
414
+ "mcp-approval",
415
+ # Daily sweeper: auto-rejects MCP servers pending > 30 days. Triggered
416
+ # by EventBridge schedule (mcp-approval-sweeper-daily).
417
+ "mcp-approval-sweeper",
418
+ # Plugin upload REST handler (plan §U10). Four routes:
419
+ # POST /api/plugins/presign + /upload, GET /api/plugins (+ /:uploadId).
420
+ # Cognito JWT; admin-role gated. Needs WORKSPACE_BUCKET env for S3.
421
+ "plugin-upload",
422
+ # Finance pilot U2 — thread-attachment upload (presign + finalize).
423
+ # presign issues a 5-min PUT URL the end-user client uses to push
424
+ # Excel/CSV bytes directly to S3; finalize sniffs magic bytes, scans
425
+ # OOXML containers (rejects macros + external links), inserts
426
+ # thread_attachments, and emits attachment.received audit event.
427
+ # Cognito JWT (end-user-facing — NOT admin-gated); tenant pinned via
428
+ # threads.tenant_id lookup. Needs WORKSPACE_BUCKET env for S3.
429
+ "thread-attachments-presign",
430
+ "thread-attachments-finalize",
431
+ # U9-remainder of finance pilot — tenant-pinned download endpoint.
432
+ # GET /api/threads/{tid}/attachments/{aid}/download returns a 302
433
+ # to a 5-minute presigned S3 GET URL with ResponseContentDisposition:
434
+ # attachment so browsers download rather than render inline. Same
435
+ # tenant-pin discipline as presign/finalize.
436
+ "thread-attachment-download",
437
+ # Folder bundle import (fat-folder plan Phase D). Admin uploads a zip
438
+ # or GitHub ref and the handler normalizes vendor folder layouts into
439
+ # the agent workspace.
440
+ "folder-bundle-import",
441
+ # Hourly sweeper: reaps orphan S3 staging from failed / interrupted
442
+ # plugin install sagas + marks matching plugin_uploads rows 'failed'.
443
+ "plugin-staging-sweeper",
444
+ # Resolved Capability Manifest write endpoint (plan §U15). Strands
445
+ # container POSTs one row per agent-session-start. Shared
446
+ # API_AUTH_SECRET bearer (runtime→API; no tenant OAuth).
447
+ "manifest-log",
448
+ # SI-7 catalog-list read endpoint (plan §U15 pt 3/3). Strands
449
+ # container fetches the allowed builtin-tool slug set once per
450
+ # session-start + feature-flag-gated enforcement filter drops
451
+ # catalog-missing tools before Agent(tools=...). Shared
452
+ # API_AUTH_SECRET bearer.
453
+ "capability-catalog-list",
454
+ # Brain v0 narrow write endpoint. Strands calls this with
455
+ # Bearer API_AUTH_SECRET; GraphQL remains user/admin-facing only.
456
+ "brain-agent-write",
457
+ # Phase 3 U4 of the Compliance audit-event log
458
+ # (docs/plans/2026-05-07-004-feat-compliance-u4-outbox-drainer-plan.md).
459
+ # Single-writer drainer with reserved_concurrent_executions=1 (set
460
+ # below). Connects to Aurora as `compliance_drainer` role via the
461
+ # COMPLIANCE_DRAINER_SECRET_ARN env var (compliance secret created in
462
+ # U2). EventBridge rate(1 minute) schedule + DLQ + MaxRetryAttempts=0
463
+ # (defined in dedicated resources below).
464
+ "compliance-outbox-drainer",
465
+ # Phase 3 U6 of the Compliance audit-event log
466
+ # (docs/plans/2026-05-07-007-feat-compliance-u6-strands-emit-path-plan.md).
467
+ # Cross-runtime emit endpoint POST /api/compliance/events — Bearer
468
+ # API_AUTH_SECRET, Strands Python client posts here with a
469
+ # client-supplied UUIDv7 event_id for idempotency. Connects to
470
+ # Aurora via the master DATABASE_SECRET_ARN like every other narrow
471
+ # handler (compliance_writer role is reserved for future hardening).
472
+ "compliance-events",
473
+ # Phase 3 U8b watchdog moved out of for_each into a standalone
474
+ # aws_lambda_function resource (see below). It now uses a sibling
475
+ # IAM role (kms:DescribeKey only on the CMK; s3:ListBucket scoped
476
+ # to anchors/) instead of the shared aws_iam_role.lambda — the
477
+ # widened S3+KMS grant on the shared role would have leaked into
478
+ # 60+ unrelated handlers. Pre-merge step: `terraform state mv`
479
+ # the existing handler["compliance-anchor-watchdog"] address to the
480
+ # new standalone resource (see U8b plan operator-step section).
114
481
  ]) : toset([])
115
482
 
116
483
  function_name = "thinkwork-${var.stage}-api-${each.key}"
117
484
  role = aws_iam_role.lambda.arn
118
485
  handler = "index.handler"
119
486
  runtime = local.runtime
120
- timeout = each.key == "wakeup-processor" ? 300 : each.key == "chat-agent-invoke" ? 300 : 30
121
- memory_size = each.key == "graphql-http" ? 512 : each.key == "wakeup-processor" ? 512 : 256
487
+ # eval-runner walks every test case sequentially, invoking an agent +
488
+ # waiting up to 2 min for spans to propagate per test, so a 10-test run
489
+ # can easily exceed the 30 s default. 900 s covers ~5-15 min sweeps.
490
+ # wiki-bootstrap-import runs a full Hindsight ingest for ~3,000 records;
491
+ # the LLM-backed retain path makes it the longest-running Lambda in the
492
+ # set. 900 s is Lambda's per-invocation max and matches eval-runner's ceiling.
493
+ # routine-task-python wraps a 300s sandbox session and needs headroom
494
+ # for the Start/Invoke/Stop/S3-offload round trip; 360s leaves ~60s
495
+ # for AWS-call setup and offload after the sandbox's own ceiling.
496
+ timeout = each.key == "wakeup-processor" ? 300 : each.key == "chat-agent-invoke" ? 300 : each.key == "workspace-event-dispatcher" ? 60 : each.key == "eval-runner" ? 900 : each.key == "eval-worker" ? 240 : each.key == "wiki-compile" ? 480 : each.key == "wiki-lint" ? 300 : each.key == "wiki-export" ? 600 : each.key == "wiki-bootstrap-import" ? 900 : each.key == "folder-bundle-import" ? 300 : each.key == "routine-task-python" ? 360 : 30
497
+ memory_size = each.key == "graphql-http" ? 512 : each.key == "wakeup-processor" ? 512 : each.key == "workspace-event-dispatcher" ? 512 : each.key == "eval-runner" ? 512 : each.key == "eval-worker" ? 512 : each.key == "wiki-compile" ? 1024 : each.key == "wiki-export" ? 1024 : each.key == "wiki-bootstrap-import" ? 1024 : each.key == "folder-bundle-import" ? 1024 : 256
122
498
 
123
499
  filename = "${var.lambda_zips_dir}/${each.key}.zip"
124
500
  source_code_hash = filebase64sha256("${var.lambda_zips_dir}/${each.key}.zip")
125
501
 
502
+ # Per-handler reserved concurrency. compliance-outbox-drainer is a
503
+ # single-writer (per-tenant hash chain integrity depends on it — two
504
+ # concurrent drainers would race the chain head SELECT and produce
505
+ # orphan prev_hash links). eval-worker is capped at 20. All other handlers
506
+ # use -1, i.e. the unreserved account-level concurrency pool.
507
+ reserved_concurrent_executions = each.key == "compliance-outbox-drainer" ? 1 : each.key == "eval-worker" ? 20 : -1
508
+
126
509
  environment {
127
510
  variables = merge(
128
- local.common_env,
511
+ contains(["computer-manager", "computer-runtime-reconciler"], each.key) ? local.computer_runtime_control_base_env : local.common_env,
129
512
  { FUNCTION_NAME = each.key },
130
513
  lookup(local.handler_extra_env, each.key, {}),
131
514
  )
@@ -137,6 +520,161 @@ resource "aws_lambda_function" "handler" {
137
520
  }
138
521
  }
139
522
 
523
+ # ---------------------------------------------------------------------------
524
+ # wiki-compile async retry config + DLQ
525
+ # ---------------------------------------------------------------------------
526
+ #
527
+ # AWS Lambda's default async invoke retries the function 2 times (after
528
+ # ~1 and ~2 minutes) before sending failures to a DLQ (or dropping). For
529
+ # wiki-compile, retries duplicate Bedrock cost AND can produce duplicate
530
+ # user-visible threads + workspace_runs (the brain-enrichment draft path
531
+ # in particular — see plan 2026-05-01-002 U5/U6 and
532
+ # docs/solutions/architecture-patterns/async-retry-idempotency-lessons).
533
+ #
534
+ # Pin retries to 0 and route failures to a dedicated DLQ. The runner's
535
+ # job-status short-circuit (running/succeeded/failed/skipped) is the
536
+ # in-process protection against duplicate writebacks; this is the
537
+ # infrastructure-level belt-and-suspenders.
538
+
539
+ resource "aws_sqs_queue" "wiki_compile_dlq" {
540
+ count = local.use_local_zips ? 1 : 0
541
+ name = "thinkwork-${var.stage}-wiki-compile-dlq"
542
+ message_retention_seconds = 1209600 # 14 days
543
+
544
+ tags = {
545
+ Name = "thinkwork-${var.stage}-wiki-compile-dlq"
546
+ }
547
+ }
548
+
549
+ resource "aws_iam_role_policy" "wiki_compile_dlq_send" {
550
+ count = local.use_local_zips ? 1 : 0
551
+ name = "thinkwork-${var.stage}-wiki-compile-dlq-send"
552
+ role = aws_iam_role.lambda.id
553
+
554
+ policy = jsonencode({
555
+ Version = "2012-10-17"
556
+ Statement = [{
557
+ Effect = "Allow"
558
+ Action = ["sqs:SendMessage"]
559
+ Resource = aws_sqs_queue.wiki_compile_dlq[0].arn
560
+ }]
561
+ })
562
+ }
563
+
564
+ resource "aws_lambda_function_event_invoke_config" "wiki_compile" {
565
+ count = local.use_local_zips ? 1 : 0
566
+ function_name = aws_lambda_function.handler["wiki-compile"].function_name
567
+ maximum_retry_attempts = 0
568
+ maximum_event_age_in_seconds = 3600
569
+
570
+ destination_config {
571
+ on_failure {
572
+ destination = aws_sqs_queue.wiki_compile_dlq[0].arn
573
+ }
574
+ }
575
+ }
576
+
577
+ # Phase B U8: SFN's inbox_approval Task invokes routine-approval-callback
578
+ # directly via .waitForTaskToken. Lambda's default async-retry policy
579
+ # (2 attempts) is incompatible with the callback's two-insert flow —
580
+ # even though the inserts are now wrapped in db.transaction(), AWS
581
+ # Lambda's own retry-after-error semantics multiply with SFN's task
582
+ # Retry policy and create thundering-herd attempts on transient
583
+ # failures. SFN is the canonical retry path; Lambda async retries are
584
+ # off. Per project_async_retry_idempotency_lessons.
585
+ resource "aws_lambda_function_event_invoke_config" "routine_approval_callback" {
586
+ count = local.use_local_zips ? 1 : 0
587
+ function_name = aws_lambda_function.handler["routine-approval-callback"].function_name
588
+ maximum_retry_attempts = 0
589
+ maximum_event_age_in_seconds = 3600
590
+ }
591
+
592
+ # Per-turn auto-retain: the runtime (Strands + Flue) Event-invokes
593
+ # memory-retain after every chat turn. AWS Lambda's default async-retry
594
+ # policy is 2 attempts; without overriding it, a transient failure on the
595
+ # canonical-transcript fetch or adapter write retries the entire writeback
596
+ # and can multi-write the same per-turn document into Hindsight. The
597
+ # longest-suffix-prefix merge in memory-retain.ts dedupes content but the
598
+ # retain-cost path (Bedrock tokens charged in adapter.retainConversation)
599
+ # is NOT idempotent — retries multiply LLM cost. Per
600
+ # project_async_retry_idempotency_lessons.
601
+ resource "aws_lambda_function_event_invoke_config" "memory_retain" {
602
+ count = local.use_local_zips ? 1 : 0
603
+ function_name = aws_lambda_function.handler["memory-retain"].function_name
604
+ maximum_retry_attempts = 0
605
+ maximum_event_age_in_seconds = 3600
606
+ }
607
+
608
+ # ---------------------------------------------------------------------------
609
+ # Phase 3 U4: compliance-outbox-drainer DLQ + async retry config
610
+ #
611
+ # AWS Lambda's default async-retry policy is 2 attempts. The drainer's
612
+ # INSERT ... ON CONFLICT (outbox_id) DO NOTHING makes per-row replay
613
+ # safe, but reserved-concurrency=1 + retry-0 is the architectural
614
+ # guarantee that we never have two drainers racing the chain head.
615
+ # Per project_async_retry_idempotency_lessons.
616
+ # ---------------------------------------------------------------------------
617
+
618
+ resource "aws_sqs_queue" "compliance_drainer_dlq" {
619
+ count = local.use_local_zips ? 1 : 0
620
+ name = "thinkwork-${var.stage}-compliance-drainer-dlq"
621
+ message_retention_seconds = 1209600 # 14 days
622
+
623
+ tags = {
624
+ Name = "thinkwork-${var.stage}-compliance-drainer-dlq"
625
+ }
626
+ }
627
+
628
+ resource "aws_iam_role_policy" "compliance_drainer_dlq_send" {
629
+ count = local.use_local_zips ? 1 : 0
630
+ name = "compliance-drainer-dlq-send"
631
+ role = aws_iam_role.lambda.id
632
+
633
+ policy = jsonencode({
634
+ Version = "2012-10-17"
635
+ Statement = [{
636
+ Effect = "Allow"
637
+ Action = ["sqs:SendMessage"]
638
+ Resource = aws_sqs_queue.compliance_drainer_dlq[0].arn
639
+ }]
640
+ })
641
+ }
642
+
643
+ resource "aws_lambda_function_event_invoke_config" "compliance_outbox_drainer" {
644
+ count = local.use_local_zips ? 1 : 0
645
+ function_name = aws_lambda_function.handler["compliance-outbox-drainer"].function_name
646
+ maximum_retry_attempts = 0
647
+ maximum_event_age_in_seconds = 3600
648
+
649
+ destination_config {
650
+ on_failure {
651
+ destination = aws_sqs_queue.compliance_drainer_dlq[0].arn
652
+ }
653
+ }
654
+ }
655
+
656
+ # ---------------------------------------------------------------------------
657
+ # Phase 3 U4: compliance-outbox-drainer EventBridge schedule (every 1 min)
658
+ # ---------------------------------------------------------------------------
659
+
660
+ resource "aws_scheduler_schedule" "compliance_outbox_drainer" {
661
+ count = local.use_local_zips ? 1 : 0
662
+
663
+ name = "thinkwork-${var.stage}-compliance-outbox-drainer"
664
+ group_name = "default"
665
+ schedule_expression = "rate(1 minutes)"
666
+ state = "ENABLED"
667
+
668
+ flexible_time_window {
669
+ mode = "OFF"
670
+ }
671
+
672
+ target {
673
+ arn = aws_lambda_function.handler["compliance-outbox-drainer"].arn
674
+ role_arn = aws_iam_role.scheduler.arn
675
+ }
676
+ }
677
+
140
678
  # ---------------------------------------------------------------------------
141
679
  # API Gateway routes → Lambda integrations
142
680
  # ---------------------------------------------------------------------------
@@ -179,10 +717,35 @@ locals {
179
717
  "ANY /api/invites/{proxy+}" = "invites"
180
718
  "ANY /api/invites" = "invites"
181
719
 
720
+ # Compliance audit-event emit (Phase 3 U6) — narrow Bearer
721
+ # API_AUTH_SECRET endpoint, Strands Python client posts here.
722
+ "POST /api/compliance/events" = "compliance-events"
723
+
182
724
  # Skills
183
725
  "ANY /api/skills/{proxy+}" = "skills"
184
726
  "ANY /api/skills" = "skills"
185
727
 
728
+ # User Memory MCP OAuth/resource-server unblocker. These endpoints are
729
+ # enough for `codex mcp login thinkwork-user-memory-dev` to discover OAuth,
730
+ # register as a public PKCE client, sign the user in through Cognito, and
731
+ # receive a bearer token for the User Memory MCP resource.
732
+ "GET /.well-known/oauth-protected-resource" = "mcp-oauth"
733
+ "GET /.well-known/oauth-protected-resource/{proxy+}" = "mcp-oauth"
734
+ "GET /.well-known/oauth-authorization-server" = "mcp-oauth"
735
+ "GET /.well-known/openid-configuration" = "mcp-oauth"
736
+ "GET /mcp/oauth/jwks" = "mcp-oauth"
737
+ "POST /mcp/oauth/register" = "mcp-oauth"
738
+ "GET /mcp/oauth/authorize" = "mcp-oauth"
739
+ "GET /mcp/oauth/callback" = "mcp-oauth"
740
+ "POST /mcp/oauth/token" = "mcp-oauth"
741
+ "POST /mcp/oauth/revoke" = "mcp-oauth"
742
+ "ANY /mcp/user-memory" = "mcp-user-memory"
743
+ "ANY /mcp/context-engine" = "mcp-context-engine"
744
+
745
+ # Brain v0 service-auth writeback.
746
+ "POST /api/brain/agent-write" = "brain-agent-write"
747
+ "OPTIONS /api/brain/agent-write" = "brain-agent-write"
748
+
186
749
  # Activity
187
750
  "ANY /api/activity/{proxy+}" = "activity"
188
751
  "ANY /api/activity" = "activity"
@@ -193,6 +756,20 @@ locals {
193
756
  "GET /api/oauth/authorize" = "oauth-authorize"
194
757
  "GET /api/oauth/callback" = "oauth-callback"
195
758
 
759
+ # Stripe billing (unauthenticated — checkout is pre-signup; webhook is
760
+ # server-to-server with Stripe signature verification).
761
+ "POST /api/stripe/checkout-session" = "stripe-checkout"
762
+ "OPTIONS /api/stripe/checkout-session" = "stripe-checkout"
763
+ "POST /api/stripe/webhook" = "stripe-webhook"
764
+ "POST /api/stripe/portal-session" = "stripe-portal"
765
+ "OPTIONS /api/stripe/portal-session" = "stripe-portal"
766
+ "GET /api/stripe/subscription" = "stripe-subscription"
767
+ "OPTIONS /api/stripe/subscription" = "stripe-subscription"
768
+ "GET /api/auth/me" = "auth-me"
769
+ "OPTIONS /api/auth/me" = "auth-me"
770
+ "ANY /api/extensions/{extensionId}" = "extension-proxy"
771
+ "ANY /api/extensions/{extensionId}/{proxy+}" = "extension-proxy"
772
+
196
773
  # Routines
197
774
  "ANY /api/routines/{proxy+}" = "routines"
198
775
  "ANY /api/routines" = "routines"
@@ -215,26 +792,43 @@ locals {
215
792
  "ANY /api/job-schedules/{proxy+}" = "job-schedule-manager"
216
793
  "ANY /api/job-schedules" = "job-schedule-manager"
217
794
 
218
- # Webhooks (public trigger)
795
+ # Integration webhooks (Unit 8 — composable-skills). Each integration
796
+ # has its own Lambda + a specific route under /webhooks/{integration}/
797
+ # {tenantId}. Specific routes take precedence over the {proxy+}
798
+ # catch-all below, which still owns the legacy PRD-19 webhook-token
799
+ # surface.
800
+ "POST /webhooks/crm-opportunity/{tenantId}" = "webhook-crm-opportunity"
801
+ "POST /webhooks/task-event/{tenantId}" = "webhook-task-event"
802
+
803
+ # Webhooks (public trigger) — legacy PRD-19 tokenized webhooks.
219
804
  "POST /webhooks/{proxy+}" = "webhooks"
220
805
 
221
806
  # Webhooks admin
222
807
  "ANY /api/webhooks/{proxy+}" = "webhooks-admin"
223
808
  "ANY /api/webhooks" = "webhooks-admin"
224
809
 
225
- # Task Connectors admin
226
- "ANY /api/task-connectors/{proxy+}" = "task-connectors"
227
- "ANY /api/task-connectors" = "task-connectors"
228
-
229
810
  # Workspace files
230
811
  "ANY /api/workspaces/{proxy+}" = "workspace-files"
231
812
 
813
+ # Phase-one Computer migration. Service-auth only; operator tooling calls
814
+ # dry-run first and apply only after conflict review.
815
+ "POST /api/migrations/agents-to-computers" = "migrate-agents-to-computers"
816
+ "OPTIONS /api/migrations/agents-to-computers" = "migrate-agents-to-computers"
817
+
232
818
  # Knowledge bases
233
819
  "ANY /api/knowledge-bases/{proxy+}" = "knowledge-base-files"
234
820
 
235
821
  # Email
236
822
  "POST /api/email/send" = "email-send"
237
823
 
824
+ # Slack workspace app ingress. These unauthenticated public endpoints
825
+ # verify Slack signatures in handler code before any tenant work happens.
826
+ "POST /slack/events" = "slack-events"
827
+ "POST /slack/slash-command" = "slack-slash-command"
828
+ "POST /slack/interactivity" = "slack-interactivity"
829
+ "GET /slack/oauth/install" = "slack-oauth-install"
830
+ "POST /slack/oauth/install" = "slack-oauth-install"
831
+
238
832
  # Memory
239
833
  "ANY /api/memory/{proxy+}" = "memory"
240
834
 
@@ -247,6 +841,121 @@ locals {
247
841
  # GitHub App
248
842
  "ANY /api/github-app/{proxy+}" = "github-app"
249
843
  "POST /api/github/webhook" = "github-app"
844
+
845
+ # AgentCore Code Sandbox (plan Unit 10 + Unit 11). Strands container
846
+ # calls both with Bearer API_AUTH_SECRET before + after every
847
+ # executeCode. 429 on quota denial, 201 on audit-row insert.
848
+ "POST /api/sandbox/quota/check-and-increment" = "sandbox-quota-check"
849
+ "POST /api/sandbox/invocations" = "sandbox-invocation-log"
850
+
851
+ # Routines ASL validator (plan 2026-05-01-004 §U5). Bearer
852
+ # API_AUTH_SECRET. Chat builder + publish flow POST the candidate
853
+ # ASL document; returns { valid, errors, warnings }.
854
+ "POST /api/routines/validate" = "routine-asl-validator"
855
+ "OPTIONS /api/routines/validate" = "routine-asl-validator"
856
+
857
+ # Routines step-event ingest (plan 2026-05-01-005 §U9). Task wrappers
858
+ # (routine-task-python, routine-resume) POST per-step status
859
+ # transitions; the EventBridge rule in routines-stepfunctions/main.tf
860
+ # POSTs SFN execution-state-change events here for the agent_invoke
861
+ # recipe path (no wrapper Lambda). Bearer API_AUTH_SECRET. Idempotent
862
+ # via partial unique index on (execution_id, node_id, status,
863
+ # started_at) — see migration 0056.
864
+ "POST /api/routines/step" = "routine-step-callback"
865
+ "OPTIONS /api/routines/step" = "routine-step-callback"
866
+ "POST /api/routines/execution" = "routine-execution-callback"
867
+ "OPTIONS /api/routines/execution" = "routine-execution-callback"
868
+
869
+ # Skill-run dispatcher runtime-config fetch. Service-auth GET.
870
+ "GET /api/agents/runtime-config" = "agents-runtime-config"
871
+
872
+ # ThinkWork Computer runtime callback API. ECS tasks call outbound with
873
+ # Bearer API_AUTH_SECRET to fetch config, heartbeat, claim one task, append
874
+ # product/audit events, and complete/fail tasks.
875
+ "ANY /api/computers/runtime/{proxy+}" = "computer-runtime"
876
+
877
+ # Admin Terminal tab — opens an ECS Exec session into the running
878
+ # Computer task and returns {sessionId, streamUrl, tokenValue} to the
879
+ # browser, which then connects WebSocket directly to ssmmessages.
880
+ "POST /api/computers/{computerId}/terminal/start" = "computer-terminal-start"
881
+ "OPTIONS /api/computers/{computerId}/terminal/start" = "computer-terminal-start"
882
+
883
+ # ThinkWork Computer manager API. Internal service-auth endpoint used by
884
+ # admin operations to reconcile per-Computer ECS service desired state.
885
+ "POST /api/computers/manager" = "computer-manager"
886
+ "OPTIONS /api/computers/manager" = "computer-manager"
887
+
888
+ # Admin-Ops MCP server — single JSON-RPC endpoint. Strands agents
889
+ # (and anyone else) POST with Bearer <tenant-scoped token> issued by
890
+ # the mcp-admin-keys handler below. The shared API_AUTH_SECRET is
891
+ # retained as a break-glass superuser path for bootstrap/debug.
892
+ "POST /mcp/admin" = "admin-ops-mcp"
893
+
894
+ # MCP admin key management — per-tenant Bearer token CRUD. Tokens
895
+ # are shown ONCE at creation (POST returns raw value); server stores
896
+ # sha256 hash only. These specific routes take precedence over the
897
+ # existing `ANY /api/tenants/{proxy+}` route (tenants handler) per
898
+ # API Gateway v2's most-specific-match rule.
899
+ "POST /api/tenants/{tenantId}/mcp-admin-keys" = "mcp-admin-keys"
900
+ "GET /api/tenants/{tenantId}/mcp-admin-keys" = "mcp-admin-keys"
901
+ "DELETE /api/tenants/{tenantId}/mcp-admin-keys/{keyId}" = "mcp-admin-keys"
902
+
903
+ # One-shot tenant provisioning for the admin-ops MCP. Mints a fresh
904
+ # tkm_ key + stores it in Secrets Manager at
905
+ # thinkwork/<stage>/mcp/<tenantId>/admin-ops + upserts the
906
+ # tenant_mcp_servers row so the runtime picks the server up for
907
+ # any agent that gets it assigned via agent_mcp_servers.
908
+ "POST /api/tenants/{tenantId}/mcp-admin-provision" = "mcp-admin-provision"
909
+
910
+ # MCP server admin approval (plan §U11, SI-5). Plugin-uploaded MCP
911
+ # servers land with status='pending'; these routes flip them to
912
+ # approved/rejected. Cognito JWT only (mcp-approval handler rejects
913
+ # apikey callers) — the admin SPA is the sole UI surface.
914
+ "POST /api/tenants/{tenantId}/mcp-servers/{serverId}/approve" = "mcp-approval"
915
+ "OPTIONS /api/tenants/{tenantId}/mcp-servers/{serverId}/approve" = "mcp-approval"
916
+ "POST /api/tenants/{tenantId}/mcp-servers/{serverId}/reject" = "mcp-approval"
917
+ "OPTIONS /api/tenants/{tenantId}/mcp-servers/{serverId}/reject" = "mcp-approval"
918
+
919
+ # Plugin upload admin surface (plan §U10). Admin SPA drives the full
920
+ # flow: POST /presign → browser PUT to presigned S3 URL → POST /upload
921
+ # (validator + three-phase install saga). GET routes back the admin's
922
+ # plugin history view. handleCors() short-circuits OPTIONS before auth
923
+ # — required for the browser to preflight successfully.
924
+ "POST /api/plugins/presign" = "plugin-upload"
925
+ "OPTIONS /api/plugins/presign" = "plugin-upload"
926
+ "POST /api/plugins/upload" = "plugin-upload"
927
+ "OPTIONS /api/plugins/upload" = "plugin-upload"
928
+ "GET /api/plugins" = "plugin-upload"
929
+ "OPTIONS /api/plugins" = "plugin-upload"
930
+ "GET /api/plugins/{uploadId}" = "plugin-upload"
931
+ "OPTIONS /api/plugins/{uploadId}" = "plugin-upload"
932
+
933
+ # Finance pilot U2 — thread-attachment upload (presign + finalize).
934
+ # Cognito JWT; tenant pinned via threads.tenant_id lookup. OPTIONS
935
+ # is handled inside the Lambda before auth.
936
+ "POST /api/threads/{threadId}/attachments/presign" = "thread-attachments-presign"
937
+ "OPTIONS /api/threads/{threadId}/attachments/presign" = "thread-attachments-presign"
938
+ "POST /api/threads/{threadId}/attachments/finalize" = "thread-attachments-finalize"
939
+ "OPTIONS /api/threads/{threadId}/attachments/finalize" = "thread-attachments-finalize"
940
+
941
+ # U9-remainder of finance pilot — tenant-pinned download endpoint.
942
+ "GET /api/threads/{threadId}/attachments/{attachmentId}/download" = "thread-attachment-download"
943
+ "OPTIONS /api/threads/{threadId}/attachments/{attachmentId}/download" = "thread-attachment-download"
944
+
945
+ # Fat-folder bundle import. OPTIONS is handled inside the Lambda before auth.
946
+ "POST /api/agents/{agentId}/import-bundle" = "folder-bundle-import"
947
+ "OPTIONS /api/agents/{agentId}/import-bundle" = "folder-bundle-import"
948
+
949
+ # Resolved Capability Manifest write endpoint (plan §U15). Strands
950
+ # container posts one row per agent-session-start. Shared
951
+ # API_AUTH_SECRET; no tenant OAuth.
952
+ "POST /api/runtime/manifests" = "manifest-log"
953
+ "OPTIONS /api/runtime/manifests" = "manifest-log"
954
+
955
+ # SI-7 catalog-list read (plan §U15 pt 3/3). Strands container fetches
956
+ # the allowed slug set once per session-start. Shared API_AUTH_SECRET.
957
+ "GET /api/runtime/capability-catalog" = "capability-catalog-list"
958
+ "OPTIONS /api/runtime/capability-catalog" = "capability-catalog-list"
250
959
  } : {}
251
960
  }
252
961
 
@@ -321,15 +1030,300 @@ resource "aws_scheduler_schedule" "webhook_deliveries_cleanup" {
321
1030
  }
322
1031
  }
323
1032
 
1033
+ # ---------------------------------------------------------------------------
1034
+ # Plugin staging sweeper — hourly orphan-S3 cleanup for interrupted install
1035
+ # sagas (plan §U10). WORKSPACE_BUCKET env on the Lambda role already grants
1036
+ # the list+delete IAM; this schedule is the hourly trigger. The sweeper's
1037
+ # own cutoff constant (60 min) is independent of this cron cadence.
1038
+ # ---------------------------------------------------------------------------
1039
+
1040
+ resource "aws_scheduler_schedule" "plugin_staging_sweeper" {
1041
+ count = local.use_local_zips ? 1 : 0
1042
+
1043
+ name = "thinkwork-${var.stage}-plugin-staging-sweeper"
1044
+ group_name = "default"
1045
+ schedule_expression = "rate(1 hour)"
1046
+ state = "ENABLED"
1047
+
1048
+ flexible_time_window {
1049
+ mode = "OFF"
1050
+ }
1051
+
1052
+ target {
1053
+ arn = aws_lambda_function.handler["plugin-staging-sweeper"].arn
1054
+ role_arn = aws_iam_role.scheduler.arn
1055
+ }
1056
+ }
1057
+
1058
+ # ---------------------------------------------------------------------------
1059
+ # MCP approval TTL sweeper — daily auto-reject of pending rows > 30 days old
1060
+ # (plan §U11). A plugin whose MCP sat uncurated for a month is stale: clear
1061
+ # pending to keep the admin queue honest and surface the reject action in
1062
+ # the audit log.
1063
+ # ---------------------------------------------------------------------------
1064
+
1065
+ resource "aws_scheduler_schedule" "mcp_approval_sweeper" {
1066
+ count = local.use_local_zips ? 1 : 0
1067
+
1068
+ name = "thinkwork-${var.stage}-mcp-approval-sweeper"
1069
+ group_name = "default"
1070
+ schedule_expression = "cron(15 4 * * ? *)" # daily at 04:15 UTC (offset from webhook cleanup)
1071
+ state = "ENABLED"
1072
+
1073
+ flexible_time_window {
1074
+ mode = "OFF"
1075
+ }
1076
+
1077
+ target {
1078
+ arn = aws_lambda_function.handler["mcp-approval-sweeper"].arn
1079
+ role_arn = aws_iam_role.scheduler.arn
1080
+ }
1081
+ }
1082
+
1083
+ # ---------------------------------------------------------------------------
1084
+ # skill_runs reconciler — transitions stuck-running rows to failed every 5 min.
1085
+ # Guards against agentcore Lambda crashes / OOMs that drop the
1086
+ # /api/skills/complete writeback and leave the row at 'running' forever,
1087
+ # which in turn blocks the dedup partial unique index from letting retries
1088
+ # through.
1089
+ # ---------------------------------------------------------------------------
1090
+
1091
+ resource "aws_scheduler_schedule" "skill_runs_reconciler" {
1092
+ count = local.use_local_zips ? 1 : 0
1093
+
1094
+ name = "thinkwork-${var.stage}-skill-runs-reconciler"
1095
+ group_name = "default"
1096
+ schedule_expression = "rate(5 minutes)"
1097
+ state = "ENABLED"
1098
+
1099
+ flexible_time_window {
1100
+ mode = "OFF"
1101
+ }
1102
+
1103
+ target {
1104
+ arn = aws_lambda_function.handler["skill-runs-reconciler"].arn
1105
+ role_arn = aws_iam_role.scheduler.arn
1106
+ }
1107
+ }
1108
+
1109
+ # ---------------------------------------------------------------------------
1110
+ # eval_runs reconciler — finalizes stuck-running eval runs every 5 min.
1111
+ # Guards against worker crashes/timeouts that occur before a per-case result
1112
+ # row is written. Missing category-selected cases are recorded as error rows,
1113
+ # then the run is finalized so the Admin UI cannot remain "running" forever.
1114
+ # ---------------------------------------------------------------------------
1115
+
1116
+ resource "aws_scheduler_schedule" "eval_runs_reconciler" {
1117
+ count = local.use_local_zips ? 1 : 0
1118
+
1119
+ name = "thinkwork-${var.stage}-eval-runs-reconciler"
1120
+ group_name = "default"
1121
+ schedule_expression = "rate(5 minutes)"
1122
+ state = "ENABLED"
1123
+
1124
+ flexible_time_window {
1125
+ mode = "OFF"
1126
+ }
1127
+
1128
+ target {
1129
+ arn = aws_lambda_function.handler["eval-runs-reconciler"].arn
1130
+ role_arn = aws_iam_role.scheduler.arn
1131
+ }
1132
+ }
1133
+
1134
+ # ---------------------------------------------------------------------------
1135
+ # Stall monitor — marks stalled thread turns and runbook steps failed every
1136
+ # minute. This is the global backstop for agent/runtime crashes; the Computer
1137
+ # heartbeat also reconciles its own stale runbook tasks while it is alive.
1138
+ # ---------------------------------------------------------------------------
1139
+
1140
+ resource "aws_scheduler_schedule" "stall_monitor" {
1141
+ count = local.use_local_zips ? 1 : 0
1142
+
1143
+ name = "thinkwork-${var.stage}-stall-monitor"
1144
+ group_name = "default"
1145
+ schedule_expression = "rate(1 minutes)"
1146
+ state = "ENABLED"
1147
+
1148
+ flexible_time_window {
1149
+ mode = "OFF"
1150
+ }
1151
+
1152
+ target {
1153
+ arn = aws_lambda_function.handler["cron-stall-monitor"].arn
1154
+ role_arn = aws_iam_role.scheduler.arn
1155
+ }
1156
+ }
1157
+
1158
+ # ---------------------------------------------------------------------------
1159
+ # ThinkWork Computer runtime reconciler — keeps active Computers aligned with
1160
+ # desired_runtime_status by provisioning/starting/stopping ECS services in
1161
+ # bounded batches. The handler is conservative and records per-Computer events
1162
+ # for every attempted action.
1163
+ # ---------------------------------------------------------------------------
1164
+
1165
+ resource "aws_scheduler_schedule" "computer_runtime_reconciler" {
1166
+ count = local.use_local_zips ? 1 : 0
1167
+
1168
+ name = "thinkwork-${var.stage}-computer-runtime-reconciler"
1169
+ group_name = "default"
1170
+ schedule_expression = "rate(5 minutes)"
1171
+ state = "ENABLED"
1172
+
1173
+ flexible_time_window {
1174
+ mode = "OFF"
1175
+ }
1176
+
1177
+ target {
1178
+ arn = aws_lambda_function.handler["computer-runtime-reconciler"].arn
1179
+ role_arn = aws_iam_role.scheduler.arn
1180
+ }
1181
+ }
1182
+
1183
+ resource "aws_scheduler_schedule" "slack_dispatch" {
1184
+ count = local.use_local_zips ? 1 : 0
1185
+
1186
+ name = "thinkwork-${var.stage}-slack-dispatch"
1187
+ group_name = "default"
1188
+ schedule_expression = "rate(1 minute)"
1189
+ state = "ENABLED"
1190
+
1191
+ flexible_time_window {
1192
+ mode = "OFF"
1193
+ }
1194
+
1195
+ target {
1196
+ arn = aws_lambda_function.handler["slack-dispatch"].arn
1197
+ role_arn = aws_iam_role.scheduler.arn
1198
+ input = jsonencode({ limit = 25 })
1199
+ }
1200
+ }
1201
+
1202
+ # ---------------------------------------------------------------------------
1203
+ # Compounding Memory — wiki-compile drainer (every minute) + nightly lint and export
1204
+ # ---------------------------------------------------------------------------
1205
+
1206
+ resource "aws_scheduler_schedule" "wiki_compile_drainer" {
1207
+ count = local.use_local_zips ? 1 : 0
1208
+
1209
+ name = "thinkwork-${var.stage}-wiki-compile-drainer"
1210
+ group_name = "default"
1211
+ schedule_expression = "rate(1 minutes)"
1212
+ state = "ENABLED"
1213
+
1214
+ flexible_time_window {
1215
+ mode = "OFF"
1216
+ }
1217
+
1218
+ target {
1219
+ arn = aws_lambda_function.handler["wiki-compile"].arn
1220
+ role_arn = aws_iam_role.scheduler.arn
1221
+ }
1222
+ }
1223
+
1224
+ resource "aws_scheduler_schedule" "wiki_lint" {
1225
+ count = local.use_local_zips ? 1 : 0
1226
+
1227
+ name = "thinkwork-${var.stage}-wiki-lint"
1228
+ group_name = "default"
1229
+ schedule_expression = "cron(0 2 * * ? *)" # daily at 02:00 UTC
1230
+ state = "ENABLED"
1231
+
1232
+ flexible_time_window {
1233
+ mode = "OFF"
1234
+ }
1235
+
1236
+ target {
1237
+ arn = aws_lambda_function.handler["wiki-lint"].arn
1238
+ role_arn = aws_iam_role.scheduler.arn
1239
+ }
1240
+ }
1241
+
1242
+ resource "aws_scheduler_schedule" "wiki_export" {
1243
+ count = local.use_local_zips ? 1 : 0
1244
+
1245
+ name = "thinkwork-${var.stage}-wiki-export"
1246
+ group_name = "default"
1247
+ schedule_expression = "cron(0 3 * * ? *)" # daily at 03:00 UTC (after lint)
1248
+ state = "ENABLED"
1249
+
1250
+ flexible_time_window {
1251
+ mode = "OFF"
1252
+ }
1253
+
1254
+ target {
1255
+ arn = aws_lambda_function.handler["wiki-export"].arn
1256
+ role_arn = aws_iam_role.scheduler.arn
1257
+ }
1258
+ }
1259
+
1260
+ # S3 bucket for markdown vault exports. One bundle per (tenant, owner, date).
1261
+ # Retention is handled by the lifecycle rule below (30 days).
1262
+ resource "aws_s3_bucket" "wiki_exports" {
1263
+ bucket = "thinkwork-${var.stage}-wiki-exports"
1264
+ force_destroy = var.stage == "dev"
1265
+
1266
+ tags = {
1267
+ Name = "thinkwork-${var.stage}-wiki-exports"
1268
+ }
1269
+ }
1270
+
1271
+ resource "aws_s3_bucket_public_access_block" "wiki_exports" {
1272
+ bucket = aws_s3_bucket.wiki_exports.id
1273
+ block_public_acls = true
1274
+ block_public_policy = true
1275
+ ignore_public_acls = true
1276
+ restrict_public_buckets = true
1277
+ }
1278
+
1279
+ resource "aws_s3_bucket_lifecycle_configuration" "wiki_exports" {
1280
+ bucket = aws_s3_bucket.wiki_exports.id
1281
+
1282
+ rule {
1283
+ id = "expire-old-bundles"
1284
+ status = "Enabled"
1285
+
1286
+ filter {}
1287
+
1288
+ expiration {
1289
+ days = 30
1290
+ }
1291
+ }
1292
+ }
1293
+
1294
+ resource "aws_iam_role_policy" "lambda_wiki_exports_s3" {
1295
+ name = "wiki-exports-s3"
1296
+ role = aws_iam_role.lambda.id
1297
+
1298
+ policy = jsonencode({
1299
+ Version = "2012-10-17"
1300
+ Statement = [{
1301
+ Effect = "Allow"
1302
+ Action = ["s3:PutObject", "s3:AbortMultipartUpload"]
1303
+ Resource = "${aws_s3_bucket.wiki_exports.arn}/*"
1304
+ }]
1305
+ })
1306
+ }
1307
+
324
1308
  resource "aws_iam_role" "scheduler" {
325
1309
  name = "thinkwork-${var.stage}-scheduler-role"
326
1310
 
1311
+ # Phase 3 U8a — `aws:SourceAccount` confused-deputy guard. Without
1312
+ # this condition, a foreign-account principal who learns the role ARN
1313
+ # could potentially construct cross-account Scheduler events. The
1314
+ # guard applies to ALL handlers the scheduler invokes; defense-in-depth
1315
+ # alongside per-Lambda `aws:SourceArn` pins like the U7 anchor role.
327
1316
  assume_role_policy = jsonencode({
328
1317
  Version = "2012-10-17"
329
1318
  Statement = [{
330
1319
  Effect = "Allow"
331
1320
  Principal = { Service = "scheduler.amazonaws.com" }
332
1321
  Action = "sts:AssumeRole"
1322
+ Condition = {
1323
+ StringEquals = {
1324
+ "aws:SourceAccount" = var.account_id
1325
+ }
1326
+ }
333
1327
  }]
334
1328
  })
335
1329
  }
@@ -341,9 +1335,22 @@ resource "aws_iam_role_policy" "scheduler_invoke" {
341
1335
  policy = jsonencode({
342
1336
  Version = "2012-10-17"
343
1337
  Statement = [{
344
- Effect = "Allow"
345
- Action = ["lambda:InvokeFunction"]
346
- Resource = local.use_local_zips ? [for k, v in aws_lambda_function.handler : v.arn] : []
1338
+ Effect = "Allow"
1339
+ Action = ["lambda:InvokeFunction"]
1340
+ # Includes every for_each handler PLUS the standalone Phase 3 U8a
1341
+ # anchor Lambda (which is intentionally outside the for_each set
1342
+ # because it uses the U7 IAM role, not the shared aws_iam_role.lambda).
1343
+ # Splat (`[*]`) expansion handles count=0 cleanly when local.use_local_zips
1344
+ # is false; an indexed reference (`[0].arn`) would throw on graph eval.
1345
+ # Phase 3 U8b — watchdog moved to a standalone resource, so its ARN
1346
+ # must be added to the concat() list explicitly (SEC-U8B-005). It
1347
+ # uses the same `[*].arn` splat as the anchor Lambda so count = 0
1348
+ # (local.use_local_zips false) does not break graph eval.
1349
+ Resource = local.use_local_zips ? concat(
1350
+ [for k, v in aws_lambda_function.handler : v.arn],
1351
+ aws_lambda_function.compliance_anchor[*].arn,
1352
+ aws_lambda_function.compliance_anchor_watchdog[*].arn,
1353
+ ) : []
347
1354
  }]
348
1355
  })
349
1356
  }
@@ -352,15 +1359,523 @@ resource "aws_iam_role_policy" "scheduler_invoke" {
352
1359
  # SSM Parameters — Lambda ARNs for cross-function invocation
353
1360
  # ---------------------------------------------------------------------------
354
1361
 
1362
+ ########################################################################
1363
+ # SecureString parameter for the Google Places API key. wiki-compile reads
1364
+ # this on cold start via loadGooglePlacesClientFromSsm() and caches the
1365
+ # client at module scope. When google_places_api_key is empty (the
1366
+ # default), we seed the parameter with a placeholder so the Lambda init
1367
+ # path can distinguish "unconfigured" (skip Google entirely, degrade
1368
+ # gracefully) from "configured but wrong" (log + skip). lifecycle.ignore_
1369
+ # changes on `value` lets ops rotate via
1370
+ # aws ssm put-parameter --overwrite \
1371
+ # --name /thinkwork/<stage>/google-places/api-key \
1372
+ # --type SecureString --value <KEY>
1373
+ # without terraform fighting it on the next apply.
1374
+ ########################################################################
1375
+
1376
+ resource "aws_ssm_parameter" "google_places_api_key" {
1377
+ name = "/thinkwork/${var.stage}/google-places/api-key"
1378
+ type = "SecureString"
1379
+ value = var.google_places_api_key != "" ? var.google_places_api_key : "PLACEHOLDER_SET_VIA_CLI"
1380
+ description = "Google Places API (New) key consumed by wiki-compile. See docs/plans/2026-04-21-005-feat-wiki-place-capability-v2-plan.md Unit 4."
1381
+
1382
+ lifecycle {
1383
+ # Allow `aws ssm put-parameter --overwrite` to stick across applies.
1384
+ # New-key rotation or initial population by ops should happen via CLI,
1385
+ # not via terraform var.
1386
+ ignore_changes = [value]
1387
+ }
1388
+ }
1389
+
355
1390
  resource "aws_ssm_parameter" "lambda_arns" {
356
1391
  for_each = local.use_local_zips ? {
357
1392
  "chat-agent-invoke-fn-arn" = aws_lambda_function.handler["chat-agent-invoke"].arn
358
1393
  "kb-manager-fn-arn" = aws_lambda_function.handler["knowledge-base-manager"].arn
359
1394
  "job-schedule-manager-fn-arn" = aws_lambda_function.handler["job-schedule-manager"].arn
360
1395
  "memory-retain-fn-arn" = aws_lambda_function.handler["memory-retain"].arn
1396
+ "eval-runner-fn-arn" = aws_lambda_function.handler["eval-runner"].arn
1397
+ "eval-worker-fn-arn" = aws_lambda_function.handler["eval-worker"].arn
361
1398
  } : {}
362
1399
 
363
1400
  name = "/thinkwork/${var.stage}/${each.key}"
364
1401
  type = "String"
365
1402
  value = each.value
366
1403
  }
1404
+
1405
+ # ===========================================================================
1406
+ # Phase 3 U8a — Compliance Anchor Lambda (STANDALONE) + Watchdog wiring
1407
+ # ===========================================================================
1408
+ # Plan: docs/plans/2026-05-07-010-feat-compliance-u8a-anchor-lambda-inert-plan.md
1409
+ #
1410
+ # The anchor Lambda is INTENTIONALLY OUTSIDE the for_each handler set
1411
+ # because its execution role is the U7 IAM role (`compliance-anchor-
1412
+ # lambda-role`), not the shared `aws_iam_role.lambda`. Adding a per-key
1413
+ # `role` ternary on the for_each set is the highest-blast-radius single
1414
+ # expression in this PR (any expression error silently downgrades 60+
1415
+ # unrelated handlers); a standalone resource isolates blast radius.
1416
+ #
1417
+ # The watchdog originally lived in the for_each set on the shared
1418
+ # execution role; Phase 3 U8b moved it to the standalone
1419
+ # `aws_lambda_function.compliance_anchor_watchdog` resource on a dedicated role.
1420
+ # ===========================================================================
1421
+
1422
+ resource "aws_lambda_function" "compliance_anchor" {
1423
+ count = local.use_local_zips ? 1 : 0
1424
+
1425
+ function_name = "thinkwork-${var.stage}-api-compliance-anchor"
1426
+ role = var.compliance_anchor_lambda_role_arn
1427
+ handler = "index.handler"
1428
+ runtime = local.runtime
1429
+ timeout = 60
1430
+ memory_size = 1024
1431
+ filename = "${var.lambda_zips_dir}/compliance-anchor.zip"
1432
+ source_code_hash = filebase64sha256("${var.lambda_zips_dir}/compliance-anchor.zip")
1433
+ reserved_concurrent_executions = 1
1434
+
1435
+ environment {
1436
+ variables = {
1437
+ STAGE = var.stage
1438
+ AWS_NODEJS_CONNECTION_REUSE_ENABLED = "1"
1439
+ COMPLIANCE_READER_SECRET_ARN = var.compliance_reader_secret_arn
1440
+ COMPLIANCE_DRAINER_SECRET_ARN = var.compliance_drainer_secret_arn
1441
+ COMPLIANCE_ANCHOR_BUCKET_NAME = var.compliance_anchor_bucket_name
1442
+ COMPLIANCE_ANCHOR_RETENTION_DAYS = tostring(var.compliance_anchor_object_lock_retention_days)
1443
+ # Phase 3 U8b — required by `_anchor_fn_live`. The Lambda throws on
1444
+ # boot if either of these is empty; the U8b composite root wires
1445
+ # both from `module.compliance_anchors` outputs.
1446
+ COMPLIANCE_ANCHOR_KMS_KEY_ARN = var.compliance_anchor_kms_key_arn
1447
+ COMPLIANCE_ANCHOR_OBJECT_LOCK_MODE = var.compliance_anchor_object_lock_mode
1448
+ }
1449
+ }
1450
+ }
1451
+
1452
+ resource "aws_sqs_queue" "compliance_anchor_dlq" {
1453
+ count = local.use_local_zips ? 1 : 0
1454
+ name = "thinkwork-${var.stage}-compliance-anchor-dlq"
1455
+ message_retention_seconds = 1209600 # 14 days, matches the drainer DLQ
1456
+ sqs_managed_sse_enabled = true
1457
+ }
1458
+
1459
+ resource "aws_iam_role_policy" "compliance_anchor_dlq_send" {
1460
+ count = local.use_local_zips ? 1 : 0
1461
+ name = "compliance-anchor-dlq-send"
1462
+ # Attached to the U7 anchor role (which the standalone anchor Lambda assumes).
1463
+ role = var.compliance_anchor_lambda_role_name
1464
+
1465
+ policy = jsonencode({
1466
+ Version = "2012-10-17"
1467
+ Statement = [{
1468
+ Effect = "Allow"
1469
+ Action = ["sqs:SendMessage"]
1470
+ Resource = aws_sqs_queue.compliance_anchor_dlq[0].arn
1471
+ }]
1472
+ })
1473
+ }
1474
+
1475
+ resource "aws_lambda_function_event_invoke_config" "compliance_anchor" {
1476
+ count = local.use_local_zips ? 1 : 0
1477
+ function_name = aws_lambda_function.compliance_anchor[0].function_name
1478
+ maximum_retry_attempts = 0
1479
+ maximum_event_age_in_seconds = 3600
1480
+
1481
+ destination_config {
1482
+ on_failure {
1483
+ destination = aws_sqs_queue.compliance_anchor_dlq[0].arn
1484
+ }
1485
+ }
1486
+ }
1487
+
1488
+ # ---------------------------------------------------------------------------
1489
+ # Phase 3 U8b — Watchdog Lambda (STANDALONE).
1490
+ #
1491
+ # Moves OFF the shared aws_iam_role.lambda onto a dedicated sibling role
1492
+ # (kms:DescribeKey only on the CMK, s3:ListBucket prefix-conditioned on
1493
+ # anchors/, no kms:Decrypt — the watchdog never reads object bodies).
1494
+ # The shared role's prior compliance_watchdog_metrics inline policy is
1495
+ # removed (its function is now on the sibling role).
1496
+ #
1497
+ # Operator pre-merge: `terraform state mv` the existing
1498
+ # `aws_lambda_function.handler["compliance-anchor-watchdog"]` address to
1499
+ # `aws_lambda_function.compliance_anchor_watchdog[0]`. Without it, apply
1500
+ # fails with ResourceConflictException on the function name.
1501
+ # ---------------------------------------------------------------------------
1502
+
1503
+ resource "aws_lambda_function" "compliance_anchor_watchdog" {
1504
+ count = local.use_local_zips ? 1 : 0
1505
+
1506
+ function_name = "thinkwork-${var.stage}-api-compliance-anchor-watchdog"
1507
+ role = var.compliance_anchor_watchdog_role_arn
1508
+ handler = "index.handler"
1509
+ runtime = local.runtime
1510
+ timeout = 30
1511
+ memory_size = 512
1512
+ filename = "${var.lambda_zips_dir}/compliance-anchor-watchdog.zip"
1513
+ source_code_hash = filebase64sha256("${var.lambda_zips_dir}/compliance-anchor-watchdog.zip")
1514
+
1515
+ environment {
1516
+ variables = {
1517
+ STAGE = var.stage
1518
+ AWS_NODEJS_CONNECTION_REUSE_ENABLED = "1"
1519
+ COMPLIANCE_ANCHOR_BUCKET_NAME = var.compliance_anchor_bucket_name
1520
+ }
1521
+ }
1522
+
1523
+ tags = {
1524
+ Name = "thinkwork-${var.stage}-api-compliance-anchor-watchdog"
1525
+ Handler = "compliance-anchor-watchdog"
1526
+ }
1527
+ }
1528
+
1529
+ # ---------------------------------------------------------------------------
1530
+ # Schedules — retry_policy is nested inside target { ... }, NOT at the
1531
+ # schedule top level. Verified against AWS provider schema.
1532
+ # ---------------------------------------------------------------------------
1533
+
1534
+ resource "aws_scheduler_schedule" "compliance_anchor" {
1535
+ count = local.use_local_zips ? 1 : 0
1536
+
1537
+ name = "thinkwork-${var.stage}-compliance-anchor"
1538
+ group_name = "default"
1539
+ schedule_expression = "rate(15 minutes)"
1540
+ state = "ENABLED"
1541
+
1542
+ flexible_time_window {
1543
+ mode = "OFF"
1544
+ }
1545
+
1546
+ target {
1547
+ arn = aws_lambda_function.compliance_anchor[0].arn
1548
+ role_arn = aws_iam_role.scheduler.arn
1549
+
1550
+ retry_policy {
1551
+ maximum_retry_attempts = 0
1552
+ }
1553
+ }
1554
+ }
1555
+
1556
+ resource "aws_scheduler_schedule" "compliance_anchor_watchdog" {
1557
+ count = local.use_local_zips ? 1 : 0
1558
+
1559
+ name = "thinkwork-${var.stage}-compliance-anchor-watchdog"
1560
+ group_name = "default"
1561
+ schedule_expression = "rate(5 minutes)"
1562
+ state = "ENABLED"
1563
+
1564
+ flexible_time_window {
1565
+ mode = "OFF"
1566
+ }
1567
+
1568
+ target {
1569
+ # Phase 3 U8b — points at the standalone watchdog resource (was
1570
+ # aws_lambda_function.handler["compliance-anchor-watchdog"] before
1571
+ # the for_each split-out).
1572
+ arn = aws_lambda_function.compliance_anchor_watchdog[0].arn
1573
+ role_arn = aws_iam_role.scheduler.arn
1574
+
1575
+ retry_policy {
1576
+ maximum_retry_attempts = 0
1577
+ }
1578
+ }
1579
+ }
1580
+
1581
+ # ---------------------------------------------------------------------------
1582
+ # CloudWatch alarms — Phase 3 U8b
1583
+ #
1584
+ # Two alarms split the failure space:
1585
+ #
1586
+ # 1. compliance-anchor-gap (treat_missing_data = "breaching"). Fires
1587
+ # when ComplianceAnchorGap >= 1 for two consecutive 5-min periods
1588
+ # OR when the watchdog stops emitting the metric entirely (IAM
1589
+ # regression, code crash, S3 ListObjectsV2 perma-fail).
1590
+ #
1591
+ # 2. compliance-anchor-watchdog-heartbeat-missing
1592
+ # (treat_missing_data = "notBreaching" born-state). Distinguishes
1593
+ # "real anchor gap" from "watchdog metric path broken". Born-state
1594
+ # is notBreaching to give Greenfield deploys a window before the
1595
+ # first heartbeat lands; flip to breaching in a follow-up after
1596
+ # first soak (Decision #7 / ADV-004).
1597
+ # ---------------------------------------------------------------------------
1598
+
1599
+ resource "aws_cloudwatch_metric_alarm" "compliance_anchor_gap" {
1600
+ count = local.use_local_zips ? 1 : 0
1601
+
1602
+ alarm_name = "thinkwork-${var.stage}-compliance-anchor-gap"
1603
+ alarm_description = "Anchor cadence gap exceeded threshold. LIVE in U8b — fires on >=1 ComplianceAnchorGap=1 OR missing metric (means watchdog broken)."
1604
+ namespace = "Thinkwork/Compliance"
1605
+ metric_name = "ComplianceAnchorGap"
1606
+ statistic = "Maximum"
1607
+ period = 300
1608
+ evaluation_periods = 2
1609
+ threshold = 1
1610
+ comparison_operator = "GreaterThanOrEqualToThreshold"
1611
+ treat_missing_data = "breaching"
1612
+ alarm_actions = []
1613
+
1614
+ dimensions = {
1615
+ Stage = var.stage
1616
+ }
1617
+ }
1618
+
1619
+ resource "aws_cloudwatch_metric_alarm" "compliance_anchor_watchdog_heartbeat_missing" {
1620
+ count = local.use_local_zips ? 1 : 0
1621
+
1622
+ alarm_name = "thinkwork-${var.stage}-compliance-anchor-watchdog-heartbeat-missing"
1623
+ alarm_description = "Watchdog heartbeat metric is missing. LIVE in U8b — born with treat_missing_data = notBreaching to absorb deploy-time gaps; promote to breaching in a follow-up after first soak."
1624
+ namespace = "Thinkwork/Compliance"
1625
+ metric_name = "ComplianceAnchorWatchdogHeartbeat"
1626
+ statistic = "Sum"
1627
+ period = 300
1628
+ evaluation_periods = 2
1629
+ threshold = 1
1630
+ comparison_operator = "LessThanThreshold"
1631
+ treat_missing_data = "notBreaching"
1632
+ alarm_actions = []
1633
+
1634
+ dimensions = {
1635
+ Stage = var.stage
1636
+ }
1637
+ }
1638
+
1639
+ # ---------------------------------------------------------------------------
1640
+ # Phase 3 U11.U2 — Compliance export runner (STANDALONE, INERT)
1641
+ #
1642
+ # The U11.U1 createComplianceExport mutation (PR #944) inserts a queued
1643
+ # row into compliance.export_jobs and dispatches `{jobId}` to this SQS
1644
+ # queue. The runner Lambda below has a STUB body in U11.U2 (throws
1645
+ # "not implemented") — U11.U3 swaps in the live body that streams
1646
+ # CSV/NDJSON to the exports S3 bucket and publishes a 15-minute
1647
+ # presigned URL.
1648
+ #
1649
+ # Inert-substrate posture (per `feedback_ship_inert_pattern`):
1650
+ # - SQS messages from the U11.U1 mutation accumulate.
1651
+ # - After maxReceiveCount=3 attempts the stub throw routes them to
1652
+ # the DLQ.
1653
+ # - The DLQ depth alarm signals operators that the runner needs U11.U3.
1654
+ # - This is the visible inert state — silent no-op stubs are an
1655
+ # anti-pattern (queued jobs would stay QUEUED forever with no signal).
1656
+ #
1657
+ # Standalone Lambda (NOT in the for_each pool) — isolates the runner's
1658
+ # bucket-scoped IAM role from the 60+ unrelated handlers. Mirrors the
1659
+ # U8a anchor Lambda's standalone-resource pattern.
1660
+ # ---------------------------------------------------------------------------
1661
+
1662
+ resource "aws_sqs_queue" "compliance_exports_dlq" {
1663
+ count = local.use_local_zips ? 1 : 0
1664
+ name = "thinkwork-${var.stage}-compliance-exports-dlq"
1665
+ message_retention_seconds = 1209600 # 14 days
1666
+ sqs_managed_sse_enabled = true
1667
+
1668
+ tags = {
1669
+ Name = "thinkwork-${var.stage}-compliance-exports-dlq"
1670
+ }
1671
+ }
1672
+
1673
+ resource "aws_sqs_queue" "compliance_exports" {
1674
+ count = local.use_local_zips ? 1 : 0
1675
+ name = "thinkwork-${var.stage}-compliance-exports"
1676
+ visibility_timeout_seconds = 900 # matches Lambda 15-min timeout
1677
+ message_retention_seconds = 86400 # 1 day; DLQ holds longer-stuck messages
1678
+ sqs_managed_sse_enabled = true
1679
+
1680
+ redrive_policy = jsonencode({
1681
+ deadLetterTargetArn = aws_sqs_queue.compliance_exports_dlq[0].arn
1682
+ maxReceiveCount = 3
1683
+ })
1684
+
1685
+ tags = {
1686
+ Name = "thinkwork-${var.stage}-compliance-exports"
1687
+ }
1688
+ }
1689
+
1690
+ # graphql-http needs sqs:SendMessage on the new queue to dispatch jobIds
1691
+ # from the createComplianceExport mutation. Attached to the shared
1692
+ # lambda role (which graphql-http assumes); scope is queue-specific.
1693
+ resource "aws_iam_role_policy" "compliance_exports_send" {
1694
+ count = local.use_local_zips ? 1 : 0
1695
+ name = "compliance-exports-send"
1696
+ role = aws_iam_role.lambda.id
1697
+
1698
+ policy = jsonencode({
1699
+ Version = "2012-10-17"
1700
+ Statement = [{
1701
+ Effect = "Allow"
1702
+ Action = ["sqs:SendMessage"]
1703
+ Resource = aws_sqs_queue.compliance_exports[0].arn
1704
+ }]
1705
+ })
1706
+ }
1707
+
1708
+ # Runner role's SQS receive grants — only the runner consumes the queue.
1709
+ resource "aws_iam_role_policy" "compliance_exports_runner_sqs" {
1710
+ count = local.use_local_zips ? 1 : 0
1711
+ name = "compliance-exports-runner-sqs"
1712
+ role = var.compliance_exports_runner_role_name
1713
+
1714
+ policy = jsonencode({
1715
+ Version = "2012-10-17"
1716
+ Statement = [
1717
+ {
1718
+ Sid = "RunnerSqsReceive"
1719
+ Effect = "Allow"
1720
+ Action = [
1721
+ "sqs:ReceiveMessage",
1722
+ "sqs:DeleteMessage",
1723
+ "sqs:GetQueueAttributes",
1724
+ "sqs:ChangeMessageVisibility",
1725
+ ]
1726
+ Resource = aws_sqs_queue.compliance_exports[0].arn
1727
+ },
1728
+ {
1729
+ Sid = "RunnerDlqSend"
1730
+ Effect = "Allow"
1731
+ Action = ["sqs:SendMessage"]
1732
+ Resource = aws_sqs_queue.compliance_exports_dlq[0].arn
1733
+ },
1734
+ ]
1735
+ })
1736
+ }
1737
+
1738
+ resource "aws_lambda_function" "compliance_export_runner" {
1739
+ count = local.use_local_zips ? 1 : 0
1740
+
1741
+ function_name = "thinkwork-${var.stage}-api-compliance-export-runner"
1742
+ role = var.compliance_exports_runner_role_arn
1743
+ handler = "index.handler"
1744
+ runtime = local.runtime
1745
+ timeout = 900
1746
+ memory_size = 1024
1747
+ filename = "${var.lambda_zips_dir}/compliance-export-runner.zip"
1748
+ source_code_hash = filebase64sha256("${var.lambda_zips_dir}/compliance-export-runner.zip")
1749
+ reserved_concurrent_executions = 2
1750
+
1751
+ environment {
1752
+ variables = {
1753
+ STAGE = var.stage
1754
+ AWS_NODEJS_CONNECTION_REUSE_ENABLED = "1"
1755
+ COMPLIANCE_EXPORTS_BUCKET = var.compliance_exports_bucket_name
1756
+ COMPLIANCE_EXPORTS_QUEUE_URL = aws_sqs_queue.compliance_exports[0].url
1757
+ # Phase 3 U11.U3 — the live runner connects to Aurora as the
1758
+ # writer pool (existing app role) for INSERT/UPDATE on
1759
+ # compliance.export_jobs and SELECT on compliance.audit_events.
1760
+ DATABASE_URL_SECRET_ARN = var.graphql_db_secret_arn
1761
+ # The writer-pool secret stores only {username, password}; the
1762
+ # runner constructs the URL from these env vars + the secret.
1763
+ # Mirrors the fallback in packages/database-pg/src/db.ts's
1764
+ # `resolveDatabaseUrlFromSecrets` (deploy run 25563132057
1765
+ # surfaced this as "Invalid URL" when only the ARN was wired).
1766
+ DATABASE_HOST = var.db_cluster_endpoint
1767
+ DATABASE_NAME = var.database_name
1768
+ }
1769
+ }
1770
+ }
1771
+
1772
+ # SQS → Lambda event source mapping. batch_size=1 so each export is a
1773
+ # discrete invocation; ReportBatchItemFailures lets the runner mark
1774
+ # individual messages failed without re-enqueuing the whole batch.
1775
+ # Concurrency is bounded by the Lambda function's
1776
+ # reserved_concurrent_executions=2 (set above) — the
1777
+ # `maximum_concurrency` argument on the event-source mapping requires a
1778
+ # newer aws provider version than this codebase currently pins, and the
1779
+ # function-level reservation gives the equivalent ceiling at v1 scale.
1780
+ resource "aws_lambda_event_source_mapping" "compliance_exports" {
1781
+ count = local.use_local_zips ? 1 : 0
1782
+
1783
+ event_source_arn = aws_sqs_queue.compliance_exports[0].arn
1784
+ function_name = aws_lambda_function.compliance_export_runner[0].function_name
1785
+ batch_size = 1
1786
+ enabled = true
1787
+ function_response_types = ["ReportBatchItemFailures"]
1788
+ }
1789
+
1790
+ resource "aws_cloudwatch_metric_alarm" "compliance_exports_dlq_depth" {
1791
+ count = local.use_local_zips ? 1 : 0
1792
+
1793
+ alarm_name = "thinkwork-${var.stage}-compliance-exports-dlq-depth"
1794
+ alarm_description = "Compliance exports DLQ has messages — runner Lambda crashed (or is inert pre-U11.U3); operator must inspect."
1795
+ namespace = "AWS/SQS"
1796
+ metric_name = "ApproximateNumberOfMessagesVisible"
1797
+ statistic = "Maximum"
1798
+ period = 60
1799
+ evaluation_periods = 1
1800
+ threshold = 1
1801
+ comparison_operator = "GreaterThanOrEqualToThreshold"
1802
+ treat_missing_data = "notBreaching"
1803
+ alarm_actions = []
1804
+
1805
+ dimensions = {
1806
+ QueueName = aws_sqs_queue.compliance_exports_dlq[0].name
1807
+ }
1808
+ }
1809
+
1810
+ # ---------------------------------------------------------------------------
1811
+ # workspace-files-efs — STANDALONE Lambda that reads any Computer's workspace
1812
+ # files directly off the shared EFS file system. Bypasses the
1813
+ # computer_tasks queue for list/get operations so the admin Computer
1814
+ # Workspace tab is independent of runtime liveness or write-queue backlog.
1815
+ #
1816
+ # Plan: docs/plans/2026-05-13-XXX-feat-admin-computer-efs-listing-plan.md
1817
+ #
1818
+ # The Lambda mounts the `workspace_admin` access point at /mnt/efs. That
1819
+ # access point is rooted at /tenants on the shared EFS, so the handler
1820
+ # can address any Computer's workspace as
1821
+ # /mnt/efs/<tenantId>/computers/<computerId>/<path...>
1822
+ # (matches the layout written by `computerWorkspacePath` in
1823
+ # packages/api/src/lib/computers/runtime-control.ts:40).
1824
+ #
1825
+ # VPC config: same subnet set the Computer ECS tasks use (so the mount
1826
+ # targets are reachable). Dedicated security group with an EFS-SG ingress
1827
+ # rule defined as a sibling of the task-SG rule in the computer-runtime
1828
+ # module — keeps Lambda traffic auditable separately.
1829
+ #
1830
+ # Writes intentionally stay on the existing computer_tasks queue path.
1831
+ # Mutations have ordering semantics with the runtime's in-process state;
1832
+ # changing them is out of scope for this PR.
1833
+ # ---------------------------------------------------------------------------
1834
+
1835
+ resource "aws_lambda_function" "workspace_files_efs" {
1836
+ count = local.use_local_zips ? 1 : 0
1837
+
1838
+ function_name = "thinkwork-${var.stage}-api-workspace-files-efs"
1839
+ role = aws_iam_role.lambda.arn
1840
+ handler = "index.handler"
1841
+ runtime = local.runtime
1842
+ timeout = 30
1843
+ memory_size = 512
1844
+
1845
+ filename = "${var.lambda_zips_dir}/workspace-files-efs.zip"
1846
+ source_code_hash = filebase64sha256("${var.lambda_zips_dir}/workspace-files-efs.zip")
1847
+
1848
+ vpc_config {
1849
+ subnet_ids = var.computer_runtime_subnet_ids
1850
+ security_group_ids = [var.workspace_admin_lambda_sg_id]
1851
+ }
1852
+
1853
+ file_system_config {
1854
+ arn = var.workspace_admin_efs_access_point_arn
1855
+ local_mount_path = "/mnt/efs"
1856
+ }
1857
+
1858
+ environment {
1859
+ variables = {
1860
+ STAGE = var.stage
1861
+ AWS_NODEJS_CONNECTION_REUSE_ENABLED = "1"
1862
+ WORKSPACE_EFS_ROOT = "/mnt/efs"
1863
+ }
1864
+ }
1865
+
1866
+ tags = {
1867
+ Name = "thinkwork-${var.stage}-api-workspace-files-efs"
1868
+ Handler = "workspace-files-efs"
1869
+ }
1870
+ }
1871
+
1872
+ # VPC-attached Lambdas need permission to manage ENIs. The shared lambda
1873
+ # role doesn't grant this by default because most handlers run outside a
1874
+ # VPC. AWSLambdaVPCAccessExecutionRole grants the network-interface
1875
+ # create/describe/delete actions — minimum scope for VPC Lambdas.
1876
+ resource "aws_iam_role_policy_attachment" "lambda_vpc_access" {
1877
+ count = local.use_local_zips ? 1 : 0
1878
+
1879
+ role = aws_iam_role.lambda.name
1880
+ policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
1881
+ }