thinkwork-cli 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +2 -2
- package/dist/cli.js +1187 -315
- package/dist/terraform/examples/greenfield/main.tf +325 -19
- package/dist/terraform/examples/greenfield/terraform.tfvars.example +14 -0
- package/dist/terraform/modules/app/agentcore-code-interpreter/Dockerfile.sandbox-base +61 -0
- package/dist/terraform/modules/app/agentcore-code-interpreter/README.md +54 -0
- package/dist/terraform/modules/app/agentcore-code-interpreter/main.tf +197 -0
- package/dist/terraform/modules/app/agentcore-code-interpreter/scripts/build_and_push_sandbox_base.sh +70 -0
- package/dist/terraform/modules/app/agentcore-flue/README.md +58 -0
- package/dist/terraform/modules/app/agentcore-flue/main.tf +322 -0
- package/dist/terraform/modules/app/agentcore-flue/outputs.tf +23 -0
- package/dist/terraform/modules/app/agentcore-flue/variables.tf +91 -0
- package/dist/terraform/modules/app/agentcore-memory/scripts/create_or_find_memory.sh +0 -0
- package/dist/terraform/modules/app/agentcore-runtime/main.tf +165 -0
- package/dist/terraform/modules/app/appsync-subscriptions/main.tf +4 -0
- package/dist/terraform/modules/app/appsync-subscriptions/outputs.tf +5 -0
- package/dist/terraform/modules/app/computer-runtime/README.md +15 -0
- package/dist/terraform/modules/app/computer-runtime/main.tf +406 -0
- package/dist/terraform/modules/app/computer-runtime/outputs.tf +75 -0
- package/dist/terraform/modules/app/computer-runtime/variables.tf +66 -0
- package/dist/terraform/modules/app/hindsight-memory/main.tf +6 -0
- package/dist/terraform/modules/app/lambda-api/eval-fanout.tf +128 -0
- package/dist/terraform/modules/app/lambda-api/handlers.tf +1454 -43
- package/dist/terraform/modules/app/lambda-api/main.tf +221 -12
- package/dist/terraform/modules/app/lambda-api/mcp-oauth.tf +118 -0
- package/dist/terraform/modules/app/lambda-api/oauth-secrets.tf +49 -0
- package/dist/terraform/modules/app/lambda-api/outputs.tf +38 -0
- package/dist/terraform/modules/app/lambda-api/slack-app-secrets.tf +43 -0
- package/dist/terraform/modules/app/lambda-api/stripe-secrets.tf +53 -0
- package/dist/terraform/modules/app/lambda-api/variables.tf +349 -2
- package/dist/terraform/modules/app/lambda-api/workspace-events.tf +125 -0
- package/dist/terraform/modules/app/routines-stepfunctions/main.tf +453 -0
- package/dist/terraform/modules/app/sandbox-log-scrubber/README.md +66 -0
- package/dist/terraform/modules/app/sandbox-log-scrubber/main.tf +200 -0
- package/dist/terraform/modules/app/static-site/main.tf +146 -5
- package/dist/terraform/modules/app/www-dns/main.tf +118 -15
- package/dist/terraform/modules/app/www-dns/outputs.tf +10 -0
- package/dist/terraform/modules/app/www-dns/variables.tf +42 -0
- package/dist/terraform/modules/data/aurora-postgres/main.tf +164 -3
- package/dist/terraform/modules/data/aurora-postgres/outputs.tf +34 -0
- package/dist/terraform/modules/data/aurora-postgres/variables.tf +16 -0
- package/dist/terraform/modules/data/compliance-audit-bucket/README.md +145 -0
- package/dist/terraform/modules/data/compliance-audit-bucket/main.tf +573 -0
- package/dist/terraform/modules/data/compliance-audit-bucket/outputs.tf +43 -0
- package/dist/terraform/modules/data/compliance-audit-bucket/variables.tf +93 -0
- package/dist/terraform/modules/data/compliance-exports-bucket/main.tf +269 -0
- package/dist/terraform/modules/data/compliance-exports-bucket/outputs.tf +23 -0
- package/dist/terraform/modules/data/compliance-exports-bucket/variables.tf +50 -0
- package/dist/terraform/modules/data/s3-backups-bucket/main.tf +123 -0
- package/dist/terraform/modules/data/s3-buckets/main.tf +13 -0
- package/dist/terraform/modules/foundation/cognito/variables.tf +2 -2
- package/dist/terraform/modules/thinkwork/main.tf +439 -21
- package/dist/terraform/modules/thinkwork/outputs.tf +121 -0
- package/dist/terraform/modules/thinkwork/variables.tf +153 -2
- package/dist/terraform/schema.graphql +17 -0
- package/package.json +15 -14
|
@@ -7,34 +7,56 @@
|
|
|
7
7
|
################################################################################
|
|
8
8
|
|
|
9
9
|
locals {
|
|
10
|
-
use_local_zips
|
|
11
|
-
|
|
10
|
+
use_local_zips = var.lambda_zips_dir != ""
|
|
11
|
+
eval_fanout_queue_url = local.use_local_zips ? aws_sqs_queue.eval_fanout[0].url : ""
|
|
12
|
+
runtime = "nodejs20.x"
|
|
12
13
|
|
|
13
14
|
# Common environment variables shared by all API handlers
|
|
14
15
|
common_env = {
|
|
15
|
-
STAGE
|
|
16
|
-
DATABASE_URL
|
|
17
|
-
DATABASE_SECRET_ARN
|
|
18
|
-
DATABASE_HOST
|
|
19
|
-
DATABASE_NAME
|
|
20
|
-
BUCKET_NAME
|
|
21
|
-
USER_POOL_ID
|
|
22
|
-
COGNITO_USER_POOL_ID
|
|
23
|
-
ADMIN_CLIENT_ID
|
|
24
|
-
MOBILE_CLIENT_ID
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
16
|
+
STAGE = var.stage
|
|
17
|
+
DATABASE_URL = "postgresql://${var.db_username}:${urlencode(var.db_password)}@${var.db_cluster_endpoint}:5432/${var.database_name}?sslmode=no-verify"
|
|
18
|
+
DATABASE_SECRET_ARN = var.graphql_db_secret_arn
|
|
19
|
+
DATABASE_HOST = var.db_cluster_endpoint
|
|
20
|
+
DATABASE_NAME = var.database_name
|
|
21
|
+
BUCKET_NAME = var.bucket_name
|
|
22
|
+
USER_POOL_ID = var.user_pool_id
|
|
23
|
+
COGNITO_USER_POOL_ID = var.user_pool_id
|
|
24
|
+
ADMIN_CLIENT_ID = var.admin_client_id
|
|
25
|
+
MOBILE_CLIENT_ID = var.mobile_client_id
|
|
26
|
+
COGNITO_MCP_CLIENT_ID = aws_cognito_user_pool_client.mcp_oauth.id
|
|
27
|
+
COGNITO_AUTH_BASE_URL = local.mcp_oauth_cognito_base_url
|
|
28
|
+
MCP_OAUTH_CALLBACK_URL = "${local.mcp_oauth_api_base_url}/mcp/oauth/callback"
|
|
29
|
+
MCP_OAUTH_REVOCATIONS_TABLE = aws_dynamodb_table.mcp_oauth_revocations.name
|
|
30
|
+
COGNITO_APP_CLIENT_IDS = "${var.admin_client_id},${var.mobile_client_id}"
|
|
31
|
+
APPSYNC_ENDPOINT = var.appsync_api_url
|
|
32
|
+
APPSYNC_API_KEY = var.appsync_api_key
|
|
33
|
+
GRAPHQL_API_KEY = var.appsync_api_key
|
|
34
|
+
API_AUTH_SECRET = var.api_auth_secret
|
|
35
|
+
THINKWORK_API_SECRET = var.api_auth_secret
|
|
36
|
+
EMAIL_HMAC_SECRET = var.api_auth_secret
|
|
37
|
+
THINKWORK_API_URL = "https://${aws_apigatewayv2_api.main.id}.execute-api.${var.region}.amazonaws.com"
|
|
38
|
+
# Comma-separated allowlist of caller emails permitted to invoke
|
|
39
|
+
# operator-gated mutations (updateTenantPolicy, sandbox fixture
|
|
40
|
+
# setup, etc.). Resolved against ctx.auth.email, which is pulled
|
|
41
|
+
# from the Cognito JWT for user callers and from the
|
|
42
|
+
# `x-principal-email` header for service-auth callers (see
|
|
43
|
+
# packages/api/src/lib/cognito-auth.ts). Empty ⇒ the gate
|
|
44
|
+
# rejects every call, which is the safe default pre-rollout.
|
|
45
|
+
THINKWORK_PLATFORM_OPERATOR_EMAILS = var.platform_operator_emails
|
|
46
|
+
AGENTCORE_FUNCTION_NAME = var.agentcore_function_name
|
|
47
|
+
AGENTCORE_FLUE_FUNCTION_NAME = var.agentcore_flue_function_name
|
|
48
|
+
# SSM parameter names for the Bedrock AgentCore Runtime IDs (one per
|
|
49
|
+
# runtime type). deploy.yml's "Update AgentCore Runtimes" job writes
|
|
50
|
+
# these in `update-agentcore-runtime-image.sh`. eval-runner reads them
|
|
51
|
+
# via `loadRuntimeId(runtimeType)` to start a Bedrock-control-plane
|
|
52
|
+
# invocation against the right runtime — pre-U3 the flue path was
|
|
53
|
+
# dead because the env var was never wired here.
|
|
54
|
+
AGENTCORE_RUNTIME_SSM_STRANDS = "/thinkwork/${var.stage}/agentcore/runtime-id-strands"
|
|
55
|
+
AGENTCORE_RUNTIME_SSM_FLUE = "/thinkwork/${var.stage}/agentcore/runtime-id-flue"
|
|
56
|
+
WORKSPACE_BUCKET = var.bucket_name
|
|
57
|
+
HINDSIGHT_ENDPOINT = var.hindsight_endpoint
|
|
58
|
+
AGENTCORE_MEMORY_ID = var.agentcore_memory_id
|
|
59
|
+
MEMORY_ENGINE = var.memory_engine
|
|
38
60
|
# Skip the SSM indirection for cross-function ARN lookup. Terraform
|
|
39
61
|
# already knows this ARN at apply time and the Lambda role's SSM
|
|
40
62
|
# permission has been a recurring source of silent failures where
|
|
@@ -47,30 +69,202 @@ locals {
|
|
|
47
69
|
ECR_REPOSITORY_URL = var.ecr_repository_url
|
|
48
70
|
AWS_ACCOUNT_ID = var.account_id
|
|
49
71
|
NODE_OPTIONS = "--enable-source-maps"
|
|
50
|
-
#
|
|
51
|
-
#
|
|
52
|
-
#
|
|
53
|
-
#
|
|
54
|
-
|
|
72
|
+
# Per-user OAuth wiring (Google Workspace today; Microsoft 365 follow-up).
|
|
73
|
+
# Secret ARNs are the indirection; the actual client_id/client_secret
|
|
74
|
+
# values live in Secrets Manager and are fetched by
|
|
75
|
+
# packages/api/src/lib/oauth-client-credentials.ts at cold-start.
|
|
76
|
+
# OAUTH_CALLBACK_URL is the URL registered with Google/Azure OAuth apps.
|
|
77
|
+
# REDIRECT_SUCCESS_URL is the fallback post-OAuth redirect when the
|
|
78
|
+
# caller doesn't pass a per-request returnUrl (mobile passes thinkwork://).
|
|
79
|
+
GOOGLE_PRODUCTIVITY_OAUTH_SECRET_ARN = aws_secretsmanager_secret.oauth_google_productivity.arn
|
|
80
|
+
OAUTH_CALLBACK_URL = "https://${aws_apigatewayv2_api.main.id}.execute-api.${var.region}.amazonaws.com/api/oauth/callback"
|
|
81
|
+
REDIRECT_SUCCESS_URL = var.redirect_success_url
|
|
82
|
+
COMPANY_BRAIN_SOURCE_AGENT_MODEL_ID = var.company_brain_source_agent_model_id
|
|
83
|
+
# Stripe billing — see stripe-secrets.tf. The ARN is the indirection;
|
|
84
|
+
# the actual keys live in Secrets Manager and are fetched by
|
|
85
|
+
# packages/api/src/lib/stripe-credentials.ts at cold-start. Price IDs
|
|
86
|
+
# are non-secret per-stage config carried as a plain JSON env var so
|
|
87
|
+
# staging/prod can use different products without a secret rotation.
|
|
88
|
+
STRIPE_CREDENTIALS_SECRET_ARN = aws_secretsmanager_secret.stripe_api_credentials.arn
|
|
89
|
+
STRIPE_PRICE_IDS_JSON = var.stripe_price_ids_json
|
|
90
|
+
STRIPE_CHECKOUT_SUCCESS_URL = "${var.admin_url}/onboarding/welcome?session_id={CHECKOUT_SESSION_ID}"
|
|
91
|
+
STRIPE_CHECKOUT_CANCEL_URL = "${var.www_url}/cloud"
|
|
92
|
+
WWW_URL = var.www_url
|
|
93
|
+
# Override the welcome email's From: address. Defaults to
|
|
94
|
+
# hello@agents.thinkwork.ai (the already-verified SES inbound domain);
|
|
95
|
+
# set to hello@thinkwork.ai once the bare-apex identity is verified in SES.
|
|
96
|
+
STRIPE_WELCOME_FROM_EMAIL = var.stripe_welcome_from_email
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
# Computer runtime control handlers only need database access, service-auth,
|
|
100
|
+
# the API callback URL, and ECS/EFS runtime wiring. Using the full common_env
|
|
101
|
+
# pushes computer-manager over Lambda's 4KB environment-variable limit in dev.
|
|
102
|
+
computer_runtime_control_base_env = {
|
|
103
|
+
STAGE = var.stage
|
|
104
|
+
DATABASE_URL = "postgresql://${var.db_username}:${urlencode(var.db_password)}@${var.db_cluster_endpoint}:5432/${var.database_name}?sslmode=no-verify"
|
|
105
|
+
API_AUTH_SECRET = var.api_auth_secret
|
|
106
|
+
THINKWORK_API_URL = "https://${aws_apigatewayv2_api.main.id}.execute-api.${var.region}.amazonaws.com"
|
|
107
|
+
NODE_OPTIONS = "--enable-source-maps"
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
computer_runtime_control_env = {
|
|
111
|
+
COMPUTER_RUNTIME_CLUSTER_NAME = var.computer_runtime_cluster_name
|
|
112
|
+
COMPUTER_RUNTIME_EFS_FILE_SYSTEM_ID = var.computer_runtime_efs_file_system_id
|
|
113
|
+
COMPUTER_RUNTIME_SUBNET_IDS = join(",", var.computer_runtime_subnet_ids)
|
|
114
|
+
COMPUTER_RUNTIME_ASSIGN_PUBLIC_IP = var.computer_runtime_assign_public_ip
|
|
115
|
+
COMPUTER_RUNTIME_TASK_SG_ID = var.computer_runtime_task_sg_id
|
|
116
|
+
COMPUTER_RUNTIME_EXECUTION_ROLE_ARN = var.computer_runtime_execution_role_arn
|
|
117
|
+
COMPUTER_RUNTIME_TASK_ROLE_ARN = var.computer_runtime_task_role_arn
|
|
118
|
+
COMPUTER_RUNTIME_LOG_GROUP_NAME = var.computer_runtime_log_group_name
|
|
119
|
+
COMPUTER_RUNTIME_REPOSITORY_URL = var.computer_runtime_repository_url
|
|
120
|
+
COMPUTER_RUNTIME_DEFAULT_CPU = tostring(var.computer_runtime_default_cpu)
|
|
121
|
+
COMPUTER_RUNTIME_DEFAULT_MEMORY = tostring(var.computer_runtime_default_memory)
|
|
55
122
|
}
|
|
56
123
|
|
|
57
124
|
# Per-handler env-var overrides. ARNs are constructed from the naming
|
|
58
125
|
# pattern (same trick as lambda_api_cross_invoke in main.tf) so we don't
|
|
59
126
|
# introduce a self-referential dependency inside the handler for_each.
|
|
127
|
+
slack_handler_env = {
|
|
128
|
+
SLACK_APP_CREDENTIALS_SECRET_ARN = aws_secretsmanager_secret.slack_app_credentials.arn
|
|
129
|
+
}
|
|
130
|
+
|
|
60
131
|
handler_extra_env = {
|
|
132
|
+
"extension-proxy" = {
|
|
133
|
+
EXTENSION_PROXY_BACKENDS_JSON = var.extension_proxy_backends_json
|
|
134
|
+
EXTENSION_PROXY_SIGNING_SECRET = var.extension_proxy_signing_secret
|
|
135
|
+
}
|
|
61
136
|
"job-schedule-manager" = {
|
|
62
137
|
JOB_TRIGGER_ARN = "arn:aws:lambda:${var.region}:${var.account_id}:function:thinkwork-${var.stage}-api-job-trigger"
|
|
63
138
|
JOB_TRIGGER_ROLE_ARN = var.job_scheduler_role_arn
|
|
64
139
|
}
|
|
65
|
-
# Compounding Memory compile Lambda.
|
|
66
|
-
# planner + section-writer cap themselves at ~500
|
|
67
|
-
# per invocation
|
|
140
|
+
# Compounding Memory compile Lambda. Any Converse-compatible Bedrock
|
|
141
|
+
# model works; the planner + section-writer cap themselves at ~500
|
|
142
|
+
# records / 25 new pages per invocation so a 480 s timeout covers
|
|
143
|
+
# the worst case comfortably. Env vars come from variables so
|
|
144
|
+
# unrelated deploys don't wipe them back to defaults (the aggregation
|
|
145
|
+
# flag got reset on every terraform apply before this was pinned).
|
|
68
146
|
"wiki-compile" = {
|
|
69
|
-
BEDROCK_MODEL_ID
|
|
147
|
+
BEDROCK_MODEL_ID = var.wiki_compile_model_id
|
|
148
|
+
WIKI_AGGREGATION_PASS_ENABLED = var.wiki_aggregation_pass_enabled
|
|
149
|
+
WIKI_DETERMINISTIC_LINKING_ENABLED = var.wiki_deterministic_linking_enabled
|
|
150
|
+
# Name (not value) of the SecureString SSM parameter that holds the
|
|
151
|
+
# Google Places API key. wiki-compile fetches + caches on cold start.
|
|
152
|
+
# The parameter may contain a placeholder value at apply time — the
|
|
153
|
+
# Lambda logs and degrades gracefully if decryption returns empty.
|
|
154
|
+
GOOGLE_PLACES_SSM_PARAM_NAME = "/thinkwork/${var.stage}/google-places/api-key"
|
|
70
155
|
}
|
|
71
156
|
"wiki-export" = {
|
|
72
157
|
WIKI_EXPORT_BUCKET = aws_s3_bucket.wiki_exports.bucket
|
|
73
158
|
}
|
|
159
|
+
# workspace-files invokes the workspace-files-efs sidecar (Request
|
|
160
|
+
# Response) for Computer-target list/get to bypass the computer_tasks
|
|
161
|
+
# queue. ARN constructed from the naming pattern to avoid a self-
|
|
162
|
+
# referential dependency on the standalone Lambda resource defined at
|
|
163
|
+
# the bottom of this file.
|
|
164
|
+
"workspace-files" = {
|
|
165
|
+
WORKSPACE_FILES_EFS_FN_ARN = "arn:aws:lambda:${var.region}:${var.account_id}:function:thinkwork-${var.stage}-api-workspace-files-efs"
|
|
166
|
+
}
|
|
167
|
+
"oauth-authorize" = local.slack_handler_env
|
|
168
|
+
"oauth-callback" = local.slack_handler_env
|
|
169
|
+
"slack-events" = local.slack_handler_env
|
|
170
|
+
"slack-slash-command" = local.slack_handler_env
|
|
171
|
+
"slack-interactivity" = local.slack_handler_env
|
|
172
|
+
"slack-oauth-install" = local.slack_handler_env
|
|
173
|
+
"slack-dispatch" = local.slack_handler_env
|
|
174
|
+
# computer-terminal-start needs the cluster name to scope its
|
|
175
|
+
# ECS ListTasks / DescribeTasks / ExecuteCommand calls.
|
|
176
|
+
"computer-terminal-start" = {
|
|
177
|
+
COMPUTER_RUNTIME_CLUSTER_NAME = var.computer_runtime_cluster_name
|
|
178
|
+
}
|
|
179
|
+
# computer-manager and computer-runtime-reconciler consume ECS/EFS task
|
|
180
|
+
# config from packages/api/src/lib/computers/runtime-control.ts.
|
|
181
|
+
# Scoping the COMPUTER_RUNTIME_* variables here (instead of in
|
|
182
|
+
# local.common_env) keeps the per-Lambda env-var payload under
|
|
183
|
+
# the AWS 4KB hard limit — they were previously dumped into every
|
|
184
|
+
# handler and pushed ~70 Lambdas over quota.
|
|
185
|
+
"computer-manager" = local.computer_runtime_control_env
|
|
186
|
+
"computer-runtime-reconciler" = local.computer_runtime_control_env
|
|
187
|
+
"mcp-context-engine" = {
|
|
188
|
+
CONTEXT_ENGINE_MEMORY_QUERY_MODE = "reflect"
|
|
189
|
+
CONTEXT_ENGINE_MEMORY_TIMEOUT_MS = "20000"
|
|
190
|
+
}
|
|
191
|
+
# routine-task-python (Phase B U6) needs the AgentCore code-interpreter
|
|
192
|
+
# id + the per-stage S3 routine-output bucket. The interpreter id is
|
|
193
|
+
# provisioned by the agentcore-code-interpreter module and exposed via
|
|
194
|
+
# the agentcore_code_interpreter_id input variable; the bucket name
|
|
195
|
+
# follows the per-stage naming convention from the routines-stepfunctions
|
|
196
|
+
# module (Phase A U1).
|
|
197
|
+
"routine-task-python" = {
|
|
198
|
+
SANDBOX_INTERPRETER_ID = var.agentcore_code_interpreter_id
|
|
199
|
+
ROUTINE_OUTPUT_BUCKET = "thinkwork-${var.stage}-routine-output"
|
|
200
|
+
ROUTINE_PYTHON_ENV_ALLOWLIST = "TENANT_ID,ROUTINE_ID,EXECUTION_ID"
|
|
201
|
+
}
|
|
202
|
+
# graphql-http hosts the createRoutine / publishRoutineVersion / etc.
|
|
203
|
+
# resolvers (Phase B U7) AND the routine-approval-bridge (Phase B
|
|
204
|
+
# U8) which invokes routine-resume via the AWS SDK.
|
|
205
|
+
"graphql-http" = {
|
|
206
|
+
ROUTINES_EXECUTION_ROLE_ARN = var.routines_execution_role_arn
|
|
207
|
+
ROUTINES_LOG_GROUP_ARN = var.routines_log_group_arn
|
|
208
|
+
AWS_ACCOUNT_ID = var.account_id
|
|
209
|
+
# routine-approval-bridge (Phase B U8) calls this function name
|
|
210
|
+
# via the AWS SDK Lambda Invoke after a HITL decideInboxItem.
|
|
211
|
+
# The bridge throws if unset — terraform wiring is mandatory.
|
|
212
|
+
ROUTINE_RESUME_FUNCTION_NAME = "thinkwork-${var.stage}-api-routine-resume"
|
|
213
|
+
# triggerRoutineRun seeds this into the SFN execution input so the
|
|
214
|
+
# inbox_approval recipe Task can find the callback Lambda via
|
|
215
|
+
# $$.Execution.Input.inboxApprovalFunctionName.
|
|
216
|
+
ROUTINE_APPROVAL_CALLBACK_FUNCTION_NAME = "thinkwork-${var.stage}-api-routine-approval-callback"
|
|
217
|
+
EMAIL_SEND_FUNCTION_NAME = "thinkwork-${var.stage}-api-email-send"
|
|
218
|
+
ROUTINE_TASK_PYTHON_FUNCTION_NAME = "thinkwork-${var.stage}-api-routine-task-python"
|
|
219
|
+
ADMIN_OPS_MCP_FUNCTION_NAME = "thinkwork-${var.stage}-api-admin-ops-mcp"
|
|
220
|
+
SLACK_SEND_FUNCTION_NAME = "thinkwork-${var.stage}-api-slack-send"
|
|
221
|
+
# Phase 3 U10 — compliance read resolvers (complianceEvents,
|
|
222
|
+
# complianceEvent, complianceEventByHash) connect to Aurora as
|
|
223
|
+
# the compliance_reader role. The existing lambda_secrets policy
|
|
224
|
+
# in main.tf grants secretsmanager:GetSecretValue on the
|
|
225
|
+
# thinkwork/* wildcard, so no new IAM resource is needed.
|
|
226
|
+
COMPLIANCE_READER_SECRET_ARN = var.compliance_reader_secret_arn
|
|
227
|
+
# Phase 3 U11.U2 — createComplianceExport mutation dispatches a
|
|
228
|
+
# jobId to a known-name SQS queue. We do NOT pass the queue URL
|
|
229
|
+
# as an env var here: graphql-http's env block is already at the
|
|
230
|
+
# AWS 4 KB ceiling, and adding another URL pushed the deploy over
|
|
231
|
+
# the limit. The mutation derives the URL from STAGE + AWS_REGION
|
|
232
|
+
# + AWS_ACCOUNT_ID, which the Lambda already has. The runner
|
|
233
|
+
# Lambda (separate function below) keeps an explicit
|
|
234
|
+
# COMPLIANCE_EXPORTS_QUEUE_URL because its env is small.
|
|
235
|
+
}
|
|
236
|
+
# U2 eval fan-out substrate. eval-runner does not dispatch to this
|
|
237
|
+
# queue until U3; eval-worker is a throwing inert stub that redrives
|
|
238
|
+
# accidental traffic to the DLQ.
|
|
239
|
+
"eval-runner" = {
|
|
240
|
+
EVAL_FANOUT_QUEUE_URL = local.eval_fanout_queue_url
|
|
241
|
+
EVAL_DIRECT_AGENTCORE_MESSAGE_SHARDS = "20"
|
|
242
|
+
}
|
|
243
|
+
"eval-worker" = {
|
|
244
|
+
EVAL_FANOUT_QUEUE_URL = local.eval_fanout_queue_url
|
|
245
|
+
EVAL_AGENTCORE_EVALUATORS = "disabled"
|
|
246
|
+
}
|
|
247
|
+
# job-trigger fires scheduled routine runs via SFN.StartExecution
|
|
248
|
+
# (Phase B U7) — the alias ARN comes from the row, but the Lambda
|
|
249
|
+
# also reads AWS_ACCOUNT_ID for diagnostic logging. It also passes
|
|
250
|
+
# the routine-approval-callback function name in the SFN execution
|
|
251
|
+
# input so the inbox_approval recipe can fan out to it on .waitForTaskToken.
|
|
252
|
+
"job-trigger" = {
|
|
253
|
+
AWS_ACCOUNT_ID = var.account_id
|
|
254
|
+
ROUTINE_APPROVAL_CALLBACK_FUNCTION_NAME = "thinkwork-${var.stage}-api-routine-approval-callback"
|
|
255
|
+
EMAIL_SEND_FUNCTION_NAME = "thinkwork-${var.stage}-api-email-send"
|
|
256
|
+
ROUTINE_TASK_PYTHON_FUNCTION_NAME = "thinkwork-${var.stage}-api-routine-task-python"
|
|
257
|
+
ADMIN_OPS_MCP_FUNCTION_NAME = "thinkwork-${var.stage}-api-admin-ops-mcp"
|
|
258
|
+
SLACK_SEND_FUNCTION_NAME = "thinkwork-${var.stage}-api-slack-send"
|
|
259
|
+
}
|
|
260
|
+
# Phase 3 U4 Compliance outbox drainer.
|
|
261
|
+
# Connects to Aurora as the compliance_drainer role (provisioned in
|
|
262
|
+
# U2). The DATABASE_SECRET_ARN-style indirection is via
|
|
263
|
+
# COMPLIANCE_DRAINER_SECRET_ARN so the drainer's connection cache is
|
|
264
|
+
# isolated from the master `getDb()` cache used by other handlers.
|
|
265
|
+
"compliance-outbox-drainer" = {
|
|
266
|
+
COMPLIANCE_DRAINER_SECRET_ARN = var.compliance_drainer_secret_arn
|
|
267
|
+
}
|
|
74
268
|
}
|
|
75
269
|
}
|
|
76
270
|
|
|
@@ -83,18 +277,28 @@ resource "aws_lambda_function" "handler" {
|
|
|
83
277
|
"graphql-http",
|
|
84
278
|
"chat-agent-invoke",
|
|
85
279
|
"wakeup-processor",
|
|
280
|
+
"workspace-event-dispatcher",
|
|
86
281
|
"agents",
|
|
87
282
|
"agent-actions",
|
|
88
283
|
"messages",
|
|
89
284
|
"connections",
|
|
90
285
|
"oauth-authorize",
|
|
91
286
|
"oauth-callback",
|
|
287
|
+
"stripe-checkout",
|
|
288
|
+
"stripe-webhook",
|
|
289
|
+
"stripe-portal",
|
|
290
|
+
"stripe-subscription",
|
|
291
|
+
"auth-me",
|
|
292
|
+
"extension-proxy",
|
|
92
293
|
"teams",
|
|
93
294
|
"team-members",
|
|
94
295
|
"tenants",
|
|
95
296
|
"users",
|
|
96
297
|
"invites",
|
|
97
298
|
"skills",
|
|
299
|
+
"mcp-oauth",
|
|
300
|
+
"mcp-user-memory",
|
|
301
|
+
"mcp-context-engine",
|
|
98
302
|
"activity",
|
|
99
303
|
"routines",
|
|
100
304
|
"budgets",
|
|
@@ -102,14 +306,24 @@ resource "aws_lambda_function" "handler" {
|
|
|
102
306
|
"scheduled-jobs",
|
|
103
307
|
"job-schedule-manager",
|
|
104
308
|
"job-trigger",
|
|
309
|
+
"routine-task-weather-email",
|
|
105
310
|
"webhooks",
|
|
106
311
|
"webhooks-admin",
|
|
107
312
|
"webhook-deliveries-cleanup",
|
|
313
|
+
"skill-runs-reconciler",
|
|
314
|
+
"cron-stall-monitor",
|
|
315
|
+
"webhook-crm-opportunity",
|
|
316
|
+
"webhook-task-event",
|
|
108
317
|
"workspace-files",
|
|
109
318
|
"knowledge-base-manager",
|
|
110
319
|
"knowledge-base-files",
|
|
111
320
|
"email-send",
|
|
112
321
|
"email-inbound",
|
|
322
|
+
"slack-events",
|
|
323
|
+
"slack-slash-command",
|
|
324
|
+
"slack-interactivity",
|
|
325
|
+
"slack-oauth-install",
|
|
326
|
+
"slack-dispatch",
|
|
113
327
|
"github-app",
|
|
114
328
|
"github-repos",
|
|
115
329
|
"memory",
|
|
@@ -122,8 +336,148 @@ resource "aws_lambda_function" "handler" {
|
|
|
122
336
|
"recipe-refresh",
|
|
123
337
|
"agent-skills-list",
|
|
124
338
|
"bootstrap-workspaces",
|
|
339
|
+
"migrate-agents-to-computers",
|
|
340
|
+
"computer-runtime",
|
|
341
|
+
"computer-manager",
|
|
342
|
+
"computer-runtime-reconciler",
|
|
343
|
+
# Admin Terminal tab — POST /api/computers/{computerId}/terminal/start.
|
|
344
|
+
# Returns the SSM Session Manager session envelope (sessionId,
|
|
345
|
+
# streamUrl, tokenValue) so the browser can open a direct WebSocket
|
|
346
|
+
# to ssmmessages. Plan:
|
|
347
|
+
# docs/plans/2026-05-13-004-feat-computer-terminal-ecs-exec-plan.md.
|
|
348
|
+
"computer-terminal-start",
|
|
125
349
|
"code-factory",
|
|
126
350
|
"eval-runner",
|
|
351
|
+
"eval-worker",
|
|
352
|
+
"eval-runs-reconciler",
|
|
353
|
+
# AgentCore Code Sandbox narrow REST endpoints (plan Unit 10 + Unit 11).
|
|
354
|
+
# Both are service-endpoint shape: the Strands container POSTs with
|
|
355
|
+
# Bearer API_AUTH_SECRET. No GraphQL resolver involvement, no extra IAM.
|
|
356
|
+
"sandbox-quota-check",
|
|
357
|
+
"sandbox-invocation-log",
|
|
358
|
+
# Routines Step Functions ASL validator (plan
|
|
359
|
+
# docs/plans/2026-05-01-004-feat-routines-phase-a-substrate-plan.md §U5).
|
|
360
|
+
# Bearer API_AUTH_SECRET; chat builder + publish flow call this before
|
|
361
|
+
# accepting LLM-emitted ASL. Needs states:ValidateStateMachineDefinition
|
|
362
|
+
# IAM grant — see main.tf.
|
|
363
|
+
"routine-asl-validator",
|
|
364
|
+
# Routines Step Functions Task wrappers (plan
|
|
365
|
+
# docs/plans/2026-05-01-005-feat-routines-phase-b-runtime-plan.md §U6).
|
|
366
|
+
# routine-task-python: SFN-invoked Lambda that runs `python` recipe
|
|
367
|
+
# states in the AgentCore code interpreter, offloading stdout/stderr
|
|
368
|
+
# to the per-stage routine-output bucket. Needs bedrock-agentcore
|
|
369
|
+
# (Start/Invoke/Stop CodeInterpreterSession) + S3 PutObject IAM —
|
|
370
|
+
# see main.tf.
|
|
371
|
+
"routine-task-python",
|
|
372
|
+
# routine-resume: SDK-invoked by routine-approval-bridge (Phase B
|
|
373
|
+
# U8) after a HITL decision. Calls SendTaskSuccess/SendTaskFailure;
|
|
374
|
+
# idempotent on already-consumed tokens. Needs states:SendTaskSuccess
|
|
375
|
+
# + states:SendTaskFailure IAM (already granted in U1's substrate).
|
|
376
|
+
"routine-resume",
|
|
377
|
+
# routine-approval-callback: SFN's inbox_approval Task invokes this
|
|
378
|
+
# via .waitForTaskToken (plan 2026-05-01-005 §U8). Creates the
|
|
379
|
+
# inbox_items row + persists the task token in routine_approval_tokens.
|
|
380
|
+
# No additional IAM beyond the lambda execution role's DB access —
|
|
381
|
+
# the trust boundary is the routines-stepfunctions execution role's
|
|
382
|
+
# lambda:InvokeFunction grant scoped to this Lambda's ARN.
|
|
383
|
+
"routine-approval-callback",
|
|
384
|
+
# routine-step-callback + routine-execution-callback (Phase B U9).
|
|
385
|
+
# Bearer API_AUTH_SECRET ingest endpoints — Task wrappers and the
|
|
386
|
+
# EventBridge SFN-state-change rule POST here. routine-step-callback
|
|
387
|
+
# writes routine_step_events; routine-execution-callback updates
|
|
388
|
+
# routine_executions lifecycle status. Idempotent on the dedup index
|
|
389
|
+
# for steps + on the conditional UPDATE for executions.
|
|
390
|
+
"routine-step-callback",
|
|
391
|
+
"routine-execution-callback",
|
|
392
|
+
# Skill-run dispatcher runtime-config fetch (plan
|
|
393
|
+
# docs/plans/2026-04-24-008-feat-skill-run-dispatcher-plan.md §U1). The
|
|
394
|
+
# Strands container's `kind=run_skill` handler calls this with Bearer
|
|
395
|
+
# API_AUTH_SECRET to pull the agent's template + skills + MCP + KBs
|
|
396
|
+
# before building the headless agent turn.
|
|
397
|
+
"agents-runtime-config",
|
|
398
|
+
# Admin-Ops MCP — JSON-RPC endpoint at POST /mcp/admin, exposes the
|
|
399
|
+
# @thinkwork/admin-ops package as MCP tools for Strands agents.
|
|
400
|
+
"admin-ops-mcp",
|
|
401
|
+
# MCP admin key management — per-tenant Bearer tokens for admin-ops.
|
|
402
|
+
# Admin-ops-mcp authenticates incoming tokens by sha256-hash lookup
|
|
403
|
+
# against tenant_mcp_admin_keys, populated by this handler's routes.
|
|
404
|
+
"mcp-admin-keys",
|
|
405
|
+
# One-shot tenant provisioning: mints a tkm_ key + stores in Secrets
|
|
406
|
+
# Manager at thinkwork/<stage>/mcp/<tenantId>/admin-ops + upserts
|
|
407
|
+
# tenant_mcp_servers. SM IAM is already granted on thinkwork/* by
|
|
408
|
+
# aws_iam_role_policy.lambda_secrets in main.tf (Create/Update/Get).
|
|
409
|
+
"mcp-admin-provision",
|
|
410
|
+
# Plugin-installed MCP server admin approval (plan §U11, SI-5). Cognito
|
|
411
|
+
# JWT admin caller → approve/reject. Approve computes url_hash =
|
|
412
|
+
# sha256(canonical(url, auth_config)) and pins it; any subsequent
|
|
413
|
+
# mutation to those fields reverts the row to 'pending'.
|
|
414
|
+
"mcp-approval",
|
|
415
|
+
# Daily sweeper: auto-rejects MCP servers pending > 30 days. Triggered
|
|
416
|
+
# by EventBridge schedule (mcp-approval-sweeper-daily).
|
|
417
|
+
"mcp-approval-sweeper",
|
|
418
|
+
# Plugin upload REST handler (plan §U10). Four routes:
|
|
419
|
+
# POST /api/plugins/presign + /upload, GET /api/plugins (+ /:uploadId).
|
|
420
|
+
# Cognito JWT; admin-role gated. Needs WORKSPACE_BUCKET env for S3.
|
|
421
|
+
"plugin-upload",
|
|
422
|
+
# Finance pilot U2 — thread-attachment upload (presign + finalize).
|
|
423
|
+
# presign issues a 5-min PUT URL the end-user client uses to push
|
|
424
|
+
# Excel/CSV bytes directly to S3; finalize sniffs magic bytes, scans
|
|
425
|
+
# OOXML containers (rejects macros + external links), inserts
|
|
426
|
+
# thread_attachments, and emits attachment.received audit event.
|
|
427
|
+
# Cognito JWT (end-user-facing — NOT admin-gated); tenant pinned via
|
|
428
|
+
# threads.tenant_id lookup. Needs WORKSPACE_BUCKET env for S3.
|
|
429
|
+
"thread-attachments-presign",
|
|
430
|
+
"thread-attachments-finalize",
|
|
431
|
+
# U9-remainder of finance pilot — tenant-pinned download endpoint.
|
|
432
|
+
# GET /api/threads/{tid}/attachments/{aid}/download returns a 302
|
|
433
|
+
# to a 5-minute presigned S3 GET URL with ResponseContentDisposition:
|
|
434
|
+
# attachment so browsers download rather than render inline. Same
|
|
435
|
+
# tenant-pin discipline as presign/finalize.
|
|
436
|
+
"thread-attachment-download",
|
|
437
|
+
# Folder bundle import (fat-folder plan Phase D). Admin uploads a zip
|
|
438
|
+
# or GitHub ref and the handler normalizes vendor folder layouts into
|
|
439
|
+
# the agent workspace.
|
|
440
|
+
"folder-bundle-import",
|
|
441
|
+
# Hourly sweeper: reaps orphan S3 staging from failed / interrupted
|
|
442
|
+
# plugin install sagas + marks matching plugin_uploads rows 'failed'.
|
|
443
|
+
"plugin-staging-sweeper",
|
|
444
|
+
# Resolved Capability Manifest write endpoint (plan §U15). Strands
|
|
445
|
+
# container POSTs one row per agent-session-start. Shared
|
|
446
|
+
# API_AUTH_SECRET bearer (runtime→API; no tenant OAuth).
|
|
447
|
+
"manifest-log",
|
|
448
|
+
# SI-7 catalog-list read endpoint (plan §U15 pt 3/3). Strands
|
|
449
|
+
# container fetches the allowed builtin-tool slug set once per
|
|
450
|
+
# session-start + feature-flag-gated enforcement filter drops
|
|
451
|
+
# catalog-missing tools before Agent(tools=...). Shared
|
|
452
|
+
# API_AUTH_SECRET bearer.
|
|
453
|
+
"capability-catalog-list",
|
|
454
|
+
# Brain v0 narrow write endpoint. Strands calls this with
|
|
455
|
+
# Bearer API_AUTH_SECRET; GraphQL remains user/admin-facing only.
|
|
456
|
+
"brain-agent-write",
|
|
457
|
+
# Phase 3 U4 of the Compliance audit-event log
|
|
458
|
+
# (docs/plans/2026-05-07-004-feat-compliance-u4-outbox-drainer-plan.md).
|
|
459
|
+
# Single-writer drainer with reserved_concurrent_executions=1 (set
|
|
460
|
+
# below). Connects to Aurora as `compliance_drainer` role via the
|
|
461
|
+
# COMPLIANCE_DRAINER_SECRET_ARN env var (compliance secret created in
|
|
462
|
+
# U2). EventBridge rate(1 minute) schedule + DLQ + MaxRetryAttempts=0
|
|
463
|
+
# (defined in dedicated resources below).
|
|
464
|
+
"compliance-outbox-drainer",
|
|
465
|
+
# Phase 3 U6 of the Compliance audit-event log
|
|
466
|
+
# (docs/plans/2026-05-07-007-feat-compliance-u6-strands-emit-path-plan.md).
|
|
467
|
+
# Cross-runtime emit endpoint POST /api/compliance/events — Bearer
|
|
468
|
+
# API_AUTH_SECRET, Strands Python client posts here with a
|
|
469
|
+
# client-supplied UUIDv7 event_id for idempotency. Connects to
|
|
470
|
+
# Aurora via the master DATABASE_SECRET_ARN like every other narrow
|
|
471
|
+
# handler (compliance_writer role is reserved for future hardening).
|
|
472
|
+
"compliance-events",
|
|
473
|
+
# Phase 3 U8b watchdog moved out of for_each into a standalone
|
|
474
|
+
# aws_lambda_function resource (see below). It now uses a sibling
|
|
475
|
+
# IAM role (kms:DescribeKey only on the CMK; s3:ListBucket scoped
|
|
476
|
+
# to anchors/) instead of the shared aws_iam_role.lambda — the
|
|
477
|
+
# widened S3+KMS grant on the shared role would have leaked into
|
|
478
|
+
# 60+ unrelated handlers. Pre-merge step: `terraform state mv`
|
|
479
|
+
# the existing handler["compliance-anchor-watchdog"] address to the
|
|
480
|
+
# new standalone resource (see U8b plan operator-step section).
|
|
127
481
|
]) : toset([])
|
|
128
482
|
|
|
129
483
|
function_name = "thinkwork-${var.stage}-api-${each.key}"
|
|
@@ -136,15 +490,25 @@ resource "aws_lambda_function" "handler" {
|
|
|
136
490
|
# wiki-bootstrap-import runs a full Hindsight ingest for ~3,000 records;
|
|
137
491
|
# the LLM-backed retain path makes it the longest-running Lambda in the
|
|
138
492
|
# set. 900 s is Lambda's per-invocation max and matches eval-runner's ceiling.
|
|
139
|
-
|
|
140
|
-
|
|
493
|
+
# routine-task-python wraps a 300s sandbox session and needs headroom
|
|
494
|
+
# for the Start/Invoke/Stop/S3-offload round trip; 360s leaves ~60s
|
|
495
|
+
# for AWS-call setup and offload after the sandbox's own ceiling.
|
|
496
|
+
timeout = each.key == "wakeup-processor" ? 300 : each.key == "chat-agent-invoke" ? 300 : each.key == "workspace-event-dispatcher" ? 60 : each.key == "eval-runner" ? 900 : each.key == "eval-worker" ? 240 : each.key == "wiki-compile" ? 480 : each.key == "wiki-lint" ? 300 : each.key == "wiki-export" ? 600 : each.key == "wiki-bootstrap-import" ? 900 : each.key == "folder-bundle-import" ? 300 : each.key == "routine-task-python" ? 360 : 30
|
|
497
|
+
memory_size = each.key == "graphql-http" ? 512 : each.key == "wakeup-processor" ? 512 : each.key == "workspace-event-dispatcher" ? 512 : each.key == "eval-runner" ? 512 : each.key == "eval-worker" ? 512 : each.key == "wiki-compile" ? 1024 : each.key == "wiki-export" ? 1024 : each.key == "wiki-bootstrap-import" ? 1024 : each.key == "folder-bundle-import" ? 1024 : 256
|
|
141
498
|
|
|
142
499
|
filename = "${var.lambda_zips_dir}/${each.key}.zip"
|
|
143
500
|
source_code_hash = filebase64sha256("${var.lambda_zips_dir}/${each.key}.zip")
|
|
144
501
|
|
|
502
|
+
# Per-handler reserved concurrency. compliance-outbox-drainer is a
|
|
503
|
+
# single-writer (per-tenant hash chain integrity depends on it — two
|
|
504
|
+
# concurrent drainers would race the chain head SELECT and produce
|
|
505
|
+
# orphan prev_hash links). eval-worker reserves 20; every other handler uses the default
|
|
506
|
+
# account-level concurrency pool.
|
|
507
|
+
reserved_concurrent_executions = each.key == "compliance-outbox-drainer" ? 1 : each.key == "eval-worker" ? 20 : -1
|
|
508
|
+
|
|
145
509
|
environment {
|
|
146
510
|
variables = merge(
|
|
147
|
-
local.common_env,
|
|
511
|
+
contains(["computer-manager", "computer-runtime-reconciler"], each.key) ? local.computer_runtime_control_base_env : local.common_env,
|
|
148
512
|
{ FUNCTION_NAME = each.key },
|
|
149
513
|
lookup(local.handler_extra_env, each.key, {}),
|
|
150
514
|
)
|
|
@@ -156,6 +520,161 @@ resource "aws_lambda_function" "handler" {
|
|
|
156
520
|
}
|
|
157
521
|
}
|
|
158
522
|
|
|
523
|
+
# ---------------------------------------------------------------------------
|
|
524
|
+
# wiki-compile async retry config + DLQ
|
|
525
|
+
# ---------------------------------------------------------------------------
|
|
526
|
+
#
|
|
527
|
+
# AWS Lambda's default async invoke retries the function 2 times with a
|
|
528
|
+
# growing delay (1 min, then 2 min) before sending failures to a DLQ (or dropping). For
|
|
529
|
+
# wiki-compile, retries duplicate Bedrock cost AND can produce duplicate
|
|
530
|
+
# user-visible threads + workspace_runs (the brain-enrichment draft path
|
|
531
|
+
# in particular — see plan 2026-05-01-002 U5/U6 and
|
|
532
|
+
# docs/solutions/architecture-patterns/async-retry-idempotency-lessons).
|
|
533
|
+
#
|
|
534
|
+
# Pin retries to 0 and route failures to a dedicated DLQ. The runner's
|
|
535
|
+
# job-status short-circuit (running/succeeded/failed/skipped) is the
|
|
536
|
+
# in-process protection against duplicate writebacks; this is the
|
|
537
|
+
# infrastructure-level belt-and-suspenders.
|
|
538
|
+
|
|
539
|
+
resource "aws_sqs_queue" "wiki_compile_dlq" {
|
|
540
|
+
count = local.use_local_zips ? 1 : 0
|
|
541
|
+
name = "thinkwork-${var.stage}-wiki-compile-dlq"
|
|
542
|
+
message_retention_seconds = 1209600 # 14 days
|
|
543
|
+
|
|
544
|
+
tags = {
|
|
545
|
+
Name = "thinkwork-${var.stage}-wiki-compile-dlq"
|
|
546
|
+
}
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
resource "aws_iam_role_policy" "wiki_compile_dlq_send" {
|
|
550
|
+
count = local.use_local_zips ? 1 : 0
|
|
551
|
+
name = "thinkwork-${var.stage}-wiki-compile-dlq-send"
|
|
552
|
+
role = aws_iam_role.lambda.id
|
|
553
|
+
|
|
554
|
+
policy = jsonencode({
|
|
555
|
+
Version = "2012-10-17"
|
|
556
|
+
Statement = [{
|
|
557
|
+
Effect = "Allow"
|
|
558
|
+
Action = ["sqs:SendMessage"]
|
|
559
|
+
Resource = aws_sqs_queue.wiki_compile_dlq[0].arn
|
|
560
|
+
}]
|
|
561
|
+
})
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
resource "aws_lambda_function_event_invoke_config" "wiki_compile" {
|
|
565
|
+
count = local.use_local_zips ? 1 : 0
|
|
566
|
+
function_name = aws_lambda_function.handler["wiki-compile"].function_name
|
|
567
|
+
maximum_retry_attempts = 0
|
|
568
|
+
maximum_event_age_in_seconds = 3600
|
|
569
|
+
|
|
570
|
+
destination_config {
|
|
571
|
+
on_failure {
|
|
572
|
+
destination = aws_sqs_queue.wiki_compile_dlq[0].arn
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
# Phase B U8: SFN's inbox_approval Task invokes routine-approval-callback
|
|
578
|
+
# directly via .waitForTaskToken. Lambda's default async-retry policy
|
|
579
|
+
# (2 attempts) is incompatible with the callback's two-insert flow —
|
|
580
|
+
# even though the inserts are now wrapped in db.transaction(), AWS
|
|
581
|
+
# Lambda's own retry-after-error semantics multiply with SFN's task
|
|
582
|
+
# Retry policy and create thundering-herd attempts on transient
|
|
583
|
+
# failures. SFN is the canonical retry path; Lambda async retries are
|
|
584
|
+
# off. Per project_async_retry_idempotency_lessons.
|
|
585
|
+
resource "aws_lambda_function_event_invoke_config" "routine_approval_callback" {
|
|
586
|
+
count = local.use_local_zips ? 1 : 0
|
|
587
|
+
function_name = aws_lambda_function.handler["routine-approval-callback"].function_name
|
|
588
|
+
maximum_retry_attempts = 0
|
|
589
|
+
maximum_event_age_in_seconds = 3600
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
# Per-turn auto-retain: the runtime (Strands + Flue) Event-invokes
|
|
593
|
+
# memory-retain after every chat turn. AWS Lambda's default async-retry
|
|
594
|
+
# policy is 2 attempts; without overriding it, a transient failure on the
|
|
595
|
+
# canonical-transcript fetch or adapter write retries the entire writeback
|
|
596
|
+
# and can multi-write the same per-turn document into Hindsight. The
|
|
597
|
+
# longest-suffix-prefix merge in memory-retain.ts dedupes content but the
|
|
598
|
+
# retain-cost path (Bedrock tokens charged in adapter.retainConversation)
|
|
599
|
+
# is NOT idempotent — retries multiply LLM cost. Per
|
|
600
|
+
# project_async_retry_idempotency_lessons.
|
|
601
|
+
resource "aws_lambda_function_event_invoke_config" "memory_retain" {
|
|
602
|
+
count = local.use_local_zips ? 1 : 0
|
|
603
|
+
function_name = aws_lambda_function.handler["memory-retain"].function_name
|
|
604
|
+
maximum_retry_attempts = 0
|
|
605
|
+
maximum_event_age_in_seconds = 3600
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
# ---------------------------------------------------------------------------
|
|
609
|
+
# Phase 3 U4: compliance-outbox-drainer DLQ + async retry config
|
|
610
|
+
#
|
|
611
|
+
# AWS Lambda's default async-retry policy is 2 attempts. The drainer's
|
|
612
|
+
# INSERT ... ON CONFLICT (outbox_id) DO NOTHING makes per-row replay
|
|
613
|
+
# safe, but reserved-concurrency=1 + retry-0 is the architectural
|
|
614
|
+
# guarantee that we never have two drainers racing the chain head.
|
|
615
|
+
# Per project_async_retry_idempotency_lessons.
|
|
616
|
+
# ---------------------------------------------------------------------------
|
|
617
|
+
|
|
618
|
+
resource "aws_sqs_queue" "compliance_drainer_dlq" {
|
|
619
|
+
count = local.use_local_zips ? 1 : 0
|
|
620
|
+
name = "thinkwork-${var.stage}-compliance-drainer-dlq"
|
|
621
|
+
message_retention_seconds = 1209600 # 14 days
|
|
622
|
+
|
|
623
|
+
tags = {
|
|
624
|
+
Name = "thinkwork-${var.stage}-compliance-drainer-dlq"
|
|
625
|
+
}
|
|
626
|
+
}
|
|
627
|
+
|
|
628
|
+
resource "aws_iam_role_policy" "compliance_drainer_dlq_send" {
|
|
629
|
+
count = local.use_local_zips ? 1 : 0
|
|
630
|
+
name = "compliance-drainer-dlq-send"
|
|
631
|
+
role = aws_iam_role.lambda.id
|
|
632
|
+
|
|
633
|
+
policy = jsonencode({
|
|
634
|
+
Version = "2012-10-17"
|
|
635
|
+
Statement = [{
|
|
636
|
+
Effect = "Allow"
|
|
637
|
+
Action = ["sqs:SendMessage"]
|
|
638
|
+
Resource = aws_sqs_queue.compliance_drainer_dlq[0].arn
|
|
639
|
+
}]
|
|
640
|
+
})
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
resource "aws_lambda_function_event_invoke_config" "compliance_outbox_drainer" {
|
|
644
|
+
count = local.use_local_zips ? 1 : 0
|
|
645
|
+
function_name = aws_lambda_function.handler["compliance-outbox-drainer"].function_name
|
|
646
|
+
maximum_retry_attempts = 0
|
|
647
|
+
maximum_event_age_in_seconds = 3600
|
|
648
|
+
|
|
649
|
+
destination_config {
|
|
650
|
+
on_failure {
|
|
651
|
+
destination = aws_sqs_queue.compliance_drainer_dlq[0].arn
|
|
652
|
+
}
|
|
653
|
+
}
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
# ---------------------------------------------------------------------------
|
|
657
|
+
# Phase 3 U4: compliance-outbox-drainer EventBridge schedule (every 1 min)
|
|
658
|
+
# ---------------------------------------------------------------------------
|
|
659
|
+
|
|
660
|
+
resource "aws_scheduler_schedule" "compliance_outbox_drainer" {
|
|
661
|
+
count = local.use_local_zips ? 1 : 0
|
|
662
|
+
|
|
663
|
+
name = "thinkwork-${var.stage}-compliance-outbox-drainer"
|
|
664
|
+
group_name = "default"
|
|
665
|
+
schedule_expression = "rate(1 minutes)"
|
|
666
|
+
state = "ENABLED"
|
|
667
|
+
|
|
668
|
+
flexible_time_window {
|
|
669
|
+
mode = "OFF"
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
target {
|
|
673
|
+
arn = aws_lambda_function.handler["compliance-outbox-drainer"].arn
|
|
674
|
+
role_arn = aws_iam_role.scheduler.arn
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
|
|
159
678
|
# ---------------------------------------------------------------------------
|
|
160
679
|
# API Gateway routes → Lambda integrations
|
|
161
680
|
# ---------------------------------------------------------------------------
|
|
@@ -198,10 +717,35 @@ locals {
|
|
|
198
717
|
"ANY /api/invites/{proxy+}" = "invites"
|
|
199
718
|
"ANY /api/invites" = "invites"
|
|
200
719
|
|
|
720
|
+
# Compliance audit-event emit (Phase 3 U6) — narrow Bearer
|
|
721
|
+
# API_AUTH_SECRET endpoint, Strands Python client posts here.
|
|
722
|
+
"POST /api/compliance/events" = "compliance-events"
|
|
723
|
+
|
|
201
724
|
# Skills
|
|
202
725
|
"ANY /api/skills/{proxy+}" = "skills"
|
|
203
726
|
"ANY /api/skills" = "skills"
|
|
204
727
|
|
|
728
|
+
# User Memory MCP OAuth/resource-server unblocker. These endpoints are
|
|
729
|
+
# enough for `codex mcp login thinkwork-user-memory-dev` to discover OAuth,
|
|
730
|
+
# register as a public PKCE client, sign the user in through Cognito, and
|
|
731
|
+
# receive a bearer token for the User Memory MCP resource.
|
|
732
|
+
"GET /.well-known/oauth-protected-resource" = "mcp-oauth"
|
|
733
|
+
"GET /.well-known/oauth-protected-resource/{proxy+}" = "mcp-oauth"
|
|
734
|
+
"GET /.well-known/oauth-authorization-server" = "mcp-oauth"
|
|
735
|
+
"GET /.well-known/openid-configuration" = "mcp-oauth"
|
|
736
|
+
"GET /mcp/oauth/jwks" = "mcp-oauth"
|
|
737
|
+
"POST /mcp/oauth/register" = "mcp-oauth"
|
|
738
|
+
"GET /mcp/oauth/authorize" = "mcp-oauth"
|
|
739
|
+
"GET /mcp/oauth/callback" = "mcp-oauth"
|
|
740
|
+
"POST /mcp/oauth/token" = "mcp-oauth"
|
|
741
|
+
"POST /mcp/oauth/revoke" = "mcp-oauth"
|
|
742
|
+
"ANY /mcp/user-memory" = "mcp-user-memory"
|
|
743
|
+
"ANY /mcp/context-engine" = "mcp-context-engine"
|
|
744
|
+
|
|
745
|
+
# Brain v0 service-auth writeback.
|
|
746
|
+
"POST /api/brain/agent-write" = "brain-agent-write"
|
|
747
|
+
"OPTIONS /api/brain/agent-write" = "brain-agent-write"
|
|
748
|
+
|
|
205
749
|
# Activity
|
|
206
750
|
"ANY /api/activity/{proxy+}" = "activity"
|
|
207
751
|
"ANY /api/activity" = "activity"
|
|
@@ -212,6 +756,20 @@ locals {
|
|
|
212
756
|
"GET /api/oauth/authorize" = "oauth-authorize"
|
|
213
757
|
"GET /api/oauth/callback" = "oauth-callback"
|
|
214
758
|
|
|
759
|
+
# Stripe billing (unauthenticated — checkout is pre-signup; webhook is
|
|
760
|
+
# server-to-server with Stripe signature verification).
|
|
761
|
+
"POST /api/stripe/checkout-session" = "stripe-checkout"
|
|
762
|
+
"OPTIONS /api/stripe/checkout-session" = "stripe-checkout"
|
|
763
|
+
"POST /api/stripe/webhook" = "stripe-webhook"
|
|
764
|
+
"POST /api/stripe/portal-session" = "stripe-portal"
|
|
765
|
+
"OPTIONS /api/stripe/portal-session" = "stripe-portal"
|
|
766
|
+
"GET /api/stripe/subscription" = "stripe-subscription"
|
|
767
|
+
"OPTIONS /api/stripe/subscription" = "stripe-subscription"
|
|
768
|
+
"GET /api/auth/me" = "auth-me"
|
|
769
|
+
"OPTIONS /api/auth/me" = "auth-me"
|
|
770
|
+
"ANY /api/extensions/{extensionId}" = "extension-proxy"
|
|
771
|
+
"ANY /api/extensions/{extensionId}/{proxy+}" = "extension-proxy"
|
|
772
|
+
|
|
215
773
|
# Routines
|
|
216
774
|
"ANY /api/routines/{proxy+}" = "routines"
|
|
217
775
|
"ANY /api/routines" = "routines"
|
|
@@ -234,7 +792,15 @@ locals {
|
|
|
234
792
|
"ANY /api/job-schedules/{proxy+}" = "job-schedule-manager"
|
|
235
793
|
"ANY /api/job-schedules" = "job-schedule-manager"
|
|
236
794
|
|
|
237
|
-
#
|
|
795
|
+
# Integration webhooks (Unit 8 — composable-skills). Each integration
|
|
796
|
+
# has its own Lambda + a specific route under /webhooks/{integration}/
|
|
797
|
+
# {tenantId}. Specific routes take precedence over the {proxy+}
|
|
798
|
+
# catch-all below, which still owns the legacy PRD-19 webhook-token
|
|
799
|
+
# surface.
|
|
800
|
+
"POST /webhooks/crm-opportunity/{tenantId}" = "webhook-crm-opportunity"
|
|
801
|
+
"POST /webhooks/task-event/{tenantId}" = "webhook-task-event"
|
|
802
|
+
|
|
803
|
+
# Webhooks (public trigger) — legacy PRD-19 tokenized webhooks.
|
|
238
804
|
"POST /webhooks/{proxy+}" = "webhooks"
|
|
239
805
|
|
|
240
806
|
# Webhooks admin
|
|
@@ -244,12 +810,25 @@ locals {
|
|
|
244
810
|
# Workspace files
|
|
245
811
|
"ANY /api/workspaces/{proxy+}" = "workspace-files"
|
|
246
812
|
|
|
813
|
+
# Phase-one Computer migration. Service-auth only; operator tooling calls
|
|
814
|
+
# dry-run first and apply only after conflict review.
|
|
815
|
+
"POST /api/migrations/agents-to-computers" = "migrate-agents-to-computers"
|
|
816
|
+
"OPTIONS /api/migrations/agents-to-computers" = "migrate-agents-to-computers"
|
|
817
|
+
|
|
247
818
|
# Knowledge bases
|
|
248
819
|
"ANY /api/knowledge-bases/{proxy+}" = "knowledge-base-files"
|
|
249
820
|
|
|
250
821
|
# Email
|
|
251
822
|
"POST /api/email/send" = "email-send"
|
|
252
823
|
|
|
824
|
+
# Slack workspace app ingress. These unauthenticated public endpoints
|
|
825
|
+
# verify Slack signatures in handler code before any tenant work happens.
|
|
826
|
+
"POST /slack/events" = "slack-events"
|
|
827
|
+
"POST /slack/slash-command" = "slack-slash-command"
|
|
828
|
+
"POST /slack/interactivity" = "slack-interactivity"
|
|
829
|
+
"GET /slack/oauth/install" = "slack-oauth-install"
|
|
830
|
+
"POST /slack/oauth/install" = "slack-oauth-install"
|
|
831
|
+
|
|
253
832
|
# Memory
|
|
254
833
|
"ANY /api/memory/{proxy+}" = "memory"
|
|
255
834
|
|
|
@@ -262,6 +841,121 @@ locals {
|
|
|
262
841
|
# GitHub App
|
|
263
842
|
"ANY /api/github-app/{proxy+}" = "github-app"
|
|
264
843
|
"POST /api/github/webhook" = "github-app"
|
|
844
|
+
|
|
845
|
+
# AgentCore Code Sandbox (plan Unit 10 + Unit 11). Strands container
|
|
846
|
+
# calls both with Bearer API_AUTH_SECRET before + after every
|
|
847
|
+
# executeCode. 429 on quota denial, 201 on audit-row insert.
|
|
848
|
+
"POST /api/sandbox/quota/check-and-increment" = "sandbox-quota-check"
|
|
849
|
+
"POST /api/sandbox/invocations" = "sandbox-invocation-log"
|
|
850
|
+
|
|
851
|
+
# Routines ASL validator (plan 2026-05-01-004 §U5). Bearer
|
|
852
|
+
# API_AUTH_SECRET. Chat builder + publish flow POST the candidate
|
|
853
|
+
# ASL document; returns { valid, errors, warnings }.
|
|
854
|
+
"POST /api/routines/validate" = "routine-asl-validator"
|
|
855
|
+
"OPTIONS /api/routines/validate" = "routine-asl-validator"
|
|
856
|
+
|
|
857
|
+
# Routines step-event ingest (plan 2026-05-01-005 §U9). Task wrappers
|
|
858
|
+
# (routine-task-python, routine-resume) POST per-step status
|
|
859
|
+
# transitions; the EventBridge rule in routines-stepfunctions/main.tf
|
|
860
|
+
# POSTs SFN execution-state-change events here for the agent_invoke
|
|
861
|
+
# recipe path (no wrapper Lambda). Bearer API_AUTH_SECRET. Idempotent
|
|
862
|
+
# via partial unique index on (execution_id, node_id, status,
|
|
863
|
+
# started_at) — see migration 0056.
|
|
864
|
+
"POST /api/routines/step" = "routine-step-callback"
|
|
865
|
+
"OPTIONS /api/routines/step" = "routine-step-callback"
|
|
866
|
+
"POST /api/routines/execution" = "routine-execution-callback"
|
|
867
|
+
"OPTIONS /api/routines/execution" = "routine-execution-callback"
|
|
868
|
+
|
|
869
|
+
# Skill-run dispatcher runtime-config fetch. Service-auth GET.
|
|
870
|
+
"GET /api/agents/runtime-config" = "agents-runtime-config"
|
|
871
|
+
|
|
872
|
+
# ThinkWork Computer runtime callback API. ECS tasks call outbound with
|
|
873
|
+
# Bearer API_AUTH_SECRET to fetch config, heartbeat, claim one task, append
|
|
874
|
+
# product/audit events, and complete/fail tasks.
|
|
875
|
+
"ANY /api/computers/runtime/{proxy+}" = "computer-runtime"
|
|
876
|
+
|
|
877
|
+
# Admin Terminal tab — opens an ECS Exec session into the running
|
|
878
|
+
# Computer task and returns {sessionId, streamUrl, tokenValue} to the
|
|
879
|
+
# browser, which then connects WebSocket directly to ssmmessages.
|
|
880
|
+
"POST /api/computers/{computerId}/terminal/start" = "computer-terminal-start"
|
|
881
|
+
"OPTIONS /api/computers/{computerId}/terminal/start" = "computer-terminal-start"
|
|
882
|
+
|
|
883
|
+
# ThinkWork Computer manager API. Internal service-auth endpoint used by
|
|
884
|
+
# admin operations to reconcile per-Computer ECS service desired state.
|
|
885
|
+
"POST /api/computers/manager" = "computer-manager"
|
|
886
|
+
"OPTIONS /api/computers/manager" = "computer-manager"
|
|
887
|
+
|
|
888
|
+
# Admin-Ops MCP server — single JSON-RPC endpoint. Strands agents
|
|
889
|
+
# (and anyone else) POST with Bearer <tenant-scoped token> issued by
|
|
890
|
+
# the mcp-admin-keys handler below. The shared API_AUTH_SECRET is
|
|
891
|
+
# retained as a break-glass superuser path for bootstrap/debug.
|
|
892
|
+
"POST /mcp/admin" = "admin-ops-mcp"
|
|
893
|
+
|
|
894
|
+
# MCP admin key management — per-tenant Bearer token CRUD. Tokens
|
|
895
|
+
# are shown ONCE at creation (POST returns raw value); server stores
|
|
896
|
+
# sha256 hash only. These specific routes take precedence over the
|
|
897
|
+
# existing `ANY /api/tenants/{proxy+}` route (tenants handler) per
|
|
898
|
+
# API Gateway v2's most-specific-match rule.
|
|
899
|
+
"POST /api/tenants/{tenantId}/mcp-admin-keys" = "mcp-admin-keys"
|
|
900
|
+
"GET /api/tenants/{tenantId}/mcp-admin-keys" = "mcp-admin-keys"
|
|
901
|
+
"DELETE /api/tenants/{tenantId}/mcp-admin-keys/{keyId}" = "mcp-admin-keys"
|
|
902
|
+
|
|
903
|
+
# One-shot tenant provisioning for the admin-ops MCP. Mints a fresh
|
|
904
|
+
# tkm_ key + stores it in Secrets Manager at
|
|
905
|
+
# thinkwork/<stage>/mcp/<tenantId>/admin-ops + upserts the
|
|
906
|
+
# tenant_mcp_servers row so the runtime picks the server up for
|
|
907
|
+
# any agent that gets it assigned via agent_mcp_servers.
|
|
908
|
+
"POST /api/tenants/{tenantId}/mcp-admin-provision" = "mcp-admin-provision"
|
|
909
|
+
|
|
910
|
+
# MCP server admin approval (plan §U11, SI-5). Plugin-uploaded MCP
|
|
911
|
+
# servers land with status='pending'; these routes flip them to
|
|
912
|
+
# approved/rejected. Cognito JWT only (mcp-approval handler rejects
|
|
913
|
+
# apikey callers) — the admin SPA is the sole UI surface.
|
|
914
|
+
"POST /api/tenants/{tenantId}/mcp-servers/{serverId}/approve" = "mcp-approval"
|
|
915
|
+
"OPTIONS /api/tenants/{tenantId}/mcp-servers/{serverId}/approve" = "mcp-approval"
|
|
916
|
+
"POST /api/tenants/{tenantId}/mcp-servers/{serverId}/reject" = "mcp-approval"
|
|
917
|
+
"OPTIONS /api/tenants/{tenantId}/mcp-servers/{serverId}/reject" = "mcp-approval"
|
|
918
|
+
|
|
919
|
+
# Plugin upload admin surface (plan §U10). Admin SPA drives the full
|
|
920
|
+
# flow: POST /presign → browser PUT to presigned S3 URL → POST /upload
|
|
921
|
+
# (validator + three-phase install saga). GET routes back the admin's
|
|
922
|
+
# plugin history view. handleCors() short-circuits OPTIONS before auth
|
|
923
|
+
# — required for the browser to preflight successfully.
|
|
924
|
+
"POST /api/plugins/presign" = "plugin-upload"
|
|
925
|
+
"OPTIONS /api/plugins/presign" = "plugin-upload"
|
|
926
|
+
"POST /api/plugins/upload" = "plugin-upload"
|
|
927
|
+
"OPTIONS /api/plugins/upload" = "plugin-upload"
|
|
928
|
+
"GET /api/plugins" = "plugin-upload"
|
|
929
|
+
"OPTIONS /api/plugins" = "plugin-upload"
|
|
930
|
+
"GET /api/plugins/{uploadId}" = "plugin-upload"
|
|
931
|
+
"OPTIONS /api/plugins/{uploadId}" = "plugin-upload"
|
|
932
|
+
|
|
933
|
+
# Finance pilot U2 — thread-attachment upload (presign + finalize).
|
|
934
|
+
# Cognito JWT; tenant pinned via threads.tenant_id lookup. OPTIONS
|
|
935
|
+
# is handled inside the Lambda before auth.
|
|
936
|
+
"POST /api/threads/{threadId}/attachments/presign" = "thread-attachments-presign"
|
|
937
|
+
"OPTIONS /api/threads/{threadId}/attachments/presign" = "thread-attachments-presign"
|
|
938
|
+
"POST /api/threads/{threadId}/attachments/finalize" = "thread-attachments-finalize"
|
|
939
|
+
"OPTIONS /api/threads/{threadId}/attachments/finalize" = "thread-attachments-finalize"
|
|
940
|
+
|
|
941
|
+
# U9-remainder of finance pilot — tenant-pinned download endpoint.
|
|
942
|
+
"GET /api/threads/{threadId}/attachments/{attachmentId}/download" = "thread-attachment-download"
|
|
943
|
+
"OPTIONS /api/threads/{threadId}/attachments/{attachmentId}/download" = "thread-attachment-download"
|
|
944
|
+
|
|
945
|
+
# Fat-folder bundle import. OPTIONS is handled inside the Lambda before auth.
|
|
946
|
+
"POST /api/agents/{agentId}/import-bundle" = "folder-bundle-import"
|
|
947
|
+
"OPTIONS /api/agents/{agentId}/import-bundle" = "folder-bundle-import"
|
|
948
|
+
|
|
949
|
+
# Resolved Capability Manifest write endpoint (plan §U15). Strands
|
|
950
|
+
# container posts one row per agent-session-start. Shared
|
|
951
|
+
# API_AUTH_SECRET; no tenant OAuth.
|
|
952
|
+
"POST /api/runtime/manifests" = "manifest-log"
|
|
953
|
+
"OPTIONS /api/runtime/manifests" = "manifest-log"
|
|
954
|
+
|
|
955
|
+
# SI-7 catalog-list read (plan §U15 pt 3/3). Strands container fetches
|
|
956
|
+
# the allowed slug set once per session-start. Shared API_AUTH_SECRET.
|
|
957
|
+
"GET /api/runtime/capability-catalog" = "capability-catalog-list"
|
|
958
|
+
"OPTIONS /api/runtime/capability-catalog" = "capability-catalog-list"
|
|
265
959
|
} : {}
|
|
266
960
|
}
|
|
267
961
|
|
|
@@ -336,10 +1030,197 @@ resource "aws_scheduler_schedule" "webhook_deliveries_cleanup" {
|
|
|
336
1030
|
}
|
|
337
1031
|
}
|
|
338
1032
|
|
|
1033
|
+
# ---------------------------------------------------------------------------
|
|
1034
|
+
# Plugin staging sweeper — hourly orphan-S3 cleanup for interrupted install
|
|
1035
|
+
# sagas (plan §U10). The Lambda role (bucket named by the WORKSPACE_BUCKET env) already grants
|
|
1036
|
+
# the list+delete IAM; this schedule is the hourly trigger. The sweeper's
|
|
1037
|
+
# own cutoff constant (60 min) is independent of this cron cadence.
|
|
1038
|
+
# ---------------------------------------------------------------------------
|
|
1039
|
+
|
|
1040
|
+
resource "aws_scheduler_schedule" "plugin_staging_sweeper" {
|
|
1041
|
+
count = local.use_local_zips ? 1 : 0
|
|
1042
|
+
|
|
1043
|
+
name = "thinkwork-${var.stage}-plugin-staging-sweeper"
|
|
1044
|
+
group_name = "default"
|
|
1045
|
+
schedule_expression = "rate(1 hour)"
|
|
1046
|
+
state = "ENABLED"
|
|
1047
|
+
|
|
1048
|
+
flexible_time_window {
|
|
1049
|
+
mode = "OFF"
|
|
1050
|
+
}
|
|
1051
|
+
|
|
1052
|
+
target {
|
|
1053
|
+
arn = aws_lambda_function.handler["plugin-staging-sweeper"].arn
|
|
1054
|
+
role_arn = aws_iam_role.scheduler.arn
|
|
1055
|
+
}
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
# ---------------------------------------------------------------------------
|
|
1059
|
+
# MCP approval TTL sweeper — daily auto-reject of pending rows > 30 days old
|
|
1060
|
+
# (plan §U11). A plugin whose MCP sat uncurated for a month is stale: clear
|
|
1061
|
+
# pending to keep the admin queue honest and surface the reject action in
|
|
1062
|
+
# the audit log.
|
|
1063
|
+
# ---------------------------------------------------------------------------
|
|
1064
|
+
|
|
1065
|
+
resource "aws_scheduler_schedule" "mcp_approval_sweeper" {
|
|
1066
|
+
count = local.use_local_zips ? 1 : 0
|
|
1067
|
+
|
|
1068
|
+
name = "thinkwork-${var.stage}-mcp-approval-sweeper"
|
|
1069
|
+
group_name = "default"
|
|
1070
|
+
schedule_expression = "cron(15 4 * * ? *)" # daily at 04:15 UTC (offset from webhook cleanup)
|
|
1071
|
+
state = "ENABLED"
|
|
1072
|
+
|
|
1073
|
+
flexible_time_window {
|
|
1074
|
+
mode = "OFF"
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
target {
|
|
1078
|
+
arn = aws_lambda_function.handler["mcp-approval-sweeper"].arn
|
|
1079
|
+
role_arn = aws_iam_role.scheduler.arn
|
|
1080
|
+
}
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1083
|
+
# ---------------------------------------------------------------------------
|
|
1084
|
+
# skill_runs reconciler — transitions stuck-running rows to failed every 5 min.
|
|
1085
|
+
# Guards against agentcore Lambda crashes / OOMs that drop the
|
|
1086
|
+
# /api/skills/complete writeback and leave the row at 'running' forever,
|
|
1087
|
+
# which in turn blocks the dedup partial unique index from letting retries
|
|
1088
|
+
# through.
|
|
1089
|
+
# ---------------------------------------------------------------------------
|
|
1090
|
+
|
|
1091
|
+
resource "aws_scheduler_schedule" "skill_runs_reconciler" {
|
|
1092
|
+
count = local.use_local_zips ? 1 : 0
|
|
1093
|
+
|
|
1094
|
+
name = "thinkwork-${var.stage}-skill-runs-reconciler"
|
|
1095
|
+
group_name = "default"
|
|
1096
|
+
schedule_expression = "rate(5 minutes)"
|
|
1097
|
+
state = "ENABLED"
|
|
1098
|
+
|
|
1099
|
+
flexible_time_window {
|
|
1100
|
+
mode = "OFF"
|
|
1101
|
+
}
|
|
1102
|
+
|
|
1103
|
+
target {
|
|
1104
|
+
arn = aws_lambda_function.handler["skill-runs-reconciler"].arn
|
|
1105
|
+
role_arn = aws_iam_role.scheduler.arn
|
|
1106
|
+
}
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
# ---------------------------------------------------------------------------
|
|
1110
|
+
# eval_runs reconciler — finalizes stuck-running eval runs every 5 min.
|
|
1111
|
+
# Guards against worker crashes/timeouts that occur before a per-case result
|
|
1112
|
+
# row is written. Missing category-selected cases are recorded as error rows,
|
|
1113
|
+
# then the run is finalized so the Admin UI cannot remain "running" forever.
|
|
1114
|
+
# ---------------------------------------------------------------------------
|
|
1115
|
+
|
|
1116
|
+
resource "aws_scheduler_schedule" "eval_runs_reconciler" {
|
|
1117
|
+
count = local.use_local_zips ? 1 : 0
|
|
1118
|
+
|
|
1119
|
+
name = "thinkwork-${var.stage}-eval-runs-reconciler"
|
|
1120
|
+
group_name = "default"
|
|
1121
|
+
schedule_expression = "rate(5 minutes)"
|
|
1122
|
+
state = "ENABLED"
|
|
1123
|
+
|
|
1124
|
+
flexible_time_window {
|
|
1125
|
+
mode = "OFF"
|
|
1126
|
+
}
|
|
1127
|
+
|
|
1128
|
+
target {
|
|
1129
|
+
arn = aws_lambda_function.handler["eval-runs-reconciler"].arn
|
|
1130
|
+
role_arn = aws_iam_role.scheduler.arn
|
|
1131
|
+
}
|
|
1132
|
+
}
|
|
1133
|
+
|
|
1134
|
+
# ---------------------------------------------------------------------------
|
|
1135
|
+
# Stall monitor — marks stalled thread turns and runbook steps failed every
|
|
1136
|
+
# minute. This is the global backstop for agent/runtime crashes; the Computer
|
|
1137
|
+
# heartbeat also reconciles its own stale runbook tasks while it is alive.
|
|
1138
|
+
# ---------------------------------------------------------------------------
|
|
1139
|
+
|
|
1140
|
+
resource "aws_scheduler_schedule" "stall_monitor" {
|
|
1141
|
+
count = local.use_local_zips ? 1 : 0
|
|
1142
|
+
|
|
1143
|
+
name = "thinkwork-${var.stage}-stall-monitor"
|
|
1144
|
+
group_name = "default"
|
|
1145
|
+
schedule_expression = "rate(1 minutes)"
|
|
1146
|
+
state = "ENABLED"
|
|
1147
|
+
|
|
1148
|
+
flexible_time_window {
|
|
1149
|
+
mode = "OFF"
|
|
1150
|
+
}
|
|
1151
|
+
|
|
1152
|
+
target {
|
|
1153
|
+
arn = aws_lambda_function.handler["cron-stall-monitor"].arn
|
|
1154
|
+
role_arn = aws_iam_role.scheduler.arn
|
|
1155
|
+
}
|
|
1156
|
+
}
|
|
1157
|
+
|
|
1158
|
+
# ---------------------------------------------------------------------------
|
|
1159
|
+
# ThinkWork Computer runtime reconciler — keeps active Computers aligned with
|
|
1160
|
+
# desired_runtime_status by provisioning/starting/stopping ECS services in
|
|
1161
|
+
# bounded batches. The handler is conservative and records per-Computer events
|
|
1162
|
+
# for every attempted action.
|
|
1163
|
+
# ---------------------------------------------------------------------------
|
|
1164
|
+
|
|
1165
|
+
resource "aws_scheduler_schedule" "computer_runtime_reconciler" {
|
|
1166
|
+
count = local.use_local_zips ? 1 : 0
|
|
1167
|
+
|
|
1168
|
+
name = "thinkwork-${var.stage}-computer-runtime-reconciler"
|
|
1169
|
+
group_name = "default"
|
|
1170
|
+
schedule_expression = "rate(5 minutes)"
|
|
1171
|
+
state = "ENABLED"
|
|
1172
|
+
|
|
1173
|
+
flexible_time_window {
|
|
1174
|
+
mode = "OFF"
|
|
1175
|
+
}
|
|
1176
|
+
|
|
1177
|
+
target {
|
|
1178
|
+
arn = aws_lambda_function.handler["computer-runtime-reconciler"].arn
|
|
1179
|
+
role_arn = aws_iam_role.scheduler.arn
|
|
1180
|
+
}
|
|
1181
|
+
}
|
|
1182
|
+
|
|
1183
|
+
resource "aws_scheduler_schedule" "slack_dispatch" {
|
|
1184
|
+
count = local.use_local_zips ? 1 : 0
|
|
1185
|
+
|
|
1186
|
+
name = "thinkwork-${var.stage}-slack-dispatch"
|
|
1187
|
+
group_name = "default"
|
|
1188
|
+
schedule_expression = "rate(1 minute)"
|
|
1189
|
+
state = "ENABLED"
|
|
1190
|
+
|
|
1191
|
+
flexible_time_window {
|
|
1192
|
+
mode = "OFF"
|
|
1193
|
+
}
|
|
1194
|
+
|
|
1195
|
+
target {
|
|
1196
|
+
arn = aws_lambda_function.handler["slack-dispatch"].arn
|
|
1197
|
+
role_arn = aws_iam_role.scheduler.arn
|
|
1198
|
+
input = jsonencode({ limit = 25 })
|
|
1199
|
+
}
|
|
1200
|
+
}
|
|
1201
|
+
|
|
339
1202
|
# ---------------------------------------------------------------------------
|
|
340
1203
|
# Compounding Memory — nightly hygiene + export
|
|
341
1204
|
# ---------------------------------------------------------------------------
|
|
342
1205
|
|
|
1206
|
+
resource "aws_scheduler_schedule" "wiki_compile_drainer" {
|
|
1207
|
+
count = local.use_local_zips ? 1 : 0
|
|
1208
|
+
|
|
1209
|
+
name = "thinkwork-${var.stage}-wiki-compile-drainer"
|
|
1210
|
+
group_name = "default"
|
|
1211
|
+
schedule_expression = "rate(1 minutes)"
|
|
1212
|
+
state = "ENABLED"
|
|
1213
|
+
|
|
1214
|
+
flexible_time_window {
|
|
1215
|
+
mode = "OFF"
|
|
1216
|
+
}
|
|
1217
|
+
|
|
1218
|
+
target {
|
|
1219
|
+
arn = aws_lambda_function.handler["wiki-compile"].arn
|
|
1220
|
+
role_arn = aws_iam_role.scheduler.arn
|
|
1221
|
+
}
|
|
1222
|
+
}
|
|
1223
|
+
|
|
343
1224
|
resource "aws_scheduler_schedule" "wiki_lint" {
|
|
344
1225
|
count = local.use_local_zips ? 1 : 0
|
|
345
1226
|
|
|
@@ -417,8 +1298,8 @@ resource "aws_iam_role_policy" "lambda_wiki_exports_s3" {
|
|
|
417
1298
|
policy = jsonencode({
|
|
418
1299
|
Version = "2012-10-17"
|
|
419
1300
|
Statement = [{
|
|
420
|
-
Effect
|
|
421
|
-
Action
|
|
1301
|
+
Effect = "Allow"
|
|
1302
|
+
Action = ["s3:PutObject", "s3:AbortMultipartUpload"]
|
|
422
1303
|
Resource = "${aws_s3_bucket.wiki_exports.arn}/*"
|
|
423
1304
|
}]
|
|
424
1305
|
})
|
|
@@ -427,12 +1308,22 @@ resource "aws_iam_role_policy" "lambda_wiki_exports_s3" {
|
|
|
427
1308
|
# Role that EventBridge Scheduler assumes to invoke schedule targets.
resource "aws_iam_role" "scheduler" {
  name = "thinkwork-${var.stage}-scheduler-role"

  # Phase 3 U8a — confused-deputy guard. The trust policy only lets
  # scheduler.amazonaws.com assume this role when `aws:SourceAccount`
  # equals var.account_id, so knowing the role ARN from another account is
  # not enough to drive it. Because every schedule shares this role, the
  # guard covers ALL handlers the scheduler invokes; it is defense-in-depth
  # alongside per-Lambda `aws:SourceArn` pins like the U7 anchor role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Action    = "sts:AssumeRole"
      Effect    = "Allow"
      Principal = { Service = "scheduler.amazonaws.com" }
      Condition = {
        StringEquals = {
          "aws:SourceAccount" = var.account_id
        }
      }
    }]
  })
}
|
|
@@ -444,9 +1335,22 @@ resource "aws_iam_role_policy" "scheduler_invoke" {
|
|
|
444
1335
|
policy = jsonencode({
|
|
445
1336
|
Version = "2012-10-17"
|
|
446
1337
|
Statement = [{
|
|
447
|
-
Effect
|
|
448
|
-
Action
|
|
449
|
-
|
|
1338
|
+
Effect = "Allow"
|
|
1339
|
+
Action = ["lambda:InvokeFunction"]
|
|
1340
|
+
# Includes every for_each handler PLUS the standalone Phase 3 U8a
|
|
1341
|
+
# anchor Lambda (which is intentionally outside the for_each set
|
|
1342
|
+
# because it uses the U7 IAM role, not the shared aws_iam_role.lambda).
|
|
1343
|
+
# Splat (`[*]`) expansion handles count=0 cleanly when local.use_local_zips
|
|
1344
|
+
# is false; an indexed reference (`[0].arn`) would throw on graph eval.
|
|
1345
|
+
# Phase 3 U8b — watchdog moved to standalone resource; its ARN must
|
|
1346
|
+
# be added to the splat list explicitly (SEC-U8B-005). The splat
|
|
1347
|
+
# `[*].arn` form handles count = 0 cleanly when local.use_local_zips
|
|
1348
|
+
# is false; an indexed `[0].arn` would throw on graph eval.
|
|
1349
|
+
Resource = local.use_local_zips ? concat(
|
|
1350
|
+
[for k, v in aws_lambda_function.handler : v.arn],
|
|
1351
|
+
aws_lambda_function.compliance_anchor[*].arn,
|
|
1352
|
+
aws_lambda_function.compliance_anchor_watchdog[*].arn,
|
|
1353
|
+
) : []
|
|
450
1354
|
}]
|
|
451
1355
|
})
|
|
452
1356
|
}
|
|
@@ -455,6 +1359,34 @@ resource "aws_iam_role_policy" "scheduler_invoke" {
|
|
|
455
1359
|
# SSM Parameters — Lambda ARNs for cross-function invocation
|
|
456
1360
|
# ---------------------------------------------------------------------------
|
|
457
1361
|
|
|
1362
|
+
########################################################################
|
|
1363
|
+
# SecureString parameter for the Google Places API key. wiki-compile reads
|
|
1364
|
+
# this on cold start via loadGooglePlacesClientFromSsm() and caches the
|
|
1365
|
+
# client at module scope. When google_places_api_key is empty (the
|
|
1366
|
+
# default), we seed the parameter with a placeholder so the Lambda init
|
|
1367
|
+
# path can distinguish "unconfigured" (skip Google entirely, degrade
|
|
1368
|
+
# gracefully) from "configured but wrong" (log + skip). lifecycle.ignore_
|
|
1369
|
+
# changes on `value` lets ops rotate via
|
|
1370
|
+
# aws ssm put-parameter --overwrite \
|
|
1371
|
+
# --name /thinkwork/<stage>/google-places/api-key \
|
|
1372
|
+
# --type SecureString --value <KEY>
|
|
1373
|
+
# without terraform fighting it on the next apply.
|
|
1374
|
+
########################################################################
|
|
1375
|
+
|
|
1376
|
+
# SecureString holding the Google Places API key. Terraform only seeds the
# value; later changes to `value` are ignored (see lifecycle below).
resource "aws_ssm_parameter" "google_places_api_key" {
  name        = "/thinkwork/${var.stage}/google-places/api-key"
  description = "Google Places API (New) key consumed by wiki-compile. See docs/plans/2026-04-21-005-feat-wiki-place-capability-v2-plan.md Unit 4."
  type        = "SecureString"

  # An empty variable seeds a recognizable placeholder instead of a real key.
  value = var.google_places_api_key == "" ? "PLACEHOLDER_SET_VIA_CLI" : var.google_places_api_key

  lifecycle {
    # Keys written out-of-band with `aws ssm put-parameter --overwrite`
    # must survive the next apply. Initial population and rotation are
    # expected to go through the CLI rather than the terraform variable.
    ignore_changes = [value]
  }
}
|
|
1389
|
+
|
|
458
1390
|
resource "aws_ssm_parameter" "lambda_arns" {
|
|
459
1391
|
for_each = local.use_local_zips ? {
|
|
460
1392
|
"chat-agent-invoke-fn-arn" = aws_lambda_function.handler["chat-agent-invoke"].arn
|
|
@@ -462,9 +1394,488 @@ resource "aws_ssm_parameter" "lambda_arns" {
|
|
|
462
1394
|
"job-schedule-manager-fn-arn" = aws_lambda_function.handler["job-schedule-manager"].arn
|
|
463
1395
|
"memory-retain-fn-arn" = aws_lambda_function.handler["memory-retain"].arn
|
|
464
1396
|
"eval-runner-fn-arn" = aws_lambda_function.handler["eval-runner"].arn
|
|
1397
|
+
"eval-worker-fn-arn" = aws_lambda_function.handler["eval-worker"].arn
|
|
465
1398
|
} : {}
|
|
466
1399
|
|
|
467
1400
|
name = "/thinkwork/${var.stage}/${each.key}"
|
|
468
1401
|
type = "String"
|
|
469
1402
|
value = each.value
|
|
470
1403
|
}
|
|
1404
|
+
|
|
1405
|
+
# ===========================================================================
|
|
1406
|
+
# Phase 3 U8a — Compliance Anchor Lambda (STANDALONE) + Watchdog wiring
|
|
1407
|
+
# ===========================================================================
|
|
1408
|
+
# Plan: docs/plans/2026-05-07-010-feat-compliance-u8a-anchor-lambda-inert-plan.md
|
|
1409
|
+
#
|
|
1410
|
+
# The anchor Lambda is INTENTIONALLY OUTSIDE the for_each handler set
|
|
1411
|
+
# because its execution role is the U7 IAM role (`compliance-anchor-
|
|
1412
|
+
# lambda-role`), not the shared `aws_iam_role.lambda`. Adding a per-key
|
|
1413
|
+
# `role` ternary on the for_each set is the highest-blast-radius single
|
|
1414
|
+
# expression in this PR (any expression error silently downgrades 60+
|
|
1415
|
+
# unrelated handlers); a standalone resource isolates blast radius.
|
|
1416
|
+
#
|
|
1417
|
+
# NOTE: in U8a the watchdog lived in the for_each set on the shared execution
|
|
1418
|
+
# role. As of Phase 3 U8b it is ALSO a standalone resource on a dedicated role
|
|
1419
|
+
# (aws_lambda_function.compliance_anchor_watchdog, in the U8b section below).
|
|
1420
|
+
# ===========================================================================
|
|
1421
|
+
|
|
1422
|
+
# Standalone compliance anchor Lambda. It runs under the role passed in via
# var.compliance_anchor_lambda_role_arn rather than the shared
# aws_iam_role.lambda, and is capped at one concurrent execution.
resource "aws_lambda_function" "compliance_anchor" {
  count = local.use_local_zips ? 1 : 0

  function_name = "thinkwork-${var.stage}-api-compliance-anchor"
  role          = var.compliance_anchor_lambda_role_arn

  # Deployment package; the hash forces a code update when the zip changes.
  filename         = "${var.lambda_zips_dir}/compliance-anchor.zip"
  source_code_hash = filebase64sha256("${var.lambda_zips_dir}/compliance-anchor.zip")

  runtime     = local.runtime
  handler     = "index.handler"
  memory_size = 1024
  timeout     = 60

  reserved_concurrent_executions = 1

  environment {
    variables = {
      AWS_NODEJS_CONNECTION_REUSE_ENABLED = "1"
      STAGE                               = var.stage

      COMPLIANCE_READER_SECRET_ARN  = var.compliance_reader_secret_arn
      COMPLIANCE_DRAINER_SECRET_ARN = var.compliance_drainer_secret_arn

      COMPLIANCE_ANCHOR_BUCKET_NAME    = var.compliance_anchor_bucket_name
      COMPLIANCE_ANCHOR_RETENTION_DAYS = tostring(var.compliance_anchor_object_lock_retention_days)

      # Phase 3 U8b — both values below are required by `_anchor_fn_live`;
      # per the original note the Lambda throws on boot if either is empty.
      # The U8b composite root wires them from `module.compliance_anchors`
      # outputs.
      COMPLIANCE_ANCHOR_KMS_KEY_ARN      = var.compliance_anchor_kms_key_arn
      COMPLIANCE_ANCHOR_OBJECT_LOCK_MODE = var.compliance_anchor_object_lock_mode
    }
  }
}
|
|
1451
|
+
|
|
1452
|
+
# Dead-letter queue for failed compliance anchor invocations (SQS-managed
# encryption enabled).
resource "aws_sqs_queue" "compliance_anchor_dlq" {
  count = local.use_local_zips ? 1 : 0

  name                    = "thinkwork-${var.stage}-compliance-anchor-dlq"
  sqs_managed_sse_enabled = true

  # 14 days (1209600 s), matches the drainer DLQ.
  message_retention_seconds = 14 * 24 * 60 * 60
}
|
|
1458
|
+
|
|
1459
|
+
# Inline policy granting sqs:SendMessage on the anchor DLQ. It is attached
# to the U7 anchor role, which the standalone anchor Lambda assumes.
resource "aws_iam_role_policy" "compliance_anchor_dlq_send" {
  count = local.use_local_zips ? 1 : 0

  role = var.compliance_anchor_lambda_role_name
  name = "compliance-anchor-dlq-send"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Action   = ["sqs:SendMessage"]
      Effect   = "Allow"
      Resource = aws_sqs_queue.compliance_anchor_dlq[0].arn
    }]
  })
}
|
|
1474
|
+
|
|
1475
|
+
# Async-invocation settings for the anchor Lambda: no Lambda-side retries,
# events older than one hour are dropped, and failed events are delivered to
# the anchor DLQ.
resource "aws_lambda_function_event_invoke_config" "compliance_anchor" {
  count = local.use_local_zips ? 1 : 0

  function_name                = aws_lambda_function.compliance_anchor[0].function_name
  maximum_retry_attempts       = 0
  maximum_event_age_in_seconds = 3600

  destination_config {
    on_failure {
      destination = aws_sqs_queue.compliance_anchor_dlq[0].arn
    }
  }

  # The function's execution role must already hold sqs:SendMessage on the
  # destination when this config is written; otherwise the Lambda API rejects
  # the destination. The grant lives in a separate inline policy with no
  # implicit reference from this resource, so order it explicitly.
  depends_on = [aws_iam_role_policy.compliance_anchor_dlq_send]
}
|
|
1487
|
+
|
|
1488
|
+
# ---------------------------------------------------------------------------
|
|
1489
|
+
# Phase 3 U8b — Watchdog Lambda (STANDALONE).
|
|
1490
|
+
#
|
|
1491
|
+
# Moves OFF the shared aws_iam_role.lambda onto a dedicated sibling role
|
|
1492
|
+
# (kms:DescribeKey only on the CMK, s3:ListBucket prefix-conditioned on
|
|
1493
|
+
# anchors/, no kms:Decrypt — the watchdog never reads object bodies).
|
|
1494
|
+
# The shared role's prior compliance_watchdog_metrics inline policy is
|
|
1495
|
+
# removed (its function is now on the sibling role).
|
|
1496
|
+
#
|
|
1497
|
+
# Operator pre-merge: `terraform state mv` the existing
|
|
1498
|
+
# `aws_lambda_function.handler["compliance-anchor-watchdog"]` address to
|
|
1499
|
+
# `aws_lambda_function.compliance_anchor_watchdog[0]`. Without it, apply
|
|
1500
|
+
# fails with ResourceConflictException on the function name.
|
|
1501
|
+
# ---------------------------------------------------------------------------
|
|
1502
|
+
|
|
1503
|
+
# Standalone watchdog Lambda, running under the dedicated role supplied via
# var.compliance_anchor_watchdog_role_arn.
resource "aws_lambda_function" "compliance_anchor_watchdog" {
  count = local.use_local_zips ? 1 : 0

  function_name = "thinkwork-${var.stage}-api-compliance-anchor-watchdog"
  role          = var.compliance_anchor_watchdog_role_arn

  # Deployment package; the hash forces a code update when the zip changes.
  filename         = "${var.lambda_zips_dir}/compliance-anchor-watchdog.zip"
  source_code_hash = filebase64sha256("${var.lambda_zips_dir}/compliance-anchor-watchdog.zip")

  runtime     = local.runtime
  handler     = "index.handler"
  memory_size = 512
  timeout     = 30

  environment {
    variables = {
      AWS_NODEJS_CONNECTION_REUSE_ENABLED = "1"
      COMPLIANCE_ANCHOR_BUCKET_NAME       = var.compliance_anchor_bucket_name
      STAGE                               = var.stage
    }
  }

  tags = {
    Handler = "compliance-anchor-watchdog"
    Name    = "thinkwork-${var.stage}-api-compliance-anchor-watchdog"
  }
}
|
|
1528
|
+
|
|
1529
|
+
# ---------------------------------------------------------------------------
|
|
1530
|
+
# Schedules — retry_policy is nested inside target { ... }, NOT at the
|
|
1531
|
+
# schedule top level. Verified against AWS provider schema.
|
|
1532
|
+
# ---------------------------------------------------------------------------
|
|
1533
|
+
|
|
1534
|
+
# Invokes the standalone anchor Lambda every 15 minutes. Scheduler-side
# retries are disabled.
resource "aws_scheduler_schedule" "compliance_anchor" {
  count = local.use_local_zips ? 1 : 0

  group_name = "default"
  name       = "thinkwork-${var.stage}-compliance-anchor"
  state      = "ENABLED"

  schedule_expression = "rate(15 minutes)"

  flexible_time_window {
    mode = "OFF"
  }

  target {
    role_arn = aws_iam_role.scheduler.arn
    arn      = aws_lambda_function.compliance_anchor[0].arn

    # retry_policy belongs inside target, not at the schedule top level.
    retry_policy {
      maximum_retry_attempts = 0
    }
  }
}
|
|
1555
|
+
|
|
1556
|
+
# Invokes the standalone watchdog Lambda every 5 minutes. Scheduler-side
# retries are disabled.
resource "aws_scheduler_schedule" "compliance_anchor_watchdog" {
  count = local.use_local_zips ? 1 : 0

  group_name = "default"
  name       = "thinkwork-${var.stage}-compliance-anchor-watchdog"
  state      = "ENABLED"

  schedule_expression = "rate(5 minutes)"

  flexible_time_window {
    mode = "OFF"
  }

  target {
    role_arn = aws_iam_role.scheduler.arn

    # Phase 3 U8b — targets the standalone watchdog resource. Before the
    # for_each split-out this pointed at
    # aws_lambda_function.handler["compliance-anchor-watchdog"].
    arn = aws_lambda_function.compliance_anchor_watchdog[0].arn

    retry_policy {
      maximum_retry_attempts = 0
    }
  }
}
|
|
1580
|
+
|
|
1581
|
+
# ---------------------------------------------------------------------------
|
|
1582
|
+
# CloudWatch alarms — Phase 3 U8b
|
|
1583
|
+
#
|
|
1584
|
+
# Two alarms split the failure space:
|
|
1585
|
+
#
|
|
1586
|
+
# 1. compliance-anchor-gap (treat_missing_data = "breaching"). Fires
|
|
1587
|
+
# when ComplianceAnchorGap >= 1 for two consecutive 5-min periods
|
|
1588
|
+
# OR when the watchdog stops emitting the metric entirely (IAM
|
|
1589
|
+
# regression, code crash, S3 ListObjectsV2 perma-fail).
|
|
1590
|
+
#
|
|
1591
|
+
# 2. compliance-anchor-watchdog-heartbeat-missing
|
|
1592
|
+
# (treat_missing_data = "notBreaching" born-state). Distinguishes
|
|
1593
|
+
# "real anchor gap" from "watchdog metric path broken". Born-state
|
|
1594
|
+
# is notBreaching to give Greenfield deploys a window before the
|
|
1595
|
+
# first heartbeat lands; flip to breaching in a follow-up after
|
|
1596
|
+
# first soak (Decision #7 / ADV-004).
|
|
1597
|
+
# ---------------------------------------------------------------------------
|
|
1598
|
+
|
|
1599
|
+
# Alarm on the ComplianceAnchorGap metric: breaches when the 5-minute maximum
# is >= 1 for two consecutive periods. Missing data is treated as breaching.
# No alarm actions are attached here.
resource "aws_cloudwatch_metric_alarm" "compliance_anchor_gap" {
  count = local.use_local_zips ? 1 : 0

  alarm_name        = "thinkwork-${var.stage}-compliance-anchor-gap"
  alarm_description = "Anchor cadence gap exceeded threshold. LIVE in U8b — fires on >=1 ComplianceAnchorGap=1 OR missing metric (means watchdog broken)."
  alarm_actions     = []

  # Metric selection.
  namespace   = "Thinkwork/Compliance"
  metric_name = "ComplianceAnchorGap"
  dimensions = {
    Stage = var.stage
  }

  # Evaluation.
  statistic           = "Maximum"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  period              = 300
  evaluation_periods  = 2
  treat_missing_data  = "breaching"
}
|
|
1618
|
+
|
|
1619
|
+
# Alarm on the ComplianceAnchorWatchdogHeartbeat metric: breaches when the
# 5-minute sum is below 1 for two consecutive periods. Missing data is
# currently treated as notBreaching. No alarm actions are attached here.
resource "aws_cloudwatch_metric_alarm" "compliance_anchor_watchdog_heartbeat_missing" {
  count = local.use_local_zips ? 1 : 0

  alarm_name        = "thinkwork-${var.stage}-compliance-anchor-watchdog-heartbeat-missing"
  alarm_description = "Watchdog heartbeat metric is missing. LIVE in U8b — born with treat_missing_data = notBreaching to absorb deploy-time gaps; promote to breaching in a follow-up after first soak."
  alarm_actions     = []

  # Metric selection.
  namespace   = "Thinkwork/Compliance"
  metric_name = "ComplianceAnchorWatchdogHeartbeat"
  dimensions = {
    Stage = var.stage
  }

  # Evaluation.
  statistic           = "Sum"
  comparison_operator = "LessThanThreshold"
  threshold           = 1
  period              = 300
  evaluation_periods  = 2
  treat_missing_data  = "notBreaching"
}
|
|
1638
|
+
|
|
1639
|
+
# ---------------------------------------------------------------------------
|
|
1640
|
+
# Phase 3 U11.U2 — Compliance export runner (STANDALONE, INERT)
|
|
1641
|
+
#
|
|
1642
|
+
# The U11.U1 createComplianceExport mutation (PR #944) inserts a queued
|
|
1643
|
+
# row into compliance.export_jobs and dispatches `{jobId}` to this SQS
|
|
1644
|
+
# queue. The runner Lambda below has a STUB body in U11.U2 (throws
|
|
1645
|
+
# "not implemented") — U11.U3 swaps in the live body that streams
|
|
1646
|
+
# CSV/NDJSON to the exports S3 bucket and publishes a 15-minute
|
|
1647
|
+
# presigned URL.
|
|
1648
|
+
#
|
|
1649
|
+
# Inert-substrate posture (per `feedback_ship_inert_pattern`):
|
|
1650
|
+
# - SQS messages from the U11.U1 mutation accumulate.
|
|
1651
|
+
# - After maxReceiveCount=3 attempts the stub throw routes them to
|
|
1652
|
+
# the DLQ.
|
|
1653
|
+
# - The DLQ depth alarm signals operators that the runner needs U11.U3.
|
|
1654
|
+
# - This is the visible inert state — silent no-op stubs are an
|
|
1655
|
+
# anti-pattern (queued jobs would stay QUEUED forever with no signal).
|
|
1656
|
+
#
|
|
1657
|
+
# Standalone Lambda (NOT in the for_each pool) — isolates the runner's
|
|
1658
|
+
# bucket-scoped IAM role from the 60+ unrelated handlers. Mirrors the
|
|
1659
|
+
# U8a anchor Lambda's standalone-resource pattern.
|
|
1660
|
+
# ---------------------------------------------------------------------------
|
|
1661
|
+
|
|
1662
|
+
# Dead-letter queue behind the compliance exports queue (SQS-managed
# encryption enabled).
resource "aws_sqs_queue" "compliance_exports_dlq" {
  count = local.use_local_zips ? 1 : 0

  name                    = "thinkwork-${var.stage}-compliance-exports-dlq"
  sqs_managed_sse_enabled = true

  # 14 days (1209600 s).
  message_retention_seconds = 14 * 24 * 60 * 60

  tags = {
    Name = "thinkwork-${var.stage}-compliance-exports-dlq"
  }
}
|
|
1672
|
+
|
|
1673
|
+
# Work queue for compliance export jobs. A message received three times
# without being deleted is redriven to the exports DLQ.
resource "aws_sqs_queue" "compliance_exports" {
  count = local.use_local_zips ? 1 : 0

  name                    = "thinkwork-${var.stage}-compliance-exports"
  sqs_managed_sse_enabled = true

  # 900 s — equal to the export runner Lambda's 15-minute timeout.
  # NOTE(review): AWS guidance for SQS event sources suggests a visibility
  # timeout of several times the function timeout; confirm parity is
  # intentional before the live runner ships.
  visibility_timeout_seconds = 900

  # 1 day; longer-stuck messages are held by the DLQ instead.
  message_retention_seconds = 86400

  redrive_policy = jsonencode({
    maxReceiveCount     = 3
    deadLetterTargetArn = aws_sqs_queue.compliance_exports_dlq[0].arn
  })

  tags = {
    Name = "thinkwork-${var.stage}-compliance-exports"
  }
}
|
|
1689
|
+
|
|
1690
|
+
# graphql-http needs sqs:SendMessage on the new queue to dispatch jobIds
|
|
1691
|
+
# from the createComplianceExport mutation. Attached to the shared
|
|
1692
|
+
# lambda role (which graphql-http assumes); scope is queue-specific.
|
|
1693
|
+
# Inline policy on the shared Lambda role allowing sqs:SendMessage to the
# compliance exports queue only.
resource "aws_iam_role_policy" "compliance_exports_send" {
  count = local.use_local_zips ? 1 : 0

  role = aws_iam_role.lambda.id
  name = "compliance-exports-send"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Action   = ["sqs:SendMessage"]
      Effect   = "Allow"
      Resource = aws_sqs_queue.compliance_exports[0].arn
    }]
  })
}
|
|
1707
|
+
|
|
1708
|
+
# Runner role's SQS receive grants — only the runner consumes the queue.
|
|
1709
|
+
# Inline policy on the export runner role: consume from the exports queue
# and send to its DLQ.
resource "aws_iam_role_policy" "compliance_exports_runner_sqs" {
  count = local.use_local_zips ? 1 : 0

  role = var.compliance_exports_runner_role_name
  name = "compliance-exports-runner-sqs"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      # Consume side: receive, delete, inspect, and extend visibility on the
      # main queue.
      {
        Sid      = "RunnerSqsReceive"
        Effect   = "Allow"
        Resource = aws_sqs_queue.compliance_exports[0].arn
        Action = [
          "sqs:ReceiveMessage",
          "sqs:DeleteMessage",
          "sqs:GetQueueAttributes",
          "sqs:ChangeMessageVisibility",
        ]
      },
      # Dead-letter side: send only.
      {
        Sid      = "RunnerDlqSend"
        Effect   = "Allow"
        Resource = aws_sqs_queue.compliance_exports_dlq[0].arn
        Action   = ["sqs:SendMessage"]
      },
    ]
  })
}
|
|
1737
|
+
|
|
1738
|
+
# Standalone compliance export runner Lambda, running under the role passed
# in via var.compliance_exports_runner_role_arn. 15-minute timeout, at most
# two concurrent executions.
resource "aws_lambda_function" "compliance_export_runner" {
  count = local.use_local_zips ? 1 : 0

  function_name = "thinkwork-${var.stage}-api-compliance-export-runner"
  role          = var.compliance_exports_runner_role_arn

  # Deployment package; the hash forces a code update when the zip changes.
  filename         = "${var.lambda_zips_dir}/compliance-export-runner.zip"
  source_code_hash = filebase64sha256("${var.lambda_zips_dir}/compliance-export-runner.zip")

  runtime     = local.runtime
  handler     = "index.handler"
  memory_size = 1024
  timeout     = 900

  reserved_concurrent_executions = 2

  environment {
    variables = {
      AWS_NODEJS_CONNECTION_REUSE_ENABLED = "1"
      STAGE                               = var.stage

      COMPLIANCE_EXPORTS_BUCKET    = var.compliance_exports_bucket_name
      COMPLIANCE_EXPORTS_QUEUE_URL = aws_sqs_queue.compliance_exports[0].url

      # Phase 3 U11.U3 — the live runner connects to Aurora through the
      # writer pool (existing app role): INSERT/UPDATE on
      # compliance.export_jobs, SELECT on compliance.audit_events.
      DATABASE_URL_SECRET_ARN = var.graphql_db_secret_arn

      # The writer-pool secret holds only {username, password}, so the
      # runner assembles the connection URL from host + database name below
      # plus the secret. Same fallback as `resolveDatabaseUrlFromSecrets` in
      # packages/database-pg/src/db.ts; deploy run 25563132057 failed with
      # "Invalid URL" when only the ARN was wired.
      DATABASE_HOST = var.db_cluster_endpoint
      DATABASE_NAME = var.database_name
    }
  }
}
|
|
1771
|
+
|
|
1772
|
+
# SQS → Lambda event source mapping. batch_size=1 so each export is a
|
|
1773
|
+
# discrete invocation; ReportBatchItemFailures lets the runner mark
|
|
1774
|
+
# individual messages failed without re-enqueuing the whole batch.
|
|
1775
|
+
# Concurrency is bounded by the Lambda function's
|
|
1776
|
+
# reserved_concurrent_executions=2 (set above) — the
|
|
1777
|
+
# `maximum_concurrency` argument on the event-source mapping requires a
|
|
1778
|
+
# newer aws provider version than this codebase currently pins, and the
|
|
1779
|
+
# function-level reservation gives the equivalent ceiling at v1 scale.
|
|
1780
|
+
# SQS -> Lambda event source mapping for the export runner. One message per
# invocation, with partial-batch failure reporting enabled.
resource "aws_lambda_event_source_mapping" "compliance_exports" {
  count = local.use_local_zips ? 1 : 0

  event_source_arn        = aws_sqs_queue.compliance_exports[0].arn
  function_name           = aws_lambda_function.compliance_export_runner[0].function_name
  batch_size              = 1
  enabled                 = true
  function_response_types = ["ReportBatchItemFailures"]

  # Creating the mapping requires the function's execution role to already
  # hold the SQS receive permissions on the source queue. Those are granted
  # by a separately attached inline policy that nothing here references, so
  # make the ordering explicit to avoid a first-apply race.
  depends_on = [aws_iam_role_policy.compliance_exports_runner_sqs]
}
|
|
1789
|
+
|
|
1790
|
+
# Alarm on exports DLQ depth: breaches as soon as one 60-second period shows
# at least one visible message. Missing data is treated as notBreaching. No
# alarm actions are attached here.
resource "aws_cloudwatch_metric_alarm" "compliance_exports_dlq_depth" {
  count = local.use_local_zips ? 1 : 0

  alarm_name        = "thinkwork-${var.stage}-compliance-exports-dlq-depth"
  alarm_description = "Compliance exports DLQ has messages — runner Lambda crashed (or is inert pre-U11.U3); operator must inspect."
  alarm_actions     = []

  # Metric selection.
  namespace   = "AWS/SQS"
  metric_name = "ApproximateNumberOfMessagesVisible"
  dimensions = {
    QueueName = aws_sqs_queue.compliance_exports_dlq[0].name
  }

  # Evaluation.
  statistic           = "Maximum"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  period              = 60
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"
}
|
|
1809
|
+
|
|
1810
|
+
# ---------------------------------------------------------------------------
|
|
1811
|
+
# workspace-files-efs — STANDALONE Lambda that reads any Computer's workspace
|
|
1812
|
+
# files directly off the shared EFS file system. Bypasses the
|
|
1813
|
+
# computer_tasks queue for list/get operations so the admin Computer
|
|
1814
|
+
# Workspace tab is independent of runtime liveness or write-queue backlog.
|
|
1815
|
+
#
|
|
1816
|
+
# Plan: docs/plans/2026-05-13-XXX-feat-admin-computer-efs-listing-plan.md
|
|
1817
|
+
#
|
|
1818
|
+
# The Lambda mounts the `workspace_admin` access point at /mnt/efs. That
|
|
1819
|
+
# access point is rooted at /tenants on the shared EFS, so the handler
|
|
1820
|
+
# can address any Computer's workspace as
|
|
1821
|
+
# /mnt/efs/<tenantId>/computers/<computerId>/<path...>
|
|
1822
|
+
# (matches the layout written by `computerWorkspacePath` in
|
|
1823
|
+
# packages/api/src/lib/computers/runtime-control.ts:40).
|
|
1824
|
+
#
|
|
1825
|
+
# VPC config: same subnet set the Computer ECS tasks use (so the mount
|
|
1826
|
+
# targets are reachable). Dedicated security group with an EFS-SG ingress
|
|
1827
|
+
# rule defined as a sibling of the task-SG rule in the computer-runtime
|
|
1828
|
+
# module — keeps Lambda traffic auditable separately.
|
|
1829
|
+
#
|
|
1830
|
+
# Writes intentionally stay on the existing computer_tasks queue path.
|
|
1831
|
+
# Mutations have ordering semantics with the runtime's in-process state;
|
|
1832
|
+
# changing them is out of scope for this PR.
|
|
1833
|
+
# ---------------------------------------------------------------------------
|
|
1834
|
+
|
|
1835
|
+
# Standalone, VPC-attached Lambda on the shared Lambda role. It mounts the
# EFS access point given by var.workspace_admin_efs_access_point_arn at
# /mnt/efs and exposes that path to the handler as WORKSPACE_EFS_ROOT.
resource "aws_lambda_function" "workspace_files_efs" {
  count = local.use_local_zips ? 1 : 0

  function_name = "thinkwork-${var.stage}-api-workspace-files-efs"
  role          = aws_iam_role.lambda.arn
  handler       = "index.handler"
  runtime       = local.runtime
  timeout       = 30
  memory_size   = 512

  filename         = "${var.lambda_zips_dir}/workspace-files-efs.zip"
  source_code_hash = filebase64sha256("${var.lambda_zips_dir}/workspace-files-efs.zip")

  vpc_config {
    subnet_ids         = var.computer_runtime_subnet_ids
    security_group_ids = [var.workspace_admin_lambda_sg_id]
  }

  file_system_config {
    arn              = var.workspace_admin_efs_access_point_arn
    local_mount_path = "/mnt/efs"
  }

  environment {
    variables = {
      STAGE                               = var.stage
      AWS_NODEJS_CONNECTION_REUSE_ENABLED = "1"
      WORKSPACE_EFS_ROOT                  = "/mnt/efs"
    }
  }

  tags = {
    Name    = "thinkwork-${var.stage}-api-workspace-files-efs"
    Handler = "workspace-files-efs"
  }

  # A VPC-attached function cannot be created until its role can manage
  # ENIs. On the shared role that permission comes from the
  # AWSLambdaVPCAccessExecutionRole attachment, which this resource does not
  # otherwise reference, so without an explicit edge Terraform may create the
  # function first and fail on the initial apply.
  # NOTE(review): the EFS mount targets are managed outside this file; confirm
  # the caller orders this module after they exist.
  depends_on = [aws_iam_role_policy_attachment.lambda_vpc_access]
}
|
|
1871
|
+
|
|
1872
|
+
# VPC-attached Lambdas need permission to manage ENIs. The shared lambda
|
|
1873
|
+
# role doesn't grant this by default because most handlers run outside a
|
|
1874
|
+
# VPC. AWSLambdaVPCAccessExecutionRole gives Create/Describe/DeleteNetwork
|
|
1875
|
+
# Interface — minimum scope for VPC Lambdas.
|
|
1876
|
+
# Attaches the AWS-managed AWSLambdaVPCAccessExecutionRole policy to the
# shared Lambda role so VPC-attached handlers can manage network interfaces.
resource "aws_iam_role_policy_attachment" "lambda_vpc_access" {
  count = local.use_local_zips ? 1 : 0

  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
  role       = aws_iam_role.lambda.name
}
|