clawpro-diagnostics-metrics-cls 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -7
- package/openclaw.plugin.json +9 -1
- package/package.json +1 -31
- package/src/cls-service.ts +83 -9
- package/src/instance-metadata.ts +239 -135
- package/src/trace-service.ts +0 -1
package/README.md
CHANGED
|
@@ -8,8 +8,41 @@ OpenClaw 诊断指标导出插件,提供以下核心能力:
|
|
|
8
8
|
|
|
9
9
|
---
|
|
10
10
|
|
|
11
|
+
## ⚠️ 数据访问声明(请务必阅读)
|
|
12
|
+
|
|
13
|
+
> **本插件需要读取 LLM 会话内容才能产出 token / 长度 / 耗时类指标和 Trace 数据。**
|
|
14
|
+
> 通过 onboard CLI 的 `install` / `UpdateParameter` 命令安装时,CLI 会自动在 `~/.openclaw/openclaw.json` 中为本插件写入 `plugins.entries["clawpro-diagnostics-metrics-cls"].hooks.allowConversationAccess = true`,以满足 OpenClaw 对 non-bundled 插件 typed hook 的安全策略。**安装即视为您授权本插件访问会话内容**,请在团队 / 公司合规要求允许的前提下使用。
|
|
15
|
+
|
|
16
|
+
### 一、本插件会接触哪些数据
|
|
17
|
+
|
|
18
|
+
| 数据类别 | 来源 hook | 是否上报到 CLS |
|
|
19
|
+
|---|---|---|
|
|
20
|
+
| 用户输入的 prompt 原文(含系统提示词、对话历史) | `llm_input` / `message_received` | ✅ 默认会通过 Trace 模块写入 CLS 的 **Trace 日志主题**(`traceTopicId`)字段 `gen_ai.input.messages` / `gen_ai.prompt`,未配置 `traceTopicId` 或 `trace.enabled=false` 时不上报 |
|
|
21
|
+
| 模型生成的 completion 原文 | `llm_output` / `before_message_write` | ✅ 同上,写入字段 `gen_ai.output.messages` / `gen_ai.completion` |
|
|
22
|
+
| Agent 最终输出 / 工具调用参数 / 工具返回结果 | `agent_end` / `before_tool_call` / `after_tool_call` | ✅ 同上,写入字段 `gen_ai.output.messages` / `gen_ai.tool.call.arguments` / `gen_ai.tool.call.result` |
|
|
23
|
+
| Token 用量 / 耗时 / 模型名等元数据 | 全部 hook | ✅ 写入 **Prometheus 指标主题**(`metricTopicId`),**不含原文** |
|
|
24
|
+
|
|
25
|
+
> Prometheus 指标主题(`metricTopicId`)只承载聚合后的数值类指标(counter / histogram / gauge),**不会包含 prompt 或 completion 原文**;会话原文仅出现在 Trace 主题(`traceTopicId`)中。
|
|
26
|
+
|
|
27
|
+
### 二、如果您不希望插件读取会话原文
|
|
28
|
+
|
|
29
|
+
可以选择以下任一方式,按需削减插件的数据访问范围:
|
|
30
|
+
|
|
31
|
+
1. **完全禁用 Trace,仅保留指标**:`UpdateParameter --traceEnabled false`,或在 `openclaw.json` 中将 `plugins.entries["clawpro-diagnostics-metrics-cls"].config.trace.enabled` 设为 `false`。此时插件仍会保留 `allowConversationAccess=true` 以采集 token / 耗时类指标,但**不再向 CLS 写入任何 prompt / completion 原文**。
|
|
32
|
+
2. **按 hook 粒度禁用**:在 `openclaw.json` 的 `config.trace.enabledHooks` 中只列出您允许的 hook(例如只保留 `["session_start", "session_end", "agent_end"]` 等不直接携带消息内容的 hook,不包含 `llm_input` / `before_message_write`),其余 hook 不会被注册到 OpenClaw runtime。
|
|
33
|
+
3. **彻底拒绝插件访问会话内容**:在 `openclaw.json` 中手动将 `plugins.entries["clawpro-diagnostics-metrics-cls"].hooks.allowConversationAccess` 设为 `false`,OpenClaw 会拦截 `llm_input` / `llm_output` / `agent_end` 等 typed hook,本插件相关指标和 trace 将不可用,但其它非会话类指标(如 webhook、队列、会话状态等)仍正常工作。
|
|
34
|
+
|
|
35
|
+
### 三、CLS 侧数据安全建议
|
|
36
|
+
|
|
37
|
+
- 为 `traceTopicId` 单独创建日志主题,配置最严格的访问权限(仅运维 / SRE 可读),不要与业务日志主题混用。
|
|
38
|
+
- 如需进一步限制原文长度,插件已内置**结构感知式渐进裁剪**(详见下文 Trace 章节"超长截断策略"),单字段最大 3,200,000 字符,APM 索引字段单条消息最长 4,096 字符。
|
|
39
|
+
- 上报前可通过 `external_labels` 加上 `data_classification=internal` 等业务标签,便于在 CLS 侧做 RBAC / 脱敏策略。
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
11
43
|
## 目录
|
|
12
44
|
|
|
45
|
+
- [⚠️ 数据访问声明(请务必阅读)](#-数据访问声明请务必阅读)
|
|
13
46
|
- [快速安装(推荐)](#快速安装推荐)
|
|
14
47
|
- [手动安装](#手动安装)
|
|
15
48
|
- [配置说明](#配置说明)
|
|
@@ -69,6 +102,10 @@ npx --yes clawpro-diagnostics-metrics-cls-onboard-cli install
|
|
|
69
102
|
| `--externalLabels <labels>` | Prometheus 自定义标签(格式:`key1=value1,key2=value2`) | 可选 |
|
|
70
103
|
| `--traceEnabled <bool>` | 是否启用 Trace 链路追踪功能(`true`/`false`,默认 `true`) | 可选 |
|
|
71
104
|
| `--traceTopicId <id>` | Trace 数据上报使用的 CLS 日志主题 ID | 可选(可在配置文件中动态更新) |
|
|
105
|
+
| `--instanceId <id>` | 显式指定实例 ID,写入配置文件后插件启动时直接使用,跳过腾讯云 metadata 接口请求 | 可选 |
|
|
106
|
+
| `--instanceName <name>` | 显式指定实例名称,仅覆盖 `cvm_instance_name`,不影响 `host_name`(后者始终取 `os.hostname()`) | 可选 |
|
|
107
|
+
| `--localIpv4 <ip>` | 显式指定内网 IPv4 | 可选 |
|
|
108
|
+
| `--pluginVersion <version>` | 指定安装的插件版本号(如 `1.2.3`),仅在从 npm registry 安装时生效;本地路径存在时忽略 | 可选,默认安装最新版本 |
|
|
72
109
|
|
|
73
110
|
### 安装示例
|
|
74
111
|
|
|
@@ -120,6 +157,17 @@ npx clawpro-diagnostics-metrics-cls-onboard-cli install \
|
|
|
120
157
|
--traceTopicId "zzzzzzzz-trace-topic-id"
|
|
121
158
|
```
|
|
122
159
|
|
|
160
|
+
安装指定版本(从 npm registry 安装时生效):
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
npx clawpro-diagnostics-metrics-cls-onboard-cli install \
|
|
164
|
+
--metricTopicId "xxxxxxxx-metric-topic-id" \
|
|
165
|
+
--secretId "AKIDxxxxxxxx" \
|
|
166
|
+
--secretKey "xxxxxxxxxxxxxxxx" \
|
|
167
|
+
--endpoint "ap-guangzhou.cls.tencentcs.com" \
|
|
168
|
+
--pluginVersion "1.2.3"
|
|
169
|
+
```
|
|
170
|
+
|
|
123
171
|
安装完成后,CLI 会自动重启网关并输出确认信息:
|
|
124
172
|
|
|
125
173
|
```
|
|
@@ -776,9 +824,8 @@ Trace 数据中同时包含以下 APM 兼容的索引属性,可在 CLS 控制
|
|
|
776
824
|
| `cvm_instance_intra_ip` | string | CVM 内网 IP | `instance-metadata.ts` 通过腾讯云 metadata API 获取 `local-ipv4` | 网络定位和故障排查 |
|
|
777
825
|
| `cvm_instance_internet_ip` | string | CVM 公网 IP | `instance-metadata.ts` 通过腾讯云 metadata API 获取 `public-ipv4` | 网络定位和故障排查 |
|
|
778
826
|
| `cvm_instance_region` | string | CVM 所在地域 | `instance-metadata.ts` 通过腾讯云 metadata API 获取 `placement/region` | 标识实例所在地域 |
|
|
779
|
-
| `host_name` | string | 操作系统 hostname | `instance-metadata.ts` 中 `os.hostname()` | 与 `host.name` 类似,字段名格式与指标服务一致 |
|
|
780
827
|
|
|
781
|
-
> **说明**:`cvm_*`
|
|
828
|
+
> **说明**:`cvm_*` 字段仅在腾讯云环境下有值,非腾讯云环境不注入。
|
|
782
829
|
|
|
783
830
|
##### 三、通用 Span 属性(`createSpan()` 注入)
|
|
784
831
|
|
|
@@ -1012,14 +1059,28 @@ agent_end (结束所有长生命周期 Span)
|
|
|
1012
1059
|
| 字段 | 获取方式 | 说明 |
|
|
1013
1060
|
|------|---------|------|
|
|
1014
1061
|
| `cvm_instance_id` | 腾讯云 Metadata 接口 `/latest/meta-data/instance-id` | 实例 ID |
|
|
1015
|
-
| `cvm_instance_name` | `
|
|
1016
|
-
| `cvm_instance_intra_ip` |
|
|
1062
|
+
| `cvm_instance_name` | 腾讯云 Metadata 接口 `/latest/meta-data/instance-name` | 实例名称 |
|
|
1063
|
+
| `cvm_instance_intra_ip` | 腾讯云 Metadata 接口 `/latest/meta-data/local-ipv4` | 内网 IPv4 地址 |
|
|
1064
|
+
| `host_name` | `os.hostname()` 本机主机名 | OTel 资源属性,独立于 CVM 实例名(不参与持久化) |
|
|
1065
|
+
|
|
1066
|
+
> **用户覆盖(Override)**:可在 `openclaw.json` 的 `plugins.entries["clawpro-diagnostics-metrics-cls"].config.instance_metadata` 下显式传入以下字段,覆盖自动获取值。这三个字段**支持热更新**,修改配置文件后约 10 秒内自动生效,无需重启网关:
|
|
1067
|
+
>
|
|
1068
|
+
> | 配置字段 | 覆盖目标 | 跳过的自动逻辑 |
|
|
1069
|
+
> |---------|---------|---------------|
|
|
1070
|
+
> | `instance_id` | `cvm_instance_id` | 腾讯云 Metadata HTTP 请求(以及 `openclaw.json` 持久化读写)、首次失败重试 |
|
|
1071
|
+
> | `instance_name` | `cvm_instance_name` | 腾讯云 Metadata HTTP 请求与 `openclaw.json` 持久化读写(**但不影响 `host_name` 字段**,后者始终取 `os.hostname()`) |
|
|
1072
|
+
> | `local_ipv4` | `cvm_instance_intra_ip` | 腾讯云 Metadata HTTP 请求与 `openclaw.json` 持久化读写 |
|
|
1073
|
+
>
|
|
1074
|
+
> 空字符串或缺失视为未提供,仍走原自动获取逻辑。任一字段提供后,对应的网络请求/持久化读写均会被跳过。
|
|
1017
1075
|
|
|
1018
1076
|
#### 刷新策略
|
|
1019
1077
|
|
|
1020
|
-
-
|
|
1021
|
-
-
|
|
1022
|
-
-
|
|
1078
|
+
- 优先级:**`openclaw.json` 中 `instance_metadata` 显式配置值** > `openclaw.json` 已持久化值 > 腾讯云 metadata 接口
|
|
1079
|
+
- 持久化字段:`plugins.entries["clawpro-diagnostics-metrics-cls"].config.instance_metadata.{instance_id, instance_name, local_ipv4}`
|
|
1080
|
+
- 启动时先从持久化文件读取三个字段,已有值的字段直接使用,不发起网络请求
|
|
1081
|
+
- 仅对"未被 override 覆盖且无持久化值"的字段调用 metadata 接口(请求超时 2 秒),成功后立即增量写入 `openclaw.json`;写入采用文件锁 + 临时文件 rename 的方式,支持多进程并发
|
|
1082
|
+
- 首次获取失败的字段会在 30 秒后重试,最多重试 3 次;仅当仍存在未就绪字段时才继续重试,全部就绪后停止
|
|
1083
|
+
- 用户显式传入的 `*_override` 值**不会**被持久化写入 `openclaw.json`(视为用户已掌握权威值)
|
|
1023
1084
|
- 非腾讯云环境下,元数据字段为空字符串,不影响插件正常运行
|
|
1024
1085
|
|
|
1025
1086
|
---
|
|
@@ -1052,6 +1113,9 @@ npx --yes clawpro-diagnostics-metrics-cls-onboard-cli UpdateParameter [选项]
|
|
|
1052
1113
|
| `--externalLabels <labels>` | Prometheus 自定义标签(格式:`key1=value1,key2=value2`) | ✅ 需要重启网关 |
|
|
1053
1114
|
| `--traceEnabled <bool>` | 是否启用 Trace 链路追踪功能(`true`/`false`) | ❌ 热加载(约 10 秒内生效) |
|
|
1054
1115
|
| `--traceTopicId <id>` | Trace 数据上报使用的 CLS 日志主题 ID | ❌ 热加载(约 10 秒内生效) |
|
|
1116
|
+
| `--instanceId <id>` | 显式指定实例 ID,写入配置文件后插件启动时直接使用,跳过腾讯云 metadata 接口请求 | ❌ 热加载(约 10 秒内生效) |
|
|
1117
|
+
| `--instanceName <name>` | 显式指定实例名称,仅覆盖 `cvm_instance_name`(不影响 `host_name`) | ❌ 热加载(约 10 秒内生效) |
|
|
1118
|
+
| `--localIpv4 <ip>` | 显式指定内网 IPv4 | ❌ 热加载(约 10 秒内生效) |
|
|
1055
1119
|
|
|
1056
1120
|
> **智能重启判断**
|
|
1057
1121
|
> **智能重启判断**:CLI 会自动分析本次更新的参数,如果所有参数均支持热加载,则不会重启网关;如果包含需要重启的参数,CLI 会自动重启网关使配置生效。
|
|
@@ -1177,12 +1241,17 @@ npx clawpro-diagnostics-metrics-cls-onboard-cli UpdateParameter \
|
|
|
1177
1241
|
| `cls.secretId` | 自动更新静态密钥(static 模式),新密钥加密后回写配置文件;同时影响 Trace 模块的凭证 |
|
|
1178
1242
|
| `cls.secretKey` | 自动更新静态密钥(static 模式),新密钥加密后回写配置文件;同时影响 Trace 模块的凭证 |
|
|
1179
1243
|
| `cls.token` | 自动更新临时 Token(CVM 角色模式),同时影响 Trace 模块的凭证 |
|
|
1244
|
+
| `instance_metadata.instance_id` | 热更新 `cvm_instance_id`,覆盖自动获取值;设为空字符串可清空覆盖值并重新触发 metadata 接口获取 |
|
|
1245
|
+
| `instance_metadata.instance_name` | 热更新 `cvm_instance_name`,覆盖自动获取值;不影响 `host_name`(后者始终取 `os.hostname()`);设为空字符串可清空覆盖值并重新触发 metadata 接口获取 |
|
|
1246
|
+
| `instance_metadata.local_ipv4` | 热更新 `cvm_instance_intra_ip`,覆盖自动获取值;设为空字符串可清空覆盖值并重新触发 metadata 接口获取 |
|
|
1180
1247
|
| `trace.enabled` | 动态启用/禁用 Trace 链路追踪功能。即使初始配置不完整导致 trace 禁用,后续补全配置后也可通过热更新自动恢复 |
|
|
1181
1248
|
| `trace.traceTopicId` | 自动切换 Trace 数据上报的 CLS 日志主题 |
|
|
1182
1249
|
| `trace.debug` | 动态开启/关闭 Trace 调试日志,无需重启即可在运行时排查问题 |
|
|
1183
1250
|
| `trace.enabledHooks` | 动态调整启用的 hook 列表,可在运行时按需启用/禁用特定 hook,无需重启 |
|
|
1184
1251
|
|
|
1185
1252
|
> **注意**:`credentialMode`、`roleName` 等凭证模式相关字段,以及 `prometheusEnabled`、`pushIntervalMs`、`externalLabels` 等 Prometheus 配置字段变更后需要重启网关才能生效。可通过 `UpdateParameter` 命令更新,CLI 会自动判断是否需要重启。
|
|
1253
|
+
>
|
|
1254
|
+
> `instance_metadata.instance_id`、`instance_metadata.instance_name`、`instance_metadata.local_ipv4` 三个字段支持热更新,修改后约 10 秒内自动生效,无需重启网关。将字段设为空字符串时,插件会清空对应的覆盖值并重新通过 metadata 接口自动获取(最多重试 3 次,每次间隔 30 秒)。
|
|
1186
1255
|
|
|
1187
1256
|
---
|
|
1188
1257
|
|
package/openclaw.plugin.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"id": "clawpro-diagnostics-metrics-cls",
|
|
3
3
|
"name": "Diagnostics Metrics CLS",
|
|
4
|
-
"version": "2.0.
|
|
4
|
+
"version": "2.0.2",
|
|
5
5
|
"description": "CLS诊断指标导出插件:Prometheus 指标(pull/remote-write)+ 全链路 Trace 追踪",
|
|
6
6
|
"type": "plugin",
|
|
7
7
|
"configSchema": {
|
|
@@ -90,6 +90,14 @@
|
|
|
90
90
|
"instance_id": {
|
|
91
91
|
"type": "string",
|
|
92
92
|
"description": "腾讯云 CVM 实例 ID,由插件自动获取并持久化"
|
|
93
|
+
},
|
|
94
|
+
"instance_name": {
|
|
95
|
+
"type": "string",
|
|
96
|
+
"description": "腾讯云 CVM 实例名称,由插件自动获取并持久化"
|
|
97
|
+
},
|
|
98
|
+
"local_ipv4": {
|
|
99
|
+
"type": "string",
|
|
100
|
+
"description": "腾讯云 CVM 内网 IPv4 地址,由插件自动获取并持久化"
|
|
93
101
|
}
|
|
94
102
|
}
|
|
95
103
|
},
|
package/package.json
CHANGED
|
@@ -1,31 +1 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "clawpro-diagnostics-metrics-cls",
|
|
3
|
-
"version": "2.0.0",
|
|
4
|
-
"description": "CLS OpenClaw diagnostics exporter: Prometheus metrics (pull/remote-write) + full-link Trace",
|
|
5
|
-
"type": "module",
|
|
6
|
-
"main": "index.ts",
|
|
7
|
-
"files": [
|
|
8
|
-
"index.ts",
|
|
9
|
-
"src/**/*.ts",
|
|
10
|
-
"openclaw.plugin.json"
|
|
11
|
-
],
|
|
12
|
-
"openclaw": {
|
|
13
|
-
"extensions": [
|
|
14
|
-
"./index.ts"
|
|
15
|
-
]
|
|
16
|
-
},
|
|
17
|
-
"scripts": {
|
|
18
|
-
"build": "echo 'Plugin publishes .ts source files, no build needed.'",
|
|
19
|
-
"typecheck": "tsc --noEmit",
|
|
20
|
-
"dev": "tsc --noEmit --watch"
|
|
21
|
-
},
|
|
22
|
-
"license": "MIT",
|
|
23
|
-
"dependencies": {
|
|
24
|
-
"prom-client": "^15.1.3",
|
|
25
|
-
"snappyjs": "^0.7.0",
|
|
26
|
-
"tencentcloud-cls-sdk-nodejs": "1.0.3"
|
|
27
|
-
},
|
|
28
|
-
"devDependencies": {
|
|
29
|
-
"typescript": "^5.0.0"
|
|
30
|
-
}
|
|
31
|
-
}
|
|
1
|
+
{"name":"clawpro-diagnostics-metrics-cls","version":"2.0.2","description":"CLS OpenClaw diagnostics exporter: Prometheus metrics (pull/remote-write) + full-link Trace","type":"module","main":"index.ts","files":["index.ts","src/**/*.ts","openclaw.plugin.json"],"openclaw":{"extensions":["./index.ts"]},"scripts":{"build":"echo 'Plugin publishes .ts source files, no build needed.'","typecheck":"tsc --noEmit","dev":"tsc --noEmit --watch"},"license":"MIT","dependencies":{"prom-client":"^15.1.3","snappyjs":"^0.7.0","tencentcloud-cls-sdk-nodejs":"1.0.3"},"devDependencies":{"typescript":"^5.0.0"}}
|
package/src/cls-service.ts
CHANGED
|
@@ -87,7 +87,7 @@ import { onDiagnosticEvent } from "openclaw/plugin-sdk/diagnostics-otel";
|
|
|
87
87
|
import { Registry, Counter, Histogram, Gauge, collectDefaultMetrics } from "prom-client";
|
|
88
88
|
import type { PromTimeSeries, PromLabel } from "./protobuf.ts";
|
|
89
89
|
import { RemoteWriteClient, type RemoteWriteConfig } from "./remote-write.ts";
|
|
90
|
-
import { getInstanceMetadata, startInstanceMetadataRefresh, stopInstanceMetadataRefresh } from "./instance-metadata.ts";
|
|
90
|
+
import { getInstanceMetadata, startInstanceMetadataRefresh, stopInstanceMetadataRefresh, updateInstanceMetadata } from "./instance-metadata.ts";
|
|
91
91
|
import type { SharedCredentialManager, CredentialSnapshot } from "./shared-credential.ts";
|
|
92
92
|
import { decrypt, encrypt, isEncrypted } from "./crypto-utils.ts";
|
|
93
93
|
import { PLUGIN_ID } from "./shared-constants.ts";
|
|
@@ -212,7 +212,16 @@ const ENV_POLL_INTERVAL_MS = 10_000;
|
|
|
212
212
|
*/
|
|
213
213
|
async function readHotConfigFromDisk(
|
|
214
214
|
configPath: string,
|
|
215
|
-
): Promise<{
|
|
215
|
+
): Promise<{
|
|
216
|
+
metricTopicId: string | undefined;
|
|
217
|
+
endpoint: string;
|
|
218
|
+
enableReport: boolean;
|
|
219
|
+
secretId: string;
|
|
220
|
+
secretKey: string;
|
|
221
|
+
instanceId: string;
|
|
222
|
+
instanceName: string;
|
|
223
|
+
localIpv4: string;
|
|
224
|
+
} | null> {
|
|
216
225
|
try {
|
|
217
226
|
const raw = await nodeFs.readFile(configPath, "utf8");
|
|
218
227
|
const config = JSON.parse(raw) as Record<string, unknown>;
|
|
@@ -225,7 +234,7 @@ async function readHotConfigFromDisk(
|
|
|
225
234
|
const rawMetricTopicId = typeof clsCfg?.metricTopicId === "string" ? clsCfg.metricTopicId.trim() : "";
|
|
226
235
|
const endpoint = typeof clsCfg?.endpoint === "string" ? clsCfg.endpoint.trim() : "";
|
|
227
236
|
const enableReport = clsCfg?.enableReport === false ? false : true;
|
|
228
|
-
// 读取并解密 secretId/secretKey,支持 static
|
|
237
|
+
// 读取并解密 secretId/secretKey,支持 static 模式下密钥热更新
|
|
229
238
|
const secretId = typeof clsCfg?.secretId === "string" ? decrypt(clsCfg.secretId.trim()) : "";
|
|
230
239
|
const secretKey = typeof clsCfg?.secretKey === "string" ? decrypt(clsCfg.secretKey.trim()) : "";
|
|
231
240
|
|
|
@@ -235,13 +244,30 @@ async function readHotConfigFromDisk(
|
|
|
235
244
|
metricTopicId = undefined;
|
|
236
245
|
}
|
|
237
246
|
|
|
238
|
-
|
|
247
|
+
// 读取 instance_metadata 三个字段,用于热更新实例元数据缓存
|
|
248
|
+
// 使用 pickOrNull 区分「字段不存在/非字符串」(返回 null)和「字段被显式设为空字符串」(返回 ""):
|
|
249
|
+
// - null:配置文件中无此字段,热更新时不修改缓存
|
|
250
|
+
// - "":用户显式清空,热更新时清空缓存并触发重新从 metadata 接口获取
|
|
251
|
+
// - 非空字符串:用新值覆盖缓存
|
|
252
|
+
const instanceMetadataCfg = entryCfg?.instance_metadata as Record<string, unknown> | undefined;
|
|
253
|
+
const pickOrNull = (v: unknown): string | null => {
|
|
254
|
+
if (typeof v !== "string") return null;
|
|
255
|
+
return v.trim();
|
|
256
|
+
};
|
|
257
|
+
const instanceIdRaw = instanceMetadataCfg ? pickOrNull(instanceMetadataCfg.instance_id) : null;
|
|
258
|
+
const instanceNameRaw = instanceMetadataCfg ? pickOrNull(instanceMetadataCfg.instance_name) : null;
|
|
259
|
+
const localIpv4Raw = instanceMetadataCfg ? pickOrNull(instanceMetadataCfg.local_ipv4) : null;
|
|
260
|
+
// 对外仍使用 string 类型(null 转为空字符串),调用方通过与 active 值比较判断是否变化
|
|
261
|
+
const instanceId = instanceIdRaw ?? "";
|
|
262
|
+
const instanceName = instanceNameRaw ?? "";
|
|
263
|
+
const localIpv4 = localIpv4Raw ?? "";
|
|
264
|
+
|
|
265
|
+
return { metricTopicId, endpoint, enableReport, secretId, secretKey, instanceId, instanceName, localIpv4 };
|
|
239
266
|
} catch {
|
|
240
267
|
// 文件不存在、JSON 解析失败等,返回 null 表示无法读取
|
|
241
268
|
return null;
|
|
242
269
|
}
|
|
243
270
|
}
|
|
244
|
-
|
|
245
271
|
/**
|
|
246
272
|
* 根据 endpoint 和 metricTopicId 生成 Prometheus Remote Write URL。
|
|
247
273
|
* URL 格式与 CLI 工具保持一致:https://<endpoint>/prometheus/<metricTopicId>/api/v1/write
|
|
@@ -351,10 +377,13 @@ export function createPrometheusService(
|
|
|
351
377
|
let activeEndpoint = "";
|
|
352
378
|
/** 当前生效的 enableReport(支持热更新) */
|
|
353
379
|
let activeEnableReport = true;
|
|
354
|
-
/** 追踪 static
|
|
380
|
+
/** 追踪 static 模式下的密钥,用于热更新时检测变化 */
|
|
355
381
|
let activeSecretId = "";
|
|
356
382
|
let activeSecretKey = "";
|
|
357
|
-
|
|
383
|
+
/** 追踪 instance_metadata 三个字段,用于热更新时检测变化 */
|
|
384
|
+
let activeInstanceId = "";
|
|
385
|
+
let activeInstanceName = "";
|
|
386
|
+
let activeLocalIpv4 = "";
|
|
358
387
|
return {
|
|
359
388
|
id: "clawpro-diagnostics-metrics-cls",
|
|
360
389
|
|
|
@@ -372,7 +401,7 @@ export function createPrometheusService(
|
|
|
372
401
|
const prefix = promCfg?.metric_prefix ?? "openclaw";
|
|
373
402
|
const externalLabels = { ...promCfg?.external_labels ?? {} };
|
|
374
403
|
|
|
375
|
-
//
|
|
404
|
+
// 启动实例元数据获取:优先使用 openclaw.json 持久化值,缺失字段再通过 metadata 接口拉取
|
|
376
405
|
await startInstanceMetadataRefresh(ctx.logger, ctx.stateDir).catch((err: unknown) => {
|
|
377
406
|
const detail = err instanceof Error ? err.message : String(err);
|
|
378
407
|
ctx.logger.warn(
|
|
@@ -736,6 +765,10 @@ export function createPrometheusService(
|
|
|
736
765
|
// 从 CLS 配置初始化密钥(变量声明已提升到 start() 外部,确保 stop() 可访问)
|
|
737
766
|
activeSecretId = clsCfg?.secretId ?? "";
|
|
738
767
|
activeSecretKey = clsCfg?.secretKey ?? "";
|
|
768
|
+
// 初始化 instance_metadata 追踪值(从当前缓存读取,避免启动时误判为变化)
|
|
769
|
+
activeInstanceId = getInstanceMetadata().cvmInstanceId;
|
|
770
|
+
activeInstanceName = getInstanceMetadata().cvmInstanceName;
|
|
771
|
+
activeLocalIpv4 = getInstanceMetadata().cvmInstanceIntraIp;
|
|
739
772
|
|
|
740
773
|
const remoteWriteConfigs = promCfg?.remote_write ?? [];
|
|
741
774
|
|
|
@@ -896,16 +929,54 @@ export function createPrometheusService(
|
|
|
896
929
|
const latestEnableReport = diskConfig.enableReport;
|
|
897
930
|
const latestSecretId = diskConfig.secretId;
|
|
898
931
|
const latestSecretKey = diskConfig.secretKey;
|
|
932
|
+
const latestInstanceId = diskConfig.instanceId;
|
|
933
|
+
const latestInstanceName = diskConfig.instanceName;
|
|
934
|
+
const latestLocalIpv4 = diskConfig.localIpv4;
|
|
899
935
|
|
|
900
936
|
// 注意:metricTopicId 格式校验已在 readHotConfigFromDisk 中完成,
|
|
901
937
|
// 不合法时返回 undefined,?? 运算符会自动回退到初始配置值 cfgMetricTopicId
|
|
902
938
|
|
|
903
939
|
const metricChanged = latestMetric !== activeMetricTopicId;
|
|
904
940
|
const endpointChanged = latestEndpoint !== activeEndpoint;
|
|
941
|
+
|
|
942
|
+
// 在比较前先将追踪值与当前缓存同步:
|
|
943
|
+
// 若 activeInstance* 为空但缓存已有值,说明是 metadata 重试成功后缓存被更新,
|
|
944
|
+
// 而非配置文件发生变化。此时静默同步追踪值,避免误判为热更新触发 updateInstanceMetadata。
|
|
945
|
+
const currentCache = getInstanceMetadata();
|
|
946
|
+
if (!activeInstanceId && currentCache.cvmInstanceId) {
|
|
947
|
+
activeInstanceId = currentCache.cvmInstanceId;
|
|
948
|
+
}
|
|
949
|
+
if (!activeInstanceName && currentCache.cvmInstanceName) {
|
|
950
|
+
activeInstanceName = currentCache.cvmInstanceName;
|
|
951
|
+
}
|
|
952
|
+
if (!activeLocalIpv4 && currentCache.cvmInstanceIntraIp) {
|
|
953
|
+
activeLocalIpv4 = currentCache.cvmInstanceIntraIp;
|
|
954
|
+
}
|
|
955
|
+
|
|
956
|
+
// 检测 instance_metadata 三个字段是否变化(包括清空字段的情况,清空表示恢复自动获取)
|
|
957
|
+
const instanceOverrideChanged =
|
|
958
|
+
latestInstanceId !== activeInstanceId
|
|
959
|
+
|| latestInstanceName !== activeInstanceName
|
|
960
|
+
|| latestLocalIpv4 !== activeLocalIpv4;
|
|
905
961
|
const enableReportChanged = latestEnableReport !== activeEnableReport;
|
|
906
|
-
// static
|
|
962
|
+
// static 模式下检测密钥是否变化
|
|
907
963
|
const secretChanged = credentialMode === "static" && latestSecretId && latestSecretKey
|
|
908
964
|
&& (latestSecretId !== activeSecretId || latestSecretKey !== activeSecretKey);
|
|
965
|
+
// instance_metadata 热更新:检测三个字段变化并更新实例元数据缓存
|
|
966
|
+
if (instanceOverrideChanged) {
|
|
967
|
+
activeInstanceId = latestInstanceId;
|
|
968
|
+
activeInstanceName = latestInstanceName;
|
|
969
|
+
activeLocalIpv4 = latestLocalIpv4;
|
|
970
|
+
// 传入 stateDir,使 updateInstanceMetadata 在字段被清空时能触发重新从 metadata 接口获取并持久化
|
|
971
|
+
updateInstanceMetadata({
|
|
972
|
+
instanceId: latestInstanceId,
|
|
973
|
+
instanceName: latestInstanceName,
|
|
974
|
+
localIpv4: latestLocalIpv4,
|
|
975
|
+
}, ctx.logger, ctx.stateDir);
|
|
976
|
+
ctx.logger.debug(
|
|
977
|
+
`diagnostics-metrics/prometheus: 配置文件热更新,instance_metadata 已更新 (instance_id=${activeInstanceId}, instance_name=${activeInstanceName}, local_ipv4=${activeLocalIpv4})`,
|
|
978
|
+
);
|
|
979
|
+
}
|
|
909
980
|
|
|
910
981
|
// static 模式密钥热更新:检测 secretId/secretKey 变化并通知共享凭证管理器
|
|
911
982
|
if (secretChanged) {
|
|
@@ -1091,6 +1162,9 @@ export function createPrometheusService(
|
|
|
1091
1162
|
activeEnableReport = true;
|
|
1092
1163
|
activeSecretId = "";
|
|
1093
1164
|
activeSecretKey = "";
|
|
1165
|
+
activeInstanceId = "";
|
|
1166
|
+
activeInstanceName = "";
|
|
1167
|
+
activeLocalIpv4 = "";
|
|
1094
1168
|
},
|
|
1095
1169
|
} satisfies OpenClawPluginService & { getExports: () => PrometheusPluginExports | null };
|
|
1096
1170
|
}
|
package/src/instance-metadata.ts
CHANGED
|
@@ -1,14 +1,16 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* 腾讯云实例元数据获取模块
|
|
3
3
|
*
|
|
4
|
-
* 通过腾讯云 metadata 接口获取当前实例的 instance-id
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
* -
|
|
9
|
-
* -
|
|
4
|
+
* 通过腾讯云 metadata 接口获取当前实例的 instance-id / instance-name / local-ipv4 三个字段。
|
|
5
|
+
* hostName 字段独立使用 os.hostname()(OTel 资源语义,与 CVM 实例名不同维度)。
|
|
6
|
+
*
|
|
7
|
+
* 三个字段共享统一的获取与持久化流程:
|
|
8
|
+
* - 启动时先从 openclaw.json 的插件配置项中读取已持久化值,已有的字段直接使用,不再发起网络请求
|
|
9
|
+
* - 配置项中缺失的字段通过 metadata 接口获取,成功后写入 openclaw.json 的插件配置项
|
|
10
|
+
* - 首次获取失败的字段会在 30 秒后重试,最多重试 3 次;仅当仍存在未就绪字段时才继续重试,全部字段已成功获取或持久化后不再启动定时刷新
|
|
10
11
|
* - 请求超时 2 秒,失败时使用空字符串作为默认值
|
|
11
|
-
* - 持久化位置:openclaw.json 的 plugins.entries["clawpro-diagnostics-metrics-cls"].config.instance_metadata
|
|
12
|
+
* - 持久化位置:openclaw.json 的 plugins.entries["clawpro-diagnostics-metrics-cls"].config.instance_metadata
|
|
13
|
+
* 字段名:instance_id / instance_name / local_ipv4
|
|
12
14
|
* - 写入 openclaw.json 时使用锁文件(openclaw.json.lock)防止并发写入冲突,锁文件记录 PID 防止僵尸锁
|
|
13
15
|
* - 写入时先写临时文件(openclaw.json.tmp)再 rename 实现原子替换,跨设备时自动降级为直接写入
|
|
14
16
|
*/
|
|
@@ -161,40 +163,15 @@ function readResponseBody(
|
|
|
161
163
|
}
|
|
162
164
|
|
|
163
165
|
/**
|
|
164
|
-
*
|
|
165
|
-
*
|
|
166
|
-
* @returns 第一个匹配的 IPv4 地址,未找到时返回空字符串
|
|
167
|
-
*/
|
|
168
|
-
function getLocalIPv4(): string {
|
|
169
|
-
const interfaces = os.networkInterfaces();
|
|
170
|
-
// 常见虚拟网卡前缀
|
|
171
|
-
const virtualPrefixes = ["docker", "veth", "br-", "virbr", "vnet", "flannel", "cni", "cali"];
|
|
172
|
-
|
|
173
|
-
for (const [name, addrs] of Object.entries(interfaces)) {
|
|
174
|
-
if (!addrs) continue;
|
|
175
|
-
// 跳过虚拟网卡
|
|
176
|
-
const isVirtual = virtualPrefixes.some((prefix) => name.toLowerCase().startsWith(prefix));
|
|
177
|
-
if (isVirtual) continue;
|
|
178
|
-
|
|
179
|
-
for (const addr of addrs) {
|
|
180
|
-
// 仅取 IPv4、非回环、非内部地址优先但也接受内部地址
|
|
181
|
-
if (addr.family === "IPv4" && !addr.internal) {
|
|
182
|
-
return addr.address;
|
|
183
|
-
}
|
|
184
|
-
}
|
|
185
|
-
}
|
|
186
|
-
return "";
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
/**
|
|
190
|
-
* 从 openclaw.json 的插件配置项中读取已持久化的 instance-id
|
|
191
|
-
* 读取路径:plugins.entries[PLUGIN_ID].config.instance_metadata.instance_id
|
|
166
|
+
* 从 openclaw.json 的插件配置项中读取已持久化的实例元数据三元组
|
|
167
|
+
* 读取路径:plugins.entries[PLUGIN_ID].config.instance_metadata.{instance_id, instance_name, local_ipv4}
|
|
192
168
|
* @param stateDir openclaw.json 所在目录
|
|
193
|
-
* @returns
|
|
169
|
+
* @returns 包含三个字段的对象,缺失字段返回空字符串
|
|
194
170
|
*/
|
|
195
|
-
async function
|
|
171
|
+
async function readPersistedMetadata(
|
|
196
172
|
stateDir: string,
|
|
197
|
-
): Promise<string> {
|
|
173
|
+
): Promise<{ instanceId: string; instanceName: string; localIpv4: string }> {
|
|
174
|
+
const empty = { instanceId: "", instanceName: "", localIpv4: "" };
|
|
198
175
|
try {
|
|
199
176
|
const configPath = nodePath.join(stateDir, "openclaw.json");
|
|
200
177
|
const raw = await nodeFs.readFile(configPath, "utf8");
|
|
@@ -204,10 +181,15 @@ async function readInstanceIdFromConfig(
|
|
|
204
181
|
const pluginEntry = entries?.[PLUGIN_ID] as Record<string, unknown> | undefined;
|
|
205
182
|
const pluginConfig = pluginEntry?.config as Record<string, unknown> | undefined;
|
|
206
183
|
const instanceMetadata = pluginConfig?.instance_metadata as Record<string, unknown> | undefined;
|
|
207
|
-
|
|
208
|
-
|
|
184
|
+
if (!instanceMetadata) return empty;
|
|
185
|
+
const pick = (v: unknown): string => (typeof v === "string" && v.trim() ? v.trim() : "");
|
|
186
|
+
return {
|
|
187
|
+
instanceId: pick(instanceMetadata.instance_id),
|
|
188
|
+
instanceName: pick(instanceMetadata.instance_name),
|
|
189
|
+
localIpv4: pick(instanceMetadata.local_ipv4),
|
|
190
|
+
};
|
|
209
191
|
} catch {
|
|
210
|
-
return
|
|
192
|
+
return empty;
|
|
211
193
|
}
|
|
212
194
|
}
|
|
213
195
|
|
|
@@ -312,19 +294,29 @@ async function releaseFileLock(lockPath: string): Promise<void> {
|
|
|
312
294
|
}
|
|
313
295
|
|
|
314
296
|
/**
|
|
315
|
-
*
|
|
316
|
-
* 写入路径:plugins.entries[PLUGIN_ID].config.instance_metadata.instance_id
|
|
297
|
+
* 将实例元数据增量持久化写入 openclaw.json 的插件配置项
|
|
298
|
+
* 写入路径:plugins.entries[PLUGIN_ID].config.instance_metadata.{instance_id, instance_name, local_ipv4}
|
|
317
299
|
* - 使用锁文件(openclaw.json.lock)防止并发写入
|
|
318
300
|
* - 先写临时文件再 rename 实现原子替换,避免写入中途崩溃产生损坏文件
|
|
301
|
+
* - 仅写入 updates 中非空的字段;空字符串字段保留文件中原值(增量合并)
|
|
319
302
|
* @param stateDir openclaw.json 所在目录
|
|
320
|
-
* @param
|
|
303
|
+
* @param updates 待写入的字段,空字符串字段将被忽略
|
|
321
304
|
* @param logger 可选的日志记录器
|
|
322
305
|
*/
|
|
323
|
-
async function
|
|
306
|
+
async function writePersistedMetadata(
|
|
324
307
|
stateDir: string,
|
|
325
|
-
|
|
308
|
+
updates: { instanceId?: string; instanceName?: string; localIpv4?: string },
|
|
326
309
|
logger?: { info: (msg: string) => void; warn: (msg: string) => void; debug: (msg: string) => void },
|
|
327
310
|
): Promise<void> {
|
|
311
|
+
// 过滤掉空字符串字段,避免用空值覆盖文件中已有的旧值
|
|
312
|
+
const effective: Record<string, string> = {};
|
|
313
|
+
if (updates.instanceId) effective.instance_id = updates.instanceId;
|
|
314
|
+
if (updates.instanceName) effective.instance_name = updates.instanceName;
|
|
315
|
+
if (updates.localIpv4) effective.local_ipv4 = updates.localIpv4;
|
|
316
|
+
if (Object.keys(effective).length === 0) {
|
|
317
|
+
return;
|
|
318
|
+
}
|
|
319
|
+
|
|
328
320
|
const configPath = nodePath.join(stateDir, "openclaw.json");
|
|
329
321
|
const lockPath = configPath + ".lock";
|
|
330
322
|
const tmpPath = configPath + ".tmp";
|
|
@@ -367,7 +359,8 @@ async function writeInstanceIdToConfig(
|
|
|
367
359
|
const pluginConfig = pluginEntry.config as Record<string, unknown>;
|
|
368
360
|
|
|
369
361
|
const prev = pluginConfig.instance_metadata as Record<string, unknown> | undefined;
|
|
370
|
-
|
|
362
|
+
// 增量合并:保留文件中已有的其他字段(如用户 override_* 配置),仅覆盖本次 effective 中的字段
|
|
363
|
+
pluginConfig.instance_metadata = { ...(prev ?? {}), ...effective };
|
|
371
364
|
|
|
372
365
|
const content = JSON.stringify(config, null, 2) + "\n";
|
|
373
366
|
// 先写入临时文件,再 rename 实现原子替换,避免写入中途崩溃损坏原文件
|
|
@@ -388,13 +381,13 @@ async function writeInstanceIdToConfig(
|
|
|
388
381
|
}
|
|
389
382
|
|
|
390
383
|
logger?.debug(
|
|
391
|
-
`diagnostics-metrics/instance-metadata:
|
|
384
|
+
`diagnostics-metrics/instance-metadata: 实例元数据已持久化写入 openclaw.json (${Object.keys(effective).join(", ")})`,
|
|
392
385
|
);
|
|
393
386
|
} catch (err) {
|
|
394
387
|
// 清理可能残留的临时文件
|
|
395
388
|
try { await nodeFs.unlink(tmpPath); } catch { /* ignore */ }
|
|
396
389
|
logger?.warn(
|
|
397
|
-
`diagnostics-metrics/instance-metadata:
|
|
390
|
+
`diagnostics-metrics/instance-metadata: 写入实例元数据到 openclaw.json 失败: ${String(err)}`,
|
|
398
391
|
);
|
|
399
392
|
} finally {
|
|
400
393
|
// 无论成功与否,都释放文件锁
|
|
@@ -404,63 +397,97 @@ async function writeInstanceIdToConfig(
|
|
|
404
397
|
|
|
405
398
|
/**
|
|
406
399
|
* 刷新实例元数据
|
|
407
|
-
* 通过 metadata
|
|
408
|
-
*
|
|
409
|
-
*
|
|
400
|
+
* 通过 metadata 接口并行获取 instance-id、instance-name、local-ipv4 三个字段。
|
|
401
|
+
* hostName 字段独立使用 os.hostname()(不依赖网络)。
|
|
402
|
+
*
|
|
403
|
+
* 缓存中已有值的字段会跳过网络请求,避免重试阶段重复拉取已经就绪的字段。
|
|
404
|
+
*
|
|
405
|
+
* @returns
|
|
406
|
+
* - allSuccess:三个 CVM 字段(id/name/ipv4)是否全部就绪
|
|
407
|
+
* - fetched:本次通过 metadata 接口新获取到的字段,仅这些字段需要被持久化
|
|
410
408
|
*/
|
|
411
409
|
async function refreshMetadata(
|
|
412
410
|
logger?: { info: (msg: string) => void; warn: (msg: string) => void; debug: (msg: string) => void },
|
|
413
|
-
): Promise<{
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
411
|
+
): Promise<{
|
|
412
|
+
allSuccess: boolean;
|
|
413
|
+
fetched: { instanceId?: string; instanceName?: string; localIpv4?: string };
|
|
414
|
+
}> {
|
|
415
|
+
const prev = cachedMetadata;
|
|
416
|
+
|
|
417
|
+
// 仅为缓存中尚未就绪的字段发起 metadata 请求,避免重试阶段重复拉取已就绪字段
|
|
418
|
+
const needIdFetch = !prev.cvmInstanceId;
|
|
419
|
+
const needNameFetch = !prev.cvmInstanceName;
|
|
420
|
+
const needIpFetch = !prev.cvmInstanceIntraIp;
|
|
421
|
+
|
|
422
|
+
const [idResult, nameResult, ipResult] = await Promise.all([
|
|
423
|
+
needIdFetch ? settle(fetchMetadataField("instance-id")) : Promise.resolve({ status: "skipped" as const }),
|
|
424
|
+
needNameFetch ? settle(fetchMetadataField("instance-name")) : Promise.resolve({ status: "skipped" as const }),
|
|
425
|
+
needIpFetch ? settle(fetchMetadataField("local-ipv4")) : Promise.resolve({ status: "skipped" as const }),
|
|
426
|
+
]);
|
|
419
427
|
|
|
420
428
|
// 获取本机 hostname(同步调用,不依赖网络)
|
|
421
|
-
|
|
429
|
+
// hostName 字段始终保留 os.hostname(),不受 metadata 响应影响
|
|
430
|
+
let hostName = prev.hostName;
|
|
422
431
|
try {
|
|
423
432
|
hostName = os.hostname();
|
|
424
433
|
} catch {
|
|
425
434
|
logger?.warn(`diagnostics-metrics/instance-metadata: 获取 hostname 失败`);
|
|
426
435
|
}
|
|
427
436
|
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
try {
|
|
431
|
-
localIpv4 = getLocalIPv4();
|
|
432
|
-
} catch {
|
|
433
|
-
logger?.warn(`diagnostics-metrics/instance-metadata: 获取本地 IPv4 地址失败`);
|
|
434
|
-
}
|
|
437
|
+
const fetched: { instanceId?: string; instanceName?: string; localIpv4?: string } = {};
|
|
438
|
+
const merged: InstanceMetadata = { ...prev, hostName: hostName || prev.hostName };
|
|
435
439
|
|
|
436
|
-
//
|
|
437
|
-
const prev = cachedMetadata;
|
|
438
|
-
const updated: InstanceMetadata = {
|
|
439
|
-
cvmInstanceId: prev.cvmInstanceId,
|
|
440
|
-
cvmInstanceName: hostName || prev.cvmInstanceName,
|
|
441
|
-
cvmInstanceIntraIp: localIpv4 || prev.cvmInstanceIntraIp,
|
|
442
|
-
hostName: hostName || prev.hostName,
|
|
443
|
-
};
|
|
444
|
-
|
|
445
|
-
let idSuccess = false;
|
|
446
|
-
let fetchedInstanceId = "";
|
|
440
|
+
// ── instance-id ────────────────────────────────────────────────────
|
|
447
441
|
if (idResult.status === "fulfilled" && idResult.value) {
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
logger?.warn(`diagnostics-metrics/instance-metadata: 获取 instance-id 失败:
|
|
442
|
+
merged.cvmInstanceId = idResult.value;
|
|
443
|
+
fetched.instanceId = idResult.value;
|
|
444
|
+
} else if (idResult.status === "rejected") {
|
|
445
|
+
logger?.warn(`diagnostics-metrics/instance-metadata: 获取 instance-id 失败: ${String(idResult.reason)}`);
|
|
446
|
+
} else if (idResult.status === "fulfilled") {
|
|
447
|
+
logger?.warn(`diagnostics-metrics/instance-metadata: 获取 instance-id 失败: empty response`);
|
|
448
|
+
}
|
|
449
|
+
|
|
450
|
+
// ── instance-name ──────────────────────────────────────────────────
|
|
451
|
+
if (nameResult.status === "fulfilled" && nameResult.value) {
|
|
452
|
+
merged.cvmInstanceName = nameResult.value;
|
|
453
|
+
fetched.instanceName = nameResult.value;
|
|
454
|
+
} else if (nameResult.status === "rejected") {
|
|
455
|
+
logger?.warn(`diagnostics-metrics/instance-metadata: 获取 instance-name 失败: ${String(nameResult.reason)}`);
|
|
456
|
+
} else if (nameResult.status === "fulfilled") {
|
|
457
|
+
logger?.warn(`diagnostics-metrics/instance-metadata: 获取 instance-name 失败: empty response`);
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
// ── local-ipv4 ─────────────────────────────────────────────────────
|
|
461
|
+
if (ipResult.status === "fulfilled" && ipResult.value) {
|
|
462
|
+
merged.cvmInstanceIntraIp = ipResult.value;
|
|
463
|
+
fetched.localIpv4 = ipResult.value;
|
|
464
|
+
} else if (ipResult.status === "rejected") {
|
|
465
|
+
logger?.warn(`diagnostics-metrics/instance-metadata: 获取 local-ipv4 失败: ${String(ipResult.reason)}`);
|
|
466
|
+
} else if (ipResult.status === "fulfilled") {
|
|
467
|
+
logger?.warn(`diagnostics-metrics/instance-metadata: 获取 local-ipv4 失败: empty response`);
|
|
454
468
|
}
|
|
455
469
|
|
|
456
470
|
// 原子替换缓存引用(不可变对象,避免并发读写问题)
|
|
457
|
-
cachedMetadata = Object.freeze(
|
|
471
|
+
cachedMetadata = Object.freeze(merged);
|
|
472
|
+
|
|
473
|
+
const allSuccess = Boolean(merged.cvmInstanceId) && Boolean(merged.cvmInstanceName)
|
|
474
|
+
&& Boolean(merged.cvmInstanceIntraIp);
|
|
458
475
|
|
|
459
476
|
logger?.debug(
|
|
460
477
|
`diagnostics-metrics/instance-metadata: 当前实例信息 cvm_instance_id=${cachedMetadata.cvmInstanceId}, cvm_instance_name=${cachedMetadata.cvmInstanceName}, cvm_instance_intra_ip=${cachedMetadata.cvmInstanceIntraIp}, host_name=${cachedMetadata.hostName}`,
|
|
461
478
|
);
|
|
462
479
|
|
|
463
|
-
return {
|
|
480
|
+
return { allSuccess, fetched };
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
/**
 * Turns a promise into one that never rejects: the outcome is reported as a
 * settled-style record (`status` plus `value` or `reason`) so callers can
 * handle success and failure uniformly.
 */
async function settle<T>(
  p: Promise<T>,
): Promise<{ status: "fulfilled"; value: T } | { status: "rejected"; reason: unknown }> {
  try {
    const value = await p;
    return { status: "fulfilled", value };
  } catch (reason) {
    // Rejections are captured as data instead of being propagated.
    return { status: "rejected", reason };
  }
}
|
|
465
492
|
|
|
466
493
|
/**
|
|
@@ -471,14 +498,89 @@ export function getInstanceMetadata(): Readonly<InstanceMetadata> {
|
|
|
471
498
|
return cachedMetadata;
|
|
472
499
|
}
|
|
473
500
|
|
|
501
|
+
/**
 * Hot-updates the three CVM fields of the cached instance metadata
 * (instance id / instance name / local IPv4).
 *
 * Only takes effect after the module has been initialized; before that it logs
 * a warning and returns, so the startup initialization is not overwritten.
 * When none of the three fields would change, the cache is left untouched.
 *
 * Per-field semantics:
 * - non-empty string: overwrite the cached value
 * - empty string: clear the field and trigger a re-fetch from the metadata endpoint
 * - undefined (or null from untyped callers): keep the cached value
 *
 * `hostName` is carried over from the previous cache entry and is never
 * modified by this function.
 *
 * @param updates  Fields to update; undefined keeps the current value, an empty
 *                 string clears it and restores automatic fetching.
 * @param logger   Optional logger, used for the "not initialized" warning and
 *                 for re-fetch diagnostics.
 * @param stateDir Directory containing openclaw.json; when provided, freshly
 *                 fetched values are persisted after a re-fetch.
 */
export function updateInstanceMetadata(
  updates: { instanceId?: string; instanceName?: string; localIpv4?: string },
  logger?: { info: (msg: string) => void; warn: (msg: string) => void; debug: (msg: string) => void },
  stateDir?: string,
): void {
  // Do nothing before initialization so the startup flow is not disturbed; the
  // warning makes a silently ignored hot update diagnosable.
  if (!initialized) {
    logger?.warn(
      `diagnostics-metrics/instance-metadata: updateInstanceMetadata 调用时模块尚未初始化,本次热更新已跳过`,
    );
    return;
  }

  const prev = cachedMetadata;
  // `??` rather than `||`: undefined means "keep the old value", while an empty
  // string is a deliberate "clear and restore automatic fetching".
  const newInstanceId = updates.instanceId ?? prev.cvmInstanceId;
  const newInstanceName = updates.instanceName ?? prev.cvmInstanceName;
  const newLocalIpv4 = updates.localIpv4 ?? prev.cvmInstanceIntraIp;

  // Nothing changed: skip the pointless cache replacement.
  if (
    newInstanceId === prev.cvmInstanceId
    && newInstanceName === prev.cvmInstanceName
    && newLocalIpv4 === prev.cvmInstanceIntraIp
  ) {
    return;
  }

  // Replace the cache reference with a new frozen object instead of mutating it.
  cachedMetadata = Object.freeze({
    cvmInstanceId: newInstanceId,
    cvmInstanceName: newInstanceName,
    cvmInstanceIntraIp: newLocalIpv4,
    hostName: prev.hostName,
  });

  // Any empty CVM field (cleared now or still missing) triggers a re-fetch.
  const needRefetch = !newInstanceId || !newInstanceName || !newLocalIpv4;
  if (!needRefetch) {
    return;
  }

  logger?.debug(
    `diagnostics-metrics/instance-metadata: 热更新清空了部分字段,重新触发 metadata 接口获取`,
  );

  // Reset the retry budget and cancel any pending retry so scheduleRetry can
  // start a fresh retry cycle.
  retryCount = 0;
  if (retryTimer) {
    clearTimeout(retryTimer);
    retryTimer = null;
  }

  // Fire-and-forget so the caller is not blocked. The trailing catch keeps a
  // failure in the re-fetch or its follow-up from surfacing as an unhandled
  // promise rejection.
  void refreshMetadata(logger)
    .then(({ allSuccess, fetched }) => {
      // stop() may have been called while the fetch was in flight.
      if (stopped) return;
      if (stateDir && (fetched.instanceId || fetched.instanceName || fetched.localIpv4)) {
        // NOTE(review): presumably writePersistedMetadata reports its own errors via logger — confirm.
        void writePersistedMetadata(stateDir, fetched, logger);
      }
      if (!allSuccess) {
        scheduleRetry(logger, stateDir);
      }
    })
    .catch((err: unknown) => {
      logger?.warn(
        `diagnostics-metrics/instance-metadata: 热更新后重新获取元数据失败: ${String(err)}`,
      );
    });
}
|
|
573
|
+
|
|
474
574
|
/**
|
|
475
575
|
* 启动实例元数据获取
|
|
476
|
-
* -
|
|
477
|
-
* -
|
|
478
|
-
* -
|
|
576
|
+
* - 优先应用用户显式传入的 override(不持久化,不触发网络请求)
|
|
577
|
+
* - 其次从 openclaw.json 中读取已持久化的值,直接注入缓存
|
|
578
|
+
* - 最后仅对尚未就绪的字段调用 metadata 接口拉取
|
|
579
|
+
* - 获取成功的字段立即持久化写入 openclaw.json(override 值不持久化)
|
|
580
|
+
* - 若仍有字段未就绪,启动 30 秒间隔的重试,最多 MAX_RETRY_COUNT 次
|
|
479
581
|
*
|
|
480
582
|
* @param logger 可选的日志记录器
|
|
481
|
-
* @param stateDir openclaw.json
|
|
583
|
+
* @param stateDir openclaw.json 所在目录,用于持久化三元组
|
|
482
584
|
*/
|
|
483
585
|
export async function startInstanceMetadataRefresh(
|
|
484
586
|
logger?: { info: (msg: string) => void; warn: (msg: string) => void; debug: (msg: string) => void },
|
|
@@ -502,50 +604,51 @@ export async function startInstanceMetadataRefresh(
|
|
|
502
604
|
initialized = true;
|
|
503
605
|
|
|
504
606
|
try {
|
|
505
|
-
//
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
607
|
+
// 同步预取 hostName,该字段独立于 CVM metadata 流程
|
|
608
|
+
let hostName = "";
|
|
609
|
+
try { hostName = os.hostname(); } catch { /* ignore */ }
|
|
610
|
+
|
|
611
|
+
// 读取持久化值;配置文件中已有的字段直接使用,不再发起网络请求
|
|
612
|
+
const persisted = stateDir ? await readPersistedMetadata(stateDir) : {
|
|
613
|
+
instanceId: "",
|
|
614
|
+
instanceName: "",
|
|
615
|
+
localIpv4: "",
|
|
616
|
+
};
|
|
617
|
+
|
|
618
|
+
// 基于持久化值构建初始缓存;尚未就绪的字段留空,后续由 metadata 接口填充
|
|
619
|
+
const initial: InstanceMetadata = {
|
|
620
|
+
cvmInstanceId: persisted.instanceId,
|
|
621
|
+
cvmInstanceName: persisted.instanceName,
|
|
622
|
+
cvmInstanceIntraIp: persisted.localIpv4,
|
|
623
|
+
hostName,
|
|
624
|
+
};
|
|
625
|
+
cachedMetadata = Object.freeze(initial);
|
|
626
|
+
|
|
627
|
+
// 全部字段已由 override 或持久化值填充,无需网络请求
|
|
628
|
+
if (initial.cvmInstanceId && initial.cvmInstanceName && initial.cvmInstanceIntraIp) {
|
|
629
|
+
logger?.debug(
|
|
630
|
+
`diagnostics-metrics/instance-metadata: 当前实例信息 cvm_instance_id=${initial.cvmInstanceId}, cvm_instance_name=${initial.cvmInstanceName}, cvm_instance_intra_ip=${initial.cvmInstanceIntraIp}, host_name=${initial.hostName}`,
|
|
631
|
+
);
|
|
632
|
+
return;
|
|
529
633
|
}
|
|
530
634
|
|
|
531
|
-
//
|
|
532
|
-
//
|
|
533
|
-
const {
|
|
635
|
+
// 仍有字段未就绪,通过 metadata 接口拉取
|
|
636
|
+
// refreshMetadata 内部已按缓存和 override 做了跳过处理,仅对未就绪字段发起请求
|
|
637
|
+
const { allSuccess, fetched } = await refreshMetadata(logger);
|
|
534
638
|
|
|
535
|
-
//
|
|
536
|
-
if (
|
|
639
|
+
// 新获取到的字段立即持久化(override 值不会出现在 fetched 中,因此不会被写入)
|
|
640
|
+
if (stateDir && (fetched.instanceId || fetched.instanceName || fetched.localIpv4)) {
|
|
641
|
+
await writePersistedMetadata(stateDir, fetched, logger);
|
|
642
|
+
}
|
|
643
|
+
|
|
644
|
+
if (!allSuccess) {
|
|
537
645
|
logger?.warn(
|
|
538
|
-
"diagnostics-metrics/instance-metadata:
|
|
646
|
+
"diagnostics-metrics/instance-metadata: 首次获取存在未就绪字段,可能不在腾讯云环境,将在 30 秒后重试",
|
|
539
647
|
);
|
|
540
648
|
scheduleRetry(logger, stateDir);
|
|
541
649
|
return;
|
|
542
650
|
}
|
|
543
|
-
|
|
544
|
-
// instance-id 获取成功,持久化写入 openclaw.json
|
|
545
|
-
if (stateDir && fetchedInstanceId) {
|
|
546
|
-
await writeInstanceIdToConfig(stateDir, fetchedInstanceId, logger);
|
|
547
|
-
}
|
|
548
|
-
// instance-id 已成功获取并持久化,不再启动定时刷新
|
|
651
|
+
// 全部就绪,不再启动定时刷新
|
|
549
652
|
} catch (err) {
|
|
550
653
|
// 启动过程中发生未预期的异常,重置初始化标记,使调用方可以在修复问题后重新调用 start()
|
|
551
654
|
// 注意:仅在 refCount > 0 时才递减,防止并发 stop() 已将 refCount 减为 0 后再次变为负数
|
|
@@ -560,7 +663,8 @@ export async function startInstanceMetadataRefresh(
|
|
|
560
663
|
/**
|
|
561
664
|
* 安排短间隔重试
|
|
562
665
|
* 首次失败后每 30 秒重试一次,最多重试 MAX_RETRY_COUNT 次
|
|
563
|
-
*
|
|
666
|
+
* 每次重试仅对未就绪字段发起 metadata 请求;新获取的字段立即持久化写入 openclaw.json
|
|
667
|
+
* 所有 CVM 字段(id/name/ipv4)全部就绪后停止重试
|
|
564
668
|
*/
|
|
565
669
|
function scheduleRetry(
|
|
566
670
|
logger?: { info: (msg: string) => void; warn: (msg: string) => void; debug: (msg: string) => void },
|
|
@@ -583,22 +687,22 @@ function scheduleRetry(
|
|
|
583
687
|
`diagnostics-metrics/instance-metadata: 第 ${retryCount}/${MAX_RETRY_COUNT} 次重试获取元数据`,
|
|
584
688
|
);
|
|
585
689
|
|
|
586
|
-
const {
|
|
690
|
+
const { allSuccess, fetched } = await refreshMetadata(logger);
|
|
587
691
|
|
|
588
692
|
// 再次检查:refreshMetadata 是异步的,期间 stop() 可能已被调用
|
|
589
693
|
if (stopped) return;
|
|
590
694
|
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
return;
|
|
695
|
+
// 本次新获取到的字段立即持久化(override 值不会出现在 fetched 中)
|
|
696
|
+
if (stateDir && (fetched.instanceId || fetched.instanceName || fetched.localIpv4)) {
|
|
697
|
+
await writePersistedMetadata(stateDir, fetched, logger);
|
|
595
698
|
}
|
|
596
699
|
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
700
|
+
if (!allSuccess) {
|
|
701
|
+
// 仍有未就绪字段,继续重试
|
|
702
|
+
scheduleRetry(logger, stateDir);
|
|
703
|
+
return;
|
|
600
704
|
}
|
|
601
|
-
//
|
|
705
|
+
// 全部就绪后不再启动定时刷新
|
|
602
706
|
}, RETRY_INTERVAL_MS);
|
|
603
707
|
|
|
604
708
|
// 避免定时器阻止进程退出(Node.js 环境下 setTimeout 返回的 Timeout 对象具有 unref 方法)
|
package/src/trace-service.ts
CHANGED
|
@@ -134,7 +134,6 @@ export function createTraceService(config: TraceConfig) {
|
|
|
134
134
|
if (metadata.cvmInstanceId) attrs.cvm_instance_id = metadata.cvmInstanceId;
|
|
135
135
|
if (metadata.cvmInstanceName) attrs.cvm_instance_name = metadata.cvmInstanceName;
|
|
136
136
|
if (metadata.cvmInstanceIntraIp) attrs.cvm_instance_intra_ip = metadata.cvmInstanceIntraIp;
|
|
137
|
-
if (metadata.hostName) attrs.host_name = metadata.hostName;
|
|
138
137
|
return attrs;
|
|
139
138
|
}
|
|
140
139
|
|