blue-js-sdk 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. package/CHANGELOG.md +446 -0
  2. package/LICENSE +21 -0
  3. package/README.md +75 -0
  4. package/ai-path/ADMIN-ELEVATION.md +116 -0
  5. package/ai-path/AI-MANIFESTO.md +185 -0
  6. package/ai-path/BREAKING.md +74 -0
  7. package/ai-path/CHECKLIST.md +619 -0
  8. package/ai-path/CONNECTION-STEPS.md +724 -0
  9. package/ai-path/DECISION-TREE.md +378 -0
  10. package/ai-path/DEPENDENCIES.md +459 -0
  11. package/ai-path/E2E-FLOW.md +1555 -0
  12. package/ai-path/FAILURES.md +403 -0
  13. package/ai-path/GUIDE.md +1217 -0
  14. package/ai-path/README.md +558 -0
  15. package/ai-path/SPLIT-TUNNEL.md +266 -0
  16. package/ai-path/cli.js +535 -0
  17. package/ai-path/connect.js +884 -0
  18. package/ai-path/discover.js +178 -0
  19. package/ai-path/environment.js +266 -0
  20. package/ai-path/errors.js +86 -0
  21. package/ai-path/examples/autonomous-agent.mjs +220 -0
  22. package/ai-path/examples/multi-region.mjs +174 -0
  23. package/ai-path/examples/one-shot.mjs +31 -0
  24. package/ai-path/index.js +60 -0
  25. package/ai-path/pricing.js +136 -0
  26. package/ai-path/recommend.js +413 -0
  27. package/ai-path/run-admin.vbs +25 -0
  28. package/ai-path/setup.js +291 -0
  29. package/ai-path/wallet.js +137 -0
  30. package/app-helpers.js +363 -0
  31. package/app-settings.js +95 -0
  32. package/app-types.js +267 -0
  33. package/audit.js +847 -0
  34. package/batch.js +293 -0
  35. package/bin/setup.js +376 -0
  36. package/chain/authz.js +109 -0
  37. package/chain/broadcast.js +472 -0
  38. package/chain/client.js +160 -0
  39. package/chain/fee-grants.js +305 -0
  40. package/chain/index.js +891 -0
  41. package/chain/lcd.js +313 -0
  42. package/chain/queries.js +547 -0
  43. package/chain/rpc.js +408 -0
  44. package/chain/wallet.js +141 -0
  45. package/cli/config.js +143 -0
  46. package/cli/index.js +463 -0
  47. package/cli/output.js +182 -0
  48. package/cli.js +491 -0
  49. package/client/index.js +251 -0
  50. package/client.js +271 -0
  51. package/config/index.js +255 -0
  52. package/connection/connect.js +849 -0
  53. package/connection/disconnect.js +180 -0
  54. package/connection/discovery.js +321 -0
  55. package/connection/index.js +76 -0
  56. package/connection/proxy.js +148 -0
  57. package/connection/resilience.js +428 -0
  58. package/connection/security.js +232 -0
  59. package/connection/state.js +369 -0
  60. package/connection/tunnel.js +691 -0
  61. package/consumer.js +132 -0
  62. package/cosmjs-setup.js +1884 -0
  63. package/defaults.js +366 -0
  64. package/disk-cache.js +107 -0
  65. package/dist/client.d.ts +108 -0
  66. package/dist/client.d.ts.map +1 -0
  67. package/dist/client.js +400 -0
  68. package/dist/client.js.map +1 -0
  69. package/dist/index.d.ts +8 -0
  70. package/dist/index.d.ts.map +1 -0
  71. package/dist/index.js +8 -0
  72. package/dist/index.js.map +1 -0
  73. package/errors/index.js +112 -0
  74. package/errors.js +218 -0
  75. package/examples/README.md +64 -0
  76. package/examples/connect-direct.mjs +106 -0
  77. package/examples/connect-plan.mjs +125 -0
  78. package/examples/error-handling.mjs +109 -0
  79. package/examples/query-nodes.mjs +94 -0
  80. package/examples/wallet-basics.mjs +61 -0
  81. package/generated/amino/amino.ts +9 -0
  82. package/generated/cosmos/base/v1beta1/coin.ts +365 -0
  83. package/generated/cosmos_proto/cosmos.ts +323 -0
  84. package/generated/gogoproto/gogo.ts +9 -0
  85. package/generated/google/protobuf/descriptor.ts +7601 -0
  86. package/generated/google/protobuf/duration.ts +208 -0
  87. package/generated/google/protobuf/timestamp.ts +238 -0
  88. package/generated/sentinel/lease/v1/events.ts +924 -0
  89. package/generated/sentinel/lease/v1/lease.ts +292 -0
  90. package/generated/sentinel/lease/v1/msg.ts +949 -0
  91. package/generated/sentinel/lease/v1/params.ts +164 -0
  92. package/generated/sentinel/node/v3/events.ts +881 -0
  93. package/generated/sentinel/node/v3/msg.ts +1002 -0
  94. package/generated/sentinel/node/v3/node.ts +263 -0
  95. package/generated/sentinel/node/v3/params.ts +183 -0
  96. package/generated/sentinel/plan/v3/events.ts +675 -0
  97. package/generated/sentinel/plan/v3/msg.ts +1191 -0
  98. package/generated/sentinel/plan/v3/plan.ts +283 -0
  99. package/generated/sentinel/provider/v2/events.ts +171 -0
  100. package/generated/sentinel/provider/v2/msg.ts +480 -0
  101. package/generated/sentinel/provider/v2/params.ts +131 -0
  102. package/generated/sentinel/provider/v2/provider.ts +246 -0
  103. package/generated/sentinel/session/v3/events.ts +480 -0
  104. package/generated/sentinel/session/v3/msg.ts +616 -0
  105. package/generated/sentinel/session/v3/params.ts +260 -0
  106. package/generated/sentinel/session/v3/proof.ts +180 -0
  107. package/generated/sentinel/session/v3/session.ts +384 -0
  108. package/generated/sentinel/subscription/v3/events.ts +1181 -0
  109. package/generated/sentinel/subscription/v3/msg.ts +1305 -0
  110. package/generated/sentinel/subscription/v3/params.ts +167 -0
  111. package/generated/sentinel/subscription/v3/subscription.ts +315 -0
  112. package/generated/sentinel/types/v1/bandwidth.ts +124 -0
  113. package/generated/sentinel/types/v1/price.ts +149 -0
  114. package/generated/sentinel/types/v1/renewal.ts +87 -0
  115. package/generated/sentinel/types/v1/status.ts +54 -0
  116. package/generated/typeRegistry.ts +27 -0
  117. package/index.js +486 -0
  118. package/node-connect.js +3015 -0
  119. package/operator.js +134 -0
  120. package/package.json +113 -0
  121. package/plan-operations.js +199 -0
  122. package/preflight.js +352 -0
  123. package/pricing/index.js +262 -0
  124. package/proto/amino/amino.proto +84 -0
  125. package/proto/cosmos/base/v1beta1/coin.proto +61 -0
  126. package/proto/cosmos_proto/cosmos.proto +112 -0
  127. package/proto/gogoproto/gogo.proto +145 -0
  128. package/proto/google/api/annotations.proto +31 -0
  129. package/proto/google/api/http.proto +370 -0
  130. package/proto/google/protobuf/any.proto +106 -0
  131. package/proto/google/protobuf/duration.proto +115 -0
  132. package/proto/google/protobuf/timestamp.proto +145 -0
  133. package/proto/sentinel/lease/v1/events.proto +52 -0
  134. package/proto/sentinel/lease/v1/genesis.proto +15 -0
  135. package/proto/sentinel/lease/v1/lease.proto +25 -0
  136. package/proto/sentinel/lease/v1/msg.proto +62 -0
  137. package/proto/sentinel/lease/v1/params.proto +17 -0
  138. package/proto/sentinel/node/v3/events.proto +50 -0
  139. package/proto/sentinel/node/v3/genesis.proto +15 -0
  140. package/proto/sentinel/node/v3/msg.proto +63 -0
  141. package/proto/sentinel/node/v3/node.proto +27 -0
  142. package/proto/sentinel/node/v3/params.proto +21 -0
  143. package/proto/sentinel/node/v3/querier.proto +63 -0
  144. package/proto/sentinel/plan/v3/events.proto +41 -0
  145. package/proto/sentinel/plan/v3/genesis.proto +21 -0
  146. package/proto/sentinel/plan/v3/msg.proto +83 -0
  147. package/proto/sentinel/plan/v3/plan.proto +32 -0
  148. package/proto/sentinel/plan/v3/querier.proto +53 -0
  149. package/proto/sentinel/provider/v2/events.proto +16 -0
  150. package/proto/sentinel/provider/v2/genesis.proto +15 -0
  151. package/proto/sentinel/provider/v2/msg.proto +35 -0
  152. package/proto/sentinel/provider/v2/params.proto +17 -0
  153. package/proto/sentinel/provider/v2/provider.proto +24 -0
  154. package/proto/sentinel/provider/v3/genesis.proto +15 -0
  155. package/proto/sentinel/provider/v3/params.proto +13 -0
  156. package/proto/sentinel/session/v3/events.proto +30 -0
  157. package/proto/sentinel/session/v3/genesis.proto +15 -0
  158. package/proto/sentinel/session/v3/msg.proto +50 -0
  159. package/proto/sentinel/session/v3/params.proto +25 -0
  160. package/proto/sentinel/session/v3/proof.proto +25 -0
  161. package/proto/sentinel/session/v3/querier.proto +100 -0
  162. package/proto/sentinel/session/v3/session.proto +50 -0
  163. package/proto/sentinel/subscription/v2/allocation.proto +21 -0
  164. package/proto/sentinel/subscription/v2/payout.proto +22 -0
  165. package/proto/sentinel/subscription/v3/events.proto +65 -0
  166. package/proto/sentinel/subscription/v3/genesis.proto +17 -0
  167. package/proto/sentinel/subscription/v3/msg.proto +83 -0
  168. package/proto/sentinel/subscription/v3/params.proto +21 -0
  169. package/proto/sentinel/subscription/v3/subscription.proto +33 -0
  170. package/proto/sentinel/types/v1/bandwidth.proto +19 -0
  171. package/proto/sentinel/types/v1/price.proto +21 -0
  172. package/proto/sentinel/types/v1/renewal.proto +21 -0
  173. package/proto/sentinel/types/v1/status.proto +16 -0
  174. package/protocol/encoding.js +341 -0
  175. package/protocol/events.js +361 -0
  176. package/protocol/handshake.js +297 -0
  177. package/protocol/index.js +15 -0
  178. package/protocol/messages.js +346 -0
  179. package/protocol/plans.js +199 -0
  180. package/protocol/v2ray.js +268 -0
  181. package/protocol/v3.js +723 -0
  182. package/protocol/wireguard.js +125 -0
  183. package/security/index.js +132 -0
  184. package/session-manager.js +329 -0
  185. package/session-tracker.js +80 -0
  186. package/setup.js +376 -0
  187. package/speedtest/index.js +528 -0
  188. package/speedtest.js +567 -0
  189. package/src/client.ts +502 -0
  190. package/src/index.ts +20 -0
  191. package/state/index.js +347 -0
  192. package/state.js +516 -0
  193. package/test-all-chain-ops.js +493 -0
  194. package/test-all-logic.js +199 -0
  195. package/test-all-msg-types.js +292 -0
  196. package/test-every-connection.js +208 -0
  197. package/test-feegrant-connect.js +98 -0
  198. package/test-logic.js +148 -0
  199. package/test-mainnet.js +176 -0
  200. package/test-plan-lifecycle.js +335 -0
  201. package/tls-trust.js +132 -0
  202. package/tsconfig.build.json +20 -0
  203. package/tsconfig.json +34 -0
  204. package/types/chain.d.ts +746 -0
  205. package/types/connection.d.ts +425 -0
  206. package/types/errors.d.ts +174 -0
  207. package/types/index.d.ts +1380 -0
  208. package/types/nodes.d.ts +187 -0
  209. package/types/pricing.d.ts +156 -0
  210. package/types/protocol.d.ts +332 -0
  211. package/types/session.d.ts +236 -0
  212. package/types/settings.d.ts +192 -0
  213. package/v3protocol.js +1053 -0
  214. package/wallet/index.js +153 -0
  215. package/wireguard.js +307 -0
@@ -0,0 +1,403 @@
1
+ # Sentinel SDK Failure Catalog
2
+
3
+ > Every failure pattern discovered across 10 apps, 2200+ mainnet node tests, 25 project findings files, 161 suggestion files, and 200+ hours of development.
4
+ > For any AI building on this SDK: read this BEFORE writing code. Every entry cost real tokens and real debugging time.
5
+
6
+ ---
7
+
8
+ ## Quick Rules -- The 39 Most Critical
9
+
10
+ | # | Rule | Category | Consequence of Violation |
11
+ |---|------|----------|--------------------------|
12
+ | 1 | **Use v3 LCD paths, not v2** -- v2 returns "Not Implemented" except `/sentinel/provider/v2/` | chain | Zero chain queries work |
13
+ | 2 | **Never trust `count_total` or `next_key`** on Sentinel LCD pagination -- use `limit=5000` single request | chain | Missing 400+ nodes |
14
+ | 3 | **`remote_addrs` is an array, not `remote_url` string** -- LCD v3 changed the field name and format | chain | All connections fail silently (undefined) |
15
+ | 4 | **Session data is nested under `base_session`** -- always access `session.base_session.id` | chain | Silent undefined propagation |
16
+ | 5 | **WireGuard requires Administrator privileges** -- check BEFORE paying for a session | tunnel | Money wasted, no connection |
17
+ | 6 | **Never install full-tunnel to unreachable endpoint** -- `AllowedIPs=0.0.0.0/0` kills internet instantly | tunnel | Total internet death |
18
+ | 7 | **V2Ray must be exactly v5.2.1** -- newer versions have observatory bugs | dependencies | Silent connection failures |
19
+ | 8 | **MTU must be 1280, not 1420** -- Sentinel nodes are configured for 1280 | tunnel | TLS handshake failures, service crashes |
20
+ | 9 | **WireGuard DNS must be `10.8.0.1`** (node's internal resolver), not external DNS | tunnel | DNS resolution fails through tunnel |
21
+ | 10 | **`grpc/tls` has 0% success rate** -- filter these transports BEFORE paying | protocol | Guaranteed failure, tokens wasted |
22
+ | 11 | **QUIC `quicSettings` must use `security: 'none'`** -- not `chacha20-poly1305` | protocol | 0% QUIC connections |
23
+ | 12 | **VMess AEAD requires clock drift <120s** -- but VLess is immune to clock drift | protocol | VMess-only nodes fail with drift |
24
+ | 13 | **Register MsgEndSession in protobuf Registry** -- or sessions never end on-chain | chain | Orphaned sessions leak resources |
25
+ | 14 | **Account sequence mismatch** -- serialize broadcasts through a mutex, retry with backoff | chain | TX failures cascade |
26
+ | 15 | **Session may be `inactive_pending` after TX** -- poll until `active` before handshaking | timing | "Invalid session status" errors |
27
+ | 16 | **Chain lag** -- node's RPC may not see session for 10s after broadcast | timing | "Session does not exist" on handshake |
28
+ | 17 | **Verify-before-capture for WireGuard** -- test with split IPs first, then switch to full tunnel | tunnel | 78s of dead internet on failure |
29
+ | 18 | **`autoReconnect()` checks `status?.connected` but that property doesn't exist** -- use `!!status` | protocol | Entire reconnect feature is broken |
30
+ | 19 | **Never `taskkill /F /IM node.exe`** -- it kills ALL Node.js processes on the machine | dependencies | Kills development environment |
31
+ | 20 | **`BigInt` cannot be JSON.stringify'd** -- convert sessionId to string before serialization | protocol | TypeError crash |
32
+ | 21 | **Error code strings are a CONTRACT between SDKs** -- `SESSION_EXISTS` must match exactly | parity | Cross-language apps break |
33
+ | 22 | **Unit tests prove nothing about live chain** -- 656 tests passed, zero features worked | testing | False confidence |
34
+ | 23 | **`fullTunnel: true` is the default** -- routes ALL traffic through VPN. AI agents should explicitly set `fullTunnel: false` or use `protocol: 'v2ray'` for split tunnel. Intentionally `true` since v26c (false caused "IP didn't change" confusion for consumers). | configuration | AI's chain queries slow down through VPN |
35
+ | 24 | **SOCKS5 auth breaks Windows system proxy** -- system proxy cannot pass credentials | tunnel | "Connected" but zero traffic |
36
+ | 25 | **Fee grant auto-detection should be opt-in** -- don't silently use random granters | wallet | Unexpected behavior, unreliable grants |
37
+ | 26 | **Shared VPN client for testing corrupts state** -- always create a DEDICATED VPN client per test | integration | Main VPN session disconnected, state corrupted |
38
+ | 27 | **CancellationToken in speed test kills measurements** -- pass `CancellationToken.None` to speed/google tests | integration | Garbage speed numbers, premature cancellation |
39
+ | 28 | **Background refresh starves test connections** -- cancel background work before starting test scan | integration | Test hangs indefinitely waiting for HTTP client |
40
+ | 29 | **Progress counter must increment on EVERY code path** -- success, failure, AND exception | integration | UI freezes, user thinks app is stuck |
41
+ | 30 | **V2Ray SOCKS5 connection reuse silently fails** -- create FRESH HttpClient/SocksProxyAgent per request | protocol | First request works, subsequent hang until timeout |
42
+ | 31 | **WPF cannot render emoji flags** -- use PNG images from flagcdn.com, not emoji code points | ux | Empty boxes or nothing at all on Windows native apps |
43
+ | 32 | **Load previous results on startup** -- never show "No results" when results exist on disk | ux | User loses trust, thinks data was lost |
44
+ | 33 | **Docs describing non-existent code cause more harm than no docs** -- label IMPLEMENTED vs SPEC | documentation | AI spends 10+ hours trying to use classes that don't exist |
45
+ | 34 | **transport_security is 0-indexed in C#, 1-indexed in JS** -- always check enum mappings when bridging | parity | All C# V2Ray tests fail with wrong TLS setting |
46
+ | 35 | **Missing UUID wait in new code paths** -- copy ALL waits/sleeps when adding a new code path | timing | All V2Ray connections on new path fail silently |
47
+ | 36 | **NEVER use native `fetch()` for V2Ray traffic** -- `fetch()` silently ignores SOCKS5 proxy. You WILL get your real IP, not the VPN IP. Use `axios` with `SocksProxyAgent` for ALL V2Ray verification, speed tests, and IP checks. This is the #1 mistake every AI builder makes. | protocol | IP leak — agent thinks it's on VPN but all traffic goes direct |
48
+ | 37 | **V2Ray split tunnel IS the SOCKS5 proxy** -- V2Ray does not change system routing. Only traffic you explicitly send through `socks5://127.0.0.1:{port}` goes through the VPN. Everything else is direct. There is no `fullTunnel` for V2Ray — `systemProxy: true` sets Windows proxy but that's opt-in, not default. | protocol | Agent assumes all traffic is encrypted when only proxied traffic is |
49
+ | 38 | **WireGuard split tunnel requires exact destination IPs** -- `splitIPs: ['example.com']` does NOT work. WireGuard routes by IP, not domain. CDN/anycast services (Cloudflare, Google) resolve to hundreds of IPs. Use V2Ray SOCKS5 for per-app split tunnel, use WireGuard splitIPs only for known static IPs. | tunnel | Agent sets splitIPs for a CDN domain, traffic goes direct because DNS resolved to a different IP |
50
+ | 39 | **WireGuard disconnect MUST restore DNS to DHCP** -- WireGuard config sets system DNS (10.8.0.1 or custom). This persists in the OS adapter AFTER the WG interface is removed. Every disconnect path (normal, error, emergency) must call `disableDnsLeakPrevention()` or `netsh interface ipv4 set dnsservers Wi-Fi dhcp`. Discovered 2026-03-27: Cloudflare DNS persisted after split tunnel test, broke all V2Ray and node tester connections. | tunnel | System DNS silently changed, all subsequent networking affected |
51
+
52
+ ---
53
+
54
+ ## Failures by Category
55
+
56
+ ### PROTOCOL
57
+
58
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
59
+ |---|-------------|---------------|------------|-------------|-----------------|
60
+ | P1 | QUIC 0% success rate | All 4 QUIC nodes failed despite having active peers | Global `quicSettings` used `security: 'chacha20-poly1305'` but Sentinel Go SDK server uses `security: 'none'` | Changed to `{ security: 'none', key: '', header: { type: 'none' } }` in both global and per-outbound settings | Always match transport security settings to sentinel-go-sdk server defaults |
61
+ | P2 | grpc/tls always fails | 0 nodes passed grpc/tls out of 14+ tested; nodes have active peers | V2Ray gRPC over TLS has incompatible server config; sentinel nodes don't support it | Filter `grpc/tls` (transport_protocol=3, transport_security=2) BEFORE session payment | Pre-filter: if all transports are grpc/tls, skip node |
62
+ | P3 | VMess clock drift AEAD failure | VMess connections to nodes with >120s clock drift fail silently | VMess AEAD timestamp auth has +/-120s tolerance; HTTP Date header drift detected but node still tried | Skip VMess-only nodes with >120s drift; prefer VLess outbounds when drift detected | Check clock drift AND available protocols; VLess ignores drift |
63
+ | P4 | grpc missing serviceName | gRPC connections fail with "context canceled" after VMess auth | Per-outbound `streamSettings` had no `grpcSettings` block; V2Ray fell back to wrong global setting | Added `grpcSettings: { serviceName: '' }` for both `grpc` and `gun` networks | Always include `grpcSettings` for gun(2) and grpc(3) transports |
64
+ | P5 | gun vs grpc documentation contradiction | Docs said gun and grpc are "DIFFERENT protocols" but V2Ray treats them identically | Sentinel uses different enum values (2=gun, 3=grpc) but V2Ray config is identical for both | Updated docs: both use `"network": "grpc"` with `grpcSettings: { serviceName: '' }` | For V2Ray config, gun and grpc are the same -- both use grpcSettings |
65
+ | P6 | v2-format metadata rejection | Most popular node (48 peers) rejected because it returned v2 metadata format | SDK threw error on old `{protocol, tls, ca}` fields instead of mapping to v3 `{proxy_protocol, transport_protocol, transport_security}` | Added v2-to-v3 field mapper: v2 protocol:1->v3 proxy_protocol:2, etc. | Never reject metadata outright -- map old formats to new |
66
+ | P7 | autoReconnect completely broken | `autoReconnect()` never triggers reconnection | Checked `status?.connected` but `getStatus()` returns `{ sessionId, serviceType, ... }` with no `.connected` property -- always false | Changed to `!!status` (getStatus returns null when disconnected) | Test feature with real connection lifecycle, not just existence |
67
+ | P8 | BigInt JSON serialization crash | `JSON.stringify({ sessionId: 123n })` throws TypeError | `sessionId` is BigInt, JavaScript cannot serialize BigInt to JSON | Call `.toString()` before sending to frontend; SDK's `serializeResult()` exists but must be called | Always convert BigInt to string at API boundaries |
68
+ | P9 | broadcast() name collision | SSE broadcast and SDK chain broadcast share the same name | SDK exports `broadcast()` for chain TX; apps commonly have local `broadcast()` for SSE | Document collision risk; consider renaming SDK export to `broadcastTx()` | When importing SDK functions, check for local name conflicts |
69
+ | P10 | Transport success rates outdated | Code cited grpc/none=58%, quic=55% but real rates are grpc/none=87%, quic=0% | Numbers from early testing never updated after 780-node scan | Updated `TRANSPORT_SUCCESS_RATES` to match 780-node data | Update transport stats after every major test run |
70
+ | P11 | Session ID precision time bomb | `Number(sessionId)` silently rounds integers above 2^53 | `initHandshakeV3()` converts BigInt to Number for handshake POST body | Added `Number.isSafeInteger()` bounds check; fails loudly instead of silently | Never downcast BigInt to Number without safety check |
71
+ | P12 | V2Ray balancer unreliable | V2Ray 5.2.1 observatory marks working outbounds as dead | V2Ray internal health checker is buggy in 5.2.1 | Implemented own fallback: balancer first (8s), then individual outbounds in priority order | Don't rely on V2Ray's internal balancer -- implement app-level retry |
72
+ | P13 | Port probe false negative kills nodes with peers | V2Ray "service dead" error for nodes with active connections | Pre-payment port probe scans 12 standard ports, but V2Ray may run on non-standard ports | If peers > 0, skip probe failure and proceed to handshake (reveals actual ports in metadata) | Never hard-fail port probe when peers > 0; someone IS connecting, V2Ray IS alive |
73
+ | P14 | TCP probe fail wastes 20s per V2Ray outbound | Node test times out after trying 3-4 unreachable outbounds | When TCP SYN fails but V2Ray starts anyway, SOCKS5 wait (12s) + sleep (4s) per outbound = 16-20s each | Reduce SOCKS5 wait from 12/8s to 5s when TCP probe already failed; saves ~15s per unreachable port | Use TCP probe result to size timeouts: probe OK → normal timeout; probe fail → fast fail |
74
+ | P15 | VMess clock drift unfixable for AEAD-only servers | VMess nodes with >120s drift fail with both alterId=0 and alterId=64 | AEAD (alterId=0) rejects timestamps >120s off. Legacy (alterId=64) on AEAD-only server → auth mismatch → 15s silent drain. No V2Ray config adjusts timestamp. | Try both alterId values, accept failure. Only fix is node operator fixing their clock or supporting legacy. | VMess with drift >120s on AEAD server = UNFIXABLE from client. Skip these nodes and document why. VLess is immune to drift. |
75
+ | P16 | SOCKS5 "connected" but no internet | V2Ray opens SOCKS5 port even when remote connection fails | V2Ray starts SOCKS5 listener immediately, before establishing remote tunnel. Traffic enters SOCKS5 but can't route. | Add 3s google connectivity pre-check before running full speedtest. Detects dead tunnels 10x faster. | Never assume SOCKS5 port open = tunnel working. Always pre-check connectivity. |
76
+ | P17 | Port scan discovers non-V2Ray ports 7874/7876 | Discovered ports tried as V2Ray, waste time, always fail | Sentinel-go-sdk nodes have internal control/WireGuard ports (7874, 7876) that accept TCP but don't serve V2Ray | Filter discovered ports: if they accept TCP but no TLS and no HTTP response, skip as non-V2Ray | Cross-reference discovered ports with known sentinel internal port ranges before attempting V2Ray |
77
+
78
+ ### CHAIN
79
+
80
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
81
+ |---|-------------|---------------|------------|-------------|-----------------|
82
+ | C1 | v2 LCD endpoints return "Not Implemented" | All C# SDK chain queries fail -- zero features work | C# SDK used v2 paths (`/sentinel/nodes?status=STATUS_ACTIVE`); chain runs v3 | Updated all 9 endpoints to v3 paths (e.g., `/sentinel/node/v3/nodes?status=1`) | Always use v3 paths; only exception: provider remains v2 |
83
+ | C2 | LCD `count_total` returns wrong number | Plan discovery shows 1 node but plan has 733 | LCD returns `min(actual_count, limit)` as `count_total` with `limit=1` | Changed to `limit=5000` and count array length directly | NEVER trust `count_total` -- always count the returned array |
84
+ | C3 | LCD `next_key` always null on plan nodes | Pagination stops at 200 nodes; misses 533 | `/sentinel/node/v3/plans/{id}/nodes` doesn't implement `next_key` pagination | Single request with `limit=5000` | Test pagination per endpoint; some are broken |
85
+ | C4 | `remote_addrs` vs `remote_url` | All node connections fail -- `node.remote_url` is undefined | v3 LCD returns `remote_addrs: ["IP:PORT"]` (array, no protocol) instead of `remote_url: "https://IP:PORT"` | Added `resolveRemoteUrl()` that handles both formats | Always use `resolveRemoteUrl(node)`, never access field directly |
86
+ | C5 | `base_session` nesting undocumented | `session.id` returns undefined; no error, just silent null | v3 LCD nests session fields under `base_session` | Always use `const bs = session.base_session \|\| session` | Flatten `base_session` in all query helpers |
87
+ | C6 | MsgEndSession not registered in protobuf | Sessions NEVER end on-chain; orphaned sessions accumulate | `MsgEndSessionRequest` type URL not registered in CosmJS Registry; no encoder function exists | Registered type in `buildRegistry()` and created `encodeMsgEndSession()` | Verify ALL message types are registered, not just commonly used ones |
88
+ | C7 | `status=1` vs `status=STATUS_ACTIVE` | Node status filter returns wrong results | v3 uses integer status codes (1=active), v2 used string enum | Updated all queries to use `status=1` | Use integer status codes for v3 chain |
89
+ | C8 | `acc_address` vs `address` | Subscription parser fails | v3 uses `acc_address` field, v2 used `address` | Updated parser to read `acc_address` | Always verify field names against actual LCD responses |
90
+ | C9 | Plan discovery endpoint returns 501 | `DiscoverPlansAsync()` returns 0 plans on mainnet | `/sentinel/plan/v3/plans/{id}` is NOT IMPLEMENTED on chain | Use subscription+node endpoints to probe plan existence | Test endpoints with `curl` before building against them |
91
+ | C10 | Provider endpoint is v2 only | Provider queries fail with v3 path | `/sentinel/provider/v3/` returns 501; must use `/sentinel/provider/v2/` | Use v2 path for provider queries only | Document the v2 exception prominently |
92
+ | C11 | Subscription endpoint returns all 143K results | `GetSubscriptionsAsync` fetches entire chain's subscriptions | Wrong path `/sentinel/subscription/v3/subscriptions?account_address=` vs correct `/sentinel/subscription/v3/accounts/{addr}/subscriptions` | Fixed to account-scoped path | Always use account-scoped endpoints for per-user queries |
93
+ | C12 | Handshake error field type mismatch | `JsonException` on deserializing handshake error response | Node returns `{"error": {"code": 2, "message": "..."}}` (object) but SDK typed `Error` as `string?` | Changed to `JsonElement?` with type-safe accessor | Use flexible JSON types for node API responses -- formats vary |
94
+ | C13 | Plans start inactive | AI creates plan, tries to subscribe, gets chain error | `encodeMsgCreatePlan()` creates plan with `status=0` (inactive); needs separate `encodeMsgUpdatePlanStatus()` | Added `createAndActivatePlan()` helper | Document two-step plan creation prominently |
95
+ | C14 | queryNode() downloaded ALL nodes to find one | Single node lookup fetches 900+ nodes then `.find()` | No direct endpoint used; full paginated query used for single lookup | Try `/sentinel/node/v3/nodes/{address}` first; fall back to full list | Always use direct endpoints when querying single items |
96
+ | C15 | `max_price` Code 106 "invalid price" in MsgStartSession | Session payment fails for nodes with certain price combos (base_value=0.005, quote_value=25M) | Chain v3 price validation rejects combinations that were valid at node registration time | Catch Code 106 → retry WITHOUT `max_price` field; chain uses node's registered price directly | Always implement retry-without-max_price for MsgStartSession; 14/987 nodes affected |
97
+ | C16 | Batch payment fails on ONE bad-price node | Entire 5-node batch TX rejected with Code 106 when any node has invalid pricing | Batch contains mix of standard (40M quote) and non-standard (25M quote) prices | Retry entire batch without max_price; if still fails, fall back to individual per-node payments | Batch TX is all-or-nothing; one bad message kills all 5. Always have individual fallback. |
98
+
99
+ ### TUNNEL
100
+
101
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
102
+ |---|-------------|---------------|------------|-------------|-----------------|
103
+ | T1 | WireGuard race condition -- config deleted before service reads it | "The system cannot find the file specified" service error | SDK deletes config file during cleanup before Windows service has finished starting | Never delete config file while service exists; only delete AFTER confirmed uninstall | Config must remain on disk as long as service exists |
104
+ | T2 | MTU 1420 causes service crash | WireGuard service installs but immediately stops | Sentinel nodes configured for MTU 1280; using 1420 causes packet fragmentation and TLS failures | Changed to `MTU = 1280` | Always use MTU 1280 for Sentinel WireGuard tunnels |
105
+ | T3 | External DNS unreachable in full tunnel | DNS queries to OpenDNS (208.67.222.222) fail through tunnel | Full tunnel only routes to WireGuard endpoint; external DNS is unreachable | Changed DNS to `10.8.0.1` (node's internal resolver) | Use node's internal DNS (10.8.0.1) in full-tunnel mode |
106
+ | T4 | Full-tunnel verification kills internet for 78s | User loses all internet while verification loops fail on a broken node | `setupWireGuard()` installed tunnel with `AllowedIPs=0.0.0.0/0` before verification | Verify-before-capture: install with split IPs first, verify, then switch to `0.0.0.0/0` | Always verify tunnel works with safe split IPs before capturing all traffic |
107
+ | T5 | Phantom connected state | App shows "connected" when no tunnel exists; IP leak | `getStatus()` trusted `state.connection` without checking if tunnel was alive | Cross-validate tunnel health; auto-clear stale state; emit `disconnected` event | Always verify tunnel liveness (service running, process alive) before reporting connected |
108
+ | T6 | Windows service race on tunnel switch | Double-uninstall races with Windows Service Manager; tunnel fails to start | Manual `disconnectWireGuard()` + `installWgTunnel()` internal force-remove created double-uninstall | Removed manual pre-disconnect; let `installWgTunnel()` handle its own cleanup | Single uninstall path -- never double-uninstall |
109
+ | T7 | V2Ray process leak on outbound loop exit | Orphaned V2Ray processes accumulate; SOCKS5 ports consumed | Outbound connection loop exits without `finally` block to kill last spawned process | Wrapped loop in try-finally that kills last process on any exception | Always use try-finally when spawning child processes in loops |
110
+ | T8 | Tunnel not cleaned up on handshake retry failure | Dead internet from orphaned WireGuard `0.0.0.0/0` route | Retry catch block only marks session as poisoned; doesn't clean up partially installed tunnel | Added tunnel cleanup check (`state.wgTunnel`, `state.v2rayProc`) in retry catch block | Always clean up tunnel state in all error paths |
111
+ | T9 | SOCKS5 auth breaks Windows system proxy | "Connected" but zero traffic flows | V2Ray config uses password auth; Windows system proxy cannot pass SOCKS5 credentials | When `systemProxy: true`, patch SOCKS5 inbound to `noauth` | System proxy mode requires noauth SOCKS5 |
112
+ | T10 | System proxy stuck after crash | All HTTP traffic goes to dead SOCKS5 port; browser shows "no internet" | `clearSystemProxy()` only runs in cleanup handler; crash leaves registry pointing to dead port | Write sentinel file on proxy set; check at startup; restore on recovery | Always persist proxy state so crash recovery can restore it |
113
+ | T11 | Proxy restore overwrites user's previous proxy | User had corporate proxy; `clearSystemProxy()` sets "no proxy" instead of restoring | Code force-disables proxy entirely instead of restoring saved state | Save proxy state before modifying; restore exact previous state on cleanup | Always backup and restore system proxy state |
114
+ | T12 | Orphaned WireGuard adapters block new tunnels | "Tunnel already installed and running" error on fresh connect | Previous crash left Wintun adapter registered but no service managing it | Emergency cleanup at startup: detect and remove orphaned adapters | Run `emergencyCleanupSync()` at startup; clean all stale `wgsent*` services |
115
+ | T13 | WireGuard private key file not deleted on failure | Private key stays at `C:\ProgramData\sentinel-wg\wgsent0.conf` indefinitely | Bare `catch {}` on file deletion; locked file or permissions failure silently ignored | Retry deletion; overwrite with zeros before delete; log failure | Always zero-fill sensitive files before deletion |
116
+ | T14 | Config ACL race -- key readable before ACL set | Config file (with private key) world-readable in `ProgramData` between write and ACL set | File written FIRST, ACL set SECOND; failure in ACL leaves file exposed | Create directory with restrictive ACL first, then write file | Set restrictive permissions on directory before writing sensitive files |
117
+ | T15 | Full tunnel + Handshake DNS = 0-speed (44 nodes) | WireGuard tunnel connects but speed test returns exactly 0 Mbps | Full tunnel routes DNS through VPN; Handshake DNS (103.196.38.38) unreachable through many nodes | Pre-resolve all speed test hostnames BEFORE tunnel installation; use resolved IPs | Always pre-resolve DNS before installing full tunnel |
118
+ | T16 | V2Ray port TIME_WAIT kills fallback | V2Ray fallback to different outbound fails because SOCKS5 port still in TIME_WAIT | All outbound configs used same SOCKS5 port; Windows TIME_WAIT is ~120s | Each outbound gets incrementing port: `basePort + idx` | Never reuse same port across V2Ray fallback attempts |
119
+
120
+ ### WALLET
121
+
122
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
123
+ |---|-------------|---------------|------------|-------------|-----------------|
124
+ | W1 | Fee grant auto-detection silently applied | Direct-connect app uses random stranger's fee grant without consent | SDK auto-detects fee grants and picks `grants[0]` on every transaction | Made fee grant opt-in via explicit `FeeGranter` option | Never auto-apply fee grants; make it explicit opt-in |
125
+ | W2 | Insufficient funds with no dry-run option | AI builds working code but first real run fails with "insufficient funds" | Blockchain requires funded wallet; AI cannot purchase tokens | Added dry-run mode that validates everything except payment | Provide `dryRun: true` option to validate without spending tokens |
126
+ | W3 | Fast reconnect never sets `state._mnemonic` | Sessions never end on-chain after fast reconnect; session leak | `connectDirect()` called `tryFastReconnect()` which skips `connectInternal()` where `_mnemonic` was set | Set `state._mnemonic = opts.mnemonic` BEFORE calling `tryFastReconnect()` | Set authentication state before any early-return code path |
127
+
128
+ ### TIMING
129
+
130
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
131
+ |---|-------------|---------------|------------|-------------|-----------------|
132
+ | TM1 | Session "does not exist" on handshake | 6.7% of handshakes fail immediately after session payment | `BROADCAST_MODE_SYNC` returns after CheckTx, not DeliverTx; block not committed yet (~6-7s) | Added retry-on-404: wait 10s and retry handshake once | Always retry handshake on "does not exist" with 10s delay |
133
+ | TM2 | Session `inactive_pending` status | "Invalid session status inactive_pending, expected active" | Session TX confirmed but not yet transitioned to active status | Added `waitForSessionActive()` polling: every 2s for up to 20s | Poll session status until active before handshaking |
134
+ | TM3 | Sequence mismatch on rapid transactions | "Account sequence mismatch" cascades through batch operations | CosmJS caches sequence number; concurrent TXs use stale sequence | Broadcast mutex + retry with exponential backoff + fresh sequence on retry | Serialize broadcasts through mutex; wait 7s between TXs |
135
+ | TM4 | Stale session allocation 404 | "Resource not found" when reusing previous session | Session appears "active" in query but allocation endpoint returns 404 (expired/closed) | On 404, skip stale session and create new one automatically | Never throw on allocation 404; fall through to new session |
136
+ | TM5 | Session indexing race (409 "already exists") | Handshake returns 409 even though session just created | Node still indexing session after TX confirmation | 5s delay after payment; then retry on 409 at 15s and 20s intervals | Add post-payment delay and handle 409 with progressive retry |
137
+ | TM6 | V2Ray needs post-handshake warmup | Speed test returns low/zero speed immediately after V2Ray connect | Node needs time to register UUID and stabilize tunnel | Added 10s post-handshake delay for V2Ray before speed test | Wait 10s after V2Ray handshake before testing connectivity |
138
+
139
+ ### CONFIGURATION
140
+
141
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
142
+ |---|-------------|---------------|------------|-------------|-----------------|
143
+ | CF1 | `fullTunnel: true` default bricks AI | AI's own RPC/LCD/npm calls crawl or die after successful connect | Default routes ALL traffic through slow VPN node (median 3 Mbps) | Changed defaults to `false`; explicit opt-in for production apps | Default to split tunnel; full tunnel is opt-in |
144
+ | CF2 | `LCD_ENDPOINTS[0]` is an object, not a string | "Invalid URL" error on first LCD call | `LCD_ENDPOINTS` exports `[{ url, name, verified }]` objects; developer assumed string | Use `LCD_ENDPOINTS[0].url` or `DEFAULT_LCD` string export | Make `lcd()` accept both string and Endpoint objects |
145
+ | CF3 | Missing `axios.defaults.adapter = 'http'` | Opaque "fetch failed" errors on self-signed node certs | Node.js 18+ uses fetch adapter by default; self-signed certs fail | Move `axios.defaults.adapter = 'http'` to top of `index.js` | Ensure adapter is set on any SDK import path |
146
+ | CF4 | Missing `registerCleanupHandlers()` | WireGuard tunnel stays installed with `0.0.0.0/0` after crash; dead internet | No cleanup handlers registered; orphaned tunnel captures all traffic | Hard-fail if cleanup handlers not registered before connect | `connect()` must refuse to proceed without cleanup handlers |
147
+ | CF5 | `subscribeToPlan()` field name mismatch | `renewal_price_policy` (snake_case) silently ignored | Encoder destructures `renewalPricePolicy` (camelCase); different naming convention | Changed to `renewalPricePolicy` | Always match field naming convention between callers and encoders |
148
+ | CF6 | PersistentKeepalive too long | NAT routers expire UDP mappings at 20-30s; keepalive at 30s causes drops | 30s keepalive is at the edge of many NAT timeout windows | Changed to `PersistentKeepalive = 15` | Use 15s keepalive for WireGuard -- safe for all NAT routers |
149
+ | CF7 | Node address mismatch wastes tokens | Session paid for node A, but handshake endpoint serves node B | Node's `remote_addrs` on chain points to wrong IP; different node at that address | Pre-verify node address at remote URL BEFORE creating session | Always verify node identity before paying for session |
150
+ | CF8 | Node missing moniker/location from LCD | `node.moniker` returns undefined; no error | LCD returns only address/prices/remote_addrs; moniker/location requires separate `nodeStatusV3()` call | Document clearly: LCD nodes lack moniker/location; must call node's own API | Enrich LCD node data with `nodeStatusV3()` before displaying |
151
+ | CF9 | Daemon loses wallet and connection state on restart | User must re-login and reconnect after every daemon restart/crash; auth token survives but wallet doesn't | Daemon stores active wallet and connection state only in memory; no disk persistence for wallet or last-connection | Persist `active-user.json` (encrypted mnemonic + address) and `last-connection.json` (nodeAddress, protocol, sessionId) to `~/.sentinel-daemon/`; on startup: load wallet, check session on chain, auto-reconnect if still active | Persist ALL daemon state to disk; on restart, restore wallet and attempt reconnection automatically |
152
+ | CF10 | `connectViaPlan()` BigInt(undefined) crash with confusing error | `TypeError: Cannot convert null to a BigInt` or `SyntaxError: Cannot convert abc to a BigInt` — no context about which parameter | `BigInt(opts.planId)` called without proper type validation; falsy guard `!opts.planId` rejects `0` (potentially valid) and passes through bad types | Added explicit validation: null/empty check first, then `try { BigInt(planId) } catch { throw with context }` | Validate BigInt inputs with try/catch and rethrow with parameter name context |
153
+ | CF11 | Handshake timeout 30s too short for distant nodes | Functional nodes with active peers time out at exactly 30s; `ECONNABORTED: timeout of 30000ms exceeded` | 30s timeout insufficient for distant nodes (Asia, South America) or nodes under load; TLS handshake + session negotiation round trip exceeds 30s | Increased handshake timeout to 45s in both `initHandshakeV3()` (WireGuard) and `initHandshakeV3V2Ray()` (V2Ray) | Use 45s handshake timeout; builders can override via their own axios config if needed |
154
+ | CF12 | C# `SentinelVpnOptions` missing `Hours` property | Consumer apps can't let users choose hour amounts for time-based sessions | `PreferHourly = true` hardcodes `hours = 1` internally; no way to specify 2h, 4h, 8h, etc. | Add an `Hours` property alongside `Gigabytes` (fix pending, not yet applied) | Expose both `Gigabytes` and `Hours` in connection options; never hardcode session duration |
155
+
156
+ ### DEPENDENCIES
157
+
158
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
159
+ |---|-------------|---------------|------------|-------------|-----------------|
160
+ | D1 | V2Ray version mismatch | Connections fail with no clear error | V2Ray 5.44.1+ has observatory bugs; must be exactly 5.2.1 | `verifyDependencies()` checks version; `connect()` refuses incompatible V2Ray | Check V2Ray version at connect time, not just setup |
161
+ | D2 | WireGuard not installed | "Failed to start" with no explanation | `wireguard.exe` not found on system | Pre-check before session payment; clear error message with install link | Check `wireguard.exe` exists BEFORE paying for session |
162
+ | D3 | No admin privileges | "Service registered but never reached RUNNING state" | WireGuard service installation requires Administrator | Pre-check admin before ANY session payment; provide self-elevation helper | Check admin at step 0, not step 5 |
163
+ | D4 | Windows `taskkill /F` kills all Node.js | Development environment dies during debugging | `taskkill /F /IM node.exe` kills ALL Node.js processes including dev tools | Kill only by specific PID: `taskkill /F /PID <pid>` | NEVER use `/IM node.exe` -- always use `/PID` |
164
+ | D5 | Git Bash mangles `/F` flag | `taskkill /F /PID 32516` fails with "Invalid argument" | Git Bash converts `/F` to `F:/` (POSIX path conversion) | Use `//F` or `execFileSync` (bypasses shell) | Use `execFileSync` for all system commands, never string interpolation |
165
+ | D6 | Competing VPN applications | WireGuard tunnel fails; routing table conflicts | NordVPN/ExpressVPN/etc. have active tunnels, route overrides, port conflicts | Added VPN conflict detection in pre-connect diagnostic | Detect and warn about competing VPNs before connecting |
166
+ | D7 | WireGuard Manager Service ghost | WireGuard GUI takes over tunnel management; conflicts with programmatic control | `wireguard.exe /installmanagerservice` was called instead of `/installtunnelservice` | Never call `/installmanagerservice`; only use `/installtunnelservice` | SDK must only use direct tunnel service management |
167
+
168
+ ### TESTING
169
+
170
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
171
+ |---|-------------|---------------|------------|-------------|-----------------|
172
+ | TS1 | 656 unit tests, zero working features | C# SDK declared "100/100 parity"; every feature broke on first real use | Unit tests used mock data; no test hit a real LCD endpoint or real node | Mandatory live chain smoke tests before any release | Run live chain integration tests, not just unit tests |
173
+ | TS2 | Mock data doesn't match real chain | Tests used clean integer prices; chain returns 18-decimal `sdk.Dec` values | Hand-crafted mock data instead of real LCD response snapshots | Copy-paste actual LCD responses into test fixtures | Use real chain response snapshots as test fixtures |
174
+ | TS3 | JSON round-trip failure | `GbPriceUdvpn = "5500000"` (string) serialized as integer; deserialized back as null | Unit tests never serialize-then-deserialize-then-use | Added round-trip serialization tests for all data models | Test full lifecycle: create -> serialize -> deserialize -> use |
175
+ | TS4 | Second attempt never tested | Reconnect to same node fails; first connection always works | No tests simulated: connect -> disconnect -> wait -> reconnect | Added integration tests for reconnection and crash recovery | Always test the SECOND attempt, not just the first |
176
+ | TS5 | Production-scale data breaks pagination | Plan with 147K subscribers; SDK fetches all (50MB) just to count | Test plans had 5 subscribers; nobody tested with large data | Use `pagination.limit=1&count_total=true` for counts (but count_total itself is broken -- see C2) | Test with production-scale data; not just toy datasets |
177
+ | TS6 | Parallel chain tests kill internet | Running JS + C# test suites simultaneously triggers LCD/RPC rate limits | Multiple HTTP clients hammering same endpoints in parallel | Always sequential: one SDK at a time, 7s between TXs, 60s between suites | NEVER run chain tests in parallel |
178
+ | TS7 | Same code + same node = same result | Retesting without implementing a fix wastes time and tokens | Hoping a transient failure will pass on retry; it never does | Rule: "What is DIFFERENT this time?" before any retest | Never retest without implementing a new solution first |
179
+
180
+ ### DOCUMENTATION
181
+
182
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
183
+ |---|-------------|---------------|------------|-------------|-----------------|
184
+ | DC1 | 74% of SDK capability invisible to builders | ConnectAsync has 19 steps; docs describe 5 | Docs written before features; code improved; docs never updated | Comprehensive documentation of all internal steps and edge cases | Every code fix MUST include a doc update |
185
+ | DC2 | 154 suggestion files, none migrated to builder docs | Critical edge cases discovered and filed but never reach builders | Suggestions treated as internal notes, not source material for docs | Migration pipeline from suggestions to feature docs | Before saying "done," ask: can an AI reading only docs/ build this correctly? |
186
+ | DC3 | Cross-language mapping missing | JS `connectDirect()` = C# `ConnectAsync()` -- not documented anywhere | Each SDK documented independently with different names | Added cross-language reference table | Maintain JS <-> C# function name mapping document |
187
+ | DC4 | Error handling guide is JS-only | C# catch patterns not documented; all errors treated identically | C# error docs never written; only JS examples exist | C# error handling patterns with typed exceptions | Every feature doc must have examples in ALL supported languages |
188
+
189
+ ### PARITY
190
+
191
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
192
+ |---|-------------|---------------|------------|-------------|-----------------|
193
+ | PR1 | Error code string mismatch | `SESSION_EXISTS` (JS) vs `SESSION_ALREADY_EXISTS` (C#) breaks cross-language apps | Each SDK chose "reasonable" names independently; no shared contract | Error code strings ARE the contract; must be identical | Compare error code counts and strings side-by-side before release |
194
+ | PR2 | C# missing 11 error codes | Apps can't distinguish `LCD_ERROR` from `TX_FAILED` in C# | Incremental JS feature additions without C# sync | Added all 11 missing codes to C# with matching string values | When adding ANY error code to JS, mirror to C# in same session |
195
+ | PR3 | C# missing severity classifications | `ErrorSeverity.IsRetryable("TUNNEL_SETUP_FAILED")` returns wrong answer | Only 11 of 22 codes classified in C#; rest return "unknown" | Added severity for all C# error codes | Every error code must have severity, user message, and recovery action |
196
+ | PR4 | Duplicate UserMessage methods in C# | `ErrorSeverity.UserMessage` and `Helpers.UserMessage` have different coverage | Two developers added user messages independently | Merged into single source of truth | One canonical location per concern |
197
+ | PR5 | Country map: JS 183 vs C# 80 | 100+ countries fail `CountryNameToCode()` silently in C# | JS built from complete ISO database; C# hand-typed | Expanded C# country map to match JS | Auto-generate shared data from single source file |
198
+ | PR6 | Speed test: JS 6-level fallback, C# 1 target | 43 WireGuard 0-speed failures in C# apps | C# has no DNS pre-resolve, no fallback targets, no rescue mode | Port JS speed test fallback chain to C# | When porting features, port the COMPLETE implementation including fallbacks |
199
+ | PR7 | Session poisoning: JS has it, C# doesn't | C# SDK tries to reuse broken sessions; fails again, wastes time | Feature added to JS only; never ported | Port `markSessionPoisoned()` / `isSessionPoisoned()` to C# | Every feature added to one SDK must be tracked for porting |
200
+ | PR8 | C# message type URLs wrong | 7 message type URLs don't match JS SDK's `MSG_TYPES` | "Describe and generate" instead of line-by-line translation | Updated all 7 to match JS SDK exactly | NEVER describe-and-generate when porting; translate line by line |
201
+
202
+ ### SECURITY
203
+
204
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
205
+ |---|-------------|---------------|------------|-------------|-----------------|
206
+ | S1 | TLS verification disabled globally | All HTTPS node connections accept any certificate; trivial MITM | Sentinel nodes use self-signed certs; `rejectUnauthorized: false` used everywhere | Implemented TOFU (Trust-On-First-Use); save cert fingerprint on first connect | Use TOFU model; reject changed certificates |
207
+ | S2 | TOFU not wired into handshake path | TOFU exists in tls-trust.js but handshake still uses insecure agent | `v3protocol.js` imports its own `rejectUnauthorized: false` agent; TOFU agent bypassed | Wired `_tofuStore` through all 6 call sites (3 handshake, 3 status) | Verify security features are connected end-to-end, not just implemented |
208
+ | S2b | C# TOFU store not wired into SentinelVpnClient | All C# handshake + status calls used "accept all certs" fallback; TOFU was dead code | `SentinelVpnClient` never passed `tofuStore`/`nodeAddress` to `Handshake.HandshakeAsync()` or `NodeClient.GetStatusAsync()` — all 6 call sites bypassed | Added `TofuStore` to `SentinelVpnOptions`; wired `_tofuStore` + `nodeAddress` through all 6 call sites; added missing `try/finally` in `NodeClient` for TOFU client disposal | Wire security features through ALL call sites; verify with grep, not assumption |
209
+ | S3 | 27 command injection surfaces | `execSync` with string interpolation allows shell injection via poisoned state | `state.v2rayPid` or `state.wgTunnelName` could contain shell metacharacters | Replace `execSync` with `execFileSync`; add input validation on state values | ALWAYS use `execFileSync` (array args); NEVER string interpolation for commands |
210
+ | S4 | 38 silent empty catches | Errors swallowed silently; no trace when things fail | `catch {}` blocks throughout 6 files; zero observability | Categorized each catch: expected (comment), unexpected (log), critical (throw) | Every catch must document WHY it's safe to swallow, or log the error |
211
+ | S9 | Bare catch swallows CancellationToken in C# | `OperationCanceledException` silently swallowed; cancellation never propagates | `DiscoverPlansAsync` and `GetProviderByAddressAsync` had bare `catch` blocks that caught ALL exceptions including cancellation | Added `catch (OperationCanceledException) { throw; }` before bare catch blocks | ALWAYS rethrow `OperationCanceledException` before any bare catch; cancellation must propagate |
212
+ | S5 | Kill switch state not persisted across crashes | After crash, user's internet permanently blocked by orphaned firewall rules | `_killSwitchEnabled` was in-memory only; `disableKillSwitch()` early-returns | Added `killSwitchEnabled` to state persistence; `recoverOrphans()` cleans up | Persist ALL security-critical state to disk; recover on startup |
213
+ | S6 | Kill switch partial failure locks down internet | 7 sequential firewall rules; failure after `blockoutbound` leaves system locked | No rollback on partial failure; first rule blocks all, subsequent rules allow exceptions | Wrapped in try-catch: if allow rules fail, immediately restore `allowoutbound` | Implement rollback for any multi-step security modification |
214
+ | S7 | Fire-and-forget EndSession races with Dispose | EndSession TX almost always fails; resources leaked | `DisconnectAsync()` fired EndSession as `_ = Task.Run()` then disposed HTTP client | Added `_pendingEndSession` field; `DisposeAsync` awaits it before disposing | Never fire-and-forget when subsequent code disposes the resources it needs |
215
+ | S8 | Dispose sets `_disposed` before DisconnectAsync completes | `ObjectDisposedException` thrown during disconnect cleanup | `_disposed = true` was set BEFORE calling `DisconnectAsync()`, which then checked `_disposed` and threw | Created `DisconnectInternalAsync(reason)` that skips disposed check; `_disposed` set AFTER disconnect completes | Set disposal flags AFTER cleanup completes, not before; create internal disconnect that skips the guard |
216
+
217
+ ### INTEGRATION
218
+
219
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
220
+ |---|-------------|---------------|------------|-------------|-----------------|
221
+ | I1 | Shared VPN client for testing | Test disconnected user's active VPN; connection state leaked to UI | Single `SentinelVpnClient` instance shared between main connection and test | Create dedicated `SentinelVpnClient` per test with `ForceNewSession = true`; dispose after each test | NEVER share the main VPN client with test functions; create+dispose per test |
222
+ | I2 | CancellationToken kills speed test | Speed test cancelled mid-download; garbage speed numbers; next test also cancelled | CancellationToken propagated from scan loop to `HttpClient.GetByteArrayAsync()` inside speed test | Pass `CancellationToken.None` to speed test and Google check; only check `ct.IsCancellationRequested` BETWEEN phases | Speed test and Google check must run to completion once started; only scan loop is cancellable |
223
+ | I3 | Background refresh blocks test connections | Clicking "New Test" during background node probe hung indefinitely | Single chain client's HTTP connections saturated by 30 parallel status probes; new requests queued | Cancel `_refreshCts` before starting test scan | Cancel ALL background chain operations before starting a test scan |
224
+ | I4 | Progress counter stuck on errors | Progress bar and "X/Y tested" froze during scan | `_testDone++` was only in the success path; exception handler skipped the increment | Added `_testDone++` and `_testFailed++` in catch-all block | EVERY code path (success, error, cancel) must increment progress counter |
225
+ | I5 | testVpn null crash in finally block | NullReferenceException when connection failed before VPN client assigned | `testVpn` declared before try, assigned inside try, but finally always called `testVpn.DisconnectAsync()` | Null-check testVpn in finally; wrap disconnect AND dispose in separate try/catch | Declare VPN client as nullable BEFORE try; null-check in finally BEFORE calling disconnect |
226
+ | I6 | NullReferenceException on unrendered dashboard | App crashed when "New Test" clicked before dashboard fully rendered | Background loop referenced UI TextBlock elements (`_testProgressTb`) that were null because `RenderTestStats()` hadn't run yet | Null-check ALL UI references in background loops; add global crash handler | Null-check every UI element reference in background/async code |
227
+ | I7 | Stop button doesn't stop | User clicked Stop but test kept running for 15-30 seconds (full ConnectAsync duration) | `CancellationToken.Cancel()` only checked between nodes; SDK's internal async operations don't respond to cancellation mid-flight | Added `_testStopRequested` volatile flag checked at 4 points + force tunnel cleanup on stop | Use volatile bool flag checked at explicit points in flow, not just CancellationToken |
228
+ | I8 | Lambda factory pattern fails in WPF | Button click handlers silently didn't bind | WPF click event binding with lambda factory closures doesn't reliably work | Explicit button creation with direct `Click += async (_, _) =>` handlers | Create WPF buttons explicitly with direct event handlers, not factory patterns |
229
+ | I9 | Ternary in C# string interpolation | CS8361 compiler error from ternary inside `$""` string | C# string interpolation doesn't allow ternary without explicit parentheses `$"{(a ? b : c)}"` | Wrapped ternary expressions in parentheses or moved to variable | Always parenthesize ternaries inside C# string interpolation |
230
+ | I10 | Triple LCD probe on startup | Three independent LCD calls (LoadBalance + RefreshAllAsync + preload) serialized; 15s startup | Each component initiated its own chain query independently | Consolidated into single initialization with shared chain client | Coordinate chain queries at startup; never make 3 independent LCD calls for the same data |
231
+ | I11 | Page flickering during user interaction | UI re-rendered node list while user was browsing | Background refresh called `RenderNodes()` after updating `_allNodes` | Background refresh updates data but does NOT re-render; user clicks Refresh to see updates | NEVER re-render during user interaction; update data silently, render on user action |
232
+ | I12 | TextChanged fires during init | Null-check crash from UI element events firing before initialization complete | WPF TextChanged event fires when programmatically setting initial values | Null-check UI elements in all event handlers | Guard ALL event handlers against null UI elements during initialization |
233
+ | I13 | Double node probing on login | Nodes probed twice: once on app open, once on login | Separate probe triggers for app startup and login without deduplication guard | Added `_initDone` guard + `_nodesLoaded` cache | Use a guard flag to prevent duplicate initialization |
234
+ | I14 | Session not visible after disconnect | User disconnected but session tab showed no session | Session saved to chain but not to local cache on disconnect | Save session to local cache instantly on disconnect, before chain confirmation | Always update local cache optimistically on state changes |
235
+ | I15 | results.json format mismatch | C# DiskCache wraps results in `{"Data":[...], "SavedAt":"..."}` but Node Tester expects raw array | Different persistence patterns between JS (raw array) and C# (cache wrapper) | Strip wrapper on export; provide both formats | Define canonical result format (raw array); never wrap in metadata for cross-tool compatibility |
236
+ | I16 | C# bridge was cosmetic for months | SDK toggle showed "C#" but all code ran through JS; nobody noticed | No logging or verification that the C# code path was actually executing | Log `[C# SDK]` on every status/handshake call; verify code path, not label | Verify the code path actually executes, not just that the UI label is correct |
237
+ | I17 | transport_security 0-indexed vs 1-indexed | C# SDK returns 0=none/1=tls; JS expects 1=none/2=tls; all C# V2Ray tests used wrong TLS | Different enum offset conventions between languages; no comparison test | Added +1 offset remap in bridge wrapper | When bridging between languages, ALWAYS check numeric enum mappings with comparison tests |
238
+ | I18 | Session lookup scanning 500+ sessions | `waitForSessionActive` scanned ALL wallet sessions (500+) instead of querying by ID | Used broad session list query instead of direct session ID query | Pass session ID directly; query by specific ID | Always use the most specific query possible; never scan full list when you have the ID |
239
+ | I19 | V2Ray port pre-check referenced uninitialized variable | `useCached` referenced before initialization; server crashed on all retests | New code inserted in middle of function referenced variable defined later | Moved variable declaration before new code; tested locally before deploying | When inserting code into middle of function, verify variable scope; run locally before deploying |
240
+ | I20 | Aggressive port scanning crashed server | Port scan of 1000-65535 in step 100 overwhelmed server with concurrent connections | Too many parallel TCP connection attempts | Limited to probing 10-15 known common ports first; batch and limit | Port scanning must be batched (max 10-15 parallel); never scan full range |
241
+ | I21 | Node loading blocks UI for minutes | Users see "No nodes found" for 2-8 minutes while 1000+ nodes enriched with live status | `GetActiveNodesAsync()` + `Task.WhenAll(statusCalls)` blocks UI render until ALL status calls complete | Two-phase loading: Phase 1 renders chain data immediately (address + pricing); Phase 2 enriches with live status in background, fires event on complete | NEVER block UI on status enrichment; show chain data instantly, enrich in background |
242
+ | I22 | No loading state during async data fetch | Empty panels during 3-10s chain queries; users think app is broken | `await` called without any visual indicator; panel stays blank until data returns | Call `ShowLoadingState()` BEFORE every `await`; "No results" only shown after completed query returns zero items | Every async data load MUST show a loading indicator before the await |
243
+
244
+ ### UX
245
+
246
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
247
+ |---|-------------|---------------|------------|-------------|-----------------|
248
+ | UX1 | No previous results on restart | App showed "No results yet" despite 135 results on disk | In-memory array empty on startup; DiskCache loaded but not rendered | Load cached results in `EnterApp()` BEFORE rendering test tab | On app startup, load and display cached results immediately; never show empty when data exists |
249
+ | UX2 | Binary PASS/FAIL instead of FAST/SLOW/FAIL | All passing nodes showed same green badge regardless of speed | Only two result categories: connected vs not connected | Changed to three-tier: FAST (green, >=10 Mbps), SLOW (amber, <10), FAIL (red) | Use three-tier speed badges: FAST/SLOW/FAIL, never binary PASS/FAIL |
250
+ | UX3 | GridLength negative crash in WPF | `MakeSessionRow` crashed from negative GridLength value | Progress bar clamped the filled width with `Math.Max(2, ...)`, which could exceed the total and leave a negative width for the remaining column | Clamp progress percentage to 0.01-0.99 range before creating GridLength | ALWAYS clamp progress values to 0.01-0.99 before creating proportional WPF grid columns |
251
+ | UX4 | Grid overlapping text in WPF | Left and right text elements rendered on top of each other | Two elements in Grid without explicit ColumnDefinitions | Always define explicit ColumnDefinitions for left/right layouts in WPF | NEVER put two elements in a WPF Grid without ColumnDefinitions -- they overlap |
252
+ | UX5 | No test run history | Each new scan's results replaced previous; no way to compare Monday vs Tuesday | No run archiving or history mechanism | Design: auto-save to `runs/YYYY-MM-DD_HH-MM/` with dropdown to load previous | Auto-archive every completed scan; provide dropdown to load and compare past runs |
253
+ | UX6 | No baseline measurement | Cannot distinguish slow node from slow user internet | No direct internet speed measurement before tunnel testing | Design: measure `speedtestDirect()` before scanning; store in `baselineHistory` | Always measure baseline (direct speed) before tunnel testing |
254
+ | UX7 | No token spend tracking during scan | User has no idea how much the test scan cost | Balance checked at start but no running delta calculated | Design: record balance before scan; show running spend | Track and display token spend: "Spent: X P2P (balance: Y -> Z)" |
255
+ | UX8 | Country flag rendering failure on native platforms | Empty boxes where flags should be on WPF | WPF (and Windows generally) cannot render emoji country flags; only browsers can | Built three-layer cache: memory -> disk -> download from flagcdn.com PNG | Document platform flag rendering: Web=emoji, WPF=PNG images, Swift=emoji works natively |
256
+ | UX9 | Test dashboard squeezed into sidebar | Full-width dashboard crammed into 360px sidebar panel | No layout guidance: "dashboard should take over main area, not sidebar" | Moved test dashboard to main content area | Node test dashboards are full-width; never squeeze into sidebar |
257
+ | UX10 | No click-to-copy on node addresses | Users had to manually select and copy node addresses | No clipboard integration on table rows | Added `MouseLeftButtonUp` handler for clipboard copy with visual feedback | Every node address in tables must be clickable to copy full address |
258
+ | UX11 | No expandable row diagnostics | Users cannot investigate why a specific node failed | No detail view for individual test results | Design: click row to expand and show full diagnostics (session ID, connect time breakdown, error details) | Provide expandable detail view for each test result |
259
+ | UX12 | Dedup results not implemented | Retesting a node appended a new row instead of replacing the old one | No upsert-by-address logic in results collection | Check `_testResults.Any(r => r.Address == node.Address)` before adding; `RemoveAll` on retest | Results must be deduplicated by node address; replace old result, never show duplicates |
260
+
261
+ ### SPEED TEST
262
+
263
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
264
+ |---|-------------|---------------|------------|-------------|-----------------|
265
+ | SP1 | V2Ray SOCKS5 connection reuse fails | First speed download through SOCKS5 worked; subsequent hung until timeout | V2Ray SOCKS5 proxy doesn't handle HTTP keep-alive correctly; connection pool stalls | Create fresh `SocksProxyAgent` (JS) or `HttpClient+HttpClientHandler` (C#) per request | NEVER reuse HTTP client for V2Ray SOCKS5 requests; create fresh per request |
266
+ | SP2 | V2Ray SOCKS5 connectivity pre-check missing | Speed test failed silently because SOCKS5 binding is asynchronous; proxy not ready | No verification that SOCKS5 tunnel was actually routing traffic before measuring speed | Added 3-attempt connectivity check (6 targets: google, cloudflare, 1.1.1.1, httpbin, ifconfig, ip-api) with 5s pause between | ALWAYS verify SOCKS5 connectivity with multi-target check before speed testing |
267
+ | SP3 | V2Ray preflight consumed tunnel | Preflight 1KB download via separate SocksProxyAgent consumed V2Ray's SOCKS5 connection; speed test's second agent failed with TLS disconnect | Multiple SocksProxyAgent instances competing for same V2Ray tunnel | Removed separate preflight; speed test probe (2MB) acts as connectivity test; `arraybuffer` mode instead of `stream` | Never use separate preflight through V2Ray SOCKS5; let speed test probe serve as connectivity check |
268
+ | SP4 | DNS failures behind WireGuard tunnels | `speedtestDirect` tried hostname URLs first; DNS failed through tunnel; 56/338 nodes showed ENOTFOUND | WireGuard tunnel broke DNS resolution for speed test hostnames | Pre-resolve Cloudflare IP before installing tunnel; try IP-based URL FIRST when cached IP available | ALWAYS pre-resolve speed test hostnames BEFORE installing WireGuard tunnel |
269
+ | SP5 | Speed test fallback chain incomplete in C# | 43 WireGuard nodes showed 0 speed in C# apps | C# had only 1 speed target, no DNS pre-resolve, no fallback chain, no rescue mode | Ported complete 7-level fallback: probe -> multi-request -> OVH -> Tele2 -> rescue -> google-fallback -> connected-no-throughput | Port the COMPLETE speed test fallback chain, not just the primary target |
270
+ | SP6 | Speed test `arraybuffer` vs `stream` mode | Speed test returned 0 bytes through V2Ray when using `responseType: 'stream'` | `stream` mode interacts differently with SocksProxyAgent than `arraybuffer` | Changed speedtestViaSocks5 to `arraybuffer` mode (matching what works in test-v2ray.js) | Use `arraybuffer` mode for all SOCKS5 speed test downloads |
271
+ | SP7 | V2Ray process needs post-handshake warmup | Speed test returned 0 immediately after V2Ray handshake | Node needs time to register UUID after handshake completes | Added 10s post-handshake delay for V2Ray before speed test | Wait 10s after V2Ray handshake before testing connectivity or speed |
272
+ | SP8 | Native fetch silently ignores SOCKS5 agent | Node.js `fetch` (undici) produced opaque failures through SOCKS5; speed test silently measured nothing | Node.js 18+ uses undici fetch adapter which ignores `agent` option for SOCKS5 proxies | Used `axios` with explicit `httpAgent` + `httpsAgent` for all SOCKS5 traffic | MUST use axios (not native fetch) for SOCKS5 proxy traffic in Node.js |
273
+
274
+ ### PRICING
275
+
276
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
277
+ |---|-------------|---------------|------------|-------------|-----------------|
278
+ | PR9 | BaseValue shows 18-decimal garbage | Price displayed as `52573.099722991367791000000000/GB` | Used `BaseValue` (Cosmos `sdk.Dec` with 18 decimal places) instead of `QuoteValue` (clean integer) | Always use `QuoteValue` for display; document the difference prominently | ALWAYS use `quote_value` (integer udvpn), NEVER `base_value` (18-decimal sdk.Dec) |
279
+ | PR10 | PreferHourly creates wrong sessions | SDK silently creates GB sessions when hourly requested | SDK bug: `PreferHourly` flag was ignored internally; always defaulted to GB | Documented as known SDK bug; use explicit `Gigabytes = 1` as fallback for hourly | Verify SDK actually creates the session type requested; don't trust flags blindly |
280
+ | PR11 | Session payment mode not exposed by chain | Cannot determine if session is GB-based or hourly from chain data | Chain `max_bytes` is always `1000000000` and `max_duration` is always `"0s"` regardless of payment mode | Built `SessionTracker` to persist payment mode locally | Consumer apps MUST track payment mode locally; chain data does not distinguish GB vs hourly |
281
+ | PR12 | Estimated cost shows insane numbers | "Est. Cost: 20,000 P2P" shown for a test scan | Sum of ALL viable node prices (500 nodes * 40 P2P) displayed as total | Change to show actual spent per scan or per-node average | Calculate estimated cost as `tested * avgNodePrice`, not `totalViable * nodePrice` |
282
+
283
+ ### DOCUMENTATION (continued)
284
+
285
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
286
+ |---|-------------|---------------|------------|-------------|-----------------|
287
+ | DC5 | Docs describe non-existent code | AI spent 10+ hours trying to use `NodeTester` class, `IVpnTestAdapter`, `createNodeTestAdapter()` -- none exist | Documentation written as design spec but presented as if documenting existing code; no IMPLEMENTED vs SPEC labels | Added status labels; replaced spec references with working code | Label EVERY class/function in docs as IMPLEMENTED or SPEC ONLY; grep codebase to verify |
288
+ | DC6 | No C# integration guide | AI reverse-engineered 6,500 lines of JS to build C# integration; 54% of time wasted | Node Tester docs are JS-only; no C# function mapping, no WPF-specific gotchas | Created complete C# integration report with working code | Every integration guide must include working code in BOTH JS and C# |
289
+ | DC7 | Country map only in index.html | Country-to-code mapping embedded in client-side HTML; not importable as module | `_CC` lookup table defined at line 688 of index.html; no `core/countries.js` export | Built separate country map with 120+ entries | Export shared data (country map, constants, thresholds) as importable modules, not embedded in HTML |
290
+ | DC8 | No cross-language function mapping | `nodeStatusV3()` (JS) = `NodeClient.GetStatusAsync()` (C#) -- no mapping document | Each SDK documented independently with different names | Created JS-to-C# function mapping table | Maintain cross-language function mapping table for every documented function |
291
+ | DC9 | Test result schema undocumented | AI reverse-engineered 30+ fields from `results.json` with wrong field names | No schema document; result shape varies between WG and V2Ray; optional fields not marked | Created complete result schema with real mainnet examples | Document result schema with types, optionality, and REAL JSON examples from mainnet tests |
292
+ | DC10 | Dashboard layout not specified | AI read 700-line HTML and tried to translate to WPF; layout intent lost | No layout specification; dashboard exists only as interleaved HTML+CSS+JS | Created `DASHBOARD-SPEC.md` with exact widths, alignments, data sources | Provide visual layout specification separate from implementation; not just source code |
293
+ | DC11 | Speed test evolved past documentation | Docs describe 3-target basic flow; real code has 7-level fallback chain | Speed test gained rescue mode, google-fallback, connected-no-throughput through iterative fixes; docs never updated | Documented all 7 speed test methods with decision logic | Update speed test documentation after EVERY fallback addition |
294
+ | DC12 | User journeys not documented | Basic features missing (load previous results, export, sort) because nobody walked through user flows | Docs describe WHAT the dashboard shows, not WHAT THE USER DOES across sessions | Defined 5 user journeys: first use, return visit, share results, investigate failure, compare over time | Document user journeys (multi-session workflows), not just features |
295
+
296
+ ### TESTING (continued)
297
+
298
+ | # | Failure Name | What Happened | Root Cause | Fix Applied | Prevention Rule |
299
+ |---|-------------|---------------|------------|-------------|-----------------|
300
+ | TS8 | Blind retesting 5+ times without fixes | Same 24 nodes run through same code 5 times; all failed identically | Hoping transient failures would pass on retry; they never did | Rule: "What is DIFFERENT this time?" -- must name a specific code change before any retest | NEVER retest without implementing a specific fix; write down what changed |
301
+ | TS9 | "Node-side" dismissal of 8 failures | 8 failures dismissed as node problems; all 8 were code bugs | Assumed nodes with weird behavior were broken, not our code | Iron Rule: peers > 0 = OUR fault. Investigated all 8 and found: stale cache, batch mapping, premature rejection, missing UUID wait | NEVER say "node-side" if peers > 0; other clients connect, so a working code path exists |
302
+ | TS10 | Stopped running audit to apply code fixes | Killed mid-audit server; lost 130 C# results permanently | Wanted to deploy fix immediately rather than waiting for natural restart | Auto-save results before ANY restart; code fixes wait for next natural restart | NEVER stop a running audit to apply fixes; save results first, deploy fix on next natural restart |
303
+ | TS11 | TCP port unreachable but peers connected | 10 nodes showed "port closed" but had 3-7 active peers | TCP probe timeout too short; DNS resolution differs; possible rate limiting from rapid-fire probes | Increase probe timeout; try alternate `remote_addrs`; handle rate limiting | Increase TCP probe timeout for distant nodes; try all `remote_addrs` before declaring unreachable |
304
+ | TS12 | SOCKS5 tunnel established but no connectivity | 5 V2Ray nodes with handshake success + SOCKS5 bind but zero internet through tunnel; all had 3-15 peers | V2Ray 5.2.1 may have grpc/quic bugs; connectivity targets may be blocked by node's egress policy; VMess AEAD may silently fail | Try newer V2Ray versions; test with alternate connectivity targets | Verify tunnel actually passes traffic before declaring connected; don't trust SOCKS5 port binding alone |
305
+ | TS13 | Clock drift nodes skipped but peers connected | 4 VMess-only nodes with >120s drift skipped entirely despite having 4-6 active peers | Assumed clock drift = permanent failure; peers may use VLess or have different auth | Try VLess outbound even on "VMess-only" nodes; verify VLess detection is complete | Before skipping for clock drift, verify node truly has NO VLess outbounds; peers may use a protocol we don't detect |
306
+ | TS14 | V2 format metadata rejection | Most popular node (48 peers) rejected because it returned v2 metadata format | SDK threw error on old `{protocol, tls, ca}` fields instead of mapping to v3 | Added v2-to-v3 field mapper | NEVER reject metadata outright -- map old formats to new; if 48 peers connect, the data is usable |
307
+ | TS15 | Session 500+ scan bottleneck | `waitForSessionActive` took 5+ minutes per node during retests | Function scanned ALL wallet sessions (500+) via broad LCD query instead of querying by specific session ID | Pass session ID directly; use `GET /sentinel/session/v3/sessions/{id}` | Always use direct session ID lookup, not full wallet session scan |
308
+
309
+ ---
310
+
311
+ ## Failure Statistics
312
+
313
+ | Category | Count | Most Common Root Cause |
314
+ |----------|-------|----------------------|
315
+ | Protocol | 12 | Transport config mismatch with sentinel-go-sdk server |
316
+ | Chain | 14 | v2/v3 field name changes, broken LCD endpoints |
317
+ | Tunnel | 16 | WireGuard Windows service lifecycle, crash recovery |
318
+ | Wallet | 3 | Unsafe defaults, missing state in early-return paths |
319
+ | Timing | 6 | Chain propagation lag, session lifecycle delays |
320
+ | Configuration | 8 | Wrong defaults for development, field naming |
321
+ | Dependencies | 7 | Missing admin, wrong binary versions, competing software |
322
+ | Testing | 15 | Mock data != real chain, blind retesting, "node-side" dismissals |
323
+ | Documentation | 12 | Docs lag behind code, specs disguised as docs, no cross-language mapping |
324
+ | Parity | 8 | Incremental additions without cross-language sync |
325
+ | Security | 7 | Silent swallowing, unconnected security features |
326
+ | Integration | 20 | Shared state, cancellation propagation, async coordination |
327
+ | UX | 12 | Missing data persistence across restarts, platform rendering gaps |
328
+ | Speed Test | 8 | SOCKS5 connection reuse, missing fallback chain, DNS behind tunnel |
329
+ | Pricing | 4 | BaseValue vs QuoteValue, session mode not on chain |
330
+ | **Total** | **152** | |
331
+
332
+ ---
333
+
334
+ ## The 14 Questions to Ask Before Every Change
335
+
336
+ From the C# SDK debacle (656 tests, 12 critical bugs) and Handshake dVPN integration (135 nodes, 27 problems):
337
+
338
+ 1. **"Does this work on the second attempt?"** -- Test reconnect, retry, recovery
339
+ 2. **"Does this use real chain data format?"** -- Test with actual LCD responses, not mocks
340
+ 3. **"Does this survive serialization round-trip?"** -- Serialize -> deserialize -> use
341
+ 4. **"Does this have silent side effects?"** -- Fee grants, proxy changes, route modifications
342
+ 5. **"Does this pay tokens before verifying success is possible?"** -- Pre-verify everything
343
+ 6. **"Does this parse ALL available fields?"** -- Map every chain field, not just the minimum
344
+ 7. **"Does this work at production scale?"** -- 100K subscribers, 900 nodes, 150K TXs
345
+ 8. **"Does the test verify the OUTPUT or just the INPUT?"** -- Check the broadcast TX bytes, not the flag value
346
+ 9. **"Does this work on app restart?"** -- Data must persist across close/open cycle
347
+ 10. **"Does the user see their data from last session?"** -- Never show empty when data exists on disk
348
+ 11. **"Can an AI reading only the docs build this correctly?"** -- If docs reference non-existent code, they cause more harm than no docs
349
+ 12. **"Is every code path ported from the reference?"** -- Copy ALL waits, sleeps, and fallbacks, not just the happy path
350
+ 13. **"What is DIFFERENT this time?"** -- Before any retest, name the specific code change
351
+ 14. **"Does this code path actually execute?"** -- Verify with logs, not labels; the C# bridge was cosmetic for months
352
+
353
+ ---
354
+
355
+ ## Source Projects
356
+
357
+ Every finding traces back to a specific project. This section documents the source for traceability.
358
+
359
+ ### Handshake dVPN (C# WPF)
360
+ **Files:** `handshake-RETROSPECTIVE.md`, `handshake-STANDARDS.md`, `handshake-sentinel.md`, `handshake-MANIFESTO.md`, `handshake-AI-NODE-TEST-INTEGRATION.md`
361
+ **Findings:** I1-I15, UX1-UX12, SP1-SP8, PR9-PR12, DC5-DC12, TS8-TS10
362
+ **Summary:** 26-hour build, 135 mainnet nodes tested (118 pass, 17 fail). 54% of time wasted on undocumented issues. Discovered that C# integration requires completely reimplementing speed test, flag rendering, disk cache, and session tracking from scratch due to missing SDK components.
363
+
364
+ ### Node Tester (JS Express)
365
+ **Files:** `node-tester-HANDOFF.md`, `node-tester-sentinel.md`, all 12 `node-tester-suggestion-*.md` files
366
+ **Findings:** P1-P12, C1-C14, T1-T16, TM1-TM6, TS1-TS7, TS11-TS15, I16-I20, DC7, DC10, DC11
367
+ **Summary:** 2200+ node tests across JS and C# SDKs. Found 24 protocol bugs. Proved Iron Rule: every node with peers > 0 that failed was our bug, not node-side. Documented complete speed test fallback chain, V2Ray config building, clock drift detection, and WireGuard lifecycle management.
368
+
369
+ ### Test2 (JS SDK Proving Ground)
370
+ **Files:** `test2-sentinel.md`
371
+ **Findings:** P7-P9, CF1-CF4, W1-W3, D1-D4
372
+ **Summary:** First consumer of JS SDK. Discovered autoReconnect was completely broken, BigInt serialization crash, broadcast name collision, fullTunnel default bricking AI's internet, and missing cleanup handler registration.
373
+
374
+ ### Desktop dVPN / C# SDK (EXE)
375
+ **Findings:** C1-C14, PR1-PR8, S1-S7, TS1-TS3, DC1-DC4
376
+ **Summary:** 656 unit tests passing with zero working features on mainnet. Proved that unit tests are meaningless without live chain integration tests. Found all v2/v3 field name mismatches, MsgEndSession not registered, 27 command injection surfaces, and parity gaps.
377
+
378
+ ### One-Shot Buildability Analysis
379
+ **Files:** `node-tester-suggestion-one-shot-buildability-analysis.md`
380
+ **Findings:** Wall 1-7 analysis identifying fundamental barriers to AI building node testers: batch payment not in SDK, session reuse complexity, V2Ray config minefield, clock drift detection, WireGuard admin requirement, fragile speed testing through tunnels, pipeline resilience.
381
+
382
+ ### Undiagnosed Failures Report
383
+ **Files:** `node-tester-suggestion-undiagnosed-failures.md`
384
+ **Findings:** TS11-TS14 (22 nodes with active peers: 10 TCP unreachable, 5 SOCKS5 no connectivity, 4 clock drift skips, 1 v2 format metadata, 2 handshake failures). Total ~130 active users connected to these "failing" nodes. All failures are in our code, not nodes.
385
+
386
+ ---
387
+
388
+ ## Pending Integration
389
+
390
+ ### [PENDING] fix-registry-backup.md
391
+ **Category:** BUG-FIX
392
+ **Summary:** `setSystemProxy()` overwrites Windows proxy settings with `/f` (force), no backup/restore of previous state. If user had corporate proxy, `clearSystemProxy()` sets "no proxy" instead of restoring their previous configuration.
393
+ **Action:** Review and integrate into T10/T11 entries above (proxy restore overwrites user's previous proxy).
394
+
395
+ ### [PENDING] fix-registry-backup.md
396
+ **Category:** BUG-FIX
397
+ **Summary:** **Status:** SUGGESTION — needs review **Date:** 2026-03-09 **Severity:** HIGH — current implementation force-overwrites, no restore to previous state **File affected:** `js-sdk/node-connect.js` lines 79-124 `setSystemProxy()` overwrites Windows proxy settings with `/f` (force), with no backup/restore of the previous proxy state.
398
+ **Action:** Review and integrate into main documentation above
399
+
400
+ ### [PENDING] fix-registry-backup.md
401
+ **Category:** BUG-FIX
402
+ **Summary:** **Status:** SUGGESTION — needs review **Date:** 2026-03-09 **Severity:** HIGH — current implementation force-overwrites, no restore to previous state **File affected:** `js-sdk/node-connect.js` lines 79-124 `setSystemProxy()` overwrites Windows proxy settings with `/f` (force):
403
+ **Action:** Duplicate of the preceding entry — merge with it during review and integrate once into the main documentation above