querysub 0.415.0 → 0.417.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "querysub",
3
- "version": "0.415.0",
3
+ "version": "0.417.0",
4
4
  "main": "index.js",
5
5
  "license": "MIT",
6
6
  "note1": "note on node-forge fork, see https://github.com/digitalbazaar/forge/issues/744 for details",
@@ -62,7 +62,7 @@
62
62
  "pako": "^2.1.0",
63
63
  "peggy": "^5.0.6",
64
64
  "querysub": "^0.357.0",
65
- "socket-function": "^1.1.18",
65
+ "socket-function": "^1.1.19",
66
66
  "terser": "^5.31.0",
67
67
  "typesafecss": "^0.28.0",
68
68
  "yaml": "^2.5.0",
@@ -2,7 +2,7 @@ import { SocketFunction } from "socket-function/SocketFunction";
2
2
  import { getArchives } from "../-a-archives/archives";
3
3
  import { getDomain, isDevDebugbreak, isNoNetwork, isPublic } from "../config";
4
4
  import { measureBlock } from "socket-function/src/profiling/measure";
5
- import { isNode, sha256Hash, throttleFunction, timeInMinute, timeInSecond } from "socket-function/src/misc";
5
+ import { isNode, keyByArray, sha256Hash, throttleFunction, timeInMinute, timeInSecond } from "socket-function/src/misc";
6
6
  import { errorToUndefinedSilent, ignoreErrors, logErrors, timeoutToError, timeoutToUndefinedSilent } from "../errors";
7
7
  import { ensureWeAreTrusted, requiresNetworkTrustHook } from "../-d-trust/NetworkTrust2";
8
8
  import { delay, runInfinitePoll, runInfinitePollCallAtStart } from "socket-function/src/batching";
@@ -22,6 +22,7 @@ import { EdgeNodeConfig } from "../4-deploy/edgeNodes";
22
22
  import * as certs from "../-a-auth/certs";
23
23
  import { logDisk } from "../diagnostics/logs/diskLogger";
24
24
  import { MaybePromise } from "socket-function/src/types";
25
+ import { getPathStr2 } from "../path";
25
26
 
26
27
  let HEARTBEAT_INTERVAL = timeInMinute * 15;
27
28
  // Interval which we check other heartbeats
@@ -196,10 +197,6 @@ function addNodeIdBase(nodeId: string) {
196
197
  }
197
198
  async function setNodeIds(nodeIds: string[]) {
198
199
  nodeIds = nodeIds.filter(x => x !== SPECIAL_NODE_ID_FOR_UNMOUNTED_NODE);
199
- if (isNode()) {
200
- await Promise.allSettled(nodeIds.map(checkWrongServerNodeId));
201
- nodeIds = nodeIds.filter(nodeId => !wrongServerNodeIds.has(nodeId));
202
- }
203
200
 
204
201
  console.info("setNodeIds", { nodeIds });
205
202
  let newNodeIds = nodeIds.filter(nodeId => !allNodeIds2.has(nodeId));
@@ -297,53 +294,37 @@ export async function triggerNodeChange() {
297
294
  }));
298
295
  }
299
296
 
300
- // If we can connect on the same port, but it has a different thread ID, it means the old thread ID is gone. We're never going to go back to an old thread ID, and we can't have two threads on the same port.
301
- let wrongServerNodeIds = new Set<string>();
302
- let checkWrongServerNodeId = cache(async (nodeId: string) => {
303
- if (wrongServerNodeIds.has(nodeId)) return;
304
- let callFactory = await timeoutToUndefinedSilent(timeInSecond * 5, Promise.resolve(getCreateCallFactory(nodeId)));
305
- if (!callFactory) {
306
- if (SocketFunction.logMessages) {
307
- console.log(`Did not find call factory for ${nodeId}`);
308
- }
309
- // Clear it right away, so we can check for it being alive quickly.
310
- checkWrongServerNodeId.clear(nodeId);
311
- return;
312
- }
313
- if (callFactory) {
314
- // Not great, but... this should work well enough.
315
- for (let i = 0; i < 10; i++) {
316
- if (callFactory.receivedInitializeState) break;
317
- await delay(500);
318
- }
319
- if (!callFactory.receivedInitializeState && SocketFunction.logMessages) {
320
- console.log(`Did not receive initialize state from ${nodeId}`);
321
- }
322
- } else {
323
- if (SocketFunction.logMessages) {
324
- console.log(`Did not find call factory for ${nodeId}`);
325
- }
297
+
298
+ async function clearDeadThreadsFromArchives() {
299
+ let nodes = await archives().find("");
300
+
301
+ function getPortHash(nodeId: string) {
302
+ let obj = decodeNodeId(nodeId);
303
+ if (!obj) return undefined;
304
+ return getPathStr2(obj.machineId, obj.port + "");
326
305
  }
327
- if (callFactory && callFactory.realNodeId && callFactory.realNodeId !== nodeId) {
328
- if (SocketFunction.logMessages) {
329
- console.log(red(`Found dead thread, disconnecting node and deleting from archives ${nodeId}`));
330
- }
331
- wrongServerNodeIds.add(nodeId);
332
- callFactory?.disconnect();
333
- // Dead threads never come back, so this should be safe to do.
334
- await archives().del(nodeId);
335
- // Return, so we don't clear this.
336
- return;
337
- } else {
338
- if (SocketFunction.logMessages) {
339
- console.log(green(`Found live thread, node ${nodeId}, real node id ${callFactory?.realNodeId}`));
306
+ let byPortHash = keyByArray(nodes, getPortHash);
307
+ for (let [portHash, nodeIds] of Array.from(byPortHash)) {
308
+ if (!portHash) continue;
309
+ let aliveNodeId = "";
310
+ await Promise.all(nodeIds.map(async nodeId => {
311
+ let alive = await errorToUndefinedSilent(NodeDiscoveryController.nodes[nodeId].isAlive());
312
+ if (alive) {
313
+ aliveNodeId = nodeId;
314
+ }
315
+ }));
316
+ if (aliveNodeId) {
317
+ let deadThreads = nodeIds.filter(nodeId => nodeId !== aliveNodeId);
318
+ await Promise.all(deadThreads.map(async deadNodeId => {
319
+ console.log(`Removing dead thread. We contacted a node on the same port and same machine (${aliveNodeId}), which means the port has been reused by another thread, which proves that the old thread has died, as otherwise the new thread would not be able to use it.`);
320
+ await archives().del(deadNodeId);
321
+ }));
340
322
  }
341
323
  }
342
324
 
343
- setTimeout(() => {
344
- checkWrongServerNodeId.clear(nodeId);
345
- }, timeInMinute * 5);
346
- });
325
+ return nodes;
326
+
327
+ }
347
328
 
348
329
  async function syncArchives() {
349
330
  if (isServer()) {
@@ -381,7 +362,8 @@ async function runHeartbeatAuditLoop() {
381
362
  //console.log(magenta(`Auditing node list`));
382
363
 
383
364
  let deadTime = Date.now() - DEAD_THRESHOLD;
384
- let nodeIds = await archives().find("");
365
+
366
+ let nodeIds = await clearDeadThreadsFromArchives();
385
367
  // We spent the money checking the node list, so we might as well update it
386
368
  await setNodeIds(nodeIds);
387
369
 
@@ -451,7 +433,6 @@ async function fastMemorySync() {
451
433
  // with extra nodes. However, if we are missing nodes, we'd prefer to have them quickly, so we should
452
434
  // sync now.
453
435
  let missingNodes = otherNodes.filter(nodeId => !allNodeIds2.has(nodeId));
454
- missingNodes = missingNodes.filter(nodeId => !wrongServerNodeIds.has(nodeId));
455
436
  if (missingNodes.length > 0) {
456
437
  console.log(yellow(`Node list is missing nodes, resyncing node`), { missingNodes, otherNodes });
457
438
  await syncArchives();
@@ -614,6 +595,9 @@ const tellEveryoneNodesChanges = throttleFunction(1000, function tellEveryoneNod
614
595
 
615
596
 
616
597
  class NodeDiscoveryControllerBase {
598
+ public async isAlive() {
599
+ return true;
600
+ }
617
601
  public async addNode(nodeId: string) {
618
602
  console.log(magenta(`Received addNode`), { nodeId });
619
603
  addNodeId(nodeId);
@@ -642,7 +626,7 @@ const NodeDiscoveryController = SocketFunction.register(
642
626
  "NodeDiscoveryController-7991037e-fd9e-4085-b1db-52035487e72c",
643
627
  new NodeDiscoveryControllerBase(),
644
628
  () => ({
645
- getOwnNodeId: { noClientHooks: true, noDefaultHooks: true },
629
+ isAlive: {},
646
630
  addNode: { hooks: [requiresNetworkTrustHook] },
647
631
  resyncNodes: { hooks: [requiresNetworkTrustHook] },
648
632
  getAllNodesHash: { hooks: [requiresNetworkTrustHook] },
@@ -581,6 +581,11 @@ function createObjectAliveChecker(data: () => any, path: SchemaPath, delay: numb
581
581
  });
582
582
  }
583
583
 
584
+ let querysub: typeof import("../4-querysub/Querysub") | undefined = undefined;
585
+ setImmediate(async () => {
586
+ querysub = await import("../4-querysub/Querysub");
587
+ });
588
+
584
589
  let callIdOverrideFncs: Map<string, Map<string, {
585
590
  prefix: string;
586
591
  getKey: (...args: unknown[]) => string;
@@ -591,11 +596,22 @@ export function getCallIdOverride(config: {
591
596
  callId: string;
592
597
  args: unknown[];
593
598
  }): string {
594
- let def = callIdOverrideFncs.get(config.moduleId)?.get(config.functionId);
595
- if (!def) return config.callId;
596
- return createRoutingOverrideKey({
597
- remappedPrefix: def.prefix,
598
- originalKey: config.callId,
599
- routeKey: def.getKey(...config.args),
600
- });
599
+ try {
600
+ if (querysub) {
601
+ if (!querysub.Querysub.isInSyncedCall()) {
602
+ // NOTE: This is wrong and it will result in loading user being used as a key sometimes. However, it should be fine, as if the call id override is wrong, it shouldn't break anything.
603
+ return querysub.Querysub.localRead(() => getCallIdOverride(config));
604
+ }
605
+ }
606
+ let def = callIdOverrideFncs.get(config.moduleId)?.get(config.functionId);
607
+ if (!def) return config.callId;
608
+ return createRoutingOverrideKey({
609
+ remappedPrefix: def.prefix,
610
+ originalKey: config.callId,
611
+ routeKey: def.getKey(...config.args),
612
+ });
613
+ } catch (e: any) {
614
+ console.warn(`Error getting call id override for ${config.moduleId}.${config.functionId}, falling back to original call id`, { error: e.stack });
615
+ return config.callId;
616
+ }
601
617
  }
@@ -150,6 +150,7 @@ export class SyncTestPage extends qreact.Component {
150
150
  hasError = true;
151
151
  }
152
152
  if (hasError) return undefined;
153
+ if (!result?.time) return undefined;
153
154
  return (
154
155
  <div className={css.hbox(5).hsl(0, 0, 100).pad2(2)} title={JSON.stringify(thread)}>
155
156
  <div>{getUniqueThreadName(thread, allThreads)}</div>
package/tempnotes.txt CHANGED
@@ -2,60 +2,14 @@
2
2
 
3
3
  Local CYOA + Local FunctionRunner works
4
4
 
5
+ -1) A lot of API calls are getting locked up and never finishing. We need to fix this. What the fuck is happening? It happens a lot with audio calls, as we make a lot of audio calls, but we can probably replicate it with non-audio calls. I think it happened with some embeddings, although we did create a lot of embeddings, and it only happened at the end. If it's only an audio thing, that's different, but we still need some kind of timeout. But I don't think it's an audio thing, I think it's not actually starting.
5
6
 
6
- Wait... Didn't we restart everything? Why does it seem like the function runner is still watching the same things?
7
- "00020e3a3b543382.a794fbcf7b104c68.querysubtest.com:37381"
8
-
9
-
10
-
11
- Hmm... we ask to watch it, but the watch doesn't show up on the other side? Fuck...
12
-
13
- Hmm, it's still failing in the same way. The function runner just has, like, no watches. It tried to watch I assume the same thing as before. Nothing showed up on the other end.
14
-
15
-
16
-
17
-
18
- So we have the remote watcher, so it looks like we were watching it, but we don't think it's synced, so we clearly didn't get a value back.
19
-
20
- WATCHER FAILED TO SYNC findFunctionsToCall DID NOT RECEIVE PATH VALUES. This means PathValueServer is not responding to watches, either to specific paths, or for all paths [
21
- '.,querysubtest._com.,',
22
- '.,querysubtest._com.,PathFunctionRunner.,'
23
- ] [ '.,querysubtest._com.,PathFunctionRunner.,' ] [Function: findFunctionsToCall]
24
- Node list is missing nodes, resyncing node {
25
- missingNodes: [ '716181d6570f7b55.a794fbcf7b104c68.querysubtest.com:34689' ],
26
-
27
-
28
- UGH... poking fixed it. WHICH MEANS, we have to break it again, and get into the bad state again!
29
- - Ugh...
30
-
31
- -1) We got into a bad state where the querysub nodes weren't really listening, or they had zombie watchers that hadn't finished synchronizing, but it wasn't complaining about being unable to find any paths. There were no unwatched paths.
32
- - It fixed when we poked them.
33
-
34
- 0) If it fails after redeploying, disable the server and run everything locally again, as we made a lot of changes, and so it might not even work locally.
35
-
36
- -1) Get a test script that doesn't even use the function runner working.
37
-
38
- 0) Get our front end working. I guess we need to run the function runner locally, and then if that works, we need to figure out why the remote function runner isn't working.
39
- - Also, if we run the local querysub server and local function runner, see whether those writes can be seen remotely. If it's a remote function runner problem but not a remote path value server problem, we should see the writes.
40
-
41
- -1) Fix the remote servers not being able to send values to our local path value server. I don't know why it wouldn't work... They should have changed the name knowing our local name.
42
-
43
- 1) Use connection page to verify server can talk to our locally hosted server
44
- 1.1) Verify by breaking into our local server that we are receiving values written on the remote server in the local server.
45
- - This is extremely important. Without this, we can't run a local server. There's all kinds of issues, but our socket function changes should make this so this just works.
46
7
 
47
8
  0) SHARD THE FUNCTION RUNNER!
48
9
  - And secondary sharding for backup...
49
10
  - First shard locally
50
11
  - Test on our sync test page (otherwise all the writes are going to go to the same function runner anyway, so it's not a good test).
51
12
 
52
- -2) Errors are not being automatically grouped.
53
- - We are getting notifications though, which is weird. The notifications are supposed to be automatically grouping it. We should probably look at its logs and see why.
54
-
55
- 0) We just are constantly "Node list is missing nodes, resyncing node"
56
- - We should check the logs. I'm assuming what happened is that at some point some server thought it was the wrong node ID and deleted it?
57
- - It's generally the same ones, except new ones get added.
58
-
59
13
  I think even if we run into some occasional issues, we should just power through and try to fix them later. Because I'm sick of working on the framework...
60
14
 
61
15
  MONTHLY SUMMARY!