npm - @agoric/swing-store - Versions diffs - 0.9.2-u16.0 → 0.9.2-u17.0 - Mend

@agoric/swing-store 0.9.2-u16.0 → 0.9.2-u17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

package/CHANGELOG.md +7 -1
package/docs/bundlestore.md +30 -0
package/docs/data-export.md +6 -6
package/docs/kvstore.md +35 -0
package/docs/snapstore.md +49 -0
package/docs/swingstore.md +62 -6
package/docs/transcriptstore.md +70 -0
package/package.json +10 -10
package/src/archiver.js +80 -0
package/src/bundleStore.js +1 -1
package/src/exporter.js +5 -5
package/src/hasher.js +1 -1
package/src/importer.js +1 -1
package/src/index.js +2 -0
package/src/internal.js +1 -1
package/src/kvStore.js +1 -1
package/src/repairMetadata.js +1 -1
package/src/snapStore.js +113 -77
package/src/swingStore.js +11 -2
package/src/transcriptStore.js +281 -141
package/test/deletion.test.js +644 -20
package/test/export.test.js +4 -4
package/test/exportImport.test.js +1 -1
package/test/snapstore.test.js +47 -2
package/test/state.test.js +91 -67

package/CHANGELOG.md CHANGED Viewed

@@ -3,13 +3,17 @@
 All notable changes to this project will be documented in this file.
 See [Conventional Commits](https://conventionalcommits.org) for commit guidelines.
-### [0.9.2-u16.0](https://github.com/Agoric/agoric-sdk/compare/@agoric/swing-store@0.9.1...@agoric/swing-store@0.9.2-u16.0) (2024-07-02)
+### [0.9.2-u17.0](https://github.com/Agoric/agoric-sdk/compare/@agoric/swing-store@0.9.1...@agoric/swing-store@0.9.2-u17.0) (2024-09-17)
 ### Features
+* Add consensus-independent vat snapshot archiving configuration to AG_COSMOS_INIT ([ffc594f](https://github.com/Agoric/agoric-sdk/commit/ffc594f9441a9374646c43b69d289cc560962f64)), closes [#10036](https://github.com/Agoric/agoric-sdk/issues/10036)
+* Add consensus-independent vat transcript archiving configuration to AG_COSMOS_INIT ([d2d5803](https://github.com/Agoric/agoric-sdk/commit/d2d5803baab6e6379d179723244b2e92aac6319a)), closes [#10036](https://github.com/Agoric/agoric-sdk/issues/10036)
 * add exporter.getHostKV() API ([eb564f9](https://github.com/Agoric/agoric-sdk/commit/eb564f9635397c0706e1f8255b3e125681e2d031)), closes [#8523](https://github.com/Agoric/agoric-sdk/issues/8523)
+* **swing-store:** budget-limited deletion of snapshot and transcripts ([c43bf63](https://github.com/Agoric/agoric-sdk/commit/c43bf63846aedf3493ac6e8f4bc9f2bb48401d66)), closes [#8928](https://github.com/Agoric/agoric-sdk/issues/8928)
 * **swing-store:** faster import of swing-store ([0170568](https://github.com/Agoric/agoric-sdk/commit/0170568d66748af76f0bd24a4acdaa34b9c79cca))
+* **swing-store:** Limit item deletion to the previously-current transcript span ([766c1bb](https://github.com/Agoric/agoric-sdk/commit/766c1bbb082debe9d6fa94e08466d3596c971843)), closes [#9387](https://github.com/Agoric/agoric-sdk/issues/9387)
 * **swing-store:** prevent SwingSet usage of imported swing-store ([6a833eb](https://github.com/Agoric/agoric-sdk/commit/6a833ebda2b2ff0e72040ca8186f93ae91567add))
 * **swingstore:** add repairMetadata() ([33b5c1c](https://github.com/Agoric/agoric-sdk/commit/33b5c1c1fefd5278a24cd5f06630b238439e2891)), closes [#8025](https://github.com/Agoric/agoric-sdk/issues/8025) [#8025](https://github.com/Agoric/agoric-sdk/issues/8025)
 * tool for auditing dangling kindID references ([eeadc46](https://github.com/Agoric/agoric-sdk/commit/eeadc462d8fb09449e4ea6f0118ae8654e0c8e9b)), closes [#7655](https://github.com/Agoric/agoric-sdk/issues/7655)
@@ -20,7 +24,9 @@ See [Conventional Commits](https://conventionalcommits.org) for commit guideline
 * misc runtime robustness found by typecheck ([a033f26](https://github.com/Agoric/agoric-sdk/commit/a033f2638f9f11e19d94d7931e4e0614773b1f60))
 * performance.now() binding ([4a3b59b](https://github.com/Agoric/agoric-sdk/commit/4a3b59b486c0cb916e531d5de390fbf90e4421ad))
 * rewrite importSwingStore to preserve metadata properly ([38c9efc](https://github.com/Agoric/agoric-sdk/commit/38c9efce10957e0eb245e25ae5f9f45792eb58ad)), closes [#8025](https://github.com/Agoric/agoric-sdk/issues/8025)
+* **swing-store:** accept budget=Infinity to allow unlimited deletions ([c22b656](https://github.com/Agoric/agoric-sdk/commit/c22b65610007607f13373bc1dcc54007e30e2d60))
 * **swing-store:** add 'replay' artifactMode, make export more strict ([9939ea6](https://github.com/Agoric/agoric-sdk/commit/9939ea699bb1fd0b711f950679b432eef9054fda)), closes [#8105](https://github.com/Agoric/agoric-sdk/issues/8105)
+* **swing-store:** Delete transcript spans in stopUsingTranscript as in rollover ([2d1e478](https://github.com/Agoric/agoric-sdk/commit/2d1e47844760e0534afb3fe410a9635656949dad)), closes [#10054](https://github.com/Agoric/agoric-sdk/issues/10054)
 * **swing-store:** ensure crank savepoint is wrapped in transaction ([9d2dd3f](https://github.com/Agoric/agoric-sdk/commit/9d2dd3f9966940961a4c21351d256fa3615715d7))
 * **swing-store:** explicitly harden prototypes ([86c128a](https://github.com/Agoric/agoric-sdk/commit/86c128a29b5ed61764a67644c3734b0c05df2993))
 * **swing-store:** no completeness check when creating exporter ([d4df073](https://github.com/Agoric/agoric-sdk/commit/d4df073ffcc48b0f0e62bac107ee8edf21150ad9))

package/docs/bundlestore.md ADDED Viewed

@@ -0,0 +1,30 @@
+# BundleStore
+The `kernelStorage.bundleStore` sub-store manages code bundles. These can be used to hold vat-worker supervisor code (e.g. the [`@endo/lockdown`](https://github.com/endojs/endo/tree/master/packages/lockdown) bundle, or the [`@agoric/swingset-xsnap-supervisor` package](../../swingset-xsnap-supervisor), which incorporates liveslots), or the initial vat code bundles (for both kernel-defined bundles like vat-comms or vat-timer, or for application-defined bundles like vat-zoe or the ZCF code). It can also hold bundles that will be loaded later by userspace vat code, such as contract bundles.
+Each bundle held by the bundleStore is identified by a secure BundleID, which contains a format version integer and a hash, with a format like `b0-123abc456def...` or `b1-789ghi012...`. This contains enough information to securely define the behavior of the code inside the bundle, and to identify the tools needed to load/evaluate it.
+The bundleStore provides a simple add/get/remove API to the kernel. The kernel adds its own bundles during initialization, and provides the host application with an API to load additional ones in later. The kernel code that creates new vats will read bundles from the bundleStore when necessary, as vats are created. Userspace can get access to "BundleCap" objects that represent bundles, to keep the large bundle blobs out of RAM as much as possible.
+## Data Model
+Bundles are actually JavaScript objects: records of at least `{ moduleFormat }`, plus some format-specific fields like `endoZipBase64` and `endoZipBase64Sha512`. They are created by the [`@endo/bundle-source`](https://github.com/endojs/endo/tree/master/packages/bundle-source) package. Many are consumed by [`@endo/import-bundle`](https://github.com/endojs/endo/tree/master/packages/import-bundle), but the `b0-` format bundles can be loaded with some simple string manipulation and a call to `eval()` (which is how supervisor bundles are injected into new vat workers, before `@endo/import-bundle` is available).
+The bundleStore database treats each bundle as BundleID and a blob of contents. The SQLite `bundles` table is just `(bundleID TEXT, bundle BLOB)`. The bundleStore knows about each `moduleFormat` and how to extract the meaningful data and compress it into a blob, and how to produce the Bundle object during retrieval.
+The bundleStore also knows about the BundleID computation rules. The `addBundle()` API will verify that the contents match the ID, however it currently relies upon the caller to verify e.g. that the bundle does not contain any unexpected properties. The `importSwingStore()` API performs more extensive validation, to prevent corruption during the export+import process.
+The kernel is expected to keep track of which bundles are needed and when (with reference counts), and to not delete a bundle unless it is really unneeded. Currently, this means all bundles are retained forever.
+Unlike the `snapStore`, there is no notion of pruning bundles: either the bundle is present (with all its data), or there is no record of the BundleID at all.
+## Export Model
+Each bundle gets a single export-data entry, whose name is `bundle.${bundleID}`, and whose value is just `${bundleID}`. Each bundle also gets a single export artifact, whose name is `bundle.${bundleID}`, and whose contents are the compressed BLOB from the database (from which a Bundle record can be reconstructed).
+## Slow Deletion
+Since bundles are not owned by vats, there is nothing to delete when a vat is terminated. So unlike `transcriptStore` and `snapStore`, there is no concept of "slow deletion", and no APIs to support it.
+When a bundle is deleted by `bundleStore.deleteBundle()`, its export-data item is deleted immediately, and subsequent exports will omit the corresponding artifact.

package/docs/data-export.md CHANGED Viewed

@@ -188,13 +188,13 @@ Once the new SwingStore is fully populated with the previously-exported data, th
 Some of the data maintained by SwingStore is not strictly necessary for kernel execution, at least under normal circumstances. For example, once a vat worker performs a heap snapshot, we no longer need the transcript entries from before the snapshot was taken, since vat replay will start from the snapshot point. We split each vat's transcript into "spans", delimited by heap snapshot events, and the "current span" is the most recent one (still growing), whereas the "historical spans" are all closed and immutable. Likewise, we only really need the most recent heap snapshot for each vat: older snapshots might be interesting for experiments that replay old transcripts with different versions of the XS engine, but no normal kernel will ever need them.
-Most validators would prefer to prune this data, to reduce their storage needs. But we can imagine some [extreme upgrade scenarios](https://github.com/Agoric/agoric-sdk/issues/1691) that would require access to these historical transcript spans. Our compromise is to record *validation data* for these historical spans in the export data, but omit the spans themselves from the export artifacts. Validators can delete the old spans at will, and if we ever need them in the future, we can add code that will fetch copies from an archive service, validate them against the export data hashes, and re-insert the relevant entries into the SwingStore.
+Most blockchain validator nodes would prefer to prune this data, to reduce their storage needs. But we can imagine some [extreme upgrade scenarios](https://github.com/Agoric/agoric-sdk/issues/1691) that would require access to these historical transcript spans. Our compromise is to record *validation data* for these historical spans in the export data, but omit the spans themselves from the export artifacts. Validators can delete the old spans at will, and if we ever need them in the future, we can add code that will fetch copies from an archive service, validate them against the export data hashes, and re-insert the relevant entries into the SwingStore.
 Likewise, each time a heap snapshot is recorded, we cease to need any previous snapshot. And again, as a hedge against even more drastic recovery scenarios, we strike a compromise between minimizing retained data and the ability to validate old snapshots, by retaining only their hashes.
-As a result, for each active vat, the first-stage Export Data contains a record for every old transcript span, plus one for the current span. It also contains a record for every old heap snapshot, plus one for the most recent heap snapshot, plus a `.current` record that points to the most recent snapshot. However the exported artifacts may or may not include blobs for the old transcript spans, or for the old heap snapshots.
+As a result, for each active vat, the first-stage Export Data contains a record for every old heap snapshot, plus one for the most recent heap snapshot, plus a `.current` record that points to the most recent snapshot. It also contains a record for every old transcript span, plus one for the current span. However the exported artifacts may or may not include blobs for the old heap snapshots, or for the old transcript spans.
-The `openSwingStore()` function has an option named `keepTranscripts` (which defaults to `true`), which causes the transcriptStore to retain the old transcript items. A second option named `keepSnapshots` (which defaults to `false`) causes the snapStore to retain the old heap snapshots. Opening the swingStore with a `false` option does not necessarily delete the old items immediately, but they'll probably get deleted the next time the kernel triggers a heap snapshot or transcript-span rollover. Validators who care about minimizing their disk usage will want to set both to `false`. In the future, we will arrange the SwingStore SQLite tables to provide easy `sqlite3` CLI commands that will delete the old data, so validators can also periodically use the CLI command to prune it.
+The `openSwingStore()` function has an option named `keepSnapshots` (which defaults to `false`), which causes the snapStore to retain the old heap snapshots. A second option named `keepTranscripts` (which defaults to `true`) causes the transcriptStore to retain the old transcript items. Opening the swingStore with a `false` option does not necessarily delete the old items immediately, but they may get deleted the next time the kernel triggers a heap snapshot or transcript-span rollover. Hosts who care about minimizing their disk usage will want to set both to `false`. In the future, we will arrange the SwingStore SQLite tables to provide easy `sqlite3` CLI commands that will delete the old data, for use in periodic pruning.
 When exporting, the `makeSwingStoreExporter()` function takes an `artifactMode` option (in an options bag). This serves to both limit, and provide some minimal guarantees about, the set of artifacts that will be provided in the export. The defined values of `artifactMode` each build upon the previous one:
@@ -218,9 +218,9 @@ While `importSwingStore()`'s options bag accepts the same options as `openSwingS
 So, to avoid pruning current-incarnation historical transcript spans when exporting from one swingstore to another, you must set (or avoid overriding) the following options along the way:
 * the original swingstore must not be opened with `{ keepTranscripts: false }`, otherwise the old spans will be pruned immediately
-* the export must use `makeSwingStoreExporter(dirpath, { artifactMode: 'replay'})`, otherwise the export will omit the old spans
-* the import must use `importSwingStore(exporter, dirPath, { artifactMode: 'replay'})`, otherwise the import will ignore the old spans
-  * subsequent `openSwingStore` calls must not use `keepTranscripts: false`, otherwise the new swingstore will prune historical spans as new ones are created (during `rolloverSpan`).
+* the export must use `makeSwingStoreExporter(dirpath, { artifactMode: 'replay' })`, otherwise the export will omit the old spans
+* the import must use `importSwingStore(exporter, dirPath, { artifactMode: 'replay' })`, otherwise the import will ignore the old spans
+  * subsequent `openSwingStore` calls must not use `keepTranscripts: false`, otherwise the new swingstore will prune historical spans they are replaced during `rolloverSpan`.
 ## Implementation Details

package/docs/kvstore.md ADDED Viewed

@@ -0,0 +1,35 @@
+# KVStore
+The `kernelStorage.kvStore` sub-store manages a table of arbitrary key-value (string-to-string) pairs. It provides the usual get/set/has/delete APIs, plus a `getNextKey` call to support lexicographic iteration.
+There are three separate sections of the namespace. The normal one is the "consensus" section.  Each value written here will be given an export-data row, and incorporated into the "crankhash" (described below).
+The second is "local", and includes any key which is prefixed with `local.`. These keys are *not* given export-data rows, nor are they included in the crankhash.
+The third is "host", and includes any key which is prefixed with `host.`. This is not available to `kernelStorage.kvStore` at all: it is only accessed by methods on `hostStorage.kvStore` (the `kernelStorage` methods will throw an error if given a key like `host.foo`, and the `hostStorage` methods will throw *unless* given a key like `host.foo`). These are also excluded from export-data and the crankhash. Host keys are reserved for the host application, and are generally used to keep track of things like which block has been executed, to manage consistency between a separate host database (eg IAVL) and the swingstore. The host can record "I told the kernel to execute the contents of block 56" into `hostStorage.kvStore`, and then do `hostStorage.commit()`, and then it can record "I processed the rest of block 56" into is own DB, and then commit its own DB. If, upon startup, it observes a discrepancy between the `hostStorage.kvStore` record and its own DB, it knows it got interrupted between these two commit points, which can trigger recovery code.
+Any key which doesn't start with `local.` or `host.` is part of the "consensus" section.
+## CrankHash and ActivityHash
+Swingset kernels are frequently run in a consensus mode, where multiple instances of the kernel (on different machines) are expected to execute the same deliveries in lock-step. In this mode, every kernel is expected to do exactly the same computation, and any divergence indicates a failure (or attempt at malice). We want to detect such variations quickly, so the diverging/failing member can "fall out of consensus" promptly.
+The swingstore hashes all changes to the "consensus" portion of the kvStore into the "crank hash". This hash covers every change since the beginning of the current crank, and the kernel logs the result at the end of each crank, at which point the crankhash is reset.
+Each crank also updates a value called the "activity hash", by hashing the previous activityhash and the latest crankhash together. This records a chain of changes, and is logged at the end of each crank too.
+The host application can record the activityhash into its own consensus-tracking database (eg IAVL) at the end of each kernel run, to ensure that any internal divergence of swingset behavior is escalated to a proper consensus failure. Without this, one instance of the kernel might "think differently" than the others, but still "act" the same (in terms of IO or externally-visible messages) without triggering a failure, which would be a lurking problem.
+Logging both the crankhash and the activityhash improves our ability to diagnose consensus failures. By comparing logs between a "good" machine and a "bad" (diverging) one, we can quickly determine which crank caused the problem, and usually compare slogfile delivery/syscall records to narrow the divergence down to a specific syscall.
+kvStore changes are also recorded by the export-data, but these are too voluminous to be logged, and do not capture multiple changes to the same key. And not all host applications use exports, so there might not be anything watching export data.
+## Data Model
+The kvStore holds a simple string-to-string key/value store. The SQLite schema for the `kvStore` table is simply `(key TEXT, value TEXT)`.
+## Export Model
+To ensure that every key/value pair is correctly validatable, *all* in-consensus kvStore rows get their own export-data item. The name is just `kv.${key}`, and the value is just the value. `kvStore.delete(key)` will delete the export-data item. There are no artifacts.
+These make up the vast majority of the export-data items, both by count and by "churn" (the number of export-data items changed in a single crank). In the future, we would prefer to keep the kvStore in some sort of Merkle-tree data structure, and emit only a handful of export-data rows that contain hashes (perhaps just a single root hash). In this approach, the actual data would be exported in one or more artifacts. However, our SQLite backend does not provide the same kind of automatic Merkleization as IAVL, and only holds a single version of data at a time, making this impractical.

package/docs/snapstore.md ADDED Viewed

@@ -0,0 +1,49 @@
+# SnapStore
+The `kernelStorage.snapStore` sub-store tracks vat heap snapshots. These blobs capture the state of an XS JavaScript engine, between deliveries, to speed up replay-based persistence. The kernel can start a vat worker from a recent heap snapshot, and then it only needs to replay a handful of transcript items (deliveries), instead of replaying every delivery since the beginning of the incarnation.
+The XS / [`xsnap`](../../xsnap) engine defines the heap snapshot format. It consists of a large table of "slots", which are linked together to form JavaScript objects, strings, Maps, functions, etc. The snapshot also includes "chunks" for large data fields (like strings and BigInts), a stack, and some other supporting tables. The snapStore doesn't care about any of the internal details: it just gets a big blob of bytes.
+## Data Model
+Each snapshot is compressed and stored in the SQLite row as a BLOB. The snapStore has a single table named `snapshots`, with a schema of `(vatID TEXT, snapPos INTEGER, inUse INTEGER, hash TEXT, uncompressedSize INTEGER, compressedSize INTEGER, compressedSnapshot BLOB)`.
+The kernel has a scheduler which decides when to take a heap snapshot for each vat. There is a tradeoff between the immediate cost of creating the snapshot, versus the expected future savings of having a shorter transcript to replay. More frequent snapshots save time later, at the cost of time spent now.
+The kernel currently uses a [very simple scheduler](../../SwingSet/src/kernel/vat-warehouse.js), which takes a snapshot every `snapshotInterval` deliveries (e.g. 200), plus an extra one a few deliveries (`snapshotInitial`) into the new incarnation, to avoid replaying expensive contract startup code. The [SwingSet configuration documentation](../../SwingSet/docs/configuration.md) has the details.
+However, the swingstore is unaware of the kernel's scheduling policy. Every once in a while, the kernel tells the snapStore about a new snapshot, and the snapStore updates its data.
+Like the [transcriptStore](./transcriptstore.md), the snapStore retains a hash of older records, even after it prunes the snapshot data itself. There is at most one `inUse = 1` record for each vatID, and it will always have the highest `snapPos` value. When a particular vatID's active snapshot is replaced, the SQLite table row is updated to clear the `inUse` flag (i.e. set it to NULL). By default, the `compressedSnapshot` field is also set to NULL, removing the large data blob, but there is an option (`keepSnapshots: true`) to retain the full contents of all snapshots, even the ones that are no longer in use.
+## Export Model
+Each snapshot, both current and historic, gets an export-data entry. The name is `snapshot.${vatID}.${position}`, where `position` is the latest delivery (eg highest delivery number) that was included in the heap state captured by the snapshot. The value is a JSON-serialized record of `{ vatID, snapPos, hash, inUse }`.
+If there is a "current" snapshot, there will be one additional export-data record, whose name is `snapshot.${vatID}.current`, and whose value is `snapshot.${vatID}.${position}`. This value is the same as the name of the latest export-data record, and is meant as a convenient pointer to find that latest snapshot.
+The export *artifacts* will generally only include the current snapshot for each vat. Only the `debug` mode will include historical snapshots (and only if the swingstore was retaining them in the first place).
+## Slow Deletion
+As soon as a vat is terminated, the kernel will call `snapStore.stopUsingLastSnapshot()`. The DB is updated to clear the `inUse` flag of the latest snapshot, leaving no rows with `inUse = 1`. This immediately makes the vat non-loadable by the kernel. The snapshot data itself is deleted (unless `keepSnapshots: true`).
+This also modifies the latest `snapshot.${vatID}.${snapPos}` export-data record, to change `inUse` to 0, and removes the `snapshot.${vatID}.current` record. The modification and deletion are added to the export-data callback queue, so the host-app can learn about them after the next commit. Any subsequent `getExportData()` calls will observe the changes.
+As a result, all non-`debug` swing-store exports after this point will omit any artifacts for the snapshot blob, but they will still include export-data records (hashes) for all snapshots. (Deleting all the export-data records is too much work to do in a single step, so it is spread out over time).
+Later, as the kernel performs cleanup work for this vatID, the cleanup call will delete DB rows (one per `budget`). Each row deleted will also remove one export-data record, which feeds the callback queue, as well as affecting the full `getExportData()` results.
+Eventually, the snapStore runs out of rows to delete, and `deleteVatSnapshots(budget)` returns `{ done: true }`, so the kernel can finally rest.
+### SnapStore Vat Lifetime
+The SnapStore doesn't provide an explicit API to call when a vat is first created. The kernel just calls `saveSnapshot()` for both the first and all subsequent snapshots. Each `saveSnapshot()` marks the previous snapshot as unused, so there is at most one `inUse = 1` snapshot at any time. There will be zero in-use snapshots just after each incarnation starts, until enough deliveries have been made to trigger the first snapshot.
+When terminating a vat, the kernel should first call `snapStore.stopUsingLastSnapshot(vatID)`, the same call it would make at the end of an incarnation, to indicate that we're no longer using the last snapshot. This results in zero in-use snapshots.
+Then, the kernel must either call `snapStore.deleteVatSnapshots(vatID)` or `deleteVatSnapshots(vatID, Infinity)` to delete everything at once, or make a series of calls (spread out over time/blocks) to `snapStore.deleteVatSnapshots(vatID, budget)`. Each will return `{ done, cleanups }`, which can be used to manage the rate-limiting and know when the process is finished.
+The `stopUsingLastSnapshot()` is a performance improvement, but is not mandatory. If omitted, exports will continue to include the vat's snapshot artifacts until the first call to `deleteVatSnapshots()`, after which they will go away. Snapshots are deleted in descending `snapPos` order, so the first call will delete the only `inUse = 1` snapshot, after which exports will omit all artifacts for the vatID. `stopUsingLastSnapshot()` is idempotent, and extra calls will leave the DB unchanged.
+The kernel must keep calling `deleteVatSnapshots(vatID, budget)` until the `{ done }` return value is `true`. It is safe to call it again after that point; the function will keep returning `true`. But note, this costs one DB txn, so it may be cheaper for the kernel to somehow remember that we've reached the end.

package/docs/swingstore.md CHANGED Viewed

@@ -1,13 +1,57 @@
-# SwingStore Data Model
+# The SwingStore
 The "SwingStore" provides a database to hold SwingSet kernel state, with an API crafted to help both the kernel and the host application mutate, commit, export, and import this state.
-The state is broken up into several pieces, or "stores":
+The entire durable state of the kernel lives in the SwingStore: it does not use any other files or databases, and the only commit point is in `hostStorage.commit()`. Careful host applications can use this to avoid "hangover inconsistency", by storing all device output messages in the same database, and only releasing them once the kernel changes have been committed.
+In theory, an alternate implementation of this API could be provided with e.g. a different backend database, such as the host application's own native database (eg IAVL, for cosmos-sdk -based apps). This could simplify the atomicity domains by using just one database instead of two. This must be balanced against performance tradeoffs: swing-store takes advantage of SQL's indexing and iteration abilities, which might not be present in the other database.
+## Creating and Opening a SwingStore
+`initSwingStore(dirPath, options)` will create a new swingstore in the given directory, which will be created if it doesn't already exist. The entire directory is reserved for the swingstore: the host application should not put any other files there. The swingstore library will populated it with the SQLite DB's backing files: `swingstore.sqlite`, `swingstore.sqlite-wal`, and `swingstore.sqlite-shm`. If called on a directory that already contains a database, the DB will be erased first.
+`openSwingStore(dirPath, options)` does the same, but will not erase a pre-existing DB. In general, use `initSwingStore` for the initial creation of the DB, and `openSwingStore` for all subsequent access.
+Both calls return a record with `{ hostStorage, kernelStorage }`, along with some additional facets for testing and debugging. `dirPath` can be null to use a ephemeral (in-memory) DB, which is only useful for unit tests.
+## HostStorage
+The `hostStorage` facet is reserved for the host application. It is mostly used to manage commit points for the application.
+The host is responsible for calling `hostStorage.commit()` when it is done with kernel execution. This causes a SQLite `COMMIT` of the underlying database. It should perform this commit before it releases any device output messages. This facet is the only one with a `commit()` method: the kernel is explicitly unable to commit its own changes to the underlying SQLite database, because the kernel does not know anything about the host's application lifecycle or input/output activity, so it cannot know what qualifies as a safe commit point.
+If, for some reason, the host wants to abandon execution, it can call `hostStorage.close()`, which will close the swingstore without committing any changes. This is not normally useful: the kernel must be abandoned at this point too, so most of the time the host application should just exit entirely.
+`hostStorage.kvStore` is also available to let the host add items to a separate portion of the kvStore, using keys which start with a `host.` prefix. It can use this to coordinate with a separately-committed host database (e.g. to remember how much work has been given to the kernel, and how much has been successfully executed). This portion of the kvStore is unreachable by the kernel.
+`hostStorage.setExportCallback()` is used to register an export callback after swingstore creation, see [data-export.md](./data-export.md) for details. Most applications will instead provide `options.exportCallback` to `openSwingStore()`.
+`hostStorage.repairMetadata()` was used to repair a historical flaw in the database format, and is not needed by new installations.
+## KernelStorage
+The host application is supposed to deliver the `kernelStorage` facet to the kernel, by passing it into `initializeSwingset()`, `upgradeSwingset()`, and `buildVatController()`. The host application should not use `kernelStorage` itself.
+The kernel receives a facet named `kernelStorage`, from which it can access four sub-stores:
+* [`bundleStore`](./bundlestore.md): a string-keyed Bundle-value table, holding source bundles which can be evaluated by `importBundle` to create vats, or new Compartments within a vat
+* [`transcriptStore`](./transcriptstore.md): records a linear sequence of deliveries and syscalls (with results), collectively known as "transcript entries", for each vat
+* [`snapStore`](./snapstore.md): records one or more XS heap snapshots for each vat, to rebuild a worker more efficiently than replaying all transcript entries from the beginning
+* [`kvStore`](./kvstore.md): a string-keyed string-valued table, which holds everything else. Currently, this holds each vat's c-list and vatstore data, as well as the kernel-wide object and promise tables, and run-queues.
+These pieces operate independently: data in one substore does not affect the operation of the others.
+`kernelStorage` also provides access to the "crank" tools. Kernel execution proceeds in a series of steps named "cranks", many of which involve delivering a message to a vat worker. Sometimes these messages cause a failure halfway through the delivery, where it is better to record either complete deliveries or nothing at all. To support this, the kernel can mark the beginning of the crank (by calling `kernelStorage.startCrank()`), and then either discard the changes (`rollbackCrank()`) or accept them (`endCrank()`). The `emitCrankHashes()` method rotates the crankhash and updates the activityhash (see the kvStore documentation for details).
+Note that `endCrank()` does *not* perform a SQLite `COMMIT`, as that power is reserved for the host application (through `hostStorage.commit()`). Instead, the kernel only has access to SQLite "savepoints", which are smaller-scale than full transactions.
+# SwingStore Data Model
+The state is broken up into several pieces, or "sub-stores":
-* `bundleStore`: a string-keyed Bundle-value table, holding source bundles which can be evaluated by `importBundle` to create vats, or new Compartments within a vat
-* `transcriptStore`: records a linear sequence of deliveries and syscalls (with results), collectively known as "transcript entries", for each vat
-* `snapStore`: records one or more XS heap snapshots for each vat, to rebuild a worker more efficiently than replaying all transcript entries from the beginning
-* `kvStore`: a string-keyed string-valued table, which holds everything else. Currently, this holds each vat's c-list and vatstore data, as well as the kernel-wide object and promise tables, and run-queues.
 ## Incarnations, Spans, Snapshots
@@ -50,3 +94,15 @@ When a transcript span is pruned, the `transcriptSpans` row is left alone, but t
 During import, we create the metadata first (as the export-data is parsed), then later, we fill in the details as the artifacts are read.
 Bundles are never pruned, however during import, the `bundles` table will temporarily contain rows whose `bundle` BLOB is NULL.
+## Vat Lifetimes
+Two sub-stores are keyed by VatID: `transcriptStore` and `snapStore` (the `bundleStore` does not know which vats might know about each bundle, and the `kvStore` entries which relate to a specific vat will have the VatID embedded in the key, so the swing-store doesn't need to know about them).
+When the kernel terminates a vat, we want to delete the no-longer-necessary data. However, if the vat had a large number of transcript entries and/or heap snapshots, deleting all this data at the same time might cause excessing CPU or I/O usage (eg thousands of DB queries, or a multi-gigabyte `swingstore.sqlite-wal` file. It might also push a large number of changes into the export-data callbacks, which can cause memory or CPU stall problems in the host application. In the worst case, the entire application could crash.
+To limit this usage, and allow the kernel to delete vat state slowly, the swing-store is somewhat more aware of a vat's lifetime than a mere database should be. In particular, we split the shutdown process into two pieces. "Terminating a vat" happens first, and tells the sub-store to hide the vat from exports and from API calls that are meant to find out which vats are available. The kernel should call this exactly once, when the vat is terminated.
+The second part is "deletion", and it can happen either all-at-once or in multiple budget-limited calls. Both forms share the same API calls, differing only in their `budget` argument (`undefined` means all-at-once). The deletion API can be called multiple times, with a small budget, and each call will only delete a small portion of the state. They will return a value that indicates when the last bit of state has been deleted, so the kernel can know when to stop calling them.
+See [transcriptstore.md](./transcriptstore.md) and [snapstore.md](./snapstore.md) for more details.

package/docs/transcriptstore.md ADDED Viewed

@@ -0,0 +1,70 @@
+# TranscriptStore
+The `kernelStorage.transcriptStore` sub-store tracks vat delivery transcripts, through which the kernel can provide orthogonal persistence of JavaScript runtime environments (vats).
+Each vat is a JavaScript runtime environment, initialized by evaluating some starting code bundle, and then fed a series of deliveries. Each delivery may provoke some number of syscalls back to the kernel, with each get some response data. The delivery finishes with a "delivery result".
+For each delivery, this data (delivery, syscall/response pairs, delivery-result) is serialized and stored in a single "transcript item". Each item is indexed by an incrementing "delivery number" (`deliveryNum`).
+When a vat worker is brought online, the kernel retrieves these transcript items from the transcriptStore and replays them, by performing the delivery and responding to the syscalls, even though the syscall responses are pulled from the transcript instead of causing actual execution. The kernel asserts that the new worker behaves exactly like the original one did. For xsnap workers, the kernel doesn't actually have to replay the *entire* transcript, because it can start from a heap snapshot (stored in the adjoining [`snapStore`](./snapstore.md)). So generally it only needs to replay a single span.
+## Data Model
+Vat lifetimes are broken up into "incarnations", separated by upgrade events. Within each incarnation, the transcript is broken up into "spans", separated by heap-snapshot cycles. To end a span, the kernel records the worker's heap snapshot, then "closes" the old span, and opens a new one.
+This results in a single open or "current" span for each active vat, and a series of historical spans. For operational purposes, we only care about the current span. But to support some potential deep-replay needs, the transcriptStore can retain data about earlier spans.
+The SQLite database has one table that tracks transcript spans, named `transcriptSpans`. All vatIDs and incarnations are stored in the same table, whose schema is `(vatID TEXT, startPos INTEGER, endPos INTEGER, hash TEXT, isCurrent INTEGER, incarnation INTEGER)`. `startPos` and `endPos` define a zero-based range over the sequence of all deliveries into a vat (the former inclusive and the latter exclusive, such that e.g. `startPos=0` and `endPos=3` would encompass the first three deliveries, with positions 0, 1, and 2).
+A separate table named `transcriptItems` tracks the items themselves, with a schema of `(vatID TEXT, position INTEGER, item TEXT, incarnation INTEGER)`. This table has one row per transcript item, each of which is "owned" by a single span with matching values for `vatID` and `incarnation` and having `startPos <= position` and `endPos > position`. Each span owns multiple items (typically 200, but it depends upon how frequently the kernel rolls over new spans).
+In the future, historical spans may be compressed, and their item rows replaced with a single compressed blob in the span record. This would reduce the space needed without actually pruning the data.
+## Retention / Pruning
+If the current swingstore was opened with the `keepTranscripts = false` option, then the transcriptStore will "prune" each span as soon as it becomes historical. Pruned spans will still have a span record, with a hash, to enable safely-validated restoration of the transcript items later, if necessary. However their item records will be deleted, to save space.
+When `keepTranscripts = true`, all span items are retained.
+Pruned spans are not available for export artifacts, of course, because the data is missing. However the span *hashes* are still included in the export-data, to support safe validation. You can start with a pruned swingstore, produce an export dataset, import that dataset into a new swingstore, and the new swingstore will be just as capable of validating replacement span records as the original was.
+## Export Model
+Every transcript span, both current and historic, gets an export-data record. The record name is different for the two types of spans.
+Historical spans, which are "closed" and no longer growing, use a record name of `transcript.${vatID}.${startPos}.${endPos}`, where `startPos` is the delivery number of the first delivery included in the span and `endPos` is the the delivery number of the first delivery included in the **next** span (i.e., the former is an inclusive lower bound and the latter is an exclusive upper bound).
+The value is a JSON-serialized record of `{ vatID, startPos, endPos, hash, isCurrent, incarnation }` (where `isCurrent = 0`).
+The current span, if any, uses a record name of `transcript.${vatID}.current`, and has the same value as historical spans (except `isCurrent = 1`). Current spans are growing: new transcript items are added as more deliveries are made, until the span is closed off (becomes historical) and replaced with a new current span. There is at most one current span per vatID.
+The available export *artifacts* will depend upon the export mode, and upon the swingstore's `keepTranscripts` setting. Each export artifact corresponds to a single span, and the artifact names are always `transcript.${vatID}.${startPos}.${endPos}` (for both historical and current spans).
+In the most-minimal `operational` mode, the export includes one artifact for each active (non-terminated) vat: just the current span. If `keepTranscripts` is not true, these will be the only available artifacts anyways.
+The `replay` mode includes all spans for each vat's current incarnation, but omits spans from earlier incarnations. The `archival` mode includes all spans from all incarnations.
+The `debug` mode includes all available spans, even for terminated vats. For the non-`debug` modes, terminated vats will not provide export-data or artifacts.
+## Slow Deletion
+As soon as a vat is terminated, the kernel will call `transcriptStore.stopUsingTranscript()`.  The DB is updated to clear the `isCurrent` flag of the latest span, leaving no rows with `isCurrent = 1`. This immediately makes the vat non-loadable by the kernel.
+This also removes the `transcript.${vatID}.current` export-data record, and replaces it with a `transcript.${vatID}.${startPos}` one, effectively making the span historical. This change (one deletion, one addition) is added to the export-data callback queue, so the host-app can learn about it after the next commit, and any subsequent `getExportData()` calls will see the replacement record, instead of a `.current` record.
+At this point, all non-`debug` swing-store exports after this point will omit any artifacts for the vat, but they will still include export-data records (hashes) for all spans, all of which look historical. (Deleting all the span records, and their corresponding export-data records, is too much work to do in a single step).
+Later, as the kernel performs cleanup work for this vatID, the `transcriptStore.deleteVatTranscripts(budget)` cleanup call will delete one span row per `budget`, along with all related item rows (typically 200). Each span deleted will also remove one export-data record (which feeds the callback queue, as well as affecting the full `getExportData()` results).
+Eventually, the transcriptStore runs out of rows to delete, and `deleteVatTranscripts(budget)` returns `{ done: true }`, so the kernel can finally rest.
+### TranscriptStore Vat Lifetime
+Unlike the [SnapStore](./snapstore.md), the TranscriptStore *does* have an explicit call to be made when a vat is first created: `transcriptStore.initTranscript(vatID)`. Also unlike SnapStore, TranscriptStore (normally) always has an `isCurrent = 1` span for each vat (it might just be empty of items, immediately after the span rolls over).
+When a vat is terminated, the kernel should first call `transcriptStore.stopUsingTranscript(vatID)`. This will mark the single current span as `isCurrent = 0`. The kernel must not attempt to read, add, or rollover spans or items while in this state. While in this state, exports (export for `mode = debug`) will not emit artifacts for this VatID: export-data records will still exist for all spans, as these must be deleted slowly, however there will be no associated artifacts or artifact names.
+Then, the kernel should either call `transcriptStore.deleteVatTranscripts(vatID)` exactly once, or it should call `transcriptStore.deleteVatTranscripts(vatID, budget)` until it returns `{ done: true }`.
+As with snapshots, the `stopUsingTranscript()` is a non-mandatory performance improvement. If omitted, exports will continue to include (many) span artifacts for this vat until the first call to `deleteVatTranscripts()` removes the one `isCurrent = 1` span (since spans are deleted most-recent-first). After that point, exports will stop including any artifacts for the vatID. `stopUsingTranscript()` is idempotent, and extra calls will leave the DB unchanged.
+The kernel must keep calling `deleteVatTranscripts(vatID, budget)` until the `{ done }` return value is `true`.  As with the SnapStore, it is safe to call it again after that point; the function will keep returning `true`.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@agoric/swing-store",
-  "version": "0.9.2-u16.0",
+  "version": "0.9.2-u17.0",
   "description": "Persistent storage for SwingSet",
   "type": "module",
   "main": "./src/index.js",
@@ -21,16 +21,16 @@
     "lint:eslint": "eslint ."
   },
   "dependencies": {
-    "@agoric/assert": "^0.6.1-u16.0",
-    "@agoric/internal": "^0.4.0-u16.0",
-    "@endo/base64": "^1.0.5",
-    "@endo/bundle-source": "^3.2.3",
-    "@endo/check-bundle": "^1.0.7",
-    "@endo/nat": "^5.0.7",
+    "@agoric/internal": "^0.4.0-u17.0",
+    "@endo/base64": "^1.0.7",
+    "@endo/bundle-source": "^3.4.0",
+    "@endo/check-bundle": "^1.0.9",
+    "@endo/errors": "^1.2.5",
+    "@endo/nat": "^5.0.10",
     "better-sqlite3": "^9.1.1"
   },
   "devDependencies": {
-    "@endo/init": "^1.1.2",
+    "@endo/init": "^1.1.4",
     "@types/better-sqlite3": "^7.6.9",
     "ava": "^5.3.0",
     "c8": "^9.1.0",
@@ -49,7 +49,7 @@
     "timeout": "2m"
   },
   "typeCoverage": {
-    "atLeast": 76.31
+    "atLeast": 78.77
   },
-  "gitHead": "bbdf652c3f413381cb352a8a360db1063974fafd"
+  "gitHead": "515c4c0efccfc91b97da30037c10fc4b076851e2"
 }

package/src/archiver.js ADDED Viewed

@@ -0,0 +1,80 @@
+import { finished as streamFinishedCallback, Readable } from 'node:stream';
+import { promisify } from 'node:util';
+import { createGzip } from 'node:zlib';
+import { withDeferredCleanup } from '@agoric/internal';
+const streamFinished = promisify(streamFinishedCallback);
+/*
+ * @param {string} dirPath
+ * @param {object} powers
+ * @param {Pick<import('fs'), 'createWriteStream' | 'mkdirSync' | 'renameSync'>} powers.fs
+ * @param {Pick<import('path'), 'join'>} powers.path
+ * @param {Pick<import('tmp'), 'fileSync'>} powers.tmp
+ */
+export const makeArchiveSnapshot = (dirPath, powers) => {
+  const { fs, path, tmp } = powers;
+  fs.mkdirSync(dirPath, { recursive: true });
+  const archiveSnapshot = (name, gzData) => {
+    const destPath = path.join(dirPath, `${name}.gz`);
+    return withDeferredCleanup(async addCleanup => {
+      const {
+        name: tmpName,
+        fd,
+        removeCallback,
+      } = tmp.fileSync({
+        prefix: name,
+        postfix: '.gz',
+        detachDescriptor: true,
+      });
+      addCleanup(() => removeCallback());
+      const writer = fs.createWriteStream('', { fd, flush: true });
+      const reader = Readable.from(gzData);
+      const destroyReader = promisify(reader.destroy.bind(reader));
+      addCleanup(() => destroyReader(null));
+      reader.pipe(writer);
+      await streamFinished(writer);
+      fs.renameSync(tmpName, destPath);
+    });
+  };
+  return archiveSnapshot;
+};
+harden(makeArchiveSnapshot);
+/*
+ * @param {string} dirPath
+ * @param {object} powers
+ * @param {Pick<import('fs'), 'createWriteStream' | 'mkdirSync' | 'renameSync'>} powers.fs
+ * @param {Pick<import('path'), 'join'>} powers.path
+ * @param {Pick<import('tmp'), 'fileSync'>} powers.tmp
+ */
+export const makeArchiveTranscript = (dirPath, powers) => {
+  const { fs, path, tmp } = powers;
+  fs.mkdirSync(dirPath, { recursive: true });
+  const archiveTranscript = (spanName, entries) => {
+    const destPath = path.join(dirPath, `${spanName}.gz`);
+    return withDeferredCleanup(async addCleanup => {
+      const {
+        name: tmpName,
+        fd,
+        removeCallback,
+      } = tmp.fileSync({
+        prefix: spanName,
+        postfix: '.gz',
+        detachDescriptor: true,
+      });
+      addCleanup(() => removeCallback());
+      const writer = fs.createWriteStream('', { fd, flush: true });
+      const gzip = createGzip();
+      gzip.pipe(writer);
+      const reader = Readable.from(entries);
+      const destroyReader = promisify(reader.destroy.bind(reader));
+      addCleanup(() => destroyReader(null));
+      reader.pipe(gzip);
+      await streamFinished(gzip);
+      fs.renameSync(tmpName, destPath);
+    });
+  };
+  return archiveTranscript;
+};
+harden(makeArchiveTranscript);

package/src/bundleStore.js CHANGED Viewed

@@ -2,10 +2,10 @@
 import { createHash } from 'crypto';
 import { Readable } from 'stream';
 import { Buffer } from 'buffer';
+import { Fail, q } from '@endo/errors';
 import { encodeBase64, decodeBase64 } from '@endo/base64';
 import { checkBundle } from '@endo/check-bundle/lite.js';
 import { Nat } from '@endo/nat';
-import { Fail, q } from '@agoric/assert';
 import { createSHA256 } from './hasher.js';
 /**

package/src/exporter.js CHANGED Viewed

@@ -1,6 +1,6 @@
 import sqlite3 from 'better-sqlite3';
-import { Fail, q } from '@agoric/assert';
+import { Fail, q } from '@endo/errors';
 import { dbFileInDirectory } from './util.js';
 import { getKeyType } from './kvStore.js';
@@ -163,23 +163,23 @@ export function makeSwingStoreExporter(dirPath, options = {}) {
   harden(getExportData);
   /** @yields {string} */
-  async function* artifactNames() {
+  async function* generateArtifactNames() {
     yield* snapStore.getArtifactNames(artifactMode);
     yield* transcriptStore.getArtifactNames(artifactMode);
     yield* bundleStore.getArtifactNames();
   }
-  harden(artifactNames);
+  harden(generateArtifactNames);
   /**
    * @returns {AsyncIterableIterator<string>}
    */
   function getArtifactNames() {
     if (artifactMode !== 'debug') {
-      // throw if this DB will not be able to yield all the desired artifacts
+      // synchronously throw if this DB will not be able to yield all the desired artifacts
       const internal = { snapStore, bundleStore, transcriptStore };
       assertComplete(internal, artifactMode);
     }
-    return artifactNames();
+    return generateArtifactNames();
   }
   /**

package/src/hasher.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { Fail } from '@agoric/assert';
+import { Fail } from '@endo/errors';
 import { createHash } from 'crypto';

package/src/importer.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { Fail, q } from '@agoric/assert';
+import { Fail, q } from '@endo/errors';
 import { makeSwingStore } from './swingStore.js';
 import { buffer } from './util.js';

package/src/index.js CHANGED Viewed

@@ -2,6 +2,8 @@ export { initSwingStore, openSwingStore, isSwingStore } from './swingStore.js';
 export { makeSwingStoreExporter } from './exporter.js';
 export { importSwingStore } from './importer.js';
+export { makeArchiveSnapshot, makeArchiveTranscript } from './archiver.js';
 // temporary, for the benefit of SwingSet/misc-tools/replay-transcript.js
 export { makeSnapStore } from './snapStore.js';
 // and less temporary, for SwingSet/test/vat-warehouse/test-reload-snapshot.js

package/src/internal.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { Fail, q } from '@agoric/assert';
+import { Fail, q } from '@endo/errors';
 /**
  * @typedef { import('./snapStore.js').SnapStoreInternal } SnapStoreInternal

package/src/kvStore.js CHANGED Viewed

@@ -1,5 +1,5 @@
 // @ts-check
-import { Fail } from '@agoric/assert';
+import { Fail } from '@endo/errors';
 /**
  * @typedef {{

package/src/repairMetadata.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { Fail, q } from '@agoric/assert';
+import { Fail, q } from '@endo/errors';
 import { assertComplete } from './assertComplete.js';
 /**