datahike-browser-tests 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.circleci/config.yml +405 -0
- package/.circleci/scripts/gen_ci.clj +194 -0
- package/.cirrus.yml +60 -0
- package/.clj-kondo/babashka/sci/config.edn +1 -0
- package/.clj-kondo/babashka/sci/sci/core.clj +9 -0
- package/.clj-kondo/config.edn +95 -0
- package/.dir-locals.el +2 -0
- package/.github/FUNDING.yml +3 -0
- package/.github/ISSUE_TEMPLATE/1-bug-report.yml +68 -0
- package/.github/ISSUE_TEMPLATE/2-feature-request.yml +28 -0
- package/.github/ISSUE_TEMPLATE/config.yml +6 -0
- package/.github/pull_request_template.md +24 -0
- package/.github/workflows/native-image.yml +84 -0
- package/LICENSE +203 -0
- package/README.md +273 -0
- package/bb/deps.edn +9 -0
- package/bb/resources/github-fingerprints +3 -0
- package/bb/resources/native-image-tests/run-bb-pod-tests.clj +162 -0
- package/bb/resources/native-image-tests/run-libdatahike-tests +12 -0
- package/bb/resources/native-image-tests/run-native-image-tests +74 -0
- package/bb/resources/native-image-tests/run-python-tests +22 -0
- package/bb/resources/native-image-tests/testconfig.attr-refs.edn +6 -0
- package/bb/resources/native-image-tests/testconfig.edn +5 -0
- package/bb/resources/template/.settings/org.eclipse.jdt.apt.core.prefs +2 -0
- package/bb/resources/template/.settings/org.eclipse.jdt.core.prefs +9 -0
- package/bb/resources/template/.settings/org.eclipse.m2e.core.prefs +4 -0
- package/bb/resources/template/pom.xml +22 -0
- package/bb/src/tools/build.clj +132 -0
- package/bb/src/tools/clj_kondo.clj +32 -0
- package/bb/src/tools/deploy.clj +26 -0
- package/bb/src/tools/examples.clj +19 -0
- package/bb/src/tools/npm.clj +100 -0
- package/bb/src/tools/python.clj +14 -0
- package/bb/src/tools/release.clj +94 -0
- package/bb/src/tools/test.clj +148 -0
- package/bb/src/tools/version.clj +47 -0
- package/bb.edn +269 -0
- package/benchmark/src/benchmark/cli.clj +195 -0
- package/benchmark/src/benchmark/compare.clj +157 -0
- package/benchmark/src/benchmark/config.clj +316 -0
- package/benchmark/src/benchmark/measure.clj +187 -0
- package/benchmark/src/benchmark/store.clj +190 -0
- package/benchmark/test/benchmark/measure_test.clj +156 -0
- package/build.clj +30 -0
- package/config.edn +49 -0
- package/deps.edn +138 -0
- package/dev/sandbox.clj +82 -0
- package/dev/sandbox.cljs +127 -0
- package/dev/sandbox_benchmarks.clj +27 -0
- package/dev/sandbox_client.clj +87 -0
- package/dev/sandbox_transact_bench.clj +109 -0
- package/dev/user.clj +79 -0
- package/doc/README.md +96 -0
- package/doc/adl/README.md +6 -0
- package/doc/adl/adr-000-adr.org +28 -0
- package/doc/adl/adr-001-attribute-references.org +15 -0
- package/doc/adl/adr-002-build-tooling.org +54 -0
- package/doc/adl/adr-003-db-meta-data.md +52 -0
- package/doc/adl/adr-004-github-flow.md +40 -0
- package/doc/adl/adr-XYZ-template.md +30 -0
- package/doc/adl/index.org +3 -0
- package/doc/assets/datahike-logo.svg +3 -0
- package/doc/assets/datahiking-invoice.org +85 -0
- package/doc/assets/hhtree2.png +0 -0
- package/doc/assets/network_topology.svg +624 -0
- package/doc/assets/perf.png +0 -0
- package/doc/assets/schema_mindmap.mm +132 -0
- package/doc/assets/schema_mindmap.svg +970 -0
- package/doc/assets/temporal_index.mm +74 -0
- package/doc/backend-development.md +78 -0
- package/doc/bb-pod.md +89 -0
- package/doc/benchmarking.md +360 -0
- package/doc/bindings/edn-conversion.md +383 -0
- package/doc/cli.md +162 -0
- package/doc/cljdoc.edn +27 -0
- package/doc/cljs-support.md +133 -0
- package/doc/config.md +406 -0
- package/doc/contributing.md +114 -0
- package/doc/datalog-vs-sql.md +210 -0
- package/doc/datomic_differences.md +109 -0
- package/doc/development/pull-api-ns.md +186 -0
- package/doc/development/pull-frame-state-diagram.jpg +0 -0
- package/doc/distributed.md +566 -0
- package/doc/entity_spec.md +92 -0
- package/doc/gc.md +273 -0
- package/doc/java-api.md +808 -0
- package/doc/javascript-api.md +421 -0
- package/doc/libdatahike.md +86 -0
- package/doc/logging_and_error_handling.md +43 -0
- package/doc/norms.md +66 -0
- package/doc/schema-migration.md +85 -0
- package/doc/schema.md +287 -0
- package/doc/storage-backends.md +363 -0
- package/doc/store-id-refactoring.md +596 -0
- package/doc/time_variance.md +325 -0
- package/doc/unstructured.md +167 -0
- package/doc/versioning.md +261 -0
- package/examples/basic/README.md +19 -0
- package/examples/basic/deps.edn +6 -0
- package/examples/basic/docker-compose.yml +13 -0
- package/examples/basic/src/examples/core.clj +60 -0
- package/examples/basic/src/examples/schema.clj +155 -0
- package/examples/basic/src/examples/store.clj +60 -0
- package/examples/basic/src/examples/time_travel.clj +185 -0
- package/examples/java/.settings/org.eclipse.core.resources.prefs +3 -0
- package/examples/java/.settings/org.eclipse.jdt.apt.core.prefs +2 -0
- package/examples/java/.settings/org.eclipse.jdt.core.prefs +9 -0
- package/examples/java/.settings/org.eclipse.m2e.core.prefs +4 -0
- package/examples/java/README.md +162 -0
- package/examples/java/pom.xml +62 -0
- package/examples/java/src/main/java/examples/QuickStart.java +115 -0
- package/examples/java/src/main/java/examples/SchemaExample.java +148 -0
- package/examples/java/src/main/java/examples/TimeTravelExample.java +121 -0
- package/flake.lock +27 -0
- package/flake.nix +27 -0
- package/http-server/datahike/http/middleware.clj +75 -0
- package/http-server/datahike/http/server.clj +269 -0
- package/java/src/datahike/java/Database.java +274 -0
- package/java/src/datahike/java/Datahike.java +281 -0
- package/java/src/datahike/java/DatahikeGeneratedTest.java +349 -0
- package/java/src/datahike/java/DatahikeTest.java +370 -0
- package/java/src/datahike/java/EDN.java +170 -0
- package/java/src/datahike/java/IEntity.java +11 -0
- package/java/src/datahike/java/Keywords.java +161 -0
- package/java/src/datahike/java/SchemaFlexibility.java +52 -0
- package/java/src/datahike/java/Util.java +219 -0
- package/karma.conf.js +19 -0
- package/libdatahike/compile-cpp +7 -0
- package/libdatahike/src/datahike/impl/LibDatahikeBase.java +203 -0
- package/libdatahike/src/datahike/impl/libdatahike.clj +59 -0
- package/libdatahike/src/test_cpp.cpp +61 -0
- package/npm-package/PUBLISHING.md +140 -0
- package/npm-package/README.md +226 -0
- package/npm-package/package.template.json +34 -0
- package/npm-package/test-isomorphic.ts +281 -0
- package/npm-package/test.js +557 -0
- package/npm-package/typescript-test.ts +70 -0
- package/package.json +16 -0
- package/pydatahike/README.md +569 -0
- package/pydatahike/pyproject.toml +91 -0
- package/pydatahike/setup.py +42 -0
- package/pydatahike/src/datahike/__init__.py +134 -0
- package/pydatahike/src/datahike/_native.py +250 -0
- package/pydatahike/src/datahike/_version.py +2 -0
- package/pydatahike/src/datahike/database.py +722 -0
- package/pydatahike/src/datahike/edn.py +311 -0
- package/pydatahike/src/datahike/py.typed +0 -0
- package/pydatahike/tests/conftest.py +17 -0
- package/pydatahike/tests/test_basic.py +170 -0
- package/pydatahike/tests/test_database.py +51 -0
- package/pydatahike/tests/test_edn_conversion.py +299 -0
- package/pydatahike/tests/test_query.py +99 -0
- package/pydatahike/tests/test_schema.py +55 -0
- package/resources/clj-kondo.exports/io.replikativ/datahike/config.edn +5 -0
- package/resources/example_server.edn +4 -0
- package/shadow-cljs.edn +56 -0
- package/src/data_readers.clj +7 -0
- package/src/datahike/api/impl.cljc +176 -0
- package/src/datahike/api/specification.cljc +633 -0
- package/src/datahike/api/types.cljc +261 -0
- package/src/datahike/api.cljc +41 -0
- package/src/datahike/array.cljc +99 -0
- package/src/datahike/cli.clj +166 -0
- package/src/datahike/cljs.cljs +6 -0
- package/src/datahike/codegen/cli.clj +406 -0
- package/src/datahike/codegen/clj_kondo.clj +291 -0
- package/src/datahike/codegen/java.clj +403 -0
- package/src/datahike/codegen/naming.cljc +33 -0
- package/src/datahike/codegen/native.clj +559 -0
- package/src/datahike/codegen/pod.clj +488 -0
- package/src/datahike/codegen/python.clj +838 -0
- package/src/datahike/codegen/report.clj +55 -0
- package/src/datahike/codegen/typescript.clj +262 -0
- package/src/datahike/codegen/validation.clj +145 -0
- package/src/datahike/config.cljc +294 -0
- package/src/datahike/connections.cljc +16 -0
- package/src/datahike/connector.cljc +265 -0
- package/src/datahike/constants.cljc +142 -0
- package/src/datahike/core.cljc +297 -0
- package/src/datahike/datom.cljc +459 -0
- package/src/datahike/db/interface.cljc +119 -0
- package/src/datahike/db/search.cljc +305 -0
- package/src/datahike/db/transaction.cljc +937 -0
- package/src/datahike/db/utils.cljc +338 -0
- package/src/datahike/db.cljc +956 -0
- package/src/datahike/experimental/unstructured.cljc +126 -0
- package/src/datahike/experimental/versioning.cljc +172 -0
- package/src/datahike/externs.js +31 -0
- package/src/datahike/gc.cljc +69 -0
- package/src/datahike/http/client.clj +188 -0
- package/src/datahike/http/writer.clj +79 -0
- package/src/datahike/impl/entity.cljc +218 -0
- package/src/datahike/index/interface.cljc +93 -0
- package/src/datahike/index/persistent_set.cljc +469 -0
- package/src/datahike/index/utils.cljc +44 -0
- package/src/datahike/index.cljc +32 -0
- package/src/datahike/js/api.cljs +172 -0
- package/src/datahike/js/api_macros.clj +22 -0
- package/src/datahike/js.cljs +163 -0
- package/src/datahike/json.cljc +209 -0
- package/src/datahike/lru.cljc +146 -0
- package/src/datahike/migrate.clj +39 -0
- package/src/datahike/norm/norm.clj +245 -0
- package/src/datahike/online_gc.cljc +252 -0
- package/src/datahike/pod.clj +155 -0
- package/src/datahike/pull_api.cljc +325 -0
- package/src/datahike/query.cljc +1945 -0
- package/src/datahike/query_stats.cljc +88 -0
- package/src/datahike/readers.cljc +62 -0
- package/src/datahike/remote.cljc +218 -0
- package/src/datahike/schema.cljc +228 -0
- package/src/datahike/schema_cache.cljc +42 -0
- package/src/datahike/spec.cljc +101 -0
- package/src/datahike/store.cljc +80 -0
- package/src/datahike/tools.cljc +308 -0
- package/src/datahike/transit.cljc +80 -0
- package/src/datahike/writer.cljc +239 -0
- package/src/datahike/writing.cljc +362 -0
- package/src/deps.cljs +1 -0
- package/src-hitchhiker-tree/datahike/index/hitchhiker_tree/insert.cljc +76 -0
- package/src-hitchhiker-tree/datahike/index/hitchhiker_tree/upsert.cljc +128 -0
- package/src-hitchhiker-tree/datahike/index/hitchhiker_tree.cljc +213 -0
- package/test/datahike/backward_compatibility_test/src/backward_test.clj +37 -0
- package/test/datahike/integration_test/config_record_file_test.clj +14 -0
- package/test/datahike/integration_test/config_record_test.clj +14 -0
- package/test/datahike/integration_test/depr_config_uri_test.clj +15 -0
- package/test/datahike/integration_test/return_map_test.clj +62 -0
- package/test/datahike/integration_test.cljc +67 -0
- package/test/datahike/norm/norm_test.clj +124 -0
- package/test/datahike/norm/resources/naming-and-sorting-test/001-a1-example.edn +5 -0
- package/test/datahike/norm/resources/naming-and-sorting-test/002-a2-example.edn +5 -0
- package/test/datahike/norm/resources/naming-and-sorting-test/003-tx-fn-test.edn +1 -0
- package/test/datahike/norm/resources/naming-and-sorting-test/004-tx-data-and-tx-fn-test.edn +5 -0
- package/test/datahike/norm/resources/naming-and-sorting-test/01-transact-basic-characters.edn +2 -0
- package/test/datahike/norm/resources/naming-and-sorting-test/02 add occupation.edn +5 -0
- package/test/datahike/norm/resources/naming-and-sorting-test/checksums.edn +12 -0
- package/test/datahike/norm/resources/simple-test/001-a1-example.edn +5 -0
- package/test/datahike/norm/resources/simple-test/002-a2-example.edn +5 -0
- package/test/datahike/norm/resources/simple-test/checksums.edn +4 -0
- package/test/datahike/norm/resources/tx-data-and-tx-fn-test/first/001-a1-example.edn +5 -0
- package/test/datahike/norm/resources/tx-data-and-tx-fn-test/first/002-a2-example.edn +5 -0
- package/test/datahike/norm/resources/tx-data-and-tx-fn-test/first/003-tx-fn-test.edn +1 -0
- package/test/datahike/norm/resources/tx-data-and-tx-fn-test/first/checksums.edn +6 -0
- package/test/datahike/norm/resources/tx-data-and-tx-fn-test/second/004-tx-data-and-tx-fn-test.edn +5 -0
- package/test/datahike/norm/resources/tx-data-and-tx-fn-test/second/checksums.edn +2 -0
- package/test/datahike/norm/resources/tx-fn-test/first/001-a1-example.edn +5 -0
- package/test/datahike/norm/resources/tx-fn-test/first/002-a2-example.edn +5 -0
- package/test/datahike/norm/resources/tx-fn-test/first/checksums.edn +4 -0
- package/test/datahike/norm/resources/tx-fn-test/second/003-tx-fn-test.edn +1 -0
- package/test/datahike/norm/resources/tx-fn-test/second/checksums.edn +2 -0
- package/test/datahike/test/api_test.cljc +895 -0
- package/test/datahike/test/array_test.cljc +40 -0
- package/test/datahike/test/attribute_refs/datoms_test.cljc +140 -0
- package/test/datahike/test/attribute_refs/db_test.cljc +42 -0
- package/test/datahike/test/attribute_refs/differences_test.cljc +515 -0
- package/test/datahike/test/attribute_refs/entity_test.cljc +89 -0
- package/test/datahike/test/attribute_refs/pull_api_test.cljc +320 -0
- package/test/datahike/test/attribute_refs/query_find_specs_test.cljc +59 -0
- package/test/datahike/test/attribute_refs/query_fns_test.cljc +130 -0
- package/test/datahike/test/attribute_refs/query_interop_test.cljc +47 -0
- package/test/datahike/test/attribute_refs/query_not_test.cljc +193 -0
- package/test/datahike/test/attribute_refs/query_or_test.cljc +137 -0
- package/test/datahike/test/attribute_refs/query_pull_test.cljc +156 -0
- package/test/datahike/test/attribute_refs/query_rules_test.cljc +176 -0
- package/test/datahike/test/attribute_refs/query_test.cljc +241 -0
- package/test/datahike/test/attribute_refs/temporal_search.cljc +22 -0
- package/test/datahike/test/attribute_refs/transact_test.cljc +220 -0
- package/test/datahike/test/attribute_refs/utils.cljc +128 -0
- package/test/datahike/test/cache_test.cljc +38 -0
- package/test/datahike/test/components_test.cljc +92 -0
- package/test/datahike/test/config_test.cljc +158 -0
- package/test/datahike/test/core_test.cljc +105 -0
- package/test/datahike/test/datom_test.cljc +44 -0
- package/test/datahike/test/db_test.cljc +54 -0
- package/test/datahike/test/entity_spec_test.cljc +159 -0
- package/test/datahike/test/entity_test.cljc +103 -0
- package/test/datahike/test/explode_test.cljc +143 -0
- package/test/datahike/test/filter_test.cljc +75 -0
- package/test/datahike/test/gc_test.cljc +159 -0
- package/test/datahike/test/http/server_test.clj +192 -0
- package/test/datahike/test/http/writer_test.clj +86 -0
- package/test/datahike/test/ident_test.cljc +32 -0
- package/test/datahike/test/index_test.cljc +345 -0
- package/test/datahike/test/insert.cljc +125 -0
- package/test/datahike/test/java_bindings_test.clj +6 -0
- package/test/datahike/test/listen_test.cljc +41 -0
- package/test/datahike/test/lookup_refs_test.cljc +266 -0
- package/test/datahike/test/lru_test.cljc +27 -0
- package/test/datahike/test/migrate_test.clj +297 -0
- package/test/datahike/test/model/core.cljc +376 -0
- package/test/datahike/test/model/invariant.cljc +142 -0
- package/test/datahike/test/model/rng.cljc +82 -0
- package/test/datahike/test/model_test.clj +217 -0
- package/test/datahike/test/nodejs_test.cljs +262 -0
- package/test/datahike/test/online_gc_test.cljc +475 -0
- package/test/datahike/test/pod_test.clj +369 -0
- package/test/datahike/test/pull_api_test.cljc +474 -0
- package/test/datahike/test/purge_test.cljc +144 -0
- package/test/datahike/test/query_aggregates_test.cljc +101 -0
- package/test/datahike/test/query_find_specs_test.cljc +52 -0
- package/test/datahike/test/query_fns_test.cljc +523 -0
- package/test/datahike/test/query_interop_test.cljc +47 -0
- package/test/datahike/test/query_not_test.cljc +189 -0
- package/test/datahike/test/query_or_test.cljc +158 -0
- package/test/datahike/test/query_pull_test.cljc +147 -0
- package/test/datahike/test/query_rules_test.cljc +248 -0
- package/test/datahike/test/query_stats_test.cljc +218 -0
- package/test/datahike/test/query_test.cljc +984 -0
- package/test/datahike/test/schema_test.cljc +424 -0
- package/test/datahike/test/specification_test.cljc +30 -0
- package/test/datahike/test/store_test.cljc +78 -0
- package/test/datahike/test/stress_test.cljc +57 -0
- package/test/datahike/test/time_variance_test.cljc +518 -0
- package/test/datahike/test/tools_test.clj +134 -0
- package/test/datahike/test/transact_test.cljc +518 -0
- package/test/datahike/test/tuples_test.cljc +564 -0
- package/test/datahike/test/unstructured_test.cljc +291 -0
- package/test/datahike/test/upsert_impl_test.cljc +205 -0
- package/test/datahike/test/upsert_test.cljc +363 -0
- package/test/datahike/test/utils.cljc +110 -0
- package/test/datahike/test/validation_test.cljc +48 -0
- package/test/datahike/test/versioning_test.cljc +56 -0
- package/test/datahike/test.cljc +66 -0
- package/tests.edn +24 -0
package/doc/gc.md
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
# Garbage Collection
|
|
2
|
+
|
|
3
|
+
Datahike uses persistent data structures that enable structural sharing—each update creates a new version efficiently by reusing unchanged parts. This allows [time-travel queries](./time_variance.md) and [git-like versioning](./versioning.md), but storage grows over time as old snapshots accumulate.
|
|
4
|
+
|
|
5
|
+
**Garbage collection removes old database snapshots from storage while preserving current branch heads.**
|
|
6
|
+
|
|
7
|
+
## GC vs Purging
|
|
8
|
+
|
|
9
|
+
Don't confuse garbage collection with data purging:
|
|
10
|
+
|
|
11
|
+
- **Garbage Collection** (this document): Removes old database *snapshots* to reclaim storage. Used for routine storage maintenance.
|
|
12
|
+
- **[Data Purging](./time_variance.md#data-purging)**: Permanently deletes specific *data* for privacy compliance (GDPR, HIPAA, CCPA). Used only when legally required.
|
|
13
|
+
|
|
14
|
+
## How Garbage Collection Works
|
|
15
|
+
|
|
16
|
+
GC whitelists all current branches and marks snapshots as reachable based on a grace period. Snapshots older than the grace period are deleted from storage, but **branch heads are always retained** regardless of age.
|
|
17
|
+
|
|
18
|
+
## Basic Usage
|
|
19
|
+
|
|
20
|
+
```clojure
|
|
21
|
+
(require '[datahike.api :as d]
|
|
22
|
+
'[superv.async :refer [<?? S]])
|
|
23
|
+
|
|
24
|
+
;; Remove only deleted branches, keep all snapshots
|
|
25
|
+
(<?? S (d/gc-storage conn))
|
|
26
|
+
;; => #{...} ; set of deleted storage blobs
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Running without a date removes **only deleted branches**—all snapshots on active branches are preserved. This is safe to run anytime and reclaims storage from old experimental branches.
|
|
30
|
+
|
|
31
|
+
**Note:** Returns a `core.async` channel. Use `<??` to block, or run without it for background execution. GC requires no coordination and won't slow down transactions or reads.
|
|
32
|
+
|
|
33
|
+
## Grace Periods for Distributed Readers
|
|
34
|
+
|
|
35
|
+
Datahike's [Distributed Index Space](./distributed.md) allows readers to access storage directly without coordination. This is powerful for scalability but means **long-running processes might read from old snapshots for hours**.
|
|
36
|
+
|
|
37
|
+
Examples of long-running readers:
|
|
38
|
+
- **Reporting jobs**: Generate daily/weekly reports by querying yesterday's snapshot
|
|
39
|
+
- **Analytics pipelines**: Process historical data over several hours
|
|
40
|
+
- **Monitoring dashboards**: Display metrics from recent snapshots
|
|
41
|
+
- **Backup processes**: Copy database state while it's being updated
|
|
42
|
+
|
|
43
|
+
**The grace period ensures these readers don't encounter missing data.** Snapshots created after the grace period date are kept; older ones are deleted.
|
|
44
|
+
|
|
45
|
+
```clojure
|
|
46
|
+
(require '[datahike.api :as d] '[superv.async :refer [<?? S]])
|
|
47
|
+
|
|
48
|
+
;; Keep last 7 days of snapshots
|
|
49
|
+
(let [seven-days-ago (java.util.Date. (- (System/currentTimeMillis)
|
|
50
|
+
(* 7 24 60 60 1000)))]
|
|
51
|
+
(<?? S (d/gc-storage conn seven-days-ago)))
|
|
52
|
+
|
|
53
|
+
;; Keep last 30 days (common for compliance)
|
|
54
|
+
(let [thirty-days-ago (java.util.Date. (- (System/currentTimeMillis)
|
|
55
|
+
(* 30 24 60 60 1000)))]
|
|
56
|
+
(<?? S (d/gc-storage conn thirty-days-ago)))
|
|
57
|
+
|
|
58
|
+
;; Keep last 24 hours (for fast-moving data)
|
|
59
|
+
(let [yesterday (java.util.Date. (- (System/currentTimeMillis)
|
|
60
|
+
(* 24 60 60 1000)))]
|
|
61
|
+
(<?? S (d/gc-storage conn yesterday)))
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**Choosing a grace period:**
|
|
65
|
+
- Consider your longest-running reader process
|
|
66
|
+
- Add buffer time for safety (if longest job is 2 hours, use 4-6 hours)
|
|
67
|
+
- Balance storage costs against reader safety
|
|
68
|
+
- Monitor reader patterns before shortening grace periods
|
|
69
|
+
|
|
70
|
+
**Branch heads are always kept** regardless of the grace period—only intermediate snapshots are removed.
|
|
71
|
+
|
|
72
|
+
## Online Garbage Collection (Incremental GC)
|
|
73
|
+
|
|
74
|
+
> ⚠️ **EXPERIMENTAL FEATURE**
|
|
75
|
+
|
|
76
|
+
Online GC automatically deletes freed index nodes during transaction commits, preventing garbage accumulation during bulk imports and high-write workloads.
|
|
77
|
+
|
|
78
|
+
> Online GC is currently an experimental feature. While it has been tested extensively in Clojure/JVM and includes safety mechanisms for multi-branch databases, use with caution in production. We recommend:
|
|
79
|
+
> - Thorough testing in your specific use case before production deployment
|
|
80
|
+
> - Monitoring freed address counts to verify expected behavior
|
|
81
|
+
> - Using it primarily for bulk imports and high-write workloads where it's most beneficial
|
|
82
|
+
> - **ClojureScript**: Online GC is available in CLJS but has not yet been tested with large bulk loads. JVM testing is more comprehensive.
|
|
83
|
+
> - Reporting any issues at https://github.com/replikativ/datahike/issues
|
|
84
|
+
|
|
85
|
+
### How Online GC Works
|
|
86
|
+
|
|
87
|
+
> Online GC is **ONLY safe for single-branch databases**.
|
|
88
|
+
> For multi-branch databases, online GC is automatically disabled because freed nodes
|
|
89
|
+
> from one branch may still be referenced by other branches through structural sharing.
|
|
90
|
+
> Use offline GC (`d/gc-storage`) for multi-branch cleanup instead.
|
|
91
|
+
|
|
92
|
+
When PSS (Persistent Sorted Set) index trees are modified during transactions, old index nodes become unreachable. Online GC tracks these freed addresses with timestamps and deletes them incrementally:
|
|
93
|
+
|
|
94
|
+
1. **During transaction** (transient mode): PSS calls `markFreed()` for each replaced index node
|
|
95
|
+
2. **At commit time**: Freed addresses older than the grace period are batch-deleted
|
|
96
|
+
3. **Multi-branch safety check**: If multiple branches detected, GC is skipped entirely
|
|
97
|
+
4. **No full tree walk**: Only freed addresses are deleted, so no expensive tree traversal is required
|
|
98
|
+
|
|
99
|
+
**Key benefits:**
|
|
100
|
+
- **Prevents unbounded storage growth** during bulk imports (single-branch only)
|
|
101
|
+
- **Incremental deletion**: Small batches per commit, low overhead
|
|
102
|
+
- **Grace period support**: Safe for concurrent readers accessing old snapshots
|
|
103
|
+
- **Multi-branch safety**: Online GC is automatically disabled on multi-branch databases to prevent corruption
|
|
104
|
+
- **Configurable**: Can be disabled, tuned, or run in background
|
|
105
|
+
|
|
106
|
+
### Configuration
|
|
107
|
+
|
|
108
|
+
Enable online GC in your database config:
|
|
109
|
+
|
|
110
|
+
```clojure
|
|
111
|
+
;; For bulk imports (no concurrent readers, single-branch)
|
|
112
|
+
;; See "Address Recycling" section below for details
|
|
113
|
+
{:online-gc {:enabled? true
|
|
114
|
+
:grace-period-ms 0 ;; Recycle immediately
|
|
115
|
+
:max-batch 10000} ;; Large batches for efficiency
|
|
116
|
+
:crypto-hash? false} ;; Required for address recycling
|
|
117
|
+
|
|
118
|
+
;; For production (concurrent readers)
|
|
119
|
+
{:online-gc {:enabled? true
|
|
120
|
+
:grace-period-ms 300000 ;; 5 minutes
|
|
121
|
+
:max-batch 1000}} ;; Smaller batches
|
|
122
|
+
|
|
123
|
+
;; Disabled (default)
|
|
124
|
+
{:online-gc {:enabled? false}}
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Configuration options:**
|
|
128
|
+
|
|
129
|
+
- `:enabled?` - Enable/disable online GC (default: `false`)
|
|
130
|
+
- `:grace-period-ms` - Minimum age in milliseconds before deletion (default: `60000` = 1 minute)
|
|
131
|
+
- `:max-batch` - Maximum addresses to delete per commit (default: `1000`)
|
|
132
|
+
- `:sync?` - Whether deletion runs synchronously (always `false` inside commits, so deletion runs asynchronously)
|
|
133
|
+
|
|
134
|
+
### Background GC Mode
|
|
135
|
+
|
|
136
|
+
For production systems, run GC in a background thread instead of blocking commits:
|
|
137
|
+
|
|
138
|
+
```clojure
|
|
139
|
+
(require '[datahike.online-gc :as online-gc])
|
|
140
|
+
|
|
141
|
+
;; Start background GC
|
|
142
|
+
(def stop-ch (online-gc/start-background-gc!
|
|
143
|
+
(:store @conn)
|
|
144
|
+
{:grace-period-ms 60000 ;; 1 minute
|
|
145
|
+
:interval-ms 10000 ;; Run every 10 seconds
|
|
146
|
+
:max-batch 1000}))
|
|
147
|
+
|
|
148
|
+
;; Later, stop background GC
|
|
149
|
+
(clojure.core.async/close! stop-ch)
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
**Background mode advantages:**
|
|
153
|
+
- Non-blocking: Doesn't slow down commits
|
|
154
|
+
- Periodic cleanup: Runs every N milliseconds
|
|
155
|
+
- Graceful shutdown: Close channel to stop
|
|
156
|
+
|
|
157
|
+
### Address Recycling (Bulk Import Optimization)
|
|
158
|
+
|
|
159
|
+
> ⚠️ **EXPERIMENTAL FEATURE**
|
|
160
|
+
>
|
|
161
|
+
> Address recycling is an experimental optimization. It has been designed with safety checks (multi-branch detection, grace periods), but should be thoroughly tested in your environment before production use.
|
|
162
|
+
|
|
163
|
+
Online GC includes **address recycling**—freed addresses are reused for new index nodes instead of being deleted from storage. This optimization is particularly powerful for bulk imports.
|
|
164
|
+
|
|
165
|
+
**How it works:**
|
|
166
|
+
1. When index trees are modified, old root addresses are marked as freed
|
|
167
|
+
2. Online GC moves eligible addresses to a freelist (grace period applies)
|
|
168
|
+
3. New index nodes reuse addresses from the freelist instead of generating new UUIDs
|
|
169
|
+
4. LMDB overwrites the recycled address with new data
|
|
170
|
+
|
|
171
|
+
**Benefits:**
|
|
172
|
+
- **Zero delete operations**: Converts O(freed_nodes) deletes to O(1) freelist append
|
|
173
|
+
- **Reduces LMDB fragmentation**: Addresses are reused rather than accumulating
|
|
174
|
+
- **Perfect for bulk imports**: With `:grace-period-ms 0`, recycling happens immediately
|
|
175
|
+
- **Minimal overhead**: No tree traversal or complex reachability analysis
|
|
176
|
+
|
|
177
|
+
**Safety limitations:**
|
|
178
|
+
|
|
179
|
+
**Address recycling is ONLY safe for:**
|
|
180
|
+
- **Single-branch databases** (shared nodes across branches would be corrupted)
|
|
181
|
+
- **No long-lived readers** (or grace period exceeds reader lifetime)
|
|
182
|
+
- **Bulk import scenarios** (write-only, no concurrent queries)
|
|
183
|
+
|
|
184
|
+
**Online GC is automatically disabled when:**
|
|
185
|
+
- Multiple branches exist (online GC completely skipped - use offline GC instead)
|
|
186
|
+
Reason: Freed nodes from one branch may still be referenced by other branches
|
|
187
|
+
through structural sharing
|
|
188
|
+
- Using `:crypto-hash? true` with recycling (only address recycling is disabled; online GC falls back to deletion mode)
|
|
189
|
+
|
|
190
|
+
### Bulk Import Configuration
|
|
191
|
+
|
|
192
|
+
For maximum performance during bulk imports where no concurrent readers exist:
|
|
193
|
+
|
|
194
|
+
```clojure
|
|
195
|
+
;; Optimal bulk import configuration
|
|
196
|
+
{:online-gc {:enabled? true
|
|
197
|
+
:grace-period-ms 0 ;; Recycle immediately (no readers)
|
|
198
|
+
:max-batch 10000} ;; Large batch (only for delete fallback)
|
|
199
|
+
:crypto-hash? false ;; Required for recycling
|
|
200
|
+
:branch :db} ;; Single branch only
|
|
201
|
+
|
|
202
|
+
;; Example bulk import
|
|
203
|
+
(let [cfg {:store {:backend :file :path "/data/bulk-import"}
|
|
204
|
+
:online-gc {:enabled? true :grace-period-ms 0}
|
|
205
|
+
:crypto-hash? false}
|
|
206
|
+
conn (d/connect cfg)]
|
|
207
|
+
;; Import millions of entities
|
|
208
|
+
(doseq [batch entity-batches]
|
|
209
|
+
(d/transact conn batch))
|
|
210
|
+
;; Storage stays bounded - addresses are recycled
|
|
211
|
+
(d/release conn))
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
**Bulk import best practices:**
|
|
215
|
+
1. Set `:grace-period-ms 0` (no concurrent readers to protect)
|
|
216
|
+
2. Use `:crypto-hash? false` (enables address recycling)
|
|
217
|
+
3. Stay on single branch (`:branch :db`)
|
|
218
|
+
4. Increase `:max-batch` for efficiency (only affects delete fallback)
|
|
219
|
+
5. Monitor freed address counts to verify recycling is working
|
|
220
|
+
|
|
221
|
+
**Verifying address recycling:**
|
|
222
|
+
- Check logs for `"Online GC: recycling N addresses to freelist"`
|
|
223
|
+
- If you see `"Online GC: skipped (multi-branch detected)"`, ensure single branch
|
|
224
|
+
(multi-branch databases require offline GC instead)
|
|
225
|
+
- Freed address counts should drop to zero after each transaction
|
|
226
|
+
|
|
227
|
+
### Online GC vs Offline GC
|
|
228
|
+
|
|
229
|
+
**Online GC** (incremental):
|
|
230
|
+
- Runs during commits
|
|
231
|
+
- Deletes only **freed index nodes** from recent transactions
|
|
232
|
+
- Fast: No tree traversal required
|
|
233
|
+
- **With recycling**: No delete operations at all, just freelist management
|
|
234
|
+
- **ONLY for single-branch databases** - automatically disabled for multi-branch
|
|
235
|
+
- Best for: Bulk imports, high-write workloads
|
|
236
|
+
|
|
237
|
+
**Offline GC** (`d/gc-storage`):
|
|
238
|
+
- Runs manually
|
|
239
|
+
- Deletes **entire old snapshots** by walking all branches
|
|
240
|
+
- Slower: Full tree traversal and marking
|
|
241
|
+
- Handles **multi-branch databases** safely through reachability analysis
|
|
242
|
+
- **Required for multi-branch databases** (online GC is automatically disabled for them)
|
|
243
|
+
- Best for: Periodic maintenance, deleting old branches, multi-branch cleanup
|
|
244
|
+
|
|
245
|
+
**Use both:** Online GC for incremental cleanup during single-branch writes, offline GC for periodic deep cleaning and all multi-branch scenarios.
|
|
246
|
+
|
|
247
|
+
## Automatic Garbage Collection
|
|
248
|
+
|
|
249
|
+
With online GC enabled, garbage collection becomes largely automatic during normal operation. Manual `d/gc-storage` runs are only needed for:
|
|
250
|
+
- Deleting old branches
|
|
251
|
+
- Periodic deep cleaning (monthly/quarterly)
|
|
252
|
+
- Compliance-driven snapshot removal
|
|
253
|
+
|
|
254
|
+
## When to Run GC
|
|
255
|
+
|
|
256
|
+
- **After deleting branches**: Immediately reclaim storage
|
|
257
|
+
- **Periodic maintenance**: Weekly/monthly based on storage growth
|
|
258
|
+
- **Storage alerts**: When approaching capacity limits
|
|
259
|
+
- **Version cleanup**: After completing long-running migrations
|
|
260
|
+
|
|
261
|
+
## What Gets Deleted
|
|
262
|
+
|
|
263
|
+
GC removes:
|
|
264
|
+
- Database snapshots older than the grace period (except current branch heads)
|
|
265
|
+
- Deleted branches and their snapshots
|
|
266
|
+
- Unreachable index nodes from old snapshots
|
|
267
|
+
|
|
268
|
+
GC preserves:
|
|
269
|
+
- All current branch heads (always)
|
|
270
|
+
- Snapshots created after the grace period date
|
|
271
|
+
- All data on retained snapshots (GC doesn't delete data, only snapshots)
|
|
272
|
+
|
|
273
|
+
**Remember:** For deleting specific data (GDPR compliance), use [data purging](./time_variance.md#data-purging), not garbage collection.
|