data-refinery-cli 0.4.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.github/workflows/tests.yml +12 -0
  2. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/AGENTS.colleague.md +13 -8
  3. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/CHANGELOG.md +35 -0
  4. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/CLAUDE.md +37 -25
  5. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/PKG-INFO +34 -4
  6. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/README.md +30 -3
  7. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/cli/__init__.py +4 -0
  8. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/cli/_commands/learn.py +15 -3
  9. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/cli/_commands/overview.py +2 -0
  10. data_refinery_cli-0.5.1/data_refinery/cli/_commands/quality.py +185 -0
  11. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/cli/_commands/stack.py +50 -32
  12. data_refinery_cli-0.5.1/data_refinery/cli/_commands/store.py +221 -0
  13. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/explain/catalog.py +107 -4
  14. data_refinery_cli-0.5.1/data_refinery/quality/__init__.py +27 -0
  15. data_refinery_cli-0.5.1/data_refinery/quality/checks.py +208 -0
  16. data_refinery_cli-0.5.1/data_refinery/store/__init__.py +72 -0
  17. data_refinery_cli-0.5.1/data_refinery/store/backend.py +62 -0
  18. data_refinery_cli-0.5.1/data_refinery/store/backends/__init__.py +7 -0
  19. data_refinery_cli-0.5.1/data_refinery/store/backends/files.py +123 -0
  20. data_refinery_cli-0.5.1/data_refinery/store/backends/mongo.py +126 -0
  21. data_refinery_cli-0.5.1/data_refinery/store/backends/neo4j.py +199 -0
  22. data_refinery_cli-0.5.1/data_refinery/store/envelope.py +114 -0
  23. data_refinery_cli-0.5.1/docs/contract.md +139 -0
  24. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/pyproject.toml +8 -1
  25. data_refinery_cli-0.5.1/tests/conftest.py +179 -0
  26. data_refinery_cli-0.5.1/tests/test_no_optional_top_import.py +46 -0
  27. data_refinery_cli-0.5.1/tests/test_quality.py +216 -0
  28. data_refinery_cli-0.5.1/tests/test_scope_no_leak.py +51 -0
  29. data_refinery_cli-0.5.1/tests/test_store_adapters.py +105 -0
  30. data_refinery_cli-0.5.1/tests/test_store_api.py +45 -0
  31. data_refinery_cli-0.5.1/tests/test_store_backends.py +86 -0
  32. data_refinery_cli-0.5.1/tests/test_store_cli.py +175 -0
  33. data_refinery_cli-0.5.1/tests/test_store_envelope.py +82 -0
  34. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/uv.lock +93 -1
  35. data_refinery_cli-0.4.0/docs/contract.md +0 -91
  36. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/agent-config/SKILL.md +0 -0
  37. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/agent-config/data/backend-fingerprints.yaml +0 -0
  38. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/agent-config/scripts/show.sh +0 -0
  39. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/ask-colleague/SKILL.md +0 -0
  40. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/ask-colleague/prompts/explore.md +0 -0
  41. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/ask-colleague/prompts/review.md +0 -0
  42. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/ask-colleague/prompts/write.md +0 -0
  43. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/ask-colleague/scripts/ask-colleague.sh +0 -0
  44. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/assign-to-workforce/SKILL.md +0 -0
  45. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/assign-to-workforce/scripts/assign-to-workforce.sh +0 -0
  46. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/cicd/SKILL.md +0 -0
  47. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/cicd/scripts/_resolve-nick.sh +0 -0
  48. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/cicd/scripts/portability-lint.sh +0 -0
  49. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/cicd/scripts/pr-reply.sh +0 -0
  50. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/cicd/scripts/pr-status.sh +0 -0
  51. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/cicd/scripts/workflow.sh +0 -0
  52. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/communicate/SKILL.md +0 -0
  53. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/communicate/scripts/fetch-issues.sh +0 -0
  54. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/communicate/scripts/mesh-message.sh +0 -0
  55. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/communicate/scripts/post-comment.sh +0 -0
  56. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/communicate/scripts/post-issue.sh +0 -0
  57. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/communicate/scripts/templates/skill-new-brief.md +0 -0
  58. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/communicate/scripts/templates/skill-update-brief.md +0 -0
  59. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/doc-test-alignment/SKILL.md +0 -0
  60. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/doc-test-alignment/scripts/check.sh +0 -0
  61. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/pypi-maintainer/SKILL.md +0 -0
  62. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/pypi-maintainer/scripts/switch-source.sh +0 -0
  63. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/run-tests/SKILL.md +0 -0
  64. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/run-tests/scripts/test.sh +0 -0
  65. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/sonarclaude/SKILL.md +0 -0
  66. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/sonarclaude/scripts/sonar.sh +0 -0
  67. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/spec-to-plan/SKILL.md +0 -0
  68. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/spec-to-plan/scripts/spec-to-plan.sh +0 -0
  69. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/think/SKILL.md +0 -0
  70. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/think/scripts/think.sh +0 -0
  71. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/version-bump/SKILL.md +0 -0
  72. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills/version-bump/scripts/bump.py +0 -0
  73. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.claude/skills.local.yaml.example +0 -0
  74. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.devague/current +0 -0
  75. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.devague/current_plan +0 -0
  76. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.devague/frames/data-refinery-cli-ships-the-storage-data-quality-i.json +0 -0
  77. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.devague/plans/data-refinery-cli-ships-the-storage-data-quality-i.json +0 -0
  78. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.flake8 +0 -0
  79. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.github/workflows/publish-stack.yml +0 -0
  80. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.github/workflows/publish.yml +0 -0
  81. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.gitignore +0 -0
  82. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/.markdownlint-cli2.yaml +0 -0
  83. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/LICENSE +0 -0
  84. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/culture.yaml +0 -0
  85. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/__init__.py +0 -0
  86. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/__main__.py +0 -0
  87. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/cli/_commands/__init__.py +0 -0
  88. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/cli/_commands/cli.py +0 -0
  89. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/cli/_commands/doctor.py +0 -0
  90. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/cli/_commands/explain.py +0 -0
  91. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/cli/_commands/whoami.py +0 -0
  92. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/cli/_errors.py +0 -0
  93. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/cli/_output.py +0 -0
  94. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/data_refinery/explain/__init__.py +0 -0
  95. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/docker-compose.yml +0 -0
  96. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/docs/plans/2026-06-20-data-refinery-cli-ships-the-storage-data-quality-i.md +0 -0
  97. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/docs/skill-sources.md +0 -0
  98. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/docs/specs/2026-06-20-data-refinery-cli-ships-the-storage-data-quality-i.md +0 -0
  99. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/docs/stack-image.md +0 -0
  100. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/sonar-project.properties +0 -0
  101. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/tests/__init__.py +0 -0
  102. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/tests/test_cli.py +0 -0
  103. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/tests/test_cli_introspection.py +0 -0
  104. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/tests/test_live_stack.py +0 -0
  105. {data_refinery_cli-0.4.0 → data_refinery_cli-0.5.1}/tests/test_stack.py +0 -0
@@ -31,11 +31,23 @@ jobs:
31
31
 
32
32
  - run: uv run pytest -n auto --cov=data_refinery --cov-report=xml:coverage.xml --cov-report=term -v
33
33
 
34
+ # Stamp each analysis with the package version so SonarCloud's "Previous
35
+ # version" New Code period has a real boundary to diff against (the version
36
+ # is bumped every PR; see version-check below + the version-bump skill).
37
+ - name: Resolve project version
38
+ id: ver
39
+ run: |
40
+ VERSION=$(uv run python -c 'import tomllib; print(tomllib.load(open("pyproject.toml","rb"))["project"]["version"])')
41
+ echo "version=$VERSION" >> "$GITHUB_OUTPUT"
42
+
34
43
  - name: SonarCloud Scan
35
44
  if: env.SONAR_TOKEN != ''
36
45
  uses: SonarSource/sonarqube-scan-action@fd88b7d7ccbaefd23d8f36f73b59db7a3d246602 # v6
37
46
  env:
38
47
  SONAR_HOST_URL: https://sonarcloud.io
48
+ with:
49
+ args: >
50
+ -Dsonar.projectVersion=${{ steps.ver.outputs.version }}
39
51
 
40
52
  lint:
41
53
  runs-on: ubuntu-latest
@@ -15,10 +15,14 @@ behavior, update both.
15
15
 
16
16
  data-refinery-cli owns the **storage + data-quality infrastructure layer** split
17
17
  out of eidetic-cli (issue #1): the mongo + neo4j substrate, the docker stack
18
- (published to GHCR), and a **consumer-agnostic** data-quality surface (validate,
19
- dedup, integrity, freshness). It treats stored data as **opaque documents** and
20
- never interprets them as "memories" — that semantics stays in eidetic, the first
21
- consumer over a subprocess-not-import boundary.
18
+ (published to GHCR), a storage-neutral **store** (`store put/get/list` over a
19
+ files/mongo/neo4j `Backend`, also importable as `data_refinery.store`), and a
20
+ **consumer-agnostic** data-quality surface (`validate`, `dedup`, `integrity`,
21
+ `freshness`). It treats stored data as **opaque envelopes**
22
+ (`{id, hash, content, scope, metadata}`) and never interprets them as "memories"
23
+ — that semantics stays in eidetic, the first consumer over a
24
+ subprocess-not-import boundary. Waves 1 (stack) and 2 (store + quality) are
25
+ built; Wave 3 (the pinned verb contract + eidetic consumption) is open.
22
26
 
23
27
  ## Names (keep them straight)
24
28
 
@@ -35,10 +39,11 @@ consumer over a subprocess-not-import boundary.
35
39
  (`0` ok, `1` user error, `2` environment error, `3+` reserved).
36
40
  - **`--json` on every command**; results to stdout, errors/diagnostics to
37
41
  stderr, never mixed.
38
- - **Runtime deps stay empty by default.** `dependencies = []`. Heavy store
39
- drivers (`neo4j`, `pymongo`, Wave 2) go behind an optional extra and are
40
- lazy-imported inside function bodies, exiting `CliError(code=2)` with an
41
- install `hint:` when absent.
42
+ - **Runtime deps stay empty by default.** `dependencies = []`. The `files`
43
+ backend is stdlib-only; the heavy store drivers (`neo4j`, `pymongo`) live
44
+ behind the optional `[store]` extra and are lazy-imported inside function
45
+ bodies, exiting `CliError(code=2)` with an install `hint:` when absent (a
46
+ static test asserts no top-level driver import).
42
47
  - **Idempotent dedup** (by `id`/`hash`) and the **public/private scope no-leak**
43
48
  (a private-scope document is never returned by a public-scope fetch) are
44
49
  load-bearing across the consumer boundary.
@@ -5,6 +5,41 @@ All notable changes to this project will be documented in this file.
5
5
  Format follows [Keep a Changelog](https://keepachangelog.com/). This project
6
6
  adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.5.1] - 2026-06-21
9
+
10
+ ### Changed
11
+
12
+ - CI stamps each SonarCloud analysis with `sonar.projectVersion` read from `pyproject.toml`, so the "Previous version" New Code period has a real per-release boundary to diff against (`.github/workflows/tests.yml`)
13
+
14
+ ### Fixed
15
+
16
+ - Cleared 4 pre-existing SonarCloud smells in `data_refinery/cli/_commands/stack.py` (wave-1), behaviour-preserving: merged the implicitly concatenated `_DOCKER_HINT` literal (S5799), extracted `_load_ps_rows` to drop `_parse_ps` cognitive complexity (S3776), single-exit `cmd_stack_status` via a `_render_status_text` helper (S3516), and an `_add_json_flag` helper for the repeated `"Emit structured JSON."` literal (S1192)
17
+
18
+ ## [0.5.0] - 2026-06-20
19
+
20
+ ### Added
21
+
22
+ - Generic storage-neutral envelope {id,hash,content,scope{name,visibility},metadata} with no memory semantics (data_refinery/store/envelope.py)
23
+ - Importable store library data_refinery.store.put/get/list, mirrored by the data-refinery store put/get/list CLI noun over one shared implementation
24
+ - Backend Protocol with files (dependency-free default), mongo, and neo4j adapters; neo4j/pymongo live behind the optional [store] extra and are lazy-imported (dependencies = [] stays the default)
25
+ - Data-quality verbs validate, dedup (idempotent by id/hash), integrity (hash matches content), and freshness (age/staleness facts) — all --json
26
+ - Public/private scope no-leak (can_serve) enforced by every backend get/list; a private-scope document is never returned by a public-scope fetch
27
+
28
+ ### Changed
29
+
30
+ - docs/contract.md bumped to contract version 2 documenting the store + data-quality verb JSON shapes and the [store] extra
31
+ - README, CLAUDE.md, AGENTS.colleague.md, learn, overview, and the explain catalog updated for the Wave 2 surface
32
+
33
+ ### Fixed
34
+
35
+ - mongo/neo4j store put now dedups by content hash within the scope on insert, matching the files backend and the documented "dedups by hash on insert" contract (Qodo PR #5 review)
36
+ - neo4j adapter guards metadata JSON parsing: a corrupt node surfaces as a structured exit-2 error with a remediation instead of an uncaught JSONDecodeError wrapped as a generic "unexpected" (Qodo PR #5 review)
37
+ - SonarCloud new-code cleanups (PR #5), behaviour-preserving: single-exit store get/list handlers (S3516), repeated literals extracted into `_add_json_flag` helpers / a `_JSONL_GLOB` constant (S1192), and `validate_payload` cognitive complexity reduced via a `_scope_errors` helper (S3776)
38
+
39
+ ### Security
40
+
41
+ - Scope visibility is validated at ingestion (store put / from_dict reject any value other than public|private with exit 1) and the can_serve no-leak check fails closed — an unrecognised visibility is treated as private, never served across scopes (Qodo PR #5 review)
42
+
8
43
  ## [0.4.0] - 2026-06-20
9
44
 
10
45
  ### Added
@@ -10,19 +10,25 @@ and freshness of data as it is stored and fetched. It is being split out of
10
10
  **eidetic-cli** so eidetic keeps the agent-memory layer; it is a sibling to
11
11
  **daria** (the Data Refinery Intelligent Agent).
12
12
 
13
- **Current state — read this first.** **Wave 1 of issue #1 is built**: the
14
- storage substrate (`docker-compose.yml` — mongo 27018 + neo4j 7687/apoc) and the
15
- `data-refinery stack up/down/status` verb that wraps `docker compose`, plus the
16
- GHCR publish workflow (`.github/workflows/publish-stack.yml`) and the pinnable
17
- docs (`docs/stack-image.md`, `docs/contract.md`). Runtime `dependencies = []`
18
- still holds — `stack` shells out to docker via stdlib `subprocess`, no driver
19
- deps. The **store adapters + data-quality verbs are Wave 2** (still unbuilt;
20
- tracked as a follow-up issue): the generic envelope, files/cypher/mongo
21
- adapters behind an optional `[store]` extra, and validate/dedup/integrity/
22
- freshness. The rest of the code is the inherited *agent-first introspection
23
- scaffold* (`whoami` / `learn` / `explain` / `overview` / `doctor` + a `cli`
24
- noun), cited from [teken](https://github.com/agentculture/teken)'s `python-cli`
25
- reference. The build order lives in **issue #1** (see "Domain roadmap").
13
+ **Current state — read this first.** **Waves 1 and 2 of issue #1 are built.**
14
+ *Wave 1* (issue #1): the storage substrate (`docker-compose.yml` — mongo 27018 +
15
+ neo4j 7687/apoc) and the `data-refinery stack up/down/status` verb wrapping
16
+ `docker compose`, plus the GHCR publish workflow
17
+ (`.github/workflows/publish-stack.yml`) and the pinnable docs
18
+ (`docs/stack-image.md`, `docs/contract.md`). *Wave 2* (issue #3): the
19
+ storage-neutral **store** — the generic envelope (`data_refinery/store/`), the
20
+ importable `data_refinery.store.put/get/list` library mirrored by `data-refinery
21
+ store put/get/list`, and the files/mongo/neo4j adapters behind a `Backend`
22
+ Protocol; plus the **data-quality verbs** (`validate` / `dedup` / `integrity` /
23
+ `freshness` in `data_refinery/quality/`). Runtime `dependencies = []` still holds —
24
+ the `files` backend is stdlib-only (default) and `neo4j`/`pymongo` are
25
+ lazy-imported behind the optional `[store]` extra. The remaining code is the
26
+ inherited *agent-first introspection scaffold* (`whoami` / `learn` / `explain` /
27
+ `overview` / `doctor` + a `cli` noun), cited from
28
+ [teken](https://github.com/agentculture/teken)'s `python-cli` reference. **Wave
29
+ 3** (the full pinnable verb-JSON contract + eidetic consuming the surface over
30
+ the subprocess boundary) is still open; the build order lives in **issue #1** /
31
+ **issue #3** (see "Domain roadmap").
26
32
 
27
33
  ## Names: there are three, and they differ on purpose
28
34
 
@@ -235,18 +241,24 @@ repo's call (it owns the surface) but must be documented so eidetic can pin.
235
241
 
236
242
  ## Remaining gaps / next steps
237
243
 
238
- Wave 1 of issue #1 is built (the stack: compose + `stack` verb + GHCR publish +
239
- contract docs). README, `AGENTS.colleague.md`, and `overview` were realigned to
240
- the data-quality domain during that work. What is still open:
241
-
242
- 1. **Wave 2 the store + data-quality surface** (tracked as a follow-up issue):
243
- the generic opaque envelope `{id,hash,content,scope,metadata}`, the
244
- files/cypher/mongo store adapters behind an optional `[store]` extra
245
- (lazy-imported, `dependencies = []` stays the default), and the
246
- validate/dedup/integrity/freshness verbs. Idempotent dedup + the public/private
247
- scope no-leak are the load-bearing invariants. This is the substantive work.
248
- 2. **Wave 3 the full pinnable verb contract + eidetic consumption** over the
249
- subprocess boundary (eidetic drops/thins `neo4j`+`pymongo`).
244
+ Waves 1 and 2 of the split are built. *Wave 1* = the stack (compose + `stack`
245
+ verb + GHCR publish + contract docs). *Wave 2* (issue #3) = the store +
246
+ data-quality surface: the generic opaque envelope `{id,hash,content,scope,
247
+ metadata}` (`data_refinery/store/envelope.py`), the `Backend` Protocol +
248
+ files/mongo/neo4j adapters (`data_refinery/store/backends/`; the two driver-backed
249
+ adapters sit behind the optional `[store]` extra and are lazy-imported, so `dependencies = []` stays
250
+ the default), the importable `data_refinery.store.put/get/list` mirrored by the
251
+ `store` CLI noun, and the `validate`/`dedup`/`integrity`/`freshness` verbs
252
+ (`data_refinery/quality/`). Idempotent dedup + the public/private scope no-leak
253
+ (`can_serve`, enforced by every backend's `get`/`list`) are the load-bearing
254
+ invariants. README, `AGENTS.colleague.md`, `learn`, `overview`, the explain
255
+ catalog, and `docs/contract.md` (now contract version 2) were updated for the
256
+ surface. What is still open:
257
+
258
+ 1. **Wave 3 — the full pinnable verb contract + eidetic consumption** over the
259
+ subprocess boundary (eidetic drops/thins `neo4j`+`pymongo`). The verb-JSON
260
+ shapes are documented in `docs/contract.md`; Wave 3 freezes them as the pinned
261
+ surface eidetic consumes process-to-process.
250
262
 
251
263
  ## Renaming / scaffold lineage
252
264
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-refinery-cli
3
- Version: 0.4.0
3
+ Version: 0.5.1
4
4
  Summary: Agent and CLI for data quality in storage and retrieval — validating, deduplicating, and checking the integrity and freshness of data as it is stored and fetched. Split out of eidetic-cli so eidetic keeps agent-memory; sibling to daria, the Data Refinery Intelligent Agent.
5
5
  Project-URL: Homepage, https://github.com/agentculture/data-refinery-cli
6
6
  Project-URL: Issues, https://github.com/agentculture/data-refinery-cli/issues
@@ -13,6 +13,9 @@ Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Programming Language :: Python :: 3.12
14
14
  Classifier: Topic :: Software Development
15
15
  Requires-Python: >=3.12
16
+ Provides-Extra: store
17
+ Requires-Dist: neo4j>=5; extra == 'store'
18
+ Requires-Dist: pymongo>=4; extra == 'store'
16
19
  Description-Content-Type: text/markdown
17
20
 
18
21
  # data-refinery-cli
@@ -40,12 +43,12 @@ The split ships in waves:
40
43
  | Wave | Scope | Status |
41
44
  |------|-------|--------|
42
45
  | **1** | docker stack (mongo 27018 + neo4j 7687), GHCR publish, `stack` CLI verb, pinnable contract | **shipped** |
43
- | **2** | generic storage envelope, files/cypher/mongo store adapters (optional `[store]` extra), data-quality verbs (validate, dedup, integrity, freshness) | planned |
46
+ | **2** | generic storage envelope + importable store API, files/mongo/neo4j adapters (optional `[store]` extra), data-quality verbs (validate, dedup, integrity, freshness) | **shipped** |
44
47
  | **3** | full pinnable verb contract + eidetic consumption over the process boundary | planned |
45
48
 
46
49
  The runtime package has **no third-party dependencies** by default; the heavy
47
- store drivers (`neo4j`, `pymongo`) arrive in Wave 2 behind an optional extra,
48
- lazy-imported.
50
+ store drivers (`neo4j`, `pymongo`) live behind the optional `[store]` extra and
51
+ are lazy-imported, so the `files` backend (the default) stays dependency-free.
49
52
 
50
53
  ## Quickstart
51
54
 
@@ -57,6 +60,23 @@ uv run data-refinery stack status --json
57
60
  uv run data-refinery stack down
58
61
  uv run data-refinery whoami # identity from culture.yaml
59
62
  uv run teken cli doctor . --strict # the agent-first rubric gate CI runs
63
+
64
+ # Store + quality (files backend is dependency-free; no docker needed):
65
+ echo '{"id":"a","content":"hello"}' | uv run data-refinery store put --json
66
+ uv run data-refinery store get a --json
67
+ uv run data-refinery store list --json
68
+ uv run data-refinery integrity --json # hash matches content?
69
+ uv run data-refinery dedup --json # collapse same-hash dups (idempotent)
70
+ echo '{"id":"a","content":"x"}' | uv run data-refinery validate --json
71
+ ```
72
+
73
+ The store is also importable — shell out **or** `import data_refinery.store`:
74
+
75
+ ```python
76
+ import data_refinery.store as store
77
+ store.put(store.Envelope(id="a", content="hello"))
78
+ store.get("a") # -> Envelope | None
79
+ store.list() # -> list[Envelope]
60
80
  ```
61
81
 
62
82
  ## CLI
@@ -64,6 +84,11 @@ uv run teken cli doctor . --strict # the agent-first rubric gate CI runs
64
84
  | Verb | What it does |
65
85
  |------|--------------|
66
86
  | `stack up\|down\|status` | Manage the storage substrate (mongo + neo4j) via docker compose. |
87
+ | `store put\|get\|list` | Put/get/list opaque envelopes (`--backend files\|mongo\|neo4j`). |
88
+ | `validate` | Check envelope shape for JSON piped on stdin. |
89
+ | `dedup` | Collapse same-hash-same-scope duplicates in the store (idempotent). |
90
+ | `integrity` | Check every stored hash matches `sha256(content)`. |
91
+ | `freshness` | Report age/staleness facts from a metadata timestamp field. |
67
92
  | `whoami` | Report this agent's nick, version, backend, and model from `culture.yaml`. |
68
93
  | `learn` | Print a structured self-teaching prompt. |
69
94
  | `explain <path>` | Markdown docs for any noun/verb path. |
@@ -71,6 +96,11 @@ uv run teken cli doctor . --strict # the agent-first rubric gate CI runs
71
96
  | `doctor` | Check the agent-identity invariants (prompt-file-present, backend-consistency). |
72
97
  | `cli overview` | Describe the CLI surface itself. |
73
98
 
99
+ The **envelope** is storage-neutral — `{id, hash, content, scope{name,
100
+ visibility}, metadata}` with no memory semantics — and the store enforces a
101
+ public/private **scope no-leak**: a private-scope document is never returned by a
102
+ public-scope fetch, across every backend.
103
+
74
104
  Every command supports `--json`. Results go to stdout, errors/diagnostics to
75
105
  stderr (never mixed). Exit codes: `0` success, `1` user error, `2` environment
76
106
  error (e.g. docker absent — always with a `hint:`, never a traceback), `3+`
@@ -23,12 +23,12 @@ The split ships in waves:
23
23
  | Wave | Scope | Status |
24
24
  |------|-------|--------|
25
25
  | **1** | docker stack (mongo 27018 + neo4j 7687), GHCR publish, `stack` CLI verb, pinnable contract | **shipped** |
26
- | **2** | generic storage envelope, files/cypher/mongo store adapters (optional `[store]` extra), data-quality verbs (validate, dedup, integrity, freshness) | planned |
26
+ | **2** | generic storage envelope + importable store API, files/mongo/neo4j adapters (optional `[store]` extra), data-quality verbs (validate, dedup, integrity, freshness) | **shipped** |
27
27
  | **3** | full pinnable verb contract + eidetic consumption over the process boundary | planned |
28
28
 
29
29
  The runtime package has **no third-party dependencies** by default; the heavy
30
- store drivers (`neo4j`, `pymongo`) arrive in Wave 2 behind an optional extra,
31
- lazy-imported.
30
+ store drivers (`neo4j`, `pymongo`) live behind the optional `[store]` extra and
31
+ are lazy-imported, so the `files` backend (the default) stays dependency-free.
32
32
 
33
33
  ## Quickstart
34
34
 
@@ -40,6 +40,23 @@ uv run data-refinery stack status --json
40
40
  uv run data-refinery stack down
41
41
  uv run data-refinery whoami # identity from culture.yaml
42
42
  uv run teken cli doctor . --strict # the agent-first rubric gate CI runs
43
+
44
+ # Store + quality (files backend is dependency-free; no docker needed):
45
+ echo '{"id":"a","content":"hello"}' | uv run data-refinery store put --json
46
+ uv run data-refinery store get a --json
47
+ uv run data-refinery store list --json
48
+ uv run data-refinery integrity --json # hash matches content?
49
+ uv run data-refinery dedup --json # collapse same-hash dups (idempotent)
50
+ echo '{"id":"a","content":"x"}' | uv run data-refinery validate --json
51
+ ```
52
+
53
+ The store is also importable — shell out **or** `import data_refinery.store`:
54
+
55
+ ```python
56
+ import data_refinery.store as store
57
+ store.put(store.Envelope(id="a", content="hello"))
58
+ store.get("a") # -> Envelope | None
59
+ store.list() # -> list[Envelope]
43
60
  ```
44
61
 
45
62
  ## CLI
@@ -47,6 +64,11 @@ uv run teken cli doctor . --strict # the agent-first rubric gate CI runs
47
64
  | Verb | What it does |
48
65
  |------|--------------|
49
66
  | `stack up\|down\|status` | Manage the storage substrate (mongo + neo4j) via docker compose. |
67
+ | `store put\|get\|list` | Put/get/list opaque envelopes (`--backend files\|mongo\|neo4j`). |
68
+ | `validate` | Check envelope shape for JSON piped on stdin. |
69
+ | `dedup` | Collapse same-hash-same-scope duplicates in the store (idempotent). |
70
+ | `integrity` | Check every stored hash matches `sha256(content)`. |
71
+ | `freshness` | Report age/staleness facts from a metadata timestamp field. |
50
72
  | `whoami` | Report this agent's nick, version, backend, and model from `culture.yaml`. |
51
73
  | `learn` | Print a structured self-teaching prompt. |
52
74
  | `explain <path>` | Markdown docs for any noun/verb path. |
@@ -54,6 +76,11 @@ uv run teken cli doctor . --strict # the agent-first rubric gate CI runs
54
76
  | `doctor` | Check the agent-identity invariants (prompt-file-present, backend-consistency). |
55
77
  | `cli overview` | Describe the CLI surface itself. |
56
78
 
79
+ The **envelope** is storage-neutral — `{id, hash, content, scope{name,
80
+ visibility}, metadata}` with no memory semantics — and the store enforces a
81
+ public/private **scope no-leak**: a private-scope document is never returned by a
82
+ public-scope fetch, across every backend.
83
+
57
84
  Every command supports `--json`. Results go to stdout, errors/diagnostics to
58
85
  stderr (never mixed). Exit codes: `0` success, `1` user error, `2` environment
59
86
  error (e.g. docker absent — always with a `hint:`, never a traceback), `3+`
@@ -67,7 +67,9 @@ def _build_parser() -> argparse.ArgumentParser:
67
67
  from data_refinery.cli._commands import explain as _explain_cmd
68
68
  from data_refinery.cli._commands import learn as _learn_cmd
69
69
  from data_refinery.cli._commands import overview as _overview_cmd
70
+ from data_refinery.cli._commands import quality as _quality_cmd
70
71
  from data_refinery.cli._commands import stack as _stack_group
72
+ from data_refinery.cli._commands import store as _store_group
71
73
  from data_refinery.cli._commands import whoami as _whoami_cmd
72
74
 
73
75
  parser = _CliArgumentParser(
@@ -90,6 +92,8 @@ def _build_parser() -> argparse.ArgumentParser:
90
92
  _doctor_cmd.register(sub)
91
93
  _cli_group.register(sub)
92
94
  _stack_group.register(sub)
95
+ _store_group.register(sub)
96
+ _quality_cmd.register(sub)
93
97
  # Register your own noun groups here:
94
98
  # from data_refinery.cli._commands import my_noun as _my_noun_group
95
99
  # _my_noun_group.register(sub)
@@ -18,9 +18,9 @@ Purpose
18
18
  -------
19
19
  Validate, deduplicate, and check the integrity and freshness of data as it is
20
20
  stored and fetched. Split out of eidetic-cli so eidetic keeps agent-memory;
21
- sibling to daria. The data-quality verbs are not built yet (see issue #1) —
22
- today this exposes the agent-first introspection surface below on a
23
- self-contained runtime (no third-party dependencies).
21
+ sibling to daria. The store moves OPAQUE envelopes with no memory semantics. The
22
+ default runtime has no third-party dependencies; the mongo/neo4j store backends
23
+ live behind the optional [store] extra (lazy-imported).
24
24
 
25
25
  Commands
26
26
  --------
@@ -30,6 +30,12 @@ Commands
30
30
  data-refinery overview Descriptive snapshot of the agent.
31
31
  data-refinery doctor Check the agent-identity invariants.
32
32
  data-refinery cli overview Describe the CLI surface itself.
33
+ data-refinery stack up|down|status Manage the storage substrate.
34
+ data-refinery store put|get|list Put/get/list opaque envelopes.
35
+ data-refinery validate Check envelope shape (JSON on stdin).
36
+ data-refinery dedup Collapse same-hash duplicates (idempotent).
37
+ data-refinery integrity Check stored hash matches sha256(content).
38
+ data-refinery freshness Report age/staleness facts from metadata.
33
39
 
34
40
  Machine-readable output
35
41
  -----------------------
@@ -61,6 +67,12 @@ def _as_json_payload() -> dict[str, object]:
61
67
  {"path": ["overview"], "summary": "Descriptive snapshot of the agent."},
62
68
  {"path": ["doctor"], "summary": "Check the agent-identity invariants."},
63
69
  {"path": ["cli", "overview"], "summary": "Describe the CLI surface."},
70
+ {"path": ["stack"], "summary": "Manage the storage substrate (up/down/status)."},
71
+ {"path": ["store"], "summary": "Put/get/list opaque envelopes in the store."},
72
+ {"path": ["validate"], "summary": "Check envelope shape (JSON on stdin)."},
73
+ {"path": ["dedup"], "summary": "Collapse same-hash duplicates (idempotent)."},
74
+ {"path": ["integrity"], "summary": "Check stored hash matches sha256(content)."},
75
+ {"path": ["freshness"], "summary": "Report age/staleness facts from metadata."},
64
76
  ],
65
77
  "exit_codes": {
66
78
  "0": "success",
@@ -31,6 +31,8 @@ _VERBS = [
31
31
  "overview — this descriptive snapshot",
32
32
  "doctor — check the agent-identity invariants",
33
33
  "stack up|down|status — manage the storage substrate (mongo + neo4j)",
34
+ "store put|get|list — put/get/list opaque envelopes in the store",
35
+ "validate|dedup|integrity|freshness — data-quality checks over the store",
34
36
  ]
35
37
 
36
38
 
@@ -0,0 +1,185 @@
1
+ """``data-refinery validate|dedup|integrity|freshness`` — data-quality verbs.
2
+
3
+ Consumer-agnostic checks over the store, mirroring :mod:`data_refinery.quality`.
4
+ All are global verbs (not a noun group) and all support ``--json``.
5
+
6
+ Exit policy (consistent with ``stack status``): the command exits ``0`` when the
7
+ check *ran* — findings ride in the payload (``valid: false`` / ``ok: false`` /
8
+ duplicate groups / stale counts). A non-zero exit means the command could not
9
+ run: ``1`` for unparseable input, ``2`` for a missing backend driver. No
10
+ traceback ever.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import json
17
+ import sys
18
+ from datetime import datetime
19
+
20
+ from data_refinery.cli._errors import EXIT_USER_ERROR, CliError
21
+ from data_refinery.cli._output import emit_result
22
+ from data_refinery.quality import checks
23
+ from data_refinery.store import get_backend
24
+
25
+ _BACKENDS = ("files", "mongo", "neo4j")
26
+
27
+
28
def _add_backend_flag(p: argparse.ArgumentParser) -> None:
    """Attach the shared ``--backend`` option to parser *p*.

    Accepted values are the names listed in ``_BACKENDS``; when the flag is
    omitted the value is ``"files"``.
    """
    options = {
        "choices": _BACKENDS,
        "default": "files",
        "help": "Store backend (default: files; mongo/neo4j need the [store] extra).",
    }
    p.add_argument("--backend", **options)
35
+
36
+
37
def _add_json_flag(p: argparse.ArgumentParser) -> None:
    """Attach the boolean ``--json`` switch (off unless given) to parser *p*."""
    p.add_argument(
        "--json",
        action="store_true",
        help="Emit structured JSON.",
    )
39
+
40
+
41
def _read_stdin_json() -> object:
    """Read all of stdin and parse it as JSON for the ``validate`` verb.

    Returns:
        The decoded JSON value, exactly as ``json.loads`` produces it.

    Raises:
        CliError: with ``EXIT_USER_ERROR`` when stdin is missing or a TTY,
            when its bytes cannot be decoded as text, when it is empty (or
            whitespace only), or when it is not valid JSON.
    """
    stream = sys.stdin
    # Nothing was piped: refuse to block waiting on an interactive terminal.
    if stream is None or stream.isatty():
        raise CliError(
            code=EXIT_USER_ERROR,
            message="validate expects a JSON envelope (or array) on stdin",
            remediation='pipe input, e.g. echo \'{"id":"a","content":"x"}\' | '
            "data-refinery validate",
        )
    try:
        raw = stream.read().strip()
    except UnicodeDecodeError as exc:
        # Binary / wrongly-encoded input would otherwise escape as a
        # traceback, which the module's exit policy forbids.
        raise CliError(
            code=EXIT_USER_ERROR,
            message=f"stdin is not decodable text: {exc}",
            remediation="pipe a UTF-8 encoded JSON envelope or array",
        ) from exc
    if not raw:
        raise CliError(
            code=EXIT_USER_ERROR,
            message="no input on stdin",
            remediation='pipe a JSON envelope or array, e.g. {"id":"a","content":"x"}',
        )
    try:
        return json.loads(raw)
    except json.JSONDecodeError as exc:
        raise CliError(
            code=EXIT_USER_ERROR,
            message=f"invalid JSON on stdin: {exc}",
            remediation="pipe a single envelope object or a JSON array of them",
        ) from exc
64
+
65
+
66
def cmd_validate(args: argparse.Namespace) -> int:
    """Run ``validate``: check the shape of envelope(s) piped on stdin.

    A single JSON value is wrapped in a one-element list before being handed
    to ``checks.validate_many``. With ``--json`` the result mapping is emitted
    as-is; otherwise a ``valid: N/M`` summary is printed, followed by one
    line per invalid entry listing its errors.

    Returns:
        ``0`` once the check has run; findings are carried in the output.
    """
    envelopes = _read_stdin_json()
    if not isinstance(envelopes, list):
        envelopes = [envelopes]
    report = checks.validate_many(envelopes)

    if getattr(args, "json", False):
        emit_result(report, json_mode=True)
        return 0

    entries = report["results"]
    passed = len([entry for entry in entries if entry["valid"]])
    summary = [f"valid: {passed}/{report['count']}"]
    summary.extend(
        f"- {entry['id']}: {'; '.join(entry['errors'])}"
        for entry in entries
        if not entry["valid"]
    )
    emit_result("\n".join(summary), json_mode=False)
    return 0
81
+
82
+
83
def cmd_dedup(args: argparse.Namespace) -> int:
    """Run ``dedup`` against the backend selected by ``args.backend``.

    With ``--json`` the mapping returned by ``checks.dedup`` is emitted
    unchanged; otherwise a one-line summary of removed and kept counts is
    printed.

    Returns:
        ``0`` once the check has run.
    """
    store = get_backend(args.backend)
    outcome = checks.dedup(store)

    if getattr(args, "json", False):
        emit_result(outcome, json_mode=True)
        return 0

    removed = outcome["duplicates_removed"]
    kept = outcome["kept"]
    emit_result(
        f"dedup: removed {removed} duplicate(s), {kept} kept",
        json_mode=False,
    )
    return 0
95
+
96
+
97
def cmd_integrity(args: argparse.Namespace) -> int:
    """Run ``integrity`` over everything the selected backend returns.

    With ``--json`` the mapping returned by ``checks.integrity`` is emitted
    unchanged; otherwise an ``ok: ... (N checked)`` header is printed,
    followed by one line per mismatch showing the first 12 characters of the
    stored and the actual hash.

    Returns:
        ``0`` once the check has run; mismatches are carried in the output.
    """
    envelopes = get_backend(args.backend).all()
    report = checks.integrity(envelopes)

    if getattr(args, "json", False):
        emit_result(report, json_mode=True)
        return 0

    header = f"ok: {report['ok']} ({report['checked']} checked)"
    details = [
        f"- {bad['id']}: stored {bad['stored_hash'][:12]} != {bad['actual_hash'][:12]}"
        for bad in report["mismatches"]
    ]
    emit_result("\n".join([header, *details]), json_mode=False)
    return 0
108
+
109
+
110
def cmd_freshness(args: argparse.Namespace) -> int:
    """Run ``freshness`` over everything the selected backend returns.

    ``--now`` (optional) is parsed as an ISO-8601 timestamp. A trailing ``Z``
    is accepted as UTC, and a value without any UTC offset is interpreted as
    UTC so that it can be compared against offset-aware timestamps instead of
    raising ``TypeError`` on naive/aware arithmetic.
    NOTE(review): this assumes ``checks.freshness`` works with aware
    datetimes — confirm against its implementation.

    With ``--json`` the mapping returned by ``checks.freshness`` is emitted
    unchanged; otherwise a one-line summary of checked/stale counts is
    printed.

    Returns:
        ``0`` once the check has run; stale counts are carried in the output.

    Raises:
        CliError: with ``EXIT_USER_ERROR`` when ``--now`` is not a valid
            ISO-8601 timestamp.
    """
    # Local import: the module only imports ``datetime`` from the stdlib
    # package, and this keeps the block self-contained.
    from datetime import timezone

    json_mode = bool(getattr(args, "json", False))

    now = None
    raw_now = getattr(args, "now", None)
    if raw_now:
        # ``datetime.fromisoformat`` only understands the "Z" suffix from
        # Python 3.11 on; rewrite it to the equivalent explicit offset.
        iso_text = raw_now[:-1] + "+00:00" if raw_now.endswith(("Z", "z")) else raw_now
        try:
            now = datetime.fromisoformat(iso_text)
        except ValueError as exc:
            raise CliError(
                code=EXIT_USER_ERROR,
                message=f"--now is not a valid ISO-8601 timestamp: {args.now}",
                remediation="pass e.g. --now 2026-06-20T00:00:00+00:00",
            ) from exc
        if now.tzinfo is None:
            # The documented default for "now" is UTC, so an offset-less
            # override is read as UTC as well.
            now = now.replace(tzinfo=timezone.utc)

    result = checks.freshness(
        get_backend(args.backend).all(),
        field=args.field,
        max_age=args.max_age,
        now=now,
    )
    if json_mode:
        emit_result(result, json_mode=True)
    else:
        emit_result(
            f"freshness: {result['checked']} checked, {result['stale']} stale "
            f"(field='{result['field']}', max_age={result['max_age']})",
            json_mode=False,
        )
    return 0
137
+
138
+
139
def register(sub: argparse._SubParsersAction) -> None:
    """Register the four data-quality verbs on the subparsers action *sub*.

    Adds ``validate``, ``dedup``, ``integrity`` and ``freshness`` (in that
    order), wires each to its ``cmd_*`` handler through ``func``, and gives
    every verb a ``--json`` switch. All verbs except ``validate`` also get
    ``--backend``; ``freshness`` additionally takes ``--field``,
    ``--max-age`` and ``--now``.
    """
    validate_parser = sub.add_parser(
        "validate",
        help="Validate envelope shape for JSON piped on stdin (object or array).",
    )
    _add_json_flag(validate_parser)
    validate_parser.set_defaults(func=cmd_validate)

    # dedup and integrity take the same options: --backend, then --json.
    store_verbs = (
        (
            "dedup",
            "Collapse same-hash-same-scope duplicates in the store (idempotent).",
            cmd_dedup,
        ),
        (
            "integrity",
            "Check that every stored hash matches sha256(content).",
            cmd_integrity,
        ),
    )
    for verb, help_text, handler in store_verbs:
        verb_parser = sub.add_parser(verb, help=help_text)
        _add_backend_flag(verb_parser)
        _add_json_flag(verb_parser)
        verb_parser.set_defaults(func=handler)

    freshness_parser = sub.add_parser(
        "freshness",
        help="Report age/staleness facts from a metadata timestamp field.",
    )
    _add_backend_flag(freshness_parser)
    freshness_parser.add_argument(
        "--field",
        default="created",
        help="metadata key holding an ISO-8601 timestamp (default: created).",
    )
    freshness_parser.add_argument(
        "--max-age",
        type=float,
        default=None,
        help="seconds; an envelope older than this is marked stale.",
    )
    freshness_parser.add_argument(
        "--now",
        default=None,
        help="ISO-8601 'now' override (default: current UTC time). For determinism.",
    )
    _add_json_flag(freshness_parser)
    freshness_parser.set_defaults(func=cmd_freshness)