ba-agent-mcp 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. ba_agent_mcp-0.1.0/MANIFEST.in +28 -0
  2. ba_agent_mcp-0.1.0/PKG-INFO +96 -0
  3. ba_agent_mcp-0.1.0/README_PYPI.md +75 -0
  4. ba_agent_mcp-0.1.0/ba_agent_mcp.egg-info/PKG-INFO +96 -0
  5. ba_agent_mcp-0.1.0/ba_agent_mcp.egg-info/SOURCES.txt +71 -0
  6. ba_agent_mcp-0.1.0/ba_agent_mcp.egg-info/dependency_links.txt +1 -0
  7. ba_agent_mcp-0.1.0/ba_agent_mcp.egg-info/entry_points.txt +2 -0
  8. ba_agent_mcp-0.1.0/ba_agent_mcp.egg-info/requires.txt +5 -0
  9. ba_agent_mcp-0.1.0/ba_agent_mcp.egg-info/top_level.txt +4 -0
  10. ba_agent_mcp-0.1.0/kb/__init__.py +0 -0
  11. ba_agent_mcp-0.1.0/kb/adopt_doc.py +177 -0
  12. ba_agent_mcp-0.1.0/kb/alerts.py +42 -0
  13. ba_agent_mcp-0.1.0/kb/backfill_embeddings.py +67 -0
  14. ba_agent_mcp-0.1.0/kb/chunker.py +254 -0
  15. ba_agent_mcp-0.1.0/kb/embeddings.py +149 -0
  16. ba_agent_mcp-0.1.0/kb/eval_kb.py +134 -0
  17. ba_agent_mcp-0.1.0/kb/extend_doc_types.py +39 -0
  18. ba_agent_mcp-0.1.0/kb/fast_metadata.py +95 -0
  19. ba_agent_mcp-0.1.0/kb/ingest.py +255 -0
  20. ba_agent_mcp-0.1.0/kb/instant_ingest.py +91 -0
  21. ba_agent_mcp-0.1.0/kb/kb_smoke.py +221 -0
  22. ba_agent_mcp-0.1.0/kb/reconcile.py +192 -0
  23. ba_agent_mcp-0.1.0/kb/reference_loader.py +275 -0
  24. ba_agent_mcp-0.1.0/kb/rerank.py +162 -0
  25. ba_agent_mcp-0.1.0/kb/retrieval_eval.py +72 -0
  26. ba_agent_mcp-0.1.0/kb/retrieve.py +393 -0
  27. ba_agent_mcp-0.1.0/kb/search_client.py +176 -0
  28. ba_agent_mcp-0.1.0/kb/supabase_rest.py +150 -0
  29. ba_agent_mcp-0.1.0/lib/__init__.py +0 -0
  30. ba_agent_mcp-0.1.0/lib/analysis_session.py +130 -0
  31. ba_agent_mcp-0.1.0/lib/badoc_schema.py +93 -0
  32. ba_agent_mcp-0.1.0/lib/baseline.py +168 -0
  33. ba_agent_mcp-0.1.0/lib/citations.py +144 -0
  34. ba_agent_mcp-0.1.0/lib/doc_lint.py +423 -0
  35. ba_agent_mcp-0.1.0/lib/docx_text.py +56 -0
  36. ba_agent_mcp-0.1.0/lib/draft.py +210 -0
  37. ba_agent_mcp-0.1.0/lib/e2e.py +450 -0
  38. ba_agent_mcp-0.1.0/lib/egress_audit.py +76 -0
  39. ba_agent_mcp-0.1.0/lib/figma_analyze.py +57 -0
  40. ba_agent_mcp-0.1.0/lib/figma_client.py +149 -0
  41. ba_agent_mcp-0.1.0/lib/figma_doc.py +68 -0
  42. ba_agent_mcp-0.1.0/lib/figma_extract.py +245 -0
  43. ba_agent_mcp-0.1.0/lib/figma_publish.py +99 -0
  44. ba_agent_mcp-0.1.0/lib/health.py +110 -0
  45. ba_agent_mcp-0.1.0/lib/ingest_hook.py +111 -0
  46. ba_agent_mcp-0.1.0/lib/migrate_to_theme_docs.py +174 -0
  47. ba_agent_mcp-0.1.0/lib/project_doc.py +98 -0
  48. ba_agent_mcp-0.1.0/lib/publish_prep.py +65 -0
  49. ba_agent_mcp-0.1.0/lib/readback.py +126 -0
  50. ba_agent_mcp-0.1.0/lib/reconcile_e2e.py +186 -0
  51. ba_agent_mcp-0.1.0/lib/reconcile_plan.py +162 -0
  52. ba_agent_mcp-0.1.0/lib/req_matcher.py +38 -0
  53. ba_agent_mcp-0.1.0/lib/requirement_id.py +28 -0
  54. ba_agent_mcp-0.1.0/lib/restore_lister.py +104 -0
  55. ba_agent_mcp-0.1.0/lib/serializer.py +323 -0
  56. ba_agent_mcp-0.1.0/lib/source_resolver.py +104 -0
  57. ba_agent_mcp-0.1.0/lib/suppa_adapter.py +218 -0
  58. ba_agent_mcp-0.1.0/lib/suppa_client.py +651 -0
  59. ba_agent_mcp-0.1.0/lib/token_gate.py +187 -0
  60. ba_agent_mcp-0.1.0/lib/transcript_store.py +323 -0
  61. ba_agent_mcp-0.1.0/lib/validate_doc.py +314 -0
  62. ba_agent_mcp-0.1.0/lib/write_sequencer.py +499 -0
  63. ba_agent_mcp-0.1.0/mcp_server/__init__.py +8 -0
  64. ba_agent_mcp-0.1.0/mcp_server/config.py +64 -0
  65. ba_agent_mcp-0.1.0/mcp_server/server.py +29 -0
  66. ba_agent_mcp-0.1.0/mcp_server/tools.py +115 -0
  67. ba_agent_mcp-0.1.0/pyproject.toml +36 -0
  68. ba_agent_mcp-0.1.0/setup.cfg +4 -0
  69. ba_agent_mcp-0.1.0/smoke/__init__.py +0 -0
  70. ba_agent_mcp-0.1.0/smoke/canonical.py +154 -0
  71. ba_agent_mcp-0.1.0/smoke/check_supabase.py +101 -0
  72. ba_agent_mcp-0.1.0/smoke/fixture_brd.py +73 -0
  73. ba_agent_mcp-0.1.0/smoke/phase0_smoke.py +512 -0
@@ -0,0 +1,28 @@
1
# Keep the published sdist to the runtime package + the PyPI readme only.
# (The wheel already contains just the lib/kb/mcp_server/smoke packages.)
# The git-based sdist finder would otherwise sweep in tests/, the internal
# README, and other repo files — exclude them so nothing internal ships.
include README_PYPI.md
include pyproject.toml

exclude README.md
exclude RUNBOOK.md
exclude Makefile
exclude vendor.lock
exclude CLAUDE.md
exclude .env
exclude .env.example

prune tests
prune docs
prune evals
prune logs
prune skill
prune .codegraph
# MANIFEST.in commands are split on whitespace and quotes are NOT interpreted,
# so a directory name containing spaces cannot be written literally (the line
# would be rejected as having too many arguments). `?` matches each space.
prune info?to?ingest

global-exclude *.sql
global-exclude .env
global-exclude *.pyc
global-exclude *.docx
global-exclude *.pdf
@@ -0,0 +1,96 @@
1
+ Metadata-Version: 2.4
2
+ Name: ba-agent-mcp
3
+ Version: 0.1.0
4
+ Summary: MCP server exposing the Modern-Expo virtual Business Analyst Knowledge Base (read/derive tools).
5
+ Author: Modern-Expo
6
+ Keywords: mcp,model-context-protocol,business-analyst,knowledge-base,rag
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: License :: Other/Proprietary License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Topic :: Software Development :: Documentation
15
+ Requires-Python: >=3.11
16
+ Description-Content-Type: text/markdown
17
+ Requires-Dist: mcp>=1.2
18
+ Requires-Dist: pypdf>=4
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest>=8; extra == "dev"
21
+
22
+ # ba-agent-mcp
23
+
24
+ An [MCP](https://modelcontextprotocol.io) server that exposes the Modern-Expo
25
+ virtual **Business Analyst Knowledge Base** as a small set of **read / derive**
26
+ tools. It never writes to SUPPA Docs or the Knowledge Base and is deliberately
27
+ kept off the governed write path — authoring documents goes through the
28
+ human-gated `business-analyst` skill, not this server.
29
+
30
+ ## Install
31
+
32
+ ```bash
33
+ uvx ba-agent-mcp # run on demand in an isolated environment (recommended)
34
+ # or
35
+ pip install ba-agent-mcp # then run: ba-agent-mcp
36
+ ```
37
+
38
+ ## Credentials — you supply your own
39
+
40
+ **This package ships no credentials.** Each user provides their **own** keys and
41
+ their **own** SUPPA identity. No secret is ever read from the MCP client JSON
42
+ config; secrets are resolved from the environment, in this order:
43
+
44
+ 1. real environment variables in the launching shell, then
45
+ 2. an explicit env file at `$BA_AGENT_ENV_FILE`, then
46
+ 3. `~/.ba-agent/.env`
47
+
48
+ ```bash
49
+ # ~/.ba-agent/.env (or export these in your shell)
50
+ SUPABASE_URL=https://<your-project>.supabase.co
51
+ SUPABASE_SERVICE_ROLE_KEY=... # or SUPABASE_ANON_KEY for read-only
52
+ SUPPA_API_KEY=... # your own SUPPA identity (or SUPPA_TOKEN)
53
+ GEMINI_API_KEY=... # optional — enables hybrid (vector) search
54
+ FIGMA_TOKEN=... # optional — enables analyze_figma
55
+ ```
56
+
57
+ > Never put any secret in the MCP client JSON. A tool whose credentials are
58
+ > absent returns a clear, structured error instead of failing the server, so you
59
+ > can add capabilities incrementally.
60
+
61
+ ## Connect (any MCP client)
62
+
63
+ ```json
64
+ {
65
+ "mcpServers": {
66
+ "ba-agent": {
67
+ "command": "uvx",
68
+ "args": ["ba-agent-mcp"],
69
+ "env": { "BA_AGENT_ENV_FILE": "/absolute/path/to/your/.env" }
70
+ }
71
+ }
72
+ }
73
+ ```
74
+
75
+ ## Tools
76
+
77
+ | Tool | Returns |
78
+ | --- | --- |
79
+ | `kb_search` | Ranked KB hits with provenance (title, breadcrumb, SUPPA anchor, `data_class`). |
80
+ | `kb_assemble_context` | A cited grounding block for a question + coverage + strictest `data_class`. |
81
+ | `kb_assemble_corpus` | The whole project corpus grouped by document, for cross-corpus analysis. |
82
+ | `kb_list_docs` | Inventory of governed documents in a project (reads SUPPA under your own identity). |
83
+ | `analyze_figma` | A structured design profile of a Figma file (requires `FIGMA_TOKEN`). |
84
+
85
+ ## Data governance
86
+
87
+ The KB read tools connect with the Supabase service-role key, which bypasses
88
+ row-level security, so they **fail closed at the application layer**: any chunk
89
+ that is not explicitly `data_class = 'internal'` is excluded from results.
90
+ Consequence — **`client`-classed content is not readable over MCP**. The
91
+ append-only logs never contain document bodies, only ids, hashes, identity, and
92
+ HTTP status.
93
+
94
+ ---
95
+
96
+ Proprietary — Modern-Expo internal tooling. All rights reserved.
@@ -0,0 +1,75 @@
1
+ # ba-agent-mcp
2
+
3
+ An [MCP](https://modelcontextprotocol.io) server that exposes the Modern-Expo
4
+ virtual **Business Analyst Knowledge Base** as a small set of **read / derive**
5
+ tools. It never writes to SUPPA Docs or the Knowledge Base and is deliberately
6
+ kept off the governed write path — authoring documents goes through the
7
+ human-gated `business-analyst` skill, not this server.
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ uvx ba-agent-mcp # run on demand in an isolated environment (recommended)
13
+ # or
14
+ pip install ba-agent-mcp # then run: ba-agent-mcp
15
+ ```
16
+
17
+ ## Credentials — you supply your own
18
+
19
+ **This package ships no credentials.** Each user provides their **own** keys and
20
+ their **own** SUPPA identity. No secret is ever read from the MCP client JSON
21
+ config; secrets are resolved from the environment, in this order:
22
+
23
+ 1. real environment variables in the launching shell, then
24
+ 2. an explicit env file at `$BA_AGENT_ENV_FILE`, then
25
+ 3. `~/.ba-agent/.env`
26
+
27
+ ```bash
28
+ # ~/.ba-agent/.env (or export these in your shell)
29
+ SUPABASE_URL=https://<your-project>.supabase.co
30
+ SUPABASE_SERVICE_ROLE_KEY=... # or SUPABASE_ANON_KEY for read-only
31
+ SUPPA_API_KEY=... # your own SUPPA identity (or SUPPA_TOKEN)
32
+ GEMINI_API_KEY=... # optional — enables hybrid (vector) search
33
+ FIGMA_TOKEN=... # optional — enables analyze_figma
34
+ ```
35
+
36
+ > Never put any secret in the MCP client JSON. A tool whose credentials are
37
+ > absent returns a clear, structured error instead of failing the server, so you
38
+ > can add capabilities incrementally.
39
+
40
+ ## Connect (any MCP client)
41
+
42
+ ```json
43
+ {
44
+ "mcpServers": {
45
+ "ba-agent": {
46
+ "command": "uvx",
47
+ "args": ["ba-agent-mcp"],
48
+ "env": { "BA_AGENT_ENV_FILE": "/absolute/path/to/your/.env" }
49
+ }
50
+ }
51
+ }
52
+ ```
53
+
54
+ ## Tools
55
+
56
+ | Tool | Returns |
57
+ | --- | --- |
58
+ | `kb_search` | Ranked KB hits with provenance (title, breadcrumb, SUPPA anchor, `data_class`). |
59
+ | `kb_assemble_context` | A cited grounding block for a question + coverage + strictest `data_class`. |
60
+ | `kb_assemble_corpus` | The whole project corpus grouped by document, for cross-corpus analysis. |
61
+ | `kb_list_docs` | Inventory of governed documents in a project (reads SUPPA under your own identity). |
62
+ | `analyze_figma` | A structured design profile of a Figma file (requires `FIGMA_TOKEN`). |
63
+
64
+ ## Data governance
65
+
66
+ The KB read tools connect with the Supabase service-role key, which bypasses
67
+ row-level security, so they **fail closed at the application layer**: any chunk
68
+ that is not explicitly `data_class = 'internal'` is excluded from results.
69
+ Consequence — **`client`-classed content is not readable over MCP**. The
70
+ append-only logs never contain document bodies, only ids, hashes, identity, and
71
+ HTTP status.
72
+
73
+ ---
74
+
75
+ Proprietary — Modern-Expo internal tooling. All rights reserved.
@@ -0,0 +1,96 @@
1
+ Metadata-Version: 2.4
2
+ Name: ba-agent-mcp
3
+ Version: 0.1.0
4
+ Summary: MCP server exposing the Modern-Expo virtual Business Analyst Knowledge Base (read/derive tools).
5
+ Author: Modern-Expo
6
+ Keywords: mcp,model-context-protocol,business-analyst,knowledge-base,rag
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: License :: Other/Proprietary License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Topic :: Software Development :: Documentation
15
+ Requires-Python: >=3.11
16
+ Description-Content-Type: text/markdown
17
+ Requires-Dist: mcp>=1.2
18
+ Requires-Dist: pypdf>=4
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest>=8; extra == "dev"
21
+
22
+ # ba-agent-mcp
23
+
24
+ An [MCP](https://modelcontextprotocol.io) server that exposes the Modern-Expo
25
+ virtual **Business Analyst Knowledge Base** as a small set of **read / derive**
26
+ tools. It never writes to SUPPA Docs or the Knowledge Base and is deliberately
27
+ kept off the governed write path — authoring documents goes through the
28
+ human-gated `business-analyst` skill, not this server.
29
+
30
+ ## Install
31
+
32
+ ```bash
33
+ uvx ba-agent-mcp # run on demand in an isolated environment (recommended)
34
+ # or
35
+ pip install ba-agent-mcp # then run: ba-agent-mcp
36
+ ```
37
+
38
+ ## Credentials — you supply your own
39
+
40
+ **This package ships no credentials.** Each user provides their **own** keys and
41
+ their **own** SUPPA identity. No secret is ever read from the MCP client JSON
42
+ config; secrets are resolved from the environment, in this order:
43
+
44
+ 1. real environment variables in the launching shell, then
45
+ 2. an explicit env file at `$BA_AGENT_ENV_FILE`, then
46
+ 3. `~/.ba-agent/.env`
47
+
48
+ ```bash
49
+ # ~/.ba-agent/.env (or export these in your shell)
50
+ SUPABASE_URL=https://<your-project>.supabase.co
51
+ SUPABASE_SERVICE_ROLE_KEY=... # or SUPABASE_ANON_KEY for read-only
52
+ SUPPA_API_KEY=... # your own SUPPA identity (or SUPPA_TOKEN)
53
+ GEMINI_API_KEY=... # optional — enables hybrid (vector) search
54
+ FIGMA_TOKEN=... # optional — enables analyze_figma
55
+ ```
56
+
57
+ > Never put any secret in the MCP client JSON. A tool whose credentials are
58
+ > absent returns a clear, structured error instead of failing the server, so you
59
+ > can add capabilities incrementally.
60
+
61
+ ## Connect (any MCP client)
62
+
63
+ ```json
64
+ {
65
+ "mcpServers": {
66
+ "ba-agent": {
67
+ "command": "uvx",
68
+ "args": ["ba-agent-mcp"],
69
+ "env": { "BA_AGENT_ENV_FILE": "/absolute/path/to/your/.env" }
70
+ }
71
+ }
72
+ }
73
+ ```
74
+
75
+ ## Tools
76
+
77
+ | Tool | Returns |
78
+ | --- | --- |
79
+ | `kb_search` | Ranked KB hits with provenance (title, breadcrumb, SUPPA anchor, `data_class`). |
80
+ | `kb_assemble_context` | A cited grounding block for a question + coverage + strictest `data_class`. |
81
+ | `kb_assemble_corpus` | The whole project corpus grouped by document, for cross-corpus analysis. |
82
+ | `kb_list_docs` | Inventory of governed documents in a project (reads SUPPA under your own identity). |
83
+ | `analyze_figma` | A structured design profile of a Figma file (requires `FIGMA_TOKEN`). |
84
+
85
+ ## Data governance
86
+
87
+ The KB read tools connect with the Supabase service-role key, which bypasses
88
+ row-level security, so they **fail closed at the application layer**: any chunk
89
+ that is not explicitly `data_class = 'internal'` is excluded from results.
90
+ Consequence — **`client`-classed content is not readable over MCP**. The
91
+ append-only logs never contain document bodies, only ids, hashes, identity, and
92
+ HTTP status.
93
+
94
+ ---
95
+
96
+ Proprietary — Modern-Expo internal tooling. All rights reserved.
@@ -0,0 +1,71 @@
1
+ MANIFEST.in
2
+ README_PYPI.md
3
+ pyproject.toml
4
+ ba_agent_mcp.egg-info/PKG-INFO
5
+ ba_agent_mcp.egg-info/SOURCES.txt
6
+ ba_agent_mcp.egg-info/dependency_links.txt
7
+ ba_agent_mcp.egg-info/entry_points.txt
8
+ ba_agent_mcp.egg-info/requires.txt
9
+ ba_agent_mcp.egg-info/top_level.txt
10
+ kb/__init__.py
11
+ kb/adopt_doc.py
12
+ kb/alerts.py
13
+ kb/backfill_embeddings.py
14
+ kb/chunker.py
15
+ kb/embeddings.py
16
+ kb/eval_kb.py
17
+ kb/extend_doc_types.py
18
+ kb/fast_metadata.py
19
+ kb/ingest.py
20
+ kb/instant_ingest.py
21
+ kb/kb_smoke.py
22
+ kb/reconcile.py
23
+ kb/reference_loader.py
24
+ kb/rerank.py
25
+ kb/retrieval_eval.py
26
+ kb/retrieve.py
27
+ kb/search_client.py
28
+ kb/supabase_rest.py
29
+ lib/__init__.py
30
+ lib/analysis_session.py
31
+ lib/badoc_schema.py
32
+ lib/baseline.py
33
+ lib/citations.py
34
+ lib/doc_lint.py
35
+ lib/docx_text.py
36
+ lib/draft.py
37
+ lib/e2e.py
38
+ lib/egress_audit.py
39
+ lib/figma_analyze.py
40
+ lib/figma_client.py
41
+ lib/figma_doc.py
42
+ lib/figma_extract.py
43
+ lib/figma_publish.py
44
+ lib/health.py
45
+ lib/ingest_hook.py
46
+ lib/migrate_to_theme_docs.py
47
+ lib/project_doc.py
48
+ lib/publish_prep.py
49
+ lib/readback.py
50
+ lib/reconcile_e2e.py
51
+ lib/reconcile_plan.py
52
+ lib/req_matcher.py
53
+ lib/requirement_id.py
54
+ lib/restore_lister.py
55
+ lib/serializer.py
56
+ lib/source_resolver.py
57
+ lib/suppa_adapter.py
58
+ lib/suppa_client.py
59
+ lib/token_gate.py
60
+ lib/transcript_store.py
61
+ lib/validate_doc.py
62
+ lib/write_sequencer.py
63
+ mcp_server/__init__.py
64
+ mcp_server/config.py
65
+ mcp_server/server.py
66
+ mcp_server/tools.py
67
+ smoke/__init__.py
68
+ smoke/canonical.py
69
+ smoke/check_supabase.py
70
+ smoke/fixture_brd.py
71
+ smoke/phase0_smoke.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ ba-agent-mcp = mcp_server.server:main
@@ -0,0 +1,5 @@
1
+ mcp>=1.2
2
+ pypdf>=4
3
+
4
+ [dev]
5
+ pytest>=8
@@ -0,0 +1,4 @@
1
+ kb
2
+ lib
3
+ mcp_server
4
+ smoke
File without changes
@@ -0,0 +1,177 @@
1
+ """Backfill: adopt an EXISTING SUPPA Doc into the BA governance layer + KB
2
+ (BUILD_PLAN 2.9). The inventory (docs/INVENTORY_2_9.md) found legacy Docs with
3
+ no BADoc rows; this turns one such Doc into a governed, retrievable document.
4
+
5
+ Distinct from write_sequencer (which authors NEW content): the content already
6
+ lives in SUPPA, so adoption does NOT write blocks. It:
7
+ 1. verifies the Doc has live pages;
8
+ 2. computes the live-content hash (kb.ingest.live_content_hash — sentinels
9
+ excluded, identical to ingest/reconcile);
10
+ 3. registers a BADoc row pointing at the EXISTING Doc — inverted-order-lite:
11
+ insert status='writing' (external_id=doc_key:1, content_hash=live_hash,
12
+ no content write) -> flip 'published' + content_ref=doc_id -> read-back.
13
+ Idempotent on external_id (re-read on unique conflict); refuses to repoint
14
+ a doc_key already published against a DIFFERENT doc_id;
15
+ 4. best-effort KB ingest (chunk + Gemini-embed + index). Ingest failure does
16
+ NOT fail adoption — the BADoc row stands and `make kb-reconcile` /
17
+ `kb-embed-backfill` heal the index later (same contract as instant ingest).
18
+
19
+ Metadata (project / doc_key / doc_type / data_class / feature) is supplied by
20
+ the operator — NOTHING is invented here. The BA Lead provides real values per
21
+ doc from the inventory verdicts.
22
+
23
+ CLI:
24
+ python -m kb.adopt_doc <doc_id> --project P --doc-key P/tech_doc/api \\
25
+ --doc-type tech_doc --data-class internal [--feature F] [--no-ingest]
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ from dataclasses import dataclass
31
+
32
+ from kb import ingest as kb_ingest
33
+ from kb.supabase_rest import KBRestError, SupabaseREST
34
+ from lib.draft import DATA_CLASSES
35
+ from lib.suppa_client import SuppaError, SuppaHTTPError
36
+
37
+
38
class AdoptError(SuppaError):
    """Raised when an existing SUPPA Doc cannot be adopted.

    Used for an unknown ``data_class``, a Doc without live pages, an attempt
    to repoint an already-published ``doc_key``, and a failed publish
    read-back.
    """
40
+
41
+
42
+ @dataclass
43
+ class AdoptResult:
44
+ status: str # adopted | noop-already-adopted
45
+ doc_id: int
46
+ badoc_id: int | None
47
+ external_id: str
48
+ content_hash: str
49
+ ingest_status: str = "skipped"
50
+ chunks: int = 0
51
+ detail: str = ""
52
+
53
+
54
def adopt_doc(doc_id: int, *, project: str, doc_key: str, doc_type: str,
              data_class: str, suppa, rest: SupabaseREST | None = None,
              feature: str | None = None, source: str = "backfill",
              ingest: bool = True, doc_title: str | None = None) -> AdoptResult:
    """Adopt an existing SUPPA Doc: register a BADoc row for it, then ingest.

    No content is written to the Doc itself. The steps are: validate
    ``data_class``, hash the live pages, refuse to repoint a ``doc_key`` that
    is already published against another Doc, register (or refresh) the BADoc
    row, and finally run the best-effort KB ingest unless ``ingest`` is false.

    Returns an ``AdoptResult`` whose ``status`` is ``"adopted"`` for a new
    registration or ``"noop-already-adopted"`` when a published row for this
    external id already existed.

    Raises ``AdoptError`` for an unknown ``data_class``, a Doc without live
    pages, or a ``doc_key`` conflict.
    """
    if data_class not in DATA_CLASSES:
        raise AdoptError(f"data_class {data_class!r} not in {sorted(DATA_CLASSES)}")
    doc_id = int(doc_id)
    # Adopted legacy docs are always version 1 of their doc_key.
    external_id = f"{doc_key}:1"

    live_pages = kb_ingest.fetch_pages(suppa, doc_id)
    if not live_pages:
        raise AdoptError(f"doc {doc_id} has no live pages — nothing to adopt")
    live_hash = kb_ingest.live_content_hash(live_pages)

    # Conflict guard: a doc_key already published against another Doc is
    # never silently repointed.
    for published_row in list(suppa.search_badoc(doc_key=doc_key, status="published")):
        current_ref = published_row.get("content_ref")
        if current_ref and int(current_ref) != doc_id:
            raise AdoptError(
                f"doc_key {doc_key!r} already published against doc "
                f"{current_ref} (not {doc_id}) — refusing to repoint")

    matches = suppa.search_badoc(external_id=external_id)
    already_published = bool(matches) and matches[0].get("status") == "published"

    if not already_published:
        badoc_id = _register_badoc(suppa, external_id, doc_id, live_hash,
                                   project=project, feature=feature,
                                   doc_type=doc_type, data_class=data_class,
                                   doc_key=doc_key, source=source)
        outcome = "adopted"
    else:
        current = matches[0]
        badoc_id = current.get("id")
        # The legacy page may have changed since the last adoption: keep the
        # stored hash in step with the live content.
        if current.get("content_hash") != live_hash:
            suppa.update_badoc(badoc_id, {"content_hash": live_hash})
        outcome = "noop-already-adopted"

    result = AdoptResult(outcome, doc_id, badoc_id, external_id, live_hash)

    if ingest:
        _ingest_best_effort(result, suppa, rest, project=project, feature=feature,
                            doc_type=doc_type, data_class=data_class, doc_id=doc_id,
                            content_hash=live_hash, doc_title=doc_title)
    return result
95
+
96
+
97
+ def _register_badoc(suppa, ext, doc_id, live_hash, *, project, feature, doc_type,
98
+ data_class, doc_key, source) -> int:
99
+ row = {
100
+ "project": project, "feature": feature, "doc_type": doc_type,
101
+ "status": "writing", "data_class": data_class, "source": source,
102
+ "doc_key": doc_key, "version": 1, "external_id": ext,
103
+ "content_hash": live_hash,
104
+ }
105
+ try:
106
+ badoc_id = suppa.insert_badoc(row)
107
+ except SuppaHTTPError:
108
+ # unique conflict on external_id -> re-read, never parse the body
109
+ rows = suppa.search_badoc(external_id=ext)
110
+ if len(rows) != 1:
111
+ raise
112
+ badoc_id = rows[0]["id"]
113
+ if rows[0].get("content_hash") != live_hash:
114
+ suppa.update_badoc(badoc_id, {"content_hash": live_hash})
115
+ # flip to published pointing at the EXISTING doc; read-back
116
+ suppa.update_badoc(badoc_id, {"status": "published", "content_ref": doc_id})
117
+ back = suppa.search_badoc(external_id=ext)
118
+ if not back or back[0].get("status") != "published":
119
+ raise AdoptError(f"publish read-back failed for {ext}")
120
+ return badoc_id
121
+
122
+
123
def _ingest_best_effort(result: AdoptResult, suppa, rest, *, project, feature,
                        doc_type, data_class, doc_id, content_hash,
                        doc_title=None) -> None:
    """Run the KB ingest for an adopted Doc without ever failing the adoption.

    Mutates ``result`` in place: ``ingest_status`` and ``chunks`` are copied
    from the ingest outcome; on a ``KBRestError`` the status becomes
    ``"error"`` and ``detail`` holds the first 200 characters of the message.

    ``rest`` may be ``None``, in which case a ``SupabaseREST`` client is
    created here.
    """
    badoc_row = {
        "id": result.badoc_id, "external_id": result.external_id,
        "project": project, "feature": feature, "doc_type": doc_type,
        "status": "published", "data_class": data_class,
        "content_hash": content_hash, "content_ref": doc_id,
    }
    try:
        rest = rest or SupabaseREST()
        res = kb_ingest.ingest_doc(badoc_row, suppa, rest, doc_title=doc_title)
        result.ingest_status = res.status
        result.chunks = res.chunks
        if res.status == "failed":
            kb_ingest.log_ingest_error(rest, doc_id, res.detail)
    except KBRestError as e:
        result.ingest_status = "error"
        result.detail = str(e)[:200]
        if rest is None:
            # The client itself could not be created, so there is nothing to
            # log through; the error is still reported on ``result``.
            return
        try:
            kb_ingest.log_ingest_error(rest, doc_id, str(e))
        except KBRestError:
            pass  # reconcile scans had_errors anyway
146
+
147
+
148
def main(argv=None) -> int:
    """CLI entry point: adopt one existing SUPPA Doc and print a status line.

    ``argv`` is the argument list to parse; ``None`` lets argparse read the
    process arguments. Returns ``0`` after the adoption completes; errors
    raised by ``adopt_doc`` propagate to the caller.
    """
    import argparse

    from lib.suppa_adapter import SuppaAdapter

    p = argparse.ArgumentParser(description="Adopt an existing SUPPA Doc (BUILD_PLAN 2.9)")
    p.add_argument("doc_id", type=int)
    p.add_argument("--project", required=True)
    p.add_argument("--doc-key", required=True, help="unique doc_key, e.g. PROJ/tech_doc/api")
    p.add_argument("--doc-type", required=True)
    p.add_argument("--data-class", required=True, choices=sorted(DATA_CLASSES))
    p.add_argument("--feature", default=None)
    p.add_argument("--source", default="backfill")
    p.add_argument("--no-ingest", action="store_true")
    args = p.parse_args(argv)

    res = adopt_doc(args.doc_id, project=args.project, doc_key=args.doc_key,
                    doc_type=args.doc_type, data_class=args.data_class,
                    suppa=SuppaAdapter(), feature=args.feature, source=args.source,
                    ingest=not args.no_ingest)
    # Only the first 16 characters of the hash are shown; detail is appended
    # only when the ingest step recorded one.
    print(f"{res.status}: badoc={res.badoc_id} doc={res.doc_id} ext={res.external_id} "
          f"hash={res.content_hash[:16]}… ingest={res.ingest_status} chunks={res.chunks}"
          + (f" detail={res.detail}" if res.detail else ""))
    return 0
172
+
173
+
174
if __name__ == "__main__":
    import sys

    # The status line printed by main() contains non-ASCII characters, so
    # stdout is switched to UTF-8 before anything is written.
    sys.stdout.reconfigure(encoding="utf-8")
    sys.exit(main())
@@ -0,0 +1,42 @@
1
+ """One alert sink for ALL hard failures (BUILD_PLAN 2.4 DoD): ingest retry
2
+ exhaustion, sequencer hard errors, reconcile manual-edit warnings.
3
+
4
+ Pilot transport: append-only logs/alerts.jsonl (ids/hashes only — never
5
+ bodies, PRR §3.8) + optional webhook POST when ALERT_WEBHOOK_URL is set
6
+ (channel decision — SUPPA task vs chat webhook — pending with the customer;
7
+ recorded in BUILD_PLAN). Alerting must never raise into the caller.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import datetime
13
+ import json
14
+ import os
15
+ import urllib.request
16
+ from pathlib import Path
17
+
18
+ DEFAULT_DIR = Path(__file__).resolve().parent.parent / "logs"
19
+
20
+
21
def alert(kind: str, *, run_dir=None, **fields) -> dict:
    """Record a hard-failure alert and return the recorded entry.

    The entry holds a UTC ISO-8601 timestamp under ``"ts"``, ``kind`` under
    ``"alert"``, and every extra keyword in ``fields``. It is appended as one
    JSON line to ``alerts.jsonl`` in ``run_dir`` (``DEFAULT_DIR`` when
    ``run_dir`` is not given) and, when ``ALERT_WEBHOOK_URL`` is set, POSTed
    to that URL as JSON.

    Alerting never raises into the caller: any failure while serializing,
    writing the file, or calling the webhook is swallowed.
    """
    entry = {
        "ts": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "alert": kind, **fields,
    }
    # Serialize once so the file and the webhook carry the same payload;
    # default=str keeps non-JSON values (paths, datetimes, ...) from dropping
    # the alert on either sink.
    try:
        payload = json.dumps(entry, ensure_ascii=False, default=str)
    except Exception:
        return entry  # nothing serializable to record; still never raise

    try:
        out_dir = Path(run_dir) if run_dir else DEFAULT_DIR
        out_dir.mkdir(parents=True, exist_ok=True)
        with open(out_dir / "alerts.jsonl", "a", encoding="utf-8") as f:
            f.write(payload + "\n")
    except Exception:
        pass  # deliberate best-effort: the webhook below may still succeed

    url = os.environ.get("ALERT_WEBHOOK_URL", "")
    if url:
        try:
            req = urllib.request.Request(
                url, data=payload.encode("utf-8"),
                headers={"Content-Type": "application/json"}, method="POST")
            # Close the response explicitly instead of leaving it to the GC.
            with urllib.request.urlopen(req, timeout=10) as resp:
                resp.read()
        except Exception:
            pass  # the file entry above is the durable record
    return entry
@@ -0,0 +1,67 @@
1
+ """Embedding backfill (`make kb-embed-backfill`) — fills NULL embeddings
2
+ WITHOUT re-ingesting content (the cheap heal path after chunks were ingested
3
+ while GEMINI_API_KEY was absent or the API was down).
4
+
5
+ Loops kb_missing_embeddings -> Gemini embed_texts -> kb_set_embeddings until
6
+ none remain. ingestion_log had_errors rows are left to `make kb-reconcile`,
7
+ which force-reingests them once and then converges (embeddings now succeed).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import sys
13
+
14
+ from kb import embeddings
15
+ from kb.supabase_rest import SupabaseREST
16
+ from lib import egress_audit
17
+
18
+
19
def backfill(rest: SupabaseREST | None = None, *, project: str | None = None,
             batch: int = 100, sink=None) -> int:
    """Fill missing chunk embeddings in batches and return how many were written.

    Each pass fetches up to ``batch`` rows via ``kb_missing_embeddings``,
    audits the egress, embeds the row contents, and stores the vectors via
    ``kb_set_embeddings``. The return value is the sum of the counts that
    ``kb_set_embeddings`` reported.

    The loop stops when no rows remain, when a short batch was processed,
    when the egress audit blocks the batch, or when the same batch comes back
    unchanged (no progress).

    ``rest`` defaults to a new ``SupabaseREST``; ``sink`` is passed through to
    the egress audit.
    """
    rest = rest or SupabaseREST()
    total = 0
    previous_batch = None
    while True:
        rows = rest.rpc("kb_missing_embeddings",
                        {"p_project": project, "p_limit": batch})
        if not rows:
            return total
        # No-progress guard: if exactly the same (id, content_hash) rows come
        # back, the previous write stored nothing for them. Re-embedding the
        # identical batch would loop forever and re-send the content each time.
        batch_key = sorted((str(r["id"]), str(r.get("content_hash"))) for r in rows)
        if batch_key == previous_batch:
            return total
        previous_batch = batch_key
        texts = [r["content"] for r in rows]
        # Chunk CONTENT egresses to Gemini here — audit it (always) and honor the
        # fail-closed BLOCK switch (GEMINI_EGRESS_CLIENT_BLOCK=1). A blocked batch
        # raises EgressBlocked: skip it (chunks keep their NULL embeddings) and
        # stop — kb_missing_embeddings would just re-return the same rows, so
        # looping would spin forever. They re-embed once the block lifts
        # (best-effort contract, mirrors ingest/reconcile).
        # NOTE(review): the whole batch is audited under the FIRST non-empty
        # data_class found; a batch mixing classes is recorded under that one
        # class only — confirm this is intended.
        data_class = next((r.get("data_class") for r in rows if r.get("data_class")),
                          None) or "client"
        try:
            egress_audit.check_and_record(
                data_class=data_class, path="embeddings",
                byte_count=sum(len(t.encode("utf-8")) for t in texts), sink=sink)
        except egress_audit.EgressBlocked:
            return total
        vectors = embeddings.embed_texts(texts)
        # C3: carry content_hash so kb_set_embeddings only writes the vector when
        # the chunk's content is still the one we embedded (refuses to overwrite a
        # row a concurrent re-ingest has since replaced — no silent vector/content skew).
        n = rest.rpc("kb_set_embeddings", {"p": {"items": [
            {"id": r["id"], "embedding": embeddings.vector_literal(v),
             "content_hash": r.get("content_hash")}
            for r, v in zip(rows, vectors)]}})
        total += int(n or 0)
        if len(rows) < batch:
            return total
54
+
55
+
56
def main() -> int:
    """CLI entry point for the embedding backfill.

    Returns ``1`` without doing any work when ``embeddings.available()`` is
    false; otherwise runs ``backfill()`` with its defaults, prints the number
    of chunks written, and returns ``0``.
    """
    if embeddings.available():
        written = backfill()
        print(f"backfilled embeddings for {written} chunks")
        return 0
    print("GEMINI_API_KEY is not set — nothing to backfill with")
    return 1
63
+
64
+
65
if __name__ == "__main__":
    # main() prints a message containing a non-ASCII dash, so stdout is
    # switched to UTF-8 before anything is written.
    sys.stdout.reconfigure(encoding="utf-8")
    sys.exit(main())