provenex-core 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. provenex_core-0.1.0/LICENSE +21 -0
  2. provenex_core-0.1.0/PKG-INFO +229 -0
  3. provenex_core-0.1.0/README.md +195 -0
  4. provenex_core-0.1.0/provenex/__init__.py +49 -0
  5. provenex_core-0.1.0/provenex/cli/__init__.py +1 -0
  6. provenex_core-0.1.0/provenex/cli/main.py +207 -0
  7. provenex_core-0.1.0/provenex/core/__init__.py +1 -0
  8. provenex_core-0.1.0/provenex/core/fingerprinter.py +166 -0
  9. provenex_core-0.1.0/provenex/core/hasher.py +244 -0
  10. provenex_core-0.1.0/provenex/core/merkle.py +205 -0
  11. provenex_core-0.1.0/provenex/core/normalizer.py +143 -0
  12. provenex_core-0.1.0/provenex/core/receipt.py +414 -0
  13. provenex_core-0.1.0/provenex/index/__init__.py +1 -0
  14. provenex_core-0.1.0/provenex/index/base.py +195 -0
  15. provenex_core-0.1.0/provenex/index/merkle_sqlite_index.py +199 -0
  16. provenex_core-0.1.0/provenex/index/sqlite_index.py +318 -0
  17. provenex_core-0.1.0/provenex/integrations/__init__.py +1 -0
  18. provenex_core-0.1.0/provenex/integrations/langchain/__init__.py +16 -0
  19. provenex_core-0.1.0/provenex/integrations/langchain/ingestor.py +154 -0
  20. provenex_core-0.1.0/provenex/integrations/langchain/retriever.py +189 -0
  21. provenex_core-0.1.0/provenex/integrations/llamaindex/__init__.py +1 -0
  22. provenex_core-0.1.0/provenex/policy/__init__.py +1 -0
  23. provenex_core-0.1.0/provenex/policy/policy.py +120 -0
  24. provenex_core-0.1.0/provenex_core.egg-info/PKG-INFO +229 -0
  25. provenex_core-0.1.0/provenex_core.egg-info/SOURCES.txt +39 -0
  26. provenex_core-0.1.0/provenex_core.egg-info/dependency_links.txt +1 -0
  27. provenex_core-0.1.0/provenex_core.egg-info/entry_points.txt +2 -0
  28. provenex_core-0.1.0/provenex_core.egg-info/requires.txt +13 -0
  29. provenex_core-0.1.0/provenex_core.egg-info/top_level.txt +1 -0
  30. provenex_core-0.1.0/pyproject.toml +75 -0
  31. provenex_core-0.1.0/setup.cfg +4 -0
  32. provenex_core-0.1.0/setup.py +8 -0
  33. provenex_core-0.1.0/tests/test_fingerprinter.py +86 -0
  34. provenex_core-0.1.0/tests/test_hasher.py +121 -0
  35. provenex_core-0.1.0/tests/test_langchain_integration.py +145 -0
  36. provenex_core-0.1.0/tests/test_merkle.py +247 -0
  37. provenex_core-0.1.0/tests/test_merkle_sqlite_index.py +250 -0
  38. provenex_core-0.1.0/tests/test_normalizer.py +71 -0
  39. provenex_core-0.1.0/tests/test_policy.py +76 -0
  40. provenex_core-0.1.0/tests/test_receipt.py +290 -0
  41. provenex_core-0.1.0/tests/test_sqlite_index.py +185 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Provenex
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,229 @@
1
+ Metadata-Version: 2.4
2
+ Name: provenex-core
3
+ Version: 0.1.0
4
+ Summary: Cryptographic provenance verification for enterprise RAG pipelines
5
+ Author: Provenex
6
+ License: MIT
7
+ Project-URL: Homepage, https://provenex.ai
8
+ Project-URL: Repository, https://github.com/provenex/provenex-core
9
+ Project-URL: Documentation, https://provenex.ai/docs
10
+ Keywords: rag,provenance,ai,compliance,fingerprinting,langchain
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Security :: Cryptography
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Provides-Extra: langchain
24
+ Requires-Dist: langchain-core<0.4,>=0.3; extra == "langchain"
25
+ Provides-Extra: llamaindex
26
+ Requires-Dist: llama-index-core<0.13,>=0.10; extra == "llamaindex"
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=8.0; extra == "dev"
29
+ Requires-Dist: pytest-cov>=4.1; extra == "dev"
30
+ Requires-Dist: black>=24.0; extra == "dev"
31
+ Requires-Dist: ruff>=0.4; extra == "dev"
32
+ Requires-Dist: mypy>=1.10; extra == "dev"
33
+ Dynamic: license-file
34
+
35
+ # provenex-core
36
+
37
+ Cryptographic provenance verification for RAG pipelines. When an enterprise AI system answers a question, this is what proves which documents the answer came from, whether they were current and authorized, and that they weren't tampered with along the way.
38
+
39
+ This repository contains the open source core: fingerprinting, local SQLite index, receipt generation, LangChain integration. The algorithm is open so it can be audited. Hosted infrastructure, Bloom-filter acceleration, compliance-grade exports, and cross-enterprise provenance graphs are available separately at [provenex.ai](https://provenex.ai).
40
+
41
+ > **Note on terminology.** "Provenance" means several different things in the AI stack right now — training-data lineage, vector DB governance (Pinecone Nexus, Weaviate), retrieval verification, output faithfulness, generated-media credentials (C2PA). Provenex is the **retrieval verification** layer: cryptographic proof of which chunks reached the LLM, verifiable offline by anyone with the signing key, across any retriever. We've written up the full map in [Five Things People Mean by "AI Provenance"](https://provenex.ai/blog/five-things-ai-provenance).
42
+
43
+ ## Five-line integration
44
+
45
+ ```python
46
+ from provenex.integrations.langchain import ProvenexIngestor, ProvenexRetriever
47
+ from provenex.index.sqlite_index import SQLiteProvenanceIndex
48
+
49
+ index = SQLiteProvenanceIndex("provenance.db")
50
+ ingestor = ProvenexIngestor(index=index)
51
+
52
+ ingestor.ingest(documents, doc_id="policy_v4", authorized=True)
53
+
54
+ retriever = ProvenexRetriever(base_retriever=your_existing_retriever, index=index)
55
+ result = retriever.get_relevant_documents_with_receipt(query)
56
+ print(result.receipt.to_json())
57
+ ```
58
+
59
+ Your existing vector store is untouched. Provenex runs alongside as a parallel signed index. Whether you use **Pinecone, Weaviate, Milvus, Qdrant, Chroma, FAISS, pgvector, MongoDB Atlas Vector Search, Elasticsearch with vectors, Vespa, or a Postgres table you wrote yourself** — Provenex doesn't know and doesn't care. The integration surface is the retriever (LangChain today; LlamaIndex coming), not the database. `your_existing_retriever` keeps doing semantic similarity; Provenex adds cryptographic identity.
60
+
61
+ ## What a provenance receipt looks like
62
+
63
+ Every retrieval produces a JSON receipt that records exactly what went into the answer. Compliance teams hold onto it. Auditors verify it independently.
64
+
65
+ ```json
66
+ {
67
+ "receipt_id": "prx_f2de431dc125ccfc6b57e6ca327fa504",
68
+ "schema_version": "1.0.0",
69
+ "issued_at": "2026-05-08T14:32:07.441Z",
70
+ "issuer": "provenex-core/0.1.0",
71
+ "output": {
72
+ "hash": "sha256:6e9052525c80e43fb3612dce5edd025d350c8f0a1318097988ab4b0750c2f388",
73
+ "hash_algorithm": "sha256"
74
+ },
75
+ "sources": [
76
+ {
77
+ "chunk_index": 0,
78
+ "fingerprint": "sha256:1ebcde39...",
79
+ "document_id": "policy_v4",
80
+ "document_version": "sha256:1ebcde39...",
81
+ "ingested_at": "2026-04-01T09:00:00Z",
82
+ "chunk_offset": 0,
83
+ "chunk_length": 936,
84
+ "authorized": true,
85
+ "verification_outcome": "VERIFIED",
86
+ "normalization_applied": ["unicode_nfc", "strip_zero_width", "whitespace_collapse"]
87
+ }
88
+ ],
89
+ "policy": { "block_unauthorized": true, "block_tampered": true, "...": "..." },
90
+ "summary": { "total_chunks": 3, "verified": 2, "unverified": 1, "overall_status": "PARTIAL" },
91
+ "signature": { "algorithm": "hmac-sha256", "value": "fc5d40895ca2..." }
92
+ }
93
+ ```
94
+
95
+ Every retrieved chunk gets one of five verification outcomes:
96
+
97
+ | Outcome | Meaning |
98
+ | --- | --- |
99
+ | `VERIFIED` | Chunk in index, document current, authorized. |
100
+ | `STALE` | Chunk in index, but the document has been superseded by a newer version. |
101
+ | `UNAUTHORIZED` | Chunk in index, but the document is not authorized for this context. |
102
+ | `UNVERIFIED` | Chunk fingerprint not in index. It was never ingested through Provenex. |
103
+ | `TAMPERED` | Chunk in index but the stored signature failed verification. Alarm condition. |
104
+
105
+ The receipt is signed (HMAC-SHA256 by default; pluggable). Anyone with both the receipt and the signing key can verify that it has not been altered since it was issued.
106
+
107
+ ## How it works
108
+
109
+ Three components:
110
+
111
+ **1. Ingestion.** Documents are normalized (Unicode NFC, whitespace collapse, optional case folding, zero-width stripping) and run through a sliding window. Each window gets a Rabin-Karp rolling hash (base `1_000_003`, modulo Mersenne prime `2^61 - 1`) for cheap O(1) updates, strengthened with SHA-256 for collision-resistant identity. The fingerprints — not the document content — are written to the provenance index along with `document_id`, `document_version`, timestamp, and authorization state. The index never stores document text.
112
+
113
+ **2. Retrieval verification.** When your retriever returns chunks, Provenex re-fingerprints each one using the same normalization and hash pipeline, checks the fingerprint against the index, and assigns one of the five outcomes above. Configurable policy decides which outcomes block the chunk before it reaches the LLM.
114
+
115
+ **3. Receipt.** After verification, a JSON receipt is issued that records the chunks, their outcomes, the policy in effect, a SHA-256 of the LLM output, and a signature over the whole thing. The receipt is the artifact you keep.
116
+
117
+ See [`docs/how_it_works.md`](https://github.com/provenex/provenex-core/blob/main/docs/how_it_works.md) for the full algorithm, including the architectural distinction between fingerprint-based identity and embedding-based similarity. See [`docs/receipt_format.md`](https://github.com/provenex/provenex-core/blob/main/docs/receipt_format.md) for the schema spec.
118
+
119
+ ## How this fits alongside Pinecone Nexus, Weaviate, and other vector DBs
120
+
121
+ Vector databases store **semantic similarity** — dense embeddings that let you find content similar to a query. Provenex stores **cryptographic identity** — SHA-256 fingerprints that prove an exact match (after normalization) against a signed reference. These solve different problems and compose cleanly.
122
+
123
+ | | Vector DBs (Pinecone Nexus, Weaviate, Milvus, Qdrant, Chroma, FAISS, pgvector, ...) | Provenex |
124
+ | --- | --- | --- |
125
+ | Primary storage | Dense embeddings (semantic similarity) | SHA-256 fingerprints (cryptographic identity) |
126
+ | Retrieval | Approximate nearest neighbor over vectors | Bit-exact match against signed index |
127
+ | Tampering | Not detectable — embeddings are lossy by design | Detectable — any modification that survives normalization produces a different SHA-256 |
128
+ | Audit artifact | Vendor dashboard, internal logs | Signed JSON receipt, verifiable offline |
129
+ | Trust root | Vendor's SOC 2 attestation | HMAC signature, verifiable by anyone with the key |
130
+ | Vendor lock-in | Yes (per database) | None — works alongside any retriever |
131
+
132
+ The expected enterprise deployment is **both**: vector DB for retrieval performance and vendor governance, Provenex for cryptographic audit trails compliance teams can hand to a regulator. See [the blog post](https://provenex.ai/blog/five-things-ai-provenance) for the longer argument.
133
+
134
+ ### Why vendor-agnostic matters
135
+
136
+ Pinecone Nexus is governance inside Pinecone. Weaviate has its own governance stack. Milvus, Qdrant, Chroma, and the rest each have their own — or none. If you run Pinecone for one workload and Weaviate for another, you have two separate audit stories with two separate vendor trust roots, and no way to produce a single cryptographic record that says "this chunk, wherever it came from, is bit-exact identical to the one we authorized."
137
+
138
+ Provenex works the same way against all of them, because it never talks to the vector DB. It re-fingerprints the chunks the retriever returns, regardless of where they were stored. One signed index, one receipt schema, one verifiable artifact — across every retrieval path in the enterprise.
139
+
140
+ This also means **migration risk between vector DBs goes to zero.** If you decide to move from Pinecone to Weaviate, or from a managed service to something self-hosted, your provenance audit trail doesn't change. You re-ingest into the new vector DB; the Provenex index stays the same. Vector DB swaps are decoupled from compliance infrastructure.
141
+
142
+ The technical reason this works: Provenex's integration surface is the retriever (LangChain, LlamaIndex, custom Python), not the vector DB itself. As long as the retriever returns the chunk text the vector DB stored, Provenex can fingerprint it. We've smoke-tested against Chroma and FAISS in the examples; Pinecone, Weaviate, Milvus, Qdrant, and the rest are integration-trivial — a few lines of adapter code if you're not on a framework that already wraps them.
143
+
144
+ ## Install
145
+
146
+ We haven't shipped to PyPI yet — install directly from this repository:
147
+
148
+ ```bash
149
+ pip install git+https://github.com/provenex/provenex-core.git
150
+ pip install "provenex-core[langchain] @ git+https://github.com/provenex/provenex-core.git"
151
+ ```
152
+
153
+ Python 3.10+. The core has zero third-party dependencies — it's pure stdlib. LangChain and LlamaIndex are optional extras.
154
+
155
+ A PyPI release (`pip install provenex-core`) is coming once the API stabilizes. Pin to a commit hash in the meantime if you need a fixed version.
156
+
157
+ ### Try it in 30 seconds
158
+
159
+ ```bash
160
+ git clone https://github.com/provenex/provenex-core.git
161
+ cd provenex-core
162
+ pip install -e .
163
+
164
+ export PROVENEX_SIGNING_SECRET="$(python3 -c 'import secrets; print(secrets.token_hex(32))')"
165
+ python examples/standalone_demo.py
166
+ ```
167
+
168
+ `examples/standalone_demo.py` runs the full story end-to-end — ingest a document, get a signed receipt with a cryptographic inclusion proof, watch the HMAC catch a tampered row, then re-verify the proof **with the database deleted** using only the receipt fields and the published tree root. It's the demo we'd show a sceptical compliance team.
169
+
170
+ > Want a shareable asciicast? See [`docs/recording_demo.md`](https://github.com/provenex/provenex-core/blob/main/docs/recording_demo.md) for the asciinema recipe.
171
+
172
+ ## CLI
173
+
174
+ ```bash
175
+ provenex ingest --index prov.db --doc-id policy_v4 policy.txt
176
+ provenex verify --index prov.db retrieved_chunk.txt
177
+ provenex receipt --index prov.db --output llm_output.txt chunk1.txt chunk2.txt
178
+ ```
179
+
180
+ Set `PROVENEX_SIGNING_SECRET` in your environment. The `verify` command exits non-zero when the outcome is not `VERIFIED`, so it composes in shell pipelines.
181
+
182
+ ## Why open source?
183
+
184
+ Compliance teams won't trust a black box. If a regulator asks how your provenance system works, "it's proprietary" is not an answer. The algorithm — normalization, rolling hash, sliding window, SHA-256 strengthening, receipt schema, signature payload — needs to be auditable end to end. So it is. The commercial value is in the hosted infrastructure that runs this algorithm at scale across an enterprise, not in keeping the algorithm secret.
185
+
186
+ What's in this repo:
187
+
188
+ - Fingerprinting engine (normalizer + Rabin-Karp + SHA-256)
189
+ - Local SQLite provenance index with HMAC-signed rows
190
+ - Receipt generation and signature verification
191
+ - LangChain integration (retriever middleware + ingestor)
192
+ - CLI: `provenex ingest / verify / receipt`
193
+ - Python SDK (install from GitHub — see [Install](#install))
194
+
195
+ What's not in this repo (commercial features at provenex.ai):
196
+
197
+ - Hosted provenance index with distributed signed append-only storage
198
+ - Bloom-filter acceleration for high-throughput verification
199
+ - Compliance-grade export formats (PDF, JSON-LD for regulators)
200
+ - Cross-enterprise provenance graphs
201
+ - Inference attribution and temporal decay scoring
202
+ - Enterprise SSO / RBAC
203
+
204
+ The interface (`ProvenanceIndex`) is the same. Moving from open source to commercial is one line of code: the class you instantiate.
205
+
206
+ ## Privacy and data sovereignty
207
+
208
+ The index stores fingerprints — one-way SHA-256 hashes — and metadata. **No document content, no PII, no chunk text is ever written.** Anyone with the index can verify retrieval, but no one can recover document content from it.
209
+
210
+ ## License
211
+
212
+ MIT. See [LICENSE](https://github.com/provenex/provenex-core/blob/main/LICENSE).
213
+
214
+ ## Links
215
+
216
+ **Reading:**
217
+
218
+ - [Five Things People Mean by "AI Provenance" (And Which One Is For You)](https://provenex.ai/blog/five-things-ai-provenance) — the category map, and where Provenex sits
219
+ - [`docs/how_it_works.md`](https://github.com/provenex/provenex-core/blob/main/docs/how_it_works.md) — full algorithm, threat model, and architectural comparison to embedding-based systems
220
+ - [`docs/receipt_format.md`](https://github.com/provenex/provenex-core/blob/main/docs/receipt_format.md) — receipt schema specification
221
+ - [`docs/quickstart.md`](https://github.com/provenex/provenex-core/blob/main/docs/quickstart.md) — 5-minute getting-started
222
+ - [`docs/langchain_integration.md`](https://github.com/provenex/provenex-core/blob/main/docs/langchain_integration.md) — LangChain-specific patterns
223
+
224
+ **Project:**
225
+
226
+ - Homepage: [provenex.ai](https://provenex.ai)
227
+ - Issues and discussion: GitHub Issues on this repo
228
+ - Commercial features: contact via provenex.ai
229
+
@@ -0,0 +1,195 @@
1
+ # provenex-core
2
+
3
+ Cryptographic provenance verification for RAG pipelines. When an enterprise AI system answers a question, this is what proves which documents the answer came from, whether they were current and authorized, and that they weren't tampered with along the way.
4
+
5
+ This repository contains the open source core: fingerprinting, local SQLite index, receipt generation, LangChain integration. The algorithm is open so it can be audited. Hosted infrastructure, Bloom-filter acceleration, compliance-grade exports, and cross-enterprise provenance graphs are available separately at [provenex.ai](https://provenex.ai).
6
+
7
+ > **Note on terminology.** "Provenance" means several different things in the AI stack right now — training-data lineage, vector DB governance (Pinecone Nexus, Weaviate), retrieval verification, output faithfulness, generated-media credentials (C2PA). Provenex is the **retrieval verification** layer: cryptographic proof of which chunks reached the LLM, verifiable offline by anyone with the signing key, across any retriever. We've written up the full map in [Five Things People Mean by "AI Provenance"](https://provenex.ai/blog/five-things-ai-provenance).
8
+
9
+ ## Five-line integration
10
+
11
+ ```python
12
+ from provenex.integrations.langchain import ProvenexIngestor, ProvenexRetriever
13
+ from provenex.index.sqlite_index import SQLiteProvenanceIndex
14
+
15
+ index = SQLiteProvenanceIndex("provenance.db")
16
+ ingestor = ProvenexIngestor(index=index)
17
+
18
+ ingestor.ingest(documents, doc_id="policy_v4", authorized=True)
19
+
20
+ retriever = ProvenexRetriever(base_retriever=your_existing_retriever, index=index)
21
+ result = retriever.get_relevant_documents_with_receipt(query)
22
+ print(result.receipt.to_json())
23
+ ```
24
+
25
+ Your existing vector store is untouched. Provenex runs alongside as a parallel signed index. Whether you use **Pinecone, Weaviate, Milvus, Qdrant, Chroma, FAISS, pgvector, MongoDB Atlas Vector Search, Elasticsearch with vectors, Vespa, or a Postgres table you wrote yourself** — Provenex doesn't know and doesn't care. The integration surface is the retriever (LangChain today; LlamaIndex coming), not the database. `your_existing_retriever` keeps doing semantic similarity; Provenex adds cryptographic identity.
26
+
27
+ ## What a provenance receipt looks like
28
+
29
+ Every retrieval produces a JSON receipt that records exactly what went into the answer. Compliance teams hold onto it. Auditors verify it independently.
30
+
31
+ ```json
32
+ {
33
+ "receipt_id": "prx_f2de431dc125ccfc6b57e6ca327fa504",
34
+ "schema_version": "1.0.0",
35
+ "issued_at": "2026-05-08T14:32:07.441Z",
36
+ "issuer": "provenex-core/0.1.0",
37
+ "output": {
38
+ "hash": "sha256:6e9052525c80e43fb3612dce5edd025d350c8f0a1318097988ab4b0750c2f388",
39
+ "hash_algorithm": "sha256"
40
+ },
41
+ "sources": [
42
+ {
43
+ "chunk_index": 0,
44
+ "fingerprint": "sha256:1ebcde39...",
45
+ "document_id": "policy_v4",
46
+ "document_version": "sha256:1ebcde39...",
47
+ "ingested_at": "2026-04-01T09:00:00Z",
48
+ "chunk_offset": 0,
49
+ "chunk_length": 936,
50
+ "authorized": true,
51
+ "verification_outcome": "VERIFIED",
52
+ "normalization_applied": ["unicode_nfc", "strip_zero_width", "whitespace_collapse"]
53
+ }
54
+ ],
55
+ "policy": { "block_unauthorized": true, "block_tampered": true, "...": "..." },
56
+ "summary": { "total_chunks": 3, "verified": 2, "unverified": 1, "overall_status": "PARTIAL" },
57
+ "signature": { "algorithm": "hmac-sha256", "value": "fc5d40895ca2..." }
58
+ }
59
+ ```
60
+
61
+ Every retrieved chunk gets one of five verification outcomes:
62
+
63
+ | Outcome | Meaning |
64
+ | --- | --- |
65
+ | `VERIFIED` | Chunk in index, document current, authorized. |
66
+ | `STALE` | Chunk in index, but the document has been superseded by a newer version. |
67
+ | `UNAUTHORIZED` | Chunk in index, but the document is not authorized for this context. |
68
+ | `UNVERIFIED` | Chunk fingerprint not in index. It was never ingested through Provenex. |
69
+ | `TAMPERED` | Chunk in index but the stored signature failed verification. Alarm condition. |
70
+
71
+ The receipt is signed (HMAC-SHA256 by default; pluggable). Anyone with both the receipt and the signing key can verify that it has not been altered since it was issued.
72
+
73
+ ## How it works
74
+
75
+ Three components:
76
+
77
+ **1. Ingestion.** Documents are normalized (Unicode NFC, whitespace collapse, optional case folding, zero-width stripping) and run through a sliding window. Each window gets a Rabin-Karp rolling hash (base `1_000_003`, modulo Mersenne prime `2^61 - 1`) for cheap O(1) updates, strengthened with SHA-256 for collision-resistant identity. The fingerprints — not the document content — are written to the provenance index along with `document_id`, `document_version`, timestamp, and authorization state. The index never stores document text.
78
+
79
+ **2. Retrieval verification.** When your retriever returns chunks, Provenex re-fingerprints each one using the same normalization and hash pipeline, checks the fingerprint against the index, and assigns one of the five outcomes above. Configurable policy decides which outcomes block the chunk before it reaches the LLM.
80
+
81
+ **3. Receipt.** After verification, a JSON receipt is issued that records the chunks, their outcomes, the policy in effect, a SHA-256 of the LLM output, and a signature over the whole thing. The receipt is the artifact you keep.
82
+
83
+ See [`docs/how_it_works.md`](https://github.com/provenex/provenex-core/blob/main/docs/how_it_works.md) for the full algorithm, including the architectural distinction between fingerprint-based identity and embedding-based similarity. See [`docs/receipt_format.md`](https://github.com/provenex/provenex-core/blob/main/docs/receipt_format.md) for the schema spec.
84
+
85
+ ## How this fits alongside Pinecone Nexus, Weaviate, and other vector DBs
86
+
87
+ Vector databases store **semantic similarity** — dense embeddings that let you find content similar to a query. Provenex stores **cryptographic identity** — SHA-256 fingerprints that prove an exact match (after normalization) against a signed reference. These solve different problems and compose cleanly.
88
+
89
+ | | Vector DBs (Pinecone Nexus, Weaviate, Milvus, Qdrant, Chroma, FAISS, pgvector, ...) | Provenex |
90
+ | --- | --- | --- |
91
+ | Primary storage | Dense embeddings (semantic similarity) | SHA-256 fingerprints (cryptographic identity) |
92
+ | Retrieval | Approximate nearest neighbor over vectors | Bit-exact match against signed index |
93
+ | Tampering | Not detectable — embeddings are lossy by design | Detectable — any modification that survives normalization produces a different SHA-256 |
94
+ | Audit artifact | Vendor dashboard, internal logs | Signed JSON receipt, verifiable offline |
95
+ | Trust root | Vendor's SOC 2 attestation | HMAC signature, verifiable by anyone with the key |
96
+ | Vendor lock-in | Yes (per database) | None — works alongside any retriever |
97
+
98
+ The expected enterprise deployment is **both**: vector DB for retrieval performance and vendor governance, Provenex for cryptographic audit trails compliance teams can hand to a regulator. See [the blog post](https://provenex.ai/blog/five-things-ai-provenance) for the longer argument.
99
+
100
+ ### Why vendor-agnostic matters
101
+
102
+ Pinecone Nexus is governance inside Pinecone. Weaviate has its own governance stack. Milvus, Qdrant, Chroma, and the rest each have their own — or none. If you run Pinecone for one workload and Weaviate for another, you have two separate audit stories with two separate vendor trust roots, and no way to produce a single cryptographic record that says "this chunk, wherever it came from, is bit-exact identical to the one we authorized."
103
+
104
+ Provenex works the same way against all of them, because it never talks to the vector DB. It re-fingerprints the chunks the retriever returns, regardless of where they were stored. One signed index, one receipt schema, one verifiable artifact — across every retrieval path in the enterprise.
105
+
106
+ This also means **migration risk between vector DBs goes to zero.** If you decide to move from Pinecone to Weaviate, or from a managed service to something self-hosted, your provenance audit trail doesn't change. You re-ingest into the new vector DB; the Provenex index stays the same. Vector DB swaps are decoupled from compliance infrastructure.
107
+
108
+ The technical reason this works: Provenex's integration surface is the retriever (LangChain, LlamaIndex, custom Python), not the vector DB itself. As long as the retriever returns the chunk text the vector DB stored, Provenex can fingerprint it. We've smoke-tested against Chroma and FAISS in the examples; Pinecone, Weaviate, Milvus, Qdrant, and the rest are integration-trivial — a few lines of adapter code if you're not on a framework that already wraps them.
109
+
110
+ ## Install
111
+
112
+ We haven't shipped to PyPI yet — install directly from this repository:
113
+
114
+ ```bash
115
+ pip install git+https://github.com/provenex/provenex-core.git
116
+ pip install "provenex-core[langchain] @ git+https://github.com/provenex/provenex-core.git"
117
+ ```
118
+
119
+ Python 3.10+. The core has zero third-party dependencies — it's pure stdlib. LangChain and LlamaIndex are optional extras.
120
+
121
+ A PyPI release (`pip install provenex-core`) is coming once the API stabilizes. Pin to a commit hash in the meantime if you need a fixed version.
122
+
123
+ ### Try it in 30 seconds
124
+
125
+ ```bash
126
+ git clone https://github.com/provenex/provenex-core.git
127
+ cd provenex-core
128
+ pip install -e .
129
+
130
+ export PROVENEX_SIGNING_SECRET="$(python3 -c 'import secrets; print(secrets.token_hex(32))')"
131
+ python examples/standalone_demo.py
132
+ ```
133
+
134
+ `examples/standalone_demo.py` runs the full story end-to-end — ingest a document, get a signed receipt with a cryptographic inclusion proof, watch the HMAC catch a tampered row, then re-verify the proof **with the database deleted** using only the receipt fields and the published tree root. It's the demo we'd show a sceptical compliance team.
135
+
136
+ > Want a shareable asciicast? See [`docs/recording_demo.md`](https://github.com/provenex/provenex-core/blob/main/docs/recording_demo.md) for the asciinema recipe.
137
+
138
+ ## CLI
139
+
140
+ ```bash
141
+ provenex ingest --index prov.db --doc-id policy_v4 policy.txt
142
+ provenex verify --index prov.db retrieved_chunk.txt
143
+ provenex receipt --index prov.db --output llm_output.txt chunk1.txt chunk2.txt
144
+ ```
145
+
146
+ Set `PROVENEX_SIGNING_SECRET` in your environment. The `verify` command exits non-zero when the outcome is not `VERIFIED`, so it composes in shell pipelines.
147
+
148
+ ## Why open source?
149
+
150
+ Compliance teams won't trust a black box. If a regulator asks how your provenance system works, "it's proprietary" is not an answer. The algorithm — normalization, rolling hash, sliding window, SHA-256 strengthening, receipt schema, signature payload — needs to be auditable end to end. So it is. The commercial value is in the hosted infrastructure that runs this algorithm at scale across an enterprise, not in keeping the algorithm secret.
151
+
152
+ What's in this repo:
153
+
154
+ - Fingerprinting engine (normalizer + Rabin-Karp + SHA-256)
155
+ - Local SQLite provenance index with HMAC-signed rows
156
+ - Receipt generation and signature verification
157
+ - LangChain integration (retriever middleware + ingestor)
158
+ - CLI: `provenex ingest / verify / receipt`
159
+ - Python SDK (install from GitHub — see [Install](#install))
160
+
161
+ What's not in this repo (commercial features at provenex.ai):
162
+
163
+ - Hosted provenance index with distributed signed append-only storage
164
+ - Bloom-filter acceleration for high-throughput verification
165
+ - Compliance-grade export formats (PDF, JSON-LD for regulators)
166
+ - Cross-enterprise provenance graphs
167
+ - Inference attribution and temporal decay scoring
168
+ - Enterprise SSO / RBAC
169
+
170
+ The interface (`ProvenanceIndex`) is the same. Moving from open source to commercial is one line of code: the class you instantiate.
171
+
172
+ ## Privacy and data sovereignty
173
+
174
+ The index stores fingerprints — one-way SHA-256 hashes — and metadata. **No document content, no PII, no chunk text is ever written.** Anyone with the index can verify retrieval, but no one can recover document content from it.
175
+
176
+ ## License
177
+
178
+ MIT. See [LICENSE](https://github.com/provenex/provenex-core/blob/main/LICENSE).
179
+
180
+ ## Links
181
+
182
+ **Reading:**
183
+
184
+ - [Five Things People Mean by "AI Provenance" (And Which One Is For You)](https://provenex.ai/blog/five-things-ai-provenance) — the category map, and where Provenex sits
185
+ - [`docs/how_it_works.md`](https://github.com/provenex/provenex-core/blob/main/docs/how_it_works.md) — full algorithm, threat model, and architectural comparison to embedding-based systems
186
+ - [`docs/receipt_format.md`](https://github.com/provenex/provenex-core/blob/main/docs/receipt_format.md) — receipt schema specification
187
+ - [`docs/quickstart.md`](https://github.com/provenex/provenex-core/blob/main/docs/quickstart.md) — 5-minute getting-started
188
+ - [`docs/langchain_integration.md`](https://github.com/provenex/provenex-core/blob/main/docs/langchain_integration.md) — LangChain-specific patterns
189
+
190
+ **Project:**
191
+
192
+ - Homepage: [provenex.ai](https://provenex.ai)
193
+ - Issues and discussion: GitHub Issues on this repo
194
+ - Commercial features: contact via provenex.ai
195
+
@@ -0,0 +1,49 @@
1
+ """Provenex — cryptographic provenance verification for enterprise RAG.
2
+
3
+ This is the open source core: fingerprinting, local SQLite index, receipt
4
+ generation, and LangChain integration (LlamaIndex planned). The algorithm is open so
5
+ enterprises can audit it. Hosted infrastructure, Bloom-filter acceleration,
6
+ compliance-grade exports, and cross-enterprise provenance graphs are
7
+ available separately — see https://provenex.ai.
8
+ """
9
+
10
+ from .core.fingerprinter import Fingerprinter, FingerprinterConfig, Fingerprint
11
+ from .core.hasher import sha256_fingerprint
12
+ from .core.normalizer import NormalizationOptions, TextNormalizer
13
+ from .core.receipt import (
14
+ HmacSha256Signer,
15
+ ProvenanceReceipt,
16
+ ReceiptBuilder,
17
+ ReceiptSigner,
18
+ verify_receipt_signature,
19
+ )
20
+ from .index.base import IndexEntry, ProvenanceIndex, VerificationOutcome
21
+ from .index.sqlite_index import SQLiteProvenanceIndex
22
+ from .policy.policy import VerificationPolicy, overall_status
23
+
24
+ __version__ = "0.1.0"
25
+
26
+ __all__ = [
27
+ # Core
28
+ "Fingerprinter",
29
+ "FingerprinterConfig",
30
+ "Fingerprint",
31
+ "NormalizationOptions",
32
+ "TextNormalizer",
33
+ "sha256_fingerprint",
34
+ # Index
35
+ "IndexEntry",
36
+ "ProvenanceIndex",
37
+ "SQLiteProvenanceIndex",
38
+ "VerificationOutcome",
39
+ # Policy
40
+ "VerificationPolicy",
41
+ "overall_status",
42
+ # Receipt
43
+ "ProvenanceReceipt",
44
+ "ReceiptBuilder",
45
+ "ReceiptSigner",
46
+ "HmacSha256Signer",
47
+ "verify_receipt_signature",
48
+ "__version__",
49
+ ]
@@ -0,0 +1 @@
1
+ """Provenex CLI."""