vectorshield 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. vectorshield-0.0.1/LICENSE +21 -0
  2. vectorshield-0.0.1/MANIFEST.in +17 -0
  3. vectorshield-0.0.1/PKG-INFO +474 -0
  4. vectorshield-0.0.1/README.md +436 -0
  5. vectorshield-0.0.1/pyproject.toml +63 -0
  6. vectorshield-0.0.1/setup.cfg +4 -0
  7. vectorshield-0.0.1/setup.py +3 -0
  8. vectorshield-0.0.1/vectorshield/__init__.py +130 -0
  9. vectorshield-0.0.1/vectorshield/cli.py +105 -0
  10. vectorshield-0.0.1/vectorshield/config/__init__.py +16 -0
  11. vectorshield-0.0.1/vectorshield/config/defaults.py +100 -0
  12. vectorshield-0.0.1/vectorshield/config/loader.py +119 -0
  13. vectorshield-0.0.1/vectorshield/implementations/__init__.py +41 -0
  14. vectorshield-0.0.1/vectorshield/implementations/cache/__init__.py +10 -0
  15. vectorshield-0.0.1/vectorshield/implementations/cache/generic.py +112 -0
  16. vectorshield-0.0.1/vectorshield/implementations/cache/memory.py +120 -0
  17. vectorshield-0.0.1/vectorshield/implementations/cache/redis.py +179 -0
  18. vectorshield-0.0.1/vectorshield/implementations/embedding/__init__.py +7 -0
  19. vectorshield-0.0.1/vectorshield/implementations/embedding/fastembed.py +97 -0
  20. vectorshield-0.0.1/vectorshield/implementations/embedding/generic.py +101 -0
  21. vectorshield-0.0.1/vectorshield/implementations/tracker/__init__.py +10 -0
  22. vectorshield-0.0.1/vectorshield/implementations/tracker/mlflow.py +170 -0
  23. vectorshield-0.0.1/vectorshield/implementations/tracking/__init__.py +10 -0
  24. vectorshield-0.0.1/vectorshield/implementations/tracking/mlflow.py +170 -0
  25. vectorshield-0.0.1/vectorshield/implementations/vectorstore/__init__.py +11 -0
  26. vectorshield-0.0.1/vectorshield/implementations/vectorstore/chroma.py +193 -0
  27. vectorshield-0.0.1/vectorshield/implementations/vectorstore/generic.py +120 -0
  28. vectorshield-0.0.1/vectorshield/interfaces/__init__.py +18 -0
  29. vectorshield-0.0.1/vectorshield/interfaces/cache.py +58 -0
  30. vectorshield-0.0.1/vectorshield/interfaces/embedder.py +48 -0
  31. vectorshield-0.0.1/vectorshield/interfaces/tracker.py +67 -0
  32. vectorshield-0.0.1/vectorshield/interfaces/vectorstore.py +71 -0
  33. vectorshield-0.0.1/vectorshield/metrics/__init__.py +11 -0
  34. vectorshield-0.0.1/vectorshield/metrics/stats.py +218 -0
  35. vectorshield-0.0.1/vectorshield/pipeline/__init__.py +11 -0
  36. vectorshield-0.0.1/vectorshield/pipeline/ingest.py +433 -0
  37. vectorshield-0.0.1/vectorshield/privacy/__init__.py +15 -0
  38. vectorshield-0.0.1/vectorshield/privacy/pii_detector.py +133 -0
  39. vectorshield-0.0.1/vectorshield/privacy/policy.py +140 -0
  40. vectorshield-0.0.1/vectorshield/privacy/redactor.py +112 -0
  41. vectorshield-0.0.1/vectorshield/registry/__init__.py +9 -0
  42. vectorshield-0.0.1/vectorshield/registry/loader.py +520 -0
  43. vectorshield-0.0.1/vectorshield/utils/__init__.py +17 -0
  44. vectorshield-0.0.1/vectorshield/utils/hashing.py +60 -0
  45. vectorshield-0.0.1/vectorshield/utils/timing.py +122 -0
  46. vectorshield-0.0.1/vectorshield.egg-info/PKG-INFO +474 -0
  47. vectorshield-0.0.1/vectorshield.egg-info/SOURCES.txt +49 -0
  48. vectorshield-0.0.1/vectorshield.egg-info/dependency_links.txt +1 -0
  49. vectorshield-0.0.1/vectorshield.egg-info/entry_points.txt +2 -0
  50. vectorshield-0.0.1/vectorshield.egg-info/requires.txt +17 -0
  51. vectorshield-0.0.1/vectorshield.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Baihela Abid Hussain
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,17 @@
1
+ include README.md
2
+ include LICENSE*
3
+
4
+ recursive-include vectorshield *.py
5
+ recursive-include vectorshield *.typed
6
+
7
+ prune backend
8
+ prune data
9
+ prune dist
10
+ prune vectorshield_data
11
+ prune venv
12
+
13
+ global-exclude *.py[cod]
14
+ global-exclude __pycache__
15
+ global-exclude *.so
16
+ exclude mlflow.db
17
+ exclude requirements-backend.txt
@@ -0,0 +1,474 @@
1
+ Metadata-Version: 2.4
2
+ Name: vectorshield
3
+ Version: 0.0.1
4
+ Summary: Privacy-First Vector Database for Sensitive Data
5
+ Author-email: Shwetan Londhe <shwetan.college@gmail.com>, Varad Limbkar <varadlimbkar@gmail.com>, Baihela Husain <baihelahusain@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/vectorshield/vectorshield
8
+ Project-URL: Repository, https://github.com/vectorshield/vectorshield
9
+ Project-URL: Documentation, https://github.com/vectorshield/vectorshield#documentation
10
+ Project-URL: Issues, https://github.com/vectorshield/vectorshield/issues
11
+ Keywords: pii,privacy,vector-database,embedding,rag
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Requires-Python: >=3.8
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: presidio-analyzer>=2.2.0
24
+ Requires-Dist: presidio-anonymizer>=2.2.0
25
+ Requires-Dist: fastembed>=0.2.0
26
+ Requires-Dist: chromadb>=0.4.0
27
+ Requires-Dist: pydantic>=2.0.0
28
+ Requires-Dist: pyyaml>=6.0
29
+ Requires-Dist: numpy>=1.24.0
30
+ Provides-Extra: redis
31
+ Requires-Dist: redis[async]>=5.0.0; extra == "redis"
32
+ Provides-Extra: mlflow
33
+ Requires-Dist: mlflow>=2.10.0; extra == "mlflow"
34
+ Provides-Extra: all
35
+ Requires-Dist: redis[async]>=5.0.0; extra == "all"
36
+ Requires-Dist: mlflow>=2.10.0; extra == "all"
37
+ Dynamic: license-file
38
+
39
+ # VectorShield 🛡️
40
+
41
+ **Privacy-Preserving RAG Middleware — Stop PII from reaching your vector database.**
42
+
43
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
44
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
45
+ [![PyPI version](https://img.shields.io/pypi/v/vectorshield.svg)](https://pypi.org/project/vectorshield/)
46
+
47
+ ---
48
+
49
+ ## What is VectorShield?
50
+
51
+ VectorShield is a backend middleware layer that sits between your application and your vector database. It automatically detects and redacts Personally Identifiable Information (PII) and Protected Health Information (PHI) **before** embeddings and documents are stored — ensuring that raw sensitive data never reaches persistent storage.
52
+
53
+ Privacy is enforced at the **data ingestion layer**, not at the LLM output layer.
54
+
55
+ ```
56
+ Your App → VectorShield → ChromaDB / Pinecone / Weaviate
57
+
58
+ PII detected & redacted
59
+ Embeddings generated from original (semantic quality preserved)
60
+ Only sanitised text stored
61
+ ```
62
+
63
+ ### The Core Problem It Solves
64
+
65
+ RAG systems retrieve and store documents in vector databases. If those documents contain names, phone numbers, SSNs, emails, or medical data, that information becomes **permanently embedded** in your vector store — queryable, retrievable, and at risk. VectorShield intercepts this data before it ever lands.
66
+
67
+ ---
68
+
69
+ ## Key Features
70
+
71
+ - **Parallel PII detection + embedding** — Both run concurrently using `asyncio.gather` to minimise latency overhead
72
+ - **Semantic quality preserved** — Embeddings are generated from the *original* text before redaction; only sanitised text is stored
73
+ - **Redis-backed caching** — PII detection results are cached to eliminate redundant processing of repeated text
74
+ - **Pluggable architecture** — Swap out any component (embedder, vector store, cache, tracker) with your own implementation
75
+ - **Zero vendor lock-in** — Works with OpenAI, Cohere, Sentence-Transformers, Pinecone, Weaviate, Qdrant, Memcached, and more
76
+ - **9 PII entity types** out of the box (PERSON, PHONE_NUMBER, EMAIL_ADDRESS, US_SSN, CREDIT_CARD, LOCATION, DATE_TIME, MEDICAL_LICENSE, US_PASSPORT)
77
+ - **Metrics & experiment tracking** — Optional MLflow integration for latency, cache hit rate, and PII entity counts
78
+ - **Async-first** — Built for FastAPI and high-throughput batch ingestion
79
+
80
+ ---
81
+
82
+ ## Architecture
83
+
84
+ ```
85
+ ┌─────────────────────────────────────────────────┐
86
+ │ IngestionPipeline │
87
+ │ │
88
+ │ Document → [Cache Check] │
89
+ │ ├─ HIT → reuse clean_text │
90
+ │ └─ MISS → parallel processing: │
91
+ │ ├─ PII Detection │
92
+ │ └─ Embedding │
93
+ │ ↓ │
94
+ │ Redaction → clean_text │
95
+ │ ↓ │
96
+ │ VectorStore.add_vectors( │
97
+ │ embedding=from original, │ ← semantic quality
98
+ │ document=clean_text │ ← privacy safe
99
+ │ ) │
100
+ └─────────────────────────────────────────────────┘
101
+ ```
102
+
103
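+ **Privacy guarantee:** The embedding captures the full semantic meaning of the original text, while the stored document contains only the redacted version, so detected PII is never written to the vector store as text. Detection is probabilistic and embeddings may still encode some information about the original text — see "Limitations & Honest Caveats" below.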
104
+
105
+ ---
106
+
107
+ ## Installation
108
+
109
+ ```bash
110
+ pip install vectorshield
111
+ ```
112
+
113
+ **With optional extras:**
114
+
115
+ ```bash
116
+ # Redis caching support
117
+ pip install vectorshield[redis]
118
+
119
+ # MLflow experiment tracking
120
+ pip install vectorshield[mlflow]
121
+
122
+ # All optional dependencies
123
+ pip install vectorshield[all]
124
+ ```
125
+
126
+ **System requirements:**
127
+ - Python 3.9+
128
+ - FastEmbed downloads BAAI/bge-small-en-v1.5 (~130MB) on first run
129
+ - ChromaDB requires SQLite 3.35+ (standard on Ubuntu 22.04+, macOS 12+)
130
+
131
+ ---
132
+
133
+ ## Quickstart
134
+
135
+ ```python
136
+ import asyncio
137
+ from vectorshield import IngestionPipeline, Document
138
+
139
+ async def main():
140
+ # Initialise with defaults (memory cache + ChromaDB + FastEmbed)
141
+ pipeline = IngestionPipeline()
142
+ await pipeline.initialize()
143
+
144
+ # Process a document containing PII
145
+ doc = Document(
146
+ id="doc_001",
147
+ text="Please call John Smith at 555-867-5309 or email john@example.com",
148
+ metadata={"source": "hr_system", "category": "employee"}
149
+ )
150
+
151
+ result = await pipeline.process_document(doc)
152
+
153
+ print(result.clean_text)
154
+ # → "Please call <PERSON> at <PHONE_NUMBER> or email <EMAIL_ADDRESS>"
155
+
156
+ print(result.pii_entities)
157
+ # → ["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS"]
158
+
159
+ print(result.embedding_preview)
160
+ # → [0.023, -0.014, 0.091, ...] (384-dim from original text)
161
+
162
+ await pipeline.shutdown()
163
+
164
+ asyncio.run(main())
165
+ ```
166
+
167
+ ---
168
+
169
+ ## Batch Processing
170
+
171
+ ```python
172
+ async def batch_example():
173
+ pipeline = IngestionPipeline()
174
+ await pipeline.initialize()
175
+
176
+ documents = [
177
+ Document(id="1", text="Patient Jane Doe, SSN: 123-45-6789, admitted 12 Jan 2024"),
178
+ Document(id="2", text="Invoice for Robert Johnson, card ending 4532 1234 5678 9012"),
179
+ Document(id="3", text="Meeting notes from Q4 planning — no PII here"),
180
+ ]
181
+
182
+ result = await pipeline.process_batch(documents)
183
+
184
+ print(f"Processed: {result['total_processed']}")
185
+ print(f"Time: {result['total_time_seconds']}s")
186
+
187
+ for r in result["results"]:
188
+ print(f"[{r['id']}] PII found: {r['pii_found']}")
189
+
190
+ await pipeline.shutdown()
191
+ ```
192
+
193
+ ---
194
+
195
+ ## Configuration
196
+
197
+ VectorShield ships with safe defaults. Override only what you need:
198
+
199
+ ```python
200
+ config = {
201
+ # Cache
202
+ "cache_backend": "redis", # "memory" | "redis" | "none"
203
+ "redis_host": "localhost",
204
+ "redis_port": 6379,
205
+ "cache_ttl": 3600,
206
+
207
+ # Embeddings
208
+ "embedding_model": "BAAI/bge-small-en-v1.5",
209
+
210
+ # Vector store
211
+ "vectorstore_backend": "chroma",
212
+ "vectorstore_path": "./my_data",
213
+ "vectorstore_collection": "documents",
214
+
215
+ # Processing
216
+ "parallel_processing": True,
217
+ "max_batch_size": 100,
218
+
219
+ # Tracking (optional)
220
+ "tracking_enabled": True,
221
+ "tracking_backend": "mlflow",
222
+ "mlflow_tracking_uri": "http://localhost:5000",
223
+ }
224
+
225
+ pipeline = IngestionPipeline(config=config)
226
+ ```
227
+
228
+ Or keep the settings in a YAML file and pass it to the CLI with `-c`:
229
+
230
+ ```python
231
+ pipeline = IngestionPipeline()
232
+ # In CLI:
233
+ # vectorshield process -i docs.json -c config.yaml
234
+ ```
235
+
236
+ ---
237
+
238
+ ## Custom Components
239
+
240
+ VectorShield's pluggable architecture lets you integrate any backend.
241
+
242
+ ### Custom Embedder (OpenAI)
243
+
244
+ ```python
245
+ from openai import OpenAI
246
+
247
+ client = OpenAI(api_key="sk-...")
248
+
249
+ pipeline = IngestionPipeline(
250
+ embed_func=lambda text: client.embeddings.create(
251
+ input=text,
252
+ model="text-embedding-3-small"
253
+ ).data[0].embedding
254
+ )
255
+ ```
256
+
257
+ ### Custom Cache (Existing Redis Client)
258
+
259
+ ```python
260
+ import redis, json
261
+
262
+ r = redis.Redis(host="localhost")
263
+
264
+ pipeline = IngestionPipeline(
265
+ cache_get_func=lambda k: json.loads(r.get(k)) if r.get(k) else None,
266
+ cache_set_func=lambda k, v, ttl: r.setex(k, ttl or 3600, json.dumps(v)),
267
+ )
268
+ ```
269
+
270
+ ### Custom Vector Store (Pinecone)
271
+
272
+ ```python
273
+ from pinecone import Pinecone
274
+
275
+ pc = Pinecone(api_key="...")
276
+ index = pc.Index("my-index")
277
+
278
+ pipeline = IngestionPipeline(
279
+ add_vectors_func=lambda ids, embs, docs, meta: index.upsert(
280
+ vectors=[(id_, emb, {"text": doc, **(m or {})})
281
+ for id_, emb, doc, m in zip(ids, embs, docs, meta or [{}]*len(ids))]
282
+ ),
283
+ search_func=lambda emb, k, filters: [
284
+ {"id": m["id"], "document": m["metadata"]["text"], "score": m["score"]}
285
+ for m in index.query(vector=emb, top_k=k, filter=filters).matches
286
+ ]
287
+ )
288
+ ```
289
+
290
+ ---
291
+
292
+ ## CLI Usage
293
+
294
+ ```bash
295
+ # Process documents from a JSON file
296
+ vectorshield process -i documents.json -o results.json
297
+
298
+ # With config and stats
299
+ vectorshield process -i documents.json -c config.yaml --stats
300
+
301
+ # Read from stdin
302
+ echo '{"documents": [{"id": "1", "text": "Call me at 555-0100"}]}' | vectorshield process
303
+
304
+ # Check version
305
+ vectorshield version
306
+ ```
307
+
308
+ **Input JSON format:**
309
+
310
+ ```json
311
+ {
312
+ "documents": [
313
+ {"id": "doc_1", "text": "Contact Alice Brown at alice@corp.com", "metadata": {"source": "email"}},
314
+ {"id": "doc_2", "text": "Quarterly revenue report — no PII", "metadata": {"source": "finance"}}
315
+ ]
316
+ }
317
+ ```
318
+
319
+ ---
320
+
321
+ ## Metrics & Monitoring
322
+
323
+ ```python
324
+ # Get metrics after processing
325
+ metrics = pipeline.get_metrics()
326
+
327
+ print(metrics["overview"])
328
+ # → {"total_documents": 50, "successful": 49, "failed": 1, "success_rate": "98.00%"}
329
+
330
+ print(metrics["privacy"])
331
+ # → {"total_pii_entities_redacted": 87, "unique_pii_types": ["PERSON", "EMAIL_ADDRESS", ...]}
332
+
333
+ print(metrics["performance"])
334
+ # → {"avg_latency_ms": "142.35", "min_latency_ms": "98.12", "max_latency_ms": "312.44"}
335
+
336
+ print(metrics["caching"])
337
+ # → {"cache_hits": 23, "cache_misses": 27, "cache_hit_rate": "46.00%"}
338
+ ```
339
+
340
+ ---
341
+
342
+ ## PII Entity Types
343
+
344
+ | Entity | Example Input | Redacted Output |
345
+ |--------|--------------|-----------------|
346
+ | PERSON | `John Smith` | `<PERSON>` |
347
+ | PHONE_NUMBER | `555-867-5309` | `<PHONE_NUMBER>` |
348
+ | EMAIL_ADDRESS | `john@example.com` | `<EMAIL_ADDRESS>` |
349
+ | CREDIT_CARD | `4532 1234 5678 9012` | `<CREDIT_CARD>` |
350
+ | US_SSN | `123-45-6789` | `<US_SSN>` |
351
+ | LOCATION | `London, UK` | `<LOCATION>` |
352
+ | DATE_TIME | `12 January 2024` | `<DATE_TIME>` |
353
+ | MEDICAL_LICENSE | `MD-12345` | `<MEDICAL_LICENSE>` |
354
+ | US_PASSPORT | `A12345678` | `<US_PASSPORT>` |
355
+
356
+ PII detection is powered by [Microsoft Presidio](https://microsoft.github.io/presidio/).
357
+
358
+ ---
359
+
360
+ ## Limitations & Honest Caveats
361
+
362
+ VectorShield is designed to **reduce** PII exposure in vector databases, not to provide absolute guarantees. Be aware of the following:
363
+
364
+ - **Detection is probabilistic.** Presidio uses NLP models — novel PII formats, obfuscated data, or non-English text may not be caught. No detection system is 100% accurate.
365
+ - **Embeddings encode semantics.** Vector embeddings generated from PII-containing text may carry some semantic information about that PII, even if the stored text is redacted. This is a known limitation of the "embed-then-redact" pattern.
366
+ - **Metadata is not redacted.** VectorShield does not inspect or sanitise the `metadata` dict you attach to documents. Ensure PII is not passed in metadata fields.
367
+ - **Not a compliance tool.** VectorShield is a technical privacy control, not a legal compliance framework. It does not constitute GDPR, HIPAA, or CCPA compliance on its own.
368
+ - **English-centric.** The default Presidio configuration performs best on English-language text.
369
+
370
+ ---
371
+
372
+ ## Comparison with Alternative Approaches
373
+
374
+ | Approach | VectorShield | On-device Anonymisation | Local LLM | Access Control Only |
375
+ |----------|-------------|------------------------|-----------|---------------------|
376
+ | PII stored in vector DB | ❌ No | ❌ No | ⚠️ Depends | ✅ Yes |
377
+ | Semantic quality preserved | ✅ Yes | ⚠️ Partial | ✅ Yes | ✅ Yes |
378
+ | Requires local GPU | ❌ No | ❌ No | ✅ Yes | ❌ No |
379
+ | Scales to cloud RAG | ✅ Yes | ⚠️ Limited | ❌ Hard | ✅ Yes |
380
+ | Pluggable / vendor-agnostic | ✅ Yes | ❌ No | ❌ No | ✅ Yes |
381
+
382
+ ---
383
+
384
+ ## Project Structure
385
+
386
+ ```
387
+ vectorshield/
388
+ ├── __init__.py # Public API
389
+ ├── pipeline/ingest.py # Core IngestionPipeline
390
+ ├── interfaces/
391
+ │ ├── cache.py # CacheBackend ABC
392
+ │ ├── embedder.py # Embedder ABC
393
+ │ ├── vectorstore.py # VectorStore ABC
394
+ │ └── tracker.py # Tracker ABC
395
+ ├── implementations/
396
+ │ ├── cache/
397
+ │ │ ├── memory.py # MemoryCache
398
+ │ │ ├── redis.py # RedisCache + NoOpCache
399
+ │ │ └── generic.py # GenericCache (custom functions)
400
+ │ ├── embedding/
401
+ │ │ ├── fastembed.py # FastEmbedEmbedder (default)
402
+ │ │ └── generic.py # GenericEmbedder (custom functions)
403
+ │ ├── vectorstore/
404
+ │ │ ├── chroma.py # ChromaVectorStore (default)
405
+ │ │ └── generic.py # GenericVectorStore (custom functions)
406
+ │ └── tracker/
407
+ │ └── mlflow.py # MLflowTracker + NoOpTracker
408
+ ├── config/
409
+ │ ├── defaults.py # DEFAULT_CONFIG
410
+ │ └── loader.py # load_config(), YAML support
411
+ ├── metrics/stats.py # MetricsCollector, IngestionMetrics
412
+ └── cli.py # vectorshield CLI
413
+ ```
414
+
415
+ ---
416
+
417
+ ## Development Setup
418
+
419
+ ```bash
420
+ git clone https://github.com/vectorshield/vectorshield.git
421
+ cd vectorshield
422
+
423
+ python -m venv venv
424
+ source venv/bin/activate # Windows: venv\Scripts\activate
425
+
426
+ pip install -e ".[all]" pytest ruff
427
+
428
+ # Run tests
429
+ pytest tests/ -v
430
+
431
+ # Run linter
432
+ ruff check vectorshield/
433
+ ```
434
+
435
+ ---
436
+
437
+ ## Contributing
438
+
439
+ Contributions are welcome. Please open an issue first to discuss significant changes.
440
+
441
+ 1. Fork the repo
442
+ 2. Create a feature branch (`git checkout -b feature/your-feature`)
443
+ 3. Commit changes (`git commit -m "Add your feature"`)
444
+ 4. Push to branch (`git push origin feature/your-feature`)
445
+ 5. Open a Pull Request
446
+
447
+ ---
448
+
449
+ ## Citation
450
+
451
+ If you use VectorShield in academic work, please cite:
452
+
453
+ ```bibtex
454
+ @software{vectorshield2025,
455
+ title = {VectorShield: Privacy-Preserving Middleware for RAG Vector Databases},
456
+ author = {Londhe, Shwetan and Limbkar, Varad and Husain, Baihela},
457
+ year = {2025},
458
+ url = {https://github.com/vectorshield/vectorshield}
459
+ }
460
+ ```
461
+
462
+ ---
463
+
464
+ ## License
465
+
466
+ MIT License — see [LICENSE](LICENSE) for details.
467
+
468
+ ---
469
+
470
+ ## Acknowledgements
471
+
472
+ - [Microsoft Presidio](https://microsoft.github.io/presidio/) — PII detection and anonymisation engine
473
+ - [FastEmbed](https://github.com/qdrant/fastembed) — Lightweight CPU-friendly embedding library
474
+ - [ChromaDB](https://www.trychroma.com/) — Persistent local vector database