ava-protocol 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ava/__init__.py +23 -0
- ava/client.py +157 -0
- ava/config.py +51 -0
- ava/engines/__init__.py +9 -0
- ava/engines/base.py +42 -0
- ava/engines/presidio.py +113 -0
- ava/gateways/__init__.py +4 -0
- ava/gateways/http.py +46 -0
- ava/protocol/__init__.py +12 -0
- ava/protocol/entities.py +18 -0
- ava/protocol/manifest.py +82 -0
- ava/protocol/token_vault.py +167 -0
- ava/session.py +128 -0
- ava_protocol-0.1.0.dist-info/METADATA +335 -0
- ava_protocol-0.1.0.dist-info/RECORD +18 -0
- ava_protocol-0.1.0.dist-info/WHEEL +4 -0
- ava_protocol-0.1.0.dist-info/entry_points.txt +2 -0
- ava_protocol-0.1.0.dist-info/licenses/LICENSE +21 -0
ava/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""AVA Protocol - AI Visibility Anonymizer"""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
__author__ = "AVA Team"
|
|
5
|
+
__license__ = "MIT"
|
|
6
|
+
|
|
7
|
+
from ava.client import Client, GatewayClient, create_client
|
|
8
|
+
from ava.config import Config
|
|
9
|
+
from ava.session import Session
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from ava.engines.presidio import PresidioEngine
|
|
13
|
+
except ImportError:
|
|
14
|
+
PresidioEngine = None
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"Client",
|
|
18
|
+
"GatewayClient",
|
|
19
|
+
"Config",
|
|
20
|
+
"Session",
|
|
21
|
+
"create_client",
|
|
22
|
+
"PresidioEngine",
|
|
23
|
+
]
|
ava/client.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""AVA Client - Main entry point for protocol operations"""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import os
|
|
4
|
+
from typing import Optional, Dict, Any, Union
|
|
5
|
+
from contextlib import contextmanager
|
|
6
|
+
|
|
7
|
+
from ava.config import Config
|
|
8
|
+
from ava.session import Session
|
|
9
|
+
from ava.protocol.token_vault import MemoryVault, TokenVault
|
|
10
|
+
from ava.protocol.manifest import AVAManifest
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Client:
    """Embedded AVA client that runs a detection engine in-process.

    Args:
        engine: Name of the detection engine; ``"presidio"`` and ``"mock"``
            are recognised.
        policy: Policy identifier handed to every session.
        vault: Token vault to use. When ``None``, a ``MemoryVault`` whose TTL
            equals ``retention`` is created.
        retention: TTL in seconds used for the default vault.
        config: Optional free-form settings, stored on ``self.config``.

    Raises:
        ImportError: If the Presidio engine cannot be imported.
        ValueError: If ``engine`` is not a recognised name.
    """

    def __init__(
        self,
        engine: str = "presidio",
        policy: str = "general_moderate",
        vault: Optional[TokenVault] = None,
        retention: int = 3600,
        config: Optional[Dict[str, Any]] = None
    ):
        self.engine_name = engine
        self.policy = policy
        self.retention = retention
        # Compare against None instead of relying on truthiness: a vault
        # implementation that defines __len__/__bool__ may be falsy while
        # empty and must not be silently swapped for a MemoryVault.
        if vault is None:
            vault = MemoryVault(ttl_seconds=retention)
        self.vault = vault
        self._engine = None
        self.config = config or {}

        self._load_engine()

    def _load_engine(self):
        """Instantiate the engine named by ``self.engine_name`` (once)."""
        if self._engine is not None:
            return

        if self.engine_name == "presidio":
            try:
                from ava.engines.presidio import PresidioEngine
                self._engine = PresidioEngine()
            except ImportError as e:
                raise ImportError(
                    "Presidio engine not available. "
                    "Install with: pip install ava-protocol[local]"
                ) from e
        elif self.engine_name == "mock":
            from ava.engines.base import MockEngine
            self._engine = MockEngine()
        else:
            raise ValueError(f"Unknown engine: {self.engine_name}")

    @contextmanager
    def session(self, reversibility: bool = True):
        """Yield a ``Session`` bound to this client's engine, vault and policy.

        The session is closed when the ``with`` block exits, whether or not
        an exception was raised inside it.
        """
        session = Session(
            engine=self._engine,
            vault=self.vault,
            policy=self.policy,
            reversibility=reversibility
        )
        try:
            yield session
        finally:
            session.close()

    def sanitize(self, text: str) -> str:
        """Sanitize ``text`` in a throwaway session and return the result.

        The session is closed before this method returns.
        """
        with self.session() as s:
            return s.sanitize(text)

    def close(self):
        """Close the token vault held by this client."""
        # Same None-vs-falsy reasoning as in __init__.
        if self.vault is not None:
            self.vault.close()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class GatewayClient:
    """Client for a remote AVA Gateway reached over HTTP.

    Args:
        url: Base URL of the gateway; trailing slashes are stripped.
        api_key: Bearer token. Falls back to the ``AVA_API_KEY`` environment
            variable when not given.
        policy: Policy identifier sent in the ``X-AVA-Policy`` header.
        timeout: Timeout passed to the underlying ``httpx.Client``.
    """

    def __init__(
        self,
        url: str,
        api_key: Optional[str] = None,
        policy: str = "general_moderate",
        timeout: float = 30.0
    ):
        self.url = url.rstrip("/")
        self.api_key = api_key or os.getenv("AVA_API_KEY")
        self.policy = policy
        self.timeout = timeout
        self._session: Optional[Any] = None

    def _get_client(self):
        """Return the shared ``httpx.Client``, creating it on first use."""
        if self._session is None:
            # Imported lazily so the module can be imported without httpx
            # being exercised until a gateway call is made.
            import httpx
            headers = {"X-AVA-Policy": self.policy}
            if self.api_key:
                headers["Authorization"] = f"Bearer {self.api_key}"
            self._session = httpx.Client(
                base_url=self.url,
                headers=headers,
                timeout=self.timeout
            )
        return self._session

    @contextmanager
    def session(self, reversibility: bool = True):
        """Yield a ``GatewaySession`` that is closed when the block exits."""
        from ava.gateways.http import GatewaySession
        s = GatewaySession(
            client=self._get_client(),
            policy=self.policy,
            reversibility=reversibility
        )
        try:
            yield s
        finally:
            s.close()

    def health(self) -> Dict[str, Any]:
        """GET ``/v1/health`` and return the decoded JSON body.

        Raises whatever ``raise_for_status()`` raises on a non-success
        response.
        """
        client = self._get_client()
        resp = client.get("/v1/health")
        resp.raise_for_status()
        return resp.json()

    def close(self):
        """Close the HTTP client, if one was created.

        The cached client is dropped afterwards so that a later call builds
        a fresh one instead of reusing a closed client. Calling ``close()``
        repeatedly is safe.
        """
        if self._session is not None:
            try:
                self._session.close()
            finally:
                self._session = None
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def create_client(config: Union[Config, Dict[str, Any], str, None] = None) -> Union[Client, GatewayClient]:
    """Build the client that matches ``config``.

    Args:
        config: One of
            * a ``Config`` instance, used as-is;
            * a ``dict``, converted with ``Config.from_dict``;
            * a path (``str`` or ``os.PathLike``) to a YAML file, loaded with
              ``Config.from_yaml``;
            * ``None``, in which case ``Config.from_env`` is used.

    Returns:
        A ``Client`` for mode ``"embedded"`` or a ``GatewayClient`` for mode
        ``"gateway"``.

    Raises:
        ValueError: If the mode is unknown, or if gateway mode is selected
            without a URL.
    """
    if isinstance(config, (str, os.PathLike)):
        config = Config.from_yaml(config)
    elif isinstance(config, dict):
        config = Config.from_dict(config)
    elif config is None:
        config = Config.from_env()

    if config.mode == "embedded":
        return Client(
            engine=config.engine,
            policy=config.policy,
            retention=config.retention
        )

    if config.mode == "gateway":
        # Config.from_env enforces this, but dict/YAML configs do not; fail
        # with a clear message instead of AttributeError on None.rstrip().
        if not config.url:
            raise ValueError("Gateway mode requires 'url' to be set in the configuration")
        return GatewayClient(
            url=config.url,
            api_key=config.api_key,
            policy=config.policy
        )

    raise ValueError(f"Unknown mode: {config.mode}")
|
ava/config.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""AVA Configuration management"""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Dict, Any, Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class Config:
    """AVA client configuration."""

    mode: str = "embedded"  # "embedded" or "gateway"
    engine: str = "presidio"
    policy: str = "general_moderate"
    retention: int = 3600
    # Gateway mode
    url: Optional[str] = None
    api_key: Optional[str] = None
    # Extensions
    extensions: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_env(cls, prefix: str = "AVA_") -> Config:
        """Build a config from environment variables.

        Reads ``{prefix}MODE``, ``{prefix}ENGINE``, ``{prefix}POLICY`` and
        ``{prefix}RETENTION``; in gateway mode also ``{prefix}GATEWAY_URL``
        and ``{prefix}API_KEY``.

        Raises:
            ValueError: If ``{prefix}RETENTION`` is not an integer, or if
                gateway mode is selected without ``{prefix}GATEWAY_URL``.
        """
        mode = os.getenv(f"{prefix}MODE", "embedded").lower()

        raw_retention = os.getenv(f"{prefix}RETENTION", "3600")
        try:
            retention = int(raw_retention)
        except ValueError as exc:
            # Name the offending variable; int()'s own message does not.
            raise ValueError(
                f"{prefix}RETENTION must be an integer number of seconds, "
                f"got {raw_retention!r}"
            ) from exc

        config = cls(
            mode=mode,
            engine=os.getenv(f"{prefix}ENGINE", "presidio"),
            policy=os.getenv(f"{prefix}POLICY", "general_moderate"),
            retention=retention,
        )

        if mode == "gateway":
            config.url = os.getenv(f"{prefix}GATEWAY_URL")
            config.api_key = os.getenv(f"{prefix}API_KEY")
            if not config.url:
                raise ValueError(f"{prefix}GATEWAY_URL required for gateway mode")

        return config

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Config:
        """Build a config from a mapping, ignoring keys that are not fields."""
        return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})

    @classmethod
    def from_yaml(cls, path: str) -> Config:
        """Build a config from a YAML file.

        An empty file yields the default configuration.

        Raises:
            ValueError: If the document's top level is not a mapping.
        """
        import yaml
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f)

        # safe_load returns None for an empty document.
        if data is None:
            data = {}
        if not isinstance(data, dict):
            raise ValueError(
                f"Config file {path!r} must contain a mapping at the top level"
            )
        return cls.from_dict(data)
|
ava/engines/__init__.py
ADDED
ava/engines/base.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Base detection engine interface"""
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from typing import List, Optional
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
from ava.protocol.entities import DetectedEntity
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DetectionEngine(ABC):
    """Interface implemented by every PII detection backend."""

    @abstractmethod
    def detect(self, text: str, policy: Optional[str] = None) -> List[DetectedEntity]:
        """Return the entities found in ``text``.

        Args:
            text: Text to scan.
            policy: Optional policy identifier; interpretation is left to
                the concrete engine.
        """
        ...
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MockEngine(DetectionEngine):
    """Regex-driven stand-in engine intended for tests."""

    PATTERNS = {
        "EMAIL_ADDRESS": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        "PHONE_NUMBER": r"\b\+?1?[-.]?\(?[0-9]{3}\)?[-.]?[0-9]{3}[-.]?[0-9]{4}\b",
        "SSN": r"\d{3}-\d{2}-\d{4}",
        "CREDIT_CARD": r"\b\d{4}[-]?\d{4}[-]?\d{4}[-]?\d{4}\b",
    }

    def detect(self, text: str, policy: Optional[str] = None) -> List[DetectedEntity]:
        """Run every pattern in ``PATTERNS`` over ``text``.

        ``policy`` is accepted for interface compatibility and not used.
        Every hit is reported with a fixed confidence of 0.95; the result is
        ordered by start offset.
        """
        hits = [
            DetectedEntity(
                type=label,
                value=found.group(),
                position=(found.start(), found.end()),
                confidence=0.95
            )
            for label, regex in self.PATTERNS.items()
            for found in re.finditer(regex, text)
        ]
        # sorted() is stable, so hits sharing a start offset keep the order
        # in which their patterns appear in PATTERNS.
        return sorted(hits, key=lambda hit: hit.position[0])
|
ava/engines/presidio.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Presidio-based detection engine"""
|
|
2
|
+
from typing import List, Optional, Dict
|
|
3
|
+
|
|
4
|
+
from ava.engines.base import DetectionEngine
|
|
5
|
+
from ava.protocol.entities import DetectedEntity
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PresidioEngine(DetectionEngine):
    """Microsoft Presidio-based entity detection."""

    # Map Presidio entity names to AVA type names. Names that are not listed
    # are passed through unchanged by detect().
    ENTITY_MAP: Dict[str, str] = {
        "PERSON": "PERSON_NAME",
        "EMAIL_ADDRESS": "EMAIL_ADDRESS",
        "PHONE_NUMBER": "PHONE_NUMBER",
        "CREDIT_CARD": "CREDIT_CARD_NUMBER",
        "US_SSN": "SSN",
        "US_BANK_NUMBER": "BANK_ACCOUNT",
        "IBAN": "IBAN",
        "US_PASSPORT": "PASSPORT",
        "US_DRIVER_LICENSE": "DRIVER_LICENSE",
        "IP_ADDRESS": "IP_ADDRESS",
        "LOCATION": "LOCATION",
        "DATE_TIME": "DATE_TIME",
        "NRP": "NATIONALITY",
        "MEDICAL_LICENSE": "MEDICAL_LICENSE",
        "URL": "URL",
    }

    def __init__(self, languages: Optional[List[str]] = None):
        """Create the Presidio analyzer and anonymizer engines.

        Args:
            languages: Language codes; defaults to ``["en"]``. Only the
                first entry is used by ``detect``.

        Raises:
            ImportError: If the Presidio packages are not installed.
        """
        try:
            from presidio_analyzer import AnalyzerEngine
            from presidio_anonymizer import AnonymizerEngine
        except ImportError as e:
            raise ImportError(
                "Presidio not installed. "
                "Install with: pip install presidio-analyzer presidio-anonymizer"
            ) from e

        self.languages = languages or ["en"]
        self._analyzer = AnalyzerEngine()
        self._anonymizer = AnonymizerEngine()

        # Minimum Presidio score per strictness level. A lower threshold
        # means more detections, so the values decrease as the policy gets
        # stricter. (Previously "strict" had the highest threshold and
        # "permissive" shared a value with "paranoid".)
        self._score_thresholds = {
            "permissive": 0.7,
            "moderate": 0.5,
            "strict": 0.3,
            "paranoid": 0.1,
        }

    def detect(self, text: str, policy: Optional[str] = None) -> List[DetectedEntity]:
        """Detect entities in ``text`` using the Presidio analyzer.

        Args:
            text: Text to analyse.
            policy: Policy identifier. The first strictness keyword
                ("permissive", "moderate", "strict", "paranoid") found in
                its lower-cased form selects the score threshold;
                "moderate" is used when none is found or no policy is given.

        Returns:
            Detected entities with AVA type names, ordered by start offset.
        """
        # Determine threshold from policy
        strictness = "moderate"
        if policy:
            lowered = policy.lower()
            for s in ["permissive", "moderate", "strict", "paranoid"]:
                if s in lowered:
                    strictness = s
                    break

        threshold = self._score_thresholds.get(strictness, 0.5)

        results = self._analyzer.analyze(
            text=text,
            language=self.languages[0],
            score_threshold=threshold
        )

        # Convert to AVA entities
        entities = [
            DetectedEntity(
                type=self.ENTITY_MAP.get(result.entity_type, result.entity_type),
                value=text[result.start:result.end],
                position=(result.start, result.end),
                confidence=result.score
            )
            for result in results
        ]

        entities.sort(key=lambda e: e.position[0])
        return entities

    def anonymize(self, text: str, entities: List[DetectedEntity]) -> str:
        """Replace every entity span in ``text`` with ``<REDACTED>``.

        Args:
            text: Original text.
            entities: Entities whose ``position`` offsets refer to ``text``.

        Returns:
            The anonymized text produced by the Presidio anonymizer.
        """
        from presidio_analyzer import RecognizerResult
        from presidio_anonymizer.entities import OperatorConfig

        # The anonymizer reads attributes (.start, .end, ...) from its
        # analyzer results, so plain dicts are not accepted here.
        presidio_results = [
            RecognizerResult(
                entity_type=e.type,
                start=e.position[0],
                end=e.position[1],
                score=e.confidence
            )
            for e in entities
        ]

        result = self._anonymizer.anonymize(
            text=text,
            analyzer_results=presidio_results,
            operators={
                "DEFAULT": OperatorConfig("replace", {"new_value": "<REDACTED>"})
            }
        )
        return result.text
|
ava/gateways/__init__.py
ADDED
ava/gateways/http.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""HTTP client for AVA Gateway"""
|
|
2
|
+
from typing import Optional, Dict, Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class GatewaySession:
    """Session backed by a remote AVA Gateway.

    Args:
        client: HTTP client exposing ``post(path, json=...)`` whose responses
            provide ``raise_for_status()`` and ``json()``.
        policy: Policy identifier, stored on the session.
        reversibility: Sent as ``reversibility_required`` on sanitize calls.
    """

    def __init__(self, client, policy: str, reversibility: bool = True):
        self.client = client
        self.policy = policy
        self.reversibility = reversibility
        self._manifest_id: Optional[str] = None
        self._closed = False

    def sanitize(self, text: str, context: Optional[str] = None) -> str:
        """POST ``text`` to ``/v1/sanitize`` and return the sanitized text.

        The manifest id from the response (``None`` when absent) is
        remembered for a later ``restore`` call.

        Raises:
            RuntimeError: If the session has been closed.
        """
        if self._closed:
            raise RuntimeError("Session is closed")

        resp = self.client.post("/v1/sanitize", json={
            "text": text,
            "context": context,
            "reversibility_required": self.reversibility
        })
        resp.raise_for_status()

        data = resp.json()
        # The gateway may omit "manifest" or send it as null; `or {}` covers
        # both, whereas a .get() default only covers the missing key.
        manifest = data.get("manifest") or {}
        self._manifest_id = manifest.get("manifest_id")
        return data["sanitized_text"]

    def restore(self, text: str, manifest_id: Optional[str] = None) -> str:
        """POST ``text`` to ``/v1/restore`` and return the restored text.

        Args:
            text: Text containing tokens, sent as ``ai_response``.
            manifest_id: Manifest to restore against; defaults to the one
                recorded by the most recent ``sanitize`` call.

        Raises:
            RuntimeError: If the session has been closed.
        """
        if self._closed:
            raise RuntimeError("Session is closed")

        resp = self.client.post("/v1/restore", json={
            "ai_response": text,
            "manifest_id": manifest_id or self._manifest_id
        })
        resp.raise_for_status()

        return resp.json()["restored_text"]

    def close(self):
        """Mark the session closed; no request is sent."""
        self._closed = True
|
ava/protocol/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""AVA Protocol - Core data structures and schemas"""
|
|
2
|
+
from ava.protocol.manifest import AVAManifest, EntityRecord
|
|
3
|
+
from ava.protocol.entities import DetectedEntity
|
|
4
|
+
from ava.protocol.token_vault import TokenVault, MemoryVault
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"AVAManifest",
|
|
8
|
+
"EntityRecord",
|
|
9
|
+
"DetectedEntity",
|
|
10
|
+
"TokenVault",
|
|
11
|
+
"MemoryVault",
|
|
12
|
+
]
|
ava/protocol/entities.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Entity definitions for detected PII"""
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
import hashlib
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class DetectedEntity:
    """One sensitive span located in a piece of text."""

    type: str  # entity label, e.g. "PERSON_NAME", "EMAIL_ADDRESS"
    value: str  # the matched original text
    position: Tuple[int, int]  # (start, end) character offsets
    confidence: float  # detection confidence, 0.0-1.0

    def hash(self) -> str:
        """Return ``"sha256:"`` plus the first 16 hex digits of the value's SHA-256.

        Lets an audit trail reference the value without storing it.
        """
        digest = hashlib.sha256(self.value.encode()).hexdigest()
        return "sha256:" + digest[:16]
|
ava/protocol/manifest.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""AVA Manifest - Audit and transformation tracking"""
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import List, Dict, Any, Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class EntityRecord:
    """A single entity entry stored in a manifest."""

    type: str
    value_hash: str
    position: List[int]
    confidence: float
    action: str
    token: str

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dict, keys in field order.

        Values are referenced, not copied.
        """
        names = ("type", "value_hash", "position", "confidence", "action", "token")
        return {name: getattr(self, name) for name in names}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class AVAManifest:
    """Record of one sanitize pass, kept for audit and restoration."""

    # Identification (no defaults)
    manifest_id: str
    timestamp: str
    # Policy description
    policy_domain: str = "general"
    policy_strictness: str = "moderate"
    reversibility: bool = True

    # Filled in while the text is being transformed
    entities: List[EntityRecord] = field(default_factory=list)
    input_transform: str = "pseudonymize"
    output_transform: str = "restore"

    def add_entity(
        self,
        entity_type: str,
        value_hash: str,
        position: List[int],
        confidence: float,
        action: str,
        token: str
    ):
        """Append a new ``EntityRecord`` built from the arguments."""
        record = EntityRecord(
            type=entity_type,
            value_hash=value_hash,
            position=position,
            confidence=confidence,
            action=action,
            token=token
        )
        self.entities.append(record)

    def to_dict(self) -> Dict[str, Any]:
        """Return the manifest as a nested plain dict."""
        policy = {
            "domain": self.policy_domain,
            "strictness": self.policy_strictness,
            "reversibility": self.reversibility
        }
        transformations = {
            "input_transform": self.input_transform,
            "output_transform": self.output_transform
        }
        return {
            "ava_version": "1.0",
            "manifest_id": self.manifest_id,
            "timestamp": self.timestamp,
            "policy": policy,
            "entities": [record.to_dict() for record in self.entities],
            "transformations": transformations
        }

    def get_tokens(self) -> List[str]:
        """Return the token of every recorded entity, in insertion order."""
        tokens = []
        for record in self.entities:
            tokens.append(record.token)
        return tokens
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Token vault implementations for secure storage of original values"""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Optional, Dict, Tuple
|
|
5
|
+
import secrets
|
|
6
|
+
import threading
|
|
7
|
+
from datetime import datetime, timedelta, timezone
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TokenVault(ABC):
    """Contract shared by every token storage backend."""

    TOKEN_PREFIX = "AVA"

    @abstractmethod
    def store(self, original: str, entity_type: str, session_id: str) -> str:
        """Save ``original`` and return the token that stands in for it."""
        ...

    @abstractmethod
    def retrieve(self, token: str, session_id: str) -> Optional[str]:
        """Return the original value behind ``token``, or ``None``."""
        ...

    @abstractmethod
    def expire_session(self, session_id: str):
        """Drop every token that belongs to ``session_id``."""
        ...

    @abstractmethod
    def close(self):
        """Release whatever resources the backend holds."""
        ...

    def _generate_token(self, entity_type: str, suffix: str) -> str:
        """Build a token of the form ``AVA_TYPE_suffix``.

        ``TYPE`` is the text before the first underscore of ``entity_type``,
        cut to four characters and upper-cased.
        """
        leading_word = entity_type.partition("_")[0]
        abbreviation = leading_word[:4].upper()
        return "{}_{}_{}".format(self.TOKEN_PREFIX, abbreviation, suffix)

    def _now(self) -> datetime:
        """Return the current time as a timezone-aware UTC datetime."""
        return datetime.now(tz=timezone.utc)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class MemoryVault(TokenVault):
    """In-memory token vault with TTL support.

    All operations take an internal re-entrant lock.

    Args:
        ttl_seconds: Lifetime of a stored token, in seconds.
    """

    def __init__(self, ttl_seconds: int = 3600):
        self.ttl_seconds = ttl_seconds
        # token -> (original value, expiry time)
        self._store: Dict[str, Tuple[str, datetime]] = {}
        # session id -> tokens stored by that session
        self._session_tokens: Dict[str, set] = {}
        self._lock = threading.RLock()
        self._closed = False

    def store(self, original: str, entity_type: str, session_id: str) -> str:
        """Store ``original`` for ``session_id`` and return a fresh token.

        Raises:
            RuntimeError: If the vault has been closed.
        """
        with self._lock:
            if self._closed:
                raise RuntimeError("Vault is closed")

            # Regenerate until the token is not already in use.
            token = self._generate_token(entity_type, secrets.token_urlsafe(6)[:8])
            while token in self._store:
                token = self._generate_token(entity_type, secrets.token_urlsafe(6)[:8])

            expiry = self._now() + timedelta(seconds=self.ttl_seconds)
            self._store[token] = (original, expiry)
            self._session_tokens.setdefault(session_id, set()).add(token)

            return token

    def retrieve(self, token: str, session_id: str) -> Optional[str]:
        """Return the original value for ``token``, or ``None``.

        ``None`` is returned when the vault is closed, the token is unknown,
        the token was stored by a different session, or the token has
        expired (in which case it is also removed).
        """
        with self._lock:
            if self._closed:
                return None

            # Only the session that stored a token may read it back; without
            # this check any session sharing the vault could recover another
            # session's original values.
            owned = self._session_tokens.get(session_id)
            if not owned or token not in owned:
                return None

            entry = self._store.get(token)
            if entry is None:
                return None

            original, expiry = entry
            if self._now() > expiry:
                self._store.pop(token, None)
                # Keep the per-session index in step with the store.
                owned.discard(token)
                return None

            return original

    def expire_session(self, session_id: str):
        """Remove every token stored by ``session_id``."""
        with self._lock:
            for token in self._session_tokens.pop(session_id, set()):
                self._store.pop(token, None)

    def close(self):
        """Discard all stored values and mark the vault closed."""
        with self._lock:
            self._store.clear()
            self._session_tokens.clear()
            self._closed = True
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class SqliteVault(TokenVault):
    """SQLite-backed token vault for persistence.

    The connection is opened with ``check_same_thread=False``; every
    operation therefore takes an internal lock so the connection is never
    used by two threads at once.

    Args:
        db_path: SQLite database path; defaults to an in-memory database.
        ttl_seconds: Lifetime of a stored token, in seconds.
    """

    def __init__(self, db_path: str = ":memory:", ttl_seconds: int = 3600):
        import sqlite3
        self.db_path = db_path
        self.ttl_seconds = ttl_seconds
        self._lock = threading.RLock()
        self._conn = sqlite3.connect(db_path, check_same_thread=False)
        self._init_db()

    def _init_db(self):
        """Create the ``tokens`` table and its session index if missing."""
        with self._lock:
            self._conn.execute(
                "CREATE TABLE IF NOT EXISTS tokens ("
                "token TEXT PRIMARY KEY, original TEXT NOT NULL, "
                "session_id TEXT NOT NULL, entity_type TEXT NOT NULL, "
                "created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, expires_at TIMESTAMP)"
            )
            self._conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_session ON tokens(session_id)"
            )
            self._conn.commit()

    def store(self, original: str, entity_type: str, session_id: str) -> str:
        """Insert ``original`` under a fresh token and return the token."""
        import sqlite3

        # Loop (rather than recurse) until an unused token is inserted.
        while True:
            token = self._generate_token(entity_type, secrets.token_urlsafe(6)[:8])
            expires = self._now() + timedelta(seconds=self.ttl_seconds)
            try:
                # `with self._conn` commits on success, rolls back on error.
                with self._lock, self._conn:
                    self._conn.execute(
                        "INSERT INTO tokens (token, original, session_id, entity_type, expires_at) VALUES (?, ?, ?, ?, ?)",
                        (token, original, session_id, entity_type, expires.isoformat())
                    )
            except sqlite3.IntegrityError:
                # Primary-key collision on the token: try another one.
                continue
            return token

    def retrieve(self, token: str, session_id: str) -> Optional[str]:
        """Return the original value for ``token``, or ``None``.

        ``None`` is returned when no row matches both ``token`` and
        ``session_id``, or when the row has expired (it is then deleted).
        """
        with self._lock:
            # Filtering on session_id keeps one session from reading values
            # stored by another.
            row = self._conn.execute(
                "SELECT original, expires_at FROM tokens WHERE token = ? AND session_id = ?",
                (token, session_id)
            ).fetchone()
            if row is None:
                return None

            original, expires_at = row
            if datetime.fromisoformat(expires_at) < self._now():
                # Run the delete in a committed transaction; a bare execute()
                # would leave it pending on the connection.
                with self._conn:
                    self._conn.execute("DELETE FROM tokens WHERE token = ?", (token,))
                return None

            return original

    def expire_session(self, session_id: str):
        """Delete every token stored by ``session_id``."""
        with self._lock, self._conn:
            self._conn.execute("DELETE FROM tokens WHERE session_id = ?", (session_id,))

    def close(self):
        """Close the database connection."""
        with self._lock:
            self._conn.close()
|
ava/session.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""AVA Session - Context manager for sanitize/restore operations"""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import uuid
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Optional, List, Dict, Any, TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from ava.engines.base import DetectionEngine
|
|
9
|
+
from ava.protocol.token_vault import TokenVault
|
|
10
|
+
from ava.protocol.manifest import AVAManifest
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Session:
    """Transaction context for AVA sanitize/restore operations.

    Args:
        engine: Detection engine used to find entities.
        vault: Token vault that stores the original values.
        policy: Policy identifier passed to the engine and recorded in
            manifests.
        reversibility: Whether ``restore`` is permitted on this session.
    """

    def __init__(
        self,
        engine: DetectionEngine,
        vault: TokenVault,
        policy: str,
        reversibility: bool = True
    ):
        self.engine = engine
        self.vault = vault
        self.policy = policy
        self.reversibility = reversibility
        self.session_id = str(uuid.uuid4())[:8]
        self.manifests: List[AVAManifest] = []
        self._closed = False

    @staticmethod
    def _merge_overlaps(text: str, entities: List[Any]) -> List[Any]:
        """Return ``entities`` ordered by start with overlapping spans merged.

        An entity fully inside an earlier span is dropped. One that extends
        past it widens the earlier entity to cover both, so no detected
        character is left outside a replaced span. Entities are expected to
        be dataclass instances with ``position``, ``value`` and
        ``confidence`` fields.
        """
        from dataclasses import replace

        merged: List[Any] = []
        # Earliest start first; for equal starts the longer span first.
        ordered = sorted(entities, key=lambda e: (e.position[0], -e.position[1]))
        for entity in ordered:
            if merged and entity.position[0] < merged[-1].position[1]:
                previous = merged[-1]
                if entity.position[1] > previous.position[1]:
                    start, end = previous.position[0], entity.position[1]
                    merged[-1] = replace(
                        previous,
                        value=text[start:end],
                        position=(start, end),
                        confidence=max(previous.confidence, entity.confidence)
                    )
                continue
            merged.append(entity)
        return merged

    def sanitize(self, text: str, context: Optional[str] = None) -> str:
        """Replace detected entities in ``text`` with vault tokens.

        A manifest describing the replacements is appended to
        ``self.manifests``. ``context`` is accepted but not used.

        Raises:
            RuntimeError: If the session has been closed.
        """
        if self._closed:
            raise RuntimeError("Session is closed")

        from datetime import timezone
        from ava.protocol.manifest import AVAManifest

        # Engines may report overlapping spans (for example a URL inside an
        # e-mail address). Replacing those one after another with a running
        # offset would overwrite part of an inserted token, so overlaps are
        # merged first.
        entities = self._merge_overlaps(
            text, self.engine.detect(text, policy=self.policy)
        )

        has_parts = "_" in self.policy
        # Same "...Z" naive-UTC format as before, without the deprecated
        # datetime.utcnow().
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        manifest = AVAManifest(
            manifest_id=f"ava-{self.session_id}-{len(self.manifests)}",
            timestamp=now_utc.isoformat() + "Z",
            policy_domain=self.policy.split("_")[0] if has_parts else "general",
            policy_strictness=self.policy.split("_")[-1] if has_parts else "moderate",
            reversibility=self.reversibility
        )

        transformed_text = text
        # Running difference in length between transformed_text and text.
        offset = 0

        for entity in entities:
            token = self.vault.store(
                original=entity.value,
                entity_type=entity.type,
                session_id=self.session_id
            )

            start = entity.position[0] + offset
            end = entity.position[1] + offset

            manifest.add_entity(
                entity_type=entity.type,
                value_hash=entity.hash(),
                position=[start, end],
                confidence=entity.confidence,
                action="pseudonymize",
                token=token
            )

            transformed_text = transformed_text[:start] + token + transformed_text[end:]
            offset += len(token) - (end - start)

        self.manifests.append(manifest)
        return transformed_text

    def restore(self, text: str, manifest_id: Optional[str] = None) -> str:
        """Replace tokens in ``text`` with their original values.

        Args:
            text: Text containing tokens issued by this session.
            manifest_id: Manifest to restore against; the most recent one is
                used when omitted.

        Raises:
            RuntimeError: If the session is closed or was created with
                ``reversibility=False``.
            ValueError: If no matching manifest exists.
        """
        if self._closed:
            raise RuntimeError("Session is closed")

        if not self.reversibility:
            raise RuntimeError("Session created without reversibility")

        manifest = self.get_manifest(manifest_id)
        if not manifest:
            raise ValueError("No manifest found for restoration")

        result = text
        for entity in reversed(manifest.entities):
            original = self.vault.retrieve(entity.token, session_id=self.session_id)
            # Tokens the vault no longer knows are left in place.
            if original is not None:
                result = result.replace(entity.token, original)

        return result

    def get_manifest(self, manifest_id: Optional[str] = None) -> Optional[AVAManifest]:
        """Return the manifest with ``manifest_id``, or the latest if omitted.

        Returns ``None`` when there is no match or no manifest at all.
        """
        if manifest_id:
            for m in self.manifests:
                if m.manifest_id == manifest_id:
                    return m
            return None
        return self.manifests[-1] if self.manifests else None

    def close(self):
        """Expire this session's tokens in the vault; safe to call twice."""
        if self._closed:
            return
        self.vault.expire_session(self.session_id)
        self._closed = True

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        # Never suppress exceptions raised inside the with-block.
        return False
|
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ava-protocol
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: AVA - AI Visibility Anonymizer Protocol
|
|
5
|
+
Project-URL: Homepage, https://github.com/ava-protocol/ava-protocol
|
|
6
|
+
Project-URL: Documentation, https://ava-protocol.readthedocs.io
|
|
7
|
+
Project-URL: Repository, https://github.com/ava-protocol/ava-protocol
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/ava-protocol/ava-protocol/issues
|
|
9
|
+
Author-email: Gerald Enrique Nelson Mc Kenzie <lordxmen2k@gmail.com>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: ai,anonymization,data-protection,gdpr,hipaa,llm,pii,presidio,privacy,security
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Security
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Classifier: Topic :: Text Processing
|
|
25
|
+
Requires-Python: >=3.9
|
|
26
|
+
Requires-Dist: httpx>=0.25.0
|
|
27
|
+
Requires-Dist: pydantic>=2.0.0
|
|
28
|
+
Provides-Extra: all
|
|
29
|
+
Requires-Dist: boto3>=1.28.0; extra == 'all'
|
|
30
|
+
Requires-Dist: presidio-analyzer>=2.2.0; extra == 'all'
|
|
31
|
+
Requires-Dist: presidio-anonymizer>=2.2.0; extra == 'all'
|
|
32
|
+
Requires-Dist: spacy>=3.7.0; extra == 'all'
|
|
33
|
+
Provides-Extra: aws
|
|
34
|
+
Requires-Dist: boto3>=1.28.0; extra == 'aws'
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
37
|
+
Requires-Dist: build>=1.0.0; extra == 'dev'
|
|
38
|
+
Requires-Dist: mypy>=1.5.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: pytest>=7.4.0; extra == 'dev'
|
|
41
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
42
|
+
Requires-Dist: twine>=4.0.0; extra == 'dev'
|
|
43
|
+
Provides-Extra: local
|
|
44
|
+
Requires-Dist: presidio-analyzer>=2.2.0; extra == 'local'
|
|
45
|
+
Requires-Dist: presidio-anonymizer>=2.2.0; extra == 'local'
|
|
46
|
+
Requires-Dist: spacy>=3.7.0; extra == 'local'
|
|
47
|
+
Description-Content-Type: text/markdown
|
|
48
|
+
|
|
49
|
+
# AVA Protocol 🛡️
|
|
50
|
+
|
|
51
|
+
**AI Visibility Anonymizer Protocol** — A protocol-first approach to privacy-preserving AI interactions.
|
|
52
|
+
|
|
53
|
+
## Authors
|
|
54
|
+
|
|
55
|
+
- Gerald Enrique Nelson Mc Kenzie (https://github.com/lordxmen2k)
|
|
56
|
+
|
|
57
|
+
## Date
|
|
58
|
+
|
|
59
|
+
- 3/13/2026
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
[PyPI version](https://badge.fury.io/py/ava-protocol)
|
|
63
|
+
[Python downloads](https://www.python.org/downloads/)
|
|
64
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
┌─────────────────────────────────────────────────────────┐
│                        AVA LAYER                        │
│           (AI Visibility Anonymizer Protocol)           │
├─────────────────────────────────────────────────────────┤
│  INGEST → DETECT → CLASSIFY → TRANSFORM → AUDIT → AI    │
│     ↑                                               ↓   │
│  Original Data                          Clean Response  │
│     ↑                                               ↓   │
│  RESTORE ← ← ← ← ← ← ← ← ← ← ← ← ← ← ← ← ← ← ← ← ← ←    │
└─────────────────────────────────────────────────────────┘
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## 🎯 What is AVA?
|
|
80
|
+
|
|
81
|
+
AVA is an **open protocol** and **Python library** for anonymizing sensitive data before sending it to AI/LLM services, with complete audit trails and reversible tokenization.
|
|
82
|
+
|
|
83
|
+
### Core Principles
|
|
84
|
+
|
|
85
|
+
| Principle | Description |
|
|
86
|
+
|-----------|-------------|
|
|
87
|
+
| **Visibility** | Complete audit trail of what was sanitized and why |
|
|
88
|
+
| **Reversibility** | Secure token vault for restoring original data in responses |
|
|
89
|
+
| **Interoperability** | Works with any AI provider (OpenAI, Anthropic, local models) |
|
|
90
|
+
| **Configurability** | Policy-driven sensitivity levels per use case |
|
|
91
|
+
| **Engine Agnostic** | Pluggable detection engines (Presidio, AWS Macie, etc.) |
|
|
92
|
+
|
|
93
|
+
## 🚀 Quick Start
|
|
94
|
+
|
|
95
|
+
### Installation
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
# Core library only (gateway mode)
|
|
99
|
+
pip install ava-protocol
|
|
100
|
+
|
|
101
|
+
# With local detection engine (Presidio)
|
|
102
|
+
pip install ava-protocol[local]
|
|
103
|
+
|
|
104
|
+
# With all optional dependencies
|
|
105
|
+
pip install ava-protocol[all]
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Basic Usage
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
import ava
|
|
112
|
+
import openai
|
|
113
|
+
|
|
114
|
+
# Initialize client (local Presidio engine)
|
|
115
|
+
client = ava.Client(
|
|
116
|
+
engine="presidio",
|
|
117
|
+
policy="healthcare_strict",
|
|
118
|
+
retention=3600 # Token expiry in seconds
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
# Sanitize → AI → Restore
|
|
122
|
+
with client.session(reversibility=True) as session:
|
|
123
|
+
# 1. Sanitize PII before sending to AI
|
|
124
|
+
clean_prompt = session.sanitize(
|
|
125
|
+
"Patient John Doe (john.doe@email.com) needs prescription refill"
|
|
126
|
+
)
|
|
127
|
+
# Result: "Patient AVA_PERS_7a3f9k2m (AVA_EMAI_x8n4p5qv) needs prescription refill"
|
|
128
|
+
|
|
129
|
+
# 2. Send to AI (AI never sees original PII)
|
|
130
|
+
response = openai.chat.completions.create(
|
|
131
|
+
model="gpt-4",
|
|
132
|
+
messages=[{"role": "user", "content": clean_prompt}]
|
|
133
|
+
)
|
|
134
|
+
ai_output = response.choices[0].message.content
|
|
135
|
+
|
|
136
|
+
# 3. Restore original values in AI response
|
|
137
|
+
original_response = session.restore(ai_output)
|
|
138
|
+
# Result: "John Doe should contact john.doe@email.com..."
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Gateway Mode (Enterprise)
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
import ava
|
|
145
|
+
|
|
146
|
+
# Connect to centralized AVA Gateway
|
|
147
|
+
client = ava.GatewayClient(
|
|
148
|
+
url="https://ava.internal.company.com",
|
|
149
|
+
api_key="your-api-key",
|
|
150
|
+
policy="finance_strict"
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
with client.session() as session:
|
|
154
|
+
clean = session.sanitize("Invoice to Acme Corp...")
|
|
155
|
+
# ... AI call ...
|
|
156
|
+
original = session.restore(ai_response)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## 📋 Supported Entity Types
|
|
160
|
+
|
|
161
|
+
| Category | Entities | Example |
|
|
162
|
+
|----------|----------|---------|
|
|
163
|
+
| Identity | `PERSON_NAME`, `USERNAME` | "John Doe" → `AVA_PERS_7a3f9k2m` |
|
|
164
|
+
| Contact | `EMAIL_ADDRESS`, `PHONE_NUMBER` | "john@email.com" → `AVA_EMAI_x8n4p5qv` |
|
|
165
|
+
| Financial | `CREDIT_CARD`, `BANK_ACCOUNT`, `SSN` | "123-45-6789" → `AVA_SSN_3x8k9n4p` |
|
|
166
|
+
| Location | `LOCATION`, `IP_ADDRESS` | "192.168.1.1" → `AVA_IPAD_q2m7n5vx` |
|
|
167
|
+
| Medical | `MEDICAL_LICENSE`, `DIAGNOSIS` | Condition-specific detection |
|
|
168
|
+
|
|
169
|
+
## 🔧 Configuration
|
|
170
|
+
|
|
171
|
+
### Environment Variables
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
export AVA_MODE=embedded # or "gateway"
|
|
175
|
+
export AVA_ENGINE=presidio
|
|
176
|
+
export AVA_POLICY=healthcare_strict
|
|
177
|
+
export AVA_RETENTION=3600
|
|
178
|
+
export AVA_GATEWAY_URL=https://ava.internal.company.com
|
|
179
|
+
export AVA_API_KEY=your-api-key
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### YAML Config
|
|
183
|
+
|
|
184
|
+
```yaml
|
|
185
|
+
# ava-config.yaml
|
|
186
|
+
mode: embedded
|
|
187
|
+
engine: presidio
|
|
188
|
+
policy: finance_strict
|
|
189
|
+
retention: 7200
|
|
190
|
+
|
|
191
|
+
# Or gateway mode:
|
|
192
|
+
mode: gateway
|
|
193
|
+
url: https://ava-gateway.company.com
|
|
194
|
+
api_key: ${AVA_API_KEY}
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
import ava
|
|
199
|
+
|
|
200
|
+
config = ava.Config.from_yaml("ava-config.yaml")
|
|
201
|
+
client = ava.create_client(config)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## 🏗️ Architecture
|
|
205
|
+
|
|
206
|
+
### Protocol-First Design
|
|
207
|
+
|
|
208
|
+
```
|
|
209
|
+
┌─────────────────────────────────────┐
│         AVA PROTOCOL SPEC           │  ← The standard (IETF-bound)
├─────────────────────────────────────┤
│  AVA Python Library (this repo)     │  ← Reference implementation
│  ┌─────────────────────────┐        │
│  │ Presidio | AWS Macie    │        │  ← Pluggable engines
│  │ | Azure PII | Custom    │        │
│  └─────────────────────────┘        │
└─────────────────────────────────────┘
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
### Manifest Format
|
|
221
|
+
|
|
222
|
+
Every transformation produces an **AVA Manifest** — a complete audit record:
|
|
223
|
+
|
|
224
|
+
```json
|
|
225
|
+
{
|
|
226
|
+
"ava_version": "1.0",
|
|
227
|
+
"manifest_id": "ava-a1b2c3d4-001",
|
|
228
|
+
"timestamp": "2025-03-13T15:05:59Z",
|
|
229
|
+
"policy": {
|
|
230
|
+
"domain": "healthcare",
|
|
231
|
+
"strictness": "strict",
|
|
232
|
+
"reversibility": true
|
|
233
|
+
},
|
|
234
|
+
"entities": [
|
|
235
|
+
{
|
|
236
|
+
"type": "PERSON_NAME",
|
|
237
|
+
"value_hash": "sha256:7f83b...",
|
|
238
|
+
"position": [8, 16],
|
|
239
|
+
"confidence": 0.98,
|
|
240
|
+
"action": "pseudonymize",
|
|
241
|
+
"token": "AVA_PERS_7a3f9k2m"
|
|
242
|
+
}
|
|
243
|
+
]
|
|
244
|
+
}
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
## 🔌 Pluggable Engines
|
|
248
|
+
|
|
249
|
+
Swap detection engines without changing your code:
|
|
250
|
+
|
|
251
|
+
| Engine | Installation | Best For |
|
|
252
|
+
|--------|--------------|----------|
|
|
253
|
+
| **Presidio** (default) | `pip install ava-protocol[local]` | Self-hosted, free, customizable |
|
|
254
|
+
| **AWS Macie** | `pip install ava-protocol[aws]` | Enterprise, cloud-native |
|
|
255
|
+
| **Mock** (built-in) | No install | Testing, CI/CD |
|
|
256
|
+
| **Custom** | Extend `DetectionEngine` | Domain-specific needs |
|
|
257
|
+
|
|
258
|
+
## 📦 Project Structure
|
|
259
|
+
|
|
260
|
+
```
|
|
261
|
+
ava-protocol/
|
|
262
|
+
├── src/ava/
│   ├── __init__.py          # Main exports
│   ├── client.py            # Client & GatewayClient
│   ├── session.py           # Transaction context
│   ├── config.py            # Configuration management
│   ├── protocol/            # Core protocol types
│   │   ├── manifest.py      # AVA Manifest
│   │   ├── entities.py      # DetectedEntity
│   │   └── token_vault.py   # Vault implementations
│   ├── engines/             # Detection engines
│   │   ├── base.py          # Abstract interface
│   │   └── presidio.py      # Presidio adapter
│   └── gateways/            # Remote gateway clients
│       └── http.py          # REST client
├── tests/                   # Test suite
├── pyproject.toml           # Package config
├── README.md                # This file
└── PUBLISH.md               # PyPI release guide
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
## 🧪 Development
|
|
283
|
+
|
|
284
|
+
```bash
|
|
285
|
+
# Clone repo
|
|
286
|
+
git clone https://github.com/ava-protocol/ava-protocol.git
|
|
287
|
+
cd ava-protocol
|
|
288
|
+
|
|
289
|
+
# Create virtual environment
|
|
290
|
+
python -m venv venv
|
|
291
|
+
source venv/bin/activate # Windows: venv\Scripts\activate
|
|
292
|
+
|
|
293
|
+
# Install in development mode
|
|
294
|
+
pip install -e ".[all,dev]"
|
|
295
|
+
|
|
296
|
+
# Download Presidio models (if using local)
|
|
297
|
+
python -m spacy download en_core_web_lg
|
|
298
|
+
|
|
299
|
+
# Run tests
|
|
300
|
+
pytest
|
|
301
|
+
|
|
302
|
+
# Lint
|
|
303
|
+
black src/ava tests/
|
|
304
|
+
ruff check src/ava tests/
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
## 🛡️ Security Considerations
|
|
308
|
+
|
|
309
|
+
- **Zero-Retention Mode**: Tokens auto-expire (default: 1 hour)
|
|
310
|
+
- **Vault Isolation**: Session-scoped token storage
|
|
311
|
+
- **Audit Trail**: Every transformation logged in manifest
|
|
312
|
+
- **Hash-Only Storage**: Original values are never stored in manifests (only value hashes and tokens)
|
|
313
|
+
|
|
314
|
+
## 📄 License
|
|
315
|
+
|
|
316
|
+
MIT License — see [LICENSE](LICENSE)
|
|
317
|
+
|
|
318
|
+
## 🤝 Contributing
|
|
319
|
+
|
|
320
|
+
1. Fork the repository
|
|
321
|
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
|
|
322
|
+
3. Commit changes (`git commit -m 'Add amazing feature'`)
|
|
323
|
+
4. Push to branch (`git push origin feature/amazing-feature`)
|
|
324
|
+
5. Open a Pull Request
|
|
325
|
+
|
|
326
|
+
## 🔗 Links
|
|
327
|
+
|
|
328
|
+
- **Documentation**: https://ava-protocol.readthedocs.io
|
|
329
|
+
- **PyPI**: https://pypi.org/project/ava-protocol/
|
|
330
|
+
- **Repository**: https://github.com/ava-protocol/ava-protocol
|
|
331
|
+
- **Issues**: https://github.com/ava-protocol/ava-protocol/issues
|
|
332
|
+
|
|
333
|
+
---
|
|
334
|
+
|
|
335
|
+
**AVA**: Making AI interactions private by default, visible by design.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
ava/__init__.py,sha256=3J2aKq9n8DShZk3CLSNLeYRQT2879Fj9d34RRjrx2Xk,463
|
|
2
|
+
ava/client.py,sha256=N-FGi9mhtv0ymegi3PBirzcBof6imQzF6NfvZWD-VdI,4762
|
|
3
|
+
ava/config.py,sha256=6ifUtgLhKfSSIUlj4B7oJV_D_m6KHAOK4OKZypv4N5I,1666
|
|
4
|
+
ava/session.py,sha256=q2BSjCaOQk5TpELtkvvWvkRGdAGCnep3fp7SfrsYQNw,4337
|
|
5
|
+
ava/engines/__init__.py,sha256=y2YUYLQAhtYMRTsxgNi3bqjjaMUBhY35mogQa7NGmR4,252
|
|
6
|
+
ava/engines/base.py,sha256=fQnjo2nhtkfw40I6YnFkz7NBhIaNKvld5UYKSLbdbcI,1354
|
|
7
|
+
ava/engines/presidio.py,sha256=Cnyaw8jRsDQm3jrBYjHv2-aqqC9OVHzxi7QLBnqfJVM,3663
|
|
8
|
+
ava/gateways/__init__.py,sha256=Fq10R6Wdd3oUOip7tNdZHYmasmTuCqmTXf_eRs2WXn4,101
|
|
9
|
+
ava/gateways/http.py,sha256=RhFuN6za3HLO9ad3TcgMgSGhBtRbzwgIcPntDnOgggI,1475
|
|
10
|
+
ava/protocol/__init__.py,sha256=9yiAH-e0XI5Mc6mJKfgm8apKK4xbtOtZFXjciRpbI4A,338
|
|
11
|
+
ava/protocol/entities.py,sha256=RYayqOaHDLlIPVtn2m1LS7i89HyxT9edPkOnb3yHBeU,645
|
|
12
|
+
ava/protocol/manifest.py,sha256=UE-PKHHkSZMHczLt0fgA1bW05-X-AvqI21dAkS5zcuc,2318
|
|
13
|
+
ava/protocol/token_vault.py,sha256=hlY5qA3zsX3e4P_cubVcHqdl6OtJ6fn5Fhd9L_aY4Zs,5517
|
|
14
|
+
ava_protocol-0.1.0.dist-info/METADATA,sha256=G-IXI-LbDFg2xFS8-iFlnA498SUtdpvjqLA252Vh6sI,11374
|
|
15
|
+
ava_protocol-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
16
|
+
ava_protocol-0.1.0.dist-info/entry_points.txt,sha256=Ss7GoyQPKi4Zsbx0To102--qMW7iqY42ZLqZ9lOgWkI,37
|
|
17
|
+
ava_protocol-0.1.0.dist-info/licenses/LICENSE,sha256=E3QcRMbgb3k2EAQZ_y5uONaUgdtMRE2gYQdUpUHlHxA,1074
|
|
18
|
+
ava_protocol-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 AVA Protocol Team
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|