carnaval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- carnaval/__init__.py +8 -0
- carnaval/cli/__init__.py +6 -0
- carnaval/cli/anonymize.py +157 -0
- carnaval/cli/reinject.py +98 -0
- carnaval/core/__init__.py +3 -0
- carnaval/core/config_loader.py +215 -0
- carnaval/core/language_detector.py +52 -0
- carnaval/core/logger.py +84 -0
- carnaval/core/serializers.py +236 -0
- carnaval/core/span.py +94 -0
- carnaval/core/vault.py +135 -0
- carnaval/pipeline.py +110 -0
- carnaval/recognizers/__init__.py +7 -0
- carnaval/recognizers/ai/__init__.py +3 -0
- carnaval/recognizers/ai/gliner_engine.py +112 -0
- carnaval/recognizers/base.py +208 -0
- carnaval/recognizers/denylist/__init__.py +3 -0
- carnaval/recognizers/denylist/organizations.py +69 -0
- carnaval/recognizers/denylist/people.py +34 -0
- carnaval/recognizers/denylist/places.py +47 -0
- carnaval/recognizers/denylist/singleton.py +86 -0
- carnaval/recognizers/dictionary/__init__.py +13 -0
- carnaval/recognizers/dictionary/_loader.py +117 -0
- carnaval/recognizers/dictionary/cities.py +44 -0
- carnaval/recognizers/dictionary/firstnames.py +49 -0
- carnaval/recognizers/regex/__init__.py +3 -0
- carnaval/recognizers/regex/address/__init__.py +51 -0
- carnaval/recognizers/regex/address/de.py +79 -0
- carnaval/recognizers/regex/address/en.py +67 -0
- carnaval/recognizers/regex/address/es.py +57 -0
- carnaval/recognizers/regex/address/fr.py +149 -0
- carnaval/recognizers/regex/address/it.py +59 -0
- carnaval/recognizers/regex/address/pt.py +65 -0
- carnaval/recognizers/regex/address_fr.py +24 -0
- carnaval/recognizers/regex/context_location.py +216 -0
- carnaval/recognizers/regex/email.py +23 -0
- carnaval/recognizers/regex/fiscal_fr.py +95 -0
- carnaval/recognizers/regex/header_source.py +36 -0
- carnaval/recognizers/regex/iban_bic.py +96 -0
- carnaval/recognizers/regex/name_patterns.py +23 -0
- carnaval/recognizers/regex/names/__init__.py +51 -0
- carnaval/recognizers/regex/names/de.py +236 -0
- carnaval/recognizers/regex/names/en.py +79 -0
- carnaval/recognizers/regex/names/es.py +77 -0
- carnaval/recognizers/regex/names/fr.py +167 -0
- carnaval/recognizers/regex/names/it.py +81 -0
- carnaval/recognizers/regex/names/pt.py +72 -0
- carnaval/recognizers/regex/org_suffix.py +125 -0
- carnaval/recognizers/regex/phone/__init__.py +51 -0
- carnaval/recognizers/regex/phone/de.py +40 -0
- carnaval/recognizers/regex/phone/en.py +63 -0
- carnaval/recognizers/regex/phone/es.py +50 -0
- carnaval/recognizers/regex/phone/fr.py +31 -0
- carnaval/recognizers/regex/phone/it.py +36 -0
- carnaval/recognizers/regex/phone/pt.py +50 -0
- carnaval/recognizers/regex/phone_fr.py +14 -0
- carnaval/recognizers/regex/url.py +27 -0
- carnaval/stages/__init__.py +7 -0
- carnaval/stages/documents.py +96 -0
- carnaval/stages/s1_intake.py +71 -0
- carnaval/stages/s2_preprocess.py +79 -0
- carnaval/stages/s3_detect.py +369 -0
- carnaval/stages/s4_resolve.py +155 -0
- carnaval/stages/s5_mask.py +137 -0
- carnaval/stages/s6_output.py +113 -0
- carnaval/stages/s7_reinject.py +174 -0
- carnaval-0.1.0.dist-info/METADATA +312 -0
- carnaval-0.1.0.dist-info/RECORD +72 -0
- carnaval-0.1.0.dist-info/WHEEL +4 -0
- carnaval-0.1.0.dist-info/entry_points.txt +3 -0
- carnaval-0.1.0.dist-info/licenses/LICENSE +15 -0
- carnaval-0.1.0.dist-info/licenses/NOTICE +34 -0
carnaval/__init__.py
ADDED
carnaval/cli/__init__.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# Copyright 2026 Patrice AUBERT
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""CLI carnaval : anonymise un fichier .txt vers les formats standards.
|
|
4
|
+
|
|
5
|
+
Usage :
|
|
6
|
+
carnaval-anonymize inbox/doc.txt
|
|
7
|
+
carnaval-anonymize doc.txt --profile acknowledge
|
|
8
|
+
carnaval-anonymize doc.txt --profile acknowledge --private mon_profil
|
|
9
|
+
carnaval-anonymize doc.txt --no-gliner
|
|
10
|
+
|
|
11
|
+
La variable d'environnement CARNAVAL_VAULT_PASSWORD doit etre definie.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import os
|
|
18
|
+
import sys
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from dotenv import load_dotenv
|
|
22
|
+
|
|
23
|
+
from carnaval.core.logger import configure_logging
|
|
24
|
+
from carnaval.pipeline import run_anonymization
|
|
25
|
+
|
|
26
|
+
# Repository root, 4 levels above this file: cli/ -> carnaval/ -> src/ -> repo.
# NOTE(review): this matches a src/ checkout layout; the wheel listing ships
# carnaval/cli/ without a src/ level, so confirm what parents[3] resolves to
# once the package is installed.
_REPO_ROOT = Path(__file__).resolve().parents[3]

# Fallback vault password used by main() when CARNAVAL_VAULT_PASSWORD is unset
# or empty; main() prints a warning on stderr in that case.
DEMO_PASSWORD = "demo_password_change_me_in_prod"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def main() -> int:
    """Run the ``carnaval-anonymize`` command-line interface.

    Loads ``.env`` from the repository root, parses the command line,
    configures logging, resolves the vault password (environment variable
    ``CARNAVAL_VAULT_PASSWORD``, falling back to ``DEMO_PASSWORD`` with a
    warning on stderr), runs the anonymization pipeline and prints a
    human-readable summary on stdout.

    Returns:
        0 on success, 1 when the pipeline raised an exception, 2 when the
        input path is not an existing regular file.
    """
    load_dotenv(_REPO_ROOT / ".env")

    parser = argparse.ArgumentParser(
        prog="carnaval-anonymize",
        description=(
            "Anonymise un fichier texte. Produit simultanement les sorties "
            "TXT, JSON, JSONL, XML, CoNLL et HTML dans outbox/."
        ),
    )
    parser.add_argument(
        "input",
        type=Path,
        help="Chemin du fichier .txt a anonymiser",
    )
    parser.add_argument(
        "--outbox",
        type=Path,
        default=_REPO_ROOT / "outbox",
        help="Dossier de sortie (defaut : ./outbox)",
    )
    parser.add_argument(
        "--profile",
        type=str,
        default=None,
        help="Profil metier (acknowledge, invoice, email, ...)",
    )
    parser.add_argument(
        "--private",
        type=str,
        default=None,
        help="Profil prive sous profiles_private/",
    )
    parser.add_argument(
        "--no-gliner",
        action="store_true",
        help="Desactive GLiNER (plus rapide, mais detecte moins de PERSON/LOCATION libres)",
    )
    parser.add_argument(
        "--gliner-threshold",
        type=float,
        default=0.4,
        help="Seuil de confiance GLiNER (defaut : 0.4)",
    )
    parser.add_argument(
        "--cleanup-pipes",
        action="store_true",
        help="Retire les `|` parasites entre les mots (utile pour textes extracteur PDF defaillants)",
    )
    parser.add_argument(
        "--language",
        type=str,
        default=None,
        choices=("fr", "en", "de", "ja"),
        help="Force la langue. Auto-detection si non specifie.",
    )
    parser.add_argument(
        "--log-level",
        type=str,
        default="INFO",
        choices=("DEBUG", "INFO", "WARNING", "ERROR"),
        help="Niveau de log (defaut : INFO)",
    )
    parser.add_argument(
        "--console",
        action="store_true",
        help="Logs en mode console lisible plutot que JSON",
    )
    args = parser.parse_args()

    configure_logging(level=args.log_level, json_format=not args.console)

    # An unset *or empty* environment variable falls back to the demo password.
    password = os.environ.get("CARNAVAL_VAULT_PASSWORD") or DEMO_PASSWORD
    if password == DEMO_PASSWORD:
        print(
            "[AVERTISSEMENT] CARNAVAL_VAULT_PASSWORD non defini : password de demo utilise.",
            file=sys.stderr,
        )

    # A directory (or any other non-regular file) cannot be anonymized: reject
    # it here with the dedicated exit code instead of failing in the pipeline.
    if not args.input.is_file():
        print(f"[ERREUR] Fichier introuvable : {args.input}", file=sys.stderr)
        return 2

    try:
        # The third element of the result is not needed for the summary.
        masked, written, _cfg = run_anonymization(
            input_path=args.input,
            outbox_dir=args.outbox,
            vault_password=password,
            profile=args.profile,
            private_profile=args.private,
            use_gliner=not args.no_gliner,
            gliner_threshold=args.gliner_threshold,
            cleanup_pipes=args.cleanup_pipes,
            language=args.language,
            repo_root=_REPO_ROOT,
        )
    except Exception as e:
        # CLI boundary: report any pipeline failure as a one-line error.
        print(f"[ERREUR] {type(e).__name__}: {e}", file=sys.stderr)
        return 1

    # Console summary for the human operator.
    print()
    print("=" * 60)
    print(f" carnaval - {args.input.name}")
    print("=" * 60)
    print(f" Langue : {masked.language}")
    print(f" Spans masques : {len(masked.spans)}")
    print(" Par categorie :")
    for cat, n in sorted(masked.by_category.items()):
        print(f" {cat:18s} : {n}")
    print()
    print(" Fichiers produits :")
    print(f" TXT : {written.txt_path}")
    print(f" JSON : {written.json_path}")
    print(f" JSONL : {written.jsonl_path}")
    print(f" XML : {written.xml_path}")
    print(f" CoNLL : {written.conll_path}")
    print(f" HTML : {written.html_path}")
    print(f" Vault chiffre : {written.vault_path}")
    print(f" Metadata : {written.meta_path}")
    print("=" * 60)
    return 0
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
if __name__ == "__main__":
|
|
157
|
+
sys.exit(main())
|
carnaval/cli/reinject.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Copyright 2026 Patrice AUBERT
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""CLI carnaval : restaure les valeurs originales dans un JSON ou XML.
|
|
4
|
+
|
|
5
|
+
Usage :
|
|
6
|
+
carnaval-reinject response.json --vault outbox/vault/doc_vault.enc
|
|
7
|
+
carnaval-reinject response.xml --vault outbox/vault/doc_vault.enc
|
|
8
|
+
carnaval-reinject response.json --vault ... --output final.json
|
|
9
|
+
|
|
10
|
+
Auto-detection JSON vs XML par le contenu du fichier (premier caractere).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import os
|
|
17
|
+
import sys
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
from dotenv import load_dotenv
|
|
21
|
+
|
|
22
|
+
from carnaval.core.vault import Vault, VaultError
|
|
23
|
+
from carnaval.stages.s7_reinject import reinject_string
|
|
24
|
+
|
|
25
|
+
# Repository root, 4 levels above this file: cli/ -> carnaval/ -> src/ -> repo.
# NOTE(review): this matches a src/ checkout layout; the wheel listing ships
# carnaval/cli/ without a src/ level, so confirm what parents[3] resolves to
# once the package is installed.
_REPO_ROOT = Path(__file__).resolve().parents[3]

# Fallback vault password used by main() when CARNAVAL_VAULT_PASSWORD is unset
# or empty; main() prints a warning on stderr in that case.
DEMO_PASSWORD = "demo_password_change_me_in_prod"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def main() -> int:
    """Run the ``carnaval-reinject`` command-line interface.

    Loads ``.env`` from the repository root, parses the command line, resolves
    the vault password (environment variable ``CARNAVAL_VAULT_PASSWORD``,
    falling back to ``DEMO_PASSWORD`` with a warning on stderr), loads the
    vault, passes the input file content through ``reinject_string`` and
    writes the result to the output file.

    Returns:
        0 on success, 1 when the vault cannot be loaded or when reading,
        restoring or writing fails, 2 when the input file or the vault file
        does not exist.
    """
    load_dotenv(_REPO_ROOT / ".env")

    parser = argparse.ArgumentParser(
        prog="carnaval-reinject",
        description=(
            "Restaure les valeurs originales dans un JSON ou XML produit par "
            "un LLM downstream. Auto-detection du format."
        ),
    )
    parser.add_argument(
        "input",
        type=Path,
        help="Fichier JSON ou XML contenant des placeholders [TYPE_n]",
    )
    parser.add_argument(
        "--vault",
        type=Path,
        required=True,
        help="Chemin du vault chiffre produit par carnaval-anonymize",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="Fichier de sortie (defaut : <input>_final.<ext>)",
    )
    args = parser.parse_args()

    # An unset *or empty* environment variable falls back to the demo password.
    password = os.environ.get("CARNAVAL_VAULT_PASSWORD") or DEMO_PASSWORD
    if password == DEMO_PASSWORD:
        print(
            "[AVERTISSEMENT] CARNAVAL_VAULT_PASSWORD non defini : demo utilise.",
            file=sys.stderr,
        )

    if not args.input.exists():
        print(f"[ERREUR] Fichier introuvable : {args.input}", file=sys.stderr)
        return 2
    if not args.vault.exists():
        print(f"[ERREUR] Vault introuvable : {args.vault}", file=sys.stderr)
        return 2

    # Load the vault.
    vault = Vault(password=password, path=args.vault)
    try:
        vault.load()
    except VaultError as e:
        print(f"[ERREUR] Vault : {e}", file=sys.stderr)
        return 1

    # Default output path: <input stem>_final<input suffix>, next to the input.
    if args.output is None:
        stem = args.input.stem
        ext = args.input.suffix
        args.output = args.input.parent / f"{stem}_final{ext}"

    try:
        content = args.input.read_text(encoding="utf-8")
        restored = reinject_string(content, vault)
        args.output.write_text(restored, encoding="utf-8")
    except Exception as e:
        # CLI boundary: report read/decode/restore/write failures as a
        # one-line error rather than an uncaught traceback.
        print(f"[ERREUR] {type(e).__name__}: {e}", file=sys.stderr)
        return 1

    print(f"[OK] Fichier restitue : {args.output}")
    return 0
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
if __name__ == "__main__":
|
|
98
|
+
sys.exit(main())
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
# Copyright 2026 Patrice AUBERT
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""Chargement de la configuration en couches.
|
|
4
|
+
|
|
5
|
+
Strategie de merge :
|
|
6
|
+
1. base -> config/pipeline.yaml + sous-fichiers config/*/*.yaml
|
|
7
|
+
2. profil -> profiles/<type>/profile.yaml + sous-fichiers
|
|
8
|
+
3. prive -> profiles_private/<custom>/profile.yaml + sous-fichiers (optionnel)
|
|
9
|
+
|
|
10
|
+
Les listes sont CONCATENEES (deny_lists, allow_lists), les dicts sont MERGES
|
|
11
|
+
profondement. Les scalaires de la couche superieure ecrasent la couche
|
|
12
|
+
inferieure.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
import yaml
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class Config:
    """Application configuration resolved from merged layers.

    ``raw`` holds the merged configuration tree; ``layers`` lists the labels
    of the layers that were merged into it.
    """

    raw: dict[str, Any] = field(default_factory=dict)
    layers: list[str] = field(default_factory=list)  # labels of merged layers

    def _section(self, name: str) -> dict[str, Any]:
        """Return the top-level entry *name* of ``raw``, or ``{}`` if absent."""
        return self.raw.get(name, {})

    # Shortcuts to the usual top-level sections.
    @property
    def pipeline(self) -> dict[str, Any]:
        return self._section("pipeline")

    @property
    def patterns(self) -> dict[str, Any]:
        return self._section("patterns")

    @property
    def deny_lists(self) -> dict[str, list[str]]:
        return self._section("deny_lists")

    @property
    def allow_lists(self) -> dict[str, list[str]]:
        return self._section("allow_lists")

    @property
    def policies(self) -> dict[str, Any]:
        return self._section("policies")

    @property
    def ai_models(self) -> dict[str, Any]:
        return self._section("ai_models")

    def get(self, dotted_key: str, default: Any = None) -> Any:
        """Look up a dotted path, e.g. ``cfg.get('policies.priority_rules.X')``.

        Returns *default* as soon as a path segment is missing or the current
        node is not a dict.
        """
        cursor: Any = self.raw
        for segment in dotted_key.split("."):
            if isinstance(cursor, dict) and segment in cursor:
                cursor = cursor[segment]
            else:
                return default
        return cursor
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# ----------------------------------------------------------------------
|
|
68
|
+
# Merge utilities
|
|
69
|
+
# ----------------------------------------------------------------------
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _deep_merge(base: dict[str, Any], overlay: dict[str, Any]) -> dict[str, Any]:
|
|
73
|
+
"""Merge profond.
|
|
74
|
+
|
|
75
|
+
Regles :
|
|
76
|
+
- dict + dict -> merge cle a cle (recursif)
|
|
77
|
+
- list + list -> concatenation (sans dedoublonnage : c'est le role du caller)
|
|
78
|
+
- scalaire + scalaire -> overlay gagne
|
|
79
|
+
- types mixtes -> overlay gagne (warning implicite)
|
|
80
|
+
"""
|
|
81
|
+
result = dict(base)
|
|
82
|
+
for key, val in overlay.items():
|
|
83
|
+
if key in result:
|
|
84
|
+
existing = result[key]
|
|
85
|
+
if isinstance(existing, dict) and isinstance(val, dict):
|
|
86
|
+
result[key] = _deep_merge(existing, val)
|
|
87
|
+
elif isinstance(existing, list) and isinstance(val, list):
|
|
88
|
+
result[key] = existing + val
|
|
89
|
+
else:
|
|
90
|
+
result[key] = val
|
|
91
|
+
else:
|
|
92
|
+
result[key] = val
|
|
93
|
+
return result
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _load_yaml(path: Path) -> dict[str, Any]:
|
|
97
|
+
"""Charge un YAML, renvoie {} si fichier vide."""
|
|
98
|
+
if not path.exists():
|
|
99
|
+
return {}
|
|
100
|
+
with open(path, encoding="utf-8") as f:
|
|
101
|
+
data = yaml.safe_load(f)
|
|
102
|
+
return data if isinstance(data, dict) else {}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _load_directory_layer(dir_path: Path) -> dict[str, Any]:
|
|
106
|
+
"""Charge tous les YAML d'un dossier en un dict structure par sous-dossier.
|
|
107
|
+
|
|
108
|
+
Pour un layout :
|
|
109
|
+
layer/
|
|
110
|
+
pipeline.yaml
|
|
111
|
+
patterns/
|
|
112
|
+
fiscal_fr.yaml -> patterns.fiscal_fr.*
|
|
113
|
+
deny_lists/
|
|
114
|
+
organizations.yaml -> deny_lists.organizations.*
|
|
115
|
+
|
|
116
|
+
Le contenu du fichier remplace le sous-namespace correspondant.
|
|
117
|
+
"""
|
|
118
|
+
if not dir_path.exists() or not dir_path.is_dir():
|
|
119
|
+
return {}
|
|
120
|
+
|
|
121
|
+
out: dict[str, Any] = {}
|
|
122
|
+
|
|
123
|
+
# 1. fichiers .yaml directement a la racine de la couche (ex: pipeline.yaml, ai_models.yaml)
|
|
124
|
+
for yml in sorted(dir_path.glob("*.yaml")):
|
|
125
|
+
key = yml.stem
|
|
126
|
+
data = _load_yaml(yml)
|
|
127
|
+
# Si le YAML contient une cle racine du meme nom, on la fusionne
|
|
128
|
+
# Sinon on stocke le contenu sous la cle key.
|
|
129
|
+
if key in data and isinstance(data[key], dict):
|
|
130
|
+
out[key] = _deep_merge(out.get(key, {}), data[key])
|
|
131
|
+
else:
|
|
132
|
+
out[key] = (
|
|
133
|
+
_deep_merge(out.get(key, {}), data) if isinstance(data, dict) else data
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
# 2. sous-dossiers : patterns/, deny_lists/, allow_lists/, policies/
|
|
137
|
+
# + recursion limitee pour places/{fr,de,...}.yaml
|
|
138
|
+
for sub in sorted(p for p in dir_path.iterdir() if p.is_dir()):
|
|
139
|
+
sub_content: dict[str, Any] = {}
|
|
140
|
+
# 2a. fichiers .yaml directs (ex: deny_lists/organizations.yaml)
|
|
141
|
+
for yml in sorted(sub.glob("*.yaml")):
|
|
142
|
+
sub_content[yml.stem] = _load_yaml(yml)
|
|
143
|
+
# 2b. sous-sous-dossiers (ex: deny_lists/places/fr.yaml)
|
|
144
|
+
# -> deny_lists.places = {fr: [...], de: [...], ...}
|
|
145
|
+
for subsub in sorted(p for p in sub.iterdir() if p.is_dir()):
|
|
146
|
+
lang_dict: dict[str, Any] = {}
|
|
147
|
+
for yml in sorted(subsub.glob("*.yaml")):
|
|
148
|
+
lang_dict[yml.stem] = _load_yaml(yml)
|
|
149
|
+
if lang_dict:
|
|
150
|
+
sub_content[subsub.name] = _deep_merge(
|
|
151
|
+
sub_content.get(subsub.name, {}), lang_dict
|
|
152
|
+
)
|
|
153
|
+
if sub_content:
|
|
154
|
+
out[sub.name] = _deep_merge(out.get(sub.name, {}), sub_content)
|
|
155
|
+
|
|
156
|
+
return out
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# ----------------------------------------------------------------------
|
|
160
|
+
# API publique
|
|
161
|
+
# ----------------------------------------------------------------------
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def load_config(
    base_dir: Path | str | None = None,
    profile: str | None = None,
    private_profile: str | None = None,
    repo_root: Path | str | None = None,
) -> Config:
    """Load the configuration as a cascade: base -> profile -> private profile.

    Args:
        base_dir: path of the ``config/`` directory (default: ``<repo>/config``).
        profile: name of the public profile, looked up under ``profiles/``.
        private_profile: name of the private profile, looked up under
            ``profiles_private/``.
        repo_root: repository root (default: ``parents[3]`` of this file).

    Returns:
        The resolved ``Config`` with the labels of the layers that were merged.

    Raises:
        FileNotFoundError: when a requested profile yields an empty layer.
    """
    # Default root = parent of src/carnaval/core/.
    root = Path(__file__).resolve().parents[3] if repo_root is None else Path(repo_root)
    base_path = Path(base_dir) if base_dir else root / "config"

    merged: dict[str, Any] = {}
    layers_loaded: list[str] = []

    # Layer 1: base. An empty or missing base is silently skipped.
    base_layer = _load_directory_layer(base_path)
    if base_layer:
        merged = _deep_merge(merged, base_layer)
        layers_loaded.append(f"base:{base_path}")

    # Layers 2 and 3: public profile, then optional private profile.
    # Each row: (requested name, parent folder, layer label, error prefix).
    overlays = (
        (profile, "profiles", "profile", "Profil introuvable : "),
        (private_profile, "profiles_private", "private", "Profil prive introuvable : "),
    )
    for requested, folder, label, error_prefix in overlays:
        if not requested:
            continue
        layer_dir = root / folder / requested
        layer = _load_directory_layer(layer_dir)
        if not layer:
            raise FileNotFoundError(f"{error_prefix}{layer_dir}")
        merged = _deep_merge(merged, layer)
        layers_loaded.append(f"{label}:{requested}")

    return Config(raw=merged, layers=layers_loaded)
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Copyright 2026 Patrice AUBERT
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""Detection de langue via lingua-language-detector.
|
|
4
|
+
|
|
5
|
+
Wrapper minimal. Renvoie un code ISO 'fr'/'en'/'de'/'ja' ou 'unknown'.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
from lingua import Language, LanguageDetectorBuilder
|
|
13
|
+
|
|
14
|
+
# Languages the detector is built for (passed to LanguageDetectorBuilder).
_SUPPORTED = (Language.FRENCH, Language.ENGLISH, Language.GERMAN, Language.JAPANESE)
# Module-level cache for the detector instance; filled on first use by
# _get_detector().
_DETECTOR: Optional[object] = None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_detector():
    """Return the shared language detector, building it on the first call.

    The detector is built for ``_SUPPORTED`` with preloaded language models
    and cached in the module-level ``_DETECTOR``.
    """
    global _DETECTOR
    if _DETECTOR is not None:
        return _DETECTOR
    builder = LanguageDetectorBuilder.from_languages(*_SUPPORTED)
    _DETECTOR = builder.with_preloaded_language_models().build()
    return _DETECTOR
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def detect_language(text: str) -> str:
    """Detect the language of *text*.

    Only the first 5000 characters are analysed.

    Args:
        text: textual content to analyse.

    Returns:
        'fr', 'en', 'de', 'ja', or 'unknown' when the text is empty or blank
        or when the detector's result has no matching code.
    """
    if not (text and text.strip()):
        return "unknown"

    detected = _get_detector().detect_language_of(text[:5000])
    iso_codes = {
        Language.FRENCH: "fr",
        Language.ENGLISH: "en",
        Language.GERMAN: "de",
        Language.JAPANESE: "ja",
    }
    # A None result is not a key of the table, so it also maps to "unknown".
    return iso_codes.get(detected, "unknown")
|
carnaval/core/logger.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# Copyright 2026 Patrice AUBERT
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""Logging structure avec garde-fou anti-fuite.
|
|
4
|
+
|
|
5
|
+
Principe : un logger structlog standard, plus un filtre qui interdit toute
|
|
6
|
+
journalisation contenant des cles a haut risque (`original`, `raw_text`, `mapping`).
|
|
7
|
+
Si une telle cle apparait, sa valeur est remplacee par `<REDACTED>` avant emission.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import sys
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import structlog
|
|
17
|
+
|
|
18
|
+
# Cles dont la valeur ne doit JAMAIS apparaitre dans les logs.
|
|
19
|
+
SENSITIVE_KEYS = frozenset(
|
|
20
|
+
{
|
|
21
|
+
"original",
|
|
22
|
+
"raw_text",
|
|
23
|
+
"raw",
|
|
24
|
+
"text",
|
|
25
|
+
"mapping",
|
|
26
|
+
"vault",
|
|
27
|
+
"vault_contents",
|
|
28
|
+
"password",
|
|
29
|
+
"secret",
|
|
30
|
+
"forward",
|
|
31
|
+
"backward",
|
|
32
|
+
}
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _redact_sensitive(
|
|
37
|
+
logger, method_name, event_dict: dict[str, Any]
|
|
38
|
+
) -> dict[str, Any]:
|
|
39
|
+
"""Processor structlog : remplace les valeurs sensibles par <REDACTED>."""
|
|
40
|
+
for key in list(event_dict.keys()):
|
|
41
|
+
if key.lower() in SENSITIVE_KEYS:
|
|
42
|
+
event_dict[key] = "<REDACTED>"
|
|
43
|
+
return event_dict
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def configure_logging(level: str = "INFO", json_format: bool = True) -> None:
    """Configure the standard ``logging`` module and structlog globally.

    Args:
        level: 'DEBUG', 'INFO', 'WARNING' or 'ERROR'; a name that is not an
            attribute of ``logging`` falls back to ``logging.INFO``.
        json_format: True -> JSON output; False -> human-readable console
            output.
    """
    numeric_level = getattr(logging, level.upper(), logging.INFO)

    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=numeric_level,
    )

    if json_format:
        renderer: Any = structlog.processors.JSONRenderer()
    else:
        renderer = structlog.dev.ConsoleRenderer()

    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.stdlib.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            _redact_sensitive,  # leak guard: runs before the renderer
            renderer,
        ],
        wrapper_class=structlog.make_filtering_bound_logger(numeric_level),
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
    )
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def get_logger(name: str = "carnaval") -> structlog.BoundLogger:
    """Return the structlog logger obtained for *name*.

    ``configure_logging()`` is expected to have been called beforehand.
    """
    bound_logger = structlog.get_logger(name)
    return bound_logger
|