carnaval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72):
  1. carnaval/__init__.py +8 -0
  2. carnaval/cli/__init__.py +6 -0
  3. carnaval/cli/anonymize.py +157 -0
  4. carnaval/cli/reinject.py +98 -0
  5. carnaval/core/__init__.py +3 -0
  6. carnaval/core/config_loader.py +215 -0
  7. carnaval/core/language_detector.py +52 -0
  8. carnaval/core/logger.py +84 -0
  9. carnaval/core/serializers.py +236 -0
  10. carnaval/core/span.py +94 -0
  11. carnaval/core/vault.py +135 -0
  12. carnaval/pipeline.py +110 -0
  13. carnaval/recognizers/__init__.py +7 -0
  14. carnaval/recognizers/ai/__init__.py +3 -0
  15. carnaval/recognizers/ai/gliner_engine.py +112 -0
  16. carnaval/recognizers/base.py +208 -0
  17. carnaval/recognizers/denylist/__init__.py +3 -0
  18. carnaval/recognizers/denylist/organizations.py +69 -0
  19. carnaval/recognizers/denylist/people.py +34 -0
  20. carnaval/recognizers/denylist/places.py +47 -0
  21. carnaval/recognizers/denylist/singleton.py +86 -0
  22. carnaval/recognizers/dictionary/__init__.py +13 -0
  23. carnaval/recognizers/dictionary/_loader.py +117 -0
  24. carnaval/recognizers/dictionary/cities.py +44 -0
  25. carnaval/recognizers/dictionary/firstnames.py +49 -0
  26. carnaval/recognizers/regex/__init__.py +3 -0
  27. carnaval/recognizers/regex/address/__init__.py +51 -0
  28. carnaval/recognizers/regex/address/de.py +79 -0
  29. carnaval/recognizers/regex/address/en.py +67 -0
  30. carnaval/recognizers/regex/address/es.py +57 -0
  31. carnaval/recognizers/regex/address/fr.py +149 -0
  32. carnaval/recognizers/regex/address/it.py +59 -0
  33. carnaval/recognizers/regex/address/pt.py +65 -0
  34. carnaval/recognizers/regex/address_fr.py +24 -0
  35. carnaval/recognizers/regex/context_location.py +216 -0
  36. carnaval/recognizers/regex/email.py +23 -0
  37. carnaval/recognizers/regex/fiscal_fr.py +95 -0
  38. carnaval/recognizers/regex/header_source.py +36 -0
  39. carnaval/recognizers/regex/iban_bic.py +96 -0
  40. carnaval/recognizers/regex/name_patterns.py +23 -0
  41. carnaval/recognizers/regex/names/__init__.py +51 -0
  42. carnaval/recognizers/regex/names/de.py +236 -0
  43. carnaval/recognizers/regex/names/en.py +79 -0
  44. carnaval/recognizers/regex/names/es.py +77 -0
  45. carnaval/recognizers/regex/names/fr.py +167 -0
  46. carnaval/recognizers/regex/names/it.py +81 -0
  47. carnaval/recognizers/regex/names/pt.py +72 -0
  48. carnaval/recognizers/regex/org_suffix.py +125 -0
  49. carnaval/recognizers/regex/phone/__init__.py +51 -0
  50. carnaval/recognizers/regex/phone/de.py +40 -0
  51. carnaval/recognizers/regex/phone/en.py +63 -0
  52. carnaval/recognizers/regex/phone/es.py +50 -0
  53. carnaval/recognizers/regex/phone/fr.py +31 -0
  54. carnaval/recognizers/regex/phone/it.py +36 -0
  55. carnaval/recognizers/regex/phone/pt.py +50 -0
  56. carnaval/recognizers/regex/phone_fr.py +14 -0
  57. carnaval/recognizers/regex/url.py +27 -0
  58. carnaval/stages/__init__.py +7 -0
  59. carnaval/stages/documents.py +96 -0
  60. carnaval/stages/s1_intake.py +71 -0
  61. carnaval/stages/s2_preprocess.py +79 -0
  62. carnaval/stages/s3_detect.py +369 -0
  63. carnaval/stages/s4_resolve.py +155 -0
  64. carnaval/stages/s5_mask.py +137 -0
  65. carnaval/stages/s6_output.py +113 -0
  66. carnaval/stages/s7_reinject.py +174 -0
  67. carnaval-0.1.0.dist-info/METADATA +312 -0
  68. carnaval-0.1.0.dist-info/RECORD +72 -0
  69. carnaval-0.1.0.dist-info/WHEEL +4 -0
  70. carnaval-0.1.0.dist-info/entry_points.txt +3 -0
  71. carnaval-0.1.0.dist-info/licenses/LICENSE +15 -0
  72. carnaval-0.1.0.dist-info/licenses/NOTICE +34 -0
carnaval/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ # Copyright 2026 Patrice AUBERT
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """carnaval - PII anonymization framework with reversible vault.
4
+
5
+ Apache License 2.0.
6
+ """
7
+
8
+ __version__ = "0.1.0"
@@ -0,0 +1,6 @@
1
+ # Copyright 2026 Patrice AUBERT
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Interfaces en ligne de commande de carnaval.
4
+
5
+ Expose les points d'entree `carnaval-anonymize` et `carnaval-reinject`.
6
+ """
@@ -0,0 +1,157 @@
1
+ # Copyright 2026 Patrice AUBERT
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """CLI carnaval : anonymise un fichier .txt vers les formats standards.
4
+
5
+ Usage :
6
+ carnaval-anonymize inbox/doc.txt
7
+ carnaval-anonymize doc.txt --profile acknowledge
8
+ carnaval-anonymize doc.txt --profile acknowledge --private mon_profil
9
+ carnaval-anonymize doc.txt --no-gliner
10
+
11
+ La variable d'environnement CARNAVAL_VAULT_PASSWORD doit etre definie.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import os
18
+ import sys
19
+ from pathlib import Path
20
+
21
+ from dotenv import load_dotenv
22
+
23
+ from carnaval.core.logger import configure_logging
24
+ from carnaval.pipeline import run_anonymization
25
+
26
# Repository root (4 levels above this file: cli/ -> carnaval/ -> src/ -> repo).
# NOTE(review): this assumes a source checkout with a src/ layout. The wheel
# ships this module as carnaval/cli/anonymize.py, so once installed parents[3]
# presumably resolves above site-packages -- confirm the intended default root.
_REPO_ROOT = Path(__file__).resolve().parents[3]

# Fallback vault password used by main() when CARNAVAL_VAULT_PASSWORD is unset
# or empty; main() prints a warning on stderr whenever it is in effect.
DEMO_PASSWORD = "demo_password_change_me_in_prod"
30
+
31
+
32
def main() -> int:
    """Run the ``carnaval-anonymize`` command-line entry point.

    Loads ``.env`` from ``_REPO_ROOT``, parses the command line, configures
    logging, resolves the vault password, runs ``run_anonymization`` on the
    input file and prints a summary of the result on stdout.

    Returns:
        0 on success, 1 when ``run_anonymization`` raised, 2 when the input
        path does not exist.
    """
    load_dotenv(_REPO_ROOT / ".env")

    cli = argparse.ArgumentParser(
        prog="carnaval-anonymize",
        description=(
            "Anonymise un fichier texte. Produit simultanement les sorties "
            "TXT, JSON, JSONL, XML, CoNLL et HTML dans outbox/."
        ),
    )
    cli.add_argument("input", type=Path, help="Chemin du fichier .txt a anonymiser")
    cli.add_argument(
        "--outbox",
        type=Path,
        default=_REPO_ROOT / "outbox",
        help="Dossier de sortie (defaut : ./outbox)",
    )
    cli.add_argument(
        "--profile",
        type=str,
        default=None,
        help="Profil metier (acknowledge, invoice, email, ...)",
    )
    cli.add_argument(
        "--private",
        type=str,
        default=None,
        help="Profil prive sous profiles_private/",
    )
    cli.add_argument(
        "--no-gliner",
        action="store_true",
        help="Desactive GLiNER (plus rapide, mais detecte moins de PERSON/LOCATION libres)",
    )
    cli.add_argument(
        "--gliner-threshold",
        type=float,
        default=0.4,
        help="Seuil de confiance GLiNER (defaut : 0.4)",
    )
    cli.add_argument(
        "--cleanup-pipes",
        action="store_true",
        help="Retire les `|` parasites entre les mots (utile pour textes extracteur PDF defaillants)",
    )
    cli.add_argument(
        "--language",
        type=str,
        default=None,
        choices=("fr", "en", "de", "ja"),
        help="Force la langue. Auto-detection si non specifie.",
    )
    cli.add_argument(
        "--log-level",
        type=str,
        default="INFO",
        choices=("DEBUG", "INFO", "WARNING", "ERROR"),
        help="Niveau de log (defaut : INFO)",
    )
    cli.add_argument(
        "--console",
        action="store_true",
        help="Logs en mode console lisible plutot que JSON",
    )
    opts = cli.parse_args()

    configure_logging(level=opts.log_level, json_format=not opts.console)

    # An unset *or empty* environment variable falls back to the demo password.
    secret = os.environ.get("CARNAVAL_VAULT_PASSWORD") or DEMO_PASSWORD
    if secret == DEMO_PASSWORD:
        print(
            "[AVERTISSEMENT] CARNAVAL_VAULT_PASSWORD non defini : password de demo utilise.",
            file=sys.stderr,
        )

    if not opts.input.exists():
        print(f"[ERREUR] Fichier introuvable : {opts.input}", file=sys.stderr)
        return 2

    # Top-level boundary: any pipeline failure is reported and mapped to exit 1.
    try:
        masked, written, _cfg = run_anonymization(
            input_path=opts.input,
            outbox_dir=opts.outbox,
            vault_password=secret,
            profile=opts.profile,
            private_profile=opts.private,
            use_gliner=not opts.no_gliner,
            gliner_threshold=opts.gliner_threshold,
            cleanup_pipes=opts.cleanup_pipes,
            language=opts.language,
            repo_root=_REPO_ROOT,
        )
    except Exception as exc:
        print(f"[ERREUR] {type(exc).__name__}: {exc}", file=sys.stderr)
        return 1

    # Human-readable console summary, assembled first and emitted in one write.
    rule = "=" * 60
    report = [
        "",
        rule,
        f" carnaval - {opts.input.name}",
        rule,
        f" Langue : {masked.language}",
        f" Spans masques : {len(masked.spans)}",
        " Par categorie :",
    ]
    report.extend(
        f" {cat:18s} : {n}" for cat, n in sorted(masked.by_category.items())
    )
    report.extend(
        [
            "",
            " Fichiers produits :",
            f" TXT : {written.txt_path}",
            f" JSON : {written.json_path}",
            f" JSONL : {written.jsonl_path}",
            f" XML : {written.xml_path}",
            f" CoNLL : {written.conll_path}",
            f" HTML : {written.html_path}",
            f" Vault chiffre : {written.vault_path}",
            f" Metadata : {written.meta_path}",
            rule,
        ]
    )
    print("\n".join(report))
    return 0
154
+
155
+
156
+ if __name__ == "__main__":
157
+ sys.exit(main())
@@ -0,0 +1,98 @@
1
+ # Copyright 2026 Patrice AUBERT
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """CLI carnaval : restaure les valeurs originales dans un JSON ou XML.
4
+
5
+ Usage :
6
+ carnaval-reinject response.json --vault outbox/vault/doc_vault.enc
7
+ carnaval-reinject response.xml --vault outbox/vault/doc_vault.enc
8
+ carnaval-reinject response.json --vault ... --output final.json
9
+
10
+ Auto-detection JSON vs XML par le contenu du fichier (premier caractere).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import os
17
+ import sys
18
+ from pathlib import Path
19
+
20
+ from dotenv import load_dotenv
21
+
22
+ from carnaval.core.vault import Vault, VaultError
23
+ from carnaval.stages.s7_reinject import reinject_string
24
+
25
# Repository root (4 levels above this file: cli/ -> carnaval/ -> src/ -> repo).
# NOTE(review): this assumes a source checkout with a src/ layout. The wheel
# ships this module as carnaval/cli/reinject.py, so once installed parents[3]
# presumably resolves above site-packages -- confirm the intended default root.
_REPO_ROOT = Path(__file__).resolve().parents[3]

# Fallback vault password used by main() when CARNAVAL_VAULT_PASSWORD is unset
# or empty; main() prints a warning on stderr whenever it is in effect.
DEMO_PASSWORD = "demo_password_change_me_in_prod"
29
+
30
+
31
def main() -> int:
    """Run the ``carnaval-reinject`` command-line entry point.

    Loads ``.env`` from ``_REPO_ROOT``, parses the command line, opens the
    encrypted vault, passes the input file's text through
    ``reinject_string`` and writes the result to the output path.

    Returns:
        0 on success, 1 when the vault could not be loaded (``VaultError``),
        2 when the input file or the vault file does not exist.
    """
    load_dotenv(_REPO_ROOT / ".env")

    cli = argparse.ArgumentParser(
        prog="carnaval-reinject",
        description=(
            "Restaure les valeurs originales dans un JSON ou XML produit par "
            "un LLM downstream. Auto-detection du format."
        ),
    )
    cli.add_argument(
        "input",
        type=Path,
        help="Fichier JSON ou XML contenant des placeholders [TYPE_n]",
    )
    cli.add_argument(
        "--vault",
        type=Path,
        required=True,
        help="Chemin du vault chiffre produit par carnaval-anonymize",
    )
    cli.add_argument(
        "--output",
        type=Path,
        default=None,
        help="Fichier de sortie (defaut : <input>_final.<ext>)",
    )
    opts = cli.parse_args()

    # An unset *or empty* environment variable falls back to the demo password.
    secret = os.environ.get("CARNAVAL_VAULT_PASSWORD") or DEMO_PASSWORD
    if secret == DEMO_PASSWORD:
        print(
            "[AVERTISSEMENT] CARNAVAL_VAULT_PASSWORD non defini : demo utilise.",
            file=sys.stderr,
        )

    # Both paths must exist; the input file is checked first.
    if not opts.input.exists():
        print(f"[ERREUR] Fichier introuvable : {opts.input}", file=sys.stderr)
        return 2
    if not opts.vault.exists():
        print(f"[ERREUR] Vault introuvable : {opts.vault}", file=sys.stderr)
        return 2

    # Open the vault; only VaultError is translated into an exit status.
    vault = Vault(password=secret, path=opts.vault)
    try:
        vault.load()
    except VaultError as err:
        print(f"[ERREUR] Vault : {err}", file=sys.stderr)
        return 1

    # Restore the original values in the file's text.
    restored = reinject_string(opts.input.read_text(encoding="utf-8"), vault)

    # Default destination: next to the input, "<stem>_final<suffix>".
    destination = opts.output
    if destination is None:
        source = opts.input
        destination = source.parent / f"{source.stem}_final{source.suffix}"

    destination.write_text(restored, encoding="utf-8")
    print(f"[OK] Fichier restitue : {destination}")
    return 0
95
+
96
+
97
+ if __name__ == "__main__":
98
+ sys.exit(main())
@@ -0,0 +1,3 @@
1
+ # Copyright 2026 Patrice AUBERT
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Primitives partagees par tous les etages."""
@@ -0,0 +1,215 @@
1
+ # Copyright 2026 Patrice AUBERT
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Chargement de la configuration en couches.
4
+
5
+ Strategie de merge :
6
+ 1. base -> config/pipeline.yaml + sous-fichiers config/*/*.yaml
7
+ 2. profil -> profiles/<type>/profile.yaml + sous-fichiers
8
+ 3. prive -> profiles_private/<custom>/profile.yaml + sous-fichiers (optionnel)
9
+
10
+ Les listes sont CONCATENEES (deny_lists, allow_lists), les dicts sont MERGES
11
+ profondement. Les scalaires de la couche superieure ecrasent la couche
12
+ inferieure.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass, field
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ import yaml
22
+
23
+
24
@dataclass
class Config:
    """Application configuration resolved from merged layers.

    The read-only properties expose the top-level sections of ``raw``; each
    one returns an empty dict when its section is absent.
    """

    # Merged configuration tree.
    raw: dict[str, Any] = field(default_factory=dict)
    # Names of the layers that were merged to produce ``raw``.
    layers: list[str] = field(default_factory=list)

    def _section(self, name: str) -> Any:
        """Return the top-level section ``name`` of ``raw``, or ``{}``."""
        return self.raw.get(name, {})

    @property
    def pipeline(self) -> dict[str, Any]:
        return self._section("pipeline")

    @property
    def patterns(self) -> dict[str, Any]:
        return self._section("patterns")

    @property
    def deny_lists(self) -> dict[str, list[str]]:
        return self._section("deny_lists")

    @property
    def allow_lists(self) -> dict[str, list[str]]:
        return self._section("allow_lists")

    @property
    def policies(self) -> dict[str, Any]:
        return self._section("policies")

    @property
    def ai_models(self) -> dict[str, Any]:
        return self._section("ai_models")

    def get(self, dotted_key: str, default: Any = None) -> Any:
        """Look up a value by dotted path.

        Example: ``cfg.get('policies.priority_rules.DenylistRecognizer')``.

        Args:
            dotted_key: path segments joined by ``.``.
            default: value returned when any segment cannot be resolved.

        Returns:
            The value found at the path, or ``default`` when a segment is
            missing or an intermediate value is not a dict.
        """
        cursor: Any = self.raw
        for segment in dotted_key.split("."):
            if isinstance(cursor, dict) and segment in cursor:
                cursor = cursor[segment]
            else:
                return default
        return cursor
65
+
66
+
67
+ # ----------------------------------------------------------------------
68
+ # Merge utilities
69
+ # ----------------------------------------------------------------------
70
+
71
+
72
def _deep_merge(base: dict[str, Any], overlay: dict[str, Any]) -> dict[str, Any]:
    """Return a new dict combining ``base`` with ``overlay``.

    Rules applied per key:
    - dict + dict -> merged key by key, recursively
    - list + list -> concatenated (no de-duplication; that is left to the caller)
    - anything else, including mixed types -> the overlay value wins

    Args:
        base: lower-priority mapping; its top level is copied, not mutated.
        overlay: higher-priority mapping.

    Returns:
        The merged mapping.
    """
    merged = dict(base)
    for name, incoming in overlay.items():
        if name not in merged:
            merged[name] = incoming
            continue

        current = merged[name]
        if isinstance(current, dict) and isinstance(incoming, dict):
            merged[name] = _deep_merge(current, incoming)
        elif isinstance(current, list) and isinstance(incoming, list):
            merged[name] = current + incoming
        else:
            merged[name] = incoming
    return merged
94
+
95
+
96
def _load_yaml(path: Path) -> dict[str, Any]:
    """Load a YAML file and return its top-level mapping.

    Args:
        path: location of the YAML file.

    Returns:
        The parsed mapping, or ``{}`` when ``path`` is not a regular file,
        when the file is empty, or when its top-level value is not a mapping.
    """
    # is_file() rather than exists(): a directory matched by a "*.yaml" glob
    # would otherwise reach open() and raise IsADirectoryError.
    if not path.is_file():
        return {}
    with path.open(encoding="utf-8") as handle:
        loaded = yaml.safe_load(handle)
    return loaded if isinstance(loaded, dict) else {}
103
+
104
+
105
def _load_directory_layer(dir_path: Path) -> dict[str, Any]:
    """Load every YAML file of a layer directory into one nested dict.

    For a layout such as:
        layer/
            pipeline.yaml
            patterns/
                fiscal_fr.yaml       -> patterns.fiscal_fr.*
            deny_lists/
                organizations.yaml   -> deny_lists.organizations.*

    each file's content is merged (via ``_deep_merge``) under the namespace
    derived from its location. Directories are visited two levels deep at
    most: ``layer/<sub>/*.yaml`` and ``layer/<sub>/<subsub>/*.yaml``.

    Args:
        dir_path: root directory of the layer.

    Returns:
        The layer as a nested dict, or ``{}`` when ``dir_path`` is missing or
        is not a directory.
    """
    if not (dir_path.exists() and dir_path.is_dir()):
        return {}

    layer: dict[str, Any] = {}

    # 1. YAML files sitting directly in the layer root (e.g. pipeline.yaml).
    for yaml_file in sorted(dir_path.glob("*.yaml")):
        name = yaml_file.stem
        content = _load_yaml(yaml_file)
        # A file may wrap its content in a root key named after the file;
        # in that case only the wrapped dict is merged.
        wrapped = name in content and isinstance(content[name], dict)
        if wrapped:
            layer[name] = _deep_merge(layer.get(name, {}), content[name])
        elif isinstance(content, dict):
            layer[name] = _deep_merge(layer.get(name, {}), content)
        else:
            layer[name] = content

    # 2. Sub-directories (patterns/, deny_lists/, allow_lists/, policies/, ...).
    for section_dir in sorted(e for e in dir_path.iterdir() if e.is_dir()):
        # 2a. Files directly inside the sub-directory, keyed by file stem.
        section: dict[str, Any] = {
            yaml_file.stem: _load_yaml(yaml_file)
            for yaml_file in sorted(section_dir.glob("*.yaml"))
        }

        # 2b. One more level (e.g. deny_lists/places/fr.yaml), giving
        #     deny_lists.places = {fr: ..., de: ..., ...}.
        for nested_dir in sorted(e for e in section_dir.iterdir() if e.is_dir()):
            nested: dict[str, Any] = {
                yaml_file.stem: _load_yaml(yaml_file)
                for yaml_file in sorted(nested_dir.glob("*.yaml"))
            }
            if nested:
                section[nested_dir.name] = _deep_merge(
                    section.get(nested_dir.name, {}), nested
                )

        if section:
            layer[section_dir.name] = _deep_merge(
                layer.get(section_dir.name, {}), section
            )

    return layer
157
+
158
+
159
+ # ----------------------------------------------------------------------
160
+ # API publique
161
+ # ----------------------------------------------------------------------
162
+
163
+
164
def load_config(
    base_dir: Path | str | None = None,
    profile: str | None = None,
    private_profile: str | None = None,
    repo_root: Path | str | None = None,
) -> Config:
    """Load the configuration as a cascade: base -> profile -> private profile.

    Args:
        base_dir: path of the ``config/`` directory (default: ``<repo>/config``).
        profile: name of the public profile to apply (e.g. 'acknowledge'),
            looked up under ``<repo>/profiles/``.
        private_profile: name of the private profile, looked up under
            ``<repo>/profiles_private/``.
        repo_root: repository root; when omitted it is derived from this
            file's location (``parents[3]``).

    Returns:
        The resolved ``Config``, with ``layers`` listing what was merged.

    Raises:
        FileNotFoundError: a requested profile directory yielded no content.
    """
    if repo_root is None:
        root = Path(__file__).resolve().parents[3]
    else:
        root = Path(repo_root)

    base_path = Path(base_dir) if base_dir else root / "config"

    applied: list[str] = []
    merged: dict[str, Any] = {}

    # Layer 1: base. An empty or missing base directory is silently skipped.
    base_layer = _load_directory_layer(base_path)
    if base_layer:
        merged = _deep_merge(merged, base_layer)
        applied.append(f"base:{base_path}")

    # Layers 2 and 3: public profile, then private profile. Each is optional,
    # but once requested it must exist and contain something.
    overlays = (
        ("profile", profile, "profiles", "Profil introuvable"),
        ("private", private_profile, "profiles_private", "Profil prive introuvable"),
    )
    for tag, name, folder, not_found in overlays:
        if not name:
            continue
        layer_dir = root / folder / name
        layer = _load_directory_layer(layer_dir)
        if not layer:
            raise FileNotFoundError(f"{not_found} : {layer_dir}")
        merged = _deep_merge(merged, layer)
        applied.append(f"{tag}:{name}")

    return Config(raw=merged, layers=applied)
@@ -0,0 +1,52 @@
1
+ # Copyright 2026 Patrice AUBERT
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Detection de langue via lingua-language-detector.
4
+
5
+ Wrapper minimal. Renvoie un code ISO 'fr'/'en'/'de'/'ja' ou 'unknown'.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Optional
11
+
12
+ from lingua import Language, LanguageDetectorBuilder
13
+
14
# Languages the lingua detector is built with (see _get_detector).
_SUPPORTED = (Language.FRENCH, Language.ENGLISH, Language.GERMAN, Language.JAPANESE)
# Module-level cache for the detector; stays None until _get_detector() builds it.
_DETECTOR: Optional[object] = None
16
+
17
+
18
def _get_detector():
    """Return the shared language detector, building it on first use.

    The detector is built from ``_SUPPORTED`` with preloaded language models
    and cached in the module-level ``_DETECTOR``.
    """
    global _DETECTOR
    if _DETECTOR is not None:
        return _DETECTOR

    builder = LanguageDetectorBuilder.from_languages(*_SUPPORTED)
    _DETECTOR = builder.with_preloaded_language_models().build()
    return _DETECTOR
28
+
29
+
30
def detect_language(text: str) -> str:
    """Detect the language of a text.

    Only the first 5000 characters are handed to the detector.

    Args:
        text: textual content to analyse.

    Returns:
        'fr', 'en', 'de', 'ja', or 'unknown' when the text is empty or blank,
        or when the detector gives no answer.
    """
    if not (text and text.strip()):
        return "unknown"

    detected = _get_detector().detect_language_of(text[:5000])
    iso_codes = {
        Language.FRENCH: "fr",
        Language.ENGLISH: "en",
        Language.GERMAN: "de",
        Language.JAPANESE: "ja",
    }
    # A None result is not a key of the table, so it also maps to "unknown".
    return iso_codes.get(detected, "unknown")
@@ -0,0 +1,84 @@
1
+ # Copyright 2026 Patrice AUBERT
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Logging structure avec garde-fou anti-fuite.
4
+
5
+ Principe : un logger structlog standard, plus un filtre qui interdit toute
6
+ journalisation contenant des cles a haut risque (`original`, `raw_text`, `mapping`).
7
+ Si une telle cle apparait, sa valeur est remplacee par `<REDACTED>` avant emission.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import sys
14
+ from typing import Any
15
+
16
+ import structlog
17
+
18
# Keys whose value must NEVER appear in the logs.
# _redact_sensitive compares the lower-cased event key against this set,
# so entries must be written in lower case.
SENSITIVE_KEYS = frozenset(
    {
        "original",
        "raw_text",
        "raw",
        "text",
        "mapping",
        "vault",
        "vault_contents",
        "password",
        "secret",
        "forward",
        "backward",
    }
)
34
+
35
+
36
def _redact_sensitive(
    logger, method_name, event_dict: dict[str, Any]
) -> dict[str, Any]:
    """structlog processor: replace sensitive values with ``<REDACTED>``.

    Only the top-level keys of ``event_dict`` are inspected; a key matches
    when its lower-cased form is in ``SENSITIVE_KEYS``. The dict is modified
    in place and returned.

    Args:
        logger: unused (structlog processor signature).
        method_name: unused (structlog processor signature).
        event_dict: the event being logged.

    Returns:
        The same ``event_dict`` object, with matching values overwritten.
    """
    flagged = [name for name in event_dict if name.lower() in SENSITIVE_KEYS]
    for name in flagged:
        event_dict[name] = "<REDACTED>"
    return event_dict
44
+
45
+
46
def configure_logging(level: str = "INFO", json_format: bool = True) -> None:
    """Configure the standard ``logging`` module and structlog globally.

    Args:
        level: 'DEBUG', 'INFO', 'WARNING' or 'ERROR'. A name that is not an
            attribute of ``logging`` falls back to ``logging.INFO``.
        json_format: True -> JSON output. False -> readable console output.
    """
    threshold = getattr(logging, level.upper(), logging.INFO)

    logging.basicConfig(format="%(message)s", stream=sys.stdout, level=threshold)

    if json_format:
        renderer: Any = structlog.processors.JSONRenderer()
    else:
        renderer = structlog.dev.ConsoleRenderer()

    chain: list[Any] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        # Anti-leak guard: must run before the renderer serialises the event.
        _redact_sensitive,
        renderer,
    ]

    structlog.configure(
        processors=chain,
        wrapper_class=structlog.make_filtering_bound_logger(threshold),
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
    )
80
+
81
+
82
def get_logger(name: str = "carnaval") -> structlog.BoundLogger:
    """Return a structured logger for ``name``.

    ``configure_logging()`` is expected to have been called beforehand.
    """
    bound = structlog.get_logger(name)
    return bound