cezam-lib 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cezam_lib-0.1.0/.coverage +0 -0
- cezam_lib-0.1.0/.gitignore +7 -0
- cezam_lib-0.1.0/.gitlab-ci.yml +67 -0
- cezam_lib-0.1.0/PKG-INFO +299 -0
- cezam_lib-0.1.0/README.md +280 -0
- cezam_lib-0.1.0/pyproject.toml +50 -0
- cezam_lib-0.1.0/src/cezam_lib/__init__.py +2 -0
- cezam_lib-0.1.0/src/cezam_lib/cezam_shared/__init__.py +47 -0
- cezam_lib-0.1.0/src/cezam_lib/cezam_shared/datalake_client.py +265 -0
- cezam_lib-0.1.0/src/cezam_lib/cezam_shared/datalake_paths.py +72 -0
- cezam_lib-0.1.0/src/cezam_lib/cezam_shared/exceptions.py +25 -0
- cezam_lib-0.1.0/src/cezam_lib/cezam_shared/minio_client.py +154 -0
- cezam_lib-0.1.0/src/cezam_lib/cezam_shared/otel.py +72 -0
- cezam_lib-0.1.0/src/cezam_lib/cezam_shared/rabbitmq.py +297 -0
- cezam_lib-0.1.0/src/cezam_lib/cezam_shared/source_client.py +170 -0
- cezam_lib-0.1.0/src/cezam_lib/pipeline_template/__init__.py +14 -0
- cezam_lib-0.1.0/src/cezam_lib/pipeline_template/base_pipeline.py +299 -0
- cezam_lib-0.1.0/src/cezam_lib/pipeline_template/extractor.py +32 -0
- cezam_lib-0.1.0/src/cezam_lib/pipeline_template/messages.py +53 -0
- cezam_lib-0.1.0/tests/__init__.py +1 -0
- cezam_lib-0.1.0/tests/test_base_pipeline_properties.py +520 -0
- cezam_lib-0.1.0/tests/test_minio_client.py +350 -0
- cezam_lib-0.1.0/tests/test_minio_client_properties.py +405 -0
- cezam_lib-0.1.0/tests/test_otel.py +176 -0
- cezam_lib-0.1.0/tests/test_package_structure.py +141 -0
- cezam_lib-0.1.0/tests/test_pipeline_config.py +406 -0
- cezam_lib-0.1.0/tests/test_rabbitmq_consumer.py +367 -0
- cezam_lib-0.1.0/tests/test_rabbitmq_properties.py +382 -0
- cezam_lib-0.1.0/tests/test_rabbitmq_publisher.py +241 -0
- cezam_lib-0.1.0/uv.lock +1105 -0
cezam_lib-0.1.0/.coverage
Binary file
cezam_lib-0.1.0/.gitlab-ci.yml
ADDED

@@ -0,0 +1,67 @@
image: python:3.11-slim

stages:
  - lint
  - test
  - build
  - publish

variables:
  UV_CACHE_DIR: .uv-cache

cache:
  key: "${CI_COMMIT_REF_SLUG}"
  paths:
    - .venv/
    - .uv-cache/

before_script:
  - pip install uv
  - uv sync

lint:
  stage: lint
  script:
    - uv run ruff check src/ tests/
  rules:
    - if: $CI_COMMIT_BRANCH =~ /^(feature|hotfix)\//
    - if: $CI_COMMIT_BRANCH == "develop"
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_COMMIT_BRANCH =~ /^release\//
    - if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/
    - if: $CI_MERGE_REQUEST_IID

test:
  stage: test
  script:
    - uv run pytest --cov=cezam_lib tests/
  rules:
    - if: $CI_COMMIT_BRANCH =~ /^(feature|hotfix)\//
    - if: $CI_COMMIT_BRANCH == "develop"
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_COMMIT_BRANCH =~ /^release\//
    - if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/
    - if: $CI_MERGE_REQUEST_IID

build:
  stage: build
  script:
    - uv build
  artifacts:
    paths:
      - dist/
    expire_in: 1 week
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_COMMIT_BRANCH == "develop"
    - if: $CI_COMMIT_BRANCH =~ /^release\//
    - if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/

publish:
  stage: publish
  script:
    - uv publish --token $PYPI_TOKEN
  dependencies:
    - build
  rules:
    - if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/
cezam_lib-0.1.0/PKG-INFO
ADDED

@@ -0,0 +1,299 @@
Metadata-Version: 2.4
Name: cezam-lib
Version: 0.1.0
Summary: Shared library for the CEZAM microservices
Project-URL: Repository, https://gitlab.com/cezamdev/cezam-lib
License-Expression: MIT
Keywords: cezam,microservices,pipeline
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Requires-Python: >=3.11
Requires-Dist: minio>=7.2.0
Requires-Dist: opentelemetry-api>=1.20.0
Requires-Dist: opentelemetry-exporter-otlp>=1.20.0
Requires-Dist: opentelemetry-instrumentation>=0.41b0
Requires-Dist: opentelemetry-sdk>=1.20.0
Requires-Dist: pika>=1.3.0
Requires-Dist: pydantic>=2.0.0
Description-Content-Type: text/markdown

# cezam-lib

Shared library for the CEZAM microservices. This package groups two sub-packages under a single `cezam_lib` namespace:

- **`cezam_shared`** — Infrastructure clients (MinIO, S3, RabbitMQ), OpenTelemetry configuration, and shared exceptions
- **`pipeline_template`** — Base classes for building specialized extraction pipelines

> Python >= 3.11 required

## Installation

```bash
# With uv (recommended)
uv add cezam-lib

# With pip
pip install cezam-lib
```

## Package structure

```
cezam_lib/
├── __init__.py                  # __version__, __all__
├── cezam_shared/
│   ├── __init__.py              # Public exports
│   ├── minio_client.py          # MinIOClient
│   ├── datalake_client.py       # DatalakeClient
│   ├── source_client.py         # SourceClient
│   ├── datalake_paths.py        # Normalized path helpers
│   ├── rabbitmq.py              # RabbitMQPublisher, RabbitMQConsumer
│   ├── otel.py                  # setup_otel, inject/extract_trace_context
│   └── exceptions.py            # MinIOError, RabbitMQError, etc.
└── pipeline_template/
    ├── __init__.py              # Public exports
    ├── base_pipeline.py         # BasePipeline
    ├── extractor.py             # DataExtractor (ABC)
    └── messages.py              # PipelineMessage, FusionMessage
```

Imports:

```python
from cezam_lib.cezam_shared import MinIOClient, DatalakeClient, SourceClient
from cezam_lib.cezam_shared import RabbitMQPublisher, RabbitMQConsumer
from cezam_lib.cezam_shared import datalake_paths
from cezam_lib.cezam_shared import setup_otel

from cezam_lib.pipeline_template import BasePipeline, DataExtractor
from cezam_lib.pipeline_template import PipelineMessage, FusionMessage
```

## `cezam_shared` components

### MinIOClient

Legacy client for JSON operations on MinIO.

```python
from cezam_lib.cezam_shared import MinIOClient

client = MinIOClient(
    endpoint="localhost:9000",
    access_key="minioadmin",
    secret_key="minioadmin",
    bucket="my-bucket",
)

client.put_json("path/to/doc.json", {"key": "value"})
data = client.get_json("path/to/doc.json")
exists = client.exists("path/to/doc.json")
files = client.list_prefix("path/to/")
```

### DatalakeClient

S3 client for the datalake bucket with automatic per-environment prefixing (read/write).

```python
from cezam_lib.cezam_shared import DatalakeClient

client = DatalakeClient(
    endpoint="s3.sbg.io.cloud.ovh.net",
    access_key="key",
    secret_key="secret",
    bucket="datalake",
    env_prefix="prod",
    secure=True,
)

client.put_json("sim123/ocr/doc.json", {"text": "..."})
data = client.get_json("sim123/ocr/doc.json")
client.put_bytes("sim123/pages/page1.png", png_bytes)
raw = client.get_bytes("sim123/pages/page1.png")
```

### SourceClient

Read-only S3 client for the production source bucket.

```python
from cezam_lib.cezam_shared import SourceClient

client = SourceClient(
    endpoint="s3.eu-west-par.io.cloud.ovh.net",
    access_key="key",
    secret_key="secret",
    bucket="source",
    secure=True,
)

data = client.get_json("path/to/doc.json")
raw = client.get_bytes("path/to/file.pdf")
client.download_file("path/to/file.pdf", local_path)
```

### datalake_paths

Pure functions that build normalized datalake paths. Environment prefixing is handled by `DatalakeClient`.

```python
from cezam_lib.cezam_shared import datalake_paths

path = datalake_paths.original_path("sim123", "doc.pdf")
# → "sim123/original/doc.pdf"

path = datalake_paths.ocr_path("sim123", "doc.json")
# → "sim123/ocr/doc.json"

path = datalake_paths.pipeline_result_path("sim123", "ddp", "result.json")
# → "sim123/ddp/sim123/result.json"
```

### RabbitMQPublisher

RabbitMQ publisher with automatic OpenTelemetry context propagation.

```python
from cezam_lib.cezam_shared import RabbitMQPublisher

with RabbitMQPublisher(
    host="localhost", port=5672, user="guest", password="guest"
) as publisher:
    publisher.publish(
        exchange="",
        routing_key="my_queue",
        message={"simulation_id": "sim123", "status": "ready"},
    )
```

### RabbitMQConsumer

RabbitMQ consumer with automatic ack/nack handling and OTel context propagation.

- Callback succeeds → automatic ack
- `RetryableError` → nack with requeue
- `NonRetryableError` or any other exception → nack without requeue

```python
from cezam_lib.cezam_shared import RabbitMQConsumer

def handle_message(message: dict) -> None:
    print(f"Received: {message}")

with RabbitMQConsumer(
    host="localhost", port=5672, user="guest", password="guest"
) as consumer:
    consumer.consume(queue="my_queue", callback=handle_message)
```

## `pipeline_template` components

### BasePipeline

Abstract base class for specialized extraction pipelines. It handles the full flow:

1. Parse the incoming `PipelineMessage`
2. Read the OCR data from the datalake
3. Call the specialized extractor
4. Write the result to the datalake
5. Publish a `FusionMessage` to the fusion queue

```python
from cezam_lib.pipeline_template import BasePipeline, DataExtractor
from pydantic import BaseModel


class MyResult(BaseModel):
    status: str
    confidence: float
    field_count: int


class MyExtractor(DataExtractor[MyResult]):
    def extract(self, ocr_data: dict) -> MyResult:
        return MyResult(status="SUCCESS", confidence=0.95, field_count=10)


# datalake_client, publisher and consumer are instances created as in the
# cezam_shared examples above.
pipeline = BasePipeline(
    datalake_client=datalake_client,
    publisher=publisher,
    consumer=consumer,
    extractor=MyExtractor(),
    queue_name="my_pipeline",
    pipeline_name="my_pipeline",
)
pipeline.run()
```

### DataExtractor

Generic abstract interface for extracting data from OCR output. Concrete extractors inherit from `DataExtractor[T]` and implement `extract()`.

### PipelineMessage / FusionMessage

Pydantic models for inter-pipeline communication (a construction sketch follows the list):

- `PipelineMessage` — Message received from the Doc Classifier (simulation_id, doc_name, document_type, ocr_json_path, etc.)
- `FusionMessage` — Message sent to the Fusion queue with the extraction metrics (status, quality, action, confidence_avg, etc.)

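A minimal construction sketch, assuming the fields listed above are plain string and float attributes; the authoritative field set (and any additional required fields) lives in `pipeline_template/messages.py`:

```python
from cezam_lib.pipeline_template import FusionMessage, PipelineMessage

# Field names come from the descriptions above; the values and exact types
# are illustrative assumptions.
incoming = PipelineMessage(
    simulation_id="sim123",
    doc_name="doc.pdf",
    document_type="ddp",
    ocr_json_path="sim123/ocr/doc.json",
)

outgoing = FusionMessage(
    status="SUCCESS",
    quality="HIGH",        # assumed enum-like string
    action="NONE",         # assumed enum-like string
    confidence_avg=0.95,
)
```
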
## Exceptions

```python
from cezam_lib.cezam_shared import (
    MinIOError,
    RabbitMQError,
    RetryableError,
    NonRetryableError,
)
```

| Exception | Usage |
|-----------|-------|
| `MinIOError` | Error during an S3/MinIO operation |
| `RabbitMQError` | Error during a RabbitMQ operation |
| `RetryableError` | Transient error; the message will be requeued |
| `NonRetryableError` | Permanent error; the message is rejected |

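As a usage sketch (an assumption, not taken from the library itself), a consumer callback drives the ack/nack behaviour described for `RabbitMQConsumer` by raising these exceptions; `process` is a hypothetical business function, and the exceptions are assumed to accept a plain message string:

```python
from cezam_lib.cezam_shared import MinIOError, NonRetryableError, RetryableError


def handle_message(message: dict) -> None:
    if "simulation_id" not in message:
        # Unusable message: reject it permanently (nack without requeue).
        raise NonRetryableError("message is missing 'simulation_id'")
    try:
        process(message)  # hypothetical business logic
    except MinIOError as exc:
        # Transient infrastructure failure: ask the consumer to requeue the message.
        raise RetryableError("datalake temporarily unavailable") from exc
```
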
## OpenTelemetry configuration

```python
from cezam_lib.cezam_shared import (
    setup_otel,
    inject_trace_context,
    extract_trace_context,
)

# Initialize OTel for a service
tracer, meter = setup_otel(
    service_name="doc_classifier",
    otel_endpoint="localhost:4317",
)

# Propagate the trace context into outgoing headers
headers = {}
inject_trace_context(headers)

# Extract and activate the context from incoming headers
extract_trace_context(incoming_headers)
```

## Local development

```bash
# Install dependencies (including dev)
uv sync

# Run the tests
uv run pytest

# Tests with coverage
uv run pytest --cov=cezam_lib tests/

# Linting
uv run ruff check src/ tests/
```

## License

MIT

cezam_lib-0.1.0/README.md
ADDED

@@ -0,0 +1,280 @@
# cezam-lib

Shared library for the CEZAM microservices. This package groups two sub-packages under a single `cezam_lib` namespace:

- **`cezam_shared`** — Infrastructure clients (MinIO, S3, RabbitMQ), OpenTelemetry configuration, and shared exceptions
- **`pipeline_template`** — Base classes for building specialized extraction pipelines

> Python >= 3.11 required

## Installation

```bash
# With uv (recommended)
uv add cezam-lib

# With pip
pip install cezam-lib
```

## Package structure

```
cezam_lib/
├── __init__.py                  # __version__, __all__
├── cezam_shared/
│   ├── __init__.py              # Public exports
│   ├── minio_client.py          # MinIOClient
│   ├── datalake_client.py       # DatalakeClient
│   ├── source_client.py         # SourceClient
│   ├── datalake_paths.py        # Normalized path helpers
│   ├── rabbitmq.py              # RabbitMQPublisher, RabbitMQConsumer
│   ├── otel.py                  # setup_otel, inject/extract_trace_context
│   └── exceptions.py            # MinIOError, RabbitMQError, etc.
└── pipeline_template/
    ├── __init__.py              # Public exports
    ├── base_pipeline.py         # BasePipeline
    ├── extractor.py             # DataExtractor (ABC)
    └── messages.py              # PipelineMessage, FusionMessage
```

Imports:

```python
from cezam_lib.cezam_shared import MinIOClient, DatalakeClient, SourceClient
from cezam_lib.cezam_shared import RabbitMQPublisher, RabbitMQConsumer
from cezam_lib.cezam_shared import datalake_paths
from cezam_lib.cezam_shared import setup_otel

from cezam_lib.pipeline_template import BasePipeline, DataExtractor
from cezam_lib.pipeline_template import PipelineMessage, FusionMessage
```

## `cezam_shared` components

### MinIOClient

Legacy client for JSON operations on MinIO.

```python
from cezam_lib.cezam_shared import MinIOClient

client = MinIOClient(
    endpoint="localhost:9000",
    access_key="minioadmin",
    secret_key="minioadmin",
    bucket="my-bucket",
)

client.put_json("path/to/doc.json", {"key": "value"})
data = client.get_json("path/to/doc.json")
exists = client.exists("path/to/doc.json")
files = client.list_prefix("path/to/")
```

### DatalakeClient

S3 client for the datalake bucket with automatic per-environment prefixing (read/write).

```python
from cezam_lib.cezam_shared import DatalakeClient

client = DatalakeClient(
    endpoint="s3.sbg.io.cloud.ovh.net",
    access_key="key",
    secret_key="secret",
    bucket="datalake",
    env_prefix="prod",
    secure=True,
)

client.put_json("sim123/ocr/doc.json", {"text": "..."})
data = client.get_json("sim123/ocr/doc.json")
client.put_bytes("sim123/pages/page1.png", png_bytes)
raw = client.get_bytes("sim123/pages/page1.png")
```

### SourceClient

Read-only S3 client for the production source bucket.

```python
from cezam_lib.cezam_shared import SourceClient

client = SourceClient(
    endpoint="s3.eu-west-par.io.cloud.ovh.net",
    access_key="key",
    secret_key="secret",
    bucket="source",
    secure=True,
)

data = client.get_json("path/to/doc.json")
raw = client.get_bytes("path/to/file.pdf")
client.download_file("path/to/file.pdf", local_path)
```

### datalake_paths

Pure functions that build normalized datalake paths. Environment prefixing is handled by `DatalakeClient`.

```python
from cezam_lib.cezam_shared import datalake_paths

path = datalake_paths.original_path("sim123", "doc.pdf")
# → "sim123/original/doc.pdf"

path = datalake_paths.ocr_path("sim123", "doc.json")
# → "sim123/ocr/doc.json"

path = datalake_paths.pipeline_result_path("sim123", "ddp", "result.json")
# → "sim123/ddp/sim123/result.json"
```

### RabbitMQPublisher

RabbitMQ publisher with automatic OpenTelemetry context propagation.

```python
from cezam_lib.cezam_shared import RabbitMQPublisher

with RabbitMQPublisher(
    host="localhost", port=5672, user="guest", password="guest"
) as publisher:
    publisher.publish(
        exchange="",
        routing_key="my_queue",
        message={"simulation_id": "sim123", "status": "ready"},
    )
```

### RabbitMQConsumer

RabbitMQ consumer with automatic ack/nack handling and OTel context propagation.

- Callback succeeds → automatic ack
- `RetryableError` → nack with requeue
- `NonRetryableError` or any other exception → nack without requeue

```python
from cezam_lib.cezam_shared import RabbitMQConsumer

def handle_message(message: dict) -> None:
    print(f"Received: {message}")

with RabbitMQConsumer(
    host="localhost", port=5672, user="guest", password="guest"
) as consumer:
    consumer.consume(queue="my_queue", callback=handle_message)
```

## `pipeline_template` components

### BasePipeline

Abstract base class for specialized extraction pipelines. It handles the full flow:

1. Parse the incoming `PipelineMessage`
2. Read the OCR data from the datalake
3. Call the specialized extractor
4. Write the result to the datalake
5. Publish a `FusionMessage` to the fusion queue

```python
from cezam_lib.pipeline_template import BasePipeline, DataExtractor
from pydantic import BaseModel


class MyResult(BaseModel):
    status: str
    confidence: float
    field_count: int


class MyExtractor(DataExtractor[MyResult]):
    def extract(self, ocr_data: dict) -> MyResult:
        return MyResult(status="SUCCESS", confidence=0.95, field_count=10)


# datalake_client, publisher and consumer are instances created as in the
# cezam_shared examples above.
pipeline = BasePipeline(
    datalake_client=datalake_client,
    publisher=publisher,
    consumer=consumer,
    extractor=MyExtractor(),
    queue_name="my_pipeline",
    pipeline_name="my_pipeline",
)
pipeline.run()
```

### DataExtractor

Generic abstract interface for extracting data from OCR output. Concrete extractors inherit from `DataExtractor[T]` and implement `extract()`.

### PipelineMessage / FusionMessage

Pydantic models for inter-pipeline communication (a construction sketch follows the list):

- `PipelineMessage` — Message received from the Doc Classifier (simulation_id, doc_name, document_type, ocr_json_path, etc.)
- `FusionMessage` — Message sent to the Fusion queue with the extraction metrics (status, quality, action, confidence_avg, etc.)

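A minimal construction sketch, assuming the fields listed above are plain string and float attributes; the authoritative field set (and any additional required fields) lives in `pipeline_template/messages.py`:

```python
from cezam_lib.pipeline_template import FusionMessage, PipelineMessage

# Field names come from the descriptions above; the values and exact types
# are illustrative assumptions.
incoming = PipelineMessage(
    simulation_id="sim123",
    doc_name="doc.pdf",
    document_type="ddp",
    ocr_json_path="sim123/ocr/doc.json",
)

outgoing = FusionMessage(
    status="SUCCESS",
    quality="HIGH",        # assumed enum-like string
    action="NONE",         # assumed enum-like string
    confidence_avg=0.95,
)
```
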
## Exceptions

```python
from cezam_lib.cezam_shared import (
    MinIOError,
    RabbitMQError,
    RetryableError,
    NonRetryableError,
)
```

| Exception | Usage |
|-----------|-------|
| `MinIOError` | Error during an S3/MinIO operation |
| `RabbitMQError` | Error during a RabbitMQ operation |
| `RetryableError` | Transient error; the message will be requeued |
| `NonRetryableError` | Permanent error; the message is rejected |

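As a usage sketch (an assumption, not taken from the library itself), a consumer callback drives the ack/nack behaviour described for `RabbitMQConsumer` by raising these exceptions; `process` is a hypothetical business function, and the exceptions are assumed to accept a plain message string:

```python
from cezam_lib.cezam_shared import MinIOError, NonRetryableError, RetryableError


def handle_message(message: dict) -> None:
    if "simulation_id" not in message:
        # Unusable message: reject it permanently (nack without requeue).
        raise NonRetryableError("message is missing 'simulation_id'")
    try:
        process(message)  # hypothetical business logic
    except MinIOError as exc:
        # Transient infrastructure failure: ask the consumer to requeue the message.
        raise RetryableError("datalake temporarily unavailable") from exc
```
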
## OpenTelemetry configuration

```python
from cezam_lib.cezam_shared import (
    setup_otel,
    inject_trace_context,
    extract_trace_context,
)

# Initialize OTel for a service
tracer, meter = setup_otel(
    service_name="doc_classifier",
    otel_endpoint="localhost:4317",
)

# Propagate the trace context into outgoing headers
headers = {}
inject_trace_context(headers)

# Extract and activate the context from incoming headers
extract_trace_context(incoming_headers)
```

## Local development

```bash
# Install dependencies (including dev)
uv sync

# Run the tests
uv run pytest

# Tests with coverage
uv run pytest --cov=cezam_lib tests/

# Linting
uv run ruff check src/ tests/
```

## License

MIT

cezam_lib-0.1.0/pyproject.toml
ADDED

@@ -0,0 +1,50 @@
[project]
name = "cezam-lib"
version = "0.1.0"
description = "Shared library for the CEZAM microservices"
readme = "README.md"
license = "MIT"
requires-python = ">=3.11"
keywords = ["cezam", "microservices", "pipeline"]
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.11",
]
dependencies = [
    "minio>=7.2.0",
    "pika>=1.3.0",
    "opentelemetry-api>=1.20.0",
    "opentelemetry-sdk>=1.20.0",
    "opentelemetry-exporter-otlp>=1.20.0",
    "opentelemetry-instrumentation>=0.41b0",
    "pydantic>=2.0.0",
]

[project.urls]
Repository = "https://gitlab.com/cezamdev/cezam-lib"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.sdist]
exclude = [".venv", ".uv-cache"]

[tool.hatch.build.targets.wheel]
packages = ["src/cezam_lib"]

[tool.pytest.ini_options]
testpaths = ["tests"]
pythonpath = ["src"]

[tool.ruff]
src = ["src", "tests"]

[dependency-groups]
dev = [
    "pytest>=8.0.0",
    "pytest-cov>=4.0.0",
    "hypothesis>=6.100.0",
    "ruff",
    "pyyaml>=6.0.0",
]