guardianhub 0.1.88__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. guardianhub/__init__.py +29 -0
  2. guardianhub/_version.py +1 -0
  3. guardianhub/agents/runtime.py +12 -0
  4. guardianhub/auth/token_provider.py +22 -0
  5. guardianhub/clients/__init__.py +2 -0
  6. guardianhub/clients/classification_client.py +52 -0
  7. guardianhub/clients/graph_db_client.py +161 -0
  8. guardianhub/clients/langfuse/dataset_client.py +157 -0
  9. guardianhub/clients/langfuse/manager.py +118 -0
  10. guardianhub/clients/langfuse/prompt_client.py +68 -0
  11. guardianhub/clients/langfuse/score_evaluation_client.py +92 -0
  12. guardianhub/clients/langfuse/tracing_client.py +250 -0
  13. guardianhub/clients/langfuse_client.py +63 -0
  14. guardianhub/clients/llm_client.py +144 -0
  15. guardianhub/clients/llm_service.py +295 -0
  16. guardianhub/clients/metadata_extractor_client.py +53 -0
  17. guardianhub/clients/ocr_client.py +81 -0
  18. guardianhub/clients/paperless_client.py +515 -0
  19. guardianhub/clients/registry_client.py +18 -0
  20. guardianhub/clients/text_cleaner_client.py +58 -0
  21. guardianhub/clients/vector_client.py +344 -0
  22. guardianhub/config/__init__.py +0 -0
  23. guardianhub/config/config_development.json +84 -0
  24. guardianhub/config/config_prod.json +39 -0
  25. guardianhub/config/settings.py +221 -0
  26. guardianhub/http/http_client.py +26 -0
  27. guardianhub/logging/__init__.py +2 -0
  28. guardianhub/logging/logging.py +168 -0
  29. guardianhub/logging/logging_filters.py +35 -0
  30. guardianhub/models/__init__.py +0 -0
  31. guardianhub/models/agent_models.py +153 -0
  32. guardianhub/models/base.py +2 -0
  33. guardianhub/models/registry/client.py +16 -0
  34. guardianhub/models/registry/dynamic_loader.py +73 -0
  35. guardianhub/models/registry/loader.py +37 -0
  36. guardianhub/models/registry/registry.py +17 -0
  37. guardianhub/models/registry/signing.py +70 -0
  38. guardianhub/models/template/__init__.py +0 -0
  39. guardianhub/models/template/agent_plan.py +65 -0
  40. guardianhub/models/template/agent_response_evaluation.py +67 -0
  41. guardianhub/models/template/extraction.py +29 -0
  42. guardianhub/models/template/reflection_critique.py +206 -0
  43. guardianhub/models/template/suggestion.py +42 -0
  44. guardianhub/observability/__init__.py +1 -0
  45. guardianhub/observability/instrumentation.py +271 -0
  46. guardianhub/observability/otel_helper.py +43 -0
  47. guardianhub/observability/otel_middlewares.py +73 -0
  48. guardianhub/prompts/base.py +7 -0
  49. guardianhub/prompts/providers/langfuse_provider.py +13 -0
  50. guardianhub/prompts/providers/local_provider.py +22 -0
  51. guardianhub/prompts/registry.py +14 -0
  52. guardianhub/scripts/script.sh +31 -0
  53. guardianhub/services/base.py +15 -0
  54. guardianhub/template/__init__.py +0 -0
  55. guardianhub/tools/gh_registry_cli.py +171 -0
  56. guardianhub/utils/__init__.py +0 -0
  57. guardianhub/utils/app_state.py +74 -0
  58. guardianhub/utils/fastapi_utils.py +152 -0
  59. guardianhub/utils/json_utils.py +137 -0
  60. guardianhub/utils/metrics.py +60 -0
  61. guardianhub-0.1.88.dist-info/METADATA +240 -0
  62. guardianhub-0.1.88.dist-info/RECORD +64 -0
  63. guardianhub-0.1.88.dist-info/WHEEL +4 -0
  64. guardianhub-0.1.88.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,515 @@
+ """Client implementation for interacting with the Paperless-ngx REST API.
+
+ This client provides granular methods for document upload, processing status polling,
+ and text retrieval, supporting better error handling and custom OCR integration.
+ """
+ import asyncio
+ import time  # <-- Added time import for standard time tracking
+ from typing import Dict, Any, Optional, List
+
+ import httpx
+ from guardianhub import get_logger
+ from guardianhub.config.settings import settings
+ logger = get_logger(__name__)
+
+ # Define a simple exception for task failures
+ class PaperlessTaskFailure(RuntimeError):
+     """Custom exception for Paperless task failures."""
+     pass
+
+ # Paperless field types: string, integer, float, boolean, date, document_link
+ SCHEMA_TYPE_MAP = {
+     "string": "string",
+     "number": "float",  # Use float for Pydantic's 'number' type (covers decimals)
+     "integer": "integer",
+     "boolean": "boolean",
+     "date": "date"  # Assuming a custom type for dates, otherwise use 'string'
+ }
+ class PaperlessClient:
+     """
+     Asynchronous client for Paperless-ngx API interactions.
+
+     This client assumes an API endpoint is available and requires an API token.
+     It maintains a persistent httpx client session for reuse across activity calls.
+     """
+     def __init__(self, base_url: str, poll_interval: int = 5, poll_timeout: int = 300):
+         """
+         Initializes the Paperless client.
+         """
+         self.api_url = base_url.rstrip('/')
+         self.api_token = settings.endpoints.get("PAPERLESS_ACCESS_KEY")
+         self.headers = {
+             "Authorization": f"Token {self.api_token}",
+             "Accept": "application/json",
+         }
+         self.poll_interval = poll_interval
+         self.poll_timeout = poll_timeout
+
+         # Initialize the persistent httpx client here.
+         # DO NOT use it in an 'async with' block in methods, or it will be closed.
+         self.client = httpx.AsyncClient(headers=self.headers, base_url=self.api_url, timeout=self.poll_timeout + 60)
+         # self.client = httpx.AsyncClient(
+         #     auth=("jayant_yantramops", "itapp123"),
+         #     base_url=self.api_url,
+         #     timeout=self.poll_timeout + 60,
+         #     follow_redirects=False
+         # )
+         logger.info("PaperlessClient initialized for URL: %s", self.api_url)
+
+     async def upload_document(self, file_path: str, file_name: str, skip_ocr: bool) -> Optional[str]:
+         """
+         Uploads a single document to Paperless-ngx.
+         """
+         upload_endpoint = "/api/documents/post_document/"
+         logger.info("Attempting to upload file: %s", file_name)
+
+         try:
+             data = {
+                 'title': file_name,
+                 'full_text_search': 'auto'
+                 # 'full_text_search': 'skip' if skip_ocr else 'auto'
+             }
+
+             # Use the persistent client directly without 'async with'
+             with open(file_path, 'rb') as f:
+                 files = {'document': (file_name, f)}
+
+                 response = await self.client.post(upload_endpoint, files=files, data=data)
+
+             response.raise_for_status()
+
+             # Reads the response as text and strips quotes/whitespace (Fix from previous step)
+             initial_doc_id = response.text.strip().strip('"')
+             document_id = str(initial_doc_id)
+
+             if not document_id or document_id.lower() == 'none' or not document_id.strip():
+                 raise RuntimeError("Paperless API did not return a document ID after upload (response was empty or invalid).")
+
+             logger.info("File uploaded. Document ID assigned: %s", document_id)
+             return document_id
+
+         except httpx.HTTPStatusError as e:
+             status_code = e.response.status_code
+             error_msg = f"Paperless HTTP error during upload (Status {status_code}): {e.response.text}"
+             logger.error(error_msg, exc_info=True)
+             raise RuntimeError(error_msg)
+
+         except Exception as e:
+             logger.error("A critical error occurred during Paperless upload: %s", str(e), exc_info=True)
+             raise
+
+     async def wait_for_document_processing(self, task_id: str) -> Dict[str, Any]:
+         """
+         Polls the Paperless API for a document's processing status using the Task ID until it completes.
+
+         Returns the final Document ID on success, or the ID of the existing document if a duplicate is found.
+         """
+         endpoint = f"/api/tasks/?task_id={task_id}"
+         start_time = time.time()
+
+         logger.info("Polling Paperless for task %s status...", task_id)
+
+         while time.time() - start_time < self.poll_timeout:
+             try:
+                 response = await self.client.get(endpoint)
+                 response.raise_for_status()
+                 data: List[Dict[str, Any]] = response.json()
+
+                 if not data:
+                     # Case 1: Task not yet created or found in the system
+                     logger.info("Task %s is not yet visible in the task list. Retrying in %d seconds.",
+                                 task_id, self.poll_interval)
+                     await asyncio.sleep(self.poll_interval)
+                     continue
+
+                 # Case 2: Task found, check status
+                 task_info = data[0]
+                 status = task_info.get('status')
+                 result = task_info.get('result', 'No result message provided.')
+
+                 if status == 'SUCCESS':
+                     document_id = str(task_info.get('related_document'))
+                     if not document_id or document_id == 'None':
+                         raise PaperlessTaskFailure(
+                             f"Task succeeded but returned no 'related_document' ID. Result: {result}")
+
+                     logger.info("Document ingestion successful. Final Document ID: %s. Result: %s",
+                                 document_id, result)
+                     return task_info
+
+                 elif status == 'FAILURE':
+                     logger.error("Paperless-ngx processing failed for task %s: %s", task_id, result)
+
+                     # --- CRITICAL CHANGE: Handle Duplicate Failure (SOFT FAILURE) ---
+                     # If the failure is due to duplication, the task's 'related_document' field should hold the ID of the existing document.
+                     if 'duplicate' in result.lower() and task_info.get('related_document'):
+                         existing_doc_id = str(task_info['related_document'])
+                         logger.warning(
+                             "DUPLICATE DETECTED: Task failed, but document already exists as ID %s. The workflow will proceed using this existing ID.",
+                             existing_doc_id)
+                         # Return the existing document ID instead of raising an error
+                         return task_info
+
+                     # Hard Failure
+                     raise PaperlessTaskFailure(f"Paperless-ngx task failed ({task_id}): {result}")
+
+                 else:
+                     # STARTED, PENDING, UNKNOWN, etc.
+                     logger.info("Task %s status: %s. Retrying in %d seconds.",
+                                 task_id, status, self.poll_interval)
+
+                 await asyncio.sleep(self.poll_interval)
+
+             except httpx.HTTPStatusError as e:
+                 logger.error("HTTP error while polling task %s: %s", task_id, e)
+                 raise
+             # Catch PaperlessTaskFailure and re-raise immediately (not transient)
+             except PaperlessTaskFailure:
+                 raise
+             except Exception as e:
+                 # Catch unexpected JSON errors or network issues (which are transient)
+                 logger.warning("Transient error while polling task %s: %s. Retrying...", task_id, e)
+                 await asyncio.sleep(self.poll_interval)
+
+         raise TimeoutError(f"Paperless-ngx processing timed out after {self.poll_timeout} seconds for task {task_id}")
+
+     async def get_document(self, document_id: int) -> Dict[str, Any]:
+         """
+         Retrieves comprehensive document metadata and content from Paperless-ngx.
+
+         It hits the document detail endpoint (/api/documents/{id}/) to get:
+         - raw_text (from the 'content' field)
+         - classification metadata (title, dates, tags, etc.)
+         """
+         # Use the endpoint that provides full JSON details
+         endpoint = f"/api/documents/{document_id}/"
+
+         try:
+             # Use the persistent client directly (assuming self.client is available)
+             response = await self.client.get(endpoint)
+
+             # --- CRITICAL FIX: Manually check for 3xx status codes which indicate unauthenticated redirect ---
+             if 300 <= response.status_code < 400:
+                 logger.error(
+                     "Authentication required: Received redirect (Status %d) to login page for document %s. Check API token.",
+                     response.status_code, document_id)
+                 # Raising a custom error here since a 302 on this endpoint means authorization failed.
+                 raise RuntimeError(
+                     f"Authentication Failure: Paperless redirected to login page when trying to fetch document {document_id}. Status: {response.status_code}")
+
+             response.raise_for_status()
+
+             doc_data = response.json()
+
+             # Extract classification-relevant fields
+             raw_text = doc_data.get("content", "")
+
+             # Build a metadata dictionary for the MetadataClassifier
+             classification_metadata = {
+                 # Core Textual/Contextual Fields
+                 "title": doc_data.get("title", ""),
+                 "original_file_name": doc_data.get("original_file_name", ""),
+
+                 # Date Fields
+                 "created_date": doc_data.get("created", ""),
+                 "modified_date": doc_data.get("modified", ""),
+
+                 # Structural Fields (New)
+                 "mime_type": doc_data.get("mime_type", ""),  # e.g., 'application/pdf'
+                 "page_count": doc_data.get("page_count", 1),  # Page count is a strong signal for reports vs. receipts
+
+                 # Paperless Relations (using IDs)
+                 "tags": doc_data.get("tags", []),
+                 "document_type_id": doc_data.get("document_type", None),
+                 "correspondent_id": doc_data.get("correspondent", None),  # Strong signal for vendors/banks
+                 "archive_serial_number": doc_data.get("archive_serial_number", None)
+
+                 # Note: We skip 'custom_fields' for now until their schema is defined.
+             }
+
+             if not raw_text:
+                 logger.warning(
+                     "Retrieved text is empty for document %s. This may indicate poor OCR quality or an empty file.",
+                     document_id)
+
+             # Return a dictionary containing all needed classification inputs
+             return {
+                 "document_id": document_id,
+                 "raw_text": raw_text,
+                 "metadata": classification_metadata,
+                 "embedding_id": doc_data.get("embedding_id", None),
+                 # Note: The 'embedding_id' must be managed/added by the calling service (DCM)
+                 # if it's an internal ID, as Paperless does not know about it.
+             }
+
+         except httpx.HTTPStatusError as e:
+             logger.error("HTTP error fetching document %s: %s", document_id, str(e))
+             # Re-raise the exception after logging
+             raise
+         except Exception as e:
+             logger.error("An unexpected error occurred fetching document %s: %s", document_id, str(e))
+             # Re-raise the exception after logging
+             raise
+     async def get_document_text(self, document_id: int) -> str:
+         """
+         Retrieves the full OCR'd text content of a processed document.
+         """
+         # Note: Paperless documentation uses /api/documents/{id}/text/
+         # endpoint = f"/api/documents/{document_id}/text/"
+         # endpoint = f"/api/documents/{document_id}/download/?as_text=1"
+         endpoint = f"/api/documents/{document_id}/"
+
+         try:
+             # Use the persistent client directly
+             response = await self.client.get(endpoint)
+
+             # --- CRITICAL FIX: Manually check for 3xx status codes which indicate unauthenticated redirect ---
+             if 300 <= response.status_code < 400:
+                 logger.error(
+                     "Authentication required: Received redirect (Status %d) to login page for document %s. Check API token.",
+                     response.status_code, document_id)
+                 # Raising a custom error here since a 302 on this endpoint means authorization failed.
+                 raise RuntimeError(
+                     f"Authentication Failure: Paperless redirected to login page when trying to fetch text for document {document_id}. Status: {response.status_code}")
+
+             response.raise_for_status()
+
+             document_text = response.json().get('content')
+             if not document_text:
+                 logger.warning(
+                     "Retrieved text is empty for document %s. This may indicate poor OCR quality or an empty file.",
+                     document_id)
+
+             return document_text
+
+         except httpx.HTTPStatusError as e:
+             logger.error("Failed to retrieve text for document %s: %s", document_id, e)
+             raise RuntimeError(
+                 f"Failed to retrieve text from Paperless for document {document_id}. Status: {e.response.status_code}")
+         except Exception as e:
+             logger.error("Error retrieving text for document %s: %s", document_id, e)
+             raise
+
+     # services/clients/paperless_client.py (Add these methods to the class)
+
+     from typing import Dict, Any, List, Optional
+     import logging
+     # Assume PaperlessClient is initialized with base_url and credentials/token
+
+     logger = logging.getLogger(__name__)
+
+     # --- Helper Mapping for Schema Types to Paperless Types ---
+     # Paperless field types: string, integer, float, boolean, date, document_link
+     SCHEMA_TYPE_MAP = {
+         "string": "string",
+         "number": "float",  # Use float for Pydantic's 'number' type (covers decimals)
+         "integer": "integer",
+         "boolean": "boolean",
+         "date": "date"  # Assuming a custom type for dates, otherwise use 'string'
+     }
+
+     async def _get_all_custom_fields(self) -> List[Dict[str, Any]]:
+         """Internal method to fetch all existing custom fields."""
+         try:
+             response = await self.client.get("/api/custom_fields/")
+             response.raise_for_status()
+             return response.json().get("results", [])
+         except httpx.HTTPStatusError as e:
+             logger.error(f"Failed to fetch custom fields: {e}")
+             return []
+
+     async def ensure_custom_field(self, schema_field_name: str, schema_type: str) -> Optional[int]:
+         """
+         Checks if a custom field exists based on the schema name and creates it if not.
+         Returns the Paperless Custom Field ID.
+         """
+
+         # Standardize the name and map the type
+         paperless_name = schema_field_name.replace('_', ' ').title()  # e.g., 'invoice_total' -> 'Invoice Total'
+         field_type = SCHEMA_TYPE_MAP.get(schema_type, "string")
+
+         existing_fields = await self._get_all_custom_fields()
+
+         # Check if field already exists
+         for field in existing_fields:
+             if field['name'].lower() == paperless_name.lower():
+                 logger.info(f"Custom field '{paperless_name}' already exists (ID: {field['id']}).")
+                 return field['id']
+
+         # If not found, create it
+         creation_payload = {
+             "name": paperless_name,
+             "data_type": field_type,
+             "default_value": None
+         }
+
+         try:
+             response = await self.client.post("/api/custom_fields/", json=creation_payload)
+             response.raise_for_status()
+             new_field = response.json()
+             logger.info(f"Created new Custom Field '{paperless_name}' (ID: {new_field['id']}).")
+             return new_field['id']
+         except httpx.HTTPStatusError as e:
+             logger.error(f"Failed to create Custom Field '{paperless_name}': {e}. Response: {e.response.text}")
+             return None
+
+     # services/clients/paperless_client.py (Updated method)
+
+     # ... (Existing imports and class structure remain) ...
+
+     async def update_document_metadata(
+             self,
+             document_id: int,
+             template_id: str,
+             custom_field_values: Dict[int, Any]
+     ) -> bool:
+         """
+         Updates a document in Paperless using the bulk_edit endpoint for Custom Fields
+         and Tagging.
+
+         Args:
+             document_id: The ID of the document in Paperless.
+             template_id: The ID of the newly registered template.
+             custom_field_values: A dictionary mapping {Paperless_Field_ID: Extracted_Value}.
+
+         Returns:
+             bool: True if all updates were successful, False otherwise.
+         """
+
+         success_status = True
+         bulk_edit_endpoint = "/api/documents/bulk_edit/"
+
+         # --- 1. Prepare Custom Field Payload ---
+         # The 'modify_custom_fields' method requires 'add_custom_fields' payload
+         add_custom_fields_payload: Dict[int, Any] = {}
+         for field_id, value in custom_field_values.items():
+             # Ensure value is JSON serializable/Paperless acceptable (converting if needed)
+             add_custom_fields_payload[field_id] = str(value) if not isinstance(value, (str, int, float, bool,
+                                                                                        type(None))) else value
+
+         if add_custom_fields_payload:
+             custom_field_payload = {
+                 "documents": [document_id],
+                 "method": "modify_custom_fields",
+                 "parameters": {
+                     "add_custom_fields": add_custom_fields_payload,
+                     # We are not removing any fields here
+                     "remove_custom_fields": []
+                 }
+             }
+
+             try:
+                 cf_response = await self.client.post(bulk_edit_endpoint, json=custom_field_payload)
+                 cf_response.raise_for_status()
+                 logger.info(f"Custom fields updated for document {document_id}.")
+             except httpx.HTTPStatusError as e:
+                 logger.error(
+                     f"Failed to apply custom fields via bulk_edit (Doc {document_id}): {e.response.text}")
+                 success_status = False
+
+         # --- 2. Tagging Payload (Associating the Template ID) ---
+         # We must first ensure the tag exists. The simplest way is to assume the Template ID is
+         # the name of a tag, but the API requires the TAG_ID.
+
+         # NOTE: To use add_tag, we need the TAG_ID. We MUST add a step to get/create the tag ID.
+         tag_id = await self._ensure_tag_exists(template_id)  # Assume this helper function exists
+
+         if tag_id and success_status:  # Only proceed if custom fields succeeded
+             tagging_payload = {
+                 "documents": [document_id],
+                 "method": "add_tag",
+                 "parameters": {
+                     "tag": tag_id
+                 }
+             }
+
+             try:
+                 tag_response = await self.client.post(bulk_edit_endpoint, json=tagging_payload)
+                 tag_response.raise_for_status()
+                 logger.info(f"Document {document_id} tagged with Template ID: {template_id}.")
+             except httpx.HTTPStatusError as e:
+                 logger.error(f"Failed to tag document {document_id} via bulk_edit: {e.response.text}")
+                 success_status = False
+
+         # 3. Final Result
+         if success_status:
+             logger.info(f"Document {document_id} successfully updated with metadata and template tag.")
+
+         return success_status
+
+     # --- ASSUMED HELPER METHOD (required for the above logic) ---
+     async def _ensure_tag_exists(self, tag_name: str) -> Optional[int]:
+         """
+         Checks if a tag exists by name and creates it if not. Returns the Paperless Tag ID.
+         (This is needed because bulk_edit requires the ID, not the name.)
+         """
+         # 1. GET /api/tags/?name__iexact=tag_name to check existence
+         # 2. If not found, POST to /api/tags/ {"name": tag_name}
+         # 3. Returns the ID from the existing or new tag.
+         # Note: This complexity is hidden here but must be implemented for the call to work.
+         try:
+             # Example implementation placeholder:
+             response = await self.client.get(f"/api/tags/?name__iexact={tag_name}")
+             if response.status_code == 200 and response.json().get('results'):
+                 return response.json()['results'][0]['id']
+
+             response = await self.client.post("/api/tags/", json={"name": tag_name})
+             response.raise_for_status()
+             return response.json()['id']
+         except Exception as e:
+             logger.error(f"Failed to ensure tag '{tag_name}' exists: {str(e)}")
+             return None
+
+     # services/clients/paperless_client.py (Add this to PaperlessClient class)
+
+     async def _ensure_document_type_exists(self, doc_type_name: str) -> Optional[int]:
+         """
+         Checks if a Paperless Document Type exists and creates it if not. Returns the ID.
+         (This is analogous to the _ensure_tag_exists helper.)
+         """
+         try:
+             # Check existing types
+             check_response = await self.client.get(f"/api/document_types/?name__iexact={doc_type_name}")
+             check_response.raise_for_status()
+
+             results = check_response.json().get('results')
+             if results:
+                 return results[0]['id']
+
+             # Create if not found
+             creation_response = await self.client.post("/api/document_types/", json={"name": doc_type_name})
+             creation_response.raise_for_status()
+             return creation_response.json()['id']
+
+         except Exception as e:
+             logger.error(f"Failed to ensure document type '{doc_type_name}' exists: {str(e)}")
+             return None
+
+     async def set_document_type_by_name(self, document_id: int, doc_type_name: str) -> bool:
+         """
+         Sets the document_type for a specific document using its name.
+         """
+         doc_type_id = await self._ensure_document_type_exists(doc_type_name)
+
+         if not doc_type_id:
+             logger.warning(
+                 f"Skipping document type setting for {document_id}: Could not find or create '{doc_type_name}' type ID.")
+             return False
+
+         bulk_edit_payload = {
+             "documents": [document_id],
+             "method": "set_document_type",
+             "parameters": {
+                 "document_type": doc_type_id
+             }
+         }
+
+         try:
+             response = await self.client.post("/api/documents/bulk_edit/", json=bulk_edit_payload)
+             response.raise_for_status()
+             logger.info(f"Document {document_id} successfully assigned to Document Type: {doc_type_name}.")
+             return True
+         except httpx.HTTPStatusError as e:
+             logger.error(f"Failed to set document type via bulk_edit: {e.response.text}")
+             return False
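
Taken together, the methods above form the full ingest path: upload a file, poll the consume task until it finishes (or resolves to an existing duplicate), fetch the OCR text and classification metadata, and optionally write extracted values back as custom fields via bulk_edit. Below is a minimal usage sketch, not part of the package diff. It assumes the import path guardianhub.clients.paperless_client (matching the file listing), a reachable Paperless instance at the illustrative base_url, that PAPERLESS_ACCESS_KEY is configured in settings, that the value returned by upload_document is the task UUID expected by wait_for_document_processing, and that the custom-field helpers are methods on PaperlessClient as the in-file comments indicate. The field name "invoice_total", template ID "invoice-v1", and the amount are made-up values.

import asyncio

from guardianhub.clients.paperless_client import PaperlessClient  # path assumed from the file listing


async def ingest(file_path: str, file_name: str) -> dict:
    # base_url is illustrative; in a real deployment it comes from configuration.
    client = PaperlessClient(base_url="http://paperless.local:8000")

    # 1. Upload the file; Paperless answers with the consume-task UUID.
    task_id = await client.upload_document(file_path, file_name, skip_ocr=False)

    # 2. Poll /api/tasks/ until the task succeeds or resolves to an existing duplicate.
    task_info = await client.wait_for_document_processing(task_id)
    document_id = int(task_info["related_document"])

    # 3. Pull OCR text and classification metadata for downstream services.
    document = await client.get_document(document_id)

    # 4. Optionally register a custom field and write an extracted value back
    #    (field name, template ID, and value are illustrative).
    field_id = await client.ensure_custom_field("invoice_total", "number")
    if field_id is not None:
        await client.update_document_metadata(
            document_id,
            template_id="invoice-v1",
            custom_field_values={field_id: 450.00},
        )

    return document


if __name__ == "__main__":
    print(asyncio.run(ingest("/tmp/invoice.pdf", "invoice.pdf")))
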
@@ -0,0 +1,18 @@
+ # services/registry_client.py
+
+ import httpx
+ from guardianhub.config.settings import settings
+
+ class RegistryClient:
+
+     def __init__(self):
+         self.base = settings.endpoints.get("DOC_REGISTRY_URL")
+
+     async def save_document(self, doc):
+         async with httpx.AsyncClient() as client:
+             await client.post(f"{self.base}/v1/documents", json=doc)
+
+     async def get_document(self, doc_id):
+         async with httpx.AsyncClient() as client:
+             r = await client.get(f"{self.base}/v1/documents/{doc_id}")
+             return r.json()
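
RegistryClient is a thin wrapper that opens a fresh httpx.AsyncClient per call and never checks response status, so callers only see failures as exceptions raised by httpx or by r.json(). A short, hedged usage sketch follows; the import path guardianhub.clients.registry_client is assumed from the file listing, DOC_REGISTRY_URL is assumed to be present in settings.endpoints, and the document payload and ID are illustrative.

import asyncio

from guardianhub.clients.registry_client import RegistryClient  # path assumed from the file listing


async def main() -> None:
    registry = RegistryClient()

    # Fire-and-forget POST; the client does not inspect the response status.
    await registry.save_document({"id": "doc-123", "title": "Invoice 42"})

    # Returns the parsed JSON body of GET /v1/documents/{doc_id}.
    print(await registry.get_document("doc-123"))


asyncio.run(main())
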
@@ -0,0 +1,58 @@
+ # clients/metadata_extractor_client.py
+
+ import uuid
+
+ from clients import LLMService
+
+ from guardianhub import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class TextCleanerClient:
+     def __init__(self, llm_service: LLMService):
+         self.llm_service = llm_service
+
+     async def call_llm_extraction_service_impl(self, input_data: dict) -> str:
+         """Implementation of LLM-based invoice extraction service call."""
+
+         # 1️⃣ Normalize the input to get plain text
+         if isinstance(input_data, str):
+             extracted_text = input_data
+         else:
+             extracted_text = input_data.get("text")
+
+         if not extracted_text or not isinstance(extracted_text, str):
+             raise ValueError("No valid text provided for LLM extraction")
+
+         if isinstance(extracted_text, bytes):
+             extracted_text = extracted_text.decode("utf-8")
+
+         # 2️⃣ Initialize LLM service
+
+         # 3️⃣ Generate internal tracking ID (not passed to LLM)
+         workflow_invoice_id = input_data.get("invoice_id") or f"inv-{str(uuid.uuid4())[:8]}"
+
+         # 4️⃣ Define system prompt (without injecting invoice_id)
+         system_prompt = (
+             "You are an expert AI assistant specialized in extracting structured data from invoices. "
+             "Analyze the given invoice text and extract key details such as invoice number, vendor, date, amount, and due date. "
+             "Return your response as a JSON object matching the given schema.\n"
+             "If any field is missing, use null."
+         )
+
+         # 5️⃣ Call LLM for structured extraction
+         try:
+             invoice_data = await self.llm_service.get_structured_response(
+                 user_input=extracted_text,
+                 system_prompt=system_prompt,
+                 response_model=None
+             )
+
+             result = invoice_data.model_dump(exclude_unset=True)
+             result["workflow_invoice_id"] = workflow_invoice_id  # attach for traceability
+             return result
+
+         except Exception as e:
+             logger.error(f"LLM extraction failed: {str(e)}")
+             raise RuntimeError("Failed to process invoice with LLM") from e
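
This client delegates structured extraction to an injected LLMService; the 58-line hunk matches guardianhub/clients/text_cleaner_client.py in the listing, although its header comment names metadata_extractor_client.py. A hedged wiring sketch follows. The import paths are assumptions based on the file listing, the LLMService instance is passed in rather than constructed because its constructor is not shown in this diff, and note that the method calls model_dump() on the response even though response_model=None is passed, so get_structured_response is expected to return a Pydantic model.

# Import paths are assumptions based on the file listing above.
from guardianhub.clients.llm_service import LLMService
from guardianhub.clients.text_cleaner_client import TextCleanerClient


async def extract_invoice(llm_service: LLMService, raw_text: str) -> dict:
    """Run LLM-based invoice extraction over already-OCR'd text."""
    cleaner = TextCleanerClient(llm_service=llm_service)
    # Returns the extracted fields plus a generated workflow_invoice_id.
    return await cleaner.call_llm_extraction_service_impl({"text": raw_text})


# Example call (llm_service must be constructed as defined in llm_service.py, not shown here):
# result = asyncio.run(extract_invoice(llm_service, "INVOICE #123 ... Total: $450.00"))
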