guardianhub-0.1.88-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. guardianhub/__init__.py +29 -0
  2. guardianhub/_version.py +1 -0
  3. guardianhub/agents/runtime.py +12 -0
  4. guardianhub/auth/token_provider.py +22 -0
  5. guardianhub/clients/__init__.py +2 -0
  6. guardianhub/clients/classification_client.py +52 -0
  7. guardianhub/clients/graph_db_client.py +161 -0
  8. guardianhub/clients/langfuse/dataset_client.py +157 -0
  9. guardianhub/clients/langfuse/manager.py +118 -0
  10. guardianhub/clients/langfuse/prompt_client.py +68 -0
  11. guardianhub/clients/langfuse/score_evaluation_client.py +92 -0
  12. guardianhub/clients/langfuse/tracing_client.py +250 -0
  13. guardianhub/clients/langfuse_client.py +63 -0
  14. guardianhub/clients/llm_client.py +144 -0
  15. guardianhub/clients/llm_service.py +295 -0
  16. guardianhub/clients/metadata_extractor_client.py +53 -0
  17. guardianhub/clients/ocr_client.py +81 -0
  18. guardianhub/clients/paperless_client.py +515 -0
  19. guardianhub/clients/registry_client.py +18 -0
  20. guardianhub/clients/text_cleaner_client.py +58 -0
  21. guardianhub/clients/vector_client.py +344 -0
  22. guardianhub/config/__init__.py +0 -0
  23. guardianhub/config/config_development.json +84 -0
  24. guardianhub/config/config_prod.json +39 -0
  25. guardianhub/config/settings.py +221 -0
  26. guardianhub/http/http_client.py +26 -0
  27. guardianhub/logging/__init__.py +2 -0
  28. guardianhub/logging/logging.py +168 -0
  29. guardianhub/logging/logging_filters.py +35 -0
  30. guardianhub/models/__init__.py +0 -0
  31. guardianhub/models/agent_models.py +153 -0
  32. guardianhub/models/base.py +2 -0
  33. guardianhub/models/registry/client.py +16 -0
  34. guardianhub/models/registry/dynamic_loader.py +73 -0
  35. guardianhub/models/registry/loader.py +37 -0
  36. guardianhub/models/registry/registry.py +17 -0
  37. guardianhub/models/registry/signing.py +70 -0
  38. guardianhub/models/template/__init__.py +0 -0
  39. guardianhub/models/template/agent_plan.py +65 -0
  40. guardianhub/models/template/agent_response_evaluation.py +67 -0
  41. guardianhub/models/template/extraction.py +29 -0
  42. guardianhub/models/template/reflection_critique.py +206 -0
  43. guardianhub/models/template/suggestion.py +42 -0
  44. guardianhub/observability/__init__.py +1 -0
  45. guardianhub/observability/instrumentation.py +271 -0
  46. guardianhub/observability/otel_helper.py +43 -0
  47. guardianhub/observability/otel_middlewares.py +73 -0
  48. guardianhub/prompts/base.py +7 -0
  49. guardianhub/prompts/providers/langfuse_provider.py +13 -0
  50. guardianhub/prompts/providers/local_provider.py +22 -0
  51. guardianhub/prompts/registry.py +14 -0
  52. guardianhub/scripts/script.sh +31 -0
  53. guardianhub/services/base.py +15 -0
  54. guardianhub/template/__init__.py +0 -0
  55. guardianhub/tools/gh_registry_cli.py +171 -0
  56. guardianhub/utils/__init__.py +0 -0
  57. guardianhub/utils/app_state.py +74 -0
  58. guardianhub/utils/fastapi_utils.py +152 -0
  59. guardianhub/utils/json_utils.py +137 -0
  60. guardianhub/utils/metrics.py +60 -0
  61. guardianhub-0.1.88.dist-info/METADATA +240 -0
  62. guardianhub-0.1.88.dist-info/RECORD +64 -0
  63. guardianhub-0.1.88.dist-info/WHEEL +4 -0
  64. guardianhub-0.1.88.dist-info/licenses/LICENSE +21 -0
guardianhub/clients/llm_service.py
@@ -0,0 +1,295 @@
+ # llm/llm_service.py
+
+ import json
+ from typing import Any, Dict, Type, TypeVar, List, Optional
+
+ from pydantic import BaseModel, Field, ValidationError, create_model
+
+ from guardianhub.config.settings import settings
+ from guardianhub import get_logger
+ from guardianhub.models.template.extraction import StructuredExtractionResult
+ from guardianhub.models.template.suggestion import TemplateSchemaSuggestion
+ from .llm_client import LLMClient
+ from ..utils.json_utils import parse_structured_response
+
+ logger = get_logger(__name__)
+ T = TypeVar("T", bound=BaseModel)
+
+
+ class LLMService:
+     """Service layer for structured extraction from Aura-LLM."""
+
+     def __init__(self, llm_client: LLMClient):
+         self.llm = llm_client
+         logger.info("LLMService initialized with client: %s", llm_client.__class__.__name__)
+
+     def create_model_from_schema(self, schema: Dict[str, Any]) -> Type[BaseModel]:
+         """Create a dynamic Pydantic model from a JSON schema."""
+         fields = {}
+         for field_name, field_props in schema.get('properties', {}).items():
+             field_type = self._get_python_type(field_props.get('type', 'string'))
+             # Attach the JSON-schema description as field metadata; fields are optional by default.
+             fields[field_name] = (
+                 Optional[field_type],
+                 Field(default=None, description=field_props.get('description', '')),
+             )
+
+         return create_model('DynamicModel', **fields)
+
+     def _get_python_type(self, schema_type) -> type:
+         """Map JSON schema types to Python types, handling both strings and lists of types."""
+         # If it's a list, use the first type (or default to string)
+         if isinstance(schema_type, list):
+             schema_type = schema_type[0] if schema_type else 'string'
+
+         # Ensure schema_type is a string
+         schema_type = str(schema_type).lower()
+
+         type_map = {
+             'string': str,
+             'integer': int,
+             'number': float,
+             'boolean': bool,
+             'array': list,
+             'object': dict,
+         }
+         return type_map.get(schema_type, str)
+
+     async def get_structured_response(
+         self,
+         user_input: str,
+         system_prompt: str,
+         response_model: Optional[Type[T]] = None,
+         model_json_schema: Optional[Dict[str, Any]] = None,
+         temperature: float = settings.llm.temperature,
+         max_tokens: int = settings.llm.max_tokens,
+     ) -> T:
+         """Request a structured JSON response from Aura-LLM and parse it safely.
+
+         Args:
+             user_input: The input text to process
+             system_prompt: Task-specific instructions for the LLM
+             response_model: Pydantic model defining the expected response schema
+             model_json_schema: JSON schema used to build a dynamic model when no response_model is given
+             temperature: Controls randomness (0.0 = deterministic)
+             max_tokens: Maximum number of tokens to generate
+
+         Returns:
+             An instance of the response model with the extracted data
+         """
+         logger.info("Starting get_structured_response")
+         logger.info("System prompt: %s", system_prompt[:200] + "..." if len(system_prompt) > 200 else system_prompt)
+         logger.info("User input length: %d characters", len(user_input))
+         if model_json_schema and not response_model:
+             logger.info("Creating model from JSON schema")
+             response_model = self.create_model_from_schema(model_json_schema)
+
+         if not response_model:
+             error_msg = "Either response_model or model_json_schema must be provided"
+             logger.error(error_msg)
+             raise ValueError(error_msg)
+
+         # Create schema description for the LLM
+         schema = response_model.model_json_schema()
+         logger.debug("Using schema: %s", json.dumps(schema, indent=2))
+
+         schema_description = json.dumps(schema, indent=2)
+
+         # --- NEW: Single, Unified, Aggressive System Prompt ---
+         # Merging the general JSON generator persona with the specific task and schema.
+         unified_system_prompt = f"""
+         You are an **expert, precise JSON generator and data extraction API**.
+
+         {system_prompt}
+
+         Your SOLE output **MUST** be a single, valid JSON object that **STRICTLY** conforms to the following schema. **DO NOT** include any other text, markdown wrappers (like ```json), explanations, or custom fields.
+
+         --- REQUIRED JSON SCHEMA ---
+         {schema_description}
+         --- END SCHEMA ---
+
+         STRICT CONFORMANCE RULES:
+         1. The output MUST be raw JSON text, with NO wrapping characters.
+         2. Only include fields defined in the schema (document_type, metadata, confidence).
+         3. If a field is a complex object (like 'metadata'), it MUST be output as a **direct JSON object**, NOT a nested JSON string. For example: "metadata": {{ "key": "value" }}, NOT "metadata": "{{\\"key\\": \\"value\\"}}".
+         4. Maintain the exact field names and types from the schema.
+         """
+
+         # Using a single system instruction to avoid message conflict
+         messages = [
+             {"role": "system", "content": unified_system_prompt},
+             {"role": "user", "content": user_input},
+         ]
+
+         try:
+             logger.debug("Sending request to LLM with %d messages", len(messages))
+             response = await self.llm.chat_completion(
+                 messages=messages,
+                 temperature=temperature,
+                 max_tokens=max_tokens,
+                 model=settings.llm.model_key,
+                 response_format={"type": "json"},
+             )
+             logger.debug("Received response from LLM")
+             if not response:
+                 error_msg = "Empty response from LLM"
+                 logger.error(error_msg)
+                 raise ValueError(error_msg)
+
+             logger.debug("Full LLM response: %s", json.dumps(response, indent=2))
+
+             # Extract and clean the response
+             raw_text = (
+                 response.get("choices", [{}])[0]
+                 .get("message", {})
+                 .get("content", "")
+                 .strip()
+             )
+
+             if not raw_text:
+                 error_msg = "Empty content in LLM response"
+                 logger.error(error_msg)
+                 raise ValueError(error_msg)
+
+             # Extract and validate JSON
+             logger.debug("Raw text from LLM: %s", raw_text)
+             result = parse_structured_response(raw_text, response_model)
+             logger.info("Successfully parsed structured response")
+             return result
+         except json.JSONDecodeError as e:
+             error_msg = f"Failed to decode LLM response as JSON: {str(e)}"
+             logger.error("%s\nRaw response: %s", error_msg, raw_text)
+             raise
+         except ValidationError as e:
+             error_msg = f"Response validation failed: {str(e)}"
+             logger.error("%s\nResponse was: %s", error_msg, raw_text)
+             raise
+         except Exception as e:
+             logger.error("Unexpected error in get_structured_response: %s", str(e), exc_info=True)
+             raise
+
+     async def classify_and_extract_document_metadata(
+         self,
+         document_text: str,
+         available_document_types: Optional[List[str]] = None
+     ) -> StructuredExtractionResult:
+         """
+         Performs unified zero-shot classification and structured metadata extraction.
+
+         Args:
+             document_text: The full text content of the document.
+             available_document_types: A list of known document types to classify against.
+
+         Returns:
+             StructuredExtractionResult: An object containing the classified type and extracted metadata.
+         """
+         logger.info("Classifying document and extracting metadata")
+         logger.debug("Document text length: %d characters", len(document_text))
+
+         try:
+             if available_document_types is None:
+                 available_document_types = ["Invoice", "Receipt", "Contract", "Bill", "Statement", "Form", "Other",
+                                             "Technical Knowledge Documents"]
+
+             # Build the system prompt for the unified task
+             document_types_list = ", ".join(available_document_types)
+
+             system_prompt = f"""
+             You are an expert document classification and data extraction engine.
+             Your task is two-fold:
+             1. Classify the document text into one of the following high-level types: **{document_types_list}**.
+             2. Based on the classification, extract all relevant key-value metadata pairs.
+
+             Extraction Rules:
+             - If classified as 'Invoice' or 'Receipt': Extract fields like `vendor_name`, `date`, `total_amount`, `currency`, and `invoice_number`.
+             - If classified as 'Contract' or 'Statement': Extract fields like `parties`, `start_date`, `end_date`, and `document_title`.
+             - If classified as 'Technical Knowledge Documents': Extract fields like `document_title`, `key_components` (list of strings), and `abstract`.
+             - Extract dates in ISO 8601 format (YYYY-MM-DD) and monetary values as floats/strings.
+             - If the document is classified as 'Other', return an empty dictionary for 'metadata'.
+             - If no type can be determined, use 'Unknown' for `document_type`.
+             """
+
+             result = await self.get_structured_response(
+                 user_input=document_text,
+                 system_prompt=system_prompt,
+                 response_model=StructuredExtractionResult,
+                 temperature=0.0
+             )
+             logger.info("Successfully classified document and extracted metadata")
+             logger.debug("Extraction result: %s", result.model_dump_json(indent=2))
+             return result
+         except Exception as e:
+             logger.error("Failed to classify document and extract metadata: %s", str(e), exc_info=True)
+             raise
+
+     async def suggest_template_schema(self, document_text: str, document_type: str) -> Optional[TemplateSchemaSuggestion]:
+         """
+         Uses the LLM to analyze document text and suggest a new template schema with structured output.
+
+         Args:
+             document_text: The full cleaned text of the new document.
+             document_type: The type of document being processed.
+
+         Returns:
+             A validated TemplateSchemaSuggestion instance or None if processing fails.
+         """
+         try:
+             # 1. Define the system prompt with clear instructions for structured output
+             system_prompt = (
+                 "You are an expert document template designer. Analyze the provided document text and "
+                 "return a structured response with the following fields:\n"
+                 "- document_type: The type of document (e.g., 'Invoice', 'Contract')\n"
+                 "- template_name: A descriptive name for this template\n"
+                 "- description: A brief description of the document's purpose\n"
+                 "- fields: A list of fields with their types and descriptions\n"
+                 "- required_fields: List of required field names\n"
+                 "- examples: Sample values for each field\n\n"
+                 "The response must be a valid JSON object that matches the TemplateSchemaSuggestion schema."
+             )
+
+             # 2. Prepare the user query with document context
+             user_query = (
+                 f"Document Type: {document_type}\n\n"
+                 "Document Content:\n"
+                 f"--- DOCUMENT START ---\n"
+                 f"{document_text[:4000]}"  # Limit context window
+                 f"\n--- DOCUMENT END ---\n\n"
+                 "Please analyze this document and provide a structured template suggestion."
+             )
+
+             # 3. Get structured response using the LLM client
+             response = await self.llm.generate_structured(
+                 messages=[
+                     {"role": "system", "content": system_prompt},
+                     {"role": "user", "content": user_query}
+                 ],
+                 response_model=TemplateSchemaSuggestion,
+                 temperature=0.1,  # Lower temperature for more deterministic output
+                 max_tokens=2048
+             )
+
+             logger.info(f"Generated template schema: {response}")
+             return response
+
+         except ValidationError as ve:
+             logger.error(f"Schema validation failed: {str(ve)}")
+             # Try to recover partial data if possible
+             try:
+                 if hasattr(ve, 'raw_errors') and ve.raw_errors:
+                     # Log the specific validation errors
+                     for error in ve.raw_errors:
+                         logger.debug(f"Validation error: {error}")
+                 # If partial data is available, return it with a warning
+                 if hasattr(ve, 'model') and ve.model:
+                     return ve.model
+             except Exception as e:
+                 logger.debug(f"Error during validation error handling: {str(e)}")
+
+         except Exception as e:
+             logger.error(f"Failed to generate template schema: {str(e)}", exc_info=True)
+
+         return None
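
A minimal usage sketch of `LLMService.get_structured_response` as defined above. The `InvoiceFields` model, the no-argument `LLMClient()` construction, and the sample prompt are illustrative assumptions; the real client configuration lives in `guardianhub/clients/llm_client.py`, which is not shown in this hunk.

```python
# Illustrative sketch (not part of the package): typed extraction via LLMService.
import asyncio
from typing import Optional

from pydantic import BaseModel

from guardianhub.clients.llm_client import LLMClient
from guardianhub.clients.llm_service import LLMService


class InvoiceFields(BaseModel):
    vendor_name: Optional[str] = None
    invoice_number: Optional[str] = None
    total_amount: Optional[float] = None


async def main() -> None:
    # Assumes LLMClient can be built with defaults; check llm_client.py for the real constructor.
    service = LLMService(llm_client=LLMClient())
    fields = await service.get_structured_response(
        user_input="ACME Corp invoice #INV-42, total due 119.00 EUR",
        system_prompt="Extract the vendor, invoice number, and total amount.",
        response_model=InvoiceFields,
        temperature=0.0,
    )
    print(fields.model_dump())


asyncio.run(main())
```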
guardianhub/clients/metadata_extractor_client.py
@@ -0,0 +1,53 @@
+ # clients/metadata_extractor_client.py
+
+ import logging
+ import uuid
+ from typing import Any, Dict
+
+ from .llm_service import LLMService
+
+ logger = logging.getLogger(__name__)
+
+
+ class MetadataExtractorClient:
+     def __init__(self, llm_service: LLMService):
+         self.llm_service = llm_service
+
+     async def call_llm_extraction_service_impl(self, input_data: dict) -> Dict[str, Any]:
+         """Implementation of LLM-based invoice extraction service call."""
+
+         # 1️⃣ Normalize the input to get plain text
+         if isinstance(input_data, str):
+             extracted_text = input_data
+         else:
+             extracted_text = input_data.get("text")
+
+         if isinstance(extracted_text, bytes):
+             extracted_text = extracted_text.decode("utf-8")
+
+         if not extracted_text or not isinstance(extracted_text, str):
+             raise ValueError("No valid text provided for LLM extraction")
+
+         # 2️⃣ LLM service is injected via the constructor
+
+         # 3️⃣ Generate internal tracking ID (not passed to LLM)
+         invoice_id = input_data.get("invoice_id") if isinstance(input_data, dict) else None
+         workflow_invoice_id = invoice_id or f"inv-{str(uuid.uuid4())[:8]}"
+
+         # 4️⃣ Define system prompt (without injecting invoice_id)
+         system_prompt = (
+             "You are an expert AI assistant specialized in extracting structured data from invoices. "
+             "Analyze the given invoice text and extract key details such as invoice number, vendor, date, amount, and due date. "
+             "Return your response as a JSON object matching the given schema.\n"
+             "If any field is missing, use null."
+         )
+
+         # 5️⃣ Call LLM for structured extraction
+         try:
+             invoice_data = await self.llm_service.get_structured_response(
+                 user_input=extracted_text,
+                 system_prompt=system_prompt,
+                 response_model=None
+             )
+
+             result = invoice_data.model_dump(exclude_unset=True)
+             result["workflow_invoice_id"] = workflow_invoice_id  # attach for traceability
+             return result
+
+         except Exception as e:
+             logger.error(f"LLM extraction failed: {str(e)}")
+             raise RuntimeError("Failed to process invoice with LLM") from e
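
As written, `call_llm_extraction_service_impl` passes `response_model=None` without a `model_json_schema`, which `get_structured_response` rejects with a `ValueError`. Below is a hedged sketch of the dynamic-schema path that would satisfy that requirement; the schema and field names are illustrative assumptions, not taken from the package.

```python
# Illustrative only: supplying a JSON schema so get_structured_response can build a dynamic model.
from typing import Any, Dict

from guardianhub.clients.llm_service import LLMService

# Hypothetical schema: field names are examples, not defined anywhere in the package.
INVOICE_SCHEMA: Dict[str, Any] = {
    "properties": {
        "invoice_number": {"type": "string", "description": "Invoice identifier"},
        "vendor": {"type": "string", "description": "Vendor name"},
        "date": {"type": "string", "description": "Invoice date in YYYY-MM-DD"},
        "amount": {"type": "number", "description": "Total amount due"},
        "due_date": {"type": "string", "description": "Payment due date in YYYY-MM-DD"},
    }
}


async def extract_invoice(llm_service: LLMService, text: str) -> Dict[str, Any]:
    # The schema is converted into a dynamic Pydantic model via create_model_from_schema().
    result = await llm_service.get_structured_response(
        user_input=text,
        system_prompt="Extract invoice number, vendor, date, amount, and due date as JSON.",
        model_json_schema=INVOICE_SCHEMA,
    )
    return result.model_dump(exclude_unset=True)
```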
guardianhub/clients/ocr_client.py
@@ -0,0 +1,81 @@
+ # services/ocr_client.py
+
+ import httpx
+ from typing import Dict, Any, Optional
+
+ from guardianhub.config.settings import settings
+ from guardianhub import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class OCRClient:
+     """Client for interacting with the OCR service."""
+
+     def __init__(self, base_url: Optional[str] = None):
+         """Initialize the OCR client.
+
+         Args:
+             base_url: Base URL of the OCR service. If not provided, uses settings.endpoints.OCR_URL
+         """
+         self.base = base_url or getattr(settings.endpoints, 'OCR_URL', 'http://doc-ocr.guardianhub.com')
+         logger.info("Initialized OCR client with base URL: %s", self.base)
+
+     async def extract_text(self, object_identifier: str) -> Dict[str, Any]:
+         """Extract text from a document in MinIO."""
+         endpoint = f"{self.base}/v1/ocr/process_from_minio"
+
+         # Prepare the request payload
+         payload = {"object_name": object_identifier}
+
+         try:
+             async with httpx.AsyncClient(timeout=60.0) as client:
+                 logger.debug(f"Sending OCR request to {endpoint} with payload: {payload}")
+                 response = await client.post(endpoint, json=payload)
+                 response.raise_for_status()  # Raise HTTPStatusError for bad responses
+
+                 result = response.json()
+                 logger.debug(f"Received OCR response: {result}")
+
+                 # Handle the nested response structure
+                 if not isinstance(result, dict):
+                     raise ValueError(f"Unexpected response type: {type(result).__name__}")
+
+                 # Check for success status and extract data
+                 if result.get('status') != 'success' or 'data' not in result:
+                     raise ValueError(f"Unexpected response format: {result}")
+
+                 data = result.get('data', {})
+
+                 # Check for extracted text in the data section
+                 if 'extracted_text' in data:
+                     return {
+                         'extracted_text': data['extracted_text'],
+                         'status': 'success',
+                         'raw_response': result
+                     }
+                 elif 'text' in data:
+                     return {
+                         'extracted_text': data['text'],
+                         'status': 'success',
+                         'raw_response': result
+                     }
+                 else:
+                     raise ValueError("Response missing 'text' or 'extracted_text' in data")
+
+         except httpx.HTTPStatusError as e:
+             error_msg = f"OCR service returned {e.response.status_code}: {e.response.text}"
+             logger.error(error_msg)
+             e.add_note(f"Response content: {e.response.text}")  # BaseException.add_note requires Python 3.11+
+             raise
+         except httpx.RequestError as e:
+             error_msg = f"Request to OCR service failed: {str(e)}"
+             logger.error(error_msg)
+             raise RuntimeError(error_msg) from e
+         except ValueError as e:
+             logger.error("Failed to parse OCR response: %s. Response: %s", str(e),
+                          response.text if 'response' in locals() else 'No response')
+             raise ValueError(f"Invalid response from OCR service: {str(e)}") from e
+         except Exception as e:
+             logger.error("Unexpected error in OCR client: %s", str(e), exc_info=True)
+             raise RuntimeError(f"Failed to process OCR request: {str(e)}") from e
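
A short usage sketch of `OCRClient.extract_text`. The MinIO object name is made up for illustration, and the default endpoint is only reachable inside an environment where `doc-ocr.guardianhub.com` resolves.

```python
# Illustrative sketch (not part of the package): running OCR on an object stored in MinIO.
import asyncio

from guardianhub.clients.ocr_client import OCRClient


async def main() -> None:
    client = OCRClient()  # falls back to settings.endpoints.OCR_URL or the built-in default
    # "invoices/2024/scan-0001.pdf" is a hypothetical object name.
    result = await client.extract_text("invoices/2024/scan-0001.pdf")
    print(result["status"], len(result["extracted_text"]), "characters extracted")


asyncio.run(main())
```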