guardianhub 0.1.88__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guardianhub/__init__.py +29 -0
- guardianhub/_version.py +1 -0
- guardianhub/agents/runtime.py +12 -0
- guardianhub/auth/token_provider.py +22 -0
- guardianhub/clients/__init__.py +2 -0
- guardianhub/clients/classification_client.py +52 -0
- guardianhub/clients/graph_db_client.py +161 -0
- guardianhub/clients/langfuse/dataset_client.py +157 -0
- guardianhub/clients/langfuse/manager.py +118 -0
- guardianhub/clients/langfuse/prompt_client.py +68 -0
- guardianhub/clients/langfuse/score_evaluation_client.py +92 -0
- guardianhub/clients/langfuse/tracing_client.py +250 -0
- guardianhub/clients/langfuse_client.py +63 -0
- guardianhub/clients/llm_client.py +144 -0
- guardianhub/clients/llm_service.py +295 -0
- guardianhub/clients/metadata_extractor_client.py +53 -0
- guardianhub/clients/ocr_client.py +81 -0
- guardianhub/clients/paperless_client.py +515 -0
- guardianhub/clients/registry_client.py +18 -0
- guardianhub/clients/text_cleaner_client.py +58 -0
- guardianhub/clients/vector_client.py +344 -0
- guardianhub/config/__init__.py +0 -0
- guardianhub/config/config_development.json +84 -0
- guardianhub/config/config_prod.json +39 -0
- guardianhub/config/settings.py +221 -0
- guardianhub/http/http_client.py +26 -0
- guardianhub/logging/__init__.py +2 -0
- guardianhub/logging/logging.py +168 -0
- guardianhub/logging/logging_filters.py +35 -0
- guardianhub/models/__init__.py +0 -0
- guardianhub/models/agent_models.py +153 -0
- guardianhub/models/base.py +2 -0
- guardianhub/models/registry/client.py +16 -0
- guardianhub/models/registry/dynamic_loader.py +73 -0
- guardianhub/models/registry/loader.py +37 -0
- guardianhub/models/registry/registry.py +17 -0
- guardianhub/models/registry/signing.py +70 -0
- guardianhub/models/template/__init__.py +0 -0
- guardianhub/models/template/agent_plan.py +65 -0
- guardianhub/models/template/agent_response_evaluation.py +67 -0
- guardianhub/models/template/extraction.py +29 -0
- guardianhub/models/template/reflection_critique.py +206 -0
- guardianhub/models/template/suggestion.py +42 -0
- guardianhub/observability/__init__.py +1 -0
- guardianhub/observability/instrumentation.py +271 -0
- guardianhub/observability/otel_helper.py +43 -0
- guardianhub/observability/otel_middlewares.py +73 -0
- guardianhub/prompts/base.py +7 -0
- guardianhub/prompts/providers/langfuse_provider.py +13 -0
- guardianhub/prompts/providers/local_provider.py +22 -0
- guardianhub/prompts/registry.py +14 -0
- guardianhub/scripts/script.sh +31 -0
- guardianhub/services/base.py +15 -0
- guardianhub/template/__init__.py +0 -0
- guardianhub/tools/gh_registry_cli.py +171 -0
- guardianhub/utils/__init__.py +0 -0
- guardianhub/utils/app_state.py +74 -0
- guardianhub/utils/fastapi_utils.py +152 -0
- guardianhub/utils/json_utils.py +137 -0
- guardianhub/utils/metrics.py +60 -0
- guardianhub-0.1.88.dist-info/METADATA +240 -0
- guardianhub-0.1.88.dist-info/RECORD +64 -0
- guardianhub-0.1.88.dist-info/WHEEL +4 -0
- guardianhub-0.1.88.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,295 @@
# llm/llm_service.py

import json
from typing import Any, Dict, Type, TypeVar, List, Optional

from pydantic import BaseModel, Field, ValidationError, create_model

from guardianhub.config.settings import settings
from guardianhub import get_logger
from guardianhub.models.template.extraction import StructuredExtractionResult
from guardianhub.models.template.suggestion import TemplateSchemaSuggestion
from .llm_client import LLMClient
from ..utils.json_utils import parse_structured_response

logger = get_logger(__name__)
T = TypeVar("T", bound=BaseModel)


class LLMService:
    """Service layer for structured extraction from Aura-LLM."""

    def __init__(self, llm_client: LLMClient):
        self.llm = llm_client
        logger.info("LLMService initialized with client: %s", llm_client.__class__.__name__)

    def create_model_from_schema(self, schema: Dict[str, Any]) -> Type[BaseModel]:
        """Create a dynamic Pydantic model from a JSON schema."""
        fields = {}
        for field_name, field_props in schema.get('properties', {}).items():
            field_type = self._get_python_type(field_props.get('type', 'string'))
            # Optional with a None default, carrying the schema description as
            # field metadata (rather than misusing the description as the default).
            fields[field_name] = (
                Optional[field_type],
                Field(default=None, description=field_props.get('description', '')),
            )

        return create_model('DynamicModel', **fields)

    def _get_python_type(self, schema_type) -> type:
        """Map JSON schema types to Python types, handling both strings and lists of types."""
        # If it's a list, use the first type (or default to string)
        if isinstance(schema_type, list):
            schema_type = schema_type[0] if schema_type else 'string'

        # Ensure schema_type is a string
        schema_type = str(schema_type).lower()

        type_map = {
            'string': str,
            'integer': int,
            'number': float,
            'boolean': bool,
            'array': list,
            'object': dict,
        }
        return type_map.get(schema_type, str)

    async def get_structured_response(
        self,
        user_input: str,
        system_prompt: str,
        response_model: Optional[Type[T]] = None,
        model_json_schema: Optional[Dict[str, Any]] = None,
        temperature: float = settings.llm.temperature,
        max_tokens: int = settings.llm.max_tokens,
    ) -> T:
        """Request structured JSON response from Aura-LLM and parse it safely.

        Args:
            user_input: The input text to process
            system_prompt: Task-specific instructions for the LLM
            response_model: Pydantic model defining the expected response schema
            model_json_schema: JSON schema used to build a dynamic model when no response_model is given
            temperature: Controls randomness (0.0 = deterministic)
            max_tokens: Maximum number of tokens to generate

        Returns:
            An instance of the response model with the extracted data
        """
        logger.info("Starting get_structured_response")
        logger.info("System prompt: %s", system_prompt[:200] + "..." if len(system_prompt) > 200 else system_prompt)
        logger.info("User input length: %d characters", len(user_input))
        if model_json_schema and not response_model:
            logger.info("Creating model from JSON schema")
            response_model = self.create_model_from_schema(model_json_schema)

        if not response_model:
            error_msg = "Either response_model or model_json_schema must be provided"
            logger.error(error_msg)
            raise ValueError(error_msg)

        # Get the JSON schema from the response model and describe it for the LLM
        schema = response_model.model_json_schema()
        logger.debug("Using schema: %s", json.dumps(schema, indent=2))

        schema_description = json.dumps(schema, indent=2)

        # Single, unified system prompt: merge the general JSON-generator persona
        # with the task-specific instructions and the required schema.
        unified_system_prompt = f"""
You are an **expert, precise JSON generator and data extraction API**.

{system_prompt}

Your SOLE output **MUST** be a single, valid JSON object that **STRICTLY** conforms to the following schema. **DO NOT** include any other text, markdown wrappers (like ```json), explanations, or custom fields.

--- REQUIRED JSON SCHEMA ---
{schema_description}
--- END SCHEMA ---

STRICT CONFORMANCE RULES:
1. The output MUST be raw JSON text, with NO wrapping characters.
2. Only include fields defined in the schema.
3. If a field is a complex object (like 'metadata'), it MUST be output as a **direct JSON object**, NOT a nested JSON string. For example: "metadata": {{ "key": "value" }}, NOT "metadata": "{{\\"key\\": \\"value\\"}}".
4. Maintain the exact field names and types from the schema.
"""

        # Use a single system instruction to avoid message conflicts
        messages = [
            {"role": "system", "content": unified_system_prompt},
            {"role": "user", "content": user_input},
        ]

        raw_text = ""  # initialized so the error handlers below can safely reference it
        try:
            logger.debug("Sending request to LLM with %d messages", len(messages))
            response = await self.llm.chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                model=settings.llm.model_key,
                response_format={"type": "json"},
            )
            logger.debug("Received response from LLM")
            if not response:
                error_msg = "Empty response from LLM"
                logger.error(error_msg)
                raise ValueError(error_msg)

            logger.debug("Full LLM response: %s", json.dumps(response, indent=2))

            # Extract and clean the response
            raw_text = (
                response.get("choices", [{}])[0]
                .get("message", {})
                .get("content", "")
                .strip()
            )

            if not raw_text:
                error_msg = "Empty content in LLM response"
                logger.error(error_msg)
                raise ValueError(error_msg)

            # Extract and validate JSON
            logger.debug("Raw text from LLM: %s", raw_text)
            result = parse_structured_response(raw_text, response_model)
            logger.info("Successfully parsed structured response")
            return result
        except json.JSONDecodeError as e:
            error_msg = f"Failed to decode LLM response as JSON: {str(e)}"
            logger.error("%s\nRaw response: %s", error_msg, raw_text)
            raise
        except ValidationError as e:
            error_msg = f"Response validation failed: {str(e)}"
            logger.error("%s\nResponse was: %s", error_msg, raw_text)
            raise
        except Exception as e:
            logger.error("Unexpected error in get_structured_response: %s", str(e), exc_info=True)
            raise

    async def classify_and_extract_document_metadata(
        self,
        document_text: str,
        available_document_types: Optional[List[str]] = None,
    ) -> StructuredExtractionResult:
        """
        Performs unified zero-shot classification and structured metadata extraction.

        Args:
            document_text: The full text content of the document.
            available_document_types: A list of known document types to classify against.

        Returns:
            StructuredExtractionResult: An object containing the classified type and extracted metadata.
        """
        logger.info("Classifying and extracting metadata from document")
        logger.debug("Document text length: %d characters", len(document_text))

        try:
            if available_document_types is None:
                available_document_types = ["Invoice", "Receipt", "Contract", "Bill", "Statement", "Form", "Other",
                                            "Technical Knowledge Documents"]

            # Build the system prompt for the unified task
            document_types_list = ", ".join(available_document_types)

            system_prompt = f"""
You are an expert document classification and data extraction engine.
Your task is two-fold:
1. Classify the document text into one of the following high-level types: **{document_types_list}**.
2. Based on the classification, extract all relevant key-value metadata pairs.

Extraction Rules:
- If classified as 'Invoice' or 'Receipt': Extract fields like `vendor_name`, `date`, `total_amount`, `currency`, and `invoice_number`.
- If classified as 'Contract' or 'Statement': Extract fields like `parties`, `start_date`, `end_date`, and `document_title`.
- If classified as 'Technical Knowledge Documents': Extract fields like `document_title`, `key_components` (list of strings), and `abstract`.
- Extract dates in ISO 8601 format (YYYY-MM-DD) and monetary values as floats/strings.
- If the document is classified as 'Other', return an empty dictionary for 'metadata'.
- If no type can be determined, use 'Unknown' for `document_type`.
"""

            result = await self.get_structured_response(
                user_input=document_text,
                system_prompt=system_prompt,
                response_model=StructuredExtractionResult,
                temperature=0.0,
            )
            logger.info("Successfully classified document and extracted metadata")
            logger.debug("Extraction result: %s", result.model_dump_json(indent=2))
            return result
        except Exception as e:
            logger.error("Failed to classify and extract document metadata: %s", str(e), exc_info=True)
            raise

    async def suggest_template_schema(
        self, document_text: str, document_type: str
    ) -> Optional[TemplateSchemaSuggestion]:
        """
        Uses the LLM to analyze document text and suggest a new template schema with structured output.

        Args:
            document_text: The full cleaned text of the new document.
            document_type: The type of document being processed.

        Returns:
            A validated TemplateSchemaSuggestion instance or None if processing fails.
        """
        logger.info("Suggesting template schema for document type: %s", document_type)
        logger.debug("Document text length: %d characters", len(document_text))

        try:
            # 1. Define the system prompt with clear instructions for structured output
            system_prompt = (
                "You are an expert document template designer. Analyze the provided document text and "
                "return a structured response with the following fields:\n"
                "- document_type: The type of document (e.g., 'Invoice', 'Contract')\n"
                "- template_name: A descriptive name for this template\n"
                "- description: A brief description of the document's purpose\n"
                "- fields: A list of fields with their types and descriptions\n"
                "- required_fields: List of required field names\n"
                "- examples: Sample values for each field\n\n"
                "The response must be a valid JSON object that matches the TemplateSchemaSuggestion schema."
            )

            # 2. Prepare the user query with document context
            user_query = (
                f"Document Type: {document_type}\n\n"
                "Document Content:\n"
                "--- DOCUMENT START ---\n"
                f"{document_text[:4000]}"  # Limit context window
                "\n--- DOCUMENT END ---\n\n"
                "Please analyze this document and provide a structured template suggestion."
            )

            # 3. Get structured response using the LLM client
            response = await self.llm.generate_structured(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_query},
                ],
                response_model=TemplateSchemaSuggestion,
                temperature=0.1,  # Lower temperature for more deterministic output
                max_tokens=2048,
            )

            logger.info("Generated template schema: %s", response)
            return response

        except ValidationError as ve:
            # Log the specific validation errors; no partial result can be
            # recovered from a Pydantic v2 ValidationError, so fall through.
            logger.error("Schema validation failed: %s", str(ve))
            for error in ve.errors():
                logger.debug("Validation error: %s", error)
        except Exception as e:
            logger.error("Failed to generate template schema: %s", str(e), exc_info=True)

        return None
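Taken together, get_structured_response accepts either a ready-made Pydantic model or a raw JSON schema; in the schema case it builds a DynamicModel on the fly via create_model_from_schema. A minimal usage sketch, not part of the package diff: it assumes LLMClient can be constructed from settings alone (its actual constructor is in llm_client.py, not shown here), and the sentiment_schema dict is purely illustrative.

# Hypothetical usage sketch (assumes a configured LLMClient).
import asyncio

from guardianhub.clients.llm_client import LLMClient
from guardianhub.clients.llm_service import LLMService

async def main():
    service = LLMService(LLMClient())

    # Illustrative JSON schema; create_model_from_schema maps each property
    # to a Python type and builds a DynamicModel behind the scenes.
    sentiment_schema = {
        "properties": {
            "sentiment": {"type": "string", "description": "positive, negative, or neutral"},
            "confidence": {"type": "number", "description": "score between 0.0 and 1.0"},
        }
    }

    result = await service.get_structured_response(
        user_input="The delivery was late and the packaging was damaged.",
        system_prompt="Classify the sentiment of the customer message.",
        model_json_schema=sentiment_schema,
    )
    print(result.model_dump())

asyncio.run(main())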
@@ -0,0 +1,53 @@
# clients/metadata_extractor_client.py

import uuid
import logging
from typing import Dict, Any

from .llm_service import LLMService

logger = logging.getLogger(__name__)


class MetadataExtractorClient:
    """Client that delegates invoice metadata extraction to the LLMService."""

    def __init__(self, llm_service: LLMService):
        self.llm_service = llm_service

    async def call_llm_extraction_service_impl(self, input_data: dict) -> Dict[str, Any]:
        """Implementation of LLM-based invoice extraction service call."""

        # 1️⃣ Normalize the input to get plain text
        if isinstance(input_data, str):
            extracted_text = input_data
        else:
            extracted_text = input_data.get("text")

        if isinstance(extracted_text, bytes):
            extracted_text = extracted_text.decode("utf-8")

        if not extracted_text or not isinstance(extracted_text, str):
            raise ValueError("No valid text provided for LLM extraction")

        # 2️⃣ Generate internal tracking ID (not passed to LLM)
        workflow_invoice_id = (
            input_data.get("invoice_id") if isinstance(input_data, dict) else None
        ) or f"inv-{str(uuid.uuid4())[:8]}"

        # 3️⃣ Define system prompt (without injecting invoice_id)
        system_prompt = (
            "You are an expert AI assistant specialized in extracting structured data from invoices. "
            "Analyze the given invoice text and extract key details such as invoice number, vendor, date, amount, and due date. "
            "Return your response as a JSON object matching the given schema.\n"
            "If any field is missing, use null."
        )

        # 4️⃣ JSON schema covering the fields named in the prompt; LLMService builds
        # a dynamic model from it since no Pydantic response_model is supplied.
        invoice_schema = {
            "properties": {
                "invoice_number": {"type": "string", "description": "Invoice identifier"},
                "vendor": {"type": "string", "description": "Vendor or supplier name"},
                "date": {"type": "string", "description": "Invoice date (YYYY-MM-DD)"},
                "amount": {"type": "number", "description": "Total invoice amount"},
                "due_date": {"type": "string", "description": "Payment due date (YYYY-MM-DD)"},
            }
        }

        # 5️⃣ Call LLM for structured extraction
        try:
            invoice_data = await self.llm_service.get_structured_response(
                user_input=extracted_text,
                system_prompt=system_prompt,
                model_json_schema=invoice_schema,
            )

            result = invoice_data.model_dump(exclude_unset=True)
            result["workflow_invoice_id"] = workflow_invoice_id  # attach for traceability
            return result

        except Exception as e:
            logger.error("LLM extraction failed: %s", str(e))
            raise RuntimeError("Failed to process invoice with LLM") from e
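A short end-to-end sketch of the extraction path, again not part of the diff: the dependency wiring (constructing LLMClient and LLMService directly) and the sample invoice text are assumptions for illustration.

# Hypothetical usage sketch -- wiring is illustrative, not prescribed by the package.
import asyncio

from guardianhub.clients.llm_client import LLMClient
from guardianhub.clients.llm_service import LLMService
from guardianhub.clients.metadata_extractor_client import MetadataExtractorClient

async def main():
    extractor = MetadataExtractorClient(LLMService(LLMClient()))
    result = await extractor.call_llm_extraction_service_impl(
        {"text": "Invoice #INV-1042 from Acme Corp, total $1,250.00, due 2025-01-31."}
    )
    # result holds the extracted fields plus the generated workflow_invoice_id
    print(result["workflow_invoice_id"], result)

asyncio.run(main())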
@@ -0,0 +1,81 @@
# services/ocr_client.py

import httpx
from typing import Dict, Any, Optional
from guardianhub.config.settings import settings

from guardianhub import get_logger

logger = get_logger(__name__)


class OCRClient:
    """Client for interacting with the OCR service."""

    def __init__(self, base_url: Optional[str] = None):
        """Initialize the OCR client.

        Args:
            base_url: Base URL of the OCR service. If not provided, uses settings.endpoints.OCR_URL
        """
        self.base = base_url or getattr(settings.endpoints, 'OCR_URL', 'http://doc-ocr.guardianhub.com')
        logger.info("Initialized OCR client with base URL: %s", self.base)

    async def extract_text(self, object_identifier: str) -> Dict[str, Any]:
        """Extract text from a document in MinIO."""
        endpoint = f"{self.base}/v1/ocr/process_from_minio"

        # Prepare the request payload
        payload = {"object_name": object_identifier}

        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                logger.debug(f"Sending OCR request to {endpoint} with payload: {payload}")
                response = await client.post(endpoint, json=payload)
                response.raise_for_status()  # Raise HTTPStatusError for bad responses

                result = response.json()
                logger.debug(f"Received OCR response: {result}")

                # Handle the nested response structure
                if not isinstance(result, dict):
                    raise ValueError(f"Unexpected response type: {type(result).__name__}")

                # Check for success status and extract data
                if result.get('status') != 'success' or 'data' not in result:
                    raise ValueError(f"Unexpected response format: {result}")

                data = result.get('data', {})

                # Check for extracted text in the data section
                if 'extracted_text' in data:
                    return {
                        'extracted_text': data['extracted_text'],
                        'status': 'success',
                        'raw_response': result
                    }
                elif 'text' in data:
                    return {
                        'extracted_text': data['text'],
                        'status': 'success',
                        'raw_response': result
                    }
                else:
                    raise ValueError("Response missing 'text' or 'extracted_text' in data")

        except httpx.HTTPStatusError as e:
            error_msg = f"OCR service returned {e.response.status_code}: {e.response.text}"
            logger.error(error_msg)
            e.add_note(f"Response content: {e.response.text}")
            raise
        except httpx.RequestError as e:
            error_msg = f"Request to OCR service failed: {str(e)}"
            logger.error(error_msg)
            raise RuntimeError(error_msg) from e
        except ValueError as e:
            logger.error("Failed to parse OCR response: %s. Response: %s", str(e),
                         response.text if 'response' in locals() else 'No response')
            raise ValueError(f"Invalid response from OCR service: {str(e)}") from e
        except Exception as e:
            logger.error("Unexpected error in OCR client: %s", str(e), exc_info=True)
            raise RuntimeError(f"Failed to process OCR request: {str(e)}") from e
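The OCR client's normalized return shape (extracted_text, status, raw_response) makes the happy path simple to consume. A brief sketch, not part of the diff; the MinIO object name below is illustrative:

# Hypothetical usage sketch -- the object name is illustrative.
import asyncio

from guardianhub.clients.ocr_client import OCRClient

async def main():
    ocr = OCRClient()  # falls back to settings.endpoints.OCR_URL or the built-in default
    result = await ocr.extract_text("uploads/invoice-scan.pdf")
    print(result["status"], result["extracted_text"][:200])

asyncio.run(main())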