agent-runtime-core 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,406 @@
1
+ """
2
+ OCR (Optical Character Recognition) providers.
3
+
4
+ Supports multiple OCR backends:
5
+ - Tesseract (local, free)
6
+ - Google Cloud Vision
7
+ - AWS Textract
8
+ - Azure Document Intelligence
9
+
10
+ All providers are optional and checked at runtime.
11
+ """
12
+
13
+ from abc import ABC, abstractmethod
14
+ from dataclasses import dataclass, field
15
+ from typing import Any, Optional
16
+ import base64
17
+
18
+
19
@dataclass
class OCRResult:
    """
    Outcome of a single OCR run.

    Attributes:
        text: The recognised text.
        confidence: Aggregate confidence reported for the run, or None when
            the provider supplies none. The scale is whatever the provider
            reports; no normalisation happens in this class.
        language: Language hint that was used for the run, if any.
        blocks: Structured text blocks; each entry is a provider-specific dict.
        raw_response: The provider's untouched response object, if retained.
    """

    text: str
    confidence: Optional[float] = None
    language: Optional[str] = None
    # default_factory gives every instance its own list
    blocks: list[dict] = field(default_factory=list)
    raw_response: Optional[Any] = None
27
+
28
+
29
class OCRProvider(ABC):
    """Common interface that every OCR backend implements."""

    def is_available(self) -> bool:
        """
        Report whether this provider can be used.

        The base implementation always answers True; subclasses override it
        to check that their dependencies are installed and configured.
        """
        return True

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier of the provider."""
        ...

    @abstractmethod
    async def extract_text(
        self,
        image_bytes: bytes,
        language: str = "eng",
        **kwargs,
    ) -> OCRResult:
        """
        Run OCR over an image and return the recognised text.

        Args:
            image_bytes: Raw image bytes
            language: Language hint (ISO 639-3 code for Tesseract, varies by provider)
            **kwargs: Provider-specific options

        Returns:
            OCRResult with extracted text
        """
        ...
61
+
62
+
63
class TesseractOCR(OCRProvider):
    """
    Tesseract OCR provider (local, free).

    Requires: pytesseract, tesseract-ocr system package
    Install: pip install pytesseract
             brew install tesseract (macOS) or apt install tesseract-ocr (Linux)
    """

    @property
    def name(self) -> str:
        """Provider name."""
        return "tesseract"

    def is_available(self) -> bool:
        """Return True when pytesseract imports and a Tesseract version can be queried."""
        try:
            import pytesseract

            # Try to get version to verify tesseract is installed
            pytesseract.get_tesseract_version()
        except Exception:
            # Any failure while probing means the provider cannot be used.
            return False
        return True

    async def extract_text(
        self,
        image_bytes: bytes,
        language: str = "eng",
        **kwargs,
    ) -> OCRResult:
        """
        Extract text, per-word boxes and an average confidence from an image.

        Args:
            image_bytes: Raw image bytes (anything Pillow can open).
            language: Tesseract language code passed as ``lang``.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            OCRResult whose ``blocks`` hold the recognised words grouped by
            the page/block numbers reported by Tesseract.

        Raises:
            ImportError: If pytesseract or Pillow is not installed.

        NOTE(review): the pytesseract calls are synchronous, so this coroutine
        does not yield to the event loop while OCR is running.
        """
        try:
            import pytesseract
            from PIL import Image
        except ImportError as exc:
            raise ImportError(
                "pytesseract and Pillow are required for Tesseract OCR. "
                "Install with: pip install pytesseract Pillow"
            ) from exc

        import io

        # The context manager releases the decoded image once OCR is done.
        with Image.open(io.BytesIO(image_bytes)) as img:
            # Detailed per-word data (boxes, confidences)
            data = pytesseract.image_to_data(
                img, lang=language, output_type=pytesseract.Output.DICT
            )
            # Plain text, laid out by Tesseract itself
            text = pytesseract.image_to_string(img, lang=language)

        word_confidences = [self._parse_confidence(c) for c in data["conf"]]

        # Average over positive scores only, as before; entries that cannot be
        # read as numbers are skipped instead of breaking the comparison.
        scored = [c for c in word_confidences if c is not None and c > 0]
        avg_confidence = sum(scored) / len(scored) if scored else None

        return OCRResult(
            text=text.strip(),
            confidence=avg_confidence,
            language=language,
            blocks=self._build_blocks(data, word_confidences),
        )

    @staticmethod
    def _parse_confidence(value: Any) -> Optional[float]:
        """Convert one ``conf`` entry to float, or None when it is not numeric."""
        # Presumably some pytesseract/Tesseract combinations hand back strings
        # here - TODO confirm; converting defensively either way.
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _build_blocks(data: dict, word_confidences: list) -> list[dict]:
        """Group non-empty words into blocks keyed by (page_num, block_num)."""
        words = data["text"]
        # Fall back to a single group when the grouping columns are absent.
        pages = data.get("page_num") or [0] * len(words)
        block_nums = data.get("block_num") or [0] * len(words)

        grouped: dict[tuple, dict] = {}
        for i, word in enumerate(words):
            if not word.strip():
                continue
            block = grouped.setdefault(
                (pages[i], block_nums[i]), {"text": "", "words": []}
            )
            block["words"].append({
                "text": word,
                "confidence": word_confidences[i],
                "bbox": {
                    "left": data["left"][i],
                    "top": data["top"][i],
                    "width": data["width"][i],
                    "height": data["height"][i],
                },
            })

        # dicts keep insertion order, so blocks come out in reading order of
        # first appearance.
        blocks = list(grouped.values())
        for block in blocks:
            block["text"] = " ".join(w["text"] for w in block["words"])
        return blocks
137
+
138
+
139
class GoogleVisionOCR(OCRProvider):
    """
    OCR backed by Google Cloud Vision text detection.

    Requires: google-cloud-vision
    Install: pip install google-cloud-vision
    Auth: Set GOOGLE_APPLICATION_CREDENTIALS environment variable
    """

    @property
    def name(self) -> str:
        """Provider name."""
        return "google_vision"

    def is_available(self) -> bool:
        """Return True when the google-cloud-vision package can be imported."""
        try:
            from google.cloud import vision
        except ImportError:
            return False
        return True

    async def extract_text(
        self,
        image_bytes: bytes,
        language: str = "en",
        **kwargs,
    ) -> OCRResult:
        """
        Send the image to Vision text detection and collect the annotations.

        Args:
            image_bytes: Raw image bytes, sent as the image content.
            language: Passed to the API as the single language hint.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            OCRResult with the first annotation's description as the full
            text, one block per remaining annotation, and no confidence.

        Raises:
            ImportError: If google-cloud-vision is not installed.
            RuntimeError: If the response carries an error message.

        NOTE(review): the client call is synchronous, so this coroutine does
        not yield to the event loop while the request is in flight.
        """
        try:
            from google.cloud import vision
        except ImportError:
            raise ImportError(
                "google-cloud-vision is required for Google Vision OCR. "
                "Install with: pip install google-cloud-vision"
            )

        def corner_box(vertices) -> dict:
            # Vertex 0 supplies left/top and vertex 2 supplies right/bottom;
            # any corner that is missing falls back to 0.
            has_opposite = len(vertices) > 2
            return {
                "left": vertices[0].x if vertices else 0,
                "top": vertices[0].y if vertices else 0,
                "right": vertices[2].x if has_opposite else 0,
                "bottom": vertices[2].y if has_opposite else 0,
            }

        annotator = vision.ImageAnnotatorClient()
        payload = vision.Image(content=image_bytes)
        hints = vision.ImageContext(language_hints=[language])
        response = annotator.text_detection(image=payload, image_context=hints)

        if response.error.message:
            raise RuntimeError(f"Google Vision API error: {response.error.message}")

        annotations = response.text_annotations
        # The first annotation holds the full text; the rest are individual pieces.
        full_text = annotations[0].description if annotations else ""
        blocks = [
            {
                "text": piece.description,
                "bbox": corner_box(piece.bounding_poly.vertices),
            }
            for piece in annotations[1:]
        ]

        return OCRResult(
            text=full_text.strip(),
            confidence=None,  # no overall confidence is reported here
            language=language,
            blocks=blocks,
            raw_response=response,
        )
210
+
211
+
212
class AWSTextractOCR(OCRProvider):
    """
    AWS Textract OCR provider.

    Requires: boto3
    Install: pip install boto3
    Auth: Configure AWS credentials (environment variables, ~/.aws/credentials, or IAM role)
    """

    def __init__(self, region_name: str = "us-east-1"):
        """
        Args:
            region_name: AWS region used when creating the Textract client.
        """
        self.region_name = region_name

    @property
    def name(self) -> str:
        """Provider name."""
        return "aws_textract"

    def is_available(self) -> bool:
        """Return True when boto3 can be imported (credentials are not checked)."""
        try:
            import boto3
        except ImportError:
            return False
        return True

    async def extract_text(
        self,
        image_bytes: bytes,
        language: str = "en",
        **kwargs,
    ) -> OCRResult:
        """
        Run Textract ``detect_document_text`` and collect its LINE blocks.

        Args:
            image_bytes: Raw document bytes, sent inline in the request.
            language: Not sent to Textract; only echoed back on the result.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            OCRResult whose text is the LINE blocks joined by newlines and
            whose confidence is the mean of the LINE confidences that were
            actually reported (None when there are none).

        Raises:
            ImportError: If boto3 is not installed.

        NOTE(review): the boto3 call is synchronous, so this coroutine does
        not yield to the event loop while the request is in flight.
        """
        try:
            import boto3
        except ImportError as exc:
            raise ImportError(
                "boto3 is required for AWS Textract OCR. "
                "Install with: pip install boto3"
            ) from exc

        client = boto3.client("textract", region_name=self.region_name)

        # Call Textract
        response = client.detect_document_text(Document={"Bytes": image_bytes})

        lines = []
        blocks = []
        confidences = []

        for block in response.get("Blocks", []):
            # Only LINE blocks contribute; .get avoids a KeyError on a block
            # that lacks a type.
            if block.get("BlockType") != "LINE":
                continue

            line_text = block.get("Text", "")
            line_confidence = block.get("Confidence")
            lines.append(line_text)
            # A missing score must not be averaged in as 0; the per-block
            # entry below already reports it as None.
            if line_confidence is not None:
                confidences.append(line_confidence)

            bbox = block.get("Geometry", {}).get("BoundingBox", {})
            blocks.append({
                "text": line_text,
                "confidence": line_confidence,
                "bbox": {
                    "left": bbox.get("Left", 0),
                    "top": bbox.get("Top", 0),
                    "width": bbox.get("Width", 0),
                    "height": bbox.get("Height", 0),
                },
            })

        avg_confidence = sum(confidences) / len(confidences) if confidences else None

        return OCRResult(
            text="\n".join(lines),
            confidence=avg_confidence,
            language=language,
            blocks=blocks,
            raw_response=response,
        )
285
+
286
+
287
class AzureDocumentOCR(OCRProvider):
    """
    Azure Document Intelligence (Form Recognizer) OCR provider.

    Requires: azure-ai-formrecognizer
    Install: pip install azure-ai-formrecognizer
    Auth: Set AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY environment variables
    """

    def __init__(self, endpoint: str | None = None, key: str | None = None):
        """
        Args:
            endpoint: Service endpoint; falls back to the
                AZURE_FORM_RECOGNIZER_ENDPOINT environment variable.
            key: API key; falls back to the AZURE_FORM_RECOGNIZER_KEY
                environment variable.
        """
        import os

        self.endpoint = endpoint or os.environ.get("AZURE_FORM_RECOGNIZER_ENDPOINT")
        self.key = key or os.environ.get("AZURE_FORM_RECOGNIZER_KEY")

    @property
    def name(self) -> str:
        """Provider name."""
        return "azure_document"

    def is_available(self) -> bool:
        """Return True when the SDK imports and both endpoint and key are set."""
        try:
            from azure.ai.formrecognizer import DocumentAnalysisClient
        except ImportError:
            return False
        return bool(self.endpoint and self.key)

    async def extract_text(
        self,
        image_bytes: bytes,
        language: str = "en",
        **kwargs,
    ) -> OCRResult:
        """
        Analyze the document with the ``prebuilt-read`` model.

        Args:
            image_bytes: Raw document bytes passed to the analysis call.
            language: Not sent to the service; only echoed back on the result.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            OCRResult whose text is every line joined by newlines, whose
            blocks carry each line's polygon, and whose confidence is the
            mean of the word confidences the service reported (None when
            there are none).

        Raises:
            ImportError: If azure-ai-formrecognizer is not installed.
            ValueError: If the endpoint or key is missing.

        NOTE(review): ``poller.result()`` is a synchronous wait, so this
        coroutine does not yield to the event loop during analysis.
        """
        try:
            from azure.ai.formrecognizer import DocumentAnalysisClient
            from azure.core.credentials import AzureKeyCredential
        except ImportError as exc:
            raise ImportError(
                "azure-ai-formrecognizer is required for Azure Document OCR. "
                "Install with: pip install azure-ai-formrecognizer"
            ) from exc

        if not self.endpoint or not self.key:
            raise ValueError(
                "Azure endpoint and key are required. "
                "Set AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY environment variables."
            )

        # The context manager closes the client once the analysis is done.
        with DocumentAnalysisClient(
            endpoint=self.endpoint,
            credential=AzureKeyCredential(self.key),
        ) as client:
            poller = client.begin_analyze_document("prebuilt-read", image_bytes)
            result = poller.result()

        lines = []
        blocks = []
        confidences = []

        for page in result.pages or []:
            # Use the confidences the service reports per word rather than a
            # made-up constant for every line.
            for word in getattr(page, "words", None) or []:
                word_confidence = getattr(word, "confidence", None)
                if word_confidence is not None:
                    confidences.append(word_confidence)

            for line in page.lines or []:
                lines.append(line.content)
                # Only lines that come with a polygon become blocks.
                if line.polygon:
                    blocks.append({
                        "text": line.content,
                        "polygon": [(p.x, p.y) for p in line.polygon],
                    })

        avg_confidence = sum(confidences) / len(confidences) if confidences else None

        return OCRResult(
            text="\n".join(lines),
            confidence=avg_confidence,
            language=language,
            blocks=blocks,
            raw_response=result,
        )
369
+
370
+
371
# Lookup table from provider name to the class that implements it.
OCR_PROVIDERS: dict[str, type[OCRProvider]] = dict(
    tesseract=TesseractOCR,
    google_vision=GoogleVisionOCR,
    aws_textract=AWSTextractOCR,
    azure_document=AzureDocumentOCR,
)
378
+
379
+
380
def get_ocr_provider(name: str, **kwargs) -> OCRProvider:
    """
    Instantiate the OCR provider registered under ``name``.

    Args:
        name: Provider name (tesseract, google_vision, aws_textract, azure_document)
        **kwargs: Forwarded unchanged to the provider's constructor

    Returns:
        A new OCRProvider instance

    Raises:
        ValueError: If no provider is registered under ``name``.
    """
    provider_cls = OCR_PROVIDERS.get(name)
    if provider_cls is None:
        raise ValueError(f"Unknown OCR provider: {name}. Available: {list(OCR_PROVIDERS.keys())}")
    return provider_cls(**kwargs)
394
+
395
+
396
def get_available_ocr_providers() -> list[str]:
    """
    Get list of available (installed and configured) OCR providers.

    Each registered provider is constructed with no arguments and asked
    ``is_available()``. A provider whose construction or probe raises is
    treated as unavailable; the failure is logged at DEBUG level so that it
    can still be diagnosed.

    Returns:
        Names of the providers that reported themselves available, in
        registry order.
    """
    import logging

    logger = logging.getLogger(__name__)

    available = []
    for name, provider_class in OCR_PROVIDERS.items():
        try:
            if provider_class().is_available():
                available.append(name)
        except Exception:
            # Best-effort discovery: never let one broken provider abort the
            # scan, but leave a trace instead of discarding the error.
            logger.debug(
                "OCR provider %r could not be probed; treating it as unavailable",
                name,
                exc_info=True,
            )
    return available