my-aws-helpers 2.6.0__tar.gz → 2.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of my-aws-helpers might be problematic. Click here for more details.
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/PKG-INFO +1 -1
- my_aws_helpers-2.6.2/my_aws_helpers/bedrock.py +199 -0
- my_aws_helpers-2.6.2/my_aws_helpers/prompts/__init__.py +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers.egg-info/PKG-INFO +1 -1
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers.egg-info/SOURCES.txt +2 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/setup.py +3 -2
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/README.md +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers/api.py +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers/auth.py +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers/cognito.py +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers/dynamo.py +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers/errors.py +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers/event.py +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers/logging.py +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers/s3.py +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers/sfn.py +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers.egg-info/dependency_links.txt +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers.egg-info/requires.txt +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers.egg-info/top_level.txt +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/my_aws_helpers.egg-info/zip-safe +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/setup.cfg +0 -0
- {my_aws_helpers-2.6.0 → my_aws_helpers-2.6.2}/tests/test_event.py +0 -0
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import boto3
|
|
3
|
+
from botocore.config import Config
|
|
4
|
+
import json
|
|
5
|
+
import time
|
|
6
|
+
import os
|
|
7
|
+
import io
|
|
8
|
+
from typing import Optional, List, Dict
|
|
9
|
+
from enum import Enum
|
|
10
|
+
import pymupdf
|
|
11
|
+
import concurrent.futures
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from my_aws_helpers.logging import select_powertools_logger
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Module-level logger shared by everything in this file; the second statement below logs once at import time.
logger = select_powertools_logger('bedrock-boy')
|
|
17
|
+
|
|
18
|
+
logger.info("Got logger")
|
|
19
|
+
|
|
20
|
+
class PromptType(str, Enum):
    """File names of the system-prompt templates known to this module.

    Each member's value is the name of a text file that is looked up in the
    ``prompts`` directory next to this module. Because the enum mixes in
    ``str``, a member compares equal to its plain file-name string.
    """

    # Template used when requesting JSON output.
    json = "json_system_prompt.txt"

    # Template used when requesting Markdown output.
    markdown = "markdown_system_prompt.txt"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class TokenUsage:
    """Token counters attached to one OCR result."""

    input_tokens: int   # read from the 'inputTokens' key
    output_tokens: int  # read from the 'outputTokens' key
    total_tokens: int   # read from the 'totalTokens' key

    @classmethod
    def from_dict(cls, data: Dict[str, int]) -> TokenUsage:
        """Build a ``TokenUsage`` from a camelCase usage mapping.

        Args:
            data: Mapping that may contain ``inputTokens``, ``outputTokens``
                and ``totalTokens``. Unknown keys are ignored.

        Returns:
            A new instance; any counter missing from ``data`` is set to 0.
        """
        consumed = data.get('inputTokens', 0)
        produced = data.get('outputTokens', 0)
        combined = data.get('totalTokens', 0)
        return cls(consumed, produced, combined)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class OCRResult:
    """Outcome of running OCR on a single page image."""

    content: List[Dict[str, str]]  # parsed model output for the page
    token_usage: TokenUsage        # token counters for the request
    page_number: int               # page identifier supplied by the caller

    @classmethod
    def from_dict(cls, data: Dict) -> OCRResult:
        """Build an ``OCRResult`` from a plain dictionary.

        Args:
            data: Mapping with optional ``content``, ``token_usage`` and
                ``page_number`` keys. ``token_usage`` is itself a mapping
                that is converted with ``TokenUsage.from_dict``.

        Returns:
            A new instance. Missing keys default to an empty list, an
            all-zero ``TokenUsage`` and page number 0 respectively.
        """
        page_content = data.get("content", [])
        usage = TokenUsage.from_dict(data.get("token_usage", {}))
        page = data.get("page_number", 0)
        return cls(page_content, usage, page)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class Bedrock:
    """Runs OCR on PDF pages by sending page images to a Bedrock model.

    Pages are rendered to PNG with pymupdf and sent one image per request
    through the ``bedrock-runtime`` ``converse`` operation, together with a
    system prompt loaded from the ``prompts`` directory next to this module.
    """

    def __init__(
        self,
        model_id: str = "apac.anthropic.claude-3-5-sonnet-20241022-v2:0",
        region_name: str = "ap-southeast-2",
    ):
        """Create the boto3 session and the ``bedrock-runtime`` client.

        Args:
            model_id: Identifier passed as ``modelId`` on every request.
            region_name: Region used for the client, and for the fallback
                session when explicit credentials are not in the environment.
        """
        self.session = Bedrock._set_session_params()
        if self.session is None:
            # No explicit credentials in the environment: let boto3 resolve
            # them through its default credential chain.
            self.session = boto3.Session(region_name=region_name)

        custom_config = Config(
            retries={
                # botocore counts 'max_attempts' as retries made *after* the
                # initial request, so this allows up to 3 calls per request.
                'max_attempts': 2,
                'mode': 'standard',  # or 'adaptive'
            }
        )
        self.client = self.session.client(
            "bedrock-runtime",
            region_name=region_name,
            config=custom_config,
        )
        self.model_id = model_id

    @staticmethod
    def _set_session_params():
        """Build a session from explicit AWS credentials in the environment.

        Reads ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``,
        ``AWS_SESSION_TOKEN`` and ``AWS_DEFAULT_REGION``.

        Returns:
            A ``boto3.Session``, or ``None`` when any variable is missing or
            the session cannot be created (the failure is logged).
        """
        try:
            aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
            aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
            aws_session_token = os.environ["AWS_SESSION_TOKEN"]
            region_name = os.environ["AWS_DEFAULT_REGION"]
            return boto3.Session(
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key,
                aws_session_token=aws_session_token,
                region_name=region_name,
            )
        except Exception as e:
            # Best effort: the caller falls back to a default session.
            logger.exception(e)
            return None

    def _get_prompt(self, prompt_type: str) -> Optional[str]:
        """Load the system prompt text for ``prompt_type``.

        Args:
            prompt_type: A ``PromptType`` member or its file-name value.

        Returns:
            The file contents, or ``None`` if the file cannot be read (the
            failure is logged).

        Raises:
            Exception: If ``prompt_type`` is not a valid ``PromptType``.
        """
        try:
            # Normalise to the plain file name so enum members and raw
            # strings are handled identically when building the path.
            filename = PromptType(prompt_type).value
        except ValueError:
            raise Exception("Error: Invalid prompt type") from None

        path = os.path.join(os.path.dirname(__file__), "prompts", filename)
        try:
            # Explicit encoding so the result does not depend on the locale.
            with open(path, "r", encoding="utf-8") as f:
                return f.read()
        except Exception as e:
            logger.exception(f"Failed to get {prompt_type} prompt due to {e}")
            return None

    def _ocr(
        self,
        prompt: str,
        image_bytes: bytes,
        page_number: Optional[int] = 0
    ) -> Optional[OCRResult]:
        """Send one PNG image to the model and parse its JSON reply.

        The request is attempted up to three times at this level, on top of
        whatever retries the client itself performs.

        Args:
            prompt: System prompt text.
            image_bytes: PNG-encoded image of the page.
            page_number: Value stored on the returned result.

        Returns:
            The parsed result, including the response's token usage.

        Raises:
            Exception: If the last attempt raises, or if no attempt returns
                HTTP 200.
            json.JSONDecodeError: If the model's reply is not valid JSON.
        """
        system_prompt = [{"text": prompt}]
        message = [
            {
                "role": "user",
                "content": [
                    {
                        "image": {
                            "format": "png",
                            "source": {
                                "bytes": image_bytes,
                            },
                        }
                    }
                ],
            }
        ]
        retries = 3
        for i in range(retries):
            logger.info(f"Attempt number {i} for {self.model_id} converse")
            try:
                response = self.client.converse(
                    modelId=self.model_id,
                    messages=message,
                    system=system_prompt,
                )
            except Exception as e:
                logger.exception(f"Error during conversation due to {e}")
                if i >= retries - 1:
                    # Out of attempts: surface the failure, keeping the cause.
                    raise Exception(e) from e
                continue
            if response['ResponseMetadata']['HTTPStatusCode'] == 200:
                break
        else:
            # Every attempt completed without raising but none returned 200;
            # do not try to parse an unsuccessful response.
            raise Exception(
                f"{self.model_id} converse did not return HTTP 200 after {retries} attempts"
            )

        result = {
            "content": json.loads(response["output"]["message"]["content"][0]["text"]),
            "token_usage": response["usage"],
            "page_number": page_number,
        }
        return OCRResult.from_dict(data=result)

    def _parallel_ocr(
        self,
        image_bytes_list: List[bytes],
        prompt_type: str,
        max_workers: int = 10,
    ) -> List[OCRResult]:
        """Run OCR for several page images concurrently.

        Args:
            image_bytes_list: PNG-encoded images; an image's position in the
                list is used as its page number.
            prompt_type: A ``PromptType`` member or its file-name value.
            max_workers: Maximum number of worker threads.

        Returns:
            The results ordered by page number.

        Raises:
            Exception: If the prompt cannot be loaded, or if any individual
                OCR call fails.
        """
        prompt = self._get_prompt(prompt_type=prompt_type)
        if prompt is None:
            raise Exception(f"Could not load the {prompt_type} prompt")

        results = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            logger.info("Some log")
            execution_futures = [
                executor.submit(
                    self._ocr,
                    prompt=prompt,
                    image_bytes=img,
                    page_number=page_index,
                )
                for page_index, img in enumerate(image_bytes_list)
            ]
            for future in concurrent.futures.as_completed(execution_futures):
                result = future.result()
                if result:
                    results.append(result)
        # as_completed yields in completion order; restore page order.
        results.sort(key=lambda item: item.page_number)
        return results

    def get_ocr_result(
        self,
        pdf_bytes: io.BytesIO,
        prompt_type: str,
        zoom: int = 7,
    ) -> List[OCRResult]:
        """Render every page of a PDF to PNG and OCR the pages one by one.

        Args:
            pdf_bytes: The PDF content, passed to pymupdf as a stream.
            prompt_type: A ``PromptType`` member or its file-name value.
            zoom: Scale factor applied on both axes when rendering a page.

        Returns:
            One result per successfully rendered page, in document order,
            each carrying the zero-based index of its page. An empty list is
            returned when the PDF cannot be opened.

        Raises:
            Exception: If the prompt type is invalid, the prompt cannot be
                loaded, or an OCR request fails.
        """
        logger.info("Getting OCR Results")
        try:
            document = pymupdf.open(stream=pdf_bytes, filetype="pdf")
        except Exception as e:
            logger.exception(f"Failed to open pdf due to {e}")
            return []

        # (page index, PNG bytes) pairs; the index is kept alongside the
        # image so skipped pages do not shift the numbering of later ones.
        rendered_pages = []
        try:
            for i, p in enumerate(document):
                try:
                    image_bytes: bytes = p.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom)).tobytes("png")
                except Exception:
                    logger.error(f"Could not get pix map for page {i}")
                    continue
                rendered_pages.append((i, image_bytes))
        finally:
            # All images are already plain bytes, so the document can go.
            document.close()

        prompt = self._get_prompt(prompt_type=prompt_type)
        if prompt is None:
            raise Exception(f"Could not load the {prompt_type} prompt")
        logger.info("Got Prompt")

        results = []
        for page_index, image_bytes in rendered_pages:
            logger.info(f"Starting OCR for page: {page_index}")
            results.append(
                self._ocr(image_bytes=image_bytes, prompt=prompt, page_number=page_index)
            )
        return results
|
|
199
|
+
|
|
File without changes
|
|
@@ -3,6 +3,7 @@ setup.cfg
|
|
|
3
3
|
setup.py
|
|
4
4
|
my_aws_helpers/api.py
|
|
5
5
|
my_aws_helpers/auth.py
|
|
6
|
+
my_aws_helpers/bedrock.py
|
|
6
7
|
my_aws_helpers/cognito.py
|
|
7
8
|
my_aws_helpers/dynamo.py
|
|
8
9
|
my_aws_helpers/errors.py
|
|
@@ -16,4 +17,5 @@ my_aws_helpers.egg-info/dependency_links.txt
|
|
|
16
17
|
my_aws_helpers.egg-info/requires.txt
|
|
17
18
|
my_aws_helpers.egg-info/top_level.txt
|
|
18
19
|
my_aws_helpers.egg-info/zip-safe
|
|
20
|
+
my_aws_helpers/prompts/__init__.py
|
|
19
21
|
tests/test_event.py
|
|
@@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup
|
|
|
3
3
|
|
|
4
4
|
base_path = os.path.abspath(os.path.dirname(__file__))
|
|
5
5
|
|
|
6
|
-
version = "2.6.0"
|
|
6
|
+
version = "2.6.2"
|
|
7
7
|
|
|
8
8
|
setup(
|
|
9
9
|
name="my_aws_helpers",
|
|
@@ -23,5 +23,6 @@ setup(
|
|
|
23
23
|
zip_safe = True,
|
|
24
24
|
install_requires = [
|
|
25
25
|
"boto3==1.34.36"
|
|
26
|
-
]
|
|
26
|
+
],
|
|
27
|
+
include_package_data=True,
|
|
27
28
|
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|