my-aws-helpers 2.6.2__tar.gz → 2.6.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of my-aws-helpers might be problematic. Click here for more details.
- my_aws_helpers-2.6.4/MANIFEST.in +1 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/PKG-INFO +1 -1
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers/bedrock.py +26 -31
- my_aws_helpers-2.6.4/my_aws_helpers/prompts/json_system_prompt.txt +36 -0
- my_aws_helpers-2.6.4/my_aws_helpers/prompts/markdown_system_prompt.txt +35 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers.egg-info/PKG-INFO +1 -1
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers.egg-info/SOURCES.txt +3 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/setup.py +1 -1
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/README.md +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers/api.py +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers/auth.py +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers/cognito.py +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers/dynamo.py +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers/errors.py +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers/event.py +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers/logging.py +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers/prompts/__init__.py +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers/s3.py +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers/sfn.py +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers.egg-info/dependency_links.txt +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers.egg-info/requires.txt +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers.egg-info/top_level.txt +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/my_aws_helpers.egg-info/zip-safe +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/setup.cfg +0 -0
- {my_aws_helpers-2.6.2 → my_aws_helpers-2.6.4}/tests/test_event.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
recursive-include my_aws_helpers *.txt
|
|
@@ -10,12 +10,6 @@ from enum import Enum
|
|
|
10
10
|
import pymupdf
|
|
11
11
|
import concurrent.futures
|
|
12
12
|
from dataclasses import dataclass
|
|
13
|
-
from my_aws_helpers.logging import select_powertools_logger
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
logger = select_powertools_logger('bedrock-boy')
|
|
17
|
-
|
|
18
|
-
logger.info("Got logger")
|
|
19
13
|
|
|
20
14
|
class PromptType(str, Enum):
|
|
21
15
|
json = "json_system_prompt.txt"
|
|
@@ -56,9 +50,11 @@ class Bedrock:
|
|
|
56
50
|
def __init__(
|
|
57
51
|
self,
|
|
58
52
|
model_id: str = "apac.anthropic.claude-3-5-sonnet-20241022-v2:0",
|
|
53
|
+
logger = None,
|
|
59
54
|
):
|
|
60
55
|
|
|
61
56
|
self.session = Bedrock._set_session_params()
|
|
57
|
+
self.logger = logger
|
|
62
58
|
region_name = "ap-southeast-2"
|
|
63
59
|
if self.session is None:
|
|
64
60
|
self.session = boto3.Session(region_name = region_name)
|
|
@@ -87,7 +83,7 @@ class Bedrock:
|
|
|
87
83
|
region_name=region_name
|
|
88
84
|
)
|
|
89
85
|
except Exception as e:
|
|
90
|
-
|
|
86
|
+
print(e)
|
|
91
87
|
return None
|
|
92
88
|
|
|
93
89
|
def _get_prompt(self, prompt_type: str) -> Optional[str]:
|
|
@@ -100,7 +96,7 @@ class Bedrock:
|
|
|
100
96
|
prompt = f.read()
|
|
101
97
|
return prompt
|
|
102
98
|
except Exception as e:
|
|
103
|
-
logger.exception(f"Failed to get {prompt_type} prompt due to {e}")
|
|
99
|
+
self.logger.exception(f"Failed to get {prompt_type} prompt due to {e}")
|
|
104
100
|
return None
|
|
105
101
|
|
|
106
102
|
def _ocr(
|
|
@@ -127,13 +123,13 @@ class Bedrock:
|
|
|
127
123
|
]
|
|
128
124
|
retries = 3
|
|
129
125
|
for i in range(retries):
|
|
130
|
-
logger.info(f"Attempt number {i} for {self.model_id} converse")
|
|
126
|
+
self.logger.info(f"Attempt number {i} for {self.model_id} converse")
|
|
131
127
|
try:
|
|
132
128
|
response = self.client.converse(modelId = self.model_id, messages = message, system = system_prompt)
|
|
133
129
|
if response['ResponseMetadata']['HTTPStatusCode'] == 200:
|
|
134
130
|
break
|
|
135
131
|
except Exception as e:
|
|
136
|
-
logger.exception(f"Error during conversation due to {e}")
|
|
132
|
+
self.logger.exception(f"Error during conversation due to {e}")
|
|
137
133
|
if i >= len(retries) - 1: raise Exception(e)
|
|
138
134
|
continue
|
|
139
135
|
|
|
@@ -152,7 +148,7 @@ class Bedrock:
|
|
|
152
148
|
results = list()
|
|
153
149
|
prompt = self._get_prompt(prompt_type=prompt_type)
|
|
154
150
|
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
155
|
-
logger.info("Some log")
|
|
151
|
+
self.logger.info("Some log")
|
|
156
152
|
execution_futures = {
|
|
157
153
|
executor.submit(
|
|
158
154
|
self._ocr,
|
|
@@ -172,28 +168,27 @@ class Bedrock:
|
|
|
172
168
|
prompt_type: str,
|
|
173
169
|
zoom: int = 7,
|
|
174
170
|
) -> List[OCRResult]:
|
|
175
|
-
logger.info("Getting OCR Results")
|
|
176
171
|
try:
|
|
172
|
+
self.logger.info("Getting OCR Results")
|
|
177
173
|
document = pymupdf.open(stream=pdf_bytes, filetype="pdf")
|
|
174
|
+
pages: List[pymupdf.Page] = [p for p in document]
|
|
175
|
+
|
|
176
|
+
image_bytes_list: List[bytes] = list()
|
|
177
|
+
for i, p in enumerate(pages):
|
|
178
|
+
try:
|
|
179
|
+
image_bytes: bytes = p.get_pixmap(matrix = pymupdf.Matrix(zoom, zoom)).tobytes("png")
|
|
180
|
+
image_bytes_list.append(image_bytes)
|
|
181
|
+
except Exception as e:
|
|
182
|
+
self.logger.error(f"Could not get pix map for page {i}")
|
|
183
|
+
continue
|
|
184
|
+
prompt = self._get_prompt(prompt_type=prompt_type)
|
|
185
|
+
self.logger.info("Got Prompt")
|
|
186
|
+
results = list()
|
|
187
|
+
for i, image_bytes in enumerate(image_bytes_list):
|
|
188
|
+
self.logger.info(f"Starting OCR for page: {i}")
|
|
189
|
+
results.append(self._ocr(image_bytes=image_bytes, prompt=prompt))
|
|
190
|
+
return results
|
|
178
191
|
except Exception as e:
|
|
179
|
-
logger.exception(
|
|
192
|
+
self.logger.exception(e)
|
|
180
193
|
return []
|
|
181
|
-
|
|
182
|
-
pages: List[pymupdf.Page] = [p for p in document]
|
|
183
|
-
|
|
184
|
-
image_bytes_list: List[bytes] = list()
|
|
185
|
-
for i, p in enumerate(pages):
|
|
186
|
-
try:
|
|
187
|
-
image_bytes: bytes = p.get_pixmap(matrix = pymupdf.Matrix(zoom, zoom)).tobytes("png")
|
|
188
|
-
image_bytes_list.append(image_bytes)
|
|
189
|
-
except Exception as e:
|
|
190
|
-
logger.error(f"Could not get pix map for page {i}")
|
|
191
|
-
continue
|
|
192
|
-
prompt = self._get_prompt(prompt_type=prompt_type)
|
|
193
|
-
logger.info("Got Prompt")
|
|
194
|
-
results = list()
|
|
195
|
-
for i, image_bytes in enumerate(image_bytes_list):
|
|
196
|
-
logger.info(f"Starting OCR for page: {i}")
|
|
197
|
-
results.append(self._ocr(image_bytes=image_bytes, prompt=prompt))
|
|
198
|
-
return results
|
|
199
194
|
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
You are an intelligent document and image interpreter.
|
|
2
|
+
|
|
3
|
+
Your task is to analyze the provided image and extract all meaningful data as structured **JSON**.
|
|
4
|
+
|
|
5
|
+
## Output Requirements
|
|
6
|
+
|
|
7
|
+
- Return **only valid JSON**.
|
|
8
|
+
- If the image contains one or more **tables**, represent each row as a JSON object.
|
|
9
|
+
- Use **the table headers as keys**.
|
|
10
|
+
- if there are multiple sections, only include the row data in transactions
|
|
11
|
+
- Every row should use the same, consistent headers
|
|
12
|
+
- If content is unclear, use: `"[Unclear]"`.
|
|
13
|
+
- **Do not fabricate** values not visible in the image.
|
|
14
|
+
- If a description contains 2 lines of text, only include the most important text, and the text should reside in 1 key in the JSON response; do not split it
|
|
15
|
+
- Do not return anything except the json content
|
|
16
|
+
|
|
17
|
+
## Example
|
|
18
|
+
|
|
19
|
+
If the image contains this table:
|
|
20
|
+
|
|
21
|
+
| Date | | Price |
|
|
22
|
+
| ------ |----------|-------|
|
|
23
|
+
| June 6 | desc 1 | $2.00 |
|
|
24
|
+
| June 5 | misc 2 | $1.70 |
|
|
25
|
+
| | item x | $1.50 |
|
|
26
|
+
|
|
27
|
+
Return:
|
|
28
|
+
|
|
29
|
+
```json
|
|
30
|
+
{
|
|
31
|
+
"transactions": [
|
|
32
|
+
{ "date": "June 6", "description": "desc 1", "price": "$2.00" },
|
|
33
|
+
{ "date": "June 5", "description": "misc 2", "price": "$1.70" },
|
|
34
|
+
{ "date": "June 5", "description": "item x", "price": "$1.50" }
|
|
35
|
+
]
|
|
36
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
You are an intelligent document and visual layout interpreter.
|
|
2
|
+
|
|
3
|
+
Your task is to:
|
|
4
|
+
1. Analyze the provided image, which is a bank statement with a list of transactions.
|
|
5
|
+
2. Convert the contents into **well-formatted Markdown**, preserving:
|
|
6
|
+
- Headings
|
|
7
|
+
- Lists
|
|
8
|
+
- Tables
|
|
9
|
+
- Emphasis (bold, italic)
|
|
10
|
+
- Line breaks and whitespace if needed
|
|
11
|
+
|
|
12
|
+
## Output Format
|
|
13
|
+
|
|
14
|
+
Please return only valid Markdown, and structure it clearly.
|
|
15
|
+
|
|
16
|
+
Use this format:
|
|
17
|
+
|
|
18
|
+
```markdown
|
|
19
|
+
# [Main Title of Document or Topic]
|
|
20
|
+
|
|
21
|
+
## Section 1 Title
|
|
22
|
+
|
|
23
|
+
- Bullet point 1
|
|
24
|
+
- Bullet point 2
|
|
25
|
+
|
|
26
|
+
### Table
|
|
27
|
+
|
|
28
|
+
| Column A | Column B |
|
|
29
|
+
|----------|----------|
|
|
30
|
+
| Row 1A | Row 1B |
|
|
31
|
+
| Row 2A | Row 2B |
|
|
32
|
+
|
|
33
|
+
3. The table data should be copied exactly as it appears, with no exceptions
|
|
34
|
+
|
|
35
|
+
4. Do not return anything except the markdown content
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
MANIFEST.in
|
|
1
2
|
README.md
|
|
2
3
|
setup.cfg
|
|
3
4
|
setup.py
|
|
@@ -18,4 +19,6 @@ my_aws_helpers.egg-info/requires.txt
|
|
|
18
19
|
my_aws_helpers.egg-info/top_level.txt
|
|
19
20
|
my_aws_helpers.egg-info/zip-safe
|
|
20
21
|
my_aws_helpers/prompts/__init__.py
|
|
22
|
+
my_aws_helpers/prompts/json_system_prompt.txt
|
|
23
|
+
my_aws_helpers/prompts/markdown_system_prompt.txt
|
|
21
24
|
tests/test_event.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|