my-aws-helpers 4.3.0__tar.gz → 6.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/PKG-INFO +1 -1
  2. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/bedrock.py +125 -2
  3. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/cognito.py +17 -0
  4. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/dynamo.py +1 -1
  5. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/s3.py +135 -4
  6. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers.egg-info/PKG-INFO +1 -1
  7. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers.egg-info/requires.txt +3 -3
  8. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/setup.py +2 -2
  9. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/MANIFEST.in +0 -0
  10. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/README.md +0 -0
  11. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/api.py +0 -0
  12. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/auth.py +0 -0
  13. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/errors.py +0 -0
  14. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/event.py +0 -0
  15. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/logging.py +0 -0
  16. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/prompts/__init__.py +0 -0
  17. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/prompts/markdown_system_prompt.txt +0 -0
  18. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/prompts/transactions_headers_prompt.txt +0 -0
  19. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/prompts/transactions_headers_prompt_v2.txt +0 -0
  20. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/prompts/transactions_prompt.txt +0 -0
  21. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers/sfn.py +0 -0
  22. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers.egg-info/SOURCES.txt +0 -0
  23. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers.egg-info/dependency_links.txt +0 -0
  24. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers.egg-info/top_level.txt +0 -0
  25. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/my_aws_helpers.egg-info/zip-safe +0 -0
  26. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/setup.cfg +0 -0
  27. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/tests/test_cognito.py +0 -0
  28. {my_aws_helpers-4.3.0 → my_aws_helpers-6.0.5}/tests/test_event.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: my_aws_helpers
3
- Version: 4.3.0
3
+ Version: 6.0.5
4
4
  Summary: AWS Helpers
5
5
  Home-page: https://github.com/JarrodMccarthy/aws_helpers.git
6
6
  Author: Jarrod McCarthy
@@ -5,11 +5,26 @@ import json
5
5
  import time
6
6
  import os
7
7
  import io
8
+ import re
9
+ from copy import copy
8
10
  from typing import Optional, List, Dict
9
11
  from enum import Enum
10
12
  import pymupdf
11
13
  import concurrent.futures
12
14
  from dataclasses import dataclass
15
+ from my_aws_helpers.s3 import S3Serialiser, BaseS3Object, BaseS3Queries, S3, S3Location
16
+
17
+ from my_aws_helpers.logging import select_powertools_logger
18
+
19
+
20
+ logger = select_powertools_logger("aws-helpers-bedrock")
21
+
22
+
23
+ class ImageType(str, Enum):
24
+ gif = "gif"
25
+ jpeg = "jpeg"
26
+ png = "png"
27
+ webp = "webp"
13
28
 
14
29
 
15
30
  class PromptType(str, Enum):
@@ -35,7 +50,7 @@ class TokenUsage:
35
50
 
36
51
 
37
52
  @dataclass
38
- class OCRResult:
53
+ class OCRResult(BaseS3Object):
39
54
  content: List[Dict[str, str]]
40
55
  token_usage: TokenUsage
41
56
  page_number: int
@@ -48,11 +63,69 @@ class OCRResult:
48
63
  page_number=data.get("page_number", 0),
49
64
  )
50
65
 
66
+ @classmethod
67
+ def from_s3_representation(cls, obj: dict) -> OCRResult:
68
+ obj["token_usage"] = (TokenUsage.from_dict(obj.get("token_usage", {})),)
69
+ return cls(**obj)
70
+
71
+ def to_s3_representation(self) -> dict:
72
+ obj = copy(vars(self))
73
+ obj["token_usage"] = S3Serialiser.object_serialiser(
74
+ obj=vars(obj["token_usage"])
75
+ )
76
+ return S3Serialiser.object_serialiser(obj=obj)
77
+
78
+ def get_save_location(self, bucket_name: str) -> S3Location:
79
+ pass
80
+
81
+
82
+ class OCRResultQueries(BaseS3Queries):
83
+ def __init__(self, s3_client: S3, bucket_name: str):
84
+ super().__init__(s3_client=s3_client, bucket_name=bucket_name)
85
+
86
+ def save_ocr_result_to_s3(
87
+ self, ocr_result: OCRResult, save_location: S3Location
88
+ ) -> Optional[S3Location]:
89
+ try:
90
+ obj = ocr_result.to_s3_representation()
91
+ return self.s3_client.save_dict_to_s3(
92
+ content=obj,
93
+ s3_location=save_location,
94
+ )
95
+ except Exception as e:
96
+ logger.exception(f"Failed to save ocr result to s3 due to {e}")
97
+ return None
98
+
99
+ def _concurrent_s3_read(
100
+ self, locations: List[S3Location], max_workers: int = 10
101
+ ) -> List[OCRResult]:
102
+ results: List[OCRResult] = list()
103
+ futures = list()
104
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
105
+ for loc in locations:
106
+ future = executor.submit(
107
+ self.s3_client.read_dict_from_s3,
108
+ s3_location=loc,
109
+ )
110
+ futures.append(future)
111
+ for f in futures:
112
+ results.append(f.result())
113
+ results = [r for r in results if r is not None]
114
+ return results
115
+
116
+ def get_ocr_results_by_key_prefix(self, prefix: str) -> List[OCRResult]:
117
+ locations = self.s3_client.list_objects_by_prefix(
118
+ bucket_name=self.bucket_name, prefix=prefix
119
+ )
120
+ objects = self._concurrent_s3_read(locations=locations)
121
+ ocr_results = [OCRResult.from_s3_representation(obj=obj) for obj in objects]
122
+ return ocr_results
123
+
51
124
 
52
125
  class Bedrock:
53
126
  def __init__(
54
127
  self,
55
- model_id: str = "apac.anthropic.claude-3-5-sonnet-20241022-v2:0",
128
+ model_id: str = "apac.anthropic.claude-3-5-sonnet-20241022-v2:0", # anthropic.claude-sonnet-4-20250514-v1:0
56
129
  logger=None,
57
130
  sleep_time: float = 1.0,
58
131
  ):
@@ -92,6 +165,19 @@ class Bedrock:
92
165
  print(e)
93
166
  return None
94
167
 
168
+ @staticmethod
169
+ def extract_json_from_markdown(text: str):
170
+ """
171
+ Extracts the JSON object from a string that may be wrapped in ```json ... ``` code block
172
+ """
173
+ # Match a {...} block anywhere in the text
174
+ match = re.search(r"\{.*\}", text, re.DOTALL)
175
+ if match:
176
+ json_str = match.group(0)
177
+ return json.loads(json_str)
178
+ else:
179
+ raise ValueError("No JSON object found in the text")
180
+
95
181
  def _get_prompt(self, prompt_type: str) -> Optional[str]:
96
182
  if prompt_type not in list(PromptType):
97
183
  raise Exception(f"Error: Invalid prompt type")
@@ -237,3 +323,40 @@ class Bedrock:
237
323
  except Exception as e:
238
324
  self.logger.exception(e)
239
325
  return []
326
+
327
+ def _get_image_block(self, image: bytes, image_content_type: ImageType) -> dict:
328
+ return {
329
+ "image": {
330
+ "format": image_content_type,
331
+ "source": {
332
+ "bytes": image,
333
+ },
334
+ }
335
+ }
336
+
337
+ def image_analysis(
338
+ self, images: List[bytes], prompt: str, image_content_type: ImageType
339
+ ) -> OCRResult:
340
+
341
+ system_prompt = [{"text": prompt}]
342
+ message = [
343
+ {
344
+ "role": "user",
345
+ "content": [
346
+ self._get_image_block(
347
+ image=image, image_content_type=image_content_type
348
+ )
349
+ for image in images
350
+ ],
351
+ }
352
+ ]
353
+ response = self.client.converse(
354
+ modelId=self.model_id, messages=message, system=system_prompt
355
+ )
356
+
357
+ result = {}
358
+ result["content"] = Bedrock.extract_json_from_markdown(
359
+ text=response["output"]["message"]["content"][0]["text"]
360
+ )
361
+ result["token_usage"] = response["usage"]
362
+ return OCRResult.from_dict(data=result)
@@ -69,6 +69,23 @@ class Cognito:
69
69
  logger.exception(f"Failed to sign up due to {e}")
70
70
  return None
71
71
 
72
+ def confirm_sign_up(
73
+ self,
74
+ username: str,
75
+ client_id: str,
76
+ confirmation_code: str,
77
+ ) -> dict:
78
+ try:
79
+ response = self.client.confirm_sign_up(
80
+ ClientId=client_id, Username=username, ConfirmationCode=confirmation_code,
81
+ )
82
+ return response
83
+ except Exception as e:
84
+ logger.exception(
85
+ f"Failed to confirm sign up username {username} due to {e}"
86
+ )
87
+ return None
88
+
72
89
  def admin_confirm_sign_up(
73
90
  self,
74
91
  username: str,
@@ -113,7 +113,7 @@ class Dynamo:
113
113
  return self.table.get_item(Item=item)
114
114
 
115
115
  def delete_item(self, item: dict):
116
- return self.table.delete_item(Item=item)
116
+ return self.table.delete_item(Key=item)
117
117
 
118
118
  def batch_put(self, items: List[dict]) -> None:
119
119
  with self.table.batch_writer() as batch:
@@ -1,7 +1,10 @@
1
+ from __future__ import annotations
1
2
  import boto3
2
3
  import io
3
4
  import json
4
- from typing import Optional, Any, Dict
5
+ from concurrent.futures import Future, ThreadPoolExecutor
6
+ from abc import ABC, abstractmethod
7
+ from typing import Optional, Any, Dict, List
5
8
  from datetime import datetime, date
6
9
  from copy import copy
7
10
  import os
@@ -19,6 +22,26 @@ class ContentType(str, Enum):
19
22
  json_content = "application/json"
20
23
  pdf_content = "application/pdf"
21
24
  jpeg_content = "image/jpeg"
25
+ png_content = "image/png"
26
+
27
+
28
+ @staticmethod
29
+ def get_content_type_from_file_name(file_name: str) -> Optional[ContentType]:
30
+ file_extension = file_name.split('.')[-1]
31
+ if file_extension == "xml":
32
+ return ContentType.xml_content
33
+ if file_extension == "txt":
34
+ return ContentType.plain_text
35
+ if file_extension == "json":
36
+ return ContentType.json_content
37
+ if file_extension == "pdf":
38
+ return ContentType.pdf_content
39
+ if file_extension == "jpg":
40
+ return ContentType.jpeg_content
41
+ if file_extension == "png":
42
+ return ContentType.png_content
43
+ return None
44
+
22
45
 
23
46
 
24
47
  class ContentEncoding(str, Enum):
@@ -31,16 +54,19 @@ class S3Serialiser:
31
54
  def _serialise(obj: Any):
32
55
  if isinstance(obj, datetime) or isinstance(obj, date):
33
56
  return obj.isoformat()
57
+ if isinstance(obj, S3Location):
58
+ return obj.location
34
59
  return obj
35
60
 
36
61
  @staticmethod
37
- def object_serialiser(obj: Any):
62
+ def object_serialiser(obj: Dict):
38
63
  if isinstance(obj, list):
39
64
  return [S3Serialiser.object_serialiser(obj=obj) for obj in obj]
40
65
  if isinstance(obj, dict):
41
66
  return {k: S3Serialiser.object_serialiser(v) for k, v in obj.items()}
42
67
  return S3Serialiser._serialise(obj=obj)
43
68
 
69
+
44
70
  class S3Location:
45
71
  bucket: str
46
72
  file_name: str
@@ -94,10 +120,63 @@ class S3:
94
120
  Body=json.dumps(object), Bucket=bucket_name, Key=file_name
95
121
  )
96
122
 
123
+ def list_objects_by_prefix(self, bucket_name: str, prefix: str) -> List[S3Location]:
124
+ """
125
+ list objects by prefix gets 1000 items at a time, if theres more, I want em
126
+ """
127
+ objects = list()
128
+ try:
129
+ continuation_token = None
130
+ while True:
131
+ if continuation_token:
132
+ response = self.client.list_objects_v2(
133
+ Bucket=bucket_name,
134
+ Prefix=prefix,
135
+ ContinuationToken=continuation_token,
136
+ )
137
+ else:
138
+ response = self.client.list_objects_v2(
139
+ Bucket=bucket_name, Prefix=prefix
140
+ )
141
+
142
+ # Append current batch
143
+ objects.extend(response.get("Contents", []))
144
+
145
+ # Check if more results exist
146
+ if response.get("IsTruncated"): # True if more pages available
147
+ continuation_token = response["NextContinuationToken"]
148
+ else:
149
+ break
150
+ locations = [
151
+ S3Location(bucket=bucket_name, file_name=c["Key"]) for c in objects
152
+ ]
153
+ return locations
154
+ except Exception as e:
155
+ logger.exception(
156
+ f"Failed to get objects from s3: {bucket_name}/{prefix} due to {e}"
157
+ )
158
+ return []
159
+
97
160
  def get_object(self, bucket_name: str, file_name: str):
98
161
  response = self.client.get_object(Bucket=bucket_name, Key=file_name)
99
162
  return self._streaming_body_to_dict(response["Body"])
100
163
 
164
+ def put_presigned_url(
165
+ self,
166
+ s3_location: S3Location,
167
+ expires_in: int = 3600,
168
+ content_type: str = ContentType.pdf_content.value
169
+ ) -> str:
170
+ return self.client.generate_presigned_url(
171
+ "put_object",
172
+ Params={
173
+ "Bucket": s3_location.bucket,
174
+ "Key": s3_location.file_name,
175
+ "ContentType": content_type,
176
+ },
177
+ ExpiresIn=expires_in,
178
+ )
179
+
101
180
  def get_presigned_url(
102
181
  self,
103
182
  bucket_name: str,
@@ -247,7 +326,59 @@ class S3:
247
326
  )
248
327
  except Exception as e:
249
328
  logger.exception(f"Failed to save jpeg to s3 due to {e}")
250
- return None
329
+ return None
251
330
 
252
331
  def read_dict_from_s3(self, s3_location: S3Location) -> dict:
253
- return json.loads(self.read_binary_from_s3(s3_location=s3_location).decode("utf-8"))
332
+ return json.loads(
333
+ self.read_binary_from_s3(s3_location=s3_location).decode("utf-8")
334
+ )
335
+
336
+
337
+ class BaseS3Object(ABC):
338
+ def to_s3_representation(self) -> dict:
339
+ obj = copy(vars(self))
340
+ return S3Serialiser.object_serialiser(obj=obj)
341
+
342
+ @classmethod
343
+ def from_s3_representation(cls, obj: dict) -> BaseS3Object:
344
+ return cls(**obj)
345
+
346
+ @abstractmethod
347
+ def get_save_location(self, bucket_name: str) -> S3Location:
348
+ pass
349
+
350
+
351
+ class BaseS3Queries:
352
+ s3_client: S3
353
+ bucket_name: str
354
+
355
+ def __init__(self, s3_client: S3, bucket_name: str):
356
+ self.s3_client = s3_client
357
+ self.bucket_name = bucket_name
358
+
359
+ def _concurrent_s3_dict_read(
360
+ self, locations: List[S3Location], max_workers: int = 10
361
+ ) -> List[BaseS3Object]:
362
+ results: List[BaseS3Object] = list()
363
+ futures: List[Future] = list()
364
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
365
+ for location in locations:
366
+ future = executor.submit(
367
+ self.s3_client.read_dict_from_s3,
368
+ s3_location=location,
369
+ )
370
+ futures.append(future)
371
+ for f in futures:
372
+ results.append(f.result())
373
+ results = [r for r in results if r is not None]
374
+ return results
375
+
376
+ def save_s3_object_to_s3(self, object: BaseS3Object) -> Optional[S3Location]:
377
+ try:
378
+ obj = object.to_s3_representation()
379
+ return self.s3_client.save_dict_to_s3(
380
+ content=obj, s3_location=object.get_save_location()
381
+ )
382
+ except Exception as e:
383
+ logger.exception(f"Failed to save s3 object to s3 due to {e}")
384
+ return None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: my-aws-helpers
3
- Version: 4.3.0
3
+ Version: 6.0.5
4
4
  Summary: AWS Helpers
5
5
  Home-page: https://github.com/JarrodMccarthy/aws_helpers.git
6
6
  Author: Jarrod McCarthy
@@ -1,15 +1,15 @@
1
- boto3==1.34.36
1
+ boto3
2
2
  python-jose==3.5.0
3
3
 
4
4
  [all]
5
5
  PyMuPDF==1.26.0
6
6
  black<26.0.0,<=25.1.0
7
- boto3==1.34.36
7
+ boto3
8
8
  coverage==7.3.2
9
9
  pytest==7.4.3
10
10
  python-jose==3.5.0
11
11
 
12
12
  [bedrock]
13
13
  PyMuPDF==1.26.0
14
- boto3==1.34.36
14
+ boto3
15
15
  python-jose==3.5.0
@@ -3,10 +3,10 @@ from setuptools import find_namespace_packages, setup
3
3
 
4
4
  base_path = os.path.abspath(os.path.dirname(__file__))
5
5
 
6
- version = "4.3.0"
6
+ version = "6.0.5"
7
7
 
8
8
  core = [
9
- "boto3==1.34.36",
9
+ "boto3",
10
10
  "python-jose==3.5.0",
11
11
  ]
12
12
  dev = [
File without changes
File without changes