awschain 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. awschain/__init__.py +4 -0
  2. awschain/example.py +240 -0
  3. awschain/handlers/__init__.py +5 -0
  4. awschain/handlers/abstract_handler.py +26 -0
  5. awschain/handlers/base_handler.py +17 -0
  6. awschain/handlers/handler_factory.py +57 -0
  7. awschain/handlers/misc/__init__.py +0 -0
  8. awschain/handlers/misc/clipboard_writer_handler.py +8 -0
  9. awschain/handlers/misc/print_context_handler.py +14 -0
  10. awschain/handlers/misc/remote_file_downloader_handler.py +43 -0
  11. awschain/handlers/processors/__init__.py +0 -0
  12. awschain/handlers/processors/amazon_bedrock_chat_handler.py +32 -0
  13. awschain/handlers/processors/amazon_bedrock_handler.py +62 -0
  14. awschain/handlers/processors/amazon_comprehend_insights_handler.py +95 -0
  15. awschain/handlers/processors/amazon_comprehend_pii_classifier_handler.py +42 -0
  16. awschain/handlers/processors/amazon_comprehend_pii_handler.py +48 -0
  17. awschain/handlers/processors/amazon_comprehend_pii_tokenize_handler.py +119 -0
  18. awschain/handlers/processors/amazon_comprehend_pii_untokenize_handler.py +46 -0
  19. awschain/handlers/processors/amazon_rekognition_handler.py +66 -0
  20. awschain/handlers/processors/amazon_textract_handler.py +138 -0
  21. awschain/handlers/processors/amazon_transcribe_handler.py +91 -0
  22. awschain/handlers/processors/anonymize_handler.py +30 -0
  23. awschain/handlers/processors/html_cleaner_handler.py +11 -0
  24. awschain/handlers/processors/prompt_handler.py +27 -0
  25. awschain/handlers/readers/__init__.py +0 -0
  26. awschain/handlers/readers/amazon_s3_reader_handler.py +35 -0
  27. awschain/handlers/readers/aws_secrets_manager_secret_reader.py +50 -0
  28. awschain/handlers/readers/email_reader_handler.py +83 -0
  29. awschain/handlers/readers/http_handler.py +140 -0
  30. awschain/handlers/readers/local_file_reader_handler.py +22 -0
  31. awschain/handlers/readers/microsoft_excel_reader_handler.py +84 -0
  32. awschain/handlers/readers/microsoft_power_point_reader_handler.py +95 -0
  33. awschain/handlers/readers/microsoft_word_handler.py +112 -0
  34. awschain/handlers/readers/pdf_reader_handler.py +81 -0
  35. awschain/handlers/readers/quip_reader_handler.py +148 -0
  36. awschain/handlers/readers/web_crawler_reader_handler.py +52 -0
  37. awschain/handlers/readers/youtube_reader_handler.py +39 -0
  38. awschain/handlers/writers/__init__.py +0 -0
  39. awschain/handlers/writers/amazon_datazone_asset_writer_handler.py +87 -0
  40. awschain/handlers/writers/amazon_datazone_glossary_writer_handler.py +92 -0
  41. awschain/handlers/writers/amazon_s3_writer_handler.py +118 -0
  42. awschain/handlers/writers/email_sender_handler.py +56 -0
  43. awschain/handlers/writers/local_file_writer_handler.py +51 -0
  44. awschain/handlers/writers/quip_writer_handler.py +85 -0
  45. awschain/utils/__init__.py +2 -0
  46. awschain/utils/aws_boto_client_manager.py +16 -0
  47. awschain/utils/bedrock.py +70 -0
  48. awschain/utils/config.py +1 -0
  49. awschain/utils/config_loader.py +43 -0
  50. awschain/utils/web_utils.py +22 -0
  51. awschain-0.1.0.dist-info/LICENSE +21 -0
  52. awschain-0.1.0.dist-info/METADATA +128 -0
  53. awschain-0.1.0.dist-info/RECORD +55 -0
  54. awschain-0.1.0.dist-info/WHEEL +5 -0
  55. awschain-0.1.0.dist-info/top_level.txt +1 -0
awschain/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .handlers.handler_factory import HandlerFactory
2
+ from .utils.config_loader import ConfigLoader
3
+
4
+ __all__ = ['HandlerFactory', 'ConfigLoader']
awschain/example.py ADDED
@@ -0,0 +1,240 @@
1
+ #!/opt/anaconda3/bin/python
2
+ from concurrent.futures import ThreadPoolExecutor, as_completed
3
+ from datetime import datetime
4
+ import json
5
+ import os
6
+ import sys
7
+ from typing import Any
8
+ from dotenv import load_dotenv
9
+ from handlers.handler_factory import HandlerFactory
10
+ from .utils.config_loader import ConfigLoader
11
+ import argparse
12
+
13
+ # Load configuration
14
+ ConfigLoader.load_config('config.yaml')
15
+
16
+ # Load environment variables from .env file
17
+ load_dotenv()
18
+
19
def determine_input_type(file_path):
    """Classify *file_path* into the input type used to select a handler chain.

    Scheme/host based sources (YouTube, http(s), s3, quip) are checked first,
    then the file extension. Extension matching is case-insensitive so that
    ``REPORT.PDF`` is treated like ``report.pdf``.

    Args:
        file_path: Local path or URL given by the user.

    Returns:
        One of ``"youtube_url"``, ``"http"``, ``"s3"``, ``"quip"``,
        ``"multimedia_file"``, ``"pdf"``, ``"microsoft_word"``,
        ``"microsoft_excel"``, ``"microsoft_pp"``, ``"image_file"`` or
        ``"text_or_json"`` (the fallback for anything unrecognised).
    """
    if "youtube" in file_path or "youtu.be" in file_path:
        return "youtube_url"
    # Require the full scheme separator so local files whose name merely
    # starts with "http" are not mistaken for URLs.
    if file_path.startswith(('http://', 'https://')):
        return "http"
    if file_path.startswith('s3://'):
        return "s3"
    if file_path.startswith('quip://'):
        return "quip"

    # Ordered (suffixes, input type) pairs; first match wins.
    extension_types = (
        (('.mp3', '.mp4', '.m4a', '.wav', '.flac', '.mov', '.avi'), "multimedia_file"),
        (('.pdf',), "pdf"),
        (('.docx',), "microsoft_word"),
        (('.xlsx', '.xlsm', '.xltx', '.xltm'), "microsoft_excel"),
        (('.pptx',), "microsoft_pp"),
        (('.jpg', '.jpeg', '.png', '.tiff'), "image_file"),
    )
    lowered = file_path.lower()
    for suffixes, input_type in extension_types:
        if lowered.endswith(suffixes):
            return input_type

    # '.txt', '.json' and every unknown extension are treated as plain text.
    return "text_or_json"
45
+
46
def construct_chain(input_type, args):
    """Build the handler chain for *input_type* and return its first handler.

    The chain is made of a source-specific head (reader/transcriber/writer
    handlers), followed by optional PII tokenization, the prompt + Bedrock
    (and/or chat) handlers, optional PII untokenization and an optional
    clipboard writer.

    Args:
        input_type: Value produced by ``determine_input_type``.
        args: Parsed CLI arguments; ``args.anonymize`` and ``args.chat`` are read.

    Returns:
        The first handler of the assembled chain.

    Exits the process with status 1 when *input_type* is not supported.
    """
    # Ordered handler names forming the head of the chain for each input type.
    source_chains = {
        "youtube_url": (
            "YouTubeReaderHandler",
            "AmazonS3WriterHandler",
            "AmazonTranscriptionHandler",
            "LocalFileWriterHandler",
        ),
        "multimedia_file": (
            "AmazonS3WriterHandler",
            "AmazonTranscriptionHandler",
            "LocalFileWriterHandler",
        ),
        "multimedia_file_whisper": (
            "OpenAIWhisperTranscriptionHandler",
            "LocalFileWriterHandler",
        ),
        "image_file": (
            "LocalFileReaderHandler",
            "AmazonTextractHandler",
            "LocalFileWriterHandler",
        ),
        "pdf": ("PDFReaderHandler", "LocalFileWriterHandler"),
        "http": ("HTTPHandler", "HTMLCleanerHandler", "LocalFileWriterHandler"),
        "text_or_json": ("LocalFileReaderHandler",),
        "s3": ("AmazonS3ReaderHandler",),
        "quip": ("QuipReaderHandler", "HTMLCleanerHandler", "LocalFileWriterHandler"),
        "microsoft_word": ("MicrosoftWordReaderHandler", "LocalFileWriterHandler"),
        "microsoft_excel": ("MicrosoftExcelReaderHandler", "LocalFileWriterHandler"),
        "microsoft_pp": ("MicrosoftPowerPointReaderHandler", "LocalFileWriterHandler"),
    }

    handler_names = source_chains.get(input_type)
    if handler_names is None:
        # Unsupported types abort the run; there is no fallback chain.
        print("Unsupported file type.", input_type)
        sys.exit(1)

    # Instantiate every head handler first, then link them in order.
    head_handlers = [HandlerFactory.get_handler(name) for name in handler_names]
    chain = current_handler = head_handlers[0]
    for handler in head_handlers[1:]:
        current_handler = current_handler.set_next(handler)

    # Anonymize data? The CLI passes a string, so compare case-insensitively
    # ("True" / "TRUE" must enable anonymization just like "true").
    anonymize = str(args.anonymize).strip().lower() in ('true', '1')
    if anonymize:
        anonymize_handler = HandlerFactory.get_handler("AmazonComprehendPIITokenizeHandler")
        current_handler = current_handler.set_next(anonymize_handler)

    # Add the prompt and bedrock handlers.
    prompt_handler = HandlerFactory.get_handler("PromptHandler")
    bedrock_handler = HandlerFactory.get_handler("AmazonBedrockHandler")

    # Decide whether summarization and/or chat run, and in which order.
    if args.chat:
        chat_handler = HandlerFactory.get_handler("AmazonBedrockChatHandler")
        print("Enable chat", args)
        if args.chat == 'sum_first':
            # Summarize, then chat about the summary.
            current_handler = (
                current_handler.set_next(prompt_handler)
                .set_next(bedrock_handler)
                .set_next(chat_handler)
            )
        elif args.chat == 'chat_only':
            # Chat on the original text, no summarization at all.
            current_handler = current_handler.set_next(chat_handler)
        else:
            # Any other value (i.e. 'chat_first'): chat, then summarize.
            current_handler = current_handler.set_next(chat_handler)
            current_handler = current_handler.set_next(prompt_handler).set_next(bedrock_handler)
    else:
        current_handler = current_handler.set_next(prompt_handler).set_next(bedrock_handler)

    # Finally, if we have tokenized the content, let's untokenize.
    if anonymize:
        unanonymize_handler = HandlerFactory.get_handler("AmazonComprehendPIIUntokenizeHandler")
        current_handler = current_handler.set_next(unanonymize_handler)

    # Copy to clipboard?
    clipboard = os.getenv('CLIPBOARD_COPY', 'false').lower() in ('true', '1', 't')
    if clipboard:
        clipboard_handler = HandlerFactory.get_handler("ClipboardWriterHandler")
        current_handler = current_handler.set_next(clipboard_handler)
        print("\n\n ================================================\n The summary will be copied to your clipboard.\n ================================================\n")

    return chain
170
+
171
def process_file(file_path, args):
    """Run a single file or URL through its handler chain and return the result.

    The input type is detected from *file_path*, the matching chain is built,
    and a request dict (type, path, prompt file name, empty text, timestamped
    output path, ``extract_media`` flag) is handed to the first handler.
    """
    print(f"Processing: {file_path}")
    kind = determine_input_type(file_path)
    pipeline = construct_chain(kind, args)

    # Output file name embeds the source base name and the current local time.
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    storage_dir = os.getenv('DIR_STORAGE', './downloads')
    source_name = os.path.basename(file_path)
    destination = f"{storage_dir}/output_{source_name}_{stamp}.txt"

    return pipeline.handle({
        "type": kind,
        "path": file_path,
        "prompt_file_name": args.prompt_file_name,
        "text": "",
        "write_file_path": destination,
        "extract_media": True,
    })
194
+
195
+
196
def _print_result(result):
    """Print a handler-chain result: its text when present, else the JSON dump."""
    if result.get("text", None):
        print(result.get("text"))
    else:
        print(json.dumps(result, indent=2))


def main():
    """Command-line entry point: process one file/URL, or every file in a folder.

    Parses the CLI arguments, loads the configuration, discovers handlers and
    runs ``process_file`` either once or, for a directory, once per contained
    file on a thread pool. The result of every processed input is printed.
    """
    # Initialize the argument parser
    parser = argparse.ArgumentParser(description='Process input files or URLs.')

    # Required positional argument for the file/URL to process
    parser.add_argument('path', type=str, help='The path to the file, folder or URL to be processed.')

    # Optional positional argument for the prompt file name, with a default value
    parser.add_argument('prompt_file_name', nargs='?', default='default_prompt', help='The name of the prompt file. Defaults to "default_prompt" if not specified.')

    # Optional flag to specify the use of a interactive chat handler
    parser.add_argument("--chat", type=str, default=None, choices=[None, 'sum_first', 'chat_first', 'chat_only'], help="Choose 'sum_first', 'chat_first' to summarize before chat or direct chat interaction with your original text. Select 'chat_only' if you only want to query your original text")

    # Optional flag to turn off anonymization
    parser.add_argument("--anonymize", type=str, default=True, help="Anonymize customer names before sending to the model. By default this is set to true.")

    # Optional argument for config file path
    parser.add_argument("--config", type=str, help="Path to the config file. If not provided, will search for config.yaml in the current directory and its parents.")

    # Parse the command-line arguments
    args = parser.parse_args()

    # Load configuration
    from awschain.utils.config_loader import ConfigLoader
    ConfigLoader.load_config(args.config)

    # Handler discovery
    HandlerFactory.discover_handlers()

    if os.path.isdir(args.path):
        max_processes = int(ConfigLoader.get_config('MAX_PARALLEL_PROCESSES', 1))
        file_paths = [
            candidate
            for candidate in (os.path.join(args.path, entry) for entry in os.listdir(args.path))
            if os.path.isfile(candidate)
        ]
        if not file_paths:
            # Nothing to submit; avoid referencing a result that never existed.
            print(f"No files found in directory: {args.path}")
            return

        with ThreadPoolExecutor(max_workers=max_processes) as executor:
            futures = [executor.submit(process_file, file_path, args) for file_path in file_paths]
            # Report every file's outcome as it completes, not just the last one.
            for future in as_completed(futures):
                _print_result(future.result())
    else:
        _print_result(process_file(args.path, args))
238
+
239
+ if __name__ == "__main__":
240
+ main()
@@ -0,0 +1,5 @@
1
+ from .handler_factory import HandlerFactory
2
+ from .abstract_handler import AbstractHandler
3
+ from .base_handler import BaseHandler
4
+
5
+ __all__ = ['HandlerFactory', 'AbstractHandler', 'BaseHandler']
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+ from abc import abstractmethod
3
+ from typing import Any
4
+ from .base_handler import BaseHandler
5
+
6
class AbstractHandler(BaseHandler):
    """
    Default chaining behavior shared by concrete handlers.

    Subclasses override ``handle`` and call ``super().handle(request)`` to
    pass the request on to the next handler in the chain.
    """

    # Next handler in the chain; None marks the end of the chain.
    _next_handler: "BaseHandler" = None

    def set_next(self, handler: "BaseHandler") -> "BaseHandler":
        """Register *handler* as the successor of this one and return it.

        Returning the successor (not ``self``) lets handlers be linked like:
        local_file_handler.set_next(prompt_handler).set_next(summarization_handler)
        """
        self._next_handler = handler
        return handler

    @abstractmethod
    def handle(self, request: dict) -> dict:
        """Forward *request* to the next handler, or return it at the chain's end.

        Declared abstract so every concrete handler must define ``handle``;
        this body is still reachable through ``super().handle(request)``.
        """
        if self._next_handler:
            return self._next_handler.handle(request)

        return request
@@ -0,0 +1,17 @@
1
+ from __future__ import annotations
2
+ from abc import ABC, abstractmethod
3
+ from typing import Optional
4
+
5
class BaseHandler(ABC):
    """
    The Handler interface declares a method for building the chain of handlers.
    It also declares a method for executing a request.
    """

    @abstractmethod
    def set_next(self, handler: "BaseHandler") -> "BaseHandler":
        """Register *handler* as the next link in the chain."""
        pass

    @abstractmethod
    def handle(self, request) -> Optional[dict]:
        """Process *request* and return the (possibly updated) request dict."""
        pass
@@ -0,0 +1,57 @@
1
+ import ast
2
+ import pathlib
3
+ import importlib
4
+
5
+ from .abstract_handler import AbstractHandler
6
+
7
class HandlerFactory:
    """Finds AbstractHandler subclasses on disk and instantiates them by class name.

    Discovery only parses source files (no imports); a handler's module is
    imported the first time that handler is requested.
    """

    # Handler class name -> imported handler class (filled lazily by get_handler).
    _handlers = {}
    # Handler class name -> module path relative to the 'awschain.handlers' package.
    _handler_paths = {}

    @classmethod
    def discover_handlers(cls, root_path=None):
        """Scan *root_path* recursively and record where each handler class lives.

        Args:
            root_path: Directory to scan. When omitted, the directory containing
                this module is used, so discovery does not depend on the current
                working directory.
        """
        # Clear existing handlers
        cls._handlers.clear()

        if root_path is None:
            root = pathlib.Path(__file__).resolve().parent
        else:
            root = pathlib.Path(root_path)

        for path in root.rglob('*.py'):
            if path.name == '__init__.py':
                continue

            # Parse the file instead of importing it, so discovery stays cheap
            # and free of import side effects.
            with open(path, 'r', encoding='utf-8') as file:
                tree = ast.parse(file.read(), filename=path.name)

            # Dotted module path relative to the scanned root, with a leading
            # '.' so it can be imported relative to the handlers package.
            module_path = '.' + '.'.join(path.relative_to(root).with_suffix('').parts)

            for child in ast.iter_child_nodes(tree):
                if isinstance(child, ast.ClassDef) and cls._extends_abstract_handler(child):
                    # Record the module path and class name without importing
                    cls._handler_paths[child.name] = module_path

    @staticmethod
    def _extends_abstract_handler(class_node):
        """Return True when *class_node* lists AbstractHandler among its bases."""
        for base in class_node.bases:
            if isinstance(base, ast.Attribute) and base.attr == 'AbstractHandler':
                return True
            if isinstance(base, ast.Name) and base.id == 'AbstractHandler':
                return True
        return False

    @classmethod
    def get_handler(cls, handler_type):
        """Return a new instance of the handler class named *handler_type*.

        Raises:
            ValueError: If no handler with that name was discovered, or the
                resolved class is not a subclass of AbstractHandler.
        """
        if not cls._handlers:
            cls.discover_handlers()

        if handler_type in cls._handlers:
            return cls._handlers[handler_type]()

        module_path = cls._handler_paths.get(handler_type)
        if not module_path:
            raise ValueError(f"Handler not found for type: {handler_type}")

        # Dynamically import the module when requested
        module = importlib.import_module(module_path, package='awschain.handlers')
        handler_class = getattr(module, handler_type)
        if not issubclass(handler_class, AbstractHandler):
            raise ValueError(f"The class {handler_type} is not a subclass of AbstractHandler")

        cls._handlers[handler_type] = handler_class
        return handler_class()
File without changes
@@ -0,0 +1,8 @@
1
+ import pyperclip
2
+ from ..abstract_handler import AbstractHandler
3
+
4
class ClipboardWriterHandler(AbstractHandler):
    """Copies the request's ``text`` to the system clipboard, then continues the chain."""

    def handle(self, request: dict):
        """Copy ``request["text"]`` to the clipboard and forward the request.

        When the request carries no text, nothing is copied and the request is
        forwarded unchanged.
        """
        text = request.get("text", None)
        if text is None:
            # Nothing to copy; do not hand None to pyperclip.
            print("No text available to copy to the clipboard.")
        else:
            pyperclip.copy(text)
            # Announce success only after the copy has actually happened.
            print("\n\n ================================================\n The summary has been copied to your clipboard.\n ================================================\n")
        return super().handle(request)
@@ -0,0 +1,14 @@
1
+ from typing import Any
2
+
3
+ from ..abstract_handler import AbstractHandler
4
+
5
class PrintContextHandler(AbstractHandler):
    """
    Debugging aid: dumps the request context to stdout and passes it on unchanged.
    """

    def handle(self, request: dict) -> dict:
        """Print *request* between two divider lines, then continue the chain."""
        divider = "============================================================"
        for line in (divider, request, divider):
            print(line)

        return super().handle(request)
@@ -0,0 +1,43 @@
1
+ import os
2
+ import urllib.request
3
+ import boto3
4
+ from ..abstract_handler import AbstractHandler
5
+
6
class RemoteFileDownloaderHandler(AbstractHandler):
    """Downloads the file at ``request["path"]`` (HTTP(S) or S3) to local storage.

    After the download, ``request["path"]`` is replaced by the local file path
    and the request is forwarded to the next handler.
    """

    def handle(self, request: dict) -> dict:
        """Download the remote file and continue the chain with its local path.

        Raises:
            ValueError: If the request has no path, or the path is neither an
                ``http``/``https`` URL nor an ``s3://`` URL.
        """
        file_path = request.get("path")
        print("Downloading file from: ", file_path)

        if not file_path:
            raise ValueError("No path provided for remote file download.")

        if file_path.startswith('http'):
            local_path = self.download_from_http(file_path)
        elif file_path.startswith('s3://'):
            local_path = self.download_from_s3(file_path)
        else:
            # Fail with a clear message instead of an unbound-variable error.
            raise ValueError(f"Unsupported remote path: {file_path}")

        # Update the request with the local path of the downloaded file
        request.update({"path": local_path})

        return super().handle(request)

    def _storage_dir(self):
        """Return the download directory (``DIR_STORAGE`` or ./downloads), creating it if needed."""
        storage_dir = os.getenv('DIR_STORAGE', './downloads')
        os.makedirs(storage_dir, exist_ok=True)
        return storage_dir

    def download_from_http(self, url):
        """Download *url* into the storage directory and return the local path."""
        local_filename = url.split('/')[-1]
        local_path = os.path.join(self._storage_dir(), local_filename)
        urllib.request.urlretrieve(url, local_path)
        return local_path

    def download_from_s3(self, s3_url):
        """Download the object behind an ``s3://bucket/key`` URL and return the local path."""
        # Extract bucket name and object key from the s3 URL
        path_parts = s3_url.replace("s3://", "").split('/')
        bucket_name = path_parts[0]
        object_key = '/'.join(path_parts[1:])
        local_filename = object_key.split('/')[-1]
        local_path = os.path.join(self._storage_dir(), local_filename)

        s3_client = boto3.client('s3')
        s3_client.download_file(bucket_name, object_key, local_path)
        return local_path
File without changes
@@ -0,0 +1,32 @@
1
+ import sys
2
+ from ..abstract_handler import AbstractHandler
3
+ from ...utils.bedrock import invoke_model
4
+ import json
5
+
6
class AmazonBedrockChatHandler(AbstractHandler):
    """Interactive console chat about the request's text, backed by ``invoke_model``."""

    def handle(self, request: dict) -> dict:
        """Run a read-answer loop until Ctrl+C, then continue the chain.

        The conversation starts with the request's text as the first user
        message. On exit the user may choose to replace ``request["text"]``
        with the JSON-encoded chat history.
        """
        print("Hello, I'm the Amazon Bedrock chat hanlder buddy.\n")

        print(request.get("text"))
        print("\n\n------\nYou can ask any questions on the text above. To terminate the chat session, use Ctrl+C")

        conversation = [{"role": "user", "content": request['text']}]
        try:
            while True:
                question = input("> ")
                conversation.append({"role": "user", "content": question})
                # The whole history is sent on every turn as a JSON document.
                payload = json.dumps({"messages": conversation})
                answer = invoke_model(payload)

                print(f"Model: {answer}")
                conversation.append({"role": "assistant", "content": answer})
        except KeyboardInterrupt:
            keep_history = input("\n ---------------- Terminating Chat Session ---------------- \n Type 'yes' to include the chat history for your next handler? ")
            if keep_history == "yes":
                request['text'] = json.dumps({"messages": conversation})

        return super().handle(request)
@@ -0,0 +1,62 @@
1
+ from ...utils.bedrock import invoke_model
2
+ from ..abstract_handler import AbstractHandler
3
+ import botocore.exceptions
4
+
5
class AmazonBedrockHandler(AbstractHandler):
    """Replaces the request's text with the output of ``invoke_model``.

    When the model rejects the input with a ``ValidationException``, the text
    is split into progressively more chunks, each chunk is sent separately and
    the outputs are joined with newlines.
    """

    def handle(self, request: dict) -> dict:
        """Summarize ``request["text"]`` and forward the updated request."""
        text = request.get("text", None)
        if not text:
            # Nothing to summarize; pass the request on untouched.
            print("No text provided for summarization.")
            return super().handle(request)

        print("Summarizing text with Bedrock. Text Len:", len(text))

        summary = self.summarize_with_retry(text)

        request.update({"text": summary})
        return super().handle(request)

    def summarize_with_retry(self, text: str) -> str:
        """Invoke the model on *text*, falling back to chunking on a ValidationException."""
        try:
            return invoke_model(text)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'ValidationException':
                return self.chunk_and_summarize(text)
            raise

    def chunk_and_summarize(self, text: str) -> str:
        """Summarize *text* chunk by chunk, doubling the chunk count on each rejection.

        Raises:
            RuntimeError: If the model still rejects the input after 10 attempts.
        """
        max_attempts = 10
        num_chunks = 2
        attempt = 0
        print("Retrying summarization of text with Bedrock with chunking")

        while attempt < max_attempts:
            chunks = self.split_text(text, num_chunks)
            try:
                summaries = [invoke_model(chunk) for chunk in chunks]
                return "\n".join(summaries)
            except botocore.exceptions.ClientError as e:
                if e.response['Error']['Code'] != 'ValidationException':
                    raise
                num_chunks *= 2  # Increase the number of chunks
                attempt += 1
                print(f"Attempt {attempt}: Splitting text into {num_chunks} chunks")

        raise RuntimeError("Failed to summarize text due to input length constraints after multiple attempts.")

    def split_text(self, text: str, num_chunks: int) -> list:
        """
        Splits the text into at most the specified number of chunks, respecting word boundaries.

        Each cut is moved forward to the next space so words are not split.
        Chunks that end up empty (short text, many chunks) are dropped so the
        model is never invoked with an empty prompt.
        """
        avg_chunk_length = len(text) // num_chunks
        chunks = []
        start = 0

        for _ in range(num_chunks - 1):
            end = start + avg_chunk_length
            # Advance the cut to the next space, or to the end of the text.
            next_space = text.find(' ', end)
            end = next_space if next_space != -1 else max(end, len(text))
            chunks.append(text[start:end].strip())
            start = end

        chunks.append(text[start:].strip())
        return [chunk for chunk in chunks if chunk]
@@ -0,0 +1,95 @@
1
+ from ..abstract_handler import AbstractHandler
2
+ from ...utils.aws_boto_client_manager import AWSBotoClientManager
3
+
4
class AmazonComprehendInsightsHandler(AbstractHandler):
    """Extracts sentiment, entities and key phrases from the request's text.

    The text is split into word-aligned chunks, each chunk is sent to Amazon
    Comprehend, and ``request["text"]`` is replaced by a dict with the keys
    ``sentiment``, ``entities`` and ``key_phrases``.
    """

    def handle(self, request: dict) -> dict:
        """Run the insight extraction and forward the updated request."""
        self.comprehend = AWSBotoClientManager.get_client('comprehend')
        # Per-chunk byte budget.
        # NOTE(review): presumably chosen to stay under Comprehend's per-request
        # text size limits — confirm against the service quotas.
        self.max_bytes = 3000

        print("Extracting insights from text...")
        text = request.get("text", None)
        text_chunks = self.chunk_text(text) if text else []

        if text_chunks:
            sentiments = [self.detect_sentiment(chunk) for chunk in text_chunks]
            entities = []
            key_phrases = []
            for chunk in text_chunks:
                entities.extend(self.detect_entities(chunk))
                key_phrases.extend(self.detect_key_phrases(chunk))

            # Failed detections return None; ignore them when voting. Ties go
            # to the sentiment seen first, so the result is deterministic.
            detected = [sentiment for sentiment in sentiments if sentiment is not None]
            overall_sentiment = max(detected, key=detected.count) if detected else None

            # Aggregate the insights and append to the request object
            aggregated_data = {
                "sentiment": overall_sentiment,  # Most frequent sentiment
                "entities": entities,  # Entities from all chunks
                "key_phrases": key_phrases,  # Key phrases from all chunks
            }

            # updating the request body and adding the aggregated data.
            request.update({"text": aggregated_data})
        else:
            print("No text provided for insights extraction.")

        return super().handle(request)

    def chunk_text(self, text):
        """
        Breaks the text into word-aligned chunks of at most ``self.max_bytes`` UTF-8 bytes.

        The joining spaces are counted towards the budget. A single word longer
        than the budget is kept whole in a chunk of its own.
        """
        chunks = []
        current_chunk = []
        current_size = 0

        for word in text.split():
            word_size = len(word.encode('utf-8'))
            # One extra byte for the space that will join this word to the chunk.
            projected_size = current_size + word_size + (1 if current_chunk else 0)
            if projected_size <= self.max_bytes:
                current_chunk.append(word)
                current_size = projected_size
            else:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [word]
                current_size = word_size
        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def detect_sentiment(self, text):
        """
        Detects the sentiment of the given text using Amazon Comprehend.

        Returns the sentiment label, or None when the call fails.
        """
        try:
            response = self.comprehend.detect_sentiment(Text=text, LanguageCode='en')
            return response.get("Sentiment")
        except Exception as e:
            # Best effort: one failing chunk must not abort the whole extraction.
            print(f"Error detecting sentiment: {e}")
            return None

    def detect_entities(self, text):
        """
        Detects entities in the given text using Amazon Comprehend.

        Returns a list of ``{"Text", "Type", "Score"}`` dicts, empty on failure.
        """
        try:
            response = self.comprehend.detect_entities(Text=text, LanguageCode='en')
            entities = response.get("Entities")
            return [{"Text": entity["Text"], "Type": entity["Type"], "Score": entity["Score"]} for entity in entities]
        except Exception as e:
            print(f"Error detecting entities: {e}")
            return []

    def detect_key_phrases(self, text):
        """
        Detects key phrases in the given text using Amazon Comprehend.

        Returns a list of ``{"Text", "Score"}`` dicts, empty on failure.
        """
        try:
            response = self.comprehend.detect_key_phrases(Text=text, LanguageCode='en')
            key_phrases = response.get("KeyPhrases")
            return [{"Text": phrase["Text"], "Score": phrase["Score"]} for phrase in key_phrases]
        except Exception as e:
            print(f"Error detecting key phrases: {e}")
            return []
95
+
@@ -0,0 +1,42 @@
1
+ from ..abstract_handler import AbstractHandler
2
+ from ...utils.aws_boto_client_manager import AWSBotoClientManager
3
+
4
class AmazonComprehendPIIClassifierHandler(AbstractHandler):
    """Flags whether the request's text contains PII and which PII types were found.

    Adds ``is_pii`` (bool) and ``detected_pii`` (list of type names) to the
    request; the text itself is left untouched.
    """

    def handle(self, request: dict) -> dict:
        """Classify ``request["text"]`` and forward the annotated request."""
        self.comprehend = AWSBotoClientManager.get_client('comprehend')

        print("Starting PII Classification...")
        text = request.get("text", None)

        # Process the text to check for PII
        is_pii, pii_types = self.classify_pii(text)
        # Update the request with PII detection results
        request.update({
            "is_pii": is_pii,
            "detected_pii": pii_types
        })

        return super().handle(request)

    def classify_pii(self, text):
        """
        Classifies text to detect PII and identify the types of PII found.

        Returns a ``(is_pii_detected, pii_types)`` tuple; ``pii_types`` is
        sorted so the output is stable between runs. Missing or empty text
        yields ``(False, [])`` without calling the service.
        """
        if not text:
            return False, []

        max_length = 5000  # character length, adjust based on the encoding
        detected_pii_types = set()

        for offset in range(0, len(text), max_length):
            pii_entities = self.detect_pii(text[offset:offset + max_length])
            for entity in pii_entities['Entities']:
                detected_pii_types.add(entity['Type'])

        return bool(detected_pii_types), sorted(detected_pii_types)

    def detect_pii(self, text):
        """
        Detects PII data in the given text using Amazon Comprehend.
        """
        return self.comprehend.detect_pii_entities(Text=text, LanguageCode='en')