PyPI - awschain - Versions diffs - 0.1.0__py3-none-any.whl - Mend

awschain 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

awschain/__init__.py +4 -0
awschain/example.py +240 -0
awschain/handlers/__init__.py +5 -0
awschain/handlers/abstract_handler.py +26 -0
awschain/handlers/base_handler.py +17 -0
awschain/handlers/handler_factory.py +57 -0
awschain/handlers/misc/__init__.py +0 -0
awschain/handlers/misc/clipboard_writer_handler.py +8 -0
awschain/handlers/misc/print_context_handler.py +14 -0
awschain/handlers/misc/remote_file_downloader_handler.py +43 -0
awschain/handlers/processors/__init__.py +0 -0
awschain/handlers/processors/amazon_bedrock_chat_handler.py +32 -0
awschain/handlers/processors/amazon_bedrock_handler.py +62 -0
awschain/handlers/processors/amazon_comprehend_insights_handler.py +95 -0
awschain/handlers/processors/amazon_comprehend_pii_classifier_handler.py +42 -0
awschain/handlers/processors/amazon_comprehend_pii_handler.py +48 -0
awschain/handlers/processors/amazon_comprehend_pii_tokenize_handler.py +119 -0
awschain/handlers/processors/amazon_comprehend_pii_untokenize_handler.py +46 -0
awschain/handlers/processors/amazon_rekognition_handler.py +66 -0
awschain/handlers/processors/amazon_textract_handler.py +138 -0
awschain/handlers/processors/amazon_transcribe_handler.py +91 -0
awschain/handlers/processors/anonymize_handler.py +30 -0
awschain/handlers/processors/html_cleaner_handler.py +11 -0
awschain/handlers/processors/prompt_handler.py +27 -0
awschain/handlers/readers/__init__.py +0 -0
awschain/handlers/readers/amazon_s3_reader_handler.py +35 -0
awschain/handlers/readers/aws_secrets_manager_secret_reader.py +50 -0
awschain/handlers/readers/email_reader_handler.py +83 -0
awschain/handlers/readers/http_handler.py +140 -0
awschain/handlers/readers/local_file_reader_handler.py +22 -0
awschain/handlers/readers/microsoft_excel_reader_handler.py +84 -0
awschain/handlers/readers/microsoft_power_point_reader_handler.py +95 -0
awschain/handlers/readers/microsoft_word_handler.py +112 -0
awschain/handlers/readers/pdf_reader_handler.py +81 -0
awschain/handlers/readers/quip_reader_handler.py +148 -0
awschain/handlers/readers/web_crawler_reader_handler.py +52 -0
awschain/handlers/readers/youtube_reader_handler.py +39 -0
awschain/handlers/writers/__init__.py +0 -0
awschain/handlers/writers/amazon_datazone_asset_writer_handler.py +87 -0
awschain/handlers/writers/amazon_datazone_glossary_writer_handler.py +92 -0
awschain/handlers/writers/amazon_s3_writer_handler.py +118 -0
awschain/handlers/writers/email_sender_handler.py +56 -0
awschain/handlers/writers/local_file_writer_handler.py +51 -0
awschain/handlers/writers/quip_writer_handler.py +85 -0
awschain/utils/__init__.py +2 -0
awschain/utils/aws_boto_client_manager.py +16 -0
awschain/utils/bedrock.py +70 -0
awschain/utils/config.py +1 -0
awschain/utils/config_loader.py +43 -0
awschain/utils/web_utils.py +22 -0
awschain-0.1.0.dist-info/LICENSE +21 -0
awschain-0.1.0.dist-info/METADATA +128 -0
awschain-0.1.0.dist-info/RECORD +55 -0
awschain-0.1.0.dist-info/WHEEL +5 -0
awschain-0.1.0.dist-info/top_level.txt +1 -0

awschain/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .handlers.handler_factory import HandlerFactory
+from .utils.config_loader import ConfigLoader
+__all__ = ['HandlerFactory', 'ConfigLoader']

awschain/example.py ADDED Viewed

@@ -0,0 +1,240 @@
+#!/opt/anaconda3/bin/python
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime
+import json
+import os
+import sys
+from typing import Any
+from dotenv import load_dotenv
+from handlers.handler_factory import HandlerFactory
+from .utils.config_loader import ConfigLoader
+import argparse
+# Load configuration.
+# NOTE: this runs at import time, so merely importing this module reads
+# 'config.yaml'; main() calls load_config again with the --config value.
+ConfigLoader.load_config('config.yaml')
+# Load environment variables from .env file (also executed at import time).
+load_dotenv()
+def determine_input_type(file_path):
+    """
+    Map an input path or URL to the input-type key understood by construct_chain.
+    Checks run in priority order: YouTube links, http(s) URLs, s3:// and quip://
+    locations, then local files by extension. URL schemes and file extensions
+    are matched case-insensitively, so "report.PDF" is handled like "report.pdf".
+    Anything unrecognised is assumed to be plain text and yields "text_or_json".
+    """
+    lowered = file_path.lower()
+    # The YouTube check stays first (and case-sensitive, as before) so that
+    # YouTube URLs are not swallowed by the generic "http" rule below.
+    if "youtube" in file_path or "youtu.be" in file_path:
+        return "youtube_url"
+    prefix_types = (
+        ('http', "http"),
+        ('s3://', "s3"),
+        ('quip://', "quip"),
+    )
+    for prefix, input_type in prefix_types:
+        if lowered.startswith(prefix):
+            return input_type
+    suffix_types = (
+        (('.mp3', '.mp4', '.m4a', '.wav', '.flac', '.mov', '.avi'), "multimedia_file"),
+        (('.pdf',), "pdf"),
+        (('.docx',), "microsoft_word"),
+        (('.xlsx', '.xlsm', '.xltx', '.xltm'), "microsoft_excel"),
+        (('.pptx',), "microsoft_pp"),
+        (('.jpg', '.jpeg', '.png', '.tiff'), "image_file"),
+    )
+    for suffixes, input_type in suffix_types:
+        if lowered.endswith(suffixes):
+            return input_type
+    # Assume text (this also covers explicit .txt / .json files).
+    return "text_or_json"
+def construct_chain(input_type, args):
+    """
+    Build the handler chain for one input and return its first handler.
+    The chain is: reader/pre-processing handlers chosen by input_type, then an
+    optional PII tokenizer, then the prompt/Bedrock summarization and/or the
+    interactive chat handler (ordered by args.chat), then the PII untokenizer
+    when tokenization was applied, and finally an optional clipboard writer
+    (enabled through the CLIPBOARD_COPY environment variable).
+    Reads args.anonymize and args.chat. Exits the process with status 1 when
+    input_type has no registered chain.
+    """
+    # Handler names for each input type, listed in the order they are chained.
+    reader_chains = {
+        "youtube_url": ("YouTubeReaderHandler", "AmazonS3WriterHandler", "AmazonTranscriptionHandler", "LocalFileWriterHandler"),
+        "multimedia_file": ("AmazonS3WriterHandler", "AmazonTranscriptionHandler", "LocalFileWriterHandler"),
+        "multimedia_file_whisper": ("OpenAIWhisperTranscriptionHandler", "LocalFileWriterHandler"),
+        "image_file": ("LocalFileReaderHandler", "AmazonTextractHandler", "LocalFileWriterHandler"),
+        "pdf": ("PDFReaderHandler", "LocalFileWriterHandler"),
+        "http": ("HTTPHandler", "HTMLCleanerHandler", "LocalFileWriterHandler"),
+        "text_or_json": ("LocalFileReaderHandler",),
+        "s3": ("AmazonS3ReaderHandler",),
+        "quip": ("QuipReaderHandler", "HTMLCleanerHandler", "LocalFileWriterHandler"),
+        "microsoft_word": ("MicrosoftWordReaderHandler", "LocalFileWriterHandler"),
+        "microsoft_excel": ("MicrosoftExcelReaderHandler", "LocalFileWriterHandler"),
+        "microsoft_pp": ("MicrosoftPowerPointReaderHandler", "LocalFileWriterHandler"),
+    }
+    handler_names = reader_chains.get(input_type)
+    if handler_names is None:
+        # Unsupported types abort the run; there is no fallback chain.
+        print("Unsupported file type.", input_type)
+        sys.exit(1)
+    # `chain` keeps the head of the chain; `current_handler` tracks the tail.
+    chain = current_handler = HandlerFactory.get_handler(handler_names[0])
+    for handler_name in handler_names[1:]:
+        current_handler = current_handler.set_next(HandlerFactory.get_handler(handler_name))
+    # Anonymize data? Parsed case-insensitively (like CLIPBOARD_COPY below) so
+    # that "--anonymize True" is not silently treated as false.
+    anonymize = str(args.anonymize).strip().lower() in ('true', '1', 't')
+    if anonymize:
+        anonymize_handler = HandlerFactory.get_handler("AmazonComprehendPIITokenizeHandler")
+        current_handler = current_handler.set_next(anonymize_handler)
+    # Add the prompt and bedrock handlers.
+    prompt_handler = HandlerFactory.get_handler("PromptHandler")
+    bedrock_handler = HandlerFactory.get_handler("AmazonBedrockHandler")
+    # Decide whether summarization and/or chat run, and in which order.
+    if args.chat:
+        chat_handler = HandlerFactory.get_handler("AmazonBedrockChatHandler")
+        print("Enable chat", args)
+        if args.chat == 'sum_first':
+            current_handler = current_handler.set_next(prompt_handler).set_next(bedrock_handler).set_next(chat_handler)
+        elif args.chat == 'chat_only':
+            current_handler = current_handler.set_next(chat_handler)
+        else:
+            # 'chat_first': chat on the original text, then summarize.
+            current_handler = current_handler.set_next(chat_handler)
+            current_handler = current_handler.set_next(prompt_handler).set_next(bedrock_handler)
+    else:
+        current_handler = current_handler.set_next(prompt_handler).set_next(bedrock_handler)
+    # Finally, if we have tokenized the content, let's untokenize
+    if anonymize:
+        unanonymize_handler = HandlerFactory.get_handler("AmazonComprehendPIIUntokenizeHandler")
+        current_handler = current_handler.set_next(unanonymize_handler)
+    # Copy to clipboard?
+    clipboard = os.getenv('CLIPBOARD_COPY', 'false').lower() in ('true', '1', 't')
+    if clipboard:
+        clipboard_handler = HandlerFactory.get_handler("ClipboardWriterHandler")
+        current_handler = current_handler.set_next(clipboard_handler)
+        print("\n\n  ================================================\n   The summary will be copied to your clipboard.\n  ================================================\n")
+    return chain
+def process_file(file_path, args):
+    """
+    Run a single file or URL through its handler chain and return the result.
+    The input type is detected from file_path, the matching chain is built,
+    and a request dict is handed to the first handler. The output file name is
+    derived from the input's base name plus a timestamp and placed under the
+    DIR_STORAGE directory (default './downloads').
+    """
+    print(f"Processing: {file_path}")
+    kind = determine_input_type(file_path)
+    chain_head = construct_chain(kind, args)
+    # Timestamp keeps output names unique across repeated runs on the same input.
+    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    storage_dir = os.getenv('DIR_STORAGE', './downloads')
+    base_name = os.path.basename(file_path)
+    payload = {
+        "type": kind,
+        "path": file_path,
+        "prompt_file_name": args.prompt_file_name,
+        "text": "",
+        "write_file_path": f"{storage_dir}/output_{base_name}_{stamp}.txt",
+        "extract_media": True
+    }
+    return chain_head.handle(payload)
+def _print_result(result):
+    """Print a chain result: its "text" when present, otherwise the whole result as JSON."""
+    if isinstance(result, dict) and result.get("text", None):
+        print(result.get("text"))
+    else:
+        print(json.dumps(result, indent=2))
+def main():
+    """
+    Command-line entry point: parse arguments, load configuration, discover
+    handlers and process either a single path/URL or every file in a directory.
+    In directory mode files are processed by a thread pool sized by the
+    MAX_PARALLEL_PROCESSES config value, and each file's result is printed as
+    soon as it completes.
+    """
+    # Initialize the argument parser
+    parser = argparse.ArgumentParser(description='Process input files or URLs.')
+    # Required positional argument for the file/URL to process
+    parser.add_argument('path', type=str, help='The path to the file, folder or URL to be processed.')
+    # Optional positional argument for the prompt file name, with a default value
+    parser.add_argument('prompt_file_name', nargs='?', default='default_prompt', help='The name of the prompt file. Defaults to "default_prompt" if not specified.')
+    # Optional flag to specify the use of a interactive chat handler
+    parser.add_argument("--chat", type=str, default=None, choices=[None, 'sum_first', 'chat_first', 'chat_only'],  help="Choose 'sum_first', 'chat_first' to summarize before chat or direct chat interaction with your original text. Select 'chat_only' if you only want to query your original text")
+    # Optional flag to turn off anonymization
+    parser.add_argument("--anonymize", type=str, default=True, help="Anonymize customer names before sending to the model. By default this is set to true.")
+    # Optional argument for config file path
+    parser.add_argument("--config", type=str, help="Path to the config file. If not provided, will search for config.yaml in the current directory and its parents.")
+    # Parse the command-line arguments
+    args = parser.parse_args()
+    # Load configuration. The function-local import makes ConfigLoader a local
+    # name for the rest of main(), so it must stay ahead of every use below.
+    from awschain.utils.config_loader import ConfigLoader
+    ConfigLoader.load_config(args.config)
+    # Handler discovery
+    HandlerFactory.discover_handlers()
+    if os.path.isdir(args.path):
+        file_paths = [
+            os.path.join(args.path, f)
+            for f in os.listdir(args.path)
+            if os.path.isfile(os.path.join(args.path, f))
+        ]
+        if not file_paths:
+            # Nothing to do; previously this fell through to an undefined `result`.
+            print(f"No files found in directory: {args.path}")
+            return
+        max_processes = int(ConfigLoader.get_config('MAX_PARALLEL_PROCESSES', 1))
+        with ThreadPoolExecutor(max_workers=max_processes) as executor:
+            futures = [executor.submit(process_file, file_path, args) for file_path in file_paths]
+            # Report every file's result, not just the one that finished last.
+            for future in as_completed(futures):
+                _print_result(future.result())
+    else:
+        _print_result(process_file(args.path, args))
+if __name__ == "__main__":
+    main()

awschain/handlers/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .handler_factory import HandlerFactory
+from .abstract_handler import AbstractHandler
+from .base_handler import BaseHandler
+__all__ = ['HandlerFactory', 'AbstractHandler', 'BaseHandler']

awschain/handlers/abstract_handler.py ADDED Viewed

@@ -0,0 +1,26 @@
+from __future__ import annotations
+from abc import abstractmethod
+from typing import Any
+from .base_handler import BaseHandler
+class AbstractHandler(BaseHandler):
+    """
+    Base class supplying the default chaining behavior for handlers.
+    It stores a reference to the next handler and, in handle(), forwards the
+    request to it. Concrete handlers override handle(), do their own work and
+    then call super().handle(request) to pass the request down the chain.
+    """
+    # Next handler in the chain; None marks the end of the chain.
+    _next_handler: BaseHandler | None = None
+    def set_next(self, handler: BaseHandler) -> BaseHandler:
+        """Register `handler` as the successor of this handler and return it."""
+        self._next_handler = handler
+        # Returning a handler from here will let us link handlers in a
+        # convenient way like this:
+        # local_file_handler.set_next(prompt_handler).set_next(summarization_handler)
+        return handler
+    @abstractmethod
+    def handle(self, request: dict) -> dict:
+        """
+        Forward `request` to the next handler, or return it unchanged when this
+        handler is the last one in the chain.
+        Marked abstract so every subclass must define handle(); this body is
+        still reachable through super().handle(request).
+        """
+        if self._next_handler:
+            return self._next_handler.handle(request)
+        return request

awschain/handlers/base_handler.py ADDED Viewed

@@ -0,0 +1,17 @@
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import Optional
+class BaseHandler(ABC):
+    """
+    The Handler interface declares a method for building the chain of handlers.
+    It also declares a method for executing a request.
+    """
+    @abstractmethod
+    def set_next(self, handler: BaseHandler) -> BaseHandler:
+        """Link `handler` after this one; implementations return a handler to allow chained calls."""
+        pass
+    @abstractmethod
+    def handle(self, request) -> Optional[dict]:
+        """Process `request` and return the resulting request dict (or None)."""
+        pass

awschain/handlers/handler_factory.py ADDED Viewed

@@ -0,0 +1,57 @@
+import ast
+import pathlib
+import importlib
+from .abstract_handler import AbstractHandler
+class HandlerFactory:
+    """
+    Registry that discovers handler classes by statically scanning source files
+    and imports them lazily on first request.
+    Discovery parses each module with `ast` (nothing is imported) and records,
+    for every class that lists AbstractHandler as a base, the module path
+    relative to the handlers package. get_handler() imports the module on
+    demand, caches the class and returns a new instance on each call.
+    """
+    # handler class name -> imported class (filled lazily by get_handler)
+    _handlers = {}
+    # handler class name -> relative module path such as '.misc.print_context_handler'
+    _handler_paths = {}
+    @staticmethod
+    def _is_abstract_handler_base(base):
+        """Return True when an AST base expression names AbstractHandler (bare or dotted)."""
+        if isinstance(base, ast.Attribute):
+            return base.attr == 'AbstractHandler'
+        return isinstance(base, ast.Name) and base.id == 'AbstractHandler'
+    @classmethod
+    def discover_handlers(cls, root_path=None):
+        """
+        Scan `root_path` recursively and rebuild the handler-name -> module map.
+        root_path: directory of the handlers package. Defaults to the directory
+        containing this module, so discovery works for an installed package and
+        does not depend on the current working directory.
+        """
+        if root_path is None:
+            root = pathlib.Path(__file__).resolve().parent
+        else:
+            root = pathlib.Path(root_path)
+        # Drop every trace of a previous discovery, including stale paths.
+        cls._handlers.clear()
+        cls._handler_paths.clear()
+        for path in root.rglob('*.py'):
+            if path.name == '__init__.py':
+                continue
+            # Read the content of the Python file
+            with open(path, 'r', encoding='utf-8') as file:
+                node = ast.parse(file.read(), filename=path.name)
+            # Module path relative to the handlers package, with a leading '.'
+            # so it can be used for a relative import in get_handler().
+            module_path = '.' + '.'.join(path.relative_to(root).with_suffix('').parts)
+            # Only top-level classes that extend AbstractHandler are recorded.
+            for child in ast.iter_child_nodes(node):
+                if not isinstance(child, ast.ClassDef):
+                    continue
+                if any(cls._is_abstract_handler_base(base) for base in child.bases):
+                    # Record the module path and class name without importing
+                    cls._handler_paths[child.name] = module_path
+    @classmethod
+    def get_handler(cls, handler_type):
+        """
+        Return a new instance of the handler class named `handler_type`.
+        Runs discovery first when no handler paths are known yet.
+        Raises ValueError when the name is unknown or the class found is not a
+        subclass of AbstractHandler.
+        """
+        # Check the path map (not the class cache): the cache stays empty until
+        # the first import, which previously re-triggered discovery every call.
+        if not cls._handler_paths:
+            cls.discover_handlers()
+        handler_class = cls._handlers.get(handler_type)
+        if handler_class is None:
+            module_path = cls._handler_paths.get(handler_type)
+            if not module_path:
+                raise ValueError(f"Handler not found for type: {handler_type}")
+            # Dynamically import the module when requested
+            module = importlib.import_module(module_path, package='awschain.handlers')
+            handler_class = getattr(module, handler_type)
+            if not issubclass(handler_class, AbstractHandler):
+                raise ValueError(f"The class {handler_type} is not a subclass of AbstractHandler")
+            cls._handlers[handler_type] = handler_class
+        return handler_class()

awschain/handlers/misc/__init__.py ADDED Viewed

File without changes

awschain/handlers/misc/clipboard_writer_handler.py ADDED Viewed

@@ -0,0 +1,8 @@
+import pyperclip
+from ..abstract_handler import AbstractHandler
+class ClipboardWriterHandler(AbstractHandler):
+    """
+    Copies request["text"] to the system clipboard, then forwards the request.
+    """
+    def handle(self, request: dict):
+        text = request.get("text", None)
+        if text:
+            # pyperclip only accepts plain values; stringify anything else
+            # (an upstream handler may have stored a dict under "text").
+            pyperclip.copy(text if isinstance(text, str) else str(text))
+            # Announce success only after the copy has actually happened.
+            print("\n\n  ================================================\n   The summary has been copied to your clipboard.\n  ================================================\n")
+        else:
+            print("No text available to copy to the clipboard.")
+        return super().handle(request)

awschain/handlers/misc/print_context_handler.py ADDED Viewed

@@ -0,0 +1,14 @@
+from typing import Any
+from ..abstract_handler import AbstractHandler
+class PrintContextHandler(AbstractHandler):
+    """
+    Debugging aid: prints the request dict between two separator lines and
+    then forwards the request unchanged.
+    """
+    def handle(self, request: dict) -> dict:
+        separator = "============================================================"
+        for line in (separator, request, separator):
+            print(line)
+        return super().handle(request)

awschain/handlers/misc/remote_file_downloader_handler.py ADDED Viewed

@@ -0,0 +1,43 @@
+import os
+import urllib.request
+import boto3
+from ..abstract_handler import AbstractHandler
+class RemoteFileDownloaderHandler(AbstractHandler):
+    """
+    Downloads the file referenced by request["path"] (http(s) URL or s3:// URL)
+    into the DIR_STORAGE directory (default './downloads') and replaces
+    request["path"] with the local file path before forwarding the request.
+    """
+    def handle(self, request: dict) -> dict:
+        """
+        Download the remote file and forward the request with the local path.
+        Raises ValueError when the path is missing or uses an unsupported scheme.
+        """
+        file_path = request.get("path")
+        print("Downloading file from: ", file_path)
+        if not file_path:
+            raise ValueError("RemoteFileDownloaderHandler requires a 'path' in the request.")
+        if file_path.startswith('http'):
+            local_path = self.download_from_http(file_path)
+        elif file_path.startswith('s3://'):
+            local_path = self.download_from_s3(file_path)
+        else:
+            # Previously this fell through to an unbound `local_path`.
+            raise ValueError(f"Unsupported remote path (expected an http(s) or s3:// URL): {file_path}")
+        # Update the request with the local path of the downloaded file
+        request.update({"path": local_path})
+        return super().handle(request)
+    @staticmethod
+    def _local_target(filename):
+        """Return the download destination for `filename`, creating the storage directory if needed."""
+        storage_dir = os.getenv('DIR_STORAGE', './downloads')
+        os.makedirs(storage_dir, exist_ok=True)
+        return os.path.join(storage_dir, filename)
+    def download_from_http(self, url):
+        """Download `url` into the storage directory and return the local path."""
+        from urllib.parse import urlsplit
+        # Take the name from the URL path only, so query strings and fragments
+        # do not end up in the local file name.
+        local_filename = urlsplit(url).path.split('/')[-1] or "download"
+        local_path = self._local_target(local_filename)
+        urllib.request.urlretrieve(url, local_path)
+        return local_path
+    def download_from_s3(self, s3_url):
+        """
+        Download the object addressed by `s3_url` (s3://bucket/key) into the
+        storage directory and return the local path.
+        Raises ValueError when the URL has no object key.
+        """
+        # Extract bucket name and object key from the s3 URL
+        bucket_name, _, object_key = s3_url.replace("s3://", "", 1).partition('/')
+        if not bucket_name or not object_key:
+            raise ValueError(f"S3 URL must look like s3://bucket/key: {s3_url}")
+        local_filename = object_key.split('/')[-1] or "download"
+        local_path = self._local_target(local_filename)
+        s3_client = boto3.client('s3')
+        s3_client.download_file(bucket_name, object_key, local_path)
+        return local_path

awschain/handlers/processors/__init__.py ADDED Viewed

File without changes

awschain/handlers/processors/amazon_bedrock_chat_handler.py ADDED Viewed

@@ -0,0 +1,32 @@
+import sys
+from ..abstract_handler import AbstractHandler
+from ...utils.bedrock import invoke_model
+import json
+class AmazonBedrockChatHandler(AbstractHandler):
+    """
+    Interactive chat over request["text"] using invoke_model.
+    The text is shown and used as the first message of the conversation. The
+    user is then prompted in a loop until the session is ended with Ctrl+C or
+    end-of-input. On exit the user may choose to replace request["text"] with
+    the JSON-encoded chat history for the next handler.
+    """
+    def handle(self, request: dict) -> dict:
+        text = request.get("text")
+        print("Hello, I'm the Amazon Bedrock chat hanlder buddy.\n")
+        print(text)
+        print("\n\n------\nYou can ask any questions on the text above. To terminate the chat session, use Ctrl+C")
+        # NOTE(review): the seed message and the first question are both sent
+        # with role "user"; confirm invoke_model accepts consecutive user turns.
+        chat_history = [{"role": "user", "content": text}]
+        try:
+            while True:
+                user_input = input("> ")
+                chat_history.append({"role": "user", "content": user_input})
+                messages = json.dumps({"messages": chat_history})
+                # Assume summarize_text_with_bedrock is adapted to handle chat
+                response = invoke_model(messages)
+                print(f"Model: {response}")
+                chat_history.append({"role": "assistant", "content": response})
+        except (KeyboardInterrupt, EOFError):
+            # EOF (closed or piped stdin) ends the session just like Ctrl+C.
+            try:
+                exit_choice = input("\n ---------------- Terminating Chat Session ---------------- \n Type 'yes' to include the chat history for your next handler? ")
+            except (KeyboardInterrupt, EOFError):
+                # No answer possible: keep the original text for the next handler.
+                exit_choice = ""
+            if exit_choice == "yes":
+                request['text'] = json.dumps({"messages": chat_history})
+        return super().handle(request)

awschain/handlers/processors/amazon_bedrock_handler.py ADDED Viewed

@@ -0,0 +1,62 @@
+from ...utils.bedrock import invoke_model
+from ..abstract_handler import AbstractHandler
+import botocore.exceptions
+class AmazonBedrockHandler(AbstractHandler):
+    """
+    Summarizes request["text"] with invoke_model and stores the summary back
+    into request["text"]. When the model rejects the input with a
+    ValidationException, the text is split into progressively more chunks and
+    each chunk is summarized separately.
+    """
+    def handle(self, request: dict) -> dict:
+        text = request.get("text", None)
+        if not text:
+            # Nothing to summarize; forward the request untouched instead of
+            # failing on len(None) or sending an empty prompt to the model.
+            print("No text provided for summarization.")
+            return super().handle(request)
+        print("Summarizing text with Bedrock. Text Len:", len(text))
+        summary = self.summarize_with_retry(text)
+        request.update({"text": summary})
+        return super().handle(request)
+    def summarize_with_retry(self, text: str) -> str:
+        """Summarize `text` in one call, falling back to chunking on ValidationException."""
+        try:
+            return invoke_model(text)
+        except botocore.exceptions.ClientError as e:
+            if e.response['Error']['Code'] == 'ValidationException':
+                return self.chunk_and_summarize(text)
+            raise
+    def chunk_and_summarize(self, text: str) -> str:
+        """
+        Summarize `text` chunk by chunk, doubling the chunk count after every
+        ValidationException, and join the chunk summaries with newlines.
+        Raises RuntimeError after 10 failed attempts; other client errors
+        propagate unchanged.
+        """
+        max_attempts = 10
+        num_chunks = 2
+        attempt = 0
+        print("Retrying summarization of text with Bedrock with chunking")
+        while attempt < max_attempts:
+            chunks = self.split_text(text, num_chunks)
+            try:
+                summaries = [invoke_model(chunk) for chunk in chunks]
+                return "\n".join(summaries)
+            except botocore.exceptions.ClientError as e:
+                if e.response['Error']['Code'] != 'ValidationException':
+                    raise
+                num_chunks *= 2  # Increase the number of chunks
+                attempt += 1
+                print(f"Attempt {attempt}: Splitting text into {num_chunks} chunks")
+        raise RuntimeError("Failed to summarize text due to input length constraints after multiple attempts.")
+    def split_text(self, text: str, num_chunks: int) -> list:
+        """
+        Splits the text into at most `num_chunks` chunks, respecting word boundaries.
+        Each cut is moved forward to the next space. Chunks that end up empty
+        (short texts, or texts without spaces) are dropped so that no empty
+        prompt is ever sent to the model.
+        """
+        avg_chunk_length = len(text) // num_chunks
+        chunks = []
+        start = 0
+        for _ in range(num_chunks - 1):
+            end = start + avg_chunk_length
+            # Advance to the next space so words are never cut in half.
+            while end < len(text) and text[end] != ' ':
+                end += 1
+            chunks.append(text[start:end].strip())
+            start = end
+        chunks.append(text[start:].strip())
+        return [chunk for chunk in chunks if chunk]

awschain/handlers/processors/amazon_comprehend_insights_handler.py ADDED Viewed

@@ -0,0 +1,95 @@
+from ..abstract_handler import AbstractHandler
+from ...utils.aws_boto_client_manager import AWSBotoClientManager
+class AmazonComprehendInsightsHandler(AbstractHandler):
+    """
+    Extracts sentiment, entities and key phrases from request["text"] with
+    Amazon Comprehend and replaces request["text"] with a dict holding the
+    aggregated results ("sentiment", "entities", "key_phrases").
+    """
+    def handle(self, request: dict) -> dict:
+        self.comprehend = AWSBotoClientManager.get_client('comprehend')
+        # Per-chunk byte budget used by chunk_text().
+        # NOTE(review): chosen to stay below Amazon Comprehend's per-request
+        # text size limit - confirm the exact limit against the API reference.
+        self.max_bytes = 3000
+        print("Extracting insights from text...")
+        text = request.get("text", None)
+        if text:
+            text_chunks = self.chunk_text(text)
+            detected = [self.detect_sentiment(chunk) for chunk in text_chunks]
+            # Failed detections return None and must not take part in the vote.
+            sentiments = [sentiment for sentiment in detected if sentiment is not None]
+            entities = []
+            key_phrases = []
+            for chunk in text_chunks:
+                entities.extend(self.detect_entities(chunk))
+                key_phrases.extend(self.detect_key_phrases(chunk))
+            # Aggregate the insights and append to the request object
+            aggregated_data = {
+                # Most frequent sentiment across chunks; None when none was detected.
+                "sentiment": max(set(sentiments), key=sentiments.count) if sentiments else None,
+                "entities": entities,  # Entities from all chunks
+                "key_phrases": key_phrases  # Key phrases from all chunks
+            }
+            # updating the request body and adding the aggregated data.
+            request.update({"text": aggregated_data})
+        else:
+            print("No text provided for insights extraction.")
+        return super().handle(request)
+    def chunk_text(self, text):
+        """
+        Breaks the text into chunks of whole words whose UTF-8 size, including
+        the joining spaces, stays within self.max_bytes.
+        A single word larger than the budget becomes a chunk of its own.
+        """
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_size = 0
+        for word in words:
+            word_size = len(word.encode('utf-8'))
+            # One extra byte for the space that will join this word to the chunk.
+            separator_size = 1 if current_chunk else 0
+            if current_chunk and current_size + separator_size + word_size > self.max_bytes:
+                chunks.append(" ".join(current_chunk))
+                current_chunk = [word]
+                current_size = word_size
+            else:
+                current_chunk.append(word)
+                current_size += separator_size + word_size
+        if current_chunk:
+            chunks.append(" ".join(current_chunk))
+        return chunks
+    def detect_sentiment(self, text):
+        """
+        Detects the sentiment of the given text using Amazon Comprehend.
+        Returns the sentiment label, or None when the call fails (best effort).
+        """
+        try:
+            response = self.comprehend.detect_sentiment(Text=text, LanguageCode='en')
+            return response.get("Sentiment")
+        except Exception as e:
+            print(f"Error detecting sentiment: {e}")
+            return None
+    def detect_entities(self, text):
+        """
+        Detects entities in the given text using Amazon Comprehend.
+        Returns a list of {"Text", "Type", "Score"} dicts, or [] when the call fails.
+        """
+        try:
+            response = self.comprehend.detect_entities(Text=text, LanguageCode='en')
+            entities = response.get("Entities") or []
+            return [{"Text": entity["Text"], "Type": entity["Type"], "Score": entity["Score"]} for entity in entities]
+        except Exception as e:
+            print(f"Error detecting entities: {e}")
+            return []
+    def detect_key_phrases(self, text):
+        """
+        Detects key phrases in the given text using Amazon Comprehend.
+        Returns a list of {"Text", "Score"} dicts, or [] when the call fails.
+        """
+        try:
+            response = self.comprehend.detect_key_phrases(Text=text, LanguageCode='en')
+            key_phrases = response.get("KeyPhrases") or []
+            return [{"Text": phrase["Text"], "Score": phrase["Score"]} for phrase in key_phrases]
+        except Exception as e:
+            print(f"Error detecting key phrases: {e}")
+            return []

awschain/handlers/processors/amazon_comprehend_pii_classifier_handler.py ADDED Viewed

@@ -0,0 +1,42 @@
+from ..abstract_handler import AbstractHandler
+from ...utils.aws_boto_client_manager import AWSBotoClientManager
+class AmazonComprehendPIIClassifierHandler(AbstractHandler):
+    """
+    Flags whether request["text"] contains PII using Amazon Comprehend.
+    Adds "is_pii" (bool) and "detected_pii" (list of PII type names) to the
+    request; the text itself is left unchanged.
+    """
+    def handle(self, request: dict) -> dict:
+        self.comprehend = AWSBotoClientManager.get_client('comprehend')
+        print("Starting PII Classification...")
+        text = request.get("text", None)
+        # Process the text to check for PII
+        is_pii, pii_types = self.classify_pii(text)
+        # Update the request with PII detection results
+        request.update({
+            "is_pii": is_pii,
+            "detected_pii": pii_types
+        })
+        return super().handle(request)
+    def classify_pii(self, text):
+        """
+        Classifies text to detect PII and identify the types of PII found.
+        Returns a tuple (is_pii_detected, sorted list of PII type names).
+        Missing or empty text yields (False, []) without calling the service.
+        """
+        if not text:
+            return False, []
+        max_length = 5000  # character length, adjust based on the encoding
+        chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
+        detected_pii_types = set()
+        for chunk in chunks:
+            pii_entities = self.detect_pii(chunk)
+            for entity in pii_entities.get('Entities', []):
+                detected_pii_types.add(entity['Type'])
+        # Sorted so the reported list has a stable order between runs.
+        return bool(detected_pii_types), sorted(detected_pii_types)
+    def detect_pii(self, text):
+        """
+        Detects PII data in the given text using Amazon Comprehend.
+        """
+        return self.comprehend.detect_pii_entities(Text=text, LanguageCode='en')