auris_tools 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of auris_tools might be problematic.
- auris_tools/__init__.py +0 -0
- auris_tools/configuration.py +81 -0
- auris_tools/databaseHandlers.py +132 -0
- auris_tools/geminiHandler.py +245 -0
- auris_tools/officeWordHandler.py +271 -0
- auris_tools/storageHandler.py +195 -0
- auris_tools/textractHandler.py +169 -0
- auris_tools/utils.py +120 -0
- auris_tools-0.0.1.dist-info/METADATA +76 -0
- auris_tools-0.0.1.dist-info/RECORD +11 -0
- auris_tools-0.0.1.dist-info/WHEEL +4 -0
auris_tools/__init__.py
ADDED
File without changes
auris_tools/configuration.py
ADDED
@@ -0,0 +1,81 @@
```python
import logging
import os

from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()


class AWSConfiguration:
    """
    AWS Configuration class that handles credentials and region settings.
    Prefers explicitly passed constructor parameters, falling back to
    environment variables.
    """

    def __init__(
        self,
        access_key: str = None,
        secret_key: str = None,
        region: str = None,
        profile: str = None,
        endpoint_url: str = None,
    ):
        # Prefer constructor arguments, falling back to environment variables
        self.access_key = (
            access_key if access_key else os.environ.get('AWS_ACCESS_KEY_ID')
        )
        self.secret_key = (
            secret_key
            if secret_key
            else os.environ.get('AWS_SECRET_ACCESS_KEY')
        )
        self.region = (
            region
            if region
            else os.environ.get('AWS_DEFAULT_REGION') or 'us-east-1'
        )
        self.profile = profile if profile else os.environ.get('AWS_PROFILE')
        self.endpoint_url = (
            endpoint_url
            if endpoint_url
            else os.environ.get('AWS_ENDPOINT_URL')
        )

        # Validate configuration
        self._validate_config()

    def _validate_config(self):
        """Validate that we have enough configuration to proceed."""
        if not ((self.access_key and self.secret_key) or self.profile):
            logging.warning(
                'No AWS credentials provided via environment variables or constructor. '
                'AWS operations may fail unless credentials are configured via '
                '~/.aws/credentials, IAM roles, or other AWS credential providers.'
            )

    def get_boto3_session_args(self):
        """
        Return a dictionary of arguments that can be passed to boto3.session.Session()
        """
        session_args = {'region_name': self.region}

        if self.access_key and self.secret_key:
            session_args['aws_access_key_id'] = self.access_key
            session_args['aws_secret_access_key'] = self.secret_key

        if self.profile:
            session_args['profile_name'] = self.profile

        return session_args

    def get_client_args(self):
        """
        Return a dictionary of arguments that can be passed to boto3 client creation
        """
        client_args = {}

        if self.endpoint_url:
            client_args['endpoint_url'] = self.endpoint_url

        return client_args
```
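For context, here is a minimal sketch of how this configuration object is meant to feed boto3; the region value and the LocalStack-style endpoint URL are hypothetical choices for illustration, not defaults shipped by the package.

```python
import boto3

from auris_tools.configuration import AWSConfiguration

# Explicit arguments win over environment variables; region falls back to 'us-east-1'.
config = AWSConfiguration(
    region='us-east-1',
    endpoint_url='http://localhost:4566',  # hypothetical LocalStack endpoint
)

# The two helper methods split cleanly between session-level and client-level args.
session = boto3.session.Session(**config.get_boto3_session_args())
s3 = session.client('s3', **config.get_client_args())
print(s3.list_buckets())
```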
auris_tools/databaseHandlers.py
ADDED
@@ -0,0 +1,132 @@
```python
import logging

import boto3
from boto3.dynamodb.types import TypeDeserializer, TypeSerializer

from auris_tools.configuration import AWSConfiguration
from auris_tools.utils import generate_uuid


class DatabaseHandler:
    def __init__(self, table_name, config=None):
        """
        Initialize the database handler.

        Args:
            table_name: Name of the DynamoDB table.
            config: An AWSConfiguration object, or None to use environment variables.
        """
        self.table_name = table_name
        if config is None:
            config = AWSConfiguration()

        # Create a boto3 session with the configuration
        session = boto3.session.Session(**config.get_boto3_session_args())

        # Create a DynamoDB client with additional configuration if needed
        self.client = session.client('dynamodb', **config.get_client_args())

        if not self._check_table_exists(table_name):
            raise Exception(f'Table does not exist: {table_name}')

        logging.info(f'Initialized DynamoDB client in region {config.region}')

    def insert_item(self, item, primary_key: str = 'id'):
        """Insert an item with automatic type conversion"""
        if not isinstance(item, dict):
            raise TypeError('Item must be a dictionary')

        if primary_key not in item:
            item[primary_key] = generate_uuid()

        dynamo_item = self._serialize_item(item)
        response = self.client.put_item(
            TableName=self.table_name, Item=dynamo_item
        )
        return response

    def get_item(self, key):
        """
        Retrieve an item from a DynamoDB table.

        Args:
            key: A dictionary representing the key of the item to retrieve.

        Returns:
            The retrieved item, or None if not found.
        """
        if not isinstance(key, dict):
            raise TypeError('Key must be a dictionary')

        # Check if the key is in DynamoDB format (i.e., values are dicts with type keys)
        if not all(isinstance(v, dict) and len(v) == 1 for v in key.values()):
            # Convert to DynamoDB format
            key = self._serialize_item(key)

        try:
            response = self.client.get_item(TableName=self.table_name, Key=key)
            return response.get('Item')
        except Exception as e:
            logging.error(
                f'Error retrieving item from {self.table_name}: {str(e)}'
            )
            return None

    def delete_item(self, key, primary_key='id'):
        """
        Delete an item from a DynamoDB table.

        Args:
            key (str or dict): Either a string identifier for the primary key,
                or a dictionary containing the complete key structure.
            primary_key (str, optional): Name of the primary key field. Defaults to 'id'.

        Returns:
            bool: True if deletion was successful, False otherwise.
        """
        # Convert string key to a dictionary with the primary key
        if isinstance(key, str):
            key = {primary_key: key}
        elif not isinstance(key, dict):
            raise TypeError('Key must be a string identifier or a dictionary')

        # Check if the key is in DynamoDB format
        if not self.item_is_serialized(key):
            key = self._serialize_item(key)

        try:
            self.client.delete_item(
                TableName=self.table_name,
                Key=key,
                ReturnValues='ALL_OLD',  # Return the deleted item
            )
            logging.info(f'Deleted item from {self.table_name} with key {key}')
            return True
        except Exception as e:
            logging.error(
                f'Error deleting item from {self.table_name}: {str(e)}'
            )
            return False

    def item_is_serialized(self, item):
        """Check if an item is in DynamoDB serialized format"""
        return all(isinstance(v, dict) and len(v) == 1 for v in item.values())

    def _serialize_item(self, item):
        """Convert Python types to DynamoDB format"""
        serializer = TypeSerializer()
        return {k: serializer.serialize(v) for k, v in item.items()}

    def _deserialize_item(self, item):
        """Convert DynamoDB format back to Python types"""
        deserializer = TypeDeserializer()
        return {k: deserializer.deserialize(v) for k, v in item.items()}

    def _check_table_exists(self, table_name):
        """Check if a DynamoDB table exists"""
        try:
            existing_tables = self.client.list_tables().get('TableNames', [])
            return table_name in existing_tables
        except Exception as e:
            logging.error(f'Error checking table existence: {str(e)}')
            return False
```
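A minimal round-trip sketch for `DatabaseHandler` follows; the table name `documents` is hypothetical and must already exist, since the constructor raises when `list_tables` does not report it. Note that `insert_item` mutates the passed dictionary, which is how the generated `id` is read back here.

```python
from auris_tools.databaseHandlers import DatabaseHandler

# 'documents' is a hypothetical, pre-existing table.
db = DatabaseHandler('documents')

# insert_item serializes plain Python values and auto-fills the 'id' key.
item = {'title': 'Quarterly report', 'pages': 12}
db.insert_item(item)

# get_item accepts a plain key and serializes it on the fly.
stored = db.get_item({'id': item['id']})
print(stored)  # still in DynamoDB wire format; _deserialize_item converts back

db.delete_item(item['id'])
```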
auris_tools/geminiHandler.py
ADDED
@@ -0,0 +1,245 @@
```python
import logging
import os

import google.generativeai as genai
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

logger = logging.getLogger(__name__)


class GoogleGeminiHandler:
    """A handler class for interacting with Google's Gemini AI models.

    This class provides a convenient interface for generating content using Google's
    Gemini generative AI models. It handles authentication, model configuration,
    and content generation with automatic error handling and logging.

    Attributes:
        api_key (str): The Google AI API key used for authentication.
        model_name (str): The name of the Gemini model to use.
        temperature (float): Controls randomness in generation (0.0 to 1.0).
        response_schema (dict): Optional schema for structured responses.
        response_mime_type (str): MIME type for response format.
        generation_config (genai.types.GenerationConfig): Configuration for content generation.
        model (genai.GenerativeModel): The configured Gemini model instance.

    Example:
        Basic usage with environment variable API key:

        >>> handler = GoogleGeminiHandler()
        >>> response = handler.generate_output("What is artificial intelligence?")
        >>> text = handler.get_text(response)

        Usage with custom parameters:

        >>> handler = GoogleGeminiHandler(
        ...     api_key="your-api-key",
        ...     model="gemini-2.0-flash-exp",
        ...     temperature=0.7,
        ...     response_mime_type="text/plain"
        ... )
    """

    def __init__(
        self, api_key: str = None, model: str = 'gemini-2.5-flash', **kwargs
    ):
        """Initialize the Google Gemini handler.

        Args:
            api_key (str, optional): Google AI API key. If not provided, will attempt
                to load from GEMINI_API_KEY environment variable. Defaults to None.
            model (str, optional): Name of the Gemini model to use.
                Defaults to 'gemini-2.5-flash'.
            **kwargs: Additional configuration parameters:
                - temperature (float): Controls randomness (0.0-1.0). Defaults to 0.5.
                - response_schema (dict): Schema for structured responses. Defaults to None.
                - response_mime_type (str): Response MIME type. Defaults to 'application/json'.

        Raises:
            TypeError: If the specified model is not available.

        Example:
            >>> handler = GoogleGeminiHandler(
            ...     api_key="your-api-key",
            ...     model="gemini-2.0-flash-exp",
            ...     temperature=0.7
            ... )
        """

        self.api_key = api_key if api_key else os.getenv('GEMINI_API_KEY')
        if self.api_key is None:
            logger.error(
                'Gemini API key not configured. Please define the GEMINI_API_KEY environment variable or pass your key directly in the code.'
            )

        self.model_name = model
        self._check_model_availability()

        # More configuration from input parameters
        self.temperature = kwargs.get('temperature', 0.5)
        self.response_schema = kwargs.get('response_schema', None)
        self.response_mime_type = kwargs.get(
            'response_mime_type', 'application/json'
        )

        self.generation_config = genai.types.GenerationConfig(
            temperature=self.temperature,
            response_schema=self.response_schema,
            response_mime_type=self.response_mime_type,
        )

        self.model = genai.GenerativeModel(
            generation_config=self.generation_config,
            model_name=self.model_name,
        )

    def generate_output(
        self, prompt: str, input_data: str = None, input_mime_type: str = None
    ):
        """Generate content using the configured Gemini model.

        This method sends a prompt to the Gemini model and returns the generated response.
        It supports both text-only prompts and multimodal inputs with additional data.

        Args:
            prompt (str): The text prompt to send to the model. This is the main
                instruction or question for the AI to respond to.
            input_data (str, optional): Additional input data to include with the prompt.
                This could be text content, encoded media, or other data. Requires
                input_mime_type to be specified. Defaults to None.
            input_mime_type (str, optional): MIME type of the input_data. Required if
                input_data is provided. Examples: 'text/plain', 'image/jpeg',
                'application/pdf'. Defaults to None.

        Returns:
            genai.types.GenerateContentResponse or str: The response from the Gemini model
                if successful, or an empty string if an error occurred.

        Raises:
            ValueError: If input_data is provided without input_mime_type or vice versa.

        Example:
            Text-only generation:

            >>> response = handler.generate_output("Explain quantum computing")

            Multimodal generation with additional data:

            >>> response = handler.generate_output(
            ...     prompt="Describe this image",
            ...     input_data=base64_encoded_image,
            ...     input_mime_type="image/jpeg"
            ... )
        """
        if (input_data is not None and input_mime_type is None) or (
            input_data is None and input_mime_type is not None
        ):
            raise ValueError(
                'input_mime_type must be provided if input_data is given; otherwise both must be None.'
            )

        if input_data and input_mime_type:  # Add input data if provided
            prompt = [
                prompt,
                {'mime_type': input_mime_type, 'content': input_data},
            ]

        try:
            response = self.model.generate_content(prompt)
            return response
        except Exception as e:
            logger.error(f'Error generating LLM output: {str(e)}')
            return ''

    def get_text(self, response) -> str:
        """Extract text content from a Gemini model response.

        This method parses the response object returned by the Gemini model and
        extracts the generated text content. It handles the response structure
        safely and provides fallbacks for various response formats.

        Args:
            response (genai.types.GenerateContentResponse or dict): The response object
                returned from the generate_output method. This can be either a
                GenerateContentResponse object or a dictionary representation.

        Returns:
            str: The extracted text content from the response. Returns an empty
                string if no content is found or if an error occurs during extraction.

        Example:
            >>> response = handler.generate_output("What is AI?")
            >>> text_content = handler.get_text(response)
            >>> print(text_content)
            "Artificial Intelligence (AI) refers to..."

            >>> # Handle case with no candidates
            >>> empty_response = {'candidates': []}
            >>> text = handler.get_text(empty_response)
            >>> print(text)  # Returns empty string
            ""
        """
        try:
            if 'candidates' in response and len(response['candidates']) > 0:
                return response['candidates'][0]['content']
            else:
                logger.warning('No candidates found in the response.')
                return ''
        except Exception as e:
            logger.error(f'Error extracting text from response: {str(e)}')
            return ''

    def _check_model_availability(self):
        """Check if the specified Gemini model is available.

        This private method validates that the requested model name exists in the
        list of available Google Gemini models. It queries the Google AI API to
        get the current list of available models and compares against the requested
        model name.

        Raises:
            TypeError: If the specified model is not found in the list of available
                models from the Google AI API.

        Note:
            This method is called automatically during initialization and will
            prevent the handler from being created if an invalid model is specified.
            It also logs the availability check results for debugging purposes.

        Example:
            This method is called internally during initialization:

            >>> # This will call _check_model_availability internally
            >>> handler = GoogleGeminiHandler(model="gemini-2.5-flash")  # Success
            >>> handler = GoogleGeminiHandler(model="invalid-model")  # Raises TypeError
        """
        try:
            available_models = genai.list_models()
            # Extract model names and handle the 'models/' prefix
            available_model_names = []
            for model in available_models:
                model_name = model.name
                # Remove 'models/' prefix if present
                if model_name.startswith('models/'):
                    model_name = model_name[7:]  # Remove 'models/' prefix
                available_model_names.append(model_name)

            if self.model_name not in available_model_names:
                logger.error(
                    f'Model {self.model_name} is not available. Please check the model name.'
                )
                logger.info(
                    f'Available models: {", ".join(available_model_names)}'
                )
                raise TypeError(f'Invalid model name: {self.model_name}')
            else:
                logger.info(f'Model {self.model_name} is available.')
        except Exception as e:
            if 'Invalid model name' in str(e):
                raise  # Re-raise our custom error
            else:
                logger.error(f'Error checking model availability: {str(e)}')
                # Don't raise error for API connectivity issues, just log
```
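A usage sketch for `GoogleGeminiHandler`, with one assumption worth flagging: the handler stores `api_key` but, as published, never passes it to the SDK, so this sketch calls `genai.configure` first (the library may also pick a key up from its own environment lookup). The API key is a placeholder.

```python
import google.generativeai as genai

from auris_tools.geminiHandler import GoogleGeminiHandler

# The handler does not call genai.configure itself; configure the SDK first.
genai.configure(api_key='your-api-key')  # hypothetical key

handler = GoogleGeminiHandler(
    model='gemini-2.5-flash',
    temperature=0.2,
    response_mime_type='text/plain',
)
response = handler.generate_output('Summarize the Auris toolkit in one sentence.')
print(handler.get_text(response))
```

Note also that `get_text` indexes the response like a dictionary; for a raw `GenerateContentResponse` object, the SDK's own `response.text` accessor may be the more direct route.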
auris_tools/officeWordHandler.py
ADDED
@@ -0,0 +1,271 @@
```python
import io
import logging
import re
from typing import List, Optional

import boto3
from docx import Document
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx.text.paragraph import Paragraph

from auris_tools.configuration import AWSConfiguration


class OfficeWordHandler:
    """
    Handler for DOCX operations including text extraction and manipulation.

    This class provides methods to interact with Microsoft Word documents (DOCX)
    stored in S3, including reading, extracting text, and text replacement operations.
    """

    def __init__(self, config=None):
        """
        Initialize the Office Word handler with AWS configuration.

        Args:
            config: An AWSConfiguration object, or None to use environment variables
        """
        if config is None:
            config = AWSConfiguration()

        # Create a boto3 session with the configuration
        session = boto3.session.Session(**config.get_boto3_session_args())

        # Create an S3 client with additional configuration if needed
        self.s3_client = session.client('s3', **config.get_client_args())
        logging.info(f'Initialized S3 client in region {config.region}')

    def read_from_s3(self, bucket_name, object_name, as_bytes_io=False):
        """
        Read a DOCX file from S3 and return its bytes.

        Args:
            bucket_name: Name of the S3 bucket containing the document
            object_name: Object key of the document in the S3 bucket
            as_bytes_io: If True, return a BytesIO object instead of raw bytes

        Returns:
            bytes or BytesIO: The document content

        Raises:
            Exception: If there is an error retrieving the document
        """
        try:
            response = self.s3_client.get_object(
                Bucket=bucket_name, Key=object_name
            )
            content = response['Body'].read()

            if as_bytes_io:
                return io.BytesIO(content)
            return content
        except Exception as e:
            logging.error(
                f'Error reading document from {bucket_name}/{object_name}: {str(e)}'
            )
            raise Exception(f'Error reading file from S3: {str(e)}')

    def upload_docx(self, docx_document, bucket_name, object_name):
        """
        Upload a DOCX document to S3.

        Args:
            docx_document: The Document object to upload
            bucket_name: Name of the S3 bucket
            object_name: Object key for the document in S3

        Returns:
            bool: True if upload was successful, False otherwise

        Raises:
            Exception: If there is an error uploading the document
        """
        try:
            logging.info(f'Starting upload to S3: {bucket_name}/{object_name}')

            # Convert document to bytes
            temp_stream = io.BytesIO()
            docx_document.save(temp_stream)
            temp_stream.seek(0)
            document_size = len(temp_stream.getvalue())

            # Upload to S3
            self.s3_client.upload_fileobj(
                temp_stream,
                Bucket=bucket_name,
                Key=object_name,
                ExtraArgs={
                    'ContentType': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                },
            )

            logging.info(
                f'Upload finished successfully. Size: {document_size} bytes'
            )
            return True
        except Exception as e:
            logging.error(f'Failed to upload to S3: {str(e)}')
            raise Exception(f'Error uploading file to S3: {str(e)}')

    def get_text_from_bytes(self, bytes_data):
        """
        Extract text from DOCX file bytes.

        Args:
            bytes_data: The document bytes

        Returns:
            str: Extracted text from the document

        Raises:
            ValueError: If there is an error extracting the text
        """
        try:
            doc = Document(io.BytesIO(bytes_data))
            full_text = []

            # Extract text from paragraphs
            for para in doc.paragraphs:
                full_text.append(para.text)

            # Extract text from tables
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        full_text.append(cell.text)

            return '\n'.join(full_text)
        except Exception as e:
            logging.error(f'Error extracting text from DOCX: {str(e)}')
            raise ValueError(f'Error extracting text from DOCX: {str(e)}')

    def clean_text(self, text):
        """
        Clean extracted text from a DOCX file.

        Args:
            text: Text to clean

        Returns:
            str: Cleaned text
        """
        if not text:
            return ''

        # Basic cleaning (can be extended)
        cleaned_text = text.strip()
        return cleaned_text

    def collect_all_paragraphs(self, document: Document) -> List[Paragraph]:
        """
        Collect all paragraphs from a Document object.

        This method collects paragraphs from the main document body,
        tables, headers, and footers.

        Args:
            document: The Document object

        Returns:
            List[Paragraph]: List of all paragraphs in the document
        """
        paragraphs = list(document.paragraphs)

        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    paragraphs.extend(cell.paragraphs)

        for section in document.sections:
            paragraphs.extend(section.header.paragraphs)
            paragraphs.extend(section.footer.paragraphs)

        return paragraphs

    def replace_placeholder_by_text(
        self,
        paragraphs: List[Paragraph],
        document: Document,
        placeholder: str,
        replacement: str,
        max_count: Optional[int] = None,
    ) -> int:
        """
        Replace placeholder text with replacement in the document's XML w:t nodes.

        Args:
            paragraphs: List of paragraphs to process
            document: Document object
            placeholder: Text to find and replace
            replacement: Text to insert instead of placeholder
            max_count: Maximum number of replacements, or None for unlimited

        Returns:
            int: Number of replacements made

        Note:
            This method works at the XML level to ensure proper formatting is preserved.
        """
        count = 0
        WORD_NAMESPACE = (
            'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
        )
        T_TAG = f'{{{WORD_NAMESPACE}}}t'

        if placeholder in replacement:
            logging.warning(
                f'Replacement skipped to avoid recursion: {placeholder} -> {replacement}'
            )
            return 0

        def replace_in_element(element):
            nonlocal count
            for node in element.iter(tag=T_TAG):
                if node.text and placeholder in node.text:
                    remaining = (
                        None if max_count is None else max_count - count
                    )
                    new_text, n = re.subn(
                        re.escape(placeholder),
                        replacement,
                        node.text,
                        count=remaining if remaining else 0,
                    )
                    if n > 0:
                        node.text = new_text
                        count += n
                        if max_count is not None and count >= max_count:
                            return True
            return False

        # Main paragraphs
        for para in paragraphs:
            if replace_in_element(para._element):
                return count

        # Headers/footers
        for section in document.sections:
            for container in [section.header, section.footer]:
                for para in container.paragraphs:
                    if replace_in_element(para._element):
                        return count

        # Tables
        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    for para in cell.paragraphs:
                        if replace_in_element(para._element):
                            return count

        # Hyperlinks
        for rel in document.part.rels.values():
            if rel.reltype == RT.HYPERLINK and placeholder in rel.target_ref:
                logging.info(
                    f'Replacing hyperlink: {rel.target_ref} -> {rel.target_ref.replace(placeholder, replacement)}'
                )
                rel._target = rel.target_ref.replace(placeholder, replacement)
                count += 1

        return count
```
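To show how the pieces of `OfficeWordHandler` fit together, here is a sketch of a template-filling flow; the bucket, object keys, and `{{CLIENT_NAME}}` placeholder are all hypothetical.

```python
from docx import Document

from auris_tools.officeWordHandler import OfficeWordHandler

handler = OfficeWordHandler()

# Read a hypothetical template from S3 as a BytesIO stream python-docx can open.
template_stream = handler.read_from_s3(
    'my-bucket', 'templates/letter.docx', as_bytes_io=True
)
doc = Document(template_stream)

# Gather paragraphs from body, tables, headers, and footers, then substitute.
paragraphs = handler.collect_all_paragraphs(doc)
replaced = handler.replace_placeholder_by_text(
    paragraphs, doc, placeholder='{{CLIENT_NAME}}', replacement='ACME Ltd.'
)
print(f'{replaced} replacement(s) made')

handler.upload_docx(doc, 'my-bucket', 'output/letter-acme.docx')
```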
auris_tools/storageHandler.py
ADDED
@@ -0,0 +1,195 @@
```python
import logging
from http import HTTPStatus

import boto3

from auris_tools.configuration import AWSConfiguration


class StorageHandler:
    def __init__(self, config=None):
        """
        Initialize the storage handler with AWS configuration.

        Args:
            config: An AWSConfiguration object, or None to use environment variables
        """
        if config is None:
            config = AWSConfiguration()

        # Create a boto3 session with the configuration
        session = boto3.session.Session(**config.get_boto3_session_args())

        # Create an S3 client with additional configuration if needed
        self.client = session.client('s3', **config.get_client_args())
        logging.info(f'Initialized S3 client in region {config.region}')

    def upload_file(self, file_path, bucket_name, object_name):
        """
        Upload a file to an S3 bucket.

        Args:
            file_path: Path to the file to upload
            bucket_name: Name of the bucket to upload to
            object_name: S3 object name (key)

        Returns:
            True if file was uploaded successfully, else False
        """
        try:
            self.client.upload_file(file_path, bucket_name, object_name)
            logging.info(
                f'Uploaded {file_path} to {bucket_name}/{object_name}'
            )
            return True
        except Exception as e:
            logging.error(f'Error uploading file {file_path}: {str(e)}')
            return False

    def download_file(self, bucket_name, object_name, file_path):
        """
        Download a file from an S3 bucket.

        Args:
            bucket_name: Bucket name
            object_name: S3 object name (key)
            file_path: Path where the file should be saved

        Returns:
            True if file was downloaded successfully, else False
        """
        try:
            self.client.download_file(bucket_name, object_name, file_path)
            logging.info(
                f'Downloaded {bucket_name}/{object_name} to {file_path}'
            )
            return True
        except Exception as e:
            logging.error(f'Error downloading file {object_name}: {str(e)}')
            return False

    def get_file_object(self, bucket_name, object_name, as_bytes=False):
        """
        Get a file object from an S3 bucket.

        Args:
            bucket_name: Bucket name
            object_name: S3 object name (key)
            as_bytes: If True, return the content as bytes instead of a streaming object

        Returns:
            S3 streaming object, bytes if as_bytes=True, or None if not found
        """
        try:
            response = self.client.get_object(
                Bucket=bucket_name, Key=object_name
            )
            if as_bytes:
                return response['Body'].read()
            return response['Body']
        except Exception as e:
            logging.error(f'Error getting file object {object_name}: {str(e)}')
            return None

    def check_file_exists(self, bucket_name, object_name):
        """
        Check if a file exists in an S3 bucket.

        Args:
            bucket_name: Bucket name
            object_name: S3 object name (key)

        Returns:
            True if file exists, else False
        """
        try:
            self.client.head_object(Bucket=bucket_name, Key=object_name)
            return True
        except Exception:
            return False

    def check_file_size(self, bucket_name, object_name):
        """
        Get the size of a file in an S3 bucket.

        Args:
            bucket_name: Bucket name
            object_name: S3 object name (key)

        Returns:
            File size in bytes or None if file doesn't exist
        """
        try:
            response = self.client.head_object(
                Bucket=bucket_name, Key=object_name
            )
            return response.get('ContentLength')
        except Exception as e:
            logging.error(
                f'Error checking file size for {object_name}: {str(e)}'
            )
            return None

    def delete_file(self, bucket_name, object_name):
        """
        Delete a file from an S3 bucket.

        Args:
            bucket_name: Bucket name
            object_name: S3 object name (key)

        Returns:
            True if file was deleted successfully, else False
        """
        try:
            # Check if file exists before attempting deletion
            if not self.check_file_exists(bucket_name, object_name):
                logging.warning(
                    f'File {bucket_name}/{object_name} does not exist.'
                )
                return False

            response = self.client.delete_object(
                Bucket=bucket_name, Key=object_name
            )
            status_code = response.get('ResponseMetadata', {}).get(
                'HTTPStatusCode'
            )
            # Both 200 (OK) and 204 (No Content) are successful responses
            if status_code not in (HTTPStatus.OK, HTTPStatus.NO_CONTENT):
                logging.error(
                    f'Failed to delete {bucket_name}/{object_name}, status code: {status_code}'
                )
                return False

            logging.info(f'Deleted {bucket_name}/{object_name}')
            return True
        except Exception as e:
            logging.error(f'Error deleting file {object_name}: {str(e)}')
            return False

    def list_files(self, bucket_name, prefix=''):
        """
        List files in an S3 bucket with optional prefix filtering.

        Args:
            bucket_name: Bucket name
            prefix: Prefix to filter objects (folder path)

        Returns:
            List of object keys or empty list if error occurs
        """
        try:
            response = self.client.list_objects_v2(
                Bucket=bucket_name, Prefix=prefix
            )
            files = []
            if 'Contents' in response:
                for obj in response['Contents']:
                    files.append(obj['Key'])
            return files
        except Exception as e:
            logging.error(
                f'Error listing files in {bucket_name}/{prefix}: {str(e)}'
            )
            return []
```
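A short sketch of the `StorageHandler` surface; bucket and key names are hypothetical.

```python
from auris_tools.storageHandler import StorageHandler

storage = StorageHandler()

# Upload, then confirm the stored size via a HEAD request.
if storage.upload_file('report.pdf', 'my-bucket', 'incoming/report.pdf'):
    size = storage.check_file_size('my-bucket', 'incoming/report.pdf')
    print(f'Uploaded {size} bytes')

# Enumerate everything under a prefix.
for key in storage.list_files('my-bucket', prefix='incoming/'):
    print(key)
```

One design note: `list_files` issues a single `list_objects_v2` call, so listings beyond 1,000 keys are silently truncated; a paginator would be needed for complete listings.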
auris_tools/textractHandler.py
ADDED
@@ -0,0 +1,169 @@
```python
import logging
import time

import boto3

from auris_tools.configuration import AWSConfiguration


class TextractHandler:
    """
    Handler for Amazon Textract operations to extract text from documents.

    This class provides methods to interact with AWS Textract service for
    text extraction from documents stored in S3.
    """

    def __init__(self, config=None):
        """
        Initialize the Textract handler with AWS configuration.

        Args:
            config: An AWSConfiguration object, or None to use environment variables
        """
        if config is None:
            config = AWSConfiguration()

        # Create a boto3 session with the configuration
        session = boto3.session.Session(**config.get_boto3_session_args())

        # Create a Textract client with additional configuration if needed
        self.client = session.client('textract', **config.get_client_args())
        logging.info(f'Initialized Textract client in region {config.region}')

    def start_job(self, s3_bucket_name, object_name):
        """
        Start an asynchronous text detection job for a document in S3.

        Args:
            s3_bucket_name: Name of the S3 bucket containing the document
            object_name: Object key of the document in the S3 bucket

        Returns:
            str: The JobId of the started Textract job

        Raises:
            Exception: If there is an error starting the job
        """
        try:
            response = self.client.start_document_text_detection(
                DocumentLocation={
                    'S3Object': {'Bucket': s3_bucket_name, 'Name': object_name}
                }
            )
            job_id = response['JobId']
            logging.info(
                f'Started Textract job {job_id} for {s3_bucket_name}/{object_name}'
            )
            return job_id
        except Exception as e:
            logging.error(
                f'Error starting Textract job for {s3_bucket_name}/{object_name}: {str(e)}'
            )
            raise

    def get_job_status(self, job_id):
        """
        Get the status of a Textract job.

        Args:
            job_id: ID of the Textract job

        Returns:
            str: The job status (e.g., 'IN_PROGRESS', 'SUCCEEDED', 'FAILED')
        """
        try:
            response = self.client.get_document_text_detection(JobId=job_id)
            status = response['JobStatus']
            logging.info(f'Textract job {job_id} status: {status}')
            return status
        except Exception as e:
            logging.error(
                f'Error getting status for Textract job {job_id}: {str(e)}'
            )
            raise

    def is_job_complete(self, job_id):
        """
        Check if a Textract job has completed.

        Args:
            job_id: ID of the Textract job

        Returns:
            str: The job status
        """
        time.sleep(1)  # Avoid rate limiting
        return self.get_job_status(job_id)

    def get_job_results(self, job_id):
        """
        Get the results of a completed Textract job.

        This method handles pagination of results automatically.

        Args:
            job_id: ID of the Textract job

        Returns:
            list: List of response pages from Textract
        """
        pages = []
        next_token = None

        try:
            # Get first page
            response = self.client.get_document_text_detection(JobId=job_id)
            pages.append(response)
            logging.info(f'Received page 1 of results for job {job_id}')

            # Get next token if available
            if 'NextToken' in response:
                next_token = response['NextToken']

            # Get additional pages if available
            page_num = 2
            while next_token:
                time.sleep(1)  # Avoid rate limiting
                response = self.client.get_document_text_detection(
                    JobId=job_id, NextToken=next_token
                )
                pages.append(response)
                logging.info(
                    f'Received page {page_num} of results for job {job_id}'
                )
                page_num += 1

                next_token = response.get('NextToken')

            return pages
        except Exception as e:
            logging.error(
                f'Error getting results for Textract job {job_id}: {str(e)}'
            )
            raise

    def get_full_text(self, response):
        """
        Extract the full text from Textract response pages.

        Args:
            response: List of response pages from Textract

        Returns:
            str: The full extracted text as a string
        """
        try:
            text_lines = []
            for result_page in response:
                for item in result_page.get('Blocks', []):
                    if item.get('BlockType') == 'LINE':
                        text_lines.append(item.get('Text', ''))

            full_text = ' '.join(text_lines)
            return full_text
        except Exception as e:
            logging.error(
                f'Error extracting full text from Textract response: {str(e)}'
            )
            return ''
```
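The Textract flow is asynchronous, so callers poll. A minimal sketch follows, assuming a PDF already sits in a hypothetical bucket; note that despite its name, `is_job_complete` returns the status string rather than a boolean.

```python
import time

from auris_tools.textractHandler import TextractHandler

textract = TextractHandler()

# Bucket and key are hypothetical; the document must already be in S3.
job_id = textract.start_job('my-bucket', 'incoming/report.pdf')

# Poll until the job leaves IN_PROGRESS (ends in SUCCEEDED or FAILED).
while textract.is_job_complete(job_id) == 'IN_PROGRESS':
    time.sleep(5)

# get_job_results follows NextToken pagination and returns all response pages.
pages = textract.get_job_results(job_id)
print(textract.get_full_text(pages))
```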
auris_tools/utils.py
ADDED
@@ -0,0 +1,120 @@
```python
import time
from contextlib import contextmanager
from datetime import datetime
from uuid import uuid4


def collect_timestamp(as_str: bool = True):
    """
    Collect the current timestamp in ISO 8601 format.

    Parameters
    ----------
    as_str : bool, default=True
        If True, returns the timestamp as an ISO 8601 formatted string.
        If False, returns a datetime object.

    Returns
    -------
    str or datetime
        Current timestamp as an ISO 8601 formatted string (if as_str=True)
        or as a datetime object (if as_str=False).

    Examples
    --------
    >>> collect_timestamp()
    '2023-05-18T15:30:45.123456'

    >>> collect_timestamp(as_str=False)
    datetime.datetime(2023, 5, 18, 15, 30, 45, 123456)
    """
    if as_str:
        return datetime.now().isoformat()
    return datetime.now()


def parse_timestamp(timestamp_input):
    """Parse a timestamp to a datetime object.

    Args:
        timestamp_input: Either an ISO 8601 timestamp string or a datetime object.

    Returns:
        datetime: The parsed datetime object.
    """
    if isinstance(timestamp_input, datetime):
        return timestamp_input
    return datetime.fromisoformat(timestamp_input)


def generate_uuid():
    """
    Generate a unique Universally Unique Identifier (UUID) string.

    This function creates a random UUID using version 4 (random) of the UUID
    specification and returns it as a string. UUIDs are 128-bit identifiers
    that are, for practical purposes, unique across space and time.

    Returns:
        str: A string representation of a UUID4 (e.g., '9f8d8f79-2d6d-4b96-a3f5-e1f025e6379b')

    Example:
        >>> unique_id = generate_uuid()
        >>> print(unique_id)
        '9f8d8f79-2d6d-4b96-a3f5-e1f025e6379b'
    """
    return str(uuid4())


def collect_processing_time():
    """
    Context manager for measuring code execution time.

    This function provides a context manager that measures the execution time
    of code within its scope. The elapsed time is reported in seconds.

    Returns
    -------
    context manager
        Context manager that yields a function returning the current elapsed
        time in seconds when called.

    Examples
    --------
    >>> with collect_processing_time() as total_time:
    ...     # Your code here
    ...     import time
    ...     time.sleep(1)
    >>> print(f"Execution took {total_time()} seconds")
    Execution took 1.001234 seconds

    >>> # Example with multiple measurements during execution
    >>> with collect_processing_time() as get_time:
    ...     # First operation
    ...     time.sleep(0.5)
    ...     first_step = get_time()
    ...     print(f"First step: {first_step:.2f}s")
    ...
    ...     # Second operation
    ...     time.sleep(0.5)
    ...     second_step = get_time()
    ...     print(f"Second step: {second_step:.2f}s")
    First step: 0.50s
    Second step: 1.00s
    """

    @contextmanager
    def _timing_context():
        start_time = time.time()

        # Create a function to get the current elapsed time
        def get_elapsed_time():
            return time.time() - start_time

        try:
            # Yield the function that returns elapsed time
            yield get_elapsed_time
        finally:
            # No cleanup needed
            pass

    return _timing_context()
```
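These utilities are self-contained and need no credentials; a quick sketch of all three:

```python
import time

from auris_tools.utils import (
    collect_processing_time,
    collect_timestamp,
    generate_uuid,
)

print(generate_uuid())      # e.g. '9f8d8f79-2d6d-4b96-a3f5-e1f025e6379b'
print(collect_timestamp())  # ISO 8601 string for the current time

# The context manager yields a callable; calling it mid-block gives a
# running elapsed-time reading in seconds.
with collect_processing_time() as elapsed:
    time.sleep(0.2)
    print(f'elapsed so far: {elapsed():.2f}s')
```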
auris_tools-0.0.1.dist-info/METADATA
ADDED
@@ -0,0 +1,76 @@
Metadata-Version: 2.3
Name: auris_tools
Version: 0.0.1
Summary: The Swiss Army knife of tools for coordinating cloud frameworks with ease on Auris platforms
Author: Antonio Senra
Author-email: acsenrafilho@gmail.com
Requires-Python: >=3.10,<4.0
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Requires-Dist: boto3 (>=1.40.29,<2.0.0)
Requires-Dist: dotenv (>=0.9.9,<0.10.0)
Requires-Dist: google-generativeai (>=0.8.5,<0.9.0)
Requires-Dist: python-docx (>=1.2.0,<2.0.0)
Requires-Dist: rich (>=14.1.0,<15.0.0)
Description-Content-Type: text/markdown

# auris-tools

The Swiss Army knife of tools for coordinating cloud frameworks with ease on Auris platforms.

## Installation

This project requires **Python 3.10+** and uses [Poetry](https://python-poetry.org/) for dependency management.

1. **Clone the repository:**
   ```bash
   git clone https://github.com/AurisAASI/auris-tools.git
   cd auris-tools
   ```
2. **Install Poetry (if not already installed):**
   ```bash
   pip install poetry
   ```
3. **Install dependencies:**
   ```bash
   poetry install
   ```

---

## Project Structure

The main classes and modules are organized as follows:

```
/auris_tools
├── __init__.py
├── configuration.py      # AWS configuration utilities
├── databaseHandlers.py   # DynamoDB handler class
├── officeWordHandler.py  # Office Word document handler
├── storageHandler.py     # AWS S3 storage handler
├── textractHandler.py    # AWS Textract handler
├── utils.py              # Utility functions
├── geminiHandler.py      # Google Gemini AI handler
```

---

## Testing & Linting

- **Run all tests:**
  ```bash
  task test
  ```
- **Run linter (ruff):**
  ```bash
  task lint
  ```

Test coverage and linting are enforced in CI. Make sure all tests pass and code is linted before submitting a PR.

---
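Taken together, the handlers compose into a simple document pipeline. The sketch below chains S3, Textract, and Gemini end to end; every bucket, key, and prompt is hypothetical, and it presumes AWS and Gemini credentials are configured as described above.

```python
import time

from auris_tools.geminiHandler import GoogleGeminiHandler
from auris_tools.storageHandler import StorageHandler
from auris_tools.textractHandler import TextractHandler

storage = StorageHandler()
textract = TextractHandler()
gemini = GoogleGeminiHandler(response_mime_type='text/plain')

# Hypothetical bucket/key of a scanned document already in S3.
bucket, key = 'my-bucket', 'incoming/contract.pdf'

if storage.check_file_exists(bucket, key):
    # OCR the document asynchronously, polling until Textract finishes.
    job_id = textract.start_job(bucket, key)
    while textract.is_job_complete(job_id) == 'IN_PROGRESS':
        time.sleep(5)
    text = textract.get_full_text(textract.get_job_results(job_id))

    # Hand the extracted text to Gemini for summarization.
    response = gemini.generate_output(f'Summarize this contract:\n{text}')
    print(gemini.get_text(response))
```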
auris_tools-0.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
```
auris_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
auris_tools/configuration.py,sha256=GzBI95GadtW5os_LfFNY0NsV7PzLaj8-IotLKKZ-p8I,2502
auris_tools/databaseHandlers.py,sha256=YkKABs1YfoEEMr9bDrU7Vg0-aKwOT1c_wh4iI8ET5cI,4777
auris_tools/geminiHandler.py,sha256=Cdgle1NICGUIx6XmyfEpJ13obiHJgRkvkRpvqPY8l_c,10137
auris_tools/officeWordHandler.py,sha256=Y_6K2fpmrEP-B7PZYmCYE6P0p1deSguIiopkHNL_V5E,8885
auris_tools/storageHandler.py,sha256=-rbD0Oi4lstLv3ZVrF2ikUM8A5uPPrXfKGrn-dbXToI,6315
auris_tools/textractHandler.py,sha256=OGrCwP_Jvehqivqw9ssLDeasJZX93Lg1O6A2NN553Wo,5247
auris_tools/utils.py,sha256=pBI_2B0e0hMYFg337bbcjHUQQDM_-AdNgof5rJbcaC8,3308
auris_tools-0.0.1.dist-info/METADATA,sha256=zcgXRMjeplXPk0GNsvQx-MI66iO78e6hDxACttZ97Jw,2061
auris_tools-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
auris_tools-0.0.1.dist-info/RECORD,,
```