auris_tools 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of auris_tools has been flagged as potentially problematic; see the advisory details below.

File without changes
@@ -0,0 +1,81 @@
1
+ import logging
2
+ import os
3
+
4
+ from dotenv import load_dotenv
5
+
6
+ # Load environment variables from .env file
7
+ load_dotenv()
8
+
9
+
10
class AWSConfiguration:
    """
    AWS configuration holder for credentials, region, and endpoint settings.

    Explicit constructor arguments take precedence; any argument left as
    None (or empty) falls back to the corresponding environment variable
    (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION,
    AWS_PROFILE, AWS_ENDPOINT_URL). The region defaults to 'us-east-1'
    when neither an argument nor the environment provides one.
    """

    def __init__(
        self,
        access_key: str = None,
        secret_key: str = None,
        region: str = None,
        profile: str = None,
        endpoint_url: str = None,
    ):
        """
        Initialize the configuration, falling back to environment variables.

        Args:
            access_key: AWS access key ID, or None to read AWS_ACCESS_KEY_ID.
            secret_key: AWS secret access key, or None to read
                AWS_SECRET_ACCESS_KEY.
            region: AWS region name, or None to read AWS_DEFAULT_REGION
                (defaulting to 'us-east-1').
            profile: Named AWS profile, or None to read AWS_PROFILE.
            endpoint_url: Custom service endpoint (e.g. for LocalStack),
                or None to read AWS_ENDPOINT_URL.
        """
        # Explicit arguments win; falsy values (None, '') fall through to
        # the environment, matching the original conditional expressions.
        self.access_key = access_key or os.environ.get('AWS_ACCESS_KEY_ID')
        self.secret_key = secret_key or os.environ.get('AWS_SECRET_ACCESS_KEY')
        self.region = (
            region or os.environ.get('AWS_DEFAULT_REGION') or 'us-east-1'
        )
        self.profile = profile or os.environ.get('AWS_PROFILE')
        self.endpoint_url = endpoint_url or os.environ.get('AWS_ENDPOINT_URL')

        # Warn early if no usable credential source was supplied.
        self._validate_config()

    def _validate_config(self):
        """Warn when neither a key pair nor a profile is configured."""
        if not ((self.access_key and self.secret_key) or self.profile):
            logging.warning(
                'No AWS credentials provided via environment variables or constructor. '
                'AWS operations may fail unless credentials are configured via '
                '~/.aws/credentials, IAM roles, or other AWS credential providers.'
            )

    def get_boto3_session_args(self):
        """
        Build keyword arguments for boto3.session.Session().

        Returns:
            dict: Always contains 'region_name'; adds the key pair and/or
            profile name when they are configured.
        """
        session_args = {'region_name': self.region}

        if self.access_key and self.secret_key:
            session_args['aws_access_key_id'] = self.access_key
            session_args['aws_secret_access_key'] = self.secret_key

        if self.profile:
            session_args['profile_name'] = self.profile

        return session_args

    def get_client_args(self):
        """
        Build keyword arguments for boto3 client creation.

        Returns:
            dict: Contains 'endpoint_url' when one is configured,
            otherwise empty.
        """
        client_args = {}

        if self.endpoint_url:
            client_args['endpoint_url'] = self.endpoint_url

        return client_args
@@ -0,0 +1,132 @@
1
+ import logging
2
+
3
+ import boto3
4
+ from boto3.dynamodb.types import TypeDeserializer, TypeSerializer
5
+
6
+ from auris_tools.configuration import AWSConfiguration
7
+ from auris_tools.utils import generate_uuid
8
+
9
+
10
class DatabaseHandler:
    """DynamoDB table wrapper with automatic (de)serialization of items."""

    def __init__(self, table_name, config=None):
        """
        Initialize the database handler.

        Args:
            table_name: Name of the DynamoDB table.
            config: An AWSConfiguration object, or None to use environment variables.

        Raises:
            Exception: If the table does not exist in the target account/region.
        """
        self.table_name = table_name
        if config is None:
            config = AWSConfiguration()

        # Create a boto3 session with the configuration
        session = boto3.session.Session(**config.get_boto3_session_args())

        # Create a DynamoDB client with additional configuration if needed
        self.client = session.client('dynamodb', **config.get_client_args())

        # Fail fast so later operations don't hit a missing table.
        if not self._check_table_exists(table_name):
            raise Exception(f'Table does not exist: {table_name}')

        logging.info(f'Initialized DynamoDB client in region {config.region}')

    def insert_item(self, item, primary_key: str = 'id'):
        """
        Insert an item with automatic type conversion.

        Args:
            item: Plain-Python dictionary to store. If the primary key is
                missing, a generated UUID is added in place (the input dict
                is mutated so callers can read the generated key back).
            primary_key: Name of the primary key attribute. Defaults to 'id'.

        Returns:
            The raw put_item response from DynamoDB.

        Raises:
            TypeError: If item is not a dictionary.
        """
        if not isinstance(item, dict):
            raise TypeError('Item must be a dictionary')

        if primary_key not in item:
            item[primary_key] = generate_uuid()

        dynamo_item = self._serialize_item(item)
        return self.client.put_item(
            TableName=self.table_name, Item=dynamo_item
        )

    def get_item(self, key):
        """
        Retrieve an item from a DynamoDB table.

        Args:
            key: A dictionary representing the key of the item to retrieve.
                May contain plain Python values or already be in DynamoDB
                attribute-value format.

        Returns:
            The retrieved item (in DynamoDB format), or None if not found
            or if an error occurred.

        Raises:
            TypeError: If key is not a dictionary.
        """
        if not isinstance(key, dict):
            raise TypeError('Key must be a dictionary')

        # Reuse the shared format check instead of duplicating it inline.
        if not self.item_is_serialized(key):
            key = self._serialize_item(key)

        try:
            response = self.client.get_item(TableName=self.table_name, Key=key)
            return response.get('Item')
        except Exception as e:
            logging.error(
                f'Error retrieving item from {self.table_name}: {str(e)}'
            )
            return None

    def delete_item(self, key, primary_key='id'):
        """
        Delete an item from a DynamoDB table.

        Args:
            key (str or dict): Either a string identifier for the primary key,
                or a dictionary containing the complete key structure.
            primary_key (str, optional): Name of the primary key field. Defaults to 'id'.

        Returns:
            bool: True if deletion was successful, False otherwise.
        """
        # Convert string key to a dictionary with the primary key
        if isinstance(key, str):
            key = {primary_key: key}
        elif not isinstance(key, dict):
            raise TypeError('Key must be a string identifier or a dictionary')

        # Check if the key is in DynamoDB format
        if not self.item_is_serialized(key):
            key = self._serialize_item(key)

        try:
            self.client.delete_item(
                TableName=self.table_name,
                Key=key,
                ReturnValues='ALL_OLD',  # Return the deleted item
            )
            logging.info(f'Deleted item from {self.table_name} with key {key}')
            return True
        except Exception as e:
            logging.error(
                f'Error deleting item from {self.table_name}: {str(e)}'
            )
            return False

    def item_is_serialized(self, item):
        """Return True when every value looks like a DynamoDB typed value.

        A serialized value is a single-entry dict such as {'S': 'abc'};
        note an empty dict vacuously counts as serialized.
        """
        return all(isinstance(v, dict) and len(v) == 1 for v in item.values())

    def _serialize_item(self, item):
        """Convert plain Python values to DynamoDB attribute-value format."""
        serializer = TypeSerializer()
        return {k: serializer.serialize(v) for k, v in item.items()}

    def _deserialize_item(self, item):
        """Convert DynamoDB attribute-value format back to Python types."""
        deserializer = TypeDeserializer()
        return {k: deserializer.deserialize(v) for k, v in item.items()}

    def _check_table_exists(self, table_name):
        """Return True if the table appears in list_tables(), else False.

        Errors (e.g. missing permissions) are logged and reported as False.
        """
        try:
            existing_tables = self.client.list_tables().get('TableNames', [])
            return table_name in existing_tables
        except Exception as e:
            logging.error(f'Error checking table existence: {str(e)}')
            return False
@@ -0,0 +1,245 @@
1
+ import logging
2
+ import os
3
+
4
+ import google.generativeai as genai
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables from .env file
8
+ load_dotenv()
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class GoogleGeminiHandler:
    """A handler class for interacting with Google's Gemini AI models.

    Provides a convenient interface for generating content with Google's
    Gemini generative models: it handles authentication, model selection,
    generation configuration, and error logging.

    Attributes:
        api_key (str): The Google AI API key used for authentication.
        model_name (str): The name of the Gemini model to use.
        temperature (float): Controls randomness in generation (0.0 to 1.0).
        response_schema (dict): Optional schema for structured responses.
        response_mime_type (str): MIME type for response format.
        generation_config (genai.types.GenerationConfig): Configuration for content generation.
        model (genai.GenerativeModel): The configured Gemini model instance.

    Example:
        >>> handler = GoogleGeminiHandler()
        >>> response = handler.generate_output("What is artificial intelligence?")
        >>> text = handler.get_text(response)
    """

    def __init__(
        self, api_key: str = None, model: str = 'gemini-2.5-flash', **kwargs
    ):
        """Initialize the Google Gemini handler.

        Args:
            api_key (str, optional): Google AI API key. Falls back to the
                GEMINI_API_KEY environment variable when omitted.
            model (str, optional): Name of the Gemini model to use.
                Defaults to 'gemini-2.5-flash'.
            **kwargs: Additional configuration parameters:
                - temperature (float): Randomness (0.0-1.0). Defaults to 0.5.
                - response_schema (dict): Schema for structured responses.
                  Defaults to None.
                - response_mime_type (str): Response MIME type. Defaults to
                  'application/json'.

        Raises:
            TypeError: If the specified model is not available.
        """
        self.api_key = api_key if api_key else os.getenv('GEMINI_API_KEY')
        if self.api_key is None:
            logger.error(
                'Gemini API key not configured. Please, define the GEMINI_API_KEY environment variable or enter your key directly in the code.'
            )

        self.model_name = model
        self._check_model_availability()

        # More configuration from input parameters
        self.temperature = kwargs.get('temperature', 0.5)
        self.response_schema = kwargs.get('response_schema', None)
        self.response_mime_type = kwargs.get(
            'response_mime_type', 'application/json'
        )

        self.generation_config = genai.types.GenerationConfig(
            temperature=self.temperature,
            response_schema=self.response_schema,
            response_mime_type=self.response_mime_type,
        )

        self.model = genai.GenerativeModel(
            generation_config=self.generation_config,
            model_name=self.model_name,
        )

    def generate_output(
        self, prompt: str, input_data: str = None, input_mime_type: str = None
    ):
        """Generate content using the configured Gemini model.

        Args:
            prompt (str): The text prompt to send to the model.
            input_data (str, optional): Additional input data (e.g. raw or
                encoded media) to include with the prompt. Requires
                input_mime_type. Defaults to None.
            input_mime_type (str, optional): MIME type of input_data, e.g.
                'text/plain', 'image/jpeg', 'application/pdf'. Defaults to None.

        Returns:
            genai.types.GenerateContentResponse or str: The model response,
            or an empty string if an error occurred.

        Raises:
            ValueError: If exactly one of input_data / input_mime_type is given.
        """
        # Both-or-neither: the two arguments are only meaningful together.
        if (input_data is None) != (input_mime_type is None):
            raise ValueError(
                'input_mime_type must be provided if input_data is given, or otherwise both must be None.'
            )

        if input_data and input_mime_type:  # Add input data if provided
            # Inline media parts use the 'data' key; the previous 'content'
            # key is not part of the Gemini inline-blob format.
            prompt = [
                prompt,
                {'mime_type': input_mime_type, 'data': input_data},
            ]

        try:
            response = self.model.generate_content(prompt)
            return response
        except Exception as e:
            logger.error(f'Error generating LLM output: {str(e)}')
            return ''

    def get_text(self, response) -> str:
        """Extract text content from a Gemini model response.

        Args:
            response: Mapping-like response containing a 'candidates' list;
                the first candidate's 'content' entry is returned.

        Returns:
            str: The extracted content, or '' when no candidates are
            present or extraction fails.

        Example:
            >>> handler.get_text({'candidates': []})
            ''
        """
        try:
            if 'candidates' in response and len(response['candidates']) > 0:
                return response['candidates'][0]['content']
            else:
                logger.warning('No candidates found in the response.')
                return ''
        except Exception as e:
            logger.error(f'Error extracting text from response: {str(e)}')
            return ''

    def _check_model_availability(self):
        """Validate the configured model name against genai.list_models().

        Raises:
            TypeError: If the model is not in the list of available models.

        Note:
            API/connectivity errors are logged but do not raise, so the
            handler can still be constructed when the listing call fails.
        """
        try:
            available_models = genai.list_models()
            # Extract model names, dropping any 'models/' prefix so the
            # comparison matches the short names users pass in.
            available_model_names = []
            for model in available_models:
                model_name = model.name
                if model_name.startswith('models/'):
                    model_name = model_name[7:]  # Remove 'models/' prefix
                available_model_names.append(model_name)

            if self.model_name not in available_model_names:
                logger.error(
                    f'Model {self.model_name} is not available. Please check the model name.'
                )
                logger.info(
                    f'Available models: {", ".join(available_model_names)}'
                )
                raise TypeError(f'Invalid model name: {self.model_name}')
            else:
                logger.info(f'Model {self.model_name} is available.')
        except Exception as e:
            if 'Invalid model name' in str(e):
                raise  # Re-raise our custom error
            else:
                logger.error(f'Error checking model availability: {str(e)}')
                # Don't raise error for API connectivity issues, just log
@@ -0,0 +1,271 @@
1
+ import io
2
+ import logging
3
+ import re
4
+ from typing import List, Optional
5
+
6
+ import boto3
7
+ from docx import Document
8
+ from docx.opc.constants import RELATIONSHIP_TYPE as RT
9
+ from docx.text.paragraph import Paragraph
10
+
11
+ from auris_tools.configuration import AWSConfiguration
12
+
13
+
14
class OfficeWordHandler:
    """
    Handler for DOCX operations including text extraction and manipulation.

    This class provides methods to interact with Microsoft Word documents (DOCX)
    stored in S3, including reading, extracting text, and text replacement operations.
    """

    def __init__(self, config=None):
        """
        Initialize the Office Word handler with AWS configuration.

        Args:
            config: An AWSConfiguration object, or None to use environment variables
        """
        if config is None:
            config = AWSConfiguration()

        # Create a boto3 session with the configuration
        session = boto3.session.Session(**config.get_boto3_session_args())

        # Create an S3 client with additional configuration if needed
        self.s3_client = session.client('s3', **config.get_client_args())
        logging.info(f'Initialized S3 client in region {config.region}')

    def read_from_s3(self, bucket_name, object_name, as_bytes_io=False):
        """
        Read a DOCX file from S3 and return its bytes.

        Args:
            bucket_name: Name of the S3 bucket containing the document
            object_name: Object key of the document in the S3 bucket
            as_bytes_io: If True, return a BytesIO object instead of raw bytes

        Returns:
            bytes or BytesIO: The document content

        Raises:
            Exception: If there is an error retrieving the document
        """
        try:
            response = self.s3_client.get_object(
                Bucket=bucket_name, Key=object_name
            )
            # Entire object is read into memory; fine for typical DOCX sizes.
            content = response['Body'].read()

            if as_bytes_io:
                return io.BytesIO(content)
            return content
        except Exception as e:
            logging.error(
                f'Error reading document from {bucket_name}/{object_name}: {str(e)}'
            )
            raise Exception(f'Error reading file from S3: {str(e)}')

    def upload_docx(self, docx_document, bucket_name, object_name):
        """
        Upload a DOCX document to S3.

        Args:
            docx_document: The Document object to upload
            bucket_name: Name of the S3 bucket
            object_name: Object key for the document in S3

        Returns:
            bool: True if upload was successful, False otherwise

        Raises:
            Exception: If there is an error uploading the document
        """
        try:
            logging.info(f'Starting upload to S3: {bucket_name}/{object_name}')

            # Convert document to bytes
            temp_stream = io.BytesIO()
            docx_document.save(temp_stream)
            temp_stream.seek(0)
            document_size = len(temp_stream.getvalue())

            # Upload to S3 with the official DOCX content type so browsers
            # and downstream services treat the object correctly.
            self.s3_client.upload_fileobj(
                temp_stream,
                Bucket=bucket_name,
                Key=object_name,
                ExtraArgs={
                    'ContentType': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                },
            )

            logging.info(
                f'Upload finished successfully. Size: {document_size} bytes'
            )
            return True
        except Exception as e:
            logging.error(f'Failed to upload to S3: {str(e)}')
            raise Exception(f'Error uploading file to S3: {str(e)}')

    def get_text_from_bytes(self, bytes_data):
        """
        Extract text from a DOCX file bytes.

        Args:
            bytes_data: The document bytes

        Returns:
            str: Extracted text from the document, one line per paragraph/cell

        Raises:
            ValueError: If there is an error extracting the text
        """
        try:
            doc = Document(io.BytesIO(bytes_data))
            full_text = []

            # Extract text from paragraphs
            for para in doc.paragraphs:
                full_text.append(para.text)

            # Extract text from tables
            # NOTE(review): headers/footers are not included here, unlike
            # collect_all_paragraphs below — confirm that is intended.
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        full_text.append(cell.text)

            return '\n'.join(full_text)
        except Exception as e:
            logging.error(f'Error extracting text from DOCX: {str(e)}')
            raise ValueError(f'Error extracting text from DOCX: {str(e)}')

    def clean_text(self, text):
        """
        Clean extracted text from a DOCX file.

        Args:
            text: Text to clean (may be None or empty)

        Returns:
            str: Cleaned text ('' for falsy input)
        """
        if not text:
            return ''

        # Basic cleaning (can be extended)
        cleaned_text = text.strip()
        return cleaned_text

    def collect_all_paragraphs(self, document: Document) -> List[Paragraph]:
        """
        Collect all paragraphs from a Document object.

        This method collects paragraphs from the main document body,
        tables, headers, and footers.

        Args:
            document: The Document object

        Returns:
            List[Paragraph]: List of all paragraphs in the document
        """
        paragraphs = list(document.paragraphs)

        # Table cells carry their own paragraph lists.
        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    paragraphs.extend(cell.paragraphs)

        # Headers/footers are per-section in DOCX.
        for section in document.sections:
            paragraphs.extend(section.header.paragraphs)
            paragraphs.extend(section.footer.paragraphs)

        return paragraphs

    def replace_placeholder_by_text(
        self,
        paragraphs: List[Paragraph],
        document: Document,
        placeholder: str,
        replacement: str,
        max_count: Optional[int] = None,
    ) -> int:
        """
        Replace placeholder text with replacement in document's XML w:t nodes.

        Args:
            paragraphs: List of paragraphs to process
            document: Document object
            placeholder: Text to find and replace
            replacement: Text to insert instead of placeholder
            max_count: Maximum number of replacements, or None for unlimited

        Returns:
            int: Number of replacements made

        Note:
            This method works at the XML level to ensure proper formatting is preserved.
        """
        count = 0
        WORD_NAMESPACE = (
            'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
        )
        # Clark-notation tag for <w:t> text nodes.
        T_TAG = f'{{{WORD_NAMESPACE}}}t'

        # Guard: if the replacement contains the placeholder, repeated calls
        # would re-match their own output, so skip entirely.
        if placeholder in replacement:
            logging.warning(
                f'Replacement skipped to avoid recursion: {placeholder} -> {replacement}'
            )
            return 0

        def replace_in_element(element):
            # Returns True when max_count has been reached (stop scanning).
            nonlocal count
            for node in element.iter(tag=T_TAG):
                if node.text and placeholder in node.text:
                    remaining = (
                        None if max_count is None else max_count - count
                    )
                    # re.subn treats count=0 as "replace all".
                    # NOTE(review): if max_count == 0, remaining is 0 here,
                    # which also means "replace all" — likely unintended;
                    # confirm whether max_count=0 should mean "no replacements".
                    new_text, n = re.subn(
                        re.escape(placeholder),
                        replacement,
                        node.text,
                        count=remaining if remaining else 0,
                    )
                    if n > 0:
                        node.text = new_text
                        count += n
                        if max_count is not None and count >= max_count:
                            return True
            return False

        # Main paragraphs
        for para in paragraphs:
            if replace_in_element(para._element):
                return count

        # Headers/footers
        # NOTE(review): if `paragraphs` came from collect_all_paragraphs,
        # these header/footer and table paragraphs are scanned a second time
        # (harmless after replacement, but redundant work).
        for section in document.sections:
            for container in [section.header, section.footer]:
                for para in container.paragraphs:
                    if replace_in_element(para._element):
                        return count

        # Tables
        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    for para in cell.paragraphs:
                        if replace_in_element(para._element):
                            return count

        # Hyperlinks
        for rel in document.part.rels.values():
            if rel.reltype == RT.HYPERLINK and placeholder in rel.target_ref:
                logging.info(
                    f'Replacing hyperlink: {rel.target_ref} -> {rel.target_ref.replace(placeholder, replacement)}'
                )
                # NOTE(review): writes the private Relationship._target
                # attribute — python-docx exposes no public setter; verify
                # against the pinned python-docx version.
                rel._target = rel.target_ref.replace(placeholder, replacement)
                count += 1

        return count
@@ -0,0 +1,195 @@
1
+ import logging
2
+ from http import HTTPStatus
3
+
4
+ import boto3
5
+
6
+ from auris_tools.configuration import AWSConfiguration
7
+
8
+
9
class StorageHandler:
    """Thin S3 wrapper for upload/download/inspect/delete/list operations."""

    def __init__(self, config=None):
        """
        Initialize the storage handler with AWS configuration.

        Args:
            config: An AWSConfiguration object, or None to use environment variables
        """
        if config is None:
            config = AWSConfiguration()

        # Create a boto3 session with the configuration
        session = boto3.session.Session(**config.get_boto3_session_args())

        # Create an S3 client with additional configuration if needed
        self.client = session.client('s3', **config.get_client_args())
        logging.info(f'Initialized S3 client in region {config.region}')

    def upload_file(self, file_path, bucket_name, object_name):
        """
        Upload a file to an S3 bucket.

        Args:
            file_path: Path to the file to upload
            bucket_name: Name of the bucket to upload to
            object_name: S3 object name (key)

        Returns:
            True if file was uploaded successfully, else False
        """
        try:
            self.client.upload_file(file_path, bucket_name, object_name)
            logging.info(
                f'Uploaded {file_path} to {bucket_name}/{object_name}'
            )
            return True
        except Exception as e:
            logging.error(f'Error uploading file {file_path}: {str(e)}')
            return False

    def download_file(self, bucket_name, object_name, file_path):
        """
        Download a file from an S3 bucket.

        Args:
            bucket_name: Bucket name
            object_name: S3 object name (key)
            file_path: Path where the file should be saved

        Returns:
            True if file was downloaded successfully, else False
        """
        try:
            self.client.download_file(bucket_name, object_name, file_path)
            logging.info(
                f'Downloaded {bucket_name}/{object_name} to {file_path}'
            )
            return True
        except Exception as e:
            logging.error(f'Error downloading file {object_name}: {str(e)}')
            return False

    def get_file_object(self, bucket_name, object_name, as_bytes=False):
        """
        Get a file object from an S3 bucket.

        Args:
            bucket_name: Bucket name
            object_name: S3 object name (key)
            as_bytes: If True, return the content as bytes instead of a streaming object

        Returns:
            S3 object (streaming) or bytes if as_bytes=True, or None on error
        """
        try:
            response = self.client.get_object(
                Bucket=bucket_name, Key=object_name
            )
            if as_bytes:
                return response['Body'].read()
            return response['Body']
        except Exception as e:
            logging.error(f'Error getting file object {object_name}: {str(e)}')
            return None

    def check_file_exists(self, bucket_name, object_name):
        """
        Check if a file exists in an S3 bucket.

        Args:
            bucket_name: Bucket name
            object_name: S3 object name (key)

        Returns:
            True if file exists, else False

        Note:
            Any head_object failure (including access denied) reports False.
        """
        try:
            self.client.head_object(Bucket=bucket_name, Key=object_name)
            return True
        except Exception:
            return False

    def check_file_size(self, bucket_name, object_name):
        """
        Get the size of a file in an S3 bucket.

        Args:
            bucket_name: Bucket name
            object_name: S3 object name (key)

        Returns:
            File size in bytes, or None if the file doesn't exist or on error
        """
        try:
            response = self.client.head_object(
                Bucket=bucket_name, Key=object_name
            )
            return response.get('ContentLength')
        except Exception as e:
            logging.error(
                f'Error checking file size for {object_name}: {str(e)}'
            )
            return None

    def delete_file(self, bucket_name, object_name):
        """
        Delete a file from an S3 bucket.

        Args:
            bucket_name: Bucket name
            object_name: S3 object name (key)

        Returns:
            True if file was deleted successfully, else False
        """
        try:
            # Check if file exists before attempting deletion, so a missing
            # object reports failure instead of S3's silent success.
            if not self.check_file_exists(bucket_name, object_name):
                logging.warning(
                    f'File {bucket_name}/{object_name} does not exist.'
                )
                return False

            response = self.client.delete_object(
                Bucket=bucket_name, Key=object_name
            )
            status_code = response.get('ResponseMetadata', {}).get(
                'HTTPStatusCode'
            )
            # Both 200 (OK) and 204 (No Content) are successful responses
            if status_code not in (HTTPStatus.OK, HTTPStatus.NO_CONTENT):
                logging.error(
                    f'Failed to delete {bucket_name}/{object_name}, status code: {status_code}'
                )
                return False

            logging.info(f'Deleted {bucket_name}/{object_name}')
            return True
        except Exception as e:
            logging.error(f'Error deleting file {object_name}: {str(e)}')
            return False

    def list_files(self, bucket_name, prefix=''):
        """
        List files in an S3 bucket with optional prefix filtering.

        Args:
            bucket_name: Bucket name
            prefix: Prefix to filter objects (folder path)

        Returns:
            List of object keys, or empty list if an error occurs

        Note:
            Returns at most one page of results (list_objects_v2 caps a
            page at 1,000 keys); pagination is not handled here.
        """
        try:
            response = self.client.list_objects_v2(
                Bucket=bucket_name, Prefix=prefix
            )
            # 'Contents' is absent when no keys match the prefix.
            return [obj['Key'] for obj in response.get('Contents', [])]
        except Exception as e:
            logging.error(
                f'Error listing files in {bucket_name}/{prefix}: {str(e)}'
            )
            return []
@@ -0,0 +1,169 @@
1
+ import logging
2
+ import time
3
+
4
+ import boto3
5
+
6
+ from auris_tools.configuration import AWSConfiguration
7
+
8
+
9
class TextractHandler:
    """
    Handler for Amazon Textract operations to extract text from documents.

    Wraps the AWS Textract client for asynchronous text-detection jobs on
    documents stored in S3: starting jobs, polling their status, fetching
    paginated results, and flattening those results into plain text.
    """

    def __init__(self, config=None):
        """
        Initialize the Textract handler with AWS configuration.

        Args:
            config: An AWSConfiguration object, or None to use environment variables
        """
        if config is None:
            config = AWSConfiguration()

        # Build a boto3 session from the resolved configuration, then a
        # Textract client with any extra client-level settings.
        aws_session = boto3.session.Session(**config.get_boto3_session_args())
        self.client = aws_session.client('textract', **config.get_client_args())
        logging.info(f'Initialized Textract client in region {config.region}')

    def start_job(self, s3_bucket_name, object_name):
        """
        Start an asynchronous text detection job for a document in S3.

        Args:
            s3_bucket_name: Name of the S3 bucket containing the document
            object_name: Object key of the document in the S3 bucket

        Returns:
            str: The JobId of the started Textract job

        Raises:
            Exception: If there is an error starting the job
        """
        try:
            result = self.client.start_document_text_detection(
                DocumentLocation={
                    'S3Object': {'Bucket': s3_bucket_name, 'Name': object_name}
                }
            )
            new_job_id = result['JobId']
            logging.info(
                f'Started Textract job {new_job_id} for {s3_bucket_name}/{object_name}'
            )
            return new_job_id
        except Exception as e:
            logging.error(
                f'Error starting Textract job for {s3_bucket_name}/{object_name}: {str(e)}'
            )
            raise

    def get_job_status(self, job_id):
        """
        Get the status of a Textract job.

        Args:
            job_id: ID of the Textract job

        Returns:
            str: The job status (e.g., 'IN_PROGRESS', 'SUCCEEDED', 'FAILED')
        """
        try:
            current = self.client.get_document_text_detection(JobId=job_id)[
                'JobStatus'
            ]
            logging.info(f'Textract job {job_id} status: {current}')
            return current
        except Exception as e:
            logging.error(
                f'Error getting status for Textract job {job_id}: {str(e)}'
            )
            raise

    def is_job_complete(self, job_id):
        """
        Check if a Textract job has completed.

        Args:
            job_id: ID of the Textract job

        Returns:
            str: The job status
        """
        time.sleep(1)  # Avoid rate limiting
        return self.get_job_status(job_id)

    def get_job_results(self, job_id):
        """
        Get the results of a completed Textract job.

        This method handles pagination of results automatically.

        Args:
            job_id: ID of the Textract job

        Returns:
            list: List of response pages from Textract
        """
        collected = []
        try:
            token = None
            page_no = 1
            while True:
                if page_no == 1:
                    # First page needs no continuation token.
                    resp = self.client.get_document_text_detection(
                        JobId=job_id
                    )
                else:
                    time.sleep(1)  # Avoid rate limiting
                    resp = self.client.get_document_text_detection(
                        JobId=job_id, NextToken=token
                    )
                collected.append(resp)
                logging.info(
                    f'Received page {page_no} of results for job {job_id}'
                )
                token = resp.get('NextToken')
                if not token:
                    return collected
                page_no += 1
        except Exception as e:
            logging.error(
                f'Error getting results for Textract job {job_id}: {str(e)}'
            )
            raise

    def get_full_text(self, response):
        """
        Extract the full text from Textract response pages.

        Args:
            response: List of response pages from Textract

        Returns:
            str: The full extracted text as a string
        """
        try:
            # Only LINE blocks carry the line-level text; join with spaces.
            lines = [
                block.get('Text', '')
                for page in response
                for block in page.get('Blocks', [])
                if block.get('BlockType') == 'LINE'
            ]
            return ' '.join(lines)
        except Exception as e:
            logging.error(
                f'Error extracting full text from Textract response: {str(e)}'
            )
            return ''
auris_tools/utils.py ADDED
@@ -0,0 +1,120 @@
1
+ import time
2
+ from contextlib import contextmanager
3
+ from datetime import datetime
4
+ from uuid import uuid4
5
+
6
+
7
def collect_timestamp(as_str: bool = True):
    """
    Collect the current timestamp in ISO 8601 format.

    Parameters
    ----------
    as_str : bool, default=True
        If True, returns the timestamp as an ISO 8601 formatted string.
        If False, returns a datetime object.

    Returns
    -------
    str or datetime
        Current timestamp as an ISO 8601 formatted string (if as_str=True)
        or as a datetime object (if as_str=False).

    Examples
    --------
    >>> collect_timestamp()
    '2023-05-18T15:30:45.123456'

    >>> collect_timestamp(as_str=False)
    datetime.datetime(2023, 5, 18, 15, 30, 45, 123456)
    """
    now = datetime.now()
    return now.isoformat() if as_str else now
34
+
35
+
36
def parse_timestamp(timestamp_input):
    """Parse a timestamp to a datetime object.

    Args:
        timestamp_input: Either an ISO 8601 timestamp string or a datetime object.

    Returns:
        datetime: The parsed datetime object.
    """
    # Datetime objects pass straight through; strings are parsed as ISO 8601.
    if not isinstance(timestamp_input, datetime):
        timestamp_input = datetime.fromisoformat(timestamp_input)
    return timestamp_input
48
+
49
+
50
def generate_uuid():
    """
    Generate a unique Universally Unique Identifier (UUID) string.

    Produces a random version-4 UUID and returns its canonical string
    form. UUIDs are 128-bit identifiers that are, for all practical
    purposes, unique across space and time.

    Returns:
        str: A string representation of a UUID4 (e.g., '9f8d8f79-2d6d-4b96-a3f5-e1f025e6379b')

    Example:
        >>> unique_id = generate_uuid()
        >>> print(unique_id)
        '9f8d8f79-2d6d-4b96-a3f5-e1f025e6379b'
    """
    return f'{uuid4()}'
67
+
68
+
69
@contextmanager
def collect_processing_time():
    """
    Context manager for measuring code execution time.

    Measures the execution time of code within its scope using
    ``time.perf_counter``, which is monotonic and unaffected by system
    clock adjustments — the correct timer for elapsed intervals
    (``time.time`` can jump backwards on clock changes).

    Yields
    ------
    callable
        Zero-argument function returning the elapsed time in seconds
        at the moment it is called.

    Examples
    --------
    >>> with collect_processing_time() as total_time:
    ...     # Your code here
    ...     import time
    ...     time.sleep(1)
    >>> print(f"Execution took {total_time()} seconds")
    Execution took 1.001234 seconds

    >>> # Example with multiple measurements during execution
    >>> with collect_processing_time() as get_time:
    ...     # First operation
    ...     time.sleep(0.5)
    ...     first_step = get_time()
    ...     print(f"First step: {first_step:.2f}s")
    ...
    ...     # Second operation
    ...     time.sleep(0.5)
    ...     second_step = get_time()
    ...     print(f"Second step: {second_step:.2f}s")
    First step: 0.50s
    Second step: 1.00s
    """
    start_time = time.perf_counter()
    # Yield a closure so callers can sample the elapsed time repeatedly,
    # both inside and after the with-block.
    yield lambda: time.perf_counter() - start_time
@@ -0,0 +1,76 @@
1
+ Metadata-Version: 2.3
2
+ Name: auris_tools
3
+ Version: 0.0.1
4
+ Summary: The Swiss Army knife of tools to coordinate cloud frameworks with ease for Auris platforms
5
+ Author: Antonio Senra
6
+ Author-email: acsenrafilho@gmail.com
7
+ Requires-Python: >=3.10,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Requires-Dist: boto3 (>=1.40.29,<2.0.0)
14
+ Requires-Dist: dotenv (>=0.9.9,<0.10.0)
15
+ Requires-Dist: google-generativeai (>=0.8.5,<0.9.0)
16
+ Requires-Dist: python-docx (>=1.2.0,<2.0.0)
17
+ Requires-Dist: rich (>=14.1.0,<15.0.0)
18
+ Description-Content-Type: text/markdown
19
+
20
+ # auris-tools
21
+
22
+ The Swiss Army knife of tools to coordinate cloud frameworks with ease for Auris platforms
23
+
24
+ ## Installation
25
+
26
+ This project requires **Python 3.10** and uses [Poetry](https://python-poetry.org/) for dependency management.
27
+
28
+ 1. **Clone the repository:**
29
+ ```bash
30
+ git clone https://github.com/AurisAASI/auris-tools.git
31
+ cd auris-tools
32
+ ```
33
+ 2. **Install Poetry (if not already installed):**
34
+ ```bash
35
+ pip install poetry
36
+ ```
37
+ 3. **Install dependencies:**
38
+ ```bash
39
+ poetry install
40
+ ```
41
+
42
+ ---
43
+
44
+ ## Project Structure
45
+
46
+ The main classes and modules are organized as follows:
47
+
48
+ ```
49
+ /auris_tools
50
+ ├── __init__.py
51
+ ├── configuration.py # AWS configuration utilities
52
+ ├── databaseHandlers.py # DynamoDB handler class
53
+ ├── officeWordHandler.py # Office Word document handler
54
+ ├── storageHandler.py # AWS S3 storage handler
55
+ ├── textractHandler.py # AWS Textract handler
56
+ ├── utils.py # Utility functions
57
+ ├── geminiHandler.py # Google Gemini AI handler
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Testing & Linting
63
+
64
+ - **Run all tests:**
65
+ ```bash
66
+ task test
67
+ ```
68
+ - **Run linter (ruff):**
69
+ ```bash
70
+ task lint
71
+ ```
72
+
73
+ Test coverage and linting are enforced in CI. Make sure all tests pass and code is linted before submitting a PR.
74
+
75
+ ---
76
+
@@ -0,0 +1,11 @@
1
+ auris_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ auris_tools/configuration.py,sha256=GzBI95GadtW5os_LfFNY0NsV7PzLaj8-IotLKKZ-p8I,2502
3
+ auris_tools/databaseHandlers.py,sha256=YkKABs1YfoEEMr9bDrU7Vg0-aKwOT1c_wh4iI8ET5cI,4777
4
+ auris_tools/geminiHandler.py,sha256=Cdgle1NICGUIx6XmyfEpJ13obiHJgRkvkRpvqPY8l_c,10137
5
+ auris_tools/officeWordHandler.py,sha256=Y_6K2fpmrEP-B7PZYmCYE6P0p1deSguIiopkHNL_V5E,8885
6
+ auris_tools/storageHandler.py,sha256=-rbD0Oi4lstLv3ZVrF2ikUM8A5uPPrXfKGrn-dbXToI,6315
7
+ auris_tools/textractHandler.py,sha256=OGrCwP_Jvehqivqw9ssLDeasJZX93Lg1O6A2NN553Wo,5247
8
+ auris_tools/utils.py,sha256=pBI_2B0e0hMYFg337bbcjHUQQDM_-AdNgof5rJbcaC8,3308
9
+ auris_tools-0.0.1.dist-info/METADATA,sha256=zcgXRMjeplXPk0GNsvQx-MI66iO78e6hDxACttZ97Jw,2061
10
+ auris_tools-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
11
+ auris_tools-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.3
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any