@aj-archipelago/cortex 1.4.2 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. package/README.md +1 -0
  2. package/config.js +1 -1
  3. package/helper-apps/cortex-autogen2/.dockerignore +1 -0
  4. package/helper-apps/cortex-autogen2/Dockerfile +6 -10
  5. package/helper-apps/cortex-autogen2/Dockerfile.worker +2 -0
  6. package/helper-apps/cortex-autogen2/agents.py +203 -2
  7. package/helper-apps/cortex-autogen2/main.py +1 -1
  8. package/helper-apps/cortex-autogen2/pyproject.toml +12 -0
  9. package/helper-apps/cortex-autogen2/requirements.txt +14 -0
  10. package/helper-apps/cortex-autogen2/services/redis_publisher.py +1 -1
  11. package/helper-apps/cortex-autogen2/services/run_analyzer.py +1 -1
  12. package/helper-apps/cortex-autogen2/task_processor.py +431 -229
  13. package/helper-apps/cortex-autogen2/test_entity_fetcher.py +305 -0
  14. package/helper-apps/cortex-autogen2/tests/README.md +240 -0
  15. package/helper-apps/cortex-autogen2/tests/TEST_REPORT.md +342 -0
  16. package/helper-apps/cortex-autogen2/tests/__init__.py +8 -0
  17. package/helper-apps/cortex-autogen2/tests/analysis/__init__.py +1 -0
  18. package/helper-apps/cortex-autogen2/tests/analysis/improvement_suggester.py +224 -0
  19. package/helper-apps/cortex-autogen2/tests/analysis/trend_analyzer.py +211 -0
  20. package/helper-apps/cortex-autogen2/tests/cli/__init__.py +1 -0
  21. package/helper-apps/cortex-autogen2/tests/cli/run_tests.py +296 -0
  22. package/helper-apps/cortex-autogen2/tests/collectors/__init__.py +1 -0
  23. package/helper-apps/cortex-autogen2/tests/collectors/log_collector.py +252 -0
  24. package/helper-apps/cortex-autogen2/tests/collectors/progress_collector.py +182 -0
  25. package/helper-apps/cortex-autogen2/tests/conftest.py +15 -0
  26. package/helper-apps/cortex-autogen2/tests/database/__init__.py +1 -0
  27. package/helper-apps/cortex-autogen2/tests/database/repository.py +501 -0
  28. package/helper-apps/cortex-autogen2/tests/database/schema.sql +108 -0
  29. package/helper-apps/cortex-autogen2/tests/evaluators/__init__.py +1 -0
  30. package/helper-apps/cortex-autogen2/tests/evaluators/llm_scorer.py +294 -0
  31. package/helper-apps/cortex-autogen2/tests/evaluators/prompts.py +250 -0
  32. package/helper-apps/cortex-autogen2/tests/evaluators/wordcloud_validator.py +168 -0
  33. package/helper-apps/cortex-autogen2/tests/metrics/__init__.py +1 -0
  34. package/helper-apps/cortex-autogen2/tests/metrics/collector.py +155 -0
  35. package/helper-apps/cortex-autogen2/tests/orchestrator.py +576 -0
  36. package/helper-apps/cortex-autogen2/tests/test_cases.yaml +279 -0
  37. package/helper-apps/cortex-autogen2/tests/test_data.db +0 -0
  38. package/helper-apps/cortex-autogen2/tests/utils/__init__.py +3 -0
  39. package/helper-apps/cortex-autogen2/tests/utils/connectivity.py +112 -0
  40. package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +74 -24
  41. package/helper-apps/cortex-autogen2/tools/entity_api_registry.json +38 -0
  42. package/helper-apps/cortex-autogen2/tools/file_tools.py +1 -1
  43. package/helper-apps/cortex-autogen2/tools/search_tools.py +436 -238
  44. package/helper-apps/cortex-file-handler/package-lock.json +2 -2
  45. package/helper-apps/cortex-file-handler/package.json +1 -1
  46. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +4 -5
  47. package/helper-apps/cortex-file-handler/src/blobHandler.js +36 -144
  48. package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +5 -3
  49. package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +34 -1
  50. package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +22 -0
  51. package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +28 -1
  52. package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +29 -4
  53. package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +11 -0
  54. package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +1 -1
  55. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +3 -2
  56. package/helper-apps/cortex-file-handler/tests/checkHashShortLived.test.js +8 -1
  57. package/helper-apps/cortex-file-handler/tests/containerConversionFlow.test.js +5 -2
  58. package/helper-apps/cortex-file-handler/tests/containerNameParsing.test.js +14 -7
  59. package/helper-apps/cortex-file-handler/tests/containerParameterFlow.test.js +5 -2
  60. package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +31 -19
  61. package/package.json +1 -1
  62. package/server/modelExecutor.js +4 -0
  63. package/server/plugins/claude4VertexPlugin.js +540 -0
  64. package/server/plugins/openAiWhisperPlugin.js +43 -2
  65. package/tests/integration/rest/vendors/claude_streaming.test.js +121 -0
  66. package/tests/unit/plugins/claude4VertexPlugin.test.js +462 -0
  67. package/tests/unit/plugins/claude4VertexToolConversion.test.js +413 -0
  68. package/helper-apps/cortex-autogen/.funcignore +0 -8
  69. package/helper-apps/cortex-autogen/Dockerfile +0 -10
  70. package/helper-apps/cortex-autogen/OAI_CONFIG_LIST +0 -6
  71. package/helper-apps/cortex-autogen/agents.py +0 -493
  72. package/helper-apps/cortex-autogen/agents_extra.py +0 -14
  73. package/helper-apps/cortex-autogen/config.py +0 -18
  74. package/helper-apps/cortex-autogen/data_operations.py +0 -29
  75. package/helper-apps/cortex-autogen/function_app.py +0 -44
  76. package/helper-apps/cortex-autogen/host.json +0 -15
  77. package/helper-apps/cortex-autogen/main.py +0 -38
  78. package/helper-apps/cortex-autogen/prompts.py +0 -196
  79. package/helper-apps/cortex-autogen/prompts_extra.py +0 -5
  80. package/helper-apps/cortex-autogen/requirements.txt +0 -9
  81. package/helper-apps/cortex-autogen/search.py +0 -85
  82. package/helper-apps/cortex-autogen/test.sh +0 -40
  83. package/helper-apps/cortex-autogen/tools/sasfileuploader.py +0 -66
  84. package/helper-apps/cortex-autogen/utils.py +0 -88
  85. package/helper-apps/cortex-autogen2/DigiCertGlobalRootCA.crt.pem +0 -22
  86. package/helper-apps/cortex-autogen2/poetry.lock +0 -3652
@@ -0,0 +1,279 @@
1
+ # Cortex AutoGen2 Test Cases
2
+ # Predefined test tasks for automated quality testing
3
+
4
+ test_cases:
5
+ - id: tc001_pokemon_pptx
6
+ name: "Most Powerful Gen 1 Pokemon PowerPoint"
7
+ description: "Creates a PowerPoint about the most powerful Gen 1 Pokemon with their images"
8
+ task: "Create a PowerPoint presentation about the Most Powerful Gen 1 Pokemon. Include images of each Pokemon from the same source."
9
+ timeout_seconds: 420
10
+ requires_ajsql: false
11
+ expected_deliverables:
12
+ - type: pptx
13
+ pattern: "*.pptx"
14
+ min_count: 1
15
+ description: "PowerPoint presentation file"
16
+ - type: preview_images
17
+ pattern: "preview_slide_*.png"
18
+ min_count: 2
19
+ description: "Slide preview images"
20
+ - type: images
21
+ pattern: "*.png"
22
+ min_count: 10
23
+ description: "Pokemon images used in presentation"
24
+ min_progress_updates: 8
25
+ quality_criteria:
26
+ - "CRITICAL: Presentation MUST include actual Pokemon character images (NOT just logos or text)"
27
+ - "CRITICAL: Each slide about a Pokemon MUST show THAT SPECIFIC Pokemon's image (e.g., Gengar slide shows Gengar, NOT Pikachu or any other Pokemon)"
28
+ - "CRITICAL: NO REUSING THE SAME IMAGE on multiple slides - each Pokemon needs its own unique character image"
29
+ - "CRITICAL: Verify in preview images that each Pokemon name matches its image (e.g., if slide says 'Alakazam', the image must show Alakazam)"
30
+ - "Each Pokemon mentioned MUST have its corresponding character image on the slide"
31
+ - "All Pokemon images from SAME consistent source (pokemon.com, bulbapedia.net, or pokemondb.net)"
32
+ - "Images show the actual Pokemon characters (Mewtwo, Dragonite, Alakazam, Gengar, etc.) - NOT generic Pokemon logos"
33
+ - "Images are high-quality official artwork or game sprites (NO fan art, NO thumbnails)"
34
+ - "NO watermarked images unless from official Pokemon source"
35
+ - "Consistent art style throughout (all game sprites OR all official artwork)"
36
+ - "At least 10-12 DIFFERENT Gen 1 Pokemon featured, each with their OWN unique character image"
37
+ - "Professional slide design with Pokemon-themed colors (red, blue, yellow)"
38
+ - "CRITICAL: Images must NOT cover or overlap text - all text must be readable"
39
+ - "Images positioned properly beside or above text, never covering content"
40
+ - "Each slide shows: Pokemon name, stats/power info, AND that Pokemon's specific character image"
41
+ - "Images properly sized and centered (not too small, easily visible)"
42
+ - "Preview slides generated showing Pokemon images are visible and MATCH the Pokemon names"
43
+ - "Gen 1 Pokemon ONLY (original 151, including evolutions)"
44
+ - "Power ranking or stats included (HP, Attack, Special, etc.)"
45
+ expected_agents:
46
+ - planner_agent
47
+ - web_search_agent
48
+ - coder_agent
49
+ - code_executor
50
+ - presenter_agent
51
+
52
+ - id: tc002_pdf_with_images
53
+ name: "PDF Report with Images and Charts"
54
+ description: "Generates a PDF report with images and charts"
55
+ task: "Generate a PDF report about renewable energy trends in 2026."
56
+ timeout_seconds: 300
57
+ requires_ajsql: false
58
+ expected_deliverables:
59
+ - type: pdf
60
+ pattern: "*.pdf"
61
+ min_count: 1
62
+ description: "PDF report file"
63
+ - type: images
64
+ pattern: "*.png"
65
+ min_count: 5
66
+ description: "Images and charts included in report"
67
+ min_progress_updates: 6
68
+ quality_criteria:
69
+ - "PDF contains both text content and images"
70
+ - "Charts and graphs are professionally designed"
71
+ - "Real data used, no placeholder or dummy content"
72
+ - "Proper document formatting with headers and page numbers"
73
+ - "Images are relevant to renewable energy topic"
74
+ - "Preview images or thumbnails provided"
75
+ expected_agents:
76
+ - planner_agent
77
+ - web_search_agent
78
+ - coder_agent
79
+ - code_executor
80
+ - presenter_agent
81
+
82
+ - id: tc003_random_csv
83
+ name: "Random Sales Data CSV Generation"
84
+ description: "Generates random sales data CSV with summary statistics"
85
+ task: "Generate a CSV with 100 rows of random sales data covering the last 90 days, and a summary CSV with statistics."
86
+ timeout_seconds: 180
87
+ requires_ajsql: false
88
+ expected_deliverables:
89
+ - type: csv
90
+ pattern: "*sales*.csv"
91
+ min_count: 1
92
+ description: "Main sales data CSV"
93
+ - type: csv
94
+ pattern: "*summary*.csv"
95
+ min_count: 1
96
+ description: "Summary statistics CSV"
97
+ min_progress_updates: 3
98
+ quality_criteria:
99
+ - "Main CSV contains exactly 100 rows of sales data"
100
+ - "Dates span the last 90 days as specified"
101
+ - "Realistic product names and prices (no generic 'Product1', 'Product2')"
102
+ - "Summary statistics calculated correctly from the main data"
103
+ - "Proper CSV formatting (headers, no missing values)"
104
+ - "Files uploaded with SAS URLs provided"
105
+ expected_agents:
106
+ - planner_agent
107
+ - coder_agent
108
+ - code_executor
109
+ - presenter_agent
110
+
111
+ - id: tc004_aje_aja_comparison
112
+ name: "AJE vs AJA Daily Article Count Comparison"
113
+ description: "Compares daily article counts between AJE and AJA"
114
+ task: "Compare daily article counts for AJE and AJA from the last 30 days. Give me a chart and CSV."
115
+ timeout_seconds: 300
116
+ requires_ajsql: true
117
+ expected_deliverables:
118
+ - type: chart
119
+ pattern: "*.png"
120
+ min_count: 1
121
+ description: "Comparison chart showing AJE vs AJA daily counts"
122
+ - type: csv
123
+ pattern: "*.csv"
124
+ min_count: 1
125
+ description: "Raw data CSV with daily counts"
126
+ min_progress_updates: 5
127
+ quality_criteria:
128
+ - "Data queried from UCMS AJE and AJA databases"
129
+ - "Exactly 30 days of data (excluding today)"
130
+ - "Chart clearly shows both AJE and AJA trends"
131
+ - "CSV contains date, aje_count, aja_count columns"
132
+ - "No missing dates in the 30-day period"
133
+ - "Professional chart with legend, labels, and title"
134
+ expected_agents:
135
+ - planner_agent
136
+ - aj_sql_agent
137
+ - coder_agent
138
+ - code_executor
139
+ - presenter_agent
140
+
141
+ - id: tc005_aje_trump_trend
142
+ name: "AJE Trump Headlines - 6 Month Trend Analysis"
143
+ description: "Analyzes Trump headline trends in AJE"
144
+ task: "Plot Trump headline percentage trends for AJE over the last 6 months by week. Give me a chart and CSV."
145
+ timeout_seconds: 360
146
+ requires_ajsql: true
147
+ expected_deliverables:
148
+ - type: chart
149
+ pattern: "*.png"
150
+ min_count: 1
151
+ description: "Weekly trend chart with 3 metrics"
152
+ - type: csv
153
+ pattern: "*.csv"
154
+ min_count: 1
155
+ description: "Weekly data CSV"
156
+ min_progress_updates: 6
157
+ quality_criteria:
158
+ - "Data covers full 6 months from UCMS AJE database"
159
+ - "Chart shows 3 lines: trump count, total count, and % trump"
160
+ - "Data aggregated by week (ISO weeks recommended)"
161
+ - "CSV contains columns: week, trump_count, total_count, percent_trump"
162
+ - "Case-insensitive Trump matching in headlines"
163
+ - "Professional multi-line chart with legend and axis labels"
164
+ - "All weeks in 6-month period represented"
165
+ expected_agents:
166
+ - planner_agent
167
+ - aj_sql_agent
168
+ - coder_agent
169
+ - code_executor
170
+ - presenter_agent
171
+
172
+ - id: tc006_aje_trump_daily
173
+ name: "AJE Trump Headlines - Last Month Daily Chart"
174
+ description: "Daily Trump headline chart for AJE"
175
+ task: "Chart Trump headlines from AJE for the last month by day. Give me the chart and CSV with the headlines."
176
+ timeout_seconds: 300
177
+ requires_ajsql: true
178
+ expected_deliverables:
179
+ - type: chart
180
+ pattern: "*.png"
181
+ min_count: 1
182
+ description: "Daily Trump headline count chart"
183
+ - type: csv
184
+ pattern: "*headlines*.csv"
185
+ min_count: 1
186
+ description: "All Trump headlines with dates"
187
+ - type: csv
188
+ pattern: "*daily*.csv"
189
+ min_count: 1
190
+ description: "Daily count summary"
191
+ min_progress_updates: 5
192
+ quality_criteria:
193
+ - "Headlines queried from UCMS AJE wp_posts table"
194
+ - "Last 30 days of data"
195
+ - "Chart shows daily Trump headline counts"
196
+ - "CSV includes actual headline text, not just counts"
197
+ - "Headlines CSV has columns: date, headline (minimum)"
198
+ - "Daily summary CSV shows Trump count per day"
199
+ - "Case-insensitive Trump matching"
200
+ - "All data properly dated and sorted"
201
+ expected_agents:
202
+ - planner_agent
203
+ - aj_sql_agent
204
+ - coder_agent
205
+ - code_executor
206
+ - presenter_agent
207
+
208
+ - id: tc007_aja_aje_wordclouds
209
+ name: "AJA & AJE Last Month Word Clouds"
210
+ description: "Generate word clouds from AJA and AJE headlines"
211
+ task: "Generate word clouds from AJA and AJE headlines for the last month. Give me word frequency CSVs too."
212
+ timeout_seconds: 360
213
+ requires_ajsql: true
214
+ expected_deliverables:
215
+ - type: wordcloud
216
+ pattern: "*aja*.png"
217
+ min_count: 1
218
+ description: "AJA word cloud visualization"
219
+ - type: wordcloud
220
+ pattern: "*aje*.png"
221
+ min_count: 1
222
+ description: "AJE word cloud visualization"
223
+ - type: csv
224
+ pattern: "*aja*freq*.csv"
225
+ min_count: 1
226
+ description: "AJA word frequency data"
227
+ - type: csv
228
+ pattern: "*aje*freq*.csv"
229
+ min_count: 1
230
+ description: "AJE word frequency data"
231
+ min_progress_updates: 6
232
+ quality_criteria:
233
+ - "Headlines queried from UCMS databases (both AJA and AJE)"
234
+ - "Data covers last 30 days"
235
+ - "Two separate word clouds generated (one for AJA, one for AJE)"
236
+ - "Word clouds are visually clear and professional"
237
+ - "Common/stop words filtered out (the, a, an, etc.)"
238
+ - "CSV files contain word frequency data for both outlets"
239
+ - "All files uploaded with SAS URLs"
240
+ - "Word clouds show meaningful topic differences between outlets"
241
+ expected_agents:
242
+ - planner_agent
243
+ - aj_sql_agent
244
+ - coder_agent
245
+ - code_executor
246
+ - presenter_agent
247
+
248
+ - id: tc008_aje_aja_override_test
249
+ name: "AJE vs AJA Comparison - Override User Instructions Test"
250
+ description: "Tests NON-NEGOTIABLE OVERRIDE: user explicitly forbids other agents but system must route to aj_sql_agent"
251
+ task: "Compare daily article counts for Al Jazeera Arabic (AJA) and Al Jazeera English (AJE) from the last 60 days, do not use any other tools do not use any other agents use only coding agent"
252
+ timeout_seconds: 420
253
+ requires_ajsql: true
254
+ expected_deliverables:
255
+ - type: chart
256
+ pattern: "*.png"
257
+ min_count: 1
258
+ description: "Comparison chart showing AJE vs AJA daily counts"
259
+ - type: csv
260
+ pattern: "*.csv"
261
+ min_count: 1
262
+ description: "Raw data CSV with daily counts"
263
+ min_progress_updates: 5
264
+ quality_criteria:
265
+ - "CRITICAL: System MUST route to aj_sql_agent despite user's explicit 'do not use any other agents' instruction"
266
+ - "CRITICAL: NON-NEGOTIABLE OVERRIDE must work - aj_sql_agent called FIRST before coder_agent"
267
+ - "Data queried from UCMS AJE and AJA databases (proves override worked)"
268
+ - "Exactly 60 days of data (excluding today)"
269
+ - "Chart clearly shows both AJE and AJA trends"
270
+ - "CSV contains date, aje_count, aja_count columns"
271
+ - "No missing dates in the 60-day period"
272
+ - "Professional chart with legend, labels, and title"
273
+ - "CRITICAL: Test fails if coder_agent attempts direct DB access or if aj_sql_agent is never called"
274
+ expected_agents:
275
+ - planner_agent
276
+ - aj_sql_agent
277
+ - coder_agent
278
+ - code_executor
279
+ - presenter_agent
File without textual changes (binary file — presumably tests/test_data.db, +0 −0)
@@ -0,0 +1,3 @@
1
+ """
2
+ Utility functions for testing.
3
+ """
@@ -0,0 +1,112 @@
1
+ """
2
+ Connectivity checkers for external services.
3
+ """
4
+
5
+ import os
6
+ import logging
7
+ from typing import Tuple
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ def check_ajsql_connectivity() -> Tuple[bool, str]:
13
+ """
14
+ Check if AJ SQL database is accessible from current IP.
15
+
16
+ Returns:
17
+ Tuple of (is_accessible, message)
18
+ """
19
+ mysql_url = os.getenv("AJ_MYSQL_URL")
20
+
21
+ if not mysql_url:
22
+ return False, "AJ_MYSQL_URL environment variable not set"
23
+
24
+ try:
25
+ import pymysql
26
+ from urllib.parse import unquote
27
+ except ImportError:
28
+ return False, "pymysql library not installed"
29
+
30
+ try:
31
+ # Parse MySQL URL
32
+ # Format: mysql://user:password@host:port/database or mysql+pymysql://...
33
+ if mysql_url.startswith("mysql+pymysql://"):
34
+ url_parts = mysql_url[16:] # Remove mysql+pymysql://
35
+ elif mysql_url.startswith("mysql://"):
36
+ url_parts = mysql_url[8:] # Remove mysql://
37
+ else:
38
+ return False, "Invalid AJ_MYSQL_URL format (must start with mysql:// or mysql+pymysql://)"
39
+
40
+ # Split user:password@host:port/database
41
+ if "@" in url_parts:
42
+ auth_part, host_part = url_parts.split("@", 1)
43
+ user, password = auth_part.split(":", 1) if ":" in auth_part else (auth_part, "")
44
+ # URL-decode username and password (handles special characters like @ encoded as %40)
45
+ user = unquote(user)
46
+ password = unquote(password)
47
+ else:
48
+ return False, "Invalid AJ_MYSQL_URL format (missing credentials)"
49
+
50
+ # Split host:port/database (database is optional)
51
+ if "/" in host_part:
52
+ host_port, database = host_part.split("/", 1)
53
+ # Database can be empty (for multi-database access)
54
+ if not database:
55
+ database = None
56
+ else:
57
+ host_port = host_part
58
+ database = None
59
+
60
+ # Split host:port
61
+ if ":" in host_port:
62
+ host, port = host_port.rsplit(":", 1)
63
+ port = int(port)
64
+ else:
65
+ host = host_port
66
+ port = 3306
67
+
68
+ # Try to connect with a short timeout
69
+ logger.info(f"Testing AJ SQL connectivity to {host}:{port}")
70
+
71
+ # Build connection params
72
+ connect_params = {
73
+ 'host': host,
74
+ 'port': port,
75
+ 'user': user,
76
+ 'password': password,
77
+ 'connect_timeout': 5,
78
+ 'read_timeout': 5,
79
+ 'write_timeout': 5,
80
+ 'ssl': {'ssl': True}
81
+ }
82
+
83
+ # Only include database if specified
84
+ if database:
85
+ connect_params['database'] = database
86
+
87
+ connection = pymysql.connect(**connect_params)
88
+
89
+ # Run a simple query to verify access
90
+ with connection.cursor() as cursor:
91
+ cursor.execute("SELECT 1")
92
+ cursor.fetchone()
93
+
94
+ connection.close()
95
+
96
+ logger.info(f"✅ AJ SQL database is accessible")
97
+ return True, "Database is accessible"
98
+
99
+ except pymysql.err.OperationalError as e:
100
+ error_msg = str(e)
101
+ if "Access denied" in error_msg:
102
+ logger.warning(f"⚠️ AJ SQL access denied: {error_msg}")
103
+ return False, f"Access denied: {error_msg}"
104
+ elif "Can't connect" in error_msg or "timed out" in error_msg:
105
+ logger.warning(f"⚠️ AJ SQL connection failed (IP restriction?): {error_msg}")
106
+ return False, f"Connection failed (likely IP restriction): {error_msg}"
107
+ else:
108
+ logger.warning(f"⚠️ AJ SQL operational error: {error_msg}")
109
+ return False, f"Database error: {error_msg}"
110
+ except Exception as e:
111
+ logger.warning(f"⚠️ AJ SQL connectivity check failed: {e}")
112
+ return False, f"Unexpected error: {str(e)}"
@@ -9,6 +9,8 @@ import mimetypes
9
9
  import uuid
10
10
  import time
11
11
  import hashlib
12
+ import re
13
+ import unicodedata
12
14
  from datetime import datetime, timedelta
13
15
  from urllib.parse import urlparse, parse_qs
14
16
  from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions, ContentSettings
@@ -17,6 +19,27 @@ import requests
17
19
 
18
20
  logger = logging.getLogger(__name__)
19
21
 
22
+ def _sanitize_blob_name(filename: str) -> str:
23
+ """
24
+ Sanitize filename to be Azure Blob Storage safe.
25
+ Removes special characters and converts to ASCII-safe format.
26
+ """
27
+ # Normalize unicode characters (e.g., é -> e)
28
+ normalized = unicodedata.normalize('NFKD', filename)
29
+ # Remove accents/diacritics
30
+ ascii_str = normalized.encode('ascii', 'ignore').decode('ascii')
31
+ # Replace any remaining problematic characters with underscore
32
+ # Keep only: alphanumeric, dots, dashes, underscores
33
+ safe_name = re.sub(r'[^a-zA-Z0-9._-]', '_', ascii_str)
34
+ # Remove consecutive underscores
35
+ safe_name = re.sub(r'_+', '_', safe_name)
36
+ # Remove leading/trailing underscores or dots
37
+ safe_name = safe_name.strip('_.')
38
+ # Prevent empty filename (e.g., if all chars were special)
39
+ if not safe_name:
40
+ return "file"
41
+ return safe_name
42
+
20
43
  # Ensure correct MIME types for Office files, especially PPT/PPTX, for proper downloads in browsers
21
44
  try:
22
45
  mimetypes.add_type("application/vnd.openxmlformats-officedocument.presentationml.presentation", ".pptx", strict=False)
@@ -112,20 +135,42 @@ class AzureBlobUploader:
112
135
  if not os.path.exists(file_path):
113
136
  raise FileNotFoundError(f"File not found: {file_path}")
114
137
 
138
+ # Determine if we should preserve the exact filename or add timestamp/UUID
139
+ preserve = (os.getenv("PRESERVE_BLOB_FILENAME", "false").lower() in ("1", "true", "yes"))
140
+ prefix = (os.getenv("AZURE_BLOB_PREFIX") or "").strip().strip("/")
141
+
115
142
  if blob_name is None:
143
+ # Use original filename from file_path
116
144
  original_base = os.path.basename(file_path)
117
145
  name, ext = os.path.splitext(original_base)
118
- # Prefix support for virtual folders
119
- prefix = (os.getenv("AZURE_BLOB_PREFIX") or "").strip().strip("/")
120
- # Decide uniqueness policy: default add timestamp+short id to avoid static overwrites
121
- preserve = (os.getenv("PRESERVE_BLOB_FILENAME", "false").lower() in ("1", "true", "yes"))
122
- if preserve:
123
- final_name = original_base
124
- else:
125
- timestamp = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
126
- short_id = uuid.uuid4().hex[:8]
127
- final_name = f"{name}__{timestamp}_{short_id}{ext}"
128
- blob_name = f"{prefix}/{final_name}" if prefix else final_name
146
+ else:
147
+ # Use provided blob_name (might have path components)
148
+ # Extract just the filename part
149
+ blob_base = os.path.basename(blob_name)
150
+ name, ext = os.path.splitext(blob_base)
151
+ # Keep any directory prefix from blob_name
152
+ blob_dir = os.path.dirname(blob_name).strip('/')
153
+ if blob_dir:
154
+ prefix = f"{prefix}/{blob_dir}" if prefix else blob_dir
155
+
156
+ # Sanitize filename to be Azure Blob safe (remove special chars like é, ñ, etc.)
157
+ name = _sanitize_blob_name(name)
158
+ # Extension already starts with dot, just sanitize the part after the dot
159
+ if ext:
160
+ ext_without_dot = ext.lstrip('.')
161
+ sanitized_ext = _sanitize_blob_name(ext_without_dot)
162
+ ext = f".{sanitized_ext}" if sanitized_ext else ext
163
+
164
+ # Add timestamp+UUID suffix unless preserve flag is set
165
+ if preserve:
166
+ final_name = f"{name}{ext}"
167
+ else:
168
+ timestamp = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
169
+ short_id = uuid.uuid4().hex[:8]
170
+ final_name = f"{name}__{timestamp}_{short_id}{ext}"
171
+
172
+ # Construct final blob_name with prefix if provided
173
+ blob_name = f"{prefix}/{final_name}" if prefix else final_name
129
174
 
130
175
  # Normalize any accidental leading slashes in blob path
131
176
  normalized_blob_name = blob_name.lstrip("/")
@@ -186,20 +231,25 @@ class AzureBlobUploader:
186
231
  # Keep a single function for external calls to use the singleton uploader
187
232
  def upload_file_to_azure_blob(file_path: str, blob_name: str = None) -> str:
188
233
  """
189
- Uploads a file to Azure Blob Storage and returns a JSON string with the download URL.
190
- This function uses the singleton AzureBlobUploader instance.
191
-
192
- Reference local files in absolute path.
193
-
234
+ Uploads a file to Azure Blob Storage with automatic retry on transient failures.
235
+ Returns a JSON string with the download URL.
194
236
  """
195
- try:
196
- uploader = AzureBlobUploader()
197
- result = uploader.upload_file(file_path, blob_name)
198
- logger.info(f"✅ Successfully uploaded and got SAS URL for {file_path}")
199
- return json.dumps(result)
200
- except Exception as e:
201
- logger.error(f"❌ Failed to upload {file_path}. Error: {e}", exc_info=True)
202
- return json.dumps({"error": str(e)})
237
+ max_attempts = 3
238
+ retry_delay = 3
239
+
240
+ for attempt in range(1, max_attempts + 1):
241
+ try:
242
+ uploader = AzureBlobUploader()
243
+ result = uploader.upload_file(file_path, blob_name)
244
+ logger.info(f" Successfully uploaded {file_path} (attempt {attempt}/{max_attempts})")
245
+ return json.dumps(result)
246
+ except Exception as e:
247
+ if attempt < max_attempts:
248
+ logger.warning(f"⚠️ Upload attempt {attempt}/{max_attempts} failed for {file_path}: {e}. Retrying in {retry_delay}s...")
249
+ time.sleep(retry_delay)
250
+ else:
251
+ logger.error(f"❌ Upload failed after {max_attempts} attempts for {file_path}: {e}", exc_info=True)
252
+ return json.dumps({"error": str(e)})
203
253
 
204
254
  # This function is no longer needed as the class handles text uploads if necessary,
205
255
  # and direct calls should go through the singleton.
@@ -0,0 +1,38 @@
1
+ {
2
+ "pokemon": {
3
+ "name": "PokeAPI",
4
+ "description": "Official Pokemon data API with sprites and artwork",
5
+ "url_pattern": "https://pokeapi.co/api/v2/pokemon/{entity}",
6
+ "entity_transform": "lowercase",
7
+ "image_fields": [
8
+ "sprites.other.official-artwork.front_default",
9
+ "sprites.front_default"
10
+ ],
11
+ "fallback_search_query": "{entity} pokemon official artwork",
12
+ "enabled": true
13
+ },
14
+ "country": {
15
+ "name": "REST Countries",
16
+ "description": "Country data including flags",
17
+ "url_pattern": "https://restcountries.com/v3.1/name/{entity}",
18
+ "entity_transform": "none",
19
+ "image_fields": [
20
+ "[0].flags.png",
21
+ "[0].flags.svg"
22
+ ],
23
+ "fallback_search_query": "{entity} country flag",
24
+ "enabled": true
25
+ },
26
+ "movie": {
27
+ "name": "OMDB API",
28
+ "description": "Movie database with posters (requires API key in OMDB_API_KEY env var)",
29
+ "url_pattern": "http://www.omdbapi.com/?apikey={OMDB_API_KEY}&t={entity}",
30
+ "entity_transform": "none",
31
+ "image_fields": [
32
+ "Poster"
33
+ ],
34
+ "fallback_search_query": "{entity} movie poster",
35
+ "enabled": false,
36
+ "requires_env": ["OMDB_API_KEY"]
37
+ }
38
+ }
@@ -213,7 +213,7 @@ async def download_image(url: str, filename: str, work_dir: Optional[str] = None
213
213
  "User-Agent": BROWSER_UA,
214
214
  "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
215
215
  "Accept-Language": "en-US,en;q=0.9",
216
- "Referer": "https://duckduckgo.com/",
216
+ "Referer": "https://www.google.com/",
217
217
  "Cache-Control": "no-cache",
218
218
  })
219
219