@aj-archipelago/cortex 1.4.2 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/config.js +1 -1
- package/helper-apps/cortex-autogen2/.dockerignore +1 -0
- package/helper-apps/cortex-autogen2/Dockerfile +6 -10
- package/helper-apps/cortex-autogen2/Dockerfile.worker +2 -0
- package/helper-apps/cortex-autogen2/agents.py +203 -2
- package/helper-apps/cortex-autogen2/main.py +1 -1
- package/helper-apps/cortex-autogen2/pyproject.toml +12 -0
- package/helper-apps/cortex-autogen2/requirements.txt +14 -0
- package/helper-apps/cortex-autogen2/services/redis_publisher.py +1 -1
- package/helper-apps/cortex-autogen2/services/run_analyzer.py +1 -1
- package/helper-apps/cortex-autogen2/task_processor.py +431 -229
- package/helper-apps/cortex-autogen2/test_entity_fetcher.py +305 -0
- package/helper-apps/cortex-autogen2/tests/README.md +240 -0
- package/helper-apps/cortex-autogen2/tests/TEST_REPORT.md +342 -0
- package/helper-apps/cortex-autogen2/tests/__init__.py +8 -0
- package/helper-apps/cortex-autogen2/tests/analysis/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/analysis/improvement_suggester.py +224 -0
- package/helper-apps/cortex-autogen2/tests/analysis/trend_analyzer.py +211 -0
- package/helper-apps/cortex-autogen2/tests/cli/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/cli/run_tests.py +296 -0
- package/helper-apps/cortex-autogen2/tests/collectors/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/collectors/log_collector.py +252 -0
- package/helper-apps/cortex-autogen2/tests/collectors/progress_collector.py +182 -0
- package/helper-apps/cortex-autogen2/tests/conftest.py +15 -0
- package/helper-apps/cortex-autogen2/tests/database/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/database/repository.py +501 -0
- package/helper-apps/cortex-autogen2/tests/database/schema.sql +108 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/llm_scorer.py +294 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/prompts.py +250 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/wordcloud_validator.py +168 -0
- package/helper-apps/cortex-autogen2/tests/metrics/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/metrics/collector.py +155 -0
- package/helper-apps/cortex-autogen2/tests/orchestrator.py +576 -0
- package/helper-apps/cortex-autogen2/tests/test_cases.yaml +279 -0
- package/helper-apps/cortex-autogen2/tests/test_data.db +0 -0
- package/helper-apps/cortex-autogen2/tests/utils/__init__.py +3 -0
- package/helper-apps/cortex-autogen2/tests/utils/connectivity.py +112 -0
- package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +74 -24
- package/helper-apps/cortex-autogen2/tools/entity_api_registry.json +38 -0
- package/helper-apps/cortex-autogen2/tools/file_tools.py +1 -1
- package/helper-apps/cortex-autogen2/tools/search_tools.py +436 -238
- package/helper-apps/cortex-file-handler/package-lock.json +2 -2
- package/helper-apps/cortex-file-handler/package.json +1 -1
- package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +4 -5
- package/helper-apps/cortex-file-handler/src/blobHandler.js +36 -144
- package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +5 -3
- package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +34 -1
- package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +22 -0
- package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +28 -1
- package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +29 -4
- package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +11 -0
- package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +1 -1
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +3 -2
- package/helper-apps/cortex-file-handler/tests/checkHashShortLived.test.js +8 -1
- package/helper-apps/cortex-file-handler/tests/containerConversionFlow.test.js +5 -2
- package/helper-apps/cortex-file-handler/tests/containerNameParsing.test.js +14 -7
- package/helper-apps/cortex-file-handler/tests/containerParameterFlow.test.js +5 -2
- package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +31 -19
- package/package.json +1 -1
- package/server/modelExecutor.js +4 -0
- package/server/plugins/claude4VertexPlugin.js +540 -0
- package/server/plugins/openAiWhisperPlugin.js +43 -2
- package/tests/integration/rest/vendors/claude_streaming.test.js +121 -0
- package/tests/unit/plugins/claude4VertexPlugin.test.js +462 -0
- package/tests/unit/plugins/claude4VertexToolConversion.test.js +413 -0
- package/helper-apps/cortex-autogen/.funcignore +0 -8
- package/helper-apps/cortex-autogen/Dockerfile +0 -10
- package/helper-apps/cortex-autogen/OAI_CONFIG_LIST +0 -6
- package/helper-apps/cortex-autogen/agents.py +0 -493
- package/helper-apps/cortex-autogen/agents_extra.py +0 -14
- package/helper-apps/cortex-autogen/config.py +0 -18
- package/helper-apps/cortex-autogen/data_operations.py +0 -29
- package/helper-apps/cortex-autogen/function_app.py +0 -44
- package/helper-apps/cortex-autogen/host.json +0 -15
- package/helper-apps/cortex-autogen/main.py +0 -38
- package/helper-apps/cortex-autogen/prompts.py +0 -196
- package/helper-apps/cortex-autogen/prompts_extra.py +0 -5
- package/helper-apps/cortex-autogen/requirements.txt +0 -9
- package/helper-apps/cortex-autogen/search.py +0 -85
- package/helper-apps/cortex-autogen/test.sh +0 -40
- package/helper-apps/cortex-autogen/tools/sasfileuploader.py +0 -66
- package/helper-apps/cortex-autogen/utils.py +0 -88
- package/helper-apps/cortex-autogen2/DigiCertGlobalRootCA.crt.pem +0 -22
- package/helper-apps/cortex-autogen2/poetry.lock +0 -3652
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Web search tools (keyless).
|
|
3
3
|
|
|
4
|
-
Implements
|
|
5
|
-
- web_search: web results via
|
|
6
|
-
- image_search: image results via
|
|
4
|
+
Implements Google CSE-based search for web and image results:
|
|
5
|
+
- web_search: web results via Google CSE
|
|
6
|
+
- image_search: image results via Google CSE
|
|
7
7
|
- combined_search: combined web + image results
|
|
8
8
|
"""
|
|
9
9
|
|
|
@@ -15,6 +15,7 @@ from typing import Dict, Any, List, Optional
|
|
|
15
15
|
import hashlib
|
|
16
16
|
from PIL import Image
|
|
17
17
|
import asyncio # Import asyncio
|
|
18
|
+
import aiohttp # Add async HTTP client
|
|
18
19
|
import matplotlib.pyplot as plt
|
|
19
20
|
import pandas as pd
|
|
20
21
|
import re
|
|
@@ -180,44 +181,6 @@ def _extract_snippet_near(html: str, start_pos: int) -> Optional[str]:
|
|
|
180
181
|
return text or None
|
|
181
182
|
|
|
182
183
|
|
|
183
|
-
def _ddg_web(query: str, count: int = 25) -> List[Dict[str, Any]]:
|
|
184
|
-
url = f"https://duckduckgo.com/html/?q={urllib.parse.quote_plus(query)}"
|
|
185
|
-
headers = {"User-Agent": USER_AGENT}
|
|
186
|
-
resp = requests.get(url, headers=headers, timeout=20)
|
|
187
|
-
resp.raise_for_status()
|
|
188
|
-
html = resp.text
|
|
189
|
-
|
|
190
|
-
# Capture results: <a class="result__a" href="...">Title</a>
|
|
191
|
-
links_iter = re.finditer(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>(.*?)</a>', html, flags=re.I | re.S)
|
|
192
|
-
results: List[Dict[str, Any]] = []
|
|
193
|
-
for match in links_iter:
|
|
194
|
-
href = match.group(1)
|
|
195
|
-
title_html = match.group(2)
|
|
196
|
-
title_text = html_lib.unescape(re.sub('<[^<]+?>', '', title_html)).strip()
|
|
197
|
-
if not title_text or not href:
|
|
198
|
-
continue
|
|
199
|
-
# Resolve DDG redirect links and protocol-relative URLs
|
|
200
|
-
url_val = href
|
|
201
|
-
if url_val.startswith("//"):
|
|
202
|
-
url_val = "https:" + url_val
|
|
203
|
-
try:
|
|
204
|
-
parsed = urllib.parse.urlparse(url_val)
|
|
205
|
-
if parsed.netloc.endswith("duckduckgo.com") and parsed.path.startswith("/l/"):
|
|
206
|
-
qs = urllib.parse.parse_qs(parsed.query)
|
|
207
|
-
uddg = qs.get("uddg", [None])[0]
|
|
208
|
-
if uddg:
|
|
209
|
-
url_val = urllib.parse.unquote(uddg)
|
|
210
|
-
except Exception:
|
|
211
|
-
pass
|
|
212
|
-
snippet = _extract_snippet_near(html, match.end())
|
|
213
|
-
results.append({
|
|
214
|
-
"title": title_text,
|
|
215
|
-
"url": url_val,
|
|
216
|
-
"snippet": snippet,
|
|
217
|
-
})
|
|
218
|
-
if len(results) >= max(1, count):
|
|
219
|
-
break
|
|
220
|
-
return _normalize_web_results(results)
|
|
221
184
|
|
|
222
185
|
|
|
223
186
|
def _enrich_web_results_with_meta(results: List[Dict[str, Any]], max_fetch: int = 3, timeout_s: int = 8) -> List[Dict[str, Any]]:
|
|
@@ -366,73 +329,6 @@ async def fetch_webpage(url: str, render: bool = False, timeout_s: int = 20, max
|
|
|
366
329
|
return json.dumps({"error": f"Fetch failed: {str(exc)}"})
|
|
367
330
|
|
|
368
331
|
|
|
369
|
-
# DuckDuckGo vqd token method removed - no API key needed, using HTML scraping only
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
def _ddg_images_html(query: str, count: int = 25) -> List[Dict[str, Any]]:
|
|
373
|
-
headers = {"User-Agent": USER_AGENT, "Referer": "https://duckduckgo.com/"}
|
|
374
|
-
url = f"https://duckduckgo.com/?q={urllib.parse.quote_plus(query)}&ia=images&iar=images"
|
|
375
|
-
try:
|
|
376
|
-
resp = requests.get(url, headers=headers, timeout=20)
|
|
377
|
-
resp.raise_for_status()
|
|
378
|
-
html = resp.text
|
|
379
|
-
items: List[Dict[str, Any]] = []
|
|
380
|
-
|
|
381
|
-
# Method 1: Look for external-content proxied URLs
|
|
382
|
-
for m in re.finditer(r'(?:src|data-src)="(https://external-content\.duckduckgo\.com/iu/\?u=[^"]+)"', html):
|
|
383
|
-
proxy = html_lib.unescape(m.group(1))
|
|
384
|
-
try:
|
|
385
|
-
parsed = urllib.parse.urlparse(proxy)
|
|
386
|
-
qs = urllib.parse.parse_qs(parsed.query)
|
|
387
|
-
orig = qs.get('u', [None])[0]
|
|
388
|
-
if not orig:
|
|
389
|
-
continue
|
|
390
|
-
orig = urllib.parse.unquote(orig)
|
|
391
|
-
items.append({
|
|
392
|
-
"title": None,
|
|
393
|
-
"image": orig,
|
|
394
|
-
"thumbnail": proxy,
|
|
395
|
-
"width": None,
|
|
396
|
-
"height": None,
|
|
397
|
-
"source": None,
|
|
398
|
-
})
|
|
399
|
-
if len(items) >= count:
|
|
400
|
-
break
|
|
401
|
-
except Exception:
|
|
402
|
-
continue
|
|
403
|
-
|
|
404
|
-
# Method 2: Look for direct image URLs in the page
|
|
405
|
-
if len(items) < count // 2:
|
|
406
|
-
direct_patterns = [
|
|
407
|
-
r'"(https://[^"]+\.(?:jpg|jpeg|png|webp|gif))"',
|
|
408
|
-
r"'(https://[^']+\.(?:jpg|jpeg|png|webp|gif))'",
|
|
409
|
-
]
|
|
410
|
-
for pattern in direct_patterns:
|
|
411
|
-
for m in re.finditer(pattern, html, re.I):
|
|
412
|
-
img_url = m.group(1)
|
|
413
|
-
if "duckduckgo.com" not in img_url and img_url not in [i["image"] for i in items]:
|
|
414
|
-
items.append({
|
|
415
|
-
"title": None,
|
|
416
|
-
"image": img_url,
|
|
417
|
-
"thumbnail": img_url,
|
|
418
|
-
"width": None,
|
|
419
|
-
"height": None,
|
|
420
|
-
"source": None,
|
|
421
|
-
})
|
|
422
|
-
if len(items) >= count:
|
|
423
|
-
break
|
|
424
|
-
if len(items) >= count:
|
|
425
|
-
break
|
|
426
|
-
|
|
427
|
-
logging.info(f"[_ddg_images_html] Found {len(items)} images for query: {query}")
|
|
428
|
-
return _normalize_image_results(items)
|
|
429
|
-
except Exception as e:
|
|
430
|
-
logging.error(f"[_ddg_images_html] Failed for query '{query}': {e}")
|
|
431
|
-
return []
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
# DuckDuckGo JSON API method removed - no API key available, using HTML scraping only
|
|
435
|
-
|
|
436
332
|
|
|
437
333
|
async def web_search(query: str, count: int = 25, enrich: bool = True) -> str:
|
|
438
334
|
try:
|
|
@@ -449,8 +345,6 @@ async def web_search(query: str, count: int = 25, enrich: bool = True) -> str:
|
|
|
449
345
|
used_google = False
|
|
450
346
|
results = []
|
|
451
347
|
|
|
452
|
-
if not results:
|
|
453
|
-
results = _ddg_web(query, count)
|
|
454
348
|
|
|
455
349
|
if enrich and results:
|
|
456
350
|
# Enrich only for web-page items
|
|
@@ -549,15 +443,7 @@ async def image_search(query: str, count: int = 25, verify_download: bool = True
|
|
|
549
443
|
logging.warning(f"[image_search] Google CSE failed: {e}")
|
|
550
444
|
results = []
|
|
551
445
|
|
|
552
|
-
|
|
553
|
-
# Fallback to DuckDuckGo HTML scraping if CSE disabled or empty
|
|
554
|
-
try:
|
|
555
|
-
logging.info(f"[image_search] Falling back to DuckDuckGo HTML scraping for query: {query}")
|
|
556
|
-
results = _ddg_images_html(query, count)
|
|
557
|
-
logging.info(f"[image_search] DuckDuckGo HTML returned {len(results)} results")
|
|
558
|
-
except Exception as e:
|
|
559
|
-
logging.error(f"[image_search] All methods failed for query '{query}': {e}")
|
|
560
|
-
results = []
|
|
446
|
+
|
|
561
447
|
|
|
562
448
|
# Post-filtering and ranking for relevance and quality
|
|
563
449
|
def score(item: Dict[str, Any]) -> int:
|
|
@@ -725,13 +611,8 @@ async def combined_search(query: str, count: int = 25, enrich: bool = True) -> s
|
|
|
725
611
|
except Exception:
|
|
726
612
|
img_results = []
|
|
727
613
|
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
if enrich and web_results:
|
|
731
|
-
web_results = _enrich_web_results_with_meta(web_results)
|
|
732
|
-
if not img_results:
|
|
733
|
-
img_results = _ddg_images_html(query, count)
|
|
734
|
-
|
|
614
|
+
if enrich and web_results:
|
|
615
|
+
web_results = _enrich_web_results_with_meta(web_results)
|
|
735
616
|
combined.extend(web_results)
|
|
736
617
|
combined.extend(img_results)
|
|
737
618
|
if not combined:
|
|
@@ -840,136 +721,197 @@ async def collect_task_images(
|
|
|
840
721
|
accepted: List[Dict[str, Any]] = []
|
|
841
722
|
skipped: List[Dict[str, Any]] = []
|
|
842
723
|
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
session = requests.Session()
|
|
848
|
-
session.headers.update({"User-Agent": USER_AGENT})
|
|
849
|
-
except Exception:
|
|
850
|
-
session = None
|
|
851
|
-
|
|
852
|
-
def is_image_ok(url: str) -> bool:
|
|
853
|
-
if not verify_download or not session:
|
|
724
|
+
# Async verification using aiohttp for parallel checks
|
|
725
|
+
async def is_image_ok_async(url: str, session: aiohttp.ClientSession) -> bool:
|
|
726
|
+
"""Async version of image verification for parallel execution."""
|
|
727
|
+
if not verify_download:
|
|
854
728
|
return True
|
|
855
729
|
try:
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
730
|
+
# Increased timeout from 5s to 15s - many image CDNs are slow (Reddit, eBay, etc.)
|
|
731
|
+
async with session.head(url, timeout=aiohttp.ClientTimeout(total=15), allow_redirects=True) as response:
|
|
732
|
+
if response.status != 200:
|
|
733
|
+
return False
|
|
734
|
+
ct = (response.headers.get("content-type") or "").lower()
|
|
735
|
+
if ct.startswith("image/"):
|
|
736
|
+
return True
|
|
737
|
+
# If HEAD doesn't give content-type, try GET with small range
|
|
738
|
+
async with session.get(url, timeout=aiohttp.ClientTimeout(total=15), allow_redirects=True) as get_resp:
|
|
739
|
+
if get_resp.status == 200:
|
|
740
|
+
first_chunk = await get_resp.content.read(2048)
|
|
741
|
+
sigs = [b"\x89PNG\r\n\x1a\n", b"\xff\xd8\xff", b"GIF87a", b"GIF89a", b"RIFF"]
|
|
742
|
+
return any(first_chunk.startswith(sig) for sig in sigs)
|
|
743
|
+
except (asyncio.TimeoutError, aiohttp.ClientError):
|
|
744
|
+
return False
|
|
860
745
|
except Exception:
|
|
861
746
|
return False
|
|
862
747
|
return False
|
|
863
748
|
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
if used >= count:
|
|
868
|
-
break
|
|
869
|
-
# Prefer original_url if available
|
|
870
|
-
img_url = it.get("original_url") or it.get("url")
|
|
871
|
-
if not img_url:
|
|
872
|
-
skipped.append({"reason": "missing_url", "item": it})
|
|
873
|
-
continue
|
|
874
|
-
if not is_image_ok(img_url):
|
|
875
|
-
skipped.append({"reason": "verify_failed", "url": img_url})
|
|
876
|
-
continue
|
|
877
|
-
|
|
878
|
-
# Determine extension; if SVG, download as .svg then convert to PNG
|
|
879
|
-
base = re.sub(r"[^a-zA-Z0-9_-]+", "_", (it.get("title") or "image").strip())[:80] or "image"
|
|
880
|
-
url_lower = (img_url or "").lower()
|
|
881
|
-
is_svg = url_lower.endswith(".svg") or ".svg" in url_lower
|
|
882
|
-
filename = f"{base}_{used+1}.svg" if is_svg else f"{base}_{used+1}.jpg"
|
|
883
|
-
dl_json = await download_image(img_url, filename, work_dir)
|
|
884
|
-
dl = json.loads(dl_json)
|
|
885
|
-
if dl.get("status") != "success":
|
|
886
|
-
skipped.append({"reason": "download_error", "url": img_url, "detail": dl})
|
|
887
|
-
continue
|
|
888
|
-
|
|
889
|
-
file_path = dl.get("file_path")
|
|
890
|
-
# If SVG, convert to PNG for PIL compatibility
|
|
891
|
-
if is_svg and file_path and os.path.exists(file_path):
|
|
892
|
-
try:
|
|
893
|
-
import cairosvg # type: ignore
|
|
894
|
-
png_path = os.path.splitext(file_path)[0] + ".png"
|
|
895
|
-
cairosvg.svg2png(url=file_path, write_to=png_path)
|
|
896
|
-
try:
|
|
897
|
-
os.remove(file_path)
|
|
898
|
-
except Exception:
|
|
899
|
-
pass
|
|
900
|
-
file_path = png_path
|
|
901
|
-
except Exception as e:
|
|
902
|
-
skipped.append({"reason": "svg_convert_failed", "url": img_url, "error": str(e)})
|
|
903
|
-
try:
|
|
904
|
-
os.remove(file_path)
|
|
905
|
-
except Exception:
|
|
906
|
-
pass
|
|
907
|
-
continue
|
|
908
|
-
# Optional dimension filter
|
|
749
|
+
# Parallel download/verification helper
|
|
750
|
+
async def process_single_image(it: Dict[str, Any], idx: int, aio_session: aiohttp.ClientSession) -> Optional[Dict[str, Any]]:
|
|
751
|
+
"""Download, verify, and process a single image. Returns accepted dict or None if skipped."""
|
|
909
752
|
try:
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
753
|
+
img_url = it.get("original_url") or it.get("url")
|
|
754
|
+
if not img_url:
|
|
755
|
+
skipped.append({"reason": "missing_url", "item": it})
|
|
756
|
+
return None
|
|
757
|
+
if not await is_image_ok_async(img_url, aio_session):
|
|
758
|
+
skipped.append({"reason": "verify_failed", "url": img_url})
|
|
759
|
+
return None
|
|
760
|
+
|
|
761
|
+
# Determine extension; if SVG, download as .svg then convert to PNG
|
|
762
|
+
base = re.sub(r"[^a-zA-Z0-9_-]+", "_", (it.get("title") or "image").strip())[:80] or "image"
|
|
763
|
+
url_lower = (img_url or "").lower()
|
|
764
|
+
is_svg = url_lower.endswith(".svg") or ".svg" in url_lower
|
|
765
|
+
filename = f"{base}_{idx+1}.svg" if is_svg else f"{base}_{idx+1}.jpg"
|
|
766
|
+
dl_json = await download_image(img_url, filename, work_dir)
|
|
767
|
+
dl = json.loads(dl_json)
|
|
768
|
+
if dl.get("status") != "success":
|
|
769
|
+
skipped.append({"reason": "download_error", "url": img_url, "detail": dl})
|
|
770
|
+
return None
|
|
922
771
|
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
if
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
digest = hasher.hexdigest()
|
|
931
|
-
if digest in seen_hashes:
|
|
932
|
-
skipped.append({"reason": "content_duplicate", "url": img_url})
|
|
772
|
+
file_path = dl.get("file_path")
|
|
773
|
+
# If SVG, convert to PNG for PIL compatibility
|
|
774
|
+
if is_svg and file_path and os.path.exists(file_path):
|
|
775
|
+
try:
|
|
776
|
+
import cairosvg # type: ignore
|
|
777
|
+
png_path = os.path.splitext(file_path)[0] + ".png"
|
|
778
|
+
cairosvg.svg2png(url=file_path, write_to=png_path)
|
|
933
779
|
try:
|
|
934
780
|
os.remove(file_path)
|
|
935
781
|
except Exception:
|
|
936
782
|
pass
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
783
|
+
file_path = png_path
|
|
784
|
+
except Exception as e:
|
|
785
|
+
skipped.append({"reason": "svg_convert_failed", "url": img_url, "error": str(e)})
|
|
786
|
+
try:
|
|
787
|
+
os.remove(file_path)
|
|
788
|
+
except Exception:
|
|
789
|
+
pass
|
|
790
|
+
return None
|
|
791
|
+
|
|
792
|
+
# Optional dimension filter
|
|
793
|
+
try:
|
|
794
|
+
if (min_width or min_height) and file_path and os.path.exists(file_path):
|
|
795
|
+
with Image.open(file_path) as im:
|
|
796
|
+
w, h = im.size
|
|
797
|
+
if (min_width and w < min_width) or (min_height and h < min_height):
|
|
798
|
+
skipped.append({"reason": "too_small", "url": img_url, "width": w, "height": h})
|
|
799
|
+
try:
|
|
800
|
+
os.remove(file_path)
|
|
801
|
+
except Exception:
|
|
802
|
+
pass
|
|
803
|
+
return None
|
|
804
|
+
except Exception:
|
|
805
|
+
pass
|
|
806
|
+
|
|
807
|
+
# Compute hash for deduplication (will filter later)
|
|
808
|
+
content_hash = None
|
|
809
|
+
try:
|
|
810
|
+
if dedup_content and file_path and os.path.exists(file_path):
|
|
811
|
+
hasher = hashlib.sha256()
|
|
812
|
+
with open(file_path, "rb") as fh:
|
|
813
|
+
for chunk in iter(lambda: fh.read(1024 * 1024), b""):
|
|
814
|
+
hasher.update(chunk)
|
|
815
|
+
content_hash = hasher.hexdigest()
|
|
816
|
+
except Exception:
|
|
817
|
+
pass
|
|
818
|
+
|
|
819
|
+
# Upload if configured, else mark as local only
|
|
820
|
+
azure_conn = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
|
|
821
|
+
if azure_conn:
|
|
822
|
+
up_json = upload_file_to_azure_blob(file_path)
|
|
823
|
+
up = json.loads(up_json)
|
|
824
|
+
if "download_url" in up:
|
|
825
|
+
return {
|
|
826
|
+
"title": it.get("title"),
|
|
827
|
+
"source_page": it.get("host_page_url"),
|
|
828
|
+
"uploaded_url": up["download_url"],
|
|
829
|
+
"local_path": file_path,
|
|
830
|
+
"width": it.get("width"),
|
|
831
|
+
"height": it.get("height"),
|
|
832
|
+
"source_host": it.get("_host"),
|
|
833
|
+
"_content_hash": content_hash,
|
|
834
|
+
}
|
|
835
|
+
else:
|
|
836
|
+
skipped.append({"reason": "upload_error", "file_path": file_path, "detail": up})
|
|
837
|
+
return None
|
|
838
|
+
else:
|
|
839
|
+
return {
|
|
948
840
|
"title": it.get("title"),
|
|
949
841
|
"source_page": it.get("host_page_url"),
|
|
950
|
-
"uploaded_url":
|
|
842
|
+
"uploaded_url": None,
|
|
951
843
|
"local_path": file_path,
|
|
952
844
|
"width": it.get("width"),
|
|
953
845
|
"height": it.get("height"),
|
|
954
846
|
"source_host": it.get("_host"),
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
847
|
+
"note": "AZURE_STORAGE_CONNECTION_STRING not set; upload skipped",
|
|
848
|
+
"_content_hash": content_hash,
|
|
849
|
+
}
|
|
850
|
+
except Exception as e:
|
|
851
|
+
skipped.append({"reason": "processing_error", "error": str(e)})
|
|
852
|
+
return None
|
|
853
|
+
|
|
854
|
+
# Process images in parallel batches with connection pooling
|
|
855
|
+
BATCH_SIZE = 15 # Balanced for connection pool (limit_per_host=10)
|
|
856
|
+
all_results: List[Optional[Dict[str, Any]]] = []
|
|
857
|
+
|
|
858
|
+
# Create aiohttp session with connection pooling for performance
|
|
859
|
+
connector = aiohttp.TCPConnector(limit=30, limit_per_host=10)
|
|
860
|
+
timeout = aiohttp.ClientTimeout(total=30, connect=10)
|
|
861
|
+
|
|
862
|
+
async with aiohttp.ClientSession(
|
|
863
|
+
connector=connector,
|
|
864
|
+
timeout=timeout,
|
|
865
|
+
headers={"User-Agent": USER_AGENT, "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8"}
|
|
866
|
+
) as aio_session:
|
|
867
|
+
for batch_start in range(0, min(len(filtered), count * 3), BATCH_SIZE): # Over-fetch to handle failures
|
|
868
|
+
batch = filtered[batch_start:batch_start + BATCH_SIZE]
|
|
869
|
+
# Early exit: stop processing new batches once we have enough
|
|
870
|
+
if len(accepted) >= count:
|
|
871
|
+
logging.info(f"[collect_task_images] Early exit: {len(accepted)} images collected (target: {count})")
|
|
872
|
+
break
|
|
873
|
+
|
|
874
|
+
# Process batch in parallel with shared session
|
|
875
|
+
tasks = [process_single_image(it, batch_start + i, aio_session) for i, it in enumerate(batch)]
|
|
876
|
+
batch_results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
877
|
+
|
|
878
|
+
# Filter out exceptions and add successful results
|
|
879
|
+
for result in batch_results:
|
|
880
|
+
if not isinstance(result, Exception) and result is not None:
|
|
881
|
+
all_results.append(result)
|
|
882
|
+
|
|
883
|
+
# Post-process: deduplicate by hash and limit to count
|
|
884
|
+
seen_hashes: set = set()
|
|
885
|
+
for result in all_results:
|
|
886
|
+
if result is None:
|
|
887
|
+
continue
|
|
888
|
+
# Early exit once we have enough images
|
|
889
|
+
if len(accepted) >= count:
|
|
890
|
+
# Clean up any extra downloaded files beyond what we need
|
|
891
|
+
try:
|
|
892
|
+
if result.get("local_path") and os.path.exists(result["local_path"]):
|
|
893
|
+
os.remove(result["local_path"])
|
|
894
|
+
except Exception:
|
|
895
|
+
pass
|
|
896
|
+
break
|
|
897
|
+
|
|
898
|
+
# Deduplication check
|
|
899
|
+
if dedup_content and result.get("_content_hash"):
|
|
900
|
+
if result["_content_hash"] in seen_hashes:
|
|
901
|
+
skipped.append({"reason": "content_duplicate", "url": result.get("source_page")})
|
|
902
|
+
# Clean up duplicate file
|
|
903
|
+
try:
|
|
904
|
+
if result.get("local_path") and os.path.exists(result["local_path"]):
|
|
905
|
+
os.remove(result["local_path"])
|
|
906
|
+
except Exception:
|
|
907
|
+
pass
|
|
960
908
|
continue
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
"width": it.get("width"),
|
|
968
|
-
"height": it.get("height"),
|
|
969
|
-
"source_host": it.get("_host"),
|
|
970
|
-
"note": "AZURE_STORAGE_CONNECTION_STRING not set; upload skipped"
|
|
971
|
-
})
|
|
972
|
-
used += 1
|
|
909
|
+
seen_hashes.add(result["_content_hash"])
|
|
910
|
+
|
|
911
|
+
# Remove internal hash field before adding to accepted
|
|
912
|
+
if "_content_hash" in result:
|
|
913
|
+
del result["_content_hash"]
|
|
914
|
+
accepted.append(result)
|
|
973
915
|
|
|
974
916
|
# No synthesis: if no accepted items, return zero results as-is
|
|
975
917
|
|
|
@@ -1200,4 +1142,260 @@ async def azure_cognitive_search(
|
|
|
1200
1142
|
tasks.append(_perform_single_cognitive_search(**q_params))
|
|
1201
1143
|
|
|
1202
1144
|
results = await asyncio.gather(*tasks)
|
|
1203
|
-
return json.dumps(results, indent=2)
|
|
1145
|
+
return json.dumps(results, indent=2)
|
|
1146
|
+
|
|
1147
|
+
|
|
1148
|
+
def _extract_json_path(data: Any, path: str) -> Optional[Any]:
|
|
1149
|
+
"""
|
|
1150
|
+
Extract nested value from JSON using dot notation path.
|
|
1151
|
+
Supports array indices like '[0]' and nested paths like 'sprites.other.official-artwork.front_default'.
|
|
1152
|
+
Returns None if path is invalid or key doesn't exist.
|
|
1153
|
+
|
|
1154
|
+
Examples:
|
|
1155
|
+
>>> _extract_json_path({'a': {'b': [1, 2]}}, 'a.b.[0]')
|
|
1156
|
+
1
|
|
1157
|
+
>>> _extract_json_path({'x': {'y': {'z': 42}}}, 'x.y.z')
|
|
1158
|
+
42
|
|
1159
|
+
"""
|
|
1160
|
+
try:
|
|
1161
|
+
current = data
|
|
1162
|
+
parts = path.replace('[', '.[').split('.')
|
|
1163
|
+
for part in parts:
|
|
1164
|
+
if not part:
|
|
1165
|
+
continue
|
|
1166
|
+
if part.startswith('[') and part.endswith(']'):
|
|
1167
|
+
# Array index
|
|
1168
|
+
idx = int(part[1:-1])
|
|
1169
|
+
current = current[idx]
|
|
1170
|
+
else:
|
|
1171
|
+
# Dictionary key
|
|
1172
|
+
current = current[part]
|
|
1173
|
+
return current
|
|
1174
|
+
except (KeyError, IndexError, TypeError, ValueError):
|
|
1175
|
+
return None
|
|
1176
|
+
|
|
1177
|
+
|
|
async def fetch_entity_images(
    entities: List[str],
    entity_type: str = "pokemon",
    count_per_entity: int = 1,
    work_dir: Optional[str] = None,
    force_web_search: bool = False
) -> str:
    """
    Multi-tier entity image fetcher with automatic fallback:
    1. Try entity API registry (fast, reliable for known types)
    2. Fallback to web search (flexible, works for any entity)

    This generic tool works for ANY structured entity type (Pokemon, countries, movies, etc.)

    Args:
        entities: List of entity names to fetch images for (e.g., ["Gengar", "Mewtwo", "Alakazam"])
        entity_type: Type of entity (e.g., "pokemon", "country", "movie")
        count_per_entity: Number of images to fetch per entity (default: 1)
        work_dir: Directory to save downloaded images (defaults to the current working directory)
        force_web_search: Skip API registry and go straight to web search fallback

    Returns:
        JSON string with per-entity "results", aggregate "stats" (including
        "total_images"), an overall "success" flag, and "registry_available".
        Never raises: any top-level failure is returned as {"error": ...} JSON.
    """
    try:
        if not work_dir:
            work_dir = os.getcwd()
        os.makedirs(work_dir, exist_ok=True)

        # Local imports — presumably to avoid circular imports at module load; keep them here.
        from .file_tools import download_image
        from .azure_blob_tools import upload_file_to_azure_blob

        results: List[Dict[str, Any]] = []
        fallback_stats = {"api_success": 0, "api_failed": 0, "web_search_used": 0}

        # Load entity API registry (maps lowercase entity_type -> API config dict).
        registry_path = os.path.join(os.path.dirname(__file__), "entity_api_registry.json")
        api_config = None

        if not force_web_search and os.path.exists(registry_path):
            try:
                with open(registry_path, 'r') as f:
                    registry = json.load(f)
                api_config = registry.get(entity_type.lower())
                if api_config and not api_config.get("enabled", True):
                    logging.info(f"[fetch_entity_images] API config for '{entity_type}' is disabled, using web search")
                    api_config = None
            except Exception as e:
                # Registry problems are non-fatal: we simply fall back to web search.
                logging.warning(f"[fetch_entity_images] Failed to load registry: {e}")

        # Process each entity independently; one entity's failure never aborts the batch.
        for entity_idx, entity_name in enumerate(entities):
            entity_result = {
                "entity": entity_name,
                "entity_type": entity_type,
                "images": [],
                "method": None,
                "error": None
            }

            # === TIER 1: Try API Registry ===
            if api_config and not force_web_search:
                try:
                    logging.info(f"[fetch_entity_images] Trying API for {entity_name} ({api_config.get('name')})")

                    # Fail fast when the API requires credentials that are not configured.
                    if api_config.get("requires_env"):
                        missing_vars = [v for v in api_config["requires_env"] if not os.getenv(v)]
                        if missing_vars:
                            raise Exception(f"Missing required env vars: {missing_vars}")

                    # Normalize the entity name per the registry's transform rule.
                    transform = api_config.get("entity_transform", "none")
                    transformed_entity = entity_name
                    if transform == "lowercase":
                        transformed_entity = entity_name.lower()
                    elif transform == "uppercase":
                        transformed_entity = entity_name.upper()
                    elif transform == "slug":
                        transformed_entity = entity_name.lower().replace(" ", "-")

                    # Build URL: substitute {UPPER_CASE} env-var placeholders first, then {entity}.
                    url_pattern = api_config["url_pattern"]
                    for env_var in re.findall(r'\{([A-Z_]+)\}', url_pattern):
                        if env_var != "entity":
                            url_pattern = url_pattern.replace(f"{{{env_var}}}", os.getenv(env_var, ""))
                    url = url_pattern.replace("{entity}", transformed_entity)

                    # Fetch API data (15s timeout; HTTP errors raise into the Tier-1 handler).
                    headers = {"User-Agent": USER_AGENT}
                    response = requests.get(url, headers=headers, timeout=15)
                    response.raise_for_status()
                    api_data = response.json()

                    # Extract image URLs via the configured JSON paths (dot/bracket notation).
                    image_fields = api_config.get("image_fields", [])
                    image_urls = []
                    for field_path in image_fields:
                        img_url = _extract_json_path(api_data, field_path)
                        if img_url and isinstance(img_url, str) and img_url.startswith("http"):
                            image_urls.append(img_url)
                            if len(image_urls) >= count_per_entity:
                                break

                    if not image_urls:
                        raise Exception(f"No valid image URLs found in API response (tried paths: {image_fields})")

                    # Download images from the API; per-image failures are logged, not fatal.
                    for img_idx, img_url in enumerate(image_urls[:count_per_entity]):
                        try:
                            # Pick a file extension from the URL; default to png.
                            ext = "png"
                            if img_url.lower().endswith((".jpg", ".jpeg")):
                                ext = "jpg"
                            elif img_url.lower().endswith(".svg"):
                                ext = "svg"

                            safe_name = re.sub(r"[^a-zA-Z0-9_-]+", "_", entity_name)[:50]
                            filename = f"{entity_type}_{safe_name}_{entity_idx+1}_{img_idx+1}.{ext}"

                            dl_json = await download_image(img_url, filename, work_dir)
                            dl = json.loads(dl_json)

                            if dl.get("status") == "success":
                                file_path = dl.get("file_path")

                                # Best-effort Azure upload when storage is configured;
                                # failure keeps the local copy and moves on.
                                azure_conn = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
                                uploaded_url = None
                                if azure_conn:
                                    try:
                                        up_json = upload_file_to_azure_blob(file_path)
                                        up = json.loads(up_json)
                                        uploaded_url = up.get("download_url")
                                    except Exception:
                                        pass

                                entity_result["images"].append({
                                    "source_url": img_url,
                                    "local_path": file_path,
                                    "uploaded_url": uploaded_url
                                })
                        except Exception as img_err:
                            logging.warning(f"[fetch_entity_images] Image download failed for {entity_name}: {img_err}")

                    if entity_result["images"]:
                        entity_result["method"] = f"api:{api_config.get('name')}"
                        fallback_stats["api_success"] += 1
                        logging.info(f"[fetch_entity_images] ✓ API success for {entity_name}: {len(entity_result['images'])} images")
                    else:
                        raise Exception("All image downloads failed")

                except Exception as api_error:
                    logging.warning(f"[fetch_entity_images] API failed for {entity_name}: {api_error}")
                    fallback_stats["api_failed"] += 1
                    entity_result["error"] = str(api_error)
                    # Will fall through to web search below

            # === TIER 2: Fallback to Web Search ===
            if not entity_result["images"]:
                try:
                    logging.info(f"[fetch_entity_images] Falling back to web search for {entity_name}")

                    # Use fallback search query if configured, otherwise construct generic query
                    if api_config and api_config.get("fallback_search_query"):
                        search_query = api_config["fallback_search_query"].replace("{entity}", entity_name)
                    else:
                        search_query = f"{entity_name} {entity_type} official image"

                    # BUGFIX: "".split()[0] raised IndexError for blank/whitespace-only
                    # entity names; require the first word only when one exists.
                    name_words = entity_name.split()

                    # Use existing collect_task_images function
                    web_result_json = await collect_task_images(
                        query=search_query,
                        count=count_per_entity,
                        verify_download=True,
                        work_dir=work_dir,
                        required_terms=[name_words[0]] if name_words else [],
                        strict_entity=False
                    )

                    web_result = json.loads(web_result_json)
                    accepted = web_result.get("accepted", [])

                    if accepted:
                        # Map web search results to our format
                        for img in accepted[:count_per_entity]:
                            entity_result["images"].append({
                                "source_url": img.get("source_page"),
                                "local_path": img.get("local_path"),
                                "uploaded_url": img.get("uploaded_url")
                            })

                        entity_result["method"] = "web_search_fallback"
                        # BUGFIX: clear any stale Tier-1 API error — this entity
                        # ultimately succeeded, so reporting an error is misleading.
                        entity_result["error"] = None
                        fallback_stats["web_search_used"] += 1
                        logging.info(f"[fetch_entity_images] ✓ Web search success for {entity_name}: {len(entity_result['images'])} images")
                    else:
                        entity_result["error"] = "Web search found no suitable images"
                        logging.warning(f"[fetch_entity_images] Web search found no images for {entity_name}")

                except Exception as web_error:
                    entity_result["error"] = f"Web search failed: {str(web_error)}"
                    logging.error(f"[fetch_entity_images] Web search failed for {entity_name}: {web_error}")

            results.append(entity_result)

        # Count total images from all entities
        total_images_found = sum(len(entity_res.get("images", [])) for entity_res in results)
        fallback_stats["total_images"] = total_images_found

        # Calculate success - at least one image found
        success = total_images_found > 0 or len(entities) == 0  # Empty list is also "successful"

        return json.dumps({
            "success": success,
            "entity_type": entity_type,
            "total_entities": len(entities),
            "results": results,
            "stats": fallback_stats,
            "registry_available": api_config is not None
        }, indent=2)

    except Exception as exc:
        return json.dumps({"error": f"fetch_entity_images failed: {str(exc)}"}, indent=2)
|