@aj-archipelago/cortex 1.4.2 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/README.md +1 -0
  2. package/config.js +1 -1
  3. package/helper-apps/cortex-autogen2/.dockerignore +1 -0
  4. package/helper-apps/cortex-autogen2/Dockerfile +6 -10
  5. package/helper-apps/cortex-autogen2/Dockerfile.worker +2 -0
  6. package/helper-apps/cortex-autogen2/agents.py +203 -2
  7. package/helper-apps/cortex-autogen2/main.py +1 -1
  8. package/helper-apps/cortex-autogen2/pyproject.toml +12 -0
  9. package/helper-apps/cortex-autogen2/requirements.txt +14 -0
  10. package/helper-apps/cortex-autogen2/services/redis_publisher.py +1 -1
  11. package/helper-apps/cortex-autogen2/services/run_analyzer.py +1 -1
  12. package/helper-apps/cortex-autogen2/task_processor.py +431 -229
  13. package/helper-apps/cortex-autogen2/test_entity_fetcher.py +305 -0
  14. package/helper-apps/cortex-autogen2/tests/README.md +240 -0
  15. package/helper-apps/cortex-autogen2/tests/TEST_REPORT.md +342 -0
  16. package/helper-apps/cortex-autogen2/tests/__init__.py +8 -0
  17. package/helper-apps/cortex-autogen2/tests/analysis/__init__.py +1 -0
  18. package/helper-apps/cortex-autogen2/tests/analysis/improvement_suggester.py +224 -0
  19. package/helper-apps/cortex-autogen2/tests/analysis/trend_analyzer.py +211 -0
  20. package/helper-apps/cortex-autogen2/tests/cli/__init__.py +1 -0
  21. package/helper-apps/cortex-autogen2/tests/cli/run_tests.py +296 -0
  22. package/helper-apps/cortex-autogen2/tests/collectors/__init__.py +1 -0
  23. package/helper-apps/cortex-autogen2/tests/collectors/log_collector.py +252 -0
  24. package/helper-apps/cortex-autogen2/tests/collectors/progress_collector.py +182 -0
  25. package/helper-apps/cortex-autogen2/tests/conftest.py +15 -0
  26. package/helper-apps/cortex-autogen2/tests/database/__init__.py +1 -0
  27. package/helper-apps/cortex-autogen2/tests/database/repository.py +501 -0
  28. package/helper-apps/cortex-autogen2/tests/database/schema.sql +108 -0
  29. package/helper-apps/cortex-autogen2/tests/evaluators/__init__.py +1 -0
  30. package/helper-apps/cortex-autogen2/tests/evaluators/llm_scorer.py +294 -0
  31. package/helper-apps/cortex-autogen2/tests/evaluators/prompts.py +250 -0
  32. package/helper-apps/cortex-autogen2/tests/evaluators/wordcloud_validator.py +168 -0
  33. package/helper-apps/cortex-autogen2/tests/metrics/__init__.py +1 -0
  34. package/helper-apps/cortex-autogen2/tests/metrics/collector.py +155 -0
  35. package/helper-apps/cortex-autogen2/tests/orchestrator.py +576 -0
  36. package/helper-apps/cortex-autogen2/tests/test_cases.yaml +279 -0
  37. package/helper-apps/cortex-autogen2/tests/test_data.db +0 -0
  38. package/helper-apps/cortex-autogen2/tests/utils/__init__.py +3 -0
  39. package/helper-apps/cortex-autogen2/tests/utils/connectivity.py +112 -0
  40. package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +74 -24
  41. package/helper-apps/cortex-autogen2/tools/entity_api_registry.json +38 -0
  42. package/helper-apps/cortex-autogen2/tools/file_tools.py +1 -1
  43. package/helper-apps/cortex-autogen2/tools/search_tools.py +436 -238
  44. package/helper-apps/cortex-file-handler/package-lock.json +2 -2
  45. package/helper-apps/cortex-file-handler/package.json +1 -1
  46. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +4 -5
  47. package/helper-apps/cortex-file-handler/src/blobHandler.js +36 -144
  48. package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +5 -3
  49. package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +34 -1
  50. package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +22 -0
  51. package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +28 -1
  52. package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +29 -4
  53. package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +11 -0
  54. package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +1 -1
  55. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +3 -2
  56. package/helper-apps/cortex-file-handler/tests/checkHashShortLived.test.js +8 -1
  57. package/helper-apps/cortex-file-handler/tests/containerConversionFlow.test.js +5 -2
  58. package/helper-apps/cortex-file-handler/tests/containerNameParsing.test.js +14 -7
  59. package/helper-apps/cortex-file-handler/tests/containerParameterFlow.test.js +5 -2
  60. package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +31 -19
  61. package/package.json +1 -1
  62. package/server/modelExecutor.js +4 -0
  63. package/server/plugins/claude4VertexPlugin.js +540 -0
  64. package/server/plugins/openAiWhisperPlugin.js +43 -2
  65. package/tests/integration/rest/vendors/claude_streaming.test.js +121 -0
  66. package/tests/unit/plugins/claude4VertexPlugin.test.js +462 -0
  67. package/tests/unit/plugins/claude4VertexToolConversion.test.js +413 -0
  68. package/helper-apps/cortex-autogen/.funcignore +0 -8
  69. package/helper-apps/cortex-autogen/Dockerfile +0 -10
  70. package/helper-apps/cortex-autogen/OAI_CONFIG_LIST +0 -6
  71. package/helper-apps/cortex-autogen/agents.py +0 -493
  72. package/helper-apps/cortex-autogen/agents_extra.py +0 -14
  73. package/helper-apps/cortex-autogen/config.py +0 -18
  74. package/helper-apps/cortex-autogen/data_operations.py +0 -29
  75. package/helper-apps/cortex-autogen/function_app.py +0 -44
  76. package/helper-apps/cortex-autogen/host.json +0 -15
  77. package/helper-apps/cortex-autogen/main.py +0 -38
  78. package/helper-apps/cortex-autogen/prompts.py +0 -196
  79. package/helper-apps/cortex-autogen/prompts_extra.py +0 -5
  80. package/helper-apps/cortex-autogen/requirements.txt +0 -9
  81. package/helper-apps/cortex-autogen/search.py +0 -85
  82. package/helper-apps/cortex-autogen/test.sh +0 -40
  83. package/helper-apps/cortex-autogen/tools/sasfileuploader.py +0 -66
  84. package/helper-apps/cortex-autogen/utils.py +0 -88
  85. package/helper-apps/cortex-autogen2/DigiCertGlobalRootCA.crt.pem +0 -22
  86. package/helper-apps/cortex-autogen2/poetry.lock +0 -3652
@@ -1,9 +1,9 @@
1
1
  """
2
2
  Web search tools (keyless).
3
3
 
4
- Implements DuckDuckGo-based search without API keys:
5
- - web_search: web results via HTML endpoint
6
- - image_search: image results via i.js JSON (requires vqd token)
4
+ Implements Google CSE-based search for web and image results:
5
+ - web_search: web results via Google CSE
6
+ - image_search: image results via Google CSE
7
7
  - combined_search: combined web + image results
8
8
  """
9
9
 
@@ -15,6 +15,7 @@ from typing import Dict, Any, List, Optional
15
15
  import hashlib
16
16
  from PIL import Image
17
17
  import asyncio # Import asyncio
18
+ import aiohttp # Add async HTTP client
18
19
  import matplotlib.pyplot as plt
19
20
  import pandas as pd
20
21
  import re
@@ -180,44 +181,6 @@ def _extract_snippet_near(html: str, start_pos: int) -> Optional[str]:
180
181
  return text or None
181
182
 
182
183
 
183
- def _ddg_web(query: str, count: int = 25) -> List[Dict[str, Any]]:
184
- url = f"https://duckduckgo.com/html/?q={urllib.parse.quote_plus(query)}"
185
- headers = {"User-Agent": USER_AGENT}
186
- resp = requests.get(url, headers=headers, timeout=20)
187
- resp.raise_for_status()
188
- html = resp.text
189
-
190
- # Capture results: <a class="result__a" href="...">Title</a>
191
- links_iter = re.finditer(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>(.*?)</a>', html, flags=re.I | re.S)
192
- results: List[Dict[str, Any]] = []
193
- for match in links_iter:
194
- href = match.group(1)
195
- title_html = match.group(2)
196
- title_text = html_lib.unescape(re.sub('<[^<]+?>', '', title_html)).strip()
197
- if not title_text or not href:
198
- continue
199
- # Resolve DDG redirect links and protocol-relative URLs
200
- url_val = href
201
- if url_val.startswith("//"):
202
- url_val = "https:" + url_val
203
- try:
204
- parsed = urllib.parse.urlparse(url_val)
205
- if parsed.netloc.endswith("duckduckgo.com") and parsed.path.startswith("/l/"):
206
- qs = urllib.parse.parse_qs(parsed.query)
207
- uddg = qs.get("uddg", [None])[0]
208
- if uddg:
209
- url_val = urllib.parse.unquote(uddg)
210
- except Exception:
211
- pass
212
- snippet = _extract_snippet_near(html, match.end())
213
- results.append({
214
- "title": title_text,
215
- "url": url_val,
216
- "snippet": snippet,
217
- })
218
- if len(results) >= max(1, count):
219
- break
220
- return _normalize_web_results(results)
221
184
 
222
185
 
223
186
  def _enrich_web_results_with_meta(results: List[Dict[str, Any]], max_fetch: int = 3, timeout_s: int = 8) -> List[Dict[str, Any]]:
@@ -366,73 +329,6 @@ async def fetch_webpage(url: str, render: bool = False, timeout_s: int = 20, max
366
329
  return json.dumps({"error": f"Fetch failed: {str(exc)}"})
367
330
 
368
331
 
369
- # DuckDuckGo vqd token method removed - no API key needed, using HTML scraping only
370
-
371
-
372
- def _ddg_images_html(query: str, count: int = 25) -> List[Dict[str, Any]]:
373
- headers = {"User-Agent": USER_AGENT, "Referer": "https://duckduckgo.com/"}
374
- url = f"https://duckduckgo.com/?q={urllib.parse.quote_plus(query)}&ia=images&iar=images"
375
- try:
376
- resp = requests.get(url, headers=headers, timeout=20)
377
- resp.raise_for_status()
378
- html = resp.text
379
- items: List[Dict[str, Any]] = []
380
-
381
- # Method 1: Look for external-content proxied URLs
382
- for m in re.finditer(r'(?:src|data-src)="(https://external-content\.duckduckgo\.com/iu/\?u=[^"]+)"', html):
383
- proxy = html_lib.unescape(m.group(1))
384
- try:
385
- parsed = urllib.parse.urlparse(proxy)
386
- qs = urllib.parse.parse_qs(parsed.query)
387
- orig = qs.get('u', [None])[0]
388
- if not orig:
389
- continue
390
- orig = urllib.parse.unquote(orig)
391
- items.append({
392
- "title": None,
393
- "image": orig,
394
- "thumbnail": proxy,
395
- "width": None,
396
- "height": None,
397
- "source": None,
398
- })
399
- if len(items) >= count:
400
- break
401
- except Exception:
402
- continue
403
-
404
- # Method 2: Look for direct image URLs in the page
405
- if len(items) < count // 2:
406
- direct_patterns = [
407
- r'"(https://[^"]+\.(?:jpg|jpeg|png|webp|gif))"',
408
- r"'(https://[^']+\.(?:jpg|jpeg|png|webp|gif))'",
409
- ]
410
- for pattern in direct_patterns:
411
- for m in re.finditer(pattern, html, re.I):
412
- img_url = m.group(1)
413
- if "duckduckgo.com" not in img_url and img_url not in [i["image"] for i in items]:
414
- items.append({
415
- "title": None,
416
- "image": img_url,
417
- "thumbnail": img_url,
418
- "width": None,
419
- "height": None,
420
- "source": None,
421
- })
422
- if len(items) >= count:
423
- break
424
- if len(items) >= count:
425
- break
426
-
427
- logging.info(f"[_ddg_images_html] Found {len(items)} images for query: {query}")
428
- return _normalize_image_results(items)
429
- except Exception as e:
430
- logging.error(f"[_ddg_images_html] Failed for query '{query}': {e}")
431
- return []
432
-
433
-
434
- # DuckDuckGo JSON API method removed - no API key available, using HTML scraping only
435
-
436
332
 
437
333
  async def web_search(query: str, count: int = 25, enrich: bool = True) -> str:
438
334
  try:
@@ -449,8 +345,6 @@ async def web_search(query: str, count: int = 25, enrich: bool = True) -> str:
449
345
  used_google = False
450
346
  results = []
451
347
 
452
- if not results:
453
- results = _ddg_web(query, count)
454
348
 
455
349
  if enrich and results:
456
350
  # Enrich only for web-page items
@@ -549,15 +443,7 @@ async def image_search(query: str, count: int = 25, verify_download: bool = True
549
443
  logging.warning(f"[image_search] Google CSE failed: {e}")
550
444
  results = []
551
445
 
552
- if not results:
553
- # Fallback to DuckDuckGo HTML scraping if CSE disabled or empty
554
- try:
555
- logging.info(f"[image_search] Falling back to DuckDuckGo HTML scraping for query: {query}")
556
- results = _ddg_images_html(query, count)
557
- logging.info(f"[image_search] DuckDuckGo HTML returned {len(results)} results")
558
- except Exception as e:
559
- logging.error(f"[image_search] All methods failed for query '{query}': {e}")
560
- results = []
446
+
561
447
 
562
448
  # Post-filtering and ranking for relevance and quality
563
449
  def score(item: Dict[str, Any]) -> int:
@@ -725,13 +611,8 @@ async def combined_search(query: str, count: int = 25, enrich: bool = True) -> s
725
611
  except Exception:
726
612
  img_results = []
727
613
 
728
- if not web_results:
729
- web_results = _ddg_web(query, count)
730
- if enrich and web_results:
731
- web_results = _enrich_web_results_with_meta(web_results)
732
- if not img_results:
733
- img_results = _ddg_images_html(query, count)
734
-
614
+ if enrich and web_results:
615
+ web_results = _enrich_web_results_with_meta(web_results)
735
616
  combined.extend(web_results)
736
617
  combined.extend(img_results)
737
618
  if not combined:
@@ -840,136 +721,197 @@ async def collect_task_images(
840
721
  accepted: List[Dict[str, Any]] = []
841
722
  skipped: List[Dict[str, Any]] = []
842
723
 
843
- session = None
844
- if verify_download:
845
- try:
846
- import requests
847
- session = requests.Session()
848
- session.headers.update({"User-Agent": USER_AGENT})
849
- except Exception:
850
- session = None
851
-
852
- def is_image_ok(url: str) -> bool:
853
- if not verify_download or not session:
724
+ # Async verification using aiohttp for parallel checks
725
+ async def is_image_ok_async(url: str, session: aiohttp.ClientSession) -> bool:
726
+ """Async version of image verification for parallel execution."""
727
+ if not verify_download:
854
728
  return True
855
729
  try:
856
- r = session.get(url, stream=True, timeout=15, allow_redirects=True)
857
- ct = (r.headers.get("content-type") or "").lower()
858
- if r.status_code == 200 and (ct.startswith("image/") or next(r.iter_content(1024), b"")):
859
- return True
730
+ # Increased timeout from 5s to 15s - many image CDNs are slow (Reddit, eBay, etc.)
731
+ async with session.head(url, timeout=aiohttp.ClientTimeout(total=15), allow_redirects=True) as response:
732
+ if response.status != 200:
733
+ return False
734
+ ct = (response.headers.get("content-type") or "").lower()
735
+ if ct.startswith("image/"):
736
+ return True
737
+ # If HEAD doesn't give content-type, try GET with small range
738
+ async with session.get(url, timeout=aiohttp.ClientTimeout(total=15), allow_redirects=True) as get_resp:
739
+ if get_resp.status == 200:
740
+ first_chunk = await get_resp.content.read(2048)
741
+ sigs = [b"\x89PNG\r\n\x1a\n", b"\xff\xd8\xff", b"GIF87a", b"GIF89a", b"RIFF"]
742
+ return any(first_chunk.startswith(sig) for sig in sigs)
743
+ except (asyncio.TimeoutError, aiohttp.ClientError):
744
+ return False
860
745
  except Exception:
861
746
  return False
862
747
  return False
863
748
 
864
- used = 0
865
- seen_hashes: set = set()
866
- for it in filtered:
867
- if used >= count:
868
- break
869
- # Prefer original_url if available
870
- img_url = it.get("original_url") or it.get("url")
871
- if not img_url:
872
- skipped.append({"reason": "missing_url", "item": it})
873
- continue
874
- if not is_image_ok(img_url):
875
- skipped.append({"reason": "verify_failed", "url": img_url})
876
- continue
877
-
878
- # Determine extension; if SVG, download as .svg then convert to PNG
879
- base = re.sub(r"[^a-zA-Z0-9_-]+", "_", (it.get("title") or "image").strip())[:80] or "image"
880
- url_lower = (img_url or "").lower()
881
- is_svg = url_lower.endswith(".svg") or ".svg" in url_lower
882
- filename = f"{base}_{used+1}.svg" if is_svg else f"{base}_{used+1}.jpg"
883
- dl_json = await download_image(img_url, filename, work_dir)
884
- dl = json.loads(dl_json)
885
- if dl.get("status") != "success":
886
- skipped.append({"reason": "download_error", "url": img_url, "detail": dl})
887
- continue
888
-
889
- file_path = dl.get("file_path")
890
- # If SVG, convert to PNG for PIL compatibility
891
- if is_svg and file_path and os.path.exists(file_path):
892
- try:
893
- import cairosvg # type: ignore
894
- png_path = os.path.splitext(file_path)[0] + ".png"
895
- cairosvg.svg2png(url=file_path, write_to=png_path)
896
- try:
897
- os.remove(file_path)
898
- except Exception:
899
- pass
900
- file_path = png_path
901
- except Exception as e:
902
- skipped.append({"reason": "svg_convert_failed", "url": img_url, "error": str(e)})
903
- try:
904
- os.remove(file_path)
905
- except Exception:
906
- pass
907
- continue
908
- # Optional dimension filter
749
+ # Parallel download/verification helper
750
+ async def process_single_image(it: Dict[str, Any], idx: int, aio_session: aiohttp.ClientSession) -> Optional[Dict[str, Any]]:
751
+ """Download, verify, and process a single image. Returns accepted dict or None if skipped."""
909
752
  try:
910
- if (min_width or min_height) and file_path and os.path.exists(file_path):
911
- with Image.open(file_path) as im:
912
- w, h = im.size
913
- if (min_width and w < min_width) or (min_height and h < min_height):
914
- skipped.append({"reason": "too_small", "url": img_url, "width": w, "height": h})
915
- try:
916
- os.remove(file_path)
917
- except Exception:
918
- pass
919
- continue
920
- except Exception:
921
- pass
753
+ img_url = it.get("original_url") or it.get("url")
754
+ if not img_url:
755
+ skipped.append({"reason": "missing_url", "item": it})
756
+ return None
757
+ if not await is_image_ok_async(img_url, aio_session):
758
+ skipped.append({"reason": "verify_failed", "url": img_url})
759
+ return None
760
+
761
+ # Determine extension; if SVG, download as .svg then convert to PNG
762
+ base = re.sub(r"[^a-zA-Z0-9_-]+", "_", (it.get("title") or "image").strip())[:80] or "image"
763
+ url_lower = (img_url or "").lower()
764
+ is_svg = url_lower.endswith(".svg") or ".svg" in url_lower
765
+ filename = f"{base}_{idx+1}.svg" if is_svg else f"{base}_{idx+1}.jpg"
766
+ dl_json = await download_image(img_url, filename, work_dir)
767
+ dl = json.loads(dl_json)
768
+ if dl.get("status") != "success":
769
+ skipped.append({"reason": "download_error", "url": img_url, "detail": dl})
770
+ return None
922
771
 
923
- # Optional content deduplication by hash
924
- try:
925
- if dedup_content and file_path and os.path.exists(file_path):
926
- hasher = hashlib.sha256()
927
- with open(file_path, "rb") as fh:
928
- for chunk in iter(lambda: fh.read(1024 * 1024), b""):
929
- hasher.update(chunk)
930
- digest = hasher.hexdigest()
931
- if digest in seen_hashes:
932
- skipped.append({"reason": "content_duplicate", "url": img_url})
772
+ file_path = dl.get("file_path")
773
+ # If SVG, convert to PNG for PIL compatibility
774
+ if is_svg and file_path and os.path.exists(file_path):
775
+ try:
776
+ import cairosvg # type: ignore
777
+ png_path = os.path.splitext(file_path)[0] + ".png"
778
+ cairosvg.svg2png(url=file_path, write_to=png_path)
933
779
  try:
934
780
  os.remove(file_path)
935
781
  except Exception:
936
782
  pass
937
- continue
938
- seen_hashes.add(digest)
939
- except Exception:
940
- pass
941
- # Upload if configured, else mark as local only
942
- azure_conn = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
943
- if azure_conn:
944
- up_json = upload_file_to_azure_blob(file_path)
945
- up = json.loads(up_json)
946
- if "download_url" in up:
947
- accepted.append({
783
+ file_path = png_path
784
+ except Exception as e:
785
+ skipped.append({"reason": "svg_convert_failed", "url": img_url, "error": str(e)})
786
+ try:
787
+ os.remove(file_path)
788
+ except Exception:
789
+ pass
790
+ return None
791
+
792
+ # Optional dimension filter
793
+ try:
794
+ if (min_width or min_height) and file_path and os.path.exists(file_path):
795
+ with Image.open(file_path) as im:
796
+ w, h = im.size
797
+ if (min_width and w < min_width) or (min_height and h < min_height):
798
+ skipped.append({"reason": "too_small", "url": img_url, "width": w, "height": h})
799
+ try:
800
+ os.remove(file_path)
801
+ except Exception:
802
+ pass
803
+ return None
804
+ except Exception:
805
+ pass
806
+
807
+ # Compute hash for deduplication (will filter later)
808
+ content_hash = None
809
+ try:
810
+ if dedup_content and file_path and os.path.exists(file_path):
811
+ hasher = hashlib.sha256()
812
+ with open(file_path, "rb") as fh:
813
+ for chunk in iter(lambda: fh.read(1024 * 1024), b""):
814
+ hasher.update(chunk)
815
+ content_hash = hasher.hexdigest()
816
+ except Exception:
817
+ pass
818
+
819
+ # Upload if configured, else mark as local only
820
+ azure_conn = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
821
+ if azure_conn:
822
+ up_json = upload_file_to_azure_blob(file_path)
823
+ up = json.loads(up_json)
824
+ if "download_url" in up:
825
+ return {
826
+ "title": it.get("title"),
827
+ "source_page": it.get("host_page_url"),
828
+ "uploaded_url": up["download_url"],
829
+ "local_path": file_path,
830
+ "width": it.get("width"),
831
+ "height": it.get("height"),
832
+ "source_host": it.get("_host"),
833
+ "_content_hash": content_hash,
834
+ }
835
+ else:
836
+ skipped.append({"reason": "upload_error", "file_path": file_path, "detail": up})
837
+ return None
838
+ else:
839
+ return {
948
840
  "title": it.get("title"),
949
841
  "source_page": it.get("host_page_url"),
950
- "uploaded_url": up["download_url"],
842
+ "uploaded_url": None,
951
843
  "local_path": file_path,
952
844
  "width": it.get("width"),
953
845
  "height": it.get("height"),
954
846
  "source_host": it.get("_host"),
955
- })
956
- used += 1
957
- continue
958
- else:
959
- skipped.append({"reason": "upload_error", "file_path": file_path, "detail": up})
847
+ "note": "AZURE_STORAGE_CONNECTION_STRING not set; upload skipped",
848
+ "_content_hash": content_hash,
849
+ }
850
+ except Exception as e:
851
+ skipped.append({"reason": "processing_error", "error": str(e)})
852
+ return None
853
+
854
+ # Process images in parallel batches with connection pooling
855
+ BATCH_SIZE = 15 # Balanced for connection pool (limit_per_host=10)
856
+ all_results: List[Optional[Dict[str, Any]]] = []
857
+
858
+ # Create aiohttp session with connection pooling for performance
859
+ connector = aiohttp.TCPConnector(limit=30, limit_per_host=10)
860
+ timeout = aiohttp.ClientTimeout(total=30, connect=10)
861
+
862
+ async with aiohttp.ClientSession(
863
+ connector=connector,
864
+ timeout=timeout,
865
+ headers={"User-Agent": USER_AGENT, "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8"}
866
+ ) as aio_session:
867
+ for batch_start in range(0, min(len(filtered), count * 3), BATCH_SIZE): # Over-fetch to handle failures
868
+ batch = filtered[batch_start:batch_start + BATCH_SIZE]
869
+ # Early exit: stop processing new batches once we have enough
870
+ if len(accepted) >= count:
871
+ logging.info(f"[collect_task_images] Early exit: {len(accepted)} images collected (target: {count})")
872
+ break
873
+
874
+ # Process batch in parallel with shared session
875
+ tasks = [process_single_image(it, batch_start + i, aio_session) for i, it in enumerate(batch)]
876
+ batch_results = await asyncio.gather(*tasks, return_exceptions=True)
877
+
878
+ # Filter out exceptions and add successful results
879
+ for result in batch_results:
880
+ if not isinstance(result, Exception) and result is not None:
881
+ all_results.append(result)
882
+
883
+ # Post-process: deduplicate by hash and limit to count
884
+ seen_hashes: set = set()
885
+ for result in all_results:
886
+ if result is None:
887
+ continue
888
+ # Early exit once we have enough images
889
+ if len(accepted) >= count:
890
+ # Clean up any extra downloaded files beyond what we need
891
+ try:
892
+ if result.get("local_path") and os.path.exists(result["local_path"]):
893
+ os.remove(result["local_path"])
894
+ except Exception:
895
+ pass
896
+ break
897
+
898
+ # Deduplication check
899
+ if dedup_content and result.get("_content_hash"):
900
+ if result["_content_hash"] in seen_hashes:
901
+ skipped.append({"reason": "content_duplicate", "url": result.get("source_page")})
902
+ # Clean up duplicate file
903
+ try:
904
+ if result.get("local_path") and os.path.exists(result["local_path"]):
905
+ os.remove(result["local_path"])
906
+ except Exception:
907
+ pass
960
908
  continue
961
- else:
962
- accepted.append({
963
- "title": it.get("title"),
964
- "source_page": it.get("host_page_url"),
965
- "uploaded_url": None,
966
- "local_path": file_path,
967
- "width": it.get("width"),
968
- "height": it.get("height"),
969
- "source_host": it.get("_host"),
970
- "note": "AZURE_STORAGE_CONNECTION_STRING not set; upload skipped"
971
- })
972
- used += 1
909
+ seen_hashes.add(result["_content_hash"])
910
+
911
+ # Remove internal hash field before adding to accepted
912
+ if "_content_hash" in result:
913
+ del result["_content_hash"]
914
+ accepted.append(result)
973
915
 
974
916
  # No synthesis: if no accepted items, return zero results as-is
975
917
 
@@ -1200,4 +1142,260 @@ async def azure_cognitive_search(
1200
1142
  tasks.append(_perform_single_cognitive_search(**q_params))
1201
1143
 
1202
1144
  results = await asyncio.gather(*tasks)
1203
- return json.dumps(results, indent=2)
1145
+ return json.dumps(results, indent=2)
1146
+
1147
+
1148
+ def _extract_json_path(data: Any, path: str) -> Optional[Any]:
1149
+ """
1150
+ Extract nested value from JSON using dot notation path.
1151
+ Supports array indices like '[0]' and nested paths like 'sprites.other.official-artwork.front_default'.
1152
+ Returns None if path is invalid or key doesn't exist.
1153
+
1154
+ Examples:
1155
+ >>> _extract_json_path({'a': {'b': [1, 2]}}, 'a.b.[0]')
1156
+ 1
1157
+ >>> _extract_json_path({'x': {'y': {'z': 42}}}, 'x.y.z')
1158
+ 42
1159
+ """
1160
+ try:
1161
+ current = data
1162
+ parts = path.replace('[', '.[').split('.')
1163
+ for part in parts:
1164
+ if not part:
1165
+ continue
1166
+ if part.startswith('[') and part.endswith(']'):
1167
+ # Array index
1168
+ idx = int(part[1:-1])
1169
+ current = current[idx]
1170
+ else:
1171
+ # Dictionary key
1172
+ current = current[part]
1173
+ return current
1174
+ except (KeyError, IndexError, TypeError, ValueError):
1175
+ return None
1176
+
1177
+
1178
def _entity_api_url(api_config: Dict[str, Any], entity_name: str) -> str:
    """Build the request URL for one entity from a registry entry.

    Applies the entry's ``entity_transform`` (lowercase / uppercase / slug),
    fills upper-case ``{ENV_VAR}`` placeholders from the environment (missing
    variables become an empty string) and substitutes ``{entity}`` with the
    percent-encoded entity name.

    Raises:
        KeyError: if the registry entry has no ``url_pattern``.
    """
    from urllib.parse import quote

    transform = api_config.get("entity_transform", "none")
    if transform == "lowercase":
        transformed_entity = entity_name.lower()
    elif transform == "uppercase":
        transformed_entity = entity_name.upper()
    elif transform == "slug":
        transformed_entity = entity_name.lower().replace(" ", "-")
    else:
        transformed_entity = entity_name

    url_pattern = api_config["url_pattern"]
    # The regex only matches upper-case names, so the lower-case {entity}
    # placeholder is never mistaken for an environment variable.
    for env_var in set(re.findall(r'\{([A-Z_]+)\}', url_pattern)):
        url_pattern = url_pattern.replace(f"{{{env_var}}}", os.getenv(env_var, ""))

    # Entity names come from the caller (often an LLM); encode them so reserved
    # characters such as '/', '?' or '#' cannot alter the request target.
    return url_pattern.replace("{entity}", quote(transformed_entity, safe=""))


def _entity_image_ext(img_url: str) -> str:
    """Pick the file extension ('jpg', 'svg' or the default 'png') for an image URL.

    Only the URL path is inspected, so a query string or fragment
    ('.../a.jpg?size=large') does not hide the real extension.
    """
    from urllib.parse import urlparse

    try:
        url_path = urlparse(img_url).path.lower()
    except ValueError:
        # Malformed URL: fall back to looking at the raw string.
        url_path = img_url.lower()
    if url_path.endswith((".jpg", ".jpeg")):
        return "jpg"
    if url_path.endswith(".svg"):
        return "svg"
    return "png"


async def fetch_entity_images(
    entities: List[str],
    entity_type: str = "pokemon",
    count_per_entity: int = 1,
    work_dir: Optional[str] = None,
    force_web_search: bool = False
) -> str:
    """
    Multi-tier entity image fetcher with automatic fallback:
    1. Try the entity API registry (entity_api_registry.json next to this module)
    2. Fall back to web search via collect_task_images

    Args:
        entities: Entity names to fetch images for (e.g., ["Gengar", "Mewtwo"]).
            A single string is treated as a one-element list.
        entity_type: Registry key / search hint (e.g., "pokemon", "country", "movie").
        count_per_entity: Number of images to fetch per entity (default: 1).
        work_dir: Directory to save downloaded images (defaults to the current
            working directory; created if missing).
        force_web_search: Skip the API registry and go straight to web search.

    Returns:
        JSON string with "success", "entity_type", "total_entities", "results"
        (one entry per entity: "entity", "entity_type", "images", "method",
        "error"), "stats" and "registry_available". An entry's "error" is None
        whenever images were obtained, including via the web-search fallback.
        Any unexpected failure is returned as {"error": "..."} rather than raised.
    """
    try:
        if not work_dir:
            work_dir = os.getcwd()
        os.makedirs(work_dir, exist_ok=True)

        from .file_tools import download_image
        from .azure_blob_tools import upload_file_to_azure_blob

        # A bare string would otherwise be iterated character by character.
        if isinstance(entities, str):
            entities = [entities]

        results: List[Dict[str, Any]] = []
        fallback_stats = {"api_success": 0, "api_failed": 0, "web_search_used": 0}

        # Load entity API registry
        registry_path = os.path.join(os.path.dirname(__file__), "entity_api_registry.json")
        api_config = None

        if not force_web_search and os.path.exists(registry_path):
            try:
                with open(registry_path, 'r', encoding='utf-8') as f:
                    registry = json.load(f)
                api_config = registry.get(entity_type.lower())
                if api_config and not api_config.get("enabled", True):
                    logging.info(f"[fetch_entity_images] API config for '{entity_type}' is disabled, using web search")
                    api_config = None
            except Exception as e:
                logging.warning(f"[fetch_entity_images] Failed to load registry: {e}")
                # Never carry a half-validated entry into the per-entity loop.
                api_config = None

        # Process each entity
        for entity_idx, raw_entity in enumerate(entities):
            # Tolerate non-string identifiers (e.g. numeric ids) from tool callers.
            entity_name = raw_entity if isinstance(raw_entity, str) else str(raw_entity)
            entity_result: Dict[str, Any] = {
                "entity": entity_name,
                "entity_type": entity_type,
                "images": [],
                "method": None,
                "error": None
            }

            # A blank name cannot be looked up or searched; report it directly
            # instead of failing later on entity_name.split()[0].
            if not entity_name.strip():
                entity_result["error"] = "Entity name is empty"
                results.append(entity_result)
                continue

            # === TIER 1: Try API Registry ===
            if api_config and not force_web_search:
                try:
                    logging.info(f"[fetch_entity_images] Trying API for {entity_name} ({api_config.get('name')})")

                    # Check required env vars
                    if api_config.get("requires_env"):
                        missing_vars = [v for v in api_config["requires_env"] if not os.getenv(v)]
                        if missing_vars:
                            raise Exception(f"Missing required env vars: {missing_vars}")

                    url = _entity_api_url(api_config, entity_name)

                    # requests is blocking; run it in the default executor so the
                    # event loop keeps serving other coroutines meanwhile.
                    headers = {"User-Agent": USER_AGENT}
                    loop = asyncio.get_running_loop()
                    response = await loop.run_in_executor(
                        None,
                        lambda u=url, h=headers: requests.get(u, headers=h, timeout=15),
                    )
                    response.raise_for_status()
                    api_data = response.json()

                    # Extract image URLs using configured paths
                    image_fields = api_config.get("image_fields", [])
                    image_urls = []
                    for field_path in image_fields:
                        img_url = _extract_json_path(api_data, field_path)
                        if img_url and isinstance(img_url, str) and img_url.startswith("http"):
                            image_urls.append(img_url)
                            if len(image_urls) >= count_per_entity:
                                break

                    if not image_urls:
                        raise Exception(f"No valid image URLs found in API response (tried paths: {image_fields})")

                    # Download images from API
                    for img_idx, img_url in enumerate(image_urls[:count_per_entity]):
                        try:
                            ext = _entity_image_ext(img_url)
                            safe_name = re.sub(r"[^a-zA-Z0-9_-]+", "_", entity_name)[:50]
                            filename = f"{entity_type}_{safe_name}_{entity_idx+1}_{img_idx+1}.{ext}"

                            dl_json = await download_image(img_url, filename, work_dir)
                            dl = json.loads(dl_json)

                            if dl.get("status") == "success":
                                file_path = dl.get("file_path")

                                # Upload if Azure configured; a failed upload still
                                # leaves a usable local file, so it is only logged.
                                azure_conn = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
                                uploaded_url = None
                                if azure_conn:
                                    try:
                                        up_json = upload_file_to_azure_blob(file_path)
                                        up = json.loads(up_json)
                                        uploaded_url = up.get("download_url")
                                    except Exception as up_err:
                                        logging.warning(f"[fetch_entity_images] Upload failed for {file_path}: {up_err}")

                                entity_result["images"].append({
                                    "source_url": img_url,
                                    "local_path": file_path,
                                    "uploaded_url": uploaded_url
                                })
                        except Exception as img_err:
                            logging.warning(f"[fetch_entity_images] Image download failed for {entity_name}: {img_err}")

                    if entity_result["images"]:
                        entity_result["method"] = f"api:{api_config.get('name')}"
                        fallback_stats["api_success"] += 1
                        logging.info(f"[fetch_entity_images] ✓ API success for {entity_name}: {len(entity_result['images'])} images")
                    else:
                        raise Exception("All image downloads failed")

                except Exception as api_error:
                    logging.warning(f"[fetch_entity_images] API failed for {entity_name}: {api_error}")
                    fallback_stats["api_failed"] += 1
                    entity_result["error"] = str(api_error)
                    # Will fall through to web search below

            # === TIER 2: Fallback to Web Search ===
            if not entity_result["images"]:
                try:
                    logging.info(f"[fetch_entity_images] Falling back to web search for {entity_name}")

                    # Use fallback search query if configured, otherwise construct generic query
                    if api_config and api_config.get("fallback_search_query"):
                        search_query = api_config["fallback_search_query"].replace("{entity}", entity_name)
                    else:
                        search_query = f"{entity_name} {entity_type} official image"

                    # Use existing collect_task_images function
                    web_result_json = await collect_task_images(
                        query=search_query,
                        count=count_per_entity,
                        verify_download=True,
                        work_dir=work_dir,
                        required_terms=[entity_name.split()[0]],  # At least first word of entity name must match
                        strict_entity=False
                    )

                    web_result = json.loads(web_result_json)
                    accepted = web_result.get("accepted", [])

                    if accepted:
                        # Map web search results to our format
                        for img in accepted[:count_per_entity]:
                            entity_result["images"].append({
                                "source_url": img.get("source_page"),
                                "local_path": img.get("local_path"),
                                "uploaded_url": img.get("uploaded_url")
                            })

                        entity_result["method"] = "web_search_fallback"
                        # The fallback recovered, so drop the earlier API error;
                        # it remains visible in the log and in stats["api_failed"].
                        entity_result["error"] = None
                        fallback_stats["web_search_used"] += 1
                        logging.info(f"[fetch_entity_images] ✓ Web search success for {entity_name}: {len(entity_result['images'])} images")
                    else:
                        entity_result["error"] = "Web search found no suitable images"
                        logging.warning(f"[fetch_entity_images] Web search found no images for {entity_name}")

                except Exception as web_error:
                    entity_result["error"] = f"Web search failed: {str(web_error)}"
                    logging.error(f"[fetch_entity_images] Web search failed for {entity_name}: {web_error}")

            results.append(entity_result)

        # Count total images from all entities
        total_images_found = sum(len(entity_res.get("images", [])) for entity_res in results)
        fallback_stats["total_images"] = total_images_found

        # Calculate success - at least one image found
        success = total_images_found > 0 or len(entities) == 0  # Empty list is also "successful"

        return json.dumps({
            "success": success,
            "entity_type": entity_type,
            "total_entities": len(entities),
            "results": results,
            "stats": fallback_stats,
            "registry_available": api_config is not None
        }, indent=2)

    except Exception as exc:
        return json.dumps({"error": f"fetch_entity_images failed: {str(exc)}"}, indent=2)