@aj-archipelago/cortex 1.3.65 → 1.3.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. package/helper-apps/cortex-autogen2/Dockerfile +88 -21
  2. package/helper-apps/cortex-autogen2/docker-compose.yml +15 -8
  3. package/helper-apps/cortex-autogen2/host.json +5 -0
  4. package/helper-apps/cortex-autogen2/pyproject.toml +82 -25
  5. package/helper-apps/cortex-autogen2/requirements.txt +84 -14
  6. package/helper-apps/cortex-autogen2/services/redis_publisher.py +129 -3
  7. package/helper-apps/cortex-autogen2/task_processor.py +432 -116
  8. package/helper-apps/cortex-autogen2/tools/__init__.py +2 -0
  9. package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +32 -0
  10. package/helper-apps/cortex-autogen2/tools/azure_foundry_agents.py +50 -14
  11. package/helper-apps/cortex-autogen2/tools/file_tools.py +169 -44
  12. package/helper-apps/cortex-autogen2/tools/google_cse.py +117 -0
  13. package/helper-apps/cortex-autogen2/tools/search_tools.py +655 -98
  14. package/lib/entityConstants.js +1 -1
  15. package/lib/pathwayManager.js +42 -8
  16. package/lib/pathwayTools.js +3 -3
  17. package/lib/util.js +58 -2
  18. package/package.json +1 -1
  19. package/pathways/system/entity/memory/sys_memory_format.js +1 -0
  20. package/pathways/system/entity/memory/sys_memory_manager.js +3 -3
  21. package/pathways/system/entity/sys_entity_start.js +1 -1
  22. package/pathways/system/entity/tools/sys_tool_bing_search_afagent.js +2 -0
  23. package/pathways/system/entity/tools/sys_tool_codingagent.js +2 -2
  24. package/pathways/system/entity/tools/sys_tool_google_search.js +3 -3
  25. package/pathways/system/entity/tools/sys_tool_grok_x_search.js +12 -2
  26. package/pathways/system/workspaces/run_workspace_prompt.js +0 -3
  27. package/server/executeWorkspace.js +381 -0
  28. package/server/graphql.js +5 -180
  29. package/server/pathwayResolver.js +3 -3
  30. package/server/plugins/apptekTranslatePlugin.js +2 -2
  31. package/server/plugins/azureFoundryAgentsPlugin.js +1 -1
  32. package/tests/unit/core/parser.test.js +0 -1
  33. package/tests/unit/core/pathwayManagerWithFiles.test.js +256 -0
  34. package/tests/unit/graphql_executeWorkspace_transformation.test.js +244 -0
  35. package/tests/unit/server/graphql.test.js +122 -1
@@ -12,12 +12,16 @@ import os
12
12
  import requests
13
13
  import json
14
14
  from typing import Dict, Any, List, Optional
15
+ import hashlib
16
+ from PIL import Image
15
17
  import asyncio # Import asyncio
16
18
  import matplotlib.pyplot as plt
17
19
  import pandas as pd
18
20
  import re
19
21
  import urllib.parse
20
22
  import html as html_lib
23
+ from .google_cse import google_cse_search
24
+ from urllib.parse import urljoin, urlparse
21
25
 
22
26
  # try:
23
27
  # except ImportError:
@@ -58,16 +62,106 @@ def _normalize_image_results(items: List[Dict[str, Any]]) -> List[Dict[str, Any]
58
62
  url = item.get("image") or item.get("url") or item.get("thumbnail")
59
63
  if not url:
60
64
  continue
65
+ # For Wikimedia thumbnail URLs, add an "original_url" when derivable
66
+ original_url = None
67
+ try:
68
+ if isinstance(url, str) and "upload.wikimedia.org" in url and "/thumb/" in url:
69
+ parts = url.split("/thumb/")
70
+ if len(parts) == 2:
71
+ tail = parts[1]
72
+ segs = tail.split("/")
73
+ if len(segs) >= 3:
74
+ original_url = parts[0] + "/" + segs[0] + "/" + segs[1] + "/" + segs[2]
75
+ except Exception:
76
+ original_url = None
61
77
  normalized.append({
62
78
  "type": "image",
63
79
  "title": item.get("title"),
64
80
  "url": url,
81
+ "original_url": original_url,
65
82
  "thumbnail_url": item.get("thumbnail"),
66
83
  "width": item.get("width"),
67
84
  "height": item.get("height"),
68
85
  "host_page_url": item.get("source") or item.get("page") or item.get("referrer"),
69
86
  })
70
87
  return normalized
88
+ def _normalize_cse_web_results(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
89
+ """
90
+ Normalize Google CSE (web) response to our common web result shape.
91
+ """
92
+ items = (payload or {}).get("items") or []
93
+ normalized: List[Dict[str, Any]] = []
94
+ for it in items:
95
+ title = it.get("title")
96
+ url = it.get("link")
97
+ snippet = it.get("snippet") or (it.get("htmlSnippet") and re.sub('<[^<]+?>', '', it.get("htmlSnippet")))
98
+ if url and title:
99
+ normalized.append({
100
+ "type": "webpage",
101
+ "title": title,
102
+ "url": url,
103
+ "snippet": snippet,
104
+ })
105
+ return normalized
106
+
107
+
108
+ def _normalize_cse_image_results(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
109
+ """
110
+ Normalize Google CSE (image) response to our common image result shape.
111
+ Handles both standard items array and pagemap-based image results.
112
+ """
113
+ normalized: List[Dict[str, Any]] = []
114
+
115
+ # Method 1: Standard items array (searchType=image)
116
+ items = (payload or {}).get("items") or []
117
+ for it in items:
118
+ link = it.get("link") # direct image URL
119
+ image_obj = it.get("image") or {}
120
+ pagemap = it.get("pagemap") or {}
121
+ cse_image = (pagemap.get("cse_image") or [{}])[0] if "cse_image" in pagemap else {}
122
+
123
+ # Prefer direct link, fallback to pagemap images
124
+ img_url = link or image_obj.get("thumbnailLink") or cse_image.get("src")
125
+ if not img_url:
126
+ continue
127
+
128
+ normalized.append({
129
+ "type": "image",
130
+ "title": it.get("title") or cse_image.get("alt"),
131
+ "url": img_url,
132
+ "original_url": link or img_url,
133
+ "thumbnail_url": image_obj.get("thumbnailLink") or img_url,
134
+ "width": image_obj.get("width") or cse_image.get("width"),
135
+ "height": image_obj.get("height") or cse_image.get("height"),
136
+ "host_page_url": image_obj.get("contextLink") or it.get("link"),
137
+ })
138
+
139
+ # Method 2: Extract from pagemap in web results (fallback)
140
+ if not normalized:
141
+ for it in items:
142
+ pagemap = it.get("pagemap") or {}
143
+ cse_images = pagemap.get("cse_image") or []
144
+ for img in cse_images:
145
+ img_url = img.get("src")
146
+ if img_url:
147
+ normalized.append({
148
+ "type": "image",
149
+ "title": it.get("title"),
150
+ "url": img_url,
151
+ "original_url": img_url,
152
+ "thumbnail_url": img_url,
153
+ "width": img.get("width"),
154
+ "height": img.get("height"),
155
+ "host_page_url": it.get("link"),
156
+ })
157
+
158
+ logging.info(f"[_normalize_cse_image_results] Extracted {len(normalized)} images from CSE payload")
159
+ return normalized
160
+
161
+
162
+ def _has_google_cse_env() -> bool:
163
+ return bool(os.getenv("GOOGLE_CSE_KEY") and os.getenv("GOOGLE_CSE_CX"))
164
+
71
165
 
72
166
 
73
167
  def _extract_snippet_near(html: str, start_pos: int) -> Optional[str]:
@@ -272,129 +366,374 @@ async def fetch_webpage(url: str, render: bool = False, timeout_s: int = 20, max
272
366
  return json.dumps({"error": f"Fetch failed: {str(exc)}"})
273
367
 
274
368
 
275
- def _ddg_get_vqd(query: str) -> Optional[str]:
276
- headers = {"User-Agent": USER_AGENT, "Referer": "https://duckduckgo.com/"}
277
- url = f"https://duckduckgo.com/?q={urllib.parse.quote_plus(query)}&iax=images&ia=images"
278
- resp = requests.get(url, headers=headers, timeout=20)
279
- resp.raise_for_status()
280
- text = resp.text
281
- # Common patterns seen in the page scripts
282
- # Try multiple patterns; DDG frequently changes this
283
- m = re.search(r"vqd='([\w-]+)'", text)
284
- if not m:
285
- m = re.search(r'vqd="([\w-]+)"', text)
286
- if not m:
287
- m = re.search(r'vqd=([\w-]+)&', text)
288
- return m.group(1) if m else None
369
+ # DuckDuckGo vqd token method removed - no API key needed, using HTML scraping only
289
370
 
290
371
 
291
372
  def _ddg_images_html(query: str, count: int = 25) -> List[Dict[str, Any]]:
292
373
  headers = {"User-Agent": USER_AGENT, "Referer": "https://duckduckgo.com/"}
293
374
  url = f"https://duckduckgo.com/?q={urllib.parse.quote_plus(query)}&ia=images&iar=images"
294
- resp = requests.get(url, headers=headers, timeout=20)
295
- resp.raise_for_status()
296
- html = resp.text
297
- items: List[Dict[str, Any]] = []
298
- # Look for external-content proxied URLs; extract original via 'u' param
299
- for m in re.finditer(r'(?:src|data-src)="(https://external-content\.duckduckgo\.com/iu/\?u=[^"]+)"', html):
300
- proxy = html_lib.unescape(m.group(1))
301
- try:
302
- parsed = urllib.parse.urlparse(proxy)
303
- qs = urllib.parse.parse_qs(parsed.query)
304
- orig = qs.get('u', [None])[0]
305
- if not orig:
375
+ try:
376
+ resp = requests.get(url, headers=headers, timeout=20)
377
+ resp.raise_for_status()
378
+ html = resp.text
379
+ items: List[Dict[str, Any]] = []
380
+
381
+ # Method 1: Look for external-content proxied URLs
382
+ for m in re.finditer(r'(?:src|data-src)="(https://external-content\.duckduckgo\.com/iu/\?u=[^"]+)"', html):
383
+ proxy = html_lib.unescape(m.group(1))
384
+ try:
385
+ parsed = urllib.parse.urlparse(proxy)
386
+ qs = urllib.parse.parse_qs(parsed.query)
387
+ orig = qs.get('u', [None])[0]
388
+ if not orig:
389
+ continue
390
+ orig = urllib.parse.unquote(orig)
391
+ items.append({
392
+ "title": None,
393
+ "image": orig,
394
+ "thumbnail": proxy,
395
+ "width": None,
396
+ "height": None,
397
+ "source": None,
398
+ })
399
+ if len(items) >= count:
400
+ break
401
+ except Exception:
306
402
  continue
307
- orig = urllib.parse.unquote(orig)
308
- items.append({
309
- "title": None,
310
- "image": orig,
311
- "thumbnail": proxy,
312
- "width": None,
313
- "height": None,
314
- "source": None,
315
- })
316
- if len(items) >= count:
317
- break
318
- except Exception:
319
- continue
320
- return _normalize_image_results(items)
403
+
404
+ # Method 2: Look for direct image URLs in the page
405
+ if len(items) < count // 2:
406
+ direct_patterns = [
407
+ r'"(https://[^"]+\.(?:jpg|jpeg|png|webp|gif))"',
408
+ r"'(https://[^']+\.(?:jpg|jpeg|png|webp|gif))'",
409
+ ]
410
+ for pattern in direct_patterns:
411
+ for m in re.finditer(pattern, html, re.I):
412
+ img_url = m.group(1)
413
+ if "duckduckgo.com" not in img_url and img_url not in [i["image"] for i in items]:
414
+ items.append({
415
+ "title": None,
416
+ "image": img_url,
417
+ "thumbnail": img_url,
418
+ "width": None,
419
+ "height": None,
420
+ "source": None,
421
+ })
422
+ if len(items) >= count:
423
+ break
424
+ if len(items) >= count:
425
+ break
426
+
427
+ logging.info(f"[_ddg_images_html] Found {len(items)} images for query: {query}")
428
+ return _normalize_image_results(items)
429
+ except Exception as e:
430
+ logging.error(f"[_ddg_images_html] Failed for query '{query}': {e}")
431
+ return []
321
432
 
322
433
 
323
- def _ddg_images(query: str, count: int = 25) -> List[Dict[str, Any]]:
324
- vqd = _ddg_get_vqd(query)
325
- if not vqd:
326
- # Fallback to simple HTML scraping if token not found
327
- return _ddg_images_html(query, count)
328
- headers = {"User-Agent": USER_AGENT, "Referer": "https://duckduckgo.com/"}
329
- params = {
330
- "l": "us-en",
331
- "o": "json",
332
- "q": query,
333
- "vqd": vqd,
334
- "f": ",",
335
- "p": "1",
336
- "s": "0",
337
- }
338
- # Fetch multiple pages to maximize results in a single logical call
339
- raw_results: List[Dict[str, Any]] = []
340
- next_url = "https://duckduckgo.com/i.js"
341
- while len(raw_results) < count and next_url:
342
- resp = requests.get(next_url, headers=headers, params=params, timeout=20)
343
- resp.raise_for_status()
344
- data = resp.json()
345
- raw_results.extend(data.get("results") or [])
346
- next_url = data.get("next")
347
- params = None # subsequent calls use absolute next URL
348
- if not next_url:
349
- break
350
- items: List[Dict[str, Any]] = []
351
- for it in raw_results[: max(1, min(count, 200))]:
352
- items.append({
353
- "title": it.get("title"),
354
- "image": it.get("image"),
355
- "thumbnail": it.get("thumbnail"),
356
- "width": it.get("width"),
357
- "height": it.get("height"),
358
- "source": it.get("url"),
359
- })
360
- normalized = _normalize_image_results(items)
361
- if not normalized:
362
- # Extra fallback to HTML scrape if i.js yields nothing
363
- return _ddg_images_html(query, count)
364
- return normalized
434
+ # DuckDuckGo JSON API method removed - no API key available, using HTML scraping only
365
435
 
366
436
 
367
437
  async def web_search(query: str, count: int = 25, enrich: bool = True) -> str:
368
438
  try:
369
- results = _ddg_web(query, count)
370
- if enrich:
439
+ results: List[Dict[str, Any]] = []
440
+ used_google = False
441
+ # Prefer Google CSE when configured
442
+ if _has_google_cse_env():
443
+ try:
444
+ raw = await google_cse_search(text=query, parameters={"num": max(1, min(count, 10))})
445
+ data = json.loads(raw) if raw else {}
446
+ results = _normalize_cse_web_results(data)
447
+ used_google = True
448
+ except Exception:
449
+ used_google = False
450
+ results = []
451
+
452
+ if not results:
453
+ results = _ddg_web(query, count)
454
+
455
+ if enrich and results:
456
+ # Enrich only for web-page items
371
457
  results = _enrich_web_results_with_meta(results)
458
+
372
459
  if not results:
373
460
  return json.dumps({"status": "No relevant results found."})
461
+
374
462
  return json.dumps(results, indent=2)
375
463
  except Exception as exc:
376
464
  return json.dumps({"error": f"Web search failed: {str(exc)}"})
377
465
 
378
466
 
379
- async def image_search(query: str, count: int = 25) -> str:
467
+ def _make_image_session():
380
468
  try:
381
- results = _ddg_images(query, count)
469
+ s = requests.Session()
470
+ s.headers.update({
471
+ "User-Agent": USER_AGENT,
472
+ "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
473
+ "Accept-Language": "en-US,en;q=0.9",
474
+ "Cache-Control": "no-cache",
475
+ })
476
+ return s
477
+ except Exception:
478
+ return None
479
+
480
+
481
+ def _is_downloadable_image(url: str, session=None, timeout: int = 15) -> bool:
482
+ if not url:
483
+ return False
484
+ try:
485
+ s = session or _make_image_session()
486
+ except Exception:
487
+ s = None
488
+ try:
489
+ if not s:
490
+ s = requests
491
+ r = s.get(url, stream=True, timeout=timeout, allow_redirects=True)
492
+ ct = (r.headers.get("content-type") or "").lower()
493
+ if r.status_code != 200:
494
+ return False
495
+ if ct.startswith("image/"):
496
+ return True
497
+ # Peek first bytes for magic
498
+ try:
499
+ first_chunk = next(r.iter_content(2048), b"")
500
+ except Exception:
501
+ first_chunk = b""
502
+ if first_chunk:
503
+ sigs = [b"\x89PNG\r\n\x1a\n", b"\xff\xd8\xff", b"GIF87a", b"GIF89a", b"RIFF"]
504
+ return any(first_chunk.startswith(sig) for sig in sigs)
505
+ return False
506
+ except Exception:
507
+ return False
508
+
509
+
510
+ async def image_search(query: str, count: int = 25, verify_download: bool = True, required_terms: Optional[List[str]] = None, allowed_domains: Optional[List[str]] = None, strict_entity: bool = False) -> str:
511
+ try:
512
+ # Simple query variants - avoid over-expanding which dilutes results
513
+ def generate_query_variants(q: str) -> List[str]:
514
+ base = q.strip()
515
+ # Only add ONE quality variant to keep results focused
516
+ variants = [
517
+ base, # Primary: exact query
518
+ f"{base} hd", # Secondary: just add quality term
519
+ ]
520
+ return variants
521
+
522
+ results: List[Dict[str, Any]] = []
523
+ # Prefer Google CSE when configured; try multiple high-quality variants in parallel
524
+ if _has_google_cse_env():
525
+ try:
526
+ logging.info(f"[image_search] Using Google CSE for query: {query}")
527
+ variants = generate_query_variants(query)[:2] # cap to 2 calls to stay focused
528
+ params_base = {
529
+ "num": max(1, min(count, 10)),
530
+ "searchType": "image",
531
+ # No strict size/type constraints; let ranking + verification decide
532
+ "safe": "active",
533
+ }
534
+ tasks = [
535
+ google_cse_search(text=v, parameters=dict(params_base))
536
+ for v in variants
537
+ ]
538
+ raws = await asyncio.gather(*tasks)
539
+ merged: List[Dict[str, Any]] = []
540
+ for raw in raws:
541
+ try:
542
+ data = json.loads(raw) if raw else {}
543
+ merged.extend(_normalize_cse_image_results(data))
544
+ except Exception:
545
+ continue
546
+ results = merged
547
+ logging.info(f"[image_search] Google CSE returned {len(results)} raw results")
548
+ except Exception as e:
549
+ logging.warning(f"[image_search] Google CSE failed: {e}")
550
+ results = []
551
+
382
552
  if not results:
553
+ # Fallback to DuckDuckGo HTML scraping if CSE disabled or empty
554
+ try:
555
+ logging.info(f"[image_search] Falling back to DuckDuckGo HTML scraping for query: {query}")
556
+ results = _ddg_images_html(query, count)
557
+ logging.info(f"[image_search] DuckDuckGo HTML returned {len(results)} results")
558
+ except Exception as e:
559
+ logging.error(f"[image_search] All methods failed for query '{query}': {e}")
560
+ results = []
561
+
562
+ # Post-filtering and ranking for relevance and quality
563
+ def score(item: Dict[str, Any]) -> int:
564
+ s = 0
565
+ title = (item.get("title") or "").lower()
566
+ url = (item.get("url") or "").lower()
567
+ host = (item.get("host_page_url") or "").lower()
568
+
569
+ # CRITICAL: Strong relevance check - ensure query terms are present
570
+ q_terms = set(re.findall(r"\w+", query.lower()))
571
+ t_terms = set(re.findall(r"\w+", title))
572
+ u_terms = set(re.findall(r"\w+", url))
573
+
574
+ # Primary relevance: query terms in title (highest weight)
575
+ title_overlap = len(q_terms & t_terms)
576
+ s += 10 * title_overlap # Increased from 3 to 10
577
+
578
+ # Secondary relevance: query terms in URL
579
+ url_overlap = len(q_terms & u_terms)
580
+ s += 5 * url_overlap
581
+
582
+ # CRITICAL: If NO query terms match title or URL, heavily penalize
583
+ if title_overlap == 0 and url_overlap == 0:
584
+ s -= 100 # This image is likely completely unrelated
585
+
586
+ # Quality signals
587
+ for ext, ext_score in ((".png", 2), (".jpg", 2), (".jpeg", 2), (".webp", 2), (".svg", 0), (".gif", 0)):
588
+ if url.endswith(ext) or (item.get("original_url") or "").lower().endswith(ext):
589
+ s += ext_score
590
+ break
591
+
592
+ # Penalize low-quality/irrelevant assets (stronger penalties)
593
+ negative_tokens = ["sprite", "icon", "thumbnail", "thumb", "small", "mini", "logo", "watermark", "stock", "avatar", "emoji"]
594
+ s -= 3 * sum(1 for tok in negative_tokens if tok in url) # Increased penalty
595
+
596
+ # Reward quality descriptors
597
+ positive_tokens = ["official", "press", "hd", "4k", "wallpaper", "hero", "high-res", "highres"]
598
+ s += 2 * sum(1 for tok in positive_tokens if tok in title or tok in url)
599
+
600
+ # Reward larger dimensions
601
+ try:
602
+ w = int(item.get("width") or 0)
603
+ h = int(item.get("height") or 0)
604
+ area = w * h
605
+ if area >= 1920 * 1080: # Full HD or larger
606
+ s += 8
607
+ elif area >= 1280 * 720: # HD
608
+ s += 5
609
+ elif area >= 800 * 600: # Decent size
610
+ s += 2
611
+ elif area > 0 and area < 400 * 400: # Too small
612
+ s -= 5
613
+ except Exception:
614
+ pass
615
+
616
+ # Penalize thumbnails
617
+ if ("thumb" in url or "thumbnail" in url) and not item.get("original_url"):
618
+ s -= 5
619
+
620
+ return s
621
+
622
+ # Optional entity/term/domain constraints
623
+ def hostname(url: Optional[str]) -> Optional[str]:
624
+ try:
625
+ from urllib.parse import urlparse
626
+ return urlparse(url).hostname if url else None
627
+ except Exception:
628
+ return None
629
+
630
+ q_terms = set(re.findall(r"\w+", query.lower()))
631
+ req_terms = set((required_terms or []))
632
+
633
+ filtered1: List[Dict[str, Any]] = []
634
+ for it in results:
635
+ try:
636
+ ttl = (it.get("title") or "")
637
+ tset = set(re.findall(r"\w+", ttl.lower()))
638
+ host = hostname(it.get("host_page_url") or it.get("url")) or ""
639
+ if allowed_domains and not any(d.lower() in host.lower() for d in allowed_domains):
640
+ continue
641
+ if req_terms and not req_terms.issubset(tset):
642
+ # If strict mode, skip; otherwise allow but lower score later
643
+ if strict_entity:
644
+ continue
645
+ it = dict(it)
646
+ it["_missing_required_terms"] = True
647
+ # CRITICAL: Always require at least one query term in title OR URL (even without strict mode)
648
+ url_terms = set(re.findall(r"\w+", (it.get("url") or "").lower()))
649
+ has_match = bool(q_terms & tset) or bool(q_terms & url_terms)
650
+
651
+ if strict_entity and not has_match:
652
+ continue
653
+ elif not strict_entity and not has_match:
654
+ # Even in non-strict mode, skip images with ZERO query term matches
655
+ # This prevents completely unrelated images
656
+ continue
657
+
658
+ filtered1.append(it)
659
+ except Exception:
660
+ continue
661
+
662
+ # De-duplicate by original_url or url
663
+ seen = set()
664
+ deduped: List[Dict[str, Any]] = []
665
+ for it in filtered1:
666
+ key = it.get("original_url") or it.get("url")
667
+ if not key or key in seen:
668
+ continue
669
+ seen.add(key)
670
+ deduped.append(it)
671
+
672
+ # Penalize items missing required terms if not strict
673
+ def score_with_penalty(it: Dict[str, Any]) -> int:
674
+ base = score(it)
675
+ if it.get("_missing_required_terms"):
676
+ base -= 5
677
+ return base
678
+
679
+ deduped.sort(key=score_with_penalty, reverse=True)
680
+
681
+ # Filter out images with low scores (unrelated or poor quality)
682
+ # Minimum score threshold: at least some query term overlap is required
683
+ MIN_RELEVANCE_SCORE = 5 # Ensures at least one query term match + some quality
684
+ deduped = [it for it in deduped if score_with_penalty(it) >= MIN_RELEVANCE_SCORE]
685
+
686
+ # Optionally verify downloadability and pick top working images
687
+ if verify_download:
688
+ session = _make_image_session()
689
+ accepted: List[Dict[str, Any]] = []
690
+ for it in deduped:
691
+ if len(accepted) >= count:
692
+ break
693
+ test_url = it.get("original_url") or it.get("url")
694
+ if _is_downloadable_image(test_url, session=session):
695
+ accepted.append(it)
696
+ deduped = accepted
697
+
698
+ deduped = deduped[:count]
699
+
700
+ if not deduped:
383
701
  return json.dumps({"status": "No relevant results found."})
384
- return json.dumps(results, indent=2)
702
+ return json.dumps(deduped, indent=2)
385
703
  except Exception as exc:
386
704
  return json.dumps({"error": f"Image search failed: {str(exc)}"})
387
705
 
388
706
 
389
707
  async def combined_search(query: str, count: int = 25, enrich: bool = True) -> str:
390
708
  try:
391
- web_task = _ddg_web(query, count)
392
- if enrich:
393
- web_task = _enrich_web_results_with_meta(web_task)
394
- img_task = _ddg_images(query, count)
395
709
  combined: List[Dict[str, Any]] = []
396
- combined.extend(web_task)
397
- combined.extend(img_task)
710
+ # Prefer Google for both, with fallback to DDG
711
+ web_results: List[Dict[str, Any]] = []
712
+ img_results: List[Dict[str, Any]] = []
713
+
714
+ if _has_google_cse_env():
715
+ try:
716
+ raw_web = await google_cse_search(text=query, parameters={"num": max(1, min(count, 10))})
717
+ data_web = json.loads(raw_web) if raw_web else {}
718
+ web_results = _normalize_cse_web_results(data_web)
719
+ except Exception:
720
+ web_results = []
721
+ try:
722
+ raw_img = await google_cse_search(text=query, parameters={"num": max(1, min(count, 10)), "searchType": "image"})
723
+ data_img = json.loads(raw_img) if raw_img else {}
724
+ img_results = _normalize_cse_image_results(data_img)
725
+ except Exception:
726
+ img_results = []
727
+
728
+ if not web_results:
729
+ web_results = _ddg_web(query, count)
730
+ if enrich and web_results:
731
+ web_results = _enrich_web_results_with_meta(web_results)
732
+ if not img_results:
733
+ img_results = _ddg_images_html(query, count)
734
+
735
+ combined.extend(web_results)
736
+ combined.extend(img_results)
398
737
  if not combined:
399
738
  return json.dumps({"status": "No relevant results found."})
400
739
  return json.dumps(combined, indent=2)
@@ -408,6 +747,11 @@ async def collect_task_images(
408
747
  allowed_domains: Optional[List[str]] = None,
409
748
  verify_download: bool = True,
410
749
  work_dir: Optional[str] = None,
750
+ required_terms: Optional[List[str]] = None,
751
+ strict_entity: bool = False,
752
+ min_width: int = 0,
753
+ min_height: int = 0,
754
+ dedup_content: bool = True,
411
755
  ) -> str:
412
756
  """
413
757
  Search for task-relevant images, optionally filter by allowed domains, download locally,
@@ -421,8 +765,8 @@ async def collect_task_images(
421
765
  - work_dir: directory to save files; defaults to current working directory
422
766
  """
423
767
  try:
424
- # Step 1: search many to have selection headroom
425
- raw_json = await image_search(query, count=max(count * 3, count))
768
+ # Step 1: search many to have selection headroom; disable double verification here
769
+ raw_json = await image_search(query, count=max(count * 3, count), verify_download=False, required_terms=required_terms, allowed_domains=allowed_domains, strict_entity=strict_entity)
426
770
  parsed = json.loads(raw_json) if raw_json else []
427
771
  # Normalize parsed results to a list of dicts; handle dict status payloads gracefully
428
772
  if isinstance(parsed, dict):
@@ -433,6 +777,17 @@ async def collect_task_images(
433
777
  else:
434
778
  results = []
435
779
 
780
+ # Allow default domains from env if not provided
781
+ if not allowed_domains:
782
+ try:
783
+ env_domains = os.getenv("IMAGE_ALLOWED_DOMAINS")
784
+ if env_domains:
785
+ allowed_domains = [d.strip() for d in env_domains.split(",") if d.strip()]
786
+ except Exception:
787
+ allowed_domains = None
788
+
789
+ logging.info(f"[collect_task_images] Found {len(results)} raw results before filtering. allowed_domains={allowed_domains}, required_terms={required_terms}, strict_entity={strict_entity}")
790
+
436
791
  # Step 2: relevance filter by domain and title match
437
792
  def hostname(url: Optional[str]) -> Optional[str]:
438
793
  try:
@@ -442,22 +797,35 @@ async def collect_task_images(
442
797
  return None
443
798
 
444
799
  query_terms = set(re.findall(r"\w+", query.lower()))
800
+ req_terms_lower = set(t.lower() for t in (required_terms or []))
445
801
  filtered: List[Dict[str, Any]] = []
446
802
  for it in results:
447
803
  if not isinstance(it, dict):
448
804
  continue
449
805
  host = hostname(it.get("host_page_url") or it.get("url")) or ""
806
+ # Domain filter (if specified)
450
807
  if allowed_domains:
451
808
  if not any(d.lower() in (host or "").lower() for d in allowed_domains):
809
+ logging.debug(f"[collect_task_images] Skipped (domain mismatch): host={host}, allowed={allowed_domains}")
452
810
  continue
453
811
  title = (it.get("title") or "").lower()
812
+ url = (it.get("url") or "").lower()
454
813
  title_terms = set(re.findall(r"\w+", title))
814
+ url_terms = set(re.findall(r"\w+", url))
815
+ # Enforce strict entity/required terms if requested (check both title AND url)
816
+ if strict_entity and req_terms_lower:
817
+ combined_terms = title_terms | url_terms
818
+ if not req_terms_lower.issubset(combined_terms):
819
+ logging.debug(f"[collect_task_images] Skipped (strict_entity): required={req_terms_lower}, found={combined_terms}")
820
+ continue
455
821
  overlap = len(query_terms & title_terms)
456
822
  it_copy = dict(it)
457
823
  it_copy["_rank"] = overlap
458
824
  it_copy["_host"] = host
459
825
  filtered.append(it_copy)
460
826
 
827
+ logging.info(f"[collect_task_images] {len(filtered)} results after domain/entity filtering")
828
+
461
829
  # Rank by overlap desc, then presence of host_page_url
462
830
  filtered.sort(key=lambda x: (x.get("_rank", 0), bool(x.get("host_page_url"))), reverse=True)
463
831
 
@@ -494,10 +862,12 @@ async def collect_task_images(
494
862
  return False
495
863
 
496
864
  used = 0
865
+ seen_hashes: set = set()
497
866
  for it in filtered:
498
867
  if used >= count:
499
868
  break
500
- img_url = it.get("url")
869
+ # Prefer original_url if available
870
+ img_url = it.get("original_url") or it.get("url")
501
871
  if not img_url:
502
872
  skipped.append({"reason": "missing_url", "item": it})
503
873
  continue
@@ -505,9 +875,11 @@ async def collect_task_images(
505
875
  skipped.append({"reason": "verify_failed", "url": img_url})
506
876
  continue
507
877
 
508
- # safe filename
878
+ # Determine extension; if SVG, download as .svg then convert to PNG
509
879
  base = re.sub(r"[^a-zA-Z0-9_-]+", "_", (it.get("title") or "image").strip())[:80] or "image"
510
- filename = f"{base}_{used+1}.jpg"
880
+ url_lower = (img_url or "").lower()
881
+ is_svg = url_lower.endswith(".svg") or ".svg" in url_lower
882
+ filename = f"{base}_{used+1}.svg" if is_svg else f"{base}_{used+1}.jpg"
511
883
  dl_json = await download_image(img_url, filename, work_dir)
512
884
  dl = json.loads(dl_json)
513
885
  if dl.get("status") != "success":
@@ -515,6 +887,57 @@ async def collect_task_images(
515
887
  continue
516
888
 
517
889
  file_path = dl.get("file_path")
890
+ # If SVG, convert to PNG for PIL compatibility
891
+ if is_svg and file_path and os.path.exists(file_path):
892
+ try:
893
+ import cairosvg # type: ignore
894
+ png_path = os.path.splitext(file_path)[0] + ".png"
895
+ cairosvg.svg2png(url=file_path, write_to=png_path)
896
+ try:
897
+ os.remove(file_path)
898
+ except Exception:
899
+ pass
900
+ file_path = png_path
901
+ except Exception as e:
902
+ skipped.append({"reason": "svg_convert_failed", "url": img_url, "error": str(e)})
903
+ try:
904
+ os.remove(file_path)
905
+ except Exception:
906
+ pass
907
+ continue
908
+ # Optional dimension filter
909
+ try:
910
+ if (min_width or min_height) and file_path and os.path.exists(file_path):
911
+ with Image.open(file_path) as im:
912
+ w, h = im.size
913
+ if (min_width and w < min_width) or (min_height and h < min_height):
914
+ skipped.append({"reason": "too_small", "url": img_url, "width": w, "height": h})
915
+ try:
916
+ os.remove(file_path)
917
+ except Exception:
918
+ pass
919
+ continue
920
+ except Exception:
921
+ pass
922
+
923
+ # Optional content deduplication by hash
924
+ try:
925
+ if dedup_content and file_path and os.path.exists(file_path):
926
+ hasher = hashlib.sha256()
927
+ with open(file_path, "rb") as fh:
928
+ for chunk in iter(lambda: fh.read(1024 * 1024), b""):
929
+ hasher.update(chunk)
930
+ digest = hasher.hexdigest()
931
+ if digest in seen_hashes:
932
+ skipped.append({"reason": "content_duplicate", "url": img_url})
933
+ try:
934
+ os.remove(file_path)
935
+ except Exception:
936
+ pass
937
+ continue
938
+ seen_hashes.add(digest)
939
+ except Exception:
940
+ pass
518
941
  # Upload if configured, else mark as local only
519
942
  azure_conn = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
520
943
  if azure_conn:
@@ -561,6 +984,140 @@ async def collect_task_images(
561
984
  return json.dumps({"error": f"collect_task_images failed: {str(exc)}"})
562
985
 
563
986
 
987
async def collect_images_by_pattern(
    base_url: str,
    filename_pattern: str,
    start: int,
    end: int,
    zpad: int = 0,
    ext: str = "png",
    work_dir: Optional[str] = None,
) -> str:
    """
    Enumerate image URLs from a predictable numeric pattern and download them.

    ``filename_pattern`` is formatted with a single keyword, ``i`` (the
    optionally zero-padded index). Example::

        base_url="https://example.com/images/", filename_pattern="photo_{i}",
        start=1, end=3, zpad=3
        -> https://example.com/images/photo_001.png ... photo_003.png

    Constructed URL: ``urljoin(base_url, filename_pattern.format(i=idx) + "." + ext)``.

    Args:
        base_url: Base URL each generated filename is joined against.
            NOTE: per ``urljoin`` semantics, a base without a trailing slash
            has its last path segment replaced by the filename.
        filename_pattern: ``str.format`` pattern; must reference only ``{i}``.
        start: First index (inclusive).
        end: Last index (inclusive); an empty range downloads nothing.
        zpad: Zero-pad width for the index (0 = no padding).
        ext: File extension, accepted with or without a leading dot.
        work_dir: Download directory (defaults to CWD; created if missing).

    Returns:
        JSON string with the request parameters, downloaded local file
        entries, and per-URL error records.
    """
    try:
        if not work_dir:
            work_dir = os.getcwd()
        os.makedirs(work_dir, exist_ok=True)

        from .file_tools import download_image

        # Loop-invariant: normalize the extension once instead of per index.
        clean_ext = ext.lstrip(".")

        downloaded: List[Dict[str, Any]] = []
        errors: List[Dict[str, Any]] = []

        for i in range(start, end + 1):
            try:
                idx = str(i).zfill(zpad) if zpad > 0 else str(i)
                name = filename_pattern.format(i=idx)
                url = urljoin(base_url, f"{name}.{clean_ext}")
                # Sanitize the generated name for use as a local filename.
                safe_name = re.sub(r"[^a-zA-Z0-9_-]+", "_", name) + f".{clean_ext}"
                dl_json = await download_image(url, safe_name, work_dir)
                dl = json.loads(dl_json)
                if dl.get("status") == "success":
                    downloaded.append({"url": url, "file_path": dl.get("file_path")})
                else:
                    errors.append({"url": url, "error": dl})
            except Exception as e:
                # A per-index failure (bad pattern key, network error,
                # malformed JSON) is recorded and does not abort the rest.
                errors.append({"i": i, "error": str(e)})

        return json.dumps({
            "base_url": base_url,
            "pattern": filename_pattern,
            "range": [start, end],
            "ext": ext,
            "downloaded": downloaded,
            "errors": errors,
        }, indent=2)
    except Exception as exc:
        return json.dumps({"error": f"collect_images_by_pattern failed: {str(exc)}"})
1039
+
1040
+
1041
async def scrape_image_gallery(
    page_url: str,
    css_selector: Optional[str] = None,
    attribute: str = "src",
    max_images: int = 100,
    work_dir: Optional[str] = None,
) -> str:
    """
    Fetch a gallery page, harvest image URLs, and download each image.

    Selection strategy:
      * When ``css_selector`` is given, read ``attribute`` (``src`` or
        ``href``) from every matching element via BeautifulSoup (best-effort).
      * When that yields nothing — or no selector was supplied — fall back to
        regex-scanning the raw HTML for ``<img src="...">`` tags and for
        ``<a href="...">`` links whose target looks like an image file.

    Returns:
        JSON string with downloaded local file paths and skipped entries.
    """
    try:
        resp = requests.get(page_url, headers={"User-Agent": USER_AGENT}, timeout=20)
        resp.raise_for_status()
        page_html = resp.text
        # Resolve relative links against the final (post-redirect) URL.
        resolve_base = str(resp.url or page_url)

        candidates: List[str] = []
        if css_selector:
            try:
                from bs4 import BeautifulSoup
                doc = BeautifulSoup(page_html, "html.parser")
                for node in doc.select(css_selector):
                    value = node.get(attribute)
                    if isinstance(value, str):
                        candidates.append(urljoin(resolve_base, value))
            except Exception:
                # bs4 unavailable or selector failed: use the regex fallback.
                pass
        if not candidates:
            # Fallback: raw-HTML scan for <img src> and image-like <a href>.
            for match in re.finditer(r'<img[^>]+src="([^"]+)"', page_html, flags=re.I):
                candidates.append(urljoin(resolve_base, html_lib.unescape(match.group(1))))
            for match in re.finditer(r'<a[^>]+href="([^"]+)"', page_html, flags=re.I):
                link = html_lib.unescape(match.group(1))
                if re.search(r'\.(?:png|jpg|jpeg|webp|gif)(?:\?|$)', link, flags=re.I):
                    candidates.append(urljoin(resolve_base, link))

        # Order-preserving de-duplication, capped at max_images.
        unique_urls = list(dict.fromkeys(candidates))[:max_images]

        if not work_dir:
            work_dir = os.getcwd()
        os.makedirs(work_dir, exist_ok=True)

        from .file_tools import download_image
        downloaded: List[Dict[str, Any]] = []
        skipped: List[Dict[str, Any]] = []
        for position, image_url in enumerate(unique_urls, 1):
            try:
                # Derive a local filename extension from the URL, else "jpg".
                suffix_match = re.search(r'\.([a-zA-Z0-9]{3,4})(?:\?|$)', image_url)
                suffix = suffix_match.group(1) if suffix_match else "jpg"
                local_name = f"gallery_{position}.{suffix}"
                result = json.loads(await download_image(image_url, local_name, work_dir))
                if result.get("status") == "success":
                    downloaded.append({"url": image_url, "file_path": result.get("file_path")})
                else:
                    skipped.append({"url": image_url, "error": result})
            except Exception as e:
                skipped.append({"url": image_url, "error": str(e)})

        return json.dumps({
            "page_url": page_url,
            "collected": len(downloaded),
            "downloaded": downloaded,
            "skipped": skipped,
        }, indent=2)
    except Exception as exc:
        return json.dumps({"error": f"scrape_image_gallery failed: {str(exc)}"})
1119
+
1120
+
564
1121
  async def _perform_single_cognitive_search(
565
1122
  query: str = "*",
566
1123
  index_name: str = "indexwires",