@aj-archipelago/cortex 1.3.65 → 1.3.67
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/helper-apps/cortex-autogen2/Dockerfile +88 -21
- package/helper-apps/cortex-autogen2/docker-compose.yml +15 -8
- package/helper-apps/cortex-autogen2/host.json +5 -0
- package/helper-apps/cortex-autogen2/pyproject.toml +82 -25
- package/helper-apps/cortex-autogen2/requirements.txt +84 -14
- package/helper-apps/cortex-autogen2/services/redis_publisher.py +129 -3
- package/helper-apps/cortex-autogen2/task_processor.py +432 -116
- package/helper-apps/cortex-autogen2/tools/__init__.py +2 -0
- package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +32 -0
- package/helper-apps/cortex-autogen2/tools/azure_foundry_agents.py +50 -14
- package/helper-apps/cortex-autogen2/tools/file_tools.py +169 -44
- package/helper-apps/cortex-autogen2/tools/google_cse.py +117 -0
- package/helper-apps/cortex-autogen2/tools/search_tools.py +655 -98
- package/lib/entityConstants.js +1 -1
- package/lib/pathwayManager.js +42 -8
- package/lib/pathwayTools.js +3 -3
- package/lib/util.js +58 -2
- package/package.json +1 -1
- package/pathways/system/entity/memory/sys_memory_format.js +1 -0
- package/pathways/system/entity/memory/sys_memory_manager.js +3 -3
- package/pathways/system/entity/sys_entity_start.js +1 -1
- package/pathways/system/entity/tools/sys_tool_bing_search_afagent.js +2 -0
- package/pathways/system/entity/tools/sys_tool_codingagent.js +2 -2
- package/pathways/system/entity/tools/sys_tool_google_search.js +3 -3
- package/pathways/system/entity/tools/sys_tool_grok_x_search.js +12 -2
- package/pathways/system/workspaces/run_workspace_prompt.js +0 -3
- package/server/executeWorkspace.js +381 -0
- package/server/graphql.js +5 -180
- package/server/pathwayResolver.js +3 -3
- package/server/plugins/apptekTranslatePlugin.js +2 -2
- package/server/plugins/azureFoundryAgentsPlugin.js +1 -1
- package/tests/unit/core/parser.test.js +0 -1
- package/tests/unit/core/pathwayManagerWithFiles.test.js +256 -0
- package/tests/unit/graphql_executeWorkspace_transformation.test.js +244 -0
- package/tests/unit/server/graphql.test.js +122 -1
|
@@ -12,12 +12,16 @@ import os
|
|
|
12
12
|
import requests
|
|
13
13
|
import json
|
|
14
14
|
from typing import Dict, Any, List, Optional
|
|
15
|
+
import hashlib
|
|
16
|
+
from PIL import Image
|
|
15
17
|
import asyncio # Import asyncio
|
|
16
18
|
import matplotlib.pyplot as plt
|
|
17
19
|
import pandas as pd
|
|
18
20
|
import re
|
|
19
21
|
import urllib.parse
|
|
20
22
|
import html as html_lib
|
|
23
|
+
from .google_cse import google_cse_search
|
|
24
|
+
from urllib.parse import urljoin, urlparse
|
|
21
25
|
|
|
22
26
|
# try:
|
|
23
27
|
# except ImportError:
|
|
@@ -58,16 +62,106 @@ def _normalize_image_results(items: List[Dict[str, Any]]) -> List[Dict[str, Any]
|
|
|
58
62
|
url = item.get("image") or item.get("url") or item.get("thumbnail")
|
|
59
63
|
if not url:
|
|
60
64
|
continue
|
|
65
|
+
# For Wikimedia thumbnail URLs, add an "original_url" when derivable
|
|
66
|
+
original_url = None
|
|
67
|
+
try:
|
|
68
|
+
if isinstance(url, str) and "upload.wikimedia.org" in url and "/thumb/" in url:
|
|
69
|
+
parts = url.split("/thumb/")
|
|
70
|
+
if len(parts) == 2:
|
|
71
|
+
tail = parts[1]
|
|
72
|
+
segs = tail.split("/")
|
|
73
|
+
if len(segs) >= 3:
|
|
74
|
+
original_url = parts[0] + "/" + segs[0] + "/" + segs[1] + "/" + segs[2]
|
|
75
|
+
except Exception:
|
|
76
|
+
original_url = None
|
|
61
77
|
normalized.append({
|
|
62
78
|
"type": "image",
|
|
63
79
|
"title": item.get("title"),
|
|
64
80
|
"url": url,
|
|
81
|
+
"original_url": original_url,
|
|
65
82
|
"thumbnail_url": item.get("thumbnail"),
|
|
66
83
|
"width": item.get("width"),
|
|
67
84
|
"height": item.get("height"),
|
|
68
85
|
"host_page_url": item.get("source") or item.get("page") or item.get("referrer"),
|
|
69
86
|
})
|
|
70
87
|
return normalized
|
|
88
|
+
def _normalize_cse_web_results(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
89
|
+
"""
|
|
90
|
+
Normalize Google CSE (web) response to our common web result shape.
|
|
91
|
+
"""
|
|
92
|
+
items = (payload or {}).get("items") or []
|
|
93
|
+
normalized: List[Dict[str, Any]] = []
|
|
94
|
+
for it in items:
|
|
95
|
+
title = it.get("title")
|
|
96
|
+
url = it.get("link")
|
|
97
|
+
snippet = it.get("snippet") or (it.get("htmlSnippet") and re.sub('<[^<]+?>', '', it.get("htmlSnippet")))
|
|
98
|
+
if url and title:
|
|
99
|
+
normalized.append({
|
|
100
|
+
"type": "webpage",
|
|
101
|
+
"title": title,
|
|
102
|
+
"url": url,
|
|
103
|
+
"snippet": snippet,
|
|
104
|
+
})
|
|
105
|
+
return normalized
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _normalize_cse_image_results(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
109
|
+
"""
|
|
110
|
+
Normalize Google CSE (image) response to our common image result shape.
|
|
111
|
+
Handles both standard items array and pagemap-based image results.
|
|
112
|
+
"""
|
|
113
|
+
normalized: List[Dict[str, Any]] = []
|
|
114
|
+
|
|
115
|
+
# Method 1: Standard items array (searchType=image)
|
|
116
|
+
items = (payload or {}).get("items") or []
|
|
117
|
+
for it in items:
|
|
118
|
+
link = it.get("link") # direct image URL
|
|
119
|
+
image_obj = it.get("image") or {}
|
|
120
|
+
pagemap = it.get("pagemap") or {}
|
|
121
|
+
cse_image = (pagemap.get("cse_image") or [{}])[0] if "cse_image" in pagemap else {}
|
|
122
|
+
|
|
123
|
+
# Prefer direct link, fallback to pagemap images
|
|
124
|
+
img_url = link or image_obj.get("thumbnailLink") or cse_image.get("src")
|
|
125
|
+
if not img_url:
|
|
126
|
+
continue
|
|
127
|
+
|
|
128
|
+
normalized.append({
|
|
129
|
+
"type": "image",
|
|
130
|
+
"title": it.get("title") or cse_image.get("alt"),
|
|
131
|
+
"url": img_url,
|
|
132
|
+
"original_url": link or img_url,
|
|
133
|
+
"thumbnail_url": image_obj.get("thumbnailLink") or img_url,
|
|
134
|
+
"width": image_obj.get("width") or cse_image.get("width"),
|
|
135
|
+
"height": image_obj.get("height") or cse_image.get("height"),
|
|
136
|
+
"host_page_url": image_obj.get("contextLink") or it.get("link"),
|
|
137
|
+
})
|
|
138
|
+
|
|
139
|
+
# Method 2: Extract from pagemap in web results (fallback)
|
|
140
|
+
if not normalized:
|
|
141
|
+
for it in items:
|
|
142
|
+
pagemap = it.get("pagemap") or {}
|
|
143
|
+
cse_images = pagemap.get("cse_image") or []
|
|
144
|
+
for img in cse_images:
|
|
145
|
+
img_url = img.get("src")
|
|
146
|
+
if img_url:
|
|
147
|
+
normalized.append({
|
|
148
|
+
"type": "image",
|
|
149
|
+
"title": it.get("title"),
|
|
150
|
+
"url": img_url,
|
|
151
|
+
"original_url": img_url,
|
|
152
|
+
"thumbnail_url": img_url,
|
|
153
|
+
"width": img.get("width"),
|
|
154
|
+
"height": img.get("height"),
|
|
155
|
+
"host_page_url": it.get("link"),
|
|
156
|
+
})
|
|
157
|
+
|
|
158
|
+
logging.info(f"[_normalize_cse_image_results] Extracted {len(normalized)} images from CSE payload")
|
|
159
|
+
return normalized
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _has_google_cse_env() -> bool:
|
|
163
|
+
return bool(os.getenv("GOOGLE_CSE_KEY") and os.getenv("GOOGLE_CSE_CX"))
|
|
164
|
+
|
|
71
165
|
|
|
72
166
|
|
|
73
167
|
def _extract_snippet_near(html: str, start_pos: int) -> Optional[str]:
|
|
@@ -272,129 +366,374 @@ async def fetch_webpage(url: str, render: bool = False, timeout_s: int = 20, max
|
|
|
272
366
|
return json.dumps({"error": f"Fetch failed: {str(exc)}"})
|
|
273
367
|
|
|
274
368
|
|
|
275
|
-
|
|
276
|
-
headers = {"User-Agent": USER_AGENT, "Referer": "https://duckduckgo.com/"}
|
|
277
|
-
url = f"https://duckduckgo.com/?q={urllib.parse.quote_plus(query)}&iax=images&ia=images"
|
|
278
|
-
resp = requests.get(url, headers=headers, timeout=20)
|
|
279
|
-
resp.raise_for_status()
|
|
280
|
-
text = resp.text
|
|
281
|
-
# Common patterns seen in the page scripts
|
|
282
|
-
# Try multiple patterns; DDG frequently changes this
|
|
283
|
-
m = re.search(r"vqd='([\w-]+)'", text)
|
|
284
|
-
if not m:
|
|
285
|
-
m = re.search(r'vqd="([\w-]+)"', text)
|
|
286
|
-
if not m:
|
|
287
|
-
m = re.search(r'vqd=([\w-]+)&', text)
|
|
288
|
-
return m.group(1) if m else None
|
|
369
|
+
# DuckDuckGo vqd token method removed - no API key needed, using HTML scraping only
|
|
289
370
|
|
|
290
371
|
|
|
291
372
|
def _ddg_images_html(query: str, count: int = 25) -> List[Dict[str, Any]]:
|
|
292
373
|
headers = {"User-Agent": USER_AGENT, "Referer": "https://duckduckgo.com/"}
|
|
293
374
|
url = f"https://duckduckgo.com/?q={urllib.parse.quote_plus(query)}&ia=images&iar=images"
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
375
|
+
try:
|
|
376
|
+
resp = requests.get(url, headers=headers, timeout=20)
|
|
377
|
+
resp.raise_for_status()
|
|
378
|
+
html = resp.text
|
|
379
|
+
items: List[Dict[str, Any]] = []
|
|
380
|
+
|
|
381
|
+
# Method 1: Look for external-content proxied URLs
|
|
382
|
+
for m in re.finditer(r'(?:src|data-src)="(https://external-content\.duckduckgo\.com/iu/\?u=[^"]+)"', html):
|
|
383
|
+
proxy = html_lib.unescape(m.group(1))
|
|
384
|
+
try:
|
|
385
|
+
parsed = urllib.parse.urlparse(proxy)
|
|
386
|
+
qs = urllib.parse.parse_qs(parsed.query)
|
|
387
|
+
orig = qs.get('u', [None])[0]
|
|
388
|
+
if not orig:
|
|
389
|
+
continue
|
|
390
|
+
orig = urllib.parse.unquote(orig)
|
|
391
|
+
items.append({
|
|
392
|
+
"title": None,
|
|
393
|
+
"image": orig,
|
|
394
|
+
"thumbnail": proxy,
|
|
395
|
+
"width": None,
|
|
396
|
+
"height": None,
|
|
397
|
+
"source": None,
|
|
398
|
+
})
|
|
399
|
+
if len(items) >= count:
|
|
400
|
+
break
|
|
401
|
+
except Exception:
|
|
306
402
|
continue
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
"
|
|
312
|
-
"
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
403
|
+
|
|
404
|
+
# Method 2: Look for direct image URLs in the page
|
|
405
|
+
if len(items) < count // 2:
|
|
406
|
+
direct_patterns = [
|
|
407
|
+
r'"(https://[^"]+\.(?:jpg|jpeg|png|webp|gif))"',
|
|
408
|
+
r"'(https://[^']+\.(?:jpg|jpeg|png|webp|gif))'",
|
|
409
|
+
]
|
|
410
|
+
for pattern in direct_patterns:
|
|
411
|
+
for m in re.finditer(pattern, html, re.I):
|
|
412
|
+
img_url = m.group(1)
|
|
413
|
+
if "duckduckgo.com" not in img_url and img_url not in [i["image"] for i in items]:
|
|
414
|
+
items.append({
|
|
415
|
+
"title": None,
|
|
416
|
+
"image": img_url,
|
|
417
|
+
"thumbnail": img_url,
|
|
418
|
+
"width": None,
|
|
419
|
+
"height": None,
|
|
420
|
+
"source": None,
|
|
421
|
+
})
|
|
422
|
+
if len(items) >= count:
|
|
423
|
+
break
|
|
424
|
+
if len(items) >= count:
|
|
425
|
+
break
|
|
426
|
+
|
|
427
|
+
logging.info(f"[_ddg_images_html] Found {len(items)} images for query: {query}")
|
|
428
|
+
return _normalize_image_results(items)
|
|
429
|
+
except Exception as e:
|
|
430
|
+
logging.error(f"[_ddg_images_html] Failed for query '{query}': {e}")
|
|
431
|
+
return []
|
|
321
432
|
|
|
322
433
|
|
|
323
|
-
|
|
324
|
-
vqd = _ddg_get_vqd(query)
|
|
325
|
-
if not vqd:
|
|
326
|
-
# Fallback to simple HTML scraping if token not found
|
|
327
|
-
return _ddg_images_html(query, count)
|
|
328
|
-
headers = {"User-Agent": USER_AGENT, "Referer": "https://duckduckgo.com/"}
|
|
329
|
-
params = {
|
|
330
|
-
"l": "us-en",
|
|
331
|
-
"o": "json",
|
|
332
|
-
"q": query,
|
|
333
|
-
"vqd": vqd,
|
|
334
|
-
"f": ",",
|
|
335
|
-
"p": "1",
|
|
336
|
-
"s": "0",
|
|
337
|
-
}
|
|
338
|
-
# Fetch multiple pages to maximize results in a single logical call
|
|
339
|
-
raw_results: List[Dict[str, Any]] = []
|
|
340
|
-
next_url = "https://duckduckgo.com/i.js"
|
|
341
|
-
while len(raw_results) < count and next_url:
|
|
342
|
-
resp = requests.get(next_url, headers=headers, params=params, timeout=20)
|
|
343
|
-
resp.raise_for_status()
|
|
344
|
-
data = resp.json()
|
|
345
|
-
raw_results.extend(data.get("results") or [])
|
|
346
|
-
next_url = data.get("next")
|
|
347
|
-
params = None # subsequent calls use absolute next URL
|
|
348
|
-
if not next_url:
|
|
349
|
-
break
|
|
350
|
-
items: List[Dict[str, Any]] = []
|
|
351
|
-
for it in raw_results[: max(1, min(count, 200))]:
|
|
352
|
-
items.append({
|
|
353
|
-
"title": it.get("title"),
|
|
354
|
-
"image": it.get("image"),
|
|
355
|
-
"thumbnail": it.get("thumbnail"),
|
|
356
|
-
"width": it.get("width"),
|
|
357
|
-
"height": it.get("height"),
|
|
358
|
-
"source": it.get("url"),
|
|
359
|
-
})
|
|
360
|
-
normalized = _normalize_image_results(items)
|
|
361
|
-
if not normalized:
|
|
362
|
-
# Extra fallback to HTML scrape if i.js yields nothing
|
|
363
|
-
return _ddg_images_html(query, count)
|
|
364
|
-
return normalized
|
|
434
|
+
# DuckDuckGo JSON API method removed - no API key available, using HTML scraping only
|
|
365
435
|
|
|
366
436
|
|
|
367
437
|
async def web_search(query: str, count: int = 25, enrich: bool = True) -> str:
|
|
368
438
|
try:
|
|
369
|
-
results =
|
|
370
|
-
|
|
439
|
+
results: List[Dict[str, Any]] = []
|
|
440
|
+
used_google = False
|
|
441
|
+
# Prefer Google CSE when configured
|
|
442
|
+
if _has_google_cse_env():
|
|
443
|
+
try:
|
|
444
|
+
raw = await google_cse_search(text=query, parameters={"num": max(1, min(count, 10))})
|
|
445
|
+
data = json.loads(raw) if raw else {}
|
|
446
|
+
results = _normalize_cse_web_results(data)
|
|
447
|
+
used_google = True
|
|
448
|
+
except Exception:
|
|
449
|
+
used_google = False
|
|
450
|
+
results = []
|
|
451
|
+
|
|
452
|
+
if not results:
|
|
453
|
+
results = _ddg_web(query, count)
|
|
454
|
+
|
|
455
|
+
if enrich and results:
|
|
456
|
+
# Enrich only for web-page items
|
|
371
457
|
results = _enrich_web_results_with_meta(results)
|
|
458
|
+
|
|
372
459
|
if not results:
|
|
373
460
|
return json.dumps({"status": "No relevant results found."})
|
|
461
|
+
|
|
374
462
|
return json.dumps(results, indent=2)
|
|
375
463
|
except Exception as exc:
|
|
376
464
|
return json.dumps({"error": f"Web search failed: {str(exc)}"})
|
|
377
465
|
|
|
378
466
|
|
|
379
|
-
|
|
467
|
+
def _make_image_session():
|
|
380
468
|
try:
|
|
381
|
-
|
|
469
|
+
s = requests.Session()
|
|
470
|
+
s.headers.update({
|
|
471
|
+
"User-Agent": USER_AGENT,
|
|
472
|
+
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
|
|
473
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
474
|
+
"Cache-Control": "no-cache",
|
|
475
|
+
})
|
|
476
|
+
return s
|
|
477
|
+
except Exception:
|
|
478
|
+
return None
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
def _is_downloadable_image(url: str, session=None, timeout: int = 15) -> bool:
|
|
482
|
+
if not url:
|
|
483
|
+
return False
|
|
484
|
+
try:
|
|
485
|
+
s = session or _make_image_session()
|
|
486
|
+
except Exception:
|
|
487
|
+
s = None
|
|
488
|
+
try:
|
|
489
|
+
if not s:
|
|
490
|
+
s = requests
|
|
491
|
+
r = s.get(url, stream=True, timeout=timeout, allow_redirects=True)
|
|
492
|
+
ct = (r.headers.get("content-type") or "").lower()
|
|
493
|
+
if r.status_code != 200:
|
|
494
|
+
return False
|
|
495
|
+
if ct.startswith("image/"):
|
|
496
|
+
return True
|
|
497
|
+
# Peek first bytes for magic
|
|
498
|
+
try:
|
|
499
|
+
first_chunk = next(r.iter_content(2048), b"")
|
|
500
|
+
except Exception:
|
|
501
|
+
first_chunk = b""
|
|
502
|
+
if first_chunk:
|
|
503
|
+
sigs = [b"\x89PNG\r\n\x1a\n", b"\xff\xd8\xff", b"GIF87a", b"GIF89a", b"RIFF"]
|
|
504
|
+
return any(first_chunk.startswith(sig) for sig in sigs)
|
|
505
|
+
return False
|
|
506
|
+
except Exception:
|
|
507
|
+
return False
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
async def image_search(query: str, count: int = 25, verify_download: bool = True, required_terms: Optional[List[str]] = None, allowed_domains: Optional[List[str]] = None, strict_entity: bool = False) -> str:
|
|
511
|
+
try:
|
|
512
|
+
# Simple query variants - avoid over-expanding which dilutes results
|
|
513
|
+
def generate_query_variants(q: str) -> List[str]:
|
|
514
|
+
base = q.strip()
|
|
515
|
+
# Only add ONE quality variant to keep results focused
|
|
516
|
+
variants = [
|
|
517
|
+
base, # Primary: exact query
|
|
518
|
+
f"{base} hd", # Secondary: just add quality term
|
|
519
|
+
]
|
|
520
|
+
return variants
|
|
521
|
+
|
|
522
|
+
results: List[Dict[str, Any]] = []
|
|
523
|
+
# Prefer Google CSE when configured; try multiple high-quality variants in parallel
|
|
524
|
+
if _has_google_cse_env():
|
|
525
|
+
try:
|
|
526
|
+
logging.info(f"[image_search] Using Google CSE for query: {query}")
|
|
527
|
+
variants = generate_query_variants(query)[:2] # cap to 2 calls to stay focused
|
|
528
|
+
params_base = {
|
|
529
|
+
"num": max(1, min(count, 10)),
|
|
530
|
+
"searchType": "image",
|
|
531
|
+
# No strict size/type constraints; let ranking + verification decide
|
|
532
|
+
"safe": "active",
|
|
533
|
+
}
|
|
534
|
+
tasks = [
|
|
535
|
+
google_cse_search(text=v, parameters=dict(params_base))
|
|
536
|
+
for v in variants
|
|
537
|
+
]
|
|
538
|
+
raws = await asyncio.gather(*tasks)
|
|
539
|
+
merged: List[Dict[str, Any]] = []
|
|
540
|
+
for raw in raws:
|
|
541
|
+
try:
|
|
542
|
+
data = json.loads(raw) if raw else {}
|
|
543
|
+
merged.extend(_normalize_cse_image_results(data))
|
|
544
|
+
except Exception:
|
|
545
|
+
continue
|
|
546
|
+
results = merged
|
|
547
|
+
logging.info(f"[image_search] Google CSE returned {len(results)} raw results")
|
|
548
|
+
except Exception as e:
|
|
549
|
+
logging.warning(f"[image_search] Google CSE failed: {e}")
|
|
550
|
+
results = []
|
|
551
|
+
|
|
382
552
|
if not results:
|
|
553
|
+
# Fallback to DuckDuckGo HTML scraping if CSE disabled or empty
|
|
554
|
+
try:
|
|
555
|
+
logging.info(f"[image_search] Falling back to DuckDuckGo HTML scraping for query: {query}")
|
|
556
|
+
results = _ddg_images_html(query, count)
|
|
557
|
+
logging.info(f"[image_search] DuckDuckGo HTML returned {len(results)} results")
|
|
558
|
+
except Exception as e:
|
|
559
|
+
logging.error(f"[image_search] All methods failed for query '{query}': {e}")
|
|
560
|
+
results = []
|
|
561
|
+
|
|
562
|
+
# Post-filtering and ranking for relevance and quality
|
|
563
|
+
def score(item: Dict[str, Any]) -> int:
|
|
564
|
+
s = 0
|
|
565
|
+
title = (item.get("title") or "").lower()
|
|
566
|
+
url = (item.get("url") or "").lower()
|
|
567
|
+
host = (item.get("host_page_url") or "").lower()
|
|
568
|
+
|
|
569
|
+
# CRITICAL: Strong relevance check - ensure query terms are present
|
|
570
|
+
q_terms = set(re.findall(r"\w+", query.lower()))
|
|
571
|
+
t_terms = set(re.findall(r"\w+", title))
|
|
572
|
+
u_terms = set(re.findall(r"\w+", url))
|
|
573
|
+
|
|
574
|
+
# Primary relevance: query terms in title (highest weight)
|
|
575
|
+
title_overlap = len(q_terms & t_terms)
|
|
576
|
+
s += 10 * title_overlap # Increased from 3 to 10
|
|
577
|
+
|
|
578
|
+
# Secondary relevance: query terms in URL
|
|
579
|
+
url_overlap = len(q_terms & u_terms)
|
|
580
|
+
s += 5 * url_overlap
|
|
581
|
+
|
|
582
|
+
# CRITICAL: If NO query terms match title or URL, heavily penalize
|
|
583
|
+
if title_overlap == 0 and url_overlap == 0:
|
|
584
|
+
s -= 100 # This image is likely completely unrelated
|
|
585
|
+
|
|
586
|
+
# Quality signals
|
|
587
|
+
for ext, ext_score in ((".png", 2), (".jpg", 2), (".jpeg", 2), (".webp", 2), (".svg", 0), (".gif", 0)):
|
|
588
|
+
if url.endswith(ext) or (item.get("original_url") or "").lower().endswith(ext):
|
|
589
|
+
s += ext_score
|
|
590
|
+
break
|
|
591
|
+
|
|
592
|
+
# Penalize low-quality/irrelevant assets (stronger penalties)
|
|
593
|
+
negative_tokens = ["sprite", "icon", "thumbnail", "thumb", "small", "mini", "logo", "watermark", "stock", "avatar", "emoji"]
|
|
594
|
+
s -= 3 * sum(1 for tok in negative_tokens if tok in url) # Increased penalty
|
|
595
|
+
|
|
596
|
+
# Reward quality descriptors
|
|
597
|
+
positive_tokens = ["official", "press", "hd", "4k", "wallpaper", "hero", "high-res", "highres"]
|
|
598
|
+
s += 2 * sum(1 for tok in positive_tokens if tok in title or tok in url)
|
|
599
|
+
|
|
600
|
+
# Reward larger dimensions
|
|
601
|
+
try:
|
|
602
|
+
w = int(item.get("width") or 0)
|
|
603
|
+
h = int(item.get("height") or 0)
|
|
604
|
+
area = w * h
|
|
605
|
+
if area >= 1920 * 1080: # Full HD or larger
|
|
606
|
+
s += 8
|
|
607
|
+
elif area >= 1280 * 720: # HD
|
|
608
|
+
s += 5
|
|
609
|
+
elif area >= 800 * 600: # Decent size
|
|
610
|
+
s += 2
|
|
611
|
+
elif area > 0 and area < 400 * 400: # Too small
|
|
612
|
+
s -= 5
|
|
613
|
+
except Exception:
|
|
614
|
+
pass
|
|
615
|
+
|
|
616
|
+
# Penalize thumbnails
|
|
617
|
+
if ("thumb" in url or "thumbnail" in url) and not item.get("original_url"):
|
|
618
|
+
s -= 5
|
|
619
|
+
|
|
620
|
+
return s
|
|
621
|
+
|
|
622
|
+
# Optional entity/term/domain constraints
|
|
623
|
+
def hostname(url: Optional[str]) -> Optional[str]:
|
|
624
|
+
try:
|
|
625
|
+
from urllib.parse import urlparse
|
|
626
|
+
return urlparse(url).hostname if url else None
|
|
627
|
+
except Exception:
|
|
628
|
+
return None
|
|
629
|
+
|
|
630
|
+
q_terms = set(re.findall(r"\w+", query.lower()))
|
|
631
|
+
req_terms = set((required_terms or []))
|
|
632
|
+
|
|
633
|
+
filtered1: List[Dict[str, Any]] = []
|
|
634
|
+
for it in results:
|
|
635
|
+
try:
|
|
636
|
+
ttl = (it.get("title") or "")
|
|
637
|
+
tset = set(re.findall(r"\w+", ttl.lower()))
|
|
638
|
+
host = hostname(it.get("host_page_url") or it.get("url")) or ""
|
|
639
|
+
if allowed_domains and not any(d.lower() in host.lower() for d in allowed_domains):
|
|
640
|
+
continue
|
|
641
|
+
if req_terms and not req_terms.issubset(tset):
|
|
642
|
+
# If strict mode, skip; otherwise allow but lower score later
|
|
643
|
+
if strict_entity:
|
|
644
|
+
continue
|
|
645
|
+
it = dict(it)
|
|
646
|
+
it["_missing_required_terms"] = True
|
|
647
|
+
# CRITICAL: Always require at least one query term in title OR URL (even without strict mode)
|
|
648
|
+
url_terms = set(re.findall(r"\w+", (it.get("url") or "").lower()))
|
|
649
|
+
has_match = bool(q_terms & tset) or bool(q_terms & url_terms)
|
|
650
|
+
|
|
651
|
+
if strict_entity and not has_match:
|
|
652
|
+
continue
|
|
653
|
+
elif not strict_entity and not has_match:
|
|
654
|
+
# Even in non-strict mode, skip images with ZERO query term matches
|
|
655
|
+
# This prevents completely unrelated images
|
|
656
|
+
continue
|
|
657
|
+
|
|
658
|
+
filtered1.append(it)
|
|
659
|
+
except Exception:
|
|
660
|
+
continue
|
|
661
|
+
|
|
662
|
+
# De-duplicate by original_url or url
|
|
663
|
+
seen = set()
|
|
664
|
+
deduped: List[Dict[str, Any]] = []
|
|
665
|
+
for it in filtered1:
|
|
666
|
+
key = it.get("original_url") or it.get("url")
|
|
667
|
+
if not key or key in seen:
|
|
668
|
+
continue
|
|
669
|
+
seen.add(key)
|
|
670
|
+
deduped.append(it)
|
|
671
|
+
|
|
672
|
+
# Penalize items missing required terms if not strict
|
|
673
|
+
def score_with_penalty(it: Dict[str, Any]) -> int:
|
|
674
|
+
base = score(it)
|
|
675
|
+
if it.get("_missing_required_terms"):
|
|
676
|
+
base -= 5
|
|
677
|
+
return base
|
|
678
|
+
|
|
679
|
+
deduped.sort(key=score_with_penalty, reverse=True)
|
|
680
|
+
|
|
681
|
+
# Filter out images with low scores (unrelated or poor quality)
|
|
682
|
+
# Minimum score threshold: at least some query term overlap is required
|
|
683
|
+
MIN_RELEVANCE_SCORE = 5 # Ensures at least one query term match + some quality
|
|
684
|
+
deduped = [it for it in deduped if score_with_penalty(it) >= MIN_RELEVANCE_SCORE]
|
|
685
|
+
|
|
686
|
+
# Optionally verify downloadability and pick top working images
|
|
687
|
+
if verify_download:
|
|
688
|
+
session = _make_image_session()
|
|
689
|
+
accepted: List[Dict[str, Any]] = []
|
|
690
|
+
for it in deduped:
|
|
691
|
+
if len(accepted) >= count:
|
|
692
|
+
break
|
|
693
|
+
test_url = it.get("original_url") or it.get("url")
|
|
694
|
+
if _is_downloadable_image(test_url, session=session):
|
|
695
|
+
accepted.append(it)
|
|
696
|
+
deduped = accepted
|
|
697
|
+
|
|
698
|
+
deduped = deduped[:count]
|
|
699
|
+
|
|
700
|
+
if not deduped:
|
|
383
701
|
return json.dumps({"status": "No relevant results found."})
|
|
384
|
-
return json.dumps(
|
|
702
|
+
return json.dumps(deduped, indent=2)
|
|
385
703
|
except Exception as exc:
|
|
386
704
|
return json.dumps({"error": f"Image search failed: {str(exc)}"})
|
|
387
705
|
|
|
388
706
|
|
|
389
707
|
async def combined_search(query: str, count: int = 25, enrich: bool = True) -> str:
|
|
390
708
|
try:
|
|
391
|
-
web_task = _ddg_web(query, count)
|
|
392
|
-
if enrich:
|
|
393
|
-
web_task = _enrich_web_results_with_meta(web_task)
|
|
394
|
-
img_task = _ddg_images(query, count)
|
|
395
709
|
combined: List[Dict[str, Any]] = []
|
|
396
|
-
|
|
397
|
-
|
|
710
|
+
# Prefer Google for both, with fallback to DDG
|
|
711
|
+
web_results: List[Dict[str, Any]] = []
|
|
712
|
+
img_results: List[Dict[str, Any]] = []
|
|
713
|
+
|
|
714
|
+
if _has_google_cse_env():
|
|
715
|
+
try:
|
|
716
|
+
raw_web = await google_cse_search(text=query, parameters={"num": max(1, min(count, 10))})
|
|
717
|
+
data_web = json.loads(raw_web) if raw_web else {}
|
|
718
|
+
web_results = _normalize_cse_web_results(data_web)
|
|
719
|
+
except Exception:
|
|
720
|
+
web_results = []
|
|
721
|
+
try:
|
|
722
|
+
raw_img = await google_cse_search(text=query, parameters={"num": max(1, min(count, 10)), "searchType": "image"})
|
|
723
|
+
data_img = json.loads(raw_img) if raw_img else {}
|
|
724
|
+
img_results = _normalize_cse_image_results(data_img)
|
|
725
|
+
except Exception:
|
|
726
|
+
img_results = []
|
|
727
|
+
|
|
728
|
+
if not web_results:
|
|
729
|
+
web_results = _ddg_web(query, count)
|
|
730
|
+
if enrich and web_results:
|
|
731
|
+
web_results = _enrich_web_results_with_meta(web_results)
|
|
732
|
+
if not img_results:
|
|
733
|
+
img_results = _ddg_images_html(query, count)
|
|
734
|
+
|
|
735
|
+
combined.extend(web_results)
|
|
736
|
+
combined.extend(img_results)
|
|
398
737
|
if not combined:
|
|
399
738
|
return json.dumps({"status": "No relevant results found."})
|
|
400
739
|
return json.dumps(combined, indent=2)
|
|
@@ -408,6 +747,11 @@ async def collect_task_images(
|
|
|
408
747
|
allowed_domains: Optional[List[str]] = None,
|
|
409
748
|
verify_download: bool = True,
|
|
410
749
|
work_dir: Optional[str] = None,
|
|
750
|
+
required_terms: Optional[List[str]] = None,
|
|
751
|
+
strict_entity: bool = False,
|
|
752
|
+
min_width: int = 0,
|
|
753
|
+
min_height: int = 0,
|
|
754
|
+
dedup_content: bool = True,
|
|
411
755
|
) -> str:
|
|
412
756
|
"""
|
|
413
757
|
Search for task-relevant images, optionally filter by allowed domains, download locally,
|
|
@@ -421,8 +765,8 @@ async def collect_task_images(
|
|
|
421
765
|
- work_dir: directory to save files; defaults to current working directory
|
|
422
766
|
"""
|
|
423
767
|
try:
|
|
424
|
-
# Step 1: search many to have selection headroom
|
|
425
|
-
raw_json = await image_search(query, count=max(count * 3, count))
|
|
768
|
+
# Step 1: search many to have selection headroom; disable double verification here
|
|
769
|
+
raw_json = await image_search(query, count=max(count * 3, count), verify_download=False, required_terms=required_terms, allowed_domains=allowed_domains, strict_entity=strict_entity)
|
|
426
770
|
parsed = json.loads(raw_json) if raw_json else []
|
|
427
771
|
# Normalize parsed results to a list of dicts; handle dict status payloads gracefully
|
|
428
772
|
if isinstance(parsed, dict):
|
|
@@ -433,6 +777,17 @@ async def collect_task_images(
|
|
|
433
777
|
else:
|
|
434
778
|
results = []
|
|
435
779
|
|
|
780
|
+
# Allow default domains from env if not provided
|
|
781
|
+
if not allowed_domains:
|
|
782
|
+
try:
|
|
783
|
+
env_domains = os.getenv("IMAGE_ALLOWED_DOMAINS")
|
|
784
|
+
if env_domains:
|
|
785
|
+
allowed_domains = [d.strip() for d in env_domains.split(",") if d.strip()]
|
|
786
|
+
except Exception:
|
|
787
|
+
allowed_domains = None
|
|
788
|
+
|
|
789
|
+
logging.info(f"[collect_task_images] Found {len(results)} raw results before filtering. allowed_domains={allowed_domains}, required_terms={required_terms}, strict_entity={strict_entity}")
|
|
790
|
+
|
|
436
791
|
# Step 2: relevance filter by domain and title match
|
|
437
792
|
def hostname(url: Optional[str]) -> Optional[str]:
|
|
438
793
|
try:
|
|
@@ -442,22 +797,35 @@ async def collect_task_images(
|
|
|
442
797
|
return None
|
|
443
798
|
|
|
444
799
|
query_terms = set(re.findall(r"\w+", query.lower()))
|
|
800
|
+
req_terms_lower = set(t.lower() for t in (required_terms or []))
|
|
445
801
|
filtered: List[Dict[str, Any]] = []
|
|
446
802
|
for it in results:
|
|
447
803
|
if not isinstance(it, dict):
|
|
448
804
|
continue
|
|
449
805
|
host = hostname(it.get("host_page_url") or it.get("url")) or ""
|
|
806
|
+
# Domain filter (if specified)
|
|
450
807
|
if allowed_domains:
|
|
451
808
|
if not any(d.lower() in (host or "").lower() for d in allowed_domains):
|
|
809
|
+
logging.debug(f"[collect_task_images] Skipped (domain mismatch): host={host}, allowed={allowed_domains}")
|
|
452
810
|
continue
|
|
453
811
|
title = (it.get("title") or "").lower()
|
|
812
|
+
url = (it.get("url") or "").lower()
|
|
454
813
|
title_terms = set(re.findall(r"\w+", title))
|
|
814
|
+
url_terms = set(re.findall(r"\w+", url))
|
|
815
|
+
# Enforce strict entity/required terms if requested (check both title AND url)
|
|
816
|
+
if strict_entity and req_terms_lower:
|
|
817
|
+
combined_terms = title_terms | url_terms
|
|
818
|
+
if not req_terms_lower.issubset(combined_terms):
|
|
819
|
+
logging.debug(f"[collect_task_images] Skipped (strict_entity): required={req_terms_lower}, found={combined_terms}")
|
|
820
|
+
continue
|
|
455
821
|
overlap = len(query_terms & title_terms)
|
|
456
822
|
it_copy = dict(it)
|
|
457
823
|
it_copy["_rank"] = overlap
|
|
458
824
|
it_copy["_host"] = host
|
|
459
825
|
filtered.append(it_copy)
|
|
460
826
|
|
|
827
|
+
logging.info(f"[collect_task_images] {len(filtered)} results after domain/entity filtering")
|
|
828
|
+
|
|
461
829
|
# Rank by overlap desc, then presence of host_page_url
|
|
462
830
|
filtered.sort(key=lambda x: (x.get("_rank", 0), bool(x.get("host_page_url"))), reverse=True)
|
|
463
831
|
|
|
@@ -494,10 +862,12 @@ async def collect_task_images(
|
|
|
494
862
|
return False
|
|
495
863
|
|
|
496
864
|
used = 0
|
|
865
|
+
seen_hashes: set = set()
|
|
497
866
|
for it in filtered:
|
|
498
867
|
if used >= count:
|
|
499
868
|
break
|
|
500
|
-
|
|
869
|
+
# Prefer original_url if available
|
|
870
|
+
img_url = it.get("original_url") or it.get("url")
|
|
501
871
|
if not img_url:
|
|
502
872
|
skipped.append({"reason": "missing_url", "item": it})
|
|
503
873
|
continue
|
|
@@ -505,9 +875,11 @@ async def collect_task_images(
|
|
|
505
875
|
skipped.append({"reason": "verify_failed", "url": img_url})
|
|
506
876
|
continue
|
|
507
877
|
|
|
508
|
-
#
|
|
878
|
+
# Determine extension; if SVG, download as .svg then convert to PNG
|
|
509
879
|
base = re.sub(r"[^a-zA-Z0-9_-]+", "_", (it.get("title") or "image").strip())[:80] or "image"
|
|
510
|
-
|
|
880
|
+
url_lower = (img_url or "").lower()
|
|
881
|
+
is_svg = url_lower.endswith(".svg") or ".svg" in url_lower
|
|
882
|
+
filename = f"{base}_{used+1}.svg" if is_svg else f"{base}_{used+1}.jpg"
|
|
511
883
|
dl_json = await download_image(img_url, filename, work_dir)
|
|
512
884
|
dl = json.loads(dl_json)
|
|
513
885
|
if dl.get("status") != "success":
|
|
@@ -515,6 +887,57 @@ async def collect_task_images(
|
|
|
515
887
|
continue
|
|
516
888
|
|
|
517
889
|
file_path = dl.get("file_path")
|
|
890
|
+
# If SVG, convert to PNG for PIL compatibility
|
|
891
|
+
if is_svg and file_path and os.path.exists(file_path):
|
|
892
|
+
try:
|
|
893
|
+
import cairosvg # type: ignore
|
|
894
|
+
png_path = os.path.splitext(file_path)[0] + ".png"
|
|
895
|
+
cairosvg.svg2png(url=file_path, write_to=png_path)
|
|
896
|
+
try:
|
|
897
|
+
os.remove(file_path)
|
|
898
|
+
except Exception:
|
|
899
|
+
pass
|
|
900
|
+
file_path = png_path
|
|
901
|
+
except Exception as e:
|
|
902
|
+
skipped.append({"reason": "svg_convert_failed", "url": img_url, "error": str(e)})
|
|
903
|
+
try:
|
|
904
|
+
os.remove(file_path)
|
|
905
|
+
except Exception:
|
|
906
|
+
pass
|
|
907
|
+
continue
|
|
908
|
+
# Optional dimension filter
|
|
909
|
+
try:
|
|
910
|
+
if (min_width or min_height) and file_path and os.path.exists(file_path):
|
|
911
|
+
with Image.open(file_path) as im:
|
|
912
|
+
w, h = im.size
|
|
913
|
+
if (min_width and w < min_width) or (min_height and h < min_height):
|
|
914
|
+
skipped.append({"reason": "too_small", "url": img_url, "width": w, "height": h})
|
|
915
|
+
try:
|
|
916
|
+
os.remove(file_path)
|
|
917
|
+
except Exception:
|
|
918
|
+
pass
|
|
919
|
+
continue
|
|
920
|
+
except Exception:
|
|
921
|
+
pass
|
|
922
|
+
|
|
923
|
+
# Optional content deduplication by hash
|
|
924
|
+
try:
|
|
925
|
+
if dedup_content and file_path and os.path.exists(file_path):
|
|
926
|
+
hasher = hashlib.sha256()
|
|
927
|
+
with open(file_path, "rb") as fh:
|
|
928
|
+
for chunk in iter(lambda: fh.read(1024 * 1024), b""):
|
|
929
|
+
hasher.update(chunk)
|
|
930
|
+
digest = hasher.hexdigest()
|
|
931
|
+
if digest in seen_hashes:
|
|
932
|
+
skipped.append({"reason": "content_duplicate", "url": img_url})
|
|
933
|
+
try:
|
|
934
|
+
os.remove(file_path)
|
|
935
|
+
except Exception:
|
|
936
|
+
pass
|
|
937
|
+
continue
|
|
938
|
+
seen_hashes.add(digest)
|
|
939
|
+
except Exception:
|
|
940
|
+
pass
|
|
518
941
|
# Upload if configured, else mark as local only
|
|
519
942
|
azure_conn = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
|
|
520
943
|
if azure_conn:
|
|
@@ -561,6 +984,140 @@ async def collect_task_images(
|
|
|
561
984
|
return json.dumps({"error": f"collect_task_images failed: {str(exc)}"})
|
|
562
985
|
|
|
563
986
|
|
|
987
|
+
async def collect_images_by_pattern(
    base_url: str,
    filename_pattern: str,
    start: int,
    end: int,
    zpad: int = 0,
    ext: str = "png",
    work_dir: Optional[str] = None,
) -> str:
    """
    Download a numbered series of images whose URLs follow a predictable pattern.

    For each index in [start, end], the URL is built as::

        urljoin(base_url, filename_pattern.format(i=idx) + "." + ext)

    where ``idx`` is the index zero-padded to ``zpad`` digits when ``zpad > 0``.
    Example: base_url="https://example.com/images/", filename_pattern="img_{i}",
    start=1, end=10 fetches img_1.png ... img_10.png.

    Returns a JSON string with the downloaded local file paths and any per-URL errors.
    """
    try:
        target_dir = work_dir or os.getcwd()
        os.makedirs(target_dir, exist_ok=True)

        from .file_tools import download_image

        clean_ext = ext.lstrip(".")
        ok: List[Dict[str, Any]] = []
        failed: List[Dict[str, Any]] = []

        for i in range(start, end + 1):
            try:
                # Zero-pad the index only when requested.
                idx = str(i).zfill(zpad) if zpad > 0 else str(i)
                stem = filename_pattern.format(i=idx)
                url = urljoin(base_url, f"{stem}.{clean_ext}")
                # Sanitize the stem so it is a filesystem-safe local name.
                local_name = re.sub(r"[^a-zA-Z0-9_-]+", "_", stem) + f".{clean_ext}"
                result = json.loads(await download_image(url, local_name, target_dir))
                if result.get("status") == "success":
                    ok.append({"url": url, "file_path": result.get("file_path")})
                else:
                    failed.append({"url": url, "error": result})
            except Exception as e:
                # Best-effort: one bad index must not abort the whole series.
                failed.append({"i": i, "error": str(e)})

        summary = {
            "base_url": base_url,
            "pattern": filename_pattern,
            "range": [start, end],
            "ext": ext,
            "downloaded": ok,
            "errors": failed,
        }
        return json.dumps(summary, indent=2)
    except Exception as exc:
        return json.dumps({"error": f"collect_images_by_pattern failed: {str(exc)}"})
|
|
1039
|
+
|
|
1040
|
+
|
|
1041
|
+
async def scrape_image_gallery(
    page_url: str,
    css_selector: Optional[str] = None,
    attribute: str = "src",
    max_images: int = 100,
    work_dir: Optional[str] = None,
) -> str:
    """
    Scrape a gallery page to collect image URLs and download them.

    Resolution order:
      1. If ``css_selector`` is given, read ``attribute`` (src or href) from each
         matching element (requires BeautifulSoup; falls through on failure).
      2. Otherwise — or if the selector matched nothing — fall back to regex
         extraction of ``<img src=...>`` values plus ``<a href=...>`` values with
         image-like extensions.

    URLs are resolved against the final (post-redirect) page URL, deduplicated
    in order of first appearance, and capped at ``max_images``.

    Returns a JSON string with downloaded file paths and skipped entries.
    """
    try:
        headers = {"User-Agent": USER_AGENT}
        r = requests.get(page_url, headers=headers, timeout=20)
        r.raise_for_status()
        html = r.text
        # Use the response URL (after redirects) as the base for relative links.
        base = str(r.url or page_url)

        urls: List[str] = []
        if css_selector:
            try:
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(html, "html.parser")
                for el in soup.select(css_selector):
                    u = el.get(attribute)
                    if isinstance(u, str):
                        urls.append(urljoin(base, u))
            except Exception as e:
                # Best-effort: record why and fall through to the regex path.
                logging.debug(f"[scrape_image_gallery] selector parse failed: {e}")
        if not urls:
            # Fallback: parse common <img> and <a href> patterns.
            # Accept both single- and double-quoted attribute values.
            for m in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, flags=re.I):
                urls.append(urljoin(base, html_lib.unescape(m.group(1))))
            for m in re.finditer(r'<a[^>]+href=["\']([^"\']+)["\']', html, flags=re.I):
                href = html_lib.unescape(m.group(1))
                if re.search(r'\.(?:png|jpg|jpeg|webp|gif)(?:\?|$)', href, flags=re.I):
                    urls.append(urljoin(base, href))

        # Deduplicate while preserving first-seen order (dicts are ordered).
        clean_urls = list(dict.fromkeys(urls))[:max_images]

        if not work_dir:
            work_dir = os.getcwd()
        os.makedirs(work_dir, exist_ok=True)

        from .file_tools import download_image
        downloaded: List[Dict[str, Any]] = []
        skipped: List[Dict[str, Any]] = []
        for idx, u in enumerate(clean_urls, 1):
            try:
                # Derive the local extension; only trust known image extensions
                # so e.g. ".html" or ".aspx" URLs don't produce bogus suffixes.
                ext_match = re.search(r'\.(png|jpe?g|webp|gif|bmp|tiff?)(?:\?|$)', u, flags=re.I)
                ext = ext_match.group(1).lower() if ext_match else "jpg"
                fname = f"gallery_{idx}.{ext}"
                dl_json = await download_image(u, fname, work_dir)
                dl = json.loads(dl_json)
                if dl.get("status") == "success":
                    downloaded.append({"url": u, "file_path": dl.get("file_path")})
                else:
                    skipped.append({"url": u, "error": dl})
            except Exception as e:
                skipped.append({"url": u, "error": str(e)})

        return json.dumps({
            "page_url": page_url,
            "collected": len(downloaded),
            "downloaded": downloaded,
            "skipped": skipped,
        }, indent=2)
    except Exception as exc:
        return json.dumps({"error": f"scrape_image_gallery failed: {str(exc)}"})
|
|
1119
|
+
|
|
1120
|
+
|
|
564
1121
|
async def _perform_single_cognitive_search(
|
|
565
1122
|
query: str = "*",
|
|
566
1123
|
index_name: str = "indexwires",
|