@aj-archipelago/cortex 1.3.62 → 1.3.63

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211) hide show
  1. package/.github/workflows/cortex-file-handler-test.yml +61 -0
  2. package/README.md +31 -7
  3. package/config/default.example.json +15 -0
  4. package/config.js +133 -12
  5. package/helper-apps/cortex-autogen2/DigiCertGlobalRootCA.crt.pem +22 -0
  6. package/helper-apps/cortex-autogen2/Dockerfile +31 -0
  7. package/helper-apps/cortex-autogen2/Dockerfile.worker +41 -0
  8. package/helper-apps/cortex-autogen2/README.md +183 -0
  9. package/helper-apps/cortex-autogen2/__init__.py +1 -0
  10. package/helper-apps/cortex-autogen2/agents.py +131 -0
  11. package/helper-apps/cortex-autogen2/docker-compose.yml +20 -0
  12. package/helper-apps/cortex-autogen2/function_app.py +55 -0
  13. package/helper-apps/cortex-autogen2/host.json +15 -0
  14. package/helper-apps/cortex-autogen2/main.py +126 -0
  15. package/helper-apps/cortex-autogen2/poetry.lock +3652 -0
  16. package/helper-apps/cortex-autogen2/pyproject.toml +36 -0
  17. package/helper-apps/cortex-autogen2/requirements.txt +20 -0
  18. package/helper-apps/cortex-autogen2/send_task.py +105 -0
  19. package/helper-apps/cortex-autogen2/services/__init__.py +1 -0
  20. package/helper-apps/cortex-autogen2/services/azure_queue.py +85 -0
  21. package/helper-apps/cortex-autogen2/services/redis_publisher.py +153 -0
  22. package/helper-apps/cortex-autogen2/task_processor.py +488 -0
  23. package/helper-apps/cortex-autogen2/tools/__init__.py +24 -0
  24. package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +175 -0
  25. package/helper-apps/cortex-autogen2/tools/azure_foundry_agents.py +601 -0
  26. package/helper-apps/cortex-autogen2/tools/coding_tools.py +72 -0
  27. package/helper-apps/cortex-autogen2/tools/download_tools.py +48 -0
  28. package/helper-apps/cortex-autogen2/tools/file_tools.py +545 -0
  29. package/helper-apps/cortex-autogen2/tools/search_tools.py +646 -0
  30. package/helper-apps/cortex-azure-cleaner/README.md +36 -0
  31. package/helper-apps/cortex-file-converter/README.md +93 -0
  32. package/helper-apps/cortex-file-converter/key_to_pdf.py +104 -0
  33. package/helper-apps/cortex-file-converter/list_blob_extensions.py +89 -0
  34. package/helper-apps/cortex-file-converter/process_azure_keynotes.py +181 -0
  35. package/helper-apps/cortex-file-converter/requirements.txt +1 -0
  36. package/helper-apps/cortex-file-handler/.env.test.azure.ci +7 -0
  37. package/helper-apps/cortex-file-handler/.env.test.azure.sample +1 -1
  38. package/helper-apps/cortex-file-handler/.env.test.gcs.ci +10 -0
  39. package/helper-apps/cortex-file-handler/.env.test.gcs.sample +2 -2
  40. package/helper-apps/cortex-file-handler/INTERFACE.md +41 -0
  41. package/helper-apps/cortex-file-handler/package.json +1 -1
  42. package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +41 -17
  43. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +30 -15
  44. package/helper-apps/cortex-file-handler/scripts/test-azure.sh +32 -6
  45. package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +24 -2
  46. package/helper-apps/cortex-file-handler/scripts/validate-env.js +128 -0
  47. package/helper-apps/cortex-file-handler/src/blobHandler.js +161 -51
  48. package/helper-apps/cortex-file-handler/src/constants.js +3 -0
  49. package/helper-apps/cortex-file-handler/src/fileChunker.js +10 -8
  50. package/helper-apps/cortex-file-handler/src/index.js +116 -9
  51. package/helper-apps/cortex-file-handler/src/redis.js +61 -1
  52. package/helper-apps/cortex-file-handler/src/services/ConversionService.js +11 -8
  53. package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +2 -2
  54. package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +88 -6
  55. package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +58 -0
  56. package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +25 -5
  57. package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +9 -0
  58. package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +120 -16
  59. package/helper-apps/cortex-file-handler/src/start.js +27 -17
  60. package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +52 -1
  61. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +40 -0
  62. package/helper-apps/cortex-file-handler/tests/checkHashShortLived.test.js +553 -0
  63. package/helper-apps/cortex-file-handler/tests/cleanup.test.js +46 -52
  64. package/helper-apps/cortex-file-handler/tests/containerConversionFlow.test.js +451 -0
  65. package/helper-apps/cortex-file-handler/tests/containerNameParsing.test.js +229 -0
  66. package/helper-apps/cortex-file-handler/tests/containerParameterFlow.test.js +392 -0
  67. package/helper-apps/cortex-file-handler/tests/conversionResilience.test.js +7 -2
  68. package/helper-apps/cortex-file-handler/tests/deleteOperations.test.js +348 -0
  69. package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +23 -2
  70. package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +11 -5
  71. package/helper-apps/cortex-file-handler/tests/getOperations.test.js +58 -24
  72. package/helper-apps/cortex-file-handler/tests/postOperations.test.js +11 -4
  73. package/helper-apps/cortex-file-handler/tests/shortLivedUrlConversion.test.js +225 -0
  74. package/helper-apps/cortex-file-handler/tests/start.test.js +8 -12
  75. package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +80 -0
  76. package/helper-apps/cortex-file-handler/tests/storage/StorageService.test.js +388 -22
  77. package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +74 -0
  78. package/lib/cortexResponse.js +153 -0
  79. package/lib/entityConstants.js +21 -3
  80. package/lib/logger.js +21 -4
  81. package/lib/pathwayTools.js +28 -9
  82. package/lib/util.js +49 -0
  83. package/package.json +1 -1
  84. package/pathways/basePathway.js +1 -0
  85. package/pathways/bing_afagent.js +54 -1
  86. package/pathways/call_tools.js +2 -3
  87. package/pathways/chat_jarvis.js +1 -1
  88. package/pathways/google_cse.js +27 -0
  89. package/pathways/grok_live_search.js +18 -0
  90. package/pathways/system/entity/memory/sys_memory_lookup_required.js +1 -0
  91. package/pathways/system/entity/memory/sys_memory_required.js +1 -0
  92. package/pathways/system/entity/memory/sys_search_memory.js +1 -0
  93. package/pathways/system/entity/sys_entity_agent.js +56 -4
  94. package/pathways/system/entity/sys_generator_quick.js +1 -0
  95. package/pathways/system/entity/tools/sys_tool_bing_search_afagent.js +26 -0
  96. package/pathways/system/entity/tools/sys_tool_google_search.js +141 -0
  97. package/pathways/system/entity/tools/sys_tool_grok_x_search.js +237 -0
  98. package/pathways/system/entity/tools/sys_tool_image.js +1 -1
  99. package/pathways/system/rest_streaming/sys_claude_37_sonnet.js +21 -0
  100. package/pathways/system/rest_streaming/sys_claude_41_opus.js +21 -0
  101. package/pathways/system/rest_streaming/sys_claude_4_sonnet.js +21 -0
  102. package/pathways/system/rest_streaming/sys_google_gemini_25_flash.js +25 -0
  103. package/pathways/system/rest_streaming/{sys_google_gemini_chat.js → sys_google_gemini_25_pro.js} +6 -4
  104. package/pathways/system/rest_streaming/sys_grok_4.js +23 -0
  105. package/pathways/system/rest_streaming/sys_grok_4_fast_non_reasoning.js +23 -0
  106. package/pathways/system/rest_streaming/sys_grok_4_fast_reasoning.js +23 -0
  107. package/pathways/system/rest_streaming/sys_openai_chat.js +3 -0
  108. package/pathways/system/rest_streaming/sys_openai_chat_gpt41.js +22 -0
  109. package/pathways/system/rest_streaming/sys_openai_chat_gpt41_mini.js +21 -0
  110. package/pathways/system/rest_streaming/sys_openai_chat_gpt41_nano.js +21 -0
  111. package/pathways/system/rest_streaming/{sys_claude_35_sonnet.js → sys_openai_chat_gpt4_omni.js} +6 -4
  112. package/pathways/system/rest_streaming/sys_openai_chat_gpt4_omni_mini.js +21 -0
  113. package/pathways/system/rest_streaming/{sys_claude_3_haiku.js → sys_openai_chat_gpt5.js} +7 -5
  114. package/pathways/system/rest_streaming/sys_openai_chat_gpt5_chat.js +21 -0
  115. package/pathways/system/rest_streaming/sys_openai_chat_gpt5_mini.js +21 -0
  116. package/pathways/system/rest_streaming/sys_openai_chat_gpt5_nano.js +21 -0
  117. package/pathways/system/rest_streaming/{sys_openai_chat_o1.js → sys_openai_chat_o3.js} +6 -3
  118. package/pathways/system/rest_streaming/sys_openai_chat_o3_mini.js +3 -0
  119. package/pathways/system/workspaces/run_workspace_prompt.js +99 -0
  120. package/pathways/vision.js +1 -1
  121. package/server/graphql.js +1 -1
  122. package/server/modelExecutor.js +8 -0
  123. package/server/pathwayResolver.js +166 -16
  124. package/server/pathwayResponseParser.js +16 -8
  125. package/server/plugins/azureFoundryAgentsPlugin.js +1 -1
  126. package/server/plugins/claude3VertexPlugin.js +193 -45
  127. package/server/plugins/gemini15ChatPlugin.js +21 -0
  128. package/server/plugins/gemini15VisionPlugin.js +360 -0
  129. package/server/plugins/googleCsePlugin.js +94 -0
  130. package/server/plugins/grokVisionPlugin.js +365 -0
  131. package/server/plugins/modelPlugin.js +3 -1
  132. package/server/plugins/openAiChatPlugin.js +106 -13
  133. package/server/plugins/openAiVisionPlugin.js +42 -30
  134. package/server/resolver.js +28 -4
  135. package/server/rest.js +270 -53
  136. package/server/typeDef.js +1 -0
  137. package/tests/{mocks.js → helpers/mocks.js} +5 -2
  138. package/tests/{server.js → helpers/server.js} +2 -2
  139. package/tests/helpers/sseAssert.js +23 -0
  140. package/tests/helpers/sseClient.js +73 -0
  141. package/tests/helpers/subscriptionAssert.js +11 -0
  142. package/tests/helpers/subscriptions.js +113 -0
  143. package/tests/{sublong.srt → integration/features/translate/sublong.srt} +4543 -4543
  144. package/tests/integration/features/translate/translate_chunking_stream.test.js +100 -0
  145. package/tests/{translate_srt.test.js → integration/features/translate/translate_srt.test.js} +2 -2
  146. package/tests/integration/graphql/async/stream/agentic.test.js +477 -0
  147. package/tests/integration/graphql/async/stream/subscription_streaming.test.js +62 -0
  148. package/tests/integration/graphql/async/stream/sys_entity_start_streaming.test.js +71 -0
  149. package/tests/integration/graphql/async/stream/vendors/claude_streaming.test.js +56 -0
  150. package/tests/integration/graphql/async/stream/vendors/gemini_streaming.test.js +66 -0
  151. package/tests/integration/graphql/async/stream/vendors/grok_streaming.test.js +56 -0
  152. package/tests/integration/graphql/async/stream/vendors/openai_streaming.test.js +72 -0
  153. package/tests/integration/graphql/features/google/sysToolGoogleSearch.test.js +96 -0
  154. package/tests/integration/graphql/features/grok/grok.test.js +688 -0
  155. package/tests/integration/graphql/features/grok/grok_x_search_tool.test.js +354 -0
  156. package/tests/{main.test.js → integration/graphql/features/main.test.js} +1 -1
  157. package/tests/{call_tools.test.js → integration/graphql/features/tools/call_tools.test.js} +2 -2
  158. package/tests/{vision.test.js → integration/graphql/features/vision/vision.test.js} +1 -1
  159. package/tests/integration/graphql/subscriptions/connection.test.js +26 -0
  160. package/tests/{openai_api.test.js → integration/rest/oai/openai_api.test.js} +63 -238
  161. package/tests/integration/rest/oai/tool_calling_api.test.js +343 -0
  162. package/tests/integration/rest/oai/tool_calling_streaming.test.js +85 -0
  163. package/tests/integration/rest/vendors/claude_streaming.test.js +47 -0
  164. package/tests/integration/rest/vendors/claude_tool_calling_streaming.test.js +75 -0
  165. package/tests/integration/rest/vendors/gemini_streaming.test.js +47 -0
  166. package/tests/integration/rest/vendors/gemini_tool_calling_streaming.test.js +75 -0
  167. package/tests/integration/rest/vendors/grok_streaming.test.js +55 -0
  168. package/tests/integration/rest/vendors/grok_tool_calling_streaming.test.js +75 -0
  169. package/tests/{azureAuthTokenHelper.test.js → unit/core/azureAuthTokenHelper.test.js} +1 -1
  170. package/tests/{chunkfunction.test.js → unit/core/chunkfunction.test.js} +2 -2
  171. package/tests/{config.test.js → unit/core/config.test.js} +3 -3
  172. package/tests/{encodeCache.test.js → unit/core/encodeCache.test.js} +1 -1
  173. package/tests/{fastLruCache.test.js → unit/core/fastLruCache.test.js} +1 -1
  174. package/tests/{handleBars.test.js → unit/core/handleBars.test.js} +1 -1
  175. package/tests/{memoryfunction.test.js → unit/core/memoryfunction.test.js} +2 -2
  176. package/tests/unit/core/mergeResolver.test.js +952 -0
  177. package/tests/{parser.test.js → unit/core/parser.test.js} +3 -3
  178. package/tests/unit/core/pathwayResolver.test.js +187 -0
  179. package/tests/{requestMonitor.test.js → unit/core/requestMonitor.test.js} +1 -1
  180. package/tests/{requestMonitorDurationEstimator.test.js → unit/core/requestMonitorDurationEstimator.test.js} +1 -1
  181. package/tests/{truncateMessages.test.js → unit/core/truncateMessages.test.js} +3 -3
  182. package/tests/{util.test.js → unit/core/util.test.js} +1 -1
  183. package/tests/{apptekTranslatePlugin.test.js → unit/plugins/apptekTranslatePlugin.test.js} +3 -3
  184. package/tests/{azureFoundryAgents.test.js → unit/plugins/azureFoundryAgents.test.js} +136 -1
  185. package/tests/{claude3VertexPlugin.test.js → unit/plugins/claude3VertexPlugin.test.js} +32 -10
  186. package/tests/{claude3VertexToolConversion.test.js → unit/plugins/claude3VertexToolConversion.test.js} +3 -3
  187. package/tests/unit/plugins/googleCsePlugin.test.js +111 -0
  188. package/tests/unit/plugins/grokVisionPlugin.test.js +1392 -0
  189. package/tests/{modelPlugin.test.js → unit/plugins/modelPlugin.test.js} +3 -3
  190. package/tests/{multimodal_conversion.test.js → unit/plugins/multimodal_conversion.test.js} +4 -4
  191. package/tests/{openAiChatPlugin.test.js → unit/plugins/openAiChatPlugin.test.js} +13 -4
  192. package/tests/{openAiToolPlugin.test.js → unit/plugins/openAiToolPlugin.test.js} +35 -27
  193. package/tests/{tokenHandlingTests.test.js → unit/plugins/tokenHandlingTests.test.js} +5 -5
  194. package/tests/{translate_apptek.test.js → unit/plugins/translate_apptek.test.js} +3 -3
  195. package/tests/{streaming.test.js → unit/plugins.streaming/plugin_stream_events.test.js} +19 -58
  196. package/helper-apps/mogrt-handler/tests/test-files/test.gif +0 -1
  197. package/helper-apps/mogrt-handler/tests/test-files/test.mogrt +0 -1
  198. package/helper-apps/mogrt-handler/tests/test-files/test.mp4 +0 -1
  199. package/pathways/system/rest_streaming/sys_openai_chat_gpt4.js +0 -19
  200. package/pathways/system/rest_streaming/sys_openai_chat_gpt4_32.js +0 -19
  201. package/pathways/system/rest_streaming/sys_openai_chat_gpt4_turbo.js +0 -19
  202. package/pathways/system/workspaces/run_claude35_sonnet.js +0 -21
  203. package/pathways/system/workspaces/run_claude3_haiku.js +0 -20
  204. package/pathways/system/workspaces/run_gpt35turbo.js +0 -20
  205. package/pathways/system/workspaces/run_gpt4.js +0 -20
  206. package/pathways/system/workspaces/run_gpt4_32.js +0 -20
  207. package/tests/agentic.test.js +0 -256
  208. package/tests/pathwayResolver.test.js +0 -78
  209. package/tests/subscription.test.js +0 -387
  210. /package/tests/{subchunk.srt → integration/features/translate/subchunk.srt} +0 -0
  211. /package/tests/{subhorizontal.srt → integration/features/translate/subhorizontal.srt} +0 -0
@@ -0,0 +1,646 @@
1
+ """
2
+ Web search tools (keyless).
3
+
4
+ Implements DuckDuckGo-based search without API keys:
5
+ - web_search: web results via HTML endpoint
6
+ - image_search: image results via i.js JSON (requires vqd token)
7
+ - combined_search: combined web + image results
8
+ """
9
+
10
+ import logging
11
+ import os
12
+ import requests
13
+ import json
14
+ from typing import Dict, Any, List, Optional
15
+ import asyncio # Import asyncio
16
+ import matplotlib.pyplot as plt
17
+ import pandas as pd
18
+ import re
19
+ import urllib.parse
20
+ import html as html_lib
21
+
22
+ # try:
23
+ # except ImportError:
24
+ # logging.warning("matplotlib.pyplot not found. Plotting functionality will be disabled.")
25
+ # plt = None
26
+
27
+ # try:
28
+ # except ImportError:
29
+ # logging.warning("pandas not found. CSV/DataFrame functionality may be limited.")
30
+ # pd = None
31
+
32
+ USER_AGENT = (
33
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
34
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
35
+ "Chrome/125.0.0.0 Safari/537.36"
36
+ )
37
+
38
+
39
+ def _normalize_web_results(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
40
+ normalized: List[Dict[str, Any]] = []
41
+ for item in items:
42
+ title = item.get("title")
43
+ url = item.get("url") or item.get("href")
44
+ snippet = item.get("snippet")
45
+ if url and title:
46
+ normalized.append({
47
+ "type": "webpage",
48
+ "title": title,
49
+ "url": url,
50
+ "snippet": snippet,
51
+ })
52
+ return normalized
53
+
54
+
55
+ def _normalize_image_results(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
56
+ normalized: List[Dict[str, Any]] = []
57
+ for item in items:
58
+ url = item.get("image") or item.get("url") or item.get("thumbnail")
59
+ if not url:
60
+ continue
61
+ normalized.append({
62
+ "type": "image",
63
+ "title": item.get("title"),
64
+ "url": url,
65
+ "thumbnail_url": item.get("thumbnail"),
66
+ "width": item.get("width"),
67
+ "height": item.get("height"),
68
+ "host_page_url": item.get("source") or item.get("page") or item.get("referrer"),
69
+ })
70
+ return normalized
71
+
72
+
73
+ def _extract_snippet_near(html: str, start_pos: int) -> Optional[str]:
74
+ window = html[start_pos:start_pos + 1500]
75
+ m = re.search(
76
+ r'<(?:div|span|a)[^>]*class="[^"]*result__snippet[^"]*"[^>]*>([\s\S]*?)</(?:div|span|a)>',
77
+ window,
78
+ flags=re.I,
79
+ )
80
+ if not m:
81
+ return None
82
+ raw = m.group(1)
83
+ text = re.sub('<[^<]+?>', '', raw)
84
+ text = html_lib.unescape(text)
85
+ text = re.sub(r'\s+', ' ', text).strip()
86
+ return text or None
87
+
88
+
89
def _ddg_web(query: str, count: int = 25) -> List[Dict[str, Any]]:
    """Scrape DuckDuckGo's keyless HTML endpoint for web results.

    Collects up to max(1, count) result anchors, unwraps DDG's /l/?uddg=
    redirect links and protocol-relative URLs, attaches the nearby snippet,
    and returns the normalized webpage dicts.
    """
    page = requests.get(
        f"https://duckduckgo.com/html/?q={urllib.parse.quote_plus(query)}",
        headers={"User-Agent": USER_AGENT},
        timeout=20,
    )
    page.raise_for_status()
    html = page.text

    limit = max(1, count)
    collected: List[Dict[str, Any]] = []
    # Result anchors look like: <a class="result__a" href="...">Title</a>
    anchor_re = r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>(.*?)</a>'
    for hit in re.finditer(anchor_re, html, flags=re.I | re.S):
        raw_href = hit.group(1)
        title = html_lib.unescape(re.sub('<[^<]+?>', '', hit.group(2))).strip()
        if not (title and raw_href):
            continue
        # Protocol-relative links and DDG redirect wrappers both need fixing.
        target = "https:" + raw_href if raw_href.startswith("//") else raw_href
        try:
            parts = urllib.parse.urlparse(target)
            if parts.netloc.endswith("duckduckgo.com") and parts.path.startswith("/l/"):
                wrapped = urllib.parse.parse_qs(parts.query).get("uddg", [None])[0]
                if wrapped:
                    target = urllib.parse.unquote(wrapped)
        except Exception:
            pass  # keep the unresolved link rather than dropping the hit
        collected.append({
            "title": title,
            "url": target,
            "snippet": _extract_snippet_near(html, hit.end()),
        })
        if len(collected) >= limit:
            break
    return _normalize_web_results(collected)
127
+
128
+
129
+ def _enrich_web_results_with_meta(results: List[Dict[str, Any]], max_fetch: int = 3, timeout_s: int = 8) -> List[Dict[str, Any]]:
130
+ if not results:
131
+ return results
132
+ headers = {"User-Agent": USER_AGENT}
133
+ enriched: List[Dict[str, Any]] = []
134
+ for idx, item in enumerate(results):
135
+ if idx < max_fetch and (not item.get("snippet") or len(item.get("snippet") or "") < 40):
136
+ url = item.get("url")
137
+ try:
138
+ resp = requests.get(url, headers=headers, timeout=timeout_s)
139
+ resp.raise_for_status()
140
+ html = resp.text
141
+ # meta description
142
+ md = re.search(r'<meta[^>]+name=["\']description["\'][^>]+content=["\']([^"\']+)["\']', html, flags=re.I)
143
+ if not md:
144
+ md = re.search(r'<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']description["\']', html, flags=re.I)
145
+ if not md:
146
+ md = re.search(r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\']([^"\']+)["\']', html, flags=re.I)
147
+ snippet = html_lib.unescape(md.group(1)).strip() if md else None
148
+ if not snippet:
149
+ # fallback: plain text excerpt from body
150
+ body = re.search(r'<body[^>]*>([\s\S]*?)</body>', html, flags=re.I)
151
+ if body:
152
+ text_only = re.sub('<script[\s\S]*?</script>', ' ', body.group(1), flags=re.I)
153
+ text_only = re.sub('<style[\s\S]*?</style>', ' ', text_only, flags=re.I)
154
+ text_only = re.sub('<[^<]+?>', ' ', text_only)
155
+ text_only = re.sub(r'\s+', ' ', text_only).strip()
156
+ snippet = text_only[:300] if text_only else None
157
+ if snippet:
158
+ item = dict(item)
159
+ item["snippet"] = snippet
160
+ except Exception:
161
+ pass
162
+ enriched.append(item)
163
+ return enriched
164
+
165
+
166
+ def _html_to_text(html: str, max_chars: int = 200000) -> str:
167
+ # Normalize line breaks for block elements before stripping
168
+ block_tags = [
169
+ 'p','div','br','hr','section','article','header','footer','li','ul','ol','table','tr','td','th','h1','h2','h3','h4','h5','h6'
170
+ ]
171
+ for tag in block_tags:
172
+ html = re.sub(fr'<\s*{tag}[^>]*>', '\n', html, flags=re.I)
173
+ if tag not in ('br','hr'):
174
+ html = re.sub(fr'</\s*{tag}\s*>', '\n', html, flags=re.I)
175
+
176
+ # Remove script/style/noscript
177
+ html = re.sub(r'<script[\s\S]*?</script>', ' ', html, flags=re.I)
178
+ html = re.sub(r'<style[\s\S]*?</style>', ' ', html, flags=re.I)
179
+ html = re.sub(r'<noscript[\s\S]*?</noscript>', ' ', html, flags=re.I)
180
+ # Remove comments
181
+ html = re.sub(r'<!--([\s\S]*?)-->', ' ', html)
182
+ # Strip remaining tags
183
+ text = re.sub(r'<[^>]+>', ' ', html)
184
+ # Decode entities and collapse whitespace
185
+ text = html_lib.unescape(text)
186
+ text = re.sub(r'\s+', ' ', text).strip()
187
+ if len(text) > max_chars:
188
+ text = text[:max_chars]
189
+ return text
190
+
191
+
192
async def fetch_webpage(url: str, render: bool = False, timeout_s: int = 20, max_chars: int = 200000) -> str:
    """
    Fetch a full webpage and return structured JSON with title, html, and extracted text.
    - render=False: simple HTTP fetch (no JS)
    - render=True: try Playwright to render JS (falls back to simple fetch if unavailable)

    Returns a JSON string with keys url, title, meta_description, html
    (truncated to max_chars), and text (plain-text extraction, also capped
    at max_chars), or {"error": ...} when every fetch strategy fails.
    """
    try:
        # Normalize URL: default to https when no scheme is supplied.
        if not re.match(r'^https?://', url):
            url = 'https://' + url

        headers = {"User-Agent": USER_AGENT}

        # Helper: requests-based fetch (no JavaScript execution).
        def fetch_via_requests(target_url: str) -> Dict[str, Any]:
            r = requests.get(target_url, headers=headers, timeout=timeout_s)
            r.raise_for_status()
            html = r.text
            # Title
            mt = re.search(r'<title[^>]*>([\s\S]*?)</title>', html, flags=re.I)
            title = html_lib.unescape(mt.group(1)).strip() if mt else None
            # Meta description: name="description" first, og:description as fallback.
            md = re.search(r'<meta[^>]+name=["\']description["\'][^>]+content=["\']([^"\']+)["\']', html, flags=re.I)
            if not md:
                md = re.search(r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\']([^"\']+)["\']', html, flags=re.I)
            meta_desc = html_lib.unescape(md.group(1)).strip() if md else None
            text = _html_to_text(html, max_chars=max_chars)
            return {
                # r.url reflects any redirects followed by requests.
                "url": str(r.url or target_url),
                "title": title,
                "meta_description": meta_desc,
                "html": html if len(html) <= max_chars else html[:max_chars],
                "text": text,
            }

        if not render:
            data = fetch_via_requests(url)
            return json.dumps(data, indent=2)

        # Try Playwright render; if it is not installed, degrade to a plain fetch.
        try:
            from playwright.async_api import async_playwright
        except Exception:
            data = fetch_via_requests(url)
            return json.dumps(data, indent=2)

        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                context = await browser.new_context()
                page = await context.new_page()
                await page.set_extra_http_headers({"User-Agent": USER_AGENT})
                await page.goto(url, wait_until="domcontentloaded", timeout=timeout_s * 1000)
                # Give some time for client-side render
                await page.wait_for_timeout(800)
                final_url = page.url
                title = await page.title()
                html = await page.content()
                await browser.close()

            text = _html_to_text(html, max_chars=max_chars)
            # Meta description from rendered HTML
            md = re.search(r'<meta[^>]+name=["\']description["\'][^>]+content=["\']([^"\']+)["\']', html, flags=re.I)
            if not md:
                md = re.search(r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\']([^"\']+)["\']', html, flags=re.I)
            meta_desc = html_lib.unescape(md.group(1)).strip() if md else None

            data = {
                "url": final_url,
                "title": title or None,
                "meta_description": meta_desc,
                "html": html if len(html) <= max_chars else html[:max_chars],
                "text": text,
            }
            return json.dumps(data, indent=2)
        except Exception:
            # Fallback to non-rendered fetch on any Playwright runtime error
            data = fetch_via_requests(url)
            return json.dumps(data, indent=2)
    except Exception as exc:
        # Last-resort error envelope: callers always receive valid JSON.
        return json.dumps({"error": f"Fetch failed: {str(exc)}"})
273
+
274
+
275
def _ddg_get_vqd(query: str) -> Optional[str]:
    """Obtain the per-query 'vqd' token DDG requires for its i.js image API.

    The token is scraped out of the image-search landing page; DDG changes
    how it is embedded fairly often, so several known patterns are tried in
    order.  Returns None when no pattern matches.
    """
    resp = requests.get(
        f"https://duckduckgo.com/?q={urllib.parse.quote_plus(query)}&iax=images&ia=images",
        headers={"User-Agent": USER_AGENT, "Referer": "https://duckduckgo.com/"},
        timeout=20,
    )
    resp.raise_for_status()
    page = resp.text
    for pattern in (r"vqd='([\w-]+)'", r'vqd="([\w-]+)"', r'vqd=([\w-]+)&'):
        hit = re.search(pattern, page)
        if hit:
            return hit.group(1)
    return None
289
+
290
+
291
def _ddg_images_html(query: str, count: int = 25) -> List[Dict[str, Any]]:
    """Fallback image search that scrapes DDG's HTML results page.

    Finds external-content proxy thumbnails in the page and recovers each
    original image URL from the proxy's 'u' query parameter.  No vqd token
    is required.  Returns at most *count* normalized image dicts.
    """
    resp = requests.get(
        f"https://duckduckgo.com/?q={urllib.parse.quote_plus(query)}&ia=images&iar=images",
        headers={"User-Agent": USER_AGENT, "Referer": "https://duckduckgo.com/"},
        timeout=20,
    )
    resp.raise_for_status()
    page = resp.text
    found: List[Dict[str, Any]] = []
    proxy_re = r'(?:src|data-src)="(https://external-content\.duckduckgo\.com/iu/\?u=[^"]+)"'
    for hit in re.finditer(proxy_re, page):
        proxied = html_lib.unescape(hit.group(1))
        try:
            query_params = urllib.parse.parse_qs(urllib.parse.urlparse(proxied).query)
            original = query_params.get('u', [None])[0]
            if not original:
                continue
            found.append({
                "title": None,
                "image": urllib.parse.unquote(original),
                "thumbnail": proxied,
                "width": None,
                "height": None,
                "source": None,
            })
            if len(found) >= count:
                break
        except Exception:
            continue  # malformed proxy URL: skip this hit
    return _normalize_image_results(found)
321
+
322
+
323
def _ddg_images(query: str, count: int = 25) -> List[Dict[str, Any]]:
    """Image search via DuckDuckGo's i.js JSON endpoint.

    Acquires the required vqd token first and paginates i.js until *count*
    raw results are collected or no "next" page remains (capped at 200 when
    normalizing).  Falls back to HTML scraping when the token cannot be
    found or when i.js yields no usable results.
    """
    vqd = _ddg_get_vqd(query)
    if not vqd:
        # Fallback to simple HTML scraping if token not found
        return _ddg_images_html(query, count)
    headers = {"User-Agent": USER_AGENT, "Referer": "https://duckduckgo.com/"}
    params = {
        "l": "us-en",
        "o": "json",
        "q": query,
        "vqd": vqd,
        "f": ",",
        "p": "1",
        "s": "0",
    }
    raw_results: List[Dict[str, Any]] = []
    next_url = "https://duckduckgo.com/i.js"
    while len(raw_results) < count and next_url:
        resp = requests.get(next_url, headers=headers, params=params, timeout=20)
        resp.raise_for_status()
        data = resp.json()
        raw_results.extend(data.get("results") or [])
        # FIX: i.js returns "next" as a site-relative path (e.g. "i.js?q=...");
        # resolve it against the site root before requesting it, otherwise
        # requests raises MissingSchema on the second page.
        nxt = data.get("next")
        next_url = urllib.parse.urljoin("https://duckduckgo.com/", nxt) if nxt else None
        params = None  # subsequent pages embed the query in the next URL
    items: List[Dict[str, Any]] = []
    for it in raw_results[: max(1, min(count, 200))]:
        items.append({
            "title": it.get("title"),
            "image": it.get("image"),
            "thumbnail": it.get("thumbnail"),
            "width": it.get("width"),
            "height": it.get("height"),
            "source": it.get("url"),
        })
    normalized = _normalize_image_results(items)
    if not normalized:
        # Extra fallback to HTML scrape if i.js yields nothing
        return _ddg_images_html(query, count)
    return normalized
365
+
366
+
367
async def web_search(query: str, count: int = 25, enrich: bool = True) -> str:
    """Keyless DuckDuckGo web search, returning a JSON string.

    Success yields a JSON array of webpage dicts (indent=2); an empty
    result set yields {"status": ...}; any exception is captured and
    reported as {"error": ...} rather than raised.  When *enrich* is true,
    weak snippets on the leading results are backfilled from their pages.
    """
    try:
        hits = _ddg_web(query, count)
        if enrich:
            hits = _enrich_web_results_with_meta(hits)
        if not hits:
            return json.dumps({"status": "No relevant results found."})
        return json.dumps(hits, indent=2)
    except Exception as exc:
        return json.dumps({"error": f"Web search failed: {str(exc)}"})
377
+
378
+
379
async def image_search(query: str, count: int = 25) -> str:
    """Keyless DuckDuckGo image search, returning a JSON string.

    Mirrors web_search(): a JSON array of image dicts on success,
    {"status": ...} when nothing is found, and {"error": ...} on any
    failure instead of raising.
    """
    try:
        found = _ddg_images(query, count)
        if not found:
            return json.dumps({"status": "No relevant results found."})
        return json.dumps(found, indent=2)
    except Exception as exc:
        return json.dumps({"error": f"Image search failed: {str(exc)}"})
387
+
388
+
389
async def combined_search(query: str, count: int = 25, enrich: bool = True) -> str:
    """Run web and image search for *query* and return one JSON array.

    Web results come first (optionally snippet-enriched), image results
    second; each side contributes up to *count* entries.  Despite running
    inside an async function, the two searches execute sequentially.
    Returns {"status": ...} when both are empty and {"error": ...} if
    either search raises.
    """
    try:
        web_results = _ddg_web(query, count)
        if enrich:
            web_results = _enrich_web_results_with_meta(web_results)
        image_results = _ddg_images(query, count)
        merged: List[Dict[str, Any]] = [*web_results, *image_results]
        if not merged:
            return json.dumps({"status": "No relevant results found."})
        return json.dumps(merged, indent=2)
    except Exception as exc:
        return json.dumps({"error": f"Combined search failed: {str(exc)}"})
403
+
404
+
405
async def collect_task_images(
    query: str,
    count: int = 10,
    allowed_domains: Optional[List[str]] = None,
    verify_download: bool = True,
    work_dir: Optional[str] = None,
) -> str:
    """
    Search for task-relevant images, optionally filter by allowed domains, download locally,
    and upload to Azure Blob Storage when configured. Returns JSON with uploaded URLs and details.

    Params:
    - query: task/topic to ensure relevance
    - count: desired number of images to return (downloads/uploads up to this many)
    - allowed_domains: restrict results to these host domains if set
    - verify_download: if True, ensures HTTP 200 and image/* content-type before accepting
    - work_dir: directory to save files; defaults to current working directory

    Returns:
        JSON object with `query`, `requested`, `returned`, `accepted` (items
        with local path / uploaded URL / source metadata) and `skipped`
        (items with the reason they were rejected). On any unexpected
        failure a JSON `error` object is returned instead.
    """
    try:
        # Step 1: over-fetch (3x) so the relevance filter has selection headroom.
        raw_json = await image_search(query, count=max(count * 3, count))
        parsed = json.loads(raw_json) if raw_json else []
        # Normalize parsed results to a list of dicts; a dict here is a
        # status/error payload from image_search, treated as "no results".
        if isinstance(parsed, list):
            results: List[Dict[str, Any]] = parsed
        else:
            results = []

        # Step 2: relevance filter by domain and title/query term overlap.
        def hostname(url: Optional[str]) -> Optional[str]:
            try:
                from urllib.parse import urlparse
                return urlparse(url).hostname if url else None
            except Exception:
                return None

        query_terms = set(re.findall(r"\w+", query.lower()))
        filtered: List[Dict[str, Any]] = []
        for it in results:
            if not isinstance(it, dict):
                continue
            host = hostname(it.get("host_page_url") or it.get("url")) or ""
            if allowed_domains:
                if not any(d.lower() in (host or "").lower() for d in allowed_domains):
                    continue
            title = (it.get("title") or "").lower()
            title_terms = set(re.findall(r"\w+", title))
            overlap = len(query_terms & title_terms)
            it_copy = dict(it)
            it_copy["_rank"] = overlap   # how many query terms appear in the title
            it_copy["_host"] = host
            filtered.append(it_copy)

        # Rank by term overlap desc, then presence of a source page URL.
        filtered.sort(key=lambda x: (x.get("_rank", 0), bool(x.get("host_page_url"))), reverse=True)

        # Step 3: download and optionally verify.
        if not work_dir:
            work_dir = os.getcwd()
        os.makedirs(work_dir, exist_ok=True)

        from .file_tools import download_image  # local tool
        from .azure_blob_tools import upload_file_to_azure_blob  # uploader

        accepted: List[Dict[str, Any]] = []
        skipped: List[Dict[str, Any]] = []

        # A shared session is only needed when verification is on; if requests
        # is unavailable, verification is silently skipped (best-effort).
        session = None
        if verify_download:
            try:
                import requests
                session = requests.Session()
                session.headers.update({"User-Agent": USER_AGENT})
            except Exception:
                session = None

        def is_image_ok(url: str) -> bool:
            # Accept without a network round-trip when verification is off.
            if not verify_download or not session:
                return True
            try:
                # Stream so only the first chunk is pulled, and close the
                # response via `with` — the previous version leaked the
                # streamed response/pooled connection on every probe.
                with session.get(url, stream=True, timeout=15, allow_redirects=True) as r:
                    ct = (r.headers.get("content-type") or "").lower()
                    # Accept on an image/* content-type, or fall back to the
                    # body being non-empty when the server omits/mislabels it.
                    if r.status_code == 200 and (ct.startswith("image/") or next(r.iter_content(1024), b"")):
                        return True
            except Exception:
                return False
            return False

        used = 0
        for it in filtered:
            if used >= count:
                break
            img_url = it.get("url")
            if not img_url:
                skipped.append({"reason": "missing_url", "item": it})
                continue
            if not is_image_ok(img_url):
                skipped.append({"reason": "verify_failed", "url": img_url})
                continue

            # Safe filename derived from the title; capped length, .jpg suffix.
            base = re.sub(r"[^a-zA-Z0-9_-]+", "_", (it.get("title") or "image").strip())[:80] or "image"
            filename = f"{base}_{used+1}.jpg"
            dl_json = await download_image(img_url, filename, work_dir)
            dl = json.loads(dl_json)
            if dl.get("status") != "success":
                skipped.append({"reason": "download_error", "url": img_url, "detail": dl})
                continue

            file_path = dl.get("file_path")
            # Upload if configured, else mark as local only.
            azure_conn = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
            if azure_conn:
                up_json = upload_file_to_azure_blob(file_path)
                up = json.loads(up_json)
                if "download_url" in up:
                    accepted.append({
                        "title": it.get("title"),
                        "source_page": it.get("host_page_url"),
                        "uploaded_url": up["download_url"],
                        "local_path": file_path,
                        "width": it.get("width"),
                        "height": it.get("height"),
                        "source_host": it.get("_host"),
                    })
                    used += 1
                    continue
                else:
                    skipped.append({"reason": "upload_error", "file_path": file_path, "detail": up})
                    continue
            else:
                accepted.append({
                    "title": it.get("title"),
                    "source_page": it.get("host_page_url"),
                    "uploaded_url": None,
                    "local_path": file_path,
                    "width": it.get("width"),
                    "height": it.get("height"),
                    "source_host": it.get("_host"),
                    "note": "AZURE_STORAGE_CONNECTION_STRING not set; upload skipped"
                })
                used += 1

        # No synthesis: if no accepted items, return zero results as-is.

        return json.dumps({
            "query": query,
            "requested": count,
            "returned": len(accepted),
            "accepted": accepted,
            "skipped": skipped,
        }, indent=2)
    except Exception as exc:
        return json.dumps({"error": f"collect_task_images failed: {str(exc)}"})
562
+
563
+
564
+ async def _perform_single_cognitive_search(
565
+ query: str = "*",
566
+ index_name: str = "indexwires",
567
+ date_filter: Optional[str] = None,
568
+ top: int = 50,
569
+ select: Optional[str] = None,
570
+ facets: Optional[List[str]] = None,
571
+ orderby: Optional[str] = None, # Added orderby parameter
572
+ requires_bi: bool = False,
573
+ context_id: Optional[str] = None,
574
+ ) -> Dict[str, Any]:
575
+ """
576
+ Performs a single search query on Azure Cognitive Search. Internal helper.
577
+ """
578
+ API_URL = os.environ.get('AZURE_COGNITIVE_API_URL')
579
+ API_KEY = os.environ.get('AZURE_COGNITIVE_API_KEY')
580
+
581
+ if not API_URL or not API_KEY:
582
+ return {"error": "AZURE_COGNITIVE_API_URL or AZURE_COGNITIVE_API_KEY environment variables not set"}
583
+
584
+ headers = {
585
+ 'Content-Type': 'application/json',
586
+ 'api-key': API_KEY
587
+ }
588
+
589
+ search_url = f"{API_URL}indexes/{index_name}/docs/search?api-version=2024-07-01" # Updated API version
590
+
591
+ payload = {
592
+ 'search': query,
593
+ 'orderby': 'date desc', # Changed to date for consistency with previous working examples
594
+ 'top': min(top, 100),
595
+ }
596
+
597
+ if select:
598
+ payload['select'] = select
599
+ if date_filter:
600
+ # Removed explicit stripping of timezone as the agent is responsible for correct ISO 8601 Z format
601
+ payload['filter'] = date_filter
602
+ if facets:
603
+ payload['facets'] = facets
604
+
605
+ # Apply contextId filter for indexcortex
606
+ if index_name == "indexcortex" and context_id:
607
+ if 'filter' in payload:
608
+ payload['filter'] += f" and owner eq '{context_id}'"
609
+ else:
610
+ payload['filter'] = f"owner eq '{context_id}'"
611
+
612
+ print(f"DEBUG: Search URL: {search_url}") # Added debug print
613
+ print(f"DEBUG: Payload: {json.dumps(payload, indent=2)}") # Added debug print
614
+
615
+ try:
616
+ response = requests.post(search_url, headers=headers, json=payload)
617
+ response.raise_for_status() # Raise an exception for HTTP errors
618
+ return {"index_name": index_name, "results": response.json()}
619
+ except requests.exceptions.RequestException as e:
620
+ return {"index_name": index_name, "error": f"Error performing Cognitive Search: {str(e)}"}
621
+ except Exception as e:
622
+ return {"index_name": index_name, "error": f"Unexpected error in Cognitive Search: {str(e)}"}
623
+
624
+
625
async def azure_cognitive_search(
    queries: List[Dict[str, Any]]
) -> str:
    """
    Perform one or more searches on Azure Cognitive Search indexes in parallel.

    Args:
        queries: A list of dictionaries, each describing one search via keyword
                 arguments for the single-search helper: `query` (str),
                 `index_name` (str), `date_filter` (str, optional),
                 `top` (int, optional), `select` (str, optional),
                 `facets` (List[str], optional), `requires_bi` (bool, optional),
                 `context_id` (str, optional).

    Returns:
        JSON string with one result object per input query, in input order.
    """
    pending = [_perform_single_cognitive_search(**params) for params in queries]
    gathered = await asyncio.gather(*pending)
    return json.dumps(gathered, indent=2)
@@ -0,0 +1,36 @@
1
+ # Cortex Azure Cleaner
2
+
3
+ This helper app deletes specific data from an Azure Cognitive Search index.
4
+
5
+ ## Configuration
6
+
7
+ Before running the script, you need to set up your Azure credentials. Create a `.env` file in this directory (`helper-apps/cortex-azure-cleaner`) with the following content:
8
+
9
+ ```
10
+ # Azure Cognitive Search configuration
11
+ AZURE_COGNITIVE_API_URL=your_azure_search_endpoint
12
+ AZURE_COGNITIVE_API_KEY=your_azure_search_api_key
13
+ ```
14
+
15
+ Replace `your_azure_search_endpoint` and `your_azure_search_api_key` with your actual Azure Search endpoint and admin key.
16
+
17
+ ## Installation
18
+
19
+ Navigate to this directory and install the dependencies:
20
+
21
+ ```bash
22
+ cd helper-apps/cortex-azure-cleaner
23
+ npm install
24
+ ```
25
+
26
+ ## Usage
27
+
28
+ The script is pre-configured to delete documents with the title "AJ+ Notes on QA Editorial Guidelines.docx" from the "vector-tony-vision-resource" index.
29
+
30
+ To run the script:
31
+
32
+ ```bash
33
+ npm start
34
+ ```
35
+
36
+ The script will search for documents matching the title, log them to the console, and then delete them.