@aj-archipelago/cortex 1.3.41 → 1.3.42

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ .git*
2
+ .vscode
3
+ __azurite_db*__.json
4
+ __blobstorage__
5
+ __queuestorage__
6
+ local.settings.json
7
+ test
8
+ .venv
@@ -0,0 +1,52 @@
1
# Use an official Python runtime as a parent image suitable for Azure Functions
FROM mcr.microsoft.com/azure-functions/python:4-python3.9

# Set environment variables for Azure Functions runtime
ENV AzureWebJobsScriptRoot=/home/site/wwwroot
ENV AzureFunctionsJobHost__Logging__Console__IsEnabled=true

# Install OS dependencies needed by Playwright browsers (Debian-based).
# This list is based on Playwright documentation and common needs.
# The apt package lists are removed in the same RUN so they never end up in
# an image layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libnss3 \
    libnspr4 \
    libdbus-glib-1-2 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdrm2 \
    libexpat1 \
    libgbm1 \
    libpango-1.0-0 \
    libx11-6 \
    libxcb1 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxrandr2 \
    libxrender1 \
    libxtst6 \
    lsb-release \
    wget \
    xvfb \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements file first to leverage Docker cache
COPY requirements.txt /tmp/
WORKDIR /tmp

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Install Playwright browsers and their dependencies within the container.
# Using --with-deps helps install system dependencies needed by the browsers;
# it goes through the system package manager, so the package lists are cleaned
# up again in the same layer.
# Installing only chromium as it's specified in the code.
RUN playwright install --with-deps chromium \
    && rm -rf /var/lib/apt/lists/*

# Copy the function app code to the final location
COPY . /home/site/wwwroot

# Set the working directory for the function app
WORKDIR /home/site/wwwroot
@@ -0,0 +1,181 @@
1
+ import azure.functions as func
2
+ import logging
3
+ import json
4
+ from playwright.sync_api import sync_playwright
5
+ import trafilatura
6
+ import base64
7
+
8
+ app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
9
+
10
def scrape_and_screenshot(url: str, should_screenshot: bool = True) -> dict:
    """Load ``url`` in headless Chromium and return its text and a screenshot.

    The page is opened with Playwright, a best-effort attempt is made to
    dismiss a cookie banner via a "reject" button, and the resulting HTML is
    passed to trafilatura for main-text extraction.

    Args:
        url: Address to load. No validation is done here; the caller is
            expected to have checked it.
        should_screenshot: When true, a full-page screenshot is captured.

    Returns:
        On success, a dict with ``"url"`` and ``"text"`` (an empty string when
        nothing was extracted) plus ``"screenshot_base64"`` when a screenshot
        was taken. If trafilatura raises, ``"text"`` carries the failure
        message instead. If Playwright fails, a dict with ``"url"`` and
        ``"error"`` is returned and no exception propagates.
    """
    screenshot_bytes = None
    html_content = None
    extracted_text = None

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context()
                page = context.new_page()
                # Generous timeout: slow pages are common for scraping targets.
                page.goto(url, wait_until='load', timeout=60000)

                # --- Attempt to reject cookies ---
                # Tried in order; the first visible match is clicked.
                # Add more selectors here if needed for different sites.
                reject_selectors = [
                    "button:has-text('Reject All')",
                    "button:has-text('Decline')",
                    "button:has-text('Only necessary')",
                    "button:has-text('Tümünü Reddet')",  # Turkish "Reject All"
                    "button:has-text('Reject')",
                    "[aria-label*='Reject']",  # Common aria labels
                    "[id*='reject']",
                    "[class*='reject']",
                ]

                cookie_banner_found = False
                for selector in reject_selectors:
                    try:
                        reject_button = page.locator(selector).first
                        # is_visible() reports the current state without
                        # waiting; Playwright ignores a timeout argument
                        # here, so none is passed.
                        if reject_button.is_visible():
                            logging.info(f"Found potential cookie reject button with selector: {selector}")
                            reject_button.click(timeout=5000)
                            logging.info("Clicked cookie reject button.")
                            # Give the banner a moment to disappear before
                            # the HTML and screenshot are captured.
                            page.wait_for_timeout(500)
                            cookie_banner_found = True
                            break  # Stop searching once one is clicked
                    except Exception as e:
                        # Best effort: a failing selector must not abort the
                        # scrape, but leave a trace for troubleshooting.
                        logging.debug(f"Cookie reject selector '{selector}' not found or failed: {e}")

                if not cookie_banner_found:
                    logging.info("No common cookie reject button found or clicked.")
                # ---------------------------------

                html_content = page.content()
                # The screenshot must be taken before the browser is closed.
                if should_screenshot:
                    screenshot_bytes = page.screenshot(full_page=True)
            finally:
                browser.close()
    except Exception as e:
        logging.error(f"Playwright error accessing {url}: {e}")
        return {"url": url, "error": f"Playwright error: {e}"}

    if html_content:
        try:
            extracted_text = trafilatura.extract(html_content, include_comments=False)
        except Exception as e:
            logging.error(f"Trafilatura error processing {url}: {e}")
            # Still return the screenshot if Playwright succeeded.
            extracted_text = f"Trafilatura extraction failed: {e}"

    screenshot_base64 = base64.b64encode(screenshot_bytes).decode('utf-8') if screenshot_bytes else None

    response_data = {
        "url": url,
        "text": extracted_text or "",
    }
    if screenshot_base64:
        response_data["screenshot_base64"] = screenshot_base64

    return response_data
88
+
89
def _json_response(payload: dict, status_code: int) -> func.HttpResponse:
    """Serialize ``payload`` as JSON into an HTTP response with ``status_code``."""
    return func.HttpResponse(
        json.dumps(payload),
        mimetype="application/json",
        status_code=status_code
    )


@app.route(route="scrape")
def http_scrape_trigger(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP entry point: scrape the page named by the ``url`` parameter.

    ``url`` and the optional ``take_screenshot`` flag are read from the query
    string first; if the query string has no ``url``, both are read from the
    JSON body instead. ``take_screenshot`` defaults to true.

    Returns:
        200 with the dict produced by ``scrape_and_screenshot`` (which may
        itself contain an ``"error"`` key when the scrape failed), 400 when
        ``url`` is missing, is not a string, or does not start with
        ``http://`` / ``https://``, and 500 on any other unexpected failure.
    """
    # NOTE(review): any http(s) URL is accepted and fetched server-side;
    # consider restricting targets if this endpoint is reachable by
    # untrusted callers.
    logging.info('Python HTTP scrape trigger function processed a request.')

    url = None
    take_screenshot = True  # Default value

    # 1. Try getting parameters from query string first
    try:
        url = req.params.get('url')
        if url:
            logging.info(f"Found URL in query parameters: {url}")
            # Query params are strings; only a literal "false" (any case)
            # turns the screenshot off.
            ss_param = req.params.get('take_screenshot', 'true')
            take_screenshot = ss_param.lower() != 'false'
        else:
            logging.info("URL not found in query parameters.")
    except Exception as e:
        # This shouldn't generally happen with req.params, but good practice
        logging.warning(f"Error reading query parameters: {e}")
        url = None  # Ensure url is None if error occurs here

    # 2. If URL not found in query, try getting from JSON body
    if not url:
        logging.info("Attempting to read URL from JSON body.")
        try:
            req_body = req.get_json()
            if req_body:
                url = req_body.get('url')
                if url:
                    logging.info(f"Found URL in JSON body: {url}")
                    # JSON may carry the flag as a string or as a real
                    # boolean/number; handle both.
                    ss_param = req_body.get('take_screenshot', True)
                    if isinstance(ss_param, str):
                        take_screenshot = ss_param.lower() != 'false'
                    else:
                        take_screenshot = bool(ss_param)
                    logging.info(f"Screenshot parameter from JSON: {take_screenshot}")
                else:
                    logging.info("URL key not found in JSON body.")
            else:
                logging.info("JSON body is empty.")
        except ValueError:
            logging.info("Request body is not valid JSON or missing.")
            # url remains None
        except Exception as e:
            # E.g. a JSON body that is not an object and has no .get().
            logging.warning(f"Error reading JSON body: {e}")
            url = None  # Ensure url is None if error occurs here

    # 3. Process the request if URL was found
    if not url:
        logging.warning("URL not provided in request body or query string.")
        return _json_response(
            {"error": "Please pass a 'url' in the JSON request body or query string"},
            400
        )

    try:
        # A JSON body can supply a non-string "url" (number, list, ...);
        # treat that as a client error rather than letting .startswith fail.
        if not isinstance(url, str) or not url.startswith(('http://', 'https://')):
            raise ValueError("Invalid URL format. Must start with http:// or https://")

        result_data = scrape_and_screenshot(url, should_screenshot=take_screenshot)
        return _json_response(result_data, 200)
    except ValueError as ve:
        logging.error(f"Invalid URL provided: {ve}")
        return _json_response({"error": str(ve)}, 400)
    except Exception as e:
        logging.error(f"Error processing scrape request for {url}: {e}")
        return _json_response({"error": f"An internal error occurred: {e}"}, 500)
172
+
173
+ # Keep this if you might have other triggers, otherwise it can be removed
174
+ # if the scrape trigger is the only one.
175
+ # Example of another potential trigger (e.g., timer)
176
+ # @app.timer_trigger(schedule="0 */5 * * * *", arg_name="myTimer", run_on_startup=True,
177
+ # use_monitor=False)
178
+ # def timer_trigger_handler(myTimer: func.TimerRequest) -> None:
179
+ # if myTimer.past_due:
180
+ # logging.info('The timer is past due!')
181
+ # logging.info('Python timer trigger function executed.')
@@ -0,0 +1,15 @@
1
+ {
2
+ "version": "2.0",
3
+ "logging": {
4
+ "applicationInsights": {
5
+ "samplingSettings": {
6
+ "isEnabled": true,
7
+ "excludedTypes": "Request"
8
+ }
9
+ }
10
+ },
11
+ "extensionBundle": {
12
+ "id": "Microsoft.Azure.Functions.ExtensionBundle",
13
+ "version": "[4.*, 5.0.0)"
14
+ }
15
+ }
@@ -0,0 +1,24 @@
1
+ azure-functions==1.23.0
2
+ babel==2.17.0
3
+ certifi==2025.4.26
4
+ charset-normalizer==3.4.2
5
+ courlan==1.3.2
6
+ dateparser==1.2.1
7
+ greenlet==3.2.1
8
+ htmldate==1.9.3
9
+ jusText==3.0.2
10
+ lxml==5.4.0
11
+ lxml_html_clean==0.4.2
12
+ MarkupSafe==3.0.2
13
+ playwright==1.52.0
14
+ pyee==13.0.0
15
+ python-dateutil==2.9.0.post0
16
+ pytz==2025.2
17
+ regex==2024.11.6
18
+ six==1.17.0
19
+ tld==0.13
20
+ trafilatura==2.0.0
21
+ typing_extensions==4.13.2
22
+ tzlocal==5.3.1
23
+ urllib3==2.4.0
24
+ Werkzeug==3.1.3
@@ -195,7 +195,12 @@ const requestWithMonitor = async (endpoint, url, data, axiosConfigObj) => {
195
195
  }
196
196
  } catch (error) {
197
197
  // throw new error with duration as part of the error data
198
- throw { ...error, duration: endpoint?.monitor?.incrementErrorCount(callId, error?.response?.status || null) };
198
+ const { code, name } = error;
199
+ const finalStatus = error?.response?.status ?? error?.status
200
+ const statusText = error?.response?.statusText ?? error?.statusText
201
+ const errorMessage = error?.response?.data?.message ?? error?.response?.data?.error?.message ?? error?.message ?? String(error);
202
+
203
+ throw { code, message: errorMessage, status: finalStatus, statusText, name, duration: endpoint?.monitor?.incrementErrorCount(callId, finalStatus) };
199
204
  }
200
205
  let duration;
201
206
  if (response.status >= 200 && response.status < 300) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aj-archipelago/cortex",
3
- "version": "1.3.41",
3
+ "version": "1.3.42",
4
4
  "description": "Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.",
5
5
  "private": false,
6
6
  "repository": {
@@ -27,6 +27,7 @@ export default {
27
27
  codeRequestId: ``,
28
28
  skipCallbackMessage: false,
29
29
  entityId: ``,
30
+ researchMode: false,
30
31
  model: 'oai-gpt41'
31
32
  },
32
33
  timeout: 600,
@@ -179,7 +180,7 @@ export default {
179
180
  let pathwayResolver = resolver;
180
181
 
181
182
  // Load input parameters and information into args
182
- const { entityId, voiceResponse, aiMemorySelfModify, chatId } = { ...pathwayResolver.pathway.inputParameters, ...args };
183
+ const { entityId, voiceResponse, aiMemorySelfModify, chatId, researchMode } = { ...pathwayResolver.pathway.inputParameters, ...args };
183
184
 
184
185
  const entityConfig = loadEntityConfig(entityId);
185
186
  const { entityTools, entityToolsOpenAiFormat } = getToolsForEntity(entityConfig);
@@ -199,18 +200,21 @@ export default {
199
200
  entityInstructions,
200
201
  voiceResponse,
201
202
  aiMemorySelfModify,
202
- chatId
203
+ chatId,
204
+ researchMode
203
205
  };
204
206
 
205
207
  pathwayResolver.args = {...args};
206
208
 
209
+ const promptPrefix = researchMode ? 'Formatting re-enabled\n' : '';
210
+
207
211
  const memoryTemplates = entityUseMemory ?
208
212
  `{{renderTemplate AI_MEMORY}}\n\n{{renderTemplate AI_MEMORY_INSTRUCTIONS}}\n\n` : '';
209
213
 
210
214
  const instructionTemplates = entityInstructions ? (entityInstructions + '\n\n') : `{{renderTemplate AI_EXPERTISE}}\n\n{{renderTemplate AI_COMMON_INSTRUCTIONS}}\n\n`;
211
215
 
212
216
  const promptMessages = [
213
- {"role": "system", "content": `${memoryTemplates}${instructionTemplates}{{renderTemplate AI_TOOLS}}\n\n{{renderTemplate AI_GROUNDING_INSTRUCTIONS}}\n\n{{renderTemplate AI_DATETIME}}`},
217
+ {"role": "system", "content": `${promptPrefix}${memoryTemplates}${instructionTemplates}{{renderTemplate AI_TOOLS}}\n\n{{renderTemplate AI_GROUNDING_INSTRUCTIONS}}\n\n{{renderTemplate AI_DATETIME}}`},
214
218
  "{{chatHistory}}",
215
219
  ];
216
220
 
@@ -218,11 +222,6 @@ export default {
218
222
  new Prompt({ messages: promptMessages }),
219
223
  ];
220
224
 
221
- // if the model has been overridden, make sure to use it
222
- if (pathwayResolver.modelName) {
223
- pathwayResolver.args.model = pathwayResolver.modelName;
224
- }
225
-
226
225
  // set the style model if applicable
227
226
  const { aiStyle, AI_STYLE_ANTHROPIC, AI_STYLE_OPENAI } = args;
228
227
  const styleModel = aiStyle === "Anthropic" ? AI_STYLE_ANTHROPIC : AI_STYLE_OPENAI;
@@ -55,7 +55,14 @@ export default {
55
55
  // Call the Bing search pathway
56
56
  const response = await callPathway('bing', {
57
57
  ...args
58
- });
58
+ }, resolver);
59
+
60
+ if (resolver.errors && resolver.errors.length > 0) {
61
+ const errorMessages = Array.isArray(resolver.errors)
62
+ ? resolver.errors.map(err => err.message || err)
63
+ : [resolver.errors.message || resolver.errors];
64
+ return JSON.stringify({ _type: "SearchError", value: errorMessages });
65
+ }
59
66
 
60
67
  const parsedResponse = JSON.parse(response);
61
68
  const results = [];
@@ -1,5 +1,6 @@
1
1
  // ModelExecutor.js
2
2
  import CortexRequest from '../lib/cortexRequest.js';
3
+ import logger from '../lib/logger.js';
3
4
 
4
5
  import OpenAIChatPlugin from './plugins/openAiChatPlugin.js';
5
6
  import OpenAICompletionPlugin from './plugins/openAiCompletionPlugin.js';
@@ -125,7 +126,14 @@ class ModelExecutor {
125
126
 
126
127
  async execute(text, parameters, prompt, pathwayResolver) {
127
128
  const cortexRequest = new CortexRequest({ pathwayResolver });
128
- return await this.plugin.execute(text, parameters, prompt, cortexRequest);
129
+ try {
130
+ return await this.plugin.execute(text, parameters, prompt, cortexRequest);
131
+ } catch (error) {
132
+ logger.error(`Error executing model plugin for pathway ${pathwayResolver?.pathway?.name}: ${error.message}`);
133
+ logger.debug(error.stack);
134
+ pathwayResolver.errors.push(error.message);
135
+ return null;
136
+ }
129
137
  }
130
138
  }
131
139
 
@@ -565,7 +565,10 @@ class ModelPlugin {
565
565
  return parsedData;
566
566
  } catch (error) {
567
567
  // Log the error and continue
568
- const errorMessage = `${error?.response?.data?.message || error?.response?.data?.error?.message || error?.message || error}`;
568
+ const errorMessage = error?.response?.data?.message
569
+ ?? error?.response?.data?.error?.message
570
+ ?? error?.message
571
+ ?? String(error); // Fallback to string representation
569
572
  logger.error(`Error in executeRequest for ${this.pathwayName}: ${errorMessage}`);
570
573
  if (error.data) {
571
574
  logger.error(`Additional error data: ${JSON.stringify(error.data)}`);
@@ -41,12 +41,15 @@ class OpenAIVisionPlugin extends OpenAIChatPlugin {
41
41
  return { type: 'text', text: parsedItem };
42
42
  }
43
43
 
44
- if (typeof parsedItem === 'object' && parsedItem !== null && parsedItem.type === 'image_url') {
45
- const url = parsedItem.url || parsedItem.image_url?.url;
46
- if (url && await this.validateImageUrl(url)) {
47
- return {type: parsedItem.type, image_url: {url}};
44
+ if (typeof parsedItem === 'object' && parsedItem !== null) {
45
+ // Handle both 'image' and 'image_url' types
46
+ if (parsedItem.type === 'image' || parsedItem.type === 'image_url') {
47
+ const url = parsedItem.image_url?.url || parsedItem.url;
48
+ if (url && await this.validateImageUrl(url)) {
49
+ return { type: 'image_url', image_url: { url } };
50
+ }
51
+ return { type: 'text', text: typeof item === 'string' ? item : JSON.stringify(item) };
48
52
  }
49
- return { type: 'text', text: typeof item === 'string' ? item : JSON.stringify(item) };
50
53
  }
51
54
 
52
55
  return parsedItem;
@@ -90,6 +90,9 @@ class OpenAIWhisperPlugin extends ModelPlugin {
90
90
  sendProgress(true, true);
91
91
  try {
92
92
  res = await this.executeRequest(cortexRequest);
93
+ if (!res) {
94
+ throw new Error('Received null or empty response');
95
+ }
93
96
  if(res?.statusCode && res?.statusCode >= 400){
94
97
  throw new Error(res?.message || 'An error occurred.');
95
98
  }
@@ -107,6 +110,10 @@ class OpenAIWhisperPlugin extends ModelPlugin {
107
110
 
108
111
  if(!wordTimestamped && !responseFormat){
109
112
  //if no response format, convert to text
113
+ if (!res) {
114
+ logger.warn("Received null or empty response from timestamped API when expecting SRT/VTT format. Returning empty string.");
115
+ return "";
116
+ }
110
117
  return convertSrtToText(res);
111
118
  }
112
119
  return res;