@aj-archipelago/cortex 1.3.49 → 1.3.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/config.js +1 -1
  2. package/helper-apps/cortex-browser/Dockerfile +19 -31
  3. package/helper-apps/cortex-browser/function_app.py +708 -181
  4. package/helper-apps/cortex-browser/requirements.txt +4 -4
  5. package/helper-apps/cortex-file-handler/blobHandler.js +850 -429
  6. package/helper-apps/cortex-file-handler/constants.js +64 -48
  7. package/helper-apps/cortex-file-handler/docHelper.js +7 -114
  8. package/helper-apps/cortex-file-handler/fileChunker.js +96 -51
  9. package/helper-apps/cortex-file-handler/function.json +2 -6
  10. package/helper-apps/cortex-file-handler/helper.js +34 -25
  11. package/helper-apps/cortex-file-handler/index.js +324 -136
  12. package/helper-apps/cortex-file-handler/localFileHandler.js +56 -57
  13. package/helper-apps/cortex-file-handler/package-lock.json +6065 -5964
  14. package/helper-apps/cortex-file-handler/package.json +8 -4
  15. package/helper-apps/cortex-file-handler/redis.js +23 -17
  16. package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +12 -9
  17. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +21 -18
  18. package/helper-apps/cortex-file-handler/scripts/test-azure.sh +1 -1
  19. package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +1 -1
  20. package/helper-apps/cortex-file-handler/services/ConversionService.js +288 -0
  21. package/helper-apps/cortex-file-handler/services/FileConversionService.js +53 -0
  22. package/helper-apps/cortex-file-handler/start.js +63 -38
  23. package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +144 -0
  24. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +88 -64
  25. package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +114 -91
  26. package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +351 -0
  27. package/helper-apps/cortex-file-handler/tests/files/DOCX_TestPage.docx +0 -0
  28. package/helper-apps/cortex-file-handler/tests/files/tests-example.xls +0 -0
  29. package/helper-apps/cortex-file-handler/tests/start.test.js +943 -642
  30. package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +31 -0
  31. package/helper-apps/cortex-markitdown/.funcignore +1 -0
  32. package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py +64 -0
  33. package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json +21 -0
  34. package/helper-apps/cortex-markitdown/README.md +94 -0
  35. package/helper-apps/cortex-markitdown/host.json +15 -0
  36. package/helper-apps/cortex-markitdown/requirements.txt +2 -0
  37. package/lib/requestExecutor.js +44 -36
  38. package/package.json +1 -1
  39. package/pathways/system/entity/tools/sys_tool_cognitive_search.js +1 -1
  40. package/pathways/system/entity/tools/sys_tool_readfile.js +24 -2
  41. package/server/plugins/openAiWhisperPlugin.js +59 -87
  42. package/helper-apps/cortex-file-handler/tests/docHelper.test.js +0 -148
@@ -1,181 +1,708 @@
1
- import azure.functions as func
2
- import logging
3
- import json
4
- from playwright.sync_api import sync_playwright
5
- import trafilatura
6
- import base64
7
-
8
- app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
9
-
10
- def scrape_and_screenshot(url: str, should_screenshot: bool = True) -> dict:
11
- """Scrapes text and takes a screenshot of a given URL, attempting to reject cookies."""
12
- screenshot_bytes = None
13
- html_content = None
14
- extracted_text = None
15
-
16
- try:
17
- with sync_playwright() as p:
18
- browser = p.chromium.launch(headless=True)
19
- try:
20
- context = browser.new_context()
21
- page = context.new_page()
22
- page.goto(url, wait_until='load', timeout=60000) # Increased timeout
23
-
24
- # --- Attempt to reject cookies ---
25
- # Add more selectors here if needed for different sites
26
- reject_selectors = [
27
- "button:has-text('Reject All')",
28
- "button:has-text('Decline')",
29
- "button:has-text('Only necessary')",
30
- "button:has-text('Tümünü Reddet')", # From your example
31
- "button:has-text('Reject')",
32
- "[aria-label*='Reject']", # Common aria labels
33
- "[id*='reject']",
34
- "[class*='reject']",
35
- # Add more specific selectors based on common banner frameworks if known
36
- ]
37
-
38
- cookie_banner_found = False
39
- for selector in reject_selectors:
40
- try:
41
- # Wait briefly for the banner element to appear
42
- reject_button = page.locator(selector).first
43
- if reject_button.is_visible(timeout=2000): # Wait up to 2 seconds
44
- logging.info(f"Found potential cookie reject button with selector: {selector}")
45
- reject_button.click(timeout=5000) # Click with a timeout
46
- logging.info("Clicked cookie reject button.")
47
- # Wait a tiny bit for the banner to disappear/page to settle
48
- page.wait_for_timeout(500)
49
- cookie_banner_found = True
50
- break # Stop searching once one is clicked
51
- except Exception as e:
52
- # Ignore timeout errors if the element doesn't appear or other exceptions
53
- # logging.debug(f"Cookie reject selector '{selector}' not found or failed: {e}")
54
- pass # Try the next selector
55
-
56
- if not cookie_banner_found:
57
- logging.info("No common cookie reject button found or clicked.")
58
- # ---------------------------------
59
-
60
- html_content = page.content()
61
- # Take FULL page screenshot before closing
62
- if should_screenshot:
63
- screenshot_bytes = page.screenshot(full_page=True) # Added full_page=True
64
- finally:
65
- browser.close()
66
- except Exception as e:
67
- logging.error(f"Playwright error accessing {url}: {e}")
68
- return {"url": url, "error": f"Playwright error: {e}"}
69
-
70
- if html_content:
71
- try:
72
- extracted_text = trafilatura.extract(html_content, include_comments=False)
73
- except Exception as e:
74
- logging.error(f"Trafilatura error processing {url}: {e}")
75
- # Still return screenshot if Playwright succeeded
76
- extracted_text = f"Trafilatura extraction failed: {e}"
77
-
78
- screenshot_base64 = base64.b64encode(screenshot_bytes).decode('utf-8') if screenshot_bytes else None
79
-
80
- response_data = {
81
- "url": url,
82
- "text": extracted_text or "",
83
- }
84
- if screenshot_base64:
85
- response_data["screenshot_base64"] = screenshot_base64
86
-
87
- return response_data
88
-
89
- @app.route(route="scrape") # Changed route name
90
- def http_scrape_trigger(req: func.HttpRequest) -> func.HttpResponse:
91
- logging.info('Python HTTP scrape trigger function processed a request.')
92
-
93
- url = None
94
- take_screenshot = True # Default value
95
-
96
- # 1. Try getting parameters from query string first
97
- try:
98
- url = req.params.get('url')
99
- if url:
100
- logging.info(f"Found URL in query parameters: {url}")
101
- # Handle take_screenshot from query params
102
- ss_param = req.params.get('take_screenshot', 'true') # Query params are strings
103
- take_screenshot = ss_param.lower() != 'false'
104
- else:
105
- logging.info("URL not found in query parameters.")
106
- except Exception as e:
107
- # This shouldn't generally happen with req.params, but good practice
108
- logging.warning(f"Error reading query parameters: {e}")
109
- url = None # Ensure url is None if error occurs here
110
-
111
- # 2. If URL not found in query, try getting from JSON body
112
- if not url:
113
- logging.info("Attempting to read URL from JSON body.")
114
- try:
115
- req_body = req.get_json()
116
- if req_body:
117
- url = req_body.get('url')
118
- if url:
119
- logging.info(f"Found URL in JSON body: {url}")
120
- # Handle take_screenshot from JSON body
121
- ss_param = req_body.get('take_screenshot', True)
122
- if isinstance(ss_param, str):
123
- take_screenshot = ss_param.lower() != 'false'
124
- else:
125
- take_screenshot = bool(ss_param) # Convert other types
126
- logging.info(f"Screenshot parameter from JSON: {take_screenshot}")
127
- else:
128
- logging.info("URL key not found in JSON body.")
129
- else:
130
- logging.info("JSON body is empty.")
131
- except ValueError:
132
- logging.info("Request body is not valid JSON or missing.")
133
- # url remains None
134
- except Exception as e:
135
- logging.warning(f"Error reading JSON body: {e}")
136
- url = None # Ensure url is None if error occurs here
137
-
138
- # 3. Process the request if URL was found
139
- if url:
140
- try:
141
- # Validate URL basic structure (optional but recommended)
142
- if not url.startswith(('http://', 'https://')):
143
- raise ValueError("Invalid URL format. Must start with http:// or https://")
144
-
145
- result_data = scrape_and_screenshot(url, should_screenshot=take_screenshot) # Pass the flag
146
- return func.HttpResponse(
147
- json.dumps(result_data),
148
- mimetype="application/json",
149
- status_code=200
150
- )
151
- except ValueError as ve:
152
- logging.error(f"Invalid URL provided: {ve}")
153
- return func.HttpResponse(
154
- json.dumps({"error": str(ve)}),
155
- mimetype="application/json",
156
- status_code=400
157
- )
158
- except Exception as e:
159
- logging.error(f"Error processing scrape request for {url}: {e}")
160
- return func.HttpResponse(
161
- json.dumps({"error": f"An internal error occurred: {e}"}),
162
- mimetype="application/json",
163
- status_code=500
164
- )
165
- else:
166
- logging.warning("URL not provided in request body or query string.")
167
- return func.HttpResponse(
168
- json.dumps({"error": "Please pass a 'url' in the JSON request body or query string"}),
169
- mimetype="application/json",
170
- status_code=400
171
- )
172
-
173
- # Keep this if you might have other triggers, otherwise it can be removed
174
- # if the scrape trigger is the only one.
175
- # Example of another potential trigger (e.g., timer)
176
- # @app.timer_trigger(schedule="0 */5 * * * *", arg_name="myTimer", run_on_startup=True,
177
- # use_monitor=False)
178
- # def timer_trigger_handler(myTimer: func.TimerRequest) -> None:
179
- # if myTimer.past_due:
180
- # logging.info('The timer is past due!')
181
- # logging.info('Python timer trigger function executed.')
1
+ import azure.functions as func
2
+ import logging
3
+ import json
4
+ import base64
5
+ import time
6
+ import os
7
+ import tempfile
8
+ import re
9
+ import asyncio
10
+ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError, Error as PlaywrightError
11
+ import trafilatura
12
+ from typing import Union, Dict, Any, Tuple, Optional
13
+ import shutil
14
+ from aiohttp import web # Added for local server
15
+
16
+ app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
17
+
18
+ # Get global timeout from environment variable, default to 20
19
+ timeout_str = os.environ.get("GLOBAL_TIMEOUT_SECONDS", "20")
20
+ try:
21
+ GLOBAL_TIMEOUT_SECONDS = int(timeout_str)
22
+ except ValueError:
23
+ logging.warning(f"Invalid GLOBAL_TIMEOUT_SECONDS environment variable '{timeout_str}'. Defaulting to 20.")
24
+ GLOBAL_TIMEOUT_SECONDS = 20
25
+
26
+ MIN_TIME_FOR_STEP = 0.5 # Minimum time to allow for any single step if budget is very low
27
+ BUFFER_FOR_CLEANUP_AND_PROCESSING = 2 # Reserve N seconds for trafilatura, screenshot, and cleanup
28
+ SCREENSHOT_DIR_PLAYWRIGHT = "downloaded_files/playwright_screenshots"
29
+
30
+ def get_remaining_time(start_time: float, budget_for_step: Union[float, None] = None) -> float:
31
+ """Calculates remaining time from the global timeout."""
32
+ elapsed_time = time.time() - start_time
33
+ remaining_global_time = GLOBAL_TIMEOUT_SECONDS - elapsed_time
34
+ if budget_for_step is not None:
35
+ safe_budget = min(budget_for_step, remaining_global_time - BUFFER_FOR_CLEANUP_AND_PROCESSING)
36
+ return max(MIN_TIME_FOR_STEP, safe_budget)
37
+ return max(0, remaining_global_time - BUFFER_FOR_CLEANUP_AND_PROCESSING)
38
+
39
+ def sanitize_filename(url_or_name: str) -> str:
40
+ """Sanitizes a string to be a valid filename."""
41
+ # Remove http(s)://
42
+ name = re.sub(r'^https?://', '', url_or_name)
43
+ # Replace non-alphanumeric characters (except . - _) with underscores
44
+ name = re.sub(r'[^a-zA-Z0-9._-]', '_', name)
45
+ # Truncate if too long
46
+ return name[:100]
47
+
48
+ # --- Task for Text Extraction ---
49
+ async def _task_extract_text(page, start_time: float) -> Tuple[Optional[str], Optional[str]]:
50
+ extracted_text = None
51
+ html_content = None
52
+ error_message = None
53
+ timed_out_early_task = False
54
+
55
+ try:
56
+ logging.info("[Text Task] Attempting primary text extraction with body.inner_text()...")
57
+ primary_inner_text_budget_s = get_remaining_time(start_time, budget_for_step=7)
58
+ if primary_inner_text_budget_s < MIN_TIME_FOR_STEP:
59
+ logging.warning(f"[Text Task] Skipping primary body.inner_text() due to insufficient time ({primary_inner_text_budget_s:.2f}s).")
60
+ else:
61
+ logging.info(f"[Text Task] Allocating {primary_inner_text_budget_s:.2f}s for primary body.inner_text().")
62
+ try:
63
+ body_text_pl = await asyncio.wait_for(page.locator('body').inner_text(timeout=primary_inner_text_budget_s * 1000), timeout=primary_inner_text_budget_s + 0.5)
64
+ if body_text_pl:
65
+ extracted_text = body_text_pl.strip()
66
+ logging.info(f"[Text Task] Captured primary text via body.inner_text(), length: {len(extracted_text)}.")
67
+ else:
68
+ logging.warning("[Text Task] Primary body.inner_text() was empty.")
69
+ except asyncio.TimeoutError:
70
+ logging.warning(f"[Text Task] Timeout ({primary_inner_text_budget_s:.2f}s) during primary body.inner_text().")
71
+ timed_out_early_task = True
72
+ except Exception as e_primary_body_text:
73
+ logging.warning(f"[Text Task] Error during primary body.inner_text(): {e_primary_body_text}.")
74
+ error_message = f"Error during inner_text: {e_primary_body_text}"
75
+
76
+ if not extracted_text and not timed_out_early_task:
77
+ logging.info("[Text Task] Attempting to fetch page.content() for Trafilatura fallback.")
78
+ page_content_budget_s = get_remaining_time(start_time, budget_for_step=15)
79
+ if page_content_budget_s < MIN_TIME_FOR_STEP * 2:
80
+ logging.warning(f"[Text Task] Skipping page.content() fetch due to insufficient time ({page_content_budget_s:.2f}s).")
81
+ else:
82
+ logging.info(f"[Text Task] Allocating {page_content_budget_s:.2f}s for page.content().")
83
+ try:
84
+ html_content = await asyncio.wait_for(page.content(), timeout=page_content_budget_s)
85
+ if html_content:
86
+ logging.info(f"[Text Task] Successfully fetched page.content(), length: {len(html_content)}.")
87
+ else:
88
+ logging.warning("[Text Task] page.content() returned None or empty.")
89
+ html_content = None
90
+ except asyncio.TimeoutError:
91
+ logging.warning(f"[Text Task] Timeout ({page_content_budget_s:.2f}s) while getting page.content(). html_content will be None.")
92
+ html_content = None
93
+ timed_out_early_task = True
94
+ except Exception as e_final_ps:
95
+ logging.warning(f"[Text Task] Could not get final page.content(): {e_final_ps}")
96
+ if not error_message: error_message = f"Failed to get page.content(): {e_final_ps}"
97
+ html_content = None
98
+ elif not timed_out_early_task:
99
+ logging.info("[Text Task] Skipping page.content() fetch as inner_text succeeded or timed out.")
100
+
101
+ if html_content and not extracted_text and not timed_out_early_task:
102
+ try:
103
+ logging.info("[Text Task] Attempting Trafilatura text extraction as fallback...")
104
+ trafilatura_text = trafilatura.extract(html_content, include_comments=False)
105
+ if trafilatura_text:
106
+ extracted_text = trafilatura_text.strip()
107
+ logging.info(f"[Text Task] Trafilatura extracted fallback text, length: {len(extracted_text)}. This will be used.")
108
+ else:
109
+ logging.warning(f"[Text Task] Trafilatura fallback extraction yielded no text.")
110
+ except Exception as e_traf:
111
+ logging.error(f"[Text Task] Trafilatura fallback error: {e_traf}")
112
+ if not error_message: error_message = f"Trafilatura fallback failed: {e_traf}"
113
+
114
+ if not extracted_text and not error_message:
115
+ logging.warning("[Text Task] Text extraction attempts failed or yielded no text.")
116
+
117
+ except Exception as e_task_text:
118
+ logging.error(f"[Text Task] Unexpected error during text extraction: {e_task_text}")
119
+ error_message = error_message or f"Unexpected text task error: {e_task_text}"
120
+
121
+ return extracted_text, error_message
122
+
123
+ # --- Task for Screenshot Capture ---
124
+ async def _task_capture_screenshot(page, context, start_time: float) -> Tuple[Optional[bytes], Optional[str]]:
125
+ screenshot_bytes = None
126
+ error_message = None
127
+ cdp_session_ss = None
128
+
129
+ try:
130
+ logging.info("[Screenshot Task] Starting screenshot capture process...")
131
+ try:
132
+ logging.info("[Screenshot Task] Scrolling to top and modifying fixed elements...")
133
+ scroll_modify_budget = get_remaining_time(start_time, budget_for_step=3)
134
+ if scroll_modify_budget < MIN_TIME_FOR_STEP / 2:
135
+ logging.warning(f"[Screenshot Task] Skipping scroll/modify due to time ({scroll_modify_budget:.2f}s)")
136
+ else:
137
+ await asyncio.wait_for(page.evaluate("window.scrollTo(0, 0)"), timeout=scroll_modify_budget / 2)
138
+ modified_count = await asyncio.wait_for(page.evaluate(
139
+ """() => {
140
+ let m_count = 0;
141
+ const allElements = document.querySelectorAll('*');
142
+ allElements.forEach(el => {
143
+ if (el.tagName !== 'BODY' && el.tagName !== 'HTML') {
144
+ const style = window.getComputedStyle(el);
145
+ if (style.position === 'fixed') {
146
+ el.style.position = 'absolute';
147
+ m_count++;
148
+ }
149
+ }
150
+ });
151
+ return m_count;
152
+ }"""
153
+ ), timeout=scroll_modify_budget / 2)
154
+ logging.info(f"[Screenshot Task] {modified_count} fixed elements modified.")
155
+ await asyncio.sleep(0.3)
156
+ except Exception as e_scroll_modify:
157
+ logging.warning(f"[Screenshot Task] Error during scroll/modify: {e_scroll_modify}")
158
+
159
+ try:
160
+ logging.info("[Screenshot Task] Attempting CDP full page screenshot...")
161
+ metrics_budget = get_remaining_time(start_time, budget_for_step=1)
162
+ cdp_capture_budget = get_remaining_time(start_time, budget_for_step=10)
163
+
164
+ if metrics_budget < MIN_TIME_FOR_STEP / 2 or cdp_capture_budget < MIN_TIME_FOR_STEP:
165
+ raise PlaywrightTimeoutError("Not enough time for CDP screenshot steps.")
166
+
167
+ metrics = await asyncio.wait_for(page.evaluate(
168
+ "() => ({ width: Math.max(document.body.scrollWidth, document.documentElement.scrollWidth), height: Math.max(document.body.scrollHeight, document.documentElement.scrollHeight) })"
169
+ ), timeout=metrics_budget)
170
+ full_width = metrics['width']
171
+ full_height = metrics['height']
172
+ if not full_width or not full_height: raise ValueError("Invalid page dimensions for CDP.")
173
+
174
+ logging.info(f"[Screenshot Task] Allocating {cdp_capture_budget:.2f}s for CDP capture.")
175
+ cdp_session_ss = await asyncio.wait_for(context.new_cdp_session(page), timeout=1.0)
176
+ cdp_result = await asyncio.wait_for(cdp_session_ss.send(
177
+ "Page.captureScreenshot",
178
+ {"format": "jpeg", "quality": 80, "captureBeyondViewport": True, "clip": {"x": 0, "y": 0, "width": full_width, "height": full_height, "scale": 1}, "fromSurface": True }
179
+ ), timeout=cdp_capture_budget)
180
+ screenshot_bytes = base64.b64decode(cdp_result['data'])
181
+ logging.info(f"[Screenshot Task] Captured full page screenshot via CDP (JPEG, {len(screenshot_bytes)} bytes).")
182
+
183
+ except Exception as e_ss_cdp:
184
+ logging.warning(f"[Screenshot Task] CDP screenshot failed: {e_ss_cdp}. Falling back.")
185
+ if not error_message or "closed" in str(e_ss_cdp).lower(): error_message = f"CDP Screenshot fail: {e_ss_cdp}"
186
+ screenshot_bytes = None
187
+
188
+ finally:
189
+ if cdp_session_ss:
190
+ try:
191
+ await asyncio.wait_for(cdp_session_ss.detach(), timeout=1.0)
192
+ logging.info("[Screenshot Task] CDP session detached.")
193
+ except Exception as e_cdp_detach: logging.warning(f"[Screenshot Task] CDP detach error: {e_cdp_detach}")
194
+
195
+ if not screenshot_bytes:
196
+ try:
197
+ pw_full_budget = get_remaining_time(start_time, budget_for_step=8)
198
+ if pw_full_budget < MIN_TIME_FOR_STEP:
199
+ logging.warning("[Screenshot Task] Skipping PW full page fallback due to time.")
200
+ else:
201
+ logging.info("[Screenshot Task] Attempting Playwright full_page fallback...")
202
+ screenshot_bytes = await asyncio.wait_for(page.screenshot(type='jpeg', quality=80, full_page=True), timeout=pw_full_budget)
203
+ logging.info("[Screenshot Task] Captured Playwright full_page fallback.")
204
+ except Exception as e_ss_pw_full:
205
+ logging.warning(f"[Screenshot Task] PW full_page fallback failed: {e_ss_pw_full}. Trying viewport.")
206
+ if not error_message: error_message = f"PW full fallback fail: {e_ss_pw_full}"
207
+
208
+ if not screenshot_bytes:
209
+ try:
210
+ pw_vp_budget = get_remaining_time(start_time, budget_for_step=3)
211
+ if pw_vp_budget < MIN_TIME_FOR_STEP / 2:
212
+ logging.warning("[Screenshot Task] Skipping PW viewport fallback due to time.")
213
+ else:
214
+ logging.info("[Screenshot Task] Attempting Playwright viewport fallback...")
215
+ screenshot_bytes = await asyncio.wait_for(page.screenshot(type='jpeg', quality=80), timeout=pw_vp_budget)
216
+ logging.info("[Screenshot Task] Captured viewport fallback.")
217
+ except Exception as e_ss_pw_viewport:
218
+ logging.warning(f"[Screenshot Task] Viewport fallback failed: {e_ss_pw_viewport}")
219
+ if not error_message: error_message = f"Viewport fallback fail: {e_ss_pw_viewport}"
220
+
221
+ if not screenshot_bytes:
222
+ logging.error("[Screenshot Task] All screenshot attempts failed.")
223
+ error_message = error_message or "Screenshot capture failed despite all attempts."
224
+
225
+ except Exception as e_task_screenshot:
226
+ logging.error(f"[Screenshot Task] Unexpected error during screenshot capture: {e_task_screenshot}")
227
+ error_message = error_message or f"Unexpected screenshot task error: {e_task_screenshot}"
228
+ screenshot_bytes = None
229
+
230
+ return screenshot_bytes, error_message
231
+
232
+
233
+ async def scrape_and_screenshot_playwright(url: str, should_screenshot: bool = True) -> dict:
234
+ operation_start_time = time.time()
235
+ logging.info(f"SCRAPE START for {url}") # Log start of function execution
236
+
237
+ screenshot_bytes = None
238
+ extracted_text = None
239
+ error_message = None
240
+ timed_out_early = False
241
+ temp_data_dir_obj = None
242
+ playwright_instance = None
243
+ browser = None
244
+ context = None
245
+ page = None
246
+
247
+ try:
248
+ temp_data_dir_obj = tempfile.TemporaryDirectory()
249
+ temp_data_dir = temp_data_dir_obj.name
250
+ logging.info(f"Using temporary data directory for Playwright context: {temp_data_dir} for URL: {url}")
251
+
252
+ if get_remaining_time(operation_start_time) <= MIN_TIME_FOR_STEP:
253
+ timed_out_early = True
254
+ raise PlaywrightTimeoutError("Not enough time to initialize browser.")
255
+
256
+ logging.info("Attempting to start Playwright...")
257
+ playwright_instance = await async_playwright().start()
258
+ logging.info("Playwright started successfully.")
259
+
260
+ logging.info("Attempting to launch Chromium browser...")
261
+ browser = await playwright_instance.chromium.launch(
262
+ headless=True,
263
+ args=[
264
+ "--no-sandbox",
265
+ "--disable-setuid-sandbox",
266
+ "--ignore-certificate-errors",
267
+ "--disable-dev-shm-usage",
268
+ "--disable-gpu",
269
+ ]
270
+ )
271
+ logging.info("Chromium browser launched successfully.")
272
+
273
+ logging.info("Attempting to create browser context...")
274
+ context = await browser.new_context(
275
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
276
+ viewport={"width": 1280, "height": 720},
277
+ locale="en-US",
278
+ )
279
+ logging.info("Browser context created successfully.")
280
+
281
+ logging.info("Attempting to create new page...")
282
+ page = await context.new_page()
283
+ logging.info("New page created successfully.")
284
+
285
+ await page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
286
+
287
+ # Page Load
288
+ page_load_budget_s = get_remaining_time(operation_start_time, budget_for_step=10)
289
+ if page_load_budget_s <= MIN_TIME_FOR_STEP:
290
+ timed_out_early = True
291
+ raise PlaywrightTimeoutError("Not enough time for page load.")
292
+
293
+ logging.info(f"Setting page load timeout to: {page_load_budget_s:.2f}s (wait_until 'domcontentloaded')")
294
+ try:
295
+ await page.goto(url, timeout=page_load_budget_s * 1000, wait_until="domcontentloaded")
296
+ logging.info(f"Navigation to {url} completed (domcontentloaded).")
297
+ except PlaywrightTimeoutError:
298
+ logging.warning(f"Page load timeout for {url} (domcontentloaded strategy) within the allocated budget.")
299
+ timed_out_early = True
300
+ try: html_content_on_timeout = await page.content()
301
+ except: html_content_on_timeout = None
302
+ if should_screenshot:
303
+ try:
304
+ screenshot_bytes = await page.screenshot(type='jpeg', quality=80)
305
+ logging.info("Captured viewport screenshot after page load timeout.")
306
+ except Exception as e_ss_on_timeout: logging.error(f"Failed viewport screenshot after load timeout: {e_ss_on_timeout}")
307
+ extracted_text = trafilatura.extract(html_content_on_timeout) if html_content_on_timeout else None
308
+ error_message = f"Page load timed out ({page_load_budget_s:.2f}s)"
309
+ raise PlaywrightTimeoutError(error_message)
310
+ except Exception as e_get:
311
+ logging.error(f"Error during page.goto({url}): {e_get}")
312
+ error_message = f"Error navigating to URL: {e_get}"
313
+ raise
314
+
315
+ # Scrolling (Sequential while loop)
316
+ scrolled = False
317
+ if not timed_out_early and get_remaining_time(operation_start_time) > MIN_TIME_FOR_STEP * 5:
318
+ logging.info("Attempting to scroll page (while loop)...")
319
+ try:
320
+ last_height = await page.evaluate("document.body.scrollHeight")
321
+ scroll_pause_time = 1.5
322
+ scroll_attempts = 0
323
+ max_scroll_attempts = 7
324
+ scroll_loop_start_time = time.time()
325
+ scroll_loop_budget_s = get_remaining_time(scroll_loop_start_time, budget_for_step=max_scroll_attempts * (scroll_pause_time + 0.5))
326
+
327
+ while scroll_attempts < max_scroll_attempts:
328
+ if time.time() - scroll_loop_start_time > scroll_loop_budget_s:
329
+ logging.warning("Scrolling loop timed out.")
330
+ timed_out_early = True; break
331
+ if time.time() - operation_start_time > GLOBAL_TIMEOUT_SECONDS - BUFFER_FOR_CLEANUP_AND_PROCESSING:
332
+ logging.warning("Global timeout reached during scrolling.")
333
+ timed_out_early = True; break
334
+
335
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
336
+ await asyncio.sleep(scroll_pause_time)
337
+ new_height = await page.evaluate("document.body.scrollHeight")
338
+ if new_height == last_height:
339
+ scrolled = True; break
340
+ last_height = new_height
341
+ scroll_attempts += 1
342
+
343
+ if scrolled: logging.info(f"Scrolling finished after {scroll_attempts} attempts.")
344
+ elif not timed_out_early: logging.warning(f"Scrolling completed max {max_scroll_attempts} attempts without full scroll.")
345
+
346
+ except Exception as e_scroll:
347
+ logging.warning(f"Error during scrolling: {e_scroll}")
348
+
349
+ # Simplified Image Wait (Sequential)
350
+ if not timed_out_early and scrolled:
351
+ logging.info("Attempting simplified dynamic image wait (post-scroll)...")
352
+ await asyncio.sleep(1.0)
353
+
354
+ # Cookie Handling (Sequential)
355
+ cookie_banner_handled = False
356
+ if not timed_out_early:
357
+ logging.info(f"Attempting cookie dismissal (super-fast attempts)...")
358
+ cookie_locator_options = [
359
+ {"role": "button", "name": re.compile(r"(Reject All|Tümünü Reddet|Decline All)", re.IGNORECASE)},
360
+ {"role": "link", "name": re.compile(r"(Reject All|Tümünü Reddet|Decline All)", re.IGNORECASE)},
361
+ {"role": "button", "name": re.compile(r"(Reject|Decline|Only necessary|Essential)", re.IGNORECASE)},
362
+ {"role": "link", "name": re.compile(r"(Reject|Decline|Only necessary|Essential)", re.IGNORECASE)},
363
+ {"locator_str": "button[data-testid='uc-reject-all-button']"},
364
+ {"locator_str": "button#uc-reject-all-button"},
365
+ {"role": "button", "name": re.compile(r"(Accept|Allow|Agree)( all| cookies)?", re.IGNORECASE)},
366
+ {"role": "link", "name": re.compile(r"(Accept|Allow|Agree)( all| cookies)?", re.IGNORECASE)},
367
+ ]
368
+ cookie_check_start_time = time.time()
369
+ for option in cookie_locator_options:
370
+ if get_remaining_time(operation_start_time) < MIN_TIME_FOR_STEP:
371
+ logging.warning("Global timeout imminent during cookie checks. Halting cookie attempts.")
372
+ timed_out_early = True
373
+ break
374
+ try:
375
+ target_element = None
376
+ if "locator_str" in option: target_element = page.locator(option["locator_str"]).first
377
+ elif "role" in option and "name" in option: target_element = page.get_by_role(option["role"], name=option["name"]).first
378
+
379
+ if target_element:
380
+ logging.debug(f"Attempting ultra-fast interaction with cookie element: {option}")
381
+ clicked_successfully_this_attempt = False
382
+ try:
383
+ await target_element.click(timeout=200)
384
+ clicked_successfully_this_attempt = True
385
+ logging.info(f"Playwright click likely succeeded for cookie element: {option}.")
386
+ except Exception:
387
+ bounding_box = None
388
+ try: bounding_box = await asyncio.wait_for(target_element.bounding_box(), timeout=0.1)
389
+ except: pass
390
+ if bounding_box:
391
+ center_x = bounding_box['x'] + bounding_box['width'] / 2
392
+ center_y = bounding_box['y'] + bounding_box['height'] / 2
393
+ cdp_session_cookie = None
394
+ try:
395
+ cdp_session_cookie = await asyncio.wait_for(context.new_cdp_session(page), timeout=0.2)
396
+ await asyncio.wait_for(cdp_session_cookie.send("Input.dispatchMouseEvent", {"type": "mousePressed", "button": "left", "clickCount": 1, "x": center_x, "y": center_y }), timeout=0.1)
397
+ await asyncio.wait_for(cdp_session_cookie.send("Input.dispatchMouseEvent", {"type": "mouseReleased", "button": "left", "clickCount": 1, "x": center_x, "y": center_y }), timeout=0.1)
398
+ clicked_successfully_this_attempt = True
399
+ logging.info(f"Ultra-fast CDP click dispatched for cookie element: {option}.")
400
+ except Exception as e_cdp_click_fast:
401
+ logging.debug(f"Ultra-fast CDP click also failed for {option}: {e_cdp_click_fast}")
402
+ finally:
403
+ if cdp_session_cookie:
404
+ try:
405
+ await asyncio.wait_for(cdp_session_cookie.detach(), timeout=0.1)
406
+ except:
407
+ pass
408
+ else: logging.debug(f"No bounding_box for {option} for fast CDP attempt.")
409
+
410
+ if clicked_successfully_this_attempt:
411
+ cookie_banner_handled = True
412
+ logging.info("Cookie banner handled, breaking loop.")
413
+ break
414
+ except Exception as e_cookie_outer: logging.debug(f"Outer error processing cookie option {option}: {e_cookie_outer}")
415
+
416
+ logging.info(f"Finished cookie attempts. Handled: {cookie_banner_handled}, Timed Out: {timed_out_early}")
417
+ if not cookie_banner_handled and not timed_out_early: logging.warning("Failed to handle cookie banner.")
418
+
419
+ # --- Parallel Execution: Text Extraction and Screenshot ---
420
+ if not timed_out_early:
421
+ logging.info("Starting parallel execution of text extraction and screenshot tasks...")
422
+ tasks_to_run = []
423
+ tasks_to_run.append(_task_extract_text(page, operation_start_time))
424
+ if should_screenshot:
425
+ tasks_to_run.append(_task_capture_screenshot(page, context, operation_start_time))
426
+ else:
427
+ tasks_to_run.append(asyncio.sleep(0, result=(None, None)))
428
+
429
+ gather_budget_s = get_remaining_time(operation_start_time)
430
+ if gather_budget_s <= MIN_TIME_FOR_STEP:
431
+ logging.warning("Not enough time for parallel tasks. Skipping.")
432
+ timed_out_early = True
433
+ else:
434
+ logging.info(f"Running asyncio.gather with remaining time: {gather_budget_s:.2f}s")
435
+ try:
436
+ results = await asyncio.wait_for(asyncio.gather(*tasks_to_run, return_exceptions=True), timeout=gather_budget_s)
437
+
438
+ # Process Text Result
439
+ text_result = results[0]
440
+ if isinstance(text_result, Exception):
441
+ logging.error(f"Text extraction task failed with exception: {text_result}")
442
+ if not error_message: error_message = f"Text task exception: {text_result}"
443
+ elif text_result is not None:
444
+ extracted_text, text_error = text_result
445
+ if text_error and not error_message: error_message = text_error
446
+
447
+ # Process Screenshot Result
448
+ screenshot_result = results[1]
449
+ if isinstance(screenshot_result, Exception):
450
+ logging.error(f"Screenshot task failed with exception: {screenshot_result}")
451
+ if not error_message: error_message = f"Screenshot task exception: {screenshot_result}"
452
+ elif should_screenshot: error_message += f"; Screenshot task exception: {screenshot_result}"
453
+ elif screenshot_result is not None:
454
+ screenshot_bytes, screenshot_error = screenshot_result
455
+ if screenshot_error:
456
+ logging.warning(f"Screenshot task reported an error: {screenshot_error}")
457
+ if not error_message and should_screenshot: error_message = screenshot_error
458
+ elif should_screenshot: error_message += f"; {screenshot_error}"
459
+
460
+ except asyncio.TimeoutError:
461
+ logging.warning(f"Parallel tasks timed out (gather level after {gather_budget_s:.2f}s). Partial results may be missing.")
462
+ timed_out_early = True
463
+ if not error_message: error_message = "Parallel execution phase timed out."
464
+
465
+ # --- End Parallel Execution ---
466
+
467
+ except PlaywrightTimeoutError as pte:
468
+ logging.warning(f"Playwright operation timed out for {url}: {pte}")
469
+ error_message = error_message or f"Processing timed out (Playwright): {pte}"
470
+ timed_out_early = True
471
+ except PlaywrightError as pe:
472
+ logging.error(f"Playwright error for {url}: {pe}")
473
+ error_message = error_message or f"Playwright error: {pe}"
474
+ except Exception as e:
475
+ logging.error(f"General error in scrape_and_screenshot_playwright for {url}: {e}", exc_info=True)
476
+ error_message = error_message or f"General scraping error: {e}"
477
+ finally:
478
+ logging.info("Entering main finally block for cleanup...")
479
+ if page and not page.is_closed():
480
+ try:
481
+ await page.close()
482
+ logging.info("Playwright page closed in finally block.")
483
+ except Exception as e_page_close: logging.warning(f"Error closing Playwright page in finally: {e_page_close}")
484
+ if context:
485
+ try: await context.close()
486
+ except Exception as e_ctx: logging.warning(f"Error closing Playwright context: {e_ctx}")
487
+ if browser:
488
+ try: await browser.close()
489
+ except Exception as e_brws: logging.warning(f"Error closing Playwright browser: {e_brws}")
490
+ if playwright_instance:
491
+ try: await playwright_instance.stop()
492
+ except Exception as e_pw_stop: logging.warning(f"Error stopping Playwright: {e_pw_stop}")
493
+
494
+ if temp_data_dir_obj:
495
+ time.sleep(0.5)
496
+ try:
497
+ temp_data_dir_obj.cleanup()
498
+ logging.info(f"Successfully cleaned up temporary data directory: {temp_data_dir_obj.name}")
499
+ except Exception as e_cleanup:
500
+ logging.error(f"Error cleaning up temporary data directory {temp_data_dir_obj.name}: {e_cleanup}")
501
+ try: shutil.rmtree(temp_data_dir_obj.name, ignore_errors=True)
502
+ except Exception as e_shutil: logging.error(f"Shutil.rmtree cleanup also failed for {temp_data_dir_obj.name}: {e_shutil}")
503
+
504
+
505
+ # Combine results and final error message assembly
506
+ final_error_message = None
507
+ if timed_out_early:
508
+ timeout_info = f"Operation timed out within {GLOBAL_TIMEOUT_SECONDS}s budget."
509
+ if error_message and "timed out" not in error_message.lower(): final_error_message = f"{timeout_info} Last error: {error_message}"
510
+ elif error_message: final_error_message = error_message
511
+ else: final_error_message = timeout_info
512
+ elif error_message:
513
+ final_error_message = error_message
514
+
515
+ if should_screenshot and not screenshot_bytes:
516
+ capture_failure_message = "Screenshot requested but could not be captured."
517
+ if not final_error_message: final_error_message = capture_failure_message
518
+ elif capture_failure_message not in final_error_message: final_error_message += f"; {capture_failure_message}"
519
+
520
+ if not extracted_text and not final_error_message:
521
+ final_error_message = "Text could not be extracted from the page."
522
+ elif not extracted_text and final_error_message and "Text could not be extracted" not in final_error_message and "timed out" not in final_error_message.lower():
523
+ final_error_message += "; Text also failed to extract."
524
+
525
+ final_response = {
526
+ "url": url,
527
+ "text": extracted_text or "",
528
+ "error": final_error_message
529
+ }
530
+ if screenshot_bytes:
531
+ final_response["screenshot_base64"] = base64.b64encode(screenshot_bytes).decode('utf-8')
532
+
533
+ elapsed_total = time.time() - operation_start_time
534
+ logging.info(f"Total time for {url} (Playwright): {elapsed_total:.2f}s. Timed out early: {timed_out_early}. Error: {final_response.get('error')}")
535
+
536
+ return final_response
537
+
538
@app.route(route="scrape")
async def http_scrape_trigger(req: func.HttpRequest) -> func.HttpResponse:
    """Azure Functions HTTP trigger for the scraper.

    Resolves 'url' and the optional 'take_screenshot' flag from the query
    string first, then from the JSON body, delegates to
    scrape_and_screenshot_playwright, and returns its result as JSON.

    Returns 400 when no URL is supplied or the URL is malformed, 500 on
    unexpected trigger-level failures, 200 otherwise (scrape-level errors
    are reported inside the JSON payload, not via the status code).
    """
    logging.info('Python HTTP scrape trigger function processed a request.')
    url = None
    take_screenshot = True  # screenshots are on unless explicitly disabled

    # --- Query-string parameters take precedence over the JSON body ---
    try:
        url = req.params.get('url')
        if url:
            logging.info(f"Found URL in query parameters: {url}")
            ss_param = req.params.get('take_screenshot', 'true')
            # Query values are strings; only the literal 'false' disables screenshots.
            take_screenshot = ss_param.lower() != 'false'
        else:
            logging.info("URL not found in query parameters.")
    except Exception as e:
        logging.warning(f"Error reading query parameters: {e}")
        url = None

    if not url:
        logging.info("Attempting to read URL from JSON body.")
        try:
            # BUGFIX: HttpRequest.get_json() is synchronous; the previous
            # `await req.get_json()` raised TypeError (awaiting a dict), was
            # swallowed by the broad except below, and made every JSON-body
            # request fail with a 400.
            req_body = req.get_json()
            if req_body:
                url = req_body.get('url')
                if url:
                    logging.info(f"Found URL in JSON body: {url}")
                    ss_param = req_body.get('take_screenshot', True)
                    # JSON may carry the flag as a string or a boolean.
                    if isinstance(ss_param, str):
                        take_screenshot = ss_param.lower() != 'false'
                    else:
                        take_screenshot = bool(ss_param)
                    logging.info(f"Screenshot parameter from JSON: {take_screenshot}")
                else:
                    logging.info("URL key not found in JSON body.")
            else:
                logging.info("JSON body is empty.")
        except ValueError:
            # get_json() raises ValueError when the body is absent or not JSON.
            logging.info("Request body is not valid JSON or missing.")
        except Exception as e:
            logging.warning(f"Error reading JSON body: {e}")
            url = None

    if url:
        try:
            if not url.startswith(('http://', 'https://')):
                raise ValueError("Invalid URL format. Must start with http:// or https://")

            result_data = await scrape_and_screenshot_playwright(url, should_screenshot=take_screenshot)

            # Scrape-level errors (including timeouts) are conveyed in the
            # payload; the HTTP status stays 200 so callers always get the
            # partial results.
            return func.HttpResponse(
                json.dumps(result_data),
                mimetype="application/json",
                status_code=200
            )
        except ValueError as ve:
            logging.error(f"Invalid URL provided: {ve}")
            return func.HttpResponse(
                json.dumps({"url": url, "error": str(ve)}),
                mimetype="application/json",
                status_code=400
            )
        except Exception as e:
            logging.error(f"Error in http_scrape_trigger for {url}: {e}")
            return func.HttpResponse(
                json.dumps({"url": url, "error": f"An internal error occurred in trigger: {e}"}),
                mimetype="application/json",
                status_code=500
            )
    else:
        logging.warning("URL not provided in request body or query string.")
        return func.HttpResponse(
            json.dumps({"error": "Please pass a 'url' in the JSON request body or query string"}),
            mimetype="application/json",
            status_code=400
        )
617
+
618
+
619
# aiohttp server part
async def handle_aiohttp_request(request: web.Request) -> web.Response:
    """Shared GET/POST handler for /scrape on the standalone aiohttp server.

    Mirrors the Azure Functions trigger: resolves 'url' and the optional
    'take_screenshot' flag from the query string first, falling back to the
    JSON body, then returns the scrape result as a JSON response.
    """
    logging.info('aiohttp /scrape endpoint hit.')

    take_screenshot = True  # screenshots are on unless explicitly disabled
    url = request.query.get('url')

    if url:
        logging.info(f"Found URL in query parameters: {url}")
        # Query values are strings; only the literal 'false' disables screenshots.
        take_screenshot = request.query.get('take_screenshot', 'true').lower() != 'false'
    else:
        logging.info("URL not found in query parameters. Attempting to read from JSON body.")
        try:
            body = await request.json()
            if body:
                url = body.get('url')
                if url:
                    logging.info(f"Found URL in JSON body: {url}")
                    flag = body.get('take_screenshot', True)
                    # JSON may carry the flag as a string or a boolean.
                    take_screenshot = (flag.lower() != 'false') if isinstance(flag, str) else bool(flag)
                    logging.info(f"Screenshot parameter from JSON: {take_screenshot}")
                else:
                    logging.info("URL key not found in JSON body.")
            else:
                logging.info("JSON body is empty or not provided.")
        except json.JSONDecodeError:
            logging.info("Request body is not valid JSON.")
        except Exception as e:
            logging.warning(f"Error reading JSON body for aiohttp request: {e}")
            url = None

    # Guard clause: nothing to scrape without a URL.
    if not url:
        logging.warning("URL not provided in aiohttp request body or query string.")
        return web.json_response({"error": "Please pass a 'url' in the JSON request body or query string"}, status=400)

    try:
        if not url.startswith(('http://', 'https://')):
            raise ValueError("Invalid URL format. Must start with http:// or https://")

        result_data = await scrape_and_screenshot_playwright(url, should_screenshot=take_screenshot)

        # Scrape-level errors are reported inside the payload; status stays 200.
        return web.json_response(result_data, status=200)
    except ValueError as ve:
        logging.error(f"Invalid URL provided to aiohttp server: {ve}")
        return web.json_response({"url": url, "error": str(ve)}, status=400)
    except Exception as e:
        logging.error(f"Error in aiohttp_handle_request for {url}: {e}")
        return web.json_response({"url": url, "error": f"An internal server error occurred: {e}"}, status=500)
672
+
673
async def main_server():
    """Start the standalone aiohttp server exposing /scrape.

    The port is read from the PORT environment variable (default 7777).
    Runs until interrupted, then cleans up the aiohttp runner.
    """
    logging.basicConfig(level=logging.INFO)

    raw_port = os.environ.get("PORT", "7777")
    try:
        port = int(raw_port)
    except ValueError:
        logging.warning(f"Invalid PORT environment variable '{raw_port}'. Defaulting to 7777.")
        port = 7777

    server_app = web.Application()
    # Both verbs route to the same handler; it inspects query string and body.
    server_app.router.add_post('/scrape', handle_aiohttp_request)
    server_app.router.add_get('/scrape', handle_aiohttp_request)

    runner = web.AppRunner(server_app)
    await runner.setup()
    await web.TCPSite(runner, '0.0.0.0', port).start()

    logging.info(f"aiohttp server started on http://0.0.0.0:{port}/scrape")
    print(f"======== Running on http://0.0.0.0:{port}/scrape ========")
    print("(Press CTRL+C to quit)")

    try:
        # Park this coroutine indefinitely; shutdown comes from outside.
        while True:
            await asyncio.sleep(3600)
    except KeyboardInterrupt:
        logging.info("Server shutting down...")
    finally:
        await runner.cleanup()
        logging.info("Server stopped.")
701
+
702
# Entry point when executed directly (standalone aiohttp mode, outside Azure
# Functions). Runs the server until interrupted or it crashes.
if __name__ == "__main__":
    try:
        asyncio.run(main_server())
    except KeyboardInterrupt:
        # Ctrl+C before/outside the server loop's own handler lands here.
        logging.info("Application shut down by user.")
    except Exception as e:
        # Last-resort boundary: log any startup failure or unhandled crash.
        logging.critical(f"Application failed to start or crashed: {e}")