@aj-archipelago/cortex 1.3.49 → 1.3.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.js +1 -1
- package/helper-apps/cortex-browser/Dockerfile +19 -31
- package/helper-apps/cortex-browser/function_app.py +708 -181
- package/helper-apps/cortex-browser/requirements.txt +4 -4
- package/helper-apps/cortex-file-handler/blobHandler.js +850 -429
- package/helper-apps/cortex-file-handler/constants.js +64 -48
- package/helper-apps/cortex-file-handler/docHelper.js +7 -114
- package/helper-apps/cortex-file-handler/fileChunker.js +96 -51
- package/helper-apps/cortex-file-handler/function.json +2 -6
- package/helper-apps/cortex-file-handler/helper.js +34 -25
- package/helper-apps/cortex-file-handler/index.js +324 -136
- package/helper-apps/cortex-file-handler/localFileHandler.js +56 -57
- package/helper-apps/cortex-file-handler/package-lock.json +6065 -5964
- package/helper-apps/cortex-file-handler/package.json +8 -4
- package/helper-apps/cortex-file-handler/redis.js +23 -17
- package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +12 -9
- package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +21 -18
- package/helper-apps/cortex-file-handler/scripts/test-azure.sh +1 -1
- package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +1 -1
- package/helper-apps/cortex-file-handler/services/ConversionService.js +288 -0
- package/helper-apps/cortex-file-handler/services/FileConversionService.js +53 -0
- package/helper-apps/cortex-file-handler/start.js +63 -38
- package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +144 -0
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +88 -64
- package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +114 -91
- package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +351 -0
- package/helper-apps/cortex-file-handler/tests/files/DOCX_TestPage.docx +0 -0
- package/helper-apps/cortex-file-handler/tests/files/tests-example.xls +0 -0
- package/helper-apps/cortex-file-handler/tests/start.test.js +943 -642
- package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +31 -0
- package/helper-apps/cortex-markitdown/.funcignore +1 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py +64 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json +21 -0
- package/helper-apps/cortex-markitdown/README.md +94 -0
- package/helper-apps/cortex-markitdown/host.json +15 -0
- package/helper-apps/cortex-markitdown/requirements.txt +2 -0
- package/lib/requestExecutor.js +44 -36
- package/package.json +1 -1
- package/pathways/system/entity/tools/sys_tool_cognitive_search.js +1 -1
- package/pathways/system/entity/tools/sys_tool_readfile.js +24 -2
- package/server/plugins/openAiWhisperPlugin.js +59 -87
- package/helper-apps/cortex-file-handler/tests/docHelper.test.js +0 -148
|
@@ -1,181 +1,708 @@
|
|
|
1
|
-
import azure.functions as func
|
|
2
|
-
import logging
|
|
3
|
-
import json
|
|
4
|
-
|
|
5
|
-
import
|
|
6
|
-
import
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
logging.info("
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
1
|
+
import azure.functions as func
|
|
2
|
+
import logging
|
|
3
|
+
import json
|
|
4
|
+
import base64
|
|
5
|
+
import time
|
|
6
|
+
import os
|
|
7
|
+
import tempfile
|
|
8
|
+
import re
|
|
9
|
+
import asyncio
|
|
10
|
+
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError, Error as PlaywrightError
|
|
11
|
+
import trafilatura
|
|
12
|
+
from typing import Union, Dict, Any, Tuple, Optional
|
|
13
|
+
import shutil
|
|
14
|
+
from aiohttp import web # Added for local server
|
|
15
|
+
|
|
16
|
+
app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
|
|
17
|
+
|
|
18
|
+
# Get global timeout from environment variable, default to 20
|
|
19
|
+
timeout_str = os.environ.get("GLOBAL_TIMEOUT_SECONDS", "20")
|
|
20
|
+
try:
|
|
21
|
+
GLOBAL_TIMEOUT_SECONDS = int(timeout_str)
|
|
22
|
+
except ValueError:
|
|
23
|
+
logging.warning(f"Invalid GLOBAL_TIMEOUT_SECONDS environment variable '{timeout_str}'. Defaulting to 20.")
|
|
24
|
+
GLOBAL_TIMEOUT_SECONDS = 20
|
|
25
|
+
|
|
26
|
+
MIN_TIME_FOR_STEP = 0.5 # Minimum time to allow for any single step if budget is very low
|
|
27
|
+
BUFFER_FOR_CLEANUP_AND_PROCESSING = 2 # Reserve N seconds for trafilatura, screenshot, and cleanup
|
|
28
|
+
SCREENSHOT_DIR_PLAYWRIGHT = "downloaded_files/playwright_screenshots"
|
|
29
|
+
|
|
30
|
+
def get_remaining_time(start_time: float, budget_for_step: Union[float, None] = None) -> float:
|
|
31
|
+
"""Calculates remaining time from the global timeout."""
|
|
32
|
+
elapsed_time = time.time() - start_time
|
|
33
|
+
remaining_global_time = GLOBAL_TIMEOUT_SECONDS - elapsed_time
|
|
34
|
+
if budget_for_step is not None:
|
|
35
|
+
safe_budget = min(budget_for_step, remaining_global_time - BUFFER_FOR_CLEANUP_AND_PROCESSING)
|
|
36
|
+
return max(MIN_TIME_FOR_STEP, safe_budget)
|
|
37
|
+
return max(0, remaining_global_time - BUFFER_FOR_CLEANUP_AND_PROCESSING)
|
|
38
|
+
|
|
39
|
+
def sanitize_filename(url_or_name: str) -> str:
|
|
40
|
+
"""Sanitizes a string to be a valid filename."""
|
|
41
|
+
# Remove http(s)://
|
|
42
|
+
name = re.sub(r'^https?://', '', url_or_name)
|
|
43
|
+
# Replace non-alphanumeric characters (except . - _) with underscores
|
|
44
|
+
name = re.sub(r'[^a-zA-Z0-9._-]', '_', name)
|
|
45
|
+
# Truncate if too long
|
|
46
|
+
return name[:100]
|
|
47
|
+
|
|
48
|
+
# --- Task for Text Extraction ---
|
|
49
|
+
async def _task_extract_text(page, start_time: float) -> Tuple[Optional[str], Optional[str]]:
|
|
50
|
+
extracted_text = None
|
|
51
|
+
html_content = None
|
|
52
|
+
error_message = None
|
|
53
|
+
timed_out_early_task = False
|
|
54
|
+
|
|
55
|
+
try:
|
|
56
|
+
logging.info("[Text Task] Attempting primary text extraction with body.inner_text()...")
|
|
57
|
+
primary_inner_text_budget_s = get_remaining_time(start_time, budget_for_step=7)
|
|
58
|
+
if primary_inner_text_budget_s < MIN_TIME_FOR_STEP:
|
|
59
|
+
logging.warning(f"[Text Task] Skipping primary body.inner_text() due to insufficient time ({primary_inner_text_budget_s:.2f}s).")
|
|
60
|
+
else:
|
|
61
|
+
logging.info(f"[Text Task] Allocating {primary_inner_text_budget_s:.2f}s for primary body.inner_text().")
|
|
62
|
+
try:
|
|
63
|
+
body_text_pl = await asyncio.wait_for(page.locator('body').inner_text(timeout=primary_inner_text_budget_s * 1000), timeout=primary_inner_text_budget_s + 0.5)
|
|
64
|
+
if body_text_pl:
|
|
65
|
+
extracted_text = body_text_pl.strip()
|
|
66
|
+
logging.info(f"[Text Task] Captured primary text via body.inner_text(), length: {len(extracted_text)}.")
|
|
67
|
+
else:
|
|
68
|
+
logging.warning("[Text Task] Primary body.inner_text() was empty.")
|
|
69
|
+
except asyncio.TimeoutError:
|
|
70
|
+
logging.warning(f"[Text Task] Timeout ({primary_inner_text_budget_s:.2f}s) during primary body.inner_text().")
|
|
71
|
+
timed_out_early_task = True
|
|
72
|
+
except Exception as e_primary_body_text:
|
|
73
|
+
logging.warning(f"[Text Task] Error during primary body.inner_text(): {e_primary_body_text}.")
|
|
74
|
+
error_message = f"Error during inner_text: {e_primary_body_text}"
|
|
75
|
+
|
|
76
|
+
if not extracted_text and not timed_out_early_task:
|
|
77
|
+
logging.info("[Text Task] Attempting to fetch page.content() for Trafilatura fallback.")
|
|
78
|
+
page_content_budget_s = get_remaining_time(start_time, budget_for_step=15)
|
|
79
|
+
if page_content_budget_s < MIN_TIME_FOR_STEP * 2:
|
|
80
|
+
logging.warning(f"[Text Task] Skipping page.content() fetch due to insufficient time ({page_content_budget_s:.2f}s).")
|
|
81
|
+
else:
|
|
82
|
+
logging.info(f"[Text Task] Allocating {page_content_budget_s:.2f}s for page.content().")
|
|
83
|
+
try:
|
|
84
|
+
html_content = await asyncio.wait_for(page.content(), timeout=page_content_budget_s)
|
|
85
|
+
if html_content:
|
|
86
|
+
logging.info(f"[Text Task] Successfully fetched page.content(), length: {len(html_content)}.")
|
|
87
|
+
else:
|
|
88
|
+
logging.warning("[Text Task] page.content() returned None or empty.")
|
|
89
|
+
html_content = None
|
|
90
|
+
except asyncio.TimeoutError:
|
|
91
|
+
logging.warning(f"[Text Task] Timeout ({page_content_budget_s:.2f}s) while getting page.content(). html_content will be None.")
|
|
92
|
+
html_content = None
|
|
93
|
+
timed_out_early_task = True
|
|
94
|
+
except Exception as e_final_ps:
|
|
95
|
+
logging.warning(f"[Text Task] Could not get final page.content(): {e_final_ps}")
|
|
96
|
+
if not error_message: error_message = f"Failed to get page.content(): {e_final_ps}"
|
|
97
|
+
html_content = None
|
|
98
|
+
elif not timed_out_early_task:
|
|
99
|
+
logging.info("[Text Task] Skipping page.content() fetch as inner_text succeeded or timed out.")
|
|
100
|
+
|
|
101
|
+
if html_content and not extracted_text and not timed_out_early_task:
|
|
102
|
+
try:
|
|
103
|
+
logging.info("[Text Task] Attempting Trafilatura text extraction as fallback...")
|
|
104
|
+
trafilatura_text = trafilatura.extract(html_content, include_comments=False)
|
|
105
|
+
if trafilatura_text:
|
|
106
|
+
extracted_text = trafilatura_text.strip()
|
|
107
|
+
logging.info(f"[Text Task] Trafilatura extracted fallback text, length: {len(extracted_text)}. This will be used.")
|
|
108
|
+
else:
|
|
109
|
+
logging.warning(f"[Text Task] Trafilatura fallback extraction yielded no text.")
|
|
110
|
+
except Exception as e_traf:
|
|
111
|
+
logging.error(f"[Text Task] Trafilatura fallback error: {e_traf}")
|
|
112
|
+
if not error_message: error_message = f"Trafilatura fallback failed: {e_traf}"
|
|
113
|
+
|
|
114
|
+
if not extracted_text and not error_message:
|
|
115
|
+
logging.warning("[Text Task] Text extraction attempts failed or yielded no text.")
|
|
116
|
+
|
|
117
|
+
except Exception as e_task_text:
|
|
118
|
+
logging.error(f"[Text Task] Unexpected error during text extraction: {e_task_text}")
|
|
119
|
+
error_message = error_message or f"Unexpected text task error: {e_task_text}"
|
|
120
|
+
|
|
121
|
+
return extracted_text, error_message
|
|
122
|
+
|
|
123
|
+
# --- Task for Screenshot Capture ---
|
|
124
|
+
async def _task_capture_screenshot(page, context, start_time: float) -> Tuple[Optional[bytes], Optional[str]]:
|
|
125
|
+
screenshot_bytes = None
|
|
126
|
+
error_message = None
|
|
127
|
+
cdp_session_ss = None
|
|
128
|
+
|
|
129
|
+
try:
|
|
130
|
+
logging.info("[Screenshot Task] Starting screenshot capture process...")
|
|
131
|
+
try:
|
|
132
|
+
logging.info("[Screenshot Task] Scrolling to top and modifying fixed elements...")
|
|
133
|
+
scroll_modify_budget = get_remaining_time(start_time, budget_for_step=3)
|
|
134
|
+
if scroll_modify_budget < MIN_TIME_FOR_STEP / 2:
|
|
135
|
+
logging.warning(f"[Screenshot Task] Skipping scroll/modify due to time ({scroll_modify_budget:.2f}s)")
|
|
136
|
+
else:
|
|
137
|
+
await asyncio.wait_for(page.evaluate("window.scrollTo(0, 0)"), timeout=scroll_modify_budget / 2)
|
|
138
|
+
modified_count = await asyncio.wait_for(page.evaluate(
|
|
139
|
+
"""() => {
|
|
140
|
+
let m_count = 0;
|
|
141
|
+
const allElements = document.querySelectorAll('*');
|
|
142
|
+
allElements.forEach(el => {
|
|
143
|
+
if (el.tagName !== 'BODY' && el.tagName !== 'HTML') {
|
|
144
|
+
const style = window.getComputedStyle(el);
|
|
145
|
+
if (style.position === 'fixed') {
|
|
146
|
+
el.style.position = 'absolute';
|
|
147
|
+
m_count++;
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
});
|
|
151
|
+
return m_count;
|
|
152
|
+
}"""
|
|
153
|
+
), timeout=scroll_modify_budget / 2)
|
|
154
|
+
logging.info(f"[Screenshot Task] {modified_count} fixed elements modified.")
|
|
155
|
+
await asyncio.sleep(0.3)
|
|
156
|
+
except Exception as e_scroll_modify:
|
|
157
|
+
logging.warning(f"[Screenshot Task] Error during scroll/modify: {e_scroll_modify}")
|
|
158
|
+
|
|
159
|
+
try:
|
|
160
|
+
logging.info("[Screenshot Task] Attempting CDP full page screenshot...")
|
|
161
|
+
metrics_budget = get_remaining_time(start_time, budget_for_step=1)
|
|
162
|
+
cdp_capture_budget = get_remaining_time(start_time, budget_for_step=10)
|
|
163
|
+
|
|
164
|
+
if metrics_budget < MIN_TIME_FOR_STEP / 2 or cdp_capture_budget < MIN_TIME_FOR_STEP:
|
|
165
|
+
raise PlaywrightTimeoutError("Not enough time for CDP screenshot steps.")
|
|
166
|
+
|
|
167
|
+
metrics = await asyncio.wait_for(page.evaluate(
|
|
168
|
+
"() => ({ width: Math.max(document.body.scrollWidth, document.documentElement.scrollWidth), height: Math.max(document.body.scrollHeight, document.documentElement.scrollHeight) })"
|
|
169
|
+
), timeout=metrics_budget)
|
|
170
|
+
full_width = metrics['width']
|
|
171
|
+
full_height = metrics['height']
|
|
172
|
+
if not full_width or not full_height: raise ValueError("Invalid page dimensions for CDP.")
|
|
173
|
+
|
|
174
|
+
logging.info(f"[Screenshot Task] Allocating {cdp_capture_budget:.2f}s for CDP capture.")
|
|
175
|
+
cdp_session_ss = await asyncio.wait_for(context.new_cdp_session(page), timeout=1.0)
|
|
176
|
+
cdp_result = await asyncio.wait_for(cdp_session_ss.send(
|
|
177
|
+
"Page.captureScreenshot",
|
|
178
|
+
{"format": "jpeg", "quality": 80, "captureBeyondViewport": True, "clip": {"x": 0, "y": 0, "width": full_width, "height": full_height, "scale": 1}, "fromSurface": True }
|
|
179
|
+
), timeout=cdp_capture_budget)
|
|
180
|
+
screenshot_bytes = base64.b64decode(cdp_result['data'])
|
|
181
|
+
logging.info(f"[Screenshot Task] Captured full page screenshot via CDP (JPEG, {len(screenshot_bytes)} bytes).")
|
|
182
|
+
|
|
183
|
+
except Exception as e_ss_cdp:
|
|
184
|
+
logging.warning(f"[Screenshot Task] CDP screenshot failed: {e_ss_cdp}. Falling back.")
|
|
185
|
+
if not error_message or "closed" in str(e_ss_cdp).lower(): error_message = f"CDP Screenshot fail: {e_ss_cdp}"
|
|
186
|
+
screenshot_bytes = None
|
|
187
|
+
|
|
188
|
+
finally:
|
|
189
|
+
if cdp_session_ss:
|
|
190
|
+
try:
|
|
191
|
+
await asyncio.wait_for(cdp_session_ss.detach(), timeout=1.0)
|
|
192
|
+
logging.info("[Screenshot Task] CDP session detached.")
|
|
193
|
+
except Exception as e_cdp_detach: logging.warning(f"[Screenshot Task] CDP detach error: {e_cdp_detach}")
|
|
194
|
+
|
|
195
|
+
if not screenshot_bytes:
|
|
196
|
+
try:
|
|
197
|
+
pw_full_budget = get_remaining_time(start_time, budget_for_step=8)
|
|
198
|
+
if pw_full_budget < MIN_TIME_FOR_STEP:
|
|
199
|
+
logging.warning("[Screenshot Task] Skipping PW full page fallback due to time.")
|
|
200
|
+
else:
|
|
201
|
+
logging.info("[Screenshot Task] Attempting Playwright full_page fallback...")
|
|
202
|
+
screenshot_bytes = await asyncio.wait_for(page.screenshot(type='jpeg', quality=80, full_page=True), timeout=pw_full_budget)
|
|
203
|
+
logging.info("[Screenshot Task] Captured Playwright full_page fallback.")
|
|
204
|
+
except Exception as e_ss_pw_full:
|
|
205
|
+
logging.warning(f"[Screenshot Task] PW full_page fallback failed: {e_ss_pw_full}. Trying viewport.")
|
|
206
|
+
if not error_message: error_message = f"PW full fallback fail: {e_ss_pw_full}"
|
|
207
|
+
|
|
208
|
+
if not screenshot_bytes:
|
|
209
|
+
try:
|
|
210
|
+
pw_vp_budget = get_remaining_time(start_time, budget_for_step=3)
|
|
211
|
+
if pw_vp_budget < MIN_TIME_FOR_STEP / 2:
|
|
212
|
+
logging.warning("[Screenshot Task] Skipping PW viewport fallback due to time.")
|
|
213
|
+
else:
|
|
214
|
+
logging.info("[Screenshot Task] Attempting Playwright viewport fallback...")
|
|
215
|
+
screenshot_bytes = await asyncio.wait_for(page.screenshot(type='jpeg', quality=80), timeout=pw_vp_budget)
|
|
216
|
+
logging.info("[Screenshot Task] Captured viewport fallback.")
|
|
217
|
+
except Exception as e_ss_pw_viewport:
|
|
218
|
+
logging.warning(f"[Screenshot Task] Viewport fallback failed: {e_ss_pw_viewport}")
|
|
219
|
+
if not error_message: error_message = f"Viewport fallback fail: {e_ss_pw_viewport}"
|
|
220
|
+
|
|
221
|
+
if not screenshot_bytes:
|
|
222
|
+
logging.error("[Screenshot Task] All screenshot attempts failed.")
|
|
223
|
+
error_message = error_message or "Screenshot capture failed despite all attempts."
|
|
224
|
+
|
|
225
|
+
except Exception as e_task_screenshot:
|
|
226
|
+
logging.error(f"[Screenshot Task] Unexpected error during screenshot capture: {e_task_screenshot}")
|
|
227
|
+
error_message = error_message or f"Unexpected screenshot task error: {e_task_screenshot}"
|
|
228
|
+
screenshot_bytes = None
|
|
229
|
+
|
|
230
|
+
return screenshot_bytes, error_message
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
async def scrape_and_screenshot_playwright(url: str, should_screenshot: bool = True) -> dict:
|
|
234
|
+
operation_start_time = time.time()
|
|
235
|
+
logging.info(f"SCRAPE START for {url}") # Log start of function execution
|
|
236
|
+
|
|
237
|
+
screenshot_bytes = None
|
|
238
|
+
extracted_text = None
|
|
239
|
+
error_message = None
|
|
240
|
+
timed_out_early = False
|
|
241
|
+
temp_data_dir_obj = None
|
|
242
|
+
playwright_instance = None
|
|
243
|
+
browser = None
|
|
244
|
+
context = None
|
|
245
|
+
page = None
|
|
246
|
+
|
|
247
|
+
try:
|
|
248
|
+
temp_data_dir_obj = tempfile.TemporaryDirectory()
|
|
249
|
+
temp_data_dir = temp_data_dir_obj.name
|
|
250
|
+
logging.info(f"Using temporary data directory for Playwright context: {temp_data_dir} for URL: {url}")
|
|
251
|
+
|
|
252
|
+
if get_remaining_time(operation_start_time) <= MIN_TIME_FOR_STEP:
|
|
253
|
+
timed_out_early = True
|
|
254
|
+
raise PlaywrightTimeoutError("Not enough time to initialize browser.")
|
|
255
|
+
|
|
256
|
+
logging.info("Attempting to start Playwright...")
|
|
257
|
+
playwright_instance = await async_playwright().start()
|
|
258
|
+
logging.info("Playwright started successfully.")
|
|
259
|
+
|
|
260
|
+
logging.info("Attempting to launch Chromium browser...")
|
|
261
|
+
browser = await playwright_instance.chromium.launch(
|
|
262
|
+
headless=True,
|
|
263
|
+
args=[
|
|
264
|
+
"--no-sandbox",
|
|
265
|
+
"--disable-setuid-sandbox",
|
|
266
|
+
"--ignore-certificate-errors",
|
|
267
|
+
"--disable-dev-shm-usage",
|
|
268
|
+
"--disable-gpu",
|
|
269
|
+
]
|
|
270
|
+
)
|
|
271
|
+
logging.info("Chromium browser launched successfully.")
|
|
272
|
+
|
|
273
|
+
logging.info("Attempting to create browser context...")
|
|
274
|
+
context = await browser.new_context(
|
|
275
|
+
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
|
|
276
|
+
viewport={"width": 1280, "height": 720},
|
|
277
|
+
locale="en-US",
|
|
278
|
+
)
|
|
279
|
+
logging.info("Browser context created successfully.")
|
|
280
|
+
|
|
281
|
+
logging.info("Attempting to create new page...")
|
|
282
|
+
page = await context.new_page()
|
|
283
|
+
logging.info("New page created successfully.")
|
|
284
|
+
|
|
285
|
+
await page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
|
|
286
|
+
|
|
287
|
+
# Page Load
|
|
288
|
+
page_load_budget_s = get_remaining_time(operation_start_time, budget_for_step=10)
|
|
289
|
+
if page_load_budget_s <= MIN_TIME_FOR_STEP:
|
|
290
|
+
timed_out_early = True
|
|
291
|
+
raise PlaywrightTimeoutError("Not enough time for page load.")
|
|
292
|
+
|
|
293
|
+
logging.info(f"Setting page load timeout to: {page_load_budget_s:.2f}s (wait_until 'domcontentloaded')")
|
|
294
|
+
try:
|
|
295
|
+
await page.goto(url, timeout=page_load_budget_s * 1000, wait_until="domcontentloaded")
|
|
296
|
+
logging.info(f"Navigation to {url} completed (domcontentloaded).")
|
|
297
|
+
except PlaywrightTimeoutError:
|
|
298
|
+
logging.warning(f"Page load timeout for {url} (domcontentloaded strategy) within the allocated budget.")
|
|
299
|
+
timed_out_early = True
|
|
300
|
+
try: html_content_on_timeout = await page.content()
|
|
301
|
+
except: html_content_on_timeout = None
|
|
302
|
+
if should_screenshot:
|
|
303
|
+
try:
|
|
304
|
+
screenshot_bytes = await page.screenshot(type='jpeg', quality=80)
|
|
305
|
+
logging.info("Captured viewport screenshot after page load timeout.")
|
|
306
|
+
except Exception as e_ss_on_timeout: logging.error(f"Failed viewport screenshot after load timeout: {e_ss_on_timeout}")
|
|
307
|
+
extracted_text = trafilatura.extract(html_content_on_timeout) if html_content_on_timeout else None
|
|
308
|
+
error_message = f"Page load timed out ({page_load_budget_s:.2f}s)"
|
|
309
|
+
raise PlaywrightTimeoutError(error_message)
|
|
310
|
+
except Exception as e_get:
|
|
311
|
+
logging.error(f"Error during page.goto({url}): {e_get}")
|
|
312
|
+
error_message = f"Error navigating to URL: {e_get}"
|
|
313
|
+
raise
|
|
314
|
+
|
|
315
|
+
# Scrolling (Sequential while loop)
|
|
316
|
+
scrolled = False
|
|
317
|
+
if not timed_out_early and get_remaining_time(operation_start_time) > MIN_TIME_FOR_STEP * 5:
|
|
318
|
+
logging.info("Attempting to scroll page (while loop)...")
|
|
319
|
+
try:
|
|
320
|
+
last_height = await page.evaluate("document.body.scrollHeight")
|
|
321
|
+
scroll_pause_time = 1.5
|
|
322
|
+
scroll_attempts = 0
|
|
323
|
+
max_scroll_attempts = 7
|
|
324
|
+
scroll_loop_start_time = time.time()
|
|
325
|
+
scroll_loop_budget_s = get_remaining_time(scroll_loop_start_time, budget_for_step=max_scroll_attempts * (scroll_pause_time + 0.5))
|
|
326
|
+
|
|
327
|
+
while scroll_attempts < max_scroll_attempts:
|
|
328
|
+
if time.time() - scroll_loop_start_time > scroll_loop_budget_s:
|
|
329
|
+
logging.warning("Scrolling loop timed out.")
|
|
330
|
+
timed_out_early = True; break
|
|
331
|
+
if time.time() - operation_start_time > GLOBAL_TIMEOUT_SECONDS - BUFFER_FOR_CLEANUP_AND_PROCESSING:
|
|
332
|
+
logging.warning("Global timeout reached during scrolling.")
|
|
333
|
+
timed_out_early = True; break
|
|
334
|
+
|
|
335
|
+
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
|
336
|
+
await asyncio.sleep(scroll_pause_time)
|
|
337
|
+
new_height = await page.evaluate("document.body.scrollHeight")
|
|
338
|
+
if new_height == last_height:
|
|
339
|
+
scrolled = True; break
|
|
340
|
+
last_height = new_height
|
|
341
|
+
scroll_attempts += 1
|
|
342
|
+
|
|
343
|
+
if scrolled: logging.info(f"Scrolling finished after {scroll_attempts} attempts.")
|
|
344
|
+
elif not timed_out_early: logging.warning(f"Scrolling completed max {max_scroll_attempts} attempts without full scroll.")
|
|
345
|
+
|
|
346
|
+
except Exception as e_scroll:
|
|
347
|
+
logging.warning(f"Error during scrolling: {e_scroll}")
|
|
348
|
+
|
|
349
|
+
# Simplified Image Wait (Sequential)
|
|
350
|
+
if not timed_out_early and scrolled:
|
|
351
|
+
logging.info("Attempting simplified dynamic image wait (post-scroll)...")
|
|
352
|
+
await asyncio.sleep(1.0)
|
|
353
|
+
|
|
354
|
+
# Cookie Handling (Sequential)
|
|
355
|
+
cookie_banner_handled = False
|
|
356
|
+
if not timed_out_early:
|
|
357
|
+
logging.info(f"Attempting cookie dismissal (super-fast attempts)...")
|
|
358
|
+
cookie_locator_options = [
|
|
359
|
+
{"role": "button", "name": re.compile(r"(Reject All|Tümünü Reddet|Decline All)", re.IGNORECASE)},
|
|
360
|
+
{"role": "link", "name": re.compile(r"(Reject All|Tümünü Reddet|Decline All)", re.IGNORECASE)},
|
|
361
|
+
{"role": "button", "name": re.compile(r"(Reject|Decline|Only necessary|Essential)", re.IGNORECASE)},
|
|
362
|
+
{"role": "link", "name": re.compile(r"(Reject|Decline|Only necessary|Essential)", re.IGNORECASE)},
|
|
363
|
+
{"locator_str": "button[data-testid='uc-reject-all-button']"},
|
|
364
|
+
{"locator_str": "button#uc-reject-all-button"},
|
|
365
|
+
{"role": "button", "name": re.compile(r"(Accept|Allow|Agree)( all| cookies)?", re.IGNORECASE)},
|
|
366
|
+
{"role": "link", "name": re.compile(r"(Accept|Allow|Agree)( all| cookies)?", re.IGNORECASE)},
|
|
367
|
+
]
|
|
368
|
+
cookie_check_start_time = time.time()
|
|
369
|
+
for option in cookie_locator_options:
|
|
370
|
+
if get_remaining_time(operation_start_time) < MIN_TIME_FOR_STEP:
|
|
371
|
+
logging.warning("Global timeout imminent during cookie checks. Halting cookie attempts.")
|
|
372
|
+
timed_out_early = True
|
|
373
|
+
break
|
|
374
|
+
try:
|
|
375
|
+
target_element = None
|
|
376
|
+
if "locator_str" in option: target_element = page.locator(option["locator_str"]).first
|
|
377
|
+
elif "role" in option and "name" in option: target_element = page.get_by_role(option["role"], name=option["name"]).first
|
|
378
|
+
|
|
379
|
+
if target_element:
|
|
380
|
+
logging.debug(f"Attempting ultra-fast interaction with cookie element: {option}")
|
|
381
|
+
clicked_successfully_this_attempt = False
|
|
382
|
+
try:
|
|
383
|
+
await target_element.click(timeout=200)
|
|
384
|
+
clicked_successfully_this_attempt = True
|
|
385
|
+
logging.info(f"Playwright click likely succeeded for cookie element: {option}.")
|
|
386
|
+
except Exception:
|
|
387
|
+
bounding_box = None
|
|
388
|
+
try: bounding_box = await asyncio.wait_for(target_element.bounding_box(), timeout=0.1)
|
|
389
|
+
except: pass
|
|
390
|
+
if bounding_box:
|
|
391
|
+
center_x = bounding_box['x'] + bounding_box['width'] / 2
|
|
392
|
+
center_y = bounding_box['y'] + bounding_box['height'] / 2
|
|
393
|
+
cdp_session_cookie = None
|
|
394
|
+
try:
|
|
395
|
+
cdp_session_cookie = await asyncio.wait_for(context.new_cdp_session(page), timeout=0.2)
|
|
396
|
+
await asyncio.wait_for(cdp_session_cookie.send("Input.dispatchMouseEvent", {"type": "mousePressed", "button": "left", "clickCount": 1, "x": center_x, "y": center_y }), timeout=0.1)
|
|
397
|
+
await asyncio.wait_for(cdp_session_cookie.send("Input.dispatchMouseEvent", {"type": "mouseReleased", "button": "left", "clickCount": 1, "x": center_x, "y": center_y }), timeout=0.1)
|
|
398
|
+
clicked_successfully_this_attempt = True
|
|
399
|
+
logging.info(f"Ultra-fast CDP click dispatched for cookie element: {option}.")
|
|
400
|
+
except Exception as e_cdp_click_fast:
|
|
401
|
+
logging.debug(f"Ultra-fast CDP click also failed for {option}: {e_cdp_click_fast}")
|
|
402
|
+
finally:
|
|
403
|
+
if cdp_session_cookie:
|
|
404
|
+
try:
|
|
405
|
+
await asyncio.wait_for(cdp_session_cookie.detach(), timeout=0.1)
|
|
406
|
+
except:
|
|
407
|
+
pass
|
|
408
|
+
else: logging.debug(f"No bounding_box for {option} for fast CDP attempt.")
|
|
409
|
+
|
|
410
|
+
if clicked_successfully_this_attempt:
|
|
411
|
+
cookie_banner_handled = True
|
|
412
|
+
logging.info("Cookie banner handled, breaking loop.")
|
|
413
|
+
break
|
|
414
|
+
except Exception as e_cookie_outer: logging.debug(f"Outer error processing cookie option {option}: {e_cookie_outer}")
|
|
415
|
+
|
|
416
|
+
logging.info(f"Finished cookie attempts. Handled: {cookie_banner_handled}, Timed Out: {timed_out_early}")
|
|
417
|
+
if not cookie_banner_handled and not timed_out_early: logging.warning("Failed to handle cookie banner.")
|
|
418
|
+
|
|
419
|
+
# --- Parallel Execution: Text Extraction and Screenshot ---
|
|
420
|
+
if not timed_out_early:
|
|
421
|
+
logging.info("Starting parallel execution of text extraction and screenshot tasks...")
|
|
422
|
+
tasks_to_run = []
|
|
423
|
+
tasks_to_run.append(_task_extract_text(page, operation_start_time))
|
|
424
|
+
if should_screenshot:
|
|
425
|
+
tasks_to_run.append(_task_capture_screenshot(page, context, operation_start_time))
|
|
426
|
+
else:
|
|
427
|
+
tasks_to_run.append(asyncio.sleep(0, result=(None, None)))
|
|
428
|
+
|
|
429
|
+
gather_budget_s = get_remaining_time(operation_start_time)
|
|
430
|
+
if gather_budget_s <= MIN_TIME_FOR_STEP:
|
|
431
|
+
logging.warning("Not enough time for parallel tasks. Skipping.")
|
|
432
|
+
timed_out_early = True
|
|
433
|
+
else:
|
|
434
|
+
logging.info(f"Running asyncio.gather with remaining time: {gather_budget_s:.2f}s")
|
|
435
|
+
try:
|
|
436
|
+
results = await asyncio.wait_for(asyncio.gather(*tasks_to_run, return_exceptions=True), timeout=gather_budget_s)
|
|
437
|
+
|
|
438
|
+
# Process Text Result
|
|
439
|
+
text_result = results[0]
|
|
440
|
+
if isinstance(text_result, Exception):
|
|
441
|
+
logging.error(f"Text extraction task failed with exception: {text_result}")
|
|
442
|
+
if not error_message: error_message = f"Text task exception: {text_result}"
|
|
443
|
+
elif text_result is not None:
|
|
444
|
+
extracted_text, text_error = text_result
|
|
445
|
+
if text_error and not error_message: error_message = text_error
|
|
446
|
+
|
|
447
|
+
# Process Screenshot Result
|
|
448
|
+
screenshot_result = results[1]
|
|
449
|
+
if isinstance(screenshot_result, Exception):
|
|
450
|
+
logging.error(f"Screenshot task failed with exception: {screenshot_result}")
|
|
451
|
+
if not error_message: error_message = f"Screenshot task exception: {screenshot_result}"
|
|
452
|
+
elif should_screenshot: error_message += f"; Screenshot task exception: {screenshot_result}"
|
|
453
|
+
elif screenshot_result is not None:
|
|
454
|
+
screenshot_bytes, screenshot_error = screenshot_result
|
|
455
|
+
if screenshot_error:
|
|
456
|
+
logging.warning(f"Screenshot task reported an error: {screenshot_error}")
|
|
457
|
+
if not error_message and should_screenshot: error_message = screenshot_error
|
|
458
|
+
elif should_screenshot: error_message += f"; {screenshot_error}"
|
|
459
|
+
|
|
460
|
+
except asyncio.TimeoutError:
|
|
461
|
+
logging.warning(f"Parallel tasks timed out (gather level after {gather_budget_s:.2f}s). Partial results may be missing.")
|
|
462
|
+
timed_out_early = True
|
|
463
|
+
if not error_message: error_message = "Parallel execution phase timed out."
|
|
464
|
+
|
|
465
|
+
# --- End Parallel Execution ---
|
|
466
|
+
|
|
467
|
+
except PlaywrightTimeoutError as pte:
|
|
468
|
+
logging.warning(f"Playwright operation timed out for {url}: {pte}")
|
|
469
|
+
error_message = error_message or f"Processing timed out (Playwright): {pte}"
|
|
470
|
+
timed_out_early = True
|
|
471
|
+
except PlaywrightError as pe:
|
|
472
|
+
logging.error(f"Playwright error for {url}: {pe}")
|
|
473
|
+
error_message = error_message or f"Playwright error: {pe}"
|
|
474
|
+
except Exception as e:
|
|
475
|
+
logging.error(f"General error in scrape_and_screenshot_playwright for {url}: {e}", exc_info=True)
|
|
476
|
+
error_message = error_message or f"General scraping error: {e}"
|
|
477
|
+
finally:
|
|
478
|
+
logging.info("Entering main finally block for cleanup...")
|
|
479
|
+
if page and not page.is_closed():
|
|
480
|
+
try:
|
|
481
|
+
await page.close()
|
|
482
|
+
logging.info("Playwright page closed in finally block.")
|
|
483
|
+
except Exception as e_page_close: logging.warning(f"Error closing Playwright page in finally: {e_page_close}")
|
|
484
|
+
if context:
|
|
485
|
+
try: await context.close()
|
|
486
|
+
except Exception as e_ctx: logging.warning(f"Error closing Playwright context: {e_ctx}")
|
|
487
|
+
if browser:
|
|
488
|
+
try: await browser.close()
|
|
489
|
+
except Exception as e_brws: logging.warning(f"Error closing Playwright browser: {e_brws}")
|
|
490
|
+
if playwright_instance:
|
|
491
|
+
try: await playwright_instance.stop()
|
|
492
|
+
except Exception as e_pw_stop: logging.warning(f"Error stopping Playwright: {e_pw_stop}")
|
|
493
|
+
|
|
494
|
+
if temp_data_dir_obj:
|
|
495
|
+
time.sleep(0.5)
|
|
496
|
+
try:
|
|
497
|
+
temp_data_dir_obj.cleanup()
|
|
498
|
+
logging.info(f"Successfully cleaned up temporary data directory: {temp_data_dir_obj.name}")
|
|
499
|
+
except Exception as e_cleanup:
|
|
500
|
+
logging.error(f"Error cleaning up temporary data directory {temp_data_dir_obj.name}: {e_cleanup}")
|
|
501
|
+
try: shutil.rmtree(temp_data_dir_obj.name, ignore_errors=True)
|
|
502
|
+
except Exception as e_shutil: logging.error(f"Shutil.rmtree cleanup also failed for {temp_data_dir_obj.name}: {e_shutil}")
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
# Combine results and final error message assembly
|
|
506
|
+
final_error_message = None
|
|
507
|
+
if timed_out_early:
|
|
508
|
+
timeout_info = f"Operation timed out within {GLOBAL_TIMEOUT_SECONDS}s budget."
|
|
509
|
+
if error_message and "timed out" not in error_message.lower(): final_error_message = f"{timeout_info} Last error: {error_message}"
|
|
510
|
+
elif error_message: final_error_message = error_message
|
|
511
|
+
else: final_error_message = timeout_info
|
|
512
|
+
elif error_message:
|
|
513
|
+
final_error_message = error_message
|
|
514
|
+
|
|
515
|
+
if should_screenshot and not screenshot_bytes:
|
|
516
|
+
capture_failure_message = "Screenshot requested but could not be captured."
|
|
517
|
+
if not final_error_message: final_error_message = capture_failure_message
|
|
518
|
+
elif capture_failure_message not in final_error_message: final_error_message += f"; {capture_failure_message}"
|
|
519
|
+
|
|
520
|
+
if not extracted_text and not final_error_message:
|
|
521
|
+
final_error_message = "Text could not be extracted from the page."
|
|
522
|
+
elif not extracted_text and final_error_message and "Text could not be extracted" not in final_error_message and "timed out" not in final_error_message.lower():
|
|
523
|
+
final_error_message += "; Text also failed to extract."
|
|
524
|
+
|
|
525
|
+
final_response = {
|
|
526
|
+
"url": url,
|
|
527
|
+
"text": extracted_text or "",
|
|
528
|
+
"error": final_error_message
|
|
529
|
+
}
|
|
530
|
+
if screenshot_bytes:
|
|
531
|
+
final_response["screenshot_base64"] = base64.b64encode(screenshot_bytes).decode('utf-8')
|
|
532
|
+
|
|
533
|
+
elapsed_total = time.time() - operation_start_time
|
|
534
|
+
logging.info(f"Total time for {url} (Playwright): {elapsed_total:.2f}s. Timed out early: {timed_out_early}. Error: {final_response.get('error')}")
|
|
535
|
+
|
|
536
|
+
return final_response
|
|
537
|
+
|
|
538
|
+
@app.route(route="scrape")
async def http_scrape_trigger(req: func.HttpRequest) -> func.HttpResponse:
    """Azure Functions HTTP trigger for the Playwright scraper.

    Reads 'url' (required) and 'take_screenshot' (optional, defaults to true)
    from the query string first, then from the JSON body, runs the scrape,
    and returns the result as a JSON HttpResponse.

    Responses:
        200 -- scrape ran (the payload's "error" field may still be set,
               e.g. on a timeout with partial results).
        400 -- missing or malformed 'url'.
        500 -- unexpected failure inside this trigger.
    """
    logging.info('Python HTTP scrape trigger function processed a request.')
    url = None
    take_screenshot = True

    try:
        url = req.params.get('url')
        if url:
            logging.info(f"Found URL in query parameters: {url}")
            ss_param = req.params.get('take_screenshot', 'true')
            # Screenshots default on; only an explicit 'false' disables them.
            take_screenshot = ss_param.lower() != 'false'
        else:
            logging.info("URL not found in query parameters.")
    except Exception as e:
        logging.warning(f"Error reading query parameters: {e}")
        url = None

    if not url:
        logging.info("Attempting to read URL from JSON body.")
        try:
            # BUG FIX: HttpRequest.get_json() is synchronous and returns a
            # dict. The previous `await req.get_json()` raised TypeError
            # (dict is not awaitable), which the broad except below swallowed,
            # so every JSON-body request ended up on the 400 path.
            req_body = req.get_json()
            if req_body:
                url = req_body.get('url')
                if url:
                    logging.info(f"Found URL in JSON body: {url}")
                    # JSON may carry the flag as a bool or as a string.
                    ss_param = req_body.get('take_screenshot', True)
                    if isinstance(ss_param, str):
                        take_screenshot = ss_param.lower() != 'false'
                    else:
                        take_screenshot = bool(ss_param)
                    logging.info(f"Screenshot parameter from JSON: {take_screenshot}")
                else:
                    logging.info("URL key not found in JSON body.")
            else:
                logging.info("JSON body is empty.")
        except ValueError:
            # get_json() raises ValueError when the body is absent/not JSON.
            logging.info("Request body is not valid JSON or missing.")
        except Exception as e:
            logging.warning(f"Error reading JSON body: {e}")
            url = None

    if url:
        try:
            if not url.startswith(('http://', 'https://')):
                raise ValueError("Invalid URL format. Must start with http:// or https://")

            result_data = await scrape_and_screenshot_playwright(url, should_screenshot=take_screenshot)

            # Timeouts are reported via the payload's "error" field but still
            # return 200 so callers receive any partial results.
            status_code = 200

            return func.HttpResponse(
                json.dumps(result_data),
                mimetype="application/json",
                status_code=status_code
            )
        except ValueError as ve:
            logging.error(f"Invalid URL provided: {ve}")
            return func.HttpResponse(
                json.dumps({"url": url, "error": str(ve)}),
                mimetype="application/json",
                status_code=400
            )
        except Exception as e:
            logging.error(f"Error in http_scrape_trigger for {url}: {e}")
            return func.HttpResponse(
                json.dumps({"url": url, "error": f"An internal error occurred in trigger: {e}"}),
                mimetype="application/json",
                status_code=500
            )
    else:
        logging.warning("URL not provided in request body or query string.")
        return func.HttpResponse(
            json.dumps({"error": "Please pass a 'url' in the JSON request body or query string"}),
            mimetype="application/json",
            status_code=400
        )
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
# aiohttp server part
async def handle_aiohttp_request(request: web.Request) -> web.Response:
    """Serve /scrape for the standalone aiohttp server.

    Pulls 'url' and the optional 'take_screenshot' flag from the query
    string, falling back to the JSON body, runs the Playwright scrape, and
    replies with the result as JSON (200 on success, 400 for a missing or
    invalid URL, 500 on unexpected errors).
    """
    logging.info('aiohttp /scrape endpoint hit.')
    take_screenshot = True

    # Query string takes precedence over the JSON body.
    url = request.query.get('url')
    if url:
        logging.info(f"Found URL in query parameters: {url}")
        raw_flag = request.query.get('take_screenshot', 'true')
        # Only an explicit 'false' turns screenshots off.
        take_screenshot = raw_flag.lower() != 'false'
    else:
        logging.info("URL not found in query parameters. Attempting to read from JSON body.")
        try:
            payload = await request.json()
            if not payload:
                logging.info("JSON body is empty or not provided.")
            else:
                url = payload.get('url')
                if not url:
                    logging.info("URL key not found in JSON body.")
                else:
                    logging.info(f"Found URL in JSON body: {url}")
                    # JSON may carry the flag as a bool or a string.
                    raw_flag = payload.get('take_screenshot', True)
                    if isinstance(raw_flag, str):
                        take_screenshot = raw_flag.lower() != 'false'
                    else:
                        take_screenshot = bool(raw_flag)
                    logging.info(f"Screenshot parameter from JSON: {take_screenshot}")
        except json.JSONDecodeError:
            logging.info("Request body is not valid JSON.")
        except Exception as e:
            logging.warning(f"Error reading JSON body for aiohttp request: {e}")
            url = None

    # Guard clause: bail out early when no URL was supplied anywhere.
    if not url:
        logging.warning("URL not provided in aiohttp request body or query string.")
        return web.json_response({"error": "Please pass a 'url' in the JSON request body or query string"}, status=400)

    try:
        if not url.startswith(('http://', 'https://')):
            raise ValueError("Invalid URL format. Must start with http:// or https://")

        result = await scrape_and_screenshot_playwright(url, should_screenshot=take_screenshot)
        return web.json_response(result, status=200)
    except ValueError as ve:
        logging.error(f"Invalid URL provided to aiohttp server: {ve}")
        return web.json_response({"url": url, "error": str(ve)}, status=400)
    except Exception as e:
        logging.error(f"Error in aiohttp_handle_request for {url}: {e}")
        return web.json_response({"url": url, "error": f"An internal server error occurred: {e}"}, status=500)
|
|
672
|
+
|
|
673
|
+
async def main_server():
    """Stand up the aiohttp scrape server and run until interrupted.

    The listen port comes from the PORT environment variable (default 7777;
    non-numeric values fall back to 7777 with a warning). The server accepts
    both GET and POST on /scrape and always cleans up the runner on exit.
    """
    logging.basicConfig(level=logging.INFO)

    raw_port = os.environ.get("PORT", "7777")
    try:
        port = int(raw_port)
    except ValueError:
        logging.warning(f"Invalid PORT environment variable '{raw_port}'. Defaulting to 7777.")
        port = 7777

    server_app = web.Application()
    # Same handler serves both verbs: POST carries a JSON body, GET a query string.
    for register in (server_app.router.add_post, server_app.router.add_get):
        register('/scrape', handle_aiohttp_request)

    app_runner = web.AppRunner(server_app)
    await app_runner.setup()
    tcp_site = web.TCPSite(app_runner, '0.0.0.0', port)
    await tcp_site.start()

    logging.info(f"aiohttp server started on http://0.0.0.0:{port}/scrape")
    print(f"======== Running on http://0.0.0.0:{port}/scrape ========")
    print("(Press CTRL+C to quit)")

    try:
        # Park the coroutine; the server runs on the event loop until we stop.
        while True:
            await asyncio.sleep(3600)
    except KeyboardInterrupt:
        logging.info("Server shutting down...")
    finally:
        await app_runner.cleanup()
        logging.info("Server stopped.")
|
|
701
|
+
|
|
702
|
+
# Entry point: run the standalone aiohttp server (used when the container is
# started directly rather than via the Azure Functions host).
if __name__ == "__main__":
    try:
        asyncio.run(main_server())
    except KeyboardInterrupt:
        # Under asyncio.run a Ctrl+C may surface here rather than inside
        # main_server's own handler.
        logging.info("Application shut down by user.")
    except Exception as e:
        # Last-resort boundary: log anything that escaped the server loop.
        logging.critical(f"Application failed to start or crashed: {e}")
|