cnhkmcp 2.1.4-py3-none-any.whl → 2.1.6-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- cnhkmcp/__init__.py +126 -126
- cnhkmcp/untracked/back_up/forum_functions.py +998 -0
- cnhkmcp/untracked/back_up/platform_functions.py +2886 -0
- cnhkmcp/untracked/brain-consultant.md +31 -0
- cnhkmcp/untracked/forum_functions.py +350 -941
- cnhkmcp/untracked/platform_functions.py +445 -730
- cnhkmcp/untracked/skills/Claude_Skill_Creation_Guide.md +140 -0
- cnhkmcp/untracked/skills/expression_verifier/SKILL.md +51 -0
- cnhkmcp/untracked/skills/expression_verifier/scripts/validator.py +889 -0
- cnhkmcp/untracked/skills/expression_verifier/scripts/verify_expr.py +52 -0
- cnhkmcp/untracked/skills/pull_BRAINSkill/SKILL.md +51 -0
- cnhkmcp/untracked/skills/pull_BRAINSkill/scripts/pull_skills.py +188 -0
- cnhkmcp/untracked//321/211/320/225/320/235/321/207/342/225/234/320/276/321/205/320/231/320/235/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/230/320/241_/321/205/320/276/320/231/321/210/320/263/320/225/321/205/342/224/220/320/225/321/210/320/266/320/221/321/204/342/225/233/320/255/321/210/342/225/241/320/246/321/205/320/234/320/225.py +3 -1
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/METADATA +1 -1
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/RECORD +19 -13
- cnhkmcp/untracked/APP/Tranformer/ace.log +0 -0
- cnhkmcp/untracked/APP/Tranformer/parsetab.py +0 -60
- cnhkmcp/untracked/APP/simulator/wqb20260107015647.log +0 -57
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/WHEEL +0 -0
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/entry_points.txt +0 -0
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/licenses/LICENSE +0 -0
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/top_level.txt +0 -0
cnhkmcp/untracked/back_up/forum_functions.py (new file)
@@ -0,0 +1,998 @@
+#!/usr/bin/env python3
+"""
+WorldQuant BRAIN Forum Functions - Python Version
+Comprehensive forum functionality including glossary, search, and post viewing.
+"""
+
+import asyncio
+import re
+import sys
+import time
+from datetime import datetime
+from typing import Dict, Any, List, Optional
+
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.edge.options import Options as EdgeOptions
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+import requests
+import os
+import shutil
+
+# Initialize forum MCP server
+try:
+    from mcp.server.fastmcp import FastMCP
+    forum_mcp = FastMCP('brain_forum_server')
+except ImportError:
+    # Fallback for testing
+    forum_mcp = None
+
+def log(message: str, level: str = "INFO"):
+    """Log message with timestamp."""
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(f"[{timestamp}] [{level}] {message}", file=sys.stderr)
+
+class ForumClient:
+    """Forum client for WorldQuant BRAIN support site."""
+
+    def __init__(self):
+        self.base_url = "https://support.worldquantbrain.com"
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36'
+        })
+
+    def get_brain_session(self):
+        """Get authenticated session from BrainApiClient."""
+        try:
+            import sys
+            import os
+            sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+            from platform_functions import brain_client
+            return brain_client.session
+        except ImportError:
+            return None
+
+    def detect_available_browser(self) -> str:
+        """Detect which browser WebDriver is available."""
+        try:
+            # Try Chrome first
+            from selenium.webdriver.chrome.service import Service
+            from selenium.webdriver.chrome.options import Options
+            try:
+                options = Options()
+                options.add_argument('--headless')
+                driver = webdriver.Chrome(options=options)
+                driver.quit()
+                return "chrome"
+            except Exception:
+                pass
+
+            # Try Edge
+            try:
+                from selenium.webdriver.edge.options import Options as EdgeOptions
+                options = EdgeOptions()
+                options.add_argument('--headless')
+                driver = webdriver.Edge(options=options)
+                driver.quit()
+                return "edge"
+            except Exception:
+                pass
+
+            # Default to chrome
+            return "chrome"
+        except Exception:
+            return "chrome"
+
+    def setup_browser_options(self, headless: bool, browser_type: str):
+        """Setup browser options based on browser type."""
+        if browser_type.lower() == "chrome":
+            return self.setup_chrome_options(headless)
+        elif browser_type.lower() == "edge":
+            return self.setup_edge_options(headless)
+        else:
+            return self.setup_chrome_options(headless)
+
+    def setup_edge_options(self, headless: bool = True) -> EdgeOptions:
+        """Setup Edge options for web scraping."""
+        options = EdgeOptions()
+
+        if headless:
+            options.add_argument('--headless')
+
+        # Performance optimizations
+        options.add_argument('--disable-blink-features=AutomationControlled')
+        options.add_argument('--log-level=3')
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-dev-shm-usage')
+        options.add_argument('--disable-web-security')
+        options.add_argument('--disable-features=VizDisplayCompositor')
+        options.add_argument('--disable-gpu')
+        options.add_argument('--disable-extensions')
+        options.add_argument('--disable-images')
+        options.add_argument('--disable-javascript')
+        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
+
+        return options
+
+    def setup_chrome_options(self, headless: bool = True) -> Options:
+        """Setup Chrome options for web scraping."""
+        options = Options()
+
+        if headless:
+            options.add_argument('--headless')
+
+        # Performance optimizations
+        options.add_argument('--disable-blink-features=AutomationControlled')
+        options.add_argument('--log-level=3')
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-dev-shm-usage')
+        options.add_argument('--disable-web-security')
+        options.add_argument('--disable-features=VizDisplayCompositor')
+        options.add_argument('--disable-gpu')
+        options.add_argument('--disable-extensions')
+        options.add_argument('--disable-images')
+        options.add_argument('--disable-javascript')
+        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
+
+        return options
+
+    async def create_driver(self, headless: bool = True):
+        """Create and configure WebDriver with cross-browser support."""
+        browser_type = self.detect_available_browser()
+        log(f"Using browser: {browser_type}", "INFO")
+
+        options = self.setup_browser_options(headless, browser_type)
+
+        try:
+            if browser_type.lower() == "chrome":
+                driver = webdriver.Chrome(options=options)
+            elif browser_type.lower() == "edge":
+                driver = webdriver.Edge(options=options)
+            else:
+                # Fallback to Chrome
+                log("Falling back to Chrome", "WARNING")
+                driver = webdriver.Chrome(options=options)
+
+            # Set aggressive timeouts for speed
+            driver.set_page_load_timeout(30)
+            driver.implicitly_wait(10)
+
+            return driver
+
+        except Exception as e:
+            log(f"Failed to create {browser_type} driver: {str(e)}", "ERROR")
+            help_text = self.get_driver_installation_help(browser_type)
+            log(help_text, "ERROR")
+
+            # Try Chrome as fallback if Edge failed
+            if browser_type.lower() != "chrome":
+                try:
+                    log("Trying Chrome as fallback", "INFO")
+                    chrome_options = self.setup_browser_options(headless, "chrome")
+                    driver = webdriver.Chrome(options=chrome_options)
+                    driver.set_page_load_timeout(30)
+                    driver.implicitly_wait(10)
+                    return driver
+                except Exception as e2:
+                    log(f"Chrome fallback also failed: {str(e2)}", "ERROR")
+                    chrome_help = self.get_driver_installation_help("chrome")
+                    log(chrome_help, "ERROR")
+
+            raise Exception(f"Could not create any browser driver. {help_text}")
+
+    async def login_to_forum(self, driver, email: str, password: str) -> bool:
+        """Login to the WorldQuant BRAIN forum using existing authentication."""
+        try:
+            # Import BrainApiClient from platform_functions
+            import sys
+            import os
+            sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+            try:
+                from platform_functions import brain_client
+                log("Using existing BrainApiClient for authentication", "INFO")
+
+                # First authenticate with BrainApiClient
+                auth_result = await brain_client.authenticate(email, password)
+                if auth_result.get('status') != 'authenticated':
+                    log("BrainApiClient authentication failed", "ERROR")
+                    return False
+
+                log("Successfully authenticated via BrainApiClient", "SUCCESS")
+
+                # Navigate to forum with authenticated session
+                log("Navigating to forum with authenticated session", "WORK")
+                driver.get("https://support.worldquantbrain.com/hc/en-us")
+                await asyncio.sleep(2)
+
+                # Add authentication cookies to browser
+                cookies = brain_client.session.cookies
+                for cookie in cookies:
+                    driver.add_cookie({
+                        'name': cookie.name,
+                        'value': cookie.value,
+                        'domain': '.worldquantbrain.com'
+                    })
+
+                # Refresh page with cookies
+                driver.refresh()
+                await asyncio.sleep(2)
+
+                return True
+
+            except ImportError:
+                log("BrainApiClient not available, using manual login", "WARNING")
+
+                # Fallback to manual login
+                driver.get("https://support.worldquantbrain.com/hc/en-us/signin")
+                await asyncio.sleep(3)
+
+                email_input = WebDriverWait(driver, 15).until(
+                    EC.presence_of_element_located((By.NAME, "email"))
+                )
+                password_input = WebDriverWait(driver, 15).until(
+                    EC.presence_of_element_located((By.NAME, "currentPassword"))
+                )
+
+                email_input.clear()
+                email_input.send_keys(email)
+                password_input.clear()
+                password_input.send_keys(password)
+
+                login_button = WebDriverWait(driver, 15).until(
+                    EC.element_to_be_clickable((By.XPATH, '//button[@type="submit"]'))
+                )
+                login_button.click()
+                await asyncio.sleep(3)
+
+                return True
+
+        except Exception as e:
+            log(f"Login failed: {str(e)}", "ERROR")
+            return False
+
+    async def get_glossary_terms(self, email: str, password: str, headless: bool = False) -> Dict[str, Any]:
+        """Extract glossary terms from the forum."""
+        driver = None
+        try:
+            log("Starting glossary extraction process", "INFO")
+
+            # Add timeout protection
+            async def extraction_with_timeout():
+                return await self._perform_glossary_extraction(email, password, headless)
+
+            # Run with 5-minute timeout
+            result = await asyncio.wait_for(extraction_with_timeout(), timeout=300)
+            return result
+
+        except asyncio.TimeoutError:
+            log("Glossary extraction timed out after 5 minutes", "ERROR")
+            return {"error": "Glossary extraction timed out after 5 minutes"}
+        except Exception as e:
+            log(f"Glossary extraction failed: {str(e)}", "ERROR")
+            return {"error": str(e)}
+        finally:
+            if driver:
+                try:
+                    driver.quit()
+                except:
+                    pass
+
+    async def _perform_glossary_extraction(self, email: str, password: str, headless: bool) -> Dict[str, Any]:
+        """Perform the actual glossary extraction."""
+        driver = None
+        try:
+            driver = await self.create_driver(headless)
+
+            # Login
+            if not await self.login_to_forum(driver, email, password):
+                raise Exception("Failed to login to forum")
+
+            # Navigate to glossary page
+            log("Navigating to glossary page", "WORK")
+            driver.get("https://support.worldquantbrain.com/hc/en-us/articles/4902349883927-Click-here-for-a-list-of-terms-and-their-definitions")
+            await asyncio.sleep(5)
+
+            # Extract content
+            log("Extracting glossary content", "WORK")
+            page_source = driver.page_source
+            soup = BeautifulSoup(page_source, 'html.parser')
+
+            # Parse glossary terms
+            terms = self._parse_glossary_terms(page_source)
+
+            log(f"Extracted {len(terms)} glossary terms", "SUCCESS")
+            return {
+                "terms": terms,
+                "total_count": len(terms),
+                "extraction_timestamp": datetime.now().isoformat()
+            }
+
+        finally:
+            if driver:
+                try:
+                    driver.quit()
+                except:
+                    pass
+
+    def _parse_glossary_terms(self, content: str) -> List[Dict[str, str]]:
+        """Parse glossary terms from HTML content."""
+        terms = []
+        lines = content.split('\n')
+
+        current_term = None
+        current_definition = []
+        is_collecting_definition = False
+        found_first_real_term = False
+
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue
+
+            # Skip navigation and metadata lines at the beginning
+            if not found_first_real_term and self._is_navigation_or_metadata(line):
+                continue
+
+            # Check if this line looks like a term
+            if self._looks_like_term(line) and not is_collecting_definition:
+                # Mark that we found the first real term
+                if not found_first_real_term:
+                    found_first_real_term = True
+
+                # Save previous term if exists
+                if current_term and current_definition:
+                    terms.append({
+                        "term": current_term.strip(),
+                        "definition": " ".join(current_definition).strip()
+                    })
+
+                current_term = line
+                current_definition = []
+                is_collecting_definition = True
+            elif is_collecting_definition and found_first_real_term:
+                # Check if this is the start of a new term
+                if self._looks_like_term(line):
+                    # Save current term
+                    if current_term and current_definition:
+                        terms.append({
+                            "term": current_term.strip(),
+                            "definition": " ".join(current_definition).strip()
+                        })
+
+                    current_term = line
+                    current_definition = []
+                else:
+                    # Add to definition
+                    if current_definition:
+                        current_definition.append(line)
+                    else:
+                        current_definition = [line]
+
+        # Don't forget the last term
+        if current_term and current_definition and found_first_real_term:
+            terms.append({
+                "term": current_term.strip(),
+                "definition": " ".join(current_definition).strip()
+            })
+
+        # Filter out invalid terms and improve quality
+        return [term for term in terms if
+                len(term["term"]) > 0 and
+                len(term["definition"]) > 10 and  # Ensure meaningful definitions
+                not self._is_navigation_or_metadata(term["term"]) and
+                "ago" not in term["definition"] and  # Remove timestamp-like definitions
+                "minute read" not in term["definition"]]  # Remove reading time
+
+    def _looks_like_term(self, line: str) -> bool:
+        """Check if a line looks like a glossary term."""
+        # Skip very long lines (likely definitions)
+        if len(line) > 100:
+            return False
+
+        # Skip navigation and metadata
+        if self._is_navigation_or_metadata(line):
+            return False
+
+        # Skip lines that start with common definition words
+        definition_starters = ['the', 'a', 'an', 'this', 'that', 'it', 'is', 'are', 'was', 'were', 'for', 'to', 'in', 'on', 'at', 'by', 'with']
+        first_word = line.lower().split(' ')[0]
+        if first_word and first_word in definition_starters:
+            return False
+
+        # Check if line has characteristics of a term
+        # Terms are often short, may be all caps, or start with capital
+        is_short = len(line) <= 80
+        starts_with_capital = bool(re.match(r'^[A-Z]', line))
+        has_all_caps = bool(re.match(r'^[A-Z\s\-\/\(\)]+$', line))
+        has_reasonable_length = len(line) >= 2
+
+        return is_short and has_reasonable_length and (starts_with_capital or has_all_caps)
+
+    def _is_navigation_or_metadata(self, line: str) -> bool:
+        """Check if a line is navigation or metadata."""
+        navigation_patterns = [
+            r'^\d+ days? ago$',
+            r'~\d+ minute read',
+            r'^Follow',
+            r'^Not yet followed',
+            r'^Updated$',
+            r'^AS\d+$',
+            r'^[A-Z] - [A-Z] - [A-Z]',  # Letter navigation
+            r'^A$',
+            r'^B$',
+            r'^[A-Z]$'  # Single letters
+        ]
+
+        return any(re.match(pattern, line.strip()) for pattern in navigation_patterns)
+
+    def get_driver_installation_help(self, browser_type: str) -> str:
+        """Provide helpful instructions for installing WebDriver."""
+        if browser_type.lower() == "chrome":
+            return """
+Chrome WebDriver not found. Please install ChromeDriver:
+1. Download from: https://chromedriver.chromium.org/downloads
+2. Make sure version matches your Chrome browser
+3. Add to PATH or place in current directory
+4. Alternative: Install via pip: pip install chromedriver-autoinstaller
+"""
+        elif browser_type.lower() == "edge":
+            return """
+Edge WebDriver not found. Please install Edge WebDriver:
+1. Download from: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
+2. Make sure version matches your Edge browser
+3. Add to PATH or place in current directory
+4. Alternative: Install via pip: pip install msedge-selenium-tools
+"""
+        else:
+            return "Please install either ChromeDriver or Edge WebDriver for browser automation."
+
+    async def read_full_forum_post(self, email: str, password: str, post_url_or_id: str,
+                                   headless: bool = False, include_comments: bool = True) -> Dict[str, Any]:
+        """Read a complete forum post with optional comments."""
+        driver = None
+        try:
+            log("Starting forum post reading process", "INFO")
+
+            # Determine if input is URL or article ID
+            is_url = post_url_or_id.startswith('http')
+            if is_url:
+                post_url = post_url_or_id
+            else:
+                post_url = f"https://support.worldquantbrain.com/hc/zh-cn/community/posts/{post_url_or_id}"
+
+            log(f"Target URL: {post_url}", "INFO")
+            log(f"Include comments: {include_comments}", "INFO")
+
+            driver = await self.create_driver(headless)
+
+            # Login
+            if not await self.login_to_forum(driver, email, password):
+                raise Exception("Failed to login to forum")
+
+            # Navigate directly to post URL
+            log(f"Opening post: {post_url}", "WORK")
+            driver.get(post_url)
+            log("Post page loaded, extracting content immediately", "WORK")
+
+            # Wait minimal time for content to appear
+            await asyncio.sleep(2)
+
+            # Extract post content quickly
+            post_data = {}
+            page_source = driver.page_source
+            soup = BeautifulSoup(page_source, 'html.parser')
+
+            # Extract post title
+            title = soup.select_one('.post-title, h1, .article-title')
+            if not title:
+                title = soup.select_one('title')
+            post_data['title'] = title.get_text().strip() if title else 'Unknown Title'
+
+            # Extract post author
+            author = soup.select_one('.post-author, .author, .article-author')
+            if not author:
+                author = soup.select_one('.comment-author')
+            post_data['author'] = author.get_text().strip() if author else 'Unknown Author'
+
+            # Extract post date
+            date = soup.select_one('.post-date, .date, .article-date, time')
+            if not date:
+                time_element = soup.select_one('time')
+                if time_element:
+                    date = time_element.get('datetime') or time_element.get('title') or time_element.get_text().strip()
+                else:
+                    date = 'Unknown Date'
+            else:
+                date = date.get_text().strip()
+            post_data['date'] = date if date else 'Unknown Date'
+
+            # Extract post content
+            post_content = soup.select_one('.post-body, .article-body, .content, .post-content')
+            if not post_content:
+                post_content = soup.select_one('article, main')
+
+            if post_content:
+                post_data['content_html'] = str(post_content)
+                post_data['content_text'] = post_content.get_text().strip()
+            else:
+                post_data['content_html'] = 'No content found'
+                post_data['content_text'] = 'No content found'
+
+            post_data['url'] = post_url
+            post_data['current_url'] = driver.current_url
+
+            log(f"Post content extracted: \"{post_data['title']}\"", "SUCCESS")
+
+            comments = []
+            total_comments = 0
+
+            # Extract comments conditionally
+            if include_comments:
+                log("Extracting comments...", "WORK")
+                comments = await self._extract_forum_comments_full(driver, soup)
+                total_comments = len(comments)
+                log(f"Extracted {total_comments} comments", "SUCCESS")
+            else:
+                log("Skipping comment extraction (includeComments=false)", "INFO")
+
+            return {
+                "success": True,
+                "post": post_data,
+                "comments": comments,
+                "total_comments": total_comments,
+                "extracted_at": datetime.now().isoformat(),
+                "processing_time": "full_extraction_with_comments" if include_comments else "post_only_extraction",
+                "include_comments": include_comments
+            }
+
+        except Exception as e:
+            log(f"Failed to read forum post: {str(e)}", "ERROR")
+            return {"error": str(e)}
+        finally:
+            if driver:
+                try:
+                    driver.quit()
+                except:
+                    pass
+
+    async def _extract_forum_comments_full(self, driver, soup: BeautifulSoup) -> List[Dict[str, Any]]:
+        """Extract all comments from forum post with pagination support."""
+        all_comments = []
+        page_num = 1
+
+        try:
+            # First extract comments from current page source
+            page_comments = self._parse_comments_from_html(soup)
+            all_comments.extend(page_comments)
+            log(f"Found {len(page_comments)} comments on page {page_num}", "INFO")
+
+            # Check for pagination and continue if needed
+            while True:
+                try:
+                    # Look for next page button
+                    next_button = driver.find_element(By.CSS_SELECTOR, "span.pagination-next-text, .pagination-next, .next")
+                    next_text = next_button.text
+
+                    if "下一页" in next_text or "Next" in next_text or "next" in next_text.lower():
+                        log(f"Found next page, continuing to page {page_num + 1}", "INFO")
+                        next_button.click()
+                        await asyncio.sleep(2)  # Minimal wait for next page
+
+                        # Extract comments from new page
+                        new_page_source = driver.page_source
+                        new_soup = BeautifulSoup(new_page_source, 'html.parser')
+                        new_page_comments = self._parse_comments_from_html(new_soup)
+
+                        if len(new_page_comments) == 0:
+                            break
+
+                        all_comments.extend(new_page_comments)
+                        page_num += 1
+                        log(f"Found {len(new_page_comments)} comments on page {page_num}", "INFO")
+                    else:
+                        break
+                except Exception as e:
+                    log("No more pages found", "INFO")
+                    break
+
+            return all_comments
+
+        except Exception as e:
+            log(f"Error in comment extraction: {str(e)}", "WARNING")
+            return all_comments
+
+    def _parse_comments_from_html(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
+        """Parse comments from HTML using BeautifulSoup."""
+        comments = []
+
+        # Try multiple selectors for comments
+        comment_selectors = [
+            'ul#comments.comment-list li.comment',
+            '.comment-list .comment',
+            '.comments .comment',
+            'li.comment',
+            '.comment-item'
+        ]
+
+        comment_elements = None
+
+        for selector in comment_selectors:
+            comment_elements = soup.select(selector)
+            if comment_elements:
+                log(f"Found comments using selector: {selector}", "INFO")
+                break
+
+        if not comment_elements:
+            log("No comments found on this page", "INFO")
+            return comments
+
+        for index, element in enumerate(comment_elements):
+            try:
+                comment = {}
+
+                # Extract comment ID
+                comment['id'] = element.get('id') or f"comment-{index}"
+
+                # Extract author
+                author_element = element.select_one('.comment-author a, .author a, .comment-author')
+                comment['author'] = author_element.get_text().strip() if author_element else 'Unknown Author'
+                comment['author_link'] = author_element.get('href') if author_element else ''
+
+                # Extract date
+                time_element = element.select_one('.meta-data time, time, .date, .comment-date')
+                if time_element:
+                    comment['date'] = time_element.get('datetime') or time_element.get('title') or time_element.get_text().strip()
+                    comment['date_display'] = time_element.get('title') or time_element.get_text().strip()
+                else:
+                    comment['date'] = 'Unknown Date'
+                    comment['date_display'] = 'Unknown Date'
+
+                # Extract content
+                content_element = element.select_one('.comment-body, .comment-content, .content')
+                if content_element:
+                    comment['content_html'] = str(content_element)
+                    comment['content_text'] = content_element.get_text().strip()
+                else:
+                    comment['content_html'] = ''
+                    comment['content_text'] = ''
+
+                # Extract votes
+                vote_element = element.select_one('.vote-up span, .votes, .vote-count')
+                comment['votes'] = vote_element.get_text().strip() if vote_element else '0'
+
+                # Extract status
+                status_element = element.select_one('.status-label, .status, .badge')
+                comment['status'] = status_element.get_text().strip() if status_element else '普通评论'
+
+                if comment['content_text']:
+                    comments.append(comment)
+
+            except Exception as e:
+                log(f"Error parsing comment {index}: {str(e)}", "WARNING")
+
+        return comments
+
+    async def search_forum_posts(self, email: str, password: str, search_query: str,
+                                 max_results: int = 50, headless: bool = True) -> Dict[str, Any]:
+        """Search forum posts."""
+        driver = None
+        try:
+            log("Starting forum search process", "INFO")
+            log(f"Search query: '{search_query}'", "INFO")
+            log(f"Max results: {max_results}", "INFO")
+
+            driver = await self.create_driver(headless)
+
+            # Login
+            if not await self.login_to_forum(driver, email, password):
+                raise Exception("Failed to login to forum")
+
+            # Navigate to search
+            encoded_query = requests.utils.quote(search_query)
+            search_url = f"https://support.worldquantbrain.com/hc/zh-cn/search?utf8=%E2%9C%93&query={encoded_query}"
+            log(f"Opening search URL: {search_url}", "WORK")
+
+            driver.get(search_url)
+            await asyncio.sleep(2)
+
+            # Collect results with pagination
+            all_results = []
+            page_num = 1
+
+            log("Starting result collection with pagination", "WORK")
+
+            while len(all_results) < max_results:
+                log(f"Processing page {page_num}", "INFO")
+
+                # Wait for search results
+                try:
+                    WebDriverWait(driver, 10).until(
+                        EC.presence_of_element_located((By.CSS_SELECTOR, '.search-results-list, .search-result-list-item'))
+                    )
+                except TimeoutException:
+                    log(f"No search results found on page {page_num}", "WARNING")
+                    break
+
+                # Extract results from current page
+                page_source = driver.page_source
+                soup = BeautifulSoup(page_source, 'html.parser')
+                page_results = self._extract_search_results(soup, page_num)
+
+                if not page_results:
+                    log(f"No more results found on page {page_num}", "INFO")
+                    break
+
+                all_results.extend(page_results)
+
+                # Check if we have enough results
+                if len(all_results) >= max_results:
+                    all_results = all_results[:max_results]
+                    break
+
+                # Try to go to next page
+                if not await self._go_to_next_search_page(driver, soup):
+                    log("No more pages available", "INFO")
+                    break
+
+                page_num += 1
+                await asyncio.sleep(1)
+
+            # Analyze results
+            analysis = self._analyze_search_results(all_results, search_query)
+
+            log(f"Search completed. Found {len(all_results)} results", "SUCCESS")
+            return {
+                "results": all_results,
+                "total_found": len(all_results),
+                "search_query": search_query,
+                "analysis": analysis,
+                "search_timestamp": datetime.now().isoformat()
+            }
+
+        except Exception as e:
+            log(f"Search failed: {str(e)}", "ERROR")
+            return {"error": str(e)}
+        finally:
+            if driver:
+                try:
+                    driver.quit()
+                except:
+                    pass
+
+    def _extract_search_results(self, soup: BeautifulSoup, page_num: int) -> List[Dict[str, Any]]:
+        """Extract search results from a page using multiple resilient selectors.
+
+        Improvements vs original implementation:
+        - Tries several container selectors (mirrors TS Cheerio approach)
+        - Extracts richer metadata: description_html/text, votes, comments, author, date
+        - Preserves legacy fields (snippet, metadata) for backward compatibility
+        - Adds index & page for downstream analytics
+        - Robust fallbacks & normalization of URLs
+        """
+        results: List[Dict[str, Any]] = []
+
+        # Ordered list of possible container selectors (keep broad ones last)
+        container_selectors = [
+            '.search-result-list-item',
+            '.search-results-list .search-result',
+            '.striped-list-item',
+            '.article-list-item',
+            'article.search-result',
+            'div.search-result',
+        ]
+
+        # Collect candidate elements (stop at first selector that yields results)
+        result_items = []
+        for selector in container_selectors:
+            found = soup.select(selector)
+            if found:
+                log(f"Found {len(found)} search results using selector: {selector}", "INFO")
+                result_items = found
+                break
+
+        # Fallback: regex class scan (original heuristic)
+        if not result_items:
+            fallback = soup.find_all(['article', 'div'], class_=re.compile(r'search-result|article-item'))
+            if fallback:
+                log(f"Fallback selector captured {len(fallback)} results", "INFO")
+                result_items = fallback
+            else:
+                log("No search result items found with any selector", "WARNING")
+                return results
+
+        def first_text(element, selector_list: List[str]) -> str:
+            for sel in selector_list:
+                found = element.select_one(sel)
+                if found and found.get_text(strip=True):
+                    return found.get_text(strip=True)
+            return ''
+
+        for idx, item in enumerate(result_items):
+            try:
+                # Title & link
+                title_link_elem = None
+                title_selectors = [
+                    '.search-result-title a',
+                    'h3 a',
+                    '.title a',
+                    'a'
+                ]
+                for sel in title_selectors:
+                    candidate = item.select_one(sel)
+                    if candidate and candidate.get_text(strip=True):
+                        title_link_elem = candidate
+                        break
+
+                title = title_link_elem.get_text(strip=True) if title_link_elem else 'No title'
+                link = title_link_elem.get('href') if title_link_elem and title_link_elem.has_attr('href') else ''
+                if link and not link.startswith('http'):
+                    link = f"https://support.worldquantbrain.com{link}"
+
+                if not link and not title:
+                    continue  # Skip invalid entries
+
+                # Description / snippet
+                desc_elem = None
+                desc_selectors = [
+                    '.search-results-description',
+                    '.description',
+                    '.excerpt',
+                    '.content-preview',
+                    'p'
+                ]
+                for sel in desc_selectors:
+                    candidate = item.select_one(sel)
+                    if candidate and candidate.get_text(strip=True):
+                        desc_elem = candidate
+                        break
+
+                description_html = str(desc_elem) if desc_elem else ''
+                description_text = desc_elem.get_text(strip=True) if desc_elem else ''
+
+                # Votes & comments
+                votes = first_text(item, [
+                    '.search-result-votes span',
+                    '.votes span',
+                    '[class*="vote"] span',
+                    '[class*="vote"]'
+                ]) or '0'
+                comments = first_text(item, [
+                    '.search-result-meta-count span',
+                    '.comments span',
+                    '[class*="comment"] span',
+                    '[class*="comment"]'
+                ]) or '0'
+
+                # Metadata / author / date
+                meta_block = item.select_one('.meta-data, .metadata, .post-meta')
+                author = 'Unknown'
+                date_val = 'Unknown'
+                if meta_block:
+                    meta_text = meta_block.get_text(' ', strip=True)
+                    # Split on common separators
+                    parts = [p.strip() for p in re.split(r'[·•|]', meta_text) if p.strip()]
+                    if len(parts) >= 2:
+                        author = parts[0] or author
+                        date_val = parts[1] or date_val
+
+                # Fallback selectors
+                if author == 'Unknown':
+                    author = first_text(item, ['.author', '.username', '[class*="author"]']) or 'Unknown'
+                if date_val == 'Unknown':
+                    # time element or date class
+                    time_elem = item.select_one('.date, time, [class*="date"]')
+                    if time_elem:
+                        date_val = time_elem.get('datetime') or time_elem.get('title') or time_elem.get_text(strip=True) or 'Unknown'
+
+                # Compose legacy fields
+                snippet = description_text
+                metadata = f"author={author} date={date_val} votes={votes} comments={comments}".strip()
+
+                results.append({
+                    'title': title,
+                    'link': link,
+                    'description_html': description_html or 'No description',
+                    'description_text': description_text or 'No description',
+                    'votes': votes,
+                    'comments': comments,
+                    'author': author,
+                    'date': date_val,
+                    'snippet': snippet,  # backward compatibility
+                    'metadata': metadata,  # backward compatibility / quick summary
+                    'page': page_num,
+                    'index': idx
+                })
+            except Exception as e:
+                log(f"Error extracting search result {idx}: {str(e)}", "WARNING")
+                continue
+
+        return results
+
+    async def _go_to_next_search_page(self, driver: webdriver.Chrome, soup: BeautifulSoup) -> bool:
+        """Navigate to the next search page."""
+        try:
+            # Look for next page link
+            next_link = soup.find('a', string=re.compile(r'next|下一页', re.IGNORECASE))
+            if not next_link:
+                next_link = soup.find('a', {'rel': 'next'})
+
+            if next_link and next_link.get('href'):
+                next_url = next_link['href']
+                if not next_url.startswith('http'):
+                    next_url = f"https://support.worldquantbrain.com{next_url}"
+
+                driver.get(next_url)
+                await asyncio.sleep(2)
+                return True
+
+            return False
+
+        except Exception as e:
+            log(f"Error navigating to next page: {str(e)}", "WARNING")
+            return False
+
+    def _analyze_search_results(self, results: List[Dict[str, Any]], search_query: str) -> Dict[str, Any]:
+        """Analyze search results for insights."""
+        if not results:
+            return {"message": "No results found"}
+
+        # Basic statistics
+        total_results = len(results)
+
+        # Categorize results by type
+        categories = {}
+        for result in results:
+            title = result.get('title', '').lower()
+            if 'tutorial' in title or 'guide' in title:
+                categories['tutorials'] = categories.get('tutorials', 0) + 1
+            elif 'api' in title or 'reference' in title:
+                categories['api_docs'] = categories.get('api_docs', 0) + 1
+            elif 'error' in title or 'issue' in title or 'problem' in title:
+                categories['troubleshooting'] = categories.get('troubleshooting', 0) + 1
+            elif 'competition' in title or 'event' in title:
+                categories['competitions'] = categories.get('competitions', 0) + 1
+            else:
+                categories['general'] = categories.get('general', 0) + 1
+
+        # Find most relevant results (containing search terms)
+        search_terms = search_query.lower().split()
+        relevant_results = []
+
+        for result in results:
+            title = result.get('title', '').lower()
+            snippet = result.get('snippet', '').lower()
+            text = f"{title} {snippet}"
+
+            term_matches = sum(1 for term in search_terms if term in text)
+            if term_matches > 0:
+                relevant_results.append({
+                    "result": result,
+                    "relevance_score": term_matches / len(search_terms)
+                })
+
+        # Sort by relevance
+        relevant_results.sort(key=lambda x: x['relevance_score'], reverse=True)
+
+        return {
+            "total_results": total_results,
+            "categories": categories,
+            "most_relevant": relevant_results[:5] if relevant_results else [],
+            "search_terms": search_terms
+        }
+
+# Initialize forum client
+forum_client = ForumClient()
+
+# MCP Tools for Forum Functions - REMOVED (duplicate with platform_functions.py)
+# These tools are already properly integrated in the main platform_functions.py
+
+if __name__ == "__main__":
+    print("📚 WorldQuant BRAIN Forum Functions Server Starting...", file=sys.stderr)
+    print("Note: Forum tools are now integrated in the main platform_functions.py", file=sys.stderr)
+    print("This file provides the ForumClient class for internal use.", file=sys.stderr)
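For orientation, a minimal usage sketch of the module-level forum_client exported above (not part of the packaged diff). The import path is inferred from the wheel's file layout and may differ in practice, and the credentials are placeholders; the function name, parameters, and result keys are taken from search_forum_posts in the code above.

# Hypothetical driver script; import path and credentials are assumptions.
import asyncio

from cnhkmcp.untracked.back_up.forum_functions import forum_client  # assumed module path

EMAIL = "user@example.com"   # placeholder BRAIN credential
PASSWORD = "change-me"       # placeholder BRAIN credential

async def main():
    # Logs in via a headless browser, paginates the search pages, and
    # returns scored results plus an analysis summary.
    result = await forum_client.search_forum_posts(
        EMAIL, PASSWORD, "alpha expression", max_results=10, headless=True
    )
    if "error" in result:
        print("Search failed:", result["error"])
        return
    print(f"Found {result['total_found']} results")
    for hit in result["results"]:
        print(f"- {hit['title']} ({hit['link']})")

if __name__ == "__main__":
    asyncio.run(main())

Note that each public method creates and quits its own WebDriver inside a finally block, so calls like the one above are independent and need no extra browser cleanup from the caller.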