cnhkmcp 2.1.4__py3-none-any.whl → 2.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnhkmcp/__init__.py +126 -126
- cnhkmcp/untracked/back_up/forum_functions.py +998 -0
- cnhkmcp/untracked/back_up/platform_functions.py +2886 -0
- cnhkmcp/untracked/brain-consultant.md +31 -0
- cnhkmcp/untracked/forum_functions.py +350 -941
- cnhkmcp/untracked/platform_functions.py +445 -730
- cnhkmcp/untracked/skills/Claude_Skill_Creation_Guide.md +140 -0
- cnhkmcp/untracked/skills/expression_verifier/SKILL.md +51 -0
- cnhkmcp/untracked/skills/expression_verifier/scripts/validator.py +889 -0
- cnhkmcp/untracked/skills/expression_verifier/scripts/verify_expr.py +52 -0
- cnhkmcp/untracked/skills/pull_BRAINSkill/SKILL.md +51 -0
- cnhkmcp/untracked/skills/pull_BRAINSkill/scripts/pull_skills.py +188 -0
- cnhkmcp/untracked//321/211/320/225/320/235/321/207/342/225/234/320/276/321/205/320/231/320/235/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/230/320/241_/321/205/320/276/320/231/321/210/320/263/320/225/321/205/342/224/220/320/225/321/210/320/266/320/221/321/204/342/225/233/320/255/321/210/342/225/241/320/246/321/205/320/234/320/225.py +3 -1
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/METADATA +1 -1
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/RECORD +19 -13
- cnhkmcp/untracked/APP/Tranformer/ace.log +0 -0
- cnhkmcp/untracked/APP/Tranformer/parsetab.py +0 -60
- cnhkmcp/untracked/APP/simulator/wqb20260107015647.log +0 -57
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/WHEEL +0 -0
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/entry_points.txt +0 -0
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/licenses/LICENSE +0 -0
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/top_level.txt +0 -0
cnhkmcp/untracked/forum_functions.py:

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 WorldQuant BRAIN Forum Functions - Python Version
-Comprehensive forum functionality including glossary, search, and post viewing.
+Comprehensive forum functionality including glossary, search, and post viewing using Playwright.
 """

 import asyncio
@@ -12,987 +12,396 @@ from datetime import datetime
 from typing import Dict, Any, List, Optional

 from bs4 import BeautifulSoup
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.edge.options import Options as EdgeOptions
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from playwright.async_api import async_playwright
 import requests
 import os
-import shutil
-
-# Initialize forum MCP server
-try:
-    from mcp.server.fastmcp import FastMCP
-    forum_mcp = FastMCP('brain_forum_server')
-except ImportError:
-    # Fallback for testing
-    forum_mcp = None

 def log(message: str, level: str = "INFO"):
     """Log message with timestamp."""
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     print(f"[{timestamp}] [{level}] {message}", file=sys.stderr)

+# --- Parsing Helper Functions (from playwright_forum_test.py) ---
+
+def _is_navigation_or_metadata(line: str) -> bool:
+    """Check if a line is navigation or metadata."""
+    navigation_patterns = [
+        r'^\d+ days? ago$',
+        r'~\d+ minute read',
+        r'^Follow',
+        r'^Not yet followed',
+        r'^Updated$',
+        r'^AS\d+$',
+        r'^[A-Z] - [A-Z] - [A-Z]',  # Letter navigation
+        r'^A$',
+        r'^B$',
+        r'^[A-Z]$'  # Single letters
+    ]
+    return any(re.match(pattern, line.strip()) for pattern in navigation_patterns)
+
+def _looks_like_term(line: str) -> bool:
+    """Check if a line looks like a glossary term."""
+    if len(line) > 100:
+        return False
+    if _is_navigation_or_metadata(line):
+        return False
+    definition_starters = ['the', 'a', 'an', 'this', 'that', 'it', 'is', 'are', 'was', 'were', 'for', 'to', 'in', 'on', 'at', 'by', 'with']
+    first_word = line.lower().split(' ')[0] if line else ''
+    if first_word and first_word in definition_starters:
+        return False
+    is_short = len(line) <= 80
+    starts_with_capital = bool(re.match(r'^[A-Z]', line))
+    has_all_caps = bool(re.match(r'^[A-Z\s\-\/\(\)]+$', line))
+    has_reasonable_length = len(line) >= 2
+    return is_short and has_reasonable_length and (starts_with_capital or has_all_caps)
+
+def _parse_glossary_terms(content: str) -> List[Dict[str, str]]:
+    """Parse glossary terms from HTML content."""
+    soup = BeautifulSoup(content, 'html.parser')
+    # Get text from the article body, which is more reliable than splitting the whole HTML
+    article_body = soup.select_one('.article-body')
+    if not article_body:
+        return []
+
+    # Use .get_text with a separator to preserve line breaks, which is key for the logic below
+    lines = article_body.get_text(separator='\n').split('\n')
+
+    terms = []
+    current_term = None
+    current_definition = []
+
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+
+        if _looks_like_term(line):
+            if current_term:
+                # Save the previous term
+                terms.append({
+                    "term": current_term,
+                    "definition": " ".join(current_definition).strip()
+                })
+            # Start a new term
+            current_term = line
+            current_definition = []
+        elif current_term:
+            # Add to the current definition
+            current_definition.append(line)
+
+    # Add the last term
+    if current_term:
+        terms.append({
+            "term": current_term,
+            "definition": " ".join(current_definition).strip()
+        })
+
+    # Filter out invalid terms and improve quality
+    return [term for term in terms if
+            len(term["term"]) > 0 and
+            len(term["definition"]) > 10 and
+            not _is_navigation_or_metadata(term["term"]) and
+            "ago" not in term["definition"] and
+            "minute read" not in term["definition"]]
+
 class ForumClient:
-    """Forum client for WorldQuant BRAIN support site."""
+    """Forum client for WorldQuant BRAIN support site, using Playwright."""

     def __init__(self):
         self.base_url = "https://support.worldquantbrain.com"
+        # The session is mainly used for the initial authentication via brain_client
         self.session = requests.Session()
         self.session.headers.update({
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36'
         })
+
+    async def _get_browser_context(self, p: async_playwright, email: str, password: str):
+        """Authenticate and return a browser context with the session."""
+        # Import brain_client here to avoid circular dependency
+        from platform_functions import brain_client
+
+        log("Authenticating with BRAIN platform...", "INFO")
+        auth_result = await brain_client.authenticate(email, password)
+        if auth_result.get('status') != 'authenticated':
+            raise Exception("BRAIN platform authentication failed.")
+        log("Successfully authenticated with BRAIN platform.", "SUCCESS")
+
+        browser = await p.chromium.launch(channel="chrome", headless=True, args=['--no-sandbox'])
+        context = await browser.new_context(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
+
+        log("Transferring authentication session to browser...", "INFO")
+        cookies = brain_client.session.cookies
+        playwright_cookies = []
+        for cookie in cookies:
+            cookie_dict = {
+                'name': cookie.name,
+                'value': cookie.value,
+                'domain': cookie.domain,
+                'path': cookie.path,
+                'secure': cookie.secure,
+                'httpOnly': 'HttpOnly' in cookie._rest,
+                'sameSite': 'Lax'
+            }
+            if cookie.expires:
+                cookie_dict['expires'] = cookie.expires
+            playwright_cookies.append(cookie_dict)

-
-        "
-        try:
-            import sys
-            import os
-            sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-            from platform_functions import brain_client
-            return brain_client.session
-        except ImportError:
-            return None
-
-    def detect_available_browser(self) -> str:
-        """Detect which browser WebDriver is available."""
-        try:
-            # Try Chrome first
-            from selenium.webdriver.chrome.service import Service
-            from selenium.webdriver.chrome.options import Options
-            try:
-                options = Options()
-                options.add_argument('--headless')
-                driver = webdriver.Chrome(options=options)
-                driver.quit()
-                return "chrome"
-            except Exception:
-                pass
-
-            # Try Edge
-            try:
-                from selenium.webdriver.edge.options import Options as EdgeOptions
-                options = EdgeOptions()
-                options.add_argument('--headless')
-                driver = webdriver.Edge(options=options)
-                driver.quit()
-                return "edge"
-            except Exception:
-                pass
-
-            # Default to chrome
-            return "chrome"
-        except Exception:
-            return "chrome"
-
-    def setup_browser_options(self, headless: bool, browser_type: str):
-        """Setup browser options based on browser type."""
-        if browser_type.lower() == "chrome":
-            return self.setup_chrome_options(headless)
-        elif browser_type.lower() == "edge":
-            return self.setup_edge_options(headless)
-        else:
-            return self.setup_chrome_options(headless)
-
-    def setup_edge_options(self, headless: bool = True) -> EdgeOptions:
-        """Setup Edge options for web scraping."""
-        options = EdgeOptions()
-
-        if headless:
-            options.add_argument('--headless')
-
-        # Performance optimizations
-        options.add_argument('--disable-blink-features=AutomationControlled')
-        options.add_argument('--log-level=3')
-        options.add_argument('--no-sandbox')
-        options.add_argument('--disable-dev-shm-usage')
-        options.add_argument('--disable-web-security')
-        options.add_argument('--disable-features=VizDisplayCompositor')
-        options.add_argument('--disable-gpu')
-        options.add_argument('--disable-extensions')
-        options.add_argument('--disable-images')
-        options.add_argument('--disable-javascript')
-        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
-
-        return options
-
-    def setup_chrome_options(self, headless: bool = True) -> Options:
-        """Setup Chrome options for web scraping."""
-        options = Options()
-
-        if headless:
-            options.add_argument('--headless')
-
-        # Performance optimizations
-        options.add_argument('--disable-blink-features=AutomationControlled')
-        options.add_argument('--log-level=3')
-        options.add_argument('--no-sandbox')
-        options.add_argument('--disable-dev-shm-usage')
-        options.add_argument('--disable-web-security')
-        options.add_argument('--disable-features=VizDisplayCompositor')
-        options.add_argument('--disable-gpu')
-        options.add_argument('--disable-extensions')
-        options.add_argument('--disable-images')
-        options.add_argument('--disable-javascript')
-        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
-
-        return options
-
-    async def create_driver(self, headless: bool = True):
-        """Create and configure WebDriver with cross-browser support."""
-        browser_type = self.detect_available_browser()
-        log(f"Using browser: {browser_type}", "INFO")
-
-        options = self.setup_browser_options(headless, browser_type)
+        await context.add_cookies(playwright_cookies)
+        log("Session transferred.", "SUCCESS")

-
-
-
-
-
-
-            # Fallback to Chrome
-            log("Falling back to Chrome", "WARNING")
-            driver = webdriver.Chrome(options=options)
-
-            # Set aggressive timeouts for speed
-            driver.set_page_load_timeout(30)
-            driver.implicitly_wait(10)
-
-            return driver
-
-        except Exception as e:
-            log(f"Failed to create {browser_type} driver: {str(e)}", "ERROR")
-            help_text = self.get_driver_installation_help(browser_type)
-            log(help_text, "ERROR")
-
-            # Try Chrome as fallback if Edge failed
-            if browser_type.lower() != "chrome":
-                try:
-                    log("Trying Chrome as fallback", "INFO")
-                    chrome_options = self.setup_browser_options(headless, "chrome")
-                    driver = webdriver.Chrome(options=chrome_options)
-                    driver.set_page_load_timeout(30)
-                    driver.implicitly_wait(10)
-                    return driver
-                except Exception as e2:
-                    log(f"Chrome fallback also failed: {str(e2)}", "ERROR")
-                    chrome_help = self.get_driver_installation_help("chrome")
-                    log(chrome_help, "ERROR")
-
-            raise Exception(f"Could not create any browser driver. {help_text}")
-
-    async def login_to_forum(self, driver, email: str, password: str) -> bool:
-        """Login to the WorldQuant BRAIN forum using existing authentication."""
-        try:
-            # Import BrainApiClient from platform_functions
-            import sys
-            import os
-            sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
+        return browser, context
+
+    async def get_glossary_terms(self, email: str, password: str) -> List[Dict[str, str]]:
+        """Extract glossary terms from the forum using Playwright."""
+        async with async_playwright() as p:
+            browser = None
             try:
-
-
-
-            # First authenticate with BrainApiClient
-            auth_result = await brain_client.authenticate(email, password)
-            if auth_result.get('status') != 'authenticated':
-                log("BrainApiClient authentication failed", "ERROR")
-                return False
-
-            log("Successfully authenticated via BrainApiClient", "SUCCESS")
-
-            # Navigate to forum with authenticated session
-            log("Navigating to forum with authenticated session", "WORK")
-            driver.get("https://support.worldquantbrain.com/hc/en-us")
-            await asyncio.sleep(2)
+                log("Starting glossary extraction process with Playwright", "INFO")
+                browser, context = await self._get_browser_context(p, email, password)

-
-
-            for
-            driver.add_cookie({
-                'name': cookie.name,
-                'value': cookie.value,
-                'domain': '.worldquantbrain.com'
-            })
+                page = await context.new_page()
+                log("Navigating to BRAIN support forum glossary...", "INFO")
+                await page.goto("https://support.worldquantbrain.com/hc/en-us/articles/4902349883927-Click-here-for-a-list-of-terms-and-their-definitions")

-
-
-            await asyncio.sleep(2)
+                log("Extracting glossary content...", "INFO")
+                content = await page.content()

-
+                terms = _parse_glossary_terms(content)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-            login_button.click()
-            await asyncio.sleep(3)
-
-            return True
-
-        except Exception as e:
-            log(f"Login failed: {str(e)}", "ERROR")
-            return False
-
-    async def get_glossary_terms(self, email: str, password: str, headless: bool = False) -> Dict[str, Any]:
-        """Extract glossary terms from the forum."""
-        driver = None
-        try:
-            log("Starting glossary extraction process", "INFO")
-
-            # Add timeout protection
-            async def extraction_with_timeout():
-                return await self._perform_glossary_extraction(email, password, headless)
-
-            # Run with 5-minute timeout
-            result = await asyncio.wait_for(extraction_with_timeout(), timeout=300)
-            return result
-
-        except asyncio.TimeoutError:
-            log("Glossary extraction timed out after 5 minutes", "ERROR")
-            return {"error": "Glossary extraction timed out after 5 minutes"}
-        except Exception as e:
-            log(f"Glossary extraction failed: {str(e)}", "ERROR")
-            return {"error": str(e)}
-        finally:
-            if driver:
-                try:
-                    driver.quit()
-                except:
-                    pass
-
-    async def _perform_glossary_extraction(self, email: str, password: str, headless: bool) -> Dict[str, Any]:
-        """Perform the actual glossary extraction."""
-        driver = None
-        try:
-            driver = await self.create_driver(headless)
-
-            # Login
-            if not await self.login_to_forum(driver, email, password):
-                raise Exception("Failed to login to forum")
-
-            # Navigate to glossary page
-            log("Navigating to glossary page", "WORK")
-            driver.get("https://support.worldquantbrain.com/hc/en-us/articles/4902349883927-Click-here-for-a-list-of-terms-and-their-definitions")
-            await asyncio.sleep(5)
-
-            # Extract content
-            log("Extracting glossary content", "WORK")
-            page_source = driver.page_source
-            soup = BeautifulSoup(page_source, 'html.parser')
-
-            # Parse glossary terms
-            terms = self._parse_glossary_terms(page_source)
-
-            log(f"Extracted {len(terms)} glossary terms", "SUCCESS")
-            return {
-                "terms": terms,
-                "total_count": len(terms),
-                "extraction_timestamp": datetime.now().isoformat()
-            }
-
-        finally:
-            if driver:
-                try:
-                    driver.quit()
-                except:
-                    pass
-
-    def _parse_glossary_terms(self, content: str) -> List[Dict[str, str]]:
-        """Parse glossary terms from HTML content."""
-        terms = []
-        lines = content.split('\n')
-
-        current_term = None
-        current_definition = []
-        is_collecting_definition = False
-        found_first_real_term = False
-
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-
-            # Skip navigation and metadata lines at the beginning
-            if not found_first_real_term and self._is_navigation_or_metadata(line):
-                continue
-
-            # Check if this line looks like a term
-            if self._looks_like_term(line) and not is_collecting_definition:
-                # Mark that we found the first real term
-                if not found_first_real_term:
-                    found_first_real_term = True
+                log(f"Extracted {len(terms)} glossary terms", "SUCCESS")
+                return terms
+
+            except Exception as e:
+                log(f"Glossary extraction failed: {str(e)}", "ERROR")
+                # Re-raise to be handled by the MCP server wrapper
+                raise
+            finally:
+                if browser:
+                    await browser.close()
+                    log("Browser closed.", "INFO")
+
+    async def search_forum_posts(self, email: str, password: str, search_query: str, max_results: int = 50, locale: str = "zh-cn") -> Dict[str, Any]:
+        """Search for posts on the forum using Playwright, with pagination."""
+        async with async_playwright() as p:
+            browser = None
+            try:
+                log(f"Starting forum search for '{search_query}'", "INFO")
+                browser, context = await self._get_browser_context(p, email, password)
+
+                page = await context.new_page()

-
-
-                    terms.append({
-                        "term": current_term.strip(),
-                        "definition": " ".join(current_definition).strip()
-                    })
+                search_results = []
+                page_num = 1

-
-
-
-            elif is_collecting_definition and found_first_real_term:
-                # Check if this is the start of a new term
-                if self._looks_like_term(line):
-                    # Save current term
-                    if current_term and current_definition:
-                        terms.append({
-                            "term": current_term.strip(),
-                            "definition": " ".join(current_definition).strip()
-                        })
+                while len(search_results) < max_results:
+                    search_url = f"{self.base_url}/hc/{locale}/search?page={page_num}&query={search_query}#results"
+                    log(f"Navigating to search page: {search_url}", "INFO")

-
-
-
-
-
-
-
-
-
-
-
-
-                            "term": current_term.strip(),
-                            "definition": " ".join(current_definition).strip()
-                        })
-
-        # Filter out invalid terms and improve quality
-        return [term for term in terms if
-                len(term["term"]) > 0 and
-                len(term["definition"]) > 10 and  # Ensure meaningful definitions
-                not self._is_navigation_or_metadata(term["term"]) and
-                "ago" not in term["definition"] and  # Remove timestamp-like definitions
-                "minute read" not in term["definition"]]  # Remove reading time
-
-    def _looks_like_term(self, line: str) -> bool:
-        """Check if a line looks like a glossary term."""
-        # Skip very long lines (likely definitions)
-        if len(line) > 100:
-            return False
-
-        # Skip navigation and metadata
-        if self._is_navigation_or_metadata(line):
-            return False
-
-        # Skip lines that start with common definition words
-        definition_starters = ['the', 'a', 'an', 'this', 'that', 'it', 'is', 'are', 'was', 'were', 'for', 'to', 'in', 'on', 'at', 'by', 'with']
-        first_word = line.lower().split(' ')[0]
-        if first_word and first_word in definition_starters:
-            return False
-
-        # Check if line has characteristics of a term
-        # Terms are often short, may be all caps, or start with capital
-        is_short = len(line) <= 80
-        starts_with_capital = bool(re.match(r'^[A-Z]', line))
-        has_all_caps = bool(re.match(r'^[A-Z\s\-\/\(\)]+$', line))
-        has_reasonable_length = len(line) >= 2
-
-        return is_short and has_reasonable_length and (starts_with_capital or has_all_caps)
-
-    def _is_navigation_or_metadata(self, line: str) -> bool:
-        """Check if a line is navigation or metadata."""
-        navigation_patterns = [
-            r'^\d+ days? ago$',
-            r'~\d+ minute read',
-            r'^Follow',
-            r'^Not yet followed',
-            r'^Updated$',
-            r'^AS\d+$',
-            r'^[A-Z] - [A-Z] - [A-Z]',  # Letter navigation
-            r'^A$',
-            r'^B$',
-            r'^[A-Z]$'  # Single letters
-        ]
-
-        return any(re.match(pattern, line.strip()) for pattern in navigation_patterns)
-
-    def get_driver_installation_help(self, browser_type: str) -> str:
-        """Provide helpful instructions for installing WebDriver."""
-        if browser_type.lower() == "chrome":
-            return """
-Chrome WebDriver not found. Please install ChromeDriver:
-1. Download from: https://chromedriver.chromium.org/downloads
-2. Make sure version matches your Chrome browser
-3. Add to PATH or place in current directory
-4. Alternative: Install via pip: pip install chromedriver-autoinstaller
-"""
-        elif browser_type.lower() == "edge":
-            return """
-Edge WebDriver not found. Please install Edge WebDriver:
-1. Download from: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
-2. Make sure version matches your Edge browser
-3. Add to PATH or place in current directory
-4. Alternative: Install via pip: pip install msedge-selenium-tools
-"""
-        else:
-            return "Please install either ChromeDriver or Edge WebDriver for browser automation."
-
-    async def read_full_forum_post(self, email: str, password: str, post_url_or_id: str,
-                                   headless: bool = False, include_comments: bool = True) -> Dict[str, Any]:
-        """Read a complete forum post with optional comments."""
-        driver = None
-        try:
-            log("Starting forum post reading process", "INFO")
-
-            # Determine if input is URL or article ID
-            is_url = post_url_or_id.startswith('http')
-            if is_url:
-                post_url = post_url_or_id
-            else:
-                post_url = f"https://support.worldquantbrain.com/hc/zh-cn/community/posts/{post_url_or_id}"
-
-            log(f"Target URL: {post_url}", "INFO")
-            log(f"Include comments: {include_comments}", "INFO")
-
-            driver = await self.create_driver(headless)
-
-            # Login
-            if not await self.login_to_forum(driver, email, password):
-                raise Exception("Failed to login to forum")
-
-            # Navigate directly to post URL
-            log(f"Opening post: {post_url}", "WORK")
-            driver.get(post_url)
-            log("Post page loaded, extracting content immediately", "WORK")
-
-            # Wait minimal time for content to appear
-            await asyncio.sleep(2)
-
-            # Extract post content quickly
-            post_data = {}
-            page_source = driver.page_source
-            soup = BeautifulSoup(page_source, 'html.parser')
-
-            # Extract post title
-            title = soup.select_one('.post-title, h1, .article-title')
-            if not title:
-                title = soup.select_one('title')
-            post_data['title'] = title.get_text().strip() if title else 'Unknown Title'
-
-            # Extract post author
-            author = soup.select_one('.post-author, .author, .article-author')
-            if not author:
-                author = soup.select_one('.comment-author')
-            post_data['author'] = author.get_text().strip() if author else 'Unknown Author'
-
-            # Extract post date
-            date = soup.select_one('.post-date, .date, .article-date, time')
-            if not date:
-                time_element = soup.select_one('time')
-                if time_element:
-                    date = time_element.get('datetime') or time_element.get('title') or time_element.get_text().strip()
-                else:
-                    date = 'Unknown Date'
-            else:
-                date = date.get_text().strip()
-            post_data['date'] = date if date else 'Unknown Date'
-
-            # Extract post content
-            post_content = soup.select_one('.post-body, .article-body, .content, .post-content')
-            if not post_content:
-                post_content = soup.select_one('article, main')
-
-            if post_content:
-                post_data['content_html'] = str(post_content)
-                post_data['content_text'] = post_content.get_text().strip()
-            else:
-                post_data['content_html'] = 'No content found'
-                post_data['content_text'] = 'No content found'
-
-            post_data['url'] = post_url
-            post_data['current_url'] = driver.current_url
-
-            log(f"Post content extracted: \"{post_data['title']}\"", "SUCCESS")
-
-            comments = []
-            total_comments = 0
-
-            # Extract comments conditionally
-            if include_comments:
-                log("Extracting comments...", "WORK")
-                comments = await self._extract_forum_comments_full(driver, soup)
-                total_comments = len(comments)
-                log(f"Extracted {total_comments} comments", "SUCCESS")
-            else:
-                log("Skipping comment extraction (includeComments=false)", "INFO")
-
-            return {
-                "success": True,
-                "post": post_data,
-                "comments": comments,
-                "total_comments": total_comments,
-                "extracted_at": datetime.now().isoformat(),
-                "processing_time": "full_extraction_with_comments" if include_comments else "post_only_extraction",
-                "include_comments": include_comments
-            }
-
-        except Exception as e:
-            log(f"Failed to read forum post: {str(e)}", "ERROR")
-            return {"error": str(e)}
-        finally:
-            if driver:
-                try:
-                    driver.quit()
-                except:
-                    pass
-
-    async def _extract_forum_comments_full(self, driver, soup: BeautifulSoup) -> List[Dict[str, Any]]:
-        """Extract all comments from forum post with pagination support."""
-        all_comments = []
-        page_num = 1
-
-        try:
-            # First extract comments from current page source
-            page_comments = self._parse_comments_from_html(soup)
-            all_comments.extend(page_comments)
-            log(f"Found {len(page_comments)} comments on page {page_num}", "INFO")
-
-            # Check for pagination and continue if needed
-            while True:
-                try:
-                    # Look for next page button
-                    next_button = driver.find_element(By.CSS_SELECTOR, "span.pagination-next-text, .pagination-next, .next")
-                    next_text = next_button.text
+                    try:
+                        response = await page.goto(search_url)
+                        if response.status == 404:
+                            log(f"Page {page_num} not found. End of results.", "INFO")
+                            break
+                        await page.wait_for_selector('ul.search-results-list', timeout=15000)
+                    except Exception as e:
+                        log(f"Could not load search results on page {page_num}: {e}", "INFO")
+                        break
+
+                    content = await page.content()
+                    soup = BeautifulSoup(content, 'html.parser')

-
-
-
-
+                    results_on_page = soup.select('li.search-result-list-item')
+                    if not results_on_page:
+                        log("No more search results found.", "INFO")
+                        break
+
+                    for result in results_on_page:
+                        title_element = result.select_one('h2.search-result-title a')
+                        snippet_element = result.select_one('.search-results-description')

-
-
-
-
+                        if title_element:
+                            title = title_element.get_text(strip=True)
+                            link = title_element.get('href')
+
+                            votes_element = result.select_one('.search-result-votes span[aria-hidden="true"]')
+                            votes_text = votes_element.get_text(strip=True) if votes_element else '0'
+                            votes_match = re.search(r'\d+', votes_text)
+                            votes = int(votes_match.group()) if votes_match else 0
+
+                            comments_element = result.select_one('.search-result-meta-count span[aria-hidden="true"]')
+                            comments_text = comments_element.get_text(strip=True) if comments_element else '0'
+                            comments_match = re.search(r'\d+', comments_text)
+                            comments = int(comments_match.group()) if comments_match else 0
+
+                            breadcrumbs_elements = result.select('ol.search-result-breadcrumbs li')
+                            breadcrumbs = [bc.get_text(strip=True) for bc in breadcrumbs_elements]
+
+                            meta_group = result.select_one('ul.meta-group')
+                            author = 'Unknown'
+                            post_date = 'Unknown'
+                            if meta_group:
+                                meta_data_elements = meta_group.select('li.meta-data')
+                                if len(meta_data_elements) > 0:
+                                    author = meta_data_elements[0].get_text(strip=True)
+                                if len(meta_data_elements) > 1:
+                                    time_element = meta_data_elements[1].select_one('time')
+                                    if time_element:
+                                        post_date = time_element.get('datetime', time_element.get_text(strip=True))
+
+                            snippet = snippet_element.get_text(strip=True) if snippet_element else ''
+
+                            full_link = ''
+                            if link:
+                                if link.startswith('http'):
+                                    full_link = link
+                                else:
+                                    full_link = f"{self.base_url}{link}"
+
+                            search_results.append({
+                                'title': title,
+                                'link': full_link,
+                                'snippet': snippet,
+                                'votes': votes,
+                                'comments': comments,
+                                'author': author,
+                                'date': post_date,
+                                'breadcrumbs': breadcrumbs
+                            })

-                    if len(
+                            if len(search_results) >= max_results:
                                 break
-
-
-                    page_num += 1
-                    log(f"Found {len(new_page_comments)} comments on page {page_num}", "INFO")
-                else:
+
+                    if len(search_results) >= max_results:
                         break
-                except Exception as e:
-                    log("No more pages found", "INFO")
-                    break
-
-            return all_comments
-
-        except Exception as e:
-            log(f"Error in comment extraction: {str(e)}", "WARNING")
-            return all_comments

-
-
-
-
-            # Try multiple selectors for comments
-            comment_selectors = [
-                'ul#comments.comment-list li.comment',
-                '.comment-list .comment',
-                '.comments .comment',
-                'li.comment',
-                '.comment-item'
-            ]
-
-            comment_elements = None
-
-            for selector in comment_selectors:
-                comment_elements = soup.select(selector)
-                if comment_elements:
-                    log(f"Found comments using selector: {selector}", "INFO")
-                    break
-
-            if not comment_elements:
-                log("No comments found on this page", "INFO")
-                return comments
-
-            for index, element in enumerate(comment_elements):
-                try:
-                    comment = {}
-
-                    # Extract comment ID
-                    comment['id'] = element.get('id') or f"comment-{index}"
-
-                    # Extract author
-                    author_element = element.select_one('.comment-author a, .author a, .comment-author')
-                    comment['author'] = author_element.get_text().strip() if author_element else 'Unknown Author'
-                    comment['author_link'] = author_element.get('href') if author_element else ''
-
-                    # Extract date
-                    time_element = element.select_one('.meta-data time, time, .date, .comment-date')
-                    if time_element:
-                        comment['date'] = time_element.get('datetime') or time_element.get('title') or time_element.get_text().strip()
-                        comment['date_display'] = time_element.get('title') or time_element.get_text().strip()
-                    else:
-                        comment['date'] = 'Unknown Date'
-                        comment['date_display'] = 'Unknown Date'
-
-                    # Extract content
-                    content_element = element.select_one('.comment-body, .comment-content, .content')
-                    if content_element:
-                        comment['content_html'] = str(content_element)
-                        comment['content_text'] = content_element.get_text().strip()
-                    else:
-                        comment['content_html'] = ''
-                        comment['content_text'] = ''
-
-                    # Extract votes
-                    vote_element = element.select_one('.vote-up span, .votes, .vote-count')
-                    comment['votes'] = vote_element.get_text().strip() if vote_element else '0'
-
-                    # Extract status
-                    status_element = element.select_one('.status-label, .status, .badge')
-                    comment['status'] = status_element.get_text().strip() if status_element else '普通评论'
-
-                    if comment['content_text']:
-                        comments.append(comment)
+                    page_num += 1
+
+                log(f"Found {len(search_results)} results for '{search_query}'", "SUCCESS")

+                return {
+                    "success": True,
+                    "results": search_results,
+                    "total_found": len(search_results)
+                }
+
             except Exception as e:
-                log(f"
-
-
-
-
-
-
-
-
-
-            log(f"Search query: '{search_query}'", "INFO")
-            log(f"Max results: {max_results}", "INFO")
-
-            driver = await self.create_driver(headless)
-
-            # Login
-            if not await self.login_to_forum(driver, email, password):
-                raise Exception("Failed to login to forum")
-
-            # Navigate to search
-            encoded_query = requests.utils.quote(search_query)
-            search_url = f"https://support.worldquantbrain.com/hc/zh-cn/search?utf8=%E2%9C%93&query={encoded_query}"
-            log(f"Opening search URL: {search_url}", "WORK")
-
-            driver.get(search_url)
-            await asyncio.sleep(2)
-
-            # Collect results with pagination
-            all_results = []
-            page_num = 1
-
-            log("Starting result collection with pagination", "WORK")
-
-            while len(all_results) < max_results:
-                log(f"Processing page {page_num}", "INFO")
-
-                # Wait for search results
-                try:
-                    WebDriverWait(driver, 10).until(
-                        EC.presence_of_element_located((By.CSS_SELECTOR, '.search-results-list, .search-result-list-item'))
-                    )
-                except TimeoutException:
-                    log(f"No search results found on page {page_num}", "WARNING")
-                    break
-
-                # Extract results from current page
-                page_source = driver.page_source
-                soup = BeautifulSoup(page_source, 'html.parser')
-                page_results = self._extract_search_results(soup, page_num)
-
-                if not page_results:
-                    log(f"No more results found on page {page_num}", "INFO")
-                    break
-
-                all_results.extend(page_results)
-
-                # Check if we have enough results
-                if len(all_results) >= max_results:
-                    all_results = all_results[:max_results]
-                    break
-
-                # Try to go to next page
-                if not await self._go_to_next_search_page(driver, soup):
-                    log("No more pages available", "INFO")
-                    break
-
-                page_num += 1
-                await asyncio.sleep(1)
-
-            # Analyze results
-            analysis = self._analyze_search_results(all_results, search_query)
-
-            log(f"Search completed. Found {len(all_results)} results", "SUCCESS")
-            return {
-                "results": all_results,
-                "total_found": len(all_results),
-                "search_query": search_query,
-                "analysis": analysis,
-                "search_timestamp": datetime.now().isoformat()
-            }
-
-        except Exception as e:
-            log(f"Search failed: {str(e)}", "ERROR")
-            return {"error": str(e)}
-        finally:
-            if driver:
-                try:
-                    driver.quit()
-                except:
-                    pass
-
-    def _extract_search_results(self, soup: BeautifulSoup, page_num: int) -> List[Dict[str, Any]]:
-        """Extract search results from a page using multiple resilient selectors.
-
-        Improvements vs original implementation:
-        - Tries several container selectors (mirrors TS Cheerio approach)
-        - Extracts richer metadata: description_html/text, votes, comments, author, date
-        - Preserves legacy fields (snippet, metadata) for backward compatibility
-        - Adds index & page for downstream analytics
-        - Robust fallbacks & normalization of URLs
-        """
-        results: List[Dict[str, Any]] = []
-
-        # Ordered list of possible container selectors (keep broad ones last)
-        container_selectors = [
-            '.search-result-list-item',
-            '.search-results-list .search-result',
-            '.striped-list-item',
-            '.article-list-item',
-            'article.search-result',
-            'div.search-result',
-        ]
-
-        # Collect candidate elements (stop at first selector that yields results)
-        result_items = []
-        for selector in container_selectors:
-            found = soup.select(selector)
-            if found:
-                log(f"Found {len(found)} search results using selector: {selector}", "INFO")
-                result_items = found
-                break
-
-        # Fallback: regex class scan (original heuristic)
-        if not result_items:
-            fallback = soup.find_all(['article', 'div'], class_=re.compile(r'search-result|article-item'))
-            if fallback:
-                log(f"Fallback selector captured {len(fallback)} results", "INFO")
-                result_items = fallback
-            else:
-                log("No search result items found with any selector", "WARNING")
-                return results
-
-        def first_text(element, selector_list: List[str]) -> str:
-            for sel in selector_list:
-                found = element.select_one(sel)
-                if found and found.get_text(strip=True):
-                    return found.get_text(strip=True)
-            return ''
-
-        for idx, item in enumerate(result_items):
+                log(f"Forum search failed: {str(e)}", "ERROR")
+                raise
+            finally:
+                if browser:
+                    await browser.close()
+
+    async def read_full_forum_post(self, email: str, password: str, post_url_or_id: str, include_comments: bool = True) -> Dict[str, Any]:
+        """Read a complete forum post and all its comments using Playwright."""
+        async with async_playwright() as p:
+            browser = None
             try:
-
-                title_link_elem = None
-                title_selectors = [
-                    '.search-result-title a',
-                    'h3 a',
-                    '.title a',
-                    'a'
-                ]
-                for sel in title_selectors:
-                    candidate = item.select_one(sel)
-                    if candidate and candidate.get_text(strip=True):
-                        title_link_elem = candidate
-                        break
+                log("Starting forum post reading process with Playwright", "INFO")

-
-
-
-
-
-                if not link and not title:
-                    continue  # Skip invalid entries
-
-                # Description / snippet
-                desc_elem = None
-                desc_selectors = [
-                    '.search-results-description',
-                    '.description',
-                    '.excerpt',
-                    '.content-preview',
-                    'p'
-                ]
-                for sel in desc_selectors:
-                    candidate = item.select_one(sel)
-                    if candidate and candidate.get_text(strip=True):
-                        desc_elem = candidate
-                        break
+                if post_url_or_id.startswith('http'):
+                    initial_url = post_url_or_id
+                else:
+                    initial_url = f"https://support.worldquantbrain.com/hc/zh-cn/community/posts/{post_url_or_id}"

-
-
-
-                # Votes & comments
-                votes = first_text(item, [
-                    '.search-result-votes span',
-                    '.votes span',
-                    '[class*="vote"] span',
-                    '[class*="vote"]'
-                ]) or '0'
-                comments = first_text(item, [
-                    '.search-result-meta-count span',
-                    '.comments span',
-                    '[class*="comment"] span',
-                    '[class*="comment"]'
-                ]) or '0'
-
-                # Metadata / author / date
-                meta_block = item.select_one('.meta-data, .metadata, .post-meta')
-                author = 'Unknown'
-                date_val = 'Unknown'
-                if meta_block:
-                    meta_text = meta_block.get_text(' ', strip=True)
-                    # Split on common separators
-                    parts = [p.strip() for p in re.split(r'[·•|]', meta_text) if p.strip()]
-                    if len(parts) >= 2:
-                        author = parts[0] or author
-                        date_val = parts[1] or date_val
-
-                # Fallback selectors
-                if author == 'Unknown':
-                    author = first_text(item, ['.author', '.username', '[class*="author"]']) or 'Unknown'
-                if date_val == 'Unknown':
-                    # time element or date class
-                    time_elem = item.select_one('.date, time, [class*="date"]')
-                    if time_elem:
-                        date_val = time_elem.get('datetime') or time_elem.get('title') or time_elem.get_text(strip=True) or 'Unknown'
-
-                # Compose legacy fields
-                snippet = description_text
-                metadata = f"author={author} date={date_val} votes={votes} comments={comments}".strip()
-
-                results.append({
-                    'title': title,
-                    'link': link,
-                    'description_html': description_html or 'No description',
-                    'description_text': description_text or 'No description',
-                    'votes': votes,
-                    'comments': comments,
-                    'author': author,
-                    'date': date_val,
-                    'snippet': snippet,  # backward compatibility
-                    'metadata': metadata,  # backward compatibility / quick summary
-                    'page': page_num,
-                    'index': idx
-                })
-            except Exception as e:
-                log(f"Error extracting search result {idx}: {str(e)}", "WARNING")
-                continue
+                browser, context = await self._get_browser_context(p, email, password)
+                page = await context.new_page()

-
-
-
-
-        try:
-            # Look for next page link
-            next_link = soup.find('a', string=re.compile(r'next|下一页', re.IGNORECASE))
-            if not next_link:
-                next_link = soup.find('a', {'rel': 'next'})
-
-            if next_link and next_link.get('href'):
-                next_url = next_link['href']
-                if not next_url.startswith('http'):
-                    next_url = f"https://support.worldquantbrain.com{next_url}"
+                # --- Get Main Post Content and Final URL ---
+                log(f"Navigating to initial URL: {initial_url}", "INFO")
+                await page.goto(initial_url)
+                await page.wait_for_selector('.post-body, .article-body', timeout=15000)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Get the final URL after any redirects
+                base_url = re.sub(r'(\?|&)page=\d+', '', page.url).split('#')[0]
+                log(f"Resolved to Base URL: {base_url}", "INFO")
+                await page.wait_for_selector('.post-body, .article-body', timeout=15000)
+                content = await page.content()
+                soup = BeautifulSoup(content, 'html.parser')
+
+                post_data = {}
+                title_element = soup.select_one('.post-title, h1.article-title, .article__title')
+                post_data['title'] = title_element.get_text(strip=True) if title_element else 'Unknown Title'
+
+                author_span = soup.select_one('.post-author span[title]')
+                post_data['author'] = author_span['title'] if author_span else 'Unknown Author'
+
+                body_element = soup.select_one('.post-body, .article-body')
+                post_data['body'] = body_element.get_text(strip=True) if body_element else 'Body not found'
+
+                votes_element = soup.select_one('.vote-sum')
+                date_element = soup.select_one('.post-meta .meta-data')
+                post_data['details'] = {
+                    'votes': votes_element.get_text(strip=True) if votes_element else '0',
+                    'date': date_element.get_text(strip=True) if date_element else 'Unknown Date'
+                }
+
+                # --- Get Comments with Pagination ---
+                comments = []
+                if include_comments:
+                    log("Starting comment extraction...", "INFO")
+                    page_num = 1
+                    while True:
+                        comment_url = f"{base_url}?page={page_num}#comments"
+                        log(f"Navigating to comment page: {comment_url}", "INFO")
+
+                        try:
+                            response = await page.goto(comment_url)
+                            if response.status == 404:
+                                log(f"Page {page_num} returned 404. End of comments.", "INFO")
+                                break
+                            await page.wait_for_selector('.comment-list', timeout=10000)
+                        except Exception as e:
+                            log(f"Could not load page {page_num}: {e}. Assuming end of comments.", "INFO")
+                            break
+
+                        comment_soup = BeautifulSoup(await page.content(), 'html.parser')
+                        comment_elements = comment_soup.select('.comment')
+
+                        if not comment_elements:
+                            log(f"No comments found on page {page_num}. Ending extraction.", "INFO")
+                            break
+
+                        log(f"Found {len(comment_elements)} comments on page {page_num}.", "INFO")
+
+                        new_comments_found_on_page = 0
+                        for comment_element in comment_elements:
+                            author_span = comment_element.select_one('.comment-author span[title]')
+                            author_id = author_span['title'] if author_span else 'Unknown'
+
+                            body_element = comment_element.select_one('.comment-body')
+                            date_element = comment_element.select_one('.comment-meta .meta-data')
+
+                            comment_data = {
+                                'author': author_id,
+                                'body': body_element.get_text(strip=True) if body_element else '',
+                                'date': date_element.get_text(strip=True) if date_element else 'Unknown Date'
+                            }
+
+                            if comment_data not in comments:
+                                comments.append(comment_data)
+                                new_comments_found_on_page += 1
+
+                        if new_comments_found_on_page == 0 and page_num > 1:
+                            log(f"No new comments detected on page {page_num}. Ending extraction.", "INFO")
+                            break
+
+                        page_num += 1
+
+                log(f"Extracted {len(comments)} comments in total.", "SUCCESS")
+                return {
+                    "success": True, "post": post_data, "comments": comments, "total_comments": len(comments)
+                }
+
+            except Exception as e:
+                log(f"Failed to read forum post: {str(e)}", "ERROR")
+                raise
+            finally:
+                if browser:
+                    await browser.close()

 # Initialize forum client
 forum_client = ForumClient()

-#
-# These tools are already properly integrated in the main platform_functions.py
-
+# The main block is for testing and won't be run by the MCP server.
 if __name__ == "__main__":
-    print("📚 WorldQuant BRAIN Forum Functions
-    print("Note: Forum tools are now integrated in the main platform_functions.py", file=sys.stderr)
-    print("This file provides the ForumClient class for internal use.", file=sys.stderr)
+    print("📚 WorldQuant BRAIN Forum Functions - This script provides the ForumClient class.", file=sys.stderr)
|