cnhkmcp 2.1.4-py3-none-any.whl → 2.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. cnhkmcp/__init__.py +126 -126
  2. cnhkmcp/untracked/back_up/forum_functions.py +998 -0
  3. cnhkmcp/untracked/back_up/platform_functions.py +2886 -0
  4. cnhkmcp/untracked/brain-consultant.md +31 -0
  5. cnhkmcp/untracked/forum_functions.py +350 -941
  6. cnhkmcp/untracked/platform_functions.py +445 -730
  7. cnhkmcp/untracked/skills/Claude_Skill_Creation_Guide.md +140 -0
  8. cnhkmcp/untracked/skills/expression_verifier/SKILL.md +51 -0
  9. cnhkmcp/untracked/skills/expression_verifier/scripts/validator.py +889 -0
  10. cnhkmcp/untracked/skills/expression_verifier/scripts/verify_expr.py +52 -0
  11. cnhkmcp/untracked/skills/pull_BRAINSkill/SKILL.md +51 -0
  12. cnhkmcp/untracked/skills/pull_BRAINSkill/scripts/pull_skills.py +188 -0
  13. cnhkmcp/untracked/配置前运行我_安装必要依赖包.py +3 -1
  14. {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/METADATA +1 -1
  15. {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/RECORD +19 -13
  16. cnhkmcp/untracked/APP/Tranformer/ace.log +0 -0
  17. cnhkmcp/untracked/APP/Tranformer/parsetab.py +0 -60
  18. cnhkmcp/untracked/APP/simulator/wqb20260107015647.log +0 -57
  19. {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/WHEEL +0 -0
  20. {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/entry_points.txt +0 -0
  21. {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/licenses/LICENSE +0 -0
  22. {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/top_level.txt +0 -0
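The bulk of this release is the forum_functions.py rewrite shown below: the Selenium WebDriver scraping stack is replaced by Playwright, and the existing requests-based BRAIN login is reused by copying its cookie jar into the browser context. A minimal standalone sketch of that cookie hand-off, not part of the package itself (the authentication step is elided; `session` stands in for the module's `brain_client.session`):

import asyncio

import requests
from playwright.async_api import async_playwright

def to_playwright_cookies(session):
    """Convert a requests CookieJar into Playwright's add_cookies() format."""
    converted = []
    for c in session.cookies:
        cookie = {
            'name': c.name,
            'value': c.value,
            'domain': c.domain,
            'path': c.path,
            'secure': bool(c.secure),
            'sameSite': 'Lax',
        }
        if c.expires:  # Playwright expects a Unix timestamp here
            cookie['expires'] = c.expires
        converted.append(cookie)
    return converted

async def main():
    session = requests.Session()
    # ... authenticate against the BRAIN API here so the jar holds session cookies ...
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        await context.add_cookies(to_playwright_cookies(session))
        page = await context.new_page()
        await page.goto("https://support.worldquantbrain.com/hc/en-us")
        await browser.close()

asyncio.run(main())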
@@ -1,7 +1,7 @@
  #!/usr/bin/env python3
  """
  WorldQuant BRAIN Forum Functions - Python Version
- Comprehensive forum functionality including glossary, search, and post viewing.
+ Comprehensive forum functionality including glossary, search, and post viewing using Playwright.
  """

  import asyncio
@@ -12,987 +12,396 @@ from datetime import datetime
  from typing import Dict, Any, List, Optional

  from bs4 import BeautifulSoup
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.edge.options import Options as EdgeOptions
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.common.exceptions import TimeoutException, NoSuchElementException
+ from playwright.async_api import async_playwright
  import requests
  import os
- import shutil
-
- # Initialize forum MCP server
- try:
-     from mcp.server.fastmcp import FastMCP
-     forum_mcp = FastMCP('brain_forum_server')
- except ImportError:
-     # Fallback for testing
-     forum_mcp = None

  def log(message: str, level: str = "INFO"):
      """Log message with timestamp."""
      timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
      print(f"[{timestamp}] [{level}] {message}", file=sys.stderr)

+ # --- Parsing Helper Functions (from playwright_forum_test.py) ---
+
+ def _is_navigation_or_metadata(line: str) -> bool:
+     """Check if a line is navigation or metadata."""
+     navigation_patterns = [
+         r'^\d+ days? ago$',
+         r'~\d+ minute read',
+         r'^Follow',
+         r'^Not yet followed',
+         r'^Updated$',
+         r'^AS\d+$',
+         r'^[A-Z] - [A-Z] - [A-Z]',  # Letter navigation
+         r'^A$',
+         r'^B$',
+         r'^[A-Z]$'  # Single letters
+     ]
+     return any(re.match(pattern, line.strip()) for pattern in navigation_patterns)
+
+ def _looks_like_term(line: str) -> bool:
+     """Check if a line looks like a glossary term."""
+     if len(line) > 100:
+         return False
+     if _is_navigation_or_metadata(line):
+         return False
+     definition_starters = ['the', 'a', 'an', 'this', 'that', 'it', 'is', 'are', 'was', 'were', 'for', 'to', 'in', 'on', 'at', 'by', 'with']
+     first_word = line.lower().split(' ')[0] if line else ''
+     if first_word and first_word in definition_starters:
+         return False
+     is_short = len(line) <= 80
+     starts_with_capital = bool(re.match(r'^[A-Z]', line))
+     has_all_caps = bool(re.match(r'^[A-Z\s\-\/\(\)]+$', line))
+     has_reasonable_length = len(line) >= 2
+     return is_short and has_reasonable_length and (starts_with_capital or has_all_caps)
+
+ def _parse_glossary_terms(content: str) -> List[Dict[str, str]]:
+     """Parse glossary terms from HTML content."""
+     soup = BeautifulSoup(content, 'html.parser')
+     # Get text from the article body, which is more reliable than splitting the whole HTML
+     article_body = soup.select_one('.article-body')
+     if not article_body:
+         return []
+
+     # Use .get_text with a separator to preserve line breaks, which is key for the logic below
+     lines = article_body.get_text(separator='\n').split('\n')
+
+     terms = []
+     current_term = None
+     current_definition = []
+
+     for line in lines:
+         line = line.strip()
+         if not line:
+             continue
+
+         if _looks_like_term(line):
+             if current_term:
+                 # Save the previous term
+                 terms.append({
+                     "term": current_term,
+                     "definition": " ".join(current_definition).strip()
+                 })
+             # Start a new term
+             current_term = line
+             current_definition = []
+         elif current_term:
+             # Add to the current definition
+             current_definition.append(line)
+
+     # Add the last term
+     if current_term:
+         terms.append({
+             "term": current_term,
+             "definition": " ".join(current_definition).strip()
+         })
+
+     # Filter out invalid terms and improve quality
+     return [term for term in terms if
+             len(term["term"]) > 0 and
+             len(term["definition"]) > 10 and
+             not _is_navigation_or_metadata(term["term"]) and
+             "ago" not in term["definition"] and
+             "minute read" not in term["definition"]]
+
  class ForumClient:
-     """Forum client for WorldQuant BRAIN support site."""
+     """Forum client for WorldQuant BRAIN support site, using Playwright."""

      def __init__(self):
          self.base_url = "https://support.worldquantbrain.com"
+         # The session is mainly used for the initial authentication via brain_client
          self.session = requests.Session()
          self.session.headers.update({
              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36'
          })
+
+     async def _get_browser_context(self, p: async_playwright, email: str, password: str):
+         """Authenticate and return a browser context with the session."""
+         # Import brain_client here to avoid circular dependency
+         from platform_functions import brain_client
+
+         log("Authenticating with BRAIN platform...", "INFO")
+         auth_result = await brain_client.authenticate(email, password)
+         if auth_result.get('status') != 'authenticated':
+             raise Exception("BRAIN platform authentication failed.")
+         log("Successfully authenticated with BRAIN platform.", "SUCCESS")
+
+         browser = await p.chromium.launch(channel="chrome", headless=True, args=['--no-sandbox'])
+         context = await browser.new_context(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
+
+         log("Transferring authentication session to browser...", "INFO")
+         cookies = brain_client.session.cookies
+         playwright_cookies = []
+         for cookie in cookies:
+             cookie_dict = {
+                 'name': cookie.name,
+                 'value': cookie.value,
+                 'domain': cookie.domain,
+                 'path': cookie.path,
+                 'secure': cookie.secure,
+                 'httpOnly': 'HttpOnly' in cookie._rest,
+                 'sameSite': 'Lax'
+             }
+             if cookie.expires:
+                 cookie_dict['expires'] = cookie.expires
+             playwright_cookies.append(cookie_dict)

-     def get_brain_session(self):
-         """Get authenticated session from BrainApiClient."""
-         try:
-             import sys
-             import os
-             sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-             from platform_functions import brain_client
-             return brain_client.session
-         except ImportError:
-             return None
-
-     def detect_available_browser(self) -> str:
-         """Detect which browser WebDriver is available."""
-         try:
-             # Try Chrome first
-             from selenium.webdriver.chrome.service import Service
-             from selenium.webdriver.chrome.options import Options
-             try:
-                 options = Options()
-                 options.add_argument('--headless')
-                 driver = webdriver.Chrome(options=options)
-                 driver.quit()
-                 return "chrome"
-             except Exception:
-                 pass
-
-             # Try Edge
-             try:
-                 from selenium.webdriver.edge.options import Options as EdgeOptions
-                 options = EdgeOptions()
-                 options.add_argument('--headless')
-                 driver = webdriver.Edge(options=options)
-                 driver.quit()
-                 return "edge"
-             except Exception:
-                 pass
-
-             # Default to chrome
-             return "chrome"
-         except Exception:
-             return "chrome"
-
-     def setup_browser_options(self, headless: bool, browser_type: str):
-         """Setup browser options based on browser type."""
-         if browser_type.lower() == "chrome":
-             return self.setup_chrome_options(headless)
-         elif browser_type.lower() == "edge":
-             return self.setup_edge_options(headless)
-         else:
-             return self.setup_chrome_options(headless)
-
-     def setup_edge_options(self, headless: bool = True) -> EdgeOptions:
-         """Setup Edge options for web scraping."""
-         options = EdgeOptions()
-
-         if headless:
-             options.add_argument('--headless')
-
-         # Performance optimizations
-         options.add_argument('--disable-blink-features=AutomationControlled')
-         options.add_argument('--log-level=3')
-         options.add_argument('--no-sandbox')
-         options.add_argument('--disable-dev-shm-usage')
-         options.add_argument('--disable-web-security')
-         options.add_argument('--disable-features=VizDisplayCompositor')
-         options.add_argument('--disable-gpu')
-         options.add_argument('--disable-extensions')
-         options.add_argument('--disable-images')
-         options.add_argument('--disable-javascript')
-         options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
-
-         return options
-
-     def setup_chrome_options(self, headless: bool = True) -> Options:
-         """Setup Chrome options for web scraping."""
-         options = Options()
-
-         if headless:
-             options.add_argument('--headless')
-
-         # Performance optimizations
-         options.add_argument('--disable-blink-features=AutomationControlled')
-         options.add_argument('--log-level=3')
-         options.add_argument('--no-sandbox')
-         options.add_argument('--disable-dev-shm-usage')
-         options.add_argument('--disable-web-security')
-         options.add_argument('--disable-features=VizDisplayCompositor')
-         options.add_argument('--disable-gpu')
-         options.add_argument('--disable-extensions')
-         options.add_argument('--disable-images')
-         options.add_argument('--disable-javascript')
-         options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
-
-         return options
-
-     async def create_driver(self, headless: bool = True):
-         """Create and configure WebDriver with cross-browser support."""
-         browser_type = self.detect_available_browser()
-         log(f"Using browser: {browser_type}", "INFO")
-
-         options = self.setup_browser_options(headless, browser_type)
+         await context.add_cookies(playwright_cookies)
+         log("Session transferred.", "SUCCESS")

-         try:
-             if browser_type.lower() == "chrome":
-                 driver = webdriver.Chrome(options=options)
-             elif browser_type.lower() == "edge":
-                 driver = webdriver.Edge(options=options)
-             else:
-                 # Fallback to Chrome
-                 log("Falling back to Chrome", "WARNING")
-                 driver = webdriver.Chrome(options=options)
-
-             # Set aggressive timeouts for speed
-             driver.set_page_load_timeout(30)
-             driver.implicitly_wait(10)
-
-             return driver
-
-         except Exception as e:
-             log(f"Failed to create {browser_type} driver: {str(e)}", "ERROR")
-             help_text = self.get_driver_installation_help(browser_type)
-             log(help_text, "ERROR")
-
-             # Try Chrome as fallback if Edge failed
-             if browser_type.lower() != "chrome":
-                 try:
-                     log("Trying Chrome as fallback", "INFO")
-                     chrome_options = self.setup_browser_options(headless, "chrome")
-                     driver = webdriver.Chrome(options=chrome_options)
-                     driver.set_page_load_timeout(30)
-                     driver.implicitly_wait(10)
-                     return driver
-                 except Exception as e2:
-                     log(f"Chrome fallback also failed: {str(e2)}", "ERROR")
-                     chrome_help = self.get_driver_installation_help("chrome")
-                     log(chrome_help, "ERROR")
-
-             raise Exception(f"Could not create any browser driver. {help_text}")
-
-     async def login_to_forum(self, driver, email: str, password: str) -> bool:
-         """Login to the WorldQuant BRAIN forum using existing authentication."""
-         try:
-             # Import BrainApiClient from platform_functions
-             import sys
-             import os
-             sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
+         return browser, context
+
+     async def get_glossary_terms(self, email: str, password: str) -> List[Dict[str, str]]:
+         """Extract glossary terms from the forum using Playwright."""
+         async with async_playwright() as p:
+             browser = None
              try:
-                 from platform_functions import brain_client
-                 log("Using existing BrainApiClient for authentication", "INFO")
-
-                 # First authenticate with BrainApiClient
-                 auth_result = await brain_client.authenticate(email, password)
-                 if auth_result.get('status') != 'authenticated':
-                     log("BrainApiClient authentication failed", "ERROR")
-                     return False
-
-                 log("Successfully authenticated via BrainApiClient", "SUCCESS")
-
-                 # Navigate to forum with authenticated session
-                 log("Navigating to forum with authenticated session", "WORK")
-                 driver.get("https://support.worldquantbrain.com/hc/en-us")
-                 await asyncio.sleep(2)
+                 log("Starting glossary extraction process with Playwright", "INFO")
+                 browser, context = await self._get_browser_context(p, email, password)

-                 # Add authentication cookies to browser
-                 cookies = brain_client.session.cookies
-                 for cookie in cookies:
-                     driver.add_cookie({
-                         'name': cookie.name,
-                         'value': cookie.value,
-                         'domain': '.worldquantbrain.com'
-                     })
+                 page = await context.new_page()
+                 log("Navigating to BRAIN support forum glossary...", "INFO")
+                 await page.goto("https://support.worldquantbrain.com/hc/en-us/articles/4902349883927-Click-here-for-a-list-of-terms-and-their-definitions")

-                 # Refresh page with cookies
-                 driver.refresh()
-                 await asyncio.sleep(2)
+                 log("Extracting glossary content...", "INFO")
+                 content = await page.content()

-                 return True
+                 terms = _parse_glossary_terms(content)

-             except ImportError:
-                 log("BrainApiClient not available, using manual login", "WARNING")
-
-                 # Fallback to manual login
-                 driver.get("https://support.worldquantbrain.com/hc/en-us/signin")
-                 await asyncio.sleep(3)
-
-                 email_input = WebDriverWait(driver, 15).until(
-                     EC.presence_of_element_located((By.NAME, "email"))
-                 )
-                 password_input = WebDriverWait(driver, 15).until(
-                     EC.presence_of_element_located((By.NAME, "currentPassword"))
-                 )
-
-                 email_input.clear()
-                 email_input.send_keys(email)
-                 password_input.clear()
-                 password_input.send_keys(password)
-
-                 login_button = WebDriverWait(driver, 15).until(
-                     EC.element_to_be_clickable((By.XPATH, '//button[@type="submit"]'))
-                 )
-                 login_button.click()
-                 await asyncio.sleep(3)
-
-                 return True
-
-         except Exception as e:
-             log(f"Login failed: {str(e)}", "ERROR")
-             return False
-
-     async def get_glossary_terms(self, email: str, password: str, headless: bool = False) -> Dict[str, Any]:
-         """Extract glossary terms from the forum."""
-         driver = None
-         try:
-             log("Starting glossary extraction process", "INFO")
-
-             # Add timeout protection
-             async def extraction_with_timeout():
-                 return await self._perform_glossary_extraction(email, password, headless)
-
-             # Run with 5-minute timeout
-             result = await asyncio.wait_for(extraction_with_timeout(), timeout=300)
-             return result
-
-         except asyncio.TimeoutError:
-             log("Glossary extraction timed out after 5 minutes", "ERROR")
-             return {"error": "Glossary extraction timed out after 5 minutes"}
-         except Exception as e:
-             log(f"Glossary extraction failed: {str(e)}", "ERROR")
-             return {"error": str(e)}
-         finally:
-             if driver:
-                 try:
-                     driver.quit()
-                 except:
-                     pass
-
-     async def _perform_glossary_extraction(self, email: str, password: str, headless: bool) -> Dict[str, Any]:
-         """Perform the actual glossary extraction."""
-         driver = None
-         try:
-             driver = await self.create_driver(headless)
-
-             # Login
-             if not await self.login_to_forum(driver, email, password):
-                 raise Exception("Failed to login to forum")
-
-             # Navigate to glossary page
-             log("Navigating to glossary page", "WORK")
-             driver.get("https://support.worldquantbrain.com/hc/en-us/articles/4902349883927-Click-here-for-a-list-of-terms-and-their-definitions")
-             await asyncio.sleep(5)
-
-             # Extract content
-             log("Extracting glossary content", "WORK")
-             page_source = driver.page_source
-             soup = BeautifulSoup(page_source, 'html.parser')
-
-             # Parse glossary terms
-             terms = self._parse_glossary_terms(page_source)
-
-             log(f"Extracted {len(terms)} glossary terms", "SUCCESS")
-             return {
-                 "terms": terms,
-                 "total_count": len(terms),
-                 "extraction_timestamp": datetime.now().isoformat()
-             }
-
-         finally:
-             if driver:
-                 try:
-                     driver.quit()
-                 except:
-                     pass
-
-     def _parse_glossary_terms(self, content: str) -> List[Dict[str, str]]:
-         """Parse glossary terms from HTML content."""
-         terms = []
-         lines = content.split('\n')
-
-         current_term = None
-         current_definition = []
-         is_collecting_definition = False
-         found_first_real_term = False
-
-         for line in lines:
-             line = line.strip()
-             if not line:
-                 continue
-
-             # Skip navigation and metadata lines at the beginning
-             if not found_first_real_term and self._is_navigation_or_metadata(line):
-                 continue
-
-             # Check if this line looks like a term
-             if self._looks_like_term(line) and not is_collecting_definition:
-                 # Mark that we found the first real term
-                 if not found_first_real_term:
-                     found_first_real_term = True
+                 log(f"Extracted {len(terms)} glossary terms", "SUCCESS")
+                 return terms
+
+             except Exception as e:
+                 log(f"Glossary extraction failed: {str(e)}", "ERROR")
+                 # Re-raise to be handled by the MCP server wrapper
+                 raise
+             finally:
+                 if browser:
+                     await browser.close()
+                     log("Browser closed.", "INFO")
+
+     async def search_forum_posts(self, email: str, password: str, search_query: str, max_results: int = 50, locale: str = "zh-cn") -> Dict[str, Any]:
+         """Search for posts on the forum using Playwright, with pagination."""
+         async with async_playwright() as p:
+             browser = None
+             try:
+                 log(f"Starting forum search for '{search_query}'", "INFO")
+                 browser, context = await self._get_browser_context(p, email, password)
+
+                 page = await context.new_page()

-                 # Save previous term if exists
-                 if current_term and current_definition:
-                     terms.append({
-                         "term": current_term.strip(),
-                         "definition": " ".join(current_definition).strip()
-                     })
+                 search_results = []
+                 page_num = 1

-                 current_term = line
-                 current_definition = []
-                 is_collecting_definition = True
-             elif is_collecting_definition and found_first_real_term:
-                 # Check if this is the start of a new term
-                 if self._looks_like_term(line):
-                     # Save current term
-                     if current_term and current_definition:
-                         terms.append({
-                             "term": current_term.strip(),
-                             "definition": " ".join(current_definition).strip()
-                         })
+                 while len(search_results) < max_results:
+                     search_url = f"{self.base_url}/hc/{locale}/search?page={page_num}&query={search_query}#results"
+                     log(f"Navigating to search page: {search_url}", "INFO")

-                     current_term = line
-                     current_definition = []
-                 else:
-                     # Add to definition
-                     if current_definition:
-                         current_definition.append(line)
-                     else:
-                         current_definition = [line]
-
-         # Don't forget the last term
-         if current_term and current_definition and found_first_real_term:
-             terms.append({
-                 "term": current_term.strip(),
-                 "definition": " ".join(current_definition).strip()
-             })
-
-         # Filter out invalid terms and improve quality
-         return [term for term in terms if
-                 len(term["term"]) > 0 and
-                 len(term["definition"]) > 10 and  # Ensure meaningful definitions
-                 not self._is_navigation_or_metadata(term["term"]) and
-                 "ago" not in term["definition"] and  # Remove timestamp-like definitions
-                 "minute read" not in term["definition"]]  # Remove reading time
-
-     def _looks_like_term(self, line: str) -> bool:
-         """Check if a line looks like a glossary term."""
-         # Skip very long lines (likely definitions)
-         if len(line) > 100:
-             return False
-
-         # Skip navigation and metadata
-         if self._is_navigation_or_metadata(line):
-             return False
-
-         # Skip lines that start with common definition words
-         definition_starters = ['the', 'a', 'an', 'this', 'that', 'it', 'is', 'are', 'was', 'were', 'for', 'to', 'in', 'on', 'at', 'by', 'with']
-         first_word = line.lower().split(' ')[0]
-         if first_word and first_word in definition_starters:
-             return False
-
-         # Check if line has characteristics of a term
-         # Terms are often short, may be all caps, or start with capital
-         is_short = len(line) <= 80
-         starts_with_capital = bool(re.match(r'^[A-Z]', line))
-         has_all_caps = bool(re.match(r'^[A-Z\s\-\/\(\)]+$', line))
-         has_reasonable_length = len(line) >= 2
-
-         return is_short and has_reasonable_length and (starts_with_capital or has_all_caps)
-
-     def _is_navigation_or_metadata(self, line: str) -> bool:
-         """Check if a line is navigation or metadata."""
-         navigation_patterns = [
-             r'^\d+ days? ago$',
-             r'~\d+ minute read',
-             r'^Follow',
-             r'^Not yet followed',
-             r'^Updated$',
-             r'^AS\d+$',
-             r'^[A-Z] - [A-Z] - [A-Z]',  # Letter navigation
-             r'^A$',
-             r'^B$',
-             r'^[A-Z]$'  # Single letters
-         ]
-
-         return any(re.match(pattern, line.strip()) for pattern in navigation_patterns)
-
-     def get_driver_installation_help(self, browser_type: str) -> str:
-         """Provide helpful instructions for installing WebDriver."""
-         if browser_type.lower() == "chrome":
-             return """
- Chrome WebDriver not found. Please install ChromeDriver:
- 1. Download from: https://chromedriver.chromium.org/downloads
- 2. Make sure version matches your Chrome browser
- 3. Add to PATH or place in current directory
- 4. Alternative: Install via pip: pip install chromedriver-autoinstaller
- """
-         elif browser_type.lower() == "edge":
-             return """
- Edge WebDriver not found. Please install Edge WebDriver:
- 1. Download from: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
- 2. Make sure version matches your Edge browser
- 3. Add to PATH or place in current directory
- 4. Alternative: Install via pip: pip install msedge-selenium-tools
- """
-         else:
-             return "Please install either ChromeDriver or Edge WebDriver for browser automation."
-
-     async def read_full_forum_post(self, email: str, password: str, post_url_or_id: str,
-                                    headless: bool = False, include_comments: bool = True) -> Dict[str, Any]:
-         """Read a complete forum post with optional comments."""
-         driver = None
-         try:
-             log("Starting forum post reading process", "INFO")
-
-             # Determine if input is URL or article ID
-             is_url = post_url_or_id.startswith('http')
-             if is_url:
-                 post_url = post_url_or_id
-             else:
-                 post_url = f"https://support.worldquantbrain.com/hc/zh-cn/community/posts/{post_url_or_id}"
-
-             log(f"Target URL: {post_url}", "INFO")
-             log(f"Include comments: {include_comments}", "INFO")
-
-             driver = await self.create_driver(headless)
-
-             # Login
-             if not await self.login_to_forum(driver, email, password):
-                 raise Exception("Failed to login to forum")
-
-             # Navigate directly to post URL
-             log(f"Opening post: {post_url}", "WORK")
-             driver.get(post_url)
-             log("Post page loaded, extracting content immediately", "WORK")
-
-             # Wait minimal time for content to appear
-             await asyncio.sleep(2)
-
-             # Extract post content quickly
-             post_data = {}
-             page_source = driver.page_source
-             soup = BeautifulSoup(page_source, 'html.parser')
-
-             # Extract post title
-             title = soup.select_one('.post-title, h1, .article-title')
-             if not title:
-                 title = soup.select_one('title')
-             post_data['title'] = title.get_text().strip() if title else 'Unknown Title'
-
-             # Extract post author
-             author = soup.select_one('.post-author, .author, .article-author')
-             if not author:
-                 author = soup.select_one('.comment-author')
-             post_data['author'] = author.get_text().strip() if author else 'Unknown Author'
-
-             # Extract post date
-             date = soup.select_one('.post-date, .date, .article-date, time')
-             if not date:
-                 time_element = soup.select_one('time')
-                 if time_element:
-                     date = time_element.get('datetime') or time_element.get('title') or time_element.get_text().strip()
-                 else:
-                     date = 'Unknown Date'
-             else:
-                 date = date.get_text().strip()
-             post_data['date'] = date if date else 'Unknown Date'
-
-             # Extract post content
-             post_content = soup.select_one('.post-body, .article-body, .content, .post-content')
-             if not post_content:
-                 post_content = soup.select_one('article, main')
-
-             if post_content:
-                 post_data['content_html'] = str(post_content)
-                 post_data['content_text'] = post_content.get_text().strip()
-             else:
-                 post_data['content_html'] = 'No content found'
-                 post_data['content_text'] = 'No content found'
-
-             post_data['url'] = post_url
-             post_data['current_url'] = driver.current_url
-
-             log(f"Post content extracted: \"{post_data['title']}\"", "SUCCESS")
-
-             comments = []
-             total_comments = 0
-
-             # Extract comments conditionally
-             if include_comments:
-                 log("Extracting comments...", "WORK")
-                 comments = await self._extract_forum_comments_full(driver, soup)
-                 total_comments = len(comments)
-                 log(f"Extracted {total_comments} comments", "SUCCESS")
-             else:
-                 log("Skipping comment extraction (includeComments=false)", "INFO")
-
-             return {
-                 "success": True,
-                 "post": post_data,
-                 "comments": comments,
-                 "total_comments": total_comments,
-                 "extracted_at": datetime.now().isoformat(),
-                 "processing_time": "full_extraction_with_comments" if include_comments else "post_only_extraction",
-                 "include_comments": include_comments
-             }
-
-         except Exception as e:
-             log(f"Failed to read forum post: {str(e)}", "ERROR")
-             return {"error": str(e)}
-         finally:
-             if driver:
-                 try:
-                     driver.quit()
-                 except:
-                     pass
-
-     async def _extract_forum_comments_full(self, driver, soup: BeautifulSoup) -> List[Dict[str, Any]]:
-         """Extract all comments from forum post with pagination support."""
-         all_comments = []
-         page_num = 1
-
-         try:
-             # First extract comments from current page source
-             page_comments = self._parse_comments_from_html(soup)
-             all_comments.extend(page_comments)
-             log(f"Found {len(page_comments)} comments on page {page_num}", "INFO")
-
-             # Check for pagination and continue if needed
-             while True:
-                 try:
-                     # Look for next page button
-                     next_button = driver.find_element(By.CSS_SELECTOR, "span.pagination-next-text, .pagination-next, .next")
-                     next_text = next_button.text
+                     try:
+                         response = await page.goto(search_url)
+                         if response.status == 404:
+                             log(f"Page {page_num} not found. End of results.", "INFO")
+                             break
+                         await page.wait_for_selector('ul.search-results-list', timeout=15000)
+                     except Exception as e:
+                         log(f"Could not load search results on page {page_num}: {e}", "INFO")
+                         break
+
+                     content = await page.content()
+                     soup = BeautifulSoup(content, 'html.parser')

-                     if "下一页" in next_text or "Next" in next_text or "next" in next_text.lower():
-                         log(f"Found next page, continuing to page {page_num + 1}", "INFO")
-                         next_button.click()
-                         await asyncio.sleep(2)  # Minimal wait for next page
+                     results_on_page = soup.select('li.search-result-list-item')
+                     if not results_on_page:
+                         log("No more search results found.", "INFO")
+                         break
+
+                     for result in results_on_page:
+                         title_element = result.select_one('h2.search-result-title a')
+                         snippet_element = result.select_one('.search-results-description')

-                         # Extract comments from new page
-                         new_page_source = driver.page_source
-                         new_soup = BeautifulSoup(new_page_source, 'html.parser')
-                         new_page_comments = self._parse_comments_from_html(new_soup)
+                         if title_element:
+                             title = title_element.get_text(strip=True)
+                             link = title_element.get('href')
+
+                             votes_element = result.select_one('.search-result-votes span[aria-hidden="true"]')
+                             votes_text = votes_element.get_text(strip=True) if votes_element else '0'
+                             votes_match = re.search(r'\d+', votes_text)
+                             votes = int(votes_match.group()) if votes_match else 0
+
+                             comments_element = result.select_one('.search-result-meta-count span[aria-hidden="true"]')
+                             comments_text = comments_element.get_text(strip=True) if comments_element else '0'
+                             comments_match = re.search(r'\d+', comments_text)
+                             comments = int(comments_match.group()) if comments_match else 0
+
+                             breadcrumbs_elements = result.select('ol.search-result-breadcrumbs li')
+                             breadcrumbs = [bc.get_text(strip=True) for bc in breadcrumbs_elements]
+
+                             meta_group = result.select_one('ul.meta-group')
+                             author = 'Unknown'
+                             post_date = 'Unknown'
+                             if meta_group:
+                                 meta_data_elements = meta_group.select('li.meta-data')
+                                 if len(meta_data_elements) > 0:
+                                     author = meta_data_elements[0].get_text(strip=True)
+                                 if len(meta_data_elements) > 1:
+                                     time_element = meta_data_elements[1].select_one('time')
+                                     if time_element:
+                                         post_date = time_element.get('datetime', time_element.get_text(strip=True))
+
+                             snippet = snippet_element.get_text(strip=True) if snippet_element else ''
+
+                             full_link = ''
+                             if link:
+                                 if link.startswith('http'):
+                                     full_link = link
+                                 else:
+                                     full_link = f"{self.base_url}{link}"
+
+                             search_results.append({
+                                 'title': title,
+                                 'link': full_link,
+                                 'snippet': snippet,
+                                 'votes': votes,
+                                 'comments': comments,
+                                 'author': author,
+                                 'date': post_date,
+                                 'breadcrumbs': breadcrumbs
+                             })

-                         if len(new_page_comments) == 0:
+                             if len(search_results) >= max_results:
                                  break
-
-                     all_comments.extend(new_page_comments)
-                     page_num += 1
-                     log(f"Found {len(new_page_comments)} comments on page {page_num}", "INFO")
-                 else:
+
+                     if len(search_results) >= max_results:
                          break
-                 except Exception as e:
-                     log("No more pages found", "INFO")
-                     break
-
-             return all_comments
-
-         except Exception as e:
-             log(f"Error in comment extraction: {str(e)}", "WARNING")
-             return all_comments

-     def _parse_comments_from_html(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
-         """Parse comments from HTML using BeautifulSoup."""
-         comments = []
-
-         # Try multiple selectors for comments
-         comment_selectors = [
-             'ul#comments.comment-list li.comment',
-             '.comment-list .comment',
-             '.comments .comment',
-             'li.comment',
-             '.comment-item'
-         ]
-
-         comment_elements = None
-
-         for selector in comment_selectors:
-             comment_elements = soup.select(selector)
-             if comment_elements:
-                 log(f"Found comments using selector: {selector}", "INFO")
-                 break
-
-         if not comment_elements:
-             log("No comments found on this page", "INFO")
-             return comments
-
-         for index, element in enumerate(comment_elements):
-             try:
-                 comment = {}
-
-                 # Extract comment ID
-                 comment['id'] = element.get('id') or f"comment-{index}"
-
-                 # Extract author
-                 author_element = element.select_one('.comment-author a, .author a, .comment-author')
-                 comment['author'] = author_element.get_text().strip() if author_element else 'Unknown Author'
-                 comment['author_link'] = author_element.get('href') if author_element else ''
-
-                 # Extract date
-                 time_element = element.select_one('.meta-data time, time, .date, .comment-date')
-                 if time_element:
-                     comment['date'] = time_element.get('datetime') or time_element.get('title') or time_element.get_text().strip()
-                     comment['date_display'] = time_element.get('title') or time_element.get_text().strip()
-                 else:
-                     comment['date'] = 'Unknown Date'
-                     comment['date_display'] = 'Unknown Date'
-
-                 # Extract content
-                 content_element = element.select_one('.comment-body, .comment-content, .content')
-                 if content_element:
-                     comment['content_html'] = str(content_element)
-                     comment['content_text'] = content_element.get_text().strip()
-                 else:
-                     comment['content_html'] = ''
-                     comment['content_text'] = ''
-
-                 # Extract votes
-                 vote_element = element.select_one('.vote-up span, .votes, .vote-count')
-                 comment['votes'] = vote_element.get_text().strip() if vote_element else '0'
-
-                 # Extract status
-                 status_element = element.select_one('.status-label, .status, .badge')
-                 comment['status'] = status_element.get_text().strip() if status_element else '普通评论'
-
-                 if comment['content_text']:
-                     comments.append(comment)
+                     page_num += 1
+
+                 log(f"Found {len(search_results)} results for '{search_query}'", "SUCCESS")

+                 return {
+                     "success": True,
+                     "results": search_results,
+                     "total_found": len(search_results)
+                 }
+
              except Exception as e:
-                 log(f"Error parsing comment {index}: {str(e)}", "WARNING")
-
-         return comments
-
-     async def search_forum_posts(self, email: str, password: str, search_query: str,
-                                  max_results: int = 50, headless: bool = True) -> Dict[str, Any]:
-         """Search forum posts."""
-         driver = None
-         try:
-             log("Starting forum search process", "INFO")
-             log(f"Search query: '{search_query}'", "INFO")
-             log(f"Max results: {max_results}", "INFO")
-
-             driver = await self.create_driver(headless)
-
-             # Login
-             if not await self.login_to_forum(driver, email, password):
-                 raise Exception("Failed to login to forum")
-
-             # Navigate to search
-             encoded_query = requests.utils.quote(search_query)
-             search_url = f"https://support.worldquantbrain.com/hc/zh-cn/search?utf8=%E2%9C%93&query={encoded_query}"
-             log(f"Opening search URL: {search_url}", "WORK")
-
-             driver.get(search_url)
-             await asyncio.sleep(2)
-
-             # Collect results with pagination
-             all_results = []
-             page_num = 1
-
-             log("Starting result collection with pagination", "WORK")
-
-             while len(all_results) < max_results:
-                 log(f"Processing page {page_num}", "INFO")
-
-                 # Wait for search results
-                 try:
-                     WebDriverWait(driver, 10).until(
-                         EC.presence_of_element_located((By.CSS_SELECTOR, '.search-results-list, .search-result-list-item'))
-                     )
-                 except TimeoutException:
-                     log(f"No search results found on page {page_num}", "WARNING")
-                     break
-
-                 # Extract results from current page
-                 page_source = driver.page_source
-                 soup = BeautifulSoup(page_source, 'html.parser')
-                 page_results = self._extract_search_results(soup, page_num)
-
-                 if not page_results:
-                     log(f"No more results found on page {page_num}", "INFO")
-                     break
-
-                 all_results.extend(page_results)
-
-                 # Check if we have enough results
-                 if len(all_results) >= max_results:
-                     all_results = all_results[:max_results]
-                     break
-
-                 # Try to go to next page
-                 if not await self._go_to_next_search_page(driver, soup):
-                     log("No more pages available", "INFO")
-                     break
-
-                 page_num += 1
-                 await asyncio.sleep(1)
-
-             # Analyze results
-             analysis = self._analyze_search_results(all_results, search_query)
-
-             log(f"Search completed. Found {len(all_results)} results", "SUCCESS")
-             return {
-                 "results": all_results,
-                 "total_found": len(all_results),
-                 "search_query": search_query,
-                 "analysis": analysis,
-                 "search_timestamp": datetime.now().isoformat()
-             }
-
-         except Exception as e:
-             log(f"Search failed: {str(e)}", "ERROR")
-             return {"error": str(e)}
-         finally:
-             if driver:
-                 try:
-                     driver.quit()
-                 except:
-                     pass
-
-     def _extract_search_results(self, soup: BeautifulSoup, page_num: int) -> List[Dict[str, Any]]:
-         """Extract search results from a page using multiple resilient selectors.
-
-         Improvements vs original implementation:
-         - Tries several container selectors (mirrors TS Cheerio approach)
-         - Extracts richer metadata: description_html/text, votes, comments, author, date
-         - Preserves legacy fields (snippet, metadata) for backward compatibility
-         - Adds index & page for downstream analytics
-         - Robust fallbacks & normalization of URLs
-         """
-         results: List[Dict[str, Any]] = []
-
-         # Ordered list of possible container selectors (keep broad ones last)
-         container_selectors = [
-             '.search-result-list-item',
-             '.search-results-list .search-result',
-             '.striped-list-item',
-             '.article-list-item',
-             'article.search-result',
-             'div.search-result',
-         ]
-
-         # Collect candidate elements (stop at first selector that yields results)
-         result_items = []
-         for selector in container_selectors:
-             found = soup.select(selector)
-             if found:
-                 log(f"Found {len(found)} search results using selector: {selector}", "INFO")
-                 result_items = found
-                 break
-
-         # Fallback: regex class scan (original heuristic)
-         if not result_items:
-             fallback = soup.find_all(['article', 'div'], class_=re.compile(r'search-result|article-item'))
-             if fallback:
-                 log(f"Fallback selector captured {len(fallback)} results", "INFO")
-                 result_items = fallback
-             else:
-                 log("No search result items found with any selector", "WARNING")
-                 return results
-
-         def first_text(element, selector_list: List[str]) -> str:
-             for sel in selector_list:
-                 found = element.select_one(sel)
-                 if found and found.get_text(strip=True):
-                     return found.get_text(strip=True)
-             return ''
-
-         for idx, item in enumerate(result_items):
+                 log(f"Forum search failed: {str(e)}", "ERROR")
+                 raise
+             finally:
+                 if browser:
+                     await browser.close()
+
+     async def read_full_forum_post(self, email: str, password: str, post_url_or_id: str, include_comments: bool = True) -> Dict[str, Any]:
+         """Read a complete forum post and all its comments using Playwright."""
+         async with async_playwright() as p:
+             browser = None
              try:
-                 # Title & link
-                 title_link_elem = None
-                 title_selectors = [
-                     '.search-result-title a',
-                     'h3 a',
-                     '.title a',
-                     'a'
-                 ]
-                 for sel in title_selectors:
-                     candidate = item.select_one(sel)
-                     if candidate and candidate.get_text(strip=True):
-                         title_link_elem = candidate
-                         break
+                 log("Starting forum post reading process with Playwright", "INFO")

-                 title = title_link_elem.get_text(strip=True) if title_link_elem else 'No title'
-                 link = title_link_elem.get('href') if title_link_elem and title_link_elem.has_attr('href') else ''
-                 if link and not link.startswith('http'):
-                     link = f"https://support.worldquantbrain.com{link}"
-
-                 if not link and not title:
-                     continue  # Skip invalid entries
-
-                 # Description / snippet
-                 desc_elem = None
-                 desc_selectors = [
-                     '.search-results-description',
-                     '.description',
-                     '.excerpt',
-                     '.content-preview',
-                     'p'
-                 ]
-                 for sel in desc_selectors:
-                     candidate = item.select_one(sel)
-                     if candidate and candidate.get_text(strip=True):
-                         desc_elem = candidate
-                         break
+                 if post_url_or_id.startswith('http'):
+                     initial_url = post_url_or_id
+                 else:
+                     initial_url = f"https://support.worldquantbrain.com/hc/zh-cn/community/posts/{post_url_or_id}"

-                 description_html = str(desc_elem) if desc_elem else ''
-                 description_text = desc_elem.get_text(strip=True) if desc_elem else ''
-
-                 # Votes & comments
-                 votes = first_text(item, [
-                     '.search-result-votes span',
-                     '.votes span',
-                     '[class*="vote"] span',
-                     '[class*="vote"]'
-                 ]) or '0'
-                 comments = first_text(item, [
-                     '.search-result-meta-count span',
-                     '.comments span',
-                     '[class*="comment"] span',
-                     '[class*="comment"]'
-                 ]) or '0'
-
-                 # Metadata / author / date
-                 meta_block = item.select_one('.meta-data, .metadata, .post-meta')
-                 author = 'Unknown'
-                 date_val = 'Unknown'
-                 if meta_block:
-                     meta_text = meta_block.get_text(' ', strip=True)
-                     # Split on common separators
-                     parts = [p.strip() for p in re.split(r'[·•|]', meta_text) if p.strip()]
-                     if len(parts) >= 2:
-                         author = parts[0] or author
-                         date_val = parts[1] or date_val
-
-                 # Fallback selectors
-                 if author == 'Unknown':
-                     author = first_text(item, ['.author', '.username', '[class*="author"]']) or 'Unknown'
-                 if date_val == 'Unknown':
-                     # time element or date class
-                     time_elem = item.select_one('.date, time, [class*="date"]')
-                     if time_elem:
-                         date_val = time_elem.get('datetime') or time_elem.get('title') or time_elem.get_text(strip=True) or 'Unknown'
-
-                 # Compose legacy fields
-                 snippet = description_text
-                 metadata = f"author={author} date={date_val} votes={votes} comments={comments}".strip()
-
-                 results.append({
-                     'title': title,
-                     'link': link,
-                     'description_html': description_html or 'No description',
-                     'description_text': description_text or 'No description',
-                     'votes': votes,
-                     'comments': comments,
-                     'author': author,
-                     'date': date_val,
-                     'snippet': snippet,  # backward compatibility
-                     'metadata': metadata,  # backward compatibility / quick summary
-                     'page': page_num,
-                     'index': idx
-                 })
-             except Exception as e:
-                 log(f"Error extracting search result {idx}: {str(e)}", "WARNING")
-                 continue
+                 browser, context = await self._get_browser_context(p, email, password)
+                 page = await context.new_page()

-         return results
-
-     async def _go_to_next_search_page(self, driver: webdriver.Chrome, soup: BeautifulSoup) -> bool:
-         """Navigate to the next search page."""
-         try:
-             # Look for next page link
-             next_link = soup.find('a', string=re.compile(r'next|下一页', re.IGNORECASE))
-             if not next_link:
-                 next_link = soup.find('a', {'rel': 'next'})
-
-             if next_link and next_link.get('href'):
-                 next_url = next_link['href']
-                 if not next_url.startswith('http'):
-                     next_url = f"https://support.worldquantbrain.com{next_url}"
+                 # --- Get Main Post Content and Final URL ---
+                 log(f"Navigating to initial URL: {initial_url}", "INFO")
+                 await page.goto(initial_url)
+                 await page.wait_for_selector('.post-body, .article-body', timeout=15000)

-                 driver.get(next_url)
-                 await asyncio.sleep(2)
-                 return True
-
-             return False
-
-         except Exception as e:
-             log(f"Error navigating to next page: {str(e)}", "WARNING")
-             return False
-
-     def _analyze_search_results(self, results: List[Dict[str, Any]], search_query: str) -> Dict[str, Any]:
-         """Analyze search results for insights."""
-         if not results:
-             return {"message": "No results found"}
-
-         # Basic statistics
-         total_results = len(results)
-
-         # Categorize results by type
-         categories = {}
-         for result in results:
-             title = result.get('title', '').lower()
-             if 'tutorial' in title or 'guide' in title:
-                 categories['tutorials'] = categories.get('tutorials', 0) + 1
-             elif 'api' in title or 'reference' in title:
-                 categories['api_docs'] = categories.get('api_docs', 0) + 1
-             elif 'error' in title or 'issue' in title or 'problem' in title:
-                 categories['troubleshooting'] = categories.get('troubleshooting', 0) + 1
-             elif 'competition' in title or 'event' in title:
-                 categories['competitions'] = categories.get('competitions', 0) + 1
-             else:
-                 categories['general'] = categories.get('general', 0) + 1
-
-         # Find most relevant results (containing search terms)
-         search_terms = search_query.lower().split()
-         relevant_results = []
-
-         for result in results:
-             title = result.get('title', '').lower()
-             snippet = result.get('snippet', '').lower()
-             text = f"{title} {snippet}"
-
-             term_matches = sum(1 for term in search_terms if term in text)
-             if term_matches > 0:
-                 relevant_results.append({
-                     "result": result,
-                     "relevance_score": term_matches / len(search_terms)
-                 })
-
-         # Sort by relevance
-         relevant_results.sort(key=lambda x: x['relevance_score'], reverse=True)
-
-         return {
-             "total_results": total_results,
-             "categories": categories,
-             "most_relevant": relevant_results[:5] if relevant_results else [],
-             "search_terms": search_terms
-         }
+                 # Get the final URL after any redirects
+                 base_url = re.sub(r'(\?|&)page=\d+', '', page.url).split('#')[0]
+                 log(f"Resolved to Base URL: {base_url}", "INFO")
+                 await page.wait_for_selector('.post-body, .article-body', timeout=15000)
+                 content = await page.content()
+                 soup = BeautifulSoup(content, 'html.parser')
+
+                 post_data = {}
+                 title_element = soup.select_one('.post-title, h1.article-title, .article__title')
+                 post_data['title'] = title_element.get_text(strip=True) if title_element else 'Unknown Title'
+
+                 author_span = soup.select_one('.post-author span[title]')
+                 post_data['author'] = author_span['title'] if author_span else 'Unknown Author'
+
+                 body_element = soup.select_one('.post-body, .article-body')
+                 post_data['body'] = body_element.get_text(strip=True) if body_element else 'Body not found'
+
+                 votes_element = soup.select_one('.vote-sum')
+                 date_element = soup.select_one('.post-meta .meta-data')
+                 post_data['details'] = {
+                     'votes': votes_element.get_text(strip=True) if votes_element else '0',
+                     'date': date_element.get_text(strip=True) if date_element else 'Unknown Date'
+                 }
+
+                 # --- Get Comments with Pagination ---
+                 comments = []
+                 if include_comments:
+                     log("Starting comment extraction...", "INFO")
+                     page_num = 1
+                     while True:
+                         comment_url = f"{base_url}?page={page_num}#comments"
+                         log(f"Navigating to comment page: {comment_url}", "INFO")
+
+                         try:
+                             response = await page.goto(comment_url)
+                             if response.status == 404:
+                                 log(f"Page {page_num} returned 404. End of comments.", "INFO")
+                                 break
+                             await page.wait_for_selector('.comment-list', timeout=10000)
+                         except Exception as e:
+                             log(f"Could not load page {page_num}: {e}. Assuming end of comments.", "INFO")
+                             break
+
+                         comment_soup = BeautifulSoup(await page.content(), 'html.parser')
+                         comment_elements = comment_soup.select('.comment')
+
+                         if not comment_elements:
+                             log(f"No comments found on page {page_num}. Ending extraction.", "INFO")
+                             break
+
+                         log(f"Found {len(comment_elements)} comments on page {page_num}.", "INFO")
+
+                         new_comments_found_on_page = 0
+                         for comment_element in comment_elements:
+                             author_span = comment_element.select_one('.comment-author span[title]')
+                             author_id = author_span['title'] if author_span else 'Unknown'
+
+                             body_element = comment_element.select_one('.comment-body')
+                             date_element = comment_element.select_one('.comment-meta .meta-data')
+
+                             comment_data = {
+                                 'author': author_id,
+                                 'body': body_element.get_text(strip=True) if body_element else '',
+                                 'date': date_element.get_text(strip=True) if date_element else 'Unknown Date'
+                             }
+
+                             if comment_data not in comments:
+                                 comments.append(comment_data)
+                                 new_comments_found_on_page += 1
+
+                         if new_comments_found_on_page == 0 and page_num > 1:
+                             log(f"No new comments detected on page {page_num}. Ending extraction.", "INFO")
+                             break
+
+                         page_num += 1
+
+                 log(f"Extracted {len(comments)} comments in total.", "SUCCESS")
+                 return {
+                     "success": True, "post": post_data, "comments": comments, "total_comments": len(comments)
+                 }
+
+             except Exception as e:
+                 log(f"Failed to read forum post: {str(e)}", "ERROR")
+                 raise
+             finally:
+                 if browser:
+                     await browser.close()

  # Initialize forum client
  forum_client = ForumClient()

- # MCP Tools for Forum Functions - REMOVED (duplicate with platform_functions.py)
- # These tools are already properly integrated in the main platform_functions.py
-
+ # The main block is for testing and won't be run by the MCP server.
  if __name__ == "__main__":
-     print("📚 WorldQuant BRAIN Forum Functions Server Starting...", file=sys.stderr)
-     print("Note: Forum tools are now integrated in the main platform_functions.py", file=sys.stderr)
-     print("This file provides the ForumClient class for internal use.", file=sys.stderr)
+     print("📚 WorldQuant BRAIN Forum Functions - This script provides the ForumClient class.", file=sys.stderr)
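All three public ForumClient methods are now coroutines that raise on failure instead of returning an {"error": ...} dict, so callers need await plus their own try/except. A usage sketch under that assumption (credentials and the query are placeholders; the flat import assumes the cnhkmcp/untracked/ directory is on sys.path, matching the module's own `from platform_functions import brain_client`):

import asyncio
from forum_functions import forum_client  # hypothetical sys.path setup assumed

async def main():
    # get_glossary_terms() now returns a plain list of {"term", "definition"} dicts
    terms = await forum_client.get_glossary_terms("user@example.com", "secret")
    print(f"{len(terms)} glossary terms")

    # search_forum_posts() paginates until max_results is reached or results run out
    search = await forum_client.search_forum_posts(
        "user@example.com", "secret", "alpha decay", max_results=10
    )
    for item in search["results"]:
        print(item["votes"], item["title"], item["link"])

    # read_full_forum_post() accepts either a full URL or a bare post ID
    post = await forum_client.read_full_forum_post(
        "user@example.com", "secret", "4902349883927", include_comments=True
    )
    print(post["post"]["title"], post["total_comments"])

asyncio.run(main())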