cnhkmcp 2.1.4__py3-none-any.whl → 2.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnhkmcp/__init__.py +126 -126
- cnhkmcp/untracked/back_up/forum_functions.py +998 -0
- cnhkmcp/untracked/back_up/platform_functions.py +2886 -0
- cnhkmcp/untracked/brain-consultant.md +31 -0
- cnhkmcp/untracked/forum_functions.py +350 -941
- cnhkmcp/untracked/platform_functions.py +445 -730
- cnhkmcp/untracked/skills/Claude_Skill_Creation_Guide.md +140 -0
- cnhkmcp/untracked/skills/expression_verifier/SKILL.md +51 -0
- cnhkmcp/untracked/skills/expression_verifier/scripts/validator.py +889 -0
- cnhkmcp/untracked/skills/expression_verifier/scripts/verify_expr.py +52 -0
- cnhkmcp/untracked/skills/pull_BRAINSkill/SKILL.md +51 -0
- cnhkmcp/untracked/skills/pull_BRAINSkill/scripts/pull_skills.py +188 -0
- cnhkmcp/untracked//321/211/320/225/320/235/321/207/342/225/234/320/276/321/205/320/231/320/235/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/230/320/241_/321/205/320/276/320/231/321/210/320/263/320/225/321/205/342/224/220/320/225/321/210/320/266/320/221/321/204/342/225/233/320/255/321/210/342/225/241/320/246/321/205/320/234/320/225.py +3 -1
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/METADATA +1 -1
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/RECORD +19 -13
- cnhkmcp/untracked/APP/Tranformer/ace.log +0 -0
- cnhkmcp/untracked/APP/Tranformer/parsetab.py +0 -60
- cnhkmcp/untracked/APP/simulator/wqb20260107015647.log +0 -57
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/WHEEL +0 -0
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/entry_points.txt +0 -0
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/licenses/LICENSE +0 -0
- {cnhkmcp-2.1.4.dist-info → cnhkmcp-2.1.6.dist-info}/top_level.txt +0 -0
cnhkmcp/untracked/forum_functions.py:

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 WorldQuant BRAIN Forum Functions - Python Version
-Comprehensive forum functionality including glossary, search, and post viewing.
+Comprehensive forum functionality including glossary, search, and post viewing using Playwright.
 """

 import asyncio
@@ -12,987 +12,396 @@ from datetime import datetime
 from typing import Dict, Any, List, Optional

 from bs4 import BeautifulSoup
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.edge.options import Options as EdgeOptions
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from playwright.async_api import async_playwright
 import requests
 import os
-import shutil
-
-# Initialize forum MCP server
-try:
-    from mcp.server.fastmcp import FastMCP
-    forum_mcp = FastMCP('brain_forum_server')
-except ImportError:
-    # Fallback for testing
-    forum_mcp = None

 def log(message: str, level: str = "INFO"):
     """Log message with timestamp."""
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     print(f"[{timestamp}] [{level}] {message}", file=sys.stderr)

+# --- Parsing Helper Functions (from playwright_forum_test.py) ---
+
+def _is_navigation_or_metadata(line: str) -> bool:
+    """Check if a line is navigation or metadata."""
+    navigation_patterns = [
+        r'^\d+ days? ago$',
+        r'~\d+ minute read',
+        r'^Follow',
+        r'^Not yet followed',
+        r'^Updated$',
+        r'^AS\d+$',
+        r'^[A-Z] - [A-Z] - [A-Z]',  # Letter navigation
+        r'^A$',
+        r'^B$',
+        r'^[A-Z]$'  # Single letters
+    ]
+    return any(re.match(pattern, line.strip()) for pattern in navigation_patterns)
+
+def _looks_like_term(line: str) -> bool:
+    """Check if a line looks like a glossary term."""
+    if len(line) > 100:
+        return False
+    if _is_navigation_or_metadata(line):
+        return False
+    definition_starters = ['the', 'a', 'an', 'this', 'that', 'it', 'is', 'are', 'was', 'were', 'for', 'to', 'in', 'on', 'at', 'by', 'with']
+    first_word = line.lower().split(' ')[0] if line else ''
+    if first_word and first_word in definition_starters:
+        return False
+    is_short = len(line) <= 80
+    starts_with_capital = bool(re.match(r'^[A-Z]', line))
+    has_all_caps = bool(re.match(r'^[A-Z\s\-\/\(\)]+$', line))
+    has_reasonable_length = len(line) >= 2
+    return is_short and has_reasonable_length and (starts_with_capital or has_all_caps)
+
+def _parse_glossary_terms(content: str) -> List[Dict[str, str]]:
+    """Parse glossary terms from HTML content."""
+    soup = BeautifulSoup(content, 'html.parser')
+    # Get text from the article body, which is more reliable than splitting the whole HTML
+    article_body = soup.select_one('.article-body')
+    if not article_body:
+        return []
+
+    # Use .get_text with a separator to preserve line breaks, which is key for the logic below
+    lines = article_body.get_text(separator='\n').split('\n')
+
+    terms = []
+    current_term = None
+    current_definition = []
+
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+
+        if _looks_like_term(line):
+            if current_term:
+                # Save the previous term
+                terms.append({
+                    "term": current_term,
+                    "definition": " ".join(current_definition).strip()
+                })
+            # Start a new term
+            current_term = line
+            current_definition = []
+        elif current_term:
+            # Add to the current definition
+            current_definition.append(line)
+
+    # Add the last term
+    if current_term:
+        terms.append({
+            "term": current_term,
+            "definition": " ".join(current_definition).strip()
+        })
+
+    # Filter out invalid terms and improve quality
+    return [term for term in terms if
+            len(term["term"]) > 0 and
+            len(term["definition"]) > 10 and
+            not _is_navigation_or_metadata(term["term"]) and
+            "ago" not in term["definition"] and
+            "minute read" not in term["definition"]]
+
 class ForumClient:
-    """Forum client for WorldQuant BRAIN support site."""
+    """Forum client for WorldQuant BRAIN support site, using Playwright."""

     def __init__(self):
         self.base_url = "https://support.worldquantbrain.com"
+        # The session is mainly used for the initial authentication via brain_client
         self.session = requests.Session()
         self.session.headers.update({
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36'
         })
+
+    async def _get_browser_context(self, p: async_playwright, email: str, password: str):
+        """Authenticate and return a browser context with the session."""
+        # Import brain_client here to avoid circular dependency
+        from platform_functions import brain_client
+
+        log("Authenticating with BRAIN platform...", "INFO")
+        auth_result = await brain_client.authenticate(email, password)
+        if auth_result.get('status') != 'authenticated':
+            raise Exception("BRAIN platform authentication failed.")
+        log("Successfully authenticated with BRAIN platform.", "SUCCESS")
+
+        browser = await p.chromium.launch(channel="chrome", headless=True, args=['--no-sandbox'])
+        context = await browser.new_context(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
+
+        log("Transferring authentication session to browser...", "INFO")
+        cookies = brain_client.session.cookies
+        playwright_cookies = []
+        for cookie in cookies:
+            cookie_dict = {
+                'name': cookie.name,
+                'value': cookie.value,
+                'domain': cookie.domain,
+                'path': cookie.path,
+                'secure': cookie.secure,
+                'httpOnly': 'HttpOnly' in cookie._rest,
+                'sameSite': 'Lax'
+            }
+            if cookie.expires:
+                cookie_dict['expires'] = cookie.expires
+            playwright_cookies.append(cookie_dict)

-
-        "
-        try:
-            import sys
-            import os
-            sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-            from platform_functions import brain_client
-            return brain_client.session
-        except ImportError:
-            return None
-
-    def detect_available_browser(self) -> str:
-        """Detect which browser WebDriver is available."""
-        try:
-            # Try Chrome first
-            from selenium.webdriver.chrome.service import Service
-            from selenium.webdriver.chrome.options import Options
-            try:
-                options = Options()
-                options.add_argument('--headless')
-                driver = webdriver.Chrome(options=options)
-                driver.quit()
-                return "chrome"
-            except Exception:
-                pass
-
-            # Try Edge
-            try:
-                from selenium.webdriver.edge.options import Options as EdgeOptions
-                options = EdgeOptions()
-                options.add_argument('--headless')
-                driver = webdriver.Edge(options=options)
-                driver.quit()
-                return "edge"
-            except Exception:
-                pass
-
-            # Default to chrome
-            return "chrome"
-        except Exception:
-            return "chrome"
-
-    def setup_browser_options(self, headless: bool, browser_type: str):
-        """Setup browser options based on browser type."""
-        if browser_type.lower() == "chrome":
-            return self.setup_chrome_options(headless)
-        elif browser_type.lower() == "edge":
-            return self.setup_edge_options(headless)
-        else:
-            return self.setup_chrome_options(headless)
-
-    def setup_edge_options(self, headless: bool = True) -> EdgeOptions:
-        """Setup Edge options for web scraping."""
-        options = EdgeOptions()
-
-        if headless:
-            options.add_argument('--headless')
-
-        # Performance optimizations
-        options.add_argument('--disable-blink-features=AutomationControlled')
-        options.add_argument('--log-level=3')
-        options.add_argument('--no-sandbox')
-        options.add_argument('--disable-dev-shm-usage')
-        options.add_argument('--disable-web-security')
-        options.add_argument('--disable-features=VizDisplayCompositor')
-        options.add_argument('--disable-gpu')
-        options.add_argument('--disable-extensions')
-        options.add_argument('--disable-images')
-        options.add_argument('--disable-javascript')
-        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
-
-        return options
-
-    def setup_chrome_options(self, headless: bool = True) -> Options:
-        """Setup Chrome options for web scraping."""
-        options = Options()
-
-        if headless:
-            options.add_argument('--headless')
-
-        # Performance optimizations
-        options.add_argument('--disable-blink-features=AutomationControlled')
-        options.add_argument('--log-level=3')
-        options.add_argument('--no-sandbox')
-        options.add_argument('--disable-dev-shm-usage')
-        options.add_argument('--disable-web-security')
-        options.add_argument('--disable-features=VizDisplayCompositor')
-        options.add_argument('--disable-gpu')
-        options.add_argument('--disable-extensions')
-        options.add_argument('--disable-images')
-        options.add_argument('--disable-javascript')
-        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
-
-        return options
-
-    async def create_driver(self, headless: bool = True):
-        """Create and configure WebDriver with cross-browser support."""
-        browser_type = self.detect_available_browser()
-        log(f"Using browser: {browser_type}", "INFO")
-
-        options = self.setup_browser_options(headless, browser_type)
+        await context.add_cookies(playwright_cookies)
+        log("Session transferred.", "SUCCESS")

-
-
-
-
-
-
-            # Fallback to Chrome
-            log("Falling back to Chrome", "WARNING")
-            driver = webdriver.Chrome(options=options)
-
-            # Set aggressive timeouts for speed
-            driver.set_page_load_timeout(30)
-            driver.implicitly_wait(10)
-
-            return driver
-
-        except Exception as e:
-            log(f"Failed to create {browser_type} driver: {str(e)}", "ERROR")
-            help_text = self.get_driver_installation_help(browser_type)
-            log(help_text, "ERROR")
-
-            # Try Chrome as fallback if Edge failed
-            if browser_type.lower() != "chrome":
-                try:
-                    log("Trying Chrome as fallback", "INFO")
-                    chrome_options = self.setup_browser_options(headless, "chrome")
-                    driver = webdriver.Chrome(options=chrome_options)
-                    driver.set_page_load_timeout(30)
-                    driver.implicitly_wait(10)
-                    return driver
-                except Exception as e2:
-                    log(f"Chrome fallback also failed: {str(e2)}", "ERROR")
-                    chrome_help = self.get_driver_installation_help("chrome")
-                    log(chrome_help, "ERROR")
-
-            raise Exception(f"Could not create any browser driver. {help_text}")
-
-    async def login_to_forum(self, driver, email: str, password: str) -> bool:
-        """Login to the WorldQuant BRAIN forum using existing authentication."""
-        try:
-            # Import BrainApiClient from platform_functions
-            import sys
-            import os
-            sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
+        return browser, context
+
+    async def get_glossary_terms(self, email: str, password: str) -> List[Dict[str, str]]:
+        """Extract glossary terms from the forum using Playwright."""
+        async with async_playwright() as p:
+            browser = None
             try:
-
-
-
-            # First authenticate with BrainApiClient
-            auth_result = await brain_client.authenticate(email, password)
-            if auth_result.get('status') != 'authenticated':
-                log("BrainApiClient authentication failed", "ERROR")
-                return False
-
-            log("Successfully authenticated via BrainApiClient", "SUCCESS")
-
-            # Navigate to forum with authenticated session
-            log("Navigating to forum with authenticated session", "WORK")
-            driver.get("https://support.worldquantbrain.com/hc/en-us")
-            await asyncio.sleep(2)
+                log("Starting glossary extraction process with Playwright", "INFO")
+                browser, context = await self._get_browser_context(p, email, password)

-
-
-            for
-            driver.add_cookie({
-                'name': cookie.name,
-                'value': cookie.value,
-                'domain': '.worldquantbrain.com'
-            })
+                page = await context.new_page()
+                log("Navigating to BRAIN support forum glossary...", "INFO")
+                await page.goto("https://support.worldquantbrain.com/hc/en-us/articles/4902349883927-Click-here-for-a-list-of-terms-and-their-definitions")

-
-
-            await asyncio.sleep(2)
+                log("Extracting glossary content...", "INFO")
+                content = await page.content()

-
+                terms = _parse_glossary_terms(content)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-            login_button.click()
-            await asyncio.sleep(3)
-
-            return True
-
-        except Exception as e:
-            log(f"Login failed: {str(e)}", "ERROR")
-            return False
-
-    async def get_glossary_terms(self, email: str, password: str, headless: bool = False) -> Dict[str, Any]:
-        """Extract glossary terms from the forum."""
-        driver = None
-        try:
-            log("Starting glossary extraction process", "INFO")
-
-            # Add timeout protection
-            async def extraction_with_timeout():
-                return await self._perform_glossary_extraction(email, password, headless)
-
-            # Run with 5-minute timeout
-            result = await asyncio.wait_for(extraction_with_timeout(), timeout=300)
-            return result
-
-        except asyncio.TimeoutError:
-            log("Glossary extraction timed out after 5 minutes", "ERROR")
-            return {"error": "Glossary extraction timed out after 5 minutes"}
-        except Exception as e:
-            log(f"Glossary extraction failed: {str(e)}", "ERROR")
-            return {"error": str(e)}
-        finally:
-            if driver:
-                try:
-                    driver.quit()
-                except:
-                    pass
-
-    async def _perform_glossary_extraction(self, email: str, password: str, headless: bool) -> Dict[str, Any]:
-        """Perform the actual glossary extraction."""
-        driver = None
-        try:
-            driver = await self.create_driver(headless)
-
-            # Login
-            if not await self.login_to_forum(driver, email, password):
-                raise Exception("Failed to login to forum")
-
-            # Navigate to glossary page
-            log("Navigating to glossary page", "WORK")
-            driver.get("https://support.worldquantbrain.com/hc/en-us/articles/4902349883927-Click-here-for-a-list-of-terms-and-their-definitions")
-            await asyncio.sleep(5)
-
-            # Extract content
-            log("Extracting glossary content", "WORK")
-            page_source = driver.page_source
-            soup = BeautifulSoup(page_source, 'html.parser')
-
-            # Parse glossary terms
-            terms = self._parse_glossary_terms(page_source)
-
-            log(f"Extracted {len(terms)} glossary terms", "SUCCESS")
-            return {
-                "terms": terms,
-                "total_count": len(terms),
-                "extraction_timestamp": datetime.now().isoformat()
-            }
-
-        finally:
-            if driver:
-                try:
-                    driver.quit()
-                except:
-                    pass
-
-    def _parse_glossary_terms(self, content: str) -> List[Dict[str, str]]:
-        """Parse glossary terms from HTML content."""
-        terms = []
-        lines = content.split('\n')
-
-        current_term = None
-        current_definition = []
-        is_collecting_definition = False
-        found_first_real_term = False
-
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-
-            # Skip navigation and metadata lines at the beginning
-            if not found_first_real_term and self._is_navigation_or_metadata(line):
-                continue
-
-            # Check if this line looks like a term
-            if self._looks_like_term(line) and not is_collecting_definition:
-                # Mark that we found the first real term
-                if not found_first_real_term:
-                    found_first_real_term = True
+                log(f"Extracted {len(terms)} glossary terms", "SUCCESS")
+                return terms
+
+            except Exception as e:
+                log(f"Glossary extraction failed: {str(e)}", "ERROR")
+                # Re-raise to be handled by the MCP server wrapper
+                raise
+            finally:
+                if browser:
+                    await browser.close()
+                    log("Browser closed.", "INFO")
+
+    async def search_forum_posts(self, email: str, password: str, search_query: str, max_results: int = 50, locale: str = "zh-cn") -> Dict[str, Any]:
+        """Search for posts on the forum using Playwright, with pagination."""
+        async with async_playwright() as p:
+            browser = None
+            try:
+                log(f"Starting forum search for '{search_query}'", "INFO")
+                browser, context = await self._get_browser_context(p, email, password)
+
+                page = await context.new_page()

-
-
-                    terms.append({
-                        "term": current_term.strip(),
-                        "definition": " ".join(current_definition).strip()
-                    })
+                search_results = []
+                page_num = 1

-
-
-
-            elif is_collecting_definition and found_first_real_term:
-                # Check if this is the start of a new term
-                if self._looks_like_term(line):
-                    # Save current term
-                    if current_term and current_definition:
-                        terms.append({
-                            "term": current_term.strip(),
-                            "definition": " ".join(current_definition).strip()
-                        })
+                while len(search_results) < max_results:
+                    search_url = f"{self.base_url}/hc/{locale}/search?page={page_num}&query={search_query}#results"
+                    log(f"Navigating to search page: {search_url}", "INFO")

-
-
-
-
-
-
-
-
-
-
-
-
-                            "term": current_term.strip(),
-                            "definition": " ".join(current_definition).strip()
-                        })
-
-        # Filter out invalid terms and improve quality
-        return [term for term in terms if
-                len(term["term"]) > 0 and
-                len(term["definition"]) > 10 and  # Ensure meaningful definitions
-                not self._is_navigation_or_metadata(term["term"]) and
-                "ago" not in term["definition"] and  # Remove timestamp-like definitions
-                "minute read" not in term["definition"]]  # Remove reading time
-
-    def _looks_like_term(self, line: str) -> bool:
-        """Check if a line looks like a glossary term."""
-        # Skip very long lines (likely definitions)
-        if len(line) > 100:
-            return False
-
-        # Skip navigation and metadata
-        if self._is_navigation_or_metadata(line):
-            return False
-
-        # Skip lines that start with common definition words
-        definition_starters = ['the', 'a', 'an', 'this', 'that', 'it', 'is', 'are', 'was', 'were', 'for', 'to', 'in', 'on', 'at', 'by', 'with']
-        first_word = line.lower().split(' ')[0]
-        if first_word and first_word in definition_starters:
-            return False
-
-        # Check if line has characteristics of a term
-        # Terms are often short, may be all caps, or start with capital
-        is_short = len(line) <= 80
-        starts_with_capital = bool(re.match(r'^[A-Z]', line))
-        has_all_caps = bool(re.match(r'^[A-Z\s\-\/\(\)]+$', line))
-        has_reasonable_length = len(line) >= 2
-
-        return is_short and has_reasonable_length and (starts_with_capital or has_all_caps)
-
-    def _is_navigation_or_metadata(self, line: str) -> bool:
-        """Check if a line is navigation or metadata."""
-        navigation_patterns = [
-            r'^\d+ days? ago$',
-            r'~\d+ minute read',
-            r'^Follow',
-            r'^Not yet followed',
-            r'^Updated$',
-            r'^AS\d+$',
-            r'^[A-Z] - [A-Z] - [A-Z]',  # Letter navigation
-            r'^A$',
-            r'^B$',
-            r'^[A-Z]$'  # Single letters
-        ]
-
-        return any(re.match(pattern, line.strip()) for pattern in navigation_patterns)
-
-    def get_driver_installation_help(self, browser_type: str) -> str:
-        """Provide helpful instructions for installing WebDriver."""
-        if browser_type.lower() == "chrome":
-            return """
-Chrome WebDriver not found. Please install ChromeDriver:
-1. Download from: https://chromedriver.chromium.org/downloads
-2. Make sure version matches your Chrome browser
-3. Add to PATH or place in current directory
-4. Alternative: Install via pip: pip install chromedriver-autoinstaller
-"""
-        elif browser_type.lower() == "edge":
-            return """
-Edge WebDriver not found. Please install Edge WebDriver:
-1. Download from: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
-2. Make sure version matches your Edge browser
-3. Add to PATH or place in current directory
-4. Alternative: Install via pip: pip install msedge-selenium-tools
-"""
-        else:
-            return "Please install either ChromeDriver or Edge WebDriver for browser automation."
-
-    async def read_full_forum_post(self, email: str, password: str, post_url_or_id: str,
-                                   headless: bool = False, include_comments: bool = True) -> Dict[str, Any]:
-        """Read a complete forum post with optional comments."""
-        driver = None
-        try:
-            log("Starting forum post reading process", "INFO")
-
-            # Determine if input is URL or article ID
-            is_url = post_url_or_id.startswith('http')
-            if is_url:
-                post_url = post_url_or_id
-            else:
-                post_url = f"https://support.worldquantbrain.com/hc/zh-cn/community/posts/{post_url_or_id}"
-
-            log(f"Target URL: {post_url}", "INFO")
-            log(f"Include comments: {include_comments}", "INFO")
-
-            driver = await self.create_driver(headless)
-
-            # Login
-            if not await self.login_to_forum(driver, email, password):
-                raise Exception("Failed to login to forum")
-
-            # Navigate directly to post URL
-            log(f"Opening post: {post_url}", "WORK")
-            driver.get(post_url)
-            log("Post page loaded, extracting content immediately", "WORK")
-
-            # Wait minimal time for content to appear
-            await asyncio.sleep(2)
-
-            # Extract post content quickly
-            post_data = {}
-            page_source = driver.page_source
-            soup = BeautifulSoup(page_source, 'html.parser')
-
-            # Extract post title
-            title = soup.select_one('.post-title, h1, .article-title')
-            if not title:
-                title = soup.select_one('title')
-            post_data['title'] = title.get_text().strip() if title else 'Unknown Title'
-
-            # Extract post author
-            author = soup.select_one('.post-author, .author, .article-author')
-            if not author:
-                author = soup.select_one('.comment-author')
-            post_data['author'] = author.get_text().strip() if author else 'Unknown Author'
-
-            # Extract post date
-            date = soup.select_one('.post-date, .date, .article-date, time')
-            if not date:
-                time_element = soup.select_one('time')
-                if time_element:
-                    date = time_element.get('datetime') or time_element.get('title') or time_element.get_text().strip()
-                else:
-                    date = 'Unknown Date'
-            else:
-                date = date.get_text().strip()
-            post_data['date'] = date if date else 'Unknown Date'
-
-            # Extract post content
-            post_content = soup.select_one('.post-body, .article-body, .content, .post-content')
-            if not post_content:
-                post_content = soup.select_one('article, main')
-
-            if post_content:
-                post_data['content_html'] = str(post_content)
-                post_data['content_text'] = post_content.get_text().strip()
-            else:
-                post_data['content_html'] = 'No content found'
-                post_data['content_text'] = 'No content found'
-
-            post_data['url'] = post_url
-            post_data['current_url'] = driver.current_url
-
-            log(f"Post content extracted: \"{post_data['title']}\"", "SUCCESS")
-
-            comments = []
-            total_comments = 0
-
-            # Extract comments conditionally
-            if include_comments:
-                log("Extracting comments...", "WORK")
-                comments = await self._extract_forum_comments_full(driver, soup)
-                total_comments = len(comments)
-                log(f"Extracted {total_comments} comments", "SUCCESS")
-            else:
-                log("Skipping comment extraction (includeComments=false)", "INFO")
-
-            return {
-                "success": True,
-                "post": post_data,
-                "comments": comments,
-                "total_comments": total_comments,
-                "extracted_at": datetime.now().isoformat(),
-                "processing_time": "full_extraction_with_comments" if include_comments else "post_only_extraction",
-                "include_comments": include_comments
-            }
-
-        except Exception as e:
-            log(f"Failed to read forum post: {str(e)}", "ERROR")
-            return {"error": str(e)}
-        finally:
-            if driver:
-                try:
-                    driver.quit()
-                except:
-                    pass
-
-    async def _extract_forum_comments_full(self, driver, soup: BeautifulSoup) -> List[Dict[str, Any]]:
-        """Extract all comments from forum post with pagination support."""
-        all_comments = []
-        page_num = 1
-
-        try:
-            # First extract comments from current page source
-            page_comments = self._parse_comments_from_html(soup)
-            all_comments.extend(page_comments)
-            log(f"Found {len(page_comments)} comments on page {page_num}", "INFO")
-
-            # Check for pagination and continue if needed
-            while True:
-                try:
-                    # Look for next page button
-                    next_button = driver.find_element(By.CSS_SELECTOR, "span.pagination-next-text, .pagination-next, .next")
-                    next_text = next_button.text
+                    try:
+                        response = await page.goto(search_url)
+                        if response.status == 404:
+                            log(f"Page {page_num} not found. End of results.", "INFO")
+                            break
+                        await page.wait_for_selector('ul.search-results-list', timeout=15000)
+                    except Exception as e:
+                        log(f"Could not load search results on page {page_num}: {e}", "INFO")
+                        break
+
+                    content = await page.content()
+                    soup = BeautifulSoup(content, 'html.parser')

-
-
-
-
+                    results_on_page = soup.select('li.search-result-list-item')
+                    if not results_on_page:
+                        log("No more search results found.", "INFO")
+                        break
+
+                    for result in results_on_page:
+                        title_element = result.select_one('h2.search-result-title a')
+                        snippet_element = result.select_one('.search-results-description')

-
-
-
-
+                        if title_element:
+                            title = title_element.get_text(strip=True)
+                            link = title_element.get('href')
+
+                            votes_element = result.select_one('.search-result-votes span[aria-hidden="true"]')
+                            votes_text = votes_element.get_text(strip=True) if votes_element else '0'
+                            votes_match = re.search(r'\d+', votes_text)
+                            votes = int(votes_match.group()) if votes_match else 0
+
+                            comments_element = result.select_one('.search-result-meta-count span[aria-hidden="true"]')
+                            comments_text = comments_element.get_text(strip=True) if comments_element else '0'
+                            comments_match = re.search(r'\d+', comments_text)
+                            comments = int(comments_match.group()) if comments_match else 0
+
+                            breadcrumbs_elements = result.select('ol.search-result-breadcrumbs li')
+                            breadcrumbs = [bc.get_text(strip=True) for bc in breadcrumbs_elements]
+
+                            meta_group = result.select_one('ul.meta-group')
+                            author = 'Unknown'
+                            post_date = 'Unknown'
+                            if meta_group:
+                                meta_data_elements = meta_group.select('li.meta-data')
+                                if len(meta_data_elements) > 0:
+                                    author = meta_data_elements[0].get_text(strip=True)
+                                if len(meta_data_elements) > 1:
+                                    time_element = meta_data_elements[1].select_one('time')
+                                    if time_element:
+                                        post_date = time_element.get('datetime', time_element.get_text(strip=True))
+
+                            snippet = snippet_element.get_text(strip=True) if snippet_element else ''
+
+                            full_link = ''
+                            if link:
+                                if link.startswith('http'):
+                                    full_link = link
+                                else:
+                                    full_link = f"{self.base_url}{link}"
+
+                            search_results.append({
+                                'title': title,
+                                'link': full_link,
+                                'snippet': snippet,
+                                'votes': votes,
+                                'comments': comments,
+                                'author': author,
+                                'date': post_date,
+                                'breadcrumbs': breadcrumbs
+                            })

-                    if len(
+                            if len(search_results) >= max_results:
                                 break
-
-
-                    page_num += 1
-                    log(f"Found {len(new_page_comments)} comments on page {page_num}", "INFO")
-                else:
+
+                    if len(search_results) >= max_results:
                         break
-                except Exception as e:
-                    log("No more pages found", "INFO")
-                    break
-
-            return all_comments
-
-        except Exception as e:
-            log(f"Error in comment extraction: {str(e)}", "WARNING")
-            return all_comments

-
-
-
-
-            # Try multiple selectors for comments
-            comment_selectors = [
-                'ul#comments.comment-list li.comment',
-                '.comment-list .comment',
-                '.comments .comment',
-                'li.comment',
-                '.comment-item'
-            ]
-
-            comment_elements = None
-
-            for selector in comment_selectors:
-                comment_elements = soup.select(selector)
-                if comment_elements:
-                    log(f"Found comments using selector: {selector}", "INFO")
-                    break
-
-            if not comment_elements:
-                log("No comments found on this page", "INFO")
-                return comments
-
-            for index, element in enumerate(comment_elements):
-                try:
-                    comment = {}
-
-                    # Extract comment ID
-                    comment['id'] = element.get('id') or f"comment-{index}"
-
-                    # Extract author
-                    author_element = element.select_one('.comment-author a, .author a, .comment-author')
-                    comment['author'] = author_element.get_text().strip() if author_element else 'Unknown Author'
-                    comment['author_link'] = author_element.get('href') if author_element else ''
-
-                    # Extract date
-                    time_element = element.select_one('.meta-data time, time, .date, .comment-date')
-                    if time_element:
-                        comment['date'] = time_element.get('datetime') or time_element.get('title') or time_element.get_text().strip()
-                        comment['date_display'] = time_element.get('title') or time_element.get_text().strip()
-                    else:
-                        comment['date'] = 'Unknown Date'
-                        comment['date_display'] = 'Unknown Date'
-
-                    # Extract content
-                    content_element = element.select_one('.comment-body, .comment-content, .content')
-                    if content_element:
-                        comment['content_html'] = str(content_element)
-                        comment['content_text'] = content_element.get_text().strip()
-                    else:
-                        comment['content_html'] = ''
-                        comment['content_text'] = ''
-
-                    # Extract votes
-                    vote_element = element.select_one('.vote-up span, .votes, .vote-count')
-                    comment['votes'] = vote_element.get_text().strip() if vote_element else '0'
-
-                    # Extract status
-                    status_element = element.select_one('.status-label, .status, .badge')
-                    comment['status'] = status_element.get_text().strip() if status_element else '普通评论'
-
-                    if comment['content_text']:
-                        comments.append(comment)
+                    page_num += 1
+
+                log(f"Found {len(search_results)} results for '{search_query}'", "SUCCESS")

+                return {
+                    "success": True,
+                    "results": search_results,
+                    "total_found": len(search_results)
+                }
+
             except Exception as e:
-                log(f"
-
-
-
-
-
-
-
-
-
-            log(f"Search query: '{search_query}'", "INFO")
-            log(f"Max results: {max_results}", "INFO")
-
-            driver = await self.create_driver(headless)
-
-            # Login
-            if not await self.login_to_forum(driver, email, password):
-                raise Exception("Failed to login to forum")
-
-            # Navigate to search
-            encoded_query = requests.utils.quote(search_query)
-            search_url = f"https://support.worldquantbrain.com/hc/zh-cn/search?utf8=%E2%9C%93&query={encoded_query}"
-            log(f"Opening search URL: {search_url}", "WORK")
-
-            driver.get(search_url)
-            await asyncio.sleep(2)
-
-            # Collect results with pagination
-            all_results = []
-            page_num = 1
-
-            log("Starting result collection with pagination", "WORK")
-
-            while len(all_results) < max_results:
-                log(f"Processing page {page_num}", "INFO")
-
-                # Wait for search results
-                try:
-                    WebDriverWait(driver, 10).until(
-                        EC.presence_of_element_located((By.CSS_SELECTOR, '.search-results-list, .search-result-list-item'))
-                    )
-                except TimeoutException:
-                    log(f"No search results found on page {page_num}", "WARNING")
-                    break
-
-                # Extract results from current page
-                page_source = driver.page_source
-                soup = BeautifulSoup(page_source, 'html.parser')
-                page_results = self._extract_search_results(soup, page_num)
-
-                if not page_results:
-                    log(f"No more results found on page {page_num}", "INFO")
-                    break
-
-                all_results.extend(page_results)
-
-                # Check if we have enough results
-                if len(all_results) >= max_results:
-                    all_results = all_results[:max_results]
-                    break
-
-                # Try to go to next page
-                if not await self._go_to_next_search_page(driver, soup):
-                    log("No more pages available", "INFO")
-                    break
-
-                page_num += 1
-                await asyncio.sleep(1)
-
-            # Analyze results
-            analysis = self._analyze_search_results(all_results, search_query)
-
-            log(f"Search completed. Found {len(all_results)} results", "SUCCESS")
-            return {
-                "results": all_results,
-                "total_found": len(all_results),
-                "search_query": search_query,
-                "analysis": analysis,
-                "search_timestamp": datetime.now().isoformat()
-            }
-
-        except Exception as e:
-            log(f"Search failed: {str(e)}", "ERROR")
-            return {"error": str(e)}
-        finally:
-            if driver:
-                try:
-                    driver.quit()
-                except:
-                    pass
-
-    def _extract_search_results(self, soup: BeautifulSoup, page_num: int) -> List[Dict[str, Any]]:
-        """Extract search results from a page using multiple resilient selectors.
-
-        Improvements vs original implementation:
-        - Tries several container selectors (mirrors TS Cheerio approach)
-        - Extracts richer metadata: description_html/text, votes, comments, author, date
-        - Preserves legacy fields (snippet, metadata) for backward compatibility
-        - Adds index & page for downstream analytics
-        - Robust fallbacks & normalization of URLs
-        """
-        results: List[Dict[str, Any]] = []
-
-        # Ordered list of possible container selectors (keep broad ones last)
-        container_selectors = [
-            '.search-result-list-item',
-            '.search-results-list .search-result',
-            '.striped-list-item',
-            '.article-list-item',
-            'article.search-result',
-            'div.search-result',
-        ]
-
-        # Collect candidate elements (stop at first selector that yields results)
-        result_items = []
-        for selector in container_selectors:
-            found = soup.select(selector)
-            if found:
-                log(f"Found {len(found)} search results using selector: {selector}", "INFO")
-                result_items = found
-                break
-
-        # Fallback: regex class scan (original heuristic)
-        if not result_items:
-            fallback = soup.find_all(['article', 'div'], class_=re.compile(r'search-result|article-item'))
-            if fallback:
-                log(f"Fallback selector captured {len(fallback)} results", "INFO")
-                result_items = fallback
-            else:
-                log("No search result items found with any selector", "WARNING")
-                return results
-
-        def first_text(element, selector_list: List[str]) -> str:
-            for sel in selector_list:
-                found = element.select_one(sel)
-                if found and found.get_text(strip=True):
-                    return found.get_text(strip=True)
-            return ''
-
-        for idx, item in enumerate(result_items):
+                log(f"Forum search failed: {str(e)}", "ERROR")
+                raise
+            finally:
+                if browser:
+                    await browser.close()
+
+    async def read_full_forum_post(self, email: str, password: str, post_url_or_id: str, include_comments: bool = True) -> Dict[str, Any]:
+        """Read a complete forum post and all its comments using Playwright."""
+        async with async_playwright() as p:
+            browser = None
             try:
-
-                title_link_elem = None
-                title_selectors = [
-                    '.search-result-title a',
-                    'h3 a',
-                    '.title a',
-                    'a'
-                ]
-                for sel in title_selectors:
-                    candidate = item.select_one(sel)
-                    if candidate and candidate.get_text(strip=True):
-                        title_link_elem = candidate
-                        break
+                log("Starting forum post reading process with Playwright", "INFO")

-
-
-
-
-
-                if not link and not title:
-                    continue  # Skip invalid entries
-
-                # Description / snippet
-                desc_elem = None
-                desc_selectors = [
-                    '.search-results-description',
-                    '.description',
-                    '.excerpt',
-                    '.content-preview',
-                    'p'
-                ]
-                for sel in desc_selectors:
-                    candidate = item.select_one(sel)
-                    if candidate and candidate.get_text(strip=True):
-                        desc_elem = candidate
-                        break
+                if post_url_or_id.startswith('http'):
+                    initial_url = post_url_or_id
+                else:
+                    initial_url = f"https://support.worldquantbrain.com/hc/zh-cn/community/posts/{post_url_or_id}"

-
-
-
-                # Votes & comments
-                votes = first_text(item, [
-                    '.search-result-votes span',
-                    '.votes span',
-                    '[class*="vote"] span',
-                    '[class*="vote"]'
-                ]) or '0'
-                comments = first_text(item, [
-                    '.search-result-meta-count span',
-                    '.comments span',
-                    '[class*="comment"] span',
-                    '[class*="comment"]'
-                ]) or '0'
-
-                # Metadata / author / date
-                meta_block = item.select_one('.meta-data, .metadata, .post-meta')
-                author = 'Unknown'
-                date_val = 'Unknown'
-                if meta_block:
-                    meta_text = meta_block.get_text(' ', strip=True)
-                    # Split on common separators
-                    parts = [p.strip() for p in re.split(r'[·•|]', meta_text) if p.strip()]
-                    if len(parts) >= 2:
-                        author = parts[0] or author
-                        date_val = parts[1] or date_val
-
-                # Fallback selectors
-                if author == 'Unknown':
-                    author = first_text(item, ['.author', '.username', '[class*="author"]']) or 'Unknown'
-                if date_val == 'Unknown':
-                    # time element or date class
-                    time_elem = item.select_one('.date, time, [class*="date"]')
-                    if time_elem:
-                        date_val = time_elem.get('datetime') or time_elem.get('title') or time_elem.get_text(strip=True) or 'Unknown'
-
-                # Compose legacy fields
-                snippet = description_text
-                metadata = f"author={author} date={date_val} votes={votes} comments={comments}".strip()
-
-                results.append({
-                    'title': title,
-                    'link': link,
-                    'description_html': description_html or 'No description',
-                    'description_text': description_text or 'No description',
-                    'votes': votes,
-                    'comments': comments,
-                    'author': author,
-                    'date': date_val,
-                    'snippet': snippet,  # backward compatibility
-                    'metadata': metadata,  # backward compatibility / quick summary
-                    'page': page_num,
-                    'index': idx
-                })
-            except Exception as e:
-                log(f"Error extracting search result {idx}: {str(e)}", "WARNING")
-                continue
+                browser, context = await self._get_browser_context(p, email, password)
+                page = await context.new_page()

-
-
-
-
-        try:
-            # Look for next page link
-            next_link = soup.find('a', string=re.compile(r'next|下一页', re.IGNORECASE))
-            if not next_link:
-                next_link = soup.find('a', {'rel': 'next'})
-
-            if next_link and next_link.get('href'):
-                next_url = next_link['href']
-                if not next_url.startswith('http'):
-                    next_url = f"https://support.worldquantbrain.com{next_url}"
+                # --- Get Main Post Content and Final URL ---
+                log(f"Navigating to initial URL: {initial_url}", "INFO")
+                await page.goto(initial_url)
+                await page.wait_for_selector('.post-body, .article-body', timeout=15000)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Get the final URL after any redirects
+                base_url = re.sub(r'(\?|&)page=\d+', '', page.url).split('#')[0]
+                log(f"Resolved to Base URL: {base_url}", "INFO")
+                await page.wait_for_selector('.post-body, .article-body', timeout=15000)
+                content = await page.content()
+                soup = BeautifulSoup(content, 'html.parser')
+
+                post_data = {}
+                title_element = soup.select_one('.post-title, h1.article-title, .article__title')
+                post_data['title'] = title_element.get_text(strip=True) if title_element else 'Unknown Title'
+
+                author_span = soup.select_one('.post-author span[title]')
+                post_data['author'] = author_span['title'] if author_span else 'Unknown Author'
+
+                body_element = soup.select_one('.post-body, .article-body')
+                post_data['body'] = body_element.get_text(strip=True) if body_element else 'Body not found'
+
+                votes_element = soup.select_one('.vote-sum')
+                date_element = soup.select_one('.post-meta .meta-data')
+                post_data['details'] = {
+                    'votes': votes_element.get_text(strip=True) if votes_element else '0',
+                    'date': date_element.get_text(strip=True) if date_element else 'Unknown Date'
+                }
+
+                # --- Get Comments with Pagination ---
+                comments = []
+                if include_comments:
+                    log("Starting comment extraction...", "INFO")
+                    page_num = 1
+                    while True:
+                        comment_url = f"{base_url}?page={page_num}#comments"
+                        log(f"Navigating to comment page: {comment_url}", "INFO")
+
+                        try:
+                            response = await page.goto(comment_url)
+                            if response.status == 404:
+                                log(f"Page {page_num} returned 404. End of comments.", "INFO")
+                                break
+                            await page.wait_for_selector('.comment-list', timeout=10000)
+                        except Exception as e:
+                            log(f"Could not load page {page_num}: {e}. Assuming end of comments.", "INFO")
+                            break
+
+                        comment_soup = BeautifulSoup(await page.content(), 'html.parser')
+                        comment_elements = comment_soup.select('.comment')
+
+                        if not comment_elements:
+                            log(f"No comments found on page {page_num}. Ending extraction.", "INFO")
+                            break
+
+                        log(f"Found {len(comment_elements)} comments on page {page_num}.", "INFO")
+
+                        new_comments_found_on_page = 0
+                        for comment_element in comment_elements:
+                            author_span = comment_element.select_one('.comment-author span[title]')
+                            author_id = author_span['title'] if author_span else 'Unknown'
+
+                            body_element = comment_element.select_one('.comment-body')
+                            date_element = comment_element.select_one('.comment-meta .meta-data')
+
+                            comment_data = {
+                                'author': author_id,
+                                'body': body_element.get_text(strip=True) if body_element else '',
+                                'date': date_element.get_text(strip=True) if date_element else 'Unknown Date'
+                            }
+
+                            if comment_data not in comments:
+                                comments.append(comment_data)
+                                new_comments_found_on_page += 1
+
+                        if new_comments_found_on_page == 0 and page_num > 1:
+                            log(f"No new comments detected on page {page_num}. Ending extraction.", "INFO")
+                            break
+
+                        page_num += 1
+
+                log(f"Extracted {len(comments)} comments in total.", "SUCCESS")
+                return {
+                    "success": True, "post": post_data, "comments": comments, "total_comments": len(comments)
+                }
+
+            except Exception as e:
+                log(f"Failed to read forum post: {str(e)}", "ERROR")
+                raise
+            finally:
+                if browser:
+                    await browser.close()

 # Initialize forum client
 forum_client = ForumClient()

-#
-# These tools are already properly integrated in the main platform_functions.py
-
+# The main block is for testing and won't be run by the MCP server.
 if __name__ == "__main__":
-    print("📚 WorldQuant BRAIN Forum Functions
-    print("Note: Forum tools are now integrated in the main platform_functions.py", file=sys.stderr)
-    print("This file provides the ForumClient class for internal use.", file=sys.stderr)
+    print("📚 WorldQuant BRAIN Forum Functions - This script provides the ForumClient class.", file=sys.stderr)
|