x-scraper-tool 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ # Secrets
2
+ config.toml
3
+
4
+ # twscrape account database
5
+ *.db
6
+
7
+ # Python
8
+ __pycache__/
9
+ *.py[cod]
10
+ *.egg-info/
11
+ dist/
12
+ build/
13
+ .venv/
14
+
15
+ # Output
16
+ output/
17
+
18
+ # IDE
19
+ .idea/
20
+ .vscode/
21
+ *.swp
22
+
23
+ # OS
24
+ .DS_Store
@@ -0,0 +1,130 @@
1
+ Metadata-Version: 2.4
2
+ Name: x-scraper-tool
3
+ Version: 0.1.0
4
+ Summary: A tool for scraping user data from X (Twitter)
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: click>=8.0
7
+ Requires-Dist: twscrape>=0.17
8
+ Description-Content-Type: text/markdown
9
+
10
+ # X (Twitter) Scraper
11
+
12
+ 基于 [twscrape](https://github.com/vladkens/twscrape) 的 X (Twitter) 数据爬取工具,支持用户资料、推文、粉丝列表和互动数据的抓取,输出为 JSON 或 CSV。
13
+
14
+ ## 功能
15
+
16
+ - 用户资料(profile)爬取
17
+ - 用户推文时间线(最多 3200 条,X API 硬限制)
18
+ - 粉丝列表(followers)和关注列表(following)
19
+ - 推文回复(replies)和转推用户(retweeters)
20
+ - 多账号自动轮换,cookie 导入绕过 CAPTCHA
21
+ - JSON / CSV 双格式输出
22
+
23
+ ## 前置条件
24
+
25
+ - Python >= 3.11
26
+ - [uv](https://github.com/astral-sh/uv) 包管理器
27
+ - 至少一个 X 账号的 cookie(非主账号)
28
+
29
+ ## 安装
30
+
31
+ ```bash
32
+ git clone https://github.com/ChaNg1o1/x-scraper.git
33
+ cd x-scraper
34
+ uv sync
35
+ ```
36
+
37
+ ## 配置
38
+
39
+ 复制示例配置并填入你的账号 cookie:
40
+
41
+ ```bash
42
+ cp config.example.toml config.toml
43
+ ```
44
+
45
+ 编辑 `config.toml`:
46
+
47
+ ```toml
48
+ [scraper]
49
+ output_dir = "./output"
50
+ request_delay = 1.5
51
+ max_tweets = 3200
52
+
53
+ [[accounts]]
54
+ username = "your_account"
55
+ cookies = "ct0=YOUR_CT0; auth_token=YOUR_AUTH_TOKEN"
56
+ # proxy = "http://user:pass@host:port" # 可选
57
+ ```
58
+
59
+ ### 获取 Cookie
60
+
61
+ 1. 在浏览器中登录 X
62
+ 2. 打开 DevTools(F12)-> Application -> Cookies -> `https://x.com`
63
+ 3. 复制 `ct0` 和 `auth_token` 的值
64
+
65
+ ## 使用方法
66
+
67
+ ```bash
68
+ # 爬取用户全部数据(资料 + 推文 + 粉丝 + 关注)
69
+ x-scraper scrape --user elonmusk --all
70
+
71
+ # 只爬取推文和粉丝
72
+ x-scraper scrape --user elonmusk --tweets --followers
73
+
74
+ # 爬取推文互动数据
75
+ x-scraper scrape --tweet 1234567890 --replies --retweeters
76
+
77
+ # 指定输出格式为 CSV
78
+ x-scraper scrape --user elonmusk --all --format csv
79
+
80
+ # 限制爬取数量
81
+ x-scraper scrape --user elonmusk --tweets --limit 100
82
+
83
+ # 使用指定配置文件
84
+ x-scraper scrape --config ./my_config.toml --user elonmusk --all
85
+
86
+ # 查看已配置账号
87
+ x-scraper accounts
88
+ ```
89
+
90
+ ## 输出结构
91
+
92
+ ```
93
+ output/
94
+ {username}/
95
+ profile.json
96
+ tweets.json
97
+ followers.json
98
+ following.json
99
+ tweet_{id}/
100
+ replies.json
101
+ retweeters.json
102
+ ```
103
+
104
+ ## 项目结构
105
+
106
+ ```
107
+ x-scraper/
108
+ pyproject.toml
109
+ config.example.toml
110
+ src/x_scraper/
111
+ cli.py # CLI 入口(click)
112
+ config.py # TOML 配置加载
113
+ auth.py # 账号管理与 cookie 导入
114
+ scraper.py # 核心爬取逻辑
115
+ models.py # 数据模型(dataclass)
116
+ export.py # JSON/CSV 导出
117
+ tests/
118
+ test_models.py # 数据模型单元测试
119
+ ```
120
+
121
+ ## 注意事项
122
+
123
+ - 请勿使用主账号,账号存在被封禁的风险
124
+ - X 的 ToS 禁止未授权的自动化访问,请自行承担使用风险
125
+ - X 的反爬策略大约每 2-4 周变更,工具可能需要随之调整
126
+ - 本工具已内置对 twscrape `xclid` 脚本解析错误的 monkey-patch 修复([vladkens/twscrape#284](https://github.com/vladkens/twscrape/issues/284))
127
+
128
+ ## 许可证
129
+
130
+ MIT
@@ -0,0 +1,121 @@
1
+ # X (Twitter) Scraper
2
+
3
+ 基于 [twscrape](https://github.com/vladkens/twscrape) 的 X (Twitter) 数据爬取工具,支持用户资料、推文、粉丝列表和互动数据的抓取,输出为 JSON 或 CSV。
4
+
5
+ ## 功能
6
+
7
+ - 用户资料(profile)爬取
8
+ - 用户推文时间线(最多 3200 条,X API 硬限制)
9
+ - 粉丝列表(followers)和关注列表(following)
10
+ - 推文回复(replies)和转推用户(retweeters)
11
+ - 多账号自动轮换,cookie 导入绕过 CAPTCHA
12
+ - JSON / CSV 双格式输出
13
+
14
+ ## 前置条件
15
+
16
+ - Python >= 3.11
17
+ - [uv](https://github.com/astral-sh/uv) 包管理器
18
+ - 至少一个 X 账号的 cookie(非主账号)
19
+
20
+ ## 安装
21
+
22
+ ```bash
23
+ git clone https://github.com/ChaNg1o1/x-scraper.git
24
+ cd x-scraper
25
+ uv sync
26
+ ```
27
+
28
+ ## 配置
29
+
30
+ 复制示例配置并填入你的账号 cookie:
31
+
32
+ ```bash
33
+ cp config.example.toml config.toml
34
+ ```
35
+
36
+ 编辑 `config.toml`:
37
+
38
+ ```toml
39
+ [scraper]
40
+ output_dir = "./output"
41
+ request_delay = 1.5
42
+ max_tweets = 3200
43
+
44
+ [[accounts]]
45
+ username = "your_account"
46
+ cookies = "ct0=YOUR_CT0; auth_token=YOUR_AUTH_TOKEN"
47
+ # proxy = "http://user:pass@host:port" # 可选
48
+ ```
49
+
50
+ ### 获取 Cookie
51
+
52
+ 1. 在浏览器中登录 X
53
+ 2. 打开 DevTools(F12)-> Application -> Cookies -> `https://x.com`
54
+ 3. 复制 `ct0` 和 `auth_token` 的值
55
+
56
+ ## 使用方法
57
+
58
+ ```bash
59
+ # 爬取用户全部数据(资料 + 推文 + 粉丝 + 关注)
60
+ x-scraper scrape --user elonmusk --all
61
+
62
+ # 只爬取推文和粉丝
63
+ x-scraper scrape --user elonmusk --tweets --followers
64
+
65
+ # 爬取推文互动数据
66
+ x-scraper scrape --tweet 1234567890 --replies --retweeters
67
+
68
+ # 指定输出格式为 CSV
69
+ x-scraper scrape --user elonmusk --all --format csv
70
+
71
+ # 限制爬取数量
72
+ x-scraper scrape --user elonmusk --tweets --limit 100
73
+
74
+ # 使用指定配置文件
75
+ x-scraper scrape --config ./my_config.toml --user elonmusk --all
76
+
77
+ # 查看已配置账号
78
+ x-scraper accounts
79
+ ```
80
+
81
+ ## 输出结构
82
+
83
+ ```
84
+ output/
85
+ {username}/
86
+ profile.json
87
+ tweets.json
88
+ followers.json
89
+ following.json
90
+ tweet_{id}/
91
+ replies.json
92
+ retweeters.json
93
+ ```
94
+
95
+ ## 项目结构
96
+
97
+ ```
98
+ x-scraper/
99
+ pyproject.toml
100
+ config.example.toml
101
+ src/x_scraper/
102
+ cli.py # CLI 入口(click)
103
+ config.py # TOML 配置加载
104
+ auth.py # 账号管理与 cookie 导入
105
+ scraper.py # 核心爬取逻辑
106
+ models.py # 数据模型(dataclass)
107
+ export.py # JSON/CSV 导出
108
+ tests/
109
+ test_models.py # 数据模型单元测试
110
+ ```
111
+
112
+ ## 注意事项
113
+
114
+ - 请勿使用主账号,账号存在被封禁的风险
115
+ - X 的 ToS 禁止未授权的自动化访问,请自行承担使用风险
116
+ - X 的反爬策略大约每 2-4 周变更,工具可能需要随之调整
117
+ - 本工具已内置对 twscrape `xclid` 脚本解析错误的 monkey-patch 修复([vladkens/twscrape#284](https://github.com/vladkens/twscrape/issues/284))
118
+
119
+ ## 许可证
120
+
121
+ MIT
@@ -0,0 +1,15 @@
1
+ [scraper]
2
+ output_dir = "./output"
3
+ request_delay = 1.5 # seconds between requests
4
+ max_tweets = 3200 # per user, X API hard limit
5
+
6
+ # Add one or more accounts. Use throwaway accounts only.
7
+ # To get cookies: log in to X in your browser, open DevTools > Application > Cookies,
8
+ # and copy the values of `ct0` and `auth_token`.
9
+
10
+ [[accounts]]
11
+ username = "your_account"
12
+ cookies = "ct0=YOUR_CT0_VALUE; auth_token=YOUR_AUTH_TOKEN_VALUE"
13
+
14
+ # Optional: per-account proxy
15
+ # proxy = "http://user:pass@host:port"
@@ -0,0 +1,28 @@
1
+ [project]
2
+ name = "x-scraper-tool"
3
+ version = "0.1.0"
4
+ description = "A tool for scraping user data from X (Twitter)"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "twscrape>=0.17",
9
+ "click>=8.0",
10
+ ]
11
+
12
+ [project.scripts]
13
+ x-scraper = "x_scraper.cli:main"
14
+
15
+ [build-system]
16
+ requires = ["hatchling"]
17
+ build-backend = "hatchling.build"
18
+
19
+ [tool.hatch.build.targets.wheel]
20
+ packages = ["src/x_scraper"]
21
+
22
+ [tool.pytest.ini_options]
23
+ testpaths = ["tests"]
24
+
25
+ [dependency-groups]
26
+ dev = [
27
+ "pytest>=9.0.2",
28
+ ]
@@ -0,0 +1,81 @@
1
+ ---
2
+ name: x-scraper
3
+ description: Use when scraping X (Twitter) data - user profiles, tweets, followers, following lists, tweet replies, retweeters. Triggers on "scrape twitter", "scrape X", "get tweets", "fetch followers", "twitter data", "X API", "twscrape".
4
+ ---
5
+
6
+ # X (Twitter) Scraper
7
+
8
+ ## Overview
9
+
10
+ CLI tool for scraping X user data via twscrape with cookie-based auth and multi-account rotation.
11
+
12
+ ## Quick Start
13
+
14
+ ```bash
15
+ cp config.example.toml config.toml # fill in ct0 + auth_token
16
+ x-scraper scrape --user TARGET --all
17
+ ```
18
+
19
+ ## Cookie Setup
20
+
21
+ Browser login -> DevTools -> Application -> Cookies -> copy `ct0` and `auth_token`.
22
+
23
+ ```toml
24
+ [[accounts]]
25
+ username = "throwaway_account"
26
+ cookies = "ct0=xxx; auth_token=yyy"
27
+ ```
28
+
29
+ ## Commands
30
+
31
+ ```bash
32
+ # User data
33
+ x-scraper scrape --user USERNAME --all # profile + tweets + followers + following
34
+ x-scraper scrape --user USERNAME --tweets --limit 50 # specific data type
35
+ x-scraper scrape --user USERNAME --all --format csv # CSV output
36
+
37
+ # Tweet interactions
38
+ x-scraper scrape --tweet TWEET_ID --replies --retweeters
39
+
40
+ # Account management
41
+ x-scraper accounts
42
+ ```
43
+
44
+ ## Output
45
+
46
+ Files written to `output/{username}/` or `output/tweet_{id}/`:
47
+ - `profile.json` - user profile
48
+ - `tweets.json` - tweet timeline
49
+ - `followers.json` / `following.json` - social graph
50
+ - `replies.json` / `retweeters.json` - tweet interactions
51
+
52
+ ## Architecture
53
+
54
+ | Module | Purpose |
55
+ |--------|---------|
56
+ | `cli.py` | Click CLI entry point |
57
+ | `config.py` | TOML config loading |
58
+ | `auth.py` | Cookie import + twscrape xclid monkey-patch |
59
+ | `scraper.py` | Async scraping via twscrape API |
60
+ | `models.py` | Dataclass models with `from_twscrape()` converters |
61
+ | `export.py` | JSON/CSV export |
62
+
63
+ ## Known Issues
64
+
65
+ - twscrape `xclid.py` breaks when X changes JS bundle format -- `auth.py` includes a monkey-patch for unquoted JSON keys ([twscrape#284](https://github.com/vladkens/twscrape/issues/284))
66
+ - X anti-scraping changes every 2-4 weeks; twscrape or the patch may need updates
67
+ - Accounts can get locked/suspended; never use primary accounts
68
+
69
+ ## Extending
70
+
71
+ All scraper methods are async on `XScraper` class:
72
+
73
+ ```python
74
+ scraper = XScraper(config)
75
+ await scraper.setup()
76
+
77
+ profile = await scraper.scrape_user_profile("username")
78
+ tweets = await scraper.scrape_user_tweets(user_id, limit=100)
79
+ followers = await scraper.scrape_user_followers(user_id, limit=500)
80
+ replies = await scraper.scrape_tweet_replies(tweet_id)
81
+ ```
@@ -0,0 +1 @@
1
+ """X (Twitter) scraper built on twscrape."""
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import re
6
+
7
+ from twscrape import API, AccountsPool
8
+
9
+ from x_scraper.config import Config
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ # ---------------------------------------------------------------------------
15
+ # Monkey-patch: X changed their JS bundles to emit unquoted JSON keys,
16
+ # which breaks twscrape's script URL parser.
17
+ # See: https://github.com/vladkens/twscrape/issues/284
18
+ # ---------------------------------------------------------------------------
19
+
20
+ def _script_url(k: str, v: str) -> str:
21
+ return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"
22
+
23
+
24
def _patched_get_scripts_list(text: str):
    """Yield absolute script URLs extracted from X's JS bundle manifest.

    Drop-in replacement for ``twscrape.xclid.get_scripts_list`` that also
    tolerates the unquoted object keys X now emits
    (https://github.com/vladkens/twscrape/issues/284).
    """
    # The bundle manifest is the object literal between these two markers.
    scripts = text.split('e=>e+"."+')[1].split('[e]+"a.js"')[0]
    try:
        manifest = json.loads(scripts)
    except json.decoder.JSONDecodeError:
        # Quote bare keys (e.g. node_modules_pnpm_ws_8_18_0_...) so the
        # blob becomes valid JSON, then parse again.
        repaired = re.sub(
            r'([,\{])(\s*)([\w]+_[\w_]+)(\s*):',
            r'\1\2"\3"\4:',
            scripts,
        )
        manifest = json.loads(repaired)
    for name, digest in manifest.items():
        yield _script_url(name, f"{digest}a")
38
+
39
+
40
# Import placed mid-module on purpose (hence the E402 suppression): the
# replacement generator above must exist before it is installed below.
from twscrape import xclid  # noqa: E402

# Swap the parser at module level so every twscrape API call uses the fix.
xclid.get_scripts_list = _patched_get_scripts_list
logger.debug("Applied monkey-patch for twscrape xclid script parser")
44
+
45
+
46
async def setup_pool(config: Config) -> API:
    """Build a twscrape ``API`` whose account pool is populated from *config*.

    Every configured account is registered via cookie-based auth, so no
    real password/email credentials are required.
    """
    pool = AccountsPool()

    for acct in config.accounts:
        # twscrape's signature demands credential fields even for cookie
        # auth, so dummy values are supplied.
        await pool.add_account(
            username=acct.username,
            password="placeholder",
            email="placeholder@example.com",
            email_password="placeholder",
            proxy=acct.proxy,
            cookies=acct.cookies,
        )
        logger.info("Added account: %s", acct.username)

    # Intentionally no proxy= on API(): per-account proxies must take effect.
    return API(pool=pool)
@@ -0,0 +1,190 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ import click
9
+
10
+ from x_scraper.config import load_config
11
+ from x_scraper.export import export_data
12
+ from x_scraper.scraper import XScraper
13
+
14
+ logger = logging.getLogger("x_scraper")
15
+
16
+
17
+ def _setup_logging(verbose: bool) -> None:
18
+ level = logging.DEBUG if verbose else logging.INFO
19
+ logging.basicConfig(
20
+ level=level,
21
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
22
+ datefmt="%Y-%m-%d %H:%M:%S",
23
+ )
24
+
25
+
26
@click.group()
@click.option("-v", "--verbose", is_flag=True, help="Enable debug logging.")
def main(verbose: bool) -> None:
    """X (Twitter) scraper - fetch user data via twscrape."""
    # NOTE: the docstring above doubles as click's --help text; keep it
    # user-facing. Logging is configured once here for all subcommands.
    _setup_logging(verbose)
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # scrape command
35
+ # ---------------------------------------------------------------------------
36
+
37
+
38
@main.command()
@click.option(
    "-c", "--config", "config_path",
    default="config.toml",
    show_default=True,
    help="Path to TOML config file.",
)
@click.option("-u", "--user", "username", help="Target username to scrape.")
@click.option("-t", "--tweet", "tweet_id", type=int, help="Target tweet ID.")
@click.option("--tweets", is_flag=True, help="Scrape user tweets.")
@click.option("--followers", is_flag=True, help="Scrape user followers.")
@click.option("--following", is_flag=True, help="Scrape user following.")
@click.option("--replies", is_flag=True, help="Scrape tweet replies.")
@click.option("--retweeters", is_flag=True, help="Scrape tweet retweeters.")
@click.option("--all", "scrape_all", is_flag=True, help="Scrape all available data.")
@click.option(
    "--format", "fmt",
    type=click.Choice(["json", "csv"]),
    default="json",
    show_default=True,
    help="Output format.",
)
@click.option("--limit", type=int, default=None, help="Max items to fetch per category.")
def scrape(
    config_path: str,
    username: str | None,
    tweet_id: int | None,
    tweets: bool,
    followers: bool,
    following: bool,
    replies: bool,
    retweeters: bool,
    scrape_all: bool,
    fmt: str,
    limit: int | None,
) -> None:
    """Scrape user data or tweet interactions from X."""
    # At least one target is required; --user and --tweet may be combined
    # in a single invocation.
    if not username and not tweet_id:
        click.echo("Error: provide --user or --tweet", err=True)
        sys.exit(1)

    # Config problems are user errors: report them cleanly and exit
    # non-zero instead of dumping a traceback.
    try:
        config = load_config(config_path)
    except (FileNotFoundError, ValueError) as exc:
        click.echo(f"Config error: {exc}", err=True)
        sys.exit(1)

    # --all switches every category on; individual flags still compose.
    asyncio.run(_run_scrape(
        config=config,
        username=username,
        tweet_id=tweet_id,
        tweets=tweets or scrape_all,
        followers=followers or scrape_all,
        following=following or scrape_all,
        replies=replies or scrape_all,
        retweeters=retweeters or scrape_all,
        fmt=fmt,
        limit=limit,
    ))
97
+
98
+
99
async def _run_scrape(
    *,
    config,
    username: str | None,
    tweet_id: int | None,
    tweets: bool,
    followers: bool,
    following: bool,
    replies: bool,
    retweeters: bool,
    fmt: str,
    limit: int | None,
) -> None:
    """Run the requested scraping tasks and export each result set.

    Keyword-only arguments mirror the CLI flags after --all expansion.
    A ``limit`` of None means "use the per-category default": no explicit
    cap for tweets, 1000 for the social graph, 500 for tweet interactions.
    """
    def _cap(default: int | None) -> int | None:
        # Explicit None check (not `limit or default`) so an explicit
        # `--limit 0` is honoured instead of silently becoming the default.
        return limit if limit is not None else default

    scraper = XScraper(config)
    await scraper.setup()

    # --- User-level scraping ---
    if username:
        user_dir = config.output_dir / username
        click.echo(f"Scraping user: {username}")

        # The profile is always fetched: it supplies the numeric user id
        # the timeline/graph endpoints require.
        profile = await scraper.scrape_user_profile(username)
        path = export_data([profile], user_dir, "profile", fmt)
        click.echo(f" profile -> {path}")

        user_id = int(profile.id)

        if tweets:
            # NOTE(review): the config's max_tweets setting is not consulted
            # here -- confirm whether a None limit should default to it.
            data = await scraper.scrape_user_tweets(user_id, limit=limit)
            path = export_data(data, user_dir, "tweets", fmt)
            click.echo(f" tweets ({len(data)}) -> {path}")

        if followers:
            data = await scraper.scrape_user_followers(
                user_id, limit=_cap(1000),
            )
            path = export_data(data, user_dir, "followers", fmt)
            click.echo(f" followers ({len(data)}) -> {path}")

        if following:
            data = await scraper.scrape_user_following(
                user_id, limit=_cap(1000),
            )
            path = export_data(data, user_dir, "following", fmt)
            click.echo(f" following ({len(data)}) -> {path}")

    # --- Tweet-level scraping ---
    if tweet_id:
        tweet_dir = config.output_dir / f"tweet_{tweet_id}"
        click.echo(f"Scraping tweet: {tweet_id}")

        if replies:
            data = await scraper.scrape_tweet_replies(
                tweet_id, limit=_cap(500),
            )
            path = export_data(data, tweet_dir, "replies", fmt)
            click.echo(f" replies ({len(data)}) -> {path}")

        if retweeters:
            data = await scraper.scrape_tweet_retweeters(
                tweet_id, limit=_cap(500),
            )
            path = export_data(data, tweet_dir, "retweeters", fmt)
            click.echo(f" retweeters ({len(data)}) -> {path}")

    click.echo("Done.")
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # accounts command
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
@main.command()
@click.option(
    "-c", "--config", "config_path",
    default="config.toml",
    show_default=True,
    help="Path to TOML config file.",
)
def accounts(config_path: str) -> None:
    """List configured accounts and their status."""
    # Surface config problems as a clean error message + non-zero exit.
    try:
        cfg = load_config(config_path)
    except (FileNotFoundError, ValueError) as exc:
        click.echo(f"Config error: {exc}", err=True)
        sys.exit(1)

    click.echo(f"Configured accounts ({len(cfg.accounts)}):")
    for entry in cfg.accounts:
        suffix = f" [proxy: {entry.proxy}]" if entry.proxy else ""
        click.echo(f" - {entry.username}{suffix}")