x-scraper-tool 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ # Secrets
2
+ config.toml
3
+
4
+ # twscrape account database
5
+ *.db
6
+
7
+ # Python
8
+ __pycache__/
9
+ *.py[cod]
10
+ *.egg-info/
11
+ dist/
12
+ build/
13
+ .venv/
14
+
15
+ # Output
16
+ output/
17
+
18
+ # IDE
19
+ .idea/
20
+ .vscode/
21
+ *.swp
22
+
23
+ # OS
24
+ .DS_Store
@@ -0,0 +1,130 @@
1
+ Metadata-Version: 2.4
2
+ Name: x-scraper-tool
3
+ Version: 0.1.0
4
+ Summary: A tool for scraping user data from X (Twitter)
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: click>=8.0
7
+ Requires-Dist: twscrape>=0.17
8
+ Description-Content-Type: text/markdown
9
+
10
+ # X (Twitter) Scraper
11
+
12
+ 基于 [twscrape](https://github.com/vladkens/twscrape) 的 X (Twitter) 数据爬取工具,支持用户资料、推文、粉丝列表和互动数据的抓取,输出为 JSON 或 CSV。
13
+
14
+ ## 功能
15
+
16
+ - 用户资料(profile)爬取
17
+ - 用户推文时间线(最多 3200 条,X API 硬限制)
18
+ - 粉丝列表(followers)和关注列表(following)
19
+ - 推文回复(replies)和转推用户(retweeters)
20
+ - 多账号自动轮换,cookie 导入绕过 CAPTCHA
21
+ - JSON / CSV 双格式输出
22
+
23
+ ## 前置条件
24
+
25
+ - Python >= 3.11
26
+ - [uv](https://github.com/astral-sh/uv) 包管理器
27
+ - 至少一个 X 账号的 cookie(非主账号)
28
+
29
+ ## 安装
30
+
31
+ ```bash
32
+ git clone https://github.com/ChaNg1o1/x-scraper.git
33
+ cd x-scraper
34
+ uv sync
35
+ ```
36
+
37
+ ## 配置
38
+
39
+ 复制示例配置并填入你的账号 cookie:
40
+
41
+ ```bash
42
+ cp config.example.toml config.toml
43
+ ```
44
+
45
+ 编辑 `config.toml`:
46
+
47
+ ```toml
48
+ [scraper]
49
+ output_dir = "./output"
50
+ request_delay = 1.5
51
+ max_tweets = 3200
52
+
53
+ [[accounts]]
54
+ username = "your_account"
55
+ cookies = "ct0=YOUR_CT0; auth_token=YOUR_AUTH_TOKEN"
56
+ # proxy = "http://user:pass@host:port" # 可选
57
+ ```
58
+
59
+ ### 获取 Cookie
60
+
61
+ 1. 在浏览器中登录 X
62
+ 2. 打开 DevTools(F12)-> Application -> Cookies -> `https://x.com`
63
+ 3. 复制 `ct0` 和 `auth_token` 的值
64
+
65
+ ## 使用方法
66
+
67
+ ```bash
68
+ # 爬取用户全部数据(资料 + 推文 + 粉丝 + 关注)
69
+ x-scraper scrape --user elonmusk --all
70
+
71
+ # 只爬取推文和粉丝
72
+ x-scraper scrape --user elonmusk --tweets --followers
73
+
74
+ # 爬取推文互动数据
75
+ x-scraper scrape --tweet 1234567890 --replies --retweeters
76
+
77
+ # 指定输出格式为 CSV
78
+ x-scraper scrape --user elonmusk --all --format csv
79
+
80
+ # 限制爬取数量
81
+ x-scraper scrape --user elonmusk --tweets --limit 100
82
+
83
+ # 使用指定配置文件
84
+ x-scraper scrape --config ./my_config.toml --user elonmusk --all
85
+
86
+ # 查看已配置账号
87
+ x-scraper accounts
88
+ ```
89
+
90
+ ## 输出结构
91
+
92
+ ```
93
+ output/
94
+ {username}/
95
+ profile.json
96
+ tweets.json
97
+ followers.json
98
+ following.json
99
+ tweet_{id}/
100
+ replies.json
101
+ retweeters.json
102
+ ```
103
+
104
+ ## 项目结构
105
+
106
+ ```
107
+ x-scraper/
108
+ pyproject.toml
109
+ config.example.toml
110
+ src/x_scraper/
111
+ cli.py # CLI 入口(click)
112
+ config.py # TOML 配置加载
113
+ auth.py # 账号管理与 cookie 导入
114
+ scraper.py # 核心爬取逻辑
115
+ models.py # 数据模型(dataclass)
116
+ export.py # JSON/CSV 导出
117
+ tests/
118
+ test_models.py # 数据模型单元测试
119
+ ```
120
+
121
+ ## 注意事项
122
+
123
+ - 请勿使用主账号,账号存在被封禁的风险
124
+ - X 的 ToS 禁止未授权的自动化访问,请自行承担使用风险
125
+ - X 的反爬策略大约每 2-4 周变更,工具可能需要随之调整
126
+ - 本工具已内置对 twscrape `xclid` 脚本解析错误的 monkey-patch 修复([vladkens/twscrape#284](https://github.com/vladkens/twscrape/issues/284))
127
+
128
+ ## 许可证
129
+
130
+ MIT
@@ -0,0 +1,121 @@
1
+ # X (Twitter) Scraper
2
+
3
+ 基于 [twscrape](https://github.com/vladkens/twscrape) 的 X (Twitter) 数据爬取工具,支持用户资料、推文、粉丝列表和互动数据的抓取,输出为 JSON 或 CSV。
4
+
5
+ ## 功能
6
+
7
+ - 用户资料(profile)爬取
8
+ - 用户推文时间线(最多 3200 条,X API 硬限制)
9
+ - 粉丝列表(followers)和关注列表(following)
10
+ - 推文回复(replies)和转推用户(retweeters)
11
+ - 多账号自动轮换,cookie 导入绕过 CAPTCHA
12
+ - JSON / CSV 双格式输出
13
+
14
+ ## 前置条件
15
+
16
+ - Python >= 3.11
17
+ - [uv](https://github.com/astral-sh/uv) 包管理器
18
+ - 至少一个 X 账号的 cookie(非主账号)
19
+
20
+ ## 安装
21
+
22
+ ```bash
23
+ git clone https://github.com/ChaNg1o1/x-scraper.git
24
+ cd x-scraper
25
+ uv sync
26
+ ```
27
+
28
+ ## 配置
29
+
30
+ 复制示例配置并填入你的账号 cookie:
31
+
32
+ ```bash
33
+ cp config.example.toml config.toml
34
+ ```
35
+
36
+ 编辑 `config.toml`:
37
+
38
+ ```toml
39
+ [scraper]
40
+ output_dir = "./output"
41
+ request_delay = 1.5
42
+ max_tweets = 3200
43
+
44
+ [[accounts]]
45
+ username = "your_account"
46
+ cookies = "ct0=YOUR_CT0; auth_token=YOUR_AUTH_TOKEN"
47
+ # proxy = "http://user:pass@host:port" # 可选
48
+ ```
49
+
50
+ ### 获取 Cookie
51
+
52
+ 1. 在浏览器中登录 X
53
+ 2. 打开 DevTools(F12)-> Application -> Cookies -> `https://x.com`
54
+ 3. 复制 `ct0` 和 `auth_token` 的值
55
+
56
+ ## 使用方法
57
+
58
+ ```bash
59
+ # 爬取用户全部数据(资料 + 推文 + 粉丝 + 关注)
60
+ x-scraper scrape --user elonmusk --all
61
+
62
+ # 只爬取推文和粉丝
63
+ x-scraper scrape --user elonmusk --tweets --followers
64
+
65
+ # 爬取推文互动数据
66
+ x-scraper scrape --tweet 1234567890 --replies --retweeters
67
+
68
+ # 指定输出格式为 CSV
69
+ x-scraper scrape --user elonmusk --all --format csv
70
+
71
+ # 限制爬取数量
72
+ x-scraper scrape --user elonmusk --tweets --limit 100
73
+
74
+ # 使用指定配置文件
75
+ x-scraper scrape --config ./my_config.toml --user elonmusk --all
76
+
77
+ # 查看已配置账号
78
+ x-scraper accounts
79
+ ```
80
+
81
+ ## 输出结构
82
+
83
+ ```
84
+ output/
85
+ {username}/
86
+ profile.json
87
+ tweets.json
88
+ followers.json
89
+ following.json
90
+ tweet_{id}/
91
+ replies.json
92
+ retweeters.json
93
+ ```
94
+
95
+ ## 项目结构
96
+
97
+ ```
98
+ x-scraper/
99
+ pyproject.toml
100
+ config.example.toml
101
+ src/x_scraper/
102
+ cli.py # CLI 入口(click)
103
+ config.py # TOML 配置加载
104
+ auth.py # 账号管理与 cookie 导入
105
+ scraper.py # 核心爬取逻辑
106
+ models.py # 数据模型(dataclass)
107
+ export.py # JSON/CSV 导出
108
+ tests/
109
+ test_models.py # 数据模型单元测试
110
+ ```
111
+
112
+ ## 注意事项
113
+
114
+ - 请勿使用主账号,账号存在被封禁的风险
115
+ - X 的 ToS 禁止未授权的自动化访问,请自行承担使用风险
116
+ - X 的反爬策略大约每 2-4 周变更,工具可能需要随之调整
117
+ - 本工具已内置对 twscrape `xclid` 脚本解析错误的 monkey-patch 修复([vladkens/twscrape#284](https://github.com/vladkens/twscrape/issues/284))
118
+
119
+ ## 许可证
120
+
121
+ MIT
@@ -0,0 +1,15 @@
1
+ [scraper]
2
+ output_dir = "./output"
3
+ request_delay = 1.5 # seconds between requests
4
+ max_tweets = 3200 # per user, X API hard limit
5
+
6
+ # Add one or more accounts. Use throwaway accounts only.
7
+ # To get cookies: log in to X in your browser, open DevTools > Application > Cookies,
8
+ # and copy the values of `ct0` and `auth_token`.
9
+
10
+ [[accounts]]
11
+ username = "your_account"
12
+ cookies = "ct0=YOUR_CT0_VALUE; auth_token=YOUR_AUTH_TOKEN_VALUE"
13
+
14
+ # Optional: per-account proxy
15
+ # proxy = "http://user:pass@host:port"
@@ -0,0 +1,28 @@
1
+ [project]
2
+ name = "x-scraper-tool"
3
+ version = "0.1.0"
4
+ description = "A tool for scraping user data from X (Twitter)"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "twscrape>=0.17",
9
+ "click>=8.0",
10
+ ]
11
+
12
+ [project.scripts]
13
+ x-scraper = "x_scraper.cli:main"
14
+
15
+ [build-system]
16
+ requires = ["hatchling"]
17
+ build-backend = "hatchling.build"
18
+
19
+ [tool.hatch.build.targets.wheel]
20
+ packages = ["src/x_scraper"]
21
+
22
+ [tool.pytest.ini_options]
23
+ testpaths = ["tests"]
24
+
25
+ [dependency-groups]
26
+ dev = [
27
+ "pytest>=9.0.2",
28
+ ]
@@ -0,0 +1,81 @@
1
+ ---
2
+ name: x-scraper
3
+ description: Use when scraping X (Twitter) data - user profiles, tweets, followers, following lists, tweet replies, retweeters. Triggers on "scrape twitter", "scrape X", "get tweets", "fetch followers", "twitter data", "X API", "twscrape".
4
+ ---
5
+
6
+ # X (Twitter) Scraper
7
+
8
+ ## Overview
9
+
10
+ CLI tool for scraping X user data via twscrape with cookie-based auth and multi-account rotation.
11
+
12
+ ## Quick Start
13
+
14
+ ```bash
15
+ cp config.example.toml config.toml # fill in ct0 + auth_token
16
+ x-scraper scrape --user TARGET --all
17
+ ```
18
+
19
+ ## Cookie Setup
20
+
21
+ Browser login -> DevTools -> Application -> Cookies -> copy `ct0` and `auth_token`.
22
+
23
+ ```toml
24
+ [[accounts]]
25
+ username = "throwaway_account"
26
+ cookies = "ct0=xxx; auth_token=yyy"
27
+ ```
28
+
29
+ ## Commands
30
+
31
+ ```bash
32
+ # User data
33
+ x-scraper scrape --user USERNAME --all # profile + tweets + followers + following
34
+ x-scraper scrape --user USERNAME --tweets --limit 50 # specific data type
35
+ x-scraper scrape --user USERNAME --all --format csv # CSV output
36
+
37
+ # Tweet interactions
38
+ x-scraper scrape --tweet TWEET_ID --replies --retweeters
39
+
40
+ # Account management
41
+ x-scraper accounts
42
+ ```
43
+
44
+ ## Output
45
+
46
+ Files written to `output/{username}/` or `output/tweet_{id}/`:
47
+ - `profile.json` - user profile
48
+ - `tweets.json` - tweet timeline
49
+ - `followers.json` / `following.json` - social graph
50
+ - `replies.json` / `retweeters.json` - tweet interactions
51
+
52
+ ## Architecture
53
+
54
+ | Module | Purpose |
55
+ |--------|---------|
56
+ | `cli.py` | Click CLI entry point |
57
+ | `config.py` | TOML config loading |
58
+ | `auth.py` | Cookie import + twscrape xclid monkey-patch |
59
+ | `scraper.py` | Async scraping via twscrape API |
60
+ | `models.py` | Dataclass models with `from_twscrape()` converters |
61
+ | `export.py` | JSON/CSV export |
62
+
63
+ ## Known Issues
64
+
65
+ - twscrape `xclid.py` breaks when X changes JS bundle format -- `auth.py` includes a monkey-patch for unquoted JSON keys ([twscrape#284](https://github.com/vladkens/twscrape/issues/284))
66
+ - X anti-scraping changes every 2-4 weeks; twscrape or the patch may need updates
67
+ - Accounts can get locked/suspended; never use primary accounts
68
+
69
+ ## Extending
70
+
71
+ All scraper methods are async on `XScraper` class:
72
+
73
+ ```python
74
+ scraper = XScraper(config)
75
+ await scraper.setup()
76
+
77
+ profile = await scraper.scrape_user_profile("username")
78
+ tweets = await scraper.scrape_user_tweets(user_id, limit=100)
79
+ followers = await scraper.scrape_user_followers(user_id, limit=500)
80
+ replies = await scraper.scrape_tweet_replies(tweet_id)
81
+ ```
@@ -0,0 +1 @@
1
+ """X (Twitter) scraper built on twscrape."""
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import re
6
+
7
+ from twscrape import API, AccountsPool
8
+
9
+ from x_scraper.config import Config
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ # ---------------------------------------------------------------------------
15
+ # Monkey-patch: X changed their JS bundles to emit unquoted JSON keys,
16
+ # which breaks twscrape's script URL parser.
17
+ # See: https://github.com/vladkens/twscrape/issues/284
18
+ # ---------------------------------------------------------------------------
19
+
20
+ def _script_url(k: str, v: str) -> str:
21
+ return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"
22
+
23
+
24
def _patched_get_scripts_list(text: str):
    """Yield absolute script URLs extracted from X's JS bundle manifest.

    Drop-in replacement for ``twscrape.xclid.get_scripts_list`` that also
    tolerates the unquoted object keys X now emits
    (https://github.com/vladkens/twscrape/issues/284).
    """
    # The bundle manifest is the object literal between these two markers.
    scripts = text.split('e=>e+"."+')[1].split('[e]+"a.js"')[0]
    try:
        manifest = json.loads(scripts)
    except json.decoder.JSONDecodeError:
        # Quote bare keys (e.g. node_modules_pnpm_ws_8_18_0_...) so the
        # blob becomes valid JSON, then parse again.
        repaired = re.sub(
            r'([,\{])(\s*)([\w]+_[\w_]+)(\s*):',
            r'\1\2"\3"\4:',
            scripts,
        )
        manifest = json.loads(repaired)
    for name, digest in manifest.items():
        yield _script_url(name, f"{digest}a")
38
+
39
+
40
# Import placed mid-module on purpose (hence the E402 suppression): the
# replacement generator above must exist before it is installed below.
from twscrape import xclid  # noqa: E402

# Swap the parser at module level so every twscrape API call uses the fix.
xclid.get_scripts_list = _patched_get_scripts_list
logger.debug("Applied monkey-patch for twscrape xclid script parser")
44
+
45
+
46
async def setup_pool(config: Config) -> API:
    """Build a twscrape ``API`` whose account pool is populated from *config*.

    Every configured account is registered via cookie-based auth, so no
    real password/email credentials are required.
    """
    pool = AccountsPool()

    for acct in config.accounts:
        # twscrape's signature demands credential fields even for cookie
        # auth, so dummy values are supplied.
        await pool.add_account(
            username=acct.username,
            password="placeholder",
            email="placeholder@example.com",
            email_password="placeholder",
            proxy=acct.proxy,
            cookies=acct.cookies,
        )
        logger.info("Added account: %s", acct.username)

    # Intentionally no proxy= on API(): per-account proxies must take effect.
    return API(pool=pool)
@@ -0,0 +1,190 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ import click
9
+
10
+ from x_scraper.config import load_config
11
+ from x_scraper.export import export_data
12
+ from x_scraper.scraper import XScraper
13
+
14
+ logger = logging.getLogger("x_scraper")
15
+
16
+
17
+ def _setup_logging(verbose: bool) -> None:
18
+ level = logging.DEBUG if verbose else logging.INFO
19
+ logging.basicConfig(
20
+ level=level,
21
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
22
+ datefmt="%Y-%m-%d %H:%M:%S",
23
+ )
24
+
25
+
26
@click.group()
@click.option("-v", "--verbose", is_flag=True, help="Enable debug logging.")
def main(verbose: bool) -> None:
    """X (Twitter) scraper - fetch user data via twscrape."""
    # NOTE: the docstring above doubles as click's --help text; keep it
    # user-facing. Logging is configured once here for all subcommands.
    _setup_logging(verbose)
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # scrape command
35
+ # ---------------------------------------------------------------------------
36
+
37
+
38
@main.command()
@click.option(
    "-c", "--config", "config_path",
    default="config.toml",
    show_default=True,
    help="Path to TOML config file.",
)
@click.option("-u", "--user", "username", help="Target username to scrape.")
@click.option("-t", "--tweet", "tweet_id", type=int, help="Target tweet ID.")
@click.option("--tweets", is_flag=True, help="Scrape user tweets.")
@click.option("--followers", is_flag=True, help="Scrape user followers.")
@click.option("--following", is_flag=True, help="Scrape user following.")
@click.option("--replies", is_flag=True, help="Scrape tweet replies.")
@click.option("--retweeters", is_flag=True, help="Scrape tweet retweeters.")
@click.option("--all", "scrape_all", is_flag=True, help="Scrape all available data.")
@click.option(
    "--format", "fmt",
    type=click.Choice(["json", "csv"]),
    default="json",
    show_default=True,
    help="Output format.",
)
@click.option("--limit", type=int, default=None, help="Max items to fetch per category.")
def scrape(
    config_path: str,
    username: str | None,
    tweet_id: int | None,
    tweets: bool,
    followers: bool,
    following: bool,
    replies: bool,
    retweeters: bool,
    scrape_all: bool,
    fmt: str,
    limit: int | None,
) -> None:
    """Scrape user data or tweet interactions from X."""
    # At least one target is required; --user and --tweet may be combined
    # in a single invocation.
    if not username and not tweet_id:
        click.echo("Error: provide --user or --tweet", err=True)
        sys.exit(1)

    # Config problems are user errors: report them cleanly and exit
    # non-zero instead of dumping a traceback.
    try:
        config = load_config(config_path)
    except (FileNotFoundError, ValueError) as exc:
        click.echo(f"Config error: {exc}", err=True)
        sys.exit(1)

    # --all switches every category on; individual flags still compose.
    asyncio.run(_run_scrape(
        config=config,
        username=username,
        tweet_id=tweet_id,
        tweets=tweets or scrape_all,
        followers=followers or scrape_all,
        following=following or scrape_all,
        replies=replies or scrape_all,
        retweeters=retweeters or scrape_all,
        fmt=fmt,
        limit=limit,
    ))
97
+
98
+
99
async def _run_scrape(
    *,
    config,
    username: str | None,
    tweet_id: int | None,
    tweets: bool,
    followers: bool,
    following: bool,
    replies: bool,
    retweeters: bool,
    fmt: str,
    limit: int | None,
) -> None:
    """Run the requested scraping tasks and export each result set.

    Keyword-only arguments mirror the CLI flags after --all expansion.
    A ``limit`` of None means "use the per-category default": no explicit
    cap for tweets, 1000 for the social graph, 500 for tweet interactions.
    """
    def _cap(default: int | None) -> int | None:
        # Explicit None check (not `limit or default`) so an explicit
        # `--limit 0` is honoured instead of silently becoming the default.
        return limit if limit is not None else default

    scraper = XScraper(config)
    await scraper.setup()

    # --- User-level scraping ---
    if username:
        user_dir = config.output_dir / username
        click.echo(f"Scraping user: {username}")

        # The profile is always fetched: it supplies the numeric user id
        # the timeline/graph endpoints require.
        profile = await scraper.scrape_user_profile(username)
        path = export_data([profile], user_dir, "profile", fmt)
        click.echo(f" profile -> {path}")

        user_id = int(profile.id)

        if tweets:
            # NOTE(review): the config's max_tweets setting is not consulted
            # here -- confirm whether a None limit should default to it.
            data = await scraper.scrape_user_tweets(user_id, limit=limit)
            path = export_data(data, user_dir, "tweets", fmt)
            click.echo(f" tweets ({len(data)}) -> {path}")

        if followers:
            data = await scraper.scrape_user_followers(
                user_id, limit=_cap(1000),
            )
            path = export_data(data, user_dir, "followers", fmt)
            click.echo(f" followers ({len(data)}) -> {path}")

        if following:
            data = await scraper.scrape_user_following(
                user_id, limit=_cap(1000),
            )
            path = export_data(data, user_dir, "following", fmt)
            click.echo(f" following ({len(data)}) -> {path}")

    # --- Tweet-level scraping ---
    if tweet_id:
        tweet_dir = config.output_dir / f"tweet_{tweet_id}"
        click.echo(f"Scraping tweet: {tweet_id}")

        if replies:
            data = await scraper.scrape_tweet_replies(
                tweet_id, limit=_cap(500),
            )
            path = export_data(data, tweet_dir, "replies", fmt)
            click.echo(f" replies ({len(data)}) -> {path}")

        if retweeters:
            data = await scraper.scrape_tweet_retweeters(
                tweet_id, limit=_cap(500),
            )
            path = export_data(data, tweet_dir, "retweeters", fmt)
            click.echo(f" retweeters ({len(data)}) -> {path}")

    click.echo("Done.")
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # accounts command
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
@main.command()
@click.option(
    "-c", "--config", "config_path",
    default="config.toml",
    show_default=True,
    help="Path to TOML config file.",
)
def accounts(config_path: str) -> None:
    """List configured accounts and their status."""
    # Surface config problems as a clean error message + non-zero exit.
    try:
        cfg = load_config(config_path)
    except (FileNotFoundError, ValueError) as exc:
        click.echo(f"Config error: {exc}", err=True)
        sys.exit(1)

    click.echo(f"Configured accounts ({len(cfg.accounts)}):")
    for entry in cfg.accounts:
        suffix = f" [proxy: {entry.proxy}]" if entry.proxy else ""
        click.echo(f" - {entry.username}{suffix}")