tt-help-cli-ycl 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -0
- package/cli.js +9 -0
- package/package.json +44 -0
- package/src/lib/args.js +59 -0
- package/src/lib/constants.js +76 -0
- package/src/lib/explore.js +124 -0
- package/src/lib/fetcher.js +36 -0
- package/src/lib/io.js +13 -0
- package/src/lib/output.js +80 -0
- package/src/lib/parser.js +47 -0
- package/src/lib/scrape.js +39 -0
- package/src/main.mjs +245 -0
package/README.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# tt-help
|
|
2
|
+
|
|
3
|
+
TikTok user & video data scraper — extract ttSeller, verified, locationCreated from HTML source.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm i -g tt-help-cli-ycl
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
tt-help [options] <urls...>
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
示例、代理配置和更多用法请运行 `tt-help --help`,或查看仓库中 `src/lib/constants.js` 里的帮助文本。
|
package/cli.js
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
#!/usr/bin/env node
// CLI entry point: resolves src/main.mjs next to this file and loads it.
import { fileURLToPath, pathToFileURL } from 'url';
import { dirname, resolve } from 'path';

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

const mainPath = resolve(__dirname, 'src', 'main.mjs');

// Build the specifier with pathToFileURL instead of string concatenation:
// `file://${mainPath}` is not a valid URL for Windows drive-letter paths and
// breaks on characters that need percent-encoding (spaces, '#', '%').
await import(pathToFileURL(mainPath).href);
|
package/package.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "tt-help-cli-ycl",
|
|
3
|
+
"version": "1.0.1",
|
|
4
|
+
"description": "TikTok user & video data scraper - extract ttSeller, verified, locationCreated from HTML source",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"tt-help": "cli.js"
|
|
8
|
+
},
|
|
9
|
+
"main": "src/main.mjs",
|
|
10
|
+
"files": [
|
|
11
|
+
"cli.js",
|
|
12
|
+
"src/"
|
|
13
|
+
],
|
|
14
|
+
"scripts": {
|
|
15
|
+
"start": "node src/main.mjs"
|
|
16
|
+
},
|
|
17
|
+
"keywords": [
|
|
18
|
+
"tiktok",
|
|
19
|
+
"scraper",
|
|
20
|
+
"cli",
|
|
21
|
+
"seller",
|
|
22
|
+
"ttSeller"
|
|
23
|
+
],
|
|
24
|
+
"author": "",
|
|
25
|
+
"license": "ISC",
|
|
26
|
+
"engines": {
|
|
27
|
+
"node": ">=18"
|
|
28
|
+
},
|
|
29
|
+
"publishConfig": {
|
|
30
|
+
"access": "public"
|
|
31
|
+
},
|
|
32
|
+
"repository": {
|
|
33
|
+
"type": "git",
|
|
34
|
+
"url": "https://github.com/yourname/tt-help.git"
|
|
35
|
+
},
|
|
36
|
+
"bugs": {
|
|
37
|
+
"url": "https://github.com/yourname/tt-help/issues"
|
|
38
|
+
},
|
|
39
|
+
"homepage": "https://github.com/yourname/tt-help#readme",
|
|
40
|
+
"dependencies": {
|
|
41
|
+
"playwright": "^1.59.1",
|
|
42
|
+
"undici": "^8.1.0"
|
|
43
|
+
}
|
|
44
|
+
}
|
package/src/lib/args.js
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { readFileSync } from 'fs';
|
|
2
|
+
import { proxy } from './constants.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Parse the command-line arguments in `process.argv`.
 *
 * Recognised tokens:
 *   --explore [count]   enable Explore mode; `count` is consumed only when the
 *                       next token is a plain non-negative integer, otherwise 100
 *   --proxy <url>       one-off proxy override
 *   config [action [v]] configuration sub-command (`show` when no action given)
 *   --pipe              feed Explore results into the scraper
 *   -i/--input <file>   read extra URLs (one per line) from a file
 *   -o/--output <file>  output file
 *   -f/--format <fmt>   output format
 *   -c/--config         show the current configuration
 *   -h/--help           show usage
 *   anything starting with "http" is collected as a URL; other tokens are ignored.
 *
 * @returns {{urls: string[], outputFile: ?string, outputFormat: string,
 *            exploreCount: number, showConfig: boolean, showHelp: boolean,
 *            customProxy: ?string, configAction: ?string, configValue: ?string,
 *            pipeMode: boolean}}
 * @throws whatever `readFileSync` throws when the `-i` file cannot be read.
 */
export function parseArgs() {
  const args = process.argv.slice(2);
  const urls = [];
  let inputFile = null;
  let outputFile = null;
  let outputFormat = 'json';
  let exploreCount = 0;
  let showConfig = false;
  let showHelp = false;
  let customProxy = null;
  let configAction = null;
  let configValue = null;
  let pipeMode = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    if (arg === '--explore') {
      // Only swallow the next token when it really is a count. Previously any
      // non-URL token was consumed, so `--explore -f raw` produced NaN and
      // silently dropped the `-f` flag.
      const next = args[i + 1];
      if (next !== undefined && /^\d+$/.test(next)) {
        exploreCount = Number.parseInt(next, 10);
        i++;
      } else {
        exploreCount = 100;
      }
    } else if (arg === '--proxy') {
      customProxy = args[++i];
    } else if (arg === 'config') {
      // A bare `config` means "show"; leaving the action undefined would make
      // the caller fall through to the generic usage text.
      configAction = args[i + 1] ?? 'show';
      if (configAction === 'set' || configAction === 'set-proxy') {
        configValue = args[i + 2];
        i += 2;
      } else {
        i++;
      }
    } else if (arg === '--pipe') {
      pipeMode = true;
    } else if (arg === '-i' || arg === '--input') {
      inputFile = args[++i];
    } else if (arg === '-o' || arg === '--output') {
      outputFile = args[++i];
    } else if (arg === '-f' || arg === '--format') {
      outputFormat = args[++i];
    } else if (arg === '-c' || arg === '--config') {
      showConfig = true;
    } else if (arg === '-h' || arg === '--help') {
      showHelp = true;
    } else if (arg.startsWith('http')) {
      urls.push(arg);
    }
  }

  if (inputFile) {
    // Keep only lines that look like URLs; blank lines and notes are skipped.
    const content = readFileSync(inputFile, 'utf-8');
    const lines = content.split(/\r?\n/).map(l => l.trim()).filter(l => l.startsWith('http'));
    urls.push(...lines);
  }

  return { urls, outputFile, outputFormat, exploreCount, showConfig, showHelp, customProxy, configAction, configValue, pipeMode };
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { join, dirname } from 'path';
|
|
2
|
+
import { readFileSync, writeFileSync, existsSync } from 'fs';
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
|
|
5
|
+
// Location of this module on disk.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Built-in defaults.
const DEFAULT_PROXY = 'http://127.0.0.1:7897';
const DEFAULT_OUTPUT = 'tiktok_data.json';
const USER_SECTION_SIZE = 12000;

// The per-user config file sits in the home directory; with neither HOME nor
// USERPROFILE set the path is relative to the current working directory.
const homeDir = process.env.HOME || process.env.USERPROFILE || '';
const configPath = join(homeDir, '.tt-help.json');

// Effective settings: start from the defaults, then overlay the saved proxy.
let proxy = DEFAULT_PROXY;
let configFile = null;

try {
  const saved = existsSync(configPath)
    ? JSON.parse(readFileSync(configPath, 'utf-8'))
    : null;
  if (saved?.proxy) {
    proxy = saved.proxy;
    configFile = configPath;
  }
} catch {
  // Unreadable or malformed config file: keep the defaults.
}
|
|
28
|
+
|
|
29
|
+
// Usage text, one array entry per output line. The strings are user-facing
// (Chinese) and are joined with newlines by the caller before printing.
const HELP_TEXT = [
  '用法: tt-help [选项]',
  '',
  '参数:',
  ' --explore [count] 从 Explore 页面获取视频列表(默认: 100)',
  ' --pipe 将 Explore 结果自动传给 URL 爬取',
  ' --proxy <地址> 临时指定代理地址',
  ' -i, --input <file> 从文件读取 URL 列表(每行一个)',
  ' -o, --output <file> 指定输出文件(默认: tiktok_data.json)',
  ' -f, --format <fmt> 输出格式: json(默认), table, raw',
  ' -c, --config 显示当前配置',
  ' -h, --help 显示帮助',
  '',
  '配置代理:',
  ' tt-help config set http://127.0.0.1:7890 设置代理',
  ' tt-help config show 查看配置',
  ' tt-help config reset 恢复默认',
  '',
  '示例:',
  ' tt-help --explore 200 # 先预览 Explore 结果',
  ' tt-help --explore 50 --pipe -o result.json # 自动抓取用户数据',
  ' tt-help --explore -f raw # 仅输出 URL 列表',
  ' tt-help --explore -f raw -o urls.txt # 保存 URL 到文件',
  ' tt-help -i urls.txt -o result.json # 再爬取这些 URL',
  ' tt-help config set http://127.0.0.1:7890',
  ' tt-help https://www.tiktok.com/@username',
];

// Configuration summary lines. The template literals are evaluated once, when
// this module loads, so they capture the values of `proxy` and `configFile`
// at that moment. The version string and "json" format are hard-coded here.
const CONFIG_TEXT = [
  'tt-help v1.0.1',
  '',
  '配置:',
  ` 代理: ${proxy}`,
  ` 输出格式: json`,
  ` 默认输出: ${DEFAULT_OUTPUT}`,
  ` 配置文件: ${configFile || '无(使用默认值)'}`,
];

// Public surface of this module.
export {
  proxy,
  configFile,
  configPath,
  DEFAULT_PROXY,
  DEFAULT_OUTPUT,
  USER_SECTION_SIZE,
  HELP_TEXT,
  CONFIG_TEXT,
};
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import { chromium } from 'playwright';
|
|
2
|
+
|
|
3
|
+
// Page that lists trending videos.
const EXPLORE_URL = 'https://www.tiktok.com/explore';

/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
8
|
+
|
|
9
|
+
/**
 * Collect the distinct video links currently present in the page DOM.
 *
 * @param {object} page - Playwright page
 * @returns {Promise<string[]>} de-duplicated absolute hrefs containing /video/<id>
 */
async function collectVideoLinks(page) {
  // The callback is executed inside the browser, so the pattern must be
  // written inline rather than captured from this module's scope.
  const hrefs = await page.$$eval('a', els =>
    els.map(a => a.href).filter(u => /\/video\/\d{16,20}/.test(u))
  );
  return [...new Set(hrefs)];
}

/**
 * Open the Explore page in headless Chromium, scroll to load content and
 * return up to `count` videos.
 *
 * @param {number} [count=100] - maximum number of videos to return
 * @param {string} [proxyUrl] - optional proxy server URL for the browser;
 *   when omitted the browser connects directly (previous behaviour)
 * @returns {Promise<Array<{user: string, id: string, url: string}>>}
 * @throws when the browser cannot be launched or navigation fails/times out
 */
export async function fetchExplore(count = 100, proxyUrl) {
  const launchOptions = {
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
      '--disable-dev-shm-usage',
    ],
  };
  if (proxyUrl) {
    launchOptions.proxy = { server: proxyUrl };
  }

  const browser = await chromium.launch(launchOptions);
  try {
    const context = await browser.newContext({
      viewport: { width: 1280, height: 900 },
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      locale: 'en-US',
    });

    // Inject anti-detection overrides before any page script runs.
    await context.addInitScript(() => {
      // Hide the automation flag.
      Object.defineProperty(navigator, 'webdriver', { get: () => false });

      // Provide a minimal window.chrome when it is missing.
      if (!window.chrome) {
        window.chrome = { runtime: {} };
      }

      // Override permissions.query for notifications. The original method is
      // bound to its receiver; calling it detached throws "Illegal invocation".
      const permissions = window.navigator.permissions;
      const originalQuery = permissions.query.bind(permissions);
      permissions.query = (params) =>
        params.name === 'notifications'
          ? Promise.resolve({ state: Notification.permission })
          : originalQuery(params);

      // Report a fixed language list.
      Object.defineProperty(navigator, 'languages', {
        get: () => ['en-US', 'en'],
      });

      // Report a non-empty plugin list.
      Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5],
      });
    });

    const page = await context.newPage();
    await page.goto(EXPLORE_URL, { waitUntil: 'domcontentloaded', timeout: 30000 });
    console.log(` [1/6] 页面已加载`);

    await sleep(5000);

    let lastCount = 0;
    let noNewCount = 0;

    // Scroll at most 50 times; stop early once nothing new has appeared for
    // five rounds or twice the requested number of links is available.
    for (let i = 0; i < 50; i++) {
      await page.evaluate(() => {
        window.scrollBy({ top: window.innerHeight * 0.8, behavior: 'smooth' });
      });

      // Randomised pause between scrolls.
      await sleep(1500 + Math.random() * 1000);

      const uniqueCount = (await collectVideoLinks(page)).length;

      noNewCount = uniqueCount > lastCount ? 0 : noNewCount + 1;
      lastCount = uniqueCount;

      if ((i + 1) % 10 === 0) {
        console.log(` [2/6] 滚动 ${i + 1}/50,当前 ${uniqueCount} 个视频`);
      }

      if (noNewCount >= 5) {
        console.log(` [3/6] 内容加载完成(${uniqueCount} 个视频)`);
        break;
      }

      if (uniqueCount >= count * 2) {
        console.log(` [3/6] 视频数量已充足(${uniqueCount} 个)`);
        break;
      }
    }

    await sleep(3000);

    const unique = await collectVideoLinks(page);
    console.log(` [4/6] 共检测到 ${unique.length} 个不重复视频`);

    const results = [];
    const seen = new Set();
    for (const url of unique) {
      if (results.length >= count) break;
      // Accept an id followed by end-of-string, '/', '?' or '#'. Anchoring on
      // '$' alone discarded links carrying a query string even though the
      // link filter above had accepted them.
      const videoId = url.match(/\/video\/(\d{16,20})(?:[/?#]|$)/)?.[1];
      if (videoId && !seen.has(videoId)) {
        seen.add(videoId);
        const user = url.match(/\/@([^\/]+)/)?.[1];
        if (user) {
          results.push({ user, id: videoId, url });
        }
      }
    }

    console.log(` [5/6] 去重后 ${results.length} 个`);
    return results;
  } finally {
    // Always release the browser, including on failure.
    await browser.close();
  }
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { fetch, ProxyAgent } from 'undici';
|
|
2
|
+
import { DEFAULT_PROXY } from './constants.js';
|
|
3
|
+
|
|
4
|
+
// Browser-like request headers sent with every page fetch.
const HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
};

/**
 * Download a page through an HTTP proxy and return its body as text.
 *
 * @param {string} url - page to fetch
 * @param {string} [proxyUrl] - proxy to use; falls back to DEFAULT_PROXY
 * @returns {Promise<string>} response body
 * @throws {Error} when the request or the body read fails; the message names
 *   the URL and proxy, and the underlying error is attached as `cause`
 */
export async function fetchHtml(url, proxyUrl) {
  const p = proxyUrl || DEFAULT_PROXY;
  const agent = new ProxyAgent(p);
  try {
    const res = await fetch(url, { headers: HEADERS, dispatcher: agent });
    // Await inside the try so a failure while reading the body is reported
    // the same way as a failed connection.
    return await res.text();
  } catch (err) {
    throw new Error(`请求 ${url} 失败,代理 ${p} 不可用`, { cause: err });
  } finally {
    // One agent is created per call; close it so its sockets to the proxy do
    // not accumulate or keep the process alive after the work is done.
    await agent.close();
  }
}
|
|
19
|
+
|
|
20
|
+
/**
 * Turn a handle such as "@name" into a profile URL.
 * Values that already start with "http" are returned untouched.
 *
 * @param {string} handle - "@name" handle or an absolute URL
 * @returns {string} profile URL
 */
export function makeProfileUrl(handle) {
  const alreadyUrl = handle.startsWith('http');
  return alreadyUrl ? handle : `https://www.tiktok.com/${handle}`;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Tell whether a URL points at a user profile, i.e. ends in "/@handle"
 * optionally followed by a trailing slash, query string or fragment.
 *
 * Handles may contain letters, digits, underscores, hyphens and periods;
 * periods were previously not accepted, so such profiles were skipped.
 *
 * @param {string} url
 * @returns {boolean}
 */
export function isProfileUrl(url) {
  return /\/@[\w.-]+\/?(?:$|[?#])/.test(url);
}
|
|
28
|
+
|
|
29
|
+
/**
 * Tell whether a URL contains a "/video/<digits>" segment.
 *
 * @param {string} url
 * @returns {boolean}
 */
export function isVideoUrl(url) {
  const videoSegment = /\/video\/\d+/;
  return videoSegment.test(url);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Extract the "@handle" path segment from a tiktok.com URL.
 *
 * Periods are accepted inside the handle; without that, a URL such as
 * ".../@user.name/video/1" yielded "@user" and the wrong profile was fetched.
 * Both http/https and the bare or "www." host are accepted.
 *
 * @param {string} url
 * @returns {?string} the handle including the leading "@", or null when the
 *   URL does not match
 */
export function extractProfileHandle(url) {
  const m = url.match(/https?:\/\/(?:www\.)?tiktok\.com\/(@[\w.-]+)/);
  return m ? m[1] : null;
}
|
package/src/lib/io.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { writeFileSync, readFileSync } from 'fs';
|
|
2
|
+
|
|
3
|
+
/**
 * Serialise `data` as pretty-printed JSON and write it to disk.
 *
 * @param {*} data - value to serialise
 * @param {string} [outputFile] - destination; "tiktok_data.json" when falsy
 * @returns {void}
 */
export function writeOutput(data, outputFile) {
  const destination = outputFile || 'tiktok_data.json';
  writeFileSync(destination, JSON.stringify(data, null, 2), 'utf-8');
  console.log(`结果已写入: ${destination}`);
}
|
|
9
|
+
|
|
10
|
+
/**
 * Read a text file and return the lines that look like URLs.
 * Each line is trimmed; only lines starting with "http" are kept.
 *
 * @param {string} filePath - file to read (UTF-8)
 * @returns {string[]} URLs in file order
 */
export function readUrlFile(filePath) {
  const found = [];
  for (const rawLine of readFileSync(filePath, 'utf-8').split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.startsWith('http')) {
      found.push(line);
    }
  }
  return found;
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
 * Remove duplicate records, keeping the first occurrence.
 *
 * The identity of a record is its `id` when truthy, otherwise `secUid`,
 * otherwise `uniqueId`. Records with none of these cannot be compared and are
 * all kept; previously they shared the key `undefined`, so every such record
 * after the first was silently dropped.
 *
 * @param {object[]} results
 * @returns {object[]} a new array without duplicates, in original order
 */
export function deduplicate(results) {
  const seen = new Set();
  return results.filter((r) => {
    const key = r.id || r.secUid || r.uniqueId;
    if (!key) return true;
    if (seen.has(key)) return false;
    seen.add(key);
    return true;
  });
}
|
|
16
|
+
|
|
17
|
+
/**
 * Render records as plain text.
 *
 * - no records: empty string
 * - one record: one "key: value" line per property, string values longer than
 *   80 characters cut to 80 and suffixed with "..."
 * - several records: a fixed-column table (header, divider, one row each);
 *   missing values are shown as "-" and columns widen to fit their content
 *
 * @param {object[]} data
 * @returns {string}
 */
export function formatTable(data) {
  if (data.length === 0) return '';

  if (data.length === 1) {
    return Object.entries(data[0])
      .map(([key, val]) => {
        const isLongText = typeof val === 'string' && val.length > 80;
        return isLongText
          ? ` ${key}: ${val.substring(0, 80)}...`
          : ` ${key}: ${val}`;
      })
      .join('\n');
  }

  const columns = [
    { key: 'uniqueId', label: '用户名', width: 20 },
    { key: 'locationCreated', label: '地区', width: 6 },
    { key: 'nickname', label: '昵称', width: 20 },
    { key: 'ttSeller', label: 'TT卖家', width: 8 },
    { key: 'verified', label: '已认证', width: 8 },
    { key: 'followerCount', label: '粉丝', width: 10 },
    { key: 'videoCount', label: '视频', width: 8 },
  ];

  // Stringify every cell once, then size each column from minimum width,
  // label length and the longest cell.
  const grid = data.map((record) => columns.map((c) => String(record[c.key] ?? '-')));
  const widths = columns.map((c, idx) =>
    grid.reduce(
      (widest, cells) => Math.max(widest, cells[idx].length),
      Math.max(c.width, c.label.length)
    )
  );

  const headerLine = columns.map((c, idx) => c.label.padEnd(widths[idx])).join(' │ ');
  const dividerLine = widths.map((w) => '-'.repeat(w)).join('-+-');
  const bodyLines = grid.map((cells) =>
    cells.map((text, idx) => text.padEnd(widths[idx])).join(' │ ')
  );

  return [headerLine, dividerLine, ...bodyLines].join('\n');
}
|
|
60
|
+
|
|
61
|
+
/**
 * Convert result records to the text that gets printed or saved.
 *
 * - "table": delegated to formatTable
 * - "raw": one URL per line, taken from `url` when the first record has one,
 *   otherwise built from `uniqueId`; pretty JSON when neither is present
 * - anything else: one URL per line when the first record has a `url`
 *   (Explore results), otherwise pretty JSON
 *
 * @param {*} data - normally an array of records
 * @param {string} format - "table", "raw" or anything else for the default
 * @returns {string}
 */
export function formatOutput(data, format) {
  if (format === 'table') {
    return formatTable(data);
  }

  // True when `data` is a non-empty array whose first record has `field`.
  const firstHas = (field) => Array.isArray(data) && data.length > 0 && field in data[0];
  const urlLines = () => data.map((d) => d.url).join('\n');
  const asJson = () => JSON.stringify(data, null, 2);

  if (format !== 'raw') {
    // Default JSON output, except that Explore results are printed as plain URLs.
    return firstHas('url') ? urlLines() : asJson();
  }

  if (firstHas('url')) {
    return urlLines();
  }
  if (firstHas('uniqueId')) {
    return data.map((d) => `https://www.tiktok.com/@${d.uniqueId}`).join('\n');
  }
  return asJson();
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { USER_SECTION_SIZE } from './constants.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Cut out the part of the page that starts at the first `"uniqueId"` key.
 *
 * @param {string} html - page source
 * @returns {?string} up to USER_SECTION_SIZE characters starting at the key,
 *   or null when the key does not occur
 */
export function extractUserSection(html) {
  const start = html.indexOf('"uniqueId"');
  return start < 0 ? null : html.substring(start, start + USER_SECTION_SIZE);
}
|
|
8
|
+
|
|
9
|
+
/**
 * Decode the body of a JSON string literal (the text between the quotes).
 * Falls back to a minimal newline/backslash unescape when the text is not a
 * valid JSON string body (for example when it holds raw control characters).
 */
function decodeJsonStringBody(raw) {
  try {
    return JSON.parse(`"${raw}"`);
  } catch {
    return raw.replace(/\\n/g, '\n').replace(/\\\\/g, '\\');
  }
}

/**
 * Pull user fields out of a JSON fragment using per-key regular expressions.
 *
 * Extracted when present: uniqueId, uid, secUid (raw strings), nickname and
 * signature (JSON-unescaped), ttSeller and verified (booleans; set to
 * undefined when absent), followerCount, followingCount, heartCount,
 * videoCount, diggCount and createTime (integers), avatarLarger (with
 * "\u002F" turned back into "/").
 *
 * @param {string} section - fragment such as the one returned by extractUserSection
 * @returns {object} the fields that were found
 */
export function parseUserSection(section) {
  const data = {};

  for (const key of ['uniqueId', 'uid', 'secUid']) {
    const m = section.match(new RegExp(`"${key}":"([^"]*)`));
    if (m) data[key] = m[1];
  }

  for (const key of ['nickname', 'signature']) {
    // The value may contain escaped quotes, so match "any non-quote,
    // non-backslash character or any escape pair" up to the closing quote.
    const m = section.match(new RegExp(`"${key}":"((?:[^"\\\\]|\\\\.)*)"`));
    if (m) {
      // Decode all JSON escapes (\", \uXXXX, \/, \n, \\ ...). Handling only
      // \n and \\ left quotes and non-ASCII text escaped in the output.
      data[key] = decodeJsonStringBody(m[1]);
    }
  }

  for (const key of ['ttSeller', 'verified']) {
    const m = section.match(new RegExp(`"${key}":\\s*(true|false)`));
    data[key] = m ? m[1] === 'true' : undefined;
  }

  for (const key of ['followerCount', 'followingCount', 'heartCount', 'videoCount', 'diggCount']) {
    const m = section.match(new RegExp(`"${key}":(\\d+)`));
    if (m) data[key] = parseInt(m[1], 10);
  }

  const mt = section.match(/"createTime":(\d+)/);
  if (mt) data.createTime = parseInt(mt[1], 10);

  const ma = section.match(/"avatarLarger":"([^"]*)/);
  if (ma) data.avatarLarger = ma[1].replace(/\\u002F/g, '/');

  return data;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Find the first `"locationCreated":"..."` value in the page source.
 *
 * @param {string} html - page source
 * @returns {?string} the value (may be empty), or null when the key is absent
 */
export function extractLocationCreated(html) {
  const found = html.match(/"locationCreated":"([^"]*)/);
  if (!found) {
    return null;
  }
  return found[1];
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { extractUserSection, parseUserSection, extractLocationCreated } from './parser.js';
|
|
2
|
+
import { fetchHtml, makeProfileUrl, isProfileUrl, isVideoUrl, extractProfileHandle } from './fetcher.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Download a profile page and parse the user record out of it.
 *
 * @param {string} profileUrl - profile page URL
 * @param {string} proxyUrl - proxy passed through to fetchHtml
 * @returns {Promise<object>} parsed user fields plus `locationCreated`
 * @throws {Error} when the page has no user section, or when the fetch fails
 */
export async function extractUserData(profileUrl, proxyUrl) {
  const html = await fetchHtml(profileUrl, proxyUrl);

  const userSection = extractUserSection(html);
  if (!userSection) {
    throw new Error('无法解析用户信息');
  }

  const user = parseUserSection(userSection);
  return Object.assign(user, { locationCreated: extractLocationCreated(html) });
}
|
|
12
|
+
|
|
13
|
+
/**
 * Download a video page and return its `locationCreated` value.
 *
 * @param {string} videoUrl - video page URL
 * @param {string} proxyUrl - proxy passed through to fetchHtml
 * @returns {Promise<?string>} the value, or null when the page has none
 */
export async function extractVideoLocation(videoUrl, proxyUrl) {
  return extractLocationCreated(await fetchHtml(videoUrl, proxyUrl));
}
|
|
17
|
+
|
|
18
|
+
/**
 * Scrape one URL.
 *
 * - profile URL: fetch and parse the profile
 * - video URL: fetch the author's profile and the video page in parallel and
 *   combine them; the video's `locationCreated` wins, falling back to the one
 *   found on the profile page when the video page has none
 * - anything else: no records
 *
 * @param {string} url
 * @param {string} proxyUrl - proxy used for all requests
 * @returns {Promise<object[]>} zero or one user record
 * @throws {Error} when a page cannot be fetched or parsed, or when the author
 *   handle cannot be extracted from a video URL
 */
export async function processUrl(url, proxyUrl) {
  if (isProfileUrl(url)) {
    const profileUrl = makeProfileUrl(url);
    const profileData = await extractUserData(profileUrl, proxyUrl);
    return [profileData];
  }

  if (isVideoUrl(url)) {
    const profileHandle = extractProfileHandle(url);
    if (!profileHandle) throw new Error(`无法从视频URL提取用户主页: ${url}`);

    const profileUrl = makeProfileUrl(profileHandle);
    const [profileData, locationCreated] = await Promise.all([
      extractUserData(profileUrl, proxyUrl),
      extractVideoLocation(url, proxyUrl),
    ]);

    // Do not let a missing video location erase the value already found on
    // the profile page.
    return [{
      ...profileData,
      locationCreated: locationCreated ?? profileData.locationCreated,
    }];
  }

  return [];
}
|
package/src/main.mjs
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
import { parseArgs } from './lib/args.js';
|
|
2
|
+
import { HELP_TEXT, CONFIG_TEXT, proxy, configFile, configPath, DEFAULT_PROXY } from './lib/constants.js';
|
|
3
|
+
import { fetchExplore } from './lib/explore.js';
|
|
4
|
+
import { processUrl } from './lib/scrape.js';
|
|
5
|
+
import { deduplicate, formatOutput } from './lib/output.js';
|
|
6
|
+
import { writeFileSync, readFileSync, existsSync } from 'fs';
|
|
7
|
+
|
|
8
|
+
/**
 * Print the configuration summary, followed by run-specific details.
 *
 * @param {string[]} urls - pending URLs; their count is shown when non-empty
 * @param {?string} outputFile - shown when set
 * @returns {void}
 */
function showConfig(urls, outputFile) {
  const report = CONFIG_TEXT.slice();

  if (outputFile) {
    report.push(` 输出文件: ${outputFile}`);
  }
  if (urls.length > 0) {
    report.push(` 待处理URL: ${urls.length}`);
  }
  report.push('', '参数:', ' -c, --config 显示当前配置', ' -h, --help 显示帮助');

  console.log(report.join('\n'));
}
|
|
15
|
+
|
|
16
|
+
/**
 * Print the usage text and terminate the process with exit code 0.
 */
function showUsage() {
  const usage = HELP_TEXT.join('\n');
  console.log(usage);
  process.exit(0);
}
|
|
20
|
+
|
|
21
|
+
/**
 * Execute a `config` sub-command.
 *
 * - "show" (or no action): print the current configuration
 * - "set" / "set-proxy": save `value` as the proxy in the config file
 * - "reset": rewrite an existing config file with the default proxy
 * - anything else: print an error and exit with code 1
 *
 * @param {?string} action - sub-command name
 * @param {?string} value - proxy address for "set" / "set-proxy"
 * @returns {void}
 */
function handleConfig(action, value) {
  // `== null` also covers undefined, which is what an absent argv slot yields.
  if (action === 'show' || action == null) {
    showConfig([], null);
    return;
  }
  if (action === 'set' || action === 'set-proxy') {
    if (!value) {
      console.error('用法: tt-help config set <代理地址>');
      process.exit(1);
    }
    const cfg = { proxy: value };
    writeFileSync(configPath, JSON.stringify(cfg, null, 2), 'utf-8');
    console.log(`代理已设置为: ${value}`);
    console.log(`配置文件: ${configPath}`);
    return;
  }
  if (action === 'reset') {
    if (existsSync(configPath)) {
      // The file is overwritten wholesale, so there is no need to read it
      // first (the previous read discarded its result and could only fail).
      writeFileSync(configPath, JSON.stringify({ proxy: DEFAULT_PROXY }, null, 2), 'utf-8');
      console.log(`已恢复默认代理: ${DEFAULT_PROXY}`);
      console.log(`配置文件: ${configPath}`);
    } else {
      console.log('当前使用默认代理,无需重置');
    }
    return;
  }
  console.error(`未知配置命令: ${action}`);
  console.error('用法: tt-help config [show|set|reset]');
  process.exit(1);
}
|
|
52
|
+
|
|
53
|
+
/**
 * Reduce a browser-automation error message to its headline.
 * Removes, in order: ANSI colour codes, everything from the first
 * "- navigating to" onwards, the first "Call log:" marker, and outer whitespace.
 *
 * @param {string} msg - raw error message
 * @returns {string} cleaned message
 */
function cleanError(msg) {
  const noise = [
    /\x1b\[[0-9;]*m/g,
    /\s*- navigating to.*/s,
    /\s*Call log:/s,
  ];

  let cleaned = msg;
  for (const pattern of noise) {
    cleaned = cleaned.replace(pattern, '');
  }
  return cleaned.trim();
}
|
|
60
|
+
|
|
61
|
+
/**
 * Explore mode: collect videos from the Explore page, optionally scrape any
 * extra URLs, then print or save the combined result.
 *
 * With `isPipe` set and at least one Explore video found, the video URLs and
 * the extra `urls` are all handed to runScrape and this function returns.
 *
 * @param {number} exploreCount - number of Explore videos to collect
 * @param {string[]} urls - extra URLs supplied by the user
 * @param {string} proxyUrl - proxy for both the browser and page fetches
 * @param {?string} outputFile - write here instead of stdout when set
 * @param {string} outputFormat - format name passed to formatOutput
 * @param {boolean} isPipe - feed Explore results into the scraper
 * @returns {Promise<void>}
 */
async function runExplore(exploreCount, urls, proxyUrl, outputFile, outputFormat, isPipe) {
  console.log(`\n代理: ${proxyUrl}`);
  console.log(`Explore 数量: ${exploreCount}`);
  if (urls.length > 0) {
    console.log(`额外 URL: ${urls.length}\n`);
  } else {
    console.log('');
  }

  const allResults = [];

  if (exploreCount > 0) {
    try {
      // Hand the proxy to the Explore fetch as well: the messages printed
      // here tell the user that this proxy is in use. A fetchExplore that
      // only takes a count simply ignores the extra argument.
      const exploreResults = await fetchExplore(exploreCount, proxyUrl);
      console.log(` 获取到 ${exploreResults.length} 个视频\n`);
      if (isPipe) {
        const videoUrls = exploreResults.map(r => r.url).filter(Boolean);
        if (videoUrls.length > 0) {
          // Include the user-supplied URLs too; they used to be dropped
          // silently whenever pipe mode took over.
          await runScrape([...videoUrls, ...urls], proxyUrl, outputFile, outputFormat);
          return;
        }
      }
      allResults.push(...exploreResults);
    } catch (err) {
      console.error(` Explore 获取失败: ${cleanError(err.message)}\n`);
      console.error(` 请确保代理 ${proxyUrl} 正常运行\n`);
    }
  }

  if (urls.length > 0) {
    const errors = [];

    for (let i = 0; i < urls.length; i++) {
      // Progress bar redrawn in place: one block per processed URL.
      const bar = '█'.repeat(i + 1).padEnd(urls.length);
      process.stdout.write(`\r [${bar}] ${i + 1}/${urls.length}`);
      try {
        const results = await processUrl(urls[i], proxyUrl);
        allResults.push(...results);
      } catch (err) {
        errors.push({ url: urls[i], message: err.message });
      }
    }
    console.log();

    if (errors.length > 0) {
      // The first failure decides whether this looks like a proxy problem.
      const msg = errors[0].message;
      if (msg.includes('不可用') || msg.includes('连接被拒绝') || msg.includes('连接中断') ||
          msg.includes('超时') || msg.includes('无法解析')) {
        console.error(` ${errors.length} 个请求失败,请检查代理是否可用: ${proxyUrl}\n`);
      } else {
        console.error(` ${errors.length} 个失败:`);
        const show = errors.slice(0, 5);
        for (const e of show) {
          console.error(` ✗ ${e.url}: ${e.message}`);
        }
        if (errors.length > 5) {
          console.error(` ... 还有 ${errors.length - 5} 个`);
        }
      }
    }
  }

  const uniqueResults = deduplicate(allResults);

  if (uniqueResults.length === 0) {
    console.log('\n未获取到数据');
    if (outputFile) {
      // Still create the requested file so it always exists afterwards.
      writeFileSync(outputFile, '[]', 'utf-8');
    }
    return;
  }

  const output = formatOutput(uniqueResults, outputFormat);

  if (outputFile) {
    writeFileSync(outputFile, output, 'utf-8');
    console.log(`\n结果已写入: ${outputFile}`);
  } else {
    console.log(output);
  }
  console.log(`\n共 ${uniqueResults.length} 个数据`);
}
|
|
143
|
+
|
|
144
|
+
/**
 * Scrape every URL in turn, report failures, then print or save the
 * de-duplicated result.
 *
 * When nothing at all could be scraped and there were failures, "[]" is
 * written to `outputFile` (if set) and the function returns early.
 *
 * @param {string[]} urls - profile or video URLs
 * @param {string} proxyUrl - proxy used for all requests
 * @param {?string} outputFile - write here instead of stdout when set
 * @param {string} outputFormat - format name passed to formatOutput
 * @returns {Promise<void>}
 */
async function runScrape(urls, proxyUrl, outputFile, outputFormat) {
  const collected = [];
  const failures = [];
  const total = urls.length;

  let done = 0;
  for (const url of urls) {
    done += 1;
    // Progress bar redrawn in place: one block per processed URL.
    const bar = '█'.repeat(done).padEnd(total);
    process.stdout.write(`\r [${bar}] ${done}/${total}`);
    try {
      collected.push(...(await processUrl(url, proxyUrl)));
    } catch (err) {
      failures.push({ url, message: err.message });
    }
  }
  console.log();

  const uniqueResults = deduplicate(collected);

  if (failures.length > 0) {
    // The first failure decides whether this looks like a proxy problem.
    const firstMessage = failures[0].message;
    const proxyHints = ['不可用', '连接被拒绝', '连接中断', '超时', '无法解析'];
    const looksLikeProxyIssue = proxyHints.some((hint) => firstMessage.includes(hint));
    const preview = failures.slice(0, 5);
    const hiddenCount = failures.length - 5;

    if (uniqueResults.length === 0) {
      if (looksLikeProxyIssue) {
        console.error(` 所有请求失败,请检查代理是否可用: ${proxyUrl}\n`);
      } else {
        for (const failure of preview) {
          console.error(` ✗ ${failure.url}: ${failure.message}\n`);
        }
        if (hiddenCount > 0) {
          console.error(` ... 还有 ${hiddenCount} 个失败\n`);
        }
      }
      console.log('未获取到数据');
      if (outputFile) {
        writeFileSync(outputFile, '[]', 'utf-8');
      }
      return;
    }

    // Partial success: report the failures and carry on with what we have.
    if (looksLikeProxyIssue) {
      console.error(` ${failures.length} 个请求失败,请检查代理是否可用: ${proxyUrl}\n`);
    } else {
      console.error(` ${failures.length} 个失败:`);
      for (const failure of preview) {
        console.error(` ✗ ${failure.url}: ${failure.message}`);
      }
      if (hiddenCount > 0) {
        console.error(` ... 还有 ${hiddenCount} 个`);
      }
    }
  }

  const rendered = formatOutput(uniqueResults, outputFormat);

  if (outputFile) {
    writeFileSync(outputFile, rendered, 'utf-8');
    console.log(`\n结果已写入: ${outputFile}`);
  } else {
    console.log(rendered);
  }
  console.log(`\n共 ${uniqueResults.length} 个用户的数据`);
}
|
|
210
|
+
|
|
211
|
+
/**
 * Program entry: parse the arguments and dispatch to help, config handling,
 * config display, Explore mode or plain scraping, in that order of priority.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const options = parseArgs();
  const { urls, outputFile, outputFormat, exploreCount, pipeMode } = options;
  // A --proxy given on the command line takes precedence over the saved one.
  const proxyUrl = options.customProxy || proxy;

  if (options.showHelp) {
    showUsage();
    return;
  }

  if (options.configAction) {
    handleConfig(options.configAction, options.configValue);
    return;
  }

  if (options.showConfig) {
    showConfig(urls, outputFile);
    return;
  }

  const nothingRequested = urls.length === 0 && exploreCount === 0;
  if (nothingRequested) {
    showUsage();
    return;
  }

  if (exploreCount > 0) {
    await runExplore(exploreCount, urls, proxyUrl, outputFile, outputFormat, pipeMode);
    return;
  }

  await runScrape(urls, proxyUrl, outputFile, outputFormat);
}

// Top-level failure handler: print the message and exit with a non-zero code.
main().catch((err) => {
  console.error(`错误: ${err.message}`);
  process.exit(1);
});
|