tt-help-cli-ycl 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -3
- package/src/lib/args.js +4 -1
- package/src/lib/constants.js +7 -0
- package/src/lib/fetcher.js +18 -4
- package/src/lib/filter.js +66 -0
- package/src/main.mjs +26 -12
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "tt-help-cli-ycl",
|
|
3
|
-
"version": "1.0.3",
|
|
3
|
+
"version": "1.0.4",
|
|
4
4
|
"description": "TikTok user & video data scraper - extract ttSeller, verified, locationCreated from HTML source",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
},
|
|
32
32
|
"repository": {
|
|
33
33
|
"type": "git",
|
|
34
|
-
"url": "https://github.com/jsjhycl/tt-help-cli.git"
|
|
34
|
+
"url": "git+https://github.com/jsjhycl/tt-help-cli.git"
|
|
35
35
|
},
|
|
36
36
|
"bugs": {
|
|
37
37
|
"url": "https://github.com/jsjhycl/tt-help-cli/issues"
|
|
@@ -41,4 +41,4 @@
|
|
|
41
41
|
"playwright": "^1.59.1",
|
|
42
42
|
"undici": "^8.1.0"
|
|
43
43
|
}
|
|
44
|
-
}
|
|
44
|
+
}
|
package/src/lib/args.js
CHANGED
|
@@ -14,6 +14,7 @@ export function parseArgs() {
|
|
|
14
14
|
let configAction = null;
|
|
15
15
|
let configValue = null;
|
|
16
16
|
let pipeMode = false;
|
|
17
|
+
let filterStr = null;
|
|
17
18
|
|
|
18
19
|
for (let i = 0; i < args.length; i++) {
|
|
19
20
|
const arg = args[i];
|
|
@@ -24,6 +25,8 @@ export function parseArgs() {
|
|
|
24
25
|
: 100;
|
|
25
26
|
} else if (arg === '--proxy') {
|
|
26
27
|
customProxy = args[++i];
|
|
28
|
+
} else if (arg === '--filter') {
|
|
29
|
+
filterStr = args[++i];
|
|
27
30
|
} else if (arg === 'config') {
|
|
28
31
|
configAction = args[i + 1];
|
|
29
32
|
if (configAction === 'set' || configAction === 'set-proxy' || configAction === 'set-browser') {
|
|
@@ -55,5 +58,5 @@ export function parseArgs() {
|
|
|
55
58
|
urls.push(...lines);
|
|
56
59
|
}
|
|
57
60
|
|
|
58
|
-
return { urls, outputFile, outputFormat, exploreCount, showConfig, showHelp, customProxy, configAction, configValue, pipeMode };
|
|
61
|
+
return { urls, outputFile, outputFormat, exploreCount, showConfig, showHelp, customProxy, configAction, configValue, pipeMode, filterStr };
|
|
59
62
|
}
|
package/src/lib/constants.js
CHANGED
|
@@ -45,12 +45,18 @@ const HELP_TEXT = [
|
|
|
45
45
|
' --explore [count] 从 Explore 页面获取视频列表(默认: 100)',
|
|
46
46
|
' --pipe 将 Explore 结果自动传给 URL 爬取',
|
|
47
47
|
' --proxy <地址> 临时指定代理地址',
|
|
48
|
+
' --filter <条件> 过滤结果(格式: key=value&key2=value2)',
|
|
48
49
|
' -i, --input <file> 从文件读取 URL 列表(每行一个)',
|
|
49
50
|
' -o, --output <file> 指定输出文件(默认: tiktok_data.json)',
|
|
50
51
|
' -f, --format <fmt> 输出格式: json(默认), table, raw',
|
|
51
52
|
' -c, --config 显示当前配置',
|
|
52
53
|
' -h, --help 显示帮助',
|
|
53
54
|
'',
|
|
55
|
+
'过滤示例:',
|
|
56
|
+
' --filter "ttSeller=true&verified=false" 过滤卖家且未认证',
|
|
57
|
+
' --filter "locationCreated=DE,ES" 过滤指定地区',
|
|
58
|
+
' --filter "ttSeller=true&locationCreated=US" 组合条件',
|
|
59
|
+
'',
|
|
54
60
|
'配置代理:',
|
|
55
61
|
' tt-help config set http://127.0.0.1:7890 设置代理',
|
|
56
62
|
' tt-help config show 查看配置',
|
|
@@ -68,6 +74,7 @@ const HELP_TEXT = [
|
|
|
68
74
|
' tt-help -i urls.txt -o result.json # 再爬取这些 URL',
|
|
69
75
|
' tt-help config set http://127.0.0.1:7890',
|
|
70
76
|
' tt-help https://www.tiktok.com/@username',
|
|
77
|
+
' tt-help https://... --filter "ttSeller=true&locationCreated=DE"',
|
|
71
78
|
];
|
|
72
79
|
|
|
73
80
|
const CONFIG_TEXT = [
|
package/src/lib/fetcher.js
CHANGED
|
@@ -2,16 +2,30 @@ import { fetch, ProxyAgent } from 'undici';
|
|
|
2
2
|
import { DEFAULT_PROXY } from './constants.js';
|
|
3
3
|
|
|
4
4
|
const HEADERS = {
|
|
5
|
-
'User-Agent': 'Mozilla/5.0 (
|
|
6
|
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
5
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
6
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
|
7
|
+
'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
|
|
8
|
+
'Accept-Encoding': 'gzip, deflate, br',
|
|
9
|
+
'Connection': 'keep-alive',
|
|
10
|
+
'Upgrade-Insecure-Requests': '1',
|
|
11
|
+
'Sec-Fetch-Dest': 'document',
|
|
12
|
+
'Sec-Fetch-Mode': 'navigate',
|
|
13
|
+
'Sec-Fetch-Site': 'none',
|
|
14
|
+
'Sec-Fetch-User': '?1',
|
|
15
|
+
'Cache-Control': 'max-age=0',
|
|
7
16
|
};
|
|
8
17
|
|
|
9
18
|
export async function fetchHtml(url, proxyUrl) {
|
|
10
19
|
const p = proxyUrl || DEFAULT_PROXY;
|
|
11
20
|
const agent = new ProxyAgent(p);
|
|
12
21
|
try {
|
|
13
|
-
const res = await fetch(url, {
|
|
14
|
-
|
|
22
|
+
const res = await fetch(url, {
|
|
23
|
+
headers: HEADERS,
|
|
24
|
+
dispatcher: agent,
|
|
25
|
+
redirect: 'follow',
|
|
26
|
+
});
|
|
27
|
+
const html = await res.text();
|
|
28
|
+
return html;
|
|
15
29
|
} catch (err) {
|
|
16
30
|
throw new Error(`请求 ${url} 失败,代理 ${p} 不可用`);
|
|
17
31
|
}
|
package/src/lib/filter.js
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
 * Parse a `--filter` expression into a lookup of field conditions.
 *
 * The expression is a list of `key=value` pairs joined by `&`, for example
 * `"ttSeller=true&locationCreated=DE,ES"`.
 *
 * - The literal values `true` / `false` become booleans.
 * - Any other value becomes an array of trimmed strings, split on `,`.
 * - Pairs without `=` or with a blank key are ignored.
 *
 * @param {string|null|undefined} filterStr - Raw filter expression.
 * @returns {Object<string, boolean|string[]>|null} Parsed conditions, or
 *   `null` when the input is empty or contains no usable pair.
 */
export function parseFilter(filterStr) {
  if (!filterStr) return null;

  const filter = {};

  for (const pair of filterStr.split('&')) {
    // Split on the FIRST '=' only, so a value may itself contain '='
    // (a plain split('=') would silently drop everything after the second one).
    const separatorIndex = pair.indexOf('=');
    if (separatorIndex === -1) continue;

    const trimmedKey = pair.slice(0, separatorIndex).trim();
    // Reject blank keys after trimming; otherwise " =x" would be stored
    // under the empty-string key and could never match a real field.
    if (!trimmedKey) continue;

    const trimmedValue = pair.slice(separatorIndex + 1).trim();

    // Boolean literals
    if (trimmedValue === 'true') {
      filter[trimmedKey] = true;
    } else if (trimmedValue === 'false') {
      filter[trimmedKey] = false;
    } else {
      // Comma-separated alternatives (e.g. locationCreated=DE,ES)
      filter[trimmedKey] = trimmedValue.split(',').map((v) => v.trim());
    }
  }

  return Object.keys(filter).length > 0 ? filter : null;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Keep only the records that satisfy every condition in `filter`.
 *
 * @param {Array<Object>} results - Records to filter.
 * @param {Object<string, boolean|string[]>|null} filter - Conditions keyed by
 *   field name; a falsy filter returns `results` unchanged.
 * @returns {Array<Object>} The same array when there is nothing to do,
 *   otherwise a new array with the matching records.
 */
export function applyFilter(results, filter) {
  if (!filter || results.length === 0) return results;

  const criteria = Object.entries(filter);

  const satisfiesAll = (record) =>
    criteria.every(([field, wanted]) => {
      const actual = record[field];

      // A record lacking the field can never match.
      if (actual === undefined || actual === null) return false;

      // Array condition: the stringified field must be one of the listed
      // alternatives (e.g. locationCreated=DE,ES).
      // Anything else (booleans): strict equality.
      return Array.isArray(wanted)
        ? wanted.includes(String(actual))
        : actual === wanted;
    });

  return results.filter(satisfiesAll);
}
|
|
54
|
+
|
|
55
|
+
/**
 * Render a parsed filter as a human-readable summary.
 *
 * Each condition is written as `key=value`, with array conditions joined by
 * `,`; the conditions are joined with ` & `.
 *
 * @param {Object<string, boolean|string[]>|null} filter - Parsed conditions.
 * @returns {string} The summary, or an empty string for a falsy filter.
 */
export function formatFilterDescription(filter) {
  if (!filter) return '';

  const rendered = [];
  for (const [name, condition] of Object.entries(filter)) {
    const text = Array.isArray(condition) ? condition.join(',') : condition;
    rendered.push(`${name}=${text}`);
  }

  return rendered.join(' & ');
}
|
package/src/main.mjs
CHANGED
|
@@ -3,6 +3,7 @@ import { HELP_TEXT, CONFIG_TEXT, proxy, configFile, configPath, DEFAULT_PROXY, s
|
|
|
3
3
|
import { fetchExplore } from './lib/explore.js';
|
|
4
4
|
import { processUrl } from './lib/scrape.js';
|
|
5
5
|
import { deduplicate, formatOutput } from './lib/output.js';
|
|
6
|
+
import { parseFilter, applyFilter, formatFilterDescription } from './lib/filter.js';
|
|
6
7
|
import { writeFileSync, readFileSync, existsSync } from 'fs';
|
|
7
8
|
|
|
8
9
|
function showConfig(urls, outputFile) {
|
|
@@ -79,7 +80,7 @@ function cleanError(msg) {
|
|
|
79
80
|
.trim();
|
|
80
81
|
}
|
|
81
82
|
|
|
82
|
-
async function runExplore(exploreCount, urls, proxyUrl, outputFile, outputFormat, isPipe) {
|
|
83
|
+
async function runExplore(exploreCount, urls, proxyUrl, outputFile, outputFormat, isPipe, filter) {
|
|
83
84
|
console.log(`\n代理: ${proxyUrl}`);
|
|
84
85
|
console.log(`Explore 数量: ${exploreCount}`);
|
|
85
86
|
if (urls.length > 0) {
|
|
@@ -97,7 +98,7 @@ async function runExplore(exploreCount, urls, proxyUrl, outputFile, outputFormat
|
|
|
97
98
|
if (isPipe) {
|
|
98
99
|
const videoUrls = exploreResults.map(r => r.url).filter(Boolean);
|
|
99
100
|
if (videoUrls.length > 0) {
|
|
100
|
-
await runScrape(videoUrls, proxyUrl, outputFile, outputFormat);
|
|
101
|
+
await runScrape(videoUrls, proxyUrl, outputFile, outputFormat, filter);
|
|
101
102
|
return;
|
|
102
103
|
}
|
|
103
104
|
}
|
|
@@ -142,8 +143,9 @@ async function runExplore(exploreCount, urls, proxyUrl, outputFile, outputFormat
|
|
|
142
143
|
}
|
|
143
144
|
|
|
144
145
|
const uniqueResults = deduplicate(allResults);
|
|
146
|
+
const filteredResults = applyFilter(uniqueResults, filter);
|
|
145
147
|
|
|
146
|
-
if (uniqueResults.length === 0) {
|
|
148
|
+
if (filteredResults.length === 0) {
|
|
147
149
|
console.log('\n未获取到数据');
|
|
148
150
|
if (outputFile) {
|
|
149
151
|
writeFileSync(outputFile, '[]', 'utf-8');
|
|
@@ -151,7 +153,7 @@ async function runExplore(exploreCount, urls, proxyUrl, outputFile, outputFormat
|
|
|
151
153
|
return;
|
|
152
154
|
}
|
|
153
155
|
|
|
154
|
-
const output = formatOutput(uniqueResults, outputFormat);
|
|
156
|
+
const output = formatOutput(filteredResults, outputFormat);
|
|
155
157
|
|
|
156
158
|
if (outputFile) {
|
|
157
159
|
writeFileSync(outputFile, output, 'utf-8');
|
|
@@ -159,10 +161,15 @@ async function runExplore(exploreCount, urls, proxyUrl, outputFile, outputFormat
|
|
|
159
161
|
} else {
|
|
160
162
|
console.log(output);
|
|
161
163
|
}
|
|
162
|
-
|
|
164
|
+
|
|
165
|
+
if (filter) {
|
|
166
|
+
console.log(`\n共 ${uniqueResults.length} 个数据,过滤后 ${filteredResults.length} 个(过滤条件: ${formatFilterDescription(filter)})`);
|
|
167
|
+
} else {
|
|
168
|
+
console.log(`\n共 ${filteredResults.length} 个数据`);
|
|
169
|
+
}
|
|
163
170
|
}
|
|
164
171
|
|
|
165
|
-
async function runScrape(urls, proxyUrl, outputFile, outputFormat) {
|
|
172
|
+
async function runScrape(urls, proxyUrl, outputFile, outputFormat, filter) {
|
|
166
173
|
const allResults = [];
|
|
167
174
|
const errors = [];
|
|
168
175
|
|
|
@@ -179,9 +186,10 @@ async function runScrape(urls, proxyUrl, outputFile, outputFormat) {
|
|
|
179
186
|
console.log();
|
|
180
187
|
|
|
181
188
|
const uniqueResults = deduplicate(allResults);
|
|
189
|
+
const filteredResults = applyFilter(uniqueResults, filter);
|
|
182
190
|
|
|
183
191
|
if (errors.length > 0) {
|
|
184
|
-
if (uniqueResults.length === 0) {
|
|
192
|
+
if (filteredResults.length === 0) {
|
|
185
193
|
const msg = errors[0].message;
|
|
186
194
|
if (msg.includes('不可用') || msg.includes('连接被拒绝') || msg.includes('连接中断') ||
|
|
187
195
|
msg.includes('超时') || msg.includes('无法解析')) {
|
|
@@ -218,7 +226,7 @@ async function runScrape(urls, proxyUrl, outputFile, outputFormat) {
|
|
|
218
226
|
}
|
|
219
227
|
}
|
|
220
228
|
|
|
221
|
-
const output = formatOutput(uniqueResults, outputFormat);
|
|
229
|
+
const output = formatOutput(filteredResults, outputFormat);
|
|
222
230
|
|
|
223
231
|
if (outputFile) {
|
|
224
232
|
writeFileSync(outputFile, output, 'utf-8');
|
|
@@ -226,12 +234,18 @@ async function runScrape(urls, proxyUrl, outputFile, outputFormat) {
|
|
|
226
234
|
} else {
|
|
227
235
|
console.log(output);
|
|
228
236
|
}
|
|
229
|
-
|
|
237
|
+
|
|
238
|
+
if (filter) {
|
|
239
|
+
console.log(`\n共 ${uniqueResults.length} 个数据,过滤后 ${filteredResults.length} 个(过滤条件: ${formatFilterDescription(filter)})`);
|
|
240
|
+
} else {
|
|
241
|
+
console.log(`\n共 ${filteredResults.length} 个用户的数据`);
|
|
242
|
+
}
|
|
230
243
|
}
|
|
231
244
|
|
|
232
245
|
async function main() {
|
|
233
|
-
const { urls, outputFile, outputFormat, exploreCount, showConfig: showCfg, showHelp, customProxy, configAction, configValue, pipeMode } = parseArgs();
|
|
246
|
+
const { urls, outputFile, outputFormat, exploreCount, showConfig: showCfg, showHelp, customProxy, configAction, configValue, pipeMode, filterStr } = parseArgs();
|
|
234
247
|
const proxyUrl = customProxy || proxy;
|
|
248
|
+
const filter = parseFilter(filterStr);
|
|
235
249
|
|
|
236
250
|
if (showHelp) {
|
|
237
251
|
showUsage();
|
|
@@ -254,9 +268,9 @@ async function main() {
|
|
|
254
268
|
}
|
|
255
269
|
|
|
256
270
|
if (exploreCount > 0) {
|
|
257
|
-
await runExplore(exploreCount, urls, proxyUrl, outputFile, outputFormat, pipeMode);
|
|
271
|
+
await runExplore(exploreCount, urls, proxyUrl, outputFile, outputFormat, pipeMode, filter);
|
|
258
272
|
} else {
|
|
259
|
-
await runScrape(urls, proxyUrl, outputFile, outputFormat);
|
|
273
|
+
await runScrape(urls, proxyUrl, outputFile, outputFormat, filter);
|
|
260
274
|
}
|
|
261
275
|
}
|
|
262
276
|
|