@raphaellcs/data-cleaner 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +315 -0
- package/bin/cli.js +3 -0
- package/package.json +30 -0
- package/src/index.js +419 -0
package/README.md
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
# @raphaellcs/data-cleaner
|
|
2
|
+
|
|
3
|
+
> 数据清洗工具 - 快速清洗和转换数据文件
|
|
4
|
+
|
|
5
|
+
## 🚀 功能
|
|
6
|
+
|
|
7
|
+
- **去除空行**:过滤掉空数据
|
|
8
|
+
- **去重**:基于字段或整行去重
|
|
9
|
+
- **去除空格**:trim 字符串字段
|
|
10
|
+
- **大小写转换**:upper/lower/title
|
|
11
|
+
- **列选择**:只保留指定列
|
|
12
|
+
- **数据过滤**:基于条件的过滤
|
|
13
|
+
- **排序**:按列排序
|
|
14
|
+
- **格式转换**:JSON ↔ CSV
|
|
15
|
+
- **统计信息**:查看数据概况
|
|
16
|
+
|
|
17
|
+
## 📦 安装
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
npx @raphaellcs/data-cleaner
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## 📖 快速开始
|
|
24
|
+
|
|
25
|
+
### 1. 查看统计
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
data-cleaner stats data.csv
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
输出:
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
📊 数据统计
|
|
35
|
+
|
|
36
|
+
类型: array
|
|
37
|
+
总数: 1523
|
|
38
|
+
|
|
39
|
+
字段:
|
|
40
|
+
- name
|
|
41
|
+
- email
|
|
42
|
+
- age
|
|
43
|
+
|
|
44
|
+
空值数量: 45
|
|
45
|
+
空字符串数量: 23
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### 2. 去除空行和空格
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
data-cleaner clean data.csv cleaned.csv --remove-empty --trim
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### 3. 去重
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
data-cleaner clean data.csv cleaned.csv --deduplicate
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
基于特定字段去重:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
data-cleaner clean data.csv cleaned.csv --deduplicate --key email
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 4. 列选择
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
data-cleaner clean data.csv cleaned.csv --columns "name,email"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### 5. 数据过滤
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# 年龄大于 18
|
|
76
|
+
data-cleaner clean data.csv cleaned.csv -F "age:gt:18"
|
|
77
|
+
|
|
78
|
+
# 邮件包含 @gmail.com
|
|
79
|
+
data-cleaner clean data.csv cleaned.csv -F "email:contains:@gmail.com"
|
|
80
|
+
|
|
81
|
+
# 等于特定值
|
|
82
|
+
data-cleaner clean data.csv cleaned.csv -F "status:eq:active"
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### 6. 排序
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# 按年龄升序
|
|
89
|
+
data-cleaner clean data.csv cleaned.csv -S age
|
|
90
|
+
|
|
91
|
+
# 按年龄降序
|
|
92
|
+
data-cleaner clean data.csv cleaned.csv -S age --order desc
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### 7. 大小写转换
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
# 全部大写
|
|
99
|
+
data-cleaner clean data.csv cleaned.csv --case upper
|
|
100
|
+
|
|
101
|
+
# 全部小写
|
|
102
|
+
data-cleaner clean data.csv cleaned.csv --case lower
|
|
103
|
+
|
|
104
|
+
# 首字母大写
|
|
105
|
+
data-cleaner clean data.csv cleaned.csv --case title
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### 8. 格式转换
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
# CSV 转 JSON
|
|
112
|
+
data-cleaner clean data.csv output.json -f json
|
|
113
|
+
|
|
114
|
+
# JSON 转 CSV
|
|
115
|
+
data-cleaner clean data.json output.csv -f csv
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## 📋 过滤操作
|
|
119
|
+
|
|
120
|
+
| 操作符 | 说明 | 示例 |
|
|
121
|
+
|--------|------|------|
|
|
122
|
+
| `eq` | 等于 | `status:eq:active` |
|
|
123
|
+
| `neq` | 不等于 | `status:neq:deleted` |
|
|
124
|
+
| `gt` | 大于 | `age:gt:18` |
|
|
125
|
+
| `lt` | 小于 | `age:lt:65` |
|
|
126
|
+
| `gte` | 大于等于 | `age:gte:18` |
|
|
127
|
+
| `lte` | 小于等于 | `age:lte:65` |
|
|
128
|
+
| `contains` | 包含 | `email:contains:@gmail.com` |
|
|
129
|
+
| `startsWith` | 以...开头 | `name:startsWith:A` |
|
|
130
|
+
| `endsWith` | 以...结尾 | `email:endsWith:.com` |
|
|
131
|
+
|
|
132
|
+
## 🎯 使用场景
|
|
133
|
+
|
|
134
|
+
### 1. 清洗用户数据
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
data-cleaner clean users.csv users_cleaned.csv \
|
|
138
|
+
--remove-empty \
|
|
139
|
+
--deduplicate --key email \
|
|
140
|
+
--trim \
|
|
141
|
+
-F "status:eq:active"
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
去除空行、基于邮箱去重、去除空格、只保留活跃用户。
|
|
145
|
+
|
|
146
|
+
### 2. 提取特定列
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
data-cleaner clean products.csv products_simple.csv \
|
|
150
|
+
--columns "id,name,price"
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
只保留产品 ID、名称和价格。
|
|
154
|
+
|
|
155
|
+
### 3. 格式转换
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
data-cleaner clean data.json data.csv -f csv
|
|
159
|
+
data-cleaner clean data.csv data.json -f json
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
在 JSON 和 CSV 之间转换。
|
|
163
|
+
|
|
164
|
+
### 4. 排序和限制
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
data-cleaner clean products.csv top10.csv \
|
|
168
|
+
-S price --order desc \
|
|
169
|
+
--limit 10
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
按价格降序,只保留前 10 个。
|
|
173
|
+
|
|
174
|
+
### 5. 数据标准化
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
data-cleaner clean emails.csv emails_cleaned.csv \
|
|
178
|
+
--trim \
|
|
179
|
+
--case lower
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
去除空格并转换为小写。
|
|
183
|
+
|
|
184
|
+
## 💡 组合使用
|
|
185
|
+
|
|
186
|
+
多个选项可以组合使用:
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
data-cleaner clean data.csv cleaned.csv \
|
|
190
|
+
--remove-empty \
|
|
191
|
+
--deduplicate --key id \
|
|
192
|
+
--trim \
|
|
193
|
+
--case lower \
|
|
194
|
+
-F "status:eq:active" \
|
|
195
|
+
-S created_at --order desc \
|
|
196
|
+
--limit 1000
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
这会:
|
|
200
|
+
1. 去除空行
|
|
201
|
+
2. 基于 ID 去重
|
|
202
|
+
3. 去除空格
|
|
203
|
+
4. 转换为小写
|
|
204
|
+
5. 只保留状态为 active 的记录
|
|
205
|
+
6. 按创建时间降序排序
|
|
206
|
+
7. 只保留前 1000 条
|
|
207
|
+
|
|
208
|
+
## 📊 统计信息
|
|
209
|
+
|
|
210
|
+
使用 `--stats` 查看清洗前后的对比:
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
data-cleaner clean data.csv cleaned.csv --stats
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
输出:
|
|
217
|
+
|
|
218
|
+
```
|
|
219
|
+
🔧 清洗数据
|
|
220
|
+
|
|
221
|
+
输入: data.csv
|
|
222
|
+
输出: cleaned.csv
|
|
223
|
+
|
|
224
|
+
原始数据:
|
|
225
|
+
📊 数据统计
|
|
226
|
+
|
|
227
|
+
类型: array
|
|
228
|
+
总数: 1523
|
|
229
|
+
|
|
230
|
+
字段:
|
|
231
|
+
- id
|
|
232
|
+
- name
|
|
233
|
+
- email
|
|
234
|
+
- age
|
|
235
|
+
- status
|
|
236
|
+
|
|
237
|
+
空值数量: 45
|
|
238
|
+
空字符串数量: 23
|
|
239
|
+
|
|
240
|
+
清洗后数据:
|
|
241
|
+
📊 数据统计
|
|
242
|
+
|
|
243
|
+
类型: array
|
|
244
|
+
总数: 1456
|
|
245
|
+
|
|
246
|
+
字段:
|
|
247
|
+
- id
|
|
248
|
+
- name
|
|
249
|
+
- email
|
|
250
|
+
- age
|
|
251
|
+
- status
|
|
252
|
+
|
|
253
|
+
✅ 已保存到: cleaned.csv
|
|
254
|
+
从 1523 行减少到 1456 行
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## 🔧 高级功能
|
|
258
|
+
|
|
259
|
+
### 1. 转换为大写并去除空值
|
|
260
|
+
|
|
261
|
+
```bash
|
|
262
|
+
data-cleaner clean data.csv cleaned.csv \
|
|
263
|
+
--remove-empty \
|
|
264
|
+
--trim \
|
|
265
|
+
--case upper
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### 2. 多步清洗
|
|
269
|
+
|
|
270
|
+
可以链式调用,逐步清洗:
|
|
271
|
+
|
|
272
|
+
```bash
|
|
273
|
+
# 第一步:去重
|
|
274
|
+
data-cleaner clean data.csv step1.csv --deduplicate --key id
|
|
275
|
+
|
|
276
|
+
# 第二步:过滤
|
|
277
|
+
data-cleaner clean step1.csv step2.csv -F "age:gte:18"
|
|
278
|
+
|
|
279
|
+
# 第三步:排序
|
|
280
|
+
data-cleaner clean step2.csv final.csv -S created_at --order desc
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### 3. 批量处理
|
|
284
|
+
|
|
285
|
+
使用 shell 脚本批量处理:
|
|
286
|
+
|
|
287
|
+
```bash
|
|
288
|
+
#!/bin/bash
|
|
289
|
+
|
|
290
|
+
for file in data/*.csv; do
|
|
291
|
+
output="cleaned/$(basename $file)"
|
|
292
|
+
data-cleaner clean "$file" "$output" --remove-empty --trim
|
|
293
|
+
done
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
## 🚧 待实现
|
|
297
|
+
|
|
298
|
+
- [ ] 支持更多文件格式(Excel、SQL)
|
|
299
|
+
- [ ] 自定义转换函数
|
|
300
|
+
- [ ] 正则表达式替换
|
|
301
|
+
- [ ] 数据验证规则
|
|
302
|
+
- [ ] 合并多个文件
|
|
303
|
+
- [ ] 分组统计
|
|
304
|
+
|
|
305
|
+
## 🤝 贡献
|
|
306
|
+
|
|
307
|
+
欢迎提交 Issue 和 PR!
|
|
308
|
+
|
|
309
|
+
## 📄 许可证
|
|
310
|
+
|
|
311
|
+
MIT © 梦心
|
|
312
|
+
|
|
313
|
+
---
|
|
314
|
+
|
|
315
|
+
Made with 🌙 by 梦心
|
package/bin/cli.js
ADDED
package/package.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@raphaellcs/data-cleaner",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "数据清洗工具 - 快速清洗和转换数据文件",
|
|
5
|
+
"main": "src/index.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"data-cleaner": "./bin/cli.js"
|
|
8
|
+
},
|
|
9
|
+
"scripts": {
|
|
10
|
+
"clean": "node src/index.js clean"
|
|
11
|
+
},
|
|
12
|
+
"keywords": [
|
|
13
|
+
"data",
|
|
14
|
+
"cleaning",
|
|
15
|
+
"csv",
|
|
16
|
+
"json",
|
|
17
|
+
"transformation"
|
|
18
|
+
],
|
|
19
|
+
"author": "梦心",
|
|
20
|
+
"license": "MIT",
|
|
21
|
+
"dependencies": {
|
|
22
|
+
"commander": "^12.1.0",
|
|
23
|
+
"chalk": "^4.1.2",
|
|
24
|
+
"csv-parse": "^5.5.0",
|
|
25
|
+
"csv-stringify": "^6.4.0"
|
|
26
|
+
},
|
|
27
|
+
"engines": {
|
|
28
|
+
"node": ">=18.0.0"
|
|
29
|
+
}
|
|
30
|
+
}
|
package/src/index.js
ADDED
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
const path = require('path');
|
|
5
|
+
const { program } = require('commander');
|
|
6
|
+
const chalk = require('chalk');
|
|
7
|
+
const { parse } = require('csv-parse');
|
|
8
|
+
const { stringify } = require('csv-stringify');
|
|
9
|
+
|
|
10
|
+
// 读取文件
|
|
11
|
+
/**
 * Read a data file and decode it according to its extension.
 *
 * - `.json`: the text is parsed with `JSON.parse` and returned synchronously.
 * - `.csv`:  the text is parsed by csv-parse with `columns: true`; a Promise
 *            resolving to the parsed rows is returned.
 * - other:   the raw file text is returned unchanged.
 *
 * Callers should `await` the result so all three cases are handled uniformly.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {*|Promise<*>} Parsed JSON value, a Promise of CSV rows, or raw text.
 * @throws {Error} If the file cannot be read, or (for `.json`) is not valid JSON.
 */
function readFile(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  const raw = fs.readFileSync(filePath, 'utf-8');

  // Decoding as UTF-8 keeps a leading byte-order mark. Left in place it makes
  // JSON.parse throw and becomes part of the first CSV column name, so it is
  // removed before structured parsing.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;

  if (ext === '.json') {
    return JSON.parse(text);
  }

  if (ext === '.csv') {
    return new Promise((resolve, reject) => {
      parse(text, { columns: true }, (err, rows) => {
        if (err) {
          reject(err);
        } else {
          resolve(rows);
        }
      });
    });
  }

  // Unknown extensions are passed through untouched.
  return raw;
}
|
|
28
|
+
|
|
29
|
+
// 写入文件
|
|
30
|
+
/**
 * Serialize `data` and write it to `filePath`, creating parent directories.
 *
 * - `json`: pretty-printed with two-space indentation, written synchronously.
 * - `csv`:  serialized by csv-stringify with a header row; returns a Promise
 *           that settles once the file has been written.
 * - other:  `data` is written as-is, so it must already be text or bytes.
 *
 * @param {string} filePath - Destination path.
 * @param {*} data - Value to serialize.
 * @param {string} format - `'json'`, `'csv'`, or anything else for raw output.
 * @returns {Promise<void>|undefined} A Promise for `csv`, otherwise undefined.
 * @throws {TypeError} If `format` is not json/csv and `data` is not text/bytes.
 */
function writeFile(filePath, data, format) {
  // `recursive: true` is a no-op when the directory already exists, so no
  // separate existence check is needed.
  fs.mkdirSync(path.dirname(filePath), { recursive: true });

  if (format === 'json') {
    fs.writeFileSync(filePath, JSON.stringify(data, null, 2), 'utf-8');
    return undefined;
  }

  if (format === 'csv') {
    return new Promise((resolve, reject) => {
      stringify(data, { header: true }, (err, output) => {
        if (err) {
          reject(err);
          return;
        }
        // A failing write inside this callback must reject the promise;
        // otherwise the error would be thrown outside the caller's await.
        try {
          fs.writeFileSync(filePath, output, 'utf-8');
          resolve();
        } catch (writeErr) {
          reject(writeErr);
        }
      });
    });
  }

  // Raw pass-through only makes sense for text or binary data; fail with an
  // actionable message instead of a low-level fs argument error.
  if (typeof data !== 'string' && !ArrayBuffer.isView(data)) {
    throw new TypeError(
      `Unsupported output format "${format}" for structured data; use "json" or "csv"`
    );
  }

  fs.writeFileSync(filePath, data, 'utf-8');
  return undefined;
}
|
|
53
|
+
|
|
54
|
+
// 清洗数据
|
|
55
|
+
/**
 * Interpret a value as a finite number when it plausibly is one.
 * Numbers pass through; non-blank strings are converted with `Number()`.
 *
 * @param {*} value
 * @returns {number|null} The finite number, or null when not numeric.
 */
function toFiniteNumber(value) {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : null;
  }
  if (typeof value === 'string' && value.trim() !== '') {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}

/** True for non-null objects (including arrays), i.e. rows that have fields. */
function isRecord(item) {
  return typeof item === 'object' && item !== null;
}

/** True for null, undefined, and empty or whitespace-only strings. */
function isBlank(value) {
  return value == null || (typeof value === 'string' && value.trim() === '');
}

/** Return a copy of `row` with `fn` applied to every string-valued field. */
function mapStringFields(row, fn) {
  return Object.fromEntries(
    Object.entries(row).map(([key, value]) => [key, typeof value === 'string' ? fn(value) : value])
  );
}

/** Convert `text` to upper/lower/title case; unknown modes leave it unchanged. */
function convertCase(text, mode) {
  switch (mode) {
    case 'upper':
      return text.toUpperCase();
    case 'lower':
      return text.toLowerCase();
    case 'title':
      // Unicode-aware word start so accented and non-Latin letters are
      // capitalised at the real beginning of the word.
      return text.replace(/[\p{L}\p{N}_]\S*/gu, (word) => {
        const [first] = word;
        return first.toUpperCase() + word.slice(first.length).toLowerCase();
      });
    default:
      return text;
  }
}

/** Equality used by `eq`/`neq`: numeric when both sides are numeric, else textual. */
function valuesMatch(left, right) {
  const leftNum = toFiniteNumber(left);
  const rightNum = toFiniteNumber(right);
  if (leftNum !== null && rightNum !== null) {
    return leftNum === rightNum;
  }
  if (left == null || right == null) {
    return left == null && right == null;
  }
  return String(left) === String(right);
}

/** Ordering used by `gt`/`lt`/`gte`/`lte`: numeric when both sides are numeric. */
function testOrdering(operator, left, right) {
  const leftNum = toFiniteNumber(left);
  const rightNum = toFiniteNumber(right);
  const [a, b] = leftNum !== null && rightNum !== null ? [leftNum, rightNum] : [left, right];
  switch (operator) {
    case 'gt':
      return a > b;
    case 'lt':
      return a < b;
    case 'gte':
      return a >= b;
    default:
      return a <= b;
  }
}

/** Normalise a filter given as `{column, operator, value}` or `'column:operator:value'`. */
function parseFilterSpec(filter) {
  if (typeof filter === 'string') {
    // Split on the first two colons only, so the value itself may contain ':'.
    const first = filter.indexOf(':');
    const second = first === -1 ? -1 : filter.indexOf(':', first + 1);
    if (second === -1) {
      return null;
    }
    return {
      column: filter.slice(0, first),
      operator: filter.slice(first + 1, second),
      value: filter.slice(second + 1),
    };
  }
  return isRecord(filter) ? filter : null;
}

/** Build the row predicate for a filter spec; unknown operators keep every row. */
function buildRowFilter({ column, operator, value }) {
  const read = (item) => (isRecord(item) ? item[column] : undefined);
  // Missing values are treated as empty text rather than the string "undefined".
  const readText = (item) => {
    const current = read(item);
    return current == null ? '' : String(current);
  };

  switch (operator) {
    case 'eq':
      return (item) => valuesMatch(read(item), value);
    case 'neq':
      return (item) => !valuesMatch(read(item), value);
    case 'gt':
    case 'lt':
    case 'gte':
    case 'lte':
      return (item) => testOrdering(operator, read(item), value);
    case 'contains':
      return (item) => readText(item).includes(String(value));
    case 'startsWith':
      return (item) => readText(item).startsWith(String(value));
    case 'endsWith':
      return (item) => readText(item).endsWith(String(value));
    case 'regex': {
      // Compile once instead of once per row.
      const pattern = new RegExp(value);
      return (item) => pattern.test(readText(item));
    }
    default:
      return () => true;
  }
}

/** Normalise a transform given as `{column, transform}` or `'column:transform[:args]'`. */
function parseTransformSpec(transform) {
  let column;
  let expression;
  if (typeof transform === 'string') {
    const split = transform.indexOf(':');
    if (split === -1) {
      return null;
    }
    column = transform.slice(0, split);
    expression = transform.slice(split + 1);
  } else if (isRecord(transform)) {
    ({ column, transform: expression } = transform);
  }
  if (!column || typeof expression !== 'string') {
    return null;
  }
  const argStart = expression.indexOf(':');
  return {
    column,
    name: argStart === -1 ? expression : expression.slice(0, argStart),
    arg: argStart === -1 ? undefined : expression.slice(argStart + 1),
  };
}

/** Compute the transformed value of one field; unknown transforms return it unchanged. */
function applyTransform(current, name, arg) {
  const text = String(current);
  switch (name) {
    case 'uppercase':
      return text.toUpperCase();
    case 'lowercase':
      return text.toLowerCase();
    case 'capitalize':
      return text.charAt(0).toUpperCase() + text.slice(1).toLowerCase();
    case 'trim':
      return text.trim();
    case 'number':
      return Number(current);
    case 'string':
      return text;
    case 'replace': {
      if (arg === undefined) {
        return current;
      }
      const comma = arg.indexOf(',');
      const from = comma === -1 ? arg : arg.slice(0, comma);
      // Without a replacement part the match is removed.
      const to = comma === -1 ? '' : arg.slice(comma + 1);
      return from === '' ? current : text.split(from).join(to);
    }
    case 'multiply':
      return arg === undefined ? current : Number(current) * Number(arg);
    case 'divide':
      return arg === undefined ? current : Number(current) / Number(arg);
    default:
      return current;
  }
}

/** Normalise a sort given as a column name (with separate order) or `{column, order}`. */
function parseSortSpec(sort, fallbackOrder) {
  if (typeof sort === 'string') {
    return { column: sort, order: fallbackOrder ?? 'asc' };
  }
  if (isRecord(sort) && sort.column != null) {
    return { column: sort.column, order: sort.order ?? fallbackOrder ?? 'asc' };
  }
  return null;
}

/** Compare two non-blank values: numbers numerically and first, the rest as text. */
function compareDefined(a, b) {
  const aNum = toFiniteNumber(a);
  const bNum = toFiniteNumber(b);
  if (aNum !== null && bNum !== null) {
    return Math.sign(aNum - bNum);
  }
  if (aNum !== null) {
    return -1;
  }
  if (bNum !== null) {
    return 1;
  }
  const aText = String(a);
  const bText = String(b);
  if (aText < bText) {
    return -1;
  }
  return aText > bText ? 1 : 0;
}

/**
 * Apply the requested cleaning steps to `data`.
 *
 * Steps run in a fixed order: remove empty rows, deduplicate, trim, case
 * conversion, column selection, filter, transform, sort, limit. Non-array
 * input is returned unchanged. The input array and its rows are not mutated.
 *
 * @param {*} data - Rows to clean (array of objects or strings).
 * @param {object} [options]
 * @param {boolean} [options.removeEmpty] - Drop blank strings and rows whose values are all blank.
 * @param {boolean} [options.deduplicate] - Drop repeated rows.
 * @param {string} [options.key] - Field identifying duplicates (default: whole row).
 * @param {boolean} [options.trim] - Trim string fields.
 * @param {string} [options.case] - `upper`, `lower` or `title`.
 * @param {Iterable<string>} [options.columns] - Fields to keep.
 * @param {{column: string, operator: string, value: *}|string} [options.filter]
 * @param {{column: string, transform: string}|string} [options.transform]
 * @param {{column: string, order?: string}|string} [options.sort]
 * @param {string} [options.order] - Sort direction when `sort` is a column name.
 * @param {number} [options.limit] - Maximum number of rows to keep.
 * @returns {*} The cleaned rows, or `data` itself when it is not an array.
 */
function cleanData(data, options = {}) {
  if (!Array.isArray(data)) {
    return data;
  }

  let rows = data;

  if (options.removeEmpty) {
    rows = rows.filter((item) => {
      if (typeof item === 'string') {
        return item.trim() !== '';
      }
      if (isRecord(item)) {
        return Object.values(item).some((value) => !isBlank(value));
      }
      return true;
    });
  }

  if (options.deduplicate) {
    const seen = new Set();
    rows = rows.filter((item) => {
      // Rows without fields (null, strings) fall back to whole-value identity.
      const identity = options.key && isRecord(item) ? item[options.key] : JSON.stringify(item);
      if (seen.has(identity)) {
        return false;
      }
      seen.add(identity);
      return true;
    });
  }

  if (options.trim) {
    rows = rows.map((item) => (isRecord(item) ? mapStringFields(item, (text) => text.trim()) : item));
  }

  if (options.case) {
    rows = rows.map((item) =>
      isRecord(item) ? mapStringFields(item, (text) => convertCase(text, options.case)) : item
    );
  }

  if (options.columns) {
    rows = rows.map((item) => {
      if (!isRecord(item)) {
        return item;
      }
      const picked = {};
      for (const column of options.columns) {
        if (Object.hasOwn(item, column)) {
          picked[column] = item[column];
        }
      }
      return picked;
    });
  }

  if (options.filter) {
    const filterSpec = parseFilterSpec(options.filter);
    if (filterSpec) {
      rows = rows.filter(buildRowFilter(filterSpec));
    }
  }

  if (options.transform) {
    const transformSpec = parseTransformSpec(options.transform);
    if (transformSpec) {
      const { column, name, arg } = transformSpec;
      rows = rows.map((item) => {
        if (!isRecord(item) || item[column] === undefined) {
          return item;
        }
        // Work on a copy so the caller's rows stay untouched.
        const copy = Array.isArray(item) ? [...item] : { ...item };
        copy[column] = applyTransform(item[column], name, arg);
        return copy;
      });
    }
  }

  if (options.sort) {
    const sortSpec = parseSortSpec(options.sort, options.order);
    if (sortSpec) {
      const direction = sortSpec.order === 'asc' ? 1 : -1;
      const read = (item) => (isRecord(item) ? item[sortSpec.column] : undefined);
      rows = [...rows].sort((a, b) => {
        const left = read(a);
        const right = read(b);
        const leftBlank = isBlank(left);
        const rightBlank = isBlank(right);
        // Rows with no value sort last in both directions.
        if (leftBlank || rightBlank) {
          return Number(leftBlank) - Number(rightBlank);
        }
        return direction * compareDefined(left, right);
      });
    }
  }

  // A limit of 0 is a valid request for no rows, so test for "is a number"
  // rather than truthiness.
  const limit = toFiniteNumber(options.limit);
  if (limit !== null) {
    rows = rows.slice(0, limit);
  }

  return rows;
}
|
|
250
|
+
|
|
251
|
+
// 统计数据
|
|
252
|
+
/**
 * Summarise a data set.
 *
 * For non-array input only the JavaScript type and a count of 1 are reported.
 * For arrays, every object row is inspected: field names are collected in
 * first-seen order across all rows, and null/undefined values and empty
 * strings are counted. Rows that are not objects (null, strings, numbers)
 * count toward `count` but are otherwise skipped.
 *
 * @param {*} data - Parsed file content.
 * @returns {{type: string, count: number, fields?: string[], nullCount?: number, emptyCount?: number}}
 */
function getStats(data) {
  if (!Array.isArray(data)) {
    return { type: typeof data, count: 1 };
  }

  const stats = {
    type: 'array',
    count: data.length,
    fields: [],
    nullCount: 0,
    emptyCount: 0,
  };
  const knownFields = new Set();

  for (const row of data) {
    // Object.entries throws on null/undefined, so skip rows without fields.
    if (typeof row !== 'object' || row === null) {
      continue;
    }
    for (const [field, value] of Object.entries(row)) {
      if (!knownFields.has(field)) {
        knownFields.add(field);
        stats.fields.push(field);
      }
      if (value === null || value === undefined) {
        stats.nullCount += 1;
      } else if (value === '') {
        stats.emptyCount += 1;
      }
    }
  }

  return stats;
}
|
|
284
|
+
|
|
285
|
+
// 打印统计
|
|
286
|
+
/**
 * Print a statistics summary (as produced by getStats) to the console.
 *
 * Type and total are always shown. The field list, the null count and the
 * empty-string count are shown only when present and greater than zero.
 *
 * @param {{type: string, count: number, fields?: string[], nullCount?: number, emptyCount?: number}} stats
 */
function printStats(stats) {
  const { type, count, fields, nullCount, emptyCount } = stats;
  const hasFields = Boolean(fields) && fields.length > 0;

  console.log(chalk.cyan('\n📊 数据统计\n'));
  console.log(chalk.gray(`类型: ${type}`));
  console.log(chalk.gray(`总数: ${count}`));

  if (hasFields) {
    console.log(chalk.cyan('\n字段:'));
    fields.forEach((name) => {
      console.log(chalk.gray(` - ${name}`));
    });
  }

  if (nullCount > 0) {
    console.log(chalk.yellow(`\n空值数量: ${nullCount}`));
  }

  if (emptyCount > 0) {
    console.log(chalk.yellow(`空字符串数量: ${emptyCount}`));
  }

  // Trailing blank line separates the summary from later output.
  console.log();
}
|
|
308
|
+
|
|
309
|
+
// CLI 配置
|
|
310
|
+
// Top-level CLI metadata shown by --help and --version.
program
  .name('data-cleaner')
  .description('数据清洗工具 - 快速清洗和转换数据文件')
  .version('1.0.0');

// `stats <file>`: read a file and print its summary without modifying it.
program
  .command('stats <file>')
  .description('显示文件统计信息')
  .action(async (file) => {
    if (!fs.existsSync(file)) {
      console.error(chalk.red(`文件不存在: ${file}`));
      process.exit(1);
    }

    // Reading can fail after the existence check (malformed JSON/CSV, a
    // directory path, permissions). Report a short message and a failing
    // exit code instead of an unhandled promise rejection.
    try {
      const data = await readFile(file);
      printStats(getStats(data));
    } catch (err) {
      console.error(chalk.red(`读取失败: ${err.message}`));
      process.exit(1);
    }
  });
|
|
328
|
+
|
|
329
|
+
// `clean <input> [output]`: read a file, apply the selected cleaning steps
// and write the result.
program
  .command('clean <input> [output]')
  .option('-f, --format <type>', '输出格式(json/csv)')
  .option('--remove-empty', '去除空行')
  .option('--deduplicate', '去重')
  .option('-k, --key <field>', '去重时使用的字段')
  .option('--trim', '去除空格')
  .option('--case <type>', '大小写转换(upper/lower/title)')
  .option('-c, --columns <items>', '选择列(逗号分隔)')
  .option('-F, --filter <expr>', '过滤表达式(column:operator:value)')
  .option('-S, --sort <column>', '按列排序')
  .option('--order <dir>', '排序方向(asc/desc)', 'asc')
  // Explicit radix; commander would otherwise pass the previous value as
  // parseInt's second argument.
  .option('-l, --limit <number>', '限制输出数量', (raw) => Number.parseInt(raw, 10))
  .option('--transform <expr>', '转换表达式(column:transform[:args])')
  .option('--stats', '显示统计信息')
  .description('清洗数据文件')
  .action(async (input, output, options) => {
    // Print an error and stop with a failing exit code.
    const fail = (message) => {
      console.error(chalk.red(message));
      process.exit(1);
    };

    if (!fs.existsSync(input)) {
      fail(`文件不存在: ${input}`);
    }

    // Output format: explicit --format, else the output file's extension,
    // else the input file's extension (CSV for anything that is not JSON).
    const inputExt = path.extname(input).toLowerCase();
    const outputExt = output ? path.extname(output).toLowerCase() : '';
    const inferredFormat = ['.json', '.csv'].includes(outputExt)
      ? outputExt.slice(1)
      : (inputExt === '.json' ? 'json' : 'csv');
    const outputFormat = String(options.format || inferredFormat).toLowerCase();

    // Default output sits next to the input as `<name>.cleaned.<format>`.
    // Built from parsed path parts so an input without an extension, or a
    // directory containing a dot, cannot make the output overwrite the input.
    const inputParts = path.parse(input);
    const outputFile = output || path.format({
      root: inputParts.root,
      dir: inputParts.dir,
      name: inputParts.name,
      ext: `.cleaned.${outputFormat}`,
    });

    // Translate raw CLI strings into the shapes cleanData expects, without
    // mutating commander's own options object.
    const cleanOptions = { ...options };

    if (options.filter) {
      // Split on the first two colons only so the value may contain ':'.
      const first = options.filter.indexOf(':');
      const second = first === -1 ? -1 : options.filter.indexOf(':', first + 1);
      if (second === -1) {
        fail(`无效的过滤表达式: ${options.filter}(格式: column:operator:value)`);
      }
      cleanOptions.filter = {
        column: options.filter.slice(0, first),
        operator: options.filter.slice(first + 1, second),
        value: options.filter.slice(second + 1),
      };
    }

    if (options.transform) {
      const split = options.transform.indexOf(':');
      if (split === -1) {
        fail(`无效的转换表达式: ${options.transform}(格式: column:transform[:args])`);
      }
      cleanOptions.transform = {
        column: options.transform.slice(0, split),
        // Keep everything after the column so arguments such as
        // `replace:a,b` or `multiply:2` reach cleanData.
        transform: options.transform.slice(split + 1),
      };
    }

    if (options.columns) {
      cleanOptions.columns = options.columns
        .split(',')
        .map((column) => column.trim())
        .filter((column) => column !== '');
    }

    if (options.sort) {
      const order = String(options.order).toLowerCase();
      if (order !== 'asc' && order !== 'desc') {
        fail(`无效的排序方向: ${options.order}(可选: asc/desc)`);
      }
      cleanOptions.sort = { column: options.sort, order };
    }

    if (options.limit !== undefined && (Number.isNaN(options.limit) || options.limit < 0)) {
      fail('无效的数量限制: --limit 需要非负整数');
    }

    console.log(chalk.cyan(`\n🔧 清洗数据\n`));
    console.log(chalk.gray(`输入: ${input}`));
    console.log(chalk.gray(`输出: ${outputFile}\n`));

    try {
      const data = await readFile(input);

      if (options.stats) {
        console.log(chalk.cyan('原始数据:'));
        printStats(getStats(data));
      }

      const cleaned = cleanData(data, cleanOptions);

      if (options.stats) {
        console.log(chalk.cyan('清洗后数据:'));
        printStats(getStats(cleaned));
      }

      await writeFile(outputFile, cleaned, outputFormat);

      console.log(chalk.green(`✅ 已保存到: ${outputFile}`));

      // Report the row-count change when rows were removed.
      const originalCount = Array.isArray(data) ? data.length : 1;
      const cleanedCount = Array.isArray(cleaned) ? cleaned.length : 1;
      if (originalCount !== cleanedCount) {
        console.log(chalk.yellow(` 从 ${originalCount} 行减少到 ${cleanedCount} 行`));
      }

      console.log();
    } catch (err) {
      // Parse, regex and write errors end here with a short message instead
      // of an unhandled promise rejection.
      fail(`处理失败: ${err.message}`);
    }
  });
|
|
418
|
+
|
|
419
|
+
// The command actions are async, so parse with parseAsync: it returns a
// promise that settles when the action finishes, letting failures be
// reported here rather than surfacing as unhandled rejections.
program.parseAsync().catch((err) => {
  console.error(chalk.red(err && err.message ? err.message : String(err)));
  process.exit(1);
});
|