@raphaellcs/data-cleaner 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,4 +1,7 @@
1
- # @claw-dev/data-cleaner
1
+ # @raphaellcs/data-cleaner
2
+ [![npm](https://img.shields.io/npm/v/@raphaellcs/data-cleaner)](https://www.npmjs.com/package/@raphaellcs/data-cleaner)
3
+ [![downloads](https://img.shields.io/npm/dm/@raphaellcs/data-cleaner)](https://www.npmjs.com/package/@raphaellcs/data-cleaner)
4
+ [![license](https://img.shields.io/npm/l/@raphaellcs/data-cleaner)](https://www.npmjs.com/package/@raphaellcs/data-cleaner)
2
5
 
3
6
  > 数据清洗工具 - 快速清洗和转换数据文件
4
7
 
@@ -13,6 +16,9 @@
13
16
  - **排序**:按列排序
14
17
  - **格式转换**:JSON ↔ CSV
15
18
  - **统计信息**:查看数据概况
19
+ - **数据验证**:内置验证规则和自定义规则(新)
20
+ - **分组聚合**:字段分组和时间分组(新)
21
+ - **透视表**:创建数据透视表(新)
16
22
 
17
23
  ## 📦 安装
18
24
 
@@ -296,11 +302,134 @@ done
296
302
  ## 🚧 待实现
297
303
 
298
304
  - [ ] 支持更多文件格式(Excel、SQL)
299
- - [ ] 自定义转换函数
300
- - [ ] 正则表达式替换
301
- - [ ] 数据验证规则
302
305
  - [ ] 合并多个文件
303
- - [ ] 分组统计
306
+
307
+ ---
308
+
309
+ ## ✨ 新功能(v2.0.0)
310
+
311
+ ### 数据验证
312
+
313
+ 验证数据是否符合规则:
314
+
315
+ ```bash
316
+ data-cleaner validate data.csv --config rules.json
317
+ ```
318
+
319
+ 创建验证规则配置 `rules.json`:
320
+
321
+ ```json
322
+ {
323
+ "email": ["required", "email"],
324
+ "age": [
325
+ "required",
326
+ {"name": "number", "message": "年龄必须是数字"},
327
+ {"name": "min", "value": 0, "message": "年龄不能为负数"},
328
+ {"name": "max", "value": 120, "message": "年龄不能超过120"}
329
+ ],
330
+ "phone": [
331
+ {"name": "pattern", "value": "^\\d{11}$", "message": "手机号必须是11位数字"}
332
+ ],
333
+ "status": [
334
+ {"name": "enum", "value": ["active", "inactive", "pending"], "message": "状态值不合法"}
335
+ ]
336
+ }
337
+ ```
338
+
339
+ **内置验证规则:**
340
+ - `required` - 必填
341
+ - `email` - 邮箱格式
342
+ - `url` - URL 格式
343
+ - `number` - 数字
344
+ - `integer` - 整数
345
+ - `positive` - 正数
346
+ - `negative` - 负数
347
+ - `min:<value>` - 最小值
348
+ - `max:<value>` - 最大值
349
+ - `minLength:<length>` - 最小长度
350
+ - `maxLength:<length>` - 最大长度
351
+ - `pattern:<regex>` - 正则匹配
352
+ - `enum:[values]` - 枚举值
353
+ - `date` - 日期
354
+ - `future` - 未来日期
355
+ - `past` - 过去日期
356
+ - `phone` - 电话号码
357
+
358
+ 输出错误报告:
359
+
360
+ ```bash
361
+ data-cleaner validate data.csv --config rules.json --output errors.csv --format csv
362
+ ```
363
+
364
+ ### 分组聚合
365
+
366
+ 按字段分组并聚合:
367
+
368
+ ```bash
369
+ # 按部门分组,计算平均工资
370
+ data-cleaner group employees.csv --group-by department --aggregate "salary:avg" --stats
371
+
372
+ # 多字段分组
373
+ data-cleaner group sales.csv --group-by "region,category" --aggregate "revenue:sum,count" --output grouped.json
374
+ ```
375
+
376
+ 时间分组:
377
+
378
+ ```bash
379
+ # 按天分组
380
+ data-cleaner group orders.csv --time-field created_at --interval day --aggregate "amount:sum" --stats
381
+
382
+ # 按月分组
383
+ data-cleaner group orders.csv --time-field created_at --interval month --aggregate "amount:sum,count" --stats
384
+
385
+ # 按小时分组
386
+ data-cleaner group logs.csv --time-field timestamp --interval hour --aggregate "errors:sum" --stats
387
+ ```
388
+
389
+ **聚合类型:**
390
+ - `sum` - 求和
391
+ - `avg` - 平均值
392
+ - `min` - 最小值
393
+ - `max` - 最大值
394
+ - `count` - 计数
395
+ - `count_distinct` - 去重计数
396
+ - `first` - 第一个值
397
+ - `last` - 最后一个值
398
+ - `concat` - 拼接
399
+ - `array` - 数组
400
+ - `percentile:XX` - 百分位数(如 percentile:95)
401
+
402
+ ### 透视表
403
+
404
+ 创建数据透视表:
405
+
406
+ ```bash
407
+ data-cleaner pivot sales.csv \
408
+ --rows region \
409
+ --columns product \
410
+ --values revenue \
411
+ --agg sum
412
+ ```
413
+
414
+ 示例输出:
415
+
416
+ ```
417
+ productA productB productC
418
+ region1 15000.00 23000.00 18000.00
419
+ region2 12000.00 25000.00 21000.00
420
+ region3 18000.00 20000.00 22000.00
421
+ ```
422
+
423
+ 保存透视表:
424
+
425
+ ```bash
426
+ data-cleaner pivot sales.csv \
427
+ --rows region \
428
+ --columns product \
429
+ --values revenue \
430
+ --agg sum \
431
+ --output pivot.json
432
+ ```
304
433
 
305
434
  ## 🤝 贡献
306
435
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@raphaellcs/data-cleaner",
3
- "version": "1.0.0",
4
- "description": "数据清洗工具 - 快速清洗和转换数据文件",
3
+ "version": "2.0.0",
4
+ "description": "数据清洗工具 - 验证、分组、透视表",
5
5
  "main": "src/index.js",
6
6
  "bin": {
7
7
  "data-cleaner": "./bin/cli.js"
package/src/grouper.js ADDED
@@ -0,0 +1,360 @@
1
+ // 分组统计模块
2
+
3
/**
 * Groups records by the value of a single field.
 *
 * Group keys are the string form of each record's field value, exactly as a
 * plain-object property key would be (a missing field groups under
 * "undefined").
 *
 * @param {Array<Object>} data - Records to group.
 * @param {string} groupByField - Name of the field to group on.
 * @returns {Object<string, Array<Object>>} Plain object mapping each group
 *   key to the records that share it, in first-seen order.
 */
function groupBy(data, groupByField) {
  // Accumulate in a Map so that values such as "constructor", "toString" or
  // "__proto__" are ordinary keys instead of colliding with members
  // inherited from Object.prototype.
  const buckets = new Map();

  for (const item of data) {
    const key = String(item[groupByField]);
    const bucket = buckets.get(key);
    if (bucket === undefined) {
      buckets.set(key, [item]);
    } else {
      bucket.push(item);
    }
  }

  // Object.fromEntries defines own data properties, so even "__proto__"
  // ends up as a regular key on the returned plain object.
  return Object.fromEntries(buckets);
}
22
+
23
/**
 * Groups records by the combination of several fields.
 *
 * The group key is the string form of each field value joined with "|";
 * a null or undefined value contributes the placeholder "__null__".
 *
 * @param {Array<Object>} data - Records to group.
 * @param {Array<string>} groupByFields - Names of the fields to group on.
 * @returns {Object<string, Array<Object>>} Plain object mapping each
 *   composite key to the records that share it.
 */
function groupByMultiple(data, groupByFields) {
  // A Map keeps keys such as "constructor" or "__proto__" from colliding
  // with members inherited from Object.prototype.
  const buckets = new Map();

  for (const item of data) {
    const key = groupByFields
      .map((field) => {
        const value = item[field];
        return value !== undefined && value !== null ? String(value) : '__null__';
      })
      .join('|');

    const bucket = buckets.get(key);
    if (bucket === undefined) {
      buckets.set(key, [item]);
    } else {
      bucket.push(item);
    }
  }

  // fromEntries creates own properties, so every key is safe on the result.
  return Object.fromEntries(buckets);
}
47
+
48
/**
 * Computes one aggregate per configured field for a single group.
 *
 * Null, undefined and empty-string cells are ignored by every aggregation.
 * Supported aggregation types: sum, avg, min, max, count, count_distinct,
 * first, last, concat, array and "percentile:NN". Entries with any other
 * (or a missing) type are skipped and produce no key in the result.
 *
 * @param {Array<Object>} group - Records belonging to the group.
 * @param {Object<string, string>} aggregations - Map of field name to
 *   aggregation type.
 * @returns {Object} Map of field name to aggregated value. `min`/`max`
 *   yield null when the group has no numeric value for the field.
 */
function aggregateGroup(group, aggregations) {
  const result = {};

  for (const [fieldName, aggType] of Object.entries(aggregations)) {
    const values = group
      .map((item) => item[fieldName])
      .filter((v) => v !== null && v !== undefined && v !== '');

    // Non-numeric cells count as 0 in sums (and therefore in averages).
    const total = () => values.reduce((sum, v) => sum + (Number(v) || 0), 0);
    // For min/max/percentile only cells that convert to a number take part,
    // so a genuine 0 is kept and text cells are dropped.
    const numeric = () => values.map((v) => Number(v)).filter((n) => !Number.isNaN(n));

    switch (aggType) {
      case 'sum':
        result[fieldName] = total();
        break;

      case 'avg':
        result[fieldName] = values.length > 0 ? total() / values.length : 0;
        break;

      case 'min': {
        const nums = numeric();
        // reduce instead of Math.min(...nums): no argument-count limit.
        result[fieldName] = nums.length > 0
          ? nums.reduce((lowest, n) => Math.min(lowest, n))
          : null;
        break;
      }

      case 'max': {
        const nums = numeric();
        result[fieldName] = nums.length > 0
          ? nums.reduce((highest, n) => Math.max(highest, n))
          : null;
        break;
      }

      case 'count':
        result[fieldName] = values.length;
        break;

      case 'count_distinct':
        result[fieldName] = new Set(values).size;
        break;

      case 'first':
        result[fieldName] = values[0];
        break;

      case 'last':
        result[fieldName] = values[values.length - 1];
        break;

      case 'concat':
        result[fieldName] = values.join(', ');
        break;

      case 'array':
        result[fieldName] = values;
        break;

      default:
        // Guard the type: an entry without an aggregation type arrives
        // here as undefined and must not crash the whole run.
        if (typeof aggType === 'string' && aggType.startsWith('percentile:')) {
          const p = Number.parseInt(aggType.split(':')[1], 10);
          result[fieldName] = calculatePercentile(numeric(), p);
        }
    }
  }

  return result;
}
115
+
116
/**
 * Computes a percentile using linear interpolation between the two nearest
 * ranks of the sorted values.
 *
 * @param {Array<number>} values - Sample values; entries that are not
 *   finite numbers are ignored.
 * @param {number} percentile - Requested percentile; values outside 0-100
 *   are clamped to that range.
 * @returns {number} The interpolated percentile, 0 when there is no usable
 *   value, or NaN when `percentile` is not a number.
 */
function calculatePercentile(values, percentile) {
  // NaN breaks the numeric comparator below, so drop unusable entries first.
  const sorted = values.filter((v) => Number.isFinite(v)).sort((a, b) => a - b);
  if (sorted.length === 0) return 0;
  if (typeof percentile !== 'number' || Number.isNaN(percentile)) return NaN;

  // Clamping keeps the rank inside the array: a negative percentile would
  // otherwise index before the first element and yield NaN.
  const clamped = Math.min(100, Math.max(0, percentile));
  const rank = (clamped / 100) * (sorted.length - 1);

  const lower = Math.floor(rank);
  const upper = Math.ceil(rank);
  const weight = rank - lower;

  return sorted[lower] * (1 - weight) + sorted[upper] * weight;
}
138
+
139
/**
 * Groups records by one or more fields and aggregates each group.
 *
 * Each result row contains `_group` (the composite group key), `_count`
 * (number of records in the group), one entry per grouping field holding
 * that field's value as a string (null when it was null/undefined), and
 * the aggregated values.
 *
 * @param {Array<Object>} data - Records to group.
 * @param {string|Array<string>} groupBy - Field name or list of field names.
 * @param {Object<string, string>} aggregations - Map of field name to
 *   aggregation type.
 * @returns {Array<Object>} One result row per group.
 */
function groupAndAggregate(data, groupBy, aggregations) {
  const groupByFields = Array.isArray(groupBy) ? groupBy : [groupBy];
  const groups = groupByMultiple(data, groupByFields);

  return Object.entries(groups).map(([key, group]) => {
    const groupResult = {
      _group: key,
      _count: group.length,
    };

    // Read the grouping values from a member of the group instead of
    // splitting the composite key: splitting breaks when a value itself
    // contains "|" or is literally the "__null__" placeholder. Every group
    // holds at least one record, and all of them share these values.
    const sample = group[0];
    for (const field of groupByFields) {
      const value = sample[field];
      groupResult[field] = value !== undefined && value !== null ? String(value) : null;
    }

    Object.assign(groupResult, aggregateGroup(group, aggregations));
    return groupResult;
  });
}
174
+
175
/**
 * Groups records into time buckets derived from a date field.
 *
 * Bucket keys are prefixes of the UTC ISO-8601 timestamp:
 * minute "YYYY-MM-DDTHH:MM", hour "YYYY-MM-DDTHH", day "YYYY-MM-DD",
 * month "YYYY-MM", year "YYYY". For `week` the key is the "YYYY-MM-DD" of
 * the Sunday (UTC) that starts the week. An unrecognised interval behaves
 * like `day`. Records whose date is missing or cannot be parsed are skipped.
 *
 * @param {Array<Object>} data - Records to group.
 * @param {string} dateField - Name of the field holding the date.
 * @param {string} [interval='day'] - minute/hour/day/week/month/year.
 * @returns {Object<string, Array<Object>>} Map of bucket key to records.
 */
function groupByTime(data, dateField, interval = 'day') {
  // Length of the ISO-string prefix that identifies each bucket size.
  const prefixLengths = { minute: 16, hour: 13, day: 10, month: 7, year: 4 };
  const prefixLength = Object.prototype.hasOwnProperty.call(prefixLengths, interval)
    ? prefixLengths[interval]
    : 10;

  const groups = {};

  for (const item of data) {
    const raw = item[dateField];
    // new Date(null) is the Unix epoch, which would silently file records
    // without a date under 1970; treat them as missing instead.
    if (raw === null || raw === undefined) continue;

    const date = new Date(raw);
    if (Number.isNaN(date.getTime())) continue;

    let key;
    if (interval === 'week') {
      // Work in UTC throughout: the key is cut from toISOString(), which is
      // UTC, so local-time getDate()/getDay() would shift the week start by
      // a day depending on the machine's timezone.
      const weekStart = new Date(date);
      weekStart.setUTCDate(date.getUTCDate() - date.getUTCDay());
      key = weekStart.toISOString().substring(0, 10);
    } else {
      key = date.toISOString().substring(0, prefixLength);
    }

    if (!groups[key]) {
      groups[key] = [];
    }
    groups[key].push(item);
  }

  return groups;
}
223
+
224
/**
 * Computes count/sum/avg/min/max of one numeric field for every group.
 *
 * Cells that are null, undefined, blank, or not convertible to a number are
 * left out of the statistics. A group without any usable value reports 0
 * for every statistic.
 *
 * @param {Object<string, Array<Object>>} groups - Map of group key to
 *   records, as produced by the grouping functions.
 * @param {string} statField - Name of the numeric field to summarise.
 * @returns {Array<Object>} One `{ group, count, sum, avg, min, max }` entry
 *   per group, sorted by group key.
 */
function getGroupStats(groups, statField) {
  const stats = Object.entries(groups).map(([key, group]) => {
    const values = group
      .map((item) => item[statField])
      // Number('') and Number(null) are both 0, so blank cells must be
      // removed before conversion or they would be counted as zeros.
      .filter((raw) => raw !== null && raw !== undefined && String(raw).trim() !== '')
      .map((raw) => Number(raw))
      .filter((n) => !Number.isNaN(n));

    if (values.length === 0) {
      return { group: key, count: 0, sum: 0, avg: 0, min: 0, max: 0 };
    }

    const sum = values.reduce((total, v) => total + v, 0);
    return {
      group: key,
      count: values.length,
      sum,
      avg: sum / values.length,
      // reduce instead of Math.min(...values): no argument-count limit.
      min: values.reduce((lowest, v) => Math.min(lowest, v)),
      max: values.reduce((highest, v) => Math.max(highest, v)),
    };
  });

  return stats.sort((a, b) => a.group.localeCompare(b.group));
}
262
+
263
/**
 * Builds a pivot table: one row per distinct `rowField` value, one column
 * per distinct `columnField` value, each cell aggregating `valueField`.
 *
 * Values that are missing or not numeric are treated as 0. Cells with no
 * matching record hold 0.
 *
 * @param {Array<Object>} data - Records to pivot.
 * @param {string} rowField - Field whose values become the rows.
 * @param {string} columnField - Field whose values become the columns.
 * @param {string} valueField - Field whose values are aggregated.
 * @param {string} [aggFunction='sum'] - sum/avg/count/min/max; any other
 *   name falls back to sum.
 * @returns {{rows: Array, columns: Array, data: Object}} Sorted row and
 *   column labels plus `data[row][column]` holding the aggregated cells.
 */
function pivotTable(data, rowField, columnField, valueField, aggFunction = 'sum') {
  const rows = new Set();
  const columns = new Set();
  // Nested maps (row label -> column label -> values) keep cells apart even
  // when a label contains a separator such as "::", which a single joined
  // key cannot guarantee.
  const cells = new Map();

  for (const item of data) {
    const rowKey = item[rowField];
    const colKey = item[columnField];
    const val = Number(item[valueField]) || 0;

    rows.add(rowKey);
    columns.add(colKey);

    const rowId = String(rowKey);
    const colId = String(colKey);
    let rowCells = cells.get(rowId);
    if (rowCells === undefined) {
      rowCells = new Map();
      cells.set(rowId, rowCells);
    }
    const bucket = rowCells.get(colId);
    if (bucket === undefined) {
      rowCells.set(colId, [val]);
    } else {
      bucket.push(val);
    }
  }

  const sum = (vals) => vals.reduce((total, v) => total + v, 0);
  const aggregators = {
    sum,
    avg: (vals) => (vals.length > 0 ? sum(vals) / vals.length : 0),
    count: (vals) => vals.length,
    // reduce instead of Math.min(...vals): no argument-count limit.
    min: (vals) => (vals.length > 0 ? vals.reduce((lo, v) => Math.min(lo, v)) : 0),
    max: (vals) => (vals.length > 0 ? vals.reduce((hi, v) => Math.max(hi, v)) : 0),
  };
  const aggregate = Object.prototype.hasOwnProperty.call(aggregators, aggFunction)
    ? aggregators[aggFunction]
    : sum;

  // Object.fromEntries defines own properties, so a label such as
  // "__proto__" is stored as data rather than replacing the prototype.
  const pivot = Object.fromEntries(
    [...rows].map((row) => {
      const rowCells = cells.get(String(row));
      const rowData = Object.fromEntries(
        [...columns].map((col) => [String(col), aggregate(rowCells.get(String(col)) || [])]),
      );
      return [String(row), rowData];
    }),
  );

  return {
    rows: Array.from(rows).sort(),
    columns: Array.from(columns).sort(),
    data: pivot,
  };
}
333
+
334
/**
 * Writes the per-group statistics to the console, one section per group.
 *
 * @param {Array<Object>} stats - Entries of the form
 *   `{ group, count, sum, avg, min, max }`; the numeric statistics are
 *   printed with two decimals.
 */
function printGroupStats(stats) {
  console.log('\n📊 分组统计\n');

  stats.forEach((entry) => {
    console.log(`${entry.group}:`);
    console.log(` 数量: ${entry.count}`);

    const twoDecimals = (amount) => amount.toFixed(2);
    console.log(` 总和: ${twoDecimals(entry.sum)}`);
    console.log(` 平均: ${twoDecimals(entry.avg)}`);
    console.log(` 最小: ${twoDecimals(entry.min)}`);
    console.log(` 最大: ${twoDecimals(entry.max)}`);

    // Blank line separating consecutive groups.
    console.log();
  });
}
351
+
352
+ module.exports = {
353
+ groupBy,
354
+ groupByMultiple,
355
+ groupAndAggregate,
356
+ groupByTime,
357
+ getGroupStats,
358
+ pivotTable,
359
+ printGroupStats
360
+ };
package/src/index.js CHANGED
@@ -6,6 +6,17 @@ const { program } = require('commander');
6
6
  const chalk = require('chalk');
7
7
  const { parse } = require('csv-parse');
8
8
  const { stringify } = require('csv-stringify');
9
+ const {
10
+ DataValidator,
11
+ createValidatorFromConfig
12
+ } = require('./validator.js');
13
+ const {
14
+ groupAndAggregate,
15
+ groupByTime,
16
+ getGroupStats,
17
+ pivotTable,
18
+ printGroupStats
19
+ } = require('./grouper.js');
9
20
 
10
21
  // 读取文件
11
22
  function readFile(filePath) {
@@ -347,23 +358,23 @@ program
347
358
  console.log(chalk.red(`文件不存在: ${input}`));
348
359
  process.exit(1);
349
360
  }
350
-
361
+
351
362
  const ext = path.extname(input).toLowerCase();
352
363
  const outputFormat = options.format || (ext === '.json' ? 'json' : 'csv');
353
364
  const outputFile = output || input.replace(/\.[^.]+$/, `.cleaned.${outputFormat}`);
354
-
365
+
355
366
  console.log(chalk.cyan(`\n🔧 清洗数据\n`));
356
367
  console.log(chalk.gray(`输入: ${input}`));
357
368
  console.log(chalk.gray(`输出: ${outputFile}\n`));
358
-
369
+
359
370
  const data = await readFile(input);
360
-
371
+
361
372
  // 显示原始统计
362
373
  if (options.stats) {
363
374
  console.log(chalk.cyan('原始数据:'));
364
375
  printStats(getStats(data));
365
376
  }
366
-
377
+
367
378
  // 解析过滤表达式
368
379
  if (options.filter) {
369
380
  const parts = options.filter.split(':');
@@ -391,29 +402,267 @@ program
391
402
  if (options.columns) {
392
403
  options.columns = options.columns.split(',');
393
404
  }
394
-
405
+
395
406
  // 清洗数据
396
407
  const cleaned = cleanData(data, options);
397
-
408
+
398
409
  // 显示清洗后统计
399
410
  if (options.stats) {
400
411
  console.log(chalk.cyan('清洗后数据:'));
401
412
  printStats(getStats(cleaned));
402
413
  }
403
-
414
+
404
415
  // 写入文件
405
416
  await writeFile(outputFile, cleaned, outputFormat);
406
-
417
+
407
418
  console.log(chalk.green(`✅ 已保存到: ${outputFile}`));
408
-
419
+
409
420
  // 显示差异
410
421
  const originalCount = Array.isArray(data) ? data.length : 1;
411
422
  const cleanedCount = Array.isArray(cleaned) ? cleaned.length : 1;
412
423
  if (originalCount !== cleanedCount) {
413
424
  console.log(chalk.yellow(` 从 ${originalCount} 行减少到 ${cleanedCount} 行`));
414
425
  }
415
-
426
+
427
+ console.log();
428
+ });
429
+
430
// `validate` command: checks every record of <input> against the rules in
// the --config JSON file, prints up to 20 violations, optionally writes the
// full error report (JSON or CSV), and exits with status 1 when any record
// fails validation.
program
  .command('validate <input>')
  .option('-c, --config <path>', '验证规则配置文件(JSON)')
  .option('-o, --output <path>', '输出错误报告到文件')
  .option('--format <type>', '输出格式(json/csv)', 'json')
  .description('验证数据')
  .action(async (input, options) => {
    if (!fs.existsSync(input)) {
      console.log(chalk.red(`文件不存在: ${input}`));
      process.exit(1);
    }

    const data = await readFile(input);

    if (!Array.isArray(data)) {
      console.log(chalk.red('数据必须是数组格式'));
      process.exit(1);
    }

    console.log(chalk.cyan(`\n✅ 验证数据\n`));

    // Without a rule file there is nothing to validate against.
    if (!options.config) {
      console.log(chalk.yellow('未提供验证规则配置,跳过验证'));
      console.log(chalk.gray('使用 --config 指定验证规则文件\n'));
      process.exit(0);
    }

    if (!fs.existsSync(options.config)) {
      console.log(chalk.red(`配置文件不存在: ${options.config}`));
      process.exit(1);
    }

    // A malformed rule file is a user error: report it like the other
    // input problems instead of letting the rejection escape the action.
    let config;
    try {
      config = JSON.parse(fs.readFileSync(options.config, 'utf-8'));
    } catch (error) {
      console.log(chalk.red(`配置文件解析失败: ${error.message}`));
      process.exit(1);
    }

    const validator = createValidatorFromConfig(config);
    console.log(chalk.gray(`从配置文件加载规则: ${options.config}`));
    console.log(chalk.gray(`规则数量: ${validator.getRuleCount()}`));
    console.log();

    const errors = validator.getErrors(data);

    if (errors.length === 0) {
      console.log(chalk.green('✓ 所有数据验证通过!\n'));
    } else {
      console.log(chalk.red(`✗ 发现 ${errors.length} 个验证错误:\n`));

      // Only the first 20 errors are shown on screen; the report file
      // written below always contains all of them.
      const displayLimit = 20;
      for (const error of errors.slice(0, displayLimit)) {
        console.log(chalk.red(` [行 ${error.row}] ${error.field}`));
        console.log(chalk.gray(` 规则: ${error.rule}`));
        console.log(chalk.gray(` 值: ${error.value}`));
        console.log(chalk.gray(` 消息: ${error.message}\n`));
      }

      if (errors.length > displayLimit) {
        console.log(chalk.yellow(`... 还有 ${errors.length - displayLimit} 个错误\n`));
      }
    }

    if (options.output && errors.length > 0) {
      if (options.format === 'csv') {
        // Quote every text cell: field and rule names may contain commas or
        // quotes just like values and messages do.
        const quote = (cell) => `"${String(cell).replace(/"/g, '""')}"`;
        const headers = ['row', 'field', 'rule', 'value', 'message'];
        const lines = errors.map((e) => [
          e.row,
          quote(e.field),
          quote(e.rule),
          quote(e.value),
          quote(e.message),
        ].join(','));
        fs.writeFileSync(options.output, [headers.join(','), ...lines].join('\n'), 'utf-8');
      } else {
        fs.writeFileSync(options.output, JSON.stringify(errors, null, 2), 'utf-8');
      }
      console.log(chalk.green(`✓ 错误报告已保存到: ${options.output}\n`));
    }

    process.exit(errors.length === 0 ? 0 : 1);
  });
515
+
516
// `group` command: groups the records of <input> either by time bucket
// (--time-field/--interval) or by one or more fields (--group-by), applies
// the --aggregate expressions, and optionally prints and/or saves the result.
program
  .command('group <input>')
  .option('-g, --group-by <field>', '分组字段(支持多个,逗号分隔)')
  .option('-a, --aggregate <expr>', '聚合表达式(field:aggType,逗号分隔)')
  .option('-t, --time-field <field>', '时间字段(用于时间分组)')
  .option('-i, --interval <type>', '时间间隔(minute/hour/day/week/month/year)', 'day')
  .option('-o, --output <path>', '输出文件')
  .option('-f, --format <type>', '输出格式(json/csv)', 'json')
  .option('--stats', '显示统计信息')
  .description('分组和聚合数据')
  .action(async (input, options) => {
    if (!fs.existsSync(input)) {
      console.log(chalk.red(`文件不存在: ${input}`));
      process.exit(1);
    }

    const data = await readFile(input);

    if (!Array.isArray(data)) {
      console.log(chalk.red('数据必须是数组格式'));
      process.exit(1);
    }

    // Turns "salary:avg,latency:percentile:95" into
    // { salary: 'avg', latency: 'percentile:95' }.
    const parseAggregations = (expr) => {
      const aggregations = {};
      for (const part of expr.split(',')) {
        const spec = part.trim();
        // Split on the FIRST colon only, so the argument of
        // "percentile:95" stays attached to the aggregation type.
        const separator = spec.indexOf(':');
        if (separator === -1) {
          // A bare "count" (as in the documented "revenue:sum,count") needs
          // no aggregation of its own: every result row already carries
          // the group size as `_count`. Anything else is a malformed entry.
          if (spec !== '' && spec !== 'count') {
            console.log(chalk.yellow(`忽略无效的聚合表达式: ${spec}`));
          }
          continue;
        }
        aggregations[spec.slice(0, separator).trim()] = spec.slice(separator + 1).trim();
      }
      return aggregations;
    };

    console.log(chalk.cyan(`\n📊 分组和聚合\n`));

    let result;

    if (options.timeField) {
      // Time-bucket grouping.
      const groups = groupByTime(data, options.timeField, options.interval);
      console.log(chalk.gray(`时间字段: ${options.timeField}`));
      console.log(chalk.gray(`时间间隔: ${options.interval}`));
      console.log(chalk.gray(`分组数量: ${Object.keys(groups).length}\n`));

      if (options.stats && options.aggregate) {
        const aggregations = parseAggregations(options.aggregate);

        // The printed statistics cover the first aggregated field.
        const [statField] = Object.keys(aggregations);
        if (statField !== undefined) {
          printGroupStats(getGroupStats(groups, statField));
        }

        // Aggregate per time bucket: replace each record's time value with
        // its bucket key before grouping, otherwise the result would be
        // grouped by the raw timestamps and ignore --interval.
        const bucketed = Object.entries(groups).flatMap(([bucket, items]) => (
          items.map((item) => ({ ...item, [options.timeField]: bucket }))
        ));
        result = groupAndAggregate(bucketed, options.timeField, aggregations);
      } else {
        result = groups;
      }
    } else if (options.groupBy) {
      // Field grouping.
      const groupByFields = options.groupBy.split(',').map((field) => field.trim());
      const aggregations = options.aggregate ? parseAggregations(options.aggregate) : {};

      console.log(chalk.gray(`分组字段: ${groupByFields.join(', ')}`));
      console.log(chalk.gray(`聚合规则: ${Object.keys(aggregations).join(', ') || '无'}\n`));

      result = groupAndAggregate(data, groupByFields, aggregations);

      if (options.stats) {
        for (const item of result) {
          console.log(chalk.cyan(` ${item._group}`));
          console.log(chalk.gray(` 数量: ${item._count}`));
          for (const [key, value] of Object.entries(item)) {
            // Keys starting with "_" are the bookkeeping fields shown above.
            if (!key.startsWith('_')) {
              console.log(chalk.gray(` ${key}: ${typeof value === 'number' ? value.toFixed(2) : value}`));
            }
          }
          console.log();
        }
      }
    } else {
      console.log(chalk.red('必须指定 --group-by 或 --time-field'));
      process.exit(1);
    }

    if (options.output) {
      if (options.format === 'csv') {
        await writeFile(options.output, result, 'csv');
      } else {
        fs.writeFileSync(options.output, JSON.stringify(result, null, 2), 'utf-8');
      }
      console.log(chalk.green(`✓ 已保存到: ${options.output}\n`));
    }
  });
613
+
614
// `pivot` command: builds a pivot table from <input> (rows x columns with
// an aggregated value per cell), prints it as an aligned text table, and
// optionally saves it as JSON.
program
  .command('pivot <input>')
  .option('-r, --rows <field>', '行字段')
  .option('-c, --columns <field>', '列字段')
  .option('-v, --values <field>', '值字段')
  .option('-a, --agg <func>', '聚合函数(sum/avg/count/min/max)', 'sum')
  .option('-o, --output <path>', '输出文件')
  .description('创建数据透视表')
  .action(async (input, options) => {
    if (!fs.existsSync(input)) {
      console.log(chalk.red(`文件不存在: ${input}`));
      process.exit(1);
    }

    if (!options.rows || !options.columns || !options.values) {
      console.log(chalk.red('必须指定 --rows, --columns 和 --values'));
      process.exit(1);
    }

    const data = await readFile(input);

    if (!Array.isArray(data)) {
      console.log(chalk.red('数据必须是数组格式'));
      process.exit(1);
    }

    console.log(chalk.cyan(`\n📊 数据透视表\n`));
    console.log(chalk.gray(`行: ${options.rows}`));
    console.log(chalk.gray(`列: ${options.columns}`));
    console.log(chalk.gray(`值: ${options.values}`));
    console.log(chalk.gray(`聚合: ${options.agg}\n`));

    const pivot = pivotTable(data, options.rows, options.columns, options.values, options.agg);

    // Every cell, header included, is right-aligned to the same width and
    // the row labels are padded to the longest label, so that the header
    // lines up with the value columns below it.
    const cellWidth = 12;
    const labelWidth = pivot.rows.reduce((width, row) => Math.max(width, String(row).length), 0);
    const formatCell = (value) => (
      String(typeof value === 'number' ? value.toFixed(2) : value).padStart(cellWidth)
    );

    const header = [
      ''.padEnd(labelWidth),
      ...pivot.columns.map((col) => String(col).padStart(cellWidth)),
    ];
    console.log(chalk.cyan(header.join(' ')));

    for (const row of pivot.rows) {
      const rowData = [String(row).padEnd(labelWidth)];
      for (const col of pivot.columns) {
        rowData.push(formatCell(pivot.data[row][col]));
      }
      console.log(chalk.cyan(rowData.join(' ')));
    }
    console.log();

    if (options.output) {
      fs.writeFileSync(options.output, JSON.stringify(pivot, null, 2), 'utf-8');
      console.log(chalk.green(`✓ 已保存到: ${options.output}\n`));
    }
  });
418
667
 
419
668
  program.parse();
@@ -0,0 +1,298 @@
1
+ // 数据验证模块
2
+
3
/**
 * A single named validation check attached to a field.
 *
 * Holds the rule's name, the function that performs the check, and an
 * optional message to report when the check fails.
 */
class ValidationRule {
  /**
   * @param {string} name - Rule name.
   * @param {Function} validator - Function that receives the field value.
   * @param {string} [errorMessage] - Message reported on failure.
   */
  constructor(name, validator, errorMessage) {
    Object.assign(this, { name, validator, errorMessage });
  }
}
13
+
14
/**
 * Built-in validation rules, keyed by rule name.
 *
 * Each rule is a predicate `(value, arg?) => boolean` that returns true
 * when the value satisfies the rule. `arg` is the rule parameter for the
 * rules that take one (min, max, minLength, maxLength, pattern, enum).
 */
const BUILT_IN_RULES = {
  // Present: anything except null, undefined and the empty string.
  required: (value) => value !== null && value !== undefined && value !== '',

  // Loose "something@something.tld" shape check without whitespace.
  email: (value) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(value),

  // Valid when the URL constructor can parse the value.
  url: (value) => {
    try {
      new URL(value);
      return true;
    } catch {
      return false;
    }
  },

  // Numeric: parseFloat rejects blanks, the coercing global isFinite
  // rejects trailing garbage such as "12abc" as well as Infinity.
  number: (value) => !Number.isNaN(parseFloat(value)) && isFinite(value),

  // Whole number. Only numbers and non-blank strings qualify: '' / null /
  // false would otherwise coerce to 0 and be accepted as an integer.
  integer: (value) => {
    if (typeof value === 'number') return Number.isInteger(value);
    if (typeof value !== 'string' || value.trim() === '') return false;
    return Number.isInteger(Number(value));
  },

  positive: (value) => Number(value) > 0,

  negative: (value) => Number(value) < 0,

  min: (value, min) => Number(value) >= min,

  max: (value, max) => Number(value) <= max,

  minLength: (value, min) => String(value).length >= min,

  maxLength: (value, max) => String(value).length <= max,

  // `pattern` is the regular-expression source as a string.
  pattern: (value, pattern) => new RegExp(pattern).test(value),

  // Strict membership: the value must be identical to one of the entries.
  enum: (value, values) => values.includes(value),

  date: (value) => !Number.isNaN(Date.parse(value)),

  // An unparseable date compares false and therefore fails both rules.
  future: (value) => new Date(value) > new Date(),

  past: (value) => new Date(value) < new Date(),

  // Simple international phone check: optional leading "+", then digits,
  // whitespace, hyphens or parentheses, with at least 10 digits overall.
  phone: (value) => {
    // Work on the string form so numeric values (e.g. from JSON input)
    // are validated instead of throwing on the missing .replace method.
    const text = String(value);
    return /^\+?[\d\s-()]+$/.test(text) && text.replace(/\D/g, '').length >= 10;
  },
};
78
+
79
/**
 * Validates records against per-field rule lists.
 *
 * Rules are referenced by name and resolved when a value is validated:
 * first among the built-in rules, then among the rules registered with
 * `addCustomRule`.
 */
class DataValidator {
  constructor() {
    this.rules = new Map(); // fieldName -> ValidationRule[]
    this.customRules = new Map(); // ruleName -> validator function
  }

  /**
   * Attaches a rule to a field.
   *
   * The rule name is resolved lazily, at validation time, so a custom rule
   * may be registered after it has been attached. An unknown name surfaces
   * as a validation error for the field rather than as an exception here.
   *
   * @param {string} fieldName - Field the rule applies to.
   * @param {string} ruleName - Name of a built-in or custom rule.
   * @param {*} [args] - Rule parameter; passed as the second argument to
   *   the rule function when it is neither undefined nor null.
   * @param {string} [errorMessage] - Message reported when the rule fails.
   */
  addRule(fieldName, ruleName, args, errorMessage) {
    if (!this.rules.has(fieldName)) {
      this.rules.set(fieldName, []);
    }

    const validator = (value) => {
      // Own-property check: a name such as "constructor" must not resolve
      // to a function inherited from Object.prototype.
      const ruleFn = Object.prototype.hasOwnProperty.call(BUILT_IN_RULES, ruleName)
        ? BUILT_IN_RULES[ruleName]
        : this.customRules.get(ruleName);

      if (typeof ruleFn !== 'function') {
        throw new Error(`未知规则: ${ruleName}`);
      }

      if (args !== undefined && args !== null) {
        return ruleFn(value, args);
      }

      return ruleFn(value);
    };

    this.rules.get(fieldName).push(new ValidationRule(
      ruleName,
      validator,
      errorMessage
    ));
  }

  /**
   * Registers a custom rule that `addRule` can then reference by name.
   * A built-in rule with the same name takes precedence.
   *
   * @param {string} ruleName - Name of the rule.
   * @param {Function} validator - Predicate `(value, arg?) => boolean`.
   */
  addCustomRule(ruleName, validator) {
    this.customRules.set(ruleName, validator);
  }

  /**
   * Attaches a regular-expression rule to a field.
   *
   * @param {string} fieldName - Field the rule applies to.
   * @param {string} pattern - Regular-expression source.
   * @param {string} [errorMessage] - Message reported when the rule fails.
   */
  addPatternRule(fieldName, pattern, errorMessage) {
    this.addRule(fieldName, 'pattern', pattern, errorMessage);
  }

  /**
   * Runs every rule attached to a field against one value.
   *
   * A rule that throws is reported as a failed rule carrying the thrown
   * message; it does not abort the remaining rules.
   *
   * @param {string} fieldName - Field name.
   * @param {*} value - Value to validate.
   * @returns {{valid: boolean, errors: Array<{rule: string, message: string}>}}
   */
  validateField(fieldName, value) {
    const errors = [];
    const rules = this.rules.get(fieldName) || [];

    for (const rule of rules) {
      try {
        if (!rule.validator(value)) {
          errors.push({
            rule: rule.name,
            message: rule.errorMessage || `${fieldName} 验证失败: ${rule.name}`
          });
        }
      } catch (error) {
        errors.push({
          rule: rule.name,
          message: `${fieldName} 验证错误: ${error.message}`
        });
      }
    }

    return {
      valid: errors.length === 0,
      errors
    };
  }

  /**
   * Validates one record against every field that has rules.
   *
   * @param {Object} data - Record to validate.
   * @returns {{valid: boolean, fieldResults: Object}} Overall verdict plus
   *   the `validateField` result of each field that has rules.
   */
  validate(data) {
    const fieldResults = {};
    let isValid = true;

    for (const fieldName of this.rules.keys()) {
      const result = this.validateField(fieldName, data[fieldName]);
      fieldResults[fieldName] = result;
      if (!result.valid) {
        isValid = false;
      }
    }

    return {
      valid: isValid,
      fieldResults
    };
  }

  /**
   * Validates every record of an array.
   *
   * @param {Array<Object>} dataArray - Records to validate.
   * @returns {Array<Object>} One `{ index, valid, fieldResults }` per record.
   */
  validateArray(dataArray) {
    return dataArray.map((data, index) => ({
      index,
      ...this.validate(data)
    }));
  }

  /**
   * Validates every record and returns a flat list of the failures.
   *
   * @param {Array<Object>} dataArray - Records to validate.
   * @returns {Array<Object>} One `{ row, field, rule, message, value }`
   *   entry per failed rule; `row` is the zero-based index of the record.
   */
  getErrors(dataArray) {
    const errors = [];

    for (let i = 0; i < dataArray.length; i++) {
      const validation = this.validate(dataArray[i]);
      if (validation.valid) continue;

      for (const [fieldName, result] of Object.entries(validation.fieldResults)) {
        for (const error of result.errors) {
          errors.push({
            row: i,
            field: fieldName,
            rule: error.rule,
            message: error.message,
            value: dataArray[i][fieldName]
          });
        }
      }
    }

    return errors;
  }

  /**
   * Removes all field rules. Registered custom rules are kept.
   */
  clear() {
    this.rules.clear();
  }

  /**
   * Counts the rules attached across all fields.
   *
   * @returns {number}
   */
  getRuleCount() {
    let count = 0;
    for (const rules of this.rules.values()) {
      count += rules.length;
    }
    return count;
  }
}
257
+
258
/**
 * Builds a DataValidator from a rule configuration object.
 *
 * The configuration maps each field name to an array of rules. A rule is
 * either a string — a bare rule name ("required") or the "name:argument"
 * shorthand ("min:0", "pattern:^\\d+$", "enum:[a, b]") — or an object with
 * the rule name under `name`/`type`/`rule`, its parameter under
 * `value`/`arg`/`params`, and an optional `message`. Fields whose value is
 * not an array, and rule entries of any other type, are ignored.
 *
 * @param {Object} config - Validation configuration.
 * @returns {DataValidator} Validator with all configured rules attached.
 */
function createValidatorFromConfig(config) {
  const validator = new DataValidator();

  for (const [fieldName, fieldRules] of Object.entries(config)) {
    if (!Array.isArray(fieldRules)) continue;

    for (const rule of fieldRules) {
      if (typeof rule === 'string') {
        const { name, args } = parseRuleShorthand(rule);
        if (args === undefined) {
          validator.addRule(fieldName, name);
        } else {
          validator.addRule(fieldName, name, args);
        }
      } else if (rule !== null && typeof rule === 'object') {
        // typeof null is 'object', hence the explicit null check above.
        const args = rule.value !== undefined ? rule.value :
          rule.arg !== undefined ? rule.arg :
            rule.params;

        validator.addRule(
          fieldName,
          rule.name || rule.type || rule.rule,
          args,
          rule.message
        );
      }
    }
  }

  return validator;
}

/**
 * Splits a string rule into its name and optional argument.
 *
 * @param {string} rule - "name" or "name:argument".
 * @returns {{name: string, args: *}} `args` is undefined for a bare name,
 *   an array for `enum`, the raw text for `pattern`, and otherwise a number
 *   when the argument is numeric or the raw text when it is not.
 */
function parseRuleShorthand(rule) {
  // Split on the first colon only: patterns may contain colons themselves.
  const separator = rule.indexOf(':');
  if (separator === -1) {
    return { name: rule, args: undefined };
  }

  const name = rule.slice(0, separator).trim();
  const raw = rule.slice(separator + 1).trim();

  if (name === 'pattern') {
    return { name, args: raw };
  }

  if (name === 'enum') {
    // Accept a JSON array (["a","b"]) as well as the looser [a, b] / a,b.
    try {
      const parsed = JSON.parse(raw);
      return { name, args: Array.isArray(parsed) ? parsed : [parsed] };
    } catch {
      const items = raw
        .replace(/^\[|\]$/g, '')
        .split(',')
        .map((entry) => entry.trim().replace(/^["']|["']$/g, ''))
        .filter((entry) => entry !== '');
      return { name, args: items };
    }
  }

  const numeric = Number(raw);
  return { name, args: raw !== '' && Number.isFinite(numeric) ? numeric : raw };
}
292
+
293
+ module.exports = {
294
+ DataValidator,
295
+ ValidationRule,
296
+ BUILT_IN_RULES,
297
+ createValidatorFromConfig
298
+ };
@@ -0,0 +1,16 @@
1
+ {
2
+ "email": ["required", "email"],
3
+ "age": [
4
+ "required",
5
+ {"name": "number"},
6
+ {"name": "min", "value": 0},
7
+ {"name": "max", "value": 120}
8
+ ],
9
+ "phone": [
10
+ "required",
11
+ {"name": "pattern", "value": "^\\d{11}$", "message": "手机号必须是11位数字"}
12
+ ],
13
+ "status": [
14
+ {"name": "enum", "value": ["active", "inactive", "pending"]}
15
+ ]
16
+ }