aigroup-econ-mcp 0.4.0__py3-none-any.whl → 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aigroup_econ_mcp/__init__.py +1 -1
- aigroup_econ_mcp/cli.py +81 -86
- aigroup_econ_mcp/server.py +451 -451
- aigroup_econ_mcp/tools/__init__.py +8 -7
- aigroup_econ_mcp/tools/base.py +204 -5
- aigroup_econ_mcp/tools/data_loader.py +51 -27
- aigroup_econ_mcp/tools/file_parser.py +1027 -560
- aigroup_econ_mcp/tools/machine_learning.py +56 -669
- aigroup_econ_mcp/tools/ml_ensemble.py +210 -0
- aigroup_econ_mcp/tools/ml_evaluation.py +272 -0
- aigroup_econ_mcp/tools/ml_models.py +54 -0
- aigroup_econ_mcp/tools/ml_regularization.py +186 -0
- aigroup_econ_mcp/tools/panel_data.py +70 -4
- aigroup_econ_mcp/tools/time_series.py +53 -22
- aigroup_econ_mcp/tools/tool_descriptions.py +410 -0
- aigroup_econ_mcp/tools/tool_handlers.py +681 -43
- aigroup_econ_mcp/tools/tool_registry.py +329 -21
- aigroup_econ_mcp-1.3.3.dist-info/METADATA +525 -0
- aigroup_econ_mcp-1.3.3.dist-info/RECORD +30 -0
- aigroup_econ_mcp/server_v1_backup.py +0 -1250
- aigroup_econ_mcp/server_v1_old.py +0 -1250
- aigroup_econ_mcp/server_with_file_support.py +0 -259
- aigroup_econ_mcp/tools/decorators.py +0 -178
- aigroup_econ_mcp/tools/file_input_handler.py +0 -268
- aigroup_econ_mcp-0.4.0.dist-info/METADATA +0 -718
- aigroup_econ_mcp-0.4.0.dist-info/RECORD +0 -30
- {aigroup_econ_mcp-0.4.0.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/WHEEL +0 -0
- {aigroup_econ_mcp-0.4.0.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/entry_points.txt +0 -0
- {aigroup_econ_mcp-0.4.0.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,560 +1,1027 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
import json
|
|
7
|
-
import csv
|
|
8
|
-
from typing import Dict, List, Any, Union, Tuple, Optional
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
import
|
|
11
|
-
import
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
if file_format == "
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
"
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
}
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
)
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
try:
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
"
|
|
247
|
-
"
|
|
248
|
-
"
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
@staticmethod
|
|
316
|
-
def
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
for
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
1
|
+
"""
|
|
2
|
+
文件解析与输入处理模块
|
|
3
|
+
整合了文件解析、数据转换和输入处理功能
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import csv
|
|
8
|
+
from typing import Dict, List, Any, Union, Tuple, Optional, Callable
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from functools import wraps
|
|
11
|
+
import io
|
|
12
|
+
import base64
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class FileParser:
|
|
16
|
+
"""文件解析器,支持CSV、JSON和TXT格式"""
|
|
17
|
+
|
|
18
|
+
@staticmethod
def parse_file_path(
    file_path: str,
    file_format: str = "auto"
) -> Dict[str, Any]:
    """
    Parse a data file given its filesystem path.

    Args:
        file_path: relative or absolute path to the file
        file_format: "csv", "json", "txt" or "auto"

    Returns:
        Parsed data dictionary (see parse_file_content).

    Raises:
        FileNotFoundError: the path does not exist.
        ValueError: the path is not a regular file.
    """
    path = Path(file_path)

    if not path.exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")
    if not path.is_file():
        raise ValueError(f"路径不是文件: {file_path}")

    if file_format == "auto":
        # Resolve the format from the extension; unknown extensions stay
        # "auto" and are detected from the content downstream.
        ext_map = {'.csv': 'csv', '.json': 'json', '.jsonl': 'json', '.txt': 'txt'}
        file_format = ext_map.get(path.suffix.lower(), "auto")

    content = path.read_text(encoding='utf-8')
    return FileParser.parse_file_content(content, file_format)
|
|
61
|
+
|
|
62
|
+
@staticmethod
def parse_file_content(
    content: str,
    file_format: str = "auto"
) -> Dict[str, Any]:
    """
    Parse file content into a column-oriented data dictionary.

    Args:
        content: raw text, or a base64-encoded UTF-8 payload
        file_format: "csv", "json", "txt" or "auto"

    Returns:
        dict with keys:
        - data: parsed columns
        - variables: list of variable names
        - format: detected format
        - data_type: 'univariate', 'multivariate', 'time_series' or 'panel'

    Raises:
        ValueError: unknown/undetectable format.
    """
    # Probe for base64 input. validate=True is essential: without it,
    # b64decode silently drops non-alphabet characters and plain text
    # could be "decoded" into garbage bytes. Whitespace is stripped
    # first so newline-wrapped base64 payloads still validate.
    try:
        compact = "".join(content.split())
        decoded_content = base64.b64decode(compact, validate=True).decode('utf-8')
    except (ValueError, UnicodeDecodeError):
        # binascii.Error (invalid base64) is a ValueError subclass;
        # non-UTF-8 decodes mean the content was not base64 text.
        decoded_content = content

    if file_format == "auto":
        file_format = FileParser._detect_format(decoded_content)

    if file_format == "csv":
        return FileParser._parse_csv(decoded_content)
    elif file_format == "txt":
        return FileParser._parse_txt(decoded_content)
    elif file_format == "json":
        return FileParser._parse_json(decoded_content)
    else:
        raise ValueError(f"不支持的文件格式: {file_format}")
|
|
99
|
+
|
|
100
|
+
@staticmethod
|
|
101
|
+
def _detect_format(content: str) -> str:
|
|
102
|
+
"""自动检测文件格式"""
|
|
103
|
+
# 尝试解析JSON
|
|
104
|
+
try:
|
|
105
|
+
json.loads(content.strip())
|
|
106
|
+
return "json"
|
|
107
|
+
except:
|
|
108
|
+
pass
|
|
109
|
+
|
|
110
|
+
# 检测CSV特征
|
|
111
|
+
|
|
112
|
+
# 检测TXT特征
|
|
113
|
+
lines = content.strip().split('\n')
|
|
114
|
+
if lines:
|
|
115
|
+
first_line = lines[0].strip()
|
|
116
|
+
# 检查是否为键值对格式
|
|
117
|
+
if ':' in first_line or '=' in first_line:
|
|
118
|
+
return "txt"
|
|
119
|
+
|
|
120
|
+
# 尝试解析为纯数值
|
|
121
|
+
try:
|
|
122
|
+
float(first_line)
|
|
123
|
+
return "txt"
|
|
124
|
+
except ValueError:
|
|
125
|
+
# 尝试按空格拆分并检查
|
|
126
|
+
parts = first_line.split()
|
|
127
|
+
if parts:
|
|
128
|
+
try:
|
|
129
|
+
for part in parts:
|
|
130
|
+
float(part)
|
|
131
|
+
return "txt"
|
|
132
|
+
except ValueError:
|
|
133
|
+
pass
|
|
134
|
+
|
|
135
|
+
if ',' in content or '\t' in content:
|
|
136
|
+
return "csv"
|
|
137
|
+
|
|
138
|
+
raise ValueError("无法自动检测文件格式,请明确指定")
|
|
139
|
+
|
|
140
|
+
@staticmethod
def _parse_csv(content: str) -> Dict[str, Any]:
    """
    Parse CSV content into a column-oriented dict.

    Handles both header and headerless files; for the latter, column
    names var1..varN are generated. Numeric cells become floats, other
    cells (e.g. ID columns) are kept as strings.
    """
    lines = content.strip().split('\n')
    if not lines:
        raise ValueError("CSV文件为空")

    # Pick the delimiter from the first line, then parse everything.
    delimiter = FileParser._detect_delimiter(lines[0])
    rows = list(csv.reader(io.StringIO(content), delimiter=delimiter))
    if not rows:
        raise ValueError("CSV文件没有数据")

    if FileParser._has_header(rows):
        headers, data_rows = rows[0], rows[1:]
    else:
        headers = [f"var{i+1}" for i in range(len(rows[0]))]
        data_rows = rows

    parsed_data = {}
    for col_idx, header in enumerate(headers):
        values = []
        for row in data_rows:
            if col_idx >= len(row):
                continue  # ragged row: skip the missing cell
            cell = row[col_idx].strip()
            try:
                values.append(float(cell))
            except ValueError:
                values.append(cell)  # keep non-numeric cells verbatim
        if values:  # drop fully empty columns
            parsed_data[header.strip()] = values

    if not parsed_data:
        raise ValueError("CSV文件中没有有效的数据")

    return {
        "data": parsed_data,
        "variables": list(parsed_data.keys()),
        "format": "csv",
        "data_type": FileParser._detect_data_type(parsed_data),
        "n_variables": len(parsed_data),
        "n_observations": len(next(iter(parsed_data.values()))),
    }
|
|
205
|
+
|
|
206
|
+
@staticmethod
def _parse_json(content: str) -> Dict[str, Any]:
    """
    Parse JSON content into a column-oriented dict.

    Accepted layouts:
    1. {"var": [values, ...], ...}
    2. [{"var": value, ...}, ...]      (array of records)
    3. {"data": <layout 1 or 2>, ...}  (wrapper object)
    """
    try:
        json_data = json.loads(content)
    except json.JSONDecodeError as e:
        raise ValueError(f"JSON格式错误: {str(e)}")

    def _coerce(value):
        # Prefer floats; keep strings (IDs, dates) and other types as-is.
        try:
            return float(value)
        except (ValueError, TypeError):
            return value

    def _result(parsed_data):
        return {
            "data": parsed_data,
            "variables": list(parsed_data.keys()),
            "format": "json",
            "data_type": FileParser._detect_data_type(parsed_data),
            "n_variables": len(parsed_data),
            "n_observations": len(next(iter(parsed_data.values()))),
        }

    # Layout 1: mapping of variable name -> list of values
    if isinstance(json_data, dict) and all(
        isinstance(v, list) for v in json_data.values()
    ):
        parsed_data = {
            key: [_coerce(v) for v in values]
            for key, values in json_data.items()
            # skip metadata-style fields
            if key.lower() not in ('metadata', 'info', 'description')
        }
        if parsed_data:
            return _result(parsed_data)

    # Layout 2: array of record objects
    elif isinstance(json_data, list) and json_data and isinstance(json_data[0], dict):
        parsed_data = {}
        for record in json_data:
            for key, value in record.items():
                parsed_data.setdefault(key, []).append(_coerce(value))
        if parsed_data:
            return _result(parsed_data)

    # Layout 3: wrapper object carrying the payload under "data"
    elif isinstance(json_data, dict) and "data" in json_data:
        return FileParser._parse_json(json.dumps(json_data["data"]))

    raise ValueError("不支持的JSON数据格式")
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
@staticmethod
def _parse_txt(content: str) -> Dict[str, Any]:
    """
    Parse TXT content, dispatching on the detected layout:

    1. one numeric value per line
    2. whitespace/tab separated columns
    3. key-value lines ("name: values" or "name = values")
    """
    lines = [ln.strip() for ln in content.strip().split('\n') if ln.strip()]
    if not lines:
        raise ValueError("TXT文件为空")

    first_line = lines[0]

    # Key-value pairs take precedence (':' or '=' on the first line).
    if ':' in first_line or '=' in first_line:
        return FileParser._parse_txt_keyvalue(lines)

    # Multiple columns when the first line contains internal whitespace.
    if ' ' in first_line or '\t' in first_line:
        return FileParser._parse_txt_multicolumn(lines)

    # Otherwise: a single numeric column.
    return FileParser._parse_txt_single_column(lines)
|
|
314
|
+
|
|
315
|
+
@staticmethod
|
|
316
|
+
def _parse_txt_single_column(lines: List[str]) -> Dict[str, Any]:
|
|
317
|
+
"""解析单列TXT数据(每行一个数值)"""
|
|
318
|
+
data_list = []
|
|
319
|
+
for i, line in enumerate(lines, 1):
|
|
320
|
+
try:
|
|
321
|
+
value = float(line)
|
|
322
|
+
data_list.append(value)
|
|
323
|
+
except ValueError:
|
|
324
|
+
raise ValueError(f"第{i}行无法解析为数值: {line}")
|
|
325
|
+
|
|
326
|
+
if not data_list:
|
|
327
|
+
raise ValueError("TXT文件中没有有效的数值数据")
|
|
328
|
+
|
|
329
|
+
parsed_data = {"data": data_list}
|
|
330
|
+
|
|
331
|
+
return {
|
|
332
|
+
"data": parsed_data,
|
|
333
|
+
"variables": ["data"],
|
|
334
|
+
"format": "txt",
|
|
335
|
+
"data_type": "univariate",
|
|
336
|
+
"n_variables": 1,
|
|
337
|
+
"n_observations": len(data_list)
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
@staticmethod
def _parse_txt_multicolumn(lines: List[str]) -> Dict[str, Any]:
    """Parse whitespace- or tab-separated multi-column TXT data."""
    # Tab wins if present in the first line; otherwise split on any
    # run of whitespace (str.split() default behaviour).
    delimiter = '\t' if '\t' in lines[0] else None

    all_rows = []
    for line in lines:
        parts = line.split(delimiter) if delimiter else line.split()
        all_rows.append([p.strip() for p in parts if p.strip()])

    if not all_rows:
        raise ValueError("TXT文件中没有数据")

    def _is_numeric(cell: str) -> bool:
        try:
            float(cell)
            return True
        except ValueError:
            return False

    # Any non-numeric cell in the first row marks it as a header row.
    first_row = all_rows[0]
    if not all(_is_numeric(cell) for cell in first_row):
        headers, data_rows = first_row, all_rows[1:]
    else:
        headers = [f"var{i+1}" for i in range(len(first_row))]
        data_rows = all_rows

    parsed_data = {}
    for col, header in enumerate(headers):
        column = []
        for row in data_rows:
            if col < len(row):
                cell = row[col]
                column.append(float(cell) if _is_numeric(cell) else cell)
        if column:
            parsed_data[header] = column

    if not parsed_data:
        raise ValueError("TXT文件中没有有效的数据")

    return {
        "data": parsed_data,
        "variables": list(parsed_data.keys()),
        "format": "txt",
        "data_type": FileParser._detect_data_type(parsed_data),
        "n_variables": len(parsed_data),
        "n_observations": len(next(iter(parsed_data.values()))),
    }
|
|
408
|
+
|
|
409
|
+
@staticmethod
def _parse_txt_keyvalue(lines: List[str]) -> Dict[str, Any]:
    """Parse "name: values" / "name = values" lines into columns."""
    parsed_data = {}

    for line in lines:
        # ':' takes precedence over '=' when both appear.
        if ':' in line:
            sep = ':'
        elif '=' in line:
            sep = '='
        else:
            continue  # not a key-value line; skip silently

        name, _, value_str = line.partition(sep)
        name = name.strip()
        values = value_str.strip().split()

        try:
            parsed_data[name] = [float(v) for v in values]
        except ValueError:
            # If any token is non-numeric, keep the entire list as strings.
            parsed_data[name] = list(values)

    if not parsed_data:
        raise ValueError("TXT文件中没有有效的键值对数据")

    # Columns may have differing lengths; report the longest.
    n_observations = max(len(v) for v in parsed_data.values())

    return {
        "data": parsed_data,
        "variables": list(parsed_data.keys()),
        "format": "txt",
        "data_type": FileParser._detect_data_type(parsed_data),
        "n_variables": len(parsed_data),
        "n_observations": n_observations,
    }
|
|
455
|
+
|
|
456
|
+
@staticmethod
|
|
457
|
+
def _detect_delimiter(line: str) -> str:
|
|
458
|
+
"""检测CSV分隔符"""
|
|
459
|
+
# 常见分隔符
|
|
460
|
+
delimiters = [',', '\t', ';', '|']
|
|
461
|
+
counts = {d: line.count(d) for d in delimiters}
|
|
462
|
+
# 返回出现次数最多的分隔符
|
|
463
|
+
return max(counts.items(), key=lambda x: x[1])[0]
|
|
464
|
+
|
|
465
|
+
@staticmethod
|
|
466
|
+
def _has_header(rows: List[List[str]]) -> bool:
|
|
467
|
+
"""检测CSV是否有表头"""
|
|
468
|
+
if len(rows) < 2:
|
|
469
|
+
return False
|
|
470
|
+
|
|
471
|
+
# 检查第一行是否包含非数值字符串
|
|
472
|
+
first_row = rows[0]
|
|
473
|
+
|
|
474
|
+
# 如果第一行有任何元素无法转换为数字,认为有表头
|
|
475
|
+
for cell in first_row:
|
|
476
|
+
try:
|
|
477
|
+
float(cell.strip())
|
|
478
|
+
except ValueError:
|
|
479
|
+
return True
|
|
480
|
+
|
|
481
|
+
return False
|
|
482
|
+
|
|
483
|
+
@staticmethod
|
|
484
|
+
def _detect_data_type(data: Dict[str, List]) -> str:
|
|
485
|
+
"""
|
|
486
|
+
检测数据类型
|
|
487
|
+
|
|
488
|
+
Returns:
|
|
489
|
+
- 'univariate': 单变量
|
|
490
|
+
- 'multivariate': 多变量
|
|
491
|
+
- 'time_series': 时间序列(通过变量名推断)
|
|
492
|
+
- 'panel': 面板数据(通过变量名推断)
|
|
493
|
+
"""
|
|
494
|
+
n_vars = len(data)
|
|
495
|
+
var_names = [v.lower() for v in data.keys()]
|
|
496
|
+
|
|
497
|
+
# 检查是否包含时间/日期相关的变量名
|
|
498
|
+
time_keywords = ['time', 'date', 'year', 'month', 'day', 'period', 'quarter']
|
|
499
|
+
has_time_var = any(any(kw in var for kw in time_keywords) for var in var_names)
|
|
500
|
+
|
|
501
|
+
# 检查是否包含实体/ID相关的变量名
|
|
502
|
+
entity_keywords = ['id', 'entity', 'firm', 'company', 'country', 'region']
|
|
503
|
+
has_entity_var = any(any(kw in var for kw in entity_keywords) for var in var_names)
|
|
504
|
+
|
|
505
|
+
if n_vars == 1:
|
|
506
|
+
return 'univariate'
|
|
507
|
+
elif has_entity_var and has_time_var:
|
|
508
|
+
return 'panel'
|
|
509
|
+
elif has_time_var or n_vars >= 2:
|
|
510
|
+
return 'time_series'
|
|
511
|
+
else:
|
|
512
|
+
return 'multivariate'
|
|
513
|
+
|
|
514
|
+
@staticmethod
|
|
515
|
+
def convert_to_tool_format(
|
|
516
|
+
parsed_data: Dict[str, Any],
|
|
517
|
+
tool_type: str
|
|
518
|
+
) -> Dict[str, Any]:
|
|
519
|
+
"""
|
|
520
|
+
将解析后的数据转换为工具所需的格式
|
|
521
|
+
|
|
522
|
+
Args:
|
|
523
|
+
parsed_data: parse_file_content返回的数据
|
|
524
|
+
tool_type: 工具类型
|
|
525
|
+
- 'single_var': 单变量 (List[float])
|
|
526
|
+
- 'multi_var_dict': 多变量字典 (Dict[str, List[float]])
|
|
527
|
+
- 'multi_var_matrix': 多变量矩阵 (List[List[float]])
|
|
528
|
+
- 'regression': 回归分析 (y_data, x_data)
|
|
529
|
+
- 'panel': 面板数据 (y_data, x_data, entity_ids, time_periods)
|
|
530
|
+
|
|
531
|
+
Returns:
|
|
532
|
+
转换后的数据字典
|
|
533
|
+
"""
|
|
534
|
+
data = parsed_data["data"]
|
|
535
|
+
variables = parsed_data["variables"]
|
|
536
|
+
|
|
537
|
+
if tool_type == 'single_var':
|
|
538
|
+
# 返回第一个变量的数据
|
|
539
|
+
var_data = data[variables[0]]
|
|
540
|
+
return {
|
|
541
|
+
"data": var_data
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
elif tool_type == 'multi_var_dict':
|
|
545
|
+
# 直接返回字典格式
|
|
546
|
+
return {"data": data}
|
|
547
|
+
|
|
548
|
+
elif tool_type == 'time_series':
|
|
549
|
+
# 时间序列类型,与multi_var_dict相同,返回字典格式
|
|
550
|
+
return {"data": data}
|
|
551
|
+
|
|
552
|
+
elif tool_type == 'multi_var_matrix':
|
|
553
|
+
# 转换为矩阵格式 (List[List[float]])
|
|
554
|
+
n_obs = len(data[variables[0]])
|
|
555
|
+
matrix = []
|
|
556
|
+
for i in range(n_obs):
|
|
557
|
+
row = [data[var][i] for var in variables]
|
|
558
|
+
matrix.append(row)
|
|
559
|
+
|
|
560
|
+
return {
|
|
561
|
+
"data": matrix,
|
|
562
|
+
"feature_names": variables
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
elif tool_type == 'regression':
|
|
566
|
+
# 假设最后一个变量是因变量,其余是自变量
|
|
567
|
+
if len(variables) < 2:
|
|
568
|
+
raise ValueError("回归分析至少需要2个变量(1个因变量和至少1个自变量)")
|
|
569
|
+
|
|
570
|
+
y_var = variables[-1]
|
|
571
|
+
x_vars = variables[:-1]
|
|
572
|
+
|
|
573
|
+
y_data = data[y_var]
|
|
574
|
+
n_obs = len(y_data)
|
|
575
|
+
|
|
576
|
+
# 构建x_data矩阵
|
|
577
|
+
x_data = []
|
|
578
|
+
for i in range(n_obs):
|
|
579
|
+
row = [data[var][i] for var in x_vars]
|
|
580
|
+
x_data.append(row)
|
|
581
|
+
|
|
582
|
+
return {
|
|
583
|
+
"y_data": y_data,
|
|
584
|
+
"x_data": x_data,
|
|
585
|
+
"feature_names": x_vars
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
elif tool_type == 'panel':
|
|
589
|
+
# 识别实体ID、时间标识和数据变量
|
|
590
|
+
entity_var = None
|
|
591
|
+
time_var = None
|
|
592
|
+
data_vars = []
|
|
593
|
+
|
|
594
|
+
entity_keywords = ['id', 'entity', 'firm', 'company', 'country', 'region']
|
|
595
|
+
time_keywords = ['time', 'date', 'year', 'month', 'day', 'period', 'quarter']
|
|
596
|
+
|
|
597
|
+
# 更详细的检测逻辑
|
|
598
|
+
print(f"Debug: 开始检测面板数据列...")
|
|
599
|
+
for var in variables:
|
|
600
|
+
var_lower = var.lower()
|
|
601
|
+
print(f"Debug: 检查变量 '{var}' (小写: '{var_lower}')")
|
|
602
|
+
|
|
603
|
+
# 检查是否是实体ID列
|
|
604
|
+
is_entity = any(kw in var_lower for kw in entity_keywords)
|
|
605
|
+
is_time = any(kw in var_lower for kw in time_keywords)
|
|
606
|
+
|
|
607
|
+
if is_entity and entity_var is None:
|
|
608
|
+
entity_var = var
|
|
609
|
+
print(f"Debug: 识别为实体ID列: {var}")
|
|
610
|
+
elif is_time and time_var is None:
|
|
611
|
+
time_var = var
|
|
612
|
+
print(f"Debug: 识别为时间列: {var}")
|
|
613
|
+
else:
|
|
614
|
+
data_vars.append(var)
|
|
615
|
+
print(f"Debug: 识别为数据列: {var}")
|
|
616
|
+
|
|
617
|
+
print(f"Debug: entity_var={entity_var}, time_var={time_var}, data_vars={data_vars}")
|
|
618
|
+
|
|
619
|
+
if not entity_var or not time_var:
|
|
620
|
+
# 提供更详细的错误信息
|
|
621
|
+
available_vars = ', '.join(variables)
|
|
622
|
+
error_msg = f"面板数据需要包含实体ID和时间标识变量。\n"
|
|
623
|
+
error_msg += f"可用列: {available_vars}\n"
|
|
624
|
+
error_msg += f"检测到的实体ID列: {entity_var if entity_var else '未检测到'}\n"
|
|
625
|
+
error_msg += f"检测到的时间列: {time_var if time_var else '未检测到'}\n"
|
|
626
|
+
error_msg += f"实体ID关键词: {entity_keywords}\n"
|
|
627
|
+
error_msg += f"时间关键词: {time_keywords}"
|
|
628
|
+
raise ValueError(error_msg)
|
|
629
|
+
|
|
630
|
+
if len(data_vars) < 1:
|
|
631
|
+
raise ValueError(f"面板数据至少需要1个数据变量。当前数据列: {data_vars}")
|
|
632
|
+
|
|
633
|
+
# 转换ID和时间(保持原类型,可能是字符串或数字)
|
|
634
|
+
entity_ids = [str(x) for x in data[entity_var]]
|
|
635
|
+
time_periods = [str(int(x)) if isinstance(x, float) and x == int(x) else str(x) for x in data[time_var]]
|
|
636
|
+
|
|
637
|
+
print(f"Debug: entity_ids样本: {entity_ids[:5]}")
|
|
638
|
+
print(f"Debug: time_periods样本: {time_periods[:5]}")
|
|
639
|
+
|
|
640
|
+
# 如果只有一个数据变量,将其作为因变量
|
|
641
|
+
if len(data_vars) == 1:
|
|
642
|
+
y_var = data_vars[0]
|
|
643
|
+
y_data = data[y_var]
|
|
644
|
+
# 创建一个虚拟自变量(常数项)
|
|
645
|
+
n_obs = len(y_data)
|
|
646
|
+
x_data = [[1.0] for _ in range(n_obs)]
|
|
647
|
+
x_vars = ['const']
|
|
648
|
+
else:
|
|
649
|
+
# 假设最后一个数据变量是因变量
|
|
650
|
+
y_var = data_vars[-1]
|
|
651
|
+
x_vars = data_vars[:-1]
|
|
652
|
+
|
|
653
|
+
y_data = data[y_var]
|
|
654
|
+
n_obs = len(y_data)
|
|
655
|
+
|
|
656
|
+
# 构建x_data矩阵
|
|
657
|
+
x_data = []
|
|
658
|
+
for i in range(n_obs):
|
|
659
|
+
row = [data[var][i] for var in x_vars]
|
|
660
|
+
x_data.append(row)
|
|
661
|
+
|
|
662
|
+
return {
|
|
663
|
+
"y_data": y_data,
|
|
664
|
+
"x_data": x_data,
|
|
665
|
+
"entity_ids": entity_ids,
|
|
666
|
+
"time_periods": time_periods,
|
|
667
|
+
"feature_names": x_vars
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
else:
|
|
671
|
+
raise ValueError(f"不支持的工具类型: {tool_type}")
|
|
672
|
+
|
|
673
|
+
@staticmethod
|
|
674
|
+
def auto_detect_tool_params(parsed_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
675
|
+
"""
|
|
676
|
+
自动检测并推荐适合的工具参数
|
|
677
|
+
|
|
678
|
+
Args:
|
|
679
|
+
parsed_data: parse_file_content返回的数据
|
|
680
|
+
|
|
681
|
+
Returns:
|
|
682
|
+
推荐的工具和参数
|
|
683
|
+
"""
|
|
684
|
+
data_type = parsed_data["data_type"]
|
|
685
|
+
n_vars = parsed_data["n_variables"]
|
|
686
|
+
n_obs = parsed_data["n_observations"]
|
|
687
|
+
|
|
688
|
+
recommendations = {
|
|
689
|
+
"data_type": data_type,
|
|
690
|
+
"suggested_tools": [],
|
|
691
|
+
"warnings": []
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
# 根据数据类型推荐工具
|
|
695
|
+
if data_type == 'univariate':
|
|
696
|
+
recommendations["suggested_tools"] = [
|
|
697
|
+
"descriptive_statistics",
|
|
698
|
+
"hypothesis_testing",
|
|
699
|
+
"time_series_analysis"
|
|
700
|
+
]
|
|
701
|
+
elif data_type == 'multivariate':
|
|
702
|
+
recommendations["suggested_tools"] = [
|
|
703
|
+
"descriptive_statistics",
|
|
704
|
+
"correlation_analysis",
|
|
705
|
+
"ols_regression",
|
|
706
|
+
"random_forest_regression_analysis",
|
|
707
|
+
"lasso_regression_analysis"
|
|
708
|
+
]
|
|
709
|
+
elif data_type == 'time_series':
|
|
710
|
+
recommendations["suggested_tools"] = [
|
|
711
|
+
"time_series_analysis",
|
|
712
|
+
"var_model_analysis",
|
|
713
|
+
"garch_model_analysis"
|
|
714
|
+
]
|
|
715
|
+
elif data_type == 'panel':
|
|
716
|
+
recommendations["suggested_tools"] = [
|
|
717
|
+
"panel_fixed_effects",
|
|
718
|
+
"panel_random_effects",
|
|
719
|
+
"panel_hausman_test",
|
|
720
|
+
"panel_unit_root_test"
|
|
721
|
+
]
|
|
722
|
+
|
|
723
|
+
# 添加警告
|
|
724
|
+
if n_obs < 30:
|
|
725
|
+
recommendations["warnings"].append(
|
|
726
|
+
f"样本量较小({n_obs}个观测),统计推断可能不可靠"
|
|
727
|
+
)
|
|
728
|
+
|
|
729
|
+
if n_vars > 10:
|
|
730
|
+
recommendations["warnings"].append(
|
|
731
|
+
f"变量数量较多({n_vars}个变量),可能需要特征选择"
|
|
732
|
+
)
|
|
733
|
+
|
|
734
|
+
if n_vars > n_obs / 10:
|
|
735
|
+
recommendations["warnings"].append(
|
|
736
|
+
"变量数量接近样本量的1/10,可能存在过拟合风险"
|
|
737
|
+
)
|
|
738
|
+
|
|
739
|
+
return recommendations
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
# ============================================================================
|
|
743
|
+
# 文件输入处理组件
|
|
744
|
+
# ============================================================================
|
|
745
|
+
|
|
746
|
+
class FileInputHandler:
    """File-input processing component.

    Composition-style helper that bolts file-input support onto any tool
    function: raw file content is parsed via ``FileParser`` and the result
    merged into the tool's keyword arguments.
    """

    @staticmethod
    def process_input(
        file_content: Optional[str],
        file_format: str,
        tool_type: str,
        data_params: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Parse file input and fold it into tool parameters.

        Args:
            file_content: Raw file content (``None`` means no file input).
            file_format: Format hint (``"csv"``, ``"json"``, ``"auto"``).
            tool_type: Target tool type used to shape the converted data.
            data_params: Current tool parameters.

        Returns:
            The parameter dict; file-derived values override ``data_params``.
        """
        # No file supplied: pass the caller's parameters through untouched.
        if file_content is None:
            return data_params

        # Parse the raw content, then reshape it for the requested tool.
        parsed = FileParser.parse_file_content(file_content, file_format)
        converted = FileParser.convert_to_tool_format(parsed, tool_type)

        # File-derived values take precedence over the caller's values.
        return {**data_params, **converted}

    @staticmethod
    def with_file_support(tool_type: str):
        """Decorator adding file-input support to an async tool function.

        Args:
            tool_type: Tool type (``single_var``, ``multi_var_dict``,
                ``regression``, ``panel``, ...).

        Returns:
            The decorated coroutine function.

        Example:
            @FileInputHandler.with_file_support('regression')
            async def my_regression_tool(y_data, x_data, file_content=None, file_format='auto'):
                # file_content is parsed automatically, filling y_data/x_data
                pass
        """
        def decorator(func: Callable) -> Callable:
            @wraps(func)
            async def wrapper(*args, **kwargs):
                # Pull the file-related keyword arguments, if present.
                content = kwargs.get('file_content')
                fmt = kwargs.get('file_format', 'auto')

                if content is not None:
                    # Parse the file and merge the result into kwargs;
                    # file-derived values override caller-supplied ones.
                    kwargs.update(
                        FileInputHandler.process_input(
                            file_content=content,
                            file_format=fmt,
                            tool_type=tool_type,
                            data_params=kwargs
                        )
                    )

                # Delegate to the wrapped tool.
                return await func(*args, **kwargs)

            return wrapper
        return decorator
|
|
828
|
+
|
|
829
|
+
|
|
830
|
+
class FileInputMixin:
    """Mixin granting a class file-input handling capabilities.

    Thin delegation layer over ``FileParser`` so host classes can parse
    uploaded files, convert them for a tool, and get tool recommendations.
    """

    def parse_file_input(
        self,
        file_content: Optional[str],
        file_format: str = "auto"
    ) -> Optional[Dict[str, Any]]:
        """Parse raw file content; returns ``None`` when no content given."""
        return (
            None
            if file_content is None
            else FileParser.parse_file_content(file_content, file_format)
        )

    def convert_for_tool(
        self,
        parsed_data: Dict[str, Any],
        tool_type: str
    ) -> Dict[str, Any]:
        """Reshape parsed data into the format expected by *tool_type*."""
        return FileParser.convert_to_tool_format(parsed_data, tool_type)

    def get_recommendations(
        self,
        parsed_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Return suggested tools and warnings for the parsed data."""
        return FileParser.auto_detect_tool_params(parsed_data)
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
class UnifiedFileInput:
    """Unified file-input entry point.

    Every tool routes its optional file input through this class so that
    parsing, conversion and progress logging happen in exactly one place.
    """

    @staticmethod
    async def handle(
        ctx: Any,
        file_content: Optional[str],
        file_format: str,
        tool_type: str,
        original_params: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Parse an optional file input and merge it into the tool params.

        Args:
            ctx: MCP context used for progress/error logging.
            file_content: Raw file content; ``None`` disables file handling.
            file_format: Format hint forwarded to the parser.
            tool_type: Target tool type for the conversion step.
            original_params: Parameters supplied directly by the caller.

        Returns:
            ``original_params`` unchanged when no file is given, otherwise a
            new dict where file-derived values override the originals.

        Raises:
            ValueError: If parsing or conversion fails.
        """
        # Fast path: nothing to parse, hand back the caller's params.
        if file_content is None:
            return original_params

        try:
            await ctx.info("检测到文件输入,开始解析...")

            parsed = FileParser.parse_file_content(file_content, file_format)

            await ctx.info(
                f"文件解析成功:{parsed['n_variables']}个变量,"
                f"{parsed['n_observations']}个观测,"
                f"数据类型={parsed['data_type']}"
            )

            converted = FileParser.convert_to_tool_format(parsed, tool_type)

            # File-derived values win over the caller-supplied ones.
            merged = {**original_params, **converted}

            # Log a tool-type-specific summary of the conversion result.
            if tool_type == 'regression':
                await ctx.info(
                    f"数据已转换:因变量={converted.get('y_variable')},"
                    f"自变量={converted.get('feature_names')}"
                )
            elif tool_type == 'panel':
                await ctx.info(
                    f"面板数据已识别:{len(set(converted.get('entity_ids', [])))}个实体,"
                    f"{len(set(converted.get('time_periods', [])))}个时间点"
                )
            else:
                await ctx.info(f"数据已转换为{tool_type}格式")

            return merged

        except Exception as e:
            await ctx.error(f"文件解析失败: {str(e)}")
            raise ValueError(f"文件解析失败: {str(e)}")
|
|
934
|
+
|
|
935
|
+
|
|
936
|
+
# ============================================================================
|
|
937
|
+
# 便捷函数和参数定义
|
|
938
|
+
# ============================================================================
|
|
939
|
+
|
|
940
|
+
def parse_file_input(
    file_content: Optional[str] = None,
    file_format: str = "auto"
) -> Optional[Dict[str, Any]]:
    """Convenience wrapper: parse optional file content.

    Args:
        file_content: Raw file content, or ``None`` for no input.
        file_format: Format hint forwarded to the parser.

    Returns:
        The parsed data dict, or ``None`` when ``file_content`` is ``None``.
    """
    return (
        FileParser.parse_file_content(file_content, file_format)
        if file_content is not None
        else None
    )
|
|
958
|
+
|
|
959
|
+
|
|
960
|
+
async def process_file_for_tool(
    ctx: Any,
    file_content: Optional[str],
    file_format: str,
    tool_type: str,
    **kwargs
) -> Dict[str, Any]:
    """Convenience coroutine: handle file input on behalf of a tool.

    Args:
        ctx: MCP context used for logging.
        file_content: Raw file content, or ``None`` for no file input.
        file_format: Format hint forwarded to the parser.
        tool_type: Target tool type for the conversion step.
        **kwargs: The tool's original data parameters.

    Returns:
        The merged parameter dict produced by ``UnifiedFileInput.handle``.

    Example:
        params = await process_file_for_tool(
            ctx=ctx,
            file_content=file_content,
            file_format=file_format,
            tool_type='regression',
            y_data=y_data,
            x_data=x_data,
            feature_names=feature_names
        )
        # params now contains all processed parameters
    """
    return await UnifiedFileInput.handle(
        ctx=ctx,
        file_content=file_content,
        file_format=file_format,
        tool_type=tool_type,
        original_params=kwargs
    )
|
|
990
|
+
|
|
991
|
+
|
|
992
|
+
def create_file_params(
    description: str = "CSV或JSON文件内容"
) -> Dict[str, Any]:
    """Build the standard file-input parameter definitions.

    Args:
        description: Human-readable description of the ``file_content``
            parameter (shown to the model/user).

    Returns:
        A dict of parameter definitions, usable directly with ``Field()``.
    """
    # Definition for the raw file-content parameter.
    content_spec = {
        "default": None,
        "description": f"""{description}

📁 支持格式:
- CSV: 带表头的列数据,自动检测分隔符
- JSON: {{"变量名": [数据], ...}} 或 [{{"变量1": 值, ...}}, ...]

💡 使用方式:
- 提供文件内容字符串(可以是base64编码)
- 系统会自动解析并识别变量
- 优先使用file_content,如果提供则忽略其他数据参数"""
    }

    # Definition for the format-hint parameter.
    format_spec = {
        "default": "auto",
        "description": """文件格式

可选值:
- "auto": 自动检测(默认)
- "csv": CSV格式
- "json": JSON格式"""
    }

    return {"file_content": content_spec, "file_format": format_spec}
|